-rw-r--r-- | drivers/md/bcache/Kconfig | 1
-rw-r--r-- | drivers/md/bcache/Makefile | 9
-rw-r--r-- | drivers/md/bcache/acl.c | 245
-rw-r--r-- | drivers/md/bcache/acl.h | 57
-rw-r--r-- | drivers/md/bcache/bcache.h | 26
-rw-r--r-- | drivers/md/bcache/bkey.h | 4
-rw-r--r-- | drivers/md/bcache/bkey_methods.c | 4
-rw-r--r-- | drivers/md/bcache/buckets.h | 25
-rw-r--r-- | drivers/md/bcache/debug.c | 90
-rw-r--r-- | drivers/md/bcache/debug.h | 2
-rw-r--r-- | drivers/md/bcache/dirent.c | 379
-rw-r--r-- | drivers/md/bcache/dirent.h | 21
-rw-r--r-- | drivers/md/bcache/extents.c | 4
-rw-r--r-- | drivers/md/bcache/fs-gc.c | 202
-rw-r--r-- | drivers/md/bcache/fs-gc.h | 6
-rw-r--r-- | drivers/md/bcache/fs.c | 2087
-rw-r--r-- | drivers/md/bcache/fs.h | 20
-rw-r--r-- | drivers/md/bcache/inode.c | 50
-rw-r--r-- | drivers/md/bcache/inode.h | 1
-rw-r--r-- | drivers/md/bcache/siphash.c | 185
-rw-r--r-- | drivers/md/bcache/siphash.h | 86
-rw-r--r-- | drivers/md/bcache/super.c | 78
-rw-r--r-- | drivers/md/bcache/util.h | 4
-rw-r--r-- | drivers/md/bcache/xattr.c | 414
-rw-r--r-- | drivers/md/bcache/xattr.h | 16
-rw-r--r-- | include/uapi/linux/bcache.h | 76
26 files changed, 4069 insertions, 23 deletions
diff --git a/drivers/md/bcache/Kconfig b/drivers/md/bcache/Kconfig index 5502372dfc94..55e135f6dd61 100644 --- a/drivers/md/bcache/Kconfig +++ b/drivers/md/bcache/Kconfig @@ -2,6 +2,7 @@ config BCACHE tristate "Block device as cache" select LIBCRC32C + select FS_POSIX_ACL ---help--- Allows a block device to be used as cache for other devices; uses a btree for indexing and the layout is optimized for SSDs. diff --git a/drivers/md/bcache/Makefile b/drivers/md/bcache/Makefile index 02ef2612777e..0dd3db8a5ef4 100644 --- a/drivers/md/bcache/Makefile +++ b/drivers/md/bcache/Makefile @@ -1,9 +1,10 @@ obj-$(CONFIG_BCACHE) += bcache.o -bcache-y := alloc.o bkey.o bkey_methods.o blockdev.o bset.o\ - btree.o buckets.o clock.o closure.o debug.o extents.o gc.o inode.o io.o\ - journal.o keybuf.o keylist.o migrate.o move.o movinggc.o notify.o\ - request.o six.o stats.o super.o sysfs.o tier.o trace.o util.o writeback.o +bcache-y := acl.o alloc.o bkey.o bkey_methods.o blockdev.o\ + bset.o btree.o buckets.o clock.o closure.o debug.o dirent.o extents.o\ + fs.o fs-gc.o gc.o inode.o io.o journal.o keybuf.o keylist.o migrate.o\ + move.o movinggc.o notify.o request.o siphash.o six.o stats.o super.o\ + sysfs.o tier.o trace.o util.o writeback.o xattr.o ccflags-y := -Werror diff --git a/drivers/md/bcache/acl.c b/drivers/md/bcache/acl.c new file mode 100644 index 000000000000..51f04ab2a9d5 --- /dev/null +++ b/drivers/md/bcache/acl.c @@ -0,0 +1,245 @@ +#include "bcache.h" + +#include <linux/init.h> +#include <linux/sched.h> +#include <linux/slab.h> +#include <linux/fs.h> + +#include "xattr.h" +#include "acl.h" + +/* + * Convert from filesystem to in-memory representation. + */ +static struct posix_acl *bch_acl_from_disk(const void *value, size_t size) +{ + const char *end = (char *)value + size; + int n, count; + struct posix_acl *acl; + + if (!value) + return NULL; + if (size < sizeof(bch_acl_header)) + return ERR_PTR(-EINVAL); + if (((bch_acl_header *)value)->a_version != + cpu_to_le32(BCH_ACL_VERSION)) + return ERR_PTR(-EINVAL); + value = (char *)value + sizeof(bch_acl_header); + count = bch_acl_count(size); + if (count < 0) + return ERR_PTR(-EINVAL); + if (count == 0) + return NULL; + acl = posix_acl_alloc(count, GFP_KERNEL); + if (!acl) + return ERR_PTR(-ENOMEM); + for (n = 0; n < count; n++) { + bch_acl_entry *entry = + (bch_acl_entry *)value; + if ((char *)value + sizeof(bch_acl_entry_short) > end) + goto fail; + acl->a_entries[n].e_tag = le16_to_cpu(entry->e_tag); + acl->a_entries[n].e_perm = le16_to_cpu(entry->e_perm); + switch (acl->a_entries[n].e_tag) { + case ACL_USER_OBJ: + case ACL_GROUP_OBJ: + case ACL_MASK: + case ACL_OTHER: + value = (char *)value + + sizeof(bch_acl_entry_short); + break; + + case ACL_USER: + value = (char *)value + sizeof(bch_acl_entry); + if ((char *)value > end) + goto fail; + acl->a_entries[n].e_uid = + make_kuid(&init_user_ns, + le32_to_cpu(entry->e_id)); + break; + case ACL_GROUP: + value = (char *)value + sizeof(bch_acl_entry); + if ((char *)value > end) + goto fail; + acl->a_entries[n].e_gid = + make_kgid(&init_user_ns, + le32_to_cpu(entry->e_id)); + break; + + default: + goto fail; + } + } + if (value != end) + goto fail; + return acl; + +fail: + posix_acl_release(acl); + return ERR_PTR(-EINVAL); +} + +/* + * Convert from in-memory to filesystem representation. 
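+ * The caller is responsible for kfree()ing the returned buffer.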
+ */ +static void *bch_acl_to_disk(const struct posix_acl *acl, size_t *size) +{ + bch_acl_header *ext_acl; + char *e; + size_t n; + + *size = bch_acl_size(acl->a_count); + ext_acl = kmalloc(sizeof(bch_acl_header) + acl->a_count * + sizeof(bch_acl_entry), GFP_KERNEL); + if (!ext_acl) + return ERR_PTR(-ENOMEM); + ext_acl->a_version = cpu_to_le32(BCH_ACL_VERSION); + e = (char *)ext_acl + sizeof(bch_acl_header); + for (n = 0; n < acl->a_count; n++) { + const struct posix_acl_entry *acl_e = &acl->a_entries[n]; + bch_acl_entry *entry = (bch_acl_entry *)e; + + entry->e_tag = cpu_to_le16(acl_e->e_tag); + entry->e_perm = cpu_to_le16(acl_e->e_perm); + switch (acl_e->e_tag) { + case ACL_USER: + entry->e_id = cpu_to_le32( + from_kuid(&init_user_ns, acl_e->e_uid)); + e += sizeof(bch_acl_entry); + break; + case ACL_GROUP: + entry->e_id = cpu_to_le32( + from_kgid(&init_user_ns, acl_e->e_gid)); + e += sizeof(bch_acl_entry); + break; + + case ACL_USER_OBJ: + case ACL_GROUP_OBJ: + case ACL_MASK: + case ACL_OTHER: + e += sizeof(bch_acl_entry_short); + break; + + default: + goto fail; + } + } + return (char *)ext_acl; + +fail: + kfree(ext_acl); + return ERR_PTR(-EINVAL); +} + +struct posix_acl *bch_get_acl(struct inode *inode, int type) +{ + struct cache_set *c = inode->i_sb->s_fs_info; + int name_index; + char *value = NULL; + struct posix_acl *acl; + int ret; + + switch (type) { + case ACL_TYPE_ACCESS: + name_index = BCH_XATTR_INDEX_POSIX_ACL_ACCESS; + break; + case ACL_TYPE_DEFAULT: + name_index = BCH_XATTR_INDEX_POSIX_ACL_DEFAULT; + break; + default: + BUG(); + } + ret = bch_xattr_get(c, inode->i_ino, "", NULL, 0, name_index); + if (ret > 0) { + value = kmalloc(ret, GFP_KERNEL); + if (!value) + return ERR_PTR(-ENOMEM); + ret = bch_xattr_get(c, inode->i_ino, "", value, + ret, name_index); + } + if (ret > 0) + acl = bch_acl_from_disk(value, ret); + else if (ret == -ENODATA || ret == -ENOSYS) + acl = NULL; + else + acl = ERR_PTR(ret); + kfree(value); + + if (!IS_ERR(acl)) + set_cached_acl(inode, type, acl); + + return acl; +} + +int bch_set_acl(struct inode *inode, struct posix_acl *acl, int type) +{ + int name_index; + void *value = NULL; + size_t size = 0; + int ret; + + switch (type) { + case ACL_TYPE_ACCESS: + name_index = BCH_XATTR_INDEX_POSIX_ACL_ACCESS; + if (acl) { + ret = posix_acl_equiv_mode(acl, &inode->i_mode); + if (ret < 0) + return ret; + else { + inode->i_ctime = CURRENT_TIME_SEC; + mark_inode_dirty(inode); + if (ret == 0) + acl = NULL; + } + } + break; + + case ACL_TYPE_DEFAULT: + name_index = BCH_XATTR_INDEX_POSIX_ACL_DEFAULT; + if (!S_ISDIR(inode->i_mode)) + return acl ? 
-EACCES : 0; + break; + + default: + return -EINVAL; + } + + if (acl) { + value = bch_acl_to_disk(acl, &size); + if (IS_ERR(value)) + return (int)PTR_ERR(value); + } + + ret = bch_xattr_set(inode, "", value, size, 0, name_index); + + kfree(value); + + if (ret == -ERANGE) + ret = -E2BIG; + + if (!ret) + set_cached_acl(inode, type, acl); + + return ret; +} + +int bch_init_acl(struct inode *inode, struct inode *dir) +{ + struct posix_acl *default_acl, *acl; + int ret; + + ret = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl); + if (ret) + return ret; + + if (default_acl) { + ret = bch_set_acl(inode, default_acl, ACL_TYPE_DEFAULT); + posix_acl_release(default_acl); + } + if (acl) { + if (!ret) + ret = bch_set_acl(inode, acl, ACL_TYPE_ACCESS); + posix_acl_release(acl); + } + return ret; +} diff --git a/drivers/md/bcache/acl.h b/drivers/md/bcache/acl.h new file mode 100644 index 000000000000..03f93fa0ff1b --- /dev/null +++ b/drivers/md/bcache/acl.h @@ -0,0 +1,57 @@ +/* + File: fs/bch/acl.h + + (C) 2001 Andreas Gruenbacher, <a.gruenbacher@computer.org> +*/ + +#include <linux/posix_acl_xattr.h> + +#define BCH_ACL_VERSION 0x0001 + +typedef struct { + __le16 e_tag; + __le16 e_perm; + __le32 e_id; +} bch_acl_entry; + +typedef struct { + __le16 e_tag; + __le16 e_perm; +} bch_acl_entry_short; + +typedef struct { + __le32 a_version; +} bch_acl_header; + +static inline size_t bch_acl_size(int count) +{ + if (count <= 4) { + return sizeof(bch_acl_header) + + count * sizeof(bch_acl_entry_short); + } else { + return sizeof(bch_acl_header) + + 4 * sizeof(bch_acl_entry_short) + + (count - 4) * sizeof(bch_acl_entry); + } +} + +static inline int bch_acl_count(size_t size) +{ + ssize_t s; + + size -= sizeof(bch_acl_header); + s = size - 4 * sizeof(bch_acl_entry_short); + if (s < 0) { + if (size % sizeof(bch_acl_entry_short)) + return -1; + return size / sizeof(bch_acl_entry_short); + } else { + if (s % sizeof(bch_acl_entry)) + return -1; + return s / sizeof(bch_acl_entry) + 4; + } +} + +extern struct posix_acl *bch_get_acl(struct inode *, int); +extern int bch_set_acl(struct inode *, struct posix_acl *, int); +extern int bch_init_acl(struct inode *, struct inode *); diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index 5aa2c2863c3c..b203e28c48ca 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -351,11 +351,12 @@ struct cache { }; struct gc_stat { - size_t nodes; - size_t key_bytes; + u64 nodes; + u64 key_bytes; + u64 nkeys; - size_t nkeys; - uint64_t data; /* sectors */ + u64 data; /* sectors */ + u64 inodes; }; /* @@ -384,6 +385,7 @@ enum { CACHE_SET_RO, CACHE_SET_GC_STOPPING, CACHE_SET_GC_FAILURE, + CACHE_SET_BDEV_MOUNTED, }; struct cache_member_rcu { @@ -404,6 +406,7 @@ struct cache_set { struct list_head list; struct kobject kobj; struct kobject internal; + struct completion *stop_completion; unsigned long flags; /* Counts outstanding writes, for clean transition to read-only */ @@ -423,10 +426,13 @@ struct cache_set { struct bio_set bio_split; + /* For punting bio submissions to workqueue, io.c */ struct bio_list bio_submit_list; struct work_struct bio_submit_work; spinlock_t bio_submit_lock; + struct backing_dev_info bdi; + /* BTREE CACHE */ struct bio_set btree_bio; @@ -481,9 +487,16 @@ struct cache_set { struct timer_list foreground_write_wakeup; + /* + * These contain all r/w devices - i.e. 
devices we can currently + * allocate from: + */ struct cache_group cache_all; struct cache_group cache_tiers[CACHE_TIERS]; + u64 capacity; /* sectors */ + atomic_long_t sectors_reserved; + atomic_long_t sectors_reserved_cache; struct mutex bucket_lock; @@ -567,6 +580,9 @@ struct cache_set { struct work_struct read_race_work; spinlock_t read_race_lock; + /* FILESYSTEM */ + atomic_long_t nr_inodes; + /* TIERING */ struct task_struct *tiering_read; struct bch_pd_controller tiering_pd; @@ -757,5 +773,7 @@ do { \ void bch_debug_exit(void); int bch_debug_init(void); +void bch_fs_exit(void); +int bch_fs_init(void); #endif /* _BCACHE_H */ diff --git a/drivers/md/bcache/bkey.h b/drivers/md/bcache/bkey.h index 1a82e57ab420..5bb19a700788 100644 --- a/drivers/md/bcache/bkey.h +++ b/drivers/md/bcache/bkey.h @@ -493,6 +493,10 @@ BKEY_VAL_ACCESSORS(extent, BCH_EXTENT); BKEY_VAL_ACCESSORS(inode, BCH_INODE_FS); BKEY_VAL_ACCESSORS(inode_blockdev, BCH_INODE_BLOCKDEV); +BKEY_VAL_ACCESSORS(dirent, BCH_DIRENT); + +BKEY_VAL_ACCESSORS(xattr, BCH_XATTR); + /* byte order helpers */ #if !defined(__LITTLE_ENDIAN) && !defined(__BIG_ENDIAN) diff --git a/drivers/md/bcache/bkey_methods.c b/drivers/md/bcache/bkey_methods.c index 90e1c9e7df38..03affccac1ce 100644 --- a/drivers/md/bcache/bkey_methods.c +++ b/drivers/md/bcache/bkey_methods.c @@ -2,12 +2,16 @@ #include "bcache.h" #include "bkey_methods.h" #include "btree.h" +#include "dirent.h" #include "extents.h" #include "inode.h" +#include "xattr.h" static const struct bkey_ops *bch_bkey_ops[] = { [BKEY_TYPE_EXTENTS] = &bch_bkey_extent_ops, [BKEY_TYPE_INODES] = &bch_bkey_inode_ops, + [BKEY_TYPE_DIRENTS] = &bch_bkey_dirent_ops, + [BKEY_TYPE_XATTRS] = &bch_bkey_xattr_ops, [BKEY_TYPE_BTREE] = &bch_bkey_btree_ops, }; diff --git a/drivers/md/bcache/buckets.h b/drivers/md/bcache/buckets.h index 3644e7e110ab..cd58d86af3bb 100644 --- a/drivers/md/bcache/buckets.h +++ b/drivers/md/bcache/buckets.h @@ -229,26 +229,27 @@ static inline size_t buckets_free_cache(struct cache *ca, return __buckets_free_cache(ca, bch_bucket_stats_read(ca), reserve); } -static inline u64 cache_sectors_used(struct cache *ca) -{ - struct bucket_stats stats = bch_bucket_stats_read(ca); - - return (stats.buckets_meta << ca->bucket_bits) + - stats.sectors_dirty; -} - -static inline bool cache_set_full(struct cache_set *c) +static inline u64 cache_set_sectors_used(struct cache_set *c) { struct cache *ca; unsigned i; u64 used = 0; rcu_read_lock(); - for_each_cache_rcu(ca, c, i) - used += cache_sectors_used(ca); + for_each_cache_rcu(ca, c, i) { + struct bucket_stats stats = bch_bucket_stats_read(ca); + + used += (stats.buckets_meta << ca->bucket_bits) + + stats.sectors_dirty; + } rcu_read_unlock(); - return used >= c->capacity; + return min(c->capacity, used + atomic_long_read(&c->sectors_reserved)); +} + +static inline bool cache_set_full(struct cache_set *c) +{ + return cache_set_sectors_used(c) >= c->capacity; } static inline bool is_available_bucket(struct bucket_mark mark) diff --git a/drivers/md/bcache/debug.c b/drivers/md/bcache/debug.c index b0d22579ea0b..967420d7c078 100644 --- a/drivers/md/bcache/debug.c +++ b/drivers/md/bcache/debug.c @@ -10,6 +10,7 @@ #include "buckets.h" #include "debug.h" #include "extents.h" +#include "inode.h" #include "io.h" #include "super.h" @@ -182,6 +183,95 @@ out_put: bio_put(check); } +void bch_verify_inode_refs(struct cache_set *c) +{ + struct btree_iter iter; + struct bkey_s_c k; + struct bkey_i_inode inode; + u64 cur_inum = 0; + char buf[100]; + + 
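/* Walk every extent in order, checking that it belongs to an existing regular file or symlink and doesn't extend past that inode's i_size (rounded up to a full page): */ +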
for_each_btree_key(&iter, c, BTREE_ID_EXTENTS, + POS(BCACHE_ROOT_INO, 0), k) { + if (k.k->type == KEY_TYPE_DISCARD) + continue; + + if (k.k->p.inode != cur_inum && + bch_inode_find_by_inum(c, k.k->p.inode, &inode)) { + bch_bkey_val_to_text(c, iter.nodes[0], buf, + sizeof(buf), k); + bch_cache_set_error(c, + "extent for missing inode %llu\n%s", + k.k->p.inode, buf); + bch_btree_iter_unlock(&iter); + return; + } + + cur_inum = k.k->p.inode; + + if (!S_ISREG(inode.v.i_mode) && + !S_ISLNK(inode.v.i_mode)) + bch_cache_set_error(c, + "extent for non regular file, inode %llu mode %u", + k.k->p.inode, inode.v.i_mode); + + BUG_ON(inode.v.i_flags & BCH_INODE_I_SIZE_DIRTY); + + if (k.k->p.offset > round_up(inode.v.i_size, PAGE_SIZE) >> 9) { + bch_bkey_val_to_text(c, iter.nodes[0], buf, + sizeof(buf), k); + bch_cache_set_error(c, + "extent past end of inode %llu: i_size %llu extent\n%s", + k.k->p.inode, inode.v.i_size, buf); + } + } + bch_btree_iter_unlock(&iter); + + for_each_btree_key(&iter, c, BTREE_ID_DIRENTS, + POS(BCACHE_ROOT_INO, 0), k) { + /* XXX: skipping whiteouts for now */ + if (k.k->type != BCH_DIRENT) + continue; + + if (k.k->p.inode != cur_inum && + bch_inode_find_by_inum(c, k.k->p.inode, &inode)) { + bch_cache_set_error(c, "dirent for missing inode %llu", + k.k->p.inode); + bch_btree_iter_unlock(&iter); + return; + } + + cur_inum = k.k->p.inode; + + if (!S_ISDIR(inode.v.i_mode)) + bch_cache_set_error(c, + "dirent for non directory, inode %llu mode %u", + k.k->p.inode, inode.v.i_mode); + } + bch_btree_iter_unlock(&iter); + + for_each_btree_key(&iter, c, BTREE_ID_XATTRS, + POS(BCACHE_ROOT_INO, 0), k) { + if (k.k->p.inode != cur_inum && + bch_inode_find_by_inum(c, k.k->p.inode, &inode)) { + bch_cache_set_error(c, + "xattr for missing inode %llu", + k.k->p.inode); + bch_btree_iter_unlock(&iter); + return; + } + + cur_inum = k.k->p.inode; + + if (!S_ISREG(inode.v.i_mode) && + !S_ISDIR(inode.v.i_mode)) + bch_cache_set_error(c, + "xattr for non file/directory, inode %llu mode %u", + k.k->p.inode, inode.v.i_mode); + } + bch_btree_iter_unlock(&iter); +} + #endif #ifdef CONFIG_DEBUG_FS diff --git a/drivers/md/bcache/debug.h b/drivers/md/bcache/debug.h index da35861aa3cb..b3cbb0bd9cd3 100644 --- a/drivers/md/bcache/debug.h +++ b/drivers/md/bcache/debug.h @@ -10,6 +10,7 @@ struct cache_set; void bch_btree_verify(struct cache_set *, struct btree *); void bch_data_verify(struct cached_dev *, struct bio *); +void bch_verify_inode_refs(struct cache_set *); #define expensive_debug_checks(c) ((c)->expensive_debug_checks) #define key_merging_disabled(c) ((c)->key_merging_disabled) @@ -19,6 +20,7 @@ void bch_data_verify(struct cached_dev *, struct bio *); static inline void bch_btree_verify(struct cache_set *c, struct btree *b) {} static inline void bch_data_verify(struct cached_dev *dc, struct bio *bio) {} +static inline void bch_verify_inode_refs(struct cache_set *c) {} #define expensive_debug_checks(c) 0 #define key_merging_disabled(c) 0 diff --git a/drivers/md/bcache/dirent.c b/drivers/md/bcache/dirent.c new file mode 100644 index 000000000000..999538c71391 --- /dev/null +++ b/drivers/md/bcache/dirent.c @@ -0,0 +1,379 @@ + +#include "bcache.h" +#include "btree.h" +#include "extents.h" +#include "dirent.h" +#include "keylist.h" +#include "siphash.h" + +#include "linux/crc32c.h" +#include "linux/cryptohash.h" + +#if 0 +static u64 bch_dirent_hash(const struct qstr *name) +{ + union { + u32 b[SHA_DIGEST_WORDS]; + u64 ret; + } digest; + + unsigned done = 0; + + sha_init(digest.b); + + while (done < 
name->len) { + u32 workspace[SHA_WORKSPACE_WORDS]; + u8 message[SHA_MESSAGE_BYTES]; + unsigned bytes = min_t(unsigned, name->len - done, + SHA_MESSAGE_BYTES); + + memcpy(message, name->name + done, bytes); + memset(message + bytes, 0, SHA_MESSAGE_BYTES - bytes); + sha_transform(digest.b, message, workspace); + done += bytes; + } + + /* [0,2) reserved for dots */ + + return (digest.ret >= 2 ? digest.ret : 2) & S64_MAX; +} + +static const SIPHASH_KEY bch_siphash_key; + +static u64 bch_dirent_hash(const struct qstr *name) +{ + u64 hash = SipHash24(&bch_siphash_key, + name->name, name->len) >> 1; + + /* [0,2) reserved for dots */ + + return (hash >= 2 ? hash : 2); +} +#endif + +static u64 bch_dirent_hash(const struct qstr *name) +{ + u64 hash = crc32c(0, name->name, name->len); + + /* [0,2) reserved for dots */ + + return (hash >= 2 ? hash : 2); +} + +static unsigned dirent_name_bytes(struct bkey_s_c_dirent d) +{ + unsigned len = bkey_val_bytes(d.k) - sizeof(struct bch_dirent); + + while (len && !d.v->d_name[len - 1]) + --len; + + return len; +} + +static int dirent_cmp(struct bkey_s_c_dirent d, + const struct qstr *q) +{ + int len = dirent_name_bytes(d); + + return len - q->len ?: memcmp(d.v->d_name, q->name, len); +} + +static bool bch_dirent_invalid(const struct cache_set *c, struct bkey_s_c k) +{ + switch (k.k->type) { + case BCH_DIRENT: + if (bkey_val_bytes(k.k) < sizeof(struct bch_dirent)) + return true; + + return false; + case BCH_DIRENT_WHITEOUT: + if (bkey_val_bytes(k.k)) + return true; + + return false; + default: + return true; + } +} + +static void bch_dirent_to_text(struct cache_set *c, char *buf, + size_t size, struct bkey_s_c k) +{ + struct bkey_s_c_dirent d; + + switch (k.k->type) { + case BCH_DIRENT: + d = bkey_s_c_to_dirent(k); + + if (size) { + unsigned n = min_t(unsigned, size, + dirent_name_bytes(d)); + memcpy(buf, d.v->d_name, n); + buf[size - 1] = '\0'; + buf += n; + size -= n; + } + + scnprintf(buf, size, " -> %llu", d.v->d_inum); + break; + case BCH_DIRENT_WHITEOUT: + scnprintf(buf, size, "whiteout"); + break; + } +} + +const struct btree_keys_ops bch_dirent_ops = { +}; + +const struct bkey_ops bch_bkey_dirent_ops = { + .key_invalid = bch_dirent_invalid, + .val_to_text = bch_dirent_to_text, +}; + +static int __bch_dirent_create(struct cache_set *c, u64 dir_inum, + u8 type, const struct qstr *name, + u64 dst_inum, bool update, + u64 *journal_seq) +{ + struct btree_iter iter; + struct bkey_s_c k; + struct keylist keys; + struct bkey_i_dirent *dirent; + unsigned u64s = BKEY_U64s + + DIV_ROUND_UP(sizeof(struct bch_dirent) + name->len, + sizeof(u64)); + int ret = -ENOENT; + + bch_keylist_init(&keys); + + if (bch_keylist_realloc(&keys, u64s)) + return -ENOMEM; + + dirent = bkey_dirent_init(keys.top); + dirent->k.u64s = u64s; + dirent->v.d_inum = dst_inum; + dirent->v.d_type = type; + + memcpy(dirent->v.d_name, name->name, name->len); + memset(dirent->v.d_name + name->len, 0, + bkey_val_bytes(&dirent->k) - + (sizeof(struct bch_dirent) + name->len)); + + BUG_ON(dirent_name_bytes(dirent_i_to_s_c(dirent)) != name->len); + BUG_ON(dirent_cmp(dirent_i_to_s_c(dirent), name)); + + bch_keylist_enqueue(&keys); + + bch_btree_iter_init_intent(&iter, c, BTREE_ID_DIRENTS, + POS(dir_inum, bch_dirent_hash(name))); + + while ((k = bch_btree_iter_peek_with_holes(&iter)).k) { + /* hole? 
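no live dirent at this hash position 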
*/ + if (k.k->type != BCH_DIRENT) { + if (!update) + goto insert; + break; + } + + if (!dirent_cmp(bkey_s_c_to_dirent(k), name)) { + /* found: */ + if (!update) { + ret = -EEXIST; + break; + } +insert: + dirent->k.p = k.k->p; + + ret = bch_btree_insert_at(&iter, &keys, NULL, + journal_seq, + BTREE_INSERT_ATOMIC); + if (ret != -EINTR && ret != -EAGAIN) + break; + } else { + /* collision */ + bch_btree_iter_advance_pos(&iter); + } + } + bch_btree_iter_unlock(&iter); + bch_keylist_free(&keys); + + return ret; +} + +int bch_dirent_create(struct cache_set *c, u64 dir_inum, u8 type, + const struct qstr *name, u64 dst_inum, + u64 *journal_seq) +{ + return __bch_dirent_create(c, dir_inum, type, + name, dst_inum, false, + journal_seq); +} + +int bch_dirent_update(struct cache_set *c, u64 dir_inum, + const struct qstr *name, u64 dst_inum, + u64 *journal_seq) +{ + return __bch_dirent_create(c, dir_inum, DT_UNKNOWN, + name, dst_inum, true, + journal_seq); +} + +int bch_dirent_delete(struct cache_set *c, u64 dir_inum, + const struct qstr *name) +{ + struct btree_iter iter; + struct bkey_s_c k; + u64 hash = bch_dirent_hash(name); + int ret = -ENOENT; + + pr_debug("deleting %llu:%llu (%s)", + dir_inum, hash, name->name); + + bch_btree_iter_init_intent(&iter, c, BTREE_ID_DIRENTS, + POS(dir_inum, bch_dirent_hash(name))); + + while ((k = bch_btree_iter_peek_with_holes(&iter)).k) { + switch (k.k->type) { + case BCH_DIRENT: + if (!dirent_cmp(bkey_s_c_to_dirent(k), name)) { + struct bkey_i delete; + + bkey_init(&delete.k); + delete.k.p = k.k->p; + delete.k.type = BCH_DIRENT_WHITEOUT; + + ret = bch_btree_insert_at(&iter, + &keylist_single(&delete), + NULL, NULL, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_ATOMIC); + if (ret == -EINTR || ret == -EAGAIN) + continue; + } + break; + case BCH_DIRENT_WHITEOUT: + break; + default: + /* hole, not found */ + goto out; + } + + bch_btree_iter_advance_pos(&iter); + } +out: + bch_btree_iter_unlock(&iter); + + return ret; +} + +u64 bch_dirent_lookup(struct cache_set *c, u64 dir_inum, + const struct qstr *name) +{ + struct btree_iter iter; + struct bkey_s_c k; + struct bkey_s_c_dirent dirent; + u64 hash = bch_dirent_hash(name); + + pr_debug("searching for %llu:%llu (%s)", + dir_inum, hash, name->name); + + for_each_btree_key_with_holes(&iter, c, BTREE_ID_DIRENTS, + POS(dir_inum, bch_dirent_hash(name)), k) { + switch (k.k->type) { + case BCH_DIRENT: + dirent = bkey_s_c_to_dirent(k); + + /* collision? 
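could be another name that hashed to the same slot 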
*/ + if (!dirent_cmp(dirent, name)) { + u64 inum = dirent.v->d_inum; + + bch_btree_iter_unlock(&iter); + pr_debug("found %s: %llu", name->name, inum); + return inum; + } + break; + case BCH_DIRENT_WHITEOUT: + break; + default: + /* hole, not found */ + goto out; + } + } +out: + bch_btree_iter_unlock(&iter); + + pr_debug("%s not found", name->name); + return 0; +} + +int bch_empty_dir(struct cache_set *c, u64 dir_inum) +{ + struct btree_iter iter; + struct bkey_s_c k; + int ret = 0; + + for_each_btree_key(&iter, c, BTREE_ID_DIRENTS, POS(dir_inum, 0), k) { + if (k.k->p.inode > dir_inum) + break; + + if (k.k->type == BCH_DIRENT) { + ret = -ENOTEMPTY; + break; + } + + } + bch_btree_iter_unlock(&iter); + + return ret; +} + +int bch_readdir(struct file *file, struct dir_context *ctx) +{ + struct inode *inode = file_inode(file); + struct super_block *sb = inode->i_sb; + struct cache_set *c = sb->s_fs_info; + struct btree_iter iter; + struct bkey_s_c k; + struct bkey_s_c_dirent dirent; + unsigned len; + + if (!dir_emit_dots(file, ctx)) + return 0; + + pr_debug("listing for %lu from %llu", inode->i_ino, ctx->pos); + + for_each_btree_key(&iter, c, BTREE_ID_DIRENTS, + POS(inode->i_ino, ctx->pos), k) { + if (k.k->type != BCH_DIRENT) + continue; + + dirent = bkey_s_c_to_dirent(k); + + pr_debug("saw %llu:%llu (%s) -> %llu", + k.k->p.inode, k.k->p.offset, + dirent.v->d_name, dirent.v->d_inum); + + if (bkey_cmp(k.k->p, POS(inode->i_ino, ctx->pos)) < 0) + continue; + + if (k.k->p.inode > inode->i_ino) + break; + + len = dirent_name_bytes(dirent); + + pr_debug("emitting %s", dirent.v->d_name); + + /* + * XXX: dir_emit() can fault and block, while we're holding + * locks + */ + if (!dir_emit(ctx, dirent.v->d_name, len, + dirent.v->d_inum, dirent.v->d_type)) + break; + + ctx->pos = k.k->p.offset + 1; + } + bch_btree_iter_unlock(&iter); + + return 0; +} diff --git a/drivers/md/bcache/dirent.h b/drivers/md/bcache/dirent.h new file mode 100644 index 000000000000..4de22a53c875 --- /dev/null +++ b/drivers/md/bcache/dirent.h @@ -0,0 +1,21 @@ +#ifndef _BCACHE_DIRENT_H +#define _BCACHE_DIRENT_H + +extern const struct btree_keys_ops bch_dirent_ops; +extern const struct bkey_ops bch_bkey_dirent_ops; + +struct qstr; +struct file; +struct dir_context; +struct cache_set; + +int bch_dirent_create(struct cache_set *, u64, u8, const struct qstr *, + u64, u64 *); +int bch_dirent_update(struct cache_set *, u64, const struct qstr *, u64, u64 *); +int bch_dirent_delete(struct cache_set *, u64, const struct qstr *); +u64 bch_dirent_lookup(struct cache_set *, u64, const struct qstr *); +int bch_empty_dir(struct cache_set *, u64); +int bch_readdir(struct file *, struct dir_context *); + +#endif /* _BCACHE_DIRENT_H */ + diff --git a/drivers/md/bcache/extents.c b/drivers/md/bcache/extents.c index 077164e271b5..55ee8043b9b8 100644 --- a/drivers/md/bcache/extents.c +++ b/drivers/md/bcache/extents.c @@ -8,12 +8,14 @@ #include "bcache.h" #include "btree.h" #include "debug.h" +#include "dirent.h" #include "extents.h" #include "gc.h" #include "inode.h" #include "journal.h" #include "super.h" #include "writeback.h" +#include "xattr.h" #include <trace/events/bcache.h> @@ -1822,4 +1824,6 @@ const struct bkey_ops bch_bkey_extent_ops = { const struct btree_keys_ops *bch_btree_ops[] = { [BTREE_ID_EXTENTS] = &bch_extent_ops, [BTREE_ID_INODES] = &bch_inode_ops, + [BTREE_ID_DIRENTS] = &bch_dirent_ops, + [BTREE_ID_XATTRS] = &bch_xattr_ops, }; diff --git a/drivers/md/bcache/fs-gc.c b/drivers/md/bcache/fs-gc.c new file mode 100644 index 
000000000000..47e7a7f093e0 --- /dev/null +++ b/drivers/md/bcache/fs-gc.c @@ -0,0 +1,202 @@ + +#include "bcache.h" +#include "btree.h" +#include "dirent.h" +#include "fs.h" +#include "inode.h" +#include "keylist.h" +#include "super.h" + +#define INODES_PER_ITER (1 << 24) + +struct nlink { + u32 count; + u32 dir_count; +}; + +static void inc_link(u64 pos, struct nlink *links, bool *need_loop, + u64 inum, unsigned count, bool dir) +{ + if (inum >= pos + INODES_PER_ITER) { + *need_loop = true; + } else if (inum >= pos) { + if (dir) + links[inum - pos].dir_count += count; + else + links[inum - pos].count += count; + } +} + +/* + * XXX: should do a DFS (via filesystem hierarchy), and make sure all dirents + * are reachable + */ + +noinline_for_stack +static int bch_gc_walk_dirents(struct cache_set *c, u64 pos, + struct nlink *links, bool *need_loop) +{ + struct btree_iter iter; + struct bkey_s_c k; + struct bkey_s_c_dirent d; + + *need_loop = false; + memset(links, 0, INODES_PER_ITER * sizeof(*links)); + + inc_link(pos, links, need_loop, BCACHE_ROOT_INO, 2, false); + + for_each_btree_key(&iter, c, BTREE_ID_DIRENTS, POS_MIN, k) { + switch (k.k->type) { + case BCH_DIRENT: + d = bkey_s_c_to_dirent(k); + + if (d.v->d_type == DT_DIR) { + inc_link(pos, links, need_loop, + d.v->d_inum, 2, false); + inc_link(pos, links, need_loop, + d.k->p.inode, 1, true); + } else { + inc_link(pos, links, need_loop, + d.v->d_inum, 1, false); + } + + break; + } + + bch_btree_iter_cond_resched(&iter); + } + return bch_btree_iter_unlock(&iter); +} + +static int bch_gc_do_inode(struct cache_set *c, struct btree_iter *iter, + struct bkey_s_c_inode inode, struct nlink link) +{ + struct bkey_i_inode update; + int ret; + + cache_set_err_on(inode.v->i_nlink < link.count, c, + "i_link too small (%u < %u, type %i)", + inode.v->i_nlink, link.count + link.dir_count, + mode_to_type(inode.v->i_mode)); + + if (!link.count) { + cache_set_err_on(S_ISDIR(inode.v->i_mode) && + bch_empty_dir(c, inode.k->p.inode), c, + "non empty directory with link count 0, inode nlink %u, dir links found %u", + inode.v->i_nlink, link.dir_count); + pr_info("deleting inum %llu", inode.k->p.inode); + + bch_btree_iter_unlock(iter); + return bch_inode_rm(c, inode.k->p.inode); + } + + if (inode.v->i_flags & BCH_INODE_I_SIZE_DIRTY) { + pr_info("truncating inode %llu", inode.k->p.inode); + + /* + * XXX: need to truncate partial blocks too here - or ideally + * just switch units to bytes and that issue goes away + */ + + ret = bch_inode_truncate(c, inode.k->p.inode, + round_up(inode.v->i_size, PAGE_SIZE) >> 9); + if (ret) + return ret; + } + + if (inode.v->i_nlink != link.count + link.dir_count || + inode.v->i_flags & BCH_INODE_I_SIZE_DIRTY) { + if (inode.v->i_nlink != link.count + link.dir_count) + pr_info("setting inum %llu nlinks from %u to %u", + inode.k->p.inode, inode.v->i_nlink, + link.count + link.dir_count); + + bkey_reassemble(&update.k_i, inode.s_c); + update.v.i_nlink = link.count + link.dir_count; + update.v.i_flags &= ~BCH_INODE_I_SIZE_DIRTY; + + return bch_btree_insert_at(iter, + &keylist_single(&update.k_i), + NULL, NULL, + BTREE_INSERT_ATOMIC); + } + + return 0; +} + +noinline_for_stack +static int bch_gc_walk_inodes(struct cache_set *c, u64 pos, struct nlink *links) +{ + struct btree_iter iter; + struct bkey_s_c k; + int ret = 0; + u64 i = 0; + + bch_btree_iter_init(&iter, c, BTREE_ID_INODES, POS(pos, 0)); + + while ((k = bch_btree_iter_peek(&iter)).k) { + if (k.k->p.inode - pos >= INODES_PER_ITER) + break; + + while (i < k.k->p.inode - pos) { +
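/* any inum in this gap that dirents pointed at is a missing inode: */ +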
cache_set_err_on(links[i].count, c, + "missing inode %llu", + pos + i); + i++; + } + + switch (k.k->type) { + case BCH_INODE_FS: + ret = bch_gc_do_inode(c, &iter, + bkey_s_c_to_inode(k), + links[i]); + if (ret == -EAGAIN || ret == -EINTR) + continue; + if (ret) + goto out; + + break; + default: + cache_set_err_on(links[i].count, c, + "missing inode %llu", + pos + i); + break; + } + + if (links[i].count) + atomic_long_inc(&c->nr_inodes); + + bch_btree_iter_advance_pos(&iter); + i++; + bch_btree_iter_cond_resched(&iter); + } +out: + return bch_btree_iter_unlock(&iter) ?: ret; +} + +int bch_gc_inode_nlinks(struct cache_set *c) +{ + bool need_loop = false; + u64 pos = 0; + struct nlink *links = vmalloc(INODES_PER_ITER * sizeof(*links)); + int ret = 0; + + if (!links) + return -ENOMEM; + + do { + ret = bch_gc_walk_dirents(c, pos, links, &need_loop); + if (ret) + break; + + ret = bch_gc_walk_inodes(c, pos, links); + if (ret) + break; + + pos += INODES_PER_ITER; + } while (need_loop); + + vfree(links); + + return ret; +} diff --git a/drivers/md/bcache/fs-gc.h b/drivers/md/bcache/fs-gc.h new file mode 100644 index 000000000000..4fb5728820ea --- /dev/null +++ b/drivers/md/bcache/fs-gc.h @@ -0,0 +1,6 @@ +#ifndef _BCACHE_FS_GC_H +#define _BCACHE_FS_GC_H + +int bch_gc_inode_nlinks(struct cache_set *); + +#endif /* _BCACHE_FS_GC_H */ diff --git a/drivers/md/bcache/fs.c b/drivers/md/bcache/fs.c new file mode 100644 index 000000000000..0d04efe9c40c --- /dev/null +++ b/drivers/md/bcache/fs.c @@ -0,0 +1,2087 @@ + +#include "bcache.h" +#include "acl.h" +#include "btree.h" +#include "buckets.h" +#include "dirent.h" +#include "extents.h" +#include "fs.h" +#include "inode.h" +#include "io.h" +#include "journal.h" +#include "super.h" +#include "xattr.h" + +#include <linux/aio.h> +#include <linux/compat.h> +#include <linux/migrate.h> +#include <linux/module.h> +#include <linux/mount.h> +#include <linux/parser.h> +#include <linux/statfs.h> +#include <linux/task_io_accounting_ops.h> +#include <linux/uio.h> +#include <linux/writeback.h> +#include <linux/xattr.h> + +/* + * our page flags: + * + * allocated - page has space on disk reserved for it (-ENOSPC was checked then, + * shouldn't be checked later) + * + * corresponds to c->sectors_reserved + * + * append - page is dirty from an append write, new i_size can't be written + * until after page is written + * + * corresponds to ei->append_count + */ + +#define PF_ANY(page, enforce) page +PAGEFLAG(Allocated, private, PF_ANY) +TESTSCFLAG(Allocated, private, PF_ANY) + +PAGEFLAG(Append, private_2, PF_ANY) +TESTSCFLAG(Append, private_2, PF_ANY) +#undef PF_ANY + +static struct bio_set *bch_fs_bioset; +static struct kmem_cache *bch_inode_cache; +static DECLARE_WAIT_QUEUE_HEAD(bch_append_wait); + +static void bch_inode_init(struct bch_inode_info *); +static int bch_read_single_page(struct page *, struct address_space *); + +#define SECTORS_CACHE 1024 + +static int reserve_sectors(struct cache_set *c, unsigned sectors) +{ + if (likely(atomic_long_sub_return(sectors, + &c->sectors_reserved_cache) >= 0)) + return 0; + + atomic_long_add(SECTORS_CACHE, &c->sectors_reserved); + + if (likely(!cache_set_full(c))) { + atomic_long_add(SECTORS_CACHE, &c->sectors_reserved_cache); + return 0; + } + + atomic_long_sub_bug(SECTORS_CACHE, &c->sectors_reserved); + atomic_long_add(sectors, &c->sectors_reserved_cache); + return -ENOSPC; +} + +static void bch_append_put(struct bch_inode_info *ei) +{ + if (atomic_long_dec_and_test(&ei->append_count)) + wake_up(&bch_append_wait); +} + +static 
void bch_clear_page_bits(struct cache_set *c, struct bch_inode_info *ei, + struct page *page) +{ + if (TestClearPageAllocated(page)) + atomic_long_sub_bug(PAGE_SECTORS, &c->sectors_reserved); + + if (TestClearPageAppend(page)) + bch_append_put(ei); +} + +static int __bch_write_inode(struct inode *inode) +{ + struct cache_set *c = inode->i_sb->s_fs_info; + struct bch_inode_info *ei = to_bch_ei(inode); + struct bch_inode *bi = &ei->inode.v; + + lockdep_assert_held(&ei->update_lock); + BUG_ON(ei->inode.k.p.inode != inode->i_ino); + BUG_ON(ei->inode.k.type != BCH_INODE_FS); + + if (!atomic_long_read(&ei->append_count)) { + bi->i_flags &= ~BCH_INODE_I_SIZE_DIRTY; + bi->i_size = inode->i_size; + } + + bi->i_mode = inode->i_mode; + bi->i_uid = i_uid_read(inode); + bi->i_gid = i_gid_read(inode); + bi->i_nlink = inode->i_nlink; + bi->i_dev = inode->i_rdev; + bi->i_atime = timespec_to_ns(&inode->i_atime); + bi->i_mtime = timespec_to_ns(&inode->i_mtime); + bi->i_ctime = timespec_to_ns(&inode->i_ctime); + + return bch_inode_update(c, &ei->inode.k_i, NULL, &ei->journal_seq); +} + +static struct inode *bch_vfs_inode_get(struct super_block *sb, u64 inum) +{ + struct cache_set *c = sb->s_fs_info; + struct bch_inode_info *ei; + struct inode *inode; + int ret; + + pr_debug("inum %llu", inum); + + inode = iget_locked(sb, inum); + if (unlikely(!inode)) + return ERR_PTR(-ENOMEM); + if (!(inode->i_state & I_NEW)) + return inode; + + ei = to_bch_ei(inode); + + ret = bch_inode_find_by_inum(c, inum, &ei->inode); + if (unlikely(ret)) { + iget_failed(inode); + return ERR_PTR(ret); + } + + bch_inode_init(ei); + unlock_new_inode(inode); + + return inode; +} + +static void bch_set_inode_flags(struct inode *inode) +{ + unsigned flags = to_bch_ei(inode)->inode.v.i_flags; + + inode->i_flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME); + if (flags & FS_SYNC_FL) + inode->i_flags |= S_SYNC; + if (flags & FS_APPEND_FL) + inode->i_flags |= S_APPEND; + if (flags & FS_IMMUTABLE_FL) + inode->i_flags |= S_IMMUTABLE; + if (flags & FS_NOATIME_FL) + inode->i_flags |= S_NOATIME; +} + +static struct inode *bch_vfs_inode_create(struct cache_set *c, + struct inode *parent, + umode_t mode, dev_t rdev) +{ + struct inode *inode; + struct bch_inode_info *ei; + struct bch_inode *bi; + struct timespec ts = CURRENT_TIME; + s64 now = timespec_to_ns(&ts); + int ret; + + inode = new_inode(parent->i_sb); + if (unlikely(!inode)) + return ERR_PTR(-ENOMEM); + + inode_init_owner(inode, parent, mode); + + ei = to_bch_ei(inode); + + bi = &bkey_inode_init(&ei->inode.k_i)->v; + bi->i_uid = i_uid_read(inode); + bi->i_gid = i_gid_read(inode); + + bi->i_mode = inode->i_mode; + bi->i_dev = rdev; + bi->i_atime = now; + bi->i_mtime = now; + bi->i_ctime = now; + bi->i_nlink = S_ISDIR(mode) ? 
2 : 1; + + ret = bch_inode_create(c, &ei->inode.k_i, + BLOCKDEV_INODE_MAX, 0, + &c->unused_inode_hint); + if (unlikely(ret)) { + /* + * indicate to bch_evict_inode that the inode was never actually + * created: + */ + bkey_init(&ei->inode.k); + goto err; + } + + bch_inode_init(ei); + + ret = bch_init_acl(inode, parent); + if (unlikely(ret)) + goto err; + + insert_inode_hash(inode); + atomic_long_inc(&c->nr_inodes); + + return inode; +err: + clear_nlink(inode); + iput(inode); + return ERR_PTR(ret); +} + +static int bch_vfs_dirent_create(struct cache_set *c, struct inode *dir, + u8 type, const struct qstr *name, + struct inode *dst) +{ + struct bch_inode_info *ei = to_bch_ei(dst); + int ret; + + ret = bch_dirent_create(c, dir->i_ino, type, name, + dst->i_ino, &ei->journal_seq); + if (unlikely(ret)) + return ret; + + dir->i_mtime = dir->i_ctime = CURRENT_TIME; + mark_inode_dirty_sync(dir); + return 0; +} + +static int __bch_create(struct inode *dir, struct dentry *dentry, + umode_t mode, dev_t rdev) +{ + struct cache_set *c = dir->i_sb->s_fs_info; + struct inode *inode; + int ret; + + inode = bch_vfs_inode_create(c, dir, mode, rdev); + if (unlikely(IS_ERR(inode))) + return PTR_ERR(inode); + + ret = bch_vfs_dirent_create(c, dir, mode_to_type(mode), + &dentry->d_name, inode); + if (unlikely(ret)) { + clear_nlink(inode); + iput(inode); + return ret; + } + + d_instantiate(dentry, inode); + return 0; +} + +/* methods */ + +static struct dentry *bch_lookup(struct inode *dir, struct dentry *dentry, + unsigned int flags) +{ + struct cache_set *c = dir->i_sb->s_fs_info; + struct inode *inode = NULL; + u64 inum; + + inum = bch_dirent_lookup(c, dir->i_ino, &dentry->d_name); + + if (inum) + inode = bch_vfs_inode_get(dir->i_sb, inum); + + return d_splice_alias(inode, dentry); +} + +static int bch_create(struct inode *dir, struct dentry *dentry, + umode_t mode, bool excl) +{ + return __bch_create(dir, dentry, mode|S_IFREG, 0); +} + +static int bch_link(struct dentry *old_dentry, struct inode *dir, + struct dentry *dentry) +{ + struct cache_set *c = dir->i_sb->s_fs_info; + struct inode *inode = old_dentry->d_inode; + struct bch_inode_info *ei = to_bch_ei(inode); + int ret; + + lockdep_assert_held(&inode->i_rwsem); + + mutex_lock(&ei->update_lock); + inode->i_ctime = CURRENT_TIME; + inc_nlink(inode); + __bch_write_inode(inode); + mutex_unlock(&ei->update_lock); + + ihold(inode); + + ret = bch_vfs_dirent_create(c, dir, mode_to_type(inode->i_mode), + &dentry->d_name, inode); + if (unlikely(ret)) { + inode_dec_link_count(inode); + iput(inode); + return ret; + } + + d_instantiate(dentry, inode); + return 0; +} + +static int bch_unlink(struct inode *dir, struct dentry *dentry) +{ + struct cache_set *c = dir->i_sb->s_fs_info; + struct inode *inode = dentry->d_inode; + int ret; + + lockdep_assert_held(&inode->i_rwsem); + + ret = bch_dirent_delete(c, dir->i_ino, &dentry->d_name); + if (ret) + return ret; + + inode->i_ctime = dir->i_ctime; + inode_dec_link_count(inode); + + return 0; +} + +static int bch_symlink(struct inode *dir, struct dentry *dentry, + const char *symname) +{ + struct cache_set *c = dir->i_sb->s_fs_info; + struct inode *inode; + int ret; + + inode = bch_vfs_inode_create(c, dir, S_IFLNK|S_IRWXUGO, 0); + if (unlikely(IS_ERR(inode))) + return PTR_ERR(inode); + + inode_lock(inode); + ret = page_symlink(inode, symname, strlen(symname) + 1); + inode_unlock(inode); + + if (unlikely(ret)) + goto err; + + ret = bch_vfs_dirent_create(c, dir, DT_LNK, &dentry->d_name, inode); + if (unlikely(ret)) + goto err; 
+ + d_instantiate(dentry, inode); + return 0; +err: + clear_nlink(inode); + iput(inode); + return ret; +} + +static int bch_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) +{ + int ret; + + lockdep_assert_held(&dir->i_rwsem); + + inode_inc_link_count(dir); + mark_inode_dirty_sync(dir); + + ret = __bch_create(dir, dentry, mode|S_IFDIR, 0); + if (unlikely(ret)) { + inode_dec_link_count(dir); + return ret; + } + + return 0; +} + +static int bch_rmdir(struct inode *dir, struct dentry *dentry) +{ + struct cache_set *c = dir->i_sb->s_fs_info; + struct inode *inode = dentry->d_inode; + int ret; + + lockdep_assert_held(&inode->i_rwsem); + lockdep_assert_held(&dir->i_rwsem); + + if (bch_empty_dir(c, inode->i_ino)) + return -ENOTEMPTY; + + ret = bch_unlink(dir, dentry); + if (unlikely(ret)) + return ret; + + inode_dec_link_count(inode); + inode_dec_link_count(dir); + + return 0; +} + +static int bch_mknod(struct inode *dir, struct dentry *dentry, + umode_t mode, dev_t rdev) +{ + return __bch_create(dir, dentry, mode, rdev); +} + +static int bch_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry) +{ + struct cache_set *c = old_dir->i_sb->s_fs_info; + struct inode *old_inode = old_dentry->d_inode; + struct bch_inode_info *ei = to_bch_ei(old_inode); + struct inode *new_inode = new_dentry->d_inode; + struct timespec now = CURRENT_TIME; + int ret; + + lockdep_assert_held(&old_dir->i_rwsem); + lockdep_assert_held(&new_dir->i_rwsem); + + /* + * XXX: This isn't atomic w.r.t. unclean shutdowns, and we'd really like + * it to be + */ + + if (new_inode && S_ISDIR(old_inode->i_mode)) { + lockdep_assert_held(&new_inode->i_rwsem); + + if (!S_ISDIR(new_inode->i_mode)) + return -ENOTDIR; + + if (bch_empty_dir(c, new_inode->i_ino)) + return -ENOTEMPTY; + + ret = bch_dirent_update(c, new_dir->i_ino, + &new_dentry->d_name, + old_inode->i_ino, + &ei->journal_seq); + if (unlikely(ret)) + return ret; + + clear_nlink(new_inode); + inode_dec_link_count(old_dir); + } else if (new_inode) { + lockdep_assert_held(&new_inode->i_rwsem); + + ret = bch_dirent_update(c, new_dir->i_ino, + &new_dentry->d_name, + old_inode->i_ino, + &ei->journal_seq); + if (unlikely(ret)) + return ret; + + new_inode->i_ctime = now; + inode_dec_link_count(new_inode); + } else if (S_ISDIR(old_inode->i_mode)) { + ret = bch_vfs_dirent_create(c, new_dir, + mode_to_type(old_inode->i_mode), + &new_dentry->d_name, + old_inode); + if (unlikely(ret)) + return ret; + + inode_inc_link_count(new_dir); + inode_dec_link_count(old_dir); + } else { + ret = bch_vfs_dirent_create(c, new_dir, + mode_to_type(old_inode->i_mode), + &new_dentry->d_name, + old_inode); + if (unlikely(ret)) + return ret; + } + + old_dir->i_ctime = old_dir->i_mtime = now; + new_dir->i_ctime = new_dir->i_mtime = now; + mark_inode_dirty_sync(old_dir); + mark_inode_dirty_sync(new_dir); + + /* + * Like most other Unix systems, set the ctime for inodes on a + * rename. + */ + mutex_lock(&ei->update_lock); + old_inode->i_ctime = now; + if (new_inode) + old_inode->i_mtime = now; + __bch_write_inode(old_inode); + mutex_unlock(&ei->update_lock); + + /* XXX: error handling */ + bch_dirent_delete(c, old_dir->i_ino, &old_dentry->d_name); + + return 0; +} + +static int bch_truncate_page(struct address_space *mapping, loff_t from) +{ + unsigned offset = from & (PAGE_SIZE - 1); + struct page *page; + int ret = 0; + + /* Page boundary? 
Nothing to do */ + if (!offset) + return 0; + + page = grab_cache_page(mapping, from >> PAGE_SHIFT); + if (unlikely(!page)) { + ret = -ENOMEM; + goto out; + } + + if (!PageUptodate(page)) + if (bch_read_single_page(page, mapping)) { + ret = -EIO; + goto unlock; + } + + zero_user_segment(page, offset, PAGE_SIZE); + set_page_dirty(page); +unlock: + unlock_page(page); + put_page(page); +out: + return ret; +} + +static int bch_setattr(struct dentry *dentry, struct iattr *iattr) +{ + struct inode *inode = dentry->d_inode; + struct bch_inode_info *ei = to_bch_ei(inode); + struct cache_set *c = inode->i_sb->s_fs_info; + int ret = 0; + + lockdep_assert_held(&inode->i_rwsem); + + pr_debug("i_size was %llu update has %llu", + inode->i_size, iattr->ia_size); + + ret = inode_change_ok(inode, iattr); + if (ret) + return ret; + + if (iattr->ia_valid & ATTR_SIZE && iattr->ia_size != inode->i_size) { + inode_dio_wait(inode); + + /* + * __bch_write_inode() clears I_SIZE_DIRTY if append_count == 0: + */ + atomic_long_inc(&ei->append_count); + + /* + * I_SIZE_DIRTY indicates that there's extents past the end of + * i_size, and must be set atomically with setting the new + * i_size: + */ + mutex_lock(&ei->update_lock); + i_size_write(inode, iattr->ia_size); + ei->inode.v.i_flags |= BCH_INODE_I_SIZE_DIRTY; + ei->inode.v.i_size = iattr->ia_size; + __bch_write_inode(inode); + mutex_unlock(&ei->update_lock); + + ret = bch_truncate_page(inode->i_mapping, iattr->ia_size); + if (unlikely(ret)) + return ret; + + if (iattr->ia_size > inode->i_size) + pagecache_isize_extended(inode, inode->i_size, + iattr->ia_size); + truncate_pagecache(inode, iattr->ia_size); + + ret = bch_inode_truncate(c, inode->i_ino, + round_up(iattr->ia_size, PAGE_SIZE) >> 9); + if (unlikely(ret)) + return ret; + + /* + * Extents discarded, now clear I_SIZE_DIRTY (which write_inode + * does when append_count is 0 + */ + bch_append_put(ei); + inode->i_mtime = inode->i_ctime = CURRENT_TIME; + } + + mutex_lock(&ei->update_lock); + setattr_copy(inode, iattr); + __bch_write_inode(inode); + mutex_unlock(&ei->update_lock); + + if (iattr->ia_valid & ATTR_MODE) + ret = posix_acl_chmod(inode, inode->i_mode); + + return ret; +} + +static int bch_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) +{ + struct cache_set *c = dir->i_sb->s_fs_info; + struct inode *inode; + + /* XXX: i_nlink should be 0? 
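(bch_vfs_inode_create sets i_nlink to 1, and d_tmpfile decrements it) 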
*/ + inode = bch_vfs_inode_create(c, dir, mode, 0); + if (unlikely(IS_ERR(inode))) + return PTR_ERR(inode); + + d_tmpfile(dentry, inode); + return 0; +} + +static int bch_fill_extent(struct fiemap_extent_info *info, + struct bkey_i *k, int flags) +{ + struct bkey_s_c_extent e = bkey_i_to_s_c_extent(k); + const struct bch_extent_ptr *ptr; + + extent_for_each_ptr(e, ptr) { + int ret = fiemap_fill_next_extent(info, + bkey_start_offset(e.k) << 9, + PTR_OFFSET(ptr) << 9, + e.k->size << 9, flags); + if (ret) + return ret; + } + + return 0; +} + +static int bch_fiemap(struct inode *inode, struct fiemap_extent_info *info, + u64 start, u64 len) +{ + struct cache_set *c = inode->i_sb->s_fs_info; + struct btree_iter iter; + struct bkey_s_c k; + BKEY_PADDED(k) tmp; + bool have_extent = false; + int ret = 0; + + if (start + len < start) + return -EINVAL; + + for_each_btree_key(&iter, c, BTREE_ID_EXTENTS, + POS(inode->i_ino, start >> 9), k) + if (k.k->type == BCH_EXTENT) { + if (bkey_cmp(bkey_start_pos(k.k), + POS(inode->i_ino, (start + len) >> 9)) >= 0) + break; + + if (have_extent) { + ret = bch_fill_extent(info, &tmp.k, 0); + if (ret) + goto out; + } + + bkey_reassemble(&tmp.k, k); + have_extent = true; + } + + if (have_extent) + ret = bch_fill_extent(info, &tmp.k, FIEMAP_EXTENT_LAST); +out: + bch_btree_iter_unlock(&iter); + return ret < 0 ? ret : 0; +} + +static int bch_fsync(struct file *file, loff_t start, loff_t end, int datasync) +{ + struct inode *inode = file->f_mapping->host; + struct bch_inode_info *ei = to_bch_ei(inode); + struct cache_set *c = inode->i_sb->s_fs_info; + struct closure cl; + int ret; + + closure_init_stack(&cl); + + /* + * We really just want to sync all the PageAppend pages: + */ + start = 0; + end = S64_MAX; + + ret = filemap_write_and_wait_range(inode->i_mapping, start, end); + if (ret) + return ret; + + inode_lock(inode); + if (datasync && end <= ei->inode.v.i_size) + goto out; + + /* + * redo after locking inode: + */ + filemap_write_and_wait_range(inode->i_mapping, start, end); + + wait_event(bch_append_wait, + !atomic_long_read(&ei->append_count)); + + mutex_lock(&ei->update_lock); + BUG_ON(atomic_long_read(&ei->append_count)); + ret = __bch_write_inode(inode); + mutex_unlock(&ei->update_lock); +out: + inode_unlock(inode); + + bch_journal_push_seq(&c->journal, ei->journal_seq, &cl); + closure_sync(&cl); + + return ret; +} + +/* Flags that are appropriate for non-directories/regular files. 
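(anything that is neither a directory nor a regular file) 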
*/ +#define BCH_OTHER_FLMASK (FS_NODUMP_FL | FS_NOATIME_FL) + +static inline bool bch_flags_allowed(umode_t mode, u32 flags) +{ + if ((flags & BCH_FL_USER_FLAGS) != flags) + return false; + + if (!S_ISREG(mode) && + !S_ISDIR(mode) && + (flags & BCH_OTHER_FLMASK) != flags) + return false; + + return true; +} + +static long bch_fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) +{ + struct inode *inode = file_inode(filp); + struct bch_inode_info *ei = to_bch_ei(inode); + unsigned flags; + int ret; + + switch (cmd) { + case FS_IOC_GETFLAGS: + flags = ei->inode.v.i_flags & BCH_FL_USER_FLAGS; + return put_user(flags, (int __user *) arg); + + case FS_IOC_SETFLAGS: { + unsigned oldflags; + + ret = mnt_want_write_file(filp); + if (ret) + return ret; + + if (!inode_owner_or_capable(inode)) { + ret = -EACCES; + goto setflags_out; + } + + if (get_user(flags, (int __user *) arg)) { + ret = -EFAULT; + goto setflags_out; + } + + if (!bch_flags_allowed(inode->i_mode, flags)) { + ret = -EINVAL; + goto setflags_out; + } + + inode_lock(inode); + oldflags = ei->inode.v.i_flags; + + if (((flags ^ oldflags) & (FS_APPEND_FL|FS_IMMUTABLE_FL)) && + !capable(CAP_LINUX_IMMUTABLE)) { + inode_unlock(inode); + ret = -EPERM; + goto setflags_out; + } + + flags = flags & BCH_FL_USER_FLAGS; + flags |= oldflags & ~BCH_FL_USER_FLAGS; + ei->inode.v.i_flags = flags; + + inode->i_ctime = CURRENT_TIME_SEC; + bch_set_inode_flags(inode); + inode_unlock(inode); + + mark_inode_dirty(inode); +setflags_out: + mnt_drop_write_file(filp); + return ret; + } + return 0; + default: + return -ENOTTY; + } +} + +#ifdef CONFIG_COMPAT +long bch_compat_fs_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + /* These are just misnamed, they actually get/put from/to user an int */ + switch (cmd) { + case FS_IOC32_GETFLAGS: + cmd = FS_IOC_GETFLAGS; + break; + case FS_IOC32_SETFLAGS: + cmd = FS_IOC_SETFLAGS; + break; + default: + return -ENOIOCTLCMD; + } + return bch_fs_ioctl(file, cmd, (unsigned long) compat_ptr(arg)); +} +#endif + +static loff_t bch_dir_llseek(struct file *file, loff_t offset, int whence) +{ + return generic_file_llseek_size(file, offset, whence, + S64_MAX, S64_MAX); +} + +static const struct file_operations bch_file_operations = { + .llseek = generic_file_llseek, + .read_iter = generic_file_read_iter, + .write_iter = generic_file_write_iter, + .mmap = generic_file_mmap, + .open = generic_file_open, + .fsync = bch_fsync, + .splice_read = generic_file_splice_read, + .splice_write = iter_file_splice_write, + + .unlocked_ioctl = bch_fs_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = bch_compat_fs_ioctl, +#endif +}; + +static const struct inode_operations bch_file_inode_operations = { + .setattr = bch_setattr, + .fiemap = bch_fiemap, + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, + .listxattr = bch_xattr_list, + .removexattr = generic_removexattr, + .get_acl = bch_get_acl, + .set_acl = bch_set_acl, +}; + +static const struct inode_operations bch_dir_inode_operations = { + .lookup = bch_lookup, + .create = bch_create, + .link = bch_link, + .unlink = bch_unlink, + .symlink = bch_symlink, + .mkdir = bch_mkdir, + .rmdir = bch_rmdir, + .mknod = bch_mknod, + .rename = bch_rename, + .setattr = bch_setattr, + .tmpfile = bch_tmpfile, + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, + .listxattr = bch_xattr_list, + .removexattr = generic_removexattr, + .get_acl = bch_get_acl, + .set_acl = bch_set_acl, +}; + +static const struct file_operations bch_dir_file_operations = { + .llseek =
bch_dir_llseek, + .read = generic_read_dir, + .iterate = bch_readdir, + .fsync = bch_fsync, + + .unlocked_ioctl = bch_fs_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = bch_compat_fs_ioctl, +#endif +}; + +static const struct inode_operations bch_symlink_inode_operations = { + .readlink = generic_readlink, + .get_link = page_get_link, + .setattr = bch_setattr, + + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, + .listxattr = bch_xattr_list, + .removexattr = generic_removexattr, + .get_acl = bch_get_acl, + .set_acl = bch_set_acl, +}; + +static const struct inode_operations bch_special_inode_operations = { + .setattr = bch_setattr, + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, + .listxattr = bch_xattr_list, + .removexattr = generic_removexattr, + .get_acl = bch_get_acl, + .set_acl = bch_set_acl, +}; + +static int bch_bio_add_page(struct bio *bio, struct page *page) +{ + sector_t offset = (sector_t) page->index << (PAGE_SHIFT - 9); + + if (!bio->bi_vcnt) { + bio->bi_iter.bi_sector = offset; + } else if (bio_end_sector(bio) != offset || + bio->bi_vcnt == bio->bi_max_vecs) + return -1; + + bio->bi_io_vec[bio->bi_vcnt++] = (struct bio_vec) { + .bv_page = page, + .bv_len = PAGE_SIZE, + .bv_offset = 0, + }; + + bio->bi_iter.bi_size += PAGE_SIZE; + + return 0; +} + +static void bch_readpages_end_io(struct bio *bio) +{ + struct bio_vec *bv; + int i; + + bio_for_each_segment_all(bv, bio, i) { + struct page *page = bv->bv_page; + + if (!bio->bi_error) { + SetPageUptodate(page); + } else { + ClearPageUptodate(page); + SetPageError(page); + } + unlock_page(page); + } + + bio_put(bio); +} + +static int bch_readpages(struct file *file, struct address_space *mapping, + struct list_head *pages, unsigned nr_pages) +{ + struct inode *inode = mapping->host; + struct cache_set *c = inode->i_sb->s_fs_info; + struct bio *bio = NULL; + struct page *page; + ssize_t ret; + + pr_debug("reading %u pages", nr_pages); + + while (nr_pages) { + page = list_entry(pages->prev, struct page, lru); + prefetchw(&page->flags); + list_del(&page->lru); + + if (!add_to_page_cache_lru(page, mapping, + page->index, GFP_NOFS)) { +again: + if (!bio) { + bio = bio_alloc(GFP_NOFS, + min_t(unsigned, nr_pages, + BIO_MAX_PAGES)); + + bio->bi_end_io = bch_readpages_end_io; + } + + if (bch_bio_add_page(bio, page)) { + ret = bch_read(c, bio, inode->i_ino); + bio_endio(bio); + bio = NULL; + + if (ret < 0) { + pr_debug("error %zi", ret); + return ret; + } + goto again; + } + } + + nr_pages--; + put_page(page); + } + + if (bio) { + ret = bch_read(c, bio, inode->i_ino); + bio_endio(bio); + + if (ret < 0) { + pr_debug("error %zi", ret); + return ret; + } + } + + pr_debug("success"); + return 0; +} + +static int bch_readpage(struct file *file, struct page *page) +{ + struct address_space *mapping = page->mapping; + struct inode *inode = mapping->host; + struct cache_set *c = inode->i_sb->s_fs_info; + struct bio *bio; + int ret; + + bio = bio_alloc(GFP_NOFS, 1); + bio_set_op_attrs(bio, REQ_OP_READ, REQ_SYNC); + bio->bi_end_io = bch_readpages_end_io; + + bch_bio_add_page(bio, page); + + ret = bch_read(c, bio, inode->i_ino); + bio_endio(bio); + + return ret; +} + +struct bch_writepage_io { + struct closure cl; + struct bch_write_op op; + struct bbio bio; +}; + +struct bch_writepage { + struct cache_set *c; + u64 inum; + struct bch_writepage_io *io; +}; + +static void bch_writepage_io_free(struct closure *cl) +{ + struct bch_writepage_io *io = container_of(cl, + struct bch_writepage_io, cl); + struct cache_set *c = 
io->op.c; + struct inode *inode = io->bio.bio.bi_io_vec[0].bv_page->mapping->host; + struct bch_inode_info *ei = to_bch_ei(inode); + struct bio_vec *bvec; + int i; + + bio_for_each_segment_all(bvec, &io->bio.bio, i) { + struct page *page = bvec->bv_page; + + BUG_ON(!PageWriteback(page)); + + if (io->bio.bio.bi_error) { + SetPageError(page); + if (page->mapping) + set_bit(AS_EIO, &page->mapping->flags); + } + + bch_clear_page_bits(c, ei, page); + end_page_writeback(page); + } + + bio_put(&io->bio.bio); +} + +static void bch_writepage_do_io(struct bch_writepage_io *io) +{ + pr_debug("writing %u sectors to %llu:%llu", + bio_sectors(&io->bio.bio), + io->op.insert_key.k.p.inode, + (u64) io->bio.bio.bi_iter.bi_sector); + + closure_call(&io->op.cl, bch_write, NULL, &io->cl); + closure_return_with_destructor(&io->cl, bch_writepage_io_free); +} + +static int __bch_writepage(struct page *page, struct writeback_control *wbc, + void *data) +{ + struct inode *inode = page->mapping->host; + struct bch_inode_info *ei = to_bch_ei(inode); + struct bch_writepage *w = data; + struct bio *bio; + unsigned offset; + loff_t i_size = i_size_read(inode); + pgoff_t end_index = i_size >> PAGE_SHIFT; + + /* Is the page fully inside i_size? */ + if (page->index < end_index) + goto do_io; + + /* Is the page fully outside i_size? (truncate in progress) */ + offset = i_size & (PAGE_SIZE - 1); + if (page->index > end_index || !offset) { + unlock_page(page); + return 0; + } + + /* + * The page straddles i_size. It must be zeroed out on each and every + * writepage invocation because it may be mmapped. "A file is mapped + * in multiples of the page size. For a file that is not a multiple of + * the page size, the remaining memory is zeroed when mapped, and + * writes to that region are not written out to the file." + */ + zero_user_segment(page, offset, PAGE_SIZE); +do_io: + /* XXX: how we gonna make this synchronization efficient? 
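(we take update_lock, and possibly rewrite the inode, for every page written) 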
*/
+	mutex_lock(&ei->update_lock);
+
+	if (ei->inode.v.i_size < i_size &&
+	    page->index >= (ei->inode.v.i_size >> PAGE_SHIFT) &&
+	    !(ei->inode.v.i_flags & BCH_INODE_I_SIZE_DIRTY)) {
+		ei->inode.v.i_flags |= BCH_INODE_I_SIZE_DIRTY;
+		__bch_write_inode(inode);
+	}
+
+	mutex_unlock(&ei->update_lock);
+
+	if (!w->io) {
+		bio = bio_alloc_bioset(GFP_NOFS, BIO_MAX_PAGES, bch_fs_bioset);
+		w->io = container_of(bio, struct bch_writepage_io, bio.bio);
+
+		closure_init(&w->io->cl, NULL);
+		bch_write_op_init(&w->io->op, w->c, bio, NULL,
+				  bkey_to_s_c(&KEY(w->inum, 0, 0)),
+				  bkey_s_c_null, 0);
+		w->io->op.journal_seq = &ei->journal_seq;
+	}
+
+	if (bch_bio_add_page(&w->io->bio.bio, page)) {
+		bch_writepage_do_io(w->io);
+		w->io = NULL;
+		goto do_io;
+	}
+
+	BUG_ON(PageWriteback(page));
+	set_page_writeback(page);
+	unlock_page(page);
+
+	return 0;
+}
+
+static int bch_writepages(struct address_space *mapping,
+			  struct writeback_control *wbc)
+{
+	int ret;
+	struct bch_writepage w = {
+		.c	= mapping->host->i_sb->s_fs_info,
+		.inum	= mapping->host->i_ino,
+		.io	= NULL,
+	};
+
+	ret = write_cache_pages(mapping, wbc, __bch_writepage, &w);
+
+	if (w.io)
+		bch_writepage_do_io(w.io);
+
+	return ret;
+}
+
+static int bch_writepage(struct page *page, struct writeback_control *wbc)
+{
+	struct inode *inode = page->mapping->host;
+	struct bch_writepage w = {
+		.c	= inode->i_sb->s_fs_info,
+		.inum	= inode->i_ino,
+		.io	= NULL,
+	};
+
+	__bch_writepage(page, NULL, &w);
+	if (w.io)
+		bch_writepage_do_io(w.io);
+
+	return 0;
+}
+
+static void bch_read_single_page_end_io(struct bio *bio)
+{
+	complete(bio->bi_private);
+}
+
+static int bch_read_single_page(struct page *page,
+				struct address_space *mapping)
+{
+	struct inode *inode = mapping->host;
+	struct cache_set *c = inode->i_sb->s_fs_info;
+	struct bio *bio;
+	int ret;
+	DECLARE_COMPLETION_ONSTACK(done);
+
+	bio = bio_alloc(GFP_NOFS, 1);
+	bio_set_op_attrs(bio, REQ_OP_READ, REQ_SYNC);
+	bio->bi_private = &done;
+	bio->bi_end_io = bch_read_single_page_end_io;
+	bch_bio_add_page(bio, page);
+
+	ret = bch_read(c, bio, inode->i_ino);
+	bio_endio(bio);
+	wait_for_completion(&done);
+
+	if (!ret)
+		ret = bio->bi_error;
+	bio_put(bio);
+
+	if (ret < 0)
+		return ret;
+
+	SetPageUptodate(page);
+
+	return 0;
+}
+
+static int bch_write_begin(struct file *file, struct address_space *mapping,
+			   loff_t pos, unsigned len, unsigned flags,
+			   struct page **pagep, void **fsdata)
+{
+	struct inode *inode = mapping->host;
+	struct cache_set *c = inode->i_sb->s_fs_info;
+	pgoff_t index = pos >> PAGE_SHIFT;
+	struct page *page;
+	int ret = 0;
+
+	BUG_ON(inode_unhashed(mapping->host));
+
+	page = grab_cache_page_write_begin(mapping, index, flags);
+	if (!page)
+		return -ENOMEM;
+
+	if (!PageAllocated(page)) {
+		if (reserve_sectors(c, PAGE_SECTORS)) {
+			ret = -ENOSPC;
+			goto err;
+		}
+
+		SetPageAllocated(page);
+	}
+
+	if (PageUptodate(page))
+		goto out;
+
+	/* If we're writing the entire page, we don't need to read it in first: */
+	if (len == PAGE_SIZE)
+		goto out;
+
+	if (pos + len >= inode->i_size) {
+		unsigned offset = pos & (PAGE_SIZE - 1);
+
+		/*
+		 * If the write extends past i_size, the top part of the page
+		 * we're not writing to doesn't need to be read in, just zeroed:
+		 */
+		zero_user(page, offset + len, PAGE_SIZE - offset - len);
+		flush_dcache_page(page);
+
+		if (!offset)
+			goto out;
+
+		/*
+		 * If the start of the page is past i_size, zero that part too:
+		 */
+		if ((index << PAGE_SHIFT) >= inode->i_size) {
+			zero_user(page, 0, offset);
+			flush_dcache_page(page);
+			goto out;
+		}
+	}
+
+	ret 
= bch_read_single_page(page, mapping); + if (ret) + goto err; +out: + *pagep = page; + return ret; +err: + unlock_page(page); + put_page(page); + page = NULL; + goto out; +} + +static int bch_write_end(struct file *filp, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) +{ + loff_t last_pos = pos + copied; + struct inode *inode = page->mapping->host; + struct bch_inode_info *ei = to_bch_ei(inode); + + /* + * can't set a page dirty without i_rwsem, to avoid racing with truncate + */ + lockdep_assert_held(&inode->i_rwsem); + + if (unlikely(copied < len)) { +#if 0 + if (!PageUptodate(page)) { + /* we skipped reading in the page before, read it now.. */ + } +#endif + + /* + * zero out the rest of the area + */ + unsigned from = pos & (PAGE_SIZE - 1); + + zero_user(page, from + copied, len - copied); + flush_dcache_page(page); + } + + if (!PageUptodate(page)) + SetPageUptodate(page); + if (!PageDirty(page)) + set_page_dirty(page); + + if (last_pos > inode->i_size) { + mutex_lock(&ei->update_lock); + + if (!TestSetPageAppend(page)) + atomic_long_inc(&ei->append_count); + + i_size_write(inode, last_pos); + mark_inode_dirty(inode); + + mutex_unlock(&ei->update_lock); + } + + unlock_page(page); + put_page(page); + + return copied; +} + +static void bch_invalidatepage(struct page *page, unsigned int offset, + unsigned int length) +{ + struct inode *inode = page->mapping->host; + struct bch_inode_info *ei = to_bch_ei(inode); + struct cache_set *c = inode->i_sb->s_fs_info; + + BUG_ON(!PageLocked(page)); + BUG_ON(PageWriteback(page)); + + if (offset || length < PAGE_SIZE) + return; + + bch_clear_page_bits(c, ei, page); +} + +static int bch_releasepage(struct page *page, gfp_t gfp_mask) +{ + struct inode *inode = page->mapping->host; + struct bch_inode_info *ei = to_bch_ei(inode); + struct cache_set *c = inode->i_sb->s_fs_info; + + BUG_ON(!PageLocked(page)); + BUG_ON(PageWriteback(page)); + + bch_clear_page_bits(c, ei, page); + + if (PageDirty(page)) { + ClearPageDirty(page); + cancel_dirty_page(page); + } + + return 1; +} + +/* O_DIRECT */ + +static struct bio_set *bch_dio_read_bioset; + +struct dio_read { + struct closure cl; + struct kiocb *req; + long ret; + struct bio bio; +}; + +static void bch_dio_read_complete(struct closure *cl) +{ + struct dio_read *dio = container_of(cl, struct dio_read, cl); + + dio->req->ki_complete(dio->req, dio->ret, 0); + bio_put(&dio->bio); +} + +static void bch_direct_IO_read_endio(struct bio *bio) +{ + struct dio_read *dio = bio->bi_private; + + if (bio->bi_error) + dio->ret = bio->bi_error; + + closure_put(&dio->cl); + bio_check_pages_dirty(bio); /* transfers ownership */ +} + +static int bch_direct_IO_read(struct cache_set *c, struct kiocb *req, + struct file *file, struct inode *inode, + struct iov_iter *iter, loff_t offset) +{ + struct dio_read *dio; + struct bio *bio; + unsigned long inum = inode->i_ino; + ssize_t ret = 0; + size_t pages = iov_iter_npages(iter, BIO_MAX_PAGES); + loff_t i_size; + + bio = bio_alloc_bioset(GFP_KERNEL, pages, bch_dio_read_bioset); + bio_get(bio); + + dio = container_of(bio, struct dio_read, bio); + closure_init(&dio->cl, NULL); + dio->req = req; + dio->ret = iter->count; + + i_size = i_size_read(inode); + if (offset + dio->ret > i_size) { + dio->ret = max_t(loff_t, 0, i_size - offset); + iter->count = round_up(dio->ret, PAGE_SIZE); + } + + if (!dio->ret) + goto out; + + goto start; + while (iter->count && !ret) { + pages = iov_iter_npages(iter, BIO_MAX_PAGES); + bio = 
bio_alloc(GFP_KERNEL, pages); +start: + bio->bi_iter.bi_sector = offset >> 9; + bio->bi_end_io = bch_direct_IO_read_endio; + bio->bi_private = dio; + + ret = bio_get_user_pages(bio, iter, 1); + if (ret < 0) { + dio->ret = ret; + bio_put(bio); + break; + } + + offset += bio->bi_iter.bi_size; + bio_set_pages_dirty(bio); + + closure_get(&dio->cl); + ret = bch_read(c, bio, inum); + if (ret) + bio->bi_error = ret; + bio_endio(bio); + } +out: + if (is_sync_kiocb(req)) { + closure_sync(&dio->cl); + closure_debug_destroy(&dio->cl); + ret = dio->ret; + bio_put(&dio->bio); + return ret; + } else { + closure_return_with_destructor_noreturn(&dio->cl, + bch_dio_read_complete); + return -EIOCBQUEUED; + } +} + +struct dio_write { + struct closure cl; + struct kiocb *req; + long ret; + bool append; +}; + +struct dio_write_bio { + struct closure cl; + struct dio_write *dio; + struct bch_write_op iop; + struct bbio bio; +}; + +static void __bch_dio_write_complete(struct dio_write *dio) +{ + struct bch_inode_info *ei = to_bch_ei(dio->req->ki_filp->f_inode); + + if (dio->append) + bch_append_put(ei); + inode_dio_end(dio->req->ki_filp->f_inode); + kfree(dio); +} + +static void bch_dio_write_complete(struct closure *cl) +{ + struct dio_write *dio = container_of(cl, struct dio_write, cl); + struct kiocb *req = dio->req; + long ret = dio->ret; + + __bch_dio_write_complete(dio); + req->ki_complete(req, ret, 0); +} + +static void bch_direct_IO_write_done(struct closure *cl) +{ + struct dio_write_bio *op = container_of(cl, + struct dio_write_bio, cl); + struct bio_vec *bv; + int i; + + if (op->iop.error) + op->dio->ret = op->iop.error; + closure_put(&op->dio->cl); + + bio_for_each_segment_all(bv, &op->bio.bio, i) + put_page(bv->bv_page); + kfree(op); +} + +static int bch_direct_IO_write(struct cache_set *c, struct kiocb *req, + struct file *file, struct inode *inode, + struct iov_iter *iter, loff_t offset) +{ + struct bch_inode_info *ei = to_bch_ei(inode); + struct dio_write *dio; + struct dio_write_bio *op; + struct bio *bio; + unsigned long inum = inode->i_ino; + unsigned flags = BCH_WRITE_CHECK_ENOSPC; + ssize_t ret = 0; + + lockdep_assert_held(&inode->i_rwsem); + + if (file->f_flags & O_DSYNC || IS_SYNC(file->f_mapping->host)) + flags |= BCH_WRITE_FLUSH; + + dio = kmalloc(sizeof(*dio), GFP_NOIO); + if (!dio) + return -ENOMEM; + + closure_init(&dio->cl, NULL); + dio->req = req; + dio->ret = iter->count; + dio->append = false; + + if (offset + iter->count > inode->i_size) { + dio->append = true; + atomic_long_inc(&ei->append_count); + + mutex_lock(&ei->update_lock); + if (!(ei->inode.v.i_flags & BCH_INODE_I_SIZE_DIRTY)) { + ei->inode.v.i_flags |= BCH_INODE_I_SIZE_DIRTY; + __bch_write_inode(inode); + } + mutex_unlock(&ei->update_lock); + } + + /* Decremented by inode_dio_done(): */ + atomic_inc(&inode->i_dio_count); + + while (iter->count) { + size_t pages = iov_iter_npages(iter, BIO_MAX_PAGES); + + op = kmalloc(sizeof(*op) + sizeof(struct bio_vec) * pages, + GFP_NOIO); + if (!op) { + dio->ret = -ENOMEM; + break; + } + + bio = &op->bio.bio; + bio_init(bio); + bio->bi_iter.bi_sector = offset >> 9; + bio->bi_max_vecs = pages; + bio->bi_io_vec = bio->bi_inline_vecs; + + ret = bio_get_user_pages(bio, iter, 0); + if (ret < 0) { + dio->ret = ret; + kfree(op); + break; + } + + offset += bio->bi_iter.bi_size; + closure_get(&dio->cl); + op->dio = dio; + closure_init(&op->cl, NULL); + + bch_write_op_init(&op->iop, c, bio, NULL, + bkey_to_s_c(&KEY(inum, + bio_end_sector(bio), + bio_sectors(bio))), + bkey_s_c_null, flags); + 
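/* bcache extent keys index by where the extent ends: offset = bio_end_sector(), size = bio_sectors() */ + 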
op->iop.journal_seq = &ei->journal_seq; + + task_io_account_write(bio->bi_iter.bi_size); + + closure_call(&op->iop.cl, bch_write, NULL, &op->cl); + closure_return_with_destructor_noreturn(&op->cl, + bch_direct_IO_write_done); + } + + if (is_sync_kiocb(req) || dio->append) { + /* + * appends are sync in order to do the i_size update under + * i_rwsem, after we know the write has completed successfully + */ + closure_sync(&dio->cl); + closure_debug_destroy(&dio->cl); + ret = dio->ret; + + if (ret > 0 && + offset > inode->i_size) { + i_size_write(inode, offset); + mark_inode_dirty(inode); + } + + __bch_dio_write_complete(dio); + return ret; + } else { + closure_return_with_destructor_noreturn(&dio->cl, + bch_dio_write_complete); + return -EIOCBQUEUED; + } +} + +static ssize_t bch_direct_IO(struct kiocb *req, struct iov_iter *iter) +{ + struct file *file = req->ki_filp; + struct inode *inode = file->f_inode; + struct cache_set *c = inode->i_sb->s_fs_info; + + if ((req->ki_pos|iter->count) & (block_bytes(c) - 1)) + return -EINVAL; + + return ((iov_iter_rw(iter) == WRITE) + ? bch_direct_IO_write + : bch_direct_IO_read)(c, req, file, inode, iter, req->ki_pos); +} + +#ifdef CONFIG_MIGRATION +static int bch_migrate_page(struct address_space *mapping, + struct page *newpage, struct page *page, + enum migrate_mode mode) +{ + int ret; + + ret = migrate_page_move_mapping(mapping, newpage, page, NULL, mode, 0); + if (ret != MIGRATEPAGE_SUCCESS) + return ret; + + if (PageAllocated(page)) { + ClearPageAllocated(page); + SetPageAllocated(newpage); + } + + if (PageAppend(page)) { + ClearPageAppend(page); + SetPageAppend(newpage); + } + + migrate_page_copy(newpage, page); + return MIGRATEPAGE_SUCCESS; +} +#endif + +static const struct address_space_operations bch_address_space_operations = { + .writepage = bch_writepage, + .readpage = bch_readpage, + .writepages = bch_writepages, + .readpages = bch_readpages, + + .set_page_dirty = __set_page_dirty_nobuffers, + + .write_begin = bch_write_begin, + .write_end = bch_write_end, + .invalidatepage = bch_invalidatepage, + .releasepage = bch_releasepage, + + .direct_IO = bch_direct_IO, + +#ifdef CONFIG_MIGRATION + .migratepage = bch_migrate_page, +#endif + .error_remove_page = generic_error_remove_page, +}; + +static void bch_inode_init(struct bch_inode_info *ei) +{ + struct inode *inode = &ei->vfs_inode; + struct bch_inode *bi = &ei->inode.v; + + pr_debug("init inode %llu with mode %o", + ei->inode.k.p.inode, bi->i_mode); + + BUG_ON(atomic_long_read(&ei->append_count)); + + inode->i_mode = bi->i_mode; + i_uid_write(inode, bi->i_uid); + i_gid_write(inode, bi->i_gid); + + inode->i_ino = ei->inode.k.p.inode; + set_nlink(inode, bi->i_nlink); + inode->i_rdev = bi->i_dev; + inode->i_size = bi->i_size; + inode->i_atime = ns_to_timespec(bi->i_atime); + inode->i_mtime = ns_to_timespec(bi->i_mtime); + inode->i_ctime = ns_to_timespec(bi->i_ctime); + bch_set_inode_flags(inode); + + inode->i_mapping->a_ops = &bch_address_space_operations; + + switch (inode->i_mode & S_IFMT) { + case S_IFREG: + inode->i_op = &bch_file_inode_operations; + inode->i_fop = &bch_file_operations; + break; + case S_IFDIR: + inode->i_op = &bch_dir_inode_operations; + inode->i_fop = &bch_dir_file_operations; + break; + case S_IFLNK: + inode_nohighmem(inode); + inode->i_op = &bch_symlink_inode_operations; + break; + default: + init_special_inode(inode, inode->i_mode, inode->i_rdev); + inode->i_op = &bch_special_inode_operations; + break; + } +} + +static struct inode *bch_alloc_inode(struct super_block 
*sb) +{ + struct bch_inode_info *ei; + + ei = kmem_cache_alloc(bch_inode_cache, GFP_NOFS); + if (!ei) + return NULL; + + pr_debug("allocated %p", &ei->vfs_inode); + + inode_init_once(&ei->vfs_inode); + mutex_init(&ei->update_lock); + ei->journal_seq = 0; + atomic_long_set(&ei->append_count, 0); + + return &ei->vfs_inode; +} + +static void bch_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + + kmem_cache_free(bch_inode_cache, to_bch_ei(inode)); +} + +static void bch_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, bch_i_callback); +} + +static int bch_write_inode(struct inode *inode, struct writeback_control *wbc) +{ + struct cache_set *c = inode->i_sb->s_fs_info; + struct bch_inode_info *ei = to_bch_ei(inode); + int ret; + + mutex_lock(&ei->update_lock); + ret = __bch_write_inode(inode); + mutex_unlock(&ei->update_lock); + + if (!ret && wbc->sync_mode == WB_SYNC_ALL) { + struct closure cl; + + closure_init_stack(&cl); + bch_journal_push_seq(&c->journal, ei->journal_seq, &cl); + closure_sync(&cl); + } + + return ret; +} + +static void bch_evict_inode(struct inode *inode) +{ + struct cache_set *c = inode->i_sb->s_fs_info; + struct bch_inode_info *ei = to_bch_ei(inode); + + if (inode->i_nlink) { + truncate_inode_pages_final(&inode->i_data); + + mutex_lock(&ei->update_lock); + BUG_ON(atomic_long_read(&ei->append_count)); + + if (!(inode->i_state & I_NEW) && + (ei->inode.v.i_flags & BCH_INODE_I_SIZE_DIRTY || + inode->i_size != ei->inode.v.i_size)) + __bch_write_inode(inode); + mutex_unlock(&ei->update_lock); + + clear_inode(inode); + } else if (!bkey_deleted(&ei->inode.k)) { + atomic_long_inc(&ei->append_count); + + mutex_lock(&ei->update_lock); + ei->inode.v.i_flags |= BCH_INODE_I_SIZE_DIRTY; + ei->inode.v.i_size = 0; + i_size_write(inode, 0); + __bch_write_inode(inode); + mutex_unlock(&ei->update_lock); + + truncate_inode_pages_final(&inode->i_data); + clear_inode(inode); + + /* + * write_inode() shouldn't be called again - this will cause it + * to BUG(): + */ + ei->inode.k.type = KEY_TYPE_DELETED; + atomic_long_dec_bug(&ei->append_count); + + bch_inode_rm(c, inode->i_ino); + atomic_long_dec(&c->nr_inodes); + } else { + /* bch_inode_create() failed: */ + clear_inode(inode); + } +} + +static int bch_statfs(struct dentry *dentry, struct kstatfs *buf) +{ + struct super_block *sb = dentry->d_sb; + struct cache_set *c = sb->s_fs_info; + + buf->f_type = BCACHE_STATFS_MAGIC; + buf->f_bsize = sb->s_blocksize; + buf->f_blocks = c->capacity >> (PAGE_SHIFT - 9); + buf->f_bfree = (c->capacity - cache_set_sectors_used(c)) >> + (PAGE_SHIFT - 9); + buf->f_bavail = buf->f_bfree; + buf->f_files = atomic_long_read(&c->nr_inodes); + buf->f_namelen = NAME_MAX; + + return 0; +} + +static int bch_sync_fs(struct super_block *sb, int wait) +{ + struct cache_set *c = sb->s_fs_info; + struct closure cl; + + closure_init_stack(&cl); + + /* XXX: should only push a journal write if it's dirty */ + bch_journal_flush(&c->journal, wait ? 
&cl : NULL); + closure_sync(&cl); + return 0; +} + +static const struct super_operations bch_super_operations = { + .alloc_inode = bch_alloc_inode, + .destroy_inode = bch_destroy_inode, + .write_inode = bch_write_inode, + .evict_inode = bch_evict_inode, + .sync_fs = bch_sync_fs, + .statfs = bch_statfs, + .show_options = generic_show_options, +#if 0 + .put_super = bch_put_super, + .freeze_fs = bch_freeze, + .unfreeze_fs = bch_unfreeze, + .remount_fs = bch_remount, +#endif +}; + +static struct cache_set *bch_open_as_blockdevs(const char *_dev_name) +{ + size_t nr_devs = 0, i = 0; + char *dev_name, *s, **devs; + struct cache_set *c = NULL; + const char *err; + + dev_name = kstrdup(_dev_name, GFP_KERNEL); + if (!dev_name) + return NULL; + + for (s = dev_name; s; s = strchr(s + 1, ':')) + nr_devs++; + + devs = kcalloc(nr_devs, sizeof(const char *), GFP_KERNEL); + if (!devs) + goto out; + + for (i = 0, s = dev_name; + s; + (s = strchr(s, ':')) && (*s++ = '\0')) + devs[i++] = s; + + err = bch_register_cache_set(devs, nr_devs, &c); + if (err) { + pr_err("register_cache_set err %s", err); + goto out; + } + + set_bit(CACHE_SET_BDEV_MOUNTED, &c->flags); +out: + kfree(devs); + kfree(dev_name); + + return c; +} + +enum { + Opt_err_cont, Opt_err_panic, Opt_err_ro, + Opt_user_xattr, Opt_nouser_xattr, + Opt_acl, Opt_noacl, + Opt_err +}; + +static const match_table_t tokens = { + {Opt_err_cont, "errors=continue"}, + {Opt_err_panic, "errors=panic"}, + {Opt_err_ro, "errors=remount-ro"}, + {Opt_user_xattr, "user_xattr"}, + {Opt_nouser_xattr, "nouser_xattr"}, + {Opt_acl, "acl"}, + {Opt_noacl, "noacl"}, + {Opt_err, NULL} +}; + +static int parse_options(struct cache_set *c, struct super_block *sb, + char *options) +{ + char *p; + substring_t args[MAX_OPT_ARGS]; + + if (!options) + return 1; + + while ((p = strsep(&options, ",")) != NULL) { + int token; + + if (!*p) + continue; + + token = match_token(p, tokens, args); + switch (token) { + case Opt_err_panic: + /* + * XXX: this will get written to the superblock, don't + * want this option to be persistent + */ + SET_CACHE_ERROR_ACTION(&c->sb, BCH_ON_ERROR_PANIC); + break; + case Opt_err_ro: + SET_CACHE_ERROR_ACTION(&c->sb, BCH_ON_ERROR_RO); + break; + case Opt_err_cont: + SET_CACHE_ERROR_ACTION(&c->sb, BCH_ON_ERROR_CONTINUE); + break; + case Opt_user_xattr: + case Opt_nouser_xattr: + break; + case Opt_acl: + sb->s_flags |= MS_POSIXACL; + break; + case Opt_noacl: + sb->s_flags &= ~MS_POSIXACL; + break; + default: + return 0; + } + } + return 1; +} + +static struct dentry *bch_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) +{ + struct cache_set *c; + struct super_block *sb; + struct inode *inode; + int ret; + + c = bch_open_as_blockdevs(dev_name); + if (!c) + return ERR_PTR(-ENOENT); + + sb = sget(fs_type, NULL, set_anon_super, flags, NULL); + if (IS_ERR(sb)) { + ret = PTR_ERR(sb); + goto err; + } + + /* XXX: */ + sb->s_blocksize = PAGE_SIZE; + sb->s_blocksize_bits = PAGE_SHIFT; + sb->s_maxbytes = MAX_LFS_FILESIZE; + sb->s_op = &bch_super_operations; + sb->s_xattr = bch_xattr_handlers; + sb->s_magic = BCACHE_STATFS_MAGIC; + sb->s_time_gran = 1; + sb->s_fs_info = c; + + sb->s_flags |= MS_POSIXACL; + + /* XXX */ + sb->s_bdev = c->cache[0]->disk_sb.bdev; + sb->s_bdi = &c->bdi; + + if (!parse_options(c, sb, (char *) data)) { + ret = -EINVAL; + goto err_put_super; + } + + inode = bch_vfs_inode_get(sb, BCACHE_ROOT_INO); + if (IS_ERR(inode)) { + ret = PTR_ERR(inode); + goto err_put_super; + } + + sb->s_root = d_make_root(inode); + 
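/* on failure, d_make_root() drops our inode reference for us: */ + 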
if (!sb->s_root) { + ret = -ENOMEM; + goto err_put_super; + } + + sb->s_flags |= MS_ACTIVE; + return dget(sb->s_root); + +err_put_super: + deactivate_locked_super(sb); +err: + closure_put(&c->cl); + return ERR_PTR(ret); +} + +static void bch_kill_sb(struct super_block *sb) +{ + struct cache_set *c = sb->s_fs_info; + + generic_shutdown_super(sb); + + if (test_bit(CACHE_SET_BDEV_MOUNTED, &c->flags)) { + DECLARE_COMPLETION_ONSTACK(complete); + + c->stop_completion = &complete; + bch_cache_set_stop(c); + closure_put(&c->cl); + + /* Killable? */ + wait_for_completion(&complete); + } else + closure_put(&c->cl); +} + +static struct file_system_type bcache_fs_type = { + .owner = THIS_MODULE, + .name = "bcache", + .mount = bch_mount, + .kill_sb = bch_kill_sb, +}; + +MODULE_ALIAS_FS("bcache"); + +void bch_fs_exit(void) +{ + unregister_filesystem(&bcache_fs_type); + if (bch_dio_read_bioset) + bioset_free(bch_dio_read_bioset); + if (bch_fs_bioset) + bioset_free(bch_fs_bioset); + if (bch_inode_cache) + kmem_cache_destroy(bch_inode_cache); +} + +int __init bch_fs_init(void) +{ + int ret = -ENOMEM; + + bch_inode_cache = KMEM_CACHE(bch_inode_info, 0); + if (!bch_inode_cache) + goto err; + + bch_fs_bioset = bioset_create(4, + offsetof(struct bch_writepage_io, bio.bio)); + if (!bch_fs_bioset) + goto err; + + + bch_dio_read_bioset = bioset_create(4, offsetof(struct dio_read, bio)); + if (!bch_dio_read_bioset) + goto err; + + ret = register_filesystem(&bcache_fs_type); + if (ret) + goto err; + + return 0; +err: + bch_fs_exit(); + return ret; +} diff --git a/drivers/md/bcache/fs.h b/drivers/md/bcache/fs.h new file mode 100644 index 000000000000..9e78cf8189bc --- /dev/null +++ b/drivers/md/bcache/fs.h @@ -0,0 +1,20 @@ +#ifndef _BCACHE_FS_H +#define _BCACHE_FS_H + +struct bch_inode_info { + struct bkey_i_inode inode; + struct inode vfs_inode; + struct mutex update_lock; + u64 journal_seq; + atomic_long_t append_count; +}; + +#define to_bch_ei(_inode) \ + container_of(_inode, struct bch_inode_info, vfs_inode) + +static inline u8 mode_to_type(umode_t mode) +{ + return (mode >> 12) & 15; +} + +#endif /* _BCACHE_FS_H */ diff --git a/drivers/md/bcache/inode.c b/drivers/md/bcache/inode.c index 5e458258eaa8..ba6863ec5d01 100644 --- a/drivers/md/bcache/inode.c +++ b/drivers/md/bcache/inode.c @@ -162,14 +162,33 @@ int bch_inode_truncate(struct cache_set *c, u64 inode_nr, u64 new_size) int bch_inode_rm(struct cache_set *c, u64 inode_nr) { + struct btree_iter iter; + struct bkey_s_c k; struct bkey_i delete; int ret; - ret = bch_discard(c, POS(inode_nr, 0), - POS(inode_nr + 1, 0), 0); + ret = bch_inode_truncate(c, inode_nr, 0); if (ret < 0) return ret; + for_each_btree_key_intent(&iter, c, BTREE_ID_XATTRS, + POS(inode_nr, 0), k) { + if (k.k->p.inode > inode_nr) + break; + + bkey_init(&delete.k); + delete.k.p = k.k->p; + + ret = bch_btree_insert_at(&iter, &keylist_single(&delete), + NULL, NULL, 0); + if (ret) { + bch_btree_iter_unlock(&iter); + return ret; + } + + } + bch_btree_iter_unlock(&iter); + bkey_init(&delete.k); delete.k.p.inode = inode_nr; @@ -179,6 +198,33 @@ int bch_inode_rm(struct cache_set *c, u64 inode_nr) BTREE_INSERT_NOFAIL); } +int bch_inode_find_by_inum(struct cache_set *c, u64 inode_nr, + struct bkey_i_inode *inode) +{ + struct btree_iter iter; + struct bkey_s_c k; + int ret = -ENOENT; + + for_each_btree_key_with_holes(&iter, c, BTREE_ID_INODES, + POS(inode_nr, 0), k) { + switch (k.k->type) { + case BCH_INODE_FS: + ret = 0; + bkey_reassemble(&inode->k_i, k); + break; + default: + /* hole, not found */ + 
break; + } + + break; + + } + bch_btree_iter_unlock(&iter); + + return ret; +} + int bch_blockdev_inode_find_by_uuid(struct cache_set *c, uuid_le *uuid, struct bkey_i_inode_blockdev *ret) { diff --git a/drivers/md/bcache/inode.h b/drivers/md/bcache/inode.h index 6561e1e71ee6..dc1c26f8240f 100644 --- a/drivers/md/bcache/inode.h +++ b/drivers/md/bcache/inode.h @@ -17,6 +17,7 @@ static inline int bch_inode_update(struct cache_set *c, struct bkey_i *inode, cl, journal_seq); } +int bch_inode_find_by_inum(struct cache_set *, u64, struct bkey_i_inode *); int bch_blockdev_inode_find_by_uuid(struct cache_set *, uuid_le *, struct bkey_i_inode_blockdev *); diff --git a/drivers/md/bcache/siphash.c b/drivers/md/bcache/siphash.c new file mode 100644 index 000000000000..0c6f7f3ec819 --- /dev/null +++ b/drivers/md/bcache/siphash.c @@ -0,0 +1,185 @@ +/* $OpenBSD: siphash.c,v 1.3 2015/02/20 11:51:03 tedu Exp $ */ + +/*- + * Copyright (c) 2013 Andre Oppermann <andre@FreeBSD.org> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The name of the author may not be used to endorse or promote + * products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * SipHash is a family of PRFs SipHash-c-d where the integer parameters c and d + * are the number of compression rounds and the number of finalization rounds. + * A compression round is identical to a finalization round and this round + * function is called SipRound. Given a 128-bit key k and a (possibly empty) + * byte string m, SipHash-c-d returns a 64-bit value SipHash-c-d(k; m). + * + * Implemented from the paper "SipHash: a fast short-input PRF", 2012.09.18, + * by Jean-Philippe Aumasson and Daniel J. 
Bernstein,
+ * Permanent Document ID b9a943a805fbfc6fde808af9fc0ecdfa
+ * https://131002.net/siphash/siphash.pdf
+ * https://131002.net/siphash/
+ */
+
+//#include <sys/param.h>
+//#include <sys/systm.h>
+
+#include <asm/byteorder.h>
+#include <asm/string.h>
+
+#include "siphash.h"
+
+static void SipHash_CRounds(SIPHASH_CTX *, int);
+static void SipHash_Rounds(SIPHASH_CTX *, int);
+
+void
+SipHash_Init(SIPHASH_CTX *ctx, const SIPHASH_KEY *key)
+{
+	u64 k0, k1;
+
+	k0 = le64_to_cpu(key->k0);
+	k1 = le64_to_cpu(key->k1);
+
+	ctx->v[0] = 0x736f6d6570736575ULL ^ k0;
+	ctx->v[1] = 0x646f72616e646f6dULL ^ k1;
+	ctx->v[2] = 0x6c7967656e657261ULL ^ k0;
+	ctx->v[3] = 0x7465646279746573ULL ^ k1;
+
+	memset(ctx->buf, 0, sizeof(ctx->buf));
+	ctx->bytes = 0;
+}
+
+void
+SipHash_Update(SIPHASH_CTX *ctx, int rc, int rf, const void *src, size_t len)
+{
+	const u8 *ptr = src;
+	size_t left, used;
+
+	if (len == 0)
+		return;
+
+	used = ctx->bytes % sizeof(ctx->buf);
+	ctx->bytes += len;
+
+	if (used > 0) {
+		left = sizeof(ctx->buf) - used;
+
+		if (len >= left) {
+			memcpy(&ctx->buf[used], ptr, left);
+			SipHash_CRounds(ctx, rc);
+			len -= left;
+			ptr += left;
+			used = 0;	/* buffer was just flushed */
+		} else {
+			memcpy(&ctx->buf[used], ptr, len);
+			return;
+		}
+	}
+
+	while (len >= sizeof(ctx->buf)) {
+		memcpy(ctx->buf, ptr, sizeof(ctx->buf));
+		SipHash_CRounds(ctx, rc);
+		len -= sizeof(ctx->buf);
+		ptr += sizeof(ctx->buf);
+	}
+
+	if (len > 0)
+		memcpy(&ctx->buf[used], ptr, len);
+}
+
+void
+SipHash_Final(void *dst, SIPHASH_CTX *ctx, int rc, int rf)
+{
+	u64 r;
+
+	r = SipHash_End(ctx, rc, rf);
+
+	*((__le64 *) dst) = cpu_to_le64(r);
+}
+
+u64
+SipHash_End(SIPHASH_CTX *ctx, int rc, int rf)
+{
+	u64 r;
+	size_t left, used;
+
+	used = ctx->bytes % sizeof(ctx->buf);
+	left = sizeof(ctx->buf) - used;
+	memset(&ctx->buf[used], 0, left - 1);
+	ctx->buf[7] = ctx->bytes;
+
+	SipHash_CRounds(ctx, rc);
+	ctx->v[2] ^= 0xff;
+	SipHash_Rounds(ctx, rf);
+
+	r = (ctx->v[0] ^ ctx->v[1]) ^ (ctx->v[2] ^ ctx->v[3]);
+	memset(ctx, 0, sizeof(*ctx));
+	return (r);
+}
+
+u64
+SipHash(const SIPHASH_KEY *key, int rc, int rf, const void *src, size_t len)
+{
+	SIPHASH_CTX ctx;
+
+	SipHash_Init(&ctx, key);
+	SipHash_Update(&ctx, rc, rf, src, len);
+	return (SipHash_End(&ctx, rc, rf));
+}
+
+#define SIP_ROTL(x, b) (((x) << (b)) | ((x) >> (64 - (b))))
+
+static void
+SipHash_Rounds(SIPHASH_CTX *ctx, int rounds)
+{
+	while (rounds--) {
+		ctx->v[0] += ctx->v[1];
+		ctx->v[2] += ctx->v[3];
+		ctx->v[1] = SIP_ROTL(ctx->v[1], 13);
+		ctx->v[3] = SIP_ROTL(ctx->v[3], 16);
+
+		ctx->v[1] ^= ctx->v[0];
+		ctx->v[3] ^= ctx->v[2];
+		ctx->v[0] = SIP_ROTL(ctx->v[0], 32);
+
+		ctx->v[2] += ctx->v[1];
+		ctx->v[0] += ctx->v[3];
+		ctx->v[1] = SIP_ROTL(ctx->v[1], 17);
+		ctx->v[3] = SIP_ROTL(ctx->v[3], 21);
+
+		ctx->v[1] ^= ctx->v[2];
+		ctx->v[3] ^= ctx->v[0];
+		ctx->v[2] = SIP_ROTL(ctx->v[2], 32);
+	}
+}
+
+static void
+SipHash_CRounds(SIPHASH_CTX *ctx, int rounds)
+{
+	u64 m = le64_to_cpu(*((__le64 *)ctx->buf));
+
+	ctx->v[3] ^= m;
+	SipHash_Rounds(ctx, rounds);
+	ctx->v[0] ^= m;
+}
diff --git a/drivers/md/bcache/siphash.h b/drivers/md/bcache/siphash.h
new file mode 100644
index 000000000000..7a4b2241f1e1
--- /dev/null
+++ b/drivers/md/bcache/siphash.h
@@ -0,0 +1,86 @@
+/* $OpenBSD: siphash.h,v 1.5 2015/02/20 11:51:03 tedu Exp $ */
+/*-
+ * Copyright (c) 2013 Andre Oppermann <andre@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. 
Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. The name of the author may not be used to endorse or promote
+ *    products derived from this software without specific prior written
+ *    permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+/*
+ * SipHash is a family of pseudorandom functions (a.k.a. keyed hash functions)
+ * optimized for speed on short messages, returning a 64-bit hash/digest value.
+ *
+ * The number of rounds is defined during the initialization:
+ *  SipHash24_Init() for the fast and reasonably strong version
+ *  SipHash48_Init() for the strong version (half as fast)
+ *
+ * SIPHASH_CTX ctx;
+ * SIPHASH_KEY key;	(with the 16 byte key loaded into k0/k1)
+ *
+ * SipHash24_Init(&ctx, &key);
+ * SipHash24_Update(&ctx, pointer_to_string, length_of_string);
+ * SipHash24_Final(output, &ctx);
+ */
+
+#ifndef _SIPHASH_H_
+#define _SIPHASH_H_
+
+#include <linux/types.h>
+
+#define SIPHASH_BLOCK_LENGTH	 8
+#define SIPHASH_KEY_LENGTH	16
+#define SIPHASH_DIGEST_LENGTH	 8
+
+typedef struct _SIPHASH_CTX {
+	u64		v[4];
+	u8		buf[SIPHASH_BLOCK_LENGTH];
+	u32		bytes;
+} SIPHASH_CTX;
+
+typedef struct {
+	__le64		k0;
+	__le64		k1;
+} SIPHASH_KEY;
+
+void	SipHash_Init(SIPHASH_CTX *, const SIPHASH_KEY *);
+void	SipHash_Update(SIPHASH_CTX *, int, int, const void *, size_t);
+u64	SipHash_End(SIPHASH_CTX *, int, int);
+void	SipHash_Final(void *, SIPHASH_CTX *, int, int);
+u64	SipHash(const SIPHASH_KEY *, int, int, const void *, size_t);
+
+#define SipHash24_Init(_c, _k)		SipHash_Init((_c), (_k))
+#define SipHash24_Update(_c, _p, _l)	SipHash_Update((_c), 2, 4, (_p), (_l))
+#define SipHash24_End(_d)		SipHash_End((_d), 2, 4)
+#define SipHash24_Final(_d, _c)		SipHash_Final((_d), (_c), 2, 4)
+#define SipHash24(_k, _p, _l)		SipHash((_k), 2, 4, (_p), (_l))
+
+#define SipHash48_Init(_c, _k)		SipHash_Init((_c), (_k))
+#define SipHash48_Update(_c, _p, _l)	SipHash_Update((_c), 4, 8, (_p), (_l))
+#define SipHash48_End(_d)		SipHash_End((_d), 4, 8)
+#define SipHash48_Final(_d, _c)		SipHash_Final((_d), (_c), 4, 8)
+#define SipHash48(_k, _p, _l)		SipHash((_k), 4, 8, (_p), (_l))
+
+#endif /* _SIPHASH_H_ */
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index e82dcc5ae80c..25e570253b1b 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -12,6 +12,7 @@
 #include "btree.h"
 #include "clock.h"
 #include "debug.h"
+#include "fs-gc.h"
 #include "gc.h"
 #include "inode.h"
 #include "io.h"
@@ -26,6 +27,7 @@
 #include "tier.h"
 #include "writeback.h"
 
+#include 
<linux/backing-dev.h> #include <linux/blkdev.h> #include <linux/crc32c.h> #include <linux/debugfs.h> @@ -139,6 +141,41 @@ static const char *bch_blkdev_open(const char *path, void *holder, return NULL; } +static int bch_congested_fn(void *data, int bdi_bits) +{ + struct backing_dev_info *bdi; + struct cache_set *c = data; + struct cache *ca; + unsigned i; + int ret = 0; + + rcu_read_lock(); + if (bdi_bits & (1 << WB_sync_congested)) { + /* Reads - check all devices: */ + for_each_cache_rcu(ca, c, i) { + bdi = blk_get_backing_dev_info(ca->disk_sb.bdev); + + if (bdi_congested(bdi, bdi_bits)) { + ret = 1; + break; + } + } + } else { + /* Writes only go to tier 0: */ + group_for_each_cache_rcu(ca, &c->cache_tiers[0], i) { + bdi = blk_get_backing_dev_info(ca->disk_sb.bdev); + + if (bdi_congested(bdi, bdi_bits)) { + ret = 1; + break; + } + } + } + rcu_read_unlock(); + + return ret; +} + /* Superblock */ const char *validate_cache_member(struct cache_sb *sb, @@ -601,8 +638,19 @@ static void bch_recalc_capacity(struct cache_set *c) struct cache_group *tier = c->cache_tiers + ARRAY_SIZE(c->cache_tiers); struct cache *ca; u64 capacity = 0; + unsigned long ra_pages = 0; unsigned i, j; + rcu_read_lock(); + for_each_cache_rcu(ca, c, i) { + struct backing_dev_info *bdi = + blk_get_backing_dev_info(ca->disk_sb.bdev); + + ra_pages += bdi->ra_pages; + } + + c->bdi.ra_pages = ra_pages; + /* * Capacity of the cache set is the capacity of all the devices in the * slowest (highest) tier - we don't include lower tier devices. @@ -752,6 +800,9 @@ void bch_cache_set_fail(struct cache_set *c) void bch_cache_set_release(struct kobject *kobj) { struct cache_set *c = container_of(kobj, struct cache_set, kobj); + + if (c->stop_completion) + complete(c->stop_completion); kfree(c); module_put(THIS_MODULE); } @@ -777,6 +828,7 @@ static void cache_set_free(struct closure *cl) percpu_ref_exit(&c->writes); bch_io_clock_exit(&c->io_clock[WRITE]); bch_io_clock_exit(&c->io_clock[READ]); + bdi_destroy(&c->bdi); bioset_exit(&c->btree_bio); bioset_exit(&c->bio_split); mempool_exit(&c->btree_reserve_pool); @@ -998,6 +1050,7 @@ static struct cache_set *bch_cache_set_alloc(struct cache_sb *sb) mempool_init_kmalloc_pool(&c->fill_iter, 1, iter_size) || bioset_init(&c->bio_split, 4, offsetof(struct bbio, bio)) || bioset_init(&c->btree_bio, 1, offsetof(struct bbio, bio)) || + bdi_setup_and_register(&c->bdi, "bcache") || bch_io_clock_init(&c->io_clock[READ]) || bch_io_clock_init(&c->io_clock[WRITE]) || bch_journal_alloc(&c->journal) || @@ -1005,6 +1058,10 @@ static struct cache_set *bch_cache_set_alloc(struct cache_sb *sb) bch_bset_sort_state_init(&c->sort, ilog2(btree_pages(c)))) goto err; + c->bdi.ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_SIZE; + c->bdi.congested_fn = bch_congested_fn; + c->bdi.congested_data = c; + return c; err: bch_cache_set_stop(c); @@ -1144,7 +1201,15 @@ static const char *run_cache_set(struct cache_set *c) } bch_journal_replay(c, &journal); + + err = "error gcing inode nlinks"; + if (bch_gc_inode_nlinks(c)) + goto err; + + bch_verify_inode_refs(c); } else { + struct bkey_i_inode inode; + pr_notice("invalidating existing data"); err = "unable to allocate journal buckets"; @@ -1185,6 +1250,17 @@ static const char *run_cache_set(struct cache_set *c) /* XXX: necessary? 
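(likely to force a journal write so the new btree roots are persisted before the root directory is created below) 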
*/ bch_journal_meta(&c->journal, &cl); closure_sync(&cl); + + bkey_inode_init(&inode.k_i); + inode.k.p.inode = BCACHE_ROOT_INO; + inode.v.i_mode = S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO; + inode.v.i_nlink = 2; + + err = "error creating root directory"; + if (bch_btree_insert(c, BTREE_ID_INODES, + &keylist_single(&inode.k_i), + NULL, &cl, NULL, 0)) + goto err; } bch_prio_timer_start(c, READ); @@ -2342,6 +2418,7 @@ kobj_attribute_write(reboot, reboot_test); static void bcache_exit(void) { bch_debug_exit(); + bch_fs_exit(); bch_blockdev_exit(); if (bcache_kset) kset_unregister(bcache_kset); @@ -2368,6 +2445,7 @@ static int __init bcache_init(void) !(bcache_kset = kset_create_and_add("bcache", NULL, fs_kobj)) || sysfs_create_files(&bcache_kset->kobj, files) || bch_blockdev_init() || + bch_fs_init() || bch_debug_init()) goto err; diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h index 5311afcd3a1c..0704697e762e 100644 --- a/drivers/md/bcache/util.h +++ b/drivers/md/bcache/util.h @@ -24,6 +24,8 @@ struct closure; #define atomic_inc_bug(v, i) BUG_ON(atomic_inc_return(v) <= i) #define atomic_sub_bug(i, v) BUG_ON(atomic_sub_return(i, v) < 0) #define atomic_add_bug(i, v) BUG_ON(atomic_add_return(i, v) < 0) +#define atomic_long_dec_bug(v) BUG_ON(atomic_long_dec_return(v) < 0) +#define atomic_long_sub_bug(i, v) BUG_ON(atomic_long_sub_return(i, v) < 0) #define atomic64_dec_bug(v) BUG_ON(atomic64_dec_return(v) < 0) #define atomic64_inc_bug(v, i) BUG_ON(atomic64_inc_return(v) <= i) #define atomic64_sub_bug(i, v) BUG_ON(atomic64_sub_return(i, v) < 0) @@ -36,6 +38,8 @@ struct closure; #define atomic_inc_bug(v, i) atomic_inc(v) #define atomic_sub_bug(i, v) atomic_sub(i, v) #define atomic_add_bug(i, v) atomic_add(i, v) +#define atomic_long_dec_bug(v) atomic_long_dec(v) +#define atomic_long_sub_bug(i, v) atomic_long_sub(i, v) #define atomic64_dec_bug(v) atomic64_dec(v) #define atomic64_inc_bug(v, i) atomic64_inc(v) #define atomic64_sub_bug(i, v) atomic64_sub(i, v) diff --git a/drivers/md/bcache/xattr.c b/drivers/md/bcache/xattr.c new file mode 100644 index 000000000000..404796ff8163 --- /dev/null +++ b/drivers/md/bcache/xattr.c @@ -0,0 +1,414 @@ + +#include "bcache.h" +#include "btree.h" +#include "extents.h" +#include "fs.h" +#include "keylist.h" +#include "siphash.h" +#include "xattr.h" + +#include "linux/crc32c.h" +#include "linux/cryptohash.h" +#include "linux/posix_acl_xattr.h" +#include "linux/xattr.h" + +#if 0 +/* + * XXX: should really include x_type here + */ +static u64 bch_xattr_hash(const struct qstr *name) +{ + union { + u32 b[SHA_DIGEST_WORDS]; + u64 ret; + } digest; + + unsigned done = 0; + + sha_init(digest.b); + + while (done < name->len) { + u32 workspace[SHA_WORKSPACE_WORDS]; + u8 message[SHA_MESSAGE_BYTES]; + unsigned bytes = min_t(unsigned, name->len - done, + SHA_MESSAGE_BYTES); + + memcpy(message, name->name + done, bytes); + memset(message + bytes, 0, SHA_MESSAGE_BYTES - bytes); + sha_transform(digest.b, message, workspace); + done += bytes; + } + + return digest.ret; +} + +static const SIPHASH_KEY bch_siphash_key; + +static u64 bch_xattr_hash(const struct qstr *name, u8 type) +{ +#if 0 + SIPHASH_CTX ctx; + + SipHash24_Init(&ctx, &bch_siphash_key); + SipHash24_Update(&ctx, &type, sizeof(type)); + SipHash24_Update(&ctx, name->name, name->len); + + return SipHash24_End(&ctx) >> 1; +#else + return SipHash24(&bch_siphash_key, name->name, name->len) >> 1; +#endif +} +#endif + +static u64 bch_xattr_hash(const struct qstr *name, u8 type) +{ + return crc32c(0, name->name, name->len); 
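/* note: this uses only 32 bits of the 64 bit hash space, so collisions are expected; callers resolve them by linear probing */ 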
+} + +#define xattr_val(_xattr) ((_xattr)->x_name + (_xattr)->x_name_len) + +static int xattr_cmp(const struct bch_xattr *xattr, + u8 type, const struct qstr *q) +{ + return xattr->x_type != type || + xattr->x_name_len != q->len || + memcmp(xattr->x_name, q->name, q->len); +} + +static bool bch_xattr_invalid(const struct cache_set *c, struct bkey_s_c k) +{ + switch (k.k->type) { + case BCH_XATTR: + if (bkey_val_bytes(k.k) < sizeof(struct bch_xattr)) + return true; + + return false; + case BCH_XATTR_WHITEOUT: + if (bkey_val_bytes(k.k)) + return true; + + return false; + default: + return true; + } +} + +static void bch_xattr_to_text(struct cache_set *c, char *buf, + size_t size, struct bkey_s_c k) +{ + struct bkey_s_c_xattr xattr; + int n; + + switch (k.k->type) { + case BCH_XATTR: + xattr = bkey_s_c_to_xattr(k); + + if (size) { + n = min_t(unsigned, size, xattr.v->x_name_len); + memcpy(buf, xattr.v->x_name, n); + buf[size - 1] = '\0'; + buf += n; + size -= n; + } + + n = scnprintf(buf, size, " -> "); + buf += n; + size -= n; + + if (size) { + n = min_t(unsigned, size, xattr.v->x_val_len); + memcpy(buf, xattr_val(xattr.v), n); + buf[size - 1] = '\0'; + buf += n; + size -= n; + } + + break; + case BCH_XATTR_WHITEOUT: + scnprintf(buf, size, "whiteout"); + break; + } +} + +const struct btree_keys_ops bch_xattr_ops = { +}; + +const struct bkey_ops bch_bkey_xattr_ops = { + .key_invalid = bch_xattr_invalid, + .val_to_text = bch_xattr_to_text, +}; + +int bch_xattr_get(struct cache_set *c, u64 inum, const char *name, + void *buffer, size_t size, int type) +{ + struct qstr qname = (struct qstr) QSTR_INIT(name, strlen(name)); + struct btree_iter iter; + struct bkey_s_c k; + const struct bch_xattr *xattr; + int ret = -ENODATA; + + for_each_btree_key_with_holes(&iter, c, BTREE_ID_XATTRS, + POS(inum, bch_xattr_hash(&qname, type)), k) { + switch (k.k->type) { + case BCH_XATTR: + xattr = bkey_s_c_to_xattr(k).v; + + /* collision? */ + if (!xattr_cmp(xattr, type, &qname)) { + ret = xattr->x_val_len; + if (buffer) { + if (xattr->x_val_len > size) + ret = -ERANGE; + else + memcpy(buffer, xattr_val(xattr), + xattr->x_val_len); + } + goto out; + } + break; + case BCH_XATTR_WHITEOUT: + break; + default: + /* hole, not found */ + goto out; + } + } +out: + bch_btree_iter_unlock(&iter); + return ret; +} + +int bch_xattr_set(struct inode *inode, const char *name, + const void *value, size_t size, + int flags, int type) +{ + struct bch_inode_info *ei = to_bch_ei(inode); + struct cache_set *c = inode->i_sb->s_fs_info; + struct btree_iter iter; + struct bkey_s_c k; + struct keylist keys; + struct qstr qname = (struct qstr) QSTR_INIT((char *) name, + strlen(name)); + int ret = -EINVAL; + unsigned insert_flags = BTREE_INSERT_ATOMIC; + + if (!value) + insert_flags |= BTREE_INSERT_NOFAIL; + + bch_btree_iter_init_intent(&iter, c, BTREE_ID_XATTRS, + POS(inode->i_ino, + bch_xattr_hash(&qname, type))); + + while ((k = bch_btree_iter_peek_with_holes(&iter)).k) { + switch (k.k->type) { + case BCH_XATTR: + /* collision? 
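(another name that hashed to the same slot: skip it and keep probing) 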
*/ + if (xattr_cmp(bkey_s_c_to_xattr(k).v, type, &qname)) { + bch_btree_iter_advance_pos(&iter); + continue; + } + + if (flags & XATTR_CREATE) { + ret = -EEXIST; + goto out; + } + + break; + case BCH_XATTR_WHITEOUT: + bch_btree_iter_advance_pos(&iter); + continue; + default: + /* hole, not found */ + if (flags & XATTR_REPLACE) { + ret = -ENODATA; + goto out; + } + break; + } + + bch_keylist_init(&keys); + + if (value) { + struct bkey_i_xattr *xattr; + unsigned u64s = BKEY_U64s + + DIV_ROUND_UP(sizeof(struct bch_xattr) + + qname.len + size, + sizeof(u64)); + + if (u64s > U8_MAX) { + ret = -ERANGE; + break; + } + + if (bch_keylist_realloc(&keys, u64s)) { + ret = -ENOMEM; + break; + } + + xattr = bkey_xattr_init(keys.top); + xattr->k.u64s = u64s; + xattr->k.p = k.k->p; + xattr->v.x_type = type; + xattr->v.x_name_len = qname.len; + xattr->v.x_val_len = size; + memcpy(xattr->v.x_name, qname.name, qname.len); + memcpy(xattr_val(&xattr->v), value, size); + + BUG_ON(xattr_cmp(&xattr->v, type, &qname)); + } else { + /* removing */ + bkey_init(&keys.top->k); + keys.top->k.type = BCH_XATTR_WHITEOUT; + keys.top->k.p = k.k->p; + } + + bch_keylist_enqueue(&keys); + + ret = bch_btree_insert_at(&iter, &keys, NULL, + &ei->journal_seq, + insert_flags); + bch_keylist_free(&keys); + + if (ret != -EINTR && ret != -EAGAIN) + break; + } +out: + bch_btree_iter_unlock(&iter); + return ret; +} + +static const struct xattr_handler *bch_xattr_type_to_handler(unsigned); + +static size_t bch_xattr_emit(struct dentry *dentry, + const struct bch_xattr *xattr, + char *buffer, size_t buffer_size) +{ + const struct xattr_handler *handler = + bch_xattr_type_to_handler(xattr->x_type); + + if (handler && (!handler->list || handler->list(dentry))) { + const size_t prefix_len = strlen(handler->prefix); + const size_t total_len = prefix_len + xattr->x_name_len + 1; + + if (buffer && total_len <= buffer_size) { + memcpy(buffer, handler->prefix, prefix_len); + memcpy(buffer + prefix_len, + xattr->x_name, xattr->x_name_len); + buffer[prefix_len + xattr->x_name_len] = '\0'; + } + + return total_len; + } else { + return 0; + } +} + +ssize_t bch_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size) +{ + struct cache_set *c = dentry->d_sb->s_fs_info; + struct btree_iter iter; + struct bkey_s_c k; + const struct bch_xattr *xattr; + u64 inum = dentry->d_inode->i_ino; + ssize_t ret = 0; + size_t len; + + for_each_btree_key(&iter, c, BTREE_ID_XATTRS, POS(inum, 0), k) { + BUG_ON(k.k->p.inode < inum); + + if (k.k->p.inode > inum) + break; + + if (k.k->type != BCH_XATTR) + continue; + + xattr = bkey_s_c_to_xattr(k).v; + + len = bch_xattr_emit(dentry, xattr, buffer, buffer_size); + if (buffer) { + if (len > buffer_size) { + bch_btree_iter_unlock(&iter); + return -ERANGE; + } + + buffer += len; + buffer_size -= len; + } + + ret += len; + + } + bch_btree_iter_unlock(&iter); + + return ret; +} + +static int bch_xattr_get_handler(const struct xattr_handler *handler, + struct dentry *dentry, struct inode *inode, + const char *name, void *buffer, size_t size) +{ + return bch_xattr_get(inode->i_sb->s_fs_info, inode->i_ino, + name, buffer, size, handler->flags); +} + +static int bch_xattr_set_handler(const struct xattr_handler *handler, + struct dentry *dentry, struct inode *inode, + const char *name, const void *value, + size_t size, int flags) +{ + return bch_xattr_set(inode, name, value, size, flags, + handler->flags); +} + +static const struct xattr_handler bch_xattr_user_handler = { + .prefix = XATTR_USER_PREFIX, + .get = 
bch_xattr_get_handler, + .set = bch_xattr_set_handler, + .flags = BCH_XATTR_INDEX_USER, +}; + +static bool bch_xattr_trusted_list(struct dentry *dentry) +{ + return capable(CAP_SYS_ADMIN); +} + +static const struct xattr_handler bch_xattr_trusted_handler = { + .prefix = XATTR_TRUSTED_PREFIX, + .list = bch_xattr_trusted_list, + .get = bch_xattr_get_handler, + .set = bch_xattr_set_handler, + .flags = BCH_XATTR_INDEX_TRUSTED, +}; + +static const struct xattr_handler bch_xattr_security_handler = { + .prefix = XATTR_SECURITY_PREFIX, + .get = bch_xattr_get_handler, + .set = bch_xattr_set_handler, + .flags = BCH_XATTR_INDEX_SECURITY, +}; + +static const struct xattr_handler *bch_xattr_handler_map[] = { + [BCH_XATTR_INDEX_USER] = &bch_xattr_user_handler, + [BCH_XATTR_INDEX_POSIX_ACL_ACCESS] = + &posix_acl_access_xattr_handler, + [BCH_XATTR_INDEX_POSIX_ACL_DEFAULT] = + &posix_acl_default_xattr_handler, + [BCH_XATTR_INDEX_TRUSTED] = &bch_xattr_trusted_handler, + [BCH_XATTR_INDEX_SECURITY] = &bch_xattr_security_handler, +}; + +const struct xattr_handler *bch_xattr_handlers[] = { + &bch_xattr_user_handler, + &posix_acl_access_xattr_handler, + &posix_acl_default_xattr_handler, + &bch_xattr_trusted_handler, + &bch_xattr_security_handler, + NULL +}; + +static const struct xattr_handler *bch_xattr_type_to_handler(unsigned type) +{ + return type < ARRAY_SIZE(bch_xattr_handler_map) + ? bch_xattr_handler_map[type] + : NULL; +} diff --git a/drivers/md/bcache/xattr.h b/drivers/md/bcache/xattr.h new file mode 100644 index 000000000000..839d47ef6910 --- /dev/null +++ b/drivers/md/bcache/xattr.h @@ -0,0 +1,16 @@ +#ifndef _BCACHE_XATTR_H +#define _BCACHE_XATTR_H + +extern const struct btree_keys_ops bch_xattr_ops; +extern const struct bkey_ops bch_bkey_xattr_ops; + +struct dentry; +struct xattr_handler; + +int bch_xattr_get(struct cache_set *, u64, const char *, void *, size_t, int); +int bch_xattr_set(struct inode *, const char *, const void *, size_t, int, int); +ssize_t bch_xattr_list(struct dentry *, char *, size_t); + +extern const struct xattr_handler *bch_xattr_handlers[]; + +#endif /* _BCACHE_XATTR_H */ diff --git a/include/uapi/linux/bcache.h b/include/uapi/linux/bcache.h index 27a2926aec21..34523f6f129c 100644 --- a/include/uapi/linux/bcache.h +++ b/include/uapi/linux/bcache.h @@ -288,6 +288,8 @@ BITMASK(EXTENT_CACHED, struct bch_extent, data[0], 63, 64) #define BLOCKDEV_INODE_MAX 4096 +#define BCACHE_ROOT_INO 4096 + enum bch_inode_types { BCH_INODE_FS = 128, BCH_INODE_BLOCKDEV = 129, @@ -336,6 +338,62 @@ BKEY_VAL_TYPE(inode_blockdev, BCH_INODE_BLOCKDEV); BITMASK(INODE_FLASH_ONLY, struct bch_inode_blockdev, i_inode.i_flags, 0, 1); +/* Dirents */ + +/* + * Dirents (and xattrs) have to implement string lookups; since our b-tree + * doesn't support arbitrary length strings for the key, we instead index by a + * 64 bit hash (currently truncated sha1) of the string, stored in the offset + * field of the key - using linear probing to resolve hash collisions. This also + * provides us with the readdir cookie posix requires. 
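+ * + * (e.g. looking up "foo" in directory 4096: start at POS(4096, hash("foo")) and + * walk forward, skipping whiteouts and names that merely collide, until the + * dirent is found or a hole proves it absent.) 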
+ * + * Linear probing requires us to use whiteouts for deletions, in the event of a + * collision: + */ + +enum { + BCH_DIRENT = 128, + BCH_DIRENT_WHITEOUT = 129, +}; + +struct bch_dirent { + struct bch_val v; + + /* Target inode number: */ + __u64 d_inum; + + /* + * Copy of mode bits 12-15 from the target inode - so userspace can get + * the filetype without having to do a stat() + */ + __u8 d_type; + + __u8 d_name[]; +} __attribute__((packed)); +BKEY_VAL_TYPE(dirent, BCH_DIRENT); + +/* Xattrs */ + +enum { + BCH_XATTR = 128, + BCH_XATTR_WHITEOUT = 129, +}; + +#define BCH_XATTR_INDEX_USER 0 +#define BCH_XATTR_INDEX_POSIX_ACL_ACCESS 1 +#define BCH_XATTR_INDEX_POSIX_ACL_DEFAULT 2 +#define BCH_XATTR_INDEX_TRUSTED 3 +#define BCH_XATTR_INDEX_SECURITY 4 + +struct bch_xattr { + struct bch_val v; + __u8 x_type; + __u8 x_name_len; + __u16 x_val_len; + __u8 x_name[]; +} __attribute__((packed)); +BKEY_VAL_TYPE(xattr, BCH_XATTR); + /* Superblock */ /* Version 0: Cache device @@ -488,6 +546,14 @@ BITMASK(CACHE_BTREE_NODE_SIZE, struct cache_sb, flags, 20, 36); BITMASK(CACHE_SET_META_REPLICAS_HAVE, struct cache_sb, flags, 36, 40); BITMASK(CACHE_SET_DATA_REPLICAS_HAVE, struct cache_sb, flags, 40, 44); +BITMASK(CACHE_SET_DIRENT_CSUM_TYPE, struct cache_sb, flags, 44, 48); +enum { + BCH_DIRENT_CSUM_CRC32C = 0, + BCH_DIRENT_CSUM_CRC64 = 1, + BCH_DIRENT_CSUM_SIPHASH = 2, + BCH_DIRENT_CSUM_SHA1 = 3, +}; + BITMASK(BDEV_CACHE_MODE, struct cache_sb, flags, 0, 4); #define CACHE_MODE_WRITETHROUGH 0U #define CACHE_MODE_WRITEBACK 1U @@ -532,6 +598,10 @@ static inline _Bool SB_IS_BDEV(const struct cache_sb *sb) UUID_LE(0xf67385c6, 0x1a4e, 0xca45, \ 0x82, 0x65, 0xf5, 0x7f, 0x48, 0xba, 0x6d, 0x81) +#define BCACHE_STATFS_MAGIC 0xca451a4e + +#define BCACHE_SB_MAGIC 0xca451a4ef67385c6ULL +#define BCACHE_SB_MAGIC2 0x816dba487ff56582ULL #define JSET_MAGIC 0x245235c1a3625032ULL #define PSET_MAGIC 0x6750e15f87337f91ULL #define BSET_MAGIC 0x90135c78b99e07f5ULL @@ -571,7 +641,9 @@ static inline __u64 bset_magic(struct cache_sb *sb) #define DEFINE_BCH_BTREE_IDS() \ DEF_BTREE_ID(EXTENTS, 0, "extents") \ - DEF_BTREE_ID(INODES, 1, "inodes") + DEF_BTREE_ID(INODES, 1, "inodes") \ + DEF_BTREE_ID(DIRENTS, 2, "dirents") \ + DEF_BTREE_ID(XATTRS, 3, "xattrs") #define DEF_BTREE_ID(kwd, val, name) BTREE_ID_##kwd = val, @@ -803,3 +875,5 @@ BITMASK(UUID_FLASH_ONLY, struct uuid_entry, flags, 0, 1); } #endif #endif /* _LINUX_BCACHE_H */ + +/* vim: set foldnestmax=2: */ |
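
The linear-probing discipline the dirents comment describes is easier to see outside the btree machinery. Here is a minimal standalone sketch of the same idea in plain C; it is illustrative only, and hash_name(), struct slot, and lookup_slot() are invented stand-ins, not code from this patch:

	#include <stdint.h>
	#include <string.h>

	enum slot_type { HOLE, ENTRY, WHITEOUT };

	struct slot {
		enum slot_type	type;
		const char	*name;
		uint64_t	inum;		/* dirent payload: target inode */
	};

	/* toy fixed-size table standing in for the sparse (inode, hash) keyspace */
	struct table {
		struct slot	*slots;
		uint64_t	nr;
	};

	/* FNV-1a here, standing in for the crc32c/siphash/sha1 options above: */
	static uint64_t hash_name(const char *name)
	{
		uint64_t h = 14695981039346656037ULL;

		while (*name)
			h = (h ^ (unsigned char) *name++) * 1099511628211ULL;
		return h;
	}

	/*
	 * Probe from the name's hash: mismatching names and whiteouts mean
	 * "keep going", the first hole means "not present". (The real
	 * keyspace is 64 bits and sparse, so probe chains are short and
	 * always end at a hole; this toy table doesn't handle being full.)
	 */
	static struct slot *lookup_slot(struct table *t, const char *name)
	{
		uint64_t i;

		for (i = hash_name(name) % t->nr;; i = (i + 1) % t->nr) {
			struct slot *s = &t->slots[i];

			switch (s->type) {
			case ENTRY:
				if (!strcmp(s->name, name))
					return s;
				break;		/* hash collision: keep probing */
			case WHITEOUT:
				break;		/* deleted entry: keep probing */
			case HOLE:
				return NULL;	/* definitely absent */
			}
		}
	}

This is also why deletion must leave a BCH_DIRENT_WHITEOUT behind rather than a hole: lookup_slot() treats the first hole as proof of absence, so punching a hole in the middle of a probe chain would make every colliding entry past it unreachable.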