-rw-r--r--  drivers/md/bcache/Kconfig        |    1
-rw-r--r--  drivers/md/bcache/Makefile       |    9
-rw-r--r--  drivers/md/bcache/acl.c          |  245
-rw-r--r--  drivers/md/bcache/acl.h          |   57
-rw-r--r--  drivers/md/bcache/bcache.h       |   26
-rw-r--r--  drivers/md/bcache/bkey.h         |    4
-rw-r--r--  drivers/md/bcache/bkey_methods.c |    4
-rw-r--r--  drivers/md/bcache/buckets.h      |   25
-rw-r--r--  drivers/md/bcache/debug.c        |   90
-rw-r--r--  drivers/md/bcache/debug.h        |    2
-rw-r--r--  drivers/md/bcache/dirent.c       |  379
-rw-r--r--  drivers/md/bcache/dirent.h       |   21
-rw-r--r--  drivers/md/bcache/extents.c      |    4
-rw-r--r--  drivers/md/bcache/fs-gc.c        |  202
-rw-r--r--  drivers/md/bcache/fs-gc.h        |    6
-rw-r--r--  drivers/md/bcache/fs.c           | 2087
-rw-r--r--  drivers/md/bcache/fs.h           |   20
-rw-r--r--  drivers/md/bcache/inode.c        |   50
-rw-r--r--  drivers/md/bcache/inode.h        |    1
-rw-r--r--  drivers/md/bcache/siphash.c      |  185
-rw-r--r--  drivers/md/bcache/siphash.h      |   86
-rw-r--r--  drivers/md/bcache/super.c        |   78
-rw-r--r--  drivers/md/bcache/util.h         |    4
-rw-r--r--  drivers/md/bcache/xattr.c        |  414
-rw-r--r--  drivers/md/bcache/xattr.h        |   16
-rw-r--r--  include/uapi/linux/bcache.h      |   76
26 files changed, 4069 insertions(+), 23 deletions(-)
diff --git a/drivers/md/bcache/Kconfig b/drivers/md/bcache/Kconfig
index 5502372dfc94..55e135f6dd61 100644
--- a/drivers/md/bcache/Kconfig
+++ b/drivers/md/bcache/Kconfig
@@ -2,6 +2,7 @@
config BCACHE
tristate "Block device as cache"
select LIBCRC32C
+ select FS_POSIX_ACL
---help---
Allows a block device to be used as cache for other devices; uses
a btree for indexing and the layout is optimized for SSDs.
diff --git a/drivers/md/bcache/Makefile b/drivers/md/bcache/Makefile
index 02ef2612777e..0dd3db8a5ef4 100644
--- a/drivers/md/bcache/Makefile
+++ b/drivers/md/bcache/Makefile
@@ -1,9 +1,10 @@
obj-$(CONFIG_BCACHE) += bcache.o
-bcache-y := alloc.o bkey.o bkey_methods.o blockdev.o bset.o\
- btree.o buckets.o clock.o closure.o debug.o extents.o gc.o inode.o io.o\
- journal.o keybuf.o keylist.o migrate.o move.o movinggc.o notify.o\
- request.o six.o stats.o super.o sysfs.o tier.o trace.o util.o writeback.o
+bcache-y := acl.o alloc.o bkey.o bkey_methods.o blockdev.o\
+ bset.o btree.o buckets.o clock.o closure.o debug.o dirent.o extents.o\
+ fs.o fs-gc.o gc.o inode.o io.o journal.o keybuf.o keylist.o migrate.o\
+ move.o movinggc.o notify.o request.o siphash.o six.o stats.o super.o\
+ sysfs.o tier.o trace.o util.o writeback.o xattr.o
ccflags-y := -Werror
diff --git a/drivers/md/bcache/acl.c b/drivers/md/bcache/acl.c
new file mode 100644
index 000000000000..51f04ab2a9d5
--- /dev/null
+++ b/drivers/md/bcache/acl.c
@@ -0,0 +1,245 @@
+#include "bcache.h"
+
+#include <linux/init.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/fs.h>
+
+#include "xattr.h"
+#include "acl.h"
+
+/*
+ * Convert from filesystem to in-memory representation.
+ */
+static struct posix_acl *bch_acl_from_disk(const void *value, size_t size)
+{
+ const char *end = (char *)value + size;
+ int n, count;
+ struct posix_acl *acl;
+
+ if (!value)
+ return NULL;
+ if (size < sizeof(bch_acl_header))
+ return ERR_PTR(-EINVAL);
+ if (((bch_acl_header *)value)->a_version !=
+ cpu_to_le32(BCH_ACL_VERSION))
+ return ERR_PTR(-EINVAL);
+ value = (char *)value + sizeof(bch_acl_header);
+ count = bch_acl_count(size);
+ if (count < 0)
+ return ERR_PTR(-EINVAL);
+ if (count == 0)
+ return NULL;
+ acl = posix_acl_alloc(count, GFP_KERNEL);
+ if (!acl)
+ return ERR_PTR(-ENOMEM);
+ for (n = 0; n < count; n++) {
+ bch_acl_entry *entry =
+ (bch_acl_entry *)value;
+ if ((char *)value + sizeof(bch_acl_entry_short) > end)
+ goto fail;
+ acl->a_entries[n].e_tag = le16_to_cpu(entry->e_tag);
+ acl->a_entries[n].e_perm = le16_to_cpu(entry->e_perm);
+ switch (acl->a_entries[n].e_tag) {
+ case ACL_USER_OBJ:
+ case ACL_GROUP_OBJ:
+ case ACL_MASK:
+ case ACL_OTHER:
+ value = (char *)value +
+ sizeof(bch_acl_entry_short);
+ break;
+
+ case ACL_USER:
+ value = (char *)value + sizeof(bch_acl_entry);
+ if ((char *)value > end)
+ goto fail;
+ acl->a_entries[n].e_uid =
+ make_kuid(&init_user_ns,
+ le32_to_cpu(entry->e_id));
+ break;
+ case ACL_GROUP:
+ value = (char *)value + sizeof(bch_acl_entry);
+ if ((char *)value > end)
+ goto fail;
+ acl->a_entries[n].e_gid =
+ make_kgid(&init_user_ns,
+ le32_to_cpu(entry->e_id));
+ break;
+
+ default:
+ goto fail;
+ }
+ }
+ if (value != end)
+ goto fail;
+ return acl;
+
+fail:
+ posix_acl_release(acl);
+ return ERR_PTR(-EINVAL);
+}
+
+/*
+ * Convert from in-memory to filesystem representation.
+ */
+static void *bch_acl_to_disk(const struct posix_acl *acl, size_t *size)
+{
+ bch_acl_header *ext_acl;
+ char *e;
+ size_t n;
+
+ *size = bch_acl_size(acl->a_count);
+ ext_acl = kmalloc(sizeof(bch_acl_header) + acl->a_count *
+ sizeof(bch_acl_entry), GFP_KERNEL);
+ if (!ext_acl)
+ return ERR_PTR(-ENOMEM);
+ ext_acl->a_version = cpu_to_le32(BCH_ACL_VERSION);
+ e = (char *)ext_acl + sizeof(bch_acl_header);
+ for (n = 0; n < acl->a_count; n++) {
+ const struct posix_acl_entry *acl_e = &acl->a_entries[n];
+ bch_acl_entry *entry = (bch_acl_entry *)e;
+
+ entry->e_tag = cpu_to_le16(acl_e->e_tag);
+ entry->e_perm = cpu_to_le16(acl_e->e_perm);
+ switch (acl_e->e_tag) {
+ case ACL_USER:
+ entry->e_id = cpu_to_le32(
+ from_kuid(&init_user_ns, acl_e->e_uid));
+ e += sizeof(bch_acl_entry);
+ break;
+ case ACL_GROUP:
+ entry->e_id = cpu_to_le32(
+ from_kgid(&init_user_ns, acl_e->e_gid));
+ e += sizeof(bch_acl_entry);
+ break;
+
+ case ACL_USER_OBJ:
+ case ACL_GROUP_OBJ:
+ case ACL_MASK:
+ case ACL_OTHER:
+ e += sizeof(bch_acl_entry_short);
+ break;
+
+ default:
+ goto fail;
+ }
+ }
+ return (char *)ext_acl;
+
+fail:
+ kfree(ext_acl);
+ return ERR_PTR(-EINVAL);
+}
+
+struct posix_acl *bch_get_acl(struct inode *inode, int type)
+{
+ struct cache_set *c = inode->i_sb->s_fs_info;
+ int name_index;
+ char *value = NULL;
+ struct posix_acl *acl;
+ int ret;
+
+ switch (type) {
+ case ACL_TYPE_ACCESS:
+ name_index = BCH_XATTR_INDEX_POSIX_ACL_ACCESS;
+ break;
+ case ACL_TYPE_DEFAULT:
+ name_index = BCH_XATTR_INDEX_POSIX_ACL_DEFAULT;
+ break;
+ default:
+ BUG();
+ }
+ ret = bch_xattr_get(c, inode->i_ino, "", NULL, 0, name_index);
+ if (ret > 0) {
+ value = kmalloc(ret, GFP_KERNEL);
+ if (!value)
+ return ERR_PTR(-ENOMEM);
+ ret = bch_xattr_get(c, inode->i_ino, "", value,
+ ret, name_index);
+ }
+ if (ret > 0)
+ acl = bch_acl_from_disk(value, ret);
+ else if (ret == -ENODATA || ret == -ENOSYS)
+ acl = NULL;
+ else
+ acl = ERR_PTR(ret);
+ kfree(value);
+
+ if (!IS_ERR(acl))
+ set_cached_acl(inode, type, acl);
+
+ return acl;
+}
+
+int bch_set_acl(struct inode *inode, struct posix_acl *acl, int type)
+{
+ int name_index;
+ void *value = NULL;
+ size_t size = 0;
+ int ret;
+
+ switch (type) {
+ case ACL_TYPE_ACCESS:
+ name_index = BCH_XATTR_INDEX_POSIX_ACL_ACCESS;
+ if (acl) {
+ ret = posix_acl_equiv_mode(acl, &inode->i_mode);
+ if (ret < 0)
+ return ret;
+ else {
+ inode->i_ctime = CURRENT_TIME_SEC;
+ mark_inode_dirty(inode);
+ if (ret == 0)
+ acl = NULL;
+ }
+ }
+ break;
+
+ case ACL_TYPE_DEFAULT:
+ name_index = BCH_XATTR_INDEX_POSIX_ACL_DEFAULT;
+ if (!S_ISDIR(inode->i_mode))
+ return acl ? -EACCES : 0;
+ break;
+
+ default:
+ return -EINVAL;
+ }
+
+ if (acl) {
+ value = bch_acl_to_disk(acl, &size);
+ if (IS_ERR(value))
+ return (int)PTR_ERR(value);
+ }
+
+ ret = bch_xattr_set(inode, "", value, size, 0, name_index);
+
+ kfree(value);
+
+ if (ret == -ERANGE)
+ ret = -E2BIG;
+
+ if (!ret)
+ set_cached_acl(inode, type, acl);
+
+ return ret;
+}
+
+int bch_init_acl(struct inode *inode, struct inode *dir)
+{
+ struct posix_acl *default_acl, *acl;
+ int ret;
+
+ ret = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl);
+ if (ret)
+ return ret;
+
+ if (default_acl) {
+ ret = bch_set_acl(inode, default_acl, ACL_TYPE_DEFAULT);
+ posix_acl_release(default_acl);
+ }
+ if (acl) {
+ if (!ret)
+ ret = bch_set_acl(inode, acl, ACL_TYPE_ACCESS);
+ posix_acl_release(acl);
+ }
+ return ret;
+}
diff --git a/drivers/md/bcache/acl.h b/drivers/md/bcache/acl.h
new file mode 100644
index 000000000000..03f93fa0ff1b
--- /dev/null
+++ b/drivers/md/bcache/acl.h
@@ -0,0 +1,57 @@
+/*
+ File: fs/bch/acl.h
+
+ (C) 2001 Andreas Gruenbacher, <a.gruenbacher@computer.org>
+*/
+
+#include <linux/posix_acl_xattr.h>
+
+#define BCH_ACL_VERSION 0x0001
+
+typedef struct {
+ __le16 e_tag;
+ __le16 e_perm;
+ __le32 e_id;
+} bch_acl_entry;
+
+typedef struct {
+ __le16 e_tag;
+ __le16 e_perm;
+} bch_acl_entry_short;
+
+typedef struct {
+ __le32 a_version;
+} bch_acl_header;
+
+static inline size_t bch_acl_size(int count)
+{
+ if (count <= 4) {
+ return sizeof(bch_acl_header) +
+ count * sizeof(bch_acl_entry_short);
+ } else {
+ return sizeof(bch_acl_header) +
+ 4 * sizeof(bch_acl_entry_short) +
+ (count - 4) * sizeof(bch_acl_entry);
+ }
+}
+
+static inline int bch_acl_count(size_t size)
+{
+ ssize_t s;
+
+ size -= sizeof(bch_acl_header);
+ s = size - 4 * sizeof(bch_acl_entry_short);
+ if (s < 0) {
+ if (size % sizeof(bch_acl_entry_short))
+ return -1;
+ return size / sizeof(bch_acl_entry_short);
+ } else {
+ if (s % sizeof(bch_acl_entry))
+ return -1;
+ return s / sizeof(bch_acl_entry) + 4;
+ }
+}
+
+extern struct posix_acl *bch_get_acl(struct inode *, int);
+extern int bch_set_acl(struct inode *, struct posix_acl *, int);
+extern int bch_init_acl(struct inode *, struct inode *);
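[Note: the two inline helpers above encode the hybrid layout ext2/ext4 use for on-disk ACLs. The size formula works because any ACL containing an ACL_USER or ACL_GROUP entry must also carry ACL_MASK, so at most four entries (USER_OBJ, GROUP_OBJ, MASK, OTHER, which never need an e_id) are ever stored in the 4-byte short form; everything past the fourth entry is the full 8-byte form. A standalone userspace sanity check of that math, assuming the packed struct sizes above (4-byte header, 4-byte short entry, 8-byte full entry):

#include <assert.h>
#include <stddef.h>
#include <sys/types.h>

static size_t acl_size(int count)
{
	return count <= 4
		? 4 + count * 4
		: 4 + 4 * 4 + (count - 4) * 8;
}

static int acl_count(size_t size)
{
	ssize_t s = (ssize_t)(size - 4) - 4 * 4;

	if (s < 0)
		return (size - 4) % 4 ? -1 : (int)((size - 4) / 4);
	return s % 8 ? -1 : (int)(s / 8 + 4);
}

int main(void)
{
	/* USER_OBJ, GROUP_OBJ, OTHER -> 3 short entries */
	assert(acl_size(3) == 16 && acl_count(16) == 3);
	/* + MASK, one ACL_USER, one ACL_GROUP -> 4 short + 2 full */
	assert(acl_size(6) == 36 && acl_count(36) == 6);
	/* a truncated buffer doesn't decode to a whole entry count */
	assert(acl_count(18) == -1);
	return 0;
}
]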
diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index 5aa2c2863c3c..b203e28c48ca 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -351,11 +351,12 @@ struct cache {
};
struct gc_stat {
- size_t nodes;
- size_t key_bytes;
- size_t nkeys;
- uint64_t data; /* sectors */
+ u64 nodes;
+ u64 key_bytes;
+ u64 nkeys;
+ u64 data; /* sectors */
+ u64 inodes;
};
/*
@@ -384,6 +385,7 @@ enum {
CACHE_SET_RO,
CACHE_SET_GC_STOPPING,
CACHE_SET_GC_FAILURE,
+ CACHE_SET_BDEV_MOUNTED,
};
struct cache_member_rcu {
@@ -404,6 +406,7 @@ struct cache_set {
struct list_head list;
struct kobject kobj;
struct kobject internal;
+ struct completion *stop_completion;
unsigned long flags;
/* Counts outstanding writes, for clean transition to read-only */
@@ -423,10 +426,13 @@ struct cache_set {
struct bio_set bio_split;
+ /* For punting bio submissions to workqueue, io.c */
struct bio_list bio_submit_list;
struct work_struct bio_submit_work;
spinlock_t bio_submit_lock;
+ struct backing_dev_info bdi;
+
/* BTREE CACHE */
struct bio_set btree_bio;
@@ -481,9 +487,16 @@ struct cache_set {
struct timer_list foreground_write_wakeup;
+ /*
+ * These contain all r/w devices - i.e. devices we can currently
+ * allocate from:
+ */
struct cache_group cache_all;
struct cache_group cache_tiers[CACHE_TIERS];
+
u64 capacity; /* sectors */
+ atomic_long_t sectors_reserved;
+ atomic_long_t sectors_reserved_cache;
struct mutex bucket_lock;
@@ -567,6 +580,9 @@ struct cache_set {
struct work_struct read_race_work;
spinlock_t read_race_lock;
+ /* FILESYSTEM */
+ atomic_long_t nr_inodes;
+
/* TIERING */
struct task_struct *tiering_read;
struct bch_pd_controller tiering_pd;
@@ -757,5 +773,7 @@ do { \
void bch_debug_exit(void);
int bch_debug_init(void);
+void bch_fs_exit(void);
+int bch_fs_init(void);
#endif /* _BCACHE_H */
diff --git a/drivers/md/bcache/bkey.h b/drivers/md/bcache/bkey.h
index 1a82e57ab420..5bb19a700788 100644
--- a/drivers/md/bcache/bkey.h
+++ b/drivers/md/bcache/bkey.h
@@ -493,6 +493,10 @@ BKEY_VAL_ACCESSORS(extent, BCH_EXTENT);
BKEY_VAL_ACCESSORS(inode, BCH_INODE_FS);
BKEY_VAL_ACCESSORS(inode_blockdev, BCH_INODE_BLOCKDEV);
+BKEY_VAL_ACCESSORS(dirent, BCH_DIRENT);
+
+BKEY_VAL_ACCESSORS(xattr, BCH_XATTR);
+
/* byte order helpers */
#if !defined(__LITTLE_ENDIAN) && !defined(__BIG_ENDIAN)
diff --git a/drivers/md/bcache/bkey_methods.c b/drivers/md/bcache/bkey_methods.c
index 90e1c9e7df38..03affccac1ce 100644
--- a/drivers/md/bcache/bkey_methods.c
+++ b/drivers/md/bcache/bkey_methods.c
@@ -2,12 +2,16 @@
#include "bcache.h"
#include "bkey_methods.h"
#include "btree.h"
+#include "dirent.h"
#include "extents.h"
#include "inode.h"
+#include "xattr.h"
static const struct bkey_ops *bch_bkey_ops[] = {
[BKEY_TYPE_EXTENTS] = &bch_bkey_extent_ops,
[BKEY_TYPE_INODES] = &bch_bkey_inode_ops,
+ [BKEY_TYPE_DIRENTS] = &bch_bkey_dirent_ops,
+ [BKEY_TYPE_XATTRS] = &bch_bkey_xattr_ops,
[BKEY_TYPE_BTREE] = &bch_bkey_btree_ops,
};
diff --git a/drivers/md/bcache/buckets.h b/drivers/md/bcache/buckets.h
index 3644e7e110ab..cd58d86af3bb 100644
--- a/drivers/md/bcache/buckets.h
+++ b/drivers/md/bcache/buckets.h
@@ -229,26 +229,27 @@ static inline size_t buckets_free_cache(struct cache *ca,
return __buckets_free_cache(ca, bch_bucket_stats_read(ca), reserve);
}
-static inline u64 cache_sectors_used(struct cache *ca)
-{
- struct bucket_stats stats = bch_bucket_stats_read(ca);
-
- return (stats.buckets_meta << ca->bucket_bits) +
- stats.sectors_dirty;
-}
-
-static inline bool cache_set_full(struct cache_set *c)
+static inline u64 cache_set_sectors_used(struct cache_set *c)
{
struct cache *ca;
unsigned i;
u64 used = 0;
rcu_read_lock();
- for_each_cache_rcu(ca, c, i)
- used += cache_sectors_used(ca);
+ for_each_cache_rcu(ca, c, i) {
+ struct bucket_stats stats = bch_bucket_stats_read(ca);
+
+ used += (stats.buckets_meta << ca->bucket_bits) +
+ stats.sectors_dirty;
+ }
rcu_read_unlock();
- return used >= c->capacity;
+ return min(c->capacity, used + atomic_long_read(&c->sectors_reserved));
+}
+
+static inline bool cache_set_full(struct cache_set *c)
+{
+ return cache_set_sectors_used(c) >= c->capacity;
}
static inline bool is_available_bucket(struct bucket_mark mark)
diff --git a/drivers/md/bcache/debug.c b/drivers/md/bcache/debug.c
index b0d22579ea0b..967420d7c078 100644
--- a/drivers/md/bcache/debug.c
+++ b/drivers/md/bcache/debug.c
@@ -10,6 +10,7 @@
#include "buckets.h"
#include "debug.h"
#include "extents.h"
+#include "inode.h"
#include "io.h"
#include "super.h"
@@ -182,6 +183,95 @@ out_put:
bio_put(check);
}
+void bch_verify_inode_refs(struct cache_set *c)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ struct bkey_i_inode inode;
+ u64 cur_inum = 0;
+ char buf[100];
+
+ for_each_btree_key(&iter, c, BTREE_ID_EXTENTS,
+ POS(BCACHE_ROOT_INO, 0), k) {
+ if (k.k->type == KEY_TYPE_DISCARD)
+ continue;
+
+ if (k.k->p.inode != cur_inum &&
+ bch_inode_find_by_inum(c, k.k->p.inode, &inode)) {
+ bch_bkey_val_to_text(c, iter.nodes[0], buf,
+ sizeof(buf), k);
+ bch_cache_set_error(c,
+ "extent for missing inode %llu\n%s",
+ k.k->p.inode, buf);
+ bch_btree_iter_unlock(&iter);
+ return;
+ }
+
+ cur_inum = k.k->p.inode;
+
+ if (!S_ISREG(inode.v.i_mode) &&
+ !S_ISLNK(inode.v.i_mode))
+ bch_cache_set_error(c,
+ "extent for non regular file, inode %llu mode %u",
+ k.k->p.inode, inode.v.i_mode);
+
+ BUG_ON(inode.v.i_flags & BCH_INODE_I_SIZE_DIRTY);
+
+ if (k.k->p.offset > round_up(inode.v.i_size, PAGE_SIZE) >> 9) {
+ bch_bkey_val_to_text(c, iter.nodes[0], buf,
+ sizeof(buf), k);
+ bch_cache_set_error(c,
+ "extent past end of inode %llu: i_size %llu extent\n%s",
+ k.k->p.inode, inode.v.i_size, buf);
+ }
+ }
+ bch_btree_iter_unlock(&iter);
+
+ for_each_btree_key(&iter, c, BTREE_ID_DIRENTS,
+ POS(BCACHE_ROOT_INO, 0), k) {
+ /* XXX: skipping whiteouts for now */
+ if (k.k->type != BCH_DIRENT)
+ continue;
+
+ if (k.k->p.inode != cur_inum &&
+ bch_inode_find_by_inum(c, k.k->p.inode, &inode)) {
+ bch_cache_set_error(c, "dirent for missing inode %llu",
+ k.k->p.inode);
+ bch_btree_iter_unlock(&iter);
+ return;
+ }
+
+ cur_inum = k.k->p.inode;
+
+ if (!S_ISDIR(inode.v.i_mode))
+ bch_cache_set_error(c,
+ "dirent for non directory, inode %llu mode %u",
+ k.k->p.inode, inode.v.i_mode);
+ }
+ bch_btree_iter_unlock(&iter);
+
+ for_each_btree_key(&iter, c, BTREE_ID_XATTRS,
+ POS(BCACHE_ROOT_INO, 0), k) {
+ if (k.k->p.inode != cur_inum &&
+ bch_inode_find_by_inum(c, k.k->p.inode, &inode)) {
+ bch_cache_set_error(c,
+ "xattr for missing inode %llu",
+ k.k->p.inode);
+ bch_btree_iter_unlock(&iter);
+ return;
+ }
+
+ cur_inum = k.k->p.inode;
+
+ if (!S_ISREG(inode.v.i_mode) &&
+ !S_ISDIR(inode.v.i_mode))
+ bch_cache_set_error(c,
+ "xattr for non file/directory, inode %llu mode %u",
+ k.k->p.inode, inode.v.i_mode);
+ }
+ bch_btree_iter_unlock(&iter);
+}
+
#endif
#ifdef CONFIG_DEBUG_FS
diff --git a/drivers/md/bcache/debug.h b/drivers/md/bcache/debug.h
index da35861aa3cb..b3cbb0bd9cd3 100644
--- a/drivers/md/bcache/debug.h
+++ b/drivers/md/bcache/debug.h
@@ -10,6 +10,7 @@ struct cache_set;
void bch_btree_verify(struct cache_set *, struct btree *);
void bch_data_verify(struct cached_dev *, struct bio *);
+void bch_verify_inode_refs(struct cache_set *);
#define expensive_debug_checks(c) ((c)->expensive_debug_checks)
#define key_merging_disabled(c) ((c)->key_merging_disabled)
@@ -19,6 +20,7 @@ void bch_data_verify(struct cached_dev *, struct bio *);
static inline void bch_btree_verify(struct cache_set *c, struct btree *b) {}
static inline void bch_data_verify(struct cached_dev *dc, struct bio *bio) {}
+static inline void bch_verify_inode_refs(struct cache_set *c) {}
#define expensive_debug_checks(c) 0
#define key_merging_disabled(c) 0
diff --git a/drivers/md/bcache/dirent.c b/drivers/md/bcache/dirent.c
new file mode 100644
index 000000000000..999538c71391
--- /dev/null
+++ b/drivers/md/bcache/dirent.c
@@ -0,0 +1,379 @@
+
+#include "bcache.h"
+#include "btree.h"
+#include "extents.h"
+#include "dirent.h"
+#include "keylist.h"
+#include "siphash.h"
+
+#include "linux/crc32c.h"
+#include "linux/cryptohash.h"
+
+#if 0
+static u64 bch_dirent_hash(const struct qstr *name)
+{
+ union {
+ u32 b[SHA_DIGEST_WORDS];
+ u64 ret;
+ } digest;
+
+ unsigned done = 0;
+
+ sha_init(digest.b);
+
+ while (done < name->len) {
+ u32 workspace[SHA_WORKSPACE_WORDS];
+ u8 message[SHA_MESSAGE_BYTES];
+ unsigned bytes = min_t(unsigned, name->len - done,
+ SHA_MESSAGE_BYTES);
+
+ memcpy(message, name->name + done, bytes);
+ memset(message + bytes, 0, SHA_MESSAGE_BYTES - bytes);
+ sha_transform(digest.b, message, workspace);
+ done += bytes;
+ }
+
+ /* [0,2) reserved for dots */
+
+ return (digest.ret >= 2 ? digest.ret : 2) & S64_MAX;
+}
+
+static const SIPHASH_KEY bch_siphash_key;
+
+static u64 bch_dirent_hash(const struct qstr *name)
+{
+ u64 hash = SipHash24(&bch_siphash_key,
+ name->name, name->len) >> 1;
+
+ /* [0,2) reserved for dots */
+
+ return (hash >= 2 ? hash : 2);
+}
+#endif
+
+static u64 bch_dirent_hash(const struct qstr *name)
+{
+ u64 hash = crc32c(0, name->name, name->len);
+
+ /* [0,2) reserved for dots */
+
+ return (hash >= 2 ? hash : 2);
+}
+
+static unsigned dirent_name_bytes(struct bkey_s_c_dirent d)
+{
+ unsigned len = bkey_val_bytes(d.k) - sizeof(struct bch_dirent);
+
+ while (len && !d.v->d_name[len - 1])
+ --len;
+
+ return len;
+}
+
+static int dirent_cmp(struct bkey_s_c_dirent d,
+ const struct qstr *q)
+{
+ int len = dirent_name_bytes(d);
+
+ return len - q->len ?: memcmp(d.v->d_name, q->name, len);
+}
+
+static bool bch_dirent_invalid(const struct cache_set *c, struct bkey_s_c k)
+{
+ switch (k.k->type) {
+ case BCH_DIRENT:
+ if (bkey_val_bytes(k.k) < sizeof(struct bch_dirent))
+ return true;
+
+ return false;
+ case BCH_DIRENT_WHITEOUT:
+ if (bkey_val_bytes(k.k))
+ return true;
+
+ return false;
+ default:
+ return true;
+ }
+}
+
+static void bch_dirent_to_text(struct cache_set *c, char *buf,
+ size_t size, struct bkey_s_c k)
+{
+ struct bkey_s_c_dirent d;
+
+ switch (k.k->type) {
+ case BCH_DIRENT:
+ d = bkey_s_c_to_dirent(k);
+
+ if (size) {
+ unsigned n = min_t(unsigned, size,
+ dirent_name_bytes(d));
+ memcpy(buf, d.v->d_name, n);
+ buf[size - 1] = '\0';
+ buf += n;
+ size -= n;
+ }
+
+ scnprintf(buf, size, " -> %llu", d.v->d_inum);
+ break;
+ case BCH_DIRENT_WHITEOUT:
+ scnprintf(buf, size, "whiteout");
+ break;
+ }
+}
+
+const struct btree_keys_ops bch_dirent_ops = {
+};
+
+const struct bkey_ops bch_bkey_dirent_ops = {
+ .key_invalid = bch_dirent_invalid,
+ .val_to_text = bch_dirent_to_text,
+};
+
+static int __bch_dirent_create(struct cache_set *c, u64 dir_inum,
+ u8 type, const struct qstr *name,
+ u64 dst_inum, bool update,
+ u64 *journal_seq)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ struct keylist keys;
+ struct bkey_i_dirent *dirent;
+ unsigned u64s = BKEY_U64s +
+ DIV_ROUND_UP(sizeof(struct bch_dirent) + name->len,
+ sizeof(u64));
+ int ret = -ENOENT;
+
+ bch_keylist_init(&keys);
+
+ if (bch_keylist_realloc(&keys, u64s))
+ return -ENOMEM;
+
+ dirent = bkey_dirent_init(keys.top);
+ dirent->k.u64s = u64s;
+ dirent->v.d_inum = dst_inum;
+ dirent->v.d_type = type;
+
+ memcpy(dirent->v.d_name, name->name, name->len);
+ memset(dirent->v.d_name + name->len, 0,
+ bkey_val_bytes(&dirent->k) -
+ (sizeof(struct bch_dirent) + name->len));
+
+ BUG_ON(dirent_name_bytes(dirent_i_to_s_c(dirent)) != name->len);
+ BUG_ON(dirent_cmp(dirent_i_to_s_c(dirent), name));
+
+ bch_keylist_enqueue(&keys);
+
+ bch_btree_iter_init_intent(&iter, c, BTREE_ID_DIRENTS,
+ POS(dir_inum, bch_dirent_hash(name)));
+
+ while ((k = bch_btree_iter_peek_with_holes(&iter)).k) {
+ /* hole? */
+ if (k.k->type != BCH_DIRENT) {
+ if (!update)
+ goto insert;
+ break;
+ }
+
+ if (!dirent_cmp(bkey_s_c_to_dirent(k), name)) {
+ /* found: */
+ if (!update) {
+ ret = -EEXIST;
+ break;
+ }
+insert:
+ dirent->k.p = k.k->p;
+
+ ret = bch_btree_insert_at(&iter, &keys, NULL,
+ journal_seq,
+ BTREE_INSERT_ATOMIC);
+ if (ret != -EINTR && ret != -EAGAIN)
+ break;
+ } else {
+ /* collision */
+ bch_btree_iter_advance_pos(&iter);
+ }
+ }
+ bch_btree_iter_unlock(&iter);
+ bch_keylist_free(&keys);
+
+ return ret;
+}
+
+int bch_dirent_create(struct cache_set *c, u64 dir_inum, u8 type,
+ const struct qstr *name, u64 dst_inum,
+ u64 *journal_seq)
+{
+ return __bch_dirent_create(c, dir_inum, type,
+ name, dst_inum, false,
+ journal_seq);
+}
+
+int bch_dirent_update(struct cache_set *c, u64 dir_inum,
+ const struct qstr *name, u64 dst_inum,
+ u64 *journal_seq)
+{
+ return __bch_dirent_create(c, dir_inum, DT_UNKNOWN,
+ name, dst_inum, true,
+ journal_seq);
+}
+
+int bch_dirent_delete(struct cache_set *c, u64 dir_inum,
+ const struct qstr *name)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ u64 hash = bch_dirent_hash(name);
+ int ret = -ENOENT;
+
+ pr_debug("deleting %llu:%llu (%s)",
+ dir_inum, hash, name->name);
+
+ bch_btree_iter_init_intent(&iter, c, BTREE_ID_DIRENTS,
+ POS(dir_inum, bch_dirent_hash(name)));
+
+ while ((k = bch_btree_iter_peek_with_holes(&iter)).k) {
+ switch (k.k->type) {
+ case BCH_DIRENT:
+ if (!dirent_cmp(bkey_s_c_to_dirent(k), name)) {
+ struct bkey_i delete;
+
+ bkey_init(&delete.k);
+ delete.k.p = k.k->p;
+ delete.k.type = BCH_DIRENT_WHITEOUT;
+
+ ret = bch_btree_insert_at(&iter,
+ &keylist_single(&delete),
+ NULL, NULL,
+ BTREE_INSERT_NOFAIL|
+ BTREE_INSERT_ATOMIC);
+ if (ret == -EINTR || ret == -EAGAIN)
+ continue;
+ }
+ break;
+ case BCH_DIRENT_WHITEOUT:
+ break;
+ default:
+ /* hole, not found */
+ goto out;
+ }
+
+ bch_btree_iter_advance_pos(&iter);
+ }
+out:
+ bch_btree_iter_unlock(&iter);
+
+ return ret;
+}
+
+u64 bch_dirent_lookup(struct cache_set *c, u64 dir_inum,
+ const struct qstr *name)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ struct bkey_s_c_dirent dirent;
+ u64 hash = bch_dirent_hash(name);
+
+ pr_debug("searching for %llu:%llu (%s)",
+ dir_inum, hash, name->name);
+
+ for_each_btree_key_with_holes(&iter, c, BTREE_ID_DIRENTS,
+ POS(dir_inum, bch_dirent_hash(name)), k) {
+ switch (k.k->type) {
+ case BCH_DIRENT:
+ dirent = bkey_s_c_to_dirent(k);
+
+ /* collision? */
+ if (!dirent_cmp(dirent, name)) {
+ u64 inum = dirent.v->d_inum;
+
+ bch_btree_iter_unlock(&iter);
+ pr_debug("found %s: %llu", name->name, inum);
+ return inum;
+ }
+ break;
+ case BCH_DIRENT_WHITEOUT:
+ break;
+ default:
+ /* hole, not found */
+ goto out;
+ }
+ }
+out:
+ bch_btree_iter_unlock(&iter);
+
+ pr_debug("%s not found", name->name);
+ return 0;
+}
+
+int bch_empty_dir(struct cache_set *c, u64 dir_inum)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ int ret = 0;
+
+ for_each_btree_key(&iter, c, BTREE_ID_DIRENTS, POS(dir_inum, 0), k) {
+ if (k.k->p.inode > dir_inum)
+ break;
+
+ if (k.k->type == BCH_DIRENT) {
+ ret = -ENOTEMPTY;
+ break;
+ }
+
+ }
+ bch_btree_iter_unlock(&iter);
+
+ return ret;
+}
+
+int bch_readdir(struct file *file, struct dir_context *ctx)
+{
+ struct inode *inode = file_inode(file);
+ struct super_block *sb = inode->i_sb;
+ struct cache_set *c = sb->s_fs_info;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ struct bkey_s_c_dirent dirent;
+ unsigned len;
+
+ if (!dir_emit_dots(file, ctx))
+ return 0;
+
+ pr_debug("listing for %lu from %llu", inode->i_ino, ctx->pos);
+
+ for_each_btree_key(&iter, c, BTREE_ID_DIRENTS,
+ POS(inode->i_ino, ctx->pos), k) {
+ if (k.k->type != BCH_DIRENT)
+ continue;
+
+ dirent = bkey_s_c_to_dirent(k);
+
+ pr_debug("saw %llu:%llu (%s) -> %llu",
+ k.k->p.inode, k.k->p.offset,
+ dirent.v->d_name, dirent.v->d_inum);
+
+ if (bkey_cmp(k.k->p, POS(inode->i_ino, ctx->pos)) < 0)
+ continue;
+
+ if (k.k->p.inode > inode->i_ino)
+ break;
+
+ len = dirent_name_bytes(dirent);
+
+ pr_debug("emitting %s", dirent.v->d_name);
+
+ /*
+ * XXX: dir_emit() can fault and block, while we're holding
+ * locks
+ */
+ if (!dir_emit(ctx, dirent.v->d_name, len,
+ dirent.v->d_inum, dirent.v->d_type))
+ break;
+
+ ctx->pos = k.k->p.offset + 1;
+ }
+ bch_btree_iter_unlock(&iter);
+
+ return 0;
+}
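[Note: taken together, the functions above treat the dirents btree as an open-addressed hash table keyed by (dir_inum, crc32c(name)): inserts probe forward past colliding entries until they hit a hole or a matching name, deletes leave a BCH_DIRENT_WHITEOUT rather than a hole so later entries in the same probe chain stay reachable, and lookups stop only at a genuine hole. A minimal userspace model of that probe discipline (illustrative types, not the kernel API):

#include <stdint.h>
#include <string.h>

enum { HOLE, DIRENT, WHITEOUT };

struct slot {
	int type;
	const char *name;
	uint64_t inum;
};

/* Same stop conditions as bch_dirent_lookup(): a whiteout means "a
 * deleted entry that may have displaced later collisions, keep
 * probing"; only a genuine hole ends the chain. */
static uint64_t lookup(const struct slot *tbl, size_t n,
		       uint64_t hash, const char *name)
{
	for (size_t step = 0, i = hash % n; step < n;
	     step++, i = (i + 1) % n) {
		if (tbl[i].type == HOLE)
			return 0;               /* not found */
		if (tbl[i].type == DIRENT &&
		    !strcmp(tbl[i].name, name))
			return tbl[i].inum;     /* found */
		/* collision or whiteout: keep probing */
	}
	return 0;
}

Leaving whiteouts instead of holes is what keeps bch_dirent_delete() from breaking lookups of names that hashed to the same position but were inserted later.]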
diff --git a/drivers/md/bcache/dirent.h b/drivers/md/bcache/dirent.h
new file mode 100644
index 000000000000..4de22a53c875
--- /dev/null
+++ b/drivers/md/bcache/dirent.h
@@ -0,0 +1,21 @@
+#ifndef _BCACHE_DIRENT_H
+#define _BCACHE_DIRENT_H
+
+extern const struct btree_keys_ops bch_dirent_ops;
+extern const struct bkey_ops bch_bkey_dirent_ops;
+
+struct qstr;
+struct file;
+struct dir_context;
+struct cache_set;
+
+int bch_dirent_create(struct cache_set *, u64, u8, const struct qstr *,
+ u64, u64 *);
+int bch_dirent_update(struct cache_set *, u64, const struct qstr *, u64, u64 *);
+int bch_dirent_delete(struct cache_set *, u64, const struct qstr *);
+u64 bch_dirent_lookup(struct cache_set *, u64, const struct qstr *);
+int bch_empty_dir(struct cache_set *, u64);
+int bch_readdir(struct file *, struct dir_context *);
+
+#endif /* _BCACHE_DIRENT_H */
+
diff --git a/drivers/md/bcache/extents.c b/drivers/md/bcache/extents.c
index 077164e271b5..55ee8043b9b8 100644
--- a/drivers/md/bcache/extents.c
+++ b/drivers/md/bcache/extents.c
@@ -8,12 +8,14 @@
#include "bcache.h"
#include "btree.h"
#include "debug.h"
+#include "dirent.h"
#include "extents.h"
#include "gc.h"
#include "inode.h"
#include "journal.h"
#include "super.h"
#include "writeback.h"
+#include "xattr.h"
#include <trace/events/bcache.h>
@@ -1822,4 +1824,6 @@ const struct bkey_ops bch_bkey_extent_ops = {
const struct btree_keys_ops *bch_btree_ops[] = {
[BTREE_ID_EXTENTS] = &bch_extent_ops,
[BTREE_ID_INODES] = &bch_inode_ops,
+ [BTREE_ID_DIRENTS] = &bch_dirent_ops,
+ [BTREE_ID_XATTRS] = &bch_xattr_ops,
};
diff --git a/drivers/md/bcache/fs-gc.c b/drivers/md/bcache/fs-gc.c
new file mode 100644
index 000000000000..47e7a7f093e0
--- /dev/null
+++ b/drivers/md/bcache/fs-gc.c
@@ -0,0 +1,202 @@
+
+#include "bcache.h"
+#include "btree.h"
+#include "dirent.h"
+#include "fs.h"
+#include "inode.h"
+#include "keylist.h"
+#include "super.h"
+
+#define INODES_PER_ITER (1 << 24)
+
+struct nlink {
+ u32 count;
+ u32 dir_count;
+};
+
+static void inc_link(u64 pos, struct nlink *links, bool *need_loop,
+ u64 inum, unsigned count, bool dir)
+{
+ if (inum >= pos + INODES_PER_ITER) {
+ *need_loop = true;
+ } else if (inum >= pos) {
+ if (dir)
+ links[inum - pos].dir_count += count;
+ else
+ links[inum - pos].count += count;
+ }
+}
+
+/*
+ * XXX: should do a DFS (via filesystem hierarchy), and make sure all dirents
+ * are reachable
+ */
+
+noinline_for_stack
+static int bch_gc_walk_dirents(struct cache_set *c, u64 pos,
+ struct nlink *links, bool *need_loop)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ struct bkey_s_c_dirent d;
+
+ *need_loop = false;
+ memset(links, 0, INODES_PER_ITER * sizeof(*links));
+
+ inc_link(pos, links, need_loop, BCACHE_ROOT_INO, 2, false);
+
+ for_each_btree_key(&iter, c, BTREE_ID_DIRENTS, POS_MIN, k) {
+ switch (k.k->type) {
+ case BCH_DIRENT:
+ d = bkey_s_c_to_dirent(k);
+
+ if (d.v->d_type == DT_DIR) {
+ inc_link(pos, links, need_loop,
+ d.v->d_inum, 2, false);
+ inc_link(pos, links, need_loop,
+ d.k->p.inode, 1, true);
+ } else {
+ inc_link(pos, links, need_loop,
+ d.v->d_inum, 1, false);
+ }
+
+ break;
+ }
+
+ bch_btree_iter_cond_resched(&iter);
+ }
+ return bch_btree_iter_unlock(&iter);
+}
+
+static int bch_gc_do_inode(struct cache_set *c, struct btree_iter *iter,
+ struct bkey_s_c_inode inode, struct nlink link)
+{
+ struct bkey_i_inode update;
+ int ret;
+
+ cache_set_err_on(inode.v->i_nlink < link.count, c,
+ "i_link too small (%u < %u, type %i)",
+ inode.v->i_nlink, link.count + link.dir_count,
+ mode_to_type(inode.v->i_mode));
+
+ if (!link.count) {
+ cache_set_err_on(S_ISDIR(inode.v->i_mode) &&
+ bch_empty_dir(c, inode.k->p.inode), c,
+ "non empty directory with link count 0,inode nlink %u, dir links found %u",
+ inode.v->i_nlink, link.dir_count);
+ pr_info("deleting inum %llu", inode.k->p.inode);
+
+ bch_btree_iter_unlock(iter);
+ return bch_inode_rm(c, inode.k->p.inode);
+ }
+
+ if (inode.v->i_flags & BCH_INODE_I_SIZE_DIRTY) {
+ pr_info("truncating inode %llu", inode.k->p.inode);
+
+ /*
+ * XXX: need to truncate partial blocks too here - or ideally
+ * just switch units to bytes and that issue goes away
+ */
+
+ ret = bch_inode_truncate(c, inode.k->p.inode,
+ round_up(inode.v->i_size, PAGE_SIZE) >> 9);
+ if (ret)
+ return ret;
+ }
+
+ if (inode.v->i_nlink != link.count + link.dir_count ||
+ inode.v->i_flags & BCH_INODE_I_SIZE_DIRTY) {
+ if (inode.v->i_nlink != link.count + link.dir_count)
+ pr_info("setting inum %llu nlinks from %u to %u",
+ inode.k->p.inode, inode.v->i_nlink,
+ link.count + link.dir_count);
+
+ bkey_reassemble(&update.k_i, inode.s_c);
+ update.v.i_nlink = link.count + link.dir_count;
+ update.v.i_flags &= ~BCH_INODE_I_SIZE_DIRTY;
+
+ return bch_btree_insert_at(iter,
+ &keylist_single(&update.k_i),
+ NULL, NULL,
+ BTREE_INSERT_ATOMIC);
+ }
+
+ return 0;
+}
+
+noinline_for_stack
+static int bch_gc_walk_inodes(struct cache_set *c, u64 pos, struct nlink *links)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ int ret = 0;
+ u64 i = 0;
+
+ bch_btree_iter_init(&iter, c, BTREE_ID_INODES, POS(pos, 0));
+
+ while ((k = bch_btree_iter_peek(&iter)).k) {
+ if (k.k->p.inode - pos >= INODES_PER_ITER)
+ break;
+
+ while (i < k.k->p.inode - pos) {
+ cache_set_err_on(links[i].count, c,
+ "missing inode %llu",
+ pos + i);
+ i++;
+ }
+
+ switch (k.k->type) {
+ case BCH_INODE_FS:
+ ret = bch_gc_do_inode(c, &iter,
+ bkey_s_c_to_inode(k),
+ links[i]);
+ if (ret == -EAGAIN || ret == -EINTR)
+ continue;
+ if (ret)
+ goto out;
+
+ break;
+ default:
+ cache_set_err_on(links[i].count, c,
+ "missing inode %llu",
+ pos + i);
+ break;
+ }
+
+ if (links[i].count)
+ atomic_long_inc(&c->nr_inodes);
+
+ bch_btree_iter_advance_pos(&iter);
+ i++;
+ bch_btree_iter_cond_resched(&iter);
+ }
+out:
+ return bch_btree_iter_unlock(&iter) ?: ret;
+}
+
+int bch_gc_inode_nlinks(struct cache_set *c)
+{
+ bool need_loop = false;
+ u64 pos = 0;
+ struct nlink *links = vmalloc(INODES_PER_ITER * sizeof(*links));
+ int ret = 0;
+
+ if (!links)
+ return -ENOMEM;
+
+ do {
+ ret = bch_gc_walk_dirents(c, pos, links, &need_loop);
+ if (ret)
+ break;
+
+ ret = bch_gc_walk_inodes(c, pos, links);
+ if (ret)
+ break;
+
+ pos += INODES_PER_ITER;
+ } while (need_loop);
+
+ vfree(links);
+
+ return ret;
+}
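[Note: since there can be far more inodes than fit in memory, bch_gc_inode_nlinks() counts links one window of INODES_PER_ITER inodes at a time: the dirent walk only tallies targets inside [pos, pos + INODES_PER_ITER), flags need_loop when it sees a target beyond the window, and the driver loop re-walks with the window advanced until nothing is left over. A condensed model of the windowing, with a toy array of dirent targets standing in for the btree walk:

#include <stdbool.h>
#include <stdint.h>
#include <string.h>

#define WINDOW 4 /* stands in for INODES_PER_ITER */

/* One pass of bch_gc_walk_dirents(): tally link counts for inodes
 * inside the current window; note whether another window is needed. */
static void count_window(const uint64_t *dirent_dst, size_t nr,
			 uint64_t pos, uint32_t *links, bool *need_loop)
{
	memset(links, 0, WINDOW * sizeof(*links));

	for (size_t i = 0; i < nr; i++) {
		if (dirent_dst[i] >= pos + WINDOW)
			*need_loop = true;      /* revisit later window */
		else if (dirent_dst[i] >= pos)
			links[dirent_dst[i] - pos]++;
	}
}

/* The driver loop from bch_gc_inode_nlinks(): */
static void walk_all(const uint64_t *dirent_dst, size_t nr)
{
	uint32_t links[WINDOW];
	uint64_t pos = 0;
	bool need_loop;

	do {
		need_loop = false;
		count_window(dirent_dst, nr, pos, links, &need_loop);
		/* ...compare links[] against i_nlink for each inode in
		 * the window, as bch_gc_walk_inodes() does... */
		pos += WINDOW;
	} while (need_loop);
}
]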
diff --git a/drivers/md/bcache/fs-gc.h b/drivers/md/bcache/fs-gc.h
new file mode 100644
index 000000000000..4fb5728820ea
--- /dev/null
+++ b/drivers/md/bcache/fs-gc.h
@@ -0,0 +1,6 @@
+#ifndef _BCACHE_FS_GC_H
+#define _BCACHE_FS_GC_H
+
+int bch_gc_inode_nlinks(struct cache_set *);
+
+#endif /* _BCACHE_FS_GC_H */
diff --git a/drivers/md/bcache/fs.c b/drivers/md/bcache/fs.c
new file mode 100644
index 000000000000..0d04efe9c40c
--- /dev/null
+++ b/drivers/md/bcache/fs.c
@@ -0,0 +1,2087 @@
+
+#include "bcache.h"
+#include "acl.h"
+#include "btree.h"
+#include "buckets.h"
+#include "dirent.h"
+#include "extents.h"
+#include "fs.h"
+#include "inode.h"
+#include "io.h"
+#include "journal.h"
+#include "super.h"
+#include "xattr.h"
+
+#include <linux/aio.h>
+#include <linux/compat.h>
+#include <linux/migrate.h>
+#include <linux/module.h>
+#include <linux/mount.h>
+#include <linux/parser.h>
+#include <linux/statfs.h>
+#include <linux/task_io_accounting_ops.h>
+#include <linux/uio.h>
+#include <linux/writeback.h>
+#include <linux/xattr.h>
+
+/*
+ * our page flags:
+ *
+ * allocated - page has space on disk reserved for it (-ENOSPC was checked then,
+ * shouldn't be checked later)
+ *
+ * corresponds to c->sectors_reserved
+ *
+ * append - page is dirty from an append write, new i_size can't be written
+ * until after page is written
+ *
+ * corresponds to ei->append_count
+ */
+
+#define PF_ANY(page, enforce) page
+PAGEFLAG(Allocated, private, PF_ANY)
+TESTSCFLAG(Allocated, private, PF_ANY)
+
+PAGEFLAG(Append, private_2, PF_ANY)
+TESTSCFLAG(Append, private_2, PF_ANY)
+#undef PF_ANY
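[Note: for readers who don't have linux/page-flags.h paged in, the two declarations above generate the accessors this file uses later (PageAllocated(), SetPageAllocated(), TestClearPageAllocated(), and the same family for Append), backed by the page's PG_private and PG_private_2 bits. Roughly, the expansion looks like this (simplified sketch; the real macros add policy checks via the PF_ANY wrapper defined above):

/* PAGEFLAG(Allocated, private, PF_ANY), simplified: */
static inline int PageAllocated(struct page *page)
{
	return test_bit(PG_private, &page->flags);
}

static inline void SetPageAllocated(struct page *page)
{
	set_bit(PG_private, &page->flags);
}

static inline void ClearPageAllocated(struct page *page)
{
	clear_bit(PG_private, &page->flags);
}

/* TESTSCFLAG adds the atomic test-and-set/test-and-clear pair, which
 * bch_clear_page_bits() relies on so each page's reservation is
 * released exactly once: */
static inline int TestSetPageAllocated(struct page *page)
{
	return test_and_set_bit(PG_private, &page->flags);
}

static inline int TestClearPageAllocated(struct page *page)
{
	return test_and_clear_bit(PG_private, &page->flags);
}
]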
+
+static struct bio_set *bch_fs_bioset;
+static struct kmem_cache *bch_inode_cache;
+static DECLARE_WAIT_QUEUE_HEAD(bch_append_wait);
+
+static void bch_inode_init(struct bch_inode_info *);
+static int bch_read_single_page(struct page *, struct address_space *);
+
+#define SECTORS_CACHE 1024
+
+static int reserve_sectors(struct cache_set *c, unsigned sectors)
+{
+ if (likely(atomic_long_sub_return(sectors,
+ &c->sectors_reserved_cache) >= 0))
+ return 0;
+
+ atomic_long_add(SECTORS_CACHE, &c->sectors_reserved);
+
+ if (likely(!cache_set_full(c))) {
+ atomic_long_add(SECTORS_CACHE, &c->sectors_reserved_cache);
+ return 0;
+ }
+
+ atomic_long_sub_bug(SECTORS_CACHE, &c->sectors_reserved);
+ atomic_long_add(sectors, &c->sectors_reserved_cache);
+ return -ENOSPC;
+}
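[Note: the point of the two counters above: cache_set_full() is comparatively expensive (it walks every cache device under RCU), so reserve_sectors() keeps a pre-reserved pool in sectors_reserved_cache and the common case is a single atomic subtract; only when the pool underflows does the slow path reserve another SECTORS_CACHE batch against the global sectors_reserved and redo the full check. A userspace miniature of the same batching pattern, assuming a plain capacity limit in place of cache_set_full():

#include <stdatomic.h>

#define BATCH 1024L

static atomic_long reserved;       /* global, bounded by capacity */
static atomic_long reserved_cache; /* pre-reserved pool, fast path */
static const long capacity = 1L << 20;

static int reserve(long sectors)
{
	/* Fast path: carve out of the pre-reserved pool. */
	if (atomic_fetch_sub(&reserved_cache, sectors) - sectors >= 0)
		return 0;

	/* Slow path: reserve a whole batch globally, then re-check. */
	if (atomic_fetch_add(&reserved, BATCH) + BATCH <= capacity) {
		atomic_fetch_add(&reserved_cache, BATCH);
		return 0;
	}

	/* Over capacity: undo both counters and fail (-ENOSPC in the
	 * kernel version). Note the fast-path subtraction has to be
	 * given back to the pool, just as reserve_sectors() re-adds
	 * sectors to sectors_reserved_cache. */
	atomic_fetch_sub(&reserved, BATCH);
	atomic_fetch_add(&reserved_cache, sectors);
	return -1;
}
]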
+
+static void bch_append_put(struct bch_inode_info *ei)
+{
+ if (atomic_long_dec_and_test(&ei->append_count))
+ wake_up(&bch_append_wait);
+}
+
+static void bch_clear_page_bits(struct cache_set *c, struct bch_inode_info *ei,
+ struct page *page)
+{
+ if (TestClearPageAllocated(page))
+ atomic_long_sub_bug(PAGE_SECTORS, &c->sectors_reserved);
+
+ if (TestClearPageAppend(page))
+ bch_append_put(ei);
+}
+
+static int __bch_write_inode(struct inode *inode)
+{
+ struct cache_set *c = inode->i_sb->s_fs_info;
+ struct bch_inode_info *ei = to_bch_ei(inode);
+ struct bch_inode *bi = &ei->inode.v;
+
+ lockdep_assert_held(&ei->update_lock);
+ BUG_ON(ei->inode.k.p.inode != inode->i_ino);
+ BUG_ON(ei->inode.k.type != BCH_INODE_FS);
+
+ if (!atomic_long_read(&ei->append_count)) {
+ bi->i_flags &= ~BCH_INODE_I_SIZE_DIRTY;
+ bi->i_size = inode->i_size;
+ }
+
+ bi->i_mode = inode->i_mode;
+ bi->i_uid = i_uid_read(inode);
+ bi->i_gid = i_gid_read(inode);
+ bi->i_nlink = inode->i_nlink;
+ bi->i_dev = inode->i_rdev;
+ bi->i_atime = timespec_to_ns(&inode->i_atime);
+ bi->i_mtime = timespec_to_ns(&inode->i_mtime);
+ bi->i_ctime = timespec_to_ns(&inode->i_ctime);
+
+ return bch_inode_update(c, &ei->inode.k_i, NULL, &ei->journal_seq);
+}
+
+static struct inode *bch_vfs_inode_get(struct super_block *sb, u64 inum)
+{
+ struct cache_set *c = sb->s_fs_info;
+ struct bch_inode_info *ei;
+ struct inode *inode;
+ int ret;
+
+ pr_debug("inum %llu", inum);
+
+ inode = iget_locked(sb, inum);
+ if (unlikely(!inode))
+ return ERR_PTR(-ENOMEM);
+ if (!(inode->i_state & I_NEW))
+ return inode;
+
+ ei = to_bch_ei(inode);
+
+ ret = bch_inode_find_by_inum(c, inum, &ei->inode);
+ if (unlikely(ret)) {
+ iget_failed(inode);
+ return ERR_PTR(ret);
+ }
+
+ bch_inode_init(ei);
+ unlock_new_inode(inode);
+
+ return inode;
+}
+
+static void bch_set_inode_flags(struct inode *inode)
+{
+ unsigned flags = to_bch_ei(inode)->inode.v.i_flags;
+
+ inode->i_flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME);
+ if (flags & FS_SYNC_FL)
+ inode->i_flags |= S_SYNC;
+ if (flags & FS_APPEND_FL)
+ inode->i_flags |= S_APPEND;
+ if (flags & FS_IMMUTABLE_FL)
+ inode->i_flags |= S_IMMUTABLE;
+ if (flags & FS_NOATIME_FL)
+ inode->i_flags |= S_NOATIME;
+}
+
+static struct inode *bch_vfs_inode_create(struct cache_set *c,
+ struct inode *parent,
+ umode_t mode, dev_t rdev)
+{
+ struct inode *inode;
+ struct bch_inode_info *ei;
+ struct bch_inode *bi;
+ struct timespec ts = CURRENT_TIME;
+ s64 now = timespec_to_ns(&ts);
+ int ret;
+
+ inode = new_inode(parent->i_sb);
+ if (unlikely(!inode))
+ return ERR_PTR(-ENOMEM);
+
+ inode_init_owner(inode, parent, mode);
+
+ ei = to_bch_ei(inode);
+
+ bi = &bkey_inode_init(&ei->inode.k_i)->v;
+ bi->i_uid = i_uid_read(inode);
+ bi->i_gid = i_gid_read(inode);
+
+ bi->i_mode = inode->i_mode;
+ bi->i_dev = rdev;
+ bi->i_atime = now;
+ bi->i_mtime = now;
+ bi->i_ctime = now;
+ bi->i_nlink = S_ISDIR(mode) ? 2 : 1;
+
+ ret = bch_inode_create(c, &ei->inode.k_i,
+ BLOCKDEV_INODE_MAX, 0,
+ &c->unused_inode_hint);
+ if (unlikely(ret)) {
+ /*
+ * indicate to bch_evict_inode that the inode was never actually
+ * created:
+ */
+ bkey_init(&ei->inode.k);
+ goto err;
+ }
+
+ bch_inode_init(ei);
+
+ ret = bch_init_acl(inode, parent);
+ if (unlikely(ret))
+ goto err;
+
+ insert_inode_hash(inode);
+ atomic_long_inc(&c->nr_inodes);
+
+ return inode;
+err:
+ clear_nlink(inode);
+ iput(inode);
+ return ERR_PTR(ret);
+}
+
+static int bch_vfs_dirent_create(struct cache_set *c, struct inode *dir,
+ u8 type, const struct qstr *name,
+ struct inode *dst)
+{
+ struct bch_inode_info *ei = to_bch_ei(dst);
+ int ret;
+
+ ret = bch_dirent_create(c, dir->i_ino, type, name,
+ dst->i_ino, &ei->journal_seq);
+ if (unlikely(ret))
+ return ret;
+
+ dir->i_mtime = dir->i_ctime = CURRENT_TIME;
+ mark_inode_dirty_sync(dir);
+ return 0;
+}
+
+static int __bch_create(struct inode *dir, struct dentry *dentry,
+ umode_t mode, dev_t rdev)
+{
+ struct cache_set *c = dir->i_sb->s_fs_info;
+ struct inode *inode;
+ int ret;
+
+ inode = bch_vfs_inode_create(c, dir, mode, rdev);
+ if (unlikely(IS_ERR(inode)))
+ return PTR_ERR(inode);
+
+ ret = bch_vfs_dirent_create(c, dir, mode_to_type(mode),
+ &dentry->d_name, inode);
+ if (unlikely(ret)) {
+ clear_nlink(inode);
+ iput(inode);
+ return ret;
+ }
+
+ d_instantiate(dentry, inode);
+ return 0;
+}
+
+/* methods */
+
+static struct dentry *bch_lookup(struct inode *dir, struct dentry *dentry,
+ unsigned int flags)
+{
+ struct cache_set *c = dir->i_sb->s_fs_info;
+ struct inode *inode = NULL;
+ u64 inum;
+
+ inum = bch_dirent_lookup(c, dir->i_ino, &dentry->d_name);
+
+ if (inum)
+ inode = bch_vfs_inode_get(dir->i_sb, inum);
+
+ return d_splice_alias(inode, dentry);
+}
+
+static int bch_create(struct inode *dir, struct dentry *dentry,
+ umode_t mode, bool excl)
+{
+ return __bch_create(dir, dentry, mode|S_IFREG, 0);
+}
+
+static int bch_link(struct dentry *old_dentry, struct inode *dir,
+ struct dentry *dentry)
+{
+ struct cache_set *c = dir->i_sb->s_fs_info;
+ struct inode *inode = old_dentry->d_inode;
+ struct bch_inode_info *ei = to_bch_ei(inode);
+ int ret;
+
+ lockdep_assert_held(&inode->i_rwsem);
+
+ mutex_lock(&ei->update_lock);
+ inode->i_ctime = CURRENT_TIME;
+ inc_nlink(inode);
+ __bch_write_inode(inode);
+ mutex_unlock(&ei->update_lock);
+
+ ihold(inode);
+
+ ret = bch_vfs_dirent_create(c, dir, mode_to_type(inode->i_mode),
+ &dentry->d_name, inode);
+ if (unlikely(ret)) {
+ inode_dec_link_count(inode);
+ iput(inode);
+ return ret;
+ }
+
+ d_instantiate(dentry, inode);
+ return 0;
+}
+
+static int bch_unlink(struct inode *dir, struct dentry *dentry)
+{
+ struct cache_set *c = dir->i_sb->s_fs_info;
+ struct inode *inode = dentry->d_inode;
+ int ret;
+
+ lockdep_assert_held(&inode->i_rwsem);
+
+ ret = bch_dirent_delete(c, dir->i_ino, &dentry->d_name);
+ if (ret)
+ return ret;
+
+ inode->i_ctime = dir->i_ctime;
+ inode_dec_link_count(inode);
+
+ return 0;
+}
+
+static int bch_symlink(struct inode *dir, struct dentry *dentry,
+ const char *symname)
+{
+ struct cache_set *c = dir->i_sb->s_fs_info;
+ struct inode *inode;
+ int ret;
+
+ inode = bch_vfs_inode_create(c, dir, S_IFLNK|S_IRWXUGO, 0);
+ if (unlikely(IS_ERR(inode)))
+ return PTR_ERR(inode);
+
+ inode_lock(inode);
+ ret = page_symlink(inode, symname, strlen(symname) + 1);
+ inode_unlock(inode);
+
+ if (unlikely(ret))
+ goto err;
+
+ ret = bch_vfs_dirent_create(c, dir, DT_LNK, &dentry->d_name, inode);
+ if (unlikely(ret))
+ goto err;
+
+ d_instantiate(dentry, inode);
+ return 0;
+err:
+ clear_nlink(inode);
+ iput(inode);
+ return ret;
+}
+
+static int bch_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
+{
+ int ret;
+
+ lockdep_assert_held(&dir->i_rwsem);
+
+ inode_inc_link_count(dir);
+ mark_inode_dirty_sync(dir);
+
+ ret = __bch_create(dir, dentry, mode|S_IFDIR, 0);
+ if (unlikely(ret)) {
+ inode_dec_link_count(dir);
+ return ret;
+ }
+
+ return 0;
+}
+
+static int bch_rmdir(struct inode *dir, struct dentry *dentry)
+{
+ struct cache_set *c = dir->i_sb->s_fs_info;
+ struct inode *inode = dentry->d_inode;
+ int ret;
+
+ lockdep_assert_held(&inode->i_rwsem);
+ lockdep_assert_held(&dir->i_rwsem);
+
+ if (bch_empty_dir(c, inode->i_ino))
+ return -ENOTEMPTY;
+
+ ret = bch_unlink(dir, dentry);
+ if (unlikely(ret))
+ return ret;
+
+ inode_dec_link_count(inode);
+ inode_dec_link_count(dir);
+
+ return 0;
+}
+
+static int bch_mknod(struct inode *dir, struct dentry *dentry,
+ umode_t mode, dev_t rdev)
+{
+ return __bch_create(dir, dentry, mode, rdev);
+}
+
+static int bch_rename(struct inode *old_dir, struct dentry *old_dentry,
+ struct inode *new_dir, struct dentry *new_dentry)
+{
+ struct cache_set *c = old_dir->i_sb->s_fs_info;
+ struct inode *old_inode = old_dentry->d_inode;
+ struct bch_inode_info *ei = to_bch_ei(old_inode);
+ struct inode *new_inode = new_dentry->d_inode;
+ struct timespec now = CURRENT_TIME;
+ int ret;
+
+ lockdep_assert_held(&old_dir->i_rwsem);
+ lockdep_assert_held(&new_dir->i_rwsem);
+
+ /*
+ * XXX: This isn't atomic w.r.t. unclean shutdowns, and we'd really like
+ * it to be
+ */
+
+ if (new_inode && S_ISDIR(old_inode->i_mode)) {
+ lockdep_assert_held(&new_inode->i_rwsem);
+
+ if (!S_ISDIR(new_inode->i_mode))
+ return -ENOTDIR;
+
+ if (bch_empty_dir(c, new_inode->i_ino))
+ return -ENOTEMPTY;
+
+ ret = bch_dirent_update(c, new_dir->i_ino,
+ &new_dentry->d_name,
+ old_inode->i_ino,
+ &ei->journal_seq);
+ if (unlikely(ret))
+ return ret;
+
+ clear_nlink(new_inode);
+ inode_dec_link_count(old_dir);
+ } else if (new_inode) {
+ lockdep_assert_held(&new_inode->i_rwsem);
+
+ ret = bch_dirent_update(c, new_dir->i_ino,
+ &new_dentry->d_name,
+ old_inode->i_ino,
+ &ei->journal_seq);
+ if (unlikely(ret))
+ return ret;
+
+ new_inode->i_ctime = now;
+ inode_dec_link_count(new_inode);
+ } else if (S_ISDIR(old_inode->i_mode)) {
+ ret = bch_vfs_dirent_create(c, new_dir,
+ mode_to_type(old_inode->i_mode),
+ &new_dentry->d_name,
+ old_inode);
+ if (unlikely(ret))
+ return ret;
+
+ inode_inc_link_count(new_dir);
+ inode_dec_link_count(old_dir);
+ } else {
+ ret = bch_vfs_dirent_create(c, new_dir,
+ mode_to_type(old_inode->i_mode),
+ &new_dentry->d_name,
+ old_inode);
+ if (unlikely(ret))
+ return ret;
+ }
+
+ old_dir->i_ctime = old_dir->i_mtime = now;
+ new_dir->i_ctime = new_dir->i_mtime = now;
+ mark_inode_dirty_sync(old_dir);
+ mark_inode_dirty_sync(new_dir);
+
+ /*
+ * Like most other Unix systems, set the ctime for inodes on a
+ * rename.
+ */
+ mutex_lock(&ei->update_lock);
+ old_inode->i_ctime = now;
+ if (new_inode)
+ old_inode->i_mtime = now;
+ __bch_write_inode(old_inode);
+ mutex_unlock(&ei->update_lock);
+
+ /* XXX: error handling */
+ bch_dirent_delete(c, old_dir->i_ino, &old_dentry->d_name);
+
+ return 0;
+}
+
+static int bch_truncate_page(struct address_space *mapping, loff_t from)
+{
+ unsigned offset = from & (PAGE_SIZE - 1);
+ struct page *page;
+ int ret = 0;
+
+ /* Page boundary? Nothing to do */
+ if (!offset)
+ return 0;
+
+ page = grab_cache_page(mapping, from >> PAGE_SHIFT);
+ if (unlikely(!page)) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ if (!PageUptodate(page))
+ if (bch_read_single_page(page, mapping)) {
+ ret = -EIO;
+ goto unlock;
+ }
+
+ zero_user_segment(page, offset, PAGE_SIZE);
+ set_page_dirty(page);
+unlock:
+ unlock_page(page);
+ put_page(page);
+out:
+ return ret;
+}
+
+static int bch_setattr(struct dentry *dentry, struct iattr *iattr)
+{
+ struct inode *inode = dentry->d_inode;
+ struct bch_inode_info *ei = to_bch_ei(inode);
+ struct cache_set *c = inode->i_sb->s_fs_info;
+ int ret = 0;
+
+ lockdep_assert_held(&inode->i_rwsem);
+
+ pr_debug("i_size was %llu update has %llu",
+ inode->i_size, iattr->ia_size);
+
+ ret = inode_change_ok(inode, iattr);
+ if (ret)
+ return ret;
+
+ if (iattr->ia_valid & ATTR_SIZE && iattr->ia_size != inode->i_size) {
+ inode_dio_wait(inode);
+
+ /*
+ * __bch_write_inode() clears I_SIZE_DIRTY if append_count == 0:
+ */
+ atomic_long_inc(&ei->append_count);
+
+ /*
+ * I_SIZE_DIRTY indicates that there's extents past the end of
+ * i_size, and must be set atomically with setting the new
+ * i_size:
+ */
+ mutex_lock(&ei->update_lock);
+ i_size_write(inode, iattr->ia_size);
+ ei->inode.v.i_flags |= BCH_INODE_I_SIZE_DIRTY;
+ ei->inode.v.i_size = iattr->ia_size;
+ __bch_write_inode(inode);
+ mutex_unlock(&ei->update_lock);
+
+ ret = bch_truncate_page(inode->i_mapping, iattr->ia_size);
+ if (unlikely(ret))
+ return ret;
+
+ if (iattr->ia_size > inode->i_size)
+ pagecache_isize_extended(inode, inode->i_size,
+ iattr->ia_size);
+ truncate_pagecache(inode, iattr->ia_size);
+
+ ret = bch_inode_truncate(c, inode->i_ino,
+ round_up(iattr->ia_size, PAGE_SIZE) >> 9);
+ if (unlikely(ret))
+ return ret;
+
+ /*
+ * Extents discarded; now clear I_SIZE_DIRTY (which
+ * __bch_write_inode() does once append_count is 0):
+ */
+ bch_append_put(ei);
+ inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+ }
+
+ mutex_lock(&ei->update_lock);
+ setattr_copy(inode, iattr);
+ __bch_write_inode(inode);
+ mutex_unlock(&ei->update_lock);
+
+ if (iattr->ia_valid & ATTR_MODE)
+ ret = posix_acl_chmod(inode, inode->i_mode);
+
+ return ret;
+}
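[Note: the ordering in the ATTR_SIZE branch above is the crash-consistency argument for truncate, and fs-gc depends on it; a condensed restatement, referencing only functions from this patch:

/*
 * Truncate protocol, as implemented in bch_setattr() above:
 *
 * 1. Journal the inode with BCH_INODE_I_SIZE_DIRTY set and the new
 *    i_size, before any data is touched.
 * 2. Zero the now-partial tail page (bch_truncate_page()) and drop
 *    cached pages past the new size (truncate_pagecache()).
 * 3. Discard extents past round_up(i_size, PAGE_SIZE) >> 9
 *    (bch_inode_truncate()).
 * 4. I_SIZE_DIRTY is cleared by the next __bch_write_inode() once
 *    append_count drops back to zero.
 *
 * On a crash between steps 1 and 4, bch_gc_do_inode() in fs-gc.c
 * sees I_SIZE_DIRTY and redoes step 3 at mount time, so stale
 * extents past i_size are never exposed to reads.
 */
]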
+
+static int bch_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
+{
+ struct cache_set *c = dir->i_sb->s_fs_info;
+ struct inode *inode;
+
+ /* XXX: i_nlink should be 0? */
+ inode = bch_vfs_inode_create(c, dir, mode, 0);
+ if (unlikely(IS_ERR(inode)))
+ return PTR_ERR(inode);
+
+ d_tmpfile(dentry, inode);
+ return 0;
+}
+
+static int bch_fill_extent(struct fiemap_extent_info *info,
+ struct bkey_i *k, int flags)
+{
+ struct bkey_s_c_extent e = bkey_i_to_s_c_extent(k);
+ const struct bch_extent_ptr *ptr;
+
+ extent_for_each_ptr(e, ptr) {
+ int ret = fiemap_fill_next_extent(info,
+ bkey_start_offset(e.k) << 9,
+ PTR_OFFSET(ptr) << 9,
+ e.k->size << 9, flags);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
+static int bch_fiemap(struct inode *inode, struct fiemap_extent_info *info,
+ u64 start, u64 len)
+{
+ struct cache_set *c = inode->i_sb->s_fs_info;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ BKEY_PADDED(k) tmp;
+ bool have_extent = false;
+ int ret = 0;
+
+ if (start + len < start)
+ return -EINVAL;
+
+ for_each_btree_key(&iter, c, BTREE_ID_EXTENTS,
+ POS(inode->i_ino, start >> 9), k)
+ if (k.k->type == BCH_EXTENT) {
+ if (bkey_cmp(bkey_start_pos(k.k),
+ POS(inode->i_ino, (start + len) >> 9)) >= 0)
+ break;
+
+ if (have_extent) {
+ ret = bch_fill_extent(info, &tmp.k, 0);
+ if (ret)
+ goto out;
+ }
+
+ bkey_reassemble(&tmp.k, k);
+ have_extent = true;
+ }
+
+ if (have_extent)
+ ret = bch_fill_extent(info, &tmp.k, FIEMAP_EXTENT_LAST);
+out:
+ bch_btree_iter_unlock(&iter);
+ return ret < 0 ? ret : 0;
+}
+
+static int bch_fsync(struct file *file, loff_t start, loff_t end, int datasync)
+{
+ struct inode *inode = file->f_mapping->host;
+ struct bch_inode_info *ei = to_bch_ei(inode);
+ struct cache_set *c = inode->i_sb->s_fs_info;
+ struct closure cl;
+ int ret;
+
+ closure_init_stack(&cl);
+
+ /*
+ * We really just want to sync all the PageAppend pages:
+ */
+ start = 0;
+ end = S64_MAX;
+
+ ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
+ if (ret)
+ return ret;
+
+ inode_lock(inode);
+ if (datasync && end <= ei->inode.v.i_size)
+ goto out;
+
+ /*
+ * redo after locking inode:
+ */
+ filemap_write_and_wait_range(inode->i_mapping, start, end);
+
+ wait_event(bch_append_wait,
+ !atomic_long_read(&ei->append_count));
+
+ mutex_lock(&ei->update_lock);
+ BUG_ON(atomic_long_read(&ei->append_count));
+ ret = __bch_write_inode(inode);
+ mutex_unlock(&ei->update_lock);
+out:
+ inode_unlock(inode);
+
+ bch_journal_push_seq(&c->journal, ei->journal_seq, &cl);
+ closure_sync(&cl);
+
+ return ret;
+}
+
+/* Flags that are appropriate for non-directories/regular files. */
+#define BCH_OTHER_FLMASK (FS_NODUMP_FL | FS_NOATIME_FL)
+
+static inline bool bch_flags_allowed(umode_t mode, u32 flags)
+{
+ if ((flags & BCH_FL_USER_FLAGS) != flags)
+ return false;
+
+ if (!S_ISREG(mode) &&
+ !S_ISDIR(mode) &&
+ (flags & BCH_OTHER_FLMASK) != flags)
+ return false;
+
+ return true;
+}
+
+static long bch_fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
+{
+ struct inode *inode = file_inode(filp);
+ struct bch_inode_info *ei = to_bch_ei(inode);
+ unsigned flags;
+ int ret;
+
+ switch (cmd) {
+ case FS_IOC_GETFLAGS:
+ flags = ei->inode.v.i_flags & BCH_FL_USER_FLAGS;
+ return put_user(flags, (int __user *) arg);
+
+ case FS_IOC_SETFLAGS: {
+ unsigned oldflags;
+
+ ret = mnt_want_write_file(filp);
+ if (ret)
+ return ret;
+
+ if (!inode_owner_or_capable(inode)) {
+ ret = -EACCES;
+ goto setflags_out;
+ }
+
+ if (get_user(flags, (int __user *) arg)) {
+ ret = -EFAULT;
+ goto setflags_out;
+ }
+
+ if (!bch_flags_allowed(inode->i_mode, flags)) {
+ ret = -EINVAL;
+ goto setflags_out;
+ }
+
+ inode_lock(inode);
+ oldflags = ei->inode.v.i_flags;
+
+ if (((flags ^ oldflags) & (FS_APPEND_FL|FS_IMMUTABLE_FL)) &&
+ !capable(CAP_LINUX_IMMUTABLE)) {
+ inode_unlock(inode);
+ ret = -EPERM;
+ goto setflags_out;
+ }
+
+ flags = flags & BCH_FL_USER_FLAGS;
+ flags |= oldflags & ~BCH_FL_USER_FLAGS;
+ ei->inode.v.i_flags = flags;
+
+ inode->i_ctime = CURRENT_TIME_SEC;
+ bch_set_inode_flags(inode);
+ inode_unlock(inode);
+
+ mark_inode_dirty(inode);
+setflags_out:
+ mnt_drop_write_file(filp);
+ return ret;
+ }
+ default:
+ return -ENOTTY;
+ }
+}
+
+#ifdef CONFIG_COMPAT
+long bch_compat_fs_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+ /* These are just misnamed, they actually get/put from/to user an int */
+ switch (cmd) {
+ case FS_IOC32_GETFLAGS:
+ cmd = FS_IOC_GETFLAGS;
+ break;
+ case FS_IOC32_SETFLAGS:
+ cmd = FS_IOC_SETFLAGS;
+ break;
+ default:
+ return -ENOIOCTLCMD;
+ }
+ return bch_fs_ioctl(file, cmd, (unsigned long) compat_ptr(arg));
+}
+#endif
+
+static loff_t bch_dir_llseek(struct file *file, loff_t offset, int whence)
+{
+ return generic_file_llseek_size(file, offset, whence,
+ S64_MAX, S64_MAX);
+}
+
+static const struct file_operations bch_file_operations = {
+ .llseek = generic_file_llseek,
+ .read_iter = generic_file_read_iter,
+ .write_iter = generic_file_write_iter,
+ .mmap = generic_file_mmap,
+ .open = generic_file_open,
+ .fsync = bch_fsync,
+ .splice_read = generic_file_splice_read,
+ .splice_write = iter_file_splice_write,
+
+ .unlocked_ioctl = bch_fs_ioctl,
+#ifdef CONFIG_COMPAT
+ .compat_ioctl = bch_compat_fs_ioctl,
+#endif
+};
+
+static const struct inode_operations bch_file_inode_operations = {
+ .setattr = bch_setattr,
+ .fiemap = bch_fiemap,
+ .setxattr = generic_setxattr,
+ .getxattr = generic_getxattr,
+ .listxattr = bch_xattr_list,
+ .removexattr = generic_removexattr,
+ .get_acl = bch_get_acl,
+ .set_acl = bch_set_acl,
+};
+
+static const struct inode_operations bch_dir_inode_operations = {
+ .lookup = bch_lookup,
+ .create = bch_create,
+ .link = bch_link,
+ .unlink = bch_unlink,
+ .symlink = bch_symlink,
+ .mkdir = bch_mkdir,
+ .rmdir = bch_rmdir,
+ .mknod = bch_mknod,
+ .rename = bch_rename,
+ .setattr = bch_setattr,
+ .tmpfile = bch_tmpfile,
+ .setxattr = generic_setxattr,
+ .getxattr = generic_getxattr,
+ .listxattr = bch_xattr_list,
+ .removexattr = generic_removexattr,
+ .get_acl = bch_get_acl,
+ .set_acl = bch_set_acl,
+};
+
+static const struct file_operations bch_dir_file_operations = {
+ .llseek = bch_dir_llseek,
+ .read = generic_read_dir,
+ .iterate = bch_readdir,
+ .fsync = bch_fsync,
+
+ .unlocked_ioctl = bch_fs_ioctl,
+#ifdef CONFIG_COMPAT
+ .compat_ioctl = bch_compat_fs_ioctl,
+#endif
+};
+
+static const struct inode_operations bch_symlink_inode_operations = {
+ .readlink = generic_readlink,
+ .get_link = page_get_link,
+ .setattr = bch_setattr,
+
+ .setxattr = generic_setxattr,
+ .getxattr = generic_getxattr,
+ .listxattr = bch_xattr_list,
+ .removexattr = generic_removexattr,
+ .get_acl = bch_get_acl,
+ .set_acl = bch_set_acl,
+};
+
+static const struct inode_operations bch_special_inode_operations = {
+ .setattr = bch_setattr,
+ .setxattr = generic_setxattr,
+ .getxattr = generic_getxattr,
+ .listxattr = bch_xattr_list,
+ .removexattr = generic_removexattr,
+ .get_acl = bch_get_acl,
+ .set_acl = bch_set_acl,
+};
+
+static int bch_bio_add_page(struct bio *bio, struct page *page)
+{
+ sector_t offset = (sector_t) page->index << (PAGE_SHIFT - 9);
+
+ if (!bio->bi_vcnt) {
+ bio->bi_iter.bi_sector = offset;
+ } else if (bio_end_sector(bio) != offset ||
+ bio->bi_vcnt == bio->bi_max_vecs)
+ return -1;
+
+ bio->bi_io_vec[bio->bi_vcnt++] = (struct bio_vec) {
+ .bv_page = page,
+ .bv_len = PAGE_SIZE,
+ .bv_offset = 0,
+ };
+
+ bio->bi_iter.bi_size += PAGE_SIZE;
+
+ return 0;
+}
+
+static void bch_readpages_end_io(struct bio *bio)
+{
+ struct bio_vec *bv;
+ int i;
+
+ bio_for_each_segment_all(bv, bio, i) {
+ struct page *page = bv->bv_page;
+
+ if (!bio->bi_error) {
+ SetPageUptodate(page);
+ } else {
+ ClearPageUptodate(page);
+ SetPageError(page);
+ }
+ unlock_page(page);
+ }
+
+ bio_put(bio);
+}
+
+static int bch_readpages(struct file *file, struct address_space *mapping,
+ struct list_head *pages, unsigned nr_pages)
+{
+ struct inode *inode = mapping->host;
+ struct cache_set *c = inode->i_sb->s_fs_info;
+ struct bio *bio = NULL;
+ struct page *page;
+ ssize_t ret;
+
+ pr_debug("reading %u pages", nr_pages);
+
+ while (nr_pages) {
+ page = list_entry(pages->prev, struct page, lru);
+ prefetchw(&page->flags);
+ list_del(&page->lru);
+
+ if (!add_to_page_cache_lru(page, mapping,
+ page->index, GFP_NOFS)) {
+again:
+ if (!bio) {
+ bio = bio_alloc(GFP_NOFS,
+ min_t(unsigned, nr_pages,
+ BIO_MAX_PAGES));
+
+ bio->bi_end_io = bch_readpages_end_io;
+ }
+
+ if (bch_bio_add_page(bio, page)) {
+ ret = bch_read(c, bio, inode->i_ino);
+ bio_endio(bio);
+ bio = NULL;
+
+ if (ret < 0) {
+ pr_debug("error %zi", ret);
+ return ret;
+ }
+ goto again;
+ }
+ }
+
+ nr_pages--;
+ put_page(page);
+ }
+
+ if (bio) {
+ ret = bch_read(c, bio, inode->i_ino);
+ bio_endio(bio);
+
+ if (ret < 0) {
+ pr_debug("error %zi", ret);
+ return ret;
+ }
+ }
+
+ pr_debug("success");
+ return 0;
+}
+
+static int bch_readpage(struct file *file, struct page *page)
+{
+ struct address_space *mapping = page->mapping;
+ struct inode *inode = mapping->host;
+ struct cache_set *c = inode->i_sb->s_fs_info;
+ struct bio *bio;
+ int ret;
+
+ bio = bio_alloc(GFP_NOFS, 1);
+ bio_set_op_attrs(bio, REQ_OP_READ, REQ_SYNC);
+ bio->bi_end_io = bch_readpages_end_io;
+
+ bch_bio_add_page(bio, page);
+
+ ret = bch_read(c, bio, inode->i_ino);
+ bio_endio(bio);
+
+ return ret;
+}
+
+struct bch_writepage_io {
+ struct closure cl;
+ struct bch_write_op op;
+ struct bbio bio;
+};
+
+struct bch_writepage {
+ struct cache_set *c;
+ u64 inum;
+ struct bch_writepage_io *io;
+};
+
+static void bch_writepage_io_free(struct closure *cl)
+{
+ struct bch_writepage_io *io = container_of(cl,
+ struct bch_writepage_io, cl);
+ struct cache_set *c = io->op.c;
+ struct inode *inode = io->bio.bio.bi_io_vec[0].bv_page->mapping->host;
+ struct bch_inode_info *ei = to_bch_ei(inode);
+ struct bio_vec *bvec;
+ int i;
+
+ bio_for_each_segment_all(bvec, &io->bio.bio, i) {
+ struct page *page = bvec->bv_page;
+
+ BUG_ON(!PageWriteback(page));
+
+ if (io->bio.bio.bi_error) {
+ SetPageError(page);
+ if (page->mapping)
+ set_bit(AS_EIO, &page->mapping->flags);
+ }
+
+ bch_clear_page_bits(c, ei, page);
+ end_page_writeback(page);
+ }
+
+ bio_put(&io->bio.bio);
+}
+
+static void bch_writepage_do_io(struct bch_writepage_io *io)
+{
+ pr_debug("writing %u sectors to %llu:%llu",
+ bio_sectors(&io->bio.bio),
+ io->op.insert_key.k.p.inode,
+ (u64) io->bio.bio.bi_iter.bi_sector);
+
+ closure_call(&io->op.cl, bch_write, NULL, &io->cl);
+ closure_return_with_destructor(&io->cl, bch_writepage_io_free);
+}
+
+static int __bch_writepage(struct page *page, struct writeback_control *wbc,
+ void *data)
+{
+ struct inode *inode = page->mapping->host;
+ struct bch_inode_info *ei = to_bch_ei(inode);
+ struct bch_writepage *w = data;
+ struct bio *bio;
+ unsigned offset;
+ loff_t i_size = i_size_read(inode);
+ pgoff_t end_index = i_size >> PAGE_SHIFT;
+
+ /* Is the page fully inside i_size? */
+ if (page->index < end_index)
+ goto do_io;
+
+ /* Is the page fully outside i_size? (truncate in progress) */
+ offset = i_size & (PAGE_SIZE - 1);
+ if (page->index > end_index || !offset) {
+ unlock_page(page);
+ return 0;
+ }
+
+ /*
+ * The page straddles i_size. It must be zeroed out on each and every
+ * writepage invocation because it may be mmapped. "A file is mapped
+ * in multiples of the page size. For a file that is not a multiple of
+ * the page size, the remaining memory is zeroed when mapped, and
+ * writes to that region are not written out to the file."
+ */
+ zero_user_segment(page, offset, PAGE_SIZE);
+do_io:
+	/* XXX: how are we going to make this synchronization efficient? */
+ mutex_lock(&ei->update_lock);
+
+ if (ei->inode.v.i_size < i_size &&
+ page->index >= (ei->inode.v.i_size >> PAGE_SHIFT) &&
+ !(ei->inode.v.i_flags & BCH_INODE_I_SIZE_DIRTY)) {
+ ei->inode.v.i_flags |= BCH_INODE_I_SIZE_DIRTY;
+ __bch_write_inode(inode);
+ }
+
+ mutex_unlock(&ei->update_lock);
+
+ if (!w->io) {
+ bio = bio_alloc_bioset(GFP_NOFS, BIO_MAX_PAGES, bch_fs_bioset);
+ w->io = container_of(bio, struct bch_writepage_io, bio.bio);
+
+ closure_init(&w->io->cl, NULL);
+ bch_write_op_init(&w->io->op, w->c, bio, NULL,
+ bkey_to_s_c(&KEY(w->inum, 0, 0)),
+ bkey_s_c_null, 0);
+ w->io->op.journal_seq = &ei->journal_seq;
+ }
+
+ if (bch_bio_add_page(&w->io->bio.bio, page)) {
+ bch_writepage_do_io(w->io);
+ w->io = NULL;
+ goto do_io;
+ }
+
+ BUG_ON(PageWriteback(page));
+ set_page_writeback(page);
+ unlock_page(page);
+
+ return 0;
+}
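A worked sketch of the straddling-page arithmetic above, with hypothetical
numbers (PAGE_SIZE, i_size and the page indices are made up for illustration):

	#include <assert.h>
	#include <stdint.h>

	#define EX_PAGE_SHIFT	12
	#define EX_PAGE_SIZE	(1UL << EX_PAGE_SHIFT)

	int main(void)
	{
		uint64_t i_size    = 10000;
		uint64_t end_index = i_size >> EX_PAGE_SHIFT;	  /* 2 */
		unsigned offset    = i_size & (EX_PAGE_SIZE - 1); /* 1808 */

		/* Pages 0 and 1 are fully inside i_size and written whole;
		 * pages past index 2 are fully outside and skipped; page 2
		 * straddles i_size, so bytes [offset, EX_PAGE_SIZE) are
		 * zeroed before each writeback, as in the code above. */
		assert(end_index == 2 && offset == 1808);
		return 0;
	}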
+
+static int bch_writepages(struct address_space *mapping,
+ struct writeback_control *wbc)
+{
+ int ret;
+ struct bch_writepage w = {
+ .c = mapping->host->i_sb->s_fs_info,
+ .inum = mapping->host->i_ino,
+ .io = NULL,
+ };
+
+ ret = write_cache_pages(mapping, wbc, __bch_writepage, &w);
+
+ if (w.io)
+ bch_writepage_do_io(w.io);
+
+ return ret;
+}
+
+static int bch_writepage(struct page *page, struct writeback_control *wbc)
+{
+ struct inode *inode = page->mapping->host;
+ struct bch_writepage w = {
+ .c = inode->i_sb->s_fs_info,
+ .inum = inode->i_ino,
+ .io = NULL,
+ };
+
+	__bch_writepage(page, NULL, &w);
+
+	if (w.io)
+		bch_writepage_do_io(w.io);
+
+ return 0;
+}
+
+static void bch_read_single_page_end_io(struct bio *bio)
+{
+ complete(bio->bi_private);
+}
+
+static int bch_read_single_page(struct page *page,
+ struct address_space *mapping)
+{
+ struct inode *inode = mapping->host;
+ struct cache_set *c = inode->i_sb->s_fs_info;
+ struct bio *bio;
+ int ret;
+ DECLARE_COMPLETION_ONSTACK(done);
+
+ bio = bio_alloc(GFP_NOFS, 1);
+ bio_set_op_attrs(bio, REQ_OP_READ, REQ_SYNC);
+ bio->bi_private = &done;
+ bio->bi_end_io = bch_read_single_page_end_io;
+ bch_bio_add_page(bio, page);
+
+ ret = bch_read(c, bio, inode->i_ino);
+ bio_endio(bio);
+ wait_for_completion(&done);
+
+ if (!ret)
+ ret = bio->bi_error;
+ bio_put(bio);
+
+ if (ret < 0)
+ return ret;
+
+ SetPageUptodate(page);
+
+ return 0;
+}
+
+static int bch_write_begin(struct file *file, struct address_space *mapping,
+ loff_t pos, unsigned len, unsigned flags,
+ struct page **pagep, void **fsdata)
+{
+ struct inode *inode = mapping->host;
+ struct cache_set *c = inode->i_sb->s_fs_info;
+ pgoff_t index = pos >> PAGE_SHIFT;
+ struct page *page;
+ int ret = 0;
+
+ BUG_ON(inode_unhashed(mapping->host));
+
+ page = grab_cache_page_write_begin(mapping, index, flags);
+ if (!page)
+ return -ENOMEM;
+
+ if (!PageAllocated(page)) {
+ if (reserve_sectors(c, PAGE_SECTORS)) {
+ ret = -ENOSPC;
+ goto err;
+ }
+
+ SetPageAllocated(page);
+ }
+
+ if (PageUptodate(page))
+ goto out;
+
+	/* If we're writing the entire page, we don't need to read it in first: */
+ if (len == PAGE_SIZE)
+ goto out;
+
+ if (pos + len >= inode->i_size) {
+ unsigned offset = pos & (PAGE_SIZE - 1);
+
+ /*
+		 * If the write extends past i_size, the top part of the page
+ * we're not writing to doesn't need to be read in, just zeroed:
+ */
+ zero_user(page, offset + len, PAGE_SIZE - offset - len);
+ flush_dcache_page(page);
+
+ if (!offset)
+ goto out;
+
+ /*
+ * If the start of the page is past i_size, zero that part too:
+ */
+		if ((index << PAGE_SHIFT) >= inode->i_size) {
+ zero_user(page, 0, offset);
+ flush_dcache_page(page);
+ goto out;
+ }
+ }
+
+ ret = bch_read_single_page(page, mapping);
+ if (ret)
+ goto err;
+out:
+ *pagep = page;
+ return ret;
+err:
+ unlock_page(page);
+ put_page(page);
+ page = NULL;
+ goto out;
+}
+
+static int bch_write_end(struct file *filp, struct address_space *mapping,
+ loff_t pos, unsigned len, unsigned copied,
+ struct page *page, void *fsdata)
+{
+ loff_t last_pos = pos + copied;
+ struct inode *inode = page->mapping->host;
+ struct bch_inode_info *ei = to_bch_ei(inode);
+
+	/*
+	 * We can't set a page dirty without holding i_rwsem, to avoid racing
+	 * with truncate:
+	 */
+ lockdep_assert_held(&inode->i_rwsem);
+
+ if (unlikely(copied < len)) {
+#if 0
+ if (!PageUptodate(page)) {
+ /* we skipped reading in the page before, read it now.. */
+ }
+#endif
+
+ /*
+ * zero out the rest of the area
+ */
+ unsigned from = pos & (PAGE_SIZE - 1);
+
+ zero_user(page, from + copied, len - copied);
+ flush_dcache_page(page);
+ }
+
+ if (!PageUptodate(page))
+ SetPageUptodate(page);
+ if (!PageDirty(page))
+ set_page_dirty(page);
+
+ if (last_pos > inode->i_size) {
+ mutex_lock(&ei->update_lock);
+
+ if (!TestSetPageAppend(page))
+ atomic_long_inc(&ei->append_count);
+
+ i_size_write(inode, last_pos);
+ mark_inode_dirty(inode);
+
+ mutex_unlock(&ei->update_lock);
+ }
+
+ unlock_page(page);
+ put_page(page);
+
+ return copied;
+}
+
+static void bch_invalidatepage(struct page *page, unsigned int offset,
+ unsigned int length)
+{
+ struct inode *inode = page->mapping->host;
+ struct bch_inode_info *ei = to_bch_ei(inode);
+ struct cache_set *c = inode->i_sb->s_fs_info;
+
+ BUG_ON(!PageLocked(page));
+ BUG_ON(PageWriteback(page));
+
+ if (offset || length < PAGE_SIZE)
+ return;
+
+ bch_clear_page_bits(c, ei, page);
+}
+
+static int bch_releasepage(struct page *page, gfp_t gfp_mask)
+{
+ struct inode *inode = page->mapping->host;
+ struct bch_inode_info *ei = to_bch_ei(inode);
+ struct cache_set *c = inode->i_sb->s_fs_info;
+
+ BUG_ON(!PageLocked(page));
+ BUG_ON(PageWriteback(page));
+
+ bch_clear_page_bits(c, ei, page);
+
+ if (PageDirty(page)) {
+ ClearPageDirty(page);
+ cancel_dirty_page(page);
+ }
+
+ return 1;
+}
+
+/* O_DIRECT */
+
+static struct bio_set *bch_dio_read_bioset;
+
+struct dio_read {
+ struct closure cl;
+ struct kiocb *req;
+ long ret;
+ struct bio bio;
+};
+
+static void bch_dio_read_complete(struct closure *cl)
+{
+ struct dio_read *dio = container_of(cl, struct dio_read, cl);
+
+ dio->req->ki_complete(dio->req, dio->ret, 0);
+ bio_put(&dio->bio);
+}
+
+static void bch_direct_IO_read_endio(struct bio *bio)
+{
+ struct dio_read *dio = bio->bi_private;
+
+ if (bio->bi_error)
+ dio->ret = bio->bi_error;
+
+ closure_put(&dio->cl);
+ bio_check_pages_dirty(bio); /* transfers ownership */
+}
+
+static int bch_direct_IO_read(struct cache_set *c, struct kiocb *req,
+ struct file *file, struct inode *inode,
+ struct iov_iter *iter, loff_t offset)
+{
+ struct dio_read *dio;
+ struct bio *bio;
+ unsigned long inum = inode->i_ino;
+ ssize_t ret = 0;
+ size_t pages = iov_iter_npages(iter, BIO_MAX_PAGES);
+ loff_t i_size;
+
+ bio = bio_alloc_bioset(GFP_KERNEL, pages, bch_dio_read_bioset);
+ bio_get(bio);
+
+ dio = container_of(bio, struct dio_read, bio);
+ closure_init(&dio->cl, NULL);
+ dio->req = req;
+ dio->ret = iter->count;
+
+ i_size = i_size_read(inode);
+ if (offset + dio->ret > i_size) {
+ dio->ret = max_t(loff_t, 0, i_size - offset);
+ iter->count = round_up(dio->ret, PAGE_SIZE);
+ }
+
+ if (!dio->ret)
+ goto out;
+
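+	/*
+	 * The first bio is the one embedded in the dio (allocated from
+	 * bch_dio_read_bioset above); subsequent bios are plain allocations:
+	 */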
+ goto start;
+ while (iter->count && !ret) {
+ pages = iov_iter_npages(iter, BIO_MAX_PAGES);
+ bio = bio_alloc(GFP_KERNEL, pages);
+start:
+ bio->bi_iter.bi_sector = offset >> 9;
+ bio->bi_end_io = bch_direct_IO_read_endio;
+ bio->bi_private = dio;
+
+ ret = bio_get_user_pages(bio, iter, 1);
+ if (ret < 0) {
+ dio->ret = ret;
+ bio_put(bio);
+ break;
+ }
+
+ offset += bio->bi_iter.bi_size;
+ bio_set_pages_dirty(bio);
+
+ closure_get(&dio->cl);
+ ret = bch_read(c, bio, inum);
+ if (ret)
+ bio->bi_error = ret;
+ bio_endio(bio);
+ }
+out:
+ if (is_sync_kiocb(req)) {
+ closure_sync(&dio->cl);
+ closure_debug_destroy(&dio->cl);
+ ret = dio->ret;
+ bio_put(&dio->bio);
+ return ret;
+ } else {
+ closure_return_with_destructor_noreturn(&dio->cl,
+ bch_dio_read_complete);
+ return -EIOCBQUEUED;
+ }
+}
+
+struct dio_write {
+ struct closure cl;
+ struct kiocb *req;
+ long ret;
+ bool append;
+};
+
+struct dio_write_bio {
+ struct closure cl;
+ struct dio_write *dio;
+ struct bch_write_op iop;
+ struct bbio bio;
+};
+
+static void __bch_dio_write_complete(struct dio_write *dio)
+{
+ struct bch_inode_info *ei = to_bch_ei(dio->req->ki_filp->f_inode);
+
+ if (dio->append)
+ bch_append_put(ei);
+ inode_dio_end(dio->req->ki_filp->f_inode);
+ kfree(dio);
+}
+
+static void bch_dio_write_complete(struct closure *cl)
+{
+ struct dio_write *dio = container_of(cl, struct dio_write, cl);
+ struct kiocb *req = dio->req;
+ long ret = dio->ret;
+
+ __bch_dio_write_complete(dio);
+ req->ki_complete(req, ret, 0);
+}
+
+static void bch_direct_IO_write_done(struct closure *cl)
+{
+ struct dio_write_bio *op = container_of(cl,
+ struct dio_write_bio, cl);
+ struct bio_vec *bv;
+ int i;
+
+ if (op->iop.error)
+ op->dio->ret = op->iop.error;
+ closure_put(&op->dio->cl);
+
+ bio_for_each_segment_all(bv, &op->bio.bio, i)
+ put_page(bv->bv_page);
+ kfree(op);
+}
+
+static int bch_direct_IO_write(struct cache_set *c, struct kiocb *req,
+ struct file *file, struct inode *inode,
+ struct iov_iter *iter, loff_t offset)
+{
+ struct bch_inode_info *ei = to_bch_ei(inode);
+ struct dio_write *dio;
+ struct dio_write_bio *op;
+ struct bio *bio;
+ unsigned long inum = inode->i_ino;
+ unsigned flags = BCH_WRITE_CHECK_ENOSPC;
+ ssize_t ret = 0;
+
+ lockdep_assert_held(&inode->i_rwsem);
+
+ if (file->f_flags & O_DSYNC || IS_SYNC(file->f_mapping->host))
+ flags |= BCH_WRITE_FLUSH;
+
+ dio = kmalloc(sizeof(*dio), GFP_NOIO);
+ if (!dio)
+ return -ENOMEM;
+
+ closure_init(&dio->cl, NULL);
+ dio->req = req;
+ dio->ret = iter->count;
+ dio->append = false;
+
+ if (offset + iter->count > inode->i_size) {
+ dio->append = true;
+ atomic_long_inc(&ei->append_count);
+
+ mutex_lock(&ei->update_lock);
+ if (!(ei->inode.v.i_flags & BCH_INODE_I_SIZE_DIRTY)) {
+ ei->inode.v.i_flags |= BCH_INODE_I_SIZE_DIRTY;
+ __bch_write_inode(inode);
+ }
+ mutex_unlock(&ei->update_lock);
+ }
+
+	/* Decremented by inode_dio_end(): */
+ atomic_inc(&inode->i_dio_count);
+
+ while (iter->count) {
+ size_t pages = iov_iter_npages(iter, BIO_MAX_PAGES);
+
+ op = kmalloc(sizeof(*op) + sizeof(struct bio_vec) * pages,
+ GFP_NOIO);
+ if (!op) {
+ dio->ret = -ENOMEM;
+ break;
+ }
+
+ bio = &op->bio.bio;
+ bio_init(bio);
+ bio->bi_iter.bi_sector = offset >> 9;
+ bio->bi_max_vecs = pages;
+ bio->bi_io_vec = bio->bi_inline_vecs;
+
+ ret = bio_get_user_pages(bio, iter, 0);
+ if (ret < 0) {
+ dio->ret = ret;
+ kfree(op);
+ break;
+ }
+
+ offset += bio->bi_iter.bi_size;
+ closure_get(&dio->cl);
+ op->dio = dio;
+ closure_init(&op->cl, NULL);
+
+ bch_write_op_init(&op->iop, c, bio, NULL,
+ bkey_to_s_c(&KEY(inum,
+ bio_end_sector(bio),
+ bio_sectors(bio))),
+ bkey_s_c_null, flags);
+ op->iop.journal_seq = &ei->journal_seq;
+
+ task_io_account_write(bio->bi_iter.bi_size);
+
+ closure_call(&op->iop.cl, bch_write, NULL, &op->cl);
+ closure_return_with_destructor_noreturn(&op->cl,
+ bch_direct_IO_write_done);
+ }
+
+ if (is_sync_kiocb(req) || dio->append) {
+ /*
+ * appends are sync in order to do the i_size update under
+ * i_rwsem, after we know the write has completed successfully
+ */
+ closure_sync(&dio->cl);
+ closure_debug_destroy(&dio->cl);
+ ret = dio->ret;
+
+ if (ret > 0 &&
+ offset > inode->i_size) {
+ i_size_write(inode, offset);
+ mark_inode_dirty(inode);
+ }
+
+ __bch_dio_write_complete(dio);
+ return ret;
+ } else {
+ closure_return_with_destructor_noreturn(&dio->cl,
+ bch_dio_write_complete);
+ return -EIOCBQUEUED;
+ }
+}
+
+static ssize_t bch_direct_IO(struct kiocb *req, struct iov_iter *iter)
+{
+ struct file *file = req->ki_filp;
+ struct inode *inode = file->f_inode;
+ struct cache_set *c = inode->i_sb->s_fs_info;
+
+ if ((req->ki_pos|iter->count) & (block_bytes(c) - 1))
+ return -EINVAL;
+
+ return ((iov_iter_rw(iter) == WRITE)
+ ? bch_direct_IO_write
+ : bch_direct_IO_read)(c, req, file, inode, iter, req->ki_pos);
+}
+
+#ifdef CONFIG_MIGRATION
+static int bch_migrate_page(struct address_space *mapping,
+ struct page *newpage, struct page *page,
+ enum migrate_mode mode)
+{
+ int ret;
+
+ ret = migrate_page_move_mapping(mapping, newpage, page, NULL, mode, 0);
+ if (ret != MIGRATEPAGE_SUCCESS)
+ return ret;
+
+ if (PageAllocated(page)) {
+ ClearPageAllocated(page);
+ SetPageAllocated(newpage);
+ }
+
+ if (PageAppend(page)) {
+ ClearPageAppend(page);
+ SetPageAppend(newpage);
+ }
+
+ migrate_page_copy(newpage, page);
+ return MIGRATEPAGE_SUCCESS;
+}
+#endif
+
+static const struct address_space_operations bch_address_space_operations = {
+ .writepage = bch_writepage,
+ .readpage = bch_readpage,
+ .writepages = bch_writepages,
+ .readpages = bch_readpages,
+
+ .set_page_dirty = __set_page_dirty_nobuffers,
+
+ .write_begin = bch_write_begin,
+ .write_end = bch_write_end,
+ .invalidatepage = bch_invalidatepage,
+ .releasepage = bch_releasepage,
+
+ .direct_IO = bch_direct_IO,
+
+#ifdef CONFIG_MIGRATION
+ .migratepage = bch_migrate_page,
+#endif
+ .error_remove_page = generic_error_remove_page,
+};
+
+static void bch_inode_init(struct bch_inode_info *ei)
+{
+ struct inode *inode = &ei->vfs_inode;
+ struct bch_inode *bi = &ei->inode.v;
+
+ pr_debug("init inode %llu with mode %o",
+ ei->inode.k.p.inode, bi->i_mode);
+
+ BUG_ON(atomic_long_read(&ei->append_count));
+
+ inode->i_mode = bi->i_mode;
+ i_uid_write(inode, bi->i_uid);
+ i_gid_write(inode, bi->i_gid);
+
+ inode->i_ino = ei->inode.k.p.inode;
+ set_nlink(inode, bi->i_nlink);
+ inode->i_rdev = bi->i_dev;
+ inode->i_size = bi->i_size;
+ inode->i_atime = ns_to_timespec(bi->i_atime);
+ inode->i_mtime = ns_to_timespec(bi->i_mtime);
+ inode->i_ctime = ns_to_timespec(bi->i_ctime);
+ bch_set_inode_flags(inode);
+
+ inode->i_mapping->a_ops = &bch_address_space_operations;
+
+ switch (inode->i_mode & S_IFMT) {
+ case S_IFREG:
+ inode->i_op = &bch_file_inode_operations;
+ inode->i_fop = &bch_file_operations;
+ break;
+ case S_IFDIR:
+ inode->i_op = &bch_dir_inode_operations;
+ inode->i_fop = &bch_dir_file_operations;
+ break;
+ case S_IFLNK:
+ inode_nohighmem(inode);
+ inode->i_op = &bch_symlink_inode_operations;
+ break;
+ default:
+ init_special_inode(inode, inode->i_mode, inode->i_rdev);
+ inode->i_op = &bch_special_inode_operations;
+ break;
+ }
+}
+
+static struct inode *bch_alloc_inode(struct super_block *sb)
+{
+ struct bch_inode_info *ei;
+
+ ei = kmem_cache_alloc(bch_inode_cache, GFP_NOFS);
+ if (!ei)
+ return NULL;
+
+ pr_debug("allocated %p", &ei->vfs_inode);
+
+ inode_init_once(&ei->vfs_inode);
+ mutex_init(&ei->update_lock);
+ ei->journal_seq = 0;
+ atomic_long_set(&ei->append_count, 0);
+
+ return &ei->vfs_inode;
+}
+
+static void bch_i_callback(struct rcu_head *head)
+{
+ struct inode *inode = container_of(head, struct inode, i_rcu);
+
+ kmem_cache_free(bch_inode_cache, to_bch_ei(inode));
+}
+
+static void bch_destroy_inode(struct inode *inode)
+{
+ call_rcu(&inode->i_rcu, bch_i_callback);
+}
+
+static int bch_write_inode(struct inode *inode, struct writeback_control *wbc)
+{
+ struct cache_set *c = inode->i_sb->s_fs_info;
+ struct bch_inode_info *ei = to_bch_ei(inode);
+ int ret;
+
+ mutex_lock(&ei->update_lock);
+ ret = __bch_write_inode(inode);
+ mutex_unlock(&ei->update_lock);
+
+ if (!ret && wbc->sync_mode == WB_SYNC_ALL) {
+ struct closure cl;
+
+ closure_init_stack(&cl);
+ bch_journal_push_seq(&c->journal, ei->journal_seq, &cl);
+ closure_sync(&cl);
+ }
+
+ return ret;
+}
+
+static void bch_evict_inode(struct inode *inode)
+{
+ struct cache_set *c = inode->i_sb->s_fs_info;
+ struct bch_inode_info *ei = to_bch_ei(inode);
+
+ if (inode->i_nlink) {
+ truncate_inode_pages_final(&inode->i_data);
+
+ mutex_lock(&ei->update_lock);
+ BUG_ON(atomic_long_read(&ei->append_count));
+
+ if (!(inode->i_state & I_NEW) &&
+ (ei->inode.v.i_flags & BCH_INODE_I_SIZE_DIRTY ||
+ inode->i_size != ei->inode.v.i_size))
+ __bch_write_inode(inode);
+ mutex_unlock(&ei->update_lock);
+
+ clear_inode(inode);
+ } else if (!bkey_deleted(&ei->inode.k)) {
+ atomic_long_inc(&ei->append_count);
+
+ mutex_lock(&ei->update_lock);
+ ei->inode.v.i_flags |= BCH_INODE_I_SIZE_DIRTY;
+ ei->inode.v.i_size = 0;
+ i_size_write(inode, 0);
+ __bch_write_inode(inode);
+ mutex_unlock(&ei->update_lock);
+
+ truncate_inode_pages_final(&inode->i_data);
+ clear_inode(inode);
+
+ /*
+ * write_inode() shouldn't be called again - this will cause it
+ * to BUG():
+ */
+ ei->inode.k.type = KEY_TYPE_DELETED;
+ atomic_long_dec_bug(&ei->append_count);
+
+ bch_inode_rm(c, inode->i_ino);
+ atomic_long_dec(&c->nr_inodes);
+ } else {
+ /* bch_inode_create() failed: */
+ clear_inode(inode);
+ }
+}
+
+static int bch_statfs(struct dentry *dentry, struct kstatfs *buf)
+{
+ struct super_block *sb = dentry->d_sb;
+ struct cache_set *c = sb->s_fs_info;
+
+ buf->f_type = BCACHE_STATFS_MAGIC;
+ buf->f_bsize = sb->s_blocksize;
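+	/* c->capacity is in 512-byte sectors; the shift converts it to
+	 * blocks of f_bsize (PAGE_SIZE) bytes: */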
+ buf->f_blocks = c->capacity >> (PAGE_SHIFT - 9);
+ buf->f_bfree = (c->capacity - cache_set_sectors_used(c)) >>
+ (PAGE_SHIFT - 9);
+ buf->f_bavail = buf->f_bfree;
+ buf->f_files = atomic_long_read(&c->nr_inodes);
+ buf->f_namelen = NAME_MAX;
+
+ return 0;
+}
+
+static int bch_sync_fs(struct super_block *sb, int wait)
+{
+ struct cache_set *c = sb->s_fs_info;
+ struct closure cl;
+
+ closure_init_stack(&cl);
+
+ /* XXX: should only push a journal write if it's dirty */
+ bch_journal_flush(&c->journal, wait ? &cl : NULL);
+ closure_sync(&cl);
+ return 0;
+}
+
+static const struct super_operations bch_super_operations = {
+ .alloc_inode = bch_alloc_inode,
+ .destroy_inode = bch_destroy_inode,
+ .write_inode = bch_write_inode,
+ .evict_inode = bch_evict_inode,
+ .sync_fs = bch_sync_fs,
+ .statfs = bch_statfs,
+ .show_options = generic_show_options,
+#if 0
+ .put_super = bch_put_super,
+ .freeze_fs = bch_freeze,
+ .unfreeze_fs = bch_unfreeze,
+ .remount_fs = bch_remount,
+#endif
+};
+
+static struct cache_set *bch_open_as_blockdevs(const char *_dev_name)
+{
+ size_t nr_devs = 0, i = 0;
+ char *dev_name, *s, **devs;
+ struct cache_set *c = NULL;
+ const char *err;
+
+ dev_name = kstrdup(_dev_name, GFP_KERNEL);
+ if (!dev_name)
+ return NULL;
+
+ for (s = dev_name; s; s = strchr(s + 1, ':'))
+ nr_devs++;
+
+ devs = kcalloc(nr_devs, sizeof(const char *), GFP_KERNEL);
+ if (!devs)
+ goto out;
+
+ for (i = 0, s = dev_name;
+ s;
+ (s = strchr(s, ':')) && (*s++ = '\0'))
+ devs[i++] = s;
+
+ err = bch_register_cache_set(devs, nr_devs, &c);
+ if (err) {
+ pr_err("register_cache_set err %s", err);
+ goto out;
+ }
+
+ set_bit(CACHE_SET_BDEV_MOUNTED, &c->flags);
+out:
+ kfree(devs);
+ kfree(dev_name);
+
+ return c;
+}
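For reference, a multi-device cache set is named as a colon-separated device
list, e.g. "mount -t bcache /dev/sdb:/dev/sdc /mnt" (device names here are
hypothetical). The splitting idiom above is terse; the same logic as a
standalone sketch:

	#include <stdio.h>
	#include <string.h>

	int main(void)
	{
		char dev_name[] = "/dev/sdb:/dev/sdc:/dev/sdd";
		char *devs[8], *s;
		size_t nr_devs = 0, i = 0;

		/* One device per colon-separated component: */
		for (s = dev_name; s; s = strchr(s + 1, ':'))
			nr_devs++;			/* 3 */

		/* Record each component, then overwrite the ':' that follows
		 * it with a NUL to terminate it in place: */
		for (s = dev_name; s; (s = strchr(s, ':')) && (*s++ = '\0'))
			devs[i++] = s;

		for (i = 0; i < nr_devs; i++)
			printf("%s\n", devs[i]);
		return 0;
	}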
+
+enum {
+ Opt_err_cont, Opt_err_panic, Opt_err_ro,
+ Opt_user_xattr, Opt_nouser_xattr,
+ Opt_acl, Opt_noacl,
+ Opt_err
+};
+
+static const match_table_t tokens = {
+ {Opt_err_cont, "errors=continue"},
+ {Opt_err_panic, "errors=panic"},
+ {Opt_err_ro, "errors=remount-ro"},
+ {Opt_user_xattr, "user_xattr"},
+ {Opt_nouser_xattr, "nouser_xattr"},
+ {Opt_acl, "acl"},
+ {Opt_noacl, "noacl"},
+ {Opt_err, NULL}
+};
+
+static int parse_options(struct cache_set *c, struct super_block *sb,
+ char *options)
+{
+ char *p;
+ substring_t args[MAX_OPT_ARGS];
+
+ if (!options)
+ return 1;
+
+ while ((p = strsep(&options, ",")) != NULL) {
+ int token;
+
+ if (!*p)
+ continue;
+
+ token = match_token(p, tokens, args);
+ switch (token) {
+ case Opt_err_panic:
+ /*
+ * XXX: this will get written to the superblock, don't
+ * want this option to be persistent
+ */
+ SET_CACHE_ERROR_ACTION(&c->sb, BCH_ON_ERROR_PANIC);
+ break;
+ case Opt_err_ro:
+ SET_CACHE_ERROR_ACTION(&c->sb, BCH_ON_ERROR_RO);
+ break;
+ case Opt_err_cont:
+ SET_CACHE_ERROR_ACTION(&c->sb, BCH_ON_ERROR_CONTINUE);
+ break;
+ case Opt_user_xattr:
+ case Opt_nouser_xattr:
+ break;
+ case Opt_acl:
+ sb->s_flags |= MS_POSIXACL;
+ break;
+ case Opt_noacl:
+ sb->s_flags &= ~MS_POSIXACL;
+ break;
+ default:
+ return 0;
+ }
+ }
+ return 1;
+}
+
+static struct dentry *bch_mount(struct file_system_type *fs_type,
+ int flags, const char *dev_name, void *data)
+{
+ struct cache_set *c;
+ struct super_block *sb;
+ struct inode *inode;
+ int ret;
+
+ c = bch_open_as_blockdevs(dev_name);
+ if (!c)
+ return ERR_PTR(-ENOENT);
+
+ sb = sget(fs_type, NULL, set_anon_super, flags, NULL);
+ if (IS_ERR(sb)) {
+ ret = PTR_ERR(sb);
+ goto err;
+ }
+
+ /* XXX: */
+ sb->s_blocksize = PAGE_SIZE;
+ sb->s_blocksize_bits = PAGE_SHIFT;
+ sb->s_maxbytes = MAX_LFS_FILESIZE;
+ sb->s_op = &bch_super_operations;
+ sb->s_xattr = bch_xattr_handlers;
+ sb->s_magic = BCACHE_STATFS_MAGIC;
+ sb->s_time_gran = 1;
+ sb->s_fs_info = c;
+
+ sb->s_flags |= MS_POSIXACL;
+
+ /* XXX */
+ sb->s_bdev = c->cache[0]->disk_sb.bdev;
+ sb->s_bdi = &c->bdi;
+
+ if (!parse_options(c, sb, (char *) data)) {
+ ret = -EINVAL;
+ goto err_put_super;
+ }
+
+ inode = bch_vfs_inode_get(sb, BCACHE_ROOT_INO);
+ if (IS_ERR(inode)) {
+ ret = PTR_ERR(inode);
+ goto err_put_super;
+ }
+
+ sb->s_root = d_make_root(inode);
+ if (!sb->s_root) {
+ ret = -ENOMEM;
+ goto err_put_super;
+ }
+
+ sb->s_flags |= MS_ACTIVE;
+ return dget(sb->s_root);
+
+err_put_super:
+ deactivate_locked_super(sb);
+err:
+ closure_put(&c->cl);
+ return ERR_PTR(ret);
+}
+
+static void bch_kill_sb(struct super_block *sb)
+{
+ struct cache_set *c = sb->s_fs_info;
+
+ generic_shutdown_super(sb);
+
+ if (test_bit(CACHE_SET_BDEV_MOUNTED, &c->flags)) {
+ DECLARE_COMPLETION_ONSTACK(complete);
+
+ c->stop_completion = &complete;
+ bch_cache_set_stop(c);
+ closure_put(&c->cl);
+
+ /* Killable? */
+ wait_for_completion(&complete);
+ } else
+ closure_put(&c->cl);
+}
+
+static struct file_system_type bcache_fs_type = {
+ .owner = THIS_MODULE,
+ .name = "bcache",
+ .mount = bch_mount,
+ .kill_sb = bch_kill_sb,
+};
+
+MODULE_ALIAS_FS("bcache");
+
+void bch_fs_exit(void)
+{
+ unregister_filesystem(&bcache_fs_type);
+ if (bch_dio_read_bioset)
+ bioset_free(bch_dio_read_bioset);
+ if (bch_fs_bioset)
+ bioset_free(bch_fs_bioset);
+ if (bch_inode_cache)
+ kmem_cache_destroy(bch_inode_cache);
+}
+
+int __init bch_fs_init(void)
+{
+ int ret = -ENOMEM;
+
+ bch_inode_cache = KMEM_CACHE(bch_inode_info, 0);
+ if (!bch_inode_cache)
+ goto err;
+
+ bch_fs_bioset = bioset_create(4,
+ offsetof(struct bch_writepage_io, bio.bio));
+ if (!bch_fs_bioset)
+ goto err;
+
+ bch_dio_read_bioset = bioset_create(4, offsetof(struct dio_read, bio));
+ if (!bch_dio_read_bioset)
+ goto err;
+
+ ret = register_filesystem(&bcache_fs_type);
+ if (ret)
+ goto err;
+
+ return 0;
+err:
+ bch_fs_exit();
+ return ret;
+}
diff --git a/drivers/md/bcache/fs.h b/drivers/md/bcache/fs.h
new file mode 100644
index 000000000000..9e78cf8189bc
--- /dev/null
+++ b/drivers/md/bcache/fs.h
@@ -0,0 +1,20 @@
+#ifndef _BCACHE_FS_H
+#define _BCACHE_FS_H
+
+struct bch_inode_info {
+ struct bkey_i_inode inode;
+ struct inode vfs_inode;
+ struct mutex update_lock;
+ u64 journal_seq;
+ atomic_long_t append_count;
+};
+
+#define to_bch_ei(_inode) \
+ container_of(_inode, struct bch_inode_info, vfs_inode)
+
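+/*
+ * Extract the file type nibble from i_mode; the values match the DT_*
+ * constants used by readdir:
+ */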
+static inline u8 mode_to_type(umode_t mode)
+{
+ return (mode >> 12) & 15;
+}
+
+#endif /* _BCACHE_FS_H */
diff --git a/drivers/md/bcache/inode.c b/drivers/md/bcache/inode.c
index 5e458258eaa8..ba6863ec5d01 100644
--- a/drivers/md/bcache/inode.c
+++ b/drivers/md/bcache/inode.c
@@ -162,14 +162,33 @@ int bch_inode_truncate(struct cache_set *c, u64 inode_nr, u64 new_size)
int bch_inode_rm(struct cache_set *c, u64 inode_nr)
{
+ struct btree_iter iter;
+ struct bkey_s_c k;
struct bkey_i delete;
int ret;
- ret = bch_discard(c, POS(inode_nr, 0),
- POS(inode_nr + 1, 0), 0);
+ ret = bch_inode_truncate(c, inode_nr, 0);
if (ret < 0)
return ret;
+ for_each_btree_key_intent(&iter, c, BTREE_ID_XATTRS,
+ POS(inode_nr, 0), k) {
+ if (k.k->p.inode > inode_nr)
+ break;
+
+ bkey_init(&delete.k);
+ delete.k.p = k.k->p;
+
+ ret = bch_btree_insert_at(&iter, &keylist_single(&delete),
+ NULL, NULL, 0);
+ if (ret) {
+ bch_btree_iter_unlock(&iter);
+ return ret;
+ }
+ }
+ bch_btree_iter_unlock(&iter);
+
bkey_init(&delete.k);
delete.k.p.inode = inode_nr;
@@ -179,6 +198,33 @@ int bch_inode_rm(struct cache_set *c, u64 inode_nr)
BTREE_INSERT_NOFAIL);
}
+int bch_inode_find_by_inum(struct cache_set *c, u64 inode_nr,
+ struct bkey_i_inode *inode)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ int ret = -ENOENT;
+
+ for_each_btree_key_with_holes(&iter, c, BTREE_ID_INODES,
+ POS(inode_nr, 0), k) {
+ switch (k.k->type) {
+ case BCH_INODE_FS:
+ ret = 0;
+ bkey_reassemble(&inode->k_i, k);
+ break;
+ default:
+ /* hole, not found */
+ break;
+ }
+
+ break;
+ }
+ bch_btree_iter_unlock(&iter);
+
+ return ret;
+}
+
int bch_blockdev_inode_find_by_uuid(struct cache_set *c, uuid_le *uuid,
struct bkey_i_inode_blockdev *ret)
{
diff --git a/drivers/md/bcache/inode.h b/drivers/md/bcache/inode.h
index 6561e1e71ee6..dc1c26f8240f 100644
--- a/drivers/md/bcache/inode.h
+++ b/drivers/md/bcache/inode.h
@@ -17,6 +17,7 @@ static inline int bch_inode_update(struct cache_set *c, struct bkey_i *inode,
cl, journal_seq);
}
+int bch_inode_find_by_inum(struct cache_set *, u64, struct bkey_i_inode *);
int bch_blockdev_inode_find_by_uuid(struct cache_set *, uuid_le *,
struct bkey_i_inode_blockdev *);
diff --git a/drivers/md/bcache/siphash.c b/drivers/md/bcache/siphash.c
new file mode 100644
index 000000000000..0c6f7f3ec819
--- /dev/null
+++ b/drivers/md/bcache/siphash.c
@@ -0,0 +1,185 @@
+/* $OpenBSD: siphash.c,v 1.3 2015/02/20 11:51:03 tedu Exp $ */
+
+/*-
+ * Copyright (c) 2013 Andre Oppermann <andre@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. The name of the author may not be used to endorse or promote
+ * products derived from this software without specific prior written
+ * permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * SipHash is a family of PRFs SipHash-c-d where the integer parameters c and d
+ * are the number of compression rounds and the number of finalization rounds.
+ * A compression round is identical to a finalization round and this round
+ * function is called SipRound. Given a 128-bit key k and a (possibly empty)
+ * byte string m, SipHash-c-d returns a 64-bit value SipHash-c-d(k; m).
+ *
+ * Implemented from the paper "SipHash: a fast short-input PRF", 2012.09.18,
+ * by Jean-Philippe Aumasson and Daniel J. Bernstein,
+ * Permanent Document ID b9a943a805fbfc6fde808af9fc0ecdfa
+ * https://131002.net/siphash/siphash.pdf
+ * https://131002.net/siphash/
+ */
+
+//#include <sys/param.h>
+//#include <sys/systm.h>
+
+#include <asm/byteorder.h>
+#include <asm/string.h>
+
+#include "siphash.h"
+
+static void SipHash_CRounds(SIPHASH_CTX *, int);
+static void SipHash_Rounds(SIPHASH_CTX *, int);
+
+void
+SipHash_Init(SIPHASH_CTX *ctx, const SIPHASH_KEY *key)
+{
+ u64 k0, k1;
+
+ k0 = le64_to_cpu(key->k0);
+ k1 = le64_to_cpu(key->k1);
+
+ ctx->v[0] = 0x736f6d6570736575ULL ^ k0;
+ ctx->v[1] = 0x646f72616e646f6dULL ^ k1;
+ ctx->v[2] = 0x6c7967656e657261ULL ^ k0;
+ ctx->v[3] = 0x7465646279746573ULL ^ k1;
+
+ memset(ctx->buf, 0, sizeof(ctx->buf));
+ ctx->bytes = 0;
+}
+
+void
+SipHash_Update(SIPHASH_CTX *ctx, int rc, int rf, const void *src, size_t len)
+{
+ const u8 *ptr = src;
+ size_t left, used;
+
+ if (len == 0)
+ return;
+
+ used = ctx->bytes % sizeof(ctx->buf);
+ ctx->bytes += len;
+
+ if (used > 0) {
+ left = sizeof(ctx->buf) - used;
+
+ if (len >= left) {
+ memcpy(&ctx->buf[used], ptr, left);
+ SipHash_CRounds(ctx, rc);
+ len -= left;
+ ptr += left;
+ } else {
+ memcpy(&ctx->buf[used], ptr, len);
+ return;
+ }
+ }
+
+ while (len >= sizeof(ctx->buf)) {
+ memcpy(ctx->buf, ptr, sizeof(ctx->buf));
+ SipHash_CRounds(ctx, rc);
+ len -= sizeof(ctx->buf);
+ ptr += sizeof(ctx->buf);
+ }
+
+ if (len > 0)
+ memcpy(&ctx->buf[used], ptr, len);
+}
+
+void
+SipHash_Final(void *dst, SIPHASH_CTX *ctx, int rc, int rf)
+{
+ u64 r;
+
+ r = SipHash_End(ctx, rc, rf);
+
+ *((__le64 *) dst) = cpu_to_le64(r);
+}
+
+u64
+SipHash_End(SIPHASH_CTX *ctx, int rc, int rf)
+{
+ u64 r;
+ size_t left, used;
+
+ used = ctx->bytes % sizeof(ctx->buf);
+ left = sizeof(ctx->buf) - used;
+ memset(&ctx->buf[used], 0, left - 1);
+ ctx->buf[7] = ctx->bytes;
+
+ SipHash_CRounds(ctx, rc);
+ ctx->v[2] ^= 0xff;
+ SipHash_Rounds(ctx, rf);
+
+ r = (ctx->v[0] ^ ctx->v[1]) ^ (ctx->v[2] ^ ctx->v[3]);
+ memset(ctx, 0, sizeof(*ctx));
+ return (r);
+}
+
+u64
+SipHash(const SIPHASH_KEY *key, int rc, int rf, const void *src, size_t len)
+{
+ SIPHASH_CTX ctx;
+
+ SipHash_Init(&ctx, key);
+ SipHash_Update(&ctx, rc, rf, src, len);
+ return (SipHash_End(&ctx, rc, rf));
+}
+
+#define SIP_ROTL(x, b) (((x) << (b)) | ((x) >> (64 - (b))))
+
+static void
+SipHash_Rounds(SIPHASH_CTX *ctx, int rounds)
+{
+ while (rounds--) {
+ ctx->v[0] += ctx->v[1];
+ ctx->v[2] += ctx->v[3];
+ ctx->v[1] = SIP_ROTL(ctx->v[1], 13);
+ ctx->v[3] = SIP_ROTL(ctx->v[3], 16);
+
+ ctx->v[1] ^= ctx->v[0];
+ ctx->v[3] ^= ctx->v[2];
+ ctx->v[0] = SIP_ROTL(ctx->v[0], 32);
+
+ ctx->v[2] += ctx->v[1];
+ ctx->v[0] += ctx->v[3];
+ ctx->v[1] = SIP_ROTL(ctx->v[1], 17);
+ ctx->v[3] = SIP_ROTL(ctx->v[3], 21);
+
+ ctx->v[1] ^= ctx->v[2];
+ ctx->v[3] ^= ctx->v[0];
+ ctx->v[2] = SIP_ROTL(ctx->v[2], 32);
+ }
+}
+
+static void
+SipHash_CRounds(SIPHASH_CTX *ctx, int rounds)
+{
+ u64 m = le64_to_cpu(*((__le64 *)ctx->buf));
+
+ ctx->v[3] ^= m;
+ SipHash_Rounds(ctx, rounds);
+ ctx->v[0] ^= m;
+}
diff --git a/drivers/md/bcache/siphash.h b/drivers/md/bcache/siphash.h
new file mode 100644
index 000000000000..7a4b2241f1e1
--- /dev/null
+++ b/drivers/md/bcache/siphash.h
@@ -0,0 +1,86 @@
+/* $OpenBSD: siphash.h,v 1.5 2015/02/20 11:51:03 tedu Exp $ */
+/*-
+ * Copyright (c) 2013 Andre Oppermann <andre@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. The name of the author may not be used to endorse or promote
+ * products derived from this software without specific prior written
+ * permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+/*
+ * SipHash is a family of pseudorandom functions (a.k.a. keyed hash functions)
+ * optimized for speed on short messages, returning a 64-bit hash/digest value.
+ *
+ * The number of rounds is chosen by the set of wrapper macros used:
+ * SipHash24_*() for the fast and reasonably strong version
+ * SipHash48_*() for the stronger version (half as fast)
+ *
+ * The 16 byte key is passed at initialization time:
+ *
+ * SIPHASH_CTX ctx;
+ * SipHash24_Init(&ctx, &key);
+ * SipHash24_Update(&ctx, pointer_to_string, length_of_string);
+ * SipHash24_Final(output, &ctx);
+ */
+
+#ifndef _SIPHASH_H_
+#define _SIPHASH_H_
+
+#include <linux/types.h>
+
+#define SIPHASH_BLOCK_LENGTH 8
+#define SIPHASH_KEY_LENGTH 16
+#define SIPHASH_DIGEST_LENGTH 8
+
+typedef struct _SIPHASH_CTX {
+ u64 v[4];
+ u8 buf[SIPHASH_BLOCK_LENGTH];
+ u32 bytes;
+} SIPHASH_CTX;
+
+typedef struct {
+ __le64 k0;
+ __le64 k1;
+} SIPHASH_KEY;
+
+void SipHash_Init(SIPHASH_CTX *, const SIPHASH_KEY *);
+void SipHash_Update(SIPHASH_CTX *, int, int, const void *, size_t);
+u64 SipHash_End(SIPHASH_CTX *, int, int);
+void SipHash_Final(void *, SIPHASH_CTX *, int, int);
+u64 SipHash(const SIPHASH_KEY *, int, int, const void *, size_t);
+
+#define SipHash24_Init(_c, _k) SipHash_Init((_c), (_k))
+#define SipHash24_Update(_c, _p, _l) SipHash_Update((_c), 2, 4, (_p), (_l))
+#define SipHash24_End(_d) SipHash_End((_d), 2, 4)
+#define SipHash24_Final(_d, _c) SipHash_Final((_d), (_c), 2, 4)
+#define SipHash24(_k, _p, _l) SipHash((_k), 2, 4, (_p), (_l))
+
+#define SipHash48_Init(_c, _k) SipHash_Init((_c), (_k))
+#define SipHash48_Update(_c, _p, _l) SipHash_Update((_c), 4, 8, (_p), (_l))
+#define SipHash48_End(_d) SipHash_End((_d), 4, 8)
+#define SipHash48_Final(_d, _c) SipHash_Final((_d), (_c), 4, 8)
+#define SipHash48(_k, _p, _l) SipHash((_k), 4, 8, (_p), (_l))
+
+#endif /* _SIPHASH_H_ */
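A minimal usage sketch of the interface above (kernel context assumed; the
all-zero key is purely illustrative - a real user would pick a secret key):

	#include "siphash.h"

	static u64 example_hash(const char *name, size_t len)
	{
		static const SIPHASH_KEY key;	/* zeroed, for illustration */
		SIPHASH_CTX ctx;

		SipHash24_Init(&ctx, &key);
		SipHash24_Update(&ctx, name, len);
		return SipHash24_End(&ctx);
	}

The one-shot SipHash24(&key, name, len) collapses the three calls into one;
the (currently disabled) siphash path in xattr.c uses both forms.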
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index e82dcc5ae80c..25e570253b1b 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -12,6 +12,7 @@
#include "btree.h"
#include "clock.h"
#include "debug.h"
+#include "fs-gc.h"
#include "gc.h"
#include "inode.h"
#include "io.h"
@@ -26,6 +27,7 @@
#include "tier.h"
#include "writeback.h"
+#include <linux/backing-dev.h>
#include <linux/blkdev.h>
#include <linux/crc32c.h>
#include <linux/debugfs.h>
@@ -139,6 +141,41 @@ static const char *bch_blkdev_open(const char *path, void *holder,
return NULL;
}
+static int bch_congested_fn(void *data, int bdi_bits)
+{
+ struct backing_dev_info *bdi;
+ struct cache_set *c = data;
+ struct cache *ca;
+ unsigned i;
+ int ret = 0;
+
+ rcu_read_lock();
+ if (bdi_bits & (1 << WB_sync_congested)) {
+ /* Reads - check all devices: */
+ for_each_cache_rcu(ca, c, i) {
+ bdi = blk_get_backing_dev_info(ca->disk_sb.bdev);
+
+ if (bdi_congested(bdi, bdi_bits)) {
+ ret = 1;
+ break;
+ }
+ }
+ } else {
+ /* Writes only go to tier 0: */
+ group_for_each_cache_rcu(ca, &c->cache_tiers[0], i) {
+ bdi = blk_get_backing_dev_info(ca->disk_sb.bdev);
+
+ if (bdi_congested(bdi, bdi_bits)) {
+ ret = 1;
+ break;
+ }
+ }
+ }
+ rcu_read_unlock();
+
+ return ret;
+}
+
/* Superblock */
const char *validate_cache_member(struct cache_sb *sb,
@@ -601,8 +638,19 @@ static void bch_recalc_capacity(struct cache_set *c)
struct cache_group *tier = c->cache_tiers + ARRAY_SIZE(c->cache_tiers);
struct cache *ca;
u64 capacity = 0;
+ unsigned long ra_pages = 0;
unsigned i, j;
+ rcu_read_lock();
+ for_each_cache_rcu(ca, c, i) {
+ struct backing_dev_info *bdi =
+ blk_get_backing_dev_info(ca->disk_sb.bdev);
+
+ ra_pages += bdi->ra_pages;
+ }
+
+ c->bdi.ra_pages = ra_pages;
+
/*
* Capacity of the cache set is the capacity of all the devices in the
* slowest (highest) tier - we don't include lower tier devices.
@@ -752,6 +800,9 @@ void bch_cache_set_fail(struct cache_set *c)
void bch_cache_set_release(struct kobject *kobj)
{
struct cache_set *c = container_of(kobj, struct cache_set, kobj);
+
+ if (c->stop_completion)
+ complete(c->stop_completion);
kfree(c);
module_put(THIS_MODULE);
}
@@ -777,6 +828,7 @@ static void cache_set_free(struct closure *cl)
percpu_ref_exit(&c->writes);
bch_io_clock_exit(&c->io_clock[WRITE]);
bch_io_clock_exit(&c->io_clock[READ]);
+ bdi_destroy(&c->bdi);
bioset_exit(&c->btree_bio);
bioset_exit(&c->bio_split);
mempool_exit(&c->btree_reserve_pool);
@@ -998,6 +1050,7 @@ static struct cache_set *bch_cache_set_alloc(struct cache_sb *sb)
mempool_init_kmalloc_pool(&c->fill_iter, 1, iter_size) ||
bioset_init(&c->bio_split, 4, offsetof(struct bbio, bio)) ||
bioset_init(&c->btree_bio, 1, offsetof(struct bbio, bio)) ||
+ bdi_setup_and_register(&c->bdi, "bcache") ||
bch_io_clock_init(&c->io_clock[READ]) ||
bch_io_clock_init(&c->io_clock[WRITE]) ||
bch_journal_alloc(&c->journal) ||
@@ -1005,6 +1058,10 @@ static struct cache_set *bch_cache_set_alloc(struct cache_sb *sb)
bch_bset_sort_state_init(&c->sort, ilog2(btree_pages(c))))
goto err;
+ c->bdi.ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_SIZE;
+ c->bdi.congested_fn = bch_congested_fn;
+ c->bdi.congested_data = c;
+
return c;
err:
bch_cache_set_stop(c);
@@ -1144,7 +1201,15 @@ static const char *run_cache_set(struct cache_set *c)
}
bch_journal_replay(c, &journal);
+
+ err = "error gcing inode nlinks";
+ if (bch_gc_inode_nlinks(c))
+ goto err;
+
+ bch_verify_inode_refs(c);
} else {
+ struct bkey_i_inode inode;
+
pr_notice("invalidating existing data");
err = "unable to allocate journal buckets";
@@ -1185,6 +1250,17 @@ static const char *run_cache_set(struct cache_set *c)
/* XXX: necessary? */
bch_journal_meta(&c->journal, &cl);
closure_sync(&cl);
+
+ bkey_inode_init(&inode.k_i);
+ inode.k.p.inode = BCACHE_ROOT_INO;
+ inode.v.i_mode = S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO;
+ inode.v.i_nlink = 2;
+
+ err = "error creating root directory";
+ if (bch_btree_insert(c, BTREE_ID_INODES,
+ &keylist_single(&inode.k_i),
+ NULL, &cl, NULL, 0))
+ goto err;
}
bch_prio_timer_start(c, READ);
@@ -2342,6 +2418,7 @@ kobj_attribute_write(reboot, reboot_test);
static void bcache_exit(void)
{
bch_debug_exit();
+ bch_fs_exit();
bch_blockdev_exit();
if (bcache_kset)
kset_unregister(bcache_kset);
@@ -2368,6 +2445,7 @@ static int __init bcache_init(void)
!(bcache_kset = kset_create_and_add("bcache", NULL, fs_kobj)) ||
sysfs_create_files(&bcache_kset->kobj, files) ||
bch_blockdev_init() ||
+ bch_fs_init() ||
bch_debug_init())
goto err;
diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h
index 5311afcd3a1c..0704697e762e 100644
--- a/drivers/md/bcache/util.h
+++ b/drivers/md/bcache/util.h
@@ -24,6 +24,8 @@ struct closure;
#define atomic_inc_bug(v, i) BUG_ON(atomic_inc_return(v) <= i)
#define atomic_sub_bug(i, v) BUG_ON(atomic_sub_return(i, v) < 0)
#define atomic_add_bug(i, v) BUG_ON(atomic_add_return(i, v) < 0)
+#define atomic_long_dec_bug(v) BUG_ON(atomic_long_dec_return(v) < 0)
+#define atomic_long_sub_bug(i, v) BUG_ON(atomic_long_sub_return(i, v) < 0)
#define atomic64_dec_bug(v) BUG_ON(atomic64_dec_return(v) < 0)
#define atomic64_inc_bug(v, i) BUG_ON(atomic64_inc_return(v) <= i)
#define atomic64_sub_bug(i, v) BUG_ON(atomic64_sub_return(i, v) < 0)
@@ -36,6 +38,8 @@ struct closure;
#define atomic_inc_bug(v, i) atomic_inc(v)
#define atomic_sub_bug(i, v) atomic_sub(i, v)
#define atomic_add_bug(i, v) atomic_add(i, v)
+#define atomic_long_dec_bug(v) atomic_long_dec(v)
+#define atomic_long_sub_bug(i, v) atomic_long_sub(i, v)
#define atomic64_dec_bug(v) atomic64_dec(v)
#define atomic64_inc_bug(v, i) atomic64_inc(v)
#define atomic64_sub_bug(i, v) atomic64_sub(i, v)
diff --git a/drivers/md/bcache/xattr.c b/drivers/md/bcache/xattr.c
new file mode 100644
index 000000000000..404796ff8163
--- /dev/null
+++ b/drivers/md/bcache/xattr.c
@@ -0,0 +1,414 @@
+#include "bcache.h"
+#include "btree.h"
+#include "extents.h"
+#include "fs.h"
+#include "keylist.h"
+#include "siphash.h"
+#include "xattr.h"
+
+#include <linux/crc32c.h>
+#include <linux/cryptohash.h>
+#include <linux/posix_acl_xattr.h>
+#include <linux/xattr.h>
+
+#if 0
+/*
+ * XXX: should really include x_type here
+ */
+static u64 bch_xattr_hash(const struct qstr *name)
+{
+ union {
+ u32 b[SHA_DIGEST_WORDS];
+ u64 ret;
+ } digest;
+
+ unsigned done = 0;
+
+ sha_init(digest.b);
+
+ while (done < name->len) {
+ u32 workspace[SHA_WORKSPACE_WORDS];
+ u8 message[SHA_MESSAGE_BYTES];
+ unsigned bytes = min_t(unsigned, name->len - done,
+ SHA_MESSAGE_BYTES);
+
+ memcpy(message, name->name + done, bytes);
+ memset(message + bytes, 0, SHA_MESSAGE_BYTES - bytes);
+ sha_transform(digest.b, message, workspace);
+ done += bytes;
+ }
+
+ return digest.ret;
+}
+
+static const SIPHASH_KEY bch_siphash_key;
+
+static u64 bch_xattr_hash(const struct qstr *name, u8 type)
+{
+#if 0
+ SIPHASH_CTX ctx;
+
+ SipHash24_Init(&ctx, &bch_siphash_key);
+ SipHash24_Update(&ctx, &type, sizeof(type));
+ SipHash24_Update(&ctx, name->name, name->len);
+
+ return SipHash24_End(&ctx) >> 1;
+#else
+ return SipHash24(&bch_siphash_key, name->name, name->len) >> 1;
+#endif
+}
+#endif
+
+static u64 bch_xattr_hash(const struct qstr *name, u8 type)
+{
+ return crc32c(0, name->name, name->len);
+}
+
+#define xattr_val(_xattr) ((_xattr)->x_name + (_xattr)->x_name_len)
+
+static int xattr_cmp(const struct bch_xattr *xattr,
+ u8 type, const struct qstr *q)
+{
+ return xattr->x_type != type ||
+ xattr->x_name_len != q->len ||
+ memcmp(xattr->x_name, q->name, q->len);
+}
+
+static bool bch_xattr_invalid(const struct cache_set *c, struct bkey_s_c k)
+{
+ switch (k.k->type) {
+ case BCH_XATTR:
+ if (bkey_val_bytes(k.k) < sizeof(struct bch_xattr))
+ return true;
+
+ return false;
+ case BCH_XATTR_WHITEOUT:
+ if (bkey_val_bytes(k.k))
+ return true;
+
+ return false;
+ default:
+ return true;
+ }
+}
+
+static void bch_xattr_to_text(struct cache_set *c, char *buf,
+ size_t size, struct bkey_s_c k)
+{
+ struct bkey_s_c_xattr xattr;
+ int n;
+
+ switch (k.k->type) {
+ case BCH_XATTR:
+ xattr = bkey_s_c_to_xattr(k);
+
+ if (size) {
+ n = min_t(unsigned, size, xattr.v->x_name_len);
+ memcpy(buf, xattr.v->x_name, n);
+ buf[size - 1] = '\0';
+ buf += n;
+ size -= n;
+ }
+
+ n = scnprintf(buf, size, " -> ");
+ buf += n;
+ size -= n;
+
+ if (size) {
+ n = min_t(unsigned, size, xattr.v->x_val_len);
+ memcpy(buf, xattr_val(xattr.v), n);
+ buf[size - 1] = '\0';
+ buf += n;
+ size -= n;
+ }
+
+ break;
+ case BCH_XATTR_WHITEOUT:
+ scnprintf(buf, size, "whiteout");
+ break;
+ }
+}
+
+const struct btree_keys_ops bch_xattr_ops = {
+};
+
+const struct bkey_ops bch_bkey_xattr_ops = {
+ .key_invalid = bch_xattr_invalid,
+ .val_to_text = bch_xattr_to_text,
+};
+
+int bch_xattr_get(struct cache_set *c, u64 inum, const char *name,
+ void *buffer, size_t size, int type)
+{
+ struct qstr qname = (struct qstr) QSTR_INIT(name, strlen(name));
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ const struct bch_xattr *xattr;
+ int ret = -ENODATA;
+
+ for_each_btree_key_with_holes(&iter, c, BTREE_ID_XATTRS,
+ POS(inum, bch_xattr_hash(&qname, type)), k) {
+ switch (k.k->type) {
+ case BCH_XATTR:
+ xattr = bkey_s_c_to_xattr(k).v;
+
+ /* collision? */
+ if (!xattr_cmp(xattr, type, &qname)) {
+ ret = xattr->x_val_len;
+ if (buffer) {
+ if (xattr->x_val_len > size)
+ ret = -ERANGE;
+ else
+ memcpy(buffer, xattr_val(xattr),
+ xattr->x_val_len);
+ }
+ goto out;
+ }
+ break;
+ case BCH_XATTR_WHITEOUT:
+ break;
+ default:
+ /* hole, not found */
+ goto out;
+ }
+ }
+out:
+ bch_btree_iter_unlock(&iter);
+ return ret;
+}
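The loop above is ordinary linear probing mapped onto the btree keyspace:
start at POS(inum, hash) and walk forward, skipping collisions and whiteouts,
until the key or a hole is found. The same idea over a toy in-memory table
(names, sizes and types here are all illustrative):

	#include <string.h>

	#define TABLE_SIZE 8

	struct slot {
		enum { EMPTY, USED, WHITEOUT } state;
		char name[16];
		int val;
	};

	static int toy_lookup(const struct slot *t, unsigned hash,
			      const char *name, int *val)
	{
		unsigned i, n;

		for (n = 0; n < TABLE_SIZE; n++) {
			i = (hash + n) % TABLE_SIZE;

			switch (t[i].state) {
			case USED:
				if (!strcmp(t[i].name, name)) {
					*val = t[i].val;
					return 0;	/* found */
				}
				break;		/* hash collision: keep probing */
			case WHITEOUT:
				break;		/* deleted entry: keep probing */
			case EMPTY:
				return -1;	/* hole: not found */
			}
		}
		return -1;
	}

Deletions must leave a WHITEOUT rather than an EMPTY slot, or a probe for a
key inserted past the deleted entry would stop early at the hole.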
+
+int bch_xattr_set(struct inode *inode, const char *name,
+ const void *value, size_t size,
+ int flags, int type)
+{
+ struct bch_inode_info *ei = to_bch_ei(inode);
+ struct cache_set *c = inode->i_sb->s_fs_info;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ struct keylist keys;
+ struct qstr qname = (struct qstr) QSTR_INIT((char *) name,
+ strlen(name));
+ int ret = -EINVAL;
+ unsigned insert_flags = BTREE_INSERT_ATOMIC;
+
+ if (!value)
+ insert_flags |= BTREE_INSERT_NOFAIL;
+
+ bch_btree_iter_init_intent(&iter, c, BTREE_ID_XATTRS,
+ POS(inode->i_ino,
+ bch_xattr_hash(&qname, type)));
+
+ while ((k = bch_btree_iter_peek_with_holes(&iter)).k) {
+ switch (k.k->type) {
+ case BCH_XATTR:
+ /* collision? */
+ if (xattr_cmp(bkey_s_c_to_xattr(k).v, type, &qname)) {
+ bch_btree_iter_advance_pos(&iter);
+ continue;
+ }
+
+ if (flags & XATTR_CREATE) {
+ ret = -EEXIST;
+ goto out;
+ }
+
+ break;
+ case BCH_XATTR_WHITEOUT:
+ bch_btree_iter_advance_pos(&iter);
+ continue;
+ default:
+ /* hole, not found */
+ if (flags & XATTR_REPLACE) {
+ ret = -ENODATA;
+ goto out;
+ }
+ break;
+ }
+
+ bch_keylist_init(&keys);
+
+ if (value) {
+ struct bkey_i_xattr *xattr;
+ unsigned u64s = BKEY_U64s +
+ DIV_ROUND_UP(sizeof(struct bch_xattr) +
+ qname.len + size,
+ sizeof(u64));
+
+ if (u64s > U8_MAX) {
+ ret = -ERANGE;
+ break;
+ }
+
+ if (bch_keylist_realloc(&keys, u64s)) {
+ ret = -ENOMEM;
+ break;
+ }
+
+ xattr = bkey_xattr_init(keys.top);
+ xattr->k.u64s = u64s;
+ xattr->k.p = k.k->p;
+ xattr->v.x_type = type;
+ xattr->v.x_name_len = qname.len;
+ xattr->v.x_val_len = size;
+ memcpy(xattr->v.x_name, qname.name, qname.len);
+ memcpy(xattr_val(&xattr->v), value, size);
+
+ BUG_ON(xattr_cmp(&xattr->v, type, &qname));
+ } else {
+ /* removing */
+ bkey_init(&keys.top->k);
+ keys.top->k.type = BCH_XATTR_WHITEOUT;
+ keys.top->k.p = k.k->p;
+ }
+
+ bch_keylist_enqueue(&keys);
+
+ ret = bch_btree_insert_at(&iter, &keys, NULL,
+ &ei->journal_seq,
+ insert_flags);
+ bch_keylist_free(&keys);
+
+ if (ret != -EINTR && ret != -EAGAIN)
+ break;
+ }
+out:
+ bch_btree_iter_unlock(&iter);
+ return ret;
+}
+
+static const struct xattr_handler *bch_xattr_type_to_handler(unsigned);
+
+static size_t bch_xattr_emit(struct dentry *dentry,
+ const struct bch_xattr *xattr,
+ char *buffer, size_t buffer_size)
+{
+ const struct xattr_handler *handler =
+ bch_xattr_type_to_handler(xattr->x_type);
+
+ if (handler && (!handler->list || handler->list(dentry))) {
+ const size_t prefix_len = strlen(handler->prefix);
+ const size_t total_len = prefix_len + xattr->x_name_len + 1;
+
+ if (buffer && total_len <= buffer_size) {
+ memcpy(buffer, handler->prefix, prefix_len);
+ memcpy(buffer + prefix_len,
+ xattr->x_name, xattr->x_name_len);
+ buffer[prefix_len + xattr->x_name_len] = '\0';
+ }
+
+ return total_len;
+ } else {
+ return 0;
+ }
+}
+
+ssize_t bch_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size)
+{
+ struct cache_set *c = dentry->d_sb->s_fs_info;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ const struct bch_xattr *xattr;
+ u64 inum = dentry->d_inode->i_ino;
+ ssize_t ret = 0;
+ size_t len;
+
+ for_each_btree_key(&iter, c, BTREE_ID_XATTRS, POS(inum, 0), k) {
+ BUG_ON(k.k->p.inode < inum);
+
+ if (k.k->p.inode > inum)
+ break;
+
+ if (k.k->type != BCH_XATTR)
+ continue;
+
+ xattr = bkey_s_c_to_xattr(k).v;
+
+ len = bch_xattr_emit(dentry, xattr, buffer, buffer_size);
+ if (buffer) {
+ if (len > buffer_size) {
+ bch_btree_iter_unlock(&iter);
+ return -ERANGE;
+ }
+
+ buffer += len;
+ buffer_size -= len;
+ }
+
+ ret += len;
+ }
+ bch_btree_iter_unlock(&iter);
+
+ return ret;
+}
+
+static int bch_xattr_get_handler(const struct xattr_handler *handler,
+ struct dentry *dentry, struct inode *inode,
+ const char *name, void *buffer, size_t size)
+{
+ return bch_xattr_get(inode->i_sb->s_fs_info, inode->i_ino,
+ name, buffer, size, handler->flags);
+}
+
+static int bch_xattr_set_handler(const struct xattr_handler *handler,
+ struct dentry *dentry, struct inode *inode,
+ const char *name, const void *value,
+ size_t size, int flags)
+{
+ return bch_xattr_set(inode, name, value, size, flags,
+ handler->flags);
+}
+
+static const struct xattr_handler bch_xattr_user_handler = {
+ .prefix = XATTR_USER_PREFIX,
+ .get = bch_xattr_get_handler,
+ .set = bch_xattr_set_handler,
+ .flags = BCH_XATTR_INDEX_USER,
+};
+
+static bool bch_xattr_trusted_list(struct dentry *dentry)
+{
+ return capable(CAP_SYS_ADMIN);
+}
+
+static const struct xattr_handler bch_xattr_trusted_handler = {
+ .prefix = XATTR_TRUSTED_PREFIX,
+ .list = bch_xattr_trusted_list,
+ .get = bch_xattr_get_handler,
+ .set = bch_xattr_set_handler,
+ .flags = BCH_XATTR_INDEX_TRUSTED,
+};
+
+static const struct xattr_handler bch_xattr_security_handler = {
+ .prefix = XATTR_SECURITY_PREFIX,
+ .get = bch_xattr_get_handler,
+ .set = bch_xattr_set_handler,
+ .flags = BCH_XATTR_INDEX_SECURITY,
+};
+
+static const struct xattr_handler *bch_xattr_handler_map[] = {
+ [BCH_XATTR_INDEX_USER] = &bch_xattr_user_handler,
+ [BCH_XATTR_INDEX_POSIX_ACL_ACCESS] =
+ &posix_acl_access_xattr_handler,
+ [BCH_XATTR_INDEX_POSIX_ACL_DEFAULT] =
+ &posix_acl_default_xattr_handler,
+ [BCH_XATTR_INDEX_TRUSTED] = &bch_xattr_trusted_handler,
+ [BCH_XATTR_INDEX_SECURITY] = &bch_xattr_security_handler,
+};
+
+const struct xattr_handler *bch_xattr_handlers[] = {
+ &bch_xattr_user_handler,
+ &posix_acl_access_xattr_handler,
+ &posix_acl_default_xattr_handler,
+ &bch_xattr_trusted_handler,
+ &bch_xattr_security_handler,
+ NULL
+};
+
+static const struct xattr_handler *bch_xattr_type_to_handler(unsigned type)
+{
+ return type < ARRAY_SIZE(bch_xattr_handler_map)
+ ? bch_xattr_handler_map[type]
+ : NULL;
+}
diff --git a/drivers/md/bcache/xattr.h b/drivers/md/bcache/xattr.h
new file mode 100644
index 000000000000..839d47ef6910
--- /dev/null
+++ b/drivers/md/bcache/xattr.h
@@ -0,0 +1,16 @@
+#ifndef _BCACHE_XATTR_H
+#define _BCACHE_XATTR_H
+
+extern const struct btree_keys_ops bch_xattr_ops;
+extern const struct bkey_ops bch_bkey_xattr_ops;
+
+struct dentry;
+struct xattr_handler;
+
+int bch_xattr_get(struct cache_set *, u64, const char *, void *, size_t, int);
+int bch_xattr_set(struct inode *, const char *, const void *, size_t, int, int);
+ssize_t bch_xattr_list(struct dentry *, char *, size_t);
+
+extern const struct xattr_handler *bch_xattr_handlers[];
+
+#endif /* _BCACHE_XATTR_H */
diff --git a/include/uapi/linux/bcache.h b/include/uapi/linux/bcache.h
index 27a2926aec21..34523f6f129c 100644
--- a/include/uapi/linux/bcache.h
+++ b/include/uapi/linux/bcache.h
@@ -288,6 +288,8 @@ BITMASK(EXTENT_CACHED, struct bch_extent, data[0], 63, 64)
#define BLOCKDEV_INODE_MAX 4096
+#define BCACHE_ROOT_INO 4096
+
enum bch_inode_types {
BCH_INODE_FS = 128,
BCH_INODE_BLOCKDEV = 129,
@@ -336,6 +338,62 @@ BKEY_VAL_TYPE(inode_blockdev, BCH_INODE_BLOCKDEV);
BITMASK(INODE_FLASH_ONLY, struct bch_inode_blockdev,
i_inode.i_flags, 0, 1);
+/* Dirents */
+
+/*
+ * Dirents (and xattrs) have to implement string lookups; since our b-tree
+ * doesn't support arbitrary length strings for the key, we instead index by a
+ * 64 bit hash (currently truncated sha1) of the string, stored in the offset
+ * field of the key - using linear probing to resolve hash collisions. This also
+ * provides us with the readdir cookie posix requires.
+ *
+ * Linear probing requires us to use whiteouts for deletions, in the event of a
+ * collision:
+ */
+
+enum {
+ BCH_DIRENT = 128,
+ BCH_DIRENT_WHITEOUT = 129,
+};
+
+struct bch_dirent {
+ struct bch_val v;
+
+ /* Target inode number: */
+ __u64 d_inum;
+
+ /*
+ * Copy of mode bits 12-15 from the target inode - so userspace can get
+ * the filetype without having to do a stat()
+ */
+ __u8 d_type;
+
+ __u8 d_name[];
+} __attribute__((packed));
+BKEY_VAL_TYPE(dirent, BCH_DIRENT);
+
+/* Xattrs */
+
+enum {
+ BCH_XATTR = 128,
+ BCH_XATTR_WHITEOUT = 129,
+};
+
+#define BCH_XATTR_INDEX_USER 0
+#define BCH_XATTR_INDEX_POSIX_ACL_ACCESS 1
+#define BCH_XATTR_INDEX_POSIX_ACL_DEFAULT 2
+#define BCH_XATTR_INDEX_TRUSTED 3
+#define BCH_XATTR_INDEX_SECURITY 4
+
+struct bch_xattr {
+ struct bch_val v;
+ __u8 x_type;
+ __u8 x_name_len;
+ __u16 x_val_len;
+ __u8 x_name[];
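+	/* x_val_len bytes of value follow the name - see xattr_val() */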
+} __attribute__((packed));
+BKEY_VAL_TYPE(xattr, BCH_XATTR);
+
/* Superblock */
/* Version 0: Cache device
@@ -488,6 +546,14 @@ BITMASK(CACHE_BTREE_NODE_SIZE, struct cache_sb, flags, 20, 36);
BITMASK(CACHE_SET_META_REPLICAS_HAVE, struct cache_sb, flags, 36, 40);
BITMASK(CACHE_SET_DATA_REPLICAS_HAVE, struct cache_sb, flags, 40, 44);
+BITMASK(CACHE_SET_DIRENT_CSUM_TYPE, struct cache_sb, flags, 44, 48);
+enum {
+ BCH_DIRENT_CSUM_CRC32C = 0,
+ BCH_DIRENT_CSUM_CRC64 = 1,
+ BCH_DIRENT_CSUM_SIPHASH = 2,
+ BCH_DIRENT_CSUM_SHA1 = 3,
+};
+
BITMASK(BDEV_CACHE_MODE, struct cache_sb, flags, 0, 4);
#define CACHE_MODE_WRITETHROUGH 0U
#define CACHE_MODE_WRITEBACK 1U
@@ -532,6 +598,10 @@ static inline _Bool SB_IS_BDEV(const struct cache_sb *sb)
UUID_LE(0xf67385c6, 0x1a4e, 0xca45, \
0x82, 0x65, 0xf5, 0x7f, 0x48, 0xba, 0x6d, 0x81)
+#define BCACHE_STATFS_MAGIC 0xca451a4e
+
+#define BCACHE_SB_MAGIC 0xca451a4ef67385c6ULL
+#define BCACHE_SB_MAGIC2 0x816dba487ff56582ULL
#define JSET_MAGIC 0x245235c1a3625032ULL
#define PSET_MAGIC 0x6750e15f87337f91ULL
#define BSET_MAGIC 0x90135c78b99e07f5ULL
@@ -571,7 +641,9 @@ static inline __u64 bset_magic(struct cache_sb *sb)
#define DEFINE_BCH_BTREE_IDS() \
DEF_BTREE_ID(EXTENTS, 0, "extents") \
- DEF_BTREE_ID(INODES, 1, "inodes")
+ DEF_BTREE_ID(INODES, 1, "inodes") \
+ DEF_BTREE_ID(DIRENTS, 2, "dirents") \
+ DEF_BTREE_ID(XATTRS, 3, "xattrs")
#define DEF_BTREE_ID(kwd, val, name) BTREE_ID_##kwd = val,
@@ -803,3 +875,5 @@ BITMASK(UUID_FLASH_ONLY, struct uuid_entry, flags, 0, 1);
}
#endif
#endif /* _LINUX_BCACHE_H */
+
+/* vim: set foldnestmax=2: */