-rw-r--r-- | drivers/md/bcache/Kconfig | 1
-rw-r--r-- | drivers/md/bcache/Makefile | 9
-rw-r--r-- | drivers/md/bcache/acl.c | 245
-rw-r--r-- | drivers/md/bcache/acl.h | 57
-rw-r--r-- | drivers/md/bcache/bcache.h | 26
-rw-r--r-- | drivers/md/bcache/bkey.h | 4
-rw-r--r-- | drivers/md/bcache/bkey_methods.c | 4
-rw-r--r-- | drivers/md/bcache/buckets.h | 25
-rw-r--r-- | drivers/md/bcache/debug.c | 90
-rw-r--r-- | drivers/md/bcache/debug.h | 2
-rw-r--r-- | drivers/md/bcache/dirent.c | 379
-rw-r--r-- | drivers/md/bcache/dirent.h | 21
-rw-r--r-- | drivers/md/bcache/extents.c | 4
-rw-r--r-- | drivers/md/bcache/fs-gc.c | 202
-rw-r--r-- | drivers/md/bcache/fs-gc.h | 6
-rw-r--r-- | drivers/md/bcache/fs.c | 2087
-rw-r--r-- | drivers/md/bcache/fs.h | 20
-rw-r--r-- | drivers/md/bcache/inode.c | 50
-rw-r--r-- | drivers/md/bcache/inode.h | 1
-rw-r--r-- | drivers/md/bcache/siphash.c | 185
-rw-r--r-- | drivers/md/bcache/siphash.h | 86
-rw-r--r-- | drivers/md/bcache/super.c | 78
-rw-r--r-- | drivers/md/bcache/util.h | 4
-rw-r--r-- | drivers/md/bcache/xattr.c | 414
-rw-r--r-- | drivers/md/bcache/xattr.h | 16
-rw-r--r-- | include/uapi/linux/bcache.h | 76
26 files changed, 4069 insertions, 23 deletions
diff --git a/drivers/md/bcache/Kconfig b/drivers/md/bcache/Kconfig index 5502372dfc94..55e135f6dd61 100644 --- a/drivers/md/bcache/Kconfig +++ b/drivers/md/bcache/Kconfig @@ -2,6 +2,7 @@ config BCACHE tristate "Block device as cache" select LIBCRC32C + select FS_POSIX_ACL ---help--- Allows a block device to be used as cache for other devices; uses a btree for indexing and the layout is optimized for SSDs. diff --git a/drivers/md/bcache/Makefile b/drivers/md/bcache/Makefile index 02ef2612777e..0dd3db8a5ef4 100644 --- a/drivers/md/bcache/Makefile +++ b/drivers/md/bcache/Makefile @@ -1,9 +1,10 @@ obj-$(CONFIG_BCACHE) += bcache.o -bcache-y := alloc.o bkey.o bkey_methods.o blockdev.o bset.o\ - btree.o buckets.o clock.o closure.o debug.o extents.o gc.o inode.o io.o\ - journal.o keybuf.o keylist.o migrate.o move.o movinggc.o notify.o\ - request.o six.o stats.o super.o sysfs.o tier.o trace.o util.o writeback.o +bcache-y := acl.o alloc.o bkey.o bkey_methods.o blockdev.o\ + bset.o btree.o buckets.o clock.o closure.o debug.o dirent.o extents.o\ + fs.o fs-gc.o gc.o inode.o io.o journal.o keybuf.o keylist.o migrate.o\ + move.o movinggc.o notify.o request.o siphash.o six.o stats.o super.o\ + sysfs.o tier.o trace.o util.o writeback.o xattr.o ccflags-y := -Werror diff --git a/drivers/md/bcache/acl.c b/drivers/md/bcache/acl.c new file mode 100644 index 000000000000..51f04ab2a9d5 --- /dev/null +++ b/drivers/md/bcache/acl.c @@ -0,0 +1,245 @@ +#include "bcache.h" + +#include <linux/init.h> +#include <linux/sched.h> +#include <linux/slab.h> +#include <linux/fs.h> + +#include "xattr.h" +#include "acl.h" + +/* + * Convert from filesystem to in-memory representation. + */ +static struct posix_acl *bch_acl_from_disk(const void *value, size_t size) +{ + const char *end = (char *)value + size; + int n, count; + struct posix_acl *acl; + + if (!value) + return NULL; + if (size < sizeof(bch_acl_header)) + return ERR_PTR(-EINVAL); + if (((bch_acl_header *)value)->a_version != + cpu_to_le32(BCH_ACL_VERSION)) + return ERR_PTR(-EINVAL); + value = (char *)value + sizeof(bch_acl_header); + count = bch_acl_count(size); + if (count < 0) + return ERR_PTR(-EINVAL); + if (count == 0) + return NULL; + acl = posix_acl_alloc(count, GFP_KERNEL); + if (!acl) + return ERR_PTR(-ENOMEM); + for (n = 0; n < count; n++) { + bch_acl_entry *entry = + (bch_acl_entry *)value; + if ((char *)value + sizeof(bch_acl_entry_short) > end) + goto fail; + acl->a_entries[n].e_tag = le16_to_cpu(entry->e_tag); + acl->a_entries[n].e_perm = le16_to_cpu(entry->e_perm); + switch (acl->a_entries[n].e_tag) { + case ACL_USER_OBJ: + case ACL_GROUP_OBJ: + case ACL_MASK: + case ACL_OTHER: + value = (char *)value + + sizeof(bch_acl_entry_short); + break; + + case ACL_USER: + value = (char *)value + sizeof(bch_acl_entry); + if ((char *)value > end) + goto fail; + acl->a_entries[n].e_uid = + make_kuid(&init_user_ns, + le32_to_cpu(entry->e_id)); + break; + case ACL_GROUP: + value = (char *)value + sizeof(bch_acl_entry); + if ((char *)value > end) + goto fail; + acl->a_entries[n].e_gid = + make_kgid(&init_user_ns, + le32_to_cpu(entry->e_id)); + break; + + default: + goto fail; + } + } + if (value != end) + goto fail; + return acl; + +fail: + posix_acl_release(acl); + return ERR_PTR(-EINVAL); +} + +/* + * Convert from in-memory to filesystem representation. 
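+ * The caller is responsible for kfree()ing the returned buffer.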
+ */ +static void *bch_acl_to_disk(const struct posix_acl *acl, size_t *size) +{ + bch_acl_header *ext_acl; + char *e; + size_t n; + + *size = bch_acl_size(acl->a_count); + ext_acl = kmalloc(sizeof(bch_acl_header) + acl->a_count * + sizeof(bch_acl_entry), GFP_KERNEL); + if (!ext_acl) + return ERR_PTR(-ENOMEM); + ext_acl->a_version = cpu_to_le32(BCH_ACL_VERSION); + e = (char *)ext_acl + sizeof(bch_acl_header); + for (n = 0; n < acl->a_count; n++) { + const struct posix_acl_entry *acl_e = &acl->a_entries[n]; + bch_acl_entry *entry = (bch_acl_entry *)e; + + entry->e_tag = cpu_to_le16(acl_e->e_tag); + entry->e_perm = cpu_to_le16(acl_e->e_perm); + switch (acl_e->e_tag) { + case ACL_USER: + entry->e_id = cpu_to_le32( + from_kuid(&init_user_ns, acl_e->e_uid)); + e += sizeof(bch_acl_entry); + break; + case ACL_GROUP: + entry->e_id = cpu_to_le32( + from_kgid(&init_user_ns, acl_e->e_gid)); + e += sizeof(bch_acl_entry); + break; + + case ACL_USER_OBJ: + case ACL_GROUP_OBJ: + case ACL_MASK: + case ACL_OTHER: + e += sizeof(bch_acl_entry_short); + break; + + default: + goto fail; + } + } + return (char *)ext_acl; + +fail: + kfree(ext_acl); + return ERR_PTR(-EINVAL); +} + +struct posix_acl *bch_get_acl(struct inode *inode, int type) +{ + struct cache_set *c = inode->i_sb->s_fs_info; + int name_index; + char *value = NULL; + struct posix_acl *acl; + int ret; + + switch (type) { + case ACL_TYPE_ACCESS: + name_index = BCH_XATTR_INDEX_POSIX_ACL_ACCESS; + break; + case ACL_TYPE_DEFAULT: + name_index = BCH_XATTR_INDEX_POSIX_ACL_DEFAULT; + break; + default: + BUG(); + } + ret = bch_xattr_get(c, inode->i_ino, "", NULL, 0, name_index); + if (ret > 0) { + value = kmalloc(ret, GFP_KERNEL); + if (!value) + return ERR_PTR(-ENOMEM); + ret = bch_xattr_get(c, inode->i_ino, "", value, + ret, name_index); + } + if (ret > 0) + acl = bch_acl_from_disk(value, ret); + else if (ret == -ENODATA || ret == -ENOSYS) + acl = NULL; + else + acl = ERR_PTR(ret); + kfree(value); + + if (!IS_ERR(acl)) + set_cached_acl(inode, type, acl); + + return acl; +} + +int bch_set_acl(struct inode *inode, struct posix_acl *acl, int type) +{ + int name_index; + void *value = NULL; + size_t size = 0; + int ret; + + switch (type) { + case ACL_TYPE_ACCESS: + name_index = BCH_XATTR_INDEX_POSIX_ACL_ACCESS; + if (acl) { + ret = posix_acl_equiv_mode(acl, &inode->i_mode); + if (ret < 0) + return ret; + else { + inode->i_ctime = CURRENT_TIME_SEC; + mark_inode_dirty(inode); + if (ret == 0) + acl = NULL; + } + } + break; + + case ACL_TYPE_DEFAULT: + name_index = BCH_XATTR_INDEX_POSIX_ACL_DEFAULT; + if (!S_ISDIR(inode->i_mode)) + return acl ? 
-EACCES : 0; + break; + + default: + return -EINVAL; + } + + if (acl) { + value = bch_acl_to_disk(acl, &size); + if (IS_ERR(value)) + return (int)PTR_ERR(value); + } + + ret = bch_xattr_set(inode, "", value, size, 0, name_index); + + kfree(value); + + if (ret == -ERANGE) + ret = -E2BIG; + + if (!ret) + set_cached_acl(inode, type, acl); + + return ret; +} + +int bch_init_acl(struct inode *inode, struct inode *dir) +{ + struct posix_acl *default_acl, *acl; + int ret; + + ret = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl); + if (ret) + return ret; + + if (default_acl) { + ret = bch_set_acl(inode, default_acl, ACL_TYPE_DEFAULT); + posix_acl_release(default_acl); + } + if (acl) { + if (!ret) + ret = bch_set_acl(inode, acl, ACL_TYPE_ACCESS); + posix_acl_release(acl); + } + return ret; +} diff --git a/drivers/md/bcache/acl.h b/drivers/md/bcache/acl.h new file mode 100644 index 000000000000..03f93fa0ff1b --- /dev/null +++ b/drivers/md/bcache/acl.h @@ -0,0 +1,57 @@ +/* + File: fs/bch/acl.h + + (C) 2001 Andreas Gruenbacher, <a.gruenbacher@computer.org> +*/ + +#include <linux/posix_acl_xattr.h> + +#define BCH_ACL_VERSION 0x0001 + +typedef struct { + __le16 e_tag; + __le16 e_perm; + __le32 e_id; +} bch_acl_entry; + +typedef struct { + __le16 e_tag; + __le16 e_perm; +} bch_acl_entry_short; + +typedef struct { + __le32 a_version; +} bch_acl_header; + +static inline size_t bch_acl_size(int count) +{ + if (count <= 4) { + return sizeof(bch_acl_header) + + count * sizeof(bch_acl_entry_short); + } else { + return sizeof(bch_acl_header) + + 4 * sizeof(bch_acl_entry_short) + + (count - 4) * sizeof(bch_acl_entry); + } +} + +static inline int bch_acl_count(size_t size) +{ + ssize_t s; + + size -= sizeof(bch_acl_header); + s = size - 4 * sizeof(bch_acl_entry_short); + if (s < 0) { + if (size % sizeof(bch_acl_entry_short)) + return -1; + return size / sizeof(bch_acl_entry_short); + } else { + if (s % sizeof(bch_acl_entry)) + return -1; + return s / sizeof(bch_acl_entry) + 4; + } +} + +extern struct posix_acl *bch_get_acl(struct inode *, int); +extern int bch_set_acl(struct inode *, struct posix_acl *, int); +extern int bch_init_acl(struct inode *, struct inode *); diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index 5aa2c2863c3c..b203e28c48ca 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -351,11 +351,12 @@ struct cache { }; struct gc_stat { - size_t nodes; - size_t key_bytes; + u64 nodes; + u64 key_bytes; + u64 nkeys; - size_t nkeys; - uint64_t data; /* sectors */ + u64 data; /* sectors */ + u64 inodes; }; /* @@ -384,6 +385,7 @@ enum { CACHE_SET_RO, CACHE_SET_GC_STOPPING, CACHE_SET_GC_FAILURE, + CACHE_SET_BDEV_MOUNTED, }; struct cache_member_rcu { @@ -404,6 +406,7 @@ struct cache_set { struct list_head list; struct kobject kobj; struct kobject internal; + struct completion *stop_completion; unsigned long flags; /* Counts outstanding writes, for clean transition to read-only */ @@ -423,10 +426,13 @@ struct cache_set { struct bio_set bio_split; + /* For punting bio submissions to workqueue, io.c */ struct bio_list bio_submit_list; struct work_struct bio_submit_work; spinlock_t bio_submit_lock; + struct backing_dev_info bdi; + /* BTREE CACHE */ struct bio_set btree_bio; @@ -481,9 +487,16 @@ struct cache_set { struct timer_list foreground_write_wakeup; + /* + * These contain all r/w devices - i.e. 
devices we can currently + * allocate from: + */ struct cache_group cache_all; struct cache_group cache_tiers[CACHE_TIERS]; + u64 capacity; /* sectors */ + atomic_long_t sectors_reserved; + atomic_long_t sectors_reserved_cache; struct mutex bucket_lock; @@ -567,6 +580,9 @@ struct cache_set { struct work_struct read_race_work; spinlock_t read_race_lock; + /* FILESYSTEM */ + atomic_long_t nr_inodes; + /* TIERING */ struct task_struct *tiering_read; struct bch_pd_controller tiering_pd; @@ -757,5 +773,7 @@ do { \ void bch_debug_exit(void); int bch_debug_init(void); +void bch_fs_exit(void); +int bch_fs_init(void); #endif /* _BCACHE_H */ diff --git a/drivers/md/bcache/bkey.h b/drivers/md/bcache/bkey.h index 1a82e57ab420..5bb19a700788 100644 --- a/drivers/md/bcache/bkey.h +++ b/drivers/md/bcache/bkey.h @@ -493,6 +493,10 @@ BKEY_VAL_ACCESSORS(extent, BCH_EXTENT); BKEY_VAL_ACCESSORS(inode, BCH_INODE_FS); BKEY_VAL_ACCESSORS(inode_blockdev, BCH_INODE_BLOCKDEV); +BKEY_VAL_ACCESSORS(dirent, BCH_DIRENT); + +BKEY_VAL_ACCESSORS(xattr, BCH_XATTR); + /* byte order helpers */ #if !defined(__LITTLE_ENDIAN) && !defined(__BIG_ENDIAN) diff --git a/drivers/md/bcache/bkey_methods.c b/drivers/md/bcache/bkey_methods.c index 90e1c9e7df38..03affccac1ce 100644 --- a/drivers/md/bcache/bkey_methods.c +++ b/drivers/md/bcache/bkey_methods.c @@ -2,12 +2,16 @@ #include "bcache.h" #include "bkey_methods.h" #include "btree.h" +#include "dirent.h" #include "extents.h" #include "inode.h" +#include "xattr.h" static const struct bkey_ops *bch_bkey_ops[] = { [BKEY_TYPE_EXTENTS] = &bch_bkey_extent_ops, [BKEY_TYPE_INODES] = &bch_bkey_inode_ops, + [BKEY_TYPE_DIRENTS] = &bch_bkey_dirent_ops, + [BKEY_TYPE_XATTRS] = &bch_bkey_xattr_ops, [BKEY_TYPE_BTREE] = &bch_bkey_btree_ops, }; diff --git a/drivers/md/bcache/buckets.h b/drivers/md/bcache/buckets.h index 3644e7e110ab..cd58d86af3bb 100644 --- a/drivers/md/bcache/buckets.h +++ b/drivers/md/bcache/buckets.h @@ -229,26 +229,27 @@ static inline size_t buckets_free_cache(struct cache *ca, return __buckets_free_cache(ca, bch_bucket_stats_read(ca), reserve); } -static inline u64 cache_sectors_used(struct cache *ca) -{ - struct bucket_stats stats = bch_bucket_stats_read(ca); - - return (stats.buckets_meta << ca->bucket_bits) + - stats.sectors_dirty; -} - -static inline bool cache_set_full(struct cache_set *c) +static inline u64 cache_set_sectors_used(struct cache_set *c) { struct cache *ca; unsigned i; u64 used = 0; rcu_read_lock(); - for_each_cache_rcu(ca, c, i) - used += cache_sectors_used(ca); + for_each_cache_rcu(ca, c, i) { + struct bucket_stats stats = bch_bucket_stats_read(ca); + + used += (stats.buckets_meta << ca->bucket_bits) + + stats.sectors_dirty; + } rcu_read_unlock(); - return used >= c->capacity; + return min(c->capacity, used + atomic_long_read(&c->sectors_reserved)); +} + +static inline bool cache_set_full(struct cache_set *c) +{ + return cache_set_sectors_used(c) >= c->capacity; } static inline bool is_available_bucket(struct bucket_mark mark) diff --git a/drivers/md/bcache/debug.c b/drivers/md/bcache/debug.c index b0d22579ea0b..967420d7c078 100644 --- a/drivers/md/bcache/debug.c +++ b/drivers/md/bcache/debug.c @@ -10,6 +10,7 @@ #include "buckets.h" #include "debug.h" #include "extents.h" +#include "inode.h" #include "io.h" #include "super.h" @@ -182,6 +183,95 @@ out_put: bio_put(check); } +void bch_verify_inode_refs(struct cache_set *c) +{ + struct btree_iter iter; + struct bkey_s_c k; + struct bkey_i_inode inode; + u64 cur_inum = 0; + char buf[100]; + + 
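/* Walk every extent in order, checking that it belongs to an existing regular file or symlink and doesn't extend past that inode's i_size (rounded up to a full page): */ +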
for_each_btree_key(&iter, c, BTREE_ID_EXTENTS, + POS(BCACHE_ROOT_INO, 0), k) { + if (k.k->type == KEY_TYPE_DISCARD) + continue; + + if (k.k->p.inode != cur_inum && + bch_inode_find_by_inum(c, k.k->p.inode, &inode)) { + bch_bkey_val_to_text(c, iter.nodes[0], buf, + sizeof(buf), k); + bch_cache_set_error(c, + "extent for missing inode %llu\n%s", + k.k->p.inode, buf); + bch_btree_iter_unlock(&iter); + return; + } + + cur_inum = k.k->p.inode; + + if (!S_ISREG(inode.v.i_mode) && + !S_ISLNK(inode.v.i_mode)) + bch_cache_set_error(c, + "extent for non regular file, inode %llu mode %u", + k.k->p.inode, inode.v.i_mode); + + BUG_ON(inode.v.i_flags & BCH_INODE_I_SIZE_DIRTY); + + if (k.k->p.offset > round_up(inode.v.i_size, PAGE_SIZE) >> 9) { + bch_bkey_val_to_text(c, iter.nodes[0], buf, + sizeof(buf), k); + bch_cache_set_error(c, + "extent past end of inode %llu: i_size %llu extent\n%s", + k.k->p.inode, inode.v.i_size, buf); + } + } + bch_btree_iter_unlock(&iter); + + for_each_btree_key(&iter, c, BTREE_ID_DIRENTS, + POS(BCACHE_ROOT_INO, 0), k) { + /* XXX: skipping whiteouts for now */ + if (k.k->type != BCH_DIRENT) + continue; + + if (k.k->p.inode != cur_inum && + bch_inode_find_by_inum(c, k.k->p.inode, &inode)) { + bch_cache_set_error(c, "dirent for missing inode %llu", + k.k->p.inode); + bch_btree_iter_unlock(&iter); + return; + } + + cur_inum = k.k->p.inode; + + if (!S_ISDIR(inode.v.i_mode)) + bch_cache_set_error(c, + "dirent for non directory, inode %llu mode %u", + k.k->p.inode, inode.v.i_mode); + } + bch_btree_iter_unlock(&iter); + + for_each_btree_key(&iter, c, BTREE_ID_XATTRS, + POS(BCACHE_ROOT_INO, 0), k) { + if (k.k->p.inode != cur_inum && + bch_inode_find_by_inum(c, k.k->p.inode, &inode)) { + bch_cache_set_error(c, + "xattr for missing inode %llu", + k.k->p.inode); + bch_btree_iter_unlock(&iter); + return; + } + + cur_inum = k.k->p.inode; + + if (!S_ISREG(inode.v.i_mode) && + !S_ISDIR(inode.v.i_mode)) + bch_cache_set_error(c, + "xattr for non file/directory, inode %llu mode %u", + k.k->p.inode, inode.v.i_mode); + } + bch_btree_iter_unlock(&iter); +} + #endif #ifdef CONFIG_DEBUG_FS diff --git a/drivers/md/bcache/debug.h b/drivers/md/bcache/debug.h index da35861aa3cb..b3cbb0bd9cd3 100644 --- a/drivers/md/bcache/debug.h +++ b/drivers/md/bcache/debug.h @@ -10,6 +10,7 @@ struct cache_set; void bch_btree_verify(struct cache_set *, struct btree *); void bch_data_verify(struct cached_dev *, struct bio *); +void bch_verify_inode_refs(struct cache_set *); #define expensive_debug_checks(c) ((c)->expensive_debug_checks) #define key_merging_disabled(c) ((c)->key_merging_disabled) @@ -19,6 +20,7 @@ void bch_data_verify(struct cached_dev *, struct bio *); static inline void bch_btree_verify(struct cache_set *c, struct btree *b) {} static inline void bch_data_verify(struct cached_dev *dc, struct bio *bio) {} +static inline void bch_verify_inode_refs(struct cache_set *c) {} #define expensive_debug_checks(c) 0 #define key_merging_disabled(c) 0 diff --git a/drivers/md/bcache/dirent.c b/drivers/md/bcache/dirent.c new file mode 100644 index 000000000000..999538c71391 --- /dev/null +++ b/drivers/md/bcache/dirent.c @@ -0,0 +1,379 @@ + +#include "bcache.h" +#include "btree.h" +#include "extents.h" +#include "dirent.h" +#include "keylist.h" +#include "siphash.h" + +#include "linux/crc32c.h" +#include "linux/cryptohash.h" + +#if 0 +static u64 bch_dirent_hash(const struct qstr *name) +{ + union { + u32 b[SHA_DIGEST_WORDS]; + u64 ret; + } digest; + + unsigned done = 0; + + sha_init(digest.b); + + while (done < 
name->len) { + u32 workspace[SHA_WORKSPACE_WORDS]; + u8 message[SHA_MESSAGE_BYTES]; + unsigned bytes = min_t(unsigned, name->len - done, + SHA_MESSAGE_BYTES); + + memcpy(message, name->name + done, bytes); + memset(message + bytes, 0, SHA_MESSAGE_BYTES - bytes); + sha_transform(digest.b, message, workspace); + done += bytes; + } + + /* [0,2) reserved for dots */ + + return (digest.ret >= 2 ? digest.ret : 2) & S64_MAX; +} + +static const SIPHASH_KEY bch_siphash_key; + +static u64 bch_dirent_hash(const struct qstr *name) +{ + u64 hash = SipHash24(&bch_siphash_key, + name->name, name->len) >> 1; + + /* [0,2) reserved for dots */ + + return (hash >= 2 ? hash : 2); +} +#endif + +static u64 bch_dirent_hash(const struct qstr *name) +{ + u64 hash = crc32c(0, name->name, name->len); + + /* [0,2) reserved for dots */ + + return (hash >= 2 ? hash : 2); +} + +static unsigned dirent_name_bytes(struct bkey_s_c_dirent d) +{ + unsigned len = bkey_val_bytes(d.k) - sizeof(struct bch_dirent); + + while (len && !d.v->d_name[len - 1]) + --len; + + return len; +} + +static int dirent_cmp(struct bkey_s_c_dirent d, + const struct qstr *q) +{ + int len = dirent_name_bytes(d); + + return len - q->len ?: memcmp(d.v->d_name, q->name, len); +} + +static bool bch_dirent_invalid(const struct cache_set *c, struct bkey_s_c k) +{ + switch (k.k->type) { + case BCH_DIRENT: + if (bkey_val_bytes(k.k) < sizeof(struct bch_dirent)) + return true; + + return false; + case BCH_DIRENT_WHITEOUT: + if (bkey_val_bytes(k.k)) + return true; + + return false; + default: + return true; + } +} + +static void bch_dirent_to_text(struct cache_set *c, char *buf, + size_t size, struct bkey_s_c k) +{ + struct bkey_s_c_dirent d; + + switch (k.k->type) { + case BCH_DIRENT: + d = bkey_s_c_to_dirent(k); + + if (size) { + unsigned n = min_t(unsigned, size, + dirent_name_bytes(d)); + memcpy(buf, d.v->d_name, n); + buf[size - 1] = '\0'; + buf += n; + size -= n; + } + + scnprintf(buf, size, " -> %llu", d.v->d_inum); + break; + case BCH_DIRENT_WHITEOUT: + scnprintf(buf, size, "whiteout"); + break; + } +} + +const struct btree_keys_ops bch_dirent_ops = { +}; + +const struct bkey_ops bch_bkey_dirent_ops = { + .key_invalid = bch_dirent_invalid, + .val_to_text = bch_dirent_to_text, +}; + +static int __bch_dirent_create(struct cache_set *c, u64 dir_inum, + u8 type, const struct qstr *name, + u64 dst_inum, bool update, + u64 *journal_seq) +{ + struct btree_iter iter; + struct bkey_s_c k; + struct keylist keys; + struct bkey_i_dirent *dirent; + unsigned u64s = BKEY_U64s + + DIV_ROUND_UP(sizeof(struct bch_dirent) + name->len, + sizeof(u64)); + int ret = -ENOENT; + + bch_keylist_init(&keys); + + if (bch_keylist_realloc(&keys, u64s)) + return -ENOMEM; + + dirent = bkey_dirent_init(keys.top); + dirent->k.u64s = u64s; + dirent->v.d_inum = dst_inum; + dirent->v.d_type = type; + + memcpy(dirent->v.d_name, name->name, name->len); + memset(dirent->v.d_name + name->len, 0, + bkey_val_bytes(&dirent->k) - + (sizeof(struct bch_dirent) + name->len)); + + BUG_ON(dirent_name_bytes(dirent_i_to_s_c(dirent)) != name->len); + BUG_ON(dirent_cmp(dirent_i_to_s_c(dirent), name)); + + bch_keylist_enqueue(&keys); + + bch_btree_iter_init_intent(&iter, c, BTREE_ID_DIRENTS, + POS(dir_inum, bch_dirent_hash(name))); + + while ((k = bch_btree_iter_peek_with_holes(&iter)).k) { + /* hole? 
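no live dirent at this hash position 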
*/ + if (k.k->type != BCH_DIRENT) { + if (!update) + goto insert; + break; + } + + if (!dirent_cmp(bkey_s_c_to_dirent(k), name)) { + /* found: */ + if (!update) { + ret = -EEXIST; + break; + } +insert: + dirent->k.p = k.k->p; + + ret = bch_btree_insert_at(&iter, &keys, NULL, + journal_seq, + BTREE_INSERT_ATOMIC); + if (ret != -EINTR && ret != -EAGAIN) + break; + } else { + /* collision */ + bch_btree_iter_advance_pos(&iter); + } + } + bch_btree_iter_unlock(&iter); + bch_keylist_free(&keys); + + return ret; +} + +int bch_dirent_create(struct cache_set *c, u64 dir_inum, u8 type, + const struct qstr *name, u64 dst_inum, + u64 *journal_seq) +{ + return __bch_dirent_create(c, dir_inum, type, + name, dst_inum, false, + journal_seq); +} + +int bch_dirent_update(struct cache_set *c, u64 dir_inum, + const struct qstr *name, u64 dst_inum, + u64 *journal_seq) +{ + return __bch_dirent_create(c, dir_inum, DT_UNKNOWN, + name, dst_inum, true, + journal_seq); +} + +int bch_dirent_delete(struct cache_set *c, u64 dir_inum, + const struct qstr *name) +{ + struct btree_iter iter; + struct bkey_s_c k; + u64 hash = bch_dirent_hash(name); + int ret = -ENOENT; + + pr_debug("deleting %llu:%llu (%s)", + dir_inum, hash, name->name); + + bch_btree_iter_init_intent(&iter, c, BTREE_ID_DIRENTS, + POS(dir_inum, bch_dirent_hash(name))); + + while ((k = bch_btree_iter_peek_with_holes(&iter)).k) { + switch (k.k->type) { + case BCH_DIRENT: + if (!dirent_cmp(bkey_s_c_to_dirent(k), name)) { + struct bkey_i delete; + + bkey_init(&delete.k); + delete.k.p = k.k->p; + delete.k.type = BCH_DIRENT_WHITEOUT; + + ret = bch_btree_insert_at(&iter, + &keylist_single(&delete), + NULL, NULL, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_ATOMIC); + if (ret == -EINTR || ret == -EAGAIN) + continue; + } + break; + case BCH_DIRENT_WHITEOUT: + break; + default: + /* hole, not found */ + goto out; + } + + bch_btree_iter_advance_pos(&iter); + } +out: + bch_btree_iter_unlock(&iter); + + return ret; +} + +u64 bch_dirent_lookup(struct cache_set *c, u64 dir_inum, + const struct qstr *name) +{ + struct btree_iter iter; + struct bkey_s_c k; + struct bkey_s_c_dirent dirent; + u64 hash = bch_dirent_hash(name); + + pr_debug("searching for %llu:%llu (%s)", + dir_inum, hash, name->name); + + for_each_btree_key_with_holes(&iter, c, BTREE_ID_DIRENTS, + POS(dir_inum, bch_dirent_hash(name)), k) { + switch (k.k->type) { + case BCH_DIRENT: + dirent = bkey_s_c_to_dirent(k); + + /* collision? 
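could be another name that hashed to the same slot 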
*/ + if (!dirent_cmp(dirent, name)) { + u64 inum = dirent.v->d_inum; + + bch_btree_iter_unlock(&iter); + pr_debug("found %s: %llu", name->name, inum); + return inum; + } + break; + case BCH_DIRENT_WHITEOUT: + break; + default: + /* hole, not found */ + goto out; + } + } +out: + bch_btree_iter_unlock(&iter); + + pr_debug("%s not found", name->name); + return 0; +} + +int bch_empty_dir(struct cache_set *c, u64 dir_inum) +{ + struct btree_iter iter; + struct bkey_s_c k; + int ret = 0; + + for_each_btree_key(&iter, c, BTREE_ID_DIRENTS, POS(dir_inum, 0), k) { + if (k.k->p.inode > dir_inum) + break; + + if (k.k->type == BCH_DIRENT) { + ret = -ENOTEMPTY; + break; + } + + } + bch_btree_iter_unlock(&iter); + + return ret; +} + +int bch_readdir(struct file *file, struct dir_context *ctx) +{ + struct inode *inode = file_inode(file); + struct super_block *sb = inode->i_sb; + struct cache_set *c = sb->s_fs_info; + struct btree_iter iter; + struct bkey_s_c k; + struct bkey_s_c_dirent dirent; + unsigned len; + + if (!dir_emit_dots(file, ctx)) + return 0; + + pr_debug("listing for %lu from %llu", inode->i_ino, ctx->pos); + + for_each_btree_key(&iter, c, BTREE_ID_DIRENTS, + POS(inode->i_ino, ctx->pos), k) { + if (k.k->type != BCH_DIRENT) + continue; + + dirent = bkey_s_c_to_dirent(k); + + pr_debug("saw %llu:%llu (%s) -> %llu", + k.k->p.inode, k.k->p.offset, + dirent.v->d_name, dirent.v->d_inum); + + if (bkey_cmp(k.k->p, POS(inode->i_ino, ctx->pos)) < 0) + continue; + + if (k.k->p.inode > inode->i_ino) + break; + + len = dirent_name_bytes(dirent); + + pr_debug("emitting %s", dirent.v->d_name); + + /* + * XXX: dir_emit() can fault and block, while we're holding + * locks + */ + if (!dir_emit(ctx, dirent.v->d_name, len, + dirent.v->d_inum, dirent.v->d_type)) + break; + + ctx->pos = k.k->p.offset + 1; + } + bch_btree_iter_unlock(&iter); + + return 0; +} diff --git a/drivers/md/bcache/dirent.h b/drivers/md/bcache/dirent.h new file mode 100644 index 000000000000..4de22a53c875 --- /dev/null +++ b/drivers/md/bcache/dirent.h @@ -0,0 +1,21 @@ +#ifndef _BCACHE_DIRENT_H +#define _BCACHE_DIRENT_H + +extern const struct btree_keys_ops bch_dirent_ops; +extern const struct bkey_ops bch_bkey_dirent_ops; + +struct qstr; +struct file; +struct dir_context; +struct cache_set; + +int bch_dirent_create(struct cache_set *, u64, u8, const struct qstr *, + u64, u64 *); +int bch_dirent_update(struct cache_set *, u64, const struct qstr *, u64, u64 *); +int bch_dirent_delete(struct cache_set *, u64, const struct qstr *); +u64 bch_dirent_lookup(struct cache_set *, u64, const struct qstr *); +int bch_empty_dir(struct cache_set *, u64); +int bch_readdir(struct file *, struct dir_context *); + +#endif /* _BCACHE_DIRENT_H */ + diff --git a/drivers/md/bcache/extents.c b/drivers/md/bcache/extents.c index 077164e271b5..55ee8043b9b8 100644 --- a/drivers/md/bcache/extents.c +++ b/drivers/md/bcache/extents.c @@ -8,12 +8,14 @@ #include "bcache.h" #include "btree.h" #include "debug.h" +#include "dirent.h" #include "extents.h" #include "gc.h" #include "inode.h" #include "journal.h" #include "super.h" #include "writeback.h" +#include "xattr.h" #include <trace/events/bcache.h> @@ -1822,4 +1824,6 @@ const struct bkey_ops bch_bkey_extent_ops = { const struct btree_keys_ops *bch_btree_ops[] = { [BTREE_ID_EXTENTS] = &bch_extent_ops, [BTREE_ID_INODES] = &bch_inode_ops, + [BTREE_ID_DIRENTS] = &bch_dirent_ops, + [BTREE_ID_XATTRS] = &bch_xattr_ops, }; diff --git a/drivers/md/bcache/fs-gc.c b/drivers/md/bcache/fs-gc.c new file mode 100644 index 
000000000000..47e7a7f093e0 --- /dev/null +++ b/drivers/md/bcache/fs-gc.c @@ -0,0 +1,202 @@ + +#include "bcache.h" +#include "btree.h" +#include "dirent.h" +#include "fs.h" +#include "inode.h" +#include "keylist.h" +#include "super.h" + +#define INODES_PER_ITER (1 << 24) + +struct nlink { + u32 count; + u32 dir_count; +}; + +static void inc_link(u64 pos, struct nlink *links, bool *need_loop, + u64 inum, unsigned count, bool dir) +{ + if (inum >= pos + INODES_PER_ITER) { + *need_loop = true; + } else if (inum >= pos) { + if (dir) + links[inum - pos].dir_count += count; + else + links[inum - pos].count += count; + } +} + +/* + * XXX: should do a DFS (via filesystem hierarchy), and make sure all dirents + * are reachable + */ + +noinline_for_stack +static int bch_gc_walk_dirents(struct cache_set *c, u64 pos, + struct nlink *links, bool *need_loop) +{ + struct btree_iter iter; + struct bkey_s_c k; + struct bkey_s_c_dirent d; + + *need_loop = false; + memset(links, 0, INODES_PER_ITER * sizeof(*links)); + + inc_link(pos, links, need_loop, BCACHE_ROOT_INO, 2, false); + + for_each_btree_key(&iter, c, BTREE_ID_DIRENTS, POS_MIN, k) { + switch (k.k->type) { + case BCH_DIRENT: + d = bkey_s_c_to_dirent(k); + + if (d.v->d_type == DT_DIR) { + inc_link(pos, links, need_loop, + d.v->d_inum, 2, false); + inc_link(pos, links, need_loop, + d.k->p.inode, 1, true); + } else { + inc_link(pos, links, need_loop, + d.v->d_inum, 1, false); + } + + break; + } + + bch_btree_iter_cond_resched(&iter); + } + return bch_btree_iter_unlock(&iter); +} + +static int bch_gc_do_inode(struct cache_set *c, struct btree_iter *iter, + struct bkey_s_c_inode inode, struct nlink link) +{ + struct bkey_i_inode update; + int ret; + + cache_set_err_on(inode.v->i_nlink < link.count, c, + "i_link too small (%u < %u, type %i)", + inode.v->i_nlink, link.count + link.dir_count, + mode_to_type(inode.v->i_mode)); + + if (!link.count) { + cache_set_err_on(S_ISDIR(inode.v->i_mode) && + bch_empty_dir(c, inode.k->p.inode), c, + "non empty directory with link count 0, inode nlink %u, dir links found %u", + inode.v->i_nlink, link.dir_count); + pr_info("deleting inum %llu", inode.k->p.inode); + + bch_btree_iter_unlock(iter); + return bch_inode_rm(c, inode.k->p.inode); + } + + if (inode.v->i_flags & BCH_INODE_I_SIZE_DIRTY) { + pr_info("truncating inode %llu", inode.k->p.inode); + + /* + * XXX: need to truncate partial blocks too here - or ideally + * just switch units to bytes and that issue goes away + */ + + ret = bch_inode_truncate(c, inode.k->p.inode, + round_up(inode.v->i_size, PAGE_SIZE) >> 9); + if (ret) + return ret; + } + + if (inode.v->i_nlink != link.count + link.dir_count || + inode.v->i_flags & BCH_INODE_I_SIZE_DIRTY) { + if (inode.v->i_nlink != link.count + link.dir_count) + pr_info("setting inum %llu nlinks from %u to %u", + inode.k->p.inode, inode.v->i_nlink, + link.count + link.dir_count); + + bkey_reassemble(&update.k_i, inode.s_c); + update.v.i_nlink = link.count + link.dir_count; + update.v.i_flags &= ~BCH_INODE_I_SIZE_DIRTY; + + return bch_btree_insert_at(iter, + &keylist_single(&update.k_i), + NULL, NULL, + BTREE_INSERT_ATOMIC); + } + + return 0; +} + +noinline_for_stack +static int bch_gc_walk_inodes(struct cache_set *c, u64 pos, struct nlink *links) +{ + struct btree_iter iter; + struct bkey_s_c k; + int ret = 0; + u64 i = 0; + + bch_btree_iter_init(&iter, c, BTREE_ID_INODES, POS(pos, 0)); + + while ((k = bch_btree_iter_peek(&iter)).k) { + if (k.k->p.inode - pos >= INODES_PER_ITER) + break; + + while (i < k.k->p.inode - pos) { +
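/* any inum in this gap that dirents pointed at is a missing inode: */ +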
cache_set_err_on(links[i].count, c, + "missing inode %llu", + pos + i); + i++; + } + + switch (k.k->type) { + case BCH_INODE_FS: + ret = bch_gc_do_inode(c, &iter, + bkey_s_c_to_inode(k), + links[i]); + if (ret == -EAGAIN || ret == -EINTR) + continue; + if (ret) + goto out; + + break; + default: + cache_set_err_on(links[i].count, c, + "missing inode %llu", + pos + i); + break; + } + + if (links[i].count) + atomic_long_inc(&c->nr_inodes); + + bch_btree_iter_advance_pos(&iter); + i++; + bch_btree_iter_cond_resched(&iter); + } +out: + return bch_btree_iter_unlock(&iter) ?: ret; +} + +int bch_gc_inode_nlinks(struct cache_set *c) +{ + bool need_loop = false; + u64 pos = 0; + struct nlink *links = vmalloc(INODES_PER_ITER * sizeof(*links)); + int ret = 0; + + if (!links) + return -ENOMEM; + + do { + ret = bch_gc_walk_dirents(c, pos, links, &need_loop); + if (ret) + break; + + ret = bch_gc_walk_inodes(c, pos, links); + if (ret) + break; + + pos += INODES_PER_ITER; + } while (need_loop); + + vfree(links); + + return ret; +} diff --git a/drivers/md/bcache/fs-gc.h b/drivers/md/bcache/fs-gc.h new file mode 100644 index 000000000000..4fb5728820ea --- /dev/null +++ b/drivers/md/bcache/fs-gc.h @@ -0,0 +1,6 @@ +#ifndef _BCACHE_FS_GC_H +#define _BCACHE_FS_GC_H + +int bch_gc_inode_nlinks(struct cache_set *); + +#endif /* _BCACHE_FS_GC_H */ diff --git a/drivers/md/bcache/fs.c b/drivers/md/bcache/fs.c new file mode 100644 index 000000000000..0d04efe9c40c --- /dev/null +++ b/drivers/md/bcache/fs.c @@ -0,0 +1,2087 @@ + +#include "bcache.h" +#include "acl.h" +#include "btree.h" +#include "buckets.h" +#include "dirent.h" +#include "extents.h" +#include "fs.h" +#include "inode.h" +#include "io.h" +#include "journal.h" +#include "super.h" +#include "xattr.h" + +#include <linux/aio.h> +#include <linux/compat.h> +#include <linux/migrate.h> +#include <linux/module.h> +#include <linux/mount.h> +#include <linux/parser.h> +#include <linux/statfs.h> +#include <linux/task_io_accounting_ops.h> +#include <linux/uio.h> +#include <linux/writeback.h> +#include <linux/xattr.h> + +/* + * our page flags: + * + * allocated - page has space on disk reserved for it (-ENOSPC was checked then, + * shouldn't be checked later) + * + * corresponds to c->sectors_reserved + * + * append - page is dirty from an append write, new i_size can't be written + * until after page is written + * + * corresponds to ei->append_count + */ + +#define PF_ANY(page, enforce) page +PAGEFLAG(Allocated, private, PF_ANY) +TESTSCFLAG(Allocated, private, PF_ANY) + +PAGEFLAG(Append, private_2, PF_ANY) +TESTSCFLAG(Append, private_2, PF_ANY) +#undef PF_ANY + +static struct bio_set *bch_fs_bioset; +static struct kmem_cache *bch_inode_cache; +static DECLARE_WAIT_QUEUE_HEAD(bch_append_wait); + +static void bch_inode_init(struct bch_inode_info *); +static int bch_read_single_page(struct page *, struct address_space *); + +#define SECTORS_CACHE 1024 + +static int reserve_sectors(struct cache_set *c, unsigned sectors) +{ + if (likely(atomic_long_sub_return(sectors, + &c->sectors_reserved_cache) >= 0)) + return 0; + + atomic_long_add(SECTORS_CACHE, &c->sectors_reserved); + + if (likely(!cache_set_full(c))) { + atomic_long_add(SECTORS_CACHE, &c->sectors_reserved_cache); + return 0; + } + + atomic_long_sub_bug(SECTORS_CACHE, &c->sectors_reserved); + atomic_long_add(sectors, &c->sectors_reserved_cache); + return -ENOSPC; +} + +static void bch_append_put(struct bch_inode_info *ei) +{ + if (atomic_long_dec_and_test(&ei->append_count)) + wake_up(&bch_append_wait); +} + +static 
void bch_clear_page_bits(struct cache_set *c, struct bch_inode_info *ei, + struct page *page) +{ + if (TestClearPageAllocated(page)) + atomic_long_sub_bug(PAGE_SECTORS, &c->sectors_reserved); + + if (TestClearPageAppend(page)) + bch_append_put(ei); +} + +static int __bch_write_inode(struct inode *inode) +{ + struct cache_set *c = inode->i_sb->s_fs_info; + struct bch_inode_info *ei = to_bch_ei(inode); + struct bch_inode *bi = &ei->inode.v; + + lockdep_assert_held(&ei->update_lock); + BUG_ON(ei->inode.k.p.inode != inode->i_ino); + BUG_ON(ei->inode.k.type != BCH_INODE_FS); + + if (!atomic_long_read(&ei->append_count)) { + bi->i_flags &= ~BCH_INODE_I_SIZE_DIRTY; + bi->i_size = inode->i_size; + } + + bi->i_mode = inode->i_mode; + bi->i_uid = i_uid_read(inode); + bi->i_gid = i_gid_read(inode); + bi->i_nlink = inode->i_nlink; + bi->i_dev = inode->i_rdev; + bi->i_atime = timespec_to_ns(&inode->i_atime); + bi->i_mtime = timespec_to_ns(&inode->i_mtime); + bi->i_ctime = timespec_to_ns(&inode->i_ctime); + + return bch_inode_update(c, &ei->inode.k_i, NULL, &ei->journal_seq); +} + +static struct inode *bch_vfs_inode_get(struct super_block *sb, u64 inum) +{ + struct cache_set *c = sb->s_fs_info; + struct bch_inode_info *ei; + struct inode *inode; + int ret; + + pr_debug("inum %llu", inum); + + inode = iget_locked(sb, inum); + if (unlikely(!inode)) + return ERR_PTR(-ENOMEM); + if (!(inode->i_state & I_NEW)) + return inode; + + ei = to_bch_ei(inode); + + ret = bch_inode_find_by_inum(c, inum, &ei->inode); + if (unlikely(ret)) { + iget_failed(inode); + return ERR_PTR(ret); + } + + bch_inode_init(ei); + unlock_new_inode(inode); + + return inode; +} + +static void bch_set_inode_flags(struct inode *inode) +{ + unsigned flags = to_bch_ei(inode)->inode.v.i_flags; + + inode->i_flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME); + if (flags & FS_SYNC_FL) + inode->i_flags |= S_SYNC; + if (flags & FS_APPEND_FL) + inode->i_flags |= S_APPEND; + if (flags & FS_IMMUTABLE_FL) + inode->i_flags |= S_IMMUTABLE; + if (flags & FS_NOATIME_FL) + inode->i_flags |= S_NOATIME; +} + +static struct inode *bch_vfs_inode_create(struct cache_set *c, + struct inode *parent, + umode_t mode, dev_t rdev) +{ + struct inode *inode; + struct bch_inode_info *ei; + struct bch_inode *bi; + struct timespec ts = CURRENT_TIME; + s64 now = timespec_to_ns(&ts); + int ret; + + inode = new_inode(parent->i_sb); + if (unlikely(!inode)) + return ERR_PTR(-ENOMEM); + + inode_init_owner(inode, parent, mode); + + ei = to_bch_ei(inode); + + bi = &bkey_inode_init(&ei->inode.k_i)->v; + bi->i_uid = i_uid_read(inode); + bi->i_gid = i_gid_read(inode); + + bi->i_mode = inode->i_mode; + bi->i_dev = rdev; + bi->i_atime = now; + bi->i_mtime = now; + bi->i_ctime = now; + bi->i_nlink = S_ISDIR(mode) ? 
2 : 1; + + ret = bch_inode_create(c, &ei->inode.k_i, + BLOCKDEV_INODE_MAX, 0, + &c->unused_inode_hint); + if (unlikely(ret)) { + /* + * indicate to bch_evict_inode that the inode was never actually + * created: + */ + bkey_init(&ei->inode.k); + goto err; + } + + bch_inode_init(ei); + + ret = bch_init_acl(inode, parent); + if (unlikely(ret)) + goto err; + + insert_inode_hash(inode); + atomic_long_inc(&c->nr_inodes); + + return inode; +err: + clear_nlink(inode); + iput(inode); + return ERR_PTR(ret); +} + +static int bch_vfs_dirent_create(struct cache_set *c, struct inode *dir, + u8 type, const struct qstr *name, + struct inode *dst) +{ + struct bch_inode_info *ei = to_bch_ei(dst); + int ret; + + ret = bch_dirent_create(c, dir->i_ino, type, name, + dst->i_ino, &ei->journal_seq); + if (unlikely(ret)) + return ret; + + dir->i_mtime = dir->i_ctime = CURRENT_TIME; + mark_inode_dirty_sync(dir); + return 0; +} + +static int __bch_create(struct inode *dir, struct dentry *dentry, + umode_t mode, dev_t rdev) +{ + struct cache_set *c = dir->i_sb->s_fs_info; + struct inode *inode; + int ret; + + inode = bch_vfs_inode_create(c, dir, mode, rdev); + if (unlikely(IS_ERR(inode))) + return PTR_ERR(inode); + + ret = bch_vfs_dirent_create(c, dir, mode_to_type(mode), + &dentry->d_name, inode); + if (unlikely(ret)) { + clear_nlink(inode); + iput(inode); + return ret; + } + + d_instantiate(dentry, inode); + return 0; +} + +/* methods */ + +static struct dentry *bch_lookup(struct inode *dir, struct dentry *dentry, + unsigned int flags) +{ + struct cache_set *c = dir->i_sb->s_fs_info; + struct inode *inode = NULL; + u64 inum; + + inum = bch_dirent_lookup(c, dir->i_ino, &dentry->d_name); + + if (inum) + inode = bch_vfs_inode_get(dir->i_sb, inum); + + return d_splice_alias(inode, dentry); +} + +static int bch_create(struct inode *dir, struct dentry *dentry, + umode_t mode, bool excl) +{ + return __bch_create(dir, dentry, mode|S_IFREG, 0); +} + +static int bch_link(struct dentry *old_dentry, struct inode *dir, + struct dentry *dentry) +{ + struct cache_set *c = dir->i_sb->s_fs_info; + struct inode *inode = old_dentry->d_inode; + struct bch_inode_info *ei = to_bch_ei(inode); + int ret; + + lockdep_assert_held(&inode->i_rwsem); + + mutex_lock(&ei->update_lock); + inode->i_ctime = CURRENT_TIME; + inc_nlink(inode); + __bch_write_inode(inode); + mutex_unlock(&ei->update_lock); + + ihold(inode); + + ret = bch_vfs_dirent_create(c, dir, mode_to_type(inode->i_mode), + &dentry->d_name, inode); + if (unlikely(ret)) { + inode_dec_link_count(inode); + iput(inode); + return ret; + } + + d_instantiate(dentry, inode); + return 0; +} + +static int bch_unlink(struct inode *dir, struct dentry *dentry) +{ + struct cache_set *c = dir->i_sb->s_fs_info; + struct inode *inode = dentry->d_inode; + int ret; + + lockdep_assert_held(&inode->i_rwsem); + + ret = bch_dirent_delete(c, dir->i_ino, &dentry->d_name); + if (ret) + return ret; + + inode->i_ctime = dir->i_ctime; + inode_dec_link_count(inode); + + return 0; +} + +static int bch_symlink(struct inode *dir, struct dentry *dentry, + const char *symname) +{ + struct cache_set *c = dir->i_sb->s_fs_info; + struct inode *inode; + int ret; + + inode = bch_vfs_inode_create(c, dir, S_IFLNK|S_IRWXUGO, 0); + if (unlikely(IS_ERR(inode))) + return PTR_ERR(inode); + + inode_lock(inode); + ret = page_symlink(inode, symname, strlen(symname) + 1); + inode_unlock(inode); + + if (unlikely(ret)) + goto err; + + ret = bch_vfs_dirent_create(c, dir, DT_LNK, &dentry->d_name, inode); + if (unlikely(ret)) + goto err; 
+ + d_instantiate(dentry, inode); + return 0; +err: + clear_nlink(inode); + iput(inode); + return ret; +} + +static int bch_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) +{ + int ret; + + lockdep_assert_held(&dir->i_rwsem); + + inode_inc_link_count(dir); + mark_inode_dirty_sync(dir); + + ret = __bch_create(dir, dentry, mode|S_IFDIR, 0); + if (unlikely(ret)) { + inode_dec_link_count(dir); + return ret; + } + + return 0; +} + +static int bch_rmdir(struct inode *dir, struct dentry *dentry) +{ + struct cache_set *c = dir->i_sb->s_fs_info; + struct inode *inode = dentry->d_inode; + int ret; + + lockdep_assert_held(&inode->i_rwsem); + lockdep_assert_held(&dir->i_rwsem); + + if (bch_empty_dir(c, inode->i_ino)) + return -ENOTEMPTY; + + ret = bch_unlink(dir, dentry); + if (unlikely(ret)) + return ret; + + inode_dec_link_count(inode); + inode_dec_link_count(dir); + + return 0; +} + +static int bch_mknod(struct inode *dir, struct dentry *dentry, + umode_t mode, dev_t rdev) +{ + return __bch_create(dir, dentry, mode, rdev); +} + +static int bch_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry) +{ + struct cache_set *c = old_dir->i_sb->s_fs_info; + struct inode *old_inode = old_dentry->d_inode; + struct bch_inode_info *ei = to_bch_ei(old_inode); + struct inode *new_inode = new_dentry->d_inode; + struct timespec now = CURRENT_TIME; + int ret; + + lockdep_assert_held(&old_dir->i_rwsem); + lockdep_assert_held(&new_dir->i_rwsem); + + /* + * XXX: This isn't atomic w.r.t. unclean shutdowns, and we'd really like + * it to be + */ + + if (new_inode && S_ISDIR(old_inode->i_mode)) { + lockdep_assert_held(&new_inode->i_rwsem); + + if (!S_ISDIR(new_inode->i_mode)) + return -ENOTDIR; + + if (bch_empty_dir(c, new_inode->i_ino)) + return -ENOTEMPTY; + + ret = bch_dirent_update(c, new_dir->i_ino, + &new_dentry->d_name, + old_inode->i_ino, + &ei->journal_seq); + if (unlikely(ret)) + return ret; + + clear_nlink(new_inode); + inode_dec_link_count(old_dir); + } else if (new_inode) { + lockdep_assert_held(&new_inode->i_rwsem); + + ret = bch_dirent_update(c, new_dir->i_ino, + &new_dentry->d_name, + old_inode->i_ino, + &ei->journal_seq); + if (unlikely(ret)) + return ret; + + new_inode->i_ctime = now; + inode_dec_link_count(new_inode); + } else if (S_ISDIR(old_inode->i_mode)) { + ret = bch_vfs_dirent_create(c, new_dir, + mode_to_type(old_inode->i_mode), + &new_dentry->d_name, + old_inode); + if (unlikely(ret)) + return ret; + + inode_inc_link_count(new_dir); + inode_dec_link_count(old_dir); + } else { + ret = bch_vfs_dirent_create(c, new_dir, + mode_to_type(old_inode->i_mode), + &new_dentry->d_name, + old_inode); + if (unlikely(ret)) + return ret; + } + + old_dir->i_ctime = old_dir->i_mtime = now; + new_dir->i_ctime = new_dir->i_mtime = now; + mark_inode_dirty_sync(old_dir); + mark_inode_dirty_sync(new_dir); + + /* + * Like most other Unix systems, set the ctime for inodes on a + * rename. + */ + mutex_lock(&ei->update_lock); + old_inode->i_ctime = now; + if (new_inode) + old_inode->i_mtime = now; + __bch_write_inode(old_inode); + mutex_unlock(&ei->update_lock); + + /* XXX: error handling */ + bch_dirent_delete(c, old_dir->i_ino, &old_dentry->d_name); + + return 0; +} + +static int bch_truncate_page(struct address_space *mapping, loff_t from) +{ + unsigned offset = from & (PAGE_SIZE - 1); + struct page *page; + int ret = 0; + + /* Page boundary? 
Nothing to do */ + if (!offset) + return 0; + + page = grab_cache_page(mapping, from >> PAGE_SHIFT); + if (unlikely(!page)) { + ret = -ENOMEM; + goto out; + } + + if (!PageUptodate(page)) + if (bch_read_single_page(page, mapping)) { + ret = -EIO; + goto unlock; + } + + zero_user_segment(page, offset, PAGE_SIZE); + set_page_dirty(page); +unlock: + unlock_page(page); + put_page(page); +out: + return ret; +} + +static int bch_setattr(struct dentry *dentry, struct iattr *iattr) +{ + struct inode *inode = dentry->d_inode; + struct bch_inode_info *ei = to_bch_ei(inode); + struct cache_set *c = inode->i_sb->s_fs_info; + int ret = 0; + + lockdep_assert_held(&inode->i_rwsem); + + pr_debug("i_size was %llu update has %llu", + inode->i_size, iattr->ia_size); + + ret = inode_change_ok(inode, iattr); + if (ret) + return ret; + + if (iattr->ia_valid & ATTR_SIZE && iattr->ia_size != inode->i_size) { + inode_dio_wait(inode); + + /* + * __bch_write_inode() clears I_SIZE_DIRTY if append_count == 0: + */ + atomic_long_inc(&ei->append_count); + + /* + * I_SIZE_DIRTY indicates that there's extents past the end of + * i_size, and must be set atomically with setting the new + * i_size: + */ + mutex_lock(&ei->update_lock); + i_size_write(inode, iattr->ia_size); + ei->inode.v.i_flags |= BCH_INODE_I_SIZE_DIRTY; + ei->inode.v.i_size = iattr->ia_size; + __bch_write_inode(inode); + mutex_unlock(&ei->update_lock); + + ret = bch_truncate_page(inode->i_mapping, iattr->ia_size); + if (unlikely(ret)) + return ret; + + if (iattr->ia_size > inode->i_size) + pagecache_isize_extended(inode, inode->i_size, + iattr->ia_size); + truncate_pagecache(inode, iattr->ia_size); + + ret = bch_inode_truncate(c, inode->i_ino, + round_up(iattr->ia_size, PAGE_SIZE) >> 9); + if (unlikely(ret)) + return ret; + + /* + * Extents discarded, now clear I_SIZE_DIRTY (which write_inode + * does when append_count is 0 + */ + bch_append_put(ei); + inode->i_mtime = inode->i_ctime = CURRENT_TIME; + } + + mutex_lock(&ei->update_lock); + setattr_copy(inode, iattr); + __bch_write_inode(inode); + mutex_unlock(&ei->update_lock); + + if (iattr->ia_valid & ATTR_MODE) + ret = posix_acl_chmod(inode, inode->i_mode); + + return ret; +} + +static int bch_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) +{ + struct cache_set *c = dir->i_sb->s_fs_info; + struct inode *inode; + + /* XXX: i_nlink should be 0? 
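(bch_vfs_inode_create sets i_nlink to 1, and d_tmpfile decrements it) 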
*/ + inode = bch_vfs_inode_create(c, dir, mode, 0); + if (unlikely(IS_ERR(inode))) + return PTR_ERR(inode); + + d_tmpfile(dentry, inode); + return 0; +} + +static int bch_fill_extent(struct fiemap_extent_info *info, + struct bkey_i *k, int flags) +{ + struct bkey_s_c_extent e = bkey_i_to_s_c_extent(k); + const struct bch_extent_ptr *ptr; + + extent_for_each_ptr(e, ptr) { + int ret = fiemap_fill_next_extent(info, + bkey_start_offset(e.k) << 9, + PTR_OFFSET(ptr) << 9, + e.k->size << 9, flags); + if (ret) + return ret; + } + + return 0; +} + +static int bch_fiemap(struct inode *inode, struct fiemap_extent_info *info, + u64 start, u64 len) +{ + struct cache_set *c = inode->i_sb->s_fs_info; + struct btree_iter iter; + struct bkey_s_c k; + BKEY_PADDED(k) tmp; + bool have_extent = false; + int ret = 0; + + if (start + len < start) + return -EINVAL; + + for_each_btree_key(&iter, c, BTREE_ID_EXTENTS, + POS(inode->i_ino, start >> 9), k) + if (k.k->type == BCH_EXTENT) { + if (bkey_cmp(bkey_start_pos(k.k), + POS(inode->i_ino, (start + len) >> 9)) >= 0) + break; + + if (have_extent) { + ret = bch_fill_extent(info, &tmp.k, 0); + if (ret) + goto out; + } + + bkey_reassemble(&tmp.k, k); + have_extent = true; + } + + if (have_extent) + ret = bch_fill_extent(info, &tmp.k, FIEMAP_EXTENT_LAST); +out: + bch_btree_iter_unlock(&iter); + return ret < 0 ? ret : 0; +} + +static int bch_fsync(struct file *file, loff_t start, loff_t end, int datasync) +{ + struct inode *inode = file->f_mapping->host; + struct bch_inode_info *ei = to_bch_ei(inode); + struct cache_set *c = inode->i_sb->s_fs_info; + struct closure cl; + int ret; + + closure_init_stack(&cl); + + /* + * We really just want to sync all the PageAppend pages: + */ + start = 0; + end = S64_MAX; + + ret = filemap_write_and_wait_range(inode->i_mapping, start, end); + if (ret) + return ret; + + inode_lock(inode); + if (datasync && end <= ei->inode.v.i_size) + goto out; + + /* + * redo after locking inode: + */ + filemap_write_and_wait_range(inode->i_mapping, start, end); + + wait_event(bch_append_wait, + !atomic_long_read(&ei->append_count)); + + mutex_lock(&ei->update_lock); + BUG_ON(atomic_long_read(&ei->append_count)); + ret = __bch_write_inode(inode); + mutex_unlock(&ei->update_lock); +out: + inode_unlock(inode); + + bch_journal_push_seq(&c->journal, ei->journal_seq, &cl); + closure_sync(&cl); + + return ret; +} + +/* Flags that are appropriate for non-directories/regular files. 
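(anything that is neither a directory nor a regular file) 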
*/ +#define BCH_OTHER_FLMASK (FS_NODUMP_FL | FS_NOATIME_FL) + +static inline bool bch_flags_allowed(umode_t mode, u32 flags) +{ + if ((flags & BCH_FL_USER_FLAGS) != flags) + return false; + + if (!S_ISREG(mode) && + !S_ISDIR(mode) && + (flags & BCH_OTHER_FLMASK) != flags) + return false; + + return true; +} + +static long bch_fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) +{ + struct inode *inode = file_inode(filp); + struct bch_inode_info *ei = to_bch_ei(inode); + unsigned flags; + int ret; + + switch (cmd) { + case FS_IOC_GETFLAGS: + flags = ei->inode.v.i_flags & BCH_FL_USER_FLAGS; + return put_user(flags, (int __user *) arg); + + case FS_IOC_SETFLAGS: { + unsigned oldflags; + + ret = mnt_want_write_file(filp); + if (ret) + return ret; + + if (!inode_owner_or_capable(inode)) { + ret = -EACCES; + goto setflags_out; + } + + if (get_user(flags, (int __user *) arg)) { + ret = -EFAULT; + goto setflags_out; + } + + if (!bch_flags_allowed(inode->i_mode, flags)) { + ret = -EINVAL; + goto setflags_out; + } + + inode_lock(inode); + oldflags = ei->inode.v.i_flags; + + if (((flags ^ oldflags) & (FS_APPEND_FL|FS_IMMUTABLE_FL)) && + !capable(CAP_LINUX_IMMUTABLE)) { + inode_unlock(inode); + ret = -EPERM; + goto setflags_out; + } + + flags = flags & BCH_FL_USER_FLAGS; + flags |= oldflags & ~BCH_FL_USER_FLAGS; + ei->inode.v.i_flags = flags; + + inode->i_ctime = CURRENT_TIME_SEC; + bch_set_inode_flags(inode); + inode_unlock(inode); + + mark_inode_dirty(inode); +setflags_out: + mnt_drop_write_file(filp); + return ret; + } + return 0; + default: + return -ENOTTY; + } +} + +#ifdef CONFIG_COMPAT +long bch_compat_fs_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + /* These are just misnamed, they actually get/put from/to user an int */ + switch (cmd) { + case FS_IOC32_GETFLAGS: + cmd = FS_IOC_GETFLAGS; + break; + case FS_IOC32_SETFLAGS: + cmd = FS_IOC_SETFLAGS; + break; + default: + return -ENOIOCTLCMD; + } + return bch_fs_ioctl(file, cmd, (unsigned long) compat_ptr(arg)); +} +#endif + +static loff_t bch_dir_llseek(struct file *file, loff_t offset, int whence) +{ + return generic_file_llseek_size(file, offset, whence, + S64_MAX, S64_MAX); +} + +static const struct file_operations bch_file_operations = { + .llseek = generic_file_llseek, + .read_iter = generic_file_read_iter, + .write_iter = generic_file_write_iter, + .mmap = generic_file_mmap, + .open = generic_file_open, + .fsync = bch_fsync, + .splice_read = generic_file_splice_read, + .splice_write = iter_file_splice_write, + + .unlocked_ioctl = bch_fs_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = bch_compat_fs_ioctl, +#endif +}; + +static const struct inode_operations bch_file_inode_operations = { + .setattr = bch_setattr, + .fiemap = bch_fiemap, + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, + .listxattr = bch_xattr_list, + .removexattr = generic_removexattr, + .get_acl = bch_get_acl, + .set_acl = bch_set_acl, +}; + +static const struct inode_operations bch_dir_inode_operations = { + .lookup = bch_lookup, + .create = bch_create, + .link = bch_link, + .unlink = bch_unlink, + .symlink = bch_symlink, + .mkdir = bch_mkdir, + .rmdir = bch_rmdir, + .mknod = bch_mknod, + .rename = bch_rename, + .setattr = bch_setattr, + .tmpfile = bch_tmpfile, + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, + .listxattr = bch_xattr_list, + .removexattr = generic_removexattr, + .get_acl = bch_get_acl, + .set_acl = bch_set_acl, +}; + +static const struct file_operations bch_dir_file_operations = { + .llseek =
bch_dir_llseek, + .read = generic_read_dir, + .iterate = bch_readdir, + .fsync = bch_fsync, + + .unlocked_ioctl = bch_fs_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = bch_compat_fs_ioctl, +#endif +}; + +static const struct inode_operations bch_symlink_inode_operations = { + .readlink = generic_readlink, + .get_link = page_get_link, + .setattr = bch_setattr, + + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, + .listxattr = bch_xattr_list, + .removexattr = generic_removexattr, + .get_acl = bch_get_acl, + .set_acl = bch_set_acl, +}; + +static const struct inode_operations bch_special_inode_operations = { + .setattr = bch_setattr, + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, + .listxattr = bch_xattr_list, + .removexattr = generic_removexattr, + .get_acl = bch_get_acl, + .set_acl = bch_set_acl, +}; + +static int bch_bio_add_page(struct bio *bio, struct page *page) +{ + sector_t offset = (sector_t) page->index << (PAGE_SHIFT - 9); + + if (!bio->bi_vcnt) { + bio->bi_iter.bi_sector = offset; + } else if (bio_end_sector(bio) != offset || + bio->bi_vcnt == bio->bi_max_vecs) + return -1; + + bio->bi_io_vec[bio->bi_vcnt++] = (struct bio_vec) { + .bv_page = page, + .bv_len = PAGE_SIZE, + .bv_offset = 0, + }; + + bio->bi_iter.bi_size += PAGE_SIZE; + + return 0; +} + +static void bch_readpages_end_io(struct bio *bio) +{ + struct bio_vec *bv; + int i; + + bio_for_each_segment_all(bv, bio, i) { + struct page *page = bv->bv_page; + + if (!bio->bi_error) { + SetPageUptodate(page); + } else { + ClearPageUptodate(page); + SetPageError(page); + } + unlock_page(page); + } + + bio_put(bio); +} + +static int bch_readpages(struct file *file, struct address_space *mapping, + struct list_head *pages, unsigned nr_pages) +{ + struct inode *inode = mapping->host; + struct cache_set *c = inode->i_sb->s_fs_info; + struct bio *bio = NULL; + struct page *page; + ssize_t ret; + + pr_debug("reading %u pages", nr_pages); + + while (nr_pages) { + page = list_entry(pages->prev, struct page, lru); + prefetchw(&page->flags); + list_del(&page->lru); + + if (!add_to_page_cache_lru(page, mapping, + page->index, GFP_NOFS)) { +again: + if (!bio) { + bio = bio_alloc(GFP_NOFS, + min_t(unsigned, nr_pages, + BIO_MAX_PAGES)); + + bio->bi_end_io = bch_readpages_end_io; + } + + if (bch_bio_add_page(bio, page)) { + ret = bch_read(c, bio, inode->i_ino); + bio_endio(bio); + bio = NULL; + + if (ret < 0) { + pr_debug("error %zi", ret); + return ret; + } + goto again; + } + } + + nr_pages--; + put_page(page); + } + + if (bio) { + ret = bch_read(c, bio, inode->i_ino); + bio_endio(bio); + + if (ret < 0) { + pr_debug("error %zi", ret); + return ret; + } + } + + pr_debug("success"); + return 0; +} + +static int bch_readpage(struct file *file, struct page *page) +{ + struct address_space *mapping = page->mapping; + struct inode *inode = mapping->host; + struct cache_set *c = inode->i_sb->s_fs_info; + struct bio *bio; + int ret; + + bio = bio_alloc(GFP_NOFS, 1); + bio_set_op_attrs(bio, REQ_OP_READ, REQ_SYNC); + bio->bi_end_io = bch_readpages_end_io; + + bch_bio_add_page(bio, page); + + ret = bch_read(c, bio, inode->i_ino); + bio_endio(bio); + + return ret; +} + +struct bch_writepage_io { + struct closure cl; + struct bch_write_op op; + struct bbio bio; +}; + +struct bch_writepage { + struct cache_set *c; + u64 inum; + struct bch_writepage_io *io; +}; + +static void bch_writepage_io_free(struct closure *cl) +{ + struct bch_writepage_io *io = container_of(cl, + struct bch_writepage_io, cl); + struct cache_set *c = 
io->op.c; + struct inode *inode = io->bio.bio.bi_io_vec[0].bv_page->mapping->host; + struct bch_inode_info *ei = to_bch_ei(inode); + struct bio_vec *bvec; + int i; + + bio_for_each_segment_all(bvec, &io->bio.bio, i) { + struct page *page = bvec->bv_page; + + BUG_ON(!PageWriteback(page)); + + if (io->bio.bio.bi_error) { + SetPageError(page); + if (page->mapping) + set_bit(AS_EIO, &page->mapping->flags); + } + + bch_clear_page_bits(c, ei, page); + end_page_writeback(page); + } + + bio_put(&io->bio.bio); +} + +static void bch_writepage_do_io(struct bch_writepage_io *io) +{ + pr_debug("writing %u sectors to %llu:%llu", + bio_sectors(&io->bio.bio), + io->op.insert_key.k.p.inode, + (u64) io->bio.bio.bi_iter.bi_sector); + + closure_call(&io->op.cl, bch_write, NULL, &io->cl); + closure_return_with_destructor(&io->cl, bch_writepage_io_free); +} + +static int __bch_writepage(struct page *page, struct writeback_control *wbc, + void *data) +{ + struct inode *inode = page->mapping->host; + struct bch_inode_info *ei = to_bch_ei(inode); + struct bch_writepage *w = data; + struct bio *bio; + unsigned offset; + loff_t i_size = i_size_read(inode); + pgoff_t end_index = i_size >> PAGE_SHIFT; + + /* Is the page fully inside i_size? */ + if (page->index < end_index) + goto do_io; + + /* Is the page fully outside i_size? (truncate in progress) */ + offset = i_size & (PAGE_SIZE - 1); + if (page->index > end_index || !offset) { + unlock_page(page); + return 0; + } + + /* + * The page straddles i_size. It must be zeroed out on each and every + * writepage invocation because it may be mmapped. "A file is mapped + * in multiples of the page size. For a file that is not a multiple of + * the page size, the remaining memory is zeroed when mapped, and + * writes to that region are not written out to the file." + */ + zero_user_segment(page, offset, PAGE_SIZE); +do_io: + /* XXX: how we gonna make this synchronization efficient? 
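(we take update_lock, and possibly rewrite the inode, for every page written) 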
*/
+	mutex_lock(&ei->update_lock);
+
+	if (ei->inode.v.i_size < i_size &&
+	    page->index >= (ei->inode.v.i_size >> PAGE_SHIFT) &&
+	    !(ei->inode.v.i_flags & BCH_INODE_I_SIZE_DIRTY)) {
+		ei->inode.v.i_flags |= BCH_INODE_I_SIZE_DIRTY;
+		__bch_write_inode(inode);
+	}
+
+	mutex_unlock(&ei->update_lock);
+
+	if (!w->io) {
+		bio = bio_alloc_bioset(GFP_NOFS, BIO_MAX_PAGES, bch_fs_bioset);
+		w->io = container_of(bio, struct bch_writepage_io, bio.bio);
+
+		closure_init(&w->io->cl, NULL);
+		bch_write_op_init(&w->io->op, w->c, bio, NULL,
+				  bkey_to_s_c(&KEY(w->inum, 0, 0)),
+				  bkey_s_c_null, 0);
+		w->io->op.journal_seq = &ei->journal_seq;
+	}
+
+	if (bch_bio_add_page(&w->io->bio.bio, page)) {
+		bch_writepage_do_io(w->io);
+		w->io = NULL;
+		goto do_io;
+	}
+
+	BUG_ON(PageWriteback(page));
+	set_page_writeback(page);
+	unlock_page(page);
+
+	return 0;
+}
+
+static int bch_writepages(struct address_space *mapping,
+			  struct writeback_control *wbc)
+{
+	int ret;
+	struct bch_writepage w = {
+		.c	= mapping->host->i_sb->s_fs_info,
+		.inum	= mapping->host->i_ino,
+		.io	= NULL,
+	};
+
+	ret = write_cache_pages(mapping, wbc, __bch_writepage, &w);
+
+	if (w.io)
+		bch_writepage_do_io(w.io);
+
+	return ret;
+}
+
+static int bch_writepage(struct page *page, struct writeback_control *wbc)
+{
+	struct inode *inode = page->mapping->host;
+	struct bch_writepage w = {
+		.c	= inode->i_sb->s_fs_info,
+		.inum	= inode->i_ino,
+		.io	= NULL,
+	};
+
+	__bch_writepage(page, NULL, &w);
+	if (w.io)
+		bch_writepage_do_io(w.io);
+
+	return 0;
+}
+
+static void bch_read_single_page_end_io(struct bio *bio)
+{
+	complete(bio->bi_private);
+}
+
+static int bch_read_single_page(struct page *page,
+				struct address_space *mapping)
+{
+	struct inode *inode = mapping->host;
+	struct cache_set *c = inode->i_sb->s_fs_info;
+	struct bio *bio;
+	int ret;
+	DECLARE_COMPLETION_ONSTACK(done);
+
+	bio = bio_alloc(GFP_NOFS, 1);
+	bio_set_op_attrs(bio, REQ_OP_READ, REQ_SYNC);
+	bio->bi_private = &done;
+	bio->bi_end_io = bch_read_single_page_end_io;
+	bch_bio_add_page(bio, page);
+
+	ret = bch_read(c, bio, inode->i_ino);
+	bio_endio(bio);
+	wait_for_completion(&done);
+
+	if (!ret)
+		ret = bio->bi_error;
+	bio_put(bio);
+
+	if (ret < 0)
+		return ret;
+
+	SetPageUptodate(page);
+
+	return 0;
+}
+
+static int bch_write_begin(struct file *file, struct address_space *mapping,
+			   loff_t pos, unsigned len, unsigned flags,
+			   struct page **pagep, void **fsdata)
+{
+	struct inode *inode = mapping->host;
+	struct cache_set *c = inode->i_sb->s_fs_info;
+	pgoff_t index = pos >> PAGE_SHIFT;
+	struct page *page;
+	int ret = 0;
+
+	BUG_ON(inode_unhashed(mapping->host));
+
+	page = grab_cache_page_write_begin(mapping, index, flags);
+	if (!page)
+		return -ENOMEM;
+
+	if (!PageAllocated(page)) {
+		if (reserve_sectors(c, PAGE_SECTORS)) {
+			ret = -ENOSPC;
+			goto err;
+		}
+
+		SetPageAllocated(page);
+	}
+
+	if (PageUptodate(page))
+		goto out;
+
+	/* If we're writing the entire page, we don't need to read it in first: */
+	if (len == PAGE_SIZE)
+		goto out;
+
+	if (pos + len >= inode->i_size) {
+		unsigned offset = pos & (PAGE_SIZE - 1);
+
+		/*
+		 * If the write extends past i_size, the top part of the page
+		 * we're not writing to doesn't need to be read in, just zeroed:
+		 */
+		zero_user(page, offset + len, PAGE_SIZE - offset - len);
+		flush_dcache_page(page);
+
+		if (!offset)
+			goto out;
+
+		/*
+		 * If the start of the page is past i_size, zero that part too:
+		 */
+		if ((index << PAGE_SHIFT) >= inode->i_size) {
+			zero_user(page, 0, offset);
+			flush_dcache_page(page);
+			goto out;
+		}
+	}
+
+	ret 
= bch_read_single_page(page, mapping); + if (ret) + goto err; +out: + *pagep = page; + return ret; +err: + unlock_page(page); + put_page(page); + page = NULL; + goto out; +} + +static int bch_write_end(struct file *filp, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) +{ + loff_t last_pos = pos + copied; + struct inode *inode = page->mapping->host; + struct bch_inode_info *ei = to_bch_ei(inode); + + /* + * can't set a page dirty without i_rwsem, to avoid racing with truncate + */ + lockdep_assert_held(&inode->i_rwsem); + + if (unlikely(copied < len)) { +#if 0 + if (!PageUptodate(page)) { + /* we skipped reading in the page before, read it now.. */ + } +#endif + + /* + * zero out the rest of the area + */ + unsigned from = pos & (PAGE_SIZE - 1); + + zero_user(page, from + copied, len - copied); + flush_dcache_page(page); + } + + if (!PageUptodate(page)) + SetPageUptodate(page); + if (!PageDirty(page)) + set_page_dirty(page); + + if (last_pos > inode->i_size) { + mutex_lock(&ei->update_lock); + + if (!TestSetPageAppend(page)) + atomic_long_inc(&ei->append_count); + + i_size_write(inode, last_pos); + mark_inode_dirty(inode); + + mutex_unlock(&ei->update_lock); + } + + unlock_page(page); + put_page(page); + + return copied; +} + +static void bch_invalidatepage(struct page *page, unsigned int offset, + unsigned int length) +{ + struct inode *inode = page->mapping->host; + struct bch_inode_info *ei = to_bch_ei(inode); + struct cache_set *c = inode->i_sb->s_fs_info; + + BUG_ON(!PageLocked(page)); + BUG_ON(PageWriteback(page)); + + if (offset || length < PAGE_SIZE) + return; + + bch_clear_page_bits(c, ei, page); +} + +static int bch_releasepage(struct page *page, gfp_t gfp_mask) +{ + struct inode *inode = page->mapping->host; + struct bch_inode_info *ei = to_bch_ei(inode); + struct cache_set *c = inode->i_sb->s_fs_info; + + BUG_ON(!PageLocked(page)); + BUG_ON(PageWriteback(page)); + + bch_clear_page_bits(c, ei, page); + + if (PageDirty(page)) { + ClearPageDirty(page); + cancel_dirty_page(page); + } + + return 1; +} + +/* O_DIRECT */ + +static struct bio_set *bch_dio_read_bioset; + +struct dio_read { + struct closure cl; + struct kiocb *req; + long ret; + struct bio bio; +}; + +static void bch_dio_read_complete(struct closure *cl) +{ + struct dio_read *dio = container_of(cl, struct dio_read, cl); + + dio->req->ki_complete(dio->req, dio->ret, 0); + bio_put(&dio->bio); +} + +static void bch_direct_IO_read_endio(struct bio *bio) +{ + struct dio_read *dio = bio->bi_private; + + if (bio->bi_error) + dio->ret = bio->bi_error; + + closure_put(&dio->cl); + bio_check_pages_dirty(bio); /* transfers ownership */ +} + +static int bch_direct_IO_read(struct cache_set *c, struct kiocb *req, + struct file *file, struct inode *inode, + struct iov_iter *iter, loff_t offset) +{ + struct dio_read *dio; + struct bio *bio; + unsigned long inum = inode->i_ino; + ssize_t ret = 0; + size_t pages = iov_iter_npages(iter, BIO_MAX_PAGES); + loff_t i_size; + + bio = bio_alloc_bioset(GFP_KERNEL, pages, bch_dio_read_bioset); + bio_get(bio); + + dio = container_of(bio, struct dio_read, bio); + closure_init(&dio->cl, NULL); + dio->req = req; + dio->ret = iter->count; + + i_size = i_size_read(inode); + if (offset + dio->ret > i_size) { + dio->ret = max_t(loff_t, 0, i_size - offset); + iter->count = round_up(dio->ret, PAGE_SIZE); + } + + if (!dio->ret) + goto out; + + goto start; + while (iter->count && !ret) { + pages = iov_iter_npages(iter, BIO_MAX_PAGES); + bio = 
bio_alloc(GFP_KERNEL, pages); +start: + bio->bi_iter.bi_sector = offset >> 9; + bio->bi_end_io = bch_direct_IO_read_endio; + bio->bi_private = dio; + + ret = bio_get_user_pages(bio, iter, 1); + if (ret < 0) { + dio->ret = ret; + bio_put(bio); + break; + } + + offset += bio->bi_iter.bi_size; + bio_set_pages_dirty(bio); + + closure_get(&dio->cl); + ret = bch_read(c, bio, inum); + if (ret) + bio->bi_error = ret; + bio_endio(bio); + } +out: + if (is_sync_kiocb(req)) { + closure_sync(&dio->cl); + closure_debug_destroy(&dio->cl); + ret = dio->ret; + bio_put(&dio->bio); + return ret; + } else { + closure_return_with_destructor_noreturn(&dio->cl, + bch_dio_read_complete); + return -EIOCBQUEUED; + } +} + +struct dio_write { + struct closure cl; + struct kiocb *req; + long ret; + bool append; +}; + +struct dio_write_bio { + struct closure cl; + struct dio_write *dio; + struct bch_write_op iop; + struct bbio bio; +}; + +static void __bch_dio_write_complete(struct dio_write *dio) +{ + struct bch_inode_info *ei = to_bch_ei(dio->req->ki_filp->f_inode); + + if (dio->append) + bch_append_put(ei); + inode_dio_end(dio->req->ki_filp->f_inode); + kfree(dio); +} + +static void bch_dio_write_complete(struct closure *cl) +{ + struct dio_write *dio = container_of(cl, struct dio_write, cl); + struct kiocb *req = dio->req; + long ret = dio->ret; + + __bch_dio_write_complete(dio); + req->ki_complete(req, ret, 0); +} + +static void bch_direct_IO_write_done(struct closure *cl) +{ + struct dio_write_bio *op = container_of(cl, + struct dio_write_bio, cl); + struct bio_vec *bv; + int i; + + if (op->iop.error) + op->dio->ret = op->iop.error; + closure_put(&op->dio->cl); + + bio_for_each_segment_all(bv, &op->bio.bio, i) + put_page(bv->bv_page); + kfree(op); +} + +static int bch_direct_IO_write(struct cache_set *c, struct kiocb *req, + struct file *file, struct inode *inode, + struct iov_iter *iter, loff_t offset) +{ + struct bch_inode_info *ei = to_bch_ei(inode); + struct dio_write *dio; + struct dio_write_bio *op; + struct bio *bio; + unsigned long inum = inode->i_ino; + unsigned flags = BCH_WRITE_CHECK_ENOSPC; + ssize_t ret = 0; + + lockdep_assert_held(&inode->i_rwsem); + + if (file->f_flags & O_DSYNC || IS_SYNC(file->f_mapping->host)) + flags |= BCH_WRITE_FLUSH; + + dio = kmalloc(sizeof(*dio), GFP_NOIO); + if (!dio) + return -ENOMEM; + + closure_init(&dio->cl, NULL); + dio->req = req; + dio->ret = iter->count; + dio->append = false; + + if (offset + iter->count > inode->i_size) { + dio->append = true; + atomic_long_inc(&ei->append_count); + + mutex_lock(&ei->update_lock); + if (!(ei->inode.v.i_flags & BCH_INODE_I_SIZE_DIRTY)) { + ei->inode.v.i_flags |= BCH_INODE_I_SIZE_DIRTY; + __bch_write_inode(inode); + } + mutex_unlock(&ei->update_lock); + } + + /* Decremented by inode_dio_done(): */ + atomic_inc(&inode->i_dio_count); + + while (iter->count) { + size_t pages = iov_iter_npages(iter, BIO_MAX_PAGES); + + op = kmalloc(sizeof(*op) + sizeof(struct bio_vec) * pages, + GFP_NOIO); + if (!op) { + dio->ret = -ENOMEM; + break; + } + + bio = &op->bio.bio; + bio_init(bio); + bio->bi_iter.bi_sector = offset >> 9; + bio->bi_max_vecs = pages; + bio->bi_io_vec = bio->bi_inline_vecs; + + ret = bio_get_user_pages(bio, iter, 0); + if (ret < 0) { + dio->ret = ret; + kfree(op); + break; + } + + offset += bio->bi_iter.bi_size; + closure_get(&dio->cl); + op->dio = dio; + closure_init(&op->cl, NULL); + + bch_write_op_init(&op->iop, c, bio, NULL, + bkey_to_s_c(&KEY(inum, + bio_end_sector(bio), + bio_sectors(bio))), + bkey_s_c_null, flags); + 
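/* bcache extent keys index by where the extent ends: offset = bio_end_sector(), size = bio_sectors() */ + 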
op->iop.journal_seq = &ei->journal_seq; + + task_io_account_write(bio->bi_iter.bi_size); + + closure_call(&op->iop.cl, bch_write, NULL, &op->cl); + closure_return_with_destructor_noreturn(&op->cl, + bch_direct_IO_write_done); + } + + if (is_sync_kiocb(req) || dio->append) { + /* + * appends are sync in order to do the i_size update under + * i_rwsem, after we know the write has completed successfully + */ + closure_sync(&dio->cl); + closure_debug_destroy(&dio->cl); + ret = dio->ret; + + if (ret > 0 && + offset > inode->i_size) { + i_size_write(inode, offset); + mark_inode_dirty(inode); + } + + __bch_dio_write_complete(dio); + return ret; + } else { + closure_return_with_destructor_noreturn(&dio->cl, + bch_dio_write_complete); + return -EIOCBQUEUED; + } +} + +static ssize_t bch_direct_IO(struct kiocb *req, struct iov_iter *iter) +{ + struct file *file = req->ki_filp; + struct inode *inode = file->f_inode; + struct cache_set *c = inode->i_sb->s_fs_info; + + if ((req->ki_pos|iter->count) & (block_bytes(c) - 1)) + return -EINVAL; + + return ((iov_iter_rw(iter) == WRITE) + ? bch_direct_IO_write + : bch_direct_IO_read)(c, req, file, inode, iter, req->ki_pos); +} + +#ifdef CONFIG_MIGRATION +static int bch_migrate_page(struct address_space *mapping, + struct page *newpage, struct page *page, + enum migrate_mode mode) +{ + int ret; + + ret = migrate_page_move_mapping(mapping, newpage, page, NULL, mode, 0); + if (ret != MIGRATEPAGE_SUCCESS) + return ret; + + if (PageAllocated(page)) { + ClearPageAllocated(page); + SetPageAllocated(newpage); + } + + if (PageAppend(page)) { + ClearPageAppend(page); + SetPageAppend(newpage); + } + + migrate_page_copy(newpage, page); + return MIGRATEPAGE_SUCCESS; +} +#endif + +static const struct address_space_operations bch_address_space_operations = { + .writepage = bch_writepage, + .readpage = bch_readpage, + .writepages = bch_writepages, + .readpages = bch_readpages, + + .set_page_dirty = __set_page_dirty_nobuffers, + + .write_begin = bch_write_begin, + .write_end = bch_write_end, + .invalidatepage = bch_invalidatepage, + .releasepage = bch_releasepage, + + .direct_IO = bch_direct_IO, + +#ifdef CONFIG_MIGRATION + .migratepage = bch_migrate_page, +#endif + .error_remove_page = generic_error_remove_page, +}; + +static void bch_inode_init(struct bch_inode_info *ei) +{ + struct inode *inode = &ei->vfs_inode; + struct bch_inode *bi = &ei->inode.v; + + pr_debug("init inode %llu with mode %o", + ei->inode.k.p.inode, bi->i_mode); + + BUG_ON(atomic_long_read(&ei->append_count)); + + inode->i_mode = bi->i_mode; + i_uid_write(inode, bi->i_uid); + i_gid_write(inode, bi->i_gid); + + inode->i_ino = ei->inode.k.p.inode; + set_nlink(inode, bi->i_nlink); + inode->i_rdev = bi->i_dev; + inode->i_size = bi->i_size; + inode->i_atime = ns_to_timespec(bi->i_atime); + inode->i_mtime = ns_to_timespec(bi->i_mtime); + inode->i_ctime = ns_to_timespec(bi->i_ctime); + bch_set_inode_flags(inode); + + inode->i_mapping->a_ops = &bch_address_space_operations; + + switch (inode->i_mode & S_IFMT) { + case S_IFREG: + inode->i_op = &bch_file_inode_operations; + inode->i_fop = &bch_file_operations; + break; + case S_IFDIR: + inode->i_op = &bch_dir_inode_operations; + inode->i_fop = &bch_dir_file_operations; + break; + case S_IFLNK: + inode_nohighmem(inode); + inode->i_op = &bch_symlink_inode_operations; + break; + default: + init_special_inode(inode, inode->i_mode, inode->i_rdev); + inode->i_op = &bch_special_inode_operations; + break; + } +} + +static struct inode *bch_alloc_inode(struct super_block 
*sb) +{ + struct bch_inode_info *ei; + + ei = kmem_cache_alloc(bch_inode_cache, GFP_NOFS); + if (!ei) + return NULL; + + pr_debug("allocated %p", &ei->vfs_inode); + + inode_init_once(&ei->vfs_inode); + mutex_init(&ei->update_lock); + ei->journal_seq = 0; + atomic_long_set(&ei->append_count, 0); + + return &ei->vfs_inode; +} + +static void bch_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + + kmem_cache_free(bch_inode_cache, to_bch_ei(inode)); +} + +static void bch_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, bch_i_callback); +} + +static int bch_write_inode(struct inode *inode, struct writeback_control *wbc) +{ + struct cache_set *c = inode->i_sb->s_fs_info; + struct bch_inode_info *ei = to_bch_ei(inode); + int ret; + + mutex_lock(&ei->update_lock); + ret = __bch_write_inode(inode); + mutex_unlock(&ei->update_lock); + + if (!ret && wbc->sync_mode == WB_SYNC_ALL) { + struct closure cl; + + closure_init_stack(&cl); + bch_journal_push_seq(&c->journal, ei->journal_seq, &cl); + closure_sync(&cl); + } + + return ret; +} + +static void bch_evict_inode(struct inode *inode) +{ + struct cache_set *c = inode->i_sb->s_fs_info; + struct bch_inode_info *ei = to_bch_ei(inode); + + if (inode->i_nlink) { + truncate_inode_pages_final(&inode->i_data); + + mutex_lock(&ei->update_lock); + BUG_ON(atomic_long_read(&ei->append_count)); + + if (!(inode->i_state & I_NEW) && + (ei->inode.v.i_flags & BCH_INODE_I_SIZE_DIRTY || + inode->i_size != ei->inode.v.i_size)) + __bch_write_inode(inode); + mutex_unlock(&ei->update_lock); + + clear_inode(inode); + } else if (!bkey_deleted(&ei->inode.k)) { + atomic_long_inc(&ei->append_count); + + mutex_lock(&ei->update_lock); + ei->inode.v.i_flags |= BCH_INODE_I_SIZE_DIRTY; + ei->inode.v.i_size = 0; + i_size_write(inode, 0); + __bch_write_inode(inode); + mutex_unlock(&ei->update_lock); + + truncate_inode_pages_final(&inode->i_data); + clear_inode(inode); + + /* + * write_inode() shouldn't be called again - this will cause it + * to BUG(): + */ + ei->inode.k.type = KEY_TYPE_DELETED; + atomic_long_dec_bug(&ei->append_count); + + bch_inode_rm(c, inode->i_ino); + atomic_long_dec(&c->nr_inodes); + } else { + /* bch_inode_create() failed: */ + clear_inode(inode); + } +} + +static int bch_statfs(struct dentry *dentry, struct kstatfs *buf) +{ + struct super_block *sb = dentry->d_sb; + struct cache_set *c = sb->s_fs_info; + + buf->f_type = BCACHE_STATFS_MAGIC; + buf->f_bsize = sb->s_blocksize; + buf->f_blocks = c->capacity >> (PAGE_SHIFT - 9); + buf->f_bfree = (c->capacity - cache_set_sectors_used(c)) >> + (PAGE_SHIFT - 9); + buf->f_bavail = buf->f_bfree; + buf->f_files = atomic_long_read(&c->nr_inodes); + buf->f_namelen = NAME_MAX; + + return 0; +} + +static int bch_sync_fs(struct super_block *sb, int wait) +{ + struct cache_set *c = sb->s_fs_info; + struct closure cl; + + closure_init_stack(&cl); + + /* XXX: should only push a journal write if it's dirty */ + bch_journal_flush(&c->journal, wait ? 
&cl : NULL); + closure_sync(&cl); + return 0; +} + +static const struct super_operations bch_super_operations = { + .alloc_inode = bch_alloc_inode, + .destroy_inode = bch_destroy_inode, + .write_inode = bch_write_inode, + .evict_inode = bch_evict_inode, + .sync_fs = bch_sync_fs, + .statfs = bch_statfs, + .show_options = generic_show_options, +#if 0 + .put_super = bch_put_super, + .freeze_fs = bch_freeze, + .unfreeze_fs = bch_unfreeze, + .remount_fs = bch_remount, +#endif +}; + +static struct cache_set *bch_open_as_blockdevs(const char *_dev_name) +{ + size_t nr_devs = 0, i = 0; + char *dev_name, *s, **devs; + struct cache_set *c = NULL; + const char *err; + + dev_name = kstrdup(_dev_name, GFP_KERNEL); + if (!dev_name) + return NULL; + + for (s = dev_name; s; s = strchr(s + 1, ':')) + nr_devs++; + + devs = kcalloc(nr_devs, sizeof(const char *), GFP_KERNEL); + if (!devs) + goto out; + + for (i = 0, s = dev_name; + s; + (s = strchr(s, ':')) && (*s++ = '\0')) + devs[i++] = s; + + err = bch_register_cache_set(devs, nr_devs, &c); + if (err) { + pr_err("register_cache_set err %s", err); + goto out; + } + + set_bit(CACHE_SET_BDEV_MOUNTED, &c->flags); +out: + kfree(devs); + kfree(dev_name); + + return c; +} + +enum { + Opt_err_cont, Opt_err_panic, Opt_err_ro, + Opt_user_xattr, Opt_nouser_xattr, + Opt_acl, Opt_noacl, + Opt_err +}; + +static const match_table_t tokens = { + {Opt_err_cont, "errors=continue"}, + {Opt_err_panic, "errors=panic"}, + {Opt_err_ro, "errors=remount-ro"}, + {Opt_user_xattr, "user_xattr"}, + {Opt_nouser_xattr, "nouser_xattr"}, + {Opt_acl, "acl"}, + {Opt_noacl, "noacl"}, + {Opt_err, NULL} +}; + +static int parse_options(struct cache_set *c, struct super_block *sb, + char *options) +{ + char *p; + substring_t args[MAX_OPT_ARGS]; + + if (!options) + return 1; + + while ((p = strsep(&options, ",")) != NULL) { + int token; + + if (!*p) + continue; + + token = match_token(p, tokens, args); + switch (token) { + case Opt_err_panic: + /* + * XXX: this will get written to the superblock, don't + * want this option to be persistent + */ + SET_CACHE_ERROR_ACTION(&c->sb, BCH_ON_ERROR_PANIC); + break; + case Opt_err_ro: + SET_CACHE_ERROR_ACTION(&c->sb, BCH_ON_ERROR_RO); + break; + case Opt_err_cont: + SET_CACHE_ERROR_ACTION(&c->sb, BCH_ON_ERROR_CONTINUE); + break; + case Opt_user_xattr: + case Opt_nouser_xattr: + break; + case Opt_acl: + sb->s_flags |= MS_POSIXACL; + break; + case Opt_noacl: + sb->s_flags &= ~MS_POSIXACL; + break; + default: + return 0; + } + } + return 1; +} + +static struct dentry *bch_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) +{ + struct cache_set *c; + struct super_block *sb; + struct inode *inode; + int ret; + + c = bch_open_as_blockdevs(dev_name); + if (!c) + return ERR_PTR(-ENOENT); + + sb = sget(fs_type, NULL, set_anon_super, flags, NULL); + if (IS_ERR(sb)) { + ret = PTR_ERR(sb); + goto err; + } + + /* XXX: */ + sb->s_blocksize = PAGE_SIZE; + sb->s_blocksize_bits = PAGE_SHIFT; + sb->s_maxbytes = MAX_LFS_FILESIZE; + sb->s_op = &bch_super_operations; + sb->s_xattr = bch_xattr_handlers; + sb->s_magic = BCACHE_STATFS_MAGIC; + sb->s_time_gran = 1; + sb->s_fs_info = c; + + sb->s_flags |= MS_POSIXACL; + + /* XXX */ + sb->s_bdev = c->cache[0]->disk_sb.bdev; + sb->s_bdi = &c->bdi; + + if (!parse_options(c, sb, (char *) data)) { + ret = -EINVAL; + goto err_put_super; + } + + inode = bch_vfs_inode_get(sb, BCACHE_ROOT_INO); + if (IS_ERR(inode)) { + ret = PTR_ERR(inode); + goto err_put_super; + } + + sb->s_root = d_make_root(inode); + 
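/* on failure, d_make_root() drops our inode reference for us: */ + 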
if (!sb->s_root) { + ret = -ENOMEM; + goto err_put_super; + } + + sb->s_flags |= MS_ACTIVE; + return dget(sb->s_root); + +err_put_super: + deactivate_locked_super(sb); +err: + closure_put(&c->cl); + return ERR_PTR(ret); +} + +static void bch_kill_sb(struct super_block *sb) +{ + struct cache_set *c = sb->s_fs_info; + + generic_shutdown_super(sb); + + if (test_bit(CACHE_SET_BDEV_MOUNTED, &c->flags)) { + DECLARE_COMPLETION_ONSTACK(complete); + + c->stop_completion = &complete; + bch_cache_set_stop(c); + closure_put(&c->cl); + + /* Killable? */ + wait_for_completion(&complete); + } else + closure_put(&c->cl); +} + +static struct file_system_type bcache_fs_type = { + .owner = THIS_MODULE, + .name = "bcache", + .mount = bch_mount, + .kill_sb = bch_kill_sb, +}; + +MODULE_ALIAS_FS("bcache"); + +void bch_fs_exit(void) +{ + unregister_filesystem(&bcache_fs_type); + if (bch_dio_read_bioset) + bioset_free(bch_dio_read_bioset); + if (bch_fs_bioset) + bioset_free(bch_fs_bioset); + if (bch_inode_cache) + kmem_cache_destroy(bch_inode_cache); +} + +int __init bch_fs_init(void) +{ + int ret = -ENOMEM; + + bch_inode_cache = KMEM_CACHE(bch_inode_info, 0); + if (!bch_inode_cache) + goto err; + + bch_fs_bioset = bioset_create(4, + offsetof(struct bch_writepage_io, bio.bio)); + if (!bch_fs_bioset) + goto err; + + + bch_dio_read_bioset = bioset_create(4, offsetof(struct dio_read, bio)); + if (!bch_dio_read_bioset) + goto err; + + ret = register_filesystem(&bcache_fs_type); + if (ret) + goto err; + + return 0; +err: + bch_fs_exit(); + return ret; +} diff --git a/drivers/md/bcache/fs.h b/drivers/md/bcache/fs.h new file mode 100644 index 000000000000..9e78cf8189bc --- /dev/null +++ b/drivers/md/bcache/fs.h @@ -0,0 +1,20 @@ +#ifndef _BCACHE_FS_H +#define _BCACHE_FS_H + +struct bch_inode_info { + struct bkey_i_inode inode; + struct inode vfs_inode; + struct mutex update_lock; + u64 journal_seq; + atomic_long_t append_count; +}; + +#define to_bch_ei(_inode) \ + container_of(_inode, struct bch_inode_info, vfs_inode) + +static inline u8 mode_to_type(umode_t mode) +{ + return (mode >> 12) & 15; +} + +#endif /* _BCACHE_FS_H */ diff --git a/drivers/md/bcache/inode.c b/drivers/md/bcache/inode.c index 5e458258eaa8..ba6863ec5d01 100644 --- a/drivers/md/bcache/inode.c +++ b/drivers/md/bcache/inode.c @@ -162,14 +162,33 @@ int bch_inode_truncate(struct cache_set *c, u64 inode_nr, u64 new_size) int bch_inode_rm(struct cache_set *c, u64 inode_nr) { + struct btree_iter iter; + struct bkey_s_c k; struct bkey_i delete; int ret; - ret = bch_discard(c, POS(inode_nr, 0), - POS(inode_nr + 1, 0), 0); + ret = bch_inode_truncate(c, inode_nr, 0); if (ret < 0) return ret; + for_each_btree_key_intent(&iter, c, BTREE_ID_XATTRS, + POS(inode_nr, 0), k) { + if (k.k->p.inode > inode_nr) + break; + + bkey_init(&delete.k); + delete.k.p = k.k->p; + + ret = bch_btree_insert_at(&iter, &keylist_single(&delete), + NULL, NULL, 0); + if (ret) { + bch_btree_iter_unlock(&iter); + return ret; + } + + } + bch_btree_iter_unlock(&iter); + bkey_init(&delete.k); delete.k.p.inode = inode_nr; @@ -179,6 +198,33 @@ int bch_inode_rm(struct cache_set *c, u64 inode_nr) BTREE_INSERT_NOFAIL); } +int bch_inode_find_by_inum(struct cache_set *c, u64 inode_nr, + struct bkey_i_inode *inode) +{ + struct btree_iter iter; + struct bkey_s_c k; + int ret = -ENOENT; + + for_each_btree_key_with_holes(&iter, c, BTREE_ID_INODES, + POS(inode_nr, 0), k) { + switch (k.k->type) { + case BCH_INODE_FS: + ret = 0; + bkey_reassemble(&inode->k_i, k); + break; + default: + /* hole, not found */ + 
break; + } + + break; + + } + bch_btree_iter_unlock(&iter); + + return ret; +} + int bch_blockdev_inode_find_by_uuid(struct cache_set *c, uuid_le *uuid, struct bkey_i_inode_blockdev *ret) { diff --git a/drivers/md/bcache/inode.h b/drivers/md/bcache/inode.h index 6561e1e71ee6..dc1c26f8240f 100644 --- a/drivers/md/bcache/inode.h +++ b/drivers/md/bcache/inode.h @@ -17,6 +17,7 @@ static inline int bch_inode_update(struct cache_set *c, struct bkey_i *inode, cl, journal_seq); } +int bch_inode_find_by_inum(struct cache_set *, u64, struct bkey_i_inode *); int bch_blockdev_inode_find_by_uuid(struct cache_set *, uuid_le *, struct bkey_i_inode_blockdev *); diff --git a/drivers/md/bcache/siphash.c b/drivers/md/bcache/siphash.c new file mode 100644 index 000000000000..0c6f7f3ec819 --- /dev/null +++ b/drivers/md/bcache/siphash.c @@ -0,0 +1,185 @@ +/* $OpenBSD: siphash.c,v 1.3 2015/02/20 11:51:03 tedu Exp $ */ + +/*- + * Copyright (c) 2013 Andre Oppermann <andre@FreeBSD.org> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The name of the author may not be used to endorse or promote + * products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * SipHash is a family of PRFs SipHash-c-d where the integer parameters c and d + * are the number of compression rounds and the number of finalization rounds. + * A compression round is identical to a finalization round and this round + * function is called SipRound. Given a 128-bit key k and a (possibly empty) + * byte string m, SipHash-c-d returns a 64-bit value SipHash-c-d(k; m). + * + * Implemented from the paper "SipHash: a fast short-input PRF", 2012.09.18, + * by Jean-Philippe Aumasson and Daniel J. 
Bernstein,
+ * Permanent Document ID b9a943a805fbfc6fde808af9fc0ecdfa
+ * https://131002.net/siphash/siphash.pdf
+ * https://131002.net/siphash/
+ */
+
+//#include <sys/param.h>
+//#include <sys/systm.h>
+
+#include <asm/byteorder.h>
+#include <asm/string.h>
+
+#include "siphash.h"
+
+static void SipHash_CRounds(SIPHASH_CTX *, int);
+static void SipHash_Rounds(SIPHASH_CTX *, int);
+
+void
+SipHash_Init(SIPHASH_CTX *ctx, const SIPHASH_KEY *key)
+{
+	u64 k0, k1;
+
+	k0 = le64_to_cpu(key->k0);
+	k1 = le64_to_cpu(key->k1);
+
+	ctx->v[0] = 0x736f6d6570736575ULL ^ k0;
+	ctx->v[1] = 0x646f72616e646f6dULL ^ k1;
+	ctx->v[2] = 0x6c7967656e657261ULL ^ k0;
+	ctx->v[3] = 0x7465646279746573ULL ^ k1;
+
+	memset(ctx->buf, 0, sizeof(ctx->buf));
+	ctx->bytes = 0;
+}
+
+void
+SipHash_Update(SIPHASH_CTX *ctx, int rc, int rf, const void *src, size_t len)
+{
+	const u8 *ptr = src;
+	size_t left, used;
+
+	if (len == 0)
+		return;
+
+	used = ctx->bytes % sizeof(ctx->buf);
+	ctx->bytes += len;
+
+	if (used > 0) {
+		left = sizeof(ctx->buf) - used;
+
+		if (len >= left) {
+			memcpy(&ctx->buf[used], ptr, left);
+			SipHash_CRounds(ctx, rc);
+			len -= left;
+			ptr += left;
+			used = 0;	/* buffer was just flushed */
+		} else {
+			memcpy(&ctx->buf[used], ptr, len);
+			return;
+		}
+	}
+
+	while (len >= sizeof(ctx->buf)) {
+		memcpy(ctx->buf, ptr, sizeof(ctx->buf));
+		SipHash_CRounds(ctx, rc);
+		len -= sizeof(ctx->buf);
+		ptr += sizeof(ctx->buf);
+	}
+
+	if (len > 0)
+		memcpy(&ctx->buf[used], ptr, len);
+}
+
+void
+SipHash_Final(void *dst, SIPHASH_CTX *ctx, int rc, int rf)
+{
+	u64 r;
+
+	r = SipHash_End(ctx, rc, rf);
+
+	*((__le64 *) dst) = cpu_to_le64(r);
+}
+
+u64
+SipHash_End(SIPHASH_CTX *ctx, int rc, int rf)
+{
+	u64 r;
+	size_t left, used;
+
+	used = ctx->bytes % sizeof(ctx->buf);
+	left = sizeof(ctx->buf) - used;
+	memset(&ctx->buf[used], 0, left - 1);
+	ctx->buf[7] = ctx->bytes;
+
+	SipHash_CRounds(ctx, rc);
+	ctx->v[2] ^= 0xff;
+	SipHash_Rounds(ctx, rf);
+
+	r = (ctx->v[0] ^ ctx->v[1]) ^ (ctx->v[2] ^ ctx->v[3]);
+	memset(ctx, 0, sizeof(*ctx));
+	return (r);
+}
+
+u64
+SipHash(const SIPHASH_KEY *key, int rc, int rf, const void *src, size_t len)
+{
+	SIPHASH_CTX ctx;
+
+	SipHash_Init(&ctx, key);
+	SipHash_Update(&ctx, rc, rf, src, len);
+	return (SipHash_End(&ctx, rc, rf));
+}
+
+#define SIP_ROTL(x, b) (((x) << (b)) | ((x) >> (64 - (b))))
+
+static void
+SipHash_Rounds(SIPHASH_CTX *ctx, int rounds)
+{
+	while (rounds--) {
+		ctx->v[0] += ctx->v[1];
+		ctx->v[2] += ctx->v[3];
+		ctx->v[1] = SIP_ROTL(ctx->v[1], 13);
+		ctx->v[3] = SIP_ROTL(ctx->v[3], 16);
+
+		ctx->v[1] ^= ctx->v[0];
+		ctx->v[3] ^= ctx->v[2];
+		ctx->v[0] = SIP_ROTL(ctx->v[0], 32);
+
+		ctx->v[2] += ctx->v[1];
+		ctx->v[0] += ctx->v[3];
+		ctx->v[1] = SIP_ROTL(ctx->v[1], 17);
+		ctx->v[3] = SIP_ROTL(ctx->v[3], 21);
+
+		ctx->v[1] ^= ctx->v[2];
+		ctx->v[3] ^= ctx->v[0];
+		ctx->v[2] = SIP_ROTL(ctx->v[2], 32);
+	}
+}
+
+static void
+SipHash_CRounds(SIPHASH_CTX *ctx, int rounds)
+{
+	u64 m = le64_to_cpu(*((__le64 *)ctx->buf));
+
+	ctx->v[3] ^= m;
+	SipHash_Rounds(ctx, rounds);
+	ctx->v[0] ^= m;
+}
diff --git a/drivers/md/bcache/siphash.h b/drivers/md/bcache/siphash.h
new file mode 100644
index 000000000000..7a4b2241f1e1
--- /dev/null
+++ b/drivers/md/bcache/siphash.h
@@ -0,0 +1,86 @@
+/* $OpenBSD: siphash.h,v 1.5 2015/02/20 11:51:03 tedu Exp $ */
+/*-
+ * Copyright (c) 2013 Andre Oppermann <andre@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. 
Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. The name of the author may not be used to endorse or promote
+ *    products derived from this software without specific prior written
+ *    permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+/*
+ * SipHash is a family of pseudorandom functions (a.k.a. keyed hash functions)
+ * optimized for speed on short messages, returning a 64-bit hash/digest value.
+ *
+ * The number of rounds is defined during the initialization:
+ *  SipHash24_Init() for the fast and reasonably strong version
+ *  SipHash48_Init() for the strong version (half as fast)
+ *
+ * SIPHASH_CTX ctx;
+ * SIPHASH_KEY key;	(with the 16 byte key loaded into k0/k1)
+ *
+ * SipHash24_Init(&ctx, &key);
+ * SipHash24_Update(&ctx, pointer_to_string, length_of_string);
+ * SipHash24_Final(output, &ctx);
+ */
+
+#ifndef _SIPHASH_H_
+#define _SIPHASH_H_
+
+#include <linux/types.h>
+
+#define SIPHASH_BLOCK_LENGTH	 8
+#define SIPHASH_KEY_LENGTH	16
+#define SIPHASH_DIGEST_LENGTH	 8
+
+typedef struct _SIPHASH_CTX {
+	u64		v[4];
+	u8		buf[SIPHASH_BLOCK_LENGTH];
+	u32		bytes;
+} SIPHASH_CTX;
+
+typedef struct {
+	__le64		k0;
+	__le64		k1;
+} SIPHASH_KEY;
+
+void	SipHash_Init(SIPHASH_CTX *, const SIPHASH_KEY *);
+void	SipHash_Update(SIPHASH_CTX *, int, int, const void *, size_t);
+u64	SipHash_End(SIPHASH_CTX *, int, int);
+void	SipHash_Final(void *, SIPHASH_CTX *, int, int);
+u64	SipHash(const SIPHASH_KEY *, int, int, const void *, size_t);
+
+#define SipHash24_Init(_c, _k)		SipHash_Init((_c), (_k))
+#define SipHash24_Update(_c, _p, _l)	SipHash_Update((_c), 2, 4, (_p), (_l))
+#define SipHash24_End(_d)		SipHash_End((_d), 2, 4)
+#define SipHash24_Final(_d, _c)		SipHash_Final((_d), (_c), 2, 4)
+#define SipHash24(_k, _p, _l)		SipHash((_k), 2, 4, (_p), (_l))
+
+#define SipHash48_Init(_c, _k)		SipHash_Init((_c), (_k))
+#define SipHash48_Update(_c, _p, _l)	SipHash_Update((_c), 4, 8, (_p), (_l))
+#define SipHash48_End(_d)		SipHash_End((_d), 4, 8)
+#define SipHash48_Final(_d, _c)		SipHash_Final((_d), (_c), 4, 8)
+#define SipHash48(_k, _p, _l)		SipHash((_k), 4, 8, (_p), (_l))
+
+#endif /* _SIPHASH_H_ */
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index e82dcc5ae80c..25e570253b1b 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -12,6 +12,7 @@
 #include "btree.h"
 #include "clock.h"
 #include "debug.h"
+#include "fs-gc.h"
 #include "gc.h"
 #include "inode.h"
 #include "io.h"
@@ -26,6 +27,7 @@
 #include "tier.h"
 #include "writeback.h"
 
+#include 
<linux/backing-dev.h> #include <linux/blkdev.h> #include <linux/crc32c.h> #include <linux/debugfs.h> @@ -139,6 +141,41 @@ static const char *bch_blkdev_open(const char *path, void *holder, return NULL; } +static int bch_congested_fn(void *data, int bdi_bits) +{ + struct backing_dev_info *bdi; + struct cache_set *c = data; + struct cache *ca; + unsigned i; + int ret = 0; + + rcu_read_lock(); + if (bdi_bits & (1 << WB_sync_congested)) { + /* Reads - check all devices: */ + for_each_cache_rcu(ca, c, i) { + bdi = blk_get_backing_dev_info(ca->disk_sb.bdev); + + if (bdi_congested(bdi, bdi_bits)) { + ret = 1; + break; + } + } + } else { + /* Writes only go to tier 0: */ + group_for_each_cache_rcu(ca, &c->cache_tiers[0], i) { + bdi = blk_get_backing_dev_info(ca->disk_sb.bdev); + + if (bdi_congested(bdi, bdi_bits)) { + ret = 1; + break; + } + } + } + rcu_read_unlock(); + + return ret; +} + /* Superblock */ const char *validate_cache_member(struct cache_sb *sb, @@ -601,8 +638,19 @@ static void bch_recalc_capacity(struct cache_set *c) struct cache_group *tier = c->cache_tiers + ARRAY_SIZE(c->cache_tiers); struct cache *ca; u64 capacity = 0; + unsigned long ra_pages = 0; unsigned i, j; + rcu_read_lock(); + for_each_cache_rcu(ca, c, i) { + struct backing_dev_info *bdi = + blk_get_backing_dev_info(ca->disk_sb.bdev); + + ra_pages += bdi->ra_pages; + } + + c->bdi.ra_pages = ra_pages; + /* * Capacity of the cache set is the capacity of all the devices in the * slowest (highest) tier - we don't include lower tier devices. @@ -752,6 +800,9 @@ void bch_cache_set_fail(struct cache_set *c) void bch_cache_set_release(struct kobject *kobj) { struct cache_set *c = container_of(kobj, struct cache_set, kobj); + + if (c->stop_completion) + complete(c->stop_completion); kfree(c); module_put(THIS_MODULE); } @@ -777,6 +828,7 @@ static void cache_set_free(struct closure *cl) percpu_ref_exit(&c->writes); bch_io_clock_exit(&c->io_clock[WRITE]); bch_io_clock_exit(&c->io_clock[READ]); + bdi_destroy(&c->bdi); bioset_exit(&c->btree_bio); bioset_exit(&c->bio_split); mempool_exit(&c->btree_reserve_pool); @@ -998,6 +1050,7 @@ static struct cache_set *bch_cache_set_alloc(struct cache_sb *sb) mempool_init_kmalloc_pool(&c->fill_iter, 1, iter_size) || bioset_init(&c->bio_split, 4, offsetof(struct bbio, bio)) || bioset_init(&c->btree_bio, 1, offsetof(struct bbio, bio)) || + bdi_setup_and_register(&c->bdi, "bcache") || bch_io_clock_init(&c->io_clock[READ]) || bch_io_clock_init(&c->io_clock[WRITE]) || bch_journal_alloc(&c->journal) || @@ -1005,6 +1058,10 @@ static struct cache_set *bch_cache_set_alloc(struct cache_sb *sb) bch_bset_sort_state_init(&c->sort, ilog2(btree_pages(c)))) goto err; + c->bdi.ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_SIZE; + c->bdi.congested_fn = bch_congested_fn; + c->bdi.congested_data = c; + return c; err: bch_cache_set_stop(c); @@ -1144,7 +1201,15 @@ static const char *run_cache_set(struct cache_set *c) } bch_journal_replay(c, &journal); + + err = "error gcing inode nlinks"; + if (bch_gc_inode_nlinks(c)) + goto err; + + bch_verify_inode_refs(c); } else { + struct bkey_i_inode inode; + pr_notice("invalidating existing data"); err = "unable to allocate journal buckets"; @@ -1185,6 +1250,17 @@ static const char *run_cache_set(struct cache_set *c) /* XXX: necessary? 
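(likely to force a journal write so the new btree roots are persisted before the root directory is created below) 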
*/ bch_journal_meta(&c->journal, &cl); closure_sync(&cl); + + bkey_inode_init(&inode.k_i); + inode.k.p.inode = BCACHE_ROOT_INO; + inode.v.i_mode = S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO; + inode.v.i_nlink = 2; + + err = "error creating root directory"; + if (bch_btree_insert(c, BTREE_ID_INODES, + &keylist_single(&inode.k_i), + NULL, &cl, NULL, 0)) + goto err; } bch_prio_timer_start(c, READ); @@ -2342,6 +2418,7 @@ kobj_attribute_write(reboot, reboot_test); static void bcache_exit(void) { bch_debug_exit(); + bch_fs_exit(); bch_blockdev_exit(); if (bcache_kset) kset_unregister(bcache_kset); @@ -2368,6 +2445,7 @@ static int __init bcache_init(void) !(bcache_kset = kset_create_and_add("bcache", NULL, fs_kobj)) || sysfs_create_files(&bcache_kset->kobj, files) || bch_blockdev_init() || + bch_fs_init() || bch_debug_init()) goto err; diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h index 5311afcd3a1c..0704697e762e 100644 --- a/drivers/md/bcache/util.h +++ b/drivers/md/bcache/util.h @@ -24,6 +24,8 @@ struct closure; #define atomic_inc_bug(v, i) BUG_ON(atomic_inc_return(v) <= i) #define atomic_sub_bug(i, v) BUG_ON(atomic_sub_return(i, v) < 0) #define atomic_add_bug(i, v) BUG_ON(atomic_add_return(i, v) < 0) +#define atomic_long_dec_bug(v) BUG_ON(atomic_long_dec_return(v) < 0) +#define atomic_long_sub_bug(i, v) BUG_ON(atomic_long_sub_return(i, v) < 0) #define atomic64_dec_bug(v) BUG_ON(atomic64_dec_return(v) < 0) #define atomic64_inc_bug(v, i) BUG_ON(atomic64_inc_return(v) <= i) #define atomic64_sub_bug(i, v) BUG_ON(atomic64_sub_return(i, v) < 0) @@ -36,6 +38,8 @@ struct closure; #define atomic_inc_bug(v, i) atomic_inc(v) #define atomic_sub_bug(i, v) atomic_sub(i, v) #define atomic_add_bug(i, v) atomic_add(i, v) +#define atomic_long_dec_bug(v) atomic_long_dec(v) +#define atomic_long_sub_bug(i, v) atomic_long_sub(i, v) #define atomic64_dec_bug(v) atomic64_dec(v) #define atomic64_inc_bug(v, i) atomic64_inc(v) #define atomic64_sub_bug(i, v) atomic64_sub(i, v) diff --git a/drivers/md/bcache/xattr.c b/drivers/md/bcache/xattr.c new file mode 100644 index 000000000000..404796ff8163 --- /dev/null +++ b/drivers/md/bcache/xattr.c @@ -0,0 +1,414 @@ + +#include "bcache.h" +#include "btree.h" +#include "extents.h" +#include "fs.h" +#include "keylist.h" +#include "siphash.h" +#include "xattr.h" + +#include "linux/crc32c.h" +#include "linux/cryptohash.h" +#include "linux/posix_acl_xattr.h" +#include "linux/xattr.h" + +#if 0 +/* + * XXX: should really include x_type here + */ +static u64 bch_xattr_hash(const struct qstr *name) +{ + union { + u32 b[SHA_DIGEST_WORDS]; + u64 ret; + } digest; + + unsigned done = 0; + + sha_init(digest.b); + + while (done < name->len) { + u32 workspace[SHA_WORKSPACE_WORDS]; + u8 message[SHA_MESSAGE_BYTES]; + unsigned bytes = min_t(unsigned, name->len - done, + SHA_MESSAGE_BYTES); + + memcpy(message, name->name + done, bytes); + memset(message + bytes, 0, SHA_MESSAGE_BYTES - bytes); + sha_transform(digest.b, message, workspace); + done += bytes; + } + + return digest.ret; +} + +static const SIPHASH_KEY bch_siphash_key; + +static u64 bch_xattr_hash(const struct qstr *name, u8 type) +{ +#if 0 + SIPHASH_CTX ctx; + + SipHash24_Init(&ctx, &bch_siphash_key); + SipHash24_Update(&ctx, &type, sizeof(type)); + SipHash24_Update(&ctx, name->name, name->len); + + return SipHash24_End(&ctx) >> 1; +#else + return SipHash24(&bch_siphash_key, name->name, name->len) >> 1; +#endif +} +#endif + +static u64 bch_xattr_hash(const struct qstr *name, u8 type) +{ + return crc32c(0, name->name, name->len); 
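/* note: this uses only 32 bits of the 64 bit hash space, so collisions are expected; callers resolve them by linear probing */ 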
+} + +#define xattr_val(_xattr) ((_xattr)->x_name + (_xattr)->x_name_len) + +static int xattr_cmp(const struct bch_xattr *xattr, + u8 type, const struct qstr *q) +{ + return xattr->x_type != type || + xattr->x_name_len != q->len || + memcmp(xattr->x_name, q->name, q->len); +} + +static bool bch_xattr_invalid(const struct cache_set *c, struct bkey_s_c k) +{ + switch (k.k->type) { + case BCH_XATTR: + if (bkey_val_bytes(k.k) < sizeof(struct bch_xattr)) + return true; + + return false; + case BCH_XATTR_WHITEOUT: + if (bkey_val_bytes(k.k)) + return true; + + return false; + default: + return true; + } +} + +static void bch_xattr_to_text(struct cache_set *c, char *buf, + size_t size, struct bkey_s_c k) +{ + struct bkey_s_c_xattr xattr; + int n; + + switch (k.k->type) { + case BCH_XATTR: + xattr = bkey_s_c_to_xattr(k); + + if (size) { + n = min_t(unsigned, size, xattr.v->x_name_len); + memcpy(buf, xattr.v->x_name, n); + buf[size - 1] = '\0'; + buf += n; + size -= n; + } + + n = scnprintf(buf, size, " -> "); + buf += n; + size -= n; + + if (size) { + n = min_t(unsigned, size, xattr.v->x_val_len); + memcpy(buf, xattr_val(xattr.v), n); + buf[size - 1] = '\0'; + buf += n; + size -= n; + } + + break; + case BCH_XATTR_WHITEOUT: + scnprintf(buf, size, "whiteout"); + break; + } +} + +const struct btree_keys_ops bch_xattr_ops = { +}; + +const struct bkey_ops bch_bkey_xattr_ops = { + .key_invalid = bch_xattr_invalid, + .val_to_text = bch_xattr_to_text, +}; + +int bch_xattr_get(struct cache_set *c, u64 inum, const char *name, + void *buffer, size_t size, int type) +{ + struct qstr qname = (struct qstr) QSTR_INIT(name, strlen(name)); + struct btree_iter iter; + struct bkey_s_c k; + const struct bch_xattr *xattr; + int ret = -ENODATA; + + for_each_btree_key_with_holes(&iter, c, BTREE_ID_XATTRS, + POS(inum, bch_xattr_hash(&qname, type)), k) { + switch (k.k->type) { + case BCH_XATTR: + xattr = bkey_s_c_to_xattr(k).v; + + /* collision? */ + if (!xattr_cmp(xattr, type, &qname)) { + ret = xattr->x_val_len; + if (buffer) { + if (xattr->x_val_len > size) + ret = -ERANGE; + else + memcpy(buffer, xattr_val(xattr), + xattr->x_val_len); + } + goto out; + } + break; + case BCH_XATTR_WHITEOUT: + break; + default: + /* hole, not found */ + goto out; + } + } +out: + bch_btree_iter_unlock(&iter); + return ret; +} + +int bch_xattr_set(struct inode *inode, const char *name, + const void *value, size_t size, + int flags, int type) +{ + struct bch_inode_info *ei = to_bch_ei(inode); + struct cache_set *c = inode->i_sb->s_fs_info; + struct btree_iter iter; + struct bkey_s_c k; + struct keylist keys; + struct qstr qname = (struct qstr) QSTR_INIT((char *) name, + strlen(name)); + int ret = -EINVAL; + unsigned insert_flags = BTREE_INSERT_ATOMIC; + + if (!value) + insert_flags |= BTREE_INSERT_NOFAIL; + + bch_btree_iter_init_intent(&iter, c, BTREE_ID_XATTRS, + POS(inode->i_ino, + bch_xattr_hash(&qname, type))); + + while ((k = bch_btree_iter_peek_with_holes(&iter)).k) { + switch (k.k->type) { + case BCH_XATTR: + /* collision? 
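(another name that hashed to the same slot: skip it and keep probing) 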
*/ + if (xattr_cmp(bkey_s_c_to_xattr(k).v, type, &qname)) { + bch_btree_iter_advance_pos(&iter); + continue; + } + + if (flags & XATTR_CREATE) { + ret = -EEXIST; + goto out; + } + + break; + case BCH_XATTR_WHITEOUT: + bch_btree_iter_advance_pos(&iter); + continue; + default: + /* hole, not found */ + if (flags & XATTR_REPLACE) { + ret = -ENODATA; + goto out; + } + break; + } + + bch_keylist_init(&keys); + + if (value) { + struct bkey_i_xattr *xattr; + unsigned u64s = BKEY_U64s + + DIV_ROUND_UP(sizeof(struct bch_xattr) + + qname.len + size, + sizeof(u64)); + + if (u64s > U8_MAX) { + ret = -ERANGE; + break; + } + + if (bch_keylist_realloc(&keys, u64s)) { + ret = -ENOMEM; + break; + } + + xattr = bkey_xattr_init(keys.top); + xattr->k.u64s = u64s; + xattr->k.p = k.k->p; + xattr->v.x_type = type; + xattr->v.x_name_len = qname.len; + xattr->v.x_val_len = size; + memcpy(xattr->v.x_name, qname.name, qname.len); + memcpy(xattr_val(&xattr->v), value, size); + + BUG_ON(xattr_cmp(&xattr->v, type, &qname)); + } else { + /* removing */ + bkey_init(&keys.top->k); + keys.top->k.type = BCH_XATTR_WHITEOUT; + keys.top->k.p = k.k->p; + } + + bch_keylist_enqueue(&keys); + + ret = bch_btree_insert_at(&iter, &keys, NULL, + &ei->journal_seq, + insert_flags); + bch_keylist_free(&keys); + + if (ret != -EINTR && ret != -EAGAIN) + break; + } +out: + bch_btree_iter_unlock(&iter); + return ret; +} + +static const struct xattr_handler *bch_xattr_type_to_handler(unsigned); + +static size_t bch_xattr_emit(struct dentry *dentry, + const struct bch_xattr *xattr, + char *buffer, size_t buffer_size) +{ + const struct xattr_handler *handler = + bch_xattr_type_to_handler(xattr->x_type); + + if (handler && (!handler->list || handler->list(dentry))) { + const size_t prefix_len = strlen(handler->prefix); + const size_t total_len = prefix_len + xattr->x_name_len + 1; + + if (buffer && total_len <= buffer_size) { + memcpy(buffer, handler->prefix, prefix_len); + memcpy(buffer + prefix_len, + xattr->x_name, xattr->x_name_len); + buffer[prefix_len + xattr->x_name_len] = '\0'; + } + + return total_len; + } else { + return 0; + } +} + +ssize_t bch_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size) +{ + struct cache_set *c = dentry->d_sb->s_fs_info; + struct btree_iter iter; + struct bkey_s_c k; + const struct bch_xattr *xattr; + u64 inum = dentry->d_inode->i_ino; + ssize_t ret = 0; + size_t len; + + for_each_btree_key(&iter, c, BTREE_ID_XATTRS, POS(inum, 0), k) { + BUG_ON(k.k->p.inode < inum); + + if (k.k->p.inode > inum) + break; + + if (k.k->type != BCH_XATTR) + continue; + + xattr = bkey_s_c_to_xattr(k).v; + + len = bch_xattr_emit(dentry, xattr, buffer, buffer_size); + if (buffer) { + if (len > buffer_size) { + bch_btree_iter_unlock(&iter); + return -ERANGE; + } + + buffer += len; + buffer_size -= len; + } + + ret += len; + + } + bch_btree_iter_unlock(&iter); + + return ret; +} + +static int bch_xattr_get_handler(const struct xattr_handler *handler, + struct dentry *dentry, struct inode *inode, + const char *name, void *buffer, size_t size) +{ + return bch_xattr_get(inode->i_sb->s_fs_info, inode->i_ino, + name, buffer, size, handler->flags); +} + +static int bch_xattr_set_handler(const struct xattr_handler *handler, + struct dentry *dentry, struct inode *inode, + const char *name, const void *value, + size_t size, int flags) +{ + return bch_xattr_set(inode, name, value, size, flags, + handler->flags); +} + +static const struct xattr_handler bch_xattr_user_handler = { + .prefix = XATTR_USER_PREFIX, + .get = 
bch_xattr_get_handler, + .set = bch_xattr_set_handler, + .flags = BCH_XATTR_INDEX_USER, +}; + +static bool bch_xattr_trusted_list(struct dentry *dentry) +{ + return capable(CAP_SYS_ADMIN); +} + +static const struct xattr_handler bch_xattr_trusted_handler = { + .prefix = XATTR_TRUSTED_PREFIX, + .list = bch_xattr_trusted_list, + .get = bch_xattr_get_handler, + .set = bch_xattr_set_handler, + .flags = BCH_XATTR_INDEX_TRUSTED, +}; + +static const struct xattr_handler bch_xattr_security_handler = { + .prefix = XATTR_SECURITY_PREFIX, + .get = bch_xattr_get_handler, + .set = bch_xattr_set_handler, + .flags = BCH_XATTR_INDEX_SECURITY, +}; + +static const struct xattr_handler *bch_xattr_handler_map[] = { + [BCH_XATTR_INDEX_USER] = &bch_xattr_user_handler, + [BCH_XATTR_INDEX_POSIX_ACL_ACCESS] = + &posix_acl_access_xattr_handler, + [BCH_XATTR_INDEX_POSIX_ACL_DEFAULT] = + &posix_acl_default_xattr_handler, + [BCH_XATTR_INDEX_TRUSTED] = &bch_xattr_trusted_handler, + [BCH_XATTR_INDEX_SECURITY] = &bch_xattr_security_handler, +}; + +const struct xattr_handler *bch_xattr_handlers[] = { + &bch_xattr_user_handler, + &posix_acl_access_xattr_handler, + &posix_acl_default_xattr_handler, + &bch_xattr_trusted_handler, + &bch_xattr_security_handler, + NULL +}; + +static const struct xattr_handler *bch_xattr_type_to_handler(unsigned type) +{ + return type < ARRAY_SIZE(bch_xattr_handler_map) + ? bch_xattr_handler_map[type] + : NULL; +} diff --git a/drivers/md/bcache/xattr.h b/drivers/md/bcache/xattr.h new file mode 100644 index 000000000000..839d47ef6910 --- /dev/null +++ b/drivers/md/bcache/xattr.h @@ -0,0 +1,16 @@ +#ifndef _BCACHE_XATTR_H +#define _BCACHE_XATTR_H + +extern const struct btree_keys_ops bch_xattr_ops; +extern const struct bkey_ops bch_bkey_xattr_ops; + +struct dentry; +struct xattr_handler; + +int bch_xattr_get(struct cache_set *, u64, const char *, void *, size_t, int); +int bch_xattr_set(struct inode *, const char *, const void *, size_t, int, int); +ssize_t bch_xattr_list(struct dentry *, char *, size_t); + +extern const struct xattr_handler *bch_xattr_handlers[]; + +#endif /* _BCACHE_XATTR_H */ diff --git a/include/uapi/linux/bcache.h b/include/uapi/linux/bcache.h index 27a2926aec21..34523f6f129c 100644 --- a/include/uapi/linux/bcache.h +++ b/include/uapi/linux/bcache.h @@ -288,6 +288,8 @@ BITMASK(EXTENT_CACHED, struct bch_extent, data[0], 63, 64) #define BLOCKDEV_INODE_MAX 4096 +#define BCACHE_ROOT_INO 4096 + enum bch_inode_types { BCH_INODE_FS = 128, BCH_INODE_BLOCKDEV = 129, @@ -336,6 +338,62 @@ BKEY_VAL_TYPE(inode_blockdev, BCH_INODE_BLOCKDEV); BITMASK(INODE_FLASH_ONLY, struct bch_inode_blockdev, i_inode.i_flags, 0, 1); +/* Dirents */ + +/* + * Dirents (and xattrs) have to implement string lookups; since our b-tree + * doesn't support arbitrary length strings for the key, we instead index by a + * 64 bit hash (currently truncated sha1) of the string, stored in the offset + * field of the key - using linear probing to resolve hash collisions. This also + * provides us with the readdir cookie posix requires. 
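+ * + * (e.g. looking up "foo" in directory 4096: start at POS(4096, hash("foo")) and + * walk forward, skipping whiteouts and names that merely collide, until the + * dirent is found or a hole proves it absent.) 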
+ * + * Linear probing requires us to use whiteouts for deletions, in the event of a + * collision: + */ + +enum { + BCH_DIRENT = 128, + BCH_DIRENT_WHITEOUT = 129, +}; + +struct bch_dirent { + struct bch_val v; + + /* Target inode number: */ + __u64 d_inum; + + /* + * Copy of mode bits 12-15 from the target inode - so userspace can get + * the filetype without having to do a stat() + */ + __u8 d_type; + + __u8 d_name[]; +} __attribute__((packed)); +BKEY_VAL_TYPE(dirent, BCH_DIRENT); + +/* Xattrs */ + +enum { + BCH_XATTR = 128, + BCH_XATTR_WHITEOUT = 129, +}; + +#define BCH_XATTR_INDEX_USER 0 +#define BCH_XATTR_INDEX_POSIX_ACL_ACCESS 1 +#define BCH_XATTR_INDEX_POSIX_ACL_DEFAULT 2 +#define BCH_XATTR_INDEX_TRUSTED 3 +#define BCH_XATTR_INDEX_SECURITY 4 + +struct bch_xattr { + struct bch_val v; + __u8 x_type; + __u8 x_name_len; + __u16 x_val_len; + __u8 x_name[]; +} __attribute__((packed)); +BKEY_VAL_TYPE(xattr, BCH_XATTR); + /* Superblock */ /* Version 0: Cache device @@ -488,6 +546,14 @@ BITMASK(CACHE_BTREE_NODE_SIZE, struct cache_sb, flags, 20, 36); BITMASK(CACHE_SET_META_REPLICAS_HAVE, struct cache_sb, flags, 36, 40); BITMASK(CACHE_SET_DATA_REPLICAS_HAVE, struct cache_sb, flags, 40, 44); +BITMASK(CACHE_SET_DIRENT_CSUM_TYPE, struct cache_sb, flags, 44, 48); +enum { + BCH_DIRENT_CSUM_CRC32C = 0, + BCH_DIRENT_CSUM_CRC64 = 1, + BCH_DIRENT_CSUM_SIPHASH = 2, + BCH_DIRENT_CSUM_SHA1 = 3, +}; + BITMASK(BDEV_CACHE_MODE, struct cache_sb, flags, 0, 4); #define CACHE_MODE_WRITETHROUGH 0U #define CACHE_MODE_WRITEBACK 1U @@ -532,6 +598,10 @@ static inline _Bool SB_IS_BDEV(const struct cache_sb *sb) UUID_LE(0xf67385c6, 0x1a4e, 0xca45, \ 0x82, 0x65, 0xf5, 0x7f, 0x48, 0xba, 0x6d, 0x81) +#define BCACHE_STATFS_MAGIC 0xca451a4e + +#define BCACHE_SB_MAGIC 0xca451a4ef67385c6ULL +#define BCACHE_SB_MAGIC2 0x816dba487ff56582ULL #define JSET_MAGIC 0x245235c1a3625032ULL #define PSET_MAGIC 0x6750e15f87337f91ULL #define BSET_MAGIC 0x90135c78b99e07f5ULL @@ -571,7 +641,9 @@ static inline __u64 bset_magic(struct cache_sb *sb) #define DEFINE_BCH_BTREE_IDS() \ DEF_BTREE_ID(EXTENTS, 0, "extents") \ - DEF_BTREE_ID(INODES, 1, "inodes") + DEF_BTREE_ID(INODES, 1, "inodes") \ + DEF_BTREE_ID(DIRENTS, 2, "dirents") \ + DEF_BTREE_ID(XATTRS, 3, "xattrs") #define DEF_BTREE_ID(kwd, val, name) BTREE_ID_##kwd = val, @@ -803,3 +875,5 @@ BITMASK(UUID_FLASH_ONLY, struct uuid_entry, flags, 0, 1); } #endif #endif /* _LINUX_BCACHE_H */ + +/* vim: set foldnestmax=2: */ |
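
The linear-probing discipline the dirents comment describes is easier to see outside the btree machinery. Here is a minimal standalone sketch of the same idea in plain C; it is illustrative only, and hash_name(), struct slot, and lookup_slot() are invented stand-ins, not code from this patch:

	#include <stdint.h>
	#include <string.h>

	enum slot_type { HOLE, ENTRY, WHITEOUT };

	struct slot {
		enum slot_type	type;
		const char	*name;
		uint64_t	inum;		/* dirent payload: target inode */
	};

	/* toy fixed-size table standing in for the sparse (inode, hash) keyspace */
	struct table {
		struct slot	*slots;
		uint64_t	nr;
	};

	/* FNV-1a here, standing in for the crc32c/siphash/sha1 options above: */
	static uint64_t hash_name(const char *name)
	{
		uint64_t h = 14695981039346656037ULL;

		while (*name)
			h = (h ^ (unsigned char) *name++) * 1099511628211ULL;
		return h;
	}

	/*
	 * Probe from the name's hash: mismatching names and whiteouts mean
	 * "keep going", the first hole means "not present". (The real
	 * keyspace is 64 bits and sparse, so probe chains are short and
	 * always end at a hole; this toy table doesn't handle being full.)
	 */
	static struct slot *lookup_slot(struct table *t, const char *name)
	{
		uint64_t i;

		for (i = hash_name(name) % t->nr;; i = (i + 1) % t->nr) {
			struct slot *s = &t->slots[i];

			switch (s->type) {
			case ENTRY:
				if (!strcmp(s->name, name))
					return s;
				break;		/* hash collision: keep probing */
			case WHITEOUT:
				break;		/* deleted entry: keep probing */
			case HOLE:
				return NULL;	/* definitely absent */
			}
		}
	}

This is also why deletion must leave a BCH_DIRENT_WHITEOUT behind rather than a hole: lookup_slot() treats the first hole as proof of absence, so punching a hole in the middle of a probe chain would make every colliding entry past it unreachable.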