author     Kent Overstreet <kent.overstreet@gmail.com>   2019-06-30 16:28:01 -0400
committer  Kent Overstreet <kent.overstreet@gmail.com>   2020-05-06 17:14:16 -0400
commit     ea5715a73506eb929e43b66eb3b87c94e2b44ab4 (patch)
tree       a145b47f47c831f20c6ee694995a5f9b7e2e6e31
parent     5f6131b81dfa624673447c41cfb69c151086b802 (diff)
Merge with 1f431b384d bcachefs: Refactor trans_(get|update)_key
132 files changed, 23288 insertions, 14798 deletions
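The trans_(get|update)_key refactor named in the subject line shows up throughout the diff below: operations are rewritten around a btree transaction that is begun, has updates queued against it, and is retried from the top whenever the commit fails with -EINTR (a lock restart), as in bch2_set_acl in the acl.c diff. What follows is a minimal, self-contained sketch of that retry discipline; toy_trans and its helpers are simplified stand-ins, not the real bcachefs API.

```c
#include <errno.h>
#include <stdio.h>

struct toy_trans {
	int restarts;		/* how many times the commit was retried */
};

static void trans_begin(struct toy_trans *trans)
{
	/* The real bch2_trans_begin() drops locks and resets queued updates. */
}

static int trans_update(struct toy_trans *trans, int key, int val)
{
	return 0;		/* queue an update; always succeeds in this toy */
}

static int trans_commit(struct toy_trans *trans)
{
	/* Fail twice with -EINTR to exercise the retry loop. */
	return trans->restarts < 2 ? -EINTR : 0;
}

static int set_key_atomically(struct toy_trans *trans, int key, int val)
{
	int ret;
retry:
	trans_begin(trans);

	ret = trans_update(trans, key, val) ?:
	      trans_commit(trans);
	if (ret == -EINTR) {
		trans->restarts++;
		goto retry;
	}
	return ret;
}

int main(void)
{
	struct toy_trans trans = { 0 };
	int ret = set_key_atomically(&trans, 1, 42);

	printf("ret %d after %d restarts\n", ret, trans.restarts);
	return 0;
}
```

The `?:` chaining mirrors the kernel idiom used in bch2_set_acl below; it is a GNU C extension, so build with gcc or clang.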
diff --git a/fs/bcachefs/Kconfig b/fs/bcachefs/Kconfig
index f5d9f1791769..e695ab786f80 100644
--- a/fs/bcachefs/Kconfig
+++ b/fs/bcachefs/Kconfig
@@ -6,6 +6,7 @@ config BCACHEFS_FS
 	select EXPORTFS
 	select CLOSURES
 	select LIBCRC32C
+	select CRC64
 	select FS_POSIX_ACL
 	select LZ4_COMPRESS
 	select LZ4_DECOMPRESS
@@ -17,8 +18,9 @@ config BCACHEFS_FS
 	select CRYPTO_CHACHA20
 	select CRYPTO_POLY1305
 	select KEYS
-	select COMPACTION
 	select SIXLOCKS
+	select RAID6_PQ
+	select XOR_BLOCKS
 	---help---
 	The bcachefs filesystem - a modern, copy on write filesystem, with
 	support for multiple devices, compression, checksumming, etc.
@@ -41,3 +43,9 @@ config BCACHEFS_DEBUG
 	The resulting code will be significantly slower than normal; you
 	probably shouldn't select this option unless you're a developer.
+
+config BCACHEFS_TESTS
+	bool "bcachefs unit and performance tests"
+	depends on BCACHEFS_FS
+	---help---
+	Include some unit and performance tests for the core btree code
diff --git a/fs/bcachefs/Makefile b/fs/bcachefs/Makefile
index 1a3dd3efd2d8..da42c4fd764d 100644
--- a/fs/bcachefs/Makefile
+++ b/fs/bcachefs/Makefile
@@ -3,9 +3,11 @@ obj-$(CONFIG_BCACHEFS_FS)	+= bcachefs.o
 bcachefs-y		:= \
 	acl.o \
-	alloc.o \
+	alloc_background.o \
+	alloc_foreground.o \
 	bkey.o \
 	bkey_methods.o \
+	bkey_sort.o \
 	bset.o \
 	btree_cache.o \
 	btree_gc.o \
@@ -21,6 +23,7 @@ bcachefs-y		:= \
 	debug.o \
 	dirent.o \
 	disk_groups.o \
+	ec.o \
 	error.o \
 	extents.o \
 	fs.o \
@@ -34,18 +37,19 @@ bcachefs-y		:= \
 	journal_reclaim.o \
 	journal_seq_blacklist.o \
 	keylist.o \
-	lz4_decompress.o \
 	migrate.o \
 	move.o \
 	movinggc.o \
 	opts.o \
 	quota.o \
 	rebalance.o \
+	recovery.o \
 	replicas.o \
 	siphash.o \
 	super.o \
 	super-io.o \
 	sysfs.o \
+	tests.o \
 	trace.o \
 	util.o \
 	xattr.o
diff --git a/fs/bcachefs/acl.c b/fs/bcachefs/acl.c
index d29bdafaea66..59d4af1326ee 100644
--- a/fs/bcachefs/acl.c
+++ b/fs/bcachefs/acl.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
 #ifdef CONFIG_BCACHEFS_POSIX_ACL
 
 #include "bcachefs.h"
@@ -12,96 +13,176 @@
 #include "fs.h"
 #include "xattr.h"
 
+static inline size_t bch2_acl_size(unsigned nr_short, unsigned nr_long)
+{
+	return sizeof(bch_acl_header) +
+		sizeof(bch_acl_entry_short) * nr_short +
+		sizeof(bch_acl_entry) * nr_long;
+}
+
+static inline int acl_to_xattr_type(int type)
+{
+	switch (type) {
+	case ACL_TYPE_ACCESS:
+		return KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS;
+	case ACL_TYPE_DEFAULT:
+		return KEY_TYPE_XATTR_INDEX_POSIX_ACL_DEFAULT;
+	default:
+		BUG();
+	}
+}
+
 /*
  * Convert from filesystem to in-memory representation.
 */
static struct posix_acl *bch2_acl_from_disk(const void *value, size_t size)
{
-	const char *end = (char *)value + size;
-	int n, count;
+	const void *p, *end = value + size;
 	struct posix_acl *acl;
+	struct posix_acl_entry *out;
+	unsigned count = 0;
 
 	if (!value)
 		return NULL;
 	if (size < sizeof(bch_acl_header))
-		return ERR_PTR(-EINVAL);
+		goto invalid;
 	if (((bch_acl_header *)value)->a_version !=
 	    cpu_to_le32(BCH_ACL_VERSION))
-		return ERR_PTR(-EINVAL);
-	value = (char *)value + sizeof(bch_acl_header);
-	count = bch2_acl_count(size);
-	if (count < 0)
-		return ERR_PTR(-EINVAL);
-	if (count == 0)
+		goto invalid;
+
+	p = value + sizeof(bch_acl_header);
+	while (p < end) {
+		const bch_acl_entry *entry = p;
+
+		if (p + sizeof(bch_acl_entry_short) > end)
+			goto invalid;
+
+		switch (le16_to_cpu(entry->e_tag)) {
+		case ACL_USER_OBJ:
+		case ACL_GROUP_OBJ:
+		case ACL_MASK:
+		case ACL_OTHER:
+			p += sizeof(bch_acl_entry_short);
+			break;
+		case ACL_USER:
+		case ACL_GROUP:
+			p += sizeof(bch_acl_entry);
+			break;
+		default:
+			goto invalid;
+		}
+
+		count++;
+	}
+
+	if (p > end)
+		goto invalid;
+
+	if (!count)
 		return NULL;
+
 	acl = posix_acl_alloc(count, GFP_KERNEL);
 	if (!acl)
 		return ERR_PTR(-ENOMEM);
-	for (n = 0; n < count; n++) {
-		bch_acl_entry *entry =
-			(bch_acl_entry *)value;
-		if ((char *)value + sizeof(bch_acl_entry_short) > end)
-			goto fail;
-		acl->a_entries[n].e_tag = le16_to_cpu(entry->e_tag);
-		acl->a_entries[n].e_perm = le16_to_cpu(entry->e_perm);
-		switch (acl->a_entries[n].e_tag) {
+
+	out = acl->a_entries;
+
+	p = value + sizeof(bch_acl_header);
+	while (p < end) {
+		const bch_acl_entry *in = p;
+
+		out->e_tag = le16_to_cpu(in->e_tag);
+		out->e_perm = le16_to_cpu(in->e_perm);
+
+		switch (out->e_tag) {
 		case ACL_USER_OBJ:
 		case ACL_GROUP_OBJ:
 		case ACL_MASK:
 		case ACL_OTHER:
-			value = (char *)value +
-				sizeof(bch_acl_entry_short);
+			p += sizeof(bch_acl_entry_short);
 			break;
-
 		case ACL_USER:
-			value = (char *)value + sizeof(bch_acl_entry);
-			if ((char *)value > end)
-				goto fail;
-			acl->a_entries[n].e_uid =
-				make_kuid(&init_user_ns,
-					  le32_to_cpu(entry->e_id));
+			out->e_uid = make_kuid(&init_user_ns,
+					le32_to_cpu(in->e_id));
+			p += sizeof(bch_acl_entry);
 			break;
 		case ACL_GROUP:
-			value = (char *)value + sizeof(bch_acl_entry);
-			if ((char *)value > end)
-				goto fail;
-			acl->a_entries[n].e_gid =
-				make_kgid(&init_user_ns,
-					  le32_to_cpu(entry->e_id));
+			out->e_gid = make_kgid(&init_user_ns,
					le32_to_cpu(in->e_id));
+			p += sizeof(bch_acl_entry);
 			break;
-
-		default:
-			goto fail;
 		}
+
+		out++;
 	}
-	if (value != end)
-		goto fail;
-	return acl;
-fail:
-	posix_acl_release(acl);
+
+	BUG_ON(out != acl->a_entries + acl->a_count);
+
+	return acl;
+invalid:
+	pr_err("invalid acl entry");
 	return ERR_PTR(-EINVAL);
 }
 
+#define acl_for_each_entry(acl, acl_e)			\
+	for (acl_e = acl->a_entries;			\
+	     acl_e < acl->a_entries + acl->a_count;	\
+	     acl_e++)
+
 /*
  * Convert from in-memory to filesystem representation.
 */
-static void *bch2_acl_to_disk(const struct posix_acl *acl, size_t *size)
+static struct bkey_i_xattr *
+bch2_acl_to_xattr(struct btree_trans *trans,
+		  const struct posix_acl *acl,
+		  int type)
 {
-	bch_acl_header *ext_acl;
-	char *e;
-	size_t n;
-
-	*size = bch2_acl_size(acl->a_count);
-	ext_acl = kmalloc(sizeof(bch_acl_header) + acl->a_count *
-			sizeof(bch_acl_entry), GFP_KERNEL);
-	if (!ext_acl)
-		return ERR_PTR(-ENOMEM);
-	ext_acl->a_version = cpu_to_le32(BCH_ACL_VERSION);
-	e = (char *)ext_acl + sizeof(bch_acl_header);
-	for (n = 0; n < acl->a_count; n++) {
-		const struct posix_acl_entry *acl_e = &acl->a_entries[n];
-		bch_acl_entry *entry = (bch_acl_entry *)e;
+	struct bkey_i_xattr *xattr;
+	bch_acl_header *acl_header;
+	const struct posix_acl_entry *acl_e;
+	void *outptr;
+	unsigned nr_short = 0, nr_long = 0, acl_len, u64s;
+
+	acl_for_each_entry(acl, acl_e) {
+		switch (acl_e->e_tag) {
+		case ACL_USER:
+		case ACL_GROUP:
+			nr_long++;
+			break;
+		case ACL_USER_OBJ:
+		case ACL_GROUP_OBJ:
+		case ACL_MASK:
+		case ACL_OTHER:
+			nr_short++;
+			break;
+		default:
+			return ERR_PTR(-EINVAL);
+		}
+	}
+
+	acl_len = bch2_acl_size(nr_short, nr_long);
+	u64s = BKEY_U64s + xattr_val_u64s(0, acl_len);
+
+	if (u64s > U8_MAX)
+		return ERR_PTR(-E2BIG);
+
+	xattr = bch2_trans_kmalloc(trans, u64s * sizeof(u64));
+	if (IS_ERR(xattr))
+		return xattr;
+
+	bkey_xattr_init(&xattr->k_i);
+	xattr->k.u64s		= u64s;
+	xattr->v.x_type		= acl_to_xattr_type(type);
+	xattr->v.x_name_len	= 0,
+	xattr->v.x_val_len	= cpu_to_le16(acl_len);
+
+	acl_header = xattr_val(&xattr->v);
+	acl_header->a_version = cpu_to_le32(BCH_ACL_VERSION);
+
+	outptr = (void *) acl_header + sizeof(*acl_header);
+
+	acl_for_each_entry(acl, acl_e) {
+		bch_acl_entry *entry = outptr;
 
 		entry->e_tag = cpu_to_le16(acl_e->e_tag);
 		entry->e_perm = cpu_to_le16(acl_e->e_perm);
@@ -109,139 +190,196 @@ static void *bch2_acl_to_disk(const struct posix_acl *acl, size_t *size)
 		case ACL_USER:
 			entry->e_id = cpu_to_le32(
 				from_kuid(&init_user_ns, acl_e->e_uid));
-			e += sizeof(bch_acl_entry);
+			outptr += sizeof(bch_acl_entry);
 			break;
 		case ACL_GROUP:
 			entry->e_id = cpu_to_le32(
 				from_kgid(&init_user_ns, acl_e->e_gid));
-			e += sizeof(bch_acl_entry);
+			outptr += sizeof(bch_acl_entry);
 			break;
 		case ACL_USER_OBJ:
 		case ACL_GROUP_OBJ:
 		case ACL_MASK:
 		case ACL_OTHER:
-			e += sizeof(bch_acl_entry_short);
+			outptr += sizeof(bch_acl_entry_short);
 			break;
-
-		default:
-			goto fail;
 		}
 	}
 
-	return (char *)ext_acl;
-fail:
-	kfree(ext_acl);
-	return ERR_PTR(-EINVAL);
+	BUG_ON(outptr != xattr_val(&xattr->v) + acl_len);
+
+	return xattr;
 }
 
 struct posix_acl *bch2_get_acl(struct inode *vinode, int type)
 {
 	struct bch_inode_info *inode = to_bch_ei(vinode);
 	struct bch_fs *c = inode->v.i_sb->s_fs_info;
-	int name_index;
-	char *value = NULL;
-	struct posix_acl *acl;
-	int ret;
-
-	switch (type) {
-	case ACL_TYPE_ACCESS:
-		name_index = BCH_XATTR_INDEX_POSIX_ACL_ACCESS;
-		break;
-	case ACL_TYPE_DEFAULT:
-		name_index = BCH_XATTR_INDEX_POSIX_ACL_DEFAULT;
-		break;
-	default:
-		BUG();
-	}
-	ret = bch2_xattr_get(c, inode, "", NULL, 0, name_index);
-	if (ret > 0) {
-		value = kmalloc(ret, GFP_KERNEL);
-		if (!value)
-			return ERR_PTR(-ENOMEM);
-		ret = bch2_xattr_get(c, inode, "", value,
-				     ret, name_index);
+	struct btree_trans trans;
+	struct btree_iter *iter;
+	struct bkey_s_c_xattr xattr;
+	struct posix_acl *acl = NULL;
+
+	bch2_trans_init(&trans, c, 0, 0);
retry:
+	bch2_trans_begin(&trans);
+
+	iter = bch2_hash_lookup(&trans, bch2_xattr_hash_desc,
+			&inode->ei_str_hash, inode->v.i_ino,
+			&X_SEARCH(acl_to_xattr_type(type), "", 0),
+			0);
+	if (IS_ERR(iter)) {
+		if (PTR_ERR(iter) == -EINTR)
+			goto retry;
+
+		if (PTR_ERR(iter) != -ENOENT)
+			acl = ERR_CAST(iter);
+		goto out;
 	}
-	if (ret > 0)
-		acl = bch2_acl_from_disk(value, ret);
-	else if (ret == -ENODATA || ret == -ENOSYS)
-		acl = NULL;
-	else
-		acl = ERR_PTR(ret);
-	kfree(value);
+
+	xattr = bkey_s_c_to_xattr(bch2_btree_iter_peek_slot(iter));
+
+	acl = bch2_acl_from_disk(xattr_val(xattr.v),
+			le16_to_cpu(xattr.v->x_val_len));
 
 	if (!IS_ERR(acl))
 		set_cached_acl(&inode->v, type, acl);
-
+out:
+	bch2_trans_exit(&trans);
 	return acl;
 }
 
-int bch2_set_acl(struct inode *vinode, struct posix_acl *acl, int type)
+int bch2_set_acl_trans(struct btree_trans *trans,
+		       struct bch_inode_unpacked *inode_u,
+		       const struct bch_hash_info *hash_info,
+		       struct posix_acl *acl, int type)
 {
-	struct bch_inode_info *inode = to_bch_ei(vinode);
-	struct bch_fs *c = inode->v.i_sb->s_fs_info;
-	umode_t mode = inode->v.i_mode;
-	int name_index;
-	void *value = NULL;
-	size_t size = 0;
 	int ret;
 
-	if (type == ACL_TYPE_ACCESS && acl) {
-		ret = posix_acl_update_mode(&inode->v, &mode, &acl);
-		if (ret)
-			return ret;
-	}
+	if (type == ACL_TYPE_DEFAULT &&
+	    !S_ISDIR(inode_u->bi_mode))
+		return acl ? -EACCES : 0;
 
-	switch (type) {
-	case ACL_TYPE_ACCESS:
-		name_index = BCH_XATTR_INDEX_POSIX_ACL_ACCESS;
-		if (acl) {
-			ret = posix_acl_equiv_mode(acl, &inode->v.i_mode);
-			if (ret < 0)
-				return ret;
-			if (ret == 0)
-				acl = NULL;
-		}
-		break;
+	if (acl) {
+		struct bkey_i_xattr *xattr =
+			bch2_acl_to_xattr(trans, acl, type);
+		if (IS_ERR(xattr))
+			return PTR_ERR(xattr);
+
+		ret = bch2_hash_set(trans, bch2_xattr_hash_desc, hash_info,
+				    inode_u->bi_inum, &xattr->k_i, 0);
+	} else {
+		struct xattr_search_key search =
+			X_SEARCH(acl_to_xattr_type(type), "", 0);
+
+		ret = bch2_hash_delete(trans, bch2_xattr_hash_desc, hash_info,
+				       inode_u->bi_inum, &search);
+	}
 
-	case ACL_TYPE_DEFAULT:
-		name_index = BCH_XATTR_INDEX_POSIX_ACL_DEFAULT;
-		if (!S_ISDIR(inode->v.i_mode))
-			return acl ? -EACCES : 0;
-		break;
+	return ret == -ENOENT ? 0 : ret;
+}
 
-	default:
-		return -EINVAL;
-	}
+static int inode_update_for_set_acl_fn(struct bch_inode_info *inode,
+				       struct bch_inode_unpacked *bi,
+				       void *p)
+{
+	struct bch_fs *c = inode->v.i_sb->s_fs_info;
+	umode_t mode = (unsigned long) p;
 
-	if (acl) {
-		value = bch2_acl_to_disk(acl, &size);
-		if (IS_ERR(value))
-			return (int)PTR_ERR(value);
-	}
+	bi->bi_ctime = bch2_current_time(c);
+	bi->bi_mode = mode;
+	return 0;
+}
 
-	if (mode != inode->v.i_mode) {
-		mutex_lock(&inode->ei_update_lock);
-		inode->v.i_mode = mode;
-		inode->v.i_ctime = current_time(&inode->v);
+int bch2_set_acl(struct inode *vinode, struct posix_acl *acl, int type)
+{
+	struct bch_inode_info *inode = to_bch_ei(vinode);
+	struct bch_fs *c = inode->v.i_sb->s_fs_info;
+	struct btree_trans trans;
+	struct bch_inode_unpacked inode_u;
+	umode_t mode = inode->v.i_mode;
+	int ret;
 
-		ret = bch2_write_inode(c, inode);
-		mutex_unlock(&inode->ei_update_lock);
+	mutex_lock(&inode->ei_update_lock);
+	bch2_trans_init(&trans, c, 0, 0);
+
+	if (type == ACL_TYPE_ACCESS && acl) {
+		ret = posix_acl_update_mode(&inode->v, &mode, &acl);
 		if (ret)
 			goto err;
 	}
-
-	ret = bch2_xattr_set(c, inode, "", value, size, 0, name_index);
+retry:
+	bch2_trans_begin(&trans);
+
+	ret = bch2_set_acl_trans(&trans,
+				 &inode->ei_inode,
+				 &inode->ei_str_hash,
+				 acl, type) ?:
+	      bch2_write_inode_trans(&trans, inode, &inode_u,
+				     inode_update_for_set_acl_fn,
+				     (void *)(unsigned long) mode) ?:
+	      bch2_trans_commit(&trans, NULL,
+				&inode->ei_journal_seq,
+				BTREE_INSERT_ATOMIC|
+				BTREE_INSERT_NOUNLOCK);
+	if (ret == -EINTR)
+		goto retry;
+	if (unlikely(ret))
+		goto err;
+
+	bch2_inode_update_after_write(c, inode, &inode_u,
+				      ATTR_CTIME|ATTR_MODE);
+
+	set_cached_acl(&inode->v, type, acl);
 err:
-	kfree(value);
+	bch2_trans_exit(&trans);
+	mutex_unlock(&inode->ei_update_lock);
 
-	if (ret == -ERANGE)
-		ret = -E2BIG;
 	return ret;
+}
 
-	if (!ret)
-		set_cached_acl(&inode->v, type, acl);
+int bch2_acl_chmod(struct btree_trans *trans,
+		   struct bch_inode_info *inode,
+		   umode_t mode,
+		   struct posix_acl **new_acl)
+{
+	struct btree_iter *iter;
+	struct bkey_s_c_xattr xattr;
+	struct bkey_i_xattr *new;
+	struct posix_acl *acl;
+	int ret = 0;
+
+	iter = bch2_hash_lookup(trans, bch2_xattr_hash_desc,
+			&inode->ei_str_hash, inode->v.i_ino,
+			&X_SEARCH(KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS, "", 0),
+			BTREE_ITER_INTENT);
+	if (IS_ERR(iter))
+		return PTR_ERR(iter) != -ENOENT ? PTR_ERR(iter) : 0;
+
+	xattr = bkey_s_c_to_xattr(bch2_btree_iter_peek_slot(iter));
+
+	acl = bch2_acl_from_disk(xattr_val(xattr.v),
+			le16_to_cpu(xattr.v->x_val_len));
+	if (IS_ERR_OR_NULL(acl))
+		return PTR_ERR(acl);
+
+	ret = __posix_acl_chmod(&acl, GFP_KERNEL, mode);
+	if (ret)
+		goto err;
+
+	new = bch2_acl_to_xattr(trans, acl, ACL_TYPE_ACCESS);
+	if (IS_ERR(new)) {
+		ret = PTR_ERR(new);
+		goto err;
+	}
+
+	new->k.p = iter->pos;
+	bch2_trans_update(trans, BTREE_INSERT_ENTRY(iter, &new->k_i));
+	*new_acl = acl;
+	acl = NULL;
+err:
+	kfree(acl);
 	return ret;
 }
diff --git a/fs/bcachefs/acl.h b/fs/bcachefs/acl.h
index b721330e2837..cb62d502a7ff 100644
--- a/fs/bcachefs/acl.h
+++ b/fs/bcachefs/acl.h
@@ -1,6 +1,12 @@
+/* SPDX-License-Identifier: GPL-2.0 */
 #ifndef _BCACHEFS_ACL_H
 #define _BCACHEFS_ACL_H
 
+struct bch_inode_unpacked;
+struct bch_hash_info;
+struct bch_inode_info;
+struct posix_acl;
+
 #ifdef CONFIG_BCACHEFS_POSIX_ACL
 
 #define BCH_ACL_VERSION	0x0001
@@ -20,43 +26,30 @@ typedef struct {
 	__le32		a_version;
 } bch_acl_header;
 
-static inline size_t bch2_acl_size(int count)
-{
-	if (count <= 4) {
-		return sizeof(bch_acl_header) +
-			count * sizeof(bch_acl_entry_short);
-	} else {
-		return sizeof(bch_acl_header) +
-			4 * sizeof(bch_acl_entry_short) +
-			(count - 4) * sizeof(bch_acl_entry);
-	}
-}
-
-static inline int bch2_acl_count(size_t size)
-{
-	ssize_t s;
-
-	size -= sizeof(bch_acl_header);
-	s = size - 4 * sizeof(bch_acl_entry_short);
-	if (s < 0) {
-		if (size % sizeof(bch_acl_entry_short))
-			return -1;
-		return size / sizeof(bch_acl_entry_short);
-	} else {
-		if (s % sizeof(bch_acl_entry))
-			return -1;
-		return s / sizeof(bch_acl_entry) + 4;
-	}
-}
-
-struct posix_acl;
+struct posix_acl *bch2_get_acl(struct inode *, int);
 
-extern struct posix_acl *bch2_get_acl(struct inode *, int);
-extern int bch2_set_acl(struct inode *, struct posix_acl *, int);
+int bch2_set_acl_trans(struct btree_trans *,
+		       struct bch_inode_unpacked *,
+		       const struct bch_hash_info *,
+		       struct posix_acl *, int);
+int bch2_set_acl(struct inode *, struct posix_acl *, int);
+int bch2_acl_chmod(struct btree_trans *, struct bch_inode_info *,
+		   umode_t, struct posix_acl **);
 
 #else
 
-static inline int bch2_set_acl(struct inode *inode, struct posix_acl *acl, int type)
+static inline int bch2_set_acl_trans(struct btree_trans *trans,
+				     struct bch_inode_unpacked *inode_u,
+				     const struct bch_hash_info *hash_info,
+				     struct posix_acl *acl, int type)
+{
+	return 0;
+}
+
+static inline int bch2_acl_chmod(struct btree_trans *trans,
+				 struct bch_inode_info *inode,
+				 umode_t mode,
+				 struct posix_acl **new_acl)
 {
 	return 0;
 }
diff --git a/fs/bcachefs/alloc.c b/fs/bcachefs/alloc.c
deleted file mode 100644
index db21f8bd66c8..000000000000
--- a/fs/bcachefs/alloc.c
+++ /dev/null
@@ -1,2198 +0,0 @@
-/*
- * Primary bucket allocation code
- *
- * Copyright 2012 Google, Inc.
- *
- * Allocation in bcache is done in terms of buckets:
- *
- * Each bucket has associated an 8 bit gen; this gen corresponds to the gen in
- * btree pointers - they must match for the pointer to be considered valid.
- *
- * Thus (assuming a bucket has no dirty data or metadata in it) we can reuse a
- * bucket simply by incrementing its gen.
- *
- * The gens (along with the priorities; it's really the gens are important but
- * the code is named as if it's the priorities) are written in an arbitrary list
- * of buckets on disk, with a pointer to them in the journal header.
- * - * When we invalidate a bucket, we have to write its new gen to disk and wait - * for that write to complete before we use it - otherwise after a crash we - * could have pointers that appeared to be good but pointed to data that had - * been overwritten. - * - * Since the gens and priorities are all stored contiguously on disk, we can - * batch this up: We fill up the free_inc list with freshly invalidated buckets, - * call prio_write(), and when prio_write() finishes we pull buckets off the - * free_inc list and optionally discard them. - * - * free_inc isn't the only freelist - if it was, we'd often have to sleep while - * priorities and gens were being written before we could allocate. c->free is a - * smaller freelist, and buckets on that list are always ready to be used. - * - * If we've got discards enabled, that happens when a bucket moves from the - * free_inc list to the free list. - * - * It's important to ensure that gens don't wrap around - with respect to - * either the oldest gen in the btree or the gen on disk. This is quite - * difficult to do in practice, but we explicitly guard against it anyways - if - * a bucket is in danger of wrapping around we simply skip invalidating it that - * time around, and we garbage collect or rewrite the priorities sooner than we - * would have otherwise. - * - * bch2_bucket_alloc() allocates a single bucket from a specific device. - * - * bch2_bucket_alloc_set() allocates one or more buckets from different devices - * in a given filesystem. - * - * invalidate_buckets() drives all the processes described above. It's called - * from bch2_bucket_alloc() and a few other places that need to make sure free - * buckets are ready. - * - * invalidate_buckets_(lru|fifo)() find buckets that are available to be - * invalidated, and then invalidate them and stick them on the free_inc list - - * in either lru or fifo order. 
- */ - -#include "bcachefs.h" -#include "alloc.h" -#include "btree_cache.h" -#include "btree_io.h" -#include "btree_update.h" -#include "btree_update_interior.h" -#include "btree_gc.h" -#include "buckets.h" -#include "checksum.h" -#include "clock.h" -#include "debug.h" -#include "disk_groups.h" -#include "error.h" -#include "extents.h" -#include "io.h" -#include "journal.h" -#include "journal_io.h" -#include "super-io.h" - -#include <linux/blkdev.h> -#include <linux/kthread.h> -#include <linux/math64.h> -#include <linux/random.h> -#include <linux/rculist.h> -#include <linux/rcupdate.h> -#include <linux/sched/task.h> -#include <linux/sort.h> -#include <trace/events/bcachefs.h> - -static void bch2_recalc_oldest_io(struct bch_fs *, struct bch_dev *, int); - -/* Ratelimiting/PD controllers */ - -static void pd_controllers_update(struct work_struct *work) -{ - struct bch_fs *c = container_of(to_delayed_work(work), - struct bch_fs, - pd_controllers_update); - struct bch_dev *ca; - unsigned i; - - for_each_member_device(ca, c, i) { - struct bch_dev_usage stats = bch2_dev_usage_read(c, ca); - - u64 free = bucket_to_sector(ca, - __dev_buckets_free(ca, stats)) << 9; - /* - * Bytes of internal fragmentation, which can be - * reclaimed by copy GC - */ - s64 fragmented = (bucket_to_sector(ca, - stats.buckets[BCH_DATA_USER] + - stats.buckets[BCH_DATA_CACHED]) - - (stats.sectors[BCH_DATA_USER] + - stats.sectors[BCH_DATA_CACHED])) << 9; - - fragmented = max(0LL, fragmented); - - bch2_pd_controller_update(&ca->copygc_pd, - free, fragmented, -1); - } - - schedule_delayed_work(&c->pd_controllers_update, - c->pd_controllers_update_seconds * HZ); -} - -/* Persistent alloc info: */ - -static unsigned bch_alloc_val_u64s(const struct bch_alloc *a) -{ - unsigned bytes = offsetof(struct bch_alloc, data); - - if (a->fields & (1 << BCH_ALLOC_FIELD_READ_TIME)) - bytes += 2; - if (a->fields & (1 << BCH_ALLOC_FIELD_WRITE_TIME)) - bytes += 2; - - return DIV_ROUND_UP(bytes, sizeof(u64)); -} - -const char *bch2_alloc_invalid(const struct bch_fs *c, struct bkey_s_c k) -{ - if (k.k->p.inode >= c->sb.nr_devices || - !c->devs[k.k->p.inode]) - return "invalid device"; - - switch (k.k->type) { - case BCH_ALLOC: { - struct bkey_s_c_alloc a = bkey_s_c_to_alloc(k); - - if (bch_alloc_val_u64s(a.v) != bkey_val_u64s(a.k)) - return "incorrect value size"; - break; - } - default: - return "invalid type"; - } - - return NULL; -} - -void bch2_alloc_to_text(struct bch_fs *c, char *buf, - size_t size, struct bkey_s_c k) -{ - buf[0] = '\0'; - - switch (k.k->type) { - case BCH_ALLOC: - break; - } -} - -static inline unsigned get_alloc_field(const u8 **p, unsigned bytes) -{ - unsigned v; - - switch (bytes) { - case 1: - v = **p; - break; - case 2: - v = le16_to_cpup((void *) *p); - break; - case 4: - v = le32_to_cpup((void *) *p); - break; - default: - BUG(); - } - - *p += bytes; - return v; -} - -static inline void put_alloc_field(u8 **p, unsigned bytes, unsigned v) -{ - switch (bytes) { - case 1: - **p = v; - break; - case 2: - *((__le16 *) *p) = cpu_to_le16(v); - break; - case 4: - *((__le32 *) *p) = cpu_to_le32(v); - break; - default: - BUG(); - } - - *p += bytes; -} - -static void bch2_alloc_read_key(struct bch_fs *c, struct bkey_s_c k) -{ - struct bch_dev *ca; - struct bkey_s_c_alloc a; - struct bucket_mark new; - struct bucket *g; - const u8 *d; - - if (k.k->type != BCH_ALLOC) - return; - - a = bkey_s_c_to_alloc(k); - ca = bch_dev_bkey_exists(c, a.k->p.inode); - - if (a.k->p.offset >= ca->mi.nbuckets) - return; - - 
percpu_down_read_preempt_disable(&c->usage_lock); - - g = bucket(ca, a.k->p.offset); - bucket_cmpxchg(g, new, ({ - new.gen = a.v->gen; - new.gen_valid = 1; - })); - - d = a.v->data; - if (a.v->fields & (1 << BCH_ALLOC_FIELD_READ_TIME)) - g->io_time[READ] = get_alloc_field(&d, 2); - if (a.v->fields & (1 << BCH_ALLOC_FIELD_WRITE_TIME)) - g->io_time[WRITE] = get_alloc_field(&d, 2); - - percpu_up_read_preempt_enable(&c->usage_lock); -} - -int bch2_alloc_read(struct bch_fs *c, struct list_head *journal_replay_list) -{ - struct journal_replay *r; - struct btree_iter iter; - struct bkey_s_c k; - struct bch_dev *ca; - unsigned i; - int ret; - - for_each_btree_key(&iter, c, BTREE_ID_ALLOC, POS_MIN, 0, k) { - bch2_alloc_read_key(c, k); - bch2_btree_iter_cond_resched(&iter); - } - - ret = bch2_btree_iter_unlock(&iter); - if (ret) - return ret; - - list_for_each_entry(r, journal_replay_list, list) { - struct bkey_i *k, *n; - struct jset_entry *entry; - - for_each_jset_key(k, n, entry, &r->j) - if (entry->btree_id == BTREE_ID_ALLOC) - bch2_alloc_read_key(c, bkey_i_to_s_c(k)); - } - - mutex_lock(&c->bucket_clock[READ].lock); - for_each_member_device(ca, c, i) { - down_read(&ca->bucket_lock); - bch2_recalc_oldest_io(c, ca, READ); - up_read(&ca->bucket_lock); - } - mutex_unlock(&c->bucket_clock[READ].lock); - - mutex_lock(&c->bucket_clock[WRITE].lock); - for_each_member_device(ca, c, i) { - down_read(&ca->bucket_lock); - bch2_recalc_oldest_io(c, ca, WRITE); - up_read(&ca->bucket_lock); - } - mutex_unlock(&c->bucket_clock[WRITE].lock); - - return 0; -} - -static int __bch2_alloc_write_key(struct bch_fs *c, struct bch_dev *ca, - size_t b, struct btree_iter *iter, - u64 *journal_seq) -{ - struct bucket_mark m; - __BKEY_PADDED(k, DIV_ROUND_UP(sizeof(struct bch_alloc), 8)) alloc_key; - struct bucket *g; - struct bkey_i_alloc *a; - u8 *d; - int ret; - - bch2_btree_iter_set_pos(iter, POS(ca->dev_idx, b)); - - do { - ret = btree_iter_err(bch2_btree_iter_peek_slot(iter)); - if (ret) - break; - - percpu_down_read_preempt_disable(&c->usage_lock); - g = bucket(ca, b); - - /* read mark under btree node lock: */ - m = READ_ONCE(g->mark); - a = bkey_alloc_init(&alloc_key.k); - a->k.p = iter->pos; - a->v.fields = 0; - a->v.gen = m.gen; - set_bkey_val_u64s(&a->k, bch_alloc_val_u64s(&a->v)); - - d = a->v.data; - if (a->v.fields & (1 << BCH_ALLOC_FIELD_READ_TIME)) - put_alloc_field(&d, 2, g->io_time[READ]); - if (a->v.fields & (1 << BCH_ALLOC_FIELD_WRITE_TIME)) - put_alloc_field(&d, 2, g->io_time[WRITE]); - percpu_up_read_preempt_enable(&c->usage_lock); - - ret = bch2_btree_insert_at(c, NULL, NULL, journal_seq, - BTREE_INSERT_ATOMIC| - BTREE_INSERT_NOFAIL| - BTREE_INSERT_USE_RESERVE| - BTREE_INSERT_USE_ALLOC_RESERVE| - BTREE_INSERT_NOWAIT, - BTREE_INSERT_ENTRY(iter, &a->k_i)); - bch2_btree_iter_cond_resched(iter); - } while (ret == -EINTR); - - return ret; -} - -int bch2_alloc_replay_key(struct bch_fs *c, struct bpos pos) -{ - struct bch_dev *ca; - struct btree_iter iter; - int ret; - - if (pos.inode >= c->sb.nr_devices || !c->devs[pos.inode]) - return 0; - - ca = bch_dev_bkey_exists(c, pos.inode); - - if (pos.offset >= ca->mi.nbuckets) - return 0; - - bch2_btree_iter_init(&iter, c, BTREE_ID_ALLOC, POS_MIN, - BTREE_ITER_SLOTS|BTREE_ITER_INTENT); - - ret = __bch2_alloc_write_key(c, ca, pos.offset, &iter, NULL); - bch2_btree_iter_unlock(&iter); - return ret; -} - -int bch2_alloc_write(struct bch_fs *c) -{ - struct bch_dev *ca; - unsigned i; - int ret = 0; - - for_each_rw_member(ca, c, i) { - struct btree_iter iter; - unsigned 
long bucket; - - bch2_btree_iter_init(&iter, c, BTREE_ID_ALLOC, POS_MIN, - BTREE_ITER_SLOTS|BTREE_ITER_INTENT); - - down_read(&ca->bucket_lock); - for_each_set_bit(bucket, ca->buckets_dirty, ca->mi.nbuckets) { - ret = __bch2_alloc_write_key(c, ca, bucket, &iter, NULL); - if (ret) - break; - - clear_bit(bucket, ca->buckets_dirty); - } - up_read(&ca->bucket_lock); - bch2_btree_iter_unlock(&iter); - - if (ret) { - percpu_ref_put(&ca->io_ref); - break; - } - } - - return ret; -} - -/* Bucket IO clocks: */ - -static void bch2_recalc_oldest_io(struct bch_fs *c, struct bch_dev *ca, int rw) -{ - struct bucket_clock *clock = &c->bucket_clock[rw]; - struct bucket_array *buckets = bucket_array(ca); - struct bucket *g; - u16 max_last_io = 0; - unsigned i; - - lockdep_assert_held(&c->bucket_clock[rw].lock); - - /* Recalculate max_last_io for this device: */ - for_each_bucket(g, buckets) - max_last_io = max(max_last_io, bucket_last_io(c, g, rw)); - - ca->max_last_bucket_io[rw] = max_last_io; - - /* Recalculate global max_last_io: */ - max_last_io = 0; - - for_each_member_device(ca, c, i) - max_last_io = max(max_last_io, ca->max_last_bucket_io[rw]); - - clock->max_last_io = max_last_io; -} - -static void bch2_rescale_bucket_io_times(struct bch_fs *c, int rw) -{ - struct bucket_clock *clock = &c->bucket_clock[rw]; - struct bucket_array *buckets; - struct bch_dev *ca; - struct bucket *g; - unsigned i; - - trace_rescale_prios(c); - - for_each_member_device(ca, c, i) { - down_read(&ca->bucket_lock); - buckets = bucket_array(ca); - - for_each_bucket(g, buckets) - g->io_time[rw] = clock->hand - - bucket_last_io(c, g, rw) / 2; - - bch2_recalc_oldest_io(c, ca, rw); - - up_read(&ca->bucket_lock); - } -} - -static void bch2_inc_clock_hand(struct io_timer *timer) -{ - struct bucket_clock *clock = container_of(timer, - struct bucket_clock, rescale); - struct bch_fs *c = container_of(clock, - struct bch_fs, bucket_clock[clock->rw]); - struct bch_dev *ca; - u64 capacity; - unsigned i; - - mutex_lock(&clock->lock); - - /* if clock cannot be advanced more, rescale prio */ - if (clock->max_last_io >= U16_MAX - 2) - bch2_rescale_bucket_io_times(c, clock->rw); - - BUG_ON(clock->max_last_io >= U16_MAX - 2); - - for_each_member_device(ca, c, i) - ca->max_last_bucket_io[clock->rw]++; - clock->max_last_io++; - clock->hand++; - - mutex_unlock(&clock->lock); - - capacity = READ_ONCE(c->capacity); - - if (!capacity) - return; - - /* - * we only increment when 0.1% of the filesystem capacity has been read - * or written too, this determines if it's time - * - * XXX: we shouldn't really be going off of the capacity of devices in - * RW mode (that will be 0 when we're RO, yet we can still service - * reads) - */ - timer->expire += capacity >> 10; - - bch2_io_timer_add(&c->io_clock[clock->rw], timer); -} - -static void bch2_bucket_clock_init(struct bch_fs *c, int rw) -{ - struct bucket_clock *clock = &c->bucket_clock[rw]; - - clock->hand = 1; - clock->rw = rw; - clock->rescale.fn = bch2_inc_clock_hand; - clock->rescale.expire = c->capacity >> 10; - mutex_init(&clock->lock); -} - -/* Background allocator thread: */ - -/* - * Scans for buckets to be invalidated, invalidates them, rewrites prios/gens - * (marking them as invalidated on disk), then optionally issues discard - * commands to the newly free buckets, then puts them on the various freelists. 
- */ - -static void verify_not_on_freelist(struct bch_fs *c, struct bch_dev *ca, - size_t bucket) -{ - if (expensive_debug_checks(c) && - test_bit(BCH_FS_ALLOCATOR_STARTED, &c->flags)) { - size_t iter; - long i; - unsigned j; - - for (j = 0; j < RESERVE_NR; j++) - fifo_for_each_entry(i, &ca->free[j], iter) - BUG_ON(i == bucket); - fifo_for_each_entry(i, &ca->free_inc, iter) - BUG_ON(i == bucket); - } -} - -#define BUCKET_GC_GEN_MAX 96U - -/** - * wait_buckets_available - wait on reclaimable buckets - * - * If there aren't enough available buckets to fill up free_inc, wait until - * there are. - */ -static int wait_buckets_available(struct bch_fs *c, struct bch_dev *ca) -{ - unsigned long gc_count = c->gc_count; - int ret = 0; - - while (1) { - set_current_state(TASK_INTERRUPTIBLE); - if (kthread_should_stop()) { - ret = 1; - break; - } - - if (gc_count != c->gc_count) - ca->inc_gen_really_needs_gc = 0; - - if ((ssize_t) (dev_buckets_available(c, ca) - - ca->inc_gen_really_needs_gc) >= - (ssize_t) fifo_free(&ca->free_inc)) - break; - - up_read(&c->gc_lock); - schedule(); - try_to_freeze(); - down_read(&c->gc_lock); - } - - __set_current_state(TASK_RUNNING); - return ret; -} - -static bool bch2_can_invalidate_bucket(struct bch_dev *ca, - size_t bucket, - struct bucket_mark mark) -{ - u8 gc_gen; - - if (!is_available_bucket(mark)) - return false; - - gc_gen = bucket_gc_gen(ca, bucket); - - if (gc_gen >= BUCKET_GC_GEN_MAX / 2) - ca->inc_gen_needs_gc++; - - if (gc_gen >= BUCKET_GC_GEN_MAX) - ca->inc_gen_really_needs_gc++; - - return gc_gen < BUCKET_GC_GEN_MAX; -} - -static void bch2_invalidate_one_bucket(struct bch_fs *c, struct bch_dev *ca, - size_t bucket) -{ - struct bucket_mark m; - - percpu_down_read_preempt_disable(&c->usage_lock); - spin_lock(&c->freelist_lock); - - if (!bch2_invalidate_bucket(c, ca, bucket, &m)) { - spin_unlock(&c->freelist_lock); - percpu_up_read_preempt_enable(&c->usage_lock); - return; - } - - verify_not_on_freelist(c, ca, bucket); - BUG_ON(!fifo_push(&ca->free_inc, bucket)); - - spin_unlock(&c->freelist_lock); - percpu_up_read_preempt_enable(&c->usage_lock); - - /* gc lock held: */ - bucket_io_clock_reset(c, ca, bucket, READ); - bucket_io_clock_reset(c, ca, bucket, WRITE); - - if (m.cached_sectors) { - ca->allocator_invalidating_data = true; - } else if (m.journal_seq_valid) { - u64 journal_seq = atomic64_read(&c->journal.seq); - u64 bucket_seq = journal_seq; - - bucket_seq &= ~((u64) U16_MAX); - bucket_seq |= m.journal_seq; - - if (bucket_seq > journal_seq) - bucket_seq -= 1 << 16; - - ca->allocator_journal_seq_flush = - max(ca->allocator_journal_seq_flush, bucket_seq); - } -} - -/* - * Determines what order we're going to reuse buckets, smallest bucket_key() - * first. - * - * - * - We take into account the read prio of the bucket, which gives us an - * indication of how hot the data is -- we scale the prio so that the prio - * farthest from the clock is worth 1/8th of the closest. - * - * - The number of sectors of cached data in the bucket, which gives us an - * indication of the cost in cache misses this eviction will cause. - * - * - If hotness * sectors used compares equal, we pick the bucket with the - * smallest bucket_gc_gen() - since incrementing the same bucket's generation - * number repeatedly forces us to run mark and sweep gc to avoid generation - * number wraparound. 
- */ - -static unsigned long bucket_sort_key(struct bch_fs *c, struct bch_dev *ca, - size_t b, struct bucket_mark m) -{ - unsigned last_io = bucket_last_io(c, bucket(ca, b), READ); - unsigned max_last_io = ca->max_last_bucket_io[READ]; - - /* - * Time since last read, scaled to [0, 8) where larger value indicates - * more recently read data: - */ - unsigned long hotness = (max_last_io - last_io) * 7 / max_last_io; - - /* How much we want to keep the data in this bucket: */ - unsigned long data_wantness = - (hotness + 1) * bucket_sectors_used(m); - - unsigned long needs_journal_commit = - bucket_needs_journal_commit(m, c->journal.last_seq_ondisk); - - return (data_wantness << 9) | - (needs_journal_commit << 8) | - bucket_gc_gen(ca, b); -} - -static inline int bucket_alloc_cmp(alloc_heap *h, - struct alloc_heap_entry l, - struct alloc_heap_entry r) -{ - return (l.key > r.key) - (l.key < r.key) ?: - (l.nr < r.nr) - (l.nr > r.nr) ?: - (l.bucket > r.bucket) - (l.bucket < r.bucket); -} - -static void find_reclaimable_buckets_lru(struct bch_fs *c, struct bch_dev *ca) -{ - struct bucket_array *buckets; - struct alloc_heap_entry e = { 0 }; - size_t b; - - ca->alloc_heap.used = 0; - - mutex_lock(&c->bucket_clock[READ].lock); - down_read(&ca->bucket_lock); - - buckets = bucket_array(ca); - - bch2_recalc_oldest_io(c, ca, READ); - - /* - * Find buckets with lowest read priority, by building a maxheap sorted - * by read priority and repeatedly replacing the maximum element until - * all buckets have been visited. - */ - for (b = ca->mi.first_bucket; b < ca->mi.nbuckets; b++) { - struct bucket_mark m = READ_ONCE(buckets->b[b].mark); - unsigned long key = bucket_sort_key(c, ca, b, m); - - if (!bch2_can_invalidate_bucket(ca, b, m)) - continue; - - if (e.nr && e.bucket + e.nr == b && e.key == key) { - e.nr++; - } else { - if (e.nr) - heap_add_or_replace(&ca->alloc_heap, e, -bucket_alloc_cmp); - - e = (struct alloc_heap_entry) { - .bucket = b, - .nr = 1, - .key = key, - }; - } - - cond_resched(); - } - - if (e.nr) - heap_add_or_replace(&ca->alloc_heap, e, -bucket_alloc_cmp); - - up_read(&ca->bucket_lock); - mutex_unlock(&c->bucket_clock[READ].lock); - - heap_resort(&ca->alloc_heap, bucket_alloc_cmp); - - while (heap_pop(&ca->alloc_heap, e, bucket_alloc_cmp)) { - for (b = e.bucket; - b < e.bucket + e.nr; - b++) { - if (fifo_full(&ca->free_inc)) - return; - - bch2_invalidate_one_bucket(c, ca, b); - } - } -} - -static void find_reclaimable_buckets_fifo(struct bch_fs *c, struct bch_dev *ca) -{ - struct bucket_array *buckets = bucket_array(ca); - struct bucket_mark m; - size_t b, checked; - - for (checked = 0; - checked < ca->mi.nbuckets && !fifo_full(&ca->free_inc); - checked++) { - if (ca->fifo_last_bucket < ca->mi.first_bucket || - ca->fifo_last_bucket >= ca->mi.nbuckets) - ca->fifo_last_bucket = ca->mi.first_bucket; - - b = ca->fifo_last_bucket++; - - m = READ_ONCE(buckets->b[b].mark); - - if (bch2_can_invalidate_bucket(ca, b, m)) - bch2_invalidate_one_bucket(c, ca, b); - - cond_resched(); - } -} - -static void find_reclaimable_buckets_random(struct bch_fs *c, struct bch_dev *ca) -{ - struct bucket_array *buckets = bucket_array(ca); - struct bucket_mark m; - size_t checked; - - for (checked = 0; - checked < ca->mi.nbuckets / 2 && !fifo_full(&ca->free_inc); - checked++) { - size_t b = bch2_rand_range(ca->mi.nbuckets - - ca->mi.first_bucket) + - ca->mi.first_bucket; - - m = READ_ONCE(buckets->b[b].mark); - - if (bch2_can_invalidate_bucket(ca, b, m)) - bch2_invalidate_one_bucket(c, ca, b); - - cond_resched(); - 
} -} - -static void find_reclaimable_buckets(struct bch_fs *c, struct bch_dev *ca) -{ - ca->inc_gen_needs_gc = 0; - ca->inc_gen_really_needs_gc = 0; - - switch (ca->mi.replacement) { - case CACHE_REPLACEMENT_LRU: - find_reclaimable_buckets_lru(c, ca); - break; - case CACHE_REPLACEMENT_FIFO: - find_reclaimable_buckets_fifo(c, ca); - break; - case CACHE_REPLACEMENT_RANDOM: - find_reclaimable_buckets_random(c, ca); - break; - } -} - -static int size_t_cmp(const void *_l, const void *_r) -{ - const size_t *l = _l, *r = _r; - - return (*l > *r) - (*l < *r); -} - -static void sort_free_inc(struct bch_fs *c, struct bch_dev *ca) -{ - BUG_ON(ca->free_inc.front); - - spin_lock(&c->freelist_lock); - sort(ca->free_inc.data, - ca->free_inc.back, - sizeof(ca->free_inc.data[0]), - size_t_cmp, NULL); - spin_unlock(&c->freelist_lock); -} - -static int bch2_invalidate_free_inc(struct bch_fs *c, struct bch_dev *ca, - u64 *journal_seq, size_t nr) -{ - struct btree_iter iter; - int ret = 0; - - bch2_btree_iter_init(&iter, c, BTREE_ID_ALLOC, POS(ca->dev_idx, 0), - BTREE_ITER_SLOTS|BTREE_ITER_INTENT); - - /* - * XXX: if ca->nr_invalidated != 0, just return if we'd block doing the - * btree update or journal_res_get - */ - while (ca->nr_invalidated < min(nr, fifo_used(&ca->free_inc))) { - size_t b = fifo_idx_entry(&ca->free_inc, ca->nr_invalidated); - - ret = __bch2_alloc_write_key(c, ca, b, &iter, journal_seq); - if (ret) - break; - - ca->nr_invalidated++; - } - - bch2_btree_iter_unlock(&iter); - return ret; -} - -static bool __push_invalidated_bucket(struct bch_fs *c, struct bch_dev *ca, size_t bucket) -{ - unsigned i; - - /* - * Don't remove from free_inc until after it's added to - * freelist, so gc can find it: - */ - spin_lock(&c->freelist_lock); - for (i = 0; i < RESERVE_NR; i++) - if (fifo_push(&ca->free[i], bucket)) { - fifo_pop(&ca->free_inc, bucket); - --ca->nr_invalidated; - closure_wake_up(&c->freelist_wait); - spin_unlock(&c->freelist_lock); - return true; - } - spin_unlock(&c->freelist_lock); - - return false; -} - -static int push_invalidated_bucket(struct bch_fs *c, struct bch_dev *ca, size_t bucket) -{ - int ret = 0; - - while (1) { - set_current_state(TASK_INTERRUPTIBLE); - - if (__push_invalidated_bucket(c, ca, bucket)) - break; - - if ((current->flags & PF_KTHREAD) && - kthread_should_stop()) { - ret = 1; - break; - } - - schedule(); - try_to_freeze(); - } - - __set_current_state(TASK_RUNNING); - return ret; -} - -/* - * Given an invalidated, ready to use bucket: issue a discard to it if enabled, - * then add it to the freelist, waiting until there's room if necessary: - */ -static int discard_invalidated_buckets(struct bch_fs *c, struct bch_dev *ca) -{ - while (ca->nr_invalidated) { - size_t bucket = fifo_peek(&ca->free_inc); - - BUG_ON(fifo_empty(&ca->free_inc) || !ca->nr_invalidated); - - if (ca->mi.discard && - blk_queue_discard(bdev_get_queue(ca->disk_sb.bdev))) - blkdev_issue_discard(ca->disk_sb.bdev, - bucket_to_sector(ca, bucket), - ca->mi.bucket_size, GFP_NOIO, 0); - - if (push_invalidated_bucket(c, ca, bucket)) - return 1; - } - - return 0; -} - -/** - * bch_allocator_thread - move buckets from free_inc to reserves - * - * The free_inc FIFO is populated by find_reclaimable_buckets(), and - * the reserves are depleted by bucket allocation. When we run out - * of free_inc, try to invalidate some buckets and write out - * prios and gens. 
- */ -static int bch2_allocator_thread(void *arg) -{ - struct bch_dev *ca = arg; - struct bch_fs *c = ca->fs; - u64 journal_seq; - int ret; - - set_freezable(); - - while (1) { - while (1) { - cond_resched(); - - pr_debug("discarding %zu invalidated buckets", - ca->nr_invalidated); - - ret = discard_invalidated_buckets(c, ca); - if (ret) - goto stop; - - if (fifo_empty(&ca->free_inc)) - break; - - pr_debug("invalidating %zu buckets", - fifo_used(&ca->free_inc)); - - journal_seq = 0; - ret = bch2_invalidate_free_inc(c, ca, &journal_seq, SIZE_MAX); - if (ret) { - bch_err(ca, "error invalidating buckets: %i", ret); - goto stop; - } - - if (!ca->nr_invalidated) { - bch_err(ca, "allocator thread unable to make forward progress!"); - goto stop; - } - - if (ca->allocator_invalidating_data) - ret = bch2_journal_flush_seq(&c->journal, journal_seq); - else if (ca->allocator_journal_seq_flush) - ret = bch2_journal_flush_seq(&c->journal, - ca->allocator_journal_seq_flush); - - /* - * journal error - buckets haven't actually been - * invalidated, can't discard them: - */ - if (ret) { - bch_err(ca, "journal error: %i", ret); - goto stop; - } - } - - pr_debug("free_inc now empty"); - - /* Reset front/back so we can easily sort fifo entries later: */ - ca->free_inc.front = ca->free_inc.back = 0; - ca->allocator_journal_seq_flush = 0; - ca->allocator_invalidating_data = false; - - down_read(&c->gc_lock); - while (1) { - size_t prev = fifo_used(&ca->free_inc); - - if (test_bit(BCH_FS_GC_FAILURE, &c->flags)) { - up_read(&c->gc_lock); - bch_err(ca, "gc failure"); - goto stop; - } - - /* - * Find some buckets that we can invalidate, either - * they're completely unused, or only contain clean data - * that's been written back to the backing device or - * another cache tier - */ - - pr_debug("scanning for reclaimable buckets"); - - find_reclaimable_buckets(c, ca); - - pr_debug("found %zu buckets (free_inc %zu/%zu)", - fifo_used(&ca->free_inc) - prev, - fifo_used(&ca->free_inc), ca->free_inc.size); - - trace_alloc_batch(ca, fifo_used(&ca->free_inc), - ca->free_inc.size); - - if ((ca->inc_gen_needs_gc >= ca->free_inc.size || - (!fifo_full(&ca->free_inc) && - ca->inc_gen_really_needs_gc >= - fifo_free(&ca->free_inc))) && - c->gc_thread) { - atomic_inc(&c->kick_gc); - wake_up_process(c->gc_thread); - } - - if (fifo_full(&ca->free_inc)) - break; - - if (!fifo_empty(&ca->free_inc) && - !fifo_full(&ca->free[RESERVE_MOVINGGC])) - break; - - /* - * copygc may be waiting until either its reserve fills - * up, or we can't make forward progress: - */ - ca->allocator_blocked = true; - closure_wake_up(&c->freelist_wait); - - ret = wait_buckets_available(c, ca); - if (ret) { - up_read(&c->gc_lock); - goto stop; - } - } - - ca->allocator_blocked = false; - up_read(&c->gc_lock); - - pr_debug("free_inc now %zu/%zu", - fifo_used(&ca->free_inc), - ca->free_inc.size); - - sort_free_inc(c, ca); - - /* - * free_inc is now full of newly-invalidated buckets: next, - * write out the new bucket gens: - */ - } - -stop: - pr_debug("alloc thread stopping (ret %i)", ret); - return 0; -} - -/* Allocation */ - -/* - * Open buckets represent a bucket that's currently being allocated from. 
They - * serve two purposes: - * - * - They track buckets that have been partially allocated, allowing for - * sub-bucket sized allocations - they're used by the sector allocator below - * - * - They provide a reference to the buckets they own that mark and sweep GC - * can find, until the new allocation has a pointer to it inserted into the - * btree - * - * When allocating some space with the sector allocator, the allocation comes - * with a reference to an open bucket - the caller is required to put that - * reference _after_ doing the index update that makes its allocation reachable. - */ - -void __bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob) -{ - struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev); - - percpu_down_read_preempt_disable(&c->usage_lock); - spin_lock(&ob->lock); - - bch2_mark_alloc_bucket(c, ca, PTR_BUCKET_NR(ca, &ob->ptr), - false, gc_pos_alloc(c, ob), 0); - ob->valid = false; - - spin_unlock(&ob->lock); - percpu_up_read_preempt_enable(&c->usage_lock); - - spin_lock(&c->freelist_lock); - ob->freelist = c->open_buckets_freelist; - c->open_buckets_freelist = ob - c->open_buckets; - c->open_buckets_nr_free++; - spin_unlock(&c->freelist_lock); - - closure_wake_up(&c->open_buckets_wait); -} - -static struct open_bucket *bch2_open_bucket_alloc(struct bch_fs *c) -{ - struct open_bucket *ob; - - BUG_ON(!c->open_buckets_freelist || !c->open_buckets_nr_free); - - ob = c->open_buckets + c->open_buckets_freelist; - c->open_buckets_freelist = ob->freelist; - atomic_set(&ob->pin, 1); - - c->open_buckets_nr_free--; - return ob; -} - -/* _only_ for allocating the journal on a new device: */ -long bch2_bucket_alloc_new_fs(struct bch_dev *ca) -{ - struct bucket_array *buckets; - ssize_t b; - - rcu_read_lock(); - buckets = bucket_array(ca); - - for (b = ca->mi.first_bucket; b < ca->mi.nbuckets; b++) - if (is_available_bucket(buckets->b[b].mark)) - goto success; - b = -1; -success: - rcu_read_unlock(); - return b; -} - -static inline unsigned open_buckets_reserved(enum alloc_reserve reserve) -{ - switch (reserve) { - case RESERVE_ALLOC: - return 0; - case RESERVE_BTREE: - return BTREE_NODE_RESERVE / 2; - default: - return BTREE_NODE_RESERVE; - } -} - -/** - * bch_bucket_alloc - allocate a single bucket from a specific device - * - * Returns index of bucket on success, 0 on failure - * */ -int bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca, - enum alloc_reserve reserve, - bool may_alloc_partial, - struct closure *cl) -{ - struct bucket_array *buckets; - struct open_bucket *ob; - long bucket; - - spin_lock(&c->freelist_lock); - - if (may_alloc_partial && - ca->open_buckets_partial_nr) { - int ret = ca->open_buckets_partial[--ca->open_buckets_partial_nr]; - c->open_buckets[ret].on_partial_list = false; - spin_unlock(&c->freelist_lock); - return ret; - } - - if (unlikely(c->open_buckets_nr_free <= open_buckets_reserved(reserve))) { - if (cl) - closure_wait(&c->open_buckets_wait, cl); - spin_unlock(&c->freelist_lock); - trace_open_bucket_alloc_fail(ca, reserve); - return OPEN_BUCKETS_EMPTY; - } - - if (likely(fifo_pop(&ca->free[RESERVE_NONE], bucket))) - goto out; - - switch (reserve) { - case RESERVE_ALLOC: - if (fifo_pop(&ca->free[RESERVE_BTREE], bucket)) - goto out; - break; - case RESERVE_BTREE: - if (fifo_used(&ca->free[RESERVE_BTREE]) * 2 >= - ca->free[RESERVE_BTREE].size && - fifo_pop(&ca->free[RESERVE_BTREE], bucket)) - goto out; - break; - case RESERVE_MOVINGGC: - if (fifo_pop(&ca->free[RESERVE_MOVINGGC], bucket)) - goto out; - break; - default: - break; - 
} - - if (cl) - closure_wait(&c->freelist_wait, cl); - - spin_unlock(&c->freelist_lock); - - trace_bucket_alloc_fail(ca, reserve); - return FREELIST_EMPTY; -out: - verify_not_on_freelist(c, ca, bucket); - - ob = bch2_open_bucket_alloc(c); - - spin_lock(&ob->lock); - buckets = bucket_array(ca); - - ob->valid = true; - ob->sectors_free = ca->mi.bucket_size; - ob->ptr = (struct bch_extent_ptr) { - .gen = buckets->b[bucket].mark.gen, - .offset = bucket_to_sector(ca, bucket), - .dev = ca->dev_idx, - }; - - bucket_io_clock_reset(c, ca, bucket, READ); - bucket_io_clock_reset(c, ca, bucket, WRITE); - spin_unlock(&ob->lock); - - spin_unlock(&c->freelist_lock); - - bch2_wake_allocator(ca); - - trace_bucket_alloc(ca, reserve); - return ob - c->open_buckets; -} - -static int __dev_alloc_cmp(struct write_point *wp, - unsigned l, unsigned r) -{ - return ((wp->next_alloc[l] > wp->next_alloc[r]) - - (wp->next_alloc[l] < wp->next_alloc[r])); -} - -#define dev_alloc_cmp(l, r) __dev_alloc_cmp(wp, l, r) - -struct dev_alloc_list bch2_wp_alloc_list(struct bch_fs *c, - struct write_point *wp, - struct bch_devs_mask *devs) -{ - struct dev_alloc_list ret = { .nr = 0 }; - struct bch_dev *ca; - unsigned i; - - for_each_member_device_rcu(ca, c, i, devs) - ret.devs[ret.nr++] = i; - - bubble_sort(ret.devs, ret.nr, dev_alloc_cmp); - return ret; -} - -void bch2_wp_rescale(struct bch_fs *c, struct bch_dev *ca, - struct write_point *wp) -{ - u64 *v = wp->next_alloc + ca->dev_idx; - u64 free_space = dev_buckets_free(c, ca); - u64 free_space_inv = free_space - ? div64_u64(1ULL << 48, free_space) - : 1ULL << 48; - u64 scale = *v / 4; - - if (*v + free_space_inv >= *v) - *v += free_space_inv; - else - *v = U64_MAX; - - for (v = wp->next_alloc; - v < wp->next_alloc + ARRAY_SIZE(wp->next_alloc); v++) - *v = *v < scale ? 
0 : *v - scale; -} - -static enum bucket_alloc_ret bch2_bucket_alloc_set(struct bch_fs *c, - struct write_point *wp, - unsigned nr_replicas, - enum alloc_reserve reserve, - struct bch_devs_mask *devs, - struct closure *cl) -{ - enum bucket_alloc_ret ret = NO_DEVICES; - struct dev_alloc_list devs_sorted; - struct bch_dev *ca; - unsigned i, nr_ptrs_effective = 0; - bool have_cache_dev = false; - - BUG_ON(nr_replicas > ARRAY_SIZE(wp->ptrs)); - - for (i = wp->first_ptr; i < wp->nr_ptrs; i++) { - ca = bch_dev_bkey_exists(c, wp->ptrs[i]->ptr.dev); - - nr_ptrs_effective += ca->mi.durability; - have_cache_dev |= !ca->mi.durability; - } - - if (nr_ptrs_effective >= nr_replicas) - return ALLOC_SUCCESS; - - devs_sorted = bch2_wp_alloc_list(c, wp, devs); - - for (i = 0; i < devs_sorted.nr; i++) { - int ob; - - ca = rcu_dereference(c->devs[devs_sorted.devs[i]]); - if (!ca) - continue; - - if (!ca->mi.durability && - (have_cache_dev || - wp->type != BCH_DATA_USER)) - continue; - - ob = bch2_bucket_alloc(c, ca, reserve, - wp->type == BCH_DATA_USER, cl); - if (ob < 0) { - ret = ob; - if (ret == OPEN_BUCKETS_EMPTY) - break; - continue; - } - - BUG_ON(ob <= 0 || ob > U8_MAX); - BUG_ON(wp->nr_ptrs >= ARRAY_SIZE(wp->ptrs)); - - wp->ptrs[wp->nr_ptrs++] = c->open_buckets + ob; - - bch2_wp_rescale(c, ca, wp); - - nr_ptrs_effective += ca->mi.durability; - have_cache_dev |= !ca->mi.durability; - - __clear_bit(ca->dev_idx, devs->d); - - if (nr_ptrs_effective >= nr_replicas) { - ret = ALLOC_SUCCESS; - break; - } - } - - EBUG_ON(reserve == RESERVE_MOVINGGC && - ret != ALLOC_SUCCESS && - ret != OPEN_BUCKETS_EMPTY); - - switch (ret) { - case ALLOC_SUCCESS: - return 0; - case NO_DEVICES: - return -EROFS; - case FREELIST_EMPTY: - case OPEN_BUCKETS_EMPTY: - return cl ? -EAGAIN : -ENOSPC; - default: - BUG(); - } -} - -/* Sector allocator */ - -static void writepoint_drop_ptr(struct bch_fs *c, - struct write_point *wp, - unsigned i) -{ - struct open_bucket *ob = wp->ptrs[i]; - struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev); - - BUG_ON(ca->open_buckets_partial_nr >= - ARRAY_SIZE(ca->open_buckets_partial)); - - if (wp->type == BCH_DATA_USER) { - spin_lock(&c->freelist_lock); - ob->on_partial_list = true; - ca->open_buckets_partial[ca->open_buckets_partial_nr++] = - ob - c->open_buckets; - spin_unlock(&c->freelist_lock); - - closure_wake_up(&c->open_buckets_wait); - closure_wake_up(&c->freelist_wait); - } else { - bch2_open_bucket_put(c, ob); - } - - array_remove_item(wp->ptrs, wp->nr_ptrs, i); - - if (i < wp->first_ptr) - wp->first_ptr--; -} - -static void writepoint_drop_ptrs(struct bch_fs *c, - struct write_point *wp, - u16 target, bool in_target) -{ - int i; - - for (i = wp->first_ptr - 1; i >= 0; --i) - if (bch2_dev_in_target(c, wp->ptrs[i]->ptr.dev, - target) == in_target) - writepoint_drop_ptr(c, wp, i); -} - -static void verify_not_stale(struct bch_fs *c, const struct write_point *wp) -{ -#ifdef CONFIG_BCACHEFS_DEBUG - struct open_bucket *ob; - unsigned i; - - writepoint_for_each_ptr_all(wp, ob, i) { - struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev); - - BUG_ON(ptr_stale(ca, &ob->ptr)); - } -#endif -} - -static int open_bucket_add_buckets(struct bch_fs *c, - u16 target, - struct write_point *wp, - struct bch_devs_list *devs_have, - unsigned nr_replicas, - enum alloc_reserve reserve, - struct closure *cl) -{ - struct bch_devs_mask devs = c->rw_devs[wp->type]; - const struct bch_devs_mask *t; - struct open_bucket *ob; - unsigned i; - int ret; - - percpu_down_read_preempt_disable(&c->usage_lock); - 
rcu_read_lock(); - - /* Don't allocate from devices we already have pointers to: */ - for (i = 0; i < devs_have->nr; i++) - __clear_bit(devs_have->devs[i], devs.d); - - writepoint_for_each_ptr_all(wp, ob, i) - __clear_bit(ob->ptr.dev, devs.d); - - t = bch2_target_to_mask(c, target); - if (t) - bitmap_and(devs.d, devs.d, t->d, BCH_SB_MEMBERS_MAX); - - ret = bch2_bucket_alloc_set(c, wp, nr_replicas, reserve, &devs, cl); - - rcu_read_unlock(); - percpu_up_read_preempt_enable(&c->usage_lock); - - return ret; -} - -static struct write_point *__writepoint_find(struct hlist_head *head, - unsigned long write_point) -{ - struct write_point *wp; - - hlist_for_each_entry_rcu(wp, head, node) - if (wp->write_point == write_point) - return wp; - - return NULL; -} - -static struct hlist_head *writepoint_hash(struct bch_fs *c, - unsigned long write_point) -{ - unsigned hash = - hash_long(write_point, ilog2(ARRAY_SIZE(c->write_points_hash))); - - return &c->write_points_hash[hash]; -} - -static struct write_point *writepoint_find(struct bch_fs *c, - unsigned long write_point) -{ - struct write_point *wp, *oldest; - struct hlist_head *head; - - if (!(write_point & 1UL)) { - wp = (struct write_point *) write_point; - mutex_lock(&wp->lock); - return wp; - } - - head = writepoint_hash(c, write_point); -restart_find: - wp = __writepoint_find(head, write_point); - if (wp) { -lock_wp: - mutex_lock(&wp->lock); - if (wp->write_point == write_point) - goto out; - mutex_unlock(&wp->lock); - goto restart_find; - } - - oldest = NULL; - for (wp = c->write_points; - wp < c->write_points + ARRAY_SIZE(c->write_points); - wp++) - if (!oldest || time_before64(wp->last_used, oldest->last_used)) - oldest = wp; - - mutex_lock(&oldest->lock); - mutex_lock(&c->write_points_hash_lock); - wp = __writepoint_find(head, write_point); - if (wp && wp != oldest) { - mutex_unlock(&c->write_points_hash_lock); - mutex_unlock(&oldest->lock); - goto lock_wp; - } - - wp = oldest; - hlist_del_rcu(&wp->node); - wp->write_point = write_point; - hlist_add_head_rcu(&wp->node, head); - mutex_unlock(&c->write_points_hash_lock); -out: - wp->last_used = sched_clock(); - return wp; -} - -/* - * Get us an open_bucket we can allocate from, return with it locked: - */ -struct write_point *bch2_alloc_sectors_start(struct bch_fs *c, - unsigned target, - struct write_point_specifier write_point, - struct bch_devs_list *devs_have, - unsigned nr_replicas, - unsigned nr_replicas_required, - enum alloc_reserve reserve, - unsigned flags, - struct closure *cl) -{ - struct write_point *wp; - struct open_bucket *ob; - struct bch_dev *ca; - unsigned nr_ptrs_have, nr_ptrs_effective; - int ret, i, cache_idx = -1; - - BUG_ON(!nr_replicas || !nr_replicas_required); - - wp = writepoint_find(c, write_point.v); - - wp->first_ptr = 0; - - /* does writepoint have ptrs we can't use? */ - writepoint_for_each_ptr(wp, ob, i) - if (bch2_dev_list_has_dev(*devs_have, ob->ptr.dev)) { - swap(wp->ptrs[i], wp->ptrs[wp->first_ptr]); - wp->first_ptr++; - } - - nr_ptrs_have = wp->first_ptr; - - /* does writepoint have ptrs we don't want to use? 
*/ - if (target) - writepoint_for_each_ptr(wp, ob, i) - if (!bch2_dev_in_target(c, ob->ptr.dev, target)) { - swap(wp->ptrs[i], wp->ptrs[wp->first_ptr]); - wp->first_ptr++; - } - - if (flags & BCH_WRITE_ONLY_SPECIFIED_DEVS) { - ret = open_bucket_add_buckets(c, target, wp, devs_have, - nr_replicas, reserve, cl); - } else { - ret = open_bucket_add_buckets(c, target, wp, devs_have, - nr_replicas, reserve, NULL); - if (!ret) - goto alloc_done; - - wp->first_ptr = nr_ptrs_have; - - ret = open_bucket_add_buckets(c, 0, wp, devs_have, - nr_replicas, reserve, cl); - } - - if (ret && ret != -EROFS) - goto err; -alloc_done: - /* check for more than one cache: */ - for (i = wp->nr_ptrs - 1; i >= wp->first_ptr; --i) { - ca = bch_dev_bkey_exists(c, wp->ptrs[i]->ptr.dev); - - if (ca->mi.durability) - continue; - - /* - * if we ended up with more than one cache device, prefer the - * one in the target we want: - */ - if (cache_idx >= 0) { - if (!bch2_dev_in_target(c, wp->ptrs[i]->ptr.dev, - target)) { - writepoint_drop_ptr(c, wp, i); - } else { - writepoint_drop_ptr(c, wp, cache_idx); - cache_idx = i; - } - } else { - cache_idx = i; - } - } - - /* we might have more effective replicas than required: */ - nr_ptrs_effective = 0; - writepoint_for_each_ptr(wp, ob, i) { - ca = bch_dev_bkey_exists(c, ob->ptr.dev); - nr_ptrs_effective += ca->mi.durability; - } - - if (ret == -EROFS && - nr_ptrs_effective >= nr_replicas_required) - ret = 0; - - if (ret) - goto err; - - if (nr_ptrs_effective > nr_replicas) { - writepoint_for_each_ptr(wp, ob, i) { - ca = bch_dev_bkey_exists(c, ob->ptr.dev); - - if (ca->mi.durability && - ca->mi.durability <= nr_ptrs_effective - nr_replicas && - !bch2_dev_in_target(c, ob->ptr.dev, target)) { - swap(wp->ptrs[i], wp->ptrs[wp->first_ptr]); - wp->first_ptr++; - nr_ptrs_effective -= ca->mi.durability; - } - } - } - - if (nr_ptrs_effective > nr_replicas) { - writepoint_for_each_ptr(wp, ob, i) { - ca = bch_dev_bkey_exists(c, ob->ptr.dev); - - if (ca->mi.durability && - ca->mi.durability <= nr_ptrs_effective - nr_replicas) { - swap(wp->ptrs[i], wp->ptrs[wp->first_ptr]); - wp->first_ptr++; - nr_ptrs_effective -= ca->mi.durability; - } - } - } - - /* Remove pointers we don't want to use: */ - if (target) - writepoint_drop_ptrs(c, wp, target, false); - - BUG_ON(wp->first_ptr >= wp->nr_ptrs); - BUG_ON(nr_ptrs_effective < nr_replicas_required); - - wp->sectors_free = UINT_MAX; - - writepoint_for_each_ptr(wp, ob, i) - wp->sectors_free = min(wp->sectors_free, ob->sectors_free); - - BUG_ON(!wp->sectors_free || wp->sectors_free == UINT_MAX); - - verify_not_stale(c, wp); - - return wp; -err: - mutex_unlock(&wp->lock); - return ERR_PTR(ret); -} - -/* - * Append pointers to the space we just allocated to @k, and mark @sectors space - * as allocated out of @ob - */ -void bch2_alloc_sectors_append_ptrs(struct bch_fs *c, struct write_point *wp, - struct bkey_i_extent *e, unsigned sectors) -{ - struct open_bucket *ob; - unsigned i; - - BUG_ON(sectors > wp->sectors_free); - wp->sectors_free -= sectors; - - writepoint_for_each_ptr(wp, ob, i) { - struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev); - struct bch_extent_ptr tmp = ob->ptr; - - EBUG_ON(bch2_extent_has_device(extent_i_to_s_c(e), ob->ptr.dev)); - - tmp.cached = bkey_extent_is_cached(&e->k) || - (!ca->mi.durability && wp->type == BCH_DATA_USER); - - tmp.offset += ca->mi.bucket_size - ob->sectors_free; - extent_ptr_append(e, tmp); - - BUG_ON(sectors > ob->sectors_free); - ob->sectors_free -= sectors; - } -} - -/* - * Append pointers to the space 
we just allocated to @k, and mark @sectors space - * as allocated out of @ob - */ -void bch2_alloc_sectors_done(struct bch_fs *c, struct write_point *wp) -{ - int i; - - for (i = wp->nr_ptrs - 1; i >= 0; --i) { - struct open_bucket *ob = wp->ptrs[i]; - - if (!ob->sectors_free) { - array_remove_item(wp->ptrs, wp->nr_ptrs, i); - bch2_open_bucket_put(c, ob); - } - } - - mutex_unlock(&wp->lock); -} - -/* Startup/shutdown (ro/rw): */ - -void bch2_recalc_capacity(struct bch_fs *c) -{ - struct bch_dev *ca; - u64 total_capacity, capacity = 0, reserved_sectors = 0; - unsigned long ra_pages = 0; - unsigned i, j; - - lockdep_assert_held(&c->state_lock); - - for_each_online_member(ca, c, i) { - struct backing_dev_info *bdi = ca->disk_sb.bdev->bd_bdi; - - ra_pages += bdi->ra_pages; - } - - bch2_set_ra_pages(c, ra_pages); - - for_each_rw_member(ca, c, i) { - size_t reserve = 0; - - /* - * We need to reserve buckets (from the number - * of currently available buckets) against - * foreground writes so that mainly copygc can - * make forward progress. - * - * We need enough to refill the various reserves - * from scratch - copygc will use its entire - * reserve all at once, then run against when - * its reserve is refilled (from the formerly - * available buckets). - * - * This reserve is just used when considering if - * allocations for foreground writes must wait - - * not -ENOSPC calculations. - */ - for (j = 0; j < RESERVE_NONE; j++) - reserve += ca->free[j].size; - - reserve += ca->free_inc.size; - - reserve += ARRAY_SIZE(c->write_points); - - reserve += 1; /* btree write point */ - - reserved_sectors += bucket_to_sector(ca, reserve); - - capacity += bucket_to_sector(ca, ca->mi.nbuckets - - ca->mi.first_bucket); - } - - total_capacity = capacity; - - capacity *= (100 - c->opts.gc_reserve_percent); - capacity = div64_u64(capacity, 100); - - BUG_ON(reserved_sectors > total_capacity); - - capacity = min(capacity, total_capacity - reserved_sectors); - - c->capacity = capacity; - - if (c->capacity) { - bch2_io_timer_add(&c->io_clock[READ], - &c->bucket_clock[READ].rescale); - bch2_io_timer_add(&c->io_clock[WRITE], - &c->bucket_clock[WRITE].rescale); - } else { - bch2_io_timer_del(&c->io_clock[READ], - &c->bucket_clock[READ].rescale); - bch2_io_timer_del(&c->io_clock[WRITE], - &c->bucket_clock[WRITE].rescale); - } - - /* Wake up case someone was waiting for buckets */ - closure_wake_up(&c->freelist_wait); -} - -static void bch2_stop_write_point(struct bch_fs *c, struct bch_dev *ca, - struct write_point *wp) -{ - struct bch_devs_mask not_self; - - bitmap_complement(not_self.d, ca->self.d, BCH_SB_MEMBERS_MAX); - - mutex_lock(&wp->lock); - wp->first_ptr = wp->nr_ptrs; - writepoint_drop_ptrs(c, wp, dev_to_target(ca->dev_idx), true); - mutex_unlock(&wp->lock); -} - -static bool bch2_dev_has_open_write_point(struct bch_fs *c, struct bch_dev *ca) -{ - struct open_bucket *ob; - bool ret = false; - - for (ob = c->open_buckets; - ob < c->open_buckets + ARRAY_SIZE(c->open_buckets); - ob++) { - spin_lock(&ob->lock); - if (ob->valid && !ob->on_partial_list && - ob->ptr.dev == ca->dev_idx) - ret = true; - spin_unlock(&ob->lock); - } - - return ret; -} - -/* device goes ro: */ -void bch2_dev_allocator_remove(struct bch_fs *c, struct bch_dev *ca) -{ - unsigned i; - - BUG_ON(ca->alloc_thread); - - /* First, remove device from allocation groups: */ - - for (i = 0; i < ARRAY_SIZE(c->rw_devs); i++) - clear_bit(ca->dev_idx, c->rw_devs[i].d); - - /* - * Capacity is calculated based off of devices in allocation groups: - */ - 
bch2_recalc_capacity(c); - - /* Next, close write points that point to this device... */ - for (i = 0; i < ARRAY_SIZE(c->write_points); i++) - bch2_stop_write_point(c, ca, &c->write_points[i]); - - bch2_stop_write_point(c, ca, &ca->copygc_write_point); - bch2_stop_write_point(c, ca, &c->rebalance_write_point); - bch2_stop_write_point(c, ca, &c->btree_write_point); - - mutex_lock(&c->btree_reserve_cache_lock); - while (c->btree_reserve_cache_nr) { - struct btree_alloc *a = - &c->btree_reserve_cache[--c->btree_reserve_cache_nr]; - - bch2_open_bucket_put_refs(c, &a->ob.nr, a->ob.refs); - } - mutex_unlock(&c->btree_reserve_cache_lock); - - /* - * Wake up threads that were blocked on allocation, so they can notice - * the device can no longer be removed and the capacity has changed: - */ - closure_wake_up(&c->freelist_wait); - - /* - * journal_res_get() can block waiting for free space in the journal - - * it needs to notice there may not be devices to allocate from anymore: - */ - wake_up(&c->journal.wait); - - /* Now wait for any in flight writes: */ - - closure_wait_event(&c->open_buckets_wait, - !bch2_dev_has_open_write_point(c, ca)); -} - -/* device goes rw: */ -void bch2_dev_allocator_add(struct bch_fs *c, struct bch_dev *ca) -{ - unsigned i; - - for (i = 0; i < ARRAY_SIZE(c->rw_devs); i++) - if (ca->mi.data_allowed & (1 << i)) - set_bit(ca->dev_idx, c->rw_devs[i].d); -} - -/* stop allocator thread: */ -void bch2_dev_allocator_stop(struct bch_dev *ca) -{ - struct task_struct *p; - - p = rcu_dereference_protected(ca->alloc_thread, 1); - ca->alloc_thread = NULL; - - /* - * We need an rcu barrier between setting ca->alloc_thread = NULL and - * the thread shutting down to avoid bch2_wake_allocator() racing: - * - * XXX: it would be better to have the rcu barrier be asynchronous - * instead of blocking us here - */ - synchronize_rcu(); - - if (p) { - kthread_stop(p); - put_task_struct(p); - } -} - -/* start allocator thread: */ -int bch2_dev_allocator_start(struct bch_dev *ca) -{ - struct task_struct *p; - - /* - * allocator thread already started? 
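The start/stop pair deleted here (and re-added in alloc_background.c below) follows a standard kernel pattern: the thread pointer is published with rcu_assign_pointer() so that bch2_wake_allocator() can read it locklessly, and teardown has to clear the pointer and wait out an RCU grace period before calling kthread_stop(). A distilled sketch of just that pattern - my_dev and the my_dev_* helpers are invented names for illustration, not bcachefs API:

#include <linux/kthread.h>
#include <linux/rcupdate.h>
#include <linux/sched/task.h>

struct my_dev {
	struct task_struct __rcu *thread;
};

/* Lockless waker: safe because my_dev_stop() waits for a grace period. */
static void my_dev_wake(struct my_dev *d)
{
	struct task_struct *p;

	rcu_read_lock();
	p = rcu_dereference(d->thread);
	if (p)
		wake_up_process(p);
	rcu_read_unlock();
}

static int my_dev_start(struct my_dev *d, int (*fn)(void *))
{
	struct task_struct *p = kthread_create(fn, d, "my_dev_thread");

	if (IS_ERR(p))
		return PTR_ERR(p);

	get_task_struct(p);			/* keep p valid across stop() */
	rcu_assign_pointer(d->thread, p);
	wake_up_process(p);
	return 0;
}

static void my_dev_stop(struct my_dev *d)
{
	struct task_struct *p = rcu_dereference_protected(d->thread, 1);

	RCU_INIT_POINTER(d->thread, NULL);
	synchronize_rcu();		/* no waker still holds the old pointer */

	if (p) {
		kthread_stop(p);
		put_task_struct(p);
	}
}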
- */ - if (ca->alloc_thread) - return 0; - - p = kthread_create(bch2_allocator_thread, ca, - "bch_alloc[%s]", ca->name); - if (IS_ERR(p)) - return PTR_ERR(p); - - get_task_struct(p); - rcu_assign_pointer(ca->alloc_thread, p); - wake_up_process(p); - return 0; -} - -static void allocator_start_issue_discards(struct bch_fs *c) -{ - struct bch_dev *ca; - unsigned dev_iter; - size_t i, bu; - - for_each_rw_member(ca, c, dev_iter) { - unsigned done = 0; - - fifo_for_each_entry(bu, &ca->free_inc, i) { - if (done == ca->nr_invalidated) - break; - - blkdev_issue_discard(ca->disk_sb.bdev, - bucket_to_sector(ca, bu), - ca->mi.bucket_size, GFP_NOIO, 0); - done++; - } - } -} - -static int __bch2_fs_allocator_start(struct bch_fs *c) -{ - struct bch_dev *ca; - size_t bu, i; - unsigned dev_iter; - u64 journal_seq = 0; - bool invalidating_data = false; - int ret = 0; - - if (test_bit(BCH_FS_GC_FAILURE, &c->flags)) - return -1; - - /* Scan for buckets that are already invalidated: */ - for_each_rw_member(ca, c, dev_iter) { - struct btree_iter iter; - struct bucket_mark m; - struct bkey_s_c k; - - for_each_btree_key(&iter, c, BTREE_ID_ALLOC, POS(ca->dev_idx, 0), 0, k) { - if (k.k->type != BCH_ALLOC) - continue; - - bu = k.k->p.offset; - m = READ_ONCE(bucket(ca, bu)->mark); - - if (!is_available_bucket(m) || m.cached_sectors) - continue; - - percpu_down_read_preempt_disable(&c->usage_lock); - bch2_mark_alloc_bucket(c, ca, bu, true, - gc_pos_alloc(c, NULL), - BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE| - BCH_BUCKET_MARK_GC_LOCK_HELD); - percpu_up_read_preempt_enable(&c->usage_lock); - - fifo_push(&ca->free_inc, bu); - ca->nr_invalidated++; - - if (fifo_full(&ca->free_inc)) - break; - } - bch2_btree_iter_unlock(&iter); - } - - /* did we find enough buckets? */ - for_each_rw_member(ca, c, dev_iter) - if (fifo_used(&ca->free_inc) < ca->free[RESERVE_BTREE].size) { - percpu_ref_put(&ca->io_ref); - goto not_enough; - } - - return 0; -not_enough: - pr_debug("did not find enough empty buckets; issuing discards"); - - /* clear out free_inc - find_reclaimable_buckets() assumes it's empty */ - for_each_rw_member(ca, c, dev_iter) - discard_invalidated_buckets(c, ca); - - pr_debug("scanning for reclaimable buckets"); - - for_each_rw_member(ca, c, dev_iter) { - BUG_ON(!fifo_empty(&ca->free_inc)); - ca->free_inc.front = ca->free_inc.back = 0; - - find_reclaimable_buckets(c, ca); - sort_free_inc(c, ca); - - invalidating_data |= ca->allocator_invalidating_data; - - fifo_for_each_entry(bu, &ca->free_inc, i) - if (!fifo_push(&ca->free[RESERVE_BTREE], bu)) - break; - } - - pr_debug("done scanning for reclaimable buckets"); - - /* - * We're moving buckets to freelists _before_ they've been marked as - * invalidated on disk - we have to so that we can allocate new btree - * nodes to mark them as invalidated on disk. - * - * However, we can't _write_ to any of these buckets yet - they might - * have cached data in them, which is live until they're marked as - * invalidated on disk: - */ - if (invalidating_data) { - pr_debug("invalidating existing data"); - set_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags); - } else { - pr_debug("issuing discards"); - allocator_start_issue_discards(c); - } - - /* - * XXX: it's possible for this to deadlock waiting on journal reclaim, - * since we're holding btree writes. What then? 
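The free_inc and per-reserve freelists being filled in this startup path are simple ring-buffer FIFOs (fifo_push/fifo_pop/fifo_full/fifo_used). A minimal userspace model of those semantics, assuming a power-of-two size and single-threaded use - the in-tree fifo differs in detail:

#include <stdbool.h>
#include <stddef.h>

struct fifo {
	size_t front, back, size;	/* size must be a power of two */
	unsigned long *data;
};

/* front/back are free-running counters; only element access needs masking */
static size_t fifo_used(const struct fifo *f)	{ return f->back - f->front; }
static bool fifo_full(const struct fifo *f)	{ return fifo_used(f) == f->size; }
static bool fifo_empty(const struct fifo *f)	{ return f->back == f->front; }

static bool fifo_push(struct fifo *f, unsigned long v)
{
	if (fifo_full(f))
		return false;
	f->data[f->back++ & (f->size - 1)] = v;
	return true;
}

static bool fifo_pop(struct fifo *f, unsigned long *v)
{
	if (fifo_empty(f))
		return false;
	*v = f->data[f->front++ & (f->size - 1)];
	return true;
}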
- */ - - for_each_rw_member(ca, c, dev_iter) { - ret = bch2_invalidate_free_inc(c, ca, &journal_seq, - ca->free[RESERVE_BTREE].size); - if (ret) { - percpu_ref_put(&ca->io_ref); - return ret; - } - } - - if (invalidating_data) { - pr_debug("flushing journal"); - - ret = bch2_journal_flush_seq(&c->journal, journal_seq); - if (ret) - return ret; - - pr_debug("issuing discards"); - allocator_start_issue_discards(c); - } - - for_each_rw_member(ca, c, dev_iter) - while (ca->nr_invalidated) { - BUG_ON(!fifo_pop(&ca->free_inc, bu)); - ca->nr_invalidated--; - } - - set_bit(BCH_FS_ALLOCATOR_STARTED, &c->flags); - - /* now flush dirty btree nodes: */ - if (invalidating_data) { - struct bucket_table *tbl; - struct rhash_head *pos; - struct btree *b; - bool flush_updates; - size_t nr_pending_updates; - - clear_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags); -again: - pr_debug("flushing dirty btree nodes"); - cond_resched(); - - flush_updates = false; - nr_pending_updates = bch2_btree_interior_updates_nr_pending(c); - - - rcu_read_lock(); - for_each_cached_btree(b, c, tbl, i, pos) - if (btree_node_dirty(b) && (!b->written || b->level)) { - if (btree_node_may_write(b)) { - rcu_read_unlock(); - btree_node_lock_type(c, b, SIX_LOCK_read); - bch2_btree_node_write(c, b, SIX_LOCK_read); - six_unlock_read(&b->lock); - goto again; - } else { - flush_updates = true; - } - } - rcu_read_unlock(); - - /* - * This is ugly, but it's needed to flush btree node writes - * without spinning... - */ - if (flush_updates) { - closure_wait_event(&c->btree_interior_update_wait, - bch2_btree_interior_updates_nr_pending(c) < - nr_pending_updates); - goto again; - } - } - - return 0; -} - -int bch2_fs_allocator_start(struct bch_fs *c) -{ - struct bch_dev *ca; - unsigned i; - int ret; - - down_read(&c->gc_lock); - ret = __bch2_fs_allocator_start(c); - up_read(&c->gc_lock); - - if (ret) - return ret; - - for_each_rw_member(ca, c, i) { - ret = bch2_dev_allocator_start(ca); - if (ret) { - percpu_ref_put(&ca->io_ref); - return ret; - } - } - - return bch2_alloc_write(c); -} - -void bch2_fs_allocator_init(struct bch_fs *c) -{ - struct open_bucket *ob; - struct write_point *wp; - - mutex_init(&c->write_points_hash_lock); - spin_lock_init(&c->freelist_lock); - bch2_bucket_clock_init(c, READ); - bch2_bucket_clock_init(c, WRITE); - - /* open bucket 0 is a sentinal NULL: */ - spin_lock_init(&c->open_buckets[0].lock); - - for (ob = c->open_buckets + 1; - ob < c->open_buckets + ARRAY_SIZE(c->open_buckets); ob++) { - spin_lock_init(&ob->lock); - c->open_buckets_nr_free++; - - ob->freelist = c->open_buckets_freelist; - c->open_buckets_freelist = ob - c->open_buckets; - } - - writepoint_init(&c->btree_write_point, BCH_DATA_BTREE); - writepoint_init(&c->rebalance_write_point, BCH_DATA_USER); - - for (wp = c->write_points; - wp < c->write_points + ARRAY_SIZE(c->write_points); wp++) { - writepoint_init(wp, BCH_DATA_USER); - - wp->last_used = sched_clock(); - wp->write_point = (unsigned long) wp; - hlist_add_head_rcu(&wp->node, writepoint_hash(c, wp->write_point)); - } - - c->pd_controllers_update_seconds = 5; - INIT_DELAYED_WORK(&c->pd_controllers_update, pd_controllers_update); -} diff --git a/fs/bcachefs/alloc.h b/fs/bcachefs/alloc.h deleted file mode 100644 index 00d01f464c68..000000000000 --- a/fs/bcachefs/alloc.h +++ /dev/null @@ -1,141 +0,0 @@ -#ifndef _BCACHEFS_ALLOC_H -#define _BCACHEFS_ALLOC_H - -#include "bcachefs.h" -#include "alloc_types.h" - -struct bkey; -struct bch_dev; -struct bch_fs; -struct bch_devs_List; - -const char 
*bch2_alloc_invalid(const struct bch_fs *, struct bkey_s_c); -void bch2_alloc_to_text(struct bch_fs *, char *, size_t, struct bkey_s_c); - -#define bch2_bkey_alloc_ops (struct bkey_ops) { \ - .key_invalid = bch2_alloc_invalid, \ - .val_to_text = bch2_alloc_to_text, \ -} - -struct dev_alloc_list { - unsigned nr; - u8 devs[BCH_SB_MEMBERS_MAX]; -}; - -struct dev_alloc_list bch2_wp_alloc_list(struct bch_fs *, - struct write_point *, - struct bch_devs_mask *); -void bch2_wp_rescale(struct bch_fs *, struct bch_dev *, - struct write_point *); - -int bch2_alloc_read(struct bch_fs *, struct list_head *); -int bch2_alloc_replay_key(struct bch_fs *, struct bpos); - -enum bucket_alloc_ret { - ALLOC_SUCCESS = 0, - OPEN_BUCKETS_EMPTY = -1, - FREELIST_EMPTY = -2, /* Allocator thread not keeping up */ - NO_DEVICES = -3, /* -EROFS */ -}; - -long bch2_bucket_alloc_new_fs(struct bch_dev *); - -int bch2_bucket_alloc(struct bch_fs *, struct bch_dev *, enum alloc_reserve, bool, - struct closure *); - -#define __writepoint_for_each_ptr(_wp, _ob, _i, _start) \ - for ((_i) = (_start); \ - (_i) < (_wp)->nr_ptrs && ((_ob) = (_wp)->ptrs[_i], true); \ - (_i)++) - -#define writepoint_for_each_ptr_all(_wp, _ob, _i) \ - __writepoint_for_each_ptr(_wp, _ob, _i, 0) - -#define writepoint_for_each_ptr(_wp, _ob, _i) \ - __writepoint_for_each_ptr(_wp, _ob, _i, wp->first_ptr) - -void __bch2_open_bucket_put(struct bch_fs *, struct open_bucket *); - -static inline void bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob) -{ - if (atomic_dec_and_test(&ob->pin)) - __bch2_open_bucket_put(c, ob); -} - -static inline void bch2_open_bucket_put_refs(struct bch_fs *c, u8 *nr, u8 *refs) -{ - unsigned i; - - for (i = 0; i < *nr; i++) - bch2_open_bucket_put(c, c->open_buckets + refs[i]); - - *nr = 0; -} - -static inline void bch2_open_bucket_get(struct bch_fs *c, - struct write_point *wp, - u8 *nr, u8 *refs) -{ - struct open_bucket *ob; - unsigned i; - - writepoint_for_each_ptr(wp, ob, i) { - atomic_inc(&ob->pin); - refs[(*nr)++] = ob - c->open_buckets; - } -} - -struct write_point *bch2_alloc_sectors_start(struct bch_fs *, - unsigned, - struct write_point_specifier, - struct bch_devs_list *, - unsigned, unsigned, - enum alloc_reserve, - unsigned, - struct closure *); - -void bch2_alloc_sectors_append_ptrs(struct bch_fs *, struct write_point *, - struct bkey_i_extent *, unsigned); -void bch2_alloc_sectors_done(struct bch_fs *, struct write_point *); - -static inline void bch2_wake_allocator(struct bch_dev *ca) -{ - struct task_struct *p; - - rcu_read_lock(); - p = rcu_dereference(ca->alloc_thread); - if (p) - wake_up_process(p); - rcu_read_unlock(); -} - -static inline struct write_point_specifier writepoint_hashed(unsigned long v) -{ - return (struct write_point_specifier) { .v = v | 1 }; -} - -static inline struct write_point_specifier writepoint_ptr(struct write_point *wp) -{ - return (struct write_point_specifier) { .v = (unsigned long) wp }; -} - -void bch2_recalc_capacity(struct bch_fs *); - -void bch2_dev_allocator_remove(struct bch_fs *, struct bch_dev *); -void bch2_dev_allocator_add(struct bch_fs *, struct bch_dev *); - -void bch2_dev_allocator_stop(struct bch_dev *); -int bch2_dev_allocator_start(struct bch_dev *); - -static inline void writepoint_init(struct write_point *wp, - enum bch_data_type type) -{ - mutex_init(&wp->lock); - wp->type = type; -} - -int bch2_alloc_write(struct bch_fs *); -int bch2_fs_allocator_start(struct bch_fs *); -void bch2_fs_allocator_init(struct bch_fs *); - -#endif /* _BCACHEFS_ALLOC_H */ 
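Worth noting from the header removed above: open-bucket lifetime is a plain refcount. The put is atomic_dec_and_test() plus a free on the last reference, and a get is a bare atomic_inc() taken while the write point lock guarantees the bucket is still live. A toy version of that discipline - ob_free() is a stand-in for __bch2_open_bucket_put():

#include <linux/atomic.h>

struct open_bucket_toy {
	atomic_t pin;
};

static void ob_free(struct open_bucket_toy *ob)
{
	/* return the bucket to the freelist, wake any waiters, etc. */
}

/* Caller must already hold a reference (or a lock keeping ob live). */
static void ob_get(struct open_bucket_toy *ob)
{
	atomic_inc(&ob->pin);
}

static void ob_put(struct open_bucket_toy *ob)
{
	if (atomic_dec_and_test(&ob->pin))
		ob_free(ob);
}

The important rule, restated at the top of alloc_foreground.c below, is that the put happens only after the index update that makes the allocation reachable.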
diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c new file mode 100644 index 000000000000..43dc2f270dc6 --- /dev/null +++ b/fs/bcachefs/alloc_background.c @@ -0,0 +1,1690 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "bcachefs.h" +#include "alloc_background.h" +#include "alloc_foreground.h" +#include "btree_cache.h" +#include "btree_io.h" +#include "btree_update.h" +#include "btree_update_interior.h" +#include "btree_gc.h" +#include "buckets.h" +#include "clock.h" +#include "debug.h" +#include "ec.h" +#include "error.h" +#include "recovery.h" + +#include <linux/kthread.h> +#include <linux/math64.h> +#include <linux/random.h> +#include <linux/rculist.h> +#include <linux/rcupdate.h> +#include <linux/sched/task.h> +#include <linux/sort.h> +#include <trace/events/bcachefs.h> + +static const char * const bch2_alloc_field_names[] = { +#define x(name, bytes) #name, + BCH_ALLOC_FIELDS() +#undef x + NULL +}; + +static void bch2_recalc_oldest_io(struct bch_fs *, struct bch_dev *, int); + +/* Ratelimiting/PD controllers */ + +static void pd_controllers_update(struct work_struct *work) +{ + struct bch_fs *c = container_of(to_delayed_work(work), + struct bch_fs, + pd_controllers_update); + struct bch_dev *ca; + unsigned i; + + for_each_member_device(ca, c, i) { + struct bch_dev_usage stats = bch2_dev_usage_read(c, ca); + + u64 free = bucket_to_sector(ca, + __dev_buckets_free(ca, stats)) << 9; + /* + * Bytes of internal fragmentation, which can be + * reclaimed by copy GC + */ + s64 fragmented = (bucket_to_sector(ca, + stats.buckets[BCH_DATA_USER] + + stats.buckets[BCH_DATA_CACHED]) - + (stats.sectors[BCH_DATA_USER] + + stats.sectors[BCH_DATA_CACHED])) << 9; + + fragmented = max(0LL, fragmented); + + bch2_pd_controller_update(&ca->copygc_pd, + free, fragmented, -1); + } + + schedule_delayed_work(&c->pd_controllers_update, + c->pd_controllers_update_seconds * HZ); +} + +/* Persistent alloc info: */ + +static inline u64 get_alloc_field(const struct bch_alloc *a, + const void **p, unsigned field) +{ + unsigned bytes = BCH_ALLOC_FIELD_BYTES[field]; + u64 v; + + if (!(a->fields & (1 << field))) + return 0; + + switch (bytes) { + case 1: + v = *((const u8 *) *p); + break; + case 2: + v = le16_to_cpup(*p); + break; + case 4: + v = le32_to_cpup(*p); + break; + case 8: + v = le64_to_cpup(*p); + break; + default: + BUG(); + } + + *p += bytes; + return v; +} + +static inline void put_alloc_field(struct bkey_i_alloc *a, void **p, + unsigned field, u64 v) +{ + unsigned bytes = BCH_ALLOC_FIELD_BYTES[field]; + + if (!v) + return; + + a->v.fields |= 1 << field; + + switch (bytes) { + case 1: + *((u8 *) *p) = v; + break; + case 2: + *((__le16 *) *p) = cpu_to_le16(v); + break; + case 4: + *((__le32 *) *p) = cpu_to_le32(v); + break; + case 8: + *((__le64 *) *p) = cpu_to_le64(v); + break; + default: + BUG(); + } + + *p += bytes; +} + +struct bkey_alloc_unpacked bch2_alloc_unpack(struct bkey_s_c k) +{ + struct bkey_alloc_unpacked ret = { .gen = 0 }; + + if (k.k->type == KEY_TYPE_alloc) { + const struct bch_alloc *a = bkey_s_c_to_alloc(k).v; + const void *d = a->data; + unsigned idx = 0; + + ret.gen = a->gen; + +#define x(_name, _bits) ret._name = get_alloc_field(a, &d, idx++); + BCH_ALLOC_FIELDS() +#undef x + } + return ret; +} + +void bch2_alloc_pack(struct bkey_i_alloc *dst, + const struct bkey_alloc_unpacked src) +{ + unsigned idx = 0; + void *d = dst->v.data; + + dst->v.fields = 0; + dst->v.gen = src.gen; + +#define x(_name, _bits) put_alloc_field(dst, &d, idx++, src._name); + 
BCH_ALLOC_FIELDS() +#undef x + + set_bkey_val_bytes(&dst->k, (void *) d - (void *) &dst->v); +} + +static unsigned bch_alloc_val_u64s(const struct bch_alloc *a) +{ + unsigned i, bytes = offsetof(struct bch_alloc, data); + + for (i = 0; i < ARRAY_SIZE(BCH_ALLOC_FIELD_BYTES); i++) + if (a->fields & (1 << i)) + bytes += BCH_ALLOC_FIELD_BYTES[i]; + + return DIV_ROUND_UP(bytes, sizeof(u64)); +} + +const char *bch2_alloc_invalid(const struct bch_fs *c, struct bkey_s_c k) +{ + struct bkey_s_c_alloc a = bkey_s_c_to_alloc(k); + + if (k.k->p.inode >= c->sb.nr_devices || + !c->devs[k.k->p.inode]) + return "invalid device"; + + /* allow for unknown fields */ + if (bkey_val_u64s(a.k) < bch_alloc_val_u64s(a.v)) + return "incorrect value size"; + + return NULL; +} + +void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, + struct bkey_s_c k) +{ + struct bkey_s_c_alloc a = bkey_s_c_to_alloc(k); + const void *d = a.v->data; + unsigned i; + + pr_buf(out, "gen %u", a.v->gen); + + for (i = 0; i < BCH_ALLOC_FIELD_NR; i++) + if (a.v->fields & (1 << i)) + pr_buf(out, " %s %llu", + bch2_alloc_field_names[i], + get_alloc_field(a.v, &d, i)); +} + +static inline struct bkey_alloc_unpacked +alloc_mem_to_key(struct bucket *g, struct bucket_mark m) +{ + return (struct bkey_alloc_unpacked) { + .gen = m.gen, + .oldest_gen = g->oldest_gen, + .data_type = m.data_type, + .dirty_sectors = m.dirty_sectors, + .cached_sectors = m.cached_sectors, + .read_time = g->io_time[READ], + .write_time = g->io_time[WRITE], + }; +} + +int bch2_alloc_read(struct bch_fs *c, struct journal_keys *journal_keys) +{ + struct btree_trans trans; + struct btree_iter *iter; + struct bkey_s_c k; + struct bch_dev *ca; + struct journal_key *j; + unsigned i; + int ret; + + bch2_trans_init(&trans, c, 0, 0); + + for_each_btree_key(&trans, iter, BTREE_ID_ALLOC, POS_MIN, 0, k, ret) + bch2_mark_key(c, k, 0, NULL, 0, + BCH_BUCKET_MARK_ALLOC_READ| + BCH_BUCKET_MARK_NOATOMIC); + + ret = bch2_trans_exit(&trans) ?: ret; + if (ret) { + bch_err(c, "error reading alloc info: %i", ret); + return ret; + } + + for_each_journal_key(*journal_keys, j) + if (j->btree_id == BTREE_ID_ALLOC) + bch2_mark_key(c, bkey_i_to_s_c(j->k), 0, NULL, 0, + BCH_BUCKET_MARK_ALLOC_READ| + BCH_BUCKET_MARK_NOATOMIC); + + percpu_down_write(&c->mark_lock); + bch2_dev_usage_from_buckets(c); + percpu_up_write(&c->mark_lock); + + mutex_lock(&c->bucket_clock[READ].lock); + for_each_member_device(ca, c, i) { + down_read(&ca->bucket_lock); + bch2_recalc_oldest_io(c, ca, READ); + up_read(&ca->bucket_lock); + } + mutex_unlock(&c->bucket_clock[READ].lock); + + mutex_lock(&c->bucket_clock[WRITE].lock); + for_each_member_device(ca, c, i) { + down_read(&ca->bucket_lock); + bch2_recalc_oldest_io(c, ca, WRITE); + up_read(&ca->bucket_lock); + } + mutex_unlock(&c->bucket_clock[WRITE].lock); + + return 0; +} + +int bch2_alloc_replay_key(struct bch_fs *c, struct bkey_i *k) +{ + struct btree_trans trans; + struct btree_iter *iter; + struct bch_dev *ca; + int ret; + + if (k->k.p.inode >= c->sb.nr_devices || + !c->devs[k->k.p.inode]) + return 0; + + ca = bch_dev_bkey_exists(c, k->k.p.inode); + + if (k->k.p.offset >= ca->mi.nbuckets) + return 0; + + bch2_trans_init(&trans, c, 0, 0); + + iter = bch2_trans_get_iter(&trans, BTREE_ID_ALLOC, k->k.p, + BTREE_ITER_INTENT); + + ret = bch2_btree_iter_traverse(iter); + if (ret) + goto err; + + /* check buckets_written with btree node locked: */ + if (test_bit(k->k.p.offset, ca->buckets_written)) { + ret = 0; + goto err; + } + + bch2_trans_update(&trans, 
BTREE_INSERT_ENTRY(iter, k)); + + ret = bch2_trans_commit(&trans, NULL, NULL, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_LAZY_RW| + BTREE_INSERT_JOURNAL_REPLAY| + BTREE_INSERT_NOMARK); +err: + bch2_trans_exit(&trans); + return ret; +} + +int bch2_alloc_write(struct bch_fs *c, unsigned flags, bool *wrote) +{ + struct btree_trans trans; + struct btree_iter *iter; + struct bucket_array *buckets; + struct bch_dev *ca; + struct bucket *g; + struct bucket_mark m, new; + struct bkey_alloc_unpacked old_u, new_u; + __BKEY_PADDED(k, 8) alloc_key; /* hack: */ + struct bkey_i_alloc *a; + struct bkey_s_c k; + unsigned i; + size_t b; + int ret = 0; + + BUG_ON(BKEY_ALLOC_VAL_U64s_MAX > 8); + + bch2_trans_init(&trans, c, 0, 0); + + iter = bch2_trans_get_iter(&trans, BTREE_ID_ALLOC, POS_MIN, + BTREE_ITER_SLOTS|BTREE_ITER_INTENT); + + for_each_rw_member(ca, c, i) { + down_read(&ca->bucket_lock); +restart: + buckets = bucket_array(ca); + + for (b = buckets->first_bucket; + b < buckets->nbuckets; + b++) { + if (!buckets->b[b].mark.dirty) + continue; + + bch2_btree_iter_set_pos(iter, POS(i, b)); + k = bch2_btree_iter_peek_slot(iter); + ret = bkey_err(k); + if (ret) + goto err; + + old_u = bch2_alloc_unpack(k); + + percpu_down_read(&c->mark_lock); + g = bucket(ca, b); + m = READ_ONCE(g->mark); + new_u = alloc_mem_to_key(g, m); + percpu_up_read(&c->mark_lock); + + if (!m.dirty) + continue; + + if ((flags & BTREE_INSERT_LAZY_RW) && + percpu_ref_is_zero(&c->writes)) { + up_read(&ca->bucket_lock); + bch2_trans_unlock(&trans); + + ret = bch2_fs_read_write_early(c); + down_read(&ca->bucket_lock); + + if (ret) + goto err; + goto restart; + } + + a = bkey_alloc_init(&alloc_key.k); + a->k.p = iter->pos; + bch2_alloc_pack(a, new_u); + + bch2_trans_update(&trans, BTREE_INSERT_ENTRY(iter, &a->k_i)); + ret = bch2_trans_commit(&trans, NULL, NULL, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_NOMARK| + flags); +err: + if (ret && !test_bit(BCH_FS_EMERGENCY_RO, &c->flags)) { + bch_err(c, "error %i writing alloc info", ret); + printk(KERN_CONT "dev %llu bucket %llu\n", + iter->pos.inode, iter->pos.offset); + printk(KERN_CONT "gen %u -> %u\n", old_u.gen, new_u.gen); +#define x(_name, _bits) printk(KERN_CONT #_name " %u -> %u\n", old_u._name, new_u._name); + BCH_ALLOC_FIELDS() +#undef x + } + if (ret) + break; + + new = m; + new.dirty = false; + atomic64_cmpxchg(&g->_mark.v, m.v.counter, new.v.counter); + + if (ca->buckets_written) + set_bit(b, ca->buckets_written); + + bch2_trans_cond_resched(&trans); + *wrote = true; + } + up_read(&ca->bucket_lock); + + if (ret) { + percpu_ref_put(&ca->io_ref); + break; + } + } + + bch2_trans_exit(&trans); + + return ret; +} + +/* Bucket IO clocks: */ + +static void bch2_recalc_oldest_io(struct bch_fs *c, struct bch_dev *ca, int rw) +{ + struct bucket_clock *clock = &c->bucket_clock[rw]; + struct bucket_array *buckets = bucket_array(ca); + struct bucket *g; + u16 max_last_io = 0; + unsigned i; + + lockdep_assert_held(&c->bucket_clock[rw].lock); + + /* Recalculate max_last_io for this device: */ + for_each_bucket(g, buckets) + max_last_io = max(max_last_io, bucket_last_io(c, g, rw)); + + ca->max_last_bucket_io[rw] = max_last_io; + + /* Recalculate global max_last_io: */ + max_last_io = 0; + + for_each_member_device(ca, c, i) + max_last_io = max(max_last_io, ca->max_last_bucket_io[rw]); + + clock->max_last_io = max_last_io; +} + +static void bch2_rescale_bucket_io_times(struct bch_fs *c, int rw) +{ + struct bucket_clock *clock = &c->bucket_clock[rw]; + struct bucket_array *buckets; + struct bch_dev *ca; + 
struct bucket *g;
+ unsigned i;
+
+ trace_rescale_prios(c);
+
+ for_each_member_device(ca, c, i) {
+ down_read(&ca->bucket_lock);
+ buckets = bucket_array(ca);
+
+ for_each_bucket(g, buckets)
+ g->io_time[rw] = clock->hand -
+ bucket_last_io(c, g, rw) / 2;
+
+ bch2_recalc_oldest_io(c, ca, rw);
+
+ up_read(&ca->bucket_lock);
+ }
+}
+
+static inline u64 bucket_clock_freq(u64 capacity)
+{
+ return max(capacity >> 10, 2028ULL);
+}
+
+static void bch2_inc_clock_hand(struct io_timer *timer)
+{
+ struct bucket_clock *clock = container_of(timer,
+ struct bucket_clock, rescale);
+ struct bch_fs *c = container_of(clock,
+ struct bch_fs, bucket_clock[clock->rw]);
+ struct bch_dev *ca;
+ u64 capacity;
+ unsigned i;
+
+ mutex_lock(&clock->lock);
+
+ /* if clock cannot be advanced more, rescale prio */
+ if (clock->max_last_io >= U16_MAX - 2)
+ bch2_rescale_bucket_io_times(c, clock->rw);
+
+ BUG_ON(clock->max_last_io >= U16_MAX - 2);
+
+ for_each_member_device(ca, c, i)
+ ca->max_last_bucket_io[clock->rw]++;
+ clock->max_last_io++;
+ clock->hand++;
+
+ mutex_unlock(&clock->lock);
+
+ capacity = READ_ONCE(c->capacity);
+
+ if (!capacity)
+ return;
+
+ /*
+ * We only advance the hand once roughly 0.1% of the filesystem capacity
+ * has been read or written to - that's what the timer expiry below
+ * implements.
+ *
+ * XXX: we shouldn't really be going off of the capacity of devices in
+ * RW mode (that will be 0 when we're RO, yet we can still service
+ * reads)
+ */
+ timer->expire += bucket_clock_freq(capacity);
+
+ bch2_io_timer_add(&c->io_clock[clock->rw], timer);
+}
+
+static void bch2_bucket_clock_init(struct bch_fs *c, int rw)
+{
+ struct bucket_clock *clock = &c->bucket_clock[rw];
+
+ clock->hand = 1;
+ clock->rw = rw;
+ clock->rescale.fn = bch2_inc_clock_hand;
+ clock->rescale.expire = bucket_clock_freq(c->capacity);
+ mutex_init(&clock->lock);
+}
+
+/* Background allocator thread: */
+
+/*
+ * Scans for buckets to be invalidated, invalidates them, rewrites prios/gens
+ * (marking them as invalidated on disk), then optionally issues discard
+ * commands to the newly free buckets, then puts them on the various freelists.
+ */
+
+#define BUCKET_GC_GEN_MAX 96U
+
+/**
+ * wait_buckets_available - wait on reclaimable buckets
+ *
+ * If there aren't enough available buckets to fill up free_inc, wait until
+ * there are.
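The rescale above exists because bucket ages are kept in 16 bits: once the oldest age approaches U16_MAX the clock hand can no longer advance, so every bucket's age is halved, which frees headroom while preserving relative order. A simplified model of the arithmetic, in uint16_t clock units (names are mine, not the in-tree helpers):

#include <stdint.h>

/* Age of a bucket in clock ticks: how far its last IO lags the hand. */
static uint16_t bucket_age(uint16_t hand, uint16_t io_time)
{
	return hand - io_time;		/* 16-bit wraparound is intended */
}

/* Rescale: halve every age, so the hand regains headroom below U16_MAX. */
static uint16_t rescaled_io_time(uint16_t hand, uint16_t io_time)
{
	return hand - bucket_age(hand, io_time) / 2;
}

Two buckets with ages 60000 and 30000 come out at 30000 and 15000: absolute ages shrink, relative coldness is preserved.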
+ */ +static int wait_buckets_available(struct bch_fs *c, struct bch_dev *ca) +{ + unsigned long gc_count = c->gc_count; + int ret = 0; + + ca->allocator_state = ALLOCATOR_BLOCKED; + closure_wake_up(&c->freelist_wait); + + while (1) { + set_current_state(TASK_INTERRUPTIBLE); + if (kthread_should_stop()) { + ret = 1; + break; + } + + if (gc_count != c->gc_count) + ca->inc_gen_really_needs_gc = 0; + + if ((ssize_t) (dev_buckets_available(c, ca) - + ca->inc_gen_really_needs_gc) >= + (ssize_t) fifo_free(&ca->free_inc)) + break; + + up_read(&c->gc_lock); + schedule(); + try_to_freeze(); + down_read(&c->gc_lock); + } + + __set_current_state(TASK_RUNNING); + ca->allocator_state = ALLOCATOR_RUNNING; + closure_wake_up(&c->freelist_wait); + + return ret; +} + +static bool bch2_can_invalidate_bucket(struct bch_dev *ca, + size_t bucket, + struct bucket_mark mark) +{ + u8 gc_gen; + + if (!is_available_bucket(mark)) + return false; + + if (ca->buckets_nouse && + test_bit(bucket, ca->buckets_nouse)) + return false; + + gc_gen = bucket_gc_gen(ca, bucket); + + if (gc_gen >= BUCKET_GC_GEN_MAX / 2) + ca->inc_gen_needs_gc++; + + if (gc_gen >= BUCKET_GC_GEN_MAX) + ca->inc_gen_really_needs_gc++; + + return gc_gen < BUCKET_GC_GEN_MAX; +} + +/* + * Determines what order we're going to reuse buckets, smallest bucket_key() + * first. + * + * + * - We take into account the read prio of the bucket, which gives us an + * indication of how hot the data is -- we scale the prio so that the prio + * farthest from the clock is worth 1/8th of the closest. + * + * - The number of sectors of cached data in the bucket, which gives us an + * indication of the cost in cache misses this eviction will cause. + * + * - If hotness * sectors used compares equal, we pick the bucket with the + * smallest bucket_gc_gen() - since incrementing the same bucket's generation + * number repeatedly forces us to run mark and sweep gc to avoid generation + * number wraparound. 
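To make the ordering concrete, here is the heuristic described above as a standalone toy with two invented sample buckets - a cold, empty, journal-clean bucket gets the smallest key and is reused first, while a hot bucket full of cached data sorts last. This mirrors, but is not, the in-tree helper:

#include <stdio.h>

static unsigned long toy_sort_key(unsigned last_io, unsigned max_last_io,
				  unsigned sectors_used,
				  unsigned needs_journal_commit, /* 0 or 1 */
				  unsigned gc_gen)
{
	/* scaled to [0, 8): 7 for just-touched data, 0 for the coldest;
	 * assumes max_last_io > 0 */
	unsigned long hotness =
		(unsigned long)(max_last_io - last_io) * 7 / max_last_io;
	unsigned long data_wantness = (hotness + 1) * sectors_used;

	return (data_wantness << 9) |
	       (needs_journal_commit << 8) |
	       (gc_gen / 16);
}

int main(void)
{
	/* cold empty bucket vs. hot bucket with 128 cached sectors */
	printf("cold empty: %lu\n", toy_sort_key(1000, 1000, 0, 0, 10));
	printf("hot cached: %lu\n", toy_sort_key(10, 1000, 128, 0, 10));
	return 0;
}

This prints 0 for the cold bucket and 458752 for the hot one, so the cold bucket is invalidated first.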
+ */ + +static unsigned long bucket_sort_key(struct bch_fs *c, struct bch_dev *ca, + size_t b, struct bucket_mark m) +{ + unsigned last_io = bucket_last_io(c, bucket(ca, b), READ); + unsigned max_last_io = ca->max_last_bucket_io[READ]; + + /* + * Time since last read, scaled to [0, 8) where larger value indicates + * more recently read data: + */ + unsigned long hotness = (max_last_io - last_io) * 7 / max_last_io; + + /* How much we want to keep the data in this bucket: */ + unsigned long data_wantness = + (hotness + 1) * bucket_sectors_used(m); + + unsigned long needs_journal_commit = + bucket_needs_journal_commit(m, c->journal.last_seq_ondisk); + + return (data_wantness << 9) | + (needs_journal_commit << 8) | + (bucket_gc_gen(ca, b) / 16); +} + +static inline int bucket_alloc_cmp(alloc_heap *h, + struct alloc_heap_entry l, + struct alloc_heap_entry r) +{ + return cmp_int(l.key, r.key) ?: + cmp_int(r.nr, l.nr) ?: + cmp_int(l.bucket, r.bucket); +} + +static inline int bucket_idx_cmp(const void *_l, const void *_r) +{ + const struct alloc_heap_entry *l = _l, *r = _r; + + return cmp_int(l->bucket, r->bucket); +} + +static void find_reclaimable_buckets_lru(struct bch_fs *c, struct bch_dev *ca) +{ + struct bucket_array *buckets; + struct alloc_heap_entry e = { 0 }; + size_t b, i, nr = 0; + + ca->alloc_heap.used = 0; + + mutex_lock(&c->bucket_clock[READ].lock); + down_read(&ca->bucket_lock); + + buckets = bucket_array(ca); + + bch2_recalc_oldest_io(c, ca, READ); + + /* + * Find buckets with lowest read priority, by building a maxheap sorted + * by read priority and repeatedly replacing the maximum element until + * all buckets have been visited. + */ + for (b = ca->mi.first_bucket; b < ca->mi.nbuckets; b++) { + struct bucket_mark m = READ_ONCE(buckets->b[b].mark); + unsigned long key = bucket_sort_key(c, ca, b, m); + + if (!bch2_can_invalidate_bucket(ca, b, m)) + continue; + + if (e.nr && e.bucket + e.nr == b && e.key == key) { + e.nr++; + } else { + if (e.nr) + heap_add_or_replace(&ca->alloc_heap, e, + -bucket_alloc_cmp, NULL); + + e = (struct alloc_heap_entry) { + .bucket = b, + .nr = 1, + .key = key, + }; + } + + cond_resched(); + } + + if (e.nr) + heap_add_or_replace(&ca->alloc_heap, e, + -bucket_alloc_cmp, NULL); + + for (i = 0; i < ca->alloc_heap.used; i++) + nr += ca->alloc_heap.data[i].nr; + + while (nr - ca->alloc_heap.data[0].nr >= ALLOC_SCAN_BATCH(ca)) { + nr -= ca->alloc_heap.data[0].nr; + heap_pop(&ca->alloc_heap, e, -bucket_alloc_cmp, NULL); + } + + up_read(&ca->bucket_lock); + mutex_unlock(&c->bucket_clock[READ].lock); +} + +static void find_reclaimable_buckets_fifo(struct bch_fs *c, struct bch_dev *ca) +{ + struct bucket_array *buckets = bucket_array(ca); + struct bucket_mark m; + size_t b, start; + + if (ca->fifo_last_bucket < ca->mi.first_bucket || + ca->fifo_last_bucket >= ca->mi.nbuckets) + ca->fifo_last_bucket = ca->mi.first_bucket; + + start = ca->fifo_last_bucket; + + do { + ca->fifo_last_bucket++; + if (ca->fifo_last_bucket == ca->mi.nbuckets) + ca->fifo_last_bucket = ca->mi.first_bucket; + + b = ca->fifo_last_bucket; + m = READ_ONCE(buckets->b[b].mark); + + if (bch2_can_invalidate_bucket(ca, b, m)) { + struct alloc_heap_entry e = { .bucket = b, .nr = 1, }; + + heap_add(&ca->alloc_heap, e, bucket_alloc_cmp, NULL); + if (heap_full(&ca->alloc_heap)) + break; + } + + cond_resched(); + } while (ca->fifo_last_bucket != start); +} + +static void find_reclaimable_buckets_random(struct bch_fs *c, struct bch_dev *ca) +{ + struct bucket_array *buckets = bucket_array(ca); + struct 
bucket_mark m; + size_t checked, i; + + for (checked = 0; + checked < ca->mi.nbuckets / 2; + checked++) { + size_t b = bch2_rand_range(ca->mi.nbuckets - + ca->mi.first_bucket) + + ca->mi.first_bucket; + + m = READ_ONCE(buckets->b[b].mark); + + if (bch2_can_invalidate_bucket(ca, b, m)) { + struct alloc_heap_entry e = { .bucket = b, .nr = 1, }; + + heap_add(&ca->alloc_heap, e, bucket_alloc_cmp, NULL); + if (heap_full(&ca->alloc_heap)) + break; + } + + cond_resched(); + } + + sort(ca->alloc_heap.data, + ca->alloc_heap.used, + sizeof(ca->alloc_heap.data[0]), + bucket_idx_cmp, NULL); + + /* remove duplicates: */ + for (i = 0; i + 1 < ca->alloc_heap.used; i++) + if (ca->alloc_heap.data[i].bucket == + ca->alloc_heap.data[i + 1].bucket) + ca->alloc_heap.data[i].nr = 0; +} + +static size_t find_reclaimable_buckets(struct bch_fs *c, struct bch_dev *ca) +{ + size_t i, nr = 0; + + ca->inc_gen_needs_gc = 0; + + switch (ca->mi.replacement) { + case CACHE_REPLACEMENT_LRU: + find_reclaimable_buckets_lru(c, ca); + break; + case CACHE_REPLACEMENT_FIFO: + find_reclaimable_buckets_fifo(c, ca); + break; + case CACHE_REPLACEMENT_RANDOM: + find_reclaimable_buckets_random(c, ca); + break; + } + + heap_resort(&ca->alloc_heap, bucket_alloc_cmp, NULL); + + for (i = 0; i < ca->alloc_heap.used; i++) + nr += ca->alloc_heap.data[i].nr; + + return nr; +} + +static inline long next_alloc_bucket(struct bch_dev *ca) +{ + struct alloc_heap_entry e, *top = ca->alloc_heap.data; + + while (ca->alloc_heap.used) { + if (top->nr) { + size_t b = top->bucket; + + top->bucket++; + top->nr--; + return b; + } + + heap_pop(&ca->alloc_heap, e, bucket_alloc_cmp, NULL); + } + + return -1; +} + +/* + * returns sequence number of most recent journal entry that updated this + * bucket: + */ +static u64 bucket_journal_seq(struct bch_fs *c, struct bucket_mark m) +{ + if (m.journal_seq_valid) { + u64 journal_seq = atomic64_read(&c->journal.seq); + u64 bucket_seq = journal_seq; + + bucket_seq &= ~((u64) U16_MAX); + bucket_seq |= m.journal_seq; + + if (bucket_seq > journal_seq) + bucket_seq -= 1 << 16; + + return bucket_seq; + } else { + return 0; + } +} + +static int bch2_invalidate_one_bucket2(struct btree_trans *trans, + struct bch_dev *ca, + struct btree_iter *iter, + u64 *journal_seq, unsigned flags) +{ +#if 0 + __BKEY_PADDED(k, BKEY_ALLOC_VAL_U64s_MAX) alloc_key; +#else + /* hack: */ + __BKEY_PADDED(k, 8) alloc_key; +#endif + struct bch_fs *c = trans->c; + struct bkey_i_alloc *a; + struct bkey_alloc_unpacked u; + struct bucket *g; + struct bucket_mark m; + struct bkey_s_c k; + bool invalidating_cached_data; + size_t b; + int ret; + + BUG_ON(!ca->alloc_heap.used || + !ca->alloc_heap.data[0].nr); + b = ca->alloc_heap.data[0].bucket; + + /* first, put on free_inc and mark as owned by allocator: */ + percpu_down_read(&c->mark_lock); + spin_lock(&c->freelist_lock); + + verify_not_on_freelist(c, ca, b); + + BUG_ON(!fifo_push(&ca->free_inc, b)); + + bch2_mark_alloc_bucket(c, ca, b, true, gc_pos_alloc(c, NULL), 0); + + spin_unlock(&c->freelist_lock); + percpu_up_read(&c->mark_lock); + + BUG_ON(BKEY_ALLOC_VAL_U64s_MAX > 8); + + bch2_btree_iter_set_pos(iter, POS(ca->dev_idx, b)); +retry: + k = bch2_btree_iter_peek_slot(iter); + ret = bkey_err(k); + if (ret) + return ret; + + /* + * The allocator has to start before journal replay is finished - thus, + * we have to trust the in memory bucket @m, not the version in the + * btree: + */ + percpu_down_read(&c->mark_lock); + g = bucket(ca, b); + m = READ_ONCE(g->mark); + u = alloc_mem_to_key(g, m); + 
percpu_up_read(&c->mark_lock); + + invalidating_cached_data = m.cached_sectors != 0; + + u.gen++; + u.data_type = 0; + u.dirty_sectors = 0; + u.cached_sectors = 0; + u.read_time = c->bucket_clock[READ].hand; + u.write_time = c->bucket_clock[WRITE].hand; + + a = bkey_alloc_init(&alloc_key.k); + a->k.p = iter->pos; + bch2_alloc_pack(a, u); + + bch2_trans_update(trans, BTREE_INSERT_ENTRY(iter, &a->k_i)); + + /* + * XXX: + * when using deferred btree updates, we have journal reclaim doing + * btree updates and thus requiring the allocator to make forward + * progress, and here the allocator is requiring space in the journal - + * so we need a journal pre-reservation: + */ + ret = bch2_trans_commit(trans, NULL, + invalidating_cached_data ? journal_seq : NULL, + BTREE_INSERT_ATOMIC| + BTREE_INSERT_NOUNLOCK| + BTREE_INSERT_NOCHECK_RW| + BTREE_INSERT_NOFAIL| + BTREE_INSERT_USE_RESERVE| + BTREE_INSERT_USE_ALLOC_RESERVE| + BTREE_INSERT_BUCKET_INVALIDATE| + flags); + if (ret == -EINTR) + goto retry; + + if (!ret) { + /* remove from alloc_heap: */ + struct alloc_heap_entry e, *top = ca->alloc_heap.data; + + top->bucket++; + top->nr--; + + if (!top->nr) + heap_pop(&ca->alloc_heap, e, bucket_alloc_cmp, NULL); + + /* with btree still locked: */ + if (ca->buckets_written) + set_bit(b, ca->buckets_written); + + /* + * Make sure we flush the last journal entry that updated this + * bucket (i.e. deleting the last reference) before writing to + * this bucket again: + */ + *journal_seq = max(*journal_seq, bucket_journal_seq(c, m)); + } else { + size_t b2; + + /* remove from free_inc: */ + percpu_down_read(&c->mark_lock); + spin_lock(&c->freelist_lock); + + bch2_mark_alloc_bucket(c, ca, b, false, + gc_pos_alloc(c, NULL), 0); + + BUG_ON(!fifo_pop_back(&ca->free_inc, b2)); + BUG_ON(b != b2); + + spin_unlock(&c->freelist_lock); + percpu_up_read(&c->mark_lock); + } + + return ret; +} + +static bool bch2_invalidate_one_bucket(struct bch_fs *c, struct bch_dev *ca, + size_t bucket, u64 *flush_seq) +{ + struct bucket_mark m; + + percpu_down_read(&c->mark_lock); + spin_lock(&c->freelist_lock); + + bch2_invalidate_bucket(c, ca, bucket, &m); + + verify_not_on_freelist(c, ca, bucket); + BUG_ON(!fifo_push(&ca->free_inc, bucket)); + + spin_unlock(&c->freelist_lock); + + bucket_io_clock_reset(c, ca, bucket, READ); + bucket_io_clock_reset(c, ca, bucket, WRITE); + + percpu_up_read(&c->mark_lock); + + *flush_seq = max(*flush_seq, bucket_journal_seq(c, m)); + + return m.cached_sectors != 0; +} + +/* + * Pull buckets off ca->alloc_heap, invalidate them, move them to ca->free_inc: + */ +static int bch2_invalidate_buckets(struct bch_fs *c, struct bch_dev *ca) +{ + struct btree_trans trans; + struct btree_iter *iter; + u64 journal_seq = 0; + int ret = 0; + + bch2_trans_init(&trans, c, 0, 0); + + iter = bch2_trans_get_iter(&trans, BTREE_ID_ALLOC, + POS(ca->dev_idx, 0), + BTREE_ITER_SLOTS|BTREE_ITER_INTENT); + + /* Only use nowait if we've already invalidated at least one bucket: */ + while (!ret && + !fifo_full(&ca->free_inc) && + ca->alloc_heap.used) + ret = bch2_invalidate_one_bucket2(&trans, ca, iter, &journal_seq, + BTREE_INSERT_GC_LOCK_HELD| + (!fifo_empty(&ca->free_inc) + ? 
BTREE_INSERT_NOWAIT : 0)); + + bch2_trans_exit(&trans); + + /* If we used NOWAIT, don't return the error: */ + if (!fifo_empty(&ca->free_inc)) + ret = 0; + if (ret) { + bch_err(ca, "error invalidating buckets: %i", ret); + return ret; + } + + if (journal_seq) + ret = bch2_journal_flush_seq(&c->journal, journal_seq); + if (ret) { + bch_err(ca, "journal error: %i", ret); + return ret; + } + + return 0; +} + +static int push_invalidated_bucket(struct bch_fs *c, struct bch_dev *ca, size_t bucket) +{ + unsigned i; + int ret = 0; + + while (1) { + set_current_state(TASK_INTERRUPTIBLE); + + spin_lock(&c->freelist_lock); + for (i = 0; i < RESERVE_NR; i++) + if (fifo_push(&ca->free[i], bucket)) { + fifo_pop(&ca->free_inc, bucket); + + closure_wake_up(&c->freelist_wait); + ca->allocator_state = ALLOCATOR_RUNNING; + + spin_unlock(&c->freelist_lock); + goto out; + } + + if (ca->allocator_state != ALLOCATOR_BLOCKED_FULL) { + ca->allocator_state = ALLOCATOR_BLOCKED_FULL; + closure_wake_up(&c->freelist_wait); + } + + spin_unlock(&c->freelist_lock); + + if ((current->flags & PF_KTHREAD) && + kthread_should_stop()) { + ret = 1; + break; + } + + schedule(); + try_to_freeze(); + } +out: + __set_current_state(TASK_RUNNING); + return ret; +} + +/* + * Pulls buckets off free_inc, discards them (if enabled), then adds them to + * freelists, waiting until there's room if necessary: + */ +static int discard_invalidated_buckets(struct bch_fs *c, struct bch_dev *ca) +{ + while (!fifo_empty(&ca->free_inc)) { + size_t bucket = fifo_peek(&ca->free_inc); + + if (ca->mi.discard && + blk_queue_discard(bdev_get_queue(ca->disk_sb.bdev))) + blkdev_issue_discard(ca->disk_sb.bdev, + bucket_to_sector(ca, bucket), + ca->mi.bucket_size, GFP_NOIO, 0); + + if (push_invalidated_bucket(c, ca, bucket)) + return 1; + } + + return 0; +} + +/** + * bch_allocator_thread - move buckets from free_inc to reserves + * + * The free_inc FIFO is populated by find_reclaimable_buckets(), and + * the reserves are depleted by bucket allocation. When we run out + * of free_inc, try to invalidate some buckets and write out + * prios and gens. 
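Condensed control flow of the allocator thread documented above, as a sketch - gc_lock handling and error reporting are elided, and the loop body is paraphrased rather than copied, though the helper names match the functions in this file:

static int allocator_loop_sketch(struct bch_fs *c, struct bch_dev *ca)
{
	while (1) {
		/* 1) drain free_inc: discard buckets, push them onto reserves */
		if (discard_invalidated_buckets(c, ca))
			return 0;		/* kthread_should_stop() */

		/* 2) invalidate the buckets found on the previous pass */
		if (bch2_invalidate_buckets(c, ca))
			return 0;

		if (!fifo_empty(&ca->free_inc))
			continue;		/* more to drain first */

		/* 3) scan for reclaimable buckets, sleeping if none exist */
		while (!find_reclaimable_buckets(c, ca))
			if (wait_buckets_available(c, ca))
				return 0;
	}
}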
+ */ +static int bch2_allocator_thread(void *arg) +{ + struct bch_dev *ca = arg; + struct bch_fs *c = ca->fs; + size_t nr; + int ret; + + set_freezable(); + ca->allocator_state = ALLOCATOR_RUNNING; + + while (1) { + cond_resched(); + + pr_debug("discarding %zu invalidated buckets", + fifo_used(&ca->free_inc)); + + ret = discard_invalidated_buckets(c, ca); + if (ret) + goto stop; + + down_read(&c->gc_lock); + + ret = bch2_invalidate_buckets(c, ca); + if (ret) { + up_read(&c->gc_lock); + goto stop; + } + + if (!fifo_empty(&ca->free_inc)) { + up_read(&c->gc_lock); + continue; + } + + pr_debug("free_inc now empty"); + + do { + /* + * Find some buckets that we can invalidate, either + * they're completely unused, or only contain clean data + * that's been written back to the backing device or + * another cache tier + */ + + pr_debug("scanning for reclaimable buckets"); + + nr = find_reclaimable_buckets(c, ca); + + pr_debug("found %zu buckets", nr); + + trace_alloc_batch(ca, nr, ca->alloc_heap.size); + + if ((ca->inc_gen_needs_gc >= ALLOC_SCAN_BATCH(ca) || + ca->inc_gen_really_needs_gc) && + c->gc_thread) { + atomic_inc(&c->kick_gc); + wake_up_process(c->gc_thread); + } + + /* + * If we found any buckets, we have to invalidate them + * before we scan for more - but if we didn't find very + * many we may want to wait on more buckets being + * available so we don't spin: + */ + if (!nr || + (nr < ALLOC_SCAN_BATCH(ca) && + !fifo_full(&ca->free[RESERVE_MOVINGGC]))) { + ret = wait_buckets_available(c, ca); + if (ret) { + up_read(&c->gc_lock); + goto stop; + } + } + } while (!nr); + + up_read(&c->gc_lock); + + pr_debug("%zu buckets to invalidate", nr); + + /* + * alloc_heap is now full of newly-invalidated buckets: next, + * write out the new bucket gens: + */ + } + +stop: + pr_debug("alloc thread stopping (ret %i)", ret); + ca->allocator_state = ALLOCATOR_STOPPED; + closure_wake_up(&c->freelist_wait); + return 0; +} + +/* Startup/shutdown (ro/rw): */ + +void bch2_recalc_capacity(struct bch_fs *c) +{ + struct bch_dev *ca; + u64 capacity = 0, reserved_sectors = 0, gc_reserve; + unsigned bucket_size_max = 0; + unsigned long ra_pages = 0; + unsigned i, j; + + lockdep_assert_held(&c->state_lock); + + for_each_online_member(ca, c, i) { + struct backing_dev_info *bdi = ca->disk_sb.bdev->bd_bdi; + + ra_pages += bdi->ra_pages; + } + + bch2_set_ra_pages(c, ra_pages); + + for_each_rw_member(ca, c, i) { + u64 dev_reserve = 0; + + /* + * We need to reserve buckets (from the number + * of currently available buckets) against + * foreground writes so that mainly copygc can + * make forward progress. + * + * We need enough to refill the various reserves + * from scratch - copygc will use its entire + * reserve all at once, then run against when + * its reserve is refilled (from the formerly + * available buckets). + * + * This reserve is just used when considering if + * allocations for foreground writes must wait - + * not -ENOSPC calculations. + */ + for (j = 0; j < RESERVE_NONE; j++) + dev_reserve += ca->free[j].size; + + dev_reserve += 1; /* btree write point */ + dev_reserve += 1; /* copygc write point */ + dev_reserve += 1; /* rebalance write point */ + + dev_reserve *= ca->mi.bucket_size; + + ca->copygc_threshold = dev_reserve; + + capacity += bucket_to_sector(ca, ca->mi.nbuckets - + ca->mi.first_bucket); + + reserved_sectors += dev_reserve * 2; + + bucket_size_max = max_t(unsigned, bucket_size_max, + ca->mi.bucket_size); + } + + gc_reserve = c->opts.gc_reserve_bytes + ? 
c->opts.gc_reserve_bytes >> 9
+ : div64_u64(capacity * c->opts.gc_reserve_percent, 100);
+
+ reserved_sectors = max(gc_reserve, reserved_sectors);
+
+ reserved_sectors = min(reserved_sectors, capacity);
+
+ c->capacity = capacity - reserved_sectors;
+
+ c->bucket_size_max = bucket_size_max;
+
+ if (c->capacity) {
+ bch2_io_timer_add(&c->io_clock[READ],
+ &c->bucket_clock[READ].rescale);
+ bch2_io_timer_add(&c->io_clock[WRITE],
+ &c->bucket_clock[WRITE].rescale);
+ } else {
+ bch2_io_timer_del(&c->io_clock[READ],
+ &c->bucket_clock[READ].rescale);
+ bch2_io_timer_del(&c->io_clock[WRITE],
+ &c->bucket_clock[WRITE].rescale);
+ }
+
+ /* Wake up in case someone was waiting for buckets */
+ closure_wake_up(&c->freelist_wait);
+}
+
+static bool bch2_dev_has_open_write_point(struct bch_fs *c, struct bch_dev *ca)
+{
+ struct open_bucket *ob;
+ bool ret = false;
+
+ for (ob = c->open_buckets;
+ ob < c->open_buckets + ARRAY_SIZE(c->open_buckets);
+ ob++) {
+ spin_lock(&ob->lock);
+ if (ob->valid && !ob->on_partial_list &&
+ ob->ptr.dev == ca->dev_idx)
+ ret = true;
+ spin_unlock(&ob->lock);
+ }
+
+ return ret;
+}
+
+/* device goes ro: */
+void bch2_dev_allocator_remove(struct bch_fs *c, struct bch_dev *ca)
+{
+ unsigned i;
+
+ BUG_ON(ca->alloc_thread);
+
+ /* First, remove device from allocation groups: */
+
+ for (i = 0; i < ARRAY_SIZE(c->rw_devs); i++)
+ clear_bit(ca->dev_idx, c->rw_devs[i].d);
+
+ /*
+ * Capacity is calculated based on the devices in allocation groups:
+ */
+ bch2_recalc_capacity(c);
+
+ /* Next, close write points that point to this device... */
+ for (i = 0; i < ARRAY_SIZE(c->write_points); i++)
+ bch2_writepoint_stop(c, ca, &c->write_points[i]);
+
+ bch2_writepoint_stop(c, ca, &ca->copygc_write_point);
+ bch2_writepoint_stop(c, ca, &c->rebalance_write_point);
+ bch2_writepoint_stop(c, ca, &c->btree_write_point);
+
+ mutex_lock(&c->btree_reserve_cache_lock);
+ while (c->btree_reserve_cache_nr) {
+ struct btree_alloc *a =
+ &c->btree_reserve_cache[--c->btree_reserve_cache_nr];
+
+ bch2_open_buckets_put(c, &a->ob);
+ }
+ mutex_unlock(&c->btree_reserve_cache_lock);
+
+ while (1) {
+ struct open_bucket *ob;
+
+ spin_lock(&c->freelist_lock);
+ if (!ca->open_buckets_partial_nr) {
+ spin_unlock(&c->freelist_lock);
+ break;
+ }
+ ob = c->open_buckets +
+ ca->open_buckets_partial[--ca->open_buckets_partial_nr];
+ ob->on_partial_list = false;
+ spin_unlock(&c->freelist_lock);
+
+ bch2_open_bucket_put(c, ob);
+ }
+
+ bch2_ec_stop_dev(c, ca);
+
+ /*
+ * Wake up threads that were blocked on allocation, so they can notice
+ * the device can no longer be removed and the capacity has changed:
+ */
+ closure_wake_up(&c->freelist_wait);
+
+ /*
+ * journal_res_get() can block waiting for free space in the journal -
+ * it needs to notice there may not be devices to allocate from anymore:
+ */
+ wake_up(&c->journal.wait);
+
+ /* Now wait for any in flight writes: */
+
+ closure_wait_event(&c->open_buckets_wait,
+ !bch2_dev_has_open_write_point(c, ca));
+}
+
+/* device goes rw: */
+void bch2_dev_allocator_add(struct bch_fs *c, struct bch_dev *ca)
+{
+ unsigned i;
+
+ for (i = 0; i < ARRAY_SIZE(c->rw_devs); i++)
+ if (ca->mi.data_allowed & (1 << i))
+ set_bit(ca->dev_idx, c->rw_devs[i].d);
+}
+
+void bch2_dev_allocator_quiesce(struct bch_fs *c, struct bch_dev *ca)
+{
+ if (ca->alloc_thread)
+ closure_wait_event(&c->freelist_wait,
+ ca->allocator_state != ALLOCATOR_RUNNING);
+}
+
+/* stop allocator thread: */
+void bch2_dev_allocator_stop(struct bch_dev *ca)
+{
+ struct task_struct *p;
+
+ p =
rcu_dereference_protected(ca->alloc_thread, 1); + ca->alloc_thread = NULL; + + /* + * We need an rcu barrier between setting ca->alloc_thread = NULL and + * the thread shutting down to avoid bch2_wake_allocator() racing: + * + * XXX: it would be better to have the rcu barrier be asynchronous + * instead of blocking us here + */ + synchronize_rcu(); + + if (p) { + kthread_stop(p); + put_task_struct(p); + } +} + +/* start allocator thread: */ +int bch2_dev_allocator_start(struct bch_dev *ca) +{ + struct task_struct *p; + + /* + * allocator thread already started? + */ + if (ca->alloc_thread) + return 0; + + p = kthread_create(bch2_allocator_thread, ca, + "bch_alloc[%s]", ca->name); + if (IS_ERR(p)) + return PTR_ERR(p); + + get_task_struct(p); + rcu_assign_pointer(ca->alloc_thread, p); + wake_up_process(p); + return 0; +} + +static bool flush_held_btree_writes(struct bch_fs *c) +{ + struct bucket_table *tbl; + struct rhash_head *pos; + struct btree *b; + bool nodes_unwritten; + size_t i; +again: + cond_resched(); + nodes_unwritten = false; + + rcu_read_lock(); + for_each_cached_btree(b, c, tbl, i, pos) + if (btree_node_need_write(b)) { + if (btree_node_may_write(b)) { + rcu_read_unlock(); + btree_node_lock_type(c, b, SIX_LOCK_read); + bch2_btree_node_write(c, b, SIX_LOCK_read); + six_unlock_read(&b->lock); + goto again; + } else { + nodes_unwritten = true; + } + } + rcu_read_unlock(); + + if (c->btree_roots_dirty) { + bch2_journal_meta(&c->journal); + goto again; + } + + return !nodes_unwritten && + !bch2_btree_interior_updates_nr_pending(c); +} + +static void allocator_start_issue_discards(struct bch_fs *c) +{ + struct bch_dev *ca; + unsigned dev_iter; + size_t bu; + + for_each_rw_member(ca, c, dev_iter) + while (fifo_pop(&ca->free_inc, bu)) + blkdev_issue_discard(ca->disk_sb.bdev, + bucket_to_sector(ca, bu), + ca->mi.bucket_size, GFP_NOIO, 0); +} + +static int resize_free_inc(struct bch_dev *ca) +{ + alloc_fifo free_inc; + + if (!fifo_full(&ca->free_inc)) + return 0; + + if (!init_fifo(&free_inc, + ca->free_inc.size * 2, + GFP_KERNEL)) + return -ENOMEM; + + fifo_move(&free_inc, &ca->free_inc); + swap(free_inc, ca->free_inc); + free_fifo(&free_inc); + return 0; +} + +static bool bch2_fs_allocator_start_fast(struct bch_fs *c) +{ + struct bch_dev *ca; + unsigned dev_iter; + bool ret = true; + + if (test_alloc_startup(c)) + return false; + + down_read(&c->gc_lock); + + /* Scan for buckets that are already invalidated: */ + for_each_rw_member(ca, c, dev_iter) { + struct bucket_array *buckets; + struct bucket_mark m; + long bu; + + down_read(&ca->bucket_lock); + buckets = bucket_array(ca); + + for (bu = buckets->first_bucket; + bu < buckets->nbuckets; bu++) { + m = READ_ONCE(buckets->b[bu].mark); + + if (!buckets->b[bu].gen_valid || + !is_available_bucket(m) || + m.cached_sectors || + (ca->buckets_nouse && + test_bit(bu, ca->buckets_nouse))) + continue; + + percpu_down_read(&c->mark_lock); + bch2_mark_alloc_bucket(c, ca, bu, true, + gc_pos_alloc(c, NULL), 0); + percpu_up_read(&c->mark_lock); + + fifo_push(&ca->free_inc, bu); + + discard_invalidated_buckets(c, ca); + + if (fifo_full(&ca->free[RESERVE_BTREE])) + break; + } + up_read(&ca->bucket_lock); + } + + up_read(&c->gc_lock); + + /* did we find enough buckets? 
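flush_held_btree_writes() above uses a common kernel idiom: blocking isn't allowed inside rcu_read_lock(), so when an element needs blocking work the scan drops the read lock, does the work, and restarts from the top, since the structure may have changed in the meantime. A skeleton of the idiom on a plain RCU list - needs_work() and do_blocking_work() are placeholders, and the in-tree code walks a resizable hash table instead:

#include <linux/list.h>
#include <linux/rcupdate.h>
#include <linux/sched.h>

struct my_node {
	struct list_head list;
	bool dirty;
};

static bool needs_work(struct my_node *n)
{
	return READ_ONCE(n->dirty);
}

static void do_blocking_work(struct my_node *n)
{
	/* write the node out, take sleeping locks, etc. */
}

static void flush_all(struct list_head *head)
{
	struct my_node *n;
again:
	cond_resched();

	rcu_read_lock();
	list_for_each_entry_rcu(n, head, list)
		if (needs_work(n)) {
			rcu_read_unlock();
			do_blocking_work(n);	/* may sleep */
			goto again;	/* list may have changed; rescan */
		}
	rcu_read_unlock();
}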
*/ + for_each_rw_member(ca, c, dev_iter) + if (!fifo_full(&ca->free[RESERVE_BTREE])) + ret = false; + + return ret; +} + +int bch2_fs_allocator_start(struct bch_fs *c) +{ + struct bch_dev *ca; + unsigned dev_iter; + u64 journal_seq = 0; + bool wrote; + long bu; + int ret = 0; + + if (!test_alloc_startup(c) && + bch2_fs_allocator_start_fast(c)) + return 0; + + pr_debug("not enough empty buckets; scanning for reclaimable buckets"); + + /* + * We're moving buckets to freelists _before_ they've been marked as + * invalidated on disk - we have to so that we can allocate new btree + * nodes to mark them as invalidated on disk. + * + * However, we can't _write_ to any of these buckets yet - they might + * have cached data in them, which is live until they're marked as + * invalidated on disk: + */ + set_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags); + + down_read(&c->gc_lock); + do { + wrote = false; + + for_each_rw_member(ca, c, dev_iter) { + find_reclaimable_buckets(c, ca); + + while (!fifo_full(&ca->free[RESERVE_BTREE]) && + (bu = next_alloc_bucket(ca)) >= 0) { + ret = resize_free_inc(ca); + if (ret) { + percpu_ref_put(&ca->io_ref); + up_read(&c->gc_lock); + goto err; + } + + bch2_invalidate_one_bucket(c, ca, bu, + &journal_seq); + + fifo_push(&ca->free[RESERVE_BTREE], bu); + } + } + + pr_debug("done scanning for reclaimable buckets"); + + /* + * XXX: it's possible for this to deadlock waiting on journal reclaim, + * since we're holding btree writes. What then? + */ + ret = bch2_alloc_write(c, + BTREE_INSERT_NOCHECK_RW| + BTREE_INSERT_USE_ALLOC_RESERVE| + BTREE_INSERT_NOWAIT, &wrote); + + /* + * If bch2_alloc_write() did anything, it may have used some + * buckets, and we need the RESERVE_BTREE freelist full - so we + * need to loop and scan again. + * And if it errored, it may have been because there weren't + * enough buckets, so just scan and loop again as long as it + * made some progress: + */ + } while (wrote); + up_read(&c->gc_lock); + + if (ret) + goto err; + + pr_debug("flushing journal"); + + ret = bch2_journal_flush(&c->journal); + if (ret) + goto err; + + pr_debug("issuing discards"); + allocator_start_issue_discards(c); +err: + clear_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags); + closure_wait_event(&c->btree_interior_update_wait, + flush_held_btree_writes(c)); + + return ret; +} + +void bch2_fs_allocator_background_init(struct bch_fs *c) +{ + spin_lock_init(&c->freelist_lock); + bch2_bucket_clock_init(c, READ); + bch2_bucket_clock_init(c, WRITE); + + c->pd_controllers_update_seconds = 5; + INIT_DELAYED_WORK(&c->pd_controllers_update, pd_controllers_update); +} diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h new file mode 100644 index 000000000000..0c1a0f0dd2ab --- /dev/null +++ b/fs/bcachefs/alloc_background.h @@ -0,0 +1,75 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_ALLOC_BACKGROUND_H +#define _BCACHEFS_ALLOC_BACKGROUND_H + +#include "bcachefs.h" +#include "alloc_types.h" +#include "debug.h" + +struct bkey_alloc_unpacked { + u8 gen; +#define x(_name, _bits) u##_bits _name; + BCH_ALLOC_FIELDS() +#undef x +}; + +struct bkey_alloc_unpacked bch2_alloc_unpack(struct bkey_s_c); +void bch2_alloc_pack(struct bkey_i_alloc *, + const struct bkey_alloc_unpacked); + +#define ALLOC_SCAN_BATCH(ca) max_t(size_t, 1, (ca)->mi.nbuckets >> 9) + +const char *bch2_alloc_invalid(const struct bch_fs *, struct bkey_s_c); +void bch2_alloc_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); + +#define bch2_bkey_ops_alloc (struct bkey_ops) { \ + .key_invalid 
= bch2_alloc_invalid, \ + .val_to_text = bch2_alloc_to_text, \ +} + +struct journal_keys; +int bch2_alloc_read(struct bch_fs *, struct journal_keys *); +int bch2_alloc_replay_key(struct bch_fs *, struct bkey_i *); + +static inline void bch2_wake_allocator(struct bch_dev *ca) +{ + struct task_struct *p; + + rcu_read_lock(); + p = rcu_dereference(ca->alloc_thread); + if (p) + wake_up_process(p); + rcu_read_unlock(); +} + +static inline void verify_not_on_freelist(struct bch_fs *c, struct bch_dev *ca, + size_t bucket) +{ + if (expensive_debug_checks(c) && + test_bit(BCH_FS_ALLOCATOR_STARTED, &c->flags)) { + size_t iter; + long i; + unsigned j; + + for (j = 0; j < RESERVE_NR; j++) + fifo_for_each_entry(i, &ca->free[j], iter) + BUG_ON(i == bucket); + fifo_for_each_entry(i, &ca->free_inc, iter) + BUG_ON(i == bucket); + } +} + +void bch2_recalc_capacity(struct bch_fs *); + +void bch2_dev_allocator_remove(struct bch_fs *, struct bch_dev *); +void bch2_dev_allocator_add(struct bch_fs *, struct bch_dev *); + +void bch2_dev_allocator_quiesce(struct bch_fs *, struct bch_dev *); +void bch2_dev_allocator_stop(struct bch_dev *); +int bch2_dev_allocator_start(struct bch_dev *); + +int bch2_alloc_write(struct bch_fs *, unsigned, bool *); +int bch2_fs_allocator_start(struct bch_fs *); +void bch2_fs_allocator_background_init(struct bch_fs *); + +#endif /* _BCACHEFS_ALLOC_BACKGROUND_H */ diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c new file mode 100644 index 000000000000..e64f8449462f --- /dev/null +++ b/fs/bcachefs/alloc_foreground.c @@ -0,0 +1,1045 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Primary bucket allocation code + * + * Copyright 2012 Google, Inc. + * + * Allocation in bcache is done in terms of buckets: + * + * Each bucket has associated an 8 bit gen; this gen corresponds to the gen in + * btree pointers - they must match for the pointer to be considered valid. + * + * Thus (assuming a bucket has no dirty data or metadata in it) we can reuse a + * bucket simply by incrementing its gen. + * + * The gens (along with the priorities; it's really the gens are important but + * the code is named as if it's the priorities) are written in an arbitrary list + * of buckets on disk, with a pointer to them in the journal header. + * + * When we invalidate a bucket, we have to write its new gen to disk and wait + * for that write to complete before we use it - otherwise after a crash we + * could have pointers that appeared to be good but pointed to data that had + * been overwritten. + * + * Since the gens and priorities are all stored contiguously on disk, we can + * batch this up: We fill up the free_inc list with freshly invalidated buckets, + * call prio_write(), and when prio_write() finishes we pull buckets off the + * free_inc list and optionally discard them. + * + * free_inc isn't the only freelist - if it was, we'd often have to sleep while + * priorities and gens were being written before we could allocate. c->free is a + * smaller freelist, and buckets on that list are always ready to be used. + * + * If we've got discards enabled, that happens when a bucket moves from the + * free_inc list to the free list. + * + * It's important to ensure that gens don't wrap around - with respect to + * either the oldest gen in the btree or the gen on disk. 
This is quite + * difficult to do in practice, but we explicitly guard against it anyways - if + * a bucket is in danger of wrapping around we simply skip invalidating it that + * time around, and we garbage collect or rewrite the priorities sooner than we + * would have otherwise. + * + * bch2_bucket_alloc() allocates a single bucket from a specific device. + * + * bch2_bucket_alloc_set() allocates one or more buckets from different devices + * in a given filesystem. + * + * invalidate_buckets() drives all the processes described above. It's called + * from bch2_bucket_alloc() and a few other places that need to make sure free + * buckets are ready. + * + * invalidate_buckets_(lru|fifo)() find buckets that are available to be + * invalidated, and then invalidate them and stick them on the free_inc list - + * in either lru or fifo order. + */ + +#include "bcachefs.h" +#include "alloc_background.h" +#include "alloc_foreground.h" +#include "btree_gc.h" +#include "buckets.h" +#include "clock.h" +#include "debug.h" +#include "disk_groups.h" +#include "ec.h" +#include "io.h" + +#include <linux/math64.h> +#include <linux/rculist.h> +#include <linux/rcupdate.h> +#include <trace/events/bcachefs.h> + +enum bucket_alloc_ret { + ALLOC_SUCCESS, + OPEN_BUCKETS_EMPTY, + FREELIST_EMPTY, /* Allocator thread not keeping up */ +}; + +/* + * Open buckets represent a bucket that's currently being allocated from. They + * serve two purposes: + * + * - They track buckets that have been partially allocated, allowing for + * sub-bucket sized allocations - they're used by the sector allocator below + * + * - They provide a reference to the buckets they own that mark and sweep GC + * can find, until the new allocation has a pointer to it inserted into the + * btree + * + * When allocating some space with the sector allocator, the allocation comes + * with a reference to an open bucket - the caller is required to put that + * reference _after_ doing the index update that makes its allocation reachable. 
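+ *
+ * Editor's sketch of that contract (not lifted from a real caller; most
+ * arguments and all error handling elided, helpers as declared in
+ * alloc_foreground.h):
+ *
+ *	wp = bch2_alloc_sectors_start(c, target, erasure_code, ...);
+ *	bch2_alloc_sectors_append_ptrs(c, wp, k, sectors);
+ *	bch2_open_bucket_get(c, wp, &obs);	take refs while wp is locked
+ *	bch2_alloc_sectors_done(c, wp);		unlocks wp
+ *	...do the btree update that makes k reachable...
+ *	bch2_open_buckets_put(c, &obs);		put refs only after the update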
+ */ + +void __bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob) +{ + struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev); + + if (ob->ec) { + bch2_ec_bucket_written(c, ob); + return; + } + + percpu_down_read(&c->mark_lock); + spin_lock(&ob->lock); + + bch2_mark_alloc_bucket(c, ca, PTR_BUCKET_NR(ca, &ob->ptr), + false, gc_pos_alloc(c, ob), 0); + ob->valid = false; + ob->type = 0; + + spin_unlock(&ob->lock); + percpu_up_read(&c->mark_lock); + + spin_lock(&c->freelist_lock); + ob->freelist = c->open_buckets_freelist; + c->open_buckets_freelist = ob - c->open_buckets; + c->open_buckets_nr_free++; + spin_unlock(&c->freelist_lock); + + closure_wake_up(&c->open_buckets_wait); +} + +void bch2_open_bucket_write_error(struct bch_fs *c, + struct open_buckets *obs, + unsigned dev) +{ + struct open_bucket *ob; + unsigned i; + + open_bucket_for_each(c, obs, ob, i) + if (ob->ptr.dev == dev && + ob->ec) + bch2_ec_bucket_cancel(c, ob); +} + +static struct open_bucket *bch2_open_bucket_alloc(struct bch_fs *c) +{ + struct open_bucket *ob; + + BUG_ON(!c->open_buckets_freelist || !c->open_buckets_nr_free); + + ob = c->open_buckets + c->open_buckets_freelist; + c->open_buckets_freelist = ob->freelist; + atomic_set(&ob->pin, 1); + ob->type = 0; + + c->open_buckets_nr_free--; + return ob; +} + +static void open_bucket_free_unused(struct bch_fs *c, + struct open_bucket *ob, + bool may_realloc) +{ + struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev); + + BUG_ON(ca->open_buckets_partial_nr >= + ARRAY_SIZE(ca->open_buckets_partial)); + + if (ca->open_buckets_partial_nr < + ARRAY_SIZE(ca->open_buckets_partial) && + may_realloc) { + spin_lock(&c->freelist_lock); + ob->on_partial_list = true; + ca->open_buckets_partial[ca->open_buckets_partial_nr++] = + ob - c->open_buckets; + spin_unlock(&c->freelist_lock); + + closure_wake_up(&c->open_buckets_wait); + closure_wake_up(&c->freelist_wait); + } else { + bch2_open_bucket_put(c, ob); + } +} + +static void verify_not_stale(struct bch_fs *c, const struct open_buckets *obs) +{ +#ifdef CONFIG_BCACHEFS_DEBUG + struct open_bucket *ob; + unsigned i; + + open_bucket_for_each(c, obs, ob, i) { + struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev); + + BUG_ON(ptr_stale(ca, &ob->ptr)); + } +#endif +} + +/* _only_ for allocating the journal on a new device: */ +long bch2_bucket_alloc_new_fs(struct bch_dev *ca) +{ + struct bucket_array *buckets; + ssize_t b; + + rcu_read_lock(); + buckets = bucket_array(ca); + + for (b = ca->mi.first_bucket; b < ca->mi.nbuckets; b++) + if (is_available_bucket(buckets->b[b].mark)) + goto success; + b = -1; +success: + rcu_read_unlock(); + return b; +} + +static inline unsigned open_buckets_reserved(enum alloc_reserve reserve) +{ + switch (reserve) { + case RESERVE_ALLOC: + return 0; + case RESERVE_BTREE: + return BTREE_NODE_OPEN_BUCKET_RESERVE; + default: + return BTREE_NODE_OPEN_BUCKET_RESERVE * 2; + } +} + +/** + * bch_bucket_alloc - allocate a single bucket from a specific device + * + * Returns index of bucket on success, 0 on failure + * */ +struct open_bucket *bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca, + enum alloc_reserve reserve, + bool may_alloc_partial, + struct closure *cl) +{ + struct bucket_array *buckets; + struct open_bucket *ob; + long bucket = 0; + + spin_lock(&c->freelist_lock); + + if (may_alloc_partial && + ca->open_buckets_partial_nr) { + ob = c->open_buckets + + ca->open_buckets_partial[--ca->open_buckets_partial_nr]; + ob->on_partial_list = false; + spin_unlock(&c->freelist_lock); + return 
ob; + } + + if (unlikely(c->open_buckets_nr_free <= open_buckets_reserved(reserve))) { + if (cl) + closure_wait(&c->open_buckets_wait, cl); + + if (!c->blocked_allocate_open_bucket) + c->blocked_allocate_open_bucket = local_clock(); + + spin_unlock(&c->freelist_lock); + trace_open_bucket_alloc_fail(ca, reserve); + return ERR_PTR(-OPEN_BUCKETS_EMPTY); + } + + if (likely(fifo_pop(&ca->free[RESERVE_NONE], bucket))) + goto out; + + switch (reserve) { + case RESERVE_ALLOC: + if (fifo_pop(&ca->free[RESERVE_BTREE], bucket)) + goto out; + break; + case RESERVE_BTREE: + if (fifo_used(&ca->free[RESERVE_BTREE]) * 2 >= + ca->free[RESERVE_BTREE].size && + fifo_pop(&ca->free[RESERVE_BTREE], bucket)) + goto out; + break; + case RESERVE_MOVINGGC: + if (fifo_pop(&ca->free[RESERVE_MOVINGGC], bucket)) + goto out; + break; + default: + break; + } + + if (cl) + closure_wait(&c->freelist_wait, cl); + + if (!c->blocked_allocate) + c->blocked_allocate = local_clock(); + + spin_unlock(&c->freelist_lock); + + trace_bucket_alloc_fail(ca, reserve); + return ERR_PTR(-FREELIST_EMPTY); +out: + verify_not_on_freelist(c, ca, bucket); + + ob = bch2_open_bucket_alloc(c); + + spin_lock(&ob->lock); + buckets = bucket_array(ca); + + ob->valid = true; + ob->sectors_free = ca->mi.bucket_size; + ob->ptr = (struct bch_extent_ptr) { + .type = 1 << BCH_EXTENT_ENTRY_ptr, + .gen = buckets->b[bucket].mark.gen, + .offset = bucket_to_sector(ca, bucket), + .dev = ca->dev_idx, + }; + + bucket_io_clock_reset(c, ca, bucket, READ); + bucket_io_clock_reset(c, ca, bucket, WRITE); + spin_unlock(&ob->lock); + + if (c->blocked_allocate_open_bucket) { + bch2_time_stats_update( + &c->times[BCH_TIME_blocked_allocate_open_bucket], + c->blocked_allocate_open_bucket); + c->blocked_allocate_open_bucket = 0; + } + + if (c->blocked_allocate) { + bch2_time_stats_update( + &c->times[BCH_TIME_blocked_allocate], + c->blocked_allocate); + c->blocked_allocate = 0; + } + + spin_unlock(&c->freelist_lock); + + bch2_wake_allocator(ca); + + trace_bucket_alloc(ca, reserve); + return ob; +} + +static int __dev_stripe_cmp(struct dev_stripe_state *stripe, + unsigned l, unsigned r) +{ + return ((stripe->next_alloc[l] > stripe->next_alloc[r]) - + (stripe->next_alloc[l] < stripe->next_alloc[r])); +} + +#define dev_stripe_cmp(l, r) __dev_stripe_cmp(stripe, l, r) + +struct dev_alloc_list bch2_dev_alloc_list(struct bch_fs *c, + struct dev_stripe_state *stripe, + struct bch_devs_mask *devs) +{ + struct dev_alloc_list ret = { .nr = 0 }; + struct bch_dev *ca; + unsigned i; + + for_each_member_device_rcu(ca, c, i, devs) + ret.devs[ret.nr++] = i; + + bubble_sort(ret.devs, ret.nr, dev_stripe_cmp); + return ret; +} + +void bch2_dev_stripe_increment(struct bch_fs *c, struct bch_dev *ca, + struct dev_stripe_state *stripe) +{ + u64 *v = stripe->next_alloc + ca->dev_idx; + u64 free_space = dev_buckets_free(c, ca); + u64 free_space_inv = free_space + ? div64_u64(1ULL << 48, free_space) + : 1ULL << 48; + u64 scale = *v / 4; + + if (*v + free_space_inv >= *v) + *v += free_space_inv; + else + *v = U64_MAX; + + for (v = stripe->next_alloc; + v < stripe->next_alloc + ARRAY_SIZE(stripe->next_alloc); v++) + *v = *v < scale ? 
0 : *v - scale; +} + +#define BUCKET_MAY_ALLOC_PARTIAL (1 << 0) +#define BUCKET_ALLOC_USE_DURABILITY (1 << 1) + +static void add_new_bucket(struct bch_fs *c, + struct open_buckets *ptrs, + struct bch_devs_mask *devs_may_alloc, + unsigned *nr_effective, + bool *have_cache, + unsigned flags, + struct open_bucket *ob) +{ + unsigned durability = + bch_dev_bkey_exists(c, ob->ptr.dev)->mi.durability; + + __clear_bit(ob->ptr.dev, devs_may_alloc->d); + *nr_effective += (flags & BUCKET_ALLOC_USE_DURABILITY) + ? durability : 1; + *have_cache |= !durability; + + ob_push(c, ptrs, ob); +} + +static int bch2_bucket_alloc_set(struct bch_fs *c, + struct open_buckets *ptrs, + struct dev_stripe_state *stripe, + struct bch_devs_mask *devs_may_alloc, + unsigned nr_replicas, + unsigned *nr_effective, + bool *have_cache, + enum alloc_reserve reserve, + unsigned flags, + struct closure *cl) +{ + struct dev_alloc_list devs_sorted = + bch2_dev_alloc_list(c, stripe, devs_may_alloc); + struct bch_dev *ca; + bool alloc_failure = false; + unsigned i; + + BUG_ON(*nr_effective >= nr_replicas); + + for (i = 0; i < devs_sorted.nr; i++) { + struct open_bucket *ob; + + ca = rcu_dereference(c->devs[devs_sorted.devs[i]]); + if (!ca) + continue; + + if (!ca->mi.durability && *have_cache) + continue; + + ob = bch2_bucket_alloc(c, ca, reserve, + flags & BUCKET_MAY_ALLOC_PARTIAL, cl); + if (IS_ERR(ob)) { + enum bucket_alloc_ret ret = -PTR_ERR(ob); + + WARN_ON(reserve == RESERVE_MOVINGGC && + ret != OPEN_BUCKETS_EMPTY); + + if (cl) + return -EAGAIN; + if (ret == OPEN_BUCKETS_EMPTY) + return -ENOSPC; + alloc_failure = true; + continue; + } + + add_new_bucket(c, ptrs, devs_may_alloc, + nr_effective, have_cache, flags, ob); + + bch2_dev_stripe_increment(c, ca, stripe); + + if (*nr_effective >= nr_replicas) + return 0; + } + + return alloc_failure ? 
-ENOSPC : -EROFS; +} + +/* Allocate from stripes: */ + +/* + * XXX: use a higher watermark for allocating open buckets here: + */ +static int ec_stripe_alloc(struct bch_fs *c, struct ec_stripe_head *h) +{ + struct bch_devs_mask devs; + struct open_bucket *ob; + unsigned i, nr_have = 0, nr_data = + min_t(unsigned, h->nr_active_devs, + EC_STRIPE_MAX) - h->redundancy; + bool have_cache = true; + int ret = 0; + + BUG_ON(h->blocks.nr > nr_data); + BUG_ON(h->parity.nr > h->redundancy); + + devs = h->devs; + + open_bucket_for_each(c, &h->parity, ob, i) + __clear_bit(ob->ptr.dev, devs.d); + open_bucket_for_each(c, &h->blocks, ob, i) + __clear_bit(ob->ptr.dev, devs.d); + + percpu_down_read(&c->mark_lock); + rcu_read_lock(); + + if (h->parity.nr < h->redundancy) { + nr_have = h->parity.nr; + + ret = bch2_bucket_alloc_set(c, &h->parity, + &h->parity_stripe, + &devs, + h->redundancy, + &nr_have, + &have_cache, + RESERVE_NONE, + 0, + NULL); + if (ret) + goto err; + } + + if (h->blocks.nr < nr_data) { + nr_have = h->blocks.nr; + + ret = bch2_bucket_alloc_set(c, &h->blocks, + &h->block_stripe, + &devs, + nr_data, + &nr_have, + &have_cache, + RESERVE_NONE, + 0, + NULL); + if (ret) + goto err; + } + + rcu_read_unlock(); + percpu_up_read(&c->mark_lock); + + return bch2_ec_stripe_new_alloc(c, h); +err: + rcu_read_unlock(); + percpu_up_read(&c->mark_lock); + return -1; +} + +/* + * if we can't allocate a new stripe because there are already too many + * partially filled stripes, force allocating from an existing stripe even when + * it's to a device we don't want: + */ + +static void bucket_alloc_from_stripe(struct bch_fs *c, + struct open_buckets *ptrs, + struct write_point *wp, + struct bch_devs_mask *devs_may_alloc, + u16 target, + unsigned erasure_code, + unsigned nr_replicas, + unsigned *nr_effective, + bool *have_cache, + unsigned flags) +{ + struct dev_alloc_list devs_sorted; + struct ec_stripe_head *h; + struct open_bucket *ob; + struct bch_dev *ca; + unsigned i, ec_idx; + + if (!erasure_code) + return; + + if (nr_replicas < 2) + return; + + if (ec_open_bucket(c, ptrs)) + return; + + h = bch2_ec_stripe_head_get(c, target, erasure_code, nr_replicas - 1); + if (!h) + return; + + if (!h->s && ec_stripe_alloc(c, h)) + goto out_put_head; + + rcu_read_lock(); + devs_sorted = bch2_dev_alloc_list(c, &wp->stripe, devs_may_alloc); + rcu_read_unlock(); + + for (i = 0; i < devs_sorted.nr; i++) + open_bucket_for_each(c, &h->s->blocks, ob, ec_idx) + if (ob->ptr.dev == devs_sorted.devs[i] && + !test_and_set_bit(ec_idx, h->s->blocks_allocated)) + goto got_bucket; + goto out_put_head; +got_bucket: + ca = bch_dev_bkey_exists(c, ob->ptr.dev); + + ob->ec_idx = ec_idx; + ob->ec = h->s; + + add_new_bucket(c, ptrs, devs_may_alloc, + nr_effective, have_cache, flags, ob); + atomic_inc(&h->s->pin); +out_put_head: + bch2_ec_stripe_head_put(h); +} + +/* Sector allocator */ + +static void get_buckets_from_writepoint(struct bch_fs *c, + struct open_buckets *ptrs, + struct write_point *wp, + struct bch_devs_mask *devs_may_alloc, + unsigned nr_replicas, + unsigned *nr_effective, + bool *have_cache, + unsigned flags, + bool need_ec) +{ + struct open_buckets ptrs_skip = { .nr = 0 }; + struct open_bucket *ob; + unsigned i; + + open_bucket_for_each(c, &wp->ptrs, ob, i) { + struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev); + + if (*nr_effective < nr_replicas && + test_bit(ob->ptr.dev, devs_may_alloc->d) && + (ca->mi.durability || + (wp->type == BCH_DATA_USER && !*have_cache)) && + (ob->ec || !need_ec)) { + add_new_bucket(c, ptrs, 
devs_may_alloc, + nr_effective, have_cache, + flags, ob); + } else { + ob_push(c, &ptrs_skip, ob); + } + } + wp->ptrs = ptrs_skip; +} + +static int open_bucket_add_buckets(struct bch_fs *c, + struct open_buckets *ptrs, + struct write_point *wp, + struct bch_devs_list *devs_have, + u16 target, + unsigned erasure_code, + unsigned nr_replicas, + unsigned *nr_effective, + bool *have_cache, + enum alloc_reserve reserve, + unsigned flags, + struct closure *_cl) +{ + struct bch_devs_mask devs; + struct open_bucket *ob; + struct closure *cl = NULL; + unsigned i; + int ret; + + rcu_read_lock(); + devs = target_rw_devs(c, wp->type, target); + rcu_read_unlock(); + + /* Don't allocate from devices we already have pointers to: */ + for (i = 0; i < devs_have->nr; i++) + __clear_bit(devs_have->devs[i], devs.d); + + open_bucket_for_each(c, ptrs, ob, i) + __clear_bit(ob->ptr.dev, devs.d); + + if (erasure_code) { + get_buckets_from_writepoint(c, ptrs, wp, &devs, + nr_replicas, nr_effective, + have_cache, flags, true); + if (*nr_effective >= nr_replicas) + return 0; + + bucket_alloc_from_stripe(c, ptrs, wp, &devs, + target, erasure_code, + nr_replicas, nr_effective, + have_cache, flags); + if (*nr_effective >= nr_replicas) + return 0; + } + + get_buckets_from_writepoint(c, ptrs, wp, &devs, + nr_replicas, nr_effective, + have_cache, flags, false); + if (*nr_effective >= nr_replicas) + return 0; + + percpu_down_read(&c->mark_lock); + rcu_read_lock(); + +retry_blocking: + /* + * Try nonblocking first, so that if one device is full we'll try from + * other devices: + */ + ret = bch2_bucket_alloc_set(c, ptrs, &wp->stripe, &devs, + nr_replicas, nr_effective, have_cache, + reserve, flags, cl); + if (ret && ret != -EROFS && !cl && _cl) { + cl = _cl; + goto retry_blocking; + } + + rcu_read_unlock(); + percpu_up_read(&c->mark_lock); + + return ret; +} + +void bch2_open_buckets_stop_dev(struct bch_fs *c, struct bch_dev *ca, + struct open_buckets *obs, + enum bch_data_type data_type) +{ + struct open_buckets ptrs = { .nr = 0 }; + struct open_bucket *ob, *ob2; + unsigned i, j; + + open_bucket_for_each(c, obs, ob, i) { + bool drop = !ca || ob->ptr.dev == ca->dev_idx; + + if (!drop && ob->ec) { + mutex_lock(&ob->ec->lock); + open_bucket_for_each(c, &ob->ec->blocks, ob2, j) + drop |= ob2->ptr.dev == ca->dev_idx; + open_bucket_for_each(c, &ob->ec->parity, ob2, j) + drop |= ob2->ptr.dev == ca->dev_idx; + mutex_unlock(&ob->ec->lock); + } + + if (drop) + bch2_open_bucket_put(c, ob); + else + ob_push(c, &ptrs, ob); + } + + *obs = ptrs; +} + +void bch2_writepoint_stop(struct bch_fs *c, struct bch_dev *ca, + struct write_point *wp) +{ + mutex_lock(&wp->lock); + bch2_open_buckets_stop_dev(c, ca, &wp->ptrs, wp->type); + mutex_unlock(&wp->lock); +} + +static inline struct hlist_head *writepoint_hash(struct bch_fs *c, + unsigned long write_point) +{ + unsigned hash = + hash_long(write_point, ilog2(ARRAY_SIZE(c->write_points_hash))); + + return &c->write_points_hash[hash]; +} + +static struct write_point *__writepoint_find(struct hlist_head *head, + unsigned long write_point) +{ + struct write_point *wp; + + hlist_for_each_entry_rcu(wp, head, node) + if (wp->write_point == write_point) + return wp; + + return NULL; +} + +static inline bool too_many_writepoints(struct bch_fs *c, unsigned factor) +{ + u64 stranded = c->write_points_nr * c->bucket_size_max; + u64 free = bch2_fs_usage_read_short(c).free; + + return stranded * factor > free; +} + +static bool try_increase_writepoints(struct bch_fs *c) +{ + struct write_point *wp; + + if 
(c->write_points_nr == ARRAY_SIZE(c->write_points) || + too_many_writepoints(c, 32)) + return false; + + wp = c->write_points + c->write_points_nr++; + hlist_add_head_rcu(&wp->node, writepoint_hash(c, wp->write_point)); + return true; +} + +static bool try_decrease_writepoints(struct bch_fs *c, + unsigned old_nr) +{ + struct write_point *wp; + + mutex_lock(&c->write_points_hash_lock); + if (c->write_points_nr < old_nr) { + mutex_unlock(&c->write_points_hash_lock); + return true; + } + + if (c->write_points_nr == 1 || + !too_many_writepoints(c, 8)) { + mutex_unlock(&c->write_points_hash_lock); + return false; + } + + wp = c->write_points + --c->write_points_nr; + + hlist_del_rcu(&wp->node); + mutex_unlock(&c->write_points_hash_lock); + + bch2_writepoint_stop(c, NULL, wp); + return true; +} + +static struct write_point *writepoint_find(struct bch_fs *c, + unsigned long write_point) +{ + struct write_point *wp, *oldest; + struct hlist_head *head; + + if (!(write_point & 1UL)) { + wp = (struct write_point *) write_point; + mutex_lock(&wp->lock); + return wp; + } + + head = writepoint_hash(c, write_point); +restart_find: + wp = __writepoint_find(head, write_point); + if (wp) { +lock_wp: + mutex_lock(&wp->lock); + if (wp->write_point == write_point) + goto out; + mutex_unlock(&wp->lock); + goto restart_find; + } +restart_find_oldest: + oldest = NULL; + for (wp = c->write_points; + wp < c->write_points + c->write_points_nr; wp++) + if (!oldest || time_before64(wp->last_used, oldest->last_used)) + oldest = wp; + + mutex_lock(&oldest->lock); + mutex_lock(&c->write_points_hash_lock); + if (oldest >= c->write_points + c->write_points_nr || + try_increase_writepoints(c)) { + mutex_unlock(&c->write_points_hash_lock); + mutex_unlock(&oldest->lock); + goto restart_find_oldest; + } + + wp = __writepoint_find(head, write_point); + if (wp && wp != oldest) { + mutex_unlock(&c->write_points_hash_lock); + mutex_unlock(&oldest->lock); + goto lock_wp; + } + + wp = oldest; + hlist_del_rcu(&wp->node); + wp->write_point = write_point; + hlist_add_head_rcu(&wp->node, head); + mutex_unlock(&c->write_points_hash_lock); +out: + wp->last_used = sched_clock(); + return wp; +} + +/* + * Get us an open_bucket we can allocate from, return with it locked: + */ +struct write_point *bch2_alloc_sectors_start(struct bch_fs *c, + unsigned target, + unsigned erasure_code, + struct write_point_specifier write_point, + struct bch_devs_list *devs_have, + unsigned nr_replicas, + unsigned nr_replicas_required, + enum alloc_reserve reserve, + unsigned flags, + struct closure *cl) +{ + struct write_point *wp; + struct open_bucket *ob; + struct open_buckets ptrs; + unsigned nr_effective, write_points_nr; + unsigned ob_flags = 0; + bool have_cache; + int ret, i; + + if (!(flags & BCH_WRITE_ONLY_SPECIFIED_DEVS)) + ob_flags |= BUCKET_ALLOC_USE_DURABILITY; + + BUG_ON(!nr_replicas || !nr_replicas_required); +retry: + ptrs.nr = 0; + nr_effective = 0; + write_points_nr = c->write_points_nr; + have_cache = false; + + wp = writepoint_find(c, write_point.v); + + if (wp->type == BCH_DATA_USER) + ob_flags |= BUCKET_MAY_ALLOC_PARTIAL; + + /* metadata may not allocate on cache devices: */ + if (wp->type != BCH_DATA_USER) + have_cache = true; + + if (!target || (flags & BCH_WRITE_ONLY_SPECIFIED_DEVS)) { + ret = open_bucket_add_buckets(c, &ptrs, wp, devs_have, + target, erasure_code, + nr_replicas, &nr_effective, + &have_cache, reserve, + ob_flags, cl); + } else { + ret = open_bucket_add_buckets(c, &ptrs, wp, devs_have, + target, erasure_code, + 
nr_replicas, &nr_effective, + &have_cache, reserve, + ob_flags, NULL); + if (!ret) + goto alloc_done; + + ret = open_bucket_add_buckets(c, &ptrs, wp, devs_have, + 0, erasure_code, + nr_replicas, &nr_effective, + &have_cache, reserve, + ob_flags, cl); + } +alloc_done: + BUG_ON(!ret && nr_effective < nr_replicas); + + if (erasure_code && !ec_open_bucket(c, &ptrs)) + pr_debug("failed to get ec bucket: ret %u", ret); + + if (ret == -EROFS && + nr_effective >= nr_replicas_required) + ret = 0; + + if (ret) + goto err; + + /* Free buckets we didn't use: */ + open_bucket_for_each(c, &wp->ptrs, ob, i) + open_bucket_free_unused(c, ob, wp->type == BCH_DATA_USER); + + wp->ptrs = ptrs; + + wp->sectors_free = UINT_MAX; + + open_bucket_for_each(c, &wp->ptrs, ob, i) + wp->sectors_free = min(wp->sectors_free, ob->sectors_free); + + BUG_ON(!wp->sectors_free || wp->sectors_free == UINT_MAX); + + verify_not_stale(c, &wp->ptrs); + + return wp; +err: + open_bucket_for_each(c, &wp->ptrs, ob, i) + if (ptrs.nr < ARRAY_SIZE(ptrs.v)) + ob_push(c, &ptrs, ob); + else + open_bucket_free_unused(c, ob, + wp->type == BCH_DATA_USER); + wp->ptrs = ptrs; + + mutex_unlock(&wp->lock); + + if (ret == -ENOSPC && + try_decrease_writepoints(c, write_points_nr)) + goto retry; + + return ERR_PTR(ret); +} + +/* + * Append pointers to the space we just allocated to @k, and mark @sectors space + * as allocated out of @ob + */ +void bch2_alloc_sectors_append_ptrs(struct bch_fs *c, struct write_point *wp, + struct bkey_i *k, unsigned sectors) + +{ + struct open_bucket *ob; + unsigned i; + + BUG_ON(sectors > wp->sectors_free); + wp->sectors_free -= sectors; + + open_bucket_for_each(c, &wp->ptrs, ob, i) { + struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev); + struct bch_extent_ptr tmp = ob->ptr; + + tmp.cached = !ca->mi.durability && + wp->type == BCH_DATA_USER; + + tmp.offset += ca->mi.bucket_size - ob->sectors_free; + bch2_bkey_append_ptr(k, tmp); + + BUG_ON(sectors > ob->sectors_free); + ob->sectors_free -= sectors; + } +} + +/* + * Append pointers to the space we just allocated to @k, and mark @sectors space + * as allocated out of @ob + */ +void bch2_alloc_sectors_done(struct bch_fs *c, struct write_point *wp) +{ + struct open_buckets ptrs = { .nr = 0 }, keep = { .nr = 0 }; + struct open_bucket *ob; + unsigned i; + + open_bucket_for_each(c, &wp->ptrs, ob, i) + ob_push(c, !ob->sectors_free ? 
&ptrs : &keep, ob); + wp->ptrs = keep; + + mutex_unlock(&wp->lock); + + bch2_open_buckets_put(c, &ptrs); +} + +void bch2_fs_allocator_foreground_init(struct bch_fs *c) +{ + struct open_bucket *ob; + struct write_point *wp; + + mutex_init(&c->write_points_hash_lock); + c->write_points_nr = ARRAY_SIZE(c->write_points); + + /* open bucket 0 is a sentinal NULL: */ + spin_lock_init(&c->open_buckets[0].lock); + + for (ob = c->open_buckets + 1; + ob < c->open_buckets + ARRAY_SIZE(c->open_buckets); ob++) { + spin_lock_init(&ob->lock); + c->open_buckets_nr_free++; + + ob->freelist = c->open_buckets_freelist; + c->open_buckets_freelist = ob - c->open_buckets; + } + + writepoint_init(&c->btree_write_point, BCH_DATA_BTREE); + writepoint_init(&c->rebalance_write_point, BCH_DATA_USER); + + for (wp = c->write_points; + wp < c->write_points + c->write_points_nr; wp++) { + writepoint_init(wp, BCH_DATA_USER); + + wp->last_used = sched_clock(); + wp->write_point = (unsigned long) wp; + hlist_add_head_rcu(&wp->node, + writepoint_hash(c, wp->write_point)); + } +} diff --git a/fs/bcachefs/alloc_foreground.h b/fs/bcachefs/alloc_foreground.h new file mode 100644 index 000000000000..6d8ffb0cd06d --- /dev/null +++ b/fs/bcachefs/alloc_foreground.h @@ -0,0 +1,133 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_ALLOC_FOREGROUND_H +#define _BCACHEFS_ALLOC_FOREGROUND_H + +#include "bcachefs.h" +#include "alloc_types.h" + +#include <linux/hash.h> + +struct bkey; +struct bch_dev; +struct bch_fs; +struct bch_devs_List; + +struct dev_alloc_list { + unsigned nr; + u8 devs[BCH_SB_MEMBERS_MAX]; +}; + +struct dev_alloc_list bch2_dev_alloc_list(struct bch_fs *, + struct dev_stripe_state *, + struct bch_devs_mask *); +void bch2_dev_stripe_increment(struct bch_fs *, struct bch_dev *, + struct dev_stripe_state *); + +long bch2_bucket_alloc_new_fs(struct bch_dev *); + +struct open_bucket *bch2_bucket_alloc(struct bch_fs *, struct bch_dev *, + enum alloc_reserve, bool, + struct closure *); + +static inline void ob_push(struct bch_fs *c, struct open_buckets *obs, + struct open_bucket *ob) +{ + BUG_ON(obs->nr >= ARRAY_SIZE(obs->v)); + + obs->v[obs->nr++] = ob - c->open_buckets; +} + +#define open_bucket_for_each(_c, _obs, _ob, _i) \ + for ((_i) = 0; \ + (_i) < (_obs)->nr && \ + ((_ob) = (_c)->open_buckets + (_obs)->v[_i], true); \ + (_i)++) + +static inline struct open_bucket *ec_open_bucket(struct bch_fs *c, + struct open_buckets *obs) +{ + struct open_bucket *ob; + unsigned i; + + open_bucket_for_each(c, obs, ob, i) + if (ob->ec) + return ob; + + return NULL; +} + +void bch2_open_bucket_write_error(struct bch_fs *, + struct open_buckets *, unsigned); + +void __bch2_open_bucket_put(struct bch_fs *, struct open_bucket *); + +static inline void bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob) +{ + if (atomic_dec_and_test(&ob->pin)) + __bch2_open_bucket_put(c, ob); +} + +static inline void bch2_open_buckets_put(struct bch_fs *c, + struct open_buckets *ptrs) +{ + struct open_bucket *ob; + unsigned i; + + open_bucket_for_each(c, ptrs, ob, i) + bch2_open_bucket_put(c, ob); + ptrs->nr = 0; +} + +static inline void bch2_open_bucket_get(struct bch_fs *c, + struct write_point *wp, + struct open_buckets *ptrs) +{ + struct open_bucket *ob; + unsigned i; + + open_bucket_for_each(c, &wp->ptrs, ob, i) { + ob->type = wp->type; + atomic_inc(&ob->pin); + ob_push(c, ptrs, ob); + } +} + +struct write_point *bch2_alloc_sectors_start(struct bch_fs *, + unsigned, unsigned, + struct write_point_specifier, + struct bch_devs_list *, 
+ unsigned, unsigned, + enum alloc_reserve, + unsigned, + struct closure *); + +void bch2_alloc_sectors_append_ptrs(struct bch_fs *, struct write_point *, + struct bkey_i *, unsigned); +void bch2_alloc_sectors_done(struct bch_fs *, struct write_point *); + +void bch2_open_buckets_stop_dev(struct bch_fs *, struct bch_dev *, + struct open_buckets *, enum bch_data_type); + +void bch2_writepoint_stop(struct bch_fs *, struct bch_dev *, + struct write_point *); + +static inline struct write_point_specifier writepoint_hashed(unsigned long v) +{ + return (struct write_point_specifier) { .v = v | 1 }; +} + +static inline struct write_point_specifier writepoint_ptr(struct write_point *wp) +{ + return (struct write_point_specifier) { .v = (unsigned long) wp }; +} + +static inline void writepoint_init(struct write_point *wp, + enum bch_data_type type) +{ + mutex_init(&wp->lock); + wp->type = type; +} + +void bch2_fs_allocator_foreground_init(struct bch_fs *); + +#endif /* _BCACHEFS_ALLOC_FOREGROUND_H */ diff --git a/fs/bcachefs/alloc_types.h b/fs/bcachefs/alloc_types.h index 8a71a37637de..832568dc9551 100644 --- a/fs/bcachefs/alloc_types.h +++ b/fs/bcachefs/alloc_types.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _BCACHEFS_ALLOC_TYPES_H #define _BCACHEFS_ALLOC_TYPES_H @@ -7,6 +8,8 @@ #include "clock_types.h" #include "fifo.h" +struct ec_bucket_buf; + /* There's two of these clocks, one for reads and one for writes: */ struct bucket_clock { /* @@ -45,16 +48,32 @@ typedef FIFO(long) alloc_fifo; /* Enough for 16 cache devices, 2 tiers and some left over for pipelining */ #define OPEN_BUCKETS_COUNT 256 -#define WRITE_POINT_COUNT 32 + +#define WRITE_POINT_HASH_NR 32 +#define WRITE_POINT_MAX 32 struct open_bucket { spinlock_t lock; atomic_t pin; u8 freelist; - bool valid; - bool on_partial_list; + u8 ec_idx; + u8 type; + unsigned valid:1; + unsigned on_partial_list:1; unsigned sectors_free; struct bch_extent_ptr ptr; + struct ec_stripe_new *ec; +}; + +#define OPEN_BUCKET_LIST_MAX 15 + +struct open_buckets { + u8 nr; + u8 v[OPEN_BUCKET_LIST_MAX]; +}; + +struct dev_stripe_state { + u64 next_alloc[BCH_SB_MEMBERS_MAX]; }; struct write_point { @@ -63,15 +82,13 @@ struct write_point { u64 last_used; unsigned long write_point; enum bch_data_type type; - - u8 nr_ptrs; - u8 first_ptr; + bool is_ec; /* calculated based on how many pointers we're actually going to use: */ unsigned sectors_free; - struct open_bucket *ptrs[BCH_REPLICAS_MAX * 2]; - u64 next_alloc[BCH_SB_MEMBERS_MAX]; + struct open_buckets ptrs; + struct dev_stripe_state stripe; }; struct write_point_specifier { diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 6beff8810c09..907d1b605cf4 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _BCACHEFS_H #define _BCACHEFS_H @@ -183,6 +184,7 @@ #include <linux/closure.h> #include <linux/kobject.h> #include <linux/list.h> +#include <linux/math64.h> #include <linux/mutex.h> #include <linux/percpu-refcount.h> #include <linux/percpu-rwsem.h> @@ -201,7 +203,7 @@ #include <linux/dynamic_fault.h> -#define bch2_fs_init_fault(name) \ +#define bch2_fs_init_fault(name) \ dynamic_fault("bcachefs:bch_fs_init:" name) #define bch2_meta_read_fault(name) \ dynamic_fault("bcachefs:meta:read:" name) @@ -220,18 +222,22 @@ printk(KERN_NOTICE bch2_fmt(c, fmt), ##__VA_ARGS__) #define bch_warn(c, fmt, ...) \ printk(KERN_WARNING bch2_fmt(c, fmt), ##__VA_ARGS__) +#define bch_warn_ratelimited(c, fmt, ...) 
\ + printk_ratelimited(KERN_WARNING bch2_fmt(c, fmt), ##__VA_ARGS__) #define bch_err(c, fmt, ...) \ printk(KERN_ERR bch2_fmt(c, fmt), ##__VA_ARGS__) +#define bch_err_ratelimited(c, fmt, ...) \ + printk_ratelimited(KERN_ERR bch2_fmt(c, fmt), ##__VA_ARGS__) #define bch_verbose(c, fmt, ...) \ do { \ - if ((c)->opts.verbose_recovery) \ + if ((c)->opts.verbose) \ bch_info(c, fmt, ##__VA_ARGS__); \ } while (0) #define pr_verbose_init(opts, fmt, ...) \ do { \ - if (opt_get(opts, verbose_init)) \ + if (opt_get(opts, verbose)) \ pr_info(fmt, ##__VA_ARGS__); \ } while (0) @@ -252,6 +258,8 @@ do { \ BCH_DEBUG_PARAM(expensive_debug_checks, \ "Enables various runtime debugging checks that " \ "significantly affect performance") \ + BCH_DEBUG_PARAM(debug_check_iterators, \ + "Enables extra verification for btree iterators") \ BCH_DEBUG_PARAM(debug_check_bkeys, \ "Run bkey_debugcheck (primarily checking GC/allocation "\ "information) when iterating over keys") \ @@ -259,6 +267,25 @@ do { \ "Reread btree nodes at various points to verify the " \ "mergesort in the read path against modifications " \ "done in memory") \ + BCH_DEBUG_PARAM(journal_seq_verify, \ + "Store the journal sequence number in the version " \ + "number of every btree key, and verify that btree " \ + "update ordering is preserved during recovery") \ + BCH_DEBUG_PARAM(inject_invalid_keys, \ + "Store the journal sequence number in the version " \ + "number of every btree key, and verify that btree " \ + "update ordering is preserved during recovery") \ + BCH_DEBUG_PARAM(test_alloc_startup, \ + "Force allocator startup to use the slowpath where it" \ + "can't find enough free buckets without invalidating" \ + "cached data") \ + BCH_DEBUG_PARAM(force_reconstruct_read, \ + "Force reads to use the reconstruct path, when reading" \ + "from erasure coded extents") \ + BCH_DEBUG_PARAM(test_restart_gc, \ + "Test restarting mark and sweep gc when bucket gens change")\ + BCH_DEBUG_PARAM(test_reconstruct_alloc, \ + "Test reconstructing the alloc btree") #define BCH_DEBUG_PARAMS_ALL() BCH_DEBUG_PARAMS_ALWAYS() BCH_DEBUG_PARAMS_DEBUG() @@ -270,10 +297,11 @@ do { \ #define BCH_TIME_STATS() \ x(btree_node_mem_alloc) \ + x(btree_node_split) \ + x(btree_node_sort) \ + x(btree_node_read) \ x(btree_gc) \ - x(btree_split) \ - x(btree_sort) \ - x(btree_read) \ + x(btree_update) \ x(btree_lock_contended_read) \ x(btree_lock_contended_intent) \ x(btree_lock_contended_write) \ @@ -282,8 +310,10 @@ do { \ x(data_promote) \ x(journal_write) \ x(journal_delay) \ - x(journal_blocked) \ - x(journal_flush_seq) + x(journal_flush_seq) \ + x(blocked_journal) \ + x(blocked_allocate) \ + x(blocked_allocate_open_bucket) enum bch_time_stats { #define x(name) BCH_TIME_##name, @@ -296,35 +326,42 @@ enum bch_time_stats { #include "btree_types.h" #include "buckets_types.h" #include "clock_types.h" +#include "ec_types.h" #include "journal_types.h" #include "keylist_types.h" #include "quota_types.h" #include "rebalance_types.h" +#include "replicas_types.h" #include "super_types.h" -/* - * Number of nodes we might have to allocate in a worst case btree split - * operation - we split all the way up to the root, then allocate a new root. 
- */ -#define btree_reserve_required_nodes(depth) (((depth) + 1) * 2 + 1) - /* Number of nodes btree coalesce will try to coalesce at once */ #define GC_MERGE_NODES 4U /* Maximum number of nodes we might need to allocate atomically: */ -#define BTREE_RESERVE_MAX \ - (btree_reserve_required_nodes(BTREE_MAX_DEPTH) + GC_MERGE_NODES) +#define BTREE_RESERVE_MAX (BTREE_MAX_DEPTH + (BTREE_MAX_DEPTH - 1)) /* Size of the freelist we allocate btree nodes from: */ -#define BTREE_NODE_RESERVE (BTREE_RESERVE_MAX * 4) +#define BTREE_NODE_RESERVE BTREE_RESERVE_MAX + +#define BTREE_NODE_OPEN_BUCKET_RESERVE (BTREE_RESERVE_MAX * BCH_REPLICAS_MAX) struct btree; enum gc_phase { - GC_PHASE_SB = BTREE_ID_NR + 1, + GC_PHASE_NOT_RUNNING, + GC_PHASE_START, + GC_PHASE_SB, + + GC_PHASE_BTREE_EC, + GC_PHASE_BTREE_EXTENTS, + GC_PHASE_BTREE_INODES, + GC_PHASE_BTREE_DIRENTS, + GC_PHASE_BTREE_XATTRS, + GC_PHASE_BTREE_ALLOC, + GC_PHASE_BTREE_QUOTAS, + GC_PHASE_PENDING_DELETE, GC_PHASE_ALLOC, - GC_PHASE_DONE }; struct gc_pos { @@ -356,6 +393,7 @@ struct bch_dev { char name[BDEVNAME_SIZE]; struct bch_sb_handle disk_sb; + struct bch_sb *sb_read_scratch; int sb_write_error; struct bch_devs_mask self; @@ -365,18 +403,16 @@ struct bch_dev { /* * Buckets: - * Per-bucket arrays are protected by c->usage_lock, bucket_lock and + * Per-bucket arrays are protected by c->mark_lock, bucket_lock and * gc_lock, for device resize - holding any is sufficient for access: * Or rcu_read_lock(), but only for ptr_stale(): */ - struct bucket_array __rcu *buckets; - unsigned long *buckets_dirty; - /* most out of date gen in the btree */ - u8 *oldest_gens; + struct bucket_array __rcu *buckets[2]; + unsigned long *buckets_nouse; + unsigned long *buckets_written; struct rw_semaphore bucket_lock; - struct bch_dev_usage __percpu *usage_percpu; - struct bch_dev_usage usage_cached; + struct bch_dev_usage __percpu *usage[2]; /* Allocator: */ struct task_struct __rcu *alloc_thread; @@ -393,7 +429,6 @@ struct bch_dev { alloc_fifo free[RESERVE_NR]; alloc_fifo free_inc; spinlock_t freelist_lock; - size_t nr_invalidated; u8 open_buckets_partial[OPEN_BUCKETS_COUNT]; unsigned open_buckets_partial_nr; @@ -403,12 +438,19 @@ struct bch_dev { /* last calculated minimum prio */ u16 max_last_bucket_io[2]; - atomic_long_t saturated_count; size_t inc_gen_needs_gc; size_t inc_gen_really_needs_gc; - u64 allocator_journal_seq_flush; - bool allocator_invalidating_data; - bool allocator_blocked; + + /* + * XXX: this should be an enum for allocator state, so as to include + * error state + */ + enum { + ALLOCATOR_STOPPED, + ALLOCATOR_RUNNING, + ALLOCATOR_BLOCKED, + ALLOCATOR_BLOCKED_FULL, + } allocator_state; alloc_heap alloc_heap; @@ -417,6 +459,7 @@ struct bch_dev { copygc_heap copygc_heap; struct bch_pd_controller copygc_pd; struct write_point copygc_write_point; + u64 copygc_threshold; atomic64_t rebalance_work; @@ -435,33 +478,27 @@ struct bch_dev { struct io_count __percpu *io_done; }; -/* - * Flag bits for what phase of startup/shutdown the cache set is at, how we're - * shutting down, etc.: - * - * BCH_FS_UNREGISTERING means we're not just shutting down, we're detaching - * all the backing devices first (their cached data gets invalidated, and they - * won't automatically reattach). 
- */ enum { /* startup: */ BCH_FS_ALLOC_READ_DONE, BCH_FS_ALLOCATOR_STARTED, + BCH_FS_ALLOCATOR_RUNNING, BCH_FS_INITIAL_GC_DONE, BCH_FS_FSCK_DONE, BCH_FS_STARTED, + BCH_FS_RW, /* shutdown: */ + BCH_FS_STOPPING, BCH_FS_EMERGENCY_RO, BCH_FS_WRITE_DISABLE_COMPLETE, /* errors: */ BCH_FS_ERROR, - BCH_FS_GC_FAILURE, + BCH_FS_ERRORS_FIXED, /* misc: */ BCH_FS_BDEV_MOUNTED, - BCH_FS_FSCK_FIXED_ERRORS, BCH_FS_FIXED_GENS, BCH_FS_REBUILD_REPLICAS, BCH_FS_HOLD_BTREE_WRITES, @@ -474,11 +511,17 @@ struct btree_debug { struct dentry *failed; }; -enum bch_fs_state { - BCH_FS_STARTING = 0, - BCH_FS_STOPPING, - BCH_FS_RO, - BCH_FS_RW, +struct bch_fs_pcpu { + u64 sectors_available; +}; + +struct journal_seq_blacklist_table { + size_t nr; + struct journal_seq_blacklist_table_entry { + u64 start; + u64 end; + bool dirty; + } entries[0]; }; struct bch_fs { @@ -498,7 +541,6 @@ struct bch_fs { /* ro/rw, add/remove devices: */ struct mutex state_lock; - enum bch_fs_state state; /* Counts outstanding writes, for clean transition to read-only */ struct percpu_ref writes; @@ -506,10 +548,12 @@ struct bch_fs { struct bch_dev __rcu *devs[BCH_SB_MEMBERS_MAX]; - struct bch_replicas_cpu __rcu *replicas; - struct bch_replicas_cpu __rcu *replicas_gc; + struct bch_replicas_cpu replicas; + struct bch_replicas_cpu replicas_gc; struct mutex replicas_gc_lock; + struct journal_entry_res replicas_journal_res; + struct bch_disk_groups_cpu __rcu *disk_groups; struct bch_opts opts; @@ -519,6 +563,7 @@ struct bch_fs { uuid_le uuid; uuid_le user_uuid; + u16 version; u16 encoded_extent_max; u8 nr_devices; @@ -530,6 +575,7 @@ struct bch_fs { u32 time_base_hi; u32 time_precision; u64 features; + u64 compat; } sb; struct bch_sb_handle disk_sb; @@ -568,9 +614,12 @@ struct bch_fs { struct mutex btree_interior_update_lock; struct closure_waitlist btree_interior_update_wait; + mempool_t btree_iters_pool; + struct workqueue_struct *wq; /* copygc needs its own workqueue for index updates.. 
*/ struct workqueue_struct *copygc_wq; + struct workqueue_struct *journal_reclaim_wq; /* ALLOCATION */ struct delayed_work pd_controllers_update; @@ -586,14 +635,22 @@ struct bch_fs { * and forces them to be revalidated */ u32 capacity_gen; + unsigned bucket_size_max; atomic64_t sectors_available; - struct bch_fs_usage __percpu *usage_percpu; - struct bch_fs_usage usage_cached; - struct percpu_rw_semaphore usage_lock; + struct bch_fs_pcpu __percpu *pcpu; - struct closure_waitlist freelist_wait; + struct percpu_rw_semaphore mark_lock; + + seqcount_t usage_lock; + struct bch_fs_usage *usage_base; + struct bch_fs_usage __percpu *usage[2]; + struct bch_fs_usage __percpu *usage_gc; + + /* single element mempool: */ + struct mutex usage_scratch_lock; + struct bch_fs_usage *usage_scratch; /* * When we invalidate buckets, we use both the priority and the amount @@ -605,8 +662,16 @@ struct bch_fs { struct io_clock io_clock[2]; + /* JOURNAL SEQ BLACKLIST */ + struct journal_seq_blacklist_table * + journal_seq_blacklist_table; + struct work_struct journal_seq_blacklist_gc_work; + /* ALLOCATOR */ spinlock_t freelist_lock; + struct closure_waitlist freelist_wait; + u64 blocked_allocate; + u64 blocked_allocate_open_bucket; u8 open_buckets_freelist; u8 open_buckets_nr_free; struct closure_waitlist open_buckets_wait; @@ -615,9 +680,10 @@ struct bch_fs { struct write_point btree_write_point; struct write_point rebalance_write_point; - struct write_point write_points[WRITE_POINT_COUNT]; - struct hlist_head write_points_hash[WRITE_POINT_COUNT]; + struct write_point write_points[WRITE_POINT_MAX]; + struct hlist_head write_points_hash[WRITE_POINT_HASH_NR]; struct mutex write_points_hash_lock; + unsigned write_points_nr; /* GARBAGE COLLECTION */ struct task_struct *gc_thread; @@ -630,9 +696,6 @@ struct bch_fs { * * gc_cur_phase is a superset of btree_ids (BTREE_ID_EXTENTS etc.) * - * gc_cur_phase == GC_PHASE_DONE indicates that gc is finished/not - * currently running, and gc marks are currently valid - * * Protected by gc_pos_lock. Only written to by GC thread, so GC thread * can read without a lock. 
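 *
 * (Editor's illustration, not text from the original comment - the exact
 * helper is an assumption here: writers that touch bucket marks compare
 * the position they are updating against this, roughly
 *
 *	if (gc has already walked past gc_pos_alloc(c, ob))
 *		also apply the update to the GC-side bucket marks
 *
 * so that the in-progress mark-and-sweep results stay consistent.)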
*/ @@ -659,7 +722,7 @@ struct bch_fs { ZSTD_parameters zstd_params; struct crypto_shash *sha256; - struct crypto_skcipher *chacha20; + struct crypto_sync_skcipher *chacha20; struct crypto_shash *poly1305; atomic64_t key_version; @@ -667,6 +730,22 @@ struct bch_fs { /* REBALANCE */ struct bch_fs_rebalance rebalance; + /* STRIPES: */ + GENRADIX(struct stripe) stripes[2]; + struct mutex ec_stripe_create_lock; + + ec_stripes_heap ec_stripes_heap; + spinlock_t ec_stripes_heap_lock; + + /* ERASURE CODING */ + struct list_head ec_new_stripe_list; + struct mutex ec_new_stripe_lock; + + struct bio_set ec_bioset; + + struct work_struct ec_stripe_delete_work; + struct llist_head ec_stripe_delete_list; + /* VFS IO PATH - fs-io.c */ struct bio_set writepage_bioset; struct bio_set dio_write_bioset; @@ -681,9 +760,6 @@ struct bch_fs { struct mutex fsck_error_lock; bool fsck_alloc_err; - /* FILESYSTEM */ - atomic_long_t nr_inodes; - /* QUOTAS */ struct bch_memquota_type quotas[QTYP_NR]; @@ -708,7 +784,7 @@ struct bch_fs { struct journal journal; - unsigned bucket_journal_seq; + u64 last_bucket_seq_cleanup; /* The rest of this all shows up in sysfs */ atomic_long_t read_realloc_races; @@ -734,11 +810,6 @@ static inline void bch2_set_ra_pages(struct bch_fs *c, unsigned ra_pages) #endif } -static inline bool bch2_fs_running(struct bch_fs *c) -{ - return c->state == BCH_FS_RO || c->state == BCH_FS_RW; -} - static inline unsigned bucket_bytes(const struct bch_dev *ca) { return ca->mi.bucket_size << 9; @@ -749,4 +820,32 @@ static inline unsigned block_bytes(const struct bch_fs *c) return c->opts.block_size << 9; } +static inline struct timespec64 bch2_time_to_timespec(struct bch_fs *c, u64 time) +{ + return ns_to_timespec64(time * c->sb.time_precision + c->sb.time_base_lo); +} + +static inline s64 timespec_to_bch2_time(struct bch_fs *c, struct timespec64 ts) +{ + s64 ns = timespec64_to_ns(&ts) - c->sb.time_base_lo; + + if (c->sb.time_precision == 1) + return ns; + + return div_s64(ns, c->sb.time_precision); +} + +static inline s64 bch2_current_time(struct bch_fs *c) +{ + struct timespec64 now; + + ktime_get_coarse_real_ts64(&now); + return timespec_to_bch2_time(c, now); +} + +static inline bool bch2_dev_exists2(const struct bch_fs *c, unsigned dev) +{ + return dev < c->sb.nr_devices && c->devs[dev]; +} + #endif /* _BCACHEFS_H */ diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index ab8b944634e8..362f9bc9b82c 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _BCACHEFS_FORMAT_H #define _BCACHEFS_FORMAT_H @@ -73,6 +74,7 @@ #include <asm/types.h> #include <asm/byteorder.h> +#include <linux/kernel.h> #include <linux/uuid.h> #define LE_BITMASK(_bits, name, type, field, offset, end) \ @@ -233,6 +235,9 @@ struct bkey_packed { } __attribute__((packed, aligned(8))); #define BKEY_U64s (sizeof(struct bkey) / sizeof(__u64)) +#define BKEY_U64s_MAX U8_MAX +#define BKEY_VAL_U64s_MAX (BKEY_U64s_MAX - BKEY_U64s) + #define KEY_PACKED_BITS_START 24 #define KEY_FORMAT_LOCAL_BTREE 0 @@ -299,15 +304,6 @@ static inline void bkey_init(struct bkey *k) #define __BKEY_PADDED(key, pad) \ struct { struct bkey_i key; __u64 key ## _pad[pad]; } -#define BKEY_VAL_TYPE(name, nr) \ -struct bkey_i_##name { \ - union { \ - struct bkey k; \ - struct bkey_i k_i; \ - }; \ - struct bch_##name v; \ -} - /* * - DELETED keys are used internally to mark keys that should be ignored but * override keys in composition order. 
Their version number is ignored. @@ -322,19 +318,37 @@ struct bkey_i_##name { \ * by new writes or cluster-wide GC. Node repair can also overwrite them with * the same or a more recent version number, but not with an older version * number. + * + * - WHITEOUT: for hash table btrees */ -#define KEY_TYPE_DELETED 0 -#define KEY_TYPE_DISCARD 1 -#define KEY_TYPE_ERROR 2 -#define KEY_TYPE_COOKIE 3 -#define KEY_TYPE_PERSISTENT_DISCARD 4 -#define KEY_TYPE_GENERIC_NR 128 +#define BCH_BKEY_TYPES() \ + x(deleted, 0) \ + x(discard, 1) \ + x(error, 2) \ + x(cookie, 3) \ + x(whiteout, 4) \ + x(btree_ptr, 5) \ + x(extent, 6) \ + x(reservation, 7) \ + x(inode, 8) \ + x(inode_generation, 9) \ + x(dirent, 10) \ + x(xattr, 11) \ + x(alloc, 12) \ + x(quota, 13) \ + x(stripe, 14) + +enum bch_bkey_type { +#define x(name, nr) KEY_TYPE_##name = nr, + BCH_BKEY_TYPES() +#undef x + KEY_TYPE_MAX, +}; struct bch_cookie { struct bch_val v; __le64 cookie; }; -BKEY_VAL_TYPE(cookie, KEY_TYPE_COOKIE); /* Extents */ @@ -426,6 +440,16 @@ enum bch_csum_type { BCH_CSUM_NR = 7, }; +static const unsigned bch_crc_bytes[] = { + [BCH_CSUM_NONE] = 0, + [BCH_CSUM_CRC32C_NONZERO] = 4, + [BCH_CSUM_CRC32C] = 4, + [BCH_CSUM_CRC64_NONZERO] = 8, + [BCH_CSUM_CRC64] = 8, + [BCH_CSUM_CHACHA20_POLY1305_80] = 10, + [BCH_CSUM_CHACHA20_POLY1305_128] = 16, +}; + static inline _Bool bch2_csum_type_is_encryption(enum bch_csum_type type) { switch (type) { @@ -446,15 +470,20 @@ enum bch_compression_type { BCH_COMPRESSION_NR = 5, }; +#define BCH_EXTENT_ENTRY_TYPES() \ + x(ptr, 0) \ + x(crc32, 1) \ + x(crc64, 2) \ + x(crc128, 3) \ + x(stripe_ptr, 4) +#define BCH_EXTENT_ENTRY_MAX 5 + enum bch_extent_entry_type { - BCH_EXTENT_ENTRY_ptr = 0, - BCH_EXTENT_ENTRY_crc32 = 1, - BCH_EXTENT_ENTRY_crc64 = 2, - BCH_EXTENT_ENTRY_crc128 = 3, +#define x(f, n) BCH_EXTENT_ENTRY_##f = n, + BCH_EXTENT_ENTRY_TYPES() +#undef x }; -#define BCH_EXTENT_ENTRY_MAX 4 - /* Compressed/uncompressed size are stored biased by 1: */ struct bch_extent_crc32 { #if defined(__LITTLE_ENDIAN_BITFIELD) @@ -538,7 +567,7 @@ struct bch_extent_ptr { #if defined(__LITTLE_ENDIAN_BITFIELD) __u64 type:1, cached:1, - erasure_coded:1, + unused:1, reservation:1, offset:44, /* 8 petabytes */ dev:8, @@ -548,23 +577,35 @@ struct bch_extent_ptr { dev:8, offset:44, reservation:1, - erasure_coded:1, + unused:1, cached:1, type:1; #endif } __attribute__((packed, aligned(8))); -struct bch_extent_reservation { +struct bch_extent_stripe_ptr { #if defined(__LITTLE_ENDIAN_BITFIELD) __u64 type:5, - unused:23, + block:8, + idx:51; +#elif defined (__BIG_ENDIAN_BITFIELD) + __u64 idx:51, + block:8, + type:5; +#endif +}; + +struct bch_extent_reservation { +#if defined(__LITTLE_ENDIAN_BITFIELD) + __u64 type:6, + unused:22, replicas:4, generation:32; #elif defined (__BIG_ENDIAN_BITFIELD) __u64 generation:32, replicas:4, - unused:23, - type:5; + unused:22, + type:6; #endif }; @@ -579,27 +620,18 @@ union bch_extent_entry { #else #error edit for your odd byteorder. 
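/*
 * (Editorial note: with BCH_EXTENT_ENTRY_TYPES() as defined earlier in
 * this file, the x() expansion just below is equivalent to declaring
 *
 *	struct bch_extent_ptr		ptr;
 *	struct bch_extent_crc32		crc32;
 *	struct bch_extent_crc64		crc64;
 *	struct bch_extent_crc128	crc128;
 *	struct bch_extent_stripe_ptr	stripe_ptr;
 *
 * as members of this union.)
 */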
#endif - struct bch_extent_crc32 crc32; - struct bch_extent_crc64 crc64; - struct bch_extent_crc128 crc128; - struct bch_extent_ptr ptr; -}; -enum { - BCH_EXTENT = 128, +#define x(f, n) struct bch_extent_##f f; + BCH_EXTENT_ENTRY_TYPES() +#undef x +}; - /* - * This is kind of a hack, we're overloading the type for a boolean that - * really should be part of the value - BCH_EXTENT and BCH_EXTENT_CACHED - * have the same value type: - */ - BCH_EXTENT_CACHED = 129, +struct bch_btree_ptr { + struct bch_val v; - /* - * Persistent reservation: - */ - BCH_RESERVATION = 130, -}; + struct bch_extent_ptr start[0]; + __u64 _data[0]; +} __attribute__((packed, aligned(8))); struct bch_extent { struct bch_val v; @@ -607,7 +639,6 @@ struct bch_extent { union bch_extent_entry start[0]; __u64 _data[0]; } __attribute__((packed, aligned(8))); -BKEY_VAL_TYPE(extent, BCH_EXTENT); struct bch_reservation { struct bch_val v; @@ -616,7 +647,6 @@ struct bch_reservation { __u8 nr_replicas; __u8 pad[3]; } __attribute__((packed, aligned(8))); -BKEY_VAL_TYPE(reservation, BCH_RESERVATION); /* Maximum size (in u64s) a single pointer could be: */ #define BKEY_EXTENT_PTR_U64s_MAX\ @@ -644,12 +674,6 @@ BKEY_VAL_TYPE(reservation, BCH_RESERVATION); #define BCACHEFS_ROOT_INO 4096 -enum bch_inode_types { - BCH_INODE_FS = 128, - BCH_INODE_BLOCKDEV = 129, - BCH_INODE_GENERATION = 130, -}; - struct bch_inode { struct bch_val v; @@ -658,7 +682,6 @@ struct bch_inode { __le16 bi_mode; __u8 fields[0]; } __attribute__((packed, aligned(8))); -BKEY_VAL_TYPE(inode, BCH_INODE_FS); struct bch_inode_generation { struct bch_val v; @@ -666,38 +689,49 @@ struct bch_inode_generation { __le32 bi_generation; __le32 pad; } __attribute__((packed, aligned(8))); -BKEY_VAL_TYPE(inode_generation, BCH_INODE_GENERATION); - -#define BCH_INODE_FIELDS() \ - BCH_INODE_FIELD(bi_atime, 64) \ - BCH_INODE_FIELD(bi_ctime, 64) \ - BCH_INODE_FIELD(bi_mtime, 64) \ - BCH_INODE_FIELD(bi_otime, 64) \ - BCH_INODE_FIELD(bi_size, 64) \ - BCH_INODE_FIELD(bi_sectors, 64) \ - BCH_INODE_FIELD(bi_uid, 32) \ - BCH_INODE_FIELD(bi_gid, 32) \ - BCH_INODE_FIELD(bi_nlink, 32) \ - BCH_INODE_FIELD(bi_generation, 32) \ - BCH_INODE_FIELD(bi_dev, 32) \ - BCH_INODE_FIELD(bi_data_checksum, 8) \ - BCH_INODE_FIELD(bi_compression, 8) \ - BCH_INODE_FIELD(bi_project, 32) \ - BCH_INODE_FIELD(bi_background_compression, 8) \ - BCH_INODE_FIELD(bi_data_replicas, 8) \ - BCH_INODE_FIELD(bi_promote_target, 16) \ - BCH_INODE_FIELD(bi_foreground_target, 16) \ - BCH_INODE_FIELD(bi_background_target, 16) - -#define BCH_INODE_FIELDS_INHERIT() \ - BCH_INODE_FIELD(bi_data_checksum) \ - BCH_INODE_FIELD(bi_compression) \ - BCH_INODE_FIELD(bi_project) \ - BCH_INODE_FIELD(bi_background_compression) \ - BCH_INODE_FIELD(bi_data_replicas) \ - BCH_INODE_FIELD(bi_promote_target) \ - BCH_INODE_FIELD(bi_foreground_target) \ - BCH_INODE_FIELD(bi_background_target) + +#define BCH_INODE_FIELDS() \ + x(bi_atime, 64) \ + x(bi_ctime, 64) \ + x(bi_mtime, 64) \ + x(bi_otime, 64) \ + x(bi_size, 64) \ + x(bi_sectors, 64) \ + x(bi_uid, 32) \ + x(bi_gid, 32) \ + x(bi_nlink, 32) \ + x(bi_generation, 32) \ + x(bi_dev, 32) \ + x(bi_data_checksum, 8) \ + x(bi_compression, 8) \ + x(bi_project, 32) \ + x(bi_background_compression, 8) \ + x(bi_data_replicas, 8) \ + x(bi_promote_target, 16) \ + x(bi_foreground_target, 16) \ + x(bi_background_target, 16) \ + x(bi_erasure_code, 16) \ + x(bi_fields_set, 16) + +/* subset of BCH_INODE_FIELDS */ +#define BCH_INODE_OPTS() \ + x(data_checksum, 8) \ + x(compression, 8) \ + x(project, 32) \ + 
x(background_compression, 8) \ + x(data_replicas, 8) \ + x(promote_target, 16) \ + x(foreground_target, 16) \ + x(background_target, 16) \ + x(erasure_code, 16) + +enum inode_opt_id { +#define x(name, ...) \ + Inode_opt_##name, + BCH_INODE_OPTS() +#undef x + Inode_opt_nr, +}; enum { /* @@ -712,9 +746,7 @@ enum { __BCH_INODE_I_SIZE_DIRTY= 5, __BCH_INODE_I_SECTORS_DIRTY= 6, - - /* not implemented yet: */ - __BCH_INODE_HAS_XATTRS = 7, /* has xattrs in xattr btree */ + __BCH_INODE_UNLINKED = 7, /* bits 20+ reserved for packed fields below: */ }; @@ -726,29 +758,11 @@ enum { #define BCH_INODE_NOATIME (1 << __BCH_INODE_NOATIME) #define BCH_INODE_I_SIZE_DIRTY (1 << __BCH_INODE_I_SIZE_DIRTY) #define BCH_INODE_I_SECTORS_DIRTY (1 << __BCH_INODE_I_SECTORS_DIRTY) -#define BCH_INODE_HAS_XATTRS (1 << __BCH_INODE_HAS_XATTRS) +#define BCH_INODE_UNLINKED (1 << __BCH_INODE_UNLINKED) LE32_BITMASK(INODE_STR_HASH, struct bch_inode, bi_flags, 20, 24); LE32_BITMASK(INODE_NR_FIELDS, struct bch_inode, bi_flags, 24, 32); -struct bch_inode_blockdev { - struct bch_val v; - - __le64 i_size; - __le64 i_flags; - - /* Seconds: */ - __le64 i_ctime; - __le64 i_mtime; - - uuid_le i_uuid; - __u8 i_label[32]; -} __attribute__((packed, aligned(8))); -BKEY_VAL_TYPE(inode_blockdev, BCH_INODE_BLOCKDEV); - -/* Thin provisioned volume, or cache for another block device? */ -LE64_BITMASK(CACHED_DEV, struct bch_inode_blockdev, i_flags, 0, 1) - /* Dirents */ /* @@ -762,11 +776,6 @@ LE64_BITMASK(CACHED_DEV, struct bch_inode_blockdev, i_flags, 0, 1) * collision: */ -enum { - BCH_DIRENT = 128, - BCH_DIRENT_WHITEOUT = 129, -}; - struct bch_dirent { struct bch_val v; @@ -781,20 +790,19 @@ struct bch_dirent { __u8 d_name[]; } __attribute__((packed, aligned(8))); -BKEY_VAL_TYPE(dirent, BCH_DIRENT); -/* Xattrs */ +#define BCH_NAME_MAX (U8_MAX * sizeof(u64) - \ + sizeof(struct bkey) - \ + offsetof(struct bch_dirent, d_name)) -enum { - BCH_XATTR = 128, - BCH_XATTR_WHITEOUT = 129, -}; -#define BCH_XATTR_INDEX_USER 0 -#define BCH_XATTR_INDEX_POSIX_ACL_ACCESS 1 -#define BCH_XATTR_INDEX_POSIX_ACL_DEFAULT 2 -#define BCH_XATTR_INDEX_TRUSTED 3 -#define BCH_XATTR_INDEX_SECURITY 4 +/* Xattrs */ + +#define KEY_TYPE_XATTR_INDEX_USER 0 +#define KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS 1 +#define KEY_TYPE_XATTR_INDEX_POSIX_ACL_DEFAULT 2 +#define KEY_TYPE_XATTR_INDEX_TRUSTED 3 +#define KEY_TYPE_XATTR_INDEX_SECURITY 4 struct bch_xattr { struct bch_val v; @@ -803,33 +811,47 @@ struct bch_xattr { __le16 x_val_len; __u8 x_name[]; } __attribute__((packed, aligned(8))); -BKEY_VAL_TYPE(xattr, BCH_XATTR); /* Bucket/allocation information: */ -enum { - BCH_ALLOC = 128, -}; - -enum { - BCH_ALLOC_FIELD_READ_TIME = 0, - BCH_ALLOC_FIELD_WRITE_TIME = 1, -}; - struct bch_alloc { struct bch_val v; __u8 fields; __u8 gen; __u8 data[]; } __attribute__((packed, aligned(8))); -BKEY_VAL_TYPE(alloc, BCH_ALLOC); -/* Quotas: */ +#define BCH_ALLOC_FIELDS() \ + x(read_time, 16) \ + x(write_time, 16) \ + x(data_type, 8) \ + x(dirty_sectors, 16) \ + x(cached_sectors, 16) \ + x(oldest_gen, 8) enum { - BCH_QUOTA = 128, +#define x(name, bytes) BCH_ALLOC_FIELD_##name, + BCH_ALLOC_FIELDS() +#undef x + BCH_ALLOC_FIELD_NR }; +static const unsigned BCH_ALLOC_FIELD_BYTES[] = { +#define x(name, bits) [BCH_ALLOC_FIELD_##name] = bits / 8, + BCH_ALLOC_FIELDS() +#undef x +}; + +#define x(name, bits) + (bits / 8) +static const unsigned BKEY_ALLOC_VAL_U64s_MAX = + DIV_ROUND_UP(offsetof(struct bch_alloc, data) + BCH_ALLOC_FIELDS(), sizeof(u64)); +#undef x + +static const unsigned BKEY_ALLOC_U64s_MAX = 
BKEY_U64s + BKEY_ALLOC_VAL_U64s_MAX; + +/* Quotas: */ + enum quota_types { QTYP_USR = 0, QTYP_GRP = 1, @@ -852,7 +874,22 @@ struct bch_quota { struct bch_val v; struct bch_quota_counter c[Q_COUNTERS]; } __attribute__((packed, aligned(8))); -BKEY_VAL_TYPE(quota, BCH_QUOTA); + +/* Erasure coding */ + +struct bch_stripe { + struct bch_val v; + __le16 sectors; + __u8 algorithm; + __u8 nr_blocks; + __u8 nr_redundant; + + __u8 csum_granularity_bits; + __u8 csum_type; + __u8 pad; + + struct bch_extent_ptr ptrs[0]; +} __attribute__((packed, aligned(8))); /* Optional/variable size superblock sections: */ @@ -866,9 +903,12 @@ struct bch_sb_field { x(journal, 0) \ x(members, 1) \ x(crypt, 2) \ - x(replicas, 3) \ + x(replicas_v0, 3) \ x(quota, 4) \ - x(disk_groups, 5) + x(disk_groups, 5) \ + x(clean, 6) \ + x(replicas, 7) \ + x(journal_seq_blacklist, 8) enum bch_sb_field_type { #define x(f, nr) BCH_SB_FIELD_##f = nr, @@ -886,6 +926,8 @@ struct bch_sb_field_journal { /* BCH_SB_FIELD_members: */ +#define BCH_MIN_NR_NBUCKETS (1 << 6) + struct bch_member { uuid_le uuid; __le64 nbuckets; /* device size */ @@ -992,16 +1034,28 @@ enum bch_data_type { BCH_DATA_NR = 6, }; +struct bch_replicas_entry_v0 { + __u8 data_type; + __u8 nr_devs; + __u8 devs[0]; +} __attribute__((packed)); + +struct bch_sb_field_replicas_v0 { + struct bch_sb_field field; + struct bch_replicas_entry_v0 entries[0]; +} __attribute__((packed, aligned(8))); + struct bch_replicas_entry { - u8 data_type; - u8 nr; - u8 devs[0]; -}; + __u8 data_type; + __u8 nr_devs; + __u8 nr_required; + __u8 devs[0]; +} __attribute__((packed)); struct bch_sb_field_replicas { struct bch_sb_field field; struct bch_replicas_entry entries[0]; -}; +} __attribute__((packed, aligned(8))); /* BCH_SB_FIELD_quota: */ @@ -1027,7 +1081,7 @@ struct bch_sb_field_quota { struct bch_disk_group { __u8 label[BCH_SB_LABEL_SIZE]; __le64 flags[2]; -}; +} __attribute__((packed, aligned(8))); LE64_BITMASK(BCH_GROUP_DELETED, struct bch_disk_group, flags[0], 0, 1) LE64_BITMASK(BCH_GROUP_DATA_ALLOWED, struct bch_disk_group, flags[0], 1, 6) @@ -1036,20 +1090,71 @@ LE64_BITMASK(BCH_GROUP_PARENT, struct bch_disk_group, flags[0], 6, 24) struct bch_sb_field_disk_groups { struct bch_sb_field field; struct bch_disk_group entries[0]; +} __attribute__((packed, aligned(8))); + +/* + * On clean shutdown, store btree roots and current journal sequence number in + * the superblock: + */ +struct jset_entry { + __le16 u64s; + __u8 btree_id; + __u8 level; + __u8 type; /* designates what this jset holds */ + __u8 pad[3]; + + union { + struct bkey_i start[0]; + __u64 _data[0]; + }; +}; + +struct bch_sb_field_clean { + struct bch_sb_field field; + + __le32 flags; + __le16 read_clock; + __le16 write_clock; + __le64 journal_seq; + + union { + struct jset_entry start[0]; + __u64 _data[0]; + }; +}; + +struct journal_seq_blacklist_entry { + __le64 start; + __le64 end; +}; + +struct bch_sb_field_journal_seq_blacklist { + struct bch_sb_field field; + + union { + struct journal_seq_blacklist_entry start[0]; + __u64 _data[0]; + }; }; /* Superblock: */ /* - * Version 8: BCH_SB_ENCODED_EXTENT_MAX_BITS - * BCH_MEMBER_DATA_ALLOWED - * Version 9: incompatible extent nonce change + * New versioning scheme: + * One common version number for all on disk data structures - superblock, btree + * nodes, journal entries */ +#define BCH_JSET_VERSION_OLD 2 +#define BCH_BSET_VERSION_OLD 3 + +enum bcachefs_metadata_version { + bcachefs_metadata_version_min = 9, + bcachefs_metadata_version_new_versioning = 10, + 
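Since bch_replicas_entry now ends in nr_devs trailing device indices, superblock replicas sections hold variable-length entries that can only be walked sequentially, bounds-checking each entry before trusting its length. A simplified analogue (the types and test data are illustrative; the real on-disk layout has alignment rules not modelled here):

#include <stdint.h>
#include <stdio.h>

struct replicas_entry {
	uint8_t data_type;
	uint8_t nr_devs;
	uint8_t nr_required;
	uint8_t devs[];
};

static size_t entry_bytes(const struct replicas_entry *e)
{
	return sizeof(*e) + e->nr_devs;
}

static void walk(const uint8_t *buf, size_t len)
{
	const uint8_t *p = buf, *end = buf + len;

	while (p + sizeof(struct replicas_entry) <= end) {
		const struct replicas_entry *e = (const void *) p;

		if (p + entry_bytes(e) > end)
			break;	/* truncated entry: stop, as validation would */

		printf("type %u: %u devs, %u required\n",
		       e->data_type, e->nr_devs, e->nr_required);
		p += entry_bytes(e);
	}
}

int main(void)
{
	uint8_t buf[] = { 1, 2, 1, 0, 3,	/* devs { 0, 3 } */
			  2, 1, 1, 5 };		/* devs { 5 } */

	walk(buf, sizeof(buf));
	return 0;
}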
bcachefs_metadata_version_bkey_renumber = 10, + bcachefs_metadata_version_max = 11, +}; -#define BCH_SB_VERSION_MIN 7 -#define BCH_SB_VERSION_EXTENT_MAX 8 -#define BCH_SB_VERSION_EXTENT_NONCE_V1 9 -#define BCH_SB_VERSION_MAX 9 +#define bcachefs_metadata_version_current (bcachefs_metadata_version_max - 1) #define BCH_SB_SECTOR 8 #define BCH_SB_MEMBERS_MAX 64 /* XXX kill */ @@ -1068,6 +1173,9 @@ struct bch_sb_layout { /* * @offset - sector where this sb was written * @version - on disk format version + * @version_min - Oldest metadata version this filesystem contains; so we can + * safely drop compatibility code and refuse to mount filesystems + * we'd need it for * @magic - identifies as a bcachefs superblock (BCACHE_MAGIC) * @seq - incremented each time superblock is written * @uuid - used for generating various magic numbers and identifying @@ -1080,7 +1188,9 @@ struct bch_sb_layout { */ struct bch_sb { struct bch_csum csum; - __le64 version; + __le16 version; + __le16 version_min; + __le16 pad[2]; uuid_le magic; uuid_le uuid; uuid_le user_uuid; @@ -1144,7 +1254,9 @@ LE64_BITMASK(BCH_SB_USRQUOTA, struct bch_sb, flags[0], 57, 58); LE64_BITMASK(BCH_SB_GRPQUOTA, struct bch_sb, flags[0], 58, 59); LE64_BITMASK(BCH_SB_PRJQUOTA, struct bch_sb, flags[0], 59, 60); -/* 60-64 unused */ +LE64_BITMASK(BCH_SB_HAS_ERRORS, struct bch_sb, flags[0], 60, 61); + +/* 61-64 unused */ LE64_BITMASK(BCH_SB_STR_HASH_TYPE, struct bch_sb, flags[1], 0, 4); LE64_BITMASK(BCH_SB_COMPRESSION_TYPE, struct bch_sb, flags[1], 4, 8); @@ -1169,12 +1281,24 @@ LE64_BITMASK(BCH_SB_BACKGROUND_TARGET, struct bch_sb, flags[1], 52, 64); LE64_BITMASK(BCH_SB_BACKGROUND_COMPRESSION_TYPE, struct bch_sb, flags[2], 0, 4); +LE64_BITMASK(BCH_SB_GC_RESERVE_BYTES, struct bch_sb, flags[2], 4, 64); + +LE64_BITMASK(BCH_SB_ERASURE_CODE, struct bch_sb, flags[3], 0, 16); /* Features: */ enum bch_sb_features { BCH_FEATURE_LZ4 = 0, BCH_FEATURE_GZIP = 1, BCH_FEATURE_ZSTD = 2, + BCH_FEATURE_ATOMIC_NLINK = 3, /* should have gone under compat */ + BCH_FEATURE_EC = 4, + BCH_FEATURE_JOURNAL_SEQ_BLACKLIST_V3 = 5, + BCH_FEATURE_NR, +}; + +enum bch_sb_compat { + BCH_COMPAT_FEAT_ALLOC_INFO = 0, + BCH_COMPAT_FEAT_ALLOC_METADATA = 1, }; /* options: */ @@ -1250,24 +1374,6 @@ static inline __u64 __bset_magic(struct bch_sb *sb) /* Journal */ -#define BCACHE_JSET_VERSION_UUIDv1 1 -#define BCACHE_JSET_VERSION_UUID 1 /* Always latest UUID format */ -#define BCACHE_JSET_VERSION_JKEYS 2 -#define BCACHE_JSET_VERSION 2 - -struct jset_entry { - __le16 u64s; - __u8 btree_id; - __u8 level; - __u8 type; /* designates what this jset holds */ - __u8 pad[3]; - - union { - struct bkey_i start[0]; - __u64 _data[0]; - }; -}; - #define JSET_KEYS_U64s (sizeof(struct jset_entry) / sizeof(__u64)) #define BCH_JSET_ENTRY_TYPES() \ @@ -1275,7 +1381,9 @@ struct jset_entry { x(btree_root, 1) \ x(prio_ptrs, 2) \ x(blacklist, 3) \ - x(blacklist_v2, 4) + x(blacklist_v2, 4) \ + x(usage, 5) \ + x(data_usage, 6) enum { #define x(f, nr) BCH_JSET_ENTRY_##f = nr, @@ -1305,6 +1413,24 @@ struct jset_entry_blacklist_v2 { __le64 end; }; +enum { + FS_USAGE_RESERVED = 0, + FS_USAGE_INODES = 1, + FS_USAGE_KEY_VERSION = 2, + FS_USAGE_NR = 3 +}; + +struct jset_entry_usage { + struct jset_entry entry; + __le64 v; +} __attribute__((packed)); + +struct jset_entry_data_usage { + struct jset_entry entry; + __le64 v; + struct bch_replicas_entry r; +} __attribute__((packed)); + /* * On disk format for a journal entry: * seq is monotonically increasing; every journal entry has its own unique @@ -1343,38 +1469,30 @@ 
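The LE64_BITMASK() lines pack superblock options into the flags words as named bit ranges, generating a getter and a setter per option. A host-endian simplification of such a generator (the real macro also byte-swaps the little-endian field; BITMASK and the SB_* instances are invented names):

#include <stdint.h>
#include <stdio.h>

struct sb { uint64_t flags; };

/* NAME() getter / SET_NAME() setter for bits [lo, hi), hi - lo < 64: */
#define BITMASK(name, lo, hi)						\
static inline uint64_t name(const struct sb *s)				\
{									\
	return (s->flags >> (lo)) & ~(~0ULL << ((hi) - (lo)));		\
}									\
static inline void SET_##name(struct sb *s, uint64_t v)			\
{									\
	uint64_t mask = ~(~0ULL << ((hi) - (lo))) << (lo);		\
									\
	s->flags = (s->flags & ~mask) | ((v << (lo)) & mask);		\
}

BITMASK(SB_CSUM_TYPE,	 0, 4)
BITMASK(SB_ERROR_ACTION, 4, 8)

int main(void)
{
	struct sb s = { 0 };

	SET_SB_CSUM_TYPE(&s, 3);
	SET_SB_ERROR_ACTION(&s, 2);
	printf("csum %llu action %llu flags %#llx\n",
	       (unsigned long long) SB_CSUM_TYPE(&s),
	       (unsigned long long) SB_ERROR_ACTION(&s),
	       (unsigned long long) s.flags);
	return 0;
}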
struct jset { LE32_BITMASK(JSET_CSUM_TYPE, struct jset, flags, 0, 4); LE32_BITMASK(JSET_BIG_ENDIAN, struct jset, flags, 4, 5); -#define BCH_JOURNAL_BUCKETS_MIN 20 +#define BCH_JOURNAL_BUCKETS_MIN 8 /* Btree: */ -#define DEFINE_BCH_BTREE_IDS() \ - DEF_BTREE_ID(EXTENTS, 0, "extents") \ - DEF_BTREE_ID(INODES, 1, "inodes") \ - DEF_BTREE_ID(DIRENTS, 2, "dirents") \ - DEF_BTREE_ID(XATTRS, 3, "xattrs") \ - DEF_BTREE_ID(ALLOC, 4, "alloc") \ - DEF_BTREE_ID(QUOTAS, 5, "quotas") - -#define DEF_BTREE_ID(kwd, val, name) BTREE_ID_##kwd = val, +#define BCH_BTREE_IDS() \ + x(EXTENTS, 0, "extents") \ + x(INODES, 1, "inodes") \ + x(DIRENTS, 2, "dirents") \ + x(XATTRS, 3, "xattrs") \ + x(ALLOC, 4, "alloc") \ + x(QUOTAS, 5, "quotas") \ + x(EC, 6, "erasure_coding") enum btree_id { - DEFINE_BCH_BTREE_IDS() +#define x(kwd, val, name) BTREE_ID_##kwd = val, + BCH_BTREE_IDS() +#undef x BTREE_ID_NR }; -#undef DEF_BTREE_ID - #define BTREE_MAX_DEPTH 4U /* Btree nodes */ -/* Version 1: Seed pointer into btree node checksum - */ -#define BCACHE_BSET_CSUM 1 -#define BCACHE_BSET_KEY_v1 2 -#define BCACHE_BSET_JOURNAL_SEQ 3 -#define BCACHE_BSET_VERSION 3 - /* * Btree nodes * diff --git a/fs/bcachefs/bcachefs_ioctl.h b/fs/bcachefs/bcachefs_ioctl.h index 73e5d887ccd8..d668ede5491a 100644 --- a/fs/bcachefs/bcachefs_ioctl.h +++ b/fs/bcachefs/bcachefs_ioctl.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _BCACHEFS_IOCTL_H #define _BCACHEFS_IOCTL_H @@ -70,7 +71,11 @@ struct bch_ioctl_incremental { #define BCH_IOCTL_USAGE _IOWR(0xbc, 11, struct bch_ioctl_usage) #define BCH_IOCTL_READ_SUPER _IOW(0xbc, 12, struct bch_ioctl_read_super) #define BCH_IOCTL_DISK_GET_IDX _IOW(0xbc, 13, struct bch_ioctl_disk_get_idx) -#define BCH_IOCTL_DISK_RESIZE _IOW(0xbc, 13, struct bch_ioctl_disk_resize) +#define BCH_IOCTL_DISK_RESIZE _IOW(0xbc, 14, struct bch_ioctl_disk_resize) + +/* ioctl below act on a particular file, not the filesystem as a whole: */ + +#define BCHFS_IOC_REINHERIT_ATTRS _IOR(0xbc, 64, const char __user *) /* * BCH_IOCTL_QUERY_UUID: get filesystem UUID diff --git a/fs/bcachefs/bkey.c b/fs/bcachefs/bkey.c index 850ba72cc5a1..0f9dfe37b0af 100644 --- a/fs/bcachefs/bkey.c +++ b/fs/bcachefs/bkey.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" #include "bkey.h" @@ -59,8 +60,8 @@ static void bch2_bkey_pack_verify(const struct bkey_packed *packed, char buf1[160], buf2[160]; char buf3[160], buf4[160]; - bch2_bkey_to_text(buf1, sizeof(buf1), unpacked); - bch2_bkey_to_text(buf2, sizeof(buf2), &tmp); + bch2_bkey_to_text(&PBUF(buf1), unpacked); + bch2_bkey_to_text(&PBUF(buf2), &tmp); bch2_to_binary(buf3, (void *) unpacked, 80); bch2_to_binary(buf4, high_word(format, packed), 80); @@ -484,7 +485,7 @@ enum bkey_pack_pos_ret bch2_bkey_pack_pos_lossy(struct bkey_packed *out, pack_state_finish(&state, out); out->u64s = f->key_u64s; out->format = KEY_FORMAT_LOCAL_BTREE; - out->type = KEY_TYPE_DELETED; + out->type = KEY_TYPE_deleted; #ifdef CONFIG_BCACHEFS_DEBUG if (exact) { @@ -1010,11 +1011,8 @@ static inline int __bkey_cmp_bits(const u64 *l, const u64 *r, nr_key_bits -= 64; } - if (l_v != r_v) - return l_v < r_v ? 
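Text formatting is being migrated from raw (buf, size) pairs with scnprintf bookkeeping to a printbuf passed by pointer, visible in the new bch2_bkey_to_text(&PBUF(buf1), ...) calls: callees append and the position advances automatically. A self-contained approximation of that interface (the kernel's actual struct and truncation handling differ; this is a sketch):

#include <stdarg.h>
#include <stdio.h>

/* An append-only buffer that tracks its own fill level, a la printbuf: */
struct printbuf {
	char	*buf;
	size_t	size;
	size_t	used;
};

#define PBUF(_buf)	((struct printbuf) { _buf, sizeof(_buf), 0 })

static void pr_buf(struct printbuf *out, const char *fmt, ...)
{
	va_list args;
	int len;

	va_start(args, fmt);
	len = vsnprintf(out->buf + out->used, out->size - out->used,
			fmt, args);
	va_end(args);

	if (len > 0) {
		out->used += len;
		if (out->used > out->size)
			out->used = out->size;	/* output was truncated */
	}
}

int main(void)
{
	char buf[32];
	struct printbuf out = PBUF(buf);

	pr_buf(&out, "u64s %u", 3);
	pr_buf(&out, " type %u", 128);	/* appends, no manual offsets */
	printf("%s\n", buf);		/* u64s 3 type 128 */
	return 0;
}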
-1 : 1; - - if (!nr_key_bits) - return 0; + if (!nr_key_bits || l_v != r_v) + break; l = next_word(l); r = next_word(r); @@ -1022,6 +1020,8 @@ static inline int __bkey_cmp_bits(const u64 *l, const u64 *r, l_v = *l; r_v = *r; } + + return cmp_int(l_v, r_v); } #endif diff --git a/fs/bcachefs/bkey.h b/fs/bcachefs/bkey.h index 2d6c8a230a73..1acff9d0fd7e 100644 --- a/fs/bcachefs/bkey.h +++ b/fs/bcachefs/bkey.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _BCACHEFS_BKEY_H #define _BCACHEFS_BKEY_H @@ -32,10 +33,7 @@ struct bkey_s { #define bkey_next(_k) vstruct_next(_k) -static inline unsigned bkey_val_u64s(const struct bkey *k) -{ - return k->u64s - BKEY_U64s; -} +#define bkey_val_u64s(_k) ((_k)->u64s - BKEY_U64s) static inline size_t bkey_val_bytes(const struct bkey *k) { @@ -52,25 +50,12 @@ static inline void set_bkey_val_bytes(struct bkey *k, unsigned bytes) k->u64s = BKEY_U64s + DIV_ROUND_UP(bytes, sizeof(u64)); } -/* - * Mark a key as deleted without changing the size of the value (i.e. modifying - * keys in the btree in place) - */ -static inline void __set_bkey_deleted(struct bkey *k) -{ - k->type = KEY_TYPE_DELETED; -} - -static inline void set_bkey_deleted(struct bkey *k) -{ - __set_bkey_deleted(k); - set_bkey_val_u64s(k, 0); -} +#define bkey_val_end(_k) vstruct_idx((_k).v, bkey_val_u64s((_k).k)) -#define bkey_deleted(_k) ((_k)->type == KEY_TYPE_DELETED) +#define bkey_deleted(_k) ((_k)->type == KEY_TYPE_deleted) #define bkey_whiteout(_k) \ - ((_k)->type == KEY_TYPE_DELETED || (_k)->type == KEY_TYPE_DISCARD) + ((_k)->type == KEY_TYPE_deleted || (_k)->type == KEY_TYPE_discard) #define bkey_packed_typecheck(_k) \ ({ \ @@ -221,14 +206,12 @@ void bch2_bkey_swab_key(const struct bkey_format *, struct bkey_packed *); static __always_inline int bversion_cmp(struct bversion l, struct bversion r) { - if (l.hi != r.hi) - return l.hi < r.hi ? -1 : 1; - if (l.lo != r.lo) - return l.lo < r.lo ? -1 : 1; - return 0; + return cmp_int(l.hi, r.hi) ?: + cmp_int(l.lo, r.lo); } #define ZERO_VERSION ((struct bversion) { .hi = 0, .lo = 0 }) +#define MAX_VERSION ((struct bversion) { .hi = ~0, .lo = ~0ULL }) static __always_inline int bversion_zero(struct bversion v) { @@ -284,6 +267,16 @@ static inline struct bpos bkey_successor(struct bpos p) return ret; } +static inline struct bpos bkey_predecessor(struct bpos p) +{ + struct bpos ret = p; + + if (!ret.offset--) + BUG_ON(!ret.inode--); + + return ret; +} + static inline u64 bkey_start_offset(const struct bkey *k) { return k->p.offset - k->size; @@ -437,7 +430,15 @@ static inline struct bkey_s_c bkey_i_to_s_c(const struct bkey_i *k) * bkey_i_extent to a bkey_i - since that's always safe, instead of conversion * functions. 
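Comparison chains such as bversion_cmp above are rewritten around cmp_int(), which returns -1/0/1 without subtraction overflow, joined with the GNU ?: extension so the first nonzero comparison decides. The idiom in isolation:

#include <stdio.h>

/* -1, 0 or 1, with no risk of subtraction overflow: */
#define cmp_int(l, r)	(((l) > (r)) - ((l) < (r)))

struct version { unsigned hi, lo; };

static int version_cmp(struct version l, struct version r)
{
	/* GNU "a ?: b": later comparisons only break earlier ties */
	return cmp_int(l.hi, r.hi) ?: cmp_int(l.lo, r.lo);
}

int main(void)
{
	struct version a = { 1, 9 }, b = { 1, 10 };

	printf("%d\n", version_cmp(a, b));	/* -1: hi ties, lo decides */
	return 0;
}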
*/ -#define __BKEY_VAL_ACCESSORS(name, nr, _assert) \ +#define BKEY_VAL_ACCESSORS(name) \ +struct bkey_i_##name { \ + union { \ + struct bkey k; \ + struct bkey_i k_i; \ + }; \ + struct bch_##name v; \ +}; \ + \ struct bkey_s_c_##name { \ union { \ struct { \ @@ -462,20 +463,20 @@ struct bkey_s_##name { \ \ static inline struct bkey_i_##name *bkey_i_to_##name(struct bkey_i *k) \ { \ - _assert(k->k.type, nr); \ + EBUG_ON(k->k.type != KEY_TYPE_##name); \ return container_of(&k->k, struct bkey_i_##name, k); \ } \ \ static inline const struct bkey_i_##name * \ bkey_i_to_##name##_c(const struct bkey_i *k) \ { \ - _assert(k->k.type, nr); \ + EBUG_ON(k->k.type != KEY_TYPE_##name); \ return container_of(&k->k, struct bkey_i_##name, k); \ } \ \ static inline struct bkey_s_##name bkey_s_to_##name(struct bkey_s k) \ { \ - _assert(k.k->type, nr); \ + EBUG_ON(k.k->type != KEY_TYPE_##name); \ return (struct bkey_s_##name) { \ .k = k.k, \ .v = container_of(k.v, struct bch_##name, v), \ @@ -484,7 +485,7 @@ static inline struct bkey_s_##name bkey_s_to_##name(struct bkey_s k) \ \ static inline struct bkey_s_c_##name bkey_s_c_to_##name(struct bkey_s_c k)\ { \ - _assert(k.k->type, nr); \ + EBUG_ON(k.k->type != KEY_TYPE_##name); \ return (struct bkey_s_c_##name) { \ .k = k.k, \ .v = container_of(k.v, struct bch_##name, v), \ @@ -510,7 +511,7 @@ name##_i_to_s_c(const struct bkey_i_##name *k) \ \ static inline struct bkey_s_##name bkey_i_to_s_##name(struct bkey_i *k) \ { \ - _assert(k->k.type, nr); \ + EBUG_ON(k->k.type != KEY_TYPE_##name); \ return (struct bkey_s_##name) { \ .k = &k->k, \ .v = container_of(&k->v, struct bch_##name, v), \ @@ -520,27 +521,13 @@ static inline struct bkey_s_##name bkey_i_to_s_##name(struct bkey_i *k) \ static inline struct bkey_s_c_##name \ bkey_i_to_s_c_##name(const struct bkey_i *k) \ { \ - _assert(k->k.type, nr); \ + EBUG_ON(k->k.type != KEY_TYPE_##name); \ return (struct bkey_s_c_##name) { \ .k = &k->k, \ .v = container_of(&k->v, struct bch_##name, v), \ }; \ } \ \ -static inline struct bch_##name * \ -bkey_p_##name##_val(const struct bkey_format *f, \ - struct bkey_packed *k) \ -{ \ - return container_of(bkeyp_val(f, k), struct bch_##name, v); \ -} \ - \ -static inline const struct bch_##name * \ -bkey_p_c_##name##_val(const struct bkey_format *f, \ - const struct bkey_packed *k) \ -{ \ - return container_of(bkeyp_val(f, k), struct bch_##name, v); \ -} \ - \ static inline struct bkey_i_##name *bkey_##name##_init(struct bkey_i *_k)\ { \ struct bkey_i_##name *k = \ @@ -548,43 +535,23 @@ static inline struct bkey_i_##name *bkey_##name##_init(struct bkey_i *_k)\ \ bkey_init(&k->k); \ memset(&k->v, 0, sizeof(k->v)); \ - k->k.type = nr; \ + k->k.type = KEY_TYPE_##name; \ set_bkey_val_bytes(&k->k, sizeof(k->v)); \ \ return k; \ } -#define __BKEY_VAL_ASSERT(_type, _nr) EBUG_ON(_type != _nr) - -#define BKEY_VAL_ACCESSORS(name, _nr) \ - static inline void __bch_##name##_assert(u8 type, u8 nr) \ - { \ - EBUG_ON(type != _nr); \ - } \ - \ - __BKEY_VAL_ACCESSORS(name, _nr, __bch_##name##_assert) - -BKEY_VAL_ACCESSORS(cookie, KEY_TYPE_COOKIE); - -static inline void __bch2_extent_assert(u8 type, u8 nr) -{ - EBUG_ON(type != BCH_EXTENT && type != BCH_EXTENT_CACHED); -} - -__BKEY_VAL_ACCESSORS(extent, BCH_EXTENT, __bch2_extent_assert); -BKEY_VAL_ACCESSORS(reservation, BCH_RESERVATION); - -BKEY_VAL_ACCESSORS(inode, BCH_INODE_FS); -BKEY_VAL_ACCESSORS(inode_blockdev, BCH_INODE_BLOCKDEV); -BKEY_VAL_ACCESSORS(inode_generation, BCH_INODE_GENERATION); - -BKEY_VAL_ACCESSORS(dirent, BCH_DIRENT); - 
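Each BKEY_VAL_ACCESSORS(name) expansion generates typed wrapper structs and checked downcasts built on container_of(), with the old per-type assert functions replaced by a uniform EBUG_ON against KEY_TYPE_##name. The core of the pattern reduced to one type, with container_of spelled out via offsetof (all names invented):

#include <assert.h>
#include <stddef.h>
#include <stdio.h>

struct key { unsigned type, len; };
struct val_cookie { unsigned long long cookie; };

enum { KEY_TYPE_cookie = 1 };

struct key_i_cookie {
	struct key		k;
	struct val_cookie	v;
};

/* Checked downcast from the generic key to the typed wrapper: */
static struct key_i_cookie *key_i_to_cookie(struct key *k)
{
	assert(k->type == KEY_TYPE_cookie);
	return (void *) ((char *) k - offsetof(struct key_i_cookie, k));
}

int main(void)
{
	struct key_i_cookie c = { { KEY_TYPE_cookie, 1 }, { 42 } };
	struct key *generic = &c.k;

	printf("%llu\n", key_i_to_cookie(generic)->v.cookie);	/* 42 */
	return 0;
}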
-BKEY_VAL_ACCESSORS(xattr, BCH_XATTR); - -BKEY_VAL_ACCESSORS(alloc, BCH_ALLOC); - -BKEY_VAL_ACCESSORS(quota, BCH_QUOTA); +BKEY_VAL_ACCESSORS(cookie); +BKEY_VAL_ACCESSORS(btree_ptr); +BKEY_VAL_ACCESSORS(extent); +BKEY_VAL_ACCESSORS(reservation); +BKEY_VAL_ACCESSORS(inode); +BKEY_VAL_ACCESSORS(inode_generation); +BKEY_VAL_ACCESSORS(dirent); +BKEY_VAL_ACCESSORS(xattr); +BKEY_VAL_ACCESSORS(alloc); +BKEY_VAL_ACCESSORS(quota); +BKEY_VAL_ACCESSORS(stripe); /* byte order helpers */ diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c index e4f62f905f11..09ee958c5568 100644 --- a/fs/bcachefs/bkey_methods.c +++ b/fs/bcachefs/bkey_methods.c @@ -1,74 +1,100 @@ +// SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" #include "bkey_methods.h" #include "btree_types.h" -#include "alloc.h" +#include "alloc_background.h" #include "dirent.h" +#include "ec.h" #include "error.h" #include "extents.h" #include "inode.h" #include "quota.h" #include "xattr.h" -const struct bkey_ops bch2_bkey_ops[] = { - [BKEY_TYPE_EXTENTS] = bch2_bkey_extent_ops, - [BKEY_TYPE_INODES] = bch2_bkey_inode_ops, - [BKEY_TYPE_DIRENTS] = bch2_bkey_dirent_ops, - [BKEY_TYPE_XATTRS] = bch2_bkey_xattr_ops, - [BKEY_TYPE_ALLOC] = bch2_bkey_alloc_ops, - [BKEY_TYPE_QUOTAS] = bch2_bkey_quota_ops, - [BKEY_TYPE_BTREE] = bch2_bkey_btree_ops, +const char * const bch_bkey_types[] = { +#define x(name, nr) #name, + BCH_BKEY_TYPES() +#undef x + NULL }; -const char *bch2_bkey_val_invalid(struct bch_fs *c, enum bkey_type type, - struct bkey_s_c k) +static const char *deleted_key_invalid(const struct bch_fs *c, + struct bkey_s_c k) { - const struct bkey_ops *ops = &bch2_bkey_ops[type]; + return NULL; +} - switch (k.k->type) { - case KEY_TYPE_DELETED: - case KEY_TYPE_DISCARD: - return NULL; +#define bch2_bkey_ops_deleted (struct bkey_ops) { \ + .key_invalid = deleted_key_invalid, \ +} - case KEY_TYPE_ERROR: - return bkey_val_bytes(k.k) != 0 - ? "value size should be zero" - : NULL; +#define bch2_bkey_ops_discard (struct bkey_ops) { \ + .key_invalid = deleted_key_invalid, \ +} - case KEY_TYPE_COOKIE: - return bkey_val_bytes(k.k) != sizeof(struct bch_cookie) - ? 
"incorrect value size" - : NULL; +static const char *empty_val_key_invalid(const struct bch_fs *c, struct bkey_s_c k) +{ + if (bkey_val_bytes(k.k)) + return "value size should be zero"; - default: - if (k.k->type < KEY_TYPE_GENERIC_NR) - return "invalid type"; + return NULL; +} - return ops->key_invalid(c, k); - } +#define bch2_bkey_ops_error (struct bkey_ops) { \ + .key_invalid = empty_val_key_invalid, \ } -const char *__bch2_bkey_invalid(struct bch_fs *c, enum bkey_type type, - struct bkey_s_c k) +static const char *key_type_cookie_invalid(const struct bch_fs *c, + struct bkey_s_c k) { - const struct bkey_ops *ops = &bch2_bkey_ops[type]; + if (bkey_val_bytes(k.k) != sizeof(struct bch_cookie)) + return "incorrect value size"; + + return NULL; +} +#define bch2_bkey_ops_cookie (struct bkey_ops) { \ + .key_invalid = key_type_cookie_invalid, \ +} + +#define bch2_bkey_ops_whiteout (struct bkey_ops) { \ + .key_invalid = empty_val_key_invalid, \ +} + +static const struct bkey_ops bch2_bkey_ops[] = { +#define x(name, nr) [KEY_TYPE_##name] = bch2_bkey_ops_##name, + BCH_BKEY_TYPES() +#undef x +}; + +const char *bch2_bkey_val_invalid(struct bch_fs *c, struct bkey_s_c k) +{ + if (k.k->type >= KEY_TYPE_MAX) + return "invalid type"; + + return bch2_bkey_ops[k.k->type].key_invalid(c, k); +} + +const char *__bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k, + enum btree_node_type type) +{ if (k.k->u64s < BKEY_U64s) return "u64s too small"; - if (!ops->is_extents) { - if (k.k->size) - return "nonzero size field"; - } else { + if ((btree_node_type_is_extents(type) || + type == BKEY_TYPE_BTREE) && + bkey_val_u64s(k.k) > BKEY_EXTENT_VAL_U64s_MAX) + return "value too big"; + + if (btree_node_type_is_extents(type)) { if ((k.k->size == 0) != bkey_deleted(k.k)) return "bad size field"; + } else { + if (k.k->size) + return "nonzero size field"; } - if (ops->is_extents && - !k.k->size && - !bkey_deleted(k.k)) - return "zero size field"; - if (k.k->p.snapshot) return "nonzero snapshot"; @@ -79,11 +105,11 @@ const char *__bch2_bkey_invalid(struct bch_fs *c, enum bkey_type type, return NULL; } -const char *bch2_bkey_invalid(struct bch_fs *c, enum bkey_type type, - struct bkey_s_c k) +const char *bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k, + enum btree_node_type type) { - return __bch2_bkey_invalid(c, type, k) ?: - bch2_bkey_val_invalid(c, type, k); + return __bch2_bkey_invalid(c, k, type) ?: + bch2_bkey_val_invalid(c, k); } const char *bch2_bkey_in_btree_node(struct btree *b, struct bkey_s_c k) @@ -99,93 +125,135 @@ const char *bch2_bkey_in_btree_node(struct btree *b, struct bkey_s_c k) void bch2_bkey_debugcheck(struct bch_fs *c, struct btree *b, struct bkey_s_c k) { - enum bkey_type type = btree_node_type(b); - const struct bkey_ops *ops = &bch2_bkey_ops[type]; + const struct bkey_ops *ops = &bch2_bkey_ops[k.k->type]; const char *invalid; BUG_ON(!k.k->u64s); - invalid = bch2_bkey_invalid(c, type, k) ?: + invalid = bch2_bkey_invalid(c, k, btree_node_type(b)) ?: bch2_bkey_in_btree_node(b, k); if (invalid) { char buf[160]; - bch2_bkey_val_to_text(c, type, buf, sizeof(buf), k); + bch2_bkey_val_to_text(&PBUF(buf), c, k); bch2_fs_bug(c, "invalid bkey %s: %s", buf, invalid); return; } - if (k.k->type >= KEY_TYPE_GENERIC_NR && - ops->key_debugcheck) + if (ops->key_debugcheck) ops->key_debugcheck(c, b, k); } -#define p(...) 
(out += scnprintf(out, end - out, __VA_ARGS__)) - -int bch2_bkey_to_text(char *buf, size_t size, const struct bkey *k) +void bch2_bpos_to_text(struct printbuf *out, struct bpos pos) { - char *out = buf, *end = buf + size; + if (!bkey_cmp(pos, POS_MIN)) + pr_buf(out, "POS_MIN"); + else if (!bkey_cmp(pos, POS_MAX)) + pr_buf(out, "POS_MAX"); + else + pr_buf(out, "%llu:%llu", pos.inode, pos.offset); +} - p("u64s %u type %u ", k->u64s, k->type); +void bch2_bkey_to_text(struct printbuf *out, const struct bkey *k) +{ + pr_buf(out, "u64s %u type %u ", k->u64s, k->type); - if (bkey_cmp(k->p, POS_MAX)) - p("%llu:%llu", k->p.inode, k->p.offset); - else - p("POS_MAX"); - - p(" snap %u len %u ver %llu", k->p.snapshot, k->size, k->version.lo); - - return out - buf; -} - -int bch2_val_to_text(struct bch_fs *c, enum bkey_type type, - char *buf, size_t size, struct bkey_s_c k) -{ - const struct bkey_ops *ops = &bch2_bkey_ops[type]; - char *out = buf, *end = buf + size; - - switch (k.k->type) { - case KEY_TYPE_DELETED: - p(" deleted"); - break; - case KEY_TYPE_DISCARD: - p(" discard"); - break; - case KEY_TYPE_ERROR: - p(" error"); - break; - case KEY_TYPE_COOKIE: - p(" cookie"); - break; - default: - if (k.k->type >= KEY_TYPE_GENERIC_NR && ops->val_to_text) - ops->val_to_text(c, buf, size, k); - break; - } + bch2_bpos_to_text(out, k->p); - return out - buf; + pr_buf(out, " snap %u len %u ver %llu", + k->p.snapshot, k->size, k->version.lo); } -int bch2_bkey_val_to_text(struct bch_fs *c, enum bkey_type type, - char *buf, size_t size, struct bkey_s_c k) +void bch2_val_to_text(struct printbuf *out, struct bch_fs *c, + struct bkey_s_c k) { - char *out = buf, *end = buf + size; + const struct bkey_ops *ops = &bch2_bkey_ops[k.k->type]; - out += bch2_bkey_to_text(out, end - out, k.k); - out += scnprintf(out, end - out, ": "); - out += bch2_val_to_text(c, type, out, end - out, k); + if (likely(ops->val_to_text)) + ops->val_to_text(out, c, k); + else + pr_buf(out, " %s", bch_bkey_types[k.k->type]); +} - return out - buf; +void bch2_bkey_val_to_text(struct printbuf *out, struct bch_fs *c, + struct bkey_s_c k) +{ + bch2_bkey_to_text(out, k.k); + pr_buf(out, ": "); + bch2_val_to_text(out, c, k); } -void bch2_bkey_swab(enum bkey_type type, - const struct bkey_format *f, - struct bkey_packed *k) +void bch2_bkey_swab(const struct bkey_format *f, + struct bkey_packed *k) { - const struct bkey_ops *ops = &bch2_bkey_ops[type]; + const struct bkey_ops *ops = &bch2_bkey_ops[k->type]; bch2_bkey_swab_key(f, k); if (ops->swab) ops->swab(f, k); } + +bool bch2_bkey_normalize(struct bch_fs *c, struct bkey_s k) +{ + const struct bkey_ops *ops = &bch2_bkey_ops[k.k->type]; + + return ops->key_normalize + ? 
ops->key_normalize(c, k) + : false; +} + +enum merge_result bch2_bkey_merge(struct bch_fs *c, + struct bkey_s l, struct bkey_s r) +{ + const struct bkey_ops *ops = &bch2_bkey_ops[l.k->type]; + enum merge_result ret; + + if (key_merging_disabled(c) || + !ops->key_merge || + l.k->type != r.k->type || + bversion_cmp(l.k->version, r.k->version) || + bkey_cmp(l.k->p, bkey_start_pos(r.k))) + return BCH_MERGE_NOMERGE; + + ret = ops->key_merge(c, l, r); + + if (ret != BCH_MERGE_NOMERGE) + l.k->needs_whiteout |= r.k->needs_whiteout; + return ret; +} + +static const struct old_bkey_type { + u8 btree_node_type; + u8 old; + u8 new; +} bkey_renumber_table[] = { + {BKEY_TYPE_BTREE, 128, KEY_TYPE_btree_ptr }, + {BKEY_TYPE_EXTENTS, 128, KEY_TYPE_extent }, + {BKEY_TYPE_EXTENTS, 129, KEY_TYPE_extent }, + {BKEY_TYPE_EXTENTS, 130, KEY_TYPE_reservation }, + {BKEY_TYPE_INODES, 128, KEY_TYPE_inode }, + {BKEY_TYPE_INODES, 130, KEY_TYPE_inode_generation }, + {BKEY_TYPE_DIRENTS, 128, KEY_TYPE_dirent }, + {BKEY_TYPE_DIRENTS, 129, KEY_TYPE_whiteout }, + {BKEY_TYPE_XATTRS, 128, KEY_TYPE_xattr }, + {BKEY_TYPE_XATTRS, 129, KEY_TYPE_whiteout }, + {BKEY_TYPE_ALLOC, 128, KEY_TYPE_alloc }, + {BKEY_TYPE_QUOTAS, 128, KEY_TYPE_quota }, +}; + +void bch2_bkey_renumber(enum btree_node_type btree_node_type, + struct bkey_packed *k, + int write) +{ + const struct old_bkey_type *i; + + for (i = bkey_renumber_table; + i < bkey_renumber_table + ARRAY_SIZE(bkey_renumber_table); + i++) + if (btree_node_type == i->btree_node_type && + k->type == (write ? i->new : i->old)) { + k->type = write ? i->old : i->new; + break; + } +} diff --git a/fs/bcachefs/bkey_methods.h b/fs/bcachefs/bkey_methods.h index 9e2c90d54e42..08b976633360 100644 --- a/fs/bcachefs/bkey_methods.h +++ b/fs/bcachefs/bkey_methods.h @@ -1,37 +1,15 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _BCACHEFS_BKEY_METHODS_H #define _BCACHEFS_BKEY_METHODS_H #include "bkey.h" -#define DEF_BTREE_ID(kwd, val, name) BKEY_TYPE_##kwd = val, - -enum bkey_type { - DEFINE_BCH_BTREE_IDS() - BKEY_TYPE_BTREE, -}; - -#undef DEF_BTREE_ID - -/* Type of a key in btree @id at level @level: */ -static inline enum bkey_type bkey_type(unsigned level, enum btree_id id) -{ - return level ? 
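bch2_bkey_renumber bridges the old on-disk numbering, where each btree reused type codes from 128 up, and the new global enum: reads translate old to new, writes translate back, and unmatched types pass through untouched. A compact standalone analogue (the numeric codes below are placeholders, not the real assignments):

#include <stdint.h>
#include <stdio.h>

struct renumber { uint8_t btree, old, new_; };

static const struct renumber table[] = {
	{ 0 /* extents */, 128, 6 },
	{ 0 /* extents */, 130, 7 },
	{ 1 /* inodes  */, 128, 8 },
};

#define ARRAY_SIZE(a)	(sizeof(a) / sizeof((a)[0]))

static uint8_t renumber(uint8_t btree, uint8_t type, int write)
{
	unsigned i;

	for (i = 0; i < ARRAY_SIZE(table); i++)
		if (table[i].btree == btree &&
		    type == (write ? table[i].new_ : table[i].old))
			return write ? table[i].old : table[i].new_;
	return type;	/* already in the target numbering */
}

int main(void)
{
	printf("%u\n", renumber(0, 130, 0));	/* read:  130 -> 7 */
	printf("%u\n", renumber(0, 7, 1));	/* write: 7 -> 130 */
	return 0;
}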
BKEY_TYPE_BTREE : (enum bkey_type) id; -} - -static inline bool btree_type_has_ptrs(enum bkey_type type) -{ - switch (type) { - case BKEY_TYPE_BTREE: - case BKEY_TYPE_EXTENTS: - return true; - default: - return false; - } -} - struct bch_fs; struct btree; struct bkey; +enum btree_node_type; + +extern const char * const bch_bkey_types[]; enum merge_result { BCH_MERGE_NOMERGE, @@ -44,43 +22,43 @@ enum merge_result { BCH_MERGE_MERGE, }; -typedef bool (*key_filter_fn)(struct bch_fs *, struct btree *, - struct bkey_s); -typedef enum merge_result (*key_merge_fn)(struct bch_fs *, - struct btree *, - struct bkey_i *, struct bkey_i *); - struct bkey_ops { /* Returns reason for being invalid if invalid, else NULL: */ const char * (*key_invalid)(const struct bch_fs *, struct bkey_s_c); void (*key_debugcheck)(struct bch_fs *, struct btree *, struct bkey_s_c); - void (*val_to_text)(struct bch_fs *, char *, - size_t, struct bkey_s_c); + void (*val_to_text)(struct printbuf *, struct bch_fs *, + struct bkey_s_c); void (*swab)(const struct bkey_format *, struct bkey_packed *); - key_filter_fn key_normalize; - key_merge_fn key_merge; - bool is_extents; + bool (*key_normalize)(struct bch_fs *, struct bkey_s); + enum merge_result (*key_merge)(struct bch_fs *, + struct bkey_s, struct bkey_s); }; -const char *bch2_bkey_val_invalid(struct bch_fs *, enum bkey_type, - struct bkey_s_c); -const char *__bch2_bkey_invalid(struct bch_fs *, enum bkey_type, struct bkey_s_c); -const char *bch2_bkey_invalid(struct bch_fs *, enum bkey_type, struct bkey_s_c); +const char *bch2_bkey_val_invalid(struct bch_fs *, struct bkey_s_c); +const char *__bch2_bkey_invalid(struct bch_fs *, struct bkey_s_c, + enum btree_node_type); +const char *bch2_bkey_invalid(struct bch_fs *, struct bkey_s_c, + enum btree_node_type); const char *bch2_bkey_in_btree_node(struct btree *, struct bkey_s_c); void bch2_bkey_debugcheck(struct bch_fs *, struct btree *, struct bkey_s_c); -int bch2_bkey_to_text(char *, size_t, const struct bkey *); -int bch2_val_to_text(struct bch_fs *, enum bkey_type, - char *, size_t, struct bkey_s_c); -int bch2_bkey_val_to_text(struct bch_fs *, enum bkey_type, - char *, size_t, struct bkey_s_c); +void bch2_bpos_to_text(struct printbuf *, struct bpos); +void bch2_bkey_to_text(struct printbuf *, const struct bkey *); +void bch2_val_to_text(struct printbuf *, struct bch_fs *, + struct bkey_s_c); +void bch2_bkey_val_to_text(struct printbuf *, struct bch_fs *, + struct bkey_s_c); + +void bch2_bkey_swab(const struct bkey_format *, struct bkey_packed *); + +bool bch2_bkey_normalize(struct bch_fs *, struct bkey_s); -void bch2_bkey_swab(enum bkey_type, const struct bkey_format *, - struct bkey_packed *); +enum merge_result bch2_bkey_merge(struct bch_fs *, + struct bkey_s, struct bkey_s); -extern const struct bkey_ops bch2_bkey_ops[]; +void bch2_bkey_renumber(enum btree_node_type, struct bkey_packed *, int); #endif /* _BCACHEFS_BKEY_METHODS_H */ diff --git a/fs/bcachefs/bkey_sort.c b/fs/bcachefs/bkey_sort.c new file mode 100644 index 000000000000..9f5d9b4bf1c9 --- /dev/null +++ b/fs/bcachefs/bkey_sort.c @@ -0,0 +1,633 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "bcachefs.h" +#include "bkey_sort.h" +#include "bset.h" +#include "extents.h" + +/* too many iterators, need to clean this up */ + +/* btree_node_iter_large: */ + +#define btree_node_iter_cmp_heap(h, _l, _r) btree_node_iter_cmp(b, _l, _r) + +static inline bool +bch2_btree_node_iter_large_end(struct btree_node_iter_large *iter) +{ + return !iter->used; +} + +static inline 
struct bkey_packed * +bch2_btree_node_iter_large_peek_all(struct btree_node_iter_large *iter, + struct btree *b) +{ + return bch2_btree_node_iter_large_end(iter) + ? NULL + : __btree_node_offset_to_key(b, iter->data->k); +} + +static void +bch2_btree_node_iter_large_advance(struct btree_node_iter_large *iter, + struct btree *b) +{ + iter->data->k += __btree_node_offset_to_key(b, iter->data->k)->u64s; + + EBUG_ON(!iter->used); + EBUG_ON(iter->data->k > iter->data->end); + + if (iter->data->k == iter->data->end) + heap_del(iter, 0, btree_node_iter_cmp_heap, NULL); + else + heap_sift_down(iter, 0, btree_node_iter_cmp_heap, NULL); +} + +static inline struct bkey_packed * +bch2_btree_node_iter_large_next_all(struct btree_node_iter_large *iter, + struct btree *b) +{ + struct bkey_packed *ret = bch2_btree_node_iter_large_peek_all(iter, b); + + if (ret) + bch2_btree_node_iter_large_advance(iter, b); + + return ret; +} + +void bch2_btree_node_iter_large_push(struct btree_node_iter_large *iter, + struct btree *b, + const struct bkey_packed *k, + const struct bkey_packed *end) +{ + if (k != end) { + struct btree_node_iter_set n = + ((struct btree_node_iter_set) { + __btree_node_key_to_offset(b, k), + __btree_node_key_to_offset(b, end) + }); + + __heap_add(iter, n, btree_node_iter_cmp_heap, NULL); + } +} + +static void sort_key_next(struct btree_node_iter_large *iter, + struct btree *b, + struct btree_node_iter_set *i) +{ + i->k += __btree_node_offset_to_key(b, i->k)->u64s; + + if (i->k == i->end) + *i = iter->data[--iter->used]; +} + +/* regular sort_iters */ + +typedef int (*sort_cmp_fn)(struct btree *, + struct bkey_packed *, + struct bkey_packed *); + +static inline void __sort_iter_sift(struct sort_iter *iter, + unsigned from, + sort_cmp_fn cmp) +{ + unsigned i; + + for (i = from; + i + 1 < iter->used && + cmp(iter->b, iter->data[i].k, iter->data[i + 1].k) > 0; + i++) + swap(iter->data[i], iter->data[i + 1]); +} + +static inline void sort_iter_sift(struct sort_iter *iter, sort_cmp_fn cmp) +{ + + __sort_iter_sift(iter, 0, cmp); +} + +static inline void sort_iter_sort(struct sort_iter *iter, sort_cmp_fn cmp) +{ + unsigned i = iter->used; + + while (i--) + __sort_iter_sift(iter, i, cmp); +} + +static inline struct bkey_packed *sort_iter_peek(struct sort_iter *iter) +{ + return iter->used ? iter->data->k : NULL; +} + +static inline void sort_iter_advance(struct sort_iter *iter, sort_cmp_fn cmp) +{ + iter->data->k = bkey_next(iter->data->k); + + BUG_ON(iter->data->k > iter->data->end); + + if (iter->data->k == iter->data->end) + array_remove_item(iter->data, iter->used, 0); + else + sort_iter_sift(iter, cmp); +} + +static inline struct bkey_packed *sort_iter_next(struct sort_iter *iter, + sort_cmp_fn cmp) +{ + struct bkey_packed *ret = sort_iter_peek(iter); + + if (ret) + sort_iter_advance(iter, cmp); + + return ret; +} + +/* + * Returns true if l > r - unless l == r, in which case returns true if l is + * older than r. + * + * Necessary for btree_sort_fixup() - if there are multiple keys that compare + * equal in different sets, we have to process them newest to oldest. 
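Both iterator flavours implement a k-way merge across sorted bsets: keep one cursor per bset, emit the smallest current key, advance that cursor, then restore the ordering invariant, by heap sift for btree_node_iter_large and by the insertion-style __sort_iter_sift for sort_iter. The pattern distilled to integer arrays (a sketch, not the kernel code):

#include <stdio.h>

struct cursor { const int *k, *end; };

/* Restore "cursors ordered by current key" after data[0] changed: */
static void sift(struct cursor *data, unsigned used)
{
	unsigned i;

	for (i = 0; i + 1 < used && *data[i].k > *data[i + 1].k; i++) {
		struct cursor tmp = data[i];

		data[i] = data[i + 1];
		data[i + 1] = tmp;
	}
}

static const int *merge_next(struct cursor *data, unsigned *used)
{
	const int *ret;

	if (!*used)
		return NULL;

	ret = data[0].k++;
	if (data[0].k == data[0].end)
		data[0] = data[--*used];	/* drop the exhausted cursor */
	sift(data, *used);
	return ret;
}

int main(void)
{
	int a[] = { 1, 4, 9 }, b[] = { 2, 3, 10 };
	struct cursor data[] = { { a, a + 3 }, { b, b + 3 } };
	unsigned used = 2;
	const int *k;

	while ((k = merge_next(data, &used)))
		printf("%d ", *k);		/* 1 2 3 4 9 10 */
	printf("\n");
	return 0;
}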
+ */ +#define key_sort_cmp(h, l, r) \ +({ \ + bkey_cmp_packed(b, \ + __btree_node_offset_to_key(b, (l).k), \ + __btree_node_offset_to_key(b, (r).k)) \ + \ + ?: (l).k - (r).k; \ +}) + +static inline bool should_drop_next_key(struct btree_node_iter_large *iter, + struct btree *b) +{ + struct btree_node_iter_set *l = iter->data, *r = iter->data + 1; + struct bkey_packed *k = __btree_node_offset_to_key(b, l->k); + + if (bkey_whiteout(k)) + return true; + + if (iter->used < 2) + return false; + + if (iter->used > 2 && + key_sort_cmp(iter, r[0], r[1]) >= 0) + r++; + + /* + * key_sort_cmp() ensures that when keys compare equal the older key + * comes first; so if l->k compares equal to r->k then l->k is older and + * should be dropped. + */ + return !bkey_cmp_packed(b, + __btree_node_offset_to_key(b, l->k), + __btree_node_offset_to_key(b, r->k)); +} + +struct btree_nr_keys bch2_key_sort_fix_overlapping(struct bset *dst, + struct btree *b, + struct btree_node_iter_large *iter) +{ + struct bkey_packed *out = dst->start; + struct btree_nr_keys nr; + + memset(&nr, 0, sizeof(nr)); + + heap_resort(iter, key_sort_cmp, NULL); + + while (!bch2_btree_node_iter_large_end(iter)) { + if (!should_drop_next_key(iter, b)) { + struct bkey_packed *k = + __btree_node_offset_to_key(b, iter->data->k); + + bkey_copy(out, k); + btree_keys_account_key_add(&nr, 0, out); + out = bkey_next(out); + } + + sort_key_next(iter, b, iter->data); + heap_sift_down(iter, 0, key_sort_cmp, NULL); + } + + dst->u64s = cpu_to_le16((u64 *) out - dst->_data); + return nr; +} + +/* + * If keys compare equal, compare by pointer order: + * + * Necessary for sort_fix_overlapping() - if there are multiple keys that + * compare equal in different sets, we have to process them newest to oldest. + */ +#define extent_sort_cmp(h, l, r) \ +({ \ + struct bkey _ul = bkey_unpack_key(b, \ + __btree_node_offset_to_key(b, (l).k)); \ + struct bkey _ur = bkey_unpack_key(b, \ + __btree_node_offset_to_key(b, (r).k)); \ + \ + bkey_cmp(bkey_start_pos(&_ul), \ + bkey_start_pos(&_ur)) ?: (r).k - (l).k; \ +}) + +static inline void extent_sort_sift(struct btree_node_iter_large *iter, + struct btree *b, size_t i) +{ + heap_sift_down(iter, i, extent_sort_cmp, NULL); +} + +static inline void extent_sort_next(struct btree_node_iter_large *iter, + struct btree *b, + struct btree_node_iter_set *i) +{ + sort_key_next(iter, b, i); + heap_sift_down(iter, i - iter->data, extent_sort_cmp, NULL); +} + +static void extent_sort_advance_prev(struct bkey_format *f, + struct btree_nr_keys *nr, + struct bkey_packed *start, + struct bkey_packed **prev) +{ + if (*prev) { + bch2_bkey_pack(*prev, (void *) *prev, f); + + btree_keys_account_key_add(nr, 0, *prev); + *prev = bkey_next(*prev); + } else { + *prev = start; + } +} + +static void extent_sort_append(struct bch_fs *c, + struct bkey_format *f, + struct btree_nr_keys *nr, + struct bkey_packed *start, + struct bkey_packed **prev, + struct bkey_s k) +{ + if (bkey_whiteout(k.k)) + return; + + /* + * prev is always unpacked, for key merging - until right before we + * advance it: + */ + + if (*prev && + bch2_bkey_merge(c, bkey_i_to_s((void *) *prev), k) == + BCH_MERGE_MERGE) + return; + + extent_sort_advance_prev(f, nr, start, prev); + + bkey_reassemble((void *) *prev, k.s_c); +} + +struct btree_nr_keys bch2_extent_sort_fix_overlapping(struct bch_fs *c, + struct bset *dst, + struct btree *b, + struct btree_node_iter_large *iter) +{ + struct bkey_format *f = &b->format; + struct btree_node_iter_set *_l = iter->data, *_r; + struct 
bkey_packed *prev = NULL, *lk, *rk; + struct bkey l_unpacked, r_unpacked; + struct bkey_s l, r; + struct btree_nr_keys nr; + + memset(&nr, 0, sizeof(nr)); + + heap_resort(iter, extent_sort_cmp, NULL); + + while (!bch2_btree_node_iter_large_end(iter)) { + lk = __btree_node_offset_to_key(b, _l->k); + l = __bkey_disassemble(b, lk, &l_unpacked); + + if (iter->used == 1) { + extent_sort_append(c, f, &nr, dst->start, &prev, l); + extent_sort_next(iter, b, _l); + continue; + } + + _r = iter->data + 1; + if (iter->used > 2 && + extent_sort_cmp(iter, _r[0], _r[1]) >= 0) + _r++; + + rk = __btree_node_offset_to_key(b, _r->k); + r = __bkey_disassemble(b, rk, &r_unpacked); + + /* If current key and next key don't overlap, just append */ + if (bkey_cmp(l.k->p, bkey_start_pos(r.k)) <= 0) { + extent_sort_append(c, f, &nr, dst->start, &prev, l); + extent_sort_next(iter, b, _l); + continue; + } + + /* Skip 0 size keys */ + if (!r.k->size) { + extent_sort_next(iter, b, _r); + continue; + } + + /* + * overlap: keep the newer key and trim the older key so they + * don't overlap. comparing pointers tells us which one is + * newer, since the bsets are appended one after the other. + */ + + /* can't happen because of comparison func */ + BUG_ON(_l->k < _r->k && + !bkey_cmp(bkey_start_pos(l.k), bkey_start_pos(r.k))); + + if (_l->k > _r->k) { + /* l wins, trim r */ + if (bkey_cmp(l.k->p, r.k->p) >= 0) { + sort_key_next(iter, b, _r); + } else { + __bch2_cut_front(l.k->p, r); + extent_save(b, rk, r.k); + } + + extent_sort_sift(iter, b, _r - iter->data); + } else if (bkey_cmp(l.k->p, r.k->p) > 0) { + BKEY_PADDED(k) tmp; + + /* + * r wins, but it overlaps in the middle of l - split l: + */ + bkey_reassemble(&tmp.k, l.s_c); + bch2_cut_back(bkey_start_pos(r.k), &tmp.k.k); + + __bch2_cut_front(r.k->p, l); + extent_save(b, lk, l.k); + + extent_sort_sift(iter, b, 0); + + extent_sort_append(c, f, &nr, dst->start, + &prev, bkey_i_to_s(&tmp.k)); + } else { + bch2_cut_back(bkey_start_pos(r.k), l.k); + extent_save(b, lk, l.k); + } + } + + extent_sort_advance_prev(f, &nr, dst->start, &prev); + + dst->u64s = cpu_to_le16((u64 *) prev - dst->_data); + return nr; +} + +/* Sort + repack in a new format: */ +struct btree_nr_keys +bch2_sort_repack(struct bset *dst, struct btree *src, + struct btree_node_iter *src_iter, + struct bkey_format *out_f, + bool filter_whiteouts) +{ + struct bkey_format *in_f = &src->format; + struct bkey_packed *in, *out = vstruct_last(dst); + struct btree_nr_keys nr; + + memset(&nr, 0, sizeof(nr)); + + while ((in = bch2_btree_node_iter_next_all(src_iter, src))) { + if (filter_whiteouts && bkey_whiteout(in)) + continue; + + if (bch2_bkey_transform(out_f, out, bkey_packed(in) + ? 
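The overlap cases in bch2_extent_sort_fix_overlapping reduce to interval surgery on the older key: cut its back when the newer key wins the tail, cut its front when it wins the head, split it when the newer key lands in the middle. A toy model over half-open integer ranges (resolve() and its splitting policy are illustrative only):

#include <stdio.h>

struct extent { int start, end; };	/* half-open [start, end) */

/* Trim @old so it no longer overlaps @new; may emit a second piece. */
static int resolve(struct extent *old, struct extent new, struct extent *split)
{
	if (old->end <= new.start || new.end <= old->start)
		return 0;			/* no overlap */

	if (old->start < new.start && new.end < old->end) {
		/* newer key lands in the middle: split the older one */
		split->start = new.end;
		split->end = old->end;
		old->end = new.start;
		return 1;
	}

	if (old->start < new.start) {
		old->end = new.start;		/* cut back */
	} else {
		/* cut front; may leave @old empty if fully covered */
		old->start = new.end < old->end ? new.end : old->end;
	}
	return 0;
}

int main(void)
{
	struct extent old = { 0, 10 }, new = { 3, 5 }, split;

	if (resolve(&old, new, &split))
		printf("[%d,%d) and [%d,%d)\n",	/* [0,3) and [5,10) */
		       old.start, old.end, split.start, split.end);
	return 0;
}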
in_f : &bch2_bkey_format_current, in)) + out->format = KEY_FORMAT_LOCAL_BTREE; + else + bch2_bkey_unpack(src, (void *) out, in); + + btree_keys_account_key_add(&nr, 0, out); + out = bkey_next(out); + } + + dst->u64s = cpu_to_le16((u64 *) out - dst->_data); + return nr; +} + +/* Sort, repack, and merge: */ +struct btree_nr_keys +bch2_sort_repack_merge(struct bch_fs *c, + struct bset *dst, struct btree *src, + struct btree_node_iter *iter, + struct bkey_format *out_f, + bool filter_whiteouts) +{ + struct bkey_packed *prev = NULL, *k_packed, *next; + struct bkey k_unpacked; + struct bkey_s k; + struct btree_nr_keys nr; + + memset(&nr, 0, sizeof(nr)); + + next = bch2_btree_node_iter_next_all(iter, src); + while ((k_packed = next)) { + /* + * The filter might modify the size of @k's value, so advance + * the iterator first: + */ + next = bch2_btree_node_iter_next_all(iter, src); + + if (filter_whiteouts && bkey_whiteout(k_packed)) + continue; + + k = __bkey_disassemble(src, k_packed, &k_unpacked); + + if (filter_whiteouts && + bch2_bkey_normalize(c, k)) + continue; + + extent_sort_append(c, out_f, &nr, vstruct_last(dst), &prev, k); + } + + extent_sort_advance_prev(out_f, &nr, vstruct_last(dst), &prev); + + dst->u64s = cpu_to_le16((u64 *) prev - dst->_data); + return nr; +} + +static inline int sort_keys_cmp(struct btree *b, + struct bkey_packed *l, + struct bkey_packed *r) +{ + return bkey_cmp_packed(b, l, r) ?: + (int) bkey_whiteout(r) - (int) bkey_whiteout(l) ?: + (int) l->needs_whiteout - (int) r->needs_whiteout; +} + +unsigned bch2_sort_keys(struct bkey_packed *dst, + struct sort_iter *iter, + bool filter_whiteouts) +{ + const struct bkey_format *f = &iter->b->format; + struct bkey_packed *in, *next, *out = dst; + + sort_iter_sort(iter, sort_keys_cmp); + + while ((in = sort_iter_next(iter, sort_keys_cmp))) { + if (bkey_whiteout(in) && + (filter_whiteouts || !in->needs_whiteout)) + continue; + + if (bkey_whiteout(in) && + (next = sort_iter_peek(iter)) && + !bkey_cmp_packed(iter->b, in, next)) { + BUG_ON(in->needs_whiteout && + next->needs_whiteout); + /* + * XXX racy, called with read lock from write path + * + * leads to spurious BUG_ON() in bkey_unpack_key() in + * debug mode + */ + next->needs_whiteout |= in->needs_whiteout; + continue; + } + + if (bkey_whiteout(in)) { + memcpy_u64s(out, in, bkeyp_key_u64s(f, in)); + set_bkeyp_val_u64s(f, out, 0); + } else { + bkey_copy(out, in); + } + out = bkey_next(out); + } + + return (u64 *) out - (u64 *) dst; +} + +static inline int sort_extents_cmp(struct btree *b, + struct bkey_packed *l, + struct bkey_packed *r) +{ + return bkey_cmp_packed(b, l, r) ?: + (int) bkey_deleted(l) - (int) bkey_deleted(r); +} + +unsigned bch2_sort_extents(struct bkey_packed *dst, + struct sort_iter *iter, + bool filter_whiteouts) +{ + struct bkey_packed *in, *out = dst; + + sort_iter_sort(iter, sort_extents_cmp); + + while ((in = sort_iter_next(iter, sort_extents_cmp))) { + if (bkey_deleted(in)) + continue; + + if (bkey_whiteout(in) && + (filter_whiteouts || !in->needs_whiteout)) + continue; + + bkey_copy(out, in); + out = bkey_next(out); + } + + return (u64 *) out - (u64 *) dst; +} + +static inline int sort_key_whiteouts_cmp(struct btree *b, + struct bkey_packed *l, + struct bkey_packed *r) +{ + return bkey_cmp_packed(b, l, r); +} + +unsigned bch2_sort_key_whiteouts(struct bkey_packed *dst, + struct sort_iter *iter) +{ + struct bkey_packed *in, *out = dst; + + sort_iter_sort(iter, sort_key_whiteouts_cmp); + + while ((in = sort_iter_next(iter, sort_key_whiteouts_cmp))) 
{ + bkey_copy(out, in); + out = bkey_next(out); + } + + return (u64 *) out - (u64 *) dst; +} + +static inline int sort_extent_whiteouts_cmp(struct btree *b, + struct bkey_packed *l, + struct bkey_packed *r) +{ + struct bkey ul = bkey_unpack_key(b, l); + struct bkey ur = bkey_unpack_key(b, r); + + return bkey_cmp(bkey_start_pos(&ul), bkey_start_pos(&ur)); +} + +unsigned bch2_sort_extent_whiteouts(struct bkey_packed *dst, + struct sort_iter *iter) +{ + const struct bkey_format *f = &iter->b->format; + struct bkey_packed *in, *out = dst; + struct bkey_i l, r; + bool prev = false, l_packed = false; + u64 max_packed_size = bkey_field_max(f, BKEY_FIELD_SIZE); + u64 max_packed_offset = bkey_field_max(f, BKEY_FIELD_OFFSET); + u64 new_size; + + max_packed_size = min_t(u64, max_packed_size, KEY_SIZE_MAX); + + sort_iter_sort(iter, sort_extent_whiteouts_cmp); + + while ((in = sort_iter_next(iter, sort_extent_whiteouts_cmp))) { + if (bkey_deleted(in)) + continue; + + EBUG_ON(bkeyp_val_u64s(f, in)); + EBUG_ON(in->type != KEY_TYPE_discard); + + r.k = bkey_unpack_key(iter->b, in); + + if (prev && + bkey_cmp(l.k.p, bkey_start_pos(&r.k)) >= 0) { + if (bkey_cmp(l.k.p, r.k.p) >= 0) + continue; + + new_size = l_packed + ? min(max_packed_size, max_packed_offset - + bkey_start_offset(&l.k)) + : KEY_SIZE_MAX; + + new_size = min(new_size, r.k.p.offset - + bkey_start_offset(&l.k)); + + BUG_ON(new_size < l.k.size); + + bch2_key_resize(&l.k, new_size); + + if (bkey_cmp(l.k.p, r.k.p) >= 0) + continue; + + bch2_cut_front(l.k.p, &r); + } + + if (prev) { + if (!bch2_bkey_pack(out, &l, f)) { + BUG_ON(l_packed); + bkey_copy(out, &l); + } + out = bkey_next(out); + } + + l = r; + prev = true; + l_packed = bkey_packed(in); + } + + if (prev) { + if (!bch2_bkey_pack(out, &l, f)) { + BUG_ON(l_packed); + bkey_copy(out, &l); + } + out = bkey_next(out); + } + + return (u64 *) out - (u64 *) dst; +} diff --git a/fs/bcachefs/bkey_sort.h b/fs/bcachefs/bkey_sort.h new file mode 100644 index 000000000000..397009181eae --- /dev/null +++ b/fs/bcachefs/bkey_sort.h @@ -0,0 +1,69 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_BKEY_SORT_H +#define _BCACHEFS_BKEY_SORT_H + +struct btree_node_iter_large { + u16 used; + + struct btree_node_iter_set data[MAX_BSETS]; +}; + +void bch2_btree_node_iter_large_push(struct btree_node_iter_large *, + struct btree *, + const struct bkey_packed *, + const struct bkey_packed *); + +struct sort_iter { + struct btree *b; + unsigned used; + + struct sort_iter_set { + struct bkey_packed *k, *end; + } data[MAX_BSETS + 1]; +}; + +static inline void sort_iter_init(struct sort_iter *iter, struct btree *b) +{ + memset(iter, 0, sizeof(*iter)); + iter->b = b; +} + +static inline void sort_iter_add(struct sort_iter *iter, + struct bkey_packed *k, + struct bkey_packed *end) +{ + BUG_ON(iter->used >= ARRAY_SIZE(iter->data)); + + if (k != end) + iter->data[iter->used++] = (struct sort_iter_set) { k, end }; +} + +struct btree_nr_keys +bch2_key_sort_fix_overlapping(struct bset *, struct btree *, + struct btree_node_iter_large *); +struct btree_nr_keys +bch2_extent_sort_fix_overlapping(struct bch_fs *, struct bset *, + struct btree *, + struct btree_node_iter_large *); + +struct btree_nr_keys +bch2_sort_repack(struct bset *, struct btree *, + struct btree_node_iter *, + struct bkey_format *, bool); +struct btree_nr_keys +bch2_sort_repack_merge(struct bch_fs *, + struct bset *, struct btree *, + struct btree_node_iter *, + struct bkey_format *, bool); + +unsigned bch2_sort_keys(struct bkey_packed *, + struct 
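bch2_sort_extent_whiteouts leans on extent whiteouts carrying no value: overlapping or adjacent ones coalesce by resizing the previous key, capped in the real code by the packed format's maximum size (ignored below). The underlying interval-coalescing loop, sketched:

#include <stdio.h>

struct range { long start, end; };	/* half-open, sorted by start */

/* Coalesce overlapping/adjacent ranges in place; returns new count. */
static unsigned coalesce(struct range *r, unsigned n)
{
	unsigned out = 0, i;

	for (i = 1; i < n; i++) {
		if (r[i].start <= r[out].end) {
			if (r[i].end > r[out].end)
				r[out].end = r[i].end;	/* grow previous */
		} else {
			r[++out] = r[i];	/* gap: start a new range */
		}
	}
	return n ? out + 1 : 0;
}

int main(void)
{
	struct range r[] = { { 0, 4 }, { 2, 6 }, { 6, 8 }, { 10, 12 } };
	unsigned i, n = coalesce(r, 4);

	for (i = 0; i < n; i++)				/* [0,8) [10,12) */
		printf("[%ld,%ld) ", r[i].start, r[i].end);
	printf("\n");
	return 0;
}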
sort_iter *, bool); +unsigned bch2_sort_extents(struct bkey_packed *, + struct sort_iter *, bool); + +unsigned bch2_sort_key_whiteouts(struct bkey_packed *, + struct sort_iter *); +unsigned bch2_sort_extent_whiteouts(struct bkey_packed *, + struct sort_iter *); + +#endif /* _BCACHEFS_BKEY_SORT_H */ diff --git a/fs/bcachefs/bset.c b/fs/bcachefs/bset.c index 7e928ff78cc6..ef10e77ec1e5 100644 --- a/fs/bcachefs/bset.c +++ b/fs/bcachefs/bset.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Code for working with individual keys, and sorted sets of keys with in a * btree node @@ -12,7 +13,6 @@ #include "util.h" #include <asm/unaligned.h> -#include <linux/dynamic_fault.h> #include <linux/console.h> #include <linux/random.h> #include <linux/prefetch.h> @@ -21,14 +21,19 @@ #include "alloc_types.h" #include <trace/events/bcachefs.h> +static inline void __bch2_btree_node_iter_advance(struct btree_node_iter *, + struct btree *); + struct bset_tree *bch2_bkey_to_bset(struct btree *b, struct bkey_packed *k) { + unsigned offset = __btree_node_key_to_offset(b, k); struct bset_tree *t; for_each_bset(b, t) - if (k >= btree_bkey_first(b, t) && - k < btree_bkey_last(b, t)) + if (offset <= t->end_offset) { + EBUG_ON(offset < btree_bkey_first_offset(t)); return t; + } BUG(); } @@ -63,9 +68,9 @@ void bch2_dump_bset(struct btree *b, struct bset *i, unsigned set) _k = _n, k = n) { _n = bkey_next(_k); - bch2_bkey_to_text(buf, sizeof(buf), &k); - printk(KERN_ERR "block %u key %zi/%u: %s\n", set, - _k->_data - i->_data, i->u64s, buf); + bch2_bkey_to_text(&PBUF(buf), &k); + printk(KERN_ERR "block %u key %5u: %s\n", set, + __btree_node_key_to_offset(b, _k), buf); if (_n == vstruct_last(i)) continue; @@ -113,7 +118,7 @@ void bch2_dump_btree_node_iter(struct btree *b, struct bkey uk = bkey_unpack_key(b, k); char buf[100]; - bch2_bkey_to_text(buf, sizeof(buf), &uk); + bch2_bkey_to_text(&PBUF(buf), &uk); printk(KERN_ERR "set %zu key %zi/%u: %s\n", t - b->set, k->_data - bset(b, t)->_data, bset(b, t)->u64s, buf); } @@ -121,20 +126,6 @@ void bch2_dump_btree_node_iter(struct btree *b, #ifdef CONFIG_BCACHEFS_DEBUG -static bool keys_out_of_order(struct btree *b, - const struct bkey_packed *prev, - const struct bkey_packed *next, - bool is_extents) -{ - struct bkey nextu = bkey_unpack_key(b, next); - - return bkey_cmp_left_packed_byval(b, prev, bkey_start_pos(&nextu)) > 0 || - ((is_extents - ? 
!bkey_deleted(next) - : !bkey_deleted(prev)) && - !bkey_cmp_packed(b, prev, next)); -} - void __bch2_verify_btree_nr_keys(struct btree *b) { struct bset_tree *t; @@ -151,123 +142,126 @@ void __bch2_verify_btree_nr_keys(struct btree *b) BUG_ON(memcmp(&nr, &b->nr, sizeof(nr))); } -static void bch2_btree_node_iter_next_check(struct btree_node_iter *iter, - struct btree *b, - struct bkey_packed *k) +static void bch2_btree_node_iter_next_check(struct btree_node_iter *_iter, + struct btree *b) { - const struct bkey_packed *n = bch2_btree_node_iter_peek_all(iter, b); + struct btree_node_iter iter = *_iter; + const struct bkey_packed *k, *n; + + k = bch2_btree_node_iter_peek_all(&iter, b); + __bch2_btree_node_iter_advance(&iter, b); + n = bch2_btree_node_iter_peek_all(&iter, b); bkey_unpack_key(b, k); if (n && - keys_out_of_order(b, k, n, iter->is_extents)) { + bkey_iter_cmp(b, k, n) > 0) { + struct btree_node_iter_set *set; struct bkey ku = bkey_unpack_key(b, k); struct bkey nu = bkey_unpack_key(b, n); char buf1[80], buf2[80]; bch2_dump_btree_node(b); - bch2_bkey_to_text(buf1, sizeof(buf1), &ku); - bch2_bkey_to_text(buf2, sizeof(buf2), &nu); - panic("out of order/overlapping:\n%s\n%s\n", buf1, buf2); + bch2_bkey_to_text(&PBUF(buf1), &ku); + bch2_bkey_to_text(&PBUF(buf2), &nu); + printk(KERN_ERR "out of order/overlapping:\n%s\n%s\n", + buf1, buf2); + printk(KERN_ERR "iter was:"); + + btree_node_iter_for_each(_iter, set) { + struct bkey_packed *k = __btree_node_offset_to_key(b, set->k); + struct bset_tree *t = bch2_bkey_to_bset(b, k); + printk(" [%zi %zi]", t - b->set, + k->_data - bset(b, t)->_data); + } + panic("\n"); } } void bch2_btree_node_iter_verify(struct btree_node_iter *iter, - struct btree *b) + struct btree *b) { - struct btree_node_iter_set *set, *prev = NULL; + struct btree_node_iter_set *set, *s2; struct bset_tree *t; - struct bkey_packed *k, *first; - if (bch2_btree_node_iter_end(iter)) - return; + /* Verify no duplicates: */ + btree_node_iter_for_each(iter, set) + btree_node_iter_for_each(iter, s2) + BUG_ON(set != s2 && set->end == s2->end); + /* Verify that set->end is correct: */ btree_node_iter_for_each(iter, set) { - k = __btree_node_offset_to_key(b, set->k); - t = bch2_bkey_to_bset(b, k); - - BUG_ON(__btree_node_offset_to_key(b, set->end) != - btree_bkey_last(b, t)); - - BUG_ON(prev && - btree_node_iter_cmp(iter, b, *prev, *set) > 0); - - prev = set; + for_each_bset(b, t) + if (set->end == t->end_offset) + goto found; + BUG(); +found: + BUG_ON(set->k < btree_bkey_first_offset(t) || + set->k >= t->end_offset); } - first = __btree_node_offset_to_key(b, iter->data[0].k); - - for_each_bset(b, t) - if (bch2_btree_node_iter_bset_pos(iter, b, t) == - btree_bkey_last(b, t) && - (k = bch2_bkey_prev_all(b, t, btree_bkey_last(b, t)))) - BUG_ON(__btree_node_iter_cmp(iter->is_extents, b, - k, first) > 0); + /* Verify iterator is sorted: */ + btree_node_iter_for_each(iter, set) + BUG_ON(set != iter->data && + btree_node_iter_cmp(b, set[-1], set[0]) > 0); } -void bch2_verify_key_order(struct btree *b, - struct btree_node_iter *iter, - struct bkey_packed *where) +void bch2_verify_insert_pos(struct btree *b, struct bkey_packed *where, + struct bkey_packed *insert, unsigned clobber_u64s) { struct bset_tree *t = bch2_bkey_to_bset(b, where); - struct bkey_packed *k, *prev; - struct bkey uk, uw = bkey_unpack_key(b, where); - - k = bch2_bkey_prev_all(b, t, where); - if (k && - keys_out_of_order(b, k, where, iter->is_extents)) { - char buf1[100], buf2[100]; + struct bkey_packed *prev = 
bch2_bkey_prev_all(b, t, where); + struct bkey_packed *next = (void *) (where->_data + clobber_u64s); +#if 0 + BUG_ON(prev && + bkey_iter_cmp(b, prev, insert) > 0); +#else + if (prev && + bkey_iter_cmp(b, prev, insert) > 0) { + struct bkey k1 = bkey_unpack_key(b, prev); + struct bkey k2 = bkey_unpack_key(b, insert); + char buf1[100]; + char buf2[100]; bch2_dump_btree_node(b); - uk = bkey_unpack_key(b, k); - bch2_bkey_to_text(buf1, sizeof(buf1), &uk); - bch2_bkey_to_text(buf2, sizeof(buf2), &uw); - panic("out of order with prev:\n%s\n%s\n", - buf1, buf2); + bch2_bkey_to_text(&PBUF(buf1), &k1); + bch2_bkey_to_text(&PBUF(buf2), &k2); + + panic("prev > insert:\n" + "prev key %5u %s\n" + "insert key %5u %s\n", + __btree_node_key_to_offset(b, prev), buf1, + __btree_node_key_to_offset(b, insert), buf2); } +#endif +#if 0 + BUG_ON(next != btree_bkey_last(b, t) && + bkey_iter_cmp(b, insert, next) > 0); +#else + if (next != btree_bkey_last(b, t) && + bkey_iter_cmp(b, insert, next) > 0) { + struct bkey k1 = bkey_unpack_key(b, insert); + struct bkey k2 = bkey_unpack_key(b, next); + char buf1[100]; + char buf2[100]; - k = bkey_next(where); - BUG_ON(k != btree_bkey_last(b, t) && - keys_out_of_order(b, where, k, iter->is_extents)); - - for_each_bset(b, t) { - if (where >= btree_bkey_first(b, t) || - where < btree_bkey_last(b, t)) - continue; - - k = bch2_btree_node_iter_bset_pos(iter, b, t); - - if (k == btree_bkey_last(b, t)) - k = bch2_bkey_prev_all(b, t, k); - - while (bkey_cmp_left_packed_byval(b, k, bkey_start_pos(&uw)) > 0 && - (prev = bch2_bkey_prev_all(b, t, k))) - k = prev; - - for (; - k != btree_bkey_last(b, t); - k = bkey_next(k)) { - uk = bkey_unpack_key(b, k); - - if (iter->is_extents) { - BUG_ON(!(bkey_cmp(uw.p, bkey_start_pos(&uk)) <= 0 || - bkey_cmp(uk.p, bkey_start_pos(&uw)) <= 0)); - } else { - BUG_ON(!bkey_cmp(uw.p, uk.p) && - !bkey_deleted(&uk)); - } - - if (bkey_cmp(uw.p, bkey_start_pos(&uk)) <= 0) - break; - } + bch2_dump_btree_node(b); + bch2_bkey_to_text(&PBUF(buf1), &k1); + bch2_bkey_to_text(&PBUF(buf2), &k2); + + panic("insert > next:\n" + "insert key %5u %s\n" + "next key %5u %s\n", + __btree_node_key_to_offset(b, insert), buf1, + __btree_node_key_to_offset(b, next), buf2); } +#endif } #else static inline void bch2_btree_node_iter_next_check(struct btree_node_iter *iter, - struct btree *b, - struct bkey_packed *k) {} + struct btree *b) {} #endif @@ -622,28 +616,30 @@ static unsigned rw_aux_tree_bsearch(struct btree *b, struct bset_tree *t, unsigned offset) { - unsigned l = 0, r = t->size; + unsigned bset_offs = offset - btree_bkey_first_offset(t); + unsigned bset_u64s = t->end_offset - btree_bkey_first_offset(t); + unsigned idx = bset_u64s ? 
bset_offs * t->size / bset_u64s : 0; EBUG_ON(bset_aux_tree_type(t) != BSET_RW_AUX_TREE); + EBUG_ON(!t->size); + EBUG_ON(idx > t->size); - while (l < r) { - unsigned m = (l + r) >> 1; + while (idx < t->size && + rw_aux_tree(b, t)[idx].offset < offset) + idx++; - if (rw_aux_tree(b, t)[m].offset < offset) - l = m + 1; - else - r = m; - } + while (idx && + rw_aux_tree(b, t)[idx - 1].offset >= offset) + idx--; - EBUG_ON(l < t->size && - rw_aux_tree(b, t)[l].offset < offset); - EBUG_ON(l && - rw_aux_tree(b, t)[l - 1].offset >= offset); + EBUG_ON(idx < t->size && + rw_aux_tree(b, t)[idx].offset < offset); + EBUG_ON(idx && rw_aux_tree(b, t)[idx - 1].offset >= offset); + EBUG_ON(idx + 1 < t->size && + rw_aux_tree(b, t)[idx].offset == + rw_aux_tree(b, t)[idx + 1].offset); - EBUG_ON(l > r); - EBUG_ON(l > t->size); - - return l; + return idx; } static inline unsigned bfloat_mantissa(const struct bkey_float *f, @@ -987,6 +983,10 @@ void bch2_bset_init_next(struct bch_fs *c, struct btree *b, set_btree_bset(b, t, i); } +/* + * find _some_ key in the same bset as @k that precedes @k - not necessarily the + * immediate predecessor: + */ static struct bkey_packed *__bkey_prev(struct btree *b, struct bset_tree *t, struct bkey_packed *k) { @@ -1025,40 +1025,31 @@ static struct bkey_packed *__bkey_prev(struct btree *b, struct bset_tree *t, return p; } -struct bkey_packed *bch2_bkey_prev_all(struct btree *b, struct bset_tree *t, - struct bkey_packed *k) +struct bkey_packed *bch2_bkey_prev_filter(struct btree *b, + struct bset_tree *t, + struct bkey_packed *k, + unsigned min_key_type) { - struct bkey_packed *p; - - p = __bkey_prev(b, t, k); - if (!p) - return NULL; - - while (bkey_next(p) != k) - p = bkey_next(p); - - return p; -} - -struct bkey_packed *bch2_bkey_prev(struct btree *b, struct bset_tree *t, - struct bkey_packed *k) -{ - while (1) { - struct bkey_packed *p, *i, *ret = NULL; - - p = __bkey_prev(b, t, k); - if (!p) - return NULL; + struct bkey_packed *p, *i, *ret = NULL, *orig_k = k; + while ((p = __bkey_prev(b, t, k)) && !ret) { for (i = p; i != k; i = bkey_next(i)) - if (!bkey_deleted(i)) + if (i->type >= min_key_type) ret = i; - if (ret) - return ret; - k = p; } + + if (btree_keys_expensive_checks(b)) { + BUG_ON(ret >= orig_k); + + for (i = ret ? 
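The rewritten rw_aux_tree_bsearch trades the binary search for an interpolation guess, the key's bset offset scaled into the lookup table, then fixes it up with short linear scans, a win when table entries are spread roughly evenly. The idea on a sorted int array (a sketch assuming lo <= target <= hi):

#include <stdio.h>

/*
 * First index with a[idx] >= target: interpolate a starting guess,
 * then fix up with short linear scans in both directions.
 */
static unsigned lower_bound_interp(const int *a, unsigned n,
				   int lo, int hi, int target)
{
	unsigned idx = hi > lo
		? (unsigned) ((long long) (target - lo) * n / (hi - lo))
		: 0;

	if (idx > n)
		idx = n;

	while (idx < n && a[idx] < target)
		idx++;
	while (idx && a[idx - 1] >= target)
		idx--;

	return idx;
}

int main(void)
{
	int a[] = { 0, 10, 20, 30, 40, 50, 60, 70 };

	printf("%u\n", lower_bound_interp(a, 8, 0, 70, 35));	/* 4 */
	return 0;
}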
bkey_next(ret) : btree_bkey_first(b, t); + i != orig_k; + i = bkey_next(i)) + BUG_ON(i->type >= min_key_type); + } + + return ret; } /* Insert */ @@ -1134,9 +1125,10 @@ static void ro_aux_tree_fix_invalidated_key(struct btree *b, * modified, fix any auxiliary search tree by remaking all the nodes in the * auxiliary search tree that @k corresponds to */ -void bch2_bset_fix_invalidated_key(struct btree *b, struct bset_tree *t, - struct bkey_packed *k) +void bch2_bset_fix_invalidated_key(struct btree *b, struct bkey_packed *k) { + struct bset_tree *t = bch2_bkey_to_bset(b, k); + switch (bset_aux_tree_type(t)) { case BSET_NO_AUX_TREE: break; @@ -1163,13 +1155,9 @@ static void bch2_bset_fix_lookup_table(struct btree *b, if (!bset_has_rw_aux_tree(t)) return; + /* returns first entry >= where */ l = rw_aux_tree_bsearch(b, t, where); - /* l is first >= than @where */ - - EBUG_ON(l < t->size && rw_aux_tree(b, t)[l].offset < where); - EBUG_ON(l && rw_aux_tree(b, t)[l - 1].offset >= where); - if (!l) /* never delete first entry */ l++; else if (l < t->size && @@ -1247,6 +1235,7 @@ void bch2_bset_insert(struct btree *b, struct bkey_packed packed, *src = bkey_to_packed(insert); bch2_bset_verify_rw_aux_tree(b, t); + bch2_verify_insert_pos(b, where, bkey_to_packed(insert), clobber_u64s); if (bch2_bkey_pack_key(&packed, &insert->k, f)) src = &packed; @@ -1273,7 +1262,6 @@ void bch2_bset_insert(struct btree *b, bch2_bset_fix_lookup_table(b, t, where, clobber_u64s, src->u64s); - bch2_verify_key_order(b, iter, where); bch2_verify_btree_nr_keys(b); } @@ -1301,7 +1289,7 @@ void bch2_bset_delete(struct btree *b, __flatten static struct bkey_packed *bset_search_write_set(const struct btree *b, struct bset_tree *t, - struct bpos search, + struct bpos *search, const struct bkey_packed *packed_search) { unsigned l = 0, r = t->size; @@ -1309,7 +1297,7 @@ static struct bkey_packed *bset_search_write_set(const struct btree *b, while (l + 1 != r) { unsigned m = (l + r) >> 1; - if (bkey_cmp(rw_aux_tree(b, t)[m].k, search) < 0) + if (bkey_cmp(rw_aux_tree(b, t)[m].k, *search) < 0) l = m; else r = m; @@ -1331,7 +1319,7 @@ static int bset_search_tree_slowpath(const struct btree *b, __flatten static struct bkey_packed *bset_search_tree(const struct btree *b, struct bset_tree *t, - struct bpos search, + struct bpos *search, const struct bkey_packed *packed_search) { struct ro_aux_tree *base = ro_aux_tree_base(b, t); @@ -1372,7 +1360,7 @@ static struct bkey_packed *bset_search_tree(const struct btree *b, bkey_mantissa(packed_search, f, n)); else n = n * 2 + bset_search_tree_slowpath(b, t, - &search, packed_search, n); + search, packed_search, n); } while (n < t->size); inorder = __eytzinger1_to_inorder(n >> 1, t->size, t->extra); @@ -1399,10 +1387,9 @@ static struct bkey_packed *bset_search_tree(const struct btree *b, __always_inline __flatten static struct bkey_packed *bch2_bset_search(struct btree *b, struct bset_tree *t, - struct bpos search, + struct bpos *search, struct bkey_packed *packed_search, - const struct bkey_packed *lossy_packed_search, - bool strictly_greater) + const struct bkey_packed *lossy_packed_search) { struct bkey_packed *m; @@ -1436,7 +1423,7 @@ static struct bkey_packed *bch2_bset_search(struct btree *b, * start and end - handle that here: */ - if (bkey_cmp(search, t->max_key) > 0) + if (bkey_cmp(*search, t->max_key) > 0) return btree_bkey_last(b, t); m = bset_search_tree(b, t, search, lossy_packed_search); @@ -1445,21 +1432,21 @@ static struct bkey_packed *bch2_bset_search(struct btree *b, if 
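
bch2_bkey_prev_filter() above merges the old bch2_bkey_prev()/bch2_bkey_prev_all() pair into one routine parameterized by a minimum key type. Since keys can only be walked forwards, it jumps back to some earlier key and rescans, widening the window until a match turns up. The shape of that loop, with hypothetical list types and some_prev() standing in for __bkey_prev():

struct node {
        struct node     *next;
        unsigned        type;
};

/* hypothetical: returns some node strictly before @k, or NULL at the front */
struct node *some_prev(struct node *k);

static struct node *prev_matching(struct node *k, unsigned min_type)
{
        struct node *p, *i, *ret = NULL;

        /* widen the window backwards until it contains a match */
        while ((p = some_prev(k)) && !ret) {
                /* last match in [p, k) is the closest predecessor */
                for (i = p; i != k; i = i->next)
                        if (i->type >= min_type)
                                ret = i;

                k = p;
        }

        return ret;
}
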
(lossy_packed_search) while (m != btree_bkey_last(b, t) && - !btree_iter_pos_cmp_p_or_unp(b, search, lossy_packed_search, - m, strictly_greater)) + bkey_iter_cmp_p_or_unp(b, search, lossy_packed_search, + m) > 0) m = bkey_next(m); if (!packed_search) while (m != btree_bkey_last(b, t) && - !btree_iter_pos_cmp_packed(b, &search, m, strictly_greater)) + bkey_iter_pos_cmp(b, search, m) > 0) m = bkey_next(m); - if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) { + if (btree_keys_expensive_checks(b)) { struct bkey_packed *prev = bch2_bkey_prev_all(b, t, m); BUG_ON(prev && - btree_iter_pos_cmp_p_or_unp(b, search, packed_search, - prev, strictly_greater)); + bkey_iter_cmp_p_or_unp(b, search, packed_search, + prev) <= 0); } return m; @@ -1467,6 +1454,25 @@ static struct bkey_packed *bch2_bset_search(struct btree *b, /* Btree node iterator */ +static inline void __bch2_btree_node_iter_push(struct btree_node_iter *iter, + struct btree *b, + const struct bkey_packed *k, + const struct bkey_packed *end) +{ + if (k != end) { + struct btree_node_iter_set *pos; + + btree_node_iter_for_each(iter, pos) + ; + + BUG_ON(pos >= iter->data + ARRAY_SIZE(iter->data)); + *pos = (struct btree_node_iter_set) { + __btree_node_key_to_offset(b, k), + __btree_node_key_to_offset(b, end) + }; + } +} + void bch2_btree_node_iter_push(struct btree_node_iter *iter, struct btree *b, const struct bkey_packed *k, @@ -1478,17 +1484,15 @@ void bch2_btree_node_iter_push(struct btree_node_iter *iter, noinline __flatten __attribute__((cold)) static void btree_node_iter_init_pack_failed(struct btree_node_iter *iter, - struct btree *b, struct bpos search, - bool strictly_greater, bool is_extents) + struct btree *b, struct bpos *search) { struct bset_tree *t; - trace_bkey_pack_pos_fail(&search); + trace_bkey_pack_pos_fail(search); for_each_bset(b, t) __bch2_btree_node_iter_push(iter, b, - bch2_bset_search(b, t, search, NULL, NULL, - strictly_greater), + bch2_bset_search(b, t, search, NULL, NULL), btree_bkey_last(b, t)); bch2_btree_node_iter_sort(iter, b); @@ -1535,18 +1539,17 @@ static void btree_node_iter_init_pack_failed(struct btree_node_iter *iter, * past any extents that compare equal to the position we searched for. 
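
bset_search_tree() above descends an auxiliary tree stored in Eytzinger (breadth-first) layout: node n's children sit at indices 2n and 2n+1 of a flat array, so the whole descent is multiply-and-add arithmetic over sequentially prefetchable memory. A minimal sketch over plain ints (assumes size > 1; the bfloat mantissa compression and slowpath are left out):

/* tree[] holds entries at indices 1..size-1, in BFS (Eytzinger) order */
static unsigned eytzinger1_search(const int *tree, unsigned size, int key)
{
        unsigned n = 1;

        do {
                /* go left (2n) if tree[n] >= key, else right (2n + 1) */
                n = n * 2 + (tree[n] < key);
        } while (n < size);

        /*
         * n now encodes the descent path; shifting off the last step and
         * converting BFS rank back to array order (what
         * __eytzinger1_to_inorder() does above) yields the candidate slot.
         */
        return n >> 1;
}
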
*/ void bch2_btree_node_iter_init(struct btree_node_iter *iter, - struct btree *b, struct bpos search, - bool strictly_greater, bool is_extents) + struct btree *b, struct bpos *search) { struct bset_tree *t; struct bkey_packed p, *packed_search = NULL; - EBUG_ON(bkey_cmp(search, b->data->min_key) < 0); + EBUG_ON(bkey_cmp(*search, b->data->min_key) < 0); bset_aux_tree_verify(b); - __bch2_btree_node_iter_init(iter, is_extents); + memset(iter, 0, sizeof(*iter)); - switch (bch2_bkey_pack_pos_lossy(&p, search, b)) { + switch (bch2_bkey_pack_pos_lossy(&p, *search, b)) { case BKEY_PACK_POS_EXACT: packed_search = &p; break; @@ -1554,28 +1557,25 @@ void bch2_btree_node_iter_init(struct btree_node_iter *iter, packed_search = NULL; break; case BKEY_PACK_POS_FAIL: - btree_node_iter_init_pack_failed(iter, b, search, - strictly_greater, is_extents); + btree_node_iter_init_pack_failed(iter, b, search); return; } for_each_bset(b, t) __bch2_btree_node_iter_push(iter, b, bch2_bset_search(b, t, search, - packed_search, &p, - strictly_greater), + packed_search, &p), btree_bkey_last(b, t)); bch2_btree_node_iter_sort(iter, b); } void bch2_btree_node_iter_init_from_start(struct btree_node_iter *iter, - struct btree *b, - bool is_extents) + struct btree *b) { struct bset_tree *t; - __bch2_btree_node_iter_init(iter, is_extents); + memset(iter, 0, sizeof(*iter)); for_each_bset(b, t) __bch2_btree_node_iter_push(iter, b, @@ -1603,7 +1603,7 @@ static inline bool btree_node_iter_sort_two(struct btree_node_iter *iter, { bool ret; - if ((ret = (btree_node_iter_cmp(iter, b, + if ((ret = (btree_node_iter_cmp(b, iter->data[first], iter->data[first + 1]) > 0))) swap(iter->data[first], iter->data[first + 1]); @@ -1658,26 +1658,18 @@ static inline void __bch2_btree_node_iter_advance(struct btree_node_iter *iter, btree_node_iter_sort_two(iter, b, 1); } -/** - * bch_btree_node_iter_advance - advance @iter by one key - * - * Doesn't do debugchecks - for cases where (insert_fixup_extent()) a bset might - * momentarily have out of order extents. 
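
The node iterator being (re)initialized above is a tiny k-way merge: one (pos, end) cursor per bset, kept sorted by the key each cursor points at, so peek is just slot 0 and advancing only needs to re-sort the first pairs (btree_node_iter_sort_two() above). The same machinery over plain int runs, with hypothetical names:

#define MAX_SETS 3      /* MAX_BSETS plays this role in the real code */

struct set_cursor { const int *pos, *end; };

struct miter {
        struct set_cursor data[MAX_SETS];       /* sorted by *pos ascending */
        unsigned used;
};

static void miter_sort_two(struct miter *it, unsigned i)
{
        if (i + 1 < it->used && *it->data[i + 1].pos < *it->data[i].pos) {
                struct set_cursor tmp = it->data[i];

                it->data[i] = it->data[i + 1];
                it->data[i + 1] = tmp;
        }
}

static const int *miter_peek(struct miter *it)
{
        return it->used ? it->data[0].pos : NULL;
}

static void miter_advance(struct miter *it)     /* requires it->used != 0 */
{
        unsigned i;

        if (++it->data[0].pos == it->data[0].end) {
                /* this set is exhausted: shift the remaining cursors down */
                for (i = 0; i + 1 < it->used; i++)
                        it->data[i] = it->data[i + 1];
                it->used--;
                return;
        }

        /* with three sets, sinking the front cursor two slots restores order */
        miter_sort_two(it, 0);
        miter_sort_two(it, 1);
}
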
- */ void bch2_btree_node_iter_advance(struct btree_node_iter *iter, struct btree *b) { -#ifdef CONFIG_BCACHEFS_DEBUG - struct bkey_packed *k = bch2_btree_node_iter_peek_all(iter, b); + if (btree_keys_expensive_checks(b)) { + bch2_btree_node_iter_verify(iter, b); + bch2_btree_node_iter_next_check(iter, b); + } __bch2_btree_node_iter_advance(iter, b); - bch2_btree_node_iter_next_check(iter, b, k); -#else - __bch2_btree_node_iter_advance(iter, b); -#endif } -static inline bool __btree_node_iter_used(struct btree_node_iter *iter) +static inline unsigned __btree_node_iter_used(struct btree_node_iter *iter) { unsigned n = ARRAY_SIZE(iter->data); @@ -1690,67 +1682,65 @@ static inline bool __btree_node_iter_used(struct btree_node_iter *iter) /* * Expensive: */ -struct bkey_packed *bch2_btree_node_iter_prev_all(struct btree_node_iter *iter, - struct btree *b) +struct bkey_packed *bch2_btree_node_iter_prev_filter(struct btree_node_iter *iter, + struct btree *b, + unsigned min_key_type) { struct bkey_packed *k, *prev = NULL; + struct bkey_packed *orig_pos = bch2_btree_node_iter_peek_all(iter, b); struct btree_node_iter_set *set; struct bset_tree *t; - struct bset_tree *prev_t; - unsigned end, used; + unsigned end = 0; bch2_btree_node_iter_verify(iter, b); for_each_bset(b, t) { - k = bch2_bkey_prev_all(b, t, - bch2_btree_node_iter_bset_pos(iter, b, t)); + k = bch2_bkey_prev_filter(b, t, + bch2_btree_node_iter_bset_pos(iter, b, t), + min_key_type); if (k && - (!prev || __btree_node_iter_cmp(iter->is_extents, b, - k, prev) > 0)) { + (!prev || bkey_iter_cmp(b, k, prev) > 0)) { prev = k; - prev_t = t; + end = t->end_offset; } } if (!prev) - return NULL; + goto out; /* * We're manually memmoving instead of just calling sort() to ensure the * prev we picked ends up in slot 0 - sort won't necessarily put it * there because of duplicate deleted keys: */ - end = __btree_node_key_to_offset(b, btree_bkey_last(b, prev_t)); btree_node_iter_for_each(iter, set) - if (set->end == end) { - memmove(&iter->data[1], - &iter->data[0], - (void *) set - (void *) &iter->data[0]); - goto out; - } + if (set->end == end) + goto found; - used = __btree_node_iter_used(iter); - BUG_ON(used >= ARRAY_SIZE(iter->data)); + BUG_ON(set != &iter->data[__btree_node_iter_used(iter)]); +found: + BUG_ON(set >= iter->data + ARRAY_SIZE(iter->data)); memmove(&iter->data[1], &iter->data[0], - (void *) &iter->data[used] - (void *) &iter->data[0]); -out: + (void *) set - (void *) &iter->data[0]); + iter->data[0].k = __btree_node_key_to_offset(b, prev); iter->data[0].end = end; - return prev; -} +out: + if (btree_keys_expensive_checks(b)) { + struct btree_node_iter iter2 = *iter; -struct bkey_packed *bch2_btree_node_iter_prev(struct btree_node_iter *iter, - struct btree *b) -{ - struct bkey_packed *k; + if (prev) + __bch2_btree_node_iter_advance(&iter2, b); - do { - k = bch2_btree_node_iter_prev_all(iter, b); - } while (k && bkey_deleted(k)); + while ((k = bch2_btree_node_iter_peek_all(&iter2, b)) != orig_pos) { + BUG_ON(k->type >= min_key_type); + __bch2_btree_node_iter_advance(&iter2, b); + } + } - return k; + return prev; } struct bkey_s_c bch2_btree_node_iter_peek_unpack(struct btree_node_iter *iter, @@ -1795,69 +1785,73 @@ void bch2_btree_keys_stats(struct btree *b, struct bset_stats *stats) } } -int bch2_bkey_print_bfloat(struct btree *b, struct bkey_packed *k, - char *buf, size_t size) +void bch2_bfloat_to_text(struct printbuf *out, struct btree *b, + struct bkey_packed *k) { struct bset_tree *t = bch2_bkey_to_bset(b, k); struct bkey_packed 
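
bch2_btree_node_iter_prev_filter() above steps the multi-set iterator backwards by taking the predecessor within each bset and keeping the greatest one (compared with bkey_iter_cmp()), then moving the winning set into slot 0. The core idea with plain-int stand-ins:

/* pos[i]/first[i] delimit what set i has already consumed */
static const int *multi_prev(const int *pos[], const int *first[],
                             unsigned nr_sets, unsigned *winner)
{
        const int *best = NULL;
        unsigned i;

        for (i = 0; i < nr_sets; i++) {
                if (pos[i] == first[i])
                        continue;       /* nothing before pos in this set */

                if (!best || *(pos[i] - 1) > *best) {
                        best = pos[i] - 1;
                        *winner = i;    /* set whose cursor moves back */
                }
        }

        return best;    /* caller then rewinds pos[*winner] */
}
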
*l, *r, *p; struct bkey uk, up; char buf1[200], buf2[200]; - unsigned j; + unsigned j, inorder; - if (!size) - return 0; + if (out->pos != out->end) + *out->pos = '\0'; if (!bset_has_ro_aux_tree(t)) - goto out; + return; - j = __inorder_to_eytzinger1(bkey_to_cacheline(b, t, k), t->size, t->extra); - if (j && - j < t->size && - k == tree_to_bkey(b, t, j)) - switch (bkey_float(b, t, j)->exponent) { - case BFLOAT_FAILED_UNPACKED: - uk = bkey_unpack_key(b, k); - return scnprintf(buf, size, - " failed unpacked at depth %u\n" - "\t%llu:%llu\n", - ilog2(j), - uk.p.inode, uk.p.offset); - case BFLOAT_FAILED_PREV: - p = tree_to_prev_bkey(b, t, j); - l = is_power_of_2(j) - ? btree_bkey_first(b, t) - : tree_to_prev_bkey(b, t, j >> ffs(j)); - r = is_power_of_2(j + 1) - ? bch2_bkey_prev_all(b, t, btree_bkey_last(b, t)) - : tree_to_bkey(b, t, j >> (ffz(j) + 1)); - - up = bkey_unpack_key(b, p); - uk = bkey_unpack_key(b, k); - bch2_to_binary(buf1, high_word(&b->format, p), b->nr_key_bits); - bch2_to_binary(buf2, high_word(&b->format, k), b->nr_key_bits); - - return scnprintf(buf, size, - " failed prev at depth %u\n" - "\tkey starts at bit %u but first differing bit at %u\n" - "\t%llu:%llu\n" - "\t%llu:%llu\n" - "\t%s\n" - "\t%s\n", - ilog2(j), - bch2_bkey_greatest_differing_bit(b, l, r), - bch2_bkey_greatest_differing_bit(b, p, k), - uk.p.inode, uk.p.offset, - up.p.inode, up.p.offset, - buf1, buf2); - case BFLOAT_FAILED_OVERFLOW: - uk = bkey_unpack_key(b, k); - return scnprintf(buf, size, - " failed overflow at depth %u\n" - "\t%llu:%llu\n", - ilog2(j), - uk.p.inode, uk.p.offset); - } -out: - *buf = '\0'; - return 0; + inorder = bkey_to_cacheline(b, t, k); + if (!inorder || inorder >= t->size) + return; + + j = __inorder_to_eytzinger1(inorder, t->size, t->extra); + if (k != tree_to_bkey(b, t, j)) + return; + + switch (bkey_float(b, t, j)->exponent) { + case BFLOAT_FAILED_UNPACKED: + uk = bkey_unpack_key(b, k); + pr_buf(out, + " failed unpacked at depth %u\n" + "\t%llu:%llu\n", + ilog2(j), + uk.p.inode, uk.p.offset); + break; + case BFLOAT_FAILED_PREV: + p = tree_to_prev_bkey(b, t, j); + l = is_power_of_2(j) + ? btree_bkey_first(b, t) + : tree_to_prev_bkey(b, t, j >> ffs(j)); + r = is_power_of_2(j + 1) + ? 
bch2_bkey_prev_all(b, t, btree_bkey_last(b, t)) + : tree_to_bkey(b, t, j >> (ffz(j) + 1)); + + up = bkey_unpack_key(b, p); + uk = bkey_unpack_key(b, k); + bch2_to_binary(buf1, high_word(&b->format, p), b->nr_key_bits); + bch2_to_binary(buf2, high_word(&b->format, k), b->nr_key_bits); + + pr_buf(out, + " failed prev at depth %u\n" + "\tkey starts at bit %u but first differing bit at %u\n" + "\t%llu:%llu\n" + "\t%llu:%llu\n" + "\t%s\n" + "\t%s\n", + ilog2(j), + bch2_bkey_greatest_differing_bit(b, l, r), + bch2_bkey_greatest_differing_bit(b, p, k), + uk.p.inode, uk.p.offset, + up.p.inode, up.p.offset, + buf1, buf2); + break; + case BFLOAT_FAILED_OVERFLOW: + uk = bkey_unpack_key(b, k); + pr_buf(out, + " failed overflow at depth %u\n" + "\t%llu:%llu\n", + ilog2(j), + uk.p.inode, uk.p.offset); + break; + } } diff --git a/fs/bcachefs/bset.h b/fs/bcachefs/bset.h index 153e2b3f787f..17c239947300 100644 --- a/fs/bcachefs/bset.h +++ b/fs/bcachefs/bset.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _BCACHEFS_BSET_H #define _BCACHEFS_BSET_H @@ -342,8 +343,7 @@ void bch2_bset_init_first(struct btree *, struct bset *); void bch2_bset_init_next(struct bch_fs *, struct btree *, struct btree_node_entry *); void bch2_bset_build_aux_tree(struct btree *, struct bset_tree *, bool); -void bch2_bset_fix_invalidated_key(struct btree *, struct bset_tree *, - struct bkey_packed *); +void bch2_bset_fix_invalidated_key(struct btree *, struct bkey_packed *); void bch2_bset_insert(struct btree *, struct btree_node_iter *, struct bkey_packed *, struct bkey_i *, unsigned); @@ -368,35 +368,22 @@ static inline int bkey_cmp_p_or_unp(const struct btree *b, return __bch2_bkey_cmp_left_packed_format_checked(b, l, r); } -/* Returns true if @k is after iterator position @pos */ -static inline bool btree_iter_pos_cmp_packed(const struct btree *b, - struct bpos *pos, - const struct bkey_packed *k, - bool strictly_greater) -{ - int cmp = bkey_cmp_left_packed(b, k, pos); +struct bset_tree *bch2_bkey_to_bset(struct btree *, struct bkey_packed *); - return cmp > 0 || - (cmp == 0 && !strictly_greater && !bkey_deleted(k)); -} +struct bkey_packed *bch2_bkey_prev_filter(struct btree *, struct bset_tree *, + struct bkey_packed *, unsigned); -static inline bool btree_iter_pos_cmp_p_or_unp(const struct btree *b, - struct bpos pos, - const struct bkey_packed *pos_packed, - const struct bkey_packed *k, - bool strictly_greater) +static inline struct bkey_packed * +bch2_bkey_prev_all(struct btree *b, struct bset_tree *t, struct bkey_packed *k) { - int cmp = bkey_cmp_p_or_unp(b, k, pos_packed, &pos); - - return cmp > 0 || - (cmp == 0 && !strictly_greater && !bkey_deleted(k)); + return bch2_bkey_prev_filter(b, t, k, 0); } -struct bset_tree *bch2_bkey_to_bset(struct btree *, struct bkey_packed *); -struct bkey_packed *bch2_bkey_prev_all(struct btree *, struct bset_tree *, - struct bkey_packed *); -struct bkey_packed *bch2_bkey_prev(struct btree *, struct bset_tree *, - struct bkey_packed *); +static inline struct bkey_packed * +bch2_bkey_prev(struct btree *b, struct bset_tree *t, struct bkey_packed *k) +{ + return bch2_bkey_prev_filter(b, t, k, KEY_TYPE_discard + 1); +} enum bch_extent_overlap { BCH_EXTENT_OVERLAP_ALL = 0, @@ -407,7 +394,7 @@ enum bch_extent_overlap { /* Returns how k overlaps with m */ static inline enum bch_extent_overlap bch2_extent_overlap(const struct bkey *k, - const struct bkey *m) + const struct bkey *m) { int cmp1 = bkey_cmp(k->p, m->p) < 0; int cmp2 = bkey_cmp(bkey_start_pos(k), @@ -418,20 +405,13 @@ 
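
A large mechanical part of this patch converts (char *buf, size_t size) interfaces like the old bch2_bkey_print_bfloat() to the printbuf style used above. A guess at the minimal core of that API, for orientation (the real struct printbuf and PBUF() live in bcachefs's utility headers and may differ in detail):

#include <stdarg.h>
#include <stdio.h>

struct printbuf {
        char *pos;
        char *end;
};

/* only valid on arrays, since it relies on sizeof */
#define PBUF(_buf) ((struct printbuf) {                 \
        .pos    = (_buf),                               \
        .end    = (_buf) + sizeof(_buf),                \
})

static void pr_buf(struct printbuf *out, const char *fmt, ...)
{
        va_list args;

        va_start(args, fmt);
        out->pos += vsnprintf(out->pos, out->end - out->pos, fmt, args);
        va_end(args);

        if (out->pos > out->end)        /* vsnprintf() returns untruncated len */
                out->pos = out->end;
}

/* usage, as in bch2_bfloat_to_text() above:
 *      char buf[100];
 *      bch2_bkey_to_text(&PBUF(buf), &k);
 */
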
static inline enum bch_extent_overlap bch2_extent_overlap(const struct bkey *k, /* Btree key iteration */ -static inline void __bch2_btree_node_iter_init(struct btree_node_iter *iter, - bool is_extents) -{ - iter->is_extents = is_extents; - memset(iter->data, 0, sizeof(iter->data)); -} - void bch2_btree_node_iter_push(struct btree_node_iter *, struct btree *, const struct bkey_packed *, const struct bkey_packed *); void bch2_btree_node_iter_init(struct btree_node_iter *, struct btree *, - struct bpos, bool, bool); + struct bpos *); void bch2_btree_node_iter_init_from_start(struct btree_node_iter *, - struct btree *, bool); + struct btree *); struct bkey_packed *bch2_btree_node_iter_bset_pos(struct btree_node_iter *, struct btree *, struct bset_tree *); @@ -458,51 +438,46 @@ static inline bool bch2_btree_node_iter_end(struct btree_node_iter *iter) return __btree_node_iter_set_end(iter, 0); } -static inline int __btree_node_iter_cmp(bool is_extents, - struct btree *b, - struct bkey_packed *l, - struct bkey_packed *r) +/* + * When keys compare equal, deleted keys compare first: + * + * XXX: only need to compare pointers for keys that are both within a + * btree_node_iterator - we need to break ties for prev() to work correctly + */ +static inline int bkey_iter_cmp(struct btree *b, + const struct bkey_packed *l, + const struct bkey_packed *r) { - /* - * For non extents, when keys compare equal the deleted keys have to - * come first - so that bch2_btree_node_iter_next_check() can detect - * duplicate nondeleted keys (and possibly other reasons?) - * - * For extents, bkey_deleted() is used as a proxy for k->size == 0, so - * deleted keys have to sort last. - */ - return bkey_cmp_packed(b, l, r) ?: is_extents - ? (int) bkey_deleted(l) - (int) bkey_deleted(r) - : (int) bkey_deleted(r) - (int) bkey_deleted(l); + return bkey_cmp_packed(b, l, r) + ?: (int) bkey_deleted(r) - (int) bkey_deleted(l) + ?: cmp_int(l, r); } -static inline int btree_node_iter_cmp(struct btree_node_iter *iter, - struct btree *b, +static inline int btree_node_iter_cmp(struct btree *b, struct btree_node_iter_set l, struct btree_node_iter_set r) { - return __btree_node_iter_cmp(iter->is_extents, b, + return bkey_iter_cmp(b, __btree_node_offset_to_key(b, l.k), __btree_node_offset_to_key(b, r.k)); } -static inline void __bch2_btree_node_iter_push(struct btree_node_iter *iter, - struct btree *b, - const struct bkey_packed *k, - const struct bkey_packed *end) +/* These assume l (the search key) is not a deleted key: */ +static inline int bkey_iter_pos_cmp(struct btree *b, + struct bpos *l, + const struct bkey_packed *r) { - if (k != end) { - struct btree_node_iter_set *pos; - - btree_node_iter_for_each(iter, pos) - ; + return -bkey_cmp_left_packed(b, r, l) + ?: (int) bkey_deleted(r); +} - BUG_ON(pos >= iter->data + ARRAY_SIZE(iter->data)); - *pos = (struct btree_node_iter_set) { - __btree_node_key_to_offset(b, k), - __btree_node_key_to_offset(b, end) - }; - } +static inline int bkey_iter_cmp_p_or_unp(struct btree *b, + struct bpos *l, + const struct bkey_packed *l_packed, + const struct bkey_packed *r) +{ + return -bkey_cmp_p_or_unp(b, r, l_packed, l) + ?: (int) bkey_deleted(r); } static inline struct bkey_packed * @@ -513,24 +488,33 @@ __bch2_btree_node_iter_peek_all(struct btree_node_iter *iter, } static inline struct bkey_packed * +bch2_btree_node_iter_peek_filter(struct btree_node_iter *iter, + struct btree *b, + unsigned min_key_type) +{ + while (!bch2_btree_node_iter_end(iter)) { + struct bkey_packed *k = 
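
bkey_iter_cmp() above chains comparators with the GNU a ?: b extension (evaluate to a unless it is zero, else b), ending in an address comparison so that equal keys within one node still have a deterministic total order, which prev() relies on to break ties. The pattern in isolation (gcc/clang only; illustrative types):

/* -1, 0 or 1 without overflow, same idea as the kernel's cmp_int() */
#define cmp_int(l, r)   (((l) > (r)) - ((l) < (r)))

struct item {
        int             key;
        unsigned        deleted;
};

static int item_iter_cmp(const struct item *l, const struct item *r)
{
        return cmp_int(l->key, r->key)                  /* primary order */
                ?: (int) r->deleted - (int) l->deleted  /* deleted sort first */
                ?: cmp_int(l, r);                       /* address tie-break */
}

The address tie-break is only meaningful for keys in the same node, which is exactly the situation the comment above restricts it to.
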
__bch2_btree_node_iter_peek_all(iter, b); + + if (k->type >= min_key_type) + return k; + + bch2_btree_node_iter_advance(iter, b); + } + + return NULL; +} + +static inline struct bkey_packed * bch2_btree_node_iter_peek_all(struct btree_node_iter *iter, struct btree *b) { - return bch2_btree_node_iter_end(iter) - ? NULL - : __bch2_btree_node_iter_peek_all(iter, b); + return bch2_btree_node_iter_peek_filter(iter, b, 0); } static inline struct bkey_packed * bch2_btree_node_iter_peek(struct btree_node_iter *iter, struct btree *b) { - struct bkey_packed *ret; - - while ((ret = bch2_btree_node_iter_peek_all(iter, b)) && - bkey_deleted(ret)) - bch2_btree_node_iter_advance(iter, b); - - return ret; + return bch2_btree_node_iter_peek_filter(iter, b, KEY_TYPE_discard + 1); } static inline struct bkey_packed * @@ -544,26 +528,27 @@ bch2_btree_node_iter_next_all(struct btree_node_iter *iter, struct btree *b) return ret; } -struct bkey_packed *bch2_btree_node_iter_prev_all(struct btree_node_iter *, - struct btree *); -struct bkey_packed *bch2_btree_node_iter_prev(struct btree_node_iter *, - struct btree *); +struct bkey_packed *bch2_btree_node_iter_prev_filter(struct btree_node_iter *, + struct btree *, unsigned); -/* - * Iterates over all _live_ keys - skipping deleted (and potentially - * overlapping) keys - */ -#define for_each_btree_node_key(b, k, iter, _is_extents) \ - for (bch2_btree_node_iter_init_from_start((iter), (b), (_is_extents));\ - ((k) = bch2_btree_node_iter_peek(iter, b)); \ - bch2_btree_node_iter_advance(iter, b)) +static inline struct bkey_packed * +bch2_btree_node_iter_prev_all(struct btree_node_iter *iter, struct btree *b) +{ + return bch2_btree_node_iter_prev_filter(iter, b, 0); +} + +static inline struct bkey_packed * +bch2_btree_node_iter_prev(struct btree_node_iter *iter, struct btree *b) +{ + return bch2_btree_node_iter_prev_filter(iter, b, KEY_TYPE_discard + 1); +} struct bkey_s_c bch2_btree_node_iter_peek_unpack(struct btree_node_iter *, struct btree *, struct bkey *); -#define for_each_btree_node_key_unpack(b, k, iter, _is_extents, unpacked)\ - for (bch2_btree_node_iter_init_from_start((iter), (b), (_is_extents));\ +#define for_each_btree_node_key_unpack(b, k, iter, unpacked) \ + for (bch2_btree_node_iter_init_from_start((iter), (b)); \ (k = bch2_btree_node_iter_peek_unpack((iter), (b), (unpacked))).k;\ bch2_btree_node_iter_advance(iter, b)) @@ -588,6 +573,13 @@ static inline void btree_keys_account_key(struct btree_nr_keys *n, #define btree_keys_account_key_drop(_nr, _bset_idx, _k) \ btree_keys_account_key(_nr, _bset_idx, _k, -1) +#define btree_account_key_add(_b, _k) \ + btree_keys_account_key(&(_b)->nr, \ + bch2_bkey_to_bset(_b, _k) - (_b)->set, _k, 1) +#define btree_account_key_drop(_b, _k) \ + btree_keys_account_key(&(_b)->nr, \ + bch2_bkey_to_bset(_b, _k) - (_b)->set, _k, -1) + struct bset_stats { struct { size_t nr, bytes; @@ -600,8 +592,8 @@ struct bset_stats { }; void bch2_btree_keys_stats(struct btree *, struct bset_stats *); -int bch2_bkey_print_bfloat(struct btree *, struct bkey_packed *, - char *, size_t); +void bch2_bfloat_to_text(struct printbuf *, struct btree *, + struct bkey_packed *); /* Debug stuff */ @@ -613,17 +605,18 @@ void bch2_dump_btree_node_iter(struct btree *, struct btree_node_iter *); void __bch2_verify_btree_nr_keys(struct btree *); void bch2_btree_node_iter_verify(struct btree_node_iter *, struct btree *); -void bch2_verify_key_order(struct btree *, struct btree_node_iter *, - struct bkey_packed *); +void bch2_verify_insert_pos(struct btree 
*, struct bkey_packed *, + struct bkey_packed *, unsigned); #else static inline void __bch2_verify_btree_nr_keys(struct btree *b) {} static inline void bch2_btree_node_iter_verify(struct btree_node_iter *iter, struct btree *b) {} -static inline void bch2_verify_key_order(struct btree *b, - struct btree_node_iter *iter, - struct bkey_packed *where) {} +static inline void bch2_verify_insert_pos(struct btree *b, + struct bkey_packed *where, + struct bkey_packed *insert, + unsigned clobber_u64s) {} #endif static inline void bch2_verify_btree_nr_keys(struct btree *b) diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index c950f2564f25..046524c8d5ea 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" #include "btree_cache.h" @@ -5,20 +6,18 @@ #include "btree_iter.h" #include "btree_locking.h" #include "debug.h" -#include "extents.h" #include <linux/prefetch.h> +#include <linux/sched/mm.h> #include <trace/events/bcachefs.h> -#define DEF_BTREE_ID(kwd, val, name) name, - const char * const bch2_btree_ids[] = { - DEFINE_BCH_BTREE_IDS() +#define x(kwd, val, name) name, + BCH_BTREE_IDS() +#undef x NULL }; -#undef DEF_BTREE_ID - void bch2_recalc_btree_reserve(struct bch_fs *c) { unsigned i, reserve = 16; @@ -99,7 +98,7 @@ static struct btree *btree_node_mem_alloc(struct bch_fs *c, gfp_t gfp) if (!b) return NULL; - bkey_extent_init(&b->key); + bkey_btree_ptr_init(&b->key); six_lock_init(&b->lock); INIT_LIST_HEAD(&b->list); INIT_LIST_HEAD(&b->write_blocked); @@ -115,7 +114,7 @@ void bch2_btree_node_hash_remove(struct btree_cache *bc, struct btree *b) rhashtable_remove_fast(&bc->table, &b->hash, bch_btree_cache_params); /* Cause future lookups for this node to fail: */ - bkey_i_to_extent(&b->key)->v._data[0] = 0; + PTR_HASH(&b->key) = 0; } int __bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b) @@ -172,6 +171,10 @@ static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush) if (!btree_node_may_write(b)) goto out_unlock; + if (btree_node_dirty(b) && + test_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags)) + goto out_unlock; + if (btree_node_dirty(b) || btree_node_write_in_flight(b) || btree_node_read_in_flight(b)) { @@ -506,7 +509,9 @@ struct btree *bch2_btree_node_mem_alloc(struct bch_fs *c) struct btree_cache *bc = &c->btree_cache; struct btree *b; u64 start_time = local_clock(); + unsigned flags; + flags = memalloc_nofs_save(); mutex_lock(&bc->lock); /* @@ -544,6 +549,7 @@ out_unlock: list_del_init(&b->list); mutex_unlock(&bc->lock); + memalloc_nofs_restore(flags); out: b->flags = 0; b->written = 0; @@ -577,10 +583,11 @@ err: /* Slowpath, don't want it inlined into btree_iter_traverse() */ static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c, - struct btree_iter *iter, - const struct bkey_i *k, - unsigned level, - enum six_lock_type lock_type) + struct btree_iter *iter, + const struct bkey_i *k, + unsigned level, + enum six_lock_type lock_type, + bool sync) { struct btree_cache *bc = &c->btree_cache; struct btree *b; @@ -590,6 +597,7 @@ static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c, * been freed: */ BUG_ON(!btree_node_locked(iter, level + 1)); + BUG_ON(level >= BTREE_MAX_DEPTH); b = bch2_btree_node_mem_alloc(c); if (IS_ERR(b)) @@ -600,7 +608,7 @@ static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c, /* raced with another fill: */ /* mark as unhashed... 
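
bch2_btree_node_mem_alloc() above now brackets its allocations with the scoped NOFS API from linux/sched/mm.h instead of threading GFP flags through every callee: any allocation inside the scope implicitly behaves as GFP_NOFS, so memory reclaim cannot recurse back into the filesystem while btree cache locks are held. A minimal sketch of the usage pattern:

#include <linux/sched/mm.h>
#include <linux/slab.h>

static void *alloc_under_fs_locks(size_t size)
{
        unsigned flags = memalloc_nofs_save();
        void *p;

        /* this kmalloc() will not re-enter the FS via reclaim */
        p = kmalloc(size, GFP_KERNEL);

        memalloc_nofs_restore(flags);
        return p;
}
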
*/ - bkey_i_to_extent(&b->key)->v._data[0] = 0; + PTR_HASH(&b->key) = 0; mutex_lock(&bc->lock); list_add(&b->list, &bc->freeable); @@ -623,9 +631,15 @@ static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c, if (btree_node_read_locked(iter, level + 1)) btree_node_unlock(iter, level + 1); - bch2_btree_node_read(c, b, true); + bch2_btree_node_read(c, b, sync); + six_unlock_write(&b->lock); + if (!sync) { + six_unlock_intent(&b->lock); + return NULL; + } + if (lock_type == SIX_LOCK_read) six_lock_downgrade(&b->lock); @@ -649,7 +663,14 @@ struct btree *bch2_btree_node_get(struct bch_fs *c, struct btree_iter *iter, struct btree *b; struct bset_tree *t; - /* btree_node_fill() requires parent to be locked: */ + /* + * XXX: locking optimization + * + * we can make the locking looser here - caller can drop lock on parent + * node before locking child node (and potentially blocking): we just + * have to have bch2_btree_node_fill() call relock on the parent and + * return -EINTR if that fails + */ EBUG_ON(!btree_node_locked(iter, level + 1)); EBUG_ON(level >= BTREE_MAX_DEPTH); retry: @@ -663,7 +684,7 @@ retry: * else we could read in a btree node from disk that's been * freed: */ - b = bch2_btree_node_fill(c, iter, k, level, lock_type); + b = bch2_btree_node_fill(c, iter, k, level, lock_type, true); /* We raced and found the btree node in the cache */ if (!b) @@ -713,6 +734,7 @@ retry: if (bch2_btree_node_relock(iter, level + 1)) goto retry; + trace_trans_restart_btree_node_reused(iter->trans->ip); return ERR_PTR(-EINTR); } } @@ -751,11 +773,12 @@ struct btree *bch2_btree_node_get_sibling(struct bch_fs *c, struct btree *b, enum btree_node_sibling sib) { + struct btree_trans *trans = iter->trans; struct btree *parent; struct btree_node_iter node_iter; struct bkey_packed *k; BKEY_PADDED(k) tmp; - struct btree *ret; + struct btree *ret = NULL; unsigned level = b->level; parent = btree_iter_node(iter, level + 1); @@ -763,8 +786,8 @@ struct btree *bch2_btree_node_get_sibling(struct bch_fs *c, return NULL; if (!bch2_btree_node_relock(iter, level + 1)) { - bch2_btree_iter_set_locks_want(iter, level + 2); - return ERR_PTR(-EINTR); + ret = ERR_PTR(-EINTR); + goto out; } node_iter = iter->l[parent->level].iter; @@ -772,48 +795,87 @@ struct btree *bch2_btree_node_get_sibling(struct bch_fs *c, k = bch2_btree_node_iter_peek_all(&node_iter, parent); BUG_ON(bkey_cmp_left_packed(parent, k, &b->key.k.p)); - do { - k = sib == btree_prev_sib - ? bch2_btree_node_iter_prev_all(&node_iter, parent) - : (bch2_btree_node_iter_advance(&node_iter, parent), - bch2_btree_node_iter_peek_all(&node_iter, parent)); - if (!k) - return NULL; - } while (bkey_deleted(k)); + k = sib == btree_prev_sib + ? 
bch2_btree_node_iter_prev(&node_iter, parent) + : (bch2_btree_node_iter_advance(&node_iter, parent), + bch2_btree_node_iter_peek(&node_iter, parent)); + if (!k) + goto out; bch2_bkey_unpack(parent, &tmp.k, k); - ret = bch2_btree_node_get(c, iter, &tmp.k, level, SIX_LOCK_intent); + ret = bch2_btree_node_get(c, iter, &tmp.k, level, + SIX_LOCK_intent); - if (IS_ERR(ret) && PTR_ERR(ret) == -EINTR) { - btree_node_unlock(iter, level); + if (PTR_ERR_OR_ZERO(ret) == -EINTR && !trans->nounlock) { + struct btree_iter *linked; - if (!bch2_btree_node_relock(iter, level + 1)) { - bch2_btree_iter_set_locks_want(iter, level + 2); - return ERR_PTR(-EINTR); + if (!bch2_btree_node_relock(iter, level + 1)) + goto out; + + /* + * We might have got -EINTR because trylock failed, and we're + * holding other locks that would cause us to deadlock: + */ + trans_for_each_iter(trans, linked) + if (btree_iter_cmp(iter, linked) < 0) + __bch2_btree_iter_unlock(linked); + + if (sib == btree_prev_sib) + btree_node_unlock(iter, level); + + ret = bch2_btree_node_get(c, iter, &tmp.k, level, + SIX_LOCK_intent); + + /* + * before btree_iter_relock() calls btree_iter_verify_locks(): + */ + if (btree_lock_want(iter, level + 1) == BTREE_NODE_UNLOCKED) + btree_node_unlock(iter, level + 1); + + if (!bch2_btree_node_relock(iter, level)) { + btree_iter_set_dirty(iter, BTREE_ITER_NEED_RELOCK); + + if (!IS_ERR(ret)) { + six_unlock_intent(&ret->lock); + ret = ERR_PTR(-EINTR); + } } - ret = bch2_btree_node_get(c, iter, &tmp.k, level, SIX_LOCK_intent); + bch2_trans_relock(trans); } +out: + if (btree_lock_want(iter, level + 1) == BTREE_NODE_UNLOCKED) + btree_node_unlock(iter, level + 1); - if (!bch2_btree_node_relock(iter, level)) { - btree_iter_set_dirty(iter, BTREE_ITER_NEED_RELOCK); + if (PTR_ERR_OR_ZERO(ret) == -EINTR) + bch2_btree_iter_upgrade(iter, level + 2); - if (!IS_ERR(ret)) { - six_unlock_intent(&ret->lock); - ret = ERR_PTR(-EINTR); - } + BUG_ON(!IS_ERR(ret) && !btree_node_locked(iter, level)); + + if (!IS_ERR_OR_NULL(ret)) { + struct btree *n1 = ret, *n2 = b; + + if (sib != btree_prev_sib) + swap(n1, n2); + + BUG_ON(bkey_cmp(btree_type_successor(n1->btree_id, + n1->key.k.p), + n2->data->min_key)); } + bch2_btree_trans_verify_locks(trans); + return ret; } -void bch2_btree_node_prefetch(struct bch_fs *c, const struct bkey_i *k, - unsigned level, enum btree_id btree_id) +void bch2_btree_node_prefetch(struct bch_fs *c, struct btree_iter *iter, + const struct bkey_i *k, unsigned level) { struct btree_cache *bc = &c->btree_cache; struct btree *b; + BUG_ON(!btree_node_locked(iter, level + 1)); BUG_ON(level >= BTREE_MAX_DEPTH); rcu_read_lock(); @@ -823,78 +885,56 @@ void bch2_btree_node_prefetch(struct bch_fs *c, const struct bkey_i *k, if (b) return; - b = bch2_btree_node_mem_alloc(c); - if (IS_ERR(b)) - return; - - bkey_copy(&b->key, k); - if (bch2_btree_node_hash_insert(bc, b, level, btree_id)) { - /* raced with another fill: */ - - /* mark as unhashed... 
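
The -EINTR dance above enforces a global lock ordering: when a trylock taken out of order fails, every lock that sorts after the wanted one is dropped and the operation restarts, rather than blocking and risking deadlock. The rule in miniature, with pthread mutexes ordered by address (illustrative only; assumes locks are normally acquired in ascending address order so held[] stays sorted):

#include <errno.h>
#include <pthread.h>

/* held[] is kept sorted by address, the global lock order */
static int lock_in_order(pthread_mutex_t **held, unsigned *nr_held,
                         pthread_mutex_t *want)
{
        if (pthread_mutex_trylock(want) == 0) {
                held[(*nr_held)++] = want;
                return 0;
        }

        /* blocking now could deadlock: drop everything ordered after @want */
        while (*nr_held && held[*nr_held - 1] > want)
                pthread_mutex_unlock(held[--*nr_held]);

        return -EINTR;  /* caller restarts, reacquiring locks in order */
}
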
*/ - bkey_i_to_extent(&b->key)->v._data[0] = 0; - - mutex_lock(&bc->lock); - list_add(&b->list, &bc->freeable); - mutex_unlock(&bc->lock); - goto out; - } - - bch2_btree_node_read(c, b, false); -out: - six_unlock_write(&b->lock); - six_unlock_intent(&b->lock); + bch2_btree_node_fill(c, iter, k, level, SIX_LOCK_read, false); } -int bch2_print_btree_node(struct bch_fs *c, struct btree *b, - char *buf, size_t len) +void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c, + struct btree *b) { const struct bkey_format *f = &b->format; struct bset_stats stats; - char ptrs[100]; memset(&stats, 0, sizeof(stats)); - bch2_val_to_text(c, BKEY_TYPE_BTREE, ptrs, sizeof(ptrs), - bkey_i_to_s_c(&b->key)); bch2_btree_keys_stats(b, &stats); - return scnprintf(buf, len, - "l %u %llu:%llu - %llu:%llu:\n" - " ptrs: %s\n" - " format: u64s %u fields %u %u %u %u %u\n" - " unpack fn len: %u\n" - " bytes used %zu/%zu (%zu%% full)\n" - " sib u64s: %u, %u (merge threshold %zu)\n" - " nr packed keys %u\n" - " nr unpacked keys %u\n" - " floats %zu\n" - " failed unpacked %zu\n" - " failed prev %zu\n" - " failed overflow %zu\n", - b->level, - b->data->min_key.inode, - b->data->min_key.offset, - b->data->max_key.inode, - b->data->max_key.offset, - ptrs, - f->key_u64s, - f->bits_per_field[0], - f->bits_per_field[1], - f->bits_per_field[2], - f->bits_per_field[3], - f->bits_per_field[4], - b->unpack_fn_len, - b->nr.live_u64s * sizeof(u64), - btree_bytes(c) - sizeof(struct btree_node), - b->nr.live_u64s * 100 / btree_max_u64s(c), - b->sib_u64s[0], - b->sib_u64s[1], - BTREE_FOREGROUND_MERGE_THRESHOLD(c), - b->nr.packed_keys, - b->nr.unpacked_keys, - stats.floats, - stats.failed_unpacked, - stats.failed_prev, - stats.failed_overflow); + pr_buf(out, + "l %u %llu:%llu - %llu:%llu:\n" + " ptrs: ", + b->level, + b->data->min_key.inode, + b->data->min_key.offset, + b->data->max_key.inode, + b->data->max_key.offset); + bch2_val_to_text(out, c, bkey_i_to_s_c(&b->key)); + pr_buf(out, "\n" + " format: u64s %u fields %u %u %u %u %u\n" + " unpack fn len: %u\n" + " bytes used %zu/%zu (%zu%% full)\n" + " sib u64s: %u, %u (merge threshold %zu)\n" + " nr packed keys %u\n" + " nr unpacked keys %u\n" + " floats %zu\n" + " failed unpacked %zu\n" + " failed prev %zu\n" + " failed overflow %zu\n", + f->key_u64s, + f->bits_per_field[0], + f->bits_per_field[1], + f->bits_per_field[2], + f->bits_per_field[3], + f->bits_per_field[4], + b->unpack_fn_len, + b->nr.live_u64s * sizeof(u64), + btree_bytes(c) - sizeof(struct btree_node), + b->nr.live_u64s * 100 / btree_max_u64s(c), + b->sib_u64s[0], + b->sib_u64s[1], + BTREE_FOREGROUND_MERGE_THRESHOLD(c), + b->nr.packed_keys, + b->nr.unpacked_keys, + stats.floats, + stats.failed_unpacked, + stats.failed_prev, + stats.failed_overflow); } diff --git a/fs/bcachefs/btree_cache.h b/fs/bcachefs/btree_cache.h index e021d6e9422a..c5873c58439c 100644 --- a/fs/bcachefs/btree_cache.h +++ b/fs/bcachefs/btree_cache.h @@ -1,9 +1,9 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _BCACHEFS_BTREE_CACHE_H #define _BCACHEFS_BTREE_CACHE_H #include "bcachefs.h" #include "btree_types.h" -#include "extents.h" struct btree_iter; @@ -26,22 +26,22 @@ struct btree *bch2_btree_node_get(struct bch_fs *, struct btree_iter *, enum six_lock_type); struct btree *bch2_btree_node_get_sibling(struct bch_fs *, struct btree_iter *, - struct btree *, - enum btree_node_sibling); + struct btree *, enum btree_node_sibling); -void bch2_btree_node_prefetch(struct bch_fs *, const struct bkey_i *, - unsigned, enum btree_id); +void 
bch2_btree_node_prefetch(struct bch_fs *, struct btree_iter *, + const struct bkey_i *, unsigned); void bch2_fs_btree_cache_exit(struct bch_fs *); int bch2_fs_btree_cache_init(struct bch_fs *); void bch2_fs_btree_cache_init_early(struct btree_cache *); -#define PTR_HASH(_k) (bkey_i_to_extent_c(_k)->v._data[0]) +#define PTR_HASH(_k) *((u64 *) &bkey_i_to_btree_ptr_c(_k)->v) /* is btree node in hash table? */ static inline bool btree_node_hashed(struct btree *b) { - return bkey_extent_is_data(&b->key.k) && PTR_HASH(&b->key); + return b->key.k.type == KEY_TYPE_btree_ptr && + PTR_HASH(&b->key); } #define for_each_cached_btree(_b, _c, _tbl, _iter, _pos) \ @@ -84,7 +84,7 @@ static inline unsigned btree_blocks(struct bch_fs *c) #define btree_node_root(_c, _b) ((_c)->btree_roots[(_b)->btree_id].b) -int bch2_print_btree_node(struct bch_fs *, struct btree *, - char *, size_t); +void bch2_btree_node_to_text(struct printbuf *, struct bch_fs *, + struct btree *); #endif /* _BCACHEFS_BTREE_CACHE_H */ diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 02b14e38ffda..a458cfe0e92d 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -1,10 +1,12 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2010 Kent Overstreet <kent.overstreet@gmail.com> * Copyright (C) 2014 Datera Inc. */ #include "bcachefs.h" -#include "alloc.h" +#include "alloc_background.h" +#include "alloc_foreground.h" #include "bkey_methods.h" #include "btree_locking.h" #include "btree_update_interior.h" @@ -13,11 +15,13 @@ #include "buckets.h" #include "clock.h" #include "debug.h" +#include "ec.h" #include "error.h" #include "extents.h" #include "journal.h" #include "keylist.h" #include "move.h" +#include "recovery.h" #include "replicas.h" #include "super-io.h" @@ -30,6 +34,21 @@ #include <linux/sched/task.h> #include <trace/events/bcachefs.h> +static inline void __gc_pos_set(struct bch_fs *c, struct gc_pos new_pos) +{ + write_seqcount_begin(&c->gc_pos_lock); + c->gc_pos = new_pos; + write_seqcount_end(&c->gc_pos_lock); +} + +static inline void gc_pos_set(struct bch_fs *c, struct gc_pos new_pos) +{ + BUG_ON(gc_pos_cmp(new_pos, c->gc_pos) <= 0); + __gc_pos_set(c, new_pos); +} + +/* range_checks - for validating min/max pos of each btree node: */ + struct range_checks { struct range_level { struct bpos min; @@ -89,205 +108,231 @@ static void btree_node_range_checks(struct bch_fs *c, struct btree *b, } } -u8 bch2_btree_key_recalc_oldest_gen(struct bch_fs *c, struct bkey_s_c k) +/* marking of btree keys/nodes: */ + +static int bch2_gc_mark_key(struct bch_fs *c, struct bkey_s_c k, + u8 *max_stale, bool initial) { + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); const struct bch_extent_ptr *ptr; - u8 max_stale = 0; - - if (bkey_extent_is_data(k.k)) { - struct bkey_s_c_extent e = bkey_s_c_to_extent(k); + unsigned flags = + BCH_BUCKET_MARK_GC| + (initial ? 
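
__gc_pos_set() above publishes gc_pos under a seqcount, so readers never take a lock: they snapshot the position, then retry if a writer raced with them. A sketch of the matching lockless read side, using the standard kernel seqcount primitives:

static struct gc_pos gc_pos_read(struct bch_fs *c)
{
        struct gc_pos pos;
        unsigned seq;

        do {
                seq = read_seqcount_begin(&c->gc_pos_lock);
                pos = c->gc_pos;
        } while (read_seqcount_retry(&c->gc_pos_lock, seq));

        return pos;
}
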
BCH_BUCKET_MARK_NOATOMIC : 0); + int ret = 0; - extent_for_each_ptr(e, ptr) { - struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); - size_t b = PTR_BUCKET_NR(ca, ptr); + if (initial) { + BUG_ON(journal_seq_verify(c) && + k.k->version.lo > journal_cur_seq(&c->journal)); - if (gen_after(ca->oldest_gens[b], ptr->gen)) - ca->oldest_gens[b] = ptr->gen; + if (k.k->version.lo > atomic64_read(&c->key_version)) + atomic64_set(&c->key_version, k.k->version.lo); - max_stale = max(max_stale, ptr_stale(ca, ptr)); + if (test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) || + fsck_err_on(!bch2_bkey_replicas_marked(c, k, false), c, + "superblock not marked as containing replicas (type %u)", + k.k->type)) { + ret = bch2_mark_bkey_replicas(c, k); + if (ret) + return ret; } - } - - return max_stale; -} - -/* - * For runtime mark and sweep: - */ -static u8 bch2_gc_mark_key(struct bch_fs *c, enum bkey_type type, - struct bkey_s_c k, unsigned flags) -{ - struct gc_pos pos = { 0 }; - u8 ret = 0; - - switch (type) { - case BKEY_TYPE_BTREE: - bch2_mark_key(c, k, c->opts.btree_node_size, true, pos, NULL, - 0, flags| - BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE| - BCH_BUCKET_MARK_GC_LOCK_HELD); - break; - case BKEY_TYPE_EXTENTS: - bch2_mark_key(c, k, k.k->size, false, pos, NULL, - 0, flags| - BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE| - BCH_BUCKET_MARK_GC_LOCK_HELD); - ret = bch2_btree_key_recalc_oldest_gen(c, k); - break; - default: - BUG(); - } - - return ret; -} - -int bch2_btree_mark_key_initial(struct bch_fs *c, enum bkey_type type, - struct bkey_s_c k) -{ - enum bch_data_type data_type = type == BKEY_TYPE_BTREE - ? BCH_DATA_BTREE : BCH_DATA_USER; - int ret = 0; - - if (test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) || - fsck_err_on(!bch2_bkey_replicas_marked(c, data_type, k), c, - "superblock not marked as containing replicas (type %u)", - data_type)) { - ret = bch2_mark_bkey_replicas(c, data_type, k); - if (ret) - return ret; - } - - switch (k.k->type) { - case BCH_EXTENT: - case BCH_EXTENT_CACHED: { - struct bkey_s_c_extent e = bkey_s_c_to_extent(k); - const struct bch_extent_ptr *ptr; - extent_for_each_ptr(e, ptr) { + bkey_for_each_ptr(ptrs, ptr) { struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); - size_t b = PTR_BUCKET_NR(ca, ptr); - struct bucket *g = PTR_BUCKET(ca, ptr); + struct bucket *g = PTR_BUCKET(ca, ptr, true); + struct bucket *g2 = PTR_BUCKET(ca, ptr, false); - if (mustfix_fsck_err_on(!g->mark.gen_valid, c, + if (mustfix_fsck_err_on(!g->gen_valid, c, "found ptr with missing gen in alloc btree,\n" - "type %s gen %u", - bch2_data_types[data_type], - ptr->gen)) { - g->_mark.gen = ptr->gen; - g->_mark.gen_valid = 1; - set_bit(b, ca->buckets_dirty); + "type %u gen %u", + k.k->type, ptr->gen)) { + g2->_mark.gen = g->_mark.gen = ptr->gen; + g2->_mark.dirty = g->_mark.dirty = true; + g2->gen_valid = g->gen_valid = true; } if (mustfix_fsck_err_on(gen_cmp(ptr->gen, g->mark.gen) > 0, c, - "%s ptr gen in the future: %u > %u", - bch2_data_types[data_type], - ptr->gen, g->mark.gen)) { - g->_mark.gen = ptr->gen; - g->_mark.gen_valid = 1; - set_bit(b, ca->buckets_dirty); + "%u ptr gen in the future: %u > %u", + k.k->type, ptr->gen, g->mark.gen)) { + g2->_mark.gen = g->_mark.gen = ptr->gen; + g2->_mark.dirty = g->_mark.dirty = true; + g2->gen_valid = g->gen_valid = true; set_bit(BCH_FS_FIXED_GENS, &c->flags); } - } - break; - } } - atomic64_set(&c->key_version, - max_t(u64, k.k->version.lo, - atomic64_read(&c->key_version))); + bkey_for_each_ptr(ptrs, ptr) { + struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); + 
struct bucket *g = PTR_BUCKET(ca, ptr, true); - bch2_gc_mark_key(c, type, k, BCH_BUCKET_MARK_NOATOMIC); + if (gen_after(g->oldest_gen, ptr->gen)) + g->oldest_gen = ptr->gen; + + *max_stale = max(*max_stale, ptr_stale(ca, ptr)); + } + + bch2_mark_key(c, k, k.k->size, NULL, 0, flags); fsck_err: return ret; } -static unsigned btree_gc_mark_node(struct bch_fs *c, struct btree *b) +static int btree_gc_mark_node(struct bch_fs *c, struct btree *b, + u8 *max_stale, bool initial) { - enum bkey_type type = btree_node_type(b); struct btree_node_iter iter; struct bkey unpacked; struct bkey_s_c k; - u8 stale = 0; - - if (btree_node_has_ptrs(b)) - for_each_btree_node_key_unpack(b, k, &iter, - btree_node_is_extents(b), - &unpacked) { - bch2_bkey_debugcheck(c, b, k); - stale = max(stale, bch2_gc_mark_key(c, type, k, 0)); - } + int ret = 0; - return stale; -} + *max_stale = 0; -static inline void __gc_pos_set(struct bch_fs *c, struct gc_pos new_pos) -{ - write_seqcount_begin(&c->gc_pos_lock); - c->gc_pos = new_pos; - write_seqcount_end(&c->gc_pos_lock); -} + if (!btree_node_type_needs_gc(btree_node_type(b))) + return 0; -static inline void gc_pos_set(struct bch_fs *c, struct gc_pos new_pos) -{ - BUG_ON(gc_pos_cmp(new_pos, c->gc_pos) <= 0); - __gc_pos_set(c, new_pos); + for_each_btree_node_key_unpack(b, k, &iter, + &unpacked) { + bch2_bkey_debugcheck(c, b, k); + + ret = bch2_gc_mark_key(c, k, max_stale, initial); + if (ret) + break; + } + + return ret; } -static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id) +static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id, + bool initial, bool metadata_only) { - struct btree_iter iter; + struct btree_trans trans; + struct btree_iter *iter; struct btree *b; struct range_checks r; - unsigned depth = btree_id == BTREE_ID_EXTENTS ? 0 : 1; - unsigned max_stale; + unsigned depth = metadata_only ? 1 + : expensive_debug_checks(c) ? 0 + : !btree_node_type_needs_gc(btree_id) ? 
1 + : 0; + u8 max_stale; int ret = 0; - /* - * if expensive_debug_checks is on, run range_checks on all leaf nodes: - */ - if (expensive_debug_checks(c)) - depth = 0; + bch2_trans_init(&trans, c, 0, 0); + + gc_pos_set(c, gc_pos_btree(btree_id, POS_MIN, 0)); btree_node_range_checks_init(&r, depth); - __for_each_btree_node(&iter, c, btree_id, POS_MIN, + __for_each_btree_node(&trans, iter, btree_id, POS_MIN, 0, depth, BTREE_ITER_PREFETCH, b) { btree_node_range_checks(c, b, &r); bch2_verify_btree_nr_keys(b); - max_stale = btree_gc_mark_node(c, b); - gc_pos_set(c, gc_pos_btree_node(b)); - if (max_stale > 64) - bch2_btree_node_rewrite(c, &iter, - b->data->keys.seq, - BTREE_INSERT_USE_RESERVE| - BTREE_INSERT_NOWAIT| - BTREE_INSERT_GC_LOCK_HELD); - else if (!btree_gc_rewrite_disabled(c) && - (btree_gc_always_rewrite(c) || max_stale > 16)) - bch2_btree_node_rewrite(c, &iter, - b->data->keys.seq, - BTREE_INSERT_NOWAIT| - BTREE_INSERT_GC_LOCK_HELD); - - bch2_btree_iter_cond_resched(&iter); + ret = btree_gc_mark_node(c, b, &max_stale, initial); + if (ret) + break; + + if (!initial) { + if (max_stale > 64) + bch2_btree_node_rewrite(c, iter, + b->data->keys.seq, + BTREE_INSERT_USE_RESERVE| + BTREE_INSERT_NOWAIT| + BTREE_INSERT_GC_LOCK_HELD); + else if (!btree_gc_rewrite_disabled(c) && + (btree_gc_always_rewrite(c) || max_stale > 16)) + bch2_btree_node_rewrite(c, iter, + b->data->keys.seq, + BTREE_INSERT_NOWAIT| + BTREE_INSERT_GC_LOCK_HELD); + } + + bch2_trans_cond_resched(&trans); } - ret = bch2_btree_iter_unlock(&iter); + ret = bch2_trans_exit(&trans) ?: ret; if (ret) return ret; mutex_lock(&c->btree_root_lock); - b = c->btree_roots[btree_id].b; if (!btree_node_fake(b)) - bch2_gc_mark_key(c, BKEY_TYPE_BTREE, bkey_i_to_s_c(&b->key), 0); + ret = bch2_gc_mark_key(c, bkey_i_to_s_c(&b->key), + &max_stale, initial); gc_pos_set(c, gc_pos_btree_root(b->btree_id)); - mutex_unlock(&c->btree_root_lock); + + return ret; +} + +static inline int btree_id_gc_phase_cmp(enum btree_id l, enum btree_id r) +{ + return (int) btree_id_to_gc_phase(l) - + (int) btree_id_to_gc_phase(r); +} + +static int mark_journal_key(struct bch_fs *c, enum btree_id id, + struct bkey_i *insert) +{ + struct btree_trans trans; + struct btree_iter *iter; + struct bkey_s_c k; + u8 max_stale; + int ret = 0; + + ret = bch2_gc_mark_key(c, bkey_i_to_s_c(insert), &max_stale, true); + if (ret) + return ret; + + bch2_trans_init(&trans, c, 0, 0); + + for_each_btree_key(&trans, iter, id, bkey_start_pos(&insert->k), + BTREE_ITER_SLOTS, k, ret) { + percpu_down_read(&c->mark_lock); + ret = bch2_mark_overwrite(&trans, iter, k, insert, NULL, + BCH_BUCKET_MARK_GC| + BCH_BUCKET_MARK_NOATOMIC); + percpu_up_read(&c->mark_lock); + + if (!ret) + break; + } + + return bch2_trans_exit(&trans) ?: ret; +} + +static int bch2_gc_btrees(struct bch_fs *c, struct journal_keys *journal_keys, + bool initial, bool metadata_only) +{ + enum btree_id ids[BTREE_ID_NR]; + unsigned i; + + for (i = 0; i < BTREE_ID_NR; i++) + ids[i] = i; + bubble_sort(ids, BTREE_ID_NR, btree_id_gc_phase_cmp); + + for (i = 0; i < BTREE_ID_NR; i++) { + enum btree_id id = ids[i]; + enum btree_node_type type = __btree_node_type(0, id); + + int ret = bch2_gc_btree(c, id, initial, metadata_only); + if (ret) + return ret; + + if (journal_keys && !metadata_only && + btree_node_type_needs_gc(type)) { + struct journal_key *j; + int ret; + + for_each_journal_key(*journal_keys, j) + if (j->btree_id == id) { + ret = mark_journal_key(c, id, j->k); + if (ret) + return ret; + } + } + } + return 0; } @@ -316,9 +361,14 
@@ void bch2_mark_dev_superblock(struct bch_fs *c, struct bch_dev *ca, unsigned i; u64 b; + /* + * This conditional is kind of gross, but we may be called from the + * device add path, before the new device has actually been added to the + * running filesystem: + */ if (c) { lockdep_assert_held(&c->sb_lock); - percpu_down_read_preempt_disable(&c->usage_lock); + percpu_down_read(&c->mark_lock); } for (i = 0; i < layout->nr_superblocks; i++) { @@ -333,9 +383,6 @@ void bch2_mark_dev_superblock(struct bch_fs *c, struct bch_dev *ca, BCH_DATA_SB, flags); } - if (c) - spin_lock(&c->journal.lock); - for (i = 0; i < ca->journal.nr; i++) { b = ca->journal.buckets[i]; bch2_mark_metadata_bucket(c, ca, b, BCH_DATA_JOURNAL, @@ -343,10 +390,8 @@ void bch2_mark_dev_superblock(struct bch_fs *c, struct bch_dev *ca, gc_phase(GC_PHASE_SB), flags); } - if (c) { - percpu_up_read_preempt_enable(&c->usage_lock); - spin_unlock(&c->journal.lock); - } + if (c) + percpu_up_read(&c->mark_lock); } static void bch2_mark_superblocks(struct bch_fs *c) @@ -358,17 +403,13 @@ static void bch2_mark_superblocks(struct bch_fs *c) gc_pos_set(c, gc_phase(GC_PHASE_SB)); for_each_online_member(ca, c, i) - bch2_mark_dev_superblock(c, ca, - BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE| - BCH_BUCKET_MARK_GC_LOCK_HELD); + bch2_mark_dev_superblock(c, ca, BCH_BUCKET_MARK_GC); mutex_unlock(&c->sb_lock); } /* Also see bch2_pending_btree_node_free_insert_done() */ static void bch2_mark_pending_btree_node_frees(struct bch_fs *c) { - struct gc_pos pos = { 0 }; - struct bch_fs_usage stats = { 0 }; struct btree_update *as; struct pending_btree_node_free *d; @@ -377,15 +418,8 @@ static void bch2_mark_pending_btree_node_frees(struct bch_fs *c) for_each_pending_btree_node_free(c, as, d) if (d->index_update_done) - bch2_mark_key(c, bkey_i_to_s_c(&d->key), - c->opts.btree_node_size, true, pos, - &stats, 0, - BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE| - BCH_BUCKET_MARK_GC_LOCK_HELD); - /* - * Don't apply stats - pending deletes aren't tracked in - * bch_alloc_stats: - */ + bch2_mark_key(c, bkey_i_to_s_c(&d->key), 0, NULL, 0, + BCH_BUCKET_MARK_GC); mutex_unlock(&c->btree_interior_update_lock); } @@ -397,7 +431,7 @@ static void bch2_mark_allocator_buckets(struct bch_fs *c) size_t i, j, iter; unsigned ci; - percpu_down_read_preempt_disable(&c->usage_lock); + percpu_down_read(&c->mark_lock); spin_lock(&c->freelist_lock); gc_pos_set(c, gc_pos_alloc(c, NULL)); @@ -406,8 +440,7 @@ static void bch2_mark_allocator_buckets(struct bch_fs *c) fifo_for_each_entry(i, &ca->free_inc, iter) bch2_mark_alloc_bucket(c, ca, i, true, gc_pos_alloc(c, NULL), - BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE| - BCH_BUCKET_MARK_GC_LOCK_HELD); + BCH_BUCKET_MARK_GC); @@ -415,8 +448,7 @@ static void bch2_mark_allocator_buckets(struct bch_fs *c) fifo_for_each_entry(i, &ca->free[j], iter) bch2_mark_alloc_bucket(c, ca, i, true, gc_pos_alloc(c, NULL), - BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE| - BCH_BUCKET_MARK_GC_LOCK_HELD); + BCH_BUCKET_MARK_GC); } spin_unlock(&c->freelist_lock); @@ -430,138 +462,332 @@ static void bch2_mark_allocator_buckets(struct bch_fs *c) ca = bch_dev_bkey_exists(c, ob->ptr.dev); bch2_mark_alloc_bucket(c, ca, PTR_BUCKET_NR(ca, &ob->ptr), true, gc_pos_alloc(c, ob), - BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE| - BCH_BUCKET_MARK_GC_LOCK_HELD); + BCH_BUCKET_MARK_GC); } spin_unlock(&ob->lock); } - percpu_up_read_preempt_enable(&c->usage_lock); + percpu_up_read(&c->mark_lock); } -static void bch2_gc_start(struct bch_fs *c) +static void bch2_gc_free(struct bch_fs *c) { struct bch_dev *ca; - struct 
bucket_array *buckets; - struct bucket_mark new; unsigned i; - size_t b; - int cpu; - percpu_down_write(&c->usage_lock); + genradix_free(&c->stripes[1]); - /* - * Indicates to buckets code that gc is now in progress - done under - * usage_lock to avoid racing with bch2_mark_key(): - */ - __gc_pos_set(c, GC_POS_MIN); - - /* Save a copy of the existing bucket stats while we recompute them: */ for_each_member_device(ca, c, i) { - ca->usage_cached = __bch2_dev_usage_read(ca); - for_each_possible_cpu(cpu) { - struct bch_dev_usage *p = - per_cpu_ptr(ca->usage_percpu, cpu); - memset(p, 0, sizeof(*p)); + kvpfree(rcu_dereference_protected(ca->buckets[1], 1), + sizeof(struct bucket_array) + + ca->mi.nbuckets * sizeof(struct bucket)); + ca->buckets[1] = NULL; + + free_percpu(ca->usage[1]); + ca->usage[1] = NULL; + } + + free_percpu(c->usage_gc); + c->usage_gc = NULL; +} + +static int bch2_gc_done(struct bch_fs *c, + bool initial, bool metadata_only) +{ + struct bch_dev *ca; + bool verify = !metadata_only && + (!initial || + (c->sb.compat & (1ULL << BCH_COMPAT_FEAT_ALLOC_INFO))); + unsigned i; + int ret = 0; + +#define copy_field(_f, _msg, ...) \ + if (dst->_f != src->_f) { \ + if (verify) \ + fsck_err(c, _msg ": got %llu, should be %llu" \ + , ##__VA_ARGS__, dst->_f, src->_f); \ + dst->_f = src->_f; \ + } +#define copy_stripe_field(_f, _msg, ...) \ + if (dst->_f != src->_f) { \ + if (verify) \ + fsck_err(c, "stripe %zu has wrong "_msg \ + ": got %u, should be %u", \ + dst_iter.pos, ##__VA_ARGS__, \ + dst->_f, src->_f); \ + dst->_f = src->_f; \ + dst->dirty = true; \ + } +#define copy_bucket_field(_f) \ + if (dst->b[b].mark._f != src->b[b].mark._f) { \ + if (verify) \ + fsck_err(c, "dev %u bucket %zu has wrong " #_f \ + ": got %u, should be %u", i, b, \ + dst->b[b].mark._f, src->b[b].mark._f); \ + dst->b[b]._mark._f = src->b[b].mark._f; \ + dst->b[b]._mark.dirty = true; \ + } +#define copy_dev_field(_f, _msg, ...) \ + copy_field(_f, "dev %u has wrong " _msg, i, ##__VA_ARGS__) +#define copy_fs_field(_f, _msg, ...) 
\ + copy_field(_f, "fs has wrong " _msg, ##__VA_ARGS__) + + if (!metadata_only) { + struct genradix_iter dst_iter = genradix_iter_init(&c->stripes[0], 0); + struct genradix_iter src_iter = genradix_iter_init(&c->stripes[1], 0); + struct stripe *dst, *src; + unsigned i; + + c->ec_stripes_heap.used = 0; + + while ((dst = genradix_iter_peek(&dst_iter, &c->stripes[0])) && + (src = genradix_iter_peek(&src_iter, &c->stripes[1]))) { + BUG_ON(src_iter.pos != dst_iter.pos); + + copy_stripe_field(alive, "alive"); + copy_stripe_field(sectors, "sectors"); + copy_stripe_field(algorithm, "algorithm"); + copy_stripe_field(nr_blocks, "nr_blocks"); + copy_stripe_field(nr_redundant, "nr_redundant"); + copy_stripe_field(blocks_nonempty, + "blocks_nonempty"); + + for (i = 0; i < ARRAY_SIZE(dst->block_sectors); i++) + copy_stripe_field(block_sectors[i], + "block_sectors[%u]", i); + + if (dst->alive) + bch2_stripes_heap_insert(c, dst, dst_iter.pos); + + genradix_iter_advance(&dst_iter, &c->stripes[0]); + genradix_iter_advance(&src_iter, &c->stripes[1]); } } - c->usage_cached = __bch2_fs_usage_read(c); - for_each_possible_cpu(cpu) { - struct bch_fs_usage *p = - per_cpu_ptr(c->usage_percpu, cpu); + for_each_member_device(ca, c, i) { + struct bucket_array *dst = __bucket_array(ca, 0); + struct bucket_array *src = __bucket_array(ca, 1); + size_t b; + + for (b = 0; b < src->nbuckets; b++) { + copy_bucket_field(gen); + copy_bucket_field(data_type); + copy_bucket_field(owned_by_allocator); + copy_bucket_field(stripe); + copy_bucket_field(dirty_sectors); + copy_bucket_field(cached_sectors); + + if (dst->b[b].oldest_gen != src->b[b].oldest_gen) { + dst->b[b].oldest_gen = src->b[b].oldest_gen; + dst->b[b]._mark.dirty = true; + } + } + }; + + bch2_fs_usage_acc_to_base(c, 0); + bch2_fs_usage_acc_to_base(c, 1); + + bch2_dev_usage_from_buckets(c); - memset(p->s, 0, sizeof(p->s)); + { + unsigned nr = fs_usage_u64s(c); + struct bch_fs_usage *dst = c->usage_base; + struct bch_fs_usage *src = (void *) + bch2_acc_percpu_u64s((void *) c->usage_gc, nr); + + copy_fs_field(hidden, "hidden"); + copy_fs_field(btree, "btree"); + + if (!metadata_only) { + copy_fs_field(data, "data"); + copy_fs_field(cached, "cached"); + copy_fs_field(reserved, "reserved"); + copy_fs_field(nr_inodes,"nr_inodes"); + + for (i = 0; i < BCH_REPLICAS_MAX; i++) + copy_fs_field(persistent_reserved[i], + "persistent_reserved[%i]", i); + } + + for (i = 0; i < c->replicas.nr; i++) { + struct bch_replicas_entry *e = + cpu_replicas_entry(&c->replicas, i); + char buf[80]; + + if (metadata_only && + (e->data_type == BCH_DATA_USER || + e->data_type == BCH_DATA_CACHED)) + continue; + + bch2_replicas_entry_to_text(&PBUF(buf), e); + + copy_fs_field(replicas[i], "%s", buf); + } } - percpu_up_write(&c->usage_lock); +#undef copy_fs_field +#undef copy_dev_field +#undef copy_bucket_field +#undef copy_stripe_field +#undef copy_field +fsck_err: + return ret; +} + +static int bch2_gc_start(struct bch_fs *c, + bool metadata_only) +{ + struct bch_dev *ca; + unsigned i; + + /* + * indicate to stripe code that we need to allocate for the gc stripes + * radix tree, too + */ + gc_pos_set(c, gc_phase(GC_PHASE_START)); + + BUG_ON(c->usage_gc); + + c->usage_gc = __alloc_percpu_gfp(fs_usage_u64s(c) * sizeof(u64), + sizeof(u64), GFP_KERNEL); + if (!c->usage_gc) + return -ENOMEM; - /* Clear bucket marks: */ for_each_member_device(ca, c, i) { - down_read(&ca->bucket_lock); - buckets = bucket_array(ca); - - for (b = buckets->first_bucket; b < buckets->nbuckets; b++) { - 
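
bch2_gc_done() above is generated from a family of compare-and-repair macros: report any field where the freshly recomputed ("src") usage disagrees with the live ("dst") copy, then overwrite the live copy. Its core, restated with explicit operands (fsck_err(), verify and c are the names used in the patch):

#define copy_field(_dst, _src, _f, _msg, ...)                           \
do {                                                                    \
        if ((_dst)->_f != (_src)->_f) {                                 \
                if (verify)                                             \
                        fsck_err(c, _msg ": got %llu, should be %llu",  \
                                 ##__VA_ARGS__,                         \
                                 (u64) (_dst)->_f, (u64) (_src)->_f);   \
                (_dst)->_f = (_src)->_f;                                \
        }                                                               \
} while (0)

The specialized variants (copy_bucket_field, copy_stripe_field, ...) differ only in how they name the offending object in the fsck message and in marking the repaired object dirty.
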
bucket_cmpxchg(buckets->b + b, new, ({ - new.owned_by_allocator = 0; - new.data_type = 0; - new.cached_sectors = 0; - new.dirty_sectors = 0; - })); - ca->oldest_gens[b] = new.gen; + BUG_ON(ca->buckets[1]); + BUG_ON(ca->usage[1]); + + ca->buckets[1] = kvpmalloc(sizeof(struct bucket_array) + + ca->mi.nbuckets * sizeof(struct bucket), + GFP_KERNEL|__GFP_ZERO); + if (!ca->buckets[1]) { + percpu_ref_put(&ca->ref); + return -ENOMEM; + } + + ca->usage[1] = alloc_percpu(struct bch_dev_usage); + if (!ca->usage[1]) { + percpu_ref_put(&ca->ref); + return -ENOMEM; } - up_read(&ca->bucket_lock); } + + for_each_member_device(ca, c, i) { + struct bucket_array *dst = __bucket_array(ca, 1); + struct bucket_array *src = __bucket_array(ca, 0); + size_t b; + + dst->first_bucket = src->first_bucket; + dst->nbuckets = src->nbuckets; + + for (b = 0; b < src->nbuckets; b++) { + struct bucket *d = &dst->b[b]; + struct bucket *s = &src->b[b]; + + d->_mark.gen = dst->b[b].oldest_gen = s->mark.gen; + d->gen_valid = s->gen_valid; + + if (metadata_only && + (s->mark.data_type == BCH_DATA_USER || + s->mark.data_type == BCH_DATA_CACHED)) { + d->_mark = s->mark; + d->_mark.owned_by_allocator = 0; + } + } + }; + + return bch2_ec_mem_alloc(c, true); } /** - * bch_gc - recompute bucket marks and oldest_gen, rewrite btree nodes + * bch2_gc - walk _all_ references to buckets, and recompute them: + * + * Order matters here: + * - Concurrent GC relies on the fact that we have a total ordering for + * everything that GC walks - see gc_will_visit_node(), + * gc_will_visit_root() + * + * - also, references move around in the course of index updates and + * various other crap: everything needs to agree on the ordering + * references are allowed to move around in - e.g., we're allowed to + * start with a reference owned by an open_bucket (the allocator) and + * move it to the btree, but not the reverse. + * + * This is necessary to ensure that gc doesn't miss references that + * move around - if references move backwards in the ordering GC + * uses, GC could skip past them */ -void bch2_gc(struct bch_fs *c) +int bch2_gc(struct bch_fs *c, struct journal_keys *journal_keys, + bool initial, bool metadata_only) { struct bch_dev *ca; u64 start_time = local_clock(); - unsigned i; + unsigned i, iter = 0; + int ret; - /* - * Walk _all_ references to buckets, and recompute them: - * - * Order matters here: - * - Concurrent GC relies on the fact that we have a total ordering for - * everything that GC walks - see gc_will_visit_node(), - * gc_will_visit_root() - * - * - also, references move around in the course of index updates and - * various other crap: everything needs to agree on the ordering - * references are allowed to move around in - e.g., we're allowed to - * start with a reference owned by an open_bucket (the allocator) and - * move it to the btree, but not the reverse. 
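(The ordering contract described in the comment above is what the gc_visited() helper in the btree_gc.h hunk further down relies on. Restated as a standalone sketch for readability — illustrative only; the name example_gc_already_visited is made up, while the seqcount pattern and gc_pos_cmp() are taken directly from this diff:

static bool example_gc_already_visited(struct bch_fs *c, struct gc_pos pos)
{
	unsigned seq;
	bool ret;

	do {
		seq = read_seqcount_begin(&c->gc_pos_lock);
		/* <= 0: @pos is at or before GC's current position */
		ret = gc_pos_cmp(pos, c->gc_pos) <= 0;
	} while (read_seqcount_retry(&c->gc_pos_lock, seq));

	/*
	 * If true, GC has already walked past @pos, so an update there
	 * must also be applied to GC's shadow copy of the bucket state -
	 * otherwise GC will see it when it gets there.
	 */
	return ret;
})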
- * - * This is necessary to ensure that gc doesn't miss references that - * move around - if references move backwards in the ordering GC - * uses, GC could skip past them - */ trace_gc_start(c); - /* - * Do this before taking gc_lock - bch2_disk_reservation_get() blocks on - * gc_lock if sectors_available goes to 0: - */ - bch2_recalc_sectors_available(c); - down_write(&c->gc_lock); - if (test_bit(BCH_FS_GC_FAILURE, &c->flags)) +again: + percpu_down_write(&c->mark_lock); + ret = bch2_gc_start(c, metadata_only); + percpu_up_write(&c->mark_lock); + + if (ret) goto out; - bch2_gc_start(c); + bch2_mark_superblocks(c); - /* Walk btree: */ - while (c->gc_pos.phase < (int) BTREE_ID_NR) { - int ret = c->btree_roots[c->gc_pos.phase].b - ? bch2_gc_btree(c, (int) c->gc_pos.phase) - : 0; + ret = bch2_gc_btrees(c, journal_keys, initial, metadata_only); + if (ret) + goto out; - if (ret) { - bch_err(c, "btree gc failed: %d", ret); - set_bit(BCH_FS_GC_FAILURE, &c->flags); - goto out; + bch2_mark_pending_btree_node_frees(c); + bch2_mark_allocator_buckets(c); + + c->gc_count++; +out: + if (!ret && + (test_bit(BCH_FS_FIXED_GENS, &c->flags) || + (!iter && test_restart_gc(c)))) { + /* + * XXX: make sure gens we fixed got saved + */ + if (iter++ <= 2) { + bch_info(c, "Fixed gens, restarting mark and sweep:"); + clear_bit(BCH_FS_FIXED_GENS, &c->flags); + __gc_pos_set(c, gc_phase(GC_PHASE_NOT_RUNNING)); + + percpu_down_write(&c->mark_lock); + bch2_gc_free(c); + percpu_up_write(&c->mark_lock); + + goto again; } - gc_pos_set(c, gc_phase(c->gc_pos.phase + 1)); + bch_info(c, "Unable to fix bucket gens, looping"); + ret = -EINVAL; } - bch2_mark_superblocks(c); - bch2_mark_pending_btree_node_frees(c); - bch2_mark_allocator_buckets(c); + if (!ret) { + bch2_journal_block(&c->journal); - for_each_member_device(ca, c, i) - atomic_long_set(&ca->saturated_count, 0); + percpu_down_write(&c->mark_lock); + ret = bch2_gc_done(c, initial, metadata_only); + + bch2_journal_unblock(&c->journal); + } else { + percpu_down_write(&c->mark_lock); + } /* Indicates that gc is no longer in progress: */ - gc_pos_set(c, gc_phase(GC_PHASE_DONE)); - c->gc_count++; -out: + __gc_pos_set(c, gc_phase(GC_PHASE_NOT_RUNNING)); + + bch2_gc_free(c); + percpu_up_write(&c->mark_lock); + up_write(&c->gc_lock); + trace_gc_end(c); bch2_time_stats_update(&c->times[BCH_TIME_btree_gc], start_time); @@ -577,21 +803,21 @@ out: * allocator thread - issue wakeup in case they blocked on gc_lock: */ closure_wake_up(&c->freelist_wait); + return ret; } /* Btree coalescing */ static void recalc_packed_keys(struct btree *b) { + struct bset *i = btree_bset_first(b); struct bkey_packed *k; memset(&b->nr, 0, sizeof(b->nr)); BUG_ON(b->nsets != 1); - for (k = btree_bkey_first(b, b->set); - k != btree_bkey_last(b, b->set); - k = bkey_next(k)) + vstruct_for_each(i, k) btree_keys_account_key_add(&b->nr, 0, k); } @@ -780,21 +1006,20 @@ next: bch2_keylist_add_in_order(&keylist, &new_nodes[i]->key); /* Insert the newly coalesced nodes */ - bch2_btree_insert_node(as, parent, iter, &keylist); + bch2_btree_insert_node(as, parent, iter, &keylist, 0); BUG_ON(!bch2_keylist_empty(&keylist)); BUG_ON(iter->l[old_nodes[0]->level].b != old_nodes[0]); - BUG_ON(!bch2_btree_iter_node_replace(iter, new_nodes[0])); + bch2_btree_iter_node_replace(iter, new_nodes[0]); for (i = 0; i < nr_new_nodes; i++) - bch2_btree_open_bucket_put(c, new_nodes[i]); + bch2_open_buckets_put(c, &new_nodes[i]->ob); /* Free the old nodes and update our sliding window */ for (i = 0; i < nr_old_nodes; i++) { 
bch2_btree_node_free_inmem(c, old_nodes[i], iter); - six_unlock_intent(&old_nodes[i]->lock); /* * the index update might have triggered a split, in which case @@ -817,7 +1042,8 @@ next: static int bch2_coalesce_btree(struct bch_fs *c, enum btree_id btree_id) { - struct btree_iter iter; + struct btree_trans trans; + struct btree_iter *iter; struct btree *b; bool kthread = (current->flags & PF_KTHREAD) != 0; unsigned i; @@ -826,6 +1052,8 @@ static int bch2_coalesce_btree(struct bch_fs *c, enum btree_id btree_id) struct btree *merge[GC_MERGE_NODES]; u32 lock_seq[GC_MERGE_NODES]; + bch2_trans_init(&trans, c, 0, 0); + /* * XXX: We don't have a good way of positively matching on sibling nodes * that have the same parent - this code works by handling the cases @@ -835,7 +1063,7 @@ static int bch2_coalesce_btree(struct bch_fs *c, enum btree_id btree_id) */ memset(merge, 0, sizeof(merge)); - __for_each_btree_node(&iter, c, btree_id, POS_MIN, + __for_each_btree_node(&trans, iter, btree_id, POS_MIN, BTREE_MAX_DEPTH, 0, BTREE_ITER_PREFETCH, b) { memmove(merge + 1, merge, @@ -857,7 +1085,7 @@ static int bch2_coalesce_btree(struct bch_fs *c, enum btree_id btree_id) } memset(merge + i, 0, (GC_MERGE_NODES - i) * sizeof(merge[0])); - bch2_coalesce_nodes(c, &iter, merge); + bch2_coalesce_nodes(c, iter, merge); for (i = 1; i < GC_MERGE_NODES && merge[i]; i++) { lock_seq[i] = merge[i]->lock.state.seq; @@ -867,23 +1095,23 @@ static int bch2_coalesce_btree(struct bch_fs *c, enum btree_id btree_id) lock_seq[0] = merge[0]->lock.state.seq; if (kthread && kthread_should_stop()) { - bch2_btree_iter_unlock(&iter); + bch2_trans_exit(&trans); return -ESHUTDOWN; } - bch2_btree_iter_cond_resched(&iter); + bch2_trans_cond_resched(&trans); /* * If the parent node wasn't relocked, it might have been split * and the nodes in our sliding window might not have the same * parent anymore - blow away the sliding window: */ - if (btree_iter_node(&iter, iter.level + 1) && - !btree_node_intent_locked(&iter, iter.level + 1)) + if (btree_iter_node(iter, iter->level + 1) && + !btree_node_intent_locked(iter, iter->level + 1)) memset(merge + 1, 0, (GC_MERGE_NODES - 1) * sizeof(merge[0])); } - return bch2_btree_iter_unlock(&iter); + return bch2_trans_exit(&trans); } /** @@ -893,9 +1121,6 @@ void bch2_coalesce(struct bch_fs *c) { enum btree_id id; - if (test_bit(BCH_FS_GC_FAILURE, &c->flags)) - return; - down_read(&c->gc_lock); trace_gc_coalesce_start(c); @@ -907,7 +1132,6 @@ void bch2_coalesce(struct bch_fs *c) if (ret) { if (ret != -ESHUTDOWN) bch_err(c, "btree coalescing failed: %d", ret); - set_bit(BCH_FS_GC_FAILURE, &c->flags); return; } } @@ -922,6 +1146,7 @@ static int bch2_gc_thread(void *arg) struct io_clock *clock = &c->io_clock[WRITE]; unsigned long last = atomic_long_read(&clock->now); unsigned last_kick = atomic_read(&c->kick_gc); + int ret; set_freezable(); @@ -955,7 +1180,9 @@ static int bch2_gc_thread(void *arg) last = atomic_long_read(&clock->now); last_kick = atomic_read(&c->kick_gc); - bch2_gc(c); + ret = bch2_gc(c, NULL, false, false); + if (ret) + bch_err(c, "btree gc failed: %i", ret); debug_check_no_locks_held(); } @@ -991,115 +1218,3 @@ int bch2_gc_thread_start(struct bch_fs *c) wake_up_process(p); return 0; } - -/* Initial GC computes bucket marks during startup */ - -static int bch2_initial_gc_btree(struct bch_fs *c, enum btree_id id) -{ - struct btree_iter iter; - struct btree *b; - struct range_checks r; - int ret = 0; - - btree_node_range_checks_init(&r, 0); - - if (!c->btree_roots[id].b) - return 0; - - b = 
c->btree_roots[id].b; - if (!btree_node_fake(b)) - ret = bch2_btree_mark_key_initial(c, BKEY_TYPE_BTREE, - bkey_i_to_s_c(&b->key)); - if (ret) - return ret; - - /* - * We have to hit every btree node before starting journal replay, in - * order for the journal seq blacklist machinery to work: - */ - for_each_btree_node(&iter, c, id, POS_MIN, BTREE_ITER_PREFETCH, b) { - btree_node_range_checks(c, b, &r); - - if (btree_node_has_ptrs(b)) { - struct btree_node_iter node_iter; - struct bkey unpacked; - struct bkey_s_c k; - - for_each_btree_node_key_unpack(b, k, &node_iter, - btree_node_is_extents(b), - &unpacked) { - ret = bch2_btree_mark_key_initial(c, - btree_node_type(b), k); - if (ret) - goto err; - } - } - - bch2_btree_iter_cond_resched(&iter); - } -err: - return bch2_btree_iter_unlock(&iter) ?: ret; -} - -static int __bch2_initial_gc(struct bch_fs *c, struct list_head *journal) -{ - unsigned iter = 0; - enum btree_id id; - int ret; - - mutex_lock(&c->sb_lock); - if (!bch2_sb_get_replicas(c->disk_sb.sb)) { - if (BCH_SB_INITIALIZED(c->disk_sb.sb)) - bch_info(c, "building replicas info"); - set_bit(BCH_FS_REBUILD_REPLICAS, &c->flags); - } - mutex_unlock(&c->sb_lock); -again: - bch2_gc_start(c); - - for (id = 0; id < BTREE_ID_NR; id++) { - ret = bch2_initial_gc_btree(c, id); - if (ret) - return ret; - } - - ret = bch2_journal_mark(c, journal); - if (ret) - return ret; - - if (test_bit(BCH_FS_FIXED_GENS, &c->flags)) { - if (iter++ > 2) { - bch_info(c, "Unable to fix bucket gens, looping"); - return -EINVAL; - } - - bch_info(c, "Fixed gens, restarting initial mark and sweep:"); - clear_bit(BCH_FS_FIXED_GENS, &c->flags); - goto again; - } - - /* - * Skip past versions that might have possibly been used (as nonces), - * but hadn't had their pointers written: - */ - if (c->sb.encryption_type) - atomic64_add(1 << 16, &c->key_version); - - bch2_mark_superblocks(c); - - gc_pos_set(c, gc_phase(GC_PHASE_DONE)); - set_bit(BCH_FS_INITIAL_GC_DONE, &c->flags); - - return 0; -} - -int bch2_initial_gc(struct bch_fs *c, struct list_head *journal) -{ - int ret; - - down_write(&c->gc_lock); - ret = __bch2_initial_gc(c, journal); - up_write(&c->gc_lock); - - return ret; -} diff --git a/fs/bcachefs/btree_gc.h b/fs/bcachefs/btree_gc.h index 4d1ab9dbe9c8..bd5f2752954f 100644 --- a/fs/bcachefs/btree_gc.h +++ b/fs/bcachefs/btree_gc.h @@ -1,18 +1,15 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _BCACHEFS_BTREE_GC_H #define _BCACHEFS_BTREE_GC_H #include "btree_types.h" -enum bkey_type; - void bch2_coalesce(struct bch_fs *); -void bch2_gc(struct bch_fs *); + +struct journal_keys; +int bch2_gc(struct bch_fs *, struct journal_keys *, bool, bool); void bch2_gc_thread_stop(struct bch_fs *); int bch2_gc_thread_start(struct bch_fs *); -int bch2_initial_gc(struct bch_fs *, struct list_head *); -u8 bch2_btree_key_recalc_oldest_gen(struct bch_fs *, struct bkey_s_c); -int bch2_btree_mark_key_initial(struct bch_fs *, enum bkey_type, - struct bkey_s_c); void bch2_mark_dev_superblock(struct bch_fs *, struct bch_dev *, unsigned); /* @@ -46,8 +43,6 @@ static inline struct gc_pos gc_phase(enum gc_phase phase) }; } -#define GC_POS_MIN gc_phase(0) - static inline int gc_pos_cmp(struct gc_pos l, struct gc_pos r) { if (l.phase != r.phase) @@ -59,17 +54,34 @@ static inline int gc_pos_cmp(struct gc_pos l, struct gc_pos r) return 0; } +static inline enum gc_phase btree_id_to_gc_phase(enum btree_id id) +{ + switch (id) { +#define x(n, v, s) case BTREE_ID_##n: return GC_PHASE_BTREE_##n; + BCH_BTREE_IDS() +#undef x + default: + BUG(); + } 
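(btree_id_to_gc_phase() above uses the kernel's x-macro idiom: BCH_BTREE_IDS() is a list macro whose entries are calls to a user-supplied x(), so the same list can be expanded into an enum in one place and a switch here. A minimal self-contained sketch of the idiom — EXAMPLE_IDS() is a made-up stand-in for BCH_BTREE_IDS(), which lives in the bcachefs format headers:

/* One list, defined once: */
#define EXAMPLE_IDS()			\
	x(EXTENTS, 0, "extents")	\
	x(INODES,  1, "inodes")

/* Expansion 1: generate the enum: */
enum example_id {
#define x(n, v, s) EXAMPLE_ID_##n = v,
	EXAMPLE_IDS()
#undef x
};

/* Expansion 2: reuse the list with a different x() to generate a switch: */
static const char *example_id_name(enum example_id id)
{
	switch (id) {
#define x(n, v, s) case EXAMPLE_ID_##n: return s;
	EXAMPLE_IDS()
#undef x
	default:
		return "unknown";
	}
}

Adding a new btree ID to the list automatically extends every expansion, which is why the GC phase mapping above can't silently fall out of sync with the ID enum.)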
+} + +static inline struct gc_pos gc_pos_btree(enum btree_id id, + struct bpos pos, unsigned level) +{ + return (struct gc_pos) { + .phase = btree_id_to_gc_phase(id), + .pos = pos, + .level = level, + }; +} + /* * GC position of the pointers within a btree node: note, _not_ for &b->key * itself, that lives in the parent node: */ static inline struct gc_pos gc_pos_btree_node(struct btree *b) { - return (struct gc_pos) { - .phase = b->btree_id, - .pos = b->key.k.p, - .level = b->level, - }; + return gc_pos_btree(b->btree_id, b->key.k.p, b->level); } /* @@ -81,11 +93,7 @@ static inline struct gc_pos gc_pos_btree_node(struct btree *b) */ static inline struct gc_pos gc_pos_btree_root(enum btree_id id) { - return (struct gc_pos) { - .phase = (int) id, - .pos = POS_MAX, - .level = U8_MAX, - }; + return gc_pos_btree(id, POS_MAX, BTREE_MAX_DEPTH); } static inline struct gc_pos gc_pos_alloc(struct bch_fs *c, struct open_bucket *ob) @@ -96,14 +104,14 @@ static inline struct gc_pos gc_pos_alloc(struct bch_fs *c, struct open_bucket *o }; } -static inline bool gc_will_visit(struct bch_fs *c, struct gc_pos pos) +static inline bool gc_visited(struct bch_fs *c, struct gc_pos pos) { unsigned seq; bool ret; do { seq = read_seqcount_begin(&c->gc_pos_lock); - ret = gc_pos_cmp(c->gc_pos, pos) < 0; + ret = gc_pos_cmp(pos, c->gc_pos) <= 0; } while (read_seqcount_retry(&c->gc_pos_lock, seq)); return ret; diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index 74ffad4c38f3..52c3f51f02cb 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -1,6 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" #include "bkey_methods.h" +#include "bkey_sort.h" #include "btree_cache.h" #include "btree_io.h" #include "btree_iter.h" @@ -19,43 +21,6 @@ #include <trace/events/bcachefs.h> -/* btree_node_iter_large: */ - -#define btree_node_iter_cmp_heap(h, _l, _r) \ - __btree_node_iter_cmp((iter)->is_extents, b, \ - __btree_node_offset_to_key(b, (_l).k), \ - __btree_node_offset_to_key(b, (_r).k)) - -void bch2_btree_node_iter_large_push(struct btree_node_iter_large *iter, - struct btree *b, - const struct bkey_packed *k, - const struct bkey_packed *end) -{ - if (k != end) { - struct btree_node_iter_set n = - ((struct btree_node_iter_set) { - __btree_node_key_to_offset(b, k), - __btree_node_key_to_offset(b, end) - }); - - __heap_add(iter, n, btree_node_iter_cmp_heap); - } -} - -void bch2_btree_node_iter_large_advance(struct btree_node_iter_large *iter, - struct btree *b) -{ - iter->data->k += __btree_node_offset_to_key(b, iter->data->k)->u64s; - - EBUG_ON(!iter->used); - EBUG_ON(iter->data->k > iter->data->end); - - if (iter->data->k == iter->data->end) - heap_del(iter, 0, btree_node_iter_cmp_heap); - else - heap_sift_down(iter, 0, btree_node_iter_cmp_heap); -} - static void verify_no_dups(struct btree *b, struct bkey_packed *start, struct bkey_packed *end) @@ -116,190 +81,6 @@ static void *btree_bounce_alloc(struct bch_fs *c, unsigned order, return mempool_alloc(&c->btree_bounce_pool, GFP_NOIO); } -typedef int (*sort_cmp_fn)(struct btree *, - struct bkey_packed *, - struct bkey_packed *); - -struct sort_iter { - struct btree *b; - unsigned used; - - struct sort_iter_set { - struct bkey_packed *k, *end; - } data[MAX_BSETS + 1]; -}; - -static void sort_iter_init(struct sort_iter *iter, struct btree *b) -{ - memset(iter, 0, sizeof(*iter)); - iter->b = b; -} - -static inline void __sort_iter_sift(struct sort_iter *iter, - unsigned from, - sort_cmp_fn cmp) -{ - unsigned i; - - for (i = from; - i + 1 < 
iter->used && - cmp(iter->b, iter->data[i].k, iter->data[i + 1].k) > 0; - i++) - swap(iter->data[i], iter->data[i + 1]); -} - -static inline void sort_iter_sift(struct sort_iter *iter, sort_cmp_fn cmp) -{ - - __sort_iter_sift(iter, 0, cmp); -} - -static inline void sort_iter_sort(struct sort_iter *iter, sort_cmp_fn cmp) -{ - unsigned i = iter->used; - - while (i--) - __sort_iter_sift(iter, i, cmp); -} - -static void sort_iter_add(struct sort_iter *iter, - struct bkey_packed *k, - struct bkey_packed *end) -{ - BUG_ON(iter->used >= ARRAY_SIZE(iter->data)); - - if (k != end) - iter->data[iter->used++] = (struct sort_iter_set) { k, end }; -} - -static inline struct bkey_packed *sort_iter_peek(struct sort_iter *iter) -{ - return iter->used ? iter->data->k : NULL; -} - -static inline void sort_iter_advance(struct sort_iter *iter, sort_cmp_fn cmp) -{ - iter->data->k = bkey_next(iter->data->k); - - BUG_ON(iter->data->k > iter->data->end); - - if (iter->data->k == iter->data->end) - array_remove_item(iter->data, iter->used, 0); - else - sort_iter_sift(iter, cmp); -} - -static inline struct bkey_packed *sort_iter_next(struct sort_iter *iter, - sort_cmp_fn cmp) -{ - struct bkey_packed *ret = sort_iter_peek(iter); - - if (ret) - sort_iter_advance(iter, cmp); - - return ret; -} - -static inline int sort_key_whiteouts_cmp(struct btree *b, - struct bkey_packed *l, - struct bkey_packed *r) -{ - return bkey_cmp_packed(b, l, r); -} - -static unsigned sort_key_whiteouts(struct bkey_packed *dst, - struct sort_iter *iter) -{ - struct bkey_packed *in, *out = dst; - - sort_iter_sort(iter, sort_key_whiteouts_cmp); - - while ((in = sort_iter_next(iter, sort_key_whiteouts_cmp))) { - bkey_copy(out, in); - out = bkey_next(out); - } - - return (u64 *) out - (u64 *) dst; -} - -static inline int sort_extent_whiteouts_cmp(struct btree *b, - struct bkey_packed *l, - struct bkey_packed *r) -{ - struct bkey ul = bkey_unpack_key(b, l); - struct bkey ur = bkey_unpack_key(b, r); - - return bkey_cmp(bkey_start_pos(&ul), bkey_start_pos(&ur)); -} - -static unsigned sort_extent_whiteouts(struct bkey_packed *dst, - struct sort_iter *iter) -{ - const struct bkey_format *f = &iter->b->format; - struct bkey_packed *in, *out = dst; - struct bkey_i l, r; - bool prev = false, l_packed = false; - u64 max_packed_size = bkey_field_max(f, BKEY_FIELD_SIZE); - u64 max_packed_offset = bkey_field_max(f, BKEY_FIELD_OFFSET); - u64 new_size; - - max_packed_size = min_t(u64, max_packed_size, KEY_SIZE_MAX); - - sort_iter_sort(iter, sort_extent_whiteouts_cmp); - - while ((in = sort_iter_next(iter, sort_extent_whiteouts_cmp))) { - EBUG_ON(bkeyp_val_u64s(f, in)); - EBUG_ON(in->type != KEY_TYPE_DISCARD); - - r.k = bkey_unpack_key(iter->b, in); - - if (prev && - bkey_cmp(l.k.p, bkey_start_pos(&r.k)) >= 0) { - if (bkey_cmp(l.k.p, r.k.p) >= 0) - continue; - - new_size = l_packed - ? 
min(max_packed_size, max_packed_offset - - bkey_start_offset(&l.k)) - : KEY_SIZE_MAX; - - new_size = min(new_size, r.k.p.offset - - bkey_start_offset(&l.k)); - - BUG_ON(new_size < l.k.size); - - bch2_key_resize(&l.k, new_size); - - if (bkey_cmp(l.k.p, r.k.p) >= 0) - continue; - - bch2_cut_front(l.k.p, &r); - } - - if (prev) { - if (!bch2_bkey_pack(out, &l, f)) { - BUG_ON(l_packed); - bkey_copy(out, &l); - } - out = bkey_next(out); - } - - l = r; - prev = true; - l_packed = bkey_packed(in); - } - - if (prev) { - if (!bch2_bkey_pack(out, &l, f)) { - BUG_ON(l_packed); - bkey_copy(out, &l); - } - out = bkey_next(out); - } - - return (u64 *) out - (u64 *) dst; -} - static unsigned should_compact_bset(struct btree *b, struct bset_tree *t, bool compacting, enum compact_mode mode) @@ -309,7 +90,7 @@ static unsigned should_compact_bset(struct btree *b, struct bset_tree *t, if (mode == COMPACT_LAZY) { if (should_compact_bset_lazy(b, t) || - (compacting && bset_unwritten(b, bset(b, t)))) + (compacting && !bset_written(b, bset(b, t)))) return dead_u64s; } else { if (bset_written(b, bset(b, t))) @@ -356,7 +137,7 @@ bool __bch2_compact_whiteouts(struct bch_fs *c, struct btree *b, struct bkey_packed *k, *n, *out, *start, *end; struct btree_node_entry *src = NULL, *dst = NULL; - if (t != b->set && bset_unwritten(b, i)) { + if (t != b->set && !bset_written(b, i)) { src = container_of(i, struct btree_node_entry, keys); dst = max(write_block(b), (void *) btree_bkey_last(b, t -1)); @@ -396,7 +177,7 @@ bool __bch2_compact_whiteouts(struct bch_fs *c, struct btree *b, continue; if (bkey_whiteout(k)) { - unreserve_whiteout(b, t, k); + unreserve_whiteout(b, k); memcpy_u64s(u_pos, k, bkeyp_key_u64s(f, k)); set_bkeyp_val_u64s(f, u_pos, 0); u_pos = bkey_next(u_pos); @@ -420,11 +201,10 @@ bool __bch2_compact_whiteouts(struct bch_fs *c, struct btree *b, BUG_ON((void *) unwritten_whiteouts_start(c, b) < (void *) btree_bkey_last(b, bset_tree_last(b))); - u64s = btree_node_is_extents(b) - ? sort_extent_whiteouts(unwritten_whiteouts_start(c, b), - &sort_iter) - : sort_key_whiteouts(unwritten_whiteouts_start(c, b), - &sort_iter); + u64s = (btree_node_is_extents(b) + ? 
bch2_sort_extent_whiteouts + : bch2_sort_key_whiteouts)(unwritten_whiteouts_start(c, b), + &sort_iter); BUG_ON(u64s > b->whiteout_u64s); BUG_ON(u64s != b->whiteout_u64s && !btree_node_is_extents(b)); @@ -467,7 +247,7 @@ static bool bch2_drop_whiteouts(struct btree *b) start = btree_bkey_first(b, t); end = btree_bkey_last(b, t); - if (bset_unwritten(b, i) && + if (!bset_written(b, i) && t != b->set) { struct bset *dst = max_t(struct bset *, write_block(b), @@ -499,87 +279,6 @@ static bool bch2_drop_whiteouts(struct btree *b) return ret; } -static inline int sort_keys_cmp(struct btree *b, - struct bkey_packed *l, - struct bkey_packed *r) -{ - return bkey_cmp_packed(b, l, r) ?: - (int) bkey_whiteout(r) - (int) bkey_whiteout(l) ?: - (int) l->needs_whiteout - (int) r->needs_whiteout; -} - -static unsigned sort_keys(struct bkey_packed *dst, - struct sort_iter *iter, - bool filter_whiteouts) -{ - const struct bkey_format *f = &iter->b->format; - struct bkey_packed *in, *next, *out = dst; - - sort_iter_sort(iter, sort_keys_cmp); - - while ((in = sort_iter_next(iter, sort_keys_cmp))) { - if (bkey_whiteout(in) && - (filter_whiteouts || !in->needs_whiteout)) - continue; - - if (bkey_whiteout(in) && - (next = sort_iter_peek(iter)) && - !bkey_cmp_packed(iter->b, in, next)) { - BUG_ON(in->needs_whiteout && - next->needs_whiteout); - /* - * XXX racy, called with read lock from write path - * - * leads to spurious BUG_ON() in bkey_unpack_key() in - * debug mode - */ - next->needs_whiteout |= in->needs_whiteout; - continue; - } - - if (bkey_whiteout(in)) { - memcpy_u64s(out, in, bkeyp_key_u64s(f, in)); - set_bkeyp_val_u64s(f, out, 0); - } else { - bkey_copy(out, in); - } - out = bkey_next(out); - } - - return (u64 *) out - (u64 *) dst; -} - -static inline int sort_extents_cmp(struct btree *b, - struct bkey_packed *l, - struct bkey_packed *r) -{ - return bkey_cmp_packed(b, l, r) ?: - (int) bkey_deleted(l) - (int) bkey_deleted(r); -} - -static unsigned sort_extents(struct bkey_packed *dst, - struct sort_iter *iter, - bool filter_whiteouts) -{ - struct bkey_packed *in, *out = dst; - - sort_iter_sort(iter, sort_extents_cmp); - - while ((in = sort_iter_next(iter, sort_extents_cmp))) { - if (bkey_deleted(in)) - continue; - - if (bkey_whiteout(in) && - (filter_whiteouts || !in->needs_whiteout)) - continue; - - bkey_copy(out, in); - out = bkey_next(out); - } - - return (u64 *) out - (u64 *) dst; -} - static void btree_node_sort(struct bch_fs *c, struct btree *b, struct btree_iter *iter, unsigned start_idx, @@ -618,16 +317,18 @@ static void btree_node_sort(struct bch_fs *c, struct btree *b, if (btree_node_is_extents(b)) filter_whiteouts = bset_written(b, start_bset); - u64s = btree_node_is_extents(b) - ? sort_extents(out->keys.start, &sort_iter, filter_whiteouts) - : sort_keys(out->keys.start, &sort_iter, filter_whiteouts); + u64s = (btree_node_is_extents(b) + ? 
bch2_sort_extents + : bch2_sort_keys)(out->keys.start, + &sort_iter, + filter_whiteouts); out->keys.u64s = cpu_to_le16(u64s); BUG_ON(vstruct_end(&out->keys) > (void *) out + (PAGE_SIZE << order)); if (sorting_entire_node) - bch2_time_stats_update(&c->times[BCH_TIME_btree_sort], + bch2_time_stats_update(&c->times[BCH_TIME_btree_node_sort], start_time); /* Make sure we preserve bset journal_seq: */ @@ -678,101 +379,6 @@ static void btree_node_sort(struct bch_fs *c, struct btree *b, bch2_verify_btree_nr_keys(b); } -/* Sort + repack in a new format: */ -static struct btree_nr_keys sort_repack(struct bset *dst, - struct btree *src, - struct btree_node_iter *src_iter, - struct bkey_format *out_f, - bool filter_whiteouts) -{ - struct bkey_format *in_f = &src->format; - struct bkey_packed *in, *out = vstruct_last(dst); - struct btree_nr_keys nr; - - memset(&nr, 0, sizeof(nr)); - - while ((in = bch2_btree_node_iter_next_all(src_iter, src))) { - if (filter_whiteouts && bkey_whiteout(in)) - continue; - - if (bch2_bkey_transform(out_f, out, bkey_packed(in) - ? in_f : &bch2_bkey_format_current, in)) - out->format = KEY_FORMAT_LOCAL_BTREE; - else - bch2_bkey_unpack(src, (void *) out, in); - - btree_keys_account_key_add(&nr, 0, out); - out = bkey_next(out); - } - - dst->u64s = cpu_to_le16((u64 *) out - dst->_data); - return nr; -} - -/* Sort, repack, and merge: */ -static struct btree_nr_keys sort_repack_merge(struct bch_fs *c, - struct bset *dst, - struct btree *src, - struct btree_node_iter *iter, - struct bkey_format *out_f, - bool filter_whiteouts, - key_filter_fn filter, - key_merge_fn merge) -{ - struct bkey_packed *k, *prev = NULL, *out; - struct btree_nr_keys nr; - BKEY_PADDED(k) tmp; - - memset(&nr, 0, sizeof(nr)); - - while ((k = bch2_btree_node_iter_next_all(iter, src))) { - if (filter_whiteouts && bkey_whiteout(k)) - continue; - - /* - * The filter might modify pointers, so we have to unpack the - * key and values to &tmp.k: - */ - bch2_bkey_unpack(src, &tmp.k, k); - - if (filter && filter(c, src, bkey_i_to_s(&tmp.k))) - continue; - - /* prev is always unpacked, for key merging: */ - - if (prev && - merge && - merge(c, src, (void *) prev, &tmp.k) == BCH_MERGE_MERGE) - continue; - - /* - * the current key becomes the new prev: advance prev, then - * copy the current key - but first pack prev (in place): - */ - if (prev) { - bch2_bkey_pack(prev, (void *) prev, out_f); - - btree_keys_account_key_add(&nr, 0, prev); - prev = bkey_next(prev); - } else { - prev = vstruct_last(dst); - } - - bkey_copy(prev, &tmp.k); - } - - if (prev) { - bch2_bkey_pack(prev, (void *) prev, out_f); - btree_keys_account_key_add(&nr, 0, prev); - out = bkey_next(prev); - } else { - out = vstruct_last(dst); - } - - dst->u64s = cpu_to_le16((u64 *) out - dst->_data); - return nr; -} - void bch2_btree_sort_into(struct bch_fs *c, struct btree *dst, struct btree *src) @@ -785,24 +391,21 @@ void bch2_btree_sort_into(struct bch_fs *c, bch2_bset_set_no_aux_tree(dst, dst->set); - bch2_btree_node_iter_init_from_start(&src_iter, src, - btree_node_is_extents(src)); + bch2_btree_node_iter_init_from_start(&src_iter, src); - if (btree_node_ops(src)->key_normalize || - btree_node_ops(src)->key_merge) - nr = sort_repack_merge(c, btree_bset_first(dst), + if (btree_node_is_extents(src)) + nr = bch2_sort_repack_merge(c, btree_bset_first(dst), src, &src_iter, &dst->format, - true, - btree_node_ops(src)->key_normalize, - btree_node_ops(src)->key_merge); + true); else - nr = sort_repack(btree_bset_first(dst), + nr = 
bch2_sort_repack(btree_bset_first(dst), src, &src_iter, &dst->format, true); - bch2_time_stats_update(&c->times[BCH_TIME_btree_sort], start_time); + bch2_time_stats_update(&c->times[BCH_TIME_btree_node_sort], + start_time); set_btree_bset_end(dst, dst->set); @@ -829,7 +432,7 @@ static bool btree_node_compact(struct bch_fs *c, struct btree *b, for (unwritten_idx = 0; unwritten_idx < b->nsets; unwritten_idx++) - if (bset_unwritten(b, bset(b, &b->set[unwritten_idx]))) + if (!bset_written(b, bset(b, &b->set[unwritten_idx]))) break; if (b->nsets - unwritten_idx > 1) { @@ -852,7 +455,7 @@ void bch2_btree_build_aux_trees(struct btree *b) for_each_bset(b, t) bch2_bset_build_aux_tree(b, t, - bset_unwritten(b, bset(b, t)) && + !bset_written(b, bset(b, t)) && t == bset_tree_last(b)); } @@ -907,33 +510,27 @@ static void bset_encrypt(struct bch_fs *c, struct bset *i, unsigned offset) bch2_encrypt(c, BSET_CSUM_TYPE(i), nonce, &bn->flags, bytes); - nonce = nonce_add(nonce, round_up(bytes, CHACHA20_BLOCK_SIZE)); + nonce = nonce_add(nonce, round_up(bytes, CHACHA_BLOCK_SIZE)); } bch2_encrypt(c, BSET_CSUM_TYPE(i), nonce, i->_data, vstruct_end(i) - (void *) i->_data); } -static int btree_err_msg(struct bch_fs *c, struct btree *b, struct bset *i, - unsigned offset, int write, char *buf, size_t len) +static void btree_err_msg(struct printbuf *out, struct bch_fs *c, + struct btree *b, struct bset *i, + unsigned offset, int write) { - char *out = buf, *end = buf + len; - - out += scnprintf(out, end - out, - "error validating btree node %s " - "at btree %u level %u/%u\n" - "pos %llu:%llu node offset %u", - write ? "before write " : "", - b->btree_id, b->level, - c->btree_roots[b->btree_id].level, - b->key.k.p.inode, b->key.k.p.offset, - b->written); + pr_buf(out, "error validating btree node %s" + "at btree %u level %u/%u\n" + "pos %llu:%llu node offset %u", + write ? "before write " : "", + b->btree_id, b->level, + c->btree_roots[b->btree_id].level, + b->key.k.p.inode, b->key.k.p.offset, + b->written); if (i) - out += scnprintf(out, end - out, - " bset u64s %u", - le16_to_cpu(i->u64s)); - - return out - buf; + pr_buf(out, " bset u64s %u", le16_to_cpu(i->u64s)); } enum btree_err_type { @@ -950,10 +547,11 @@ enum btree_validate_ret { #define btree_err(type, c, b, i, msg, ...) 
\ ({ \ __label__ out; \ - char _buf[300], *out = _buf, *end = out + sizeof(_buf); \ + char _buf[300]; \ + struct printbuf out = PBUF(_buf); \ \ - out += btree_err_msg(c, b, i, b->written, write, out, end - out);\ - out += scnprintf(out, end - out, ": " msg, ##__VA_ARGS__); \ + btree_err_msg(&out, c, b, i, b->written, write); \ + pr_buf(&out, ": " msg, ##__VA_ARGS__); \ \ if (type == BTREE_ERR_FIXABLE && \ write == READ && \ @@ -1006,8 +604,8 @@ static int validate_bset(struct bch_fs *c, struct btree *b, { struct bkey_packed *k, *prev = NULL; struct bpos prev_pos = POS_MIN; - enum bkey_type type = btree_node_type(b); bool seen_non_whiteout = false; + unsigned version; const char *err; int ret = 0; @@ -1053,13 +651,12 @@ static int validate_bset(struct bch_fs *c, struct btree *b, "invalid bkey format: %s", err); } - if (btree_err_on(le16_to_cpu(i->version) != BCACHE_BSET_VERSION, - BTREE_ERR_FIXABLE, c, b, i, - "unsupported bset version")) { - i->version = cpu_to_le16(BCACHE_BSET_VERSION); - i->u64s = 0; - return 0; - } + version = le16_to_cpu(i->version); + btree_err_on((version != BCH_BSET_VERSION_OLD && + version < bcachefs_metadata_version_min) || + version >= bcachefs_metadata_version_max, + BTREE_ERR_FATAL, c, b, i, + "unsupported bset version"); if (btree_err_on(b->written + sectors > c->opts.btree_node_size, BTREE_ERR_FIXABLE, c, b, i, @@ -1108,19 +705,23 @@ static int validate_bset(struct bch_fs *c, struct btree *b, } if (BSET_BIG_ENDIAN(i) != CPU_BIG_ENDIAN) - bch2_bkey_swab(type, &b->format, k); + bch2_bkey_swab(&b->format, k); + + if (!write && + version < bcachefs_metadata_version_bkey_renumber) + bch2_bkey_renumber(btree_node_type(b), k, write); u = bkey_disassemble(b, k, &tmp); - invalid = __bch2_bkey_invalid(c, type, u) ?: + invalid = __bch2_bkey_invalid(c, u, btree_node_type(b)) ?: bch2_bkey_in_btree_node(b, u) ?: - (write ? bch2_bkey_val_invalid(c, type, u) : NULL); + (write ? 
bch2_bkey_val_invalid(c, u) : NULL); if (invalid) { char buf[160]; - bch2_bkey_val_to_text(c, type, buf, sizeof(buf), u); + bch2_bkey_val_to_text(&PBUF(buf), c, u); btree_err(BTREE_ERR_FIXABLE, c, b, i, - "invalid bkey:\n%s\n%s", buf, invalid); + "invalid bkey:\n%s\n%s", invalid, buf); i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s); memmove_u64s_down(k, bkey_next(k), @@ -1128,6 +729,10 @@ static int validate_bset(struct bch_fs *c, struct btree *b, continue; } + if (write && + version < bcachefs_metadata_version_bkey_renumber) + bch2_bkey_renumber(btree_node_type(b), k, write); + /* * with the separate whiteouts thing (used for extents), the * second set of keys actually can have whiteouts too, so we @@ -1166,12 +771,12 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry struct btree_node *sorted; struct bkey_packed *k; struct bset *i; - bool used_mempool; + bool used_mempool, blacklisted; unsigned u64s; int ret, retry_read = 0, write = READ; iter = mempool_alloc(&c->fill_iter, GFP_NOIO); - __bch2_btree_node_iter_large_init(iter, btree_node_is_extents(b)); + iter->used = 0; if (bch2_meta_read_fault("btree")) btree_err(BTREE_ERR_MUST_RETRY, c, b, NULL, @@ -1240,20 +845,15 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry b->written += sectors; - ret = bch2_journal_seq_should_ignore(c, le64_to_cpu(i->journal_seq), b); - if (ret < 0) { - btree_err(BTREE_ERR_FATAL, c, b, i, - "insufficient memory"); - goto err; - } + blacklisted = bch2_journal_seq_is_blacklisted(c, + le64_to_cpu(i->journal_seq), + true); - if (ret) { - btree_err_on(first, - BTREE_ERR_FIXABLE, c, b, i, - "first btree node bset has blacklisted journal seq"); - if (!first) - continue; - } + btree_err_on(blacklisted && first, + BTREE_ERR_FIXABLE, c, b, i, + "first btree node bset has blacklisted journal seq"); + if (blacklisted && !first) + continue; bch2_btree_node_iter_large_push(iter, b, i->start, @@ -1293,15 +893,16 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry i = &b->data->keys; for (k = i->start; k != vstruct_last(i);) { - enum bkey_type type = btree_node_type(b); struct bkey tmp; struct bkey_s_c u = bkey_disassemble(b, k, &tmp); - const char *invalid = bch2_bkey_val_invalid(c, type, u); + const char *invalid = bch2_bkey_val_invalid(c, u); - if (invalid) { + if (invalid || + (inject_invalid_keys(c) && + !bversion_cmp(u.k->version, MAX_VERSION))) { char buf[160]; - bch2_bkey_val_to_text(c, type, buf, sizeof(buf), u); + bch2_bkey_val_to_text(&PBUF(buf), c, u); btree_err(BTREE_ERR_FIXABLE, c, b, i, "invalid bkey %s: %s", buf, invalid); @@ -1310,6 +911,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s); memmove_u64s_down(k, bkey_next(k), (u64 *) vstruct_end(i) - (u64 *) k); + set_btree_bset_end(b, b->set); continue; } @@ -1324,7 +926,6 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry out: mempool_free(iter, &c->fill_iter); return retry_read; -err: fsck_err: if (ret == BTREE_RETRY_READ) { retry_read = 1; @@ -1343,11 +944,9 @@ static void btree_node_read_work(struct work_struct *work) struct bch_dev *ca = bch_dev_bkey_exists(c, rb->pick.ptr.dev); struct btree *b = rb->bio.bi_private; struct bio *bio = &rb->bio; - struct bch_devs_mask avoid; + struct bch_io_failures failed = { .nr = 0 }; bool can_retry; - memset(&avoid, 0, sizeof(avoid)); - goto start; while (1) { bch_info(c, "retrying read"); @@ 
-1370,8 +969,11 @@ start: percpu_ref_put(&ca->io_ref); rb->have_ioref = false; - __set_bit(rb->pick.ptr.dev, avoid.d); - can_retry = bch2_btree_pick_ptr(c, b, &avoid, &rb->pick) > 0; + bch2_mark_io_failure(&failed, &rb->pick); + + can_retry = bch2_bkey_pick_read_device(c, + bkey_i_to_s_c(&b->key), + &failed, &rb->pick) > 0; if (!bio->bi_status && !bch2_btree_node_read_done(c, b, can_retry)) @@ -1383,7 +985,8 @@ start: } } - bch2_time_stats_update(&c->times[BCH_TIME_btree_read], rb->start_time); + bch2_time_stats_update(&c->times[BCH_TIME_btree_node_read], + rb->start_time); bio_put(&rb->bio); clear_btree_node_read_in_flight(b); wake_up_bit(&b->flags, BTREE_NODE_read_in_flight); @@ -1406,7 +1009,7 @@ static void btree_node_read_endio(struct bio *bio) void bch2_btree_node_read(struct bch_fs *c, struct btree *b, bool sync) { - struct extent_pick_ptr pick; + struct extent_ptr_decoded pick; struct btree_read_bio *rb; struct bch_dev *ca; struct bio *bio; @@ -1414,7 +1017,8 @@ void bch2_btree_node_read(struct bch_fs *c, struct btree *b, trace_btree_read(c, b); - ret = bch2_btree_pick_ptr(c, b, NULL, &pick); + ret = bch2_bkey_pick_read_device(c, bkey_i_to_s_c(&b->key), + NULL, &pick); if (bch2_fs_fatal_err_on(ret <= 0, c, "btree node read error: no device to read from")) { set_btree_node_read_error(b); @@ -1423,7 +1027,9 @@ void bch2_btree_node_read(struct bch_fs *c, struct btree *b, ca = bch_dev_bkey_exists(c, pick.ptr.dev); - bio = bio_alloc_bioset(GFP_NOIO, btree_pages(c), &c->btree_bio); + bio = bio_alloc_bioset(GFP_NOIO, buf_pages(b->data, + btree_bytes(c)), + &c->btree_bio); rb = container_of(bio, struct btree_read_bio, bio); rb->c = c; rb->start_time = local_clock(); @@ -1539,22 +1145,24 @@ static void bch2_btree_node_write_error(struct bch_fs *c, { struct btree *b = wbio->wbio.bio.bi_private; __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX) tmp; - struct bkey_i_extent *new_key; - struct bkey_s_extent e; + struct bkey_i_btree_ptr *new_key; + struct bkey_s_btree_ptr bp; struct bch_extent_ptr *ptr; - struct btree_iter iter; + struct btree_trans trans; + struct btree_iter *iter; int ret; - __bch2_btree_iter_init(&iter, c, b->btree_id, b->key.k.p, - BTREE_MAX_DEPTH, - b->level, 0); + bch2_trans_init(&trans, c, 0, 0); + + iter = bch2_trans_get_node_iter(&trans, b->btree_id, b->key.k.p, + BTREE_MAX_DEPTH, b->level, 0); retry: - ret = bch2_btree_iter_traverse(&iter); + ret = bch2_btree_iter_traverse(iter); if (ret) goto err; /* has node been freed? 
*/ - if (iter.l[b->level].b != b) { + if (iter->l[b->level].b != b) { /* node has been freed: */ BUG_ON(!btree_node_dying(b)); goto out; @@ -1564,22 +1172,22 @@ retry: bkey_copy(&tmp.k, &b->key); - new_key = bkey_i_to_extent(&tmp.k); - e = extent_i_to_s(new_key); - extent_for_each_ptr_backwards(e, ptr) - if (bch2_dev_list_has_dev(wbio->wbio.failed, ptr->dev)) - bch2_extent_drop_ptr(e, ptr); + new_key = bkey_i_to_btree_ptr(&tmp.k); + bp = btree_ptr_i_to_s(new_key); + + bch2_bkey_drop_ptrs(bkey_i_to_s(&tmp.k), ptr, + bch2_dev_list_has_dev(wbio->wbio.failed, ptr->dev)); - if (!bch2_extent_nr_ptrs(e.c)) + if (!bch2_bkey_nr_ptrs(bp.s_c)) goto err; - ret = bch2_btree_node_update_key(c, &iter, b, new_key); + ret = bch2_btree_node_update_key(c, iter, b, new_key); if (ret == -EINTR) goto retry; if (ret) goto err; out: - bch2_btree_iter_unlock(&iter); + bch2_trans_exit(&trans); bio_put(&wbio->wbio.bio); btree_node_write_done(c, b); return; @@ -1673,12 +1281,11 @@ static void btree_node_write_endio(struct bio *bio) static int validate_bset_for_write(struct bch_fs *c, struct btree *b, struct bset *i, unsigned sectors) { - const struct bch_extent_ptr *ptr; unsigned whiteout_u64s = 0; int ret; - extent_for_each_ptr(bkey_i_to_s_c_extent(&b->key), ptr) - break; + if (bch2_bkey_invalid(c, bkey_i_to_s_c(&b->key), BKEY_TYPE_BTREE)) + return -1; ret = validate_bset(c, b, i, sectors, &whiteout_u64s, WRITE, false); if (ret) @@ -1696,7 +1303,6 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, struct btree_node *bn = NULL; struct btree_node_entry *bne = NULL; BKEY_PADDED(key) k; - struct bkey_s_extent e; struct bch_extent_ptr *ptr; struct sort_iter sort_iter; struct nonce nonce; @@ -1704,6 +1310,7 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, u64 seq = 0; bool used_mempool; unsigned long old, new; + bool validate_before_checksum = false; void *data; if (test_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags)) @@ -1722,8 +1329,7 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, if (!(old & (1 << BTREE_NODE_dirty))) return; - if (b->written && - !btree_node_may_write(b)) + if (!btree_node_may_write(b)) return; if (old & (1 << BTREE_NODE_write_in_flight)) { @@ -1739,7 +1345,6 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, } while (cmpxchg_acquire(&b->flags, old, new) != old); BUG_ON(btree_node_fake(b)); - BUG_ON(!list_empty(&b->write_blocked)); BUG_ON((b->will_make_reachable != 0) != !b->written); BUG_ON(b->written >= c->opts.btree_node_size); @@ -1817,8 +1422,8 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, b->whiteout_u64s = 0; u64s = btree_node_is_extents(b) - ? sort_extents(vstruct_last(i), &sort_iter, false) - : sort_keys(i->start, &sort_iter, false); + ? bch2_sort_extents(vstruct_last(i), &sort_iter, false) + : bch2_sort_keys(i->start, &sort_iter, false); le16_add_cpu(&i->u64s, u64s); clear_needs_whiteout(i); @@ -1837,11 +1442,21 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, BUG_ON(BSET_BIG_ENDIAN(i) != CPU_BIG_ENDIAN); BUG_ON(i->seq != b->data->keys.seq); - i->version = cpu_to_le16(BCACHE_BSET_VERSION); + i->version = c->sb.version < bcachefs_metadata_version_new_versioning + ? 
cpu_to_le16(BCH_BSET_VERSION_OLD) + : cpu_to_le16(c->sb.version); SET_BSET_CSUM_TYPE(i, bch2_meta_checksum_type(c)); + if (bch2_csum_type_is_encryption(BSET_CSUM_TYPE(i))) + validate_before_checksum = true; + + /* validate_bset will be modifying: */ + if (le16_to_cpu(i->version) < + bcachefs_metadata_version_bkey_renumber) + validate_before_checksum = true; + /* if we're going to be encrypting, check metadata validity first: */ - if (bch2_csum_type_is_encryption(BSET_CSUM_TYPE(i)) && + if (validate_before_checksum && validate_bset_for_write(c, b, i, sectors_to_write)) goto err; @@ -1855,7 +1470,7 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, bne->csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne); /* if we're not encrypting, check metadata after checksumming: */ - if (!bch2_csum_type_is_encryption(BSET_CSUM_TYPE(i)) && + if (!validate_before_checksum && validate_bset_for_write(c, b, i, sectors_to_write)) goto err; @@ -1878,7 +1493,9 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, trace_btree_write(b, bytes_to_write, sectors_to_write); - wbio = container_of(bio_alloc_bioset(GFP_NOIO, 1 << order, &c->btree_bio), + wbio = container_of(bio_alloc_bioset(GFP_NOIO, + buf_pages(data, sectors_to_write << 9), + &c->btree_bio), struct btree_write_bio, wbio.bio); wbio_init(&wbio->wbio.bio); wbio->data = data; @@ -1907,9 +1524,8 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, */ bkey_copy(&k.key, &b->key); - e = bkey_i_to_s_extent(&k.key); - extent_for_each_ptr(e, ptr) + bkey_for_each_ptr(bch2_bkey_ptrs(bkey_i_to_s(&k.key)), ptr) ptr->offset += b->written; b->written += sectors_to_write; @@ -1942,9 +1558,9 @@ bool bch2_btree_post_write_cleanup(struct bch_fs *c, struct btree *b) clear_btree_node_just_written(b); /* - * Note: immediately after write, bset_unwritten()/bset_written() don't - * work - the amount of data we had to write after compaction might have - * been smaller than the offset of the last bset. + * Note: immediately after write, bset_written() doesn't work - the + * amount of data we had to write after compaction might have been + * smaller than the offset of the last bset. 
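(The validate_before_checksum logic a few hunks above deserves a note: validation must run before the checksum is computed whenever something will still modify the bset contents afterwards; otherwise it can run after checksumming as a cheaper sanity check. Condensed into a sketch — not a drop-in replacement, just the two conditions from the hunk above as a predicate:

static bool example_validate_before_checksum(struct bch_fs *c, struct bset *i)
{
	/* encrypted bsets are checksummed post-encryption, so the
	 * plaintext has to be validated first: */
	if (bch2_csum_type_is_encryption(BSET_CSUM_TYPE(i)))
		return true;

	/* validate_bset() renumbers bkeys on pre-renumber bsets,
	 * i.e. it modifies what the checksum would cover: */
	if (le16_to_cpu(i->version) <
	    bcachefs_metadata_version_bkey_renumber)
		return true;

	return false;
})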
* * However, we know that all bsets have been written here, as long as * we're still holding the write lock: @@ -2054,7 +1670,7 @@ void bch2_btree_verify_flushed(struct bch_fs *c) ssize_t bch2_dirty_btree_nodes_print(struct bch_fs *c, char *buf) { - char *out = buf, *end = buf + PAGE_SIZE; + struct printbuf out = _PBUF(buf, PAGE_SIZE); struct bucket_table *tbl; struct rhash_head *pos; struct btree *b; @@ -2065,24 +1681,22 @@ ssize_t bch2_dirty_btree_nodes_print(struct bch_fs *c, char *buf) unsigned long flags = READ_ONCE(b->flags); unsigned idx = (flags & (1 << BTREE_NODE_write_idx)) != 0; - if (//!(flags & (1 << BTREE_NODE_dirty)) && - !b->writes[0].wait.list.first && - !b->writes[1].wait.list.first && - !(b->will_make_reachable & 1)) + if (!(flags & (1 << BTREE_NODE_dirty))) continue; - out += scnprintf(out, end - out, "%p d %u l %u w %u b %u r %u:%lu c %u p %u\n", - b, - (flags & (1 << BTREE_NODE_dirty)) != 0, - b->level, - b->written, - !list_empty_careful(&b->write_blocked), - b->will_make_reachable != 0, - b->will_make_reachable & 1, - b->writes[ idx].wait.list.first != NULL, - b->writes[!idx].wait.list.first != NULL); + pr_buf(&out, "%p d %u n %u l %u w %u b %u r %u:%lu c %u p %u\n", + b, + (flags & (1 << BTREE_NODE_dirty)) != 0, + (flags & (1 << BTREE_NODE_need_write)) != 0, + b->level, + b->written, + !list_empty_careful(&b->write_blocked), + b->will_make_reachable != 0, + b->will_make_reachable & 1, + b->writes[ idx].wait.list.first != NULL, + b->writes[!idx].wait.list.first != NULL); } rcu_read_unlock(); - return out - buf; + return out.pos - buf; } diff --git a/fs/bcachefs/btree_io.h b/fs/bcachefs/btree_io.h index fa1546425151..c817aeed878a 100644 --- a/fs/bcachefs/btree_io.h +++ b/fs/bcachefs/btree_io.h @@ -1,7 +1,9 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _BCACHEFS_BTREE_IO_H #define _BCACHEFS_BTREE_IO_H #include "bset.h" +#include "btree_locking.h" #include "extents.h" #include "io_types.h" @@ -14,7 +16,7 @@ struct btree_read_bio { struct bch_fs *c; u64 start_time; unsigned have_ioref:1; - struct extent_pick_ptr pick; + struct extent_ptr_decoded pick; struct work_struct work; struct bio bio; }; @@ -47,7 +49,7 @@ static inline void btree_node_wait_on_io(struct btree *b) static inline bool btree_node_may_write(struct btree *b) { return list_empty_careful(&b->write_blocked) && - !b->will_make_reachable; + (!b->written || !b->will_make_reachable); } enum compact_mode { @@ -99,42 +101,36 @@ bool bch2_btree_post_write_cleanup(struct bch_fs *, struct btree *); void bch2_btree_node_write(struct bch_fs *, struct btree *, enum six_lock_type); -/* - * btree_node_dirty() can be cleared with only a read lock, - * and for bch2_btree_node_write_cond() we want to set need_write iff it's - * still dirty: - */ -static inline void set_btree_node_need_write_if_dirty(struct btree *b) +static inline void btree_node_write_if_need(struct bch_fs *c, struct btree *b) { - unsigned long old, new, v = READ_ONCE(b->flags); - - do { - old = new = v; - - if (!(old & (1 << BTREE_NODE_dirty))) - return; - - new |= (1 << BTREE_NODE_need_write); - } while ((v = cmpxchg(&b->flags, old, new)) != old); + while (b->written && + btree_node_need_write(b) && + btree_node_may_write(b)) { + if (!btree_node_write_in_flight(b)) { + bch2_btree_node_write(c, b, SIX_LOCK_read); + break; + } + + six_unlock_read(&b->lock); + btree_node_wait_on_io(b); + btree_node_lock_type(c, b, SIX_LOCK_read); + } } #define bch2_btree_node_write_cond(_c, _b, cond) \ do { \ - while ((_b)->written && btree_node_dirty(_b) && (cond)) { 
\ - if (!btree_node_may_write(_b)) { \ - set_btree_node_need_write_if_dirty(_b); \ - break; \ - } \ + unsigned long old, new, v = READ_ONCE((_b)->flags); \ \ - if (!btree_node_write_in_flight(_b)) { \ - bch2_btree_node_write(_c, _b, SIX_LOCK_read); \ + do { \ + old = new = v; \ + \ + if (!(old & (1 << BTREE_NODE_dirty)) || !(cond)) \ break; \ - } \ \ - six_unlock_read(&(_b)->lock); \ - btree_node_wait_on_io(_b); \ - btree_node_lock_type(c, b, SIX_LOCK_read); \ - } \ + new |= (1 << BTREE_NODE_need_write); \ + } while ((v = cmpxchg(&(_b)->flags, old, new)) != old); \ + \ + btree_node_write_if_need(_c, _b); \ } while (0) void bch2_btree_flush_all_reads(struct bch_fs *); @@ -142,55 +138,4 @@ void bch2_btree_flush_all_writes(struct bch_fs *); void bch2_btree_verify_flushed(struct bch_fs *); ssize_t bch2_dirty_btree_nodes_print(struct bch_fs *, char *); -/* Sorting */ - -struct btree_node_iter_large { - u8 is_extents; - u16 used; - - struct btree_node_iter_set data[MAX_BSETS]; -}; - -static inline void -__bch2_btree_node_iter_large_init(struct btree_node_iter_large *iter, - bool is_extents) -{ - iter->used = 0; - iter->is_extents = is_extents; -} - -void bch2_btree_node_iter_large_advance(struct btree_node_iter_large *, - struct btree *); - -void bch2_btree_node_iter_large_push(struct btree_node_iter_large *, - struct btree *, - const struct bkey_packed *, - const struct bkey_packed *); - -static inline bool bch2_btree_node_iter_large_end(struct btree_node_iter_large *iter) -{ - return !iter->used; -} - -static inline struct bkey_packed * -bch2_btree_node_iter_large_peek_all(struct btree_node_iter_large *iter, - struct btree *b) -{ - return bch2_btree_node_iter_large_end(iter) - ? NULL - : __btree_node_offset_to_key(b, iter->data->k); -} - -static inline struct bkey_packed * -bch2_btree_node_iter_large_next_all(struct btree_node_iter_large *iter, - struct btree *b) -{ - struct bkey_packed *ret = bch2_btree_node_iter_large_peek_all(iter, b); - - if (ret) - bch2_btree_node_iter_large_advance(iter, b); - - return ret; -} - #endif /* _BCACHEFS_BTREE_IO_H */ diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 70c3132eb538..8955555d6603 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" #include "bkey_methods.h" @@ -14,11 +15,51 @@ static inline struct bkey_s_c __btree_iter_peek_all(struct btree_iter *, struct btree_iter_level *, struct bkey *); -#define BTREE_ITER_NOT_END ((struct btree *) 1) +#define BTREE_ITER_NO_NODE_GET_LOCKS ((struct btree *) 1) +#define BTREE_ITER_NO_NODE_DROP ((struct btree *) 2) +#define BTREE_ITER_NO_NODE_LOCK_ROOT ((struct btree *) 3) +#define BTREE_ITER_NO_NODE_UP ((struct btree *) 4) +#define BTREE_ITER_NO_NODE_DOWN ((struct btree *) 5) +#define BTREE_ITER_NO_NODE_INIT ((struct btree *) 6) +#define BTREE_ITER_NO_NODE_ERROR ((struct btree *) 7) static inline bool is_btree_node(struct btree_iter *iter, unsigned l) { - return iter->l[l].b && iter->l[l].b != BTREE_ITER_NOT_END; + return l < BTREE_MAX_DEPTH && + (unsigned long) iter->l[l].b >= 128; +} + +/* Returns < 0 if @k is before iter pos, > 0 if @k is after */ +static inline int __btree_iter_pos_cmp(struct btree_iter *iter, + const struct btree *b, + const struct bkey_packed *k, + bool interior_node) +{ + int cmp = bkey_cmp_left_packed(b, k, &iter->pos); + + if (cmp) + return cmp; + if (bkey_deleted(k)) + return -1; + + /* + * Normally, for extents we want the first key strictly greater than + * the iterator 
position - with the exception that for interior nodes, + * we don't want to advance past the last key if the iterator position + * is POS_MAX: + */ + if (iter->flags & BTREE_ITER_IS_EXTENTS && + (!interior_node || + bkey_cmp_left_packed_byval(b, k, POS_MAX))) + return -1; + return 1; +} + +static inline int btree_iter_pos_cmp(struct btree_iter *iter, + const struct btree *b, + const struct bkey_packed *k) +{ + return __btree_iter_pos_cmp(iter, b, k, b->level != 0); } /* Btree node locking: */ @@ -32,23 +73,22 @@ void bch2_btree_node_unlock_write(struct btree *b, struct btree_iter *iter) struct btree_iter *linked; EBUG_ON(iter->l[b->level].b != b); - EBUG_ON(iter->lock_seq[b->level] + 1 != b->lock.state.seq); + EBUG_ON(iter->l[b->level].lock_seq + 1 != b->lock.state.seq); - for_each_linked_btree_node(iter, b, linked) - linked->lock_seq[b->level] += 2; - - iter->lock_seq[b->level] += 2; + trans_for_each_iter_with_node(iter->trans, b, linked) + linked->l[b->level].lock_seq += 2; six_unlock_write(&b->lock); } void __bch2_btree_node_lock_write(struct btree *b, struct btree_iter *iter) { - struct bch_fs *c = iter->c; struct btree_iter *linked; unsigned readers = 0; - for_each_linked_btree_iter(iter, linked) + EBUG_ON(btree_node_read_locked(iter, b->level)); + + trans_for_each_iter(iter->trans, linked) if (linked->l[b->level].b == b && btree_node_read_locked(linked, b->level)) readers++; @@ -61,63 +101,112 @@ void __bch2_btree_node_lock_write(struct btree *b, struct btree_iter *iter) */ atomic64_sub(__SIX_VAL(read_lock, readers), &b->lock.state.counter); - btree_node_lock_type(c, b, SIX_LOCK_write); + btree_node_lock_type(iter->trans->c, b, SIX_LOCK_write); atomic64_add(__SIX_VAL(read_lock, readers), &b->lock.state.counter); } bool __bch2_btree_node_relock(struct btree_iter *iter, unsigned level) { - struct btree_iter *linked; - struct btree *b = iter->l[level].b; - int want = btree_lock_want(iter, level); - int have = btree_node_locked_type(iter, level); + struct btree *b = btree_iter_node(iter, level); + int want = __btree_lock_want(iter, level); + + if (!is_btree_node(iter, level)) + return false; + + if (race_fault()) + return false; - if (want == have) + if (six_relock_type(&b->lock, want, iter->l[level].lock_seq) || + (btree_node_lock_seq_matches(iter, b, level) && + btree_node_lock_increment(iter, b, level, want))) { + mark_btree_node_locked(iter, level, want); return true; + } else { + return false; + } +} + +static bool bch2_btree_node_upgrade(struct btree_iter *iter, unsigned level) +{ + struct btree *b = iter->l[level].b; + + EBUG_ON(btree_lock_want(iter, level) != BTREE_NODE_INTENT_LOCKED); if (!is_btree_node(iter, level)) return false; + if (btree_node_intent_locked(iter, level)) + return true; + if (race_fault()) return false; - if (have != BTREE_NODE_UNLOCKED - ? six_trylock_convert(&b->lock, have, want) - : six_relock_type(&b->lock, want, iter->lock_seq[level])) + if (btree_node_locked(iter, level) + ? 
six_lock_tryupgrade(&b->lock) + : six_relock_type(&b->lock, SIX_LOCK_intent, iter->l[level].lock_seq)) goto success; - for_each_linked_btree_iter(iter, linked) - if (linked->l[level].b == b && - btree_node_locked_type(linked, level) == want && - iter->lock_seq[level] == b->lock.state.seq) { - btree_node_unlock(iter, level); - six_lock_increment(&b->lock, want); - goto success; - } + if (btree_node_lock_seq_matches(iter, b, level) && + btree_node_lock_increment(iter, b, level, BTREE_NODE_INTENT_LOCKED)) { + btree_node_unlock(iter, level); + goto success; + } return false; success: - mark_btree_node_unlocked(iter, level); - mark_btree_node_locked(iter, level, want); + mark_btree_node_intent_locked(iter, level); return true; } -bool bch2_btree_iter_relock(struct btree_iter *iter) +static inline bool btree_iter_get_locks(struct btree_iter *iter, + bool upgrade, bool trace) { - unsigned l; + unsigned l = iter->level; + int fail_idx = -1; - for (l = iter->level; - l < max_t(unsigned, iter->locks_want, 1) && iter->l[l].b; - l++) - if (!bch2_btree_node_relock(iter, l)) { + do { + if (!btree_iter_node(iter, l)) + break; + + if (!(upgrade + ? bch2_btree_node_upgrade(iter, l) + : bch2_btree_node_relock(iter, l))) { + if (trace) + (upgrade + ? trace_node_upgrade_fail + : trace_node_relock_fail)(l, iter->l[l].lock_seq, + is_btree_node(iter, l) + ? 0 + : (unsigned long) iter->l[l].b, + is_btree_node(iter, l) + ? iter->l[l].b->lock.state.seq + : 0); + + fail_idx = l; btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE); - return false; } + l++; + } while (l < iter->locks_want); + + /* + * When we fail to get a lock, we have to ensure that any child nodes + * can't be relocked so bch2_btree_iter_traverse has to walk back up to + * the node that we failed to relock: + */ + while (fail_idx >= 0) { + btree_node_unlock(iter, fail_idx); + iter->l[fail_idx].b = BTREE_ITER_NO_NODE_GET_LOCKS; + --fail_idx; + } + if (iter->uptodate == BTREE_ITER_NEED_RELOCK) iter->uptodate = BTREE_ITER_NEED_PEEK; - return true; + + bch2_btree_trans_verify_locks(iter->trans); + + return iter->uptodate < BTREE_ITER_NEED_RELOCK; } /* Slowpath: */ @@ -126,40 +215,18 @@ bool __bch2_btree_node_lock(struct btree *b, struct bpos pos, struct btree_iter *iter, enum six_lock_type type) { - struct bch_fs *c = iter->c; struct btree_iter *linked; + bool ret = true; - /* Can't have children locked before ancestors: */ - EBUG_ON(iter->nodes_locked && level > __ffs(iter->nodes_locked)); - - /* - * Can't hold any read locks while we block taking an intent lock - see - * below for reasoning, and we should have already dropped any read - * locks in the current iterator - */ - EBUG_ON(type == SIX_LOCK_intent && - iter->nodes_locked != iter->nodes_intent_locked); - - for_each_linked_btree_iter(iter, linked) - if (linked->l[level].b == b && - btree_node_locked_type(linked, level) == type) { - six_lock_increment(&b->lock, type); - return true; - } - - /* - * Must lock btree nodes in key order - this case hapens when locking - * the prev sibling in btree node merging: - */ - if (iter->nodes_locked && - __ffs(iter->nodes_locked) == level && - __btree_iter_cmp(iter->btree_id, pos, iter)) - return false; - - for_each_linked_btree_iter(iter, linked) { + /* Check if it's safe to block: */ + trans_for_each_iter(iter->trans, linked) { if (!linked->nodes_locked) continue; + /* * Must lock btree nodes in key order: */ + if (__btree_iter_cmp(iter->btree_id, pos, linked) < 0) + ret = false; + /* * Can't block taking an intent lock if we have _any_ nodes read * 
locked: @@ -174,16 +241,15 @@ bool __bch2_btree_node_lock(struct btree *b, struct bpos pos, */ if (type == SIX_LOCK_intent && linked->nodes_locked != linked->nodes_intent_locked) { - linked->locks_want = max_t(unsigned, - linked->locks_want, - iter->locks_want); - return false; + if (!(iter->trans->nounlock)) { + linked->locks_want = max_t(unsigned, + linked->locks_want, + __fls(linked->nodes_locked) + 1); + btree_iter_get_locks(linked, true, false); + } + ret = false; } - /* We have to lock btree nodes in key order: */ - if (__btree_iter_cmp(iter->btree_id, pos, linked) < 0) - return false; - /* * Interior nodes must be locked before their descendants: if * another iterator has possible descendants locked of the node @@ -191,87 +257,169 @@ bool __bch2_btree_node_lock(struct btree *b, struct bpos pos, */ if (linked->btree_id == iter->btree_id && level > __fls(linked->nodes_locked)) { - linked->locks_want = max_t(unsigned, - linked->locks_want, - iter->locks_want); - return false; + if (!(iter->trans->nounlock)) { + linked->locks_want = + max(level + 1, max_t(unsigned, + linked->locks_want, + iter->locks_want)); + btree_iter_get_locks(linked, true, false); + } + ret = false; } } - __btree_node_lock_type(c, b, type); + if (unlikely(!ret)) { + trace_trans_restart_would_deadlock(iter->trans->ip); + return false; + } + + __btree_node_lock_type(iter->trans->c, b, type); return true; } /* Btree iterator locking: */ -static void btree_iter_drop_extra_locks(struct btree_iter *iter) +#ifdef CONFIG_BCACHEFS_DEBUG +void bch2_btree_iter_verify_locks(struct btree_iter *iter) { unsigned l; - while (iter->nodes_locked && - (l = __fls(iter->nodes_locked)) > iter->locks_want) { - if (l > iter->level) { - btree_node_unlock(iter, l); - } else { - if (btree_node_intent_locked(iter, l)) { - six_lock_downgrade(&iter->l[l].b->lock); - iter->nodes_intent_locked ^= 1 << l; - } - break; - } + for (l = 0; btree_iter_node(iter, l); l++) { + if (iter->uptodate >= BTREE_ITER_NEED_RELOCK && + !btree_node_locked(iter, l)) + continue; + + BUG_ON(btree_lock_want(iter, l) != + btree_node_locked_type(iter, l)); } } -bool __bch2_btree_iter_set_locks_want(struct btree_iter *iter, - unsigned new_locks_want) +void bch2_btree_trans_verify_locks(struct btree_trans *trans) +{ + struct btree_iter *iter; + + trans_for_each_iter(trans, iter) + bch2_btree_iter_verify_locks(iter); +} +#endif + +__flatten +static bool bch2_btree_iter_relock(struct btree_iter *iter, bool trace) +{ + return iter->uptodate >= BTREE_ITER_NEED_RELOCK + ? 
btree_iter_get_locks(iter, false, trace) + : true; +} + +bool __bch2_btree_iter_upgrade(struct btree_iter *iter, + unsigned new_locks_want) { struct btree_iter *linked; - /* Drop locks we don't want anymore: */ - if (new_locks_want < iter->locks_want) - for_each_linked_btree_iter(iter, linked) - if (linked->locks_want > new_locks_want) { - linked->locks_want = max_t(unsigned, 1, - new_locks_want); - btree_iter_drop_extra_locks(linked); - } + EBUG_ON(iter->locks_want >= new_locks_want); iter->locks_want = new_locks_want; - btree_iter_drop_extra_locks(iter); - if (bch2_btree_iter_relock(iter)) + if (btree_iter_get_locks(iter, true, true)) return true; /* - * Just an optimization: ancestor nodes must be locked before child - * nodes, so set locks_want on iterators that might lock ancestors - * before us to avoid getting -EINTR later: + * Ancestor nodes must be locked before child nodes, so set locks_want + * on iterators that might lock ancestors before us to avoid getting + * -EINTR later: */ - for_each_linked_btree_iter(iter, linked) - if (linked->btree_id == iter->btree_id && - btree_iter_cmp(linked, iter) <= 0) - linked->locks_want = max_t(unsigned, linked->locks_want, - new_locks_want); + trans_for_each_iter(iter->trans, linked) + if (linked != iter && + linked->btree_id == iter->btree_id && + linked->locks_want < new_locks_want) { + linked->locks_want = new_locks_want; + btree_iter_get_locks(linked, true, false); + } + return false; } -static void __bch2_btree_iter_unlock(struct btree_iter *iter) +bool __bch2_btree_iter_upgrade_nounlock(struct btree_iter *iter, + unsigned new_locks_want) { - btree_iter_set_dirty(iter, BTREE_ITER_NEED_RELOCK); + unsigned l = iter->level; + + EBUG_ON(iter->locks_want >= new_locks_want); + + iter->locks_want = new_locks_want; + + do { + if (!btree_iter_node(iter, l)) + break; + + if (!bch2_btree_node_upgrade(iter, l)) { + iter->locks_want = l; + return false; + } - while (iter->nodes_locked) - btree_node_unlock(iter, __ffs(iter->nodes_locked)); + l++; + } while (l < iter->locks_want); + + return true; } -int bch2_btree_iter_unlock(struct btree_iter *iter) +void __bch2_btree_iter_downgrade(struct btree_iter *iter, + unsigned downgrade_to) { struct btree_iter *linked; + unsigned l; + + /* + * We downgrade linked iterators as well because btree_iter_upgrade + * might have had to modify locks_want on linked iterators due to lock + * ordering: + */ + trans_for_each_iter(iter->trans, linked) { + unsigned new_locks_want = downgrade_to ?: + (linked->flags & BTREE_ITER_INTENT ? 1 : 0); + + if (linked->locks_want <= new_locks_want) + continue; - for_each_linked_btree_iter(iter, linked) - __bch2_btree_iter_unlock(linked); - __bch2_btree_iter_unlock(iter); + linked->locks_want = new_locks_want; + + while (linked->nodes_locked && + (l = __fls(linked->nodes_locked)) >= linked->locks_want) { + if (l > linked->level) { + btree_node_unlock(linked, l); + } else { + if (btree_node_intent_locked(linked, l)) { + six_lock_downgrade(&linked->l[l].b->lock); + linked->nodes_intent_locked ^= 1 << l; + } + break; + } + } + } - return iter->flags & BTREE_ITER_ERROR ? 
-EIO : 0; + bch2_btree_trans_verify_locks(iter->trans); +} + +/* Btree transaction locking: */ + +bool bch2_trans_relock(struct btree_trans *trans) +{ + struct btree_iter *iter; + bool ret = true; + + trans_for_each_iter(trans, iter) + if (iter->uptodate == BTREE_ITER_NEED_RELOCK) + ret &= bch2_btree_iter_relock(iter, true); + + return ret; +} + +void bch2_trans_unlock(struct btree_trans *trans) +{ + struct btree_iter *iter; + + trans_for_each_iter(trans, iter) + __bch2_btree_iter_unlock(iter); } /* Btree iterator: */ @@ -285,48 +433,67 @@ static void __bch2_btree_iter_verify(struct btree_iter *iter, struct btree_node_iter tmp = l->iter; struct bkey_packed *k; + if (!debug_check_iterators(iter->trans->c)) + return; + + if (iter->uptodate > BTREE_ITER_NEED_PEEK) + return; + bch2_btree_node_iter_verify(&l->iter, b); /* * For interior nodes, the iterator will have skipped past * deleted keys: + * + * For extents, the iterator may have skipped past deleted keys (but not + * whiteouts) */ - k = b->level - ? bch2_btree_node_iter_prev(&tmp, b) + k = b->level || iter->flags & BTREE_ITER_IS_EXTENTS + ? bch2_btree_node_iter_prev_filter(&tmp, b, KEY_TYPE_discard) : bch2_btree_node_iter_prev_all(&tmp, b); - if (k && btree_iter_pos_cmp_packed(b, &iter->pos, k, - iter->flags & BTREE_ITER_IS_EXTENTS)) { + if (k && btree_iter_pos_cmp(iter, b, k) > 0) { char buf[100]; struct bkey uk = bkey_unpack_key(b, k); - bch2_bkey_to_text(buf, sizeof(buf), &uk); - panic("prev key should be before after pos:\n%s\n%llu:%llu\n", + bch2_bkey_to_text(&PBUF(buf), &uk); + panic("prev key should be before iter pos:\n%s\n%llu:%llu\n", buf, iter->pos.inode, iter->pos.offset); } k = bch2_btree_node_iter_peek_all(&l->iter, b); - if (k && !btree_iter_pos_cmp_packed(b, &iter->pos, k, - iter->flags & BTREE_ITER_IS_EXTENTS)) { + if (k && btree_iter_pos_cmp(iter, b, k) < 0) { char buf[100]; struct bkey uk = bkey_unpack_key(b, k); - bch2_bkey_to_text(buf, sizeof(buf), &uk); - panic("next key should be before iter pos:\n%llu:%llu\n%s\n", + bch2_bkey_to_text(&PBUF(buf), &uk); + panic("iter should be after current key:\n" + "iter pos %llu:%llu\n" + "cur key %s\n", iter->pos.inode, iter->pos.offset, buf); } + + BUG_ON(iter->uptodate == BTREE_ITER_UPTODATE && + (iter->flags & BTREE_ITER_TYPE) == BTREE_ITER_KEYS && + !bkey_whiteout(&iter->k) && + bch2_btree_node_iter_end(&l->iter)); } void bch2_btree_iter_verify(struct btree_iter *iter, struct btree *b) { struct btree_iter *linked; - if (iter->l[b->level].b == b) - __bch2_btree_iter_verify(iter, b); + if (!debug_check_iterators(iter->trans->c)) + return; - for_each_linked_btree_node(iter, b, linked) - __bch2_btree_iter_verify(iter, b); + trans_for_each_iter_with_node(iter->trans, b, linked) + __bch2_btree_iter_verify(linked, b); } +#else + +static inline void __bch2_btree_iter_verify(struct btree_iter *iter, + struct btree *b) {} + #endif static void __bch2_btree_node_iter_fix(struct btree_iter *iter, @@ -341,7 +508,7 @@ static void __bch2_btree_node_iter_fix(struct btree_iter *iter, struct btree_node_iter_set *set; unsigned offset = __btree_node_key_to_offset(b, where); int shift = new_u64s - clobber_u64s; - unsigned old_end = (int) __btree_node_key_to_offset(b, end) - shift; + unsigned old_end = t->end_offset - shift; btree_node_iter_for_each(node_iter, set) if (set->end == old_end) @@ -349,8 +516,9 @@ static void __bch2_btree_node_iter_fix(struct btree_iter *iter, /* didn't find the bset in the iterator - might have to readd it: */ if (new_u64s && - btree_iter_pos_cmp_packed(b, 
&iter->pos, where, - iter->flags & BTREE_ITER_IS_EXTENTS)) { + btree_iter_pos_cmp(iter, b, where) > 0) { + btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK); + bch2_btree_node_iter_push(node_iter, b, where, end); if (!b->level && @@ -361,15 +529,14 @@ static void __bch2_btree_node_iter_fix(struct btree_iter *iter, } return; found: - set->end = (int) set->end + shift; + set->end = t->end_offset; /* Iterator hasn't gotten to the key that changed yet: */ if (set->k < offset) return; if (new_u64s && - btree_iter_pos_cmp_packed(b, &iter->pos, where, - iter->flags & BTREE_ITER_IS_EXTENTS)) { + btree_iter_pos_cmp(iter, b, where) > 0) { set->k = offset; } else if (set->k < offset + clobber_u64s) { set->k = offset + new_u64s; @@ -380,9 +547,27 @@ found: goto iter_current_key_not_modified; } + btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK); + bch2_btree_node_iter_sort(node_iter, b); - if (!b->level && node_iter == &iter->l[0].iter) + if (!b->level && node_iter == &iter->l[0].iter) { + /* + * not legal to call bkey_debugcheck() here, because we're + * called midway through the update path after update has been + * marked but before deletes have actually happened: + */ +#if 0 __btree_iter_peek_all(iter, &iter->l[0], &iter->k); +#endif + struct btree_iter_level *l = &iter->l[0]; + struct bkey_packed *k = + bch2_btree_node_iter_peek_all(&l->iter, l->b); + + if (unlikely(!k)) + iter->k.type = KEY_TYPE_deleted; + else + bkey_disassemble(l->b, k, &iter->k); + } iter_current_key_not_modified: /* @@ -407,9 +592,8 @@ iter_current_key_not_modified: * always point to the key for the child node the btree iterator points * to. */ - if (b->level && new_u64s && !bkey_deleted(where) && - btree_iter_pos_cmp_packed(b, &iter->pos, where, - iter->flags & BTREE_ITER_IS_EXTENTS)) { + if (b->level && new_u64s && + btree_iter_pos_cmp(iter, b, where) > 0) { struct bset_tree *t; struct bkey_packed *k; @@ -420,8 +604,7 @@ iter_current_key_not_modified: k = bch2_bkey_prev_all(b, t, bch2_btree_node_iter_bset_pos(node_iter, b, t)); if (k && - __btree_node_iter_cmp(node_iter, b, - k, where) > 0) { + bkey_iter_cmp(b, k, where) > 0) { struct btree_node_iter_set *set; unsigned offset = __btree_node_key_to_offset(b, bkey_next(k)); @@ -443,32 +626,23 @@ next_bset: } void bch2_btree_node_iter_fix(struct btree_iter *iter, - struct btree *b, - struct btree_node_iter *node_iter, - struct bset_tree *t, - struct bkey_packed *where, - unsigned clobber_u64s, - unsigned new_u64s) + struct btree *b, + struct btree_node_iter *node_iter, + struct bkey_packed *where, + unsigned clobber_u64s, + unsigned new_u64s) { + struct bset_tree *t = bch2_bkey_to_bset(b, where); struct btree_iter *linked; if (node_iter != &iter->l[b->level].iter) __bch2_btree_node_iter_fix(iter, b, node_iter, t, where, clobber_u64s, new_u64s); - if (iter->l[b->level].b == b) - __bch2_btree_node_iter_fix(iter, b, - &iter->l[b->level].iter, t, - where, clobber_u64s, new_u64s); - - for_each_linked_btree_node(iter, b, linked) + trans_for_each_iter_with_node(iter->trans, b, linked) __bch2_btree_node_iter_fix(linked, b, &linked->l[b->level].iter, t, where, clobber_u64s, new_u64s); - - /* interior node iterators are... special... 
*/ - if (!b->level) - bch2_btree_iter_verify(iter, b); } static inline struct bkey_s_c __btree_iter_unpack(struct btree_iter *iter, @@ -483,14 +657,14 @@ static inline struct bkey_s_c __btree_iter_unpack(struct btree_iter *iter, * signal to bch2_btree_iter_peek_slot() that we're currently at * a hole */ - u->type = KEY_TYPE_DELETED; + u->type = KEY_TYPE_deleted; return bkey_s_c_null; } ret = bkey_disassemble(l->b, k, u); - if (debug_check_bkeys(iter->c)) - bch2_bkey_debugcheck(iter->c, l->b, ret); + if (debug_check_bkeys(iter->trans->c)) + bch2_bkey_debugcheck(iter->trans->c, l->b, ret); return ret; } @@ -511,9 +685,23 @@ static inline struct bkey_s_c __btree_iter_peek(struct btree_iter *iter, bch2_btree_node_iter_peek(&l->iter, l->b)); } -static inline void __btree_iter_advance(struct btree_iter_level *l) +static inline bool btree_iter_advance_to_pos(struct btree_iter *iter, + struct btree_iter_level *l, + int max_advance) { - bch2_btree_node_iter_advance(&l->iter, l->b); + struct bkey_packed *k; + int nr_advanced = 0; + + while ((k = bch2_btree_node_iter_peek_all(&l->iter, l->b)) && + btree_iter_pos_cmp(iter, l->b, k) < 0) { + if (max_advance > 0 && nr_advanced >= max_advance) + return false; + + bch2_btree_node_iter_advance(&l->iter, l->b); + nr_advanced++; + } + + return true; } /* @@ -546,7 +734,7 @@ static void btree_iter_verify_new_node(struct btree_iter *iter, struct btree *b) char buf[100]; struct bkey uk = bkey_unpack_key(b, k); - bch2_bkey_to_text(buf, sizeof(buf), &uk); + bch2_bkey_to_text(&PBUF(buf), &uk); panic("parent iter doesn't point to new node:\n%s\n%llu:%llu\n", buf, b->key.k.p.inode, b->key.k.p.offset); } @@ -555,21 +743,11 @@ static void btree_iter_verify_new_node(struct btree_iter *iter, struct btree *b) btree_node_unlock(iter, b->level + 1); } -/* Returns true if @k is after iterator position @pos */ -static inline bool btree_iter_pos_cmp(struct btree_iter *iter, - const struct bkey *k) -{ - int cmp = bkey_cmp(k->p, iter->pos); - - return cmp > 0 || - (cmp == 0 && - !(iter->flags & BTREE_ITER_IS_EXTENTS) && !bkey_deleted(k)); -} - static inline bool btree_iter_pos_after_node(struct btree_iter *iter, struct btree *b) { - return !btree_iter_pos_cmp(iter, &b->key.k); + return __btree_iter_pos_cmp(iter, NULL, + bkey_to_packed(&b->key), true) < 0; } static inline bool btree_iter_pos_in_node(struct btree_iter *iter, @@ -581,17 +759,20 @@ static inline bool btree_iter_pos_in_node(struct btree_iter *iter, } static inline void __btree_iter_init(struct btree_iter *iter, - struct btree *b) + unsigned level) { - struct btree_iter_level *l = &iter->l[b->level]; + struct btree_iter_level *l = &iter->l[level]; + + bch2_btree_node_iter_init(&l->iter, l->b, &iter->pos); - bch2_btree_node_iter_init(&l->iter, b, iter->pos, - iter->flags & BTREE_ITER_IS_EXTENTS, - btree_node_is_extents(b)); + if (iter->flags & BTREE_ITER_IS_EXTENTS) + btree_iter_advance_to_pos(iter, l, -1); /* Skip to first non whiteout: */ - if (b->level) - bch2_btree_node_iter_peek(&l->iter, b); + if (level) + bch2_btree_node_iter_peek(&l->iter, l->b); + + btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK); } static inline void btree_iter_node_set(struct btree_iter *iter, @@ -602,20 +783,21 @@ static inline void btree_iter_node_set(struct btree_iter *iter, EBUG_ON(!btree_iter_pos_in_node(iter, b)); EBUG_ON(b->lock.state.seq & 1); - iter->lock_seq[b->level] = b->lock.state.seq; + iter->l[b->level].lock_seq = b->lock.state.seq; iter->l[b->level].b = b; - __btree_iter_init(iter, b); + __btree_iter_init(iter, b->level); } 
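/*
 * Editor's note: an illustrative sketch, not part of this patch. The
 * lock_seq that btree_iter_node_set() saves above is what drives the
 * optimistic relock path in __bch2_btree_node_relock(): a six lock's
 * sequence number is incremented by taking and releasing write locks
 * (and is even when unlocked), so an unlocked iterator can revalidate
 * a cached node without blocking. Roughly (relock_sketch is a
 * hypothetical helper, not in the patch):
 *
 *	static bool relock_sketch(struct btree_iter *iter, unsigned level)
 *	{
 *		struct btree *b = iter->l[level].b;
 *
 *		// Succeeds only if nobody has write locked b since
 *		// lock_seq was saved, i.e. the node is unchanged:
 *		return six_relock_type(&b->lock,
 *				       __btree_lock_want(iter, level),
 *				       iter->l[level].lock_seq);
 *	}
 *
 * When this fails, the iterator is marked BTREE_ITER_NEED_TRAVERSE and
 * has to redo a full traversal from the root.
 */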
/* * A btree node is being replaced - update the iterator to point to the new * node: */ -bool bch2_btree_iter_node_replace(struct btree_iter *iter, struct btree *b) +void bch2_btree_iter_node_replace(struct btree_iter *iter, struct btree *b) { + enum btree_node_locked_type t; struct btree_iter *linked; - for_each_linked_btree_iter(iter, linked) + trans_for_each_iter(iter->trans, linked) if (btree_iter_pos_in_node(linked, b)) { /* * bch2_btree_iter_node_drop() has already been called - @@ -624,52 +806,28 @@ bool bch2_btree_iter_node_replace(struct btree_iter *iter, struct btree *b) */ BUG_ON(btree_node_locked(linked, b->level)); - /* - * If @linked wants this node read locked, we don't want - * to actually take the read lock now because it's not - * legal to hold read locks on other nodes while we take - * write locks, so the journal can make forward - * progress... - * - * Instead, btree_iter_node_set() sets things up so - * bch2_btree_node_relock() will succeed: - */ - - if (btree_want_intent(linked, b->level)) { - six_lock_increment(&b->lock, SIX_LOCK_intent); - mark_btree_node_intent_locked(linked, b->level); + t = btree_lock_want(linked, b->level); + if (t != BTREE_NODE_UNLOCKED) { + six_lock_increment(&b->lock, t); + mark_btree_node_locked(linked, b->level, t); } btree_iter_node_set(linked, b); } - if (!btree_iter_pos_in_node(iter, b)) { - six_unlock_intent(&b->lock); - return false; - } - - mark_btree_node_intent_locked(iter, b->level); - btree_iter_node_set(iter, b); - return true; -} - -void bch2_btree_iter_node_drop_linked(struct btree_iter *iter, struct btree *b) -{ - struct btree_iter *linked; - - for_each_linked_btree_iter(iter, linked) - bch2_btree_iter_node_drop(linked, b); + six_unlock_intent(&b->lock); } void bch2_btree_iter_node_drop(struct btree_iter *iter, struct btree *b) { + struct btree_iter *linked; unsigned level = b->level; - if (iter->l[level].b == b) { - btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE); - btree_node_unlock(iter, level); - iter->l[level].b = BTREE_ITER_NOT_END; - } + trans_for_each_iter(iter->trans, linked) + if (linked->l[level].b == b) { + __btree_node_unlock(linked, level); + linked->l[level].b = BTREE_ITER_NO_NODE_DROP; + } } /* @@ -680,15 +838,14 @@ void bch2_btree_iter_reinit_node(struct btree_iter *iter, struct btree *b) { struct btree_iter *linked; - for_each_linked_btree_node(iter, b, linked) - __btree_iter_init(linked, b); - __btree_iter_init(iter, b); + trans_for_each_iter_with_node(iter->trans, b, linked) + __btree_iter_init(linked, b->level); } static inline int btree_iter_lock_root(struct btree_iter *iter, unsigned depth_want) { - struct bch_fs *c = iter->c; + struct bch_fs *c = iter->trans->c; struct btree *b; enum six_lock_type lock_type; unsigned i; @@ -707,11 +864,12 @@ static inline int btree_iter_lock_root(struct btree_iter *iter, * that depth */ iter->level = depth_want; - iter->l[iter->level].b = NULL; - return 0; + for (i = iter->level; i < BTREE_MAX_DEPTH; i++) + iter->l[i].b = NULL; + return 1; } - lock_type = btree_lock_want(iter, iter->level); + lock_type = __btree_lock_want(iter, iter->level); if (unlikely(!btree_node_lock(b, POS_MAX, iter->level, iter, lock_type))) return -EINTR; @@ -720,13 +878,14 @@ static inline int btree_iter_lock_root(struct btree_iter *iter, b->level == iter->level && !race_fault())) { for (i = 0; i < iter->level; i++) - iter->l[i].b = BTREE_ITER_NOT_END; + iter->l[i].b = BTREE_ITER_NO_NODE_LOCK_ROOT; iter->l[iter->level].b = b; + for (i = iter->level + 1; i < BTREE_MAX_DEPTH; i++) + 
iter->l[i].b = NULL; mark_btree_node_locked(iter, iter->level, lock_type); btree_iter_node_set(iter, b); return 0; - } six_unlock_type(&b->lock, lock_type); @@ -736,11 +895,12 @@ static inline int btree_iter_lock_root(struct btree_iter *iter, noinline static void btree_iter_prefetch(struct btree_iter *iter) { + struct bch_fs *c = iter->trans->c; struct btree_iter_level *l = &iter->l[iter->level]; struct btree_node_iter node_iter = l->iter; struct bkey_packed *k; BKEY_PADDED(k) tmp; - unsigned nr = test_bit(BCH_FS_STARTED, &iter->c->flags) + unsigned nr = test_bit(BCH_FS_STARTED, &c->flags) ? (iter->level > 1 ? 0 : 2) : (iter->level > 1 ? 1 : 16); bool was_locked = btree_node_locked(iter, iter->level); @@ -755,9 +915,7 @@ static void btree_iter_prefetch(struct btree_iter *iter) break; bch2_bkey_unpack(l->b, &tmp.k, k); - bch2_btree_node_prefetch(iter->c, &tmp.k, - iter->level - 1, - iter->btree_id); + bch2_btree_node_prefetch(c, iter, &tmp.k, iter->level - 1); } if (!was_locked) @@ -766,10 +924,11 @@ static void btree_iter_prefetch(struct btree_iter *iter) static inline int btree_iter_down(struct btree_iter *iter) { + struct bch_fs *c = iter->trans->c; struct btree_iter_level *l = &iter->l[iter->level]; struct btree *b; unsigned level = iter->level - 1; - enum six_lock_type lock_type = btree_lock_want(iter, level); + enum six_lock_type lock_type = __btree_lock_want(iter, level); BKEY_PADDED(k) tmp; BUG_ON(!btree_node_locked(iter, iter->level)); @@ -777,7 +936,7 @@ static inline int btree_iter_down(struct btree_iter *iter) bch2_bkey_unpack(l->b, &tmp.k, bch2_btree_node_iter_peek(&l->iter, l->b)); - b = bch2_btree_node_get(iter->c, iter, &tmp.k, level, lock_type); + b = bch2_btree_node_get(c, iter, &tmp.k, level, lock_type); if (unlikely(IS_ERR(b))) return PTR_ERR(b); @@ -799,17 +958,27 @@ static void btree_iter_up(struct btree_iter *iter) int __must_check __bch2_btree_iter_traverse(struct btree_iter *); -static int btree_iter_traverse_error(struct btree_iter *iter, int ret) +static int __btree_iter_traverse_all(struct btree_trans *trans, + struct btree_iter *orig_iter, int ret) { - struct bch_fs *c = iter->c; - struct btree_iter *linked, *sorted_iters, **i; -retry_all: - bch2_btree_iter_unlock(iter); + struct bch_fs *c = trans->c; + struct btree_iter *iter; + u8 sorted[BTREE_ITER_MAX]; + unsigned i, nr_sorted = 0; + + trans_for_each_iter(trans, iter) + sorted[nr_sorted++] = iter - trans->iters; - if (ret != -ENOMEM && ret != -EINTR) - goto io_error; +#define btree_iter_cmp_by_idx(_l, _r) \ + btree_iter_cmp(&trans->iters[_l], &trans->iters[_r]) - if (ret == -ENOMEM) { + bubble_sort(sorted, nr_sorted, btree_iter_cmp_by_idx); +#undef btree_iter_cmp_by_idx + +retry_all: + bch2_trans_unlock(trans); + + if (unlikely(ret == -ENOMEM)) { struct closure cl; closure_init_stack(&cl); @@ -820,57 +989,54 @@ retry_all: } while (ret); } - /* - * Linked iters are normally a circular singly linked list - break cycle - * while we sort them: - */ - linked = iter->next; - iter->next = NULL; - sorted_iters = NULL; - - while (linked) { - iter = linked; - linked = linked->next; - - i = &sorted_iters; - while (*i && btree_iter_cmp(iter, *i) > 0) - i = &(*i)->next; - - iter->next = *i; - *i = iter; + if (unlikely(ret == -EIO)) { + trans->error = true; + orig_iter->flags |= BTREE_ITER_ERROR; + orig_iter->l[orig_iter->level].b = BTREE_ITER_NO_NODE_ERROR; + goto out; } - /* Make list circular again: */ - iter = sorted_iters; - while (iter->next) - iter = iter->next; - iter->next = sorted_iters; + BUG_ON(ret && ret != 
-EINTR); /* Now, redo traversals in correct order: */ + for (i = 0; i < nr_sorted; i++) { + iter = &trans->iters[sorted[i]]; - iter = sorted_iters; - do { -retry: - ret = __bch2_btree_iter_traverse(iter); - if (unlikely(ret)) { - if (ret == -EINTR) - goto retry; - goto retry_all; - } + do { + ret = __bch2_btree_iter_traverse(iter); + } while (ret == -EINTR); - iter = iter->next; - } while (iter != sorted_iters); + if (ret) + goto retry_all; + } - ret = btree_iter_linked(iter) ? -EINTR : 0; + ret = hweight64(trans->iters_live) > 1 ? -EINTR : 0; out: bch2_btree_cache_cannibalize_unlock(c); return ret; -io_error: - BUG_ON(ret != -EIO); +} + +int bch2_btree_iter_traverse_all(struct btree_trans *trans) +{ + return __btree_iter_traverse_all(trans, NULL, 0); +} - iter->flags |= BTREE_ITER_ERROR; - iter->l[iter->level].b = NULL; - goto out; +static unsigned btree_iter_up_until_locked(struct btree_iter *iter, + bool check_pos) +{ + unsigned l = iter->level; + + while (btree_iter_node(iter, l) && + (!is_btree_node(iter, l) || + !bch2_btree_node_relock(iter, l) || + (check_pos && + !btree_iter_pos_in_node(iter, iter->l[l].b)))) { + btree_node_unlock(iter, l); + iter->l[l].b = BTREE_ITER_NO_NODE_UP; + l++; + } + + return l; } /* @@ -880,48 +1046,23 @@ io_error: * Returns 0 on success, -EIO on error (error reading in a btree node). * * On error, caller (peek_node()/peek_key()) must return NULL; the error is - * stashed in the iterator and returned from bch2_btree_iter_unlock(). + * stashed in the iterator and returned from bch2_trans_exit(). */ int __must_check __bch2_btree_iter_traverse(struct btree_iter *iter) { unsigned depth_want = iter->level; - if (unlikely(!iter->l[iter->level].b)) + if (unlikely(iter->level >= BTREE_MAX_DEPTH)) return 0; - iter->flags &= ~BTREE_ITER_AT_END_OF_LEAF; - - /* make sure we have all the intent locks we need - ugh */ - if (unlikely(iter->l[iter->level].b && - iter->level + 1 < iter->locks_want)) { - unsigned i; - - for (i = iter->level + 1; - i < iter->locks_want && iter->l[i].b; - i++) - if (!bch2_btree_node_relock(iter, i)) { - while (iter->level < BTREE_MAX_DEPTH && - iter->l[iter->level].b && - iter->level + 1 < iter->locks_want) - btree_iter_up(iter); - break; - } - } + if (bch2_btree_iter_relock(iter, false)) + return 0; /* - * If the current node isn't locked, go up until we have a locked node - * or run out of nodes: + * XXX: correctly using BTREE_ITER_UPTODATE should make using check_pos + * here unnecessary */ - while (btree_iter_node(iter, iter->level) && - !(is_btree_node(iter, iter->level) && - bch2_btree_node_relock(iter, iter->level) && - - /* - * XXX: correctly using BTREE_ITER_UPTODATE should make - * comparing iter->pos against node's key unnecessary - */ - btree_iter_pos_in_node(iter, iter->l[iter->level].b))) - btree_iter_up(iter); + iter->level = btree_iter_up_until_locked(iter, true); /* * If we've got a btree node locked (i.e. 
we aren't about to relock the @@ -929,15 +1070,8 @@ int __must_check __bch2_btree_iter_traverse(struct btree_iter *iter) * * XXX correctly using BTREE_ITER_UPTODATE should make this unnecessary */ - if (btree_iter_node(iter, iter->level)) { - struct btree_iter_level *l = &iter->l[iter->level]; - struct bkey_s_c k; - struct bkey u; - - while ((k = __btree_iter_peek_all(iter, l, &u)).k && - !btree_iter_pos_cmp(iter, k.k)) - __btree_iter_advance(l); - } + if (btree_iter_node(iter, iter->level)) + btree_iter_advance_to_pos(iter, &iter->l[iter->level], -1); /* * Note: iter->nodes[iter->level] may be temporarily NULL here - that @@ -950,13 +1084,19 @@ int __must_check __bch2_btree_iter_traverse(struct btree_iter *iter) ? btree_iter_down(iter) : btree_iter_lock_root(iter, depth_want); if (unlikely(ret)) { + if (ret == 1) + return 0; + iter->level = depth_want; - iter->l[iter->level].b = BTREE_ITER_NOT_END; + iter->l[iter->level].b = BTREE_ITER_NO_NODE_DOWN; return ret; } } iter->uptodate = BTREE_ITER_NEED_PEEK; + + bch2_btree_trans_verify_locks(iter->trans); + __bch2_btree_iter_verify(iter, iter->l[iter->level].b); return 0; } @@ -964,16 +1104,25 @@ int __must_check bch2_btree_iter_traverse(struct btree_iter *iter) { int ret; - if (iter->uptodate < BTREE_ITER_NEED_RELOCK) - return 0; - - ret = __bch2_btree_iter_traverse(iter); + ret = bch2_trans_cond_resched(iter->trans) ?: + __bch2_btree_iter_traverse(iter); if (unlikely(ret)) - ret = btree_iter_traverse_error(iter, ret); + ret = __btree_iter_traverse_all(iter->trans, iter, ret); return ret; } +static inline void bch2_btree_iter_checks(struct btree_iter *iter, + enum btree_iter_type type) +{ + EBUG_ON(iter->btree_id >= BTREE_ID_NR); + EBUG_ON(!!(iter->flags & BTREE_ITER_IS_EXTENTS) != + (btree_node_type_is_extents(iter->btree_id) && + type != BTREE_ITER_NODES)); + + bch2_btree_trans_verify_locks(iter->trans); +} + /* Iterate across nodes (leaf and interior nodes) */ struct btree *bch2_btree_iter_peek_node(struct btree_iter *iter) @@ -981,18 +1130,23 @@ struct btree *bch2_btree_iter_peek_node(struct btree_iter *iter) struct btree *b; int ret; - EBUG_ON(iter->flags & BTREE_ITER_IS_EXTENTS); + bch2_btree_iter_checks(iter, BTREE_ITER_NODES); + + if (iter->uptodate == BTREE_ITER_UPTODATE) + return iter->l[iter->level].b; ret = bch2_btree_iter_traverse(iter); if (ret) - return ERR_PTR(ret); + return NULL; - b = iter->l[iter->level].b; + b = btree_iter_node(iter, iter->level); + if (!b) + return NULL; - if (b) { - EBUG_ON(bkey_cmp(b->key.k.p, iter->pos) < 0); - iter->pos = b->key.k.p; - } + BUG_ON(bkey_cmp(b->key.k.p, iter->pos) < 0); + + iter->pos = b->key.k.p; + iter->uptodate = BTREE_ITER_UPTODATE; return b; } @@ -1002,25 +1156,42 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter, unsigned depth) struct btree *b; int ret; - EBUG_ON(iter->flags & BTREE_ITER_IS_EXTENTS); - - btree_iter_up(iter); + bch2_btree_iter_checks(iter, BTREE_ITER_NODES); + /* already got to end? */ if (!btree_iter_node(iter, iter->level)) return NULL; - /* parent node usually won't be locked: redo traversal if necessary */ - btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE); + bch2_trans_cond_resched(iter->trans); + + btree_iter_up(iter); + + if (!bch2_btree_node_relock(iter, iter->level)) + btree_iter_set_dirty(iter, BTREE_ITER_NEED_RELOCK); + ret = bch2_btree_iter_traverse(iter); if (ret) return NULL; - b = iter->l[iter->level].b; + /* got to end? 
*/ + b = btree_iter_node(iter, iter->level); if (!b) - return b; + return NULL; if (bkey_cmp(iter->pos, b->key.k.p) < 0) { - /* Haven't gotten to the end of the parent node: */ + /* + * Haven't gotten to the end of the parent node: go back down to + * the next child node + */ + + /* + * We don't really want to be unlocking here except we can't + * directly tell btree_iter_traverse() "traverse to this level" + * except by setting iter->level, so we have to unlock so we + * don't screw up our lock invariants: + */ + if (btree_node_read_locked(iter, iter->level)) + btree_node_unlock(iter, iter->level); /* ick: */ iter->pos = iter->btree_id == BTREE_ID_INODES @@ -1037,6 +1208,7 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter, unsigned depth) } iter->pos = b->key.k.p; + iter->uptodate = BTREE_ITER_UPTODATE; return b; } @@ -1046,7 +1218,6 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter, unsigned depth) void bch2_btree_iter_set_pos_same_leaf(struct btree_iter *iter, struct bpos new_pos) { struct btree_iter_level *l = &iter->l[0]; - struct bkey_packed *k; EBUG_ON(iter->level != 0); EBUG_ON(bkey_cmp(new_pos, iter->pos) < 0); @@ -1056,71 +1227,94 @@ void bch2_btree_iter_set_pos_same_leaf(struct btree_iter *iter, struct bpos new_ iter->pos = new_pos; btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK); - while ((k = bch2_btree_node_iter_peek_all(&l->iter, l->b)) && - !btree_iter_pos_cmp_packed(l->b, &iter->pos, k, - iter->flags & BTREE_ITER_IS_EXTENTS)) - __btree_iter_advance(l); + btree_iter_advance_to_pos(iter, l, -1); - if (!k && btree_iter_pos_after_node(iter, l->b)) { + if (bch2_btree_node_iter_end(&l->iter) && + btree_iter_pos_after_node(iter, l->b)) btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE); - iter->flags |= BTREE_ITER_AT_END_OF_LEAF; - } } void bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos) { - EBUG_ON(bkey_cmp(new_pos, iter->pos) < 0); /* XXX handle this */ + int cmp = bkey_cmp(new_pos, iter->pos); + unsigned level; + + if (!cmp) + return; + iter->pos = new_pos; - btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE); + level = btree_iter_up_until_locked(iter, true); + + if (btree_iter_node(iter, level)) { + /* + * We might have to skip over many keys, or just a few: try + * advancing the node iterator, and if we have to skip over too + * many keys just reinit it (or if we're rewinding, since that + * is expensive). 
+ */ + if (cmp < 0 || + !btree_iter_advance_to_pos(iter, &iter->l[level], 8)) + __btree_iter_init(iter, level); + + /* Don't leave it locked if we're not supposed to: */ + if (btree_lock_want(iter, level) == BTREE_NODE_UNLOCKED) + btree_node_unlock(iter, level); + } + + if (level != iter->level) + btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE); + else + btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK); } -struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) +static inline struct bkey_s_c btree_iter_peek_uptodate(struct btree_iter *iter) { struct btree_iter_level *l = &iter->l[0]; - struct bkey_s_c k; - int ret; + struct bkey_s_c ret = { .k = &iter->k }; - EBUG_ON(!!(iter->flags & BTREE_ITER_IS_EXTENTS) != - (iter->btree_id == BTREE_ID_EXTENTS)); - EBUG_ON(iter->flags & BTREE_ITER_SLOTS); + if (!bkey_deleted(&iter->k)) { + EBUG_ON(bch2_btree_node_iter_end(&l->iter)); + ret.v = bkeyp_val(&l->b->format, + __bch2_btree_node_iter_peek_all(&l->iter, l->b)); + } - if (iter->uptodate == BTREE_ITER_UPTODATE) { - struct bkey_packed *k = - __bch2_btree_node_iter_peek_all(&l->iter, l->b); - struct bkey_s_c ret = { - .k = &iter->k, - .v = bkeyp_val(&l->b->format, k) - }; + if (debug_check_bkeys(iter->trans->c) && + !bkey_deleted(ret.k)) + bch2_bkey_debugcheck(iter->trans->c, l->b, ret); + return ret; +} - EBUG_ON(!btree_node_locked(iter, 0)); +struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) +{ + struct btree_iter_level *l = &iter->l[0]; + struct bkey_s_c k; + int ret; - if (debug_check_bkeys(iter->c)) - bch2_bkey_debugcheck(iter->c, l->b, ret); - return ret; - } + bch2_btree_iter_checks(iter, BTREE_ITER_KEYS); - if (iter->uptodate == BTREE_ITER_END) - return bkey_s_c_null; + if (iter->uptodate == BTREE_ITER_UPTODATE) + return btree_iter_peek_uptodate(iter); while (1) { - ret = bch2_btree_iter_traverse(iter); - if (unlikely(ret)) - return bkey_s_c_err(ret); + if (iter->uptodate >= BTREE_ITER_NEED_RELOCK) { + ret = bch2_btree_iter_traverse(iter); + if (unlikely(ret)) + return bkey_s_c_err(ret); + } k = __btree_iter_peek(iter, l); if (likely(k.k)) break; /* got to the end of the leaf, iterator needs to be traversed: */ - iter->pos = l->b->key.k.p; - if (!bkey_cmp(iter->pos, POS_MAX)) { - iter->uptodate = BTREE_ITER_END; + iter->pos = l->b->key.k.p; + iter->uptodate = BTREE_ITER_NEED_TRAVERSE; + + if (!bkey_cmp(iter->pos, POS_MAX)) return bkey_s_c_null; - } iter->pos = btree_type_successor(iter->btree_id, iter->pos); - iter->uptodate = BTREE_ITER_NEED_TRAVERSE; } /* @@ -1140,14 +1334,13 @@ struct bkey_s_c bch2_btree_iter_peek_next_leaf(struct btree_iter *iter) { struct btree_iter_level *l = &iter->l[0]; - iter->pos = l->b->key.k.p; - if (!bkey_cmp(iter->pos, POS_MAX)) { - iter->uptodate = BTREE_ITER_END; + iter->pos = l->b->key.k.p; + iter->uptodate = BTREE_ITER_NEED_TRAVERSE; + + if (!bkey_cmp(iter->pos, POS_MAX)) return bkey_s_c_null; - } iter->pos = btree_type_successor(iter->btree_id, iter->pos); - iter->uptodate = BTREE_ITER_NEED_TRAVERSE; return bch2_btree_iter_peek(iter); } @@ -1158,22 +1351,27 @@ struct bkey_s_c bch2_btree_iter_next(struct btree_iter *iter) struct bkey_packed *p; struct bkey_s_c k; - EBUG_ON(!!(iter->flags & BTREE_ITER_IS_EXTENTS) != - (iter->btree_id == BTREE_ID_EXTENTS)); - EBUG_ON(iter->flags & BTREE_ITER_SLOTS); + bch2_btree_iter_checks(iter, BTREE_ITER_KEYS); + + iter->pos = btree_type_successor(iter->btree_id, iter->k.p); if (unlikely(iter->uptodate != BTREE_ITER_UPTODATE)) { - k = bch2_btree_iter_peek(iter); - if (IS_ERR_OR_NULL(k.k)) - return k; 
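/*
 * Editor's note: an illustrative sketch, not part of this patch. The
 * peek/next functions being reworked here are meant to be driven
 * through the btree_trans interface this commit introduces; a minimal
 * walk of one btree, assuming a valid struct bch_fs *c and with error
 * handling elided, might look like:
 *
 *	struct btree_trans trans;
 *	struct btree_iter *iter;
 *	struct bkey_s_c k;
 *
 *	bch2_trans_init(&trans, c, 0, 0);
 *	iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, POS_MIN, 0);
 *
 *	for (k = bch2_btree_iter_peek(iter);
 *	     k.k && !bkey_err(k);
 *	     k = bch2_btree_iter_next(iter)) {
 *		// process k
 *	}
 *
 *	bch2_trans_exit(&trans);
 */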
+ /* + * XXX: when we just need to relock we should be able to avoid + * calling traverse, but we need to kill BTREE_ITER_NEED_PEEK + * for that to work + */ + btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE); + + return bch2_btree_iter_peek(iter); } do { - __btree_iter_advance(l); + bch2_btree_node_iter_advance(&l->iter, l->b); p = bch2_btree_node_iter_peek_all(&l->iter, l->b); if (unlikely(!p)) return bch2_btree_iter_peek_next_leaf(iter); - } while (bkey_deleted(p)); + } while (bkey_whiteout(p)); k = __btree_iter_unpack(iter, l, &iter->k, p); @@ -1182,10 +1380,56 @@ struct bkey_s_c bch2_btree_iter_next(struct btree_iter *iter) return k; } +struct bkey_s_c bch2_btree_iter_prev(struct btree_iter *iter) +{ + struct btree_iter_level *l = &iter->l[0]; + struct bkey_packed *p; + struct bkey_s_c k; + int ret; + + bch2_btree_iter_checks(iter, BTREE_ITER_KEYS); + + if (unlikely(iter->uptodate != BTREE_ITER_UPTODATE)) { + k = bch2_btree_iter_peek(iter); + if (IS_ERR(k.k)) + return k; + } + + while (1) { + p = bch2_btree_node_iter_prev(&l->iter, l->b); + if (likely(p)) + break; + + iter->pos = l->b->data->min_key; + if (!bkey_cmp(iter->pos, POS_MIN)) + return bkey_s_c_null; + + bch2_btree_iter_set_pos(iter, + btree_type_predecessor(iter->btree_id, iter->pos)); + + ret = bch2_btree_iter_traverse(iter); + if (unlikely(ret)) + return bkey_s_c_err(ret); + + p = bch2_btree_node_iter_peek(&l->iter, l->b); + if (p) + break; + } + + k = __btree_iter_unpack(iter, l, &iter->k, p); + + EBUG_ON(bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0); + + iter->pos = bkey_start_pos(k.k); + iter->uptodate = BTREE_ITER_UPTODATE; + return k; +} + static inline struct bkey_s_c -__bch2_btree_iter_peek_slot(struct btree_iter *iter) +__bch2_btree_iter_peek_slot_extents(struct btree_iter *iter) { struct btree_iter_level *l = &iter->l[0]; + struct btree_node_iter node_iter; struct bkey_s_c k; struct bkey n; int ret; @@ -1194,14 +1438,18 @@ recheck: while ((k = __btree_iter_peek_all(iter, l, &iter->k)).k && bkey_deleted(k.k) && bkey_cmp(bkey_start_pos(k.k), iter->pos) == 0) - __btree_iter_advance(l); + bch2_btree_node_iter_advance(&l->iter, l->b); - if (k.k && bkey_cmp(bkey_start_pos(k.k), iter->pos) <= 0) { - EBUG_ON(bkey_cmp(k.k->p, iter->pos) < 0); - EBUG_ON(bkey_deleted(k.k)); - iter->uptodate = BTREE_ITER_UPTODATE; - return k; - } + /* + * iterator is now at the correct position for inserting at iter->pos, + * but we need to keep iterating until we find the first non whiteout so + * we know how big a hole we have, if any: + */ + + node_iter = l->iter; + if (k.k && bkey_whiteout(k.k)) + k = __btree_iter_unpack(iter, l, &iter->k, + bch2_btree_node_iter_peek(&node_iter, l->b)); /* * If we got to the end of the node, check if we need to traverse to the @@ -1216,74 +1464,118 @@ recheck: goto recheck; } + if (k.k && + !bkey_whiteout(k.k) && + bkey_cmp(bkey_start_pos(k.k), iter->pos) <= 0) { + /* + * if we skipped forward to find the first non whiteout and + * there _wasn't_ actually a hole, we want the iterator to be + * pointed at the key we found: + */ + l->iter = node_iter; + + EBUG_ON(bkey_cmp(k.k->p, iter->pos) < 0); + EBUG_ON(bkey_deleted(k.k)); + iter->uptodate = BTREE_ITER_UPTODATE; + return k; + } + /* hole */ - bkey_init(&n); - n.p = iter->pos; - if (iter->flags & BTREE_ITER_IS_EXTENTS) { - if (n.p.offset == KEY_OFFSET_MAX) { - if (n.p.inode == KEY_INODE_MAX) { - iter->uptodate = BTREE_ITER_END; - return bkey_s_c_null; - } + /* holes can't span inode numbers: */ + if (iter->pos.offset == KEY_OFFSET_MAX) { + if 
(iter->pos.inode == KEY_INODE_MAX) + return bkey_s_c_null; - iter->pos = bkey_successor(iter->pos); - goto recheck; - } + iter->pos = bkey_successor(iter->pos); + goto recheck; + } - if (!k.k) - k.k = &l->b->key.k; + if (!k.k) + k.k = &l->b->key.k; - bch2_key_resize(&n, - min_t(u64, KEY_SIZE_MAX, - (k.k->p.inode == n.p.inode - ? bkey_start_offset(k.k) - : KEY_OFFSET_MAX) - - n.p.offset)); + bkey_init(&n); + n.p = iter->pos; + bch2_key_resize(&n, + min_t(u64, KEY_SIZE_MAX, + (k.k->p.inode == n.p.inode + ? bkey_start_offset(k.k) + : KEY_OFFSET_MAX) - + n.p.offset)); - EBUG_ON(!n.size); - } + EBUG_ON(!n.size); - iter->k = n; + iter->k = n; iter->uptodate = BTREE_ITER_UPTODATE; return (struct bkey_s_c) { &iter->k, NULL }; } -struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) +static inline struct bkey_s_c +__bch2_btree_iter_peek_slot(struct btree_iter *iter) { struct btree_iter_level *l = &iter->l[0]; + struct bkey_s_c k; int ret; - EBUG_ON(!!(iter->flags & BTREE_ITER_IS_EXTENTS) != - (iter->btree_id == BTREE_ID_EXTENTS)); - EBUG_ON(!(iter->flags & BTREE_ITER_SLOTS)); + if (iter->flags & BTREE_ITER_IS_EXTENTS) + return __bch2_btree_iter_peek_slot_extents(iter); - if (iter->uptodate == BTREE_ITER_UPTODATE) { - struct bkey_s_c ret = { .k = &iter->k };; +recheck: + while ((k = __btree_iter_peek_all(iter, l, &iter->k)).k && + bkey_deleted(k.k) && + bkey_cmp(k.k->p, iter->pos) == 0) + bch2_btree_node_iter_advance(&l->iter, l->b); - if (!bkey_deleted(&iter->k)) - ret.v = bkeyp_val(&l->b->format, - __bch2_btree_node_iter_peek_all(&l->iter, l->b)); + /* + * If we got to the end of the node, check if we need to traverse to the + * next node: + */ + if (unlikely(!k.k && btree_iter_pos_after_node(iter, l->b))) { + btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE); + ret = bch2_btree_iter_traverse(iter); + if (unlikely(ret)) + return bkey_s_c_err(ret); - EBUG_ON(!btree_node_locked(iter, 0)); + goto recheck; + } - if (debug_check_bkeys(iter->c)) - bch2_bkey_debugcheck(iter->c, l->b, ret); - return ret; + if (k.k && + !bkey_deleted(k.k) && + !bkey_cmp(iter->pos, k.k->p)) { + iter->uptodate = BTREE_ITER_UPTODATE; + return k; + } else { + /* hole */ + bkey_init(&iter->k); + iter->k.p = iter->pos; + + iter->uptodate = BTREE_ITER_UPTODATE; + return (struct bkey_s_c) { &iter->k, NULL }; } +} - if (iter->uptodate == BTREE_ITER_END) - return bkey_s_c_null; +struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) +{ + int ret; - ret = bch2_btree_iter_traverse(iter); - if (unlikely(ret)) - return bkey_s_c_err(ret); + bch2_btree_iter_checks(iter, BTREE_ITER_SLOTS); + + if (iter->uptodate == BTREE_ITER_UPTODATE) + return btree_iter_peek_uptodate(iter); + + if (iter->uptodate >= BTREE_ITER_NEED_RELOCK) { + ret = bch2_btree_iter_traverse(iter); + if (unlikely(ret)) + return bkey_s_c_err(ret); + } return __bch2_btree_iter_peek_slot(iter); } struct bkey_s_c bch2_btree_iter_next_slot(struct btree_iter *iter) { + bch2_btree_iter_checks(iter, BTREE_ITER_SLOTS); + iter->pos = btree_type_successor(iter->btree_id, iter->k.p); if (unlikely(iter->uptodate != BTREE_ITER_UPTODATE)) { @@ -1298,85 +1590,386 @@ struct bkey_s_c bch2_btree_iter_next_slot(struct btree_iter *iter) } if (!bkey_deleted(&iter->k)) - __btree_iter_advance(&iter->l[0]); + bch2_btree_node_iter_advance(&iter->l[0].iter, iter->l[0].b); + + btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK); return __bch2_btree_iter_peek_slot(iter); } -void __bch2_btree_iter_init(struct btree_iter *iter, struct bch_fs *c, - enum btree_id btree_id, struct 
bpos pos, - unsigned locks_want, unsigned depth, - unsigned flags) +static inline void bch2_btree_iter_init(struct btree_trans *trans, + struct btree_iter *iter, enum btree_id btree_id, + struct bpos pos, unsigned flags) { + struct bch_fs *c = trans->c; unsigned i; - EBUG_ON(depth >= BTREE_MAX_DEPTH); - EBUG_ON(locks_want > BTREE_MAX_DEPTH); + if (btree_node_type_is_extents(btree_id) && + !(flags & BTREE_ITER_NODES)) + flags |= BTREE_ITER_IS_EXTENTS; - iter->c = c; + iter->trans = trans; iter->pos = pos; bkey_init(&iter->k); iter->k.p = pos; iter->flags = flags; iter->uptodate = BTREE_ITER_NEED_TRAVERSE; iter->btree_id = btree_id; - iter->level = depth; - iter->locks_want = locks_want; + iter->level = 0; + iter->locks_want = flags & BTREE_ITER_INTENT ? 1 : 0; iter->nodes_locked = 0; iter->nodes_intent_locked = 0; for (i = 0; i < ARRAY_SIZE(iter->l); i++) iter->l[i].b = NULL; - iter->l[iter->level].b = BTREE_ITER_NOT_END; - iter->next = iter; - - if (unlikely((flags & BTREE_ITER_IS_EXTENTS) && - !bkey_cmp(pos, POS_MAX))) - iter->uptodate = BTREE_ITER_END; + iter->l[iter->level].b = BTREE_ITER_NO_NODE_INIT; prefetch(c->btree_roots[btree_id].b); } -void bch2_btree_iter_unlink(struct btree_iter *iter) +/* new transactional stuff: */ + +int bch2_trans_iter_put(struct btree_trans *trans, + struct btree_iter *iter) { - struct btree_iter *linked; + int ret = btree_iter_err(iter); - __bch2_btree_iter_unlock(iter); + trans->iters_live &= ~(1ULL << iter->idx); + return ret; +} - if (!btree_iter_linked(iter)) - return; +static inline void __bch2_trans_iter_free(struct btree_trans *trans, + unsigned idx) +{ + __bch2_btree_iter_unlock(&trans->iters[idx]); + trans->iters_linked &= ~(1ULL << idx); + trans->iters_live &= ~(1ULL << idx); + trans->iters_touched &= ~(1ULL << idx); + trans->iters_unlink_on_restart &= ~(1ULL << idx); + trans->iters_unlink_on_commit &= ~(1ULL << idx); +} - for_each_linked_btree_iter(iter, linked) { +int bch2_trans_iter_free(struct btree_trans *trans, + struct btree_iter *iter) +{ + int ret = btree_iter_err(iter); - if (linked->next == iter) { - linked->next = iter->next; - return; + __bch2_trans_iter_free(trans, iter->idx); + return ret; +} + +int bch2_trans_iter_free_on_commit(struct btree_trans *trans, + struct btree_iter *iter) +{ + int ret = btree_iter_err(iter); + + trans->iters_unlink_on_commit |= 1ULL << iter->idx; + return ret; +} + +static int bch2_trans_realloc_iters(struct btree_trans *trans, + unsigned new_size) +{ + void *new_iters, *new_updates; + + new_size = roundup_pow_of_two(new_size); + + BUG_ON(new_size > BTREE_ITER_MAX); + + if (new_size <= trans->size) + return 0; + + BUG_ON(trans->used_mempool); + + bch2_trans_unlock(trans); + + new_iters = kmalloc(sizeof(struct btree_iter) * new_size + + sizeof(struct btree_insert_entry) * (new_size + 4), + GFP_NOFS); + if (new_iters) + goto success; + + new_iters = mempool_alloc(&trans->c->btree_iters_pool, GFP_NOFS); + new_size = BTREE_ITER_MAX; + + trans->used_mempool = true; +success: + new_updates = new_iters + sizeof(struct btree_iter) * new_size; + + memcpy(new_iters, trans->iters, + sizeof(struct btree_iter) * trans->nr_iters); + memcpy(new_updates, trans->updates, + sizeof(struct btree_insert_entry) * trans->nr_updates); + + if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) + memset(trans->iters, POISON_FREE, + sizeof(struct btree_iter) * trans->nr_iters + + sizeof(struct btree_insert_entry) * trans->nr_iters); + + if (trans->iters != trans->iters_onstack) + kfree(trans->iters); + + trans->iters = new_iters; + 
trans->updates = new_updates; + trans->size = new_size; + + if (trans->iters_live) { + trace_trans_restart_iters_realloced(trans->ip, trans->size); + return -EINTR; + } + + return 0; +} + +static int btree_trans_iter_alloc(struct btree_trans *trans) +{ + unsigned idx = __ffs64(~trans->iters_linked); + + if (idx < trans->nr_iters) + goto got_slot; + + if (trans->nr_iters == trans->size) { + int ret = bch2_trans_realloc_iters(trans, trans->size * 2); + if (ret) + return ret; + } + + idx = trans->nr_iters++; + BUG_ON(trans->nr_iters > trans->size); + + trans->iters[idx].idx = idx; +got_slot: + BUG_ON(trans->iters_linked & (1ULL << idx)); + trans->iters_linked |= 1ULL << idx; + return idx; +} + +static struct btree_iter *__btree_trans_get_iter(struct btree_trans *trans, + unsigned btree_id, struct bpos pos, + unsigned flags, u64 iter_id) +{ + struct btree_iter *iter; + int idx; + + BUG_ON(trans->nr_iters > BTREE_ITER_MAX); + + for (idx = 0; idx < trans->nr_iters; idx++) { + if (!(trans->iters_linked & (1ULL << idx))) + continue; + + iter = &trans->iters[idx]; + if (iter_id + ? iter->id == iter_id + : (iter->btree_id == btree_id && + !bkey_cmp(iter->pos, pos))) + goto found; + } + idx = -1; +found: + if (idx < 0) { + idx = btree_trans_iter_alloc(trans); + if (idx < 0) + return ERR_PTR(idx); + + iter = &trans->iters[idx]; + iter->id = iter_id; + + bch2_btree_iter_init(trans, iter, btree_id, pos, flags); + } else { + iter = &trans->iters[idx]; + + iter->flags &= ~(BTREE_ITER_INTENT|BTREE_ITER_PREFETCH); + iter->flags |= flags & (BTREE_ITER_INTENT|BTREE_ITER_PREFETCH); + } + + BUG_ON(iter->btree_id != btree_id); + BUG_ON(trans->iters_live & (1ULL << idx)); + trans->iters_live |= 1ULL << idx; + trans->iters_touched |= 1ULL << idx; + + BUG_ON(iter->btree_id != btree_id); + BUG_ON((iter->flags ^ flags) & BTREE_ITER_TYPE); + + return iter; +} + +struct btree_iter *__bch2_trans_get_iter(struct btree_trans *trans, + enum btree_id btree_id, + struct bpos pos, unsigned flags, + u64 iter_id) +{ + struct btree_iter *iter = + __btree_trans_get_iter(trans, btree_id, pos, flags, iter_id); + + if (!IS_ERR(iter)) + bch2_btree_iter_set_pos(iter, pos); + return iter; +} + +struct btree_iter *bch2_trans_get_node_iter(struct btree_trans *trans, + enum btree_id btree_id, + struct bpos pos, + unsigned locks_want, + unsigned depth, + unsigned flags) +{ + struct btree_iter *iter = + __btree_trans_get_iter(trans, btree_id, pos, + flags|BTREE_ITER_NODES, 0); + unsigned i; + + BUG_ON(IS_ERR(iter)); + BUG_ON(bkey_cmp(iter->pos, pos)); + + iter->locks_want = locks_want; + iter->level = depth; + + for (i = 0; i < ARRAY_SIZE(iter->l); i++) + iter->l[i].b = NULL; + iter->l[iter->level].b = BTREE_ITER_NO_NODE_INIT; + + return iter; +} + +struct btree_iter *bch2_trans_copy_iter(struct btree_trans *trans, + struct btree_iter *src) +{ + struct btree_iter *iter; + int i, idx; + + idx = btree_trans_iter_alloc(trans); + if (idx < 0) + return ERR_PTR(idx); + + trans->iters_live |= 1ULL << idx; + trans->iters_touched |= 1ULL << idx; + trans->iters_unlink_on_restart |= 1ULL << idx; + + iter = &trans->iters[idx]; + + memcpy(&iter->trans, + &src->trans, + (void *) &iter[1] - (void *) &iter->trans); + + for (i = 0; i < BTREE_MAX_DEPTH; i++) + if (btree_node_locked(iter, i)) + six_lock_increment(&iter->l[i].b->lock, + __btree_lock_want(iter, i)); + + return &trans->iters[idx]; +} + +static int bch2_trans_preload_mem(struct btree_trans *trans, size_t size) +{ + if (size > trans->mem_bytes) { + size_t old_bytes = trans->mem_bytes; + size_t 
new_bytes = roundup_pow_of_two(size); + void *new_mem = krealloc(trans->mem, new_bytes, GFP_NOFS); + + if (!new_mem) + return -ENOMEM; + + trans->mem = new_mem; + trans->mem_bytes = new_bytes; + + if (old_bytes) { + trace_trans_restart_mem_realloced(trans->ip, new_bytes); + return -EINTR; } } - BUG(); + return 0; } -void bch2_btree_iter_link(struct btree_iter *iter, struct btree_iter *new) +void *bch2_trans_kmalloc(struct btree_trans *trans, size_t size) { - BUG_ON(btree_iter_linked(new)); + void *p; + int ret; - new->next = iter->next; - iter->next = new; + ret = bch2_trans_preload_mem(trans, trans->mem_top + size); + if (ret) + return ERR_PTR(ret); + + p = trans->mem + trans->mem_top; + trans->mem_top += size; + return p; +} - if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) { - unsigned nr_iters = 1; +inline void bch2_trans_unlink_iters(struct btree_trans *trans, u64 iters) +{ + iters &= trans->iters_linked; + iters &= ~trans->iters_live; - for_each_linked_btree_iter(iter, new) - nr_iters++; + while (iters) { + unsigned idx = __ffs64(iters); - BUG_ON(nr_iters > SIX_LOCK_MAX_RECURSE); + iters &= ~(1ULL << idx); + __bch2_trans_iter_free(trans, idx); } } -void bch2_btree_iter_copy(struct btree_iter *dst, struct btree_iter *src) +void bch2_trans_begin(struct btree_trans *trans) +{ + u64 iters_to_unlink; + + /* + * On transaction restart, the transaction isn't required to allocate + * all the same iterators it on the last iteration: + * + * Unlink any iterators it didn't use this iteration, assuming it got + * further (allocated an iter with a higher idx) than where the iter + * was originally allocated: + */ + iters_to_unlink = ~trans->iters_live & + ((1ULL << fls64(trans->iters_live)) - 1); + + iters_to_unlink |= trans->iters_unlink_on_restart; + iters_to_unlink |= trans->iters_unlink_on_commit; + + trans->iters_live = 0; + + bch2_trans_unlink_iters(trans, iters_to_unlink); + + trans->iters_touched = 0; + trans->iters_unlink_on_restart = 0; + trans->iters_unlink_on_commit = 0; + trans->nr_updates = 0; + trans->mem_top = 0; + + bch2_btree_iter_traverse_all(trans); +} + +void bch2_trans_init(struct btree_trans *trans, struct bch_fs *c, + unsigned expected_nr_iters, + size_t expected_mem_bytes) +{ + memset(trans, 0, offsetof(struct btree_trans, iters_onstack)); + + trans->c = c; + trans->ip = _RET_IP_; + trans->size = ARRAY_SIZE(trans->iters_onstack); + trans->iters = trans->iters_onstack; + trans->updates = trans->updates_onstack; + trans->fs_usage_deltas = NULL; + + if (expected_nr_iters > trans->size) + bch2_trans_realloc_iters(trans, expected_nr_iters); + + if (expected_mem_bytes) + bch2_trans_preload_mem(trans, expected_mem_bytes); +} + +int bch2_trans_exit(struct btree_trans *trans) { - __bch2_btree_iter_unlock(dst); - memcpy(dst, src, offsetof(struct btree_iter, next)); - dst->nodes_locked = dst->nodes_intent_locked = 0; - dst->uptodate = BTREE_ITER_NEED_RELOCK; + bch2_trans_unlock(trans); + + kfree(trans->fs_usage_deltas); + kfree(trans->mem); + if (trans->used_mempool) + mempool_free(trans->iters, &trans->c->btree_iters_pool); + else if (trans->iters != trans->iters_onstack) + kfree(trans->iters); + trans->mem = (void *) 0x1; + trans->iters = (void *) 0x1; + + return trans->error ? 
-EIO : 0; } diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index 0097a2a20a18..9483ec8913e3 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -1,8 +1,8 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _BCACHEFS_BTREE_ITER_H #define _BCACHEFS_BTREE_ITER_H -#include <linux/dynamic_fault.h> - +#include "bset.h" #include "btree_types.h" static inline void btree_iter_set_dirty(struct btree_iter *iter, @@ -17,101 +17,132 @@ static inline struct btree *btree_iter_node(struct btree_iter *iter, return level < BTREE_MAX_DEPTH ? iter->l[level].b : NULL; } +static inline bool btree_node_lock_seq_matches(const struct btree_iter *iter, + const struct btree *b, unsigned level) +{ + /* + * We don't compare the low bits of the lock sequence numbers because + * @iter might have taken a write lock on @b, and we don't want to skip + * the linked iterator if the sequence numbers were equal before taking + * that write lock. The lock sequence number is incremented by taking + * and releasing write locks and is even when unlocked: + */ + return iter->l[level].lock_seq >> 1 == b->lock.state.seq >> 1; +} + static inline struct btree *btree_node_parent(struct btree_iter *iter, struct btree *b) { return btree_iter_node(iter, b->level + 1); } -static inline bool btree_iter_linked(const struct btree_iter *iter) +static inline bool btree_trans_has_multiple_iters(const struct btree_trans *trans) { - return iter->next != iter; + return hweight64(trans->iters_linked) > 1; } -/** - * for_each_linked_btree_iter - iterate over all iterators linked with @_iter - */ -#define for_each_linked_btree_iter(_iter, _linked) \ - for ((_linked) = (_iter)->next; \ - (_linked) != (_iter); \ - (_linked) = (_linked)->next) +static inline int btree_iter_err(const struct btree_iter *iter) +{ + return iter->flags & BTREE_ITER_ERROR ? -EIO : 0; +} + +/* Iterate over iters within a transaction: */ static inline struct btree_iter * -__next_linked_btree_node(struct btree_iter *iter, struct btree *b, - struct btree_iter *linked) -{ - do { - linked = linked->next; - - if (linked == iter) - return NULL; - - /* - * We don't compare the low bits of the lock sequence numbers - * because @iter might have taken a write lock on @b, and we - * don't want to skip the linked iterator if the sequence - * numbers were equal before taking that write lock. The lock - * sequence number is incremented by taking and releasing write - * locks and is even when unlocked: - */ - } while (linked->l[b->level].b != b || - linked->lock_seq[b->level] >> 1 != b->lock.state.seq >> 1); - - return linked; -} - -/** - * for_each_linked_btree_node - iterate over all iterators linked with @_iter - * that also point to @_b - * - * @_b is assumed to be locked by @_iter - * - * Filters out iterators that don't have a valid btree_node iterator for @_b - - * i.e. iterators for which bch2_btree_node_relock() would not succeed. 
- */ -#define for_each_linked_btree_node(_iter, _b, _linked) \ - for ((_linked) = (_iter); \ - ((_linked) = __next_linked_btree_node(_iter, _b, _linked));) +__trans_next_iter(struct btree_trans *trans, unsigned idx) +{ + EBUG_ON(idx < trans->nr_iters && trans->iters[idx].idx != idx); + + for (; idx < trans->nr_iters; idx++) + if (trans->iters_linked & (1ULL << idx)) + return &trans->iters[idx]; + + return NULL; +} + +#define trans_for_each_iter(_trans, _iter) \ + for (_iter = __trans_next_iter((_trans), 0); \ + (_iter); \ + _iter = __trans_next_iter((_trans), (_iter)->idx + 1)) + +static inline bool __iter_has_node(const struct btree_iter *iter, + const struct btree *b) +{ + return iter->l[b->level].b == b && + btree_node_lock_seq_matches(iter, b, b->level); +} + +static inline struct btree_iter * +__trans_next_iter_with_node(struct btree_trans *trans, struct btree *b, + unsigned idx) +{ + struct btree_iter *iter = __trans_next_iter(trans, idx); + + while (iter && !__iter_has_node(iter, b)) + iter = __trans_next_iter(trans, iter->idx + 1); + + return iter; +} + +#define trans_for_each_iter_with_node(_trans, _b, _iter) \ + for (_iter = __trans_next_iter_with_node((_trans), (_b), 0); \ + (_iter); \ + _iter = __trans_next_iter_with_node((_trans), (_b), \ + (_iter)->idx + 1)) #ifdef CONFIG_BCACHEFS_DEBUG void bch2_btree_iter_verify(struct btree_iter *, struct btree *); +void bch2_btree_trans_verify_locks(struct btree_trans *); #else static inline void bch2_btree_iter_verify(struct btree_iter *iter, - struct btree *b) {} + struct btree *b) {} +static inline void bch2_btree_trans_verify_locks(struct btree_trans *iter) {} #endif void bch2_btree_node_iter_fix(struct btree_iter *, struct btree *, - struct btree_node_iter *, struct bset_tree *, - struct bkey_packed *, unsigned, unsigned); + struct btree_node_iter *, struct bkey_packed *, + unsigned, unsigned); -int bch2_btree_iter_unlock(struct btree_iter *); -bool __bch2_btree_iter_set_locks_want(struct btree_iter *, unsigned); +bool bch2_trans_relock(struct btree_trans *); +void bch2_trans_unlock(struct btree_trans *); -static inline bool bch2_btree_iter_set_locks_want(struct btree_iter *iter, - unsigned new_locks_want) +bool __bch2_btree_iter_upgrade(struct btree_iter *, unsigned); +bool __bch2_btree_iter_upgrade_nounlock(struct btree_iter *, unsigned); + +static inline bool bch2_btree_iter_upgrade(struct btree_iter *iter, + unsigned new_locks_want) { new_locks_want = min(new_locks_want, BTREE_MAX_DEPTH); - if (iter->locks_want == new_locks_want && - iter->nodes_intent_locked == (1 << new_locks_want) - 1) - return true; + return iter->locks_want < new_locks_want + ? (!iter->trans->nounlock + ? __bch2_btree_iter_upgrade(iter, new_locks_want) + : __bch2_btree_iter_upgrade_nounlock(iter, new_locks_want)) + : iter->uptodate <= BTREE_ITER_NEED_PEEK; +} + +void __bch2_btree_iter_downgrade(struct btree_iter *, unsigned); - return __bch2_btree_iter_set_locks_want(iter, new_locks_want); +static inline void bch2_btree_iter_downgrade(struct btree_iter *iter) +{ + if (iter->locks_want > (iter->flags & BTREE_ITER_INTENT) ? 
1 : 0) + __bch2_btree_iter_downgrade(iter, 0); } -bool bch2_btree_iter_node_replace(struct btree_iter *, struct btree *); -void bch2_btree_iter_node_drop_linked(struct btree_iter *, struct btree *); +void bch2_btree_iter_node_replace(struct btree_iter *, struct btree *); void bch2_btree_iter_node_drop(struct btree_iter *, struct btree *); void bch2_btree_iter_reinit_node(struct btree_iter *, struct btree *); int __must_check bch2_btree_iter_traverse(struct btree_iter *); +int bch2_btree_iter_traverse_all(struct btree_trans *); struct btree *bch2_btree_iter_peek_node(struct btree_iter *); struct btree *bch2_btree_iter_next_node(struct btree_iter *, unsigned); struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *); struct bkey_s_c bch2_btree_iter_next(struct btree_iter *); +struct bkey_s_c bch2_btree_iter_prev(struct btree_iter *); struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *); struct bkey_s_c bch2_btree_iter_next_slot(struct btree_iter *); @@ -119,37 +150,32 @@ struct bkey_s_c bch2_btree_iter_next_slot(struct btree_iter *); void bch2_btree_iter_set_pos_same_leaf(struct btree_iter *, struct bpos); void bch2_btree_iter_set_pos(struct btree_iter *, struct bpos); -void __bch2_btree_iter_init(struct btree_iter *, struct bch_fs *, - enum btree_id, struct bpos, - unsigned , unsigned, unsigned); - -static inline void bch2_btree_iter_init(struct btree_iter *iter, - struct bch_fs *c, enum btree_id btree_id, - struct bpos pos, unsigned flags) -{ - __bch2_btree_iter_init(iter, c, btree_id, pos, - flags & BTREE_ITER_INTENT ? 1 : 0, 0, - (btree_id == BTREE_ID_EXTENTS - ? BTREE_ITER_IS_EXTENTS : 0)|flags); -} - -void bch2_btree_iter_link(struct btree_iter *, struct btree_iter *); -void bch2_btree_iter_unlink(struct btree_iter *); -void bch2_btree_iter_copy(struct btree_iter *, struct btree_iter *); - static inline struct bpos btree_type_successor(enum btree_id id, struct bpos pos) { if (id == BTREE_ID_INODES) { pos.inode++; pos.offset = 0; - } else if (id != BTREE_ID_EXTENTS) { + } else if (!btree_node_type_is_extents(id)) { pos = bkey_successor(pos); } return pos; } +static inline struct bpos btree_type_predecessor(enum btree_id id, + struct bpos pos) +{ + if (id == BTREE_ID_INODES) { + --pos.inode; + pos.offset = 0; + } else { + pos = bkey_predecessor(pos); + } + + return pos; +} + static inline int __btree_iter_cmp(enum btree_id id, struct bpos pos, const struct btree_iter *r) @@ -169,26 +195,29 @@ static inline int btree_iter_cmp(const struct btree_iter *l, * Unlocks before scheduling * Note: does not revalidate iterator */ -static inline void bch2_btree_iter_cond_resched(struct btree_iter *iter) +static inline int bch2_trans_cond_resched(struct btree_trans *trans) { - if (need_resched()) { - bch2_btree_iter_unlock(iter); + if (need_resched() || race_fault()) { + bch2_trans_unlock(trans); schedule(); - } else if (race_fault()) { - bch2_btree_iter_unlock(iter); + return bch2_trans_relock(trans) ? 
0 : -EINTR; + } else { + return 0; } } -#define __for_each_btree_node(_iter, _c, _btree_id, _start, \ +#define __for_each_btree_node(_trans, _iter, _btree_id, _start, \ _locks_want, _depth, _flags, _b) \ - for (__bch2_btree_iter_init((_iter), (_c), (_btree_id), _start, \ - _locks_want, _depth, _flags), \ + for (iter = bch2_trans_get_node_iter((_trans), (_btree_id), \ + _start, _locks_want, _depth, _flags), \ _b = bch2_btree_iter_peek_node(_iter); \ (_b); \ (_b) = bch2_btree_iter_next_node(_iter, _depth)) -#define for_each_btree_node(_iter, _c, _btree_id, _start, _flags, _b) \ - __for_each_btree_node(_iter, _c, _btree_id, _start, 0, 0, _flags, _b) +#define for_each_btree_node(_trans, _iter, _btree_id, _start, \ + _flags, _b) \ + __for_each_btree_node(_trans, _iter, _btree_id, _start, \ + 0, 0, _flags, _b) static inline struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, unsigned flags) @@ -201,28 +230,77 @@ static inline struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, static inline struct bkey_s_c __bch2_btree_iter_next(struct btree_iter *iter, unsigned flags) { - bch2_btree_iter_cond_resched(iter); - return flags & BTREE_ITER_SLOTS ? bch2_btree_iter_next_slot(iter) : bch2_btree_iter_next(iter); } -#define for_each_btree_key(_iter, _c, _btree_id, _start, _flags, _k) \ - for (bch2_btree_iter_init((_iter), (_c), (_btree_id), \ - (_start), (_flags)), \ - (_k) = __bch2_btree_iter_peek(_iter, _flags); \ - !IS_ERR_OR_NULL((_k).k); \ - (_k) = __bch2_btree_iter_next(_iter, _flags)) +#define for_each_btree_key(_trans, _iter, _btree_id, \ + _start, _flags, _k, _ret) \ + for ((_ret) = PTR_ERR_OR_ZERO((_iter) = \ + bch2_trans_get_iter((_trans), (_btree_id), \ + (_start), (_flags))) ?: \ + PTR_ERR_OR_ZERO(((_k) = \ + __bch2_btree_iter_peek(_iter, _flags)).k); \ + !ret && (_k).k; \ + (_ret) = PTR_ERR_OR_ZERO(((_k) = \ + __bch2_btree_iter_next(_iter, _flags)).k)) #define for_each_btree_key_continue(_iter, _flags, _k) \ for ((_k) = __bch2_btree_iter_peek(_iter, _flags); \ !IS_ERR_OR_NULL((_k).k); \ (_k) = __bch2_btree_iter_next(_iter, _flags)) -static inline int btree_iter_err(struct bkey_s_c k) +static inline int bkey_err(struct bkey_s_c k) { return PTR_ERR_OR_ZERO(k.k); } +/* new multiple iterator interface: */ + +int bch2_trans_iter_put(struct btree_trans *, struct btree_iter *); +int bch2_trans_iter_free(struct btree_trans *, struct btree_iter *); +int bch2_trans_iter_free_on_commit(struct btree_trans *, struct btree_iter *); + +void bch2_trans_unlink_iters(struct btree_trans *, u64); + +struct btree_iter *__bch2_trans_get_iter(struct btree_trans *, enum btree_id, + struct bpos, unsigned, u64); +struct btree_iter *bch2_trans_copy_iter(struct btree_trans *, + struct btree_iter *); + +static __always_inline u64 __btree_iter_id(void) +{ + u64 ret = 0; + + ret <<= 32; + ret |= _RET_IP_ & U32_MAX; + ret <<= 32; + ret |= _THIS_IP_ & U32_MAX; + return ret; +} + +static __always_inline struct btree_iter * +bch2_trans_get_iter(struct btree_trans *trans, enum btree_id btree_id, + struct bpos pos, unsigned flags) +{ + return __bch2_trans_get_iter(trans, btree_id, pos, flags, + __btree_iter_id()); +} + +struct btree_iter *bch2_trans_get_node_iter(struct btree_trans *, + enum btree_id, struct bpos, + unsigned, unsigned, unsigned); + +void bch2_trans_begin(struct btree_trans *); + +static inline void bch2_trans_begin_updates(struct btree_trans *trans) +{ + trans->nr_updates = 0; +} + +void *bch2_trans_kmalloc(struct btree_trans *, size_t); +void bch2_trans_init(struct btree_trans *, 
struct bch_fs *, unsigned, size_t); +int bch2_trans_exit(struct btree_trans *); + #endif /* _BCACHEFS_BTREE_ITER_H */ diff --git a/fs/bcachefs/btree_locking.h b/fs/bcachefs/btree_locking.h index cbd28ca5e25e..ea07ba19c5dc 100644 --- a/fs/bcachefs/btree_locking.h +++ b/fs/bcachefs/btree_locking.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _BCACHEFS_BTREE_LOCKING_H #define _BCACHEFS_BTREE_LOCKING_H @@ -12,7 +13,6 @@ #include <linux/six.h> #include "btree_iter.h" -#include "btree_io.h" /* matches six lock types */ enum btree_node_locked_type { @@ -76,19 +76,26 @@ static inline void mark_btree_node_intent_locked(struct btree_iter *iter, mark_btree_node_locked(iter, level, SIX_LOCK_intent); } -static inline enum six_lock_type btree_lock_want(struct btree_iter *iter, int level) +static inline enum six_lock_type __btree_lock_want(struct btree_iter *iter, int level) { return level < iter->locks_want ? SIX_LOCK_intent : SIX_LOCK_read; } -static inline bool btree_want_intent(struct btree_iter *iter, int level) +static inline enum btree_node_locked_type +btree_lock_want(struct btree_iter *iter, int level) { - return btree_lock_want(iter, level) == SIX_LOCK_intent; + if (level < iter->level) + return BTREE_NODE_UNLOCKED; + if (level < iter->locks_want) + return BTREE_NODE_INTENT_LOCKED; + if (level == iter->level) + return BTREE_NODE_READ_LOCKED; + return BTREE_NODE_UNLOCKED; } -static inline void btree_node_unlock(struct btree_iter *iter, unsigned level) +static inline void __btree_node_unlock(struct btree_iter *iter, unsigned level) { int lock_type = btree_node_locked_type(iter, level); @@ -99,6 +106,21 @@ static inline void btree_node_unlock(struct btree_iter *iter, unsigned level) mark_btree_node_unlocked(iter, level); } +static inline void btree_node_unlock(struct btree_iter *iter, unsigned level) +{ + EBUG_ON(!level && iter->trans->nounlock); + + __btree_node_unlock(iter, level); +} + +static inline void __bch2_btree_iter_unlock(struct btree_iter *iter) +{ + btree_iter_set_dirty(iter, BTREE_ITER_NEED_RELOCK); + + while (iter->nodes_locked) + btree_node_unlock(iter, __ffs(iter->nodes_locked)); +} + static inline enum bch_time_stats lock_to_time_stat(enum six_lock_type type) { switch (type) { @@ -132,8 +154,29 @@ static inline void btree_node_lock_type(struct bch_fs *c, struct btree *b, __btree_node_lock_type(c, b, type); } +/* + * Lock a btree node if we already have it locked on one of our linked + * iterators: + */ +static inline bool btree_node_lock_increment(struct btree_iter *iter, + struct btree *b, unsigned level, + enum btree_node_locked_type want) +{ + struct btree_iter *linked; + + trans_for_each_iter(iter->trans, linked) + if (linked != iter && + linked->l[level].b == b && + btree_node_locked_type(linked, level) >= want) { + six_lock_increment(&b->lock, want); + return true; + } + + return false; +} + bool __bch2_btree_node_lock(struct btree *, struct bpos, unsigned, - struct btree_iter *, enum six_lock_type); + struct btree_iter *, enum six_lock_type); static inline bool btree_node_lock(struct btree *b, struct bpos pos, unsigned level, @@ -143,6 +186,7 @@ static inline bool btree_node_lock(struct btree *b, struct bpos pos, EBUG_ON(level >= BTREE_MAX_DEPTH); return likely(six_trylock_type(&b->lock, type)) || + btree_node_lock_increment(iter, b, level, type) || __bch2_btree_node_lock(b, pos, level, iter, type); } @@ -151,13 +195,14 @@ bool __bch2_btree_node_relock(struct btree_iter *, unsigned); static inline bool bch2_btree_node_relock(struct btree_iter *iter, 
unsigned level) { - return likely(btree_lock_want(iter, level) == - btree_node_locked_type(iter, level)) || + EBUG_ON(btree_node_locked(iter, level) && + btree_node_locked_type(iter, level) != + __btree_lock_want(iter, level)); + + return likely(btree_node_locked(iter, level)) || __bch2_btree_node_relock(iter, level); } -bool bch2_btree_iter_relock(struct btree_iter *); - void bch2_btree_node_unlock_write(struct btree *, struct btree_iter *); void __bch2_btree_node_lock_write(struct btree *, struct btree_iter *); @@ -165,7 +210,7 @@ void __bch2_btree_node_lock_write(struct btree *, struct btree_iter *); static inline void bch2_btree_node_lock_write(struct btree *b, struct btree_iter *iter) { EBUG_ON(iter->l[b->level].b != b); - EBUG_ON(iter->lock_seq[b->level] != b->lock.state.seq); + EBUG_ON(iter->l[b->level].lock_seq != b->lock.state.seq); if (!six_trylock_write(&b->lock)) __bch2_btree_node_lock_write(b, iter); diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index a01c1b378457..91aa30a6ed2f 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _BCACHEFS_BTREE_TYPES_H #define _BCACHEFS_BTREE_TYPES_H @@ -6,10 +7,12 @@ #include <linux/six.h> #include "bkey_methods.h" +#include "buckets_types.h" #include "journal_types.h" struct open_bucket; struct btree_update; +struct btree_trans; #define MAX_BSETS 3U @@ -53,13 +56,8 @@ struct btree_write { struct closure_waitlist wait; }; -struct btree_ob_ref { - u8 nr; - u8 refs[BCH_REPLICAS_MAX]; -}; - struct btree_alloc { - struct btree_ob_ref ob; + struct open_buckets ob; BKEY_PADDED(k); }; @@ -126,7 +124,7 @@ struct btree { */ unsigned long will_make_reachable; - struct btree_ob_ref ob; + struct open_buckets ob; /* lru list */ struct list_head list; @@ -175,25 +173,26 @@ struct btree_cache { }; struct btree_node_iter { - u8 is_extents; - struct btree_node_iter_set { u16 k, end; } data[MAX_BSETS]; }; -#define BTREE_ITER_SLOTS (1 << 0) -#define BTREE_ITER_INTENT (1 << 1) -#define BTREE_ITER_PREFETCH (1 << 2) +enum btree_iter_type { + BTREE_ITER_KEYS, + BTREE_ITER_SLOTS, + BTREE_ITER_NODES, +}; + +#define BTREE_ITER_TYPE ((1 << 2) - 1) + +#define BTREE_ITER_INTENT (1 << 2) +#define BTREE_ITER_PREFETCH (1 << 3) /* * Used in bch2_btree_iter_traverse(), to indicate whether we're searching for * @pos or the first key strictly greater than @pos */ -#define BTREE_ITER_IS_EXTENTS (1 << 3) -/* - * indicates we need to call bch2_btree_iter_traverse() to revalidate iterator: - */ -#define BTREE_ITER_AT_END_OF_LEAF (1 << 4) +#define BTREE_ITER_IS_EXTENTS (1 << 4) #define BTREE_ITER_ERROR (1 << 5) enum btree_iter_uptodate { @@ -201,7 +200,6 @@ enum btree_iter_uptodate { BTREE_ITER_NEED_PEEK = 1, BTREE_ITER_NEED_RELOCK = 2, BTREE_ITER_NEED_TRAVERSE = 3, - BTREE_ITER_END = 4, }; /* @@ -212,11 +210,13 @@ enum btree_iter_uptodate { * @nodes_intent_locked - bitmask indicating which locks are intent locks */ struct btree_iter { - struct bch_fs *c; + u8 idx; + + struct btree_trans *trans; struct bpos pos; u8 flags; - unsigned uptodate:4; + enum btree_iter_uptodate uptodate:4; enum btree_id btree_id:4; unsigned level:4, locks_want:4, @@ -226,25 +226,84 @@ struct btree_iter { struct btree_iter_level { struct btree *b; struct btree_node_iter iter; + u32 lock_seq; } l[BTREE_MAX_DEPTH]; - u32 lock_seq[BTREE_MAX_DEPTH]; - /* * Current unpacked key - so that bch2_btree_iter_next()/ * bch2_btree_iter_next_slot() can correctly advance pos. 
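
The trans_for_each_iter() walk in the btree_iter.h hunk above replaces the old circular linked list of iterators with an occupancy bitmask: struct btree_trans (defined just below) holds up to BTREE_ITER_MAX = 64 iterators and records live slots in the u64 iters_linked. A standalone sketch of that bitmask walk, with toy_ stand-ins for the real types:

	#include <stdint.h>
	#include <stdio.h>

	struct toy_trans {
		uint64_t iters_linked;	/* bit n set => slot n holds a live iterator */
		int      iters[64];	/* stand-in for struct btree_iter[] */
	};

	/* Mirrors __trans_next_iter(): find the next set bit at or after @idx: */
	static int toy_next_iter_idx(const struct toy_trans *trans, unsigned idx)
	{
		for (; idx < 64; idx++)
			if (trans->iters_linked & (1ULL << idx))
				return (int) idx;
		return -1;
	}

	int main(void)
	{
		struct toy_trans t = { .iters_linked = (1ULL << 0) | (1ULL << 5) };

		for (int i = toy_next_iter_idx(&t, 0);
		     i >= 0;
		     i = toy_next_iter_idx(&t, (unsigned) i + 1))
			printf("live iterator in slot %d\n", i);
		return 0;
	}
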
*/ struct bkey k; - /* - * Circular linked list of linked iterators: linked iterators share - * locks (e.g. two linked iterators may have the same node intent - * locked, or read and write locked, at the same time), and insertions - * through one iterator won't invalidate the other linked iterators. - */ + u64 id; +}; + +struct deferred_update { + struct journal_preres res; + struct journal_entry_pin journal; + + spinlock_t lock; + unsigned dirty:1; + + u8 allocated_u64s; + enum btree_id btree_id; + + /* must be last: */ + struct bkey_i k; +}; + +struct btree_insert_entry { + struct bkey_i *k; + + union { + struct btree_iter *iter; + struct deferred_update *d; + }; - /* Must come last: */ - struct btree_iter *next; + bool deferred; + bool triggered; + bool marked; +}; + +#define BTREE_ITER_MAX 64 + +struct btree_trans { + struct bch_fs *c; + unsigned long ip; + u64 commit_start; + + u64 iters_linked; + u64 iters_live; + u64 iters_touched; + u64 iters_unlink_on_restart; + u64 iters_unlink_on_commit; + + u8 nr_iters; + u8 nr_updates; + u8 size; + unsigned used_mempool:1; + unsigned error:1; + unsigned nounlock:1; + + unsigned mem_top; + unsigned mem_bytes; + void *mem; + + struct btree_iter *iters; + struct btree_insert_entry *updates; + + /* update path: */ + struct journal_res journal_res; + struct journal_preres journal_preres; + u64 *journal_seq; + struct disk_reservation *disk_res; + unsigned flags; + unsigned journal_u64s; + + struct btree_iter iters_onstack[2]; + struct btree_insert_entry updates_onstack[6]; + + struct replicas_delta_list *fs_usage_deltas; }; #define BTREE_FLAG(flag) \ @@ -299,10 +358,38 @@ static inline struct bset_tree *bset_tree_last(struct btree *b) return b->set + b->nsets - 1; } +static inline void * +__btree_node_offset_to_ptr(const struct btree *b, u16 offset) +{ + return (void *) ((u64 *) b->data + 1 + offset); +} + +static inline u16 +__btree_node_ptr_to_offset(const struct btree *b, const void *p) +{ + u16 ret = (u64 *) p - 1 - (u64 *) b->data; + + EBUG_ON(__btree_node_offset_to_ptr(b, ret) != p); + return ret; +} + static inline struct bset *bset(const struct btree *b, const struct bset_tree *t) { - return (void *) b->data + t->data_offset * sizeof(u64); + return __btree_node_offset_to_ptr(b, t->data_offset); +} + +static inline void set_btree_bset_end(struct btree *b, struct bset_tree *t) +{ + t->end_offset = + __btree_node_ptr_to_offset(b, vstruct_last(bset(b, t))); +} + +static inline void set_btree_bset(struct btree *b, struct bset_tree *t, + const struct bset *i) +{ + t->data_offset = __btree_node_ptr_to_offset(b, i); + set_btree_bset_end(b, t); } static inline struct bset *btree_bset_first(struct btree *b) @@ -318,19 +405,27 @@ static inline struct bset *btree_bset_last(struct btree *b) static inline u16 __btree_node_key_to_offset(const struct btree *b, const struct bkey_packed *k) { - size_t ret = (u64 *) k - (u64 *) b->data - 1; - - EBUG_ON(ret > U16_MAX); - return ret; + return __btree_node_ptr_to_offset(b, k); } static inline struct bkey_packed * __btree_node_offset_to_key(const struct btree *b, u16 k) { - return (void *) ((u64 *) b->data + k + 1); + return __btree_node_offset_to_ptr(b, k); } -#define btree_bkey_first(_b, _t) (bset(_b, _t)->start) +static inline unsigned btree_bkey_first_offset(const struct bset_tree *t) +{ + return t->data_offset + offsetof(struct bset, _data) / sizeof(u64); +} + +#define btree_bkey_first(_b, _t) \ +({ \ + EBUG_ON(bset(_b, _t)->start != \ + __btree_node_offset_to_key(_b, btree_bkey_first_offset(_t)));\ + \ + 
bset(_b, _t)->start; \ +}) #define btree_bkey_last(_b, _t) \ ({ \ @@ -340,47 +435,52 @@ __btree_node_offset_to_key(const struct btree *b, u16 k) __btree_node_offset_to_key(_b, (_t)->end_offset); \ }) -static inline void set_btree_bset_end(struct btree *b, struct bset_tree *t) +static inline unsigned bset_byte_offset(struct btree *b, void *i) { - t->end_offset = - __btree_node_key_to_offset(b, vstruct_last(bset(b, t))); - btree_bkey_last(b, t); + return i - (void *) b->data; } -static inline void set_btree_bset(struct btree *b, struct bset_tree *t, - const struct bset *i) -{ - t->data_offset = (u64 *) i - (u64 *) b->data; - - EBUG_ON(bset(b, t) != i); - - set_btree_bset_end(b, t); -} +enum btree_node_type { +#define x(kwd, val, name) BKEY_TYPE_##kwd = val, + BCH_BTREE_IDS() +#undef x + BKEY_TYPE_BTREE, +}; -static inline unsigned bset_byte_offset(struct btree *b, void *i) +/* Type of a key in btree @id at level @level: */ +static inline enum btree_node_type __btree_node_type(unsigned level, enum btree_id id) { - return i - (void *) b->data; + return level ? BKEY_TYPE_BTREE : (enum btree_node_type) id; } /* Type of keys @b contains: */ -static inline enum bkey_type btree_node_type(struct btree *b) +static inline enum btree_node_type btree_node_type(struct btree *b) { - return b->level ? BKEY_TYPE_BTREE : b->btree_id; + return __btree_node_type(b->level, b->btree_id); } -static inline const struct bkey_ops *btree_node_ops(struct btree *b) +static inline bool btree_node_type_is_extents(enum btree_node_type type) { - return &bch2_bkey_ops[btree_node_type(b)]; + return type == BKEY_TYPE_EXTENTS; } -static inline bool btree_node_has_ptrs(struct btree *b) +static inline bool btree_node_is_extents(struct btree *b) { - return btree_type_has_ptrs(btree_node_type(b)); + return btree_node_type_is_extents(btree_node_type(b)); } -static inline bool btree_node_is_extents(struct btree *b) +static inline bool btree_node_type_needs_gc(enum btree_node_type type) { - return btree_node_type(b) == BKEY_TYPE_EXTENTS; + switch (type) { + case BKEY_TYPE_ALLOC: + case BKEY_TYPE_BTREE: + case BKEY_TYPE_EXTENTS: + case BKEY_TYPE_INODES: + case BKEY_TYPE_EC: + return true; + default: + return false; + } } struct btree_root { @@ -392,6 +492,7 @@ struct btree_root { __BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX); u8 level; u8 alive; + s8 error; }; /* @@ -399,26 +500,13 @@ struct btree_root { * we're holding the write lock and we know what key is about to be overwritten: */ -struct btree_iter; -struct btree_node_iter; - enum btree_insert_ret { BTREE_INSERT_OK, - /* extent spanned multiple leaf nodes: have to traverse to next node: */ - BTREE_INSERT_NEED_TRAVERSE, - /* write lock held for too long */ - BTREE_INSERT_NEED_RESCHED, /* leaf node needs to be split */ BTREE_INSERT_BTREE_NODE_FULL, - BTREE_INSERT_JOURNAL_RES_FULL, BTREE_INSERT_ENOSPC, - BTREE_INSERT_NEED_GC_LOCK, -}; - -struct extent_insert_hook { - enum btree_insert_ret - (*fn)(struct extent_insert_hook *, struct bpos, struct bpos, - struct bkey_s_c, const struct bkey_i *); + BTREE_INSERT_NEED_MARK_REPLICAS, + BTREE_INSERT_NEED_JOURNAL_RES, }; enum btree_gc_coalesce_fail_reason { diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h index fd3e0affb636..616c103c05ec 100644 --- a/fs/bcachefs/btree_update.h +++ b/fs/bcachefs/btree_update.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _BCACHEFS_BTREE_UPDATE_H #define _BCACHEFS_BTREE_UPDATE_H @@ -6,126 +7,176 @@ struct bch_fs; struct btree; -struct btree_insert; void 
bch2_btree_node_lock_for_insert(struct bch_fs *, struct btree *, struct btree_iter *); bool bch2_btree_bset_insert_key(struct btree_iter *, struct btree *, struct btree_node_iter *, struct bkey_i *); -void bch2_btree_journal_key(struct btree_insert *trans, struct btree_iter *, +void bch2_btree_journal_key(struct btree_trans *, struct btree_iter *, struct bkey_i *); -/* Normal update interface: */ - -struct btree_insert { - struct bch_fs *c; - struct disk_reservation *disk_res; - struct journal_res journal_res; - u64 *journal_seq; - struct extent_insert_hook *hook; - unsigned flags; - bool did_work; - - unsigned short nr; - struct btree_insert_entry { - struct btree_iter *iter; - struct bkey_i *k; - unsigned extra_res; - /* - * true if entire key was inserted - can only be false for - * extents - */ - bool done; - } *entries; -}; - -int __bch2_btree_insert_at(struct btree_insert *); +void bch2_deferred_update_free(struct bch_fs *, + struct deferred_update *); +struct deferred_update * +bch2_deferred_update_alloc(struct bch_fs *, enum btree_id, unsigned); #define BTREE_INSERT_ENTRY(_iter, _k) \ ((struct btree_insert_entry) { \ .iter = (_iter), \ .k = (_k), \ - .done = false, \ }) -#define BTREE_INSERT_ENTRY_EXTRA_RES(_iter, _k, _extra) \ +#define BTREE_INSERT_DEFERRED(_d, _k) \ ((struct btree_insert_entry) { \ - .iter = (_iter), \ .k = (_k), \ - .extra_res = (_extra), \ - .done = false, \ + .d = (_d), \ + .deferred = true, \ }) -/** - * bch_btree_insert_at - insert one or more keys at iterator positions - * @iter: btree iterator - * @insert_key: key to insert - * @disk_res: disk reservation - * @hook: extent insert callback - * - * Return values: - * -EINTR: locking changed, this function should be called again. Only returned - * if passed BTREE_INSERT_ATOMIC. - * -EROFS: filesystem read only - * -EIO: journal or btree node IO error +enum { + __BTREE_INSERT_ATOMIC, + __BTREE_INSERT_NOUNLOCK, + __BTREE_INSERT_NOFAIL, + __BTREE_INSERT_NOCHECK_RW, + __BTREE_INSERT_LAZY_RW, + __BTREE_INSERT_USE_RESERVE, + __BTREE_INSERT_USE_ALLOC_RESERVE, + __BTREE_INSERT_JOURNAL_REPLAY, + __BTREE_INSERT_JOURNAL_RESERVED, + __BTREE_INSERT_NOMARK_INSERT, + __BTREE_INSERT_NOMARK_OVERWRITES, + __BTREE_INSERT_NOMARK, + __BTREE_INSERT_MARK_INMEM, + __BTREE_INSERT_NO_CLEAR_REPLICAS, + __BTREE_INSERT_BUCKET_INVALIDATE, + __BTREE_INSERT_NOWAIT, + __BTREE_INSERT_GC_LOCK_HELD, + __BCH_HASH_SET_MUST_CREATE, + __BCH_HASH_SET_MUST_REPLACE, +}; + +/* + * Don't drop/retake locks before doing btree update, instead return -EINTR if + * we had to drop locks for any reason */ -#define bch2_btree_insert_at(_c, _disk_res, _hook, \ - _journal_seq, _flags, ...) 
\ - __bch2_btree_insert_at(&(struct btree_insert) { \ - .c = (_c), \ - .disk_res = (_disk_res), \ - .journal_seq = (_journal_seq), \ - .hook = (_hook), \ - .flags = (_flags), \ - .nr = COUNT_ARGS(__VA_ARGS__), \ - .entries = (struct btree_insert_entry[]) { \ - __VA_ARGS__ \ - }}) +#define BTREE_INSERT_ATOMIC (1 << __BTREE_INSERT_ATOMIC) /* - * Don't drop/retake locks: instead return -EINTR if need to upgrade to intent - * locks, -EAGAIN if need to wait on btree reserve + * Don't drop locks _after_ successfully updating btree: */ -#define BTREE_INSERT_ATOMIC (1 << 0) +#define BTREE_INSERT_NOUNLOCK (1 << __BTREE_INSERT_NOUNLOCK) /* Don't check for -ENOSPC: */ -#define BTREE_INSERT_NOFAIL (1 << 1) +#define BTREE_INSERT_NOFAIL (1 << __BTREE_INSERT_NOFAIL) + +#define BTREE_INSERT_NOCHECK_RW (1 << __BTREE_INSERT_NOCHECK_RW) +#define BTREE_INSERT_LAZY_RW (1 << __BTREE_INSERT_LAZY_RW) /* for copygc, or when merging btree nodes */ -#define BTREE_INSERT_USE_RESERVE (1 << 2) -#define BTREE_INSERT_USE_ALLOC_RESERVE (1 << 3) +#define BTREE_INSERT_USE_RESERVE (1 << __BTREE_INSERT_USE_RESERVE) +#define BTREE_INSERT_USE_ALLOC_RESERVE (1 << __BTREE_INSERT_USE_ALLOC_RESERVE) -/* - * Insert is for journal replay: don't get journal reservations, or mark extents - * (bch_mark_key) - */ -#define BTREE_INSERT_JOURNAL_REPLAY (1 << 4) +/* Insert is for journal replay - don't get journal reservations: */ +#define BTREE_INSERT_JOURNAL_REPLAY (1 << __BTREE_INSERT_JOURNAL_REPLAY) -/* Don't block on allocation failure (for new btree nodes: */ -#define BTREE_INSERT_NOWAIT (1 << 5) -#define BTREE_INSERT_GC_LOCK_HELD (1 << 6) +#define BTREE_INSERT_JOURNAL_RESERVED (1 << __BTREE_INSERT_JOURNAL_RESERVED) -#define BCH_HASH_SET_MUST_CREATE (1 << 7) -#define BCH_HASH_SET_MUST_REPLACE (1 << 8) +/* Don't mark new key, just overwrites: */ +#define BTREE_INSERT_NOMARK_INSERT (1 << __BTREE_INSERT_NOMARK_INSERT) -int bch2_btree_delete_at(struct btree_iter *, unsigned); +/* Don't mark overwrites, just new key: */ +#define BTREE_INSERT_NOMARK_OVERWRITES (1 << __BTREE_INSERT_NOMARK_OVERWRITES) -int bch2_btree_insert_list_at(struct btree_iter *, struct keylist *, - struct disk_reservation *, - struct extent_insert_hook *, u64 *, unsigned); +/* Don't call mark new key at all: */ +#define BTREE_INSERT_NOMARK (1 << __BTREE_INSERT_NOMARK) + +/* Don't mark transactionally: */ +#define BTREE_INSERT_MARK_INMEM (1 << __BTREE_INSERT_MARK_INMEM) + +#define BTREE_INSERT_NO_CLEAR_REPLICAS (1 << __BTREE_INSERT_NO_CLEAR_REPLICAS) + +#define BTREE_INSERT_BUCKET_INVALIDATE (1 << __BTREE_INSERT_BUCKET_INVALIDATE) + +/* Don't block on allocation failure (for new btree nodes: */ +#define BTREE_INSERT_NOWAIT (1 << __BTREE_INSERT_NOWAIT) +#define BTREE_INSERT_GC_LOCK_HELD (1 << __BTREE_INSERT_GC_LOCK_HELD) + +#define BCH_HASH_SET_MUST_CREATE (1 << __BCH_HASH_SET_MUST_CREATE) +#define BCH_HASH_SET_MUST_REPLACE (1 << __BCH_HASH_SET_MUST_REPLACE) + +int bch2_btree_delete_at(struct btree_trans *, struct btree_iter *, unsigned); int bch2_btree_insert(struct bch_fs *, enum btree_id, struct bkey_i *, - struct disk_reservation *, - struct extent_insert_hook *, u64 *, int flags); + struct disk_reservation *, u64 *, int flags); +int bch2_btree_delete_at_range(struct btree_trans *, struct btree_iter *, + struct bpos, u64 *); int bch2_btree_delete_range(struct bch_fs *, enum btree_id, - struct bpos, struct bpos, struct bversion, - struct disk_reservation *, - struct extent_insert_hook *, u64 *); + struct bpos, struct bpos, u64 *); int bch2_btree_node_rewrite(struct 
bch_fs *c, struct btree_iter *, __le64, unsigned); int bch2_btree_node_update_key(struct bch_fs *, struct btree_iter *, - struct btree *, struct bkey_i_extent *); + struct btree *, struct bkey_i_btree_ptr *); + +int bch2_trans_commit(struct btree_trans *, + struct disk_reservation *, + u64 *, unsigned); + +struct btree_insert_entry *bch2_trans_update(struct btree_trans *, + struct btree_insert_entry); + +#define bch2_trans_do(_c, _journal_seq, _flags, _do) \ +({ \ + struct btree_trans trans; \ + int _ret; \ + \ + bch2_trans_init(&trans, (_c), 0, 0); \ + \ + do { \ + bch2_trans_begin(&trans); \ + \ + _ret = (_do) ?: bch2_trans_commit(&trans, NULL, \ + (_journal_seq), (_flags)); \ + } while (_ret == -EINTR); \ + \ + bch2_trans_exit(&trans); \ + _ret; \ +}) + +/* + * We sort transaction entries so that if multiple iterators point to the same + * leaf node they'll be adjacent: + */ +static inline bool same_leaf_as_prev(struct btree_trans *trans, + struct btree_insert_entry *i) +{ + return i != trans->updates && + !i->deferred && + i[0].iter->l[0].b == i[-1].iter->l[0].b; +} + +#define __trans_next_update(_trans, _i, _filter) \ +({ \ + while ((_i) < (_trans)->updates + (_trans->nr_updates) && !(_filter))\ + (_i)++; \ + \ + (_i) < (_trans)->updates + (_trans->nr_updates); \ +}) + +#define __trans_for_each_update(_trans, _i, _filter) \ + for ((_i) = (_trans)->updates; \ + __trans_next_update(_trans, _i, _filter); \ + (_i)++) + +#define trans_for_each_update(trans, i) \ + __trans_for_each_update(trans, i, true) + +#define trans_for_each_update_iter(trans, i) \ + __trans_for_each_update(trans, i, !(i)->deferred) + +#define trans_for_each_update_leaf(trans, i) \ + __trans_for_each_update(trans, i, !(i)->deferred && \ + !same_leaf_as_prev(trans, i)) #endif /* _BCACHEFS_BTREE_UPDATE_H */ diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index c3ecc1e96726..9294137719df 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -1,6 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" -#include "alloc.h" +#include "alloc_foreground.h" #include "bkey_methods.h" #include "btree_cache.h" #include "btree_gc.h" @@ -34,7 +35,7 @@ static void btree_node_interior_verify(struct btree *b) BUG_ON(!b->level); - bch2_btree_node_iter_init(&iter, b, b->key.k.p, false, false); + bch2_btree_node_iter_init(&iter, b, &b->key.k.p); #if 1 BUG_ON(!(k = bch2_btree_node_iter_peek(&iter, b)) || bkey_cmp_left_packed(b, k, &b->key.k.p)); @@ -131,13 +132,15 @@ bool bch2_btree_node_format_fits(struct bch_fs *c, struct btree *b, /* Btree node freeing/allocation: */ static bool btree_key_matches(struct bch_fs *c, - struct bkey_s_c_extent l, - struct bkey_s_c_extent r) + struct bkey_s_c l, + struct bkey_s_c r) { + struct bkey_ptrs_c ptrs1 = bch2_bkey_ptrs_c(l); + struct bkey_ptrs_c ptrs2 = bch2_bkey_ptrs_c(r); const struct bch_extent_ptr *ptr1, *ptr2; - extent_for_each_ptr(l, ptr1) - extent_for_each_ptr(r, ptr2) + bkey_for_each_ptr(ptrs1, ptr1) + bkey_for_each_ptr(ptrs2, ptr2) if (ptr1->dev == ptr2->dev && ptr1->gen == ptr2->gen && ptr1->offset == ptr2->offset) @@ -159,18 +162,10 @@ static void bch2_btree_node_free_index(struct btree_update *as, struct btree *b, { struct bch_fs *c = as->c; struct pending_btree_node_free *d; - unsigned replicas; - - /* - * btree_update lock is only needed here to avoid racing with - * gc: - */ - mutex_lock(&c->btree_interior_update_lock); for (d = as->pending; d < as->pending + as->nr_pending; d++) if 
(!bkey_cmp(k.k->p, d->key.k.p) && - btree_key_matches(c, bkey_s_c_to_extent(k), - bkey_i_to_s_c_extent(&d->key))) + btree_key_matches(c, k, bkey_i_to_s_c(&d->key))) goto found; BUG(); found: @@ -178,14 +173,6 @@ found: d->index_update_done = true; /* - * Btree nodes are accounted as freed in bch_alloc_stats when they're - * freed from the index: - */ - replicas = bch2_extent_nr_dirty_ptrs(k); - if (replicas) - stats->s[replicas - 1].data[S_META] -= c->opts.btree_node_size; - - /* * We're dropping @k from the btree, but it's still live until the * index update is persistent so we need to keep a reference around for * mark and sweep to find - that's primarily what the @@ -202,29 +189,17 @@ found: * to cancel out one of mark and sweep's markings if necessary: */ - /* - * bch2_mark_key() compares the current gc pos to the pos we're - * moving this reference from, hence one comparison here: - */ - if (gc_pos_cmp(c->gc_pos, gc_phase(GC_PHASE_PENDING_DELETE)) < 0) { - struct bch_fs_usage tmp = { 0 }; - - bch2_mark_key(c, bkey_i_to_s_c(&d->key), - -c->opts.btree_node_size, true, b - ? gc_pos_btree_node(b) - : gc_pos_btree_root(as->btree_id), - &tmp, 0, 0); - /* - * Don't apply tmp - pending deletes aren't tracked in - * bch_alloc_stats: - */ - } - - mutex_unlock(&c->btree_interior_update_lock); + if (gc_pos_cmp(c->gc_pos, b + ? gc_pos_btree_node(b) + : gc_pos_btree_root(as->btree_id)) >= 0 && + gc_pos_cmp(c->gc_pos, gc_phase(GC_PHASE_PENDING_DELETE)) < 0) + bch2_mark_key_locked(c, bkey_i_to_s_c(&d->key), + 0, NULL, 0, + BCH_BUCKET_MARK_OVERWRITE| + BCH_BUCKET_MARK_GC); } -static void __btree_node_free(struct bch_fs *c, struct btree *b, - struct btree_iter *iter) +static void __btree_node_free(struct bch_fs *c, struct btree *b) { trace_btree_node_free(c, b); @@ -237,26 +212,16 @@ static void __btree_node_free(struct bch_fs *c, struct btree *b, clear_btree_node_noevict(b); - btree_node_lock_type(c, b, SIX_LOCK_write); - bch2_btree_node_hash_remove(&c->btree_cache, b); mutex_lock(&c->btree_cache.lock); list_move(&b->list, &c->btree_cache.freeable); mutex_unlock(&c->btree_cache.lock); - - /* - * By using six_unlock_write() directly instead of - * bch2_btree_node_unlock_write(), we don't update the iterator's - * sequence numbers and cause future bch2_btree_node_relock() calls to - * fail: - */ - six_unlock_write(&b->lock); } void bch2_btree_node_free_never_inserted(struct bch_fs *c, struct btree *b) { - struct btree_ob_ref ob = b->ob; + struct open_buckets ob = b->ob; btree_update_drop_new_node(c, b); @@ -264,14 +229,21 @@ void bch2_btree_node_free_never_inserted(struct bch_fs *c, struct btree *b) clear_btree_node_dirty(b); - __btree_node_free(c, b, NULL); + btree_node_lock_type(c, b, SIX_LOCK_write); + __btree_node_free(c, b); + six_unlock_write(&b->lock); - bch2_open_bucket_put_refs(c, &ob.nr, ob.refs); + bch2_open_buckets_put(c, &ob); } void bch2_btree_node_free_inmem(struct bch_fs *c, struct btree *b, struct btree_iter *iter) { + struct btree_iter *linked; + + trans_for_each_iter(iter->trans, linked) + BUG_ON(linked->l[b->level].b == b); + /* * Is this a node that isn't reachable on disk yet? 
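
The -EINTR plumbing threaded through these hunks all feeds one retry loop: bch2_trans_do(), in the btree_update.h hunk above, begins the transaction, runs the update, commits, and restarts from scratch whenever locks had to be dropped. A compilable sketch of the shape of that loop, with toy_ stubs standing in for the real functions (and using the kernel-style GNU `?:` extension the macro itself relies on):

	#include <errno.h>

	struct toy_fs    { int dummy; };
	struct toy_trans { struct toy_fs *c; int nr_updates; };

	static void toy_trans_init(struct toy_trans *t, struct toy_fs *c)
	{ t->c = c; t->nr_updates = 0; }
	static void toy_trans_begin(struct toy_trans *t)  { t->nr_updates = 0; }
	static int  toy_trans_commit(struct toy_trans *t) { (void) t; return 0; }
	static void toy_trans_exit(struct toy_trans *t)   { (void) t; }

	static int toy_trans_do(struct toy_fs *c, int (*update)(struct toy_trans *))
	{
		struct toy_trans trans;
		int ret;

		toy_trans_init(&trans, c);
		do {
			toy_trans_begin(&trans);	/* reset queued updates */
			ret = update(&trans) ?: toy_trans_commit(&trans);
		} while (ret == -EINTR);		/* lock restart: redo it all */
		toy_trans_exit(&trans);

		return ret;
	}
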
* @@ -283,33 +255,24 @@ void bch2_btree_node_free_inmem(struct bch_fs *c, struct btree *b, */ btree_update_drop_new_node(c, b); - bch2_btree_iter_node_drop_linked(iter, b); - - __btree_node_free(c, b, iter); - - bch2_btree_iter_node_drop(iter, b); + six_lock_write(&b->lock); + __btree_node_free(c, b); + six_unlock_write(&b->lock); + six_unlock_intent(&b->lock); } static void bch2_btree_node_free_ondisk(struct bch_fs *c, struct pending_btree_node_free *pending) { - struct bch_fs_usage stats = { 0 }; - BUG_ON(!pending->index_update_done); - bch2_mark_key(c, bkey_i_to_s_c(&pending->key), - -c->opts.btree_node_size, true, - gc_phase(GC_PHASE_PENDING_DELETE), - &stats, 0, 0); - /* - * Don't apply stats - pending deletes aren't tracked in - * bch_alloc_stats: - */ -} + bch2_mark_key(c, bkey_i_to_s_c(&pending->key), 0, NULL, 0, + BCH_BUCKET_MARK_OVERWRITE); -void bch2_btree_open_bucket_put(struct bch_fs *c, struct btree *b) -{ - bch2_open_bucket_put_refs(c, &b->ob.nr, b->ob.refs); + if (gc_visited(c, gc_phase(GC_PHASE_PENDING_DELETE))) + bch2_mark_key(c, bkey_i_to_s_c(&pending->key), 0, NULL, 0, + BCH_BUCKET_MARK_OVERWRITE| + BCH_BUCKET_MARK_GC); } static struct btree *__bch2_btree_node_alloc(struct bch_fs *c, @@ -320,8 +283,7 @@ static struct btree *__bch2_btree_node_alloc(struct bch_fs *c, struct write_point *wp; struct btree *b; BKEY_PADDED(k) tmp; - struct bkey_i_extent *e; - struct btree_ob_ref ob; + struct open_buckets ob = { .nr = 0 }; struct bch_devs_list devs_have = (struct bch_devs_list) { 0 }; unsigned nr_reserve; enum alloc_reserve alloc_reserve; @@ -350,7 +312,7 @@ static struct btree *__bch2_btree_node_alloc(struct bch_fs *c, mutex_unlock(&c->btree_reserve_cache_lock); retry: - wp = bch2_alloc_sectors_start(c, c->opts.foreground_target, + wp = bch2_alloc_sectors_start(c, c->opts.foreground_target, 0, writepoint_ptr(&c->btree_write_point), &devs_have, res->nr_replicas, @@ -363,7 +325,7 @@ retry: struct open_bucket *ob; unsigned i; - writepoint_for_each_ptr(wp, ob, i) + open_bucket_for_each(c, &wp->ptrs, ob, i) if (ob->sectors_free < c->opts.btree_node_size) ob->sectors_free = 0; @@ -371,11 +333,10 @@ retry: goto retry; } - e = bkey_extent_init(&tmp.k); - bch2_alloc_sectors_append_ptrs(c, wp, e, c->opts.btree_node_size); + bkey_btree_ptr_init(&tmp.k); + bch2_alloc_sectors_append_ptrs(c, wp, &tmp.k, c->opts.btree_node_size); - ob.nr = 0; - bch2_open_bucket_get(c, wp, &ob.nr, ob.refs); + bch2_open_bucket_get(c, wp, &ob); bch2_alloc_sectors_done(c, wp); mem_alloc: b = bch2_btree_node_mem_alloc(c); @@ -404,6 +365,7 @@ static struct btree *bch2_btree_node_alloc(struct btree_update *as, unsigned lev set_btree_node_accessed(b); set_btree_node_dirty(b); + set_btree_node_need_write(b); bch2_bset_init_first(b, &b->data->keys); memset(&b->nr, 0, sizeof(b->nr)); @@ -411,7 +373,7 @@ static struct btree *bch2_btree_node_alloc(struct btree_update *as, unsigned lev b->data->flags = 0; SET_BTREE_NODE_ID(b->data, as->btree_id); SET_BTREE_NODE_LEVEL(b->data, level); - b->data->ptr = bkey_i_to_extent(&b->key)->v.start->ptr; + b->data->ptr = bkey_i_to_btree_ptr(&b->key)->v.start[0]; bch2_btree_build_aux_trees(b); @@ -496,10 +458,12 @@ static void bch2_btree_reserve_put(struct bch_fs *c, struct btree_reserve *reser b->ob.nr = 0; bkey_copy(&a->k, &b->key); } else { - bch2_btree_open_bucket_put(c, b); + bch2_open_buckets_put(c, &b->ob); } - __btree_node_free(c, b, NULL); + btree_node_lock_type(c, b, SIX_LOCK_write); + __btree_node_free(c, b); + six_unlock_write(&b->lock); six_unlock_intent(&b->lock); } @@ 
-518,7 +482,7 @@ static struct btree_reserve *bch2_btree_reserve_get(struct bch_fs *c, struct btree *b; struct disk_reservation disk_res = { 0, 0 }; unsigned sectors = nr_nodes * c->opts.btree_node_size; - int ret, disk_res_flags = BCH_DISK_RESERVATION_GC_LOCK_HELD; + int ret, disk_res_flags = 0; if (flags & BTREE_INSERT_NOFAIL) disk_res_flags |= BCH_DISK_RESERVATION_NOFAIL; @@ -562,8 +526,7 @@ static struct btree_reserve *bch2_btree_reserve_get(struct bch_fs *c, goto err_free; } - ret = bch2_mark_bkey_replicas(c, BCH_DATA_BTREE, - bkey_i_to_s_c(&b->key)); + ret = bch2_mark_bkey_replicas(c, bkey_i_to_s_c(&b->key)); if (ret) goto err_free; @@ -585,6 +548,8 @@ static void bch2_btree_update_free(struct btree_update *as) { struct bch_fs *c = as->c; + bch2_journal_pin_flush(&c->journal, &as->journal); + BUG_ON(as->nr_new_nodes); BUG_ON(as->nr_pending); @@ -596,7 +561,6 @@ static void bch2_btree_update_free(struct btree_update *as) closure_debug_destroy(&as->cl); mempool_free(as, &c->btree_interior_update_pool); - percpu_ref_put(&c->writes); closure_wake_up(&c->btree_interior_update_wait); mutex_unlock(&c->btree_interior_update_lock); @@ -645,12 +609,12 @@ static void btree_update_wait_on_journal(struct closure *cl) int ret; ret = bch2_journal_open_seq_async(&c->journal, as->journal_seq, cl); - if (ret < 0) - goto err; - if (!ret) { + if (ret == -EAGAIN) { continue_at(cl, btree_update_wait_on_journal, system_wq); return; } + if (ret < 0) + goto err; bch2_journal_flush_seq_async(&c->journal, as->journal_seq, cl); err: @@ -690,6 +654,12 @@ retry: closure_wait(&btree_current_write(b)->wait, cl); list_del(&as->write_blocked_list); + + /* + * for flush_held_btree_writes() waiting on updates to flush or + * nodes to be writeable: + */ + closure_wake_up(&c->btree_interior_update_wait); mutex_unlock(&c->btree_interior_update_lock); /* @@ -993,6 +963,12 @@ void bch2_btree_interior_update_will_free_node(struct btree_update *as, list_for_each_entry_safe(p, n, &b->write_blocked, write_blocked_list) { list_del(&p->write_blocked_list); btree_update_reparent(as, p); + + /* + * for flush_held_btree_writes() waiting on updates to flush or + * nodes to be writeable: + */ + closure_wake_up(&c->btree_interior_update_wait); } clear_btree_node_dirty(b); @@ -1046,14 +1022,9 @@ bch2_btree_update_start(struct bch_fs *c, enum btree_id id, struct btree_reserve *reserve; struct btree_update *as; - if (unlikely(!percpu_ref_tryget(&c->writes))) - return ERR_PTR(-EROFS); - reserve = bch2_btree_reserve_get(c, nr_nodes, flags, cl); - if (IS_ERR(reserve)) { - percpu_ref_put(&c->writes); + if (IS_ERR(reserve)) return ERR_CAST(reserve); - } as = mempool_alloc(&c->btree_interior_update_pool, GFP_NOIO); memset(as, 0, sizeof(*as)); @@ -1097,21 +1068,32 @@ static void bch2_btree_set_root_inmem(struct btree_update *as, struct btree *b) { struct bch_fs *c = as->c; struct btree *old = btree_node_root(c, b); - struct bch_fs_usage stats = { 0 }; + struct bch_fs_usage *fs_usage; __bch2_btree_set_root_inmem(c, b); - bch2_mark_key(c, bkey_i_to_s_c(&b->key), - c->opts.btree_node_size, true, - gc_pos_btree_root(b->btree_id), - &stats, 0, 0); + mutex_lock(&c->btree_interior_update_lock); + percpu_down_read(&c->mark_lock); + fs_usage = bch2_fs_usage_scratch_get(c); + + bch2_mark_key_locked(c, bkey_i_to_s_c(&b->key), + 0, fs_usage, 0, + BCH_BUCKET_MARK_INSERT); + if (gc_visited(c, gc_pos_btree_root(b->btree_id))) + bch2_mark_key_locked(c, bkey_i_to_s_c(&b->key), + 0, NULL, 0, + BCH_BUCKET_MARK_INSERT| + BCH_BUCKET_MARK_GC); if (old && 
!btree_node_fake(old)) bch2_btree_node_free_index(as, NULL, bkey_i_to_s_c(&old->key), - &stats); - bch2_fs_usage_apply(c, &stats, &as->reserve->disk_res, - gc_pos_btree_root(b->btree_id)); + fs_usage); + bch2_fs_usage_apply(c, fs_usage, &as->reserve->disk_res, 0); + + bch2_fs_usage_scratch_put(c, fs_usage); + percpu_up_read(&c->mark_lock); + mutex_unlock(&c->btree_interior_update_lock); } static void bch2_btree_set_root_ondisk(struct bch_fs *c, struct btree *b, int rw) @@ -1149,7 +1131,8 @@ static void bch2_btree_set_root(struct btree_update *as, struct btree *b, struct btree *old; trace_btree_set_root(c, b); - BUG_ON(!b->written); + BUG_ON(!b->written && + !test_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags)); old = btree_node_root(c, b); @@ -1181,19 +1164,28 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as, struct btree *b struct btree_node_iter *node_iter) { struct bch_fs *c = as->c; - struct bch_fs_usage stats = { 0 }; + struct bch_fs_usage *fs_usage; struct bkey_packed *k; struct bkey tmp; BUG_ON(insert->k.u64s > bch_btree_keys_u64s_remaining(c, b)); - if (bkey_extent_is_data(&insert->k)) - bch2_mark_key(c, bkey_i_to_s_c(insert), - c->opts.btree_node_size, true, - gc_pos_btree_node(b), &stats, 0, 0); + mutex_lock(&c->btree_interior_update_lock); + percpu_down_read(&c->mark_lock); + fs_usage = bch2_fs_usage_scratch_get(c); + + bch2_mark_key_locked(c, bkey_i_to_s_c(insert), + 0, fs_usage, 0, + BCH_BUCKET_MARK_INSERT); + + if (gc_visited(c, gc_pos_btree_node(b))) + bch2_mark_key_locked(c, bkey_i_to_s_c(insert), + 0, NULL, 0, + BCH_BUCKET_MARK_INSERT| + BCH_BUCKET_MARK_GC); while ((k = bch2_btree_node_iter_peek_all(node_iter, b)) && - !btree_iter_pos_cmp_packed(b, &insert->k.p, k, false)) + bkey_iter_pos_cmp(b, &insert->k.p, k) > 0) bch2_btree_node_iter_advance(node_iter, b); /* @@ -1203,10 +1195,13 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as, struct btree *b if (k && !bkey_cmp_packed(b, k, &insert->k)) bch2_btree_node_free_index(as, b, bkey_disassemble(b, k, &tmp), - &stats); + fs_usage); - bch2_fs_usage_apply(c, &stats, &as->reserve->disk_res, - gc_pos_btree_node(b)); + bch2_fs_usage_apply(c, fs_usage, &as->reserve->disk_res, 0); + + bch2_fs_usage_scratch_put(c, fs_usage); + percpu_up_read(&c->mark_lock); + mutex_unlock(&c->btree_interior_update_lock); bch2_btree_bset_insert_key(iter, b, node_iter, insert); set_btree_node_dirty(b); @@ -1324,7 +1319,7 @@ static void btree_split_insert_keys(struct btree_update *as, struct btree *b, BUG_ON(btree_node_type(b) != BKEY_TYPE_BTREE); - bch2_btree_node_iter_init(&node_iter, b, k->k.p, false, false); + bch2_btree_node_iter_init(&node_iter, b, &k->k.p); while (!bch2_keylist_empty(keys)) { k = bch2_keylist_front(keys); @@ -1362,7 +1357,8 @@ static void btree_split_insert_keys(struct btree_update *as, struct btree *b, } static void btree_split(struct btree_update *as, struct btree *b, - struct btree_iter *iter, struct keylist *keys) + struct btree_iter *iter, struct keylist *keys, + unsigned flags) { struct bch_fs *c = as->c; struct btree *parent = btree_node_parent(iter, b); @@ -1425,7 +1421,7 @@ static void btree_split(struct btree_update *as, struct btree *b, if (parent) { /* Split a non root node */ - bch2_btree_insert_node(as, parent, iter, &as->parent_keys); + bch2_btree_insert_node(as, parent, iter, &as->parent_keys, flags); } else if (n3) { bch2_btree_set_root(as, n3, iter); } else { @@ -1433,32 +1429,28 @@ static void btree_split(struct btree_update *as, struct btree *b, bch2_btree_set_root(as, n1, iter); 
} - bch2_btree_open_bucket_put(c, n1); + bch2_open_buckets_put(c, &n1->ob); if (n2) - bch2_btree_open_bucket_put(c, n2); + bch2_open_buckets_put(c, &n2->ob); if (n3) - bch2_btree_open_bucket_put(c, n3); - - /* - * Note - at this point other linked iterators could still have @b read - * locked; we're depending on the bch2_btree_iter_node_replace() calls - * below removing all references to @b so we don't return with other - * iterators pointing to a node they have locked that's been freed. - * - * We have to free the node first because the bch2_iter_node_replace() - * calls will drop _our_ iterator's reference - and intent lock - to @b. - */ - bch2_btree_node_free_inmem(c, b, iter); + bch2_open_buckets_put(c, &n3->ob); /* Successful split, update the iterator to point to the new nodes: */ + six_lock_increment(&b->lock, SIX_LOCK_intent); + bch2_btree_iter_node_drop(iter, b); if (n3) bch2_btree_iter_node_replace(iter, n3); if (n2) bch2_btree_iter_node_replace(iter, n2); bch2_btree_iter_node_replace(iter, n1); - bch2_time_stats_update(&c->times[BCH_TIME_btree_split], start_time); + bch2_btree_node_free_inmem(c, b, iter); + + bch2_btree_trans_verify_locks(iter->trans); + + bch2_time_stats_update(&c->times[BCH_TIME_btree_node_split], + start_time); } static void @@ -1491,9 +1483,8 @@ bch2_btree_insert_keys_interior(struct btree_update *as, struct btree *b, btree_update_updated_node(as, b); - for_each_linked_btree_node(iter, b, linked) + trans_for_each_iter_with_node(iter->trans, b, linked) bch2_btree_node_iter_peek(&linked->l[b->level].iter, b); - bch2_btree_node_iter_peek(&iter->l[b->level].iter, b); bch2_btree_iter_verify(iter, b); } @@ -1511,7 +1502,8 @@ bch2_btree_insert_keys_interior(struct btree_update *as, struct btree *b, * for leaf nodes -- inserts into interior nodes have to be atomic. 
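
The iterator handover at the end of btree_split() above now follows a strict order: take an extra intent-lock reference on the old node with six_lock_increment(), drop the old node from the iterator, point the iterator at the replacement node(s), and only then free the old node under the reference that was kept. A stub model of that ordering — toy_ names are illustrative, with a plain reference count standing in for six-lock state:

	#include <stddef.h>

	struct toy_node { int intent_refs; };
	struct toy_iter { struct toy_node *node; };

	static void toy_lock_increment(struct toy_node *b)
	{ b->intent_refs++; }
	static void toy_iter_node_drop(struct toy_iter *iter, struct toy_node *b)
	{ if (iter->node == b) iter->node = NULL; }
	static void toy_iter_node_replace(struct toy_iter *iter, struct toy_node *n)
	{ iter->node = n; }
	static void toy_node_free_inmem(struct toy_node *b)
	{ b->intent_refs--; }

	/*
	 * Mirrors the tail of btree_split(): the extra reference keeps @old
	 * intent locked across the drop, so the final free still happens
	 * under a lock this path owns:
	 */
	static void toy_split_finish(struct toy_iter *iter,
				     struct toy_node *old, struct toy_node *new_node)
	{
		toy_lock_increment(old);
		toy_iter_node_drop(iter, old);
		toy_iter_node_replace(iter, new_node);
		toy_node_free_inmem(old);
	}
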
*/ void bch2_btree_insert_node(struct btree_update *as, struct btree *b, - struct btree_iter *iter, struct keylist *keys) + struct btree_iter *iter, struct keylist *keys, + unsigned flags) { struct bch_fs *c = as->c; int old_u64s = le16_to_cpu(btree_bset_last(b)->u64s); @@ -1551,35 +1543,48 @@ void bch2_btree_insert_node(struct btree_update *as, struct btree *b, btree_node_interior_verify(b); - bch2_foreground_maybe_merge(c, iter, b->level); + /* + * when called from the btree_split path the new nodes aren't added to + * the btree iterator yet, so the merge path's unlock/wait/relock dance + * won't work: + */ + bch2_foreground_maybe_merge(c, iter, b->level, + flags|BTREE_INSERT_NOUNLOCK); return; split: - btree_split(as, b, iter, keys); + btree_split(as, b, iter, keys, flags); } int bch2_btree_split_leaf(struct bch_fs *c, struct btree_iter *iter, - unsigned btree_reserve_flags) + unsigned flags) { + struct btree_trans *trans = iter->trans; struct btree *b = iter->l[0].b; struct btree_update *as; struct closure cl; int ret = 0; + struct btree_iter *linked; /* * We already have a disk reservation and open buckets pinned; this * allocation must not block: */ - if (iter->btree_id == BTREE_ID_EXTENTS) - btree_reserve_flags |= BTREE_INSERT_USE_RESERVE; + trans_for_each_iter(trans, linked) + if (linked->btree_id == BTREE_ID_EXTENTS) + flags |= BTREE_INSERT_USE_RESERVE; closure_init_stack(&cl); /* Hack, because gc and splitting nodes doesn't mix yet: */ - if (!down_read_trylock(&c->gc_lock)) { - bch2_btree_iter_unlock(iter); + if (!(flags & BTREE_INSERT_GC_LOCK_HELD) && + !down_read_trylock(&c->gc_lock)) { + if (flags & BTREE_INSERT_NOUNLOCK) + return -EINTR; + + bch2_trans_unlock(trans); down_read(&c->gc_lock); - if (btree_iter_linked(iter)) + if (!bch2_trans_relock(trans)) ret = -EINTR; } @@ -1587,40 +1592,47 @@ int bch2_btree_split_leaf(struct bch_fs *c, struct btree_iter *iter, * XXX: figure out how far we might need to split, * instead of locking/reserving all the way to the root: */ - if (!bch2_btree_iter_set_locks_want(iter, U8_MAX)) { + if (!bch2_btree_iter_upgrade(iter, U8_MAX)) { + trace_trans_restart_iter_upgrade(trans->ip); ret = -EINTR; goto out; } as = bch2_btree_update_start(c, iter->btree_id, - btree_update_reserve_required(c, b), - btree_reserve_flags, &cl); + btree_update_reserve_required(c, b), flags, + !(flags & BTREE_INSERT_NOUNLOCK) ? 
&cl : NULL); if (IS_ERR(as)) { ret = PTR_ERR(as); if (ret == -EAGAIN) { - bch2_btree_iter_unlock(iter); - up_read(&c->gc_lock); - closure_sync(&cl); - return -EINTR; + BUG_ON(flags & BTREE_INSERT_NOUNLOCK); + bch2_trans_unlock(trans); + ret = -EINTR; } goto out; } - btree_split(as, b, iter, NULL); + btree_split(as, b, iter, NULL, flags); bch2_btree_update_done(as); - bch2_btree_iter_set_locks_want(iter, 1); + /* + * We haven't successfully inserted yet, so don't downgrade all the way + * back to read locks; + */ + __bch2_btree_iter_downgrade(iter, 1); out: - up_read(&c->gc_lock); + if (!(flags & BTREE_INSERT_GC_LOCK_HELD)) + up_read(&c->gc_lock); closure_sync(&cl); return ret; } -int __bch2_foreground_maybe_merge(struct bch_fs *c, - struct btree_iter *iter, - unsigned level, - enum btree_node_sibling sib) +void __bch2_foreground_maybe_merge(struct bch_fs *c, + struct btree_iter *iter, + unsigned level, + unsigned flags, + enum btree_node_sibling sib) { + struct btree_trans *trans = iter->trans; struct btree_update *as; struct bkey_format_state new_s; struct bkey_format new_f; @@ -1632,29 +1644,28 @@ int __bch2_foreground_maybe_merge(struct bch_fs *c, closure_init_stack(&cl); retry: - if (!bch2_btree_node_relock(iter, level)) - return 0; + BUG_ON(!btree_node_locked(iter, level)); b = iter->l[level].b; parent = btree_node_parent(iter, b); if (!parent) - return 0; + goto out; if (b->sib_u64s[sib] > BTREE_FOREGROUND_MERGE_THRESHOLD(c)) - return 0; + goto out; /* XXX: can't be holding read locks */ m = bch2_btree_node_get_sibling(c, iter, b, sib); if (IS_ERR(m)) { ret = PTR_ERR(m); - goto out; + goto err; } /* NULL means no sibling: */ if (!m) { b->sib_u64s[sib] = U16_MAX; - return 0; + goto out; } if (sib == btree_prev_sib) { @@ -1684,35 +1695,31 @@ retry: if (b->sib_u64s[sib] > BTREE_FOREGROUND_MERGE_THRESHOLD(c)) { six_unlock_intent(&m->lock); - return 0; + goto out; } /* We're changing btree topology, doesn't mix with gc: */ - if (!down_read_trylock(&c->gc_lock)) { - six_unlock_intent(&m->lock); - bch2_btree_iter_unlock(iter); - - down_read(&c->gc_lock); - up_read(&c->gc_lock); - ret = -EINTR; - goto out; - } + if (!(flags & BTREE_INSERT_GC_LOCK_HELD) && + !down_read_trylock(&c->gc_lock)) + goto err_cycle_gc_lock; - if (!bch2_btree_iter_set_locks_want(iter, U8_MAX)) { + if (!bch2_btree_iter_upgrade(iter, U8_MAX)) { ret = -EINTR; - goto out_unlock; + goto err_unlock; } as = bch2_btree_update_start(c, iter->btree_id, - btree_update_reserve_required(c, b), - BTREE_INSERT_NOFAIL| - BTREE_INSERT_USE_RESERVE, - &cl); + btree_update_reserve_required(c, parent) + 1, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_USE_RESERVE, + !(flags & BTREE_INSERT_NOUNLOCK) ? 
&cl : NULL); if (IS_ERR(as)) { ret = PTR_ERR(as); - goto out_unlock; + goto err_unlock; } + trace_btree_merge(c, b); + bch2_btree_interior_update_will_free_node(as, b); bch2_btree_interior_update_will_free_node(as, m); @@ -1738,36 +1745,72 @@ retry: bch2_btree_node_write(c, n, SIX_LOCK_intent); - bch2_btree_insert_node(as, parent, iter, &as->parent_keys); + bch2_btree_insert_node(as, parent, iter, &as->parent_keys, flags); + + bch2_open_buckets_put(c, &n->ob); + + six_lock_increment(&b->lock, SIX_LOCK_intent); + bch2_btree_iter_node_drop(iter, b); + bch2_btree_iter_node_drop(iter, m); - bch2_btree_open_bucket_put(c, n); - bch2_btree_node_free_inmem(c, b, iter); - bch2_btree_node_free_inmem(c, m, iter); bch2_btree_iter_node_replace(iter, n); bch2_btree_iter_verify(iter, n); + bch2_btree_node_free_inmem(c, b, iter); + bch2_btree_node_free_inmem(c, m, iter); + bch2_btree_update_done(as); -out_unlock: - if (ret != -EINTR && ret != -EAGAIN) - bch2_btree_iter_set_locks_want(iter, 1); - six_unlock_intent(&m->lock); - up_read(&c->gc_lock); + + if (!(flags & BTREE_INSERT_GC_LOCK_HELD)) + up_read(&c->gc_lock); out: - if (ret == -EAGAIN || ret == -EINTR) { - bch2_btree_iter_unlock(iter); - ret = -EINTR; - } + bch2_btree_trans_verify_locks(trans); + /* + * Don't downgrade locks here: we're called after successful insert, + * and the caller will downgrade locks after a successful insert + * anyways (in case e.g. a split was required first) + * + * And we're also called when inserting into interior nodes in the + * split path, and downgrading to read locks in there is potentially + * confusing: + */ closure_sync(&cl); + return; - if (ret == -EINTR) { +err_cycle_gc_lock: + six_unlock_intent(&m->lock); + + if (flags & BTREE_INSERT_NOUNLOCK) + goto out; + + bch2_trans_unlock(trans); + + down_read(&c->gc_lock); + up_read(&c->gc_lock); + ret = -EINTR; + goto err; + +err_unlock: + six_unlock_intent(&m->lock); + if (!(flags & BTREE_INSERT_GC_LOCK_HELD)) + up_read(&c->gc_lock); +err: + BUG_ON(ret == -EAGAIN && (flags & BTREE_INSERT_NOUNLOCK)); + + if ((ret == -EAGAIN || ret == -EINTR) && + !(flags & BTREE_INSERT_NOUNLOCK)) { + bch2_trans_unlock(trans); + closure_sync(&cl); ret = bch2_btree_iter_traverse(iter); - if (!ret) - goto retry; + if (ret) + goto out; + + goto retry; } - return ret; + goto out; } static int __btree_node_rewrite(struct bch_fs *c, struct btree_iter *iter, @@ -1778,8 +1821,10 @@ static int __btree_node_rewrite(struct bch_fs *c, struct btree_iter *iter, struct btree_update *as; as = bch2_btree_update_start(c, iter->btree_id, - btree_update_reserve_required(c, b), - flags, cl); + (parent + ? 
btree_update_reserve_required(c, parent) + : 0) + 1, + flags, cl); if (IS_ERR(as)) { trace_btree_gc_rewrite_node_fail(c, b); return PTR_ERR(as); @@ -1798,17 +1843,18 @@ static int __btree_node_rewrite(struct bch_fs *c, struct btree_iter *iter, if (parent) { bch2_keylist_add(&as->parent_keys, &n->key); - bch2_btree_insert_node(as, parent, iter, &as->parent_keys); + bch2_btree_insert_node(as, parent, iter, &as->parent_keys, flags); } else { bch2_btree_set_root(as, n, iter); } - bch2_btree_open_bucket_put(c, n); + bch2_open_buckets_put(c, &n->ob); + six_lock_increment(&b->lock, SIX_LOCK_intent); + bch2_btree_iter_node_drop(iter, b); + bch2_btree_iter_node_replace(iter, n); bch2_btree_node_free_inmem(c, b, iter); - BUG_ON(!bch2_btree_iter_node_replace(iter, n)); - bch2_btree_update_done(as); return 0; } @@ -1822,7 +1868,7 @@ static int __btree_node_rewrite(struct bch_fs *c, struct btree_iter *iter, int bch2_btree_node_rewrite(struct bch_fs *c, struct btree_iter *iter, __le64 seq, unsigned flags) { - unsigned locks_want = iter->locks_want; + struct btree_trans *trans = iter->trans; struct closure cl; struct btree *b; int ret; @@ -1831,11 +1877,11 @@ int bch2_btree_node_rewrite(struct bch_fs *c, struct btree_iter *iter, closure_init_stack(&cl); - bch2_btree_iter_set_locks_want(iter, U8_MAX); + bch2_btree_iter_upgrade(iter, U8_MAX); if (!(flags & BTREE_INSERT_GC_LOCK_HELD)) { if (!down_read_trylock(&c->gc_lock)) { - bch2_btree_iter_unlock(iter); + bch2_trans_unlock(trans); down_read(&c->gc_lock); } } @@ -1854,11 +1900,11 @@ int bch2_btree_node_rewrite(struct bch_fs *c, struct btree_iter *iter, ret != -EINTR) break; - bch2_btree_iter_unlock(iter); + bch2_trans_unlock(trans); closure_sync(&cl); } - bch2_btree_iter_set_locks_want(iter, locks_want); + bch2_btree_iter_downgrade(iter); if (!(flags & BTREE_INSERT_GC_LOCK_HELD)) up_read(&c->gc_lock); @@ -1871,7 +1917,7 @@ static void __bch2_btree_node_update_key(struct bch_fs *c, struct btree_update *as, struct btree_iter *iter, struct btree *b, struct btree *new_hash, - struct bkey_i_extent *new_key) + struct bkey_i_btree_ptr *new_key) { struct btree *parent; int ret; @@ -1902,6 +1948,24 @@ static void __bch2_btree_node_update_key(struct bch_fs *c, btree_interior_update_add_node_reference(as, b); + /* + * XXX: the rest of the update path treats this like we're actually + * inserting a new node and deleting the existing node, so the + * reservation needs to include enough space for @b + * + * that is actually sketch as fuck though and I am surprised the code + * seems to work like that, definitely need to go back and rework it + * into something saner. 
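
The reservation sizing this comment worries about is governed by btree_update_reserve_required(), reworked in the btree_update_interior.h hunk at the end of this section: a split can cascade upward, producing up to two new nodes per affected level, plus a brand-new root unless the tree is already at maximum depth. A pure-function sketch of that worst-case arithmetic (argument names are illustrative; max_depth plays the role of BTREE_MAX_DEPTH):

	/* depth = root->level + 1, as in the real helper; node_level is the
	 * level of the node being updated: */
	static unsigned toy_reserve_required(unsigned root_level, unsigned node_level,
					     unsigned max_depth)
	{
		unsigned depth = root_level + 1;

		return depth < max_depth
			? (depth - node_level) * 2 + 1	/* cascading splits + new root */
			: (depth - node_level) * 2 - 1;	/* root can't grow any higher */
	}
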
+ * + * (I think @b is just getting double counted until the btree update + * finishes and "deletes" @b on disk) + */ + ret = bch2_disk_reservation_add(c, &as->reserve->disk_res, + c->opts.btree_node_size * + bch2_bkey_nr_ptrs(bkey_i_to_s_c(&new_key->k_i)), + BCH_DISK_RESERVATION_NOFAIL); + BUG_ON(ret); + parent = btree_node_parent(iter, b); if (parent) { if (new_hash) { @@ -1912,7 +1976,7 @@ static void __bch2_btree_node_update_key(struct bch_fs *c, } bch2_keylist_add(&as->parent_keys, &new_key->k_i); - bch2_btree_insert_node(as, parent, iter, &as->parent_keys); + bch2_btree_insert_node(as, parent, iter, &as->parent_keys, 0); if (new_hash) { mutex_lock(&c->btree_cache.lock); @@ -1928,21 +1992,33 @@ static void __bch2_btree_node_update_key(struct bch_fs *c, bkey_copy(&b->key, &new_key->k_i); } } else { - struct bch_fs_usage stats = { 0 }; + struct bch_fs_usage *fs_usage; BUG_ON(btree_node_root(c, b) != b); bch2_btree_node_lock_write(b, iter); - bch2_mark_key(c, bkey_i_to_s_c(&new_key->k_i), - c->opts.btree_node_size, true, - gc_pos_btree_root(b->btree_id), - &stats, 0, 0); + mutex_lock(&c->btree_interior_update_lock); + percpu_down_read(&c->mark_lock); + fs_usage = bch2_fs_usage_scratch_get(c); + + bch2_mark_key_locked(c, bkey_i_to_s_c(&new_key->k_i), + 0, fs_usage, 0, + BCH_BUCKET_MARK_INSERT); + if (gc_visited(c, gc_pos_btree_root(b->btree_id))) + bch2_mark_key_locked(c, bkey_i_to_s_c(&new_key->k_i), + 0, NULL, 0, + BCH_BUCKET_MARK_INSERT|| + BCH_BUCKET_MARK_GC); + bch2_btree_node_free_index(as, NULL, bkey_i_to_s_c(&b->key), - &stats); - bch2_fs_usage_apply(c, &stats, &as->reserve->disk_res, - gc_pos_btree_root(b->btree_id)); + fs_usage); + bch2_fs_usage_apply(c, fs_usage, &as->reserve->disk_res, 0); + + bch2_fs_usage_scratch_put(c, fs_usage); + percpu_up_read(&c->mark_lock); + mutex_unlock(&c->btree_interior_update_lock); if (PTR_HASH(&new_key->k_i) != PTR_HASH(&b->key)) { mutex_lock(&c->btree_cache.lock); @@ -1964,8 +2040,10 @@ static void __bch2_btree_node_update_key(struct bch_fs *c, } int bch2_btree_node_update_key(struct bch_fs *c, struct btree_iter *iter, - struct btree *b, struct bkey_i_extent *new_key) + struct btree *b, + struct bkey_i_btree_ptr *new_key) { + struct btree *parent = btree_node_parent(iter, b); struct btree_update *as = NULL; struct btree *new_hash = NULL; struct closure cl; @@ -1973,11 +2051,14 @@ int bch2_btree_node_update_key(struct bch_fs *c, struct btree_iter *iter, closure_init_stack(&cl); + if (!bch2_btree_iter_upgrade(iter, U8_MAX)) + return -EINTR; + if (!down_read_trylock(&c->gc_lock)) { - bch2_btree_iter_unlock(iter); + bch2_trans_unlock(iter->trans); down_read(&c->gc_lock); - if (!bch2_btree_iter_relock(iter)) { + if (!bch2_trans_relock(iter->trans)) { ret = -EINTR; goto err; } @@ -1988,26 +2069,27 @@ int bch2_btree_node_update_key(struct bch_fs *c, struct btree_iter *iter, /* bch2_btree_reserve_get will unlock */ ret = bch2_btree_cache_cannibalize_lock(c, &cl); if (ret) { - ret = -EINTR; - - bch2_btree_iter_unlock(iter); + bch2_trans_unlock(iter->trans); up_read(&c->gc_lock); closure_sync(&cl); down_read(&c->gc_lock); - if (!bch2_btree_iter_relock(iter)) + if (!bch2_trans_relock(iter->trans)) { + ret = -EINTR; goto err; + } } new_hash = bch2_btree_node_mem_alloc(c); } as = bch2_btree_update_start(c, iter->btree_id, - btree_update_reserve_required(c, b), - BTREE_INSERT_NOFAIL| - BTREE_INSERT_USE_RESERVE| - BTREE_INSERT_USE_ALLOC_RESERVE, - &cl); + parent ? 
btree_update_reserve_required(c, parent) : 0, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_USE_RESERVE| + BTREE_INSERT_USE_ALLOC_RESERVE, + &cl); + if (IS_ERR(as)) { ret = PTR_ERR(as); if (ret == -EAGAIN) @@ -2016,21 +2098,22 @@ int bch2_btree_node_update_key(struct bch_fs *c, struct btree_iter *iter, if (ret != -EINTR) goto err; - bch2_btree_iter_unlock(iter); + bch2_trans_unlock(iter->trans); up_read(&c->gc_lock); closure_sync(&cl); down_read(&c->gc_lock); - if (!bch2_btree_iter_relock(iter)) + if (!bch2_trans_relock(iter->trans)) goto err; } - ret = bch2_mark_bkey_replicas(c, BCH_DATA_BTREE, - extent_i_to_s_c(new_key).s_c); + ret = bch2_mark_bkey_replicas(c, bkey_i_to_s_c(&new_key->k_i)); if (ret) goto err_free_update; __bch2_btree_node_update_key(c, as, iter, b, new_hash, new_key); + + bch2_btree_iter_downgrade(iter); err: if (new_hash) { mutex_lock(&c->btree_cache.lock); @@ -2059,7 +2142,6 @@ void bch2_btree_set_root_for_read(struct bch_fs *c, struct btree *b) BUG_ON(btree_node_root(c, b)); __bch2_btree_set_root_inmem(c, b); - bch2_btree_set_root_ondisk(c, b, READ); } void bch2_btree_root_alloc(struct bch_fs *c, enum btree_id id) @@ -2082,9 +2164,9 @@ void bch2_btree_root_alloc(struct bch_fs *c, enum btree_id id) b->level = 0; b->btree_id = id; - bkey_extent_init(&b->key); + bkey_btree_ptr_init(&b->key); b->key.k.p = POS_MAX; - bkey_i_to_extent(&b->key)->v._data[0] = U64_MAX - id; + PTR_HASH(&b->key) = U64_MAX - id; bch2_bset_init_first(b, &b->data->keys); bch2_btree_build_aux_trees(b); @@ -2105,20 +2187,20 @@ void bch2_btree_root_alloc(struct bch_fs *c, enum btree_id id) ssize_t bch2_btree_updates_print(struct bch_fs *c, char *buf) { - char *out = buf, *end = buf + PAGE_SIZE; + struct printbuf out = _PBUF(buf, PAGE_SIZE); struct btree_update *as; mutex_lock(&c->btree_interior_update_lock); list_for_each_entry(as, &c->btree_interior_update_list, list) - out += scnprintf(out, end - out, "%p m %u w %u r %u j %llu\n", - as, - as->mode, - as->nodes_written, - atomic_read(&as->cl.remaining) & CLOSURE_REMAINING_MASK, - bch2_journal_pin_seq(&c->journal, &as->journal)); + pr_buf(&out, "%p m %u w %u r %u j %llu\n", + as, + as->mode, + as->nodes_written, + atomic_read(&as->cl.remaining) & CLOSURE_REMAINING_MASK, + as->journal.seq); mutex_unlock(&c->btree_interior_update_lock); - return out - buf; + return out.pos - buf; } size_t bch2_btree_interior_updates_nr_pending(struct bch_fs *c) diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h index 25bfc7ab9ee0..e5156e908110 100644 --- a/fs/bcachefs/btree_update_interior.h +++ b/fs/bcachefs/btree_update_interior.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _BCACHEFS_BTREE_UPDATE_INTERIOR_H #define _BCACHEFS_BTREE_UPDATE_INTERIOR_H @@ -131,7 +132,6 @@ struct btree_update { void bch2_btree_node_free_inmem(struct bch_fs *, struct btree *, struct btree_iter *); void bch2_btree_node_free_never_inserted(struct bch_fs *, struct btree *); -void bch2_btree_open_bucket_put(struct bch_fs *, struct btree *); struct btree *__bch2_btree_node_alloc_replacement(struct btree_update *, struct btree *, @@ -146,35 +146,42 @@ void bch2_btree_interior_update_will_free_node(struct btree_update *, struct btree *); void bch2_btree_insert_node(struct btree_update *, struct btree *, - struct btree_iter *, struct keylist *); + struct btree_iter *, struct keylist *, + unsigned); int bch2_btree_split_leaf(struct bch_fs *, struct btree_iter *, unsigned); -int __bch2_foreground_maybe_merge(struct bch_fs *, struct btree_iter *, - 
unsigned, enum btree_node_sibling); +void __bch2_foreground_maybe_merge(struct bch_fs *, struct btree_iter *, + unsigned, unsigned, enum btree_node_sibling); -static inline int bch2_foreground_maybe_merge_sibling(struct bch_fs *c, +static inline void bch2_foreground_maybe_merge_sibling(struct bch_fs *c, struct btree_iter *iter, - unsigned level, + unsigned level, unsigned flags, enum btree_node_sibling sib) { struct btree *b; + if (iter->uptodate >= BTREE_ITER_NEED_TRAVERSE) + return; + if (!bch2_btree_node_relock(iter, level)) - return 0; + return; b = iter->l[level].b; if (b->sib_u64s[sib] > c->btree_foreground_merge_threshold) - return 0; + return; - return __bch2_foreground_maybe_merge(c, iter, level, sib); + __bch2_foreground_maybe_merge(c, iter, level, flags, sib); } static inline void bch2_foreground_maybe_merge(struct bch_fs *c, struct btree_iter *iter, - unsigned level) + unsigned level, + unsigned flags) { - bch2_foreground_maybe_merge_sibling(c, iter, level, btree_prev_sib); - bch2_foreground_maybe_merge_sibling(c, iter, level, btree_next_sib); + bch2_foreground_maybe_merge_sibling(c, iter, level, flags, + btree_prev_sib); + bch2_foreground_maybe_merge_sibling(c, iter, level, flags, + btree_next_sib); } void bch2_btree_set_root_for_read(struct bch_fs *, struct btree *); @@ -183,9 +190,17 @@ void bch2_btree_root_alloc(struct bch_fs *, enum btree_id); static inline unsigned btree_update_reserve_required(struct bch_fs *c, struct btree *b) { - unsigned depth = btree_node_root(c, b)->level - b->level; + unsigned depth = btree_node_root(c, b)->level + 1; - return btree_reserve_required_nodes(depth); + /* + * Number of nodes we might have to allocate in a worst case btree + * split operation - we split all the way up to the root, then allocate + * a new root, unless we're already at max depth: + */ + if (depth < BTREE_MAX_DEPTH) + return (depth - b->level) * 2 + 1; + else + return (depth - b->level) * 2 - 1; } static inline void btree_node_reset_sib_u64s(struct btree *b) @@ -216,14 +231,19 @@ static inline void *write_block(struct btree *b) return (void *) b->data + (b->written << 9); } +static inline bool __btree_addr_written(struct btree *b, void *p) +{ + return p < write_block(b); +} + static inline bool bset_written(struct btree *b, struct bset *i) { - return (void *) i < write_block(b); + return __btree_addr_written(b, i); } -static inline bool bset_unwritten(struct btree *b, struct bset *i) +static inline bool bkey_written(struct btree *b, struct bkey_packed *k) { - return (void *) i > write_block(b); + return __btree_addr_written(b, k); } static inline ssize_t __bch_btree_u64s_remaining(struct bch_fs *c, @@ -282,10 +302,9 @@ static inline struct btree_node_entry *want_new_bset(struct bch_fs *c, return NULL; } -static inline void unreserve_whiteout(struct btree *b, struct bset_tree *t, - struct bkey_packed *k) +static inline void unreserve_whiteout(struct btree *b, struct bkey_packed *k) { - if (bset_written(b, bset(b, t))) { + if (bkey_written(b, k)) { EBUG_ON(b->uncompacted_whiteout_u64s < bkeyp_key_u64s(&b->format, k)); b->uncompacted_whiteout_u64s -= @@ -293,10 +312,9 @@ static inline void unreserve_whiteout(struct btree *b, struct bset_tree *t, } } -static inline void reserve_whiteout(struct btree *b, struct bset_tree *t, - struct bkey_packed *k) +static inline void reserve_whiteout(struct btree *b, struct bkey_packed *k) { - if (bset_written(b, bset(b, t))) { + if (bkey_written(b, k)) { BUG_ON(!k->needs_whiteout); b->uncompacted_whiteout_u64s += bkeyp_key_u64s(&b->format, 
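/*
 * Worked example for the new btree_update_reserve_required() above,
 * which sizes the reservation for a worst-case cascading split: two
 * replacement nodes per level from b->level up to the root, plus one
 * more if the tree grows a new root. The numbers below are
 * illustrative values, not constants from the source:
 */
#include <assert.h>

static unsigned reserve_required(unsigned depth, unsigned level, unsigned max)
{
	return depth < max ? (depth - level) * 2 + 1
			   : (depth - level) * 2 - 1;
}

int main(void)
{
	/* root at level 3 (depth 4), leaf update, room to grow: 9 nodes */
	assert(reserve_required(4, 0, 8) == 9);
	/* same tree already at maximum depth: no new root, 7 nodes */
	assert(reserve_required(4, 0, 4) == 7);
	return 0;
}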
k); @@ -308,40 +326,14 @@ static inline void reserve_whiteout(struct btree *b, struct bset_tree *t, * insert into could be written out from under us) */ static inline bool bch2_btree_node_insert_fits(struct bch_fs *c, - struct btree *b, unsigned u64s) + struct btree *b, unsigned u64s) { if (unlikely(btree_node_fake(b))) return false; - if (btree_node_is_extents(b)) { - /* The insert key might split an existing key - * (bch2_insert_fixup_extent() -> BCH_EXTENT_OVERLAP_MIDDLE case: - */ - u64s += BKEY_EXTENT_U64s_MAX; - } - return u64s <= bch_btree_keys_u64s_remaining(c, b); } -static inline bool journal_res_insert_fits(struct btree_insert *trans, - struct btree_insert_entry *insert) -{ - unsigned u64s = 0; - struct btree_insert_entry *i; - - /* - * If we didn't get a journal reservation, we're in journal replay and - * we're not journalling updates: - */ - if (!trans->journal_res.ref) - return true; - - for (i = insert; i < trans->entries + trans->nr; i++) - u64s += jset_u64s(i->k->k.u64s + i->extra_res); - - return u64s <= trans->journal_res.u64s; -} - ssize_t bch2_btree_updates_print(struct bch_fs *, char *); size_t bch2_btree_interior_updates_nr_pending(struct bch_fs *); diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index cc41140fbe3a..4f12108bd6fe 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -1,19 +1,64 @@ +// SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" #include "btree_update.h" #include "btree_update_interior.h" +#include "btree_gc.h" #include "btree_io.h" #include "btree_iter.h" #include "btree_locking.h" +#include "buckets.h" #include "debug.h" +#include "error.h" #include "extents.h" #include "journal.h" #include "journal_reclaim.h" #include "keylist.h" +#include "replicas.h" #include <linux/sort.h> #include <trace/events/bcachefs.h> +inline void bch2_btree_node_lock_for_insert(struct bch_fs *c, struct btree *b, + struct btree_iter *iter) +{ + bch2_btree_node_lock_write(b, iter); + + if (btree_node_just_written(b) && + bch2_btree_post_write_cleanup(c, b)) + bch2_btree_iter_reinit_node(iter, b); + + /* + * If the last bset has been written, or if it's gotten too big - start + * a new bset to insert into: + */ + if (want_new_bset(c, b)) + bch2_btree_init_next(c, b, iter); +} + +static void btree_trans_lock_write(struct bch_fs *c, struct btree_trans *trans) +{ + struct btree_insert_entry *i; + + trans_for_each_update_leaf(trans, i) + bch2_btree_node_lock_for_insert(c, i->iter->l[0].b, i->iter); +} + +static void btree_trans_unlock_write(struct btree_trans *trans) +{ + struct btree_insert_entry *i; + + trans_for_each_update_leaf(trans, i) + bch2_btree_node_unlock_write(i->iter->l[0].b, i->iter); +} + +static inline int btree_trans_cmp(struct btree_insert_entry l, + struct btree_insert_entry r) +{ + return cmp_int(l.deferred, r.deferred) ?: + btree_iter_cmp(l.iter, r.iter); +} + /* Inserting into a given leaf node (last stage of insert): */ /* Handle overwrites and do insert, for non extents: */ @@ -24,7 +69,6 @@ bool bch2_btree_bset_insert_key(struct btree_iter *iter, { const struct bkey_format *f = &b->format; struct bkey_packed *k; - struct bset_tree *t; unsigned clobber_u64s; EBUG_ON(btree_node_just_written(b)); @@ -37,9 +81,7 @@ bool bch2_btree_bset_insert_key(struct btree_iter *iter, if (k && !bkey_cmp_packed(b, k, &insert->k)) { BUG_ON(bkey_whiteout(k)); - t = bch2_bkey_to_bset(b, k); - - if (bset_unwritten(b, bset(b, t)) && + if (!bkey_written(b, k) && bkey_val_u64s(&insert->k) == 
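/*
 * Sketch of the __btree_addr_written() helper introduced above:
 * b->written counts 512-byte sectors already written out, so an
 * address inside the node is "written" iff it falls below
 * data + (written << 9). Both bset_written() and the new
 * bkey_written() reduce to this one watermark test.
 */
#include <stdbool.h>
#include <stddef.h>

struct node {
	void		*data;
	unsigned	written;	/* sectors written out so far */
};

static bool addr_written(const struct node *b, const void *p)
{
	return (const char *) p <
	       (const char *) b->data + ((size_t) b->written << 9);
}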
bkeyp_val_u64s(f, k) && !bkey_whiteout(&insert->k)) { k->type = insert->k.type; @@ -50,9 +92,9 @@ bool bch2_btree_bset_insert_key(struct btree_iter *iter, insert->k.needs_whiteout = k->needs_whiteout; - btree_keys_account_key_drop(&b->nr, t - b->set, k); + btree_account_key_drop(b, k); - if (t == bset_tree_last(b)) { + if (k >= btree_bset_last(b)->start) { clobber_u64s = k->u64s; /* @@ -62,20 +104,22 @@ bool bch2_btree_bset_insert_key(struct btree_iter *iter, */ if (bkey_whiteout(&insert->k) && !k->needs_whiteout) { bch2_bset_delete(b, k, clobber_u64s); - bch2_btree_node_iter_fix(iter, b, node_iter, t, - k, clobber_u64s, 0); + bch2_btree_node_iter_fix(iter, b, node_iter, + k, clobber_u64s, 0); + bch2_btree_iter_verify(iter, b); return true; } goto overwrite; } - k->type = KEY_TYPE_DELETED; - bch2_btree_node_iter_fix(iter, b, node_iter, t, k, - k->u64s, k->u64s); + k->type = KEY_TYPE_deleted; + bch2_btree_node_iter_fix(iter, b, node_iter, k, + k->u64s, k->u64s); + bch2_btree_iter_verify(iter, b); if (bkey_whiteout(&insert->k)) { - reserve_whiteout(b, t, k); + reserve_whiteout(b, k); return true; } else { k->needs_whiteout = false; @@ -90,14 +134,14 @@ bool bch2_btree_bset_insert_key(struct btree_iter *iter, insert->k.needs_whiteout = false; } - t = bset_tree_last(b); - k = bch2_btree_node_iter_bset_pos(node_iter, b, t); + k = bch2_btree_node_iter_bset_pos(node_iter, b, bset_tree_last(b)); clobber_u64s = 0; overwrite: bch2_bset_insert(b, node_iter, k, insert, clobber_u64s); if (k->u64s != clobber_u64s || bkey_whiteout(&insert->k)) - bch2_btree_node_iter_fix(iter, b, node_iter, t, k, - clobber_u64s, k->u64s); + bch2_btree_node_iter_fix(iter, b, node_iter, k, + clobber_u64s, k->u64s); + bch2_btree_iter_verify(iter, b); return true; } @@ -110,8 +154,7 @@ static void __btree_node_flush(struct journal *j, struct journal_entry_pin *pin, btree_node_lock_type(c, b, SIX_LOCK_read); bch2_btree_node_write_cond(c, b, - (btree_current_write(b) == w && - w->journal.pin_list == journal_seq_pin(j, seq))); + (btree_current_write(b) == w && w->journal.seq == seq)); six_unlock_read(&b->lock); } @@ -125,7 +168,28 @@ static void btree_node_flush1(struct journal *j, struct journal_entry_pin *pin, return __btree_node_flush(j, pin, 1, seq); } -void bch2_btree_journal_key(struct btree_insert *trans, +static inline void __btree_journal_key(struct btree_trans *trans, + enum btree_id btree_id, + struct bkey_i *insert) +{ + struct journal *j = &trans->c->journal; + u64 seq = trans->journal_res.seq; + bool needs_whiteout = insert->k.needs_whiteout; + + /* ick */ + insert->k.needs_whiteout = false; + bch2_journal_add_keys(j, &trans->journal_res, + btree_id, insert); + insert->k.needs_whiteout = needs_whiteout; + + bch2_journal_set_has_inode(j, &trans->journal_res, + insert->k.p.inode); + + if (trans->journal_seq) + *trans->journal_seq = seq; +} + +void bch2_btree_journal_key(struct btree_trans *trans, struct btree_iter *iter, struct bkey_i *insert) { @@ -139,21 +203,9 @@ void bch2_btree_journal_key(struct btree_insert *trans, !(trans->flags & BTREE_INSERT_JOURNAL_REPLAY)); if (likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))) { - u64 seq = trans->journal_res.seq; - bool needs_whiteout = insert->k.needs_whiteout; - - /* ick */ - insert->k.needs_whiteout = false; - bch2_journal_add_keys(j, &trans->journal_res, - iter->btree_id, insert); - insert->k.needs_whiteout = needs_whiteout; - - bch2_journal_set_has_inode(j, &trans->journal_res, - insert->k.p.inode); - - if (trans->journal_seq) - *trans->journal_seq = seq; - 
btree_bset_last(b)->journal_seq = cpu_to_le64(seq); + __btree_journal_key(trans, iter->btree_id, insert); + btree_bset_last(b)->journal_seq = + cpu_to_le64(trans->journal_res.seq); } if (unlikely(!journal_pin_active(&w->journal))) { @@ -171,9 +223,8 @@ void bch2_btree_journal_key(struct btree_insert *trans, set_btree_node_dirty(b); } -static enum btree_insert_ret -bch2_insert_fixup_key(struct btree_insert *trans, - struct btree_insert_entry *insert) +static void bch2_insert_fixup_key(struct btree_trans *trans, + struct btree_insert_entry *insert) { struct btree_iter *iter = insert->iter; struct btree_iter_level *l = &iter->l[0]; @@ -185,31 +236,25 @@ bch2_insert_fixup_key(struct btree_insert *trans, if (bch2_btree_bset_insert_key(iter, l->b, &l->iter, insert->k)) bch2_btree_journal_key(trans, iter, insert->k); - - trans->did_work = true; - return BTREE_INSERT_OK; } /** * btree_insert_key - insert a key one key into a leaf node */ -static enum btree_insert_ret -btree_insert_key_leaf(struct btree_insert *trans, - struct btree_insert_entry *insert) +static void btree_insert_key_leaf(struct btree_trans *trans, + struct btree_insert_entry *insert) { struct bch_fs *c = trans->c; struct btree_iter *iter = insert->iter; struct btree *b = iter->l[0].b; - enum btree_insert_ret ret; int old_u64s = le16_to_cpu(btree_bset_last(b)->u64s); int old_live_u64s = b->nr.live_u64s; int live_u64s_added, u64s_added; - btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK); - - ret = !btree_node_is_extents(b) - ? bch2_insert_fixup_key(trans, insert) - : bch2_insert_fixup_extent(trans, insert); + if (!btree_node_is_extents(b)) + bch2_insert_fixup_key(trans, insert); + else + bch2_insert_fixup_extent(trans, insert); live_u64s_added = (int) b->nr.live_u64s - old_live_u64s; u64s_added = (int) le16_to_cpu(btree_bset_last(b)->u64s) - old_u64s; @@ -224,314 +269,705 @@ btree_insert_key_leaf(struct btree_insert *trans, bch2_btree_iter_reinit_node(iter, b); trace_btree_insert_key(c, b, insert->k); - return ret; } -static bool same_leaf_as_prev(struct btree_insert *trans, - struct btree_insert_entry *i) +/* Deferred btree updates: */ + +static void deferred_update_flush(struct journal *j, + struct journal_entry_pin *pin, + u64 seq) { - /* - * Because we sorted the transaction entries, if multiple iterators - * point to the same leaf node they'll always be adjacent now: - */ - return i != trans->entries && - i[0].iter->l[0].b == i[-1].iter->l[0].b; -} + struct bch_fs *c = container_of(j, struct bch_fs, journal); + struct deferred_update *d = + container_of(pin, struct deferred_update, journal); + struct journal_preres res = { 0 }; + u64 tmp[32]; + struct bkey_i *k = (void *) tmp; + int ret; -#define trans_for_each_entry(trans, i) \ - for ((i) = (trans)->entries; (i) < (trans)->entries + (trans)->nr; (i)++) + if (d->allocated_u64s > ARRAY_SIZE(tmp)) { + k = kmalloc(d->allocated_u64s * sizeof(u64), GFP_NOFS); -inline void bch2_btree_node_lock_for_insert(struct bch_fs *c, struct btree *b, - struct btree_iter *iter) -{ - bch2_btree_node_lock_write(b, iter); + BUG_ON(!k); /* XXX */ + } - if (btree_node_just_written(b) && - bch2_btree_post_write_cleanup(c, b)) - bch2_btree_iter_reinit_node(iter, b); + spin_lock(&d->lock); + if (d->dirty) { + BUG_ON(jset_u64s(d->k.k.u64s) > d->res.u64s); - /* - * If the last bset has been written, or if it's gotten too big - start - * a new bset to insert into: - */ - if (want_new_bset(c, b)) - bch2_btree_init_next(c, b, iter); + swap(res, d->res); + + BUG_ON(d->k.k.u64s > d->allocated_u64s); + + 
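/*
 * The "ick" in __btree_journal_key() above is a save/restore of an
 * in-memory-only flag around serialization: needs_whiteout must not be
 * written into the journal, but the caller still needs it afterwards.
 * Generic shape of the pattern; serialize_key() is a hypothetical
 * stand-in for bch2_journal_add_keys():
 */
#include <stdbool.h>

struct key { bool needs_whiteout; /* ... packed key fields ... */ };

static void serialize_key(const struct key *k)
{
	(void) k;	/* would append *k to the journal buffer */
}

static void journal_add_key(struct key *k)
{
	bool saved = k->needs_whiteout;

	k->needs_whiteout = false;	/* scrub in-memory state */
	serialize_key(k);
	k->needs_whiteout = saved;	/* restore for the in-memory copy */
}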
bkey_copy(k, &d->k); + d->dirty = false; + spin_unlock(&d->lock); + + ret = bch2_btree_insert(c, d->btree_id, k, NULL, NULL, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_USE_RESERVE| + BTREE_INSERT_JOURNAL_RESERVED); + bch2_fs_fatal_err_on(ret && !bch2_journal_error(j), + c, "error flushing deferred btree update: %i", ret); + + spin_lock(&d->lock); + } + + if (!d->dirty) + bch2_journal_pin_drop(j, &d->journal); + spin_unlock(&d->lock); + + bch2_journal_preres_put(j, &res); + if (k != (void *) tmp) + kfree(k); } -static void multi_lock_write(struct bch_fs *c, struct btree_insert *trans) +static void btree_insert_key_deferred(struct btree_trans *trans, + struct btree_insert_entry *insert) { - struct btree_insert_entry *i; + struct bch_fs *c = trans->c; + struct journal *j = &c->journal; + struct deferred_update *d = insert->d; + int difference; - trans_for_each_entry(trans, i) - if (!same_leaf_as_prev(trans, i)) - bch2_btree_node_lock_for_insert(c, i->iter->l[0].b, - i->iter); + BUG_ON(trans->flags & BTREE_INSERT_JOURNAL_REPLAY); + BUG_ON(insert->k->u64s > d->allocated_u64s); + + __btree_journal_key(trans, d->btree_id, insert->k); + + spin_lock(&d->lock); + BUG_ON(jset_u64s(insert->k->u64s) > + trans->journal_preres.u64s); + + difference = jset_u64s(insert->k->u64s) - d->res.u64s; + if (difference > 0) { + trans->journal_preres.u64s -= difference; + d->res.u64s += difference; + } + + bkey_copy(&d->k, insert->k); + d->dirty = true; + + bch2_journal_pin_update(j, trans->journal_res.seq, &d->journal, + deferred_update_flush); + spin_unlock(&d->lock); } -static void multi_unlock_write(struct btree_insert *trans) +void bch2_deferred_update_free(struct bch_fs *c, + struct deferred_update *d) { - struct btree_insert_entry *i; + deferred_update_flush(&c->journal, &d->journal, 0); - trans_for_each_entry(trans, i) - if (!same_leaf_as_prev(trans, i)) - bch2_btree_node_unlock_write(i->iter->l[0].b, i->iter); + BUG_ON(journal_pin_active(&d->journal)); + + bch2_journal_pin_flush(&c->journal, &d->journal); + kfree(d); } -static inline int btree_trans_cmp(struct btree_insert_entry l, - struct btree_insert_entry r) +struct deferred_update * +bch2_deferred_update_alloc(struct bch_fs *c, + enum btree_id btree_id, + unsigned u64s) { - return btree_iter_cmp(l.iter, r.iter); + struct deferred_update *d; + + BUG_ON(u64s > U8_MAX); + + d = kmalloc(offsetof(struct deferred_update, k) + + u64s * sizeof(u64), GFP_NOFS); + BUG_ON(!d); + + memset(d, 0, offsetof(struct deferred_update, k)); + + spin_lock_init(&d->lock); + d->allocated_u64s = u64s; + d->btree_id = btree_id; + + return d; } /* Normal update interface: */ -/** - * __bch_btree_insert_at - insert keys at given iterator positions - * - * This is main entry point for btree updates. - * - * Return values: - * -EINTR: locking changed, this function should be called again. Only returned - * if passed BTREE_INSERT_ATOMIC. - * -EROFS: filesystem read only - * -EIO: journal or btree node IO error - */ -int __bch2_btree_insert_at(struct btree_insert *trans) +static inline void btree_insert_entry_checks(struct btree_trans *trans, + struct btree_insert_entry *i) { struct bch_fs *c = trans->c; - struct btree_insert_entry *i; - struct btree_iter *split = NULL; - bool cycle_gc_lock = false; - unsigned u64s; - int ret; + enum btree_id btree_id = !i->deferred + ? 
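/*
 * deferred_update_flush() above snapshots the pending key with a
 * classic small-buffer pattern: use an on-stack array when the data
 * fits, fall back to the heap otherwise, and free only if the fallback
 * was taken (pointer compare against the stack buffer). Userspace
 * sketch:
 */
#include <stdlib.h>
#include <string.h>

static void copy_and_process(const void *src, size_t len)
{
	unsigned long long tmp[32];
	void *buf = tmp;

	if (len > sizeof(tmp)) {
		buf = malloc(len);
		if (!buf)
			return;		/* the kernel code BUG()s here (XXX) */
	}

	memcpy(buf, src, len);	/* in bcachefs: copied under d->lock */
	/* ... work on the snapshot without holding the lock ... */

	if (buf != (void *) tmp)
		free(buf);
}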
i->iter->btree_id + : i->d->btree_id; - trans_for_each_entry(trans, i) { + if (!i->deferred) { BUG_ON(i->iter->level); BUG_ON(bkey_cmp(bkey_start_pos(&i->k->k), i->iter->pos)); - BUG_ON(debug_check_bkeys(c) && - bch2_bkey_invalid(c, i->iter->btree_id, - bkey_i_to_s_c(i->k))); + EBUG_ON((i->iter->flags & BTREE_ITER_IS_EXTENTS) && + !bch2_extent_is_atomic(i->k, i->iter)); + + EBUG_ON((i->iter->flags & BTREE_ITER_IS_EXTENTS) && + !(trans->flags & BTREE_INSERT_ATOMIC)); } - bubble_sort(trans->entries, trans->nr, btree_trans_cmp); + BUG_ON(debug_check_bkeys(c) && + !bkey_deleted(&i->k->k) && + bch2_bkey_invalid(c, bkey_i_to_s_c(i->k), btree_id)); +} - if (unlikely(!percpu_ref_tryget(&c->writes))) - return -EROFS; -retry_locks: - ret = -EINTR; - trans_for_each_entry(trans, i) { - if (!bch2_btree_iter_set_locks_want(i->iter, 1)) - goto err; +static int bch2_trans_journal_preres_get(struct btree_trans *trans) +{ + struct bch_fs *c = trans->c; + struct btree_insert_entry *i; + unsigned u64s = 0; + int ret; - if (i->iter->uptodate == BTREE_ITER_NEED_TRAVERSE) { - ret = bch2_btree_iter_traverse(i->iter); - if (ret) - goto err; - } + trans_for_each_update(trans, i) + if (i->deferred) + u64s += jset_u64s(i->k->k.u64s); + + if (!u64s) + return 0; + + ret = bch2_journal_preres_get(&c->journal, + &trans->journal_preres, u64s, + JOURNAL_RES_GET_NONBLOCK); + if (ret != -EAGAIN) + return ret; + + bch2_trans_unlock(trans); + + ret = bch2_journal_preres_get(&c->journal, + &trans->journal_preres, u64s, 0); + if (ret) + return ret; + + if (!bch2_trans_relock(trans)) { + trace_trans_restart_journal_preres_get(trans->ip); + return -EINTR; } -retry: - trans->did_work = false; - u64s = 0; - trans_for_each_entry(trans, i) - if (!i->done) - u64s += jset_u64s(i->k->k.u64s + i->extra_res); - memset(&trans->journal_res, 0, sizeof(trans->journal_res)); + return 0; +} - ret = !(trans->flags & BTREE_INSERT_JOURNAL_REPLAY) - ? bch2_journal_res_get(&c->journal, - &trans->journal_res, - u64s, u64s) - : 0; +static int bch2_trans_journal_res_get(struct btree_trans *trans, + unsigned flags) +{ + struct bch_fs *c = trans->c; + int ret; + + if (trans->flags & BTREE_INSERT_JOURNAL_RESERVED) + flags |= JOURNAL_RES_GET_RESERVED; + + ret = bch2_journal_res_get(&c->journal, &trans->journal_res, + trans->journal_u64s, flags); + + return ret == -EAGAIN ? BTREE_INSERT_NEED_JOURNAL_RES : ret; +} + +static enum btree_insert_ret +btree_key_can_insert(struct btree_trans *trans, + struct btree_insert_entry *insert, + unsigned *u64s) +{ + struct bch_fs *c = trans->c; + struct btree *b = insert->iter->l[0].b; + enum btree_insert_ret ret; + + if (unlikely(btree_node_fake(b))) + return BTREE_INSERT_BTREE_NODE_FULL; + + ret = !btree_node_is_extents(b) + ? 
BTREE_INSERT_OK + : bch2_extent_can_insert(trans, insert, u64s); if (ret) - goto err; + return ret; - multi_lock_write(c, trans); + if (*u64s > bch_btree_keys_u64s_remaining(c, b)) + return BTREE_INSERT_BTREE_NODE_FULL; - if (race_fault()) { - ret = -EINTR; - goto unlock; - } + return BTREE_INSERT_OK; +} + +static int btree_trans_check_can_insert(struct btree_trans *trans, + struct btree_insert_entry **stopped_at) +{ + struct btree_insert_entry *i; + unsigned u64s = 0; + int ret; - u64s = 0; - trans_for_each_entry(trans, i) { + trans_for_each_update_iter(trans, i) { /* Multiple inserts might go to same leaf: */ if (!same_leaf_as_prev(trans, i)) u64s = 0; - /* - * bch2_btree_node_insert_fits() must be called under write lock: - * with only an intent lock, another thread can still call - * bch2_btree_node_write(), converting an unwritten bset to a - * written one - */ - if (!i->done) { - u64s += i->k->k.u64s + i->extra_res; - if (!bch2_btree_node_insert_fits(c, - i->iter->l[0].b, u64s)) { - split = i->iter; - goto unlock; + u64s += i->k->k.u64s; + ret = btree_key_can_insert(trans, i, &u64s); + if (ret) { + *stopped_at = i; + return ret; + } + } + + return 0; +} + +static inline void do_btree_insert_one(struct btree_trans *trans, + struct btree_insert_entry *insert) +{ + if (likely(!insert->deferred)) + btree_insert_key_leaf(trans, insert); + else + btree_insert_key_deferred(trans, insert); +} + +static inline bool update_triggers_transactional(struct btree_trans *trans, + struct btree_insert_entry *i) +{ + return likely(!(trans->flags & BTREE_INSERT_MARK_INMEM)) && + (i->iter->btree_id == BTREE_ID_EXTENTS || + i->iter->btree_id == BTREE_ID_INODES); +} + +static inline bool update_has_triggers(struct btree_trans *trans, + struct btree_insert_entry *i) +{ + return likely(!(trans->flags & BTREE_INSERT_NOMARK)) && + !i->deferred && + btree_node_type_needs_gc(i->iter->btree_id); +} + +/* + * Get journal reservation, take write locks, and attempt to do btree update(s): + */ +static inline int do_btree_insert_at(struct btree_trans *trans, + struct btree_insert_entry **stopped_at) +{ + struct bch_fs *c = trans->c; + struct bch_fs_usage *fs_usage = NULL; + struct btree_insert_entry *i; + bool saw_non_marked; + unsigned mark_flags = trans->flags & BTREE_INSERT_BUCKET_INVALIDATE + ? 
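/*
 * btree_trans_check_can_insert() above exploits the sorted updates
 * array: entries landing in the same leaf are adjacent, so one running
 * sum per leaf suffices. Sketch with integer "leaf ids" standing in
 * for iter->l[0].b and a fixed capacity standing in for
 * bch_btree_keys_u64s_remaining():
 */
static int check_can_insert(const unsigned *leaf, const unsigned *size,
			    unsigned nr, unsigned capacity)
{
	unsigned i, u64s = 0;

	for (i = 0; i < nr; i++) {
		if (i && leaf[i] != leaf[i - 1])
			u64s = 0;		/* new leaf: restart the sum */

		u64s += size[i];
		if (u64s > capacity)
			return -1;		/* would overfill: split first */
	}
	return 0;
}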
BCH_BUCKET_MARK_BUCKET_INVALIDATE + : 0; + int ret; + + trans_for_each_update_iter(trans, i) + BUG_ON(i->iter->uptodate >= BTREE_ITER_NEED_RELOCK); + + trans_for_each_update_iter(trans, i) + i->marked = false; + + do { + saw_non_marked = false; + + trans_for_each_update_iter(trans, i) { + if (i->marked) + continue; + + saw_non_marked = true; + i->marked = true; + + if (update_has_triggers(trans, i) && + update_triggers_transactional(trans, i)) { + ret = bch2_trans_mark_update(trans, i->iter, i->k); + if (ret == -EINTR) + trace_trans_restart_mark(trans->ip); + if (ret) + goto out_clear_replicas; } } + } while (saw_non_marked); + + btree_trans_lock_write(c, trans); + + if (race_fault()) { + ret = -EINTR; + trace_trans_restart_fault_inject(trans->ip); + goto out; } - ret = 0; - split = NULL; - cycle_gc_lock = false; + /* + * Check if the insert will fit in the leaf node with the write lock + * held, otherwise another thread could write the node changing the + * amount of space available: + */ + ret = btree_trans_check_can_insert(trans, stopped_at); + if (ret) + goto out; - trans_for_each_entry(trans, i) { - if (i->done) + trans_for_each_update_iter(trans, i) { + if (i->deferred || + !btree_node_type_needs_gc(i->iter->btree_id)) continue; - switch (btree_insert_key_leaf(trans, i)) { - case BTREE_INSERT_OK: - i->done = true; - break; - case BTREE_INSERT_JOURNAL_RES_FULL: - case BTREE_INSERT_NEED_TRAVERSE: - ret = -EINTR; - break; - case BTREE_INSERT_NEED_RESCHED: - ret = -EAGAIN; - break; - case BTREE_INSERT_BTREE_NODE_FULL: - split = i->iter; - break; - case BTREE_INSERT_ENOSPC: - ret = -ENOSPC; - break; - case BTREE_INSERT_NEED_GC_LOCK: - cycle_gc_lock = true; - ret = -EINTR; - break; - default: - BUG(); + if (!fs_usage) { + percpu_down_read(&c->mark_lock); + fs_usage = bch2_fs_usage_scratch_get(c); } - if (!trans->did_work && (ret || split)) - break; + if (!bch2_bkey_replicas_marked_locked(c, + bkey_i_to_s_c(i->k), true)) { + ret = BTREE_INSERT_NEED_MARK_REPLICAS; + goto out; + } } -unlock: - multi_unlock_write(trans); - bch2_journal_res_put(&c->journal, &trans->journal_res); - if (split) - goto split; - if (ret) - goto err; + /* + * Don't get journal reservation until after we know insert will + * succeed: + */ + if (likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))) { + trans->journal_u64s = 0; - trans_for_each_entry(trans, i) - if (i->iter->flags & BTREE_ITER_AT_END_OF_LEAF) + trans_for_each_update(trans, i) + trans->journal_u64s += jset_u64s(i->k->k.u64s); + + ret = bch2_trans_journal_res_get(trans, JOURNAL_RES_GET_NONBLOCK); + if (ret) goto out; + } - trans_for_each_entry(trans, i) { - /* - * iterators are inconsistent when they hit end of leaf, until - * traversed again - */ - if (i->iter->uptodate < BTREE_ITER_NEED_TRAVERSE && - !same_leaf_as_prev(trans, i)) - bch2_foreground_maybe_merge(c, i->iter, 0); + if (!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY)) { + if (journal_seq_verify(c)) + trans_for_each_update(trans, i) + i->k->k.version.lo = trans->journal_res.seq; + else if (inject_invalid_keys(c)) + trans_for_each_update(trans, i) + i->k->k.version = MAX_VERSION; } + + trans_for_each_update_iter(trans, i) + if (update_has_triggers(trans, i) && + !update_triggers_transactional(trans, i)) + bch2_mark_update(trans, i, fs_usage, mark_flags); + + if (fs_usage && trans->fs_usage_deltas) + bch2_replicas_delta_list_apply(c, fs_usage, + trans->fs_usage_deltas); + + if (fs_usage) + bch2_trans_fs_usage_apply(trans, fs_usage); + + if (likely(!(trans->flags & BTREE_INSERT_NOMARK)) && + 
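/*
 * The marking loop in do_btree_insert_at() above runs to a fixpoint:
 * transactional triggers can append further updates to the array being
 * walked, so scanning repeats until a pass finds every entry already
 * marked. Sketch; run_trigger() is a hypothetical stand-in for
 * bch2_trans_mark_update() and may grow *nr:
 */
#include <stdbool.h>

struct entry { bool marked; /* ... */ };

static void run_trigger(struct entry *e, unsigned *nr)
{
	(void) e; (void) nr;	/* would queue trigger-generated updates */
}

static void mark_to_fixpoint(struct entry *e, unsigned *nr)
{
	bool saw_unmarked;

	do {
		saw_unmarked = false;

		for (unsigned i = 0; i < *nr; i++) {
			if (e[i].marked)
				continue;

			saw_unmarked = true;
			e[i].marked = true;
			run_trigger(&e[i], nr);	/* may append entries */
		}
	} while (saw_unmarked);
}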
unlikely(c->gc_pos.phase)) + trans_for_each_update_iter(trans, i) + if (gc_visited(c, gc_pos_btree_node(i->iter->l[0].b))) + bch2_mark_update(trans, i, NULL, + mark_flags| + BCH_BUCKET_MARK_GC); + + trans_for_each_update(trans, i) + do_btree_insert_one(trans, i); out: - /* make sure we didn't lose an error: */ - if (!ret && IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) - trans_for_each_entry(trans, i) - BUG_ON(!i->done); + BUG_ON(ret && + (trans->flags & BTREE_INSERT_JOURNAL_RESERVED) && + trans->journal_res.ref); + + btree_trans_unlock_write(trans); + + if (fs_usage) { + bch2_fs_usage_scratch_put(c, fs_usage); + percpu_up_read(&c->mark_lock); + } + + bch2_journal_res_put(&c->journal, &trans->journal_res); +out_clear_replicas: + if (trans->fs_usage_deltas) { + memset(&trans->fs_usage_deltas->fs_usage, 0, + sizeof(trans->fs_usage_deltas->fs_usage)); + trans->fs_usage_deltas->used = 0; + } - percpu_ref_put(&c->writes); return ret; -split: - /* - * have to drop journal res before splitting, because splitting means - * allocating new btree nodes, and holding a journal reservation - * potentially blocks the allocator: - */ - ret = bch2_btree_split_leaf(c, split, trans->flags); +} + +static noinline +int bch2_trans_commit_error(struct btree_trans *trans, + struct btree_insert_entry *i, + int ret) +{ + struct bch_fs *c = trans->c; + unsigned flags = trans->flags; + struct btree_insert_entry *src, *dst; + + src = dst = trans->updates; + + while (src < trans->updates + trans->nr_updates) { + if (!src->triggered) { + *dst = *src; + dst++; + } + src++; + } + + trans->nr_updates = dst - trans->updates; /* - * This can happen when we insert part of an extent - with an update - * with multiple keys, we don't want to redo the entire update - that's - * just too confusing: + * BTREE_INSERT_NOUNLOCK means don't unlock _after_ successful btree + * update; if we haven't done anything yet it doesn't apply */ - if (!ret && - (trans->flags & BTREE_INSERT_ATOMIC) && - trans->did_work) + flags &= ~BTREE_INSERT_NOUNLOCK; + + switch (ret) { + case BTREE_INSERT_BTREE_NODE_FULL: + ret = bch2_btree_split_leaf(c, i->iter, flags); + + /* + * if the split succeeded without dropping locks the insert will + * still be atomic (in the BTREE_INSERT_ATOMIC sense, what the + * caller peeked() and is overwriting won't have changed) + */ +#if 0 + /* + * XXX: + * split -> btree node merging (of parent node) might still drop + * locks when we're not passing it BTREE_INSERT_NOUNLOCK + * + * we don't want to pass BTREE_INSERT_NOUNLOCK to split as that + * will inhibit merging - but we don't have a reliable way yet + * (do we?) 
of checking if we dropped locks in this path + */ + if (!ret) + goto retry; +#endif + + /* + * don't care if we got ENOSPC because we told split it + * couldn't block: + */ + if (!ret || + ret == -EINTR || + (flags & BTREE_INSERT_NOUNLOCK)) { + trace_trans_restart_btree_node_split(trans->ip); + ret = -EINTR; + } + break; + case BTREE_INSERT_ENOSPC: + ret = -ENOSPC; + break; + case BTREE_INSERT_NEED_MARK_REPLICAS: + bch2_trans_unlock(trans); + + trans_for_each_update_iter(trans, i) { + ret = bch2_mark_bkey_replicas(c, bkey_i_to_s_c(i->k)); + if (ret) + return ret; + } + + if (bch2_trans_relock(trans)) + return 0; + + trace_trans_restart_mark_replicas(trans->ip); ret = -EINTR; + break; + case BTREE_INSERT_NEED_JOURNAL_RES: + bch2_trans_unlock(trans); - if (ret) - goto err; + ret = bch2_trans_journal_res_get(trans, JOURNAL_RES_GET_CHECK); + if (ret) + return ret; - /* - * if the split didn't have to drop locks the insert will still be - * atomic (in the BTREE_INSERT_ATOMIC sense, what the caller peeked() - * and is overwriting won't have changed) - */ - goto retry_locks; -err: - if (cycle_gc_lock) { - down_read(&c->gc_lock); - up_read(&c->gc_lock); + if (bch2_trans_relock(trans)) + return 0; + + trace_trans_restart_journal_res_get(trans->ip); + ret = -EINTR; + break; + default: + BUG_ON(ret >= 0); + break; } if (ret == -EINTR) { - trans_for_each_entry(trans, i) { - int ret2 = bch2_btree_iter_traverse(i->iter); - if (ret2) { - ret = ret2; - goto out; - } + int ret2 = bch2_btree_iter_traverse_all(trans); + + if (ret2) { + trace_trans_restart_traverse(trans->ip); + return ret2; } /* * BTREE_ITER_ATOMIC means we have to return -EINTR if we * dropped locks: */ - if (!(trans->flags & BTREE_INSERT_ATOMIC)) - goto retry; + if (!(flags & BTREE_INSERT_ATOMIC)) + return 0; + + trace_trans_restart_atomic(trans->ip); } - goto out; + return ret; } -int bch2_btree_delete_at(struct btree_iter *iter, unsigned flags) +/** + * __bch_btree_insert_at - insert keys at given iterator positions + * + * This is main entry point for btree updates. + * + * Return values: + * -EINTR: locking changed, this function should be called again. Only returned + * if passed BTREE_INSERT_ATOMIC. 
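/*
 * On error, bch2_trans_commit_error() above first compacts the updates
 * array in place, dropping trigger-generated entries so triggers are
 * re-run cleanly on retry. The two-cursor filter idiom it uses:
 */
#include <stdbool.h>

struct update { bool triggered; /* ... */ };

static unsigned drop_triggered(struct update *u, unsigned nr)
{
	struct update *src = u, *dst = u;

	while (src < u + nr) {
		if (!src->triggered)	/* keep caller-supplied entries */
			*dst++ = *src;
		src++;
	}
	return dst - u;			/* new nr_updates */
}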
+ * -EROFS: filesystem read only + * -EIO: journal or btree node IO error + */ +static int __bch2_trans_commit(struct btree_trans *trans, + struct btree_insert_entry **stopped_at) { - struct bkey_i k; + struct bch_fs *c = trans->c; + struct btree_insert_entry *i; + int ret; - bkey_init(&k.k); - k.k.p = iter->pos; + trans_for_each_update_iter(trans, i) { + if (!bch2_btree_iter_upgrade(i->iter, 1)) { + trace_trans_restart_upgrade(trans->ip); + ret = -EINTR; + goto err; + } + + ret = btree_iter_err(i->iter); + if (ret) + goto err; + } + + ret = do_btree_insert_at(trans, stopped_at); + if (unlikely(ret)) + goto err; - return bch2_btree_insert_at(iter->c, NULL, NULL, NULL, - BTREE_INSERT_NOFAIL| - BTREE_INSERT_USE_RESERVE|flags, - BTREE_INSERT_ENTRY(iter, &k)); + if (trans->flags & BTREE_INSERT_NOUNLOCK) + trans->nounlock = true; + + trans_for_each_update_leaf(trans, i) + bch2_foreground_maybe_merge(c, i->iter, 0, trans->flags); + + trans->nounlock = false; + + trans_for_each_update_iter(trans, i) + bch2_btree_iter_downgrade(i->iter); +err: + /* make sure we didn't drop or screw up locks: */ + bch2_btree_trans_verify_locks(trans); + + return ret; } -int bch2_btree_insert_list_at(struct btree_iter *iter, - struct keylist *keys, - struct disk_reservation *disk_res, - struct extent_insert_hook *hook, - u64 *journal_seq, unsigned flags) +int bch2_trans_commit(struct btree_trans *trans, + struct disk_reservation *disk_res, + u64 *journal_seq, + unsigned flags) { - BUG_ON(flags & BTREE_INSERT_ATOMIC); - BUG_ON(bch2_keylist_empty(keys)); - bch2_verify_keylist_sorted(keys); + struct bch_fs *c = trans->c; + struct btree_insert_entry *i; + unsigned orig_mem_top = trans->mem_top; + int ret = 0; + + if (!trans->nr_updates) + goto out_noupdates; + + /* for the sake of sanity: */ + BUG_ON(trans->nr_updates > 1 && !(flags & BTREE_INSERT_ATOMIC)); + + if (flags & BTREE_INSERT_GC_LOCK_HELD) + lockdep_assert_held(&c->gc_lock); + + if (!trans->commit_start) + trans->commit_start = local_clock(); - while (!bch2_keylist_empty(keys)) { - int ret = bch2_btree_insert_at(iter->c, disk_res, hook, - journal_seq, flags, - BTREE_INSERT_ENTRY(iter, bch2_keylist_front(keys))); + memset(&trans->journal_res, 0, sizeof(trans->journal_res)); + memset(&trans->journal_preres, 0, sizeof(trans->journal_preres)); + trans->disk_res = disk_res; + trans->journal_seq = journal_seq; + trans->flags = flags; + + trans_for_each_update(trans, i) + btree_insert_entry_checks(trans, i); + bch2_btree_trans_verify_locks(trans); + + if (unlikely(!(trans->flags & BTREE_INSERT_NOCHECK_RW) && + !percpu_ref_tryget(&c->writes))) { + if (likely(!(trans->flags & BTREE_INSERT_LAZY_RW))) + return -EROFS; + + bch2_trans_unlock(trans); + + ret = bch2_fs_read_write_early(c); if (ret) return ret; - bch2_keylist_pop_front(keys); + percpu_ref_get(&c->writes); + + if (!bch2_trans_relock(trans)) { + ret = -EINTR; + goto err; + } } +retry: + ret = bch2_trans_journal_preres_get(trans); + if (ret) + goto err; - return 0; + ret = __bch2_trans_commit(trans, &i); + if (ret) + goto err; +out: + bch2_journal_preres_put(&c->journal, &trans->journal_preres); + + if (unlikely(!(trans->flags & BTREE_INSERT_NOCHECK_RW))) + percpu_ref_put(&c->writes); +out_noupdates: + if (!ret && trans->commit_start) { + bch2_time_stats_update(&c->times[BCH_TIME_btree_update], + trans->commit_start); + trans->commit_start = 0; + } + + BUG_ON(!(trans->flags & BTREE_INSERT_ATOMIC) && ret == -EINTR); + + if (!ret) { + bch2_trans_unlink_iters(trans, ~trans->iters_touched| + 
trans->iters_unlink_on_commit); + trans->iters_touched = 0; + } else { + bch2_trans_unlink_iters(trans, trans->iters_unlink_on_commit); + } + trans->nr_updates = 0; + trans->mem_top = 0; + + return ret; +err: + ret = bch2_trans_commit_error(trans, i, ret); + + /* can't loop if it was passed in and we changed it: */ + if (unlikely(trans->flags & BTREE_INSERT_NO_CLEAR_REPLICAS) && !ret) + ret = -EINTR; + + if (!ret) { + /* free memory used by triggers, they'll be reexecuted: */ + trans->mem_top = orig_mem_top; + goto retry; + } + + goto out; +} + +struct btree_insert_entry *bch2_trans_update(struct btree_trans *trans, + struct btree_insert_entry entry) +{ + struct btree_insert_entry *i; + + BUG_ON(trans->nr_updates >= trans->nr_iters + 4); + + for (i = trans->updates; + i < trans->updates + trans->nr_updates; + i++) + if (btree_trans_cmp(entry, *i) < 0) + break; + + memmove(&i[1], &i[0], + (void *) &trans->updates[trans->nr_updates] - (void *) i); + trans->nr_updates++; + *i = entry; + return i; } /** - * bch_btree_insert - insert keys into the extent btree + * bch2_btree_insert - insert keys into the extent btree * @c: pointer to struct bch_fs * @id: btree to insert into * @insert_keys: list of keys to insert @@ -540,50 +976,42 @@ int bch2_btree_insert_list_at(struct btree_iter *iter, int bch2_btree_insert(struct bch_fs *c, enum btree_id id, struct bkey_i *k, struct disk_reservation *disk_res, - struct extent_insert_hook *hook, u64 *journal_seq, int flags) { - struct btree_iter iter; + struct btree_trans trans; + struct btree_iter *iter; int ret; - bch2_btree_iter_init(&iter, c, id, bkey_start_pos(&k->k), - BTREE_ITER_INTENT); - ret = bch2_btree_insert_at(c, disk_res, hook, journal_seq, flags, - BTREE_INSERT_ENTRY(&iter, k)); - bch2_btree_iter_unlock(&iter); + bch2_trans_init(&trans, c, 0, 0); +retry: + bch2_trans_begin(&trans); + + iter = bch2_trans_get_iter(&trans, id, bkey_start_pos(&k->k), + BTREE_ITER_INTENT); + + bch2_trans_update(&trans, BTREE_INSERT_ENTRY(iter, k)); + + ret = bch2_trans_commit(&trans, disk_res, journal_seq, flags); + if (ret == -EINTR) + goto retry; + bch2_trans_exit(&trans); return ret; } -/* - * bch_btree_delete_range - delete everything within a given range - * - * Range is a half open interval - [start, end) - */ -int bch2_btree_delete_range(struct bch_fs *c, enum btree_id id, - struct bpos start, - struct bpos end, - struct bversion version, - struct disk_reservation *disk_res, - struct extent_insert_hook *hook, - u64 *journal_seq) -{ - struct btree_iter iter; +int bch2_btree_delete_at_range(struct btree_trans *trans, + struct btree_iter *iter, + struct bpos end, + u64 *journal_seq) +{ struct bkey_s_c k; int ret = 0; - - bch2_btree_iter_init(&iter, c, id, start, - BTREE_ITER_INTENT); - - while ((k = bch2_btree_iter_peek(&iter)).k && - !(ret = btree_iter_err(k))) { - unsigned max_sectors = KEY_SIZE_MAX & (~0 << c->block_bits); - /* really shouldn't be using a bare, unpadded bkey_i */ +retry: + while ((k = bch2_btree_iter_peek(iter)).k && + !(ret = bkey_err(k)) && + bkey_cmp(iter->pos, end) < 0) { struct bkey_i delete; - if (bkey_cmp(iter.pos, end) >= 0) - break; - bkey_init(&delete.k); /* @@ -596,33 +1024,75 @@ int bch2_btree_delete_range(struct bch_fs *c, enum btree_id id, * (bch2_btree_iter_peek() does guarantee that iter.pos >= * bkey_start_pos(k.k)). 
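/*
 * bch2_trans_update() above keeps the updates array ordered by
 * inserting in place: find the first entry that compares greater,
 * shift the tail up with memmove(), and write the new entry into the
 * gap. Sketch over plain ints; the caller guarantees capacity, as the
 * BUG_ON() does in the real function:
 */
#include <string.h>

static void sorted_insert(int *a, unsigned *nr, int v)
{
	unsigned i = 0;

	while (i < *nr && a[i] <= v)
		i++;			/* first position where v sorts */

	memmove(&a[i + 1], &a[i], (*nr - i) * sizeof(a[0]));
	a[i] = v;
	(*nr)++;
}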
*/ - delete.k.p = iter.pos; - delete.k.version = version; + delete.k.p = iter->pos; - if (iter.flags & BTREE_ITER_IS_EXTENTS) { - /* - * The extents btree is special - KEY_TYPE_DISCARD is - * used for deletions, not KEY_TYPE_DELETED. This is an - * internal implementation detail that probably - * shouldn't be exposed (internally, KEY_TYPE_DELETED is - * used as a proxy for k->size == 0): - */ - delete.k.type = KEY_TYPE_DISCARD; + if (iter->flags & BTREE_ITER_IS_EXTENTS) { + unsigned max_sectors = + KEY_SIZE_MAX & (~0 << trans->c->block_bits); /* create the biggest key we can */ bch2_key_resize(&delete.k, max_sectors); bch2_cut_back(end, &delete.k); + bch2_extent_trim_atomic(&delete, iter); } - ret = bch2_btree_insert_at(c, disk_res, hook, journal_seq, - BTREE_INSERT_NOFAIL, - BTREE_INSERT_ENTRY(&iter, &delete)); + bch2_trans_update(trans, BTREE_INSERT_ENTRY(iter, &delete)); + ret = bch2_trans_commit(trans, NULL, journal_seq, + BTREE_INSERT_ATOMIC| + BTREE_INSERT_NOFAIL); if (ret) break; - bch2_btree_iter_cond_resched(&iter); + bch2_trans_cond_resched(trans); + } + + if (ret == -EINTR) { + ret = 0; + goto retry; } - bch2_btree_iter_unlock(&iter); + return ret; + +} + +int bch2_btree_delete_at(struct btree_trans *trans, + struct btree_iter *iter, unsigned flags) +{ + struct bkey_i k; + + bkey_init(&k.k); + k.k.p = iter->pos; + + bch2_trans_update(trans, BTREE_INSERT_ENTRY(iter, &k)); + return bch2_trans_commit(trans, NULL, NULL, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_USE_RESERVE|flags); +} + +/* + * bch_btree_delete_range - delete everything within a given range + * + * Range is a half open interval - [start, end) + */ +int bch2_btree_delete_range(struct bch_fs *c, enum btree_id id, + struct bpos start, struct bpos end, + u64 *journal_seq) +{ + struct btree_trans trans; + struct btree_iter *iter; + int ret = 0; + + /* + * XXX: whether we need mem/more iters depends on whether this btree id + * has triggers + */ + bch2_trans_init(&trans, c, BTREE_ITER_MAX, 512); + + iter = bch2_trans_get_iter(&trans, id, start, BTREE_ITER_INTENT); + + ret = bch2_btree_delete_at_range(&trans, iter, end, journal_seq); + ret = bch2_trans_exit(&trans) ?: ret; + + BUG_ON(ret == -EINTR); return ret; } diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index b17189ee2e4f..b6b3ac5111ca 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Code for manipulating bucket marks for garbage collection. 
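/*
 * bch2_btree_delete_at_range() above is the canonical loop shape for
 * the new transaction API: peek, queue an update, commit atomically,
 * and on -EINTR (locks were dropped) restart the whole scan. Sketch
 * with hypothetical next_key()/commit_delete() helpers:
 */
#include <errno.h>

struct cursor;				/* opaque iterator, illustrative */
int next_key(struct cursor *);		/* returns 0 at end of range */
int commit_delete(struct cursor *);	/* may return -EINTR */

static int delete_range(struct cursor *cur)
{
	int ret = 0;
retry:
	while (next_key(cur)) {
		ret = commit_delete(cur);
		if (ret)
			break;
	}
	if (ret == -EINTR) {		/* locks dropped: start over */
		ret = 0;
		goto retry;
	}
	return ret;
}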
* @@ -63,89 +64,26 @@ */ #include "bcachefs.h" -#include "alloc.h" +#include "alloc_background.h" +#include "bset.h" #include "btree_gc.h" +#include "btree_update.h" #include "buckets.h" +#include "ec.h" #include "error.h" #include "movinggc.h" +#include "replicas.h" #include <linux/preempt.h> #include <trace/events/bcachefs.h> -#ifdef DEBUG_BUCKETS - -#define lg_local_lock lg_global_lock -#define lg_local_unlock lg_global_unlock - -static void bch2_fs_stats_verify(struct bch_fs *c) -{ - struct bch_fs_usage stats = - __bch2_fs_usage_read(c); - unsigned i; - - for (i = 0; i < ARRAY_SIZE(stats.s); i++) { - if ((s64) stats.s[i].data[S_META] < 0) - panic("replicas %u meta underflow: %lli\n", - i + 1, stats.s[i].data[S_META]); - - if ((s64) stats.s[i].data[S_DIRTY] < 0) - panic("replicas %u dirty underflow: %lli\n", - i + 1, stats.s[i].data[S_DIRTY]); - - if ((s64) stats.s[i].persistent_reserved < 0) - panic("replicas %u reserved underflow: %lli\n", - i + 1, stats.s[i].persistent_reserved); - } - - if ((s64) stats.online_reserved < 0) - panic("sectors_online_reserved underflow: %lli\n", - stats.online_reserved); -} - -static void bch2_dev_stats_verify(struct bch_dev *ca) -{ - struct bch_dev_usage stats = - __bch2_dev_usage_read(ca); - u64 n = ca->mi.nbuckets - ca->mi.first_bucket; - unsigned i; - - for (i = 0; i < ARRAY_SIZE(stats.buckets); i++) - BUG_ON(stats.buckets[i] > n); - BUG_ON(stats.buckets_alloc > n); - BUG_ON(stats.buckets_unavailable > n); -} - -static void bch2_disk_reservations_verify(struct bch_fs *c, int flags) -{ - if (!(flags & BCH_DISK_RESERVATION_NOFAIL)) { - u64 used = __bch2_fs_sectors_used(c); - u64 cached = 0; - u64 avail = atomic64_read(&c->sectors_available); - int cpu; - - for_each_possible_cpu(cpu) - cached += per_cpu_ptr(c->usage_percpu, cpu)->available_cache; - - if (used + avail + cached > c->capacity) - panic("used %llu avail %llu cached %llu capacity %llu\n", - used, avail, cached, c->capacity); - } -} - -#else - -static void bch2_fs_stats_verify(struct bch_fs *c) {} -static void bch2_dev_stats_verify(struct bch_dev *ca) {} -static void bch2_disk_reservations_verify(struct bch_fs *c, int flags) {} - -#endif - /* * Clear journal_seq_valid for buckets for which it's not needed, to prevent * wraparound: */ void bch2_bucket_seq_cleanup(struct bch_fs *c) { + u64 journal_seq = atomic64_read(&c->journal.seq); u16 last_seq_ondisk = c->journal.last_seq_ondisk; struct bch_dev *ca; struct bucket_array *buckets; @@ -153,6 +91,12 @@ void bch2_bucket_seq_cleanup(struct bch_fs *c) struct bucket_mark m; unsigned i; + if (journal_seq - c->last_bucket_seq_cleanup < + (1U << (BUCKET_JOURNAL_SEQ_BITS - 2))) + return; + + c->last_bucket_seq_cleanup = journal_seq; + for_each_member_device(ca, c, i) { down_read(&ca->bucket_lock); buckets = bucket_array(ca); @@ -170,85 +114,192 @@ void bch2_bucket_seq_cleanup(struct bch_fs *c) } } -#define bch2_usage_add(_acc, _stats) \ -do { \ - typeof(_acc) _a = (_acc), _s = (_stats); \ - unsigned i; \ - \ - for (i = 0; i < sizeof(*_a) / sizeof(u64); i++) \ - ((u64 *) (_a))[i] += ((u64 *) (_s))[i]; \ -} while (0) +void bch2_fs_usage_initialize(struct bch_fs *c) +{ + struct bch_fs_usage *usage; + unsigned i; -#define bch2_usage_read_raw(_stats) \ -({ \ - typeof(*this_cpu_ptr(_stats)) _acc; \ - int cpu; \ - \ - memset(&_acc, 0, sizeof(_acc)); \ - \ - for_each_possible_cpu(cpu) \ - bch2_usage_add(&_acc, per_cpu_ptr((_stats), cpu)); \ - \ - _acc; \ -}) + percpu_down_write(&c->mark_lock); + usage = c->usage_base; -#define bch2_usage_read_cached(_c, 
_cached, _uncached) \ -({ \ - typeof(_cached) _ret; \ - unsigned _seq; \ - \ - do { \ - _seq = read_seqcount_begin(&(_c)->gc_pos_lock); \ - _ret = (_c)->gc_pos.phase == GC_PHASE_DONE \ - ? bch2_usage_read_raw(_uncached) \ - : (_cached); \ - } while (read_seqcount_retry(&(_c)->gc_pos_lock, _seq)); \ - \ - _ret; \ -}) + bch2_fs_usage_acc_to_base(c, 0); + bch2_fs_usage_acc_to_base(c, 1); + + for (i = 0; i < BCH_REPLICAS_MAX; i++) + usage->reserved += usage->persistent_reserved[i]; + + for (i = 0; i < c->replicas.nr; i++) { + struct bch_replicas_entry *e = + cpu_replicas_entry(&c->replicas, i); -struct bch_dev_usage __bch2_dev_usage_read(struct bch_dev *ca) + switch (e->data_type) { + case BCH_DATA_BTREE: + usage->btree += usage->replicas[i]; + break; + case BCH_DATA_USER: + usage->data += usage->replicas[i]; + break; + case BCH_DATA_CACHED: + usage->cached += usage->replicas[i]; + break; + } + } + + percpu_up_write(&c->mark_lock); +} + +void bch2_fs_usage_scratch_put(struct bch_fs *c, struct bch_fs_usage *fs_usage) { - return bch2_usage_read_raw(ca->usage_percpu); + if (fs_usage == c->usage_scratch) + mutex_unlock(&c->usage_scratch_lock); + else + kfree(fs_usage); +} + +struct bch_fs_usage *bch2_fs_usage_scratch_get(struct bch_fs *c) +{ + struct bch_fs_usage *ret; + unsigned bytes = fs_usage_u64s(c) * sizeof(u64); + + ret = kzalloc(bytes, GFP_NOWAIT); + if (ret) + return ret; + + if (mutex_trylock(&c->usage_scratch_lock)) + goto out_pool; + + ret = kzalloc(bytes, GFP_NOFS); + if (ret) + return ret; + + mutex_lock(&c->usage_scratch_lock); +out_pool: + ret = c->usage_scratch; + memset(ret, 0, bytes); + return ret; } struct bch_dev_usage bch2_dev_usage_read(struct bch_fs *c, struct bch_dev *ca) { - return bch2_usage_read_cached(c, ca->usage_cached, ca->usage_percpu); + struct bch_dev_usage ret; + + memset(&ret, 0, sizeof(ret)); + acc_u64s_percpu((u64 *) &ret, + (u64 __percpu *) ca->usage[0], + sizeof(ret) / sizeof(u64)); + + return ret; } -struct bch_fs_usage -__bch2_fs_usage_read(struct bch_fs *c) +static inline struct bch_fs_usage *fs_usage_ptr(struct bch_fs *c, + unsigned journal_seq, + bool gc) { - return bch2_usage_read_raw(c->usage_percpu); + return this_cpu_ptr(gc + ? 
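/*
 * bch2_fs_usage_scratch_get() above tries progressively more expensive
 * ways to get a zeroed buffer: a non-blocking allocation, a trylock on
 * one preallocated scratch buffer, a blocking allocation, and only as
 * a last resort blocking on the scratch mutex. Userspace analogue;
 * malloc cannot express GFP_NOWAIT vs GFP_NOFS, so the two calls are
 * annotated instead, and bytes is assumed <= sizeof(scratch):
 */
#include <pthread.h>
#include <stdlib.h>
#include <string.h>

static pthread_mutex_t scratch_lock = PTHREAD_MUTEX_INITIALIZER;
static unsigned char scratch[4096];

static void *usage_get(size_t bytes)	/* pair with a put that unlocks/frees */
{
	void *p = calloc(1, bytes);	/* kernel: kzalloc(GFP_NOWAIT) */
	if (p)
		return p;

	if (pthread_mutex_trylock(&scratch_lock) == 0)
		return memset(scratch, 0, sizeof(scratch));

	p = calloc(1, bytes);		/* kernel: kzalloc(GFP_NOFS) */
	if (p)
		return p;

	pthread_mutex_lock(&scratch_lock);	/* last resort: wait */
	return memset(scratch, 0, sizeof(scratch));
}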
c->usage_gc + : c->usage[journal_seq & 1]); } -struct bch_fs_usage -bch2_fs_usage_read(struct bch_fs *c) +u64 bch2_fs_usage_read_one(struct bch_fs *c, u64 *v) { - return bch2_usage_read_cached(c, - c->usage_cached, - c->usage_percpu); + ssize_t offset = v - (u64 *) c->usage_base; + unsigned seq; + u64 ret; + + BUG_ON(offset < 0 || offset >= fs_usage_u64s(c)); + percpu_rwsem_assert_held(&c->mark_lock); + + do { + seq = read_seqcount_begin(&c->usage_lock); + ret = *v + + percpu_u64_get((u64 __percpu *) c->usage[0] + offset) + + percpu_u64_get((u64 __percpu *) c->usage[1] + offset); + } while (read_seqcount_retry(&c->usage_lock, seq)); + + return ret; } -struct fs_usage_sum { - u64 data; - u64 reserved; -}; +struct bch_fs_usage *bch2_fs_usage_read(struct bch_fs *c) +{ + struct bch_fs_usage *ret; + unsigned seq, v, u64s = fs_usage_u64s(c); +retry: + ret = kmalloc(u64s * sizeof(u64), GFP_NOFS); + if (unlikely(!ret)) + return NULL; + + percpu_down_read(&c->mark_lock); + + v = fs_usage_u64s(c); + if (unlikely(u64s != v)) { + u64s = v; + percpu_up_read(&c->mark_lock); + kfree(ret); + goto retry; + } + + do { + seq = read_seqcount_begin(&c->usage_lock); + memcpy(ret, c->usage_base, u64s * sizeof(u64)); + acc_u64s_percpu((u64 *) ret, (u64 __percpu *) c->usage[0], u64s); + acc_u64s_percpu((u64 *) ret, (u64 __percpu *) c->usage[1], u64s); + } while (read_seqcount_retry(&c->usage_lock, seq)); + + return ret; +} -static inline struct fs_usage_sum __fs_usage_sum(struct bch_fs_usage stats) +void bch2_fs_usage_acc_to_base(struct bch_fs *c, unsigned idx) +{ + unsigned u64s = fs_usage_u64s(c); + + BUG_ON(idx >= 2); + + write_seqcount_begin(&c->usage_lock); + + acc_u64s_percpu((u64 *) c->usage_base, + (u64 __percpu *) c->usage[idx], u64s); + percpu_memset(c->usage[idx], 0, u64s * sizeof(u64)); + + write_seqcount_end(&c->usage_lock); +} + +void bch2_fs_usage_to_text(struct printbuf *out, + struct bch_fs *c, + struct bch_fs_usage *fs_usage) { - struct fs_usage_sum sum = { 0 }; unsigned i; - for (i = 0; i < ARRAY_SIZE(stats.s); i++) { - sum.data += (stats.s[i].data[S_META] + - stats.s[i].data[S_DIRTY]) * (i + 1); - sum.reserved += stats.s[i].persistent_reserved * (i + 1); + pr_buf(out, "capacity:\t\t\t%llu\n", c->capacity); + + pr_buf(out, "hidden:\t\t\t\t%llu\n", + fs_usage->hidden); + pr_buf(out, "data:\t\t\t\t%llu\n", + fs_usage->data); + pr_buf(out, "cached:\t\t\t\t%llu\n", + fs_usage->cached); + pr_buf(out, "reserved:\t\t\t%llu\n", + fs_usage->reserved); + pr_buf(out, "nr_inodes:\t\t\t%llu\n", + fs_usage->nr_inodes); + pr_buf(out, "online reserved:\t\t%llu\n", + fs_usage->online_reserved); + + for (i = 0; + i < ARRAY_SIZE(fs_usage->persistent_reserved); + i++) { + pr_buf(out, "%u replicas:\n", i + 1); + pr_buf(out, "\treserved:\t\t%llu\n", + fs_usage->persistent_reserved[i]); } - sum.reserved += stats.online_reserved; - return sum; + for (i = 0; i < c->replicas.nr; i++) { + struct bch_replicas_entry *e = + cpu_replicas_entry(&c->replicas, i); + + pr_buf(out, "\t"); + bch2_replicas_entry_to_text(out, e); + pr_buf(out, ":\t%llu\n", fs_usage->replicas[i]); + } } #define RESERVE_FACTOR 6 @@ -260,24 +311,51 @@ static u64 reserve_factor(u64 r) static u64 avail_factor(u64 r) { - return (r << RESERVE_FACTOR) / (1 << RESERVE_FACTOR) + 1; + return (r << RESERVE_FACTOR) / ((1 << RESERVE_FACTOR) + 1); } -u64 __bch2_fs_sectors_used(struct bch_fs *c, struct bch_fs_usage stats) +u64 bch2_fs_sectors_used(struct bch_fs *c, struct bch_fs_usage *fs_usage) { - struct fs_usage_sum sum = __fs_usage_sum(stats); - - return 
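/*
 * bch2_fs_usage_read_one() above is a seqcount reader: sum the base
 * counter and both percpu accumulators locklessly, retrying whenever a
 * writer ran concurrently. Minimal C11 analogue (even sequence value
 * means stable; the two deltas mirror usage[0]/usage[1]):
 */
#include <stdatomic.h>
#include <stdint.h>

struct counters {
	_Atomic unsigned	seq;
	_Atomic uint64_t	base, delta0, delta1;
};

static uint64_t read_total(const struct counters *c)
{
	unsigned seq;
	uint64_t ret;

	do {
		while ((seq = atomic_load(&c->seq)) & 1)
			;	/* writer in progress, spin */
		ret = atomic_load_explicit(&c->base, memory_order_relaxed) +
		      atomic_load_explicit(&c->delta0, memory_order_relaxed) +
		      atomic_load_explicit(&c->delta1, memory_order_relaxed);
	} while (atomic_load(&c->seq) != seq);	/* raced with writer: retry */

	return ret;
}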
sum.data + reserve_factor(sum.reserved); + return min(fs_usage->hidden + + fs_usage->btree + + fs_usage->data + + reserve_factor(fs_usage->reserved + + fs_usage->online_reserved), + c->capacity); } -u64 bch2_fs_sectors_used(struct bch_fs *c, struct bch_fs_usage stats) +static struct bch_fs_usage_short +__bch2_fs_usage_read_short(struct bch_fs *c) { - return min(c->capacity, __bch2_fs_sectors_used(c, stats)); + struct bch_fs_usage_short ret; + u64 data, reserved; + + ret.capacity = c->capacity - + bch2_fs_usage_read_one(c, &c->usage_base->hidden); + + data = bch2_fs_usage_read_one(c, &c->usage_base->data) + + bch2_fs_usage_read_one(c, &c->usage_base->btree); + reserved = bch2_fs_usage_read_one(c, &c->usage_base->reserved) + + bch2_fs_usage_read_one(c, &c->usage_base->online_reserved); + + ret.used = min(ret.capacity, data + reserve_factor(reserved)); + ret.free = ret.capacity - ret.used; + + ret.nr_inodes = bch2_fs_usage_read_one(c, &c->usage_base->nr_inodes); + + return ret; } -u64 bch2_fs_sectors_free(struct bch_fs *c, struct bch_fs_usage stats) +struct bch_fs_usage_short +bch2_fs_usage_read_short(struct bch_fs *c) { - return avail_factor(c->capacity - bch2_fs_sectors_used(c, stats)); + struct bch_fs_usage_short ret; + + percpu_down_read(&c->mark_lock); + ret = __bch2_fs_usage_read_short(c); + percpu_up_read(&c->mark_lock); + + return ret; } static inline int is_unavailable_bucket(struct bucket_mark m) @@ -299,76 +377,94 @@ static inline int is_fragmented_bucket(struct bucket_mark m, static inline enum bch_data_type bucket_type(struct bucket_mark m) { return m.cached_sectors && !m.dirty_sectors - ? BCH_DATA_CACHED + ? BCH_DATA_CACHED : m.data_type; } -static bool bucket_became_unavailable(struct bch_fs *c, - struct bucket_mark old, +static bool bucket_became_unavailable(struct bucket_mark old, struct bucket_mark new) { return is_available_bucket(old) && - !is_available_bucket(new) && - (!c || c->gc_pos.phase == GC_PHASE_DONE); + !is_available_bucket(new); } -void bch2_fs_usage_apply(struct bch_fs *c, - struct bch_fs_usage *stats, +int bch2_fs_usage_apply(struct bch_fs *c, + struct bch_fs_usage *fs_usage, struct disk_reservation *disk_res, - struct gc_pos gc_pos) + unsigned journal_seq) { - struct fs_usage_sum sum = __fs_usage_sum(*stats); - s64 added = sum.data + sum.reserved; + s64 added = fs_usage->data + fs_usage->reserved; + s64 should_not_have_added; + int ret = 0; + + percpu_rwsem_assert_held(&c->mark_lock); /* * Not allowed to reduce sectors_available except by getting a * reservation: */ - BUG_ON(added > (s64) (disk_res ? disk_res->sectors : 0)); + should_not_have_added = added - (s64) (disk_res ? 
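/*
 * The avail_factor() change above is a precedence repair. Assuming
 * RESERVE_FACTOR == 6 and reserve_factor(r) being r + (r >> 6) in the
 * full source (i.e. r * 65/64), avail_factor() should be its rough
 * inverse, r * 64/65:
 *
 *   old: (r << 6) / (1 << 6) + 1    ==  r + 1       (divides, then adds)
 *   new: (r << 6) / ((1 << 6) + 1)  ==  r * 64 / 65
 *
 * e.g. reserve_factor(64) == 65 and avail_factor(65) == 64.
 */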
disk_res->sectors : 0); + if (WARN_ONCE(should_not_have_added > 0, + "disk usage increased without a reservation")) { + atomic64_sub(should_not_have_added, &c->sectors_available); + added -= should_not_have_added; + ret = -1; + } if (added > 0) { - disk_res->sectors -= added; - stats->online_reserved -= added; + disk_res->sectors -= added; + fs_usage->online_reserved -= added; } - percpu_down_read_preempt_disable(&c->usage_lock); - /* online_reserved not subject to gc: */ - this_cpu_ptr(c->usage_percpu)->online_reserved += - stats->online_reserved; - stats->online_reserved = 0; + preempt_disable(); + acc_u64s((u64 *) fs_usage_ptr(c, journal_seq, false), + (u64 *) fs_usage, fs_usage_u64s(c)); + preempt_enable(); - if (!gc_will_visit(c, gc_pos)) - bch2_usage_add(this_cpu_ptr(c->usage_percpu), stats); + return ret; +} - bch2_fs_stats_verify(c); - percpu_up_read_preempt_enable(&c->usage_lock); +static inline void account_bucket(struct bch_fs_usage *fs_usage, + struct bch_dev_usage *dev_usage, + enum bch_data_type type, + int nr, s64 size) +{ + if (type == BCH_DATA_SB || type == BCH_DATA_JOURNAL) + fs_usage->hidden += size; - memset(stats, 0, sizeof(*stats)); + dev_usage->buckets[type] += nr; } static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca, - struct bucket_mark old, struct bucket_mark new) + struct bch_fs_usage *fs_usage, + struct bucket_mark old, struct bucket_mark new, + bool gc) { struct bch_dev_usage *dev_usage; - if (c) - percpu_rwsem_assert_held(&c->usage_lock); + percpu_rwsem_assert_held(&c->mark_lock); - if (old.data_type && new.data_type && - old.data_type != new.data_type) { - BUG_ON(!c); - bch2_fs_inconsistent(c, - "different types of data in same bucket: %u, %u", - old.data_type, new.data_type); - } + bch2_fs_inconsistent_on(old.data_type && new.data_type && + old.data_type != new.data_type, c, + "different types of data in same bucket: %s, %s", + bch2_data_types[old.data_type], + bch2_data_types[new.data_type]); + + preempt_disable(); + dev_usage = this_cpu_ptr(ca->usage[gc]); - dev_usage = this_cpu_ptr(ca->usage_percpu); + if (bucket_type(old)) + account_bucket(fs_usage, dev_usage, bucket_type(old), + -1, -ca->mi.bucket_size); - dev_usage->buckets[bucket_type(old)]--; - dev_usage->buckets[bucket_type(new)]++; + if (bucket_type(new)) + account_bucket(fs_usage, dev_usage, bucket_type(new), + 1, ca->mi.bucket_size); dev_usage->buckets_alloc += (int) new.owned_by_allocator - (int) old.owned_by_allocator; + dev_usage->buckets_ec += + (int) new.stripe - (int) old.stripe; dev_usage->buckets_unavailable += is_unavailable_bucket(new) - is_unavailable_bucket(old); @@ -378,203 +474,426 @@ static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca, (int) new.cached_sectors - (int) old.cached_sectors; dev_usage->sectors_fragmented += is_fragmented_bucket(new, ca) - is_fragmented_bucket(old, ca); + preempt_enable(); if (!is_available_bucket(old) && is_available_bucket(new)) bch2_wake_allocator(ca); +} + +void bch2_dev_usage_from_buckets(struct bch_fs *c) +{ + struct bch_dev *ca; + struct bucket_mark old = { .v.counter = 0 }; + struct bucket_array *buckets; + struct bucket *g; + unsigned i; + int cpu; - bch2_dev_stats_verify(ca); + c->usage_base->hidden = 0; + + for_each_member_device(ca, c, i) { + for_each_possible_cpu(cpu) + memset(per_cpu_ptr(ca->usage[0], cpu), 0, + sizeof(*ca->usage[0])); + + buckets = bucket_array(ca); + + for_each_bucket(g, buckets) + bch2_dev_usage_update(c, ca, c->usage_base, + old, g->mark, false); + } } -#define 
bucket_data_cmpxchg(c, ca, g, new, expr) \ +#define bucket_data_cmpxchg(c, ca, fs_usage, g, new, expr) \ ({ \ struct bucket_mark _old = bucket_cmpxchg(g, new, expr); \ \ - bch2_dev_usage_update(c, ca, _old, new); \ + bch2_dev_usage_update(c, ca, fs_usage, _old, new, gc); \ _old; \ }) -bool bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca, - size_t b, struct bucket_mark *old) +static inline void update_replicas(struct bch_fs *c, + struct bch_fs_usage *fs_usage, + struct bch_replicas_entry *r, + s64 sectors) { - struct bucket *g; - struct bucket_mark new; + int idx = bch2_replicas_entry_idx(c, r); - percpu_rwsem_assert_held(&c->usage_lock); + BUG_ON(idx < 0); + BUG_ON(!sectors); - g = bucket(ca, b); + switch (r->data_type) { + case BCH_DATA_BTREE: + fs_usage->btree += sectors; + break; + case BCH_DATA_USER: + fs_usage->data += sectors; + break; + case BCH_DATA_CACHED: + fs_usage->cached += sectors; + break; + } + fs_usage->replicas[idx] += sectors; +} - *old = bucket_data_cmpxchg(c, ca, g, new, ({ - if (!is_available_bucket(new)) { - percpu_up_read_preempt_enable(&c->usage_lock); - return false; - } +static inline void update_cached_sectors(struct bch_fs *c, + struct bch_fs_usage *fs_usage, + unsigned dev, s64 sectors) +{ + struct bch_replicas_padded r; + + bch2_replicas_entry_cached(&r.e, dev); + + update_replicas(c, fs_usage, &r.e, sectors); +} + +static struct replicas_delta_list * +replicas_deltas_realloc(struct btree_trans *trans, unsigned more) +{ + struct replicas_delta_list *d = trans->fs_usage_deltas; + unsigned new_size = d ? (d->size + more) * 2 : 128; + + if (!d || d->used + more > d->size) { + d = krealloc(d, sizeof(*d) + new_size, GFP_NOIO|__GFP_ZERO); + BUG_ON(!d); + + d->size = new_size; + trans->fs_usage_deltas = d; + } + return d; +} + +static inline void update_replicas_list(struct btree_trans *trans, + struct bch_replicas_entry *r, + s64 sectors) +{ + struct replicas_delta_list *d; + struct replicas_delta *n; + unsigned b = replicas_entry_bytes(r) + 8; + + d = replicas_deltas_realloc(trans, b); + + n = (void *) d->d + d->used; + n->delta = sectors; + memcpy(&n->r, r, replicas_entry_bytes(r)); + d->used += b; +} + +static inline void update_cached_sectors_list(struct btree_trans *trans, + unsigned dev, s64 sectors) +{ + struct bch_replicas_padded r; + + bch2_replicas_entry_cached(&r.e, dev); + + update_replicas_list(trans, &r.e, sectors); +} + +void bch2_replicas_delta_list_apply(struct bch_fs *c, + struct bch_fs_usage *fs_usage, + struct replicas_delta_list *r) +{ + struct replicas_delta *d = r->d; + struct replicas_delta *top = (void *) r->d + r->used; + + acc_u64s((u64 *) fs_usage, + (u64 *) &r->fs_usage, sizeof(*fs_usage) / sizeof(u64)); + + while (d != top) { + BUG_ON((void *) d > (void *) top); + + update_replicas(c, fs_usage, &d->r, d->delta); + + d = (void *) d + replicas_entry_bytes(&d->r) + 8; + } +} + +#define do_mark_fn(fn, c, pos, flags, ...) 
\ +({ \ + int gc, ret = 0; \ + \ + percpu_rwsem_assert_held(&c->mark_lock); \ + \ + for (gc = 0; gc < 2 && !ret; gc++) \ + if (!gc == !(flags & BCH_BUCKET_MARK_GC) || \ + (gc && gc_visited(c, pos))) \ + ret = fn(c, __VA_ARGS__, gc); \ + ret; \ +}) + +static int __bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca, + size_t b, struct bucket_mark *ret, + bool gc) +{ + struct bch_fs_usage *fs_usage = fs_usage_ptr(c, 0, gc); + struct bucket *g = __bucket(ca, b, gc); + struct bucket_mark old, new; + + old = bucket_data_cmpxchg(c, ca, fs_usage, g, new, ({ + BUG_ON(!is_available_bucket(new)); - new.owned_by_allocator = 1; + new.owned_by_allocator = true; + new.dirty = true; new.data_type = 0; new.cached_sectors = 0; new.dirty_sectors = 0; new.gen++; })); + if (old.cached_sectors) + update_cached_sectors(c, fs_usage, ca->dev_idx, + -((s64) old.cached_sectors)); + + if (!gc) + *ret = old; + return 0; +} + +void bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca, + size_t b, struct bucket_mark *old) +{ + do_mark_fn(__bch2_invalidate_bucket, c, gc_phase(GC_PHASE_START), 0, + ca, b, old); + if (!old->owned_by_allocator && old->cached_sectors) trace_invalidate(ca, bucket_to_sector(ca, b), old->cached_sectors); - return true; +} + +static int __bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca, + size_t b, bool owned_by_allocator, + bool gc) +{ + struct bch_fs_usage *fs_usage = fs_usage_ptr(c, 0, gc); + struct bucket *g = __bucket(ca, b, gc); + struct bucket_mark old, new; + + old = bucket_data_cmpxchg(c, ca, fs_usage, g, new, ({ + new.owned_by_allocator = owned_by_allocator; + })); + + BUG_ON(!gc && + !owned_by_allocator && !old.owned_by_allocator); + + return 0; } void bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca, size_t b, bool owned_by_allocator, struct gc_pos pos, unsigned flags) { + preempt_disable(); + + do_mark_fn(__bch2_mark_alloc_bucket, c, pos, flags, + ca, b, owned_by_allocator); + + preempt_enable(); +} + +static int bch2_mark_alloc(struct bch_fs *c, struct bkey_s_c k, + struct bch_fs_usage *fs_usage, + u64 journal_seq, unsigned flags) +{ + bool gc = flags & BCH_BUCKET_MARK_GC; + struct bkey_alloc_unpacked u; + struct bch_dev *ca; struct bucket *g; - struct bucket_mark old, new; + struct bucket_mark old, m; + + /* + * alloc btree is read in by bch2_alloc_read, not gc: + */ + if ((flags & BCH_BUCKET_MARK_GC) && + !(flags & BCH_BUCKET_MARK_BUCKET_INVALIDATE)) + return 0; - percpu_rwsem_assert_held(&c->usage_lock); - g = bucket(ca, b); + ca = bch_dev_bkey_exists(c, k.k->p.inode); - if (!(flags & BCH_BUCKET_MARK_GC_LOCK_HELD) && - gc_will_visit(c, pos)) - return; + if (k.k->p.offset >= ca->mi.nbuckets) + return 0; - old = bucket_data_cmpxchg(c, ca, g, new, ({ - new.owned_by_allocator = owned_by_allocator; + g = __bucket(ca, k.k->p.offset, gc); + u = bch2_alloc_unpack(k); + + old = bucket_cmpxchg(g, m, ({ + m.gen = u.gen; + m.data_type = u.data_type; + m.dirty_sectors = u.dirty_sectors; + m.cached_sectors = u.cached_sectors; + + if (journal_seq) { + m.journal_seq_valid = 1; + m.journal_seq = journal_seq; + } })); - BUG_ON(!owned_by_allocator && !old.owned_by_allocator && - c->gc_pos.phase == GC_PHASE_DONE); + if (!(flags & BCH_BUCKET_MARK_ALLOC_READ)) + bch2_dev_usage_update(c, ca, fs_usage, old, m, gc); + + g->io_time[READ] = u.read_time; + g->io_time[WRITE] = u.write_time; + g->oldest_gen = u.oldest_gen; + g->gen_valid = 1; + + /* + * need to know if we're getting called from the invalidate path or + * not: + */ + + if ((flags & 
BCH_BUCKET_MARK_BUCKET_INVALIDATE) && + old.cached_sectors) { + update_cached_sectors(c, fs_usage, ca->dev_idx, + -old.cached_sectors); + trace_invalidate(ca, bucket_to_sector(ca, k.k->p.offset), + old.cached_sectors); + } + + return 0; } -#define saturated_add(ca, dst, src, max) \ -do { \ - BUG_ON((int) (dst) + (src) < 0); \ - if ((dst) == (max)) \ - ; \ - else if ((dst) + (src) <= (max)) \ - dst += (src); \ - else { \ - dst = (max); \ - trace_sectors_saturated(ca); \ - } \ -} while (0) +#define checked_add(a, b) \ +({ \ + unsigned _res = (unsigned) (a) + (b); \ + bool overflow = _res > U16_MAX; \ + if (overflow) \ + _res = U16_MAX; \ + (a) = _res; \ + overflow; \ +}) + +static int __bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca, + size_t b, enum bch_data_type type, + unsigned sectors, bool gc) +{ + struct bucket *g = __bucket(ca, b, gc); + struct bucket_mark old, new; + bool overflow; + + BUG_ON(type != BCH_DATA_SB && + type != BCH_DATA_JOURNAL); + + old = bucket_cmpxchg(g, new, ({ + new.dirty = true; + new.data_type = type; + overflow = checked_add(new.dirty_sectors, sectors); + })); + + bch2_fs_inconsistent_on(overflow, c, + "bucket sector count overflow: %u + %u > U16_MAX", + old.dirty_sectors, sectors); + + if (c) + bch2_dev_usage_update(c, ca, fs_usage_ptr(c, 0, gc), + old, new, gc); + + return 0; +} void bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca, size_t b, enum bch_data_type type, unsigned sectors, struct gc_pos pos, unsigned flags) { - struct bucket *g; - struct bucket_mark old, new; + BUG_ON(type != BCH_DATA_SB && + type != BCH_DATA_JOURNAL); - BUG_ON(!type); + preempt_disable(); if (likely(c)) { - percpu_rwsem_assert_held(&c->usage_lock); - - if (!(flags & BCH_BUCKET_MARK_GC_LOCK_HELD) && - gc_will_visit(c, pos)) - return; + do_mark_fn(__bch2_mark_metadata_bucket, c, pos, flags, + ca, b, type, sectors); + } else { + __bch2_mark_metadata_bucket(c, ca, b, type, sectors, 0); } - rcu_read_lock(); - - g = bucket(ca, b); - old = bucket_data_cmpxchg(c, ca, g, new, ({ - saturated_add(ca, new.dirty_sectors, sectors, - GC_MAX_SECTORS_USED); - new.data_type = type; - })); - - rcu_read_unlock(); - - BUG_ON(!(flags & BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE) && - bucket_became_unavailable(c, old, new)); + preempt_enable(); } -/* Reverting this until the copygc + compression issue is fixed: */ - -static int __disk_sectors(struct bch_extent_crc_unpacked crc, unsigned sectors) +static s64 ptr_disk_sectors_delta(struct extent_ptr_decoded p, + s64 delta) { - if (!sectors) - return 0; + if (delta > 0) { + /* + * marking a new extent, which _will have size_ @delta + * + * in the bch2_mark_update -> BCH_EXTENT_OVERLAP_MIDDLE + * case, we haven't actually created the key we'll be inserting + * yet (for the split) - so we don't want to be using + * k->size/crc.live_size here: + */ + return __ptr_disk_sectors(p, delta); + } else { + BUG_ON(-delta > p.crc.live_size); - return max(1U, DIV_ROUND_UP(sectors * crc.compressed_size, - crc.uncompressed_size)); + return (s64) __ptr_disk_sectors(p, p.crc.live_size + delta) - + (s64) ptr_disk_sectors(p); + } } -/* - * Checking against gc's position has to be done here, inside the cmpxchg() - * loop, to avoid racing with the start of gc clearing all the marks - GC does - * that with the gc pos seqlock held. 
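
Editor's note on ptr_disk_sectors_delta() just above: for a compressed extent, the sectors charged on disk are the live (uncompressed) sectors scaled by the extent's compression ratio, rounded up and never less than one sector for a nonempty extent — this is the __ptr_disk_sectors() helper that appears later in this diff, in buckets.h — and shrinking an extent is accounted as the difference between the scaled counts before and after. A self-contained user-space sketch of that arithmetic; the struct and function names here are hypothetical stand-ins (the kernel reads these fields from bch_extent_crc_unpacked):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

/* Hypothetical stand-in for the compression fields of
 * bch_extent_crc_unpacked; user-space sketch only. */
struct crc_sketch {
	bool		compressed;
	unsigned	compressed_size;	/* sectors on disk */
	unsigned	uncompressed_size;	/* sectors of data */
};

/* Mirrors __ptr_disk_sectors(): scale live sectors by the compression
 * ratio, rounding up, charging at least one sector when nonempty. */
static unsigned disk_sectors(struct crc_sketch crc, unsigned live)
{
	unsigned v;

	if (!live || !crc.compressed)
		return live;

	v = DIV_ROUND_UP(live * crc.compressed_size,
			 crc.uncompressed_size);
	return v > 1 ? v : 1;
}

int main(void)
{
	/* 128 sectors of data stored in 32 sectors on disk (4:1) */
	struct crc_sketch crc = {
		.compressed		= true,
		.compressed_size	= 32,
		.uncompressed_size	= 128,
	};

	/* Trimming 64 live sectors: the delta charged against disk
	 * usage is disk_sectors(after) - disk_sectors(before). */
	long long delta = (long long) disk_sectors(crc, 128 - 64)
			- (long long) disk_sectors(crc, 128);

	printf("%lld\n", delta);	/* -16 */
	return 0;
}
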
- */ -static void bch2_mark_pointer(struct bch_fs *c, - struct bkey_s_c_extent e, - const struct bch_extent_ptr *ptr, - struct bch_extent_crc_unpacked crc, - s64 sectors, enum s_alloc type, - struct bch_fs_usage *stats, - u64 journal_seq, unsigned flags) +static void bucket_set_stripe(struct bch_fs *c, + const struct bch_stripe *v, + struct bch_fs_usage *fs_usage, + u64 journal_seq, + unsigned flags) { - struct bucket_mark old, new; - unsigned saturated; - struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); - struct bucket *g = PTR_BUCKET(ca, ptr); - enum bch_data_type data_type = type == S_META - ? BCH_DATA_BTREE : BCH_DATA_USER; - u64 v; - - if (crc.compression_type) { - unsigned old_sectors, new_sectors; + bool enabled = !(flags & BCH_BUCKET_MARK_OVERWRITE); + bool gc = flags & BCH_BUCKET_MARK_GC; + unsigned i; - if (sectors > 0) { - old_sectors = 0; - new_sectors = sectors; - } else { - old_sectors = e.k->size; - new_sectors = e.k->size + sectors; - } + for (i = 0; i < v->nr_blocks; i++) { + const struct bch_extent_ptr *ptr = v->ptrs + i; + struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); + struct bucket *g = PTR_BUCKET(ca, ptr, gc); + struct bucket_mark new, old; - sectors = -__disk_sectors(crc, old_sectors) - +__disk_sectors(crc, new_sectors); - } + BUG_ON(ptr_stale(ca, ptr)); - if (flags & BCH_BUCKET_MARK_GC_WILL_VISIT) { - if (journal_seq) - bucket_cmpxchg(g, new, ({ + old = bucket_data_cmpxchg(c, ca, fs_usage, g, new, ({ + new.dirty = true; + new.stripe = enabled; + if (journal_seq) { new.journal_seq_valid = 1; new.journal_seq = journal_seq; - })); - - return; + } + })); } +} + +static bool bch2_mark_pointer(struct bch_fs *c, + struct extent_ptr_decoded p, + s64 sectors, enum bch_data_type data_type, + struct bch_fs_usage *fs_usage, + u64 journal_seq, unsigned flags) +{ + bool gc = flags & BCH_BUCKET_MARK_GC; + struct bucket_mark old, new; + struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev); + struct bucket *g = PTR_BUCKET(ca, &p.ptr, gc); + bool overflow; + u64 v; v = atomic64_read(&g->_mark.v); do { new.v.counter = old.v.counter = v; - saturated = 0; + + new.dirty = true; /* * Check this after reading bucket mark to guard against * the allocator invalidating a bucket after we've already * checked the gen */ - if (gen_after(new.gen, ptr->gen)) { + if (gen_after(new.gen, p.ptr.gen)) { BUG_ON(!test_bit(BCH_FS_ALLOC_READ_DONE, &c->flags)); - EBUG_ON(!ptr->cached && + EBUG_ON(!p.ptr.cached && test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags)); - return; + return true; } - if (!ptr->cached && - new.dirty_sectors == GC_MAX_SECTORS_USED && - sectors < 0) - saturated = -sectors; - - if (ptr->cached) - saturated_add(ca, new.cached_sectors, sectors, - GC_MAX_SECTORS_USED); + if (!p.ptr.cached) + overflow = checked_add(new.dirty_sectors, sectors); else - saturated_add(ca, new.dirty_sectors, sectors, - GC_MAX_SECTORS_USED); + overflow = checked_add(new.cached_sectors, sectors); if (!new.dirty_sectors && !new.cached_sectors) { @@ -596,129 +915,769 @@ static void bch2_mark_pointer(struct bch_fs *c, old.v.counter, new.v.counter)) != old.v.counter); - bch2_dev_usage_update(c, ca, old, new); + bch2_fs_inconsistent_on(overflow, c, + "bucket sector count overflow: %u + %lli > U16_MAX", + !p.ptr.cached + ? 
old.dirty_sectors + : old.cached_sectors, sectors); + + bch2_dev_usage_update(c, ca, fs_usage, old, new, gc); + + BUG_ON(!gc && bucket_became_unavailable(old, new)); + + return false; +} + +static int bch2_mark_stripe_ptr(struct bch_fs *c, + struct bch_extent_stripe_ptr p, + enum bch_data_type data_type, + struct bch_fs_usage *fs_usage, + s64 sectors, unsigned flags) +{ + bool gc = flags & BCH_BUCKET_MARK_GC; + struct stripe *m; + unsigned old, new, nr_data; + int blocks_nonempty_delta; + s64 parity_sectors; + + BUG_ON(!sectors); + + m = genradix_ptr(&c->stripes[gc], p.idx); + + spin_lock(&c->ec_stripes_heap_lock); + + if (!m || !m->alive) { + spin_unlock(&c->ec_stripes_heap_lock); + bch_err_ratelimited(c, "pointer to nonexistent stripe %llu", + (u64) p.idx); + return -1; + } + + BUG_ON(m->r.e.data_type != data_type); + + nr_data = m->nr_blocks - m->nr_redundant; + + parity_sectors = DIV_ROUND_UP(abs(sectors) * m->nr_redundant, nr_data); + + if (sectors < 0) + parity_sectors = -parity_sectors; + sectors += parity_sectors; + + old = m->block_sectors[p.block]; + m->block_sectors[p.block] += sectors; + new = m->block_sectors[p.block]; + + blocks_nonempty_delta = (int) !!new - (int) !!old; + if (blocks_nonempty_delta) { + m->blocks_nonempty += blocks_nonempty_delta; + + if (!gc) + bch2_stripes_heap_update(c, m, p.idx); + } + + m->dirty = true; + + spin_unlock(&c->ec_stripes_heap_lock); + + update_replicas(c, fs_usage, &m->r.e, sectors); - BUG_ON(!(flags & BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE) && - bucket_became_unavailable(c, old, new)); + return 0; +} - if (saturated && - atomic_long_add_return(saturated, - &ca->saturated_count) >= - bucket_to_sector(ca, ca->free_inc.size)) { - if (c->gc_thread) { - trace_gc_sectors_saturated(c); - wake_up_process(c->gc_thread); +static int bch2_mark_extent(struct bch_fs *c, struct bkey_s_c k, + s64 sectors, enum bch_data_type data_type, + struct bch_fs_usage *fs_usage, + unsigned journal_seq, unsigned flags) +{ + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; + struct bch_replicas_padded r; + s64 dirty_sectors = 0; + unsigned i; + int ret; + + r.e.data_type = data_type; + r.e.nr_devs = 0; + r.e.nr_required = 1; + + BUG_ON(!sectors); + + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { + s64 disk_sectors = data_type == BCH_DATA_BTREE + ? 
sectors + : ptr_disk_sectors_delta(p, sectors); + bool stale = bch2_mark_pointer(c, p, disk_sectors, data_type, + fs_usage, journal_seq, flags); + + if (p.ptr.cached) { + if (disk_sectors && !stale) + update_cached_sectors(c, fs_usage, p.ptr.dev, + disk_sectors); + } else if (!p.ec_nr) { + dirty_sectors += disk_sectors; + r.e.devs[r.e.nr_devs++] = p.ptr.dev; + } else { + for (i = 0; i < p.ec_nr; i++) { + ret = bch2_mark_stripe_ptr(c, p.ec[i], + data_type, fs_usage, + disk_sectors, flags); + if (ret) + return ret; + } + + r.e.nr_required = 0; } } + + if (dirty_sectors) + update_replicas(c, fs_usage, &r.e, dirty_sectors); + + return 0; } -void bch2_mark_key(struct bch_fs *c, struct bkey_s_c k, - s64 sectors, bool metadata, - struct gc_pos pos, - struct bch_fs_usage *stats, +static int bch2_mark_stripe(struct bch_fs *c, struct bkey_s_c k, + struct bch_fs_usage *fs_usage, + u64 journal_seq, unsigned flags) +{ + bool gc = flags & BCH_BUCKET_MARK_GC; + struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k); + size_t idx = s.k->p.offset; + struct stripe *m = genradix_ptr(&c->stripes[gc], idx); + unsigned i; + + spin_lock(&c->ec_stripes_heap_lock); + + if (!m || ((flags & BCH_BUCKET_MARK_OVERWRITE) && !m->alive)) { + spin_unlock(&c->ec_stripes_heap_lock); + bch_err_ratelimited(c, "error marking nonexistent stripe %zu", + idx); + return -1; + } + + if (!(flags & BCH_BUCKET_MARK_OVERWRITE)) { + m->sectors = le16_to_cpu(s.v->sectors); + m->algorithm = s.v->algorithm; + m->nr_blocks = s.v->nr_blocks; + m->nr_redundant = s.v->nr_redundant; + + bch2_bkey_to_replicas(&m->r.e, k); + + /* + * XXX: account for stripes somehow here + */ +#if 0 + update_replicas(c, fs_usage, &m->r.e, stripe_sectors); +#endif + + /* gc recalculates these fields: */ + if (!(flags & BCH_BUCKET_MARK_GC)) { + for (i = 0; i < s.v->nr_blocks; i++) { + m->block_sectors[i] = + stripe_blockcount_get(s.v, i); + m->blocks_nonempty += !!m->block_sectors[i]; + } + } + + if (!gc) + bch2_stripes_heap_update(c, m, idx); + m->alive = true; + } else { + if (!gc) + bch2_stripes_heap_del(c, m, idx); + memset(m, 0, sizeof(*m)); + } + + spin_unlock(&c->ec_stripes_heap_lock); + + bucket_set_stripe(c, s.v, fs_usage, 0, flags); + return 0; +} + +int bch2_mark_key_locked(struct bch_fs *c, + struct bkey_s_c k, s64 sectors, + struct bch_fs_usage *fs_usage, u64 journal_seq, unsigned flags) { - /* - * synchronization w.r.t. GC: - * - * Normally, bucket sector counts/marks are updated on the fly, as - * references are added/removed from the btree, the lists of buckets the - * allocator owns, other metadata buckets, etc. - * - * When GC is in progress and going to mark this reference, we do _not_ - * mark this reference here, to avoid double counting - GC will count it - * when it gets to it. - * - * To know whether we should mark a given reference (GC either isn't - * running, or has already marked references at this position) we - * construct a total order for everything GC walks. Then, we can simply - * compare the position of the reference we're marking - @pos - with - * GC's current position. If GC is going to mark this reference, GC's - * current position will be less than @pos; if GC's current position is - * greater than @pos GC has either already walked this position, or - * isn't running. 
- * - * To avoid racing with GC's position changing, we have to deal with - * - GC's position being set to GC_POS_MIN when GC starts: - * usage_lock guards against this - * - GC's position overtaking @pos: we guard against this with - * whatever lock protects the data structure the reference lives in - * (e.g. the btree node lock, or the relevant allocator lock). - */ + int ret = 0; - percpu_down_read_preempt_disable(&c->usage_lock); - if (!(flags & BCH_BUCKET_MARK_GC_LOCK_HELD) && - gc_will_visit(c, pos)) - flags |= BCH_BUCKET_MARK_GC_WILL_VISIT; + preempt_disable(); - if (!stats) - stats = this_cpu_ptr(c->usage_percpu); + if (!fs_usage || (flags & BCH_BUCKET_MARK_GC)) + fs_usage = fs_usage_ptr(c, journal_seq, + flags & BCH_BUCKET_MARK_GC); switch (k.k->type) { - case BCH_EXTENT: - case BCH_EXTENT_CACHED: { - struct bkey_s_c_extent e = bkey_s_c_to_extent(k); - const struct bch_extent_ptr *ptr; - struct bch_extent_crc_unpacked crc; - enum s_alloc type = metadata ? S_META : S_DIRTY; - unsigned replicas = 0; - - BUG_ON(metadata && bkey_extent_is_cached(e.k)); - BUG_ON(!sectors); - - extent_for_each_ptr_crc(e, ptr, crc) { - bch2_mark_pointer(c, e, ptr, crc, sectors, type, - stats, journal_seq, flags); - replicas += !ptr->cached; + case KEY_TYPE_alloc: + ret = bch2_mark_alloc(c, k, fs_usage, journal_seq, flags); + break; + case KEY_TYPE_btree_ptr: + sectors = !(flags & BCH_BUCKET_MARK_OVERWRITE) + ? c->opts.btree_node_size + : -c->opts.btree_node_size; + + ret = bch2_mark_extent(c, k, sectors, BCH_DATA_BTREE, + fs_usage, journal_seq, flags); + break; + case KEY_TYPE_extent: + ret = bch2_mark_extent(c, k, sectors, BCH_DATA_USER, + fs_usage, journal_seq, flags); + break; + case KEY_TYPE_stripe: + ret = bch2_mark_stripe(c, k, fs_usage, journal_seq, flags); + break; + case KEY_TYPE_inode: + if (!(flags & BCH_BUCKET_MARK_OVERWRITE)) + fs_usage->nr_inodes++; + else + fs_usage->nr_inodes--; + break; + case KEY_TYPE_reservation: { + unsigned replicas = bkey_s_c_to_reservation(k).v->nr_replicas; + + sectors *= replicas; + replicas = clamp_t(unsigned, replicas, 1, + ARRAY_SIZE(fs_usage->persistent_reserved)); + + fs_usage->reserved += sectors; + fs_usage->persistent_reserved[replicas - 1] += sectors; + break; + } + } + + preempt_enable(); + + return ret; +} + +int bch2_mark_key(struct bch_fs *c, struct bkey_s_c k, + s64 sectors, + struct bch_fs_usage *fs_usage, + u64 journal_seq, unsigned flags) +{ + int ret; + + percpu_down_read(&c->mark_lock); + ret = bch2_mark_key_locked(c, k, sectors, + fs_usage, journal_seq, flags); + percpu_up_read(&c->mark_lock); + + return ret; +} + +inline int bch2_mark_overwrite(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_s_c old, + struct bkey_i *new, + struct bch_fs_usage *fs_usage, + unsigned flags) +{ + struct bch_fs *c = trans->c; + struct btree *b = iter->l[0].b; + s64 sectors = 0; + + if (btree_node_is_extents(b) + ? 
bkey_cmp(new->k.p, bkey_start_pos(old.k)) <= 0 + : bkey_cmp(new->k.p, old.k->p)) + return 0; + + if (btree_node_is_extents(b)) { + switch (bch2_extent_overlap(&new->k, old.k)) { + case BCH_EXTENT_OVERLAP_ALL: + sectors = -((s64) old.k->size); + break; + case BCH_EXTENT_OVERLAP_BACK: + sectors = bkey_start_offset(&new->k) - + old.k->p.offset; + break; + case BCH_EXTENT_OVERLAP_FRONT: + sectors = bkey_start_offset(old.k) - + new->k.p.offset; + break; + case BCH_EXTENT_OVERLAP_MIDDLE: + sectors = old.k->p.offset - new->k.p.offset; + BUG_ON(sectors <= 0); + + bch2_mark_key_locked(c, old, sectors, + fs_usage, trans->journal_res.seq, + BCH_BUCKET_MARK_INSERT|flags); + + sectors = bkey_start_offset(&new->k) - + old.k->p.offset; + break; } - if (replicas) { - BUG_ON(replicas - 1 > ARRAY_SIZE(stats->s)); - stats->s[replicas - 1].data[type] += sectors; + BUG_ON(sectors >= 0); + } + + return bch2_mark_key_locked(c, old, sectors, fs_usage, + trans->journal_res.seq, + BCH_BUCKET_MARK_OVERWRITE|flags) ?: 1; +} + +int bch2_mark_update(struct btree_trans *trans, + struct btree_insert_entry *insert, + struct bch_fs_usage *fs_usage, + unsigned flags) +{ + struct bch_fs *c = trans->c; + struct btree_iter *iter = insert->iter; + struct btree *b = iter->l[0].b; + struct btree_node_iter node_iter = iter->l[0].iter; + struct bkey_packed *_k; + int ret = 0; + + if (!btree_node_type_needs_gc(iter->btree_id)) + return 0; + + if (!(trans->flags & BTREE_INSERT_NOMARK_INSERT)) + bch2_mark_key_locked(c, bkey_i_to_s_c(insert->k), + bpos_min(insert->k->k.p, b->key.k.p).offset - + bkey_start_offset(&insert->k->k), + fs_usage, trans->journal_res.seq, + BCH_BUCKET_MARK_INSERT|flags); + + if (unlikely(trans->flags & BTREE_INSERT_NOMARK_OVERWRITES)) + return 0; + + /* + * For non extents, we only mark the new key, not the key being + * overwritten - unless we're actually deleting: + */ + if ((iter->btree_id == BTREE_ID_ALLOC || + iter->btree_id == BTREE_ID_EC) && + !bkey_deleted(&insert->k->k)) + return 0; + + while ((_k = bch2_btree_node_iter_peek_filter(&node_iter, b, + KEY_TYPE_discard))) { + struct bkey unpacked; + struct bkey_s_c k = bkey_disassemble(b, _k, &unpacked); + + ret = bch2_mark_overwrite(trans, iter, k, insert->k, + fs_usage, flags); + if (ret <= 0) + break; + + bch2_btree_node_iter_advance(&node_iter, b); + } + + return ret; +} + +void bch2_trans_fs_usage_apply(struct btree_trans *trans, + struct bch_fs_usage *fs_usage) +{ + struct bch_fs *c = trans->c; + struct btree_insert_entry *i; + static int warned_disk_usage = 0; + u64 disk_res_sectors = trans->disk_res ? trans->disk_res->sectors : 0; + char buf[200]; + + if (!bch2_fs_usage_apply(c, fs_usage, trans->disk_res, + trans->journal_res.seq) || + warned_disk_usage || + xchg(&warned_disk_usage, 1)) + return; + + pr_err("disk usage increased more than %llu sectors reserved", disk_res_sectors); + + trans_for_each_update_iter(trans, i) { + struct btree_iter *iter = i->iter; + struct btree *b = iter->l[0].b; + struct btree_node_iter node_iter = iter->l[0].iter; + struct bkey_packed *_k; + + pr_err("while inserting"); + bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(i->k)); + pr_err("%s", buf); + pr_err("overlapping with"); + + node_iter = iter->l[0].iter; + while ((_k = bch2_btree_node_iter_peek_filter(&node_iter, b, + KEY_TYPE_discard))) { + struct bkey unpacked; + struct bkey_s_c k; + + k = bkey_disassemble(b, _k, &unpacked); + + if (btree_node_is_extents(b) + ? 
bkey_cmp(i->k->k.p, bkey_start_pos(k.k)) <= 0 + : bkey_cmp(i->k->k.p, k.k->p)) + break; + + bch2_bkey_val_to_text(&PBUF(buf), c, k); + pr_err("%s", buf); + + bch2_btree_node_iter_advance(&node_iter, b); } - break; } - case BCH_RESERVATION: { - struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k); +} - if (r.v->nr_replicas) { - BUG_ON(r.v->nr_replicas - 1 > ARRAY_SIZE(stats->s)); - stats->s[r.v->nr_replicas - 1].persistent_reserved += sectors; +/* trans_mark: */ + +static int trans_get_key(struct btree_trans *trans, + enum btree_id btree_id, struct bpos pos, + struct btree_iter **iter, + struct bkey_s_c *k) +{ + unsigned i; + int ret; + + for (i = 0; i < trans->nr_updates; i++) + if (!trans->updates[i].deferred && + trans->updates[i].iter->btree_id == btree_id && + !bkey_cmp(pos, trans->updates[i].iter->pos)) { + *iter = trans->updates[i].iter; + *k = bkey_i_to_s_c(trans->updates[i].k); + return 0; } - break; + + *iter = __bch2_trans_get_iter(trans, btree_id, pos, + BTREE_ITER_SLOTS|BTREE_ITER_INTENT, 0); + if (IS_ERR(*iter)) + return PTR_ERR(*iter); + + *k = bch2_btree_iter_peek_slot(*iter); + ret = bkey_err(*k); + if (ret) + bch2_trans_iter_put(trans, *iter); + return ret; +} + +static void *trans_update_key(struct btree_trans *trans, + struct btree_iter *iter, + unsigned u64s) +{ + struct bkey_i *new_k; + unsigned i; + + new_k = bch2_trans_kmalloc(trans, u64s * sizeof(u64)); + if (IS_ERR(new_k)) + return new_k; + + bkey_init(&new_k->k); + new_k->k.p = iter->pos; + + for (i = 0; i < trans->nr_updates; i++) + if (!trans->updates[i].deferred && + trans->updates[i].iter == iter) { + trans->updates[i].k = new_k; + return new_k; + } + + bch2_trans_update(trans, ((struct btree_insert_entry) { + .iter = iter, + .k = new_k, + .triggered = true, + })); + + return new_k; +} + +static int bch2_trans_mark_pointer(struct btree_trans *trans, + struct extent_ptr_decoded p, + s64 sectors, enum bch_data_type data_type) +{ + struct bch_fs *c = trans->c; + struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev); + struct btree_iter *iter; + struct bkey_s_c k; + struct bkey_alloc_unpacked u; + struct bkey_i_alloc *a; + bool overflow; + int ret; + + ret = trans_get_key(trans, BTREE_ID_ALLOC, + POS(p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr)), + &iter, &k); + if (ret) + return ret; + + if (k.k->type != KEY_TYPE_alloc) { + bch_err_ratelimited(c, "pointer to nonexistent bucket %u:%zu", + p.ptr.dev, + PTR_BUCKET_NR(ca, &p.ptr)); + ret = -1; + goto out; } + + u = bch2_alloc_unpack(k); + + if (gen_after(u.gen, p.ptr.gen)) { + ret = 1; + goto out; } - percpu_up_read_preempt_enable(&c->usage_lock); + + if (!p.ptr.cached) + overflow = checked_add(u.dirty_sectors, sectors); + else + overflow = checked_add(u.cached_sectors, sectors); + + u.data_type = u.dirty_sectors || u.cached_sectors + ? data_type : 0; + + bch2_fs_inconsistent_on(overflow, c, + "bucket sector count overflow: %u + %lli > U16_MAX", + !p.ptr.cached + ? 
u.dirty_sectors + : u.cached_sectors, sectors); + + a = trans_update_key(trans, iter, BKEY_ALLOC_U64s_MAX); + ret = PTR_ERR_OR_ZERO(a); + if (ret) + goto out; + + bkey_alloc_init(&a->k_i); + a->k.p = iter->pos; + bch2_alloc_pack(a, u); +out: + bch2_trans_iter_put(trans, iter); + return ret; } -/* Disk reservations: */ +static int bch2_trans_mark_stripe_ptr(struct btree_trans *trans, + struct bch_extent_stripe_ptr p, + s64 sectors, enum bch_data_type data_type) +{ + struct bch_replicas_padded r; + struct btree_iter *iter; + struct bkey_i *new_k; + struct bkey_s_c k; + struct bkey_s_stripe s; + unsigned nr_data; + s64 parity_sectors; + int ret = 0; + + BUG_ON(!sectors); + + ret = trans_get_key(trans, BTREE_ID_EC, POS(0, p.idx), &iter, &k); + if (ret) + return ret; + + if (k.k->type != KEY_TYPE_stripe) { + bch_err_ratelimited(trans->c, + "pointer to nonexistent stripe %llu", + (u64) p.idx); + ret = -1; + goto out; + } + + new_k = trans_update_key(trans, iter, k.k->u64s); + ret = PTR_ERR_OR_ZERO(new_k); + if (ret) + goto out; + + bkey_reassemble(new_k, k); + s = bkey_i_to_s_stripe(new_k); + + nr_data = s.v->nr_blocks - s.v->nr_redundant; + + parity_sectors = DIV_ROUND_UP(abs(sectors) * s.v->nr_redundant, nr_data); + + if (sectors < 0) + parity_sectors = -parity_sectors; -static u64 __recalc_sectors_available(struct bch_fs *c) + stripe_blockcount_set(s.v, p.block, + stripe_blockcount_get(s.v, p.block) + + sectors + parity_sectors); + + bch2_bkey_to_replicas(&r.e, s.s_c); + + update_replicas_list(trans, &r.e, sectors); +out: + bch2_trans_iter_put(trans, iter); + return ret; +} + +static int bch2_trans_mark_extent(struct btree_trans *trans, + struct bkey_s_c k, + s64 sectors, enum bch_data_type data_type) { - int cpu; + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; + struct bch_replicas_padded r; + s64 dirty_sectors = 0; + bool stale; + unsigned i; + int ret; + + r.e.data_type = data_type; + r.e.nr_devs = 0; + r.e.nr_required = 1; + + BUG_ON(!sectors); + + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { + s64 disk_sectors = data_type == BCH_DATA_BTREE + ? sectors + : ptr_disk_sectors_delta(p, sectors); + + ret = bch2_trans_mark_pointer(trans, p, disk_sectors, + data_type); + if (ret < 0) + return ret; + + stale = ret > 0; + + if (p.ptr.cached) { + if (disk_sectors && !stale) + update_cached_sectors_list(trans, p.ptr.dev, + disk_sectors); + } else if (!p.ec_nr) { + dirty_sectors += disk_sectors; + r.e.devs[r.e.nr_devs++] = p.ptr.dev; + } else { + for (i = 0; i < p.ec_nr; i++) { + ret = bch2_trans_mark_stripe_ptr(trans, p.ec[i], + disk_sectors, data_type); + if (ret) + return ret; + } + + r.e.nr_required = 0; + } + } + + if (dirty_sectors) + update_replicas_list(trans, &r.e, dirty_sectors); + + return 0; +} + +int bch2_trans_mark_key(struct btree_trans *trans, struct bkey_s_c k, + s64 sectors, unsigned flags) +{ + struct replicas_delta_list *d; + struct bch_fs *c = trans->c; + + switch (k.k->type) { + case KEY_TYPE_btree_ptr: + sectors = !(flags & BCH_BUCKET_MARK_OVERWRITE) + ? 
c->opts.btree_node_size + : -c->opts.btree_node_size; + + return bch2_trans_mark_extent(trans, k, sectors, + BCH_DATA_BTREE); + case KEY_TYPE_extent: + return bch2_trans_mark_extent(trans, k, sectors, + BCH_DATA_USER); + case KEY_TYPE_inode: + d = replicas_deltas_realloc(trans, 0); + + if (!(flags & BCH_BUCKET_MARK_OVERWRITE)) + d->fs_usage.nr_inodes++; + else + d->fs_usage.nr_inodes--; + return 0; + case KEY_TYPE_reservation: { + unsigned replicas = bkey_s_c_to_reservation(k).v->nr_replicas; + + d = replicas_deltas_realloc(trans, 0); + + sectors *= replicas; + replicas = clamp_t(unsigned, replicas, 1, + ARRAY_SIZE(d->fs_usage.persistent_reserved)); + + d->fs_usage.reserved += sectors; + d->fs_usage.persistent_reserved[replicas - 1] += sectors; + return 0; + } + default: + return 0; + } +} - for_each_possible_cpu(cpu) - per_cpu_ptr(c->usage_percpu, cpu)->available_cache = 0; +int bch2_trans_mark_update(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_i *insert) +{ + struct btree *b = iter->l[0].b; + struct btree_node_iter node_iter = iter->l[0].iter; + struct bkey_packed *_k; + int ret; + + if (!btree_node_type_needs_gc(iter->btree_id)) + return 0; + + ret = bch2_trans_mark_key(trans, + bkey_i_to_s_c(insert), + bpos_min(insert->k.p, b->key.k.p).offset - + bkey_start_offset(&insert->k), + BCH_BUCKET_MARK_INSERT); + if (ret) + return ret; + + while ((_k = bch2_btree_node_iter_peek_filter(&node_iter, b, + KEY_TYPE_discard))) { + struct bkey unpacked; + struct bkey_s_c k; + s64 sectors = 0; + + k = bkey_disassemble(b, _k, &unpacked); + + if (btree_node_is_extents(b) + ? bkey_cmp(insert->k.p, bkey_start_pos(k.k)) <= 0 + : bkey_cmp(insert->k.p, k.k->p)) + break; + + if (btree_node_is_extents(b)) { + switch (bch2_extent_overlap(&insert->k, k.k)) { + case BCH_EXTENT_OVERLAP_ALL: + sectors = -((s64) k.k->size); + break; + case BCH_EXTENT_OVERLAP_BACK: + sectors = bkey_start_offset(&insert->k) - + k.k->p.offset; + break; + case BCH_EXTENT_OVERLAP_FRONT: + sectors = bkey_start_offset(k.k) - + insert->k.p.offset; + break; + case BCH_EXTENT_OVERLAP_MIDDLE: + sectors = k.k->p.offset - insert->k.p.offset; + BUG_ON(sectors <= 0); + + ret = bch2_trans_mark_key(trans, k, sectors, + BCH_BUCKET_MARK_INSERT); + if (ret) + return ret; + + sectors = bkey_start_offset(&insert->k) - + k.k->p.offset; + break; + } - return bch2_fs_sectors_free(c, bch2_fs_usage_read(c)); + BUG_ON(sectors >= 0); + } + + ret = bch2_trans_mark_key(trans, k, sectors, + BCH_BUCKET_MARK_OVERWRITE); + if (ret) + return ret; + + bch2_btree_node_iter_advance(&node_iter, b); + } + + return 0; } -/* Used by gc when it's starting: */ -void bch2_recalc_sectors_available(struct bch_fs *c) +/* Disk reservations: */ + +static u64 bch2_recalc_sectors_available(struct bch_fs *c) { - percpu_down_write(&c->usage_lock); - atomic64_set(&c->sectors_available, __recalc_sectors_available(c)); - percpu_up_write(&c->usage_lock); + percpu_u64_set(&c->pcpu->sectors_available, 0); + + return avail_factor(__bch2_fs_usage_read_short(c).free); } void __bch2_disk_reservation_put(struct bch_fs *c, struct disk_reservation *res) { - percpu_down_read_preempt_disable(&c->usage_lock); - this_cpu_sub(c->usage_percpu->online_reserved, + percpu_down_read(&c->mark_lock); + this_cpu_sub(c->usage[0]->online_reserved, res->sectors); - - bch2_fs_stats_verify(c); - percpu_up_read_preempt_enable(&c->usage_lock); + percpu_up_read(&c->mark_lock); res->sectors = 0; } @@ -728,15 +1687,16 @@ void __bch2_disk_reservation_put(struct bch_fs *c, struct disk_reservation 
*res) int bch2_disk_reservation_add(struct bch_fs *c, struct disk_reservation *res, unsigned sectors, int flags) { - struct bch_fs_usage *stats; + struct bch_fs_pcpu *pcpu; u64 old, v, get; s64 sectors_available; int ret; - percpu_down_read_preempt_disable(&c->usage_lock); - stats = this_cpu_ptr(c->usage_percpu); + percpu_down_read(&c->mark_lock); + preempt_disable(); + pcpu = this_cpu_ptr(c->pcpu); - if (sectors <= stats->available_cache) + if (sectors <= pcpu->sectors_available) goto out; v = atomic64_read(&c->sectors_available); @@ -745,64 +1705,42 @@ int bch2_disk_reservation_add(struct bch_fs *c, struct disk_reservation *res, get = min((u64) sectors + SECTORS_CACHE, old); if (get < sectors) { - percpu_up_read_preempt_enable(&c->usage_lock); + preempt_enable(); + percpu_up_read(&c->mark_lock); goto recalculate; } } while ((v = atomic64_cmpxchg(&c->sectors_available, old, old - get)) != old); - stats->available_cache += get; + pcpu->sectors_available += get; out: - stats->available_cache -= sectors; - stats->online_reserved += sectors; - res->sectors += sectors; + pcpu->sectors_available -= sectors; + this_cpu_add(c->usage[0]->online_reserved, sectors); + res->sectors += sectors; - bch2_disk_reservations_verify(c, flags); - bch2_fs_stats_verify(c); - percpu_up_read_preempt_enable(&c->usage_lock); + preempt_enable(); + percpu_up_read(&c->mark_lock); return 0; recalculate: - /* - * GC recalculates sectors_available when it starts, so that hopefully - * we don't normally end up blocking here: - */ + percpu_down_write(&c->mark_lock); - /* - * Piss fuck, we can be called from extent_insert_fixup() with btree - * locks held: - */ - - if (!(flags & BCH_DISK_RESERVATION_GC_LOCK_HELD)) { - if (!(flags & BCH_DISK_RESERVATION_BTREE_LOCKS_HELD)) - down_read(&c->gc_lock); - else if (!down_read_trylock(&c->gc_lock)) - return -EINTR; - } - - percpu_down_write(&c->usage_lock); - sectors_available = __recalc_sectors_available(c); + sectors_available = bch2_recalc_sectors_available(c); if (sectors <= sectors_available || (flags & BCH_DISK_RESERVATION_NOFAIL)) { atomic64_set(&c->sectors_available, max_t(s64, 0, sectors_available - sectors)); - stats->online_reserved += sectors; - res->sectors += sectors; + this_cpu_add(c->usage[0]->online_reserved, sectors); + res->sectors += sectors; ret = 0; - - bch2_disk_reservations_verify(c, flags); } else { atomic64_set(&c->sectors_available, sectors_available); ret = -ENOSPC; } - bch2_fs_stats_verify(c); - percpu_up_write(&c->usage_lock); - - if (!(flags & BCH_DISK_RESERVATION_GC_LOCK_HELD)) - up_read(&c->gc_lock); + percpu_up_write(&c->mark_lock); return ret; } @@ -822,8 +1760,8 @@ static void buckets_free_rcu(struct rcu_head *rcu) int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) { struct bucket_array *buckets = NULL, *old_buckets = NULL; - unsigned long *buckets_dirty = NULL; - u8 *oldest_gens = NULL; + unsigned long *buckets_nouse = NULL; + unsigned long *buckets_written = NULL; alloc_fifo free[RESERVE_NR]; alloc_fifo free_inc; alloc_heap alloc_heap; @@ -832,10 +1770,11 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) size_t btree_reserve = DIV_ROUND_UP(BTREE_NODE_RESERVE, ca->mi.bucket_size / c->opts.btree_node_size); /* XXX: these should be tunable */ - size_t reserve_none = max_t(size_t, 4, ca->mi.nbuckets >> 9); - size_t copygc_reserve = max_t(size_t, 16, ca->mi.nbuckets >> 7); - size_t free_inc_reserve = copygc_reserve / 2; - bool resize = ca->buckets != NULL, + size_t reserve_none = 
max_t(size_t, 1, nbuckets >> 9); + size_t copygc_reserve = max_t(size_t, 2, nbuckets >> 7); + size_t free_inc_nr = max(max_t(size_t, 1, nbuckets >> 12), + btree_reserve * 2); + bool resize = ca->buckets[0] != NULL, start_copygc = ca->copygc_thread != NULL; int ret = -ENOMEM; unsigned i; @@ -848,17 +1787,18 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) if (!(buckets = kvpmalloc(sizeof(struct bucket_array) + nbuckets * sizeof(struct bucket), GFP_KERNEL|__GFP_ZERO)) || - !(oldest_gens = kvpmalloc(nbuckets * sizeof(u8), + !(buckets_nouse = kvpmalloc(BITS_TO_LONGS(nbuckets) * + sizeof(unsigned long), GFP_KERNEL|__GFP_ZERO)) || - !(buckets_dirty = kvpmalloc(BITS_TO_LONGS(nbuckets) * + !(buckets_written = kvpmalloc(BITS_TO_LONGS(nbuckets) * sizeof(unsigned long), GFP_KERNEL|__GFP_ZERO)) || !init_fifo(&free[RESERVE_BTREE], btree_reserve, GFP_KERNEL) || !init_fifo(&free[RESERVE_MOVINGGC], copygc_reserve, GFP_KERNEL) || !init_fifo(&free[RESERVE_NONE], reserve_none, GFP_KERNEL) || - !init_fifo(&free_inc, free_inc_reserve, GFP_KERNEL) || - !init_heap(&alloc_heap, free_inc_reserve, GFP_KERNEL) || + !init_fifo(&free_inc, free_inc_nr, GFP_KERNEL) || + !init_heap(&alloc_heap, ALLOC_SCAN_BATCH(ca) << 1, GFP_KERNEL) || !init_heap(©gc_heap, copygc_reserve, GFP_KERNEL)) goto err; @@ -870,7 +1810,7 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) if (resize) { down_write(&c->gc_lock); down_write(&ca->bucket_lock); - percpu_down_write(&c->usage_lock); + percpu_down_write(&c->mark_lock); } old_buckets = bucket_array(ca); @@ -881,22 +1821,22 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) memcpy(buckets->b, old_buckets->b, n * sizeof(struct bucket)); - memcpy(oldest_gens, - ca->oldest_gens, - n * sizeof(u8)); - memcpy(buckets_dirty, - ca->buckets_dirty, + memcpy(buckets_nouse, + ca->buckets_nouse, + BITS_TO_LONGS(n) * sizeof(unsigned long)); + memcpy(buckets_written, + ca->buckets_written, BITS_TO_LONGS(n) * sizeof(unsigned long)); } - rcu_assign_pointer(ca->buckets, buckets); + rcu_assign_pointer(ca->buckets[0], buckets); buckets = old_buckets; - swap(ca->oldest_gens, oldest_gens); - swap(ca->buckets_dirty, buckets_dirty); + swap(ca->buckets_nouse, buckets_nouse); + swap(ca->buckets_written, buckets_written); if (resize) - percpu_up_write(&c->usage_lock); + percpu_up_write(&c->mark_lock); spin_lock(&c->freelist_lock); for (i = 0; i < RESERVE_NR; i++) { @@ -931,10 +1871,10 @@ err: free_fifo(&free_inc); for (i = 0; i < RESERVE_NR; i++) free_fifo(&free[i]); - kvpfree(buckets_dirty, + kvpfree(buckets_nouse, + BITS_TO_LONGS(nbuckets) * sizeof(unsigned long)); + kvpfree(buckets_written, BITS_TO_LONGS(nbuckets) * sizeof(unsigned long)); - kvpfree(oldest_gens, - nbuckets * sizeof(u8)); if (buckets) call_rcu(&old_buckets->rcu, buckets_free_rcu); @@ -950,19 +1890,20 @@ void bch2_dev_buckets_free(struct bch_dev *ca) free_fifo(&ca->free_inc); for (i = 0; i < RESERVE_NR; i++) free_fifo(&ca->free[i]); - kvpfree(ca->buckets_dirty, + kvpfree(ca->buckets_written, + BITS_TO_LONGS(ca->mi.nbuckets) * sizeof(unsigned long)); + kvpfree(ca->buckets_nouse, BITS_TO_LONGS(ca->mi.nbuckets) * sizeof(unsigned long)); - kvpfree(ca->oldest_gens, ca->mi.nbuckets * sizeof(u8)); - kvpfree(rcu_dereference_protected(ca->buckets, 1), + kvpfree(rcu_dereference_protected(ca->buckets[0], 1), sizeof(struct bucket_array) + ca->mi.nbuckets * sizeof(struct bucket)); - free_percpu(ca->usage_percpu); + free_percpu(ca->usage[0]); } int 
bch2_dev_buckets_alloc(struct bch_fs *c, struct bch_dev *ca) { - if (!(ca->usage_percpu = alloc_percpu(struct bch_dev_usage))) + if (!(ca->usage[0] = alloc_percpu(struct bch_dev_usage))) return -ENOMEM; return bch2_dev_buckets_resize(c, ca, ca->mi.nbuckets);; diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h index 4deb6c37391c..5ab6f3d34137 100644 --- a/fs/bcachefs/buckets.h +++ b/fs/bcachefs/buckets.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * Code for manipulating bucket marks for garbage collection. * @@ -16,35 +17,47 @@ #define bucket_cmpxchg(g, new, expr) \ ({ \ + struct bucket *_g = g; \ u64 _v = atomic64_read(&(g)->_mark.v); \ struct bucket_mark _old; \ \ do { \ (new).v.counter = _old.v.counter = _v; \ expr; \ - } while ((_v = atomic64_cmpxchg(&(g)->_mark.v, \ + } while ((_v = atomic64_cmpxchg(&(_g)->_mark.v, \ _old.v.counter, \ (new).v.counter)) != _old.v.counter);\ _old; \ }) -static inline struct bucket_array *bucket_array(struct bch_dev *ca) +static inline struct bucket_array *__bucket_array(struct bch_dev *ca, + bool gc) { - return rcu_dereference_check(ca->buckets, + return rcu_dereference_check(ca->buckets[gc], !ca->fs || - percpu_rwsem_is_held(&ca->fs->usage_lock) || + percpu_rwsem_is_held(&ca->fs->mark_lock) || lockdep_is_held(&ca->fs->gc_lock) || lockdep_is_held(&ca->bucket_lock)); } -static inline struct bucket *bucket(struct bch_dev *ca, size_t b) +static inline struct bucket_array *bucket_array(struct bch_dev *ca) { - struct bucket_array *buckets = bucket_array(ca); + return __bucket_array(ca, false); +} + +static inline struct bucket *__bucket(struct bch_dev *ca, size_t b, bool gc) +{ + struct bucket_array *buckets = __bucket_array(ca, gc); BUG_ON(b < buckets->first_bucket || b >= buckets->nbuckets); return buckets->b + b; } +static inline struct bucket *bucket(struct bch_dev *ca, size_t b) +{ + return __bucket(ca, b, false); +} + static inline void bucket_io_clock_reset(struct bch_fs *c, struct bch_dev *ca, size_t b, int rw) { @@ -63,7 +76,9 @@ static inline u16 bucket_last_io(struct bch_fs *c, struct bucket *g, int rw) static inline u8 bucket_gc_gen(struct bch_dev *ca, size_t b) { - return bucket(ca, b)->mark.gen - ca->oldest_gens[b]; + struct bucket *g = bucket(ca, b); + + return g->mark.gen - g->oldest_gen; } static inline size_t PTR_BUCKET_NR(const struct bch_dev *ca, @@ -73,9 +88,10 @@ static inline size_t PTR_BUCKET_NR(const struct bch_dev *ca, } static inline struct bucket *PTR_BUCKET(struct bch_dev *ca, - const struct bch_extent_ptr *ptr) + const struct bch_extent_ptr *ptr, + bool gc) { - return bucket(ca, PTR_BUCKET_NR(ca, ptr)); + return __bucket(ca, PTR_BUCKET_NR(ca, ptr), gc); } static inline struct bucket_mark ptr_bucket_mark(struct bch_dev *ca, @@ -84,7 +100,7 @@ static inline struct bucket_mark ptr_bucket_mark(struct bch_dev *ca, struct bucket_mark m; rcu_read_lock(); - m = READ_ONCE(bucket(ca, PTR_BUCKET_NR(ca, ptr))->mark); + m = READ_ONCE(PTR_BUCKET(ca, ptr, 0)->mark); rcu_read_unlock(); return m; @@ -112,12 +128,21 @@ static inline u8 ptr_stale(struct bch_dev *ca, return gen_after(ptr_bucket_mark(ca, ptr).gen, ptr->gen); } -/* bucket gc marks */ +static inline unsigned __ptr_disk_sectors(struct extent_ptr_decoded p, + unsigned live_size) +{ + return live_size && p.crc.compression_type + ? 
max(1U, DIV_ROUND_UP(live_size * p.crc.compressed_size, + p.crc.uncompressed_size)) + : live_size; +} + +static inline unsigned ptr_disk_sectors(struct extent_ptr_decoded p) +{ + return __ptr_disk_sectors(p, p.crc.live_size); +} -/* The dirty and cached sector counts saturate. If this occurs, - * reference counting alone will not free the bucket, and a btree - * GC must be performed. */ -#define GC_MAX_SECTORS_USED ((1U << 15) - 1) +/* bucket gc marks */ static inline unsigned bucket_sectors_used(struct bucket_mark mark) { @@ -131,11 +156,26 @@ static inline bool bucket_unused(struct bucket_mark mark) !bucket_sectors_used(mark); } +static inline bool is_available_bucket(struct bucket_mark mark) +{ + return (!mark.owned_by_allocator && + !mark.dirty_sectors && + !mark.stripe); +} + +static inline bool bucket_needs_journal_commit(struct bucket_mark m, + u16 last_seq_ondisk) +{ + return m.journal_seq_valid && + ((s16) m.journal_seq - (s16) last_seq_ondisk > 0); +} + /* Device usage: */ -struct bch_dev_usage __bch2_dev_usage_read(struct bch_dev *); struct bch_dev_usage bch2_dev_usage_read(struct bch_fs *, struct bch_dev *); +void bch2_dev_usage_from_buckets(struct bch_fs *); + static inline u64 __dev_buckets_available(struct bch_dev *ca, struct bch_dev_usage stats) { @@ -172,44 +212,36 @@ static inline u64 dev_buckets_free(struct bch_fs *c, struct bch_dev *ca) /* Filesystem usage: */ -static inline enum bch_data_type s_alloc_to_data_type(enum s_alloc s) +static inline unsigned fs_usage_u64s(struct bch_fs *c) { - switch (s) { - case S_META: - return BCH_DATA_BTREE; - case S_DIRTY: - return BCH_DATA_USER; - default: - BUG(); - } + + return sizeof(struct bch_fs_usage) / sizeof(u64) + + READ_ONCE(c->replicas.nr); } -struct bch_fs_usage __bch2_fs_usage_read(struct bch_fs *); -struct bch_fs_usage bch2_fs_usage_read(struct bch_fs *); -void bch2_fs_usage_apply(struct bch_fs *, struct bch_fs_usage *, - struct disk_reservation *, struct gc_pos); +void bch2_fs_usage_scratch_put(struct bch_fs *, struct bch_fs_usage *); +struct bch_fs_usage *bch2_fs_usage_scratch_get(struct bch_fs *); -u64 __bch2_fs_sectors_used(struct bch_fs *, struct bch_fs_usage); -u64 bch2_fs_sectors_used(struct bch_fs *, struct bch_fs_usage); -u64 bch2_fs_sectors_free(struct bch_fs *, struct bch_fs_usage); +u64 bch2_fs_usage_read_one(struct bch_fs *, u64 *); -static inline bool is_available_bucket(struct bucket_mark mark) -{ - return (!mark.owned_by_allocator && - !mark.dirty_sectors && - !mark.nouse); -} +struct bch_fs_usage *bch2_fs_usage_read(struct bch_fs *); -static inline bool bucket_needs_journal_commit(struct bucket_mark m, - u16 last_seq_ondisk) -{ - return m.journal_seq_valid && - ((s16) m.journal_seq - (s16) last_seq_ondisk > 0); -} +void bch2_fs_usage_acc_to_base(struct bch_fs *, unsigned); + +void bch2_fs_usage_to_text(struct printbuf *, + struct bch_fs *, struct bch_fs_usage *); + +u64 bch2_fs_sectors_used(struct bch_fs *, struct bch_fs_usage *); + +struct bch_fs_usage_short +bch2_fs_usage_read_short(struct bch_fs *); + +/* key/bucket marking: */ void bch2_bucket_seq_cleanup(struct bch_fs *); +void bch2_fs_usage_initialize(struct bch_fs *); -bool bch2_invalidate_bucket(struct bch_fs *, struct bch_dev *, +void bch2_invalidate_bucket(struct bch_fs *, struct bch_dev *, size_t, struct bucket_mark *); void bch2_mark_alloc_bucket(struct bch_fs *, struct bch_dev *, size_t, bool, struct gc_pos, unsigned); @@ -217,15 +249,36 @@ void bch2_mark_metadata_bucket(struct bch_fs *, struct bch_dev *, size_t, enum bch_data_type, unsigned, 
struct gc_pos, unsigned); -#define BCH_BUCKET_MARK_NOATOMIC (1 << 0) -#define BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE (1 << 1) -#define BCH_BUCKET_MARK_GC_WILL_VISIT (1 << 2) -#define BCH_BUCKET_MARK_GC_LOCK_HELD (1 << 3) - -void bch2_mark_key(struct bch_fs *, struct bkey_s_c, s64, bool, struct gc_pos, - struct bch_fs_usage *, u64, unsigned); - -void bch2_recalc_sectors_available(struct bch_fs *); +#define BCH_BUCKET_MARK_INSERT (1 << 0) +#define BCH_BUCKET_MARK_OVERWRITE (1 << 1) +#define BCH_BUCKET_MARK_BUCKET_INVALIDATE (1 << 2) +#define BCH_BUCKET_MARK_GC (1 << 3) +#define BCH_BUCKET_MARK_ALLOC_READ (1 << 4) +#define BCH_BUCKET_MARK_NOATOMIC (1 << 5) + +int bch2_mark_key_locked(struct bch_fs *, struct bkey_s_c, s64, + struct bch_fs_usage *, u64, unsigned); +int bch2_mark_key(struct bch_fs *, struct bkey_s_c, s64, + struct bch_fs_usage *, u64, unsigned); +int bch2_fs_usage_apply(struct bch_fs *, struct bch_fs_usage *, + struct disk_reservation *, unsigned); + +int bch2_mark_overwrite(struct btree_trans *, struct btree_iter *, + struct bkey_s_c, struct bkey_i *, + struct bch_fs_usage *, unsigned); +int bch2_mark_update(struct btree_trans *, struct btree_insert_entry *, + struct bch_fs_usage *, unsigned); + +void bch2_replicas_delta_list_apply(struct bch_fs *, + struct bch_fs_usage *, + struct replicas_delta_list *); +int bch2_trans_mark_key(struct btree_trans *, struct bkey_s_c, s64, unsigned); +int bch2_trans_mark_update(struct btree_trans *, + struct btree_iter *iter, + struct bkey_i *insert); +void bch2_trans_fs_usage_apply(struct btree_trans *, struct bch_fs_usage *); + +/* disk reservations: */ void __bch2_disk_reservation_put(struct bch_fs *, struct disk_reservation *); @@ -237,8 +290,6 @@ static inline void bch2_disk_reservation_put(struct bch_fs *c, } #define BCH_DISK_RESERVATION_NOFAIL (1 << 0) -#define BCH_DISK_RESERVATION_GC_LOCK_HELD (1 << 1) -#define BCH_DISK_RESERVATION_BTREE_LOCKS_HELD (1 << 2) int bch2_disk_reservation_add(struct bch_fs *, struct disk_reservation *, diff --git a/fs/bcachefs/buckets_types.h b/fs/bcachefs/buckets_types.h index 10f00861385e..e51d297976be 100644 --- a/fs/bcachefs/buckets_types.h +++ b/fs/bcachefs/buckets_types.h @@ -1,31 +1,33 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _BUCKETS_TYPES_H #define _BUCKETS_TYPES_H +#include "bcachefs_format.h" #include "util.h" +#define BUCKET_JOURNAL_SEQ_BITS 16 + struct bucket_mark { union { - struct { - atomic64_t v; - }; + atomic64_t v; struct { - u8 gen; - u8 data_type:3, - gen_valid:1, - owned_by_allocator:1, - nouse:1, - journal_seq_valid:1; - u16 dirty_sectors; - u16 cached_sectors; - - /* - * low bits of journal sequence number when this bucket was most - * recently modified: if journal_seq_valid is set, this bucket - * can't be reused until the journal sequence number written to - * disk is >= the bucket's journal sequence number: - */ - u16 journal_seq; + u8 gen; + u8 data_type:3, + owned_by_allocator:1, + dirty:1, + journal_seq_valid:1, + stripe:1; + u16 dirty_sectors; + u16 cached_sectors; + + /* + * low bits of journal sequence number when this bucket was most + * recently modified: if journal_seq_valid is set, this bucket can't be + * reused until the journal sequence number written to disk is >= the + * bucket's journal sequence number: + */ + u16 journal_seq; }; }; }; @@ -37,6 +39,8 @@ struct bucket { }; u16 io_time[2]; + u8 oldest_gen; + unsigned gen_valid:1; }; struct bucket_array { @@ -49,6 +53,7 @@ struct bucket_array { struct bch_dev_usage { u64 buckets[BCH_DATA_NR]; u64 buckets_alloc; 
+ u64 buckets_ec; u64 buckets_unavailable; /* _compressed_ sectors: */ @@ -56,32 +61,59 @@ struct bch_dev_usage { u64 sectors_fragmented; }; -/* kill, switch to bch_data_type? */ -enum s_alloc { - S_META, - S_DIRTY, - S_ALLOC_NR, -}; - struct bch_fs_usage { /* all fields are in units of 512 byte sectors: */ - /* _uncompressed_ sectors: */ + u64 online_reserved; - u64 available_cache; - struct { - u64 data[S_ALLOC_NR]; - u64 persistent_reserved; - } s[BCH_REPLICAS_MAX]; + /* fields after online_reserved are cleared/recalculated by gc: */ + u64 gc_start[0]; + + u64 hidden; + u64 btree; + u64 data; + u64 cached; + u64 reserved; + u64 nr_inodes; + + /* XXX: add stats for compression ratio */ +#if 0 + u64 uncompressed; + u64 compressed; +#endif + + /* broken out: */ + + u64 persistent_reserved[BCH_REPLICAS_MAX]; + u64 replicas[]; +}; + +struct bch_fs_usage_short { + u64 capacity; + u64 used; + u64 free; + u64 nr_inodes; +}; + +struct replicas_delta { + s64 delta; + struct bch_replicas_entry r; +} __packed; + +struct replicas_delta_list { + unsigned size; + unsigned used; + struct bch_fs_usage fs_usage; + struct replicas_delta d[0]; }; /* * A reservation for space on disk: */ struct disk_reservation { - u64 sectors; - u32 gen; - unsigned nr_replicas; + u64 sectors; + u32 gen; + unsigned nr_replicas; }; struct copygc_heap_entry { diff --git a/fs/bcachefs/chardev.c b/fs/bcachefs/chardev.c index 5593b9a1de27..059eca01ccc4 100644 --- a/fs/bcachefs/chardev.c +++ b/fs/bcachefs/chardev.c @@ -1,7 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0 #ifndef NO_BCACHEFS_CHARDEV #include "bcachefs.h" -#include "alloc.h" #include "bcachefs_ioctl.h" #include "buckets.h" #include "chardev.h" @@ -158,7 +158,7 @@ static long bch2_ioctl_start(struct bch_fs *c, struct bch_ioctl_start arg) if (arg.flags || arg.pad) return -EINVAL; - return bch2_fs_start(c) ? 
-EIO : 0; + return bch2_fs_start(c); } static long bch2_ioctl_stop(struct bch_fs *c) @@ -303,10 +303,10 @@ static ssize_t bch2_data_job_read(struct file *file, char __user *buf, struct bch_ioctl_data_event e = { .type = BCH_DATA_EVENT_PROGRESS, .p.data_type = ctx->stats.data_type, - .p.btree_id = ctx->stats.iter.btree_id, - .p.pos = ctx->stats.iter.pos, + .p.btree_id = ctx->stats.btree_id, + .p.pos = ctx->stats.pos, .p.sectors_done = atomic64_read(&ctx->stats.sectors_seen), - .p.sectors_total = bch2_fs_sectors_used(c, bch2_fs_usage_read(c)), + .p.sectors_total = bch2_fs_usage_read_short(c).used, }; if (len < sizeof(e)) @@ -394,22 +394,31 @@ static long bch2_ioctl_usage(struct bch_fs *c, } { - struct bch_fs_usage src = bch2_fs_usage_read(c); + struct bch_fs_usage *src; struct bch_ioctl_fs_usage dst = { .capacity = c->capacity, - .used = bch2_fs_sectors_used(c, src), - .online_reserved = src.online_reserved, }; + src = bch2_fs_usage_read(c); + if (!src) + return -ENOMEM; + + dst.used = bch2_fs_sectors_used(c, src); + dst.online_reserved = src->online_reserved; + + percpu_up_read(&c->mark_lock); + for (i = 0; i < BCH_REPLICAS_MAX; i++) { dst.persistent_reserved[i] = - src.s[i].persistent_reserved; - - for (j = 0; j < S_ALLOC_NR; j++) - dst.sectors[s_alloc_to_data_type(j)][i] = - src.s[i].data[j]; + src->persistent_reserved[i]; +#if 0 + for (j = 0; j < BCH_DATA_NR; j++) + dst.sectors[j][i] = src.replicas[i].data[j]; +#endif } + kfree(src); + ret = copy_to_user(&user_arg->fs, &dst, sizeof(dst)); if (ret) return ret; diff --git a/fs/bcachefs/chardev.h b/fs/bcachefs/chardev.h index c3057b07523c..3a4890d39ff9 100644 --- a/fs/bcachefs/chardev.h +++ b/fs/bcachefs/chardev.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _BCACHEFS_CHARDEV_H #define _BCACHEFS_CHARDEV_H diff --git a/fs/bcachefs/checksum.c b/fs/bcachefs/checksum.c index 28d086bc0e61..ee90f3402c36 100644 --- a/fs/bcachefs/checksum.c +++ b/fs/bcachefs/checksum.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" #include "checksum.h" #include "super.h" @@ -9,133 +10,11 @@ #include <linux/random.h> #include <linux/scatterlist.h> #include <crypto/algapi.h> -#include <crypto/chacha20.h> +#include <crypto/chacha.h> #include <crypto/hash.h> #include <crypto/poly1305.h> #include <keys/user-type.h> -/* - * Portions Copyright (c) 1996-2001, PostgreSQL Global Development Group (Any - * use permitted, subject to terms of PostgreSQL license; see.) - - * If we have a 64-bit integer type, then a 64-bit CRC looks just like the - * usual sort of implementation. (See Ross Williams' excellent introduction - * A PAINLESS GUIDE TO CRC ERROR DETECTION ALGORITHMS, available from - * ftp://ftp.rocksoft.com/papers/crc_v3.txt or several other net sites.) - * If we have no working 64-bit type, then fake it with two 32-bit registers. - * - * The present implementation is a normal (not "reflected", in Williams' - * terms) 64-bit CRC, using initial all-ones register contents and a final - * bit inversion. 
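
Editor's note on the CRC64 removal beginning here: the comment and 256-entry lookup table deleted just below implement a byte-at-a-time, MSB-first ("normal") CRC with the ECMA-182 generator 0x42F0E1EBA9EA3693 — which, for a non-reflected CRC, is exactly table entry [1] — presumably dropped in favour of a shared kernel implementation. For reference, a sketch of how such a table is derived and consumed, equivalent to the removed bch2_crc64_update(); per the deleted comment's convention, callers start from all-ones and invert the final result:

#include <stddef.h>
#include <stdint.h>

#define CRC64_ECMA182_POLY 0x42F0E1EBA9EA3693ULL

static uint64_t crc64_table[256];

/* Derive the byte-at-a-time table: entry i is the CRC of the single
 * byte i, computed bit by bit (MSB first, no reflection). */
static void crc64_build_table(void)
{
	int i, bit;

	for (i = 0; i < 256; i++) {
		uint64_t crc = (uint64_t) i << 56;

		for (bit = 0; bit < 8; bit++)
			crc = (crc & (1ULL << 63))
				? (crc << 1) ^ CRC64_ECMA182_POLY
				: crc << 1;

		crc64_table[i] = crc;	/* crc64_table[1] == the generator */
	}
}

/* One table lookup per byte; equivalent to the removed bch2_crc64_update() */
static uint64_t crc64_update(uint64_t crc, const void *_data, size_t len)
{
	const unsigned char *data = _data;

	while (len--)
		crc = crc64_table[((crc >> 56) ^ *data++) & 0xFF] ^ (crc << 8);

	return crc;
}

/* Full checksum, per the removed comment's convention:
 *	crc64_update(~0ULL, buf, len) ^ ~0ULL
 */
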
The chosen polynomial is borrowed from the DLT1 spec - * (ECMA-182, available from http://www.ecma.ch/ecma1/STAND/ECMA-182.HTM): - * - * x^64 + x^62 + x^57 + x^55 + x^54 + x^53 + x^52 + x^47 + x^46 + x^45 + - * x^40 + x^39 + x^38 + x^37 + x^35 + x^33 + x^32 + x^31 + x^29 + x^27 + - * x^24 + x^23 + x^22 + x^21 + x^19 + x^17 + x^13 + x^12 + x^10 + x^9 + - * x^7 + x^4 + x + 1 -*/ - -static const u64 crc_table[256] = { - 0x0000000000000000ULL, 0x42F0E1EBA9EA3693ULL, 0x85E1C3D753D46D26ULL, - 0xC711223CFA3E5BB5ULL, 0x493366450E42ECDFULL, 0x0BC387AEA7A8DA4CULL, - 0xCCD2A5925D9681F9ULL, 0x8E224479F47CB76AULL, 0x9266CC8A1C85D9BEULL, - 0xD0962D61B56FEF2DULL, 0x17870F5D4F51B498ULL, 0x5577EEB6E6BB820BULL, - 0xDB55AACF12C73561ULL, 0x99A54B24BB2D03F2ULL, 0x5EB4691841135847ULL, - 0x1C4488F3E8F96ED4ULL, 0x663D78FF90E185EFULL, 0x24CD9914390BB37CULL, - 0xE3DCBB28C335E8C9ULL, 0xA12C5AC36ADFDE5AULL, 0x2F0E1EBA9EA36930ULL, - 0x6DFEFF5137495FA3ULL, 0xAAEFDD6DCD770416ULL, 0xE81F3C86649D3285ULL, - 0xF45BB4758C645C51ULL, 0xB6AB559E258E6AC2ULL, 0x71BA77A2DFB03177ULL, - 0x334A9649765A07E4ULL, 0xBD68D2308226B08EULL, 0xFF9833DB2BCC861DULL, - 0x388911E7D1F2DDA8ULL, 0x7A79F00C7818EB3BULL, 0xCC7AF1FF21C30BDEULL, - 0x8E8A101488293D4DULL, 0x499B3228721766F8ULL, 0x0B6BD3C3DBFD506BULL, - 0x854997BA2F81E701ULL, 0xC7B97651866BD192ULL, 0x00A8546D7C558A27ULL, - 0x4258B586D5BFBCB4ULL, 0x5E1C3D753D46D260ULL, 0x1CECDC9E94ACE4F3ULL, - 0xDBFDFEA26E92BF46ULL, 0x990D1F49C77889D5ULL, 0x172F5B3033043EBFULL, - 0x55DFBADB9AEE082CULL, 0x92CE98E760D05399ULL, 0xD03E790CC93A650AULL, - 0xAA478900B1228E31ULL, 0xE8B768EB18C8B8A2ULL, 0x2FA64AD7E2F6E317ULL, - 0x6D56AB3C4B1CD584ULL, 0xE374EF45BF6062EEULL, 0xA1840EAE168A547DULL, - 0x66952C92ECB40FC8ULL, 0x2465CD79455E395BULL, 0x3821458AADA7578FULL, - 0x7AD1A461044D611CULL, 0xBDC0865DFE733AA9ULL, 0xFF3067B657990C3AULL, - 0x711223CFA3E5BB50ULL, 0x33E2C2240A0F8DC3ULL, 0xF4F3E018F031D676ULL, - 0xB60301F359DBE0E5ULL, 0xDA050215EA6C212FULL, 0x98F5E3FE438617BCULL, - 0x5FE4C1C2B9B84C09ULL, 0x1D14202910527A9AULL, 0x93366450E42ECDF0ULL, - 0xD1C685BB4DC4FB63ULL, 0x16D7A787B7FAA0D6ULL, 0x5427466C1E109645ULL, - 0x4863CE9FF6E9F891ULL, 0x0A932F745F03CE02ULL, 0xCD820D48A53D95B7ULL, - 0x8F72ECA30CD7A324ULL, 0x0150A8DAF8AB144EULL, 0x43A04931514122DDULL, - 0x84B16B0DAB7F7968ULL, 0xC6418AE602954FFBULL, 0xBC387AEA7A8DA4C0ULL, - 0xFEC89B01D3679253ULL, 0x39D9B93D2959C9E6ULL, 0x7B2958D680B3FF75ULL, - 0xF50B1CAF74CF481FULL, 0xB7FBFD44DD257E8CULL, 0x70EADF78271B2539ULL, - 0x321A3E938EF113AAULL, 0x2E5EB66066087D7EULL, 0x6CAE578BCFE24BEDULL, - 0xABBF75B735DC1058ULL, 0xE94F945C9C3626CBULL, 0x676DD025684A91A1ULL, - 0x259D31CEC1A0A732ULL, 0xE28C13F23B9EFC87ULL, 0xA07CF2199274CA14ULL, - 0x167FF3EACBAF2AF1ULL, 0x548F120162451C62ULL, 0x939E303D987B47D7ULL, - 0xD16ED1D631917144ULL, 0x5F4C95AFC5EDC62EULL, 0x1DBC74446C07F0BDULL, - 0xDAAD56789639AB08ULL, 0x985DB7933FD39D9BULL, 0x84193F60D72AF34FULL, - 0xC6E9DE8B7EC0C5DCULL, 0x01F8FCB784FE9E69ULL, 0x43081D5C2D14A8FAULL, - 0xCD2A5925D9681F90ULL, 0x8FDAB8CE70822903ULL, 0x48CB9AF28ABC72B6ULL, - 0x0A3B7B1923564425ULL, 0x70428B155B4EAF1EULL, 0x32B26AFEF2A4998DULL, - 0xF5A348C2089AC238ULL, 0xB753A929A170F4ABULL, 0x3971ED50550C43C1ULL, - 0x7B810CBBFCE67552ULL, 0xBC902E8706D82EE7ULL, 0xFE60CF6CAF321874ULL, - 0xE224479F47CB76A0ULL, 0xA0D4A674EE214033ULL, 0x67C58448141F1B86ULL, - 0x253565A3BDF52D15ULL, 0xAB1721DA49899A7FULL, 0xE9E7C031E063ACECULL, - 0x2EF6E20D1A5DF759ULL, 0x6C0603E6B3B7C1CAULL, 0xF6FAE5C07D3274CDULL, - 0xB40A042BD4D8425EULL, 0x731B26172EE619EBULL, 0x31EBC7FC870C2F78ULL, - 
0xBFC9838573709812ULL, 0xFD39626EDA9AAE81ULL, 0x3A28405220A4F534ULL, - 0x78D8A1B9894EC3A7ULL, 0x649C294A61B7AD73ULL, 0x266CC8A1C85D9BE0ULL, - 0xE17DEA9D3263C055ULL, 0xA38D0B769B89F6C6ULL, 0x2DAF4F0F6FF541ACULL, - 0x6F5FAEE4C61F773FULL, 0xA84E8CD83C212C8AULL, 0xEABE6D3395CB1A19ULL, - 0x90C79D3FEDD3F122ULL, 0xD2377CD44439C7B1ULL, 0x15265EE8BE079C04ULL, - 0x57D6BF0317EDAA97ULL, 0xD9F4FB7AE3911DFDULL, 0x9B041A914A7B2B6EULL, - 0x5C1538ADB04570DBULL, 0x1EE5D94619AF4648ULL, 0x02A151B5F156289CULL, - 0x4051B05E58BC1E0FULL, 0x87409262A28245BAULL, 0xC5B073890B687329ULL, - 0x4B9237F0FF14C443ULL, 0x0962D61B56FEF2D0ULL, 0xCE73F427ACC0A965ULL, - 0x8C8315CC052A9FF6ULL, 0x3A80143F5CF17F13ULL, 0x7870F5D4F51B4980ULL, - 0xBF61D7E80F251235ULL, 0xFD913603A6CF24A6ULL, 0x73B3727A52B393CCULL, - 0x31439391FB59A55FULL, 0xF652B1AD0167FEEAULL, 0xB4A25046A88DC879ULL, - 0xA8E6D8B54074A6ADULL, 0xEA16395EE99E903EULL, 0x2D071B6213A0CB8BULL, - 0x6FF7FA89BA4AFD18ULL, 0xE1D5BEF04E364A72ULL, 0xA3255F1BE7DC7CE1ULL, - 0x64347D271DE22754ULL, 0x26C49CCCB40811C7ULL, 0x5CBD6CC0CC10FAFCULL, - 0x1E4D8D2B65FACC6FULL, 0xD95CAF179FC497DAULL, 0x9BAC4EFC362EA149ULL, - 0x158E0A85C2521623ULL, 0x577EEB6E6BB820B0ULL, 0x906FC95291867B05ULL, - 0xD29F28B9386C4D96ULL, 0xCEDBA04AD0952342ULL, 0x8C2B41A1797F15D1ULL, - 0x4B3A639D83414E64ULL, 0x09CA82762AAB78F7ULL, 0x87E8C60FDED7CF9DULL, - 0xC51827E4773DF90EULL, 0x020905D88D03A2BBULL, 0x40F9E43324E99428ULL, - 0x2CFFE7D5975E55E2ULL, 0x6E0F063E3EB46371ULL, 0xA91E2402C48A38C4ULL, - 0xEBEEC5E96D600E57ULL, 0x65CC8190991CB93DULL, 0x273C607B30F68FAEULL, - 0xE02D4247CAC8D41BULL, 0xA2DDA3AC6322E288ULL, 0xBE992B5F8BDB8C5CULL, - 0xFC69CAB42231BACFULL, 0x3B78E888D80FE17AULL, 0x7988096371E5D7E9ULL, - 0xF7AA4D1A85996083ULL, 0xB55AACF12C735610ULL, 0x724B8ECDD64D0DA5ULL, - 0x30BB6F267FA73B36ULL, 0x4AC29F2A07BFD00DULL, 0x08327EC1AE55E69EULL, - 0xCF235CFD546BBD2BULL, 0x8DD3BD16FD818BB8ULL, 0x03F1F96F09FD3CD2ULL, - 0x41011884A0170A41ULL, 0x86103AB85A2951F4ULL, 0xC4E0DB53F3C36767ULL, - 0xD8A453A01B3A09B3ULL, 0x9A54B24BB2D03F20ULL, 0x5D45907748EE6495ULL, - 0x1FB5719CE1045206ULL, 0x919735E51578E56CULL, 0xD367D40EBC92D3FFULL, - 0x1476F63246AC884AULL, 0x568617D9EF46BED9ULL, 0xE085162AB69D5E3CULL, - 0xA275F7C11F7768AFULL, 0x6564D5FDE549331AULL, 0x279434164CA30589ULL, - 0xA9B6706FB8DFB2E3ULL, 0xEB46918411358470ULL, 0x2C57B3B8EB0BDFC5ULL, - 0x6EA7525342E1E956ULL, 0x72E3DAA0AA188782ULL, 0x30133B4B03F2B111ULL, - 0xF7021977F9CCEAA4ULL, 0xB5F2F89C5026DC37ULL, 0x3BD0BCE5A45A6B5DULL, - 0x79205D0E0DB05DCEULL, 0xBE317F32F78E067BULL, 0xFCC19ED95E6430E8ULL, - 0x86B86ED5267CDBD3ULL, 0xC4488F3E8F96ED40ULL, 0x0359AD0275A8B6F5ULL, - 0x41A94CE9DC428066ULL, 0xCF8B0890283E370CULL, 0x8D7BE97B81D4019FULL, - 0x4A6ACB477BEA5A2AULL, 0x089A2AACD2006CB9ULL, 0x14DEA25F3AF9026DULL, - 0x562E43B4931334FEULL, 0x913F6188692D6F4BULL, 0xD3CF8063C0C759D8ULL, - 0x5DEDC41A34BBEEB2ULL, 0x1F1D25F19D51D821ULL, 0xD80C07CD676F8394ULL, - 0x9AFCE626CE85B507ULL, -}; - -u64 bch2_crc64_update(u64 crc, const void *_data, size_t len) -{ - const unsigned char *data = _data; - - while (len--) { - int i = ((int) (crc >> 56) ^ *data++) & 0xFF; - crc = crc_table[i] ^ (crc << 8); - } - - return crc; -} - static u64 bch2_checksum_init(unsigned type) { switch (type) { @@ -188,21 +67,21 @@ static u64 bch2_checksum_update(unsigned type, u64 crc, const void *data, size_t } } -static inline void do_encrypt_sg(struct crypto_skcipher *tfm, +static inline void do_encrypt_sg(struct crypto_sync_skcipher *tfm, struct nonce nonce, struct scatterlist *sg, size_t len) { - 
SKCIPHER_REQUEST_ON_STACK(req, tfm); + SYNC_SKCIPHER_REQUEST_ON_STACK(req, tfm); int ret; - skcipher_request_set_tfm(req, tfm); + skcipher_request_set_sync_tfm(req, tfm); skcipher_request_set_crypt(req, sg, sg, len, nonce.d); ret = crypto_skcipher_encrypt(req); BUG_ON(ret); } -static inline void do_encrypt(struct crypto_skcipher *tfm, +static inline void do_encrypt(struct crypto_sync_skcipher *tfm, struct nonce nonce, void *buf, size_t len) { @@ -213,10 +92,10 @@ static inline void do_encrypt(struct crypto_skcipher *tfm, } int bch2_chacha_encrypt_key(struct bch_key *key, struct nonce nonce, - void *buf, size_t len) + void *buf, size_t len) { - struct crypto_skcipher *chacha20 = - crypto_alloc_skcipher("chacha20", 0, 0); + struct crypto_sync_skcipher *chacha20 = + crypto_alloc_sync_skcipher("chacha20", 0, 0); int ret; if (!chacha20) { @@ -224,7 +103,8 @@ int bch2_chacha_encrypt_key(struct bch_key *key, struct nonce nonce, return PTR_ERR(chacha20); } - ret = crypto_skcipher_setkey(chacha20, (void *) key, sizeof(*key)); + ret = crypto_skcipher_setkey(&chacha20->base, + (void *) key, sizeof(*key)); if (ret) { pr_err("crypto_skcipher_setkey() error: %i", ret); goto err; @@ -232,7 +112,7 @@ int bch2_chacha_encrypt_key(struct bch_key *key, struct nonce nonce, do_encrypt(chacha20, nonce, buf, len); err: - crypto_free_skcipher(chacha20); + crypto_free_sync_skcipher(chacha20); return ret; } @@ -401,22 +281,8 @@ void bch2_encrypt_bio(struct bch_fs *c, unsigned type, do_encrypt_sg(c->chacha20, nonce, sgl, bytes); } -static inline bool bch2_checksum_mergeable(unsigned type) -{ - - switch (type) { - case BCH_CSUM_NONE: - case BCH_CSUM_CRC32C: - case BCH_CSUM_CRC64: - return true; - default: - return false; - } -} - -static struct bch_csum bch2_checksum_merge(unsigned type, - struct bch_csum a, - struct bch_csum b, size_t b_len) +struct bch_csum bch2_checksum_merge(unsigned type, struct bch_csum a, + struct bch_csum b, size_t b_len) { BUG_ON(!bch2_checksum_mergeable(type)); @@ -597,7 +463,7 @@ err: static int bch2_alloc_ciphers(struct bch_fs *c) { if (!c->chacha20) - c->chacha20 = crypto_alloc_skcipher("chacha20", 0, 0); + c->chacha20 = crypto_alloc_sync_skcipher("chacha20", 0, 0); if (IS_ERR(c->chacha20)) { bch_err(c, "error requesting chacha20 module: %li", PTR_ERR(c->chacha20)); @@ -680,7 +546,7 @@ int bch2_enable_encryption(struct bch_fs *c, bool keyed) goto err; } - ret = crypto_skcipher_setkey(c->chacha20, + ret = crypto_skcipher_setkey(&c->chacha20->base, (void *) &key.key, sizeof(key.key)); if (ret) goto err; @@ -708,7 +574,7 @@ void bch2_fs_encryption_exit(struct bch_fs *c) if (!IS_ERR_OR_NULL(c->poly1305)) crypto_free_shash(c->poly1305); if (!IS_ERR_OR_NULL(c->chacha20)) - crypto_free_skcipher(c->chacha20); + crypto_free_sync_skcipher(c->chacha20); if (!IS_ERR_OR_NULL(c->sha256)) crypto_free_shash(c->sha256); } @@ -740,7 +606,7 @@ int bch2_fs_encryption_init(struct bch_fs *c) if (ret) goto out; - ret = crypto_skcipher_setkey(c->chacha20, + ret = crypto_skcipher_setkey(&c->chacha20->base, (void *) &key.key, sizeof(key.key)); if (ret) goto out; diff --git a/fs/bcachefs/checksum.h b/fs/bcachefs/checksum.h index 2690cc4baeea..afdbbf702970 100644 --- a/fs/bcachefs/checksum.h +++ b/fs/bcachefs/checksum.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _BCACHEFS_CHECKSUM_H #define _BCACHEFS_CHECKSUM_H @@ -5,9 +6,29 @@ #include "extents_types.h" #include "super-io.h" -#include <crypto/chacha20.h> +#include <linux/crc64.h> +#include <crypto/chacha.h> -u64 bch2_crc64_update(u64, const 
void *, size_t); +static inline bool bch2_checksum_mergeable(unsigned type) +{ + + switch (type) { + case BCH_CSUM_NONE: + case BCH_CSUM_CRC32C: + case BCH_CSUM_CRC64: + return true; + default: + return false; + } +} + +struct bch_csum bch2_checksum_merge(unsigned, struct bch_csum, + struct bch_csum, size_t); + +static inline u64 bch2_crc64_update(u64 crc, const void *p, size_t len) +{ + return crc64_be(crc, p, len); +} #define BCH_NONCE_EXTENT cpu_to_le32(1 << 28) #define BCH_NONCE_BTREE cpu_to_le32(2 << 28) @@ -109,14 +130,6 @@ static inline bool bch2_checksum_type_valid(const struct bch_fs *c, return true; } -static const unsigned bch_crc_bytes[] = { - [BCH_CSUM_NONE] = 0, - [BCH_CSUM_CRC32C] = 4, - [BCH_CSUM_CRC64] = 8, - [BCH_CSUM_CHACHA20_POLY1305_80] = 10, - [BCH_CSUM_CHACHA20_POLY1305_128] = 16, -}; - /* returns true if not equal */ static inline bool bch2_crc_cmp(struct bch_csum l, struct bch_csum r) { @@ -130,9 +143,9 @@ static inline bool bch2_crc_cmp(struct bch_csum l, struct bch_csum r) /* for skipping ahead and encrypting/decrypting at an offset: */ static inline struct nonce nonce_add(struct nonce nonce, unsigned offset) { - EBUG_ON(offset & (CHACHA20_BLOCK_SIZE - 1)); + EBUG_ON(offset & (CHACHA_BLOCK_SIZE - 1)); - le32_add_cpu(&nonce.d[0], offset / CHACHA20_BLOCK_SIZE); + le32_add_cpu(&nonce.d[0], offset / CHACHA_BLOCK_SIZE); return nonce; } diff --git a/fs/bcachefs/clock.c b/fs/bcachefs/clock.c index c67376f96f5a..8ac6990c6971 100644 --- a/fs/bcachefs/clock.c +++ b/fs/bcachefs/clock.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" #include "clock.h" @@ -21,7 +22,7 @@ void bch2_io_timer_add(struct io_clock *clock, struct io_timer *timer) if (clock->timers.data[i] == timer) goto out; - BUG_ON(!heap_add(&clock->timers, timer, io_timer_cmp)); + BUG_ON(!heap_add(&clock->timers, timer, io_timer_cmp, NULL)); out: spin_unlock(&clock->timer_lock); } @@ -34,7 +35,7 @@ void bch2_io_timer_del(struct io_clock *clock, struct io_timer *timer) for (i = 0; i < clock->timers.used; i++) if (clock->timers.data[i] == timer) { - heap_del(&clock->timers, i, io_timer_cmp); + heap_del(&clock->timers, i, io_timer_cmp, NULL); break; } @@ -127,7 +128,7 @@ static struct io_timer *get_expired_timer(struct io_clock *clock, if (clock->timers.used && time_after_eq(now, clock->timers.data[0]->expire)) - heap_pop(&clock->timers, ret, io_timer_cmp); + heap_pop(&clock->timers, ret, io_timer_cmp, NULL); spin_unlock(&clock->timer_lock); diff --git a/fs/bcachefs/clock.h b/fs/bcachefs/clock.h index 1e2a7dea4ddd..5cb043c579d8 100644 --- a/fs/bcachefs/clock.h +++ b/fs/bcachefs/clock.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _BCACHEFS_CLOCK_H #define _BCACHEFS_CLOCK_H diff --git a/fs/bcachefs/clock_types.h b/fs/bcachefs/clock_types.h index df404b6dd3fe..2b5e499e12b4 100644 --- a/fs/bcachefs/clock_types.h +++ b/fs/bcachefs/clock_types.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _BCACHEFS_CLOCK_TYPES_H #define _BCACHEFS_CLOCK_TYPES_H diff --git a/fs/bcachefs/compress.c b/fs/bcachefs/compress.c index 6379905bad7b..a7264d802ed7 100644 --- a/fs/bcachefs/compress.c +++ b/fs/bcachefs/compress.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" #include "checksum.h" #include "compress.h" @@ -5,7 +6,6 @@ #include "io.h" #include "super-io.h" -#include "lz4.h" #include <linux/lz4.h> #include <linux/zlib.h> #include <linux/zstd.h> @@ -159,11 +159,6 @@ static int __bio_uncompress(struct bch_fs *c, struct bio *src, switch 
(crc.compression_type) { case BCH_COMPRESSION_LZ4_OLD: - ret = bch2_lz4_decompress(src_data.b, &src_len, - dst_data, dst_len); - if (ret) - goto err; - break; case BCH_COMPRESSION_LZ4: ret = LZ4_decompress_safe_partial(src_data.b, dst_data, src_len, dst_len, dst_len); @@ -601,11 +596,13 @@ have_compressed: goto out; } - ret = mempool_init_kmalloc_pool( - &c->decompress_workspace, - 1, decompress_workspace_size); - if (ret) - goto out; + if (!mempool_initialized(&c->decompress_workspace)) { + ret = mempool_init_kmalloc_pool( + &c->decompress_workspace, + 1, decompress_workspace_size); + if (ret) + goto out; + } out: pr_verbose_init(c->opts, "ret %i", ret); return ret; diff --git a/fs/bcachefs/compress.h b/fs/bcachefs/compress.h index 06fff6a57f94..4bab1f61b3b5 100644 --- a/fs/bcachefs/compress.h +++ b/fs/bcachefs/compress.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _BCACHEFS_COMPRESS_H #define _BCACHEFS_COMPRESS_H diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c index 71f649bc4c7f..c758982bc1af 100644 --- a/fs/bcachefs/debug.c +++ b/fs/bcachefs/debug.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Assorted bcachefs debug code * @@ -35,7 +36,7 @@ void __bch2_btree_verify(struct bch_fs *c, struct btree *b) struct btree *v = c->verify_data; struct btree_node *n_ondisk, *n_sorted, *n_inmemory; struct bset *sorted, *inmemory; - struct extent_pick_ptr pick; + struct extent_ptr_decoded pick; struct bch_dev *ca; struct bio *bio; @@ -55,14 +56,17 @@ void __bch2_btree_verify(struct bch_fs *c, struct btree *b) v->btree_id = b->btree_id; bch2_btree_keys_init(v, &c->expensive_debug_checks); - if (bch2_btree_pick_ptr(c, b, NULL, &pick) <= 0) + if (bch2_bkey_pick_read_device(c, bkey_i_to_s_c(&b->key), + NULL, &pick) <= 0) return; ca = bch_dev_bkey_exists(c, pick.ptr.dev); if (!bch2_dev_get_ioref(ca, READ)) return; - bio = bio_alloc_bioset(GFP_NOIO, btree_pages(c), &c->btree_bio); + bio = bio_alloc_bioset(GFP_NOIO, + buf_pages(n_sorted, btree_bytes(c)), + &c->btree_bio); bio_set_dev(bio, ca->disk_sb.bdev); bio->bi_opf = REQ_OP_READ|REQ_META; bio->bi_iter.bi_sector = pick.ptr.offset; @@ -201,7 +205,8 @@ static ssize_t bch2_read_btree(struct file *file, char __user *buf, size_t size, loff_t *ppos) { struct dump_iter *i = file->private_data; - struct btree_iter iter; + struct btree_trans trans; + struct btree_iter *iter; struct bkey_s_c k; int err; @@ -216,19 +221,20 @@ static ssize_t bch2_read_btree(struct file *file, char __user *buf, if (!i->size) return i->ret; - bch2_btree_iter_init(&iter, i->c, i->id, i->from, BTREE_ITER_PREFETCH); - k = bch2_btree_iter_peek(&iter); + bch2_trans_init(&trans, i->c, 0, 0); - while (k.k && !(err = btree_iter_err(k))) { - bch2_bkey_val_to_text(i->c, bkey_type(0, i->id), - i->buf, sizeof(i->buf), k); + iter = bch2_trans_get_iter(&trans, i->id, i->from, BTREE_ITER_PREFETCH); + k = bch2_btree_iter_peek(iter); + + while (k.k && !(err = bkey_err(k))) { + bch2_bkey_val_to_text(&PBUF(i->buf), i->c, k); i->bytes = strlen(i->buf); BUG_ON(i->bytes >= PAGE_SIZE); i->buf[i->bytes] = '\n'; i->bytes++; - k = bch2_btree_iter_next(&iter); - i->from = iter.pos; + k = bch2_btree_iter_next(iter); + i->from = iter->pos; err = flush_buf(i); if (err) @@ -237,7 +243,7 @@ static ssize_t bch2_read_btree(struct file *file, char __user *buf, if (!i->size) break; } - bch2_btree_iter_unlock(&iter); + bch2_trans_exit(&trans); return err < 0 ? 
err : i->ret; } @@ -253,7 +259,8 @@ static ssize_t bch2_read_btree_formats(struct file *file, char __user *buf, size_t size, loff_t *ppos) { struct dump_iter *i = file->private_data; - struct btree_iter iter; + struct btree_trans trans; + struct btree_iter *iter; struct btree *b; int err; @@ -268,9 +275,11 @@ static ssize_t bch2_read_btree_formats(struct file *file, char __user *buf, if (!i->size || !bkey_cmp(POS_MAX, i->from)) return i->ret; - for_each_btree_node(&iter, i->c, i->id, i->from, 0, b) { - i->bytes = bch2_print_btree_node(i->c, b, i->buf, - sizeof(i->buf)); + bch2_trans_init(&trans, i->c, 0, 0); + + for_each_btree_node(&trans, iter, i->id, i->from, 0, b) { + bch2_btree_node_to_text(&PBUF(i->buf), i->c, b); + i->bytes = strlen(i->buf); err = flush_buf(i); if (err) break; @@ -286,7 +295,7 @@ static ssize_t bch2_read_btree_formats(struct file *file, char __user *buf, if (!i->size) break; } - bch2_btree_iter_unlock(&iter); + bch2_trans_exit(&trans); return err < 0 ? err : i->ret; } @@ -302,7 +311,8 @@ static ssize_t bch2_read_bfloat_failed(struct file *file, char __user *buf, size_t size, loff_t *ppos) { struct dump_iter *i = file->private_data; - struct btree_iter iter; + struct btree_trans trans; + struct btree_iter *iter; struct bkey_s_c k; struct btree *prev_node = NULL; int err; @@ -318,32 +328,33 @@ static ssize_t bch2_read_bfloat_failed(struct file *file, char __user *buf, if (!i->size) return i->ret; - bch2_btree_iter_init(&iter, i->c, i->id, i->from, BTREE_ITER_PREFETCH); + bch2_trans_init(&trans, i->c, 0, 0); - while ((k = bch2_btree_iter_peek(&iter)).k && - !(err = btree_iter_err(k))) { - struct btree_iter_level *l = &iter.l[0]; + iter = bch2_trans_get_iter(&trans, i->id, i->from, BTREE_ITER_PREFETCH); + + while ((k = bch2_btree_iter_peek(iter)).k && + !(err = bkey_err(k))) { + struct btree_iter_level *l = &iter->l[0]; struct bkey_packed *_k = bch2_btree_node_iter_peek(&l->iter, l->b); if (l->b != prev_node) { - i->bytes = bch2_print_btree_node(i->c, l->b, i->buf, - sizeof(i->buf)); + bch2_btree_node_to_text(&PBUF(i->buf), i->c, l->b); + i->bytes = strlen(i->buf); err = flush_buf(i); if (err) break; } prev_node = l->b; - i->bytes = bch2_bkey_print_bfloat(l->b, _k, i->buf, - sizeof(i->buf)); - + bch2_bfloat_to_text(&PBUF(i->buf), l->b, _k); + i->bytes = strlen(i->buf); err = flush_buf(i); if (err) break; - bch2_btree_iter_next(&iter); - i->from = iter.pos; + bch2_btree_iter_next(iter); + i->from = iter->pos; err = flush_buf(i); if (err) @@ -352,7 +363,7 @@ static ssize_t bch2_read_bfloat_failed(struct file *file, char __user *buf, if (!i->size) break; } - bch2_btree_iter_unlock(&iter); + bch2_trans_exit(&trans); return err < 0 ? 
err : i->ret; } diff --git a/fs/bcachefs/debug.h b/fs/bcachefs/debug.h index b5de1a7072d4..56c2d1ab5f63 100644 --- a/fs/bcachefs/debug.h +++ b/fs/bcachefs/debug.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _BCACHEFS_DEBUG_H #define _BCACHEFS_DEBUG_H diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c index df9913f8967b..1442dacef0de 100644 --- a/fs/bcachefs/dirent.c +++ b/fs/bcachefs/dirent.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" #include "bkey_methods.h" @@ -12,17 +13,10 @@ unsigned bch2_dirent_name_bytes(struct bkey_s_c_dirent d) { - unsigned len = bkey_val_bytes(d.k) - sizeof(struct bch_dirent); + unsigned len = bkey_val_bytes(d.k) - + offsetof(struct bch_dirent, d_name); - while (len && !d.v->d_name[len - 1]) - --len; - - return len; -} - -static unsigned dirent_val_u64s(unsigned len) -{ - return DIV_ROUND_UP(sizeof(struct bch_dirent) + len, sizeof(u64)); + return strnlen(d.v->d_name, len); } static u64 bch2_dirent_hash(const struct bch_hash_info *info, @@ -71,8 +65,7 @@ static bool dirent_cmp_bkey(struct bkey_s_c _l, struct bkey_s_c _r) const struct bch_hash_desc bch2_dirent_hash_desc = { .btree_id = BTREE_ID_DIRENTS, - .key_type = BCH_DIRENT, - .whiteout_type = BCH_DIRENT_WHITEOUT, + .key_type = KEY_TYPE_dirent, .hash_key = dirent_hash_key, .hash_bkey = dirent_hash_bkey, .cmp_key = dirent_cmp_key, @@ -81,69 +74,53 @@ const struct bch_hash_desc bch2_dirent_hash_desc = { const char *bch2_dirent_invalid(const struct bch_fs *c, struct bkey_s_c k) { - struct bkey_s_c_dirent d; + struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k); unsigned len; - switch (k.k->type) { - case BCH_DIRENT: - if (bkey_val_bytes(k.k) < sizeof(struct bch_dirent)) - return "value too small"; + if (bkey_val_bytes(k.k) < sizeof(struct bch_dirent)) + return "value too small"; - d = bkey_s_c_to_dirent(k); - len = bch2_dirent_name_bytes(d); + len = bch2_dirent_name_bytes(d); + if (!len) + return "empty name"; - if (!len) - return "empty name"; - - if (bkey_val_u64s(k.k) > dirent_val_u64s(len)) - return "value too big"; - - if (len > NAME_MAX) - return "dirent name too big"; - - if (memchr(d.v->d_name, '/', len)) - return "dirent name has invalid characters"; + /* + * older versions of bcachefs were buggy and creating dirent + * keys that were bigger than necessary: + */ + if (bkey_val_u64s(k.k) > dirent_val_u64s(len + 7)) + return "value too big"; - return NULL; - case BCH_DIRENT_WHITEOUT: - return bkey_val_bytes(k.k) != 0 - ? 
"value size should be zero" - : NULL; + if (len > BCH_NAME_MAX) + return "dirent name too big"; - default: - return "invalid type"; - } + return NULL; } -void bch2_dirent_to_text(struct bch_fs *c, char *buf, - size_t size, struct bkey_s_c k) +void bch2_dirent_to_text(struct printbuf *out, struct bch_fs *c, + struct bkey_s_c k) { - struct bkey_s_c_dirent d; - size_t n = 0; - - switch (k.k->type) { - case BCH_DIRENT: - d = bkey_s_c_to_dirent(k); - - n += bch_scnmemcpy(buf + n, size - n, d.v->d_name, - bch2_dirent_name_bytes(d)); - n += scnprintf(buf + n, size - n, " -> %llu", d.v->d_inum); - break; - case BCH_DIRENT_WHITEOUT: - scnprintf(buf, size, "whiteout"); - break; - } + struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k); + + bch_scnmemcpy(out, d.v->d_name, + bch2_dirent_name_bytes(d)); + pr_buf(out, " -> %llu", d.v->d_inum); } -static struct bkey_i_dirent *dirent_create_key(u8 type, - const struct qstr *name, u64 dst) +static struct bkey_i_dirent *dirent_create_key(struct btree_trans *trans, + u8 type, const struct qstr *name, u64 dst) { struct bkey_i_dirent *dirent; unsigned u64s = BKEY_U64s + dirent_val_u64s(name->len); - dirent = kmalloc(u64s * sizeof(u64), GFP_NOFS); - if (!dirent) - return NULL; + if (name->len > BCH_NAME_MAX) + return ERR_PTR(-ENAMETOOLONG); + + BUG_ON(u64s > U8_MAX); + + dirent = bch2_trans_kmalloc(trans, u64s * sizeof(u64)); + if (IS_ERR(dirent)) + return dirent; bkey_dirent_init(&dirent->k_i); dirent->k.u64s = u64s; @@ -153,30 +130,39 @@ static struct bkey_i_dirent *dirent_create_key(u8 type, memcpy(dirent->v.d_name, name->name, name->len); memset(dirent->v.d_name + name->len, 0, bkey_val_bytes(&dirent->k) - - (sizeof(struct bch_dirent) + name->len)); + offsetof(struct bch_dirent, d_name) - + name->len); EBUG_ON(bch2_dirent_name_bytes(dirent_i_to_s_c(dirent)) != name->len); return dirent; } -int bch2_dirent_create(struct bch_fs *c, u64 dir_inum, - const struct bch_hash_info *hash_info, - u8 type, const struct qstr *name, u64 dst_inum, - u64 *journal_seq, int flags) +int __bch2_dirent_create(struct btree_trans *trans, + u64 dir_inum, const struct bch_hash_info *hash_info, + u8 type, const struct qstr *name, u64 dst_inum, + int flags) { struct bkey_i_dirent *dirent; int ret; - dirent = dirent_create_key(type, name, dst_inum); - if (!dirent) - return -ENOMEM; + dirent = dirent_create_key(trans, type, name, dst_inum); + ret = PTR_ERR_OR_ZERO(dirent); + if (ret) + return ret; - ret = bch2_hash_set(bch2_dirent_hash_desc, hash_info, c, dir_inum, - journal_seq, &dirent->k_i, flags); - kfree(dirent); + return bch2_hash_set(trans, bch2_dirent_hash_desc, hash_info, + dir_inum, &dirent->k_i, flags); +} - return ret; +int bch2_dirent_create(struct bch_fs *c, u64 dir_inum, + const struct bch_hash_info *hash_info, + u8 type, const struct qstr *name, u64 dst_inum, + u64 *journal_seq, int flags) +{ + return bch2_trans_do(c, journal_seq, flags, + __bch2_dirent_create(&trans, dir_inum, hash_info, + type, name, dst_inum, flags)); } static void dirent_copy_target(struct bkey_i_dirent *dst, @@ -192,147 +178,119 @@ static struct bpos bch2_dirent_pos(struct bch_inode_info *inode, return POS(inode->v.i_ino, bch2_dirent_hash(&inode->ei_str_hash, name)); } -int bch2_dirent_rename(struct bch_fs *c, +int bch2_dirent_rename(struct btree_trans *trans, struct bch_inode_info *src_dir, const struct qstr *src_name, struct bch_inode_info *dst_dir, const struct qstr *dst_name, - u64 *journal_seq, enum bch_rename_mode mode) + enum bch_rename_mode mode) { - struct btree_iter src_iter, dst_iter, 
whiteout_iter; + struct btree_iter *src_iter, *dst_iter; struct bkey_s_c old_src, old_dst; - struct bkey delete; struct bkey_i_dirent *new_src = NULL, *new_dst = NULL; - struct bpos src_pos = bch2_dirent_pos(src_dir, src_name); struct bpos dst_pos = bch2_dirent_pos(dst_dir, dst_name); - bool need_whiteout; - int ret = -ENOMEM; - - bch2_btree_iter_init(&src_iter, c, BTREE_ID_DIRENTS, src_pos, - BTREE_ITER_SLOTS|BTREE_ITER_INTENT); - bch2_btree_iter_init(&dst_iter, c, BTREE_ID_DIRENTS, dst_pos, - BTREE_ITER_SLOTS|BTREE_ITER_INTENT); - bch2_btree_iter_link(&src_iter, &dst_iter); - - bch2_btree_iter_init(&whiteout_iter, c, BTREE_ID_DIRENTS, src_pos, - BTREE_ITER_SLOTS); - bch2_btree_iter_link(&src_iter, &whiteout_iter); - - if (mode == BCH_RENAME_EXCHANGE) { - new_src = dirent_create_key(0, src_name, 0); - if (!new_src) - goto err; - } else { - new_src = (void *) &delete; - } - - new_dst = dirent_create_key(0, dst_name, 0); - if (!new_dst) - goto err; -retry: - /* - * Note that on -EINTR/dropped locks we're not restarting the lookup - * from the original hashed position (like we do when creating dirents, - * in bch_hash_set) - we never move existing dirents to different slot: - */ - old_src = bch2_hash_lookup_at(bch2_dirent_hash_desc, - &src_dir->ei_str_hash, - &src_iter, src_name); - if ((ret = btree_iter_err(old_src))) - goto err; - - ret = bch2_hash_needs_whiteout(bch2_dirent_hash_desc, - &src_dir->ei_str_hash, - &whiteout_iter, &src_iter); - if (ret < 0) - goto err; - need_whiteout = ret; + int ret; /* + * Lookup dst: + * * Note that in BCH_RENAME mode, we're _not_ checking if * the target already exists - we're relying on the VFS * to do that check for us for correctness: */ - old_dst = mode == BCH_RENAME - ? bch2_hash_hole_at(bch2_dirent_hash_desc, &dst_iter) - : bch2_hash_lookup_at(bch2_dirent_hash_desc, - &dst_dir->ei_str_hash, - &dst_iter, dst_name); - if ((ret = btree_iter_err(old_dst))) - goto err; - - switch (mode) { - case BCH_RENAME: + dst_iter = mode == BCH_RENAME + ? 
bch2_hash_hole(trans, bch2_dirent_hash_desc, + &dst_dir->ei_str_hash, + dst_dir->v.i_ino, dst_name) + : bch2_hash_lookup(trans, bch2_dirent_hash_desc, + &dst_dir->ei_str_hash, + dst_dir->v.i_ino, dst_name, + BTREE_ITER_INTENT); + if (IS_ERR(dst_iter)) + return PTR_ERR(dst_iter); + old_dst = bch2_btree_iter_peek_slot(dst_iter); + + /* Lookup src: */ + src_iter = bch2_hash_lookup(trans, bch2_dirent_hash_desc, + &src_dir->ei_str_hash, + src_dir->v.i_ino, src_name, + BTREE_ITER_INTENT); + if (IS_ERR(src_iter)) + return PTR_ERR(src_iter); + old_src = bch2_btree_iter_peek_slot(src_iter); + + /* Create new dst key: */ + new_dst = dirent_create_key(trans, 0, dst_name, 0); + if (IS_ERR(new_dst)) + return PTR_ERR(new_dst); + + dirent_copy_target(new_dst, bkey_s_c_to_dirent(old_src)); + new_dst->k.p = dst_iter->pos; + + /* Create new src key: */ + if (mode == BCH_RENAME_EXCHANGE) { + new_src = dirent_create_key(trans, 0, src_name, 0); + if (IS_ERR(new_src)) + return PTR_ERR(new_src); + + dirent_copy_target(new_src, bkey_s_c_to_dirent(old_dst)); + new_src->k.p = src_iter->pos; + } else { + new_src = bch2_trans_kmalloc(trans, sizeof(struct bkey_i)); + if (IS_ERR(new_src)) + return PTR_ERR(new_src); bkey_init(&new_src->k); - dirent_copy_target(new_dst, bkey_s_c_to_dirent(old_src)); + new_src->k.p = src_iter->pos; - if (bkey_cmp(dst_pos, src_iter.pos) <= 0 && - bkey_cmp(src_iter.pos, dst_iter.pos) < 0) { + if (bkey_cmp(dst_pos, src_iter->pos) <= 0 && + bkey_cmp(src_iter->pos, dst_iter->pos) < 0) { /* - * If we couldn't insert new_dst at its hashed - * position (dst_pos) due to a hash collision, - * and we're going to be deleting in - * between the hashed position and first empty - * slot we found - just overwrite the pos we - * were going to delete: - * - * Note: this is a correctness issue, in this - * situation bch2_hash_needs_whiteout() could - * return false when the whiteout would have - * been needed if we inserted at the pos - * __dirent_find_hole() found + * We have a hash collision for the new dst key, + * and new_src - the key we're deleting - is between + * new_dst's hashed slot and the slot we're going to be + * inserting it into - oops. 
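Hash lookups probe linearly from the name's hashed slot and stop at the first empty slot, so deleting src in between would cut the probe chain to new_dst.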
This will break the hash + * table if we don't deal with it: */ - new_dst->k.p = src_iter.pos; - ret = bch2_btree_insert_at(c, NULL, NULL, - journal_seq, - BTREE_INSERT_ATOMIC, - BTREE_INSERT_ENTRY(&src_iter, + if (mode == BCH_RENAME) { + /* + * If we're not overwriting, we can just insert + * new_dst at the src position: + */ + new_dst->k.p = src_iter->pos; + bch2_trans_update(trans, + BTREE_INSERT_ENTRY(src_iter, &new_dst->k_i)); - goto err; + return 0; + } else { + /* If we're overwriting, we can't insert new_dst + * at a different slot because it has to + * overwrite old_dst - just make sure to use a + * whiteout when deleting src: + */ + new_src->k.type = KEY_TYPE_whiteout; + } + } else { + /* Check if we need a whiteout to delete src: */ + ret = bch2_hash_needs_whiteout(trans, bch2_dirent_hash_desc, + &src_dir->ei_str_hash, + src_iter); + if (ret < 0) + return ret; + + if (ret) + new_src->k.type = KEY_TYPE_whiteout; } - - if (need_whiteout) - new_src->k.type = BCH_DIRENT_WHITEOUT; - break; - case BCH_RENAME_OVERWRITE: - bkey_init(&new_src->k); - dirent_copy_target(new_dst, bkey_s_c_to_dirent(old_src)); - - if (bkey_cmp(dst_pos, src_iter.pos) <= 0 && - bkey_cmp(src_iter.pos, dst_iter.pos) < 0) { - /* - * Same case described above - - * bch_hash_needs_whiteout could spuriously - * return false, but we have to insert at - * dst_iter.pos because we're overwriting - * another dirent: - */ - new_src->k.type = BCH_DIRENT_WHITEOUT; - } else if (need_whiteout) - new_src->k.type = BCH_DIRENT_WHITEOUT; - break; - case BCH_RENAME_EXCHANGE: - dirent_copy_target(new_src, bkey_s_c_to_dirent(old_dst)); - dirent_copy_target(new_dst, bkey_s_c_to_dirent(old_src)); - break; } - new_src->k.p = src_iter.pos; - new_dst->k.p = dst_iter.pos; - ret = bch2_btree_insert_at(c, NULL, NULL, journal_seq, - BTREE_INSERT_ATOMIC, - BTREE_INSERT_ENTRY(&src_iter, &new_src->k_i), - BTREE_INSERT_ENTRY(&dst_iter, &new_dst->k_i)); -err: - if (ret == -EINTR) - goto retry; - - bch2_btree_iter_unlock(&whiteout_iter); - bch2_btree_iter_unlock(&dst_iter); - bch2_btree_iter_unlock(&src_iter); - - if (new_src != (void *) &delete) - kfree(new_src); - kfree(new_dst); - return ret; + bch2_trans_update(trans, BTREE_INSERT_ENTRY(src_iter, &new_src->k_i)); + bch2_trans_update(trans, BTREE_INSERT_ENTRY(dst_iter, &new_dst->k_i)); + return 0; +} + +int __bch2_dirent_delete(struct btree_trans *trans, u64 dir_inum, + const struct bch_hash_info *hash_info, + const struct qstr *name) +{ + return bch2_hash_delete(trans, bch2_dirent_hash_desc, hash_info, + dir_inum, name); } int bch2_dirent_delete(struct bch_fs *c, u64 dir_inum, @@ -340,66 +298,83 @@ int bch2_dirent_delete(struct bch_fs *c, u64 dir_inum, const struct qstr *name, u64 *journal_seq) { - return bch2_hash_delete(bch2_dirent_hash_desc, hash_info, - c, dir_inum, journal_seq, name); + return bch2_trans_do(c, journal_seq, + BTREE_INSERT_ATOMIC| + BTREE_INSERT_NOFAIL, + __bch2_dirent_delete(&trans, dir_inum, hash_info, name)); } u64 bch2_dirent_lookup(struct bch_fs *c, u64 dir_inum, const struct bch_hash_info *hash_info, const struct qstr *name) { - struct btree_iter iter; + struct btree_trans trans; + struct btree_iter *iter; struct bkey_s_c k; - u64 inum; + u64 inum = 0; - k = bch2_hash_lookup(bch2_dirent_hash_desc, hash_info, c, - dir_inum, &iter, name); - if (IS_ERR(k.k)) { - bch2_btree_iter_unlock(&iter); - return 0; + bch2_trans_init(&trans, c, 0, 0); + + iter = bch2_hash_lookup(&trans, bch2_dirent_hash_desc, + hash_info, dir_inum, name, 0); + if (IS_ERR(iter)) { + 
BUG_ON(PTR_ERR(iter) == -EINTR); + goto out; } + k = bch2_btree_iter_peek_slot(iter); inum = le64_to_cpu(bkey_s_c_to_dirent(k).v->d_inum); - bch2_btree_iter_unlock(&iter); - +out: + bch2_trans_exit(&trans); return inum; } -int bch2_empty_dir(struct bch_fs *c, u64 dir_inum) +int bch2_empty_dir_trans(struct btree_trans *trans, u64 dir_inum) { - struct btree_iter iter; + struct btree_iter *iter; struct bkey_s_c k; - int ret = 0; + int ret; - for_each_btree_key(&iter, c, BTREE_ID_DIRENTS, POS(dir_inum, 0), 0, k) { + for_each_btree_key(trans, iter, BTREE_ID_DIRENTS, + POS(dir_inum, 0), 0, k, ret) { if (k.k->p.inode > dir_inum) break; - if (k.k->type == BCH_DIRENT) { + if (k.k->type == KEY_TYPE_dirent) { ret = -ENOTEMPTY; break; } } - bch2_btree_iter_unlock(&iter); + bch2_trans_iter_put(trans, iter); return ret; } +int bch2_empty_dir(struct bch_fs *c, u64 dir_inum) +{ + return bch2_trans_do(c, NULL, 0, + bch2_empty_dir_trans(&trans, dir_inum)); +} + int bch2_readdir(struct bch_fs *c, struct file *file, struct dir_context *ctx) { struct bch_inode_info *inode = file_bch_inode(file); - struct btree_iter iter; + struct btree_trans trans; + struct btree_iter *iter; struct bkey_s_c k; struct bkey_s_c_dirent dirent; unsigned len; + int ret; if (!dir_emit_dots(file, ctx)) return 0; - for_each_btree_key(&iter, c, BTREE_ID_DIRENTS, - POS(inode->v.i_ino, ctx->pos), 0, k) { - if (k.k->type != BCH_DIRENT) + bch2_trans_init(&trans, c, 0, 0); + + for_each_btree_key(&trans, iter, BTREE_ID_DIRENTS, + POS(inode->v.i_ino, ctx->pos), 0, k, ret) { + if (k.k->type != KEY_TYPE_dirent) continue; dirent = bkey_s_c_to_dirent(k); @@ -423,7 +398,7 @@ int bch2_readdir(struct bch_fs *c, struct file *file, ctx->pos = k.k->p.offset + 1; } - bch2_btree_iter_unlock(&iter); + ret = bch2_trans_exit(&trans) ?: ret; - return 0; + return ret; } diff --git a/fs/bcachefs/dirent.h b/fs/bcachefs/dirent.h index 5d066af18f95..bc64718a7832 100644 --- a/fs/bcachefs/dirent.h +++ b/fs/bcachefs/dirent.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _BCACHEFS_DIRENT_H #define _BCACHEFS_DIRENT_H @@ -6,9 +7,9 @@ extern const struct bch_hash_desc bch2_dirent_hash_desc; const char *bch2_dirent_invalid(const struct bch_fs *, struct bkey_s_c); -void bch2_dirent_to_text(struct bch_fs *, char *, size_t, struct bkey_s_c); +void bch2_dirent_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); -#define bch2_bkey_dirent_ops (struct bkey_ops) { \ +#define bch2_bkey_ops_dirent (struct bkey_ops) { \ .key_invalid = bch2_dirent_invalid, \ .val_to_text = bch2_dirent_to_text, \ } @@ -21,8 +22,22 @@ struct bch_hash_info; struct bch_inode_info; unsigned bch2_dirent_name_bytes(struct bkey_s_c_dirent); + +static inline unsigned dirent_val_u64s(unsigned len) +{ + return DIV_ROUND_UP(offsetof(struct bch_dirent, d_name) + len, + sizeof(u64)); +} + +int __bch2_dirent_create(struct btree_trans *, u64, + const struct bch_hash_info *, u8, + const struct qstr *, u64, int); int bch2_dirent_create(struct bch_fs *c, u64, const struct bch_hash_info *, u8, const struct qstr *, u64, u64 *, int); + +int __bch2_dirent_delete(struct btree_trans *, u64, + const struct bch_hash_info *, + const struct qstr *); int bch2_dirent_delete(struct bch_fs *, u64, const struct bch_hash_info *, const struct qstr *, u64 *); @@ -32,14 +47,15 @@ enum bch_rename_mode { BCH_RENAME_EXCHANGE, }; -int bch2_dirent_rename(struct bch_fs *, +int bch2_dirent_rename(struct btree_trans *, struct bch_inode_info *, const struct qstr *, struct bch_inode_info *, const struct qstr *, - u64 
*, enum bch_rename_mode); + enum bch_rename_mode); u64 bch2_dirent_lookup(struct bch_fs *, u64, const struct bch_hash_info *, const struct qstr *); +int bch2_empty_dir_trans(struct btree_trans *, u64); int bch2_empty_dir(struct bch_fs *, u64); int bch2_readdir(struct bch_fs *, struct file *, struct dir_context *); diff --git a/fs/bcachefs/disk_groups.c b/fs/bcachefs/disk_groups.c index 87f3940e1df5..4a4ec8f46108 100644 --- a/fs/bcachefs/disk_groups.c +++ b/fs/bcachefs/disk_groups.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" #include "disk_groups.h" #include "super-io.h" @@ -82,11 +83,10 @@ err: return err; } -static size_t bch2_sb_disk_groups_to_text(char *buf, size_t size, +static void bch2_sb_disk_groups_to_text(struct printbuf *out, struct bch_sb *sb, struct bch_sb_field *f) { - char *out = buf, *end = buf + size; struct bch_sb_field_disk_groups *groups = field_to_type(f, disk_groups); struct bch_disk_group *g; @@ -96,18 +96,14 @@ static size_t bch2_sb_disk_groups_to_text(char *buf, size_t size, g < groups->entries + nr_groups; g++) { if (g != groups->entries) - out += scnprintf(out, end - out, " "); + pr_buf(out, " "); if (BCH_GROUP_DELETED(g)) - out += scnprintf(out, end - out, "[deleted]"); + pr_buf(out, "[deleted]"); else - out += scnprintf(out, end - out, - "[parent %llu name %s]", - BCH_GROUP_PARENT(g), - g->label); + pr_buf(out, "[parent %llu name %s]", + BCH_GROUP_PARENT(g), g->label); } - - return out - buf; } const struct bch_sb_field_ops bch_sb_field_ops_disk_groups = { @@ -342,10 +338,10 @@ int bch2_disk_path_find_or_create(struct bch_sb_handle *sb, const char *name) return v; } -int bch2_disk_path_print(struct bch_sb_handle *sb, - char *buf, size_t len, unsigned v) +void bch2_disk_path_to_text(struct printbuf *out, + struct bch_sb_handle *sb, + unsigned v) { - char *out = buf, *end = out + len; struct bch_sb_field_disk_groups *groups = bch2_sb_get_disk_groups(sb->sb); struct bch_disk_group *g; @@ -373,26 +369,18 @@ int bch2_disk_path_print(struct bch_sb_handle *sb, } while (nr) { - unsigned b = 0; - v = path[--nr]; g = groups->entries + v; - if (end != out) - b = min_t(size_t, end - out, - strnlen(g->label, sizeof(g->label))); - memcpy(out, g->label, b); - if (b < end - out) - out[b] = '\0'; - out += b; + bch_scnmemcpy(out, g->label, + strnlen(g->label, sizeof(g->label))); if (nr) - out += scnprintf(out, end - out, "."); + pr_buf(out, "."); } - - return out - buf; + return; inval: - return scnprintf(buf, len, "invalid group %u", v); + pr_buf(out, "invalid group %u", v); } int bch2_dev_group_set(struct bch_fs *c, struct bch_dev *ca, const char *name) @@ -451,14 +439,14 @@ int bch2_opt_target_parse(struct bch_fs *c, const char *buf, u64 *v) return -EINVAL; } -int bch2_opt_target_print(struct bch_fs *c, char *buf, size_t len, u64 v) +void bch2_opt_target_to_text(struct printbuf *out, struct bch_fs *c, u64 v) { struct target t = target_decode(v); - int ret; switch (t.type) { case TARGET_NULL: - return scnprintf(buf, len, "none"); + pr_buf(out, "none"); + break; case TARGET_DEV: { struct bch_dev *ca; @@ -470,13 +458,13 @@ int bch2_opt_target_print(struct bch_fs *c, char *buf, size_t len, u64 v) if (ca && percpu_ref_tryget(&ca->io_ref)) { char b[BDEVNAME_SIZE]; - ret = scnprintf(buf, len, "/dev/%s", - bdevname(ca->disk_sb.bdev, b)); + pr_buf(out, "/dev/%s", + bdevname(ca->disk_sb.bdev, b)); percpu_ref_put(&ca->io_ref); } else if (ca) { - ret = scnprintf(buf, len, "offline device %u", t.dev); + pr_buf(out, "offline device %u", t.dev); } else { - ret 
= scnprintf(buf, len, "invalid device %u", t.dev); + pr_buf(out, "invalid device %u", t.dev); } rcu_read_unlock(); @@ -484,12 +472,10 @@ int bch2_opt_target_print(struct bch_fs *c, char *buf, size_t len, u64 v) } case TARGET_GROUP: mutex_lock(&c->sb_lock); - ret = bch2_disk_path_print(&c->disk_sb, buf, len, t.group); + bch2_disk_path_to_text(out, &c->disk_sb, t.group); mutex_unlock(&c->sb_lock); break; default: BUG(); } - - return ret; } diff --git a/fs/bcachefs/disk_groups.h b/fs/bcachefs/disk_groups.h index e92c0dc50970..c8e0c37a5e1a 100644 --- a/fs/bcachefs/disk_groups.h +++ b/fs/bcachefs/disk_groups.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _BCACHEFS_DISK_GROUPS_H #define _BCACHEFS_DISK_GROUPS_H @@ -54,14 +55,28 @@ static inline struct target target_decode(unsigned target) } const struct bch_devs_mask *bch2_target_to_mask(struct bch_fs *, unsigned); + +static inline struct bch_devs_mask target_rw_devs(struct bch_fs *c, + enum bch_data_type data_type, + u16 target) +{ + struct bch_devs_mask devs = c->rw_devs[data_type]; + const struct bch_devs_mask *t = bch2_target_to_mask(c, target); + + if (t) + bitmap_and(devs.d, devs.d, t->d, BCH_SB_MEMBERS_MAX); + return devs; +} + bool bch2_dev_in_target(struct bch_fs *, unsigned, unsigned); int bch2_disk_path_find(struct bch_sb_handle *, const char *); int bch2_disk_path_find_or_create(struct bch_sb_handle *, const char *); -int bch2_disk_path_print(struct bch_sb_handle *, char *, size_t, unsigned); +void bch2_disk_path_to_text(struct printbuf *, struct bch_sb_handle *, + unsigned); int bch2_opt_target_parse(struct bch_fs *, const char *, u64 *); -int bch2_opt_target_print(struct bch_fs *, char *, size_t, u64); +void bch2_opt_target_to_text(struct printbuf *, struct bch_fs *, u64); int bch2_sb_disk_groups_to_cpu(struct bch_fs *); diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c new file mode 100644 index 000000000000..dba861111a8d --- /dev/null +++ b/fs/bcachefs/ec.c @@ -0,0 +1,1369 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* erasure coding */ + +#include "bcachefs.h" +#include "alloc_foreground.h" +#include "bset.h" +#include "btree_gc.h" +#include "btree_update.h" +#include "buckets.h" +#include "disk_groups.h" +#include "ec.h" +#include "error.h" +#include "io.h" +#include "keylist.h" +#include "recovery.h" +#include "super-io.h" +#include "util.h" + +#include <linux/sort.h> + +#ifdef __KERNEL__ + +#include <linux/raid/pq.h> +#include <linux/raid/xor.h> + +static void raid5_recov(unsigned disks, unsigned failed_idx, + size_t size, void **data) +{ + unsigned i = 2, nr; + + BUG_ON(failed_idx >= disks); + + swap(data[0], data[failed_idx]); + memcpy(data[0], data[1], size); + + while (i < disks) { + nr = min_t(unsigned, disks - i, MAX_XOR_BLOCKS); + xor_blocks(nr, size, data[0], data + i); + i += nr; + } + + swap(data[0], data[failed_idx]); +} + +static void raid_gen(int nd, int np, size_t size, void **v) +{ + if (np >= 1) + raid5_recov(nd + np, nd, size, v); + if (np >= 2) + raid6_call.gen_syndrome(nd + np, size, v); + BUG_ON(np > 2); +} + +static void raid_rec(int nr, int *ir, int nd, int np, size_t size, void **v) +{ + switch (nr) { + case 0: + break; + case 1: + if (ir[0] < nd + 1) + raid5_recov(nd + 1, ir[0], size, v); + else + raid6_call.gen_syndrome(nd + np, size, v); + break; + case 2: + if (ir[1] < nd) { + /* data+data failure. 
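Both lost blocks hold data, so rebuild them from the surviving data blocks plus both parities.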
*/ + raid6_2data_recov(nd + np, size, ir[0], ir[1], v); + } else if (ir[0] < nd) { + /* data + p/q failure */ + + if (ir[1] == nd) /* data + p failure */ + raid6_datap_recov(nd + np, size, ir[0], v); + else { /* data + q failure */ + raid5_recov(nd + 1, ir[0], size, v); + raid6_call.gen_syndrome(nd + np, size, v); + } + } else { + raid_gen(nd, np, size, v); + } + break; + default: + BUG(); + } +} + +#else + +#include <raid/raid.h> + +#endif + +struct ec_bio { + struct bch_dev *ca; + struct ec_stripe_buf *buf; + size_t idx; + struct bio bio; +}; + +/* Stripes btree keys: */ + +const char *bch2_stripe_invalid(const struct bch_fs *c, struct bkey_s_c k) +{ + const struct bch_stripe *s = bkey_s_c_to_stripe(k).v; + + if (k.k->p.inode) + return "invalid stripe key"; + + if (bkey_val_bytes(k.k) < sizeof(*s)) + return "incorrect value size"; + + if (bkey_val_bytes(k.k) < sizeof(*s) || + bkey_val_u64s(k.k) < stripe_val_u64s(s)) + return "incorrect value size"; + + return bch2_bkey_ptrs_invalid(c, k); +} + +void bch2_stripe_to_text(struct printbuf *out, struct bch_fs *c, + struct bkey_s_c k) +{ + const struct bch_stripe *s = bkey_s_c_to_stripe(k).v; + unsigned i; + + pr_buf(out, "algo %u sectors %u blocks %u:%u csum %u gran %u", + s->algorithm, + le16_to_cpu(s->sectors), + s->nr_blocks - s->nr_redundant, + s->nr_redundant, + s->csum_type, + 1U << s->csum_granularity_bits); + + for (i = 0; i < s->nr_blocks; i++) + pr_buf(out, " %u:%llu:%u", s->ptrs[i].dev, + (u64) s->ptrs[i].offset, + stripe_blockcount_get(s, i)); + + bch2_bkey_ptrs_to_text(out, c, k); +} + +static int ptr_matches_stripe(struct bch_fs *c, + struct bch_stripe *v, + const struct bch_extent_ptr *ptr) +{ + unsigned i; + + for (i = 0; i < v->nr_blocks - v->nr_redundant; i++) { + const struct bch_extent_ptr *ptr2 = v->ptrs + i; + + if (ptr->dev == ptr2->dev && + ptr->gen == ptr2->gen && + ptr->offset >= ptr2->offset && + ptr->offset < ptr2->offset + le16_to_cpu(v->sectors)) + return i; + } + + return -1; +} + +static int extent_matches_stripe(struct bch_fs *c, + struct bch_stripe *v, + struct bkey_s_c k) +{ + struct bkey_s_c_extent e; + const struct bch_extent_ptr *ptr; + int idx; + + if (!bkey_extent_is_data(k.k)) + return -1; + + e = bkey_s_c_to_extent(k); + + extent_for_each_ptr(e, ptr) { + idx = ptr_matches_stripe(c, v, ptr); + if (idx >= 0) + return idx; + } + + return -1; +} + +static bool extent_has_stripe_ptr(struct bkey_s_c k, u64 idx) +{ + struct bkey_s_c_extent e; + const union bch_extent_entry *entry; + + if (!bkey_extent_is_data(k.k)) + return false; + + e = bkey_s_c_to_extent(k); + + extent_for_each_entry(e, entry) + if (extent_entry_type(entry) == + BCH_EXTENT_ENTRY_stripe_ptr && + entry->stripe_ptr.idx == idx) + return true; + + return false; +} + +static void ec_stripe_key_init(struct bch_fs *c, + struct bkey_i_stripe *s, + struct open_buckets *blocks, + struct open_buckets *parity, + unsigned stripe_size) +{ + struct open_bucket *ob; + unsigned i, u64s; + + bkey_stripe_init(&s->k_i); + s->v.sectors = cpu_to_le16(stripe_size); + s->v.algorithm = 0; + s->v.nr_blocks = parity->nr + blocks->nr; + s->v.nr_redundant = parity->nr; + s->v.csum_granularity_bits = ilog2(c->sb.encoded_extent_max); + s->v.csum_type = BCH_CSUM_CRC32C; + s->v.pad = 0; + + open_bucket_for_each(c, blocks, ob, i) + s->v.ptrs[i] = ob->ptr; + + open_bucket_for_each(c, parity, ob, i) + s->v.ptrs[blocks->nr + i] = ob->ptr; + + while ((u64s = stripe_val_u64s(&s->v)) > BKEY_VAL_U64s_MAX) { + BUG_ON(1 << s->v.csum_granularity_bits >= + le16_to_cpu(s->v.sectors) 
|| + s->v.csum_granularity_bits == U8_MAX); + s->v.csum_granularity_bits++; + } + + set_bkey_val_u64s(&s->k, u64s); +} + +/* Checksumming: */ + +static void ec_generate_checksums(struct ec_stripe_buf *buf) +{ + struct bch_stripe *v = &buf->key.v; + unsigned csum_granularity = 1 << v->csum_granularity_bits; + unsigned csums_per_device = stripe_csums_per_device(v); + unsigned csum_bytes = bch_crc_bytes[v->csum_type]; + unsigned i, j; + + if (!csum_bytes) + return; + + BUG_ON(buf->offset); + BUG_ON(buf->size != le16_to_cpu(v->sectors)); + + for (i = 0; i < v->nr_blocks; i++) { + for (j = 0; j < csums_per_device; j++) { + unsigned offset = j << v->csum_granularity_bits; + unsigned len = min(csum_granularity, buf->size - offset); + + struct bch_csum csum = + bch2_checksum(NULL, v->csum_type, + null_nonce(), + buf->data[i] + (offset << 9), + len << 9); + + memcpy(stripe_csum(v, i, j), &csum, csum_bytes); + } + } +} + +static void ec_validate_checksums(struct bch_fs *c, struct ec_stripe_buf *buf) +{ + struct bch_stripe *v = &buf->key.v; + unsigned csum_granularity = 1 << v->csum_granularity_bits; + unsigned csum_bytes = bch_crc_bytes[v->csum_type]; + unsigned i; + + if (!csum_bytes) + return; + + for (i = 0; i < v->nr_blocks; i++) { + unsigned offset = buf->offset; + unsigned end = buf->offset + buf->size; + + if (!test_bit(i, buf->valid)) + continue; + + while (offset < end) { + unsigned j = offset >> v->csum_granularity_bits; + unsigned len = min(csum_granularity, end - offset); + struct bch_csum csum; + + BUG_ON(offset & (csum_granularity - 1)); + BUG_ON(offset + len != le16_to_cpu(v->sectors) && + ((offset + len) & (csum_granularity - 1))); + + csum = bch2_checksum(NULL, v->csum_type, + null_nonce(), + buf->data[i] + ((offset - buf->offset) << 9), + len << 9); + + if (memcmp(stripe_csum(v, i, j), &csum, csum_bytes)) { + __bcache_io_error(c, + "checksum error while doing reconstruct read (%u:%u)", + i, j); + clear_bit(i, buf->valid); + break; + } + + offset += len; + } + } +} + +/* Erasure coding: */ + +static void ec_generate_ec(struct ec_stripe_buf *buf) +{ + struct bch_stripe *v = &buf->key.v; + unsigned nr_data = v->nr_blocks - v->nr_redundant; + unsigned bytes = le16_to_cpu(v->sectors) << 9; + + raid_gen(nr_data, v->nr_redundant, bytes, buf->data); +} + +static unsigned __ec_nr_failed(struct ec_stripe_buf *buf, unsigned nr) +{ + return nr - bitmap_weight(buf->valid, nr); +} + +static unsigned ec_nr_failed(struct ec_stripe_buf *buf) +{ + return __ec_nr_failed(buf, buf->key.v.nr_blocks); +} + +static int ec_do_recov(struct bch_fs *c, struct ec_stripe_buf *buf) +{ + struct bch_stripe *v = &buf->key.v; + unsigned i, failed[EC_STRIPE_MAX], nr_failed = 0; + unsigned nr_data = v->nr_blocks - v->nr_redundant; + unsigned bytes = buf->size << 9; + + if (ec_nr_failed(buf) > v->nr_redundant) { + __bcache_io_error(c, + "error doing reconstruct read: unable to read enough blocks"); + return -1; + } + + for (i = 0; i < nr_data; i++) + if (!test_bit(i, buf->valid)) + failed[nr_failed++] = i; + + raid_rec(nr_failed, failed, nr_data, v->nr_redundant, bytes, buf->data); + return 0; +} + +/* IO: */ + +static void ec_block_endio(struct bio *bio) +{ + struct ec_bio *ec_bio = container_of(bio, struct ec_bio, bio); + struct bch_dev *ca = ec_bio->ca; + struct closure *cl = bio->bi_private; + + if (bch2_dev_io_err_on(bio->bi_status, ca, "erasure coding")) + clear_bit(ec_bio->idx, ec_bio->buf->valid); + + bio_put(&ec_bio->bio); + percpu_ref_put(&ca->io_ref); + closure_put(cl); +} + +static void ec_block_io(struct 
bch_fs *c, struct ec_stripe_buf *buf, + unsigned rw, unsigned idx, struct closure *cl) +{ + struct bch_stripe *v = &buf->key.v; + unsigned offset = 0, bytes = buf->size << 9; + struct bch_extent_ptr *ptr = &v->ptrs[idx]; + struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); + + if (!bch2_dev_get_ioref(ca, rw)) { + clear_bit(idx, buf->valid); + return; + } + + while (offset < bytes) { + unsigned nr_iovecs = min_t(size_t, BIO_MAX_PAGES, + DIV_ROUND_UP(bytes, PAGE_SIZE)); + unsigned b = min_t(size_t, bytes - offset, + nr_iovecs << PAGE_SHIFT); + struct ec_bio *ec_bio; + + ec_bio = container_of(bio_alloc_bioset(GFP_KERNEL, nr_iovecs, + &c->ec_bioset), + struct ec_bio, bio); + + ec_bio->ca = ca; + ec_bio->buf = buf; + ec_bio->idx = idx; + + bio_set_dev(&ec_bio->bio, ca->disk_sb.bdev); + bio_set_op_attrs(&ec_bio->bio, rw, 0); + + ec_bio->bio.bi_iter.bi_sector = ptr->offset + buf->offset + (offset >> 9); + ec_bio->bio.bi_iter.bi_size = b; + ec_bio->bio.bi_end_io = ec_block_endio; + ec_bio->bio.bi_private = cl; + + bch2_bio_map(&ec_bio->bio, buf->data[idx] + offset); + + closure_get(cl); + percpu_ref_get(&ca->io_ref); + + submit_bio(&ec_bio->bio); + + offset += b; + } + + percpu_ref_put(&ca->io_ref); +} + +/* recovery read path: */ +int bch2_ec_read_extent(struct bch_fs *c, struct bch_read_bio *rbio) +{ + struct btree_trans trans; + struct btree_iter *iter; + struct ec_stripe_buf *buf; + struct closure cl; + struct bkey_s_c k; + struct bch_stripe *v; + unsigned stripe_idx; + unsigned offset, end; + unsigned i, nr_data, csum_granularity; + int ret = 0, idx; + + closure_init_stack(&cl); + + BUG_ON(!rbio->pick.idx || + rbio->pick.idx - 1 >= rbio->pick.ec_nr); + + stripe_idx = rbio->pick.ec[rbio->pick.idx - 1].idx; + + buf = kzalloc(sizeof(*buf), GFP_NOIO); + if (!buf) + return -ENOMEM; + + bch2_trans_init(&trans, c, 0, 0); + + iter = bch2_trans_get_iter(&trans, BTREE_ID_EC, + POS(0, stripe_idx), + BTREE_ITER_SLOTS); + k = bch2_btree_iter_peek_slot(iter); + if (bkey_err(k) || k.k->type != KEY_TYPE_stripe) { + __bcache_io_error(c, + "error doing reconstruct read: stripe not found"); + kfree(buf); + return bch2_trans_exit(&trans) ?: -EIO; + } + + bkey_reassemble(&buf->key.k_i, k); + bch2_trans_exit(&trans); + + v = &buf->key.v; + + nr_data = v->nr_blocks - v->nr_redundant; + + idx = ptr_matches_stripe(c, v, &rbio->pick.ptr); + BUG_ON(idx < 0); + + csum_granularity = 1U << v->csum_granularity_bits; + + offset = rbio->bio.bi_iter.bi_sector - v->ptrs[idx].offset; + end = offset + bio_sectors(&rbio->bio); + + BUG_ON(end > le16_to_cpu(v->sectors)); + + buf->offset = round_down(offset, csum_granularity); + buf->size = min_t(unsigned, le16_to_cpu(v->sectors), + round_up(end, csum_granularity)) - buf->offset; + + for (i = 0; i < v->nr_blocks; i++) { + buf->data[i] = kmalloc(buf->size << 9, GFP_NOIO); + if (!buf->data[i]) { + ret = -ENOMEM; + goto err; + } + } + + memset(buf->valid, 0xFF, sizeof(buf->valid)); + + for (i = 0; i < v->nr_blocks; i++) { + struct bch_extent_ptr *ptr = v->ptrs + i; + struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); + + if (ptr_stale(ca, ptr)) { + __bcache_io_error(c, + "error doing reconstruct read: stale pointer"); + clear_bit(i, buf->valid); + continue; + } + + ec_block_io(c, buf, REQ_OP_READ, i, &cl); + } + + closure_sync(&cl); + + if (ec_nr_failed(buf) > v->nr_redundant) { + __bcache_io_error(c, + "error doing reconstruct read: unable to read enough blocks"); + ret = -EIO; + goto err; + } + + ec_validate_checksums(c, buf); + + ret = ec_do_recov(c, buf); + if (ret) + goto 
err; + + memcpy_to_bio(&rbio->bio, rbio->bio.bi_iter, + buf->data[idx] + ((offset - buf->offset) << 9)); +err: + for (i = 0; i < v->nr_blocks; i++) + kfree(buf->data[i]); + kfree(buf); + return ret; +} + +/* stripe bucket accounting: */ + +static int __ec_stripe_mem_alloc(struct bch_fs *c, size_t idx, gfp_t gfp) +{ + ec_stripes_heap n, *h = &c->ec_stripes_heap; + + if (idx >= h->size) { + if (!init_heap(&n, max(1024UL, roundup_pow_of_two(idx + 1)), gfp)) + return -ENOMEM; + + spin_lock(&c->ec_stripes_heap_lock); + if (n.size > h->size) { + memcpy(n.data, h->data, h->used * sizeof(h->data[0])); + n.used = h->used; + swap(*h, n); + } + spin_unlock(&c->ec_stripes_heap_lock); + + free_heap(&n); + } + + if (!genradix_ptr_alloc(&c->stripes[0], idx, gfp)) + return -ENOMEM; + + if (c->gc_pos.phase != GC_PHASE_NOT_RUNNING && + !genradix_ptr_alloc(&c->stripes[1], idx, gfp)) + return -ENOMEM; + + return 0; +} + +static int ec_stripe_mem_alloc(struct bch_fs *c, + struct btree_iter *iter) +{ + size_t idx = iter->pos.offset; + int ret = 0; + + if (!__ec_stripe_mem_alloc(c, idx, GFP_NOWAIT)) + return ret; + + bch2_trans_unlock(iter->trans); + ret = -EINTR; + + if (!__ec_stripe_mem_alloc(c, idx, GFP_KERNEL)) + return ret; + + return -ENOMEM; +} + +static ssize_t stripe_idx_to_delete(struct bch_fs *c) +{ + ec_stripes_heap *h = &c->ec_stripes_heap; + + return h->data[0].blocks_nonempty == 0 ? h->data[0].idx : -1; +} + +static inline int ec_stripes_heap_cmp(ec_stripes_heap *h, + struct ec_stripe_heap_entry l, + struct ec_stripe_heap_entry r) +{ + return ((l.blocks_nonempty > r.blocks_nonempty) - + (l.blocks_nonempty < r.blocks_nonempty)); +} + +static inline void ec_stripes_heap_set_backpointer(ec_stripes_heap *h, + size_t i) +{ + struct bch_fs *c = container_of(h, struct bch_fs, ec_stripes_heap); + + genradix_ptr(&c->stripes[0], h->data[i].idx)->heap_idx = i; +} + +static void heap_verify_backpointer(struct bch_fs *c, size_t idx) +{ + ec_stripes_heap *h = &c->ec_stripes_heap; + struct stripe *m = genradix_ptr(&c->stripes[0], idx); + + BUG_ON(!m->alive); + BUG_ON(m->heap_idx >= h->used); + BUG_ON(h->data[m->heap_idx].idx != idx); +} + +void bch2_stripes_heap_update(struct bch_fs *c, + struct stripe *m, size_t idx) +{ + ec_stripes_heap *h = &c->ec_stripes_heap; + size_t i; + + if (m->alive) { + heap_verify_backpointer(c, idx); + + h->data[m->heap_idx].blocks_nonempty = m->blocks_nonempty; + + i = m->heap_idx; + heap_sift_up(h, i, ec_stripes_heap_cmp, + ec_stripes_heap_set_backpointer); + heap_sift_down(h, i, ec_stripes_heap_cmp, + ec_stripes_heap_set_backpointer); + + heap_verify_backpointer(c, idx); + } else { + bch2_stripes_heap_insert(c, m, idx); + } + + if (stripe_idx_to_delete(c) >= 0) + schedule_work(&c->ec_stripe_delete_work); +} + +void bch2_stripes_heap_del(struct bch_fs *c, + struct stripe *m, size_t idx) +{ + heap_verify_backpointer(c, idx); + + m->alive = false; + heap_del(&c->ec_stripes_heap, m->heap_idx, + ec_stripes_heap_cmp, + ec_stripes_heap_set_backpointer); +} + +void bch2_stripes_heap_insert(struct bch_fs *c, + struct stripe *m, size_t idx) +{ + BUG_ON(heap_full(&c->ec_stripes_heap)); + + heap_add(&c->ec_stripes_heap, ((struct ec_stripe_heap_entry) { + .idx = idx, + .blocks_nonempty = m->blocks_nonempty, + }), + ec_stripes_heap_cmp, + ec_stripes_heap_set_backpointer); + m->alive = true; + + heap_verify_backpointer(c, idx); +} + +/* stripe deletion */ + +static int ec_stripe_delete(struct bch_fs *c, size_t idx) +{ + return bch2_btree_delete_range(c, BTREE_ID_EC, + POS(0, idx), + POS(0, idx 
+ 1), + NULL); +} + +static void ec_stripe_delete_work(struct work_struct *work) +{ + struct bch_fs *c = + container_of(work, struct bch_fs, ec_stripe_delete_work); + ssize_t idx; + + down_read(&c->gc_lock); + mutex_lock(&c->ec_stripe_create_lock); + + while (1) { + spin_lock(&c->ec_stripes_heap_lock); + idx = stripe_idx_to_delete(c); + spin_unlock(&c->ec_stripes_heap_lock); + + if (idx < 0) + break; + + ec_stripe_delete(c, idx); + } + + mutex_unlock(&c->ec_stripe_create_lock); + up_read(&c->gc_lock); +} + +/* stripe creation: */ + +static int ec_stripe_bkey_insert(struct bch_fs *c, + struct bkey_i_stripe *stripe) +{ + struct btree_trans trans; + struct btree_iter *iter; + struct bkey_s_c k; + int ret; + + bch2_trans_init(&trans, c, 0, 0); +retry: + bch2_trans_begin(&trans); + + /* XXX: start pos hint */ + for_each_btree_key(&trans, iter, BTREE_ID_EC, POS_MIN, + BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) { + if (bkey_cmp(k.k->p, POS(0, U32_MAX)) > 0) + break; + + if (bkey_deleted(k.k)) + goto found_slot; + } + + if (!ret) + ret = -ENOSPC; + goto err; +found_slot: + ret = ec_stripe_mem_alloc(c, iter); + if (ret) + goto err; + + stripe->k.p = iter->pos; + + bch2_trans_update(&trans, BTREE_INSERT_ENTRY(iter, &stripe->k_i)); + + ret = bch2_trans_commit(&trans, NULL, NULL, + BTREE_INSERT_ATOMIC| + BTREE_INSERT_NOFAIL); +err: + if (ret == -EINTR) + goto retry; + bch2_trans_exit(&trans); + + return ret; +} + +static void extent_stripe_ptr_add(struct bkey_s_extent e, + struct ec_stripe_buf *s, + struct bch_extent_ptr *ptr, + unsigned block) +{ + struct bch_extent_stripe_ptr *dst = (void *) ptr; + union bch_extent_entry *end = extent_entry_last(e); + + memmove_u64s_up(dst + 1, dst, (u64 *) end - (u64 *) dst); + e.k->u64s += sizeof(*dst) / sizeof(u64); + + *dst = (struct bch_extent_stripe_ptr) { + .type = 1 << BCH_EXTENT_ENTRY_stripe_ptr, + .block = block, + .idx = s->key.k.p.offset, + }; +} + +static int ec_stripe_update_ptrs(struct bch_fs *c, + struct ec_stripe_buf *s, + struct bkey *pos) +{ + struct btree_trans trans; + struct btree_iter *iter; + struct bkey_s_c k; + struct bkey_s_extent e; + struct bch_extent_ptr *ptr; + BKEY_PADDED(k) tmp; + int ret = 0, dev, idx; + + bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); + + iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, + bkey_start_pos(pos), + BTREE_ITER_INTENT); + + while ((k = bch2_btree_iter_peek(iter)).k && + !(ret = bkey_err(k)) && + bkey_cmp(bkey_start_pos(k.k), pos->p) < 0) { + if (extent_has_stripe_ptr(k, s->key.k.p.offset)) { + bch2_btree_iter_next(iter); + continue; + } + + idx = extent_matches_stripe(c, &s->key.v, k); + if (idx < 0) { + bch2_btree_iter_next(iter); + continue; + } + + bch2_btree_iter_set_pos(iter, bkey_start_pos(k.k)); + + dev = s->key.v.ptrs[idx].dev; + + bkey_reassemble(&tmp.k, k); + e = bkey_i_to_s_extent(&tmp.k); + + extent_for_each_ptr(e, ptr) + if (ptr->dev != dev) + ptr->cached = true; + + ptr = (void *) bch2_extent_has_device(e.c, dev); + BUG_ON(!ptr); + + extent_stripe_ptr_add(e, s, ptr, idx); + + bch2_trans_update(&trans, BTREE_INSERT_ENTRY(iter, &tmp.k)); + + ret = bch2_trans_commit(&trans, NULL, NULL, + BTREE_INSERT_ATOMIC| + BTREE_INSERT_NOFAIL| + BTREE_INSERT_USE_RESERVE); + if (ret == -EINTR) + ret = 0; + if (ret) + break; + } + + bch2_trans_exit(&trans); + + return ret; +} + +/* + * data buckets of new stripe all written: create the stripe + */ +static void ec_stripe_create(struct ec_stripe_new *s) +{ + struct bch_fs *c = s->c; + struct open_bucket *ob; + struct bkey_i *k; + struct bch_stripe *v = 
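/*
 * Reviewer's note: extent_stripe_ptr_add() above splices a stripe pointer
 * into a packed extent value in place: memmove_u64s_up() opens a gap at
 * the data pointer's position by shifting that pointer and the rest of
 * the value up by sizeof(struct bch_extent_stripe_ptr) / sizeof(u64)
 * words, the new entry - tagged BCH_EXTENT_ENTRY_stripe_ptr and recording
 * the stripe index plus the block within the stripe - is written into the
 * gap, and e.k->u64s grows to match. Callers must ensure the key has room
 * for one more entry first.
 */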
&s->stripe.key.v; + unsigned i, nr_data = v->nr_blocks - v->nr_redundant; + struct closure cl; + int ret; + + BUG_ON(s->h->s == s); + + closure_init_stack(&cl); + + if (s->err) { + bch_err(c, "error creating stripe: error writing data buckets"); + goto err; + } + + if (!percpu_ref_tryget(&c->writes)) + goto err; + + BUG_ON(bitmap_weight(s->blocks_allocated, + s->blocks.nr) != s->blocks.nr); + + ec_generate_ec(&s->stripe); + + ec_generate_checksums(&s->stripe); + + /* write p/q: */ + for (i = nr_data; i < v->nr_blocks; i++) + ec_block_io(c, &s->stripe, REQ_OP_WRITE, i, &cl); + + closure_sync(&cl); + + for (i = nr_data; i < v->nr_blocks; i++) + if (!test_bit(i, s->stripe.valid)) { + bch_err(c, "error creating stripe: error writing redundancy buckets"); + goto err_put_writes; + } + + mutex_lock(&c->ec_stripe_create_lock); + + ret = ec_stripe_bkey_insert(c, &s->stripe.key); + if (ret) { + bch_err(c, "error creating stripe: error creating stripe key"); + goto err_unlock; + } + + for_each_keylist_key(&s->keys, k) { + ret = ec_stripe_update_ptrs(c, &s->stripe, &k->k); + if (ret) + break; + } + +err_unlock: + mutex_unlock(&c->ec_stripe_create_lock); +err_put_writes: + percpu_ref_put(&c->writes); +err: + open_bucket_for_each(c, &s->blocks, ob, i) { + ob->ec = NULL; + __bch2_open_bucket_put(c, ob); + } + + bch2_open_buckets_put(c, &s->parity); + + bch2_keylist_free(&s->keys, s->inline_keys); + + mutex_lock(&s->h->lock); + list_del(&s->list); + mutex_unlock(&s->h->lock); + + for (i = 0; i < s->stripe.key.v.nr_blocks; i++) + kvpfree(s->stripe.data[i], s->stripe.size << 9); + kfree(s); +} + +static struct ec_stripe_new *ec_stripe_set_pending(struct ec_stripe_head *h) +{ + struct ec_stripe_new *s = h->s; + + list_add(&s->list, &h->stripes); + h->s = NULL; + + return s; +} + +static void ec_stripe_new_put(struct ec_stripe_new *s) +{ + BUG_ON(atomic_read(&s->pin) <= 0); + if (atomic_dec_and_test(&s->pin)) + ec_stripe_create(s); +} + +/* have a full bucket - hand it off to be erasure coded: */ +void bch2_ec_bucket_written(struct bch_fs *c, struct open_bucket *ob) +{ + struct ec_stripe_new *s = ob->ec; + + if (ob->sectors_free) + s->err = -1; + + ec_stripe_new_put(s); +} + +void bch2_ec_bucket_cancel(struct bch_fs *c, struct open_bucket *ob) +{ + struct ec_stripe_new *s = ob->ec; + + s->err = -EIO; +} + +void *bch2_writepoint_ec_buf(struct bch_fs *c, struct write_point *wp) +{ + struct open_bucket *ob = ec_open_bucket(c, &wp->ptrs); + struct bch_dev *ca; + unsigned offset; + + if (!ob) + return NULL; + + ca = bch_dev_bkey_exists(c, ob->ptr.dev); + offset = ca->mi.bucket_size - ob->sectors_free; + + return ob->ec->stripe.data[ob->ec_idx] + (offset << 9); +} + +void bch2_ec_add_backpointer(struct bch_fs *c, struct write_point *wp, + struct bpos pos, unsigned sectors) +{ + struct open_bucket *ob = ec_open_bucket(c, &wp->ptrs); + struct ec_stripe_new *ec; + + if (!ob) + return; + + ec = ob->ec; + mutex_lock(&ec->lock); + + if (bch2_keylist_realloc(&ec->keys, ec->inline_keys, + ARRAY_SIZE(ec->inline_keys), + BKEY_U64s)) { + BUG(); + } + + bkey_init(&ec->keys.top->k); + ec->keys.top->k.p = pos; + bch2_key_resize(&ec->keys.top->k, sectors); + bch2_keylist_push(&ec->keys); + + mutex_unlock(&ec->lock); +} + +static int unsigned_cmp(const void *_l, const void *_r) +{ + unsigned l = *((const unsigned *) _l); + unsigned r = *((const unsigned *) _r); + + return cmp_int(l, r); +} + +/* pick most common bucket size: */ +static unsigned pick_blocksize(struct bch_fs *c, + struct bch_devs_mask *devs) +{ + struct bch_dev *ca; 
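/*
 * Reviewer's note: pick_blocksize() here computes the mode of the member
 * devices' bucket sizes: gather every mi.bucket_size, sort with
 * unsigned_cmp() so equal values become adjacent, then make one pass
 * tracking the current run and the best run seen so far as (nr, size)
 * pairs. The stripe head built from the result then only counts devices
 * whose bucket size matches the winner (nr_active_devs below).
 */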
+ unsigned i, nr = 0, sizes[BCH_SB_MEMBERS_MAX]; + struct { + unsigned nr, size; + } cur = { 0, 0 }, best = { 0, 0 }; + + for_each_member_device_rcu(ca, c, i, devs) + sizes[nr++] = ca->mi.bucket_size; + + sort(sizes, nr, sizeof(unsigned), unsigned_cmp, NULL); + + for (i = 0; i < nr; i++) { + if (sizes[i] != cur.size) { + if (cur.nr > best.nr) + best = cur; + + cur.nr = 0; + cur.size = sizes[i]; + } + + cur.nr++; + } + + if (cur.nr > best.nr) + best = cur; + + return best.size; +} + +int bch2_ec_stripe_new_alloc(struct bch_fs *c, struct ec_stripe_head *h) +{ + struct ec_stripe_new *s; + unsigned i; + + BUG_ON(h->parity.nr != h->redundancy); + BUG_ON(!h->blocks.nr); + BUG_ON(h->parity.nr + h->blocks.nr > EC_STRIPE_MAX); + lockdep_assert_held(&h->lock); + + s = kzalloc(sizeof(*s), GFP_KERNEL); + if (!s) + return -ENOMEM; + + mutex_init(&s->lock); + atomic_set(&s->pin, 1); + s->c = c; + s->h = h; + s->blocks = h->blocks; + s->parity = h->parity; + + memset(&h->blocks, 0, sizeof(h->blocks)); + memset(&h->parity, 0, sizeof(h->parity)); + + bch2_keylist_init(&s->keys, s->inline_keys); + + s->stripe.offset = 0; + s->stripe.size = h->blocksize; + memset(s->stripe.valid, 0xFF, sizeof(s->stripe.valid)); + + ec_stripe_key_init(c, &s->stripe.key, + &s->blocks, &s->parity, + h->blocksize); + + for (i = 0; i < s->stripe.key.v.nr_blocks; i++) { + s->stripe.data[i] = kvpmalloc(s->stripe.size << 9, GFP_KERNEL); + if (!s->stripe.data[i]) + goto err; + } + + h->s = s; + + return 0; +err: + for (i = 0; i < s->stripe.key.v.nr_blocks; i++) + kvpfree(s->stripe.data[i], s->stripe.size << 9); + kfree(s); + return -ENOMEM; +} + +static struct ec_stripe_head * +ec_new_stripe_head_alloc(struct bch_fs *c, unsigned target, + unsigned algo, unsigned redundancy) +{ + struct ec_stripe_head *h; + struct bch_dev *ca; + unsigned i; + + h = kzalloc(sizeof(*h), GFP_KERNEL); + if (!h) + return NULL; + + mutex_init(&h->lock); + mutex_lock(&h->lock); + INIT_LIST_HEAD(&h->stripes); + + h->target = target; + h->algo = algo; + h->redundancy = redundancy; + + rcu_read_lock(); + h->devs = target_rw_devs(c, BCH_DATA_USER, target); + + for_each_member_device_rcu(ca, c, i, &h->devs) + if (!ca->mi.durability) + __clear_bit(i, h->devs.d); + + h->blocksize = pick_blocksize(c, &h->devs); + + for_each_member_device_rcu(ca, c, i, &h->devs) + if (ca->mi.bucket_size == h->blocksize) + h->nr_active_devs++; + + rcu_read_unlock(); + list_add(&h->list, &c->ec_new_stripe_list); + return h; +} + +void bch2_ec_stripe_head_put(struct ec_stripe_head *h) +{ + struct ec_stripe_new *s = NULL; + + if (h->s && + bitmap_weight(h->s->blocks_allocated, + h->s->blocks.nr) == h->s->blocks.nr) + s = ec_stripe_set_pending(h); + + mutex_unlock(&h->lock); + + if (s) + ec_stripe_new_put(s); +} + +struct ec_stripe_head *bch2_ec_stripe_head_get(struct bch_fs *c, + unsigned target, + unsigned algo, + unsigned redundancy) +{ + struct ec_stripe_head *h; + + if (!redundancy) + return NULL; + + mutex_lock(&c->ec_new_stripe_lock); + list_for_each_entry(h, &c->ec_new_stripe_list, list) + if (h->target == target && + h->algo == algo && + h->redundancy == redundancy) { + mutex_lock(&h->lock); + goto found; + } + + h = ec_new_stripe_head_alloc(c, target, algo, redundancy); +found: + mutex_unlock(&c->ec_new_stripe_lock); + return h; +} + +void bch2_ec_stop_dev(struct bch_fs *c, struct bch_dev *ca) +{ + struct ec_stripe_head *h; + struct open_bucket *ob; + unsigned i; + + mutex_lock(&c->ec_new_stripe_lock); + list_for_each_entry(h, &c->ec_new_stripe_list, list) { + struct 
ec_stripe_new *s = NULL; + + mutex_lock(&h->lock); + bch2_open_buckets_stop_dev(c, ca, + &h->blocks, + BCH_DATA_USER); + bch2_open_buckets_stop_dev(c, ca, + &h->parity, + BCH_DATA_USER); + + if (!h->s) + goto unlock; + + open_bucket_for_each(c, &h->s->blocks, ob, i) + if (ob->ptr.dev == ca->dev_idx) + goto found; + open_bucket_for_each(c, &h->s->parity, ob, i) + if (ob->ptr.dev == ca->dev_idx) + goto found; + goto unlock; +found: + h->s->err = -1; + s = ec_stripe_set_pending(h); +unlock: + mutex_unlock(&h->lock); + + if (s) + ec_stripe_new_put(s); + } + mutex_unlock(&c->ec_new_stripe_lock); +} + +static int __bch2_stripe_write_key(struct btree_trans *trans, + struct btree_iter *iter, + struct stripe *m, + size_t idx, + struct bkey_i_stripe *new_key, + unsigned flags) +{ + struct bch_fs *c = trans->c; + struct bkey_s_c k; + unsigned i; + int ret; + + bch2_btree_iter_set_pos(iter, POS(0, idx)); + + k = bch2_btree_iter_peek_slot(iter); + ret = bkey_err(k); + if (ret) + return ret; + + if (k.k->type != KEY_TYPE_stripe) + return -EIO; + + bkey_reassemble(&new_key->k_i, k); + + spin_lock(&c->ec_stripes_heap_lock); + + for (i = 0; i < new_key->v.nr_blocks; i++) + stripe_blockcount_set(&new_key->v, i, + m->block_sectors[i]); + m->dirty = false; + + spin_unlock(&c->ec_stripes_heap_lock); + + bch2_trans_update(trans, BTREE_INSERT_ENTRY(iter, &new_key->k_i)); + + return bch2_trans_commit(trans, NULL, NULL, + BTREE_INSERT_NOFAIL|flags); +} + +int bch2_stripes_write(struct bch_fs *c, unsigned flags, bool *wrote) +{ + struct btree_trans trans; + struct btree_iter *iter; + struct genradix_iter giter; + struct bkey_i_stripe *new_key; + struct stripe *m; + int ret = 0; + + new_key = kmalloc(255 * sizeof(u64), GFP_KERNEL); + BUG_ON(!new_key); + + bch2_trans_init(&trans, c, 0, 0); + + iter = bch2_trans_get_iter(&trans, BTREE_ID_EC, POS_MIN, + BTREE_ITER_SLOTS|BTREE_ITER_INTENT); + + genradix_for_each(&c->stripes[0], giter, m) { + if (!m->dirty) + continue; + + ret = __bch2_stripe_write_key(&trans, iter, m, giter.pos, + new_key, flags); + if (ret) + break; + + *wrote = true; + } + + bch2_trans_exit(&trans); + + kfree(new_key); + + return ret; +} + +int bch2_stripes_read(struct bch_fs *c, struct journal_keys *journal_keys) +{ + struct journal_key *i; + struct btree_trans trans; + struct btree_iter *iter; + struct bkey_s_c k; + int ret; + + ret = bch2_fs_ec_start(c); + if (ret) + return ret; + + bch2_trans_init(&trans, c, 0, 0); + + for_each_btree_key(&trans, iter, BTREE_ID_EC, POS_MIN, 0, k, ret) + bch2_mark_key(c, k, 0, NULL, 0, + BCH_BUCKET_MARK_ALLOC_READ| + BCH_BUCKET_MARK_NOATOMIC); + + ret = bch2_trans_exit(&trans) ?: ret; + if (ret) { + bch_err(c, "error reading stripes: %i", ret); + return ret; + } + + for_each_journal_key(*journal_keys, i) + if (i->btree_id == BTREE_ID_EC) + bch2_mark_key(c, bkey_i_to_s_c(i->k), + 0, NULL, 0, + BCH_BUCKET_MARK_ALLOC_READ| + BCH_BUCKET_MARK_NOATOMIC); + + return 0; +} + +int bch2_ec_mem_alloc(struct bch_fs *c, bool gc) +{ + struct btree_trans trans; + struct btree_iter *iter; + struct bkey_s_c k; + size_t i, idx = 0; + int ret = 0; + + bch2_trans_init(&trans, c, 0, 0); + + iter = bch2_trans_get_iter(&trans, BTREE_ID_EC, POS(0, U64_MAX), 0); + + k = bch2_btree_iter_prev(iter); + if (!IS_ERR_OR_NULL(k.k)) + idx = k.k->p.offset + 1; + ret = bch2_trans_exit(&trans); + if (ret) + return ret; + + if (!gc && + !init_heap(&c->ec_stripes_heap, roundup_pow_of_two(idx), + GFP_KERNEL)) + return -ENOMEM; +#if 0 + ret = genradix_prealloc(&c->stripes[gc], idx, GFP_KERNEL); +#else + 
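/*
 * Reviewer's note: this fallback loop stands in for the #if 0'd
 * genradix_prealloc() call: it walks every index below the highest stripe
 * present in the EC btree so genradix_ptr_alloc() instantiates the radix
 * tree nodes now, rather than failing with -ENOMEM during recovery or GC.
 * c->stripes[0] is the live copy; c->stripes[1] is the shadow populated
 * while mark-and-sweep GC runs (see __ec_stripe_mem_alloc() earlier in
 * this file).
 */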
for (i = 0; i < idx; i++) + if (!genradix_ptr_alloc(&c->stripes[gc], i, GFP_KERNEL)) + return -ENOMEM; +#endif + return 0; +} + +int bch2_fs_ec_start(struct bch_fs *c) +{ + return bch2_ec_mem_alloc(c, false); +} + +void bch2_fs_ec_exit(struct bch_fs *c) +{ + struct ec_stripe_head *h; + + while (1) { + mutex_lock(&c->ec_new_stripe_lock); + h = list_first_entry_or_null(&c->ec_new_stripe_list, + struct ec_stripe_head, list); + if (h) + list_del(&h->list); + mutex_unlock(&c->ec_new_stripe_lock); + if (!h) + break; + + BUG_ON(h->s); + BUG_ON(!list_empty(&h->stripes)); + kfree(h); + } + + free_heap(&c->ec_stripes_heap); + genradix_free(&c->stripes[0]); + bioset_exit(&c->ec_bioset); +} + +int bch2_fs_ec_init(struct bch_fs *c) +{ + INIT_WORK(&c->ec_stripe_delete_work, ec_stripe_delete_work); + + return bioset_init(&c->ec_bioset, 1, offsetof(struct ec_bio, bio), + BIOSET_NEED_BVECS); +} diff --git a/fs/bcachefs/ec.h b/fs/bcachefs/ec.h new file mode 100644 index 000000000000..8d9fbfd19f66 --- /dev/null +++ b/fs/bcachefs/ec.h @@ -0,0 +1,164 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_EC_H +#define _BCACHEFS_EC_H + +#include "ec_types.h" +#include "keylist_types.h" + +const char *bch2_stripe_invalid(const struct bch_fs *, struct bkey_s_c); +void bch2_stripe_to_text(struct printbuf *, struct bch_fs *, + struct bkey_s_c); + +#define bch2_bkey_ops_stripe (struct bkey_ops) { \ + .key_invalid = bch2_stripe_invalid, \ + .val_to_text = bch2_stripe_to_text, \ +} + +static inline unsigned stripe_csums_per_device(const struct bch_stripe *s) +{ + return DIV_ROUND_UP(le16_to_cpu(s->sectors), + 1 << s->csum_granularity_bits); +} + +static inline unsigned stripe_csum_offset(const struct bch_stripe *s, + unsigned dev, unsigned csum_idx) +{ + unsigned csum_bytes = bch_crc_bytes[s->csum_type]; + + return sizeof(struct bch_stripe) + + sizeof(struct bch_extent_ptr) * s->nr_blocks + + (dev * stripe_csums_per_device(s) + csum_idx) * csum_bytes; +} + +static inline unsigned stripe_blockcount_offset(const struct bch_stripe *s, + unsigned idx) +{ + return stripe_csum_offset(s, s->nr_blocks, 0) + + sizeof(u16) * idx; +} + +static inline unsigned stripe_blockcount_get(const struct bch_stripe *s, + unsigned idx) +{ + return le16_to_cpup((void *) s + stripe_blockcount_offset(s, idx)); +} + +static inline void stripe_blockcount_set(struct bch_stripe *s, + unsigned idx, unsigned v) +{ + __le16 *p = (void *) s + stripe_blockcount_offset(s, idx); + + *p = cpu_to_le16(v); +} + +static inline unsigned stripe_val_u64s(const struct bch_stripe *s) +{ + return DIV_ROUND_UP(stripe_blockcount_offset(s, s->nr_blocks), + sizeof(u64)); +} + +static inline void *stripe_csum(struct bch_stripe *s, + unsigned dev, unsigned csum_idx) +{ + return (void *) s + stripe_csum_offset(s, dev, csum_idx); +} + +struct bch_read_bio; + +struct ec_stripe_buf { + /* might not be buffering the entire stripe: */ + unsigned offset; + unsigned size; + unsigned long valid[BITS_TO_LONGS(EC_STRIPE_MAX)]; + + void *data[EC_STRIPE_MAX]; + + union { + struct bkey_i_stripe key; + u64 pad[255]; + }; +}; + +struct ec_stripe_head; + +struct ec_stripe_new { + struct bch_fs *c; + struct ec_stripe_head *h; + struct mutex lock; + struct list_head list; + + /* counts in flight writes, stripe is created when pin == 0 */ + atomic_t pin; + + int err; + + unsigned long blocks_allocated[BITS_TO_LONGS(EC_STRIPE_MAX)]; + + struct open_buckets blocks; + struct open_buckets parity; + + struct keylist keys; + u64 inline_keys[BKEY_U64s * 8]; + + struct ec_stripe_buf stripe; 
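/*
 * Reviewer's note: the stripe_csum_offset() / stripe_blockcount_offset()
 * helpers above encode the variable-length layout of a bch_stripe value:
 * the fixed header, then nr_blocks extent pointers, then per-block
 * checksum arrays, then nr_blocks u16 sector counts. Worked example with
 * assumed parameters (illustrative only): 6 blocks of 128 sectors with
 * crc32c (4-byte csums) and csum_granularity_bits = 5 gives
 * DIV_ROUND_UP(128, 32) = 4 checksums per block, so the block counts
 * start sizeof(struct bch_stripe) + 6 * sizeof(struct bch_extent_ptr) +
 * 6 * 4 * 4 bytes into the value, and stripe_val_u64s() rounds the total
 * up to whole u64s.
 */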
+}; + +struct ec_stripe_head { + struct list_head list; + struct mutex lock; + + struct list_head stripes; + + unsigned target; + unsigned algo; + unsigned redundancy; + + struct bch_devs_mask devs; + unsigned nr_active_devs; + + unsigned blocksize; + + struct dev_stripe_state block_stripe; + struct dev_stripe_state parity_stripe; + + struct open_buckets blocks; + struct open_buckets parity; + + struct ec_stripe_new *s; +}; + +int bch2_ec_read_extent(struct bch_fs *, struct bch_read_bio *); + +void *bch2_writepoint_ec_buf(struct bch_fs *, struct write_point *); +void bch2_ec_add_backpointer(struct bch_fs *, struct write_point *, + struct bpos, unsigned); + +void bch2_ec_bucket_written(struct bch_fs *, struct open_bucket *); +void bch2_ec_bucket_cancel(struct bch_fs *, struct open_bucket *); + +int bch2_ec_stripe_new_alloc(struct bch_fs *, struct ec_stripe_head *); + +void bch2_ec_stripe_head_put(struct ec_stripe_head *); +struct ec_stripe_head *bch2_ec_stripe_head_get(struct bch_fs *, unsigned, + unsigned, unsigned); + +void bch2_stripes_heap_update(struct bch_fs *, struct stripe *, size_t); +void bch2_stripes_heap_del(struct bch_fs *, struct stripe *, size_t); +void bch2_stripes_heap_insert(struct bch_fs *, struct stripe *, size_t); + +void bch2_ec_stop_dev(struct bch_fs *, struct bch_dev *); + +void bch2_ec_flush_new_stripes(struct bch_fs *); + +struct journal_keys; +int bch2_stripes_read(struct bch_fs *, struct journal_keys *); +int bch2_stripes_write(struct bch_fs *, unsigned, bool *); + +int bch2_ec_mem_alloc(struct bch_fs *, bool); + +int bch2_fs_ec_start(struct bch_fs *); + +void bch2_fs_ec_exit(struct bch_fs *); +int bch2_fs_ec_init(struct bch_fs *); + +#endif /* _BCACHEFS_EC_H */ diff --git a/fs/bcachefs/ec_types.h b/fs/bcachefs/ec_types.h new file mode 100644 index 000000000000..5c3f77c8aac7 --- /dev/null +++ b/fs/bcachefs/ec_types.h @@ -0,0 +1,38 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_EC_TYPES_H +#define _BCACHEFS_EC_TYPES_H + +#include <linux/llist.h> + +#define EC_STRIPE_MAX 16 + +struct bch_replicas_padded { + struct bch_replicas_entry e; + u8 pad[EC_STRIPE_MAX]; +}; + +struct stripe { + size_t heap_idx; + + u16 sectors; + u8 algorithm; + + u8 nr_blocks; + u8 nr_redundant; + + unsigned alive:1; + unsigned dirty:1; + u8 blocks_nonempty; + u16 block_sectors[EC_STRIPE_MAX]; + + struct bch_replicas_padded r; +}; + +struct ec_stripe_heap_entry { + size_t idx; + unsigned blocks_nonempty; +}; + +typedef HEAP(struct ec_stripe_heap_entry) ec_stripes_heap; + +#endif /* _BCACHEFS_EC_TYPES_H */ diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c index 2a357fc33ef7..1aaff44e18cf 100644 --- a/fs/bcachefs/error.c +++ b/fs/bcachefs/error.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" #include "error.h" #include "io.h" @@ -66,10 +67,17 @@ enum fsck_err_ret bch2_fsck_err(struct bch_fs *c, unsigned flags, bool fix = false, print = true, suppressing = false; char _buf[sizeof(s->buf)], *buf = _buf; - mutex_lock(&c->fsck_error_lock); + if (test_bit(BCH_FS_FSCK_DONE, &c->flags)) { + va_start(args, fmt); + vprintk(fmt, args); + va_end(args); - if (test_bit(BCH_FS_FSCK_DONE, &c->flags)) - goto print; + return bch2_inconsistent_error(c) + ? 
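/*
 * Reviewer's note: once BCH_FS_FSCK_DONE is set, errors found at runtime
 * are no longer collected or ratelimited - the message is printed and the
 * result maps directly to an action. bch2_inconsistent_error() flags the
 * filesystem inconsistent and, as I read it, acts per the errors= mount
 * option (continue, force read-only, or panic); when it reports a forced
 * halt the caller gets FSCK_ERR_EXIT, otherwise FSCK_ERR_FIX and the
 * caller proceeds as if the error had been repaired.
 */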
FSCK_ERR_EXIT + : FSCK_ERR_FIX; + } + + mutex_lock(&c->fsck_error_lock); list_for_each_entry(s, &c->fsck_errors, list) if (s->fmt == fmt) @@ -99,11 +107,7 @@ print: if (c->opts.fix_errors == FSCK_OPT_EXIT) { bch_err(c, "%s, exiting", buf); - mutex_unlock(&c->fsck_error_lock); - return FSCK_ERR_EXIT; - } - - if (flags & FSCK_CAN_FIX) { + } else if (flags & FSCK_CAN_FIX) { if (c->opts.fix_errors == FSCK_OPT_ASK) { printk(KERN_ERR "%s: fix?", buf); fix = ask_yn(); @@ -131,12 +135,16 @@ print: mutex_unlock(&c->fsck_error_lock); - if (fix) - set_bit(BCH_FS_FSCK_FIXED_ERRORS, &c->flags); - - return fix ? FSCK_ERR_FIX - : flags & FSCK_CAN_IGNORE ? FSCK_ERR_IGNORE - : FSCK_ERR_EXIT; + if (fix) { + set_bit(BCH_FS_ERRORS_FIXED, &c->flags); + return FSCK_ERR_FIX; + } else { + set_bit(BCH_FS_ERROR, &c->flags); + return c->opts.fix_errors == FSCK_OPT_EXIT || + !(flags & FSCK_CAN_IGNORE) + ? FSCK_ERR_EXIT + : FSCK_ERR_IGNORE; + } } void bch2_flush_fsck_errs(struct bch_fs *c) diff --git a/fs/bcachefs/error.h b/fs/bcachefs/error.h index f65ef132461e..2591e12305b7 100644 --- a/fs/bcachefs/error.h +++ b/fs/bcachefs/error.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _BCACHEFS_ERROR_H #define _BCACHEFS_ERROR_H @@ -147,12 +148,18 @@ void bch2_flush_fsck_errs(struct bch_fs *); #define need_fsck_err_on(cond, c, ...) \ __fsck_err_on(cond, c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, ##__VA_ARGS__) +#define need_fsck_err(c, ...) \ + __fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, ##__VA_ARGS__) + #define mustfix_fsck_err(c, ...) \ __fsck_err(c, FSCK_CAN_FIX, ##__VA_ARGS__) #define mustfix_fsck_err_on(cond, c, ...) \ __fsck_err_on(cond, c, FSCK_CAN_FIX, ##__VA_ARGS__) +#define fsck_err(c, ...) \ + __fsck_err(c, FSCK_CAN_FIX|FSCK_CAN_IGNORE, ##__VA_ARGS__) + #define fsck_err_on(cond, c, ...) \ __fsck_err_on(cond, c, FSCK_CAN_FIX|FSCK_CAN_IGNORE, ##__VA_ARGS__) diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index b85af711b9f9..e286048b5bf8 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2010 Kent Overstreet <kent.overstreet@gmail.com> * @@ -27,225 +28,287 @@ #include <trace/events/bcachefs.h> -static void sort_key_next(struct btree_node_iter_large *iter, - struct btree *b, - struct btree_node_iter_set *i) +unsigned bch2_bkey_nr_ptrs(struct bkey_s_c k) { - i->k += __btree_node_offset_to_key(b, i->k)->u64s; + struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k); + const struct bch_extent_ptr *ptr; + unsigned nr_ptrs = 0; + + bkey_for_each_ptr(p, ptr) + nr_ptrs++; - if (i->k == i->end) - *i = iter->data[--iter->used]; + return nr_ptrs; } -/* - * Returns true if l > r - unless l == r, in which case returns true if l is - * older than r. - * - * Necessary for btree_sort_fixup() - if there are multiple keys that compare - * equal in different sets, we have to process them newest to oldest. 
- */ -#define key_sort_cmp(h, l, r) \ -({ \ - bkey_cmp_packed(b, \ - __btree_node_offset_to_key(b, (l).k), \ - __btree_node_offset_to_key(b, (r).k)) \ - \ - ?: (l).k - (r).k; \ -}) - -static inline bool should_drop_next_key(struct btree_node_iter_large *iter, - struct btree *b) +unsigned bch2_bkey_nr_dirty_ptrs(struct bkey_s_c k) { - struct btree_node_iter_set *l = iter->data, *r = iter->data + 1; - struct bkey_packed *k = __btree_node_offset_to_key(b, l->k); - - if (bkey_whiteout(k)) - return true; + unsigned nr_ptrs = 0; - if (iter->used < 2) - return false; + switch (k.k->type) { + case KEY_TYPE_btree_ptr: + case KEY_TYPE_extent: { + struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k); + const struct bch_extent_ptr *ptr; - if (iter->used > 2 && - key_sort_cmp(iter, r[0], r[1]) >= 0) - r++; + bkey_for_each_ptr(p, ptr) + nr_ptrs += !ptr->cached; + BUG_ON(!nr_ptrs); + break; + } + case KEY_TYPE_reservation: + nr_ptrs = bkey_s_c_to_reservation(k).v->nr_replicas; + break; + } - /* - * key_sort_cmp() ensures that when keys compare equal the older key - * comes first; so if l->k compares equal to r->k then l->k is older and - * should be dropped. - */ - return !bkey_cmp_packed(b, - __btree_node_offset_to_key(b, l->k), - __btree_node_offset_to_key(b, r->k)); + return nr_ptrs; } -struct btree_nr_keys bch2_key_sort_fix_overlapping(struct bset *dst, - struct btree *b, - struct btree_node_iter_large *iter) +static unsigned bch2_extent_ptr_durability(struct bch_fs *c, + struct extent_ptr_decoded p) { - struct bkey_packed *out = dst->start; - struct btree_nr_keys nr; + unsigned i, durability = 0; + struct bch_dev *ca; + + if (p.ptr.cached) + return 0; - memset(&nr, 0, sizeof(nr)); + ca = bch_dev_bkey_exists(c, p.ptr.dev); - heap_resort(iter, key_sort_cmp); + if (ca->mi.state != BCH_MEMBER_STATE_FAILED) + durability = max_t(unsigned, durability, ca->mi.durability); - while (!bch2_btree_node_iter_large_end(iter)) { - if (!should_drop_next_key(iter, b)) { - struct bkey_packed *k = - __btree_node_offset_to_key(b, iter->data->k); + for (i = 0; i < p.ec_nr; i++) { + struct stripe *s = + genradix_ptr(&c->stripes[0], p.idx); - bkey_copy(out, k); - btree_keys_account_key_add(&nr, 0, out); - out = bkey_next(out); - } + if (WARN_ON(!s)) + continue; - sort_key_next(iter, b, iter->data); - heap_sift_down(iter, 0, key_sort_cmp); + durability = max_t(unsigned, durability, s->nr_redundant); } - dst->u64s = cpu_to_le16((u64 *) out - dst->_data); - return nr; + return durability; } -/* Common among btree and extent ptrs */ +unsigned bch2_bkey_durability(struct bch_fs *c, struct bkey_s_c k) +{ + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; + unsigned durability = 0; -const struct bch_extent_ptr * -bch2_extent_has_device(struct bkey_s_c_extent e, unsigned dev) + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) + durability += bch2_extent_ptr_durability(c, p); + + return durability; +} + +static struct bch_dev_io_failures *dev_io_failures(struct bch_io_failures *f, + unsigned dev) { - const struct bch_extent_ptr *ptr; + struct bch_dev_io_failures *i; - extent_for_each_ptr(e, ptr) - if (ptr->dev == dev) - return ptr; + for (i = f->devs; i < f->devs + f->nr; i++) + if (i->dev == dev) + return i; return NULL; } -bool bch2_extent_drop_device(struct bkey_s_extent e, unsigned dev) +void bch2_mark_io_failure(struct bch_io_failures *failed, + struct extent_ptr_decoded *p) { - struct bch_extent_ptr *ptr; - bool dropped = false; + struct bch_dev_io_failures *f = 
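/*
 * Reviewer's note: per the durability code above, a cached pointer
 * contributes 0; a live pointer contributes the larger of its device's
 * mi.durability (0 if the device is marked FAILED) and the nr_redundant
 * of any stripe it participates in, since parity blocks can reconstruct
 * it. bch2_bkey_durability() then sums that per-pointer value across the
 * key, giving, roughly, the number of failures the key can survive.
 */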
dev_io_failures(failed, p->ptr.dev); - extent_for_each_ptr_backwards(e, ptr) - if (ptr->dev == dev) { - __bch2_extent_drop_ptr(e, ptr); - dropped = true; - } + if (!f) { + BUG_ON(failed->nr >= ARRAY_SIZE(failed->devs)); - if (dropped) - bch2_extent_drop_redundant_crcs(e); - return dropped; + f = &failed->devs[failed->nr++]; + f->dev = p->ptr.dev; + f->idx = p->idx; + f->nr_failed = 1; + f->nr_retries = 0; + } else if (p->idx != f->idx) { + f->idx = p->idx; + f->nr_failed = 1; + f->nr_retries = 0; + } else { + f->nr_failed++; + } } -const struct bch_extent_ptr * -bch2_extent_has_group(struct bch_fs *c, struct bkey_s_c_extent e, unsigned group) +/* + * returns true if p1 is better than p2: + */ +static inline bool ptr_better(struct bch_fs *c, + const struct extent_ptr_decoded p1, + const struct extent_ptr_decoded p2) { - const struct bch_extent_ptr *ptr; + if (likely(!p1.idx && !p2.idx)) { + struct bch_dev *dev1 = bch_dev_bkey_exists(c, p1.ptr.dev); + struct bch_dev *dev2 = bch_dev_bkey_exists(c, p2.ptr.dev); - extent_for_each_ptr(e, ptr) { - struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); + u64 l1 = atomic64_read(&dev1->cur_latency[READ]); + u64 l2 = atomic64_read(&dev2->cur_latency[READ]); - if (ca->mi.group && - ca->mi.group - 1 == group) - return ptr; + /* Pick at random, biased in favor of the faster device: */ + + return bch2_rand_range(l1 + l2) > l1; } - return NULL; + if (force_reconstruct_read(c)) + return p1.idx > p2.idx; + + return p1.idx < p2.idx; } -const struct bch_extent_ptr * -bch2_extent_has_target(struct bch_fs *c, struct bkey_s_c_extent e, unsigned target) +/* + * This picks a non-stale pointer, preferably from a device other than @avoid. + * Avoid can be NULL, meaning pick any. If there are no non-stale pointers to + * other devices, it will still pick a pointer from avoid. + */ +int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k, + struct bch_io_failures *failed, + struct extent_ptr_decoded *pick) { - const struct bch_extent_ptr *ptr; + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; + struct bch_dev_io_failures *f; + struct bch_dev *ca; + int ret = 0; - extent_for_each_ptr(e, ptr) - if (bch2_dev_in_target(c, ptr->dev, target) && - (!ptr->cached || - !ptr_stale(bch_dev_bkey_exists(c, ptr->dev), ptr))) - return ptr; + if (k.k->type == KEY_TYPE_error) + return -EIO; - return NULL; -} + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { + ca = bch_dev_bkey_exists(c, p.ptr.dev); -unsigned bch2_extent_nr_ptrs(struct bkey_s_c_extent e) -{ - const struct bch_extent_ptr *ptr; - unsigned nr_ptrs = 0; + /* + * If there are any dirty pointers it's an error if we can't + * read: + */ + if (!ret && !p.ptr.cached) + ret = -EIO; - extent_for_each_ptr(e, ptr) - nr_ptrs++; + if (p.ptr.cached && ptr_stale(ca, &p.ptr)) + continue; - return nr_ptrs; + f = failed ? dev_io_failures(failed, p.ptr.dev) : NULL; + if (f) + p.idx = f->nr_failed < f->nr_retries + ? 
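/*
 * Reviewer's note: when neither pointer needs reconstruction (idx == 0
 * for both), ptr_better() above load-balances reads by a biased coin
 * flip: with l1 and l2 the devices' tracked mean read latencies,
 * bch2_rand_range(l1 + l2) > l1 picks p1 with probability roughly
 * l2 / (l1 + l2), so a device twice as slow serves about half as many
 * reads. Under force_reconstruct_read the comparison inverts to prefer
 * whichever path exercises more reconstruction.
 */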
f->idx + : f->idx + 1; + + if (!p.idx && + !bch2_dev_is_readable(ca)) + p.idx++; + + if (force_reconstruct_read(c) && + !p.idx && p.ec_nr) + p.idx++; + + if (p.idx >= p.ec_nr + 1) + continue; + + if (ret > 0 && !ptr_better(c, p, *pick)) + continue; + + *pick = p; + ret = 1; + } + + return ret; } -unsigned bch2_extent_nr_dirty_ptrs(struct bkey_s_c k) +void bch2_bkey_append_ptr(struct bkey_i *k, + struct bch_extent_ptr ptr) { - struct bkey_s_c_extent e; - const struct bch_extent_ptr *ptr; - unsigned nr_ptrs = 0; + EBUG_ON(bch2_bkey_has_device(bkey_i_to_s_c(k), ptr.dev)); - switch (k.k->type) { - case BCH_EXTENT: - case BCH_EXTENT_CACHED: - e = bkey_s_c_to_extent(k); + switch (k->k.type) { + case KEY_TYPE_btree_ptr: + case KEY_TYPE_extent: + EBUG_ON(bkey_val_u64s(&k->k) >= BKEY_EXTENT_VAL_U64s_MAX); - extent_for_each_ptr(e, ptr) - nr_ptrs += !ptr->cached; - break; + ptr.type = 1 << BCH_EXTENT_ENTRY_ptr; - case BCH_RESERVATION: - nr_ptrs = bkey_s_c_to_reservation(k).v->nr_replicas; + memcpy((void *) &k->v + bkey_val_bytes(&k->k), + &ptr, + sizeof(ptr)); + k->u64s++; break; + default: + BUG(); } +} - return nr_ptrs; +void bch2_bkey_drop_device(struct bkey_s k, unsigned dev) +{ + struct bch_extent_ptr *ptr; + + bch2_bkey_drop_ptrs(k, ptr, ptr->dev == dev); } -unsigned bch2_extent_ptr_durability(struct bch_fs *c, - const struct bch_extent_ptr *ptr) +/* extent specific utility code */ + +const struct bch_extent_ptr * +bch2_extent_has_device(struct bkey_s_c_extent e, unsigned dev) { - struct bch_dev *ca; + const struct bch_extent_ptr *ptr; - if (ptr->cached) - return 0; + extent_for_each_ptr(e, ptr) + if (ptr->dev == dev) + return ptr; - ca = bch_dev_bkey_exists(c, ptr->dev); + return NULL; +} - if (ca->mi.state == BCH_MEMBER_STATE_FAILED) - return 0; +const struct bch_extent_ptr * +bch2_extent_has_group(struct bch_fs *c, struct bkey_s_c_extent e, unsigned group) +{ + const struct bch_extent_ptr *ptr; - return ca->mi.durability; + extent_for_each_ptr(e, ptr) { + struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); + + if (ca->mi.group && + ca->mi.group - 1 == group) + return ptr; + } + + return NULL; } -unsigned bch2_extent_durability(struct bch_fs *c, struct bkey_s_c_extent e) +const struct bch_extent_ptr * +bch2_extent_has_target(struct bch_fs *c, struct bkey_s_c_extent e, unsigned target) { const struct bch_extent_ptr *ptr; - unsigned durability = 0; extent_for_each_ptr(e, ptr) - durability += bch2_extent_ptr_durability(c, ptr); + if (bch2_dev_in_target(c, ptr->dev, target) && + (!ptr->cached || + !ptr_stale(bch_dev_bkey_exists(c, ptr->dev), ptr))) + return ptr; - return durability; + return NULL; } unsigned bch2_extent_is_compressed(struct bkey_s_c k) { - struct bkey_s_c_extent e; - const struct bch_extent_ptr *ptr; - struct bch_extent_crc_unpacked crc; unsigned ret = 0; switch (k.k->type) { - case BCH_EXTENT: - case BCH_EXTENT_CACHED: - e = bkey_s_c_to_extent(k); - - extent_for_each_ptr_crc(e, ptr, crc) - if (!ptr->cached && - crc.compression_type != BCH_COMPRESSION_NONE && - crc.compressed_size < crc.live_size) - ret = max_t(unsigned, ret, crc.compressed_size); + case KEY_TYPE_extent: { + struct bkey_s_c_extent e = bkey_s_c_to_extent(k); + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; + + extent_for_each_ptr_decode(e, p, entry) + if (!p.ptr.cached && + p.crc.compression_type != BCH_COMPRESSION_NONE) + ret += p.crc.compressed_size; + } } return ret; @@ -254,34 +317,67 @@ unsigned bch2_extent_is_compressed(struct bkey_s_c k) bool bch2_extent_matches_ptr(struct bch_fs *c, 
struct bkey_s_c_extent e, struct bch_extent_ptr m, u64 offset) { - const struct bch_extent_ptr *ptr; - struct bch_extent_crc_unpacked crc; + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; - extent_for_each_ptr_crc(e, ptr, crc) - if (ptr->dev == m.dev && - ptr->gen == m.gen && - (s64) ptr->offset + crc.offset - bkey_start_offset(e.k) == + extent_for_each_ptr_decode(e, p, entry) + if (p.ptr.dev == m.dev && + p.ptr.gen == m.gen && + (s64) p.ptr.offset + p.crc.offset - bkey_start_offset(e.k) == (s64) m.offset - offset) - return ptr; + return true; - return NULL; + return false; } -/* Doesn't cleanup redundant crcs */ -void __bch2_extent_drop_ptr(struct bkey_s_extent e, struct bch_extent_ptr *ptr) +static union bch_extent_entry *extent_entry_prev(struct bkey_ptrs ptrs, + union bch_extent_entry *entry) { - EBUG_ON(ptr < &e.v->start->ptr || - ptr >= &extent_entry_last(e)->ptr); - EBUG_ON(ptr->type != 1 << BCH_EXTENT_ENTRY_ptr); - memmove_u64s_down(ptr, ptr + 1, - (u64 *) extent_entry_last(e) - (u64 *) (ptr + 1)); - e.k->u64s -= sizeof(*ptr) / sizeof(u64); + union bch_extent_entry *i = ptrs.start; + + if (i == entry) + return NULL; + + while (extent_entry_next(i) != entry) + i = extent_entry_next(i); + return i; } -void bch2_extent_drop_ptr(struct bkey_s_extent e, struct bch_extent_ptr *ptr) +union bch_extent_entry *bch2_bkey_drop_ptr(struct bkey_s k, + struct bch_extent_ptr *ptr) { - __bch2_extent_drop_ptr(e, ptr); - bch2_extent_drop_redundant_crcs(e); + struct bkey_ptrs ptrs = bch2_bkey_ptrs(k); + union bch_extent_entry *dst, *src, *prev; + bool drop_crc = true; + + EBUG_ON(ptr < &ptrs.start->ptr || + ptr >= &ptrs.end->ptr); + EBUG_ON(ptr->type != 1 << BCH_EXTENT_ENTRY_ptr); + + src = extent_entry_next(to_entry(ptr)); + if (src != ptrs.end && + !extent_entry_is_crc(src)) + drop_crc = false; + + dst = to_entry(ptr); + while ((prev = extent_entry_prev(ptrs, dst))) { + if (extent_entry_is_ptr(prev)) + break; + + if (extent_entry_is_crc(prev)) { + if (drop_crc) + dst = prev; + break; + } + + dst = prev; + } + + memmove_u64s_down(dst, src, + (u64 *) ptrs.end - (u64 *) src); + k.k->u64s -= (u64 *) src - (u64 *) dst; + + return dst; } static inline bool can_narrow_crc(struct bch_extent_crc_unpacked u, @@ -323,38 +419,38 @@ bool bch2_extent_narrow_crcs(struct bkey_i_extent *e, struct bch_extent_crc_unpacked n) { struct bch_extent_crc_unpacked u; - struct bch_extent_ptr *ptr; + struct extent_ptr_decoded p; union bch_extent_entry *i; + bool ret = false; /* Find a checksum entry that covers only live data: */ - if (!n.csum_type) + if (!n.csum_type) { extent_for_each_crc(extent_i_to_s(e), u, i) if (!u.compression_type && u.csum_type && u.live_size == u.uncompressed_size) { n = u; - break; + goto found; } - - if (!bch2_can_narrow_extent_crcs(extent_i_to_s_c(e), n)) return false; - + } +found: BUG_ON(n.compression_type); BUG_ON(n.offset); BUG_ON(n.live_size != e->k.size); - bch2_extent_crc_append(e, n); restart_narrow_pointers: - extent_for_each_ptr_crc(extent_i_to_s(e), ptr, u) - if (can_narrow_crc(u, n)) { - ptr->offset += u.offset; - extent_ptr_append(e, *ptr); - __bch2_extent_drop_ptr(extent_i_to_s(e), ptr); + extent_for_each_ptr_decode(extent_i_to_s(e), p, i) + if (can_narrow_crc(p.crc, n)) { + bch2_bkey_drop_ptr(extent_i_to_s(e).s, &i->ptr); + p.ptr.offset += p.crc.offset; + p.crc = n; + bch2_extent_ptr_decoded_append(e, &p); + ret = true; goto restart_narrow_pointers; } - bch2_extent_drop_redundant_crcs(extent_i_to_s(e)); - return true; + return ret; } /* returns true if not equal 
*/ @@ -371,150 +467,113 @@ static inline bool bch2_crc_unpacked_cmp(struct bch_extent_crc_unpacked l, bch2_crc_cmp(l.csum, r.csum)); } -void bch2_extent_drop_redundant_crcs(struct bkey_s_extent e) +void bch2_ptr_swab(const struct bkey_format *f, struct bkey_packed *k) { - union bch_extent_entry *entry = e.v->start; - union bch_extent_crc *crc, *prev = NULL; - struct bch_extent_crc_unpacked u, prev_u = { 0 }; - - while (entry != extent_entry_last(e)) { - union bch_extent_entry *next = extent_entry_next(entry); - size_t crc_u64s = extent_entry_u64s(entry); - - if (!extent_entry_is_crc(entry)) - goto next; + union bch_extent_entry *entry; + u64 *d = (u64 *) bkeyp_val(f, k); + unsigned i; - crc = entry_to_crc(entry); - u = bch2_extent_crc_unpack(e.k, crc); + for (i = 0; i < bkeyp_val_u64s(f, k); i++) + d[i] = swab64(d[i]); - if (next == extent_entry_last(e)) { - /* crc entry with no pointers after it: */ - goto drop; - } - - if (extent_entry_is_crc(next)) { - /* no pointers before next crc entry: */ - goto drop; - } - - if (prev && !bch2_crc_unpacked_cmp(u, prev_u)) { - /* identical to previous crc entry: */ - goto drop; - } - - if (!prev && - !u.csum_type && - !u.compression_type) { - /* null crc entry: */ - union bch_extent_entry *e2; - - extent_for_each_entry_from(e, e2, extent_entry_next(entry)) { - if (!extent_entry_is_ptr(e2)) - break; - - e2->ptr.offset += u.offset; - } - goto drop; + for (entry = (union bch_extent_entry *) d; + entry < (union bch_extent_entry *) (d + bkeyp_val_u64s(f, k)); + entry = extent_entry_next(entry)) { + switch (extent_entry_type(entry)) { + case BCH_EXTENT_ENTRY_ptr: + break; + case BCH_EXTENT_ENTRY_crc32: + entry->crc32.csum = swab32(entry->crc32.csum); + break; + case BCH_EXTENT_ENTRY_crc64: + entry->crc64.csum_hi = swab16(entry->crc64.csum_hi); + entry->crc64.csum_lo = swab64(entry->crc64.csum_lo); + break; + case BCH_EXTENT_ENTRY_crc128: + entry->crc128.csum.hi = (__force __le64) + swab64((__force u64) entry->crc128.csum.hi); + entry->crc128.csum.lo = (__force __le64) + swab64((__force u64) entry->crc128.csum.lo); + break; + case BCH_EXTENT_ENTRY_stripe_ptr: + break; } - - prev = crc; - prev_u = u; -next: - entry = next; - continue; -drop: - memmove_u64s_down(crc, next, - (u64 *) extent_entry_last(e) - (u64 *) next); - e.k->u64s -= crc_u64s; } - - EBUG_ON(bkey_val_u64s(e.k) && !bch2_extent_nr_ptrs(e.c)); -} - -static bool should_drop_ptr(const struct bch_fs *c, - struct bkey_s_c_extent e, - const struct bch_extent_ptr *ptr) -{ - return ptr->cached && ptr_stale(bch_dev_bkey_exists(c, ptr->dev), ptr); } -static void bch2_extent_drop_stale(struct bch_fs *c, struct bkey_s_extent e) +void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c, + struct bkey_s_c k) { - struct bch_extent_ptr *ptr = &e.v->start->ptr; - bool dropped = false; - - while ((ptr = extent_ptr_next(e, ptr))) - if (should_drop_ptr(c, e.c, ptr)) { - __bch2_extent_drop_ptr(e, ptr); - dropped = true; - } else - ptr++; - - if (dropped) - bch2_extent_drop_redundant_crcs(e); -} + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const union bch_extent_entry *entry; + struct bch_extent_crc_unpacked crc; + const struct bch_extent_ptr *ptr; + const struct bch_extent_stripe_ptr *ec; + struct bch_dev *ca; + bool first = true; -bool bch2_ptr_normalize(struct bch_fs *c, struct btree *b, struct bkey_s k) -{ - return bch2_extent_normalize(c, k); -} + bkey_extent_entry_for_each(ptrs, entry) { + if (!first) + pr_buf(out, " "); -void bch2_ptr_swab(const struct bkey_format *f, struct bkey_packed *k) -{ 
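/*
 * Reviewer's note: the rewritten bch2_ptr_swab() above byte-swaps the
 * whole value as raw u64s first, then walks the extent entries fixing up
 * the checksum fields whose widths differ per entry type: crc32 re-swabs
 * a 32-bit csum, crc64 a 16-bit csum_hi plus a 64-bit csum_lo, and crc128
 * a 128-bit csum as two 64-bit halves. Plain ptr and stripe_ptr entries
 * are single u64s, so the blanket pass already handled them.
 */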
- switch (k->type) { - case BCH_EXTENT: - case BCH_EXTENT_CACHED: { - union bch_extent_entry *entry; - u64 *d = (u64 *) bkeyp_val(f, k); - unsigned i; + switch (__extent_entry_type(entry)) { + case BCH_EXTENT_ENTRY_ptr: + ptr = entry_to_ptr(entry); + ca = ptr->dev < c->sb.nr_devices && c->devs[ptr->dev] + ? bch_dev_bkey_exists(c, ptr->dev) + : NULL; - for (i = 0; i < bkeyp_val_u64s(f, k); i++) - d[i] = swab64(d[i]); + pr_buf(out, "ptr: %u:%llu gen %u%s%s", ptr->dev, + (u64) ptr->offset, ptr->gen, + ptr->cached ? " cached" : "", + ca && ptr_stale(ca, ptr) + ? " stale" : ""); + break; + case BCH_EXTENT_ENTRY_crc32: + case BCH_EXTENT_ENTRY_crc64: + case BCH_EXTENT_ENTRY_crc128: + crc = bch2_extent_crc_unpack(k.k, entry_to_crc(entry)); + + pr_buf(out, "crc: c_size %u size %u offset %u nonce %u csum %u compress %u", + crc.compressed_size, + crc.uncompressed_size, + crc.offset, crc.nonce, + crc.csum_type, + crc.compression_type); + break; + case BCH_EXTENT_ENTRY_stripe_ptr: + ec = &entry->stripe_ptr; - for (entry = (union bch_extent_entry *) d; - entry < (union bch_extent_entry *) (d + bkeyp_val_u64s(f, k)); - entry = extent_entry_next(entry)) { - switch (extent_entry_type(entry)) { - case BCH_EXTENT_ENTRY_crc32: - entry->crc32.csum = swab32(entry->crc32.csum); - break; - case BCH_EXTENT_ENTRY_crc64: - entry->crc64.csum_hi = swab16(entry->crc64.csum_hi); - entry->crc64.csum_lo = swab64(entry->crc64.csum_lo); - break; - case BCH_EXTENT_ENTRY_crc128: - entry->crc128.csum.hi = (__force __le64) - swab64((__force u64) entry->crc128.csum.hi); - entry->crc128.csum.lo = (__force __le64) - swab64((__force u64) entry->crc128.csum.lo); - break; - case BCH_EXTENT_ENTRY_ptr: - break; - } + pr_buf(out, "ec: idx %llu block %u", + (u64) ec->idx, ec->block); + break; + default: + pr_buf(out, "(invalid extent entry %.16llx)", *((u64 *) entry)); + return; } - break; - } + + first = false; } } static const char *extent_ptr_invalid(const struct bch_fs *c, - struct bkey_s_c_extent e, + struct bkey_s_c k, const struct bch_extent_ptr *ptr, unsigned size_ondisk, bool metadata) { + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); const struct bch_extent_ptr *ptr2; struct bch_dev *ca; - if (ptr->dev >= c->sb.nr_devices || - !c->devs[ptr->dev]) + if (!bch2_dev_exists2(c, ptr->dev)) return "pointer to invalid device"; ca = bch_dev_bkey_exists(c, ptr->dev); if (!ca) return "pointer to invalid device"; - extent_for_each_ptr(e, ptr2) + bkey_for_each_ptr(ptrs, ptr2) if (ptr != ptr2 && ptr->dev == ptr2->dev) return "multiple pointers to same device"; @@ -531,236 +590,125 @@ static const char *extent_ptr_invalid(const struct bch_fs *c, return NULL; } -static size_t extent_print_ptrs(struct bch_fs *c, char *buf, - size_t size, struct bkey_s_c_extent e) +const char *bch2_bkey_ptrs_invalid(const struct bch_fs *c, struct bkey_s_c k) { - char *out = buf, *end = buf + size; + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); const union bch_extent_entry *entry; struct bch_extent_crc_unpacked crc; - const struct bch_extent_ptr *ptr; - struct bch_dev *ca; - bool first = true; + unsigned size_ondisk = k.k->size; + const char *reason; + unsigned nonce = UINT_MAX; -#define p(...) 
(out += scnprintf(out, end - out, __VA_ARGS__)) + if (k.k->type == KEY_TYPE_btree_ptr) + size_ondisk = c->opts.btree_node_size; - extent_for_each_entry(e, entry) { - if (!first) - p(" "); + bkey_extent_entry_for_each(ptrs, entry) { + if (__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX) + return "invalid extent entry type"; - switch (__extent_entry_type(entry)) { + if (k.k->type == KEY_TYPE_btree_ptr && + !extent_entry_is_ptr(entry)) + return "has non ptr field"; + + switch (extent_entry_type(entry)) { + case BCH_EXTENT_ENTRY_ptr: + reason = extent_ptr_invalid(c, k, &entry->ptr, + size_ondisk, false); + if (reason) + return reason; + break; case BCH_EXTENT_ENTRY_crc32: case BCH_EXTENT_ENTRY_crc64: case BCH_EXTENT_ENTRY_crc128: - crc = bch2_extent_crc_unpack(e.k, entry_to_crc(entry)); - - p("crc: c_size %u size %u offset %u nonce %u csum %u compress %u", - crc.compressed_size, - crc.uncompressed_size, - crc.offset, crc.nonce, - crc.csum_type, - crc.compression_type); - break; - case BCH_EXTENT_ENTRY_ptr: - ptr = entry_to_ptr(entry); - ca = ptr->dev < c->sb.nr_devices && c->devs[ptr->dev] - ? bch_dev_bkey_exists(c, ptr->dev) - : NULL; + crc = bch2_extent_crc_unpack(k.k, entry_to_crc(entry)); - p("ptr: %u:%llu gen %u%s%s", ptr->dev, - (u64) ptr->offset, ptr->gen, - ptr->cached ? " cached" : "", - ca && ptr_stale(ca, ptr) - ? " stale" : ""); - break; - default: - p("(invalid extent entry %.16llx)", *((u64 *) entry)); - goto out; - } - - first = false; - } -out: - if (bkey_extent_is_cached(e.k)) - p(" cached"); -#undef p - return out - buf; -} + if (crc.offset + crc.live_size > + crc.uncompressed_size) + return "checksum offset + key size > uncompressed size"; -static inline bool dev_latency_better(struct bch_fs *c, - const struct bch_extent_ptr *ptr1, - const struct bch_extent_ptr *ptr2) -{ - struct bch_dev *dev1 = bch_dev_bkey_exists(c, ptr1->dev); - struct bch_dev *dev2 = bch_dev_bkey_exists(c, ptr2->dev); - u64 l1 = atomic64_read(&dev1->cur_latency[READ]); - u64 l2 = atomic64_read(&dev2->cur_latency[READ]); + size_ondisk = crc.compressed_size; - /* Pick at random, biased in favor of the faster device: */ + if (!bch2_checksum_type_valid(c, crc.csum_type)) + return "invalid checksum type"; - return bch2_rand_range(l1 + l2) > l1; -} + if (crc.compression_type >= BCH_COMPRESSION_NR) + return "invalid compression type"; -static int extent_pick_read_device(struct bch_fs *c, - struct bkey_s_c_extent e, - struct bch_devs_mask *avoid, - struct extent_pick_ptr *pick) -{ - const struct bch_extent_ptr *ptr; - struct bch_extent_crc_unpacked crc; - struct bch_dev *ca; - int ret = 0; - - extent_for_each_ptr_crc(e, ptr, crc) { - ca = bch_dev_bkey_exists(c, ptr->dev); - - if (ptr->cached && ptr_stale(ca, ptr)) - continue; - - if (avoid && test_bit(ptr->dev, avoid->d)) - continue; - - if (ret && !dev_latency_better(c, ptr, &pick->ptr)) - continue; - - *pick = (struct extent_pick_ptr) { - .ptr = *ptr, - .crc = crc, - }; - - ret = 1; + if (bch2_csum_type_is_encryption(crc.csum_type)) { + if (nonce == UINT_MAX) + nonce = crc.offset + crc.nonce; + else if (nonce != crc.offset + crc.nonce) + return "incorrect nonce"; + } + break; + case BCH_EXTENT_ENTRY_stripe_ptr: + break; + } } - return ret; + return NULL; } /* Btree ptrs */ const char *bch2_btree_ptr_invalid(const struct bch_fs *c, struct bkey_s_c k) { - if (bkey_extent_is_cached(k.k)) - return "cached"; - - if (k.k->size) - return "nonzero key size"; - if (bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX) return "value too big"; - switch (k.k->type) { - 
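/*
 * Reviewer's note: the consolidated bch2_bkey_ptrs_invalid() above checks
 * the cross-entry invariants: size_ondisk starts at the key's size (the
 * btree node size for btree pointers, which additionally may not carry
 * non-ptr entries) and is narrowed to compressed_size by each crc entry,
 * so pointers are validated against what the data actually occupies on
 * disk; and for encryption-type checksums, crc.offset + crc.nonce must be
 * identical across all crc entries - presumably because splitting an
 * extent only advances the offset, leaving that sum constant.
 */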
case BCH_EXTENT: { - struct bkey_s_c_extent e = bkey_s_c_to_extent(k); - const union bch_extent_entry *entry; - const struct bch_extent_ptr *ptr; - const char *reason; - - extent_for_each_entry(e, entry) { - if (__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX) - return "invalid extent entry type"; - - if (extent_entry_is_crc(entry)) - return "has crc field"; - } - - extent_for_each_ptr(e, ptr) { - reason = extent_ptr_invalid(c, e, ptr, - c->opts.btree_node_size, - true); - if (reason) - return reason; - } - - return NULL; - } - - default: - return "invalid value type"; - } + return bch2_bkey_ptrs_invalid(c, k); } void bch2_btree_ptr_debugcheck(struct bch_fs *c, struct btree *b, struct bkey_s_c k) { - struct bkey_s_c_extent e = bkey_s_c_to_extent(k); + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); const struct bch_extent_ptr *ptr; - unsigned seq; const char *err; char buf[160]; struct bucket_mark mark; struct bch_dev *ca; - unsigned replicas = 0; - bool bad; - extent_for_each_ptr(e, ptr) { + bch2_fs_bug_on(!test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) && + !bch2_bkey_replicas_marked(c, k, false), c, + "btree key bad (replicas not marked in superblock):\n%s", + (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)); + + if (!test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags)) + return; + + bkey_for_each_ptr(ptrs, ptr) { ca = bch_dev_bkey_exists(c, ptr->dev); - replicas++; - if (!test_bit(BCH_FS_ALLOC_READ_DONE, &c->flags)) - continue; + mark = ptr_bucket_mark(ca, ptr); err = "stale"; - if (ptr_stale(ca, ptr)) + if (gen_after(mark.gen, ptr->gen)) goto err; - do { - seq = read_seqcount_begin(&c->gc_pos_lock); - mark = ptr_bucket_mark(ca, ptr); - - bad = gc_pos_cmp(c->gc_pos, gc_pos_btree_node(b)) > 0 && - (mark.data_type != BCH_DATA_BTREE || - mark.dirty_sectors < c->opts.btree_node_size); - } while (read_seqcount_retry(&c->gc_pos_lock, seq)); - err = "inconsistent"; - if (bad) + if (mark.data_type != BCH_DATA_BTREE || + mark.dirty_sectors < c->opts.btree_node_size) goto err; } - if (!bch2_bkey_replicas_marked(c, BCH_DATA_BTREE, e.s_c)) { - bch2_bkey_val_to_text(c, btree_node_type(b), - buf, sizeof(buf), k); - bch2_fs_bug(c, - "btree key bad (replicas not marked in superblock):\n%s", - buf); - return; - } - return; err: - bch2_bkey_val_to_text(c, btree_node_type(b), buf, sizeof(buf), k); - bch2_fs_bug(c, "%s btree pointer %s: bucket %zi " - "gen %i mark %08x", - err, buf, PTR_BUCKET_NR(ca, ptr), - mark.gen, (unsigned) mark.v.counter); + bch2_bkey_val_to_text(&PBUF(buf), c, k); + bch2_fs_bug(c, "%s btree pointer %s: bucket %zi gen %i mark %08x", + err, buf, PTR_BUCKET_NR(ca, ptr), + mark.gen, (unsigned) mark.v.counter); } -void bch2_btree_ptr_to_text(struct bch_fs *c, char *buf, - size_t size, struct bkey_s_c k) -{ - char *out = buf, *end = buf + size; - const char *invalid; - -#define p(...) 
(out += scnprintf(out, end - out, __VA_ARGS__)) - - if (bkey_extent_is_data(k.k)) - out += extent_print_ptrs(c, buf, size, bkey_s_c_to_extent(k)); - - invalid = bch2_btree_ptr_invalid(c, k); - if (invalid) - p(" invalid: %s", invalid); -#undef p -} - -int bch2_btree_pick_ptr(struct bch_fs *c, const struct btree *b, - struct bch_devs_mask *avoid, - struct extent_pick_ptr *pick) +void bch2_btree_ptr_to_text(struct printbuf *out, struct bch_fs *c, + struct bkey_s_c k) { - return extent_pick_read_device(c, bkey_i_to_s_c_extent(&b->key), - avoid, pick); + bch2_bkey_ptrs_to_text(out, c, k); } /* Extents */ -static bool __bch2_cut_front(struct bpos where, struct bkey_s k) +bool __bch2_cut_front(struct bpos where, struct bkey_s k) { u64 len = 0; @@ -778,7 +726,7 @@ static bool __bch2_cut_front(struct bpos where, struct bkey_s k) * cause offset to point to the next bucket: */ if (!len) - __set_bkey_deleted(k.k); + k.k->type = KEY_TYPE_deleted; else if (bkey_extent_is_data(k.k)) { struct bkey_s_extent e = bkey_s_to_extent(k); union bch_extent_entry *entry; @@ -799,6 +747,8 @@ static bool __bch2_cut_front(struct bpos where, struct bkey_s k) case BCH_EXTENT_ENTRY_crc128: entry->crc128.offset += e.k->size - len; break; + case BCH_EXTENT_ENTRY_stripe_ptr: + break; } if (extent_entry_is_crc(entry)) @@ -811,11 +761,6 @@ static bool __bch2_cut_front(struct bpos where, struct bkey_s k) return true; } -bool bch2_cut_front(struct bpos where, struct bkey_i *k) -{ - return __bch2_cut_front(where, bkey_i_to_s(k)); -} - bool bch2_cut_back(struct bpos where, struct bkey *k) { u64 len = 0; @@ -833,7 +778,7 @@ bool bch2_cut_back(struct bpos where, struct bkey *k) k->size = len; if (!len) - __set_bkey_deleted(k); + k->type = KEY_TYPE_deleted; return true; } @@ -851,464 +796,221 @@ void bch2_key_resize(struct bkey *k, k->size = new_size; } -/* - * In extent_sort_fix_overlapping(), insert_fixup_extent(), - * extent_merge_inline() - we're modifying keys in place that are packed. To do - * that we have to unpack the key, modify the unpacked key - then this - * copies/repacks the unpacked to the original as necessary. - */ -static bool __extent_save(struct btree *b, struct btree_node_iter *iter, - struct bkey_packed *dst, struct bkey *src) +static bool extent_i_save(struct btree *b, struct bkey_packed *dst, + struct bkey_i *src) { struct bkey_format *f = &b->format; struct bkey_i *dst_unpacked; - bool ret; - - if ((dst_unpacked = packed_to_bkey(dst))) { - dst_unpacked->k = *src; - ret = true; - } else { - ret = bch2_bkey_pack_key(dst, src, f); - } + struct bkey_packed tmp; - if (ret && iter) - bch2_verify_key_order(b, iter, dst); - - return ret; -} - -static void extent_save(struct btree *b, struct btree_node_iter *iter, - struct bkey_packed *dst, struct bkey *src) -{ - BUG_ON(!__extent_save(b, iter, dst, src)); -} + if ((dst_unpacked = packed_to_bkey(dst))) + dst_unpacked->k = src->k; + else if (bch2_bkey_pack_key(&tmp, &src->k, f)) + memcpy_u64s(dst, &tmp, f->key_u64s); + else + return false; -/* - * If keys compare equal, compare by pointer order: - * - * Necessary for sort_fix_overlapping() - if there are multiple keys that - * compare equal in different sets, we have to process them newest to oldest. 
- */ -#define extent_sort_cmp(h, l, r) \ -({ \ - struct bkey _ul = bkey_unpack_key(b, \ - __btree_node_offset_to_key(b, (l).k)); \ - struct bkey _ur = bkey_unpack_key(b, \ - __btree_node_offset_to_key(b, (r).k)); \ - \ - bkey_cmp(bkey_start_pos(&_ul), \ - bkey_start_pos(&_ur)) ?: (r).k - (l).k; \ -}) - -static inline void extent_sort_sift(struct btree_node_iter_large *iter, - struct btree *b, size_t i) -{ - heap_sift_down(iter, i, extent_sort_cmp); + memcpy_u64s(bkeyp_val(f, dst), &src->v, bkey_val_u64s(&src->k)); + return true; } -static inline void extent_sort_next(struct btree_node_iter_large *iter, - struct btree *b, - struct btree_node_iter_set *i) -{ - sort_key_next(iter, b, i); - heap_sift_down(iter, i - iter->data, extent_sort_cmp); -} +static bool bch2_extent_merge_inline(struct bch_fs *, + struct btree_iter *, + struct bkey_packed *, + struct bkey_packed *, + bool); -static void extent_sort_append(struct bch_fs *c, - struct btree *b, - struct btree_nr_keys *nr, - struct bkey_packed *start, - struct bkey_packed **prev, - struct bkey_packed *k) +static void verify_extent_nonoverlapping(struct bch_fs *c, + struct btree *b, + struct btree_node_iter *_iter, + struct bkey_i *insert) { - struct bkey_format *f = &b->format; - BKEY_PADDED(k) tmp; - - if (bkey_whiteout(k)) - return; - - bch2_bkey_unpack(b, &tmp.k, k); +#ifdef CONFIG_BCACHEFS_DEBUG + struct btree_node_iter iter; + struct bkey_packed *k; + struct bkey uk; - if (*prev && - bch2_extent_merge(c, b, (void *) *prev, &tmp.k)) + if (!expensive_debug_checks(c)) return; - if (*prev) { - bch2_bkey_pack(*prev, (void *) *prev, f); - - btree_keys_account_key_add(nr, 0, *prev); - *prev = bkey_next(*prev); - } else { - *prev = start; + iter = *_iter; + k = bch2_btree_node_iter_prev_filter(&iter, b, KEY_TYPE_discard); + BUG_ON(k && + (uk = bkey_unpack_key(b, k), + bkey_cmp(uk.p, bkey_start_pos(&insert->k)) > 0)); + + iter = *_iter; + k = bch2_btree_node_iter_peek_filter(&iter, b, KEY_TYPE_discard); +#if 0 + BUG_ON(k && + (uk = bkey_unpack_key(b, k), + bkey_cmp(insert->k.p, bkey_start_pos(&uk))) > 0); +#else + if (k && + (uk = bkey_unpack_key(b, k), + bkey_cmp(insert->k.p, bkey_start_pos(&uk))) > 0) { + char buf1[100]; + char buf2[100]; + + bch2_bkey_to_text(&PBUF(buf1), &insert->k); + bch2_bkey_to_text(&PBUF(buf2), &uk); + + bch2_dump_btree_node(b); + panic("insert > next :\n" + "insert %s\n" + "next %s\n", + buf1, buf2); } +#endif - bkey_copy(*prev, &tmp.k); +#endif } -struct btree_nr_keys bch2_extent_sort_fix_overlapping(struct bch_fs *c, - struct bset *dst, - struct btree *b, - struct btree_node_iter_large *iter) +static void verify_modified_extent(struct btree_iter *iter, + struct bkey_packed *k) { - struct bkey_format *f = &b->format; - struct btree_node_iter_set *_l = iter->data, *_r; - struct bkey_packed *prev = NULL, *out, *lk, *rk; - struct bkey l_unpacked, r_unpacked; - struct bkey_s l, r; - struct btree_nr_keys nr; - - memset(&nr, 0, sizeof(nr)); - - heap_resort(iter, extent_sort_cmp); - - while (!bch2_btree_node_iter_large_end(iter)) { - lk = __btree_node_offset_to_key(b, _l->k); - - if (iter->used == 1) { - extent_sort_append(c, b, &nr, dst->start, &prev, lk); - extent_sort_next(iter, b, _l); - continue; - } - - _r = iter->data + 1; - if (iter->used > 2 && - extent_sort_cmp(iter, _r[0], _r[1]) >= 0) - _r++; - - rk = __btree_node_offset_to_key(b, _r->k); - - l = __bkey_disassemble(b, lk, &l_unpacked); - r = __bkey_disassemble(b, rk, &r_unpacked); - - /* If current key and next key don't overlap, just append */ - if 
(bkey_cmp(l.k->p, bkey_start_pos(r.k)) <= 0) { - extent_sort_append(c, b, &nr, dst->start, &prev, lk); - extent_sort_next(iter, b, _l); - continue; - } - - /* Skip 0 size keys */ - if (!r.k->size) { - extent_sort_next(iter, b, _r); - continue; - } - - /* - * overlap: keep the newer key and trim the older key so they - * don't overlap. comparing pointers tells us which one is - * newer, since the bsets are appended one after the other. - */ - - /* can't happen because of comparison func */ - BUG_ON(_l->k < _r->k && - !bkey_cmp(bkey_start_pos(l.k), bkey_start_pos(r.k))); - - if (_l->k > _r->k) { - /* l wins, trim r */ - if (bkey_cmp(l.k->p, r.k->p) >= 0) { - sort_key_next(iter, b, _r); - } else { - __bch2_cut_front(l.k->p, r); - extent_save(b, NULL, rk, r.k); - } - - extent_sort_sift(iter, b, _r - iter->data); - } else if (bkey_cmp(l.k->p, r.k->p) > 0) { - BKEY_PADDED(k) tmp; - - /* - * r wins, but it overlaps in the middle of l - split l: - */ - bkey_reassemble(&tmp.k, l.s_c); - bch2_cut_back(bkey_start_pos(r.k), &tmp.k.k); - - __bch2_cut_front(r.k->p, l); - extent_save(b, NULL, lk, l.k); - - extent_sort_sift(iter, b, 0); - - extent_sort_append(c, b, &nr, dst->start, &prev, - bkey_to_packed(&tmp.k)); - } else { - bch2_cut_back(bkey_start_pos(r.k), l.k); - extent_save(b, NULL, lk, l.k); - } - } - - if (prev) { - bch2_bkey_pack(prev, (void *) prev, f); - btree_keys_account_key_add(&nr, 0, prev); - out = bkey_next(prev); - } else { - out = dst->start; - } - - dst->u64s = cpu_to_le16((u64 *) out - dst->_data); - return nr; + bch2_btree_iter_verify(iter, iter->l[0].b); + bch2_verify_insert_pos(iter->l[0].b, k, k, k->u64s); } -struct extent_insert_state { - struct btree_insert *trans; - struct btree_insert_entry *insert; - struct bpos committed; - struct bch_fs_usage stats; +static void extent_bset_insert(struct bch_fs *c, struct btree_iter *iter, + struct bkey_i *insert) +{ + struct btree_iter_level *l = &iter->l[0]; + struct btree_node_iter node_iter; + struct bkey_packed *k; - /* for deleting: */ - struct bkey_i whiteout; - bool do_journal; - bool deleting; -}; + BUG_ON(insert->k.u64s > bch_btree_keys_u64s_remaining(c, l->b)); -static void bch2_add_sectors(struct extent_insert_state *s, - struct bkey_s_c k, u64 offset, s64 sectors) -{ - struct bch_fs *c = s->trans->c; - struct btree *b = s->insert->iter->l[0].b; + EBUG_ON(bkey_deleted(&insert->k) || !insert->k.size); + verify_extent_nonoverlapping(c, l->b, &l->iter, insert); - EBUG_ON(bkey_cmp(bkey_start_pos(k.k), b->data->min_key) < 0); + node_iter = l->iter; + k = bch2_btree_node_iter_prev_filter(&node_iter, l->b, KEY_TYPE_discard); + if (k && !bkey_written(l->b, k) && + bch2_extent_merge_inline(c, iter, k, bkey_to_packed(insert), true)) + return; - if (!sectors) + node_iter = l->iter; + k = bch2_btree_node_iter_peek_filter(&node_iter, l->b, KEY_TYPE_discard); + if (k && !bkey_written(l->b, k) && + bch2_extent_merge_inline(c, iter, bkey_to_packed(insert), k, false)) return; - bch2_mark_key(c, k, sectors, false, gc_pos_btree_node(b), - &s->stats, s->trans->journal_res.seq, 0); -} + k = bch2_btree_node_iter_bset_pos(&l->iter, l->b, bset_tree_last(l->b)); -static void bch2_subtract_sectors(struct extent_insert_state *s, - struct bkey_s_c k, u64 offset, s64 sectors) -{ - bch2_add_sectors(s, k, offset, -sectors); + bch2_bset_insert(l->b, &l->iter, k, insert, 0); + bch2_btree_node_iter_fix(iter, l->b, &l->iter, k, 0, k->u64s); + bch2_btree_iter_verify(iter, l->b); } -/* These wrappers subtract exactly the sectors that we're removing from @k */ 
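[Annotation: the __bch2_cut_front()/bch2_cut_back() helpers in this hunk trim an extent key from either end, and switch a fully-consumed key to KEY_TYPE_deleted rather than leaving it zero-sized. Below is a minimal userspace sketch of just those invariants, with toy types (uint64_t offsets, a bool "deleted" flag) standing in for struct bkey/struct bpos; it is illustrative only, not the kernel implementation:

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

/*
 * Toy extent: covers [end - size, end) in sectors, mirroring how
 * struct bkey stores an extent as an end position (k->p.offset) plus
 * a size.
 */
struct toy_extent {
	uint64_t	end;		/* exclusive end, like k->p.offset */
	uint64_t	size;		/* so start == end - size */
	bool		deleted;	/* stands in for KEY_TYPE_deleted */
};

/* Drop everything before @where; loosely models __bch2_cut_front(). */
static bool toy_cut_front(uint64_t where, struct toy_extent *k)
{
	uint64_t start = k->end - k->size;

	if (where <= start)
		return false;		/* nothing to trim */

	assert(where <= k->end);	/* caller must not cut past the end */

	k->size = k->end - where;
	if (!k->size)
		k->deleted = true;	/* whole key consumed */
	return true;
}

/* Drop everything at or after @where; loosely models bch2_cut_back(). */
static bool toy_cut_back(uint64_t where, struct toy_extent *k)
{
	uint64_t start = k->end - k->size;

	if (where >= k->end)
		return false;		/* nothing to trim */

	assert(where >= start);		/* can't cut before the start */

	k->end = where;
	k->size = where - start;
	if (!k->size)
		k->deleted = true;
	return true;
}

End of annotation; the diff continues below.]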
-static void bch2_cut_subtract_back(struct extent_insert_state *s, - struct bpos where, struct bkey_s k) +static unsigned bch2_bkey_nr_alloc_ptrs(struct bkey_s_c k) { - bch2_subtract_sectors(s, k.s_c, where.offset, - k.k->p.offset - where.offset); - bch2_cut_back(where, k.k); -} + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const union bch_extent_entry *entry; + unsigned ret = 0; -static void bch2_cut_subtract_front(struct extent_insert_state *s, - struct bpos where, struct bkey_s k) -{ - bch2_subtract_sectors(s, k.s_c, bkey_start_offset(k.k), - where.offset - bkey_start_offset(k.k)); - __bch2_cut_front(where, k); -} + bkey_extent_entry_for_each(ptrs, entry) { + switch (__extent_entry_type(entry)) { + case BCH_EXTENT_ENTRY_ptr: + case BCH_EXTENT_ENTRY_stripe_ptr: + ret++; + } + } -static void bch2_drop_subtract(struct extent_insert_state *s, struct bkey_s k) -{ - if (k.k->size) - bch2_subtract_sectors(s, k.s_c, - bkey_start_offset(k.k), k.k->size); - k.k->size = 0; - __set_bkey_deleted(k.k); + return ret; } -static bool bch2_extent_merge_inline(struct bch_fs *, - struct btree_iter *, - struct bkey_packed *, - struct bkey_packed *, - bool); - -#define MAX_LOCK_HOLD_TIME (5 * NSEC_PER_MSEC) - -static enum btree_insert_ret -extent_insert_should_stop(struct extent_insert_state *s) +static inline struct bpos +bch2_extent_atomic_end(struct bkey_i *insert, struct btree_iter *iter) { - struct btree *b = s->insert->iter->l[0].b; - - /* - * Check if we have sufficient space in both the btree node and the - * journal reservation: - * - * Each insert checks for room in the journal entry, but we check for - * room in the btree node up-front. In the worst case, bkey_cmpxchg() - * will insert two keys, and one iteration of this room will insert one - * key, so we need room for three keys. 
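 * (Editorial gloss: i.e. the worst case is the two keys a
 * bkey_cmpxchg() can insert plus the one key a single iteration here
 * inserts, hence room for three.)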
- */ - if (!bch2_btree_node_insert_fits(s->trans->c, b, s->insert->k->k.u64s)) - return BTREE_INSERT_BTREE_NODE_FULL; - else if (!journal_res_insert_fits(s->trans, s->insert)) - return BTREE_INSERT_JOURNAL_RES_FULL; /* XXX worth tracing */ - else - return BTREE_INSERT_OK; -} + struct btree *b = iter->l[0].b; + struct btree_node_iter node_iter = iter->l[0].iter; + struct bkey_packed *_k; + unsigned nr_alloc_ptrs = + bch2_bkey_nr_alloc_ptrs(bkey_i_to_s_c(insert)); -static void extent_bset_insert(struct bch_fs *c, struct btree_iter *iter, - struct bkey_i *insert) -{ - struct btree_iter_level *l = &iter->l[0]; - struct bset_tree *t = bset_tree_last(l->b); - struct bkey_packed *where = - bch2_btree_node_iter_bset_pos(&l->iter, l->b, t); - struct bkey_packed *prev = bch2_bkey_prev(l->b, t, where); - struct bkey_packed *next_live_key = where; - unsigned clobber_u64s; + BUG_ON(iter->uptodate > BTREE_ITER_NEED_PEEK); + BUG_ON(bkey_cmp(bkey_start_pos(&insert->k), b->data->min_key) < 0); - if (prev) - where = bkey_next(prev); + while ((_k = bch2_btree_node_iter_peek_filter(&node_iter, b, + KEY_TYPE_discard))) { + struct bkey unpacked; + struct bkey_s_c k = bkey_disassemble(b, _k, &unpacked); - while (next_live_key != btree_bkey_last(l->b, t) && - bkey_deleted(next_live_key)) - next_live_key = bkey_next(next_live_key); + if (bkey_cmp(insert->k.p, bkey_start_pos(k.k)) <= 0) + break; - /* - * Everything between where and next_live_key is now deleted keys, and - * is overwritten: - */ - clobber_u64s = (u64 *) next_live_key - (u64 *) where; + nr_alloc_ptrs += bch2_bkey_nr_alloc_ptrs(k); - if (prev && - bch2_extent_merge_inline(c, iter, prev, bkey_to_packed(insert), true)) - goto drop_deleted_keys; + if (nr_alloc_ptrs > 20) { + BUG_ON(bkey_cmp(k.k->p, bkey_start_pos(&insert->k)) <= 0); + return bpos_min(insert->k.p, k.k->p); + } - if (next_live_key != btree_bkey_last(l->b, t) && - bch2_extent_merge_inline(c, iter, bkey_to_packed(insert), - next_live_key, false)) - goto drop_deleted_keys; + bch2_btree_node_iter_advance(&node_iter, b); + } - bch2_bset_insert(l->b, &l->iter, where, insert, clobber_u64s); - bch2_btree_node_iter_fix(iter, l->b, &l->iter, t, where, - clobber_u64s, where->u64s); - return; -drop_deleted_keys: - bch2_bset_delete(l->b, where, clobber_u64s); - bch2_btree_node_iter_fix(iter, l->b, &l->iter, t, - where, clobber_u64s, 0); + return bpos_min(insert->k.p, b->key.k.p); } -static void extent_insert_committed(struct extent_insert_state *s) +void bch2_extent_trim_atomic(struct bkey_i *k, struct btree_iter *iter) { - struct bch_fs *c = s->trans->c; - struct btree_iter *iter = s->insert->iter; - struct bkey_i *insert = !s->deleting - ? s->insert->k - : &s->whiteout; - BKEY_PADDED(k) split; - - EBUG_ON(bkey_cmp(insert->k.p, s->committed) < 0); - EBUG_ON(bkey_cmp(s->committed, bkey_start_pos(&insert->k)) < 0); - - if (!bkey_cmp(s->committed, bkey_start_pos(&insert->k))) - return; - - if (s->deleting && !s->do_journal) { - bch2_cut_front(s->committed, insert); - goto done; - } - - EBUG_ON(bkey_deleted(&insert->k) || !insert->k.size); - - bkey_copy(&split.k, insert); - - if (!(s->trans->flags & BTREE_INSERT_JOURNAL_REPLAY) && - bkey_cmp(s->committed, insert->k.p) && - bch2_extent_is_compressed(bkey_i_to_s_c(insert))) { - /* XXX: possibly need to increase our reservation? 
*/ - bch2_cut_subtract_back(s, s->committed, - bkey_i_to_s(&split.k)); - bch2_cut_front(s->committed, insert); - bch2_add_sectors(s, bkey_i_to_s_c(insert), - bkey_start_offset(&insert->k), - insert->k.size); - } else { - bch2_cut_back(s->committed, &split.k.k); - bch2_cut_front(s->committed, insert); - } - - if (debug_check_bkeys(c)) - bch2_bkey_debugcheck(c, iter->l[0].b, bkey_i_to_s_c(&split.k)); - - bch2_btree_journal_key(s->trans, iter, &split.k); - - if (!s->deleting) - extent_bset_insert(c, iter, &split.k); -done: - bch2_btree_iter_set_pos_same_leaf(iter, s->committed); - - insert->k.needs_whiteout = false; - s->do_journal = false; - s->trans->did_work = true; + bch2_cut_back(bch2_extent_atomic_end(k, iter), &k->k); } -static enum btree_insert_ret -__extent_insert_advance_pos(struct extent_insert_state *s, - struct bpos next_pos, - struct bkey_s_c k) +bool bch2_extent_is_atomic(struct bkey_i *k, struct btree_iter *iter) { - struct extent_insert_hook *hook = s->trans->hook; - enum btree_insert_ret ret; - - if (hook) - ret = hook->fn(hook, s->committed, next_pos, k, s->insert->k); - else - ret = BTREE_INSERT_OK; - - EBUG_ON(bkey_deleted(&s->insert->k->k) || !s->insert->k->k.size); - - if (ret == BTREE_INSERT_OK) - s->committed = next_pos; - - return ret; + return !bkey_cmp(bch2_extent_atomic_end(k, iter), k->k.p); } -/* - * Update iter->pos, marking how much of @insert we've processed, and call hook - * fn: - */ -static enum btree_insert_ret -extent_insert_advance_pos(struct extent_insert_state *s, struct bkey_s_c k) +enum btree_insert_ret +bch2_extent_can_insert(struct btree_trans *trans, + struct btree_insert_entry *insert, + unsigned *u64s) { - struct btree *b = s->insert->iter->l[0].b; - struct bpos next_pos = bpos_min(s->insert->k->k.p, - k.k ? k.k->p : b->key.k.p); - enum btree_insert_ret ret; - - if (race_fault()) - return BTREE_INSERT_NEED_TRAVERSE; - - /* hole? 
*/ - if (k.k && bkey_cmp(s->committed, bkey_start_pos(k.k)) < 0) { - ret = __extent_insert_advance_pos(s, bkey_start_pos(k.k), - bkey_s_c_null); - if (ret != BTREE_INSERT_OK) - return ret; - } + struct btree_iter_level *l = &insert->iter->l[0]; + struct btree_node_iter node_iter = l->iter; + enum bch_extent_overlap overlap; + struct bkey_packed *_k; + struct bkey unpacked; + struct bkey_s_c k; + int sectors; - /* avoid redundant calls to hook fn: */ - if (!bkey_cmp(s->committed, next_pos)) + /* + * We avoid creating whiteouts whenever possible when deleting, but + * those optimizations mean we may potentially insert two whiteouts + * instead of one (when we overlap with the front of one extent and the + * back of another): + */ + if (bkey_whiteout(&insert->k->k)) + *u64s += BKEY_U64s; + + _k = bch2_btree_node_iter_peek_filter(&node_iter, l->b, + KEY_TYPE_discard); + if (!_k) return BTREE_INSERT_OK; - return __extent_insert_advance_pos(s, next_pos, k); -} + k = bkey_disassemble(l->b, _k, &unpacked); -static enum btree_insert_ret -extent_insert_check_split_compressed(struct extent_insert_state *s, - struct bkey_s_c k, - enum bch_extent_overlap overlap) -{ - struct bch_fs *c = s->trans->c; - unsigned sectors; + overlap = bch2_extent_overlap(&insert->k->k, k.k); + + /* account for having to split existing extent: */ + if (overlap == BCH_EXTENT_OVERLAP_MIDDLE) + *u64s += _k->u64s; if (overlap == BCH_EXTENT_OVERLAP_MIDDLE && (sectors = bch2_extent_is_compressed(k))) { - int flags = BCH_DISK_RESERVATION_BTREE_LOCKS_HELD; - - if (s->trans->flags & BTREE_INSERT_NOFAIL) - flags |= BCH_DISK_RESERVATION_NOFAIL; + int flags = trans->flags & BTREE_INSERT_NOFAIL + ? BCH_DISK_RESERVATION_NOFAIL : 0; - switch (bch2_disk_reservation_add(c, - s->trans->disk_res, - sectors * bch2_extent_nr_dirty_ptrs(k), - flags)) { + switch (bch2_disk_reservation_add(trans->c, + trans->disk_res, + sectors, flags)) { case 0: break; case -ENOSPC: return BTREE_INSERT_ENOSPC; - case -EINTR: - return BTREE_INSERT_NEED_GC_LOCK; default: BUG(); } @@ -1317,78 +1019,60 @@ extent_insert_check_split_compressed(struct extent_insert_state *s, return BTREE_INSERT_OK; } -static enum btree_insert_ret -extent_squash(struct extent_insert_state *s, struct bkey_i *insert, - struct bset_tree *t, struct bkey_packed *_k, struct bkey_s k, +static void +extent_squash(struct bch_fs *c, struct btree_iter *iter, + struct bkey_i *insert, + struct bkey_packed *_k, struct bkey_s k, enum bch_extent_overlap overlap) { - struct bch_fs *c = s->trans->c; - struct btree_iter *iter = s->insert->iter; struct btree_iter_level *l = &iter->l[0]; - struct btree *b = l->b; - struct btree_node_iter *node_iter = &l->iter; - enum btree_insert_ret ret; switch (overlap) { case BCH_EXTENT_OVERLAP_FRONT: /* insert overlaps with start of k: */ - bch2_cut_subtract_front(s, insert->k.p, k); + __bch2_cut_front(insert->k.p, k); BUG_ON(bkey_deleted(k.k)); - extent_save(b, node_iter, _k, k.k); + extent_save(l->b, _k, k.k); + verify_modified_extent(iter, _k); break; case BCH_EXTENT_OVERLAP_BACK: /* insert overlaps with end of k: */ - bch2_cut_subtract_back(s, bkey_start_pos(&insert->k), k); + bch2_cut_back(bkey_start_pos(&insert->k), k.k); BUG_ON(bkey_deleted(k.k)); - extent_save(b, node_iter, _k, k.k); + extent_save(l->b, _k, k.k); /* * As the auxiliary tree is indexed by the end of the * key and we've just changed the end, update the * auxiliary tree. 
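	 * (Editorial gloss: the auxiliary search tree binary-searches on
	 * each key's end position, so after bch2_cut_back() shrinks k->p
	 * the cached endpoints must be repaired, or lookups could skip
	 * over this key.)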
*/ - bch2_bset_fix_invalidated_key(b, t, _k); - bch2_btree_node_iter_fix(iter, b, node_iter, t, - _k, _k->u64s, _k->u64s); + bch2_bset_fix_invalidated_key(l->b, _k); + bch2_btree_node_iter_fix(iter, l->b, &l->iter, + _k, _k->u64s, _k->u64s); + verify_modified_extent(iter, _k); break; case BCH_EXTENT_OVERLAP_ALL: { - struct bpos orig_pos = k.k->p; - /* The insert key completely covers k, invalidate k */ if (!bkey_whiteout(k.k)) - btree_keys_account_key_drop(&b->nr, - t - b->set, _k); - - bch2_drop_subtract(s, k); - k.k->p = bkey_start_pos(&insert->k); - if (!__extent_save(b, node_iter, _k, k.k)) { - /* - * Couldn't repack: we aren't necessarily able - * to repack if the new key is outside the range - * of the old extent, so we have to split - * @insert: - */ - k.k->p = orig_pos; - extent_save(b, node_iter, _k, k.k); - - ret = extent_insert_advance_pos(s, k.s_c); - if (ret != BTREE_INSERT_OK) - return ret; - - extent_insert_committed(s); - /* - * We split and inserted upto at k.k->p - that - * has to coincide with iter->pos, so that we - * don't have anything more we have to insert - * until we recheck our journal reservation: - */ - EBUG_ON(bkey_cmp(s->committed, k.k->p)); + btree_account_key_drop(l->b, _k); + + k.k->size = 0; + k.k->type = KEY_TYPE_deleted; + + if (_k >= btree_bset_last(l->b)->start) { + unsigned u64s = _k->u64s; + + bch2_bset_delete(l->b, _k, _k->u64s); + bch2_btree_node_iter_fix(iter, l->b, &l->iter, + _k, u64s, 0); + bch2_btree_iter_verify(iter, l->b); } else { - bch2_bset_fix_invalidated_key(b, t, _k); - bch2_btree_node_iter_fix(iter, b, node_iter, t, - _k, _k->u64s, _k->u64s); + extent_save(l->b, _k, k.k); + bch2_btree_node_iter_fix(iter, l->b, &l->iter, + _k, _k->u64s, _k->u64s); + verify_modified_extent(iter, _k); } break; @@ -1410,171 +1094,110 @@ extent_squash(struct extent_insert_state *s, struct bkey_i *insert, * what k points to) */ bkey_reassemble(&split.k, k.s_c); - split.k.k.needs_whiteout |= bset_written(b, bset(b, t)); + split.k.k.needs_whiteout |= bkey_written(l->b, _k); bch2_cut_back(bkey_start_pos(&insert->k), &split.k.k); BUG_ON(bkey_deleted(&split.k.k)); - bch2_cut_subtract_front(s, insert->k.p, k); + __bch2_cut_front(insert->k.p, k); BUG_ON(bkey_deleted(k.k)); - extent_save(b, node_iter, _k, k.k); + extent_save(l->b, _k, k.k); + verify_modified_extent(iter, _k); - bch2_add_sectors(s, bkey_i_to_s_c(&split.k), - bkey_start_offset(&split.k.k), - split.k.k.size); extent_bset_insert(c, iter, &split.k); break; } } - - return BTREE_INSERT_OK; } -static enum btree_insert_ret -__bch2_delete_fixup_extent(struct extent_insert_state *s) -{ - struct bch_fs *c = s->trans->c; - struct btree_iter *iter = s->insert->iter; - struct btree_iter_level *l = &iter->l[0]; - struct btree *b = l->b; - struct btree_node_iter *node_iter = &l->iter; - struct bkey_packed *_k; - struct bkey unpacked; - struct bkey_i *insert = s->insert->k; - enum btree_insert_ret ret = BTREE_INSERT_OK; - - EBUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&insert->k))); - - s->whiteout = *insert; - - while (bkey_cmp(s->committed, insert->k.p) < 0 && - (ret = extent_insert_should_stop(s)) == BTREE_INSERT_OK && - (_k = bch2_btree_node_iter_peek_all(node_iter, b))) { - struct bset_tree *t = bch2_bkey_to_bset(b, _k); - struct bkey_s k = __bkey_disassemble(b, _k, &unpacked); - enum bch_extent_overlap overlap; - - EBUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&insert->k))); - EBUG_ON(bkey_cmp(iter->pos, k.k->p) >= 0); - - if (bkey_cmp(bkey_start_pos(k.k), insert->k.p) >= 0) - break; - - if (bkey_whiteout(k.k)) { 
- s->committed = bpos_min(insert->k.p, k.k->p); - goto next; - } - - overlap = bch2_extent_overlap(&insert->k, k.k); - - ret = extent_insert_check_split_compressed(s, k.s_c, overlap); - if (ret) - break; - - ret = extent_insert_advance_pos(s, k.s_c); - if (ret) - break; - - s->do_journal = true; - - if (overlap == BCH_EXTENT_OVERLAP_ALL) { - btree_keys_account_key_drop(&b->nr, - t - b->set, _k); - bch2_subtract_sectors(s, k.s_c, - bkey_start_offset(k.k), k.k->size); - _k->type = KEY_TYPE_DISCARD; - reserve_whiteout(b, t, _k); - } else if (k.k->needs_whiteout || - bset_written(b, bset(b, t))) { - struct bkey_i discard = *insert; - - switch (overlap) { - case BCH_EXTENT_OVERLAP_FRONT: - bch2_cut_front(bkey_start_pos(k.k), &discard); - break; - case BCH_EXTENT_OVERLAP_BACK: - bch2_cut_back(k.k->p, &discard.k); - break; - default: - break; - } - - discard.k.needs_whiteout = true; - - ret = extent_squash(s, insert, t, _k, k, overlap); - BUG_ON(ret != BTREE_INSERT_OK); - - extent_bset_insert(c, iter, &discard); - } else { - ret = extent_squash(s, insert, t, _k, k, overlap); - BUG_ON(ret != BTREE_INSERT_OK); - } -next: - bch2_cut_front(s->committed, insert); - bch2_btree_iter_set_pos_same_leaf(iter, s->committed); - } - - return ret; -} +struct extent_insert_state { + struct bkey_i whiteout; + bool update_journal; + bool update_btree; + bool deleting; +}; -static enum btree_insert_ret -__bch2_insert_fixup_extent(struct extent_insert_state *s) +static void __bch2_insert_fixup_extent(struct bch_fs *c, + struct btree_iter *iter, + struct bkey_i *insert, + struct extent_insert_state *s) { - struct btree_iter *iter = s->insert->iter; struct btree_iter_level *l = &iter->l[0]; - struct btree *b = l->b; - struct btree_node_iter *node_iter = &l->iter; struct bkey_packed *_k; struct bkey unpacked; - struct bkey_i *insert = s->insert->k; - enum btree_insert_ret ret = BTREE_INSERT_OK; - while (bkey_cmp(s->committed, insert->k.p) < 0 && - (ret = extent_insert_should_stop(s)) == BTREE_INSERT_OK && - (_k = bch2_btree_node_iter_peek_all(node_iter, b))) { - struct bset_tree *t = bch2_bkey_to_bset(b, _k); - struct bkey_s k = __bkey_disassemble(b, _k, &unpacked); - enum bch_extent_overlap overlap; - - EBUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&insert->k))); - EBUG_ON(bkey_cmp(iter->pos, k.k->p) >= 0); + while ((_k = bch2_btree_node_iter_peek_filter(&l->iter, l->b, + KEY_TYPE_discard))) { + struct bkey_s k = __bkey_disassemble(l->b, _k, &unpacked); + struct bpos cur_end = bpos_min(insert->k.p, k.k->p); + enum bch_extent_overlap overlap = + bch2_extent_overlap(&insert->k, k.k); if (bkey_cmp(bkey_start_pos(k.k), insert->k.p) >= 0) break; - overlap = bch2_extent_overlap(&insert->k, k.k); - - ret = extent_insert_check_split_compressed(s, k.s_c, overlap); - if (ret) - break; + if (!bkey_whiteout(k.k)) + s->update_journal = true; - if (!k.k->size) - goto squash; + if (!s->update_journal) { + bch2_cut_front(cur_end, insert); + bch2_cut_front(cur_end, &s->whiteout); + bch2_btree_iter_set_pos_same_leaf(iter, cur_end); + goto next; + } /* - * Only call advance pos & call hook for nonzero size extents: + * When deleting, if possible just do it by switching the type + * of the key we're deleting, instead of creating and inserting + * a new whiteout: */ - ret = extent_insert_advance_pos(s, k.s_c); - if (ret) + if (s->deleting && + !s->update_btree && + !bkey_cmp(insert->k.p, k.k->p) && + !bkey_cmp(bkey_start_pos(&insert->k), bkey_start_pos(k.k))) { + if (!bkey_whiteout(k.k)) { + btree_account_key_drop(l->b, _k); + _k->type = 
KEY_TYPE_discard; + reserve_whiteout(l->b, _k); + } break; + } - if (k.k->size && - (k.k->needs_whiteout || bset_written(b, bset(b, t)))) + if (k.k->needs_whiteout || bkey_written(l->b, _k)) { insert->k.needs_whiteout = true; + s->update_btree = true; + } - if (overlap == BCH_EXTENT_OVERLAP_ALL && + if (s->update_btree && + overlap == BCH_EXTENT_OVERLAP_ALL && bkey_whiteout(k.k) && k.k->needs_whiteout) { - unreserve_whiteout(b, t, _k); + unreserve_whiteout(l->b, _k); _k->needs_whiteout = false; } -squash: - ret = extent_squash(s, insert, t, _k, k, overlap); - if (ret != BTREE_INSERT_OK) + + extent_squash(c, iter, insert, _k, k, overlap); + + if (!s->update_btree) + bch2_cut_front(cur_end, insert); +next: + if (overlap == BCH_EXTENT_OVERLAP_FRONT || + overlap == BCH_EXTENT_OVERLAP_MIDDLE) break; } - return ret; + /* + * may have skipped past some deleted extents greater than the insert + * key, before we got to a non deleted extent and knew we could bail out + * rewind the iterator a bit if necessary: + */ + { + struct btree_node_iter node_iter = l->iter; + + while ((_k = bch2_btree_node_iter_prev_all(&node_iter, l->b)) && + bkey_cmp_left_packed(l->b, _k, &insert->k.p) > 0) + l->iter = node_iter; + } } /** @@ -1616,165 +1239,69 @@ squash: * If the end of iter->pos is not the same as the end of insert, then * key insertion needs to continue/be retried. */ -enum btree_insert_ret -bch2_insert_fixup_extent(struct btree_insert *trans, - struct btree_insert_entry *insert) +void bch2_insert_fixup_extent(struct btree_trans *trans, + struct btree_insert_entry *insert) { struct bch_fs *c = trans->c; - struct btree_iter *iter = insert->iter; - struct btree_iter_level *l = &iter->l[0]; - struct btree *b = l->b; - enum btree_insert_ret ret = BTREE_INSERT_OK; - + struct btree_iter *iter = insert->iter; struct extent_insert_state s = { - .trans = trans, - .insert = insert, - .committed = insert->iter->pos, + .whiteout = *insert->k, + .update_journal = !bkey_whiteout(&insert->k->k), + .update_btree = !bkey_whiteout(&insert->k->k), .deleting = bkey_whiteout(&insert->k->k), }; + BKEY_PADDED(k) tmp; EBUG_ON(iter->level); - EBUG_ON(bkey_deleted(&insert->k->k) || !insert->k->k.size); - - /* - * As we process overlapping extents, we advance @iter->pos both to - * signal to our caller (btree_insert_key()) how much of @insert->k has - * been inserted, and also to keep @iter->pos consistent with - * @insert->k and the node iterator that we're advancing: - */ + EBUG_ON(!insert->k->k.size); EBUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&insert->k->k))); - if (!s.deleting && - !(trans->flags & BTREE_INSERT_JOURNAL_REPLAY)) - bch2_add_sectors(&s, bkey_i_to_s_c(insert->k), - bkey_start_offset(&insert->k->k), - insert->k->k.size); + __bch2_insert_fixup_extent(c, iter, insert->k, &s); - ret = !s.deleting - ? 
__bch2_insert_fixup_extent(&s) - : __bch2_delete_fixup_extent(&s); + bch2_btree_iter_set_pos_same_leaf(iter, insert->k->k.p); - if (ret == BTREE_INSERT_OK && - bkey_cmp(s.committed, insert->k->k.p) < 0) - ret = extent_insert_advance_pos(&s, bkey_s_c_null); + if (s.update_btree) { + bkey_copy(&tmp.k, insert->k); - extent_insert_committed(&s); + if (s.deleting) + tmp.k.k.type = KEY_TYPE_discard; +#if 0 + /* disabled due to lock recursion - mark_lock: */ + if (debug_check_bkeys(c)) + bch2_bkey_debugcheck(c, iter->l[0].b, + bkey_i_to_s_c(&tmp.k)); +#endif + EBUG_ON(bkey_deleted(&tmp.k.k) || !tmp.k.k.size); - if (s.deleting) - bch2_cut_front(iter->pos, insert->k); - - /* - * Subtract any remaining sectors from @insert, if we bailed out early - * and didn't fully insert @insert: - */ - if (!s.deleting && - !(trans->flags & BTREE_INSERT_JOURNAL_REPLAY) && - insert->k->k.size) - bch2_subtract_sectors(&s, bkey_i_to_s_c(insert->k), - bkey_start_offset(&insert->k->k), - insert->k->k.size); + extent_bset_insert(c, iter, &tmp.k); + } - bch2_fs_usage_apply(c, &s.stats, trans->disk_res, - gc_pos_btree_node(b)); + if (s.update_journal) { + bkey_copy(&tmp.k, !s.deleting ? insert->k : &s.whiteout); - EBUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&insert->k->k))); - EBUG_ON(bkey_cmp(iter->pos, s.committed)); - EBUG_ON((bkey_cmp(iter->pos, b->key.k.p) == 0) != - !!(iter->flags & BTREE_ITER_AT_END_OF_LEAF)); + if (s.deleting) + tmp.k.k.type = KEY_TYPE_discard; - if (insert->k->k.size && (iter->flags & BTREE_ITER_AT_END_OF_LEAF)) - ret = BTREE_INSERT_NEED_TRAVERSE; + EBUG_ON(bkey_deleted(&tmp.k.k) || !tmp.k.k.size); - WARN_ONCE((ret == BTREE_INSERT_OK) != (insert->k->k.size == 0), - "ret %u insert->k.size %u", ret, insert->k->k.size); + bch2_btree_journal_key(trans, iter, &tmp.k); + } - return ret; + bch2_cut_front(insert->k->k.p, insert->k); } const char *bch2_extent_invalid(const struct bch_fs *c, struct bkey_s_c k) { - if (bkey_val_u64s(k.k) > BKEY_EXTENT_VAL_U64s_MAX) - return "value too big"; - - if (!k.k->size) - return "zero key size"; - - switch (k.k->type) { - case BCH_EXTENT: - case BCH_EXTENT_CACHED: { - struct bkey_s_c_extent e = bkey_s_c_to_extent(k); - const union bch_extent_entry *entry; - struct bch_extent_crc_unpacked crc; - const struct bch_extent_ptr *ptr; - unsigned size_ondisk = e.k->size; - const char *reason; - unsigned nonce = UINT_MAX; - - extent_for_each_entry(e, entry) { - if (__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX) - return "invalid extent entry type"; - - if (extent_entry_is_crc(entry)) { - crc = bch2_extent_crc_unpack(e.k, entry_to_crc(entry)); - - if (crc.offset + e.k->size > - crc.uncompressed_size) - return "checksum offset + key size > uncompressed size"; - - size_ondisk = crc.compressed_size; - - if (!bch2_checksum_type_valid(c, crc.csum_type)) - return "invalid checksum type"; - - if (crc.compression_type >= BCH_COMPRESSION_NR) - return "invalid compression type"; - - if (bch2_csum_type_is_encryption(crc.csum_type)) { - if (nonce == UINT_MAX) - nonce = crc.offset + crc.nonce; - else if (nonce != crc.offset + crc.nonce) - return "incorrect nonce"; - } - } else { - ptr = entry_to_ptr(entry); - - reason = extent_ptr_invalid(c, e, &entry->ptr, - size_ondisk, false); - if (reason) - return reason; - } - } - - return NULL; - } - - case BCH_RESERVATION: { - struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k); - - if (bkey_val_bytes(k.k) != sizeof(struct bch_reservation)) - return "incorrect value size"; - - if (!r.v->nr_replicas || r.v->nr_replicas > 
BCH_REPLICAS_MAX) - return "invalid nr_replicas"; - - return NULL; - } - - default: - return "invalid value type"; - } + return bch2_bkey_ptrs_invalid(c, k); } -static void bch2_extent_debugcheck_extent(struct bch_fs *c, struct btree *b, - struct bkey_s_c_extent e) +void bch2_extent_debugcheck(struct bch_fs *c, struct btree *b, + struct bkey_s_c k) { - const struct bch_extent_ptr *ptr; - struct bch_dev *ca; - struct bucket_mark mark; - unsigned seq, stale; + struct bkey_s_c_extent e = bkey_s_c_to_extent(k); + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; char buf[160]; - bool bad; - unsigned replicas = 0; /* * XXX: we should be doing most/all of these checks at startup time, @@ -1785,181 +1312,156 @@ static void bch2_extent_debugcheck_extent(struct bch_fs *c, struct btree *b, * going to get overwritten during replay) */ - extent_for_each_ptr(e, ptr) { - ca = bch_dev_bkey_exists(c, ptr->dev); - replicas++; - - /* - * If journal replay hasn't finished, we might be seeing keys - * that will be overwritten by the time journal replay is done: - */ - if (!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags)) - continue; + bch2_fs_bug_on(!test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) && + !bch2_bkey_replicas_marked(c, e.s_c, false), c, + "extent key bad (replicas not marked in superblock):\n%s", + (bch2_bkey_val_to_text(&PBUF(buf), c, e.s_c), buf)); - stale = 0; - - do { - seq = read_seqcount_begin(&c->gc_pos_lock); - mark = ptr_bucket_mark(ca, ptr); - - /* between mark and bucket gen */ - smp_rmb(); - - stale = ptr_stale(ca, ptr); - - bch2_fs_bug_on(stale && !ptr->cached, c, - "stale dirty pointer"); - - bch2_fs_bug_on(stale > 96, c, - "key too stale: %i", - stale); - - if (stale) - break; - - bad = gc_pos_cmp(c->gc_pos, gc_pos_btree_node(b)) > 0 && - (mark.data_type != BCH_DATA_USER || - !(ptr->cached - ? mark.cached_sectors - : mark.dirty_sectors)); - } while (read_seqcount_retry(&c->gc_pos_lock, seq)); - - if (bad) - goto bad_ptr; - } - - if (replicas > BCH_REPLICAS_MAX) { - bch2_bkey_val_to_text(c, btree_node_type(b), buf, - sizeof(buf), e.s_c); - bch2_fs_bug(c, - "extent key bad (too many replicas: %u): %s", - replicas, buf); + /* + * If journal replay hasn't finished, we might be seeing keys + * that will be overwritten by the time journal replay is done: + */ + if (!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags)) return; - } - if (!bkey_extent_is_cached(e.k) && - !bch2_bkey_replicas_marked(c, BCH_DATA_USER, e.s_c)) { - bch2_bkey_val_to_text(c, btree_node_type(b), - buf, sizeof(buf), e.s_c); - bch2_fs_bug(c, - "extent key bad (replicas not marked in superblock):\n%s", - buf); - return; + extent_for_each_ptr_decode(e, p, entry) { + struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev); + struct bucket_mark mark = ptr_bucket_mark(ca, &p.ptr); + unsigned stale = gen_after(mark.gen, p.ptr.gen); + unsigned disk_sectors = ptr_disk_sectors(p); + unsigned mark_sectors = p.ptr.cached + ? 
mark.cached_sectors + : mark.dirty_sectors; + + bch2_fs_bug_on(stale && !p.ptr.cached, c, + "stale dirty pointer (ptr gen %u bucket %u", + p.ptr.gen, mark.gen); + + bch2_fs_bug_on(stale > 96, c, "key too stale: %i", stale); + + bch2_fs_bug_on(!stale && + (mark.data_type != BCH_DATA_USER || + mark_sectors < disk_sectors), c, + "extent pointer not marked: %s:\n" + "type %u sectors %u < %u", + (bch2_bkey_val_to_text(&PBUF(buf), c, e.s_c), buf), + mark.data_type, + mark_sectors, disk_sectors); } +} - return; - -bad_ptr: - bch2_bkey_val_to_text(c, btree_node_type(b), buf, - sizeof(buf), e.s_c); - bch2_fs_bug(c, "extent pointer bad gc mark: %s:\nbucket %zu " - "gen %i type %u", buf, - PTR_BUCKET_NR(ca, ptr), mark.gen, mark.data_type); - return; +void bch2_extent_to_text(struct printbuf *out, struct bch_fs *c, + struct bkey_s_c k) +{ + bch2_bkey_ptrs_to_text(out, c, k); } -void bch2_extent_debugcheck(struct bch_fs *c, struct btree *b, struct bkey_s_c k) +static unsigned bch2_crc_field_size_max[] = { + [BCH_EXTENT_ENTRY_crc32] = CRC32_SIZE_MAX, + [BCH_EXTENT_ENTRY_crc64] = CRC64_SIZE_MAX, + [BCH_EXTENT_ENTRY_crc128] = CRC128_SIZE_MAX, +}; + +static void bch2_extent_crc_pack(union bch_extent_crc *dst, + struct bch_extent_crc_unpacked src) { - switch (k.k->type) { - case BCH_EXTENT: - case BCH_EXTENT_CACHED: - bch2_extent_debugcheck_extent(c, b, bkey_s_c_to_extent(k)); +#define set_common_fields(_dst, _src) \ + _dst.csum_type = _src.csum_type, \ + _dst.compression_type = _src.compression_type, \ + _dst._compressed_size = _src.compressed_size - 1, \ + _dst._uncompressed_size = _src.uncompressed_size - 1, \ + _dst.offset = _src.offset + + switch (extent_entry_type(to_entry(dst))) { + case BCH_EXTENT_ENTRY_crc32: + set_common_fields(dst->crc32, src); + dst->crc32.csum = *((__le32 *) &src.csum.lo); + break; + case BCH_EXTENT_ENTRY_crc64: + set_common_fields(dst->crc64, src); + dst->crc64.nonce = src.nonce; + dst->crc64.csum_lo = src.csum.lo; + dst->crc64.csum_hi = *((__le16 *) &src.csum.hi); break; - case BCH_RESERVATION: + case BCH_EXTENT_ENTRY_crc128: + set_common_fields(dst->crc128, src); + dst->crc128.nonce = src.nonce; + dst->crc128.csum = src.csum; break; default: BUG(); } -} - -void bch2_extent_to_text(struct bch_fs *c, char *buf, - size_t size, struct bkey_s_c k) -{ - char *out = buf, *end = buf + size; - const char *invalid; - -#define p(...) 
(out += scnprintf(out, end - out, __VA_ARGS__)) - - if (bkey_extent_is_data(k.k)) - out += extent_print_ptrs(c, buf, size, bkey_s_c_to_extent(k)); - - invalid = bch2_extent_invalid(c, k); - if (invalid) - p(" invalid: %s", invalid); -#undef p +#undef set_common_fields } static void bch2_extent_crc_init(union bch_extent_crc *crc, struct bch_extent_crc_unpacked new) { -#define common_fields(_crc) \ - .csum_type = _crc.csum_type, \ - .compression_type = _crc.compression_type, \ - ._compressed_size = _crc.compressed_size - 1, \ - ._uncompressed_size = _crc.uncompressed_size - 1, \ - .offset = _crc.offset - if (bch_crc_bytes[new.csum_type] <= 4 && - new.uncompressed_size <= CRC32_SIZE_MAX && - new.nonce <= CRC32_NONCE_MAX) { - crc->crc32 = (struct bch_extent_crc32) { - .type = 1 << BCH_EXTENT_ENTRY_crc32, - common_fields(new), - .csum = *((__le32 *) &new.csum.lo), - }; - return; - } - - if (bch_crc_bytes[new.csum_type] <= 10 && - new.uncompressed_size <= CRC64_SIZE_MAX && - new.nonce <= CRC64_NONCE_MAX) { - crc->crc64 = (struct bch_extent_crc64) { - .type = 1 << BCH_EXTENT_ENTRY_crc64, - common_fields(new), - .nonce = new.nonce, - .csum_lo = new.csum.lo, - .csum_hi = *((__le16 *) &new.csum.hi), - }; - return; - } + new.uncompressed_size - 1 <= CRC32_SIZE_MAX && + new.nonce <= CRC32_NONCE_MAX) + crc->type = 1 << BCH_EXTENT_ENTRY_crc32; + else if (bch_crc_bytes[new.csum_type] <= 10 && + new.uncompressed_size - 1 <= CRC64_SIZE_MAX && + new.nonce <= CRC64_NONCE_MAX) + crc->type = 1 << BCH_EXTENT_ENTRY_crc64; + else if (bch_crc_bytes[new.csum_type] <= 16 && + new.uncompressed_size - 1 <= CRC128_SIZE_MAX && + new.nonce <= CRC128_NONCE_MAX) + crc->type = 1 << BCH_EXTENT_ENTRY_crc128; + else + BUG(); - if (bch_crc_bytes[new.csum_type] <= 16 && - new.uncompressed_size <= CRC128_SIZE_MAX && - new.nonce <= CRC128_NONCE_MAX) { - crc->crc128 = (struct bch_extent_crc128) { - .type = 1 << BCH_EXTENT_ENTRY_crc128, - common_fields(new), - .nonce = new.nonce, - .csum = new.csum, - }; - return; - } -#undef common_fields - BUG(); + bch2_extent_crc_pack(crc, new); } void bch2_extent_crc_append(struct bkey_i_extent *e, struct bch_extent_crc_unpacked new) { - struct bch_extent_crc_unpacked crc; - const union bch_extent_entry *i; + bch2_extent_crc_init((void *) extent_entry_last(extent_i_to_s(e)), new); + __extent_entry_push(e); +} - BUG_ON(new.compressed_size > new.uncompressed_size); - BUG_ON(new.live_size != e->k.size); - BUG_ON(!new.compressed_size || !new.uncompressed_size); +static inline void __extent_entry_insert(struct bkey_i_extent *e, + union bch_extent_entry *dst, + union bch_extent_entry *new) +{ + union bch_extent_entry *end = extent_entry_last(extent_i_to_s(e)); - /* - * Look up the last crc entry, so we can check if we need to add - * another: - */ - extent_for_each_crc(extent_i_to_s(e), crc, i) - ; + memmove_u64s_up((u64 *) dst + extent_entry_u64s(new), + dst, (u64 *) end - (u64 *) dst); + e->k.u64s += extent_entry_u64s(new); + memcpy(dst, new, extent_entry_bytes(new)); +} - if (!bch2_crc_unpacked_cmp(crc, new)) - return; +void bch2_extent_ptr_decoded_append(struct bkey_i_extent *e, + struct extent_ptr_decoded *p) +{ + struct bch_extent_crc_unpacked crc = bch2_extent_crc_unpack(&e->k, NULL); + union bch_extent_entry *pos; + unsigned i; - bch2_extent_crc_init((void *) extent_entry_last(extent_i_to_s(e)), new); - __extent_entry_push(e); + if (!bch2_crc_unpacked_cmp(crc, p->crc)) { + pos = e->v.start; + goto found; + } + + extent_for_each_crc(extent_i_to_s(e), crc, pos) + if (!bch2_crc_unpacked_cmp(crc, 
p->crc)) { + pos = extent_entry_next(pos); + goto found; + } + + bch2_extent_crc_append(e, p->crc); + pos = extent_entry_last(extent_i_to_s(e)); +found: + p->ptr.type = 1 << BCH_EXTENT_ENTRY_ptr; + __extent_entry_insert(e, pos, to_entry(&p->ptr)); + + for (i = 0; i < p->ec_nr; i++) { + p->ec[i].type = 1 << BCH_EXTENT_ENTRY_stripe_ptr; + __extent_entry_insert(e, pos, to_entry(&p->ec[i])); + } } /* @@ -1972,41 +1474,17 @@ void bch2_extent_crc_append(struct bkey_i_extent *e, */ bool bch2_extent_normalize(struct bch_fs *c, struct bkey_s k) { - struct bkey_s_extent e; - - switch (k.k->type) { - case KEY_TYPE_ERROR: - return false; - - case KEY_TYPE_DELETED: - case KEY_TYPE_COOKIE: - return true; - - case KEY_TYPE_DISCARD: - return bversion_zero(k.k->version); - - case BCH_EXTENT: - case BCH_EXTENT_CACHED: - e = bkey_s_to_extent(k); + struct bch_extent_ptr *ptr; - bch2_extent_drop_stale(c, e); + bch2_bkey_drop_ptrs(k, ptr, + ptr->cached && + ptr_stale(bch_dev_bkey_exists(c, ptr->dev), ptr)); - if (!bkey_val_u64s(e.k)) { - if (bkey_extent_is_cached(e.k)) { - k.k->type = KEY_TYPE_DISCARD; - if (bversion_zero(k.k->version)) - return true; - } else { - k.k->type = KEY_TYPE_ERROR; - } - } + /* will only happen if all pointers were cached: */ + if (!bkey_val_u64s(k.k)) + k.k->type = KEY_TYPE_deleted; - return false; - case BCH_RESERVATION: - return false; - default: - BUG(); - } + return false; } void bch2_extent_mark_replicas_cached(struct bch_fs *c, @@ -2014,117 +1492,59 @@ void bch2_extent_mark_replicas_cached(struct bch_fs *c, unsigned target, unsigned nr_desired_replicas) { - struct bch_extent_ptr *ptr; - int extra = bch2_extent_durability(c, e.c) - nr_desired_replicas; + union bch_extent_entry *entry; + struct extent_ptr_decoded p; + int extra = bch2_bkey_durability(c, e.s_c) - nr_desired_replicas; if (target && extra > 0) - extent_for_each_ptr(e, ptr) { - int n = bch2_extent_ptr_durability(c, ptr); + extent_for_each_ptr_decode(e, p, entry) { + int n = bch2_extent_ptr_durability(c, p); if (n && n <= extra && - !bch2_dev_in_target(c, ptr->dev, target)) { - ptr->cached = true; + !bch2_dev_in_target(c, p.ptr.dev, target)) { + entry->ptr.cached = true; extra -= n; } } if (extra > 0) - extent_for_each_ptr(e, ptr) { - int n = bch2_extent_ptr_durability(c, ptr); + extent_for_each_ptr_decode(e, p, entry) { + int n = bch2_extent_ptr_durability(c, p); if (n && n <= extra) { - ptr->cached = true; + entry->ptr.cached = true; extra -= n; } } } -/* - * This picks a non-stale pointer, preferably from a device other than @avoid. - * Avoid can be NULL, meaning pick any. If there are no non-stale pointers to - * other devices, it will still pick a pointer from avoid. 
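 * (Editorial gloss: @avoid is a soft preference rather than a hard
 * exclusion - e.g. with replicas only on devices 0 and 1 and both in
 * @avoid, a pointer is still returned instead of failing the read.)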
- */ -int bch2_extent_pick_ptr(struct bch_fs *c, struct bkey_s_c k, - struct bch_devs_mask *avoid, - struct extent_pick_ptr *pick) -{ - int ret; - - switch (k.k->type) { - case KEY_TYPE_DELETED: - case KEY_TYPE_DISCARD: - case KEY_TYPE_COOKIE: - return 0; - - case KEY_TYPE_ERROR: - return -EIO; - - case BCH_EXTENT: - case BCH_EXTENT_CACHED: - ret = extent_pick_read_device(c, bkey_s_c_to_extent(k), - avoid, pick); - - if (!ret && !bkey_extent_is_cached(k.k)) - ret = -EIO; - - return ret; - - case BCH_RESERVATION: - return 0; - - default: - BUG(); - } -} - -enum merge_result bch2_extent_merge(struct bch_fs *c, struct btree *b, - struct bkey_i *l, struct bkey_i *r) +enum merge_result bch2_extent_merge(struct bch_fs *c, + struct bkey_s _l, struct bkey_s _r) { - struct bkey_s_extent el, er; - union bch_extent_entry *en_l, *en_r; + struct bkey_s_extent l = bkey_s_to_extent(_l); + struct bkey_s_extent r = bkey_s_to_extent(_r); + union bch_extent_entry *en_l = l.v->start; + union bch_extent_entry *en_r = r.v->start; + struct bch_extent_crc_unpacked crc_l, crc_r; - if (key_merging_disabled(c)) + if (bkey_val_u64s(l.k) != bkey_val_u64s(r.k)) return BCH_MERGE_NOMERGE; - /* - * Generic header checks - * Assumes left and right are in order - * Left and right must be exactly aligned - */ - - if (l->k.u64s != r->k.u64s || - l->k.type != r->k.type || - bversion_cmp(l->k.version, r->k.version) || - bkey_cmp(l->k.p, bkey_start_pos(&r->k))) - return BCH_MERGE_NOMERGE; + crc_l = bch2_extent_crc_unpack(l.k, NULL); - switch (l->k.type) { - case KEY_TYPE_DELETED: - case KEY_TYPE_DISCARD: - case KEY_TYPE_ERROR: - /* These types are mergeable, and no val to check */ - break; + extent_for_each_entry(l, en_l) { + en_r = vstruct_idx(r.v, (u64 *) en_l - l.v->_data); - case BCH_EXTENT: - case BCH_EXTENT_CACHED: - el = bkey_i_to_s_extent(l); - er = bkey_i_to_s_extent(r); + if (extent_entry_type(en_l) != extent_entry_type(en_r)) + return BCH_MERGE_NOMERGE; - extent_for_each_entry(el, en_l) { - struct bch_extent_ptr *lp, *rp; + switch (extent_entry_type(en_l)) { + case BCH_EXTENT_ENTRY_ptr: { + const struct bch_extent_ptr *lp = &en_l->ptr; + const struct bch_extent_ptr *rp = &en_r->ptr; struct bch_dev *ca; - en_r = vstruct_idx(er.v, (u64 *) en_l - el.v->_data); - - if ((extent_entry_type(en_l) != - extent_entry_type(en_r)) || - extent_entry_is_crc(en_l)) - return BCH_MERGE_NOMERGE; - - lp = &en_l->ptr; - rp = &en_r->ptr; - - if (lp->offset + el.k->size != rp->offset || + if (lp->offset + crc_l.compressed_size != rp->offset || lp->dev != rp->dev || lp->gen != rp->gen) return BCH_MERGE_NOMERGE; @@ -2134,160 +1554,75 @@ enum merge_result bch2_extent_merge(struct bch_fs *c, struct btree *b, if (PTR_BUCKET_NR(ca, lp) != PTR_BUCKET_NR(ca, rp)) return BCH_MERGE_NOMERGE; - } - - break; - case BCH_RESERVATION: { - struct bkey_i_reservation *li = bkey_i_to_reservation(l); - struct bkey_i_reservation *ri = bkey_i_to_reservation(r); - - if (li->v.generation != ri->v.generation || - li->v.nr_replicas != ri->v.nr_replicas) - return BCH_MERGE_NOMERGE; - break; - } - default: - return BCH_MERGE_NOMERGE; - } - l->k.needs_whiteout |= r->k.needs_whiteout; - - /* Keys with no pointers aren't restricted to one bucket and could - * overflow KEY_SIZE - */ - if ((u64) l->k.size + r->k.size > KEY_SIZE_MAX) { - bch2_key_resize(&l->k, KEY_SIZE_MAX); - bch2_cut_front(l->k.p, r); - return BCH_MERGE_PARTIAL; - } + break; + } + case BCH_EXTENT_ENTRY_stripe_ptr: + if (en_l->stripe_ptr.block != en_r->stripe_ptr.block || + en_l->stripe_ptr.idx != 
en_r->stripe_ptr.idx) + return BCH_MERGE_NOMERGE; + break; + case BCH_EXTENT_ENTRY_crc32: + case BCH_EXTENT_ENTRY_crc64: + case BCH_EXTENT_ENTRY_crc128: + crc_l = bch2_extent_crc_unpack(l.k, entry_to_crc(en_l)); + crc_r = bch2_extent_crc_unpack(r.k, entry_to_crc(en_r)); - bch2_key_resize(&l->k, l->k.size + r->k.size); + if (crc_l.csum_type != crc_r.csum_type || + crc_l.compression_type != crc_r.compression_type || + crc_l.nonce != crc_r.nonce) + return BCH_MERGE_NOMERGE; - return BCH_MERGE_MERGE; -} + if (crc_l.offset + crc_l.live_size != crc_l.compressed_size || + crc_r.offset) + return BCH_MERGE_NOMERGE; -static void extent_i_save(struct btree *b, struct bkey_packed *dst, - struct bkey_i *src) -{ - struct bkey_format *f = &b->format; - struct bkey_i *dst_unpacked; + if (!bch2_checksum_mergeable(crc_l.csum_type)) + return BCH_MERGE_NOMERGE; - BUG_ON(bkeyp_val_u64s(f, dst) != bkey_val_u64s(&src->k)); + if (crc_l.compression_type) + return BCH_MERGE_NOMERGE; - /* - * We don't want the bch2_verify_key_order() call in extent_save(), - * because we may be out of order with deleted keys that are about to be - * removed by extent_bset_insert() - */ + if (crc_l.csum_type && + crc_l.uncompressed_size + + crc_r.uncompressed_size > c->sb.encoded_extent_max) + return BCH_MERGE_NOMERGE; - if ((dst_unpacked = packed_to_bkey(dst))) - bkey_copy(dst_unpacked, src); - else - BUG_ON(!bch2_bkey_pack(dst, src, f)); -} + if (crc_l.uncompressed_size + crc_r.uncompressed_size - 1 > + bch2_crc_field_size_max[extent_entry_type(en_l)]) + return BCH_MERGE_NOMERGE; -static bool extent_merge_one_overlapping(struct btree_iter *iter, - struct bpos new_pos, - struct bset_tree *t, - struct bkey_packed *k, struct bkey uk, - bool check, bool could_pack) -{ - struct btree_iter_level *l = &iter->l[0]; + break; + default: + return BCH_MERGE_NOMERGE; + } + } - BUG_ON(!bkey_deleted(k)); + extent_for_each_entry(l, en_l) { + struct bch_extent_crc_unpacked crc_l, crc_r; - if (check) { - return !bkey_packed(k) || could_pack; - } else { - uk.p = new_pos; - extent_save(l->b, &l->iter, k, &uk); - bch2_bset_fix_invalidated_key(l->b, t, k); - bch2_btree_node_iter_fix(iter, l->b, &l->iter, t, - k, k->u64s, k->u64s); - return true; - } -} + en_r = vstruct_idx(r.v, (u64 *) en_l - l.v->_data); -static bool extent_merge_do_overlapping(struct btree_iter *iter, - struct bkey *m, bool back_merge) -{ - struct btree_iter_level *l = &iter->l[0]; - struct btree *b = l->b; - struct btree_node_iter *node_iter = &l->iter; - struct bset_tree *t; - struct bkey_packed *k; - struct bkey uk; - struct bpos new_pos = back_merge ? m->p : bkey_start_pos(m); - bool could_pack = bkey_pack_pos((void *) &uk, new_pos, b); - bool check = true; + if (!extent_entry_is_crc(en_l)) + continue; - /* - * @m is the new merged extent: - * - * The merge took place in the last bset; we know there can't be any 0 - * size extents overlapping with m there because if so they would have - * been between the two extents we merged. - * - * But in the other bsets, we have to check for and fix such extents: - */ -do_fixup: - for_each_bset(b, t) { - if (t == bset_tree_last(b)) - break; + crc_l = bch2_extent_crc_unpack(l.k, entry_to_crc(en_l)); + crc_r = bch2_extent_crc_unpack(r.k, entry_to_crc(en_r)); - /* - * if we don't find this bset in the iterator we already got to - * the end of that bset, so start searching from the end. 
- */ - k = bch2_btree_node_iter_bset_pos(node_iter, b, t); + crc_l.csum = bch2_checksum_merge(crc_l.csum_type, + crc_l.csum, + crc_r.csum, + crc_r.uncompressed_size << 9); - if (k == btree_bkey_last(b, t)) - k = bch2_bkey_prev_all(b, t, k); - if (!k) - continue; + crc_l.uncompressed_size += crc_r.uncompressed_size; + crc_l.compressed_size += crc_r.compressed_size; - if (back_merge) { - /* - * Back merge: 0 size extents will be before the key - * that was just inserted (and thus the iterator - * position) - walk backwards to find them - */ - for (; - k && - (uk = bkey_unpack_key(b, k), - bkey_cmp(uk.p, bkey_start_pos(m)) > 0); - k = bch2_bkey_prev_all(b, t, k)) { - if (bkey_cmp(uk.p, m->p) >= 0) - continue; - - if (!extent_merge_one_overlapping(iter, new_pos, - t, k, uk, check, could_pack)) - return false; - } - } else { - /* Front merge - walk forwards */ - for (; - k != btree_bkey_last(b, t) && - (uk = bkey_unpack_key(b, k), - bkey_cmp(uk.p, m->p) < 0); - k = bkey_next(k)) { - if (bkey_cmp(uk.p, - bkey_start_pos(m)) <= 0) - continue; - - if (!extent_merge_one_overlapping(iter, new_pos, - t, k, uk, check, could_pack)) - return false; - } - } + bch2_extent_crc_pack(entry_to_crc(en_l), crc_l); } - if (check) { - check = false; - goto do_fixup; - } + bch2_key_resize(l.k, l.k->size + r.k->size); - return true; + return BCH_MERGE_MERGE; } /* @@ -2306,13 +1641,17 @@ static bool bch2_extent_merge_inline(struct bch_fs *c, { struct btree *b = iter->l[0].b; struct btree_node_iter *node_iter = &iter->l[0].iter; - const struct bkey_format *f = &b->format; - struct bset_tree *t = bset_tree_last(b); - struct bkey_packed *m; - BKEY_PADDED(k) li; - BKEY_PADDED(k) ri; - struct bkey_i *mi; - struct bkey tmp; + BKEY_PADDED(k) li, ri; + struct bkey_packed *m = back_merge ? l : r; + struct bkey_i *mi = back_merge ? &li.k : &ri.k; + struct bset_tree *t = bch2_bkey_to_bset(b, m); + enum merge_result ret; + + EBUG_ON(bkey_written(b, m)); + + if (bkey_val_u64s(l) > BKEY_EXTENT_VAL_U64s_MAX || + bkey_val_u64s(r) > BKEY_EXTENT_VAL_U64s_MAX) + return BCH_MERGE_NOMERGE; /* * We need to save copies of both l and r, because we might get a @@ -2321,79 +1660,147 @@ static bool bch2_extent_merge_inline(struct bch_fs *c, bch2_bkey_unpack(b, &li.k, l); bch2_bkey_unpack(b, &ri.k, r); - m = back_merge ? l : r; - mi = back_merge ? 
&li.k : &ri.k; + ret = bch2_bkey_merge(c, + bkey_i_to_s(&li.k), + bkey_i_to_s(&ri.k)); + if (ret == BCH_MERGE_NOMERGE) + return false; - /* l & r should be in last bset: */ - EBUG_ON(bch2_bkey_to_bset(b, m) != t); + /* + * check if we overlap with deleted extents - would break the sort + * order: + */ + if (back_merge) { + struct bkey_packed *n = bkey_next(m); - switch (bch2_extent_merge(c, b, &li.k, &ri.k)) { - case BCH_MERGE_NOMERGE: - return false; - case BCH_MERGE_PARTIAL: - if (bkey_packed(m) && !bch2_bkey_pack_key((void *) &tmp, &mi->k, f)) + if (n != btree_bkey_last(b, t) && + bkey_cmp_left_packed(b, n, &li.k.k.p) <= 0 && + bkey_deleted(n)) return false; + } else if (ret == BCH_MERGE_MERGE) { + struct bkey_packed *prev = bch2_bkey_prev_all(b, t, m); - if (!extent_merge_do_overlapping(iter, &li.k.k, back_merge)) + if (prev && + bkey_cmp_left_packed_byval(b, prev, + bkey_start_pos(&li.k.k)) > 0) return false; + } - extent_i_save(b, m, mi); - bch2_bset_fix_invalidated_key(b, t, m); - - /* - * Update iterator to reflect what we just inserted - otherwise, - * the iter_fix() call is going to put us _before_ the key we - * just partially merged with: - */ - if (back_merge) - bch2_btree_iter_set_pos_same_leaf(iter, li.k.k.p); - - bch2_btree_node_iter_fix(iter, b, node_iter, - t, m, m->u64s, m->u64s); + if (ret == BCH_MERGE_PARTIAL) { + if (!extent_i_save(b, m, mi)) + return false; if (!back_merge) bkey_copy(packed_to_bkey(l), &li.k); else bkey_copy(packed_to_bkey(r), &ri.k); - return false; - case BCH_MERGE_MERGE: - if (bkey_packed(m) && !bch2_bkey_pack_key((void *) &tmp, &li.k.k, f)) - return false; - - if (!extent_merge_do_overlapping(iter, &li.k.k, back_merge)) + } else { + if (!extent_i_save(b, m, &li.k)) return false; + } - extent_i_save(b, m, &li.k); - bch2_bset_fix_invalidated_key(b, t, m); + bch2_bset_fix_invalidated_key(b, m); + bch2_btree_node_iter_fix(iter, b, node_iter, + m, m->u64s, m->u64s); + verify_modified_extent(iter, m); - bch2_btree_node_iter_fix(iter, b, node_iter, - t, m, m->u64s, m->u64s); - return true; - default: - BUG(); - } + return ret == BCH_MERGE_MERGE; } -int bch2_check_range_allocated(struct bch_fs *c, struct bpos pos, u64 size) +bool bch2_check_range_allocated(struct bch_fs *c, struct bpos pos, u64 size, + unsigned nr_replicas) { - struct btree_iter iter; + struct btree_trans trans; + struct btree_iter *iter; struct bpos end = pos; struct bkey_s_c k; - int ret = 0; + bool ret = true; + int err; end.offset += size; - for_each_btree_key(&iter, c, BTREE_ID_EXTENTS, pos, - BTREE_ITER_SLOTS, k) { + bch2_trans_init(&trans, c, 0, 0); + + for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, pos, + BTREE_ITER_SLOTS, k, err) { if (bkey_cmp(bkey_start_pos(k.k), end) >= 0) break; - if (!bch2_extent_is_fully_allocated(k)) { - ret = -ENOSPC; + if (nr_replicas > bch2_bkey_nr_ptrs_allocated(k)) { + ret = false; break; } } - bch2_btree_iter_unlock(&iter); + bch2_trans_exit(&trans); return ret; } + +unsigned bch2_bkey_nr_ptrs_allocated(struct bkey_s_c k) +{ + unsigned ret = 0; + + switch (k.k->type) { + case KEY_TYPE_extent: { + struct bkey_s_c_extent e = bkey_s_c_to_extent(k); + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; + + extent_for_each_ptr_decode(e, p, entry) + ret += !p.ptr.cached && + p.crc.compression_type == BCH_COMPRESSION_NONE; + break; + } + case KEY_TYPE_reservation: + ret = bkey_s_c_to_reservation(k).v->nr_replicas; + break; + } + + return ret; +} + +/* KEY_TYPE_reservation: */ + +const char *bch2_reservation_invalid(const struct bch_fs 
*c, struct bkey_s_c k) +{ + struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k); + + if (bkey_val_bytes(k.k) != sizeof(struct bch_reservation)) + return "incorrect value size"; + + if (!r.v->nr_replicas || r.v->nr_replicas > BCH_REPLICAS_MAX) + return "invalid nr_replicas"; + + return NULL; +} + +void bch2_reservation_to_text(struct printbuf *out, struct bch_fs *c, + struct bkey_s_c k) +{ + struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k); + + pr_buf(out, "generation %u replicas %u", + le32_to_cpu(r.v->generation), + r.v->nr_replicas); +} + +enum merge_result bch2_reservation_merge(struct bch_fs *c, + struct bkey_s _l, struct bkey_s _r) +{ + struct bkey_s_reservation l = bkey_s_to_reservation(_l); + struct bkey_s_reservation r = bkey_s_to_reservation(_r); + + if (l.v->generation != r.v->generation || + l.v->nr_replicas != r.v->nr_replicas) + return BCH_MERGE_NOMERGE; + + if ((u64) l.k->size + r.k->size > KEY_SIZE_MAX) { + bch2_key_resize(l.k, KEY_SIZE_MAX); + __bch2_cut_front(l.k->p, r.s); + return BCH_MERGE_PARTIAL; + } + + bch2_key_resize(l.k, l.k->size + r.k->size); + + return BCH_MERGE_MERGE; +} diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h index 08ad96472406..fe92737354bd 100644 --- a/fs/bcachefs/extents.h +++ b/fs/bcachefs/extents.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _BCACHEFS_EXTENTS_H #define _BCACHEFS_EXTENTS_H @@ -6,129 +7,37 @@ #include "extents_types.h" struct bch_fs; -struct journal_res; -struct btree_node_iter; -struct btree_node_iter_large; -struct btree_insert; +struct btree_trans; struct btree_insert_entry; -struct extent_insert_hook; -struct bch_devs_mask; -union bch_extent_crc; -const char *bch2_btree_ptr_invalid(const struct bch_fs *, struct bkey_s_c); -void bch2_btree_ptr_debugcheck(struct bch_fs *, struct btree *, - struct bkey_s_c); -void bch2_btree_ptr_to_text(struct bch_fs *, char *, size_t, struct bkey_s_c); -void bch2_ptr_swab(const struct bkey_format *, struct bkey_packed *); - -#define bch2_bkey_btree_ops (struct bkey_ops) { \ - .key_invalid = bch2_btree_ptr_invalid, \ - .key_debugcheck = bch2_btree_ptr_debugcheck, \ - .val_to_text = bch2_btree_ptr_to_text, \ - .swab = bch2_ptr_swab, \ -} - -const char *bch2_extent_invalid(const struct bch_fs *, struct bkey_s_c); -void bch2_extent_debugcheck(struct bch_fs *, struct btree *, struct bkey_s_c); -void bch2_extent_to_text(struct bch_fs *, char *, size_t, struct bkey_s_c); -bool bch2_ptr_normalize(struct bch_fs *, struct btree *, struct bkey_s); -enum merge_result bch2_extent_merge(struct bch_fs *, struct btree *, - struct bkey_i *, struct bkey_i *); - -#define bch2_bkey_extent_ops (struct bkey_ops) { \ - .key_invalid = bch2_extent_invalid, \ - .key_debugcheck = bch2_extent_debugcheck, \ - .val_to_text = bch2_extent_to_text, \ - .swab = bch2_ptr_swab, \ - .key_normalize = bch2_ptr_normalize, \ - .key_merge = bch2_extent_merge, \ - .is_extents = true, \ -} - -struct btree_nr_keys bch2_key_sort_fix_overlapping(struct bset *, - struct btree *, - struct btree_node_iter_large *); -struct btree_nr_keys bch2_extent_sort_fix_overlapping(struct bch_fs *c, - struct bset *, - struct btree *, - struct btree_node_iter_large *); - -int bch2_btree_pick_ptr(struct bch_fs *, const struct btree *, - struct bch_devs_mask *avoid, - struct extent_pick_ptr *); - -int bch2_extent_pick_ptr(struct bch_fs *, struct bkey_s_c, - struct bch_devs_mask *, - struct extent_pick_ptr *); - -enum btree_insert_ret -bch2_insert_fixup_extent(struct btree_insert *, - struct btree_insert_entry *); - 
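[Annotation on the bch2_reservation_merge() hunk above: two adjacent reservations merge only when generation and nr_replicas agree; if their combined size would overflow KEY_SIZE_MAX, the left key is grown to the cap and the right key is trimmed from the front, yielding BCH_MERGE_PARTIAL. A toy sketch of that size handling follows - toy_* names are mine, key_size_max is passed in rather than using the kernel's KEY_SIZE_MAX constant, and the types are simplified:

#include <stdint.h>

enum toy_merge { TOY_NOMERGE, TOY_PARTIAL, TOY_MERGE };

struct toy_res {
	uint64_t	start, size;	/* covers [start, start + size) */
	uint32_t	generation;
	uint8_t		nr_replicas;
};

static enum toy_merge toy_res_merge(struct toy_res *l, struct toy_res *r,
				    uint64_t key_size_max)
{
	/* must be value-identical and physically adjacent to merge */
	if (l->generation != r->generation ||
	    l->nr_replicas != r->nr_replicas ||
	    l->start + l->size != r->start)
		return TOY_NOMERGE;

	if (l->size + r->size > key_size_max) {
		uint64_t grow = key_size_max - l->size;

		/* grow l to the cap, cut the front off r */
		l->size	  = key_size_max;
		r->start += grow;
		r->size	 -= grow;
		return TOY_PARTIAL;
	}

	l->size += r->size;
	return TOY_MERGE;
}

End of annotation; the extents.h diff continues below.]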
-bool bch2_extent_normalize(struct bch_fs *, struct bkey_s); -void bch2_extent_mark_replicas_cached(struct bch_fs *, struct bkey_s_extent, - unsigned, unsigned); - -const struct bch_extent_ptr * -bch2_extent_has_device(struct bkey_s_c_extent, unsigned); -bool bch2_extent_drop_device(struct bkey_s_extent, unsigned); -const struct bch_extent_ptr * -bch2_extent_has_group(struct bch_fs *, struct bkey_s_c_extent, unsigned); -const struct bch_extent_ptr * -bch2_extent_has_target(struct bch_fs *, struct bkey_s_c_extent, unsigned); - -unsigned bch2_extent_nr_ptrs(struct bkey_s_c_extent); -unsigned bch2_extent_nr_dirty_ptrs(struct bkey_s_c); -unsigned bch2_extent_is_compressed(struct bkey_s_c); - -unsigned bch2_extent_ptr_durability(struct bch_fs *, - const struct bch_extent_ptr *); -unsigned bch2_extent_durability(struct bch_fs *, struct bkey_s_c_extent); - -bool bch2_extent_matches_ptr(struct bch_fs *, struct bkey_s_c_extent, - struct bch_extent_ptr, u64); - -static inline bool bkey_extent_is_data(const struct bkey *k) -{ - switch (k->type) { - case BCH_EXTENT: - case BCH_EXTENT_CACHED: - return true; - default: - return false; - } -} +/* extent entries: */ -static inline bool bkey_extent_is_allocation(const struct bkey *k) -{ - switch (k->type) { - case BCH_EXTENT: - case BCH_EXTENT_CACHED: - case BCH_RESERVATION: - return true; - default: - return false; - } -} - -static inline bool bch2_extent_is_fully_allocated(struct bkey_s_c k) -{ - return bkey_extent_is_allocation(k.k) && - !bch2_extent_is_compressed(k); -} - -static inline bool bkey_extent_is_cached(const struct bkey *k) -{ - return k->type == BCH_EXTENT_CACHED; -} +#define extent_entry_last(_e) bkey_val_end(_e) -static inline void bkey_extent_set_cached(struct bkey *k, bool cached) -{ - EBUG_ON(k->type != BCH_EXTENT && - k->type != BCH_EXTENT_CACHED); +#define entry_to_ptr(_entry) \ +({ \ + EBUG_ON((_entry) && !extent_entry_is_ptr(_entry)); \ + \ + __builtin_choose_expr( \ + type_is_exact(_entry, const union bch_extent_entry *), \ + (const struct bch_extent_ptr *) (_entry), \ + (struct bch_extent_ptr *) (_entry)); \ +}) - k->type = cached ? 
BCH_EXTENT_CACHED : BCH_EXTENT; -} +/* downcast, preserves const */ +#define to_entry(_entry) \ +({ \ + BUILD_BUG_ON(!type_is(_entry, union bch_extent_crc *) && \ + !type_is(_entry, struct bch_extent_ptr *) && \ + !type_is(_entry, struct bch_extent_stripe_ptr *)); \ + \ + __builtin_choose_expr( \ + (type_is_exact(_entry, const union bch_extent_crc *) || \ + type_is_exact(_entry, const struct bch_extent_ptr *) ||\ + type_is_exact(_entry, const struct bch_extent_stripe_ptr *)),\ + (const union bch_extent_entry *) (_entry), \ + (union bch_extent_entry *) (_entry)); \ +}) static inline unsigned __extent_entry_type(const union bch_extent_entry *e) @@ -149,14 +58,11 @@ extent_entry_type(const union bch_extent_entry *e) static inline size_t extent_entry_bytes(const union bch_extent_entry *entry) { switch (extent_entry_type(entry)) { - case BCH_EXTENT_ENTRY_crc32: - return sizeof(struct bch_extent_crc32); - case BCH_EXTENT_ENTRY_crc64: - return sizeof(struct bch_extent_crc64); - case BCH_EXTENT_ENTRY_crc128: - return sizeof(struct bch_extent_crc128); - case BCH_EXTENT_ENTRY_ptr: - return sizeof(struct bch_extent_ptr); +#define x(f, n) \ + case BCH_EXTENT_ENTRY_##f: \ + return sizeof(struct bch_extent_##f); + BCH_EXTENT_ENTRY_TYPES() +#undef x default: BUG(); } @@ -169,12 +75,24 @@ static inline size_t extent_entry_u64s(const union bch_extent_entry *entry) static inline bool extent_entry_is_ptr(const union bch_extent_entry *e) { - return extent_entry_type(e) == BCH_EXTENT_ENTRY_ptr; + switch (extent_entry_type(e)) { + case BCH_EXTENT_ENTRY_ptr: + return true; + default: + return false; + } } static inline bool extent_entry_is_crc(const union bch_extent_entry *e) { - return !extent_entry_is_ptr(e); + switch (extent_entry_type(e)) { + case BCH_EXTENT_ENTRY_crc32: + case BCH_EXTENT_ENTRY_crc64: + case BCH_EXTENT_ENTRY_crc128: + return true; + default: + return false; + } } union bch_extent_crc { @@ -184,19 +102,6 @@ union bch_extent_crc { struct bch_extent_crc128 crc128; }; -/* downcast, preserves const */ -#define to_entry(_entry) \ -({ \ - BUILD_BUG_ON(!type_is(_entry, union bch_extent_crc *) && \ - !type_is(_entry, struct bch_extent_ptr *)); \ - \ - __builtin_choose_expr( \ - (type_is_exact(_entry, const union bch_extent_crc *) || \ - type_is_exact(_entry, const struct bch_extent_ptr *)), \ - (const union bch_extent_entry *) (_entry), \ - (union bch_extent_entry *) (_entry)); \ -}) - #define __entry_to_crc(_entry) \ __builtin_choose_expr( \ type_is_exact(_entry, const union bch_extent_entry *), \ @@ -210,56 +115,6 @@ union bch_extent_crc { __entry_to_crc(_entry); \ }) -#define entry_to_ptr(_entry) \ -({ \ - EBUG_ON((_entry) && !extent_entry_is_ptr(_entry)); \ - \ - __builtin_choose_expr( \ - type_is_exact(_entry, const union bch_extent_entry *), \ - (const struct bch_extent_ptr *) (_entry), \ - (struct bch_extent_ptr *) (_entry)); \ -}) - -/* checksum entries: */ - -enum bch_extent_crc_type { - BCH_EXTENT_CRC_NONE, - BCH_EXTENT_CRC32, - BCH_EXTENT_CRC64, - BCH_EXTENT_CRC128, -}; - -static inline enum bch_extent_crc_type -__extent_crc_type(const union bch_extent_crc *crc) -{ - if (!crc) - return BCH_EXTENT_CRC_NONE; - - switch (extent_entry_type(to_entry(crc))) { - case BCH_EXTENT_ENTRY_crc32: - return BCH_EXTENT_CRC32; - case BCH_EXTENT_ENTRY_crc64: - return BCH_EXTENT_CRC64; - case BCH_EXTENT_ENTRY_crc128: - return BCH_EXTENT_CRC128; - default: - BUG(); - } -} - -#define extent_crc_type(_crc) \ -({ \ - BUILD_BUG_ON(!type_is(_crc, struct bch_extent_crc32 *) && \ - !type_is(_crc, struct 
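
[Editor's note] The extent_entry_bytes() hunk above replaces a hand-written switch with an x-macro: BCH_EXTENT_ENTRY_TYPES() expands a caller-supplied x() once per entry type, so the enum and every switch over it stay in sync from a single list. A reduced standalone example of the pattern; the type list and struct names below are invented for illustration:

	#include <stdio.h>

	/* hypothetical reduced list; the real one lives in bcachefs_format.h */
	#define ENTRY_TYPES()	\
		x(ptr,    0)	\
		x(crc32,  1)	\
		x(crc64,  2)

	struct entry_ptr   { unsigned long v[1]; };
	struct entry_crc32 { unsigned long v[1]; };
	struct entry_crc64 { unsigned long v[2]; };

	enum entry_type {
	#define x(f, n)	ENTRY_##f = n,
		ENTRY_TYPES()
	#undef x
	};

	static size_t entry_bytes(enum entry_type t)
	{
		switch (t) {
	#define x(f, n)	case ENTRY_##f: return sizeof(struct entry_##f);
		ENTRY_TYPES()
	#undef x
		}
		return 0;
	}

	int main(void)
	{
		printf("%zu %zu\n", entry_bytes(ENTRY_ptr), entry_bytes(ENTRY_crc64));
		return 0;
	}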
bch_extent_crc64 *) && \ - !type_is(_crc, struct bch_extent_crc128 *) && \ - !type_is(_crc, union bch_extent_crc *)); \ - \ - type_is(_crc, struct bch_extent_crc32 *) ? BCH_EXTENT_CRC32 \ - : type_is(_crc, struct bch_extent_crc64 *) ? BCH_EXTENT_CRC64 \ - : type_is(_crc, struct bch_extent_crc128 *) ? BCH_EXTENT_CRC128 \ - : __extent_crc_type((union bch_extent_crc *) _crc); \ -}) - static inline struct bch_extent_crc_unpacked bch2_extent_crc_unpack(const struct bkey *k, const union bch_extent_crc *crc) { @@ -271,14 +126,15 @@ bch2_extent_crc_unpack(const struct bkey *k, const union bch_extent_crc *crc) .offset = _crc.offset, \ .live_size = k->size - switch (extent_crc_type(crc)) { - case BCH_EXTENT_CRC_NONE: + if (!crc) return (struct bch_extent_crc_unpacked) { .compressed_size = k->size, .uncompressed_size = k->size, .live_size = k->size, }; - case BCH_EXTENT_CRC32: { + + switch (extent_entry_type(to_entry(crc))) { + case BCH_EXTENT_ENTRY_crc32: { struct bch_extent_crc_unpacked ret = (struct bch_extent_crc_unpacked) { common_fields(crc->crc32), }; @@ -290,7 +146,7 @@ bch2_extent_crc_unpack(const struct bkey *k, const union bch_extent_crc *crc) return ret; } - case BCH_EXTENT_CRC64: { + case BCH_EXTENT_ENTRY_crc64: { struct bch_extent_crc_unpacked ret = (struct bch_extent_crc_unpacked) { common_fields(crc->crc64), .nonce = crc->crc64.nonce, @@ -301,7 +157,7 @@ bch2_extent_crc_unpack(const struct bkey *k, const union bch_extent_crc *crc) return ret; } - case BCH_EXTENT_CRC128: { + case BCH_EXTENT_ENTRY_crc128: { struct bch_extent_crc_unpacked ret = (struct bch_extent_crc_unpacked) { common_fields(crc->crc128), .nonce = crc->crc128.nonce, @@ -316,223 +172,403 @@ bch2_extent_crc_unpack(const struct bkey *k, const union bch_extent_crc *crc) #undef common_fields } -/* Extent entry iteration: */ - -#define extent_entry_next(_entry) \ - ((typeof(_entry)) ((void *) (_entry) + extent_entry_bytes(_entry))) +/* bkey_ptrs: generically over any key type that has ptrs */ -#define extent_entry_last(_e) \ - vstruct_idx((_e).v, bkey_val_u64s((_e).k)) +struct bkey_ptrs_c { + const union bch_extent_entry *start; + const union bch_extent_entry *end; +}; -/* Iterate over all entries: */ +struct bkey_ptrs { + union bch_extent_entry *start; + union bch_extent_entry *end; +}; -#define extent_for_each_entry_from(_e, _entry, _start) \ - for ((_entry) = _start; \ - (_entry) < extent_entry_last(_e); \ - (_entry) = extent_entry_next(_entry)) +/* iterate over bkey ptrs */ -#define extent_for_each_entry(_e, _entry) \ - extent_for_each_entry_from(_e, _entry, (_e).v->start) +#define extent_entry_next(_entry) \ + ((typeof(_entry)) ((void *) (_entry) + extent_entry_bytes(_entry))) -/* Iterate over crcs only: */ +#define __bkey_extent_entry_for_each_from(_start, _end, _entry) \ + for ((_entry) = (_start); \ + (_entry) < (_end); \ + (_entry) = extent_entry_next(_entry)) -#define __extent_crc_next(_e, _p) \ +#define __bkey_ptr_next(_ptr, _end) \ ({ \ - typeof(&(_e).v->start[0]) _entry = _p; \ + typeof(_end) _entry; \ \ - while ((_entry) < extent_entry_last(_e) && \ - !extent_entry_is_crc(_entry)) \ - (_entry) = extent_entry_next(_entry); \ + __bkey_extent_entry_for_each_from(to_entry(_ptr), _end, _entry) \ + if (extent_entry_is_ptr(_entry)) \ + break; \ \ - entry_to_crc(_entry < extent_entry_last(_e) ? _entry : NULL); \ + _entry < (_end) ? 
entry_to_ptr(_entry) : NULL; \ }) -#define __extent_for_each_crc(_e, _crc) \ - for ((_crc) = __extent_crc_next(_e, (_e).v->start); \ - (_crc); \ - (_crc) = __extent_crc_next(_e, extent_entry_next(to_entry(_crc)))) +#define bkey_extent_entry_for_each_from(_p, _entry, _start) \ + __bkey_extent_entry_for_each_from(_start, (_p).end, _entry) -#define extent_crc_next(_e, _crc, _iter) \ -({ \ - extent_for_each_entry_from(_e, _iter, _iter) \ - if (extent_entry_is_crc(_iter)) { \ - (_crc) = bch2_extent_crc_unpack((_e).k, entry_to_crc(_iter));\ - break; \ - } \ - \ - (_iter) < extent_entry_last(_e); \ -}) +#define bkey_extent_entry_for_each(_p, _entry) \ + bkey_extent_entry_for_each_from(_p, _entry, _p.start) -#define extent_for_each_crc(_e, _crc, _iter) \ - for ((_crc) = bch2_extent_crc_unpack((_e).k, NULL), \ - (_iter) = (_e).v->start; \ - extent_crc_next(_e, _crc, _iter); \ - (_iter) = extent_entry_next(_iter)) +#define __bkey_for_each_ptr(_start, _end, _ptr) \ + for ((_ptr) = (_start); \ + ((_ptr) = __bkey_ptr_next(_ptr, _end)); \ + (_ptr)++) -/* Iterate over pointers, with crcs: */ +#define bkey_ptr_next(_p, _ptr) \ + __bkey_ptr_next(_ptr, (_p).end) -#define extent_ptr_crc_next(_e, _ptr, _crc) \ +#define bkey_for_each_ptr(_p, _ptr) \ + __bkey_for_each_ptr(&(_p).start->ptr, (_p).end, _ptr) + +#define __bkey_ptr_next_decode(_k, _end, _ptr, _entry) \ ({ \ __label__ out; \ - typeof(&(_e).v->start[0]) _entry; \ \ - extent_for_each_entry_from(_e, _entry, to_entry(_ptr)) \ - if (extent_entry_is_crc(_entry)) { \ - (_crc) = bch2_extent_crc_unpack((_e).k, entry_to_crc(_entry));\ - } else { \ - _ptr = entry_to_ptr(_entry); \ + (_ptr).idx = 0; \ + (_ptr).ec_nr = 0; \ + \ + __bkey_extent_entry_for_each_from(_entry, _end, _entry) \ + switch (extent_entry_type(_entry)) { \ + case BCH_EXTENT_ENTRY_ptr: \ + (_ptr).ptr = _entry->ptr; \ goto out; \ + case BCH_EXTENT_ENTRY_crc32: \ + case BCH_EXTENT_ENTRY_crc64: \ + case BCH_EXTENT_ENTRY_crc128: \ + (_ptr).crc = bch2_extent_crc_unpack(_k, \ + entry_to_crc(_entry)); \ + break; \ + case BCH_EXTENT_ENTRY_stripe_ptr: \ + (_ptr).ec[(_ptr).ec_nr++] = _entry->stripe_ptr; \ + break; \ } \ - \ - _ptr = NULL; \ out: \ - _ptr; \ -}) - -#define extent_for_each_ptr_crc(_e, _ptr, _crc) \ - for ((_crc) = bch2_extent_crc_unpack((_e).k, NULL), \ - (_ptr) = &(_e).v->start->ptr; \ - ((_ptr) = extent_ptr_crc_next(_e, _ptr, _crc)); \ - (_ptr)++) - -/* Iterate over pointers only, and from a given position: */ - -#define extent_ptr_next(_e, _ptr) \ -({ \ - struct bch_extent_crc_unpacked _crc; \ - \ - extent_ptr_crc_next(_e, _ptr, _crc); \ + _entry < (_end); \ }) -#define extent_for_each_ptr(_e, _ptr) \ - for ((_ptr) = &(_e).v->start->ptr; \ - ((_ptr) = extent_ptr_next(_e, _ptr)); \ - (_ptr)++) +#define __bkey_for_each_ptr_decode(_k, _start, _end, _ptr, _entry) \ + for ((_ptr).crc = bch2_extent_crc_unpack(_k, NULL), \ + (_entry) = _start; \ + __bkey_ptr_next_decode(_k, _end, _ptr, _entry); \ + (_entry) = extent_entry_next(_entry)) -#define extent_ptr_prev(_e, _ptr) \ -({ \ - typeof(&(_e).v->start->ptr) _p; \ - typeof(&(_e).v->start->ptr) _prev = NULL; \ - \ - extent_for_each_ptr(_e, _p) { \ - if (_p == (_ptr)) \ - break; \ - _prev = _p; \ - } \ - \ - _prev; \ -}) +#define bkey_for_each_ptr_decode(_k, _p, _ptr, _entry) \ + __bkey_for_each_ptr_decode(_k, (_p).start, (_p).end, \ + _ptr, _entry) -/* - * Use this when you'll be dropping pointers as you iterate. 
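
[Editor's note] __bkey_ptr_next_decode() above is a small state machine over the packed entry list: crc (and stripe) entries only update decode state carried across iterations, while each ptr entry is yielded to the caller together with the most recent checksum info. A simplified standalone model of that walk, with fixed-size stand-in entries instead of the real variable-length ones:

	#include <stdio.h>

	enum etype { E_CRC, E_PTR };

	struct entry { enum etype type; unsigned val; };	/* crc value or device */
	struct decoded { unsigned dev, crc; };

	/* returns the resume position after a yielded ptr, or NULL when done */
	static const struct entry *
	next_decode(const struct entry *e, const struct entry *end,
		    unsigned *crc, struct decoded *out)
	{
		for (; e < end; e++)
			switch (e->type) {
			case E_CRC:
				*crc = e->val;	/* state: applies to following ptrs */
				break;
			case E_PTR:
				out->dev = e->val;
				out->crc = *crc;
				return e + 1;
			}
		return NULL;
	}

	int main(void)
	{
		struct entry entries[] = {
			{ E_CRC, 0xaa }, { E_PTR, 0 }, { E_PTR, 1 },
			{ E_CRC, 0xbb }, { E_PTR, 2 },
		};
		const struct entry *e = entries, *end = entries + 5;
		unsigned crc = 0;
		struct decoded d;

		while ((e = next_decode(e, end, &crc, &d)))
			printf("dev %u crc %#x\n", d.dev, d.crc);
		return 0;
	}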
Quadratic, - * unfortunately: - */ -#define extent_for_each_ptr_backwards(_e, _ptr) \ - for ((_ptr) = extent_ptr_prev(_e, NULL); \ - (_ptr); \ - (_ptr) = extent_ptr_prev(_e, _ptr)) +/* utility code common to all keys with pointers: */ -void bch2_extent_crc_append(struct bkey_i_extent *, - struct bch_extent_crc_unpacked); - -static inline void __extent_entry_push(struct bkey_i_extent *e) +static inline struct bkey_ptrs_c bch2_bkey_ptrs_c(struct bkey_s_c k) { - union bch_extent_entry *entry = extent_entry_last(extent_i_to_s(e)); - - EBUG_ON(bkey_val_u64s(&e->k) + extent_entry_u64s(entry) > - BKEY_EXTENT_VAL_U64s_MAX); - - e->k.u64s += extent_entry_u64s(entry); + switch (k.k->type) { + case KEY_TYPE_btree_ptr: { + struct bkey_s_c_btree_ptr e = bkey_s_c_to_btree_ptr(k); + return (struct bkey_ptrs_c) { + to_entry(&e.v->start[0]), + to_entry(bkey_val_end(e)) + }; + } + case KEY_TYPE_extent: { + struct bkey_s_c_extent e = bkey_s_c_to_extent(k); + return (struct bkey_ptrs_c) { + e.v->start, + extent_entry_last(e) + }; + } + case KEY_TYPE_stripe: { + struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k); + return (struct bkey_ptrs_c) { + to_entry(&s.v->ptrs[0]), + to_entry(&s.v->ptrs[s.v->nr_blocks]), + }; + } + default: + return (struct bkey_ptrs_c) { NULL, NULL }; + } } -static inline void extent_ptr_append(struct bkey_i_extent *e, - struct bch_extent_ptr ptr) +static inline struct bkey_ptrs bch2_bkey_ptrs(struct bkey_s k) { - ptr.type = 1 << BCH_EXTENT_ENTRY_ptr; - extent_entry_last(extent_i_to_s(e))->ptr = ptr; - __extent_entry_push(e); + struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k.s_c); + + return (struct bkey_ptrs) { + (void *) p.start, + (void *) p.end + }; } -static inline struct bch_devs_list bch2_extent_devs(struct bkey_s_c_extent e) +static inline struct bch_devs_list bch2_bkey_devs(struct bkey_s_c k) { struct bch_devs_list ret = (struct bch_devs_list) { 0 }; + struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k); const struct bch_extent_ptr *ptr; - extent_for_each_ptr(e, ptr) + bkey_for_each_ptr(p, ptr) ret.devs[ret.nr++] = ptr->dev; return ret; } -static inline struct bch_devs_list bch2_extent_dirty_devs(struct bkey_s_c_extent e) +static inline struct bch_devs_list bch2_bkey_dirty_devs(struct bkey_s_c k) { struct bch_devs_list ret = (struct bch_devs_list) { 0 }; + struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k); const struct bch_extent_ptr *ptr; - extent_for_each_ptr(e, ptr) + bkey_for_each_ptr(p, ptr) if (!ptr->cached) ret.devs[ret.nr++] = ptr->dev; return ret; } -static inline struct bch_devs_list bch2_extent_cached_devs(struct bkey_s_c_extent e) +static inline struct bch_devs_list bch2_bkey_cached_devs(struct bkey_s_c k) { struct bch_devs_list ret = (struct bch_devs_list) { 0 }; + struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k); const struct bch_extent_ptr *ptr; - extent_for_each_ptr(e, ptr) + bkey_for_each_ptr(p, ptr) if (ptr->cached) ret.devs[ret.nr++] = ptr->dev; return ret; } -static inline struct bch_devs_list bch2_bkey_devs(struct bkey_s_c k) +static inline bool bch2_bkey_has_device(struct bkey_s_c k, unsigned dev) { - switch (k.k->type) { - case BCH_EXTENT: - case BCH_EXTENT_CACHED: - return bch2_extent_devs(bkey_s_c_to_extent(k)); - default: - return (struct bch_devs_list) { .nr = 0 }; - } + struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k); + const struct bch_extent_ptr *ptr; + + bkey_for_each_ptr(p, ptr) + if (ptr->dev == dev) + return ptr; + + return NULL; } -static inline struct bch_devs_list bch2_bkey_dirty_devs(struct bkey_s_c k) +unsigned bch2_bkey_nr_ptrs(struct bkey_s_c); +unsigned 
bch2_bkey_nr_dirty_ptrs(struct bkey_s_c); +unsigned bch2_bkey_durability(struct bch_fs *, struct bkey_s_c); + +void bch2_mark_io_failure(struct bch_io_failures *, + struct extent_ptr_decoded *); +int bch2_bkey_pick_read_device(struct bch_fs *, struct bkey_s_c, + struct bch_io_failures *, + struct extent_ptr_decoded *); + +void bch2_bkey_ptrs_to_text(struct printbuf *, struct bch_fs *, + struct bkey_s_c); +const char *bch2_bkey_ptrs_invalid(const struct bch_fs *, struct bkey_s_c); + +/* bch_btree_ptr: */ + +const char *bch2_btree_ptr_invalid(const struct bch_fs *, struct bkey_s_c); +void bch2_btree_ptr_debugcheck(struct bch_fs *, struct btree *, + struct bkey_s_c); +void bch2_btree_ptr_to_text(struct printbuf *, struct bch_fs *, + struct bkey_s_c); +void bch2_ptr_swab(const struct bkey_format *, struct bkey_packed *); + +#define bch2_bkey_ops_btree_ptr (struct bkey_ops) { \ + .key_invalid = bch2_btree_ptr_invalid, \ + .key_debugcheck = bch2_btree_ptr_debugcheck, \ + .val_to_text = bch2_btree_ptr_to_text, \ + .swab = bch2_ptr_swab, \ +} + +/* bch_extent: */ + +const char *bch2_extent_invalid(const struct bch_fs *, struct bkey_s_c); +void bch2_extent_debugcheck(struct bch_fs *, struct btree *, struct bkey_s_c); +void bch2_extent_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); +bool bch2_extent_normalize(struct bch_fs *, struct bkey_s); +enum merge_result bch2_extent_merge(struct bch_fs *, + struct bkey_s, struct bkey_s); + +#define bch2_bkey_ops_extent (struct bkey_ops) { \ + .key_invalid = bch2_extent_invalid, \ + .key_debugcheck = bch2_extent_debugcheck, \ + .val_to_text = bch2_extent_to_text, \ + .swab = bch2_ptr_swab, \ + .key_normalize = bch2_extent_normalize, \ + .key_merge = bch2_extent_merge, \ +} + +/* bch_reservation: */ + +const char *bch2_reservation_invalid(const struct bch_fs *, struct bkey_s_c); +void bch2_reservation_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); +enum merge_result bch2_reservation_merge(struct bch_fs *, + struct bkey_s, struct bkey_s); + +#define bch2_bkey_ops_reservation (struct bkey_ops) { \ + .key_invalid = bch2_reservation_invalid, \ + .val_to_text = bch2_reservation_to_text, \ + .key_merge = bch2_reservation_merge, \ +} + +void bch2_extent_trim_atomic(struct bkey_i *, struct btree_iter *); +bool bch2_extent_is_atomic(struct bkey_i *, struct btree_iter *); + +enum btree_insert_ret +bch2_extent_can_insert(struct btree_trans *, struct btree_insert_entry *, + unsigned *); +void bch2_insert_fixup_extent(struct btree_trans *, + struct btree_insert_entry *); + +void bch2_extent_mark_replicas_cached(struct bch_fs *, struct bkey_s_extent, + unsigned, unsigned); + +const struct bch_extent_ptr * +bch2_extent_has_device(struct bkey_s_c_extent, unsigned); +const struct bch_extent_ptr * +bch2_extent_has_group(struct bch_fs *, struct bkey_s_c_extent, unsigned); +const struct bch_extent_ptr * +bch2_extent_has_target(struct bch_fs *, struct bkey_s_c_extent, unsigned); + +unsigned bch2_extent_is_compressed(struct bkey_s_c); + +bool bch2_extent_matches_ptr(struct bch_fs *, struct bkey_s_c_extent, + struct bch_extent_ptr, u64); + +static inline bool bkey_extent_is_data(const struct bkey *k) { - switch (k.k->type) { - case BCH_EXTENT: - case BCH_EXTENT_CACHED: - return bch2_extent_dirty_devs(bkey_s_c_to_extent(k)); + switch (k->type) { + case KEY_TYPE_btree_ptr: + case KEY_TYPE_extent: + return true; default: - return (struct bch_devs_list) { .nr = 0 }; + return false; } } -static inline struct bch_devs_list bch2_bkey_cached_devs(struct bkey_s_c 
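
[Editor's note] The bch2_bkey_ops_* compound literals above populate a per-key-type vtable: each key type supplies only the hooks it implements (key_invalid, val_to_text, key_merge, ...), and generic code dispatches through the table, skipping NULL entries. A toy version of the dispatch, with invented hook signatures:

	#include <stdio.h>

	struct ops {
		const char *(*invalid)(int val);
		void        (*to_text)(int val);
	};

	static const char *res_invalid(int v) { return v ? NULL : "zero replicas"; }
	static void res_to_text(int v) { printf("reservation: %d replicas\n", v); }

	enum key_type { KEY_extent, KEY_reservation, KEY_NR };

	/* per-type table; hooks a type doesn't implement stay NULL */
	static const struct ops ops_table[KEY_NR] = {
		[KEY_reservation] = {
			.invalid = res_invalid,
			.to_text = res_to_text,
		},
	};

	static void print_key(enum key_type t, int v)
	{
		const struct ops *ops = &ops_table[t];
		const char *err = ops->invalid ? ops->invalid(v) : NULL;

		if (err)
			printf("invalid key: %s\n", err);
		else if (ops->to_text)
			ops->to_text(v);
	}

	int main(void)
	{
		print_key(KEY_reservation, 2);
		print_key(KEY_reservation, 0);
		return 0;
	}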
k) +static inline bool bkey_extent_is_allocation(const struct bkey *k) { - switch (k.k->type) { - case BCH_EXTENT: - case BCH_EXTENT_CACHED: - return bch2_extent_cached_devs(bkey_s_c_to_extent(k)); + switch (k->type) { + case KEY_TYPE_extent: + case KEY_TYPE_reservation: + return true; default: - return (struct bch_devs_list) { .nr = 0 }; + return false; } } +static inline bool bch2_extent_is_fully_allocated(struct bkey_s_c k) +{ + return bkey_extent_is_allocation(k.k) && + !bch2_extent_is_compressed(k); +} + +void bch2_bkey_append_ptr(struct bkey_i *, struct bch_extent_ptr); +void bch2_bkey_drop_device(struct bkey_s, unsigned); + +/* Extent entry iteration: */ + +#define extent_for_each_entry_from(_e, _entry, _start) \ + __bkey_extent_entry_for_each_from(_start, \ + extent_entry_last(_e),_entry) + +#define extent_for_each_entry(_e, _entry) \ + extent_for_each_entry_from(_e, _entry, (_e).v->start) + +#define extent_ptr_next(_e, _ptr) \ + __bkey_ptr_next(_ptr, extent_entry_last(_e)) + +#define extent_for_each_ptr(_e, _ptr) \ + __bkey_for_each_ptr(&(_e).v->start->ptr, extent_entry_last(_e), _ptr) + +#define extent_crc_next(_e, _crc, _iter) \ +({ \ + extent_for_each_entry_from(_e, _iter, _iter) \ + if (extent_entry_is_crc(_iter)) { \ + (_crc) = bch2_extent_crc_unpack((_e).k, entry_to_crc(_iter));\ + break; \ + } \ + \ + (_iter) < extent_entry_last(_e); \ +}) + +#define extent_for_each_crc(_e, _crc, _iter) \ + for ((_crc) = bch2_extent_crc_unpack((_e).k, NULL), \ + (_iter) = (_e).v->start; \ + extent_crc_next(_e, _crc, _iter); \ + (_iter) = extent_entry_next(_iter)) + +#define extent_for_each_ptr_decode(_e, _ptr, _entry) \ + __bkey_for_each_ptr_decode((_e).k, (_e).v->start, \ + extent_entry_last(_e), _ptr, _entry) + +void bch2_extent_crc_append(struct bkey_i_extent *, + struct bch_extent_crc_unpacked); +void bch2_extent_ptr_decoded_append(struct bkey_i_extent *, + struct extent_ptr_decoded *); + +static inline void __extent_entry_push(struct bkey_i_extent *e) +{ + union bch_extent_entry *entry = extent_entry_last(extent_i_to_s(e)); + + EBUG_ON(bkey_val_u64s(&e->k) + extent_entry_u64s(entry) > + BKEY_EXTENT_VAL_U64s_MAX); + + e->k.u64s += extent_entry_u64s(entry); +} + bool bch2_can_narrow_extent_crcs(struct bkey_s_c_extent, struct bch_extent_crc_unpacked); bool bch2_extent_narrow_crcs(struct bkey_i_extent *, struct bch_extent_crc_unpacked); -void bch2_extent_drop_redundant_crcs(struct bkey_s_extent); -void __bch2_extent_drop_ptr(struct bkey_s_extent, struct bch_extent_ptr *); -void bch2_extent_drop_ptr(struct bkey_s_extent, struct bch_extent_ptr *); +union bch_extent_entry *bch2_bkey_drop_ptr(struct bkey_s, + struct bch_extent_ptr *); + +#define bch2_bkey_drop_ptrs(_k, _ptr, _cond) \ +do { \ + struct bkey_ptrs _ptrs = bch2_bkey_ptrs(_k); \ + \ + _ptr = &_ptrs.start->ptr; \ + \ + while ((_ptr = bkey_ptr_next(_ptrs, _ptr))) { \ + if (_cond) { \ + _ptr = (void *) bch2_bkey_drop_ptr(_k, _ptr); \ + _ptrs = bch2_bkey_ptrs(_k); \ + continue; \ + } \ + \ + (_ptr)++; \ + } \ +} while (0) + +bool __bch2_cut_front(struct bpos, struct bkey_s); + +static inline bool bch2_cut_front(struct bpos where, struct bkey_i *k) +{ + return __bch2_cut_front(where, bkey_i_to_s(k)); +} -bool bch2_cut_front(struct bpos, struct bkey_i *); bool bch2_cut_back(struct bpos, struct bkey *); void bch2_key_resize(struct bkey *, unsigned); -int bch2_check_range_allocated(struct bch_fs *, struct bpos, u64); +/* + * In extent_sort_fix_overlapping(), insert_fixup_extent(), + * extent_merge_inline() - we're modifying keys in place 
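
[Editor's note] bch2_bkey_drop_ptrs() above has the classic shape of deleting from a packed array while walking it: after a drop the cursor is *not* advanced (and the start/end range is re-fetched), because the following entry has just been shifted into the current position. The same pattern on a plain int array:

	#include <stdio.h>
	#include <string.h>

	static size_t drop_matching(int *v, size_t nr, int bad)
	{
		size_t i = 0;

		while (i < nr) {
			if (v[i] == bad) {
				memmove(&v[i], &v[i + 1],
					(nr - i - 1) * sizeof(*v));
				nr--;
				continue;	/* re-examine slot i */
			}
			i++;
		}
		return nr;
	}

	int main(void)
	{
		int v[] = { 1, 2, 2, 3, 2 };
		size_t nr = drop_matching(v, 5, 2);

		for (size_t i = 0; i < nr; i++)
			printf("%d ", v[i]);
		printf("\n");	/* prints: 1 3 */
		return 0;
	}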
that are packed. To do + * that we have to unpack the key, modify the unpacked key - then this + * copies/repacks the unpacked to the original as necessary. + */ +static inline void extent_save(struct btree *b, struct bkey_packed *dst, + struct bkey *src) +{ + struct bkey_format *f = &b->format; + struct bkey_i *dst_unpacked; + + if ((dst_unpacked = packed_to_bkey(dst))) + dst_unpacked->k = *src; + else + BUG_ON(!bch2_bkey_pack_key(dst, src, f)); +} + +bool bch2_check_range_allocated(struct bch_fs *, struct bpos, u64, unsigned); +unsigned bch2_bkey_nr_ptrs_allocated(struct bkey_s_c); #endif /* _BCACHEFS_EXTENTS_H */ diff --git a/fs/bcachefs/extents_types.h b/fs/bcachefs/extents_types.h index 76139f931fe0..a8dd6952d989 100644 --- a/fs/bcachefs/extents_types.h +++ b/fs/bcachefs/extents_types.h @@ -1,26 +1,40 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _BCACHEFS_EXTENTS_TYPES_H #define _BCACHEFS_EXTENTS_TYPES_H #include "bcachefs_format.h" struct bch_extent_crc_unpacked { + u32 compressed_size; + u32 uncompressed_size; + u32 live_size; + u8 csum_type; u8 compression_type; - u16 compressed_size; - u16 uncompressed_size; - u16 offset; - u16 live_size; u16 nonce; struct bch_csum csum; }; -struct extent_pick_ptr { - struct bch_extent_ptr ptr; +struct extent_ptr_decoded { + unsigned idx; + unsigned ec_nr; struct bch_extent_crc_unpacked crc; + struct bch_extent_ptr ptr; + struct bch_extent_stripe_ptr ec[4]; +}; + +struct bch_io_failures { + u8 nr; + struct bch_dev_io_failures { + u8 dev; + u8 idx; + u8 nr_failed; + u8 nr_retries; + } devs[BCH_REPLICAS_MAX]; }; #endif /* _BCACHEFS_EXTENTS_TYPES_H */ diff --git a/fs/bcachefs/eytzinger.h b/fs/bcachefs/eytzinger.h index 66fa227c552d..26d5cad7e6a5 100644 --- a/fs/bcachefs/eytzinger.h +++ b/fs/bcachefs/eytzinger.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _EYTZINGER_H #define _EYTZINGER_H @@ -262,18 +263,20 @@ static inline ssize_t eytzinger0_find_le(void *base, size_t nr, size_t size, } } -static inline size_t eytzinger0_find(void *base, size_t nr, size_t size, - eytzinger_cmp_fn cmp, const void *search) -{ - size_t i = 0; - int res; - - while (i < nr && - (res = cmp(search, base + i * size, size))) - i = eytzinger0_child(i, res > 0); - - return i; -} +#define eytzinger0_find(base, nr, size, _cmp, search) \ +({ \ + void *_base = (base); \ + void *_search = (search); \ + size_t _nr = (nr); \ + size_t _size = (size); \ + size_t _i = 0; \ + int _res; \ + \ + while (_i < _nr && \ + (_res = _cmp(_search, _base + _i * _size, _size))) \ + _i = eytzinger0_child(_i, _res > 0); \ + _i; \ +}) void eytzinger0_sort(void *, size_t, size_t, int (*cmp_func)(const void *, const void *, size_t), diff --git a/fs/bcachefs/fifo.h b/fs/bcachefs/fifo.h index 789ae663bcbf..cdb272708a4b 100644 --- a/fs/bcachefs/fifo.h +++ b/fs/bcachefs/fifo.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _BCACHEFS_FIFO_H #define _BCACHEFS_FIFO_H @@ -12,7 +13,9 @@ struct { \ #define DECLARE_FIFO(type, name) FIFO(type) name #define fifo_buf_size(fifo) \ - (roundup_pow_of_two((fifo)->size) * sizeof((fifo)->data[0])) + ((fifo)->size \ + ? 
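
[Editor's note] The eytzinger0_find() rewrite above (function to macro, so the comparison can be inlined) searches an array stored in eytzinger order: a BFS layout of a balanced binary tree where the children of index i live at 2i+1 and 2i+2, so descent needs no pointer chasing and is cache-friendly. A standalone model with a plain int comparison:

	#include <stddef.h>
	#include <stdio.h>

	/* children of node i in the eytzinger0 layout */
	static size_t eytzinger0_child(size_t i, int right)
	{
		return 2 * i + 1 + right;
	}

	/* returns the index of the match, or an index >= nr if not found */
	static size_t eyt_find(const int *base, size_t nr, int search)
	{
		size_t i = 0;

		while (i < nr && base[i] != search)
			i = eytzinger0_child(i, search > base[i]);

		return i;
	}

	int main(void)
	{
		/* sorted {1..7} stored in eytzinger0 (BFS heap) order */
		const int v[] = { 4, 2, 6, 1, 3, 5, 7 };

		printf("5 at index %zu\n", eyt_find(v, 7, 5));	/* index 5 */
		printf("9 missing: %zu >= 7\n", eyt_find(v, 7, 9));
		return 0;
	}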
roundup_pow_of_two((fifo)->size) * sizeof((fifo)->data[0]) \ + : 0) #define init_fifo(fifo, _size, _gfp) \ ({ \ @@ -98,7 +101,7 @@ do { \ ({ \ bool _r = !fifo_empty((fifo)); \ if (_r) \ - (i) = (fifo)->data[--(fifo)->back & (fifo)->mask] \ + (i) = (fifo)->data[--(fifo)->back & (fifo)->mask]; \ _r; \ }) @@ -108,17 +111,17 @@ do { \ #define fifo_peek(fifo) fifo_peek_front(fifo) #define fifo_for_each_entry(_entry, _fifo, _iter) \ - for (((void) (&(_iter) == &(_fifo)->front)), \ - _iter = (_fifo)->front; \ + for (typecheck(typeof((_fifo)->front), _iter), \ + (_iter) = (_fifo)->front; \ ((_iter != (_fifo)->back) && \ (_entry = (_fifo)->data[(_iter) & (_fifo)->mask], true)); \ - _iter++) + (_iter)++) #define fifo_for_each_entry_ptr(_ptr, _fifo, _iter) \ - for (((void) (&(_iter) == &(_fifo)->front)), \ - _iter = (_fifo)->front; \ + for (typecheck(typeof((_fifo)->front), _iter), \ + (_iter) = (_fifo)->front; \ ((_iter != (_fifo)->back) && \ (_ptr = &(_fifo)->data[(_iter) & (_fifo)->mask], true)); \ - _iter++) + (_iter)++) #endif /* _BCACHEFS_FIFO_H */ diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index 1ad9cb293e07..1ab1dd040abe 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -1,10 +1,13 @@ +// SPDX-License-Identifier: GPL-2.0 #ifndef NO_BCACHEFS_FS #include "bcachefs.h" +#include "alloc_foreground.h" #include "btree_update.h" #include "buckets.h" #include "clock.h" #include "error.h" +#include "extents.h" #include "fs.h" #include "fs-io.h" #include "fsck.h" @@ -32,16 +35,6 @@ struct quota_res { u64 sectors; }; -struct i_sectors_hook { - struct extent_insert_hook hook; - struct bch_inode_info *inode; - struct quota_res quota_res; - s64 sectors; - u64 new_i_size; - unsigned flags; - unsigned appending:1; -}; - struct bchfs_write_op { struct bch_inode_info *inode; s64 sectors_added; @@ -64,7 +57,7 @@ struct bch_writepage_io { struct dio_write { struct closure cl; struct kiocb *req; - struct task_struct *task; + struct mm_struct *mm; unsigned loop:1, sync:1, free_iov:1; @@ -129,7 +122,7 @@ static void bch2_quota_reservation_put(struct bch_fs *c, BUG_ON(res->sectors > inode->ei_quota_reserved); bch2_quota_acct(c, inode->ei_qid, Q_SPC, - -((s64) res->sectors), BCH_QUOTA_PREALLOC); + -((s64) res->sectors), KEY_TYPE_QUOTA_PREALLOC); inode->ei_quota_reserved -= res->sectors; mutex_unlock(&inode->ei_quota_lock); @@ -146,7 +139,7 @@ static int bch2_quota_reservation_add(struct bch_fs *c, mutex_lock(&inode->ei_quota_lock); ret = bch2_quota_acct(c, inode->ei_qid, Q_SPC, sectors, - check_enospc ? BCH_QUOTA_PREALLOC : BCH_QUOTA_NOCHECK); + check_enospc ? 
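
[Editor's note] The fifo.h hunks above fix a missing semicolon in fifo_pop_back() and make fifo_buf_size() safe for a zero-sized fifo (roundup_pow_of_two(0) is undefined). The underlying design: front and back are free-running counters that never wrap manually, and because the buffer size is a power of two, indexing is a single AND with the mask. A minimal userspace version:

	#include <stdio.h>

	#define FIFO_SIZE 8	/* must be a power of two for masking to work */

	struct fifo {
		unsigned front, back;	/* free-running counters */
		int data[FIFO_SIZE];
	};

	static int fifo_push(struct fifo *f, int v)
	{
		if (f->back - f->front >= FIFO_SIZE)
			return 0;	/* full */
		f->data[f->back++ & (FIFO_SIZE - 1)] = v;
		return 1;
	}

	static int fifo_pop(struct fifo *f, int *v)
	{
		if (f->back == f->front)
			return 0;	/* empty */
		*v = f->data[f->front++ & (FIFO_SIZE - 1)];
		return 1;
	}

	int main(void)
	{
		struct fifo f = { 0 };
		int v;

		for (int i = 0; i < 5; i++)
			fifo_push(&f, i);
		while (fifo_pop(&f, &v))
			printf("%d ", v);	/* 0 1 2 3 4 */
		printf("\n");
		return 0;
	}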
KEY_TYPE_QUOTA_PREALLOC : KEY_TYPE_QUOTA_NOCHECK); if (likely(!ret)) { inode->ei_quota_reserved += sectors; res->sectors += sectors; @@ -177,28 +170,48 @@ static int bch2_quota_reservation_add(struct bch_fs *c, /* i_size updates: */ +struct inode_new_size { + loff_t new_size; + u64 now; + unsigned fields; +}; + static int inode_set_size(struct bch_inode_info *inode, struct bch_inode_unpacked *bi, void *p) { - loff_t *new_i_size = p; + struct inode_new_size *s = p; - lockdep_assert_held(&inode->ei_update_lock); + bi->bi_size = s->new_size; + if (s->fields & ATTR_ATIME) + bi->bi_atime = s->now; + if (s->fields & ATTR_MTIME) + bi->bi_mtime = s->now; + if (s->fields & ATTR_CTIME) + bi->bi_ctime = s->now; - bi->bi_size = *new_i_size; return 0; } static int __must_check bch2_write_inode_size(struct bch_fs *c, struct bch_inode_info *inode, - loff_t new_size) + loff_t new_size, unsigned fields) { - return __bch2_write_inode(c, inode, inode_set_size, &new_size); + struct inode_new_size s = { + .new_size = new_size, + .now = bch2_current_time(c), + .fields = fields, + }; + + return bch2_write_inode(c, inode, inode_set_size, &s, fields); } static void i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode, - struct quota_res *quota_res, int sectors) + struct quota_res *quota_res, s64 sectors) { + if (!sectors) + return; + mutex_lock(&inode->ei_quota_lock); #ifdef CONFIG_BCACHEFS_QUOTA if (quota_res && sectors > 0) { @@ -208,284 +221,252 @@ static void i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode, quota_res->sectors -= sectors; inode->ei_quota_reserved -= sectors; } else { - bch2_quota_acct(c, inode->ei_qid, Q_SPC, sectors, BCH_QUOTA_WARN); + bch2_quota_acct(c, inode->ei_qid, Q_SPC, sectors, KEY_TYPE_QUOTA_WARN); } #endif inode->v.i_blocks += sectors; mutex_unlock(&inode->ei_quota_lock); } -/* i_sectors accounting: */ +/* normal i_size/i_sectors update machinery: */ -static enum btree_insert_ret -i_sectors_hook_fn(struct extent_insert_hook *hook, - struct bpos committed_pos, - struct bpos next_pos, - struct bkey_s_c k, - const struct bkey_i *insert) +static int sum_sector_overwrites(struct btree_trans *trans, + struct btree_iter *extent_iter, + struct bkey_i *new, bool *allocating, + s64 *delta) { - struct i_sectors_hook *h = container_of(hook, - struct i_sectors_hook, hook); - s64 sectors = next_pos.offset - committed_pos.offset; - int sign = bkey_extent_is_allocation(&insert->k) - - (k.k && bkey_extent_is_allocation(k.k)); + struct btree_iter *iter; + struct bkey_s_c old; - EBUG_ON(!(h->inode->ei_inode.bi_flags & BCH_INODE_I_SECTORS_DIRTY)); + *delta = 0; - h->sectors += sectors * sign; + iter = bch2_trans_copy_iter(trans, extent_iter); + if (IS_ERR(iter)) + return PTR_ERR(iter); - return BTREE_INSERT_OK; -} + old = bch2_btree_iter_peek_slot(iter); -static int i_sectors_dirty_finish_fn(struct bch_inode_info *inode, - struct bch_inode_unpacked *bi, - void *p) -{ - struct i_sectors_hook *h = p; + while (1) { + /* + * should not be possible to get an error here, since we're + * carefully not advancing past @new and thus whatever leaf node + * @_iter currently points to: + */ + BUG_ON(bkey_err(old)); + + if (allocating && + !*allocating && + bch2_bkey_nr_ptrs_allocated(old) < + bch2_bkey_nr_dirty_ptrs(bkey_i_to_s_c(new))) + *allocating = true; + + *delta += (min(new->k.p.offset, + old.k->p.offset) - + max(bkey_start_offset(&new->k), + bkey_start_offset(old.k))) * + (bkey_extent_is_allocation(&new->k) - + bkey_extent_is_allocation(old.k)); + + if (bkey_cmp(old.k->p, new->k.p) >= 
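
[Editor's note] The delta computation just above is the heart of sum_sector_overwrites(): the i_sectors change is the overlap length between the new and old extents, times the change in whether those sectors count as allocated (+1 hole-to-data, -1 data-to-hole, 0 otherwise). A sketch of the arithmetic in isolation; it assumes the two extents overlap, which the btree slot iterator guarantees in the real code:

	#include <stdint.h>
	#include <stdio.h>

	struct ext { uint64_t start, end; int allocated; };

	static uint64_t min_u64(uint64_t a, uint64_t b) { return a < b ? a : b; }
	static uint64_t max_u64(uint64_t a, uint64_t b) { return a > b ? a : b; }

	/* sectors the inode gains (or loses) when @new overwrites @old */
	static int64_t overwrite_delta(const struct ext *new, const struct ext *old)
	{
		uint64_t overlap = min_u64(new->end, old->end) -
				   max_u64(new->start, old->start);

		return (int64_t) overlap * (new->allocated - old->allocated);
	}

	int main(void)
	{
		struct ext new  = {  0, 128, 1 };	/* newly written data */
		struct ext hole = { 64, 256, 0 };	/* old unallocated slot */

		/* sectors 64..128 flip from hole to data: +64 */
		printf("delta %lld\n", (long long) overwrite_delta(&new, &hole));
		return 0;
	}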
0) + break; + + old = bch2_btree_iter_next_slot(iter); + } - if (h->new_i_size != U64_MAX && - (!h->appending || - h->new_i_size > bi->bi_size)) - bi->bi_size = h->new_i_size; - bi->bi_sectors += h->sectors; - bi->bi_flags &= ~h->flags; + bch2_trans_iter_free(trans, iter); return 0; } -static int i_sectors_dirty_finish(struct bch_fs *c, struct i_sectors_hook *h) -{ +static int bch2_extent_update(struct btree_trans *trans, + struct bch_inode_info *inode, + struct disk_reservation *disk_res, + struct quota_res *quota_res, + struct btree_iter *extent_iter, + struct bkey_i *k, + u64 new_i_size, + bool may_allocate, + bool direct, + s64 *total_delta) +{ + struct bch_fs *c = trans->c; + struct btree_iter *inode_iter = NULL; + struct bch_inode_unpacked inode_u; + struct bkey_inode_buf inode_p; + bool allocating = false; + bool extended = false; + bool inode_locked = false; + s64 i_sectors_delta; int ret; - mutex_lock(&h->inode->ei_update_lock); - if (h->new_i_size != U64_MAX) - i_size_write(&h->inode->v, h->new_i_size); - - i_sectors_acct(c, h->inode, &h->quota_res, h->sectors); - - ret = __bch2_write_inode(c, h->inode, i_sectors_dirty_finish_fn, h); - mutex_unlock(&h->inode->ei_update_lock); + bch2_trans_begin_updates(trans); - bch2_quota_reservation_put(c, h->inode, &h->quota_res); - - h->sectors = 0; - - return ret; -} + ret = bch2_btree_iter_traverse(extent_iter); + if (ret) + return ret; -static int i_sectors_dirty_start_fn(struct bch_inode_info *inode, - struct bch_inode_unpacked *bi, void *p) -{ - struct i_sectors_hook *h = p; + bch2_extent_trim_atomic(k, extent_iter); - if (h->flags & BCH_INODE_I_SIZE_DIRTY) - bi->bi_size = h->new_i_size; + ret = sum_sector_overwrites(trans, extent_iter, + k, &allocating, + &i_sectors_delta); + if (ret) + return ret; - bi->bi_flags |= h->flags; - return 0; -} + if (!may_allocate && allocating) + return -ENOSPC; -static int i_sectors_dirty_start(struct bch_fs *c, struct i_sectors_hook *h) -{ - int ret; + bch2_trans_update(trans, BTREE_INSERT_ENTRY(extent_iter, k)); - mutex_lock(&h->inode->ei_update_lock); - ret = __bch2_write_inode(c, h->inode, i_sectors_dirty_start_fn, h); - mutex_unlock(&h->inode->ei_update_lock); + new_i_size = min(k->k.p.offset << 9, new_i_size); - return ret; -} + /* XXX: inode->i_size locking */ + if (i_sectors_delta || + new_i_size > inode->ei_inode.bi_size) { + if (c->opts.new_inode_updates) { + bch2_trans_unlock(trans); + mutex_lock(&inode->ei_update_lock); -static inline struct i_sectors_hook -i_sectors_hook_init(struct bch_inode_info *inode, unsigned flags) -{ - return (struct i_sectors_hook) { - .hook.fn = i_sectors_hook_fn, - .inode = inode, - .sectors = 0, - .new_i_size = U64_MAX, - .flags = flags|BCH_INODE_I_SECTORS_DIRTY, - }; -} + if (!bch2_trans_relock(trans)) { + mutex_unlock(&inode->ei_update_lock); + return -EINTR; + } -/* normal i_size/i_sectors update machinery: */ + inode_locked = true; -struct bchfs_extent_trans_hook { - struct bchfs_write_op *op; - struct extent_insert_hook hook; + if (!inode->ei_inode_update) + inode->ei_inode_update = + bch2_deferred_update_alloc(c, + BTREE_ID_INODES, 64); - struct bch_inode_unpacked inode_u; - struct bkey_inode_buf inode_p; + inode_u = inode->ei_inode; + inode_u.bi_sectors += i_sectors_delta; - bool need_inode_update; -}; + /* XXX: this is slightly suspect */ + if (!(inode_u.bi_flags & BCH_INODE_I_SIZE_DIRTY) && + new_i_size > inode_u.bi_size) { + inode_u.bi_size = new_i_size; + extended = true; + } -static enum btree_insert_ret -bchfs_extent_update_hook(struct 
extent_insert_hook *hook, - struct bpos committed_pos, - struct bpos next_pos, - struct bkey_s_c k, - const struct bkey_i *insert) -{ - struct bchfs_extent_trans_hook *h = container_of(hook, - struct bchfs_extent_trans_hook, hook); - struct bch_inode_info *inode = h->op->inode; - int sign = bkey_extent_is_allocation(&insert->k) - - (k.k && bkey_extent_is_allocation(k.k)); - s64 sectors = (s64) (next_pos.offset - committed_pos.offset) * sign; - u64 offset = min(next_pos.offset << 9, h->op->new_i_size); - bool do_pack = false; + bch2_inode_pack(&inode_p, &inode_u); + bch2_trans_update(trans, + BTREE_INSERT_DEFERRED(inode->ei_inode_update, + &inode_p.inode.k_i)); + } else { + inode_iter = bch2_trans_get_iter(trans, + BTREE_ID_INODES, + POS(k->k.p.inode, 0), + BTREE_ITER_SLOTS|BTREE_ITER_INTENT); + if (IS_ERR(inode_iter)) + return PTR_ERR(inode_iter); + + ret = bch2_btree_iter_traverse(inode_iter); + if (ret) + goto err; - if (h->op->unalloc && - !bch2_extent_is_fully_allocated(k)) - return BTREE_INSERT_ENOSPC; + inode_u = inode->ei_inode; + inode_u.bi_sectors += i_sectors_delta; - BUG_ON((next_pos.offset << 9) > round_up(offset, PAGE_SIZE)); + /* XXX: this is slightly suspect */ + if (!(inode_u.bi_flags & BCH_INODE_I_SIZE_DIRTY) && + new_i_size > inode_u.bi_size) { + inode_u.bi_size = new_i_size; + extended = true; + } - /* XXX: inode->i_size locking */ - if (offset > inode->ei_inode.bi_size) { - if (!h->need_inode_update) { - h->need_inode_update = true; - return BTREE_INSERT_NEED_TRAVERSE; + bch2_inode_pack(&inode_p, &inode_u); + bch2_trans_update(trans, + BTREE_INSERT_ENTRY(inode_iter, &inode_p.inode.k_i)); } + } - BUG_ON(h->inode_u.bi_flags & BCH_INODE_I_SIZE_DIRTY); + ret = bch2_trans_commit(trans, disk_res, + &inode->ei_journal_seq, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_ATOMIC| + BTREE_INSERT_NOUNLOCK| + BTREE_INSERT_USE_RESERVE); + if (ret) + goto err; - h->inode_u.bi_size = offset; - do_pack = true; + inode->ei_inode.bi_sectors += i_sectors_delta; - inode->ei_inode.bi_size = offset; + EBUG_ON(i_sectors_delta && + inode->ei_inode.bi_sectors != inode_u.bi_sectors); - if (h->op->is_dio) - i_size_write(&inode->v, offset); - } + if (extended) { + inode->ei_inode.bi_size = new_i_size; - if (sectors) { - if (!h->need_inode_update) { - h->need_inode_update = true; - return BTREE_INSERT_NEED_TRAVERSE; + if (direct) { + spin_lock(&inode->v.i_lock); + if (new_i_size > inode->v.i_size) + i_size_write(&inode->v, new_i_size); + spin_unlock(&inode->v.i_lock); } - - h->inode_u.bi_sectors += sectors; - do_pack = true; - - h->op->sectors_added += sectors; } - if (do_pack) - bch2_inode_pack(&h->inode_p, &h->inode_u); + if (direct) + i_sectors_acct(c, inode, quota_res, i_sectors_delta); + + if (total_delta) + *total_delta += i_sectors_delta; +err: + if (!IS_ERR_OR_NULL(inode_iter)) + bch2_trans_iter_put(trans, inode_iter); + if (inode_locked) + mutex_unlock(&inode->ei_update_lock); - return BTREE_INSERT_OK; + return ret; } static int bchfs_write_index_update(struct bch_write_op *wop) { + struct bch_fs *c = wop->c; struct bchfs_write_op *op = container_of(wop, struct bchfs_write_op, op); + struct quota_res *quota_res = op->is_dio + ? 
&container_of(op, struct dio_write, iop)->quota_res + : NULL; + struct bch_inode_info *inode = op->inode; struct keylist *keys = &op->op.insert_keys; - struct btree_iter extent_iter, inode_iter; - struct bchfs_extent_trans_hook hook; struct bkey_i *k = bch2_keylist_front(keys); - s64 orig_sectors_added = op->sectors_added; + struct btree_trans trans; + struct btree_iter *iter; int ret; - BUG_ON(k->k.p.inode != op->inode->v.i_ino); + BUG_ON(k->k.p.inode != inode->v.i_ino); - bch2_btree_iter_init(&extent_iter, wop->c, BTREE_ID_EXTENTS, - bkey_start_pos(&bch2_keylist_front(keys)->k), - BTREE_ITER_INTENT); - bch2_btree_iter_init(&inode_iter, wop->c, BTREE_ID_INODES, - POS(extent_iter.pos.inode, 0), - BTREE_ITER_SLOTS|BTREE_ITER_INTENT); + bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024); - hook.op = op; - hook.hook.fn = bchfs_extent_update_hook; - hook.need_inode_update = false; + iter = bch2_trans_get_iter(&trans, + BTREE_ID_EXTENTS, + bkey_start_pos(&k->k), + BTREE_ITER_INTENT); do { - /* XXX: inode->i_size locking */ - k = bch2_keylist_front(keys); - if (min(k->k.p.offset << 9, op->new_i_size) > - op->inode->ei_inode.bi_size) - hook.need_inode_update = true; - - if (hook.need_inode_update) { - struct bkey_s_c inode; - - if (!btree_iter_linked(&inode_iter)) - bch2_btree_iter_link(&extent_iter, &inode_iter); - - inode = bch2_btree_iter_peek_slot(&inode_iter); - if ((ret = btree_iter_err(inode))) - goto err; - - if (WARN_ONCE(inode.k->type != BCH_INODE_FS, - "inode %llu not found when updating", - extent_iter.pos.inode)) { - ret = -ENOENT; - break; - } - - if (WARN_ONCE(bkey_bytes(inode.k) > - sizeof(hook.inode_p), - "inode %llu too big (%zu bytes, buf %zu)", - extent_iter.pos.inode, - bkey_bytes(inode.k), - sizeof(hook.inode_p))) { - ret = -ENOENT; - break; - } - - bkey_reassemble(&hook.inode_p.inode.k_i, inode); - ret = bch2_inode_unpack(bkey_s_c_to_inode(inode), - &hook.inode_u); - if (WARN_ONCE(ret, - "error %i unpacking inode %llu", - ret, extent_iter.pos.inode)) { - ret = -ENOENT; - break; - } - - ret = bch2_btree_insert_at(wop->c, &wop->res, - &hook.hook, op_journal_seq(wop), - BTREE_INSERT_NOFAIL| - BTREE_INSERT_ATOMIC| - BTREE_INSERT_USE_RESERVE, - BTREE_INSERT_ENTRY(&extent_iter, k), - BTREE_INSERT_ENTRY_EXTRA_RES(&inode_iter, - &hook.inode_p.inode.k_i, 2)); - } else { - ret = bch2_btree_insert_at(wop->c, &wop->res, - &hook.hook, op_journal_seq(wop), - BTREE_INSERT_NOFAIL| - BTREE_INSERT_ATOMIC| - BTREE_INSERT_USE_RESERVE, - BTREE_INSERT_ENTRY(&extent_iter, k)); - } + BKEY_PADDED(k) tmp; - BUG_ON(bkey_cmp(extent_iter.pos, bkey_start_pos(&k->k))); + bkey_copy(&tmp.k, bch2_keylist_front(keys)); - if (WARN_ONCE(!ret != !k->k.size, - "ret %i k->size %u", ret, k->k.size)) - ret = k->k.size ? 
-EINTR : 0; -err: + ret = bch2_extent_update(&trans, inode, + &wop->res, quota_res, + iter, &tmp.k, + op->new_i_size, + !op->unalloc, + op->is_dio, + &op->sectors_added); if (ret == -EINTR) continue; if (ret) break; - BUG_ON(bkey_cmp(extent_iter.pos, k->k.p) < 0); - bch2_keylist_pop_front(keys); + if (bkey_cmp(iter->pos, bch2_keylist_front(keys)->k.p) < 0) + bch2_cut_front(iter->pos, bch2_keylist_front(keys)); + else + bch2_keylist_pop_front(keys); } while (!bch2_keylist_empty(keys)); - bch2_btree_iter_unlock(&extent_iter); - bch2_btree_iter_unlock(&inode_iter); - - if (op->is_dio) { - struct dio_write *dio = container_of(op, struct dio_write, iop); - - i_sectors_acct(wop->c, op->inode, &dio->quota_res, - op->sectors_added - orig_sectors_added); - } + bch2_trans_exit(&trans); return ret; } @@ -529,12 +510,12 @@ struct bch_page_state { union { struct { /* existing data: */ unsigned sectors:PAGE_SECTOR_SHIFT + 1; + + /* Uncompressed, fully allocated replicas: */ unsigned nr_replicas:4; - unsigned compressed:1; - /* Owns PAGE_SECTORS sized reservation: */ - unsigned reserved:1; - unsigned reservation_replicas:4; + /* Owns PAGE_SECTORS * replicas_reserved sized reservation: */ + unsigned replicas_reserved:4; /* Owns PAGE_SECTORS sized quota reservation: */ unsigned quota_reserved:1; @@ -581,7 +562,7 @@ static inline struct bch_page_state *page_state(struct page *page) static inline unsigned page_res_sectors(struct bch_page_state s) { - return s.reserved ? s.reservation_replicas * PAGE_SECTORS : 0; + return s.replicas_reserved * PAGE_SECTORS; } static void __bch2_put_page_reservation(struct bch_fs *c, struct bch_inode_info *inode, @@ -599,8 +580,10 @@ static void bch2_put_page_reservation(struct bch_fs *c, struct bch_inode_info *i { struct bch_page_state s; + EBUG_ON(!PageLocked(page)); + s = page_state_cmpxchg(page_state(page), s, { - s.reserved = 0; + s.replicas_reserved = 0; s.quota_reserved = 0; }); @@ -610,62 +593,46 @@ static void bch2_put_page_reservation(struct bch_fs *c, struct bch_inode_info *i static int bch2_get_page_reservation(struct bch_fs *c, struct bch_inode_info *inode, struct page *page, bool check_enospc) { - struct bch_page_state *s = page_state(page), new, old; + struct bch_page_state *s = page_state(page), new; /* XXX: this should not be open coded */ unsigned nr_replicas = inode->ei_inode.bi_data_replicas ? inode->ei_inode.bi_data_replicas - 1 : c->opts.data_replicas; - - struct disk_reservation disk_res = bch2_disk_reservation_init(c, - nr_replicas); + struct disk_reservation disk_res; struct quota_res quota_res = { 0 }; - int ret = 0; + int ret; - /* - * XXX: this could likely be quite a bit simpler, page reservations - * _should_ only be manipulated with page locked: - */ + EBUG_ON(!PageLocked(page)); - old = page_state_cmpxchg(s, new, { - if (new.reserved - ? (new.reservation_replicas < disk_res.nr_replicas) - : (new.sectors < PAGE_SECTORS || - new.nr_replicas < disk_res.nr_replicas || - new.compressed)) { - int sectors = (disk_res.nr_replicas * PAGE_SECTORS - - page_res_sectors(new) - - disk_res.sectors); - - if (sectors > 0) { - ret = bch2_disk_reservation_add(c, &disk_res, sectors, - !check_enospc - ? BCH_DISK_RESERVATION_NOFAIL : 0); - if (unlikely(ret)) - goto err; - } + if (s->replicas_reserved < nr_replicas) { + ret = bch2_disk_reservation_get(c, &disk_res, PAGE_SECTORS, + nr_replicas - s->replicas_reserved, + !check_enospc ? 
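
[Editor's note] The reworked bch_page_state above packs everything into one machine word precisely so updates can go through page_state_cmpxchg(): read the old word, compute the new one from it, retry if another thread won the race. A compressed C11 model of that pattern; the field widths and names are illustrative, not the kernel layout:

	#include <stdatomic.h>
	#include <stdio.h>

	/* per-page state packed into one word so it updates atomically */
	union pg_state {
		struct {
			unsigned sectors:9;
			unsigned replicas_reserved:4;
			unsigned quota_reserved:1;
		};
		unsigned v;
	};

	/* retry loop: recompute the new state from old until the CAS wins */
	static union pg_state add_reservation(_Atomic unsigned *p, unsigned nr)
	{
		union pg_state old, new;

		old.v = atomic_load(p);
		do {
			new = old;
			new.replicas_reserved += nr;
		} while (!atomic_compare_exchange_weak(p, &old.v, new.v));

		return old;	/* callers often want the pre-update state */
	}

	int main(void)
	{
		_Atomic unsigned state = 0;

		add_reservation(&state, 2);

		union pg_state s = { .v = atomic_load(&state) };
		printf("replicas_reserved %u\n", s.replicas_reserved);
		return 0;
	}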
BCH_DISK_RESERVATION_NOFAIL : 0); + if (unlikely(ret)) + return ret; - new.reserved = 1; - new.reservation_replicas = disk_res.nr_replicas; - } + page_state_cmpxchg(s, new, ({ + BUG_ON(new.replicas_reserved + + disk_res.nr_replicas != nr_replicas); + new.replicas_reserved += disk_res.nr_replicas; + })); + } - if (!new.quota_reserved && - new.sectors + new.dirty_sectors < PAGE_SECTORS) { - ret = bch2_quota_reservation_add(c, inode, "a_res, - PAGE_SECTORS - quota_res.sectors, - check_enospc); - if (unlikely(ret)) - goto err; + if (!s->quota_reserved && + s->sectors + s->dirty_sectors < PAGE_SECTORS) { + ret = bch2_quota_reservation_add(c, inode, "a_res, + PAGE_SECTORS, + check_enospc); + if (unlikely(ret)) + return ret; + page_state_cmpxchg(s, new, ({ + BUG_ON(new.quota_reserved); new.quota_reserved = 1; - } - }); + })); + } - quota_res.sectors -= (new.quota_reserved - old.quota_reserved) * PAGE_SECTORS; - disk_res.sectors -= page_res_sectors(new) - page_res_sectors(old); -err: - bch2_quota_reservation_put(c, inode, "a_res); - bch2_disk_reservation_put(c, &disk_res); return ret; } @@ -675,6 +642,8 @@ static void bch2_clear_page_bits(struct page *page) struct bch_fs *c = inode->v.i_sb->s_fs_info; struct bch_page_state s; + EBUG_ON(!PageLocked(page)); + if (!PagePrivate(page)) return; @@ -709,7 +678,7 @@ int bch2_set_page_dirty(struct page *page) return __set_page_dirty_nobuffers(page); } -int bch2_page_mkwrite(struct vm_fault *vmf) +vm_fault_t bch2_page_mkwrite(struct vm_fault *vmf) { struct page *page = vmf->page; struct file *file = vmf->vma->vm_file; @@ -785,7 +754,10 @@ int bch2_migrate_page(struct address_space *mapping, struct page *newpage, { int ret; - ret = migrate_page_move_mapping(mapping, newpage, page, NULL, mode, 0); + EBUG_ON(!PageLocked(page)); + EBUG_ON(!PageLocked(newpage)); + + ret = migrate_page_move_mapping(mapping, newpage, page, mode, 0); if (ret != MIGRATEPAGE_SUCCESS) return ret; @@ -828,10 +800,11 @@ static int bio_add_page_contig(struct bio *bio, struct page *page) static void bch2_readpages_end_io(struct bio *bio) { + struct bvec_iter_all iter; struct bio_vec *bv; int i; - bio_for_each_segment_all(bv, bio, i) { + bio_for_each_segment_all(bv, bio, i, iter) { struct page *page = bv->bv_page; if (!bio->bi_status) { @@ -846,65 +819,96 @@ static void bch2_readpages_end_io(struct bio *bio) bio_put(bio); } +static inline void page_state_init_for_read(struct page *page) +{ + SetPagePrivate(page); + page->private = 0; +} + struct readpages_iter { struct address_space *mapping; - struct list_head pages; + struct page **pages; unsigned nr_pages; + unsigned nr_added; + unsigned idx; + pgoff_t offset; }; -static inline void page_state_init_for_read(struct page *page) +static int readpages_iter_init(struct readpages_iter *iter, + struct address_space *mapping, + struct list_head *pages, unsigned nr_pages) { - struct bch_page_state *s = page_state(page); + memset(iter, 0, sizeof(*iter)); - BUG_ON(s->reserved); - s->sectors = 0; - s->compressed = 0; -} + iter->mapping = mapping; + iter->offset = list_last_entry(pages, struct page, lru)->index; -static int readpage_add_page(struct readpages_iter *iter, struct page *page) -{ - int ret; + iter->pages = kmalloc_array(nr_pages, sizeof(struct page *), GFP_NOFS); + if (!iter->pages) + return -ENOMEM; - prefetchw(&page->flags); + while (!list_empty(pages)) { + struct page *page = list_last_entry(pages, struct page, lru); - ret = add_to_page_cache_lru(page, iter->mapping, - page->index, GFP_NOFS); - if (!ret) - 
page_state_init_for_read(page); + prefetchw(&page->flags); + iter->pages[iter->nr_pages++] = page; + list_del(&page->lru); + } - put_page(page); - return ret; + return 0; } static inline struct page *readpage_iter_next(struct readpages_iter *iter) { - while (iter->nr_pages) { - struct page *page = - list_last_entry(&iter->pages, struct page, lru); + struct page *page; + unsigned i; + int ret; - prefetchw(&page->flags); - list_del(&page->lru); - iter->nr_pages--; + BUG_ON(iter->idx > iter->nr_added); + BUG_ON(iter->nr_added > iter->nr_pages); + + if (iter->idx < iter->nr_added) + goto out; - if (!readpage_add_page(iter, page)) - return page; + while (1) { + if (iter->idx == iter->nr_pages) + return NULL; + + ret = add_to_page_cache_lru_vec(iter->mapping, + iter->pages + iter->nr_added, + iter->nr_pages - iter->nr_added, + iter->offset + iter->nr_added, + GFP_NOFS); + if (ret > 0) + break; + + page = iter->pages[iter->nr_added]; + iter->idx++; + iter->nr_added++; + + put_page(page); } - return NULL; -} + iter->nr_added += ret; + + for (i = iter->idx; i < iter->nr_added; i++) + put_page(iter->pages[i]); +out: + EBUG_ON(iter->pages[iter->idx]->index != iter->offset + iter->idx); -#define for_each_readpage_page(_iter, _page) \ - for (; \ - ((_page) = __readpage_next_page(&(_iter)));) \ + page_state_init_for_read(iter->pages[iter->idx]); + return iter->pages[iter->idx]; +} static void bch2_add_page_sectors(struct bio *bio, struct bkey_s_c k) { struct bvec_iter iter; struct bio_vec bv; - bool compressed = bch2_extent_is_compressed(k); - unsigned nr_ptrs = bch2_extent_nr_dirty_ptrs(k); + unsigned nr_ptrs = bch2_bkey_nr_ptrs_allocated(k); bio_for_each_segment(bv, bio, iter) { + /* brand new pages, don't need to be locked: */ + struct bch_page_state *s = page_state(bv.bv_page); /* sectors in @k from the start of this page: */ @@ -912,14 +916,11 @@ static void bch2_add_page_sectors(struct bio *bio, struct bkey_s_c k) unsigned page_sectors = min(bv.bv_len >> 9, k_sectors); - s->nr_replicas = !s->sectors - ? nr_ptrs - : min_t(unsigned, s->nr_replicas, nr_ptrs); + s->nr_replicas = page_sectors == PAGE_SECTORS + ? 
nr_ptrs : 0; BUG_ON(s->sectors + page_sectors > PAGE_SECTORS); s->sectors += page_sectors; - - s->compressed |= compressed; } } @@ -927,54 +928,51 @@ static void readpage_bio_extend(struct readpages_iter *iter, struct bio *bio, u64 offset, bool get_more) { - struct page *page; - pgoff_t page_offset; - int ret; - while (bio_end_sector(bio) < offset && bio->bi_vcnt < bio->bi_max_vecs) { - page_offset = bio_end_sector(bio) >> PAGE_SECTOR_SHIFT; + pgoff_t page_offset = bio_end_sector(bio) >> PAGE_SECTOR_SHIFT; + struct page *page = readpage_iter_next(iter); + int ret; - if (iter->nr_pages) { - page = list_last_entry(&iter->pages, struct page, lru); - if (page->index != page_offset) + if (page) { + if (iter->offset + iter->idx != page_offset) break; - list_del(&page->lru); - iter->nr_pages--; - } else if (get_more) { - rcu_read_lock(); - page = radix_tree_lookup(&iter->mapping->i_pages, page_offset); - rcu_read_unlock(); + iter->idx++; + } else { + if (!get_more) + break; - if (page && !radix_tree_exceptional_entry(page)) + page = xa_load(&iter->mapping->i_pages, page_offset); + if (page && !xa_is_value(page)) break; page = __page_cache_alloc(readahead_gfp_mask(iter->mapping)); if (!page) break; - page->index = page_offset; - ClearPageReadahead(bio->bi_io_vec[bio->bi_vcnt - 1].bv_page); - } else { - break; - } + page_state_init_for_read(page); - ret = readpage_add_page(iter, page); - if (ret) - break; + ret = add_to_page_cache_lru(page, iter->mapping, + page_offset, GFP_NOFS); + if (ret) { + ClearPagePrivate(page); + put_page(page); + break; + } + + put_page(page); + } __bio_add_page(bio, page, PAGE_SIZE, 0); } - - if (!iter->nr_pages) - SetPageReadahead(bio->bi_io_vec[bio->bi_vcnt - 1].bv_page); } -static void bchfs_read(struct bch_fs *c, struct btree_iter *iter, +static void bchfs_read(struct btree_trans *trans, struct btree_iter *iter, struct bch_read_bio *rbio, u64 inum, struct readpages_iter *readpages_iter) { + struct bch_fs *c = trans->c; struct bio *bio = &rbio->bio; int flags = BCH_READ_RETRY_IF_STALE| BCH_READ_MAY_PROMOTE; @@ -993,7 +991,7 @@ static void bchfs_read(struct bch_fs *c, struct btree_iter *iter, BUG_ON(!k.k); if (IS_ERR(k.k)) { - int ret = bch2_btree_iter_unlock(iter); + int ret = btree_iter_err(iter); BUG_ON(!ret); bcache_io_error(c, bio, "btree IO error %i", ret); bio_endio(bio); @@ -1001,7 +999,7 @@ static void bchfs_read(struct bch_fs *c, struct btree_iter *iter, } bkey_reassemble(&tmp.k, k); - bch2_btree_iter_unlock(iter); + bch2_trans_unlock(trans); k = bkey_i_to_s_c(&tmp.k); if (readpages_iter) { @@ -1009,12 +1007,12 @@ static void bchfs_read(struct bch_fs *c, struct btree_iter *iter, if (bkey_extent_is_data(k.k)) { struct bkey_s_c_extent e = bkey_s_c_to_extent(k); - struct bch_extent_crc_unpacked crc; const union bch_extent_entry *i; + struct extent_ptr_decoded p; - extent_for_each_crc(e, crc, i) - want_full_extent |= ((crc.csum_type != 0) | - (crc.compression_type != 0)); + extent_for_each_ptr_decode(e, p, i) + want_full_extent |= ((p.crc.csum_type != 0) | + (p.crc.compression_type != 0)); } readpage_bio_extend(readpages_iter, @@ -1048,56 +1046,71 @@ int bch2_readpages(struct file *file, struct address_space *mapping, struct bch_inode_info *inode = to_bch_ei(mapping->host); struct bch_fs *c = inode->v.i_sb->s_fs_info; struct bch_io_opts opts = io_opts(c, inode); - struct btree_iter iter; + struct btree_trans trans; + struct btree_iter *iter; struct page *page; - struct readpages_iter readpages_iter = { - .mapping = mapping, .nr_pages = nr_pages - }; + struct 
readpages_iter readpages_iter; + int ret; - bch2_btree_iter_init(&iter, c, BTREE_ID_EXTENTS, POS_MIN, - BTREE_ITER_SLOTS); + ret = readpages_iter_init(&readpages_iter, mapping, pages, nr_pages); + BUG_ON(ret); - INIT_LIST_HEAD(&readpages_iter.pages); - list_add(&readpages_iter.pages, pages); - list_del_init(pages); + bch2_trans_init(&trans, c, 0, 0); + + iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, POS_MIN, + BTREE_ITER_SLOTS); if (current->pagecache_lock != &mapping->add_lock) pagecache_add_get(&mapping->add_lock); while ((page = readpage_iter_next(&readpages_iter))) { - unsigned n = max_t(unsigned, - min_t(unsigned, readpages_iter.nr_pages + 1, - BIO_MAX_PAGES), - c->sb.encoded_extent_max >> PAGE_SECTOR_SHIFT); - + pgoff_t index = readpages_iter.offset + readpages_iter.idx; + unsigned n = min_t(unsigned, + readpages_iter.nr_pages - + readpages_iter.idx, + BIO_MAX_PAGES); struct bch_read_bio *rbio = rbio_init(bio_alloc_bioset(GFP_NOFS, n, &c->bio_read), opts); + readpages_iter.idx++; + + bio_set_op_attrs(&rbio->bio, REQ_OP_READ, 0); + rbio->bio.bi_iter.bi_sector = (sector_t) index << PAGE_SECTOR_SHIFT; rbio->bio.bi_end_io = bch2_readpages_end_io; - bio_add_page_contig(&rbio->bio, page); - bchfs_read(c, &iter, rbio, inode->v.i_ino, &readpages_iter); + __bio_add_page(&rbio->bio, page, PAGE_SIZE, 0); + + bchfs_read(&trans, iter, rbio, inode->v.i_ino, + &readpages_iter); } if (current->pagecache_lock != &mapping->add_lock) pagecache_add_put(&mapping->add_lock); + bch2_trans_exit(&trans); + kfree(readpages_iter.pages); + return 0; } static void __bchfs_readpage(struct bch_fs *c, struct bch_read_bio *rbio, u64 inum, struct page *page) { - struct btree_iter iter; + struct btree_trans trans; + struct btree_iter *iter; page_state_init_for_read(page); bio_set_op_attrs(&rbio->bio, REQ_OP_READ, REQ_SYNC); bio_add_page_contig(&rbio->bio, page); - bch2_btree_iter_init(&iter, c, BTREE_ID_EXTENTS, POS_MIN, - BTREE_ITER_SLOTS); - bchfs_read(c, &iter, rbio, inum, NULL); + bch2_trans_init(&trans, c, 0, 0); + iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, POS_MIN, + BTREE_ITER_SLOTS); + + bchfs_read(&trans, iter, rbio, inum, NULL); + + bch2_trans_exit(&trans); } int bch2_readpage(struct file *file, struct page *page) @@ -1173,13 +1186,15 @@ static void bch2_writepage_io_done(struct closure *cl) struct bch_writepage_io, cl); struct bch_fs *c = io->op.op.c; struct bio *bio = &io->op.op.wbio.bio; + struct bvec_iter_all iter; struct bio_vec *bvec; unsigned i; if (io->op.op.error) { - bio_for_each_segment_all(bvec, bio, i) + bio_for_each_segment_all(bvec, bio, i, iter) { SetPageError(bvec->bv_page); - set_bit(AS_EIO, &io->op.inode->v.i_mapping->flags); + mapping_set_error(bvec->bv_page->mapping, -EIO); + } } /* @@ -1203,7 +1218,7 @@ static void bch2_writepage_io_done(struct closure *cl) i_sectors_acct(c, io->op.inode, NULL, io->op.sectors_added - (s64) io->new_sectors); - bio_for_each_segment_all(bvec, bio, i) + bio_for_each_segment_all(bvec, bio, i, iter) end_page_writeback(bvec->bv_page); closure_return_with_destructor(&io->cl, bch2_writepage_io_free); @@ -1255,7 +1270,7 @@ static int __bch2_writepage(struct page *page, struct bch_fs *c = inode->v.i_sb->s_fs_info; struct bch_writepage_state *w = data; struct bch_page_state new, old; - unsigned offset; + unsigned offset, nr_replicas_this_write; loff_t i_size = i_size_read(&inode->v); pgoff_t end_index = i_size >> PAGE_SHIFT; @@ -1281,19 +1296,31 @@ static int __bch2_writepage(struct page *page, */ zero_user_segment(page, offset, PAGE_SIZE); do_io: 
+ EBUG_ON(!PageLocked(page)); + /* Before unlocking the page, transfer reservation to w->io: */ old = page_state_cmpxchg(page_state(page), new, { - EBUG_ON(!new.reserved && - (new.sectors != PAGE_SECTORS || - new.compressed)); + /* + * If we didn't get a reservation, we can only write out the + * number of (fully allocated) replicas that currently exist, + * and only if the entire page has been written: + */ + nr_replicas_this_write = + max_t(unsigned, + new.replicas_reserved, + (new.sectors == PAGE_SECTORS + ? new.nr_replicas : 0)); + + BUG_ON(!nr_replicas_this_write); - if (new.reserved) - new.nr_replicas = new.reservation_replicas; - new.reserved = 0; + new.nr_replicas = w->opts.compression + ? 0 + : nr_replicas_this_write; - new.compressed |= w->opts.compression != 0; + new.replicas_reserved = 0; new.sectors += new.dirty_sectors; + BUG_ON(new.sectors != PAGE_SECTORS); new.dirty_sectors = 0; }); @@ -1302,21 +1329,20 @@ do_io: unlock_page(page); if (w->io && - (w->io->op.op.res.nr_replicas != new.nr_replicas || + (w->io->op.op.res.nr_replicas != nr_replicas_this_write || !bio_can_add_page_contig(&w->io->op.op.wbio.bio, page))) bch2_writepage_do_io(w); if (!w->io) - bch2_writepage_io_alloc(c, w, inode, page, new.nr_replicas); + bch2_writepage_io_alloc(c, w, inode, page, + nr_replicas_this_write); w->io->new_sectors += new.sectors - old.sectors; BUG_ON(inode != w->io->op.inode); BUG_ON(bio_add_page_contig(&w->io->op.op.wbio.bio, page)); - if (old.reserved) - w->io->op.op.res.sectors += old.reservation_replicas * PAGE_SECTORS; - + w->io->op.op.res.sectors += old.replicas_reserved * PAGE_SECTORS; w->io->op.new_i_size = i_size; if (wbc->sync_mode == WB_SYNC_ALL) @@ -1446,8 +1472,10 @@ int bch2_write_end(struct file *file, struct address_space *mapping, copied = 0; } + spin_lock(&inode->v.i_lock); if (pos + copied > inode->v.i_size) i_size_write(&inode->v, pos + copied); + spin_unlock(&inode->v.i_lock); if (copied) { if (!PageUptodate(page)) @@ -1552,8 +1580,10 @@ static int __bch2_buffered_write(struct bch_inode_info *inode, nr_pages_copied = DIV_ROUND_UP(offset + copied, PAGE_SIZE); inode->ei_last_dirtied = (unsigned long) current; + spin_lock(&inode->v.i_lock); if (pos + copied > inode->v.i_size) i_size_write(&inode->v, pos + copied); + spin_unlock(&inode->v.i_lock); if (copied < len && ((offset + copied) & (PAGE_SIZE - 1))) { @@ -1780,11 +1810,14 @@ static void bch2_dio_write_loop_async(struct closure *); static long bch2_dio_write_loop(struct dio_write *dio) { + bool kthread = (current->flags & PF_KTHREAD) != 0; struct kiocb *req = dio->req; struct address_space *mapping = req->ki_filp->f_mapping; struct bch_inode_info *inode = dio->iop.inode; struct bio *bio = &dio->iop.op.wbio.bio; + struct bvec_iter_all iter; struct bio_vec *bv; + loff_t offset; bool sync; long ret; int i; @@ -1796,28 +1829,38 @@ static long bch2_dio_write_loop(struct dio_write *dio) __pagecache_block_get(&mapping->add_lock); /* Write and invalidate pagecache range that we're writing to: */ - ret = write_invalidate_inode_pages_range(mapping, req->ki_pos, - req->ki_pos + iov_iter_count(&dio->iter) - 1); + offset = req->ki_pos + (dio->iop.op.written << 9); + ret = write_invalidate_inode_pages_range(mapping, + offset, + offset + iov_iter_count(&dio->iter) - 1); if (unlikely(ret)) goto err; while (1) { + offset = req->ki_pos + (dio->iop.op.written << 9); + BUG_ON(current->pagecache_lock); current->pagecache_lock = &mapping->add_lock; - if (current != dio->task) - use_mm(dio->task->mm); + if (kthread) + use_mm(dio->mm); ret 
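
[Editor's note] Earlier in this hunk, __bch2_writepage() picks how many replicas a writeback can target: what was reserved up front, or, failing that, the number of fully allocated existing replicas — and the latter only counts when the entire page is written (a partial write to a compressed or partially-allocated page can't reuse existing replicas). The decision in isolation; the PAGE_SECTORS value is illustrative:

	#include <stdio.h>

	#define PAGE_SECTORS 8

	static unsigned replicas_this_write(unsigned replicas_reserved,
					    unsigned nr_replicas_allocated,
					    unsigned sectors)
	{
		/* existing replicas only count for whole-page writes */
		unsigned existing = sectors == PAGE_SECTORS
			? nr_replicas_allocated : 0;

		return replicas_reserved > existing ? replicas_reserved : existing;
	}

	int main(void)
	{
		/* reserved 2 replicas up front: can always write 2 */
		printf("%u\n", replicas_this_write(2, 0, 4));
		/* no reservation, page fully allocated 3x: write 3 */
		printf("%u\n", replicas_this_write(0, 3, PAGE_SECTORS));
		/* no reservation, partial page: 0 -> BUG_ON in the real code */
		printf("%u\n", replicas_this_write(0, 3, 4));
		return 0;
	}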
= bio_iov_iter_get_pages(bio, &dio->iter); - if (current != dio->task) - unuse_mm(dio->task->mm); + if (kthread) + unuse_mm(dio->mm); current->pagecache_lock = NULL; if (unlikely(ret < 0)) goto err; - dio->iop.op.pos = POS(inode->v.i_ino, - (req->ki_pos >> 9) + dio->iop.op.written); + /* gup might have faulted pages back in: */ + ret = write_invalidate_inode_pages_range(mapping, + offset, + offset + bio->bi_iter.bi_size - 1); + if (unlikely(ret)) + goto err; + + dio->iop.op.pos = POS(inode->v.i_ino, offset >> 9); task_io_account_write(bio->bi_iter.bi_size); @@ -1850,7 +1893,7 @@ err_wait_io: closure_sync(&dio->cl); loop: - bio_for_each_segment_all(bv, bio, i) + bio_for_each_segment_all(bv, bio, i, iter) put_page(bv->bv_page); if (!dio->iter.count || dio->iop.op.error) break; @@ -1897,7 +1940,6 @@ static int bch2_direct_IO_write(struct kiocb *req, struct bch_fs *c = inode->v.i_sb->s_fs_info; struct dio_write *dio; struct bio *bio; - loff_t offset = req->ki_pos; ssize_t ret; lockdep_assert_held(&inode->v.i_rwsem); @@ -1905,7 +1947,7 @@ static int bch2_direct_IO_write(struct kiocb *req, if (unlikely(!iter->count)) return 0; - if (unlikely((offset|iter->count) & (block_bytes(c) - 1))) + if (unlikely((req->ki_pos|iter->count) & (block_bytes(c) - 1))) return -EINVAL; bio = bio_alloc_bioset(GFP_KERNEL, @@ -1914,15 +1956,15 @@ static int bch2_direct_IO_write(struct kiocb *req, dio = container_of(bio, struct dio_write, iop.op.wbio.bio); closure_init(&dio->cl, NULL); dio->req = req; - dio->task = current; + dio->mm = current->mm; dio->loop = false; dio->sync = is_sync_kiocb(req) || - offset + iter->count > inode->v.i_size; + req->ki_pos + iter->count > inode->v.i_size; dio->free_iov = false; dio->quota_res.sectors = 0; dio->iter = *iter; bch2_fswrite_op_init(&dio->iop, c, inode, io_opts(c, inode), true); - dio->iop.op.write_point = writepoint_hashed((unsigned long) dio->task); + dio->iop.op.write_point = writepoint_hashed((unsigned long) current); dio->iop.op.flags |= BCH_WRITE_NOPUT_RESERVATION; if ((req->ki_flags & IOCB_DSYNC) && @@ -1934,19 +1976,20 @@ static int bch2_direct_IO_write(struct kiocb *req, if (unlikely(ret)) goto err; + dio->iop.op.nr_replicas = dio->iop.op.opts.data_replicas; + ret = bch2_disk_reservation_get(c, &dio->iop.op.res, iter->count >> 9, dio->iop.op.opts.data_replicas, 0); if (unlikely(ret)) { - if (bch2_check_range_allocated(c, POS(inode->v.i_ino, - offset >> 9), - iter->count >> 9)) + if (!bch2_check_range_allocated(c, POS(inode->v.i_ino, + req->ki_pos >> 9), + iter->count >> 9, + dio->iop.op.opts.data_replicas)) goto err; dio->iop.unalloc = true; } - dio->iop.op.nr_replicas = dio->iop.op.res.nr_replicas; - return bch2_dio_write_loop(dio); err: bch2_disk_reservation_put(c, &dio->iop.op.res); @@ -2027,20 +2070,101 @@ int bch2_fsync(struct file *file, loff_t start, loff_t end, int datasync) { struct bch_inode_info *inode = file_bch_inode(file); struct bch_fs *c = inode->v.i_sb->s_fs_info; - int ret; + int ret, ret2; - ret = filemap_write_and_wait_range(inode->v.i_mapping, start, end); + ret = file_write_and_wait_range(file, start, end); if (ret) return ret; - if (c->opts.journal_flush_disabled) - return 0; + if (datasync && !(inode->v.i_state & I_DIRTY_DATASYNC)) + goto out; + + ret = sync_inode_metadata(&inode->v, 1); + if (ret) + return ret; +out: + if (!c->opts.journal_flush_disabled) + ret = bch2_journal_flush_seq(&c->journal, + inode->ei_journal_seq); + ret2 = file_check_and_advance_wb_err(file); - return bch2_journal_flush_seq(&c->journal, inode->ei_journal_seq); + 
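/*
 * Sketch of the error-aggregation idiom in the bch2_fsync() rework
 * just below: GCC's "a ?: b" yields a if it is nonzero, else b, so
 * "ret ?: ret2" reports the first failure while still letting the
 * later writeback-error check run. The function names here are
 * hypothetical stand-ins, not bcachefs API.
 */
#include <errno.h>

static int example_flush(void)        { return 0; }
static int example_check_wb_err(void) { return -EIO; }

static int example_fsync(void)
{
        int ret  = example_flush();        /* primary path */
        int ret2 = example_check_wb_err(); /* always consulted */

        return ret ?: ret2;                /* first nonzero error wins */
}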
return ret ?: ret2; } /* truncate: */ +static int __bch2_fpunch(struct bch_fs *c, struct bch_inode_info *inode, + u64 start_offset, u64 end_offset, u64 *journal_seq) +{ + struct bpos start = POS(inode->v.i_ino, start_offset); + struct bpos end = POS(inode->v.i_ino, end_offset); + unsigned max_sectors = KEY_SIZE_MAX & (~0 << c->block_bits); + struct btree_trans trans; + struct btree_iter *iter; + struct bkey_s_c k; + int ret = 0; + + bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024); + + iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, start, + BTREE_ITER_INTENT); + + while ((k = bch2_btree_iter_peek(iter)).k && + !(ret = bkey_err(k)) && + bkey_cmp(iter->pos, end) < 0) { + struct disk_reservation disk_res = + bch2_disk_reservation_init(c, 0); + struct bkey_i delete; + + bkey_init(&delete.k); + delete.k.p = iter->pos; + + /* create the biggest key we can */ + bch2_key_resize(&delete.k, max_sectors); + bch2_cut_back(end, &delete.k); + + ret = bch2_extent_update(&trans, inode, + &disk_res, NULL, iter, &delete, + 0, true, true, NULL); + bch2_disk_reservation_put(c, &disk_res); + + if (ret == -EINTR) + ret = 0; + if (ret) + break; + + bch2_trans_cond_resched(&trans); + } + + bch2_trans_exit(&trans); + + return ret; +} + +static inline int range_has_data(struct bch_fs *c, + struct bpos start, + struct bpos end) +{ + struct btree_trans trans; + struct btree_iter *iter; + struct bkey_s_c k; + int ret = 0; + + bch2_trans_init(&trans, c, 0, 0); + + for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, start, 0, k, ret) { + if (bkey_cmp(bkey_start_pos(k.k), end) >= 0) + break; + + if (bkey_extent_is_data(k.k)) { + ret = 1; + break; + } + } + + return bch2_trans_exit(&trans) ?: ret; +} + static int __bch2_truncate_page(struct bch_inode_info *inode, pgoff_t index, loff_t start, loff_t end) { @@ -2062,30 +2186,16 @@ static int __bch2_truncate_page(struct bch_inode_info *inode, page = find_lock_page(mapping, index); if (!page) { - struct btree_iter iter; - struct bkey_s_c k = bkey_s_c_null; - /* * XXX: we're doing two index lookups when we end up reading the * page */ - for_each_btree_key(&iter, c, BTREE_ID_EXTENTS, - POS(inode->v.i_ino, - index << PAGE_SECTOR_SHIFT), 0, k) { - if (bkey_cmp(bkey_start_pos(k.k), - POS(inode->v.i_ino, - (index + 1) << PAGE_SECTOR_SHIFT)) >= 0) - break; + ret = range_has_data(c, + POS(inode->v.i_ino, index << PAGE_SECTOR_SHIFT), + POS(inode->v.i_ino, (index + 1) << PAGE_SECTOR_SHIFT)); + if (ret <= 0) + return ret; - if (k.k->type != KEY_TYPE_DISCARD && - k.k->type != BCH_RESERVATION) { - bch2_btree_iter_unlock(&iter); - goto create; - } - } - bch2_btree_iter_unlock(&iter); - return 0; -create: page = find_or_create_page(mapping, index, GFP_KERNEL); if (unlikely(!page)) { ret = -ENOMEM; @@ -2131,65 +2241,111 @@ static int bch2_truncate_page(struct bch_inode_info *inode, loff_t from) from, from + PAGE_SIZE); } +static int bch2_extend(struct bch_inode_info *inode, struct iattr *iattr) +{ + struct bch_fs *c = inode->v.i_sb->s_fs_info; + struct address_space *mapping = inode->v.i_mapping; + int ret; + + ret = filemap_write_and_wait_range(mapping, + inode->ei_inode.bi_size, S64_MAX); + if (ret) + return ret; + + truncate_setsize(&inode->v, iattr->ia_size); + setattr_copy(&inode->v, iattr); + + mutex_lock(&inode->ei_update_lock); + ret = bch2_write_inode_size(c, inode, inode->v.i_size, + ATTR_MTIME|ATTR_CTIME); + mutex_unlock(&inode->ei_update_lock); + + return ret; +} + +static int bch2_truncate_finish_fn(struct bch_inode_info *inode, + struct bch_inode_unpacked *bi, + void *p) 
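/*
 * Control-flow sketch of __bch2_fpunch() above: delete at most one
 * maximally-sized key per iteration, treat -EINTR from the update as
 * a transaction restart (retry the same position rather than failing),
 * and yield between iterations. All names below are hypothetical;
 * this shows the pattern, not the btree API.
 */
#include <errno.h>

struct example_cursor { unsigned long long pos, end; };

/* stub standing in for one atomic delete; may return -EINTR */
static int example_delete_one(struct example_cursor *c)
{
        c->pos++;       /* pretend one unit was deleted */
        return 0;
}

static int example_delete_range(struct example_cursor *c)
{
        int ret = 0;

        while (c->pos < c->end) {
                ret = example_delete_one(c);
                if (ret == -EINTR) {    /* restart: cursor unchanged */
                        ret = 0;
                        continue;
                }
                if (ret)
                        break;
        }
        return ret;
}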
+{ + struct bch_fs *c = inode->v.i_sb->s_fs_info; + + bi->bi_flags &= ~BCH_INODE_I_SIZE_DIRTY; + bi->bi_mtime = bi->bi_ctime = bch2_current_time(c); + return 0; +} + +static int bch2_truncate_start_fn(struct bch_inode_info *inode, + struct bch_inode_unpacked *bi, void *p) +{ + u64 *new_i_size = p; + + bi->bi_flags |= BCH_INODE_I_SIZE_DIRTY; + bi->bi_size = *new_i_size; + return 0; +} + int bch2_truncate(struct bch_inode_info *inode, struct iattr *iattr) { struct bch_fs *c = inode->v.i_sb->s_fs_info; struct address_space *mapping = inode->v.i_mapping; - bool shrink = iattr->ia_size <= inode->v.i_size; - struct i_sectors_hook i_sectors_hook = - i_sectors_hook_init(inode, BCH_INODE_I_SIZE_DIRTY); + u64 new_i_size = iattr->ia_size; + bool shrink; int ret = 0; inode_dio_wait(&inode->v); pagecache_block_get(&mapping->add_lock); - truncate_setsize(&inode->v, iattr->ia_size); + BUG_ON(inode->v.i_size < inode->ei_inode.bi_size); + + shrink = iattr->ia_size <= inode->v.i_size; + + if (!shrink) { + ret = bch2_extend(inode, iattr); + goto err; + } + + ret = bch2_truncate_page(inode, iattr->ia_size); + if (unlikely(ret)) + goto err; - /* sync appends.. */ - /* XXX what protects inode->i_size? */ if (iattr->ia_size > inode->ei_inode.bi_size) ret = filemap_write_and_wait_range(mapping, - inode->ei_inode.bi_size, S64_MAX); + inode->ei_inode.bi_size, + iattr->ia_size - 1); + else if (iattr->ia_size & (PAGE_SIZE - 1)) + ret = filemap_write_and_wait_range(mapping, + round_down(iattr->ia_size, PAGE_SIZE), + iattr->ia_size - 1); if (ret) - goto err_put_pagecache; + goto err; - i_sectors_hook.new_i_size = iattr->ia_size; + mutex_lock(&inode->ei_update_lock); + ret = bch2_write_inode(c, inode, bch2_truncate_start_fn, + &new_i_size, 0); + mutex_unlock(&inode->ei_update_lock); - ret = i_sectors_dirty_start(c, &i_sectors_hook); if (unlikely(ret)) goto err; + truncate_setsize(&inode->v, iattr->ia_size); + /* - * There might be persistent reservations (from fallocate()) - * above i_size, which bch2_inode_truncate() will discard - we're - * only supposed to discard them if we're doing a real truncate - * here (new i_size < current i_size): + * XXX: need a comment explaining why PAGE_SIZE and not block_bytes() + * here: */ - if (shrink) { - ret = bch2_truncate_page(inode, iattr->ia_size); - if (unlikely(ret)) - goto err; - - ret = bch2_inode_truncate(c, inode->v.i_ino, - round_up(iattr->ia_size, PAGE_SIZE) >> 9, - &i_sectors_hook.hook, - &inode->ei_journal_seq); - if (unlikely(ret)) - goto err; - } + ret = __bch2_fpunch(c, inode, + round_up(iattr->ia_size, PAGE_SIZE) >> 9, + U64_MAX, &inode->ei_journal_seq); + if (unlikely(ret)) + goto err; setattr_copy(&inode->v, iattr); - inode->v.i_mtime = inode->v.i_ctime = current_time(&inode->v); -err: - /* - * On error - in particular, bch2_truncate_page() error - don't clear - * I_SIZE_DIRTY, as we've left data above i_size!: - */ - if (ret) - i_sectors_hook.flags &= ~BCH_INODE_I_SIZE_DIRTY; - ret = i_sectors_dirty_finish(c, &i_sectors_hook) ?: ret; -err_put_pagecache: + mutex_lock(&inode->ei_update_lock); + ret = bch2_write_inode(c, inode, bch2_truncate_finish_fn, NULL, + ATTR_MTIME|ATTR_CTIME); + mutex_unlock(&inode->ei_update_lock); +err: pagecache_block_put(&mapping->add_lock); return ret; } @@ -2200,7 +2356,6 @@ static long bch2_fpunch(struct bch_inode_info *inode, loff_t offset, loff_t len) { struct bch_fs *c = inode->v.i_sb->s_fs_info; struct address_space *mapping = inode->v.i_mapping; - u64 ino = inode->v.i_ino; u64 discard_start = round_up(offset, PAGE_SIZE) >> 9; u64 
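/*
 * Sketch of the two-phase truncate ordering implemented above: phase
 * one persists the new size together with an "i_size dirty" flag
 * before any data is dropped; phase two clears the flag only once
 * the tail extents are gone, so a crash in between leaves the flag
 * set and recovery can finish the truncate. Names are hypothetical.
 */
enum { EXAMPLE_I_SIZE_DIRTY = 1 << 0 };

struct example_inode { unsigned flags; unsigned long long size; };

static int example_truncate(struct example_inode *ino,
                            unsigned long long new_size,
                            int (*punch_tail)(struct example_inode *))
{
        int ret;

        /* phase 1: commit new size + dirty flag (journaled in bcachefs) */
        ino->flags |= EXAMPLE_I_SIZE_DIRTY;
        ino->size = new_size;

        /* drop pages/extents past new_size; a crash here is recoverable */
        ret = punch_tail(ino);
        if (ret)
                return ret;     /* flag stays set: recovery redoes the punch */

        /* phase 2: truncate fully done, clear the flag */
        ino->flags &= ~EXAMPLE_I_SIZE_DIRTY;
        return 0;
}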
discard_end = round_down(offset + len, PAGE_SIZE) >> 9; int ret = 0; @@ -2226,34 +2381,9 @@ static long bch2_fpunch(struct bch_inode_info *inode, loff_t offset, loff_t len) truncate_pagecache_range(&inode->v, offset, offset + len - 1); - if (discard_start < discard_end) { - /* - * We need to pass in a disk reservation here because we might - * be splitting a compressed extent into two. This isn't a - * problem with truncate because truncate will never split an - * extent, only truncate it... - */ - struct disk_reservation disk_res = - bch2_disk_reservation_init(c, 0); - struct i_sectors_hook i_sectors_hook = - i_sectors_hook_init(inode, 0); - int ret; - - ret = i_sectors_dirty_start(c, &i_sectors_hook); - if (unlikely(ret)) - goto err; - - ret = bch2_btree_delete_range(c, - BTREE_ID_EXTENTS, - POS(ino, discard_start), - POS(ino, discard_end), - ZERO_VERSION, - &disk_res, - &i_sectors_hook.hook, - &inode->ei_journal_seq); - - ret = i_sectors_dirty_finish(c, &i_sectors_hook) ?: ret; - } + if (discard_start < discard_end) + ret = __bch2_fpunch(c, inode, discard_start, discard_end, + &inode->ei_journal_seq); err: pagecache_block_put(&mapping->add_lock); inode_unlock(&inode->v); @@ -2266,24 +2396,17 @@ static long bch2_fcollapse(struct bch_inode_info *inode, { struct bch_fs *c = inode->v.i_sb->s_fs_info; struct address_space *mapping = inode->v.i_mapping; - struct btree_iter src; - struct btree_iter dst; + struct btree_trans trans; + struct btree_iter *src, *dst; BKEY_PADDED(k) copy; struct bkey_s_c k; - struct i_sectors_hook i_sectors_hook = i_sectors_hook_init(inode, 0); loff_t new_size; int ret; - if ((offset | len) & (PAGE_SIZE - 1)) + if ((offset | len) & (block_bytes(c) - 1)) return -EINVAL; - bch2_btree_iter_init(&dst, c, BTREE_ID_EXTENTS, - POS(inode->v.i_ino, offset >> 9), - BTREE_ITER_SLOTS|BTREE_ITER_INTENT); - /* position will be set from dst iter's position: */ - bch2_btree_iter_init(&src, c, BTREE_ID_EXTENTS, POS_MIN, - BTREE_ITER_SLOTS); - bch2_btree_iter_link(&src, &dst); + bch2_trans_init(&trans, c, BTREE_ITER_MAX, 256); /* * We need i_mutex to keep the page cache consistent with the extents @@ -2308,76 +2431,79 @@ static long bch2_fcollapse(struct bch_inode_info *inode, if (ret) goto err; - ret = i_sectors_dirty_start(c, &i_sectors_hook); - if (ret) - goto err; + dst = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, + POS(inode->v.i_ino, offset >> 9), + BTREE_ITER_SLOTS|BTREE_ITER_INTENT); + BUG_ON(IS_ERR_OR_NULL(dst)); + + src = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, + POS_MIN, BTREE_ITER_SLOTS); + BUG_ON(IS_ERR_OR_NULL(src)); - while (bkey_cmp(dst.pos, + while (bkey_cmp(dst->pos, POS(inode->v.i_ino, round_up(new_size, PAGE_SIZE) >> 9)) < 0) { struct disk_reservation disk_res; - bch2_btree_iter_set_pos(&src, - POS(dst.pos.inode, dst.pos.offset + (len >> 9))); + ret = bch2_btree_iter_traverse(dst); + if (ret) + goto bkey_err; - k = bch2_btree_iter_peek_slot(&src); - if ((ret = btree_iter_err(k))) - goto btree_iter_err; + bch2_btree_iter_set_pos(src, + POS(dst->pos.inode, dst->pos.offset + (len >> 9))); - bkey_reassemble(©.k, k); + k = bch2_btree_iter_peek_slot(src); + if ((ret = bkey_err(k))) + goto bkey_err; - if (bkey_deleted(©.k.k)) - copy.k.k.type = KEY_TYPE_DISCARD; + bkey_reassemble(©.k, k); - bch2_cut_front(src.pos, ©.k); + bch2_cut_front(src->pos, ©.k); copy.k.k.p.offset -= len >> 9; - BUG_ON(bkey_cmp(dst.pos, bkey_start_pos(©.k.k))); + bch2_extent_trim_atomic(©.k, dst); + + BUG_ON(bkey_cmp(dst->pos, bkey_start_pos(©.k.k))); ret = bch2_disk_reservation_get(c, 
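/*
 * Sketch of the alignment math in bch2_fpunch() above: only whole
 * pages inside [offset, offset + len) are punched out of the extent
 * tree (in 512-byte sectors, hence ">> 9"); the partial head and
 * tail pages are zeroed in the page cache by __bch2_truncate_page()
 * instead. Hypothetical helper using the kernel's round_up()/
 * round_down().
 */
#include <linux/kernel.h>
#include <linux/mm.h>

static bool example_fpunch_sectors(u64 offset, u64 len, u64 *start, u64 *end)
{
        *start = round_up(offset, PAGE_SIZE) >> 9;
        *end   = round_down(offset + len, PAGE_SIZE) >> 9;

        return *start < *end;   /* false: range lies within a single page */
}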
&disk_res, copy.k.k.size, - bch2_extent_nr_dirty_ptrs(bkey_i_to_s_c(©.k)), + bch2_bkey_nr_dirty_ptrs(bkey_i_to_s_c(©.k)), BCH_DISK_RESERVATION_NOFAIL); BUG_ON(ret); - ret = bch2_btree_insert_at(c, &disk_res, &i_sectors_hook.hook, - &inode->ei_journal_seq, - BTREE_INSERT_ATOMIC| - BTREE_INSERT_NOFAIL, - BTREE_INSERT_ENTRY(&dst, ©.k)); + ret = bch2_extent_update(&trans, inode, + &disk_res, NULL, + dst, ©.k, + 0, true, true, NULL); bch2_disk_reservation_put(c, &disk_res); -btree_iter_err: +bkey_err: if (ret == -EINTR) ret = 0; if (ret) - goto err_put_sectors_dirty; + goto err; /* * XXX: if we error here we've left data with multiple * pointers... which isn't a _super_ serious problem... */ - bch2_btree_iter_cond_resched(&src); + bch2_trans_cond_resched(&trans); } + bch2_trans_unlock(&trans); - bch2_btree_iter_unlock(&src); - bch2_btree_iter_unlock(&dst); - - ret = bch2_inode_truncate(c, inode->v.i_ino, - round_up(new_size, PAGE_SIZE) >> 9, - &i_sectors_hook.hook, - &inode->ei_journal_seq); + ret = __bch2_fpunch(c, inode, + round_up(new_size, block_bytes(c)) >> 9, + U64_MAX, &inode->ei_journal_seq); if (ret) - goto err_put_sectors_dirty; + goto err; i_size_write(&inode->v, new_size); - i_sectors_hook.new_i_size = new_size; -err_put_sectors_dirty: - ret = i_sectors_dirty_finish(c, &i_sectors_hook) ?: ret; + mutex_lock(&inode->ei_update_lock); + ret = bch2_write_inode_size(c, inode, new_size, + ATTR_MTIME|ATTR_CTIME); + mutex_unlock(&inode->ei_update_lock); err: + bch2_trans_exit(&trans); pagecache_block_put(&mapping->add_lock); inode_unlock(&inode->v); - - bch2_btree_iter_unlock(&src); - bch2_btree_iter_unlock(&dst); return ret; } @@ -2386,8 +2512,8 @@ static long bch2_fallocate(struct bch_inode_info *inode, int mode, { struct address_space *mapping = inode->v.i_mapping; struct bch_fs *c = inode->v.i_sb->s_fs_info; - struct i_sectors_hook i_sectors_hook = i_sectors_hook_init(inode, 0); - struct btree_iter iter; + struct btree_trans trans; + struct btree_iter *iter; struct bpos end_pos; loff_t block_start, block_end; loff_t end = offset + len; @@ -2395,8 +2521,7 @@ static long bch2_fallocate(struct bch_inode_info *inode, int mode, unsigned replicas = io_opts(c, inode).data_replicas; int ret; - bch2_btree_iter_init(&iter, c, BTREE_ID_EXTENTS, POS_MIN, - BTREE_ITER_SLOTS|BTREE_ITER_INTENT); + bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); inode_lock(&inode->v); inode_dio_wait(&inode->v); @@ -2431,53 +2556,51 @@ static long bch2_fallocate(struct bch_inode_info *inode, int mode, block_end = round_up(end, PAGE_SIZE); } - bch2_btree_iter_set_pos(&iter, POS(inode->v.i_ino, block_start >> 9)); + iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, + POS(inode->v.i_ino, block_start >> 9), + BTREE_ITER_SLOTS|BTREE_ITER_INTENT); end_pos = POS(inode->v.i_ino, block_end >> 9); - ret = i_sectors_dirty_start(c, &i_sectors_hook); - if (unlikely(ret)) - goto err; - - while (bkey_cmp(iter.pos, end_pos) < 0) { + while (bkey_cmp(iter->pos, end_pos) < 0) { struct disk_reservation disk_res = { 0 }; + struct quota_res quota_res = { 0 }; struct bkey_i_reservation reservation; struct bkey_s_c k; - k = bch2_btree_iter_peek_slot(&iter); - if ((ret = btree_iter_err(k))) - goto btree_iter_err; + k = bch2_btree_iter_peek_slot(iter); + if ((ret = bkey_err(k))) + goto bkey_err; /* already reserved */ - if (k.k->type == BCH_RESERVATION && + if (k.k->type == KEY_TYPE_reservation && bkey_s_c_to_reservation(k).v->nr_replicas >= replicas) { - bch2_btree_iter_next_slot(&iter); + bch2_btree_iter_next_slot(iter); continue; } - if 
(bkey_extent_is_data(k.k)) { - if (!(mode & FALLOC_FL_ZERO_RANGE)) { - bch2_btree_iter_next_slot(&iter); - continue; - } + if (bkey_extent_is_data(k.k) && + !(mode & FALLOC_FL_ZERO_RANGE)) { + bch2_btree_iter_next_slot(iter); + continue; } bkey_reservation_init(&reservation.k_i); - reservation.k.type = BCH_RESERVATION; + reservation.k.type = KEY_TYPE_reservation; reservation.k.p = k.k->p; reservation.k.size = k.k->size; - bch2_cut_front(iter.pos, &reservation.k_i); + bch2_cut_front(iter->pos, &reservation.k_i); bch2_cut_back(end_pos, &reservation.k); sectors = reservation.k.size; - reservation.v.nr_replicas = bch2_extent_nr_dirty_ptrs(k); + reservation.v.nr_replicas = bch2_bkey_nr_dirty_ptrs(k); if (!bkey_extent_is_allocation(k.k)) { ret = bch2_quota_reservation_add(c, inode, - &i_sectors_hook.quota_res, + "a_res, sectors, true); if (unlikely(ret)) - goto err_put_sectors_dirty; + goto bkey_err; } if (reservation.v.nr_replicas < replicas || @@ -2485,32 +2608,31 @@ static long bch2_fallocate(struct bch_inode_info *inode, int mode, ret = bch2_disk_reservation_get(c, &disk_res, sectors, replicas, 0); if (unlikely(ret)) - goto err_put_sectors_dirty; + goto bkey_err; reservation.v.nr_replicas = disk_res.nr_replicas; } - ret = bch2_btree_insert_at(c, &disk_res, &i_sectors_hook.hook, - &inode->ei_journal_seq, - BTREE_INSERT_ATOMIC| - BTREE_INSERT_NOFAIL, - BTREE_INSERT_ENTRY(&iter, &reservation.k_i)); + ret = bch2_extent_update(&trans, inode, + &disk_res, "a_res, + iter, &reservation.k_i, + 0, true, true, NULL); +bkey_err: + bch2_quota_reservation_put(c, inode, "a_res); bch2_disk_reservation_put(c, &disk_res); -btree_iter_err: - if (ret < 0 && ret != -EINTR) - goto err_put_sectors_dirty; - + if (ret == -EINTR) + ret = 0; + if (ret) + goto err; } - bch2_btree_iter_unlock(&iter); - - ret = i_sectors_dirty_finish(c, &i_sectors_hook) ?: ret; + bch2_trans_unlock(&trans); if (!(mode & FALLOC_FL_KEEP_SIZE) && end > inode->v.i_size) { i_size_write(&inode->v, end); mutex_lock(&inode->ei_update_lock); - ret = bch2_write_inode_size(c, inode, inode->v.i_size); + ret = bch2_write_inode_size(c, inode, inode->v.i_size, 0); mutex_unlock(&inode->ei_update_lock); } @@ -2526,19 +2648,13 @@ btree_iter_err: if (inode->ei_inode.bi_size != inode->v.i_size) { mutex_lock(&inode->ei_update_lock); - ret = bch2_write_inode_size(c, inode, inode->v.i_size); + ret = bch2_write_inode_size(c, inode, + inode->v.i_size, 0); mutex_unlock(&inode->ei_update_lock); } } - - pagecache_block_put(&mapping->add_lock); - inode_unlock(&inode->v); - - return 0; -err_put_sectors_dirty: - ret = i_sectors_dirty_finish(c, &i_sectors_hook) ?: ret; err: - bch2_btree_iter_unlock(&iter); + bch2_trans_exit(&trans); pagecache_block_put(&mapping->add_lock); inode_unlock(&inode->v); return ret; @@ -2565,6 +2681,8 @@ long bch2_fallocate_dispatch(struct file *file, int mode, static bool page_is_data(struct page *page) { + EBUG_ON(!PageLocked(page)); + /* XXX: should only have to check PageDirty */ return PagePrivate(page) && (page_state(page)->sectors || @@ -2604,7 +2722,8 @@ static loff_t bch2_seek_data(struct file *file, u64 offset) { struct bch_inode_info *inode = file_bch_inode(file); struct bch_fs *c = inode->v.i_sb->s_fs_info; - struct btree_iter iter; + struct btree_trans trans; + struct btree_iter *iter; struct bkey_s_c k; u64 isize, next_data = MAX_LFS_FILESIZE; int ret; @@ -2613,8 +2732,10 @@ static loff_t bch2_seek_data(struct file *file, u64 offset) if (offset >= isize) return -ENXIO; - for_each_btree_key(&iter, c, BTREE_ID_EXTENTS, - 
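/*
 * Decision table for one extent-tree slot in bch2_fallocate() above,
 * as a sketch: reservation keys reserve space without allocating it,
 * and existing data is left alone unless FALLOC_FL_ZERO_RANGE asked
 * for it to be replaced. Types and helper are hypothetical.
 */
#include <stdbool.h>

enum ex_key_type { EX_HOLE, EX_DATA, EX_RESERVATION };

struct ex_slot { enum ex_key_type type; unsigned nr_replicas; };

static bool ex_slot_needs_reservation(const struct ex_slot *s,
                                      unsigned want_replicas,
                                      bool zero_range)
{
        /* already reserved with enough replicas: skip */
        if (s->type == EX_RESERVATION && s->nr_replicas >= want_replicas)
                return false;
        /* real data is preserved unless the caller asked to zero it */
        if (s->type == EX_DATA && !zero_range)
                return false;
        return true;    /* hole, thin reservation, or data being zeroed */
}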
POS(inode->v.i_ino, offset >> 9), 0, k) { + bch2_trans_init(&trans, c, 0, 0); + + for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, + POS(inode->v.i_ino, offset >> 9), 0, k, ret) { if (k.k->p.inode != inode->v.i_ino) { break; } else if (bkey_extent_is_data(k.k)) { @@ -2624,7 +2745,7 @@ static loff_t bch2_seek_data(struct file *file, u64 offset) break; } - ret = bch2_btree_iter_unlock(&iter); + ret = bch2_trans_exit(&trans) ?: ret; if (ret) return ret; @@ -2644,7 +2765,7 @@ static bool page_slot_is_data(struct address_space *mapping, pgoff_t index) bool ret; page = find_lock_entry(mapping, index); - if (!page || radix_tree_exception(page)) + if (!page || xa_is_value(page)) return false; ret = page_is_data(page); @@ -2674,7 +2795,8 @@ static loff_t bch2_seek_hole(struct file *file, u64 offset) { struct bch_inode_info *inode = file_bch_inode(file); struct bch_fs *c = inode->v.i_sb->s_fs_info; - struct btree_iter iter; + struct btree_trans trans; + struct btree_iter *iter; struct bkey_s_c k; u64 isize, next_hole = MAX_LFS_FILESIZE; int ret; @@ -2683,9 +2805,11 @@ static loff_t bch2_seek_hole(struct file *file, u64 offset) if (offset >= isize) return -ENXIO; - for_each_btree_key(&iter, c, BTREE_ID_EXTENTS, + bch2_trans_init(&trans, c, 0, 0); + + for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, POS(inode->v.i_ino, offset >> 9), - BTREE_ITER_SLOTS, k) { + BTREE_ITER_SLOTS, k, ret) { if (k.k->p.inode != inode->v.i_ino) { next_hole = bch2_next_pagecache_hole(&inode->v, offset, MAX_LFS_FILESIZE); @@ -2702,7 +2826,7 @@ static loff_t bch2_seek_hole(struct file *file, u64 offset) } } - ret = bch2_btree_iter_unlock(&iter); + ret = bch2_trans_exit(&trans) ?: ret; if (ret) return ret; diff --git a/fs/bcachefs/fs-io.h b/fs/bcachefs/fs-io.h index 30d1ea9d2b85..88060b8785c3 100644 --- a/fs/bcachefs/fs-io.h +++ b/fs/bcachefs/fs-io.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _BCACHEFS_FS_IO_H #define _BCACHEFS_FS_IO_H @@ -33,7 +34,7 @@ long bch2_fallocate_dispatch(struct file *, int, loff_t, loff_t); loff_t bch2_llseek(struct file *, loff_t, int); -int bch2_page_mkwrite(struct vm_fault *); +vm_fault_t bch2_page_mkwrite(struct vm_fault *); void bch2_invalidatepage(struct page *, unsigned int, unsigned int); int bch2_releasepage(struct page *, gfp_t); int bch2_migrate_page(struct address_space *, struct page *, diff --git a/fs/bcachefs/fs-ioctl.c b/fs/bcachefs/fs-ioctl.c index 2c1ecf7732cd..971744ba3bf9 100644 --- a/fs/bcachefs/fs-ioctl.c +++ b/fs/bcachefs/fs-ioctl.c @@ -1,7 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0 #ifndef NO_BCACHEFS_FS #include "bcachefs.h" #include "chardev.h" +#include "dirent.h" #include "fs.h" #include "fs-ioctl.h" #include "quota.h" @@ -11,88 +13,18 @@ #define FS_IOC_GOINGDOWN _IOR('X', 125, __u32) -/* Inode flags: */ - -/* bcachefs inode flags -> vfs inode flags: */ -static const unsigned bch_flags_to_vfs[] = { - [__BCH_INODE_SYNC] = S_SYNC, - [__BCH_INODE_IMMUTABLE] = S_IMMUTABLE, - [__BCH_INODE_APPEND] = S_APPEND, - [__BCH_INODE_NOATIME] = S_NOATIME, -}; - -/* bcachefs inode flags -> FS_IOC_GETFLAGS: */ -static const unsigned bch_flags_to_uflags[] = { - [__BCH_INODE_SYNC] = FS_SYNC_FL, - [__BCH_INODE_IMMUTABLE] = FS_IMMUTABLE_FL, - [__BCH_INODE_APPEND] = FS_APPEND_FL, - [__BCH_INODE_NODUMP] = FS_NODUMP_FL, - [__BCH_INODE_NOATIME] = FS_NOATIME_FL, -}; - -/* bcachefs inode flags -> FS_IOC_FSGETXATTR: */ -static const unsigned bch_flags_to_xflags[] = { - [__BCH_INODE_SYNC] = FS_XFLAG_SYNC, - [__BCH_INODE_IMMUTABLE] = FS_XFLAG_IMMUTABLE, - [__BCH_INODE_APPEND] 
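/*
 * The page_slot_is_data() hunk above swaps radix_tree_exception()
 * for xa_is_value(): page-cache lookups may return XArray value
 * entries (e.g. shadow entries) rather than real page pointers.
 * Minimal sketch of the encoding, after include/linux/xarray.h —
 * a tagged pointer with bit 0 set.
 */
#include <stdbool.h>
#include <stdint.h>

static inline bool example_xa_is_value(const void *entry)
{
        return (uintptr_t)entry & 1;    /* value entries have bit 0 set */
}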
= FS_XFLAG_APPEND, - [__BCH_INODE_NODUMP] = FS_XFLAG_NODUMP, - [__BCH_INODE_NOATIME] = FS_XFLAG_NOATIME, - //[__BCH_INODE_PROJINHERIT] = FS_XFLAG_PROJINHERIT; -}; - -#define set_flags(_map, _in, _out) \ -do { \ - unsigned _i; \ - \ - for (_i = 0; _i < ARRAY_SIZE(_map); _i++) \ - if ((_in) & (1 << _i)) \ - (_out) |= _map[_i]; \ - else \ - (_out) &= ~_map[_i]; \ -} while (0) - -#define map_flags(_map, _in) \ -({ \ - unsigned _out = 0; \ - \ - set_flags(_map, _in, _out); \ - _out; \ -}) - -#define map_flags_rev(_map, _in) \ -({ \ - unsigned _i, _out = 0; \ - \ - for (_i = 0; _i < ARRAY_SIZE(_map); _i++) \ - if ((_in) & _map[_i]) { \ - (_out) |= 1 << _i; \ - (_in) &= ~_map[_i]; \ - } \ - (_out); \ -}) - -#define map_defined(_map) \ -({ \ - unsigned _in = ~0; \ - \ - map_flags_rev(_map, _in); \ -}) - -/* Set VFS inode flags from bcachefs inode: */ -void bch2_inode_flags_to_vfs(struct bch_inode_info *inode) -{ - set_flags(bch_flags_to_vfs, inode->ei_inode.bi_flags, inode->v.i_flags); -} - struct flags_set { unsigned mask; unsigned flags; + + unsigned projid; }; static int bch2_inode_flags_set(struct bch_inode_info *inode, struct bch_inode_unpacked *bi, void *p) { + struct bch_fs *c = inode->v.i_sb->s_fs_info; /* * We're relying on btree locking here for exclusion with other ioctl * calls - use the flags in the btree (@bi), not inode->i_flags: @@ -105,14 +37,15 @@ static int bch2_inode_flags_set(struct bch_inode_info *inode, !capable(CAP_LINUX_IMMUTABLE)) return -EPERM; - if (!S_ISREG(inode->v.i_mode) && - !S_ISDIR(inode->v.i_mode) && + if (!S_ISREG(bi->bi_mode) && + !S_ISDIR(bi->bi_mode) && (newflags & (BCH_INODE_NODUMP|BCH_INODE_NOATIME)) != newflags) return -EINVAL; bi->bi_flags &= ~s->mask; bi->bi_flags |= newflags; - inode->v.i_ctime = current_time(&inode->v); + + bi->bi_ctime = timespec_to_bch2_time(c, current_time(&inode->v)); return 0; } @@ -150,10 +83,8 @@ static int bch2_ioc_setflags(struct bch_fs *c, } mutex_lock(&inode->ei_update_lock); - ret = __bch2_write_inode(c, inode, bch2_inode_flags_set, &s); - - if (!ret) - bch2_inode_flags_to_vfs(inode); + ret = bch2_write_inode(c, inode, bch2_inode_flags_set, &s, + ATTR_CTIME); mutex_unlock(&inode->ei_update_lock); setflags_out: @@ -173,26 +104,18 @@ static int bch2_ioc_fsgetxattr(struct bch_inode_info *inode, return copy_to_user(arg, &fa, sizeof(fa)); } -static int bch2_set_projid(struct bch_fs *c, - struct bch_inode_info *inode, - u32 projid) +static int fssetxattr_inode_update_fn(struct bch_inode_info *inode, + struct bch_inode_unpacked *bi, + void *p) { - struct bch_qid qid = inode->ei_qid; - int ret; - - if (projid == inode->ei_qid.q[QTYP_PRJ]) - return 0; - - qid.q[QTYP_PRJ] = projid; + struct flags_set *s = p; - ret = bch2_quota_transfer(c, 1 << QTYP_PRJ, qid, inode->ei_qid, - inode->v.i_blocks + - inode->ei_quota_reserved); - if (ret) - return ret; + if (s->projid != bi->bi_project) { + bi->bi_fields_set |= 1U << Inode_opt_project; + bi->bi_project = s->projid; + } - inode->ei_qid.q[QTYP_PRJ] = projid; - return 0; + return bch2_inode_flags_set(inode, bi, p); } static int bch2_ioc_fssetxattr(struct bch_fs *c, @@ -211,6 +134,11 @@ static int bch2_ioc_fssetxattr(struct bch_fs *c, if (fa.fsx_xflags) return -EOPNOTSUPP; + if (fa.fsx_projid >= U32_MAX) + return -EINVAL; + + s.projid = fa.fsx_projid + 1; + ret = mnt_want_write_file(file); if (ret) return ret; @@ -222,13 +150,12 @@ static int bch2_ioc_fssetxattr(struct bch_fs *c, } mutex_lock(&inode->ei_update_lock); - ret = bch2_set_projid(c, inode, fa.fsx_projid); + ret = 
bch2_set_projid(c, inode, s.projid); if (ret) goto err_unlock; - ret = __bch2_write_inode(c, inode, bch2_inode_flags_set, &s); - if (!ret) - bch2_inode_flags_to_vfs(inode); + ret = bch2_write_inode(c, inode, fssetxattr_inode_update_fn, &s, + ATTR_CTIME); err_unlock: mutex_unlock(&inode->ei_update_lock); err: @@ -237,6 +164,75 @@ err: return ret; } +static int bch2_ioc_reinherit_attrs(struct bch_fs *c, + struct file *file, + struct bch_inode_info *src, + const char __user *name) +{ + struct bch_inode_info *dst; + struct inode *vinode = NULL; + char *kname = NULL; + struct qstr qstr; + int ret = 0; + u64 inum; + + kname = kmalloc(BCH_NAME_MAX + 1, GFP_KERNEL); + if (!kname) + return -ENOMEM; + + ret = strncpy_from_user(kname, name, BCH_NAME_MAX); + if (unlikely(ret < 0)) + goto err1; + + qstr.len = ret; + qstr.name = kname; + + ret = -ENOENT; + inum = bch2_dirent_lookup(c, src->v.i_ino, + &src->ei_str_hash, + &qstr); + if (!inum) + goto err1; + + vinode = bch2_vfs_inode_get(c, inum); + ret = PTR_ERR_OR_ZERO(vinode); + if (ret) + goto err1; + + dst = to_bch_ei(vinode); + + ret = mnt_want_write_file(file); + if (ret) + goto err2; + + bch2_lock_inodes(src, dst); + + if (inode_attr_changing(src, dst, Inode_opt_project)) { + ret = bch2_fs_quota_transfer(c, dst, + src->ei_qid, + 1 << QTYP_PRJ, + KEY_TYPE_QUOTA_PREALLOC); + if (ret) + goto err3; + } + + ret = bch2_write_inode(c, dst, bch2_reinherit_attrs_fn, src, 0); +err3: + bch2_unlock_inodes(src, dst); + + /* return true if we did work */ + if (ret >= 0) + ret = !ret; + + mnt_drop_write_file(file); +err2: + iput(vinode); +err1: + kfree(kname); + + return ret; +} + long bch2_fs_file_ioctl(struct file *file, unsigned cmd, unsigned long arg) { struct bch_inode_info *inode = file_bch_inode(file); @@ -253,7 +249,12 @@ long bch2_fs_file_ioctl(struct file *file, unsigned cmd, unsigned long arg) case FS_IOC_FSGETXATTR: return bch2_ioc_fsgetxattr(inode, (void __user *) arg); case FS_IOC_FSSETXATTR: - return bch2_ioc_fssetxattr(c, file, inode, (void __user *) arg); + return bch2_ioc_fssetxattr(c, file, inode, + (void __user *) arg); + + case BCHFS_IOC_REINHERIT_ATTRS: + return bch2_ioc_reinherit_attrs(c, file, inode, + (void __user *) arg); case FS_IOC_GETVERSION: return -ENOTTY; @@ -265,8 +266,9 @@ long bch2_fs_file_ioctl(struct file *file, unsigned cmd, unsigned long arg) return -EPERM; down_write(&sb->s_umount); - sb->s_flags |= MS_RDONLY; - bch2_fs_emergency_read_only(c); + sb->s_flags |= SB_RDONLY; + if (bch2_fs_emergency_read_only(c)) + bch_err(c, "emergency read only due to ioctl"); up_write(&sb->s_umount); return 0; diff --git a/fs/bcachefs/fs-ioctl.h b/fs/bcachefs/fs-ioctl.h index c14e583da7ec..f201980ef2c3 100644 --- a/fs/bcachefs/fs-ioctl.h +++ b/fs/bcachefs/fs-ioctl.h @@ -1,7 +1,79 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _BCACHEFS_FS_IOCTL_H #define _BCACHEFS_FS_IOCTL_H -void bch2_inode_flags_to_vfs(struct bch_inode_info *); +/* Inode flags: */ + +/* bcachefs inode flags -> vfs inode flags: */ +static const unsigned bch_flags_to_vfs[] = { + [__BCH_INODE_SYNC] = S_SYNC, + [__BCH_INODE_IMMUTABLE] = S_IMMUTABLE, + [__BCH_INODE_APPEND] = S_APPEND, + [__BCH_INODE_NOATIME] = S_NOATIME, +}; + +/* bcachefs inode flags -> FS_IOC_GETFLAGS: */ +static const unsigned bch_flags_to_uflags[] = { + [__BCH_INODE_SYNC] = FS_SYNC_FL, + [__BCH_INODE_IMMUTABLE] = FS_IMMUTABLE_FL, + [__BCH_INODE_APPEND] = FS_APPEND_FL, + [__BCH_INODE_NODUMP] = FS_NODUMP_FL, + [__BCH_INODE_NOATIME] = FS_NOATIME_FL, +}; + +/* bcachefs inode flags -> FS_IOC_FSGETXATTR: */ 
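/*
 * Usage sketch for the flag-mapping machinery being moved into this
 * header (the tables appear above, the map_flags()/map_flags_rev()
 * macro definitions just below): map_flags_rev() translates a user
 * flag word through a table and clears the bits it consumed, so any
 * bits left over have no bcachefs equivalent. Hedged example,
 * loosely modelled on bch2_ioc_setflags(); not the patch's code.
 */
static int example_setflags(struct bch_inode_unpacked *bi, unsigned uflags)
{
        unsigned bch_flags = map_flags_rev(bch_flags_to_uflags, uflags);

        if (uflags)                     /* leftover bits: unsupported flags */
                return -EOPNOTSUPP;

        bi->bi_flags = bch_flags;       /* illustrative; real code masks first */
        return 0;
}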
+static const unsigned bch_flags_to_xflags[] = { + [__BCH_INODE_SYNC] = FS_XFLAG_SYNC, + [__BCH_INODE_IMMUTABLE] = FS_XFLAG_IMMUTABLE, + [__BCH_INODE_APPEND] = FS_XFLAG_APPEND, + [__BCH_INODE_NODUMP] = FS_XFLAG_NODUMP, + [__BCH_INODE_NOATIME] = FS_XFLAG_NOATIME, + //[__BCH_INODE_PROJINHERIT] = FS_XFLAG_PROJINHERIT; +}; + +#define set_flags(_map, _in, _out) \ +do { \ + unsigned _i; \ + \ + for (_i = 0; _i < ARRAY_SIZE(_map); _i++) \ + if ((_in) & (1 << _i)) \ + (_out) |= _map[_i]; \ + else \ + (_out) &= ~_map[_i]; \ +} while (0) + +#define map_flags(_map, _in) \ +({ \ + unsigned _out = 0; \ + \ + set_flags(_map, _in, _out); \ + _out; \ +}) + +#define map_flags_rev(_map, _in) \ +({ \ + unsigned _i, _out = 0; \ + \ + for (_i = 0; _i < ARRAY_SIZE(_map); _i++) \ + if ((_in) & _map[_i]) { \ + (_out) |= 1 << _i; \ + (_in) &= ~_map[_i]; \ + } \ + (_out); \ +}) + +#define map_defined(_map) \ +({ \ + unsigned _in = ~0; \ + \ + map_flags_rev(_map, _in); \ +}) + +/* Set VFS inode flags from bcachefs inode: */ +static inline void bch2_inode_flags_to_vfs(struct bch_inode_info *inode) +{ + set_flags(bch_flags_to_vfs, inode->ei_inode.bi_flags, inode->v.i_flags); +} long bch2_fs_file_ioctl(struct file *, unsigned, unsigned long); long bch2_compat_fs_ioctl(struct file *, unsigned, unsigned long); diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index f4cdf4b5181c..c70c723f8518 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #ifndef NO_BCACHEFS_FS #include "bcachefs.h" @@ -34,6 +35,19 @@ static void bch2_vfs_inode_init(struct bch_fs *, struct bch_inode_info *, struct bch_inode_unpacked *); +static void journal_seq_copy(struct bch_inode_info *dst, + u64 journal_seq) +{ + u64 old, v = READ_ONCE(dst->ei_journal_seq); + + do { + old = v; + + if (old >= journal_seq) + break; + } while ((v = cmpxchg(&dst->ei_journal_seq, old, journal_seq)) != old); +} + /* * I_SIZE_DIRTY requires special handling: * @@ -62,118 +76,181 @@ static void bch2_vfs_inode_init(struct bch_fs *, * be set explicitly. */ -int __must_check __bch2_write_inode(struct bch_fs *c, - struct bch_inode_info *inode, - inode_set_fn set, - void *p) +void bch2_inode_update_after_write(struct bch_fs *c, + struct bch_inode_info *inode, + struct bch_inode_unpacked *bi, + unsigned fields) { - struct btree_iter iter; - struct bch_inode_unpacked inode_u; - struct bkey_inode_buf inode_p; - u64 inum = inode->v.i_ino; - unsigned i_nlink = READ_ONCE(inode->v.i_nlink); - int ret; - - /* - * We can't write an inode with i_nlink == 0 because it's stored biased; - * however, we don't need to because if i_nlink is 0 the inode is - * getting deleted when it's evicted. - */ - if (!i_nlink) - return 0; + set_nlink(&inode->v, bi->bi_flags & BCH_INODE_UNLINKED + ? 
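/*
 * Sketch of the lock-free "monotonic max" update used by
 * journal_seq_copy() above: retry until either the stored value is
 * already >= the new one or our store lands. Shown here with C11
 * atomics rather than the kernel's cmpxchg(), but the loop shape is
 * the same.
 */
#include <stdatomic.h>
#include <stdint.h>

static void example_seq_max(_Atomic uint64_t *dst, uint64_t seq)
{
        uint64_t old = atomic_load(dst);

        do {
                if (old >= seq)         /* already newer: nothing to do */
                        return;
        } while (!atomic_compare_exchange_weak(dst, &old, seq));
}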
0 + : bi->bi_nlink + nlink_bias(inode->v.i_mode)); + i_uid_write(&inode->v, bi->bi_uid); + i_gid_write(&inode->v, bi->bi_gid); + inode->v.i_mode = bi->bi_mode; - lockdep_assert_held(&inode->ei_update_lock); + if (fields & ATTR_ATIME) + inode->v.i_atime = bch2_time_to_timespec(c, bi->bi_atime); + if (fields & ATTR_MTIME) + inode->v.i_mtime = bch2_time_to_timespec(c, bi->bi_mtime); + if (fields & ATTR_CTIME) + inode->v.i_ctime = bch2_time_to_timespec(c, bi->bi_ctime); - bch2_btree_iter_init(&iter, c, BTREE_ID_INODES, POS(inum, 0), - BTREE_ITER_SLOTS|BTREE_ITER_INTENT); + inode->ei_inode = *bi; - do { - struct bkey_s_c k = bch2_btree_iter_peek_slot(&iter); + bch2_inode_flags_to_vfs(inode); +} - if ((ret = btree_iter_err(k))) - goto out; +int __must_check bch2_write_inode_trans(struct btree_trans *trans, + struct bch_inode_info *inode, + struct bch_inode_unpacked *inode_u, + inode_set_fn set, + void *p) +{ + struct bch_fs *c = trans->c; + struct btree_iter *iter = NULL; + struct bkey_inode_buf *inode_p; + int ret; - if (WARN_ONCE(k.k->type != BCH_INODE_FS, - "inode %llu not found when updating", inum)) { - bch2_btree_iter_unlock(&iter); - return -ENOENT; - } + lockdep_assert_held(&inode->ei_update_lock); - ret = bch2_inode_unpack(bkey_s_c_to_inode(k), &inode_u); - if (WARN_ONCE(ret, - "error %i unpacking inode %llu", ret, inum)) { - ret = -ENOENT; - break; - } + if (c->opts.new_inode_updates) { + /* XXX: Don't do this with btree locks held */ + if (!inode->ei_inode_update) + inode->ei_inode_update = + bch2_deferred_update_alloc(c, BTREE_ID_INODES, 64); + } else { + iter = bch2_trans_get_iter(trans, BTREE_ID_INODES, + POS(inode->v.i_ino, 0), + BTREE_ITER_SLOTS|BTREE_ITER_INTENT); + if (IS_ERR(iter)) + return PTR_ERR(iter); + + /* The btree node lock is our lock on the inode: */ + ret = bch2_btree_iter_traverse(iter); + if (ret) + return ret; + } - if (set) { - ret = set(inode, &inode_u, p); - if (ret) - goto out; - } + *inode_u = inode->ei_inode; - BUG_ON(i_nlink < nlink_bias(inode->v.i_mode)); - - inode_u.bi_mode = inode->v.i_mode; - inode_u.bi_uid = i_uid_read(&inode->v); - inode_u.bi_gid = i_gid_read(&inode->v); - inode_u.bi_project = inode->ei_qid.q[QTYP_PRJ]; - inode_u.bi_nlink= i_nlink - nlink_bias(inode->v.i_mode); - inode_u.bi_dev = inode->v.i_rdev; - inode_u.bi_atime= timespec_to_bch2_time(c, inode->v.i_atime); - inode_u.bi_mtime= timespec_to_bch2_time(c, inode->v.i_mtime); - inode_u.bi_ctime= timespec_to_bch2_time(c, inode->v.i_ctime); - - bch2_inode_pack(&inode_p, &inode_u); - - ret = bch2_btree_insert_at(c, NULL, NULL, - &inode->ei_journal_seq, - BTREE_INSERT_ATOMIC| - BTREE_INSERT_NOFAIL, - BTREE_INSERT_ENTRY(&iter, &inode_p.inode.k_i)); - } while (ret == -EINTR); - - if (!ret) { - inode->ei_inode = inode_u; - inode->ei_qid = bch_qid(&inode_u); + if (set) { + ret = set(inode, inode_u, p); + if (ret) + return ret; } -out: - bch2_btree_iter_unlock(&iter); - return ret < 0 ? 
ret : 0; + inode_p = bch2_trans_kmalloc(trans, sizeof(*inode_p)); + if (IS_ERR(inode_p)) + return PTR_ERR(inode_p); + + bch2_inode_pack(inode_p, inode_u); + + if (!inode->ei_inode_update) + bch2_trans_update(trans, + BTREE_INSERT_ENTRY(iter, &inode_p->inode.k_i)); + else + bch2_trans_update(trans, + BTREE_INSERT_DEFERRED(inode->ei_inode_update, + &inode_p->inode.k_i)); + + return 0; } int __must_check bch2_write_inode(struct bch_fs *c, - struct bch_inode_info *inode) + struct bch_inode_info *inode, + inode_set_fn set, + void *p, unsigned fields) { - return __bch2_write_inode(c, inode, NULL, NULL); + struct btree_trans trans; + struct bch_inode_unpacked inode_u; + int ret; + + bch2_trans_init(&trans, c, 0, 0); +retry: + bch2_trans_begin(&trans); + + ret = bch2_write_inode_trans(&trans, inode, &inode_u, set, p) ?: + bch2_trans_commit(&trans, NULL, + &inode->ei_journal_seq, + BTREE_INSERT_ATOMIC| + BTREE_INSERT_NOUNLOCK| + BTREE_INSERT_NOFAIL); + if (ret == -EINTR) + goto retry; + + /* + * the btree node lock protects inode->ei_inode, not ei_update_lock; + * this is important for inode updates via bchfs_write_index_update + */ + if (!ret) + bch2_inode_update_after_write(c, inode, &inode_u, fields); + + bch2_trans_exit(&trans); + return ret < 0 ? ret : 0; } -static int bch2_inc_nlink(struct bch_fs *c, struct bch_inode_info *inode) +int bch2_fs_quota_transfer(struct bch_fs *c, + struct bch_inode_info *inode, + struct bch_qid new_qid, + unsigned qtypes, + enum quota_acct_mode mode) { + unsigned i; int ret; - mutex_lock(&inode->ei_update_lock); - inc_nlink(&inode->v); - ret = bch2_write_inode(c, inode); - mutex_unlock(&inode->ei_update_lock); + qtypes &= enabled_qtypes(c); + + for (i = 0; i < QTYP_NR; i++) + if (new_qid.q[i] == inode->ei_qid.q[i]) + qtypes &= ~(1U << i); + + if (!qtypes) + return 0; + + mutex_lock(&inode->ei_quota_lock); + + ret = bch2_quota_transfer(c, qtypes, new_qid, + inode->ei_qid, + inode->v.i_blocks + + inode->ei_quota_reserved, + mode); + if (!ret) + for (i = 0; i < QTYP_NR; i++) + if (qtypes & (1 << i)) + inode->ei_qid.q[i] = new_qid.q[i]; + + mutex_unlock(&inode->ei_quota_lock); return ret; } -static int bch2_dec_nlink(struct bch_fs *c, struct bch_inode_info *inode) +int bch2_reinherit_attrs_fn(struct bch_inode_info *inode, + struct bch_inode_unpacked *bi, + void *p) { - int ret = 0; + struct bch_inode_info *dir = p; + u64 src, dst; + unsigned id; + int ret = 1; - mutex_lock(&inode->ei_update_lock); - drop_nlink(&inode->v); - ret = bch2_write_inode(c, inode); - mutex_unlock(&inode->ei_update_lock); + for (id = 0; id < Inode_opt_nr; id++) { + if (bi->bi_fields_set & (1 << id)) + continue; + + src = bch2_inode_opt_get(&dir->ei_inode, id); + dst = bch2_inode_opt_get(bi, id); + + if (src == dst) + continue; + + bch2_inode_opt_set(bi, id, src); + ret = 0; + } return ret; } -static struct inode *bch2_vfs_inode_get(struct bch_fs *c, u64 inum) +struct inode *bch2_vfs_inode_get(struct bch_fs *c, u64 inum) { struct bch_inode_unpacked inode_u; struct bch_inode_info *inode; @@ -200,125 +277,178 @@ static struct inode *bch2_vfs_inode_get(struct bch_fs *c, u64 inum) return &inode->v; } -static struct bch_inode_info *bch2_vfs_inode_create(struct bch_fs *c, - struct bch_inode_info *dir, - umode_t mode, dev_t rdev) +static void bch2_inode_init_owner(struct bch_inode_unpacked *inode_u, + const struct inode *dir, umode_t mode) { - struct posix_acl *default_acl = NULL, *acl = NULL; - struct bch_inode_info *inode; + kuid_t uid = current_fsuid(); + kgid_t gid; + + if (dir && dir->i_mode & 
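/*
 * The transaction-restart pattern used by bch2_write_inode() above
 * (and by create/link/unlink/rename later in this patch): -EINTR
 * from the commit means btree locks were dropped, so the whole
 * transaction is rebuilt from the top. All names below are
 * hypothetical stubs, not the btree API.
 */
#include <errno.h>

struct ex_trans { int nr_attempts; };

static void ex_trans_begin(struct ex_trans *t)
{
        t->nr_attempts++;       /* reset iterators/buffers in real code */
}

static int ex_build_and_commit(struct ex_trans *t)
{
        (void)t;
        return 0;               /* may return -EINTR if locks were dropped */
}

static int ex_update(struct ex_trans *t)
{
        int ret;
retry:
        ex_trans_begin(t);              /* restart from a clean slate */
        ret = ex_build_and_commit(t);   /* atomic commit of all updates */
        if (ret == -EINTR)
                goto retry;             /* rebuild the whole transaction */
        return ret;
}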
S_ISGID) { + gid = dir->i_gid; + if (S_ISDIR(mode)) + mode |= S_ISGID; + } else + gid = current_fsgid(); + + inode_u->bi_uid = from_kuid(dir->i_sb->s_user_ns, uid); + inode_u->bi_gid = from_kgid(dir->i_sb->s_user_ns, gid); + inode_u->bi_mode = mode; +} + +static int inode_update_for_create_fn(struct bch_inode_info *inode, + struct bch_inode_unpacked *bi, + void *p) +{ + struct bch_fs *c = inode->v.i_sb->s_fs_info; + struct bch_inode_unpacked *new_inode = p; + + bi->bi_mtime = bi->bi_ctime = bch2_current_time(c); + + if (S_ISDIR(new_inode->bi_mode)) + bi->bi_nlink++; + + return 0; +} + +static struct bch_inode_info * +__bch2_create(struct bch_inode_info *dir, struct dentry *dentry, + umode_t mode, dev_t rdev, bool tmpfile) +{ + struct bch_fs *c = dir->v.i_sb->s_fs_info; + struct btree_trans trans; + struct bch_inode_unpacked dir_u; + struct bch_inode_info *inode, *old; struct bch_inode_unpacked inode_u; + struct bch_hash_info hash_info; + struct posix_acl *default_acl = NULL, *acl = NULL; + u64 journal_seq = 0; int ret; - inode = to_bch_ei(new_inode(c->vfs_sb)); - if (unlikely(!inode)) - return ERR_PTR(-ENOMEM); + bch2_inode_init(c, &inode_u, 0, 0, 0, rdev, &dir->ei_inode); + bch2_inode_init_owner(&inode_u, &dir->v, mode); + + hash_info = bch2_hash_info_init(c, &inode_u); + + if (tmpfile) + inode_u.bi_flags |= BCH_INODE_UNLINKED; - inode_init_owner(&inode->v, &dir->v, mode); + ret = bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, 1, + KEY_TYPE_QUOTA_PREALLOC); + if (ret) + return ERR_PTR(ret); #ifdef CONFIG_BCACHEFS_POSIX_ACL - ret = posix_acl_create(&dir->v, &inode->v.i_mode, &default_acl, &acl); + ret = posix_acl_create(&dir->v, &inode_u.bi_mode, &default_acl, &acl); if (ret) - goto err_make_bad; + goto err; #endif - bch2_inode_init(c, &inode_u, - i_uid_read(&inode->v), - i_gid_read(&inode->v), - inode->v.i_mode, rdev, - &dir->ei_inode); - - inode_u.bi_project = dir->ei_qid.q[QTYP_PRJ]; - - ret = bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, 1, BCH_QUOTA_PREALLOC); - if (ret) - goto err_make_bad; + /* + * preallocate vfs inode before btree transaction, so that nothing can + * fail after the transaction succeeds: + */ + inode = to_bch_ei(new_inode(c->vfs_sb)); + if (unlikely(!inode)) { + ret = -ENOMEM; + goto err; + } - ret = bch2_inode_create(c, &inode_u, - BLOCKDEV_INODE_MAX, 0, - &c->unused_inode_hint); + if (!tmpfile) + mutex_lock(&dir->ei_update_lock); + + bch2_trans_init(&trans, c, 8, 1024); +retry: + bch2_trans_begin(&trans); + + ret = __bch2_inode_create(&trans, &inode_u, + BLOCKDEV_INODE_MAX, 0, + &c->unused_inode_hint) ?: + (default_acl + ? bch2_set_acl_trans(&trans, &inode_u, &hash_info, + default_acl, ACL_TYPE_DEFAULT) + : 0) ?: + (acl + ? bch2_set_acl_trans(&trans, &inode_u, &hash_info, + acl, ACL_TYPE_ACCESS) + : 0) ?: + (!tmpfile + ? __bch2_dirent_create(&trans, dir->v.i_ino, + &dir->ei_str_hash, + mode_to_type(mode), + &dentry->d_name, + inode_u.bi_inum, + BCH_HASH_SET_MUST_CREATE) + : 0) ?: + (!tmpfile + ? 
bch2_write_inode_trans(&trans, dir, &dir_u, + inode_update_for_create_fn, + &inode_u) + : 0) ?: + bch2_trans_commit(&trans, NULL, + &journal_seq, + BTREE_INSERT_ATOMIC| + BTREE_INSERT_NOUNLOCK); + if (ret == -EINTR) + goto retry; if (unlikely(ret)) - goto err_acct_quota; + goto err_trans; + + if (!tmpfile) { + bch2_inode_update_after_write(c, dir, &dir_u, + ATTR_MTIME|ATTR_CTIME); + journal_seq_copy(dir, journal_seq); + mutex_unlock(&dir->ei_update_lock); + } bch2_vfs_inode_init(c, inode, &inode_u); - atomic_long_inc(&c->nr_inodes); + journal_seq_copy(inode, journal_seq); - if (default_acl) { - ret = bch2_set_acl(&inode->v, default_acl, ACL_TYPE_DEFAULT); - if (unlikely(ret)) - goto err; - } + set_cached_acl(&inode->v, ACL_TYPE_ACCESS, acl); + set_cached_acl(&inode->v, ACL_TYPE_DEFAULT, default_acl); - if (acl) { - ret = bch2_set_acl(&inode->v, acl, ACL_TYPE_ACCESS); - if (unlikely(ret)) - goto err; + /* + * we must insert the new inode into the inode cache before calling + * bch2_trans_exit() and dropping locks, else we could race with another + * thread pulling the inode in and modifying it: + */ + + old = to_bch_ei(insert_inode_locked2(&inode->v)); + if (unlikely(old)) { + /* + * We raced, another process pulled the new inode into cache + * before us: + */ + old->ei_journal_seq = inode->ei_journal_seq; + make_bad_inode(&inode->v); + iput(&inode->v); + + inode = old; + } else { + /* + * we really don't want insert_inode_locked2() to be setting + * I_NEW... + */ + unlock_new_inode(&inode->v); } - insert_inode_hash(&inode->v); + bch2_trans_exit(&trans); out: posix_acl_release(default_acl); posix_acl_release(acl); return inode; -err_acct_quota: - bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, -1, BCH_QUOTA_WARN); -err_make_bad: - /* - * indicate to bch_evict_inode that the inode was never actually - * created: - */ +err_trans: + if (!tmpfile) + mutex_unlock(&dir->ei_update_lock); + + bch2_trans_exit(&trans); make_bad_inode(&inode->v); -err: - clear_nlink(&inode->v); iput(&inode->v); +err: + bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, -1, KEY_TYPE_QUOTA_WARN); inode = ERR_PTR(ret); goto out; } -static int bch2_vfs_dirent_create(struct bch_fs *c, - struct bch_inode_info *dir, - u8 type, const struct qstr *name, - u64 dst) -{ - int ret; - - ret = bch2_dirent_create(c, dir->v.i_ino, &dir->ei_str_hash, - type, name, dst, - &dir->ei_journal_seq, - BCH_HASH_SET_MUST_CREATE); - if (unlikely(ret)) - return ret; - - dir->v.i_mtime = dir->v.i_ctime = current_time(&dir->v); - mark_inode_dirty_sync(&dir->v); - return 0; -} - -static int __bch2_create(struct bch_inode_info *dir, struct dentry *dentry, - umode_t mode, dev_t rdev) -{ - struct bch_fs *c = dir->v.i_sb->s_fs_info; - struct bch_inode_info *inode; - int ret; - - inode = bch2_vfs_inode_create(c, dir, mode, rdev); - if (unlikely(IS_ERR(inode))) - return PTR_ERR(inode); - - ret = bch2_vfs_dirent_create(c, dir, mode_to_type(mode), - &dentry->d_name, inode->v.i_ino); - if (unlikely(ret)) { - clear_nlink(&inode->v); - iput(&inode->v); - return ret; - } - - if (dir->ei_journal_seq > inode->ei_journal_seq) - inode->ei_journal_seq = dir->ei_journal_seq; - - d_instantiate(dentry, &inode->v); - return 0; -} - /* methods */ static struct dentry *bch2_lookup(struct inode *vdir, struct dentry *dentry, @@ -342,7 +472,69 @@ static struct dentry *bch2_lookup(struct inode *vdir, struct dentry *dentry, static int bch2_create(struct inode *vdir, struct dentry *dentry, umode_t mode, bool excl) { - return __bch2_create(to_bch_ei(vdir), dentry, mode|S_IFREG, 0); + 
struct bch_inode_info *inode = + __bch2_create(to_bch_ei(vdir), dentry, mode|S_IFREG, 0, false); + + if (IS_ERR(inode)) + return PTR_ERR(inode); + + d_instantiate(dentry, &inode->v); + return 0; +} + +static int inode_update_for_link_fn(struct bch_inode_info *inode, + struct bch_inode_unpacked *bi, + void *p) +{ + struct bch_fs *c = inode->v.i_sb->s_fs_info; + + bi->bi_ctime = bch2_current_time(c); + + if (bi->bi_flags & BCH_INODE_UNLINKED) + bi->bi_flags &= ~BCH_INODE_UNLINKED; + else + bi->bi_nlink++; + + return 0; +} + +static int __bch2_link(struct bch_fs *c, + struct bch_inode_info *inode, + struct bch_inode_info *dir, + struct dentry *dentry) +{ + struct btree_trans trans; + struct bch_inode_unpacked inode_u; + int ret; + + mutex_lock(&inode->ei_update_lock); + bch2_trans_init(&trans, c, 4, 1024); +retry: + bch2_trans_begin(&trans); + + ret = __bch2_dirent_create(&trans, dir->v.i_ino, + &dir->ei_str_hash, + mode_to_type(inode->v.i_mode), + &dentry->d_name, + inode->v.i_ino, + BCH_HASH_SET_MUST_CREATE) ?: + bch2_write_inode_trans(&trans, inode, &inode_u, + inode_update_for_link_fn, + NULL) ?: + bch2_trans_commit(&trans, NULL, + &inode->ei_journal_seq, + BTREE_INSERT_ATOMIC| + BTREE_INSERT_NOUNLOCK); + + if (ret == -EINTR) + goto retry; + + if (likely(!ret)) + bch2_inode_update_after_write(c, inode, &inode_u, ATTR_CTIME); + + bch2_trans_exit(&trans); + mutex_unlock(&inode->ei_update_lock); + return ret; } static int bch2_link(struct dentry *old_dentry, struct inode *vdir, @@ -355,23 +547,41 @@ static int bch2_link(struct dentry *old_dentry, struct inode *vdir, lockdep_assert_held(&inode->v.i_rwsem); - inode->v.i_ctime = current_time(&dir->v); - - ret = bch2_inc_nlink(c, inode); - if (ret) + ret = __bch2_link(c, inode, dir, dentry); + if (unlikely(ret)) return ret; ihold(&inode->v); + d_instantiate(dentry, &inode->v); + return 0; +} - ret = bch2_vfs_dirent_create(c, dir, mode_to_type(inode->v.i_mode), - &dentry->d_name, inode->v.i_ino); - if (unlikely(ret)) { - bch2_dec_nlink(c, inode); - iput(&inode->v); - return ret; - } +static int inode_update_dir_for_unlink_fn(struct bch_inode_info *inode, + struct bch_inode_unpacked *bi, + void *p) +{ + struct bch_fs *c = inode->v.i_sb->s_fs_info; + struct bch_inode_info *unlink_inode = p; + + bi->bi_mtime = bi->bi_ctime = bch2_current_time(c); + + bi->bi_nlink -= S_ISDIR(unlink_inode->v.i_mode); + + return 0; +} + +static int inode_update_for_unlink_fn(struct bch_inode_info *inode, + struct bch_inode_unpacked *bi, + void *p) +{ + struct bch_fs *c = inode->v.i_sb->s_fs_info; + + bi->bi_ctime = bch2_current_time(c); + if (bi->bi_nlink) + bi->bi_nlink--; + else + bi->bi_flags |= BCH_INODE_UNLINKED; - d_instantiate(dentry, &inode->v); return 0; } @@ -380,28 +590,46 @@ static int bch2_unlink(struct inode *vdir, struct dentry *dentry) struct bch_fs *c = vdir->i_sb->s_fs_info; struct bch_inode_info *dir = to_bch_ei(vdir); struct bch_inode_info *inode = to_bch_ei(dentry->d_inode); + struct bch_inode_unpacked dir_u, inode_u; + struct btree_trans trans; int ret; - lockdep_assert_held(&inode->v.i_rwsem); - - ret = bch2_dirent_delete(c, dir->v.i_ino, &dir->ei_str_hash, - &dentry->d_name, &dir->ei_journal_seq); + bch2_lock_inodes(dir, inode); + bch2_trans_init(&trans, c, 4, 1024); +retry: + bch2_trans_begin(&trans); + + ret = __bch2_dirent_delete(&trans, dir->v.i_ino, + &dir->ei_str_hash, + &dentry->d_name) ?: + bch2_write_inode_trans(&trans, dir, &dir_u, + inode_update_dir_for_unlink_fn, + inode) ?: + bch2_write_inode_trans(&trans, inode, &inode_u, + 
inode_update_for_unlink_fn, + NULL) ?: + bch2_trans_commit(&trans, NULL, + &dir->ei_journal_seq, + BTREE_INSERT_ATOMIC| + BTREE_INSERT_NOUNLOCK| + BTREE_INSERT_NOFAIL); + if (ret == -EINTR) + goto retry; if (ret) - return ret; + goto err; if (dir->ei_journal_seq > inode->ei_journal_seq) inode->ei_journal_seq = dir->ei_journal_seq; - inode->v.i_ctime = dir->v.i_ctime; - - if (S_ISDIR(inode->v.i_mode)) { - bch2_dec_nlink(c, dir); - drop_nlink(&inode->v); - } - - bch2_dec_nlink(c, inode); + bch2_inode_update_after_write(c, dir, &dir_u, + ATTR_MTIME|ATTR_CTIME); + bch2_inode_update_after_write(c, inode, &inode_u, + ATTR_MTIME); +err: + bch2_trans_exit(&trans); + bch2_unlock_inodes(dir, inode); - return 0; + return ret; } static int bch2_symlink(struct inode *vdir, struct dentry *dentry, @@ -411,7 +639,7 @@ static int bch2_symlink(struct inode *vdir, struct dentry *dentry, struct bch_inode_info *dir = to_bch_ei(vdir), *inode; int ret; - inode = bch2_vfs_inode_create(c, dir, S_IFLNK|S_IRWXUGO, 0); + inode = __bch2_create(dir, dentry, S_IFLNK|S_IRWXUGO, 0, true); if (unlikely(IS_ERR(inode))) return PTR_ERR(inode); @@ -426,37 +654,28 @@ static int bch2_symlink(struct inode *vdir, struct dentry *dentry, if (unlikely(ret)) goto err; - /* XXX: racy */ - if (dir->ei_journal_seq < inode->ei_journal_seq) - dir->ei_journal_seq = inode->ei_journal_seq; + journal_seq_copy(dir, inode->ei_journal_seq); - ret = bch2_vfs_dirent_create(c, dir, DT_LNK, &dentry->d_name, - inode->v.i_ino); + ret = __bch2_link(c, inode, dir, dentry); if (unlikely(ret)) goto err; d_instantiate(dentry, &inode->v); return 0; err: - clear_nlink(&inode->v); iput(&inode->v); return ret; } static int bch2_mkdir(struct inode *vdir, struct dentry *dentry, umode_t mode) { - struct bch_fs *c = vdir->i_sb->s_fs_info; - struct bch_inode_info *dir = to_bch_ei(vdir); - int ret; + struct bch_inode_info *inode = + __bch2_create(to_bch_ei(vdir), dentry, mode|S_IFDIR, 0, false); - lockdep_assert_held(&dir->v.i_rwsem); - - ret = __bch2_create(dir, dentry, mode|S_IFDIR, 0); - if (unlikely(ret)) - return ret; - - bch2_inc_nlink(c, dir); + if (IS_ERR(inode)) + return PTR_ERR(inode); + d_instantiate(dentry, &inode->v); return 0; } @@ -473,187 +692,312 @@ static int bch2_rmdir(struct inode *vdir, struct dentry *dentry) static int bch2_mknod(struct inode *vdir, struct dentry *dentry, umode_t mode, dev_t rdev) { - return __bch2_create(to_bch_ei(vdir), dentry, mode, rdev); + struct bch_inode_info *inode = + __bch2_create(to_bch_ei(vdir), dentry, mode, rdev, false); + + if (IS_ERR(inode)) + return PTR_ERR(inode); + + d_instantiate(dentry, &inode->v); + return 0; } -static int bch2_rename(struct bch_fs *c, - struct bch_inode_info *old_dir, - struct dentry *old_dentry, - struct bch_inode_info *new_dir, - struct dentry *new_dentry) +struct rename_info { + u64 now; + struct bch_inode_info *src_dir; + struct bch_inode_info *dst_dir; + struct bch_inode_info *src_inode; + struct bch_inode_info *dst_inode; + enum bch_rename_mode mode; +}; + +static int inode_update_for_rename_fn(struct bch_inode_info *inode, + struct bch_inode_unpacked *bi, + void *p) { - struct bch_inode_info *old_inode = to_bch_ei(old_dentry->d_inode); - struct bch_inode_info *new_inode = to_bch_ei(new_dentry->d_inode); - struct timespec64 now = current_time(&old_dir->v); + struct rename_info *info = p; int ret; - lockdep_assert_held(&old_dir->v.i_rwsem); - lockdep_assert_held(&new_dir->v.i_rwsem); - - if (new_inode) - filemap_write_and_wait_range(old_inode->v.i_mapping, - 0, LLONG_MAX); - - if 
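/*
 * bch2_unlink() above takes both inode locks via bch2_lock_inodes();
 * any multi-inode locking needs a global order to avoid ABBA
 * deadlocks. Sketched here with the address-ordering convention the
 * VFS uses in lock_two_nondirectories() — not necessarily bcachefs's
 * exact rule.
 */
#include <linux/fs.h>
#include <linux/kernel.h>

static void example_lock_pair(struct inode *a, struct inode *b)
{
        if (a > b)              /* impose a global order by address */
                swap(a, b);

        inode_lock(a);
        if (b != a)
                inode_lock_nested(b, I_MUTEX_NONDIR2);
}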
(new_inode && S_ISDIR(old_inode->v.i_mode)) { - lockdep_assert_held(&new_inode->v.i_rwsem); + if (inode == info->src_dir) { + bi->bi_nlink -= S_ISDIR(info->src_inode->v.i_mode); + bi->bi_nlink += info->dst_inode && + S_ISDIR(info->dst_inode->v.i_mode) && + info->mode == BCH_RENAME_EXCHANGE; + } - if (!S_ISDIR(new_inode->v.i_mode)) - return -ENOTDIR; + if (inode == info->dst_dir) { + bi->bi_nlink += S_ISDIR(info->src_inode->v.i_mode); + bi->bi_nlink -= info->dst_inode && + S_ISDIR(info->dst_inode->v.i_mode); + } - if (bch2_empty_dir(c, new_inode->v.i_ino)) - return -ENOTEMPTY; + if (inode == info->src_inode) { + ret = bch2_reinherit_attrs_fn(inode, bi, info->dst_dir); - ret = bch2_dirent_rename(c, - old_dir, &old_dentry->d_name, - new_dir, &new_dentry->d_name, - &old_inode->ei_journal_seq, BCH_RENAME_OVERWRITE); - if (unlikely(ret)) - return ret; + BUG_ON(!ret && S_ISDIR(info->src_inode->v.i_mode)); + } - clear_nlink(&new_inode->v); - bch2_dec_nlink(c, old_dir); - } else if (new_inode) { - lockdep_assert_held(&new_inode->v.i_rwsem); + if (inode == info->dst_inode && + info->mode == BCH_RENAME_EXCHANGE) { + ret = bch2_reinherit_attrs_fn(inode, bi, info->src_dir); - ret = bch2_dirent_rename(c, - old_dir, &old_dentry->d_name, - new_dir, &new_dentry->d_name, - &old_inode->ei_journal_seq, BCH_RENAME_OVERWRITE); - if (unlikely(ret)) - return ret; + BUG_ON(!ret && S_ISDIR(info->dst_inode->v.i_mode)); + } - new_inode->v.i_ctime = now; - bch2_dec_nlink(c, new_inode); - } else if (S_ISDIR(old_inode->v.i_mode)) { - ret = bch2_dirent_rename(c, - old_dir, &old_dentry->d_name, - new_dir, &new_dentry->d_name, - &old_inode->ei_journal_seq, BCH_RENAME); - if (unlikely(ret)) - return ret; + if (inode == info->dst_inode && + info->mode == BCH_RENAME_OVERWRITE) { + BUG_ON(bi->bi_nlink && + S_ISDIR(info->dst_inode->v.i_mode)); - bch2_inc_nlink(c, new_dir); - bch2_dec_nlink(c, old_dir); - } else { - ret = bch2_dirent_rename(c, - old_dir, &old_dentry->d_name, - new_dir, &new_dentry->d_name, - &old_inode->ei_journal_seq, BCH_RENAME); - if (unlikely(ret)) - return ret; + if (bi->bi_nlink) + bi->bi_nlink--; + else + bi->bi_flags |= BCH_INODE_UNLINKED; } - old_dir->v.i_ctime = old_dir->v.i_mtime = now; - new_dir->v.i_ctime = new_dir->v.i_mtime = now; - mark_inode_dirty_sync(&old_dir->v); - mark_inode_dirty_sync(&new_dir->v); - - old_inode->v.i_ctime = now; - mark_inode_dirty_sync(&old_inode->v); + if (inode == info->src_dir || + inode == info->dst_dir) + bi->bi_mtime = info->now; + bi->bi_ctime = info->now; return 0; } -static int bch2_rename_exchange(struct bch_fs *c, - struct bch_inode_info *old_dir, - struct dentry *old_dentry, - struct bch_inode_info *new_dir, - struct dentry *new_dentry) +static int bch2_rename2(struct inode *src_vdir, struct dentry *src_dentry, + struct inode *dst_vdir, struct dentry *dst_dentry, + unsigned flags) { - struct bch_inode_info *old_inode = to_bch_ei(old_dentry->d_inode); - struct bch_inode_info *new_inode = to_bch_ei(new_dentry->d_inode); - struct timespec64 now = current_time(&old_dir->v); + struct bch_fs *c = src_vdir->i_sb->s_fs_info; + struct rename_info i = { + .src_dir = to_bch_ei(src_vdir), + .dst_dir = to_bch_ei(dst_vdir), + .src_inode = to_bch_ei(src_dentry->d_inode), + .dst_inode = to_bch_ei(dst_dentry->d_inode), + .mode = flags & RENAME_EXCHANGE + ? BCH_RENAME_EXCHANGE + : dst_dentry->d_inode + ? 
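/*
 * Sketch of the link-count bookkeeping in inode_update_for_rename_fn()
 * above: a directory's ".." entry contributes one link to its parent,
 * so moving a directory between parents moves that link, and
 * BCH_RENAME_EXCHANGE can move one in each direction. Pure function,
 * hypothetical names.
 */
#include <stdbool.h>

enum ex_rename_mode { EX_RENAME, EX_RENAME_OVERWRITE, EX_RENAME_EXCHANGE };

static int ex_src_dir_nlink_delta(bool src_is_dir, bool dst_exists,
                                  bool dst_is_dir, enum ex_rename_mode mode)
{
        int delta = 0;

        if (src_is_dir)         /* its ".." no longer points here */
                delta -= 1;
        if (dst_exists && dst_is_dir && mode == EX_RENAME_EXCHANGE)
                delta += 1;     /* exchanged directory's ".." now does */
        return delta;
}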
BCH_RENAME_OVERWRITE : BCH_RENAME, + }; + struct btree_trans trans; + struct bch_inode_unpacked dst_dir_u, src_dir_u; + struct bch_inode_unpacked src_inode_u, dst_inode_u; + u64 journal_seq = 0; int ret; - ret = bch2_dirent_rename(c, - old_dir, &old_dentry->d_name, - new_dir, &new_dentry->d_name, - &old_inode->ei_journal_seq, BCH_RENAME_EXCHANGE); + if (flags & ~(RENAME_NOREPLACE|RENAME_EXCHANGE)) + return -EINVAL; + + if (i.mode == BCH_RENAME_OVERWRITE) { + if (S_ISDIR(i.src_inode->v.i_mode) != + S_ISDIR(i.dst_inode->v.i_mode)) + return -ENOTDIR; + + if (S_ISDIR(i.src_inode->v.i_mode) && + bch2_empty_dir(c, i.dst_inode->v.i_ino)) + return -ENOTEMPTY; + + ret = filemap_write_and_wait_range(i.src_inode->v.i_mapping, + 0, LLONG_MAX); + if (ret) + return ret; + } + + bch2_trans_init(&trans, c, 8, 2048); + + bch2_lock_inodes(i.src_dir, + i.dst_dir, + i.src_inode, + i.dst_inode); + + if (S_ISDIR(i.src_inode->v.i_mode) && + inode_attrs_changing(i.dst_dir, i.src_inode)) { + ret = -EXDEV; + goto err; + } + + if (i.mode == BCH_RENAME_EXCHANGE && + S_ISDIR(i.dst_inode->v.i_mode) && + inode_attrs_changing(i.src_dir, i.dst_inode)) { + ret = -EXDEV; + goto err; + } + + if (inode_attr_changing(i.dst_dir, i.src_inode, Inode_opt_project)) { + ret = bch2_fs_quota_transfer(c, i.src_inode, + i.dst_dir->ei_qid, + 1 << QTYP_PRJ, + KEY_TYPE_QUOTA_PREALLOC); + if (ret) + goto err; + } + + if (i.mode == BCH_RENAME_EXCHANGE && + inode_attr_changing(i.src_dir, i.dst_inode, Inode_opt_project)) { + ret = bch2_fs_quota_transfer(c, i.dst_inode, + i.src_dir->ei_qid, + 1 << QTYP_PRJ, + KEY_TYPE_QUOTA_PREALLOC); + if (ret) + goto err; + } + +retry: + bch2_trans_begin(&trans); + i.now = bch2_current_time(c); + + ret = bch2_dirent_rename(&trans, + i.src_dir, &src_dentry->d_name, + i.dst_dir, &dst_dentry->d_name, + i.mode) ?: + bch2_write_inode_trans(&trans, i.src_dir, &src_dir_u, + inode_update_for_rename_fn, &i) ?: + (i.src_dir != i.dst_dir + ? bch2_write_inode_trans(&trans, i.dst_dir, &dst_dir_u, + inode_update_for_rename_fn, &i) + : 0 ) ?: + bch2_write_inode_trans(&trans, i.src_inode, &src_inode_u, + inode_update_for_rename_fn, &i) ?: + (i.dst_inode + ? 
bch2_write_inode_trans(&trans, i.dst_inode, &dst_inode_u, + inode_update_for_rename_fn, &i) + : 0 ) ?: + bch2_trans_commit(&trans, NULL, + &journal_seq, + BTREE_INSERT_ATOMIC| + BTREE_INSERT_NOUNLOCK); + if (ret == -EINTR) + goto retry; if (unlikely(ret)) - return ret; + goto err; - if (S_ISDIR(old_inode->v.i_mode) != - S_ISDIR(new_inode->v.i_mode)) { - if (S_ISDIR(old_inode->v.i_mode)) { - bch2_inc_nlink(c, new_dir); - bch2_dec_nlink(c, old_dir); - } else { - bch2_dec_nlink(c, new_dir); - bch2_inc_nlink(c, old_dir); - } + bch2_inode_update_after_write(c, i.src_dir, &src_dir_u, + ATTR_MTIME|ATTR_CTIME); + journal_seq_copy(i.src_dir, journal_seq); + + if (i.src_dir != i.dst_dir) { + bch2_inode_update_after_write(c, i.dst_dir, &dst_dir_u, + ATTR_MTIME|ATTR_CTIME); + journal_seq_copy(i.dst_dir, journal_seq); } - old_dir->v.i_ctime = old_dir->v.i_mtime = now; - new_dir->v.i_ctime = new_dir->v.i_mtime = now; - mark_inode_dirty_sync(&old_dir->v); - mark_inode_dirty_sync(&new_dir->v); + journal_seq_copy(i.src_inode, journal_seq); + if (i.dst_inode) + journal_seq_copy(i.dst_inode, journal_seq); - old_inode->v.i_ctime = now; - new_inode->v.i_ctime = now; - mark_inode_dirty_sync(&old_inode->v); - mark_inode_dirty_sync(&new_inode->v); + bch2_inode_update_after_write(c, i.src_inode, &src_inode_u, + ATTR_CTIME); + if (i.dst_inode) + bch2_inode_update_after_write(c, i.dst_inode, &dst_inode_u, + ATTR_CTIME); +err: + bch2_trans_exit(&trans); + + bch2_fs_quota_transfer(c, i.src_inode, + bch_qid(&i.src_inode->ei_inode), + 1 << QTYP_PRJ, + KEY_TYPE_QUOTA_NOCHECK); + if (i.dst_inode) + bch2_fs_quota_transfer(c, i.dst_inode, + bch_qid(&i.dst_inode->ei_inode), + 1 << QTYP_PRJ, + KEY_TYPE_QUOTA_NOCHECK); + + bch2_unlock_inodes(i.src_dir, + i.dst_dir, + i.src_inode, + i.dst_inode); - return 0; + return ret; } -static int bch2_rename2(struct inode *old_vdir, struct dentry *old_dentry, - struct inode *new_vdir, struct dentry *new_dentry, - unsigned flags) +static int inode_update_for_setattr_fn(struct bch_inode_info *inode, + struct bch_inode_unpacked *bi, + void *p) { - struct bch_fs *c = old_vdir->i_sb->s_fs_info; - struct bch_inode_info *old_dir = to_bch_ei(old_vdir); - struct bch_inode_info *new_dir = to_bch_ei(new_vdir); - - if (flags & ~(RENAME_NOREPLACE|RENAME_EXCHANGE)) - return -EINVAL; - - if (flags & RENAME_EXCHANGE) - return bch2_rename_exchange(c, old_dir, old_dentry, - new_dir, new_dentry); + struct bch_fs *c = inode->v.i_sb->s_fs_info; + struct iattr *attr = p; + unsigned int ia_valid = attr->ia_valid; + + if (ia_valid & ATTR_UID) + bi->bi_uid = from_kuid(inode->v.i_sb->s_user_ns, attr->ia_uid); + if (ia_valid & ATTR_GID) + bi->bi_gid = from_kgid(inode->v.i_sb->s_user_ns, attr->ia_gid); + + if (ia_valid & ATTR_ATIME) + bi->bi_atime = timespec_to_bch2_time(c, attr->ia_atime); + if (ia_valid & ATTR_MTIME) + bi->bi_mtime = timespec_to_bch2_time(c, attr->ia_mtime); + if (ia_valid & ATTR_CTIME) + bi->bi_ctime = timespec_to_bch2_time(c, attr->ia_ctime); + + if (ia_valid & ATTR_MODE) { + umode_t mode = attr->ia_mode; + kgid_t gid = ia_valid & ATTR_GID + ? 
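/*
 * [sketch] The rename transaction above chains its sub-updates with the
 * GNU C "a ?: b" operator: every helper returns 0 on success, so the
 * chain runs left to right and stops at the first nonzero error code.
 * Demonstration with hypothetical step functions:
 */
#include <stdio.h>

static int step1(void) { return 0; }
static int step2(void) { return -5; }	/* fails */
static int step3(void) { printf("never reached\n"); return 0; }

int main(void)
{
	int ret = step1() ?: step2() ?: step3();

	printf("ret = %d\n", ret);	/* -5; step3() was never called */
	return 0;
}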
attr->ia_gid + : inode->v.i_gid; + + if (!in_group_p(gid) && + !capable_wrt_inode_uidgid(&inode->v, CAP_FSETID)) + mode &= ~S_ISGID; + bi->bi_mode = mode; + } - return bch2_rename(c, old_dir, old_dentry, new_dir, new_dentry); + return 0; } static int bch2_setattr_nonsize(struct bch_inode_info *inode, struct iattr *iattr) { struct bch_fs *c = inode->v.i_sb->s_fs_info; - struct bch_qid qid = inode->ei_qid; - unsigned qtypes = 0; + struct bch_qid qid; + struct btree_trans trans; + struct bch_inode_unpacked inode_u; + struct posix_acl *acl = NULL; int ret; mutex_lock(&inode->ei_update_lock); - if (c->opts.usrquota && - (iattr->ia_valid & ATTR_UID) && - !uid_eq(iattr->ia_uid, inode->v.i_uid)) { - qid.q[QTYP_USR] = from_kuid(&init_user_ns, iattr->ia_uid), - qtypes |= 1 << QTYP_USR; - } + qid = inode->ei_qid; + + if (iattr->ia_valid & ATTR_UID) + qid.q[QTYP_USR] = from_kuid(&init_user_ns, iattr->ia_uid); - if (c->opts.grpquota && - (iattr->ia_valid & ATTR_GID) && - !gid_eq(iattr->ia_gid, inode->v.i_gid)) { + if (iattr->ia_valid & ATTR_GID) qid.q[QTYP_GRP] = from_kgid(&init_user_ns, iattr->ia_gid); - qtypes |= 1 << QTYP_GRP; - } - if (qtypes) { - ret = bch2_quota_transfer(c, qtypes, qid, inode->ei_qid, - inode->v.i_blocks + - inode->ei_quota_reserved); - if (ret) - goto out_unlock; - } + ret = bch2_fs_quota_transfer(c, inode, qid, ~0, + KEY_TYPE_QUOTA_PREALLOC); + if (ret) + goto err; - setattr_copy(&inode->v, iattr); + bch2_trans_init(&trans, c, 0, 0); +retry: + bch2_trans_begin(&trans); + kfree(acl); + acl = NULL; + + ret = bch2_write_inode_trans(&trans, inode, &inode_u, + inode_update_for_setattr_fn, iattr) ?: + (iattr->ia_valid & ATTR_MODE + ? bch2_acl_chmod(&trans, inode, iattr->ia_mode, &acl) + : 0) ?: + bch2_trans_commit(&trans, NULL, + &inode->ei_journal_seq, + BTREE_INSERT_ATOMIC| + BTREE_INSERT_NOUNLOCK| + BTREE_INSERT_NOFAIL); + if (ret == -EINTR) + goto retry; + if (unlikely(ret)) + goto err_trans; - ret = bch2_write_inode(c, inode); -out_unlock: - mutex_unlock(&inode->ei_update_lock); + bch2_inode_update_after_write(c, inode, &inode_u, iattr->ia_valid); - if (!ret && - iattr->ia_valid & ATTR_MODE) - ret = posix_acl_chmod(&inode->v, inode->v.i_mode); + if (acl) + set_cached_acl(&inode->v, ACL_TYPE_ACCESS, acl); +err_trans: + bch2_trans_exit(&trans); +err: + mutex_unlock(&inode->ei_update_lock); return ret; } @@ -711,16 +1055,14 @@ static int bch2_setattr(struct dentry *dentry, struct iattr *iattr) static int bch2_tmpfile(struct inode *vdir, struct dentry *dentry, umode_t mode) { - struct bch_fs *c = vdir->i_sb->s_fs_info; - struct bch_inode_info *dir = to_bch_ei(vdir); - struct bch_inode_info *inode; + struct bch_inode_info *inode = + __bch2_create(to_bch_ei(vdir), dentry, mode, 0, true); - /* XXX: i_nlink should be 0? 
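/*
 * [sketch] The ATTR_MODE branch above applies the classic chmod rule:
 * a caller who is not in the file's group and lacks CAP_FSETID cannot
 * set the setgid bit, so it is silently stripped. in_group/has_fsetid
 * are stand-ins for in_group_p()/capable_wrt_inode_uidgid():
 */
#include <stdbool.h>
#include <stdio.h>
#include <sys/stat.h>

static mode_t apply_chmod(mode_t requested, bool in_group, bool has_fsetid)
{
	if (!in_group && !has_fsetid)
		requested &= ~S_ISGID;
	return requested;
}

int main(void)
{
	mode_t m = apply_chmod(02755, false, false);

	printf("%o\n", (unsigned) m);	/* 755: setgid bit cleared */
	return 0;
}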
*/ - inode = bch2_vfs_inode_create(c, dir, mode, 0); - if (unlikely(IS_ERR(inode))) + if (IS_ERR(inode)) return PTR_ERR(inode); - d_tmpfile(dentry, &inode->v); + d_mark_tmpfile(dentry, &inode->v); + d_instantiate(dentry, &inode->v); return 0; } @@ -729,33 +1071,33 @@ static int bch2_fill_extent(struct fiemap_extent_info *info, { if (bkey_extent_is_data(&k->k)) { struct bkey_s_c_extent e = bkey_i_to_s_c_extent(k); - const struct bch_extent_ptr *ptr; - struct bch_extent_crc_unpacked crc; + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; int ret; - extent_for_each_ptr_crc(e, ptr, crc) { + extent_for_each_ptr_decode(e, p, entry) { int flags2 = 0; - u64 offset = ptr->offset; + u64 offset = p.ptr.offset; - if (crc.compression_type) + if (p.crc.compression_type) flags2 |= FIEMAP_EXTENT_ENCODED; else - offset += crc.offset; + offset += p.crc.offset; if ((offset & (PAGE_SECTORS - 1)) || (e.k->size & (PAGE_SECTORS - 1))) flags2 |= FIEMAP_EXTENT_NOT_ALIGNED; ret = fiemap_fill_next_extent(info, - bkey_start_offset(e.k) << 9, - offset << 9, - e.k->size << 9, flags|flags2); + bkey_start_offset(e.k) << 9, + offset << 9, + e.k->size << 9, flags|flags2); if (ret) return ret; } return 0; - } else if (k->k.type == BCH_RESERVATION) { + } else if (k->k.type == KEY_TYPE_reservation) { return fiemap_fill_next_extent(info, bkey_start_offset(&k->k) << 9, 0, k->k.size << 9, @@ -772,7 +1114,8 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info, { struct bch_fs *c = vinode->i_sb->s_fs_info; struct bch_inode_info *ei = to_bch_ei(vinode); - struct btree_iter iter; + struct btree_trans trans; + struct btree_iter *iter; struct bkey_s_c k; BKEY_PADDED(k) tmp; bool have_extent = false; @@ -781,10 +1124,12 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info, if (start + len < start) return -EINVAL; - for_each_btree_key(&iter, c, BTREE_ID_EXTENTS, - POS(ei->v.i_ino, start >> 9), 0, k) + bch2_trans_init(&trans, c, 0, 0); + + for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, + POS(ei->v.i_ino, start >> 9), 0, k, ret) if (bkey_extent_is_data(k.k) || - k.k->type == BCH_RESERVATION) { + k.k->type == KEY_TYPE_reservation) { if (bkey_cmp(bkey_start_pos(k.k), POS(ei->v.i_ino, (start + len) >> 9)) >= 0) break; @@ -792,17 +1137,17 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info, if (have_extent) { ret = bch2_fill_extent(info, &tmp.k, 0); if (ret) - goto out; + break; } bkey_reassemble(&tmp.k, k); have_extent = true; } - if (have_extent) + if (!ret && have_extent) ret = bch2_fill_extent(info, &tmp.k, FIEMAP_EXTENT_LAST); -out: - bch2_btree_iter_unlock(&iter); + + ret = bch2_trans_exit(&trans) ?: ret; return ret < 0 ? 
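/*
 * [sketch] bch2_fill_extent() above reports in bytes while bcachefs
 * keys count 512-byte sectors, hence the "<< 9" shifts; extents that
 * are not page aligned get FIEMAP_EXTENT_NOT_ALIGNED. Toy version,
 * with report() standing in for fiemap_fill_next_extent():
 */
#include <stdint.h>
#include <stdio.h>

#define PAGE_SECTORS		8u	/* 4096-byte pages */
#define EXTENT_NOT_ALIGNED	0x100u	/* FIEMAP_EXTENT_NOT_ALIGNED */

static void report(uint64_t logical, uint64_t len, unsigned flags)
{
	printf("logical %llu len %llu flags %#x\n",
	       (unsigned long long) logical,
	       (unsigned long long) len, flags);
}

int main(void)
{
	uint64_t start_sector = 17, sectors = 8;	/* unaligned start */
	unsigned flags = 0;

	if ((start_sector | sectors) & (PAGE_SECTORS - 1))
		flags |= EXTENT_NOT_ALIGNED;

	report(start_sector << 9, sectors << 9, flags);
	return 0;
}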
ret : 0; } @@ -975,26 +1320,18 @@ static void bch2_vfs_inode_init(struct bch_fs *c, struct bch_inode_info *inode, struct bch_inode_unpacked *bi) { - inode->v.i_mode = bi->bi_mode; - i_uid_write(&inode->v, bi->bi_uid); - i_gid_write(&inode->v, bi->bi_gid); + bch2_inode_update_after_write(c, inode, bi, ~0); + inode->v.i_blocks = bi->bi_sectors; inode->v.i_ino = bi->bi_inum; - set_nlink(&inode->v, bi->bi_nlink + nlink_bias(inode->v.i_mode)); inode->v.i_rdev = bi->bi_dev; inode->v.i_generation = bi->bi_generation; inode->v.i_size = bi->bi_size; - inode->v.i_atime = bch2_time_to_timespec(c, bi->bi_atime); - inode->v.i_mtime = bch2_time_to_timespec(c, bi->bi_mtime); - inode->v.i_ctime = bch2_time_to_timespec(c, bi->bi_ctime); inode->ei_journal_seq = 0; inode->ei_quota_reserved = 0; - inode->ei_qid = bch_qid(bi); inode->ei_str_hash = bch2_hash_info_init(c, bi); - inode->ei_inode = *bi; - - bch2_inode_flags_to_vfs(inode); + inode->ei_qid = bch_qid(bi); inode->v.i_mapping->a_ops = &bch_address_space_operations; @@ -1029,6 +1366,7 @@ static struct inode *bch2_alloc_inode(struct super_block *sb) inode_init_once(&inode->v); mutex_init(&inode->ei_update_lock); mutex_init(&inode->ei_quota_lock); + inode->ei_inode_update = NULL; inode->ei_journal_seq = 0; return &inode->v; @@ -1047,6 +1385,19 @@ static void bch2_destroy_inode(struct inode *vinode) call_rcu(&vinode->i_rcu, bch2_i_callback); } +static int inode_update_times_fn(struct bch_inode_info *inode, + struct bch_inode_unpacked *bi, + void *p) +{ + struct bch_fs *c = inode->v.i_sb->s_fs_info; + + bi->bi_atime = timespec_to_bch2_time(c, inode->v.i_atime); + bi->bi_mtime = timespec_to_bch2_time(c, inode->v.i_mtime); + bi->bi_ctime = timespec_to_bch2_time(c, inode->v.i_ctime); + + return 0; +} + static int bch2_vfs_write_inode(struct inode *vinode, struct writeback_control *wbc) { @@ -1055,7 +1406,8 @@ static int bch2_vfs_write_inode(struct inode *vinode, int ret; mutex_lock(&inode->ei_update_lock); - ret = bch2_write_inode(c, inode); + ret = bch2_write_inode(c, inode, inode_update_times_fn, NULL, + ATTR_ATIME|ATTR_MTIME|ATTR_CTIME); mutex_unlock(&inode->ei_update_lock); if (c->opts.journal_flush_disabled) @@ -1078,13 +1430,16 @@ static void bch2_evict_inode(struct inode *vinode) BUG_ON(!is_bad_inode(&inode->v) && inode->ei_quota_reserved); + if (inode->ei_inode_update) + bch2_deferred_update_free(c, inode->ei_inode_update); + inode->ei_inode_update = NULL; + if (!inode->v.i_nlink && !is_bad_inode(&inode->v)) { bch2_quota_acct(c, inode->ei_qid, Q_SPC, -((s64) inode->v.i_blocks), - BCH_QUOTA_WARN); + KEY_TYPE_QUOTA_WARN); bch2_quota_acct(c, inode->ei_qid, Q_INO, -1, - BCH_QUOTA_WARN); + KEY_TYPE_QUOTA_WARN); bch2_inode_rm(c, inode->v.i_ino); - atomic_long_dec(&c->nr_inodes); } } @@ -1092,22 +1447,23 @@ static int bch2_statfs(struct dentry *dentry, struct kstatfs *buf) { struct super_block *sb = dentry->d_sb; struct bch_fs *c = sb->s_fs_info; + struct bch_fs_usage_short usage = bch2_fs_usage_read_short(c); + unsigned shift = sb->s_blocksize_bits - 9; u64 fsid; buf->f_type = BCACHEFS_STATFS_MAGIC; buf->f_bsize = sb->s_blocksize; - buf->f_blocks = c->capacity >> PAGE_SECTOR_SHIFT; - buf->f_bfree = bch2_fs_sectors_free(c, bch2_fs_usage_read(c)) >> - PAGE_SECTOR_SHIFT; + buf->f_blocks = usage.capacity >> shift; + buf->f_bfree = (usage.capacity - usage.used) >> shift; buf->f_bavail = buf->f_bfree; - buf->f_files = atomic_long_read(&c->nr_inodes); + buf->f_files = usage.nr_inodes; buf->f_ffree = U64_MAX; fsid = le64_to_cpup((void *) c->sb.user_uuid.b) ^ 
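/*
 * [sketch] The statfs rewrite above converts capacity/used counters
 * kept in 512-byte sectors into f_blocks/f_bfree in filesystem blocks
 * via shift = s_blocksize_bits - 9. Worked example for 4096-byte
 * blocks:
 */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	unsigned blocksize_bits = 12;		/* 4096-byte blocks */
	unsigned shift = blocksize_bits - 9;	/* sectors -> fs blocks */
	uint64_t capacity = 1 << 20;		/* 1M sectors = 512 MiB */
	uint64_t used = 1 << 18;

	printf("f_blocks %llu f_bfree %llu\n",
	       (unsigned long long) (capacity >> shift),
	       (unsigned long long) ((capacity - used) >> shift));
	return 0;
}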
le64_to_cpup((void *) c->sb.user_uuid.b + sizeof(u64)); buf->f_fsid.val[0] = fsid & 0xFFFFFFFFUL; buf->f_fsid.val[1] = (fsid >> 32) & 0xFFFFFFFFUL; - buf->f_namelen = NAME_MAX; + buf->f_namelen = BCH_NAME_MAX; return 0; } @@ -1156,7 +1512,7 @@ static struct bch_fs *__bch2_open_as_blockdevs(const char *dev_name, char * cons */ c1 = bch2_path_to_fs(devs[0]); - if (!c1) + if (IS_ERR(c1)) return c; for (i = 1; i < nr_devs; i++) { @@ -1178,7 +1534,7 @@ static struct bch_fs *__bch2_open_as_blockdevs(const char *dev_name, char * cons mutex_lock(&c->state_lock); - if (!bch2_fs_running(c)) { + if (!test_bit(BCH_FS_STARTED, &c->flags)) { mutex_unlock(&c->state_lock); closure_put(&c->cl); pr_err("err mounting %s: incomplete filesystem", dev_name); @@ -1227,29 +1583,28 @@ static int bch2_remount(struct super_block *sb, int *flags, char *data) struct bch_opts opts = bch2_opts_empty(); int ret; - opt_set(opts, read_only, (*flags & MS_RDONLY) != 0); + opt_set(opts, read_only, (*flags & SB_RDONLY) != 0); ret = bch2_parse_mount_opts(&opts, data); if (ret) return ret; if (opts.read_only != c->opts.read_only) { - const char *err = NULL; - mutex_lock(&c->state_lock); if (opts.read_only) { bch2_fs_read_only(c); - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; } else { - err = bch2_fs_read_write(c); - if (err) { - bch_err(c, "error going rw: %s", err); + ret = bch2_fs_read_write(c); + if (ret) { + bch_err(c, "error going rw: %i", ret); + mutex_unlock(&c->state_lock); return -EINVAL; } - sb->s_flags &= ~MS_RDONLY; + sb->s_flags &= ~SB_RDONLY; } c->opts.read_only = opts.read_only; @@ -1273,13 +1628,13 @@ static int bch2_show_options(struct seq_file *seq, struct dentry *root) const struct bch_option *opt = &bch2_opt_table[i]; u64 v = bch2_opt_get_by_id(&c->opts, i); - if (opt->mode < OPT_MOUNT) + if (!(opt->mode & OPT_MOUNT)) continue; if (v == bch2_opt_get_by_id(&bch2_opts_default, i)) continue; - bch2_opt_to_text(c, buf, sizeof(buf), opt, v, + bch2_opt_to_text(&PBUF(buf), c, opt, v, OPT_SHOW_MOUNT_STYLE); seq_putc(seq, ','); seq_puts(seq, buf); @@ -1327,7 +1682,7 @@ static struct dentry *bch2_mount(struct file_system_type *fs_type, unsigned i; int ret; - opt_set(opts, read_only, (flags & MS_RDONLY) != 0); + opt_set(opts, read_only, (flags & SB_RDONLY) != 0); ret = bch2_parse_mount_opts(&opts, data); if (ret) @@ -1337,7 +1692,7 @@ static struct dentry *bch2_mount(struct file_system_type *fs_type, if (IS_ERR(c)) return ERR_CAST(c); - sb = sget(fs_type, bch2_test_super, bch2_set_super, flags|MS_NOSEC, c); + sb = sget(fs_type, bch2_test_super, bch2_set_super, flags|SB_NOSEC, c); if (IS_ERR(sb)) { closure_put(&c->cl); return ERR_CAST(sb); @@ -1348,7 +1703,7 @@ static struct dentry *bch2_mount(struct file_system_type *fs_type, if (sb->s_root) { closure_put(&c->cl); - if ((flags ^ sb->s_flags) & MS_RDONLY) { + if ((flags ^ sb->s_flags) & SB_RDONLY) { ret = -EBUSY; goto err_put_super; } @@ -1377,7 +1732,7 @@ static struct dentry *bch2_mount(struct file_system_type *fs_type, sb->s_bdi->congested_fn = bch2_congested; sb->s_bdi->congested_data = c; - sb->s_bdi->ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_SIZE; + sb->s_bdi->ra_pages = VM_READAHEAD_PAGES; for_each_online_member(ca, c, i) { struct block_device *bdev = ca->disk_sb.bdev; @@ -1391,22 +1746,25 @@ static struct dentry *bch2_mount(struct file_system_type *fs_type, #ifdef CONFIG_BCACHEFS_POSIX_ACL if (c->opts.acl) - sb->s_flags |= MS_POSIXACL; + sb->s_flags |= SB_POSIXACL; #endif vinode = bch2_vfs_inode_get(c, BCACHEFS_ROOT_INO); if (IS_ERR(vinode)) { + 
bch_err(c, "error mounting: error getting root inode %i", + (int) PTR_ERR(vinode)); ret = PTR_ERR(vinode); goto err_put_super; } sb->s_root = d_make_root(vinode); if (!sb->s_root) { + bch_err(c, "error mounting: error allocating root dentry"); ret = -ENOMEM; goto err_put_super; } - sb->s_flags |= MS_ACTIVE; + sb->s_flags |= SB_ACTIVE; out: return dget(sb->s_root); diff --git a/fs/bcachefs/fs.h b/fs/bcachefs/fs.h index fbbc7a3a3cb7..fad8f4e5fdfc 100644 --- a/fs/bcachefs/fs.h +++ b/fs/bcachefs/fs.h @@ -1,6 +1,8 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _BCACHEFS_FS_H #define _BCACHEFS_FS_H +#include "inode.h" #include "opts.h" #include "str_hash.h" #include "quota_types.h" @@ -12,6 +14,7 @@ struct bch_inode_info { struct inode v; struct mutex ei_update_lock; + struct deferred_update *ei_inode_update; u64 ei_journal_seq; u64 ei_quota_reserved; unsigned long ei_last_dirtied; @@ -28,6 +31,30 @@ struct bch_inode_info { #define to_bch_ei(_inode) \ container_of_or_null(_inode, struct bch_inode_info, v) +static inline int ptrcmp(void *l, void *r) +{ + return cmp_int(l, r); +} + +#define __bch2_lock_inodes(_lock, ...) \ +do { \ + struct bch_inode_info *a[] = { NULL, __VA_ARGS__ }; \ + unsigned i; \ + \ + bubble_sort(&a[1], ARRAY_SIZE(a) - 1 , ptrcmp); \ + \ + for (i = ARRAY_SIZE(a) - 1; a[i]; --i) \ + if (a[i] != a[i - 1]) { \ + if (_lock) \ + mutex_lock_nested(&a[i]->ei_update_lock, i);\ + else \ + mutex_unlock(&a[i]->ei_update_lock); \ + } \ +} while (0) + +#define bch2_lock_inodes(...) __bch2_lock_inodes(true, __VA_ARGS__) +#define bch2_unlock_inodes(...) __bch2_lock_inodes(false, __VA_ARGS__) + static inline struct bch_inode_info *file_bch_inode(struct file *file) { return to_bch_ei(file_inode(file)); @@ -43,18 +70,70 @@ static inline unsigned nlink_bias(umode_t mode) return S_ISDIR(mode) ? 
2 : 1; } +static inline bool inode_attr_changing(struct bch_inode_info *dir, + struct bch_inode_info *inode, + enum inode_opt_id id) +{ + return !(inode->ei_inode.bi_fields_set & (1 << id)) && + bch2_inode_opt_get(&dir->ei_inode, id) != + bch2_inode_opt_get(&inode->ei_inode, id); +} + +static inline bool inode_attrs_changing(struct bch_inode_info *dir, + struct bch_inode_info *inode) +{ + unsigned id; + + for (id = 0; id < Inode_opt_nr; id++) + if (inode_attr_changing(dir, inode, id)) + return true; + + return false; +} + struct bch_inode_unpacked; #ifndef NO_BCACHEFS_FS +int bch2_fs_quota_transfer(struct bch_fs *, + struct bch_inode_info *, + struct bch_qid, + unsigned, + enum quota_acct_mode); + +static inline int bch2_set_projid(struct bch_fs *c, + struct bch_inode_info *inode, + u32 projid) +{ + struct bch_qid qid = inode->ei_qid; + + qid.q[QTYP_PRJ] = projid; + + return bch2_fs_quota_transfer(c, inode, qid, + 1 << QTYP_PRJ, + KEY_TYPE_QUOTA_PREALLOC); +} + +struct inode *bch2_vfs_inode_get(struct bch_fs *, u64); + /* returns 0 if we want to do the update, or error is passed up */ typedef int (*inode_set_fn)(struct bch_inode_info *, struct bch_inode_unpacked *, void *); -int __must_check __bch2_write_inode(struct bch_fs *, struct bch_inode_info *, - inode_set_fn, void *); -int __must_check bch2_write_inode(struct bch_fs *, - struct bch_inode_info *); +void bch2_inode_update_after_write(struct bch_fs *, + struct bch_inode_info *, + struct bch_inode_unpacked *, + unsigned); +int __must_check bch2_write_inode_trans(struct btree_trans *, + struct bch_inode_info *, + struct bch_inode_unpacked *, + inode_set_fn, void *); +int __must_check bch2_write_inode(struct bch_fs *, struct bch_inode_info *, + inode_set_fn, void *, unsigned); + +int bch2_reinherit_attrs_fn(struct bch_inode_info *, + struct bch_inode_unpacked *, + void *); void bch2_vfs_exit(void); int bch2_vfs_init(void); diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index c554a987f3aa..e3738757b6a0 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" #include "btree_update.h" @@ -15,9 +16,31 @@ #define QSTR(n) { { { .len = strlen(n) } }, .name = n } -static int remove_dirent(struct bch_fs *c, struct btree_iter *iter, +static s64 bch2_count_inode_sectors(struct btree_trans *trans, u64 inum) +{ + struct btree_iter *iter; + struct bkey_s_c k; + u64 sectors = 0; + int ret; + + for_each_btree_key(trans, iter, BTREE_ID_EXTENTS, + POS(inum, 0), 0, k, ret) { + if (k.k->p.inode != inum) + break; + + if (bkey_extent_is_allocation(k.k)) + sectors += k.k->size; + } + + bch2_trans_iter_free(trans, iter); + + return ret ?: sectors; +} + +static int remove_dirent(struct btree_trans *trans, struct bkey_s_c_dirent dirent) { + struct bch_fs *c = trans->c; struct qstr name; struct bch_inode_unpacked dir_inode; struct bch_hash_info dir_hash_info; @@ -34,8 +57,8 @@ static int remove_dirent(struct bch_fs *c, struct btree_iter *iter, buf[name.len] = '\0'; name.name = buf; - /* Unlock iter so we don't deadlock, after copying name: */ - bch2_btree_iter_unlock(iter); + /* Unlock so we don't deadlock, after copying name: */ + bch2_trans_unlock(trans); ret = bch2_inode_find_by_inum(c, dir_inum, &dir_inode); if (ret) { @@ -72,8 +95,9 @@ static int reattach_inode(struct bch_fs *c, bch2_inode_pack(&packed, lostfound_inode); ret = bch2_btree_insert(c, BTREE_ID_INODES, &packed.inode.k_i, - NULL, NULL, NULL, - BTREE_INSERT_NOFAIL); + NULL, NULL, + BTREE_INSERT_NOFAIL| + 
BTREE_INSERT_LAZY_RW); if (ret) { bch_err(c, "error %i reattaching inode %llu while updating lost+found", ret, inum); @@ -83,7 +107,8 @@ static int reattach_inode(struct bch_fs *c, ret = bch2_dirent_create(c, lostfound_inode->bi_inum, &lostfound_hash_info, DT_DIR, &name, inum, NULL, - BTREE_INSERT_NOFAIL); + BTREE_INSERT_NOFAIL| + BTREE_INSERT_LAZY_RW); if (ret) { bch_err(c, "error %i reattaching inode %llu while creating new dirent", ret, inum); @@ -107,18 +132,21 @@ static struct inode_walker inode_walker_init(void) }; } -static int walk_inode(struct bch_fs *c, struct inode_walker *w, u64 inum) +static int walk_inode(struct btree_trans *trans, + struct inode_walker *w, u64 inum) { - w->first_this_inode = inum != w->cur_inum; - w->cur_inum = inum; - - if (w->first_this_inode) { - int ret = bch2_inode_find_by_inum(c, inum, &w->inode); + if (inum != w->cur_inum) { + int ret = bch2_inode_find_by_inum_trans(trans, inum, + &w->inode); if (ret && ret != -ENOENT) return ret; - w->have_inode = !ret; + w->have_inode = !ret; + w->cur_inum = inum; + w->first_this_inode = true; + } else { + w->first_this_inode = false; } return 0; @@ -126,27 +154,37 @@ static int walk_inode(struct bch_fs *c, struct inode_walker *w, u64 inum) struct hash_check { struct bch_hash_info info; - struct btree_iter chain; - struct btree_iter iter; - u64 next; + + /* start of current chain of hash collisions: */ + struct btree_iter *chain; + + /* next offset in current chain of hash collisions: */ + u64 chain_end; }; -static void hash_check_init(const struct bch_hash_desc desc, - struct hash_check *h, struct bch_fs *c) +static void hash_check_init(struct hash_check *h) +{ + h->chain = NULL; +} + +static void hash_stop_chain(struct btree_trans *trans, + struct hash_check *h) { - bch2_btree_iter_init(&h->chain, c, desc.btree_id, POS_MIN, 0); - bch2_btree_iter_init(&h->iter, c, desc.btree_id, POS_MIN, 0); + if (h->chain) + bch2_trans_iter_free(trans, h->chain); + h->chain = NULL; } -static void hash_check_set_inode(struct hash_check *h, struct bch_fs *c, +static void hash_check_set_inode(struct btree_trans *trans, + struct hash_check *h, const struct bch_inode_unpacked *bi) { - h->info = bch2_hash_info_init(c, bi); - h->next = -1; + h->info = bch2_hash_info_init(trans->c, bi); + hash_stop_chain(trans, h); } static int hash_redo_key(const struct bch_hash_desc desc, - struct hash_check *h, struct bch_fs *c, + struct btree_trans *trans, struct hash_check *h, struct btree_iter *k_iter, struct bkey_s_c k, u64 hashed) { @@ -159,54 +197,142 @@ static int hash_redo_key(const struct bch_hash_desc desc, bkey_reassemble(tmp, k); - ret = bch2_btree_delete_at(k_iter, 0); + ret = bch2_btree_delete_at(trans, k_iter, 0); if (ret) goto err; - bch2_btree_iter_unlock(k_iter); - - bch2_hash_set(desc, &h->info, c, k_iter->pos.inode, NULL, tmp, - BTREE_INSERT_NOFAIL| - BCH_HASH_SET_MUST_CREATE); + bch2_hash_set(trans, desc, &h->info, k_iter->pos.inode, + tmp, BCH_HASH_SET_MUST_CREATE); + ret = bch2_trans_commit(trans, NULL, NULL, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_LAZY_RW); err: kfree(tmp); return ret; } -static int hash_check_key(const struct bch_hash_desc desc, - struct hash_check *h, struct bch_fs *c, - struct btree_iter *k_iter, struct bkey_s_c k) +static int fsck_hash_delete_at(struct btree_trans *trans, + const struct bch_hash_desc desc, + struct bch_hash_info *info, + struct btree_iter *iter) +{ + int ret; +retry: + ret = bch2_hash_delete_at(trans, desc, info, iter) ?: + bch2_trans_commit(trans, NULL, NULL, + BTREE_INSERT_ATOMIC| + 
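/*
 * [sketch] hash_redo_key() above repairs a dirent/xattr stored at the
 * wrong btree offset: delete the key where it sits, then reinsert it
 * starting from the slot its hash maps to. Toy open-addressed table
 * showing the same delete-and-rehash move:
 */
#include <stdio.h>

#define SLOTS 8u

static const char *table[SLOTS];

static unsigned hash(const char *s)
{
	unsigned h = 0;

	while (*s)
		h = h * 31 + (unsigned char) *s++;
	return h % SLOTS;
}

static void redo_key(unsigned wrong_slot)
{
	const char *k = table[wrong_slot];
	unsigned slot = hash(k);

	table[wrong_slot] = NULL;		/* bch2_btree_delete_at() */
	while (table[slot])			/* linear probe, like the */
		slot = (slot + 1) % SLOTS;	/* collision chain above  */
	table[slot] = k;			/* bch2_hash_set() */
}

int main(void)
{
	table[0] = "foo";	/* landed at the wrong offset somehow */
	redo_key(0);
	printf("\"foo\" rehashed to slot %u\n", hash("foo"));
	return 0;
}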
BTREE_INSERT_NOFAIL| + BTREE_INSERT_LAZY_RW); + if (ret == -EINTR) { + ret = bch2_btree_iter_traverse(iter); + if (!ret) + goto retry; + } + + return ret; +} + +static int hash_check_duplicates(struct btree_trans *trans, + const struct bch_hash_desc desc, struct hash_check *h, + struct btree_iter *k_iter, struct bkey_s_c k) { + struct bch_fs *c = trans->c; + struct btree_iter *iter; + struct bkey_s_c k2; char buf[200]; - u64 hashed; int ret = 0; - if (k.k->type != desc.whiteout_type && - k.k->type != desc.key_type) + if (!bkey_cmp(h->chain->pos, k_iter->pos)) return 0; - if (k.k->p.offset != h->next) { - if (!btree_iter_linked(&h->chain)) { - bch2_btree_iter_link(k_iter, &h->chain); - bch2_btree_iter_link(k_iter, &h->iter); + iter = bch2_trans_copy_iter(trans, h->chain); + BUG_ON(IS_ERR(iter)); + + for_each_btree_key_continue(iter, 0, k2) { + if (bkey_cmp(k2.k->p, k.k->p) >= 0) + break; + + if (fsck_err_on(k2.k->type == desc.key_type && + !desc.cmp_bkey(k, k2), c, + "duplicate hash table keys:\n%s", + (bch2_bkey_val_to_text(&PBUF(buf), c, + k), buf))) { + ret = fsck_hash_delete_at(trans, desc, &h->info, k_iter); + if (ret) + return ret; + ret = 1; + break; } - bch2_btree_iter_copy(&h->chain, k_iter); } - h->next = k.k->p.offset + 1; +fsck_err: + bch2_trans_iter_free(trans, iter); + return ret; +} + +static void hash_set_chain_start(struct btree_trans *trans, + const struct bch_hash_desc desc, + struct hash_check *h, + struct btree_iter *k_iter, struct bkey_s_c k) +{ + bool hole = (k.k->type != KEY_TYPE_whiteout && + k.k->type != desc.key_type); + + if (hole || k.k->p.offset > h->chain_end + 1) + hash_stop_chain(trans, h); + + if (!hole) { + if (!h->chain) { + h->chain = bch2_trans_copy_iter(trans, k_iter); + BUG_ON(IS_ERR(h->chain)); + } + + h->chain_end = k.k->p.offset; + } +} + +static bool key_has_correct_hash(struct btree_trans *trans, + const struct bch_hash_desc desc, + struct hash_check *h, + struct btree_iter *k_iter, struct bkey_s_c k) +{ + u64 hash; + + hash_set_chain_start(trans, desc, h, k_iter, k); + + if (k.k->type != desc.key_type) + return true; + + hash = desc.hash_bkey(&h->info, k); + + return hash >= h->chain->pos.offset && + hash <= k.k->p.offset; +} + +static int hash_check_key(struct btree_trans *trans, + const struct bch_hash_desc desc, struct hash_check *h, + struct btree_iter *k_iter, struct bkey_s_c k) +{ + struct bch_fs *c = trans->c; + char buf[200]; + u64 hashed; + int ret = 0; + + hash_set_chain_start(trans, desc, h, k_iter, k); if (k.k->type != desc.key_type) return 0; hashed = desc.hash_bkey(&h->info, k); - if (fsck_err_on(hashed < h->chain.pos.offset || + if (fsck_err_on(hashed < h->chain->pos.offset || hashed > k.k->p.offset, c, - "hash table key at wrong offset: %llu, " + "hash table key at wrong offset: btree %u, %llu, " "hashed to %llu chain starts at %llu\n%s", - k.k->p.offset, hashed, h->chain.pos.offset, - (bch2_bkey_val_to_text(c, bkey_type(0, desc.btree_id), - buf, sizeof(buf), k), buf))) { - ret = hash_redo_key(desc, h, c, k_iter, k, hashed); + desc.btree_id, k.k->p.offset, + hashed, h->chain->pos.offset, + (bch2_bkey_val_to_text(&PBUF(buf), c, + k), buf))) { + ret = hash_redo_key(desc, trans, h, k_iter, k, hashed); if (ret) { bch_err(c, "hash_redo_key err %i", ret); return ret; @@ -214,27 +340,101 @@ static int hash_check_key(const struct bch_hash_desc desc, return 1; } - if (!bkey_cmp(h->chain.pos, k_iter->pos)) + ret = hash_check_duplicates(trans, desc, h, k_iter, k); +fsck_err: + return ret; +} + +static int check_dirent_hash(struct 
btree_trans *trans, struct hash_check *h, + struct btree_iter *iter, struct bkey_s_c *k) +{ + struct bch_fs *c = trans->c; + struct bkey_i_dirent *d = NULL; + int ret = -EINVAL; + char buf[200]; + unsigned len; + u64 hash; + + if (key_has_correct_hash(trans, bch2_dirent_hash_desc, h, iter, *k)) return 0; - bch2_btree_iter_copy(&h->iter, &h->chain); - while (bkey_cmp(h->iter.pos, k_iter->pos) < 0) { - struct bkey_s_c k2 = bch2_btree_iter_peek(&h->iter); + len = bch2_dirent_name_bytes(bkey_s_c_to_dirent(*k)); + BUG_ON(!len); - if (fsck_err_on(k2.k->type == desc.key_type && - !desc.cmp_bkey(k, k2), c, - "duplicate hash table keys:\n%s", - (bch2_bkey_val_to_text(c, bkey_type(0, desc.btree_id), - buf, sizeof(buf), k), buf))) { - ret = bch2_hash_delete_at(desc, &h->info, &h->iter, NULL); - if (ret) - return ret; - return 1; - } - bch2_btree_iter_next(&h->iter); + memcpy(buf, bkey_s_c_to_dirent(*k).v->d_name, len); + buf[len] = '\0'; + + d = kmalloc(bkey_bytes(k->k), GFP_KERNEL); + if (!d) { + bch_err(c, "memory allocation failure"); + return -ENOMEM; + } + + bkey_reassemble(&d->k_i, *k); + + do { + --len; + if (!len) + goto err_redo; + + d->k.u64s = BKEY_U64s + dirent_val_u64s(len); + + BUG_ON(bkey_val_bytes(&d->k) < + offsetof(struct bch_dirent, d_name) + len); + + memset(d->v.d_name + len, 0, + bkey_val_bytes(&d->k) - + offsetof(struct bch_dirent, d_name) - len); + + hash = bch2_dirent_hash_desc.hash_bkey(&h->info, + bkey_i_to_s_c(&d->k_i)); + } while (hash < h->chain->pos.offset || + hash > k->k->p.offset); + + if (fsck_err(c, "dirent with junk at end, was %s (%zu) now %s (%u)", + buf, strlen(buf), d->v.d_name, len)) { + bch2_trans_update(trans, BTREE_INSERT_ENTRY(iter, &d->k_i)); + + ret = bch2_trans_commit(trans, NULL, NULL, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_LAZY_RW); + if (ret) + goto err; + + *k = bch2_btree_iter_peek(iter); + + BUG_ON(k->k->type != KEY_TYPE_dirent); } +err: fsck_err: + kfree(d); return ret; +err_redo: + hash = bch2_dirent_hash_desc.hash_bkey(&h->info, *k); + + if (fsck_err(c, "cannot fix dirent by removing trailing garbage %s (%zu)\n" + "hash table key at wrong offset: btree %u, offset %llu, " + "hashed to %llu chain starts at %llu\n%s", + buf, strlen(buf), BTREE_ID_DIRENTS, + k->k->p.offset, hash, h->chain->pos.offset, + (bch2_bkey_val_to_text(&PBUF(buf), c, + *k), buf))) { + ret = hash_redo_key(bch2_dirent_hash_desc, trans, + h, iter, *k, hash); + if (ret) + bch_err(c, "hash_redo_key err %i", ret); + else + ret = 1; + } + + goto err; +} + +static int bch2_inode_truncate(struct bch_fs *c, u64 inode_nr, u64 new_size) +{ + return bch2_btree_delete_range(c, BTREE_ID_EXTENTS, + POS(inode_nr, round_up(new_size, block_bytes(c)) >> 9), + POS(inode_nr + 1, 0), NULL); } /* @@ -245,17 +445,21 @@ noinline_for_stack static int check_extents(struct bch_fs *c) { struct inode_walker w = inode_walker_init(); - struct btree_iter iter; + struct btree_trans trans; + struct btree_iter *iter; struct bkey_s_c k; u64 i_sectors; int ret = 0; - for_each_btree_key(&iter, c, BTREE_ID_EXTENTS, - POS(BCACHEFS_ROOT_INO, 0), 0, k) { - if (k.k->type == KEY_TYPE_DISCARD) - continue; + bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); + + bch_verbose(c, "checking extents"); - ret = walk_inode(c, &w, k.k->p.inode); + iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, + POS(BCACHEFS_ROOT_INO, 0), 0); +retry: + for_each_btree_key_continue(iter, 0, k) { + ret = walk_inode(&trans, &w, k.k->p.inode); if (ret) break; @@ -266,9 +470,9 @@ static int check_extents(struct bch_fs *c) !S_ISREG(w.inode.bi_mode) && 
!S_ISLNK(w.inode.bi_mode), c, "extent type %u for non regular file, inode %llu mode %o", k.k->type, k.k->p.inode, w.inode.bi_mode)) { - bch2_btree_iter_unlock(&iter); + bch2_trans_unlock(&trans); - ret = bch2_inode_truncate(c, k.k->p.inode, 0, NULL, NULL); + ret = bch2_inode_truncate(c, k.k->p.inode, 0); if (ret) goto err; continue; @@ -278,44 +482,40 @@ static int check_extents(struct bch_fs *c) w.have_inode && !(w.inode.bi_flags & BCH_INODE_I_SECTORS_DIRTY) && w.inode.bi_sectors != - (i_sectors = bch2_count_inode_sectors(c, w.cur_inum)), + (i_sectors = bch2_count_inode_sectors(&trans, w.cur_inum)), c, "i_sectors wrong: got %llu, should be %llu", w.inode.bi_sectors, i_sectors)) { struct bkey_inode_buf p; w.inode.bi_sectors = i_sectors; - bch2_btree_iter_unlock(&iter); + bch2_trans_unlock(&trans); bch2_inode_pack(&p, &w.inode); ret = bch2_btree_insert(c, BTREE_ID_INODES, - &p.inode.k_i, - NULL, - NULL, - NULL, - BTREE_INSERT_NOFAIL); + &p.inode.k_i, NULL, NULL, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_LAZY_RW); if (ret) { - bch_err(c, "error in fs gc: error %i " - "updating inode", ret); + bch_err(c, "error in fsck: error %i updating inode", ret); goto err; } /* revalidate iterator: */ - k = bch2_btree_iter_peek(&iter); + k = bch2_btree_iter_peek(iter); } if (fsck_err_on(w.have_inode && !(w.inode.bi_flags & BCH_INODE_I_SIZE_DIRTY) && - k.k->type != BCH_RESERVATION && + k.k->type != KEY_TYPE_reservation && k.k->p.offset > round_up(w.inode.bi_size, PAGE_SIZE) >> 9, c, "extent type %u offset %llu past end of inode %llu, i_size %llu", k.k->type, k.k->p.offset, k.k->p.inode, w.inode.bi_size)) { - bch2_btree_iter_unlock(&iter); + bch2_trans_unlock(&trans); ret = bch2_inode_truncate(c, k.k->p.inode, - round_up(w.inode.bi_size, PAGE_SIZE) >> 9, - NULL, NULL); + w.inode.bi_size); if (ret) goto err; continue; @@ -323,7 +523,9 @@ static int check_extents(struct bch_fs *c) } err: fsck_err: - return bch2_btree_iter_unlock(&iter) ?: ret; + if (ret == -EINTR) + goto retry; + return bch2_trans_exit(&trans) ?: ret; } /* @@ -335,53 +537,62 @@ static int check_dirents(struct bch_fs *c) { struct inode_walker w = inode_walker_init(); struct hash_check h; - struct btree_iter iter; + struct btree_trans trans; + struct btree_iter *iter; struct bkey_s_c k; unsigned name_len; char buf[200]; int ret = 0; - hash_check_init(bch2_dirent_hash_desc, &h, c); + bch_verbose(c, "checking dirents"); + + bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); + + hash_check_init(&h); - for_each_btree_key(&iter, c, BTREE_ID_DIRENTS, - POS(BCACHEFS_ROOT_INO, 0), 0, k) { + iter = bch2_trans_get_iter(&trans, BTREE_ID_DIRENTS, + POS(BCACHEFS_ROOT_INO, 0), 0); +retry: + for_each_btree_key_continue(iter, 0, k) { struct bkey_s_c_dirent d; struct bch_inode_unpacked target; bool have_target; u64 d_inum; - ret = walk_inode(c, &w, k.k->p.inode); + ret = walk_inode(&trans, &w, k.k->p.inode); if (ret) break; if (fsck_err_on(!w.have_inode, c, "dirent in nonexisting directory:\n%s", - (bch2_bkey_val_to_text(c, BTREE_ID_DIRENTS, - buf, sizeof(buf), k), buf)) || + (bch2_bkey_val_to_text(&PBUF(buf), c, + k), buf)) || fsck_err_on(!S_ISDIR(w.inode.bi_mode), c, "dirent in non directory inode type %u:\n%s", mode_to_type(w.inode.bi_mode), - (bch2_bkey_val_to_text(c, BTREE_ID_DIRENTS, - buf, sizeof(buf), k), buf))) { - ret = bch2_btree_delete_at(&iter, 0); + (bch2_bkey_val_to_text(&PBUF(buf), c, + k), buf))) { + ret = bch2_btree_delete_at(&trans, iter, 0); if (ret) goto err; continue; } if (w.first_this_inode && w.have_inode) - hash_check_set_inode(&h, c, 
&w.inode); + hash_check_set_inode(&trans, &h, &w.inode); - ret = hash_check_key(bch2_dirent_hash_desc, &h, c, &iter, k); + ret = check_dirent_hash(&trans, &h, iter, &k); if (ret > 0) { ret = 0; continue; } + if (ret) + goto fsck_err; if (ret) goto fsck_err; - if (k.k->type != BCH_DIRENT) + if (k.k->type != KEY_TYPE_dirent) continue; d = bkey_s_c_to_dirent(k); @@ -395,8 +606,13 @@ static int check_dirents(struct bch_fs *c) ". dirent") || fsck_err_on(name_len == 2 && !memcmp(d.v->d_name, "..", 2), c, - ".. dirent")) { - ret = remove_dirent(c, &iter, d); + ".. dirent") || + fsck_err_on(name_len == 2 && + !memcmp(d.v->d_name, "..", 2), c, + ".. dirent") || + fsck_err_on(memchr(d.v->d_name, '/', name_len), c, + "dirent name has invalid chars")) { + ret = remove_dirent(&trans, d); if (ret) goto err; continue; @@ -404,15 +620,15 @@ static int check_dirents(struct bch_fs *c) if (fsck_err_on(d_inum == d.k->p.inode, c, "dirent points to own directory:\n%s", - (bch2_bkey_val_to_text(c, BTREE_ID_DIRENTS, - buf, sizeof(buf), k), buf))) { - ret = remove_dirent(c, &iter, d); + (bch2_bkey_val_to_text(&PBUF(buf), c, + k), buf))) { + ret = remove_dirent(&trans, d); if (ret) goto err; continue; } - ret = bch2_inode_find_by_inum(c, d_inum, &target); + ret = bch2_inode_find_by_inum_trans(&trans, d_inum, &target); if (ret && ret != -ENOENT) break; @@ -421,9 +637,9 @@ static int check_dirents(struct bch_fs *c) if (fsck_err_on(!have_target, c, "dirent points to missing inode:\n%s", - (bch2_bkey_val_to_text(c, BTREE_ID_DIRENTS, - buf, sizeof(buf), k), buf))) { - ret = remove_dirent(c, &iter, d); + (bch2_bkey_val_to_text(&PBUF(buf), c, + k), buf))) { + ret = remove_dirent(&trans, d); if (ret) goto err; continue; @@ -434,8 +650,8 @@ static int check_dirents(struct bch_fs *c) mode_to_type(target.bi_mode), c, "incorrect d_type: should be %u:\n%s", mode_to_type(target.bi_mode), - (bch2_bkey_val_to_text(c, BTREE_ID_DIRENTS, - buf, sizeof(buf), k), buf))) { + (bch2_bkey_val_to_text(&PBUF(buf), c, + k), buf))) { struct bkey_i_dirent *n; n = kmalloc(bkey_bytes(d.k), GFP_KERNEL); @@ -447,20 +663,26 @@ static int check_dirents(struct bch_fs *c) bkey_reassemble(&n->k_i, d.s_c); n->v.d_type = mode_to_type(target.bi_mode); - ret = bch2_btree_insert_at(c, NULL, NULL, NULL, - BTREE_INSERT_NOFAIL, - BTREE_INSERT_ENTRY(&iter, &n->k_i)); + bch2_trans_update(&trans, + BTREE_INSERT_ENTRY(iter, &n->k_i)); + + ret = bch2_trans_commit(&trans, NULL, NULL, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_LAZY_RW); kfree(n); if (ret) goto err; } } + + hash_stop_chain(&trans, &h); err: fsck_err: - bch2_btree_iter_unlock(&h.chain); - bch2_btree_iter_unlock(&h.iter); - return bch2_btree_iter_unlock(&iter) ?: ret; + if (ret == -EINTR) + goto retry; + + return bch2_trans_exit(&trans) ?: ret; } /* @@ -471,39 +693,47 @@ static int check_xattrs(struct bch_fs *c) { struct inode_walker w = inode_walker_init(); struct hash_check h; - struct btree_iter iter; + struct btree_trans trans; + struct btree_iter *iter; struct bkey_s_c k; int ret = 0; - hash_check_init(bch2_xattr_hash_desc, &h, c); + bch_verbose(c, "checking xattrs"); - for_each_btree_key(&iter, c, BTREE_ID_XATTRS, - POS(BCACHEFS_ROOT_INO, 0), 0, k) { - ret = walk_inode(c, &w, k.k->p.inode); + hash_check_init(&h); + + bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); + + iter = bch2_trans_get_iter(&trans, BTREE_ID_XATTRS, + POS(BCACHEFS_ROOT_INO, 0), 0); +retry: + for_each_btree_key_continue(iter, 0, k) { + ret = walk_inode(&trans, &w, k.k->p.inode); if (ret) break; if (fsck_err_on(!w.have_inode, c, "xattr 
for missing inode %llu", k.k->p.inode)) { - ret = bch2_btree_delete_at(&iter, 0); + ret = bch2_btree_delete_at(&trans, iter, 0); if (ret) goto err; continue; } if (w.first_this_inode && w.have_inode) - hash_check_set_inode(&h, c, &w.inode); + hash_check_set_inode(&trans, &h, &w.inode); - ret = hash_check_key(bch2_xattr_hash_desc, &h, c, &iter, k); + ret = hash_check_key(&trans, bch2_xattr_hash_desc, + &h, iter, k); if (ret) goto fsck_err; } err: fsck_err: - bch2_btree_iter_unlock(&h.chain); - bch2_btree_iter_unlock(&h.iter); - return bch2_btree_iter_unlock(&iter) ?: ret; + if (ret == -EINTR) + goto retry; + return bch2_trans_exit(&trans) ?: ret; } /* Get root directory, create if it doesn't exist: */ @@ -512,6 +742,8 @@ static int check_root(struct bch_fs *c, struct bch_inode_unpacked *root_inode) struct bkey_inode_buf packed; int ret; + bch_verbose(c, "checking root directory"); + ret = bch2_inode_find_by_inum(c, BCACHEFS_ROOT_INO, root_inode); if (ret && ret != -ENOENT) return ret; @@ -534,7 +766,9 @@ create_root: bch2_inode_pack(&packed, root_inode); return bch2_btree_insert(c, BTREE_ID_INODES, &packed.inode.k_i, - NULL, NULL, NULL, BTREE_INSERT_NOFAIL); + NULL, NULL, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_LAZY_RW); } /* Get lost+found, create if it doesn't exist: */ @@ -549,6 +783,8 @@ static int check_lostfound(struct bch_fs *c, u64 inum; int ret; + bch_verbose(c, "checking lost+found"); + inum = bch2_dirent_lookup(c, BCACHEFS_ROOT_INO, &root_hash_info, &lostfound); if (!inum) { @@ -576,7 +812,9 @@ create_lostfound: bch2_inode_pack(&packed, root_inode); ret = bch2_btree_insert(c, BTREE_ID_INODES, &packed.inode.k_i, - NULL, NULL, NULL, BTREE_INSERT_NOFAIL); + NULL, NULL, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_LAZY_RW); if (ret) return ret; @@ -590,7 +828,8 @@ create_lostfound: ret = bch2_dirent_create(c, BCACHEFS_ROOT_INO, &root_hash_info, DT_DIR, &lostfound, lostfound_inode->bi_inum, NULL, - BTREE_INSERT_NOFAIL); + BTREE_INSERT_NOFAIL| + BTREE_INSERT_LAZY_RW); if (ret) return ret; @@ -668,13 +907,18 @@ static int check_directory_structure(struct bch_fs *c, struct inode_bitmap dirs_done = { NULL, 0 }; struct pathbuf path = { 0, 0, NULL }; struct pathbuf_entry *e; - struct btree_iter iter; + struct btree_trans trans; + struct btree_iter *iter; struct bkey_s_c k; struct bkey_s_c_dirent dirent; bool had_unreachable; u64 d_inum; int ret = 0; + bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); + + bch_verbose(c, "checking directory structure"); + /* DFS: */ restart_dfs: had_unreachable = false; @@ -686,9 +930,8 @@ restart_dfs: } ret = path_down(&path, BCACHEFS_ROOT_INO); - if (ret) { - return ret; - } + if (ret) + goto err; while (path.nr) { next: @@ -697,14 +940,14 @@ next: if (e->offset == U64_MAX) goto up; - for_each_btree_key(&iter, c, BTREE_ID_DIRENTS, - POS(e->inum, e->offset + 1), 0, k) { + for_each_btree_key(&trans, iter, BTREE_ID_DIRENTS, + POS(e->inum, e->offset + 1), 0, k, ret) { if (k.k->p.inode != e->inum) break; e->offset = k.k->p.offset; - if (k.k->type != BCH_DIRENT) + if (k.k->type != KEY_TYPE_dirent) continue; dirent = bkey_s_c_to_dirent(k); @@ -717,7 +960,7 @@ next: if (fsck_err_on(inode_bitmap_test(&dirs_done, d_inum), c, "directory %llu has multiple hardlinks", d_inum)) { - ret = remove_dirent(c, &iter, dirent); + ret = remove_dirent(&trans, dirent); if (ret) goto err; continue; @@ -734,10 +977,14 @@ next: goto err; } - bch2_btree_iter_unlock(&iter); + ret = bch2_trans_iter_free(&trans, iter); + if (ret) { + bch_err(c, "btree error %i in fsck", ret); + goto err; + } goto 
next; } - ret = bch2_btree_iter_unlock(&iter); + ret = bch2_trans_iter_free(&trans, iter) ?: ret; if (ret) { bch_err(c, "btree error %i in fsck", ret); goto err; @@ -746,15 +993,25 @@ up: path.nr--; } - for_each_btree_key(&iter, c, BTREE_ID_INODES, POS_MIN, 0, k) { - if (k.k->type != BCH_INODE_FS || - !S_ISDIR(le16_to_cpu(bkey_s_c_to_inode(k).v->bi_mode))) + iter = bch2_trans_get_iter(&trans, BTREE_ID_INODES, POS_MIN, 0); +retry: + for_each_btree_key_continue(iter, 0, k) { + if (k.k->type != KEY_TYPE_inode) + continue; + + if (!S_ISDIR(le16_to_cpu(bkey_s_c_to_inode(k).v->bi_mode))) + continue; + + ret = bch2_empty_dir_trans(&trans, k.k->p.inode); + if (ret == -EINTR) + goto retry; + if (!ret) continue; if (fsck_err_on(!inode_bitmap_test(&dirs_done, k.k->p.inode), c, "unreachable directory found (inum %llu)", k.k->p.inode)) { - bch2_btree_iter_unlock(&iter); + bch2_trans_unlock(&trans); ret = reattach_inode(c, lostfound_inode, k.k->p.inode); if (ret) { @@ -764,7 +1021,7 @@ up: had_unreachable = true; } } - ret = bch2_btree_iter_unlock(&iter); + ret = bch2_trans_iter_free(&trans, iter); if (ret) goto err; @@ -776,15 +1033,12 @@ up: memset(&path, 0, sizeof(path)); goto restart_dfs; } - -out: +err: +fsck_err: + ret = bch2_trans_exit(&trans) ?: ret; kfree(dirs_done.bits); kfree(path.entries); return ret; -err: -fsck_err: - ret = bch2_btree_iter_unlock(&iter) ?: ret; - goto out; } struct nlink { @@ -805,7 +1059,7 @@ static void inc_link(struct bch_fs *c, nlink_table *links, link = genradix_ptr_alloc(links, inum - range_start, GFP_KERNEL); if (!link) { - bch_verbose(c, "allocation failed during fs gc - will need another pass"); + bch_verbose(c, "allocation failed during fsck - will need another pass"); *range_end = inum; return; } @@ -820,17 +1074,20 @@ noinline_for_stack static int bch2_gc_walk_dirents(struct bch_fs *c, nlink_table *links, u64 range_start, u64 *range_end) { - struct btree_iter iter; + struct btree_trans trans; + struct btree_iter *iter; struct bkey_s_c k; struct bkey_s_c_dirent d; u64 d_inum; int ret; + bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); + inc_link(c, links, range_start, range_end, BCACHEFS_ROOT_INO, false); - for_each_btree_key(&iter, c, BTREE_ID_DIRENTS, POS_MIN, 0, k) { + for_each_btree_key(&trans, iter, BTREE_ID_DIRENTS, POS_MIN, 0, k, ret) { switch (k.k->type) { - case BCH_DIRENT: + case KEY_TYPE_dirent: d = bkey_s_c_to_dirent(k); d_inum = le64_to_cpu(d.v->d_inum); @@ -844,115 +1101,169 @@ static int bch2_gc_walk_dirents(struct bch_fs *c, nlink_table *links, break; } - bch2_btree_iter_cond_resched(&iter); + bch2_trans_cond_resched(&trans); } - ret = bch2_btree_iter_unlock(&iter); + ret = bch2_trans_exit(&trans) ?: ret; if (ret) - bch_err(c, "error in fs gc: btree error %i while walking dirents", ret); + bch_err(c, "error in fsck: btree error %i while walking dirents", ret); return ret; } -s64 bch2_count_inode_sectors(struct bch_fs *c, u64 inum) +static int check_inode_nlink(struct bch_fs *c, + struct bch_inode_unpacked *lostfound_inode, + struct bch_inode_unpacked *u, + struct nlink *link, + bool *do_update) { - struct btree_iter iter; - struct bkey_s_c k; - u64 sectors = 0; + u32 i_nlink = u->bi_flags & BCH_INODE_UNLINKED + ? 
0 + : u->bi_nlink + nlink_bias(u->bi_mode); + u32 real_i_nlink = + link->count * nlink_bias(u->bi_mode) + + link->dir_count; + int ret = 0; - for_each_btree_key(&iter, c, BTREE_ID_EXTENTS, POS(inum, 0), 0, k) { - if (k.k->p.inode != inum) - break; + /* + * These should have been caught/fixed by earlier passes, we don't + * repair them here: + */ + if (S_ISDIR(u->bi_mode) && link->count > 1) { + need_fsck_err(c, "directory %llu with multiple hardlinks: %u", + u->bi_inum, link->count); + return 0; + } - if (bkey_extent_is_allocation(k.k)) - sectors += k.k->size; + if (S_ISDIR(u->bi_mode) && !link->count) { + need_fsck_err(c, "unreachable directory found (inum %llu)", + u->bi_inum); + return 0; + } + + if (!S_ISDIR(u->bi_mode) && link->dir_count) { + need_fsck_err(c, "non directory with subdirectories", + u->bi_inum); + return 0; + } + + if (!link->count && + !(u->bi_flags & BCH_INODE_UNLINKED) && + (c->sb.features & (1 << BCH_FEATURE_ATOMIC_NLINK))) { + if (fsck_err(c, "unreachable inode %llu not marked as unlinked (type %u)", + u->bi_inum, mode_to_type(u->bi_mode)) == + FSCK_ERR_IGNORE) + return 0; + + ret = reattach_inode(c, lostfound_inode, u->bi_inum); + if (ret) + return ret; + + link->count = 1; + real_i_nlink = nlink_bias(u->bi_mode) + link->dir_count; + goto set_i_nlink; + } + + if (i_nlink < link->count) { + if (fsck_err(c, "inode %llu i_link too small (%u < %u, type %i)", + u->bi_inum, i_nlink, link->count, + mode_to_type(u->bi_mode)) == FSCK_ERR_IGNORE) + return 0; + goto set_i_nlink; + } + + if (i_nlink != real_i_nlink && + c->sb.clean) { + if (fsck_err(c, "filesystem marked clean, " + "but inode %llu has wrong i_nlink " + "(type %u i_nlink %u, should be %u)", + u->bi_inum, mode_to_type(u->bi_mode), + i_nlink, real_i_nlink) == FSCK_ERR_IGNORE) + return 0; + goto set_i_nlink; } - return bch2_btree_iter_unlock(&iter) ?: sectors; + if (i_nlink != real_i_nlink && + (c->sb.features & (1 << BCH_FEATURE_ATOMIC_NLINK))) { + if (fsck_err(c, "inode %llu has wrong i_nlink " + "(type %u i_nlink %u, should be %u)", + u->bi_inum, mode_to_type(u->bi_mode), + i_nlink, real_i_nlink) == FSCK_ERR_IGNORE) + return 0; + goto set_i_nlink; + } + + if (real_i_nlink && i_nlink != real_i_nlink) + bch_verbose(c, "setting inode %llu nlink from %u to %u", + u->bi_inum, i_nlink, real_i_nlink); +set_i_nlink: + if (i_nlink != real_i_nlink) { + if (real_i_nlink) { + u->bi_nlink = real_i_nlink - nlink_bias(u->bi_mode); + u->bi_flags &= ~BCH_INODE_UNLINKED; + } else { + u->bi_nlink = 0; + u->bi_flags |= BCH_INODE_UNLINKED; + } + + *do_update = true; + } +fsck_err: + return ret; } -static int bch2_gc_do_inode(struct bch_fs *c, - struct bch_inode_unpacked *lostfound_inode, - struct btree_iter *iter, - struct bkey_s_c_inode inode, struct nlink link) +static int check_inode(struct btree_trans *trans, + struct bch_inode_unpacked *lostfound_inode, + struct btree_iter *iter, + struct bkey_s_c_inode inode, + struct nlink *link) { + struct bch_fs *c = trans->c; struct bch_inode_unpacked u; - int ret = 0; - u32 i_nlink, real_i_nlink; bool do_update = false; + int ret = 0; ret = bch2_inode_unpack(inode, &u); + + bch2_trans_unlock(trans); + if (bch2_fs_inconsistent_on(ret, c, "error unpacking inode %llu in fsck", inode.k->p.inode)) return ret; - i_nlink = u.bi_nlink + nlink_bias(u.bi_mode); - - fsck_err_on(i_nlink < link.count, c, - "inode %llu i_link too small (%u < %u, type %i)", - inode.k->p.inode, i_nlink, - link.count, mode_to_type(u.bi_mode)); - - /* These should have been caught/fixed by earlier passes: */ - if 
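/*
 * [sketch] check_inode_nlink() above recomputes the expected link
 * count from the dirents pass: each hardlink contributes nlink_bias
 * (2 for directories - the name plus "." - and 1 otherwise), and each
 * child subdirectory's ".." adds one more. Worked example:
 */
#include <stdbool.h>
#include <stdio.h>

static unsigned nlink_bias(bool is_dir)
{
	return is_dir ? 2 : 1;
}

static unsigned real_nlink(bool is_dir, unsigned count, unsigned dir_count)
{
	return count * nlink_bias(is_dir) + dir_count;
}

int main(void)
{
	/* a directory with one name and three subdirectories: */
	printf("expected nlink %u\n", real_nlink(true, 1, 3));	/* 5 */
	return 0;
}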
(S_ISDIR(u.bi_mode)) { - need_fsck_err_on(link.count > 1, c, - "directory %llu with multiple hardlinks: %u", - inode.k->p.inode, link.count); - - real_i_nlink = link.count * 2 + link.dir_count; - } else { - need_fsck_err_on(link.dir_count, c, - "found dirents for non directory %llu", - inode.k->p.inode); - - real_i_nlink = link.count + link.dir_count; + if (link) { + ret = check_inode_nlink(c, lostfound_inode, &u, link, + &do_update); + if (ret) + return ret; } - if (!link.count) { - fsck_err_on(c->sb.clean, c, - "filesystem marked clean, " - "but found orphaned inode %llu", - inode.k->p.inode); - - if (fsck_err_on(S_ISDIR(u.bi_mode) && - bch2_empty_dir(c, inode.k->p.inode), c, - "non empty directory with link count 0, " - "inode nlink %u, dir links found %u", - i_nlink, link.dir_count)) { - ret = reattach_inode(c, lostfound_inode, - inode.k->p.inode); - if (ret) - return ret; - } - - bch_verbose(c, "deleting inode %llu", inode.k->p.inode); + if (u.bi_flags & BCH_INODE_UNLINKED && + (!c->sb.clean || + fsck_err(c, "filesystem marked clean, but inode %llu unlinked", + u.bi_inum))) { + bch_verbose(c, "deleting inode %llu", u.bi_inum); - ret = bch2_inode_rm(c, inode.k->p.inode); + ret = bch2_inode_rm(c, u.bi_inum); if (ret) - bch_err(c, "error in fs gc: error %i " - "while deleting inode", ret); + bch_err(c, "error in fsck: error %i while deleting inode", ret); return ret; } - if (u.bi_flags & BCH_INODE_I_SIZE_DIRTY) { - fsck_err_on(c->sb.clean, c, - "filesystem marked clean, " - "but inode %llu has i_size dirty", - inode.k->p.inode); - - bch_verbose(c, "truncating inode %llu", inode.k->p.inode); + if (u.bi_flags & BCH_INODE_I_SIZE_DIRTY && + (!c->sb.clean || + fsck_err(c, "filesystem marked clean, but inode %llu has i_size dirty", + u.bi_inum))) { + bch_verbose(c, "truncating inode %llu", u.bi_inum); /* * XXX: need to truncate partial blocks too here - or ideally * just switch units to bytes and that issue goes away */ - ret = bch2_inode_truncate(c, inode.k->p.inode, - round_up(u.bi_size, PAGE_SIZE) >> 9, - NULL, NULL); + ret = bch2_inode_truncate(c, u.bi_inum, u.bi_size); if (ret) { - bch_err(c, "error in fs gc: error %i " - "truncating inode", ret); + bch_err(c, "error in fsck: error %i truncating inode", ret); return ret; } @@ -966,21 +1277,18 @@ static int bch2_gc_do_inode(struct bch_fs *c, do_update = true; } - if (u.bi_flags & BCH_INODE_I_SECTORS_DIRTY) { + if (u.bi_flags & BCH_INODE_I_SECTORS_DIRTY && + (!c->sb.clean || + fsck_err(c, "filesystem marked clean, but inode %llu has i_sectors dirty", + u.bi_inum))) { s64 sectors; - fsck_err_on(c->sb.clean, c, - "filesystem marked clean, " - "but inode %llu has i_sectors dirty", - inode.k->p.inode); - bch_verbose(c, "recounting sectors for inode %llu", - inode.k->p.inode); + u.bi_inum); - sectors = bch2_count_inode_sectors(c, inode.k->p.inode); + sectors = bch2_count_inode_sectors(trans, u.bi_inum); if (sectors < 0) { - bch_err(c, "error in fs gc: error %i " - "recounting inode sectors", + bch_err(c, "error in fsck: error %i recounting inode sectors", (int) sectors); return sectors; } @@ -990,30 +1298,17 @@ static int bch2_gc_do_inode(struct bch_fs *c, do_update = true; } - if (i_nlink != real_i_nlink) { - fsck_err_on(c->sb.clean, c, - "filesystem marked clean, " - "but inode %llu has wrong i_nlink " - "(type %u i_nlink %u, should be %u)", - inode.k->p.inode, mode_to_type(u.bi_mode), - i_nlink, real_i_nlink); - - bch_verbose(c, "setting inode %llu nlinks from %u to %u", - inode.k->p.inode, i_nlink, real_i_nlink); - u.bi_nlink = real_i_nlink 
- nlink_bias(u.bi_mode); - do_update = true; - } - if (do_update) { struct bkey_inode_buf p; bch2_inode_pack(&p, &u); + bch2_trans_update(trans, BTREE_INSERT_ENTRY(iter, &p.inode.k_i)); - ret = bch2_btree_insert_at(c, NULL, NULL, NULL, - BTREE_INSERT_NOFAIL, - BTREE_INSERT_ENTRY(iter, &p.inode.k_i)); + ret = bch2_trans_commit(trans, NULL, NULL, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_LAZY_RW); if (ret && ret != -EINTR) - bch_err(c, "error in fs gc: error %i " + bch_err(c, "error in fsck: error %i " "updating inode", ret); } fsck_err: @@ -1022,29 +1317,33 @@ fsck_err: noinline_for_stack static int bch2_gc_walk_inodes(struct bch_fs *c, - struct bch_inode_unpacked *lostfound_inode, - nlink_table *links, - u64 range_start, u64 range_end) + struct bch_inode_unpacked *lostfound_inode, + nlink_table *links, + u64 range_start, u64 range_end) { - struct btree_iter iter; + struct btree_trans trans; + struct btree_iter *iter; struct bkey_s_c k; struct nlink *link, zero_links = { 0, 0 }; struct genradix_iter nlinks_iter; int ret = 0, ret2 = 0; u64 nlinks_pos; - bch2_btree_iter_init(&iter, c, BTREE_ID_INODES, POS(range_start, 0), 0); + bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); + + iter = bch2_trans_get_iter(&trans, BTREE_ID_INODES, + POS(range_start, 0), 0); nlinks_iter = genradix_iter_init(links, 0); - while ((k = bch2_btree_iter_peek(&iter)).k && - !btree_iter_err(k)) { + while ((k = bch2_btree_iter_peek(iter)).k && + !(ret2 = bkey_err(k))) { peek_nlinks: link = genradix_iter_peek(&nlinks_iter, links); - if (!link && (!k.k || iter.pos.inode >= range_end)) + if (!link && (!k.k || iter->pos.inode >= range_end)) break; nlinks_pos = range_start + nlinks_iter.pos; - if (iter.pos.inode > nlinks_pos) { + if (iter->pos.inode > nlinks_pos) { /* Should have been caught by dirents pass: */ need_fsck_err_on(link && link->count, c, "missing inode %llu (nlink %u)", @@ -1053,25 +1352,15 @@ peek_nlinks: link = genradix_iter_peek(&nlinks_iter, links); goto peek_nlinks; } - if (iter.pos.inode < nlinks_pos || !link) + if (iter->pos.inode < nlinks_pos || !link) link = &zero_links; - if (k.k && k.k->type == BCH_INODE_FS) { - /* - * Avoid potential deadlocks with iter for - * truncate/rm/etc.: - */ - bch2_btree_iter_unlock(&iter); - - ret = bch2_gc_do_inode(c, lostfound_inode, &iter, - bkey_s_c_to_inode(k), *link); - if (ret == -EINTR) - continue; + if (k.k && k.k->type == KEY_TYPE_inode) { + ret = check_inode(&trans, lostfound_inode, iter, + bkey_s_c_to_inode(k), link); + BUG_ON(ret == -EINTR); if (ret) break; - - if (link->count) - atomic_long_inc(&c->nr_inodes); } else { /* Should have been caught by dirents pass: */ need_fsck_err_on(link->count, c, @@ -1079,16 +1368,17 @@ peek_nlinks: link = genradix_iter_peek(&nlinks_iter, links); nlinks_pos, link->count); } - if (nlinks_pos == iter.pos.inode) + if (nlinks_pos == iter->pos.inode) genradix_iter_advance(&nlinks_iter, links); - bch2_btree_iter_next(&iter); - bch2_btree_iter_cond_resched(&iter); + bch2_btree_iter_next(iter); + bch2_trans_cond_resched(&trans); } fsck_err: - ret2 = bch2_btree_iter_unlock(&iter); + bch2_trans_exit(&trans); + if (ret2) - bch_err(c, "error in fs gc: btree error %i while walking inodes", ret2); + bch_err(c, "error in fsck: btree error %i while walking inodes", ret2); return ret ?: ret2; } @@ -1101,6 +1391,8 @@ static int check_inode_nlinks(struct bch_fs *c, u64 this_iter_range_start, next_iter_range_start = 0; int ret = 0; + bch_verbose(c, "checking inode nlinks"); + genradix_init(&links); do { @@ -1131,64 +1423,55 @@ static int 
check_inode_nlinks(struct bch_fs *c, * Checks for inconsistencies that shouldn't happen, unless we have a bug. * Doesn't fix them yet, mainly because they haven't yet been observed: */ -int bch2_fsck(struct bch_fs *c, bool full_fsck) +int bch2_fsck_full(struct bch_fs *c) { struct bch_inode_unpacked root_inode, lostfound_inode; - int ret; - - if (full_fsck) { - bch_verbose(c, "checking extents"); - ret = check_extents(c); - if (ret) - return ret; - bch_verbose(c, "checking dirents"); - ret = check_dirents(c); - if (ret) - return ret; + return check_extents(c) ?: + check_dirents(c) ?: + check_xattrs(c) ?: + check_root(c, &root_inode) ?: + check_lostfound(c, &root_inode, &lostfound_inode) ?: + check_directory_structure(c, &lostfound_inode) ?: + check_inode_nlinks(c, &lostfound_inode); +} - bch_verbose(c, "checking xattrs"); - ret = check_xattrs(c); - if (ret) - return ret; +int bch2_fsck_inode_nlink(struct bch_fs *c) +{ + struct bch_inode_unpacked root_inode, lostfound_inode; - bch_verbose(c, "checking root directory"); - ret = check_root(c, &root_inode); - if (ret) - return ret; + return check_root(c, &root_inode) ?: + check_lostfound(c, &root_inode, &lostfound_inode) ?: + check_inode_nlinks(c, &lostfound_inode); +} - bch_verbose(c, "checking lost+found"); - ret = check_lostfound(c, &root_inode, &lostfound_inode); - if (ret) - return ret; +int bch2_fsck_walk_inodes_only(struct bch_fs *c) +{ + struct btree_trans trans; + struct btree_iter *iter; + struct bkey_s_c k; + struct bkey_s_c_inode inode; + int ret; - bch_verbose(c, "checking directory structure"); - ret = check_directory_structure(c, &lostfound_inode); - if (ret) - return ret; + bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); - bch_verbose(c, "checking inode nlinks"); - ret = check_inode_nlinks(c, &lostfound_inode); - if (ret) - return ret; - } else { - bch_verbose(c, "checking root directory"); - ret = check_root(c, &root_inode); - if (ret) - return ret; + for_each_btree_key(&trans, iter, BTREE_ID_INODES, POS_MIN, 0, k, ret) { + if (k.k->type != KEY_TYPE_inode) + continue; - bch_verbose(c, "checking lost+found"); - ret = check_lostfound(c, &root_inode, &lostfound_inode); - if (ret) - return ret; + inode = bkey_s_c_to_inode(k); - bch_verbose(c, "checking inode nlinks"); - ret = check_inode_nlinks(c, &lostfound_inode); - if (ret) - return ret; + if (inode.v->bi_flags & + (BCH_INODE_I_SIZE_DIRTY| + BCH_INODE_I_SECTORS_DIRTY| + BCH_INODE_UNLINKED)) { + ret = check_inode(&trans, NULL, iter, inode, NULL); + BUG_ON(ret == -EINTR); + if (ret) + break; + } } + BUG_ON(ret == -EINTR); - bch2_flush_fsck_errs(c); - - return 0; + return bch2_trans_exit(&trans) ?: ret; } diff --git a/fs/bcachefs/fsck.h b/fs/bcachefs/fsck.h index f9af1305dc2a..9e4af02bde1e 100644 --- a/fs/bcachefs/fsck.h +++ b/fs/bcachefs/fsck.h @@ -1,7 +1,9 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _BCACHEFS_FSCK_H #define _BCACHEFS_FSCK_H -s64 bch2_count_inode_sectors(struct bch_fs *, u64); -int bch2_fsck(struct bch_fs *, bool); +int bch2_fsck_full(struct bch_fs *); +int bch2_fsck_inode_nlink(struct bch_fs *); +int bch2_fsck_walk_inodes_only(struct bch_fs *); #endif /* _BCACHEFS_FSCK_H */ diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index 81d040d46e41..05b7f6594113 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" #include "bkey_methods.h" @@ -12,7 +13,12 @@ #include <asm/unaligned.h> -#define FIELD_BYTES() \ +const char * const bch2_inode_opts[] = { +#define x(name, ...) 
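bch2_fsck_full() chains its passes with GCC's a ?: b extension: the expression evaluates to a if a is nonzero, otherwise to b, so the chain returns the first failing pass's error code and never runs the later passes. A standalone illustration (GCC/clang only; the step functions are placeholders):

    #include <stdio.h>

    static int step1(void) { return 0; }    /* succeeds */
    static int step2(void) { return -5; }   /* fails */
    static int step3(void) { return 0; }    /* never evaluated */

    static int run_all(void)
    {
        /* GNU extension: a ?: b is a ? a : b, evaluating a once */
        return step1() ?: step2() ?: step3();
    }

    int main(void)
    {
        printf("ret = %d\n", run_all());    /* prints -5 */
        return 0;
    }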
#name, + BCH_INODE_OPTS() +#undef x + NULL, +}; static const u8 byte_table[8] = { 1, 2, 3, 4, 6, 8, 10, 13 }; static const u8 bits_table[8] = { @@ -97,7 +103,7 @@ void bch2_inode_pack(struct bkey_inode_buf *packed, packed->inode.v.bi_flags = cpu_to_le32(inode->bi_flags); packed->inode.v.bi_mode = cpu_to_le16(inode->bi_mode); -#define BCH_INODE_FIELD(_name, _bits) \ +#define x(_name, _bits) \ out += inode_encode_field(out, end, 0, inode->_name); \ nr_fields++; \ \ @@ -107,7 +113,7 @@ void bch2_inode_pack(struct bkey_inode_buf *packed, } BCH_INODE_FIELDS() -#undef BCH_INODE_FIELD +#undef x out = last_nonzero_field; nr_fields = last_nonzero_fieldnr; @@ -129,9 +135,9 @@ void bch2_inode_pack(struct bkey_inode_buf *packed, BUG_ON(unpacked.bi_hash_seed != inode->bi_hash_seed); BUG_ON(unpacked.bi_mode != inode->bi_mode); -#define BCH_INODE_FIELD(_name, _bits) BUG_ON(unpacked._name != inode->_name); +#define x(_name, _bits) BUG_ON(unpacked._name != inode->_name); BCH_INODE_FIELDS() -#undef BCH_INODE_FIELD +#undef x } } @@ -149,7 +155,7 @@ int bch2_inode_unpack(struct bkey_s_c_inode inode, unpacked->bi_flags = le32_to_cpu(inode.v->bi_flags); unpacked->bi_mode = le16_to_cpu(inode.v->bi_mode); -#define BCH_INODE_FIELD(_name, _bits) \ +#define x(_name, _bits) \ if (fieldnr++ == INODE_NR_FIELDS(inode.v)) { \ memset(&unpacked->_name, 0, \ sizeof(*unpacked) - \ @@ -168,7 +174,7 @@ int bch2_inode_unpack(struct bkey_s_c_inode inode, in += ret; BCH_INODE_FIELDS() -#undef BCH_INODE_FIELD +#undef x /* XXX: signal if there were more fields than expected? */ @@ -177,88 +183,86 @@ int bch2_inode_unpack(struct bkey_s_c_inode inode, const char *bch2_inode_invalid(const struct bch_fs *c, struct bkey_s_c k) { - if (k.k->p.offset) - return "nonzero offset"; - - switch (k.k->type) { - case BCH_INODE_FS: { struct bkey_s_c_inode inode = bkey_s_c_to_inode(k); struct bch_inode_unpacked unpacked; - if (bkey_val_bytes(k.k) < sizeof(struct bch_inode)) - return "incorrect value size"; + if (k.k->p.offset) + return "nonzero offset"; - if (k.k->p.inode < BLOCKDEV_INODE_MAX) - return "fs inode in blockdev range"; + if (bkey_val_bytes(k.k) < sizeof(struct bch_inode)) + return "incorrect value size"; - if (INODE_STR_HASH(inode.v) >= BCH_STR_HASH_NR) - return "invalid str hash type"; + if (k.k->p.inode < BLOCKDEV_INODE_MAX) + return "fs inode in blockdev range"; - if (bch2_inode_unpack(inode, &unpacked)) - return "invalid variable length fields"; + if (INODE_STR_HASH(inode.v) >= BCH_STR_HASH_NR) + return "invalid str hash type"; - if (unpacked.bi_data_checksum >= BCH_CSUM_OPT_NR + 1) - return "invalid data checksum type"; + if (bch2_inode_unpack(inode, &unpacked)) + return "invalid variable length fields"; - if (unpacked.bi_compression >= BCH_COMPRESSION_OPT_NR + 1) - return "invalid data checksum type"; + if (unpacked.bi_data_checksum >= BCH_CSUM_OPT_NR + 1) + return "invalid data checksum type"; - return NULL; - } - case BCH_INODE_BLOCKDEV: - if (bkey_val_bytes(k.k) != sizeof(struct bch_inode_blockdev)) - return "incorrect value size"; + if (unpacked.bi_compression >= BCH_COMPRESSION_OPT_NR + 1) + return "invalid data checksum type"; - if (k.k->p.inode >= BLOCKDEV_INODE_MAX) - return "blockdev inode in fs range"; + if ((unpacked.bi_flags & BCH_INODE_UNLINKED) && + unpacked.bi_nlink != 0) + return "flagged as unlinked but bi_nlink != 0"; - return NULL; - case BCH_INODE_GENERATION: - if (bkey_val_bytes(k.k) != sizeof(struct bch_inode_generation)) - return "incorrect value size"; + return NULL; +} - return NULL; - default: - return 
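The #define x(...) churn in this hunk is the x-macro pattern the inode code is being converted to: one field list expanded several times with x() redefined per use, so the struct layout, the name strings, and the pack/unpack loops cannot drift apart. A self-contained miniature:

    #include <stdio.h>
    #include <stdint.h>

    #define FIELDS()    \
        x(size,  64)    \
        x(uid,   32)    \
        x(nlink, 32)

    /* expansion 1: struct members */
    struct inode {
    #define x(_name, _bits) uint##_bits##_t _name;
        FIELDS()
    #undef x
    };

    /* expansion 2: field names, for printing */
    static const char * const field_names[] = {
    #define x(_name, _bits) #_name,
        FIELDS()
    #undef x
        NULL,
    };

    int main(void)
    {
        struct inode ino = { .size = 4096, .uid = 1000, .nlink = 1 };
        unsigned i = 0;

        /* expansion 3: walk every field generically */
    #define x(_name, _bits) printf("%s: %llu\n", field_names[i++], (unsigned long long) ino._name);
        FIELDS()
    #undef x
        return 0;
    }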
"invalid type"; +void bch2_inode_to_text(struct printbuf *out, struct bch_fs *c, + struct bkey_s_c k) +{ + struct bkey_s_c_inode inode = bkey_s_c_to_inode(k); + struct bch_inode_unpacked unpacked; + + if (bch2_inode_unpack(inode, &unpacked)) { + pr_buf(out, "(unpack error)"); + return; } + +#define x(_name, _bits) \ + pr_buf(out, #_name ": %llu ", (u64) unpacked._name); + BCH_INODE_FIELDS() +#undef x } -void bch2_inode_to_text(struct bch_fs *c, char *buf, - size_t size, struct bkey_s_c k) +const char *bch2_inode_generation_invalid(const struct bch_fs *c, + struct bkey_s_c k) { - char *out = buf, *end = out + size; - struct bkey_s_c_inode inode; - struct bch_inode_unpacked unpacked; + if (k.k->p.offset) + return "nonzero offset"; - switch (k.k->type) { - case BCH_INODE_FS: - inode = bkey_s_c_to_inode(k); - if (bch2_inode_unpack(inode, &unpacked)) { - out += scnprintf(out, end - out, "(unpack error)"); - break; - } + if (bkey_val_bytes(k.k) != sizeof(struct bch_inode_generation)) + return "incorrect value size"; -#define BCH_INODE_FIELD(_name, _bits) \ - out += scnprintf(out, end - out, #_name ": %llu ", (u64) unpacked._name); - BCH_INODE_FIELDS() -#undef BCH_INODE_FIELD - break; - } + return NULL; +} + +void bch2_inode_generation_to_text(struct printbuf *out, struct bch_fs *c, + struct bkey_s_c k) +{ + struct bkey_s_c_inode_generation gen = bkey_s_c_to_inode_generation(k); + + pr_buf(out, "generation: %u", le32_to_cpu(gen.v->bi_generation)); } void bch2_inode_init(struct bch_fs *c, struct bch_inode_unpacked *inode_u, uid_t uid, gid_t gid, umode_t mode, dev_t rdev, struct bch_inode_unpacked *parent) { - s64 now = timespec_to_bch2_time(c, - timespec64_trunc(current_kernel_time64(), - c->sb.time_precision)); + s64 now = bch2_current_time(c); memset(inode_u, 0, sizeof(*inode_u)); /* ick */ inode_u->bi_flags |= c->opts.str_hash << INODE_STR_HASH_OFFSET; - get_random_bytes(&inode_u->bi_hash_seed, sizeof(inode_u->bi_hash_seed)); + get_random_bytes(&inode_u->bi_hash_seed, + sizeof(inode_u->bi_hash_seed)); inode_u->bi_mode = mode; inode_u->bi_uid = uid; @@ -270,18 +274,32 @@ void bch2_inode_init(struct bch_fs *c, struct bch_inode_unpacked *inode_u, inode_u->bi_otime = now; if (parent) { -#define BCH_INODE_FIELD(_name) inode_u->_name = parent->_name; - BCH_INODE_FIELDS_INHERIT() -#undef BCH_INODE_FIELD +#define x(_name, ...) 
inode_u->bi_##_name = parent->bi_##_name; + BCH_INODE_OPTS() +#undef x } } -int bch2_inode_create(struct bch_fs *c, struct bch_inode_unpacked *inode_u, - u64 min, u64 max, u64 *hint) +static inline u32 bkey_generation(struct bkey_s_c k) { - struct bkey_inode_buf inode_p; - struct btree_iter iter; - bool searched_from_start = false; + switch (k.k->type) { + case KEY_TYPE_inode: + BUG(); + case KEY_TYPE_inode_generation: + return le32_to_cpu(bkey_s_c_to_inode_generation(k).v->bi_generation); + default: + return 0; + } +} + +int __bch2_inode_create(struct btree_trans *trans, + struct bch_inode_unpacked *inode_u, + u64 min, u64 max, u64 *hint) +{ + struct bch_fs *c = trans->c; + struct bkey_inode_buf *inode_p; + struct btree_iter *iter; + u64 start; int ret; if (!max) @@ -290,109 +308,75 @@ int bch2_inode_create(struct bch_fs *c, struct bch_inode_unpacked *inode_u, if (c->opts.inodes_32bit) max = min_t(u64, max, U32_MAX); - if (*hint >= max || *hint < min) - *hint = min; + start = READ_ONCE(*hint); - if (*hint == min) - searched_from_start = true; -again: - bch2_btree_iter_init(&iter, c, BTREE_ID_INODES, POS(*hint, 0), - BTREE_ITER_SLOTS|BTREE_ITER_INTENT); + if (start >= max || start < min) + start = min; + + inode_p = bch2_trans_kmalloc(trans, sizeof(*inode_p)); + if (IS_ERR(inode_p)) + return PTR_ERR(inode_p); + iter = bch2_trans_get_iter(trans, + BTREE_ID_INODES, POS(start, 0), + BTREE_ITER_SLOTS|BTREE_ITER_INTENT); + if (IS_ERR(iter)) + return PTR_ERR(iter); +again: while (1) { - struct bkey_s_c k = bch2_btree_iter_peek_slot(&iter); - u32 bi_generation = 0; + struct bkey_s_c k = bch2_btree_iter_peek_slot(iter); - ret = btree_iter_err(k); - if (ret) { - bch2_btree_iter_unlock(&iter); + ret = bkey_err(k); + if (ret) return ret; - } switch (k.k->type) { - case BCH_INODE_BLOCKDEV: - case BCH_INODE_FS: + case KEY_TYPE_inode: /* slot used */ - if (iter.pos.inode == max) + if (iter->pos.inode >= max) goto out; - bch2_btree_iter_next_slot(&iter); + bch2_btree_iter_next_slot(iter); break; - case BCH_INODE_GENERATION: { - struct bkey_s_c_inode_generation g = - bkey_s_c_to_inode_generation(k); - bi_generation = le32_to_cpu(g.v->bi_generation); - /* fallthrough: */ - } default: - inode_u->bi_generation = bi_generation; - - bch2_inode_pack(&inode_p, inode_u); - inode_p.inode.k.p = k.k->p; - - ret = bch2_btree_insert_at(c, NULL, NULL, NULL, - BTREE_INSERT_ATOMIC, - BTREE_INSERT_ENTRY(&iter, - &inode_p.inode.k_i)); - - if (ret != -EINTR) { - bch2_btree_iter_unlock(&iter); - - if (!ret) { - inode_u->bi_inum = - inode_p.inode.k.p.inode; - *hint = inode_p.inode.k.p.inode + 1; - } - - return ret; - } - - if (ret == -EINTR) - continue; - + *hint = k.k->p.inode; + inode_u->bi_inum = k.k->p.inode; + inode_u->bi_generation = bkey_generation(k); + + bch2_inode_pack(inode_p, inode_u); + bch2_trans_update(trans, + BTREE_INSERT_ENTRY(iter, &inode_p->inode.k_i)); + return 0; } } out: - bch2_btree_iter_unlock(&iter); - - if (!searched_from_start) { + if (start != min) { /* Retry from start */ - *hint = min; - searched_from_start = true; + start = min; + bch2_btree_iter_set_pos(iter, POS(start, 0)); goto again; } return -ENOSPC; } -int bch2_inode_truncate(struct bch_fs *c, u64 inode_nr, u64 new_size, - struct extent_insert_hook *hook, u64 *journal_seq) +int bch2_inode_create(struct bch_fs *c, struct bch_inode_unpacked *inode_u, + u64 min, u64 max, u64 *hint) { - return bch2_btree_delete_range(c, BTREE_ID_EXTENTS, - POS(inode_nr, new_size), - POS(inode_nr + 1, 0), - ZERO_VERSION, NULL, hook, - journal_seq); + return 
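__bch2_inode_create()'s allocation strategy: start scanning at a cached hint, claim the first unused slot, and if the scan runs into max, retry once from min before returning -ENOSPC. The same structure over a plain bitmap (the sizes are arbitrary):

    #include <stdbool.h>
    #include <stdio.h>

    #define MIN_NR  1
    #define MAX_NR  16

    static bool used[MAX_NR];
    static unsigned hint = MIN_NR;

    static int alloc_nr(unsigned *out)
    {
        unsigned start = (hint >= MIN_NR && hint < MAX_NR) ? hint : MIN_NR;
        unsigned nr = start;
    again:
        for (; nr < MAX_NR; nr++)
            if (!used[nr]) {
                used[nr] = true;
                hint = nr + 1;      /* next search starts here */
                *out = nr;
                return 0;
            }

        if (start != MIN_NR) {      /* retry once from the bottom */
            start = nr = MIN_NR;
            goto again;
        }
        return -1;                  /* -ENOSPC in the real code */
    }

    int main(void)
    {
        unsigned nr;
        while (!alloc_nr(&nr))
            printf("allocated %u\n", nr);
        return 0;
    }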
bch2_trans_do(c, NULL, BTREE_INSERT_ATOMIC, + __bch2_inode_create(&trans, inode_u, min, max, hint)); } int bch2_inode_rm(struct bch_fs *c, u64 inode_nr) { - struct btree_iter iter; + struct btree_trans trans; + struct btree_iter *iter; struct bkey_i_inode_generation delete; + struct bpos start = POS(inode_nr, 0); + struct bpos end = POS(inode_nr + 1, 0); int ret; - ret = bch2_inode_truncate(c, inode_nr, 0, NULL, NULL); - if (ret < 0) - return ret; - - ret = bch2_btree_delete_range(c, BTREE_ID_XATTRS, - POS(inode_nr, 0), - POS(inode_nr + 1, 0), - ZERO_VERSION, NULL, NULL, NULL); - if (ret < 0) - return ret; - /* * If this was a directory, there shouldn't be any real dirents left - * but there could be whiteouts (from hash collisions) that we should @@ -401,38 +385,40 @@ int bch2_inode_rm(struct bch_fs *c, u64 inode_nr) * XXX: the dirent could ideally would delete whiteouts when they're no * longer needed */ - ret = bch2_btree_delete_range(c, BTREE_ID_DIRENTS, - POS(inode_nr, 0), - POS(inode_nr + 1, 0), - ZERO_VERSION, NULL, NULL, NULL); - if (ret < 0) + ret = bch2_btree_delete_range(c, BTREE_ID_EXTENTS, + start, end, NULL) ?: + bch2_btree_delete_range(c, BTREE_ID_XATTRS, + start, end, NULL) ?: + bch2_btree_delete_range(c, BTREE_ID_DIRENTS, + start, end, NULL); + if (ret) return ret; - bch2_btree_iter_init(&iter, c, BTREE_ID_INODES, POS(inode_nr, 0), - BTREE_ITER_SLOTS|BTREE_ITER_INTENT); + bch2_trans_init(&trans, c, 0, 0); + + iter = bch2_trans_get_iter(&trans, BTREE_ID_INODES, POS(inode_nr, 0), + BTREE_ITER_SLOTS|BTREE_ITER_INTENT); do { - struct bkey_s_c k = bch2_btree_iter_peek_slot(&iter); + struct bkey_s_c k = bch2_btree_iter_peek_slot(iter); u32 bi_generation = 0; - ret = btree_iter_err(k); - if (ret) { - bch2_btree_iter_unlock(&iter); - return ret; - } + ret = bkey_err(k); + if (ret) + break; - bch2_fs_inconsistent_on(k.k->type != BCH_INODE_FS, c, + bch2_fs_inconsistent_on(k.k->type != KEY_TYPE_inode, c, "inode %llu not found when deleting", inode_nr); switch (k.k->type) { - case BCH_INODE_FS: { + case KEY_TYPE_inode: { struct bch_inode_unpacked inode_u; if (!bch2_inode_unpack(bkey_s_c_to_inode(k), &inode_u)) bi_generation = inode_u.bi_generation + 1; break; } - case BCH_INODE_GENERATION: { + case KEY_TYPE_inode_generation: { struct bkey_s_c_inode_generation g = bkey_s_c_to_inode_generation(k); bi_generation = le32_to_cpu(g.v->bi_generation); @@ -449,40 +435,44 @@ int bch2_inode_rm(struct bch_fs *c, u64 inode_nr) delete.v.bi_generation = cpu_to_le32(bi_generation); } - ret = bch2_btree_insert_at(c, NULL, NULL, NULL, - BTREE_INSERT_ATOMIC| - BTREE_INSERT_NOFAIL, - BTREE_INSERT_ENTRY(&iter, &delete.k_i)); + bch2_trans_update(&trans, + BTREE_INSERT_ENTRY(iter, &delete.k_i)); + + ret = bch2_trans_commit(&trans, NULL, NULL, + BTREE_INSERT_ATOMIC| + BTREE_INSERT_NOFAIL); } while (ret == -EINTR); - bch2_btree_iter_unlock(&iter); + bch2_trans_exit(&trans); return ret; } -int bch2_inode_find_by_inum(struct bch_fs *c, u64 inode_nr, - struct bch_inode_unpacked *inode) +int bch2_inode_find_by_inum_trans(struct btree_trans *trans, u64 inode_nr, + struct bch_inode_unpacked *inode) { - struct btree_iter iter; + struct btree_iter *iter; struct bkey_s_c k; int ret = -ENOENT; - for_each_btree_key(&iter, c, BTREE_ID_INODES, - POS(inode_nr, 0), - BTREE_ITER_SLOTS, k) { - switch (k.k->type) { - case BCH_INODE_FS: - ret = bch2_inode_unpack(bkey_s_c_to_inode(k), inode); - break; - default: - /* hole, not found */ - break; - } + iter = bch2_trans_get_iter(trans, BTREE_ID_INODES, + POS(inode_nr, 0), 
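Note what bch2_inode_rm() leaves behind: after the extents, xattrs and dirents ranges are deleted, the inode key is replaced by a KEY_TYPE_inode_generation key carrying bi_generation + 1, so a later create in the same slot hands out a new generation and stale references (NFS file handles, say) are detectable. A toy slot-plus-generation scheme:

    #include <stdbool.h>
    #include <stdio.h>

    struct slot {
        bool     live;
        unsigned gen;   /* bumped on delete, inherited by the next create */
    };

    static struct slot table[8];

    static void del(unsigned i)        { table[i].live = false; table[i].gen++; }
    static unsigned create(unsigned i) { table[i].live = true; return table[i].gen; }

    static bool handle_valid(unsigned i, unsigned gen)
    {
        return table[i].live && table[i].gen == gen;
    }

    int main(void)
    {
        unsigned gen = create(3);   /* handle = (slot 3, gen) */

        del(3);
        create(3);                  /* slot reused, new generation */

        printf("old handle valid: %d\n", handle_valid(3, gen)); /* 0 */
        return 0;
    }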
BTREE_ITER_SLOTS); + if (IS_ERR(iter)) + return PTR_ERR(iter); - break; + k = bch2_btree_iter_peek_slot(iter); + if (k.k->type == KEY_TYPE_inode) + ret = bch2_inode_unpack(bkey_s_c_to_inode(k), inode); - } + bch2_trans_iter_put(trans, iter); - return bch2_btree_iter_unlock(&iter) ?: ret; + return ret; +} + +int bch2_inode_find_by_inum(struct bch_fs *c, u64 inode_nr, + struct bch_inode_unpacked *inode) +{ + return bch2_trans_do(c, NULL, 0, + bch2_inode_find_by_inum_trans(&trans, inode_nr, inode)); } #ifdef CONFIG_BCACHEFS_DEBUG diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h index cb3c887f0d40..af0c355f2f04 100644 --- a/fs/bcachefs/inode.h +++ b/fs/bcachefs/inode.h @@ -1,35 +1,46 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _BCACHEFS_INODE_H #define _BCACHEFS_INODE_H #include "opts.h" -#include <linux/math64.h> +extern const char * const bch2_inode_opts[]; const char *bch2_inode_invalid(const struct bch_fs *, struct bkey_s_c); -void bch2_inode_to_text(struct bch_fs *, char *, size_t, struct bkey_s_c); +void bch2_inode_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); -#define bch2_bkey_inode_ops (struct bkey_ops) { \ +#define bch2_bkey_ops_inode (struct bkey_ops) { \ .key_invalid = bch2_inode_invalid, \ .val_to_text = bch2_inode_to_text, \ } +const char *bch2_inode_generation_invalid(const struct bch_fs *, + struct bkey_s_c); +void bch2_inode_generation_to_text(struct printbuf *, struct bch_fs *, + struct bkey_s_c); + +#define bch2_bkey_ops_inode_generation (struct bkey_ops) { \ + .key_invalid = bch2_inode_generation_invalid, \ + .val_to_text = bch2_inode_generation_to_text, \ +} + struct bch_inode_unpacked { u64 bi_inum; __le64 bi_hash_seed; u32 bi_flags; u16 bi_mode; -#define BCH_INODE_FIELD(_name, _bits) u##_bits _name; +#define x(_name, _bits) u##_bits _name; BCH_INODE_FIELDS() -#undef BCH_INODE_FIELD +#undef x }; struct bkey_inode_buf { struct bkey_i_inode inode; -#define BCH_INODE_FIELD(_name, _bits) + 8 + _bits / 8 +#define x(_name, _bits) + 8 + _bits / 8 u8 _pad[0 + BCH_INODE_FIELDS()]; -#undef BCH_INODE_FIELD +#undef x } __attribute__((packed, aligned(8))); void bch2_inode_pack(struct bkey_inode_buf *, const struct bch_inode_unpacked *); @@ -38,67 +49,58 @@ int bch2_inode_unpack(struct bkey_s_c_inode, struct bch_inode_unpacked *); void bch2_inode_init(struct bch_fs *, struct bch_inode_unpacked *, uid_t, gid_t, umode_t, dev_t, struct bch_inode_unpacked *); + +int __bch2_inode_create(struct btree_trans *, + struct bch_inode_unpacked *, + u64, u64, u64 *); int bch2_inode_create(struct bch_fs *, struct bch_inode_unpacked *, u64, u64, u64 *); -int bch2_inode_truncate(struct bch_fs *, u64, u64, - struct extent_insert_hook *, u64 *); -int bch2_inode_rm(struct bch_fs *, u64); - -int bch2_inode_find_by_inum(struct bch_fs *, u64, - struct bch_inode_unpacked *); -static inline struct timespec64 bch2_time_to_timespec(struct bch_fs *c, u64 time) -{ - return ns_to_timespec64(time * c->sb.time_precision + c->sb.time_base_lo); -} - -static inline u64 timespec_to_bch2_time(struct bch_fs *c, struct timespec64 ts) -{ - s64 ns = timespec64_to_ns(&ts) - c->sb.time_base_lo; +int bch2_inode_rm(struct bch_fs *, u64); - if (c->sb.time_precision == 1) - return ns; - - return div_s64(ns, c->sb.time_precision); -} +int bch2_inode_find_by_inum_trans(struct btree_trans *, u64, + struct bch_inode_unpacked *); +int bch2_inode_find_by_inum(struct bch_fs *, u64, struct bch_inode_unpacked *); static inline struct bch_io_opts bch2_inode_opts_get(struct bch_inode_unpacked *inode) { struct 
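bch2_inode_find_by_inum() is now just a transactional body run under bch2_trans_do(), whose job is init, run, retry-on-restart, exit. Roughly, with a function pointer standing in for the macro's statement argument and the -EINTR restart modelled by a local constant:

    #include <stdio.h>

    #define EINTR_RESTART (-4)  /* stands in for bcachefs's -EINTR restarts */

    struct trans { int attempt; };

    /* toy body: succeeds on the second attempt, to show the retry */
    static int body(struct trans *trans)
    {
        return trans->attempt++ < 1 ? EINTR_RESTART : 0;
    }

    static int trans_do(int (*fn)(struct trans *))
    {
        struct trans trans = { 0 };     /* trans_init */
        int ret;

        do {
            ret = fn(&trans);           /* body, including the commit */
        } while (ret == EINTR_RESTART);

        /* trans_exit would go here; its error returned if fn succeeded */
        return ret;
    }

    int main(void)
    {
        printf("ret = %d\n", trans_do(body));   /* 0, after one restart */
        return 0;
    }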
bch_io_opts ret = { 0 }; -#define BCH_INODE_OPT(_name, _bits) \ +#define x(_name, _bits) \ if (inode->bi_##_name) \ opt_set(ret, _name, inode->bi_##_name - 1); BCH_INODE_OPTS() -#undef BCH_INODE_OPT +#undef x return ret; } -static inline void __bch2_inode_opt_set(struct bch_inode_unpacked *inode, - enum bch_opt_id id, u64 v) +static inline void bch2_inode_opt_set(struct bch_inode_unpacked *inode, + enum inode_opt_id id, u64 v) { switch (id) { -#define BCH_INODE_OPT(_name, ...) \ - case Opt_##_name: \ +#define x(_name, ...) \ + case Inode_opt_##_name: \ inode->bi_##_name = v; \ break; BCH_INODE_OPTS() -#undef BCH_INODE_OPT +#undef x default: BUG(); } } -static inline void bch2_inode_opt_set(struct bch_inode_unpacked *inode, - enum bch_opt_id id, u64 v) -{ - return __bch2_inode_opt_set(inode, id, v + 1); -} - -static inline void bch2_inode_opt_clear(struct bch_inode_unpacked *inode, - enum bch_opt_id id) +static inline u64 bch2_inode_opt_get(struct bch_inode_unpacked *inode, + enum inode_opt_id id) { - return __bch2_inode_opt_set(inode, id, 0); + switch (id) { +#define x(_name, ...) \ + case Inode_opt_##_name: \ + return inode->bi_##_name; + BCH_INODE_OPTS() +#undef x + default: + BUG(); + } } #ifdef CONFIG_BCACHEFS_DEBUG diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index f26d4041cdda..792f1df5d0a1 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Some low level IO code, and hacks for various block layer limitations * @@ -6,7 +7,7 @@ */ #include "bcachefs.h" -#include "alloc.h" +#include "alloc_foreground.h" #include "bset.h" #include "btree_update.h" #include "buckets.h" @@ -15,6 +16,7 @@ #include "clock.h" #include "debug.h" #include "disk_groups.h" +#include "ec.h" #include "error.h" #include "extents.h" #include "io.h" @@ -22,7 +24,6 @@ #include "keylist.h" #include "move.h" #include "rebalance.h" -#include "replicas.h" #include "super.h" #include "super-io.h" @@ -121,10 +122,11 @@ void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw) void bch2_bio_free_pages_pool(struct bch_fs *c, struct bio *bio) { + struct bvec_iter_all iter; struct bio_vec *bv; unsigned i; - bio_for_each_segment_all(bv, bio, i) + bio_for_each_segment_all(bv, bio, i, iter) if (bv->bv_page != ZERO_PAGE(0)) mempool_free(bv->bv_page, &c->bio_bounce_pages); bio->bi_vcnt = 0; @@ -202,20 +204,20 @@ void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c, enum bch_data_type type, const struct bkey_i *k) { - struct bkey_s_c_extent e = bkey_i_to_s_c_extent(k); + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(k)); const struct bch_extent_ptr *ptr; struct bch_write_bio *n; struct bch_dev *ca; BUG_ON(c->opts.nochanges); - extent_for_each_ptr(e, ptr) { + bkey_for_each_ptr(ptrs, ptr) { BUG_ON(ptr->dev >= BCH_SB_MEMBERS_MAX || !c->devs[ptr->dev]); ca = bch_dev_bkey_exists(c, ptr->dev); - if (ptr + 1 < &extent_entry_last(e)->ptr) { + if (to_entry(ptr + 1) < ptrs.end) { n = to_wbio(bio_clone_fast(&wbio->bio, GFP_NOIO, &ca->replica_set)); @@ -276,19 +278,44 @@ static void bch2_write_done(struct closure *cl) int bch2_write_index_default(struct bch_write_op *op) { + struct bch_fs *c = op->c; + struct btree_trans trans; + struct btree_iter *iter; struct keylist *keys = &op->insert_keys; - struct btree_iter iter; int ret; - bch2_btree_iter_init(&iter, op->c, BTREE_ID_EXTENTS, - bkey_start_pos(&bch2_keylist_front(keys)->k), - BTREE_ITER_INTENT); + BUG_ON(bch2_keylist_empty(keys)); + bch2_verify_keylist_sorted(keys); + + 
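Per-inode options use a one-based encoding: bi_foo == 0 means "no override, use the filesystem default", anything else stores the real value plus one, which is why bch2_inode_opts_get() subtracts 1 before opt_set(). A sketch of the bias:

    #include <stdio.h>

    #define FS_DEFAULT 6

    /* 0 = unset; otherwise stores (value + 1) so value 0 is representable */
    static unsigned inode_opt;

    static void opt_set(unsigned v) { inode_opt = v + 1; }
    static void opt_clear(void)     { inode_opt = 0; }

    static unsigned opt_get(void)
    {
        return inode_opt ? inode_opt - 1 : FS_DEFAULT;
    }

    int main(void)
    {
        printf("%u\n", opt_get());  /* 6: falls back to the fs default */
        opt_set(0);
        printf("%u\n", opt_get());  /* 0: an explicit override of zero */
        opt_clear();
        printf("%u\n", opt_get());  /* 6 again */
        return 0;
    }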
bch2_trans_init(&trans, c, BTREE_ITER_MAX, 256); + + iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, + bkey_start_pos(&bch2_keylist_front(keys)->k), + BTREE_ITER_INTENT); + + do { + BKEY_PADDED(k) split; + + bkey_copy(&split.k, bch2_keylist_front(keys)); - ret = bch2_btree_insert_list_at(&iter, keys, &op->res, - NULL, op_journal_seq(op), + bch2_extent_trim_atomic(&split.k, iter); + + bch2_trans_update(&trans, + BTREE_INSERT_ENTRY(iter, &split.k)); + + ret = bch2_trans_commit(&trans, &op->res, op_journal_seq(op), BTREE_INSERT_NOFAIL| BTREE_INSERT_USE_RESERVE); - bch2_btree_iter_unlock(&iter); + if (ret) + break; + + if (bkey_cmp(iter->pos, bch2_keylist_front(keys)->k.p) < 0) + bch2_cut_front(iter->pos, bch2_keylist_front(keys)); + else + bch2_keylist_pop_front(keys); + } while (!bch2_keylist_empty(keys)); + + bch2_trans_exit(&trans); return ret; } @@ -300,31 +327,23 @@ static void __bch2_write_index(struct bch_write_op *op) { struct bch_fs *c = op->c; struct keylist *keys = &op->insert_keys; - struct bkey_s_extent e; struct bch_extent_ptr *ptr; struct bkey_i *src, *dst = keys->keys, *n, *k; + unsigned dev; int ret; for (src = keys->keys; src != keys->top; src = n) { n = bkey_next(src); bkey_copy(dst, src); - e = bkey_i_to_s_extent(dst); - extent_for_each_ptr_backwards(e, ptr) - if (test_bit(ptr->dev, op->failed.d)) - bch2_extent_drop_ptr(e, ptr); + bch2_bkey_drop_ptrs(bkey_i_to_s(dst), ptr, + test_bit(ptr->dev, op->failed.d)); - if (!bch2_extent_nr_ptrs(e.c)) { + if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(dst))) { ret = -EIO; goto err; } - if (!(op->flags & BCH_WRITE_NOMARK_REPLICAS)) { - ret = bch2_mark_bkey_replicas(c, BCH_DATA_USER, e.s_c); - if (ret) - goto err; - } - dst = bkey_next(dst); } @@ -352,7 +371,11 @@ static void __bch2_write_index(struct bch_write_op *op) } } out: - bch2_open_bucket_put_refs(c, &op->open_buckets_nr, op->open_buckets); + /* If some a bucket wasn't written, we can't erasure code it: */ + for_each_set_bit(dev, op->failed.d, BCH_SB_MEMBERS_MAX) + bch2_open_bucket_write_error(c, &op->open_buckets, dev); + + bch2_open_buckets_put(c, &op->open_buckets); return; err: keys->top = keys->keys; @@ -411,16 +434,32 @@ static void init_append_extent(struct bch_write_op *op, struct bversion version, struct bch_extent_crc_unpacked crc) { + struct bch_fs *c = op->c; struct bkey_i_extent *e = bkey_extent_init(op->insert_keys.top); + struct extent_ptr_decoded p = { .crc = crc }; + struct open_bucket *ob; + unsigned i; op->pos.offset += crc.uncompressed_size; - e->k.p = op->pos; - e->k.size = crc.uncompressed_size; - e->k.version = version; - bkey_extent_set_cached(&e->k, op->flags & BCH_WRITE_CACHED); + e->k.p = op->pos; + e->k.size = crc.uncompressed_size; + e->k.version = version; + + BUG_ON(crc.compressed_size > wp->sectors_free); + wp->sectors_free -= crc.compressed_size; + + open_bucket_for_each(c, &wp->ptrs, ob, i) { + struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev); - bch2_extent_crc_append(e, crc); - bch2_alloc_sectors_append_ptrs(op->c, wp, e, crc.compressed_size); + p.ptr = ob->ptr; + p.ptr.cached = !ca->mi.durability || + (op->flags & BCH_WRITE_CACHED) != 0; + p.ptr.offset += ca->mi.bucket_size - ob->sectors_free; + bch2_extent_ptr_decoded_append(e, &p); + + BUG_ON(crc.compressed_size > ob->sectors_free); + ob->sectors_free -= crc.compressed_size; + } bch2_keylist_push(&op->insert_keys); } @@ -428,7 +467,8 @@ static void init_append_extent(struct bch_write_op *op, static struct bio *bch2_write_bio_alloc(struct bch_fs *c, struct write_point *wp, struct bio 
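The new bch2_write_index_default() loop commits one atomically-insertable piece at a time: copy the front key, trim it (bch2_extent_trim_atomic), commit, then either cut the committed prefix off the front or pop the key once it is fully consumed. The control flow over plain ranges (the trim limit is invented):

    #include <stdio.h>

    struct range { unsigned start, end; };

    #define MAX_ATOMIC 4    /* assumed per-commit limit */

    static void commit(struct range r)
    {
        printf("committed [%u,%u)\n", r.start, r.end);
    }

    int main(void)
    {
        struct range keys[] = { { 0, 10 }, { 10, 12 } };
        unsigned nr = 2, i = 0;

        while (i < nr) {
            struct range split = keys[i];

            /* trim to what one transaction may take atomically: */
            if (split.end - split.start > MAX_ATOMIC)
                split.end = split.start + MAX_ATOMIC;

            commit(split);

            if (split.end < keys[i].end)
                keys[i].start = split.end;  /* bch2_cut_front() */
            else
                i++;                        /* keylist_pop_front() */
        }
        return 0;
    }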
*src, - bool *page_alloc_failed) + bool *page_alloc_failed, + void *buf) { struct bch_write_bio *wbio; struct bio *bio; @@ -438,11 +478,18 @@ static struct bio *bch2_write_bio_alloc(struct bch_fs *c, bio = bio_alloc_bioset(GFP_NOIO, pages, &c->bio_write); wbio = wbio_init(bio); - wbio->bounce = true; wbio->put_bio = true; /* copy WRITE_SYNC flag */ wbio->bio.bi_opf = src->bi_opf; + if (buf) { + bio->bi_iter.bi_size = output_available; + bch2_bio_map(bio, buf); + return bio; + } + + wbio->bounce = true; + /* * We can't use mempool for more than c->sb.encoded_extent_max * worth of pages, but we'd like to allocate more if we can: @@ -607,14 +654,18 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp) struct bio *src = &op->wbio.bio, *dst = src; struct bvec_iter saved_iter; struct bkey_i *key_to_write; + void *ec_buf; unsigned key_to_write_offset = op->insert_keys.top_p - op->insert_keys.keys_p; - unsigned total_output = 0; - bool bounce = false, page_alloc_failed = false; + unsigned total_output = 0, total_input = 0; + bool bounce = false; + bool page_alloc_failed = false; int ret, more = 0; BUG_ON(!bio_sectors(src)); + ec_buf = bch2_writepoint_ec_buf(c, wp); + switch (bch2_write_prep_encoded_data(op, wp)) { case PREP_ENCODED_OK: break; @@ -624,16 +675,26 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp) case PREP_ENCODED_CHECKSUM_ERR: goto csum_err; case PREP_ENCODED_DO_WRITE: + if (ec_buf) { + dst = bch2_write_bio_alloc(c, wp, src, + &page_alloc_failed, + ec_buf); + bio_copy_data(dst, src); + bounce = true; + } init_append_extent(op, wp, op->version, op->crc); goto do_write; } - if (op->compression_type || + if (ec_buf || + op->compression_type || (op->csum_type && !(op->flags & BCH_WRITE_PAGES_STABLE)) || (bch2_csum_type_is_encryption(op->csum_type) && !(op->flags & BCH_WRITE_PAGES_OWNED))) { - dst = bch2_write_bio_alloc(c, wp, src, &page_alloc_failed); + dst = bch2_write_bio_alloc(c, wp, src, + &page_alloc_failed, + ec_buf); bounce = true; } @@ -736,7 +797,8 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp) if (dst != src) bio_advance(dst, dst_len); bio_advance(src, src_len); - total_output += dst_len; + total_output += dst_len; + total_input += src_len; } while (dst->bi_iter.bi_size && src->bi_iter.bi_size && wp->sectors_free && @@ -749,16 +811,20 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp) dst->bi_iter = saved_iter; - if (!bounce && more) { - dst = bio_split(src, total_output >> 9, + if (dst == src && more) { + BUG_ON(total_output != total_input); + + dst = bio_split(src, total_input >> 9, GFP_NOIO, &c->bio_write); - wbio_init(dst)->put_bio = true; + wbio_init(dst)->put_bio = true; + /* copy WRITE_SYNC flag */ + dst->bi_opf = src->bi_opf; } dst->bi_iter.bi_size = total_output; /* Free unneeded pages after compressing: */ - if (bounce) + if (to_wbio(dst)->bounce) while (dst->bi_vcnt > DIV_ROUND_UP(dst->bi_iter.bi_size, PAGE_SIZE)) mempool_free(dst->bi_io_vec[--dst->bi_vcnt].bv_page, &c->bio_bounce_pages); @@ -767,6 +833,10 @@ do_write: key_to_write = (void *) (op->insert_keys.keys_p + key_to_write_offset); + bch2_ec_add_backpointer(c, wp, + bkey_start_pos(&key_to_write->k), + total_input >> 9); + dst->bi_end_io = bch2_write_endio; dst->bi_private = &op->cl; bio_set_op_attrs(dst, REQ_OP_WRITE, 0); @@ -781,10 +851,10 @@ csum_err: "rewriting existing data (memory corruption?)"); ret = -EIO; err: - if (bounce) { + if (to_wbio(dst)->bounce) bch2_bio_free_pages_pool(c, 
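In __bch2_write_index() the old open-coded backwards pointer walk becomes bch2_bkey_drop_ptrs() with a predicate: pointers on devices recorded in op->failed are dropped, and a key left with zero pointers turns the whole write into -EIO. The same filter-in-place over a small array:

    #include <stdbool.h>
    #include <stdio.h>

    static bool dev_failed(unsigned dev) { return dev == 1; }

    int main(void)
    {
        unsigned ptrs[] = { 0, 1, 2 };  /* device index per replica */
        unsigned nr = 3, i, out = 0;

        /* drop_ptrs: keep only pointers whose device succeeded */
        for (i = 0; i < nr; i++)
            if (!dev_failed(ptrs[i]))
                ptrs[out++] = ptrs[i];
        nr = out;

        if (!nr) {
            fprintf(stderr, "all replicas failed: -EIO\n");
            return 1;
        }
        printf("%u replica(s) survive\n", nr);  /* 2 */
        return 0;
    }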
dst); + if (to_wbio(dst)->put_bio) bio_put(dst); - } return ret; } @@ -796,10 +866,12 @@ static void __bch2_write(struct closure *cl) struct write_point *wp; int ret; again: + memset(&op->failed, 0, sizeof(op->failed)); + do { /* +1 for possible cache device: */ - if (op->open_buckets_nr + op->nr_replicas + 1 > - ARRAY_SIZE(op->open_buckets)) + if (op->open_buckets.nr + op->nr_replicas + 1 > + ARRAY_SIZE(op->open_buckets.v)) goto flush_io; if (bch2_keylist_realloc(&op->insert_keys, @@ -810,6 +882,7 @@ again: wp = bch2_alloc_sectors_start(c, op->target, + op->opts.erasure_code, op->write_point, &op->devs_have, op->nr_replicas, @@ -830,11 +903,7 @@ again: ret = bch2_write_extent(op, wp); - BUG_ON(op->open_buckets_nr + wp->nr_ptrs - wp->first_ptr > - ARRAY_SIZE(op->open_buckets)); - bch2_open_bucket_get(c, wp, - &op->open_buckets_nr, - op->open_buckets); + bch2_open_bucket_get(c, wp, &op->open_buckets); bch2_alloc_sectors_done(c, wp); if (ret < 0) @@ -889,12 +958,9 @@ void bch2_write(struct closure *cl) BUG_ON(!op->nr_replicas); BUG_ON(!op->write_point.v); BUG_ON(!bkey_cmp(op->pos, POS_MAX)); - BUG_ON(bio_sectors(&op->wbio.bio) > U16_MAX); op->start_time = local_clock(); - memset(&op->failed, 0, sizeof(op->failed)); - bch2_keylist_init(&op->insert_keys, op->inline_keys); wbio_init(&op->wbio.bio)->put_bio = false; @@ -917,6 +983,7 @@ void bch2_write(struct closure *cl) struct promote_op { struct closure cl; + struct rcu_head rcu; u64 start_time; struct rhash_head hash; @@ -937,23 +1004,23 @@ static inline bool should_promote(struct bch_fs *c, struct bkey_s_c k, struct bch_io_opts opts, unsigned flags) { - if (!opts.promote_target) + if (!bkey_extent_is_data(k.k)) return false; if (!(flags & BCH_READ_MAY_PROMOTE)) return false; - if (percpu_ref_is_dying(&c->writes)) - return false; - - if (!bkey_extent_is_data(k.k)) + if (!opts.promote_target) return false; - if (bch2_extent_has_target(c, bkey_s_c_to_extent(k), opts.promote_target)) + if (bch2_extent_has_target(c, bkey_s_c_to_extent(k), + opts.promote_target)) return false; - if (bch2_target_congested(c, opts.promote_target)) + if (bch2_target_congested(c, opts.promote_target)) { + /* XXX trace this */ return false; + } if (rhashtable_lookup_fast(&c->promote_table, &pos, bch_promote_params)) @@ -970,7 +1037,7 @@ static void promote_free(struct bch_fs *c, struct promote_op *op) bch_promote_params); BUG_ON(ret); percpu_ref_put(&c->writes); - kfree(op); + kfree_rcu(op, rcu); } static void promote_done(struct closure *cl) @@ -1012,7 +1079,7 @@ static void promote_start(struct promote_op *op, struct bch_read_bio *rbio) noinline static struct promote_op *__promote_alloc(struct bch_fs *c, struct bpos pos, - struct extent_pick_ptr *pick, + struct extent_ptr_decoded *pick, struct bch_io_opts opts, unsigned rbio_sectors, struct bch_read_bio **rbio) @@ -1093,7 +1160,7 @@ err: static inline struct promote_op *promote_alloc(struct bch_fs *c, struct bvec_iter iter, struct bkey_s_c k, - struct extent_pick_ptr *pick, + struct extent_ptr_decoded *pick, struct bch_io_opts opts, unsigned flags, struct bch_read_bio **rbio, @@ -1187,29 +1254,31 @@ static void bch2_rbio_done(struct bch_read_bio *rbio) static void bch2_read_retry_nodecode(struct bch_fs *c, struct bch_read_bio *rbio, struct bvec_iter bvec_iter, u64 inode, - struct bch_devs_mask *avoid, unsigned flags) + struct bch_io_failures *failed, + unsigned flags) { - struct btree_iter iter; + struct btree_trans trans; + struct btree_iter *iter; BKEY_PADDED(k) tmp; struct bkey_s_c k; int ret; flags &= 
~BCH_READ_LAST_FRAGMENT; - bch2_btree_iter_init(&iter, c, BTREE_ID_EXTENTS, - rbio->pos, BTREE_ITER_SLOTS); + bch2_trans_init(&trans, c, 0, 0); + + iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, + rbio->pos, BTREE_ITER_SLOTS); retry: rbio->bio.bi_status = 0; - k = bch2_btree_iter_peek_slot(&iter); - if (btree_iter_err(k)) { - bch2_btree_iter_unlock(&iter); + k = bch2_btree_iter_peek_slot(iter); + if (bkey_err(k)) goto err; - } bkey_reassemble(&tmp.k, k); k = bkey_i_to_s_c(&tmp.k); - bch2_btree_iter_unlock(&iter); + bch2_trans_unlock(&trans); if (!bkey_extent_is_data(k.k) || !bch2_extent_matches_ptr(c, bkey_i_to_s_c_extent(&tmp.k), @@ -1221,44 +1290,49 @@ retry: goto out; } - ret = __bch2_read_extent(c, rbio, bvec_iter, k, avoid, flags); + ret = __bch2_read_extent(c, rbio, bvec_iter, k, failed, flags); if (ret == READ_RETRY) goto retry; if (ret) goto err; - goto out; -err: - rbio->bio.bi_status = BLK_STS_IOERR; out: bch2_rbio_done(rbio); + bch2_trans_exit(&trans); + return; +err: + rbio->bio.bi_status = BLK_STS_IOERR; + goto out; } static void bch2_read_retry(struct bch_fs *c, struct bch_read_bio *rbio, struct bvec_iter bvec_iter, u64 inode, - struct bch_devs_mask *avoid, unsigned flags) + struct bch_io_failures *failed, unsigned flags) { - struct btree_iter iter; + struct btree_trans trans; + struct btree_iter *iter; struct bkey_s_c k; int ret; + bch2_trans_init(&trans, c, 0, 0); + flags &= ~BCH_READ_LAST_FRAGMENT; flags |= BCH_READ_MUST_CLONE; retry: - for_each_btree_key(&iter, c, BTREE_ID_EXTENTS, + for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, POS(inode, bvec_iter.bi_sector), - BTREE_ITER_SLOTS, k) { + BTREE_ITER_SLOTS, k, ret) { BKEY_PADDED(k) tmp; unsigned bytes; bkey_reassemble(&tmp.k, k); k = bkey_i_to_s_c(&tmp.k); - bch2_btree_iter_unlock(&iter); + bch2_trans_unlock(&trans); bytes = min_t(unsigned, bvec_iter.bi_size, (k.k->p.offset - bvec_iter.bi_sector) << 9); swap(bvec_iter.bi_size, bytes); - ret = __bch2_read_extent(c, rbio, bvec_iter, k, avoid, flags); + ret = __bch2_read_extent(c, rbio, bvec_iter, k, failed, flags); switch (ret) { case READ_RETRY: goto retry; @@ -1277,12 +1351,12 @@ retry: * If we get here, it better have been because there was an error * reading a btree node */ - ret = bch2_btree_iter_unlock(&iter); BUG_ON(!ret); - __bcache_io_error(c, "btree IO error %i", ret); + __bcache_io_error(c, "btree IO error: %i", ret); err: rbio->bio.bi_status = BLK_STS_IOERR; out: + bch2_trans_exit(&trans); bch2_rbio_done(rbio); } @@ -1294,14 +1368,12 @@ static void bch2_rbio_retry(struct work_struct *work) struct bvec_iter iter = rbio->bvec_iter; unsigned flags = rbio->flags; u64 inode = rbio->pos.inode; - struct bch_devs_mask avoid; + struct bch_io_failures failed = { .nr = 0 }; trace_read_retry(&rbio->bio); - memset(&avoid, 0, sizeof(avoid)); - if (rbio->retry == READ_RETRY_AVOID) - __set_bit(rbio->pick.ptr.dev, avoid.d); + bch2_mark_io_failure(&failed, &rbio->pick); rbio->bio.bi_status = 0; @@ -1311,9 +1383,9 @@ static void bch2_rbio_retry(struct work_struct *work) flags &= ~BCH_READ_MAY_PROMOTE; if (flags & BCH_READ_NODECODE) - bch2_read_retry_nodecode(c, rbio, iter, inode, &avoid, flags); + bch2_read_retry_nodecode(c, rbio, iter, inode, &failed, flags); else - bch2_read_retry(c, rbio, iter, inode, &avoid, flags); + bch2_read_retry(c, rbio, iter, inode, &failed, flags); } static void bch2_rbio_error(struct bch_read_bio *rbio, int retry, @@ -1338,21 +1410,25 @@ static void bch2_rbio_error(struct bch_read_bio *rbio, int retry, static void bch2_rbio_narrow_crcs(struct 
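The retry paths here replace the old 'avoid' device bitmap with a bch_io_failures list: every failed attempt is recorded via bch2_mark_io_failure(), and the next pick skips those devices until either a read succeeds or no device is left. A compact sketch of retry with failure tracking:

    #include <stdbool.h>
    #include <stdio.h>

    #define NR_DEVS 3

    static bool failed[NR_DEVS];

    /* devices 0 and 1 are broken in this toy; only 2 returns data */
    static int read_from(unsigned dev) { return dev == 2 ? 0 : -5; }

    static int pick_dev(void)
    {
        for (unsigned d = 0; d < NR_DEVS; d++)
            if (!failed[d])
                return (int) d;
        return -1;  /* no device to read from */
    }

    int main(void)
    {
        int dev;

        while ((dev = pick_dev()) >= 0) {
            if (!read_from((unsigned) dev)) {
                printf("read ok from dev %d\n", dev);
                return 0;
            }
            failed[dev] = true;     /* bch2_mark_io_failure() */
        }
        fprintf(stderr, "no device to read from\n");
        return 1;
    }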
bch_read_bio *rbio) { struct bch_fs *c = rbio->c; - struct btree_iter iter; + struct btree_trans trans; + struct btree_iter *iter; struct bkey_s_c k; struct bkey_i_extent *e; BKEY_PADDED(k) new; struct bch_extent_crc_unpacked new_crc; - unsigned offset; + u64 data_offset = rbio->pos.offset - rbio->pick.crc.offset; int ret; if (rbio->pick.crc.compression_type) return; - bch2_btree_iter_init(&iter, c, BTREE_ID_EXTENTS, rbio->pos, - BTREE_ITER_INTENT); + bch2_trans_init(&trans, c, 0, 0); retry: - k = bch2_btree_iter_peek(&iter); + bch2_trans_begin(&trans); + + iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, rbio->pos, + BTREE_ITER_INTENT); + k = bch2_btree_iter_peek(iter); if (IS_ERR_OR_NULL(k.k)) goto out; @@ -1363,24 +1439,19 @@ retry: e = bkey_i_to_extent(&new.k); if (!bch2_extent_matches_ptr(c, extent_i_to_s_c(e), - rbio->pick.ptr, - rbio->pos.offset - - rbio->pick.crc.offset) || + rbio->pick.ptr, data_offset) || bversion_cmp(e->k.version, rbio->version)) goto out; /* Extent was merged? */ - if (bkey_start_offset(&e->k) < rbio->pos.offset || - e->k.p.offset > rbio->pos.offset + rbio->pick.crc.uncompressed_size) + if (bkey_start_offset(&e->k) < data_offset || + e->k.p.offset > data_offset + rbio->pick.crc.uncompressed_size) goto out; - /* The extent might have been partially overwritten since we read it: */ - offset = rbio->pick.crc.offset + (bkey_start_offset(&e->k) - rbio->pos.offset); - if (bch2_rechecksum_bio(c, &rbio->bio, rbio->version, - rbio->pick.crc, NULL, &new_crc, - offset, e->k.size, - rbio->pick.crc.csum_type)) { + rbio->pick.crc, NULL, &new_crc, + bkey_start_offset(&e->k) - data_offset, e->k.size, + rbio->pick.crc.csum_type)) { bch_err(c, "error verifying existing checksum while narrowing checksum (memory corruption?)"); goto out; } @@ -1388,19 +1459,19 @@ retry: if (!bch2_extent_narrow_crcs(e, new_crc)) goto out; - ret = bch2_btree_insert_at(c, NULL, NULL, NULL, - BTREE_INSERT_ATOMIC| - BTREE_INSERT_NOFAIL| - BTREE_INSERT_NOWAIT, - BTREE_INSERT_ENTRY(&iter, &e->k_i)); + bch2_trans_update(&trans, BTREE_INSERT_ENTRY(iter, &e->k_i)); + ret = bch2_trans_commit(&trans, NULL, NULL, + BTREE_INSERT_ATOMIC| + BTREE_INSERT_NOFAIL| + BTREE_INSERT_NOWAIT); if (ret == -EINTR) goto retry; out: - bch2_btree_iter_unlock(&iter); + bch2_trans_exit(&trans); } static bool should_narrow_crcs(struct bkey_s_c k, - struct extent_pick_ptr *pick, + struct extent_ptr_decoded *pick, unsigned flags) { return !(flags & BCH_READ_IN_RETRY) && @@ -1553,9 +1624,9 @@ static void bch2_read_endio(struct bio *bio) int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig, struct bvec_iter iter, struct bkey_s_c k, - struct bch_devs_mask *avoid, unsigned flags) + struct bch_io_failures *failed, unsigned flags) { - struct extent_pick_ptr pick; + struct extent_ptr_decoded pick; struct bch_read_bio *rbio = NULL; struct bch_dev *ca; struct promote_op *promote = NULL; @@ -1563,14 +1634,16 @@ int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig, struct bpos pos = bkey_start_pos(k.k); int pick_ret; - pick_ret = bch2_extent_pick_ptr(c, k, avoid, &pick); + pick_ret = bch2_bkey_pick_read_device(c, k, failed, &pick); /* hole or reservation - just zero fill: */ if (!pick_ret) goto hole; - if (pick_ret < 0) - goto no_device; + if (pick_ret < 0) { + __bcache_io_error(c, "no device to read from"); + goto err; + } if (pick_ret > 0) ca = bch_dev_bkey_exists(c, pick.ptr.dev); @@ -1695,31 +1768,46 @@ noclone: bch2_increment_clock(c, bio_sectors(&rbio->bio), READ); - if (!rbio->have_ioref) - goto 
no_device_postclone; - - percpu_down_read_preempt_disable(&c->usage_lock); + percpu_down_read(&c->mark_lock); bucket_io_clock_reset(c, ca, PTR_BUCKET_NR(ca, &pick.ptr), READ); - percpu_up_read_preempt_enable(&c->usage_lock); + percpu_up_read(&c->mark_lock); - this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_USER], - bio_sectors(&rbio->bio)); + if (likely(!(flags & (BCH_READ_IN_RETRY|BCH_READ_LAST_FRAGMENT)))) { + bio_inc_remaining(&orig->bio); + trace_read_split(&orig->bio); + } - bio_set_dev(&rbio->bio, ca->disk_sb.bdev); + if (!rbio->pick.idx) { + if (!rbio->have_ioref) { + __bcache_io_error(c, "no device to read from"); + bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR); + goto out; + } - if (likely(!(flags & BCH_READ_IN_RETRY))) { - if (!(flags & BCH_READ_LAST_FRAGMENT)) { - bio_inc_remaining(&orig->bio); - trace_read_split(&orig->bio); + this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_USER], + bio_sectors(&rbio->bio)); + bio_set_dev(&rbio->bio, ca->disk_sb.bdev); + + if (likely(!(flags & BCH_READ_IN_RETRY))) + submit_bio(&rbio->bio); + else + submit_bio_wait(&rbio->bio); + } else { + /* Attempting reconstruct read: */ + if (bch2_ec_read_extent(c, rbio)) { + bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR); + goto out; } - submit_bio(&rbio->bio); + if (likely(!(flags & BCH_READ_IN_RETRY))) + bio_endio(&rbio->bio); + } +out: + if (likely(!(flags & BCH_READ_IN_RETRY))) { return 0; } else { int ret; - submit_bio_wait(&rbio->bio); - rbio->context = RBIO_CONTEXT_UNBOUND; bch2_read_endio(&rbio->bio); @@ -1727,29 +1815,19 @@ noclone: rbio = bch2_rbio_free(rbio); if (ret == READ_RETRY_AVOID) { - __set_bit(pick.ptr.dev, avoid->d); + bch2_mark_io_failure(failed, &pick); ret = READ_RETRY; } return ret; } -no_device_postclone: - if (!rbio->split) - rbio->bio.bi_end_io = rbio->end_io; - bch2_rbio_free(rbio); -no_device: - __bcache_io_error(c, "no device to read from"); - - if (likely(!(flags & BCH_READ_IN_RETRY))) { - orig->bio.bi_status = BLK_STS_IOERR; - - if (flags & BCH_READ_LAST_FRAGMENT) - bch2_rbio_done(orig); - return 0; - } else { +err: + if (flags & BCH_READ_IN_RETRY) return READ_ERR; - } + + orig->bio.bi_status = BLK_STS_IOERR; + goto out_read_done; hole: /* @@ -1761,7 +1839,7 @@ hole: orig->hole = true; zero_fill_bio_iter(&orig->bio, iter); - +out_read_done: if (flags & BCH_READ_LAST_FRAGMENT) bch2_rbio_done(orig); return 0; @@ -1769,13 +1847,16 @@ hole: void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, u64 inode) { - struct btree_iter iter; + struct btree_trans trans; + struct btree_iter *iter; struct bkey_s_c k; unsigned flags = BCH_READ_RETRY_IF_STALE| BCH_READ_MAY_PROMOTE| BCH_READ_USER_MAPPED; int ret; + bch2_trans_init(&trans, c, 0, 0); + BUG_ON(rbio->_state); BUG_ON(flags & BCH_READ_NODECODE); BUG_ON(flags & BCH_READ_IN_RETRY); @@ -1783,9 +1864,9 @@ void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, u64 inode) rbio->c = c; rbio->start_time = local_clock(); - for_each_btree_key(&iter, c, BTREE_ID_EXTENTS, + for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, POS(inode, rbio->bio.bi_iter.bi_sector), - BTREE_ITER_SLOTS, k) { + BTREE_ITER_SLOTS, k, ret) { BKEY_PADDED(k) tmp; unsigned bytes; @@ -1795,7 +1876,7 @@ void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, u64 inode) */ bkey_reassemble(&tmp.k, k); k = bkey_i_to_s_c(&tmp.k); - bch2_btree_iter_unlock(&iter); + bch2_trans_unlock(&trans); bytes = min_t(unsigned, rbio->bio.bi_iter.bi_size, (k.k->p.offset - rbio->bio.bi_iter.bi_sector) << 9); @@ -1817,9 +1898,10 @@ void bch2_read(struct bch_fs 
*c, struct bch_read_bio *rbio, u64 inode) * If we get here, it better have been because there was an error * reading a btree node */ - ret = bch2_btree_iter_unlock(&iter); BUG_ON(!ret); - bcache_io_error(c, &rbio->bio, "btree IO error %i", ret); + bcache_io_error(c, &rbio->bio, "btree IO error: %i", ret); + + bch2_trans_exit(&trans); bch2_rbio_done(rbio); } diff --git a/fs/bcachefs/io.h b/fs/bcachefs/io.h index 68539c78f292..fe82c8b81ca5 100644 --- a/fs/bcachefs/io.h +++ b/fs/bcachefs/io.h @@ -1,7 +1,7 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _BCACHEFS_IO_H #define _BCACHEFS_IO_H -#include "alloc.h" #include "checksum.h" #include "io_types.h" @@ -31,10 +31,9 @@ enum bch_write_flags { BCH_WRITE_PAGES_OWNED = (1 << 5), BCH_WRITE_ONLY_SPECIFIED_DEVS = (1 << 6), BCH_WRITE_NOPUT_RESERVATION = (1 << 7), - BCH_WRITE_NOMARK_REPLICAS = (1 << 8), /* Internal: */ - BCH_WRITE_JOURNAL_SEQ_PTR = (1 << 9), + BCH_WRITE_JOURNAL_SEQ_PTR = (1 << 8), }; static inline u64 *op_journal_seq(struct bch_write_op *op) @@ -71,7 +70,7 @@ static inline void bch2_write_op_init(struct bch_write_op *op, struct bch_fs *c, op->nr_replicas = 0; op->nr_replicas_required = c->opts.data_replicas_required; op->alloc_reserve = RESERVE_NONE; - op->open_buckets_nr = 0; + op->open_buckets.nr = 0; op->devs_have.nr = 0; op->target = 0; op->opts = opts; @@ -95,10 +94,10 @@ static inline struct bch_write_bio *wbio_init(struct bio *bio) struct bch_devs_mask; struct cache_promote_op; -struct extent_pick_ptr; +struct extent_ptr_decoded; int __bch2_read_extent(struct bch_fs *, struct bch_read_bio *, struct bvec_iter, - struct bkey_s_c, struct bch_devs_mask *, unsigned); + struct bkey_s_c, struct bch_io_failures *, unsigned); void bch2_read(struct bch_fs *, struct bch_read_bio *, u64); enum bch_read_flags { diff --git a/fs/bcachefs/io_types.h b/fs/bcachefs/io_types.h index 28281ea6c43a..04f6d9a7c9a2 100644 --- a/fs/bcachefs/io_types.h +++ b/fs/bcachefs/io_types.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _BCACHEFS_IO_TYPES_H #define _BCACHEFS_IO_TYPES_H @@ -54,7 +55,7 @@ struct bch_read_bio { struct bch_devs_list devs_have; - struct extent_pick_ptr pick; + struct extent_ptr_decoded pick; /* start pos of data we read (may not be pos of data we want) */ struct bpos pos; struct bversion version; @@ -103,7 +104,6 @@ struct bch_write_op { unsigned nr_replicas_required:4; unsigned alloc_reserve:4; - u8 open_buckets_nr; struct bch_devs_list devs_have; u16 target; u16 nonce; @@ -120,7 +120,7 @@ struct bch_write_op { struct disk_reservation res; - u8 open_buckets[16]; + struct open_buckets open_buckets; /* * If caller wants to flush but hasn't passed us a journal_seq ptr, we diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index addd51f08c9a..5c3e146e3942 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * bcachefs journalling code, for btree insertions * @@ -5,7 +6,7 @@ */ #include "bcachefs.h" -#include "alloc.h" +#include "alloc_foreground.h" #include "bkey_methods.h" #include "btree_gc.h" #include "buckets.h" @@ -17,29 +18,14 @@ #include <trace/events/bcachefs.h> -static bool journal_entry_is_open(struct journal *j) +static bool __journal_entry_is_open(union journal_res_state state) { - return j->reservations.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL; + return state.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL; } -void bch2_journal_buf_put_slowpath(struct journal *j, bool need_write_just_set) +static bool journal_entry_is_open(struct 
journal *j) { - struct journal_buf *w = journal_prev_buf(j); - - atomic_dec_bug(&journal_seq_pin(j, le64_to_cpu(w->data->seq))->count); - - if (!need_write_just_set && - test_bit(JOURNAL_NEED_WRITE, &j->flags)) - bch2_time_stats_update(j->delay_time, - j->need_write_time); -#if 0 - closure_call(&j->io, bch2_journal_write, NULL, NULL); -#else - /* Shut sparse up: */ - closure_init(&j->io, NULL); - set_closure_fn(&j->io, bch2_journal_write, NULL); - bch2_journal_write(&j->io); -#endif + return __journal_entry_is_open(j->reservations); } static void journal_pin_new_entry(struct journal *j, int count) @@ -70,41 +56,71 @@ static void bch2_journal_buf_init(struct journal *j) buf->data->u64s = 0; } -static inline size_t journal_entry_u64s_reserve(struct journal_buf *buf) +void bch2_journal_halt(struct journal *j) { - return BTREE_ID_NR * (JSET_KEYS_U64s + BKEY_EXTENT_U64s_MAX); + union journal_res_state old, new; + u64 v = atomic64_read(&j->reservations.counter); + + do { + old.v = new.v = v; + if (old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL) + return; + + new.cur_entry_offset = JOURNAL_ENTRY_ERROR_VAL; + } while ((v = atomic64_cmpxchg(&j->reservations.counter, + old.v, new.v)) != old.v); + + journal_wake(j); + closure_wake_up(&journal_cur_buf(j)->wait); } -static enum { - JOURNAL_ENTRY_ERROR, - JOURNAL_ENTRY_INUSE, - JOURNAL_ENTRY_CLOSED, - JOURNAL_UNLOCKED, -} journal_buf_switch(struct journal *j, bool need_write_just_set) +/* journal entry close/open: */ + +void __bch2_journal_buf_put(struct journal *j, bool need_write_just_set) +{ + if (!need_write_just_set && + test_bit(JOURNAL_NEED_WRITE, &j->flags)) + bch2_time_stats_update(j->delay_time, + j->need_write_time); + + clear_bit(JOURNAL_NEED_WRITE, &j->flags); + + closure_call(&j->io, bch2_journal_write, system_highpri_wq, NULL); +} + +/* + * Returns true if journal entry is now closed: + */ +static bool __journal_entry_close(struct journal *j) { struct bch_fs *c = container_of(j, struct bch_fs, journal); - struct journal_buf *buf; + struct journal_buf *buf = journal_cur_buf(j); union journal_res_state old, new; u64 v = atomic64_read(&j->reservations.counter); + bool set_need_write = false; + unsigned sectors; lockdep_assert_held(&j->lock); do { old.v = new.v = v; if (old.cur_entry_offset == JOURNAL_ENTRY_CLOSED_VAL) - return JOURNAL_ENTRY_CLOSED; + return true; - if (old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL) - return JOURNAL_ENTRY_ERROR; + if (old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL) { + /* this entry will never be written: */ + closure_wake_up(&buf->wait); + return true; + } - if (new.prev_buf_unwritten) - return JOURNAL_ENTRY_INUSE; + if (!test_bit(JOURNAL_NEED_WRITE, &j->flags)) { + set_bit(JOURNAL_NEED_WRITE, &j->flags); + j->need_write_time = local_clock(); + set_need_write = true; + } - /* - * avoid race between setting buf->data->u64s and - * journal_res_put starting write: - */ - journal_state_inc(&new); + if (new.prev_buf_unwritten) + return false; new.cur_entry_offset = JOURNAL_ENTRY_CLOSED_VAL; new.idx++; @@ -114,59 +130,62 @@ static enum { } while ((v = atomic64_cmpxchg(&j->reservations.counter, old.v, new.v)) != old.v); - clear_bit(JOURNAL_NEED_WRITE, &j->flags); - - buf = &j->buf[old.idx]; buf->data->u64s = cpu_to_le32(old.cur_entry_offset); - j->prev_buf_sectors = - vstruct_blocks_plus(buf->data, c->block_bits, - journal_entry_u64s_reserve(buf)) * - c->opts.block_size; - BUG_ON(j->prev_buf_sectors > j->cur_buf_sectors); + sectors = vstruct_blocks_plus(buf->data, c->block_bits, + buf->u64s_reserved) << 
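Nearly every journal state transition in this file has the shape bch2_journal_halt() shows: load the packed 64-bit reservation state, compute the successor, publish it with a compare-and-swap, and loop if another CPU raced in between. In C11 atomics (the kernel uses atomic64_cmpxchg; the field packing below is an assumption for illustration):

    #include <stdatomic.h>
    #include <stdint.h>
    #include <stdio.h>

    #define ENTRY_ERROR_VAL UINT32_MAX  /* assumed sentinel offset */

    /* low 32 bits: cur_entry_offset; high bits: other packed fields */
    static _Atomic uint64_t state;

    static void journal_halt(void)
    {
        uint64_t old = atomic_load(&state), new;

        do {
            if ((uint32_t) old == ENTRY_ERROR_VAL)
                return;     /* already halted */

            new = (old & ~(uint64_t) UINT32_MAX) | ENTRY_ERROR_VAL;
            /* on CAS failure, 'old' is reloaded and we retry: */
        } while (!atomic_compare_exchange_weak(&state, &old, new));
    }

    int main(void)
    {
        journal_halt();
        printf("offset = %#x\n", (unsigned) atomic_load(&state));
        return 0;
    }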
c->block_bits; + BUG_ON(sectors > buf->sectors); + buf->sectors = sectors; + + bkey_extent_init(&buf->key); - bch2_journal_reclaim_fast(j); - /* XXX: why set this here, and not in bch2_journal_write()? */ + /* + * We have to set last_seq here, _before_ opening a new journal entry: + * + * A threads may replace an old pin with a new pin on their current + * journal reservation - the expectation being that the journal will + * contain either what the old pin protected or what the new pin + * protects. + * + * After the old pin is dropped journal_last_seq() won't include the old + * pin, so we can only write the updated last_seq on the entry that + * contains whatever the new pin protects. + * + * Restated, we can _not_ update last_seq for a given entry if there + * could be a newer entry open with reservations/pins that have been + * taken against it. + * + * Hence, we want update/set last_seq on the current journal entry right + * before we open a new one: + */ buf->data->last_seq = cpu_to_le64(journal_last_seq(j)); + if (journal_entry_empty(buf->data)) + clear_bit(JOURNAL_NOT_EMPTY, &j->flags); + else + set_bit(JOURNAL_NOT_EMPTY, &j->flags); + journal_pin_new_entry(j, 1); bch2_journal_buf_init(j); cancel_delayed_work(&j->write_work); - spin_unlock(&j->lock); - - if (c->bucket_journal_seq > 1 << 14) { - c->bucket_journal_seq = 0; - bch2_bucket_seq_cleanup(c); - } - c->bucket_journal_seq++; + bch2_journal_space_available(j); - /* ugh - might be called from __journal_res_get() under wait_event() */ - __set_current_state(TASK_RUNNING); - bch2_journal_buf_put(j, old.idx, need_write_just_set); - - return JOURNAL_UNLOCKED; + bch2_journal_buf_put(j, old.idx, set_need_write); + return true; } -void bch2_journal_halt(struct journal *j) +static bool journal_entry_close(struct journal *j) { - union journal_res_state old, new; - u64 v = atomic64_read(&j->reservations.counter); - - do { - old.v = new.v = v; - if (old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL) - return; + bool ret; - new.cur_entry_offset = JOURNAL_ENTRY_ERROR_VAL; - } while ((v = atomic64_cmpxchg(&j->reservations.counter, - old.v, new.v)) != old.v); + spin_lock(&j->lock); + ret = __journal_entry_close(j); + spin_unlock(&j->lock); - journal_wake(j); - closure_wake_up(&journal_cur_buf(j)->wait); - closure_wake_up(&journal_prev_buf(j)->wait); + return ret; } /* @@ -174,49 +193,39 @@ void bch2_journal_halt(struct journal *j) * journal reservation - journal entry is open means journal is dirty: * * returns: - * 1: success - * 0: journal currently full (must wait) - * -EROFS: insufficient rw devices - * -EIO: journal error + * 0: success + * -ENOSPC: journal currently full, must invoke reclaim + * -EAGAIN: journal blocked, must wait + * -EROFS: insufficient rw devices or journal error */ static int journal_entry_open(struct journal *j) { struct journal_buf *buf = journal_cur_buf(j); union journal_res_state old, new; - ssize_t u64s; - int sectors; + int u64s; u64 v; lockdep_assert_held(&j->lock); BUG_ON(journal_entry_is_open(j)); - if (!fifo_free(&j->pin)) - return 0; - - sectors = bch2_journal_entry_sectors(j); - if (sectors <= 0) - return sectors; + if (j->blocked) + return -EAGAIN; - buf->disk_sectors = sectors; + if (j->cur_entry_error) + return j->cur_entry_error; - sectors = min_t(unsigned, sectors, buf->size >> 9); - j->cur_buf_sectors = sectors; + BUG_ON(!j->cur_entry_sectors); - u64s = (sectors << 9) / sizeof(u64); + buf->u64s_reserved = j->entry_u64s_reserved; + buf->disk_sectors = j->cur_entry_sectors; + buf->sectors = 
min(buf->disk_sectors, buf->buf_size >> 9); - /* Subtract the journal header */ - u64s -= sizeof(struct jset) / sizeof(u64); - /* - * Btree roots, prio pointers don't get added until right before we do - * the write: - */ - u64s -= journal_entry_u64s_reserve(buf); - u64s = max_t(ssize_t, 0L, u64s); - - BUG_ON(u64s >= JOURNAL_ENTRY_CLOSED_VAL); + u64s = (int) (buf->sectors << 9) / sizeof(u64) - + journal_entry_overhead(j); + u64s = clamp_t(int, u64s, 0, JOURNAL_ENTRY_CLOSED_VAL - 1); if (u64s <= le32_to_cpu(buf->data->u64s)) - return 0; + return -ENOSPC; /* * Must be set before marking the journal entry as open: @@ -228,10 +237,13 @@ static int journal_entry_open(struct journal *j) old.v = new.v = v; if (old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL) - return -EIO; + return -EROFS; /* Handle any already added entries */ new.cur_entry_offset = le32_to_cpu(buf->data->u64s); + + EBUG_ON(journal_state_count(new, new.idx)); + journal_state_inc(&new); } while ((v = atomic64_cmpxchg(&j->reservations.counter, old.v, new.v)) != old.v); @@ -244,37 +256,29 @@ static int journal_entry_open(struct journal *j) &j->write_work, msecs_to_jiffies(j->write_delay_ms)); journal_wake(j); - return 1; + return 0; } -/* - * returns true if there's nothing to flush and no journal write still in flight - */ -static bool journal_flush_write(struct journal *j) +static bool journal_quiesced(struct journal *j) { - bool ret; - - spin_lock(&j->lock); - ret = !j->reservations.prev_buf_unwritten; - - if (!journal_entry_is_open(j)) { - spin_unlock(&j->lock); - return ret; - } + union journal_res_state state = READ_ONCE(j->reservations); + bool ret = !state.prev_buf_unwritten && !__journal_entry_is_open(state); - set_bit(JOURNAL_NEED_WRITE, &j->flags); - if (journal_buf_switch(j, false) == JOURNAL_UNLOCKED) - ret = false; - else - spin_unlock(&j->lock); + if (!ret) + journal_entry_close(j); return ret; } +static void journal_quiesce(struct journal *j) +{ + wait_event(j->wait, journal_quiesced(j)); +} + static void journal_write_work(struct work_struct *work) { struct journal *j = container_of(work, struct journal, write_work.work); - journal_flush_write(j); + journal_entry_close(j); } /* @@ -302,26 +306,39 @@ u64 bch2_inode_journal_seq(struct journal *j, u64 inode) } static int __journal_res_get(struct journal *j, struct journal_res *res, - unsigned u64s_min, unsigned u64s_max) + unsigned flags) { struct bch_fs *c = container_of(j, struct bch_fs, journal); struct journal_buf *buf; + bool can_discard; int ret; retry: - ret = journal_res_get_fast(j, res, u64s_min, u64s_max); - if (ret) - return ret; + if (journal_res_get_fast(j, res, flags)) + return 0; + + if (bch2_journal_error(j)) + return -EROFS; spin_lock(&j->lock); + /* * Recheck after taking the lock, so we don't race with another thread * that just did journal_entry_open() and call journal_entry_close() * unnecessarily */ - ret = journal_res_get_fast(j, res, u64s_min, u64s_max); - if (ret) { + if (journal_res_get_fast(j, res, flags)) { spin_unlock(&j->lock); - return 1; + return 0; + } + + if (!(flags & JOURNAL_RES_GET_RESERVED) && + !test_bit(JOURNAL_MAY_GET_UNRESERVED, &j->flags)) { + /* + * Don't want to close current journal entry, just need to + * invoke reclaim: + */ + ret = -ENOSPC; + goto unlock; } /* @@ -331,51 +348,58 @@ retry: */ buf = journal_cur_buf(j); if (journal_entry_is_open(j) && - buf->size >> 9 < buf->disk_sectors && - buf->size < JOURNAL_ENTRY_SIZE_MAX) - j->buf_size_want = max(j->buf_size_want, buf->size << 1); + buf->buf_size >> 9 < 
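journal_entry_open() turns the buffer size into a budget in u64s: sectors times 512 over 8, minus the jset header and reserved entry space (journal_entry_overhead()), clamped to what the packed cur_entry_offset field can represent; if the keys already buffered meet that budget, the open fails with -ENOSPC. The arithmetic, with an assumed clamp value:

    #include <stdio.h>
    #include <stdint.h>

    #define SECTOR_BYTES 512
    #define CLOSED_VAL   ((1U << 20) - 1)   /* assumed max packed offset */

    static int entry_capacity_u64s(unsigned sectors, unsigned overhead_u64s)
    {
        int u64s = (int) ((sectors * SECTOR_BYTES) / sizeof(uint64_t))
                 - (int) overhead_u64s;

        /* clamp into what cur_entry_offset can represent */
        if (u64s < 0)
            u64s = 0;
        if (u64s > (int) CLOSED_VAL - 1)
            u64s = CLOSED_VAL - 1;
        return u64s;
    }

    int main(void)
    {
        /* e.g. 8 sectors = 4096 bytes = 512 u64s, minus overhead */
        printf("%d u64s usable\n", entry_capacity_u64s(8, 12)); /* 500 */
        return 0;
    }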
buf->disk_sectors && + buf->buf_size < JOURNAL_ENTRY_SIZE_MAX) + j->buf_size_want = max(j->buf_size_want, buf->buf_size << 1); - /* - * Close the current journal entry if necessary, then try to start a new - * one: - */ - switch (journal_buf_switch(j, false)) { - case JOURNAL_ENTRY_ERROR: - spin_unlock(&j->lock); - return -EROFS; - case JOURNAL_ENTRY_INUSE: - /* haven't finished writing out the previous one: */ - spin_unlock(&j->lock); + if (journal_entry_is_open(j) && + !__journal_entry_close(j)) { + /* + * We failed to get a reservation on the current open journal + * entry because it's full, and we can't close it because + * there's still a previous one in flight: + */ trace_journal_entry_full(c); - goto blocked; - case JOURNAL_ENTRY_CLOSED: - break; - case JOURNAL_UNLOCKED: - goto retry; + ret = -EAGAIN; + } else { + ret = journal_entry_open(j); } +unlock: + if ((ret == -EAGAIN || ret == -ENOSPC) && + !j->res_get_blocked_start) + j->res_get_blocked_start = local_clock() ?: 1; - /* We now have a new, closed journal buf - see if we can open it: */ - ret = journal_entry_open(j); + can_discard = j->can_discard; spin_unlock(&j->lock); - if (ret < 0) - return ret; - if (ret) + if (!ret) goto retry; - /* Journal's full, we have to wait */ + if (ret == -ENOSPC) { + BUG_ON(!can_discard && (flags & JOURNAL_RES_GET_RESERVED)); - /* - * Direct reclaim - can't rely on reclaim from work item - * due to freezing.. - */ - bch2_journal_reclaim_work(&j->reclaim_work.work); + /* + * Journal is full - can't rely on reclaim from work item due to + * freezing: + */ + trace_journal_full(c); - trace_journal_full(c); -blocked: - if (!j->res_get_blocked_start) - j->res_get_blocked_start = local_clock() ?: 1; - return 0; + if (!(flags & JOURNAL_RES_GET_NONBLOCK)) { + if (can_discard) { + bch2_journal_do_discards(j); + goto retry; + } + + if (mutex_trylock(&j->reclaim_lock)) { + bch2_journal_reclaim(j); + mutex_unlock(&j->reclaim_lock); + } + } + + ret = -EAGAIN; + } + + return ret; } /* @@ -389,16 +413,78 @@ blocked: * btree node write locks. */ int bch2_journal_res_get_slowpath(struct journal *j, struct journal_res *res, - unsigned u64s_min, unsigned u64s_max) + unsigned flags) { int ret; - wait_event(j->wait, - (ret = __journal_res_get(j, res, u64s_min, - u64s_max))); - return ret < 0 ? 
ret : 0; + closure_wait_event(&j->async_wait, + (ret = __journal_res_get(j, res, flags)) != -EAGAIN || + (flags & JOURNAL_RES_GET_NONBLOCK)); + return ret; } +/* journal_preres: */ + +static bool journal_preres_available(struct journal *j, + struct journal_preres *res, + unsigned new_u64s) +{ + bool ret = bch2_journal_preres_get_fast(j, res, new_u64s); + + if (!ret) + bch2_journal_reclaim_work(&j->reclaim_work.work); + + return ret; +} + +int __bch2_journal_preres_get(struct journal *j, + struct journal_preres *res, + unsigned new_u64s) +{ + int ret; + + closure_wait_event(&j->preres_wait, + (ret = bch2_journal_error(j)) || + journal_preres_available(j, res, new_u64s)); + return ret; +} + +/* journal_entry_res: */ + +void bch2_journal_entry_res_resize(struct journal *j, + struct journal_entry_res *res, + unsigned new_u64s) +{ + union journal_res_state state; + int d = new_u64s - res->u64s; + + spin_lock(&j->lock); + + j->entry_u64s_reserved += d; + if (d <= 0) + goto out; + + j->cur_entry_u64s = max_t(int, 0, j->cur_entry_u64s - d); + smp_mb(); + state = READ_ONCE(j->reservations); + + if (state.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL && + state.cur_entry_offset > j->cur_entry_u64s) { + j->cur_entry_u64s += d; + /* + * Not enough room in current journal entry, have to flush it: + */ + __journal_entry_close(j); + } else { + journal_cur_buf(j)->u64s_reserved += d; + } +out: + spin_unlock(&j->lock); + res->u64s += d; +} + +/* journal flushing: */ + u64 bch2_journal_last_unwritten_seq(struct journal *j) { u64 seq; @@ -420,30 +506,84 @@ u64 bch2_journal_last_unwritten_seq(struct journal *j) * btree root - every journal entry contains the roots of all the btrees, so it * doesn't need to bother with getting a journal reservation */ -int bch2_journal_open_seq_async(struct journal *j, u64 seq, struct closure *parent) +int bch2_journal_open_seq_async(struct journal *j, u64 seq, struct closure *cl) { + struct bch_fs *c = container_of(j, struct bch_fs, journal); int ret; spin_lock(&j->lock); - BUG_ON(seq > journal_cur_seq(j)); - if (seq < journal_cur_seq(j) || + /* + * Can't try to open more than one sequence number ahead: + */ + BUG_ON(journal_cur_seq(j) < seq && !journal_entry_is_open(j)); + + if (journal_cur_seq(j) > seq || journal_entry_is_open(j)) { spin_unlock(&j->lock); - return 1; + return 0; } - ret = journal_entry_open(j); - if (!ret) - closure_wait(&j->async_wait, parent); + if (journal_cur_seq(j) < seq && + !__journal_entry_close(j)) { + /* haven't finished writing out the previous one: */ + trace_journal_entry_full(c); + ret = -EAGAIN; + } else { + BUG_ON(journal_cur_seq(j) != seq); + + ret = journal_entry_open(j); + } + + if ((ret == -EAGAIN || ret == -ENOSPC) && + !j->res_get_blocked_start) + j->res_get_blocked_start = local_clock() ?: 1; + + if (ret == -EAGAIN || ret == -ENOSPC) + closure_wait(&j->async_wait, cl); + spin_unlock(&j->lock); - if (!ret) + if (ret == -ENOSPC) { + trace_journal_full(c); bch2_journal_reclaim_work(&j->reclaim_work.work); + ret = -EAGAIN; + } return ret; } +static int journal_seq_error(struct journal *j, u64 seq) +{ + union journal_res_state state = READ_ONCE(j->reservations); + + if (seq == journal_cur_seq(j)) + return bch2_journal_error(j); + + if (seq + 1 == journal_cur_seq(j) && + !state.prev_buf_unwritten && + seq > j->seq_ondisk) + return -EIO; + + return 0; +} + +static inline struct journal_buf * +journal_seq_to_buf(struct journal *j, u64 seq) +{ + /* seq should be for a journal entry that has been opened: */ + BUG_ON(seq > journal_cur_seq(j)); + 
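/*
 * Illustration only, not part of the patch: one way a caller might
 * drive the flags-based reservation API above. The convention is now
 * 0 on success and a negative error code on failure; with
 * JOURNAL_RES_GET_NONBLOCK the slowpath fails with -EAGAIN instead of
 * sleeping in closure_wait_event(). example_journal_append() and the
 * nonblocking-then-blocking pattern are hypothetical.
 */
static int example_journal_append(struct journal *j, struct bkey_i *k)
{
	struct journal_res res;
	int ret;

	memset(&res, 0, sizeof(res));

	/* opportunistic attempt; may fail with -EAGAIN: */
	ret = bch2_journal_res_get(j, &res, jset_u64s(k->k.u64s),
				   JOURNAL_RES_GET_NONBLOCK);
	if (ret == -EAGAIN)
		/* blocking attempt; waits until the journal unblocks: */
		ret = bch2_journal_res_get(j, &res, jset_u64s(k->k.u64s), 0);
	if (ret)
		return ret;

	/* ... copy keys into the reserved space here ... */

	bch2_journal_res_put(j, &res);
	return 0;
}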
BUG_ON(seq == journal_cur_seq(j) && + j->reservations.cur_entry_offset == JOURNAL_ENTRY_CLOSED_VAL); + + if (seq == journal_cur_seq(j)) + return journal_cur_buf(j); + if (seq + 1 == journal_cur_seq(j) && + j->reservations.prev_buf_unwritten) + return journal_prev_buf(j); + return NULL; +} + /** * bch2_journal_wait_on_seq - wait for a journal entry to be written * @@ -452,31 +592,22 @@ int bch2_journal_open_seq_async(struct journal *j, u64 seq, struct closure *pare * can wait for an arbitrary amount of time (up to @j->write_delay_ms, which is * configurable). */ -void bch2_journal_wait_on_seq(struct journal *j, u64 seq, struct closure *parent) +void bch2_journal_wait_on_seq(struct journal *j, u64 seq, + struct closure *parent) { - spin_lock(&j->lock); - - BUG_ON(seq > journal_cur_seq(j)); + struct journal_buf *buf; - if (bch2_journal_error(j)) { - spin_unlock(&j->lock); - return; - } + spin_lock(&j->lock); - if (seq == journal_cur_seq(j)) { - if (!closure_wait(&journal_cur_buf(j)->wait, parent)) - BUG(); - } else if (seq + 1 == journal_cur_seq(j) && - j->reservations.prev_buf_unwritten) { - if (!closure_wait(&journal_prev_buf(j)->wait, parent)) + if ((buf = journal_seq_to_buf(j, seq))) { + if (!closure_wait(&buf->wait, parent)) BUG(); - smp_mb(); - - /* check if raced with write completion (or failure) */ - if (!j->reservations.prev_buf_unwritten || - bch2_journal_error(j)) - closure_wake_up(&journal_prev_buf(j)->wait); + if (seq == journal_cur_seq(j)) { + smp_mb(); + if (bch2_journal_error(j)) + closure_wake_up(&buf->wait); + } } spin_unlock(&j->lock); @@ -488,107 +619,32 @@ void bch2_journal_wait_on_seq(struct journal *j, u64 seq, struct closure *parent * like bch2_journal_wait_on_seq, except that it triggers a write immediately if * necessary */ -void bch2_journal_flush_seq_async(struct journal *j, u64 seq, struct closure *parent) +void bch2_journal_flush_seq_async(struct journal *j, u64 seq, + struct closure *parent) { struct journal_buf *buf; spin_lock(&j->lock); - BUG_ON(seq > journal_cur_seq(j)); - - if (bch2_journal_error(j)) { - spin_unlock(&j->lock); - return; - } - - if (seq == journal_cur_seq(j)) { - bool set_need_write = false; - - buf = journal_cur_buf(j); - - if (parent && !closure_wait(&buf->wait, parent)) - BUG(); - - if (!test_and_set_bit(JOURNAL_NEED_WRITE, &j->flags)) { - j->need_write_time = local_clock(); - set_need_write = true; - } - - switch (journal_buf_switch(j, set_need_write)) { - case JOURNAL_ENTRY_ERROR: - if (parent) - closure_wake_up(&buf->wait); - break; - case JOURNAL_ENTRY_CLOSED: - /* - * Journal entry hasn't been opened yet, but caller - * claims it has something - */ - BUG(); - case JOURNAL_ENTRY_INUSE: - break; - case JOURNAL_UNLOCKED: - return; - } - } else if (parent && - seq + 1 == journal_cur_seq(j) && - j->reservations.prev_buf_unwritten) { - buf = journal_prev_buf(j); - + if (parent && + (buf = journal_seq_to_buf(j, seq))) if (!closure_wait(&buf->wait, parent)) BUG(); - smp_mb(); - - /* check if raced with write completion (or failure) */ - if (!j->reservations.prev_buf_unwritten || - bch2_journal_error(j)) - closure_wake_up(&buf->wait); - } - + if (seq == journal_cur_seq(j)) + __journal_entry_close(j); spin_unlock(&j->lock); } static int journal_seq_flushed(struct journal *j, u64 seq) { - struct journal_buf *buf; - int ret = 1; + int ret; spin_lock(&j->lock); - BUG_ON(seq > journal_cur_seq(j)); - - if (seq == journal_cur_seq(j)) { - bool set_need_write = false; - - ret = 0; - - buf = journal_cur_buf(j); - - if 
(!test_and_set_bit(JOURNAL_NEED_WRITE, &j->flags)) { - j->need_write_time = local_clock(); - set_need_write = true; - } - - switch (journal_buf_switch(j, set_need_write)) { - case JOURNAL_ENTRY_ERROR: - ret = -EIO; - break; - case JOURNAL_ENTRY_CLOSED: - /* - * Journal entry hasn't been opened yet, but caller - * claims it has something - */ - BUG(); - case JOURNAL_ENTRY_INUSE: - break; - case JOURNAL_UNLOCKED: - return 0; - } - } else if (seq + 1 == journal_cur_seq(j) && - j->reservations.prev_buf_unwritten) { - ret = bch2_journal_error(j); - } + ret = seq <= j->seq_ondisk ? 1 : journal_seq_error(j, seq); + if (seq == journal_cur_seq(j)) + __journal_entry_close(j); spin_unlock(&j->lock); return ret; @@ -612,11 +668,10 @@ int bch2_journal_flush_seq(struct journal *j, u64 seq) void bch2_journal_meta_async(struct journal *j, struct closure *parent) { struct journal_res res; - unsigned u64s = jset_u64s(0); memset(&res, 0, sizeof(res)); - bch2_journal_res_get(j, &res, u64s, u64s); + bch2_journal_res_get(j, &res, jset_u64s(0), 0); bch2_journal_res_put(j, &res); bch2_journal_flush_seq_async(j, res.seq, parent); @@ -625,12 +680,11 @@ void bch2_journal_meta_async(struct journal *j, struct closure *parent) int bch2_journal_meta(struct journal *j) { struct journal_res res; - unsigned u64s = jset_u64s(0); int ret; memset(&res, 0, sizeof(res)); - ret = bch2_journal_res_get(j, &res, u64s, u64s); + ret = bch2_journal_res_get(j, &res, jset_u64s(0), 0); if (ret) return ret; @@ -683,6 +737,26 @@ int bch2_journal_flush(struct journal *j) return bch2_journal_flush_seq(j, seq); } +/* block/unlock the journal: */ + +void bch2_journal_unblock(struct journal *j) +{ + spin_lock(&j->lock); + j->blocked--; + spin_unlock(&j->lock); + + journal_wake(j); +} + +void bch2_journal_block(struct journal *j) +{ + spin_lock(&j->lock); + j->blocked++; + spin_unlock(&j->lock); + + journal_quiesce(j); +} + /* allocate journal on a device: */ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, @@ -705,10 +779,14 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, goto err; journal_buckets = bch2_sb_resize_journal(&ca->disk_sb, - nr + sizeof(*journal_buckets) / sizeof(u64)); + nr + sizeof(*journal_buckets) / sizeof(u64)); if (!journal_buckets) goto err; + /* + * We may be called from the device add path, before the new device has + * actually been added to the running filesystem: + */ if (c) spin_lock(&c->journal.lock); @@ -722,58 +800,58 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, while (ja->nr < nr) { struct open_bucket *ob = NULL; + unsigned pos; long bucket; if (new_fs) { - percpu_down_read_preempt_disable(&c->usage_lock); bucket = bch2_bucket_alloc_new_fs(ca); - percpu_up_read_preempt_enable(&c->usage_lock); - if (bucket < 0) { ret = -ENOSPC; goto err; } } else { - int ob_idx = bch2_bucket_alloc(c, ca, RESERVE_ALLOC, false, cl); - if (ob_idx < 0) { + ob = bch2_bucket_alloc(c, ca, RESERVE_ALLOC, + false, cl); + if (IS_ERR(ob)) { ret = cl ? -EAGAIN : -ENOSPC; goto err; } - ob = c->open_buckets + ob_idx; bucket = sector_to_bucket(ca, ob->ptr.offset); } if (c) { - percpu_down_read_preempt_disable(&c->usage_lock); + percpu_down_read(&c->mark_lock); spin_lock(&c->journal.lock); } - __array_insert_item(ja->buckets, ja->nr, ja->last_idx); - __array_insert_item(ja->bucket_seq, ja->nr, ja->last_idx); - __array_insert_item(journal_buckets->buckets, ja->nr, ja->last_idx); + pos = ja->nr ? 
(ja->cur_idx + 1) % ja->nr : 0; + __array_insert_item(ja->buckets, ja->nr, pos); + __array_insert_item(ja->bucket_seq, ja->nr, pos); + __array_insert_item(journal_buckets->buckets, ja->nr, pos); + ja->nr++; - ja->buckets[ja->last_idx] = bucket; - ja->bucket_seq[ja->last_idx] = 0; - journal_buckets->buckets[ja->last_idx] = cpu_to_le64(bucket); + ja->buckets[pos] = bucket; + ja->bucket_seq[pos] = 0; + journal_buckets->buckets[pos] = cpu_to_le64(bucket); - if (ja->last_idx < ja->nr) { - if (ja->cur_idx >= ja->last_idx) - ja->cur_idx++; - ja->last_idx++; - } - ja->nr++; + if (pos <= ja->discard_idx) + ja->discard_idx = (ja->discard_idx + 1) % ja->nr; + if (pos <= ja->dirty_idx_ondisk) + ja->dirty_idx_ondisk = (ja->dirty_idx_ondisk + 1) % ja->nr; + if (pos <= ja->dirty_idx) + ja->dirty_idx = (ja->dirty_idx + 1) % ja->nr; + if (pos <= ja->cur_idx) + ja->cur_idx = (ja->cur_idx + 1) % ja->nr; bch2_mark_metadata_bucket(c, ca, bucket, BCH_DATA_JOURNAL, - ca->mi.bucket_size, - gc_phase(GC_PHASE_SB), - new_fs - ? BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE - : 0); + ca->mi.bucket_size, + gc_phase(GC_PHASE_SB), + 0); if (c) { spin_unlock(&c->journal.lock); - percpu_up_read_preempt_enable(&c->usage_lock); + percpu_up_read(&c->mark_lock); } if (!new_fs) @@ -818,7 +896,7 @@ int bch2_set_nr_journal_buckets(struct bch_fs *c, struct bch_dev *ca, */ if (bch2_disk_reservation_get(c, &disk_res, - bucket_to_sector(ca, nr - ja->nr), 1, 0)) { + bucket_to_sector(ca, nr - ja->nr), 1, 0)) { mutex_unlock(&c->sb_lock); return -ENOSPC; } @@ -875,54 +953,90 @@ static bool bch2_journal_writing_to_device(struct journal *j, unsigned dev_idx) void bch2_dev_journal_stop(struct journal *j, struct bch_dev *ca) { - spin_lock(&j->lock); - bch2_extent_drop_device(bkey_i_to_s_extent(&j->key), ca->dev_idx); - spin_unlock(&j->lock); - wait_event(j->wait, !bch2_journal_writing_to_device(j, ca->dev_idx)); } void bch2_fs_journal_stop(struct journal *j) { - wait_event(j->wait, journal_flush_write(j)); + struct bch_fs *c = container_of(j, struct bch_fs, journal); + + bch2_journal_flush_all_pins(j); + + wait_event(j->wait, journal_entry_close(j)); + + /* do we need to write another journal entry? 
*/ + if (test_bit(JOURNAL_NOT_EMPTY, &j->flags) || + c->btree_roots_dirty) + bch2_journal_meta(j); + + journal_quiesce(j); + + BUG_ON(!bch2_journal_error(j) && + test_bit(JOURNAL_NOT_EMPTY, &j->flags)); cancel_delayed_work_sync(&j->write_work); cancel_delayed_work_sync(&j->reclaim_work); } -void bch2_fs_journal_start(struct journal *j) +int bch2_fs_journal_start(struct journal *j, u64 cur_seq, + struct list_head *journal_entries) { - struct journal_seq_blacklist *bl; - u64 blacklist = 0; + struct bch_fs *c = container_of(j, struct bch_fs, journal); + struct journal_entry_pin_list *p; + struct journal_replay *i; + u64 last_seq = cur_seq, nr, seq; + + if (!list_empty(journal_entries)) + last_seq = le64_to_cpu(list_first_entry(journal_entries, + struct journal_replay, + list)->j.seq); + + nr = cur_seq - last_seq; + + if (nr + 1 > j->pin.size) { + free_fifo(&j->pin); + init_fifo(&j->pin, roundup_pow_of_two(nr + 1), GFP_KERNEL); + if (!j->pin.data) { + bch_err(c, "error reallocating journal fifo (%llu open entries)", nr); + return -ENOMEM; + } + } + + j->replay_journal_seq = last_seq; + j->replay_journal_seq_end = cur_seq; + j->last_seq_ondisk = last_seq; + j->pin.front = last_seq; + j->pin.back = cur_seq; + atomic64_set(&j->seq, cur_seq - 1); + + fifo_for_each_entry_ptr(p, &j->pin, seq) { + INIT_LIST_HEAD(&p->list); + INIT_LIST_HEAD(&p->flushed); + atomic_set(&p->count, 1); + p->devs.nr = 0; + } + + list_for_each_entry(i, journal_entries, list) { + seq = le64_to_cpu(i->j.seq); - list_for_each_entry(bl, &j->seq_blacklist, list) - blacklist = max(blacklist, bl->end); + BUG_ON(seq < last_seq || seq >= cur_seq); + + journal_seq_pin(j, seq)->devs = i->devs; + } spin_lock(&j->lock); set_bit(JOURNAL_STARTED, &j->flags); - while (journal_cur_seq(j) < blacklist) - journal_pin_new_entry(j, 0); - - /* - * journal_buf_switch() only inits the next journal entry when it - * closes an open journal entry - the very first journal entry gets - * initialized here: - */ journal_pin_new_entry(j, 1); bch2_journal_buf_init(j); - spin_unlock(&j->lock); + c->last_bucket_seq_cleanup = journal_cur_seq(j); - /* - * Adding entries to the next journal entry before allocating space on - * disk for the next journal entry - this is ok, because these entries - * only have to go down with the next journal entry we write: - */ - bch2_journal_seq_blacklist_write(j); + bch2_journal_space_available(j); + spin_unlock(&j->lock); - queue_delayed_work(system_freezable_wq, &j->reclaim_work, 0); + return 0; } /* init/exit: */ @@ -968,8 +1082,8 @@ int bch2_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb) void bch2_fs_journal_exit(struct journal *j) { - kvpfree(j->buf[1].data, j->buf[1].size); - kvpfree(j->buf[0].data, j->buf[0].size); + kvpfree(j->buf[1].data, j->buf[1].buf_size); + kvpfree(j->buf[0].data, j->buf[0].buf_size); free_fifo(&j->pin); } @@ -986,26 +1100,28 @@ int bch2_fs_journal_init(struct journal *j) init_waitqueue_head(&j->wait); INIT_DELAYED_WORK(&j->write_work, journal_write_work); INIT_DELAYED_WORK(&j->reclaim_work, bch2_journal_reclaim_work); - mutex_init(&j->blacklist_lock); - INIT_LIST_HEAD(&j->seq_blacklist); + init_waitqueue_head(&j->pin_flush_wait); mutex_init(&j->reclaim_lock); + mutex_init(&j->discard_lock); lockdep_init_map(&j->res_map, "journal res", &res_key, 0); - j->buf[0].size = JOURNAL_ENTRY_SIZE_MIN; - j->buf[1].size = JOURNAL_ENTRY_SIZE_MIN; + j->buf[0].buf_size = JOURNAL_ENTRY_SIZE_MIN; + j->buf[1].buf_size = JOURNAL_ENTRY_SIZE_MIN; j->write_delay_ms = 1000; j->reclaim_delay_ms = 100; - 
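/*
 * Worked example, illustrative only: suppose the next entry to be
 * written will get seq 1001 (cur_seq) and the oldest entry that must
 * be kept has seq 990 (last_seq). Then nr = cur_seq - last_seq = 11
 * open entries need tracking, so bch2_fs_journal_start() above sizes
 * the pin fifo to roundup_pow_of_two(nr + 1) = 16 slots and sets
 * pin.front = 990, pin.back = 1001.
 */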
bkey_extent_init(&j->key); + /* Btree roots: */ + j->entry_u64s_reserved += + BTREE_ID_NR * (JSET_KEYS_U64s + BKEY_EXTENT_U64s_MAX); atomic64_set(&j->reservations.counter, ((union journal_res_state) { .cur_entry_offset = JOURNAL_ENTRY_CLOSED_VAL }).v); if (!(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL)) || - !(j->buf[0].data = kvpmalloc(j->buf[0].size, GFP_KERNEL)) || - !(j->buf[1].data = kvpmalloc(j->buf[1].size, GFP_KERNEL))) { + !(j->buf[0].data = kvpmalloc(j->buf[0].buf_size, GFP_KERNEL)) || + !(j->buf[1].data = kvpmalloc(j->buf[1].buf_size, GFP_KERNEL))) { ret = -ENOMEM; goto out; } @@ -1020,38 +1136,63 @@ out: ssize_t bch2_journal_print_debug(struct journal *j, char *buf) { + struct printbuf out = _PBUF(buf, PAGE_SIZE); struct bch_fs *c = container_of(j, struct bch_fs, journal); - union journal_res_state *s = &j->reservations; + union journal_res_state s; struct bch_dev *ca; unsigned iter; - ssize_t ret = 0; rcu_read_lock(); spin_lock(&j->lock); + s = READ_ONCE(j->reservations); + + pr_buf(&out, + "active journal entries:\t%llu\n" + "seq:\t\t\t%llu\n" + "last_seq:\t\t%llu\n" + "last_seq_ondisk:\t%llu\n" + "prereserved:\t\t%u/%u\n" + "current entry sectors:\t%u\n" + "current entry:\t\t", + fifo_used(&j->pin), + journal_cur_seq(j), + journal_last_seq(j), + j->last_seq_ondisk, + j->prereserved.reserved, + j->prereserved.remaining, + j->cur_entry_sectors); + + switch (s.cur_entry_offset) { + case JOURNAL_ENTRY_ERROR_VAL: + pr_buf(&out, "error\n"); + break; + case JOURNAL_ENTRY_CLOSED_VAL: + pr_buf(&out, "closed\n"); + break; + default: + pr_buf(&out, "%u/%u\n", + s.cur_entry_offset, + j->cur_entry_u64s); + break; + } - ret += scnprintf(buf + ret, PAGE_SIZE - ret, - "active journal entries:\t%llu\n" - "seq:\t\t\t%llu\n" - "last_seq:\t\t%llu\n" - "last_seq_ondisk:\t%llu\n" - "reservation count:\t%u\n" - "reservation offset:\t%u\n" - "current entry u64s:\t%u\n" - "io in flight:\t\t%i\n" - "need write:\t\t%i\n" - "dirty:\t\t\t%i\n" - "replay done:\t\t%i\n", - fifo_used(&j->pin), - journal_cur_seq(j), - journal_last_seq(j), - j->last_seq_ondisk, - journal_state_count(*s, s->idx), - s->cur_entry_offset, - j->cur_entry_u64s, - s->prev_buf_unwritten, - test_bit(JOURNAL_NEED_WRITE, &j->flags), - journal_entry_is_open(j), - test_bit(JOURNAL_REPLAY_DONE, &j->flags)); + pr_buf(&out, + "current entry refs:\t%u\n" + "prev entry unwritten:\t", + journal_state_count(s, s.idx)); + + if (s.prev_buf_unwritten) + pr_buf(&out, "yes, ref %u sectors %u\n", + journal_state_count(s, !s.idx), + journal_prev_buf(j)->sectors); + else + pr_buf(&out, "no\n"); + + pr_buf(&out, + "need write:\t\t%i\n" + "replay done:\t\t%i\n", + test_bit(JOURNAL_NEED_WRITE, &j->flags), + test_bit(JOURNAL_REPLAY_DONE, &j->flags)); for_each_member_device_rcu(ca, c, iter, &c->rw_devs[BCH_DATA_JOURNAL]) { @@ -1060,50 +1201,53 @@ ssize_t bch2_journal_print_debug(struct journal *j, char *buf) if (!ja->nr) continue; - ret += scnprintf(buf + ret, PAGE_SIZE - ret, - "dev %u:\n" - "\tnr\t\t%u\n" - "\tcur_idx\t\t%u (seq %llu)\n" - "\tlast_idx\t%u (seq %llu)\n", - iter, ja->nr, - ja->cur_idx, ja->bucket_seq[ja->cur_idx], - ja->last_idx, ja->bucket_seq[ja->last_idx]); + pr_buf(&out, + "dev %u:\n" + "\tnr\t\t%u\n" + "\tavailable\t%u:%u\n" + "\tdiscard_idx\t\t%u\n" + "\tdirty_idx_ondisk\t%u (seq %llu)\n" + "\tdirty_idx\t\t%u (seq %llu)\n" + "\tcur_idx\t\t%u (seq %llu)\n", + iter, ja->nr, + bch2_journal_dev_buckets_available(j, ja, journal_space_discarded), + ja->sectors_free, + ja->discard_idx, + ja->dirty_idx_ondisk, 
ja->bucket_seq[ja->dirty_idx_ondisk], + ja->dirty_idx, ja->bucket_seq[ja->dirty_idx], + ja->cur_idx, ja->bucket_seq[ja->cur_idx]); } spin_unlock(&j->lock); rcu_read_unlock(); - return ret; + return out.pos - buf; } ssize_t bch2_journal_print_pins(struct journal *j, char *buf) { + struct printbuf out = _PBUF(buf, PAGE_SIZE); struct journal_entry_pin_list *pin_list; struct journal_entry_pin *pin; - ssize_t ret = 0; u64 i; spin_lock(&j->lock); fifo_for_each_entry_ptr(pin_list, &j->pin, i) { - ret += scnprintf(buf + ret, PAGE_SIZE - ret, - "%llu: count %u\n", - i, atomic_read(&pin_list->count)); + pr_buf(&out, "%llu: count %u\n", + i, atomic_read(&pin_list->count)); list_for_each_entry(pin, &pin_list->list, list) - ret += scnprintf(buf + ret, PAGE_SIZE - ret, - "\t%p %pf\n", - pin, pin->flush); + pr_buf(&out, "\t%p %pf\n", + pin, pin->flush); if (!list_empty(&pin_list->flushed)) - ret += scnprintf(buf + ret, PAGE_SIZE - ret, - "flushed:\n"); + pr_buf(&out, "flushed:\n"); list_for_each_entry(pin, &pin_list->flushed, list) - ret += scnprintf(buf + ret, PAGE_SIZE - ret, - "\t%p %pf\n", - pin, pin->flush); + pr_buf(&out, "\t%p %pf\n", + pin, pin->flush); } spin_unlock(&j->lock); - return ret; + return out.pos - buf; } diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h index 6759810b19ef..ec5ba2b9ef42 100644 --- a/fs/bcachefs/journal.h +++ b/fs/bcachefs/journal.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _BCACHEFS_JOURNAL_H #define _BCACHEFS_JOURNAL_H @@ -118,6 +119,7 @@ static inline void journal_wake(struct journal *j) { wake_up(&j->wait); closure_wake_up(&j->async_wait); + closure_wake_up(&j->preres_wait); } static inline struct journal_buf *journal_cur_buf(struct journal *j) @@ -178,6 +180,11 @@ static inline unsigned jset_u64s(unsigned u64s) return u64s + sizeof(struct jset_entry) / sizeof(u64); } +static inline int journal_entry_overhead(struct journal *j) +{ + return sizeof(struct jset) / sizeof(u64) + j->entry_u64s_reserved; +} + static inline struct jset_entry * bch2_journal_add_entry_noreservation(struct journal_buf *buf, size_t u64s) { @@ -222,7 +229,20 @@ static inline void bch2_journal_add_keys(struct journal *j, struct journal_res * id, 0, k, k->k.u64s); } -void bch2_journal_buf_put_slowpath(struct journal *, bool); +static inline bool journal_entry_empty(struct jset *j) +{ + struct jset_entry *i; + + if (j->seq != j->last_seq) + return false; + + vstruct_for_each(j, i) + if (i->type == BCH_JSET_ENTRY_btree_keys && i->u64s) + return false; + return true; +} + +void __bch2_journal_buf_put(struct journal *, bool); static inline void bch2_journal_buf_put(struct journal *j, unsigned idx, bool need_write_just_set) @@ -233,17 +253,10 @@ static inline void bch2_journal_buf_put(struct journal *j, unsigned idx, .buf0_count = idx == 0, .buf1_count = idx == 1, }).v, &j->reservations.counter); - - EBUG_ON(s.idx != idx && !s.prev_buf_unwritten); - - /* - * Do not initiate a journal write if the journal is in an error state - * (previous journal entry write may have failed) - */ - if (s.idx != idx && - !journal_state_count(s, idx) && - s.cur_entry_offset != JOURNAL_ENTRY_ERROR_VAL) - bch2_journal_buf_put_slowpath(j, need_write_just_set); + if (!journal_state_count(s, idx)) { + EBUG_ON(s.idx == idx || !s.prev_buf_unwritten); + __bch2_journal_buf_put(j, need_write_just_set); + } } /* @@ -269,12 +282,15 @@ static inline void bch2_journal_res_put(struct journal *j, } int bch2_journal_res_get_slowpath(struct journal *, struct journal_res *, - unsigned, unsigned); + 
unsigned); + +#define JOURNAL_RES_GET_NONBLOCK (1 << 0) +#define JOURNAL_RES_GET_CHECK (1 << 1) +#define JOURNAL_RES_GET_RESERVED (1 << 2) static inline int journal_res_get_fast(struct journal *j, struct journal_res *res, - unsigned u64s_min, - unsigned u64s_max) + unsigned flags) { union journal_res_state old, new; u64 v = atomic64_read(&j->reservations.counter); @@ -286,45 +302,143 @@ static inline int journal_res_get_fast(struct journal *j, * Check if there is still room in the current journal * entry: */ - if (old.cur_entry_offset + u64s_min > j->cur_entry_u64s) + if (new.cur_entry_offset + res->u64s > j->cur_entry_u64s) return 0; - res->offset = old.cur_entry_offset; - res->u64s = min(u64s_max, j->cur_entry_u64s - - old.cur_entry_offset); + EBUG_ON(!journal_state_count(new, new.idx)); + + if (!(flags & JOURNAL_RES_GET_RESERVED) && + !test_bit(JOURNAL_MAY_GET_UNRESERVED, &j->flags)) + return 0; + + if (flags & JOURNAL_RES_GET_CHECK) + return 1; - journal_state_inc(&new); new.cur_entry_offset += res->u64s; + journal_state_inc(&new); } while ((v = atomic64_cmpxchg(&j->reservations.counter, old.v, new.v)) != old.v); - res->ref = true; - res->idx = new.idx; - res->seq = le64_to_cpu(j->buf[res->idx].data->seq); + res->ref = true; + res->idx = old.idx; + res->offset = old.cur_entry_offset; + res->seq = le64_to_cpu(j->buf[old.idx].data->seq); return 1; } static inline int bch2_journal_res_get(struct journal *j, struct journal_res *res, - unsigned u64s_min, unsigned u64s_max) + unsigned u64s, unsigned flags) { int ret; EBUG_ON(res->ref); - EBUG_ON(u64s_max < u64s_min); EBUG_ON(!test_bit(JOURNAL_STARTED, &j->flags)); - if (journal_res_get_fast(j, res, u64s_min, u64s_max)) + res->u64s = u64s; + + if (journal_res_get_fast(j, res, flags)) goto out; - ret = bch2_journal_res_get_slowpath(j, res, u64s_min, u64s_max); + ret = bch2_journal_res_get_slowpath(j, res, flags); if (ret) return ret; out: - lock_acquire_shared(&j->res_map, 0, 0, NULL, _THIS_IP_); - EBUG_ON(!res->ref); + if (!(flags & JOURNAL_RES_GET_CHECK)) { + lock_acquire_shared(&j->res_map, 0, 0, NULL, _THIS_IP_); + EBUG_ON(!res->ref); + } return 0; } +/* journal_preres: */ + +static inline bool journal_check_may_get_unreserved(struct journal *j) +{ + union journal_preres_state s = READ_ONCE(j->prereserved); + bool ret = s.reserved <= s.remaining && + fifo_free(&j->pin) > 8; + + lockdep_assert_held(&j->lock); + + if (ret != test_bit(JOURNAL_MAY_GET_UNRESERVED, &j->flags)) { + if (ret) { + set_bit(JOURNAL_MAY_GET_UNRESERVED, &j->flags); + journal_wake(j); + } else { + clear_bit(JOURNAL_MAY_GET_UNRESERVED, &j->flags); + } + } + return ret; +} + +static inline void bch2_journal_preres_put(struct journal *j, + struct journal_preres *res) +{ + union journal_preres_state s = { .reserved = res->u64s }; + + if (!res->u64s) + return; + + s.v = atomic64_sub_return(s.v, &j->prereserved.counter); + res->u64s = 0; + closure_wake_up(&j->preres_wait); + + if (s.reserved <= s.remaining && + !test_bit(JOURNAL_MAY_GET_UNRESERVED, &j->flags)) { + spin_lock(&j->lock); + journal_check_may_get_unreserved(j); + spin_unlock(&j->lock); + } +} + +int __bch2_journal_preres_get(struct journal *, + struct journal_preres *, unsigned); + +static inline int bch2_journal_preres_get_fast(struct journal *j, + struct journal_preres *res, + unsigned new_u64s) +{ + int d = new_u64s - res->u64s; + union journal_preres_state old, new; + u64 v = atomic64_read(&j->prereserved.counter); + + do { + old.v = new.v = v; + + new.reserved += d; + + if (new.reserved > new.remaining) + 
return 0; + } while ((v = atomic64_cmpxchg(&j->prereserved.counter, + old.v, new.v)) != old.v); + + res->u64s += d; + return 1; +} + +static inline int bch2_journal_preres_get(struct journal *j, + struct journal_preres *res, + unsigned new_u64s, + unsigned flags) +{ + if (new_u64s <= res->u64s) + return 0; + + if (bch2_journal_preres_get_fast(j, res, new_u64s)) + return 0; + + if (flags & JOURNAL_RES_GET_NONBLOCK) + return -EAGAIN; + + return __bch2_journal_preres_get(j, res, new_u64s); +} + +/* journal_entry_res: */ + +void bch2_journal_entry_res_resize(struct journal *, + struct journal_entry_res *, + unsigned); + u64 bch2_journal_last_unwritten_seq(struct journal *); int bch2_journal_open_seq_async(struct journal *, u64, struct closure *); @@ -352,16 +466,15 @@ static inline bool journal_flushes_device(struct bch_dev *ca) return true; } -int bch2_journal_mark(struct bch_fs *, struct list_head *); -void bch2_journal_entries_free(struct list_head *); -int bch2_journal_replay(struct bch_fs *, struct list_head *); - static inline void bch2_journal_set_replay_done(struct journal *j) { BUG_ON(!test_bit(JOURNAL_STARTED, &j->flags)); set_bit(JOURNAL_REPLAY_DONE, &j->flags); } +void bch2_journal_unblock(struct journal *); +void bch2_journal_block(struct journal *); + ssize_t bch2_journal_print_debug(struct journal *, char *); ssize_t bch2_journal_print_pins(struct journal *, char *); @@ -370,8 +483,10 @@ int bch2_set_nr_journal_buckets(struct bch_fs *, struct bch_dev *, int bch2_dev_journal_alloc(struct bch_dev *); void bch2_dev_journal_stop(struct journal *, struct bch_dev *); + void bch2_fs_journal_stop(struct journal *); -void bch2_fs_journal_start(struct journal *); +int bch2_fs_journal_start(struct journal *, u64, struct list_head *); + void bch2_dev_journal_exit(struct bch_dev *); int bch2_dev_journal_init(struct bch_dev *, struct bch_sb *); void bch2_fs_journal_exit(struct journal *); diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index 36ba6a4daf84..af135e263a3f 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -1,49 +1,16 @@ +// SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" -#include "alloc.h" -#include "btree_gc.h" -#include "btree_update.h" +#include "alloc_foreground.h" #include "buckets.h" #include "checksum.h" #include "error.h" #include "journal.h" #include "journal_io.h" #include "journal_reclaim.h" -#include "journal_seq_blacklist.h" #include "replicas.h" #include <trace/events/bcachefs.h> -static struct jset_entry *bch2_journal_find_entry(struct jset *j, unsigned type, - enum btree_id id) -{ - struct jset_entry *entry; - - for_each_jset_entry_type(entry, j, type) - if (entry->btree_id == id) - return entry; - - return NULL; -} - -struct bkey_i *bch2_journal_find_btree_root(struct bch_fs *c, struct jset *j, - enum btree_id id, unsigned *level) -{ - struct bkey_i *k; - struct jset_entry *entry = - bch2_journal_find_entry(j, BCH_JSET_ENTRY_btree_root, id); - - if (!entry) - return NULL; - - if (!entry->u64s) - return ERR_PTR(-EINVAL); - - k = entry->start; - *level = entry->level; - *level = entry->level; - return k; -} - struct journal_list { struct closure cl; struct mutex lock; @@ -171,12 +138,12 @@ static void journal_entry_null_range(void *start, void *end) static int journal_validate_key(struct bch_fs *c, struct jset *jset, struct jset_entry *entry, - struct bkey_i *k, enum bkey_type key_type, + struct bkey_i *k, enum btree_node_type key_type, const char *type, int write) { void *next = vstruct_next(entry); const char 
*invalid; - char buf[160]; + unsigned version = le32_to_cpu(jset->version); int ret = 0; if (journal_entry_err_on(!k->k.u64s, c, @@ -205,12 +172,17 @@ static int journal_validate_key(struct bch_fs *c, struct jset *jset, } if (JSET_BIG_ENDIAN(jset) != CPU_BIG_ENDIAN) - bch2_bkey_swab(key_type, NULL, bkey_to_packed(k)); + bch2_bkey_swab(NULL, bkey_to_packed(k)); - invalid = bch2_bkey_invalid(c, key_type, bkey_i_to_s_c(k)); + if (!write && + version < bcachefs_metadata_version_bkey_renumber) + bch2_bkey_renumber(key_type, bkey_to_packed(k), write); + + invalid = bch2_bkey_invalid(c, bkey_i_to_s_c(k), key_type); if (invalid) { - bch2_bkey_val_to_text(c, key_type, buf, sizeof(buf), - bkey_i_to_s_c(k)); + char buf[160]; + + bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(k)); mustfix_fsck_err(c, "invalid %s in journal: %s\n%s", type, invalid, buf); @@ -219,6 +191,10 @@ static int journal_validate_key(struct bch_fs *c, struct jset *jset, journal_entry_null_range(vstruct_next(entry), next); return 0; } + + if (write && + version < bcachefs_metadata_version_bkey_renumber) + bch2_bkey_renumber(key_type, bkey_to_packed(k), write); fsck_err: return ret; } @@ -232,8 +208,8 @@ static int journal_entry_validate_btree_keys(struct bch_fs *c, vstruct_for_each(entry, k) { int ret = journal_validate_key(c, jset, entry, k, - bkey_type(entry->level, - entry->btree_id), + __btree_node_type(entry->level, + entry->btree_id), "key", write); if (ret) return ret; @@ -305,6 +281,7 @@ static int journal_entry_validate_blacklist_v2(struct bch_fs *c, if (journal_entry_err_on(le16_to_cpu(entry->u64s) != 2, c, "invalid journal seq blacklist entry: bad size")) { journal_entry_null_range(entry, vstruct_next(entry)); + goto out; } bl_entry = container_of(entry, struct jset_entry_blacklist_v2, entry); @@ -314,6 +291,49 @@ static int journal_entry_validate_blacklist_v2(struct bch_fs *c, "invalid journal seq blacklist entry: start > end")) { journal_entry_null_range(entry, vstruct_next(entry)); } +out: +fsck_err: + return ret; +} + +static int journal_entry_validate_usage(struct bch_fs *c, + struct jset *jset, + struct jset_entry *entry, + int write) +{ + struct jset_entry_usage *u = + container_of(entry, struct jset_entry_usage, entry); + unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64); + int ret = 0; + + if (journal_entry_err_on(bytes < sizeof(*u), + c, + "invalid journal entry usage: bad size")) { + journal_entry_null_range(entry, vstruct_next(entry)); + return ret; + } + +fsck_err: + return ret; +} + +static int journal_entry_validate_data_usage(struct bch_fs *c, + struct jset *jset, + struct jset_entry *entry, + int write) +{ + struct jset_entry_data_usage *u = + container_of(entry, struct jset_entry_data_usage, entry); + unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64); + int ret = 0; + + if (journal_entry_err_on(bytes < sizeof(*u) || + bytes < sizeof(*u) + u->r.nr_devs, + c, + "invalid journal entry usage: bad size")) { + journal_entry_null_range(entry, vstruct_next(entry)); + return ret; + } fsck_err: return ret; @@ -336,18 +356,10 @@ static const struct jset_entry_ops bch2_jset_entry_ops[] = { static int journal_entry_validate(struct bch_fs *c, struct jset *jset, struct jset_entry *entry, int write) { - int ret = 0; - - if (entry->type >= BCH_JSET_ENTRY_NR) { - journal_entry_err(c, "invalid journal entry type %u", - entry->type); - journal_entry_null_range(entry, vstruct_next(entry)); - return 0; - } - - ret = bch2_jset_entry_ops[entry->type].validate(c, jset, entry, write); 
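/*
 * Illustration only, not in the patch: the version window accepted by
 * jset_validate() below, rewritten as a standalone predicate (the De
 * Morgan inverse of its error check). example_jset_version_ok() is a
 * hypothetical helper:
 */
static inline bool example_jset_version_ok(unsigned version)
{
	return (version == BCH_JSET_VERSION_OLD ||
		version >= bcachefs_metadata_version_min) &&
	       version < bcachefs_metadata_version_max;
}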
-fsck_err: - return ret; + return entry->type < BCH_JSET_ENTRY_NR + ? bch2_jset_entry_ops[entry->type].validate(c, jset, + entry, write) + : 0; } static int jset_validate_entries(struct bch_fs *c, struct jset *jset, @@ -380,14 +392,17 @@ static int jset_validate(struct bch_fs *c, { size_t bytes = vstruct_bytes(jset); struct bch_csum csum; + unsigned version; int ret = 0; if (le64_to_cpu(jset->magic) != jset_magic(c)) return JOURNAL_ENTRY_NONE; - if (le32_to_cpu(jset->version) != BCACHE_JSET_VERSION) { - bch_err(c, "unknown journal entry version %u", - le32_to_cpu(jset->version)); + version = le32_to_cpu(jset->version); + if ((version != BCH_JSET_VERSION_OLD && + version < bcachefs_metadata_version_min) || + version >= bcachefs_metadata_version_max) { + bch_err(c, "unknown journal entry version %u", jset->version); return BCH_FSCK_UNKNOWN_VERSION; } @@ -455,11 +470,10 @@ static int journal_read_buf_realloc(struct journal_read_buf *b, static int journal_read_bucket(struct bch_dev *ca, struct journal_read_buf *buf, struct journal_list *jlist, - unsigned bucket, u64 *seq, bool *entries_found) + unsigned bucket) { struct bch_fs *c = ca->fs; struct journal_device *ja = &ca->journal; - struct bio *bio = ja->bio; struct jset *j = NULL; unsigned sectors, sectors_read = 0; u64 offset = bucket_to_sector(ca, ja->buckets[bucket]), @@ -471,10 +485,14 @@ static int journal_read_bucket(struct bch_dev *ca, while (offset < end) { if (!sectors_read) { -reread: sectors_read = min_t(unsigned, + struct bio *bio; +reread: + sectors_read = min_t(unsigned, end - offset, buf->size >> 9); - bio_reset(bio); + bio = bio_kmalloc(GFP_KERNEL, + buf_pages(buf->data, + sectors_read << 9)); bio_set_dev(bio, ca->disk_sb.bdev); bio->bi_iter.bi_sector = offset; bio->bi_iter.bi_size = sectors_read << 9; @@ -482,6 +500,7 @@ reread: sectors_read = min_t(unsigned, bch2_bio_map(bio, buf->data); ret = submit_bio_wait(bio); + bio_put(bio); if (bch2_dev_io_err_on(ret, ca, "journal read from sector %llu", @@ -536,7 +555,6 @@ reread: sectors_read = min_t(unsigned, switch (ret) { case JOURNAL_ENTRY_ADD_OK: - *entries_found = true; break; case JOURNAL_ENTRY_ADD_OUT_OF_RANGE: break; @@ -544,9 +562,6 @@ reread: sectors_read = min_t(unsigned, return ret; } - if (le64_to_cpu(j->seq) > *seq) - *seq = le64_to_cpu(j->seq); - sectors = vstruct_sectors(j, c->block_bits); next_block: pr_debug("next"); @@ -560,138 +575,59 @@ next_block: static void bch2_journal_read_device(struct closure *cl) { -#define read_bucket(b) \ - ({ \ - bool entries_found = false; \ - ret = journal_read_bucket(ca, &buf, jlist, b, &seq, \ - &entries_found); \ - if (ret) \ - goto err; \ - __set_bit(b, bitmap); \ - entries_found; \ - }) - struct journal_device *ja = container_of(cl, struct journal_device, read); struct bch_dev *ca = container_of(ja, struct bch_dev, journal); struct journal_list *jlist = container_of(cl->parent, struct journal_list, cl); - struct request_queue *q = bdev_get_queue(ca->disk_sb.bdev); struct journal_read_buf buf = { NULL, 0 }; - - DECLARE_BITMAP(bitmap, ja->nr); - unsigned i, l, r; - u64 seq = 0; + u64 min_seq = U64_MAX; + unsigned i; int ret; if (!ja->nr) goto out; - bitmap_zero(bitmap, ja->nr); ret = journal_read_buf_realloc(&buf, PAGE_SIZE); if (ret) goto err; pr_debug("%u journal buckets", ja->nr); - /* - * If the device supports discard but not secure discard, we can't do - * the fancy fibonacci hash/binary search because the live journal - * entries might not form a contiguous range: - */ - for (i = 0; i < ja->nr; i++) - read_bucket(i); - 
goto search_done; - - if (!blk_queue_nonrot(q)) - goto linear_scan; - - /* - * Read journal buckets ordered by golden ratio hash to quickly - * find a sequence of buckets with valid journal entries - */ for (i = 0; i < ja->nr; i++) { - l = (i * 2654435769U) % ja->nr; - - if (test_bit(l, bitmap)) - break; - - if (read_bucket(l)) - goto bsearch; + ret = journal_read_bucket(ca, &buf, jlist, i); + if (ret) + goto err; } - /* - * If that fails, check all the buckets we haven't checked - * already - */ - pr_debug("falling back to linear search"); -linear_scan: - for (l = find_first_zero_bit(bitmap, ja->nr); - l < ja->nr; - l = find_next_zero_bit(bitmap, ja->nr, l + 1)) - if (read_bucket(l)) - goto bsearch; - - /* no journal entries on this device? */ - if (l == ja->nr) - goto out; -bsearch: - /* Binary search */ - r = find_next_bit(bitmap, ja->nr, l + 1); - pr_debug("starting binary search, l %u r %u", l, r); - - while (l + 1 < r) { - unsigned m = (l + r) >> 1; - u64 cur_seq = seq; - - read_bucket(m); + /* Find the journal bucket with the highest sequence number: */ + for (i = 0; i < ja->nr; i++) { + if (ja->bucket_seq[i] > ja->bucket_seq[ja->cur_idx]) + ja->cur_idx = i; - if (cur_seq != seq) - l = m; - else - r = m; + min_seq = min(ja->bucket_seq[i], min_seq); } -search_done: /* - * Find the journal bucket with the highest sequence number: - * * If there's duplicate journal entries in multiple buckets (which * definitely isn't supposed to happen, but...) - make sure to start * cur_idx at the last of those buckets, so we don't deadlock trying to * allocate */ - seq = 0; + while (ja->bucket_seq[ja->cur_idx] > min_seq && + ja->bucket_seq[ja->cur_idx] > + ja->bucket_seq[(ja->cur_idx + 1) % ja->nr]) + ja->cur_idx = (ja->cur_idx + 1) % ja->nr; - for (i = 0; i < ja->nr; i++) - if (ja->bucket_seq[i] >= seq && - ja->bucket_seq[i] != ja->bucket_seq[(i + 1) % ja->nr]) { - /* - * When journal_next_bucket() goes to allocate for - * the first time, it'll use the bucket after - * ja->cur_idx - */ - ja->cur_idx = i; - seq = ja->bucket_seq[i]; - } + ja->sectors_free = 0; /* - * Set last_idx to indicate the entire journal is full and needs to be + * Set dirty_idx to indicate the entire journal is full and needs to be * reclaimed - journal reclaim will immediately reclaim whatever isn't * pinned when it first runs: */ - ja->last_idx = (ja->cur_idx + 1) % ja->nr; - - /* - * Read buckets in reverse order until we stop finding more journal - * entries: - */ - for (i = (ja->cur_idx + ja->nr - 1) % ja->nr; - i != ja->cur_idx; - i = (i + ja->nr - 1) % ja->nr) - if (!test_bit(i, bitmap) && - !read_bucket(i)) - break; + ja->discard_idx = ja->dirty_idx_ondisk = + ja->dirty_idx = (ja->cur_idx + 1) % ja->nr; out: kvpfree(buf.data, buf.size); percpu_ref_put(&ca->io_ref); @@ -702,32 +638,15 @@ err: jlist->ret = ret; mutex_unlock(&jlist->lock); goto out; -#undef read_bucket -} - -void bch2_journal_entries_free(struct list_head *list) -{ - - while (!list_empty(list)) { - struct journal_replay *i = - list_first_entry(list, struct journal_replay, list); - list_del(&i->list); - kvpfree(i, offsetof(struct journal_replay, j) + - vstruct_bytes(&i->j)); - } } int bch2_journal_read(struct bch_fs *c, struct list_head *list) { - struct journal *j = &c->journal; struct journal_list jlist; struct journal_replay *i; - struct journal_entry_pin_list *p; struct bch_dev *ca; - u64 cur_seq, end_seq, seq; unsigned iter; - size_t entries = 0; - u64 nr, keys = 0; + size_t keys = 0, entries = 0; bool degraded = false; int ret = 0; @@ -737,7 +656,8 
@@ int bch2_journal_read(struct bch_fs *c, struct list_head *list) jlist.ret = 0; for_each_member_device(ca, c, iter) { - if (!(bch2_dev_has_data(c, ca) & (1 << BCH_DATA_JOURNAL))) + if (!test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) && + !(bch2_dev_has_data(c, ca) & (1 << BCH_DATA_JOURNAL))) continue; if ((ca->mi.state == BCH_MEMBER_STATE_RW || @@ -756,12 +676,12 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list) if (jlist.ret) return jlist.ret; - if (list_empty(list)){ - bch_err(c, "no journal entries found"); - return BCH_FSCK_REPAIR_IMPOSSIBLE; - } - list_for_each_entry(i, list, list) { + struct jset_entry *entry; + struct bkey_i *k, *_n; + struct bch_replicas_padded replicas; + char buf[80]; + ret = jset_validate_entries(c, &i->j, READ); if (ret) goto fsck_err; @@ -771,294 +691,89 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list) * the devices - this is wrong: */ + bch2_devlist_to_replicas(&replicas.e, BCH_DATA_JOURNAL, i->devs); + if (!degraded && (test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) || - fsck_err_on(!bch2_replicas_marked(c, BCH_DATA_JOURNAL, - i->devs), c, - "superblock not marked as containing replicas (type %u)", - BCH_DATA_JOURNAL))) { - ret = bch2_mark_replicas(c, BCH_DATA_JOURNAL, i->devs); + fsck_err_on(!bch2_replicas_marked(c, &replicas.e, false), c, + "superblock not marked as containing replicas %s", + (bch2_replicas_entry_to_text(&PBUF(buf), + &replicas.e), buf)))) { + ret = bch2_mark_replicas(c, &replicas.e); if (ret) return ret; } - } - - list_for_each_entry(i, list, list) { - struct jset_entry *entry; - struct bkey_i *k, *_n; for_each_jset_key(k, _n, entry, &i->j) keys++; - } - - i = list_last_entry(list, struct journal_replay, list); - - nr = le64_to_cpu(i->j.seq) - le64_to_cpu(i->j.last_seq) + 1; - - fsck_err_on(c->sb.clean && (keys || nr > 1), c, - "filesystem marked clean but journal not empty (%llu keys in %llu entries)", - keys, nr); - - if (nr > j->pin.size) { - free_fifo(&j->pin); - init_fifo(&j->pin, roundup_pow_of_two(nr), GFP_KERNEL); - if (!j->pin.data) { - bch_err(c, "error reallocating journal fifo (%llu open entries)", nr); - return -ENOMEM; - } - } - - atomic64_set(&j->seq, le64_to_cpu(i->j.seq)); - j->last_seq_ondisk = le64_to_cpu(i->j.last_seq); - - j->pin.front = le64_to_cpu(i->j.last_seq); - j->pin.back = le64_to_cpu(i->j.seq) + 1; - - fifo_for_each_entry_ptr(p, &j->pin, seq) { - INIT_LIST_HEAD(&p->list); - INIT_LIST_HEAD(&p->flushed); - atomic_set(&p->count, 0); - p->devs.nr = 0; - } - - mutex_lock(&j->blacklist_lock); - - list_for_each_entry(i, list, list) { - p = journal_seq_pin(j, le64_to_cpu(i->j.seq)); - - atomic_set(&p->count, 1); - p->devs = i->devs; - - if (bch2_journal_seq_blacklist_read(j, i)) { - mutex_unlock(&j->blacklist_lock); - return -ENOMEM; - } - } - - mutex_unlock(&j->blacklist_lock); - - cur_seq = journal_last_seq(j); - end_seq = le64_to_cpu(list_last_entry(list, - struct journal_replay, list)->j.seq); - - list_for_each_entry(i, list, list) { - bool blacklisted; - - mutex_lock(&j->blacklist_lock); - while (cur_seq < le64_to_cpu(i->j.seq) && - bch2_journal_seq_blacklist_find(j, cur_seq)) - cur_seq++; - - blacklisted = bch2_journal_seq_blacklist_find(j, - le64_to_cpu(i->j.seq)); - mutex_unlock(&j->blacklist_lock); - - fsck_err_on(blacklisted, c, - "found blacklisted journal entry %llu", - le64_to_cpu(i->j.seq)); - - fsck_err_on(le64_to_cpu(i->j.seq) != cur_seq, c, - "journal entries %llu-%llu missing! 
(replaying %llu-%llu)", - cur_seq, le64_to_cpu(i->j.seq) - 1, - journal_last_seq(j), end_seq); - - cur_seq = le64_to_cpu(i->j.seq) + 1; entries++; } - bch_info(c, "journal read done, %llu keys in %zu entries, seq %llu", - keys, entries, journal_cur_seq(j)); -fsck_err: - return ret; -} - -/* journal replay: */ + if (!list_empty(list)) { + i = list_last_entry(list, struct journal_replay, list); -int bch2_journal_mark(struct bch_fs *c, struct list_head *list) -{ - struct bkey_i *k, *n; - struct jset_entry *j; - struct journal_replay *r; - int ret; - - list_for_each_entry(r, list, list) - for_each_jset_key(k, n, j, &r->j) { - enum bkey_type type = bkey_type(j->level, j->btree_id); - struct bkey_s_c k_s_c = bkey_i_to_s_c(k); - - if (btree_type_has_ptrs(type)) { - ret = bch2_btree_mark_key_initial(c, type, k_s_c); - if (ret) - return ret; - } - } - - return 0; -} - -int bch2_journal_replay(struct bch_fs *c, struct list_head *list) -{ - struct journal *j = &c->journal; - struct journal_entry_pin_list *pin_list; - struct bkey_i *k, *_n; - struct jset_entry *entry; - struct journal_replay *i, *n; - int ret = 0; - - list_for_each_entry_safe(i, n, list, list) { - - j->replay_journal_seq = le64_to_cpu(i->j.seq); - - for_each_jset_key(k, _n, entry, &i->j) { - - if (entry->btree_id == BTREE_ID_ALLOC) { - /* - * allocation code handles replay for - * BTREE_ID_ALLOC keys: - */ - ret = bch2_alloc_replay_key(c, k->k.p); - } else { - /* - * We might cause compressed extents to be - * split, so we need to pass in a - * disk_reservation: - */ - struct disk_reservation disk_res = - bch2_disk_reservation_init(c, 0); - - ret = bch2_btree_insert(c, entry->btree_id, k, - &disk_res, NULL, NULL, - BTREE_INSERT_NOFAIL| - BTREE_INSERT_JOURNAL_REPLAY); - } - - if (ret) { - bch_err(c, "journal replay: error %d while replaying key", - ret); - goto err; - } - - cond_resched(); - } - - pin_list = journal_seq_pin(j, j->replay_journal_seq); - - if (atomic_dec_and_test(&pin_list->count)) - journal_wake(j); + bch_info(c, "journal read done, %zu keys in %zu entries, seq %llu", + keys, entries, le64_to_cpu(i->j.seq)); } - - j->replay_journal_seq = 0; - - bch2_journal_set_replay_done(j); - ret = bch2_journal_flush_all_pins(j); -err: - bch2_journal_entries_free(list); +fsck_err: return ret; } /* journal write: */ -static void bch2_journal_add_btree_root(struct journal_buf *buf, - enum btree_id id, struct bkey_i *k, - unsigned level) -{ - struct jset_entry *entry; - - entry = bch2_journal_add_entry_noreservation(buf, k->k.u64s); - entry->type = BCH_JSET_ENTRY_btree_root; - entry->btree_id = id; - entry->level = level; - memcpy_u64s(entry->_data, k, k->k.u64s); -} - -static unsigned journal_dev_buckets_available(struct journal *j, - struct bch_dev *ca) -{ - struct journal_device *ja = &ca->journal; - unsigned next = (ja->cur_idx + 1) % ja->nr; - unsigned available = (ja->last_idx + ja->nr - next) % ja->nr; - - /* - * Hack to avoid a deadlock during journal replay: - * journal replay might require setting a new btree - * root, which requires writing another journal entry - - * thus, if the journal is full (and this happens when - * replaying the first journal bucket's entries) we're - * screwed. 
- * - * So don't let the journal fill up unless we're in - * replay: - */ - if (test_bit(JOURNAL_REPLAY_DONE, &j->flags)) - available = max((int) available - 2, 0); - - /* - * Don't use the last bucket unless writing the new last_seq - * will make another bucket available: - */ - if (ja->bucket_seq[ja->last_idx] >= journal_last_seq(j)) - available = max((int) available - 1, 0); - - return available; -} - -/* returns number of sectors available for next journal entry: */ -int bch2_journal_entry_sectors(struct journal *j) +static void __journal_write_alloc(struct journal *j, + struct journal_buf *w, + struct dev_alloc_list *devs_sorted, + unsigned sectors, + unsigned *replicas, + unsigned replicas_want) { struct bch_fs *c = container_of(j, struct bch_fs, journal); + struct journal_device *ja; struct bch_dev *ca; - struct bkey_s_extent e = bkey_i_to_s_extent(&j->key); - unsigned sectors_available = UINT_MAX; - unsigned i, nr_online = 0, nr_devs = 0; + unsigned i; - lockdep_assert_held(&j->lock); - - rcu_read_lock(); - for_each_member_device_rcu(ca, c, i, - &c->rw_devs[BCH_DATA_JOURNAL]) { - struct journal_device *ja = &ca->journal; - unsigned buckets_required = 0; + if (*replicas >= replicas_want) + return; - if (!ja->nr) + for (i = 0; i < devs_sorted->nr; i++) { + ca = rcu_dereference(c->devs[devs_sorted->devs[i]]); + if (!ca) continue; - sectors_available = min_t(unsigned, sectors_available, - ca->mi.bucket_size); + ja = &ca->journal; /* - * Note that we don't allocate the space for a journal entry - * until we write it out - thus, if we haven't started the write - * for the previous entry we have to make sure we have space for - * it too: + * Check that we can use this device, and aren't already using + * it: */ - if (bch2_extent_has_device(e.c, ca->dev_idx)) { - if (j->prev_buf_sectors > ja->sectors_free) - buckets_required++; - - if (j->prev_buf_sectors + sectors_available > - ja->sectors_free) - buckets_required++; - } else { - if (j->prev_buf_sectors + sectors_available > - ca->mi.bucket_size) - buckets_required++; - - buckets_required++; - } + if (!ca->mi.durability || + ca->mi.state != BCH_MEMBER_STATE_RW || + !ja->nr || + bch2_bkey_has_device(bkey_i_to_s_c(&w->key), + ca->dev_idx) || + sectors > ja->sectors_free) + continue; - if (journal_dev_buckets_available(j, ca) >= buckets_required) - nr_devs++; - nr_online++; - } - rcu_read_unlock(); + bch2_dev_stripe_increment(c, ca, &j->wp.stripe); + + bch2_bkey_append_ptr(&w->key, + (struct bch_extent_ptr) { + .offset = bucket_to_sector(ca, + ja->buckets[ja->cur_idx]) + + ca->mi.bucket_size - + ja->sectors_free, + .dev = ca->dev_idx, + }); - if (nr_online < c->opts.metadata_replicas_required) - return -EROFS; + ja->sectors_free -= sectors; + ja->bucket_seq[ja->cur_idx] = le64_to_cpu(w->data->seq); - if (nr_devs < min_t(unsigned, nr_online, c->opts.metadata_replicas)) - return 0; + *replicas += ca->mi.durability; - return sectors_available; + if (*replicas >= replicas_want) + break; + } } /** @@ -1068,95 +783,51 @@ static int journal_write_alloc(struct journal *j, struct journal_buf *w, unsigned sectors) { struct bch_fs *c = container_of(j, struct bch_fs, journal); - struct bkey_s_extent e; - struct bch_extent_ptr *ptr; struct journal_device *ja; struct bch_dev *ca; struct dev_alloc_list devs_sorted; - unsigned i, replicas, replicas_want = + unsigned i, replicas = 0, replicas_want = READ_ONCE(c->opts.metadata_replicas); - spin_lock(&j->lock); - e = bkey_i_to_s_extent(&j->key); - - /* - * Drop any pointers to devices that have been removed, 
are no longer - * empty, or filled up their current journal bucket: - * - * Note that a device may have had a small amount of free space (perhaps - * one sector) that wasn't enough for the smallest possible journal - * entry - that's why we drop pointers to devices <= current free space, - * i.e. whichever device was limiting the current journal entry size. - */ - extent_for_each_ptr_backwards(e, ptr) { - ca = bch_dev_bkey_exists(c, ptr->dev); + rcu_read_lock(); - if (ca->mi.state != BCH_MEMBER_STATE_RW || - ca->journal.sectors_free <= sectors) - __bch2_extent_drop_ptr(e, ptr); - else - ca->journal.sectors_free -= sectors; - } + devs_sorted = bch2_dev_alloc_list(c, &j->wp.stripe, + &c->rw_devs[BCH_DATA_JOURNAL]); - replicas = bch2_extent_nr_ptrs(e.c); + __journal_write_alloc(j, w, &devs_sorted, + sectors, &replicas, replicas_want); - rcu_read_lock(); - devs_sorted = bch2_wp_alloc_list(c, &j->wp, - &c->rw_devs[BCH_DATA_JOURNAL]); + if (replicas >= replicas_want) + goto done; for (i = 0; i < devs_sorted.nr; i++) { ca = rcu_dereference(c->devs[devs_sorted.devs[i]]); if (!ca) continue; - if (!ca->mi.durability) - continue; - ja = &ca->journal; - if (!ja->nr) - continue; - - if (replicas >= replicas_want) - break; - - /* - * Check that we can use this device, and aren't already using - * it: - */ - if (bch2_extent_has_device(e.c, ca->dev_idx) || - !journal_dev_buckets_available(j, ca) || - sectors > ca->mi.bucket_size) - continue; - - j->wp.next_alloc[ca->dev_idx] += U32_MAX; - bch2_wp_rescale(c, ca, &j->wp); - - ja->sectors_free = ca->mi.bucket_size - sectors; - ja->cur_idx = (ja->cur_idx + 1) % ja->nr; - ja->bucket_seq[ja->cur_idx] = le64_to_cpu(w->data->seq); - extent_ptr_append(bkey_i_to_extent(&j->key), - (struct bch_extent_ptr) { - .offset = bucket_to_sector(ca, - ja->buckets[ja->cur_idx]), - .dev = ca->dev_idx, - }); + if (sectors > ja->sectors_free && + sectors <= ca->mi.bucket_size && + bch2_journal_dev_buckets_available(j, ja, + journal_space_discarded)) { + ja->cur_idx = (ja->cur_idx + 1) % ja->nr; + ja->sectors_free = ca->mi.bucket_size; - replicas += ca->mi.durability; + /* + * ja->bucket_seq[ja->cur_idx] must always have + * something sensible: + */ + ja->bucket_seq[ja->cur_idx] = le64_to_cpu(w->data->seq); + } } - rcu_read_unlock(); - - j->prev_buf_sectors = 0; - - bkey_copy(&w->key, &j->key); - spin_unlock(&j->lock); - if (replicas < c->opts.metadata_replicas_required) - return -EROFS; - - BUG_ON(!replicas); + __journal_write_alloc(j, w, &devs_sorted, + sectors, &replicas, replicas_want); +done: + rcu_read_unlock(); - return 0; + return replicas >= c->opts.metadata_replicas_required ? 
0 : -EROFS; } static void journal_write_compact(struct jset *jset) @@ -1208,17 +879,17 @@ static void journal_buf_realloc(struct journal *j, struct journal_buf *buf) unsigned new_size = READ_ONCE(j->buf_size_want); void *new_buf; - if (buf->size >= new_size) + if (buf->buf_size >= new_size) return; new_buf = kvpmalloc(new_size, GFP_NOIO|__GFP_NOWARN); if (!new_buf) return; - memcpy(new_buf, buf->data, buf->size); - kvpfree(buf->data, buf->size); + memcpy(new_buf, buf->data, buf->buf_size); + kvpfree(buf->data, buf->buf_size); buf->data = new_buf; - buf->size = new_size; + buf->buf_size = new_size; } static void journal_write_done(struct closure *cl) @@ -1227,24 +898,31 @@ static void journal_write_done(struct closure *cl) struct bch_fs *c = container_of(j, struct bch_fs, journal); struct journal_buf *w = journal_prev_buf(j); struct bch_devs_list devs = - bch2_extent_devs(bkey_i_to_s_c_extent(&w->key)); + bch2_bkey_devs(bkey_i_to_s_c(&w->key)); + struct bch_replicas_padded replicas; u64 seq = le64_to_cpu(w->data->seq); + u64 last_seq = le64_to_cpu(w->data->last_seq); + + bch2_time_stats_update(j->write_time, j->write_start_time); if (!devs.nr) { bch_err(c, "unable to write journal to sufficient devices"); goto err; } - if (bch2_mark_replicas(c, BCH_DATA_JOURNAL, devs)) + bch2_devlist_to_replicas(&replicas.e, BCH_DATA_JOURNAL, devs); + + if (bch2_mark_replicas(c, &replicas.e)) goto err; -out: - bch2_time_stats_update(j->write_time, j->write_start_time); spin_lock(&j->lock); - j->last_seq_ondisk = seq; if (seq >= j->pin.front) journal_seq_pin(j, seq)->devs = devs; + j->seq_ondisk = seq; + j->last_seq_ondisk = last_seq; + bch2_journal_space_available(j); + /* * Updating last_seq_ondisk may let bch2_journal_reclaim_work() discard * more buckets: @@ -1252,8 +930,8 @@ out: * Must come before signaling write completion, for * bch2_fs_journal_stop(): */ - mod_delayed_work(system_freezable_wq, &j->reclaim_work, 0); - + mod_delayed_work(c->journal_reclaim_wq, &j->reclaim_work, 0); +out: /* also must come before signalling write completion: */ closure_debug_destroy(cl); @@ -1270,7 +948,7 @@ out: return; err: bch2_fatal_error(c); - bch2_journal_halt(j); + spin_lock(&j->lock); goto out; } @@ -1285,7 +963,7 @@ static void journal_write_endio(struct bio *bio) unsigned long flags; spin_lock_irqsave(&j->err_lock, flags); - bch2_extent_drop_device(bkey_i_to_s_extent(&w->key), ca->dev_idx); + bch2_bkey_drop_device(bkey_i_to_s(&w->key), ca->dev_idx); spin_unlock_irqrestore(&j->err_lock, flags); } @@ -1299,36 +977,51 @@ void bch2_journal_write(struct closure *cl) struct bch_fs *c = container_of(j, struct bch_fs, journal); struct bch_dev *ca; struct journal_buf *w = journal_prev_buf(j); + struct jset_entry *start, *end; struct jset *jset; struct bio *bio; struct bch_extent_ptr *ptr; - unsigned i, sectors, bytes; + bool validate_before_checksum = false; + unsigned i, sectors, bytes, u64s; + int ret; + + bch2_journal_pin_put(j, le64_to_cpu(w->data->seq)); journal_buf_realloc(j, w); jset = w->data; j->write_start_time = local_clock(); - mutex_lock(&c->btree_root_lock); - for (i = 0; i < BTREE_ID_NR; i++) { - struct btree_root *r = &c->btree_roots[i]; - if (r->alive) - bch2_journal_add_btree_root(w, i, &r->key, r->level); - } - c->btree_roots_dirty = false; - mutex_unlock(&c->btree_root_lock); + start = vstruct_last(jset); + end = bch2_journal_super_entries_add_common(c, start, + le64_to_cpu(jset->seq)); + u64s = (u64 *) end - (u64 *) start; + BUG_ON(u64s > j->entry_u64s_reserved); + + le32_add_cpu(&jset->u64s, 
u64s); + BUG_ON(vstruct_sectors(jset, c->block_bits) > w->sectors); journal_write_compact(jset); jset->read_clock = cpu_to_le16(c->bucket_clock[READ].hand); jset->write_clock = cpu_to_le16(c->bucket_clock[WRITE].hand); jset->magic = cpu_to_le64(jset_magic(c)); - jset->version = cpu_to_le32(BCACHE_JSET_VERSION); + + jset->version = c->sb.version < bcachefs_metadata_version_new_versioning + ? cpu_to_le32(BCH_JSET_VERSION_OLD) + : cpu_to_le32(c->sb.version); SET_JSET_BIG_ENDIAN(jset, CPU_BIG_ENDIAN); SET_JSET_CSUM_TYPE(jset, bch2_meta_checksum_type(c)); - if (bch2_csum_type_is_encryption(JSET_CSUM_TYPE(jset)) && + if (bch2_csum_type_is_encryption(JSET_CSUM_TYPE(jset))) + validate_before_checksum = true; + + if (le32_to_cpu(jset->version) < + bcachefs_metadata_version_bkey_renumber) + validate_before_checksum = true; + + if (validate_before_checksum && jset_validate_entries(c, jset, WRITE)) goto err; @@ -1339,18 +1032,33 @@ void bch2_journal_write(struct closure *cl) jset->csum = csum_vstruct(c, JSET_CSUM_TYPE(jset), journal_nonce(jset), jset); - if (!bch2_csum_type_is_encryption(JSET_CSUM_TYPE(jset)) && + if (!validate_before_checksum && jset_validate_entries(c, jset, WRITE)) goto err; sectors = vstruct_sectors(jset, c->block_bits); - BUG_ON(sectors > j->prev_buf_sectors); + BUG_ON(sectors > w->sectors); + + bytes = vstruct_bytes(jset); + memset((void *) jset + bytes, 0, (sectors << 9) - bytes); - bytes = vstruct_bytes(w->data); - memset((void *) w->data + bytes, 0, (sectors << 9) - bytes); + spin_lock(&j->lock); + ret = journal_write_alloc(j, w, sectors); + + /* + * write is allocated, no longer need to account for it in + * bch2_journal_space_available(): + */ + w->sectors = 0; + + /* + * journal entry has been compacted and allocated, recalculate space + * available: + */ + bch2_journal_space_available(j); + spin_unlock(&j->lock); - if (journal_write_alloc(j, w, sectors)) { - bch2_journal_halt(j); + if (ret) { bch_err(c, "Unable to allocate journal write"); bch2_fatal_error(c); continue_at(cl, journal_write_done, system_highpri_wq); @@ -1389,7 +1097,7 @@ void bch2_journal_write(struct closure *cl) trace_journal_write(bio); closure_bio_submit(bio, cl); - ca->journal.bucket_seq[ca->journal.cur_idx] = le64_to_cpu(w->data->seq); + ca->journal.bucket_seq[ca->journal.cur_idx] = le64_to_cpu(jset->seq); } for_each_rw_member(ca, c, i) @@ -1407,8 +1115,7 @@ void bch2_journal_write(struct closure *cl) } no_io: - extent_for_each_ptr(bkey_i_to_s_extent(&j->key), ptr) - ptr->offset += sectors; + bch2_bucket_seq_cleanup(c); continue_at(cl, journal_write_done, system_highpri_wq); return; diff --git a/fs/bcachefs/journal_io.h b/fs/bcachefs/journal_io.h index 4236b7fc37ff..72e575f360af 100644 --- a/fs/bcachefs/journal_io.h +++ b/fs/bcachefs/journal_io.h @@ -1,9 +1,7 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _BCACHEFS_JOURNAL_IO_H #define _BCACHEFS_JOURNAL_IO_H -struct bkey_i *bch2_journal_find_btree_root(struct bch_fs *, struct jset *, - enum btree_id, unsigned *); - /* * Only used for holding the journal entries we read in btree_journal_read() * during cache_registration @@ -39,7 +37,6 @@ static inline struct jset_entry *__jset_entry_type_next(struct jset *jset, int bch2_journal_read(struct bch_fs *, struct list_head *); -int bch2_journal_entry_sectors(struct journal *); void bch2_journal_write(struct closure *); #endif /* _BCACHEFS_JOURNAL_IO_H */ diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c index 0e3e5b6abb39..695b2c8ba03b 100644 --- a/fs/bcachefs/journal_reclaim.c 
+++ b/fs/bcachefs/journal_reclaim.c @@ -1,49 +1,310 @@ +// SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" #include "journal.h" +#include "journal_io.h" #include "journal_reclaim.h" #include "replicas.h" #include "super.h" -/* - * Journal entry pinning - machinery for holding a reference on a given journal - * entry, holding it open to ensure it gets replayed during recovery: - */ +/* Free space calculations: */ + +static unsigned journal_space_from(struct journal_device *ja, + enum journal_space_from from) +{ + switch (from) { + case journal_space_discarded: + return ja->discard_idx; + case journal_space_clean_ondisk: + return ja->dirty_idx_ondisk; + case journal_space_clean: + return ja->dirty_idx; + default: + BUG(); + } +} -static inline u64 journal_pin_seq(struct journal *j, - struct journal_entry_pin_list *pin_list) +unsigned bch2_journal_dev_buckets_available(struct journal *j, + struct journal_device *ja, + enum journal_space_from from) { - return fifo_entry_idx_abs(&j->pin, pin_list); + struct bch_fs *c = container_of(j, struct bch_fs, journal); + unsigned available = (journal_space_from(ja, from) - + ja->cur_idx - 1 + ja->nr) % ja->nr; + + /* + * Allocator startup needs some journal space before we can do journal + * replay: + */ + if (available && test_bit(BCH_FS_ALLOCATOR_STARTED, &c->flags)) + --available; + + /* + * Don't use the last bucket unless writing the new last_seq + * will make another bucket available: + */ + if (available && ja->dirty_idx_ondisk == ja->dirty_idx) + --available; + + return available; +} + +static void journal_set_remaining(struct journal *j, unsigned u64s_remaining) +{ + union journal_preres_state old, new; + u64 v = atomic64_read(&j->prereserved.counter); + + do { + old.v = new.v = v; + new.remaining = u64s_remaining; + } while ((v = atomic64_cmpxchg(&j->prereserved.counter, + old.v, new.v)) != old.v); +} + +static struct journal_space { + unsigned next_entry; + unsigned remaining; +} __journal_space_available(struct journal *j, unsigned nr_devs_want, + enum journal_space_from from) +{ + struct bch_fs *c = container_of(j, struct bch_fs, journal); + struct bch_dev *ca; + unsigned sectors_next_entry = UINT_MAX; + unsigned sectors_total = UINT_MAX; + unsigned i, nr_devs = 0; + unsigned unwritten_sectors = j->reservations.prev_buf_unwritten + ? 
journal_prev_buf(j)->sectors + : 0; + + rcu_read_lock(); + for_each_member_device_rcu(ca, c, i, + &c->rw_devs[BCH_DATA_JOURNAL]) { + struct journal_device *ja = &ca->journal; + unsigned buckets_this_device, sectors_this_device; + + if (!ja->nr) + continue; + + buckets_this_device = bch2_journal_dev_buckets_available(j, ja, from); + sectors_this_device = ja->sectors_free; + + /* + * Note that we don't allocate the space for a journal entry + * until we write it out - thus, account for it here: + */ + if (unwritten_sectors >= sectors_this_device) { + if (!buckets_this_device) + continue; + + buckets_this_device--; + sectors_this_device = ca->mi.bucket_size; + } + + sectors_this_device -= unwritten_sectors; + + if (sectors_this_device < ca->mi.bucket_size && + buckets_this_device) { + buckets_this_device--; + sectors_this_device = ca->mi.bucket_size; + } + + if (!sectors_this_device) + continue; + + sectors_next_entry = min(sectors_next_entry, + sectors_this_device); + + sectors_total = min(sectors_total, + buckets_this_device * ca->mi.bucket_size + + sectors_this_device); + + nr_devs++; + } + rcu_read_unlock(); + + if (nr_devs < nr_devs_want) + return (struct journal_space) { 0, 0 }; + + return (struct journal_space) { + .next_entry = sectors_next_entry, + .remaining = max_t(int, 0, sectors_total - sectors_next_entry), + }; } -u64 bch2_journal_pin_seq(struct journal *j, struct journal_entry_pin *pin) +void bch2_journal_space_available(struct journal *j) { - u64 ret = 0; + struct bch_fs *c = container_of(j, struct bch_fs, journal); + struct bch_dev *ca; + struct journal_space discarded, clean_ondisk, clean; + unsigned overhead, u64s_remaining = 0; + unsigned max_entry_size = min(j->buf[0].buf_size >> 9, + j->buf[1].buf_size >> 9); + unsigned i, nr_online = 0, nr_devs_want; + bool can_discard = false; + int ret = 0; + + lockdep_assert_held(&j->lock); + + rcu_read_lock(); + for_each_member_device_rcu(ca, c, i, + &c->rw_devs[BCH_DATA_JOURNAL]) { + struct journal_device *ja = &ca->journal; + + if (!ja->nr) + continue; + + while (ja->dirty_idx != ja->cur_idx && + ja->bucket_seq[ja->dirty_idx] < journal_last_seq(j)) + ja->dirty_idx = (ja->dirty_idx + 1) % ja->nr; + + while (ja->dirty_idx_ondisk != ja->dirty_idx && + ja->bucket_seq[ja->dirty_idx_ondisk] < j->last_seq_ondisk) + ja->dirty_idx_ondisk = (ja->dirty_idx_ondisk + 1) % ja->nr; + + if (ja->discard_idx != ja->dirty_idx_ondisk) + can_discard = true; + + max_entry_size = min_t(unsigned, max_entry_size, ca->mi.bucket_size); + nr_online++; + } + rcu_read_unlock(); + + j->can_discard = can_discard; + + if (nr_online < c->opts.metadata_replicas_required) { + ret = -EROFS; + goto out; + } + + if (!fifo_free(&j->pin)) { + ret = -ENOSPC; + goto out; + } + + nr_devs_want = min_t(unsigned, nr_online, c->opts.metadata_replicas); + + discarded = __journal_space_available(j, nr_devs_want, journal_space_discarded); + clean_ondisk = __journal_space_available(j, nr_devs_want, journal_space_clean_ondisk); + clean = __journal_space_available(j, nr_devs_want, journal_space_clean); + + if (!discarded.next_entry) + ret = -ENOSPC; + + overhead = DIV_ROUND_UP(clean.remaining, max_entry_size) * + journal_entry_overhead(j); + u64s_remaining = clean.remaining << 6; + u64s_remaining = max_t(int, 0, u64s_remaining - overhead); + u64s_remaining /= 4; +out: + j->cur_entry_sectors = !ret ?
discarded.next_entry : 0; + j->cur_entry_error = ret; + journal_set_remaining(j, u64s_remaining); + journal_check_may_get_unreserved(j); + + if (!ret) + journal_wake(j); +} + +/* Discards - last part of journal reclaim: */ + +static bool should_discard_bucket(struct journal *j, struct journal_device *ja) +{ + bool ret; spin_lock(&j->lock); - if (journal_pin_active(pin)) - ret = journal_pin_seq(j, pin->pin_list); + ret = ja->discard_idx != ja->dirty_idx_ondisk; spin_unlock(&j->lock); return ret; } +/* + * Advance ja->discard_idx as long as it points to buckets that are no longer + * dirty, issuing discards if necessary: + */ +void bch2_journal_do_discards(struct journal *j) +{ + struct bch_fs *c = container_of(j, struct bch_fs, journal); + struct bch_dev *ca; + unsigned iter; + + mutex_lock(&j->discard_lock); + + for_each_rw_member(ca, c, iter) { + struct journal_device *ja = &ca->journal; + + while (should_discard_bucket(j, ja)) { + if (ca->mi.discard && + blk_queue_discard(bdev_get_queue(ca->disk_sb.bdev))) + blkdev_issue_discard(ca->disk_sb.bdev, + bucket_to_sector(ca, + ja->buckets[ja->discard_idx]), + ca->mi.bucket_size, GFP_NOIO, 0); + + spin_lock(&j->lock); + ja->discard_idx = (ja->discard_idx + 1) % ja->nr; + + bch2_journal_space_available(j); + spin_unlock(&j->lock); + } + } + + mutex_unlock(&j->discard_lock); +} + +/* + * Journal entry pinning - machinery for holding a reference on a given journal + * entry, holding it open to ensure it gets replayed during recovery: + */ + +static void bch2_journal_reclaim_fast(struct journal *j) +{ + struct journal_entry_pin_list temp; + bool popped = false; + + lockdep_assert_held(&j->lock); + + /* + * Unpin journal entries whose reference counts reached zero, meaning + * all btree nodes got written out + */ + while (!fifo_empty(&j->pin) && + !atomic_read(&fifo_peek_front(&j->pin).count)) { + BUG_ON(!list_empty(&fifo_peek_front(&j->pin).list)); + BUG_ON(!fifo_pop(&j->pin, temp)); + popped = true; + } + + if (popped) + bch2_journal_space_available(j); +} + +void bch2_journal_pin_put(struct journal *j, u64 seq) +{ + struct journal_entry_pin_list *pin_list = journal_seq_pin(j, seq); + + if (atomic_dec_and_test(&pin_list->count)) { + spin_lock(&j->lock); + bch2_journal_reclaim_fast(j); + spin_unlock(&j->lock); + } +} + static inline void __journal_pin_add(struct journal *j, - struct journal_entry_pin_list *pin_list, + u64 seq, struct journal_entry_pin *pin, journal_pin_flush_fn flush_fn) { + struct journal_entry_pin_list *pin_list = journal_seq_pin(j, seq); + BUG_ON(journal_pin_active(pin)); BUG_ON(!atomic_read(&pin_list->count)); atomic_inc(&pin_list->count); - pin->pin_list = pin_list; + pin->seq = seq; pin->flush = flush_fn; - if (flush_fn) - list_add(&pin->list, &pin_list->list); - else - INIT_LIST_HEAD(&pin->list); + list_add(&pin->list, flush_fn ? 
&pin_list->list : &pin_list->flushed); /* * If the journal is currently full, we might want to call flush_fn @@ -57,19 +318,20 @@ void bch2_journal_pin_add(struct journal *j, u64 seq, journal_pin_flush_fn flush_fn) { spin_lock(&j->lock); - __journal_pin_add(j, journal_seq_pin(j, seq), pin, flush_fn); + __journal_pin_add(j, seq, pin, flush_fn); spin_unlock(&j->lock); } static inline void __journal_pin_drop(struct journal *j, struct journal_entry_pin *pin) { - struct journal_entry_pin_list *pin_list = pin->pin_list; + struct journal_entry_pin_list *pin_list; if (!journal_pin_active(pin)) return; - pin->pin_list = NULL; + pin_list = journal_seq_pin(j, pin->seq); + pin->seq = 0; list_del_init(&pin->list); /* @@ -79,16 +341,38 @@ static inline void __journal_pin_drop(struct journal *j, if (atomic_dec_and_test(&pin_list->count) && pin_list == &fifo_peek_front(&j->pin)) bch2_journal_reclaim_fast(j); + else if (fifo_used(&j->pin) == 1 && + atomic_read(&pin_list->count) == 1) + journal_wake(j); } void bch2_journal_pin_drop(struct journal *j, - struct journal_entry_pin *pin) + struct journal_entry_pin *pin) { spin_lock(&j->lock); __journal_pin_drop(j, pin); spin_unlock(&j->lock); } +void bch2_journal_pin_update(struct journal *j, u64 seq, + struct journal_entry_pin *pin, + journal_pin_flush_fn flush_fn) +{ + spin_lock(&j->lock); + + if (pin->seq != seq) { + __journal_pin_drop(j, pin); + __journal_pin_add(j, seq, pin, flush_fn); + } else { + struct journal_entry_pin_list *pin_list = + journal_seq_pin(j, seq); + + list_move(&pin->list, &pin_list->list); + } + + spin_unlock(&j->lock); +} + void bch2_journal_pin_add_if_older(struct journal *j, struct journal_entry_pin *src_pin, struct journal_entry_pin *pin, @@ -98,15 +382,21 @@ void bch2_journal_pin_add_if_older(struct journal *j, if (journal_pin_active(src_pin) && (!journal_pin_active(pin) || - journal_pin_seq(j, src_pin->pin_list) < - journal_pin_seq(j, pin->pin_list))) { + src_pin->seq < pin->seq)) { __journal_pin_drop(j, pin); - __journal_pin_add(j, src_pin->pin_list, pin, flush_fn); + __journal_pin_add(j, src_pin->seq, pin, flush_fn); } spin_unlock(&j->lock); } +void bch2_journal_pin_flush(struct journal *j, struct journal_entry_pin *pin) +{ + BUG_ON(journal_pin_active(pin)); + + wait_event(j->pin_flush_wait, j->flush_in_progress != pin); +} + /* * Journal reclaim: flush references to open journal entries to reclaim space in * the journal @@ -116,88 +406,55 @@ void bch2_journal_pin_add_if_older(struct journal *j, * data off of a specific device: */ -/** - * bch2_journal_reclaim_fast - do the fast part of journal reclaim - * - * Called from IO submission context, does not block. Cleans up after btree - * write completions by advancing the journal pin and each cache's last_idx, - * kicking off discards and background reclaim as necessary. 
- */ -void bch2_journal_reclaim_fast(struct journal *j) -{ - struct journal_entry_pin_list temp; - bool popped = false; - - lockdep_assert_held(&j->lock); - - /* - * Unpin journal entries whose reference counts reached zero, meaning - * all btree nodes got written out - */ - while (!atomic_read(&fifo_peek_front(&j->pin).count)) { - BUG_ON(!list_empty(&fifo_peek_front(&j->pin).list)); - BUG_ON(!fifo_pop(&j->pin, temp)); - popped = true; - } - - if (popped) - journal_wake(j); -} - static struct journal_entry_pin * -__journal_get_next_pin(struct journal *j, u64 seq_to_flush, u64 *seq) +journal_get_next_pin(struct journal *j, u64 max_seq, u64 *seq) { struct journal_entry_pin_list *pin_list; - struct journal_entry_pin *ret; - u64 iter; + struct journal_entry_pin *ret = NULL; - /* no need to iterate over empty fifo entries: */ - bch2_journal_reclaim_fast(j); + spin_lock(&j->lock); - fifo_for_each_entry_ptr(pin_list, &j->pin, iter) { - if (iter > seq_to_flush) + fifo_for_each_entry_ptr(pin_list, &j->pin, *seq) + if (*seq > max_seq || + (ret = list_first_entry_or_null(&pin_list->list, + struct journal_entry_pin, list))) break; - ret = list_first_entry_or_null(&pin_list->list, - struct journal_entry_pin, list); - if (ret) { - /* must be list_del_init(), see bch2_journal_pin_drop() */ - list_move(&ret->list, &pin_list->flushed); - *seq = iter; - return ret; - } + if (ret) { + list_move(&ret->list, &pin_list->flushed); + BUG_ON(j->flush_in_progress); + j->flush_in_progress = ret; + j->last_flushed = jiffies; } - return NULL; -} - -static struct journal_entry_pin * -journal_get_next_pin(struct journal *j, u64 seq_to_flush, u64 *seq) -{ - struct journal_entry_pin *ret; - - spin_lock(&j->lock); - ret = __journal_get_next_pin(j, seq_to_flush, seq); spin_unlock(&j->lock); return ret; } -static bool should_discard_bucket(struct journal *j, struct journal_device *ja) +static void journal_flush_pins(struct journal *j, u64 seq_to_flush, + unsigned min_nr) { - bool ret; + struct journal_entry_pin *pin; + u64 seq; - spin_lock(&j->lock); - ret = ja->nr && - (ja->last_idx != ja->cur_idx && - ja->bucket_seq[ja->last_idx] < j->last_seq_ondisk); - spin_unlock(&j->lock); + lockdep_assert_held(&j->reclaim_lock); - return ret; + while ((pin = journal_get_next_pin(j, min_nr + ? U64_MAX : seq_to_flush, &seq))) { + if (min_nr) + min_nr--; + + pin->flush(j, pin, seq); + + BUG_ON(j->flush_in_progress != pin); + j->flush_in_progress = NULL; + wake_up(&j->pin_flush_wait); + } } /** - * bch2_journal_reclaim_work - free up journal buckets + * bch2_journal_reclaim - free up journal buckets * * Background journal reclaim writes out btree nodes. It should be run * early enough so that we never completely run out of journal buckets. @@ -214,75 +471,42 @@ static bool should_discard_bucket(struct journal *j, struct journal_device *ja) * 512 journal entries or 25% of all journal buckets, then * journal_next_bucket() should not stall. 
*/ -void bch2_journal_reclaim_work(struct work_struct *work) +void bch2_journal_reclaim(struct journal *j) { - struct bch_fs *c = container_of(to_delayed_work(work), - struct bch_fs, journal.reclaim_work); - struct journal *j = &c->journal; + struct bch_fs *c = container_of(j, struct bch_fs, journal); struct bch_dev *ca; - struct journal_entry_pin *pin; - u64 seq, seq_to_flush = 0; - unsigned iter, bucket_to_flush; - unsigned long next_flush; - bool reclaim_lock_held = false, need_flush; + unsigned iter, min_nr = 0; + u64 seq_to_flush = 0; + + lockdep_assert_held(&j->reclaim_lock); + + bch2_journal_do_discards(j); + + spin_lock(&j->lock); - /* - * Advance last_idx to point to the oldest journal entry containing - * btree node updates that have not yet been written out - */ for_each_rw_member(ca, c, iter) { struct journal_device *ja = &ca->journal; + unsigned nr_buckets, bucket_to_flush; if (!ja->nr) continue; - while (should_discard_bucket(j, ja)) { - if (!reclaim_lock_held) { - /* - * ugh: - * might be called from __journal_res_get() - * under wait_event() - have to go back to - * TASK_RUNNING before doing something that - * would block, but only if we're doing work: - */ - __set_current_state(TASK_RUNNING); - - mutex_lock(&j->reclaim_lock); - reclaim_lock_held = true; - /* recheck under reclaim_lock: */ - continue; - } + /* Try to keep the journal at most half full: */ + nr_buckets = ja->nr / 2; - if (ca->mi.discard && - blk_queue_discard(bdev_get_queue(ca->disk_sb.bdev))) - blkdev_issue_discard(ca->disk_sb.bdev, - bucket_to_sector(ca, - ja->buckets[ja->last_idx]), - ca->mi.bucket_size, GFP_NOIO, 0); + /* And include pre-reservations: */ + nr_buckets += DIV_ROUND_UP(j->prereserved.reserved, + (ca->mi.bucket_size << 6) - + journal_entry_overhead(j)); - spin_lock(&j->lock); - ja->last_idx = (ja->last_idx + 1) % ja->nr; - spin_unlock(&j->lock); - - journal_wake(j); - } + nr_buckets = min(nr_buckets, ja->nr); - /* - * Write out enough btree nodes to free up 50% journal - * buckets - */ - spin_lock(&j->lock); - bucket_to_flush = (ja->cur_idx + (ja->nr >> 1)) % ja->nr; - seq_to_flush = max_t(u64, seq_to_flush, - ja->bucket_seq[bucket_to_flush]); - spin_unlock(&j->lock); + bucket_to_flush = (ja->cur_idx + nr_buckets) % ja->nr; + seq_to_flush = max(seq_to_flush, + ja->bucket_seq[bucket_to_flush]); } - if (reclaim_lock_held) - mutex_unlock(&j->reclaim_lock); - /* Also flush if the pin fifo is more than half full */ - spin_lock(&j->lock); seq_to_flush = max_t(s64, seq_to_flush, (s64) journal_cur_seq(j) - (j->pin.size >> 1)); @@ -292,86 +516,72 @@ void bch2_journal_reclaim_work(struct work_struct *work) * If it's been longer than j->reclaim_delay_ms since we last flushed, * make sure to flush at least one journal pin: */ - next_flush = j->last_flushed + msecs_to_jiffies(j->reclaim_delay_ms); - need_flush = time_after(jiffies, next_flush); - - while ((pin = journal_get_next_pin(j, need_flush - ? 
U64_MAX - : seq_to_flush, &seq))) { - __set_current_state(TASK_RUNNING); - pin->flush(j, pin, seq); - need_flush = false; + if (time_after(jiffies, j->last_flushed + + msecs_to_jiffies(j->reclaim_delay_ms))) + min_nr = 1; - j->last_flushed = jiffies; + if (j->prereserved.reserved * 2 > j->prereserved.remaining) { + seq_to_flush = max(seq_to_flush, journal_last_seq(j)); + min_nr = 1; } - if (!test_bit(BCH_FS_RO, &c->flags)) - queue_delayed_work(system_freezable_wq, &j->reclaim_work, + journal_flush_pins(j, seq_to_flush, min_nr); + + if (!bch2_journal_error(j)) + queue_delayed_work(c->journal_reclaim_wq, &j->reclaim_work, msecs_to_jiffies(j->reclaim_delay_ms)); } -static int journal_flush_done(struct journal *j, u64 seq_to_flush, - struct journal_entry_pin **pin, - u64 *pin_seq) +void bch2_journal_reclaim_work(struct work_struct *work) { - int ret; + struct journal *j = container_of(to_delayed_work(work), + struct journal, reclaim_work); - *pin = NULL; + mutex_lock(&j->reclaim_lock); + bch2_journal_reclaim(j); + mutex_unlock(&j->reclaim_lock); +} + +static int journal_flush_done(struct journal *j, u64 seq_to_flush) +{ + int ret; ret = bch2_journal_error(j); if (ret) return ret; + mutex_lock(&j->reclaim_lock); + + journal_flush_pins(j, seq_to_flush, 0); + spin_lock(&j->lock); /* * If journal replay hasn't completed, the unreplayed journal entries * hold refs on their corresponding sequence numbers */ - ret = (*pin = __journal_get_next_pin(j, seq_to_flush, pin_seq)) != NULL || - !test_bit(JOURNAL_REPLAY_DONE, &j->flags) || + ret = !test_bit(JOURNAL_REPLAY_DONE, &j->flags) || journal_last_seq(j) > seq_to_flush || (fifo_used(&j->pin) == 1 && atomic_read(&fifo_peek_front(&j->pin).count) == 1); + spin_unlock(&j->lock); + mutex_unlock(&j->reclaim_lock); return ret; } -int bch2_journal_flush_pins(struct journal *j, u64 seq_to_flush) +void bch2_journal_flush_pins(struct journal *j, u64 seq_to_flush) { - struct bch_fs *c = container_of(j, struct bch_fs, journal); - struct journal_entry_pin *pin; - u64 pin_seq; - bool flush; - if (!test_bit(JOURNAL_STARTED, &j->flags)) - return 0; -again: - wait_event(j->wait, journal_flush_done(j, seq_to_flush, &pin, &pin_seq)); - if (pin) { - /* flushing a journal pin might cause a new one to be added: */ - pin->flush(j, pin, pin_seq); - goto again; - } - - spin_lock(&j->lock); - flush = journal_last_seq(j) != j->last_seq_ondisk || - (seq_to_flush == U64_MAX && c->btree_roots_dirty); - spin_unlock(&j->lock); - - return flush ? 
bch2_journal_meta(j) : 0; -} + return; -int bch2_journal_flush_all_pins(struct journal *j) -{ - return bch2_journal_flush_pins(j, U64_MAX); + closure_wait_event(&j->async_wait, journal_flush_done(j, seq_to_flush)); } int bch2_journal_flush_device_pins(struct journal *j, int dev_idx) { struct bch_fs *c = container_of(j, struct bch_fs, journal); struct journal_entry_pin_list *p; - struct bch_devs_list devs; u64 iter, seq = 0; int ret = 0; @@ -383,7 +593,9 @@ int bch2_journal_flush_device_pins(struct journal *j, int dev_idx) seq = iter; spin_unlock(&j->lock); - ret = bch2_journal_flush_pins(j, seq); + bch2_journal_flush_pins(j, seq); + + ret = bch2_journal_error(j); if (ret) return ret; @@ -394,17 +606,20 @@ int bch2_journal_flush_device_pins(struct journal *j, int dev_idx) spin_lock(&j->lock); while (!ret && seq < j->pin.back) { + struct bch_replicas_padded replicas; + seq = max(seq, journal_last_seq(j)); - devs = journal_seq_pin(j, seq)->devs; + bch2_devlist_to_replicas(&replicas.e, BCH_DATA_JOURNAL, + journal_seq_pin(j, seq)->devs); seq++; spin_unlock(&j->lock); - ret = bch2_mark_replicas(c, BCH_DATA_JOURNAL, devs); + ret = bch2_mark_replicas(c, &replicas.e); spin_lock(&j->lock); } spin_unlock(&j->lock); - bch2_replicas_gc_end(c, ret); + ret = bch2_replicas_gc_end(c, ret); mutex_unlock(&c->replicas_gc_lock); return ret; diff --git a/fs/bcachefs/journal_reclaim.h b/fs/bcachefs/journal_reclaim.h index 7d460c35cfae..9bf982a17797 100644 --- a/fs/bcachefs/journal_reclaim.h +++ b/fs/bcachefs/journal_reclaim.h @@ -1,36 +1,57 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _BCACHEFS_JOURNAL_RECLAIM_H #define _BCACHEFS_JOURNAL_RECLAIM_H #define JOURNAL_PIN (32 * 1024) +enum journal_space_from { + journal_space_discarded, + journal_space_clean_ondisk, + journal_space_clean, +}; + +unsigned bch2_journal_dev_buckets_available(struct journal *, + struct journal_device *, + enum journal_space_from); +void bch2_journal_space_available(struct journal *); + static inline bool journal_pin_active(struct journal_entry_pin *pin) { - return pin->pin_list != NULL; + return pin->seq != 0; } static inline struct journal_entry_pin_list * journal_seq_pin(struct journal *j, u64 seq) { - BUG_ON(seq < j->pin.front || seq >= j->pin.back); + EBUG_ON(seq < j->pin.front || seq >= j->pin.back); return &j->pin.data[seq & j->pin.mask]; } -u64 bch2_journal_pin_seq(struct journal *, struct journal_entry_pin *); +void bch2_journal_pin_put(struct journal *, u64); void bch2_journal_pin_add(struct journal *, u64, struct journal_entry_pin *, journal_pin_flush_fn); +void bch2_journal_pin_update(struct journal *, u64, struct journal_entry_pin *, + journal_pin_flush_fn); void bch2_journal_pin_drop(struct journal *, struct journal_entry_pin *); void bch2_journal_pin_add_if_older(struct journal *, struct journal_entry_pin *, struct journal_entry_pin *, journal_pin_flush_fn); +void bch2_journal_pin_flush(struct journal *, struct journal_entry_pin *); -void bch2_journal_reclaim_fast(struct journal *); +void bch2_journal_do_discards(struct journal *); +void bch2_journal_reclaim(struct journal *); void bch2_journal_reclaim_work(struct work_struct *); -int bch2_journal_flush_pins(struct journal *, u64); -int bch2_journal_flush_all_pins(struct journal *); +void bch2_journal_flush_pins(struct journal *, u64); + +static inline void bch2_journal_flush_all_pins(struct journal *j) +{ + bch2_journal_flush_pins(j, U64_MAX); +} + int bch2_journal_flush_device_pins(struct journal *, int); #endif /* _BCACHEFS_JOURNAL_RECLAIM_H */ diff --git 
a/fs/bcachefs/journal_seq_blacklist.c b/fs/bcachefs/journal_seq_blacklist.c index 567289e22ca0..787d9f7638d0 100644 --- a/fs/bcachefs/journal_seq_blacklist.c +++ b/fs/bcachefs/journal_seq_blacklist.c @@ -1,12 +1,10 @@ +// SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" -#include "btree_update.h" -#include "btree_update_interior.h" -#include "error.h" -#include "journal.h" -#include "journal_io.h" -#include "journal_reclaim.h" +#include "btree_iter.h" +#include "eytzinger.h" #include "journal_seq_blacklist.h" +#include "super-io.h" /* * journal_seq_blacklist machinery: @@ -36,323 +34,285 @@ * record that it was blacklisted so that a) on recovery we don't think we have * missing journal entries and b) so that the btree code continues to ignore * that bset, until that btree node is rewritten. - * - * Blacklisted journal sequence numbers are themselves recorded as entries in - * the journal. */ -/* - * Called when journal needs to evict a blacklist entry to reclaim space: find - * any btree nodes that refer to the blacklist journal sequence numbers, and - * rewrite them: - */ -static void journal_seq_blacklist_flush(struct journal *j, - struct journal_entry_pin *pin, u64 seq) +static unsigned +blacklist_nr_entries(struct bch_sb_field_journal_seq_blacklist *bl) { - struct bch_fs *c = - container_of(j, struct bch_fs, journal); - struct journal_seq_blacklist *bl = - container_of(pin, struct journal_seq_blacklist, pin); - struct blacklisted_node n; - struct closure cl; - unsigned i; - int ret; - - closure_init_stack(&cl); + return bl + ? ((vstruct_end(&bl->field) - (void *) &bl->start[0]) / + sizeof(struct journal_seq_blacklist_entry)) + : 0; +} - for (i = 0;; i++) { - struct btree_iter iter; - struct btree *b; +static unsigned sb_blacklist_u64s(unsigned nr) +{ + struct bch_sb_field_journal_seq_blacklist *bl; - mutex_lock(&j->blacklist_lock); - if (i >= bl->nr_entries) { - mutex_unlock(&j->blacklist_lock); - break; - } - n = bl->entries[i]; - mutex_unlock(&j->blacklist_lock); + return (sizeof(*bl) + sizeof(bl->start[0]) * nr) / sizeof(u64); +} - __bch2_btree_iter_init(&iter, c, n.btree_id, n.pos, 0, 0, 0); +static struct bch_sb_field_journal_seq_blacklist * +blacklist_entry_try_merge(struct bch_fs *c, + struct bch_sb_field_journal_seq_blacklist *bl, + unsigned i) +{ + unsigned nr = blacklist_nr_entries(bl); + + if (le64_to_cpu(bl->start[i].end) >= + le64_to_cpu(bl->start[i + 1].start)) { + bl->start[i].end = bl->start[i + 1].end; + --nr; + memmove(&bl->start[i], + &bl->start[i + 1], + sizeof(bl->start[0]) * (nr - i)); + + bl = bch2_sb_resize_journal_seq_blacklist(&c->disk_sb, + sb_blacklist_u64s(nr)); + BUG_ON(!bl); + } - b = bch2_btree_iter_peek_node(&iter); + return bl; +} - /* The node might have already been rewritten: */ +int bch2_journal_seq_blacklist_add(struct bch_fs *c, u64 start, u64 end) +{ + struct bch_sb_field_journal_seq_blacklist *bl; + unsigned i, nr; + int ret = 0; - if (b->data->keys.seq == n.seq) { - ret = bch2_btree_node_rewrite(c, &iter, n.seq, 0); - if (ret) { - bch2_btree_iter_unlock(&iter); - bch2_fs_fatal_error(c, - "error %i rewriting btree node with blacklisted journal seq", - ret); - bch2_journal_halt(j); - return; - } - } + mutex_lock(&c->sb_lock); + bl = bch2_sb_get_journal_seq_blacklist(c->disk_sb.sb); + nr = blacklist_nr_entries(bl); - bch2_btree_iter_unlock(&iter); - } + if (bl) { + for (i = 0; i < nr; i++) { + struct journal_seq_blacklist_entry *e = + bl->start + i; - for (i = 0;; i++) { - struct btree_update *as; - struct pending_btree_node_free *d; + if 
(start == le64_to_cpu(e->start) && + end == le64_to_cpu(e->end)) + goto out; - mutex_lock(&j->blacklist_lock); - if (i >= bl->nr_entries) { - mutex_unlock(&j->blacklist_lock); - break; - } - n = bl->entries[i]; - mutex_unlock(&j->blacklist_lock); -redo_wait: - mutex_lock(&c->btree_interior_update_lock); - - /* - * Is the node on the list of pending interior node updates - - * being freed? If so, wait for that to finish: - */ - for_each_pending_btree_node_free(c, as, d) - if (n.seq == d->seq && - n.btree_id == d->btree_id && - !d->level && - !bkey_cmp(n.pos, d->key.k.p)) { - closure_wait(&as->wait, &cl); - mutex_unlock(&c->btree_interior_update_lock); - closure_sync(&cl); - goto redo_wait; + if (start <= le64_to_cpu(e->start) && + end >= le64_to_cpu(e->end)) { + e->start = cpu_to_le64(start); + e->end = cpu_to_le64(end); + + if (i + 1 < nr) + bl = blacklist_entry_try_merge(c, + bl, i); + if (i) + bl = blacklist_entry_try_merge(c, + bl, i - 1); + goto out_write_sb; } + } + } - mutex_unlock(&c->btree_interior_update_lock); + bl = bch2_sb_resize_journal_seq_blacklist(&c->disk_sb, + sb_blacklist_u64s(nr + 1)); + if (!bl) { + ret = -ENOMEM; + goto out; } - mutex_lock(&j->blacklist_lock); + bl->start[nr].start = cpu_to_le64(start); + bl->start[nr].end = cpu_to_le64(end); +out_write_sb: + c->disk_sb.sb->features[0] |= + 1ULL << BCH_FEATURE_JOURNAL_SEQ_BLACKLIST_V3; - bch2_journal_pin_drop(j, &bl->pin); - list_del(&bl->list); - kfree(bl->entries); - kfree(bl); + ret = bch2_write_super(c); +out: + mutex_unlock(&c->sb_lock); - mutex_unlock(&j->blacklist_lock); + return ret; } -/* - * Determine if a particular sequence number is blacklisted - if so, return - * blacklist entry: - */ -struct journal_seq_blacklist * -bch2_journal_seq_blacklist_find(struct journal *j, u64 seq) +static int journal_seq_blacklist_table_cmp(const void *_l, + const void *_r, size_t size) { - struct journal_seq_blacklist *bl; + const struct journal_seq_blacklist_table_entry *l = _l; + const struct journal_seq_blacklist_table_entry *r = _r; - lockdep_assert_held(&j->blacklist_lock); - - list_for_each_entry(bl, &j->seq_blacklist, list) - if (seq >= bl->start && seq <= bl->end) - return bl; - - return NULL; + return cmp_int(l->start, r->start); } -/* - * Allocate a new, in memory blacklist entry: - */ -static struct journal_seq_blacklist * -bch2_journal_seq_blacklisted_new(struct journal *j, u64 start, u64 end) +bool bch2_journal_seq_is_blacklisted(struct bch_fs *c, u64 seq, + bool dirty) { - struct journal_seq_blacklist *bl; + struct journal_seq_blacklist_table *t = c->journal_seq_blacklist_table; + struct journal_seq_blacklist_table_entry search = { .start = seq }; + int idx; - lockdep_assert_held(&j->blacklist_lock); + if (!t) + return false; - /* - * When we start the journal, bch2_journal_start() will skip over @seq: - */ + idx = eytzinger0_find_le(t->entries, t->nr, + sizeof(t->entries[0]), + journal_seq_blacklist_table_cmp, + &search); + if (idx < 0) + return false; - bl = kzalloc(sizeof(*bl), GFP_KERNEL); - if (!bl) - return NULL; + BUG_ON(t->entries[idx].start > seq); - bl->start = start; - bl->end = end; + if (seq >= t->entries[idx].end) + return false; - list_add_tail(&bl->list, &j->seq_blacklist); - return bl; + if (dirty) + t->entries[idx].dirty = true; + return true; } -/* - * Returns true if @seq is newer than the most recent journal entry that got - * written, and data corresponding to @seq should be ignored - also marks @seq - * as blacklisted so that on future restarts the corresponding data will still - * be 
ignored: - */ -int bch2_journal_seq_should_ignore(struct bch_fs *c, u64 seq, struct btree *b) +int bch2_blacklist_table_initialize(struct bch_fs *c) { - struct journal *j = &c->journal; - struct journal_seq_blacklist *bl = NULL; - struct blacklisted_node *n; - u64 journal_seq; - int ret = 0; + struct bch_sb_field_journal_seq_blacklist *bl = + bch2_sb_get_journal_seq_blacklist(c->disk_sb.sb); + struct journal_seq_blacklist_table *t; + unsigned i, nr = blacklist_nr_entries(bl); - if (!seq) - return 0; + BUG_ON(c->journal_seq_blacklist_table); - spin_lock(&j->lock); - journal_seq = journal_cur_seq(j); - spin_unlock(&j->lock); - - /* Interier updates aren't journalled: */ - BUG_ON(b->level); - BUG_ON(seq > journal_seq && test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags)); + if (!bl) + return 0; - /* - * Decrease this back to j->seq + 2 when we next rev the on disk format: - * increasing it temporarily to work around bug in old kernels - */ - fsck_err_on(seq > journal_seq + 4, c, - "bset journal seq too far in the future: %llu > %llu", - seq, journal_seq); + t = kzalloc(sizeof(*t) + sizeof(t->entries[0]) * nr, + GFP_KERNEL); + if (!t) + return -ENOMEM; - if (seq <= journal_seq && - list_empty_careful(&j->seq_blacklist)) - return 0; + t->nr = nr; - mutex_lock(&j->blacklist_lock); - - if (seq <= journal_seq) { - bl = bch2_journal_seq_blacklist_find(j, seq); - if (!bl) - goto out; - } else { - bch_verbose(c, "btree node %u:%llu:%llu has future journal sequence number %llu, blacklisting", - b->btree_id, b->key.k.p.inode, b->key.k.p.offset, seq); - - if (!j->new_blacklist) { - j->new_blacklist = bch2_journal_seq_blacklisted_new(j, - journal_seq + 1, - journal_seq + 1); - if (!j->new_blacklist) { - ret = -ENOMEM; - goto out; - } - } - bl = j->new_blacklist; - bl->end = max(bl->end, seq); + for (i = 0; i < nr; i++) { + t->entries[i].start = le64_to_cpu(bl->start[i].start); + t->entries[i].end = le64_to_cpu(bl->start[i].end); } - for (n = bl->entries; n < bl->entries + bl->nr_entries; n++) - if (b->data->keys.seq == n->seq && - b->btree_id == n->btree_id && - !bkey_cmp(b->key.k.p, n->pos)) - goto found_entry; - - if (!bl->nr_entries || - is_power_of_2(bl->nr_entries)) { - n = krealloc(bl->entries, - max_t(size_t, bl->nr_entries * 2, 8) * sizeof(*n), - GFP_KERNEL); - if (!n) { - ret = -ENOMEM; - goto out; - } - bl->entries = n; - } + eytzinger0_sort(t->entries, + t->nr, + sizeof(t->entries[0]), + journal_seq_blacklist_table_cmp, + NULL); - bl->entries[bl->nr_entries++] = (struct blacklisted_node) { - .seq = b->data->keys.seq, - .btree_id = b->btree_id, - .pos = b->key.k.p, - }; -found_entry: - ret = 1; -out: -fsck_err: - mutex_unlock(&j->blacklist_lock); - return ret; + c->journal_seq_blacklist_table = t; + return 0; } -static int __bch2_journal_seq_blacklist_read(struct journal *j, - struct journal_replay *i, - u64 start, u64 end) +static const char * +bch2_sb_journal_seq_blacklist_validate(struct bch_sb *sb, + struct bch_sb_field *f) { - struct bch_fs *c = container_of(j, struct bch_fs, journal); - struct journal_seq_blacklist *bl; - - bch_verbose(c, "blacklisting existing journal seq %llu-%llu", - start, end); + struct bch_sb_field_journal_seq_blacklist *bl = + field_to_type(f, journal_seq_blacklist); + struct journal_seq_blacklist_entry *i; + unsigned nr = blacklist_nr_entries(bl); + + for (i = bl->start; i < bl->start + nr; i++) { + if (le64_to_cpu(i->start) >= + le64_to_cpu(i->end)) + return "entry start >= end"; + + if (i + 1 < bl->start + nr && + le64_to_cpu(i[0].end) > + le64_to_cpu(i[1].start)) + 
return "entries out of order"; + } - bl = bch2_journal_seq_blacklisted_new(j, start, end); - if (!bl) - return -ENOMEM; + return NULL; +} - bch2_journal_pin_add(j, le64_to_cpu(i->j.seq), &bl->pin, - journal_seq_blacklist_flush); - return 0; +static void bch2_sb_journal_seq_blacklist_to_text(struct printbuf *out, + struct bch_sb *sb, + struct bch_sb_field *f) +{ + struct bch_sb_field_journal_seq_blacklist *bl = + field_to_type(f, journal_seq_blacklist); + struct journal_seq_blacklist_entry *i; + unsigned nr = blacklist_nr_entries(bl); + + for (i = bl->start; i < bl->start + nr; i++) { + if (i != bl->start) + pr_buf(out, " "); + + pr_buf(out, "%llu-%llu", + le64_to_cpu(i->start), + le64_to_cpu(i->end)); + } } -/* - * After reading the journal, find existing journal seq blacklist entries and - * read them into memory: - */ -int bch2_journal_seq_blacklist_read(struct journal *j, - struct journal_replay *i) +const struct bch_sb_field_ops bch_sb_field_ops_journal_seq_blacklist = { + .validate = bch2_sb_journal_seq_blacklist_validate, + .to_text = bch2_sb_journal_seq_blacklist_to_text +}; + +void bch2_blacklist_entries_gc(struct work_struct *work) { - struct jset_entry *entry; - int ret = 0; + struct bch_fs *c = container_of(work, struct bch_fs, + journal_seq_blacklist_gc_work); + struct journal_seq_blacklist_table *t; + struct bch_sb_field_journal_seq_blacklist *bl; + struct journal_seq_blacklist_entry *src, *dst; + struct btree_trans trans; + unsigned i, nr, new_nr; + int ret; - vstruct_for_each(&i->j, entry) { - switch (entry->type) { - case BCH_JSET_ENTRY_blacklist: { - struct jset_entry_blacklist *bl_entry = - container_of(entry, struct jset_entry_blacklist, entry); + bch2_trans_init(&trans, c, 0, 0); - ret = __bch2_journal_seq_blacklist_read(j, i, - le64_to_cpu(bl_entry->seq), - le64_to_cpu(bl_entry->seq)); - break; - } - case BCH_JSET_ENTRY_blacklist_v2: { - struct jset_entry_blacklist_v2 *bl_entry = - container_of(entry, struct jset_entry_blacklist_v2, entry); - - ret = __bch2_journal_seq_blacklist_read(j, i, - le64_to_cpu(bl_entry->start), - le64_to_cpu(bl_entry->end)); - break; - } - } + for (i = 0; i < BTREE_ID_NR; i++) { + struct btree_iter *iter; + struct btree *b; - if (ret) - break; + for_each_btree_node(&trans, iter, i, POS_MIN, + BTREE_ITER_PREFETCH, b) + if (test_bit(BCH_FS_STOPPING, &c->flags)) { + bch2_trans_exit(&trans); + return; + } + bch2_trans_iter_free(&trans, iter); } - return ret; -} - -/* - * After reading the journal and walking the btree, we might have new journal - * sequence numbers to blacklist - add entries to the next journal entry to be - * written: - */ -void bch2_journal_seq_blacklist_write(struct journal *j) -{ - struct journal_seq_blacklist *bl = j->new_blacklist; - struct jset_entry_blacklist_v2 *bl_entry; - struct jset_entry *entry; + ret = bch2_trans_exit(&trans); + if (ret) + return; + mutex_lock(&c->sb_lock); + bl = bch2_sb_get_journal_seq_blacklist(c->disk_sb.sb); if (!bl) - return; + goto out; - entry = bch2_journal_add_entry_noreservation(journal_cur_buf(j), - (sizeof(*bl_entry) - sizeof(*entry)) / sizeof(u64)); + nr = blacklist_nr_entries(bl); + dst = bl->start; - bl_entry = container_of(entry, struct jset_entry_blacklist_v2, entry); - bl_entry->entry.type = BCH_JSET_ENTRY_blacklist_v2; - bl_entry->start = cpu_to_le64(bl->start); - bl_entry->end = cpu_to_le64(bl->end); + t = c->journal_seq_blacklist_table; + BUG_ON(nr != t->nr); + + for (src = bl->start, i = eytzinger0_first(t->nr); + src < bl->start + nr; + src++, i = eytzinger0_next(i, nr)) { + 
BUG_ON(t->entries[i].start != le64_to_cpu(src->start)); + BUG_ON(t->entries[i].end != le64_to_cpu(src->end)); + + if (t->entries[i].dirty) + *dst++ = *src; + } - bch2_journal_pin_add(j, - journal_cur_seq(j), - &bl->pin, - journal_seq_blacklist_flush); + new_nr = dst - bl->start; - j->new_blacklist = NULL; + bch_info(c, "nr blacklist entries was %u, now %u", nr, new_nr); + + if (new_nr != nr) { + bl = bch2_sb_resize_journal_seq_blacklist(&c->disk_sb, + new_nr ? sb_blacklist_u64s(new_nr) : 0); + BUG_ON(new_nr && !bl); + + if (!new_nr) + c->disk_sb.sb->features[0] &= + ~(1ULL << BCH_FEATURE_JOURNAL_SEQ_BLACKLIST_V3); + + bch2_write_super(c); + } +out: + mutex_unlock(&c->sb_lock); } diff --git a/fs/bcachefs/journal_seq_blacklist.h b/fs/bcachefs/journal_seq_blacklist.h index 95ea6e90ba93..03f4b97247fd 100644 --- a/fs/bcachefs/journal_seq_blacklist.h +++ b/fs/bcachefs/journal_seq_blacklist.h @@ -1,13 +1,13 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _BCACHEFS_JOURNAL_SEQ_BLACKLIST_H #define _BCACHEFS_JOURNAL_SEQ_BLACKLIST_H -struct journal_replay; +bool bch2_journal_seq_is_blacklisted(struct bch_fs *, u64, bool); +int bch2_journal_seq_blacklist_add(struct bch_fs *c, u64, u64); +int bch2_blacklist_table_initialize(struct bch_fs *); -struct journal_seq_blacklist * -bch2_journal_seq_blacklist_find(struct journal *, u64); -int bch2_journal_seq_should_ignore(struct bch_fs *, u64, struct btree *); -int bch2_journal_seq_blacklist_read(struct journal *, - struct journal_replay *); -void bch2_journal_seq_blacklist_write(struct journal *); +extern const struct bch_sb_field_ops bch_sb_field_ops_journal_seq_blacklist; + +void bch2_blacklist_entries_gc(struct work_struct *); #endif /* _BCACHEFS_JOURNAL_SEQ_BLACKLIST_H */ diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h index a27e0548c098..8eea12a03c06 100644 --- a/fs/bcachefs/journal_types.h +++ b/fs/bcachefs/journal_types.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _BCACHEFS_JOURNAL_TYPES_H #define _BCACHEFS_JOURNAL_TYPES_H @@ -21,8 +22,11 @@ struct journal_buf { struct closure_waitlist wait; - unsigned size; - unsigned disk_sectors; + unsigned buf_size; /* size in bytes of @data */ + unsigned sectors; /* maximum size for current entry */ + unsigned disk_sectors; /* maximum size entry could have been, if + buf_size was bigger */ + unsigned u64s_reserved; /* bloom filter: */ unsigned long has_inode[1024 / sizeof(unsigned long)]; }; @@ -47,25 +51,7 @@ typedef void (*journal_pin_flush_fn)(struct journal *j, struct journal_entry_pin { struct list_head list; journal_pin_flush_fn flush; - struct journal_entry_pin_list *pin_list; -}; - -/* corresponds to a btree node with a blacklisted bset: */ -struct blacklisted_node { - __le64 seq; - enum btree_id btree_id; - struct bpos pos; -}; - -struct journal_seq_blacklist { - struct list_head list; - u64 start; - u64 end; - - struct journal_entry_pin pin; - - struct blacklisted_node *entries; - size_t nr_entries; + u64 seq; }; struct journal_res { @@ -76,6 +62,14 @@ struct journal_res { u64 seq; }; +/* + * For reserving space in the journal prior to getting a reservation on a + * particular journal entry: + */ +struct journal_preres { + unsigned u64s; +}; + union journal_res_state { struct { atomic64_t counter; @@ -94,6 +88,21 @@ union journal_res_state { }; }; +union journal_preres_state { + struct { + atomic64_t counter; + }; + + struct { + u64 v; + }; + + struct { + u32 reserved; + u32 remaining; + }; +}; + /* bytes: */ #define JOURNAL_ENTRY_SIZE_MIN (64U << 10) /* 64k 
*/ #define JOURNAL_ENTRY_SIZE_MAX (4U << 20) /* 4M */ @@ -117,6 +126,8 @@ enum { JOURNAL_REPLAY_DONE, JOURNAL_STARTED, JOURNAL_NEED_WRITE, + JOURNAL_NOT_EMPTY, + JOURNAL_MAY_GET_UNRESERVED, }; /* Embedded in struct bch_fs */ @@ -126,9 +137,22 @@ struct journal { unsigned long flags; union journal_res_state reservations; + + /* Max size of current journal entry */ unsigned cur_entry_u64s; - unsigned prev_buf_sectors; - unsigned cur_buf_sectors; + unsigned cur_entry_sectors; + + /* + * 0, or -ENOSPC if waiting on journal reclaim, or -EROFS if + * insufficient devices: + */ + int cur_entry_error; + + union journal_preres_state prereserved; + + /* Reserved space in journal entry to be used just prior to write */ + unsigned entry_u64s_reserved; + unsigned buf_size_want; /* @@ -139,9 +163,13 @@ struct journal { spinlock_t lock; + /* if nonzero, we may not open a new journal entry: */ + unsigned blocked; + /* Used when waiting because the journal was full */ wait_queue_head_t wait; struct closure_waitlist async_wait; + struct closure_waitlist preres_wait; struct closure io; struct delayed_work write_work; @@ -149,7 +177,8 @@ struct journal { /* Sequence number of most recent journal entry (last entry in @pin) */ atomic64_t seq; - /* last_seq from the most recent journal entry written */ + /* seq, last_seq from the most recent journal entry successfully written */ + u64 seq_ondisk; u64 last_seq_ondisk; /* @@ -172,21 +201,23 @@ struct journal { u64 front, back, size, mask; struct journal_entry_pin_list *data; } pin; - u64 replay_journal_seq; - struct mutex blacklist_lock; - struct list_head seq_blacklist; - struct journal_seq_blacklist *new_blacklist; + u64 replay_journal_seq; + u64 replay_journal_seq_end; - BKEY_PADDED(key); struct write_point wp; spinlock_t err_lock; struct delayed_work reclaim_work; + struct mutex reclaim_lock; unsigned long last_flushed; + struct journal_entry_pin *flush_in_progress; + wait_queue_head_t pin_flush_wait; + + /* protects advancing ja->discard_idx: */ + struct mutex discard_lock; + bool can_discard; - /* protects advancing ja->last_idx: */ - struct mutex reclaim_lock; unsigned write_delay_ms; unsigned reclaim_delay_ms; @@ -217,17 +248,15 @@ struct journal_device { unsigned sectors_free; - /* Journal bucket we're currently writing to */ - unsigned cur_idx; - - /* Last journal bucket that still contains an open journal entry */ - /* - * j->lock and j->reclaim_lock must both be held to modify, j->lock - * sufficient to read: + * discard_idx <= dirty_idx_ondisk <= dirty_idx <= cur_idx: */ - unsigned last_idx; + unsigned discard_idx; /* Next bucket to discard */ + unsigned dirty_idx_ondisk; + unsigned dirty_idx; + unsigned cur_idx; /* Journal bucket we're currently writing to */ unsigned nr; + u64 *buckets; /* Bio for journal reads/writes to this device */ @@ -237,4 +266,11 @@ struct journal_device { struct closure read; }; +/* + * journal_entry_res - reserve space in every journal entry: + */ +struct journal_entry_res { + unsigned u64s; +}; + #endif /* _BCACHEFS_JOURNAL_TYPES_H */ diff --git a/fs/bcachefs/keylist.c b/fs/bcachefs/keylist.c index bc724e771aea..5da54ced9cad 100644 --- a/fs/bcachefs/keylist.c +++ b/fs/bcachefs/keylist.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" #include "keylist.h" diff --git a/fs/bcachefs/keylist.h b/fs/bcachefs/keylist.h index 3106759e35f7..a7ff86b08abc 100644 --- a/fs/bcachefs/keylist.h +++ b/fs/bcachefs/keylist.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _BCACHEFS_KEYLIST_H #define 
_BCACHEFS_KEYLIST_H diff --git a/fs/bcachefs/keylist_types.h b/fs/bcachefs/keylist_types.h index 48a17d7af6d8..4b3ff7d8a875 100644 --- a/fs/bcachefs/keylist_types.h +++ b/fs/bcachefs/keylist_types.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _BCACHEFS_KEYLIST_TYPES_H #define _BCACHEFS_KEYLIST_TYPES_H diff --git a/fs/bcachefs/lz4.h b/fs/bcachefs/lz4.h deleted file mode 100644 index 22e7859c0576..000000000000 --- a/fs/bcachefs/lz4.h +++ /dev/null @@ -1,7 +0,0 @@ -#ifndef __BCH_LZ4_H__ -#define __BCH_LZ4_H__ - -int bch2_lz4_decompress(const unsigned char *src, size_t *src_len, - unsigned char *dest, size_t actual_dest_len); - -#endif diff --git a/fs/bcachefs/lz4_decompress.c b/fs/bcachefs/lz4_decompress.c deleted file mode 100644 index 9e809f972844..000000000000 --- a/fs/bcachefs/lz4_decompress.c +++ /dev/null @@ -1,277 +0,0 @@ -/* - * LZ4 Decompressor for Linux kernel - * - * Copyright (C) 2013, LG Electronics, Kyungsik Lee <kyungsik.lee@lge.com> - * - * Based on LZ4 implementation by Yann Collet. - * - * LZ4 - Fast LZ compression algorithm - * Copyright (C) 2011-2012, Yann Collet. - * BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are - * met: - * - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following disclaimer - * in the documentation and/or other materials provided with the - * distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- * - * You can contact the author at : - * - LZ4 homepage : http://fastcompression.blogspot.com/p/lz4.html - * - LZ4 source repository : http://code.google.com/p/lz4/ - */ - -#ifndef STATIC -#include <linux/module.h> -#include <linux/kernel.h> -#endif - -#include "lz4.h" - -/* - * Detects 64 bits mode - */ -#if defined(CONFIG_64BIT) -#define LZ4_ARCH64 1 -#else -#define LZ4_ARCH64 0 -#endif - -#include <asm/unaligned.h> -#include <linux/log2.h> -#include <linux/string.h> - -#define A32(_p) get_unaligned((u32 *) (_p)) -#define A16(_p) get_unaligned((u16 *) (_p)) - -#define GET_LE16_ADVANCE(_src) \ -({ \ - u16 _r = get_unaligned_le16(_src); \ - (_src) += 2; \ - _r; \ -}) - -#define PUT_LE16_ADVANCE(_dst, _v) \ -do { \ - put_unaligned_le16((_v), (_dst)); \ - (_dst) += 2; \ -} while (0) - -#define LENGTH_LONG 15 -#define COPYLENGTH 8 -#define ML_BITS 4 -#define ML_MASK ((1U << ML_BITS) - 1) -#define RUN_BITS (8 - ML_BITS) -#define RUN_MASK ((1U << RUN_BITS) - 1) -#define MEMORY_USAGE 14 -#define MINMATCH 4 -#define SKIPSTRENGTH 6 -#define LASTLITERALS 5 -#define MFLIMIT (COPYLENGTH + MINMATCH) -#define MINLENGTH (MFLIMIT + 1) -#define MAXD_LOG 16 -#define MAXD (1 << MAXD_LOG) -#define MAXD_MASK (u32)(MAXD - 1) -#define MAX_DISTANCE (MAXD - 1) -#define HASH_LOG (MAXD_LOG - 1) -#define HASHTABLESIZE (1 << HASH_LOG) -#define MAX_NB_ATTEMPTS 256 -#define OPTIMAL_ML (int)((ML_MASK-1)+MINMATCH) -#define LZ4_64KLIMIT ((1<<16) + (MFLIMIT - 1)) - -#define __HASH_VALUE(p, bits) \ - (((A32(p)) * 2654435761U) >> (32 - (bits))) - -#define HASH_VALUE(p) __HASH_VALUE(p, HASH_LOG) - -#define MEMCPY_ADVANCE(_dst, _src, length) \ -do { \ - typeof(length) _length = (length); \ - memcpy(_dst, _src, _length); \ - _src += _length; \ - _dst += _length; \ -} while (0) - -#define MEMCPY_ADVANCE_BYTES(_dst, _src, _length) \ -do { \ - const u8 *_end = (_src) + (_length); \ - while ((_src) < _end) \ - *_dst++ = *_src++; \ -} while (0) - -#define STEPSIZE __SIZEOF_LONG__ - -#define LZ4_COPYPACKET(_src, _dst) \ -do { \ - MEMCPY_ADVANCE(_dst, _src, STEPSIZE); \ - MEMCPY_ADVANCE(_dst, _src, COPYLENGTH - STEPSIZE);\ -} while (0) - -/* - * Equivalent to MEMCPY_ADVANCE - except may overrun @_dst and @_src by - * COPYLENGTH: - * - * Note: src and dst may overlap (with src < dst) - we must do the copy in - * STEPSIZE chunks for correctness - * - * Note also: length may be negative - we must not call memcpy if length is - * negative, but still adjust dst and src by length - */ -#define MEMCPY_ADVANCE_CHUNKED(_dst, _src, _length) \ -do { \ - u8 *_end = (_dst) + (_length); \ - while ((_dst) < _end) \ - LZ4_COPYPACKET(_src, _dst); \ - _src -= (_dst) - _end; \ - _dst = _end; \ -} while (0) - -#define MEMCPY_ADVANCE_CHUNKED_NOFIXUP(_dst, _src, _end)\ -do { \ - while ((_dst) < (_end)) \ - LZ4_COPYPACKET((_src), (_dst)); \ -} while (0) - -static const int dec32table[8] = {0, 3, 2, 3, 0, 0, 0, 0}; -#if LZ4_ARCH64 -static const int dec64table[8] = {0, 0, 0, -1, 0, 1, 2, 3}; -#else -static const int dec64table[8] = {0, 0, 0, 0, 0, 0, 0, 0}; -#endif - -static inline size_t get_length(const u8 **ip, size_t length) -{ - if (length == LENGTH_LONG) { - size_t len; - - do { - length += (len = *(*ip)++); - } while (len == 255); - } - - return length; -} - -static int lz4_uncompress(const u8 *source, u8 *dest, int osize) -{ - const u8 *ip = source; - const u8 *ref; - u8 *op = dest; - u8 * const oend = op + osize; - u8 *cpy; - unsigned token, offset; - ssize_t length; - - while (1) { - /* get runlength */ - token = *ip++; - length = get_length(&ip, 
token >> ML_BITS); - - /* copy literals */ - if (unlikely(op + length > oend - COPYLENGTH)) { - /* - * Error: not enough place for another match - * (min 4) + 5 literals - */ - if (op + length != oend) - goto _output_error; - - MEMCPY_ADVANCE(op, ip, length); - break; /* EOF */ - } - MEMCPY_ADVANCE_CHUNKED(op, ip, length); - - /* get match offset */ - offset = GET_LE16_ADVANCE(ip); - ref = op - offset; - - /* Error: offset create reference outside destination buffer */ - if (unlikely(ref < (u8 *const) dest)) - goto _output_error; - - /* get match length */ - length = get_length(&ip, token & ML_MASK); - length += MINMATCH; - - /* copy first STEPSIZE bytes of match: */ - if (unlikely(offset < STEPSIZE)) { - MEMCPY_ADVANCE_BYTES(op, ref, 4); - ref -= dec32table[offset]; - - memcpy(op, ref, 4); - op += STEPSIZE - 4; - ref -= dec64table[offset]; - } else { - MEMCPY_ADVANCE(op, ref, STEPSIZE); - } - length -= STEPSIZE; - /* - * Note - length could have been < STEPSIZE; that's ok, length - * will now be negative and we'll just end up rewinding op: - */ - - /* copy rest of match: */ - cpy = op + length; - if (cpy > oend - COPYLENGTH) { - /* Error: request to write beyond destination buffer */ - if (cpy > oend || - ref + COPYLENGTH > oend) - goto _output_error; -#if !LZ4_ARCH64 - if (op + COPYLENGTH > oend) - goto _output_error; -#endif - MEMCPY_ADVANCE_CHUNKED_NOFIXUP(op, ref, oend - COPYLENGTH); - /* op could be > cpy here */ - while (op < cpy) - *op++ = *ref++; - op = cpy; - /* - * Check EOF (should never happen, since last 5 bytes - * are supposed to be literals) - */ - if (op == oend) - goto _output_error; - } else { - MEMCPY_ADVANCE_CHUNKED(op, ref, length); - } - } - /* end of decoding */ - return ip - source; - - /* write overflow error detected */ -_output_error: - return -1; -} - -int bch2_lz4_decompress(const unsigned char *src, size_t *src_len, - unsigned char *dest, size_t actual_dest_len) -{ - int ret = -1; - int input_len = 0; - - input_len = lz4_uncompress(src, dest, actual_dest_len); - if (input_len < 0) - goto exit_0; - *src_len = input_len; - - return 0; -exit_0: - return ret; -} diff --git a/fs/bcachefs/migrate.c b/fs/bcachefs/migrate.c index ea519102a228..ad41f5e36a7c 100644 --- a/fs/bcachefs/migrate.c +++ b/fs/bcachefs/migrate.c @@ -1,9 +1,11 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Code for moving data off a device. */ #include "bcachefs.h" #include "btree_update.h" +#include "btree_update_interior.h" #include "buckets.h" #include "extents.h" #include "io.h" @@ -14,7 +16,7 @@ #include "replicas.h" #include "super-io.h" -static int drop_dev_ptrs(struct bch_fs *c, struct bkey_s_extent e, +static int drop_dev_ptrs(struct bch_fs *c, struct bkey_s k, unsigned dev_idx, int flags, bool metadata) { unsigned replicas = metadata ? c->opts.metadata_replicas : c->opts.data_replicas; @@ -22,9 +24,9 @@ static int drop_dev_ptrs(struct bch_fs *c, struct bkey_s_extent e, unsigned degraded = metadata ? 
BCH_FORCE_IF_METADATA_DEGRADED : BCH_FORCE_IF_DATA_DEGRADED; unsigned nr_good; - bch2_extent_drop_device(e, dev_idx); + bch2_bkey_drop_device(k, dev_idx); - nr_good = bch2_extent_durability(c, e.c); + nr_good = bch2_bkey_durability(c, k.s_c); if ((!nr_good && !(flags & lost)) || (nr_good < replicas && !(flags & degraded))) return -EINVAL; @@ -34,54 +36,50 @@ static int drop_dev_ptrs(struct bch_fs *c, struct bkey_s_extent e, static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags) { + struct btree_trans trans; + struct btree_iter *iter; struct bkey_s_c k; - struct bkey_s_extent e; BKEY_PADDED(key) tmp; - struct btree_iter iter; int ret = 0; - mutex_lock(&c->replicas_gc_lock); - bch2_replicas_gc_start(c, (1 << BCH_DATA_USER)|(1 << BCH_DATA_CACHED)); + bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); - bch2_btree_iter_init(&iter, c, BTREE_ID_EXTENTS, - POS_MIN, BTREE_ITER_PREFETCH); + iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, + POS_MIN, BTREE_ITER_PREFETCH); - while ((k = bch2_btree_iter_peek(&iter)).k && - !(ret = btree_iter_err(k))) { + while ((k = bch2_btree_iter_peek(iter)).k && + !(ret = bkey_err(k))) { if (!bkey_extent_is_data(k.k) || !bch2_extent_has_device(bkey_s_c_to_extent(k), dev_idx)) { - ret = bch2_mark_bkey_replicas(c, BCH_DATA_USER, k); + ret = bch2_mark_bkey_replicas(c, k); if (ret) break; - bch2_btree_iter_next(&iter); + bch2_btree_iter_next(iter); continue; } bkey_reassemble(&tmp.key, k); - e = bkey_i_to_s_extent(&tmp.key); - ret = drop_dev_ptrs(c, e, dev_idx, flags, false); + ret = drop_dev_ptrs(c, bkey_i_to_s(&tmp.key), + dev_idx, flags, false); if (ret) break; /* * If the new extent no longer has any pointers, bch2_extent_normalize() * will do the appropriate thing with it (turning it into a - * KEY_TYPE_ERROR key, or just a discard if it was a cached extent) + * KEY_TYPE_error key, or just a discard if it was a cached extent) */ - bch2_extent_normalize(c, e.s); + bch2_extent_normalize(c, bkey_i_to_s(&tmp.key)); - ret = bch2_mark_bkey_replicas(c, BCH_DATA_USER, - bkey_i_to_s_c(&tmp.key)); - if (ret) - break; + /* XXX not sketchy at all */ + iter->pos = bkey_start_pos(&tmp.key.k); - iter.pos = bkey_start_pos(&tmp.key.k); + bch2_trans_update(&trans, BTREE_INSERT_ENTRY(iter, &tmp.key)); - ret = bch2_btree_insert_at(c, NULL, NULL, NULL, - BTREE_INSERT_ATOMIC| - BTREE_INSERT_NOFAIL, - BTREE_INSERT_ENTRY(&iter, &tmp.key)); + ret = bch2_trans_commit(&trans, NULL, NULL, + BTREE_INSERT_ATOMIC| + BTREE_INSERT_NOFAIL); /* * don't want to leave ret == -EINTR, since if we raced and @@ -94,17 +92,17 @@ static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags) break; } - bch2_btree_iter_unlock(&iter); + ret = bch2_trans_exit(&trans) ?: ret; - bch2_replicas_gc_end(c, ret); - mutex_unlock(&c->replicas_gc_lock); + BUG_ON(ret == -EINTR); return ret; } static int bch2_dev_metadata_drop(struct bch_fs *c, unsigned dev_idx, int flags) { - struct btree_iter iter; + struct btree_trans trans; + struct btree_iter *iter; struct closure cl; struct btree *b; unsigned id; @@ -114,63 +112,71 @@ static int bch2_dev_metadata_drop(struct bch_fs *c, unsigned dev_idx, int flags) if (flags & BCH_FORCE_IF_METADATA_LOST) return -EINVAL; + bch2_trans_init(&trans, c, 0, 0); closure_init_stack(&cl); - mutex_lock(&c->replicas_gc_lock); - bch2_replicas_gc_start(c, 1 << BCH_DATA_BTREE); - for (id = 0; id < BTREE_ID_NR; id++) { - for_each_btree_node(&iter, c, id, POS_MIN, BTREE_ITER_PREFETCH, b) { + for_each_btree_node(&trans, iter, id, POS_MIN, + 
BTREE_ITER_PREFETCH, b) { __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX) tmp; - struct bkey_i_extent *new_key; + struct bkey_i_btree_ptr *new_key; retry: - if (!bch2_extent_has_device(bkey_i_to_s_c_extent(&b->key), - dev_idx)) { - bch2_btree_iter_set_locks_want(&iter, 0); - - ret = bch2_mark_bkey_replicas(c, BCH_DATA_BTREE, - bkey_i_to_s_c(&b->key)); + if (!bch2_bkey_has_device(bkey_i_to_s_c(&b->key), + dev_idx)) { + /* + * we might have found a btree node key we + * needed to update, and then tried to update it + * but got -EINTR after upgrading the iter, but + * then raced and the node is now gone: + */ + bch2_btree_iter_downgrade(iter); + + ret = bch2_mark_bkey_replicas(c, bkey_i_to_s_c(&b->key)); if (ret) goto err; } else { bkey_copy(&tmp.k, &b->key); - new_key = bkey_i_to_extent(&tmp.k); + new_key = bkey_i_to_btree_ptr(&tmp.k); - ret = drop_dev_ptrs(c, extent_i_to_s(new_key), + ret = drop_dev_ptrs(c, bkey_i_to_s(&new_key->k_i), dev_idx, flags, true); if (ret) goto err; - if (!bch2_btree_iter_set_locks_want(&iter, U8_MAX)) { - b = bch2_btree_iter_peek_node(&iter); - goto retry; - } - - ret = bch2_btree_node_update_key(c, &iter, b, new_key); + ret = bch2_btree_node_update_key(c, iter, b, new_key); if (ret == -EINTR) { - b = bch2_btree_iter_peek_node(&iter); + b = bch2_btree_iter_peek_node(iter); goto retry; } if (ret) goto err; } } - bch2_btree_iter_unlock(&iter); + bch2_trans_iter_free(&trans, iter); + } + + /* flush relevant btree updates */ + while (1) { + closure_wait_event(&c->btree_interior_update_wait, + !bch2_btree_interior_updates_nr_pending(c) || + c->btree_roots_dirty); + if (!bch2_btree_interior_updates_nr_pending(c)) + break; + bch2_journal_meta(&c->journal); } ret = 0; -out: - bch2_replicas_gc_end(c, ret); - mutex_unlock(&c->replicas_gc_lock); +err: + ret = bch2_trans_exit(&trans) ?: ret; + + BUG_ON(ret == -EINTR); return ret; -err: - bch2_btree_iter_unlock(&iter); - goto out; } int bch2_dev_data_drop(struct bch_fs *c, unsigned dev_idx, int flags) { return bch2_dev_usrdata_drop(c, dev_idx, flags) ?: - bch2_dev_metadata_drop(c, dev_idx, flags); + bch2_dev_metadata_drop(c, dev_idx, flags) ?: + bch2_replicas_gc2(c); } diff --git a/fs/bcachefs/migrate.h b/fs/bcachefs/migrate.h index de2faab24e11..027efaa0d575 100644 --- a/fs/bcachefs/migrate.h +++ b/fs/bcachefs/migrate.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _BCACHEFS_MIGRATE_H #define _BCACHEFS_MIGRATE_H diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index 3e52b7a26c7f..d0bb6ab31022 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -1,8 +1,12 @@ +// SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" +#include "alloc_foreground.h" #include "btree_gc.h" #include "btree_update.h" +#include "btree_update_interior.h" #include "buckets.h" +#include "disk_groups.h" #include "inode.h" #include "io.h" #include "journal_reclaim.h" @@ -51,30 +55,32 @@ struct moving_context { static int bch2_migrate_index_update(struct bch_write_op *op) { struct bch_fs *c = op->c; + struct btree_trans trans; + struct btree_iter *iter; struct migrate_write *m = container_of(op, struct migrate_write, op); struct keylist *keys = &op->insert_keys; - struct btree_iter iter; int ret = 0; - bch2_btree_iter_init(&iter, c, BTREE_ID_EXTENTS, - bkey_start_pos(&bch2_keylist_front(keys)->k), - BTREE_ITER_SLOTS|BTREE_ITER_INTENT); + bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); + + iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, + bkey_start_pos(&bch2_keylist_front(keys)->k), + BTREE_ITER_SLOTS|BTREE_ITER_INTENT); 
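The conversion just above — bch2_btree_iter_init() replaced by bch2_trans_init() plus bch2_trans_get_iter() — is the dominant pattern of this merge: iterators are now allocated from a btree_trans, updates are queued with bch2_trans_update(), and the whole transaction is committed with bch2_trans_commit(). The following is a minimal sketch of the new calling convention, not part of the commit; it uses only interfaces visible in this diff, and example_update() is a hypothetical helper.

static int example_update(struct bch_fs *c, struct bkey_i *new)
{
	struct btree_trans trans;
	struct btree_iter *iter;
	struct bkey_s_c k;
	int ret;

	bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);

	/* iterators are allocated from, and owned by, the transaction: */
	iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS,
				   bkey_start_pos(&new->k),
				   BTREE_ITER_SLOTS|BTREE_ITER_INTENT);

	k = bch2_btree_iter_peek_slot(iter);
	ret = bkey_err(k);	/* bkey_err() replaces btree_iter_err() */
	if (ret)
		goto out;

	/* queue the update, then commit the transaction as a unit: */
	bch2_trans_update(&trans, BTREE_INSERT_ENTRY(iter, new));
	ret = bch2_trans_commit(&trans, NULL, NULL,
				BTREE_INSERT_ATOMIC|BTREE_INSERT_NOFAIL);
out:
	/* bch2_trans_exit() frees the trans's iterators; it can also fail: */
	return bch2_trans_exit(&trans) ?: ret;
}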
while (1) { - struct bkey_s_c k = bch2_btree_iter_peek_slot(&iter); + struct bkey_s_c k = bch2_btree_iter_peek_slot(iter); struct bkey_i_extent *insert, *new = bkey_i_to_extent(bch2_keylist_front(keys)); BKEY_PADDED(k) _new, _insert; - struct bch_extent_ptr *ptr; - struct bch_extent_crc_unpacked crc; + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; bool did_work = false; int nr; - if (btree_iter_err(k)) { - ret = bch2_btree_iter_unlock(&iter); + ret = bkey_err(k); + if (ret) break; - } if (bversion_cmp(k.k->version, new->k.version) || !bkey_extent_is_data(k.k) || @@ -93,19 +99,16 @@ static int bch2_migrate_index_update(struct bch_write_op *op) bkey_copy(&_new.k, bch2_keylist_front(keys)); new = bkey_i_to_extent(&_new.k); - bch2_cut_front(iter.pos, &insert->k_i); + bch2_cut_front(iter->pos, &insert->k_i); bch2_cut_back(new->k.p, &insert->k); bch2_cut_back(insert->k.p, &new->k); - if (m->data_cmd == DATA_REWRITE) { - ptr = (struct bch_extent_ptr *) - bch2_extent_has_device(extent_i_to_s_c(insert), - m->data_opts.rewrite_dev); - bch2_extent_drop_ptr(extent_i_to_s(insert), ptr); - } + if (m->data_cmd == DATA_REWRITE) + bch2_bkey_drop_device(extent_i_to_s(insert).s, + m->data_opts.rewrite_dev); - extent_for_each_ptr_crc(extent_i_to_s(new), ptr, crc) { - if (bch2_extent_has_device(extent_i_to_s_c(insert), ptr->dev)) { + extent_for_each_ptr_decode(extent_i_to_s(new), p, entry) { + if (bch2_extent_has_device(extent_i_to_s_c(insert), p.ptr.dev)) { /* * raced with another move op? extent already * has a pointer to the device we just wrote @@ -114,8 +117,7 @@ static int bch2_migrate_index_update(struct bch_write_op *op) continue; } - bch2_extent_crc_append(insert, crc); - extent_ptr_append(insert, *ptr); + bch2_extent_ptr_decoded_append(insert, &p); did_work = true; } @@ -130,19 +132,15 @@ static int bch2_migrate_index_update(struct bch_write_op *op) op->opts.data_replicas); /* - * It's possible we race, and for whatever reason the extent now - * has fewer replicas than when we last looked at it - meaning - * we need to get a disk reservation here: + * If we're not fully overwriting @k, and it's compressed, we + * need a reservation for all the pointers in @insert */ - nr = bch2_extent_nr_dirty_ptrs(bkey_i_to_s_c(&insert->k_i)) - - (bch2_extent_nr_dirty_ptrs(k) + m->nr_ptrs_reserved); - if (nr > 0) { - /* - * can't call bch2_disk_reservation_add() with btree - * locks held, at least not without a song and dance - */ - bch2_btree_iter_unlock(&iter); + nr = bch2_bkey_nr_dirty_ptrs(bkey_i_to_s_c(&insert->k_i)) - + m->nr_ptrs_reserved; + if (insert->k.size < k.k->size && + bch2_extent_is_compressed(k) && + nr > 0) { ret = bch2_disk_reservation_add(c, &op->res, keylist_sectors(keys) * nr, 0); if (ret) @@ -152,18 +150,15 @@ static int bch2_migrate_index_update(struct bch_write_op *op) goto next; } - ret = bch2_mark_bkey_replicas(c, BCH_DATA_USER, - extent_i_to_s_c(insert).s_c); - if (ret) - break; + bch2_trans_update(&trans, + BTREE_INSERT_ENTRY(iter, &insert->k_i)); - ret = bch2_btree_insert_at(c, &op->res, - NULL, op_journal_seq(op), + ret = bch2_trans_commit(&trans, &op->res, + op_journal_seq(op), BTREE_INSERT_ATOMIC| BTREE_INSERT_NOFAIL| BTREE_INSERT_USE_RESERVE| - m->data_opts.btree_insert_flags, - BTREE_INSERT_ENTRY(&iter, &insert->k_i)); + m->data_opts.btree_insert_flags); if (!ret) atomic_long_inc(&c->extent_migrate_done); if (ret == -EINTR) @@ -171,25 +166,26 @@ static int bch2_migrate_index_update(struct bch_write_op *op) if (ret) break; next: - while (bkey_cmp(iter.pos, 
bch2_keylist_front(keys)->k.p) >= 0) { + while (bkey_cmp(iter->pos, bch2_keylist_front(keys)->k.p) >= 0) { bch2_keylist_pop_front(keys); if (bch2_keylist_empty(keys)) goto out; } - bch2_cut_front(iter.pos, bch2_keylist_front(keys)); + bch2_cut_front(iter->pos, bch2_keylist_front(keys)); continue; nomatch: if (m->ctxt) - atomic64_add(k.k->p.offset - iter.pos.offset, + atomic64_add(k.k->p.offset - iter->pos.offset, &m->ctxt->stats->sectors_raced); atomic_long_inc(&c->extent_migrate_raced); trace_move_race(&new->k); - bch2_btree_iter_next_slot(&iter); + bch2_btree_iter_next_slot(iter); goto next; } out: - bch2_btree_iter_unlock(&iter); + bch2_trans_exit(&trans); + BUG_ON(ret == -EINTR); return ret; } @@ -241,8 +237,7 @@ int bch2_migrate_write_init(struct bch_fs *c, struct migrate_write *m, m->op.flags |= BCH_WRITE_ONLY_SPECIFIED_DEVS| BCH_WRITE_PAGES_STABLE| BCH_WRITE_PAGES_OWNED| - BCH_WRITE_DATA_ENCODED| - BCH_WRITE_NOMARK_REPLICAS; + BCH_WRITE_DATA_ENCODED; m->op.nr_replicas = 1; m->op.nr_replicas_required = 1; @@ -250,8 +245,16 @@ int bch2_migrate_write_init(struct bch_fs *c, struct migrate_write *m, switch (data_cmd) { case DATA_ADD_REPLICAS: { + /* + * DATA_ADD_REPLICAS is used for moving data to a different + * device in the background, and due to compression the new copy + * might take up more space than the old copy: + */ +#if 0 int nr = (int) io_opts.data_replicas - - bch2_extent_nr_dirty_ptrs(k); + bch2_bkey_nr_dirty_ptrs(k); +#endif + int nr = (int) io_opts.data_replicas; if (nr > 0) { m->op.nr_replicas = m->nr_ptrs_reserved = nr; @@ -263,8 +266,26 @@ int bch2_migrate_write_init(struct bch_fs *c, struct migrate_write *m, } break; } - case DATA_REWRITE: + case DATA_REWRITE: { + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; + unsigned compressed_sectors = 0; + + extent_for_each_ptr_decode(bkey_s_c_to_extent(k), p, entry) + if (!p.ptr.cached && + p.crc.compression_type != BCH_COMPRESSION_NONE && + bch2_dev_in_target(c, p.ptr.dev, data_opts.target)) + compressed_sectors += p.crc.compressed_size; + + if (compressed_sectors) { + ret = bch2_disk_reservation_add(c, &m->op.res, + compressed_sectors, + BCH_DISK_RESERVATION_NOFAIL); + if (ret) + return ret; + } break; + } case DATA_PROMOTE: m->op.flags |= BCH_WRITE_ALLOC_NOWAIT; m->op.flags |= BCH_WRITE_CACHED; @@ -280,12 +301,13 @@ static void move_free(struct closure *cl) { struct moving_io *io = container_of(cl, struct moving_io, cl); struct moving_context *ctxt = io->write.ctxt; + struct bvec_iter_all iter; struct bio_vec *bv; int i; bch2_disk_reservation_put(io->write.op.c, &io->write.op.res); - bio_for_each_segment_all(bv, &io->write.op.wbio.bio, i) + bio_for_each_segment_all(bv, &io->write.op.wbio.bio, i, iter) if (bv->bv_page) __free_page(bv->bv_page); @@ -378,8 +400,8 @@ static int bch2_move_extent(struct bch_fs *c, struct data_opts data_opts) { struct moving_io *io; - const struct bch_extent_ptr *ptr; - struct bch_extent_crc_unpacked crc; + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; unsigned sectors = e.k->size, pages; int ret = -ENOMEM; @@ -392,8 +414,8 @@ static int bch2_move_extent(struct bch_fs *c, SECTORS_IN_FLIGHT_PER_DEVICE); /* write path might have to decompress data: */ - extent_for_each_ptr_crc(e, ptr, crc) - sectors = max_t(unsigned, sectors, crc.uncompressed_size); + extent_for_each_ptr_decode(e, p, entry) + sectors = max_t(unsigned, sectors, p.crc.uncompressed_size); pages = DIV_ROUND_UP(sectors, PAGE_SECTORS); io = kzalloc(sizeof(struct moving_io) + @@ -467,35 +489,61 @@ 
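Before the bch2_move_data() hunk: throughout this file the move predicates also change shape, dropping the (enum bkey_type, struct bkey_s_c_extent) argument pair for a plain struct bkey_s_c, so a single callback can be handed btree-pointer and extent keys alike (see the move.h, movinggc.c and rebalance.c hunks further down). A hedged sketch of the new callback contract follows — example_pred() is illustrative only, assembled from names that appear in this diff.

static enum data_cmd example_pred(struct bch_fs *c, void *arg,
				  struct bkey_s_c k,
				  struct bch_io_opts *io_opts,
				  struct data_opts *data_opts)
{
	/* keys are no longer pre-filtered to extents, so check the type: */
	if (k.k->type != KEY_TYPE_extent)
		return DATA_SKIP;

	data_opts->target = 0;
	data_opts->btree_insert_flags = 0;
	return DATA_ADD_REPLICAS;
}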
int bch2_move_data(struct bch_fs *c, struct moving_context ctxt = { .stats = stats }; struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts); BKEY_PADDED(k) tmp; + struct btree_trans trans; + struct btree_iter *iter; struct bkey_s_c k; - struct bkey_s_c_extent e; struct data_opts data_opts; enum data_cmd data_cmd; - u64 cur_inum = U64_MAX; + u64 delay, cur_inum = U64_MAX; int ret = 0, ret2; closure_init_stack(&ctxt.cl); INIT_LIST_HEAD(&ctxt.reads); init_waitqueue_head(&ctxt.wait); + bch2_trans_init(&trans, c, 0, 0); + stats->data_type = BCH_DATA_USER; - bch2_btree_iter_init(&stats->iter, c, BTREE_ID_EXTENTS, start, - BTREE_ITER_PREFETCH); + stats->btree_id = BTREE_ID_EXTENTS; + stats->pos = POS_MIN; + + iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, start, + BTREE_ITER_PREFETCH); if (rate) bch2_ratelimit_reset(rate); - while (!kthread || !(ret = kthread_should_stop())) { - if (rate && - bch2_ratelimit_delay(rate) && - (bch2_btree_iter_unlock(&stats->iter), - (ret = bch2_ratelimit_wait_freezable_stoppable(rate)))) - break; + while (1) { + do { + delay = rate ? bch2_ratelimit_delay(rate) : 0; + + if (delay) { + bch2_trans_unlock(&trans); + set_current_state(TASK_INTERRUPTIBLE); + } + + if (kthread && (ret = kthread_should_stop())) { + __set_current_state(TASK_RUNNING); + goto out; + } + + if (delay) + schedule_timeout(delay); + + if (unlikely(freezing(current))) { + bch2_trans_unlock(&trans); + move_ctxt_wait_event(&ctxt, list_empty(&ctxt.reads)); + try_to_freeze(); + } + } while (delay); peek: - k = bch2_btree_iter_peek(&stats->iter); + k = bch2_btree_iter_peek(iter); + + stats->pos = iter->pos; + if (!k.k) break; - ret = btree_iter_err(k); + ret = bkey_err(k); if (ret) break; if (bkey_cmp(bkey_start_pos(k.k), end) >= 0) @@ -504,13 +552,11 @@ peek: if (!bkey_extent_is_data(k.k)) goto next_nondata; - e = bkey_s_c_to_extent(k); - if (cur_inum != k.k->p.inode) { struct bch_inode_unpacked inode; /* don't hold btree locks while looking up inode: */ - bch2_btree_iter_unlock(&stats->iter); + bch2_trans_unlock(&trans); io_opts = bch2_opts_to_inode_opts(c->opts); if (!bch2_inode_find_by_inum(c, k.k->p.inode, &inode)) @@ -519,8 +565,7 @@ peek: goto peek; } - switch ((data_cmd = pred(c, arg, BKEY_TYPE_EXTENTS, e, - &io_opts, &data_opts))) { + switch ((data_cmd = pred(c, arg, k, &io_opts, &data_opts))) { case DATA_SKIP: goto next; case DATA_SCRUB: @@ -536,7 +581,7 @@ peek: /* unlock before doing IO: */ bkey_reassemble(&tmp.k, k); k = bkey_i_to_s_c(&tmp.k); - bch2_btree_iter_unlock(&stats->iter); + bch2_trans_unlock(&trans); ret2 = bch2_move_extent(c, &ctxt, wp, io_opts, bkey_s_c_to_extent(k), @@ -555,14 +600,14 @@ peek: if (rate) bch2_ratelimit_increment(rate, k.k->size); next: - atomic64_add(k.k->size * bch2_extent_nr_dirty_ptrs(k), + atomic64_add(k.k->size * bch2_bkey_nr_dirty_ptrs(k), &stats->sectors_seen); next_nondata: - bch2_btree_iter_next(&stats->iter); - bch2_btree_iter_cond_resched(&stats->iter); + bch2_btree_iter_next(iter); + bch2_trans_cond_resched(&trans); } - - bch2_btree_iter_unlock(&stats->iter); +out: + bch2_trans_exit(&trans); move_ctxt_wait_event(&ctxt, list_empty(&ctxt.reads)); closure_sync(&ctxt.cl); @@ -576,76 +621,34 @@ next_nondata: return ret; } -static int bch2_gc_data_replicas(struct bch_fs *c) -{ - struct btree_iter iter; - struct bkey_s_c k; - int ret; - - mutex_lock(&c->replicas_gc_lock); - bch2_replicas_gc_start(c, (1 << BCH_DATA_USER)|(1 << BCH_DATA_CACHED)); - - for_each_btree_key(&iter, c, BTREE_ID_EXTENTS, POS_MIN, - BTREE_ITER_PREFETCH, k) { - ret = 
bch2_mark_bkey_replicas(c, BCH_DATA_USER, k); - if (ret) - break; - } - ret = bch2_btree_iter_unlock(&iter) ?: ret; - - bch2_replicas_gc_end(c, ret); - mutex_unlock(&c->replicas_gc_lock); - - return ret; -} - -static int bch2_gc_btree_replicas(struct bch_fs *c) -{ - struct btree_iter iter; - struct btree *b; - unsigned id; - int ret = 0; - - mutex_lock(&c->replicas_gc_lock); - bch2_replicas_gc_start(c, 1 << BCH_DATA_BTREE); - - for (id = 0; id < BTREE_ID_NR; id++) { - for_each_btree_node(&iter, c, id, POS_MIN, BTREE_ITER_PREFETCH, b) { - ret = bch2_mark_bkey_replicas(c, BCH_DATA_BTREE, - bkey_i_to_s_c(&b->key)); - - bch2_btree_iter_cond_resched(&iter); - } - - ret = bch2_btree_iter_unlock(&iter) ?: ret; - } - - bch2_replicas_gc_end(c, ret); - mutex_unlock(&c->replicas_gc_lock); - - return ret; -} - static int bch2_move_btree(struct bch_fs *c, move_pred_fn pred, void *arg, struct bch_move_stats *stats) { struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts); + struct btree_trans trans; + struct btree_iter *iter; struct btree *b; unsigned id; struct data_opts data_opts; enum data_cmd cmd; int ret = 0; + bch2_trans_init(&trans, c, 0, 0); + stats->data_type = BCH_DATA_BTREE; for (id = 0; id < BTREE_ID_NR; id++) { - for_each_btree_node(&stats->iter, c, id, POS_MIN, BTREE_ITER_PREFETCH, b) { - switch ((cmd = pred(c, arg, BKEY_TYPE_BTREE, - bkey_i_to_s_c_extent(&b->key), - &io_opts, - &data_opts))) { + stats->btree_id = id; + + for_each_btree_node(&trans, iter, id, POS_MIN, + BTREE_ITER_PREFETCH, b) { + stats->pos = iter->pos; + + switch ((cmd = pred(c, arg, + bkey_i_to_s_c(&b->key), + &io_opts, &data_opts))) { case DATA_SKIP: goto next; case DATA_SCRUB: @@ -657,22 +660,23 @@ static int bch2_move_btree(struct bch_fs *c, BUG(); } - ret = bch2_btree_node_rewrite(c, &stats->iter, + ret = bch2_btree_node_rewrite(c, iter, b->data->keys.seq, 0) ?: ret; next: - bch2_btree_iter_cond_resched(&stats->iter); + bch2_trans_cond_resched(&trans); } - ret = bch2_btree_iter_unlock(&stats->iter) ?: ret; + ret = bch2_trans_iter_free(&trans, iter) ?: ret; } + bch2_trans_exit(&trans); + return ret; } #if 0 static enum data_cmd scrub_pred(struct bch_fs *c, void *arg, - enum bkey_type type, - struct bkey_s_c_extent e, + struct bkey_s_c k, struct bch_io_opts *io_opts, struct data_opts *data_opts) { @@ -681,33 +685,38 @@ static enum data_cmd scrub_pred(struct bch_fs *c, void *arg, #endif static enum data_cmd rereplicate_pred(struct bch_fs *c, void *arg, - enum bkey_type type, - struct bkey_s_c_extent e, + struct bkey_s_c k, struct bch_io_opts *io_opts, struct data_opts *data_opts) { - unsigned nr_good = bch2_extent_durability(c, e); - unsigned replicas = type == BKEY_TYPE_BTREE - ? 
c->opts.metadata_replicas - : io_opts->data_replicas; + unsigned nr_good = bch2_bkey_durability(c, k); + unsigned replicas = 0; + + switch (k.k->type) { + case KEY_TYPE_btree_ptr: + replicas = c->opts.metadata_replicas; + break; + case KEY_TYPE_extent: + replicas = io_opts->data_replicas; + break; + } if (!nr_good || nr_good >= replicas) return DATA_SKIP; data_opts->target = 0; - data_opts->btree_insert_flags = 0; + data_opts->btree_insert_flags = 0; return DATA_ADD_REPLICAS; } static enum data_cmd migrate_pred(struct bch_fs *c, void *arg, - enum bkey_type type, - struct bkey_s_c_extent e, + struct bkey_s_c k, struct bch_io_opts *io_opts, struct data_opts *data_opts) { struct bch_ioctl_data *op = arg; - if (!bch2_extent_has_device(e, op->migrate.dev)) + if (!bch2_bkey_has_device(k, op->migrate.dev)) return DATA_SKIP; data_opts->target = 0; @@ -728,14 +737,24 @@ int bch2_data_job(struct bch_fs *c, ret = bch2_journal_flush_device_pins(&c->journal, -1); ret = bch2_move_btree(c, rereplicate_pred, c, stats) ?: ret; - ret = bch2_gc_btree_replicas(c) ?: ret; + + while (1) { + closure_wait_event(&c->btree_interior_update_wait, + !bch2_btree_interior_updates_nr_pending(c) || + c->btree_roots_dirty); + if (!bch2_btree_interior_updates_nr_pending(c)) + break; + bch2_journal_meta(&c->journal); + } + + ret = bch2_replicas_gc2(c) ?: ret; ret = bch2_move_data(c, NULL, writepoint_hashed((unsigned long) current), op.start, op.end, rereplicate_pred, c, stats) ?: ret; - ret = bch2_gc_data_replicas(c) ?: ret; + ret = bch2_replicas_gc2(c) ?: ret; break; case BCH_DATA_OP_MIGRATE: if (op.migrate.dev >= c->sb.nr_devices) @@ -745,14 +764,14 @@ int bch2_data_job(struct bch_fs *c, ret = bch2_journal_flush_device_pins(&c->journal, op.migrate.dev); ret = bch2_move_btree(c, migrate_pred, &op, stats) ?: ret; - ret = bch2_gc_btree_replicas(c) ?: ret; + ret = bch2_replicas_gc2(c) ?: ret; ret = bch2_move_data(c, NULL, writepoint_hashed((unsigned long) current), op.start, op.end, migrate_pred, &op, stats) ?: ret; - ret = bch2_gc_data_replicas(c) ?: ret; + ret = bch2_replicas_gc2(c) ?: ret; break; default: ret = -EINVAL; diff --git a/fs/bcachefs/move.h b/fs/bcachefs/move.h index bc87e0670d92..71b3d2b2ddb6 100644 --- a/fs/bcachefs/move.h +++ b/fs/bcachefs/move.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _BCACHEFS_MOVE_H #define _BCACHEFS_MOVE_H @@ -46,7 +47,7 @@ int bch2_migrate_write_init(struct bch_fs *, struct migrate_write *, struct bkey_s_c); typedef enum data_cmd (*move_pred_fn)(struct bch_fs *, void *, - enum bkey_type, struct bkey_s_c_extent, + struct bkey_s_c, struct bch_io_opts *, struct data_opts *); int bch2_move_data(struct bch_fs *, struct bch_ratelimit *, diff --git a/fs/bcachefs/move_types.h b/fs/bcachefs/move_types.h index 832542a879ab..6788170d3f95 100644 --- a/fs/bcachefs/move_types.h +++ b/fs/bcachefs/move_types.h @@ -1,9 +1,11 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _BCACHEFS_MOVE_TYPES_H #define _BCACHEFS_MOVE_TYPES_H struct bch_move_stats { enum bch_data_type data_type; - struct btree_iter iter; + enum btree_id btree_id; + struct bpos pos; atomic64_t keys_moved; atomic64_t sectors_moved; diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c index 7bef456110f1..b13af5662f22 100644 --- a/fs/bcachefs/movinggc.c +++ b/fs/bcachefs/movinggc.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Moving/copying garbage collector * @@ -5,6 +6,7 @@ */ #include "bcachefs.h" +#include "alloc_foreground.h" #include "btree_iter.h" #include "btree_update.h" #include 
"buckets.h" @@ -52,7 +54,7 @@ static inline int sectors_used_cmp(copygc_heap *heap, struct copygc_heap_entry l, struct copygc_heap_entry r) { - return (l.sectors > r.sectors) - (l.sectors < r.sectors); + return cmp_int(l.sectors, r.sectors); } static int bucket_offset_cmp(const void *_l, const void *_r, size_t size) @@ -60,40 +62,46 @@ static int bucket_offset_cmp(const void *_l, const void *_r, size_t size) const struct copygc_heap_entry *l = _l; const struct copygc_heap_entry *r = _r; - return (l->offset > r->offset) - (l->offset < r->offset); + return cmp_int(l->offset, r->offset); } static bool __copygc_pred(struct bch_dev *ca, - struct bkey_s_c_extent e) + struct bkey_s_c k) { copygc_heap *h = &ca->copygc_heap; - const struct bch_extent_ptr *ptr = - bch2_extent_has_device(e, ca->dev_idx); - if (ptr) { - struct copygc_heap_entry search = { .offset = ptr->offset }; + switch (k.k->type) { + case KEY_TYPE_extent: { + struct bkey_s_c_extent e = bkey_s_c_to_extent(k); + const struct bch_extent_ptr *ptr = + bch2_extent_has_device(e, ca->dev_idx); - ssize_t i = eytzinger0_find_le(h->data, h->used, - sizeof(h->data[0]), - bucket_offset_cmp, &search); + if (ptr) { + struct copygc_heap_entry search = { .offset = ptr->offset }; - return (i >= 0 && - ptr->offset < h->data[i].offset + ca->mi.bucket_size && - ptr->gen == h->data[i].gen); + ssize_t i = eytzinger0_find_le(h->data, h->used, + sizeof(h->data[0]), + bucket_offset_cmp, &search); + + return (i >= 0 && + ptr->offset < h->data[i].offset + ca->mi.bucket_size && + ptr->gen == h->data[i].gen); + } + break; + } } return false; } static enum data_cmd copygc_pred(struct bch_fs *c, void *arg, - enum bkey_type type, - struct bkey_s_c_extent e, + struct bkey_s_c k, struct bch_io_opts *io_opts, struct data_opts *data_opts) { struct bch_dev *ca = arg; - if (!__copygc_pred(ca, e)) + if (!__copygc_pred(ca, k)) return DATA_SKIP; data_opts->target = dev_to_target(ca->dev_idx); @@ -108,7 +116,7 @@ static bool have_copygc_reserve(struct bch_dev *ca) spin_lock(&ca->freelist_lock); ret = fifo_full(&ca->free[RESERVE_MOVINGGC]) || - ca->allocator_blocked; + ca->allocator_state != ALLOCATOR_RUNNING; spin_unlock(&ca->freelist_lock); return ret; @@ -159,7 +167,7 @@ static void bch2_copygc(struct bch_fs *c, struct bch_dev *ca) .sectors = bucket_sectors_used(m), .offset = bucket_to_sector(ca, b), }; - heap_add_or_replace(h, e, -sectors_used_cmp); + heap_add_or_replace(h, e, -sectors_used_cmp, NULL); } up_read(&ca->bucket_lock); up_read(&c->gc_lock); @@ -168,7 +176,7 @@ static void bch2_copygc(struct bch_fs *c, struct bch_dev *ca) sectors_to_move += i->sectors; while (sectors_to_move > COPYGC_SECTORS_PER_ITER(ca)) { - BUG_ON(!heap_pop(h, e, -sectors_used_cmp)); + BUG_ON(!heap_pop(h, e, -sectors_used_cmp, NULL)); sectors_to_move -= e.sectors; } @@ -201,7 +209,8 @@ static void bch2_copygc(struct bch_fs *c, struct bch_dev *ca) up_read(&ca->bucket_lock); if (sectors_not_moved && !ret) - bch_warn(c, "copygc finished but %llu/%llu sectors, %llu/%llu buckets not moved", + bch_warn_ratelimited(c, + "copygc finished but %llu/%llu sectors, %llu/%llu buckets not moved", sectors_not_moved, sectors_to_move, buckets_not_moved, buckets_to_move); @@ -227,16 +236,10 @@ static int bch2_copygc_thread(void *arg) last = atomic_long_read(&clock->now); - reserve = div64_u64((ca->mi.nbuckets - ca->mi.first_bucket) * - ca->mi.bucket_size * - c->opts.gc_reserve_percent, 200); + reserve = ca->copygc_threshold; usage = bch2_dev_usage_read(c, ca); - /* - * don't start copygc until less than half 
the gc reserve is - * available: - */ available = __dev_buckets_available(ca, usage) * ca->mi.bucket_size; if (available > reserve) { @@ -280,7 +283,8 @@ int bch2_copygc_start(struct bch_fs *c, struct bch_dev *ca) { struct task_struct *t; - BUG_ON(ca->copygc_thread); + if (ca->copygc_thread) + return 0; if (c->opts.nochanges) return 0; diff --git a/fs/bcachefs/movinggc.h b/fs/bcachefs/movinggc.h index c46fa1f15f5d..dcd479632cf1 100644 --- a/fs/bcachefs/movinggc.h +++ b/fs/bcachefs/movinggc.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _BCACHEFS_MOVINGGC_H #define _BCACHEFS_MOVINGGC_H diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c index 8db8096e5ed4..13a9a2fcd575 100644 --- a/fs/bcachefs/opts.c +++ b/fs/bcachefs/opts.c @@ -1,7 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0 #include <linux/kernel.h> #include "bcachefs.h" +#include "compress.h" #include "disk_groups.h" #include "opts.h" #include "super-io.h" @@ -73,22 +75,22 @@ const char * const bch2_dev_state[] = { void bch2_opts_apply(struct bch_opts *dst, struct bch_opts src) { -#define BCH_OPT(_name, ...) \ +#define x(_name, ...) \ if (opt_defined(src, _name)) \ opt_set(*dst, _name, src._name); BCH_OPTS() -#undef BCH_OPT +#undef x } bool bch2_opt_defined_by_id(const struct bch_opts *opts, enum bch_opt_id id) { switch (id) { -#define BCH_OPT(_name, ...) \ +#define x(_name, ...) \ case Opt_##_name: \ return opt_defined(*opts, _name); BCH_OPTS() -#undef BCH_OPT +#undef x default: BUG(); } @@ -97,11 +99,11 @@ bool bch2_opt_defined_by_id(const struct bch_opts *opts, enum bch_opt_id id) u64 bch2_opt_get_by_id(const struct bch_opts *opts, enum bch_opt_id id) { switch (id) { -#define BCH_OPT(_name, ...) \ +#define x(_name, ...) \ case Opt_##_name: \ return opts->_name; BCH_OPTS() -#undef BCH_OPT +#undef x default: BUG(); } @@ -110,12 +112,12 @@ u64 bch2_opt_get_by_id(const struct bch_opts *opts, enum bch_opt_id id) void bch2_opt_set_by_id(struct bch_opts *opts, enum bch_opt_id id, u64 v) { switch (id) { -#define BCH_OPT(_name, ...) \ +#define x(_name, ...) \ case Opt_##_name: \ opt_set(*opts, _name, v); \ break; BCH_OPTS() -#undef BCH_OPT +#undef x default: BUG(); } @@ -129,11 +131,11 @@ struct bch_opts bch2_opts_from_sb(struct bch_sb *sb) { struct bch_opts opts = bch2_opts_empty(); -#define BCH_OPT(_name, _bits, _mode, _type, _sb_opt, _default) \ +#define x(_name, _bits, _mode, _type, _sb_opt, ...) \ if (_sb_opt != NO_SB_OPT) \ opt_set(opts, _name, _sb_opt(sb)); BCH_OPTS() -#undef BCH_OPT +#undef x return opts; } @@ -141,24 +143,27 @@ struct bch_opts bch2_opts_from_sb(struct bch_sb *sb) const struct bch_option bch2_opt_table[] = { #define OPT_BOOL() .type = BCH_OPT_BOOL #define OPT_UINT(_min, _max) .type = BCH_OPT_UINT, .min = _min, .max = _max +#define OPT_SECTORS(_min, _max) .type = BCH_OPT_SECTORS, .min = _min, .max = _max #define OPT_STR(_choices) .type = BCH_OPT_STR, .choices = _choices #define OPT_FN(_fn) .type = BCH_OPT_FN, \ .parse = _fn##_parse, \ - .print = _fn##_print + .to_text = _fn##_to_text -#define BCH_OPT(_name, _bits, _mode, _type, _sb_opt, _default) \ +#define x(_name, _bits, _mode, _type, _sb_opt, _default, _hint, _help) \ [Opt_##_name] = { \ .attr = { \ .name = #_name, \ - .mode = _mode == OPT_RUNTIME ? 0644 : 0444, \ + .mode = (_mode) & OPT_RUNTIME ? 
0644 : 0444, \ }, \ .mode = _mode, \ + .hint = _hint, \ + .help = _help, \ .set_sb = SET_##_sb_opt, \ _type \ }, BCH_OPTS() -#undef BCH_OPT +#undef x }; int bch2_opt_lookup(const char *name) @@ -217,8 +222,21 @@ int bch2_opt_parse(struct bch_fs *c, const struct bch_option *opt, if (*res < opt->min || *res >= opt->max) return -ERANGE; break; + case BCH_OPT_SECTORS: + ret = bch2_strtou64_h(val, res); + if (ret < 0) + return ret; + + if (*res & 511) + return -EINVAL; + + *res >>= 9; + + if (*res < opt->min || *res >= opt->max) + return -ERANGE; + break; case BCH_OPT_STR: - ret = bch2_read_string_list(val, opt->choices); + ret = match_string(opt->choices, -1, val); if (ret < 0) return ret; @@ -234,38 +252,81 @@ int bch2_opt_parse(struct bch_fs *c, const struct bch_option *opt, return 0; } -int bch2_opt_to_text(struct bch_fs *c, char *buf, size_t len, - const struct bch_option *opt, u64 v, - unsigned flags) +void bch2_opt_to_text(struct printbuf *out, struct bch_fs *c, + const struct bch_option *opt, u64 v, + unsigned flags) { - char *out = buf, *end = buf + len; - if (flags & OPT_SHOW_MOUNT_STYLE) { - if (opt->type == BCH_OPT_BOOL) - return scnprintf(out, end - out, "%s%s", - v ? "" : "no", - opt->attr.name); + if (opt->type == BCH_OPT_BOOL) { + pr_buf(out, "%s%s", + v ? "" : "no", + opt->attr.name); + return; + } - out += scnprintf(out, end - out, "%s=", opt->attr.name); + pr_buf(out, "%s=", opt->attr.name); } switch (opt->type) { case BCH_OPT_BOOL: case BCH_OPT_UINT: - out += scnprintf(out, end - out, "%lli", v); + pr_buf(out, "%lli", v); + break; + case BCH_OPT_SECTORS: + bch2_hprint(out, v); break; case BCH_OPT_STR: - out += (flags & OPT_SHOW_FULL_LIST) - ? bch2_scnprint_string_list(out, end - out, opt->choices, v) - : scnprintf(out, end - out, opt->choices[v]); + if (flags & OPT_SHOW_FULL_LIST) + bch2_string_opt_to_text(out, opt->choices, v); + else + pr_buf(out, opt->choices[v]); break; case BCH_OPT_FN: - return opt->print(c, out, end - out, v); + opt->to_text(out, c, v); + break; default: BUG(); } +} + +int bch2_opt_check_may_set(struct bch_fs *c, int id, u64 v) +{ + int ret = 0; + + switch (id) { + case Opt_compression: + case Opt_background_compression: + ret = bch2_check_set_has_compressed_data(c, v); + break; + case Opt_erasure_code: + if (v && + !(c->sb.features & (1ULL << BCH_FEATURE_EC))) { + mutex_lock(&c->sb_lock); + c->disk_sb.sb->features[0] |= + cpu_to_le64(1ULL << BCH_FEATURE_EC); + + bch2_write_super(c); + mutex_unlock(&c->sb_lock); + } + break; + } + + return ret; +} + +int bch2_opts_check_may_set(struct bch_fs *c) +{ + unsigned i; + int ret; + + for (i = 0; i < bch2_opts_nr; i++) { + ret = bch2_opt_check_may_set(c, i, + bch2_opt_get_by_id(&c->opts, i)); + if (ret) + return ret; + } - return out - buf; + return 0; } int bch2_parse_mount_opts(struct bch_opts *opts, char *options) @@ -303,7 +364,7 @@ int bch2_parse_mount_opts(struct bch_opts *opts, char *options) goto no_val; } - if (bch2_opt_table[id].mode < OPT_MOUNT) + if (!(bch2_opt_table[id].mode & OPT_MOUNT)) goto bad_opt; if (id == Opt_acl && @@ -335,40 +396,40 @@ no_val: struct bch_io_opts bch2_opts_to_inode_opts(struct bch_opts src) { struct bch_io_opts ret = { 0 }; -#define BCH_INODE_OPT(_name, _bits) \ +#define x(_name, _bits) \ if (opt_defined(src, _name)) \ opt_set(ret, _name, src._name); BCH_INODE_OPTS() -#undef BCH_INODE_OPT +#undef x return ret; } struct bch_opts bch2_inode_opts_to_opts(struct bch_io_opts src) { struct bch_opts ret = { 0 }; -#define BCH_INODE_OPT(_name, _bits) \ +#define x(_name, _bits) \ if 
(opt_defined(src, _name)) \ opt_set(ret, _name, src._name); BCH_INODE_OPTS() -#undef BCH_INODE_OPT +#undef x return ret; } void bch2_io_opts_apply(struct bch_io_opts *dst, struct bch_io_opts src) { -#define BCH_INODE_OPT(_name, _bits) \ +#define x(_name, _bits) \ if (opt_defined(src, _name)) \ opt_set(*dst, _name, src._name); BCH_INODE_OPTS() -#undef BCH_INODE_OPT +#undef x } bool bch2_opt_is_inode_opt(enum bch_opt_id id) { static const enum bch_opt_id inode_opt_list[] = { -#define BCH_INODE_OPT(_name, _bits) Opt_##_name, +#define x(_name, _bits) Opt_##_name, BCH_INODE_OPTS() -#undef BCH_INODE_OPT +#undef x }; unsigned i; diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h index e7ab8870d3ac..c6ec9f7effe5 100644 --- a/fs/bcachefs/opts.h +++ b/fs/bcachefs/opts.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _BCACHEFS_OPTS_H #define _BCACHEFS_OPTS_H @@ -31,22 +32,25 @@ extern const char * const bch2_dev_state[]; /* dummy option, for options that aren't stored in the superblock */ LE64_BITMASK(NO_SB_OPT, struct bch_sb, flags[0], 0, 0); +/* When can be set: */ enum opt_mode { - OPT_INTERNAL, - OPT_FORMAT, - OPT_MOUNT, - OPT_RUNTIME, + OPT_FORMAT = (1 << 0), + OPT_MOUNT = (1 << 1), + OPT_RUNTIME = (1 << 2), + OPT_INODE = (1 << 3), + OPT_DEVICE = (1 << 4), }; enum opt_type { BCH_OPT_BOOL, BCH_OPT_UINT, + BCH_OPT_SECTORS, BCH_OPT_STR, BCH_OPT_FN, }; /** - * BCH_OPT(name, type, in mem type, mode, sb_opt) + * x(name, shortopt, type, in mem type, mode, sb_opt) * * @name - name of mount option, sysfs attribute, and struct bch_opts * member @@ -65,132 +69,246 @@ enum opt_type { */ #define BCH_OPTS() \ - BCH_OPT(block_size, u16, OPT_FORMAT, \ - OPT_UINT(1, 128), \ - BCH_SB_BLOCK_SIZE, 8) \ - BCH_OPT(btree_node_size, u16, OPT_FORMAT, \ - OPT_UINT(1, 128), \ - BCH_SB_BTREE_NODE_SIZE, 512) \ - BCH_OPT(errors, u8, OPT_RUNTIME, \ - OPT_STR(bch2_error_actions), \ - BCH_SB_ERROR_ACTION, BCH_ON_ERROR_RO) \ - BCH_OPT(metadata_replicas, u8, OPT_RUNTIME, \ - OPT_UINT(1, BCH_REPLICAS_MAX), \ - BCH_SB_META_REPLICAS_WANT, 1) \ - BCH_OPT(data_replicas, u8, OPT_RUNTIME, \ - OPT_UINT(1, BCH_REPLICAS_MAX), \ - BCH_SB_DATA_REPLICAS_WANT, 1) \ - BCH_OPT(metadata_replicas_required, u8, OPT_MOUNT, \ - OPT_UINT(1, BCH_REPLICAS_MAX), \ - BCH_SB_META_REPLICAS_REQ, 1) \ - BCH_OPT(data_replicas_required, u8, OPT_MOUNT, \ - OPT_UINT(1, BCH_REPLICAS_MAX), \ - BCH_SB_DATA_REPLICAS_REQ, 1) \ - BCH_OPT(metadata_checksum, u8, OPT_RUNTIME, \ - OPT_STR(bch2_csum_types), \ - BCH_SB_META_CSUM_TYPE, BCH_CSUM_OPT_CRC32C) \ - BCH_OPT(data_checksum, u8, OPT_RUNTIME, \ - OPT_STR(bch2_csum_types), \ - BCH_SB_DATA_CSUM_TYPE, BCH_CSUM_OPT_CRC32C) \ - BCH_OPT(compression, u8, OPT_RUNTIME, \ - OPT_STR(bch2_compression_types), \ - BCH_SB_COMPRESSION_TYPE, BCH_COMPRESSION_OPT_NONE)\ - BCH_OPT(background_compression, u8, OPT_RUNTIME, \ - OPT_STR(bch2_compression_types), \ - BCH_SB_BACKGROUND_COMPRESSION_TYPE,BCH_COMPRESSION_OPT_NONE)\ - BCH_OPT(str_hash, u8, OPT_RUNTIME, \ - OPT_STR(bch2_str_hash_types), \ - BCH_SB_STR_HASH_TYPE, BCH_STR_HASH_SIPHASH) \ - BCH_OPT(foreground_target, u16, OPT_RUNTIME, \ - OPT_FN(bch2_opt_target), \ - BCH_SB_FOREGROUND_TARGET, 0) \ - BCH_OPT(background_target, u16, OPT_RUNTIME, \ - OPT_FN(bch2_opt_target), \ - BCH_SB_BACKGROUND_TARGET, 0) \ - BCH_OPT(promote_target, u16, OPT_RUNTIME, \ - OPT_FN(bch2_opt_target), \ - BCH_SB_PROMOTE_TARGET, 0) \ - BCH_OPT(inodes_32bit, u8, OPT_RUNTIME, \ - OPT_BOOL(), \ - BCH_SB_INODE_32BIT, false) \ - BCH_OPT(gc_reserve_percent, u8, OPT_MOUNT, \ - OPT_UINT(5, 21), \ - 
BCH_SB_GC_RESERVE, 8) \ - BCH_OPT(root_reserve_percent, u8, OPT_MOUNT, \ - OPT_UINT(0, 100), \ - BCH_SB_ROOT_RESERVE, 0) \ - BCH_OPT(wide_macs, u8, OPT_RUNTIME, \ - OPT_BOOL(), \ - BCH_SB_128_BIT_MACS, false) \ - BCH_OPT(acl, u8, OPT_MOUNT, \ - OPT_BOOL(), \ - BCH_SB_POSIX_ACL, true) \ - BCH_OPT(usrquota, u8, OPT_MOUNT, \ - OPT_BOOL(), \ - BCH_SB_USRQUOTA, false) \ - BCH_OPT(grpquota, u8, OPT_MOUNT, \ - OPT_BOOL(), \ - BCH_SB_GRPQUOTA, false) \ - BCH_OPT(prjquota, u8, OPT_MOUNT, \ - OPT_BOOL(), \ - BCH_SB_PRJQUOTA, false) \ - BCH_OPT(degraded, u8, OPT_MOUNT, \ - OPT_BOOL(), \ - NO_SB_OPT, false) \ - BCH_OPT(verbose_recovery, u8, OPT_MOUNT, \ - OPT_BOOL(), \ - NO_SB_OPT, false) \ - BCH_OPT(verbose_init, u8, OPT_MOUNT, \ - OPT_BOOL(), \ - NO_SB_OPT, false) \ - BCH_OPT(journal_flush_disabled, u8, OPT_RUNTIME, \ - OPT_BOOL(), \ - NO_SB_OPT, false) \ - BCH_OPT(nofsck, u8, OPT_MOUNT, \ - OPT_BOOL(), \ - NO_SB_OPT, false) \ - BCH_OPT(fix_errors, u8, OPT_MOUNT, \ - OPT_BOOL(), \ - NO_SB_OPT, false) \ - BCH_OPT(nochanges, u8, OPT_MOUNT, \ - OPT_BOOL(), \ - NO_SB_OPT, false) \ - BCH_OPT(noreplay, u8, OPT_MOUNT, \ - OPT_BOOL(), \ - NO_SB_OPT, false) \ - BCH_OPT(norecovery, u8, OPT_MOUNT, \ - OPT_BOOL(), \ - NO_SB_OPT, false) \ - BCH_OPT(noexcl, u8, OPT_MOUNT, \ - OPT_BOOL(), \ - NO_SB_OPT, false) \ - BCH_OPT(sb, u64, OPT_MOUNT, \ - OPT_UINT(0, S64_MAX), \ - NO_SB_OPT, BCH_SB_SECTOR) \ - BCH_OPT(read_only, u8, OPT_INTERNAL, \ - OPT_BOOL(), \ - NO_SB_OPT, false) \ - BCH_OPT(nostart, u8, OPT_INTERNAL, \ - OPT_BOOL(), \ - NO_SB_OPT, false) + x(block_size, u16, \ + OPT_FORMAT, \ + OPT_SECTORS(1, 128), \ + BCH_SB_BLOCK_SIZE, 8, \ + "size", NULL) \ + x(btree_node_size, u16, \ + OPT_FORMAT, \ + OPT_SECTORS(1, 128), \ + BCH_SB_BTREE_NODE_SIZE, 512, \ + "size", "Btree node size, default 256k") \ + x(errors, u8, \ + OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ + OPT_STR(bch2_error_actions), \ + BCH_SB_ERROR_ACTION, BCH_ON_ERROR_RO, \ + NULL, "Action to take on filesystem error") \ + x(metadata_replicas, u8, \ + OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ + OPT_UINT(1, BCH_REPLICAS_MAX), \ + BCH_SB_META_REPLICAS_WANT, 1, \ + "#", "Number of metadata replicas") \ + x(data_replicas, u8, \ + OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE, \ + OPT_UINT(1, BCH_REPLICAS_MAX), \ + BCH_SB_DATA_REPLICAS_WANT, 1, \ + "#", "Number of data replicas") \ + x(metadata_replicas_required, u8, \ + OPT_FORMAT|OPT_MOUNT, \ + OPT_UINT(1, BCH_REPLICAS_MAX), \ + BCH_SB_META_REPLICAS_REQ, 1, \ + "#", NULL) \ + x(data_replicas_required, u8, \ + OPT_FORMAT|OPT_MOUNT, \ + OPT_UINT(1, BCH_REPLICAS_MAX), \ + BCH_SB_DATA_REPLICAS_REQ, 1, \ + "#", NULL) \ + x(metadata_checksum, u8, \ + OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ + OPT_STR(bch2_csum_types), \ + BCH_SB_META_CSUM_TYPE, BCH_CSUM_OPT_CRC32C, \ + NULL, NULL) \ + x(data_checksum, u8, \ + OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE, \ + OPT_STR(bch2_csum_types), \ + BCH_SB_DATA_CSUM_TYPE, BCH_CSUM_OPT_CRC32C, \ + NULL, NULL) \ + x(compression, u8, \ + OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE, \ + OPT_STR(bch2_compression_types), \ + BCH_SB_COMPRESSION_TYPE, BCH_COMPRESSION_OPT_NONE, \ + NULL, NULL) \ + x(background_compression, u8, \ + OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE, \ + OPT_STR(bch2_compression_types), \ + BCH_SB_BACKGROUND_COMPRESSION_TYPE,BCH_COMPRESSION_OPT_NONE, \ + NULL, NULL) \ + x(str_hash, u8, \ + OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ + OPT_STR(bch2_str_hash_types), \ + BCH_SB_STR_HASH_TYPE, BCH_STR_HASH_SIPHASH, \ + NULL, "Hash function for directory entries and xattrs")\ + 
x(foreground_target, u16, \ + OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE, \ + OPT_FN(bch2_opt_target), \ + BCH_SB_FOREGROUND_TARGET, 0, \ + "(target)", "Device or disk group for foreground writes") \ + x(background_target, u16, \ + OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE, \ + OPT_FN(bch2_opt_target), \ + BCH_SB_BACKGROUND_TARGET, 0, \ + "(target)", "Device or disk group to move data to in the background")\ + x(promote_target, u16, \ + OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE, \ + OPT_FN(bch2_opt_target), \ + BCH_SB_PROMOTE_TARGET, 0, \ + "(target)", "Device or disk group to promote data to on read")\ + x(erasure_code, u16, \ + OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE, \ + OPT_BOOL(), \ + BCH_SB_ERASURE_CODE, false, \ + NULL, "Enable erasure coding (DO NOT USE YET)") \ + x(inodes_32bit, u8, \ + OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ + OPT_BOOL(), \ + BCH_SB_INODE_32BIT, false, \ + NULL, "Constrain inode numbers to 32 bits") \ + x(gc_reserve_percent, u8, \ + OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ + OPT_UINT(5, 21), \ + BCH_SB_GC_RESERVE, 8, \ + "%", "Percentage of disk space to reserve for copygc")\ + x(gc_reserve_bytes, u64, \ + OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ + OPT_SECTORS(0, U64_MAX), \ + BCH_SB_GC_RESERVE_BYTES, 0, \ + "%", "Amount of disk space to reserve for copygc\n" \ + "Takes precedence over gc_reserve_percent if set")\ + x(root_reserve_percent, u8, \ + OPT_FORMAT|OPT_MOUNT, \ + OPT_UINT(0, 100), \ + BCH_SB_ROOT_RESERVE, 0, \ + "%", "Percentage of disk space to reserve for superuser")\ + x(wide_macs, u8, \ + OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ + OPT_BOOL(), \ + BCH_SB_128_BIT_MACS, false, \ + NULL, "Store full 128 bits of cryptographic MACs, instead of 80")\ + x(acl, u8, \ + OPT_FORMAT|OPT_MOUNT, \ + OPT_BOOL(), \ + BCH_SB_POSIX_ACL, true, \ + NULL, "Enable POSIX acls") \ + x(usrquota, u8, \ + OPT_FORMAT|OPT_MOUNT, \ + OPT_BOOL(), \ + BCH_SB_USRQUOTA, false, \ + NULL, "Enable user quotas") \ + x(grpquota, u8, \ + OPT_FORMAT|OPT_MOUNT, \ + OPT_BOOL(), \ + BCH_SB_GRPQUOTA, false, \ + NULL, "Enable group quotas") \ + x(prjquota, u8, \ + OPT_FORMAT|OPT_MOUNT, \ + OPT_BOOL(), \ + BCH_SB_PRJQUOTA, false, \ + NULL, "Enable project quotas") \ + x(degraded, u8, \ + OPT_MOUNT, \ + OPT_BOOL(), \ + NO_SB_OPT, false, \ + NULL, "Allow mounting in degraded mode") \ + x(discard, u8, \ + OPT_MOUNT|OPT_DEVICE, \ + OPT_BOOL(), \ + NO_SB_OPT, false, \ + NULL, "Enable discard/TRIM support") \ + x(verbose, u8, \ + OPT_MOUNT, \ + OPT_BOOL(), \ + NO_SB_OPT, false, \ + NULL, "Extra debugging information during mount/recovery")\ + x(journal_flush_disabled, u8, \ + OPT_MOUNT|OPT_RUNTIME, \ + OPT_BOOL(), \ + NO_SB_OPT, false, \ + NULL, "Disable journal flush on sync/fsync\n" \ + "If enabled, writes can be lost, but only since the\n"\ + "last journal write (default 1 second)") \ + x(fsck, u8, \ + OPT_MOUNT, \ + OPT_BOOL(), \ + NO_SB_OPT, false, \ + NULL, "Run fsck on mount") \ + x(fix_errors, u8, \ + OPT_MOUNT, \ + OPT_BOOL(), \ + NO_SB_OPT, false, \ + NULL, "Fix errors during fsck without asking") \ + x(nochanges, u8, \ + OPT_MOUNT, \ + OPT_BOOL(), \ + NO_SB_OPT, false, \ + NULL, "Super read only mode - no writes at all will be issued,\n"\ + "even if we have to replay the journal") \ + x(norecovery, u8, \ + OPT_MOUNT, \ + OPT_BOOL(), \ + NO_SB_OPT, false, \ + NULL, "Don't replay the journal") \ + x(noexcl, u8, \ + OPT_MOUNT, \ + OPT_BOOL(), \ + NO_SB_OPT, false, \ + NULL, "Don't open device in exclusive mode") \ + x(sb, u64, \ + OPT_MOUNT, \ + OPT_UINT(0, S64_MAX), \ + NO_SB_OPT, BCH_SB_SECTOR, \ + 
"offset", "Sector offset of superblock") \ + x(read_only, u8, \ + 0, \ + OPT_BOOL(), \ + NO_SB_OPT, false, \ + NULL, NULL) \ + x(nostart, u8, \ + 0, \ + OPT_BOOL(), \ + NO_SB_OPT, false, \ + NULL, "Don\'t start filesystem, only open devices") \ + x(version_upgrade, u8, \ + OPT_MOUNT, \ + OPT_BOOL(), \ + NO_SB_OPT, false, \ + NULL, "Set superblock to latest version,\n" \ + "allowing any new features to be used") \ + x(project, u8, \ + OPT_INODE, \ + OPT_BOOL(), \ + NO_SB_OPT, false, \ + NULL, NULL) \ + x(fs_size, u64, \ + OPT_DEVICE, \ + OPT_SECTORS(0, S64_MAX), \ + NO_SB_OPT, 0, \ + "size", "Size of filesystem on device") \ + x(bucket, u32, \ + OPT_DEVICE, \ + OPT_SECTORS(0, S64_MAX), \ + NO_SB_OPT, 0, \ + "size", "Size of filesystem on device") \ + x(durability, u8, \ + OPT_DEVICE, \ + OPT_UINT(0, BCH_REPLICAS_MAX), \ + NO_SB_OPT, 1, \ + "n", "Data written to this device will be considered\n"\ + "to have already been replicated n times") \ + x(new_inode_updates, u8, \ + OPT_MOUNT, \ + OPT_BOOL(), \ + NO_SB_OPT, false, \ + NULL, "Enable new btree write-cache for inode updates") + struct bch_opts { -#define BCH_OPT(_name, _bits, ...) unsigned _name##_defined:1; +#define x(_name, _bits, ...) unsigned _name##_defined:1; BCH_OPTS() -#undef BCH_OPT +#undef x -#define BCH_OPT(_name, _bits, ...) _bits _name; +#define x(_name, _bits, ...) _bits _name; BCH_OPTS() -#undef BCH_OPT +#undef x }; static const struct bch_opts bch2_opts_default = { -#define BCH_OPT(_name, _bits, _mode, _type, _sb_opt, _default) \ +#define x(_name, _bits, _mode, _type, _sb_opt, _default, ...) \ ._name##_defined = true, \ ._name = _default, \ BCH_OPTS() -#undef BCH_OPT +#undef x }; #define opt_defined(_opts, _name) ((_opts)._name##_defined) @@ -212,13 +330,14 @@ static inline struct bch_opts bch2_opts_empty(void) void bch2_opts_apply(struct bch_opts *, struct bch_opts); enum bch_opt_id { -#define BCH_OPT(_name, ...) Opt_##_name, +#define x(_name, ...) 
Opt_##_name, BCH_OPTS() -#undef BCH_OPT +#undef x bch2_opts_nr }; struct bch_fs; +struct printbuf; struct bch_option { struct attribute attr; @@ -235,10 +354,13 @@ struct bch_option { }; struct { int (*parse)(struct bch_fs *, const char *, u64 *); - int (*print)(struct bch_fs *, char *, size_t, u64); + void (*to_text)(struct printbuf *, struct bch_fs *, u64); }; }; + const char *hint; + const char *help; + }; extern const struct bch_option bch2_opt_table[]; @@ -255,30 +377,23 @@ int bch2_opt_parse(struct bch_fs *, const struct bch_option *, const char *, u64 #define OPT_SHOW_FULL_LIST (1 << 0) #define OPT_SHOW_MOUNT_STYLE (1 << 1) -int bch2_opt_to_text(struct bch_fs *, char *, size_t, - const struct bch_option *, u64, unsigned); +void bch2_opt_to_text(struct printbuf *, struct bch_fs *, + const struct bch_option *, u64, unsigned); +int bch2_opt_check_may_set(struct bch_fs *, int, u64); +int bch2_opts_check_may_set(struct bch_fs *); int bch2_parse_mount_opts(struct bch_opts *, char *); /* inode opts: */ -#define BCH_INODE_OPTS() \ - BCH_INODE_OPT(data_checksum, 8) \ - BCH_INODE_OPT(compression, 8) \ - BCH_INODE_OPT(background_compression, 8) \ - BCH_INODE_OPT(data_replicas, 8) \ - BCH_INODE_OPT(promote_target, 16) \ - BCH_INODE_OPT(foreground_target, 16) \ - BCH_INODE_OPT(background_target, 16) - struct bch_io_opts { -#define BCH_INODE_OPT(_name, _bits) unsigned _name##_defined:1; +#define x(_name, _bits) unsigned _name##_defined:1; BCH_INODE_OPTS() -#undef BCH_INODE_OPT +#undef x -#define BCH_INODE_OPT(_name, _bits) u##_bits _name; +#define x(_name, _bits) u##_bits _name; BCH_INODE_OPTS() -#undef BCH_INODE_OPT +#undef x }; struct bch_io_opts bch2_opts_to_inode_opts(struct bch_opts); diff --git a/fs/bcachefs/quota.c b/fs/bcachefs/quota.c index bb03d83a53e4..f0da0fac09bf 100644 --- a/fs/bcachefs/quota.c +++ b/fs/bcachefs/quota.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" #include "btree_update.h" #include "inode.h" @@ -21,23 +22,13 @@ const struct bch_sb_field_ops bch_sb_field_ops_quota = { const char *bch2_quota_invalid(const struct bch_fs *c, struct bkey_s_c k) { - struct bkey_s_c_quota dq; - if (k.k->p.inode >= QTYP_NR) return "invalid quota type"; - switch (k.k->type) { - case BCH_QUOTA: { - dq = bkey_s_c_to_quota(k); - - if (bkey_val_bytes(k.k) != sizeof(struct bch_quota)) - return "incorrect value size"; + if (bkey_val_bytes(k.k) != sizeof(struct bch_quota)) + return "incorrect value size"; - return NULL; - } - default: - return "invalid type"; - } + return NULL; } static const char * const bch2_quota_counters[] = { @@ -45,24 +36,17 @@ static const char * const bch2_quota_counters[] = { "inodes", }; -void bch2_quota_to_text(struct bch_fs *c, char *buf, - size_t size, struct bkey_s_c k) +void bch2_quota_to_text(struct printbuf *out, struct bch_fs *c, + struct bkey_s_c k) { - char *out = buf, *end= buf + size; - struct bkey_s_c_quota dq; + struct bkey_s_c_quota dq = bkey_s_c_to_quota(k); unsigned i; - switch (k.k->type) { - case BCH_QUOTA: - dq = bkey_s_c_to_quota(k); - - for (i = 0; i < Q_COUNTERS; i++) - out += scnprintf(out, end - out, "%s hardlimit %llu softlimit %llu", - bch2_quota_counters[i], - le64_to_cpu(dq.v->c[i].hardlimit), - le64_to_cpu(dq.v->c[i].softlimit)); - break; - } + for (i = 0; i < Q_COUNTERS; i++) + pr_buf(out, "%s hardlimit %llu softlimit %llu", + bch2_quota_counters[i], + le64_to_cpu(dq.v->c[i].hardlimit), + le64_to_cpu(dq.v->c[i].softlimit)); } #ifdef CONFIG_BCACHEFS_QUOTA @@ -178,7 +162,7 @@ static int 
bch2_quota_check_limit(struct bch_fs *c, BUG_ON((s64) n < 0); - if (mode == BCH_QUOTA_NOCHECK) + if (mode == KEY_TYPE_QUOTA_NOCHECK) return 0; if (v <= 0) { @@ -201,7 +185,7 @@ static int bch2_quota_check_limit(struct bch_fs *c, if (qc->hardlimit && qc->hardlimit < n && !ignore_hardlimit(q)) { - if (mode == BCH_QUOTA_PREALLOC) + if (mode == KEY_TYPE_QUOTA_PREALLOC) return -EDQUOT; prepare_warning(qc, qtype, counter, msgs, HARDWARN); @@ -212,7 +196,7 @@ static int bch2_quota_check_limit(struct bch_fs *c, qc->timer && ktime_get_real_seconds() >= qc->timer && !ignore_hardlimit(q)) { - if (mode == BCH_QUOTA_PREALLOC) + if (mode == KEY_TYPE_QUOTA_PREALLOC) return -EDQUOT; prepare_warning(qc, qtype, counter, msgs, SOFTLONGWARN); @@ -221,7 +205,7 @@ static int bch2_quota_check_limit(struct bch_fs *c, if (qc->softlimit && qc->softlimit < n && qc->timer == 0) { - if (mode == BCH_QUOTA_PREALLOC) + if (mode == KEY_TYPE_QUOTA_PREALLOC) return -EDQUOT; prepare_warning(qc, qtype, counter, msgs, SOFTWARN); @@ -286,7 +270,8 @@ static void __bch2_quota_transfer(struct bch_memquota *src_q, int bch2_quota_transfer(struct bch_fs *c, unsigned qtypes, struct bch_qid dst, - struct bch_qid src, u64 space) + struct bch_qid src, u64 space, + enum quota_acct_mode mode) { struct bch_memquota_type *q; struct bch_memquota *src_q[3], *dst_q[3]; @@ -312,13 +297,13 @@ int bch2_quota_transfer(struct bch_fs *c, unsigned qtypes, ret = bch2_quota_check_limit(c, i, dst_q[i], &msgs, Q_SPC, dst_q[i]->c[Q_SPC].v + space, - BCH_QUOTA_PREALLOC); + mode); if (ret) goto err; ret = bch2_quota_check_limit(c, i, dst_q[i], &msgs, Q_INO, dst_q[i]->c[Q_INO].v + 1, - BCH_QUOTA_PREALLOC); + mode); if (ret) goto err; } @@ -347,7 +332,7 @@ static int __bch2_quota_set(struct bch_fs *c, struct bkey_s_c k) BUG_ON(k.k->p.inode >= QTYP_NR); switch (k.k->type) { - case BCH_QUOTA: + case KEY_TYPE_quota: dq = bkey_s_c_to_quota(k); q = &c->quotas[k.k->p.inode]; @@ -371,12 +356,15 @@ static int __bch2_quota_set(struct bch_fs *c, struct bkey_s_c k) static int bch2_quota_init_type(struct bch_fs *c, enum quota_types type) { - struct btree_iter iter; + struct btree_trans trans; + struct btree_iter *iter; struct bkey_s_c k; int ret = 0; - for_each_btree_key(&iter, c, BTREE_ID_QUOTAS, POS(type, 0), - BTREE_ITER_PREFETCH, k) { + bch2_trans_init(&trans, c, 0, 0); + + for_each_btree_key(&trans, iter, BTREE_ID_QUOTAS, POS(type, 0), + BTREE_ITER_PREFETCH, k, ret) { if (k.k->p.inode != type) break; @@ -385,7 +373,7 @@ static int bch2_quota_init_type(struct bch_fs *c, enum quota_types type) break; } - return bch2_btree_iter_unlock(&iter) ?: ret; + return bch2_trans_exit(&trans) ?: ret; } void bch2_fs_quota_exit(struct bch_fs *c) @@ -429,7 +417,8 @@ int bch2_fs_quota_read(struct bch_fs *c) { unsigned i, qtypes = enabled_qtypes(c); struct bch_memquota_type *q; - struct btree_iter iter; + struct btree_trans trans; + struct btree_iter *iter; struct bch_inode_unpacked u; struct bkey_s_c k; int ret; @@ -444,21 +433,23 @@ int bch2_fs_quota_read(struct bch_fs *c) return ret; } - for_each_btree_key(&iter, c, BTREE_ID_INODES, POS_MIN, - BTREE_ITER_PREFETCH, k) { + bch2_trans_init(&trans, c, 0, 0); + + for_each_btree_key(&trans, iter, BTREE_ID_INODES, POS_MIN, + BTREE_ITER_PREFETCH, k, ret) { switch (k.k->type) { - case BCH_INODE_FS: + case KEY_TYPE_inode: ret = bch2_inode_unpack(bkey_s_c_to_inode(k), &u); if (ret) return ret; bch2_quota_acct(c, bch_qid(&u), Q_SPC, u.bi_sectors, - BCH_QUOTA_NOCHECK); + KEY_TYPE_QUOTA_NOCHECK); bch2_quota_acct(c, bch_qid(&u), Q_INO, 1, - 
BCH_QUOTA_NOCHECK); + KEY_TYPE_QUOTA_NOCHECK); } } - return bch2_btree_iter_unlock(&iter) ?: ret; + return bch2_trans_exit(&trans) ?: ret; } /* Enable/disable/delete quotas for an entire filesystem: */ @@ -467,7 +458,7 @@ static int bch2_quota_enable(struct super_block *sb, unsigned uflags) { struct bch_fs *c = sb->s_fs_info; - if (sb->s_flags & MS_RDONLY) + if (sb->s_flags & SB_RDONLY) return -EROFS; /* Accounting must be enabled at mount time: */ @@ -504,7 +495,7 @@ static int bch2_quota_disable(struct super_block *sb, unsigned uflags) { struct bch_fs *c = sb->s_fs_info; - if (sb->s_flags & MS_RDONLY) + if (sb->s_flags & SB_RDONLY) return -EROFS; mutex_lock(&c->sb_lock); @@ -528,7 +519,7 @@ static int bch2_quota_remove(struct super_block *sb, unsigned uflags) struct bch_fs *c = sb->s_fs_info; int ret; - if (sb->s_flags & MS_RDONLY) + if (sb->s_flags & SB_RDONLY) return -EROFS; if (uflags & FS_USER_QUOTA) { @@ -538,7 +529,7 @@ static int bch2_quota_remove(struct super_block *sb, unsigned uflags) ret = bch2_btree_delete_range(c, BTREE_ID_QUOTAS, POS(QTYP_USR, 0), POS(QTYP_USR + 1, 0), - ZERO_VERSION, NULL, NULL, NULL); + NULL); if (ret) return ret; } @@ -550,7 +541,7 @@ static int bch2_quota_remove(struct super_block *sb, unsigned uflags) ret = bch2_btree_delete_range(c, BTREE_ID_QUOTAS, POS(QTYP_GRP, 0), POS(QTYP_GRP + 1, 0), - ZERO_VERSION, NULL, NULL, NULL); + NULL); if (ret) return ret; } @@ -562,7 +553,7 @@ static int bch2_quota_remove(struct super_block *sb, unsigned uflags) ret = bch2_btree_delete_range(c, BTREE_ID_QUOTAS, POS(QTYP_PRJ, 0), POS(QTYP_PRJ + 1, 0), - ZERO_VERSION, NULL, NULL, NULL); + NULL); if (ret) return ret; } @@ -610,7 +601,7 @@ static int bch2_quota_set_info(struct super_block *sb, int type, struct bch_sb_field_quota *sb_quota; struct bch_memquota_type *q; - if (sb->s_flags & MS_RDONLY) + if (sb->s_flags & SB_RDONLY) return -EROFS; if (type >= QTYP_NR) @@ -700,22 +691,19 @@ static int bch2_get_next_quota(struct super_block *sb, struct kqid *kqid, struct bch_fs *c = sb->s_fs_info; struct bch_memquota_type *q = &c->quotas[kqid->type]; qid_t qid = from_kqid(&init_user_ns, *kqid); - struct genradix_iter iter = genradix_iter_init(&q->table, qid); + struct genradix_iter iter; struct bch_memquota *mq; int ret = 0; mutex_lock(&q->lock); - while ((mq = genradix_iter_peek(&iter, &q->table))) { + genradix_for_each_from(&q->table, iter, mq, qid) if (memcmp(mq, page_address(ZERO_PAGE(0)), sizeof(*mq))) { __bch2_quota_get(qdq, mq); *kqid = make_kqid(current_user_ns(), kqid->type, iter.pos); goto found; } - genradix_iter_advance(&iter, &q->table); - } - ret = -ENOENT; found: mutex_unlock(&q->lock); @@ -726,27 +714,30 @@ static int bch2_set_quota(struct super_block *sb, struct kqid qid, struct qc_dqblk *qdq) { struct bch_fs *c = sb->s_fs_info; - struct btree_iter iter; + struct btree_trans trans; + struct btree_iter *iter; struct bkey_s_c k; struct bkey_i_quota new_quota; int ret; - if (sb->s_flags & MS_RDONLY) + if (sb->s_flags & SB_RDONLY) return -EROFS; bkey_quota_init(&new_quota.k_i); new_quota.k.p = POS(qid.type, from_kqid(&init_user_ns, qid)); - bch2_btree_iter_init(&iter, c, BTREE_ID_QUOTAS, new_quota.k.p, - BTREE_ITER_SLOTS|BTREE_ITER_INTENT); - k = bch2_btree_iter_peek_slot(&iter); + bch2_trans_init(&trans, c, 0, 0); + + iter = bch2_trans_get_iter(&trans, BTREE_ID_QUOTAS, new_quota.k.p, + BTREE_ITER_SLOTS|BTREE_ITER_INTENT); + k = bch2_btree_iter_peek_slot(iter); - ret = btree_iter_err(k); + ret = bkey_err(k); if (unlikely(ret)) return ret; switch (k.k->type) { - case 
BCH_QUOTA: + case KEY_TYPE_quota: new_quota.v = *bkey_s_c_to_quota(k).v; break; } @@ -761,9 +752,11 @@ static int bch2_set_quota(struct super_block *sb, struct kqid qid, if (qdq->d_fieldmask & QC_INO_HARD) new_quota.v.c[Q_INO].hardlimit = cpu_to_le64(qdq->d_ino_hardlimit); - ret = bch2_btree_insert_at(c, NULL, NULL, NULL, 0, - BTREE_INSERT_ENTRY(&iter, &new_quota.k_i)); - bch2_btree_iter_unlock(&iter); + bch2_trans_update(&trans, BTREE_INSERT_ENTRY(iter, &new_quota.k_i)); + + ret = bch2_trans_commit(&trans, NULL, NULL, 0); + + bch2_trans_exit(&trans); if (ret) return ret; diff --git a/fs/bcachefs/quota.h b/fs/bcachefs/quota.h index 0b24f22cf4fb..51e4f9713ef0 100644 --- a/fs/bcachefs/quota.h +++ b/fs/bcachefs/quota.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _BCACHEFS_QUOTA_H #define _BCACHEFS_QUOTA_H @@ -7,25 +8,19 @@ extern const struct bch_sb_field_ops bch_sb_field_ops_quota; const char *bch2_quota_invalid(const struct bch_fs *, struct bkey_s_c); -void bch2_quota_to_text(struct bch_fs *, char *, size_t, struct bkey_s_c); +void bch2_quota_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); -#define bch2_bkey_quota_ops (struct bkey_ops) { \ +#define bch2_bkey_ops_quota (struct bkey_ops) { \ .key_invalid = bch2_quota_invalid, \ .val_to_text = bch2_quota_to_text, \ } -enum quota_acct_mode { - BCH_QUOTA_PREALLOC, - BCH_QUOTA_WARN, - BCH_QUOTA_NOCHECK, -}; - static inline struct bch_qid bch_qid(struct bch_inode_unpacked *u) { return (struct bch_qid) { .q[QTYP_USR] = u->bi_uid, .q[QTYP_GRP] = u->bi_gid, - .q[QTYP_PRJ] = u->bi_project, + .q[QTYP_PRJ] = u->bi_project ? u->bi_project - 1 : 0, }; } @@ -42,7 +37,7 @@ int bch2_quota_acct(struct bch_fs *, struct bch_qid, enum quota_counters, s64, enum quota_acct_mode); int bch2_quota_transfer(struct bch_fs *, unsigned, struct bch_qid, - struct bch_qid, u64); + struct bch_qid, u64, enum quota_acct_mode); void bch2_fs_quota_exit(struct bch_fs *); void bch2_fs_quota_init(struct bch_fs *); @@ -61,7 +56,8 @@ static inline int bch2_quota_acct(struct bch_fs *c, struct bch_qid qid, static inline int bch2_quota_transfer(struct bch_fs *c, unsigned qtypes, struct bch_qid dst, - struct bch_qid src, u64 space) + struct bch_qid src, u64 space, + enum quota_acct_mode mode) { return 0; } diff --git a/fs/bcachefs/quota_types.h b/fs/bcachefs/quota_types.h index bcaed4ea8345..6a136083d389 100644 --- a/fs/bcachefs/quota_types.h +++ b/fs/bcachefs/quota_types.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _BCACHEFS_QUOTA_TYPES_H #define _BCACHEFS_QUOTA_TYPES_H @@ -7,6 +8,12 @@ struct bch_qid { u32 q[QTYP_NR]; }; +enum quota_acct_mode { + KEY_TYPE_QUOTA_PREALLOC, + KEY_TYPE_QUOTA_WARN, + KEY_TYPE_QUOTA_NOCHECK, +}; + struct memquota_counter { u64 v; u64 hardlimit; diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c index 4154b1e97acd..6bdd68177ac9 100644 --- a/fs/bcachefs/rebalance.c +++ b/fs/bcachefs/rebalance.c @@ -1,6 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" -#include "alloc.h" +#include "alloc_foreground.h" #include "btree_iter.h" #include "buckets.h" #include "clock.h" @@ -17,17 +18,16 @@ #include <trace/events/bcachefs.h> static inline bool rebalance_ptr_pred(struct bch_fs *c, - const struct bch_extent_ptr *ptr, - struct bch_extent_crc_unpacked crc, + struct extent_ptr_decoded p, struct bch_io_opts *io_opts) { if (io_opts->background_target && - !bch2_dev_in_target(c, ptr->dev, io_opts->background_target) && - !ptr->cached) + !bch2_dev_in_target(c, p.ptr.dev, 
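/*
 * Editor's sketch: the rebalance_ptr_pred() change here replaces the
 * separate (ptr, crc) argument pair with one decoded pointer -- struct
 * extent_ptr_decoded bundles the raw pointer with its unpacked
 * crc/compression state.  The types and callback below are illustrative
 * stand-ins, not the bcachefs API.
 */
#include <stdbool.h>

struct decoded_ptr {
        unsigned dev;
        bool     cached;
        unsigned compression_type;
};

typedef bool (*in_target_fn)(unsigned dev, unsigned target);

static bool needs_rebalance(const struct decoded_ptr *p,
                            unsigned background_target,
                            unsigned background_compression,
                            in_target_fn dev_in_target)
{
        /* move if this is dirty data sitting outside the target */
        if (background_target &&
            !dev_in_target(p->dev, background_target) &&
            !p->cached)
                return true;

        /* recompress if on-disk compression doesn't match the option */
        return background_compression &&
               p->compression_type != background_compression;
}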
io_opts->background_target) && + !p.ptr.cached) return true; if (io_opts->background_compression && - crc.compression_type != + p.crc.compression_type != bch2_compression_opt_to_type[io_opts->background_compression]) return true; @@ -38,8 +38,8 @@ void bch2_rebalance_add_key(struct bch_fs *c, struct bkey_s_c k, struct bch_io_opts *io_opts) { - const struct bch_extent_ptr *ptr; - struct bch_extent_crc_unpacked crc; + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; struct bkey_s_c_extent e; if (!bkey_extent_is_data(k.k)) @@ -51,13 +51,13 @@ void bch2_rebalance_add_key(struct bch_fs *c, e = bkey_s_c_to_extent(k); - extent_for_each_ptr_crc(e, ptr, crc) - if (rebalance_ptr_pred(c, ptr, crc, io_opts)) { - struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); + extent_for_each_ptr_decode(e, p, entry) + if (rebalance_ptr_pred(c, p, io_opts)) { + struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev); - if (atomic64_add_return(crc.compressed_size, + if (atomic64_add_return(p.crc.compressed_size, &ca->rebalance_work) == - crc.compressed_size) + p.crc.compressed_size) rebalance_wakeup(c); } } @@ -70,28 +70,34 @@ void bch2_rebalance_add_work(struct bch_fs *c, u64 sectors) } static enum data_cmd rebalance_pred(struct bch_fs *c, void *arg, - enum bkey_type type, - struct bkey_s_c_extent e, + struct bkey_s_c k, struct bch_io_opts *io_opts, struct data_opts *data_opts) { - const struct bch_extent_ptr *ptr; - struct bch_extent_crc_unpacked crc; + switch (k.k->type) { + case KEY_TYPE_extent: { + struct bkey_s_c_extent e = bkey_s_c_to_extent(k); + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; - /* Make sure we have room to add a new pointer: */ - if (bkey_val_u64s(e.k) + BKEY_EXTENT_PTR_U64s_MAX > - BKEY_EXTENT_VAL_U64s_MAX) - return DATA_SKIP; + /* Make sure we have room to add a new pointer: */ + if (bkey_val_u64s(e.k) + BKEY_EXTENT_PTR_U64s_MAX > + BKEY_EXTENT_VAL_U64s_MAX) + return DATA_SKIP; - extent_for_each_ptr_crc(e, ptr, crc) - if (rebalance_ptr_pred(c, ptr, crc, io_opts)) - goto found; + extent_for_each_ptr_decode(e, p, entry) + if (rebalance_ptr_pred(c, p, io_opts)) + goto found; - return DATA_SKIP; + return DATA_SKIP; found: - data_opts->target = io_opts->background_target; - data_opts->btree_insert_flags = 0; - return DATA_ADD_REPLICAS; + data_opts->target = io_opts->background_target; + data_opts->btree_insert_flags = 0; + return DATA_ADD_REPLICAS; + } + default: + return DATA_SKIP; + } } struct rebalance_work { @@ -112,7 +118,7 @@ static void rebalance_work_accumulate(struct rebalance_work *w, work = U64_MAX; work = min(work, capacity); - percent_full = div_u64(work * 100, capacity); + percent_full = div64_u64(work * 100, capacity); if (percent_full >= w->dev_most_full_percent) { w->dev_most_full_idx = idx; @@ -252,49 +258,43 @@ static int bch2_rebalance_thread(void *arg) ssize_t bch2_rebalance_work_show(struct bch_fs *c, char *buf) { - char *out = buf, *end = out + PAGE_SIZE; + struct printbuf out = _PBUF(buf, PAGE_SIZE); struct bch_fs_rebalance *r = &c->rebalance; struct rebalance_work w = rebalance_work(c); char h1[21], h2[21]; - bch2_hprint(h1, w.dev_most_full_work << 9); - bch2_hprint(h2, w.dev_most_full_capacity << 9); - out += scnprintf(out, end - out, - "fullest_dev (%i):\t%s/%s\n", - w.dev_most_full_idx, h1, h2); + bch2_hprint(&PBUF(h1), w.dev_most_full_work << 9); + bch2_hprint(&PBUF(h2), w.dev_most_full_capacity << 9); + pr_buf(&out, "fullest_dev (%i):\t%s/%s\n", + w.dev_most_full_idx, h1, h2); - bch2_hprint(h1, w.total_work << 9); - 
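/*
 * Editor's sketch: this hunk converts rebalance_work_show() from manual
 * "out += scnprintf(out, end - out, ...)" pointer arithmetic to the
 * printbuf API (pr_buf/_PBUF), which carries its own position and bound.
 * Below is a minimal stand-alone version of the idea; the kernel's
 * struct printbuf differs in detail.
 */
#include <stdarg.h>
#include <stddef.h>
#include <stdio.h>

struct printbuf {
        char   *buf;
        size_t  size;
        size_t  pos;
};

static void pr_buf(struct printbuf *out, const char *fmt, ...)
{
        size_t space = out->size - out->pos;
        va_list args;
        int n;

        if (!space)
                return;

        va_start(args, fmt);
        n = vsnprintf(out->buf + out->pos, space, fmt, args);
        va_end(args);

        /* advance by what was actually written, never past the bound */
        if (n > 0)
                out->pos += (size_t) n >= space ? space - 1 : (size_t) n;
}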
bch2_hprint(h2, c->capacity << 9); - out += scnprintf(out, end - out, - "total work:\t\t%s/%s\n", - h1, h2); + bch2_hprint(&PBUF(h1), w.total_work << 9); + bch2_hprint(&PBUF(h2), c->capacity << 9); + pr_buf(&out, "total work:\t\t%s/%s\n", h1, h2); - out += scnprintf(out, end - out, - "rate:\t\t\t%u\n", - r->pd.rate.rate); + pr_buf(&out, "rate:\t\t\t%u\n", r->pd.rate.rate); switch (r->state) { case REBALANCE_WAITING: - out += scnprintf(out, end - out, "waiting\n"); + pr_buf(&out, "waiting\n"); break; case REBALANCE_THROTTLED: - bch2_hprint(h1, + bch2_hprint(&PBUF(h1), (r->throttled_until_iotime - atomic_long_read(&c->io_clock[WRITE].now)) << 9); - out += scnprintf(out, end - out, - "throttled for %lu sec or %s io\n", - (r->throttled_until_cputime - jiffies) / HZ, - h1); + pr_buf(&out, "throttled for %lu sec or %s io\n", + (r->throttled_until_cputime - jiffies) / HZ, + h1); break; case REBALANCE_RUNNING: - out += scnprintf(out, end - out, "running\n"); - out += scnprintf(out, end - out, "pos %llu:%llu\n", - r->move_stats.iter.pos.inode, - r->move_stats.iter.pos.offset); + pr_buf(&out, "running\n"); + pr_buf(&out, "pos %llu:%llu\n", + r->move_stats.pos.inode, + r->move_stats.pos.offset); break; } - return out - buf; + return out.pos - buf; } void bch2_rebalance_stop(struct bch_fs *c) diff --git a/fs/bcachefs/rebalance.h b/fs/bcachefs/rebalance.h index 2e6aa6772471..99e2a1fb6084 100644 --- a/fs/bcachefs/rebalance.h +++ b/fs/bcachefs/rebalance.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _BCACHEFS_REBALANCE_H #define _BCACHEFS_REBALANCE_H diff --git a/fs/bcachefs/rebalance_types.h b/fs/bcachefs/rebalance_types.h index aaf5b9ca133c..192c6be20ced 100644 --- a/fs/bcachefs/rebalance_types.h +++ b/fs/bcachefs/rebalance_types.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _BCACHEFS_REBALANCE_TYPES_H #define _BCACHEFS_REBALANCE_TYPES_H diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c new file mode 100644 index 000000000000..e0df2c0a4fdf --- /dev/null +++ b/fs/bcachefs/recovery.c @@ -0,0 +1,1012 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" +#include "alloc_background.h" +#include "btree_gc.h" +#include "btree_update.h" +#include "btree_update_interior.h" +#include "btree_io.h" +#include "buckets.h" +#include "dirent.h" +#include "ec.h" +#include "error.h" +#include "fsck.h" +#include "journal_io.h" +#include "journal_reclaim.h" +#include "journal_seq_blacklist.h" +#include "quota.h" +#include "recovery.h" +#include "replicas.h" +#include "super-io.h" + +#include <linux/sort.h> +#include <linux/stat.h> + +#define QSTR(n) { { { .len = strlen(n) } }, .name = n } + +/* sort and dedup all keys in the journal: */ + +static void journal_entries_free(struct list_head *list) +{ + + while (!list_empty(list)) { + struct journal_replay *i = + list_first_entry(list, struct journal_replay, list); + list_del(&i->list); + kvpfree(i, offsetof(struct journal_replay, j) + + vstruct_bytes(&i->j)); + } +} + +static int journal_sort_key_cmp(const void *_l, const void *_r) +{ + const struct journal_key *l = _l; + const struct journal_key *r = _r; + + return cmp_int(l->btree_id, r->btree_id) ?: + bkey_cmp(l->pos, r->pos) ?: + cmp_int(l->journal_seq, r->journal_seq) ?: + cmp_int(l->journal_offset, r->journal_offset); +} + +static int journal_sort_seq_cmp(const void *_l, const void *_r) +{ + const struct journal_key *l = _l; + const struct journal_key *r = _r; + + return cmp_int(l->journal_seq, r->journal_seq) ?: + cmp_int(l->btree_id, r->btree_id) ?: + 
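/*
 * Editor's note: journal_sort_key_cmp() here chains cmp_int() results
 * with the GNU "a ?: b" operator -- the first nonzero (deciding) field
 * comparison wins, giving a lexicographic tuple compare.  cmp_int() is
 * essentially the branchless three-way compare sketched below; the
 * struct is an invented stand-in for struct journal_key.
 */
#include <stdint.h>
#include <stdlib.h>

#define cmp_int(l, r)   (((l) > (r)) - ((l) < (r)))

struct sample_key {
        unsigned btree_id;
        uint64_t pos;
        uint64_t seq;
};

static int sample_key_cmp(const void *_l, const void *_r)
{
        const struct sample_key *l = _l, *r = _r;

        return cmp_int(l->btree_id, r->btree_id) ?:
                cmp_int(l->pos, r->pos) ?:
                cmp_int(l->seq, r->seq);
}

/* usable directly as qsort(keys, nr, sizeof(*keys), sample_key_cmp) */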
bkey_cmp(l->pos, r->pos); +} + +static void journal_keys_sift(struct journal_keys *keys, struct journal_key *i) +{ + while (i + 1 < keys->d + keys->nr && + journal_sort_key_cmp(i, i + 1) > 0) { + swap(i[0], i[1]); + i++; + } +} + +static void journal_keys_free(struct journal_keys *keys) +{ + struct journal_key *i; + + for_each_journal_key(*keys, i) + if (i->allocated) + kfree(i->k); + kvfree(keys->d); + keys->d = NULL; + keys->nr = 0; +} + +static struct journal_keys journal_keys_sort(struct list_head *journal_entries) +{ + struct journal_replay *p; + struct jset_entry *entry; + struct bkey_i *k, *_n; + struct journal_keys keys = { NULL }, keys_deduped = { NULL }; + struct journal_key *i; + size_t nr_keys = 0; + + list_for_each_entry(p, journal_entries, list) + for_each_jset_key(k, _n, entry, &p->j) + nr_keys++; + + keys.journal_seq_base = keys_deduped.journal_seq_base = + le64_to_cpu(list_first_entry(journal_entries, + struct journal_replay, + list)->j.seq); + + keys.d = kvmalloc(sizeof(keys.d[0]) * nr_keys, GFP_KERNEL); + if (!keys.d) + goto err; + + keys_deduped.d = kvmalloc(sizeof(keys.d[0]) * nr_keys * 2, GFP_KERNEL); + if (!keys_deduped.d) + goto err; + + list_for_each_entry(p, journal_entries, list) + for_each_jset_key(k, _n, entry, &p->j) + keys.d[keys.nr++] = (struct journal_key) { + .btree_id = entry->btree_id, + .pos = bkey_start_pos(&k->k), + .k = k, + .journal_seq = le64_to_cpu(p->j.seq) - + keys.journal_seq_base, + .journal_offset = k->_data - p->j._data, + }; + + sort(keys.d, nr_keys, sizeof(keys.d[0]), journal_sort_key_cmp, NULL); + + i = keys.d; + while (i < keys.d + keys.nr) { + if (i + 1 < keys.d + keys.nr && + i[0].btree_id == i[1].btree_id && + !bkey_cmp(i[0].pos, i[1].pos)) { + if (bkey_cmp(i[0].k->k.p, i[1].k->k.p) <= 0) { + i++; + } else { + bch2_cut_front(i[1].k->k.p, i[0].k); + i[0].pos = i[1].k->k.p; + journal_keys_sift(&keys, i); + } + continue; + } + + if (i + 1 < keys.d + keys.nr && + i[0].btree_id == i[1].btree_id && + bkey_cmp(i[0].k->k.p, bkey_start_pos(&i[1].k->k)) > 0) { + if ((cmp_int(i[0].journal_seq, i[1].journal_seq) ?: + cmp_int(i[0].journal_offset, i[1].journal_offset)) < 0) { + if (bkey_cmp(i[0].k->k.p, i[1].k->k.p) <= 0) { + bch2_cut_back(bkey_start_pos(&i[1].k->k), &i[0].k->k); + } else { + struct bkey_i *split = + kmalloc(bkey_bytes(i[0].k), GFP_KERNEL); + + if (!split) + goto err; + + bkey_copy(split, i[0].k); + bch2_cut_back(bkey_start_pos(&i[1].k->k), &split->k); + keys_deduped.d[keys_deduped.nr++] = (struct journal_key) { + .btree_id = i[0].btree_id, + .allocated = true, + .pos = bkey_start_pos(&split->k), + .k = split, + .journal_seq = i[0].journal_seq, + .journal_offset = i[0].journal_offset, + }; + + bch2_cut_front(i[1].k->k.p, i[0].k); + i[0].pos = i[1].k->k.p; + journal_keys_sift(&keys, i); + continue; + } + } else { + if (bkey_cmp(i[0].k->k.p, i[1].k->k.p) >= 0) { + i[1] = i[0]; + i++; + continue; + } else { + bch2_cut_front(i[0].k->k.p, i[1].k); + i[1].pos = i[0].k->k.p; + journal_keys_sift(&keys, i + 1); + continue; + } + } + } + + keys_deduped.d[keys_deduped.nr++] = *i++; + } + + kvfree(keys.d); + return keys_deduped; +err: + journal_keys_free(&keys_deduped); + kvfree(keys.d); + return (struct journal_keys) { NULL }; +} + +/* journal replay: */ + +static void replay_now_at(struct journal *j, u64 seq) +{ + BUG_ON(seq < j->replay_journal_seq); + BUG_ON(seq > j->replay_journal_seq_end); + + while (j->replay_journal_seq < seq) + bch2_journal_pin_put(j, j->replay_journal_seq++); +} + +static int bch2_extent_replay_key(struct bch_fs *c, 
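/*
 * Editor's sketch: journal_keys_sift() above restores sort order after a
 * key has been trimmed in place, by bubbling the modified element forward
 * while it still compares greater than its successor -- O(displacement)
 * instead of a full re-sort.  Generic integer version:
 */
#include <stddef.h>

static void sift_forward(int *a, size_t nr, size_t i)
{
        while (i + 1 < nr && a[i] > a[i + 1]) {
                int tmp = a[i];

                a[i] = a[i + 1];
                a[i + 1] = tmp;
                i++;
        }
}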
struct bkey_i *k) +{ + struct btree_trans trans; + struct btree_iter *iter, *split_iter; + /* + * We might cause compressed extents to be split, so we need to pass in + * a disk_reservation: + */ + struct disk_reservation disk_res = + bch2_disk_reservation_init(c, 0); + struct bkey_i *split; + bool split_compressed = false; + int ret; + + bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); +retry: + bch2_trans_begin(&trans); + + iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, + bkey_start_pos(&k->k), + BTREE_ITER_INTENT); + + do { + ret = bch2_btree_iter_traverse(iter); + if (ret) + goto err; + + split_iter = bch2_trans_copy_iter(&trans, iter); + ret = PTR_ERR_OR_ZERO(split_iter); + if (ret) + goto err; + + split = bch2_trans_kmalloc(&trans, bkey_bytes(&k->k)); + ret = PTR_ERR_OR_ZERO(split); + if (ret) + goto err; + + if (!split_compressed && + bch2_extent_is_compressed(bkey_i_to_s_c(k)) && + !bch2_extent_is_atomic(k, split_iter)) { + ret = bch2_disk_reservation_add(c, &disk_res, + k->k.size * + bch2_bkey_nr_dirty_ptrs(bkey_i_to_s_c(k)), + BCH_DISK_RESERVATION_NOFAIL); + BUG_ON(ret); + + split_compressed = true; + } + + bkey_copy(split, k); + bch2_cut_front(split_iter->pos, split); + bch2_extent_trim_atomic(split, split_iter); + + bch2_trans_update(&trans, BTREE_INSERT_ENTRY(split_iter, split)); + bch2_btree_iter_set_pos(iter, split->k.p); + } while (bkey_cmp(iter->pos, k->k.p) < 0); + + if (split_compressed) { + ret = bch2_trans_mark_key(&trans, bkey_i_to_s_c(k), + -((s64) k->k.size), + BCH_BUCKET_MARK_OVERWRITE) ?: + bch2_trans_commit(&trans, &disk_res, NULL, + BTREE_INSERT_ATOMIC| + BTREE_INSERT_NOFAIL| + BTREE_INSERT_LAZY_RW| + BTREE_INSERT_NOMARK_OVERWRITES| + BTREE_INSERT_NO_CLEAR_REPLICAS); + } else { + ret = bch2_trans_commit(&trans, &disk_res, NULL, + BTREE_INSERT_ATOMIC| + BTREE_INSERT_NOFAIL| + BTREE_INSERT_LAZY_RW| + BTREE_INSERT_JOURNAL_REPLAY| + BTREE_INSERT_NOMARK); + } + + if (ret) + goto err; +err: + if (ret == -EINTR) + goto retry; + + bch2_disk_reservation_put(c, &disk_res); + + return bch2_trans_exit(&trans) ?: ret; +} + +static int bch2_journal_replay(struct bch_fs *c, + struct journal_keys keys) +{ + struct journal *j = &c->journal; + struct journal_key *i; + int ret; + + sort(keys.d, keys.nr, sizeof(keys.d[0]), journal_sort_seq_cmp, NULL); + + for_each_journal_key(keys, i) { + replay_now_at(j, keys.journal_seq_base + i->journal_seq); + + switch (i->btree_id) { + case BTREE_ID_ALLOC: + ret = bch2_alloc_replay_key(c, i->k); + break; + case BTREE_ID_EXTENTS: + ret = bch2_extent_replay_key(c, i->k); + break; + default: + ret = bch2_btree_insert(c, i->btree_id, i->k, + NULL, NULL, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_LAZY_RW| + BTREE_INSERT_JOURNAL_REPLAY| + BTREE_INSERT_NOMARK); + break; + } + + if (ret) { + bch_err(c, "journal replay: error %d while replaying key", + ret); + return ret; + } + + cond_resched(); + } + + replay_now_at(j, j->replay_journal_seq_end); + j->replay_journal_seq = 0; + + bch2_journal_set_replay_done(j); + bch2_journal_flush_all_pins(j); + return bch2_journal_error(j); +} + +static bool journal_empty(struct list_head *journal) +{ + return list_empty(journal) || + journal_entry_empty(&list_last_entry(journal, + struct journal_replay, list)->j); +} + +static int +verify_journal_entries_not_blacklisted_or_missing(struct bch_fs *c, + struct list_head *journal) +{ + struct journal_replay *i = + list_last_entry(journal, struct journal_replay, list); + u64 start_seq = le64_to_cpu(i->j.last_seq); + u64 end_seq = le64_to_cpu(i->j.seq); + u64 seq = 
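/*
 * Editor's sketch: bch2_extent_replay_key() above follows the standard
 * btree transaction shape -- begin, build updates, commit, and on -EINTR
 * restart the whole attempt from the top.  Skeleton of that control flow
 * with stub functions (placeholders, not the bch2_trans_* API):
 */
#include <errno.h>

struct txn { int dummy; };

static int txn_begin(struct txn *t)         { (void) t; return 0; }
static int txn_build_updates(struct txn *t) { (void) t; return 0; }
static int txn_commit(struct txn *t)        { (void) t; return 0; }

static int run_txn(struct txn *t)
{
        int ret;
retry:
        txn_begin(t);

        /* everything here must be safe to re-execute on restart */
        ret = txn_build_updates(t) ?: txn_commit(t);
        if (ret == -EINTR)
                goto retry;

        return ret;
}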
start_seq; + int ret = 0; + + list_for_each_entry(i, journal, list) { + fsck_err_on(seq != le64_to_cpu(i->j.seq), c, + "journal entries %llu-%llu missing! (replaying %llu-%llu)", + seq, le64_to_cpu(i->j.seq) - 1, + start_seq, end_seq); + + seq = le64_to_cpu(i->j.seq); + + fsck_err_on(bch2_journal_seq_is_blacklisted(c, seq, false), c, + "found blacklisted journal entry %llu", seq); + + do { + seq++; + } while (bch2_journal_seq_is_blacklisted(c, seq, false)); + } +fsck_err: + return ret; +} + +/* journal replay early: */ + +static int journal_replay_entry_early(struct bch_fs *c, + struct jset_entry *entry) +{ + int ret = 0; + + switch (entry->type) { + case BCH_JSET_ENTRY_btree_root: { + struct btree_root *r; + + if (entry->btree_id >= BTREE_ID_NR) { + bch_err(c, "filesystem has unknown btree type %u", + entry->btree_id); + return -EINVAL; + } + + r = &c->btree_roots[entry->btree_id]; + + if (entry->u64s) { + r->level = entry->level; + bkey_copy(&r->key, &entry->start[0]); + r->error = 0; + } else { + r->error = -EIO; + } + r->alive = true; + break; + } + case BCH_JSET_ENTRY_usage: { + struct jset_entry_usage *u = + container_of(entry, struct jset_entry_usage, entry); + + switch (entry->btree_id) { + case FS_USAGE_RESERVED: + if (entry->level < BCH_REPLICAS_MAX) + c->usage_base->persistent_reserved[entry->level] = + le64_to_cpu(u->v); + break; + case FS_USAGE_INODES: + c->usage_base->nr_inodes = le64_to_cpu(u->v); + break; + case FS_USAGE_KEY_VERSION: + atomic64_set(&c->key_version, + le64_to_cpu(u->v)); + break; + } + + break; + } + case BCH_JSET_ENTRY_data_usage: { + struct jset_entry_data_usage *u = + container_of(entry, struct jset_entry_data_usage, entry); + ret = bch2_replicas_set_usage(c, &u->r, + le64_to_cpu(u->v)); + break; + } + case BCH_JSET_ENTRY_blacklist: { + struct jset_entry_blacklist *bl_entry = + container_of(entry, struct jset_entry_blacklist, entry); + + ret = bch2_journal_seq_blacklist_add(c, + le64_to_cpu(bl_entry->seq), + le64_to_cpu(bl_entry->seq) + 1); + break; + } + case BCH_JSET_ENTRY_blacklist_v2: { + struct jset_entry_blacklist_v2 *bl_entry = + container_of(entry, struct jset_entry_blacklist_v2, entry); + + ret = bch2_journal_seq_blacklist_add(c, + le64_to_cpu(bl_entry->start), + le64_to_cpu(bl_entry->end) + 1); + break; + } + } + + return ret; +} + +static int journal_replay_early(struct bch_fs *c, + struct bch_sb_field_clean *clean, + struct list_head *journal) +{ + struct jset_entry *entry; + int ret; + + if (clean) { + c->bucket_clock[READ].hand = le16_to_cpu(clean->read_clock); + c->bucket_clock[WRITE].hand = le16_to_cpu(clean->write_clock); + + for (entry = clean->start; + entry != vstruct_end(&clean->field); + entry = vstruct_next(entry)) { + ret = journal_replay_entry_early(c, entry); + if (ret) + return ret; + } + } else { + struct journal_replay *i = + list_last_entry(journal, struct journal_replay, list); + + c->bucket_clock[READ].hand = le16_to_cpu(i->j.read_clock); + c->bucket_clock[WRITE].hand = le16_to_cpu(i->j.write_clock); + + list_for_each_entry(i, journal, list) + vstruct_for_each(&i->j, entry) { + ret = journal_replay_entry_early(c, entry); + if (ret) + return ret; + } + } + + bch2_fs_usage_initialize(c); + + return 0; +} + +/* sb clean section: */ + +static struct bkey_i *btree_root_find(struct bch_fs *c, + struct bch_sb_field_clean *clean, + struct jset *j, + enum btree_id id, unsigned *level) +{ + struct bkey_i *k; + struct jset_entry *entry, *start, *end; + + if (clean) { + start = clean->start; + end = vstruct_end(&clean->field); + } else 
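/*
 * Editor's sketch: verify_journal_entries_not_blacklisted_or_missing()
 * above walks entries in sequence order, reports gaps in the numbering,
 * and steps over seqs covered by blacklist entries.  Stand-alone version
 * of the gap check; is_blacklisted() is a stub standing in for
 * bch2_journal_seq_is_blacklisted():
 */
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

static bool is_blacklisted(uint64_t seq)
{
        (void) seq;
        return false;   /* stub */
}

static void check_contiguous(const uint64_t *seqs, size_t nr)
{
        uint64_t expect;
        size_t i;

        if (!nr)
                return;

        for (expect = seqs[0], i = 0; i < nr; i++) {
                if (seqs[i] != expect)
                        fprintf(stderr, "entries %llu-%llu missing!\n",
                                (unsigned long long) expect,
                                (unsigned long long) (seqs[i] - 1));

                /* next expected seq: current + 1, skipping blacklisted */
                expect = seqs[i] + 1;
                while (is_blacklisted(expect))
                        expect++;
        }
}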
{ + start = j->start; + end = vstruct_last(j); + } + + for (entry = start; entry < end; entry = vstruct_next(entry)) + if (entry->type == BCH_JSET_ENTRY_btree_root && + entry->btree_id == id) + goto found; + + return NULL; +found: + if (!entry->u64s) + return ERR_PTR(-EINVAL); + + k = entry->start; + *level = entry->level; + return k; +} + +static int verify_superblock_clean(struct bch_fs *c, + struct bch_sb_field_clean **cleanp, + struct jset *j) +{ + unsigned i; + struct bch_sb_field_clean *clean = *cleanp; + int ret = 0; + + if (!c->sb.clean || !j) + return 0; + + if (mustfix_fsck_err_on(j->seq != clean->journal_seq, c, + "superblock journal seq (%llu) doesn't match journal (%llu) after clean shutdown", + le64_to_cpu(clean->journal_seq), + le64_to_cpu(j->seq))) { + kfree(clean); + *cleanp = NULL; + return 0; + } + + mustfix_fsck_err_on(j->read_clock != clean->read_clock, c, + "superblock read clock doesn't match journal after clean shutdown"); + mustfix_fsck_err_on(j->write_clock != clean->write_clock, c, + "superblock read clock doesn't match journal after clean shutdown"); + + for (i = 0; i < BTREE_ID_NR; i++) { + struct bkey_i *k1, *k2; + unsigned l1 = 0, l2 = 0; + + k1 = btree_root_find(c, clean, NULL, i, &l1); + k2 = btree_root_find(c, NULL, j, i, &l2); + + if (!k1 && !k2) + continue; + + mustfix_fsck_err_on(!k1 || !k2 || + IS_ERR(k1) || + IS_ERR(k2) || + k1->k.u64s != k2->k.u64s || + memcmp(k1, k2, bkey_bytes(k1)) || + l1 != l2, c, + "superblock btree root doesn't match journal after clean shutdown"); + } +fsck_err: + return ret; +} + +static struct bch_sb_field_clean *read_superblock_clean(struct bch_fs *c) +{ + struct bch_sb_field_clean *clean, *sb_clean; + int ret; + + mutex_lock(&c->sb_lock); + sb_clean = bch2_sb_get_clean(c->disk_sb.sb); + + if (fsck_err_on(!sb_clean, c, + "superblock marked clean but clean section not present")) { + SET_BCH_SB_CLEAN(c->disk_sb.sb, false); + c->sb.clean = false; + mutex_unlock(&c->sb_lock); + return NULL; + } + + clean = kmemdup(sb_clean, vstruct_bytes(&sb_clean->field), + GFP_KERNEL); + if (!clean) { + mutex_unlock(&c->sb_lock); + return ERR_PTR(-ENOMEM); + } + + if (le16_to_cpu(c->disk_sb.sb->version) < + bcachefs_metadata_version_bkey_renumber) + bch2_sb_clean_renumber(clean, READ); + + mutex_unlock(&c->sb_lock); + + return clean; +fsck_err: + mutex_unlock(&c->sb_lock); + return ERR_PTR(ret); +} + +static int read_btree_roots(struct bch_fs *c) +{ + unsigned i; + int ret = 0; + + for (i = 0; i < BTREE_ID_NR; i++) { + struct btree_root *r = &c->btree_roots[i]; + + if (!r->alive) + continue; + + if (i == BTREE_ID_ALLOC && + test_reconstruct_alloc(c)) { + c->sb.compat &= ~(1ULL << BCH_COMPAT_FEAT_ALLOC_INFO); + continue; + } + + + if (r->error) { + __fsck_err(c, i == BTREE_ID_ALLOC + ? FSCK_CAN_IGNORE : 0, + "invalid btree root %s", + bch2_btree_ids[i]); + if (i == BTREE_ID_ALLOC) + c->sb.compat &= ~(1ULL << BCH_COMPAT_FEAT_ALLOC_INFO); + } + + ret = bch2_btree_root_read(c, i, &r->key, r->level); + if (ret) { + __fsck_err(c, i == BTREE_ID_ALLOC + ? 
FSCK_CAN_IGNORE : 0, + "error reading btree root %s", + bch2_btree_ids[i]); + if (i == BTREE_ID_ALLOC) + c->sb.compat &= ~(1ULL << BCH_COMPAT_FEAT_ALLOC_INFO); + } + } + + for (i = 0; i < BTREE_ID_NR; i++) + if (!c->btree_roots[i].b) + bch2_btree_root_alloc(c, i); +fsck_err: + return ret; +} + +int bch2_fs_recovery(struct bch_fs *c) +{ + const char *err = "cannot allocate memory"; + struct bch_sb_field_clean *clean = NULL; + u64 journal_seq; + LIST_HEAD(journal_entries); + struct journal_keys journal_keys = { NULL }; + bool wrote = false, write_sb = false; + int ret; + + if (c->sb.clean) + clean = read_superblock_clean(c); + ret = PTR_ERR_OR_ZERO(clean); + if (ret) + goto err; + + if (c->sb.clean) + bch_info(c, "recovering from clean shutdown, journal seq %llu", + le64_to_cpu(clean->journal_seq)); + + if (!c->replicas.entries) { + bch_info(c, "building replicas info"); + set_bit(BCH_FS_REBUILD_REPLICAS, &c->flags); + } + + if (!c->sb.clean || c->opts.fsck) { + struct jset *j; + + ret = bch2_journal_read(c, &journal_entries); + if (ret) + goto err; + + if (mustfix_fsck_err_on(c->sb.clean && !journal_empty(&journal_entries), c, + "filesystem marked clean but journal not empty")) { + c->sb.compat &= ~(1ULL << BCH_COMPAT_FEAT_ALLOC_INFO); + SET_BCH_SB_CLEAN(c->disk_sb.sb, false); + c->sb.clean = false; + } + + if (!c->sb.clean && list_empty(&journal_entries)) { + bch_err(c, "no journal entries found"); + ret = BCH_FSCK_REPAIR_IMPOSSIBLE; + goto err; + } + + journal_keys = journal_keys_sort(&journal_entries); + if (!journal_keys.d) { + ret = -ENOMEM; + goto err; + } + + j = &list_last_entry(&journal_entries, + struct journal_replay, list)->j; + + ret = verify_superblock_clean(c, &clean, j); + if (ret) + goto err; + + journal_seq = le64_to_cpu(j->seq) + 1; + } else { + journal_seq = le64_to_cpu(clean->journal_seq) + 1; + } + + ret = journal_replay_early(c, clean, &journal_entries); + if (ret) + goto err; + + if (!c->sb.clean) { + ret = bch2_journal_seq_blacklist_add(c, + journal_seq, + journal_seq + 4); + if (ret) { + bch_err(c, "error creating new journal seq blacklist entry"); + goto err; + } + + journal_seq += 4; + } + + ret = bch2_blacklist_table_initialize(c); + + if (!list_empty(&journal_entries)) { + ret = verify_journal_entries_not_blacklisted_or_missing(c, + &journal_entries); + if (ret) + goto err; + } + + ret = bch2_fs_journal_start(&c->journal, journal_seq, + &journal_entries); + if (ret) + goto err; + + ret = read_btree_roots(c); + if (ret) + goto err; + + bch_verbose(c, "starting alloc read"); + err = "error reading allocation information"; + ret = bch2_alloc_read(c, &journal_keys); + if (ret) + goto err; + bch_verbose(c, "alloc read done"); + + bch_verbose(c, "starting stripes_read"); + err = "error reading stripes"; + ret = bch2_stripes_read(c, &journal_keys); + if (ret) + goto err; + bch_verbose(c, "stripes_read done"); + + set_bit(BCH_FS_ALLOC_READ_DONE, &c->flags); + + if ((c->sb.compat & (1ULL << BCH_COMPAT_FEAT_ALLOC_INFO)) && + !(c->sb.compat & (1ULL << BCH_COMPAT_FEAT_ALLOC_METADATA))) { + /* + * interior btree node updates aren't consistent with the + * journal; after an unclean shutdown we have to walk all + * pointers to metadata: + */ + bch_info(c, "starting metadata mark and sweep"); + err = "error in mark and sweep"; + ret = bch2_gc(c, NULL, true, true); + if (ret) + goto err; + bch_verbose(c, "mark and sweep done"); + } + + if (c->opts.fsck || + !(c->sb.compat & (1ULL << BCH_COMPAT_FEAT_ALLOC_INFO)) || + test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags)) { + bch_info(c, 
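/*
 * Editor's note: throughout read_btree_roots() and bch2_fs_recovery(),
 * "can the alloc info be trusted" is tracked as one bit in a u64 compat
 * mask (BCH_COMPAT_FEAT_ALLOC_INFO), cleared whenever alloc info has to
 * be reconstructed.  The bit-mask idiom, with illustrative bit names:
 */
#include <stdbool.h>
#include <stdint.h>

#define FEAT_ALLOC_INFO         0
#define FEAT_ALLOC_METADATA     1

static inline void feat_set(uint64_t *mask, unsigned bit)
{
        *mask |= 1ULL << bit;
}

static inline void feat_clear(uint64_t *mask, unsigned bit)
{
        *mask &= ~(1ULL << bit);
}

static inline bool feat_test(uint64_t mask, unsigned bit)
{
        return mask & (1ULL << bit);
}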
"starting mark and sweep"); + err = "error in mark and sweep"; + ret = bch2_gc(c, &journal_keys, true, false); + if (ret) + goto err; + bch_verbose(c, "mark and sweep done"); + } + + clear_bit(BCH_FS_REBUILD_REPLICAS, &c->flags); + set_bit(BCH_FS_INITIAL_GC_DONE, &c->flags); + + /* + * Skip past versions that might have possibly been used (as nonces), + * but hadn't had their pointers written: + */ + if (c->sb.encryption_type && !c->sb.clean) + atomic64_add(1 << 16, &c->key_version); + + if (c->opts.norecovery) + goto out; + + bch_verbose(c, "starting journal replay"); + err = "journal replay failed"; + ret = bch2_journal_replay(c, journal_keys); + if (ret) + goto err; + bch_verbose(c, "journal replay done"); + + if (!c->opts.nochanges) { + /* + * note that even when filesystem was clean there might be work + * to do here, if we ran gc (because of fsck) which recalculated + * oldest_gen: + */ + bch_verbose(c, "writing allocation info"); + err = "error writing out alloc info"; + ret = bch2_stripes_write(c, BTREE_INSERT_LAZY_RW, &wrote) ?: + bch2_alloc_write(c, BTREE_INSERT_LAZY_RW, &wrote); + if (ret) { + bch_err(c, "error writing alloc info"); + goto err; + } + bch_verbose(c, "alloc write done"); + } + + if (!c->sb.clean) { + if (!(c->sb.features & (1 << BCH_FEATURE_ATOMIC_NLINK))) { + bch_info(c, "checking inode link counts"); + err = "error in recovery"; + ret = bch2_fsck_inode_nlink(c); + if (ret) + goto err; + bch_verbose(c, "check inodes done"); + + } else { + bch_verbose(c, "checking for deleted inodes"); + err = "error in recovery"; + ret = bch2_fsck_walk_inodes_only(c); + if (ret) + goto err; + bch_verbose(c, "check inodes done"); + } + } + + if (c->opts.fsck) { + bch_info(c, "starting fsck"); + err = "error in fsck"; + ret = bch2_fsck_full(c); + if (ret) + goto err; + bch_verbose(c, "fsck done"); + } + + if (enabled_qtypes(c)) { + bch_verbose(c, "reading quotas"); + ret = bch2_fs_quota_read(c); + if (ret) + goto err; + bch_verbose(c, "quotas done"); + } + + mutex_lock(&c->sb_lock); + if (c->opts.version_upgrade) { + if (c->sb.version < bcachefs_metadata_version_new_versioning) + c->disk_sb.sb->version_min = + le16_to_cpu(bcachefs_metadata_version_min); + c->disk_sb.sb->version = le16_to_cpu(bcachefs_metadata_version_current); + write_sb = true; + } + + if (!test_bit(BCH_FS_ERROR, &c->flags)) { + c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_FEAT_ALLOC_INFO; + write_sb = true; + } + + if (c->opts.fsck && + !test_bit(BCH_FS_ERROR, &c->flags)) { + c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_ATOMIC_NLINK; + SET_BCH_SB_HAS_ERRORS(c->disk_sb.sb, 0); + write_sb = true; + } + + if (write_sb) + bch2_write_super(c); + mutex_unlock(&c->sb_lock); + + if (c->journal_seq_blacklist_table && + c->journal_seq_blacklist_table->nr > 128) + queue_work(system_long_wq, &c->journal_seq_blacklist_gc_work); +out: + ret = 0; +err: +fsck_err: + bch2_flush_fsck_errs(c); + journal_keys_free(&journal_keys); + journal_entries_free(&journal_entries); + kfree(clean); + if (ret) + bch_err(c, "Error in recovery: %s (%i)", err, ret); + else + bch_verbose(c, "ret %i", ret); + return ret; +} + +int bch2_fs_initialize(struct bch_fs *c) +{ + struct bch_inode_unpacked root_inode, lostfound_inode; + struct bkey_inode_buf packed_inode; + struct bch_hash_info root_hash_info; + struct qstr lostfound = QSTR("lost+found"); + const char *err = "cannot allocate memory"; + struct bch_dev *ca; + LIST_HEAD(journal); + unsigned i; + int ret; + + bch_notice(c, "initializing new filesystem"); + + mutex_lock(&c->sb_lock); + 
for_each_online_member(ca, c, i) + bch2_mark_dev_superblock(c, ca, 0); + mutex_unlock(&c->sb_lock); + + set_bit(BCH_FS_ALLOC_READ_DONE, &c->flags); + set_bit(BCH_FS_INITIAL_GC_DONE, &c->flags); + + for (i = 0; i < BTREE_ID_NR; i++) + bch2_btree_root_alloc(c, i); + + err = "unable to allocate journal buckets"; + for_each_online_member(ca, c, i) { + ret = bch2_dev_journal_alloc(ca); + if (ret) { + percpu_ref_put(&ca->io_ref); + goto err; + } + } + + /* + * journal_res_get() will crash if called before this has + * set up the journal.pin FIFO and journal.cur pointer: + */ + bch2_fs_journal_start(&c->journal, 1, &journal); + bch2_journal_set_replay_done(&c->journal); + + err = "error going read write"; + ret = __bch2_fs_read_write(c, true); + if (ret) + goto err; + + bch2_inode_init(c, &root_inode, 0, 0, + S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO, 0, NULL); + root_inode.bi_inum = BCACHEFS_ROOT_INO; + root_inode.bi_nlink++; /* lost+found */ + bch2_inode_pack(&packed_inode, &root_inode); + + err = "error creating root directory"; + ret = bch2_btree_insert(c, BTREE_ID_INODES, + &packed_inode.inode.k_i, + NULL, NULL, 0); + if (ret) + goto err; + + bch2_inode_init(c, &lostfound_inode, 0, 0, + S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO, 0, + &root_inode); + lostfound_inode.bi_inum = BCACHEFS_ROOT_INO + 1; + bch2_inode_pack(&packed_inode, &lostfound_inode); + + err = "error creating lost+found"; + ret = bch2_btree_insert(c, BTREE_ID_INODES, + &packed_inode.inode.k_i, + NULL, NULL, 0); + if (ret) + goto err; + + root_hash_info = bch2_hash_info_init(c, &root_inode); + + ret = bch2_dirent_create(c, BCACHEFS_ROOT_INO, &root_hash_info, DT_DIR, + &lostfound, lostfound_inode.bi_inum, NULL, + BTREE_INSERT_NOFAIL); + if (ret) + goto err; + + if (enabled_qtypes(c)) { + ret = bch2_fs_quota_read(c); + if (ret) + goto err; + } + + err = "error writing first journal entry"; + ret = bch2_journal_meta(&c->journal); + if (ret) + goto err; + + mutex_lock(&c->sb_lock); + c->disk_sb.sb->version = c->disk_sb.sb->version_min = + le16_to_cpu(bcachefs_metadata_version_current); + c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_ATOMIC_NLINK; + + SET_BCH_SB_INITIALIZED(c->disk_sb.sb, true); + SET_BCH_SB_CLEAN(c->disk_sb.sb, false); + + bch2_write_super(c); + mutex_unlock(&c->sb_lock); + + return 0; +err: + pr_err("Error initializing new filesystem: %s (%i)", err, ret); + return ret; +} diff --git a/fs/bcachefs/recovery.h b/fs/bcachefs/recovery.h new file mode 100644 index 000000000000..a69260d6165a --- /dev/null +++ b/fs/bcachefs/recovery.h @@ -0,0 +1,24 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_RECOVERY_H +#define _BCACHEFS_RECOVERY_H + +struct journal_keys { + struct journal_key { + enum btree_id btree_id:8; + unsigned allocated:1; + struct bpos pos; + struct bkey_i *k; + u32 journal_seq; + u32 journal_offset; + } *d; + size_t nr; + u64 journal_seq_base; +}; + +#define for_each_journal_key(keys, i) \ + for (i = (keys).d; i < (keys).d + (keys).nr; (i)++) + +int bch2_fs_recovery(struct bch_fs *); +int bch2_fs_initialize(struct bch_fs *); + +#endif /* _BCACHEFS_RECOVERY_H */ diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c index 6c52d1d456c5..4818453c015a 100644 --- a/fs/bcachefs/replicas.c +++ b/fs/bcachefs/replicas.c @@ -1,5 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" +#include "buckets.h" +#include "journal.h" #include "replicas.h" #include "super-io.h" @@ -8,100 +11,123 @@ static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *, /* Replicas tracking - in memory: */ -#define 
for_each_cpu_replicas_entry(_r, _i) \ - for (_i = (_r)->entries; \ - (void *) (_i) < (void *) (_r)->entries + (_r)->nr * (_r)->entry_size;\ - _i = (void *) (_i) + (_r)->entry_size) - -static inline struct bch_replicas_cpu_entry * -cpu_replicas_entry(struct bch_replicas_cpu *r, unsigned i) +static inline int u8_cmp(u8 l, u8 r) { - return (void *) r->entries + r->entry_size * i; + return cmp_int(l, r); } -static void bch2_cpu_replicas_sort(struct bch_replicas_cpu *r) +static void verify_replicas_entry_sorted(struct bch_replicas_entry *e) { - eytzinger0_sort(r->entries, r->nr, r->entry_size, memcmp, NULL); +#ifdef CONFIG_BCACHES_DEBUG + unsigned i; + + for (i = 0; i + 1 < e->nr_devs; i++) + BUG_ON(e->devs[i] >= e->devs[i + 1]); +#endif } -static inline bool replicas_test_dev(struct bch_replicas_cpu_entry *e, - unsigned dev) +static void replicas_entry_sort(struct bch_replicas_entry *e) { - return (e->devs[dev >> 3] & (1 << (dev & 7))) != 0; + bubble_sort(e->devs, e->nr_devs, u8_cmp); } -static inline void replicas_set_dev(struct bch_replicas_cpu_entry *e, - unsigned dev) +static void bch2_cpu_replicas_sort(struct bch_replicas_cpu *r) { - e->devs[dev >> 3] |= 1 << (dev & 7); + eytzinger0_sort(r->entries, r->nr, r->entry_size, memcmp, NULL); } -static inline unsigned replicas_dev_slots(struct bch_replicas_cpu *r) +void bch2_replicas_entry_to_text(struct printbuf *out, + struct bch_replicas_entry *e) { - return (r->entry_size - - offsetof(struct bch_replicas_cpu_entry, devs)) * 8; + unsigned i; + + pr_buf(out, "%s: %u/%u [", + bch2_data_types[e->data_type], + e->nr_required, + e->nr_devs); + + for (i = 0; i < e->nr_devs; i++) + pr_buf(out, i ? " %u" : "%u", e->devs[i]); + pr_buf(out, "]"); } -int bch2_cpu_replicas_to_text(struct bch_replicas_cpu *r, - char *buf, size_t size) +void bch2_cpu_replicas_to_text(struct printbuf *out, + struct bch_replicas_cpu *r) { - char *out = buf, *end = out + size; - struct bch_replicas_cpu_entry *e; + struct bch_replicas_entry *e; bool first = true; - unsigned i; for_each_cpu_replicas_entry(r, e) { - bool first_e = true; - if (!first) - out += scnprintf(out, end - out, " "); + pr_buf(out, " "); first = false; - out += scnprintf(out, end - out, "%u: [", e->data_type); - - for (i = 0; i < replicas_dev_slots(r); i++) - if (replicas_test_dev(e, i)) { - if (!first_e) - out += scnprintf(out, end - out, " "); - first_e = false; - out += scnprintf(out, end - out, "%u", i); - } - out += scnprintf(out, end - out, "]"); + bch2_replicas_entry_to_text(out, e); } +} + +static void extent_to_replicas(struct bkey_s_c k, + struct bch_replicas_entry *r) +{ + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; + + r->nr_required = 1; + + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { + if (p.ptr.cached) + continue; - return out - buf; + if (p.ec_nr) { + r->nr_devs = 0; + break; + } + + r->devs[r->nr_devs++] = p.ptr.dev; + } } -static inline unsigned bkey_to_replicas(struct bkey_s_c_extent e, - enum bch_data_type data_type, - struct bch_replicas_cpu_entry *r, - unsigned *max_dev) +static void stripe_to_replicas(struct bkey_s_c k, + struct bch_replicas_entry *r) { + struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k); const struct bch_extent_ptr *ptr; - unsigned nr = 0; - BUG_ON(!data_type || - data_type == BCH_DATA_SB || - data_type >= BCH_DATA_NR); + r->nr_required = s.v->nr_blocks - s.v->nr_redundant; - memset(r, 0, sizeof(*r)); - r->data_type = data_type; + for (ptr = s.v->ptrs; + ptr < s.v->ptrs + s.v->nr_blocks; + 
ptr++) + r->devs[r->nr_devs++] = ptr->dev; +} - *max_dev = 0; +void bch2_bkey_to_replicas(struct bch_replicas_entry *e, + struct bkey_s_c k) +{ + e->nr_devs = 0; + + switch (k.k->type) { + case KEY_TYPE_btree_ptr: + e->data_type = BCH_DATA_BTREE; + extent_to_replicas(k, e); + break; + case KEY_TYPE_extent: + e->data_type = BCH_DATA_USER; + extent_to_replicas(k, e); + break; + case KEY_TYPE_stripe: + e->data_type = BCH_DATA_USER; + stripe_to_replicas(k, e); + break; + } - extent_for_each_ptr(e, ptr) - if (!ptr->cached) { - *max_dev = max_t(unsigned, *max_dev, ptr->dev); - replicas_set_dev(r, ptr->dev); - nr++; - } - return nr; + replicas_entry_sort(e); } -static inline void devlist_to_replicas(struct bch_devs_list devs, - enum bch_data_type data_type, - struct bch_replicas_cpu_entry *r, - unsigned *max_dev) +void bch2_devlist_to_replicas(struct bch_replicas_entry *e, + enum bch_data_type data_type, + struct bch_devs_list devs) { unsigned i; @@ -109,312 +135,655 @@ static inline void devlist_to_replicas(struct bch_devs_list devs, data_type == BCH_DATA_SB || data_type >= BCH_DATA_NR); - memset(r, 0, sizeof(*r)); - r->data_type = data_type; + e->data_type = data_type; + e->nr_devs = 0; + e->nr_required = 1; - *max_dev = 0; + for (i = 0; i < devs.nr; i++) + e->devs[e->nr_devs++] = devs.devs[i]; - for (i = 0; i < devs.nr; i++) { - *max_dev = max_t(unsigned, *max_dev, devs.devs[i]); - replicas_set_dev(r, devs.devs[i]); - } + replicas_entry_sort(e); } -static struct bch_replicas_cpu * +static struct bch_replicas_cpu cpu_replicas_add_entry(struct bch_replicas_cpu *old, - struct bch_replicas_cpu_entry new_entry, - unsigned max_dev) + struct bch_replicas_entry *new_entry) { - struct bch_replicas_cpu *new; - unsigned i, nr, entry_size; - - entry_size = offsetof(struct bch_replicas_cpu_entry, devs) + - DIV_ROUND_UP(max_dev + 1, 8); - entry_size = max(entry_size, old->entry_size); - nr = old->nr + 1; + unsigned i; + struct bch_replicas_cpu new = { + .nr = old->nr + 1, + .entry_size = max_t(unsigned, old->entry_size, + replicas_entry_bytes(new_entry)), + }; - new = kzalloc(sizeof(struct bch_replicas_cpu) + - nr * entry_size, GFP_NOIO); - if (!new) - return NULL; + BUG_ON(!new_entry->data_type); + verify_replicas_entry_sorted(new_entry); - new->nr = nr; - new->entry_size = entry_size; + new.entries = kcalloc(new.nr, new.entry_size, GFP_NOIO); + if (!new.entries) + return new; for (i = 0; i < old->nr; i++) - memcpy(cpu_replicas_entry(new, i), + memcpy(cpu_replicas_entry(&new, i), cpu_replicas_entry(old, i), - min(new->entry_size, old->entry_size)); + old->entry_size); - memcpy(cpu_replicas_entry(new, old->nr), - &new_entry, - new->entry_size); + memcpy(cpu_replicas_entry(&new, old->nr), + new_entry, + replicas_entry_bytes(new_entry)); - bch2_cpu_replicas_sort(new); + bch2_cpu_replicas_sort(&new); return new; } -static bool replicas_has_entry(struct bch_replicas_cpu *r, - struct bch_replicas_cpu_entry search, - unsigned max_dev) +static inline int __replicas_entry_idx(struct bch_replicas_cpu *r, + struct bch_replicas_entry *search) +{ + int idx, entry_size = replicas_entry_bytes(search); + + if (unlikely(entry_size > r->entry_size)) + return -1; + + verify_replicas_entry_sorted(search); + +#define entry_cmp(_l, _r, size) memcmp(_l, _r, entry_size) + idx = eytzinger0_find(r->entries, r->nr, r->entry_size, + entry_cmp, search); +#undef entry_cmp + + return idx < r->nr ? 
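/*
 * Editor's note: replicas entries keep devs[] sorted (replicas_entry_sort
 * above uses bubble_sort) precisely so that two entries describing the
 * same replica set are bytewise identical -- memcmp() then serves as both
 * the search comparator (__replicas_entry_idx) and the equality test.
 * Sketch; struct entry is an invented stand-in and must be zeroed before
 * filling so padding compares equal:
 */
#include <stdbool.h>
#include <stdint.h>
#include <string.h>

struct entry {
        uint8_t data_type;
        uint8_t nr_devs;
        uint8_t devs[14];       /* unused tail must stay zeroed */
};

static void entry_sort(struct entry *e)
{
        for (unsigned i = 1; i < e->nr_devs; i++)
                for (unsigned j = i; j && e->devs[j - 1] > e->devs[j]; j--) {
                        uint8_t tmp = e->devs[j];

                        e->devs[j] = e->devs[j - 1];
                        e->devs[j - 1] = tmp;
                }
}

static bool entry_eq(const struct entry *l, const struct entry *r)
{
        return !memcmp(l, r, sizeof(*l));
}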
idx : -1; +} + +int bch2_replicas_entry_idx(struct bch_fs *c, + struct bch_replicas_entry *search) { - return max_dev < replicas_dev_slots(r) && - eytzinger0_find(r->entries, r->nr, - r->entry_size, - memcmp, &search) < r->nr; + replicas_entry_sort(search); + + return __replicas_entry_idx(&c->replicas, search); +} + +static bool __replicas_has_entry(struct bch_replicas_cpu *r, + struct bch_replicas_entry *search) +{ + return __replicas_entry_idx(r, search) >= 0; +} + +static bool bch2_replicas_marked_locked(struct bch_fs *c, + struct bch_replicas_entry *search, + bool check_gc_replicas) +{ + if (!search->nr_devs) + return true; + + verify_replicas_entry_sorted(search); + + return __replicas_has_entry(&c->replicas, search) && + (!check_gc_replicas || + likely((!c->replicas_gc.entries)) || + __replicas_has_entry(&c->replicas_gc, search)); +} + +bool bch2_replicas_marked(struct bch_fs *c, + struct bch_replicas_entry *search, + bool check_gc_replicas) +{ + bool marked; + + percpu_down_read(&c->mark_lock); + marked = bch2_replicas_marked_locked(c, search, check_gc_replicas); + percpu_up_read(&c->mark_lock); + + return marked; +} + +static void __replicas_table_update(struct bch_fs_usage *dst, + struct bch_replicas_cpu *dst_r, + struct bch_fs_usage *src, + struct bch_replicas_cpu *src_r) +{ + int src_idx, dst_idx; + + *dst = *src; + + for (src_idx = 0; src_idx < src_r->nr; src_idx++) { + if (!src->replicas[src_idx]) + continue; + + dst_idx = __replicas_entry_idx(dst_r, + cpu_replicas_entry(src_r, src_idx)); + BUG_ON(dst_idx < 0); + + dst->replicas[dst_idx] = src->replicas[src_idx]; + } +} + +static void __replicas_table_update_pcpu(struct bch_fs_usage __percpu *dst_p, + struct bch_replicas_cpu *dst_r, + struct bch_fs_usage __percpu *src_p, + struct bch_replicas_cpu *src_r) +{ + unsigned src_nr = sizeof(struct bch_fs_usage) / sizeof(u64) + src_r->nr; + struct bch_fs_usage *dst, *src = (void *) + bch2_acc_percpu_u64s((void *) src_p, src_nr); + + preempt_disable(); + dst = this_cpu_ptr(dst_p); + preempt_enable(); + + __replicas_table_update(dst, dst_r, src, src_r); +} + +/* + * Resize filesystem accounting: + */ +static int replicas_table_update(struct bch_fs *c, + struct bch_replicas_cpu *new_r) +{ + struct bch_fs_usage __percpu *new_usage[2] = { NULL, NULL }; + struct bch_fs_usage *new_scratch = NULL; + struct bch_fs_usage __percpu *new_gc = NULL; + struct bch_fs_usage *new_base = NULL; + unsigned bytes = sizeof(struct bch_fs_usage) + + sizeof(u64) * new_r->nr; + int ret = -ENOMEM; + + if (!(new_base = kzalloc(bytes, GFP_NOIO)) || + !(new_usage[0] = __alloc_percpu_gfp(bytes, sizeof(u64), + GFP_NOIO)) || + !(new_usage[1] = __alloc_percpu_gfp(bytes, sizeof(u64), + GFP_NOIO)) || + !(new_scratch = kmalloc(bytes, GFP_NOIO)) || + (c->usage_gc && + !(new_gc = __alloc_percpu_gfp(bytes, sizeof(u64), GFP_NOIO)))) + goto err; + + if (c->usage_base) + __replicas_table_update(new_base, new_r, + c->usage_base, &c->replicas); + if (c->usage[0]) + __replicas_table_update_pcpu(new_usage[0], new_r, + c->usage[0], &c->replicas); + if (c->usage[1]) + __replicas_table_update_pcpu(new_usage[1], new_r, + c->usage[1], &c->replicas); + if (c->usage_gc) + __replicas_table_update_pcpu(new_gc, new_r, + c->usage_gc, &c->replicas); + + swap(c->usage_base, new_base); + swap(c->usage[0], new_usage[0]); + swap(c->usage[1], new_usage[1]); + swap(c->usage_scratch, new_scratch); + swap(c->usage_gc, new_gc); + swap(c->replicas, *new_r); + ret = 0; +err: + free_percpu(new_gc); + kfree(new_scratch); + free_percpu(new_usage[1]); + 
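/*
 * Editor's sketch: replicas_table_update() above is an
 * allocate-everything-then-swap resize -- every replacement buffer is
 * allocated before anything is committed, so a failed allocation leaves
 * the old tables fully intact.  Reduced single-table version; remap()
 * stands in for looking up each old entry's index in the new layout:
 */
#include <stdint.h>
#include <stdlib.h>

struct table {
        uint64_t *counters;
        size_t    nr;
};

static int table_resize(struct table *t, size_t new_nr,
                        size_t (*remap)(size_t old_idx))
{
        uint64_t *n = calloc(new_nr, sizeof(*n));

        if (!n)
                return -1;      /* old table untouched on failure */

        for (size_t i = 0; i < t->nr; i++)
                n[remap(i)] = t->counters[i];

        free(t->counters);
        t->counters = n;
        t->nr = new_nr;
        return 0;
}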
free_percpu(new_usage[0]); + kfree(new_base); + return ret; +} + +static unsigned reserve_journal_replicas(struct bch_fs *c, + struct bch_replicas_cpu *r) +{ + struct bch_replicas_entry *e; + unsigned journal_res_u64s = 0; + + /* nr_inodes: */ + journal_res_u64s += + DIV_ROUND_UP(sizeof(struct jset_entry_usage), sizeof(u64)); + + /* key_version: */ + journal_res_u64s += + DIV_ROUND_UP(sizeof(struct jset_entry_usage), sizeof(u64)); + + /* persistent_reserved: */ + journal_res_u64s += + DIV_ROUND_UP(sizeof(struct jset_entry_usage), sizeof(u64)) * + BCH_REPLICAS_MAX; + + for_each_cpu_replicas_entry(r, e) + journal_res_u64s += + DIV_ROUND_UP(sizeof(struct jset_entry_data_usage) + + e->nr_devs, sizeof(u64)); + return journal_res_u64s; } noinline static int bch2_mark_replicas_slowpath(struct bch_fs *c, - struct bch_replicas_cpu_entry new_entry, - unsigned max_dev) + struct bch_replicas_entry *new_entry) { - struct bch_replicas_cpu *old_gc, *new_gc = NULL, *old_r, *new_r = NULL; + struct bch_replicas_cpu new_r, new_gc; int ret = -ENOMEM; + memset(&new_r, 0, sizeof(new_r)); + memset(&new_gc, 0, sizeof(new_gc)); + mutex_lock(&c->sb_lock); - old_gc = rcu_dereference_protected(c->replicas_gc, - lockdep_is_held(&c->sb_lock)); - if (old_gc && !replicas_has_entry(old_gc, new_entry, max_dev)) { - new_gc = cpu_replicas_add_entry(old_gc, new_entry, max_dev); - if (!new_gc) + if (c->replicas_gc.entries && + !__replicas_has_entry(&c->replicas_gc, new_entry)) { + new_gc = cpu_replicas_add_entry(&c->replicas_gc, new_entry); + if (!new_gc.entries) goto err; } - old_r = rcu_dereference_protected(c->replicas, - lockdep_is_held(&c->sb_lock)); - if (!replicas_has_entry(old_r, new_entry, max_dev)) { - new_r = cpu_replicas_add_entry(old_r, new_entry, max_dev); - if (!new_r) + if (!__replicas_has_entry(&c->replicas, new_entry)) { + new_r = cpu_replicas_add_entry(&c->replicas, new_entry); + if (!new_r.entries) goto err; - ret = bch2_cpu_replicas_to_sb_replicas(c, new_r); + ret = bch2_cpu_replicas_to_sb_replicas(c, &new_r); if (ret) goto err; + + bch2_journal_entry_res_resize(&c->journal, + &c->replicas_journal_res, + reserve_journal_replicas(c, &new_r)); } + if (!new_r.entries && + !new_gc.entries) + goto out; + /* allocations done, now commit: */ - if (new_r) + if (new_r.entries) bch2_write_super(c); /* don't update in memory replicas until changes are persistent */ - - if (new_gc) { - rcu_assign_pointer(c->replicas_gc, new_gc); - kfree_rcu(old_gc, rcu); - } - - if (new_r) { - rcu_assign_pointer(c->replicas, new_r); - kfree_rcu(old_r, rcu); - } - - mutex_unlock(&c->sb_lock); - return 0; + percpu_down_write(&c->mark_lock); + if (new_r.entries) + ret = replicas_table_update(c, &new_r); + if (new_gc.entries) + swap(new_gc, c->replicas_gc); + percpu_up_write(&c->mark_lock); +out: + ret = 0; err: mutex_unlock(&c->sb_lock); - if (new_gc) - kfree(new_gc); - if (new_r) - kfree(new_r); + + kfree(new_r.entries); + kfree(new_gc.entries); + return ret; } int bch2_mark_replicas(struct bch_fs *c, - enum bch_data_type data_type, - struct bch_devs_list devs) + struct bch_replicas_entry *r) { - struct bch_replicas_cpu_entry search; - struct bch_replicas_cpu *r, *gc_r; - unsigned max_dev; - bool marked; + return likely(bch2_replicas_marked(c, r, true)) + ? 
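/*
 * Editor's sketch: bch2_mark_replicas() right here is a fast/slow path
 * split -- the common case is a read-locked lookup, and only a missing
 * entry drops into bch2_mark_replicas_slowpath(), which takes sb_lock,
 * rechecks (another thread may have raced the entry in), and rebuilds
 * the tables.  Shape of the pattern with pthread stand-ins and stubs:
 */
#include <pthread.h>
#include <stdbool.h>

static pthread_rwlock_t lk = PTHREAD_RWLOCK_INITIALIZER;

/* stub: caller holds lk for reading */
static bool lookup_locked(int key)   { (void) key; return false; }
/* stub: retakes lk for writing, rechecks, then inserts */
static int  insert_slowpath(int key) { (void) key; return 0; }

static int mark(int key)
{
        bool hit;

        pthread_rwlock_rdlock(&lk);
        hit = lookup_locked(key);
        pthread_rwlock_unlock(&lk);

        return hit ? 0 : insert_slowpath(key);
}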
0 + : bch2_mark_replicas_slowpath(c, r); +} - if (!devs.nr) - return 0; +bool bch2_bkey_replicas_marked_locked(struct bch_fs *c, + struct bkey_s_c k, + bool check_gc_replicas) +{ + struct bch_replicas_padded search; + struct bch_devs_list cached = bch2_bkey_cached_devs(k); + unsigned i; - BUG_ON(devs.nr >= BCH_REPLICAS_MAX); + for (i = 0; i < cached.nr; i++) { + bch2_replicas_entry_cached(&search.e, cached.devs[i]); - devlist_to_replicas(devs, data_type, &search, &max_dev); + if (!bch2_replicas_marked_locked(c, &search.e, + check_gc_replicas)) + return false; + } - rcu_read_lock(); - r = rcu_dereference(c->replicas); - gc_r = rcu_dereference(c->replicas_gc); - marked = replicas_has_entry(r, search, max_dev) && - (!likely(gc_r) || replicas_has_entry(gc_r, search, max_dev)); - rcu_read_unlock(); + bch2_bkey_to_replicas(&search.e, k); - return likely(marked) ? 0 - : bch2_mark_replicas_slowpath(c, search, max_dev); + return bch2_replicas_marked_locked(c, &search.e, check_gc_replicas); } -int bch2_mark_bkey_replicas(struct bch_fs *c, - enum bch_data_type data_type, - struct bkey_s_c k) +bool bch2_bkey_replicas_marked(struct bch_fs *c, + struct bkey_s_c k, + bool check_gc_replicas) { + bool marked; + + percpu_down_read(&c->mark_lock); + marked = bch2_bkey_replicas_marked_locked(c, k, check_gc_replicas); + percpu_up_read(&c->mark_lock); + + return marked; +} + +int bch2_mark_bkey_replicas(struct bch_fs *c, struct bkey_s_c k) +{ + struct bch_replicas_padded search; struct bch_devs_list cached = bch2_bkey_cached_devs(k); unsigned i; int ret; - for (i = 0; i < cached.nr; i++) - if ((ret = bch2_mark_replicas(c, BCH_DATA_CACHED, - bch2_dev_list_single(cached.devs[i])))) + for (i = 0; i < cached.nr; i++) { + bch2_replicas_entry_cached(&search.e, cached.devs[i]); + + ret = bch2_mark_replicas(c, &search.e); + if (ret) return ret; + } + + bch2_bkey_to_replicas(&search.e, k); - return bch2_mark_replicas(c, data_type, bch2_bkey_dirty_devs(k)); + return bch2_mark_replicas(c, &search.e); } -int bch2_replicas_gc_end(struct bch_fs *c, int err) +int bch2_replicas_gc_end(struct bch_fs *c, int ret) { - struct bch_replicas_cpu *new_r, *old_r; - int ret = 0; + unsigned i; lockdep_assert_held(&c->replicas_gc_lock); mutex_lock(&c->sb_lock); + percpu_down_write(&c->mark_lock); + + /* + * this is kind of crappy; the replicas gc mechanism needs to be ripped + * out + */ + + for (i = 0; i < c->replicas.nr; i++) { + struct bch_replicas_entry *e = + cpu_replicas_entry(&c->replicas, i); + struct bch_replicas_cpu n; + + if (!__replicas_has_entry(&c->replicas_gc, e) && + (c->usage_base->replicas[i] || + percpu_u64_get(&c->usage[0]->replicas[i]) || + percpu_u64_get(&c->usage[1]->replicas[i]))) { + n = cpu_replicas_add_entry(&c->replicas_gc, e); + if (!n.entries) { + ret = -ENOSPC; + goto err; + } - new_r = rcu_dereference_protected(c->replicas_gc, - lockdep_is_held(&c->sb_lock)); - - if (err) { - rcu_assign_pointer(c->replicas_gc, NULL); - kfree_rcu(new_r, rcu); - goto err; + swap(n, c->replicas_gc); + kfree(n.entries); + } } - if (bch2_cpu_replicas_to_sb_replicas(c, new_r)) { + if (bch2_cpu_replicas_to_sb_replicas(c, &c->replicas_gc)) { ret = -ENOSPC; goto err; } - old_r = rcu_dereference_protected(c->replicas, - lockdep_is_held(&c->sb_lock)); + ret = replicas_table_update(c, &c->replicas_gc); +err: + kfree(c->replicas_gc.entries); + c->replicas_gc.entries = NULL; + + percpu_up_write(&c->mark_lock); - rcu_assign_pointer(c->replicas, new_r); - rcu_assign_pointer(c->replicas_gc, NULL); - kfree_rcu(old_r, rcu); + if (!ret) + 
bch2_write_super(c); - bch2_write_super(c); -err: mutex_unlock(&c->sb_lock); + return ret; } int bch2_replicas_gc_start(struct bch_fs *c, unsigned typemask) { - struct bch_replicas_cpu *dst, *src; - struct bch_replicas_cpu_entry *e; + struct bch_replicas_entry *e; + unsigned i = 0; lockdep_assert_held(&c->replicas_gc_lock); mutex_lock(&c->sb_lock); - BUG_ON(c->replicas_gc); + BUG_ON(c->replicas_gc.entries); - src = rcu_dereference_protected(c->replicas, - lockdep_is_held(&c->sb_lock)); + c->replicas_gc.nr = 0; + c->replicas_gc.entry_size = 0; - dst = kzalloc(sizeof(struct bch_replicas_cpu) + - src->nr * src->entry_size, GFP_NOIO); - if (!dst) { + for_each_cpu_replicas_entry(&c->replicas, e) + if (!((1 << e->data_type) & typemask)) { + c->replicas_gc.nr++; + c->replicas_gc.entry_size = + max_t(unsigned, c->replicas_gc.entry_size, + replicas_entry_bytes(e)); + } + + c->replicas_gc.entries = kcalloc(c->replicas_gc.nr, + c->replicas_gc.entry_size, + GFP_NOIO); + if (!c->replicas_gc.entries) { mutex_unlock(&c->sb_lock); return -ENOMEM; } - dst->nr = 0; - dst->entry_size = src->entry_size; - - for_each_cpu_replicas_entry(src, e) + for_each_cpu_replicas_entry(&c->replicas, e) if (!((1 << e->data_type) & typemask)) - memcpy(cpu_replicas_entry(dst, dst->nr++), - e, dst->entry_size); + memcpy(cpu_replicas_entry(&c->replicas_gc, i++), + e, c->replicas_gc.entry_size); - bch2_cpu_replicas_sort(dst); + bch2_cpu_replicas_sort(&c->replicas_gc); + mutex_unlock(&c->sb_lock); + + return 0; +} + +int bch2_replicas_gc2(struct bch_fs *c) +{ + struct bch_replicas_cpu new = { 0 }; + unsigned i, nr; + int ret = 0; + + bch2_journal_meta(&c->journal); +retry: + nr = READ_ONCE(c->replicas.nr); + new.entry_size = READ_ONCE(c->replicas.entry_size); + new.entries = kcalloc(nr, new.entry_size, GFP_KERNEL); + if (!new.entries) + return -ENOMEM; + + mutex_lock(&c->sb_lock); + percpu_down_write(&c->mark_lock); + + if (nr != c->replicas.nr || + new.entry_size != c->replicas.entry_size) { + percpu_up_write(&c->mark_lock); + mutex_unlock(&c->sb_lock); + kfree(new.entries); + goto retry; + } + + for (i = 0; i < c->replicas.nr; i++) { + struct bch_replicas_entry *e = + cpu_replicas_entry(&c->replicas, i); + + if (e->data_type == BCH_DATA_JOURNAL || + c->usage_base->replicas[i] || + percpu_u64_get(&c->usage[0]->replicas[i]) || + percpu_u64_get(&c->usage[1]->replicas[i])) + memcpy(cpu_replicas_entry(&new, new.nr++), + e, new.entry_size); + } + + bch2_cpu_replicas_sort(&new); + + if (bch2_cpu_replicas_to_sb_replicas(c, &new)) { + ret = -ENOSPC; + goto err; + } + + ret = replicas_table_update(c, &new); +err: + kfree(new.entries); + + percpu_up_write(&c->mark_lock); + + if (!ret) + bch2_write_super(c); - rcu_assign_pointer(c->replicas_gc, dst); mutex_unlock(&c->sb_lock); + return ret; +} + +int bch2_replicas_set_usage(struct bch_fs *c, + struct bch_replicas_entry *r, + u64 sectors) +{ + int ret, idx = bch2_replicas_entry_idx(c, r); + + if (idx < 0) { + struct bch_replicas_cpu n; + + n = cpu_replicas_add_entry(&c->replicas, r); + if (!n.entries) + return -ENOMEM; + + ret = replicas_table_update(c, &n); + if (ret) + return ret; + + kfree(n.entries); + + idx = bch2_replicas_entry_idx(c, r); + BUG_ON(ret < 0); + } + + c->usage_base->replicas[idx] = sectors; + return 0; } /* Replicas tracking - superblock: */ -static void bch2_sb_replicas_nr_entries(struct bch_sb_field_replicas *r, - unsigned *nr, - unsigned *bytes, - unsigned *max_dev) +static int +__bch2_sb_replicas_to_cpu_replicas(struct bch_sb_field_replicas *sb_r, + struct 
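/*
 * Editor's sketch: bch2_replicas_gc2() above sizes its scratch copy with
 * unlocked READ_ONCE() reads, allocates, then takes the locks and
 * re-checks that nr/entry_size didn't change -- retrying on a race keeps
 * the allocation out of the critical section.  Userspace analogue:
 */
#include <pthread.h>
#include <stdlib.h>
#include <string.h>

struct sized {
        size_t           nr;
        int             *data;
        pthread_mutex_t  lock;
};

static int *snapshot(struct sized *s, size_t *out_nr)
{
        size_t nr;
        int *buf;
retry:
        nr = s->nr;                     /* racy read, like READ_ONCE() */
        buf = calloc(nr, sizeof(*buf));
        if (!buf)
                return NULL;

        pthread_mutex_lock(&s->lock);
        if (nr != s->nr) {              /* lost a race with a resize */
                pthread_mutex_unlock(&s->lock);
                free(buf);
                goto retry;
        }
        memcpy(buf, s->data, nr * sizeof(*buf));
        pthread_mutex_unlock(&s->lock);

        *out_nr = nr;
        return buf;
}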
bch_replicas_cpu *cpu_r) { - struct bch_replicas_entry *i; - unsigned j; + struct bch_replicas_entry *e, *dst; + unsigned nr = 0, entry_size = 0, idx = 0; - *nr = 0; - *bytes = sizeof(*r); - *max_dev = 0; + for_each_replicas_entry(sb_r, e) { + entry_size = max_t(unsigned, entry_size, + replicas_entry_bytes(e)); + nr++; + } - if (!r) - return; + cpu_r->entries = kcalloc(nr, entry_size, GFP_NOIO); + if (!cpu_r->entries) + return -ENOMEM; + + cpu_r->nr = nr; + cpu_r->entry_size = entry_size; - for_each_replicas_entry(r, i) { - for (j = 0; j < i->nr; j++) - *max_dev = max_t(unsigned, *max_dev, i->devs[j]); - (*nr)++; + for_each_replicas_entry(sb_r, e) { + dst = cpu_replicas_entry(cpu_r, idx++); + memcpy(dst, e, replicas_entry_bytes(e)); + replicas_entry_sort(dst); } - *bytes = (void *) i - (void *) r; + return 0; } -static struct bch_replicas_cpu * -__bch2_sb_replicas_to_cpu_replicas(struct bch_sb_field_replicas *sb_r) +static int +__bch2_sb_replicas_v0_to_cpu_replicas(struct bch_sb_field_replicas_v0 *sb_r, + struct bch_replicas_cpu *cpu_r) { - struct bch_replicas_cpu *cpu_r; - unsigned i, nr, bytes, max_dev, entry_size; + struct bch_replicas_entry_v0 *e; + unsigned nr = 0, entry_size = 0, idx = 0; - bch2_sb_replicas_nr_entries(sb_r, &nr, &bytes, &max_dev); + for_each_replicas_entry(sb_r, e) { + entry_size = max_t(unsigned, entry_size, + replicas_entry_bytes(e)); + nr++; + } - entry_size = offsetof(struct bch_replicas_cpu_entry, devs) + - DIV_ROUND_UP(max_dev + 1, 8); + entry_size += sizeof(struct bch_replicas_entry) - + sizeof(struct bch_replicas_entry_v0); - cpu_r = kzalloc(sizeof(struct bch_replicas_cpu) + - nr * entry_size, GFP_NOIO); - if (!cpu_r) - return NULL; + cpu_r->entries = kcalloc(nr, entry_size, GFP_NOIO); + if (!cpu_r->entries) + return -ENOMEM; cpu_r->nr = nr; cpu_r->entry_size = entry_size; - if (nr) { - struct bch_replicas_cpu_entry *dst = - cpu_replicas_entry(cpu_r, 0); - struct bch_replicas_entry *src = sb_r->entries; - - while (dst < cpu_replicas_entry(cpu_r, nr)) { - dst->data_type = src->data_type; - for (i = 0; i < src->nr; i++) - replicas_set_dev(dst, src->devs[i]); - - src = replicas_entry_next(src); - dst = (void *) dst + entry_size; - } + for_each_replicas_entry(sb_r, e) { + struct bch_replicas_entry *dst = + cpu_replicas_entry(cpu_r, idx++); + + dst->data_type = e->data_type; + dst->nr_devs = e->nr_devs; + dst->nr_required = 1; + memcpy(dst->devs, e->devs, e->nr_devs); + replicas_entry_sort(dst); } - bch2_cpu_replicas_sort(cpu_r); - return cpu_r; + return 0; } int bch2_sb_replicas_to_cpu_replicas(struct bch_fs *c) { - struct bch_sb_field_replicas *sb_r; - struct bch_replicas_cpu *cpu_r, *old_r; + struct bch_sb_field_replicas *sb_v1; + struct bch_sb_field_replicas_v0 *sb_v0; + struct bch_replicas_cpu new_r = { 0, 0, NULL }; + int ret = 0; + + if ((sb_v1 = bch2_sb_get_replicas(c->disk_sb.sb))) + ret = __bch2_sb_replicas_to_cpu_replicas(sb_v1, &new_r); + else if ((sb_v0 = bch2_sb_get_replicas_v0(c->disk_sb.sb))) + ret = __bch2_sb_replicas_v0_to_cpu_replicas(sb_v0, &new_r); - sb_r = bch2_sb_get_replicas(c->disk_sb.sb); - cpu_r = __bch2_sb_replicas_to_cpu_replicas(sb_r); - if (!cpu_r) + if (ret) return -ENOMEM; - old_r = rcu_dereference_check(c->replicas, lockdep_is_held(&c->sb_lock)); - rcu_assign_pointer(c->replicas, cpu_r); - if (old_r) - kfree_rcu(old_r, rcu); + bch2_cpu_replicas_sort(&new_r); + + percpu_down_write(&c->mark_lock); + + ret = replicas_table_update(c, &new_r); + percpu_up_write(&c->mark_lock); + + kfree(new_r.entries); + + return 0; +} + +static int 
bch2_cpu_replicas_to_sb_replicas_v0(struct bch_fs *c, + struct bch_replicas_cpu *r) +{ + struct bch_sb_field_replicas_v0 *sb_r; + struct bch_replicas_entry_v0 *dst; + struct bch_replicas_entry *src; + size_t bytes; + + bytes = sizeof(struct bch_sb_field_replicas); + + for_each_cpu_replicas_entry(r, src) + bytes += replicas_entry_bytes(src) - 1; + + sb_r = bch2_sb_resize_replicas_v0(&c->disk_sb, + DIV_ROUND_UP(bytes, sizeof(u64))); + if (!sb_r) + return -ENOSPC; + + bch2_sb_field_delete(&c->disk_sb, BCH_SB_FIELD_replicas); + sb_r = bch2_sb_get_replicas_v0(c->disk_sb.sb); + + memset(&sb_r->entries, 0, + vstruct_end(&sb_r->field) - + (void *) &sb_r->entries); + + dst = sb_r->entries; + for_each_cpu_replicas_entry(r, src) { + dst->data_type = src->data_type; + dst->nr_devs = src->nr_devs; + memcpy(dst->devs, src->devs, src->nr_devs); + + dst = replicas_entry_next(dst); + + BUG_ON((void *) dst > vstruct_end(&sb_r->field)); + } return 0; } @@ -423,48 +792,74 @@ static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *c, struct bch_replicas_cpu *r) { struct bch_sb_field_replicas *sb_r; - struct bch_replicas_entry *sb_e; - struct bch_replicas_cpu_entry *e; - size_t i, bytes; + struct bch_replicas_entry *dst, *src; + bool need_v1 = false; + size_t bytes; bytes = sizeof(struct bch_sb_field_replicas); - for_each_cpu_replicas_entry(r, e) { - bytes += sizeof(struct bch_replicas_entry); - for (i = 0; i < r->entry_size - 1; i++) - bytes += hweight8(e->devs[i]); + for_each_cpu_replicas_entry(r, src) { + bytes += replicas_entry_bytes(src); + if (src->nr_required != 1) + need_v1 = true; } + if (!need_v1) + return bch2_cpu_replicas_to_sb_replicas_v0(c, r); + sb_r = bch2_sb_resize_replicas(&c->disk_sb, - DIV_ROUND_UP(sizeof(*sb_r) + bytes, sizeof(u64))); + DIV_ROUND_UP(bytes, sizeof(u64))); if (!sb_r) return -ENOSPC; + bch2_sb_field_delete(&c->disk_sb, BCH_SB_FIELD_replicas_v0); + sb_r = bch2_sb_get_replicas(c->disk_sb.sb); + memset(&sb_r->entries, 0, vstruct_end(&sb_r->field) - (void *) &sb_r->entries); - sb_e = sb_r->entries; - for_each_cpu_replicas_entry(r, e) { - sb_e->data_type = e->data_type; - - for (i = 0; i < replicas_dev_slots(r); i++) - if (replicas_test_dev(e, i)) - sb_e->devs[sb_e->nr++] = i; + dst = sb_r->entries; + for_each_cpu_replicas_entry(r, src) { + memcpy(dst, src, replicas_entry_bytes(src)); - sb_e = replicas_entry_next(sb_e); + dst = replicas_entry_next(dst); - BUG_ON((void *) sb_e > vstruct_end(&sb_r->field)); + BUG_ON((void *) dst > vstruct_end(&sb_r->field)); } return 0; } +static const char *check_dup_replicas_entries(struct bch_replicas_cpu *cpu_r) +{ + unsigned i; + + sort_cmp_size(cpu_r->entries, + cpu_r->nr, + cpu_r->entry_size, + memcmp, NULL); + + for (i = 0; i + 1 < cpu_r->nr; i++) { + struct bch_replicas_entry *l = + cpu_replicas_entry(cpu_r, i); + struct bch_replicas_entry *r = + cpu_replicas_entry(cpu_r, i + 1); + + BUG_ON(memcmp(l, r, cpu_r->entry_size) > 0); + + if (!memcmp(l, r, cpu_r->entry_size)) + return "duplicate replicas entry"; + } + + return NULL; +} + static const char *bch2_sb_validate_replicas(struct bch_sb *sb, struct bch_sb_field *f) { struct bch_sb_field_replicas *sb_r = field_to_type(f, replicas); struct bch_sb_field_members *mi = bch2_sb_get_members(sb); - struct bch_replicas_cpu *cpu_r = NULL; + struct bch_replicas_cpu cpu_r = { .entries = NULL }; struct bch_replicas_entry *e; const char *err; unsigned i; @@ -475,166 +870,140 @@ static const char *bch2_sb_validate_replicas(struct bch_sb *sb, struct bch_sb_fi goto err; err = "invalid replicas entry: no 
devices"; - if (!e->nr) + if (!e->nr_devs) goto err; - err = "invalid replicas entry: too many devices"; - if (e->nr >= BCH_REPLICAS_MAX) + err = "invalid replicas entry: bad nr_required"; + if (!e->nr_required || + (e->nr_required > 1 && + e->nr_required >= e->nr_devs)) goto err; err = "invalid replicas entry: invalid device"; - for (i = 0; i < e->nr; i++) + for (i = 0; i < e->nr_devs; i++) if (!bch2_dev_exists(sb, mi, e->devs[i])) goto err; } err = "cannot allocate memory"; - cpu_r = __bch2_sb_replicas_to_cpu_replicas(sb_r); - if (!cpu_r) + if (__bch2_sb_replicas_to_cpu_replicas(sb_r, &cpu_r)) goto err; - sort_cmp_size(cpu_r->entries, - cpu_r->nr, - cpu_r->entry_size, - memcmp, NULL); - - for (i = 0; i + 1 < cpu_r->nr; i++) { - struct bch_replicas_cpu_entry *l = - cpu_replicas_entry(cpu_r, i); - struct bch_replicas_cpu_entry *r = - cpu_replicas_entry(cpu_r, i + 1); - - BUG_ON(memcmp(l, r, cpu_r->entry_size) > 0); - - err = "duplicate replicas entry"; - if (!memcmp(l, r, cpu_r->entry_size)) - goto err; - } - - err = NULL; + err = check_dup_replicas_entries(&cpu_r); err: - kfree(cpu_r); + kfree(cpu_r.entries); return err; } -const struct bch_sb_field_ops bch_sb_field_ops_replicas = { - .validate = bch2_sb_validate_replicas, -}; - -int bch2_sb_replicas_to_text(struct bch_sb_field_replicas *r, char *buf, size_t size) +static void bch2_sb_replicas_to_text(struct printbuf *out, + struct bch_sb *sb, + struct bch_sb_field *f) { - char *out = buf, *end = out + size; + struct bch_sb_field_replicas *r = field_to_type(f, replicas); struct bch_replicas_entry *e; bool first = true; - unsigned i; - - if (!r) { - out += scnprintf(out, end - out, "(no replicas section found)"); - return out - buf; - } for_each_replicas_entry(r, e) { if (!first) - out += scnprintf(out, end - out, " "); + pr_buf(out, " "); first = false; - out += scnprintf(out, end - out, "%u: [", e->data_type); - - for (i = 0; i < e->nr; i++) - out += scnprintf(out, end - out, - i ? 
" %u" : "%u", e->devs[i]); - out += scnprintf(out, end - out, "]"); + bch2_replicas_entry_to_text(out, e); } - - return out - buf; } -/* Query replicas: */ +const struct bch_sb_field_ops bch_sb_field_ops_replicas = { + .validate = bch2_sb_validate_replicas, + .to_text = bch2_sb_replicas_to_text, +}; -bool bch2_replicas_marked(struct bch_fs *c, - enum bch_data_type data_type, - struct bch_devs_list devs) +static const char *bch2_sb_validate_replicas_v0(struct bch_sb *sb, struct bch_sb_field *f) { - struct bch_replicas_cpu_entry search; - unsigned max_dev; - bool ret; + struct bch_sb_field_replicas_v0 *sb_r = field_to_type(f, replicas_v0); + struct bch_sb_field_members *mi = bch2_sb_get_members(sb); + struct bch_replicas_cpu cpu_r = { .entries = NULL }; + struct bch_replicas_entry_v0 *e; + const char *err; + unsigned i; - if (!devs.nr) - return true; + for_each_replicas_entry_v0(sb_r, e) { + err = "invalid replicas entry: invalid data type"; + if (e->data_type >= BCH_DATA_NR) + goto err; - devlist_to_replicas(devs, data_type, &search, &max_dev); + err = "invalid replicas entry: no devices"; + if (!e->nr_devs) + goto err; - rcu_read_lock(); - ret = replicas_has_entry(rcu_dereference(c->replicas), - search, max_dev); - rcu_read_unlock(); + err = "invalid replicas entry: invalid device"; + for (i = 0; i < e->nr_devs; i++) + if (!bch2_dev_exists(sb, mi, e->devs[i])) + goto err; + } - return ret; -} + err = "cannot allocate memory"; + if (__bch2_sb_replicas_v0_to_cpu_replicas(sb_r, &cpu_r)) + goto err; -bool bch2_bkey_replicas_marked(struct bch_fs *c, - enum bch_data_type data_type, - struct bkey_s_c k) -{ - struct bch_devs_list cached = bch2_bkey_cached_devs(k); - unsigned i; + err = check_dup_replicas_entries(&cpu_r); +err: + kfree(cpu_r.entries); + return err; +} - for (i = 0; i < cached.nr; i++) - if (!bch2_replicas_marked(c, BCH_DATA_CACHED, - bch2_dev_list_single(cached.devs[i]))) - return false; +const struct bch_sb_field_ops bch_sb_field_ops_replicas_v0 = { + .validate = bch2_sb_validate_replicas_v0, +}; - return bch2_replicas_marked(c, data_type, bch2_bkey_dirty_devs(k)); -} +/* Query replicas: */ struct replicas_status __bch2_replicas_status(struct bch_fs *c, struct bch_devs_mask online_devs) { struct bch_sb_field_members *mi; - struct bch_replicas_cpu_entry *e; - struct bch_replicas_cpu *r; - unsigned i, dev, dev_slots, nr_online, nr_offline; + struct bch_replicas_entry *e; + unsigned i, nr_online, nr_offline; struct replicas_status ret; memset(&ret, 0, sizeof(ret)); for (i = 0; i < ARRAY_SIZE(ret.replicas); i++) - ret.replicas[i].nr_online = UINT_MAX; + ret.replicas[i].redundancy = INT_MAX; mi = bch2_sb_get_members(c->disk_sb.sb); - rcu_read_lock(); - r = rcu_dereference(c->replicas); - dev_slots = replicas_dev_slots(r); + percpu_down_read(&c->mark_lock); - for_each_cpu_replicas_entry(r, e) { + for_each_cpu_replicas_entry(&c->replicas, e) { if (e->data_type >= ARRAY_SIZE(ret.replicas)) panic("e %p data_type %u\n", e, e->data_type); nr_online = nr_offline = 0; - for (dev = 0; dev < dev_slots; dev++) { - if (!replicas_test_dev(e, dev)) - continue; + for (i = 0; i < e->nr_devs; i++) { + BUG_ON(!bch2_dev_exists(c->disk_sb.sb, mi, + e->devs[i])); - BUG_ON(!bch2_dev_exists(c->disk_sb.sb, mi, dev)); - - if (test_bit(dev, online_devs.d)) + if (test_bit(e->devs[i], online_devs.d)) nr_online++; else nr_offline++; } - ret.replicas[e->data_type].nr_online = - min(ret.replicas[e->data_type].nr_online, - nr_online); + ret.replicas[e->data_type].redundancy = + 
min(ret.replicas[e->data_type].redundancy, + (int) nr_online - (int) e->nr_required); ret.replicas[e->data_type].nr_offline = max(ret.replicas[e->data_type].nr_offline, nr_offline); } - rcu_read_unlock(); + percpu_up_read(&c->mark_lock); + + for (i = 0; i < ARRAY_SIZE(ret.replicas); i++) + if (ret.replicas[i].redundancy == INT_MAX) + ret.replicas[i].redundancy = 0; return ret; } @@ -650,7 +1019,7 @@ static bool have_enough_devs(struct replicas_status s, bool force_if_lost) { return (!s.replicas[type].nr_offline || force_if_degraded) && - (s.replicas[type].nr_online || force_if_lost); + (s.replicas[type].redundancy >= 0 || force_if_lost); } bool bch2_have_enough_devs(struct replicas_status s, unsigned flags) @@ -666,33 +1035,37 @@ bool bch2_have_enough_devs(struct replicas_status s, unsigned flags) flags & BCH_FORCE_IF_DATA_LOST)); } -unsigned bch2_replicas_online(struct bch_fs *c, bool meta) +int bch2_replicas_online(struct bch_fs *c, bool meta) { struct replicas_status s = bch2_replicas_status(c); - return meta - ? min(s.replicas[BCH_DATA_JOURNAL].nr_online, - s.replicas[BCH_DATA_BTREE].nr_online) - : s.replicas[BCH_DATA_USER].nr_online; + return (meta + ? min(s.replicas[BCH_DATA_JOURNAL].redundancy, + s.replicas[BCH_DATA_BTREE].redundancy) + : s.replicas[BCH_DATA_USER].redundancy) + 1; } unsigned bch2_dev_has_data(struct bch_fs *c, struct bch_dev *ca) { - struct bch_replicas_cpu_entry *e; - struct bch_replicas_cpu *r; - unsigned ret = 0; + struct bch_replicas_entry *e; + unsigned i, ret = 0; - rcu_read_lock(); - r = rcu_dereference(c->replicas); + percpu_down_read(&c->mark_lock); - if (ca->dev_idx >= replicas_dev_slots(r)) - goto out; + for_each_cpu_replicas_entry(&c->replicas, e) + for (i = 0; i < e->nr_devs; i++) + if (e->devs[i] == ca->dev_idx) + ret |= 1 << e->data_type; - for_each_cpu_replicas_entry(r, e) - if (replicas_test_dev(e, ca->dev_idx)) - ret |= 1 << e->data_type; -out: - rcu_read_unlock(); + percpu_up_read(&c->mark_lock); return ret; } + +int bch2_fs_replicas_init(struct bch_fs *c) +{ + c->journal.entry_u64s_reserved += + reserve_journal_replicas(c, &c->replicas); + + return replicas_table_update(c, &c->replicas); +} diff --git a/fs/bcachefs/replicas.h b/fs/bcachefs/replicas.h index 49f114b01c1e..0d6e19126021 100644 --- a/fs/bcachefs/replicas.h +++ b/fs/bcachefs/replicas.h @@ -1,21 +1,50 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _BCACHEFS_REPLICAS_H #define _BCACHEFS_REPLICAS_H -bool bch2_replicas_marked(struct bch_fs *, enum bch_data_type, - struct bch_devs_list); -bool bch2_bkey_replicas_marked(struct bch_fs *, enum bch_data_type, - struct bkey_s_c); -int bch2_mark_replicas(struct bch_fs *, enum bch_data_type, - struct bch_devs_list); -int bch2_mark_bkey_replicas(struct bch_fs *, enum bch_data_type, - struct bkey_s_c); +#include "eytzinger.h" +#include "replicas_types.h" -int bch2_cpu_replicas_to_text(struct bch_replicas_cpu *, char *, size_t); -int bch2_sb_replicas_to_text(struct bch_sb_field_replicas *, char *, size_t); +void bch2_replicas_entry_to_text(struct printbuf *, + struct bch_replicas_entry *); +void bch2_cpu_replicas_to_text(struct printbuf *, struct bch_replicas_cpu *); + +static inline struct bch_replicas_entry * +cpu_replicas_entry(struct bch_replicas_cpu *r, unsigned i) +{ + return (void *) r->entries + r->entry_size * i; +} + +int bch2_replicas_entry_idx(struct bch_fs *, + struct bch_replicas_entry *); + +void bch2_devlist_to_replicas(struct bch_replicas_entry *, + enum bch_data_type, + struct bch_devs_list); +bool bch2_replicas_marked(struct 
bch_fs *, + struct bch_replicas_entry *, bool); +int bch2_mark_replicas(struct bch_fs *, + struct bch_replicas_entry *); + +bool bch2_bkey_replicas_marked_locked(struct bch_fs *, + struct bkey_s_c, bool); +void bch2_bkey_to_replicas(struct bch_replicas_entry *, struct bkey_s_c); +bool bch2_bkey_replicas_marked(struct bch_fs *, + struct bkey_s_c, bool); +int bch2_mark_bkey_replicas(struct bch_fs *, struct bkey_s_c); + +static inline void bch2_replicas_entry_cached(struct bch_replicas_entry *e, + unsigned dev) +{ + e->data_type = BCH_DATA_CACHED; + e->nr_devs = 1; + e->nr_required = 1; + e->devs[0] = dev; +} struct replicas_status { struct { - unsigned nr_online; + int redundancy; unsigned nr_offline; } replicas[BCH_DATA_NR]; }; @@ -25,27 +54,45 @@ struct replicas_status __bch2_replicas_status(struct bch_fs *, struct replicas_status bch2_replicas_status(struct bch_fs *); bool bch2_have_enough_devs(struct replicas_status, unsigned); -unsigned bch2_replicas_online(struct bch_fs *, bool); +int bch2_replicas_online(struct bch_fs *, bool); unsigned bch2_dev_has_data(struct bch_fs *, struct bch_dev *); int bch2_replicas_gc_end(struct bch_fs *, int); int bch2_replicas_gc_start(struct bch_fs *, unsigned); +int bch2_replicas_gc2(struct bch_fs *); + +int bch2_replicas_set_usage(struct bch_fs *, + struct bch_replicas_entry *, + u64); + +#define for_each_cpu_replicas_entry(_r, _i) \ + for (_i = (_r)->entries; \ + (void *) (_i) < (void *) (_r)->entries + (_r)->nr * (_r)->entry_size;\ + _i = (void *) (_i) + (_r)->entry_size) /* iterate over superblock replicas - used by userspace tools: */ -static inline struct bch_replicas_entry * -replicas_entry_next(struct bch_replicas_entry *i) -{ - return (void *) i + offsetof(struct bch_replicas_entry, devs) + i->nr; -} +#define replicas_entry_bytes(_i) \ + (offsetof(typeof(*(_i)), devs) + (_i)->nr_devs) + +#define replicas_entry_next(_i) \ + ((typeof(_i)) ((void *) (_i) + replicas_entry_bytes(_i))) #define for_each_replicas_entry(_r, _i) \ for (_i = (_r)->entries; \ (void *) (_i) < vstruct_end(&(_r)->field) && (_i)->data_type;\ (_i) = replicas_entry_next(_i)) +#define for_each_replicas_entry_v0(_r, _i) \ + for (_i = (_r)->entries; \ + (void *) (_i) < vstruct_end(&(_r)->field) && (_i)->data_type;\ + (_i) = replicas_entry_next(_i)) + int bch2_sb_replicas_to_cpu_replicas(struct bch_fs *); extern const struct bch_sb_field_ops bch_sb_field_ops_replicas; +extern const struct bch_sb_field_ops bch_sb_field_ops_replicas_v0; + +int bch2_fs_replicas_init(struct bch_fs *); #endif /* _BCACHEFS_REPLICAS_H */ diff --git a/fs/bcachefs/replicas_types.h b/fs/bcachefs/replicas_types.h new file mode 100644 index 000000000000..0535b1d3760e --- /dev/null +++ b/fs/bcachefs/replicas_types.h @@ -0,0 +1,10 @@ +#ifndef _BCACHEFS_REPLICAS_TYPES_H +#define _BCACHEFS_REPLICAS_TYPES_H + +struct bch_replicas_cpu { + unsigned nr; + unsigned entry_size; + struct bch_replicas_entry *entries; +}; + +#endif /* _BCACHEFS_REPLICAS_TYPES_H */ diff --git a/fs/bcachefs/siphash.c b/fs/bcachefs/siphash.c index 3a6c9c8217f0..c062edb3fbc2 100644 --- a/fs/bcachefs/siphash.c +++ b/fs/bcachefs/siphash.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: BSD-3-Clause /* $OpenBSD: siphash.c,v 1.3 2015/02/20 11:51:03 tedu Exp $ */ /*- diff --git a/fs/bcachefs/siphash.h b/fs/bcachefs/siphash.h index 7a4b2241f1e1..3dfaf34a43b2 100644 --- a/fs/bcachefs/siphash.h +++ b/fs/bcachefs/siphash.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: BSD-3-Clause */ /* $OpenBSD: siphash.h,v 1.5 2015/02/20 11:51:03 tedu Exp $ */ /*- * 
Copyright (c) 2013 Andre Oppermann <andre@FreeBSD.org> diff --git a/fs/bcachefs/str_hash.h b/fs/bcachefs/str_hash.h index f7dd0144f608..1779f755b21d 100644 --- a/fs/bcachefs/str_hash.h +++ b/fs/bcachefs/str_hash.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _BCACHEFS_STR_HASH_H #define _BCACHEFS_STR_HASH_H @@ -11,6 +12,7 @@ #include <linux/crc32c.h> #include <crypto/hash.h> +#include <crypto/sha.h> struct bch_hash_info { u8 type; @@ -37,7 +39,7 @@ bch2_hash_info_init(struct bch_fs *c, break; case BCH_STR_HASH_SIPHASH: { SHASH_DESC_ON_STACK(desc, c->sha256); - u8 digest[crypto_shash_digestsize(c->sha256)]; + u8 digest[SHA256_DIGEST_SIZE]; desc->tfm = c->sha256; desc->flags = 0; @@ -117,7 +119,6 @@ static inline u64 bch2_str_hash_end(struct bch_str_hash_ctx *ctx, struct bch_hash_desc { enum btree_id btree_id; u8 key_type; - u8 whiteout_type; u64 (*hash_key)(const struct bch_hash_info *, const void *); u64 (*hash_bkey)(const struct bch_hash_info *, struct bkey_s_c); @@ -125,268 +126,192 @@ struct bch_hash_desc { bool (*cmp_bkey)(struct bkey_s_c, struct bkey_s_c); }; -static inline struct bkey_s_c -bch2_hash_lookup_at(const struct bch_hash_desc desc, - const struct bch_hash_info *info, - struct btree_iter *iter, const void *search) +static __always_inline struct btree_iter * +bch2_hash_lookup(struct btree_trans *trans, + const struct bch_hash_desc desc, + const struct bch_hash_info *info, + u64 inode, const void *key, + unsigned flags) { - u64 inode = iter->pos.inode; - struct bkey_s_c k; - - for_each_btree_key_continue(iter, BTREE_ITER_SLOTS, k) { - if (iter->pos.inode != inode) - break; - - if (k.k->type == desc.key_type) { - if (!desc.cmp_key(k, search)) - return k; - } else if (k.k->type == desc.whiteout_type) { - ; - } else { - /* hole, not found */ - break; - } - } - return btree_iter_err(k) ? k : bkey_s_c_err(-ENOENT); -} - -static inline struct bkey_s_c -bch2_hash_lookup_bkey_at(const struct bch_hash_desc desc, - const struct bch_hash_info *info, - struct btree_iter *iter, struct bkey_s_c search) -{ - u64 inode = iter->pos.inode; + struct btree_iter *iter; struct bkey_s_c k; + int ret; - for_each_btree_key_continue(iter, BTREE_ITER_SLOTS, k) { + for_each_btree_key(trans, iter, desc.btree_id, + POS(inode, desc.hash_key(info, key)), + BTREE_ITER_SLOTS|flags, k, ret) { if (iter->pos.inode != inode) break; if (k.k->type == desc.key_type) { - if (!desc.cmp_bkey(k, search)) - return k; - } else if (k.k->type == desc.whiteout_type) { + if (!desc.cmp_key(k, key)) + return iter; + } else if (k.k->type == KEY_TYPE_whiteout) { ; } else { /* hole, not found */ break; } } - return btree_iter_err(k) ? 
k : bkey_s_c_err(-ENOENT); -} - -static inline struct bkey_s_c -bch2_hash_lookup(const struct bch_hash_desc desc, - const struct bch_hash_info *info, - struct bch_fs *c, u64 inode, - struct btree_iter *iter, const void *key) -{ - bch2_btree_iter_init(iter, c, desc.btree_id, - POS(inode, desc.hash_key(info, key)), - BTREE_ITER_SLOTS); - return bch2_hash_lookup_at(desc, info, iter, key); + return ERR_PTR(ret ?: -ENOENT); } -static inline struct bkey_s_c -bch2_hash_lookup_intent(const struct bch_hash_desc desc, - const struct bch_hash_info *info, - struct bch_fs *c, u64 inode, - struct btree_iter *iter, const void *key) +static __always_inline struct btree_iter * +bch2_hash_hole(struct btree_trans *trans, + const struct bch_hash_desc desc, + const struct bch_hash_info *info, + u64 inode, const void *key) { - bch2_btree_iter_init(iter, c, desc.btree_id, - POS(inode, desc.hash_key(info, key)), - BTREE_ITER_SLOTS|BTREE_ITER_INTENT); - - return bch2_hash_lookup_at(desc, info, iter, key); -} - -static inline struct bkey_s_c -bch2_hash_hole_at(const struct bch_hash_desc desc, struct btree_iter *iter) -{ - u64 inode = iter->pos.inode; + struct btree_iter *iter; struct bkey_s_c k; + int ret; - for_each_btree_key_continue(iter, BTREE_ITER_SLOTS, k) { + for_each_btree_key(trans, iter, desc.btree_id, + POS(inode, desc.hash_key(info, key)), + BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) { if (iter->pos.inode != inode) break; if (k.k->type != desc.key_type) - return k; + return iter; } - return btree_iter_err(k) ? k : bkey_s_c_err(-ENOENT); -} -static inline struct bkey_s_c bch2_hash_hole(const struct bch_hash_desc desc, - const struct bch_hash_info *info, - struct bch_fs *c, u64 inode, - struct btree_iter *iter, - const void *key) -{ - bch2_btree_iter_init(iter, c, desc.btree_id, - POS(inode, desc.hash_key(info, key)), - BTREE_ITER_SLOTS|BTREE_ITER_INTENT); - - return bch2_hash_hole_at(desc, iter); + return ERR_PTR(ret ?: -ENOSPC); } -static inline int bch2_hash_needs_whiteout(const struct bch_hash_desc desc, - const struct bch_hash_info *info, - struct btree_iter *iter, - struct btree_iter *start) +static __always_inline +int bch2_hash_needs_whiteout(struct btree_trans *trans, + const struct bch_hash_desc desc, + const struct bch_hash_info *info, + struct btree_iter *start) { + struct btree_iter *iter; struct bkey_s_c k; + iter = bch2_trans_copy_iter(trans, start); + if (IS_ERR(iter)) + return PTR_ERR(iter); + bch2_btree_iter_next_slot(iter); for_each_btree_key_continue(iter, BTREE_ITER_SLOTS, k) { if (k.k->type != desc.key_type && - k.k->type != desc.whiteout_type) - return false; + k.k->type != KEY_TYPE_whiteout) + break; if (k.k->type == desc.key_type && - desc.hash_bkey(info, k) <= start->pos.offset) - return true; + desc.hash_bkey(info, k) <= start->pos.offset) { + bch2_trans_iter_free_on_commit(trans, iter); + return 1; + } } - return btree_iter_err(k); + + return bch2_trans_iter_free(trans, iter); } -static inline int bch2_hash_set(const struct bch_hash_desc desc, - const struct bch_hash_info *info, - struct bch_fs *c, u64 inode, - u64 *journal_seq, - struct bkey_i *insert, int flags) +static __always_inline +int bch2_hash_set(struct btree_trans *trans, + const struct bch_hash_desc desc, + const struct bch_hash_info *info, + u64 inode, struct bkey_i *insert, int flags) { - struct btree_iter iter, hashed_slot; + struct btree_iter *iter, *slot = NULL; struct bkey_s_c k; + bool found = false; int ret; - bch2_btree_iter_init(&hashed_slot, c, desc.btree_id, - POS(inode, desc.hash_bkey(info, 
bkey_i_to_s_c(insert))), - BTREE_ITER_SLOTS|BTREE_ITER_INTENT); - bch2_btree_iter_init(&iter, c, desc.btree_id, hashed_slot.pos, - BTREE_ITER_SLOTS|BTREE_ITER_INTENT); - bch2_btree_iter_link(&hashed_slot, &iter); -retry: - /* - * On hash collision, we have to keep the slot we hashed to locked while - * we do the insert - to avoid racing with another thread deleting - * whatever's in the slot we hashed to: - */ - ret = bch2_btree_iter_traverse(&hashed_slot); - if (ret) - goto err; - - /* - * On -EINTR/retry, we dropped locks - always restart from the slot we - * hashed to: - */ - bch2_btree_iter_copy(&iter, &hashed_slot); - - k = bch2_hash_lookup_bkey_at(desc, info, &iter, bkey_i_to_s_c(insert)); - - ret = btree_iter_err(k); - if (ret == -ENOENT) { - if (flags & BCH_HASH_SET_MUST_REPLACE) { - ret = -ENOENT; - goto err; + for_each_btree_key(trans, iter, desc.btree_id, + POS(inode, desc.hash_bkey(info, bkey_i_to_s_c(insert))), + BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) { + if (iter->pos.inode != inode) + break; + + if (k.k->type == desc.key_type) { + if (!desc.cmp_bkey(k, bkey_i_to_s_c(insert))) + goto found; + + /* hash collision: */ + continue; } - /* - * Not found, so we're now looking for any open - * slot - we might have skipped over a whiteout - * that we could have used, so restart from the - * slot we hashed to: - */ - bch2_btree_iter_copy(&iter, &hashed_slot); - k = bch2_hash_hole_at(desc, &iter); - if ((ret = btree_iter_err(k))) - goto err; - } else if (!ret) { - if (flags & BCH_HASH_SET_MUST_CREATE) { - ret = -EEXIST; - goto err; + if (!slot && + !(flags & BCH_HASH_SET_MUST_REPLACE)) { + slot = bch2_trans_copy_iter(trans, iter); + if (IS_ERR(slot)) + return PTR_ERR(slot); } + + if (k.k->type != KEY_TYPE_whiteout) + goto not_found; + } + + if (slot) + bch2_trans_iter_free(trans, slot); + bch2_trans_iter_free(trans, iter); + + return ret ?: -ENOSPC; +found: + found = true; +not_found: + + if (!found && (flags & BCH_HASH_SET_MUST_REPLACE)) { + ret = -ENOENT; + } else if (found && (flags & BCH_HASH_SET_MUST_CREATE)) { + ret = -EEXIST; } else { - goto err; + if (!found && slot) { + bch2_trans_iter_free(trans, iter); + iter = slot; + } + + insert->k.p = iter->pos; + bch2_trans_update(trans, BTREE_INSERT_ENTRY(iter, insert)); + bch2_trans_iter_free_on_commit(trans, iter); } - insert->k.p = iter.pos; - ret = bch2_btree_insert_at(c, NULL, NULL, journal_seq, - BTREE_INSERT_ATOMIC|flags, - BTREE_INSERT_ENTRY(&iter, insert)); -err: - if (ret == -EINTR) - goto retry; - - /* - * On successful insert, we don't want to clobber ret with error from - * iter: - */ - bch2_btree_iter_unlock(&iter); - bch2_btree_iter_unlock(&hashed_slot); return ret; } -static inline int bch2_hash_delete_at(const struct bch_hash_desc desc, - const struct bch_hash_info *info, - struct btree_iter *iter, - u64 *journal_seq) +static __always_inline +int bch2_hash_delete_at(struct btree_trans *trans, + const struct bch_hash_desc desc, + const struct bch_hash_info *info, + struct btree_iter *iter) { - struct btree_iter whiteout_iter; - struct bkey_i delete; - int ret = -ENOENT; - - bch2_btree_iter_init(&whiteout_iter, iter->c, desc.btree_id, - iter->pos, BTREE_ITER_SLOTS); - bch2_btree_iter_link(iter, &whiteout_iter); + struct bkey_i *delete; + int ret; - ret = bch2_hash_needs_whiteout(desc, info, &whiteout_iter, iter); + ret = bch2_hash_needs_whiteout(trans, desc, info, iter); if (ret < 0) - goto err; - - bkey_init(&delete.k); - delete.k.p = iter->pos; - delete.k.type = ret ? 
desc.whiteout_type : KEY_TYPE_DELETED; - - ret = bch2_btree_insert_at(iter->c, NULL, NULL, journal_seq, - BTREE_INSERT_NOFAIL| - BTREE_INSERT_ATOMIC, - BTREE_INSERT_ENTRY(iter, &delete)); -err: - bch2_btree_iter_unlink(&whiteout_iter); - return ret; + return ret; + + delete = bch2_trans_kmalloc(trans, sizeof(*delete)); + if (IS_ERR(delete)) + return PTR_ERR(delete); + + bkey_init(&delete->k); + delete->k.p = iter->pos; + delete->k.type = ret ? KEY_TYPE_whiteout : KEY_TYPE_deleted; + + bch2_trans_update(trans, BTREE_INSERT_ENTRY(iter, delete)); + return 0; } -static inline int bch2_hash_delete(const struct bch_hash_desc desc, - const struct bch_hash_info *info, - struct bch_fs *c, u64 inode, - u64 *journal_seq, const void *key) +static __always_inline +int bch2_hash_delete(struct btree_trans *trans, + const struct bch_hash_desc desc, + const struct bch_hash_info *info, + u64 inode, const void *key) { - struct btree_iter iter, whiteout_iter; - struct bkey_s_c k; - int ret = -ENOENT; - - bch2_btree_iter_init(&iter, c, desc.btree_id, - POS(inode, desc.hash_key(info, key)), - BTREE_ITER_SLOTS|BTREE_ITER_INTENT); - bch2_btree_iter_init(&whiteout_iter, c, desc.btree_id, - POS(inode, desc.hash_key(info, key)), - BTREE_ITER_SLOTS); - bch2_btree_iter_link(&iter, &whiteout_iter); -retry: - k = bch2_hash_lookup_at(desc, info, &iter, key); - if ((ret = btree_iter_err(k))) - goto err; - - ret = bch2_hash_delete_at(desc, info, &iter, journal_seq); -err: - if (ret == -EINTR) - goto retry; - - bch2_btree_iter_unlock(&whiteout_iter); - bch2_btree_iter_unlock(&iter); - return ret; + struct btree_iter *iter; + + iter = bch2_hash_lookup(trans, desc, info, inode, key, + BTREE_ITER_INTENT); + if (IS_ERR(iter)) + return PTR_ERR(iter); + + return bch2_hash_delete_at(trans, desc, info, iter); } #endif /* _BCACHEFS_STR_HASH_H */ diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index 9772d5973078..5e1ae7e425ff 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -1,9 +1,14 @@ +// SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" +#include "buckets.h" #include "checksum.h" #include "disk_groups.h" +#include "ec.h" #include "error.h" #include "io.h" +#include "journal.h" +#include "journal_seq_blacklist.h" #include "replicas.h" #include "quota.h" #include "super-io.h" @@ -55,8 +60,13 @@ static struct bch_sb_field *__bch2_sb_field_resize(struct bch_sb_handle *sb, void *src, *dst; src = vstruct_end(f); - f->u64s = cpu_to_le32(u64s); - dst = vstruct_end(f); + + if (u64s) { + f->u64s = cpu_to_le32(u64s); + dst = vstruct_end(f); + } else { + dst = f; + } memmove(dst, src, vstruct_end(sb->sb) - src); @@ -66,7 +76,16 @@ static struct bch_sb_field *__bch2_sb_field_resize(struct bch_sb_handle *sb, sb->sb->u64s = cpu_to_le32(sb_u64s); - return f; + return u64s ? 
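/* resizing a field to zero u64s deletes it, so there is no field to return */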
f : NULL; +} + +void bch2_sb_field_delete(struct bch_sb_handle *sb, + enum bch_sb_field_type type) +{ + struct bch_sb_field *f = bch2_sb_field_get(sb->sb, type); + + if (f) + __bch2_sb_field_resize(sb, f, 0); } /* Superblock realloc/free: */ @@ -89,6 +108,9 @@ int bch2_sb_realloc(struct bch_sb_handle *sb, unsigned u64s) struct bch_sb *new_sb; struct bio *bio; + if (sb->sb && sb->page_order >= order) + return 0; + if (sb->have_layout) { u64 max_bytes = 512 << sb->sb->layout.sb_max_size_bits; @@ -117,7 +139,7 @@ int bch2_sb_realloc(struct bch_sb_handle *sb, unsigned u64s) sb->bio = bio; } - new_sb = (void *) __get_free_pages(GFP_KERNEL|__GFP_ZERO, order); + new_sb = (void *) __get_free_pages(GFP_NOFS|__GFP_ZERO, order); if (!new_sb) return -ENOMEM; @@ -162,8 +184,10 @@ struct bch_sb_field *bch2_sb_field_resize(struct bch_sb_handle *sb, } } + f = bch2_sb_field_get(sb->sb, type); f = __bch2_sb_field_resize(sb, f, u64s); - f->type = cpu_to_le32(type); + if (f) + f->type = cpu_to_le32(type); return f; } @@ -212,16 +236,24 @@ const char *bch2_sb_validate(struct bch_sb_handle *disk_sb) struct bch_sb_field *f; struct bch_sb_field_members *mi; const char *err; + u32 version, version_min; u16 block_size; - if (le64_to_cpu(sb->version) < BCH_SB_VERSION_MIN || - le64_to_cpu(sb->version) > BCH_SB_VERSION_MAX) - return"Unsupported superblock version"; + version = le16_to_cpu(sb->version); + version_min = version >= bcachefs_metadata_version_new_versioning + ? le16_to_cpu(sb->version_min) + : version; - if (le64_to_cpu(sb->version) < BCH_SB_VERSION_EXTENT_MAX) { - SET_BCH_SB_ENCODED_EXTENT_MAX_BITS(sb, 7); - SET_BCH_SB_POSIX_ACL(sb, 1); - } + if (version >= bcachefs_metadata_version_max || + version_min < bcachefs_metadata_version_min) + return "Unsupported superblock version"; + + if (version_min > version) + return "Bad minimum version"; + + if (sb->features[1] || + (le64_to_cpu(sb->features[0]) & (~0ULL << BCH_FEATURE_NR))) + return "Filesystem has incompatible features"; block_size = le16_to_cpu(sb->block_size); @@ -309,13 +341,6 @@ const char *bch2_sb_validate(struct bch_sb_handle *disk_sb) return err; } - if (le64_to_cpu(sb->version) < BCH_SB_VERSION_EXTENT_NONCE_V1 && - bch2_sb_get_crypt(sb) && - BCH_SB_INITIALIZED(sb)) - return "Incompatible extent nonces"; - - sb->version = cpu_to_le64(BCH_SB_VERSION_MAX); - return NULL; } @@ -332,6 +357,7 @@ static void bch2_sb_update(struct bch_fs *c) c->sb.uuid = src->uuid; c->sb.user_uuid = src->user_uuid; + c->sb.version = le16_to_cpu(src->version); c->sb.nr_devices = src->nr_devices; c->sb.clean = BCH_SB_CLEAN(src); c->sb.encryption_type = BCH_SB_ENCRYPTION_TYPE(src); @@ -340,6 +366,7 @@ static void bch2_sb_update(struct bch_fs *c) c->sb.time_base_hi = le32_to_cpu(src->time_base_hi); c->sb.time_precision = le32_to_cpu(src->time_precision); c->sb.features = le64_to_cpu(src->features[0]); + c->sb.compat = le64_to_cpu(src->compat[0]); for_each_member_device(ca, c, i) ca->mi = bch2_mi_to_cpu(mi->members + i); @@ -350,8 +377,10 @@ static void __copy_super(struct bch_sb_handle *dst_handle, struct bch_sb *src) { struct bch_sb_field *src_f, *dst_f; struct bch_sb *dst = dst_handle->sb; + unsigned i; dst->version = src->version; + dst->version_min = src->version_min; dst->seq = src->seq; dst->uuid = src->uuid; dst->user_uuid = src->user_uuid; @@ -368,15 +397,17 @@ static void __copy_super(struct bch_sb_handle *dst_handle, struct bch_sb *src) memcpy(dst->features, src->features, sizeof(dst->features)); memcpy(dst->compat, src->compat, sizeof(dst->compat)); - 
vstruct_for_each(src, src_f) { - if (src_f->type == BCH_SB_FIELD_journal) + for (i = 0; i < BCH_SB_FIELD_NR; i++) { + if (i == BCH_SB_FIELD_journal) continue; - dst_f = bch2_sb_field_get(dst, le32_to_cpu(src_f->type)); + src_f = bch2_sb_field_get(src, i); + dst_f = bch2_sb_field_get(dst, i); dst_f = __bch2_sb_field_resize(dst_handle, dst_f, - le32_to_cpu(src_f->u64s)); + src_f ? le32_to_cpu(src_f->u64s) : 0); - memcpy(dst_f, src_f, vstruct_bytes(src_f)); + if (src_f) + memcpy(dst_f, src_f, vstruct_bytes(src_f)); } } @@ -449,9 +480,9 @@ reread: if (uuid_le_cmp(sb->sb->magic, BCACHE_MAGIC)) return "Not a bcachefs superblock"; - if (le64_to_cpu(sb->sb->version) < BCH_SB_VERSION_MIN || - le64_to_cpu(sb->sb->version) > BCH_SB_VERSION_MAX) - return"Unsupported superblock version"; + if (le16_to_cpu(sb->sb->version) < bcachefs_metadata_version_min || + le16_to_cpu(sb->sb->version) >= bcachefs_metadata_version_max) + return "Unsupported superblock version"; bytes = vstruct_bytes(sb->sb); @@ -474,6 +505,8 @@ reread: if (bch2_crc_cmp(csum, sb->sb->csum)) return "bad checksum reading superblock"; + sb->seq = le64_to_cpu(sb->sb->seq); + return NULL; } @@ -609,6 +642,27 @@ static void write_super_endio(struct bio *bio) percpu_ref_put(&ca->io_ref); } +static void read_back_super(struct bch_fs *c, struct bch_dev *ca) +{ + struct bch_sb *sb = ca->disk_sb.sb; + struct bio *bio = ca->disk_sb.bio; + + bio_reset(bio); + bio_set_dev(bio, ca->disk_sb.bdev); + bio->bi_iter.bi_sector = le64_to_cpu(sb->layout.sb_offset[0]); + bio->bi_iter.bi_size = PAGE_SIZE; + bio->bi_end_io = write_super_endio; + bio->bi_private = ca; + bio_set_op_attrs(bio, REQ_OP_READ, REQ_SYNC|REQ_META); + bch2_bio_map(bio, ca->sb_read_scratch); + + this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_SB], + bio_sectors(bio)); + + percpu_ref_get(&ca->io_ref); + closure_bio_submit(bio, &c->sb_write); +} + static void write_one_super(struct bch_fs *c, struct bch_dev *ca, unsigned idx) { struct bch_sb *sb = ca->disk_sb.sb; @@ -638,7 +692,7 @@ static void write_one_super(struct bch_fs *c, struct bch_dev *ca, unsigned idx) closure_bio_submit(bio, &c->sb_write); } -void bch2_write_super(struct bch_fs *c) +int bch2_write_super(struct bch_fs *c) { struct closure *cl = &c->sb_write; struct bch_dev *ca; @@ -646,6 +700,7 @@ void bch2_write_super(struct bch_fs *c) const char *err; struct bch_devs_mask sb_written; bool wrote, can_mount_without_written, can_mount_with_written; + int ret = 0; lockdep_assert_held(&c->sb_lock); @@ -654,6 +709,9 @@ void bch2_write_super(struct bch_fs *c) le64_add_cpu(&c->disk_sb.sb->seq, 1); + if (test_bit(BCH_FS_ERROR, &c->flags)) + SET_BCH_SB_HAS_ERRORS(c->disk_sb.sb, 1); + for_each_online_member(ca, c, i) bch2_sb_from_fs(c, ca); @@ -661,12 +719,12 @@ void bch2_write_super(struct bch_fs *c) err = bch2_sb_validate(&ca->disk_sb); if (err) { bch2_fs_inconsistent(c, "sb invalid before write: %s", err); + ret = -1; goto out; } } - if (c->opts.nochanges || - test_bit(BCH_FS_ERROR, &c->flags)) + if (c->opts.nochanges) goto out; for_each_online_member(ca, c, i) { @@ -674,10 +732,27 @@ void bch2_write_super(struct bch_fs *c) ca->sb_write_error = 0; } + for_each_online_member(ca, c, i) + read_back_super(c, ca); + closure_sync(cl); + + for_each_online_member(ca, c, i) { + if (!ca->sb_write_error && + ca->disk_sb.seq != + le64_to_cpu(ca->sb_read_scratch->seq)) { + bch2_fs_fatal_error(c, + "Superblock modified by another process"); + percpu_ref_put(&ca->io_ref); + ret = -EROFS; + goto out; + } + } + do { wrote = false; 
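/*
 * Write superblock copy `sb' to every device that hasn't already seen a
 * write error, then advance to the next copy; the loop ends once a pass
 * finds no device with a copy left to write.
 */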
for_each_online_member(ca, c, i) - if (sb < ca->disk_sb.sb->layout.nr_superblocks) { + if (!ca->sb_write_error && + sb < ca->disk_sb.sb->layout.nr_superblocks) { write_one_super(c, ca, sb); wrote = true; } @@ -685,9 +760,12 @@ void bch2_write_super(struct bch_fs *c) sb++; } while (wrote); - for_each_online_member(ca, c, i) + for_each_online_member(ca, c, i) { if (ca->sb_write_error) __clear_bit(ca->dev_idx, sb_written.d); + else + ca->disk_sb.seq = le64_to_cpu(ca->disk_sb.sb->seq); + } nr_wrote = dev_mask_nr(&sb_written); @@ -710,13 +788,15 @@ void bch2_write_super(struct bch_fs *c) * written anything (new filesystem), we continue if we'd be able to * mount with the devices we did successfully write to: */ - bch2_fs_fatal_err_on(!nr_wrote || - (can_mount_without_written && - !can_mount_with_written), c, - "Unable to write superblock to sufficient devices"); + if (bch2_fs_fatal_err_on(!nr_wrote || + (can_mount_without_written && + !can_mount_with_written), c, + "Unable to write superblock to sufficient devices")) + ret = -1; out: /* Make new options visible after they're persistent: */ bch2_sb_update(c); + return ret; } /* BCH_SB_FIELD_journal: */ @@ -804,7 +884,7 @@ static const char *bch2_sb_validate_members(struct bch_sb *sb, return "Too many buckets"; if (le64_to_cpu(m->nbuckets) - - le16_to_cpu(m->first_bucket) < 1 << 10) + le16_to_cpu(m->first_bucket) < BCH_MIN_NR_NBUCKETS) return "Not enough buckets"; if (le16_to_cpu(m->bucket_size) < @@ -816,12 +896,6 @@ static const char *bch2_sb_validate_members(struct bch_sb *sb, return "bucket size smaller than btree node size"; } - if (le64_to_cpu(sb->version) < BCH_SB_VERSION_EXTENT_MAX) - for (m = mi->members; - m < mi->members + sb->nr_devices; - m++) - SET_BCH_MEMBER_DATA_ALLOWED(m, ~0); - return NULL; } @@ -849,6 +923,194 @@ static const struct bch_sb_field_ops bch_sb_field_ops_crypt = { .validate = bch2_sb_validate_crypt, }; +/* BCH_SB_FIELD_clean: */ + +void bch2_sb_clean_renumber(struct bch_sb_field_clean *clean, int write) +{ + struct jset_entry *entry; + + for (entry = clean->start; + entry < (struct jset_entry *) vstruct_end(&clean->field); + entry = vstruct_next(entry)) + bch2_bkey_renumber(BKEY_TYPE_BTREE, bkey_to_packed(entry->start), write); +} + +int bch2_fs_mark_dirty(struct bch_fs *c) +{ + int ret; + + /* + * Unconditionally write superblock, to verify it hasn't changed before + * we go rw: + */ + + mutex_lock(&c->sb_lock); + SET_BCH_SB_CLEAN(c->disk_sb.sb, false); + c->disk_sb.sb->compat[0] &= ~(1ULL << BCH_COMPAT_FEAT_ALLOC_METADATA); + ret = bch2_write_super(c); + mutex_unlock(&c->sb_lock); + + return ret; +} + +struct jset_entry * +bch2_journal_super_entries_add_common(struct bch_fs *c, + struct jset_entry *entry, + u64 journal_seq) +{ + struct btree_root *r; + unsigned i; + + mutex_lock(&c->btree_root_lock); + + for (r = c->btree_roots; + r < c->btree_roots + BTREE_ID_NR; + r++) + if (r->alive) { + entry->u64s = r->key.u64s; + entry->btree_id = r - c->btree_roots; + entry->level = r->level; + entry->type = BCH_JSET_ENTRY_btree_root; + bkey_copy(&entry->start[0], &r->key); + + entry = vstruct_next(entry); + } + c->btree_roots_dirty = false; + + mutex_unlock(&c->btree_root_lock); + + percpu_down_write(&c->mark_lock); + + if (!journal_seq) { + bch2_fs_usage_acc_to_base(c, 0); + bch2_fs_usage_acc_to_base(c, 1); + } else { + bch2_fs_usage_acc_to_base(c, journal_seq & 1); + } + + { + struct jset_entry_usage *u = + container_of(entry, struct jset_entry_usage, entry); + + memset(u, 0, sizeof(*u)); + u->entry.u64s = 
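/* u64s counts payload u64s only, hence the - 1 for the 1-u64 jset_entry header */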
DIV_ROUND_UP(sizeof(*u), sizeof(u64)) - 1;
+		u->entry.type = BCH_JSET_ENTRY_usage;
+		u->entry.btree_id = FS_USAGE_INODES;
+		u->v = cpu_to_le64(c->usage_base->nr_inodes);
+
+		entry = vstruct_next(entry);
+	}
+
+	{
+		struct jset_entry_usage *u =
+			container_of(entry, struct jset_entry_usage, entry);
+
+		memset(u, 0, sizeof(*u));
+		u->entry.u64s = DIV_ROUND_UP(sizeof(*u), sizeof(u64)) - 1;
+		u->entry.type = BCH_JSET_ENTRY_usage;
+		u->entry.btree_id = FS_USAGE_KEY_VERSION;
+		u->v = cpu_to_le64(atomic64_read(&c->key_version));
+
+		entry = vstruct_next(entry);
+	}
+
+	for (i = 0; i < BCH_REPLICAS_MAX; i++) {
+		struct jset_entry_usage *u =
+			container_of(entry, struct jset_entry_usage, entry);
+
+		memset(u, 0, sizeof(*u));
+		u->entry.u64s = DIV_ROUND_UP(sizeof(*u), sizeof(u64)) - 1;
+		u->entry.type = BCH_JSET_ENTRY_usage;
+		u->entry.btree_id = FS_USAGE_RESERVED;
+		u->entry.level = i;
+		u->v = cpu_to_le64(c->usage_base->persistent_reserved[i]);
+
+		entry = vstruct_next(entry);
+	}
+
+	for (i = 0; i < c->replicas.nr; i++) {
+		struct bch_replicas_entry *e =
+			cpu_replicas_entry(&c->replicas, i);
+		struct jset_entry_data_usage *u =
+			container_of(entry, struct jset_entry_data_usage, entry);
+
+		memset(u, 0, sizeof(*u));
+		u->entry.u64s = DIV_ROUND_UP(sizeof(*u) + e->nr_devs,
+					     sizeof(u64)) - 1;
+		u->entry.type = BCH_JSET_ENTRY_data_usage;
+		u->v = cpu_to_le64(c->usage_base->replicas[i]);
+		memcpy(&u->r, e, replicas_entry_bytes(e));
+
+		entry = vstruct_next(entry);
+	}
+
+	percpu_up_write(&c->mark_lock);
+
+	return entry;
+}
+
+void bch2_fs_mark_clean(struct bch_fs *c)
+{
+	struct bch_sb_field_clean *sb_clean;
+	struct jset_entry *entry;
+	unsigned u64s;
+
+	mutex_lock(&c->sb_lock);
+	if (BCH_SB_CLEAN(c->disk_sb.sb))
+		goto out;
+
+	SET_BCH_SB_CLEAN(c->disk_sb.sb, true);
+
+	c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_FEAT_ALLOC_INFO;
+	c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_FEAT_ALLOC_METADATA;
+
+	u64s = sizeof(*sb_clean) / sizeof(u64) + c->journal.entry_u64s_reserved;
+
+	sb_clean = bch2_sb_resize_clean(&c->disk_sb, u64s);
+	if (!sb_clean) {
+		bch_err(c, "error resizing superblock while setting filesystem clean");
+		goto out;
+	}
+
+	sb_clean->flags = 0;
+	sb_clean->read_clock = cpu_to_le16(c->bucket_clock[READ].hand);
+	sb_clean->write_clock = cpu_to_le16(c->bucket_clock[WRITE].hand);
+	sb_clean->journal_seq = cpu_to_le64(journal_cur_seq(&c->journal) - 1);
+
+	/* Trying to catch outstanding bug: */
+	BUG_ON(le64_to_cpu(sb_clean->journal_seq) > S64_MAX);
+
+	entry = sb_clean->start;
+	entry = bch2_journal_super_entries_add_common(c, entry, 0);
+	BUG_ON((void *) entry > vstruct_end(&sb_clean->field));
+
+	memset(entry, 0,
+	       vstruct_end(&sb_clean->field) - (void *) entry);
+
+	if (le16_to_cpu(c->disk_sb.sb->version) <
+	    bcachefs_metadata_version_bkey_renumber)
+		bch2_sb_clean_renumber(sb_clean, WRITE);
+
+	bch2_write_super(c);
+out:
+	mutex_unlock(&c->sb_lock);
+}
+
+static const char *bch2_sb_validate_clean(struct bch_sb *sb,
+					  struct bch_sb_field *f)
+{
+	struct bch_sb_field_clean *clean = field_to_type(f, clean);
+
+	if (vstruct_bytes(&clean->field) < sizeof(*clean))
+		return "invalid field clean: wrong size";
+
+	return NULL;
+}
+
+static const struct bch_sb_field_ops bch_sb_field_ops_clean = {
+	.validate	= bch2_sb_validate_clean,
+};
+
 static const struct bch_sb_field_ops *bch2_sb_field_ops[] = {
 #define x(f, nr)					\
 	[BCH_SB_FIELD_##f] = &bch_sb_field_ops_##f,
@@ -866,21 +1128,20 @@ static const char *bch2_sb_field_validate(struct bch_sb *sb,
 		: NULL;
 }
 
-size_t 
bch2_sb_field_to_text(char *buf, size_t size, - struct bch_sb *sb, struct bch_sb_field *f) +void bch2_sb_field_to_text(struct printbuf *out, struct bch_sb *sb, + struct bch_sb_field *f) { unsigned type = le32_to_cpu(f->type); - size_t (*to_text)(char *, size_t, struct bch_sb *, - struct bch_sb_field *) = - type < BCH_SB_FIELD_NR - ? bch2_sb_field_ops[type]->to_text - : NULL; + const struct bch_sb_field_ops *ops = type < BCH_SB_FIELD_NR + ? bch2_sb_field_ops[type] : NULL; - if (!to_text) { - if (size) - buf[0] = '\0'; - return 0; - } + if (ops) + pr_buf(out, "%s", bch2_sb_fields[type]); + else + pr_buf(out, "(unknown field %u)", type); + + pr_buf(out, " (size %llu):", vstruct_bytes(f)); - return to_text(buf, size, sb, f); + if (ops && ops->to_text) + bch2_sb_field_ops[type]->to_text(out, sb, f); } diff --git a/fs/bcachefs/super-io.h b/fs/bcachefs/super-io.h index 995b1c907318..f5450e596c62 100644 --- a/fs/bcachefs/super-io.h +++ b/fs/bcachefs/super-io.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _BCACHEFS_SUPER_IO_H #define _BCACHEFS_SUPER_IO_H @@ -11,6 +12,7 @@ struct bch_sb_field *bch2_sb_field_get(struct bch_sb *, enum bch_sb_field_type); struct bch_sb_field *bch2_sb_field_resize(struct bch_sb_handle *, enum bch_sb_field_type, unsigned); +void bch2_sb_field_delete(struct bch_sb_handle *, enum bch_sb_field_type); #define field_to_type(_f, _name) \ container_of_or_null(_f, struct bch_sb_field_##_name, field) @@ -37,7 +39,7 @@ extern const char * const bch2_sb_fields[]; struct bch_sb_field_ops { const char * (*validate)(struct bch_sb *, struct bch_sb_field *); - size_t (*to_text)(char *, size_t, struct bch_sb *, + void (*to_text)(struct printbuf *, struct bch_sb *, struct bch_sb_field *); }; @@ -87,7 +89,7 @@ int bch2_sb_realloc(struct bch_sb_handle *, unsigned); const char *bch2_sb_validate(struct bch_sb_handle *); int bch2_read_super(const char *, struct bch_opts *, struct bch_sb_handle *); -void bch2_write_super(struct bch_fs *); +int bch2_write_super(struct bch_fs *); /* BCH_SB_FIELD_journal: */ @@ -131,7 +133,18 @@ static inline struct bch_member_cpu bch2_mi_to_cpu(struct bch_member *mi) }; } -size_t bch2_sb_field_to_text(char *, size_t, struct bch_sb *, - struct bch_sb_field *); +/* BCH_SB_FIELD_clean: */ + +struct jset_entry * +bch2_journal_super_entries_add_common(struct bch_fs *, + struct jset_entry *, u64); + +void bch2_sb_clean_renumber(struct bch_sb_field_clean *, int); + +int bch2_fs_mark_dirty(struct bch_fs *); +void bch2_fs_mark_clean(struct bch_fs *); + +void bch2_sb_field_to_text(struct printbuf *, struct bch_sb *, + struct bch_sb_field *); #endif /* _BCACHEFS_SUPER_IO_H */ diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 16b301ff698a..7e1b1bf43c31 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * bcachefs setup/teardown code, and some metadata io - read a superblock and * figure out what to do with it. 
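The printbuf conversion above replaces the old fill-a-buffer-and-return-bytes-written convention with append-style helpers. A minimal sketch of how a caller might now render a superblock field, reusing the stack-printbuf pattern (PBUF()) that appears later in this patch; the field pointer f is assumed to have been obtained from bch2_sb_field_get():

	char buf[512];
	struct printbuf out = PBUF(buf);

	/* prints the field's name, its size, and the field's own to_text output */
	bch2_sb_field_to_text(&out, sb, f);
	printk(KERN_INFO "%s\n", buf);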
@@ -7,10 +8,11 @@ */ #include "bcachefs.h" -#include "alloc.h" +#include "alloc_background.h" +#include "alloc_foreground.h" +#include "bkey_sort.h" #include "btree_cache.h" #include "btree_gc.h" -#include "btree_update.h" #include "btree_update_interior.h" #include "btree_io.h" #include "chardev.h" @@ -19,6 +21,7 @@ #include "compress.h" #include "debug.h" #include "disk_groups.h" +#include "ec.h" #include "error.h" #include "fs.h" #include "fs-io.h" @@ -26,14 +29,14 @@ #include "inode.h" #include "io.h" #include "journal.h" -#include "journal_io.h" #include "journal_reclaim.h" -#include "keylist.h" +#include "journal_seq_blacklist.h" #include "move.h" #include "migrate.h" #include "movinggc.h" #include "quota.h" #include "rebalance.h" +#include "recovery.h" #include "replicas.h" #include "super.h" #include "super-io.h" @@ -201,22 +204,12 @@ int bch2_congested(void *data, int bdi_bits) * - allocator depends on the journal (when it rewrites prios and gens) */ -static void bch_fs_mark_clean(struct bch_fs *c) -{ - if (!bch2_journal_error(&c->journal) && - !test_bit(BCH_FS_ERROR, &c->flags) && - !test_bit(BCH_FS_EMERGENCY_RO, &c->flags)) { - mutex_lock(&c->sb_lock); - SET_BCH_SB_CLEAN(c->disk_sb.sb, true); - bch2_write_super(c); - mutex_unlock(&c->sb_lock); - } -} - static void __bch2_fs_read_only(struct bch_fs *c) { struct bch_dev *ca; - unsigned i; + bool wrote; + unsigned i, clean_passes = 0; + int ret; bch2_rebalance_stop(c); @@ -229,34 +222,58 @@ static void __bch2_fs_read_only(struct bch_fs *c) * Flush journal before stopping allocators, because flushing journal * blacklist entries involves allocating new btree nodes: */ - bch2_journal_flush_pins(&c->journal, U64_MAX - 1); + bch2_journal_flush_all_pins(&c->journal); - for_each_member_device(ca, c, i) - bch2_dev_allocator_stop(ca); + if (!test_bit(BCH_FS_ALLOCATOR_RUNNING, &c->flags)) + goto allocator_not_running; - bch2_journal_flush_all_pins(&c->journal); + do { + wrote = false; - /* - * We need to explicitly wait on btree interior updates to complete - * before stopping the journal, flushing all journal pins isn't - * sufficient, because in the BTREE_INTERIOR_UPDATING_ROOT case btree - * interior updates have to drop their journal pin before they're - * fully complete: - */ - closure_wait_event(&c->btree_interior_update_wait, - !bch2_btree_interior_updates_nr_pending(c)); + ret = bch2_stripes_write(c, BTREE_INSERT_NOCHECK_RW, &wrote) ?: + bch2_alloc_write(c, BTREE_INSERT_NOCHECK_RW, &wrote); - if (!test_bit(BCH_FS_EMERGENCY_RO, &c->flags)) - bch2_btree_verify_flushed(c); + if (ret && !test_bit(BCH_FS_EMERGENCY_RO, &c->flags)) + bch2_fs_inconsistent(c, "error writing out alloc info %i", ret); + + if (ret) + break; + + for_each_member_device(ca, c, i) + bch2_dev_allocator_quiesce(c, ca); + + bch2_journal_flush_all_pins(&c->journal); + + /* + * We need to explicitly wait on btree interior updates to complete + * before stopping the journal, flushing all journal pins isn't + * sufficient, because in the BTREE_INTERIOR_UPDATING_ROOT case btree + * interior updates have to drop their journal pin before they're + * fully complete: + */ + closure_wait_event(&c->btree_interior_update_wait, + !bch2_btree_interior_updates_nr_pending(c)); + + clean_passes = wrote ? 
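/* a pass that wrote anything resets the count; exit needs two clean passes in a row */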
0 : clean_passes + 1; + } while (clean_passes < 2); +allocator_not_running: + for_each_member_device(ca, c, i) + bch2_dev_allocator_stop(ca); + + clear_bit(BCH_FS_ALLOCATOR_RUNNING, &c->flags); bch2_fs_journal_stop(&c->journal); + /* XXX: mark super that alloc info is persistent */ + /* * the journal kicks off btree writes via reclaim - wait for in flight * writes after stopping journal: */ if (test_bit(BCH_FS_EMERGENCY_RO, &c->flags)) bch2_btree_flush_all_writes(c); + else + bch2_btree_verify_flushed(c); /* * After stopping journal: @@ -275,12 +292,12 @@ static void bch2_writes_disabled(struct percpu_ref *writes) void bch2_fs_read_only(struct bch_fs *c) { - if (c->state != BCH_FS_STARTING && - c->state != BCH_FS_RW) + if (!test_bit(BCH_FS_RW, &c->flags)) { + cancel_delayed_work_sync(&c->journal.reclaim_work); return; + } - if (test_bit(BCH_FS_ERROR, &c->flags)) - return; + BUG_ON(test_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags)); /* * Block new foreground-end write operations from starting - any new @@ -311,13 +328,19 @@ void bch2_fs_read_only(struct bch_fs *c) __bch2_fs_read_only(c); - bch_fs_mark_clean(c); - wait_event(bch_read_only_wait, test_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags)); clear_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags); - c->state = BCH_FS_RO; + + if (!bch2_journal_error(&c->journal) && + !test_bit(BCH_FS_ERROR, &c->flags) && + !test_bit(BCH_FS_EMERGENCY_RO, &c->flags) && + test_bit(BCH_FS_STARTED, &c->flags) && + !c->opts.norecovery) + bch2_fs_mark_clean(c); + + clear_bit(BCH_FS_RW, &c->flags); } static void bch2_fs_read_only_work(struct work_struct *work) @@ -346,52 +369,112 @@ bool bch2_fs_emergency_read_only(struct bch_fs *c) return ret; } -const char *bch2_fs_read_write(struct bch_fs *c) +static int bch2_fs_read_write_late(struct bch_fs *c) { struct bch_dev *ca; - const char *err = NULL; unsigned i; + int ret; - if (c->state != BCH_FS_STARTING && - c->state != BCH_FS_RO) - return NULL; + ret = bch2_gc_thread_start(c); + if (ret) { + bch_err(c, "error starting gc thread"); + return ret; + } + + for_each_rw_member(ca, c, i) { + ret = bch2_copygc_start(c, ca); + if (ret) { + bch_err(c, "error starting copygc threads"); + percpu_ref_put(&ca->io_ref); + return ret; + } + } + + ret = bch2_rebalance_start(c); + if (ret) { + bch_err(c, "error starting rebalance thread"); + return ret; + } + + schedule_delayed_work(&c->pd_controllers_update, 5 * HZ); + + return 0; +} + +int __bch2_fs_read_write(struct bch_fs *c, bool early) +{ + struct bch_dev *ca; + unsigned i; + int ret; + + if (test_bit(BCH_FS_RW, &c->flags)) + return 0; + + /* + * nochanges is used for fsck -n mode - we have to allow going rw + * during recovery for that to work: + */ + if (c->opts.norecovery || + (c->opts.nochanges && + (!early || c->opts.read_only))) + return -EROFS; + + ret = bch2_fs_mark_dirty(c); + if (ret) + goto err; for_each_rw_member(ca, c, i) bch2_dev_allocator_add(c, ca); bch2_recalc_capacity(c); - err = "error starting allocator thread"; - for_each_rw_member(ca, c, i) - if (bch2_dev_allocator_start(ca)) { - percpu_ref_put(&ca->io_ref); + if (!test_bit(BCH_FS_ALLOCATOR_STARTED, &c->flags)) { + ret = bch2_fs_allocator_start(c); + if (ret) { + bch_err(c, "error initializing allocator"); goto err; } - err = "error starting btree GC thread"; - if (bch2_gc_thread_start(c)) - goto err; + set_bit(BCH_FS_ALLOCATOR_STARTED, &c->flags); + } - err = "error starting copygc thread"; - for_each_rw_member(ca, c, i) - if (bch2_copygc_start(c, ca)) { + for_each_rw_member(ca, c, i) { + ret = 
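/* per-device allocator threads must be running before foreground allocations can proceed */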
bch2_dev_allocator_start(ca); + if (ret) { + bch_err(c, "error starting allocator threads"); percpu_ref_put(&ca->io_ref); goto err; } + } - err = "error starting rebalance thread"; - if (bch2_rebalance_start(c)) - goto err; + set_bit(BCH_FS_ALLOCATOR_RUNNING, &c->flags); - schedule_delayed_work(&c->pd_controllers_update, 5 * HZ); + if (!early) { + ret = bch2_fs_read_write_late(c); + if (ret) + goto err; + } - if (c->state != BCH_FS_STARTING) - percpu_ref_reinit(&c->writes); + percpu_ref_reinit(&c->writes); + set_bit(BCH_FS_RW, &c->flags); - c->state = BCH_FS_RW; - return NULL; + queue_delayed_work(c->journal_reclaim_wq, + &c->journal.reclaim_work, 0); + return 0; err: __bch2_fs_read_only(c); - return err; + return ret; +} + +int bch2_fs_read_write(struct bch_fs *c) +{ + return __bch2_fs_read_write(c, false); +} + +int bch2_fs_read_write_early(struct bch_fs *c) +{ + lockdep_assert_held(&c->state_lock); + + return __bch2_fs_read_write(c, true); } /* Filesystem startup/shutdown: */ @@ -405,6 +488,7 @@ static void bch2_fs_free(struct bch_fs *c) bch2_fs_quota_exit(c); bch2_fs_fsio_exit(c); + bch2_fs_ec_exit(c); bch2_fs_encryption_exit(c); bch2_fs_io_exit(c); bch2_fs_btree_cache_exit(c); @@ -412,17 +496,26 @@ static void bch2_fs_free(struct bch_fs *c) bch2_io_clock_exit(&c->io_clock[WRITE]); bch2_io_clock_exit(&c->io_clock[READ]); bch2_fs_compress_exit(c); - percpu_free_rwsem(&c->usage_lock); - free_percpu(c->usage_percpu); + percpu_free_rwsem(&c->mark_lock); + kfree(c->usage_scratch); + free_percpu(c->usage[1]); + free_percpu(c->usage[0]); + kfree(c->usage_base); + free_percpu(c->pcpu); + mempool_exit(&c->btree_iters_pool); mempool_exit(&c->btree_bounce_pool); bioset_exit(&c->btree_bio); mempool_exit(&c->btree_interior_update_pool); mempool_exit(&c->btree_reserve_pool); mempool_exit(&c->fill_iter); percpu_ref_exit(&c->writes); - kfree(rcu_dereference_protected(c->replicas, 1)); + kfree(c->replicas.entries); + kfree(c->replicas_gc.entries); kfree(rcu_dereference_protected(c->disk_groups, 1)); + kfree(c->journal_seq_blacklist_table); + if (c->journal_reclaim_wq) + destroy_workqueue(c->journal_reclaim_wq); if (c->copygc_wq) destroy_workqueue(c->copygc_wq); if (c->wq) @@ -446,10 +539,11 @@ void bch2_fs_stop(struct bch_fs *c) struct bch_dev *ca; unsigned i; - mutex_lock(&c->state_lock); - BUG_ON(c->state == BCH_FS_STOPPING); - c->state = BCH_FS_STOPPING; - mutex_unlock(&c->state_lock); + bch_verbose(c, "shutting down"); + + set_bit(BCH_FS_STOPPING, &c->flags); + + cancel_work_sync(&c->journal_seq_blacklist_gc_work); for_each_member_device(ca, c, i) if (ca->kobj.state_in_sysfs && @@ -475,11 +569,9 @@ void bch2_fs_stop(struct bch_fs *c) closure_debug_destroy(&c->cl); mutex_lock(&c->state_lock); - __bch2_fs_read_only(c); + bch2_fs_read_only(c); mutex_unlock(&c->state_lock); - bch_fs_mark_clean(c); - /* btree prefetch might have kicked off reads in the background: */ bch2_btree_flush_all_reads(c); @@ -494,6 +586,8 @@ void bch2_fs_stop(struct bch_fs *c) if (c->devs[i]) bch2_dev_free(rcu_dereference_protected(c->devs[i], 1)); + bch_verbose(c, "shutdown complete"); + kobject_put(&c->kobj); } @@ -568,7 +662,8 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) for (i = 0; i < BCH_TIME_STAT_NR; i++) bch2_time_stats_init(&c->times[i]); - bch2_fs_allocator_init(c); + bch2_fs_allocator_background_init(c); + bch2_fs_allocator_foreground_init(c); bch2_fs_rebalance_init(c); bch2_fs_quota_init(c); @@ -578,28 +673,43 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct 
bch_opts opts) mutex_init(&c->btree_reserve_cache_lock); mutex_init(&c->btree_interior_update_lock); + mutex_init(&c->usage_scratch_lock); + mutex_init(&c->bio_bounce_pages_lock); bio_list_init(&c->btree_write_error_list); spin_lock_init(&c->btree_write_error_lock); INIT_WORK(&c->btree_write_error_work, bch2_btree_write_error_work); + INIT_WORK(&c->journal_seq_blacklist_gc_work, + bch2_blacklist_entries_gc); + INIT_LIST_HEAD(&c->fsck_errors); mutex_init(&c->fsck_error_lock); + INIT_LIST_HEAD(&c->ec_new_stripe_list); + mutex_init(&c->ec_new_stripe_lock); + mutex_init(&c->ec_stripe_create_lock); + spin_lock_init(&c->ec_stripes_heap_lock); + seqcount_init(&c->gc_pos_lock); + seqcount_init(&c->usage_lock); + c->copy_gc_enabled = 1; c->rebalance.enabled = 1; c->promote_whole_extents = true; c->journal.write_time = &c->times[BCH_TIME_journal_write]; c->journal.delay_time = &c->times[BCH_TIME_journal_delay]; - c->journal.blocked_time = &c->times[BCH_TIME_journal_blocked]; + c->journal.blocked_time = &c->times[BCH_TIME_blocked_journal]; c->journal.flush_seq_time = &c->times[BCH_TIME_journal_flush_seq]; bch2_fs_btree_cache_init_early(&c->btree_cache); + if (percpu_init_rwsem(&c->mark_lock)) + goto err; + mutex_lock(&c->sb_lock); if (bch2_sb_to_fs(c, sb)) { @@ -618,9 +728,6 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) c->block_bits = ilog2(c->opts.block_size); c->btree_foreground_merge_threshold = BTREE_FOREGROUND_MERGE_THRESHOLD(c); - c->opts.nochanges |= c->opts.noreplay; - c->opts.read_only |= c->opts.nochanges; - if (bch2_fs_init_fault("fs_alloc")) goto err; @@ -629,10 +736,13 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) sizeof(struct btree_node_iter_set); if (!(c->wq = alloc_workqueue("bcachefs", - WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_HIGHPRI, 1)) || + WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 1)) || !(c->copygc_wq = alloc_workqueue("bcache_copygc", + WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 1)) || + !(c->journal_reclaim_wq = alloc_workqueue("bcache_journal", WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_HIGHPRI, 1)) || - percpu_ref_init(&c->writes, bch2_writes_disabled, 0, GFP_KERNEL) || + percpu_ref_init(&c->writes, bch2_writes_disabled, + PERCPU_REF_INIT_DEAD, GFP_KERNEL) || mempool_init_kmalloc_pool(&c->btree_reserve_pool, 1, sizeof(struct btree_reserve)) || mempool_init_kmalloc_pool(&c->btree_interior_update_pool, 1, @@ -642,17 +752,22 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) max(offsetof(struct btree_read_bio, bio), offsetof(struct btree_write_bio, wbio.bio)), BIOSET_NEED_BVECS) || - !(c->usage_percpu = alloc_percpu(struct bch_fs_usage)) || - percpu_init_rwsem(&c->usage_lock) || + !(c->pcpu = alloc_percpu(struct bch_fs_pcpu)) || mempool_init_kvpmalloc_pool(&c->btree_bounce_pool, 1, btree_bytes(c)) || + mempool_init_kmalloc_pool(&c->btree_iters_pool, 1, + sizeof(struct btree_iter) * BTREE_ITER_MAX + + sizeof(struct btree_insert_entry) * + (BTREE_ITER_MAX + 4)) || bch2_io_clock_init(&c->io_clock[READ]) || bch2_io_clock_init(&c->io_clock[WRITE]) || bch2_fs_journal_init(&c->journal) || + bch2_fs_replicas_init(c) || bch2_fs_btree_cache_init(c) || bch2_fs_io_init(c) || bch2_fs_encryption_init(c) || bch2_fs_compress_init(c) || + bch2_fs_ec_init(c) || bch2_fs_fsio_init(c)) goto err; @@ -690,207 +805,101 @@ err: goto out; } -const char *bch2_fs_start(struct bch_fs *c) +noinline_for_stack +static void print_mount_opts(struct bch_fs *c) +{ + enum bch_opt_id i; + char buf[512]; + struct printbuf p = 
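/* point a printbuf at the stack buffer; pr_buf() below appends into it */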
PBUF(buf); + bool first = true; + + strcpy(buf, "(null)"); + + if (c->opts.read_only) { + pr_buf(&p, "ro"); + first = false; + } + + for (i = 0; i < bch2_opts_nr; i++) { + const struct bch_option *opt = &bch2_opt_table[i]; + u64 v = bch2_opt_get_by_id(&c->opts, i); + + if (!(opt->mode & OPT_MOUNT)) + continue; + + if (v == bch2_opt_get_by_id(&bch2_opts_default, i)) + continue; + + if (!first) + pr_buf(&p, ","); + first = false; + bch2_opt_to_text(&p, c, opt, v, OPT_SHOW_MOUNT_STYLE); + } + + bch_info(c, "mounted with opts: %s", buf); +} + +int bch2_fs_start(struct bch_fs *c) { const char *err = "cannot allocate memory"; struct bch_sb_field_members *mi; struct bch_dev *ca; - LIST_HEAD(journal); - struct jset *j; - time64_t now; + time64_t now = ktime_get_real_seconds(); unsigned i; int ret = -EINVAL; mutex_lock(&c->state_lock); - BUG_ON(c->state != BCH_FS_STARTING); + BUG_ON(test_bit(BCH_FS_STARTED, &c->flags)); mutex_lock(&c->sb_lock); + for_each_online_member(ca, c, i) bch2_sb_from_fs(c, ca); + + mi = bch2_sb_get_members(c->disk_sb.sb); + for_each_online_member(ca, c, i) + mi->members[ca->dev_idx].last_mount = cpu_to_le64(now); + mutex_unlock(&c->sb_lock); for_each_rw_member(ca, c, i) bch2_dev_allocator_add(c, ca); bch2_recalc_capacity(c); - if (BCH_SB_INITIALIZED(c->disk_sb.sb)) { - ret = bch2_journal_read(c, &journal); - if (ret) - goto err; - - j = &list_entry(journal.prev, struct journal_replay, list)->j; - - c->bucket_clock[READ].hand = le16_to_cpu(j->read_clock); - c->bucket_clock[WRITE].hand = le16_to_cpu(j->write_clock); - - for (i = 0; i < BTREE_ID_NR; i++) { - unsigned level; - struct bkey_i *k; - - k = bch2_journal_find_btree_root(c, j, i, &level); - if (!k) - continue; - - err = "invalid btree root pointer"; - if (IS_ERR(k)) - goto err; - - err = "error reading btree root"; - if (bch2_btree_root_read(c, i, k, level)) { - if (i != BTREE_ID_ALLOC) - goto err; - - mustfix_fsck_err(c, "error reading btree root"); - } - } - - for (i = 0; i < BTREE_ID_NR; i++) - if (!c->btree_roots[i].b) - bch2_btree_root_alloc(c, i); - - err = "error reading allocation information"; - ret = bch2_alloc_read(c, &journal); - if (ret) - goto err; - - set_bit(BCH_FS_ALLOC_READ_DONE, &c->flags); - - bch_verbose(c, "starting mark and sweep:"); - err = "error in recovery"; - ret = bch2_initial_gc(c, &journal); - if (ret) - goto err; - bch_verbose(c, "mark and sweep done"); - - if (c->opts.noreplay) - goto recovery_done; - - /* - * bch2_fs_journal_start() can't happen sooner, or btree_gc_finish() - * will give spurious errors about oldest_gen > bucket_gen - - * this is a hack but oh well. 
- */ - bch2_fs_journal_start(&c->journal); - - err = "error starting allocator"; - if (bch2_fs_allocator_start(c)) - goto err; - - bch_verbose(c, "starting journal replay:"); - err = "journal replay failed"; - ret = bch2_journal_replay(c, &journal); - if (ret) - goto err; - bch_verbose(c, "journal replay done"); - - if (c->opts.norecovery) - goto recovery_done; - - bch_verbose(c, "starting fsck:"); - err = "error in fsck"; - ret = bch2_fsck(c, !c->opts.nofsck); - if (ret) - goto err; - bch_verbose(c, "fsck done"); - - if (enabled_qtypes(c)) { - bch_verbose(c, "reading quotas:"); - ret = bch2_fs_quota_read(c); - if (ret) - goto err; - bch_verbose(c, "quotas done"); - } - } else { - struct bch_inode_unpacked inode; - struct bkey_inode_buf packed_inode; - - bch_notice(c, "initializing new filesystem"); - - set_bit(BCH_FS_ALLOC_READ_DONE, &c->flags); - - ret = bch2_initial_gc(c, &journal); - if (ret) - goto err; - - err = "unable to allocate journal buckets"; - for_each_online_member(ca, c, i) - if (bch2_dev_journal_alloc(ca)) { - percpu_ref_put(&ca->io_ref); - goto err; - } - - for (i = 0; i < BTREE_ID_NR; i++) - bch2_btree_root_alloc(c, i); - - /* - * journal_res_get() will crash if called before this has - * set up the journal.pin FIFO and journal.cur pointer: - */ - bch2_fs_journal_start(&c->journal); - bch2_journal_set_replay_done(&c->journal); - - err = "error starting allocator"; - if (bch2_fs_allocator_start(c)) - goto err; - - bch2_inode_init(c, &inode, 0, 0, - S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO, 0, NULL); - inode.bi_inum = BCACHEFS_ROOT_INO; - - bch2_inode_pack(&packed_inode, &inode); - - err = "error creating root directory"; - if (bch2_btree_insert(c, BTREE_ID_INODES, - &packed_inode.inode.k_i, - NULL, NULL, NULL, 0)) - goto err; + ret = BCH_SB_INITIALIZED(c->disk_sb.sb) + ? bch2_fs_recovery(c) + : bch2_fs_initialize(c); + if (ret) + goto err; - if (enabled_qtypes(c)) { - ret = bch2_fs_quota_read(c); - if (ret) - goto err; - } + ret = bch2_opts_check_may_set(c); + if (ret) + goto err; - err = "error writing first journal entry"; - if (bch2_journal_meta(&c->journal)) - goto err; - } -recovery_done: err = "dynamic fault"; + ret = -EINVAL; if (bch2_fs_init_fault("fs_start")) goto err; - if (c->opts.read_only) { + if (c->opts.read_only || c->opts.nochanges) { bch2_fs_read_only(c); } else { - err = bch2_fs_read_write(c); - if (err) + err = "error going read write"; + ret = !test_bit(BCH_FS_RW, &c->flags) + ? 
bch2_fs_read_write(c) + : bch2_fs_read_write_late(c); + if (ret) goto err; } - mutex_lock(&c->sb_lock); - mi = bch2_sb_get_members(c->disk_sb.sb); - now = ktime_get_seconds(); - - for_each_member_device(ca, c, i) - mi->members[ca->dev_idx].last_mount = cpu_to_le64(now); - - SET_BCH_SB_INITIALIZED(c->disk_sb.sb, true); - SET_BCH_SB_CLEAN(c->disk_sb.sb, false); - - bch2_write_super(c); - mutex_unlock(&c->sb_lock); - set_bit(BCH_FS_STARTED, &c->flags); - - err = NULL; + print_mount_opts(c); + ret = 0; out: mutex_unlock(&c->state_lock); - bch2_journal_entries_free(&journal); - return err; + return ret; err: -fsck_err: switch (ret) { case BCH_FSCK_ERRORS_NOT_FIXED: bch_err(c, "filesystem contains errors: please report this to the developers"); @@ -917,8 +926,8 @@ fsck_err: break; } - BUG_ON(!err); - set_bit(BCH_FS_ERROR, &c->flags); + if (ret >= 0) + ret = -EIO; goto out; } @@ -985,6 +994,7 @@ static void bch2_dev_free(struct bch_dev *ca) free_percpu(ca->io_done); bioset_exit(&ca->replica_set); bch2_dev_buckets_free(ca); + free_page((unsigned long) ca->sb_read_scratch); bch2_time_stats_exit(&ca->io_latency[WRITE]); bch2_time_stats_exit(&ca->io_latency[READ]); @@ -1091,10 +1101,14 @@ static struct bch_dev *__bch2_dev_alloc(struct bch_fs *c, ca->mi = bch2_mi_to_cpu(member); ca->uuid = member->uuid; + if (opt_defined(c->opts, discard)) + ca->mi.discard = opt_get(c->opts, discard); + if (percpu_ref_init(&ca->ref, bch2_dev_ref_complete, 0, GFP_KERNEL) || percpu_ref_init(&ca->io_ref, bch2_dev_io_ref_complete, PERCPU_REF_INIT_DEAD, GFP_KERNEL) || + !(ca->sb_read_scratch = (void *) __get_free_page(GFP_KERNEL)) || bch2_dev_buckets_alloc(c, ca) || bioset_init(&ca->replica_set, 4, offsetof(struct bch_write_bio, bio), 0) || @@ -1182,14 +1196,6 @@ static int __bch2_dev_attach_bdev(struct bch_dev *ca, struct bch_sb_handle *sb) ca->disk_sb.bdev->bd_holder = ca; memset(sb, 0, sizeof(*sb)); - if (ca->fs) - mutex_lock(&ca->fs->sb_lock); - - bch2_mark_dev_superblock(ca->fs, ca, BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE); - - if (ca->fs) - mutex_unlock(&ca->fs->sb_lock); - percpu_ref_reinit(&ca->io_ref); return 0; @@ -1215,6 +1221,15 @@ static int bch2_dev_attach_bdev(struct bch_fs *c, struct bch_sb_handle *sb) if (ret) return ret; + if (test_bit(BCH_FS_ALLOC_READ_DONE, &c->flags) && + !percpu_u64_get(&ca->usage[0]->buckets[BCH_DATA_SB])) { + mutex_lock(&c->sb_lock); + bch2_mark_dev_superblock(ca->fs, ca, 0); + mutex_unlock(&c->sb_lock); + } + + bch2_dev_sysfs_online(c, ca); + if (c->sb.nr_devices == 1) bdevname(ca->disk_sb.bdev, c->name); bdevname(ca->disk_sb.bdev, ca->name); @@ -1429,10 +1444,9 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags) data = bch2_dev_has_data(c, ca); if (data) { char data_has_str[100]; - bch2_scnprint_flag_list(data_has_str, - sizeof(data_has_str), - bch2_data_types, - data); + + bch2_flags_to_text(&PBUF(data_has_str), + bch2_data_types, data); bch_err(ca, "Remove failed, still has data (%s)", data_has_str); ret = -EBUSY; goto err; @@ -1441,8 +1455,7 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags) ret = bch2_btree_delete_range(c, BTREE_ID_ALLOC, POS(ca->dev_idx, 0), POS(ca->dev_idx + 1, 0), - ZERO_VERSION, - NULL, NULL, NULL); + NULL); if (ret) { bch_err(ca, "Remove failed, error deleting alloc info"); goto err; @@ -1452,7 +1465,8 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags) * must flush all existing journal entries, they might have * (overwritten) keys that point to the device we're removing: */ - ret = 
bch2_journal_flush_all_pins(&c->journal); + bch2_journal_flush_all_pins(&c->journal); + ret = bch2_journal_error(&c->journal); if (ret) { bch_err(ca, "Remove failed, journal error"); goto err; @@ -1483,12 +1497,26 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags) mutex_unlock(&c->state_lock); return 0; err: - if (ca->mi.state == BCH_MEMBER_STATE_RW) + if (ca->mi.state == BCH_MEMBER_STATE_RW && + !percpu_ref_is_zero(&ca->io_ref)) __bch2_dev_read_write(c, ca); mutex_unlock(&c->state_lock); return ret; } +static void dev_usage_clear(struct bch_dev *ca) +{ + struct bucket_array *buckets; + + percpu_memset(ca->usage[0], 0, sizeof(*ca->usage[0])); + + down_read(&ca->bucket_lock); + buckets = bucket_array(ca); + + memset(buckets->b, 0, sizeof(buckets->b[0]) * buckets->nbuckets); + up_read(&ca->bucket_lock); +} + /* Add new device to running filesystem: */ int bch2_dev_add(struct bch_fs *c, const char *path) { @@ -1527,11 +1555,27 @@ int bch2_dev_add(struct bch_fs *c, const char *path) return ret; } + /* + * We want to allocate journal on the new device before adding the new + * device to the filesystem because allocating after we attach requires + * spinning up the allocator thread, and the allocator thread requires + * doing btree writes, which if the existing devices are RO isn't going + * to work + * + * So we have to mark where the superblocks are, but marking allocated + * data normally updates the filesystem usage too, so we have to mark, + * allocate the journal, reset all the marks, then remark after we + * attach... + */ + bch2_mark_dev_superblock(ca->fs, ca, 0); + err = "journal alloc failed"; ret = bch2_dev_journal_alloc(ca); if (ret) goto err; + dev_usage_clear(ca); + mutex_lock(&c->state_lock); mutex_lock(&c->sb_lock); @@ -1576,12 +1620,14 @@ have_slot: /* success: */ mi->members[dev_idx] = dev_mi; - mi->members[dev_idx].last_mount = cpu_to_le64(ktime_get_seconds()); + mi->members[dev_idx].last_mount = cpu_to_le64(ktime_get_real_seconds()); c->disk_sb.sb->nr_devices = nr_devices; ca->disk_sb.sb->dev_idx = dev_idx; bch2_dev_attach(c, ca, dev_idx); + bch2_mark_dev_superblock(c, ca, 0); + bch2_write_super(c); mutex_unlock(&c->sb_lock); @@ -1613,6 +1659,7 @@ int bch2_dev_online(struct bch_fs *c, const char *path) { struct bch_opts opts = bch2_opts_empty(); struct bch_sb_handle sb = { NULL }; + struct bch_sb_field_members *mi; struct bch_dev *ca; unsigned dev_idx; const char *err; @@ -1644,6 +1691,15 @@ int bch2_dev_online(struct bch_fs *c, const char *path) goto err; } + mutex_lock(&c->sb_lock); + mi = bch2_sb_get_members(c->disk_sb.sb); + + mi->members[ca->dev_idx].last_mount = + cpu_to_le64(ktime_get_real_seconds()); + + bch2_write_super(c); + mutex_unlock(&c->sb_lock); + mutex_unlock(&c->state_lock); return 0; err: @@ -1803,9 +1859,9 @@ struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices, goto err_print; if (!c->opts.nostart) { - err = bch2_fs_start(c); - if (err) - goto err_print; + ret = bch2_fs_start(c); + if (ret) + goto err; } out: kfree(sb); @@ -1832,6 +1888,7 @@ static const char *__bch2_fs_open_incremental(struct bch_sb_handle *sb, const char *err; struct bch_fs *c; bool allocated_fs = false; + int ret; err = bch2_sb_validate(sb); if (err) @@ -1864,8 +1921,9 @@ static const char *__bch2_fs_open_incremental(struct bch_sb_handle *sb, mutex_unlock(&c->sb_lock); if (!c->opts.nostart && bch2_fs_may_start(c)) { - err = bch2_fs_start(c); - if (err) + err = "error starting filesystem"; + ret = bch2_fs_start(c); + if (ret) goto err; } diff --git 
a/fs/bcachefs/super.h b/fs/bcachefs/super.h index 231bc5295740..41992e891391 100644 --- a/fs/bcachefs/super.h +++ b/fs/bcachefs/super.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _BCACHEFS_SUPER_H #define _BCACHEFS_SUPER_H @@ -217,11 +218,14 @@ struct bch_dev *bch2_dev_lookup(struct bch_fs *, const char *); bool bch2_fs_emergency_read_only(struct bch_fs *); void bch2_fs_read_only(struct bch_fs *); -const char *bch2_fs_read_write(struct bch_fs *); + +int __bch2_fs_read_write(struct bch_fs *, bool); +int bch2_fs_read_write(struct bch_fs *); +int bch2_fs_read_write_early(struct bch_fs *); void bch2_fs_stop(struct bch_fs *); -const char *bch2_fs_start(struct bch_fs *); +int bch2_fs_start(struct bch_fs *); struct bch_fs *bch2_fs_open(char * const *, unsigned, struct bch_opts); const char *bch2_fs_open_incremental(const char *path); diff --git a/fs/bcachefs/super_types.h b/fs/bcachefs/super_types.h index ab83ade959e4..20406ebd6f5b 100644 --- a/fs/bcachefs/super_types.h +++ b/fs/bcachefs/super_types.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _BCACHEFS_SUPER_TYPES_H #define _BCACHEFS_SUPER_TYPES_H @@ -10,6 +11,7 @@ struct bch_sb_handle { unsigned have_layout:1; unsigned have_bio:1; unsigned fs_sb:1; + u64 seq; }; struct bch_devs_mask { @@ -34,18 +36,6 @@ struct bch_member_cpu { u8 valid; }; -struct bch_replicas_cpu_entry { - u8 data_type; - u8 devs[BCH_SB_MEMBERS_MAX / 8]; -}; - -struct bch_replicas_cpu { - struct rcu_head rcu; - unsigned nr; - unsigned entry_size; - struct bch_replicas_cpu_entry entries[]; -}; - struct bch_disk_group_cpu { bool deleted; u16 parent; diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index 5e341a712cdf..27646c435e30 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * bcache sysfs interfaces * @@ -8,8 +9,7 @@ #ifndef NO_BCACHEFS_SYSFS #include "bcachefs.h" -#include "alloc.h" -#include "compress.h" +#include "alloc_background.h" #include "sysfs.h" #include "btree_cache.h" #include "btree_io.h" @@ -19,6 +19,7 @@ #include "btree_gc.h" #include "buckets.h" #include "disk_groups.h" +#include "ec.h" #include "inode.h" #include "journal.h" #include "keylist.h" @@ -27,6 +28,7 @@ #include "rebalance.h" #include "replicas.h" #include "super-io.h" +#include "tests.h" #include <linux/blkdev.h> #include <linux/sort.h> @@ -71,9 +73,10 @@ do { \ #define sysfs_hprint(file, val) \ do { \ if (attr == &sysfs_ ## file) { \ - ssize_t ret = bch2_hprint(buf, val); \ - strcat(buf, "\n"); \ - return ret + 1; \ + struct printbuf out = _PBUF(buf, PAGE_SIZE); \ + bch2_hprint(&out, val); \ + pr_buf(&out, "\n"); \ + return out.pos - buf; \ } \ } while (0) @@ -130,6 +133,7 @@ do { \ write_attribute(trigger_journal_flush); write_attribute(trigger_btree_coalesce); write_attribute(trigger_gc); +write_attribute(trigger_alloc_write); write_attribute(prune_cache); rw_attribute(btree_gc_periodic); @@ -187,11 +191,17 @@ sysfs_pd_controller_attribute(rebalance); read_attribute(rebalance_work); rw_attribute(promote_whole_extents); +read_attribute(new_stripes); + rw_attribute(pd_controllers_update_seconds); read_attribute(meta_replicas_have); read_attribute(data_replicas_have); +#ifdef CONFIG_BCACHEFS_TESTS +write_attribute(perf_test); +#endif /* CONFIG_BCACHEFS_TESTS */ + #define BCH_DEBUG_PARAM(name, description) \ rw_attribute(name); @@ -224,78 +234,63 @@ static size_t bch2_btree_cache_size(struct bch_fs *c) static ssize_t show_fs_alloc_debug(struct bch_fs *c, char *buf) { - struct 
bch_fs_usage stats = bch2_fs_usage_read(c); + struct printbuf out = _PBUF(buf, PAGE_SIZE); + struct bch_fs_usage *fs_usage = bch2_fs_usage_read(c); - return scnprintf(buf, PAGE_SIZE, - "capacity:\t\t%llu\n" - "1 replicas:\n" - "\tmeta:\t\t%llu\n" - "\tdirty:\t\t%llu\n" - "\treserved:\t%llu\n" - "2 replicas:\n" - "\tmeta:\t\t%llu\n" - "\tdirty:\t\t%llu\n" - "\treserved:\t%llu\n" - "3 replicas:\n" - "\tmeta:\t\t%llu\n" - "\tdirty:\t\t%llu\n" - "\treserved:\t%llu\n" - "4 replicas:\n" - "\tmeta:\t\t%llu\n" - "\tdirty:\t\t%llu\n" - "\treserved:\t%llu\n" - "online reserved:\t%llu\n", - c->capacity, - stats.s[0].data[S_META], - stats.s[0].data[S_DIRTY], - stats.s[0].persistent_reserved, - stats.s[1].data[S_META], - stats.s[1].data[S_DIRTY], - stats.s[1].persistent_reserved, - stats.s[2].data[S_META], - stats.s[2].data[S_DIRTY], - stats.s[2].persistent_reserved, - stats.s[3].data[S_META], - stats.s[3].data[S_DIRTY], - stats.s[3].persistent_reserved, - stats.online_reserved); + if (!fs_usage) + return -ENOMEM; + + bch2_fs_usage_to_text(&out, c, fs_usage); + + percpu_up_read(&c->mark_lock); + + kfree(fs_usage); + + return out.pos - buf; } static ssize_t bch2_compression_stats(struct bch_fs *c, char *buf) { - struct btree_iter iter; + struct btree_trans trans; + struct btree_iter *iter; struct bkey_s_c k; u64 nr_uncompressed_extents = 0, uncompressed_sectors = 0, nr_compressed_extents = 0, compressed_sectors_compressed = 0, compressed_sectors_uncompressed = 0; + int ret; - if (!bch2_fs_running(c)) + if (!test_bit(BCH_FS_STARTED, &c->flags)) return -EPERM; - for_each_btree_key(&iter, c, BTREE_ID_EXTENTS, POS_MIN, 0, k) - if (k.k->type == BCH_EXTENT) { + bch2_trans_init(&trans, c, 0, 0); + + for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, POS_MIN, 0, k, ret) + if (k.k->type == KEY_TYPE_extent) { struct bkey_s_c_extent e = bkey_s_c_to_extent(k); - const struct bch_extent_ptr *ptr; - struct bch_extent_crc_unpacked crc; + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; - extent_for_each_ptr_crc(e, ptr, crc) { - if (crc.compression_type == BCH_COMPRESSION_NONE) { + extent_for_each_ptr_decode(e, p, entry) { + if (p.crc.compression_type == BCH_COMPRESSION_NONE) { nr_uncompressed_extents++; uncompressed_sectors += e.k->size; } else { nr_compressed_extents++; compressed_sectors_compressed += - crc.compressed_size; + p.crc.compressed_size; compressed_sectors_uncompressed += - crc.uncompressed_size; + p.crc.uncompressed_size; } /* only looking at the first ptr */ break; } } - bch2_btree_iter_unlock(&iter); + + ret = bch2_trans_exit(&trans) ?: ret; + if (ret) + return ret; return scnprintf(buf, PAGE_SIZE, "uncompressed data:\n" @@ -312,6 +307,41 @@ static ssize_t bch2_compression_stats(struct bch_fs *c, char *buf) compressed_sectors_uncompressed << 9); } +static ssize_t bch2_new_stripes(struct bch_fs *c, char *buf) +{ + char *out = buf, *end = buf + PAGE_SIZE; + struct ec_stripe_head *h; + struct ec_stripe_new *s; + + mutex_lock(&c->ec_new_stripe_lock); + list_for_each_entry(h, &c->ec_new_stripe_list, list) { + out += scnprintf(out, end - out, + "target %u algo %u redundancy %u:\n", + h->target, h->algo, h->redundancy); + + if (h->s) + out += scnprintf(out, end - out, + "\tpending: blocks %u allocated %u\n", + h->s->blocks.nr, + bitmap_weight(h->s->blocks_allocated, + h->s->blocks.nr)); + + mutex_lock(&h->lock); + list_for_each_entry(s, &h->stripes, list) + out += scnprintf(out, end - out, + "\tin flight: blocks %u allocated %u pin %u\n", + s->blocks.nr, + bitmap_weight(s->blocks_allocated, 
+ s->blocks.nr), + atomic_read(&s->pin)); + mutex_unlock(&h->lock); + + } + mutex_unlock(&c->ec_new_stripe_lock); + + return out - buf; +} + SHOW(bch2_fs) { struct bch_fs *c = container_of(kobj, struct bch_fs, kobj); @@ -348,8 +378,8 @@ SHOW(bch2_fs) sysfs_print(promote_whole_extents, c->promote_whole_extents); - sysfs_printf(meta_replicas_have, "%u", bch2_replicas_online(c, true)); - sysfs_printf(data_replicas_have, "%u", bch2_replicas_online(c, false)); + sysfs_printf(meta_replicas_have, "%i", bch2_replicas_online(c, true)); + sysfs_printf(data_replicas_have, "%i", bch2_replicas_online(c, false)); /* Debugging: */ @@ -371,6 +401,9 @@ SHOW(bch2_fs) if (attr == &sysfs_compression_stats) return bch2_compression_stats(c, buf); + if (attr == &sysfs_new_stripes) + return bch2_new_stripes(c, buf); + #define BCH_DEBUG_PARAM(name, description) sysfs_print(name, c->name); BCH_DEBUG_PARAMS() #undef BCH_DEBUG_PARAM @@ -425,7 +458,7 @@ STORE(__bch2_fs) BCH_DEBUG_PARAMS() #undef BCH_DEBUG_PARAM - if (!bch2_fs_running(c)) + if (!test_bit(BCH_FS_STARTED, &c->flags)) return -EPERM; /* Debugging: */ @@ -437,7 +470,13 @@ STORE(__bch2_fs) bch2_coalesce(c); if (attr == &sysfs_trigger_gc) - bch2_gc(c); + bch2_gc(c, NULL, false, false); + + if (attr == &sysfs_trigger_alloc_write) { + bool wrote; + + bch2_alloc_write(c, 0, &wrote); + } if (attr == &sysfs_prune_cache) { struct shrink_control sc; @@ -446,7 +485,25 @@ STORE(__bch2_fs) sc.nr_to_scan = strtoul_or_return(buf); c->btree_cache.shrink.scan_objects(&c->btree_cache.shrink, &sc); } - +#ifdef CONFIG_BCACHEFS_TESTS + if (attr == &sysfs_perf_test) { + char *tmp = kstrdup(buf, GFP_KERNEL), *p = tmp; + char *test = strsep(&p, " \t\n"); + char *nr_str = strsep(&p, " \t\n"); + char *threads_str = strsep(&p, " \t\n"); + unsigned threads; + u64 nr; + int ret = -EINVAL; + + if (threads_str && + !(ret = kstrtouint(threads_str, 10, &threads)) && + !(ret = bch2_strtoull_h(nr_str, &nr))) + bch2_btree_perf_test(c, test, nr, threads); + else + size = ret; + kfree(tmp); + } +#endif return size; } @@ -477,6 +534,10 @@ struct attribute *bch2_fs_files[] = { &sysfs_promote_whole_extents, &sysfs_compression_stats, + +#ifdef CONFIG_BCACHEFS_TESTS + &sysfs_perf_test, +#endif NULL }; @@ -509,6 +570,7 @@ struct attribute *bch2_fs_internal_files[] = { &sysfs_trigger_journal_flush, &sysfs_trigger_btree_coalesce, &sysfs_trigger_gc, + &sysfs_trigger_alloc_write, &sysfs_prune_cache, &sysfs_copy_gc_enabled, @@ -517,6 +579,8 @@ struct attribute *bch2_fs_internal_files[] = { &sysfs_rebalance_work, sysfs_pd_controller_files(rebalance), + &sysfs_new_stripes, + &sysfs_internal_uuid, #define BCH_DEBUG_PARAM(name, description) &sysfs_##name, @@ -530,16 +594,16 @@ struct attribute *bch2_fs_internal_files[] = { SHOW(bch2_fs_opts_dir) { - char *out = buf, *end = buf + PAGE_SIZE; + struct printbuf out = _PBUF(buf, PAGE_SIZE); struct bch_fs *c = container_of(kobj, struct bch_fs, opts_dir); const struct bch_option *opt = container_of(attr, struct bch_option, attr); int id = opt - bch2_opt_table; u64 v = bch2_opt_get_by_id(&c->opts, id); - out += bch2_opt_to_text(c, out, end - out, opt, v, OPT_SHOW_FULL_LIST); - out += scnprintf(out, end - out, "\n"); + bch2_opt_to_text(&out, c, opt, v, OPT_SHOW_FULL_LIST); + pr_buf(&out, "\n"); - return out - buf; + return out.pos - buf; } STORE(bch2_fs_opts_dir) @@ -560,14 +624,9 @@ STORE(bch2_fs_opts_dir) if (ret < 0) return ret; - if (id == Opt_compression || - id == Opt_background_compression) { - int ret = bch2_check_set_has_compressed_data(c, v); - if (ret) { - 
mutex_unlock(&c->sb_lock); - return ret; - } - } + ret = bch2_opt_check_may_set(c, id, v); + if (ret < 0) + return ret; if (opt->set_sb != SET_NO_SB_OPT) { mutex_lock(&c->sb_lock); @@ -598,7 +657,7 @@ int bch2_opts_create_sysfs_files(struct kobject *kobj) for (i = bch2_opt_table; i < bch2_opt_table + bch2_opts_nr; i++) { - if (i->mode == OPT_INTERNAL) + if (!(i->mode & (OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME))) continue; ret = sysfs_create_file(kobj, &i->attr); @@ -665,10 +724,10 @@ static unsigned bucket_oldest_gen_fn(struct bch_fs *c, struct bch_dev *ca, static int unsigned_cmp(const void *_l, const void *_r) { - unsigned l = *((unsigned *) _l); - unsigned r = *((unsigned *) _r); + const unsigned *l = _l; + const unsigned *r = _r; - return (l > r) - (l < r); + return cmp_int(*l, *r); } static ssize_t show_quantiles(struct bch_fs *c, struct bch_dev *ca, @@ -713,31 +772,35 @@ static ssize_t show_quantiles(struct bch_fs *c, struct bch_dev *ca, static ssize_t show_reserve_stats(struct bch_dev *ca, char *buf) { + struct printbuf out = _PBUF(buf, PAGE_SIZE); enum alloc_reserve i; - ssize_t ret; spin_lock(&ca->freelist_lock); - ret = scnprintf(buf, PAGE_SIZE, - "free_inc:\t%zu\t%zu\n", - fifo_used(&ca->free_inc), - ca->free_inc.size); + pr_buf(&out, "free_inc:\t%zu\t%zu\n", + fifo_used(&ca->free_inc), + ca->free_inc.size); for (i = 0; i < RESERVE_NR; i++) - ret += scnprintf(buf + ret, PAGE_SIZE - ret, - "free[%u]:\t%zu\t%zu\n", i, - fifo_used(&ca->free[i]), - ca->free[i].size); + pr_buf(&out, "free[%u]:\t%zu\t%zu\n", i, + fifo_used(&ca->free[i]), + ca->free[i].size); spin_unlock(&ca->freelist_lock); - return ret; + return out.pos - buf; } static ssize_t show_dev_alloc_debug(struct bch_dev *ca, char *buf) { struct bch_fs *c = ca->fs; struct bch_dev_usage stats = bch2_dev_usage_read(c, ca); + unsigned i, nr[BCH_DATA_NR]; + + memset(nr, 0, sizeof(nr)); + + for (i = 0; i < ARRAY_SIZE(c->open_buckets); i++) + nr[c->open_buckets[i].type]++; return scnprintf(buf, PAGE_SIZE, "free_inc: %zu/%zu\n" @@ -752,16 +815,22 @@ static ssize_t show_dev_alloc_debug(struct bch_dev *ca, char *buf) " meta: %llu\n" " user: %llu\n" " cached: %llu\n" - " available: %llu\n" + " erasure coded: %llu\n" + " available: %lli\n" "sectors:\n" " sb: %llu\n" " journal: %llu\n" " meta: %llu\n" " user: %llu\n" " cached: %llu\n" + " fragmented: %llu\n" + " copygc threshold: %llu\n" "freelist_wait: %s\n" "open buckets: %u/%u (reserved %u)\n" - "open_buckets_wait: %s\n", + "open_buckets_wait: %s\n" + "open_buckets_btree: %u\n" + "open_buckets_user: %u\n" + "btree reserve cache: %u\n", fifo_used(&ca->free_inc), ca->free_inc.size, fifo_used(&ca->free[RESERVE_BTREE]), ca->free[RESERVE_BTREE].size, fifo_used(&ca->free[RESERVE_MOVINGGC]), ca->free[RESERVE_MOVINGGC].size, @@ -773,15 +842,22 @@ static ssize_t show_dev_alloc_debug(struct bch_dev *ca, char *buf) stats.buckets[BCH_DATA_BTREE], stats.buckets[BCH_DATA_USER], stats.buckets[BCH_DATA_CACHED], - __dev_buckets_available(ca, stats), + stats.buckets_ec, + ca->mi.nbuckets - ca->mi.first_bucket - stats.buckets_unavailable, stats.sectors[BCH_DATA_SB], stats.sectors[BCH_DATA_JOURNAL], stats.sectors[BCH_DATA_BTREE], stats.sectors[BCH_DATA_USER], stats.sectors[BCH_DATA_CACHED], + stats.sectors_fragmented, + ca->copygc_threshold, c->freelist_wait.list.first ? "waiting" : "empty", - c->open_buckets_nr_free, OPEN_BUCKETS_COUNT, BTREE_NODE_RESERVE, - c->open_buckets_wait.list.first ? 
"waiting" : "empty"); + c->open_buckets_nr_free, OPEN_BUCKETS_COUNT, + BTREE_NODE_OPEN_BUCKET_RESERVE, + c->open_buckets_wait.list.first ? "waiting" : "empty", + nr[BCH_DATA_BTREE], + nr[BCH_DATA_USER], + c->btree_reserve_cache_nr); } static const char * const bch2_rw[] = { @@ -792,31 +868,26 @@ static const char * const bch2_rw[] = { static ssize_t show_dev_iodone(struct bch_dev *ca, char *buf) { - char *out = buf, *end = buf + PAGE_SIZE; - int rw, i, cpu; + struct printbuf out = _PBUF(buf, PAGE_SIZE); + int rw, i; for (rw = 0; rw < 2; rw++) { - out += scnprintf(out, end - out, "%s:\n", bch2_rw[rw]); + pr_buf(&out, "%s:\n", bch2_rw[rw]); - for (i = 1; i < BCH_DATA_NR; i++) { - u64 n = 0; - - for_each_possible_cpu(cpu) - n += per_cpu_ptr(ca->io_done, cpu)->sectors[rw][i]; - - out += scnprintf(out, end - out, "%-12s:%12llu\n", - bch2_data_types[i], n << 9); - } + for (i = 1; i < BCH_DATA_NR; i++) + pr_buf(&out, "%-12s:%12llu\n", + bch2_data_types[i], + percpu_u64_get(&ca->io_done->sectors[rw][i]) << 9); } - return out - buf; + return out.pos - buf; } SHOW(bch2_dev) { struct bch_dev *ca = container_of(kobj, struct bch_dev, kobj); struct bch_fs *c = ca->fs; - char *out = buf, *end = buf + PAGE_SIZE; + struct printbuf out = _PBUF(buf, PAGE_SIZE); sysfs_printf(uuid, "%pU\n", ca->uuid.b); @@ -830,41 +901,39 @@ SHOW(bch2_dev) if (attr == &sysfs_label) { if (ca->mi.group) { mutex_lock(&c->sb_lock); - out += bch2_disk_path_print(&c->disk_sb, out, end - out, - ca->mi.group - 1); + bch2_disk_path_to_text(&out, &c->disk_sb, + ca->mi.group - 1); mutex_unlock(&c->sb_lock); } else { - out += scnprintf(out, end - out, "none"); + pr_buf(&out, "none"); } - out += scnprintf(out, end - out, "\n"); - return out - buf; + pr_buf(&out, "\n"); + return out.pos - buf; } if (attr == &sysfs_has_data) { - out += bch2_scnprint_flag_list(out, end - out, - bch2_data_types, - bch2_dev_has_data(c, ca)); - out += scnprintf(out, end - out, "\n"); - return out - buf; + bch2_flags_to_text(&out, bch2_data_types, + bch2_dev_has_data(c, ca)); + pr_buf(&out, "\n"); + return out.pos - buf; } sysfs_pd_controller_show(copy_gc, &ca->copygc_pd); if (attr == &sysfs_cache_replacement_policy) { - out += bch2_scnprint_string_list(out, end - out, - bch2_cache_replacement_policies, - ca->mi.replacement); - out += scnprintf(out, end - out, "\n"); - return out - buf; + bch2_string_opt_to_text(&out, + bch2_cache_replacement_policies, + ca->mi.replacement); + pr_buf(&out, "\n"); + return out.pos - buf; } if (attr == &sysfs_state_rw) { - out += bch2_scnprint_string_list(out, end - out, - bch2_dev_state, - ca->mi.state); - out += scnprintf(out, end - out, "\n"); - return out - buf; + bch2_string_opt_to_text(&out, bch2_dev_state, + ca->mi.state); + pr_buf(&out, "\n"); + return out.pos - buf; } if (attr == &sysfs_iodone) @@ -921,7 +990,7 @@ STORE(bch2_dev) } if (attr == &sysfs_cache_replacement_policy) { - ssize_t v = bch2_read_string_list(buf, bch2_cache_replacement_policies); + ssize_t v = __sysfs_match_string(bch2_cache_replacement_policies, -1, buf); if (v < 0) return v; diff --git a/fs/bcachefs/sysfs.h b/fs/bcachefs/sysfs.h index 1ba759fd6e8c..525fd05d91f7 100644 --- a/fs/bcachefs/sysfs.h +++ b/fs/bcachefs/sysfs.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _BCACHEFS_SYSFS_H_ #define _BCACHEFS_SYSFS_H_ diff --git a/fs/bcachefs/tests.c b/fs/bcachefs/tests.c new file mode 100644 index 000000000000..fe0b987902fb --- /dev/null +++ b/fs/bcachefs/tests.c @@ -0,0 +1,678 @@ +// SPDX-License-Identifier: GPL-2.0 +#ifdef 
CONFIG_BCACHEFS_TESTS + +#include "bcachefs.h" +#include "btree_update.h" +#include "journal_reclaim.h" +#include "tests.h" + +#include "linux/kthread.h" +#include "linux/random.h" + +static void delete_test_keys(struct bch_fs *c) +{ + int ret; + + ret = bch2_btree_delete_range(c, BTREE_ID_EXTENTS, + POS(0, 0), POS(0, U64_MAX), + NULL); + BUG_ON(ret); + + ret = bch2_btree_delete_range(c, BTREE_ID_DIRENTS, + POS(0, 0), POS(0, U64_MAX), + NULL); + BUG_ON(ret); +} + +/* unit tests */ + +static void test_delete(struct bch_fs *c, u64 nr) +{ + struct btree_trans trans; + struct btree_iter *iter; + struct bkey_i_cookie k; + int ret; + + bkey_cookie_init(&k.k_i); + + bch2_trans_init(&trans, c, 0, 0); + + iter = bch2_trans_get_iter(&trans, BTREE_ID_DIRENTS, k.k.p, + BTREE_ITER_INTENT); + + ret = bch2_btree_iter_traverse(iter); + BUG_ON(ret); + + bch2_trans_update(&trans, BTREE_INSERT_ENTRY(iter, &k.k_i)); + ret = bch2_trans_commit(&trans, NULL, NULL, 0); + BUG_ON(ret); + + pr_info("deleting once"); + ret = bch2_btree_delete_at(&trans, iter, 0); + BUG_ON(ret); + + pr_info("deleting twice"); + ret = bch2_btree_delete_at(&trans, iter, 0); + BUG_ON(ret); + + bch2_trans_exit(&trans); +} + +static void test_delete_written(struct bch_fs *c, u64 nr) +{ + struct btree_trans trans; + struct btree_iter *iter; + struct bkey_i_cookie k; + int ret; + + bkey_cookie_init(&k.k_i); + + bch2_trans_init(&trans, c, 0, 0); + + iter = bch2_trans_get_iter(&trans, BTREE_ID_DIRENTS, k.k.p, + BTREE_ITER_INTENT); + + ret = bch2_btree_iter_traverse(iter); + BUG_ON(ret); + + bch2_trans_update(&trans, BTREE_INSERT_ENTRY(iter, &k.k_i)); + ret = bch2_trans_commit(&trans, NULL, NULL, 0); + BUG_ON(ret); + + bch2_journal_flush_all_pins(&c->journal); + + ret = bch2_btree_delete_at(&trans, iter, 0); + BUG_ON(ret); + + bch2_trans_exit(&trans); +} + +static void test_iterate(struct bch_fs *c, u64 nr) +{ + struct btree_trans trans; + struct btree_iter *iter; + struct bkey_s_c k; + u64 i; + int ret; + + bch2_trans_init(&trans, c, 0, 0); + + delete_test_keys(c); + + pr_info("inserting test keys"); + + for (i = 0; i < nr; i++) { + struct bkey_i_cookie k; + + bkey_cookie_init(&k.k_i); + k.k.p.offset = i; + + ret = bch2_btree_insert(c, BTREE_ID_DIRENTS, &k.k_i, + NULL, NULL, 0); + BUG_ON(ret); + } + + pr_info("iterating forwards"); + + i = 0; + + for_each_btree_key(&trans, iter, BTREE_ID_DIRENTS, + POS_MIN, 0, k, ret) + BUG_ON(k.k->p.offset != i++); + + BUG_ON(i != nr); + + pr_info("iterating backwards"); + + while (!IS_ERR_OR_NULL((k = bch2_btree_iter_prev(iter)).k)) + BUG_ON(k.k->p.offset != --i); + + BUG_ON(i); + + bch2_trans_exit(&trans); +} + +static void test_iterate_extents(struct bch_fs *c, u64 nr) +{ + struct btree_trans trans; + struct btree_iter *iter; + struct bkey_s_c k; + u64 i; + int ret; + + bch2_trans_init(&trans, c, 0, 0); + + delete_test_keys(c); + + pr_info("inserting test extents"); + + for (i = 0; i < nr; i += 8) { + struct bkey_i_cookie k; + + bkey_cookie_init(&k.k_i); + k.k.p.offset = i + 8; + k.k.size = 8; + + ret = bch2_btree_insert(c, BTREE_ID_EXTENTS, &k.k_i, + NULL, NULL, 0); + BUG_ON(ret); + } + + pr_info("iterating forwards"); + + i = 0; + + for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, + POS_MIN, 0, k, ret) { + BUG_ON(bkey_start_offset(k.k) != i); + i = k.k->p.offset; + } + + BUG_ON(i != nr); + + pr_info("iterating backwards"); + + while (!IS_ERR_OR_NULL((k = bch2_btree_iter_prev(iter)).k)) { + BUG_ON(k.k->p.offset != i); + i = bkey_start_offset(k.k); + } + + BUG_ON(i); + + bch2_trans_exit(&trans); +} + 
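The tests in this new file exercise the btree_trans interface this merge introduces, where iterators are obtained from a transaction object rather than declared standalone. A minimal sketch of the iteration pattern, assuming a filesystem pointer c and using only calls that appear elsewhere in this patch (the btree ID and the loop body are illustrative, not part of the patch):

	struct btree_trans trans;
	struct btree_iter *iter;
	struct bkey_s_c k;
	int ret;

	bch2_trans_init(&trans, c, 0, 0);

	/* walk every key in the dirents btree, starting from POS_MIN: */
	for_each_btree_key(&trans, iter, BTREE_ID_DIRENTS, POS_MIN, 0, k, ret)
		pr_info("key at offset %llu", k.k->p.offset);

	/* bch2_trans_exit() can itself fail; keep the loop's error if any: */
	ret = bch2_trans_exit(&trans) ?: ret;
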
+static void test_iterate_slots(struct bch_fs *c, u64 nr) +{ + struct btree_trans trans; + struct btree_iter *iter; + struct bkey_s_c k; + u64 i; + int ret; + + bch2_trans_init(&trans, c, 0, 0); + + delete_test_keys(c); + + pr_info("inserting test keys"); + + for (i = 0; i < nr; i++) { + struct bkey_i_cookie k; + + bkey_cookie_init(&k.k_i); + k.k.p.offset = i * 2; + + ret = bch2_btree_insert(c, BTREE_ID_DIRENTS, &k.k_i, + NULL, NULL, 0); + BUG_ON(ret); + } + + pr_info("iterating forwards"); + + i = 0; + + for_each_btree_key(&trans, iter, BTREE_ID_DIRENTS, POS_MIN, + 0, k, ret) { + BUG_ON(k.k->p.offset != i); + i += 2; + } + bch2_trans_iter_free(&trans, iter); + + BUG_ON(i != nr * 2); + + pr_info("iterating forwards by slots"); + + i = 0; + + for_each_btree_key(&trans, iter, BTREE_ID_DIRENTS, POS_MIN, + BTREE_ITER_SLOTS, k, ret) { + BUG_ON(bkey_deleted(k.k) != (i & 1)); + BUG_ON(k.k->p.offset != i++); + + if (i == nr * 2) + break; + } + + bch2_trans_exit(&trans); +} + +static void test_iterate_slots_extents(struct bch_fs *c, u64 nr) +{ + struct btree_trans trans; + struct btree_iter *iter; + struct bkey_s_c k; + u64 i; + int ret; + + bch2_trans_init(&trans, c, 0, 0); + + delete_test_keys(c); + + pr_info("inserting test keys"); + + for (i = 0; i < nr; i += 16) { + struct bkey_i_cookie k; + + bkey_cookie_init(&k.k_i); + k.k.p.offset = i + 16; + k.k.size = 8; + + ret = bch2_btree_insert(c, BTREE_ID_EXTENTS, &k.k_i, + NULL, NULL, 0); + BUG_ON(ret); + } + + pr_info("iterating forwards"); + + i = 0; + + for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, POS_MIN, + 0, k, ret) { + BUG_ON(bkey_start_offset(k.k) != i + 8); + BUG_ON(k.k->size != 8); + i += 16; + } + bch2_trans_iter_free(&trans, iter); + + BUG_ON(i != nr); + + pr_info("iterating forwards by slots"); + + i = 0; + + for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, POS_MIN, + BTREE_ITER_SLOTS, k, ret) { + BUG_ON(bkey_deleted(k.k) != !(i % 16)); + + BUG_ON(bkey_start_offset(k.k) != i); + BUG_ON(k.k->size != 8); + i = k.k->p.offset; + + if (i == nr) + break; + } + + bch2_trans_exit(&trans); +} + +/* + * XXX: we really want to make sure we've got a btree with depth > 0 for these + * tests + */ +static void test_peek_end(struct bch_fs *c, u64 nr) +{ + struct btree_trans trans; + struct btree_iter *iter; + struct bkey_s_c k; + + bch2_trans_init(&trans, c, 0, 0); + + iter = bch2_trans_get_iter(&trans, BTREE_ID_DIRENTS, POS_MIN, 0); + + k = bch2_btree_iter_peek(iter); + BUG_ON(k.k); + + k = bch2_btree_iter_peek(iter); + BUG_ON(k.k); + + bch2_trans_exit(&trans); +} + +static void test_peek_end_extents(struct bch_fs *c, u64 nr) +{ + struct btree_trans trans; + struct btree_iter *iter; + struct bkey_s_c k; + + bch2_trans_init(&trans, c, 0, 0); + + iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, POS_MIN, 0); + + k = bch2_btree_iter_peek(iter); + BUG_ON(k.k); + + k = bch2_btree_iter_peek(iter); + BUG_ON(k.k); + + bch2_trans_exit(&trans); +} + +/* extent unit tests */ + +u64 test_version; + +static void insert_test_extent(struct bch_fs *c, + u64 start, u64 end) +{ + struct bkey_i_cookie k; + int ret; + + //pr_info("inserting %llu-%llu v %llu", start, end, test_version); + + bkey_cookie_init(&k.k_i); + k.k_i.k.p.offset = end; + k.k_i.k.size = end - start; + k.k_i.k.version.lo = test_version++; + + ret = bch2_btree_insert(c, BTREE_ID_EXTENTS, &k.k_i, + NULL, NULL, 0); + BUG_ON(ret); +} + +static void __test_extent_overwrite(struct bch_fs *c, + u64 e1_start, u64 e1_end, + u64 e2_start, u64 e2_end) +{ + insert_test_extent(c, e1_start, e1_end); + 
insert_test_extent(c, e2_start, e2_end); + + delete_test_keys(c); +} + +static void test_extent_overwrite_front(struct bch_fs *c, u64 nr) +{ + __test_extent_overwrite(c, 0, 64, 0, 32); + __test_extent_overwrite(c, 8, 64, 0, 32); +} + +static void test_extent_overwrite_back(struct bch_fs *c, u64 nr) +{ + __test_extent_overwrite(c, 0, 64, 32, 64); + __test_extent_overwrite(c, 0, 64, 32, 72); +} + +static void test_extent_overwrite_middle(struct bch_fs *c, u64 nr) +{ + __test_extent_overwrite(c, 0, 64, 32, 40); +} + +static void test_extent_overwrite_all(struct bch_fs *c, u64 nr) +{ + __test_extent_overwrite(c, 32, 64, 0, 64); + __test_extent_overwrite(c, 32, 64, 0, 128); + __test_extent_overwrite(c, 32, 64, 32, 64); + __test_extent_overwrite(c, 32, 64, 32, 128); +} + +/* perf tests */ + +static u64 test_rand(void) +{ + u64 v; +#if 0 + v = prandom_u32(); +#else + prandom_bytes(&v, sizeof(v)); +#endif + return v; +} + +static void rand_insert(struct bch_fs *c, u64 nr) +{ + struct bkey_i_cookie k; + int ret; + u64 i; + + for (i = 0; i < nr; i++) { + bkey_cookie_init(&k.k_i); + k.k.p.offset = test_rand(); + + ret = bch2_btree_insert(c, BTREE_ID_DIRENTS, &k.k_i, + NULL, NULL, 0); + BUG_ON(ret); + } +} + +static void rand_lookup(struct bch_fs *c, u64 nr) +{ + struct btree_trans trans; + struct btree_iter *iter; + struct bkey_s_c k; + u64 i; + + bch2_trans_init(&trans, c, 0, 0); + + for (i = 0; i < nr; i++) { + iter = bch2_trans_get_iter(&trans, BTREE_ID_DIRENTS, + POS(0, test_rand()), 0); + + k = bch2_btree_iter_peek(iter); + bch2_trans_iter_free(&trans, iter); + } + + bch2_trans_exit(&trans); +} + +static void rand_mixed(struct bch_fs *c, u64 nr) +{ + struct btree_trans trans; + struct btree_iter *iter; + struct bkey_s_c k; + int ret; + u64 i; + + bch2_trans_init(&trans, c, 0, 0); + + for (i = 0; i < nr; i++) { + iter = bch2_trans_get_iter(&trans, BTREE_ID_DIRENTS, + POS(0, test_rand()), 0); + + k = bch2_btree_iter_peek(iter); + + if (!(i & 3) && k.k) { + struct bkey_i_cookie k; + + bkey_cookie_init(&k.k_i); + k.k.p = iter->pos; + + bch2_trans_update(&trans, BTREE_INSERT_ENTRY(iter, &k.k_i)); + ret = bch2_trans_commit(&trans, NULL, NULL, 0); + BUG_ON(ret); + } + + bch2_trans_iter_free(&trans, iter); + } + + bch2_trans_exit(&trans); +} + +static void rand_delete(struct bch_fs *c, u64 nr) +{ + struct bkey_i k; + int ret; + u64 i; + + for (i = 0; i < nr; i++) { + bkey_init(&k.k); + k.k.p.offset = test_rand(); + + ret = bch2_btree_insert(c, BTREE_ID_DIRENTS, &k, + NULL, NULL, 0); + BUG_ON(ret); + } +} + +static void seq_insert(struct bch_fs *c, u64 nr) +{ + struct btree_trans trans; + struct btree_iter *iter; + struct bkey_s_c k; + struct bkey_i_cookie insert; + int ret; + u64 i = 0; + + bkey_cookie_init(&insert.k_i); + + bch2_trans_init(&trans, c, 0, 0); + + for_each_btree_key(&trans, iter, BTREE_ID_DIRENTS, POS_MIN, + BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) { + insert.k.p = iter->pos; + + bch2_trans_update(&trans, BTREE_INSERT_ENTRY(iter, &insert.k_i)); + ret = bch2_trans_commit(&trans, NULL, NULL, 0); + BUG_ON(ret); + + if (++i == nr) + break; + } + bch2_trans_exit(&trans); +} + +static void seq_lookup(struct bch_fs *c, u64 nr) +{ + struct btree_trans trans; + struct btree_iter *iter; + struct bkey_s_c k; + int ret; + + bch2_trans_init(&trans, c, 0, 0); + + for_each_btree_key(&trans, iter, BTREE_ID_DIRENTS, POS_MIN, 0, k, ret) + ; + bch2_trans_exit(&trans); +} + +static void seq_overwrite(struct bch_fs *c, u64 nr) +{ + struct btree_trans trans; + struct btree_iter *iter; + struct bkey_s_c k; + 
int ret; + + bch2_trans_init(&trans, c, 0, 0); + + for_each_btree_key(&trans, iter, BTREE_ID_DIRENTS, POS_MIN, + BTREE_ITER_INTENT, k, ret) { + struct bkey_i_cookie u; + + bkey_reassemble(&u.k_i, k); + + bch2_trans_update(&trans, BTREE_INSERT_ENTRY(iter, &u.k_i)); + ret = bch2_trans_commit(&trans, NULL, NULL, 0); + BUG_ON(ret); + } + bch2_trans_exit(&trans); +} + +static void seq_delete(struct bch_fs *c, u64 nr) +{ + int ret; + + ret = bch2_btree_delete_range(c, BTREE_ID_DIRENTS, + POS(0, 0), POS(0, U64_MAX), + NULL); + BUG_ON(ret); +} + +typedef void (*perf_test_fn)(struct bch_fs *, u64); + +struct test_job { + struct bch_fs *c; + u64 nr; + unsigned nr_threads; + perf_test_fn fn; + + atomic_t ready; + wait_queue_head_t ready_wait; + + atomic_t done; + struct completion done_completion; + + u64 start; + u64 finish; +}; + +static int btree_perf_test_thread(void *data) +{ + struct test_job *j = data; + + if (atomic_dec_and_test(&j->ready)) { + wake_up(&j->ready_wait); + j->start = sched_clock(); + } else { + wait_event(j->ready_wait, !atomic_read(&j->ready)); + } + + j->fn(j->c, j->nr / j->nr_threads); + + if (atomic_dec_and_test(&j->done)) { + j->finish = sched_clock(); + complete(&j->done_completion); + } + + return 0; +} + +void bch2_btree_perf_test(struct bch_fs *c, const char *testname, + u64 nr, unsigned nr_threads) +{ + struct test_job j = { .c = c, .nr = nr, .nr_threads = nr_threads }; + char name_buf[20], nr_buf[20], per_sec_buf[20]; + unsigned i; + u64 time; + + atomic_set(&j.ready, nr_threads); + init_waitqueue_head(&j.ready_wait); + + atomic_set(&j.done, nr_threads); + init_completion(&j.done_completion); + +#define perf_test(_test) \ + if (!strcmp(testname, #_test)) j.fn = _test + + perf_test(rand_insert); + perf_test(rand_lookup); + perf_test(rand_mixed); + perf_test(rand_delete); + + perf_test(seq_insert); + perf_test(seq_lookup); + perf_test(seq_overwrite); + perf_test(seq_delete); + + /* a unit test, not a perf test: */ + perf_test(test_delete); + perf_test(test_delete_written); + perf_test(test_iterate); + perf_test(test_iterate_extents); + perf_test(test_iterate_slots); + perf_test(test_iterate_slots_extents); + perf_test(test_peek_end); + perf_test(test_peek_end_extents); + + perf_test(test_extent_overwrite_front); + perf_test(test_extent_overwrite_back); + perf_test(test_extent_overwrite_middle); + perf_test(test_extent_overwrite_all); + + if (!j.fn) { + pr_err("unknown test %s", testname); + return; + } + + //pr_info("running test %s:", testname); + + if (nr_threads == 1) + btree_perf_test_thread(&j); + else + for (i = 0; i < nr_threads; i++) + kthread_run(btree_perf_test_thread, &j, + "bcachefs perf test[%u]", i); + + while (wait_for_completion_interruptible(&j.done_completion)) + ; + + time = j.finish - j.start; + + scnprintf(name_buf, sizeof(name_buf), "%s:", testname); + bch2_hprint(&PBUF(nr_buf), nr); + bch2_hprint(&PBUF(per_sec_buf), nr * NSEC_PER_SEC / time); + printk(KERN_INFO "%-12s %s with %u threads in %5llu sec, %5llu nsec per iter, %5s per sec\n", + name_buf, nr_buf, nr_threads, + time / NSEC_PER_SEC, + time * nr_threads / nr, + per_sec_buf); +} + +#endif /* CONFIG_BCACHEFS_TESTS */ diff --git a/fs/bcachefs/tests.h b/fs/bcachefs/tests.h new file mode 100644 index 000000000000..551d0764225e --- /dev/null +++ b/fs/bcachefs/tests.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_TEST_H +#define _BCACHEFS_TEST_H + +struct bch_fs; + +#ifdef CONFIG_BCACHEFS_TESTS + +void bch2_btree_perf_test(struct bch_fs *, const char *, u64, 
unsigned); + +#else + +#endif /* CONFIG_BCACHEFS_TESTS */ + +#endif /* _BCACHEFS_TEST_H */ diff --git a/fs/bcachefs/trace.c b/fs/bcachefs/trace.c index 13f0fc24a3f7..59e8dfa3d245 100644 --- a/fs/bcachefs/trace.c +++ b/fs/bcachefs/trace.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" #include "alloc_types.h" #include "buckets.h" diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c index 60e1f1ff44eb..173f60f28512 100644 --- a/fs/bcachefs/util.c +++ b/fs/bcachefs/util.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * random utiility code, for bcache but in theory not specific to bcache * @@ -24,68 +25,82 @@ #include "eytzinger.h" #include "util.h" -#define simple_strtoint(c, end, base) simple_strtol(c, end, base) -#define simple_strtouint(c, end, base) simple_strtoul(c, end, base) +static const char si_units[] = "?kMGTPEZY"; + +static int __bch2_strtoh(const char *cp, u64 *res, + u64 t_max, bool t_signed) +{ + bool positive = *cp != '-'; + unsigned u; + u64 v = 0; + + if (*cp == '+' || *cp == '-') + cp++; + + if (!isdigit(*cp)) + return -EINVAL; + + do { + if (v > U64_MAX / 10) + return -ERANGE; + v *= 10; + if (v > U64_MAX - (*cp - '0')) + return -ERANGE; + v += *cp - '0'; + cp++; + } while (isdigit(*cp)); + + for (u = 1; u < strlen(si_units); u++) + if (*cp == si_units[u]) { + cp++; + goto got_unit; + } + u = 0; +got_unit: + if (*cp == '\n') + cp++; + if (*cp) + return -EINVAL; + + if (fls64(v) + u * 10 > 64) + return -ERANGE; + + v <<= u * 10; + + if (positive) { + if (v > t_max) + return -ERANGE; + } else { + if (v && !t_signed) + return -ERANGE; + + if (v > t_max + 1) + return -ERANGE; + v = -v; + } + + *res = v; + return 0; +} #define STRTO_H(name, type) \ int bch2_ ## name ## _h(const char *cp, type *res) \ { \ - int u = 0; \ - char *e; \ - type i = simple_ ## name(cp, &e, 10); \ - \ - switch (tolower(*e)) { \ - default: \ - return -EINVAL; \ - case 'y': \ - case 'z': \ - u++; \ - case 'e': \ - u++; \ - case 'p': \ - u++; \ - case 't': \ - u++; \ - case 'g': \ - u++; \ - case 'm': \ - u++; \ - case 'k': \ - u++; \ - if (e++ == cp) \ - return -EINVAL; \ - case '\n': \ - case '\0': \ - if (*e == '\n') \ - e++; \ - } \ - \ - if (*e) \ - return -EINVAL; \ - \ - while (u--) { \ - if ((type) ~0 > 0 && \ - (type) ~0 / 1024 <= i) \ - return -EINVAL; \ - if ((i > 0 && ANYSINT_MAX(type) / 1024 < i) || \ - (i < 0 && -ANYSINT_MAX(type) / 1024 > i)) \ - return -EINVAL; \ - i *= 1024; \ - } \ - \ - *res = i; \ - return 0; \ -} \ + u64 v; \ + int ret = __bch2_strtoh(cp, &v, ANYSINT_MAX(type), \ + ANYSINT_MAX(type) != ((type) ~0ULL)); \ + *res = v; \ + return ret; \ +} STRTO_H(strtoint, int) STRTO_H(strtouint, unsigned int) STRTO_H(strtoll, long long) STRTO_H(strtoull, unsigned long long) +STRTO_H(strtou64, u64) -ssize_t bch2_hprint(char *buf, s64 v) +void bch2_hprint(struct printbuf *buf, s64 v) { - static const char units[] = "?kMGTPEZY"; - char dec[4] = ""; int u, t = 0; for (u = 0; v >= 1024 || v <= -1024; u++) { @@ -93,78 +108,47 @@ ssize_t bch2_hprint(char *buf, s64 v) v >>= 10; } - if (!u) - return sprintf(buf, "%lli", v); + pr_buf(buf, "%lli", v); /* * 103 is magic: t is in the range [-1023, 1023] and we want * to turn it into [-9, 9] */ - if (v < 100 && v > -100) - scnprintf(dec, sizeof(dec), ".%i", t / 103); - - return sprintf(buf, "%lli%s%c", v, dec, units[u]); + if (u && v < 100 && v > -100) + pr_buf(buf, ".%i", t / 103); + if (u) + pr_buf(buf, "%c", si_units[u]); } -ssize_t bch2_scnprint_string_list(char *buf, size_t size, - const char * const 
list[], - size_t selected) +void bch2_string_opt_to_text(struct printbuf *out, + const char * const list[], + size_t selected) { - char *out = buf; size_t i; - if (size) - *out = '\0'; - - for (i = 0; list[i]; i++) - out += scnprintf(out, buf + size - out, - i == selected ? "[%s] " : "%s ", list[i]); - - if (out != buf) - *--out = '\0'; - - return out - buf; -} - -ssize_t bch2_read_string_list(const char *buf, const char * const list[]) -{ - size_t i, len; - - buf = skip_spaces(buf); - - len = strlen(buf); - while (len && isspace(buf[len - 1])) - --len; - for (i = 0; list[i]; i++) - if (strlen(list[i]) == len && - !memcmp(buf, list[i], len)) - break; - - return list[i] ? i : -EINVAL; + pr_buf(out, i == selected ? "[%s] " : "%s ", list[i]); } -ssize_t bch2_scnprint_flag_list(char *buf, size_t size, - const char * const list[], u64 flags) +void bch2_flags_to_text(struct printbuf *out, + const char * const list[], u64 flags) { - char *out = buf, *end = buf + size; unsigned bit, nr = 0; + bool first = true; + + if (out->pos != out->end) + *out->pos = '\0'; while (list[nr]) nr++; - if (size) - *out = '\0'; - while (flags && (bit = __ffs(flags)) < nr) { - out += scnprintf(out, end - out, "%s,", list[bit]); + if (!first) + pr_buf(out, ","); + first = false; + pr_buf(out, "%s", list[bit]); flags ^= 1 << bit; } - - if (out != buf) - *--out = '\0'; - - return out - buf; } u64 bch2_read_flag_list(char *opt, const char * const list[]) @@ -178,7 +162,7 @@ u64 bch2_read_flag_list(char *opt, const char * const list[]) s = strim(d); while ((p = strsep(&s, ","))) { - int flag = bch2_read_string_list(p, list); + int flag = match_string(list, -1, p); if (flag < 0) { ret = -1; break; @@ -327,50 +311,50 @@ static const struct time_unit *pick_time_units(u64 ns) return u; } -static size_t pr_time_units(char *buf, size_t len, u64 ns) +static void pr_time_units(struct printbuf *out, u64 ns) { const struct time_unit *u = pick_time_units(ns); - return scnprintf(buf, len, "%llu %s", div_u64(ns, u->nsecs), u->name); + pr_buf(out, "%llu %s", div_u64(ns, u->nsecs), u->name); } size_t bch2_time_stats_print(struct time_stats *stats, char *buf, size_t len) { - char *out = buf, *end = buf + len; + struct printbuf out = _PBUF(buf, len); const struct time_unit *u; u64 freq = READ_ONCE(stats->average_frequency); u64 q, last_q = 0; int i; - out += scnprintf(out, end - out, "count:\t\t%llu\n", + pr_buf(&out, "count:\t\t%llu\n", stats->count); - out += scnprintf(out, end - out, "rate:\t\t%llu/sec\n", - freq ? div64_u64(NSEC_PER_SEC, freq) : 0); + pr_buf(&out, "rate:\t\t%llu/sec\n", + freq ? 
div64_u64(NSEC_PER_SEC, freq) : 0); - out += scnprintf(out, end - out, "frequency:\t"); - out += pr_time_units(out, end - out, freq); + pr_buf(&out, "frequency:\t"); + pr_time_units(&out, freq); - out += scnprintf(out, end - out, "\navg duration:\t"); - out += pr_time_units(out, end - out, stats->average_duration); + pr_buf(&out, "\navg duration:\t"); + pr_time_units(&out, stats->average_duration); - out += scnprintf(out, end - out, "\nmax duration:\t"); - out += pr_time_units(out, end - out, stats->max_duration); + pr_buf(&out, "\nmax duration:\t"); + pr_time_units(&out, stats->max_duration); i = eytzinger0_first(NR_QUANTILES); u = pick_time_units(stats->quantiles.entries[i].m); - out += scnprintf(out, end - out, "\nquantiles (%s):\t", u->name); + pr_buf(&out, "\nquantiles (%s):\t", u->name); eytzinger0_for_each(i, NR_QUANTILES) { bool is_last = eytzinger0_next(i, NR_QUANTILES) == -1; q = max(stats->quantiles.entries[i].m, last_q); - out += scnprintf(out, end - out, "%llu%s", - div_u64(q, u->nsecs), - is_last ? "\n" : " "); + pr_buf(&out, "%llu%s", + div_u64(q, u->nsecs), + is_last ? "\n" : " "); last_q = q; } - return out - buf; + return out.pos - buf; } void bch2_time_stats_exit(struct time_stats *stats) @@ -422,27 +406,6 @@ void bch2_ratelimit_increment(struct bch_ratelimit *d, u64 done) d->next = now - NSEC_PER_SEC * 2; } -int bch2_ratelimit_wait_freezable_stoppable(struct bch_ratelimit *d) -{ - bool kthread = (current->flags & PF_KTHREAD) != 0; - - while (1) { - u64 delay = bch2_ratelimit_delay(d); - - if (delay) - set_current_state(TASK_INTERRUPTIBLE); - - if (kthread && kthread_should_stop()) - return 1; - - if (!delay) - return 0; - - schedule_timeout(delay); - try_to_freeze(); - } -} - /* pd controller: */ /* @@ -518,12 +481,12 @@ size_t bch2_pd_controller_print_debug(struct bch_pd_controller *pd, char *buf) char change[21]; s64 next_io; - bch2_hprint(rate, pd->rate.rate); - bch2_hprint(actual, pd->last_actual); - bch2_hprint(target, pd->last_target); - bch2_hprint(proportional, pd->last_proportional); - bch2_hprint(derivative, pd->last_derivative); - bch2_hprint(change, pd->last_change); + bch2_hprint(&PBUF(rate), pd->rate.rate); + bch2_hprint(&PBUF(actual), pd->last_actual); + bch2_hprint(&PBUF(target), pd->last_target); + bch2_hprint(&PBUF(proportional), pd->last_proportional); + bch2_hprint(&PBUF(derivative), pd->last_derivative); + bch2_hprint(&PBUF(change), pd->last_change); next_io = div64_s64(pd->rate.next - local_clock(), NSEC_PER_MSEC); @@ -548,15 +511,17 @@ void bch2_bio_map(struct bio *bio, void *base) BUG_ON(!bio->bi_iter.bi_size); BUG_ON(bio->bi_vcnt); + BUG_ON(!bio->bi_max_vecs); bv->bv_offset = base ? offset_in_page(base) : 0; goto start; for (; size; bio->bi_vcnt++, bv++) { + BUG_ON(bio->bi_vcnt >= bio->bi_max_vecs); + bv->bv_offset = 0; start: bv->bv_len = min_t(size_t, PAGE_SIZE - bv->bv_offset, size); - BUG_ON(bio->bi_vcnt >= bio->bi_max_vecs); if (base) { bv->bv_page = is_vmalloc_addr(base) ? 
vmalloc_to_page(base) @@ -571,10 +536,11 @@ start: bv->bv_len = min_t(size_t, PAGE_SIZE - bv->bv_offset, int bch2_bio_alloc_pages(struct bio *bio, gfp_t gfp_mask) { - int i; + struct bvec_iter_all iter; struct bio_vec *bv; + int i; - bio_for_each_segment_all(bv, bio, i) { + bio_for_each_segment_all(bv, bio, i, iter) { bv->bv_page = alloc_page(gfp_mask); if (!bv->bv_page) { while (--bv >= bio->bi_io_vec) @@ -629,18 +595,17 @@ void memcpy_from_bio(void *dst, struct bio *src, struct bvec_iter src_iter) } } -size_t bch_scnmemcpy(char *buf, size_t size, const char *src, size_t len) +void bch_scnmemcpy(struct printbuf *out, + const char *src, size_t len) { - size_t n; + size_t n = printbuf_remaining(out); - if (!size) - return 0; - - n = min(size - 1, len); - memcpy(buf, src, n); - buf[n] = '\0'; - - return n; + if (n) { + n = min(n - 1, len); + memcpy(out->pos, src, n); + out->pos += n; + *out->pos = '\0'; + } } #include "eytzinger.h" @@ -935,3 +900,28 @@ void eytzinger0_find_test(void) kfree(test_array); } #endif + +/* + * Accumulate percpu counters onto one cpu's copy - only valid when access + * against any percpu counter is guarded against + */ +u64 *bch2_acc_percpu_u64s(u64 __percpu *p, unsigned nr) +{ + u64 *ret; + int cpu; + + preempt_disable(); + ret = this_cpu_ptr(p); + preempt_enable(); + + for_each_possible_cpu(cpu) { + u64 *i = per_cpu_ptr(p, cpu); + + if (i != ret) { + acc_u64s(ret, i, nr); + memset(i, 0, nr * sizeof(u64)); + } + } + + return ret; +} diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h index 184915593e86..310e958c6cdf 100644 --- a/fs/bcachefs/util.h +++ b/fs/bcachefs/util.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _BCACHEFS_UTIL_H #define _BCACHEFS_UTIL_H @@ -10,6 +11,8 @@ #include <linux/sched/clock.h> #include <linux/llist.h> #include <linux/log2.h> +#include <linux/percpu.h> +#include <linux/preempt.h> #include <linux/ratelimit.h> #include <linux/slab.h> #include <linux/vmalloc.h> @@ -61,13 +64,6 @@ struct closure; #endif -#ifndef __CHECKER__ -#define __flatten __attribute__((flatten)) -#else -/* sparse doesn't know about attribute((flatten)) */ -#define __flatten -#endif - #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ #define CPU_BIG_ENDIAN 0 #elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ @@ -83,6 +79,14 @@ struct closure; (__builtin_types_compatible_p(typeof(_val), _type) || \ __builtin_types_compatible_p(typeof(_val), const _type)) +/* Userspace doesn't align allocations as nicely as the kernel allocators: */ +static inline size_t buf_pages(void *p, size_t len) +{ + return DIV_ROUND_UP(len + + ((unsigned long) p & (PAGE_SIZE - 1)), + PAGE_SIZE); +} + static inline void vpfree(void *p, size_t size) { if (is_vmalloc_addr(p)) @@ -137,7 +141,19 @@ do { \ (heap)->data = NULL; \ } while (0) -#define heap_swap(h, i, j) swap((h)->data[i], (h)->data[j]) +#define heap_set_backpointer(h, i, _fn) \ +do { \ + void (*fn)(typeof(h), size_t) = _fn; \ + if (fn) \ + fn(h, i); \ +} while (0) + +#define heap_swap(h, i, j, set_backpointer) \ +do { \ + swap((h)->data[i], (h)->data[j]); \ + heap_set_backpointer(h, i, set_backpointer); \ + heap_set_backpointer(h, j, set_backpointer); \ +} while (0) #define heap_peek(h) \ ({ \ @@ -147,7 +163,7 @@ do { \ #define heap_full(h) ((h)->used == (h)->size) -#define heap_sift_down(h, i, cmp) \ +#define heap_sift_down(h, i, cmp, set_backpointer) \ do { \ size_t _c, _j = i; \ \ @@ -159,132 +175,111 @@ do { \ \ if (cmp(h, (h)->data[_c], (h)->data[_j]) >= 0) \ break; \ - heap_swap(h, _c, _j); \ + heap_swap(h, _c, _j, 
set_backpointer); \ } \ } while (0) -#define heap_sift_up(h, i, cmp) \ +#define heap_sift_up(h, i, cmp, set_backpointer) \ do { \ while (i) { \ size_t p = (i - 1) / 2; \ if (cmp(h, (h)->data[i], (h)->data[p]) >= 0) \ break; \ - heap_swap(h, i, p); \ + heap_swap(h, i, p, set_backpointer); \ i = p; \ } \ } while (0) -#define __heap_add(h, d, cmp) \ -do { \ +#define __heap_add(h, d, cmp, set_backpointer) \ +({ \ size_t _i = (h)->used++; \ (h)->data[_i] = d; \ + heap_set_backpointer(h, _i, set_backpointer); \ \ - heap_sift_up(h, _i, cmp); \ -} while (0) + heap_sift_up(h, _i, cmp, set_backpointer); \ + _i; \ +}) -#define heap_add(h, d, cmp) \ +#define heap_add(h, d, cmp, set_backpointer) \ ({ \ bool _r = !heap_full(h); \ if (_r) \ - __heap_add(h, d, cmp); \ + __heap_add(h, d, cmp, set_backpointer); \ _r; \ }) -#define heap_add_or_replace(h, new, cmp) \ +#define heap_add_or_replace(h, new, cmp, set_backpointer) \ do { \ - if (!heap_add(h, new, cmp) && \ + if (!heap_add(h, new, cmp, set_backpointer) && \ cmp(h, new, heap_peek(h)) >= 0) { \ (h)->data[0] = new; \ - heap_sift_down(h, 0, cmp); \ + heap_set_backpointer(h, 0, set_backpointer); \ + heap_sift_down(h, 0, cmp, set_backpointer); \ } \ } while (0) -#define heap_del(h, i, cmp) \ +#define heap_del(h, i, cmp, set_backpointer) \ do { \ size_t _i = (i); \ \ BUG_ON(_i >= (h)->used); \ (h)->used--; \ - heap_swap(h, _i, (h)->used); \ - heap_sift_up(h, _i, cmp); \ - heap_sift_down(h, _i, cmp); \ + heap_swap(h, _i, (h)->used, set_backpointer); \ + heap_sift_up(h, _i, cmp, set_backpointer); \ + heap_sift_down(h, _i, cmp, set_backpointer); \ } while (0) -#define heap_pop(h, d, cmp) \ +#define heap_pop(h, d, cmp, set_backpointer) \ ({ \ bool _r = (h)->used; \ if (_r) { \ (d) = (h)->data[0]; \ - heap_del(h, 0, cmp); \ + heap_del(h, 0, cmp, set_backpointer); \ } \ _r; \ }) -#define heap_resort(heap, cmp) \ +#define heap_resort(heap, cmp, set_backpointer) \ do { \ ssize_t _i; \ for (_i = (ssize_t) (heap)->used / 2 - 1; _i >= 0; --_i) \ - heap_sift_down(heap, _i, cmp); \ + heap_sift_down(heap, _i, cmp, set_backpointer); \ } while (0) -/* - * Simple array based allocator - preallocates a number of elements and you can - * never allocate more than that, also has no locking. - * - * Handy because if you know you only need a fixed number of elements you don't - * have to worry about memory allocation failure, and sometimes a mempool isn't - * what you want. - * - * We treat the free elements as entries in a singly linked list, and the - * freelist as a stack - allocating and freeing push and pop off the freelist. - */ - -#define DECLARE_ARRAY_ALLOCATOR(type, name, size) \ - struct { \ - type *freelist; \ - type data[size]; \ - } name - -#define array_alloc(array) \ -({ \ - typeof((array)->freelist) _ret = (array)->freelist; \ - \ - if (_ret) \ - (array)->freelist = *((typeof((array)->freelist) *) _ret);\ - \ - _ret; \ -}) +#define ANYSINT_MAX(t) \ + ((((t) 1 << (sizeof(t) * 8 - 2)) - (t) 1) * (t) 2 + (t) 1) -#define array_free(array, ptr) \ -do { \ - typeof((array)->freelist) _ptr = ptr; \ - \ - *((typeof((array)->freelist) *) _ptr) = (array)->freelist; \ - (array)->freelist = _ptr; \ -} while (0) +struct printbuf { + char *pos; + char *end; +}; -#define array_allocator_init(array) \ +static inline size_t printbuf_remaining(struct printbuf *buf) +{ + return buf->end - buf->pos; +} + +#define _PBUF(_buf, _len) \ + ((struct printbuf) { \ + .pos = _buf, \ + .end = _buf + _len, \ + }) + +#define PBUF(_buf) _PBUF(_buf, sizeof(_buf)) + +#define pr_buf(_out, ...) 
\ do { \ - typeof((array)->freelist) _i; \ - \ - BUILD_BUG_ON(sizeof((array)->data[0]) < sizeof(void *)); \ - (array)->freelist = NULL; \ - \ - for (_i = (array)->data; \ - _i < (array)->data + ARRAY_SIZE((array)->data); \ - _i++) \ - array_free(array, _i); \ + (_out)->pos += scnprintf((_out)->pos, printbuf_remaining(_out), \ + __VA_ARGS__); \ } while (0) -#define array_freelist_empty(array) ((array)->freelist == NULL) - -#define ANYSINT_MAX(t) \ - ((((t) 1 << (sizeof(t) * 8 - 2)) - (t) 1) * (t) 2 + (t) 1) +void bch_scnmemcpy(struct printbuf *, const char *, size_t); int bch2_strtoint_h(const char *, int *); int bch2_strtouint_h(const char *, unsigned int *); int bch2_strtoll_h(const char *, long long *); int bch2_strtoull_h(const char *, unsigned long long *); +int bch2_strtou64_h(const char *, u64 *); static inline int bch2_strtol_h(const char *cp, long *res) { @@ -353,15 +348,14 @@ static inline int bch2_strtoul_h(const char *cp, long *res) : type_is(var, char *) ? "%s\n" \ : "%i\n", var) -ssize_t bch2_hprint(char *buf, s64 v); +void bch2_hprint(struct printbuf *, s64); bool bch2_is_zero(const void *, size_t); -ssize_t bch2_scnprint_string_list(char *, size_t, const char * const[], size_t); +void bch2_string_opt_to_text(struct printbuf *, + const char * const [], size_t); -ssize_t bch2_read_string_list(const char *, const char * const[]); - -ssize_t bch2_scnprint_flag_list(char *, size_t, const char * const[], u64); +void bch2_flags_to_text(struct printbuf *, const char * const[], u64); u64 bch2_read_flag_list(char *, const char * const[]); #define NR_QUANTILES 15 @@ -436,7 +430,6 @@ static inline void bch2_ratelimit_reset(struct bch_ratelimit *d) u64 bch2_ratelimit_delay(struct bch_ratelimit *); void bch2_ratelimit_increment(struct bch_ratelimit *, u64); -int bch2_ratelimit_wait_freezable_stoppable(struct bch_ratelimit *); struct bch_pd_controller { struct bch_ratelimit rate; @@ -500,96 +493,12 @@ do { \ (var)->p_term_inverse, 1, INT_MAX); \ } while (0) -#define __DIV_SAFE(n, d, zero) \ -({ \ - typeof(n) _n = (n); \ - typeof(d) _d = (d); \ - _d ? _n / _d : zero; \ -}) - -#define DIV_SAFE(n, d) __DIV_SAFE(n, d, 0) - #define container_of_or_null(ptr, type, member) \ ({ \ typeof(ptr) _ptr = ptr; \ _ptr ? container_of(_ptr, type, member) : NULL; \ }) -#define RB_INSERT(root, new, member, cmp) \ -({ \ - __label__ dup; \ - struct rb_node **n = &(root)->rb_node, *parent = NULL; \ - typeof(new) this; \ - int res, ret = -1; \ - \ - while (*n) { \ - parent = *n; \ - this = container_of(*n, typeof(*(new)), member); \ - res = cmp(new, this); \ - if (!res) \ - goto dup; \ - n = res < 0 \ - ? &(*n)->rb_left \ - : &(*n)->rb_right; \ - } \ - \ - rb_link_node(&(new)->member, parent, n); \ - rb_insert_color(&(new)->member, root); \ - ret = 0; \ -dup: \ - ret; \ -}) - -#define RB_SEARCH(root, search, member, cmp) \ -({ \ - struct rb_node *n = (root)->rb_node; \ - typeof(&(search)) this, ret = NULL; \ - int res; \ - \ - while (n) { \ - this = container_of(n, typeof(search), member); \ - res = cmp(&(search), this); \ - if (!res) { \ - ret = this; \ - break; \ - } \ - n = res < 0 \ - ? 
n->rb_left \ - : n->rb_right; \ - } \ - ret; \ -}) - -#define RB_GREATER(root, search, member, cmp) \ -({ \ - struct rb_node *n = (root)->rb_node; \ - typeof(&(search)) this, ret = NULL; \ - int res; \ - \ - while (n) { \ - this = container_of(n, typeof(search), member); \ - res = cmp(&(search), this); \ - if (res < 0) { \ - ret = this; \ - n = n->rb_left; \ - } else \ - n = n->rb_right; \ - } \ - ret; \ -}) - -#define RB_FIRST(root, type, member) \ - container_of_or_null(rb_first(root), type, member) - -#define RB_LAST(root, type, member) \ - container_of_or_null(rb_last(root), type, member) - -#define RB_NEXT(ptr, member) \ - container_of_or_null(rb_next(&(ptr)->member), typeof(*ptr), member) - -#define RB_PREV(ptr, member) \ - container_of_or_null(rb_prev(&(ptr)->member), typeof(*ptr), member) - /* Does linear interpolation between powers of two */ static inline unsigned fract_exp_two(unsigned x, unsigned fract_bits) { @@ -748,8 +657,6 @@ static inline struct bio_vec next_contig_bvec(struct bio *bio, #define bio_for_each_contig_segment(bv, bio, iter) \ __bio_for_each_contig_segment(bv, bio, iter, (bio)->bi_iter) -size_t bch_scnmemcpy(char *, size_t, const char *, size_t); - void sort_cmp_size(void *base, size_t num, size_t size, int (*cmp_func)(const void *, const void *, size_t), void (*swap_func)(void *, void *, size_t)); @@ -793,4 +700,55 @@ do { \ } \ } while (0) +static inline u64 percpu_u64_get(u64 __percpu *src) +{ + u64 ret = 0; + int cpu; + + for_each_possible_cpu(cpu) + ret += *per_cpu_ptr(src, cpu); + return ret; +} + +static inline void percpu_u64_set(u64 __percpu *dst, u64 src) +{ + int cpu; + + for_each_possible_cpu(cpu) + *per_cpu_ptr(dst, cpu) = 0; + + preempt_disable(); + *this_cpu_ptr(dst) = src; + preempt_enable(); +} + +static inline void acc_u64s(u64 *acc, const u64 *src, unsigned nr) +{ + unsigned i; + + for (i = 0; i < nr; i++) + acc[i] += src[i]; +} + +static inline void acc_u64s_percpu(u64 *acc, const u64 __percpu *src, + unsigned nr) +{ + int cpu; + + for_each_possible_cpu(cpu) + acc_u64s(acc, per_cpu_ptr(src, cpu), nr); +} + +static inline void percpu_memset(void __percpu *p, int c, size_t bytes) +{ + int cpu; + + for_each_possible_cpu(cpu) + memset(per_cpu_ptr(p, cpu), c, bytes); +} + +u64 *bch2_acc_percpu_u64s(u64 __percpu *, unsigned); + +#define cmp_int(l, r) ((l > r) - (l < r)) + #endif /* _BCACHEFS_UTIL_H */ diff --git a/fs/bcachefs/vstructs.h b/fs/bcachefs/vstructs.h index 795664428876..c099cdc0605f 100644 --- a/fs/bcachefs/vstructs.h +++ b/fs/bcachefs/vstructs.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _VSTRUCTS_H #define _VSTRUCTS_H diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c index c89c7200a1b4..9b8f6f1f9a77 100644 --- a/fs/bcachefs/xattr.c +++ b/fs/bcachefs/xattr.c @@ -1,8 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" #include "bkey_methods.h" #include "btree_update.h" -#include "compress.h" #include "extents.h" #include "fs.h" #include "rebalance.h" @@ -13,24 +13,8 @@ #include <linux/posix_acl_xattr.h> #include <linux/xattr.h> -static unsigned xattr_val_u64s(unsigned name_len, unsigned val_len) -{ - return DIV_ROUND_UP(sizeof(struct bch_xattr) + - name_len + val_len, sizeof(u64)); -} - -#define xattr_val(_xattr) ((_xattr)->x_name + (_xattr)->x_name_len) - static const struct xattr_handler *bch2_xattr_type_to_handler(unsigned); -struct xattr_search_key { - u8 type; - struct qstr name; -}; - -#define X_SEARCH(_type, _name, _len) ((struct xattr_search_key) \ - { .type = _type, .name = 
QSTR_INIT(_name, _len) }) - static u64 bch2_xattr_hash(const struct bch_hash_info *info, const struct xattr_search_key *key) { @@ -78,8 +62,7 @@ static bool xattr_cmp_bkey(struct bkey_s_c _l, struct bkey_s_c _r) const struct bch_hash_desc bch2_xattr_hash_desc = { .btree_id = BTREE_ID_XATTRS, - .key_type = BCH_XATTR, - .whiteout_type = BCH_XATTR_WHITEOUT, + .key_type = KEY_TYPE_xattr, .hash_key = xattr_hash_key, .hash_bkey = xattr_hash_bkey, .cmp_key = xattr_cmp_key, @@ -89,90 +72,74 @@ const struct bch_hash_desc bch2_xattr_hash_desc = { const char *bch2_xattr_invalid(const struct bch_fs *c, struct bkey_s_c k) { const struct xattr_handler *handler; - struct bkey_s_c_xattr xattr; - unsigned u64s; - - switch (k.k->type) { - case BCH_XATTR: - if (bkey_val_bytes(k.k) < sizeof(struct bch_xattr)) - return "value too small"; - - xattr = bkey_s_c_to_xattr(k); - u64s = xattr_val_u64s(xattr.v->x_name_len, - le16_to_cpu(xattr.v->x_val_len)); + struct bkey_s_c_xattr xattr = bkey_s_c_to_xattr(k); - if (bkey_val_u64s(k.k) < u64s) - return "value too small"; + if (bkey_val_bytes(k.k) < sizeof(struct bch_xattr)) + return "value too small"; - if (bkey_val_u64s(k.k) > u64s) - return "value too big"; + if (bkey_val_u64s(k.k) < + xattr_val_u64s(xattr.v->x_name_len, + le16_to_cpu(xattr.v->x_val_len))) + return "value too small"; - handler = bch2_xattr_type_to_handler(xattr.v->x_type); - if (!handler) - return "invalid type"; + if (bkey_val_u64s(k.k) > + xattr_val_u64s(xattr.v->x_name_len, + le16_to_cpu(xattr.v->x_val_len) + 4)) + return "value too big"; - if (memchr(xattr.v->x_name, '\0', xattr.v->x_name_len)) - return "xattr name has invalid characters"; + handler = bch2_xattr_type_to_handler(xattr.v->x_type); + if (!handler) + return "invalid type"; - return NULL; - case BCH_XATTR_WHITEOUT: - return bkey_val_bytes(k.k) != 0 - ? 
"value size should be zero" - : NULL; + if (memchr(xattr.v->x_name, '\0', xattr.v->x_name_len)) + return "xattr name has invalid characters"; - default: - return "invalid type"; - } + return NULL; } -void bch2_xattr_to_text(struct bch_fs *c, char *buf, - size_t size, struct bkey_s_c k) +void bch2_xattr_to_text(struct printbuf *out, struct bch_fs *c, + struct bkey_s_c k) { const struct xattr_handler *handler; - struct bkey_s_c_xattr xattr; - size_t n = 0; - - switch (k.k->type) { - case BCH_XATTR: - xattr = bkey_s_c_to_xattr(k); - - handler = bch2_xattr_type_to_handler(xattr.v->x_type); - if (handler && handler->prefix) - n += scnprintf(buf + n, size - n, "%s", handler->prefix); - else if (handler) - n += scnprintf(buf + n, size - n, "(type %u)", - xattr.v->x_type); - else - n += scnprintf(buf + n, size - n, "(unknown type %u)", - xattr.v->x_type); - - n += bch_scnmemcpy(buf + n, size - n, xattr.v->x_name, - xattr.v->x_name_len); - n += scnprintf(buf + n, size - n, ":"); - n += bch_scnmemcpy(buf + n, size - n, xattr_val(xattr.v), - le16_to_cpu(xattr.v->x_val_len)); - break; - case BCH_XATTR_WHITEOUT: - scnprintf(buf, size, "whiteout"); - break; - } + struct bkey_s_c_xattr xattr = bkey_s_c_to_xattr(k); + + handler = bch2_xattr_type_to_handler(xattr.v->x_type); + if (handler && handler->prefix) + pr_buf(out, "%s", handler->prefix); + else if (handler) + pr_buf(out, "(type %u)", xattr.v->x_type); + else + pr_buf(out, "(unknown type %u)", xattr.v->x_type); + + bch_scnmemcpy(out, xattr.v->x_name, + xattr.v->x_name_len); + pr_buf(out, ":"); + bch_scnmemcpy(out, xattr_val(xattr.v), + le16_to_cpu(xattr.v->x_val_len)); } int bch2_xattr_get(struct bch_fs *c, struct bch_inode_info *inode, - const char *name, void *buffer, size_t size, int type) + const char *name, void *buffer, size_t size, int type) { - struct btree_iter iter; - struct bkey_s_c k; + struct btree_trans trans; + struct btree_iter *iter; struct bkey_s_c_xattr xattr; int ret; - k = bch2_hash_lookup(bch2_xattr_hash_desc, &inode->ei_str_hash, c, - inode->v.i_ino, &iter, - &X_SEARCH(type, name, strlen(name))); - if (IS_ERR(k.k)) - return bch2_btree_iter_unlock(&iter) ?: -ENODATA; + bch2_trans_init(&trans, c, 0, 0); + + iter = bch2_hash_lookup(&trans, bch2_xattr_hash_desc, + &inode->ei_str_hash, inode->v.i_ino, + &X_SEARCH(type, name, strlen(name)), + 0); + if (IS_ERR(iter)) { + bch2_trans_exit(&trans); + BUG_ON(PTR_ERR(iter) == -EINTR); - xattr = bkey_s_c_to_xattr(k); + return PTR_ERR(iter) == -ENOENT ? 
-ENODATA : PTR_ERR(iter); + } + + xattr = bkey_s_c_to_xattr(bch2_btree_iter_peek_slot(iter)); ret = le16_to_cpu(xattr.v->x_val_len); if (buffer) { if (ret > size) @@ -181,48 +148,48 @@ int bch2_xattr_get(struct bch_fs *c, struct bch_inode_info *inode, memcpy(buffer, xattr_val(xattr.v), ret); } - bch2_btree_iter_unlock(&iter); + bch2_trans_exit(&trans); return ret; } -int __bch2_xattr_set(struct bch_fs *c, u64 inum, - const struct bch_hash_info *hash_info, - const char *name, const void *value, size_t size, - int flags, int type, u64 *journal_seq) +int bch2_xattr_set(struct btree_trans *trans, u64 inum, + const struct bch_hash_info *hash_info, + const char *name, const void *value, size_t size, + int type, int flags) { - struct xattr_search_key search = X_SEARCH(type, name, strlen(name)); int ret; - if (!value) { - ret = bch2_hash_delete(bch2_xattr_hash_desc, hash_info, - c, inum, - journal_seq, &search); - } else { + if (value) { struct bkey_i_xattr *xattr; + unsigned namelen = strlen(name); unsigned u64s = BKEY_U64s + - xattr_val_u64s(search.name.len, size); + xattr_val_u64s(namelen, size); if (u64s > U8_MAX) return -ERANGE; - xattr = kmalloc(u64s * sizeof(u64), GFP_NOFS); - if (!xattr) - return -ENOMEM; + xattr = bch2_trans_kmalloc(trans, u64s * sizeof(u64)); + if (IS_ERR(xattr)) + return PTR_ERR(xattr); bkey_xattr_init(&xattr->k_i); xattr->k.u64s = u64s; xattr->v.x_type = type; - xattr->v.x_name_len = search.name.len; + xattr->v.x_name_len = namelen; xattr->v.x_val_len = cpu_to_le16(size); - memcpy(xattr->v.x_name, search.name.name, search.name.len); + memcpy(xattr->v.x_name, name, namelen); memcpy(xattr_val(&xattr->v), value, size); - ret = bch2_hash_set(bch2_xattr_hash_desc, hash_info, c, - inum, journal_seq, - &xattr->k_i, - (flags & XATTR_CREATE ? BCH_HASH_SET_MUST_CREATE : 0)| - (flags & XATTR_REPLACE ? BCH_HASH_SET_MUST_REPLACE : 0)); - kfree(xattr); + ret = bch2_hash_set(trans, bch2_xattr_hash_desc, hash_info, + inum, &xattr->k_i, + (flags & XATTR_CREATE ? BCH_HASH_SET_MUST_CREATE : 0)| + (flags & XATTR_REPLACE ? 
BCH_HASH_SET_MUST_REPLACE : 0)); + } else { + struct xattr_search_key search = + X_SEARCH(type, name, strlen(name)); + + ret = bch2_hash_delete(trans, bch2_xattr_hash_desc, + hash_info, inum, &search); } if (ret == -ENOENT) @@ -231,78 +198,115 @@ int __bch2_xattr_set(struct bch_fs *c, u64 inum, return ret; } -int bch2_xattr_set(struct bch_fs *c, struct bch_inode_info *inode, - const char *name, const void *value, size_t size, - int flags, int type) +struct xattr_buf { + char *buf; + size_t len; + size_t used; +}; + +static int __bch2_xattr_emit(const char *prefix, + const char *name, size_t name_len, + struct xattr_buf *buf) { - return __bch2_xattr_set(c, inode->v.i_ino, &inode->ei_str_hash, - name, value, size, flags, type, - &inode->ei_journal_seq); + const size_t prefix_len = strlen(prefix); + const size_t total_len = prefix_len + name_len + 1; + + if (buf->buf) { + if (buf->used + total_len > buf->len) + return -ERANGE; + + memcpy(buf->buf + buf->used, prefix, prefix_len); + memcpy(buf->buf + buf->used + prefix_len, + name, name_len); + buf->buf[buf->used + prefix_len + name_len] = '\0'; + } + + buf->used += total_len; + return 0; } -static size_t bch2_xattr_emit(struct dentry *dentry, - const struct bch_xattr *xattr, - char *buffer, size_t buffer_size) +static int bch2_xattr_emit(struct dentry *dentry, + const struct bch_xattr *xattr, + struct xattr_buf *buf) { const struct xattr_handler *handler = bch2_xattr_type_to_handler(xattr->x_type); - if (handler && (!handler->list || handler->list(dentry))) { - const char *prefix = handler->prefix ?: handler->name; - const size_t prefix_len = strlen(prefix); - const size_t total_len = prefix_len + xattr->x_name_len + 1; + return handler && (!handler->list || handler->list(dentry)) + ? __bch2_xattr_emit(handler->prefix ?: handler->name, + xattr->x_name, xattr->x_name_len, buf) + : 0; +} - if (buffer && total_len <= buffer_size) { - memcpy(buffer, prefix, prefix_len); - memcpy(buffer + prefix_len, - xattr->x_name, xattr->x_name_len); - buffer[prefix_len + xattr->x_name_len] = '\0'; - } +static int bch2_xattr_list_bcachefs(struct bch_fs *c, + struct bch_inode_info *inode, + struct xattr_buf *buf, + bool all) +{ + const char *prefix = all ? "bcachefs_effective." 
: "bcachefs."; + unsigned id; + int ret = 0; + u64 v; - return total_len; - } else { - return 0; + for (id = 0; id < Inode_opt_nr; id++) { + v = bch2_inode_opt_get(&inode->ei_inode, id); + if (!v) + continue; + + if (!all && + !(inode->ei_inode.bi_fields_set & (1 << id))) + continue; + + ret = __bch2_xattr_emit(prefix, bch2_inode_opts[id], + strlen(bch2_inode_opts[id]), buf); + if (ret) + break; } + + return ret; } ssize_t bch2_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size) { struct bch_fs *c = dentry->d_sb->s_fs_info; - struct btree_iter iter; + struct bch_inode_info *inode = to_bch_ei(dentry->d_inode); + struct btree_trans trans; + struct btree_iter *iter; struct bkey_s_c k; - const struct bch_xattr *xattr; + struct xattr_buf buf = { .buf = buffer, .len = buffer_size }; u64 inum = dentry->d_inode->i_ino; - ssize_t ret = 0; - size_t len; + int ret; - for_each_btree_key(&iter, c, BTREE_ID_XATTRS, POS(inum, 0), 0, k) { + bch2_trans_init(&trans, c, 0, 0); + + for_each_btree_key(&trans, iter, BTREE_ID_XATTRS, + POS(inum, 0), 0, k, ret) { BUG_ON(k.k->p.inode < inum); if (k.k->p.inode > inum) break; - if (k.k->type != BCH_XATTR) + if (k.k->type != KEY_TYPE_xattr) continue; - xattr = bkey_s_c_to_xattr(k).v; + ret = bch2_xattr_emit(dentry, bkey_s_c_to_xattr(k).v, &buf); + if (ret) + break; + } + ret = bch2_trans_exit(&trans) ?: ret; - len = bch2_xattr_emit(dentry, xattr, buffer, buffer_size); - if (buffer) { - if (len > buffer_size) { - bch2_btree_iter_unlock(&iter); - return -ERANGE; - } + if (ret) + return ret; - buffer += len; - buffer_size -= len; - } + ret = bch2_xattr_list_bcachefs(c, inode, &buf, false); + if (ret) + return ret; - ret += len; + ret = bch2_xattr_list_bcachefs(c, inode, &buf, true); + if (ret) + return ret; - } - bch2_btree_iter_unlock(&iter); - - return ret; + return buf.used; } static int bch2_xattr_get_handler(const struct xattr_handler *handler, @@ -323,15 +327,18 @@ static int bch2_xattr_set_handler(const struct xattr_handler *handler, struct bch_inode_info *inode = to_bch_ei(vinode); struct bch_fs *c = inode->v.i_sb->s_fs_info; - return bch2_xattr_set(c, inode, name, value, size, flags, - handler->flags); + return bch2_trans_do(c, &inode->ei_journal_seq, BTREE_INSERT_ATOMIC, + bch2_xattr_set(&trans, inode->v.i_ino, + &inode->ei_str_hash, + name, value, size, + handler->flags, flags)); } static const struct xattr_handler bch_xattr_user_handler = { .prefix = XATTR_USER_PREFIX, .get = bch2_xattr_get_handler, .set = bch2_xattr_set_handler, - .flags = BCH_XATTR_INDEX_USER, + .flags = KEY_TYPE_XATTR_INDEX_USER, }; static bool bch2_xattr_trusted_list(struct dentry *dentry) @@ -344,44 +351,82 @@ static const struct xattr_handler bch_xattr_trusted_handler = { .list = bch2_xattr_trusted_list, .get = bch2_xattr_get_handler, .set = bch2_xattr_set_handler, - .flags = BCH_XATTR_INDEX_TRUSTED, + .flags = KEY_TYPE_XATTR_INDEX_TRUSTED, }; static const struct xattr_handler bch_xattr_security_handler = { .prefix = XATTR_SECURITY_PREFIX, .get = bch2_xattr_get_handler, .set = bch2_xattr_set_handler, - .flags = BCH_XATTR_INDEX_SECURITY, + .flags = KEY_TYPE_XATTR_INDEX_SECURITY, }; #ifndef NO_BCACHEFS_FS -static int bch2_xattr_bcachefs_get(const struct xattr_handler *handler, - struct dentry *dentry, struct inode *vinode, - const char *name, void *buffer, size_t size) +static int opt_to_inode_opt(int id) +{ + switch (id) { +#define x(name, ...) 
\ + case Opt_##name: return Inode_opt_##name; + BCH_INODE_OPTS() +#undef x + default: + return -1; + } +} + +static int __bch2_xattr_bcachefs_get(const struct xattr_handler *handler, + struct dentry *dentry, struct inode *vinode, + const char *name, void *buffer, size_t size, + bool all) { struct bch_inode_info *inode = to_bch_ei(vinode); struct bch_fs *c = inode->v.i_sb->s_fs_info; struct bch_opts opts = bch2_inode_opts_to_opts(bch2_inode_opts_get(&inode->ei_inode)); const struct bch_option *opt; - int ret, id; + int id, inode_opt_id; + char buf[512]; + struct printbuf out = PBUF(buf); + unsigned val_len; u64 v; id = bch2_opt_lookup(name); if (id < 0 || !bch2_opt_is_inode_opt(id)) return -EINVAL; + inode_opt_id = opt_to_inode_opt(id); + if (inode_opt_id < 0) + return -EINVAL; + opt = bch2_opt_table + id; if (!bch2_opt_defined_by_id(&opts, id)) return -ENODATA; + if (!all && + !(inode->ei_inode.bi_fields_set & (1 << inode_opt_id))) + return -ENODATA; + v = bch2_opt_get_by_id(&opts, id); + bch2_opt_to_text(&out, c, opt, v, 0); + + val_len = out.pos - buf; - ret = bch2_opt_to_text(c, buffer, size, opt, v, 0); + if (buffer && val_len > size) + return -ERANGE; - return ret < size || !buffer ? ret : -ERANGE; + if (buffer) + memcpy(buffer, buf, val_len); + return val_len; +} + +static int bch2_xattr_bcachefs_get(const struct xattr_handler *handler, + struct dentry *dentry, struct inode *vinode, + const char *name, void *buffer, size_t size) +{ + return __bch2_xattr_bcachefs_get(handler, dentry, vinode, + name, buffer, size, false); } struct inode_opt_set { @@ -397,9 +442,12 @@ static int inode_opt_set_fn(struct bch_inode_info *inode, struct inode_opt_set *s = p; if (s->defined) - bch2_inode_opt_set(bi, s->id, s->v); + bi->bi_fields_set |= 1U << s->id; else - bch2_inode_opt_clear(bi, s->id); + bi->bi_fields_set &= ~(1U << s->id); + + bch2_inode_opt_set(bi, s->id, s->v); + return 0; } @@ -413,46 +461,68 @@ static int bch2_xattr_bcachefs_set(const struct xattr_handler *handler, const struct bch_option *opt; char *buf; struct inode_opt_set s; - int ret; + int opt_id, inode_opt_id, ret; - s.id = bch2_opt_lookup(name); - if (s.id < 0 || !bch2_opt_is_inode_opt(s.id)) + opt_id = bch2_opt_lookup(name); + if (opt_id < 0) return -EINVAL; - opt = bch2_opt_table + s.id; + opt = bch2_opt_table + opt_id; + + inode_opt_id = opt_to_inode_opt(opt_id); + if (inode_opt_id < 0) + return -EINVAL; + + s.id = inode_opt_id; if (value) { + u64 v = 0; + buf = kmalloc(size + 1, GFP_KERNEL); if (!buf) return -ENOMEM; memcpy(buf, value, size); buf[size] = '\0'; - ret = bch2_opt_parse(c, opt, buf, &s.v); + ret = bch2_opt_parse(c, opt, buf, &v); kfree(buf); if (ret < 0) return ret; - if (s.id == Opt_compression || - s.id == Opt_background_compression) { - ret = bch2_check_set_has_compressed_data(c, s.v); - if (ret) - return ret; - } + ret = bch2_opt_check_may_set(c, opt_id, v); + if (ret < 0) + return ret; + s.v = v + 1; s.defined = true; } else { + if (!IS_ROOT(dentry)) { + struct bch_inode_info *dir = + to_bch_ei(d_inode(dentry->d_parent)); + + s.v = bch2_inode_opt_get(&dir->ei_inode, inode_opt_id); + } else { + s.v = 0; + } + s.defined = false; } mutex_lock(&inode->ei_update_lock); - ret = __bch2_write_inode(c, inode, inode_opt_set_fn, &s); + if (inode_opt_id == Inode_opt_project) { + ret = bch2_set_projid(c, inode, s.v); + if (ret) + goto err; + } + + ret = bch2_write_inode(c, inode, inode_opt_set_fn, &s, 0); +err: mutex_unlock(&inode->ei_update_lock); if (value && - (s.id == Opt_background_compression || - s.id == 
Opt_background_target)) + (opt_id == Opt_background_compression || + opt_id == Opt_background_target)) bch2_rebalance_add_work(c, inode->v.i_blocks); return ret; @@ -464,6 +534,21 @@ static const struct xattr_handler bch_xattr_bcachefs_handler = { .set = bch2_xattr_bcachefs_set, }; +static int bch2_xattr_bcachefs_get_effective( + const struct xattr_handler *handler, + struct dentry *dentry, struct inode *vinode, + const char *name, void *buffer, size_t size) +{ + return __bch2_xattr_bcachefs_get(handler, dentry, vinode, + name, buffer, size, true); +} + +static const struct xattr_handler bch_xattr_bcachefs_effective_handler = { + .prefix = "bcachefs_effective.", + .get = bch2_xattr_bcachefs_get_effective, + .set = bch2_xattr_bcachefs_set, +}; + #endif /* NO_BCACHEFS_FS */ const struct xattr_handler *bch2_xattr_handlers[] = { @@ -474,18 +559,19 @@ const struct xattr_handler *bch2_xattr_handlers[] = { &bch_xattr_security_handler, #ifndef NO_BCACHEFS_FS &bch_xattr_bcachefs_handler, + &bch_xattr_bcachefs_effective_handler, #endif NULL }; static const struct xattr_handler *bch_xattr_handler_map[] = { - [BCH_XATTR_INDEX_USER] = &bch_xattr_user_handler, - [BCH_XATTR_INDEX_POSIX_ACL_ACCESS] = + [KEY_TYPE_XATTR_INDEX_USER] = &bch_xattr_user_handler, + [KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS] = &posix_acl_access_xattr_handler, - [BCH_XATTR_INDEX_POSIX_ACL_DEFAULT] = + [KEY_TYPE_XATTR_INDEX_POSIX_ACL_DEFAULT] = &posix_acl_default_xattr_handler, - [BCH_XATTR_INDEX_TRUSTED] = &bch_xattr_trusted_handler, - [BCH_XATTR_INDEX_SECURITY] = &bch_xattr_security_handler, + [KEY_TYPE_XATTR_INDEX_TRUSTED] = &bch_xattr_trusted_handler, + [KEY_TYPE_XATTR_INDEX_SECURITY] = &bch_xattr_security_handler, }; static const struct xattr_handler *bch2_xattr_type_to_handler(unsigned type) diff --git a/fs/bcachefs/xattr.h b/fs/bcachefs/xattr.h index a58e7e303421..4151065ab853 100644 --- a/fs/bcachefs/xattr.h +++ b/fs/bcachefs/xattr.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _BCACHEFS_XATTR_H #define _BCACHEFS_XATTR_H @@ -6,13 +7,30 @@ extern const struct bch_hash_desc bch2_xattr_hash_desc; const char *bch2_xattr_invalid(const struct bch_fs *, struct bkey_s_c); -void bch2_xattr_to_text(struct bch_fs *, char *, size_t, struct bkey_s_c); +void bch2_xattr_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); -#define bch2_bkey_xattr_ops (struct bkey_ops) { \ +#define bch2_bkey_ops_xattr (struct bkey_ops) { \ .key_invalid = bch2_xattr_invalid, \ .val_to_text = bch2_xattr_to_text, \ } +static inline unsigned xattr_val_u64s(unsigned name_len, unsigned val_len) +{ + return DIV_ROUND_UP(offsetof(struct bch_xattr, x_name) + + name_len + val_len, sizeof(u64)); +} + +#define xattr_val(_xattr) \ + ((void *) (_xattr)->x_name + (_xattr)->x_name_len) + +struct xattr_search_key { + u8 type; + struct qstr name; +}; + +#define X_SEARCH(_type, _name, _len) ((struct xattr_search_key) \ + { .type = _type, .name = QSTR_INIT(_name, _len) }) + struct dentry; struct xattr_handler; struct bch_hash_info; @@ -20,10 +38,10 @@ struct bch_inode_info; int bch2_xattr_get(struct bch_fs *, struct bch_inode_info *, const char *, void *, size_t, int); -int __bch2_xattr_set(struct bch_fs *, u64, const struct bch_hash_info *, - const char *, const void *, size_t, int, int, u64 *); -int bch2_xattr_set(struct bch_fs *, struct bch_inode_info *, - const char *, const void *, size_t, int, int); + +int bch2_xattr_set(struct btree_trans *, u64, const struct bch_hash_info *, + const char *, const void *, size_t, int, int); + ssize_t 
bch2_xattr_list(struct dentry *, char *, size_t); extern const struct xattr_handler *bch2_xattr_handlers[]; diff --git a/include/linux/six.h b/include/linux/six.h index 40e213f2fb40..0fb1b2f49345 100644 --- a/include/linux/six.h +++ b/include/linux/six.h @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0 +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_SIX_H #define _LINUX_SIX_H @@ -50,12 +50,12 @@ * six_trylock_convert(lock, from, to) * * A lock may be held multiple types by the same thread (for read or intent, - * not write) - up to SIX_LOCK_MAX_RECURSE. However, the six locks code does - * _not_ implement the actual recursive checks itself though - rather, if your - * code (e.g. btree iterator code) knows that the current thread already has a - * lock held, and for the correct type, six_lock_increment() may be used to - * bump up the counter for that type - the only effect is that one more call to - * unlock will be required before the lock is unlocked. + * not write). However, the six locks code does _not_ implement the actual + * recursive checks itself though - rather, if your code (e.g. btree iterator + * code) knows that the current thread already has a lock held, and for the + * correct type, six_lock_increment() may be used to bump up the counter for + * that type - the only effect is that one more call to unlock will be required + * before the lock is unlocked. */ #include <linux/lockdep.h> @@ -80,8 +80,8 @@ union six_lock_state { }; struct { - unsigned read_lock:26; - unsigned intent_lock:3; + unsigned read_lock:28; + unsigned intent_lock:1; unsigned waiters:3; /* * seq works much like in seqlocks: it's incremented every time @@ -96,8 +96,6 @@ union six_lock_state { }; }; -#define SIX_LOCK_MAX_RECURSE ((1 << 3) - 1) - enum six_lock_type { SIX_LOCK_read, SIX_LOCK_intent, @@ -106,6 +104,7 @@ enum six_lock_type { struct six_lock { union six_lock_state state; + unsigned intent_lock_recurse; struct task_struct *owner; struct optimistic_spin_queue osq; @@ -139,8 +138,6 @@ do { \ #define __SIX_VAL(field, _v) (((union six_lock_state) { .field = _v }).v) -#ifdef SIX_LOCK_SEPARATE_LOCKFNS - #define __SIX_LOCK(type) \ bool six_trylock_##type(struct six_lock *); \ bool six_relock_##type(struct six_lock *, u32); \ @@ -185,41 +182,6 @@ static inline void six_unlock_type(struct six_lock *lock, enum six_lock_type typ SIX_LOCK_DISPATCH(type, six_unlock, lock); } -#else - -bool six_trylock_type(struct six_lock *, enum six_lock_type); -bool six_relock_type(struct six_lock *, enum six_lock_type, unsigned); -void six_lock_type(struct six_lock *, enum six_lock_type); -void six_unlock_type(struct six_lock *, enum six_lock_type); - -#define __SIX_LOCK(type) \ -static __always_inline bool six_trylock_##type(struct six_lock *lock) \ -{ \ - return six_trylock_type(lock, SIX_LOCK_##type); \ -} \ - \ -static __always_inline bool six_relock_##type(struct six_lock *lock, u32 seq)\ -{ \ - return six_relock_type(lock, SIX_LOCK_##type, seq); \ -} \ - \ -static __always_inline void six_lock_##type(struct six_lock *lock) \ -{ \ - six_lock_type(lock, SIX_LOCK_##type); \ -} \ - \ -static __always_inline void six_unlock_##type(struct six_lock *lock) \ -{ \ - six_unlock_type(lock, SIX_LOCK_##type); \ -} - -__SIX_LOCK(read) -__SIX_LOCK(intent) -__SIX_LOCK(write) -#undef __SIX_LOCK - -#endif - void six_lock_downgrade(struct six_lock *); bool six_lock_tryupgrade(struct six_lock *); bool six_trylock_convert(struct six_lock *, enum six_lock_type, diff --git a/include/trace/events/bcachefs.h 
b/include/trace/events/bcachefs.h index 026ad55bf80c..d7e898b02491 100644 --- a/include/trace/events/bcachefs.h +++ b/include/trace/events/bcachefs.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM bcachefs @@ -43,21 +44,6 @@ DECLARE_EVENT_CLASS(bkey, __entry->offset, __entry->size) ); -DECLARE_EVENT_CLASS(bch_dev, - TP_PROTO(struct bch_dev *ca), - TP_ARGS(ca), - - TP_STRUCT__entry( - __array(char, uuid, 16 ) - ), - - TP_fast_assign( - memcpy(__entry->uuid, ca->uuid.b, 16); - ), - - TP_printk("%pU", __entry->uuid) -); - DECLARE_EVENT_CLASS(bch_fs, TP_PROTO(struct bch_fs *c), TP_ARGS(c), @@ -179,7 +165,7 @@ TRACE_EVENT(btree_write, TP_ARGS(b, bytes, sectors), TP_STRUCT__entry( - __field(enum bkey_type, type) + __field(enum btree_node_type, type) __field(unsigned, bytes ) __field(unsigned, sectors ) ), @@ -296,6 +282,11 @@ DEFINE_EVENT(btree_node, btree_compact, TP_ARGS(c, b) ); +DEFINE_EVENT(btree_node, btree_merge, + TP_PROTO(struct bch_fs *c, struct btree *b), + TP_ARGS(c, b) +); + DEFINE_EVENT(btree_node, btree_set_root, TP_PROTO(struct bch_fs *c, struct btree *b), TP_ARGS(c, b) @@ -355,16 +346,6 @@ DEFINE_EVENT(bch_fs, gc_coalesce_end, TP_ARGS(c) ); -DEFINE_EVENT(bch_dev, sectors_saturated, - TP_PROTO(struct bch_dev *ca), - TP_ARGS(ca) -); - -DEFINE_EVENT(bch_fs, gc_sectors_saturated, - TP_PROTO(struct bch_fs *c), - TP_ARGS(c) -); - DEFINE_EVENT(bch_fs, gc_cannot_inc_gens, TP_PROTO(struct bch_fs *c), TP_ARGS(c) @@ -518,6 +499,148 @@ TRACE_EVENT(copygc, __entry->buckets_moved, __entry->buckets_not_moved) ); +DECLARE_EVENT_CLASS(transaction_restart, + TP_PROTO(unsigned long ip), + TP_ARGS(ip), + + TP_STRUCT__entry( + __field(unsigned long, ip ) + ), + + TP_fast_assign( + __entry->ip = ip; + ), + + TP_printk("%pf", (void *) __entry->ip) +); + +DEFINE_EVENT(transaction_restart, trans_restart_btree_node_reused, + TP_PROTO(unsigned long ip), + TP_ARGS(ip) +); + +DEFINE_EVENT(transaction_restart, trans_restart_would_deadlock, + TP_PROTO(unsigned long ip), + TP_ARGS(ip) +); + +TRACE_EVENT(trans_restart_iters_realloced, + TP_PROTO(unsigned long ip, unsigned nr), + TP_ARGS(ip, nr), + + TP_STRUCT__entry( + __field(unsigned long, ip ) + __field(unsigned, nr ) + ), + + TP_fast_assign( + __entry->ip = ip; + __entry->nr = nr; + ), + + TP_printk("%pf nr %u", (void *) __entry->ip, __entry->nr) +); + +TRACE_EVENT(trans_restart_mem_realloced, + TP_PROTO(unsigned long ip, unsigned long bytes), + TP_ARGS(ip, bytes), + + TP_STRUCT__entry( + __field(unsigned long, ip ) + __field(unsigned long, bytes ) + ), + + TP_fast_assign( + __entry->ip = ip; + __entry->bytes = bytes; + ), + + TP_printk("%pf bytes %lu", (void *) __entry->ip, __entry->bytes) +); + +DEFINE_EVENT(transaction_restart, trans_restart_journal_res_get, + TP_PROTO(unsigned long ip), + TP_ARGS(ip) +); + +DEFINE_EVENT(transaction_restart, trans_restart_journal_preres_get, + TP_PROTO(unsigned long ip), + TP_ARGS(ip) +); + +DEFINE_EVENT(transaction_restart, trans_restart_mark_replicas, + TP_PROTO(unsigned long ip), + TP_ARGS(ip) +); + +DEFINE_EVENT(transaction_restart, trans_restart_fault_inject, + TP_PROTO(unsigned long ip), + TP_ARGS(ip) +); + +DEFINE_EVENT(transaction_restart, trans_restart_btree_node_split, + TP_PROTO(unsigned long ip), + TP_ARGS(ip) +); + +DEFINE_EVENT(transaction_restart, trans_restart_mark, + TP_PROTO(unsigned long ip), + TP_ARGS(ip) +); + +DEFINE_EVENT(transaction_restart, trans_restart_upgrade, + TP_PROTO(unsigned long ip), + TP_ARGS(ip) +); + +DEFINE_EVENT(transaction_restart, 
trans_restart_iter_upgrade, + TP_PROTO(unsigned long ip), + TP_ARGS(ip) +); + +DEFINE_EVENT(transaction_restart, trans_restart_traverse, + TP_PROTO(unsigned long ip), + TP_ARGS(ip) +); + +DEFINE_EVENT(transaction_restart, trans_restart_atomic, + TP_PROTO(unsigned long ip), + TP_ARGS(ip) +); + +DECLARE_EVENT_CLASS(node_lock_fail, + TP_PROTO(unsigned level, u32 iter_seq, unsigned node, u32 node_seq), + TP_ARGS(level, iter_seq, node, node_seq), + + TP_STRUCT__entry( + __field(u32, level) + __field(u32, iter_seq) + __field(u32, node) + __field(u32, node_seq) + ), + + TP_fast_assign( + __entry->level = level; + __entry->iter_seq = iter_seq; + __entry->node = node; + __entry->node_seq = node_seq; + ), + + TP_printk("level %u iter seq %u node %u node seq %u", + __entry->level, __entry->iter_seq, + __entry->node, __entry->node_seq) +); + +DEFINE_EVENT(node_lock_fail, node_upgrade_fail, + TP_PROTO(unsigned level, u32 iter_seq, unsigned node, u32 node_seq), + TP_ARGS(level, iter_seq, node, node_seq) +); + +DEFINE_EVENT(node_lock_fail, node_relock_fail, + TP_PROTO(unsigned level, u32 iter_seq, unsigned node, u32 node_seq), + TP_ARGS(level, iter_seq, node, node_seq) +); + #endif /* _TRACE_BCACHE_H */ /* This part must be outside protection */ diff --git a/kernel/locking/six.c b/kernel/locking/six.c index aceeabb03920..9fa58b6fadc9 100644 --- a/kernel/locking/six.c +++ b/kernel/locking/six.c @@ -76,17 +76,6 @@ static inline void six_set_owner(struct six_lock *lock, enum six_lock_type type, } } -static inline void six_clear_owner(struct six_lock *lock, enum six_lock_type type) -{ - if (type != SIX_LOCK_intent) - return; - - EBUG_ON(lock->owner != current); - - if (lock->state.intent_lock == 1) - lock->owner = NULL; -} - static __always_inline bool do_six_trylock_type(struct six_lock *lock, enum six_lock_type type) { @@ -393,16 +382,24 @@ static void __six_unlock_type(struct six_lock *lock, enum six_lock_type type) EBUG_ON(type == SIX_LOCK_write && !(lock->state.v & __SIX_LOCK_HELD_intent)); - six_clear_owner(lock, type); + six_release(&lock->dep_map); + + if (type == SIX_LOCK_intent) { + EBUG_ON(lock->owner != current); + + if (lock->intent_lock_recurse) { + --lock->intent_lock_recurse; + return; + } + + lock->owner = NULL; + } state.v = atomic64_add_return_release(l[type].unlock_val, &lock->state.counter); - six_release(&lock->dep_map); six_lock_wakeup(lock, state, l[type].unlock_wakeup); } -#ifdef SIX_LOCK_SEPARATE_LOCKFNS - #define __SIX_LOCK(type) \ bool six_trylock_##type(struct six_lock *lock) \ { \ @@ -434,36 +431,6 @@ __SIX_LOCK(write) #undef __SIX_LOCK -#else - -bool six_trylock_type(struct six_lock *lock, enum six_lock_type type) -{ - return __six_trylock_type(lock, type); -} -EXPORT_SYMBOL_GPL(six_trylock_type); - -bool six_relock_type(struct six_lock *lock, enum six_lock_type type, - unsigned seq) -{ - return __six_relock_type(lock, type, seq); - -} -EXPORT_SYMBOL_GPL(six_relock_type); - -void six_lock_type(struct six_lock *lock, enum six_lock_type type) -{ - __six_lock_type(lock, type); -} -EXPORT_SYMBOL_GPL(six_lock_type); - -void six_unlock_type(struct six_lock *lock, enum six_lock_type type) -{ - __six_unlock_type(lock, type); -} -EXPORT_SYMBOL_GPL(six_unlock_type); - -#endif - /* Convert from intent to read: */ void six_lock_downgrade(struct six_lock *lock) { @@ -530,6 +497,16 @@ void six_lock_increment(struct six_lock *lock, enum six_lock_type type) /* XXX: assert already locked, and that we don't overflow: */ - atomic64_add(l[type].lock_val, &lock->state.counter); + switch (type) { + 
case SIX_LOCK_read:
+		atomic64_add(l[type].lock_val, &lock->state.counter);
+		break;
+	case SIX_LOCK_intent:
+		lock->intent_lock_recurse++;
+		break;
+	case SIX_LOCK_write:
+		BUG();
+		break;
+	}
 }
 EXPORT_SYMBOL_GPL(six_lock_increment);
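
Three small interfaces introduced by this merge recur throughout the patch and are worth illustrating. The fragments below are editor's sketches, not part of the commit: the buffer sizes, variable names and values are hypothetical, while struct printbuf, PBUF(), pr_buf(), bch_scnmemcpy(), printbuf_remaining(), percpu_u64_get(), six_lock_increment() and the six_lock_*()/six_unlock_*() helpers are as defined in the util.h, util.c and six-lock diffs above.

struct printbuf replaces the old "char *buf, size_t size, return bytes written" convention of the *_to_text() helpers and bch_scnmemcpy(), so formatting code no longer threads buffer arithmetic through every call:

	/* Sketch: formatting with the new printbuf API (values hypothetical). */
	char buf[64];
	struct printbuf out = PBUF(buf);	/* pos = buf, end = buf + sizeof(buf) */

	pr_buf(&out, "(type %u)", 7);		/* scnprintf() into the remaining space */
	bch_scnmemcpy(&out, "user.foo", 8);	/* length-bounded copy, always NUL-terminated */
	pr_buf(&out, ":");
	/* out.pos - buf bytes used so far; output can never overrun buf */

The percpu helpers give the usual split between a lockless per-CPU fast path and a summing slow path; bch2_acc_percpu_u64s() additionally folds every CPU's copy onto one and zeroes the others, which, as its comment notes, is only valid when concurrent access to the counters is excluded:

	/* Sketch: a per-CPU event counter (error handling elided). */
	u64 __percpu *counter = alloc_percpu(u64);
	u64 total;

	this_cpu_inc(*counter);			/* fast path: no locking, no atomics */
	total = percpu_u64_get(counter);	/* slow path: sum over all possible CPUs */
	free_percpu(counter);

The six-lock change drops SIX_LOCK_MAX_RECURSE by moving intent-lock recursion out of the packed state word (intent_lock shrinks from 3 bits to 1) into the new intent_lock_recurse counter, so recursion depth is no longer capped at 7. A sketch of the resulting semantics, assuming lock is an initialized struct six_lock:

	six_lock_intent(&lock);				/* owner = current */
	six_lock_increment(&lock, SIX_LOCK_intent);	/* intent_lock_recurse++ */
	six_unlock_intent(&lock);			/* recurse > 0: just decrements */
	six_unlock_intent(&lock);			/* really releases the lock */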