author | Kent Overstreet <kent.overstreet@gmail.com> | 2021-11-27 16:20:13 -0500
---|---|---
committer | Kent Overstreet <kent.overstreet@gmail.com> | 2021-11-27 23:23:00 -0500
commit | 8d547a161a2e3d94360fd4979270db7ca9794a2c (patch) |
tree | f3c9ee9bbe2ccb9cb050a17cbc0d312ebf8dffb6 |
parent | 5e2bedbf8ac4e178f8570cf75fdfa931c960556f (diff) |
Merge with d9565fc1a8 bcachefs: Improve tracing of btree_path leaks
Signed-off-by: Kent Overstreet <kent.overstreet@gmail.com>
99 files changed, 11644 insertions, 7337 deletions
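The bulk of this merge mechanically converts callers from heap-allocated to stack-allocated btree iterators: `bch2_trans_get_iter()`/`bch2_trans_iter_put()`, which handed back a `struct btree_iter *` (with errors encoded as `ERR_PTR`), are replaced throughout by `bch2_trans_iter_init()`/`bch2_trans_iter_exit()` operating on a caller-owned iterator, with errors reported via return codes and `bkey_err()` instead. A minimal sketch of the new calling pattern, assembled from calls that appear in this diff and assuming only a `struct bch_fs *c` in scope (illustrative, not a compilable excerpt of the tree):

```c
struct btree_trans trans;
struct btree_iter iter;		/* now lives on the caller's stack */
struct bkey_s_c k;
int ret;

bch2_trans_init(&trans, c, 0, 0);

/* Old: iter = bch2_trans_get_iter(&trans, ...); ret = PTR_ERR_OR_ZERO(iter); */
bch2_trans_iter_init(&trans, &iter, BTREE_ID_alloc, POS_MIN,
		     BTREE_ITER_SLOTS|BTREE_ITER_INTENT);

k = bch2_btree_iter_peek_slot(&iter);
ret = bkey_err(k);		/* errors come back via the key, not an ERR_PTR iterator */

/* Old: bch2_trans_iter_put(&trans, iter); */
bch2_trans_iter_exit(&trans, &iter);
bch2_trans_exit(&trans);
```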
diff --git a/fs/bcachefs/Kconfig b/fs/bcachefs/Kconfig index 57c5d58c2d87..27742ce276cd 100644 --- a/fs/bcachefs/Kconfig +++ b/fs/bcachefs/Kconfig @@ -20,6 +20,7 @@ config BCACHEFS_FS select SIXLOCKS select RAID6_PQ select XOR_BLOCKS + select XXHASH select SRCU help The bcachefs filesystem - a modern, copy on write filesystem, with diff --git a/fs/bcachefs/Makefile b/fs/bcachefs/Makefile index ee5e6dbd5ede..71cda24e6d08 100644 --- a/fs/bcachefs/Makefile +++ b/fs/bcachefs/Makefile @@ -49,6 +49,7 @@ bcachefs-y := \ reflink.o \ replicas.o \ siphash.o \ + subvolume.o \ super.o \ super-io.o \ sysfs.o \ diff --git a/fs/bcachefs/acl.c b/fs/bcachefs/acl.c index 594e1f1a1291..5070caf8f349 100644 --- a/fs/bcachefs/acl.c +++ b/fs/bcachefs/acl.c @@ -212,50 +212,61 @@ bch2_acl_to_xattr(struct btree_trans *trans, return xattr; } -struct posix_acl *bch2_get_acl(struct inode *vinode, int type) +struct posix_acl *bch2_get_acl(struct inode *vinode, int type, bool rcu) { struct bch_inode_info *inode = to_bch_ei(vinode); struct bch_fs *c = inode->v.i_sb->s_fs_info; struct bch_hash_info hash = bch2_hash_info_init(c, &inode->ei_inode); struct btree_trans trans; - struct btree_iter *iter; + struct btree_iter iter = { NULL }; struct bkey_s_c_xattr xattr; struct posix_acl *acl = NULL; + struct bkey_s_c k; + int ret; + + if (rcu) + return ERR_PTR(-ECHILD); bch2_trans_init(&trans, c, 0, 0); retry: bch2_trans_begin(&trans); - iter = bch2_hash_lookup(&trans, bch2_xattr_hash_desc, - &hash, inode->v.i_ino, + ret = bch2_hash_lookup(&trans, &iter, bch2_xattr_hash_desc, + &hash, inode_inum(inode), &X_SEARCH(acl_to_xattr_type(type), "", 0), 0); - if (IS_ERR(iter)) { - if (PTR_ERR(iter) == -EINTR) + if (ret) { + if (ret == -EINTR) goto retry; + if (ret != -ENOENT) + acl = ERR_PTR(ret); + goto out; + } - if (PTR_ERR(iter) != -ENOENT) - acl = ERR_CAST(iter); + k = bch2_btree_iter_peek_slot(&iter); + ret = bkey_err(k); + if (ret) { + acl = ERR_PTR(ret); goto out; } - xattr = bkey_s_c_to_xattr(bch2_btree_iter_peek_slot(iter)); + xattr = bkey_s_c_to_xattr(k); acl = bch2_acl_from_disk(xattr_val(xattr.v), le16_to_cpu(xattr.v->x_val_len)); if (!IS_ERR(acl)) set_cached_acl(&inode->v, type, acl); - bch2_trans_iter_put(&trans, iter); out: + bch2_trans_iter_exit(&trans, &iter); bch2_trans_exit(&trans); return acl; } -int bch2_set_acl_trans(struct btree_trans *trans, +int bch2_set_acl_trans(struct btree_trans *trans, subvol_inum inum, struct bch_inode_unpacked *inode_u, - const struct bch_hash_info *hash_info, struct posix_acl *acl, int type) { + struct bch_hash_info hash_info = bch2_hash_info_init(trans->c, inode_u); int ret; if (type == ACL_TYPE_DEFAULT && @@ -268,27 +279,27 @@ int bch2_set_acl_trans(struct btree_trans *trans, if (IS_ERR(xattr)) return PTR_ERR(xattr); - ret = bch2_hash_set(trans, bch2_xattr_hash_desc, hash_info, - inode_u->bi_inum, &xattr->k_i, 0); + ret = bch2_hash_set(trans, bch2_xattr_hash_desc, &hash_info, + inum, &xattr->k_i, 0); } else { struct xattr_search_key search = X_SEARCH(acl_to_xattr_type(type), "", 0); - ret = bch2_hash_delete(trans, bch2_xattr_hash_desc, hash_info, - inode_u->bi_inum, &search); + ret = bch2_hash_delete(trans, bch2_xattr_hash_desc, &hash_info, + inum, &search); } return ret == -ENOENT ? 
0 : ret; } -int bch2_set_acl(struct inode *vinode, struct posix_acl *_acl, int type) +int bch2_set_acl(struct user_namespace *mnt_userns, + struct inode *vinode, struct posix_acl *_acl, int type) { struct bch_inode_info *inode = to_bch_ei(vinode); struct bch_fs *c = inode->v.i_sb->s_fs_info; struct btree_trans trans; - struct btree_iter *inode_iter; + struct btree_iter inode_iter = { NULL }; struct bch_inode_unpacked inode_u; - struct bch_hash_info hash_info; struct posix_acl *acl; umode_t mode; int ret; @@ -299,42 +310,37 @@ retry: bch2_trans_begin(&trans); acl = _acl; - inode_iter = bch2_inode_peek(&trans, &inode_u, inode->v.i_ino, - BTREE_ITER_INTENT); - ret = PTR_ERR_OR_ZERO(inode_iter); + ret = bch2_inode_peek(&trans, &inode_iter, &inode_u, inode_inum(inode), + BTREE_ITER_INTENT); if (ret) goto btree_err; mode = inode_u.bi_mode; if (type == ACL_TYPE_ACCESS) { - ret = posix_acl_update_mode(&inode->v, &mode, &acl); + ret = posix_acl_update_mode(mnt_userns, &inode->v, &mode, &acl); if (ret) goto btree_err; } - hash_info = bch2_hash_info_init(c, &inode_u); - - ret = bch2_set_acl_trans(&trans, &inode_u, &hash_info, acl, type); + ret = bch2_set_acl_trans(&trans, inode_inum(inode), &inode_u, acl, type); if (ret) goto btree_err; inode_u.bi_ctime = bch2_current_time(c); inode_u.bi_mode = mode; - ret = bch2_inode_write(&trans, inode_iter, &inode_u) ?: - bch2_trans_commit(&trans, NULL, - &inode->ei_journal_seq, - BTREE_INSERT_NOUNLOCK); + ret = bch2_inode_write(&trans, &inode_iter, &inode_u) ?: + bch2_trans_commit(&trans, NULL, NULL, 0); btree_err: - bch2_trans_iter_put(&trans, inode_iter); + bch2_trans_iter_exit(&trans, &inode_iter); if (ret == -EINTR) goto retry; if (unlikely(ret)) goto err; - bch2_inode_update_after_write(c, inode, &inode_u, + bch2_inode_update_after_write(&trans, inode, &inode_u, ATTR_CTIME|ATTR_MODE); set_cached_acl(&inode->v, type, acl); @@ -345,31 +351,35 @@ err: return ret; } -int bch2_acl_chmod(struct btree_trans *trans, +int bch2_acl_chmod(struct btree_trans *trans, subvol_inum inum, struct bch_inode_unpacked *inode, umode_t mode, struct posix_acl **new_acl) { struct bch_hash_info hash_info = bch2_hash_info_init(trans->c, inode); - struct btree_iter *iter; + struct btree_iter iter; struct bkey_s_c_xattr xattr; struct bkey_i_xattr *new; struct posix_acl *acl; + struct bkey_s_c k; int ret; - iter = bch2_hash_lookup(trans, bch2_xattr_hash_desc, - &hash_info, inode->bi_inum, + ret = bch2_hash_lookup(trans, &iter, bch2_xattr_hash_desc, + &hash_info, inum, &X_SEARCH(KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS, "", 0), BTREE_ITER_INTENT); - ret = PTR_ERR_OR_ZERO(iter); if (ret) return ret == -ENOENT ? 
0 : ret; - xattr = bkey_s_c_to_xattr(bch2_btree_iter_peek_slot(iter)); + k = bch2_btree_iter_peek_slot(&iter); + xattr = bkey_s_c_to_xattr(k); + if (ret) + goto err; + acl = bch2_acl_from_disk(xattr_val(xattr.v), le16_to_cpu(xattr.v->x_val_len)); ret = PTR_ERR_OR_ZERO(acl); - if (ret || !acl) + if (IS_ERR_OR_NULL(acl)) goto err; ret = __posix_acl_chmod(&acl, GFP_KERNEL, mode); @@ -382,13 +392,14 @@ int bch2_acl_chmod(struct btree_trans *trans, goto err; } - new->k.p = iter->pos; - ret = bch2_trans_update(trans, iter, &new->k_i, 0); + new->k.p = iter.pos; + ret = bch2_trans_update(trans, &iter, &new->k_i, 0); *new_acl = acl; acl = NULL; err: - bch2_trans_iter_put(trans, iter); - kfree(acl); + bch2_trans_iter_exit(trans, &iter); + if (!IS_ERR_OR_NULL(acl)) + kfree(acl); return ret; } diff --git a/fs/bcachefs/acl.h b/fs/bcachefs/acl.h index ba210c26d5c1..2d76a4897ba8 100644 --- a/fs/bcachefs/acl.h +++ b/fs/bcachefs/acl.h @@ -26,27 +26,26 @@ typedef struct { __le32 a_version; } bch_acl_header; -struct posix_acl *bch2_get_acl(struct inode *, int); +struct posix_acl *bch2_get_acl(struct inode *, int, bool); -int bch2_set_acl_trans(struct btree_trans *, +int bch2_set_acl_trans(struct btree_trans *, subvol_inum, struct bch_inode_unpacked *, - const struct bch_hash_info *, struct posix_acl *, int); -int bch2_set_acl(struct inode *, struct posix_acl *, int); -int bch2_acl_chmod(struct btree_trans *, struct bch_inode_unpacked *, +int bch2_set_acl(struct user_namespace *, struct inode *, struct posix_acl *, int); +int bch2_acl_chmod(struct btree_trans *, subvol_inum, + struct bch_inode_unpacked *, umode_t, struct posix_acl **); #else -static inline int bch2_set_acl_trans(struct btree_trans *trans, +static inline int bch2_set_acl_trans(struct btree_trans *trans, subvol_inum inum, struct bch_inode_unpacked *inode_u, - const struct bch_hash_info *hash_info, struct posix_acl *acl, int type) { return 0; } -static inline int bch2_acl_chmod(struct btree_trans *trans, +static inline int bch2_acl_chmod(struct btree_trans *trans, subvol_inum inum, struct bch_inode_unpacked *inode, umode_t mode, struct posix_acl **new_acl) diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 2324b81c09ce..b2735c8591d6 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -130,7 +130,7 @@ static int bch2_alloc_unpack_v2(struct bkey_alloc_unpacked *out, #define x(_name, _bits) \ if (fieldnr < a.v->nr_fields) { \ - ret = bch2_varint_decode(in, end, &v); \ + ret = bch2_varint_decode_fast(in, end, &v); \ if (ret < 0) \ return ret; \ in += ret; \ @@ -147,10 +147,44 @@ static int bch2_alloc_unpack_v2(struct bkey_alloc_unpacked *out, return 0; } -static void bch2_alloc_pack_v2(struct bkey_alloc_buf *dst, +static int bch2_alloc_unpack_v3(struct bkey_alloc_unpacked *out, + struct bkey_s_c k) +{ + struct bkey_s_c_alloc_v3 a = bkey_s_c_to_alloc_v3(k); + const u8 *in = a.v->data; + const u8 *end = bkey_val_end(a); + unsigned fieldnr = 0; + int ret; + u64 v; + + out->gen = a.v->gen; + out->oldest_gen = a.v->oldest_gen; + out->data_type = a.v->data_type; + out->journal_seq = le64_to_cpu(a.v->journal_seq); + +#define x(_name, _bits) \ + if (fieldnr < a.v->nr_fields) { \ + ret = bch2_varint_decode_fast(in, end, &v); \ + if (ret < 0) \ + return ret; \ + in += ret; \ + } else { \ + v = 0; \ + } \ + out->_name = v; \ + if (v != out->_name) \ + return -1; \ + fieldnr++; + + BCH_ALLOC_FIELDS_V2() +#undef x + return 0; +} + +static void bch2_alloc_pack_v3(struct bkey_alloc_buf *dst, const 
struct bkey_alloc_unpacked src) { - struct bkey_i_alloc_v2 *a = bkey_alloc_v2_init(&dst->k); + struct bkey_i_alloc_v3 *a = bkey_alloc_v3_init(&dst->k); unsigned nr_fields = 0, last_nonzero_fieldnr = 0; u8 *out = a->v.data; u8 *end = (void *) &dst[1]; @@ -161,12 +195,13 @@ static void bch2_alloc_pack_v2(struct bkey_alloc_buf *dst, a->v.gen = src.gen; a->v.oldest_gen = src.oldest_gen; a->v.data_type = src.data_type; + a->v.journal_seq = cpu_to_le64(src.journal_seq); #define x(_name, _bits) \ nr_fields++; \ \ if (src._name) { \ - out += bch2_varint_encode(out, src._name); \ + out += bch2_varint_encode_fast(out, src._name); \ \ last_nonzero_field = out; \ last_nonzero_fieldnr = nr_fields; \ @@ -194,10 +229,17 @@ struct bkey_alloc_unpacked bch2_alloc_unpack(struct bkey_s_c k) .gen = 0, }; - if (k.k->type == KEY_TYPE_alloc_v2) - bch2_alloc_unpack_v2(&ret, k); - else if (k.k->type == KEY_TYPE_alloc) + switch (k.k->type) { + case KEY_TYPE_alloc: bch2_alloc_unpack_v1(&ret, k); + break; + case KEY_TYPE_alloc_v2: + bch2_alloc_unpack_v2(&ret, k); + break; + case KEY_TYPE_alloc_v3: + bch2_alloc_unpack_v3(&ret, k); + break; + } return ret; } @@ -206,7 +248,7 @@ void bch2_alloc_pack(struct bch_fs *c, struct bkey_alloc_buf *dst, const struct bkey_alloc_unpacked src) { - bch2_alloc_pack_v2(dst, src); + bch2_alloc_pack_v3(dst, src); } static unsigned bch_alloc_v1_val_u64s(const struct bch_alloc *a) @@ -249,26 +291,41 @@ const char *bch2_alloc_v2_invalid(const struct bch_fs *c, struct bkey_s_c k) return NULL; } +const char *bch2_alloc_v3_invalid(const struct bch_fs *c, struct bkey_s_c k) +{ + struct bkey_alloc_unpacked u; + + if (k.k->p.inode >= c->sb.nr_devices || + !c->devs[k.k->p.inode]) + return "invalid device"; + + if (bch2_alloc_unpack_v3(&u, k)) + return "unpack error"; + + return NULL; +} + void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) { struct bkey_alloc_unpacked u = bch2_alloc_unpack(k); - pr_buf(out, "gen %u oldest_gen %u data_type %s", - u.gen, u.oldest_gen, bch2_data_types[u.data_type]); + pr_buf(out, "gen %u oldest_gen %u data_type %s journal_seq %llu", + u.gen, u.oldest_gen, bch2_data_types[u.data_type], + u.journal_seq); #define x(_name, ...) 
pr_buf(out, " " #_name " %llu", (u64) u._name); BCH_ALLOC_FIELDS_V2() #undef x } -static int bch2_alloc_read_fn(struct bch_fs *c, struct bkey_s_c k) +static int bch2_alloc_read_fn(struct btree_trans *trans, struct bkey_s_c k) { + struct bch_fs *c = trans->c; struct bch_dev *ca; struct bucket *g; struct bkey_alloc_unpacked u; - if (k.k->type != KEY_TYPE_alloc && - k.k->type != KEY_TYPE_alloc_v2) + if (!bkey_is_alloc(k.k)) return 0; ca = bch_dev_bkey_exists(c, k.k->p.inode); @@ -289,11 +346,14 @@ static int bch2_alloc_read_fn(struct bch_fs *c, struct bkey_s_c k) int bch2_alloc_read(struct bch_fs *c) { + struct btree_trans trans; int ret; + bch2_trans_init(&trans, c, 0, 0); down_read(&c->gc_lock); - ret = bch2_btree_and_journal_walk(c, BTREE_ID_alloc, bch2_alloc_read_fn); + ret = bch2_btree_and_journal_walk(&trans, BTREE_ID_alloc, bch2_alloc_read_fn); up_read(&c->gc_lock); + bch2_trans_exit(&trans); if (ret) { bch_err(c, "error reading alloc info: %i", ret); return ret; @@ -353,32 +413,30 @@ err: int bch2_alloc_write(struct bch_fs *c, unsigned flags) { struct btree_trans trans; - struct btree_iter *iter; + struct btree_iter iter; struct bch_dev *ca; unsigned i; int ret = 0; bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); - iter = bch2_trans_get_iter(&trans, BTREE_ID_alloc, POS_MIN, - BTREE_ITER_SLOTS|BTREE_ITER_INTENT); + bch2_trans_iter_init(&trans, &iter, BTREE_ID_alloc, POS_MIN, + BTREE_ITER_SLOTS|BTREE_ITER_INTENT); for_each_member_device(ca, c, i) { - bch2_btree_iter_set_pos(iter, + bch2_btree_iter_set_pos(&iter, POS(ca->dev_idx, ca->mi.first_bucket)); - while (iter->pos.offset < ca->mi.nbuckets) { - bch2_trans_cond_resched(&trans); - - ret = bch2_alloc_write_key(&trans, iter, flags); + while (iter.pos.offset < ca->mi.nbuckets) { + ret = bch2_alloc_write_key(&trans, &iter, flags); if (ret) { percpu_ref_put(&ca->ref); goto err; } - bch2_btree_iter_next_slot(iter); + bch2_btree_iter_advance(&iter); } } err: - bch2_trans_iter_put(&trans, iter); + bch2_trans_iter_exit(&trans, &iter); bch2_trans_exit(&trans); return ret; } @@ -390,18 +448,18 @@ int bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev, { struct bch_fs *c = trans->c; struct bch_dev *ca = bch_dev_bkey_exists(c, dev); - struct btree_iter *iter; + struct btree_iter iter; struct bucket *g; struct bkey_alloc_buf *a; struct bkey_alloc_unpacked u; u64 *time, now; int ret = 0; - iter = bch2_trans_get_iter(trans, BTREE_ID_alloc, POS(dev, bucket_nr), - BTREE_ITER_CACHED| - BTREE_ITER_CACHED_NOFILL| - BTREE_ITER_INTENT); - ret = bch2_btree_iter_traverse(iter); + bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, POS(dev, bucket_nr), + BTREE_ITER_CACHED| + BTREE_ITER_CACHED_NOFILL| + BTREE_ITER_INTENT); + ret = bch2_btree_iter_traverse(&iter); if (ret) goto out; @@ -412,7 +470,7 @@ int bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev, percpu_down_read(&c->mark_lock); g = bucket(ca, bucket_nr); - u = alloc_mem_to_key(iter, g, READ_ONCE(g->mark)); + u = alloc_mem_to_key(&iter, g, READ_ONCE(g->mark)); percpu_up_read(&c->mark_lock); time = rw == READ ? 
&u.read_time : &u.write_time; @@ -423,10 +481,10 @@ int bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev, *time = now; bch2_alloc_pack(c, a, u); - ret = bch2_trans_update(trans, iter, &a->k, 0) ?: + ret = bch2_trans_update(trans, &iter, &a->k, 0) ?: bch2_trans_commit(trans, NULL, NULL, 0); out: - bch2_trans_iter_put(trans, iter); + bch2_trans_iter_exit(trans, &iter); return ret; } @@ -695,27 +753,28 @@ static int bucket_invalidate_btree(struct btree_trans *trans, struct bkey_alloc_unpacked u; struct bucket *g; struct bucket_mark m; - struct btree_iter *iter = - bch2_trans_get_iter(trans, BTREE_ID_alloc, - POS(ca->dev_idx, b), - BTREE_ITER_CACHED| - BTREE_ITER_CACHED_NOFILL| - BTREE_ITER_INTENT); + struct btree_iter iter; int ret; + bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, + POS(ca->dev_idx, b), + BTREE_ITER_CACHED| + BTREE_ITER_CACHED_NOFILL| + BTREE_ITER_INTENT); + a = bch2_trans_kmalloc(trans, sizeof(*a)); ret = PTR_ERR_OR_ZERO(a); if (ret) goto err; - ret = bch2_btree_iter_traverse(iter); + ret = bch2_btree_iter_traverse(&iter); if (ret) goto err; percpu_down_read(&c->mark_lock); g = bucket(ca, b); m = READ_ONCE(g->mark); - u = alloc_mem_to_key(iter, g, m); + u = alloc_mem_to_key(&iter, g, m); percpu_up_read(&c->mark_lock); u.gen++; @@ -726,10 +785,10 @@ static int bucket_invalidate_btree(struct btree_trans *trans, u.write_time = atomic64_read(&c->io_clock[WRITE].now); bch2_alloc_pack(c, a, u); - ret = bch2_trans_update(trans, iter, &a->k, + ret = bch2_trans_update(trans, &iter, &a->k, BTREE_TRIGGER_BUCKET_INVALIDATE); err: - bch2_trans_iter_put(trans, iter); + bch2_trans_iter_exit(trans, &iter); return ret; } @@ -856,10 +915,10 @@ static int bch2_invalidate_buckets(struct bch_fs *c, struct bch_dev *ca) /* If we used NOWAIT, don't return the error: */ if (!fifo_empty(&ca->free_inc)) ret = 0; - if (ret) { + if (ret < 0) bch_err(ca, "error invalidating buckets: %i", ret); + if (ret) return ret; - } if (journal_seq) ret = bch2_journal_flush_seq(&c->journal, journal_seq); @@ -1015,7 +1074,7 @@ void bch2_recalc_capacity(struct bch_fs *c) lockdep_assert_held(&c->state_lock); for_each_online_member(ca, c, i) { - struct backing_dev_info *bdi = ca->disk_sb.bdev->bd_bdi; + struct backing_dev_info *bdi = ca->disk_sb.bdev->bd_disk->bdi; ra_pages += bdi->ra_pages; } @@ -1232,3 +1291,22 @@ void bch2_fs_allocator_background_init(struct bch_fs *c) { spin_lock_init(&c->freelist_lock); } + +void bch2_open_buckets_to_text(struct printbuf *out, struct bch_fs *c) +{ + struct open_bucket *ob; + + for (ob = c->open_buckets; + ob < c->open_buckets + ARRAY_SIZE(c->open_buckets); + ob++) { + spin_lock(&ob->lock); + if (ob->valid && !ob->on_partial_list) { + pr_buf(out, "%zu ref %u type %s\n", + ob - c->open_buckets, + atomic_read(&ob->pin), + bch2_data_types[ob->type]); + } + spin_unlock(&ob->lock); + } + +} diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h index 9cadfdb5b83d..370573f8e05d 100644 --- a/fs/bcachefs/alloc_background.h +++ b/fs/bcachefs/alloc_background.h @@ -9,6 +9,7 @@ extern const char * const bch2_allocator_states[]; struct bkey_alloc_unpacked { + u64 journal_seq; u64 bucket; u8 dev; u8 gen; @@ -21,19 +22,11 @@ struct bkey_alloc_unpacked { struct bkey_alloc_buf { struct bkey_i k; + struct bch_alloc_v3 v; - union { - struct { #define x(_name, _bits) + _bits / 8 - u8 _pad[8 + BCH_ALLOC_FIELDS_V1()]; + u8 _pad[0 + BCH_ALLOC_FIELDS_V2()]; #undef x - } _v1; - struct { -#define x(_name, _bits) + 8 + _bits / 8 - u8 _pad[8 + BCH_ALLOC_FIELDS_V2()]; 
-#undef x - } _v2; - }; } __attribute__((packed, aligned(8))); /* How out of date a pointer gen is allowed to be: */ @@ -79,6 +72,7 @@ alloc_mem_to_key(struct btree_iter *iter, const char *bch2_alloc_v1_invalid(const struct bch_fs *, struct bkey_s_c); const char *bch2_alloc_v2_invalid(const struct bch_fs *, struct bkey_s_c); +const char *bch2_alloc_v3_invalid(const struct bch_fs *, struct bkey_s_c); void bch2_alloc_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); #define bch2_bkey_ops_alloc (struct bkey_ops) { \ @@ -91,6 +85,18 @@ void bch2_alloc_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); .val_to_text = bch2_alloc_to_text, \ } +#define bch2_bkey_ops_alloc_v3 (struct bkey_ops) { \ + .key_invalid = bch2_alloc_v3_invalid, \ + .val_to_text = bch2_alloc_to_text, \ +} + +static inline bool bkey_is_alloc(const struct bkey *k) +{ + return k->type == KEY_TYPE_alloc || + k->type == KEY_TYPE_alloc_v2 || + k->type == KEY_TYPE_alloc_v3; +} + int bch2_alloc_read(struct bch_fs *); static inline void bch2_wake_allocator(struct bch_dev *ca) @@ -132,4 +138,6 @@ int bch2_dev_allocator_start(struct bch_dev *); int bch2_alloc_write(struct bch_fs *, unsigned); void bch2_fs_allocator_background_init(struct bch_fs *); +void bch2_open_buckets_to_text(struct printbuf *, struct bch_fs *); + #endif /* _BCACHEFS_ALLOC_BACKGROUND_H */ diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index b35d008bcc04..fdf3a777ae16 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -179,6 +179,7 @@ #undef pr_fmt #define pr_fmt(fmt) "bcachefs: %s() " fmt "\n", __func__ +#include <linux/backing-dev-defs.h> #include <linux/bug.h> #include <linux/bio.h> #include <linux/closure.h> @@ -217,8 +218,8 @@ #define bch2_fmt(_c, fmt) "bcachefs (%s): " fmt "\n", ((_c)->name) #define bch2_fmt_inum(_c, _inum, fmt) "bcachefs (%s inum %llu): " fmt "\n", ((_c)->name), (_inum) #else -#define bch2_fmt(_c, fmt) fmt "\n" -#define bch2_fmt_inum(_c, _inum, fmt) "inum %llu: " fmt "\n", (_inum) +#define bch2_fmt(_c, fmt) "%s: " fmt "\n", ((_c)->name) +#define bch2_fmt_inum(_c, _inum, fmt) "%s inum %llu: " fmt "\n", ((_c)->name), (_inum) #endif #define bch_info(c, fmt, ...) 
\ @@ -352,6 +353,7 @@ enum bch_time_stats { #include "quota_types.h" #include "rebalance_types.h" #include "replicas_types.h" +#include "subvolume_types.h" #include "super_types.h" /* Number of nodes btree coalesce will try to coalesce at once */ @@ -380,6 +382,8 @@ enum gc_phase { GC_PHASE_BTREE_alloc, GC_PHASE_BTREE_quotas, GC_PHASE_BTREE_reflink, + GC_PHASE_BTREE_subvolumes, + GC_PHASE_BTREE_snapshots, GC_PHASE_PENDING_DELETE, }; @@ -491,12 +495,14 @@ struct bch_dev { enum { /* startup: */ + BCH_FS_INITIALIZED, BCH_FS_ALLOC_READ_DONE, BCH_FS_ALLOC_CLEAN, BCH_FS_ALLOCATOR_RUNNING, BCH_FS_ALLOCATOR_STOPPING, BCH_FS_INITIAL_GC_DONE, BCH_FS_INITIAL_GC_UNFIXED, + BCH_FS_TOPOLOGY_REPAIR_DONE, BCH_FS_BTREE_INTERIOR_REPLAY_DONE, BCH_FS_FSCK_DONE, BCH_FS_STARTED, @@ -556,12 +562,27 @@ struct journal_keys { u64 journal_seq_base; }; -struct btree_iter_buf { - struct btree_iter *iter; +struct btree_path_buf { + struct btree_path *path; }; #define REPLICAS_DELTA_LIST_MAX (1U << 16) +struct snapshot_t { + u32 parent; + u32 children[2]; + u32 subvol; /* Nonzero only if a subvolume points to this node: */ + u32 equiv; +}; + +typedef struct { + u32 subvol; + u64 inum; +} subvol_inum; + +#define BCACHEFS_ROOT_SUBVOL_INUM \ + ((subvol_inum) { BCACHEFS_ROOT_SUBVOL, BCACHEFS_ROOT_INO }) + struct bch_fs { struct closure cl; @@ -633,6 +654,15 @@ struct bch_fs { struct closure sb_write; struct mutex sb_lock; + /* snapshot.c: */ + GENRADIX(struct snapshot_t) snapshots; + struct bch_snapshot_table __rcu *snapshot_table; + struct mutex snapshot_table_lock; + struct work_struct snapshot_delete_work; + struct work_struct snapshot_wait_for_pagecache_and_delete_work; + struct snapshot_id_list snapshots_unlinked; + struct mutex snapshots_unlinked_lock; + /* BTREE CACHE */ struct bio_set btree_bio; struct workqueue_struct *io_complete_wq; @@ -665,16 +695,16 @@ struct bch_fs { /* btree_iter.c: */ struct mutex btree_trans_lock; struct list_head btree_trans_list; - mempool_t btree_iters_pool; + mempool_t btree_paths_pool; mempool_t btree_trans_mem_pool; - struct btree_iter_buf __percpu *btree_iters_bufs; + struct btree_path_buf __percpu *btree_paths_bufs; struct srcu_struct btree_trans_barrier; struct btree_key_cache btree_key_cache; struct workqueue_struct *btree_update_wq; - struct workqueue_struct *btree_error_wq; + struct workqueue_struct *btree_io_complete_wq; /* copygc needs its own workqueue for index updates.. 
*/ struct workqueue_struct *copygc_wq; @@ -774,7 +804,7 @@ struct bch_fs { ZSTD_parameters zstd_params; struct crypto_shash *sha256; - struct crypto_skcipher *chacha20; + struct crypto_sync_skcipher *chacha20; struct crypto_shash *poly1305; atomic64_t key_version; @@ -790,6 +820,10 @@ struct bch_fs { struct write_point copygc_write_point; s64 copygc_wait; + /* DATA PROGRESS STATS */ + struct list_head data_progress_list; + struct mutex data_progress_lock; + /* STRIPES: */ GENRADIX(struct stripe) stripes[2]; @@ -825,8 +859,6 @@ struct bch_fs { atomic64_t btree_writes_nr; atomic64_t btree_writes_sectors; - struct bio_list btree_write_error_list; - struct work_struct btree_write_error_work; spinlock_t btree_write_error_lock; /* ERRORS */ diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index 79c0876aab8b..b115bd1fa5a3 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -148,7 +148,8 @@ static inline struct bpos SPOS(__u64 inode, __u64 offset, __u32 snapshot) } #define POS_MIN SPOS(0, 0, 0) -#define POS_MAX SPOS(KEY_INODE_MAX, KEY_OFFSET_MAX, KEY_SNAPSHOT_MAX) +#define POS_MAX SPOS(KEY_INODE_MAX, KEY_OFFSET_MAX, 0) +#define SPOS_MAX SPOS(KEY_INODE_MAX, KEY_OFFSET_MAX, KEY_SNAPSHOT_MAX) #define POS(_inode, _offset) SPOS(_inode, _offset, 0) /* Empty placeholder struct, for container_of() */ @@ -322,7 +323,7 @@ static inline void bkey_init(struct bkey *k) */ #define BCH_BKEY_TYPES() \ x(deleted, 0) \ - x(discard, 1) \ + x(whiteout, 1) \ x(error, 2) \ x(cookie, 3) \ x(hash_whiteout, 4) \ @@ -341,7 +342,11 @@ static inline void bkey_init(struct bkey *k) x(inline_data, 17) \ x(btree_ptr_v2, 18) \ x(indirect_inline_data, 19) \ - x(alloc_v2, 20) + x(alloc_v2, 20) \ + x(subvolume, 21) \ + x(snapshot, 22) \ + x(inode_v2, 23) \ + x(alloc_v3, 24) enum bch_bkey_type { #define x(name, nr) KEY_TYPE_##name = nr, @@ -354,7 +359,7 @@ struct bch_deleted { struct bch_val v; }; -struct bch_discard { +struct bch_whiteout { struct bch_val v; }; @@ -678,6 +683,16 @@ struct bch_inode { __u8 fields[0]; } __attribute__((packed, aligned(8))); +struct bch_inode_v2 { + struct bch_val v; + + __le64 bi_journal_seq; + __le64 bi_hash_seed; + __le64 bi_flags; + __le16 bi_mode; + __u8 fields[0]; +} __attribute__((packed, aligned(8))); + struct bch_inode_generation { struct bch_val v; @@ -685,6 +700,10 @@ struct bch_inode_generation { __le32 pad; } __attribute__((packed, aligned(8))); +/* + * bi_subvol and bi_parent_subvol are only set for subvolume roots: + */ + #define BCH_INODE_FIELDS() \ x(bi_atime, 96) \ x(bi_ctime, 96) \ @@ -708,7 +727,9 @@ struct bch_inode_generation { x(bi_erasure_code, 16) \ x(bi_fields_set, 16) \ x(bi_dir, 64) \ - x(bi_dir_offset, 64) + x(bi_dir_offset, 64) \ + x(bi_subvol, 32) \ + x(bi_parent_subvol, 32) /* subset of BCH_INODE_FIELDS */ #define BCH_INODE_OPTS() \ @@ -763,6 +784,9 @@ LE32_BITMASK(INODE_STR_HASH, struct bch_inode, bi_flags, 20, 24); LE32_BITMASK(INODE_NR_FIELDS, struct bch_inode, bi_flags, 24, 31); LE32_BITMASK(INODE_NEW_VARINT, struct bch_inode, bi_flags, 31, 32); +LE64_BITMASK(INODEv2_STR_HASH, struct bch_inode_v2, bi_flags, 20, 24); +LE64_BITMASK(INODEv2_NR_FIELDS, struct bch_inode_v2, bi_flags, 24, 31); + /* Dirents */ /* @@ -780,7 +804,13 @@ struct bch_dirent { struct bch_val v; /* Target inode number: */ + union { __le64 d_inum; + struct { /* DT_SUBVOL */ + __le32 d_child_subvol; + __le32 d_parent_subvol; + }; + }; /* * Copy of mode bits 12-15 from the target inode - so userspace can get @@ -791,6 +821,9 @@ struct bch_dirent { 
__u8 d_name[]; } __attribute__((packed, aligned(8))); +#define DT_SUBVOL 16 +#define BCH_DT_MAX 17 + #define BCH_NAME_MAX (U8_MAX * sizeof(u64) - \ sizeof(struct bkey) - \ offsetof(struct bch_dirent, d_name)) @@ -848,6 +881,17 @@ struct bch_alloc_v2 { x(stripe, 32) \ x(stripe_redundancy, 8) +struct bch_alloc_v3 { + struct bch_val v; + __le64 journal_seq; + __le32 flags; + __u8 nr_fields; + __u8 gen; + __u8 oldest_gen; + __u8 data_type; + __u8 data[]; +} __attribute__((packed, aligned(8))); + enum { #define x(name, _bits) BCH_ALLOC_FIELD_V1_##name, BCH_ALLOC_FIELDS_V1() @@ -901,18 +945,24 @@ struct bch_stripe { struct bch_reflink_p { struct bch_val v; __le64 idx; - - __le32 reservation_generation; - __u8 nr_replicas; - __u8 pad[3]; -}; + /* + * A reflink pointer might point to an indirect extent which is then + * later split (by copygc or rebalance). If we only pointed to part of + * the original indirect extent, and then one of the fragments is + * outside the range we point to, we'd leak a refcount: so when creating + * reflink pointers, we need to store pad values to remember the full + * range we were taking a reference on. + */ + __le32 front_pad; + __le32 back_pad; +} __attribute__((packed, aligned(8))); struct bch_reflink_v { struct bch_val v; __le64 refcount; union bch_extent_entry start[0]; __u64 _data[0]; -}; +} __attribute__((packed, aligned(8))); struct bch_indirect_inline_data { struct bch_val v; @@ -927,6 +977,43 @@ struct bch_inline_data { u8 data[0]; }; +/* Subvolumes: */ + +#define SUBVOL_POS_MIN POS(0, 1) +#define SUBVOL_POS_MAX POS(0, S32_MAX) +#define BCACHEFS_ROOT_SUBVOL 1 + +struct bch_subvolume { + struct bch_val v; + __le32 flags; + __le32 snapshot; + __le64 inode; +}; + +LE32_BITMASK(BCH_SUBVOLUME_RO, struct bch_subvolume, flags, 0, 1) +/* + * We need to know whether a subvolume is a snapshot so we can know whether we + * can delete it (or whether it should just be rm -rf'd) + */ +LE32_BITMASK(BCH_SUBVOLUME_SNAP, struct bch_subvolume, flags, 1, 2) +LE32_BITMASK(BCH_SUBVOLUME_UNLINKED, struct bch_subvolume, flags, 2, 3) + +/* Snapshots */ + +struct bch_snapshot { + struct bch_val v; + __le32 flags; + __le32 parent; + __le32 children[2]; + __le32 subvol; + __le32 pad; +}; + +LE32_BITMASK(BCH_SNAPSHOT_DELETED, struct bch_snapshot, flags, 0, 1) + +/* True if a subvolume points to this snapshot node: */ +LE32_BITMASK(BCH_SNAPSHOT_SUBVOL, struct bch_snapshot, flags, 1, 2) + /* Optional/variable size superblock sections: */ struct bch_sb_field { @@ -983,8 +1070,6 @@ LE64_BITMASK(BCH_MEMBER_DATA_ALLOWED, struct bch_member, flags[0], 15, 20) LE64_BITMASK(BCH_MEMBER_GROUP, struct bch_member, flags[0], 20, 28) LE64_BITMASK(BCH_MEMBER_DURABILITY, struct bch_member, flags[0], 28, 30) -#define BCH_TIER_MAX 4U - #if 0 LE64_BITMASK(BCH_MEMBER_NR_READ_ERRORS, struct bch_member, flags[1], 0, 20); LE64_BITMASK(BCH_MEMBER_NR_WRITE_ERRORS,struct bch_member, flags[1], 20, 40); @@ -1209,7 +1294,12 @@ enum bcachefs_metadata_version { bcachefs_metadata_version_inode_btree_change = 11, bcachefs_metadata_version_snapshot = 12, bcachefs_metadata_version_inode_backpointers = 13, - bcachefs_metadata_version_max = 14, + bcachefs_metadata_version_btree_ptr_sectors_written = 14, + bcachefs_metadata_version_snapshot_2 = 15, + bcachefs_metadata_version_reflink_p_fix = 16, + bcachefs_metadata_version_subvol_dirent = 17, + bcachefs_metadata_version_inode_v2 = 18, + bcachefs_metadata_version_max = 19, }; #define bcachefs_metadata_version_current (bcachefs_metadata_version_max - 1) @@ -1345,6 +1435,7 @@ 
LE64_BITMASK(BCH_SB_GC_RESERVE_BYTES, struct bch_sb, flags[2], 4, 64); LE64_BITMASK(BCH_SB_ERASURE_CODE, struct bch_sb, flags[3], 0, 16); LE64_BITMASK(BCH_SB_METADATA_TARGET, struct bch_sb, flags[3], 16, 28); LE64_BITMASK(BCH_SB_SHARD_INUMS, struct bch_sb, flags[3], 28, 29); +LE64_BITMASK(BCH_SB_INODES_USE_KEY_CACHE,struct bch_sb, flags[3], 29, 30); /* * Features: @@ -1352,7 +1443,7 @@ LE64_BITMASK(BCH_SB_SHARD_INUMS, struct bch_sb, flags[3], 28, 29); * journal_seq_blacklist_v3: gates BCH_SB_FIELD_journal_seq_blacklist * reflink: gates KEY_TYPE_reflink * inline_data: gates KEY_TYPE_inline_data - * new_siphash: gates BCH_STR_HASH_SIPHASH + * new_siphash: gates BCH_STR_HASH_siphash * new_extent_overwrite: gates BTREE_NODE_NEW_EXTENT_OVERWRITE */ #define BCH_SB_FEATURES() \ @@ -1428,12 +1519,17 @@ enum bch_error_actions { BCH_ON_ERROR_NR }; +#define BCH_STR_HASH_TYPES() \ + x(crc32c, 0) \ + x(crc64, 1) \ + x(siphash_old, 2) \ + x(siphash, 3) + enum bch_str_hash_type { - BCH_STR_HASH_CRC32C = 0, - BCH_STR_HASH_CRC64 = 1, - BCH_STR_HASH_SIPHASH_OLD = 2, - BCH_STR_HASH_SIPHASH = 3, - BCH_STR_HASH_NR = 4, +#define x(t, n) BCH_STR_HASH_##t = n, + BCH_STR_HASH_TYPES() +#undef x + BCH_STR_HASH_NR }; #define BCH_STR_HASH_OPTS() \ @@ -1448,32 +1544,39 @@ enum bch_str_hash_opts { BCH_STR_HASH_OPT_NR }; +#define BCH_CSUM_TYPES() \ + x(none, 0) \ + x(crc32c_nonzero, 1) \ + x(crc64_nonzero, 2) \ + x(chacha20_poly1305_80, 3) \ + x(chacha20_poly1305_128, 4) \ + x(crc32c, 5) \ + x(crc64, 6) \ + x(xxhash, 7) + enum bch_csum_type { - BCH_CSUM_NONE = 0, - BCH_CSUM_CRC32C_NONZERO = 1, - BCH_CSUM_CRC64_NONZERO = 2, - BCH_CSUM_CHACHA20_POLY1305_80 = 3, - BCH_CSUM_CHACHA20_POLY1305_128 = 4, - BCH_CSUM_CRC32C = 5, - BCH_CSUM_CRC64 = 6, - BCH_CSUM_NR = 7, +#define x(t, n) BCH_CSUM_##t = n, + BCH_CSUM_TYPES() +#undef x + BCH_CSUM_NR }; static const unsigned bch_crc_bytes[] = { - [BCH_CSUM_NONE] = 0, - [BCH_CSUM_CRC32C_NONZERO] = 4, - [BCH_CSUM_CRC32C] = 4, - [BCH_CSUM_CRC64_NONZERO] = 8, - [BCH_CSUM_CRC64] = 8, - [BCH_CSUM_CHACHA20_POLY1305_80] = 10, - [BCH_CSUM_CHACHA20_POLY1305_128] = 16, + [BCH_CSUM_none] = 0, + [BCH_CSUM_crc32c_nonzero] = 4, + [BCH_CSUM_crc32c] = 4, + [BCH_CSUM_crc64_nonzero] = 8, + [BCH_CSUM_crc64] = 8, + [BCH_CSUM_xxhash] = 8, + [BCH_CSUM_chacha20_poly1305_80] = 10, + [BCH_CSUM_chacha20_poly1305_128] = 16, }; static inline _Bool bch2_csum_type_is_encryption(enum bch_csum_type type) { switch (type) { - case BCH_CSUM_CHACHA20_POLY1305_80: - case BCH_CSUM_CHACHA20_POLY1305_128: + case BCH_CSUM_chacha20_poly1305_80: + case BCH_CSUM_chacha20_poly1305_128: return true; default: return false; @@ -1483,7 +1586,8 @@ static inline _Bool bch2_csum_type_is_encryption(enum bch_csum_type type) #define BCH_CSUM_OPTS() \ x(none, 0) \ x(crc32c, 1) \ - x(crc64, 2) + x(crc64, 2) \ + x(xxhash, 3) enum bch_csum_opts { #define x(t, n) BCH_CSUM_OPT_##t = n, @@ -1689,7 +1793,9 @@ LE32_BITMASK(JSET_NO_FLUSH, struct jset, flags, 5, 6); x(alloc, 4) \ x(quotas, 5) \ x(stripes, 6) \ - x(reflink, 7) + x(reflink, 7) \ + x(subvolumes, 8) \ + x(snapshots, 9) enum btree_id { #define x(kwd, val) BTREE_ID_##kwd = val, @@ -1736,6 +1842,9 @@ LE32_BITMASK(BSET_BIG_ENDIAN, struct bset, flags, 4, 5); LE32_BITMASK(BSET_SEPARATE_WHITEOUTS, struct bset, flags, 5, 6); +/* Sector offset within the btree node: */ +LE32_BITMASK(BSET_OFFSET, struct bset, flags, 16, 32); + struct btree_node { struct bch_csum csum; __le64 magic; diff --git a/fs/bcachefs/bcachefs_ioctl.h b/fs/bcachefs/bcachefs_ioctl.h index f679fc2151bc..930981ad5535 100644 --- 
a/fs/bcachefs/bcachefs_ioctl.h +++ b/fs/bcachefs/bcachefs_ioctl.h @@ -78,6 +78,9 @@ struct bch_ioctl_incremental { #define BCH_IOCTL_DISK_RESIZE _IOW(0xbc, 14, struct bch_ioctl_disk_resize) #define BCH_IOCTL_DISK_RESIZE_JOURNAL _IOW(0xbc,15, struct bch_ioctl_disk_resize_journal) +#define BCH_IOCTL_SUBVOLUME_CREATE _IOW(0xbc, 16, struct bch_ioctl_subvolume) +#define BCH_IOCTL_SUBVOLUME_DESTROY _IOW(0xbc, 17, struct bch_ioctl_subvolume) + /* ioctl below act on a particular file, not the filesystem as a whole: */ #define BCHFS_IOC_REINHERIT_ATTRS _IOR(0xbc, 64, const char __user *) @@ -349,4 +352,16 @@ struct bch_ioctl_disk_resize_journal { __u64 nbuckets; }; +struct bch_ioctl_subvolume { + __u32 flags; + __u32 dirfd; + __u16 mode; + __u16 pad[3]; + __u64 dst_ptr; + __u64 src_ptr; +}; + +#define BCH_SUBVOL_SNAPSHOT_CREATE (1U << 0) +#define BCH_SUBVOL_SNAPSHOT_RO (1U << 1) + #endif /* _BCACHEFS_IOCTL_H */ diff --git a/fs/bcachefs/bkey.c b/fs/bcachefs/bkey.c index 0053f32c0076..946dd27f09fc 100644 --- a/fs/bcachefs/bkey.c +++ b/fs/bcachefs/bkey.c @@ -620,22 +620,22 @@ const char *bch2_bkey_format_validate(struct bkey_format *f) if (f->nr_fields != BKEY_NR_FIELDS) return "incorrect number of fields"; + /* + * Verify that the packed format can't represent fields larger than the + * unpacked format: + */ for (i = 0; i < f->nr_fields; i++) { unsigned unpacked_bits = bch2_bkey_format_current.bits_per_field[i]; - u64 unpacked_mask = ~((~0ULL << 1) << (unpacked_bits - 1)); + u64 unpacked_max = ~((~0ULL << 1) << (unpacked_bits - 1)); + u64 packed_max = f->bits_per_field[i] + ? ~((~0ULL << 1) << (f->bits_per_field[i] - 1)) + : 0; u64 field_offset = le64_to_cpu(f->field_offset[i]); - if (f->bits_per_field[i] > unpacked_bits) + if (packed_max + field_offset < packed_max || + packed_max + field_offset > unpacked_max) return "field too large"; - if ((f->bits_per_field[i] == unpacked_bits) && field_offset) - return "offset + bits overflow"; - - if (((field_offset + ((1ULL << f->bits_per_field[i]) - 1)) & - unpacked_mask) < - field_offset) - return "offset + bits overflow"; - bits += f->bits_per_field[i]; } diff --git a/fs/bcachefs/bkey.h b/fs/bcachefs/bkey.h index 2e45d88fab03..7dee3d8e0a3d 100644 --- a/fs/bcachefs/bkey.h +++ b/fs/bcachefs/bkey.h @@ -55,7 +55,7 @@ static inline void set_bkey_val_bytes(struct bkey *k, unsigned bytes) #define bkey_deleted(_k) ((_k)->type == KEY_TYPE_deleted) #define bkey_whiteout(_k) \ - ((_k)->type == KEY_TYPE_deleted || (_k)->type == KEY_TYPE_discard) + ((_k)->type == KEY_TYPE_deleted || (_k)->type == KEY_TYPE_whiteout) enum bkey_lr_packed { BKEY_PACKED_BOTH, @@ -163,37 +163,6 @@ static inline struct bpos bpos_max(struct bpos l, struct bpos r) return bpos_cmp(l, r) > 0 ? 
l : r; } -#define sbb(a, b, borrow) \ -do { \ - typeof(a) d1, d2; \ - \ - d1 = a - borrow; \ - borrow = d1 > a; \ - \ - d2 = d1 - b; \ - borrow += d2 > d1; \ - a = d2; \ -} while (0) - -/* returns a - b: */ -static inline struct bpos bpos_sub(struct bpos a, struct bpos b) -{ - int borrow = 0; - - sbb(a.snapshot, b.snapshot, borrow); - sbb(a.offset, b.offset, borrow); - sbb(a.inode, b.inode, borrow); - return a; -} - -static inline struct bpos bpos_diff(struct bpos l, struct bpos r) -{ - if (bpos_cmp(l, r) > 0) - swap(l, r); - - return bpos_sub(r, l); -} - void bch2_bpos_swab(struct bpos *); void bch2_bkey_swab_key(const struct bkey_format *, struct bkey_packed *); diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c index 9f869bed9f1c..5c900cf8a8a2 100644 --- a/fs/bcachefs/bkey_methods.c +++ b/fs/bcachefs/bkey_methods.c @@ -11,6 +11,7 @@ #include "inode.h" #include "quota.h" #include "reflink.h" +#include "subvolume.h" #include "xattr.h" const char * const bch2_bkey_types[] = { @@ -30,7 +31,7 @@ static const char *deleted_key_invalid(const struct bch_fs *c, .key_invalid = deleted_key_invalid, \ } -#define bch2_bkey_ops_discard (struct bkey_ops) { \ +#define bch2_bkey_ops_whiteout (struct bkey_ops) { \ .key_invalid = deleted_key_invalid, \ } @@ -84,7 +85,7 @@ static void key_type_inline_data_to_text(struct printbuf *out, struct bch_fs *c, .val_to_text = key_type_inline_data_to_text, \ } -static const struct bkey_ops bch2_bkey_ops[] = { +const struct bkey_ops bch2_bkey_ops[] = { #define x(name, nr) [KEY_TYPE_##name] = bch2_bkey_ops_##name, BCH_BKEY_TYPES() #undef x @@ -100,31 +101,54 @@ const char *bch2_bkey_val_invalid(struct bch_fs *c, struct bkey_s_c k) static unsigned bch2_key_types_allowed[] = { [BKEY_TYPE_extents] = + (1U << KEY_TYPE_deleted)| + (1U << KEY_TYPE_whiteout)| (1U << KEY_TYPE_error)| + (1U << KEY_TYPE_cookie)| (1U << KEY_TYPE_extent)| (1U << KEY_TYPE_reservation)| (1U << KEY_TYPE_reflink_p)| (1U << KEY_TYPE_inline_data), [BKEY_TYPE_inodes] = + (1U << KEY_TYPE_deleted)| + (1U << KEY_TYPE_whiteout)| (1U << KEY_TYPE_inode)| + (1U << KEY_TYPE_inode_v2)| (1U << KEY_TYPE_inode_generation), [BKEY_TYPE_dirents] = + (1U << KEY_TYPE_deleted)| + (1U << KEY_TYPE_whiteout)| (1U << KEY_TYPE_hash_whiteout)| (1U << KEY_TYPE_dirent), [BKEY_TYPE_xattrs] = + (1U << KEY_TYPE_deleted)| + (1U << KEY_TYPE_whiteout)| + (1U << KEY_TYPE_cookie)| (1U << KEY_TYPE_hash_whiteout)| (1U << KEY_TYPE_xattr), [BKEY_TYPE_alloc] = + (1U << KEY_TYPE_deleted)| (1U << KEY_TYPE_alloc)| - (1U << KEY_TYPE_alloc_v2), + (1U << KEY_TYPE_alloc_v2)| + (1U << KEY_TYPE_alloc_v3), [BKEY_TYPE_quotas] = + (1U << KEY_TYPE_deleted)| (1U << KEY_TYPE_quota), [BKEY_TYPE_stripes] = + (1U << KEY_TYPE_deleted)| (1U << KEY_TYPE_stripe), [BKEY_TYPE_reflink] = + (1U << KEY_TYPE_deleted)| (1U << KEY_TYPE_reflink_v)| (1U << KEY_TYPE_indirect_inline_data), + [BKEY_TYPE_subvolumes] = + (1U << KEY_TYPE_deleted)| + (1U << KEY_TYPE_subvolume), + [BKEY_TYPE_snapshots] = + (1U << KEY_TYPE_deleted)| + (1U << KEY_TYPE_snapshot), [BKEY_TYPE_btree] = + (1U << KEY_TYPE_deleted)| (1U << KEY_TYPE_btree_ptr)| (1U << KEY_TYPE_btree_ptr_v2), }; @@ -132,21 +156,18 @@ static unsigned bch2_key_types_allowed[] = { const char *__bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k, enum btree_node_type type) { - unsigned key_types_allowed = (1U << KEY_TYPE_deleted)| - bch2_key_types_allowed[type] ; - if (k.k->u64s < BKEY_U64s) return "u64s too small"; - if (!(key_types_allowed & (1U << k.k->type))) + if (!(bch2_key_types_allowed[type] & (1U << 
k.k->type))) return "invalid key type for this btree"; if (type == BKEY_TYPE_btree && bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX) return "value too big"; - if (btree_node_type_is_extents(type)) { - if ((k.k->size == 0) != bkey_deleted(k.k)) + if (btree_node_type_is_extents(type) && !bkey_whiteout(k.k)) { + if (k.k->size == 0) return "bad size field"; if (k.k->size > k.k->p.offset) @@ -163,7 +184,7 @@ const char *__bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k, if (type != BKEY_TYPE_btree && btree_type_has_snapshots(type) && - k.k->p.snapshot != U32_MAX) + !k.k->p.snapshot) return "invalid snapshot field"; if (type != BKEY_TYPE_btree && @@ -213,6 +234,8 @@ void bch2_bpos_to_text(struct printbuf *out, struct bpos pos) pr_buf(out, "POS_MIN"); else if (!bpos_cmp(pos, POS_MAX)) pr_buf(out, "POS_MAX"); + else if (!bpos_cmp(pos, SPOS_MAX)) + pr_buf(out, "SPOS_MAX"); else { if (pos.inode == U64_MAX) pr_buf(out, "U64_MAX"); @@ -267,7 +290,7 @@ void bch2_bkey_val_to_text(struct printbuf *out, struct bch_fs *c, { bch2_bkey_to_text(out, k.k); - if (k.k) { + if (bkey_val_bytes(k.k)) { pr_buf(out, ": "); bch2_val_to_text(out, c, k); } @@ -290,24 +313,11 @@ bool bch2_bkey_normalize(struct bch_fs *c, struct bkey_s k) : false; } -enum merge_result bch2_bkey_merge(struct bch_fs *c, - struct bkey_s l, struct bkey_s r) +bool bch2_bkey_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r) { const struct bkey_ops *ops = &bch2_bkey_ops[l.k->type]; - enum merge_result ret; - - if (bch2_key_merging_disabled || - !ops->key_merge || - l.k->type != r.k->type || - bversion_cmp(l.k->version, r.k->version) || - bpos_cmp(l.k->p, bkey_start_pos(r.k))) - return BCH_MERGE_NOMERGE; - - ret = ops->key_merge(c, l, r); - if (ret != BCH_MERGE_NOMERGE) - l.k->needs_whiteout |= r.k->needs_whiteout; - return ret; + return bch2_bkey_maybe_mergable(l.k, r.k) && ops->key_merge(c, l, r); } static const struct old_bkey_type { diff --git a/fs/bcachefs/bkey_methods.h b/fs/bcachefs/bkey_methods.h index bfa6f112aeed..3012035db1a3 100644 --- a/fs/bcachefs/bkey_methods.h +++ b/fs/bcachefs/bkey_methods.h @@ -11,17 +11,6 @@ enum btree_node_type; extern const char * const bch2_bkey_types[]; -enum merge_result { - BCH_MERGE_NOMERGE, - - /* - * The keys were mergeable, but would have overflowed size - so instead - * l was changed to the maximum size, and both keys were modified: - */ - BCH_MERGE_PARTIAL, - BCH_MERGE_MERGE, -}; - struct bkey_ops { /* Returns reason for being invalid if invalid, else NULL: */ const char * (*key_invalid)(const struct bch_fs *, @@ -30,13 +19,14 @@ struct bkey_ops { struct bkey_s_c); void (*swab)(struct bkey_s); bool (*key_normalize)(struct bch_fs *, struct bkey_s); - enum merge_result (*key_merge)(struct bch_fs *, - struct bkey_s, struct bkey_s); + bool (*key_merge)(struct bch_fs *, struct bkey_s, struct bkey_s_c); void (*compat)(enum btree_id id, unsigned version, unsigned big_endian, int write, struct bkey_s); }; +extern const struct bkey_ops bch2_bkey_ops[]; + const char *bch2_bkey_val_invalid(struct bch_fs *, struct bkey_s_c); const char *__bch2_bkey_invalid(struct bch_fs *, struct bkey_s_c, enum btree_node_type); @@ -57,8 +47,17 @@ void bch2_bkey_swab_val(struct bkey_s); bool bch2_bkey_normalize(struct bch_fs *, struct bkey_s); -enum merge_result bch2_bkey_merge(struct bch_fs *, - struct bkey_s, struct bkey_s); +static inline bool bch2_bkey_maybe_mergable(const struct bkey *l, const struct bkey *r) +{ + return l->type == r->type && + !bversion_cmp(l->version, r->version) && + !bpos_cmp(l->p, 
bkey_start_pos(r)) && + (u64) l->size + r->size <= KEY_SIZE_MAX && + bch2_bkey_ops[l->type].key_merge && + !bch2_key_merging_disabled; +} + +bool bch2_bkey_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c); void bch2_bkey_renumber(enum btree_node_type, struct bkey_packed *, int); diff --git a/fs/bcachefs/bset.c b/fs/bcachefs/bset.c index 8a149e21d0b4..59e4c1d1a2a5 100644 --- a/fs/bcachefs/bset.c +++ b/fs/bcachefs/bset.c @@ -197,9 +197,11 @@ void bch2_btree_node_iter_verify(struct btree_node_iter *iter, return; /* Verify no duplicates: */ - btree_node_iter_for_each(iter, set) + btree_node_iter_for_each(iter, set) { + BUG_ON(set->k > set->end); btree_node_iter_for_each(iter, s2) BUG_ON(set != s2 && set->end == s2->end); + } /* Verify that set->end is correct: */ btree_node_iter_for_each(iter, set) { @@ -1193,7 +1195,7 @@ static struct bkey_packed *bset_search_write_set(const struct btree *b, static inline void prefetch_four_cachelines(void *p) { -#if CONFIG_X86_64 +#ifdef CONFIG_X86_64 asm("prefetcht0 (-127 + 64 * 0)(%0);" "prefetcht0 (-127 + 64 * 1)(%0);" "prefetcht0 (-127 + 64 * 2)(%0);" diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index 986f9b4b1a21..5ae61e5d3923 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -13,6 +13,8 @@ #include <linux/sched/mm.h> #include <trace/events/bcachefs.h> +struct lock_class_key bch2_btree_node_lock_key; + void bch2_recalc_btree_reserve(struct bch_fs *c) { unsigned i, reserve = 16; @@ -76,8 +78,7 @@ static int btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp) if (!b->data) return -ENOMEM; #ifdef __KERNEL__ - b->aux_data = __vmalloc(btree_aux_data_bytes(b), gfp, - PAGE_KERNEL_EXEC); + b->aux_data = vmalloc_exec(btree_aux_data_bytes(b), gfp); #else b->aux_data = mmap(NULL, btree_aux_data_bytes(b), PROT_READ|PROT_WRITE|PROT_EXEC, @@ -99,7 +100,7 @@ static struct btree *__btree_node_mem_alloc(struct bch_fs *c) return NULL; bkey_btree_ptr_init(&b->key); - six_lock_init(&b->c.lock); + __six_lock_init(&b->c.lock, "b->c.lock", &bch2_btree_node_lock_key); INIT_LIST_HEAD(&b->list); INIT_LIST_HEAD(&b->write_blocked); b->byte_order = ilog2(btree_bytes(c)); @@ -127,7 +128,8 @@ struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *c) void bch2_btree_node_hash_remove(struct btree_cache *bc, struct btree *b) { - rhashtable_remove_fast(&bc->table, &b->hash, bch_btree_cache_params); + int ret = rhashtable_remove_fast(&bc->table, &b->hash, bch_btree_cache_params); + BUG_ON(ret); /* Cause future lookups for this node to fail: */ b->hash_val = 0; @@ -185,6 +187,17 @@ static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush) int ret = 0; lockdep_assert_held(&bc->lock); +wait_on_io: + if (b->flags & ((1U << BTREE_NODE_dirty)| + (1U << BTREE_NODE_read_in_flight)| + (1U << BTREE_NODE_write_in_flight))) { + if (!flush) + return -ENOMEM; + + /* XXX: waiting on IO with btree cache lock held */ + bch2_btree_node_wait_on_read(b); + bch2_btree_node_wait_on_write(b); + } if (!six_trylock_intent(&b->c.lock)) return -ENOMEM; @@ -192,25 +205,26 @@ static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush) if (!six_trylock_write(&b->c.lock)) goto out_unlock_intent; + /* recheck under lock */ + if (b->flags & ((1U << BTREE_NODE_read_in_flight)| + (1U << BTREE_NODE_write_in_flight))) { + if (!flush) + goto out_unlock; + six_unlock_write(&b->c.lock); + six_unlock_intent(&b->c.lock); + goto wait_on_io; + } + if (btree_node_noevict(b)) goto out_unlock; if (!btree_node_may_write(b)) 
goto out_unlock; - if (btree_node_dirty(b) && - test_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags)) - goto out_unlock; - - if (btree_node_dirty(b) || - btree_node_write_in_flight(b) || - btree_node_read_in_flight(b)) { - if (!flush) + if (btree_node_dirty(b)) { + if (!flush || + test_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags)) goto out_unlock; - - wait_on_bit_io(&b->flags, BTREE_NODE_read_in_flight, - TASK_UNINTERRUPTIBLE); - /* * Using the underscore version because we don't want to compact * bsets after the write, since this node is about to be evicted @@ -220,10 +234,11 @@ static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush) if (bch2_verify_btree_ondisk) bch2_btree_node_write(c, b, SIX_LOCK_intent); else - __bch2_btree_node_write(c, b); + __bch2_btree_node_write(c, b, false); - /* wait for any in flight btree write */ - btree_node_wait_on_io(b); + six_unlock_write(&b->c.lock); + six_unlock_intent(&b->c.lock); + goto wait_on_io; } out: if (b->hash_val && !ret) @@ -286,7 +301,7 @@ static unsigned long bch2_btree_cache_scan(struct shrinker *shrink, list_for_each_entry_safe(b, t, &bc->freeable, list) { touched++; - if (freed >= nr) + if (touched >= nr) break; if (++i > 3 && @@ -301,7 +316,7 @@ restart: list_for_each_entry_safe(b, t, &bc->live, list) { touched++; - if (freed >= nr) { + if (touched >= nr) { /* Save position */ if (&t->list != &bc->live) list_move_tail(&bc->live, &t->list); @@ -573,6 +588,7 @@ got_node: } BUG_ON(btree_node_hashed(b)); + BUG_ON(btree_node_dirty(b)); BUG_ON(btree_node_write_in_flight(b)); out: b->flags = 0; @@ -617,7 +633,8 @@ err: /* Slowpath, don't want it inlined into btree_iter_traverse() */ static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c, - struct btree_iter *iter, + struct btree_trans *trans, + struct btree_path *path, const struct bkey_i *k, enum btree_id btree_id, unsigned level, @@ -626,14 +643,17 @@ static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c, { struct btree_cache *bc = &c->btree_cache; struct btree *b; + u32 seq; BUG_ON(level + 1 >= BTREE_MAX_DEPTH); /* * Parent node must be locked, else we could read in a btree node that's * been freed: */ - if (iter && !bch2_btree_node_relock(iter, level + 1)) + if (trans && !bch2_btree_node_relock(trans, path, level + 1)) { + btree_trans_restart(trans); return ERR_PTR(-EINTR); + } b = bch2_btree_node_mem_alloc(c); if (IS_ERR(b)) @@ -655,25 +675,32 @@ static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c, return NULL; } - /* - * Unlock before doing IO: - * - * XXX: ideally should be dropping all btree node locks here - */ - if (iter && btree_node_read_locked(iter, level + 1)) - btree_node_unlock(iter, level + 1); - - bch2_btree_node_read(c, b, sync); + set_btree_node_read_in_flight(b); six_unlock_write(&b->c.lock); + seq = b->c.lock.state.seq; + six_unlock_intent(&b->c.lock); - if (!sync) { - six_unlock_intent(&b->c.lock); + /* Unlock before doing IO: */ + if (trans && sync) + bch2_trans_unlock(trans); + + bch2_btree_node_read(c, b, sync); + + if (!sync) return NULL; + + if (trans && + (!bch2_trans_relock(trans) || + !bch2_btree_path_relock_intent(trans, path))) { + BUG_ON(!trans->restarted); + return ERR_PTR(-EINTR); } - if (lock_type == SIX_LOCK_read) - six_lock_downgrade(&b->c.lock); + if (!six_relock_type(&b->c.lock, lock_type, seq)) { + btree_trans_restart(trans); + return ERR_PTR(-EINTR); + } return b; } @@ -688,26 +715,25 @@ static int lock_node_check_fn(struct six_lock *lock, void *p) static noinline void btree_bad_header(struct 
bch_fs *c, struct btree *b) { - char buf1[100], buf2[100], buf3[100], buf4[100]; + char buf1[200], buf2[100], buf3[100]; if (!test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags)) return; - bch2_bpos_to_text(&PBUF(buf1), b->key.k.type == KEY_TYPE_btree_ptr_v2 - ? bkey_i_to_btree_ptr_v2(&b->key)->v.min_key - : POS_MIN); + bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(&b->key)); bch2_bpos_to_text(&PBUF(buf2), b->data->min_key); + bch2_bpos_to_text(&PBUF(buf3), b->data->max_key); - bch2_bpos_to_text(&PBUF(buf3), b->key.k.p); - bch2_bpos_to_text(&PBUF(buf4), b->data->max_key); bch2_fs_inconsistent(c, "btree node header doesn't match ptr\n" - "btree: ptr %u header %llu\n" - "level: ptr %u header %llu\n" - "min ptr %s node header %s\n" - "max ptr %s node header %s", - b->c.btree_id, BTREE_NODE_ID(b->data), - b->c.level, BTREE_NODE_LEVEL(b->data), - buf1, buf2, buf3, buf4); + "btree %s level %u\n" + "ptr: %s\n" + "header: btree %s level %llu\n" + "min %s max %s\n", + bch2_btree_ids[b->c.btree_id], b->c.level, + buf1, + bch2_btree_ids[BTREE_NODE_ID(b->data)], + BTREE_NODE_LEVEL(b->data), + buf2, buf3); } static inline void btree_check_header(struct bch_fs *c, struct btree *b) @@ -730,20 +756,28 @@ static inline void btree_check_header(struct bch_fs *c, struct btree *b) * The btree node will have either a read or a write lock held, depending on * the @write parameter. */ -struct btree *bch2_btree_node_get(struct bch_fs *c, struct btree_iter *iter, +struct btree *bch2_btree_node_get(struct btree_trans *trans, struct btree_path *path, const struct bkey_i *k, unsigned level, enum six_lock_type lock_type, unsigned long trace_ip) { + struct bch_fs *c = trans->c; struct btree_cache *bc = &c->btree_cache; struct btree *b; struct bset_tree *t; EBUG_ON(level >= BTREE_MAX_DEPTH); - b = btree_node_mem_ptr(k); - if (b) - goto lock_node; + if (c->opts.btree_node_mem_ptr_optimization) { + b = btree_node_mem_ptr(k); + /* + * Check b->hash_val _before_ calling btree_node_lock() - this + * might not be the node we want anymore, and trying to lock the + * wrong node could cause an unneccessary transaction restart: + */ + if (b && b->hash_val == btree_ptr_hash_val(k)) + goto lock_node; + } retry: b = btree_cache_find(bc, k); if (unlikely(!b)) { @@ -752,7 +786,7 @@ retry: * else we could read in a btree node from disk that's been * freed: */ - b = bch2_btree_node_fill(c, iter, k, iter->btree_id, + b = bch2_btree_node_fill(c, trans, path, k, path->btree_id, level, lock_type, true); /* We raced and found the btree node in the cache */ @@ -791,12 +825,12 @@ lock_node: * the parent was modified, when the pointer to the node we want * was removed - and we'll bail out: */ - if (btree_node_read_locked(iter, level + 1)) - btree_node_unlock(iter, level + 1); + if (btree_node_read_locked(path, level + 1)) + btree_node_unlock(path, level + 1); - if (!btree_node_lock(b, k->k.p, level, iter, lock_type, + if (!btree_node_lock(trans, path, b, k->k.p, level, lock_type, lock_node_check_fn, (void *) k, trace_ip)) { - if (b->hash_val != btree_ptr_hash_val(k)) + if (!trans->restarted) goto retry; return ERR_PTR(-EINTR); } @@ -805,20 +839,40 @@ lock_node: b->c.level != level || race_fault())) { six_unlock_type(&b->c.lock, lock_type); - if (bch2_btree_node_relock(iter, level + 1)) + if (bch2_btree_node_relock(trans, path, level + 1)) goto retry; - trace_trans_restart_btree_node_reused(iter->trans->ip, + trace_trans_restart_btree_node_reused(trans->ip, trace_ip, - iter->btree_id, - &iter->real_pos); + path->btree_id, + &path->pos); + 
btree_trans_restart(trans); return ERR_PTR(-EINTR); } } - /* XXX: waiting on IO with btree locks held: */ - wait_on_bit_io(&b->flags, BTREE_NODE_read_in_flight, - TASK_UNINTERRUPTIBLE); + if (unlikely(btree_node_read_in_flight(b))) { + u32 seq = b->c.lock.state.seq; + + six_unlock_type(&b->c.lock, lock_type); + bch2_trans_unlock(trans); + + bch2_btree_node_wait_on_read(b); + + /* + * should_be_locked is not set on this path yet, so we need to + * relock it specifically: + */ + if (trans && + (!bch2_trans_relock(trans) || + !bch2_btree_path_relock_intent(trans, path))) { + BUG_ON(!trans->restarted); + return ERR_PTR(-EINTR); + } + + if (!six_relock_type(&b->c.lock, lock_type, seq)) + goto retry; + } prefetch(b->aux_data); @@ -839,7 +893,7 @@ lock_node: return ERR_PTR(-EIO); } - EBUG_ON(b->c.btree_id != iter->btree_id); + EBUG_ON(b->c.btree_id != path->btree_id); EBUG_ON(BTREE_NODE_LEVEL(b->data) != level); btree_check_header(c, b); @@ -859,16 +913,18 @@ struct btree *bch2_btree_node_get_noiter(struct bch_fs *c, EBUG_ON(level >= BTREE_MAX_DEPTH); - b = btree_node_mem_ptr(k); - if (b) - goto lock_node; + if (c->opts.btree_node_mem_ptr_optimization) { + b = btree_node_mem_ptr(k); + if (b) + goto lock_node; + } retry: b = btree_cache_find(bc, k); if (unlikely(!b)) { if (nofill) goto out; - b = bch2_btree_node_fill(c, NULL, k, btree_id, + b = bch2_btree_node_fill(c, NULL, NULL, k, btree_id, level, SIX_LOCK_read, true); /* We raced and found the btree node in the cache */ @@ -896,8 +952,7 @@ lock_node: } /* XXX: waiting on IO with btree locks held: */ - wait_on_bit_io(&b->flags, BTREE_NODE_read_in_flight, - TASK_UNINTERRUPTIBLE); + __bch2_btree_node_wait_on_read(b); prefetch(b->aux_data); @@ -927,21 +982,25 @@ out: return b; } -void bch2_btree_node_prefetch(struct bch_fs *c, struct btree_iter *iter, - const struct bkey_i *k, - enum btree_id btree_id, unsigned level) +int bch2_btree_node_prefetch(struct bch_fs *c, + struct btree_trans *trans, + struct btree_path *path, + const struct bkey_i *k, + enum btree_id btree_id, unsigned level) { struct btree_cache *bc = &c->btree_cache; struct btree *b; - BUG_ON(iter && !btree_node_locked(iter, level + 1)); + BUG_ON(trans && !btree_node_locked(path, level + 1)); BUG_ON(level >= BTREE_MAX_DEPTH); b = btree_cache_find(bc, k); if (b) - return; + return 0; - bch2_btree_node_fill(c, iter, k, btree_id, level, SIX_LOCK_read, false); + b = bch2_btree_node_fill(c, trans, path, k, btree_id, + level, SIX_LOCK_read, false); + return PTR_ERR_OR_ZERO(b); } void bch2_btree_node_evict(struct bch_fs *c, const struct bkey_i *k) @@ -952,16 +1011,24 @@ void bch2_btree_node_evict(struct bch_fs *c, const struct bkey_i *k) b = btree_cache_find(bc, k); if (!b) return; +wait_on_io: + /* not allowed to wait on io with btree locks held: */ + + /* XXX we're called from btree_gc which will be holding other btree + * nodes locked + * */ + __bch2_btree_node_wait_on_read(b); + __bch2_btree_node_wait_on_write(b); six_lock_intent(&b->c.lock, NULL, NULL); six_lock_write(&b->c.lock, NULL, NULL); - wait_on_bit_io(&b->flags, BTREE_NODE_read_in_flight, - TASK_UNINTERRUPTIBLE); - __bch2_btree_node_write(c, b); - - /* wait for any in flight btree write */ - btree_node_wait_on_io(b); + if (btree_node_dirty(b)) { + __bch2_btree_node_write(c, b, false); + six_unlock_write(&b->c.lock); + six_unlock_intent(&b->c.lock); + goto wait_on_io; + } BUG_ON(btree_node_dirty(b)); diff --git a/fs/bcachefs/btree_cache.h b/fs/bcachefs/btree_cache.h index 40dd263a7caa..402cec1802bc 100644 --- 
diff --git a/fs/bcachefs/btree_cache.h b/fs/bcachefs/btree_cache.h
index 40dd263a7caa..402cec1802bc 100644
--- a/fs/bcachefs/btree_cache.h
+++ b/fs/bcachefs/btree_cache.h
@@ -5,6 +5,8 @@
 #include "bcachefs.h"
 #include "btree_types.h"

+extern struct lock_class_key bch2_btree_node_lock_key;
+
 struct btree_iter;

 void bch2_recalc_btree_reserve(struct bch_fs *);
@@ -20,15 +22,15 @@ int bch2_btree_cache_cannibalize_lock(struct bch_fs *, struct closure *);
 struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *);
 struct btree *bch2_btree_node_mem_alloc(struct bch_fs *);

-struct btree *bch2_btree_node_get(struct bch_fs *, struct btree_iter *,
+struct btree *bch2_btree_node_get(struct btree_trans *, struct btree_path *,
				  const struct bkey_i *, unsigned,
				  enum six_lock_type, unsigned long);

 struct btree *bch2_btree_node_get_noiter(struct bch_fs *, const struct bkey_i *,
					 enum btree_id, unsigned, bool);

-void bch2_btree_node_prefetch(struct bch_fs *, struct btree_iter *,
-			      const struct bkey_i *, enum btree_id, unsigned);
+int bch2_btree_node_prefetch(struct bch_fs *, struct btree_trans *, struct btree_path *,
+			     const struct bkey_i *, enum btree_id, unsigned);

 void bch2_btree_node_evict(struct bch_fs *, const struct bkey_i *);

diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c
index ba560fbd5f36..091bddee575d 100644
--- a/fs/bcachefs/btree_gc.c
+++ b/fs/bcachefs/btree_gc.c
@@ -36,6 +36,9 @@
 #include <linux/sched/task.h>
 #include <trace/events/bcachefs.h>

+#define DROP_THIS_NODE		10
+#define DROP_PREV_NODE		11
+
 static inline void __gc_pos_set(struct bch_fs *c, struct gc_pos new_pos)
 {
	preempt_disable();
@@ -83,12 +86,17 @@ static int bch2_gc_check_topology(struct bch_fs *c,

	if (bpos_cmp(expected_start, bp->v.min_key)) {
		bch2_topology_error(c);

-		if (fsck_err(c, "btree node with incorrect min_key at btree %s level %u:\n"
-			     "  prev %s\n"
-			     "  cur %s",
-			     bch2_btree_ids[b->c.btree_id], b->c.level,
-			     buf1,
-			     (bch2_bkey_val_to_text(&PBUF(buf2), c, bkey_i_to_s_c(cur.k)), buf2))) {
+		if (__fsck_err(c,
+			       FSCK_CAN_FIX|
+			       FSCK_CAN_IGNORE|
+			       FSCK_NO_RATELIMIT,
+			       "btree node with incorrect min_key at btree %s level %u:\n"
+			       "  prev %s\n"
+			       "  cur %s",
+			       bch2_btree_ids[b->c.btree_id], b->c.level,
+			       buf1,
+			       (bch2_bkey_val_to_text(&PBUF(buf2), c, bkey_i_to_s_c(cur.k)), buf2)) &&
+		    !test_bit(BCH_FS_TOPOLOGY_REPAIR_DONE, &c->flags)) {
			bch_info(c, "Halting mark and sweep to start topology repair pass");
			return FSCK_ERR_START_TOPOLOGY_REPAIR;
		} else {
@@ -100,12 +108,17 @@ static int bch2_gc_check_topology(struct bch_fs *c,
	if (is_last && bpos_cmp(cur.k->k.p, node_end)) {
		bch2_topology_error(c);

-		if (fsck_err(c, "btree node with incorrect max_key at btree %s level %u:\n"
-			     "  %s\n"
-			     "  expected %s",
-			     bch2_btree_ids[b->c.btree_id], b->c.level,
-			     (bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(cur.k)), buf1),
-			     (bch2_bpos_to_text(&PBUF(buf2), node_end), buf2))) {
+		if (__fsck_err(c,
+			       FSCK_CAN_FIX|
+			       FSCK_CAN_IGNORE|
+			       FSCK_NO_RATELIMIT,
+			       "btree node with incorrect max_key at btree %s level %u:\n"
+			       "  %s\n"
+			       "  expected %s",
+			       bch2_btree_ids[b->c.btree_id], b->c.level,
+			       (bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(cur.k)), buf1),
+			       (bch2_bpos_to_text(&PBUF(buf2), node_end), buf2)) &&
+		    !test_bit(BCH_FS_TOPOLOGY_REPAIR_DONE, &c->flags)) {
			bch_info(c, "Halting mark and sweep to start topology repair pass");
			return FSCK_ERR_START_TOPOLOGY_REPAIR;
		} else {
@@ -203,8 +216,8 @@ static int set_node_max(struct bch_fs *c, struct btree *b, struct bpos new_max)
	return 0;
}

-static int btree_repair_node_start(struct bch_fs *c, struct btree *b,
-				   struct btree *prev, struct btree *cur)
+static int btree_repair_node_boundaries(struct bch_fs *c, struct btree *b,
+					struct btree *prev, struct btree *cur)
 {
	struct bpos expected_start = !prev
		? b->data->min_key
@@ -220,22 +233,50 @@ static int btree_repair_node_start(struct bch_fs *c, struct btree *b,
		bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(&prev->key));
	}

-	if (mustfix_fsck_err_on(bpos_cmp(expected_start, cur->data->min_key), c,
-			"btree node with incorrect min_key at btree %s level %u:\n"
-			"  prev %s\n"
-			"  cur %s",
-			bch2_btree_ids[b->c.btree_id], b->c.level,
-			buf1,
-			(bch2_bkey_val_to_text(&PBUF(buf2), c, bkey_i_to_s_c(&cur->key)), buf2))) {
-		if (prev &&
-		    bpos_cmp(expected_start, cur->data->min_key) > 0 &&
-		    BTREE_NODE_SEQ(cur->data) > BTREE_NODE_SEQ(prev->data))
+	bch2_bkey_val_to_text(&PBUF(buf2), c, bkey_i_to_s_c(&cur->key));
+
+	if (prev &&
+	    bpos_cmp(expected_start, cur->data->min_key) > 0 &&
+	    BTREE_NODE_SEQ(cur->data) > BTREE_NODE_SEQ(prev->data)) {
+		/* cur overwrites prev: */
+
+		if (mustfix_fsck_err_on(bpos_cmp(prev->data->min_key,
+						 cur->data->min_key) >= 0, c,
+				"btree node overwritten by next node at btree %s level %u:\n"
+				"  node %s\n"
+				"  next %s",
+				bch2_btree_ids[b->c.btree_id], b->c.level,
+				buf1, buf2))
+			return DROP_PREV_NODE;
+
+		if (mustfix_fsck_err_on(bpos_cmp(prev->key.k.p,
+						 bpos_predecessor(cur->data->min_key)), c,
+				"btree node with incorrect max_key at btree %s level %u:\n"
+				"  node %s\n"
+				"  next %s",
+				bch2_btree_ids[b->c.btree_id], b->c.level,
+				buf1, buf2))
			ret = set_node_max(c, prev,
-					   bpos_predecessor(cur->data->min_key));
-		else
-			ret = set_node_min(c, cur, expected_start);
-		if (ret)
-			return ret;
+					   bpos_predecessor(cur->data->min_key));
+	} else {
+		/* prev overwrites cur: */
+
+		if (mustfix_fsck_err_on(bpos_cmp(expected_start,
+						 cur->data->max_key) >= 0, c,
+				"btree node overwritten by prev node at btree %s level %u:\n"
+				"  prev %s\n"
+				"  node %s",
+				bch2_btree_ids[b->c.btree_id], b->c.level,
+				buf1, buf2))
+			return DROP_THIS_NODE;
+
+		if (mustfix_fsck_err_on(bpos_cmp(expected_start, cur->data->min_key), c,
+				"btree node with incorrect min_key at btree %s level %u:\n"
+				"  prev %s\n"
+				"  node %s",
+				bch2_btree_ids[b->c.btree_id], b->c.level,
+				buf1, buf2))
+			ret = set_node_min(c, cur, expected_start);
	}
fsck_err:
	return ret;
@@ -262,13 +303,11 @@ fsck_err:
	return ret;
}

-#define DROP_THIS_NODE	10
-
 static int bch2_btree_repair_topology_recurse(struct bch_fs *c, struct btree *b)
 {
	struct btree_and_journal_iter iter;
	struct bkey_s_c k;
-	struct bkey_buf tmp;
+	struct bkey_buf prev_k, cur_k;
	struct btree *prev = NULL, *cur = NULL;
	bool have_child, dropped_children = false;
	char buf[200];
@@ -277,15 +316,20 @@ static int bch2_btree_repair_topology_recurse(struct bch_fs *c, struct btree *b)
	if (!b->c.level)
		return 0;
again:
+	prev = NULL;
	have_child = dropped_children = false;
-	bch2_bkey_buf_init(&tmp);
+	bch2_bkey_buf_init(&prev_k);
+	bch2_bkey_buf_init(&cur_k);
	bch2_btree_and_journal_iter_init_node_iter(&iter, c, b);

	while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) {
+		BUG_ON(bpos_cmp(k.k->p, b->data->min_key) < 0);
+		BUG_ON(bpos_cmp(k.k->p, b->data->max_key) > 0);
+
		bch2_btree_and_journal_iter_advance(&iter);
-		bch2_bkey_buf_reassemble(&tmp, c, k);
+		bch2_bkey_buf_reassemble(&cur_k, c, k);

-		cur = bch2_btree_node_get_noiter(c, tmp.k,
+		cur = bch2_btree_node_get_noiter(c, cur_k.k,
						 b->c.btree_id, b->c.level - 1,
						 false);
		ret = PTR_ERR_OR_ZERO(cur);
@@ -295,12 +339,12 @@ again:
				"  %s",
				bch2_btree_ids[b->c.btree_id],
				b->c.level - 1,
-				(bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(tmp.k)), buf))) {
-			bch2_btree_node_evict(c, tmp.k);
+				(bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(cur_k.k)), buf))) {
+			bch2_btree_node_evict(c, cur_k.k);
			ret = bch2_journal_key_delete(c, b->c.btree_id,
-						      b->c.level, tmp.k->k.p);
+						      b->c.level, cur_k.k->k.p);
			if (ret)
-				goto err;
+				break;
			continue;
		}
@@ -310,14 +354,39 @@ again:
			break;
		}

-		ret = btree_repair_node_start(c, b, prev, cur);
+		ret = btree_repair_node_boundaries(c, b, prev, cur);
+
+		if (ret == DROP_THIS_NODE) {
+			six_unlock_read(&cur->c.lock);
+			bch2_btree_node_evict(c, cur_k.k);
+			ret = bch2_journal_key_delete(c, b->c.btree_id,
+						      b->c.level, cur_k.k->k.p);
+			if (ret)
+				break;
+			continue;
+		}
+
		if (prev)
			six_unlock_read(&prev->c.lock);
-		prev = cur;
-		cur = NULL;
+		prev = NULL;

-		if (ret)
+		if (ret == DROP_PREV_NODE) {
+			bch2_btree_node_evict(c, prev_k.k);
+			ret = bch2_journal_key_delete(c, b->c.btree_id,
+						      b->c.level, prev_k.k->k.p);
+			if (ret)
+				break;
+
+			bch2_btree_and_journal_iter_exit(&iter);
+			bch2_bkey_buf_exit(&prev_k, c);
+			bch2_bkey_buf_exit(&cur_k, c);
+			goto again;
+		} else if (ret)
			break;
+
+		prev = cur;
+		cur = NULL;
+		bch2_bkey_buf_copy(&prev_k, c, cur_k.k);
	}

	if (!ret && !IS_ERR_OR_NULL(prev)) {
@@ -339,10 +408,10 @@ again:
	bch2_btree_and_journal_iter_init_node_iter(&iter, c, b);

	while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) {
-		bch2_bkey_buf_reassemble(&tmp, c, k);
+		bch2_bkey_buf_reassemble(&cur_k, c, k);
		bch2_btree_and_journal_iter_advance(&iter);

-		cur = bch2_btree_node_get_noiter(c, tmp.k,
+		cur = bch2_btree_node_get_noiter(c, cur_k.k,
						 b->c.btree_id, b->c.level - 1,
						 false);
		ret = PTR_ERR_OR_ZERO(cur);
@@ -358,9 +427,9 @@ again:
		cur = NULL;

		if (ret == DROP_THIS_NODE) {
-			bch2_btree_node_evict(c, tmp.k);
+			bch2_btree_node_evict(c, cur_k.k);
			ret = bch2_journal_key_delete(c, b->c.btree_id,
-						      b->c.level, tmp.k->k.p);
+						      b->c.level, cur_k.k->k.p);
			dropped_children = true;
		}
@@ -385,7 +454,8 @@ fsck_err:
		six_unlock_read(&cur->c.lock);

	bch2_btree_and_journal_iter_exit(&iter);
-	bch2_bkey_buf_exit(&tmp, c);
+	bch2_bkey_buf_exit(&prev_k, c);
+	bch2_bkey_buf_exit(&cur_k, c);

	if (!ret && dropped_children)
		goto again;
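The repair pass above distinguishes "delete the current child" from "delete the previous child" with the positive sentinel codes DROP_THIS_NODE and DROP_PREV_NODE, which can never collide with negative errnos. A toy sketch of the same pattern, with hypothetical names and simplified range logic (not bcachefs code):

#include <stdio.h>
#include <errno.h>

/* Sentinel-return pattern: 0 = ok, negative = errno,
 * small positive codes = "caller should drop a node".
 */
#define DROP_THIS_NODE 10
#define DROP_PREV_NODE 11

struct range { int min, max, seq; };

static int repair_boundaries(const struct range *prev, const struct range *cur)
{
	if (!prev)
		return 0;
	if (cur->min <= prev->min)
		/* cur covers prev entirely; the newer sequence number wins */
		return cur->seq > prev->seq ? DROP_PREV_NODE : DROP_THIS_NODE;
	if (cur->min <= prev->max)
		return -EINVAL;  /* partial overlap: not handled in this sketch */
	return 0;
}

int main(void)
{
	struct range a = { 0, 10, 1 }, b = { 0, 20, 2 };

	switch (repair_boundaries(&a, &b)) {
	case DROP_PREV_NODE: puts("drop prev"); break;
	case DROP_THIS_NODE: puts("drop cur");  break;
	case 0:              puts("ok");        break;
	default:             puts("error");     break;
	}
	return 0;
}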
@@ -428,28 +498,16 @@ static int bch2_check_fix_ptrs(struct bch_fs *c, enum btree_id btree_id,
	char buf[200];
	int ret = 0;

+	/*
+	 * XXX
+	 * use check_bucket_ref here
+	 */
	bkey_for_each_ptr_decode(k->k, ptrs, p, entry) {
		struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev);
		struct bucket *g = PTR_BUCKET(ca, &p.ptr, true);
		struct bucket *g2 = PTR_BUCKET(ca, &p.ptr, false);
		enum bch_data_type data_type = bch2_bkey_ptr_data_type(*k, &entry->ptr);

-		if (fsck_err_on(g->mark.data_type &&
-				g->mark.data_type != data_type, c,
-				"bucket %u:%zu different types of data in same bucket: %s, %s\n"
-				"while marking %s",
-				p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
-				bch2_data_types[g->mark.data_type],
-				bch2_data_types[data_type],
-				(bch2_bkey_val_to_text(&PBUF(buf), c, *k), buf))) {
-			if (data_type == BCH_DATA_btree) {
-				g2->_mark.data_type = g->_mark.data_type = data_type;
-				set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags);
-			} else {
-				do_update = true;
-			}
-		}
-
		if (fsck_err_on(!g->gen_valid, c,
				"bucket %u:%zu data type %s ptr gen %u missing in alloc btree\n"
				"while marking %s",
@@ -466,6 +524,19 @@ static int bch2_check_fix_ptrs(struct bch_fs *c, enum btree_id btree_id,
			}
		}

+		if (fsck_err_on(data_type == BCH_DATA_btree &&
+				g->mark.gen != p.ptr.gen, c,
+				"bucket %u:%zu data type %s has metadata but wrong gen: %u != %u\n"
+				"while marking %s",
+				p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
+				bch2_data_types[ptr_data_type(k->k, &p.ptr)],
+				p.ptr.gen, g->mark.gen,
+				(bch2_bkey_val_to_text(&PBUF(buf), c, *k), buf))) {
+			g2->_mark.data_type = g->_mark.data_type = data_type;
+			g2->gen_valid = g->gen_valid = true;
+			set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags);
+		}
+
		if (fsck_err_on(gen_cmp(p.ptr.gen, g->mark.gen) > 0, c,
				"bucket %u:%zu data type %s ptr gen in the future: %u > %u\n"
				"while marking %s",
@@ -486,6 +557,15 @@ static int bch2_check_fix_ptrs(struct bch_fs *c, enum btree_id btree_id,
			}
		}

+		if (fsck_err_on(gen_cmp(g->mark.gen, p.ptr.gen) > BUCKET_GC_GEN_MAX, c,
+				"bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n"
+				"while marking %s",
+				p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), g->mark.gen,
+				bch2_data_types[ptr_data_type(k->k, &p.ptr)],
+				p.ptr.gen,
+				(bch2_bkey_val_to_text(&PBUF(buf), c, *k), buf)))
+			do_update = true;
+
		if (fsck_err_on(!p.ptr.cached &&
				gen_cmp(p.ptr.gen, g->mark.gen) < 0, c,
				"bucket %u:%zu data type %s stale dirty ptr: %u < %u\n"
@@ -496,6 +576,26 @@ static int bch2_check_fix_ptrs(struct bch_fs *c, enum btree_id btree_id,
				(bch2_bkey_val_to_text(&PBUF(buf), c, *k), buf)))
			do_update = true;

+		if (p.ptr.gen != g->mark.gen)
+			continue;
+
+		if (fsck_err_on(g->mark.data_type &&
+				g->mark.data_type != data_type, c,
+				"bucket %u:%zu different types of data in same bucket: %s, %s\n"
+				"while marking %s",
+				p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
+				bch2_data_types[g->mark.data_type],
+				bch2_data_types[data_type],
+				(bch2_bkey_val_to_text(&PBUF(buf), c, *k), buf))) {
+			if (data_type == BCH_DATA_btree) {
+				g2->_mark.data_type = g->_mark.data_type = data_type;
+				g2->gen_valid = g->gen_valid = true;
+				set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags);
+			} else {
+				do_update = true;
+			}
+		}
+
		if (p.has_ec) {
			struct stripe *m = genradix_ptr(&c->stripes[true], p.ec.idx);
@@ -557,6 +657,7 @@ static int bch2_check_fix_ptrs(struct bch_fs *c, enum btree_id btree_id,
				       (!g->gen_valid || gen_cmp(ptr->gen, g->mark.gen) > 0)) ||
				      (!ptr->cached &&
				       gen_cmp(ptr->gen, g->mark.gen) < 0) ||
				      gen_cmp(g->mark.gen, ptr->gen) > BUCKET_GC_GEN_MAX ||
				      (g->mark.data_type &&
				       g->mark.data_type != data_type);
			}));
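The new checks compare bucket generation numbers with gen_cmp() rather than plain < or >. Generations are small counters that wrap, so "newer" has to be decided modulo the counter width. A sketch of the usual trick, assuming gen_cmp is the signed-subtraction comparator used for 8-bit gens:

#include <stdio.h>
#include <stdint.h>

/* Wraparound-safe comparison of 8-bit generation numbers: the signed
 * difference is positive iff a is "newer" than b, valid as long as the
 * two values are within 127 generations of each other.
 */
static inline int gen_cmp(uint8_t a, uint8_t b)
{
	return (int8_t) (a - b);
}

int main(void)
{
	/* 2 is newer than 250 even though it is numerically smaller: */
	printf("%d\n", gen_cmp(2, 250) > 0);   /* 1 */
	printf("%d\n", gen_cmp(250, 2) > 0);   /* 0 */
	return 0;
}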
@@ -601,16 +702,18 @@ fsck_err:

 /* marking of btree keys/nodes: */

-static int bch2_gc_mark_key(struct bch_fs *c, enum btree_id btree_id,
+static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id,
			    unsigned level, bool is_root,
			    struct bkey_s_c *k,
			    u8 *max_stale, bool initial)
 {
+	struct bch_fs *c = trans->c;
	struct bkey_ptrs_c ptrs;
	const struct bch_extent_ptr *ptr;
	unsigned flags =
		BTREE_TRIGGER_GC|
		(initial ? BTREE_TRIGGER_NOATOMIC : 0);
+	char buf[200];
	int ret = 0;

	if (initial) {
@@ -629,8 +732,9 @@ static int bch2_gc_mark_key(struct bch_fs *c, enum btree_id btree_id,

		if (test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) ||
		    fsck_err_on(!bch2_bkey_replicas_marked(c, *k), c,
-				"superblock not marked as containing replicas (type %u)",
-				k->k->type)) {
+				"superblock not marked as containing replicas\n"
+				"  while marking %s",
+				(bch2_bkey_val_to_text(&PBUF(buf), c, *k), buf))) {
			ret = bch2_mark_bkey_replicas(c, *k);
			if (ret) {
				bch_err(c, "error marking bkey replicas: %i", ret);
@@ -650,7 +754,7 @@ static int bch2_gc_mark_key(struct bch_fs *c, enum btree_id btree_id,
		*max_stale = max(*max_stale, ptr_stale(ca, ptr));
	}

-	bch2_mark_key(c, *k, 0, k->k->size, NULL, 0, flags);
+	ret = bch2_mark_key(trans, *k, flags);
fsck_err:
err:
	if (ret)
@@ -658,9 +762,10 @@ err:
	return ret;
}

-static int btree_gc_mark_node(struct bch_fs *c, struct btree *b, u8 *max_stale,
+static int btree_gc_mark_node(struct btree_trans *trans, struct btree *b, u8 *max_stale,
			      bool initial)
 {
+	struct bch_fs *c = trans->c;
	struct btree_node_iter iter;
	struct bkey unpacked;
	struct bkey_s_c k;
@@ -678,7 +783,7 @@ static int btree_gc_mark_node(struct bch_fs *c, struct btree *b, u8 *max_stale,
	bkey_init(&prev.k->k);

	while ((k = bch2_btree_node_iter_peek_unpack(&iter, b, &unpacked)).k) {
-		ret = bch2_gc_mark_key(c, b->c.btree_id, b->c.level, false,
+		ret = bch2_gc_mark_key(trans, b->c.btree_id, b->c.level, false,
				       &k, max_stale, initial);
		if (ret)
			break;
@@ -700,11 +805,11 @@ static int btree_gc_mark_node(struct bch_fs *c, struct btree *b, u8 *max_stale,
	return ret;
}

-static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id,
+static int bch2_gc_btree(struct btree_trans *trans, enum btree_id btree_id,
			 bool initial, bool metadata_only)
 {
-	struct btree_trans trans;
-	struct btree_iter *iter;
+	struct bch_fs *c = trans->c;
+	struct btree_iter iter;
	struct btree *b;
	unsigned depth = metadata_only			? 1
		: bch2_expensive_debug_checks		? 0
@@ -713,39 +818,32 @@ static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id,
	u8 max_stale = 0;
	int ret = 0;

-	bch2_trans_init(&trans, c, 0, 0);
-
	gc_pos_set(c, gc_pos_btree(btree_id, POS_MIN, 0));

-	__for_each_btree_node(&trans, iter, btree_id, POS_MIN,
-			      0, depth, BTREE_ITER_PREFETCH, b) {
+	__for_each_btree_node(trans, iter, btree_id, POS_MIN,
+			      0, depth, BTREE_ITER_PREFETCH, b, ret) {
		bch2_verify_btree_nr_keys(b);

		gc_pos_set(c, gc_pos_btree_node(b));

-		ret = btree_gc_mark_node(c, b, &max_stale, initial);
+		ret = btree_gc_mark_node(trans, b, &max_stale, initial);
		if (ret)
			break;

		if (!initial) {
			if (max_stale > 64)
-				bch2_btree_node_rewrite(c, iter,
-						b->data->keys.seq,
+				bch2_btree_node_rewrite(trans, &iter, b,
						BTREE_INSERT_NOWAIT|
						BTREE_INSERT_GC_LOCK_HELD);
			else if (!bch2_btree_gc_rewrite_disabled &&
				 (bch2_btree_gc_always_rewrite || max_stale > 16))
-				bch2_btree_node_rewrite(c, iter,
-						b->data->keys.seq,
-						BTREE_INSERT_NOWAIT|
+				bch2_btree_node_rewrite(trans, &iter,
						b, BTREE_INSERT_NOWAIT|
						BTREE_INSERT_GC_LOCK_HELD);
		}
-
-		bch2_trans_cond_resched(&trans);
	}
-	bch2_trans_iter_put(&trans, iter);
+	bch2_trans_iter_exit(trans, &iter);

-	ret = bch2_trans_exit(&trans) ?: ret;
	if (ret)
		return ret;
@@ -754,7 +852,7 @@ static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id,
	if (!btree_node_fake(b)) {
		struct bkey_s_c k = bkey_i_to_s_c(&b->key);

-		ret = bch2_gc_mark_key(c, b->c.btree_id, b->c.level, true,
+		ret = bch2_gc_mark_key(trans, b->c.btree_id, b->c.level, true,
				       &k, &max_stale, initial);
	}
	gc_pos_set(c, gc_pos_btree_root(b->c.btree_id));
@@ -763,9 +861,10 @@ static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id,
	return ret;
}

-static int bch2_gc_btree_init_recurse(struct bch_fs *c, struct btree *b,
+static int bch2_gc_btree_init_recurse(struct btree_trans *trans, struct btree *b,
				      unsigned target_depth)
 {
+	struct bch_fs *c = trans->c;
	struct btree_and_journal_iter iter;
	struct bkey_s_c k;
	struct bkey_buf cur, prev;
@@ -782,7 +881,7 @@ static int bch2_gc_btree_init_recurse(struct bch_fs *c, struct btree *b,
		BUG_ON(bpos_cmp(k.k->p, b->data->min_key) < 0);
		BUG_ON(bpos_cmp(k.k->p, b->data->max_key) > 0);

-		ret = bch2_gc_mark_key(c, b->c.btree_id, b->c.level, false,
+		ret = bch2_gc_mark_key(trans, b->c.btree_id, b->c.level, false,
				       &k, &max_stale, true);
		if (ret) {
			bch_err(c, "%s: error %i from bch2_gc_mark_key", __func__, ret);
@@ -823,11 +922,16 @@ static int bch2_gc_btree_init_recurse(struct bch_fs *c, struct btree *b,
			if (ret == -EIO) {
				bch2_topology_error(c);

-				if (fsck_err(c, "Unreadable btree node at btree %s level %u:\n"
-					     "  %s",
-					     bch2_btree_ids[b->c.btree_id],
-					     b->c.level - 1,
-					     (bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(cur.k)), buf))) {
+				if (__fsck_err(c,
+					       FSCK_CAN_FIX|
+					       FSCK_CAN_IGNORE|
+					       FSCK_NO_RATELIMIT,
+					       "Unreadable btree node at btree %s level %u:\n"
+					       "  %s",
+					       bch2_btree_ids[b->c.btree_id],
+					       b->c.level - 1,
+					       (bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(cur.k)), buf)) &&
+				    !test_bit(BCH_FS_TOPOLOGY_REPAIR_DONE, &c->flags)) {
					ret = FSCK_ERR_START_TOPOLOGY_REPAIR;
					bch_info(c, "Halting mark and sweep to start topology repair pass");
					goto fsck_err;
@@ -844,7 +948,7 @@ static int bch2_gc_btree_init_recurse(struct bch_fs *c, struct btree *b,
				break;
			}

-			ret = bch2_gc_btree_init_recurse(c, child,
+			ret = bch2_gc_btree_init_recurse(trans, child,
							 target_depth);
			six_unlock_read(&child->c.lock);

@@ -859,10 +963,11 @@ fsck_err:
	return ret;
}
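Note how the __fsck_err prompts above are now gated on BCH_FS_TOPOLOGY_REPAIR_DONE, so a repair pass that fails to fix the tree cannot be requested in an endless loop. A compressed userspace sketch of that run-once gating, using C11 atomics in place of the kernel's test_bit/set_bit (in the real code the flag is set after the repair pass completes, not inside the error handler):

#include <stdatomic.h>
#include <stdio.h>

/* Run-once gating: the expensive repair pass is requested at most once;
 * after that, the same error condition degrades to "log and continue".
 */
#define FLAG_REPAIR_DONE (1u << 0)

static atomic_uint fs_flags;

static int on_topology_error(void)
{
	if (!(atomic_load(&fs_flags) & FLAG_REPAIR_DONE)) {
		/* in the kernel this would be set_bit() after repair runs */
		atomic_fetch_or(&fs_flags, FLAG_REPAIR_DONE);
		puts("halting to run topology repair pass");
		return 1;   /* caller restarts gc */
	}
	puts("repair already attempted; continuing");
	return 0;
}

int main(void)
{
	on_topology_error();   /* first hit: start repair */
	on_topology_error();   /* second hit: keep going  */
	return 0;
}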
-static int bch2_gc_btree_init(struct bch_fs *c,
+static int bch2_gc_btree_init(struct btree_trans *trans,
			      enum btree_id btree_id,
			      bool metadata_only)
 {
+	struct bch_fs *c = trans->c;
	struct btree *b;
	unsigned target_depth = metadata_only		? 1
		: bch2_expensive_debug_checks		? 0
@@ -886,7 +991,7 @@ static int bch2_gc_btree_init(struct bch_fs *c,
		goto fsck_err;
	}

-	if (mustfix_fsck_err_on(bpos_cmp(b->data->max_key, POS_MAX), c,
+	if (mustfix_fsck_err_on(bpos_cmp(b->data->max_key, SPOS_MAX), c,
			"btree root with incorrect max_key: %s",
			(bch2_bpos_to_text(&PBUF(buf), b->data->max_key), buf))) {
		bch_err(c, "repair unimplemented");
@@ -895,12 +1000,12 @@ static int bch2_gc_btree_init(struct bch_fs *c,
	}

	if (b->c.level >= target_depth)
-		ret = bch2_gc_btree_init_recurse(c, b, target_depth);
+		ret = bch2_gc_btree_init_recurse(trans, b, target_depth);

	if (!ret) {
		struct bkey_s_c k = bkey_i_to_s_c(&b->key);

-		ret = bch2_gc_mark_key(c, b->c.btree_id, b->c.level, true,
+		ret = bch2_gc_mark_key(trans, b->c.btree_id, b->c.level, true,
				       &k, &max_stale, true);
	}
fsck_err:
@@ -919,21 +1024,26 @@ static inline int btree_id_gc_phase_cmp(enum btree_id l, enum btree_id r)

 static int bch2_gc_btrees(struct bch_fs *c, bool initial, bool metadata_only)
 {
+	struct btree_trans trans;
	enum btree_id ids[BTREE_ID_NR];
	unsigned i;
	int ret = 0;

+	bch2_trans_init(&trans, c, 0, 0);
+
	for (i = 0; i < BTREE_ID_NR; i++)
		ids[i] = i;
	bubble_sort(ids, BTREE_ID_NR, btree_id_gc_phase_cmp);

	for (i = 0; i < BTREE_ID_NR && !ret; i++)
		ret = initial
-			? bch2_gc_btree_init(c, ids[i], metadata_only)
-			: bch2_gc_btree(c, ids[i], initial, metadata_only);
+			? bch2_gc_btree_init(&trans, ids[i], metadata_only)
+			: bch2_gc_btree(&trans, ids[i], initial, metadata_only);

	if (ret < 0)
		bch_err(c, "%s: ret %i", __func__, ret);
+
+	bch2_trans_exit(&trans);
	return ret;
}
@@ -1020,9 +1130,7 @@ static void bch2_mark_pending_btree_node_frees(struct bch_fs *c)
	for_each_pending_btree_node_free(c, as, d)
		if (d->index_update_done)
-			bch2_mark_key(c, bkey_i_to_s_c(&d->key),
-				      0, 0, NULL, 0,
-				      BTREE_TRIGGER_GC);
+			bch2_mark_key(c, bkey_i_to_s_c(&d->key), BTREE_TRIGGER_GC);

	mutex_unlock(&c->btree_interior_update_lock);
}
@@ -1283,8 +1391,10 @@ static int bch2_gc_start(struct bch_fs *c,
	return 0;
}

-static int bch2_gc_reflink_done_initial_fn(struct bch_fs *c, struct bkey_s_c k)
+static int bch2_gc_reflink_done_initial_fn(struct btree_trans *trans,
+					   struct bkey_s_c k)
 {
+	struct bch_fs *c = trans->c;
	struct reflink_gc *r;
	const __le64 *refcount = bkey_refcount_c(k);
	char buf[200];
@@ -1339,7 +1449,7 @@ static int bch2_gc_reflink_done(struct bch_fs *c, bool initial,
				bool metadata_only)
 {
	struct btree_trans trans;
-	struct btree_iter *iter;
+	struct btree_iter iter;
	struct bkey_s_c k;
	struct reflink_gc *r;
	size_t idx = 0;
@@ -1349,16 +1459,16 @@ static int bch2_gc_reflink_done(struct bch_fs *c, bool initial,
	if (metadata_only)
		return 0;

+	bch2_trans_init(&trans, c, 0, 0);
+
	if (initial) {
		c->reflink_gc_idx = 0;

-		ret = bch2_btree_and_journal_walk(c, BTREE_ID_reflink,
+		ret = bch2_btree_and_journal_walk(&trans, BTREE_ID_reflink,
				bch2_gc_reflink_done_initial_fn);
		goto out;
	}

-	bch2_trans_init(&trans, c, 0, 0);
-
	for_each_btree_key(&trans, iter, BTREE_ID_reflink, POS_MIN,
			   BTREE_ITER_PREFETCH, k, ret) {
		const __le64 *refcount = bkey_refcount_c(k);
@@ -1405,17 +1515,19 @@ static int bch2_gc_reflink_done(struct bch_fs *c, bool initial,
		}
	}
fsck_err:
-	bch2_trans_iter_put(&trans, iter);
-	bch2_trans_exit(&trans);
+	bch2_trans_iter_exit(&trans, &iter);
out:
	genradix_free(&c->reflink_gc_table);
	c->reflink_gc_nr = 0;
+	bch2_trans_exit(&trans);
	return ret;
}
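bch2_gc_btrees orders the btrees with bubble_sort() and the btree_id_gc_phase_cmp comparator so that ids belonging to an earlier gc phase are walked first. A self-contained approximation of that pattern; gc_phase() here is a made-up stand-in for the real phase mapping:

#include <stdio.h>

/* Sorting btree ids by gc phase with a -1/0/1 comparator. */
static int gc_phase(int id)
{
	return id == 2 ? 0 : 1;   /* pretend btree 2 must be walked first */
}

static int btree_id_gc_phase_cmp(int l, int r)
{
	return (gc_phase(l) > gc_phase(r)) - (gc_phase(l) < gc_phase(r));
}

static void bubble_sort(int *a, int n, int (*cmp)(int, int))
{
	for (int i = 0; i < n; i++)
		for (int j = 0; j + 1 < n - i; j++)
			if (cmp(a[j], a[j + 1]) > 0) {
				int tmp = a[j];
				a[j] = a[j + 1];
				a[j + 1] = tmp;
			}
}

int main(void)
{
	int ids[] = { 0, 1, 2, 3 };

	bubble_sort(ids, 4, btree_id_gc_phase_cmp);
	for (int i = 0; i < 4; i++)
		printf("%d ", ids[i]);   /* 2 first; ties keep their order */
	printf("\n");
	return 0;
}

Bubble sort is a stable sort, which is why a phase comparator that returns 0 for same-phase ids leaves their relative order untouched.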
-static int bch2_gc_reflink_start_initial_fn(struct bch_fs *c, struct bkey_s_c k)
+static int bch2_gc_reflink_start_initial_fn(struct btree_trans *trans,
+					    struct bkey_s_c k)
 {
+	struct bch_fs *c = trans->c;
	struct reflink_gc *r;
	const __le64 *refcount = bkey_refcount_c(k);
@@ -1437,22 +1549,23 @@ static int bch2_gc_reflink_start(struct bch_fs *c, bool initial,
				 bool metadata_only)
 {
	struct btree_trans trans;
-	struct btree_iter *iter;
+	struct btree_iter iter;
	struct bkey_s_c k;
	struct reflink_gc *r;
-	int ret;
+	int ret = 0;

	if (metadata_only)
		return 0;

+	bch2_trans_init(&trans, c, 0, 0);
	genradix_free(&c->reflink_gc_table);
	c->reflink_gc_nr = 0;

-	if (initial)
-		return bch2_btree_and_journal_walk(c, BTREE_ID_reflink,
-				bch2_gc_reflink_start_initial_fn);
-
-	bch2_trans_init(&trans, c, 0, 0);
+	if (initial) {
+		ret = bch2_btree_and_journal_walk(&trans, BTREE_ID_reflink,
+				bch2_gc_reflink_start_initial_fn);
+		goto out;
+	}

	for_each_btree_key(&trans, iter, BTREE_ID_reflink, POS_MIN,
			   BTREE_ITER_PREFETCH, k, ret) {
@@ -1472,10 +1585,10 @@ static int bch2_gc_reflink_start(struct bch_fs *c, bool initial,
		r->size	= k.k->size;
		r->refcount = 0;
	}
-	bch2_trans_iter_put(&trans, iter);
-
+	bch2_trans_iter_exit(&trans, &iter);
+out:
	bch2_trans_exit(&trans);
-	return 0;
+	return ret;
}

/**
@@ -1519,7 +1632,7 @@ again:

	bch2_mark_superblocks(c);

-	if (test_bit(BCH_FS_TOPOLOGY_ERROR, &c->flags) &&
+	if (BCH_SB_HAS_TOPOLOGY_ERRORS(c->disk_sb.sb) &&
	    !test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags) &&
	    c->opts.fix_errors != FSCK_OPT_NO) {
		bch_info(c, "starting topology repair pass");
@@ -1527,11 +1640,14 @@ again:
		if (ret)
			goto out;
		bch_info(c, "topology repair pass done");
+
+		set_bit(BCH_FS_TOPOLOGY_REPAIR_DONE, &c->flags);
	}

	ret = bch2_gc_btrees(c, initial, metadata_only);

	if (ret == FSCK_ERR_START_TOPOLOGY_REPAIR &&
+	    !test_bit(BCH_FS_TOPOLOGY_REPAIR_DONE, &c->flags) &&
	    !test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags)) {
		set_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags);
		ret = 0;
@@ -1644,7 +1760,7 @@ static bool gc_btree_gens_key(struct bch_fs *c, struct bkey_s_c k)
 static int bch2_gc_btree_gens(struct bch_fs *c, enum btree_id btree_id)
 {
	struct btree_trans trans;
-	struct btree_iter *iter;
+	struct btree_iter iter;
	struct bkey_s_c k;
	struct bkey_buf sk;
	int ret = 0, commit_err = 0;
@@ -1652,22 +1768,28 @@ static int bch2_gc_btree_gens(struct bch_fs *c, enum btree_id btree_id)
	bch2_bkey_buf_init(&sk);
	bch2_trans_init(&trans, c, 0, 0);

-	iter = bch2_trans_get_iter(&trans, btree_id, POS_MIN,
-				   BTREE_ITER_PREFETCH|
-				   BTREE_ITER_NOT_EXTENTS|
-				   BTREE_ITER_ALL_SNAPSHOTS);
+	bch2_trans_iter_init(&trans, &iter, btree_id, POS_MIN,
+			     BTREE_ITER_PREFETCH|
+			     BTREE_ITER_NOT_EXTENTS|
+			     BTREE_ITER_ALL_SNAPSHOTS);

-	while ((k = bch2_btree_iter_peek(iter)).k &&
-	       !(ret = bkey_err(k))) {
-		c->gc_gens_pos = iter->pos;
+	while ((bch2_trans_begin(&trans),
+		k = bch2_btree_iter_peek(&iter)).k) {
+		ret = bkey_err(k);
+
+		if (ret == -EINTR)
+			continue;
+		if (ret)
+			break;
+
+		c->gc_gens_pos = iter.pos;

		if (gc_btree_gens_key(c, k) && !commit_err) {
			bch2_bkey_buf_reassemble(&sk, c, k);
			bch2_extent_normalize(c, bkey_i_to_s(sk.k));

			commit_err =
-				bch2_trans_update(&trans, iter, sk.k, 0) ?:
+				bch2_trans_update(&trans, &iter, sk.k, 0) ?:
				bch2_trans_commit(&trans, NULL, NULL,
						  BTREE_INSERT_NOWAIT|
						  BTREE_INSERT_NOFAIL);
@@ -1677,9 +1799,9 @@ static int bch2_gc_btree_gens(struct bch_fs *c, enum btree_id btree_id)
			}
		}

-		bch2_btree_iter_advance(iter);
+		bch2_btree_iter_advance(&iter);
	}
-	bch2_trans_iter_put(&trans, iter);
+	bch2_trans_iter_exit(&trans, &iter);

	bch2_trans_exit(&trans);
	bch2_bkey_buf_exit(&sk, c);
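The gc_gens walk above now begins every iteration with bch2_trans_begin() and treats -EINTR as "restart at the current position" rather than as an error — the shape every restartable btree loop takes after this merge. Stripped to its control flow, with hypothetical peek/advance helpers standing in for the iterator API:

#include <errno.h>
#include <stdio.h>

/* Control-flow sketch of restartable iteration: a transaction restart
 * surfaces as -EINTR from peek(), and the loop retries from the current
 * position instead of bailing out.
 */
static int fake_peek(int pos, int *restarts)
{
	if (pos == 2 && (*restarts)++ < 1)
		return -EINTR;           /* simulate one restart at pos 2 */
	return pos < 5 ? pos : -1;       /* -1: iteration done */
}

int main(void)
{
	int pos = 0, restarts = 0;

	while (1) {
		/* trans_begin() would go here: drop and retake locks */
		int k = fake_peek(pos, &restarts);

		if (k == -EINTR)
			continue;        /* restart; do not advance */
		if (k < 0)
			break;

		printf("visit %d\n", k);
		pos++;                   /* advance only on success */
	}
	return 0;
}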
diff --git a/fs/bcachefs/btree_gc.h b/fs/bcachefs/btree_gc.h
index e9a87394370a..59dfb069e699 100644
--- a/fs/bcachefs/btree_gc.h
+++ b/fs/bcachefs/btree_gc.h
@@ -87,7 +87,7 @@ static inline struct gc_pos gc_pos_btree_node(struct btree *b)
 */
 static inline struct gc_pos gc_pos_btree_root(enum btree_id id)
 {
-	return gc_pos_btree(id, POS_MAX, BTREE_MAX_DEPTH);
+	return gc_pos_btree(id, SPOS_MAX, BTREE_MAX_DEPTH);
 }

 static inline bool gc_visited(struct bch_fs *c, struct gc_pos pos)
diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c
index 47cfd8a08f91..f11fcab61902 100644
--- a/fs/bcachefs/btree_io.c
+++ b/fs/bcachefs/btree_io.c
@@ -22,6 +22,51 @@
 #include <linux/sched/mm.h>
 #include <trace/events/bcachefs.h>

+void bch2_btree_node_io_unlock(struct btree *b)
+{
+	EBUG_ON(!btree_node_write_in_flight(b));
+
+	clear_btree_node_write_in_flight_inner(b);
+	clear_btree_node_write_in_flight(b);
+	wake_up_bit(&b->flags, BTREE_NODE_write_in_flight);
+}
+
+void bch2_btree_node_io_lock(struct btree *b)
+{
+	BUG_ON(lock_class_is_held(&bch2_btree_node_lock_key));
+
+	wait_on_bit_lock_io(&b->flags, BTREE_NODE_write_in_flight,
+			    TASK_UNINTERRUPTIBLE);
+}
+
+void __bch2_btree_node_wait_on_read(struct btree *b)
+{
+	wait_on_bit_io(&b->flags, BTREE_NODE_read_in_flight,
+		       TASK_UNINTERRUPTIBLE);
+}
+
+void __bch2_btree_node_wait_on_write(struct btree *b)
+{
+	wait_on_bit_io(&b->flags, BTREE_NODE_write_in_flight,
+		       TASK_UNINTERRUPTIBLE);
+}
+
+void bch2_btree_node_wait_on_read(struct btree *b)
+{
+	BUG_ON(lock_class_is_held(&bch2_btree_node_lock_key));
+
+	wait_on_bit_io(&b->flags, BTREE_NODE_read_in_flight,
+		       TASK_UNINTERRUPTIBLE);
+}
+
+void bch2_btree_node_wait_on_write(struct btree *b)
+{
+	BUG_ON(lock_class_is_held(&bch2_btree_node_lock_key));
+
+	wait_on_bit_io(&b->flags, BTREE_NODE_write_in_flight,
+		       TASK_UNINTERRUPTIBLE);
+}
+
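The helpers added above wrap wait_on_bit_io()/wake_up_bit() so callers can block on BTREE_NODE_read_in_flight or write_in_flight without open-coding the bit names, and the non-underscore variants assert that no btree node lock is held — waiting on IO while holding node locks is precisely the deadlock this merge is untangling. A portable approximation of the wait/wake pairing, using a mutex and condvar where the kernel keys a waitqueue off the bit address:

#include <pthread.h>
#include <stdio.h>

/* Userspace model of wait_on_bit_io()/wake_up_bit(): waiters sleep until
 * the flag bit clears; whoever clears it broadcasts.
 */
#define NODE_WRITE_IN_FLIGHT (1u << 0)

struct node {
	unsigned	flags;
	pthread_mutex_t	lock;
	pthread_cond_t	flag_cleared;
};

static void node_wait_on_write(struct node *n)
{
	pthread_mutex_lock(&n->lock);
	while (n->flags & NODE_WRITE_IN_FLIGHT)
		pthread_cond_wait(&n->flag_cleared, &n->lock);
	pthread_mutex_unlock(&n->lock);
}

static void node_write_done(struct node *n)
{
	pthread_mutex_lock(&n->lock);
	n->flags &= ~NODE_WRITE_IN_FLIGHT;
	pthread_cond_broadcast(&n->flag_cleared);   /* wake_up_bit() */
	pthread_mutex_unlock(&n->lock);
}

int main(void)
{
	struct node n = {
		.flags		= 0,
		.lock		= PTHREAD_MUTEX_INITIALIZER,
		.flag_cleared	= PTHREAD_COND_INITIALIZER,
	};

	node_write_done(&n);     /* no-op: bit already clear */
	node_wait_on_write(&n);  /* returns immediately */
	puts("done");
	return 0;
}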
 static void verify_no_dups(struct btree *b,
			   struct bkey_packed *start,
			   struct bkey_packed *end)
@@ -420,17 +465,17 @@ void bch2_btree_build_aux_trees(struct btree *b)
 *
 * Returns true if we sorted (i.e. invalidated iterators)
 */
-void bch2_btree_init_next(struct bch_fs *c, struct btree *b,
-			  struct btree_iter *iter)
+void bch2_btree_init_next(struct btree_trans *trans, struct btree *b)
 {
+	struct bch_fs *c = trans->c;
	struct btree_node_entry *bne;
	bool reinit_iter = false;

	EBUG_ON(!(b->c.lock.state.seq & 1));
-	EBUG_ON(iter && iter->l[b->c.level].b != b);
	BUG_ON(bset_written(b, bset(b, &b->set[1])));

-	if (b->nsets == MAX_BSETS) {
+	if (b->nsets == MAX_BSETS &&
+	    !btree_node_write_in_flight(b)) {
		unsigned log_u64s[] = {
			ilog2(bset_u64s(&b->set[0])),
			ilog2(bset_u64s(&b->set[1])),
@@ -455,8 +500,8 @@ void bch2_btree_init_next(struct bch_fs *c, struct btree *b,

	bch2_btree_build_aux_trees(b);

-	if (iter && reinit_iter)
-		bch2_btree_iter_reinit_node(iter, b);
+	if (reinit_iter)
+		bch2_trans_node_reinit_iter(trans, b);
 }

 static void btree_pos_to_text(struct printbuf *out, struct bch_fs *c,
@@ -565,13 +610,16 @@ out:							\
 void bch2_btree_node_drop_keys_outside_node(struct btree *b)
 {
	struct bset_tree *t;
+	struct bkey_s_c k;
+	struct bkey unpacked;
+	struct btree_node_iter iter;

	for_each_bset(b, t) {
		struct bset *i = bset(b, t);
		struct bkey_packed *k;

		for (k = i->start; k != vstruct_last(i); k = bkey_next(k))
-			if (bkey_cmp_left_packed(b, k, &b->data->min_key) < 0)
+			if (bkey_cmp_left_packed(b, k, &b->data->min_key) >= 0)
				break;

		if (k != i->start) {
@@ -596,11 +644,17 @@ void bch2_btree_node_drop_keys_outside_node(struct btree *b)
	}

	bch2_btree_build_aux_trees(b);
+
+	for_each_btree_node_key_unpack(b, k, &iter, &unpacked) {
+		BUG_ON(bpos_cmp(k.k->p, b->data->min_key) < 0);
+		BUG_ON(bpos_cmp(k.k->p, b->data->max_key) > 0);
+	}
 }

 static int validate_bset(struct bch_fs *c, struct bch_dev *ca,
			 struct btree *b, struct bset *i,
-			 unsigned sectors, int write, bool have_retry)
+			 unsigned offset, unsigned sectors,
+			 int write, bool have_retry)
 {
	unsigned version = le16_to_cpu(i->version);
	const char *err;
@@ -638,18 +692,23 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca,
		     BTREE_ERR_FATAL, c, ca, b, i,
		     "BSET_SEPARATE_WHITEOUTS no longer supported");

-	if (btree_err_on(b->written + sectors > c->opts.btree_node_size,
+	if (btree_err_on(offset + sectors > c->opts.btree_node_size,
			 BTREE_ERR_FIXABLE, c, ca, b, i,
			 "bset past end of btree node")) {
		i->u64s = 0;
		return 0;
	}

-	btree_err_on(b->written && !i->u64s,
+	btree_err_on(offset && !i->u64s,
		     BTREE_ERR_FIXABLE, c, ca, b, i,
		     "empty bset");

+	btree_err_on(BSET_OFFSET(i) &&
+		     BSET_OFFSET(i) != offset,
+		     BTREE_ERR_WANT_RETRY, c, ca, b, i,
+		     "bset at wrong sector offset");
+
-	if (!b->written) {
+	if (!offset) {
		struct btree_node *bn =
			container_of(i, struct btree_node, keys);

		/* These indicate that we read the wrong btree node: */
@@ -815,7 +874,8 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
	bool updated_range = b->key.k.type == KEY_TYPE_btree_ptr_v2 &&
		BTREE_PTR_RANGE_UPDATED(&bkey_i_to_btree_ptr_v2(&b->key)->v);
	unsigned u64s;
-	unsigned nonblacklisted_written = 0;
+	unsigned blacklisted_written, nonblacklisted_written = 0;
+	unsigned ptr_written = btree_ptr_sectors_written(&b->key);
	int ret, retry_read = 0, write = READ;

	b->version_ondisk = U16_MAX;
@@ -846,7 +906,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
			     b->data->keys.seq, bp->seq);
	}

-	while (b->written < c->opts.btree_node_size) {
+	while (b->written < (ptr_written ?: c->opts.btree_node_size)) {
		unsigned sectors, whiteout_u64s = 0;
		struct nonce nonce;
		struct bch_csum csum;
@@ -902,7 +962,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
		b->version_ondisk = min(b->version_ondisk,
					le16_to_cpu(i->version));

-		ret = validate_bset(c, ca, b, i, sectors,
+		ret = validate_bset(c, ca, b, i, b->written, sectors,
				    READ, have_retry);
		if (ret)
			goto fsck_err;
@@ -926,6 +986,10 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
		btree_err_on(blacklisted && first,
			     BTREE_ERR_FIXABLE, c, ca, b, i,
			     "first btree node bset has blacklisted journal seq");
+
+		btree_err_on(blacklisted && ptr_written,
+			     BTREE_ERR_FIXABLE, c, ca, b, i,
+			     "found blacklisted bset in btree node with sectors_written");
		if (blacklisted && !first)
			continue;
@@ -939,26 +1003,34 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
		nonblacklisted_written = b->written;
	}

-	for (bne = write_block(b);
-	     bset_byte_offset(b, bne) < btree_bytes(c);
-	     bne = (void *) bne + block_bytes(c))
-		btree_err_on(bne->keys.seq == b->data->keys.seq &&
-			     !bch2_journal_seq_is_blacklisted(c,
-							      le64_to_cpu(bne->keys.journal_seq),
-							      true),
+	if (ptr_written) {
+		btree_err_on(b->written < ptr_written,
			     BTREE_ERR_WANT_RETRY, c, ca, b, NULL,
-			     "found bset signature after last bset");
+			     "btree node data missing: expected %u sectors, found %u",
+			     ptr_written, b->written);
+	} else {
+		for (bne = write_block(b);
+		     bset_byte_offset(b, bne) < btree_bytes(c);
+		     bne = (void *) bne + block_bytes(c))
+			btree_err_on(bne->keys.seq == b->data->keys.seq &&
+				     !bch2_journal_seq_is_blacklisted(c,
+								      le64_to_cpu(bne->keys.journal_seq),
+								      true),
+				     BTREE_ERR_WANT_RETRY, c, ca, b, NULL,
+				     "found bset signature after last bset");

-	/*
-	 * Blacklisted bsets are those that were written after the most recent
-	 * (flush) journal write. Since there wasn't a flush, they may not have
-	 * made it to all devices - which means we shouldn't write new bsets
-	 * after them, as that could leave a gap and then reads from that device
-	 * wouldn't find all the bsets in that btree node - which means it's
-	 * important that we start writing new bsets after the most recent _non_
-	 * blacklisted bset:
-	 */
-	b->written = nonblacklisted_written;
+		/*
+		 * Blacklisted bsets are those that were written after the most recent
+		 * (flush) journal write. Since there wasn't a flush, they may not have
+		 * made it to all devices - which means we shouldn't write new bsets
+		 * after them, as that could leave a gap and then reads from that device
+		 * wouldn't find all the bsets in that btree node - which means it's
+		 * important that we start writing new bsets after the most recent _non_
+		 * blacklisted bset:
+		 */
+		blacklisted_written = b->written;
+		b->written = nonblacklisted_written;
+	}

	sorted = btree_bounce_alloc(c, btree_bytes(c), &used_mempool);
	sorted->keys.u64s = 0;
@@ -1026,6 +1098,9 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
		if (ca->mi.state != BCH_MEMBER_STATE_rw)
			set_btree_node_need_rewrite(b);
	}
+
+	if (!ptr_written)
+		set_btree_node_need_rewrite(b);
out:
	mempool_free(iter, &c->fill_iter);
	return retry_read;
@@ -1179,31 +1254,27 @@ static void btree_node_read_all_replicas_done(struct closure *cl)
		container_of(cl, struct btree_node_read_all, cl);
	struct bch_fs *c = ra->c;
	struct btree *b = ra->b;
-	bool have_good_copy = false;
	bool dump_bset_maps = false;
	bool have_retry = false;
-	int ret = 0, write = READ;
-	unsigned i, written, written2;
+	int ret = 0, best = -1, write = READ;
+	unsigned i, written = 0, written2 = 0;
	__le64 seq = b->key.k.type == KEY_TYPE_btree_ptr_v2
		? bkey_i_to_btree_ptr_v2(&b->key)->v.seq : 0;

	for (i = 0; i < ra->nr; i++) {
+		struct btree_node *bn = ra->buf[i];
+
		if (ra->err[i])
			continue;

-		if (!have_good_copy) {
-			memcpy(b->data, ra->buf[i], btree_bytes(c));
-			have_good_copy = true;
-			written = btree_node_sectors_written(c, b->data);
-		}
+		if (le64_to_cpu(bn->magic) != bset_magic(c) ||
+		    (seq && seq != bn->keys.seq))
+			continue;

-		/* Try to get the right btree node: */
-		if (have_good_copy &&
-		    seq &&
-		    b->data->keys.seq != seq &&
-		    ((struct btree_node *) ra->buf[i])->keys.seq == seq) {
-			memcpy(b->data, ra->buf[i], btree_bytes(c));
-			written = btree_node_sectors_written(c, b->data);
+		if (best < 0) {
+			best = i;
+			written = btree_node_sectors_written(c, bn);
+			continue;
		}

		written2 = btree_node_sectors_written(c, ra->buf[i]);
@@ -1213,14 +1284,14 @@ static void btree_node_read_all_replicas_done(struct closure *cl)
		    btree_err_on(btree_node_has_extra_bsets(c, written2, ra->buf[i]),
				 BTREE_ERR_FIXABLE, c, NULL, b, NULL,
				 "found bset signature after last bset") ||
-		    btree_err_on(memcmp(b->data, ra->buf[i], written << 9),
+		    btree_err_on(memcmp(ra->buf[best], ra->buf[i], written << 9),
				 BTREE_ERR_FIXABLE, c, NULL, b, NULL,
				 "btree node replicas content mismatch"))
			dump_bset_maps = true;

		if (written2 > written) {
			written = written2;
-			memcpy(b->data, ra->buf[i], btree_bytes(c));
+			best = i;
		}
	}
fsck_err:
@@ -1273,9 +1344,14 @@ fsck_err:
		}
	}

-	if (have_good_copy)
-		bch2_btree_node_read_done(c, NULL, b, false);
-	else
+	if (best >= 0) {
+		memcpy(b->data, ra->buf[best], btree_bytes(c));
+		ret = bch2_btree_node_read_done(c, NULL, b, false);
+	} else {
+		ret = -1;
+	}
+
+	if (ret)
		set_btree_node_read_error(b);

	for (i = 0; i < ra->nr; i++) {
@@ -1390,8 +1466,6 @@ void bch2_btree_node_read(struct bch_fs *c, struct btree *b,
	btree_pos_to_text(&PBUF(buf), c, b);
	trace_btree_read(c, b);

-	set_btree_node_read_in_flight(b);
-
	if (bch2_verify_all_btree_replicas &&
	    !btree_node_read_all_replicas(c, b, sync))
		return;
@@ -1467,6 +1541,8 @@ int bch2_btree_root_read(struct bch_fs *c, enum btree_id id,
	bkey_copy(&b->key, k);
	BUG_ON(bch2_btree_node_hash_insert(&c->btree_cache, b, level, id));

+	set_btree_node_read_in_flight(b);
+
	bch2_btree_node_read(c, b, true);

	if (btree_node_read_error(b)) {
@@ -1510,83 +1586,50 @@ void bch2_btree_complete_write(struct bch_fs *c, struct btree *b,
 static void btree_node_write_done(struct bch_fs *c, struct btree *b)
 {
	struct btree_write *w = btree_prev_write(b);
+	unsigned long old, new, v;

	bch2_btree_complete_write(c, b, w);
-	btree_node_io_unlock(b);
-}
-
-static void bch2_btree_node_write_error(struct bch_fs *c,
-					struct btree_write_bio *wbio)
-{
-	struct btree *b = wbio->wbio.bio.bi_private;
-	struct bkey_buf k;
-	struct bch_extent_ptr *ptr;
-	struct btree_trans trans;
-	struct btree_iter *iter;
-	int ret;
-
-	bch2_bkey_buf_init(&k);
-	bch2_trans_init(&trans, c, 0, 0);
-
-	iter = bch2_trans_get_node_iter(&trans, b->c.btree_id, b->key.k.p,
-					BTREE_MAX_DEPTH, b->c.level, 0);
-retry:
-	ret = bch2_btree_iter_traverse(iter);
-	if (ret)
-		goto err;
-
-	/* has node been freed? */
-	if (iter->l[b->c.level].b != b) {
-		/* node has been freed: */
-		BUG_ON(!btree_node_dying(b));
-		goto out;
-	}
-
-	BUG_ON(!btree_node_hashed(b));
-
-	bch2_bkey_buf_copy(&k, c, &b->key);
+	v = READ_ONCE(b->flags);
+	do {
+		old = new = v;

-	bch2_bkey_drop_ptrs(bkey_i_to_s(k.k), ptr,
-		bch2_dev_list_has_dev(wbio->wbio.failed, ptr->dev));
+		if (old & (1U << BTREE_NODE_need_write))
+			goto do_write;

-	if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(k.k)))
-		goto err;
+		new &= ~(1U << BTREE_NODE_write_in_flight);
+		new &= ~(1U << BTREE_NODE_write_in_flight_inner);
+	} while ((v = cmpxchg(&b->flags, old, new)) != old);

-	ret = bch2_btree_node_update_key(c, iter, b, k.k);
-	if (ret == -EINTR)
-		goto retry;
-	if (ret)
-		goto err;
-out:
-	bch2_trans_iter_put(&trans, iter);
-	bch2_trans_exit(&trans);
-	bch2_bkey_buf_exit(&k, c);
-	bio_put(&wbio->wbio.bio);
-	btree_node_write_done(c, b);
+	wake_up_bit(&b->flags, BTREE_NODE_write_in_flight);
	return;
-err:
-	set_btree_node_noevict(b);
-	bch2_fs_fatal_error(c, "fatal error writing btree node");
-	goto out;
-}

-void bch2_btree_write_error_work(struct work_struct *work)
-{
-	struct bch_fs *c = container_of(work, struct bch_fs,
-					btree_write_error_work);
-	struct bio *bio;
+do_write:
+	six_lock_read(&b->c.lock, NULL, NULL);
+	v = READ_ONCE(b->flags);
+	do {
+		old = new = v;

-	while (1) {
-		spin_lock_irq(&c->btree_write_error_lock);
-		bio = bio_list_pop(&c->btree_write_error_list);
-		spin_unlock_irq(&c->btree_write_error_lock);
+		if ((old & (1U << BTREE_NODE_dirty)) &&
+		    (old & (1U << BTREE_NODE_need_write)) &&
+		    !(old & (1U << BTREE_NODE_never_write)) &&
+		    btree_node_may_write(b)) {
+			new &= ~(1U << BTREE_NODE_dirty);
+			new &= ~(1U << BTREE_NODE_need_write);
+			new |=  (1U << BTREE_NODE_write_in_flight);
+			new |=  (1U << BTREE_NODE_write_in_flight_inner);
+			new |=  (1U << BTREE_NODE_just_written);
+			new ^=  (1U << BTREE_NODE_write_idx);
+		} else {
+			new &= ~(1U << BTREE_NODE_write_in_flight);
+			new &= ~(1U << BTREE_NODE_write_in_flight_inner);
+		}
+	} while ((v = cmpxchg(&b->flags, old, new)) != old);

-		if (!bio)
-			break;
+	if (new & (1U << BTREE_NODE_write_in_flight))
+		__bch2_btree_node_write(c, b, true);

-		bch2_btree_node_write_error(c,
-			container_of(bio, struct btree_write_bio, wbio.bio));
-	}
+	six_unlock_read(&b->c.lock);
 }

 static void btree_node_write_work(struct work_struct *work)
@@ -1595,25 +1638,39 @@ static void btree_node_write_work(struct work_struct *work)
		container_of(work, struct btree_write_bio, work);
	struct bch_fs *c = wbio->wbio.c;
	struct btree *b = wbio->wbio.bio.bi_private;
+	struct bch_extent_ptr *ptr;
+	int ret;

	btree_bounce_free(c,
-		wbio->bytes,
+		wbio->data_bytes,
		wbio->wbio.used_mempool,
		wbio->data);

-	if (wbio->wbio.failed.nr) {
-		unsigned long flags;
+	bch2_bkey_drop_ptrs(bkey_i_to_s(&wbio->key), ptr,
+		bch2_dev_list_has_dev(wbio->wbio.failed, ptr->dev));

-		spin_lock_irqsave(&c->btree_write_error_lock, flags);
-		bio_list_add(&c->btree_write_error_list, &wbio->wbio.bio);
-		spin_unlock_irqrestore(&c->btree_write_error_lock, flags);
+	if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(&wbio->key)))
+		goto err;

-		queue_work(c->btree_error_wq, &c->btree_write_error_work);
-		return;
-	}
+	if (wbio->wbio.first_btree_write) {
+		if (wbio->wbio.failed.nr) {
+		}
+	} else {
+		ret = bch2_trans_do(c, NULL, NULL, 0,
+			bch2_btree_node_update_key_get_iter(&trans, b, &wbio->key,
+							    !wbio->wbio.failed.nr));
+		if (ret)
+			goto err;
+	}
+out:
	bio_put(&wbio->wbio.bio);
	btree_node_write_done(c, b);
+	return;
+err:
+	set_btree_node_noevict(b);
+	bch2_fs_fatal_error(c, "fatal error writing btree node");
+	goto out;
 }
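btree_node_write_done() above now resolves the "write finished, but was another one requested meanwhile?" race with a read-modify-cmpxchg loop on b->flags instead of bouncing through a work item. The general shape of such a loop in portable C11 atomics — note the kernel's cmpxchg() returns the old value, while compare_exchange returns a bool and rewrites the expected value in place:

#include <stdatomic.h>
#include <stdio.h>

#define NODE_NEED_WRITE		(1u << 0)
#define NODE_WRITE_IN_FLIGHT	(1u << 1)

/* Atomically clear write_in_flight -- unless need_write was set again
 * while the write was in flight, in which case tell the caller to start
 * another write. Mirrors the shape of the loop above, not its details.
 */
static int write_done(atomic_uint *flags)
{
	unsigned old = atomic_load(flags), new;

	do {
		if (old & NODE_NEED_WRITE)
			return 1;   /* caller starts the next write */
		new = old & ~NODE_WRITE_IN_FLIGHT;
	} while (!atomic_compare_exchange_weak(flags, &old, new));

	return 0;                   /* idle: wake any waiters */
}

int main(void)
{
	atomic_uint flags = NODE_WRITE_IN_FLIGHT | NODE_NEED_WRITE;

	printf("restart write: %d\n", write_done(&flags));  /* 1 */
	atomic_store(&flags, NODE_WRITE_IN_FLIGHT);
	printf("restart write: %d\n", write_done(&flags));  /* 0 */
	return 0;
}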

 static void btree_node_write_endio(struct bio *bio)
@@ -1621,7 +1678,9 @@ static void btree_node_write_endio(struct bio *bio)
	struct bch_write_bio *wbio = to_wbio(bio);
	struct bch_write_bio *parent = wbio->split ? wbio->parent : NULL;
	struct bch_write_bio *orig = parent ?: wbio;
+	struct btree_write_bio *wb = container_of(orig, struct btree_write_bio, wbio);
	struct bch_fs *c = wbio->c;
+	struct btree *b = wbio->bio.bi_private;
	struct bch_dev *ca = bch_dev_bkey_exists(c, wbio->dev);
	unsigned long flags;

@@ -1642,13 +1701,13 @@ static void btree_node_write_endio(struct bio *bio)
	if (parent) {
		bio_put(bio);
		bio_endio(&parent->bio);
-	} else {
-		struct btree_write_bio *wb =
-			container_of(orig, struct btree_write_bio, wbio);
-
-		INIT_WORK(&wb->work, btree_node_write_work);
-		queue_work(c->io_complete_wq, &wb->work);
+		return;
	}
+
+	clear_btree_node_write_in_flight_inner(b);
+	wake_up_bit(&b->flags, BTREE_NODE_write_in_flight_inner);
+	INIT_WORK(&wb->work, btree_node_write_work);
+	queue_work(c->btree_io_complete_wq, &wb->work);
 }

 static int validate_bset_for_write(struct bch_fs *c, struct btree *b,
@@ -1661,7 +1720,7 @@ static int validate_bset_for_write(struct bch_fs *c, struct btree *b,
		return -1;

	ret = validate_bset_keys(c, b, i, &whiteout_u64s, WRITE, false) ?:
-		validate_bset(c, NULL, b, i, sectors, WRITE, false);
+		validate_bset(c, NULL, b, i, b->written, sectors, WRITE, false);
	if (ret) {
		bch2_inconsistent_error(c);
		dump_stack();
@@ -1673,18 +1732,24 @@ static int validate_bset_for_write(struct bch_fs *c, struct btree *b,
 static void btree_write_submit(struct work_struct *work)
 {
	struct btree_write_bio *wbio = container_of(work, struct btree_write_bio, work);
+	struct bch_extent_ptr *ptr;
+	__BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX) tmp;

-	bch2_submit_wbio_replicas(&wbio->wbio, wbio->wbio.c, BCH_DATA_btree, &wbio->key);
+	bkey_copy(&tmp.k, &wbio->key);
+
+	bkey_for_each_ptr(bch2_bkey_ptrs(bkey_i_to_s(&tmp.k)), ptr)
+		ptr->offset += wbio->sector_offset;
+
+	bch2_submit_wbio_replicas(&wbio->wbio, wbio->wbio.c, BCH_DATA_btree, &tmp.k);
 }

-void __bch2_btree_node_write(struct bch_fs *c, struct btree *b)
+void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, bool already_started)
 {
	struct btree_write_bio *wbio;
	struct bset_tree *t;
	struct bset *i;
	struct btree_node *bn = NULL;
	struct btree_node_entry *bne = NULL;
-	struct bch_extent_ptr *ptr;
	struct sort_iter sort_iter;
	struct nonce nonce;
	unsigned bytes_to_write, sectors_to_write, bytes, u64s;
@@ -1694,6 +1759,9 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b)
	bool validate_before_checksum = false;
	void *data;

+	if (already_started)
+		goto do_write;
+
	if (test_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags))
		return;

@@ -1716,22 +1784,19 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b)
		if (old & (1 << BTREE_NODE_never_write))
			return;

-		if (old & (1 << BTREE_NODE_write_in_flight)) {
-			/*
-			 * XXX waiting on btree writes with btree locks held -
-			 * this can deadlock, and we hit the write error path
-			 */
-			btree_node_wait_on_io(b);
-			continue;
-		}
+		BUG_ON(old & (1 << BTREE_NODE_write_in_flight));

		new &= ~(1 << BTREE_NODE_dirty);
		new &= ~(1 << BTREE_NODE_need_write);
		new |=  (1 << BTREE_NODE_write_in_flight);
+		new |=  (1 << BTREE_NODE_write_in_flight_inner);
		new |=  (1 << BTREE_NODE_just_written);
		new ^=  (1 << BTREE_NODE_write_idx);
	} while (cmpxchg_acquire(&b->flags, old, new) != old);

+	if (new & (1U << BTREE_NODE_need_write))
+		return;
+do_write:
	atomic_dec(&c->btree_cache.dirty);

	BUG_ON(btree_node_fake(b));
@@ -1818,6 +1883,7 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b)
	i->version = c->sb.version < bcachefs_metadata_version_new_versioning
		? cpu_to_le16(BCH_BSET_VERSION_OLD)
		: cpu_to_le16(c->sb.version);
+	SET_BSET_OFFSET(i, b->written);
	SET_BSET_CSUM_TYPE(i, bch2_meta_checksum_type(c));

	if (bch2_csum_type_is_encryption(BSET_CSUM_TYPE(i)))
@@ -1876,37 +1942,30 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b)
			    struct btree_write_bio, wbio.bio);
	wbio_init(&wbio->wbio.bio);
	wbio->data			= data;
-	wbio->bytes			= bytes;
+	wbio->data_bytes		= bytes;
+	wbio->sector_offset		= b->written;
	wbio->wbio.c			= c;
	wbio->wbio.used_mempool		= used_mempool;
+	wbio->wbio.first_btree_write	= !b->written;
	wbio->wbio.bio.bi_opf		= REQ_OP_WRITE|REQ_META;
	wbio->wbio.bio.bi_end_io	= btree_node_write_endio;
	wbio->wbio.bio.bi_private	= b;

	bch2_bio_map(&wbio->wbio.bio, data, sectors_to_write << 9);

-	/*
-	 * If we're appending to a leaf node, we don't technically need FUA -
-	 * this write just needs to be persisted before the next journal write,
-	 * which will be marked FLUSH|FUA.
-	 *
-	 * Similarly if we're writing a new btree root - the pointer is going to
-	 * be in the next journal entry.
-	 *
-	 * But if we're writing a new btree node (that isn't a root) or
-	 * appending to a non leaf btree node, we need either FUA or a flush
-	 * when we write the parent with the new pointer. FUA is cheaper than a
-	 * flush, and writes appending to leaf nodes aren't blocking anything so
-	 * just make all btree node writes FUA to keep things sane.
-	 */
-
	bkey_copy(&wbio->key, &b->key);

-	bkey_for_each_ptr(bch2_bkey_ptrs(bkey_i_to_s(&wbio->key)), ptr)
-		ptr->offset += b->written;
-
	b->written += sectors_to_write;

+	if (wbio->wbio.first_btree_write &&
+	    b->key.k.type == KEY_TYPE_btree_ptr_v2)
+		bkey_i_to_btree_ptr_v2(&b->key)->v.sectors_written =
+			cpu_to_le16(b->written);
+
+	if (wbio->key.k.type == KEY_TYPE_btree_ptr_v2)
+		bkey_i_to_btree_ptr_v2(&wbio->key)->v.sectors_written =
+			cpu_to_le16(b->written);
+
	atomic64_inc(&c->btree_writes_nr);
	atomic64_add(sectors_to_write, &c->btree_writes_sectors);
@@ -1915,6 +1974,10 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b)
	return;
err:
	set_btree_node_noevict(b);
+	if (!b->written &&
+	    b->key.k.type == KEY_TYPE_btree_ptr_v2)
+		bkey_i_to_btree_ptr_v2(&b->key)->v.sectors_written =
+			cpu_to_le16(sectors_to_write);
	b->written += sectors_to_write;
nowrite:
	btree_bounce_free(c, bytes, used_mempool, data);
@@ -1986,7 +2049,7 @@ void bch2_btree_node_write(struct bch_fs *c, struct btree *b,
	if (lock_type_held == SIX_LOCK_intent ||
	    (lock_type_held == SIX_LOCK_read &&
	     six_lock_tryupgrade(&b->c.lock))) {
-		__bch2_btree_node_write(c, b);
+		__bch2_btree_node_write(c, b, false);

		/* don't cycle lock unnecessarily: */
		if (btree_node_just_written(b) &&
@@ -1998,7 +2061,7 @@ void bch2_btree_node_write(struct bch_fs *c, struct btree *b,
		if (lock_type_held == SIX_LOCK_read)
			six_lock_downgrade(&b->c.lock);
	} else {
-		__bch2_btree_node_write(c, b);
+		__bch2_btree_node_write(c, b, false);
		if (lock_type_held == SIX_LOCK_write &&
		    btree_node_just_written(b))
			bch2_btree_post_write_cleanup(c, b);
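The new sectors_written field is stored in the btree_ptr_v2 value as a little-endian u16, hence the cpu_to_le16()/le16_to_cpu() pairs around every access above. A standalone illustration of why on-disk fields are converted at the boundary; the helper names are borrowed from the kernel, but the implementations here are sketches:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* On-disk structures are defined in one fixed byte order (little endian
 * here), and every CPU-side access converts explicitly, so the format is
 * identical on big- and little-endian hosts.
 */
typedef uint16_t __le16;

static inline __le16 cpu_to_le16(uint16_t v)
{
	uint8_t b[2] = { v & 0xff, v >> 8 };
	__le16 out;
	memcpy(&out, b, 2);
	return out;
}

static inline uint16_t le16_to_cpu(__le16 v)
{
	uint8_t b[2];
	memcpy(b, &v, 2);
	return b[0] | (uint16_t) b[1] << 8;
}

struct btree_ptr_v2_sketch {
	__le16 sectors_written;   /* on-disk layout: always LE */
};

int main(void)
{
	struct btree_ptr_v2_sketch p;

	p.sectors_written = cpu_to_le16(512);
	printf("%u\n", le16_to_cpu(p.sectors_written));   /* 512 */
	return 0;
}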
diff --git a/fs/bcachefs/btree_io.h b/fs/bcachefs/btree_io.h
index eeb9d23caf88..0f20224e2a77 100644
--- a/fs/bcachefs/btree_io.h
+++ b/fs/bcachefs/btree_io.h
@@ -32,6 +32,13 @@ static inline void clear_btree_node_dirty(struct bch_fs *c, struct btree *b)
	atomic_dec(&c->btree_cache.dirty);
}

+static inline unsigned btree_ptr_sectors_written(struct bkey_i *k)
+{
+	return k->k.type == KEY_TYPE_btree_ptr_v2
+		? le16_to_cpu(bkey_i_to_btree_ptr_v2(k)->v.sectors_written)
+		: 0;
+}
+
 struct btree_read_bio {
	struct bch_fs		*c;
	struct btree		*b;
@@ -48,28 +55,17 @@ struct btree_write_bio {
	struct work_struct	work;
	__BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX);
	void			*data;
-	unsigned		bytes;
+	unsigned		data_bytes;
+	unsigned		sector_offset;
	struct bch_write_bio	wbio;
};

-static inline void btree_node_io_unlock(struct btree *b)
-{
-	EBUG_ON(!btree_node_write_in_flight(b));
-	clear_btree_node_write_in_flight(b);
-	wake_up_bit(&b->flags, BTREE_NODE_write_in_flight);
-}
-
-static inline void btree_node_io_lock(struct btree *b)
-{
-	wait_on_bit_lock_io(&b->flags, BTREE_NODE_write_in_flight,
-			    TASK_UNINTERRUPTIBLE);
-}
-
-static inline void btree_node_wait_on_io(struct btree *b)
-{
-	wait_on_bit_io(&b->flags, BTREE_NODE_write_in_flight,
-		       TASK_UNINTERRUPTIBLE);
-}
+void bch2_btree_node_io_unlock(struct btree *);
+void bch2_btree_node_io_lock(struct btree *);
+void __bch2_btree_node_wait_on_read(struct btree *);
+void __bch2_btree_node_wait_on_write(struct btree *);
+void bch2_btree_node_wait_on_read(struct btree *);
+void bch2_btree_node_wait_on_write(struct btree *);

 static inline bool btree_node_may_write(struct btree *b)
 {
@@ -126,7 +122,7 @@ static inline void bset_encrypt(struct bch_fs *c, struct bset *i, unsigned offse
		bch2_encrypt(c, BSET_CSUM_TYPE(i), nonce, &bn->flags,
			     bytes);

-		nonce = nonce_add(nonce, round_up(bytes, CHACHA20_BLOCK_SIZE));
+		nonce = nonce_add(nonce, round_up(bytes, CHACHA_BLOCK_SIZE));
	}

	bch2_encrypt(c, BSET_CSUM_TYPE(i), nonce, i->_data,
@@ -138,8 +134,7 @@ void bch2_btree_sort_into(struct bch_fs *, struct btree *, struct btree *);
 void bch2_btree_node_drop_keys_outside_node(struct btree *);

 void bch2_btree_build_aux_trees(struct btree *);
-void bch2_btree_init_next(struct bch_fs *, struct btree *,
-			  struct btree_iter *);
+void bch2_btree_init_next(struct btree_trans *, struct btree *);

 int bch2_btree_node_read_done(struct bch_fs *, struct bch_dev *,
			      struct btree *, bool);
@@ -149,9 +144,8 @@ int bch2_btree_root_read(struct bch_fs *, enum btree_id,

 void bch2_btree_complete_write(struct bch_fs *, struct btree *,
			       struct btree_write *);
-void bch2_btree_write_error_work(struct work_struct *);

-void __bch2_btree_node_write(struct bch_fs *, struct btree *);
+void __bch2_btree_node_write(struct bch_fs *, struct btree *, bool);
 bool bch2_btree_post_write_cleanup(struct bch_fs *, struct btree *);

 void bch2_btree_node_write(struct bch_fs *, struct btree *,
@@ -160,18 +154,11 @@ void bch2_btree_node_write(struct bch_fs *, struct btree *,
 static inline void btree_node_write_if_need(struct bch_fs *c, struct btree *b,
					    enum six_lock_type lock_held)
 {
-	while (b->written &&
-	       btree_node_need_write(b) &&
-	       btree_node_may_write(b)) {
-		if (!btree_node_write_in_flight(b)) {
-			bch2_btree_node_write(c, b, lock_held);
-			break;
-		}
-
-		six_unlock_type(&b->c.lock, lock_held);
-		btree_node_wait_on_io(b);
-		btree_node_lock_type(c, b, lock_held);
-	}
+	if (b->written &&
+	    btree_node_need_write(b) &&
+	    btree_node_may_write(b) &&
+	    !btree_node_write_in_flight(b))
+		bch2_btree_node_write(c, b, lock_held);
 }

 #define bch2_btree_node_write_cond(_c, _b, cond)			\
diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c
index cd714dc2df3c..eb7e1cde1bf3 100644
--- a/fs/bcachefs/btree_iter.c
+++ b/fs/bcachefs/btree_iter.c
@@ -13,16 +13,55 @@
 #include "extents.h"
 #include "journal.h"
 #include "replicas.h"
+#include "subvolume.h"
"subvolume.h" #include <linux/prefetch.h> #include <trace/events/bcachefs.h> -static void btree_iter_set_search_pos(struct btree_iter *, struct bpos); +static void btree_trans_verify_sorted(struct btree_trans *); +static void btree_path_check_sort(struct btree_trans *, struct btree_path *, int); -static inline struct bpos bkey_successor(struct btree_iter *iter, struct bpos p) +static inline void btree_path_list_remove(struct btree_trans *, struct btree_path *); +static inline void btree_path_list_add(struct btree_trans *, struct btree_path *, + struct btree_path *); + +static struct btree_path *btree_path_alloc(struct btree_trans *, struct btree_path *); + +/* + * Unlocks before scheduling + * Note: does not revalidate iterator + */ +static inline int bch2_trans_cond_resched(struct btree_trans *trans) { - EBUG_ON(btree_iter_type(iter) == BTREE_ITER_NODES); + if (need_resched() || race_fault()) { + bch2_trans_unlock(trans); + schedule(); + return bch2_trans_relock(trans) ? 0 : -EINTR; + } else { + return 0; + } +} +static inline int __btree_path_cmp(const struct btree_path *l, + enum btree_id r_btree_id, + bool r_cached, + struct bpos r_pos, + unsigned r_level) +{ + return cmp_int(l->btree_id, r_btree_id) ?: + cmp_int((int) l->cached, (int) r_cached) ?: + bpos_cmp(l->pos, r_pos) ?: + -cmp_int(l->level, r_level); +} + +static inline int btree_path_cmp(const struct btree_path *l, + const struct btree_path *r) +{ + return __btree_path_cmp(l, r->btree_id, r->cached, r->pos, r->level); +} + +static inline struct bpos bkey_successor(struct btree_iter *iter, struct bpos p) +{ /* Are we iterating over keys in all snapshots? */ if (iter->flags & BTREE_ITER_ALL_SNAPSHOTS) { p = bpos_successor(p); @@ -36,8 +75,6 @@ static inline struct bpos bkey_successor(struct btree_iter *iter, struct bpos p) static inline struct bpos bkey_predecessor(struct btree_iter *iter, struct bpos p) { - EBUG_ON(btree_iter_type(iter) == BTREE_ITER_NODES); - /* Are we iterating over keys in all snapshots? 
+static inline struct bpos bkey_successor(struct btree_iter *iter, struct bpos p)
+{
	/* Are we iterating over keys in all snapshots? */
	if (iter->flags & BTREE_ITER_ALL_SNAPSHOTS) {
		p = bpos_successor(p);
@@ -36,8 +75,6 @@ static inline struct bpos bkey_successor(struct btree_iter *iter, struct bpos p)

 static inline struct bpos bkey_predecessor(struct btree_iter *iter, struct bpos p)
 {
-	EBUG_ON(btree_iter_type(iter) == BTREE_ITER_NODES);
-
	/* Are we iterating over keys in all snapshots? */
	if (iter->flags & BTREE_ITER_ALL_SNAPSHOTS) {
		p = bpos_predecessor(p);
@@ -49,10 +86,10 @@ static inline struct bpos bkey_predecessor(struct btree_iter *iter, struct bpos
	return p;
}

-static inline bool is_btree_node(struct btree_iter *iter, unsigned l)
+static inline bool is_btree_node(struct btree_path *path, unsigned l)
 {
	return l < BTREE_MAX_DEPTH &&
-		(unsigned long) iter->l[l].b >= 128;
+		(unsigned long) path->l[l].b >= 128;
 }

 static inline struct bpos btree_iter_search_key(struct btree_iter *iter)
@@ -65,41 +102,40 @@ static inline struct bpos btree_iter_search_key(struct btree_iter *iter)
	return pos;
}

-static inline bool btree_iter_pos_before_node(struct btree_iter *iter,
+static inline bool btree_path_pos_before_node(struct btree_path *path,
					      struct btree *b)
 {
-	return bpos_cmp(iter->real_pos, b->data->min_key) < 0;
+	return bpos_cmp(path->pos, b->data->min_key) < 0;
 }

-static inline bool btree_iter_pos_after_node(struct btree_iter *iter,
+static inline bool btree_path_pos_after_node(struct btree_path *path,
					     struct btree *b)
 {
-	return bpos_cmp(b->key.k.p, iter->real_pos) < 0;
+	return bpos_cmp(b->key.k.p, path->pos) < 0;
 }

-static inline bool btree_iter_pos_in_node(struct btree_iter *iter,
+static inline bool btree_path_pos_in_node(struct btree_path *path,
					  struct btree *b)
 {
-	return iter->btree_id == b->c.btree_id &&
-		!btree_iter_pos_before_node(iter, b) &&
-		!btree_iter_pos_after_node(iter, b);
+	return path->btree_id == b->c.btree_id &&
+		!btree_path_pos_before_node(path, b) &&
+		!btree_path_pos_after_node(path, b);
 }

 /* Btree node locking: */

-void bch2_btree_node_unlock_write(struct btree *b, struct btree_iter *iter)
+void bch2_btree_node_unlock_write(struct btree_trans *trans,
+			struct btree_path *path, struct btree *b)
 {
-	bch2_btree_node_unlock_write_inlined(b, iter);
+	bch2_btree_node_unlock_write_inlined(trans, path, b);
 }

-void __bch2_btree_node_lock_write(struct btree *b, struct btree_iter *iter)
+void __bch2_btree_node_lock_write(struct btree_trans *trans, struct btree *b)
 {
-	struct btree_iter *linked;
+	struct btree_path *linked;
	unsigned readers = 0;

-	EBUG_ON(!btree_node_intent_locked(iter, b->c.level));
-
-	trans_for_each_iter(iter->trans, linked)
+	trans_for_each_path(trans, linked)
		if (linked->l[b->c.level].b == b &&
		    btree_node_read_locked(linked, b->c.level))
			readers++;
@@ -112,138 +148,141 @@ void __bch2_btree_node_lock_write(struct btree *b, struct btree_iter *iter)
	 */
	atomic64_sub(__SIX_VAL(read_lock, readers),
		     &b->c.lock.state.counter);
-	btree_node_lock_type(iter->trans->c, b, SIX_LOCK_write);
+	btree_node_lock_type(trans->c, b, SIX_LOCK_write);
	atomic64_add(__SIX_VAL(read_lock, readers),
		     &b->c.lock.state.counter);
}

-bool __bch2_btree_node_relock(struct btree_iter *iter, unsigned level)
+bool __bch2_btree_node_relock(struct btree_trans *trans,
+			      struct btree_path *path, unsigned level)
 {
-	struct btree *b = btree_iter_node(iter, level);
-	int want = __btree_lock_want(iter, level);
+	struct btree *b = btree_path_node(path, level);
+	int want = __btree_lock_want(path, level);

-	if (!is_btree_node(iter, level))
+	if (!is_btree_node(path, level))
		return false;

	if (race_fault())
		return false;

-	if (six_relock_type(&b->c.lock, want, iter->l[level].lock_seq) ||
-	    (btree_node_lock_seq_matches(iter, b, level) &&
-	     btree_node_lock_increment(iter->trans, b, level, want))) {
-		mark_btree_node_locked(iter, level, want);
+	if (six_relock_type(&b->c.lock, want, path->l[level].lock_seq) ||
+	    (btree_node_lock_seq_matches(path, b, level) &&
btree_node_lock_increment(trans, b, level, want))) { + mark_btree_node_locked(path, level, want); return true; } else { return false; } } -static bool bch2_btree_node_upgrade(struct btree_iter *iter, unsigned level) +bool bch2_btree_node_upgrade(struct btree_trans *trans, + struct btree_path *path, unsigned level) { - struct btree *b = iter->l[level].b; + struct btree *b = path->l[level].b; - EBUG_ON(btree_lock_want(iter, level) != BTREE_NODE_INTENT_LOCKED); - - if (!is_btree_node(iter, level)) + if (!is_btree_node(path, level)) return false; - if (btree_node_intent_locked(iter, level)) + switch (btree_lock_want(path, level)) { + case BTREE_NODE_UNLOCKED: + BUG_ON(btree_node_locked(path, level)); + return true; + case BTREE_NODE_READ_LOCKED: + BUG_ON(btree_node_intent_locked(path, level)); + return bch2_btree_node_relock(trans, path, level); + case BTREE_NODE_INTENT_LOCKED: + break; + } + + if (btree_node_intent_locked(path, level)) return true; if (race_fault()) return false; - if (btree_node_locked(iter, level) + if (btree_node_locked(path, level) ? six_lock_tryupgrade(&b->c.lock) - : six_relock_type(&b->c.lock, SIX_LOCK_intent, iter->l[level].lock_seq)) + : six_relock_type(&b->c.lock, SIX_LOCK_intent, path->l[level].lock_seq)) goto success; - if (btree_node_lock_seq_matches(iter, b, level) && - btree_node_lock_increment(iter->trans, b, level, BTREE_NODE_INTENT_LOCKED)) { - btree_node_unlock(iter, level); + if (btree_node_lock_seq_matches(path, b, level) && + btree_node_lock_increment(trans, b, level, BTREE_NODE_INTENT_LOCKED)) { + btree_node_unlock(path, level); goto success; } return false; success: - mark_btree_node_intent_locked(iter, level); + mark_btree_node_intent_locked(path, level); return true; } -static inline bool btree_iter_get_locks(struct btree_iter *iter, bool upgrade, - unsigned long trace_ip) +static inline bool btree_path_get_locks(struct btree_trans *trans, + struct btree_path *path, + bool upgrade, unsigned long trace_ip) { - unsigned l = iter->level; + unsigned l = path->level; int fail_idx = -1; do { - if (!btree_iter_node(iter, l)) + if (!btree_path_node(path, l)) break; if (!(upgrade - ? bch2_btree_node_upgrade(iter, l) - : bch2_btree_node_relock(iter, l))) { - (upgrade - ? trace_node_upgrade_fail - : trace_node_relock_fail)(iter->trans->ip, trace_ip, - iter->btree_id, &iter->real_pos, - l, iter->l[l].lock_seq, - is_btree_node(iter, l) - ? 0 - : (unsigned long) iter->l[l].b, - is_btree_node(iter, l) - ? iter->l[l].b->c.lock.state.seq - : 0); - + ? 
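Relocking in __bch2_btree_node_relock() above hinges on per-node lock sequence numbers: a path remembers the sequence it observed when it locked a node, and may cheaply retake the lock only if the sequence is unchanged, i.e. nobody wrote the node in between. Sketched with toy types (six_relock_type() performs the comparison and the lock acquisition as one atomic operation):

#include <stdbool.h>

struct toy_node {
	unsigned	lock_seq;	/* bumped on every write lock of the node */
};

struct toy_path_level {
	struct toy_node	*b;
	unsigned	lock_seq;	/* value observed when this path locked b */
};

/*
 * A dropped lock can be retaken cheaply iff nothing modified the node
 * in between: the saved sequence number must still match.
 */
static bool toy_relock(struct toy_path_level *l)
{
	return l->b->lock_seq == l->lock_seq;
}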
bch2_btree_node_upgrade(trans, path, l) + : bch2_btree_node_relock(trans, path, l))) fail_idx = l; - btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE); - } l++; - } while (l < iter->locks_want); + } while (l < path->locks_want); /* * When we fail to get a lock, we have to ensure that any child nodes - * can't be relocked so bch2_btree_iter_traverse has to walk back up to + * can't be relocked so bch2_btree_path_traverse has to walk back up to * the node that we failed to relock: */ - while (fail_idx >= 0) { - btree_node_unlock(iter, fail_idx); - iter->l[fail_idx].b = BTREE_ITER_NO_NODE_GET_LOCKS; - --fail_idx; + if (fail_idx >= 0) { + __bch2_btree_path_unlock(path); + btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); + + do { + path->l[fail_idx].b = BTREE_ITER_NO_NODE_GET_LOCKS; + --fail_idx; + } while (fail_idx >= 0); } - if (iter->uptodate == BTREE_ITER_NEED_RELOCK) - iter->uptodate = BTREE_ITER_NEED_PEEK; + if (path->uptodate == BTREE_ITER_NEED_RELOCK) + path->uptodate = BTREE_ITER_UPTODATE; - bch2_btree_trans_verify_locks(iter->trans); + bch2_trans_verify_locks(trans); - return iter->uptodate < BTREE_ITER_NEED_RELOCK; + return path->uptodate < BTREE_ITER_NEED_RELOCK; } static struct bpos btree_node_pos(struct btree_bkey_cached_common *_b, - enum btree_iter_type type) + bool cached) { - return type != BTREE_ITER_CACHED + return !cached ? container_of(_b, struct btree, c)->key.k.p : container_of(_b, struct bkey_cached, c)->key.pos; } /* Slowpath: */ -bool __bch2_btree_node_lock(struct btree *b, struct bpos pos, - unsigned level, struct btree_iter *iter, +bool __bch2_btree_node_lock(struct btree_trans *trans, + struct btree_path *path, + struct btree *b, + struct bpos pos, unsigned level, enum six_lock_type type, six_lock_should_sleep_fn should_sleep_fn, void *p, unsigned long ip) { - struct btree_trans *trans = iter->trans; - struct btree_iter *linked, *deadlock_iter = NULL; + struct btree_path *linked, *deadlock_path = NULL; u64 start_time = local_clock(); unsigned reason = 9; bool ret; /* Check if it's safe to block: */ - trans_for_each_iter(trans, linked) { + trans_for_each_path(trans, linked) { if (!linked->nodes_locked) continue; @@ -261,25 +300,25 @@ bool __bch2_btree_node_lock(struct btree *b, struct bpos pos, */ if (type == SIX_LOCK_intent && linked->nodes_locked != linked->nodes_intent_locked) { - deadlock_iter = linked; + deadlock_path = linked; reason = 1; } - if (linked->btree_id != iter->btree_id) { - if (linked->btree_id > iter->btree_id) { - deadlock_iter = linked; + if (linked->btree_id != path->btree_id) { + if (linked->btree_id > path->btree_id) { + deadlock_path = linked; reason = 3; } continue; } /* - * Within the same btree, cached iterators come before non - * cached iterators: + * Within the same btree, cached paths come before non + * cached paths: */ - if (btree_iter_is_cached(linked) != btree_iter_is_cached(iter)) { - if (btree_iter_is_cached(iter)) { - deadlock_iter = linked; + if (linked->cached != path->cached) { + if (path->cached) { + deadlock_path = linked; reason = 4; } continue; @@ -287,33 +326,34 @@ bool __bch2_btree_node_lock(struct btree *b, struct bpos pos, /* * Interior nodes must be locked before their descendants: if - * another iterator has possible descendants locked of the node + * another path has possible descendants locked of the node * we're about to lock, it must have the ancestors locked too: */ if (level > __fls(linked->nodes_locked)) { - deadlock_iter = linked; + deadlock_path = linked; reason = 5; } /* Must lock btree nodes in 
key order: */ if (btree_node_locked(linked, level) && bpos_cmp(pos, btree_node_pos((void *) linked->l[level].b, - btree_iter_type(linked))) <= 0) { - deadlock_iter = linked; + linked->cached)) <= 0) { + deadlock_path = linked; reason = 7; BUG_ON(trans->in_traverse_all); } } - if (unlikely(deadlock_iter)) { - trace_trans_restart_would_deadlock(iter->trans->ip, ip, + if (unlikely(deadlock_path)) { + trace_trans_restart_would_deadlock(trans->ip, ip, trans->in_traverse_all, reason, - deadlock_iter->btree_id, - btree_iter_type(deadlock_iter), - &deadlock_iter->real_pos, - iter->btree_id, - btree_iter_type(iter), + deadlock_path->btree_id, + deadlock_path->cached, + &deadlock_path->pos, + path->btree_id, + path->cached, &pos); + btree_trans_restart(trans); return false; } @@ -321,9 +361,9 @@ bool __bch2_btree_node_lock(struct btree *b, struct bpos pos, return true; #ifdef CONFIG_BCACHEFS_DEBUG - trans->locking_iter_idx = iter->idx; + trans->locking_path_idx = path->idx; trans->locking_pos = pos; - trans->locking_btree_id = iter->btree_id; + trans->locking_btree_id = path->btree_id; trans->locking_level = level; trans->locking = b; #endif @@ -342,52 +382,79 @@ bool __bch2_btree_node_lock(struct btree *b, struct bpos pos, /* Btree iterator locking: */ #ifdef CONFIG_BCACHEFS_DEBUG -static void bch2_btree_iter_verify_locks(struct btree_iter *iter) + +static void bch2_btree_path_verify_locks(struct btree_path *path) { unsigned l; - if (!(iter->trans->iters_linked & (1ULL << iter->idx))) { - BUG_ON(iter->nodes_locked); + if (!path->nodes_locked) { + BUG_ON(path->uptodate == BTREE_ITER_UPTODATE && + btree_path_node(path, path->level)); return; } - for (l = 0; is_btree_node(iter, l); l++) { - if (iter->uptodate >= BTREE_ITER_NEED_RELOCK && - !btree_node_locked(iter, l)) - continue; - - BUG_ON(btree_lock_want(iter, l) != - btree_node_locked_type(iter, l)); - } + for (l = 0; btree_path_node(path, l); l++) + BUG_ON(btree_lock_want(path, l) != + btree_node_locked_type(path, l)); } -void bch2_btree_trans_verify_locks(struct btree_trans *trans) +void bch2_trans_verify_locks(struct btree_trans *trans) { - struct btree_iter *iter; + struct btree_path *path; - trans_for_each_iter(trans, iter) - bch2_btree_iter_verify_locks(iter); + trans_for_each_path(trans, path) + bch2_btree_path_verify_locks(path); } #else -static inline void bch2_btree_iter_verify_locks(struct btree_iter *iter) {} +static inline void bch2_btree_path_verify_locks(struct btree_path *path) {} #endif +/* Btree path locking: */ + +/* + * Only for btree_cache.c - only relocks intent locks + */ +bool bch2_btree_path_relock_intent(struct btree_trans *trans, + struct btree_path *path) +{ + unsigned l; + + for (l = path->level; + l < path->locks_want && btree_path_node(path, l); + l++) { + if (!bch2_btree_node_relock(trans, path, l)) { + __bch2_btree_path_unlock(path); + btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); + btree_trans_restart(trans); + return false; + } + } + + return true; +} + __flatten -static bool bch2_btree_iter_relock(struct btree_iter *iter, unsigned long trace_ip) +static bool bch2_btree_path_relock(struct btree_trans *trans, + struct btree_path *path, unsigned long trace_ip) { - return btree_iter_get_locks(iter, false, trace_ip); + bool ret = btree_path_get_locks(trans, path, false, trace_ip); + + if (!ret) + btree_trans_restart(trans); + return ret; } -bool __bch2_btree_iter_upgrade(struct btree_iter *iter, +bool __bch2_btree_path_upgrade(struct btree_trans *trans, + struct btree_path *path, unsigned new_locks_want) { - 
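The checks above encode the global lock ordering that keeps transactions deadlock free, reporting a numbered reason through trace_trans_restart_would_deadlock() when a restart is forced. A condensed sketch of those rules (toy types; the key-order rule, reason 7, is omitted here because it also needs the node positions):

#include <stdbool.h>

/* What we know about a path that already holds locks: */
struct toy_held {
	unsigned	btree_id;
	bool		cached;
	unsigned	top_level;	/* highest btree level it has locked */
	bool		has_read_locks;	/* holds any non-intent locks */
};

/*
 * Returns a non-zero reason code (mirroring the trace above) if locking
 * (btree_id, cached, level) after 'linked' would violate the ordering.
 */
static int toy_lock_order_violation(const struct toy_held *linked,
				    unsigned btree_id, bool cached,
				    unsigned level, bool want_intent)
{
	if (want_intent && linked->has_read_locks)
		return 1;			/* no intent locks after read locks */

	if (linked->btree_id != btree_id)
		return linked->btree_id > btree_id ? 3 : 0; /* btree ids ascend */

	if (linked->cached != cached)
		return cached ? 4 : 0;		/* cached locks before uncached */

	if (level > linked->top_level)
		return 5;			/* ancestors before descendants */

	return 0;
}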
struct btree_iter *linked; + struct btree_path *linked; - EBUG_ON(iter->locks_want >= new_locks_want); + EBUG_ON(path->locks_want >= new_locks_want); - iter->locks_want = new_locks_want; + path->locks_want = new_locks_want; - if (btree_iter_get_locks(iter, true, _THIS_IP_)) + if (btree_path_get_locks(trans, path, true, _THIS_IP_)) return true; /* @@ -395,7 +462,7 @@ bool __bch2_btree_iter_upgrade(struct btree_iter *iter, * iterators in the btree_trans here. * * On failure to upgrade the iterator, setting iter->locks_want and - * calling get_locks() is sufficient to make bch2_btree_iter_traverse() + * calling get_locks() is sufficient to make bch2_btree_path_traverse() * get the locks we want on transaction restart. * * But if this iterator was a clone, on transaction restart what we did @@ -407,71 +474,68 @@ bool __bch2_btree_iter_upgrade(struct btree_iter *iter, * * The code below used to be needed to ensure ancestor nodes get locked * before interior nodes - now that's handled by - * bch2_btree_iter_traverse_all(). + * bch2_btree_path_traverse_all(). */ - trans_for_each_iter(iter->trans, linked) - if (linked != iter && - btree_iter_type(linked) == btree_iter_type(iter) && - linked->btree_id == iter->btree_id && + trans_for_each_path(trans, linked) + if (linked != path && + linked->cached == path->cached && + linked->btree_id == path->btree_id && linked->locks_want < new_locks_want) { linked->locks_want = new_locks_want; - btree_iter_get_locks(linked, true, _THIS_IP_); + btree_path_get_locks(trans, linked, true, _THIS_IP_); } return false; } -void __bch2_btree_iter_downgrade(struct btree_iter *iter, +void __bch2_btree_path_downgrade(struct btree_path *path, unsigned new_locks_want) { unsigned l; - EBUG_ON(iter->locks_want < new_locks_want); + EBUG_ON(path->locks_want < new_locks_want); - iter->locks_want = new_locks_want; + path->locks_want = new_locks_want; - while (iter->nodes_locked && - (l = __fls(iter->nodes_locked)) >= iter->locks_want) { - if (l > iter->level) { - btree_node_unlock(iter, l); + while (path->nodes_locked && + (l = __fls(path->nodes_locked)) >= path->locks_want) { + if (l > path->level) { + btree_node_unlock(path, l); } else { - if (btree_node_intent_locked(iter, l)) { - six_lock_downgrade(&iter->l[l].b->c.lock); - iter->nodes_intent_locked ^= 1 << l; + if (btree_node_intent_locked(path, l)) { + six_lock_downgrade(&path->l[l].b->c.lock); + path->nodes_intent_locked ^= 1 << l; } break; } } - bch2_btree_trans_verify_locks(iter->trans); + bch2_btree_path_verify_locks(path); } void bch2_trans_downgrade(struct btree_trans *trans) { - struct btree_iter *iter; + struct btree_path *path; - trans_for_each_iter(trans, iter) - bch2_btree_iter_downgrade(iter); + trans_for_each_path(trans, path) + bch2_btree_path_downgrade(path); } /* Btree transaction locking: */ -static inline bool btree_iter_should_be_locked(struct btree_trans *trans, - struct btree_iter *iter) -{ - return (iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT) || - iter->should_be_locked; -} - bool bch2_trans_relock(struct btree_trans *trans) { - struct btree_iter *iter; + struct btree_path *path; + + if (unlikely(trans->restarted)) + return false; - trans_for_each_iter(trans, iter) - if (!bch2_btree_iter_relock(iter, _RET_IP_) && - btree_iter_should_be_locked(trans, iter)) { + trans_for_each_path(trans, path) + if (path->should_be_locked && + !bch2_btree_path_relock(trans, path, _RET_IP_)) { trace_trans_restart_relock(trans->ip, _RET_IP_, - iter->btree_id, &iter->real_pos); + path->btree_id, &path->pos); + 
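__bch2_btree_path_downgrade() above walks the nodes_locked bitmask top down: levels above path->level that exceed the new locks_want are unlocked outright, while the level still in use is merely downgraded from intent to read. A userspace sketch of that bitmask walk (highest_locked() standing in for __fls(); toy fields, not the kernel layout):

struct toy_path {
	unsigned	level;			/* lowest level of interest */
	unsigned	locks_want;
	unsigned	nodes_locked;		/* bit l: level l is locked */
	unsigned	nodes_intent_locked;	/* bit l: ...with an intent lock */
};

static unsigned highest_locked(unsigned mask)	/* mask must be non-zero */
{
	return 31 - __builtin_clz(mask);
}

static void toy_downgrade(struct toy_path *p, unsigned new_locks_want)
{
	unsigned l;

	p->locks_want = new_locks_want;

	while (p->nodes_locked &&
	       (l = highest_locked(p->nodes_locked)) >= p->locks_want) {
		if (l > p->level) {
			/* no longer need this level at all: unlock it */
			p->nodes_locked	       &= ~(1U << l);
			p->nodes_intent_locked &= ~(1U << l);
		} else {
			/* still need the node, just not an intent lock */
			p->nodes_intent_locked &= ~(1U << l);
			break;
		}
	}
}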
BUG_ON(!trans->restarted); return false; } return true; @@ -479,36 +543,39 @@ bool bch2_trans_relock(struct btree_trans *trans) void bch2_trans_unlock(struct btree_trans *trans) { - struct btree_iter *iter; + struct btree_path *path; - trans_for_each_iter(trans, iter) - __bch2_btree_iter_unlock(iter); + trans_for_each_path(trans, path) + __bch2_btree_path_unlock(path); + + BUG_ON(lock_class_is_held(&bch2_btree_node_lock_key)); } /* Btree iterator: */ #ifdef CONFIG_BCACHEFS_DEBUG -static void bch2_btree_iter_verify_cached(struct btree_iter *iter) +static void bch2_btree_path_verify_cached(struct btree_trans *trans, + struct btree_path *path) { struct bkey_cached *ck; - bool locked = btree_node_locked(iter, 0); + bool locked = btree_node_locked(path, 0); - if (!bch2_btree_node_relock(iter, 0)) + if (!bch2_btree_node_relock(trans, path, 0)) return; - ck = (void *) iter->l[0].b; - BUG_ON(ck->key.btree_id != iter->btree_id || - bkey_cmp(ck->key.pos, iter->pos)); + ck = (void *) path->l[0].b; + BUG_ON(ck->key.btree_id != path->btree_id || + bkey_cmp(ck->key.pos, path->pos)); if (!locked) - btree_node_unlock(iter, 0); + btree_node_unlock(path, 0); } -static void bch2_btree_iter_verify_level(struct btree_iter *iter, - unsigned level) +static void bch2_btree_path_verify_level(struct btree_trans *trans, + struct btree_path *path, unsigned level) { - struct btree_iter_level *l; + struct btree_path_level *l; struct btree_node_iter tmp; bool locked; struct bkey_packed *p, *k; @@ -518,65 +585,52 @@ static void bch2_btree_iter_verify_level(struct btree_iter *iter, if (!bch2_debug_check_iterators) return; - l = &iter->l[level]; + l = &path->l[level]; tmp = l->iter; - locked = btree_node_locked(iter, level); + locked = btree_node_locked(path, level); - if (btree_iter_type(iter) == BTREE_ITER_CACHED) { + if (path->cached) { if (!level) - bch2_btree_iter_verify_cached(iter); + bch2_btree_path_verify_cached(trans, path); return; } - BUG_ON(iter->level < iter->min_depth); - - if (!btree_iter_node(iter, level)) + if (!btree_path_node(path, level)) return; - if (!bch2_btree_node_relock(iter, level)) + if (!bch2_btree_node_relock(trans, path, level)) return; - BUG_ON(!btree_iter_pos_in_node(iter, l->b)); - - /* - * node iterators don't use leaf node iterator: - */ - if (btree_iter_type(iter) == BTREE_ITER_NODES && - level <= iter->min_depth) - goto unlock; + BUG_ON(!btree_path_pos_in_node(path, l->b)); bch2_btree_node_iter_verify(&l->iter, l->b); /* - * For interior nodes, the iterator will have skipped past - * deleted keys: - * - * For extents, the iterator may have skipped past deleted keys (but not - * whiteouts) + * For interior nodes, the iterator will have skipped past deleted keys: */ - p = level || btree_node_type_is_extents(iter->btree_id) + p = level ? 
bch2_btree_node_iter_prev(&tmp, l->b) : bch2_btree_node_iter_prev_all(&tmp, l->b); k = bch2_btree_node_iter_peek_all(&l->iter, l->b); - if (p && bkey_iter_pos_cmp(l->b, p, &iter->real_pos) >= 0) { + if (p && bkey_iter_pos_cmp(l->b, p, &path->pos) >= 0) { msg = "before"; goto err; } - if (k && bkey_iter_pos_cmp(l->b, k, &iter->real_pos) < 0) { + if (k && bkey_iter_pos_cmp(l->b, k, &path->pos) < 0) { msg = "after"; goto err; } -unlock: + if (!locked) - btree_node_unlock(iter, level); + btree_node_unlock(path, level); return; err: strcpy(buf2, "(none)"); strcpy(buf3, "(none)"); - bch2_bpos_to_text(&PBUF(buf1), iter->real_pos); + bch2_bpos_to_text(&PBUF(buf1), path->pos); if (p) { struct bkey uk = bkey_unpack_key(l->b, p); @@ -588,19 +642,49 @@ err: bch2_bkey_to_text(&PBUF(buf3), &uk); } - panic("iterator should be %s key at level %u:\n" - "iter pos %s\n" + panic("path should be %s key at level %u:\n" + "path pos %s\n" "prev key %s\n" "cur key %s\n", msg, level, buf1, buf2, buf3); } -static void bch2_btree_iter_verify(struct btree_iter *iter) +static void bch2_btree_path_verify(struct btree_trans *trans, + struct btree_path *path) { - enum btree_iter_type type = btree_iter_type(iter); + struct bch_fs *c = trans->c; unsigned i; - EBUG_ON(iter->btree_id >= BTREE_ID_NR); + EBUG_ON(path->btree_id >= BTREE_ID_NR); + + for (i = 0; i < (!path->cached ? BTREE_MAX_DEPTH : 1); i++) { + if (!path->l[i].b) { + BUG_ON(!path->cached && + c->btree_roots[path->btree_id].b->c.level > i); + break; + } + + bch2_btree_path_verify_level(trans, path, i); + } + + bch2_btree_path_verify_locks(path); +} + +void bch2_trans_verify_paths(struct btree_trans *trans) +{ + struct btree_path *path; + + trans_for_each_path(trans, path) + bch2_btree_path_verify(trans, path); +} + +static void bch2_btree_iter_verify(struct btree_iter *iter) +{ + struct btree_trans *trans = iter->trans; + + BUG_ON(iter->btree_id >= BTREE_ID_NR); + + BUG_ON(!!(iter->flags & BTREE_ITER_CACHED) != iter->path->cached); BUG_ON(!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS) && iter->pos.snapshot != iter->snapshot); @@ -608,51 +692,125 @@ static void bch2_btree_iter_verify(struct btree_iter *iter) BUG_ON((iter->flags & BTREE_ITER_IS_EXTENTS) && (iter->flags & BTREE_ITER_ALL_SNAPSHOTS)); - BUG_ON(type == BTREE_ITER_NODES && - !(iter->flags & BTREE_ITER_ALL_SNAPSHOTS)); - - BUG_ON(type != BTREE_ITER_NODES && + BUG_ON(!(iter->flags & __BTREE_ITER_ALL_SNAPSHOTS) && (iter->flags & BTREE_ITER_ALL_SNAPSHOTS) && !btree_type_has_snapshots(iter->btree_id)); - bch2_btree_iter_verify_locks(iter); - - for (i = 0; i < BTREE_MAX_DEPTH; i++) - bch2_btree_iter_verify_level(iter, i); + bch2_btree_path_verify(trans, iter->path); } static void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter) { - enum btree_iter_type type = btree_iter_type(iter); + BUG_ON((iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) && + !iter->pos.snapshot); BUG_ON(!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS) && iter->pos.snapshot != iter->snapshot); - BUG_ON((type == BTREE_ITER_KEYS || - type == BTREE_ITER_CACHED) && - (bkey_cmp(iter->pos, bkey_start_pos(&iter->k)) < 0 || - bkey_cmp(iter->pos, iter->k.p) > 0)); + BUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&iter->k)) < 0 || + bkey_cmp(iter->pos, iter->k.p) > 0); } -void bch2_btree_trans_verify_iters(struct btree_trans *trans, struct btree *b) +static int bch2_btree_iter_verify_ret(struct btree_iter *iter, struct bkey_s_c k) { - struct btree_iter *iter; + struct btree_trans *trans = iter->trans; + struct btree_iter copy; + struct bkey_s_c prev; + int ret = 
0; if (!bch2_debug_check_iterators) - return; + return 0; - trans_for_each_iter_with_node(trans, b, iter) - bch2_btree_iter_verify_level(iter, b->c.level); + if (!(iter->flags & BTREE_ITER_FILTER_SNAPSHOTS)) + return 0; + + if (bkey_err(k) || !k.k) + return 0; + + BUG_ON(!bch2_snapshot_is_ancestor(trans->c, + iter->snapshot, + k.k->p.snapshot)); + + bch2_trans_iter_init(trans, ©, iter->btree_id, iter->pos, + BTREE_ITER_ALL_SNAPSHOTS); + prev = bch2_btree_iter_prev(©); + if (!prev.k) + goto out; + + ret = bkey_err(prev); + if (ret) + goto out; + + if (!bkey_cmp(prev.k->p, k.k->p) && + bch2_snapshot_is_ancestor(trans->c, iter->snapshot, + prev.k->p.snapshot) > 0) { + char buf1[100], buf2[200]; + + bch2_bkey_to_text(&PBUF(buf1), k.k); + bch2_bkey_to_text(&PBUF(buf2), prev.k); + + panic("iter snap %u\n" + "k %s\n" + "prev %s\n", + iter->snapshot, + buf1, buf2); + } +out: + bch2_trans_iter_exit(trans, ©); + return ret; +} + +void bch2_assert_pos_locked(struct btree_trans *trans, enum btree_id id, + struct bpos pos, bool key_cache) +{ + struct btree_path *path; + unsigned idx; + char buf[100]; + + trans_for_each_path_inorder(trans, path, idx) { + int cmp = cmp_int(path->btree_id, id) ?: + cmp_int(path->cached, key_cache); + + if (cmp > 0) + break; + if (cmp < 0) + continue; + + if (!(path->nodes_locked & 1) || + !path->should_be_locked) + continue; + + if (!key_cache) { + if (bkey_cmp(pos, path->l[0].b->data->min_key) >= 0 && + bkey_cmp(pos, path->l[0].b->key.k.p) <= 0) + return; + } else { + if (!bkey_cmp(pos, path->pos)) + return; + } + } + + bch2_dump_trans_paths_updates(trans); + panic("not locked: %s %s%s\n", + bch2_btree_ids[id], + (bch2_bpos_to_text(&PBUF(buf), pos), buf), + key_cache ? " cached" : ""); } #else -static inline void bch2_btree_iter_verify_level(struct btree_iter *iter, unsigned l) {} +static inline void bch2_btree_path_verify_level(struct btree_trans *trans, + struct btree_path *path, unsigned l) {} +static inline void bch2_btree_path_verify(struct btree_trans *trans, + struct btree_path *path) {} static inline void bch2_btree_iter_verify(struct btree_iter *iter) {} static inline void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter) {} +static inline int bch2_btree_iter_verify_ret(struct btree_iter *iter, struct bkey_s_c k) { return 0; } #endif +/* Btree path: fixups after btree updates */ + static void btree_node_iter_set_set_pos(struct btree_node_iter *iter, struct btree *b, struct bset_tree *t, @@ -670,40 +828,38 @@ static void btree_node_iter_set_set_pos(struct btree_node_iter *iter, bch2_btree_node_iter_push(iter, b, k, btree_bkey_last(b, t)); } -static void __bch2_btree_iter_fix_key_modified(struct btree_iter *iter, +static void __bch2_btree_path_fix_key_modified(struct btree_path *path, struct btree *b, struct bkey_packed *where) { - struct btree_iter_level *l = &iter->l[b->c.level]; + struct btree_path_level *l = &path->l[b->c.level]; if (where != bch2_btree_node_iter_peek_all(&l->iter, l->b)) return; - if (bkey_iter_pos_cmp(l->b, where, &iter->real_pos) < 0) + if (bkey_iter_pos_cmp(l->b, where, &path->pos) < 0) bch2_btree_node_iter_advance(&l->iter, l->b); - - btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK); } -void bch2_btree_iter_fix_key_modified(struct btree_iter *iter, +void bch2_btree_path_fix_key_modified(struct btree_trans *trans, struct btree *b, struct bkey_packed *where) { - struct btree_iter *linked; + struct btree_path *path; - trans_for_each_iter_with_node(iter->trans, b, linked) { - __bch2_btree_iter_fix_key_modified(linked, b, where); - 
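bch2_assert_pos_locked() above scans the paths in sorted order and accepts a position as locked when some path holding its leaf lock covers it. For btree nodes that coverage test is a simple range check, sketched here with a toy bpos (no snapshot field; the key-cache case instead compares positions exactly):

struct toy_bpos { unsigned long long inode, offset; };

static int toy_bpos_cmp(struct toy_bpos l, struct toy_bpos r)
{
	if (l.inode != r.inode)
		return l.inode < r.inode ? -1 : 1;
	if (l.offset != r.offset)
		return l.offset < r.offset ? -1 : 1;
	return 0;
}

/*
 * A locked leaf guards pos iff min_key <= pos <= max_key, where in the
 * diff above min_key is b->data->min_key and max_key is b->key.k.p.
 */
static _Bool leaf_covers(struct toy_bpos min_key, struct toy_bpos max_key,
			 struct toy_bpos pos)
{
	return toy_bpos_cmp(pos, min_key) >= 0 &&
	       toy_bpos_cmp(pos, max_key) <= 0;
}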
bch2_btree_iter_verify_level(linked, b->c.level); + trans_for_each_path_with_node(trans, b, path) { + __bch2_btree_path_fix_key_modified(path, b, where); + bch2_btree_path_verify_level(trans, path, b->c.level); } } -static void __bch2_btree_node_iter_fix(struct btree_iter *iter, - struct btree *b, - struct btree_node_iter *node_iter, - struct bset_tree *t, - struct bkey_packed *where, - unsigned clobber_u64s, - unsigned new_u64s) +static void __bch2_btree_node_iter_fix(struct btree_path *path, + struct btree *b, + struct btree_node_iter *node_iter, + struct bset_tree *t, + struct bkey_packed *where, + unsigned clobber_u64s, + unsigned new_u64s) { const struct bkey_packed *end = btree_bkey_last(b, t); struct btree_node_iter_set *set; @@ -721,7 +877,7 @@ static void __bch2_btree_node_iter_fix(struct btree_iter *iter, /* didn't find the bset in the iterator - might have to readd it: */ if (new_u64s && - bkey_iter_pos_cmp(b, where, &iter->real_pos) >= 0) { + bkey_iter_pos_cmp(b, where, &path->pos) >= 0) { bch2_btree_node_iter_push(node_iter, b, where, end); goto fixup_done; } else { @@ -736,7 +892,7 @@ found: return; if (new_u64s && - bkey_iter_pos_cmp(b, where, &iter->real_pos) >= 0) { + bkey_iter_pos_cmp(b, where, &path->pos) >= 0) { set->k = offset; } else if (set->k < offset + clobber_u64s) { set->k = offset + new_u64s; @@ -762,8 +918,7 @@ fixup_done: */ if (!bch2_btree_node_iter_end(node_iter) && iter_current_key_modified && - (b->c.level || - btree_node_type_is_extents(iter->btree_id))) { + b->c.level) { struct bset_tree *t; struct bkey_packed *k, *k2, *p; @@ -788,14 +943,10 @@ fixup_done: b, t, k2); } } - - if (!b->c.level && - node_iter == &iter->l[0].iter && - iter_current_key_modified) - btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK); } -void bch2_btree_node_iter_fix(struct btree_iter *iter, +void bch2_btree_node_iter_fix(struct btree_trans *trans, + struct btree_path *path, struct btree *b, struct btree_node_iter *node_iter, struct bkey_packed *where, @@ -803,26 +954,28 @@ void bch2_btree_node_iter_fix(struct btree_iter *iter, unsigned new_u64s) { struct bset_tree *t = bch2_bkey_to_bset(b, where); - struct btree_iter *linked; + struct btree_path *linked; - if (node_iter != &iter->l[b->c.level].iter) { - __bch2_btree_node_iter_fix(iter, b, node_iter, t, + if (node_iter != &path->l[b->c.level].iter) { + __bch2_btree_node_iter_fix(path, b, node_iter, t, where, clobber_u64s, new_u64s); if (bch2_debug_check_iterators) bch2_btree_node_iter_verify(node_iter, b); } - trans_for_each_iter_with_node(iter->trans, b, linked) { + trans_for_each_path_with_node(trans, b, linked) { __bch2_btree_node_iter_fix(linked, b, &linked->l[b->c.level].iter, t, where, clobber_u64s, new_u64s); - bch2_btree_iter_verify_level(linked, b->c.level); + bch2_btree_path_verify_level(trans, linked, b->c.level); } } -static inline struct bkey_s_c __btree_iter_unpack(struct btree_iter *iter, - struct btree_iter_level *l, +/* Btree path level: pointer to a particular btree node and node iter */ + +static inline struct bkey_s_c __btree_iter_unpack(struct bch_fs *c, + struct btree_path_level *l, struct bkey *u, struct bkey_packed *k) { @@ -847,49 +1000,52 @@ static inline struct bkey_s_c __btree_iter_unpack(struct btree_iter *iter, * assertion here: */ if (bch2_debug_check_bkeys && !bkey_deleted(ret.k)) - bch2_bkey_debugcheck(iter->trans->c, l->b, ret); + bch2_bkey_debugcheck(c, l->b, ret); return ret; } -/* peek_all() doesn't skip deleted keys */ -static inline struct bkey_s_c btree_iter_level_peek_all(struct btree_iter 
*iter, - struct btree_iter_level *l, +static inline struct bkey_s_c btree_path_level_peek_all(struct bch_fs *c, + struct btree_path_level *l, struct bkey *u) { - return __btree_iter_unpack(iter, l, u, + return __btree_iter_unpack(c, l, u, bch2_btree_node_iter_peek_all(&l->iter, l->b)); } -static inline struct bkey_s_c btree_iter_level_peek(struct btree_iter *iter, - struct btree_iter_level *l) +static inline struct bkey_s_c btree_path_level_peek(struct bch_fs *c, + struct btree_path *path, + struct btree_path_level *l, + struct bkey *u) { - struct bkey_s_c k = __btree_iter_unpack(iter, l, &iter->k, + struct bkey_s_c k = __btree_iter_unpack(c, l, u, bch2_btree_node_iter_peek(&l->iter, l->b)); - iter->real_pos = k.k ? k.k->p : l->b->key.k.p; + path->pos = k.k ? k.k->p : l->b->key.k.p; return k; } -static inline struct bkey_s_c btree_iter_level_prev(struct btree_iter *iter, - struct btree_iter_level *l) +static inline struct bkey_s_c btree_path_level_prev(struct bch_fs *c, + struct btree_path *path, + struct btree_path_level *l, + struct bkey *u) { - struct bkey_s_c k = __btree_iter_unpack(iter, l, &iter->k, + struct bkey_s_c k = __btree_iter_unpack(c, l, u, bch2_btree_node_iter_prev(&l->iter, l->b)); - iter->real_pos = k.k ? k.k->p : l->b->data->min_key; + path->pos = k.k ? k.k->p : l->b->data->min_key; return k; } -static inline bool btree_iter_advance_to_pos(struct btree_iter *iter, - struct btree_iter_level *l, +static inline bool btree_path_advance_to_pos(struct btree_path *path, + struct btree_path_level *l, int max_advance) { struct bkey_packed *k; int nr_advanced = 0; while ((k = bch2_btree_node_iter_peek_all(&l->iter, l->b)) && - bkey_iter_pos_cmp(l->b, k, &iter->real_pos) < 0) { + bkey_iter_pos_cmp(l->b, k, &path->pos) < 0) { if (max_advance > 0 && nr_advanced >= max_advance) return false; @@ -903,9 +1059,10 @@ static inline bool btree_iter_advance_to_pos(struct btree_iter *iter, /* * Verify that iterator for parent node points to child node: */ -static void btree_iter_verify_new_node(struct btree_iter *iter, struct btree *b) +static void btree_path_verify_new_node(struct btree_trans *trans, + struct btree_path *path, struct btree *b) { - struct btree_iter_level *l; + struct btree_path_level *l; unsigned plevel; bool parent_locked; struct bkey_packed *k; @@ -914,15 +1071,15 @@ static void btree_iter_verify_new_node(struct btree_iter *iter, struct btree *b) return; plevel = b->c.level + 1; - if (!btree_iter_node(iter, plevel)) + if (!btree_path_node(path, plevel)) return; - parent_locked = btree_node_locked(iter, plevel); + parent_locked = btree_node_locked(path, plevel); - if (!bch2_btree_node_relock(iter, plevel)) + if (!bch2_btree_node_relock(trans, path, plevel)) return; - l = &iter->l[plevel]; + l = &path->l[plevel]; k = bch2_btree_node_iter_peek_all(&l->iter, l->b); if (!k || bkey_deleted(k) || @@ -933,8 +1090,8 @@ static void btree_iter_verify_new_node(struct btree_iter *iter, struct btree *b) char buf4[100]; struct bkey uk = bkey_unpack_key(b, k); - bch2_dump_btree_node(iter->trans->c, l->b); - bch2_bpos_to_text(&PBUF(buf1), iter->real_pos); + bch2_dump_btree_node(trans->c, l->b); + bch2_bpos_to_text(&PBUF(buf1), path->pos); bch2_bkey_to_text(&PBUF(buf2), &uk); bch2_bpos_to_text(&PBUF(buf3), b->data->min_key); bch2_bpos_to_text(&PBUF(buf3), b->data->max_key); @@ -942,20 +1099,20 @@ static void btree_iter_verify_new_node(struct btree_iter *iter, struct btree *b) "iter pos %s %s\n" "iter key %s\n" "new node %s-%s\n", - bch2_btree_ids[iter->btree_id], buf1, + 
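btree_path_advance_to_pos() above embodies a cost trade-off: when the position moves forward only slightly, stepping the node iterator key by key is cheaper than reinitializing it, but past a small bound (callers later in this file pass max_advance = 8) it gives up so the caller re-seeks from scratch. The same logic over a plain sorted array as a stand-in for the node iterator:

#include <stdbool.h>

/*
 * Advance *idx at most max_advance steps toward pos; report failure if
 * that wasn't enough, in which case the caller reinitializes instead.
 */
static bool toy_advance_to_pos(const unsigned long long *keys, unsigned nr,
			       unsigned *idx, unsigned long long pos,
			       int max_advance)
{
	int nr_advanced = 0;

	while (*idx < nr && keys[*idx] < pos) {
		if (max_advance > 0 && nr_advanced >= max_advance)
			return false;
		(*idx)++;
		nr_advanced++;
	}

	return true;
}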
bch2_btree_ids[path->btree_id], buf1, buf2, buf3, buf4); } if (!parent_locked) - btree_node_unlock(iter, b->c.level + 1); + btree_node_unlock(path, plevel); } -static inline void __btree_iter_init(struct btree_iter *iter, - unsigned level) +static inline void __btree_path_level_init(struct btree_path *path, + unsigned level) { - struct btree_iter_level *l = &iter->l[level]; + struct btree_path_level *l = &path->l[level]; - bch2_btree_node_iter_init(&l->iter, l->b, &iter->real_pos); + bch2_btree_node_iter_init(&l->iter, l->b, &path->pos); /* * Iterators to interior nodes should always be pointed at the first non @@ -963,63 +1120,48 @@ static inline void __btree_iter_init(struct btree_iter *iter, */ if (level) bch2_btree_node_iter_peek(&l->iter, l->b); - - btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK); } -static inline void btree_iter_node_set(struct btree_iter *iter, - struct btree *b) +static inline void btree_path_level_init(struct btree_trans *trans, + struct btree_path *path, + struct btree *b) { - BUG_ON(btree_iter_type(iter) == BTREE_ITER_CACHED); + BUG_ON(path->cached); - btree_iter_verify_new_node(iter, b); + btree_path_verify_new_node(trans, path, b); - EBUG_ON(!btree_iter_pos_in_node(iter, b)); + EBUG_ON(!btree_path_pos_in_node(path, b)); EBUG_ON(b->c.lock.state.seq & 1); - iter->l[b->c.level].lock_seq = b->c.lock.state.seq; - iter->l[b->c.level].b = b; - __btree_iter_init(iter, b->c.level); + path->l[b->c.level].lock_seq = b->c.lock.state.seq; + path->l[b->c.level].b = b; + __btree_path_level_init(path, b->c.level); } +/* Btree path: fixups after btree node updates: */ + /* * A btree node is being replaced - update the iterator to point to the new * node: */ -void bch2_btree_iter_node_replace(struct btree_iter *iter, struct btree *b) +void bch2_trans_node_add(struct btree_trans *trans, struct btree *b) { - enum btree_node_locked_type t; - struct btree_iter *linked; + struct btree_path *path; - trans_for_each_iter(iter->trans, linked) - if (btree_iter_type(linked) != BTREE_ITER_CACHED && - btree_iter_pos_in_node(linked, b)) { - /* - * bch2_btree_iter_node_drop() has already been called - - * the old node we're replacing has already been - * unlocked and the pointer invalidated - */ - BUG_ON(btree_node_locked(linked, b->c.level)); + trans_for_each_path(trans, path) + if (!path->cached && + btree_path_pos_in_node(path, b)) { + enum btree_node_locked_type t = + btree_lock_want(path, b->c.level); - t = btree_lock_want(linked, b->c.level); - if (t != BTREE_NODE_UNLOCKED) { + if (path->nodes_locked && + t != BTREE_NODE_UNLOCKED) { + btree_node_unlock(path, b->c.level); six_lock_increment(&b->c.lock, t); - mark_btree_node_locked(linked, b->c.level, t); + mark_btree_node_locked(path, b->c.level, t); } - btree_iter_node_set(linked, b); - } -} - -void bch2_btree_iter_node_drop(struct btree_iter *iter, struct btree *b) -{ - struct btree_iter *linked; - unsigned level = b->c.level; - - trans_for_each_iter(iter->trans, linked) - if (linked->l[level].b == b) { - btree_node_unlock(linked, level); - linked->l[level].b = BTREE_ITER_NO_NODE_DROP; + btree_path_level_init(trans, path, b); } } @@ -1027,14 +1169,16 @@ void bch2_btree_iter_node_drop(struct btree_iter *iter, struct btree *b) * A btree node has been modified in such a way as to invalidate iterators - fix * them: */ -void bch2_btree_iter_reinit_node(struct btree_iter *iter, struct btree *b) +void bch2_trans_node_reinit_iter(struct btree_trans *trans, struct btree *b) { - struct btree_iter *linked; + struct btree_path *path; - 
trans_for_each_iter_with_node(iter->trans, b, linked) - __btree_iter_init(linked, b->c.level); + trans_for_each_path_with_node(trans, b, path) + __btree_path_level_init(path, b->c.level); } +/* Btree path: traverse, set_pos: */ + static int lock_root_check_fn(struct six_lock *lock, void *p) { struct btree *b = container_of(lock, struct btree, c.lock); @@ -1043,52 +1187,56 @@ static int lock_root_check_fn(struct six_lock *lock, void *p) return b == *rootp ? 0 : -1; } -static inline int btree_iter_lock_root(struct btree_iter *iter, +static inline int btree_path_lock_root(struct btree_trans *trans, + struct btree_path *path, unsigned depth_want, unsigned long trace_ip) { - struct bch_fs *c = iter->trans->c; - struct btree *b, **rootp = &c->btree_roots[iter->btree_id].b; + struct bch_fs *c = trans->c; + struct btree *b, **rootp = &c->btree_roots[path->btree_id].b; enum six_lock_type lock_type; unsigned i; - EBUG_ON(iter->nodes_locked); + EBUG_ON(path->nodes_locked); while (1) { b = READ_ONCE(*rootp); - iter->level = READ_ONCE(b->c.level); + path->level = READ_ONCE(b->c.level); - if (unlikely(iter->level < depth_want)) { + if (unlikely(path->level < depth_want)) { /* * the root is at a lower depth than the depth we want: * got to the end of the btree, or we're walking nodes * greater than some depth and there are no nodes >= * that depth */ - iter->level = depth_want; - for (i = iter->level; i < BTREE_MAX_DEPTH; i++) - iter->l[i].b = NULL; + path->level = depth_want; + for (i = path->level; i < BTREE_MAX_DEPTH; i++) + path->l[i].b = NULL; return 1; } - lock_type = __btree_lock_want(iter, iter->level); - if (unlikely(!btree_node_lock(b, POS_MAX, iter->level, - iter, lock_type, + lock_type = __btree_lock_want(path, path->level); + if (unlikely(!btree_node_lock(trans, path, b, SPOS_MAX, + path->level, lock_type, lock_root_check_fn, rootp, - trace_ip))) - return -EINTR; + trace_ip))) { + if (trans->restarted) + return -EINTR; + continue; + } if (likely(b == READ_ONCE(*rootp) && - b->c.level == iter->level && + b->c.level == path->level && !race_fault())) { - for (i = 0; i < iter->level; i++) - iter->l[i].b = BTREE_ITER_NO_NODE_LOCK_ROOT; - iter->l[iter->level].b = b; - for (i = iter->level + 1; i < BTREE_MAX_DEPTH; i++) - iter->l[i].b = NULL; - - mark_btree_node_locked(iter, iter->level, lock_type); - btree_iter_node_set(iter, b); + for (i = 0; i < path->level; i++) + path->l[i].b = BTREE_ITER_NO_NODE_LOCK_ROOT; + path->l[path->level].b = b; + for (i = path->level + 1; i < BTREE_MAX_DEPTH; i++) + path->l[i].b = NULL; + + mark_btree_node_locked(path, path->level, lock_type); + btree_path_level_init(trans, path, b); return 0; } @@ -1097,22 +1245,23 @@ static inline int btree_iter_lock_root(struct btree_iter *iter, } noinline -static void btree_iter_prefetch(struct btree_iter *iter) +static int btree_path_prefetch(struct btree_trans *trans, struct btree_path *path) { - struct bch_fs *c = iter->trans->c; - struct btree_iter_level *l = &iter->l[iter->level]; + struct bch_fs *c = trans->c; + struct btree_path_level *l = path_l(path); struct btree_node_iter node_iter = l->iter; struct bkey_packed *k; struct bkey_buf tmp; unsigned nr = test_bit(BCH_FS_STARTED, &c->flags) - ? (iter->level > 1 ? 0 : 2) - : (iter->level > 1 ? 1 : 16); - bool was_locked = btree_node_locked(iter, iter->level); + ? (path->level > 1 ? 0 : 2) + : (path->level > 1 ? 
1 : 16); + bool was_locked = btree_node_locked(path, path->level); + int ret = 0; bch2_bkey_buf_init(&tmp); - while (nr) { - if (!bch2_btree_node_relock(iter, iter->level)) + while (nr && !ret) { + if (!bch2_btree_node_relock(trans, path, path->level)) break; bch2_btree_node_iter_advance(&node_iter, l->b); @@ -1121,25 +1270,27 @@ static void btree_iter_prefetch(struct btree_iter *iter) break; bch2_bkey_buf_unpack(&tmp, c, l->b, k); - bch2_btree_node_prefetch(c, iter, tmp.k, iter->btree_id, - iter->level - 1); + ret = bch2_btree_node_prefetch(c, trans, path, tmp.k, path->btree_id, + path->level - 1); } if (!was_locked) - btree_node_unlock(iter, iter->level); + btree_node_unlock(path, path->level); bch2_bkey_buf_exit(&tmp, c); + return ret; } -static noinline void btree_node_mem_ptr_set(struct btree_iter *iter, +static noinline void btree_node_mem_ptr_set(struct btree_trans *trans, + struct btree_path *path, unsigned plevel, struct btree *b) { - struct btree_iter_level *l = &iter->l[plevel]; - bool locked = btree_node_locked(iter, plevel); + struct btree_path_level *l = &path->l[plevel]; + bool locked = btree_node_locked(path, plevel); struct bkey_packed *k; struct bch_btree_ptr_v2 *bp; - if (!bch2_btree_node_relock(iter, plevel)) + if (!bch2_btree_node_relock(trans, path, plevel)) return; k = bch2_btree_node_iter_peek_all(&l->iter, l->b); @@ -1149,92 +1300,84 @@ static noinline void btree_node_mem_ptr_set(struct btree_iter *iter, bp->mem_ptr = (unsigned long)b; if (!locked) - btree_node_unlock(iter, plevel); + btree_node_unlock(path, plevel); } -static __always_inline int btree_iter_down(struct btree_iter *iter, +static __always_inline int btree_path_down(struct btree_trans *trans, + struct btree_path *path, + unsigned flags, unsigned long trace_ip) { - struct bch_fs *c = iter->trans->c; - struct btree_iter_level *l = &iter->l[iter->level]; + struct bch_fs *c = trans->c; + struct btree_path_level *l = path_l(path); struct btree *b; - unsigned level = iter->level - 1; - enum six_lock_type lock_type = __btree_lock_want(iter, level); + unsigned level = path->level - 1; + enum six_lock_type lock_type = __btree_lock_want(path, level); struct bkey_buf tmp; int ret; - EBUG_ON(!btree_node_locked(iter, iter->level)); + EBUG_ON(!btree_node_locked(path, path->level)); bch2_bkey_buf_init(&tmp); bch2_bkey_buf_unpack(&tmp, c, l->b, bch2_btree_node_iter_peek(&l->iter, l->b)); - b = bch2_btree_node_get(c, iter, tmp.k, level, lock_type, trace_ip); + b = bch2_btree_node_get(trans, path, tmp.k, level, lock_type, trace_ip); ret = PTR_ERR_OR_ZERO(b); if (unlikely(ret)) goto err; - mark_btree_node_locked(iter, level, lock_type); - btree_iter_node_set(iter, b); + mark_btree_node_locked(path, level, lock_type); + btree_path_level_init(trans, path, b); if (tmp.k->k.type == KEY_TYPE_btree_ptr_v2 && unlikely(b != btree_node_mem_ptr(tmp.k))) - btree_node_mem_ptr_set(iter, level + 1, b); + btree_node_mem_ptr_set(trans, path, level + 1, b); - if (iter->flags & BTREE_ITER_PREFETCH) - btree_iter_prefetch(iter); + if (flags & BTREE_ITER_PREFETCH) + ret = btree_path_prefetch(trans, path); - iter->level = level; + if (btree_node_read_locked(path, level + 1)) + btree_node_unlock(path, level + 1); + path->level = level; + + bch2_btree_path_verify_locks(path); err: bch2_bkey_buf_exit(&tmp, c); return ret; } -static int btree_iter_traverse_one(struct btree_iter *, unsigned long); +static int btree_path_traverse_one(struct btree_trans *, struct btree_path *, + unsigned, unsigned long); -static int __btree_iter_traverse_all(struct 
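The nr computation at the top of btree_path_prefetch() above chooses how many sibling nodes to read ahead: very little once the filesystem is up (prefetch mostly pays off near the leaves then), but aggressively during startup/recovery, when long sequential btree scans dominate. The heuristic restated on its own, with the values taken from the diff:

#include <stdbool.h>

static unsigned toy_prefetch_nr(bool fs_started, unsigned level)
{
	return fs_started
		? (level > 1 ? 0 : 2)	/* steady state: shallow readahead */
		: (level > 1 ? 1 : 16);	/* startup: much deeper readahead */
}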
btree_trans *trans, int ret, +static int __btree_path_traverse_all(struct btree_trans *trans, int ret, unsigned long trace_ip) { struct bch_fs *c = trans->c; - struct btree_iter *iter; - u8 sorted[BTREE_ITER_MAX]; - int i, nr_sorted = 0; - bool relock_fail; + struct btree_path *path; + int i; if (trans->in_traverse_all) return -EINTR; trans->in_traverse_all = true; retry_all: - nr_sorted = 0; - relock_fail = false; - - trans_for_each_iter(trans, iter) { - if (!bch2_btree_iter_relock(iter, _THIS_IP_)) - relock_fail = true; - sorted[nr_sorted++] = iter->idx; - } - - if (!relock_fail) { - trans->in_traverse_all = false; - return 0; - } + trans->restarted = false; -#define btree_iter_cmp_by_idx(_l, _r) \ - btree_iter_lock_cmp(&trans->iters[_l], &trans->iters[_r]) + trans_for_each_path(trans, path) + path->should_be_locked = false; - bubble_sort(sorted, nr_sorted, btree_iter_cmp_by_idx); -#undef btree_iter_cmp_by_idx + btree_trans_verify_sorted(trans); - for (i = nr_sorted - 2; i >= 0; --i) { - struct btree_iter *iter1 = trans->iters + sorted[i]; - struct btree_iter *iter2 = trans->iters + sorted[i + 1]; + for (i = trans->nr_sorted - 2; i >= 0; --i) { + struct btree_path *path1 = trans->paths + trans->sorted[i]; + struct btree_path *path2 = trans->paths + trans->sorted[i + 1]; - if (iter1->btree_id == iter2->btree_id && - iter1->locks_want < iter2->locks_want) - __bch2_btree_iter_upgrade(iter1, iter2->locks_want); - else if (!iter1->locks_want && iter2->locks_want) - __bch2_btree_iter_upgrade(iter1, 1); + if (path1->btree_id == path2->btree_id && + path1->locks_want < path2->locks_want) + __bch2_btree_path_upgrade(trans, path1, path2->locks_want); + else if (!path1->locks_want && path2->locks_want) + __bch2_btree_path_upgrade(trans, path1, 1); } bch2_trans_unlock(trans); @@ -1251,37 +1394,36 @@ retry_all: } while (ret); } - if (unlikely(ret == -EIO)) { - trans->error = true; + if (unlikely(ret == -EIO)) goto out; - } BUG_ON(ret && ret != -EINTR); /* Now, redo traversals in correct order: */ - for (i = 0; i < nr_sorted; i++) { - unsigned idx = sorted[i]; + i = 0; + while (i < trans->nr_sorted) { + path = trans->paths + trans->sorted[i]; - /* - * sucessfully traversing one iterator can cause another to be - * unlinked, in btree_key_cache_fill() - */ - if (!(trans->iters_linked & (1ULL << idx))) - continue; + EBUG_ON(!(trans->paths_allocated & (1ULL << path->idx))); - ret = btree_iter_traverse_one(&trans->iters[idx], _THIS_IP_); + ret = btree_path_traverse_one(trans, path, 0, _THIS_IP_); if (ret) goto retry_all; + + EBUG_ON(!(trans->paths_allocated & (1ULL << path->idx))); + + if (path->nodes_locked || + !btree_path_node(path, path->level)) + i++; } - if (hweight64(trans->iters_live) > 1) - ret = -EINTR; - else - trans_for_each_iter(trans, iter) - if (iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT) { - ret = -EINTR; - break; - } + /* + * BTREE_ITER_NEED_RELOCK is ok here - if we called bch2_trans_unlock() + * and relock(), relock() won't relock since path->should_be_locked + * isn't set yet, which is all fine + */ + trans_for_each_path(trans, path) + BUG_ON(path->uptodate >= BTREE_ITER_NEED_TRAVERSE); out: bch2_btree_cache_cannibalize_unlock(c); @@ -1291,37 +1433,50 @@ out: return ret; } -int bch2_btree_iter_traverse_all(struct btree_trans *trans) +static int bch2_btree_path_traverse_all(struct btree_trans *trans) { - return __btree_iter_traverse_all(trans, 0, _RET_IP_); + return __btree_path_traverse_all(trans, 0, _RET_IP_); } -static inline bool btree_iter_good_node(struct btree_iter *iter, +static 
inline bool btree_path_good_node(struct btree_trans *trans, + struct btree_path *path, unsigned l, int check_pos) { - if (!is_btree_node(iter, l) || - !bch2_btree_node_relock(iter, l)) + if (!is_btree_node(path, l) || + !bch2_btree_node_relock(trans, path, l)) return false; - if (check_pos < 0 && btree_iter_pos_before_node(iter, iter->l[l].b)) + if (check_pos < 0 && btree_path_pos_before_node(path, path->l[l].b)) return false; - if (check_pos > 0 && btree_iter_pos_after_node(iter, iter->l[l].b)) + if (check_pos > 0 && btree_path_pos_after_node(path, path->l[l].b)) return false; return true; } -static inline unsigned btree_iter_up_until_good_node(struct btree_iter *iter, +static inline unsigned btree_path_up_until_good_node(struct btree_trans *trans, + struct btree_path *path, int check_pos) { - unsigned l = iter->level; + unsigned i, l = path->level; - while (btree_iter_node(iter, l) && - !btree_iter_good_node(iter, l, check_pos)) { - btree_node_unlock(iter, l); - iter->l[l].b = BTREE_ITER_NO_NODE_UP; + while (btree_path_node(path, l) && + !btree_path_good_node(trans, path, l, check_pos)) { + btree_node_unlock(path, l); + path->l[l].b = BTREE_ITER_NO_NODE_UP; l++; } + /* If we need intent locks, take them too: */ + for (i = l + 1; + i < path->locks_want && btree_path_node(path, i); + i++) + if (!bch2_btree_node_relock(trans, path, i)) + while (l <= i) { + btree_node_unlock(path, l); + path->l[l].b = BTREE_ITER_NO_NODE_UP; + l++; + } + return l; } @@ -1334,102 +1489,436 @@ static inline unsigned btree_iter_up_until_good_node(struct btree_iter *iter, * On error, caller (peek_node()/peek_key()) must return NULL; the error is * stashed in the iterator and returned from bch2_trans_exit(). */ -static int btree_iter_traverse_one(struct btree_iter *iter, +static int btree_path_traverse_one(struct btree_trans *trans, + struct btree_path *path, + unsigned flags, unsigned long trace_ip) { - unsigned depth_want = iter->level; + unsigned depth_want = path->level; int ret = 0; + if (unlikely(trans->restarted)) { + ret = -EINTR; + goto out; + } + /* - * if we need interior nodes locked, call btree_iter_relock() to make - * sure we walk back up enough that we lock them: + * Ensure we obey path->should_be_locked: if it's set, we can't unlock + * and re-traverse the path without a transaction restart: */ - if (iter->uptodate == BTREE_ITER_NEED_RELOCK || - iter->locks_want > 1) - bch2_btree_iter_relock(iter, _THIS_IP_); - - if (btree_iter_type(iter) == BTREE_ITER_CACHED) { - ret = bch2_btree_iter_traverse_cached(iter); + if (path->should_be_locked) { + ret = bch2_btree_path_relock(trans, path, trace_ip) ? 0 : -EINTR; goto out; } - if (iter->uptodate < BTREE_ITER_NEED_RELOCK) + if (path->cached) { + ret = bch2_btree_path_traverse_cached(trans, path, flags); goto out; + } - if (unlikely(iter->level >= BTREE_MAX_DEPTH)) + if (unlikely(path->level >= BTREE_MAX_DEPTH)) goto out; - iter->level = btree_iter_up_until_good_node(iter, 0); + path->level = btree_path_up_until_good_node(trans, path, 0); /* - * Note: iter->nodes[iter->level] may be temporarily NULL here - that + * Note: path->nodes[path->level] may be temporarily NULL here - that * would indicate to other code that we got to the end of the btree, * here it indicates that relocking the root failed - it's critical that - * btree_iter_lock_root() comes next and that it can't fail + * btree_path_lock_root() comes next and that it can't fail */ - while (iter->level > depth_want) { - ret = btree_iter_node(iter, iter->level) - ? 
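btree_path_up_until_good_node() above has two phases: climb until the current node is usable again (relockable and still covering the search position), then opportunistically take the intent locks wanted above it, retreating further upward if one of those cannot be had. An abstracted sketch of the control flow (a single good[] predicate stands in for both the relock and the position checks):

#include <stdbool.h>

#define MAX_DEPTH 4

struct toy_walk {
	bool	good[MAX_DEPTH];	/* level is usable / relockable */
	bool	locked[MAX_DEPTH];
};

static unsigned toy_up_until_good_node(struct toy_walk *w,
				       unsigned level, unsigned locks_want)
{
	unsigned i, l = level;

	/* walk up until we hold a usable node */
	while (l < MAX_DEPTH && !w->good[l]) {
		w->locked[l] = false;
		l++;
	}

	/*
	 * If locks are wanted above, take them too; on failure give the
	 * levels at and below it back and keep walking up:
	 */
	for (i = l + 1; i < locks_want && i < MAX_DEPTH; i++)
		if (!w->good[i])
			while (l <= i) {
				w->locked[l] = false;
				l++;
			}

	return l;
}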
btree_iter_down(iter, trace_ip) - : btree_iter_lock_root(iter, depth_want, trace_ip); + while (path->level > depth_want) { + ret = btree_path_node(path, path->level) + ? btree_path_down(trans, path, flags, trace_ip) + : btree_path_lock_root(trans, path, depth_want, trace_ip); if (unlikely(ret)) { if (ret == 1) { /* - * Got to the end of the btree (in - * BTREE_ITER_NODES mode) + * No nodes at this level - got to the end of + * the btree: */ ret = 0; goto out; } - iter->level = depth_want; + __bch2_btree_path_unlock(path); + path->level = depth_want; - if (ret == -EIO) { - iter->flags |= BTREE_ITER_ERROR; - iter->l[iter->level].b = + if (ret == -EIO) + path->l[path->level].b = BTREE_ITER_NO_NODE_ERROR; - } else { - iter->l[iter->level].b = + else + path->l[path->level].b = BTREE_ITER_NO_NODE_DOWN; - } goto out; } } - iter->uptodate = BTREE_ITER_NEED_PEEK; + path->uptodate = BTREE_ITER_UPTODATE; out: - trace_iter_traverse(iter->trans->ip, trace_ip, - iter->btree_id, &iter->real_pos, ret); - bch2_btree_iter_verify(iter); + BUG_ON((ret == -EINTR) != !!trans->restarted); + bch2_btree_path_verify(trans, path); return ret; } -static int __must_check __bch2_btree_iter_traverse(struct btree_iter *iter) +static int __btree_path_traverse_all(struct btree_trans *, int, unsigned long); + +int __must_check bch2_btree_path_traverse(struct btree_trans *trans, + struct btree_path *path, unsigned flags) { - struct btree_trans *trans = iter->trans; - int ret; + if (path->uptodate < BTREE_ITER_NEED_RELOCK) + return 0; - ret = bch2_trans_cond_resched(trans) ?: - btree_iter_traverse_one(iter, _RET_IP_); - if (unlikely(ret)) - ret = __btree_iter_traverse_all(trans, ret, _RET_IP_); + return bch2_trans_cond_resched(trans) ?: + btree_path_traverse_one(trans, path, flags, _RET_IP_); +} - return ret; +static void btree_path_copy(struct btree_trans *trans, struct btree_path *dst, + struct btree_path *src) +{ + unsigned i; + + memcpy(&dst->pos, &src->pos, + sizeof(struct btree_path) - offsetof(struct btree_path, pos)); + + for (i = 0; i < BTREE_MAX_DEPTH; i++) + if (btree_node_locked(dst, i)) + six_lock_increment(&dst->l[i].b->c.lock, + __btree_lock_want(dst, i)); + + btree_path_check_sort(trans, dst, 0); } -/* - * Note: - * bch2_btree_iter_traverse() is for external users, btree_iter_traverse() is - * for internal btree iterator users - * - * bch2_btree_iter_traverse sets iter->real_pos to iter->pos, - * btree_iter_traverse() does not: - */ -static inline int __must_check -btree_iter_traverse(struct btree_iter *iter) +static struct btree_path *btree_path_clone(struct btree_trans *trans, struct btree_path *src, + bool intent) +{ + struct btree_path *new = btree_path_alloc(trans, src); + + btree_path_copy(trans, new, src); + __btree_path_get(new, intent); + return new; +} + +inline struct btree_path * __must_check +bch2_btree_path_make_mut(struct btree_trans *trans, + struct btree_path *path, bool intent, + unsigned long ip) +{ + if (path->ref > 1 || path->preserve) { + __btree_path_put(path, intent); + path = btree_path_clone(trans, path, intent); + path->preserve = false; +#ifdef CONFIG_BCACHEFS_DEBUG + path->ip_allocated = ip; +#endif + btree_trans_verify_sorted(trans); + } + + return path; +} + +static struct btree_path * __must_check +btree_path_set_pos(struct btree_trans *trans, + struct btree_path *path, struct bpos new_pos, + bool intent, unsigned long ip) +{ + int cmp = bpos_cmp(new_pos, path->pos); + unsigned l = path->level; + + EBUG_ON(trans->restarted); + EBUG_ON(!path->ref); + + if (!cmp) + return path; + 
+ path = bch2_btree_path_make_mut(trans, path, intent, ip); + + path->pos = new_pos; + path->should_be_locked = false; + + btree_path_check_sort(trans, path, cmp); + + if (unlikely(path->cached)) { + btree_node_unlock(path, 0); + path->l[0].b = BTREE_ITER_NO_NODE_CACHED; + btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); + goto out; + } + + l = btree_path_up_until_good_node(trans, path, cmp); + + if (btree_path_node(path, l)) { + /* + * We might have to skip over many keys, or just a few: try + * advancing the node iterator, and if we have to skip over too + * many keys just reinit it (or if we're rewinding, since that + * is expensive). + */ + if (cmp < 0 || + !btree_path_advance_to_pos(path, &path->l[l], 8)) + __btree_path_level_init(path, l); + } + + if (l != path->level) { + btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); + __bch2_btree_path_unlock(path); + } +out: + bch2_btree_path_verify(trans, path); + return path; +} + +/* Btree path: main interface: */ + +static struct btree_path *have_path_at_pos(struct btree_trans *trans, struct btree_path *path) { - return iter->uptodate >= BTREE_ITER_NEED_RELOCK - ? __bch2_btree_iter_traverse(iter) - : 0; + struct btree_path *next; + + next = prev_btree_path(trans, path); + if (next && !btree_path_cmp(next, path)) + return next; + + next = next_btree_path(trans, path); + if (next && !btree_path_cmp(next, path)) + return next; + + return NULL; +} + +static struct btree_path *have_node_at_pos(struct btree_trans *trans, struct btree_path *path) +{ + struct btree_path *next; + + next = prev_btree_path(trans, path); + if (next && next->level == path->level && path_l(next)->b == path_l(path)->b) + return next; + + next = next_btree_path(trans, path); + if (next && next->level == path->level && path_l(next)->b == path_l(path)->b) + return next; + + return NULL; +} + +static inline void __bch2_path_free(struct btree_trans *trans, struct btree_path *path) +{ + __bch2_btree_path_unlock(path); + btree_path_list_remove(trans, path); + trans->paths_allocated &= ~(1ULL << path->idx); +} + +void bch2_path_put(struct btree_trans *trans, struct btree_path *path, bool intent) +{ + struct btree_path *dup; + + EBUG_ON(trans->paths + path->idx != path); + EBUG_ON(!path->ref); + + if (!__btree_path_put(path, intent)) + return; + + /* + * Perhaps instead we should check for duplicate paths in traverse_all: + */ + if (path->preserve && + (dup = have_path_at_pos(trans, path))) { + dup->preserve = true; + path->preserve = false; + goto free; + } + + if (!path->preserve && + (dup = have_node_at_pos(trans, path))) + goto free; + return; +free: + if (path->should_be_locked && + !btree_node_locked(dup, path->level)) + return; + + dup->should_be_locked |= path->should_be_locked; + __bch2_path_free(trans, path); +} + +noinline __cold +void bch2_dump_trans_paths_updates(struct btree_trans *trans) +{ + struct btree_path *path; + struct btree_insert_entry *i; + unsigned idx; + char buf1[300], buf2[300]; + + btree_trans_verify_sorted(trans); + + trans_for_each_path_inorder(trans, path, idx) + printk(KERN_ERR "path: idx %u ref %u:%u%s%s btree %s pos %s locks %u %pS\n", + path->idx, path->ref, path->intent_ref, + path->should_be_locked ? " S" : "", + path->preserve ? 
" P" : "", + bch2_btree_ids[path->btree_id], + (bch2_bpos_to_text(&PBUF(buf1), path->pos), buf1), + path->nodes_locked, +#ifdef CONFIG_BCACHEFS_DEBUG + (void *) path->ip_allocated +#else + NULL +#endif + ); + + trans_for_each_update(trans, i) { + struct bkey u; + struct bkey_s_c old = bch2_btree_path_peek_slot(i->path, &u); + + printk(KERN_ERR "update: btree %s %pS\n old %s\n new %s", + bch2_btree_ids[i->btree_id], + (void *) i->ip_allocated, + (bch2_bkey_val_to_text(&PBUF(buf1), trans->c, old), buf1), + (bch2_bkey_val_to_text(&PBUF(buf2), trans->c, bkey_i_to_s_c(i->k)), buf2)); + } +} + +static struct btree_path *btree_path_alloc(struct btree_trans *trans, + struct btree_path *pos) +{ + struct btree_path *path; + unsigned idx; + + if (unlikely(trans->paths_allocated == + ~((~0ULL << 1) << (BTREE_ITER_MAX - 1)))) { + bch2_dump_trans_paths_updates(trans); + panic("trans path oveflow\n"); + } + + idx = __ffs64(~trans->paths_allocated); + trans->paths_allocated |= 1ULL << idx; + + path = &trans->paths[idx]; + + path->idx = idx; + path->ref = 0; + path->intent_ref = 0; + path->nodes_locked = 0; + path->nodes_intent_locked = 0; + + btree_path_list_add(trans, pos, path); + return path; +} + +struct btree_path *bch2_path_get(struct btree_trans *trans, bool cached, + enum btree_id btree_id, struct bpos pos, + unsigned locks_want, unsigned level, + bool intent, unsigned long ip) +{ + struct btree_path *path, *path_pos = NULL; + int i; + + BUG_ON(trans->restarted); + + trans_for_each_path_inorder(trans, path, i) { + if (__btree_path_cmp(path, + btree_id, + cached, + pos, + level) > 0) + break; + + path_pos = path; + } + + if (path_pos && + path_pos->cached == cached && + path_pos->btree_id == btree_id && + path_pos->level == level) { + __btree_path_get(path_pos, intent); + path = btree_path_set_pos(trans, path_pos, pos, intent, ip); + path->preserve = true; + } else { + path = btree_path_alloc(trans, path_pos); + path_pos = NULL; + + __btree_path_get(path, intent); + path->pos = pos; + path->btree_id = btree_id; + path->cached = cached; + path->preserve = true; + path->uptodate = BTREE_ITER_NEED_TRAVERSE; + path->should_be_locked = false; + path->level = level; + path->locks_want = locks_want; + path->nodes_locked = 0; + path->nodes_intent_locked = 0; + for (i = 0; i < ARRAY_SIZE(path->l); i++) + path->l[i].b = BTREE_ITER_NO_NODE_INIT; +#ifdef CONFIG_BCACHEFS_DEBUG + path->ip_allocated = ip; +#endif + btree_trans_verify_sorted(trans); + } + + if (path->intent_ref) + locks_want = max(locks_want, level + 1); + + /* + * If the path has locks_want greater than requested, we don't downgrade + * it here - on transaction restart because btree node split needs to + * upgrade locks, we might be putting/getting the iterator again. + * Downgrading iterators only happens via bch2_trans_downgrade(), after + * a successful transaction commit. + */ + + locks_want = min(locks_want, BTREE_MAX_DEPTH); + if (locks_want > path->locks_want) { + path->locks_want = locks_want; + btree_path_get_locks(trans, path, true, _THIS_IP_); + } + + return path; +} + +inline struct bkey_s_c bch2_btree_path_peek_slot(struct btree_path *path, struct bkey *u) +{ + + struct bkey_s_c k; + + BUG_ON(path->uptodate != BTREE_ITER_UPTODATE); + + if (!path->cached) { + struct btree_path_level *l = path_l(path); + struct bkey_packed *_k = + bch2_btree_node_iter_peek_all(&l->iter, l->b); + + k = _k ? 
bkey_disassemble(l->b, _k, u) : bkey_s_c_null; + + EBUG_ON(k.k && bkey_deleted(k.k) && bpos_cmp(k.k->p, path->pos) == 0); + + if (!k.k || bpos_cmp(path->pos, k.k->p)) + goto hole; + } else { + struct bkey_cached *ck = (void *) path->l[0].b; + + EBUG_ON(path->btree_id != ck->key.btree_id || + bkey_cmp(path->pos, ck->key.pos)); + + /* BTREE_ITER_CACHED_NOFILL? */ + if (unlikely(!ck->valid)) + goto hole; + + k = bkey_i_to_s_c(ck->k); + } + + return k; +hole: + bkey_init(u); + u->p = path->pos; + return (struct bkey_s_c) { u, NULL }; +} + +/* Btree iterators: */ + +int __must_check +__bch2_btree_iter_traverse(struct btree_iter *iter) +{ + return bch2_btree_path_traverse(iter->trans, iter->path, iter->flags); } int __must_check @@ -1437,13 +1926,16 @@ bch2_btree_iter_traverse(struct btree_iter *iter) { int ret; - btree_iter_set_search_pos(iter, btree_iter_search_key(iter)); + iter->path = btree_path_set_pos(iter->trans, iter->path, + btree_iter_search_key(iter), + iter->flags & BTREE_ITER_INTENT, + iter->ip_allocated); - ret = btree_iter_traverse(iter); + ret = bch2_btree_path_traverse(iter->trans, iter->path, iter->flags); if (ret) return ret; - iter->should_be_locked = true; + iter->path->should_be_locked = true; return 0; } @@ -1451,143 +1943,132 @@ bch2_btree_iter_traverse(struct btree_iter *iter) struct btree *bch2_btree_iter_peek_node(struct btree_iter *iter) { - struct btree *b; + struct btree_trans *trans = iter->trans; + struct btree *b = NULL; int ret; - EBUG_ON(btree_iter_type(iter) != BTREE_ITER_NODES); + EBUG_ON(iter->path->cached); bch2_btree_iter_verify(iter); - ret = btree_iter_traverse(iter); + ret = bch2_btree_path_traverse(trans, iter->path, iter->flags); if (ret) - return NULL; + goto err; - b = btree_iter_node(iter, iter->level); + b = btree_path_node(iter->path, iter->path->level); if (!b) - return NULL; + goto out; BUG_ON(bpos_cmp(b->key.k.p, iter->pos) < 0); - iter->pos = iter->real_pos = b->key.k.p; + bkey_init(&iter->k); + iter->k.p = iter->pos = b->key.k.p; + iter->path = btree_path_set_pos(trans, iter->path, b->key.k.p, + iter->flags & BTREE_ITER_INTENT, + iter->ip_allocated); + iter->path->should_be_locked = true; + BUG_ON(iter->path->uptodate); +out: + bch2_btree_iter_verify_entry_exit(iter); bch2_btree_iter_verify(iter); - iter->should_be_locked = true; return b; +err: + b = ERR_PTR(ret); + goto out; } struct btree *bch2_btree_iter_next_node(struct btree_iter *iter) { - struct btree *b; + struct btree_trans *trans = iter->trans; + struct btree_path *path = iter->path; + struct btree *b = NULL; + unsigned l; int ret; - EBUG_ON(btree_iter_type(iter) != BTREE_ITER_NODES); + BUG_ON(trans->restarted); + EBUG_ON(iter->path->cached); bch2_btree_iter_verify(iter); - /* already got to end? */ - if (!btree_iter_node(iter, iter->level)) - return NULL; - - bch2_trans_cond_resched(iter->trans); - - btree_node_unlock(iter, iter->level); - iter->l[iter->level].b = BTREE_ITER_NO_NODE_UP; - iter->level++; - - btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE); - ret = btree_iter_traverse(iter); - if (ret) + /* already at end? */ + if (!btree_path_node(path, path->level)) return NULL; /* got to end? 
*/ - b = btree_iter_node(iter, iter->level); - if (!b) + if (!btree_path_node(path, path->level + 1)) { + btree_node_unlock(path, path->level); + path->l[path->level].b = BTREE_ITER_NO_NODE_UP; + path->level++; return NULL; + } - if (bpos_cmp(iter->pos, b->key.k.p) < 0) { + if (!bch2_btree_node_relock(trans, path, path->level + 1)) { + __bch2_btree_path_unlock(path); + path->l[path->level].b = BTREE_ITER_NO_NODE_GET_LOCKS; + path->l[path->level + 1].b = BTREE_ITER_NO_NODE_GET_LOCKS; + btree_trans_restart(trans); + ret = -EINTR; + goto err; + } + + b = btree_path_node(path, path->level + 1); + + if (!bpos_cmp(iter->pos, b->key.k.p)) { + btree_node_unlock(path, path->level); + path->l[path->level].b = BTREE_ITER_NO_NODE_UP; + path->level++; + } else { /* * Haven't gotten to the end of the parent node: go back down to * the next child node */ - btree_iter_set_search_pos(iter, bpos_successor(iter->pos)); + path = iter->path = + btree_path_set_pos(trans, path, bpos_successor(iter->pos), + iter->flags & BTREE_ITER_INTENT, + iter->ip_allocated); + + path->level = iter->min_depth; - /* Unlock to avoid screwing up our lock invariants: */ - btree_node_unlock(iter, iter->level); + for (l = path->level + 1; l < BTREE_MAX_DEPTH; l++) + if (btree_lock_want(path, l) == BTREE_NODE_UNLOCKED) + btree_node_unlock(path, l); - iter->level = iter->min_depth; - btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE); + btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); bch2_btree_iter_verify(iter); - ret = btree_iter_traverse(iter); + ret = bch2_btree_path_traverse(trans, path, iter->flags); if (ret) - return NULL; + goto err; - b = iter->l[iter->level].b; + b = path->l[path->level].b; } - iter->pos = iter->real_pos = b->key.k.p; + bkey_init(&iter->k); + iter->k.p = iter->pos = b->key.k.p; + iter->path = btree_path_set_pos(trans, iter->path, b->key.k.p, + iter->flags & BTREE_ITER_INTENT, + iter->ip_allocated); + iter->path->should_be_locked = true; + BUG_ON(iter->path->uptodate); +out: + bch2_btree_iter_verify_entry_exit(iter); bch2_btree_iter_verify(iter); - iter->should_be_locked = true; return b; +err: + b = ERR_PTR(ret); + goto out; } /* Iterate across keys (in leaf nodes only) */ -static void btree_iter_set_search_pos(struct btree_iter *iter, struct bpos new_pos) -{ - struct bpos old_pos = iter->real_pos; - int cmp = bpos_cmp(new_pos, iter->real_pos); - unsigned l = iter->level; - - if (!cmp) - goto out; - - iter->real_pos = new_pos; - iter->should_be_locked = false; - - if (unlikely(btree_iter_type(iter) == BTREE_ITER_CACHED)) { - btree_node_unlock(iter, 0); - iter->l[0].b = BTREE_ITER_NO_NODE_CACHED; - btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE); - return; - } - - l = btree_iter_up_until_good_node(iter, cmp); - - if (btree_iter_node(iter, l)) { - /* - * We might have to skip over many keys, or just a few: try - * advancing the node iterator, and if we have to skip over too - * many keys just reinit it (or if we're rewinding, since that - * is expensive). 
- */ - if (cmp < 0 || - !btree_iter_advance_to_pos(iter, &iter->l[l], 8)) - __btree_iter_init(iter, l); - - /* Don't leave it locked if we're not supposed to: */ - if (btree_lock_want(iter, l) == BTREE_NODE_UNLOCKED) - btree_node_unlock(iter, l); - } -out: - if (l != iter->level) - btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE); - else - btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK); - - bch2_btree_iter_verify(iter); -#ifdef CONFIG_BCACHEFS_DEBUG - trace_iter_set_search_pos(iter->trans->ip, _RET_IP_, - iter->btree_id, - &old_pos, &new_pos, l); -#endif -} - inline bool bch2_btree_iter_advance(struct btree_iter *iter) { struct bpos pos = iter->k.p; - bool ret = bpos_cmp(pos, POS_MAX) != 0; + bool ret = (iter->flags & BTREE_ITER_ALL_SNAPSHOTS + ? bpos_cmp(pos, SPOS_MAX) + : bkey_cmp(pos, SPOS_MAX)) != 0; if (ret && !(iter->flags & BTREE_ITER_IS_EXTENTS)) pos = bkey_successor(iter, pos); @@ -1598,7 +2079,9 @@ inline bool bch2_btree_iter_advance(struct btree_iter *iter) inline bool bch2_btree_iter_rewind(struct btree_iter *iter) { struct bpos pos = bkey_start_pos(&iter->k); - bool ret = bpos_cmp(pos, POS_MIN) != 0; + bool ret = (iter->flags & BTREE_ITER_ALL_SNAPSHOTS + ? bpos_cmp(pos, POS_MIN) + : bkey_cmp(pos, POS_MIN)) != 0; if (ret && !(iter->flags & BTREE_ITER_IS_EXTENTS)) pos = bkey_predecessor(iter, pos); @@ -1606,91 +2089,84 @@ inline bool bch2_btree_iter_rewind(struct btree_iter *iter) return ret; } -static inline bool btree_iter_set_pos_to_next_leaf(struct btree_iter *iter) -{ - struct bpos next_pos = iter->l[0].b->key.k.p; - bool ret = bpos_cmp(next_pos, POS_MAX) != 0; - - /* - * Typically, we don't want to modify iter->pos here, since that - * indicates where we searched from - unless we got to the end of the - * btree, in that case we want iter->pos to reflect that: - */ - if (ret) - btree_iter_set_search_pos(iter, bpos_successor(next_pos)); - else - bch2_btree_iter_set_pos(iter, POS_MAX); - - return ret; -} - -static inline bool btree_iter_set_pos_to_prev_leaf(struct btree_iter *iter) -{ - struct bpos next_pos = iter->l[0].b->data->min_key; - bool ret = bpos_cmp(next_pos, POS_MIN) != 0; - - if (ret) - btree_iter_set_search_pos(iter, bpos_predecessor(next_pos)); - else - bch2_btree_iter_set_pos(iter, POS_MIN); - - return ret; -} - -static struct bkey_i *btree_trans_peek_updates(struct btree_trans *trans, - enum btree_id btree_id, struct bpos pos) -{ - struct btree_insert_entry *i; - - trans_for_each_update2(trans, i) - if ((cmp_int(btree_id, i->iter->btree_id) ?: - bkey_cmp(pos, i->k->k.p)) <= 0) { - if (btree_id == i->iter->btree_id) - return i->k; - break; - } - - return NULL; -} - -static inline struct bkey_s_c __btree_iter_peek(struct btree_iter *iter, bool with_updates) +/** + * bch2_btree_iter_peek: returns first key greater than or equal to iterator's + * current position + */ +struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) { + struct btree_trans *trans = iter->trans; struct bpos search_key = btree_iter_search_key(iter); struct bkey_i *next_update; struct bkey_s_c k; - int ret; + int ret, cmp; - EBUG_ON(btree_iter_type(iter) != BTREE_ITER_KEYS); + EBUG_ON(iter->path->cached || iter->path->level); bch2_btree_iter_verify(iter); bch2_btree_iter_verify_entry_exit(iter); -start: - next_update = with_updates - ? 
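/*
 * Illustrative sketch, not part of the patch: a manual forward scan built
 * from bch2_btree_iter_peek() and bch2_btree_iter_advance() - essentially
 * what the for_each_btree_key() macro later in this patch expands to,
 * minus the restart handling. Function name and btree id are assumptions.
 */
static int example_forward_scan(struct btree_trans *trans)
{
	struct btree_iter iter;
	struct bkey_s_c k;
	int ret;

	bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, POS_MIN, 0);

	while (1) {
		k = bch2_btree_iter_peek(&iter);
		ret = bkey_err(k);
		if (ret || !k.k)
			break;

		/* ... process k ... */

		bch2_btree_iter_advance(&iter);
	}

	bch2_trans_iter_exit(trans, &iter);
	return ret;
}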
btree_trans_peek_updates(iter->trans, iter->btree_id, search_key) - : NULL; - btree_iter_set_search_pos(iter, search_key); while (1) { - ret = btree_iter_traverse(iter); - if (unlikely(ret)) - return bkey_s_c_err(ret); + iter->path = btree_path_set_pos(trans, iter->path, search_key, + iter->flags & BTREE_ITER_INTENT, + iter->ip_allocated); - k = btree_iter_level_peek(iter, &iter->l[0]); + ret = bch2_btree_path_traverse(trans, iter->path, iter->flags); + if (unlikely(ret)) { + /* ensure that iter->k is consistent with iter->pos: */ + bch2_btree_iter_set_pos(iter, iter->pos); + k = bkey_s_c_err(ret); + goto out; + } + + next_update = iter->flags & BTREE_ITER_WITH_UPDATES + ? btree_trans_peek_updates(trans, iter->btree_id, search_key) + : NULL; + k = btree_path_level_peek_all(trans->c, &iter->path->l[0], &iter->k); + + /* * In the btree, deleted keys sort before non deleted: */ + if (k.k && bkey_deleted(k.k) && + (!next_update || + bpos_cmp(k.k->p, next_update->k.p) <= 0)) { + search_key = k.k->p; + continue; + } if (next_update && - bpos_cmp(next_update->k.p, iter->real_pos) <= 0) + bpos_cmp(next_update->k.p, + k.k ? k.k->p : iter->path->l[0].b->key.k.p) <= 0) { + iter->k = next_update->k; k = bkey_i_to_s_c(next_update); + } if (likely(k.k)) { - if (bkey_deleted(k.k)) { + /* + * We can never have a key in a leaf node at POS_MAX, so + * we don't have to check these successor() calls: + */ + if ((iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) && + !bch2_snapshot_is_ancestor(trans->c, + iter->snapshot, + k.k->p.snapshot)) { + search_key = bpos_successor(k.k->p); + continue; + } + + if (bkey_whiteout(k.k) && + !(iter->flags & BTREE_ITER_ALL_SNAPSHOTS)) { search_key = bkey_successor(iter, k.k->p); - goto start; + continue; } break; + } else if (likely(bpos_cmp(iter->path->l[0].b->key.k.p, SPOS_MAX))) { + /* Advance to next leaf node: */ + search_key = bpos_successor(iter->path->l[0].b->key.k.p); + } else { + /* End of btree: */ + bch2_btree_iter_set_pos(iter, SPOS_MAX); + k = bkey_s_c_null; + goto out; } - - if (!btree_iter_set_pos_to_next_leaf(iter)) - return bkey_s_c_null; } /* @@ -1702,19 +2178,27 @@ start: else if (bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0) iter->pos = bkey_start_pos(k.k); + if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) + iter->pos.snapshot = iter->snapshot; + + cmp = bpos_cmp(k.k->p, iter->path->pos); + if (cmp) { + iter->path = bch2_btree_path_make_mut(trans, iter->path, + iter->flags & BTREE_ITER_INTENT, + iter->ip_allocated); + iter->path->pos = k.k->p; + btree_path_check_sort(trans, iter->path, cmp); + } +out: + iter->path->should_be_locked = true; + bch2_btree_iter_verify_entry_exit(iter); bch2_btree_iter_verify(iter); - iter->should_be_locked = true; - return k; -} + ret = bch2_btree_iter_verify_ret(iter, k); + if (unlikely(ret)) + return bkey_s_c_err(ret); -/** - * bch2_btree_iter_peek: returns first key greater than or equal to iterator's - * current position - */ -struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) -{ - return __btree_iter_peek(iter, false); + return k; } /** @@ -1729,55 +2213,105 @@ struct bkey_s_c bch2_btree_iter_next(struct btree_iter *iter) return bch2_btree_iter_peek(iter); } -struct bkey_s_c bch2_btree_iter_peek_with_updates(struct btree_iter *iter) -{ - return __btree_iter_peek(iter, true); -} - -struct bkey_s_c bch2_btree_iter_next_with_updates(struct btree_iter *iter) -{ - if (!bch2_btree_iter_advance(iter)) - return bkey_s_c_null; - - return bch2_btree_iter_peek_with_updates(iter); -} - /** * bch2_btree_iter_peek_prev: returns 
first key less than or equal to * iterator's current position */ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) { - struct btree_iter_level *l = &iter->l[0]; + struct btree_trans *trans = iter->trans; + struct bpos search_key = iter->pos; + struct btree_path *saved_path = NULL; struct bkey_s_c k; + struct bkey saved_k; + const struct bch_val *saved_v; int ret; - EBUG_ON(btree_iter_type(iter) != BTREE_ITER_KEYS); + EBUG_ON(iter->path->cached || iter->path->level); + EBUG_ON(iter->flags & BTREE_ITER_WITH_UPDATES); bch2_btree_iter_verify(iter); bch2_btree_iter_verify_entry_exit(iter); - btree_iter_set_search_pos(iter, iter->pos); + if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) + search_key.snapshot = U32_MAX; while (1) { - ret = btree_iter_traverse(iter); + iter->path = btree_path_set_pos(trans, iter->path, search_key, + iter->flags & BTREE_ITER_INTENT, + iter->ip_allocated); + + ret = bch2_btree_path_traverse(trans, iter->path, iter->flags); if (unlikely(ret)) { + /* ensure that iter->k is consistent with iter->pos: */ + bch2_btree_iter_set_pos(iter, iter->pos); k = bkey_s_c_err(ret); - goto no_key; + goto out; } - k = btree_iter_level_peek(iter, l); + k = btree_path_level_peek(trans->c, iter->path, + &iter->path->l[0], &iter->k); if (!k.k || ((iter->flags & BTREE_ITER_IS_EXTENTS) - ? bkey_cmp(bkey_start_pos(k.k), iter->pos) >= 0 - : bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0)) - k = btree_iter_level_prev(iter, l); + ? bpos_cmp(bkey_start_pos(k.k), search_key) >= 0 + : bpos_cmp(k.k->p, search_key) > 0)) + k = btree_path_level_prev(trans->c, iter->path, + &iter->path->l[0], &iter->k); - if (likely(k.k)) - break; + btree_path_check_sort(trans, iter->path, 0); + + if (likely(k.k)) { + if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) { + if (k.k->p.snapshot == iter->snapshot) + goto got_key; + + /* + * If we have a saved candidate, and we're no + * longer at the same _key_ (not pos), return + * that candidate + */ + if (saved_path && bkey_cmp(k.k->p, saved_k.p)) { + bch2_path_put(trans, iter->path, + iter->flags & BTREE_ITER_INTENT); + iter->path = saved_path; + saved_path = NULL; + iter->k = saved_k; + k.v = saved_v; + goto got_key; + } + + if (bch2_snapshot_is_ancestor(iter->trans->c, + iter->snapshot, + k.k->p.snapshot)) { + if (saved_path) + bch2_path_put(trans, saved_path, + iter->flags & BTREE_ITER_INTENT); + saved_path = btree_path_clone(trans, iter->path, + iter->flags & BTREE_ITER_INTENT); + saved_k = *k.k; + saved_v = k.v; + } + + search_key = bpos_predecessor(k.k->p); + continue; + } +got_key: + if (bkey_whiteout(k.k) && + !(iter->flags & BTREE_ITER_ALL_SNAPSHOTS)) { + search_key = bkey_predecessor(iter, k.k->p); + if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) + search_key.snapshot = U32_MAX; + continue; + } - if (!btree_iter_set_pos_to_prev_leaf(iter)) { + break; + } else if (likely(bpos_cmp(iter->path->l[0].b->data->min_key, POS_MIN))) { + /* Advance to previous leaf node: */ + search_key = bpos_predecessor(iter->path->l[0].b->data->min_key); + } else { + /* Start of btree: */ + bch2_btree_iter_set_pos(iter, POS_MIN); k = bkey_s_c_null; - goto no_key; + goto out; } } @@ -1786,20 +2320,18 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) /* Extents can straddle iter->pos: */ if (bkey_cmp(k.k->p, iter->pos) < 0) iter->pos = k.k->p; + + if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) + iter->pos.snapshot = iter->snapshot; out: + if (saved_path) + bch2_path_put(trans, saved_path, iter->flags & BTREE_ITER_INTENT); + iter->path->should_be_locked = 
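/*
 * Illustrative sketch, not part of the patch: -EINTR from peek/traverse
 * now means the whole transaction was restarted, and the canonical
 * response is bch2_trans_begin() plus retry. This mirrors the
 * __bch2_btree_iter_peek_and_restart() helper added to btree_iter.h below.
 */
static struct bkey_s_c example_peek_with_restart(struct btree_trans *trans,
						 struct btree_iter *iter)
{
	struct bkey_s_c k;

	while (k = bch2_btree_iter_peek(iter),
	       bkey_err(k) == -EINTR)
		bch2_trans_begin(trans);

	return k;
}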
true; + bch2_btree_iter_verify_entry_exit(iter); bch2_btree_iter_verify(iter); - iter->should_be_locked = true; + return k; -no_key: - /* - * btree_iter_level_peek() may have set iter->k to a key we didn't want, and - * then we errored going to the previous leaf - make sure it's - * consistent with iter->pos: - */ - bkey_init(&iter->k); - iter->k.p = iter->pos; - goto out; } /** @@ -1814,82 +2346,98 @@ struct bkey_s_c bch2_btree_iter_prev(struct btree_iter *iter) return bch2_btree_iter_peek_prev(iter); } -static inline struct bkey_s_c -__bch2_btree_iter_peek_slot_extents(struct btree_iter *iter) +struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) { + struct btree_trans *trans = iter->trans; + struct bpos search_key; struct bkey_s_c k; - struct bpos pos, next_start; + int ret; + + EBUG_ON(iter->path->level); + bch2_btree_iter_verify(iter); + bch2_btree_iter_verify_entry_exit(iter); - /* keys & holes can't span inode numbers: */ - if (iter->pos.offset == KEY_OFFSET_MAX) { + /* extents can't span inode numbers: */ + if ((iter->flags & BTREE_ITER_IS_EXTENTS) && + unlikely(iter->pos.offset == KEY_OFFSET_MAX)) { if (iter->pos.inode == KEY_INODE_MAX) return bkey_s_c_null; - bch2_btree_iter_set_pos(iter, bkey_successor(iter, iter->pos)); + bch2_btree_iter_set_pos(iter, bpos_nosnap_successor(iter->pos)); } - pos = iter->pos; - k = bch2_btree_iter_peek(iter); - iter->pos = pos; - - if (bkey_err(k)) - return k; - - if (k.k && bkey_cmp(bkey_start_pos(k.k), iter->pos) <= 0) - return k; - - next_start = k.k ? bkey_start_pos(k.k) : POS_MAX; + search_key = btree_iter_search_key(iter); + iter->path = btree_path_set_pos(trans, iter->path, search_key, + iter->flags & BTREE_ITER_INTENT, + iter->ip_allocated); - bkey_init(&iter->k); - iter->k.p = iter->pos; - bch2_key_resize(&iter->k, - min_t(u64, KEY_SIZE_MAX, - (next_start.inode == iter->pos.inode - ? next_start.offset - : KEY_OFFSET_MAX) - - iter->pos.offset)); + ret = bch2_btree_path_traverse(trans, iter->path, iter->flags); + if (unlikely(ret)) + return bkey_s_c_err(ret); - EBUG_ON(!iter->k.size); + if ((iter->flags & BTREE_ITER_CACHED) || + !(iter->flags & (BTREE_ITER_IS_EXTENTS|BTREE_ITER_FILTER_SNAPSHOTS))) { + struct bkey_i *next_update; - bch2_btree_iter_verify_entry_exit(iter); - bch2_btree_iter_verify(iter); + next_update = iter->flags & BTREE_ITER_WITH_UPDATES + ? 
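/*
 * Illustrative sketch, not part of the patch: reverse iteration with
 * bch2_btree_iter_peek_prev()/bch2_btree_iter_rewind(). Names are
 * assumptions; note peek_prev does not support BTREE_ITER_WITH_UPDATES.
 */
static int example_backward_scan(struct btree_trans *trans, struct bpos from)
{
	struct btree_iter iter;
	struct bkey_s_c k;
	int ret;

	bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, from, 0);

	while (1) {
		k = bch2_btree_iter_peek_prev(&iter);
		ret = bkey_err(k);
		if (ret || !k.k)
			break;

		/* ... process k ... */

		if (!bch2_btree_iter_rewind(&iter))
			break;		/* hit POS_MIN */
	}

	bch2_trans_iter_exit(trans, &iter);
	return ret;
}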
btree_trans_peek_updates(trans, iter->btree_id, search_key) + : NULL; - return (struct bkey_s_c) { &iter->k, NULL }; -} + if (next_update && + !bpos_cmp(next_update->k.p, iter->pos)) { + iter->k = next_update->k; + k = bkey_i_to_s_c(next_update); + } else { + k = bch2_btree_path_peek_slot(iter->path, &iter->k); + } + } else { + struct bpos next; -struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) -{ - struct btree_iter_level *l = &iter->l[0]; - struct bkey_s_c k; - int ret; + if (iter->flags & BTREE_ITER_INTENT) { + struct btree_iter iter2; - EBUG_ON(btree_iter_type(iter) != BTREE_ITER_KEYS); - bch2_btree_iter_verify(iter); - bch2_btree_iter_verify_entry_exit(iter); + bch2_trans_copy_iter(&iter2, iter); + k = bch2_btree_iter_peek(&iter2); - btree_iter_set_search_pos(iter, btree_iter_search_key(iter)); + if (k.k && !bkey_err(k)) { + iter->k = iter2.k; + k.k = &iter->k; + } + bch2_trans_iter_exit(trans, &iter2); + } else { + struct bpos pos = iter->pos; - if (iter->flags & BTREE_ITER_IS_EXTENTS) - return __bch2_btree_iter_peek_slot_extents(iter); + k = bch2_btree_iter_peek(iter); + iter->pos = pos; + } - ret = btree_iter_traverse(iter); - if (unlikely(ret)) - return bkey_s_c_err(ret); + if (unlikely(bkey_err(k))) + return k; - k = btree_iter_level_peek_all(iter, l, &iter->k); + next = k.k ? bkey_start_pos(k.k) : POS_MAX; - EBUG_ON(k.k && bkey_deleted(k.k) && bkey_cmp(k.k->p, iter->pos) == 0); + if (bkey_cmp(iter->pos, next) < 0) { + bkey_init(&iter->k); + iter->k.p = iter->pos; + bch2_key_resize(&iter->k, + min_t(u64, KEY_SIZE_MAX, + (next.inode == iter->pos.inode + ? next.offset + : KEY_OFFSET_MAX) - + iter->pos.offset)); - if (!k.k || bkey_cmp(iter->pos, k.k->p)) { - /* hole */ - bkey_init(&iter->k); - iter->k.p = iter->pos; - k = (struct bkey_s_c) { &iter->k, NULL }; + k = (struct bkey_s_c) { &iter->k, NULL }; + EBUG_ON(!k.k->size); + } } + iter->path->should_be_locked = true; + bch2_btree_iter_verify_entry_exit(iter); bch2_btree_iter_verify(iter); - iter->should_be_locked = true; + ret = bch2_btree_iter_verify_ret(iter, k); + if (unlikely(ret)) + return bkey_s_c_err(ret); return k; } @@ -1910,298 +2458,200 @@ struct bkey_s_c bch2_btree_iter_prev_slot(struct btree_iter *iter) return bch2_btree_iter_peek_slot(iter); } -struct bkey_s_c bch2_btree_iter_peek_cached(struct btree_iter *iter) -{ - struct bkey_cached *ck; - int ret; - - EBUG_ON(btree_iter_type(iter) != BTREE_ITER_CACHED); - bch2_btree_iter_verify(iter); - - ret = btree_iter_traverse(iter); - if (unlikely(ret)) - return bkey_s_c_err(ret); - - ck = (void *) iter->l[0].b; - - EBUG_ON(iter->btree_id != ck->key.btree_id || - bkey_cmp(iter->pos, ck->key.pos)); - BUG_ON(!ck->valid); - - iter->should_be_locked = true; +/* new transactional stuff: */ - return bkey_i_to_s_c(ck->k); +static inline void btree_path_verify_sorted_ref(struct btree_trans *trans, + struct btree_path *path) +{ + EBUG_ON(path->sorted_idx >= trans->nr_sorted); + EBUG_ON(trans->sorted[path->sorted_idx] != path->idx); + EBUG_ON(!(trans->paths_allocated & (1ULL << path->idx))); } -static inline void bch2_btree_iter_init(struct btree_trans *trans, - struct btree_iter *iter, enum btree_id btree_id) +static inline void btree_trans_verify_sorted_refs(struct btree_trans *trans) { - struct bch_fs *c = trans->c; +#ifdef CONFIG_BCACHEFS_DEBUG unsigned i; - iter->trans = trans; - iter->uptodate = BTREE_ITER_NEED_TRAVERSE; - iter->btree_id = btree_id; - iter->real_pos = POS_MIN; - iter->level = 0; - iter->min_depth = 0; - iter->locks_want = 0; - 
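/*
 * Illustrative sketch, not part of the patch: the common point-lookup
 * pattern with the new stack-allocated iterators - compare
 * btree_key_cache_fill() later in this diff. Function name and btree id
 * are assumptions.
 */
static int example_point_lookup(struct btree_trans *trans, struct bpos pos)
{
	struct btree_iter iter;
	struct bkey_s_c k;
	int ret;

	bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs, pos,
			     BTREE_ITER_SLOTS);

	k = bch2_btree_iter_peek_slot(&iter);
	ret = bkey_err(k);
	if (!ret) {
		/* k is the key at pos, or a synthesized hole (k.v == NULL) */
	}

	bch2_trans_iter_exit(trans, &iter);
	return ret;
}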
iter->nodes_locked = 0; - iter->nodes_intent_locked = 0; - for (i = 0; i < ARRAY_SIZE(iter->l); i++) - iter->l[i].b = BTREE_ITER_NO_NODE_INIT; - - prefetch(c->btree_roots[btree_id].b); + for (i = 0; i < trans->nr_sorted; i++) + btree_path_verify_sorted_ref(trans, trans->paths + trans->sorted[i]); +#endif } -/* new transactional stuff: */ - -static inline void __bch2_trans_iter_free(struct btree_trans *trans, - unsigned idx) +static void btree_trans_verify_sorted(struct btree_trans *trans) { - __bch2_btree_iter_unlock(&trans->iters[idx]); - trans->iters_linked &= ~(1ULL << idx); - trans->iters_live &= ~(1ULL << idx); - trans->iters_touched &= ~(1ULL << idx); +#ifdef CONFIG_BCACHEFS_DEBUG + struct btree_path *path, *prev = NULL; + unsigned i; + + trans_for_each_path_inorder(trans, path, i) { + BUG_ON(prev && btree_path_cmp(prev, path) > 0); + prev = path; + } +#endif } -int bch2_trans_iter_put(struct btree_trans *trans, - struct btree_iter *iter) +static inline void btree_path_swap(struct btree_trans *trans, + struct btree_path *l, struct btree_path *r) { - int ret; + swap(l->sorted_idx, r->sorted_idx); + swap(trans->sorted[l->sorted_idx], + trans->sorted[r->sorted_idx]); - if (IS_ERR_OR_NULL(iter)) - return 0; - - BUG_ON(trans->iters + iter->idx != iter); - BUG_ON(!btree_iter_live(trans, iter)); - - ret = btree_iter_err(iter); - - if (!(trans->iters_touched & (1ULL << iter->idx)) && - !(iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT)) - __bch2_trans_iter_free(trans, iter->idx); - - trans->iters_live &= ~(1ULL << iter->idx); - return ret; + btree_path_verify_sorted_ref(trans, l); + btree_path_verify_sorted_ref(trans, r); } -int bch2_trans_iter_free(struct btree_trans *trans, - struct btree_iter *iter) +static void btree_path_check_sort(struct btree_trans *trans, struct btree_path *path, + int cmp) { - if (IS_ERR_OR_NULL(iter)) - return 0; + struct btree_path *n; + + if (cmp <= 0) { + n = prev_btree_path(trans, path); + if (n && btree_path_cmp(n, path) > 0) { + do { + btree_path_swap(trans, n, path); + n = prev_btree_path(trans, path); + } while (n && btree_path_cmp(n, path) > 0); - set_btree_iter_dontneed(trans, iter); + goto out; + } + } - return bch2_trans_iter_put(trans, iter); + if (cmp >= 0) { + n = next_btree_path(trans, path); + if (n && btree_path_cmp(path, n) > 0) { + do { + btree_path_swap(trans, path, n); + n = next_btree_path(trans, path); + } while (n && btree_path_cmp(path, n) > 0); + } + } +out: + btree_trans_verify_sorted(trans); } -noinline __cold -static void btree_trans_iter_alloc_fail(struct btree_trans *trans) +static inline void btree_path_list_remove(struct btree_trans *trans, + struct btree_path *path) { + unsigned i; - struct btree_iter *iter; - struct btree_insert_entry *i; - char buf[100]; + EBUG_ON(path->sorted_idx >= trans->nr_sorted); - trans_for_each_iter(trans, iter) - printk(KERN_ERR "iter: btree %s pos %s%s%s%s %pS\n", - bch2_btree_ids[iter->btree_id], - (bch2_bpos_to_text(&PBUF(buf), iter->pos), buf), - btree_iter_live(trans, iter) ? " live" : "", - (trans->iters_touched & (1ULL << iter->idx)) ? " touched" : "", - iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT ? 
" keep" : "", - (void *) iter->ip_allocated); + array_remove_item(trans->sorted, trans->nr_sorted, path->sorted_idx); - trans_for_each_update(trans, i) { - char buf[300]; + for (i = path->sorted_idx; i < trans->nr_sorted; i++) + trans->paths[trans->sorted[i]].sorted_idx = i; - bch2_bkey_val_to_text(&PBUF(buf), trans->c, bkey_i_to_s_c(i->k)); - printk(KERN_ERR "update: btree %s %s\n", - bch2_btree_ids[i->iter->btree_id], buf); - } - panic("trans iter oveflow\n"); + path->sorted_idx = U8_MAX; + + btree_trans_verify_sorted_refs(trans); } -static struct btree_iter *btree_trans_iter_alloc(struct btree_trans *trans) +static inline void btree_path_list_add(struct btree_trans *trans, + struct btree_path *pos, + struct btree_path *path) { - unsigned idx; - - if (unlikely(trans->iters_linked == - ~((~0ULL << 1) << (BTREE_ITER_MAX - 1)))) - btree_trans_iter_alloc_fail(trans); + unsigned i; - idx = __ffs64(~trans->iters_linked); + btree_trans_verify_sorted_refs(trans); - trans->iters_linked |= 1ULL << idx; - trans->iters[idx].idx = idx; - trans->iters[idx].flags = 0; - return &trans->iters[idx]; -} + path->sorted_idx = pos ? pos->sorted_idx + 1 : 0; -static inline void btree_iter_copy(struct btree_iter *dst, - struct btree_iter *src) -{ - unsigned i, idx = dst->idx; + array_insert_item(trans->sorted, trans->nr_sorted, path->sorted_idx, path->idx); - *dst = *src; - dst->idx = idx; - dst->flags &= ~BTREE_ITER_KEEP_UNTIL_COMMIT; + for (i = path->sorted_idx; i < trans->nr_sorted; i++) + trans->paths[trans->sorted[i]].sorted_idx = i; - for (i = 0; i < BTREE_MAX_DEPTH; i++) - if (btree_node_locked(dst, i)) - six_lock_increment(&dst->l[i].b->c.lock, - __btree_lock_want(dst, i)); + btree_trans_verify_sorted_refs(trans); +} - dst->flags &= ~BTREE_ITER_KEEP_UNTIL_COMMIT; - dst->flags &= ~BTREE_ITER_SET_POS_AFTER_COMMIT; +void bch2_trans_iter_exit(struct btree_trans *trans, struct btree_iter *iter) +{ + if (iter->path) + bch2_path_put(trans, iter->path, + iter->flags & BTREE_ITER_INTENT); + iter->path = NULL; } -struct btree_iter *__bch2_trans_get_iter(struct btree_trans *trans, - unsigned btree_id, struct bpos pos, - unsigned locks_want, - unsigned depth, - unsigned flags) +static void __bch2_trans_iter_init(struct btree_trans *trans, + struct btree_iter *iter, + unsigned btree_id, struct bpos pos, + unsigned locks_want, + unsigned depth, + unsigned flags, + unsigned long ip) { - struct btree_iter *iter, *best = NULL; - struct bpos real_pos, pos_min = POS_MIN; + EBUG_ON(trans->restarted); - if ((flags & BTREE_ITER_TYPE) != BTREE_ITER_NODES && - btree_node_type_is_extents(btree_id) && - !(flags & BTREE_ITER_NOT_EXTENTS) && - !(flags & BTREE_ITER_ALL_SNAPSHOTS)) + if (!(flags & (BTREE_ITER_ALL_SNAPSHOTS|BTREE_ITER_NOT_EXTENTS)) && + btree_node_type_is_extents(btree_id)) flags |= BTREE_ITER_IS_EXTENTS; - if ((flags & BTREE_ITER_TYPE) != BTREE_ITER_NODES && + if (!(flags & __BTREE_ITER_ALL_SNAPSHOTS) && !btree_type_has_snapshots(btree_id)) flags &= ~BTREE_ITER_ALL_SNAPSHOTS; - if (!(flags & BTREE_ITER_ALL_SNAPSHOTS)) - pos.snapshot = btree_type_has_snapshots(btree_id) - ? 
U32_MAX : 0; - - real_pos = pos; - - if ((flags & BTREE_ITER_IS_EXTENTS) && - bkey_cmp(pos, POS_MAX)) - real_pos = bpos_nosnap_successor(pos); - - trans_for_each_iter(trans, iter) { - if (btree_iter_type(iter) != (flags & BTREE_ITER_TYPE)) - continue; - - if (iter->btree_id != btree_id) - continue; - - if (best) { - int cmp = bkey_cmp(bpos_diff(best->real_pos, real_pos), - bpos_diff(iter->real_pos, real_pos)); - - if (cmp < 0 || - ((cmp == 0 && btree_iter_keep(trans, iter)))) - continue; - } - - best = iter; - } - - if (!best) { - iter = btree_trans_iter_alloc(trans); - bch2_btree_iter_init(trans, iter, btree_id); - } else if (btree_iter_keep(trans, best)) { - iter = btree_trans_iter_alloc(trans); - btree_iter_copy(iter, best); - } else { - iter = best; - } - - trans->iters_live |= 1ULL << iter->idx; - trans->iters_touched |= 1ULL << iter->idx; - - iter->flags = flags; - - iter->snapshot = pos.snapshot; - - /* - * If the iterator has locks_want greater than requested, we explicitly - * do not downgrade it here - on transaction restart because btree node - * split needs to upgrade locks, we might be putting/getting the - * iterator again. Downgrading iterators only happens via an explicit - * bch2_trans_downgrade(). - */ - - locks_want = min(locks_want, BTREE_MAX_DEPTH); - if (locks_want > iter->locks_want) { - iter->locks_want = locks_want; - btree_iter_get_locks(iter, true, _THIS_IP_); - } - - while (iter->level != depth) { - btree_node_unlock(iter, iter->level); - iter->l[iter->level].b = BTREE_ITER_NO_NODE_INIT; - iter->uptodate = BTREE_ITER_NEED_TRAVERSE; - if (iter->level < depth) - iter->level++; - else - iter->level--; - } + if (!(flags & BTREE_ITER_ALL_SNAPSHOTS) && + btree_type_has_snapshots(btree_id)) + flags |= BTREE_ITER_FILTER_SNAPSHOTS; + iter->trans = trans; + iter->path = NULL; + iter->btree_id = btree_id; iter->min_depth = depth; + iter->flags = flags; + iter->snapshot = pos.snapshot; + iter->pos = pos; + iter->k.type = KEY_TYPE_deleted; + iter->k.p = pos; + iter->k.size = 0; +#ifdef CONFIG_BCACHEFS_DEBUG + iter->ip_allocated = ip; +#endif - bch2_btree_iter_set_pos(iter, pos); - btree_iter_set_search_pos(iter, real_pos); - - trace_trans_get_iter(_RET_IP_, trans->ip, - btree_id, - &real_pos, locks_want, iter->uptodate, - best ? &best->real_pos : &pos_min, - best ? best->locks_want : U8_MAX, - best ? 
best->uptodate : U8_MAX);
-
- return iter;
+ iter->path = bch2_path_get(trans,
+ flags & BTREE_ITER_CACHED,
+ btree_id,
+ iter->pos,
+ locks_want,
+ depth,
+ flags & BTREE_ITER_INTENT, ip);
 }

-struct btree_iter *bch2_trans_get_node_iter(struct btree_trans *trans,
- enum btree_id btree_id,
- struct bpos pos,
- unsigned locks_want,
- unsigned depth,
- unsigned flags)
+void bch2_trans_iter_init(struct btree_trans *trans,
+ struct btree_iter *iter,
+ unsigned btree_id, struct bpos pos,
+ unsigned flags)
 {
- struct btree_iter *iter =
- __bch2_trans_get_iter(trans, btree_id, pos,
- locks_want, depth,
- BTREE_ITER_NODES|
- BTREE_ITER_NOT_EXTENTS|
- BTREE_ITER_ALL_SNAPSHOTS|
- flags);
-
- BUG_ON(bkey_cmp(iter->pos, pos));
- BUG_ON(iter->locks_want != min(locks_want, BTREE_MAX_DEPTH));
- BUG_ON(iter->level != depth);
- BUG_ON(iter->min_depth != depth);
- iter->ip_allocated = _RET_IP_;
-
- return iter;
+ __bch2_trans_iter_init(trans, iter, btree_id, pos,
+ 0, 0, flags, _RET_IP_);
 }

-struct btree_iter *__bch2_trans_copy_iter(struct btree_trans *trans,
- struct btree_iter *src)
+void bch2_trans_node_iter_init(struct btree_trans *trans,
+ struct btree_iter *iter,
+ enum btree_id btree_id,
+ struct bpos pos,
+ unsigned locks_want,
+ unsigned depth,
+ unsigned flags)
 {
- struct btree_iter *iter;
-
- iter = btree_trans_iter_alloc(trans);
- btree_iter_copy(iter, src);
-
- trans->iters_live |= 1ULL << iter->idx;
- /*
- * We don't need to preserve this iter since it's cheap to copy it
- * again - this will cause trans_iter_put() to free it right away:
- */
- set_btree_iter_dontneed(trans, iter);
+ __bch2_trans_iter_init(trans, iter, btree_id, pos, locks_want, depth,
+ BTREE_ITER_NOT_EXTENTS|
+ __BTREE_ITER_ALL_SNAPSHOTS|
+ BTREE_ITER_ALL_SNAPSHOTS|
+ flags, _RET_IP_);
+ BUG_ON(iter->path->locks_want < min(locks_want, BTREE_MAX_DEPTH));
+ BUG_ON(iter->path->level != depth);
+ BUG_ON(iter->min_depth != depth);
+}

- return iter;
+void bch2_trans_copy_iter(struct btree_iter *dst, struct btree_iter *src)
+{
+ *dst = *src;
+ if (src->path)
+ __btree_path_get(src->path, src->flags & BTREE_ITER_INTENT);
 }

 void *bch2_trans_kmalloc(struct btree_trans *trans, size_t size)
@@ -2231,43 +2681,36 @@ void *bch2_trans_kmalloc(struct btree_trans *trans, size_t size)
 if (old_bytes) {
 trace_trans_restart_mem_realloced(trans->ip, _RET_IP_, new_bytes);
+ btree_trans_restart(trans);
 return ERR_PTR(-EINTR);
 }
 }

 p = trans->mem + trans->mem_top;
 trans->mem_top += size;
+ memset(p, 0, size);
 return p;
 }

-inline void bch2_trans_unlink_iters(struct btree_trans *trans)
-{
- u64 iters = trans->iters_linked &
- ~trans->iters_touched &
- ~trans->iters_live;
-
- while (iters) {
- unsigned idx = __ffs64(iters);
-
- iters &= ~(1ULL << idx);
- __bch2_trans_iter_free(trans, idx);
- }
-}
-
-void bch2_trans_reset(struct btree_trans *trans, unsigned flags)
+/**
+ * bch2_trans_begin() - reset a transaction after an interrupted attempt
+ * @trans: transaction to reset
+ *
+ * While iterating over nodes or updating nodes an attempt to lock a btree
+ * node may return EINTR when the trylock fails. When this occurs
+ * bch2_trans_begin() should be called and the transaction retried. 
+ */ +void bch2_trans_begin(struct btree_trans *trans) { - struct btree_iter *iter; - - trans_for_each_iter(trans, iter) - iter->flags &= ~(BTREE_ITER_KEEP_UNTIL_COMMIT| - BTREE_ITER_SET_POS_AFTER_COMMIT); - - bch2_trans_unlink_iters(trans); + struct btree_insert_entry *i; + struct btree_path *path; - trans->iters_touched &= trans->iters_live; + trans_for_each_update(trans, i) + __btree_path_put(i->path, true); + memset(&trans->journal_res, 0, sizeof(trans->journal_res)); + trans->extra_journal_res = 0; trans->nr_updates = 0; - trans->nr_updates2 = 0; trans->mem_top = 0; trans->hooks = NULL; @@ -2281,31 +2724,44 @@ void bch2_trans_reset(struct btree_trans *trans, unsigned flags) (void *) &trans->fs_usage_deltas->memset_start); } - if (!(flags & TRANS_RESET_NOUNLOCK)) - bch2_trans_cond_resched(trans); + trans_for_each_path(trans, path) { + path->should_be_locked = false; + + /* + * XXX: we probably shouldn't be doing this if the transaction + * was restarted, but currently we still overflow transaction + * iterators if we do that + */ + if (!path->ref && !path->preserve) + __bch2_path_free(trans, path); + else if (!path->ref) + path->preserve = false; + } + + bch2_trans_cond_resched(trans); + + if (trans->restarted) + bch2_btree_path_traverse_all(trans); - if (!(flags & TRANS_RESET_NOTRAVERSE) && - trans->iters_linked) - bch2_btree_iter_traverse_all(trans); + trans->restarted = false; } -static void bch2_trans_alloc_iters(struct btree_trans *trans, struct bch_fs *c) +static void bch2_trans_alloc_paths(struct btree_trans *trans, struct bch_fs *c) { - size_t iters_bytes = sizeof(struct btree_iter) * BTREE_ITER_MAX; + size_t paths_bytes = sizeof(struct btree_path) * BTREE_ITER_MAX; size_t updates_bytes = sizeof(struct btree_insert_entry) * BTREE_ITER_MAX; void *p = NULL; BUG_ON(trans->used_mempool); #ifdef __KERNEL__ - p = this_cpu_xchg(c->btree_iters_bufs->iter, NULL); + p = this_cpu_xchg(c->btree_paths_bufs->path , NULL); #endif if (!p) - p = mempool_alloc(&trans->c->btree_iters_pool, GFP_NOFS); + p = mempool_alloc(&trans->c->btree_paths_pool, GFP_NOFS); - trans->iters = p; p += iters_bytes; + trans->paths = p; p += paths_bytes; trans->updates = p; p += updates_bytes; - trans->updates2 = p; p += updates_bytes; } void bch2_trans_init(struct btree_trans *trans, struct bch_fs *c, @@ -2317,11 +2773,7 @@ void bch2_trans_init(struct btree_trans *trans, struct bch_fs *c, trans->c = c; trans->ip = _RET_IP_; - /* - * reallocating iterators currently completely breaks - * bch2_trans_iter_put(), we always allocate the max: - */ - bch2_trans_alloc_iters(trans, c); + bch2_trans_alloc_paths(trans, c); if (expected_mem_bytes) { trans->mem_bytes = roundup_pow_of_two(expected_mem_bytes); @@ -2343,47 +2795,63 @@ void bch2_trans_init(struct btree_trans *trans, struct bch_fs *c, #endif } -int bch2_trans_exit(struct btree_trans *trans) +static void check_btree_paths_leaked(struct btree_trans *trans) +{ +#ifdef CONFIG_BCACHEFS_DEBUG + struct bch_fs *c = trans->c; + struct btree_path *path; + + trans_for_each_path(trans, path) + if (path->ref) + goto leaked; + return; +leaked: + bch_err(c, "btree paths leaked from %pS!", (void *) trans->ip); + trans_for_each_path(trans, path) + if (path->ref) + printk(KERN_ERR " btree %s %pS\n", + bch2_btree_ids[path->btree_id], + (void *) path->ip_allocated); + /* Be noisy about this: */ + bch2_fatal_error(c); +#endif +} + +void bch2_trans_exit(struct btree_trans *trans) __releases(&c->btree_trans_barrier) { + struct btree_insert_entry *i; struct bch_fs *c = trans->c; 
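/*
 * Illustrative sketch, not part of the patch: the canonical retry loop
 * under the reworked API. bch2_trans_begin() replaces bch2_trans_reset(),
 * must be called before every (re)attempt, and bch2_trans_exit() no longer
 * returns an error. example_trans_body() is a hypothetical stand-in for
 * real transactional work.
 */
static int example_transaction(struct bch_fs *c)
{
	struct btree_trans trans;
	int ret;

	bch2_trans_init(&trans, c, 0, 0);
retry:
	bch2_trans_begin(&trans);

	ret = example_trans_body(&trans);	/* hypothetical helper */
	if (ret == -EINTR)
		goto retry;

	bch2_trans_exit(&trans);
	return ret;
}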
bch2_trans_unlock(trans); -#ifdef CONFIG_BCACHEFS_DEBUG - if (trans->iters_live) { - struct btree_iter *iter; + trans_for_each_update(trans, i) + __btree_path_put(i->path, true); + trans->nr_updates = 0; - bch_err(c, "btree iterators leaked!"); - trans_for_each_iter(trans, iter) - if (btree_iter_live(trans, iter)) - printk(KERN_ERR " btree %s allocated at %pS\n", - bch2_btree_ids[iter->btree_id], - (void *) iter->ip_allocated); - /* Be noisy about this: */ - bch2_fatal_error(c); - } + check_btree_paths_leaked(trans); - mutex_lock(&trans->c->btree_trans_lock); +#ifdef CONFIG_BCACHEFS_DEBUG + mutex_lock(&c->btree_trans_lock); list_del(&trans->list); - mutex_unlock(&trans->c->btree_trans_lock); + mutex_unlock(&c->btree_trans_lock); #endif srcu_read_unlock(&c->btree_trans_barrier, trans->srcu_idx); - bch2_journal_preres_put(&trans->c->journal, &trans->journal_preres); + bch2_journal_preres_put(&c->journal, &trans->journal_preres); if (trans->fs_usage_deltas) { if (trans->fs_usage_deltas->size + sizeof(trans->fs_usage_deltas) == REPLICAS_DELTA_LIST_MAX) mempool_free(trans->fs_usage_deltas, - &trans->c->replicas_delta_pool); + &c->replicas_delta_pool); else kfree(trans->fs_usage_deltas); } if (trans->mem_bytes == BTREE_TRANS_MEM_MAX) - mempool_free(trans->mem, &trans->c->btree_trans_mem_pool); + mempool_free(trans->mem, &c->btree_trans_mem_pool); else kfree(trans->mem); @@ -2391,36 +2859,33 @@ int bch2_trans_exit(struct btree_trans *trans) /* * Userspace doesn't have a real percpu implementation: */ - trans->iters = this_cpu_xchg(c->btree_iters_bufs->iter, trans->iters); + trans->paths = this_cpu_xchg(c->btree_paths_bufs->path, trans->paths); #endif - if (trans->iters) - mempool_free(trans->iters, &trans->c->btree_iters_pool); + if (trans->paths) + mempool_free(trans->paths, &c->btree_paths_pool); trans->mem = (void *) 0x1; - trans->iters = (void *) 0x1; - - return trans->error ? -EIO : 0; + trans->paths = (void *) 0x1; } static void __maybe_unused -bch2_btree_iter_node_to_text(struct printbuf *out, +bch2_btree_path_node_to_text(struct printbuf *out, struct btree_bkey_cached_common *_b, - enum btree_iter_type type) + bool cached) { pr_buf(out, " l=%u %s:", _b->level, bch2_btree_ids[_b->btree_id]); - bch2_bpos_to_text(out, btree_node_pos(_b, type)); + bch2_bpos_to_text(out, btree_node_pos(_b, cached)); } #ifdef CONFIG_BCACHEFS_DEBUG -static bool trans_has_btree_nodes_locked(struct btree_trans *trans) +static bool trans_has_locks(struct btree_trans *trans) { - struct btree_iter *iter; + struct btree_path *path; - trans_for_each_iter(trans, iter) - if (btree_iter_type(iter) != BTREE_ITER_CACHED && - iter->nodes_locked) + trans_for_each_path(trans, path) + if (path->nodes_locked) return true; return false; } @@ -2430,35 +2895,36 @@ void bch2_btree_trans_to_text(struct printbuf *out, struct bch_fs *c) { #ifdef CONFIG_BCACHEFS_DEBUG struct btree_trans *trans; - struct btree_iter *iter; + struct btree_path *path; struct btree *b; unsigned l; mutex_lock(&c->btree_trans_lock); list_for_each_entry(trans, &c->btree_trans_list, list) { - if (!trans_has_btree_nodes_locked(trans)) + if (!trans_has_locks(trans)) continue; pr_buf(out, "%i %ps\n", trans->pid, (void *) trans->ip); - trans_for_each_iter(trans, iter) { - if (!iter->nodes_locked) + trans_for_each_path(trans, path) { + if (!path->nodes_locked) continue; - pr_buf(out, " iter %u %c %s:", - iter->idx, - btree_iter_type(iter) == BTREE_ITER_CACHED ? 
'c' : 'b', - bch2_btree_ids[iter->btree_id]); - bch2_bpos_to_text(out, iter->pos); + pr_buf(out, " path %u %c l=%u %s:", + path->idx, + path->cached ? 'c' : 'b', + path->level, + bch2_btree_ids[path->btree_id]); + bch2_bpos_to_text(out, path->pos); pr_buf(out, "\n"); for (l = 0; l < BTREE_MAX_DEPTH; l++) { - if (btree_node_locked(iter, l)) { + if (btree_node_locked(path, l)) { pr_buf(out, " %s l=%u ", - btree_node_intent_locked(iter, l) ? "i" : "r", l); - bch2_btree_iter_node_to_text(out, - (void *) iter->l[l].b, - btree_iter_type(iter)); + btree_node_intent_locked(path, l) ? "i" : "r", l); + bch2_btree_path_node_to_text(out, + (void *) path->l[l].b, + path->cached); pr_buf(out, "\n"); } } @@ -2466,18 +2932,17 @@ void bch2_btree_trans_to_text(struct printbuf *out, struct bch_fs *c) b = READ_ONCE(trans->locking); if (b) { - iter = &trans->iters[trans->locking_iter_idx]; - pr_buf(out, " locking iter %u %c l=%u %s:", - trans->locking_iter_idx, - btree_iter_type(iter) == BTREE_ITER_CACHED ? 'c' : 'b', + path = &trans->paths[trans->locking_path_idx]; + pr_buf(out, " locking path %u %c l=%u %s:", + trans->locking_path_idx, + path->cached ? 'c' : 'b', trans->locking_level, bch2_btree_ids[trans->locking_btree_id]); bch2_bpos_to_text(out, trans->locking_pos); pr_buf(out, " node "); - bch2_btree_iter_node_to_text(out, - (void *) b, - btree_iter_type(iter)); + bch2_btree_path_node_to_text(out, + (void *) b, path->cached); pr_buf(out, "\n"); } } @@ -2488,7 +2953,7 @@ void bch2_btree_trans_to_text(struct printbuf *out, struct bch_fs *c) void bch2_fs_btree_iter_exit(struct bch_fs *c) { mempool_exit(&c->btree_trans_mem_pool); - mempool_exit(&c->btree_iters_pool); + mempool_exit(&c->btree_paths_pool); cleanup_srcu_struct(&c->btree_trans_barrier); } @@ -2500,9 +2965,8 @@ int bch2_fs_btree_iter_init(struct bch_fs *c) mutex_init(&c->btree_trans_lock); return init_srcu_struct(&c->btree_trans_barrier) ?: - mempool_init_kmalloc_pool(&c->btree_iters_pool, 1, - sizeof(struct btree_iter) * nr + - sizeof(struct btree_insert_entry) * nr + + mempool_init_kmalloc_pool(&c->btree_paths_pool, 1, + sizeof(struct btree_path) * nr + sizeof(struct btree_insert_entry) * nr) ?: mempool_init_kmalloc_pool(&c->btree_trans_mem_pool, 1, BTREE_TRANS_MEM_MAX); diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index a2ce711fd61f..26eb90a7eab8 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -5,40 +5,49 @@ #include "bset.h" #include "btree_types.h" -static inline void btree_iter_set_dirty(struct btree_iter *iter, - enum btree_iter_uptodate u) +static inline void __btree_path_get(struct btree_path *path, bool intent) { - iter->uptodate = max_t(unsigned, iter->uptodate, u); + path->ref++; + path->intent_ref += intent; } -static inline struct btree *btree_iter_node(struct btree_iter *iter, +static inline bool __btree_path_put(struct btree_path *path, bool intent) +{ + EBUG_ON(!path->ref); + EBUG_ON(!path->intent_ref && intent); + path->intent_ref -= intent; + return --path->ref == 0; +} + +static inline void btree_path_set_dirty(struct btree_path *path, + enum btree_path_uptodate u) +{ + path->uptodate = max_t(unsigned, path->uptodate, u); +} + +static inline struct btree *btree_path_node(struct btree_path *path, unsigned level) { - return level < BTREE_MAX_DEPTH ? iter->l[level].b : NULL; + return level < BTREE_MAX_DEPTH ? 
path->l[level].b : NULL; } -static inline bool btree_node_lock_seq_matches(const struct btree_iter *iter, +static inline bool btree_node_lock_seq_matches(const struct btree_path *path, const struct btree *b, unsigned level) { /* * We don't compare the low bits of the lock sequence numbers because - * @iter might have taken a write lock on @b, and we don't want to skip - * the linked iterator if the sequence numbers were equal before taking - * that write lock. The lock sequence number is incremented by taking - * and releasing write locks and is even when unlocked: + * @path might have taken a write lock on @b, and we don't want to skip + * the linked path if the sequence numbers were equal before taking that + * write lock. The lock sequence number is incremented by taking and + * releasing write locks and is even when unlocked: */ - return iter->l[level].lock_seq >> 1 == b->c.lock.state.seq >> 1; + return path->l[level].lock_seq >> 1 == b->c.lock.state.seq >> 1; } -static inline struct btree *btree_node_parent(struct btree_iter *iter, +static inline struct btree *btree_node_parent(struct btree_path *path, struct btree *b) { - return btree_iter_node(iter, b->c.level + 1); -} - -static inline bool btree_trans_has_multiple_iters(const struct btree_trans *trans) -{ - return hweight64(trans->iters_linked) > 1; + return btree_path_node(path, b->c.level + 1); } static inline int btree_iter_err(const struct btree_iter *iter) @@ -46,116 +55,164 @@ static inline int btree_iter_err(const struct btree_iter *iter) return iter->flags & BTREE_ITER_ERROR ? -EIO : 0; } -/* Iterate over iters within a transaction: */ +/* Iterate over paths within a transaction: */ -static inline struct btree_iter * -__trans_next_iter(struct btree_trans *trans, unsigned idx) +static inline struct btree_path * +__trans_next_path(struct btree_trans *trans, unsigned idx) { u64 l; if (idx == BTREE_ITER_MAX) return NULL; - l = trans->iters_linked >> idx; + l = trans->paths_allocated >> idx; if (!l) return NULL; idx += __ffs64(l); EBUG_ON(idx >= BTREE_ITER_MAX); - EBUG_ON(trans->iters[idx].idx != idx); - return &trans->iters[idx]; + EBUG_ON(trans->paths[idx].idx != idx); + return &trans->paths[idx]; } -#define trans_for_each_iter(_trans, _iter) \ - for (_iter = __trans_next_iter((_trans), 0); \ - (_iter); \ - _iter = __trans_next_iter((_trans), (_iter)->idx + 1)) +#define trans_for_each_path(_trans, _path) \ + for (_path = __trans_next_path((_trans), 0); \ + (_path); \ + _path = __trans_next_path((_trans), (_path)->idx + 1)) -static inline bool __iter_has_node(const struct btree_iter *iter, +static inline struct btree_path *next_btree_path(struct btree_trans *trans, struct btree_path *path) +{ + unsigned idx = path ? path->sorted_idx + 1 : 0; + + EBUG_ON(idx > trans->nr_sorted); + + return idx < trans->nr_sorted + ? trans->paths + trans->sorted[idx] + : NULL; +} + +static inline struct btree_path *prev_btree_path(struct btree_trans *trans, struct btree_path *path) +{ + EBUG_ON(path->sorted_idx >= trans->nr_sorted); + return path->sorted_idx + ? 
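/*
 * Illustrative sketch, not part of the patch: trans_for_each_path() above
 * is a walk of the paths_allocated bitmap - every set bit is a live slot
 * and __ffs64() jumps straight to the next one. Helper name is an
 * assumption.
 */
static void example_visit_paths(struct btree_trans *trans)
{
	u64 mask = trans->paths_allocated;

	while (mask) {
		struct btree_path *path = trans->paths + __ffs64(mask);

		/* ... inspect path ... */

		mask &= mask - 1;	/* clear the lowest set bit */
	}
}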
trans->paths + trans->sorted[path->sorted_idx - 1] + : NULL; +} + +#define trans_for_each_path_inorder(_trans, _path, _i) \ + for (_i = 0; \ + ((_path) = (_trans)->paths + trans->sorted[_i]), (_i) < (_trans)->nr_sorted;\ + _i++) + +static inline bool __path_has_node(const struct btree_path *path, const struct btree *b) { - return iter->l[b->c.level].b == b && - btree_node_lock_seq_matches(iter, b, b->c.level); + return path->l[b->c.level].b == b && + btree_node_lock_seq_matches(path, b, b->c.level); } -static inline struct btree_iter * -__trans_next_iter_with_node(struct btree_trans *trans, struct btree *b, +static inline struct btree_path * +__trans_next_path_with_node(struct btree_trans *trans, struct btree *b, unsigned idx) { - struct btree_iter *iter = __trans_next_iter(trans, idx); + struct btree_path *path = __trans_next_path(trans, idx); - while (iter && !__iter_has_node(iter, b)) - iter = __trans_next_iter(trans, iter->idx + 1); + while (path && !__path_has_node(path, b)) + path = __trans_next_path(trans, path->idx + 1); - return iter; + return path; } -#define trans_for_each_iter_with_node(_trans, _b, _iter) \ - for (_iter = __trans_next_iter_with_node((_trans), (_b), 0); \ - (_iter); \ - _iter = __trans_next_iter_with_node((_trans), (_b), \ - (_iter)->idx + 1)) +#define trans_for_each_path_with_node(_trans, _b, _path) \ + for (_path = __trans_next_path_with_node((_trans), (_b), 0); \ + (_path); \ + _path = __trans_next_path_with_node((_trans), (_b), \ + (_path)->idx + 1)) + +struct btree_path * __must_check +bch2_btree_path_make_mut(struct btree_trans *, struct btree_path *, + bool, unsigned long); +int __must_check bch2_btree_path_traverse(struct btree_trans *, + struct btree_path *, unsigned); +struct btree_path *bch2_path_get(struct btree_trans *, bool, enum btree_id, + struct bpos, unsigned, unsigned, bool, + unsigned long); +inline struct bkey_s_c bch2_btree_path_peek_slot(struct btree_path *, struct bkey *); #ifdef CONFIG_BCACHEFS_DEBUG -void bch2_btree_trans_verify_iters(struct btree_trans *, struct btree *); -void bch2_btree_trans_verify_locks(struct btree_trans *); +void bch2_trans_verify_paths(struct btree_trans *); +void bch2_trans_verify_locks(struct btree_trans *); +void bch2_assert_pos_locked(struct btree_trans *, enum btree_id, + struct bpos, bool); #else -static inline void bch2_btree_trans_verify_iters(struct btree_trans *trans, - struct btree *b) {} -static inline void bch2_btree_trans_verify_locks(struct btree_trans *iter) {} +static inline void bch2_trans_verify_paths(struct btree_trans *trans) {} +static inline void bch2_trans_verify_locks(struct btree_trans *trans) {} +static inline void bch2_assert_pos_locked(struct btree_trans *trans, enum btree_id id, + struct bpos pos, bool key_cache) {} #endif -void bch2_btree_iter_fix_key_modified(struct btree_iter *, struct btree *, - struct bkey_packed *); -void bch2_btree_node_iter_fix(struct btree_iter *, struct btree *, - struct btree_node_iter *, struct bkey_packed *, - unsigned, unsigned); +void bch2_btree_path_fix_key_modified(struct btree_trans *trans, + struct btree *, struct bkey_packed *); +void bch2_btree_node_iter_fix(struct btree_trans *trans, struct btree_path *, + struct btree *, struct btree_node_iter *, + struct bkey_packed *, unsigned, unsigned); + +bool bch2_btree_path_relock_intent(struct btree_trans *, struct btree_path *); + +void bch2_path_put(struct btree_trans *, struct btree_path *, bool); bool bch2_trans_relock(struct btree_trans *); void bch2_trans_unlock(struct btree_trans *); -bool 
__bch2_btree_iter_upgrade(struct btree_iter *, unsigned); +__always_inline +static inline int btree_trans_restart(struct btree_trans *trans) +{ + trans->restarted = true; + bch2_trans_unlock(trans); + return -EINTR; +} + +bool bch2_btree_node_upgrade(struct btree_trans *, + struct btree_path *, unsigned); + +bool __bch2_btree_path_upgrade(struct btree_trans *, + struct btree_path *, unsigned); -static inline bool bch2_btree_iter_upgrade(struct btree_iter *iter, +static inline bool bch2_btree_path_upgrade(struct btree_trans *trans, + struct btree_path *path, unsigned new_locks_want) { new_locks_want = min(new_locks_want, BTREE_MAX_DEPTH); - return iter->locks_want < new_locks_want - ? __bch2_btree_iter_upgrade(iter, new_locks_want) - : iter->uptodate <= BTREE_ITER_NEED_PEEK; + return path->locks_want < new_locks_want + ? __bch2_btree_path_upgrade(trans, path, new_locks_want) + : path->uptodate == BTREE_ITER_UPTODATE; } -void __bch2_btree_iter_downgrade(struct btree_iter *, unsigned); +void __bch2_btree_path_downgrade(struct btree_path *, unsigned); -static inline void bch2_btree_iter_downgrade(struct btree_iter *iter) +static inline void bch2_btree_path_downgrade(struct btree_path *path) { - unsigned new_locks_want = (iter->flags & BTREE_ITER_INTENT ? 1 : 0); + unsigned new_locks_want = path->level + !!path->intent_ref; - if (iter->locks_want > new_locks_want) - __bch2_btree_iter_downgrade(iter, new_locks_want); + if (path->locks_want > new_locks_want) + __bch2_btree_path_downgrade(path, new_locks_want); } void bch2_trans_downgrade(struct btree_trans *); -void bch2_btree_iter_node_replace(struct btree_iter *, struct btree *); -void bch2_btree_iter_node_drop(struct btree_iter *, struct btree *); - -void bch2_btree_iter_reinit_node(struct btree_iter *, struct btree *); +void bch2_trans_node_add(struct btree_trans *trans, struct btree *); +void bch2_trans_node_reinit_iter(struct btree_trans *, struct btree *); +int __must_check __bch2_btree_iter_traverse(struct btree_iter *iter); int __must_check bch2_btree_iter_traverse(struct btree_iter *); -int bch2_btree_iter_traverse_all(struct btree_trans *); - struct btree *bch2_btree_iter_peek_node(struct btree_iter *); struct btree *bch2_btree_iter_next_node(struct btree_iter *); struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *); struct bkey_s_c bch2_btree_iter_next(struct btree_iter *); -struct bkey_s_c bch2_btree_iter_peek_with_updates(struct btree_iter *); -struct bkey_s_c bch2_btree_iter_next_with_updates(struct btree_iter *); - struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *); struct bkey_s_c bch2_btree_iter_prev(struct btree_iter *); @@ -163,8 +220,6 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *); struct bkey_s_c bch2_btree_iter_next_slot(struct btree_iter *); struct bkey_s_c bch2_btree_iter_prev_slot(struct btree_iter *); -struct bkey_s_c bch2_btree_iter_peek_cached(struct btree_iter *); - bool bch2_btree_iter_advance(struct btree_iter *); bool bch2_btree_iter_rewind(struct btree_iter *); @@ -178,151 +233,130 @@ static inline void bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos iter->k.p.offset = iter->pos.offset = new_pos.offset; iter->k.p.snapshot = iter->pos.snapshot = new_pos.snapshot; iter->k.size = 0; - iter->should_be_locked = false; } -/* Sort order for locking btree iterators: */ -static inline int btree_iter_lock_cmp(const struct btree_iter *l, - const struct btree_iter *r) +static inline void bch2_btree_iter_set_pos_to_extent_start(struct btree_iter *iter) { - return 
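/*
 * Illustrative sketch, not part of the patch: how helpers are expected to
 * use btree_trans_restart() - a failed relock marks the transaction
 * restarted and returns -EINTR, which unwinds to the caller's
 * bch2_trans_begin() loop. Function name is an assumption.
 */
static int example_relock_or_restart(struct btree_trans *trans,
				     struct btree_path *path, unsigned level)
{
	if (!bch2_btree_node_relock(trans, path, level))
		return btree_trans_restart(trans);	/* -EINTR */

	return 0;
}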
cmp_int(l->btree_id, r->btree_id) ?: - -cmp_int(btree_iter_is_cached(l), btree_iter_is_cached(r)) ?: - bkey_cmp(l->real_pos, r->real_pos); + BUG_ON(!(iter->flags & BTREE_ITER_IS_EXTENTS)); + iter->pos = bkey_start_pos(&iter->k); } -/* - * Unlocks before scheduling - * Note: does not revalidate iterator - */ -static inline int bch2_trans_cond_resched(struct btree_trans *trans) +static inline void bch2_btree_iter_set_snapshot(struct btree_iter *iter, u32 snapshot) { - if (need_resched() || race_fault()) { - bch2_trans_unlock(trans); - schedule(); - return bch2_trans_relock(trans) ? 0 : -EINTR; - } else { - return 0; - } -} + struct bpos pos = iter->pos; -#define __for_each_btree_node(_trans, _iter, _btree_id, _start, \ - _locks_want, _depth, _flags, _b) \ - for (iter = bch2_trans_get_node_iter((_trans), (_btree_id), \ - _start, _locks_want, _depth, _flags), \ - _b = bch2_btree_iter_peek_node(_iter); \ - (_b); \ - (_b) = bch2_btree_iter_next_node(_iter)) - -#define for_each_btree_node(_trans, _iter, _btree_id, _start, \ - _flags, _b) \ - __for_each_btree_node(_trans, _iter, _btree_id, _start, \ - 0, 0, _flags, _b) - -static inline struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, - unsigned flags) -{ - if ((flags & BTREE_ITER_TYPE) == BTREE_ITER_CACHED) - return bch2_btree_iter_peek_cached(iter); - else - return flags & BTREE_ITER_SLOTS - ? bch2_btree_iter_peek_slot(iter) - : bch2_btree_iter_peek(iter); + iter->snapshot = snapshot; + pos.snapshot = snapshot; + bch2_btree_iter_set_pos(iter, pos); } -static inline struct bkey_s_c __bch2_btree_iter_next(struct btree_iter *iter, - unsigned flags) -{ - return flags & BTREE_ITER_SLOTS - ? bch2_btree_iter_next_slot(iter) - : bch2_btree_iter_next(iter); -} +void bch2_trans_iter_exit(struct btree_trans *, struct btree_iter *); +void bch2_trans_iter_init(struct btree_trans *, struct btree_iter *, + unsigned, struct bpos, unsigned); +void bch2_trans_node_iter_init(struct btree_trans *, struct btree_iter *, + enum btree_id, struct bpos, + unsigned, unsigned, unsigned); +void bch2_trans_copy_iter(struct btree_iter *, struct btree_iter *); -static inline int bkey_err(struct bkey_s_c k) +static inline void set_btree_iter_dontneed(struct btree_iter *iter) { - return PTR_ERR_OR_ZERO(k.k); + iter->path->preserve = false; } -#define for_each_btree_key(_trans, _iter, _btree_id, \ - _start, _flags, _k, _ret) \ - for ((_iter) = bch2_trans_get_iter((_trans), (_btree_id), \ - (_start), (_flags)), \ - (_k) = __bch2_btree_iter_peek(_iter, _flags); \ - !((_ret) = bkey_err(_k)) && (_k).k; \ - (_k) = __bch2_btree_iter_next(_iter, _flags)) +void *bch2_trans_kmalloc(struct btree_trans *, size_t); +void bch2_trans_begin(struct btree_trans *); -#define for_each_btree_key_continue(_iter, _flags, _k, _ret) \ - for ((_k) = __bch2_btree_iter_peek(_iter, _flags); \ - !((_ret) = bkey_err(_k)) && (_k).k; \ - (_k) = __bch2_btree_iter_next(_iter, _flags)) +static inline struct btree * +__btree_iter_peek_node_and_restart(struct btree_trans *trans, struct btree_iter *iter) +{ + struct btree *b; -/* new multiple iterator interface: */ + while (b = bch2_btree_iter_peek_node(iter), + PTR_ERR_OR_ZERO(b) == -EINTR) + bch2_trans_begin(trans); -int bch2_trans_iter_put(struct btree_trans *, struct btree_iter *); -int bch2_trans_iter_free(struct btree_trans *, struct btree_iter *); + return b; +} -void bch2_trans_unlink_iters(struct btree_trans *); +#define __for_each_btree_node(_trans, _iter, _btree_id, _start, \ + _locks_want, _depth, _flags, _b, _ret) \ + for 
(bch2_trans_node_iter_init((_trans), &(_iter), (_btree_id), \ + _start, _locks_want, _depth, _flags); \ + (_b) = __btree_iter_peek_node_and_restart((_trans), &(_iter)),\ + !((_ret) = PTR_ERR_OR_ZERO(_b)) && (_b); \ + (_b) = bch2_btree_iter_next_node(&(_iter))) -struct btree_iter *__bch2_trans_get_iter(struct btree_trans *, enum btree_id, - struct bpos, unsigned, - unsigned, unsigned); +#define for_each_btree_node(_trans, _iter, _btree_id, _start, \ + _flags, _b, _ret) \ + __for_each_btree_node(_trans, _iter, _btree_id, _start, \ + 0, 0, _flags, _b, _ret) -static inline struct btree_iter * -bch2_trans_get_iter(struct btree_trans *trans, enum btree_id btree_id, - struct bpos pos, unsigned flags) +static inline int bkey_err(struct bkey_s_c k) { - struct btree_iter *iter = - __bch2_trans_get_iter(trans, btree_id, pos, - (flags & BTREE_ITER_INTENT) != 0, 0, - flags); - iter->ip_allocated = _THIS_IP_; - return iter; + return PTR_ERR_OR_ZERO(k.k); } -struct btree_iter *__bch2_trans_copy_iter(struct btree_trans *, - struct btree_iter *); -static inline struct btree_iter * -bch2_trans_copy_iter(struct btree_trans *trans, struct btree_iter *src) +static inline struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, + unsigned flags) { - struct btree_iter *iter = - __bch2_trans_copy_iter(trans, src); - - iter->ip_allocated = _THIS_IP_; - return iter; + return flags & BTREE_ITER_SLOTS + ? bch2_btree_iter_peek_slot(iter) + : bch2_btree_iter_peek(iter); } -struct btree_iter *bch2_trans_get_node_iter(struct btree_trans *, - enum btree_id, struct bpos, - unsigned, unsigned, unsigned); - -static inline bool btree_iter_live(struct btree_trans *trans, struct btree_iter *iter) +static inline int btree_trans_too_many_iters(struct btree_trans *trans) { - return (trans->iters_live & (1ULL << iter->idx)) != 0; + return hweight64(trans->paths_allocated) > BTREE_ITER_MAX / 2 + ? 
-EINTR : 0; } -static inline bool btree_iter_keep(struct btree_trans *trans, struct btree_iter *iter) +static inline struct bkey_s_c +__bch2_btree_iter_peek_and_restart(struct btree_trans *trans, + struct btree_iter *iter, unsigned flags) { - return btree_iter_live(trans, iter) || - (iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT); -} + struct bkey_s_c k; -static inline void set_btree_iter_dontneed(struct btree_trans *trans, struct btree_iter *iter) -{ - trans->iters_touched &= ~(1ULL << iter->idx); + while (btree_trans_too_many_iters(trans) || + (k = __bch2_btree_iter_peek(iter, flags), + bkey_err(k) == -EINTR)) + bch2_trans_begin(trans); + + return k; } -#define TRANS_RESET_NOTRAVERSE (1 << 0) -#define TRANS_RESET_NOUNLOCK (1 << 1) +#define for_each_btree_key(_trans, _iter, _btree_id, \ + _start, _flags, _k, _ret) \ + for (bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \ + (_start), (_flags)); \ + (_k) = __bch2_btree_iter_peek_and_restart((_trans), &(_iter), _flags),\ + !((_ret) = bkey_err(_k)) && (_k).k; \ + bch2_btree_iter_advance(&(_iter))) + +#define for_each_btree_key_norestart(_trans, _iter, _btree_id, \ + _start, _flags, _k, _ret) \ + for (bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \ + (_start), (_flags)); \ + (_k) = __bch2_btree_iter_peek(&(_iter), _flags), \ + !((_ret) = bkey_err(_k)) && (_k).k; \ + bch2_btree_iter_advance(&(_iter))) -void bch2_trans_reset(struct btree_trans *, unsigned); +#define for_each_btree_key_continue(_trans, _iter, _flags, _k, _ret) \ + for (; \ + (_k) = __bch2_btree_iter_peek_and_restart((_trans), &(_iter), _flags),\ + !((_ret) = bkey_err(_k)) && (_k).k; \ + bch2_btree_iter_advance(&(_iter))) -static inline void bch2_trans_begin(struct btree_trans *trans) -{ - return bch2_trans_reset(trans, 0); -} +#define for_each_btree_key_continue_norestart(_iter, _flags, _k, _ret) \ + for (; \ + (_k) = __bch2_btree_iter_peek(&(_iter), _flags), \ + !((_ret) = bkey_err(_k)) && (_k).k; \ + bch2_btree_iter_advance(&(_iter))) -void *bch2_trans_kmalloc(struct btree_trans *, size_t); +/* new multiple iterator interface: */ + +void bch2_dump_trans_paths_updates(struct btree_trans *); void bch2_trans_init(struct btree_trans *, struct bch_fs *, unsigned, size_t); -int bch2_trans_exit(struct btree_trans *); +void bch2_trans_exit(struct btree_trans *); void bch2_btree_trans_to_text(struct printbuf *, struct bch_fs *); diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c index 017c4f55fdaa..4f1bc1d165aa 100644 --- a/fs/bcachefs/btree_key_cache.c +++ b/fs/bcachefs/btree_key_cache.c @@ -163,6 +163,11 @@ btree_key_cache_create(struct btree_key_cache *c, was_new = false; } + if (btree_id == BTREE_ID_subvolumes) + six_lock_pcpu_alloc(&ck->c.lock); + else + six_lock_pcpu_free(&ck->c.lock); + ck->c.level = 0; ck->c.btree_id = btree_id; ck->key.btree_id = btree_id; @@ -196,25 +201,25 @@ btree_key_cache_create(struct btree_key_cache *c, } static int btree_key_cache_fill(struct btree_trans *trans, - struct btree_iter *ck_iter, + struct btree_path *ck_path, struct bkey_cached *ck) { - struct btree_iter *iter; + struct btree_iter iter; struct bkey_s_c k; unsigned new_u64s = 0; struct bkey_i *new_k = NULL; int ret; - iter = bch2_trans_get_iter(trans, ck->key.btree_id, - ck->key.pos, BTREE_ITER_SLOTS); - k = bch2_btree_iter_peek_slot(iter); + bch2_trans_iter_init(trans, &iter, ck->key.btree_id, + ck->key.pos, BTREE_ITER_SLOTS); + k = bch2_btree_iter_peek_slot(&iter); ret = bkey_err(k); if (ret) goto err; - if (!bch2_btree_node_relock(ck_iter, 0)) { + if 
(!bch2_btree_node_relock(trans, ck_path, 0)) { trace_transaction_restart_ip(trans->ip, _THIS_IP_); - ret = -EINTR; + ret = btree_trans_restart(trans); goto err; } @@ -233,7 +238,11 @@ static int btree_key_cache_fill(struct btree_trans *trans, } } - bch2_btree_node_lock_write(ck_iter->l[0].b, ck_iter); + /* + * XXX: not allowed to be holding read locks when we take a write lock, + * currently + */ + bch2_btree_node_lock_write(trans, ck_path, ck_path->l[0].b); if (new_k) { kfree(ck->k); ck->u64s = new_u64s; @@ -242,93 +251,93 @@ static int btree_key_cache_fill(struct btree_trans *trans, bkey_reassemble(ck->k, k); ck->valid = true; - bch2_btree_node_unlock_write(ck_iter->l[0].b, ck_iter); + bch2_btree_node_unlock_write(trans, ck_path, ck_path->l[0].b); /* We're not likely to need this iterator again: */ - set_btree_iter_dontneed(trans, iter); + set_btree_iter_dontneed(&iter); err: - bch2_trans_iter_put(trans, iter); + bch2_trans_iter_exit(trans, &iter); return ret; } static int bkey_cached_check_fn(struct six_lock *lock, void *p) { struct bkey_cached *ck = container_of(lock, struct bkey_cached, c.lock); - const struct btree_iter *iter = p; + const struct btree_path *path = p; - return ck->key.btree_id == iter->btree_id && - !bpos_cmp(ck->key.pos, iter->pos) ? 0 : -1; + return ck->key.btree_id == path->btree_id && + !bpos_cmp(ck->key.pos, path->pos) ? 0 : -1; } __flatten -int bch2_btree_iter_traverse_cached(struct btree_iter *iter) +int bch2_btree_path_traverse_cached(struct btree_trans *trans, struct btree_path *path, + unsigned flags) { - struct btree_trans *trans = iter->trans; struct bch_fs *c = trans->c; struct bkey_cached *ck; int ret = 0; - BUG_ON(iter->level); + BUG_ON(path->level); + + path->l[1].b = NULL; - if (btree_node_locked(iter, 0)) { - ck = (void *) iter->l[0].b; + if (bch2_btree_node_relock(trans, path, 0)) { + ck = (void *) path->l[0].b; goto fill; } retry: - ck = bch2_btree_key_cache_find(c, iter->btree_id, iter->pos); + ck = bch2_btree_key_cache_find(c, path->btree_id, path->pos); if (!ck) { - if (iter->flags & BTREE_ITER_CACHED_NOCREATE) { - iter->l[0].b = NULL; + if (flags & BTREE_ITER_CACHED_NOCREATE) { + path->l[0].b = NULL; return 0; } ck = btree_key_cache_create(&c->btree_key_cache, - iter->btree_id, iter->pos); + path->btree_id, path->pos); ret = PTR_ERR_OR_ZERO(ck); if (ret) goto err; if (!ck) goto retry; - mark_btree_node_locked(iter, 0, SIX_LOCK_intent); - iter->locks_want = 1; + mark_btree_node_locked(path, 0, SIX_LOCK_intent); + path->locks_want = 1; } else { - enum six_lock_type lock_want = __btree_lock_want(iter, 0); + enum six_lock_type lock_want = __btree_lock_want(path, 0); - if (!btree_node_lock((void *) ck, iter->pos, 0, iter, lock_want, - bkey_cached_check_fn, iter, _THIS_IP_)) { - if (ck->key.btree_id != iter->btree_id || - bpos_cmp(ck->key.pos, iter->pos)) { + if (!btree_node_lock(trans, path, (void *) ck, path->pos, 0, + lock_want, + bkey_cached_check_fn, path, _THIS_IP_)) { + if (!trans->restarted) goto retry; - } trace_transaction_restart_ip(trans->ip, _THIS_IP_); ret = -EINTR; goto err; } - if (ck->key.btree_id != iter->btree_id || - bpos_cmp(ck->key.pos, iter->pos)) { + if (ck->key.btree_id != path->btree_id || + bpos_cmp(ck->key.pos, path->pos)) { six_unlock_type(&ck->c.lock, lock_want); goto retry; } - mark_btree_node_locked(iter, 0, lock_want); + mark_btree_node_locked(path, 0, lock_want); } - iter->l[0].lock_seq = ck->c.lock.state.seq; - iter->l[0].b = (void *) ck; + path->l[0].lock_seq = ck->c.lock.state.seq; + path->l[0].b = (void *) ck; 
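	/*
	 * At this point the bkey_cached is locked and installed as the
	 * path's level 0 "node".  If it doesn't hold valid data yet, and
	 * the caller didn't pass BTREE_ITER_CACHED_NOFILL, it's filled from
	 * the underlying btree below - which first requires upgrading to an
	 * intent lock, restarting the transaction if that fails:
	 */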
fill: - if (!ck->valid && !(iter->flags & BTREE_ITER_CACHED_NOFILL)) { - if (!btree_node_intent_locked(iter, 0)) - bch2_btree_iter_upgrade(iter, 1); - if (!btree_node_intent_locked(iter, 0)) { + if (!ck->valid && !(flags & BTREE_ITER_CACHED_NOFILL)) { + if (!path->locks_want && + !__bch2_btree_path_upgrade(trans, path, 1)) { trace_transaction_restart_ip(trans->ip, _THIS_IP_); - ret = -EINTR; + ret = btree_trans_restart(trans); goto err; } - ret = btree_key_cache_fill(trans, iter, ck); + ret = btree_key_cache_fill(trans, path, ck); if (ret) goto err; } @@ -336,21 +345,14 @@ fill: if (!test_bit(BKEY_CACHED_ACCESSED, &ck->flags)) set_bit(BKEY_CACHED_ACCESSED, &ck->flags); - iter->uptodate = BTREE_ITER_NEED_PEEK; - - if (!(iter->flags & BTREE_ITER_INTENT)) - bch2_btree_iter_downgrade(iter); - else if (!iter->locks_want) { - if (!__bch2_btree_iter_upgrade(iter, 1)) - ret = -EINTR; - } + path->uptodate = BTREE_ITER_UPTODATE; + BUG_ON(btree_node_locked_type(path, 0) != btree_lock_want(path, 0)); return ret; err: if (ret != -EINTR) { - btree_node_unlock(iter, 0); - iter->flags |= BTREE_ITER_ERROR; - iter->l[0].b = BTREE_ITER_NO_NODE_ERROR; + btree_node_unlock(path, 0); + path->l[0].b = BTREE_ITER_NO_NODE_ERROR; } return ret; } @@ -363,24 +365,24 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans, { struct bch_fs *c = trans->c; struct journal *j = &c->journal; - struct btree_iter *c_iter = NULL, *b_iter = NULL; + struct btree_iter c_iter, b_iter; struct bkey_cached *ck = NULL; int ret; - b_iter = bch2_trans_get_iter(trans, key.btree_id, key.pos, - BTREE_ITER_SLOTS| - BTREE_ITER_INTENT); - c_iter = bch2_trans_get_iter(trans, key.btree_id, key.pos, - BTREE_ITER_CACHED| - BTREE_ITER_CACHED_NOFILL| - BTREE_ITER_CACHED_NOCREATE| - BTREE_ITER_INTENT); -retry: - ret = bch2_btree_iter_traverse(c_iter); + bch2_trans_iter_init(trans, &b_iter, key.btree_id, key.pos, + BTREE_ITER_SLOTS| + BTREE_ITER_INTENT| + BTREE_ITER_ALL_SNAPSHOTS); + bch2_trans_iter_init(trans, &c_iter, key.btree_id, key.pos, + BTREE_ITER_CACHED| + BTREE_ITER_CACHED_NOFILL| + BTREE_ITER_CACHED_NOCREATE| + BTREE_ITER_INTENT); + ret = bch2_btree_iter_traverse(&c_iter); if (ret) - goto err; + goto out; - ck = (void *) c_iter->l[0].b; + ck = (void *) c_iter.path->l[0].b; if (!ck || (journal_seq && ck->journal.seq != journal_seq)) goto out; @@ -396,10 +398,11 @@ retry: * allocator/copygc depend on journal reclaim making progress, we need * to be using alloc reserves: * */ - ret = bch2_btree_iter_traverse(b_iter) ?: - bch2_trans_update(trans, b_iter, ck->k, BTREE_TRIGGER_NORUN) ?: + ret = bch2_btree_iter_traverse(&b_iter) ?: + bch2_trans_update(trans, &b_iter, ck->k, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE| + BTREE_TRIGGER_NORUN) ?: bch2_trans_commit(trans, NULL, NULL, - BTREE_INSERT_NOUNLOCK| BTREE_INSERT_NOCHECK_RW| BTREE_INSERT_NOFAIL| BTREE_INSERT_USE_RESERVE| @@ -407,15 +410,10 @@ retry: ? 
BTREE_INSERT_JOURNAL_RESERVED : 0)| commit_flags); -err: - if (ret == -EINTR) - goto retry; - - if (ret == -EAGAIN) - goto out; - if (ret) { - bch2_fs_fatal_err_on(!bch2_journal_error(j), c, + bch2_fs_fatal_err_on(ret != -EINTR && + ret != -EAGAIN && + !bch2_journal_error(j), c, "error flushing key cache: %i", ret); goto out; } @@ -423,7 +421,7 @@ err: bch2_journal_pin_drop(j, &ck->journal); bch2_journal_preres_put(j, &ck->res); - BUG_ON(!btree_node_locked(c_iter, 0)); + BUG_ON(!btree_node_locked(c_iter.path, 0)); if (!evict) { if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { @@ -432,10 +430,10 @@ err: } } else { evict: - BUG_ON(!btree_node_intent_locked(c_iter, 0)); + BUG_ON(!btree_node_intent_locked(c_iter.path, 0)); - mark_btree_node_unlocked(c_iter, 0); - c_iter->l[0].b = NULL; + mark_btree_node_unlocked(c_iter.path, 0); + c_iter.path->l[0].b = NULL; six_lock_write(&ck->c.lock, NULL, NULL); @@ -451,8 +449,8 @@ evict: mutex_unlock(&c->btree_key_cache.lock); } out: - bch2_trans_iter_put(trans, b_iter); - bch2_trans_iter_put(trans, c_iter); + bch2_trans_iter_exit(trans, &b_iter); + bch2_trans_iter_exit(trans, &c_iter); return ret; } @@ -463,7 +461,6 @@ int bch2_btree_key_cache_journal_flush(struct journal *j, struct bkey_cached *ck = container_of(pin, struct bkey_cached, journal); struct bkey_cached_key key; - struct btree_trans trans; int ret = 0; int srcu_idx = srcu_read_lock(&c->btree_trans_barrier); @@ -478,10 +475,9 @@ int bch2_btree_key_cache_journal_flush(struct journal *j, } six_unlock_read(&ck->c.lock); - bch2_trans_init(&trans, c, 0, 0); - ret = btree_key_cache_flush_pos(&trans, key, seq, - BTREE_INSERT_JOURNAL_RECLAIM, false); - bch2_trans_exit(&trans); + ret = bch2_trans_do(c, NULL, NULL, 0, + btree_key_cache_flush_pos(&trans, key, seq, + BTREE_INSERT_JOURNAL_RECLAIM, false)); unlock: srcu_read_unlock(&c->btree_trans_barrier, srcu_idx); @@ -505,11 +501,11 @@ int bch2_btree_key_cache_flush(struct btree_trans *trans, } bool bch2_btree_insert_key_cached(struct btree_trans *trans, - struct btree_iter *iter, + struct btree_path *path, struct bkey_i *insert) { struct bch_fs *c = trans->c; - struct bkey_cached *ck = (void *) iter->l[0].b; + struct bkey_cached *ck = (void *) path->l[0].b; bool kick_reclaim = false; BUG_ON(insert->u64s > ck->u64s); @@ -602,7 +598,7 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink, do { struct rhash_head *pos, *next; - pos = *rht_bucket(tbl, bc->shrink_iter); + pos = rht_ptr_rcu(rht_bucket(tbl, bc->shrink_iter)); while (!rht_is_a_nulls(pos)) { next = rht_dereference_bucket_rcu(pos->next, tbl, bc->shrink_iter); diff --git a/fs/bcachefs/btree_key_cache.h b/fs/bcachefs/btree_key_cache.h index 7e2b0a08f745..0768ef3ca776 100644 --- a/fs/bcachefs/btree_key_cache.h +++ b/fs/bcachefs/btree_key_cache.h @@ -26,10 +26,11 @@ int bch2_btree_key_cache_journal_flush(struct journal *, struct bkey_cached * bch2_btree_key_cache_find(struct bch_fs *, enum btree_id, struct bpos); -int bch2_btree_iter_traverse_cached(struct btree_iter *); +int bch2_btree_path_traverse_cached(struct btree_trans *, struct btree_path *, + unsigned); bool bch2_btree_insert_key_cached(struct btree_trans *, - struct btree_iter *, struct bkey_i *); + struct btree_path *, struct bkey_i *); int bch2_btree_key_cache_flush(struct btree_trans *, enum btree_id, struct bpos); #ifdef CONFIG_BCACHEFS_DEBUG diff --git a/fs/bcachefs/btree_locking.h b/fs/bcachefs/btree_locking.h index 7532bcdef967..d599008c5fc1 100644 --- a/fs/bcachefs/btree_locking.h +++ b/fs/bcachefs/btree_locking.h 
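The btree_locking.h hunks that follow are the same mechanical iter-to-path conversion: every helper that took a struct btree_iter now takes a struct btree_path (plus, where it needs to restart or walk sibling paths, the btree_trans). The per-level lock state is packed into two bitmasks, nodes_locked and nodes_intent_locked, and decoded without branches. A minimal standalone sketch of that encoding - fake_path and locked_type are illustrative stand-ins, not the kernel's names:

#include <stdio.h>

enum locked_type { UNLOCKED = 0, READ_LOCKED = 1, INTENT_LOCKED = 2 };

struct fake_path {
	unsigned nodes_locked;		/* bit n set: level n holds some lock */
	unsigned nodes_intent_locked;	/* bit n also set: that lock is intent */
};

/* Branch-free decode, as in btree_node_locked_type() below: */
static enum locked_type locked_type(const struct fake_path *p, unsigned level)
{
	return UNLOCKED +
	       ((p->nodes_locked >> level) & 1) +
	       ((p->nodes_intent_locked >> level) & 1);
}

int main(void)
{
	/* level 0: read locked, level 1: intent locked, level 2: unlocked */
	struct fake_path p = { .nodes_locked = 0x3, .nodes_intent_locked = 0x2 };

	printf("%d %d %d\n",
	       locked_type(&p, 0),	/* prints 1 */
	       locked_type(&p, 1),	/* prints 2 */
	       locked_type(&p, 2));	/* prints 0 */
	return 0;
}
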
@@ -21,7 +21,7 @@ enum btree_node_locked_type { BTREE_NODE_INTENT_LOCKED = SIX_LOCK_intent, }; -static inline int btree_node_locked_type(struct btree_iter *iter, +static inline int btree_node_locked_type(struct btree_path *path, unsigned level) { /* @@ -30,35 +30,35 @@ static inline int btree_node_locked_type(struct btree_iter *iter, * branches: */ return BTREE_NODE_UNLOCKED + - ((iter->nodes_locked >> level) & 1) + - ((iter->nodes_intent_locked >> level) & 1); + ((path->nodes_locked >> level) & 1) + + ((path->nodes_intent_locked >> level) & 1); } -static inline bool btree_node_intent_locked(struct btree_iter *iter, +static inline bool btree_node_intent_locked(struct btree_path *path, unsigned level) { - return btree_node_locked_type(iter, level) == BTREE_NODE_INTENT_LOCKED; + return btree_node_locked_type(path, level) == BTREE_NODE_INTENT_LOCKED; } -static inline bool btree_node_read_locked(struct btree_iter *iter, +static inline bool btree_node_read_locked(struct btree_path *path, unsigned level) { - return btree_node_locked_type(iter, level) == BTREE_NODE_READ_LOCKED; + return btree_node_locked_type(path, level) == BTREE_NODE_READ_LOCKED; } -static inline bool btree_node_locked(struct btree_iter *iter, unsigned level) +static inline bool btree_node_locked(struct btree_path *path, unsigned level) { - return iter->nodes_locked & (1 << level); + return path->nodes_locked & (1 << level); } -static inline void mark_btree_node_unlocked(struct btree_iter *iter, +static inline void mark_btree_node_unlocked(struct btree_path *path, unsigned level) { - iter->nodes_locked &= ~(1 << level); - iter->nodes_intent_locked &= ~(1 << level); + path->nodes_locked &= ~(1 << level); + path->nodes_intent_locked &= ~(1 << level); } -static inline void mark_btree_node_locked(struct btree_iter *iter, +static inline void mark_btree_node_locked(struct btree_path *path, unsigned level, enum six_lock_type type) { @@ -66,52 +66,52 @@ static inline void mark_btree_node_locked(struct btree_iter *iter, BUILD_BUG_ON(SIX_LOCK_read != 0); BUILD_BUG_ON(SIX_LOCK_intent != 1); - iter->nodes_locked |= 1 << level; - iter->nodes_intent_locked |= type << level; + path->nodes_locked |= 1 << level; + path->nodes_intent_locked |= type << level; } -static inline void mark_btree_node_intent_locked(struct btree_iter *iter, +static inline void mark_btree_node_intent_locked(struct btree_path *path, unsigned level) { - mark_btree_node_locked(iter, level, SIX_LOCK_intent); + mark_btree_node_locked(path, level, SIX_LOCK_intent); } -static inline enum six_lock_type __btree_lock_want(struct btree_iter *iter, int level) +static inline enum six_lock_type __btree_lock_want(struct btree_path *path, int level) { - return level < iter->locks_want + return level < path->locks_want ? 
SIX_LOCK_intent : SIX_LOCK_read; } static inline enum btree_node_locked_type -btree_lock_want(struct btree_iter *iter, int level) +btree_lock_want(struct btree_path *path, int level) { - if (level < iter->level) + if (level < path->level) return BTREE_NODE_UNLOCKED; - if (level < iter->locks_want) + if (level < path->locks_want) return BTREE_NODE_INTENT_LOCKED; - if (level == iter->level) + if (level == path->level) return BTREE_NODE_READ_LOCKED; return BTREE_NODE_UNLOCKED; } -static inline void btree_node_unlock(struct btree_iter *iter, unsigned level) +static inline void btree_node_unlock(struct btree_path *path, unsigned level) { - int lock_type = btree_node_locked_type(iter, level); + int lock_type = btree_node_locked_type(path, level); EBUG_ON(level >= BTREE_MAX_DEPTH); if (lock_type != BTREE_NODE_UNLOCKED) - six_unlock_type(&iter->l[level].b->c.lock, lock_type); - mark_btree_node_unlocked(iter, level); + six_unlock_type(&path->l[level].b->c.lock, lock_type); + mark_btree_node_unlocked(path, level); } -static inline void __bch2_btree_iter_unlock(struct btree_iter *iter) +static inline void __bch2_btree_path_unlock(struct btree_path *path) { - btree_iter_set_dirty(iter, BTREE_ITER_NEED_RELOCK); + btree_path_set_dirty(path, BTREE_ITER_NEED_RELOCK); - while (iter->nodes_locked) - btree_node_unlock(iter, __ffs(iter->nodes_locked)); + while (path->nodes_locked) + btree_node_unlock(path, __ffs(path->nodes_locked)); } static inline enum bch_time_stats lock_to_time_stat(enum six_lock_type type) @@ -155,11 +155,11 @@ static inline bool btree_node_lock_increment(struct btree_trans *trans, struct btree *b, unsigned level, enum btree_node_locked_type want) { - struct btree_iter *iter; + struct btree_path *path; - trans_for_each_iter(trans, iter) - if (iter->l[level].b == b && - btree_node_locked_type(iter, level) >= want) { + trans_for_each_path(trans, path) + if (path->l[level].b == b && + btree_node_locked_type(path, level) >= want) { six_lock_increment(&b->c.lock, want); return true; } @@ -167,40 +167,39 @@ static inline bool btree_node_lock_increment(struct btree_trans *trans, return false; } -bool __bch2_btree_node_lock(struct btree *, struct bpos, unsigned, - struct btree_iter *, enum six_lock_type, +bool __bch2_btree_node_lock(struct btree_trans *, struct btree_path *, + struct btree *, struct bpos, unsigned, + enum six_lock_type, six_lock_should_sleep_fn, void *, unsigned long); -static inline bool btree_node_lock(struct btree *b, - struct bpos pos, unsigned level, - struct btree_iter *iter, +static inline bool btree_node_lock(struct btree_trans *trans, + struct btree_path *path, + struct btree *b, struct bpos pos, unsigned level, enum six_lock_type type, six_lock_should_sleep_fn should_sleep_fn, void *p, unsigned long ip) { - struct btree_trans *trans = iter->trans; - EBUG_ON(level >= BTREE_MAX_DEPTH); - EBUG_ON(!(trans->iters_linked & (1ULL << iter->idx))); + EBUG_ON(!(trans->paths_allocated & (1ULL << path->idx))); return likely(six_trylock_type(&b->c.lock, type)) || btree_node_lock_increment(trans, b, level, type) || - __bch2_btree_node_lock(b, pos, level, iter, type, + __bch2_btree_node_lock(trans, path, b, pos, level, type, should_sleep_fn, p, ip); } -bool __bch2_btree_node_relock(struct btree_iter *, unsigned); +bool __bch2_btree_node_relock(struct btree_trans *, struct btree_path *, unsigned); -static inline bool bch2_btree_node_relock(struct btree_iter *iter, - unsigned level) +static inline bool bch2_btree_node_relock(struct btree_trans *trans, + struct btree_path *path, 
unsigned level) { - EBUG_ON(btree_node_locked(iter, level) && - btree_node_locked_type(iter, level) != - __btree_lock_want(iter, level)); + EBUG_ON(btree_node_locked(path, level) && + btree_node_locked_type(path, level) != + __btree_lock_want(path, level)); - return likely(btree_node_locked(iter, level)) || - __bch2_btree_node_relock(iter, level); + return likely(btree_node_locked(path, level)) || + __bch2_btree_node_relock(trans, path, level); } /* @@ -208,30 +207,35 @@ static inline bool bch2_btree_node_relock(struct btree_iter *iter, * succeed: */ static inline void -bch2_btree_node_unlock_write_inlined(struct btree *b, struct btree_iter *iter) +bch2_btree_node_unlock_write_inlined(struct btree_trans *trans, struct btree_path *path, + struct btree *b) { - struct btree_iter *linked; + struct btree_path *linked; - EBUG_ON(iter->l[b->c.level].b != b); - EBUG_ON(iter->l[b->c.level].lock_seq + 1 != b->c.lock.state.seq); + EBUG_ON(path->l[b->c.level].b != b); + EBUG_ON(path->l[b->c.level].lock_seq + 1 != b->c.lock.state.seq); - trans_for_each_iter_with_node(iter->trans, b, linked) + trans_for_each_path_with_node(trans, b, linked) linked->l[b->c.level].lock_seq += 2; six_unlock_write(&b->c.lock); } -void bch2_btree_node_unlock_write(struct btree *, struct btree_iter *); +void bch2_btree_node_unlock_write(struct btree_trans *, + struct btree_path *, struct btree *); -void __bch2_btree_node_lock_write(struct btree *, struct btree_iter *); +void __bch2_btree_node_lock_write(struct btree_trans *, struct btree *); -static inline void bch2_btree_node_lock_write(struct btree *b, struct btree_iter *iter) +static inline void bch2_btree_node_lock_write(struct btree_trans *trans, + struct btree_path *path, + struct btree *b) { - EBUG_ON(iter->l[b->c.level].b != b); - EBUG_ON(iter->l[b->c.level].lock_seq != b->c.lock.state.seq); + EBUG_ON(path->l[b->c.level].b != b); + EBUG_ON(path->l[b->c.level].lock_seq != b->c.lock.state.seq); + EBUG_ON(!btree_node_intent_locked(path, b->c.level)); if (unlikely(!six_trylock_write(&b->c.lock))) - __bch2_btree_node_lock_write(b, iter); + __bch2_btree_node_lock_write(trans, b); } #endif /* _BCACHEFS_BTREE_LOCKING_H */ diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index bc0f482b53d2..2c2e2f794b8f 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -176,51 +176,45 @@ struct btree_node_iter { } data[MAX_BSETS]; }; -enum btree_iter_type { - BTREE_ITER_KEYS, - BTREE_ITER_NODES, - BTREE_ITER_CACHED, -}; - -#define BTREE_ITER_TYPE ((1 << 2) - 1) - /* * Iterate over all possible positions, synthesizing deleted keys for holes: */ -#define BTREE_ITER_SLOTS (1 << 2) +#define BTREE_ITER_SLOTS (1 << 0) /* * Indicates that intent locks should be taken on leaf nodes, because we expect * to be doing updates: */ -#define BTREE_ITER_INTENT (1 << 3) +#define BTREE_ITER_INTENT (1 << 1) /* * Causes the btree iterator code to prefetch additional btree nodes from disk: */ -#define BTREE_ITER_PREFETCH (1 << 4) +#define BTREE_ITER_PREFETCH (1 << 2) /* * Indicates that this iterator should not be reused until transaction commit, * either because a pending update references it or because the update depends * on that particular key being locked (e.g. 
by the str_hash code, for hash * table consistency) */ -#define BTREE_ITER_KEEP_UNTIL_COMMIT (1 << 5) +#define BTREE_ITER_KEEP_UNTIL_COMMIT (1 << 3) /* * Used in bch2_btree_iter_traverse(), to indicate whether we're searching for * @pos or the first key strictly greater than @pos */ -#define BTREE_ITER_IS_EXTENTS (1 << 6) -#define BTREE_ITER_ERROR (1 << 7) -#define BTREE_ITER_SET_POS_AFTER_COMMIT (1 << 8) -#define BTREE_ITER_CACHED_NOFILL (1 << 9) -#define BTREE_ITER_CACHED_NOCREATE (1 << 10) -#define BTREE_ITER_NOT_EXTENTS (1 << 11) +#define BTREE_ITER_IS_EXTENTS (1 << 4) +#define BTREE_ITER_NOT_EXTENTS (1 << 5) +#define BTREE_ITER_ERROR (1 << 6) +#define BTREE_ITER_CACHED (1 << 7) +#define BTREE_ITER_CACHED_NOFILL (1 << 8) +#define BTREE_ITER_CACHED_NOCREATE (1 << 9) +#define BTREE_ITER_WITH_UPDATES (1 << 10) +#define __BTREE_ITER_ALL_SNAPSHOTS (1 << 11) #define BTREE_ITER_ALL_SNAPSHOTS (1 << 12) +#define BTREE_ITER_FILTER_SNAPSHOTS (1 << 13) -enum btree_iter_uptodate { +enum btree_path_uptodate { BTREE_ITER_UPTODATE = 0, - BTREE_ITER_NEED_PEEK = 1, - BTREE_ITER_NEED_RELOCK = 2, - BTREE_ITER_NEED_TRAVERSE = 3, + BTREE_ITER_NEED_RELOCK = 1, + BTREE_ITER_NEED_TRAVERSE = 2, }; #define BTREE_ITER_NO_NODE_GET_LOCKS ((struct btree *) 1) @@ -232,69 +226,76 @@ enum btree_iter_uptodate { #define BTREE_ITER_NO_NODE_ERROR ((struct btree *) 7) #define BTREE_ITER_NO_NODE_CACHED ((struct btree *) 8) -/* - * @pos - iterator's current position - * @level - current btree depth - * @locks_want - btree level below which we start taking intent locks - * @nodes_locked - bitmask indicating which nodes in @nodes are locked - * @nodes_intent_locked - bitmask indicating which locks are intent locks - */ -struct btree_iter { - struct btree_trans *trans; - struct bpos pos; - /* what we're searching for/what the iterator actually points to: */ - struct bpos real_pos; - struct bpos pos_after_commit; - /* When we're filtering by snapshot, the snapshot ID we're looking for: */ - unsigned snapshot; - - u16 flags; +struct btree_path { u8 idx; + u8 sorted_idx; + u8 ref; + u8 intent_ref; + + /* btree_iter_copy starts here: */ + struct bpos pos; enum btree_id btree_id:4; - enum btree_iter_uptodate uptodate:3; + bool cached:1; + bool preserve:1; + enum btree_path_uptodate uptodate:2; /* - * True if we've returned a key (and thus are expected to keep it - * locked), false after set_pos - for avoiding spurious transaction - * restarts in bch2_trans_relock(): + * When true, failing to relock this path will cause the transaction to + * restart: */ bool should_be_locked:1; - unsigned level:4, - min_depth:4, + unsigned level:3, locks_want:4, nodes_locked:4, nodes_intent_locked:4; - struct btree_iter_level { + struct btree_path_level { struct btree *b; struct btree_node_iter iter; u32 lock_seq; } l[BTREE_MAX_DEPTH]; +#ifdef CONFIG_BCACHEFS_DEBUG + unsigned long ip_allocated; +#endif +}; + +static inline struct btree_path_level *path_l(struct btree_path *path) +{ + return path->l + path->level; +} + +/* + * @pos - iterator's current position + * @level - current btree depth + * @locks_want - btree level below which we start taking intent locks + * @nodes_locked - bitmask indicating which nodes in @nodes are locked + * @nodes_intent_locked - bitmask indicating which locks are intent locks + */ +struct btree_iter { + struct btree_trans *trans; + struct btree_path *path; + + enum btree_id btree_id:4; + unsigned min_depth:4; + + /* btree_iter_copy starts here: */ + u16 flags; + + /* When we're filtering by snapshot, the snapshot ID 
we're looking for: */ + unsigned snapshot; + struct bpos pos; + struct bpos pos_after_commit; /* * Current unpacked key - so that bch2_btree_iter_next()/ * bch2_btree_iter_next_slot() can correctly advance pos. */ struct bkey k; +#ifdef CONFIG_BCACHEFS_DEBUG unsigned long ip_allocated; +#endif }; -static inline enum btree_iter_type -btree_iter_type(const struct btree_iter *iter) -{ - return iter->flags & BTREE_ITER_TYPE; -} - -static inline bool btree_iter_is_cached(const struct btree_iter *iter) -{ - return btree_iter_type(iter) == BTREE_ITER_CACHED; -} - -static inline struct btree_iter_level *iter_l(struct btree_iter *iter) -{ - return iter->l + iter->level; -} - struct btree_key_cache { struct mutex lock; struct rhashtable table; @@ -335,14 +336,16 @@ struct bkey_cached { }; struct btree_insert_entry { - unsigned trigger_flags; + unsigned flags; u8 bkey_type; enum btree_id btree_id:8; u8 level; - unsigned trans_triggers_run:1; - unsigned is_extent:1; + bool cached:1; + bool insert_trigger_run:1; + bool overwrite_trigger_run:1; struct bkey_i *k; - struct btree_iter *iter; + struct btree_path *path; + unsigned long ip_allocated; }; #ifndef CONFIG_LOCKDEP @@ -359,14 +362,14 @@ struct btree_trans_commit_hook { struct btree_trans_commit_hook *next; }; -#define BTREE_TRANS_MEM_MAX 4096 +#define BTREE_TRANS_MEM_MAX (1U << 14) struct btree_trans { struct bch_fs *c; #ifdef CONFIG_BCACHEFS_DEBUG struct list_head list; struct btree *locking; - unsigned locking_iter_idx; + unsigned locking_path_idx; struct bpos locking_pos; u8 locking_btree_id; u8 locking_level; @@ -375,23 +378,26 @@ struct btree_trans { unsigned long ip; int srcu_idx; + u8 nr_sorted; u8 nr_updates; - u8 nr_updates2; - unsigned used_mempool:1; - unsigned error:1; - unsigned in_traverse_all:1; + bool used_mempool:1; + bool in_traverse_all:1; + bool restarted:1; + /* + * For when bch2_trans_update notices we'll be splitting a compressed + * extent: + */ + unsigned extra_journal_res; - u64 iters_linked; - u64 iters_live; - u64 iters_touched; + u64 paths_allocated; unsigned mem_top; unsigned mem_bytes; void *mem; - struct btree_iter *iters; + u8 sorted[BTREE_ITER_MAX]; + struct btree_path *paths; struct btree_insert_entry *updates; - struct btree_insert_entry *updates2; /* update path: */ struct btree_trans_commit_hook *hooks; @@ -428,6 +434,7 @@ enum btree_flags { BTREE_NODE_write_idx, BTREE_NODE_accessed, BTREE_NODE_write_in_flight, + BTREE_NODE_write_in_flight_inner, BTREE_NODE_just_written, BTREE_NODE_dying, BTREE_NODE_fake, @@ -442,6 +449,7 @@ BTREE_FLAG(noevict); BTREE_FLAG(write_idx); BTREE_FLAG(accessed); BTREE_FLAG(write_in_flight); +BTREE_FLAG(write_in_flight_inner); BTREE_FLAG(just_written); BTREE_FLAG(dying); BTREE_FLAG(fake); @@ -592,16 +600,6 @@ static inline bool btree_node_is_extents(struct btree *b) return btree_node_type_is_extents(btree_node_type(b)); } -static inline enum btree_node_type btree_iter_key_type(struct btree_iter *iter) -{ - return __btree_node_type(iter->level, iter->btree_id); -} - -static inline bool btree_iter_is_extents(struct btree_iter *iter) -{ - return btree_node_type_is_extents(btree_iter_key_type(iter)); -} - #define BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS \ ((1U << BKEY_TYPE_extents)| \ (1U << BKEY_TYPE_inodes)| \ @@ -611,7 +609,9 @@ static inline bool btree_iter_is_extents(struct btree_iter *iter) #define BTREE_NODE_TYPE_HAS_MEM_TRIGGERS \ ((1U << BKEY_TYPE_alloc)| \ - (1U << BKEY_TYPE_stripes)) + (1U << BKEY_TYPE_inodes)| \ + (1U << BKEY_TYPE_stripes)| \ + (1U << BKEY_TYPE_snapshots)) #define 
BTREE_NODE_TYPE_HAS_TRIGGERS \ (BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS| \ @@ -632,28 +632,39 @@ static inline bool btree_type_has_snapshots(enum btree_id id) return (1 << id) & BTREE_ID_HAS_SNAPSHOTS; } -enum btree_trigger_flags { +enum btree_update_flags { + __BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE, + __BTREE_TRIGGER_NORUN, /* Don't run triggers at all */ __BTREE_TRIGGER_INSERT, __BTREE_TRIGGER_OVERWRITE, - __BTREE_TRIGGER_OVERWRITE_SPLIT, __BTREE_TRIGGER_GC, __BTREE_TRIGGER_BUCKET_INVALIDATE, __BTREE_TRIGGER_NOATOMIC, }; +#define BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE (1U << __BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) + #define BTREE_TRIGGER_NORUN (1U << __BTREE_TRIGGER_NORUN) #define BTREE_TRIGGER_INSERT (1U << __BTREE_TRIGGER_INSERT) #define BTREE_TRIGGER_OVERWRITE (1U << __BTREE_TRIGGER_OVERWRITE) -#define BTREE_TRIGGER_OVERWRITE_SPLIT (1U << __BTREE_TRIGGER_OVERWRITE_SPLIT) #define BTREE_TRIGGER_GC (1U << __BTREE_TRIGGER_GC) #define BTREE_TRIGGER_BUCKET_INVALIDATE (1U << __BTREE_TRIGGER_BUCKET_INVALIDATE) #define BTREE_TRIGGER_NOATOMIC (1U << __BTREE_TRIGGER_NOATOMIC) +#define BTREE_TRIGGER_WANTS_OLD_AND_NEW \ + ((1U << KEY_TYPE_alloc)| \ + (1U << KEY_TYPE_alloc_v2)| \ + (1U << KEY_TYPE_alloc_v3)| \ + (1U << KEY_TYPE_stripe)| \ + (1U << KEY_TYPE_inode)| \ + (1U << KEY_TYPE_inode_v2)| \ + (1U << KEY_TYPE_snapshot)) + static inline bool btree_node_type_needs_gc(enum btree_node_type type) { return BTREE_NODE_TYPE_HAS_TRIGGERS & (1U << type); @@ -669,16 +680,10 @@ struct btree_root { s8 error; }; -/* - * Optional hook that will be called just prior to a btree node update, when - * we're holding the write lock and we know what key is about to be overwritten: - */ - enum btree_insert_ret { BTREE_INSERT_OK, /* leaf node needs to be split */ BTREE_INSERT_BTREE_NODE_FULL, - BTREE_INSERT_ENOSPC, BTREE_INSERT_NEED_MARK_REPLICAS, BTREE_INSERT_NEED_JOURNAL_RES, BTREE_INSERT_NEED_JOURNAL_RECLAIM, @@ -695,8 +700,4 @@ enum btree_node_sibling { btree_next_sib, }; -typedef struct btree_nr_keys (*sort_fix_overlapping_fn)(struct bset *, - struct btree *, - struct btree_node_iter *); - #endif /* _BCACHEFS_BTREE_TYPES_H */ diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h index 56131ac516ce..0268dd74f0ab 100644 --- a/fs/bcachefs/btree_update.h +++ b/fs/bcachefs/btree_update.h @@ -8,14 +8,14 @@ struct bch_fs; struct btree; -void bch2_btree_node_lock_for_insert(struct bch_fs *, struct btree *, - struct btree_iter *); -bool bch2_btree_bset_insert_key(struct btree_iter *, struct btree *, - struct btree_node_iter *, struct bkey_i *); +void bch2_btree_node_lock_for_insert(struct btree_trans *, struct btree_path *, + struct btree *); +bool bch2_btree_bset_insert_key(struct btree_trans *, struct btree_path *, + struct btree *, struct btree_node_iter *, + struct bkey_i *); void bch2_btree_add_journal_pin(struct bch_fs *, struct btree *, u64); enum btree_insert_flags { - __BTREE_INSERT_NOUNLOCK, __BTREE_INSERT_NOFAIL, __BTREE_INSERT_NOCHECK_RW, __BTREE_INSERT_LAZY_RW, @@ -29,11 +29,6 @@ enum btree_insert_flags { __BCH_HASH_SET_MUST_REPLACE, }; -/* - * Don't drop locks _after_ successfully updating btree: - */ -#define BTREE_INSERT_NOUNLOCK (1 << __BTREE_INSERT_NOUNLOCK) - /* Don't check for -ENOSPC: */ #define BTREE_INSERT_NOFAIL (1 << __BTREE_INSERT_NOFAIL) @@ -66,18 +61,20 @@ int bch2_btree_insert(struct bch_fs *, enum btree_id, struct bkey_i *, struct disk_reservation *, u64 *, int flags); int bch2_btree_delete_range_trans(struct btree_trans *, enum btree_id, - struct bpos, struct bpos, u64 *); + struct 
bpos, struct bpos, unsigned, u64 *); int bch2_btree_delete_range(struct bch_fs *, enum btree_id, struct bpos, struct bpos, u64 *); -int bch2_btree_node_rewrite(struct bch_fs *c, struct btree_iter *, - __le64, unsigned); +int bch2_btree_node_rewrite(struct btree_trans *, struct btree_iter *, + struct btree *, unsigned); void bch2_btree_node_rewrite_async(struct bch_fs *, struct btree *); -int bch2_btree_node_update_key(struct bch_fs *, struct btree_iter *, - struct btree *, struct bkey_i *); +int bch2_btree_node_update_key(struct btree_trans *, struct btree_iter *, + struct btree *, struct bkey_i *, bool); +int bch2_btree_node_update_key_get_iter(struct btree_trans *, + struct btree *, struct bkey_i *, bool); int bch2_trans_update(struct btree_trans *, struct btree_iter *, - struct bkey_i *, enum btree_trigger_flags); + struct bkey_i *, enum btree_update_flags); void bch2_trans_commit_hook(struct btree_trans *, struct btree_trans_commit_hook *); int __bch2_trans_commit(struct btree_trans *); @@ -108,12 +105,10 @@ static inline int bch2_trans_commit(struct btree_trans *trans, ({ \ int _ret; \ \ - while (1) { \ + do { \ + bch2_trans_begin(_trans); \ _ret = (_do); \ - if (_ret != -EINTR) \ - break; \ - bch2_trans_reset(_trans, 0); \ - } \ + } while (_ret == -EINTR); \ \ _ret; \ }) @@ -125,14 +120,14 @@ static inline int bch2_trans_commit(struct btree_trans *trans, #define bch2_trans_do(_c, _disk_res, _journal_seq, _flags, _do) \ ({ \ struct btree_trans trans; \ - int _ret, _ret2; \ + int _ret; \ \ bch2_trans_init(&trans, (_c), 0, 0); \ _ret = __bch2_trans_do(&trans, _disk_res, _journal_seq, _flags, \ _do); \ - _ret2 = bch2_trans_exit(&trans); \ + bch2_trans_exit(&trans); \ \ - _ret ?: _ret2; \ + _ret; \ }) #define trans_for_each_update(_trans, _i) \ @@ -140,9 +135,21 @@ static inline int bch2_trans_commit(struct btree_trans *trans, (_i) < (_trans)->updates + (_trans)->nr_updates; \ (_i)++) -#define trans_for_each_update2(_trans, _i) \ - for ((_i) = (_trans)->updates2; \ - (_i) < (_trans)->updates2 + (_trans)->nr_updates2; \ - (_i)++) +static inline struct bkey_i *btree_trans_peek_updates(struct btree_trans *trans, + enum btree_id btree_id, + struct bpos pos) +{ + struct btree_insert_entry *i; + + trans_for_each_update(trans, i) + if ((cmp_int(btree_id, i->btree_id) ?: + bpos_cmp(pos, i->k->k.p)) <= 0) { + if (btree_id == i->btree_id) + return i->k; + break; + } + + return NULL; +} #endif /* _BCACHEFS_BTREE_UPDATE_H */ diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 2d8093d1bf00..dfff972551ee 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -22,6 +22,11 @@ #include <linux/random.h> #include <trace/events/bcachefs.h> +static void bch2_btree_insert_node(struct btree_update *, struct btree_trans *, + struct btree_path *, struct btree *, + struct keylist *, unsigned); +static void bch2_btree_update_add_new_node(struct btree_update *, struct btree *); + /* Debug code: */ /* @@ -148,38 +153,26 @@ static void __btree_node_free(struct bch_fs *c, struct btree *b) clear_btree_node_noevict(b); - bch2_btree_node_hash_remove(&c->btree_cache, b); - mutex_lock(&c->btree_cache.lock); list_move(&b->list, &c->btree_cache.freeable); mutex_unlock(&c->btree_cache.lock); } -void bch2_btree_node_free_never_inserted(struct bch_fs *c, struct btree *b) +static void bch2_btree_node_free_inmem(struct btree_trans *trans, + struct btree *b) { - struct open_buckets ob = b->ob; + struct bch_fs *c = trans->c; + struct btree_path *path; - 
b->ob.nr = 0; + trans_for_each_path(trans, path) + BUG_ON(path->l[b->c.level].b == b && + path->l[b->c.level].lock_seq == b->c.lock.state.seq); - clear_btree_node_dirty(c, b); + six_lock_write(&b->c.lock, NULL, NULL); - btree_node_lock_type(c, b, SIX_LOCK_write); + bch2_btree_node_hash_remove(&c->btree_cache, b); __btree_node_free(c, b); - six_unlock_write(&b->c.lock); - bch2_open_buckets_put(c, &ob); -} - -void bch2_btree_node_free_inmem(struct bch_fs *c, struct btree *b, - struct btree_iter *iter) -{ - struct btree_iter *linked; - - trans_for_each_iter(iter->trans, linked) - BUG_ON(linked->l[b->c.level].b == b); - - six_lock_write(&b->c.lock, NULL, NULL); - __btree_node_free(c, b); six_unlock_write(&b->c.lock); six_unlock_intent(&b->c.lock); } @@ -242,11 +235,7 @@ retry: goto retry; } - if (c->sb.features & (1ULL << BCH_FEATURE_btree_ptr_v2)) - bkey_btree_ptr_v2_init(&tmp.k); - else - bkey_btree_ptr_init(&tmp.k); - + bkey_btree_ptr_v2_init(&tmp.k); bch2_alloc_sectors_append_ptrs(c, wp, &tmp.k, c->opts.btree_node_size); bch2_open_bucket_get(c, wp, &ob); @@ -367,7 +356,7 @@ static struct btree *__btree_root_alloc(struct btree_update *as, unsigned level) struct btree *b = bch2_btree_node_alloc(as, level); btree_set_min(b, POS_MIN); - btree_set_max(b, POS_MAX); + btree_set_max(b, SPOS_MAX); b->data->format = bch2_btree_calc_format(b); btree_node_set_format(b, b->data->format); @@ -511,7 +500,7 @@ static int btree_update_nodes_written_trans(struct btree_trans *trans, ret = bch2_trans_mark_key(trans, bkey_s_c_null, bkey_i_to_s_c(k), - 0, 0, BTREE_TRIGGER_INSERT); + BTREE_TRIGGER_INSERT); if (ret) return ret; } @@ -520,7 +509,7 @@ static int btree_update_nodes_written_trans(struct btree_trans *trans, ret = bch2_trans_mark_key(trans, bkey_i_to_s_c(k), bkey_s_c_null, - 0, 0, BTREE_TRIGGER_OVERWRITE); + BTREE_TRIGGER_OVERWRITE); if (ret) return ret; } @@ -563,7 +552,8 @@ static void btree_update_nodes_written(struct btree_update *as) six_unlock_read(&old->c.lock); if (seq == as->old_nodes_seq[i]) - btree_node_wait_on_io(old); + wait_on_bit_io(&old->flags, BTREE_NODE_write_in_flight_inner, + TASK_UNINTERRUPTIBLE); } /* @@ -772,7 +762,7 @@ static void btree_update_updated_root(struct btree_update *as, struct btree *b) * And it adds @b to the list of @as's new nodes, so that we can update sector * counts in bch2_btree_update_nodes_written: */ -void bch2_btree_update_add_new_node(struct btree_update *as, struct btree *b) +static void bch2_btree_update_add_new_node(struct btree_update *as, struct btree *b) { struct bch_fs *c = as->c; @@ -826,7 +816,7 @@ found: closure_put(&as->cl); } -void bch2_btree_update_get_open_buckets(struct btree_update *as, struct btree *b) +static void bch2_btree_update_get_open_buckets(struct btree_update *as, struct btree *b) { while (b->ob.nr) as->open_buckets[as->nr_open_buckets++] = @@ -838,7 +828,7 @@ void bch2_btree_update_get_open_buckets(struct btree_update *as, struct btree *b * nodes and thus outstanding btree_updates - redirect @b's * btree_updates to point to this btree_update: */ -void bch2_btree_interior_update_will_free_node(struct btree_update *as, +static void bch2_btree_interior_update_will_free_node(struct btree_update *as, struct btree *b) { struct bch_fs *c = as->c; @@ -910,7 +900,7 @@ void bch2_btree_interior_update_will_free_node(struct btree_update *as, as->nr_old_nodes++; } -void bch2_btree_update_done(struct btree_update *as) +static void bch2_btree_update_done(struct btree_update *as) { BUG_ON(as->mode == BTREE_INTERIOR_NO_UPDATE); @@ -924,11 
+914,10 @@ void bch2_btree_update_done(struct btree_update *as) as->c->btree_interior_update_worker); } -struct btree_update * -bch2_btree_update_start(struct btree_iter *iter, unsigned level, - unsigned nr_nodes, unsigned flags) +static struct btree_update * +bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, + unsigned level, unsigned nr_nodes, unsigned flags) { - struct btree_trans *trans = iter->trans; struct bch_fs *c = trans->c; struct btree_update *as; struct closure cl; @@ -937,36 +926,28 @@ bch2_btree_update_start(struct btree_iter *iter, unsigned level, int journal_flags = 0; int ret = 0; + BUG_ON(!path->should_be_locked); + if (flags & BTREE_INSERT_JOURNAL_RESERVED) journal_flags |= JOURNAL_RES_GET_RESERVED; closure_init_stack(&cl); retry: - /* - * This check isn't necessary for correctness - it's just to potentially - * prevent us from doing a lot of work that'll end up being wasted: - */ - ret = bch2_journal_error(&c->journal); - if (ret) - return ERR_PTR(ret); /* * XXX: figure out how far we might need to split, * instead of locking/reserving all the way to the root: */ - if (!bch2_btree_iter_upgrade(iter, U8_MAX)) { + if (!bch2_btree_path_upgrade(trans, path, U8_MAX)) { trace_trans_restart_iter_upgrade(trans->ip, _RET_IP_, - iter->btree_id, - &iter->real_pos); - return ERR_PTR(-EINTR); + path->btree_id, &path->pos); + ret = btree_trans_restart(trans); + return ERR_PTR(ret); } if (flags & BTREE_INSERT_GC_LOCK_HELD) lockdep_assert_held(&c->gc_lock); else if (!down_read_trylock(&c->gc_lock)) { - if (flags & BTREE_INSERT_NOUNLOCK) - return ERR_PTR(-EINTR); - bch2_trans_unlock(trans); down_read(&c->gc_lock); if (!bch2_trans_relock(trans)) { @@ -981,7 +962,7 @@ retry: as->c = c; as->mode = BTREE_INTERIOR_NO_UPDATE; as->took_gc_lock = !(flags & BTREE_INSERT_GC_LOCK_HELD); - as->btree_id = iter->btree_id; + as->btree_id = path->btree_id; INIT_LIST_HEAD(&as->list); INIT_LIST_HEAD(&as->unwritten_list); INIT_LIST_HEAD(&as->write_blocked_list); @@ -989,24 +970,31 @@ retry: bch2_keylist_init(&as->new_keys, as->_new_keys); bch2_keylist_init(&as->parent_keys, as->inline_keys); + mutex_lock(&c->btree_interior_update_lock); + list_add_tail(&as->list, &c->btree_interior_update_list); + mutex_unlock(&c->btree_interior_update_lock); + + /* + * We don't want to allocate if we're in an error state, that can cause + * deadlock on emergency shutdown due to open buckets getting stuck in + * the btree_reserve_cache after allocator shutdown has cleared it out. + * This check needs to come after adding us to the btree_interior_update + * list but before calling bch2_btree_reserve_get, to synchronize with + * __bch2_fs_read_only(). + */ + ret = bch2_journal_error(&c->journal); + if (ret) + goto err; + ret = bch2_journal_preres_get(&c->journal, &as->journal_preres, BTREE_UPDATE_JOURNAL_RES, journal_flags|JOURNAL_RES_GET_NONBLOCK); if (ret == -EAGAIN) { - /* - * this would be cleaner if bch2_journal_preres_get() took a - * closure argument - */ - if (flags & BTREE_INSERT_NOUNLOCK) { - trace_trans_restart_journal_preres_get(trans->ip, _RET_IP_); - ret = -EINTR; - goto err; - } - bch2_trans_unlock(trans); if (flags & BTREE_INSERT_JOURNAL_RECLAIM) { bch2_btree_update_free(as); + btree_trans_restart(trans); return ERR_PTR(ret); } @@ -1031,8 +1019,7 @@ retry: if (ret) goto err; - ret = bch2_btree_reserve_get(as, nr_nodes, flags, - !(flags & BTREE_INSERT_NOUNLOCK) ? 
&cl : NULL); + ret = bch2_btree_reserve_get(as, nr_nodes, flags, &cl); if (ret) goto err; @@ -1040,17 +1027,11 @@ retry: atomic64_read(&c->journal.seq), &as->journal, NULL); - mutex_lock(&c->btree_interior_update_lock); - list_add_tail(&as->list, &c->btree_interior_update_list); - mutex_unlock(&c->btree_interior_update_lock); - return as; err: bch2_btree_update_free(as); if (ret == -EAGAIN) { - BUG_ON(flags & BTREE_INSERT_NOUNLOCK); - bch2_trans_unlock(trans); closure_sync(&cl); ret = -EINTR; @@ -1099,8 +1080,10 @@ static void bch2_btree_set_root_inmem(struct bch_fs *c, struct btree *b) * is nothing new to be done. This just guarantees that there is a * journal write. */ -static void bch2_btree_set_root(struct btree_update *as, struct btree *b, - struct btree_iter *iter) +static void bch2_btree_set_root(struct btree_update *as, + struct btree_trans *trans, + struct btree_path *path, + struct btree *b) { struct bch_fs *c = as->c; struct btree *old; @@ -1115,7 +1098,7 @@ static void bch2_btree_set_root(struct btree_update *as, struct btree *b, * Ensure no one is using the old root while we switch to the * new root: */ - bch2_btree_node_lock_write(old, iter); + bch2_btree_node_lock_write(trans, path, old); bch2_btree_set_root_inmem(c, b); @@ -1128,20 +1111,25 @@ static void bch2_btree_set_root(struct btree_update *as, struct btree *b, * an intent lock on the new root, and any updates that would * depend on the new root would have to update the new root. */ - bch2_btree_node_unlock_write(old, iter); + bch2_btree_node_unlock_write(trans, path, old); } /* Interior node updates: */ -static void bch2_insert_fixup_btree_ptr(struct btree_update *as, struct btree *b, - struct btree_iter *iter, - struct bkey_i *insert, - struct btree_node_iter *node_iter) +static void bch2_insert_fixup_btree_ptr(struct btree_update *as, + struct btree_trans *trans, + struct btree_path *path, + struct btree *b, + struct btree_node_iter *node_iter, + struct bkey_i *insert) { struct bch_fs *c = as->c; struct bkey_packed *k; const char *invalid; + BUG_ON(insert->k.type == KEY_TYPE_btree_ptr_v2 && + !btree_ptr_sectors_written(insert)); + invalid = bch2_bkey_invalid(c, bkey_i_to_s_c(insert), btree_node_type(b)) ?: bch2_bkey_in_btree_node(b, bkey_i_to_s_c(insert)); if (invalid) { @@ -1165,15 +1153,18 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as, struct btree *b bkey_iter_pos_cmp(b, k, &insert->k.p) < 0) bch2_btree_node_iter_advance(node_iter, b); - bch2_btree_bset_insert_key(iter, b, node_iter, insert); + bch2_btree_bset_insert_key(trans, path, b, node_iter, insert); set_btree_node_dirty(c, b); set_btree_node_need_write(b); } static void -__bch2_btree_insert_keys_interior(struct btree_update *as, struct btree *b, - struct btree_iter *iter, struct keylist *keys, - struct btree_node_iter node_iter) +__bch2_btree_insert_keys_interior(struct btree_update *as, + struct btree_trans *trans, + struct btree_path *path, + struct btree *b, + struct btree_node_iter node_iter, + struct keylist *keys) { struct bkey_i *insert = bch2_keylist_front(keys); struct bkey_packed *k; @@ -1185,8 +1176,8 @@ __bch2_btree_insert_keys_interior(struct btree_update *as, struct btree *b, ; while (!bch2_keylist_empty(keys)) { - bch2_insert_fixup_btree_ptr(as, b, iter, - bch2_keylist_front(keys), &node_iter); + bch2_insert_fixup_btree_ptr(as, trans, path, b, + &node_iter, bch2_keylist_front(keys)); bch2_keylist_pop_front(keys); } } @@ -1196,8 +1187,7 @@ __bch2_btree_insert_keys_interior(struct btree_update *as, struct btree *b, * node) 
*/ static struct btree *__btree_split_node(struct btree_update *as, - struct btree *n1, - struct btree_iter *iter) + struct btree *n1) { struct bkey_format_state s; size_t nr_packed = 0, nr_unpacked = 0; @@ -1312,8 +1302,10 @@ static struct btree *__btree_split_node(struct btree_update *as, * nodes that were coalesced, and thus in the middle of a child node post * coalescing: */ -static void btree_split_insert_keys(struct btree_update *as, struct btree *b, - struct btree_iter *iter, +static void btree_split_insert_keys(struct btree_update *as, + struct btree_trans *trans, + struct btree_path *path, + struct btree *b, struct keylist *keys) { struct btree_node_iter node_iter; @@ -1323,7 +1315,7 @@ static void btree_split_insert_keys(struct btree_update *as, struct btree *b, bch2_btree_node_iter_init(&node_iter, b, &k->k.p); - __bch2_btree_insert_keys_interior(as, b, iter, keys, node_iter); + __bch2_btree_insert_keys_interior(as, trans, path, b, node_iter, keys); /* * We can't tolerate whiteouts here - with whiteouts there can be @@ -1353,17 +1345,17 @@ static void btree_split_insert_keys(struct btree_update *as, struct btree *b, btree_node_interior_verify(as->c, b); } -static void btree_split(struct btree_update *as, struct btree *b, - struct btree_iter *iter, struct keylist *keys, - unsigned flags) +static void btree_split(struct btree_update *as, struct btree_trans *trans, + struct btree_path *path, struct btree *b, + struct keylist *keys, unsigned flags) { struct bch_fs *c = as->c; - struct btree *parent = btree_node_parent(iter, b); + struct btree *parent = btree_node_parent(path, b); struct btree *n1, *n2 = NULL, *n3 = NULL; u64 start_time = local_clock(); BUG_ON(!parent && (b != btree_node_root(c, b))); - BUG_ON(!btree_node_intent_locked(iter, btree_node_root(c, b)->c.level)); + BUG_ON(!btree_node_intent_locked(path, btree_node_root(c, b)->c.level)); bch2_btree_interior_update_will_free_node(as, b); @@ -1371,18 +1363,19 @@ static void btree_split(struct btree_update *as, struct btree *b, bch2_btree_update_add_new_node(as, n1); if (keys) - btree_split_insert_keys(as, n1, iter, keys); + btree_split_insert_keys(as, trans, path, n1, keys); if (bset_u64s(&n1->set[0]) > BTREE_SPLIT_THRESHOLD(c)) { trace_btree_split(c, b); - n2 = __btree_split_node(as, n1, iter); + n2 = __btree_split_node(as, n1); bch2_btree_build_aux_trees(n2); bch2_btree_build_aux_trees(n1); six_unlock_write(&n2->c.lock); six_unlock_write(&n1->c.lock); + bch2_btree_node_write(c, n1, SIX_LOCK_intent); bch2_btree_node_write(c, n2, SIX_LOCK_intent); /* @@ -1400,7 +1393,7 @@ static void btree_split(struct btree_update *as, struct btree *b, n3->sib_u64s[0] = U16_MAX; n3->sib_u64s[1] = U16_MAX; - btree_split_insert_keys(as, n3, iter, &as->parent_keys); + btree_split_insert_keys(as, trans, path, n3, &as->parent_keys); bch2_btree_node_write(c, n3, SIX_LOCK_intent); } @@ -1410,22 +1403,22 @@ static void btree_split(struct btree_update *as, struct btree *b, bch2_btree_build_aux_trees(n1); six_unlock_write(&n1->c.lock); + bch2_btree_node_write(c, n1, SIX_LOCK_intent); + if (parent) bch2_keylist_add(&as->parent_keys, &n1->key); } - bch2_btree_node_write(c, n1, SIX_LOCK_intent); - /* New nodes all written, now make them visible: */ if (parent) { /* Split a non root node */ - bch2_btree_insert_node(as, parent, iter, &as->parent_keys, flags); + bch2_btree_insert_node(as, trans, path, parent, &as->parent_keys, flags); } else if (n3) { - bch2_btree_set_root(as, n3, iter); + bch2_btree_set_root(as, trans, path, n3); } else { /* Root filled 
up but didn't need to be split */ - bch2_btree_set_root(as, n1, iter); + bch2_btree_set_root(as, trans, path, n1); } bch2_btree_update_get_open_buckets(as, n1); @@ -1434,15 +1427,14 @@ static void btree_split(struct btree_update *as, struct btree *b, if (n3) bch2_btree_update_get_open_buckets(as, n3); - /* Successful split, update the iterator to point to the new nodes: */ + /* Successful split, update the path to point to the new nodes: */ six_lock_increment(&b->c.lock, SIX_LOCK_intent); - bch2_btree_iter_node_drop(iter, b); if (n3) - bch2_btree_iter_node_replace(iter, n3); + bch2_trans_node_add(trans, n3); if (n2) - bch2_btree_iter_node_replace(iter, n2); - bch2_btree_iter_node_replace(iter, n1); + bch2_trans_node_add(trans, n2); + bch2_trans_node_add(trans, n1); /* * The old node must be freed (in memory) _before_ unlocking the new @@ -1450,7 +1442,7 @@ static void btree_split(struct btree_update *as, struct btree *b, * node after another thread has locked and updated the new node, thus * seeing stale data: */ - bch2_btree_node_free_inmem(c, b, iter); + bch2_btree_node_free_inmem(trans, b); if (n3) six_unlock_intent(&n3->c.lock); @@ -1458,26 +1450,30 @@ static void btree_split(struct btree_update *as, struct btree *b, six_unlock_intent(&n2->c.lock); six_unlock_intent(&n1->c.lock); - bch2_btree_trans_verify_locks(iter->trans); + bch2_trans_verify_locks(trans); bch2_time_stats_update(&c->times[BCH_TIME_btree_node_split], start_time); } static void -bch2_btree_insert_keys_interior(struct btree_update *as, struct btree *b, - struct btree_iter *iter, struct keylist *keys) +bch2_btree_insert_keys_interior(struct btree_update *as, + struct btree_trans *trans, + struct btree_path *path, + struct btree *b, + struct keylist *keys) { - struct btree_iter *linked; + struct btree_path *linked; - __bch2_btree_insert_keys_interior(as, b, iter, keys, iter->l[b->c.level].iter); + __bch2_btree_insert_keys_interior(as, trans, path, b, + path->l[b->c.level].iter, keys); btree_update_updated_node(as, b); - trans_for_each_iter_with_node(iter->trans, b, linked) + trans_for_each_path_with_node(trans, b, linked) bch2_btree_node_iter_peek(&linked->l[b->c.level].iter, b); - bch2_btree_trans_verify_iters(iter->trans, b); + bch2_trans_verify_paths(trans); } /** @@ -1492,9 +1488,9 @@ bch2_btree_insert_keys_interior(struct btree_update *as, struct btree *b, * If a split occurred, this function will return early. This can only happen * for leaf nodes -- inserts into interior nodes have to be atomic. 
*/ -void bch2_btree_insert_node(struct btree_update *as, struct btree *b, - struct btree_iter *iter, struct keylist *keys, - unsigned flags) +static void bch2_btree_insert_node(struct btree_update *as, struct btree_trans *trans, + struct btree_path *path, struct btree *b, + struct keylist *keys, unsigned flags) { struct bch_fs *c = as->c; int old_u64s = le16_to_cpu(btree_bset_last(b)->u64s); @@ -1502,21 +1498,21 @@ void bch2_btree_insert_node(struct btree_update *as, struct btree *b, int live_u64s_added, u64s_added; lockdep_assert_held(&c->gc_lock); - BUG_ON(!btree_node_intent_locked(iter, btree_node_root(c, b)->c.level)); + BUG_ON(!btree_node_intent_locked(path, btree_node_root(c, b)->c.level)); BUG_ON(!b->c.level); BUG_ON(!as || as->b); bch2_verify_keylist_sorted(keys); - bch2_btree_node_lock_for_insert(c, b, iter); + bch2_btree_node_lock_for_insert(trans, path, b); if (!bch2_btree_node_insert_fits(c, b, bch2_keylist_u64s(keys))) { - bch2_btree_node_unlock_write(b, iter); + bch2_btree_node_unlock_write(trans, path, b); goto split; } btree_node_interior_verify(c, b); - bch2_btree_insert_keys_interior(as, b, iter, keys); + bch2_btree_insert_keys_interior(as, trans, path, b, keys); live_u64s_added = (int) b->nr.live_u64s - old_live_u64s; u64s_added = (int) le16_to_cpu(btree_bset_last(b)->u64s) - old_u64s; @@ -1528,46 +1524,48 @@ void bch2_btree_insert_node(struct btree_update *as, struct btree *b, if (u64s_added > live_u64s_added && bch2_maybe_compact_whiteouts(c, b)) - bch2_btree_iter_reinit_node(iter, b); + bch2_trans_node_reinit_iter(trans, b); - bch2_btree_node_unlock_write(b, iter); + bch2_btree_node_unlock_write(trans, path, b); btree_node_interior_verify(c, b); return; split: - btree_split(as, b, iter, keys, flags); + btree_split(as, trans, path, b, keys, flags); } -int bch2_btree_split_leaf(struct bch_fs *c, struct btree_iter *iter, +int bch2_btree_split_leaf(struct btree_trans *trans, + struct btree_path *path, unsigned flags) { - struct btree *b = iter_l(iter)->b; + struct bch_fs *c = trans->c; + struct btree *b = path_l(path)->b; struct btree_update *as; unsigned l; int ret = 0; - as = bch2_btree_update_start(iter, iter->level, + as = bch2_btree_update_start(trans, path, path->level, btree_update_reserve_required(c, b), flags); if (IS_ERR(as)) return PTR_ERR(as); - btree_split(as, b, iter, NULL, flags); + btree_split(as, trans, path, b, NULL, flags); bch2_btree_update_done(as); - for (l = iter->level + 1; btree_iter_node(iter, l) && !ret; l++) - ret = bch2_foreground_maybe_merge(c, iter, l, flags); + for (l = path->level + 1; btree_path_node(path, l) && !ret; l++) + ret = bch2_foreground_maybe_merge(trans, path, l, flags); return ret; } -int __bch2_foreground_maybe_merge(struct bch_fs *c, - struct btree_iter *iter, +int __bch2_foreground_maybe_merge(struct btree_trans *trans, + struct btree_path *path, unsigned level, unsigned flags, enum btree_node_sibling sib) { - struct btree_trans *trans = iter->trans; - struct btree_iter *sib_iter = NULL; + struct bch_fs *c = trans->c; + struct btree_path *sib_path = NULL; struct btree_update *as; struct bkey_format_state new_s; struct bkey_format new_f; @@ -1575,39 +1573,35 @@ int __bch2_foreground_maybe_merge(struct bch_fs *c, struct btree *b, *m, *n, *prev, *next, *parent; struct bpos sib_pos; size_t sib_u64s; - int ret = 0, ret2 = 0; - - BUG_ON(!btree_node_locked(iter, level)); -retry: - ret = bch2_btree_iter_traverse(iter); - if (ret) - goto err; + int ret = 0; - BUG_ON(!btree_node_locked(iter, level)); + 
BUG_ON(!path->should_be_locked); + BUG_ON(!btree_node_locked(path, level)); - b = iter->l[level].b; + b = path->l[level].b; if ((sib == btree_prev_sib && !bpos_cmp(b->data->min_key, POS_MIN)) || - (sib == btree_next_sib && !bpos_cmp(b->data->max_key, POS_MAX))) { + (sib == btree_next_sib && !bpos_cmp(b->data->max_key, SPOS_MAX))) { b->sib_u64s[sib] = U16_MAX; - goto out; + return 0; } sib_pos = sib == btree_prev_sib ? bpos_predecessor(b->data->min_key) : bpos_successor(b->data->max_key); - sib_iter = bch2_trans_get_node_iter(trans, iter->btree_id, - sib_pos, U8_MAX, level, - BTREE_ITER_INTENT); - ret = bch2_btree_iter_traverse(sib_iter); + sib_path = bch2_path_get(trans, false, path->btree_id, sib_pos, + U8_MAX, level, true, _THIS_IP_); + ret = bch2_btree_path_traverse(trans, sib_path, false); if (ret) goto err; - m = sib_iter->l[level].b; + sib_path->should_be_locked = true; - if (btree_node_parent(iter, b) != - btree_node_parent(sib_iter, m)) { + m = sib_path->l[level].b; + + if (btree_node_parent(path, b) != + btree_node_parent(sib_path, m)) { b->sib_u64s[sib] = U16_MAX; goto out; } @@ -1658,8 +1652,8 @@ retry: if (b->sib_u64s[sib] > c->btree_foreground_merge_threshold) goto out; - parent = btree_node_parent(iter, b); - as = bch2_btree_update_start(iter, level, + parent = btree_node_parent(path, b); + as = bch2_btree_update_start(trans, path, level, btree_update_reserve_required(c, parent) + 1, flags| BTREE_INSERT_NOFAIL| @@ -1688,92 +1682,63 @@ retry: bch2_btree_build_aux_trees(n); six_unlock_write(&n->c.lock); + bch2_btree_node_write(c, n, SIX_LOCK_intent); + bkey_init(&delete.k); delete.k.p = prev->key.k.p; bch2_keylist_add(&as->parent_keys, &delete); bch2_keylist_add(&as->parent_keys, &n->key); - bch2_btree_node_write(c, n, SIX_LOCK_intent); + bch2_trans_verify_paths(trans); + + bch2_btree_insert_node(as, trans, path, parent, &as->parent_keys, flags); - bch2_btree_insert_node(as, parent, iter, &as->parent_keys, flags); + bch2_trans_verify_paths(trans); bch2_btree_update_get_open_buckets(as, n); six_lock_increment(&b->c.lock, SIX_LOCK_intent); six_lock_increment(&m->c.lock, SIX_LOCK_intent); - bch2_btree_iter_node_drop(iter, b); - bch2_btree_iter_node_drop(iter, m); - bch2_btree_iter_node_replace(iter, n); + bch2_trans_node_add(trans, n); - bch2_btree_trans_verify_iters(trans, n); + bch2_trans_verify_paths(trans); - bch2_btree_node_free_inmem(c, b, iter); - bch2_btree_node_free_inmem(c, m, iter); + bch2_btree_node_free_inmem(trans, b); + bch2_btree_node_free_inmem(trans, m); six_unlock_intent(&n->c.lock); bch2_btree_update_done(as); out: - bch2_btree_trans_verify_locks(trans); - bch2_trans_iter_free(trans, sib_iter); - - /* - * Don't downgrade locks here: we're called after successful insert, - * and the caller will downgrade locks after a successful insert - * anyways (in case e.g. 
a split was required first) - * - * And we're also called when inserting into interior nodes in the - * split path, and downgrading to read locks in there is potentially - * confusing: - */ - return ret ?: ret2; err: - bch2_trans_iter_put(trans, sib_iter); - sib_iter = NULL; - - if (ret == -EINTR && bch2_trans_relock(trans)) - goto retry; - - if (ret == -EINTR && !(flags & BTREE_INSERT_NOUNLOCK)) { - ret2 = ret; - ret = bch2_btree_iter_traverse_all(trans); - if (!ret) - goto retry; - } - - goto out; + bch2_path_put(trans, sib_path, true); + bch2_trans_verify_locks(trans); + return ret; } /** * bch2_btree_node_rewrite - Rewrite/move a btree node */ -int bch2_btree_node_rewrite(struct bch_fs *c, struct btree_iter *iter, - __le64 seq, unsigned flags) +int bch2_btree_node_rewrite(struct btree_trans *trans, + struct btree_iter *iter, + struct btree *b, + unsigned flags) { - struct btree *b, *n, *parent; + struct bch_fs *c = trans->c; + struct btree *n, *parent; struct btree_update *as; int ret; flags |= BTREE_INSERT_NOFAIL; -retry: - ret = bch2_btree_iter_traverse(iter); - if (ret) - goto out; - b = bch2_btree_iter_peek_node(iter); - if (!b || b->data->keys.seq != seq) - goto out; - - parent = btree_node_parent(iter, b); - as = bch2_btree_update_start(iter, b->c.level, + parent = btree_node_parent(iter->path, b); + as = bch2_btree_update_start(trans, iter->path, b->c.level, (parent ? btree_update_reserve_required(c, parent) : 0) + 1, flags); ret = PTR_ERR_OR_ZERO(as); - if (ret == -EINTR) - goto retry; if (ret) { trace_btree_gc_rewrite_node_fail(c, b); goto out; } @@ -1793,22 +1758,22 @@ retry: if (parent) { bch2_keylist_add(&as->parent_keys, &n->key); - bch2_btree_insert_node(as, parent, iter, &as->parent_keys, flags); + bch2_btree_insert_node(as, trans, iter->path, parent, + &as->parent_keys, flags); } else { - bch2_btree_set_root(as, n, iter); + bch2_btree_set_root(as, trans, iter->path, n); } bch2_btree_update_get_open_buckets(as, n); six_lock_increment(&b->c.lock, SIX_LOCK_intent); - bch2_btree_iter_node_drop(iter, b); - bch2_btree_iter_node_replace(iter, n); - bch2_btree_node_free_inmem(c, b, iter); + bch2_trans_node_add(trans, n); + bch2_btree_node_free_inmem(trans, b); six_unlock_intent(&n->c.lock); bch2_btree_update_done(as); out: - bch2_btree_iter_downgrade(iter); + bch2_btree_path_downgrade(iter->path); return ret; } @@ -1821,20 +1786,38 @@ struct async_btree_rewrite { __le64 seq; }; +static int async_btree_node_rewrite_trans(struct btree_trans *trans, + struct async_btree_rewrite *a) +{ + struct btree_iter iter; + struct btree *b; + int ret; + + bch2_trans_node_iter_init(trans, &iter, a->btree_id, a->pos, + BTREE_MAX_DEPTH, a->level, 0); + b = bch2_btree_iter_peek_node(&iter); + ret = PTR_ERR_OR_ZERO(b); + if (ret) + goto out; + + if (!b || b->data->keys.seq != a->seq) + goto out; + + ret = bch2_btree_node_rewrite(trans, &iter, b, 0); +out: + bch2_trans_iter_exit(trans, &iter); + + return ret; +} + void async_btree_node_rewrite_work(struct work_struct *work) { struct async_btree_rewrite *a = container_of(work, struct async_btree_rewrite, work); struct bch_fs *c = a->c; - struct btree_trans trans; - struct btree_iter *iter; - bch2_trans_init(&trans, c, 0, 0); - iter = bch2_trans_get_node_iter(&trans, a->btree_id, a->pos, - BTREE_MAX_DEPTH, a->level, 0); - bch2_btree_node_rewrite(c, iter, a->seq, 0); - bch2_trans_iter_put(&trans, iter); - bch2_trans_exit(&trans); + bch2_trans_do(c, NULL, NULL, 0, + async_btree_node_rewrite_trans(&trans, a)); percpu_ref_put(&c->writes); kfree(a); } @@ 
-1865,75 +1848,123 @@ void bch2_btree_node_rewrite_async(struct bch_fs *c, struct btree *b) queue_work(c->btree_interior_update_worker, &a->work); } -static void __bch2_btree_node_update_key(struct bch_fs *c, - struct btree_update *as, - struct btree_iter *iter, - struct btree *b, struct btree *new_hash, - struct bkey_i *new_key) +static int __bch2_btree_node_update_key(struct btree_trans *trans, + struct btree_iter *iter, + struct btree *b, struct btree *new_hash, + struct bkey_i *new_key, + bool skip_triggers) { + struct bch_fs *c = trans->c; + struct btree_iter iter2 = { NULL }; struct btree *parent; + u64 journal_entries[BKEY_BTREE_PTR_U64s_MAX]; int ret; - btree_update_will_delete_key(as, &b->key); - btree_update_will_add_key(as, new_key); + if (!skip_triggers) { + ret = bch2_trans_mark_key(trans, + bkey_s_c_null, + bkey_i_to_s_c(new_key), + BTREE_TRIGGER_INSERT); + if (ret) + return ret; + + ret = bch2_trans_mark_key(trans, + bkey_i_to_s_c(&b->key), + bkey_s_c_null, + BTREE_TRIGGER_OVERWRITE); + if (ret) + return ret; + } - parent = btree_node_parent(iter, b); + if (new_hash) { + bkey_copy(&new_hash->key, new_key); + ret = bch2_btree_node_hash_insert(&c->btree_cache, + new_hash, b->c.level, b->c.btree_id); + BUG_ON(ret); + } + + parent = btree_node_parent(iter->path, b); if (parent) { - if (new_hash) { - bkey_copy(&new_hash->key, new_key); - ret = bch2_btree_node_hash_insert(&c->btree_cache, - new_hash, b->c.level, b->c.btree_id); - BUG_ON(ret); - } + bch2_trans_copy_iter(&iter2, iter); - bch2_keylist_add(&as->parent_keys, new_key); - bch2_btree_insert_node(as, parent, iter, &as->parent_keys, 0); + iter2.path = bch2_btree_path_make_mut(trans, iter2.path, + iter2.flags & BTREE_ITER_INTENT, + _THIS_IP_); - if (new_hash) { - mutex_lock(&c->btree_cache.lock); - bch2_btree_node_hash_remove(&c->btree_cache, new_hash); + BUG_ON(iter2.path->level != b->c.level); + BUG_ON(bpos_cmp(iter2.path->pos, new_key->k.p)); - bch2_btree_node_hash_remove(&c->btree_cache, b); + btree_node_unlock(iter2.path, iter2.path->level); + path_l(iter2.path)->b = BTREE_ITER_NO_NODE_UP; + iter2.path->level++; - bkey_copy(&b->key, new_key); - ret = __bch2_btree_node_hash_insert(&c->btree_cache, b); - BUG_ON(ret); - mutex_unlock(&c->btree_cache.lock); - } else { - bkey_copy(&b->key, new_key); - } + ret = bch2_btree_iter_traverse(&iter2) ?: + bch2_trans_update(trans, &iter2, new_key, BTREE_TRIGGER_NORUN); + if (ret) + goto err; } else { BUG_ON(btree_node_root(c, b) != b); - bch2_btree_node_lock_write(b, iter); - bkey_copy(&b->key, new_key); + trans->extra_journal_entries = (void *) &journal_entries[0]; + trans->extra_journal_entry_u64s = + journal_entry_set((void *) &journal_entries[0], + BCH_JSET_ENTRY_btree_root, + b->c.btree_id, b->c.level, + new_key, new_key->k.u64s); + } - if (btree_ptr_hash_val(&b->key) != b->hash_val) { - mutex_lock(&c->btree_cache.lock); - bch2_btree_node_hash_remove(&c->btree_cache, b); + ret = bch2_trans_commit(trans, NULL, NULL, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_NOCHECK_RW| + BTREE_INSERT_JOURNAL_RECLAIM| + BTREE_INSERT_JOURNAL_RESERVED); + if (ret) + goto err; - ret = __bch2_btree_node_hash_insert(&c->btree_cache, b); - BUG_ON(ret); - mutex_unlock(&c->btree_cache.lock); - } + bch2_btree_node_lock_write(trans, iter->path, b); - btree_update_updated_root(as, b); - bch2_btree_node_unlock_write(b, iter); + if (new_hash) { + mutex_lock(&c->btree_cache.lock); + bch2_btree_node_hash_remove(&c->btree_cache, new_hash); + bch2_btree_node_hash_remove(&c->btree_cache, b); + + bkey_copy(&b->key, 
new_key); + ret = __bch2_btree_node_hash_insert(&c->btree_cache, b); + BUG_ON(ret); + mutex_unlock(&c->btree_cache.lock); + } else { + bkey_copy(&b->key, new_key); } - bch2_btree_update_done(as); + bch2_btree_node_unlock_write(trans, iter->path, b); +out: + bch2_trans_iter_exit(trans, &iter2); + return ret; +err: + if (new_hash) { + mutex_lock(&c->btree_cache.lock); + bch2_btree_node_hash_remove(&c->btree_cache, b); + mutex_unlock(&c->btree_cache.lock); + } + goto out; } -int bch2_btree_node_update_key(struct bch_fs *c, struct btree_iter *iter, - struct btree *b, - struct bkey_i *new_key) +int bch2_btree_node_update_key(struct btree_trans *trans, struct btree_iter *iter, + struct btree *b, struct bkey_i *new_key, + bool skip_triggers) { - struct btree *parent = btree_node_parent(iter, b); - struct btree_update *as = NULL; + struct bch_fs *c = trans->c; struct btree *new_hash = NULL; + struct btree_path *path = iter->path; struct closure cl; int ret = 0; + if (!btree_node_intent_locked(path, b->c.level) && + !bch2_btree_path_upgrade(trans, path, b->c.level + 1)) { + btree_trans_restart(trans); + return -EINTR; + } + closure_init_stack(&cl); /* @@ -1943,27 +1974,20 @@ int bch2_btree_node_update_key(struct bch_fs *c, struct btree_iter *iter, if (btree_ptr_hash_val(new_key) != b->hash_val) { ret = bch2_btree_cache_cannibalize_lock(c, &cl); if (ret) { - bch2_trans_unlock(iter->trans); + bch2_trans_unlock(trans); closure_sync(&cl); - if (!bch2_trans_relock(iter->trans)) + if (!bch2_trans_relock(trans)) return -EINTR; } new_hash = bch2_btree_node_mem_alloc(c); } - as = bch2_btree_update_start(iter, b->c.level, - parent ? btree_update_reserve_required(c, parent) : 0, - BTREE_INSERT_NOFAIL); - if (IS_ERR(as)) { - ret = PTR_ERR(as); - goto err; - } - - __bch2_btree_node_update_key(c, as, iter, b, new_hash, new_key); + path->intent_ref++; + ret = __bch2_btree_node_update_key(trans, iter, b, new_hash, + new_key, skip_triggers); + --path->intent_ref; - bch2_btree_iter_downgrade(iter); -err: if (new_hash) { mutex_lock(&c->btree_cache.lock); list_move(&new_hash->list, &c->btree_cache.freeable); @@ -1977,6 +2001,35 @@ err: return ret; } +int bch2_btree_node_update_key_get_iter(struct btree_trans *trans, + struct btree *b, struct bkey_i *new_key, + bool skip_triggers) +{ + struct btree_iter iter; + int ret; + + bch2_trans_node_iter_init(trans, &iter, b->c.btree_id, b->key.k.p, + BTREE_MAX_DEPTH, b->c.level, + BTREE_ITER_INTENT); + ret = bch2_btree_iter_traverse(&iter); + if (ret) + goto out; + + /* has node been freed? 
*/ + if (iter.path->l[b->c.level].b != b) { + /* node has been freed: */ + BUG_ON(!btree_node_dying(b)); + goto out; + } + + BUG_ON(!btree_node_hashed(b)); + + ret = bch2_btree_node_update_key(trans, &iter, b, new_key, skip_triggers); +out: + bch2_trans_iter_exit(trans, &iter); + return ret; +} + /* Init code: */ /* @@ -2012,7 +2065,7 @@ void bch2_btree_root_alloc(struct bch_fs *c, enum btree_id id) b->c.btree_id = id; bkey_btree_ptr_init(&b->key); - b->key.k.p = POS_MAX; + b->key.k.p = SPOS_MAX; *((u64 *) bkey_i_to_btree_ptr(&b->key)->v.start) = U64_MAX - id; bch2_bset_init_first(b, &b->data->keys); @@ -2020,7 +2073,7 @@ void bch2_btree_root_alloc(struct bch_fs *c, enum btree_id id) b->data->flags = 0; btree_set_min(b, POS_MIN); - btree_set_max(b, POS_MAX); + btree_set_max(b, SPOS_MAX); b->data->format = bch2_btree_calc_format(b); btree_node_set_format(b, b->data->format); diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h index 7ed67b47e1b9..8e03bd987d6d 100644 --- a/fs/bcachefs/btree_update_interior.h +++ b/fs/bcachefs/btree_update_interior.h @@ -113,60 +113,39 @@ struct btree_update { u64 inline_keys[BKEY_BTREE_PTR_U64s_MAX * 3]; }; -void bch2_btree_node_free_inmem(struct bch_fs *, struct btree *, - struct btree_iter *); -void bch2_btree_node_free_never_inserted(struct bch_fs *, struct btree *); - -void bch2_btree_update_get_open_buckets(struct btree_update *, struct btree *); - struct btree *__bch2_btree_node_alloc_replacement(struct btree_update *, struct btree *, struct bkey_format); -void bch2_btree_update_done(struct btree_update *); -struct btree_update * -bch2_btree_update_start(struct btree_iter *, unsigned, unsigned, unsigned); - -void bch2_btree_interior_update_will_free_node(struct btree_update *, - struct btree *); -void bch2_btree_update_add_new_node(struct btree_update *, struct btree *); - -void bch2_btree_insert_node(struct btree_update *, struct btree *, - struct btree_iter *, struct keylist *, - unsigned); -int bch2_btree_split_leaf(struct bch_fs *, struct btree_iter *, unsigned); +int bch2_btree_split_leaf(struct btree_trans *, struct btree_path *, unsigned); -int __bch2_foreground_maybe_merge(struct bch_fs *, struct btree_iter *, +int __bch2_foreground_maybe_merge(struct btree_trans *, struct btree_path *, unsigned, unsigned, enum btree_node_sibling); -static inline int bch2_foreground_maybe_merge_sibling(struct bch_fs *c, - struct btree_iter *iter, +static inline int bch2_foreground_maybe_merge_sibling(struct btree_trans *trans, + struct btree_path *path, unsigned level, unsigned flags, enum btree_node_sibling sib) { struct btree *b; - if (iter->uptodate >= BTREE_ITER_NEED_TRAVERSE) - return 0; - - if (!bch2_btree_node_relock(iter, level)) - return 0; + EBUG_ON(!btree_node_locked(path, level)); - b = iter->l[level].b; - if (b->sib_u64s[sib] > c->btree_foreground_merge_threshold) + b = path->l[level].b; + if (b->sib_u64s[sib] > trans->c->btree_foreground_merge_threshold) return 0; - return __bch2_foreground_maybe_merge(c, iter, level, flags, sib); + return __bch2_foreground_maybe_merge(trans, path, level, flags, sib); } -static inline int bch2_foreground_maybe_merge(struct bch_fs *c, - struct btree_iter *iter, - unsigned level, - unsigned flags) +static inline int bch2_foreground_maybe_merge(struct btree_trans *trans, + struct btree_path *path, + unsigned level, + unsigned flags) { - return bch2_foreground_maybe_merge_sibling(c, iter, level, flags, + return bch2_foreground_maybe_merge_sibling(trans, path, level, flags, 
btree_prev_sib) ?: - bch2_foreground_maybe_merge_sibling(c, iter, level, flags, + bch2_foreground_maybe_merge_sibling(trans, path, level, flags, btree_next_sib); } diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 0d566be7455e..112ac7caf579 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -15,6 +15,7 @@ #include "journal.h" #include "journal_reclaim.h" #include "keylist.h" +#include "subvolume.h" #include "replicas.h" #include <linux/prefetch.h> @@ -29,37 +30,59 @@ static inline int btree_insert_entry_cmp(const struct btree_insert_entry *l, bpos_cmp(l->k->k.p, r->k->k.p); } +static inline struct btree_path_level *insert_l(struct btree_insert_entry *i) +{ + return i->path->l + i->level; +} + static inline bool same_leaf_as_prev(struct btree_trans *trans, struct btree_insert_entry *i) { - return i != trans->updates2 && - iter_l(i[0].iter)->b == iter_l(i[-1].iter)->b; + return i != trans->updates && + insert_l(&i[0])->b == insert_l(&i[-1])->b; } -inline void bch2_btree_node_lock_for_insert(struct bch_fs *c, struct btree *b, - struct btree_iter *iter) +static inline bool same_leaf_as_next(struct btree_trans *trans, + struct btree_insert_entry *i) { - bch2_btree_node_lock_write(b, iter); + return i + 1 < trans->updates + trans->nr_updates && + insert_l(&i[0])->b == insert_l(&i[1])->b; +} - if (btree_iter_type(iter) == BTREE_ITER_CACHED) +static inline void bch2_btree_node_prep_for_write(struct btree_trans *trans, + struct btree_path *path, + struct btree *b) +{ + struct bch_fs *c = trans->c; + + if (path->cached) return; if (unlikely(btree_node_just_written(b)) && bch2_btree_post_write_cleanup(c, b)) - bch2_btree_iter_reinit_node(iter, b); + bch2_trans_node_reinit_iter(trans, b); /* * If the last bset has been written, or if it's gotten too big - start * a new bset to insert into: */ if (want_new_bset(c, b)) - bch2_btree_init_next(c, b, iter); + bch2_btree_init_next(trans, b); +} + +void bch2_btree_node_lock_for_insert(struct btree_trans *trans, + struct btree_path *path, + struct btree *b) +{ + bch2_btree_node_lock_write(trans, path, b); + bch2_btree_node_prep_for_write(trans, path, b); } /* Inserting into a given leaf node (last stage of insert): */ /* Handle overwrites and do insert, for non extents: */ -bool bch2_btree_bset_insert_key(struct btree_iter *iter, +bool bch2_btree_bset_insert_key(struct btree_trans *trans, + struct btree_path *path, struct btree *b, struct btree_node_iter *node_iter, struct bkey_i *insert) @@ -73,8 +96,7 @@ bool bch2_btree_bset_insert_key(struct btree_iter *iter, EBUG_ON(bpos_cmp(insert->k.p, b->data->min_key) < 0); EBUG_ON(bpos_cmp(insert->k.p, b->data->max_key) > 0); EBUG_ON(insert->k.u64s > - bch_btree_keys_u64s_remaining(iter->trans->c, b)); - EBUG_ON(iter->flags & BTREE_ITER_IS_EXTENTS); + bch_btree_keys_u64s_remaining(trans->c, b)); k = bch2_btree_node_iter_peek_all(node_iter, b); if (k && bkey_cmp_left_packed(b, k, &insert->k.p)) @@ -93,7 +115,7 @@ bool bch2_btree_bset_insert_key(struct btree_iter *iter, k->type = KEY_TYPE_deleted; if (k->needs_whiteout) - push_whiteout(iter->trans->c, b, insert->k.p); + push_whiteout(trans->c, b, insert->k.p); k->needs_whiteout = false; if (k >= btree_bset_last(b)->start) { @@ -101,7 +123,7 @@ bool bch2_btree_bset_insert_key(struct btree_iter *iter, bch2_bset_delete(b, k, clobber_u64s); goto fix_iter; } else { - bch2_btree_iter_fix_key_modified(iter, b, k); + bch2_btree_path_fix_key_modified(trans, b, k); } return true; @@ -119,7 +141,7 @@ bool 
bch2_btree_bset_insert_key(struct btree_iter *iter, clobber_u64s = k->u64s; goto overwrite; } else { - bch2_btree_iter_fix_key_modified(iter, b, k); + bch2_btree_path_fix_key_modified(trans, b, k); } } @@ -129,7 +151,7 @@ overwrite: new_u64s = k->u64s; fix_iter: if (clobber_u64s != new_u64s) - bch2_btree_node_iter_fix(iter, b, node_iter, k, + bch2_btree_node_iter_fix(trans, path, b, node_iter, k, clobber_u64s, new_u64s); return true; } @@ -173,22 +195,21 @@ inline void bch2_btree_add_journal_pin(struct bch_fs *c, * btree_insert_key_leaf - insert one key into a leaf node */ static bool btree_insert_key_leaf(struct btree_trans *trans, - struct btree_iter *iter, - struct bkey_i *insert) + struct btree_insert_entry *insert) { struct bch_fs *c = trans->c; - struct btree *b = iter_l(iter)->b; + struct btree *b = insert_l(insert)->b; struct bset_tree *t = bset_tree_last(b); struct bset *i = bset(b, t); int old_u64s = bset_u64s(t); int old_live_u64s = b->nr.live_u64s; int live_u64s_added, u64s_added; - EBUG_ON(!iter->level && + EBUG_ON(!insert->level && !test_bit(BCH_FS_BTREE_INTERIOR_REPLAY_DONE, &c->flags)); - if (unlikely(!bch2_btree_bset_insert_key(iter, b, - &iter_l(iter)->iter, insert))) + if (unlikely(!bch2_btree_bset_insert_key(trans, insert->path, b, + &insert_l(insert)->iter, insert->k))) return false; i->journal_seq = cpu_to_le64(max(trans->journal_res.seq, @@ -209,9 +230,8 @@ static bool btree_insert_key_leaf(struct btree_trans *trans, if (u64s_added > live_u64s_added && bch2_maybe_compact_whiteouts(c, b)) - bch2_btree_iter_reinit_node(iter, b); + bch2_trans_node_reinit_iter(trans, b); - trace_btree_insert_key(c, b, insert); return true; } @@ -222,9 +242,15 @@ static bool btree_insert_key_leaf(struct btree_trans *trans, static inline void btree_insert_entry_checks(struct btree_trans *trans, struct btree_insert_entry *i) { - BUG_ON(!i->is_extent && bpos_cmp(i->k->k.p, i->iter->real_pos)); - BUG_ON(i->level != i->iter->level); - BUG_ON(i->btree_id != i->iter->btree_id); + BUG_ON(bpos_cmp(i->k->k.p, i->path->pos)); + BUG_ON(i->cached != i->path->cached); + BUG_ON(i->level != i->path->level); + BUG_ON(i->btree_id != i->path->btree_id); + EBUG_ON(!i->level && + !(i->flags & BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) && + test_bit(JOURNAL_REPLAY_DONE, &trans->c->journal.flags) && + i->k->k.p.snapshot && + bch2_snapshot_internal_node(trans->c, i->k->k.p.snapshot)); } static noinline int @@ -264,13 +290,12 @@ static inline int bch2_trans_journal_res_get(struct btree_trans *trans, return ret == -EAGAIN ? 
BTREE_INSERT_NEED_JOURNAL_RES : ret; } -static enum btree_insert_ret +static inline enum btree_insert_ret btree_key_can_insert(struct btree_trans *trans, - struct btree_iter *iter, + struct btree *b, unsigned u64s) { struct bch_fs *c = trans->c; - struct btree *b = iter_l(iter)->b; if (!bch2_btree_node_insert_fits(c, b, u64s)) return BTREE_INSERT_BTREE_NODE_FULL; @@ -280,14 +305,14 @@ btree_key_can_insert(struct btree_trans *trans, static enum btree_insert_ret btree_key_can_insert_cached(struct btree_trans *trans, - struct btree_iter *iter, + struct btree_path *path, unsigned u64s) { - struct bkey_cached *ck = (void *) iter->l[0].b; + struct bkey_cached *ck = (void *) path->l[0].b; unsigned new_u64s; struct bkey_i *new_k; - BUG_ON(iter->level); + EBUG_ON(path->level); if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags) && bch2_btree_key_cache_must_wait(trans->c) && @@ -325,9 +350,9 @@ static inline void do_btree_insert_one(struct btree_trans *trans, i->k->k.needs_whiteout = false; - did_work = (btree_iter_type(i->iter) != BTREE_ITER_CACHED) - ? btree_insert_key_leaf(trans, i->iter, i->k) - : bch2_btree_insert_key_cached(trans, i->iter, i->k); + did_work = !i->cached + ? btree_insert_key_leaf(trans, i) + : bch2_btree_insert_key_cached(trans, i->path, i->k); if (!did_work) return; @@ -337,19 +362,11 @@ static inline void do_btree_insert_one(struct btree_trans *trans, i->level, i->k); - bch2_journal_set_has_inode(j, &trans->journal_res, - i->k->k.p.inode); - if (trans->journal_seq) *trans->journal_seq = trans->journal_res.seq; } } -static noinline void bch2_btree_iter_unlock_noinline(struct btree_iter *iter) -{ - __bch2_btree_iter_unlock(iter); -} - static noinline void bch2_trans_mark_gc(struct btree_trans *trans) { struct bch_fs *c = trans->c; @@ -358,12 +375,13 @@ static noinline void bch2_trans_mark_gc(struct btree_trans *trans) trans_for_each_update(trans, i) { /* * XXX: synchronization of cached update triggers with gc + * XXX: synchronization of interior node updates with gc */ - BUG_ON(btree_iter_type(i->iter) == BTREE_ITER_CACHED); + BUG_ON(i->cached || i->level); - if (gc_visited(c, gc_pos_btree_node(i->iter->l[0].b))) - bch2_mark_update(trans, i->iter, i->k, NULL, - i->trigger_flags|BTREE_TRIGGER_GC); + if (gc_visited(c, gc_pos_btree_node(insert_l(i)->b))) + bch2_mark_update(trans, i->path, i->k, + i->flags|BTREE_TRIGGER_GC); } } @@ -381,6 +399,7 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, if (race_fault()) { trace_trans_restart_fault_inject(trans->ip, trace_ip); + trans->restarted = true; return -EINTR; } @@ -400,15 +419,15 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, h = h->next; } - trans_for_each_update2(trans, i) { + trans_for_each_update(trans, i) { /* Multiple inserts might go to same leaf: */ if (!same_leaf_as_prev(trans, i)) u64s = 0; u64s += i->k->k.u64s; - ret = btree_iter_type(i->iter) != BTREE_ITER_CACHED - ? btree_key_can_insert(trans, i->iter, u64s) - : btree_key_can_insert_cached(trans, i->iter, u64s); + ret = !i->cached + ? 
btree_key_can_insert(trans, insert_l(i)->b, u64s) + : btree_key_can_insert_cached(trans, i->path, u64s); if (ret) { *stopped_at = i; return ret; @@ -458,17 +477,16 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, if (!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY)) { if (bch2_journal_seq_verify) - trans_for_each_update2(trans, i) + trans_for_each_update(trans, i) i->k->k.version.lo = trans->journal_res.seq; else if (bch2_inject_invalid_keys) - trans_for_each_update2(trans, i) + trans_for_each_update(trans, i) i->k->k.version = MAX_VERSION; } trans_for_each_update(trans, i) if (BTREE_NODE_TYPE_HAS_MEM_TRIGGERS & (1U << i->bkey_type)) - bch2_mark_update(trans, i->iter, i->k, - NULL, i->trigger_flags); + bch2_mark_update(trans, i->path, i->k, i->flags); if (marking && trans->fs_usage_deltas) bch2_trans_fs_usage_apply(trans, trans->fs_usage_deltas); @@ -476,7 +494,7 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, if (unlikely(c->gc_pos.phase)) bch2_trans_mark_gc(trans); - trans_for_each_update2(trans, i) + trans_for_each_update(trans, i) do_btree_insert_one(trans, i); err: if (marking) { @@ -486,41 +504,106 @@ err: return ret; } -static noinline int maybe_do_btree_merge(struct btree_trans *trans, struct btree_iter *iter) +static inline void path_upgrade_readers(struct btree_trans *trans, struct btree_path *path) { - struct btree_insert_entry *i; - struct btree *b = iter_l(iter)->b; - struct bkey_s_c old; - int u64s_delta = 0; - int ret; + unsigned l; - /* - * Inserting directly into interior nodes is an uncommon operation with - * various weird edge cases: also, a lot of things about - * BTREE_ITER_NODES iters need to be audited - */ - if (unlikely(btree_iter_type(iter) != BTREE_ITER_KEYS)) - return 0; + for (l = 0; l < BTREE_MAX_DEPTH; l++) + if (btree_node_read_locked(path, l)) + BUG_ON(!bch2_btree_node_upgrade(trans, path, l)); +} - BUG_ON(iter->level); +static inline void upgrade_readers(struct btree_trans *trans, struct btree_path *path) +{ + struct btree *b = path_l(path)->b; - trans_for_each_update2(trans, i) { - if (iter_l(i->iter)->b != b) + do { + if (path->nodes_locked && + path->nodes_locked != path->nodes_intent_locked) + path_upgrade_readers(trans, path); + } while ((path = prev_btree_path(trans, path)) && + path_l(path)->b == b); +} + +/* + * Check for nodes that we have both read and intent locks on, and upgrade the + * readers to intent: + */ +static inline void normalize_read_intent_locks(struct btree_trans *trans) +{ + struct btree_path *path; + unsigned i, nr_read = 0, nr_intent = 0; + + trans_for_each_path_inorder(trans, path, i) { + struct btree_path *next = i + 1 < trans->nr_sorted + ? 
trans->paths + trans->sorted[i + 1] + : NULL; + + if (path->nodes_locked) { + if (path->nodes_intent_locked) + nr_intent++; + else + nr_read++; + } + + if (!next || path_l(path)->b != path_l(next)->b) { + if (nr_read && nr_intent) + upgrade_readers(trans, path); + + nr_read = nr_intent = 0; + } + } + + bch2_trans_verify_locks(trans); +} + +static inline bool have_conflicting_read_lock(struct btree_trans *trans, struct btree_path *pos) +{ + struct btree_path *path; + unsigned i; + + trans_for_each_path_inorder(trans, path, i) { + //if (path == pos) + // break; + + if (path->nodes_locked != path->nodes_intent_locked && + !bch2_btree_path_upgrade(trans, path, path->level + 1)) + return true; + } + + return false; +} + +static inline int trans_lock_write(struct btree_trans *trans) +{ + struct btree_insert_entry *i; + + trans_for_each_update(trans, i) { + if (same_leaf_as_prev(trans, i)) continue; - old = bch2_btree_iter_peek_slot(i->iter); - ret = bkey_err(old); - if (ret) - return ret; + if (!six_trylock_write(&insert_l(i)->b->c.lock)) { + if (have_conflicting_read_lock(trans, i->path)) + goto fail; - u64s_delta += !bkey_deleted(&i->k->k) ? i->k->k.u64s : 0; - u64s_delta -= !bkey_deleted(old.k) ? old.k->u64s : 0; + __btree_node_lock_type(trans->c, insert_l(i)->b, + SIX_LOCK_write); + } + + bch2_btree_node_prep_for_write(trans, i->path, insert_l(i)->b); } - return u64s_delta <= 0 - ? (bch2_foreground_maybe_merge(trans->c, iter, iter->level, - trans->flags & ~BTREE_INSERT_NOUNLOCK) ?: -EINTR) - : 0; + return 0; +fail: + while (--i >= trans->updates) { + if (same_leaf_as_prev(trans, i)) + continue; + + bch2_btree_node_unlock_write_inlined(trans, i->path, insert_l(i)->b); + } + + trace_trans_restart_would_deadlock_write(trans->ip); + return btree_trans_restart(trans); } /* @@ -532,29 +615,55 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, { struct bch_fs *c = trans->c; struct btree_insert_entry *i; - struct btree_iter *iter; - int ret; + struct bkey_s_c old; + int ret, u64s_delta = 0; - trans_for_each_update2(trans, i) { - struct btree *b; + trans_for_each_update(trans, i) { + const char *invalid = bch2_bkey_invalid(c, + bkey_i_to_s_c(i->k), i->bkey_type); + if (invalid) { + char buf[200]; - BUG_ON(!btree_node_intent_locked(i->iter, i->level)); + bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(i->k)); + bch_err(c, "invalid bkey %s on insert from %ps -> %ps: %s\n", + buf, (void *) trans->ip, + (void *) i->ip_allocated, invalid); + bch2_fatal_error(c); + return -EINVAL; + } + btree_insert_entry_checks(trans, i); + } + + trans_for_each_update(trans, i) { + struct bkey u; - if (btree_iter_type(i->iter) == BTREE_ITER_CACHED) + /* + * peek_slot() doesn't yet work on iterators that point to + * interior nodes: + */ + if (i->cached || i->level) continue; - b = iter_l(i->iter)->b; - if (b->sib_u64s[0] < c->btree_foreground_merge_threshold || - b->sib_u64s[1] < c->btree_foreground_merge_threshold) { - ret = maybe_do_btree_merge(trans, i->iter); - if (unlikely(ret)) - return ret; + old = bch2_btree_path_peek_slot(i->path, &u); + ret = bkey_err(old); + if (unlikely(ret)) + return ret; + + u64s_delta += !bkey_deleted(&i->k->k) ? i->k->k.u64s : 0; + u64s_delta -= !bkey_deleted(old.k) ? 
old.k->u64s : 0; + + if (!same_leaf_as_next(trans, i)) { + if (u64s_delta <= 0) { + ret = bch2_foreground_maybe_merge(trans, i->path, + i->level, trans->flags); + if (unlikely(ret)) + return ret; + } + + u64s_delta = 0; } } - trans_for_each_update2(trans, i) - BUG_ON(!btree_node_intent_locked(i->iter, i->level)); - ret = bch2_journal_preres_get(&c->journal, &trans->journal_preres, trans->journal_preres_u64s, JOURNAL_RES_GET_NONBLOCK| @@ -566,57 +675,18 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, if (unlikely(ret)) return ret; - /* - * Can't be holding any read locks when we go to take write locks: - * another thread could be holding an intent lock on the same node we - * have a read lock on, and it'll block trying to take a write lock - * (because we hold a read lock) and it could be blocking us by holding - * its own read lock (while we're trying to to take write locks). - * - * note - this must be done after bch2_trans_journal_preres_get_cold() - * or anything else that might call bch2_trans_relock(), since that - * would just retake the read locks: - */ - trans_for_each_iter(trans, iter) { - if (iter->nodes_locked != iter->nodes_intent_locked) { - if (btree_iter_keep(trans, iter)) { - if (!bch2_btree_iter_upgrade(iter, 1)) { - trace_trans_restart_upgrade(trans->ip, trace_ip, - iter->btree_id, - &iter->real_pos); - return -EINTR; - } - } else { - bch2_btree_iter_unlock_noinline(iter); - } - } - } - - trans_for_each_update2(trans, i) { - const char *invalid = bch2_bkey_invalid(c, - bkey_i_to_s_c(i->k), i->bkey_type); - if (invalid) { - char buf[200]; - - bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(i->k)); - bch_err(c, "invalid bkey %s on insert: %s\n", buf, invalid); - bch2_fatal_error(c); - } - btree_insert_entry_checks(trans, i); - } - bch2_btree_trans_verify_locks(trans); + normalize_read_intent_locks(trans); - trans_for_each_update2(trans, i) - if (!same_leaf_as_prev(trans, i)) - bch2_btree_node_lock_for_insert(c, - iter_l(i->iter)->b, i->iter); + ret = trans_lock_write(trans); + if (unlikely(ret)) + return ret; ret = bch2_trans_commit_write_locked(trans, stopped_at, trace_ip); - trans_for_each_update2(trans, i) + trans_for_each_update(trans, i) if (!same_leaf_as_prev(trans, i)) - bch2_btree_node_unlock_write_inlined(iter_l(i->iter)->b, - i->iter); + bch2_btree_node_unlock_write_inlined(trans, i->path, + insert_l(i)->b); if (!ret && trans->journal_pin) bch2_journal_pin_add(&c->journal, trans->journal_res.seq, @@ -652,60 +722,23 @@ int bch2_trans_commit_error(struct btree_trans *trans, int ret, unsigned long trace_ip) { struct bch_fs *c = trans->c; - unsigned flags = trans->flags; - - /* - * BTREE_INSERT_NOUNLOCK means don't unlock _after_ successful btree - * update; if we haven't done anything yet it doesn't apply - */ - flags &= ~BTREE_INSERT_NOUNLOCK; switch (ret) { case BTREE_INSERT_BTREE_NODE_FULL: - ret = bch2_btree_split_leaf(c, i->iter, flags); - - /* - * if the split succeeded without dropping locks the insert will - * still be atomic (what the caller peeked() and is overwriting - * won't have changed) - */ -#if 0 - /* - * XXX: - * split -> btree node merging (of parent node) might still drop - * locks when we're not passing it BTREE_INSERT_NOUNLOCK - * - * we don't want to pass BTREE_INSERT_NOUNLOCK to split as that - * will inhibit merging - but we don't have a reliable way yet - * (do we?) 
of checking if we dropped locks in this path - */ + ret = bch2_btree_split_leaf(trans, i->path, trans->flags); if (!ret) - goto retry; -#endif + return 0; - /* - * don't care if we got ENOSPC because we told split it - * couldn't block: - */ - if (!ret || - ret == -EINTR || - (flags & BTREE_INSERT_NOUNLOCK)) { + if (ret == -EINTR) trace_trans_restart_btree_node_split(trans->ip, trace_ip, - i->iter->btree_id, - &i->iter->real_pos); - ret = -EINTR; - } - break; - case BTREE_INSERT_ENOSPC: - BUG_ON(flags & BTREE_INSERT_NOFAIL); - ret = -ENOSPC; + i->btree_id, &i->path->pos); break; case BTREE_INSERT_NEED_MARK_REPLICAS: bch2_trans_unlock(trans); ret = bch2_replicas_delta_list_mark(c, trans->fs_usage_deltas); if (ret) - return ret; + break; if (bch2_trans_relock(trans)) return 0; @@ -717,12 +750,15 @@ int bch2_trans_commit_error(struct btree_trans *trans, bch2_trans_unlock(trans); if ((trans->flags & BTREE_INSERT_JOURNAL_RECLAIM) && - !(trans->flags & BTREE_INSERT_JOURNAL_RESERVED)) - return -EAGAIN; + !(trans->flags & BTREE_INSERT_JOURNAL_RESERVED)) { + trans->restarted = true; + ret = -EAGAIN; + break; + } ret = bch2_trans_journal_res_get(trans, JOURNAL_RES_GET_CHECK); if (ret) - return ret; + break; if (bch2_trans_relock(trans)) return 0; @@ -738,7 +774,7 @@ int bch2_trans_commit_error(struct btree_trans *trans, wait_event_freezable(c->journal.reclaim_wait, (ret = journal_reclaim_wait_done(c))); if (ret < 0) - return ret; + break; if (bch2_trans_relock(trans)) return 0; @@ -751,7 +787,8 @@ int bch2_trans_commit_error(struct btree_trans *trans, break; } - BUG_ON(ret == -ENOSPC && (flags & BTREE_INSERT_NOFAIL)); + BUG_ON((ret == -EINTR || ret == -EAGAIN) && !trans->restarted); + BUG_ON(ret == -ENOSPC && (trans->flags & BTREE_INSERT_NOFAIL)); return ret; } @@ -771,150 +808,124 @@ bch2_trans_commit_get_rw_cold(struct btree_trans *trans) if (ret) return ret; - percpu_ref_get(&c->writes); - return 0; -} - -static void __bch2_trans_update2(struct btree_trans *trans, - struct btree_insert_entry n) -{ - struct btree_insert_entry *i; - - btree_insert_entry_checks(trans, &n); - - EBUG_ON(trans->nr_updates2 >= BTREE_ITER_MAX); - - n.iter->flags |= BTREE_ITER_KEEP_UNTIL_COMMIT; - - trans_for_each_update2(trans, i) - if (btree_insert_entry_cmp(&n, i) <= 0) - break; - - if (i < trans->updates2 + trans->nr_updates2 && - !btree_insert_entry_cmp(&n, i)) - *i = n; - else - array_insert_item(trans->updates2, trans->nr_updates2, - i - trans->updates2, n); -} - -static void bch2_trans_update2(struct btree_trans *trans, - struct btree_iter *iter, - struct bkey_i *insert) -{ - __bch2_trans_update2(trans, (struct btree_insert_entry) { - .bkey_type = __btree_node_type(iter->level, iter->btree_id), - .btree_id = iter->btree_id, - .level = iter->level, - .iter = iter, - .k = insert, - }); -} - -static int extent_update_to_keys(struct btree_trans *trans, - struct btree_insert_entry n) -{ - int ret; - - ret = bch2_extent_can_insert(trans, n.iter, n.k); - if (ret) - return ret; - - if (bkey_deleted(&n.k->k)) - return 0; - - n.iter = bch2_trans_get_iter(trans, n.iter->btree_id, n.k->k.p, - BTREE_ITER_INTENT| - BTREE_ITER_NOT_EXTENTS); - n.is_extent = false; + if (!bch2_trans_relock(trans)) + return -EINTR; - __bch2_trans_update2(trans, n); - bch2_trans_iter_put(trans, n.iter); + percpu_ref_get(&c->writes); return 0; } -static int extent_handle_overwrites(struct btree_trans *trans, - enum btree_id btree_id, - struct bkey_i *insert) +static int bch2_trans_commit_run_triggers(struct btree_trans *trans) { - struct btree_iter 
*iter, *update_iter; - struct bpos start = bkey_start_pos(&insert->k); - struct bkey_i *update; - struct bkey_s_c k; + struct bkey _deleted = KEY(0, 0, 0); + struct bkey_s_c deleted = (struct bkey_s_c) { &_deleted, NULL }; + struct bkey_s_c old; + struct bkey unpacked; + struct btree_insert_entry *i = NULL, *btree_id_start = trans->updates; + bool trans_trigger_run; + unsigned btree_id = 0; int ret = 0; - iter = bch2_trans_get_iter(trans, btree_id, start, - BTREE_ITER_INTENT); - k = bch2_btree_iter_peek_with_updates(iter); + /* + * + * For a given btree, this algorithm runs insert triggers before + * overwrite triggers: this is so that when extents are being moved + * (e.g. by FALLOCATE_FL_INSERT_RANGE), we don't drop references before + * they are re-added. + */ + for (btree_id = 0; btree_id < BTREE_ID_NR; btree_id++) { + while (btree_id_start < trans->updates + trans->nr_updates && + btree_id_start->btree_id < btree_id) + btree_id_start++; - while (k.k && !(ret = bkey_err(k))) { - if (bkey_cmp(insert->k.p, bkey_start_pos(k.k)) <= 0) - break; + /* + * Running triggers will append more updates to the list of updates as + * we're walking it: + */ + do { + trans_trigger_run = false; - if (bkey_cmp(bkey_start_pos(k.k), start) < 0) { - update = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); - if ((ret = PTR_ERR_OR_ZERO(update))) - break; + for (i = btree_id_start; + i < trans->updates + trans->nr_updates && i->btree_id <= btree_id; + i++) { + if (i->insert_trigger_run || + (i->flags & BTREE_TRIGGER_NORUN) || + !(BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS & (1U << i->bkey_type))) + continue; - bkey_reassemble(update, k); + BUG_ON(i->overwrite_trigger_run); - bch2_cut_back(start, update); + i->insert_trigger_run = true; + trans_trigger_run = true; - update_iter = bch2_trans_get_iter(trans, btree_id, update->k.p, - BTREE_ITER_NOT_EXTENTS| - BTREE_ITER_INTENT); - bch2_trans_update2(trans, update_iter, update); - bch2_trans_iter_put(trans, update_iter); - } + old = bch2_btree_path_peek_slot(i->path, &unpacked); + _deleted.p = i->path->pos; + + if (old.k->type == i->k->k.type && + ((1U << old.k->type) & BTREE_TRIGGER_WANTS_OLD_AND_NEW)) { + i->overwrite_trigger_run = true; + ret = bch2_trans_mark_key(trans, old, bkey_i_to_s_c(i->k), + BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE|i->flags); + } else { + ret = bch2_trans_mark_key(trans, deleted, bkey_i_to_s_c(i->k), + BTREE_TRIGGER_INSERT|i->flags); + } - if (bkey_cmp(k.k->p, insert->k.p) < 0 || - (!bkey_cmp(k.k->p, insert->k.p) && bkey_deleted(&insert->k))) { - update = bch2_trans_kmalloc(trans, sizeof(struct bkey)); - if ((ret = PTR_ERR_OR_ZERO(update))) - break; + if (ret == -EINTR) + trace_trans_restart_mark(trans->ip, _RET_IP_, + i->btree_id, &i->path->pos); + if (ret) + return ret; + } + } while (trans_trigger_run); - bkey_init(&update->k); - update->k.p = k.k->p; + do { + trans_trigger_run = false; - update_iter = bch2_trans_get_iter(trans, btree_id, update->k.p, - BTREE_ITER_NOT_EXTENTS| - BTREE_ITER_INTENT); - bch2_trans_update2(trans, update_iter, update); - bch2_trans_iter_put(trans, update_iter); - } + for (i = btree_id_start; + i < trans->updates + trans->nr_updates && i->btree_id <= btree_id; + i++) { + if (i->overwrite_trigger_run || + (i->flags & BTREE_TRIGGER_NORUN) || + !(BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS & (1U << i->bkey_type))) + continue; - if (bkey_cmp(k.k->p, insert->k.p) > 0) { - update = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); - if ((ret = PTR_ERR_OR_ZERO(update))) - break; + BUG_ON(!i->insert_trigger_run); - bkey_reassemble(update, 
k); - bch2_cut_front(insert->k.p, update); + i->overwrite_trigger_run = true; + trans_trigger_run = true; - update_iter = bch2_trans_get_iter(trans, btree_id, update->k.p, - BTREE_ITER_NOT_EXTENTS| - BTREE_ITER_INTENT); - bch2_trans_update2(trans, update_iter, update); - bch2_trans_iter_put(trans, update_iter); - break; - } + old = bch2_btree_path_peek_slot(i->path, &unpacked); + _deleted.p = i->path->pos; - k = bch2_btree_iter_next_with_updates(iter); + ret = bch2_trans_mark_key(trans, old, deleted, + BTREE_TRIGGER_OVERWRITE|i->flags); + + if (ret == -EINTR) + trace_trans_restart_mark(trans->ip, _RET_IP_, + i->btree_id, &i->path->pos); + if (ret) + return ret; + } + } while (trans_trigger_run); } - bch2_trans_iter_put(trans, iter); - return ret; + trans_for_each_update(trans, i) + BUG_ON(!(i->flags & BTREE_TRIGGER_NORUN) && + (BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS & (1U << i->bkey_type)) && + (!i->insert_trigger_run || !i->overwrite_trigger_run)); + + return 0; } int __bch2_trans_commit(struct btree_trans *trans) { struct btree_insert_entry *i = NULL; - struct btree_iter *iter; - bool trans_trigger_run; - unsigned u64s, reset_flags = 0; + unsigned u64s; int ret = 0; - if (!trans->nr_updates) + if (!trans->nr_updates && + !trans->extra_journal_entry_u64s) goto out_reset; if (trans->flags & BTREE_INSERT_GC_LOCK_HELD) @@ -933,106 +944,80 @@ int __bch2_trans_commit(struct btree_trans *trans) } #ifdef CONFIG_BCACHEFS_DEBUG + /* + * if BTREE_TRIGGER_NORUN is set, it means we're probably being called + * from the key cache flush code: + */ trans_for_each_update(trans, i) - if (btree_iter_type(i->iter) != BTREE_ITER_CACHED && - !(i->trigger_flags & BTREE_TRIGGER_NORUN)) + if (!i->cached && + !(i->flags & BTREE_TRIGGER_NORUN)) bch2_btree_key_cache_verify_clean(trans, i->btree_id, i->k->k.p); #endif - /* - * Running triggers will append more updates to the list of updates as - * we're walking it: - */ - do { - trans_trigger_run = false; - - trans_for_each_update(trans, i) { - if ((BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS & (1U << i->bkey_type)) && - !i->trans_triggers_run) { - i->trans_triggers_run = true; - trans_trigger_run = true; - - ret = bch2_trans_mark_update(trans, i->iter, i->k, - i->trigger_flags); - if (unlikely(ret)) { - if (ret == -EINTR) - trace_trans_restart_mark(trans->ip, _RET_IP_, - i->iter->btree_id, - &i->iter->pos); - goto out; - } - } - } - } while (trans_trigger_run); - - /* Turn extents updates into keys: */ - trans_for_each_update(trans, i) - if (i->is_extent) { - ret = extent_handle_overwrites(trans, i->btree_id, i->k); - if (unlikely(ret)) - goto out; - } + ret = bch2_trans_commit_run_triggers(trans); + if (ret) + goto out; trans_for_each_update(trans, i) { - ret = i->is_extent - ? 
extent_update_to_keys(trans, *i) - : (__bch2_trans_update2(trans, *i), 0); - if (unlikely(ret)) - goto out; - } - - trans_for_each_update2(trans, i) { - ret = bch2_btree_iter_traverse(i->iter); - if (unlikely(ret)) { - trace_trans_restart_traverse(trans->ip, _RET_IP_, - i->iter->btree_id, - &i->iter->pos); - goto out; - } + BUG_ON(!i->path->should_be_locked); - if (unlikely(!bch2_btree_iter_upgrade(i->iter, i->level + 1))) { + if (unlikely(!bch2_btree_path_upgrade(trans, i->path, i->level + 1))) { trace_trans_restart_upgrade(trans->ip, _RET_IP_, - i->iter->btree_id, - &i->iter->pos); - ret = -EINTR; + i->btree_id, &i->path->pos); + ret = btree_trans_restart(trans); goto out; } - BUG_ON(!btree_node_intent_locked(i->iter, i->level)); + BUG_ON(!btree_node_intent_locked(i->path, i->level)); u64s = jset_u64s(i->k->k.u64s); - if (btree_iter_type(i->iter) == BTREE_ITER_CACHED && + if (i->cached && likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))) trans->journal_preres_u64s += u64s; trans->journal_u64s += u64s; } + + if (trans->extra_journal_res) { + ret = bch2_disk_reservation_add(trans->c, trans->disk_res, + trans->extra_journal_res, + (trans->flags & BTREE_INSERT_NOFAIL) + ? BCH_DISK_RESERVATION_NOFAIL : 0); + if (ret) + goto err; + } retry: + BUG_ON(trans->restarted); memset(&trans->journal_res, 0, sizeof(trans->journal_res)); ret = do_bch2_trans_commit(trans, &i, _RET_IP_); /* make sure we didn't drop or screw up locks: */ - bch2_btree_trans_verify_locks(trans); + bch2_trans_verify_locks(trans); if (ret) goto err; - - trans_for_each_iter(trans, iter) - if (btree_iter_live(trans, iter) && - (iter->flags & BTREE_ITER_SET_POS_AFTER_COMMIT)) - bch2_btree_iter_set_pos(iter, iter->pos_after_commit); out: bch2_journal_preres_put(&trans->c->journal, &trans->journal_preres); if (likely(!(trans->flags & BTREE_INSERT_NOCHECK_RW))) percpu_ref_put(&trans->c->writes); out_reset: - if (!ret) - reset_flags |= TRANS_RESET_NOTRAVERSE; - if (!ret && (trans->flags & BTREE_INSERT_NOUNLOCK)) - reset_flags |= TRANS_RESET_NOUNLOCK; - bch2_trans_reset(trans, reset_flags); + trans_for_each_update(trans, i) + bch2_path_put(trans, i->path, true); + + trans->extra_journal_res = 0; + trans->nr_updates = 0; + trans->hooks = NULL; + trans->extra_journal_entries = NULL; + trans->extra_journal_entry_u64s = 0; + + if (trans->fs_usage_deltas) { + trans->fs_usage_deltas->used = 0; + memset(&trans->fs_usage_deltas->memset_start, 0, + (void *) &trans->fs_usage_deltas->memset_end - + (void *) &trans->fs_usage_deltas->memset_start); + } return ret; err: @@ -1043,125 +1028,363 @@ err: goto retry; } -int bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter, - struct bkey_i *k, enum btree_trigger_flags flags) +static int check_pos_snapshot_overwritten(struct btree_trans *trans, + enum btree_id id, + struct bpos pos) { - struct btree_insert_entry *i, n = (struct btree_insert_entry) { - .trigger_flags = flags, - .bkey_type = __btree_node_type(iter->level, iter->btree_id), - .btree_id = iter->btree_id, - .level = iter->level, - .is_extent = (iter->flags & BTREE_ITER_IS_EXTENTS) != 0, - .iter = iter, - .k = k - }; + struct bch_fs *c = trans->c; + struct btree_iter iter; + struct bkey_s_c k; + int ret; - BUG_ON(trans->nr_updates >= BTREE_ITER_MAX); + if (!snapshot_t(c, pos.snapshot)->children[0]) + return 0; -#ifdef CONFIG_BCACHEFS_DEBUG - BUG_ON(bkey_cmp(iter->pos, - n.is_extent ? 
bkey_start_pos(&k->k) : k->k.p)); + bch2_trans_iter_init(trans, &iter, id, pos, + BTREE_ITER_NOT_EXTENTS| + BTREE_ITER_ALL_SNAPSHOTS); + while (1) { + k = bch2_btree_iter_prev(&iter); + ret = bkey_err(k); + if (ret) + break; - trans_for_each_update(trans, i) { - BUG_ON(bkey_cmp(i->iter->pos, - i->is_extent ? bkey_start_pos(&i->k->k) : i->k->k.p)); + if (!k.k) + break; - BUG_ON(i != trans->updates && - btree_insert_entry_cmp(i - 1, i) >= 0); + if (bkey_cmp(pos, k.k->p)) + break; + + if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, pos.snapshot)) { + ret = 1; + break; + } } -#endif + bch2_trans_iter_exit(trans, &iter); + + return ret; +} + +static int bch2_trans_update_extent(struct btree_trans *trans, + struct btree_iter *orig_iter, + struct bkey_i *insert, + enum btree_update_flags flags) +{ + struct bch_fs *c = trans->c; + struct btree_iter iter, update_iter; + struct bpos start = bkey_start_pos(&insert->k); + struct bkey_i *update; + struct bkey_s_c k; + enum btree_id btree_id = orig_iter->btree_id; + int ret = 0, compressed_sectors; + + bch2_trans_iter_init(trans, &iter, btree_id, start, + BTREE_ITER_INTENT| + BTREE_ITER_WITH_UPDATES| + BTREE_ITER_NOT_EXTENTS); + k = bch2_btree_iter_peek(&iter); + if ((ret = bkey_err(k))) + goto err; + if (!k.k) + goto out; + + if (bch2_bkey_maybe_mergable(k.k, &insert->k)) { + /* + * We can't merge extents if they belong to interior snapshot + * tree nodes, and there's a snapshot in which one extent is + * visible and the other is not - i.e. if visibility is + * different. + * + * Instead of checking if visibility of the two extents is + * different, for now we just check if either has been + * overwritten: + */ + ret = check_pos_snapshot_overwritten(trans, btree_id, insert->k.p); + if (ret < 0) + goto err; + if (ret) + goto nomerge1; + + ret = check_pos_snapshot_overwritten(trans, btree_id, k.k->p); + if (ret < 0) + goto err; + if (ret) + goto nomerge1; + + update = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); + if ((ret = PTR_ERR_OR_ZERO(update))) + goto err; - iter->flags |= BTREE_ITER_KEEP_UNTIL_COMMIT; + bkey_reassemble(update, k); - if (n.is_extent) { - iter->pos_after_commit = k->k.p; - iter->flags |= BTREE_ITER_SET_POS_AFTER_COMMIT; + if (bch2_bkey_merge(c, bkey_i_to_s(update), bkey_i_to_s_c(insert))) { + ret = bch2_btree_delete_at(trans, &iter, flags); + if (ret) + goto err; + + insert = update; + goto next; + } } +nomerge1: + ret = 0; + if (!bkey_cmp(k.k->p, start)) + goto next; - /* - * Pending updates are kept sorted: first, find position of new update, - * then delete/trim any updates the new update overwrites: - */ - if (!n.is_extent) { - trans_for_each_update(trans, i) - if (btree_insert_entry_cmp(&n, i) <= 0) - break; + while (bkey_cmp(insert->k.p, bkey_start_pos(k.k)) > 0) { + bool front_split = bkey_cmp(bkey_start_pos(k.k), start) < 0; + bool back_split = bkey_cmp(k.k->p, insert->k.p) > 0; - if (i < trans->updates + trans->nr_updates && - !btree_insert_entry_cmp(&n, i)) - *i = n; - else - array_insert_item(trans->updates, trans->nr_updates, - i - trans->updates, n); - } else { - trans_for_each_update(trans, i) - if (btree_insert_entry_cmp(&n, i) < 0) - break; + /* + * If we're going to be splitting a compressed extent, note it + * so that __bch2_trans_commit() can increase our disk + * reservation: + */ + if (((front_split && back_split) || + ((front_split || back_split) && k.k->p.snapshot != insert->k.p.snapshot)) && + (compressed_sectors = bch2_bkey_sectors_compressed(k))) + trans->extra_journal_res += compressed_sectors; - while 
(i > trans->updates && - i[-1].btree_id == n.btree_id && - bkey_cmp(bkey_start_pos(&n.k->k), - bkey_start_pos(&i[-1].k->k)) <= 0) { - --i; - array_remove_item(trans->updates, trans->nr_updates, - i - trans->updates); + if (front_split) { + update = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); + if ((ret = PTR_ERR_OR_ZERO(update))) + goto err; + + bkey_reassemble(update, k); + + bch2_cut_back(start, update); + + bch2_trans_iter_init(trans, &update_iter, btree_id, update->k.p, + BTREE_ITER_NOT_EXTENTS| + BTREE_ITER_ALL_SNAPSHOTS| + BTREE_ITER_INTENT); + ret = bch2_btree_iter_traverse(&update_iter) ?: + bch2_trans_update(trans, &update_iter, update, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE| + flags); + bch2_trans_iter_exit(trans, &update_iter); + + if (ret) + goto err; } - if (i > trans->updates && - i[-1].btree_id == n.btree_id && - bkey_cmp(bkey_start_pos(&n.k->k), i[-1].k->k.p) < 0) - bch2_cut_back(bkey_start_pos(&n.k->k), i[-1].k); + if (k.k->p.snapshot != insert->k.p.snapshot && + (front_split || back_split)) { + update = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); + if ((ret = PTR_ERR_OR_ZERO(update))) + goto err; - if (i < trans->updates + trans->nr_updates && - i->btree_id == n.btree_id && - bkey_cmp(n.k->k.p, bkey_start_pos(&i->k->k)) > 0) { - if (bkey_cmp(bkey_start_pos(&n.k->k), - bkey_start_pos(&i->k->k)) > 0) { - struct btree_insert_entry split = *i; - int ret; + bkey_reassemble(update, k); - BUG_ON(trans->nr_updates + 1 >= BTREE_ITER_MAX); + bch2_cut_front(start, update); + bch2_cut_back(insert->k.p, update); + + bch2_trans_iter_init(trans, &update_iter, btree_id, update->k.p, + BTREE_ITER_NOT_EXTENTS| + BTREE_ITER_ALL_SNAPSHOTS| + BTREE_ITER_INTENT); + ret = bch2_btree_iter_traverse(&update_iter) ?: + bch2_trans_update(trans, &update_iter, update, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE| + flags); + bch2_trans_iter_exit(trans, &update_iter); + if (ret) + goto err; + } - split.k = bch2_trans_kmalloc(trans, bkey_bytes(&i->k->k)); - ret = PTR_ERR_OR_ZERO(split.k); - if (ret) - return ret; + if (bkey_cmp(k.k->p, insert->k.p) <= 0) { + update = bch2_trans_kmalloc(trans, sizeof(*update)); + if ((ret = PTR_ERR_OR_ZERO(update))) + goto err; - bkey_copy(split.k, i->k); - bch2_cut_back(bkey_start_pos(&n.k->k), split.k); - - split.iter = bch2_trans_get_iter(trans, split.btree_id, - bkey_start_pos(&split.k->k), - BTREE_ITER_INTENT); - split.iter->flags |= BTREE_ITER_KEEP_UNTIL_COMMIT; - bch2_trans_iter_put(trans, split.iter); - array_insert_item(trans->updates, trans->nr_updates, - i - trans->updates, split); - i++; - } + bkey_init(&update->k); + update->k.p = k.k->p; - /* - * When we have an extent that overwrites the start of another - * update, trimming that extent will mean the iterator's - * position has to change since the iterator position has to - * match the extent's start pos - but we don't want to change - * the iterator pos if some other code is using it, so we may - * need to clone it: - */ - if (btree_iter_live(trans, i->iter)) { - i->iter = bch2_trans_copy_iter(trans, i->iter); - - i->iter->flags |= BTREE_ITER_KEEP_UNTIL_COMMIT; - bch2_trans_iter_put(trans, i->iter); + if (insert->k.p.snapshot != k.k->p.snapshot) { + update->k.p.snapshot = insert->k.p.snapshot; + update->k.type = KEY_TYPE_whiteout; } - bch2_cut_front(n.k->k.p, i->k); - bch2_btree_iter_set_pos(i->iter, n.k->k.p); + bch2_trans_iter_init(trans, &update_iter, btree_id, update->k.p, + BTREE_ITER_NOT_EXTENTS| + BTREE_ITER_INTENT); + ret = bch2_btree_iter_traverse(&update_iter) ?: + bch2_trans_update(trans, &update_iter, 
update, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE| + flags); + bch2_trans_iter_exit(trans, &update_iter); + + if (ret) + goto err; + } + + if (back_split) { + update = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); + if ((ret = PTR_ERR_OR_ZERO(update))) + goto err; + + bkey_reassemble(update, k); + bch2_cut_front(insert->k.p, update); + + bch2_trans_copy_iter(&update_iter, &iter); + update_iter.pos = update->k.p; + ret = bch2_trans_update(trans, &update_iter, update, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE| + flags); + bch2_trans_iter_exit(trans, &update_iter); + + if (ret) + goto err; + goto out; } +next: + k = bch2_btree_iter_next(&iter); + if ((ret = bkey_err(k))) + goto err; + if (!k.k) + goto out; + } + + if (bch2_bkey_maybe_mergable(&insert->k, k.k)) { + ret = check_pos_snapshot_overwritten(trans, btree_id, insert->k.p); + if (ret < 0) + goto err; + if (ret) + goto nomerge2; + + ret = check_pos_snapshot_overwritten(trans, btree_id, k.k->p); + if (ret < 0) + goto err; + if (ret) + goto nomerge2; + + bch2_bkey_merge(c, bkey_i_to_s(insert), k); + } +nomerge2: + ret = 0; +out: + if (!bkey_deleted(&insert->k)) { + /* + * Rewinding iterators is expensive: get a new one and the one + * that points to the start of insert will be cloned from: + */ + bch2_trans_iter_exit(trans, &iter); + bch2_trans_iter_init(trans, &iter, btree_id, insert->k.p, + BTREE_ITER_NOT_EXTENTS| + BTREE_ITER_INTENT); + ret = bch2_btree_iter_traverse(&iter) ?: + bch2_trans_update(trans, &iter, insert, flags); + } +err: + bch2_trans_iter_exit(trans, &iter); + + return ret; +} + +/* + * When deleting, check if we need to emit a whiteout (because we're overwriting + * something in an ancestor snapshot) + */ +static int need_whiteout_for_snapshot(struct btree_trans *trans, + enum btree_id btree_id, struct bpos pos) +{ + struct btree_iter iter; + struct bkey_s_c k; + u32 snapshot = pos.snapshot; + int ret; + + if (!bch2_snapshot_parent(trans->c, pos.snapshot)) + return 0; + + pos.snapshot++; + for_each_btree_key_norestart(trans, iter, btree_id, pos, + BTREE_ITER_ALL_SNAPSHOTS, k, ret) { + if (bkey_cmp(k.k->p, pos)) + break; + + if (bch2_snapshot_is_ancestor(trans->c, snapshot, + k.k->p.snapshot)) { + ret = !bkey_whiteout(k.k); + break; + } + } + bch2_trans_iter_exit(trans, &iter); + + return ret; +} + +int bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter, + struct bkey_i *k, enum btree_update_flags flags) +{ + struct btree_insert_entry *i, n; + + BUG_ON(!iter->path->should_be_locked); + + if (iter->flags & BTREE_ITER_IS_EXTENTS) + return bch2_trans_update_extent(trans, iter, k, flags); + + BUG_ON(trans->nr_updates >= BTREE_ITER_MAX); + BUG_ON(bpos_cmp(k->k.p, iter->path->pos)); + + n = (struct btree_insert_entry) { + .flags = flags, + .bkey_type = __btree_node_type(iter->path->level, iter->btree_id), + .btree_id = iter->btree_id, + .level = iter->path->level, + .cached = iter->flags & BTREE_ITER_CACHED, + .path = iter->path, + .k = k, + .ip_allocated = _RET_IP_, + }; + + __btree_path_get(n.path, true); + +#ifdef CONFIG_BCACHEFS_DEBUG + trans_for_each_update(trans, i) + BUG_ON(i != trans->updates && + btree_insert_entry_cmp(i - 1, i) >= 0); +#endif + + if (bkey_deleted(&n.k->k) && + (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS)) { + int ret = need_whiteout_for_snapshot(trans, n.btree_id, n.k->k.p); + if (unlikely(ret < 0)) + return ret; + + if (ret) + n.k->k.type = KEY_TYPE_whiteout; + } + + /* + * Pending updates are kept sorted: first, find position of new update, + * then delete/trim any updates the new update 
overwrites: + */ + trans_for_each_update(trans, i) + if (btree_insert_entry_cmp(&n, i) <= 0) + break; + + if (i < trans->updates + trans->nr_updates && + !btree_insert_entry_cmp(&n, i)) { + BUG_ON(i->insert_trigger_run || i->overwrite_trigger_run); + + /* + * This is a hack to ensure that inode creates update the btree, + * not the key cache, which helps with cache coherency issues in + * other areas: + */ + if (n.cached && !i->cached) { + i->k = n.k; + i->flags = n.flags; + + __btree_path_get(n.path, false); + } else { + bch2_path_put(trans, i->path, true); + *i = n; + } + } else array_insert_item(trans->updates, trans->nr_updates, i - trans->updates, n); - } return 0; } @@ -1176,14 +1399,14 @@ void bch2_trans_commit_hook(struct btree_trans *trans, int __bch2_btree_insert(struct btree_trans *trans, enum btree_id id, struct bkey_i *k) { - struct btree_iter *iter; + struct btree_iter iter; int ret; - iter = bch2_trans_get_iter(trans, id, bkey_start_pos(&k->k), - BTREE_ITER_INTENT); - - ret = bch2_trans_update(trans, iter, k, 0); - bch2_trans_iter_put(trans, iter); + bch2_trans_iter_init(trans, &iter, id, bkey_start_pos(&k->k), + BTREE_ITER_INTENT); + ret = bch2_btree_iter_traverse(&iter) ?: + bch2_trans_update(trans, &iter, k, 0); + bch2_trans_iter_exit(trans, &iter); return ret; } @@ -1204,35 +1427,36 @@ int bch2_btree_insert(struct bch_fs *c, enum btree_id id, } int bch2_btree_delete_at(struct btree_trans *trans, - struct btree_iter *iter, unsigned flags) + struct btree_iter *iter, unsigned update_flags) { - struct bkey_i k; + struct bkey_i *k; - bkey_init(&k.k); - k.k.p = iter->pos; + k = bch2_trans_kmalloc(trans, sizeof(*k)); + if (IS_ERR(k)) + return PTR_ERR(k); - return bch2_trans_update(trans, iter, &k, 0) ?: - bch2_trans_commit(trans, NULL, NULL, - BTREE_INSERT_NOFAIL|flags); + bkey_init(&k->k); + k->k.p = iter->pos; + return bch2_trans_update(trans, iter, k, update_flags); } int bch2_btree_delete_range_trans(struct btree_trans *trans, enum btree_id id, struct bpos start, struct bpos end, + unsigned iter_flags, u64 *journal_seq) { - struct btree_iter *iter; + struct btree_iter iter; struct bkey_s_c k; int ret = 0; - iter = bch2_trans_get_iter(trans, id, start, BTREE_ITER_INTENT); + bch2_trans_iter_init(trans, &iter, id, start, BTREE_ITER_INTENT|iter_flags); retry: - while ((k = bch2_btree_iter_peek(iter)).k && + while ((bch2_trans_begin(trans), + (k = bch2_btree_iter_peek(&iter)).k) && !(ret = bkey_err(k)) && - bkey_cmp(iter->pos, end) < 0) { + bkey_cmp(iter.pos, end) < 0) { struct bkey_i delete; - bch2_trans_begin(trans); - bkey_init(&delete.k); /* @@ -1249,9 +1473,9 @@ retry: * (bch2_btree_iter_peek() does guarantee that iter.pos >= * bkey_start_pos(k.k)). 
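To make the bch2_trans_update_extent() rework above concrete: with snapshots, partially overwriting an existing extent k can now require up to three compensating updates — a front split, a back split, and, when k is only visible from an ancestor snapshot, a whiteout in the new key's snapshot. Below is a minimal sketch of that case analysis reduced to plain integer ranges; all types and names are hypothetical stand-ins for illustration, not bcachefs API:

    /* cc sketch.c && ./sketch */
    #include <stdbool.h>
    #include <stdio.h>

    struct range { long start, end; unsigned snapshot; };

    static void overwrite(struct range k, struct range insert)
    {
        /* mirrors: bkey_cmp(bkey_start_pos(k.k), start) < 0 */
        bool front_split = k.start < insert.start;
        /* mirrors: bkey_cmp(insert->k.p, k.k->p) < 0 */
        bool back_split = insert.end < k.end;

        if (front_split) /* keep [k.start, insert.start) in k's snapshot */
            printf("front split %ld..%ld snap %u\n",
                   k.start, insert.start, k.snapshot);

        /*
         * k's endpoint is covered by insert: it must be overwritten, and
         * when k lives in a different (ancestor) snapshot that takes a
         * whiteout, so the old extent stays visible from the ancestor:
         */
        if (k.end <= insert.end && k.snapshot != insert.snapshot)
            printf("whiteout at %ld snap %u\n", k.end, insert.snapshot);

        if (back_split) /* keep (insert.end, k.end] in k's snapshot */
            printf("back split %ld..%ld snap %u\n",
                   insert.end, k.end, k.snapshot);
    }

    int main(void)
    {
        struct range k      = { 0, 128, 1 };  /* existing extent, snapshot 1 */
        struct range insert = { 32, 160, 2 }; /* new extent, child snapshot 2 */
        overwrite(k, insert); /* front split 0..32 snap 1; whiteout at 128 snap 2 */
        return 0;
    }

The real function additionally re-copies k's middle into the old snapshot when a split and a snapshot mismatch coincide, and re-checks mergability at both ends; those cases are omitted here.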
*/ - delete.k.p = iter->pos; + delete.k.p = iter.pos; - if (btree_node_type_is_extents(iter->btree_id)) { + if (btree_node_type_is_extents(id)) { unsigned max_sectors = KEY_SIZE_MAX & (~0 << trans->c->block_bits); @@ -1259,18 +1483,16 @@ retry: bch2_key_resize(&delete.k, max_sectors); bch2_cut_back(end, &delete); - ret = bch2_extent_trim_atomic(&delete, iter); + ret = bch2_extent_trim_atomic(trans, &iter, &delete); if (ret) break; } - ret = bch2_trans_update(trans, iter, &delete, 0) ?: + ret = bch2_trans_update(trans, &iter, &delete, 0) ?: bch2_trans_commit(trans, NULL, journal_seq, BTREE_INSERT_NOFAIL); if (ret) break; - - bch2_trans_cond_resched(trans); } if (ret == -EINTR) { @@ -1278,7 +1500,7 @@ retry: goto retry; } - bch2_trans_iter_free(trans, iter); + bch2_trans_iter_exit(trans, &iter); return ret; } @@ -1292,5 +1514,5 @@ int bch2_btree_delete_range(struct bch_fs *c, enum btree_id id, u64 *journal_seq) { return bch2_trans_do(c, NULL, journal_seq, 0, - bch2_btree_delete_range_trans(&trans, id, start, end, journal_seq)); + bch2_btree_delete_range_trans(&trans, id, start, end, 0, journal_seq)); } diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 76d15a5dc62f..6fc93b56bcb2 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -13,9 +13,12 @@ #include "buckets.h" #include "ec.h" #include "error.h" +#include "inode.h" #include "movinggc.h" +#include "recovery.h" #include "reflink.h" #include "replicas.h" +#include "subvolume.h" #include <linux/preempt.h> #include <trace/events/bcachefs.h> @@ -114,6 +117,8 @@ static inline struct bch_dev_usage *dev_usage_ptr(struct bch_dev *ca, unsigned journal_seq, bool gc) { + BUG_ON(!gc && !journal_seq); + return this_cpu_ptr(gc ? ca->usage_gc : ca->usage[journal_seq & JOURNAL_BUF_MASK]); @@ -139,6 +144,8 @@ static inline struct bch_fs_usage *fs_usage_ptr(struct bch_fs *c, unsigned journal_seq, bool gc) { + BUG_ON(!gc && !journal_seq); + return this_cpu_ptr(gc ? 
c->usage_gc : c->usage[journal_seq & JOURNAL_BUF_MASK]); @@ -351,17 +358,23 @@ static inline void account_bucket(struct bch_fs_usage *fs_usage, } static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca, - struct bch_fs_usage *fs_usage, struct bucket_mark old, struct bucket_mark new, u64 journal_seq, bool gc) { + struct bch_fs_usage *fs_usage; struct bch_dev_usage *u; + /* + * Hack for bch2_fs_initialize path, where we're first marking sb and + * journal non-transactionally: + */ + if (!journal_seq && !test_bit(BCH_FS_INITIALIZED, &c->flags)) + journal_seq = 1; + percpu_rwsem_assert_held(&c->mark_lock); preempt_disable(); - if (!fs_usage) - fs_usage = fs_usage_ptr(c, journal_seq, gc); + fs_usage = fs_usage_ptr(c, journal_seq, gc); u = dev_usage_ptr(ca, journal_seq, gc); if (bucket_type(old)) @@ -390,30 +403,48 @@ static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca, bch2_wake_allocator(ca); } +static inline int __update_replicas(struct bch_fs *c, + struct bch_fs_usage *fs_usage, + struct bch_replicas_entry *r, + s64 sectors) +{ + int idx = bch2_replicas_entry_idx(c, r); + + if (idx < 0) + return -1; + + fs_usage_data_type_to_base(fs_usage, r->data_type, sectors); + fs_usage->replicas[idx] += sectors; + return 0; +} + static inline int update_replicas(struct bch_fs *c, - struct bch_fs_usage *fs_usage, - struct bch_replicas_entry *r, - s64 sectors) + struct bch_replicas_entry *r, s64 sectors, + unsigned journal_seq, bool gc) { + struct bch_fs_usage __percpu *fs_usage; int idx = bch2_replicas_entry_idx(c, r); if (idx < 0) return -1; + preempt_disable(); + fs_usage = fs_usage_ptr(c, journal_seq, gc); fs_usage_data_type_to_base(fs_usage, r->data_type, sectors); fs_usage->replicas[idx] += sectors; + preempt_enable(); return 0; } static inline int update_cached_sectors(struct bch_fs *c, - struct bch_fs_usage *fs_usage, - unsigned dev, s64 sectors) + unsigned dev, s64 sectors, + unsigned journal_seq, bool gc) { struct bch_replicas_padded r; bch2_replicas_entry_cached(&r.e, dev); - return update_replicas(c, fs_usage, &r.e, sectors); + return update_replicas(c, &r.e, sectors, journal_seq, gc); } static struct replicas_delta_list * @@ -505,20 +536,20 @@ void bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca, BUG_ON(owned_by_allocator == old.owned_by_allocator); } -static int bch2_mark_alloc(struct bch_fs *c, +static int bch2_mark_alloc(struct btree_trans *trans, struct bkey_s_c old, struct bkey_s_c new, - struct bch_fs_usage *fs_usage, - u64 journal_seq, unsigned flags) + unsigned flags) { bool gc = flags & BTREE_TRIGGER_GC; + u64 journal_seq = trans->journal_res.seq; + struct bch_fs *c = trans->c; struct bkey_alloc_unpacked u; struct bch_dev *ca; struct bucket *g; struct bucket_mark old_m, m; /* We don't do anything for deletions - do we?: */ - if (new.k->type != KEY_TYPE_alloc && - new.k->type != KEY_TYPE_alloc_v2) + if (!bkey_is_alloc(new.k)) return 0; /* @@ -528,6 +559,15 @@ static int bch2_mark_alloc(struct bch_fs *c, !(flags & BTREE_TRIGGER_BUCKET_INVALIDATE)) return 0; + if (flags & BTREE_TRIGGER_INSERT) { + struct bch_alloc_v3 *v = (struct bch_alloc_v3 *) new.v; + + BUG_ON(!journal_seq); + BUG_ON(new.k->type != KEY_TYPE_alloc_v3); + + v->journal_seq = cpu_to_le64(journal_seq); + } + ca = bch_dev_bkey_exists(c, new.k->p.inode); if (new.k->p.offset >= ca->mi.nbuckets) @@ -549,7 +589,7 @@ static int bch2_mark_alloc(struct bch_fs *c, } })); - bch2_dev_usage_update(c, ca, fs_usage, old_m, m, journal_seq, gc); + bch2_dev_usage_update(c, ca, old_m, m, 
journal_seq, gc); g->io_time[READ] = u.read_time; g->io_time[WRITE] = u.write_time; @@ -565,8 +605,8 @@ static int bch2_mark_alloc(struct bch_fs *c, if ((flags & BTREE_TRIGGER_BUCKET_INVALIDATE) && old_m.cached_sectors) { - if (update_cached_sectors(c, fs_usage, ca->dev_idx, - -old_m.cached_sectors)) { + if (update_cached_sectors(c, ca->dev_idx, -old_m.cached_sectors, + journal_seq, gc)) { bch2_fs_fatal_error(c, "bch2_mark_alloc(): no replicas entry while updating cached sectors"); return -1; } @@ -617,8 +657,7 @@ static int __bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca, old.dirty_sectors, sectors); if (c) - bch2_dev_usage_update(c, ca, fs_usage_ptr(c, 0, gc), - old, new, 0, gc); + bch2_dev_usage_update(c, ca, old, new, 0, gc); return 0; } @@ -637,57 +676,27 @@ void bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca, if (b >= ca->mi.nbuckets) return; - preempt_disable(); - if (likely(c)) { do_mark_fn(__bch2_mark_metadata_bucket, c, pos, flags, ca, b, type, sectors); } else { __bch2_mark_metadata_bucket(c, ca, b, type, sectors, 0); } - - preempt_enable(); -} - -static s64 disk_sectors_scaled(unsigned n, unsigned d, unsigned sectors) -{ - return DIV_ROUND_UP(sectors * n, d); } -static s64 __ptr_disk_sectors_delta(unsigned old_size, - unsigned offset, s64 delta, - unsigned flags, - unsigned n, unsigned d) +static s64 ptr_disk_sectors(s64 sectors, struct extent_ptr_decoded p) { - BUG_ON(!n || !d); + EBUG_ON(sectors < 0); - if (flags & BTREE_TRIGGER_OVERWRITE_SPLIT) { - BUG_ON(offset + -delta > old_size); - - return -disk_sectors_scaled(n, d, old_size) + - disk_sectors_scaled(n, d, offset) + - disk_sectors_scaled(n, d, old_size - offset + delta); - } else if (flags & BTREE_TRIGGER_OVERWRITE) { - BUG_ON(offset + -delta > old_size); - - return -disk_sectors_scaled(n, d, old_size) + - disk_sectors_scaled(n, d, old_size + delta); - } else { - return disk_sectors_scaled(n, d, delta); - } + return p.crc.compression_type && + p.crc.compression_type != BCH_COMPRESSION_TYPE_incompressible + ? 
DIV_ROUND_UP_ULL(sectors * p.crc.compressed_size, + p.crc.uncompressed_size) + : sectors; } -static s64 ptr_disk_sectors_delta(struct extent_ptr_decoded p, - unsigned offset, s64 delta, - unsigned flags) -{ - return __ptr_disk_sectors_delta(p.crc.live_size, - offset, delta, flags, - p.crc.compressed_size, - p.crc.uncompressed_size); -} - -static int check_bucket_ref(struct bch_fs *c, struct bkey_s_c k, +static int check_bucket_ref(struct bch_fs *c, + struct bkey_s_c k, const struct bch_extent_ptr *ptr, s64 sectors, enum bch_data_type ptr_data_type, u8 bucket_gen, u8 bucket_data_type, @@ -761,11 +770,12 @@ static int check_bucket_ref(struct bch_fs *c, struct bkey_s_c k, return 0; } -static int mark_stripe_bucket(struct bch_fs *c, struct bkey_s_c k, - unsigned ptr_idx, - struct bch_fs_usage *fs_usage, - u64 journal_seq, unsigned flags) +static int mark_stripe_bucket(struct btree_trans *trans, + struct bkey_s_c k, + unsigned ptr_idx, + u64 journal_seq, unsigned flags) { + struct bch_fs *c = trans->c; const struct bch_stripe *s = bkey_s_c_to_stripe(k).v; unsigned nr_data = s->nr_blocks - s->nr_redundant; bool parity = ptr_idx >= nr_data; @@ -805,11 +815,12 @@ static int mark_stripe_bucket(struct bch_fs *c, struct bkey_s_c k, g->stripe = k.k->p.offset; g->stripe_redundancy = s->nr_redundant; - bch2_dev_usage_update(c, ca, fs_usage, old, new, journal_seq, gc); + bch2_dev_usage_update(c, ca, old, new, journal_seq, gc); return 0; } -static int __mark_pointer(struct bch_fs *c, struct bkey_s_c k, +static int __mark_pointer(struct btree_trans *trans, + struct bkey_s_c k, const struct bch_extent_ptr *ptr, s64 sectors, enum bch_data_type ptr_data_type, u8 bucket_gen, u8 *bucket_data_type, @@ -818,7 +829,7 @@ static int __mark_pointer(struct bch_fs *c, struct bkey_s_c k, u16 *dst_sectors = !ptr->cached ? 
dirty_sectors : cached_sectors; - int ret = check_bucket_ref(c, k, ptr, sectors, ptr_data_type, + int ret = check_bucket_ref(trans->c, k, ptr, sectors, ptr_data_type, bucket_gen, *bucket_data_type, *dirty_sectors, *cached_sectors); @@ -831,13 +842,15 @@ static int __mark_pointer(struct bch_fs *c, struct bkey_s_c k, return 0; } -static int bch2_mark_pointer(struct bch_fs *c, struct bkey_s_c k, +static int bch2_mark_pointer(struct btree_trans *trans, + struct bkey_s_c k, struct extent_ptr_decoded p, s64 sectors, enum bch_data_type data_type, - struct bch_fs_usage *fs_usage, - u64 journal_seq, unsigned flags) + unsigned flags) { bool gc = flags & BTREE_TRIGGER_GC; + u64 journal_seq = trans->journal_res.seq; + struct bch_fs *c = trans->c; struct bucket_mark old, new; struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev); struct bucket *g = PTR_BUCKET(ca, &p.ptr, gc); @@ -850,7 +863,8 @@ static int bch2_mark_pointer(struct bch_fs *c, struct bkey_s_c k, new.v.counter = old.v.counter = v; bucket_data_type = new.data_type; - ret = __mark_pointer(c, k, &p.ptr, sectors, data_type, new.gen, + ret = __mark_pointer(trans, k, &p.ptr, sectors, + data_type, new.gen, &bucket_data_type, &new.dirty_sectors, &new.cached_sectors); @@ -872,20 +886,21 @@ static int bch2_mark_pointer(struct bch_fs *c, struct bkey_s_c k, old.v.counter, new.v.counter)) != old.v.counter); - bch2_dev_usage_update(c, ca, fs_usage, old, new, journal_seq, gc); + bch2_dev_usage_update(c, ca, old, new, journal_seq, gc); BUG_ON(!gc && bucket_became_unavailable(old, new)); return 0; } -static int bch2_mark_stripe_ptr(struct bch_fs *c, +static int bch2_mark_stripe_ptr(struct btree_trans *trans, struct bch_extent_stripe_ptr p, enum bch_data_type data_type, - struct bch_fs_usage *fs_usage, - s64 sectors, unsigned flags) + s64 sectors, + unsigned flags) { bool gc = flags & BTREE_TRIGGER_GC; + struct bch_fs *c = trans->c; struct bch_replicas_padded r; struct stripe *m; unsigned i, blocks_nonempty = 0; @@ -918,23 +933,29 @@ static int bch2_mark_stripe_ptr(struct bch_fs *c, spin_unlock(&c->ec_stripes_heap_lock); r.e.data_type = data_type; - update_replicas(c, fs_usage, &r.e, sectors); + update_replicas(c, &r.e, sectors, trans->journal_res.seq, gc); return 0; } -static int bch2_mark_extent(struct bch_fs *c, +static int bch2_mark_extent(struct btree_trans *trans, struct bkey_s_c old, struct bkey_s_c new, - unsigned offset, s64 sectors, - enum bch_data_type data_type, - struct bch_fs_usage *fs_usage, - unsigned journal_seq, unsigned flags) + unsigned flags) { - struct bkey_s_c k = flags & BTREE_TRIGGER_INSERT ? new : old; + bool gc = flags & BTREE_TRIGGER_GC; + u64 journal_seq = trans->journal_res.seq; + struct bch_fs *c = trans->c; + struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE ? old: new; struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); const union bch_extent_entry *entry; struct extent_ptr_decoded p; struct bch_replicas_padded r; + enum bch_data_type data_type = bkey_is_btree_ptr(k.k) + ? BCH_DATA_btree + : BCH_DATA_user; + s64 sectors = bkey_is_btree_ptr(k.k) + ? c->opts.btree_node_size + : k.k->size; s64 dirty_sectors = 0; bool stale; int ret; @@ -943,15 +964,14 @@ static int bch2_mark_extent(struct bch_fs *c, r.e.nr_devs = 0; r.e.nr_required = 1; - BUG_ON(!sectors); - bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { - s64 disk_sectors = data_type == BCH_DATA_btree - ? 
sectors - : ptr_disk_sectors_delta(p, offset, sectors, flags); + s64 disk_sectors = ptr_disk_sectors(sectors, p); + + if (flags & BTREE_TRIGGER_OVERWRITE) + disk_sectors = -disk_sectors; - ret = bch2_mark_pointer(c, k, p, disk_sectors, data_type, - fs_usage, journal_seq, flags); + ret = bch2_mark_pointer(trans, k, p, disk_sectors, + data_type, flags); if (ret < 0) return ret; @@ -959,8 +979,8 @@ static int bch2_mark_extent(struct bch_fs *c, if (p.ptr.cached) { if (!stale) - if (update_cached_sectors(c, fs_usage, p.ptr.dev, - disk_sectors)) { + if (update_cached_sectors(c, p.ptr.dev, disk_sectors, + journal_seq, gc)) { bch2_fs_fatal_error(c, "bch2_mark_extent(): no replicas entry while updating cached sectors"); return -1; @@ -969,8 +989,8 @@ static int bch2_mark_extent(struct bch_fs *c, dirty_sectors += disk_sectors; r.e.devs[r.e.nr_devs++] = p.ptr.dev; } else { - ret = bch2_mark_stripe_ptr(c, p.ec, data_type, - fs_usage, disk_sectors, flags); + ret = bch2_mark_stripe_ptr(trans, p.ec, data_type, + disk_sectors, flags); if (ret) return ret; @@ -984,7 +1004,7 @@ static int bch2_mark_extent(struct bch_fs *c, } if (r.e.nr_devs) { - if (update_replicas(c, fs_usage, &r.e, dirty_sectors)) { + if (update_replicas(c, &r.e, dirty_sectors, journal_seq, gc)) { char buf[200]; bch2_bkey_val_to_text(&PBUF(buf), c, k); @@ -996,12 +1016,13 @@ static int bch2_mark_extent(struct bch_fs *c, return 0; } -static int bch2_mark_stripe(struct bch_fs *c, +static int bch2_mark_stripe(struct btree_trans *trans, struct bkey_s_c old, struct bkey_s_c new, - struct bch_fs_usage *fs_usage, - u64 journal_seq, unsigned flags) + unsigned flags) { bool gc = flags & BTREE_TRIGGER_GC; + u64 journal_seq = trans->journal_res.seq; + struct bch_fs *c = trans->c; size_t idx = new.k->p.offset; const struct bch_stripe *old_s = old.k->type == KEY_TYPE_stripe ? 
bkey_s_c_to_stripe(old).v : NULL; @@ -1014,8 +1035,13 @@ static int bch2_mark_stripe(struct bch_fs *c, BUG_ON(gc && old_s); if (!m || (old_s && !m->alive)) { - bch_err_ratelimited(c, "error marking nonexistent stripe %zu", - idx); + char buf1[200], buf2[200]; + + bch2_bkey_val_to_text(&PBUF(buf1), c, old); + bch2_bkey_val_to_text(&PBUF(buf2), c, new); + bch_err_ratelimited(c, "error marking nonexistent stripe %zu while marking\n" + "old %s\n" + "new %s", idx, buf1, buf2); bch2_inconsistent_error(c); return -1; } @@ -1060,14 +1086,14 @@ static int bch2_mark_stripe(struct bch_fs *c, m->blocks_nonempty = 0; for (i = 0; i < new_s->nr_blocks; i++) { - ret = mark_stripe_bucket(c, new, i, fs_usage, - journal_seq, flags); + ret = mark_stripe_bucket(trans, new, i, journal_seq, flags); if (ret) return ret; } - if (update_replicas(c, fs_usage, &m->r.e, - ((s64) m->sectors * m->nr_redundant))) { + if (update_replicas(c, &m->r.e, + ((s64) m->sectors * m->nr_redundant), + journal_seq, gc)) { char buf[200]; bch2_bkey_val_to_text(&PBUF(buf), c, new); @@ -1079,98 +1105,123 @@ static int bch2_mark_stripe(struct bch_fs *c, return 0; } -static int __reflink_p_frag_references(struct bkey_s_c_reflink_p p, - u64 p_start, u64 p_end, - u64 v_start, u64 v_end) +static int bch2_mark_inode(struct btree_trans *trans, + struct bkey_s_c old, struct bkey_s_c new, + unsigned flags) { - if (p_start == p_end) - return false; + struct bch_fs *c = trans->c; + struct bch_fs_usage __percpu *fs_usage; + u64 journal_seq = trans->journal_res.seq; + + if (flags & BTREE_TRIGGER_INSERT) { + struct bch_inode_v2 *v = (struct bch_inode_v2 *) new.v; - p_start += le64_to_cpu(p.v->idx); - p_end += le64_to_cpu(p.v->idx); + BUG_ON(!journal_seq); + BUG_ON(new.k->type != KEY_TYPE_inode_v2); - if (p_end <= v_start) - return false; - if (p_start >= v_end) - return false; - return true; + v->bi_journal_seq = cpu_to_le64(journal_seq); + } + + if (flags & BTREE_TRIGGER_GC) { + preempt_disable(); + fs_usage = fs_usage_ptr(c, journal_seq, flags & BTREE_TRIGGER_GC); + fs_usage->nr_inodes += bkey_is_inode(new.k); + fs_usage->nr_inodes -= bkey_is_inode(old.k); + preempt_enable(); + } + return 0; } -static int reflink_p_frag_references(struct bkey_s_c_reflink_p p, - u64 start, u64 end, - struct bkey_s_c k) +static int bch2_mark_reservation(struct btree_trans *trans, + struct bkey_s_c old, struct bkey_s_c new, + unsigned flags) { - return __reflink_p_frag_references(p, start, end, - bkey_start_offset(k.k), - k.k->p.offset); + struct bch_fs *c = trans->c; + struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE ? old: new; + struct bch_fs_usage __percpu *fs_usage; + unsigned replicas = bkey_s_c_to_reservation(k).v->nr_replicas; + s64 sectors = (s64) k.k->size; + + if (flags & BTREE_TRIGGER_OVERWRITE) + sectors = -sectors; + sectors *= replicas; + + preempt_disable(); + fs_usage = fs_usage_ptr(c, trans->journal_res.seq, flags & BTREE_TRIGGER_GC); + replicas = clamp_t(unsigned, replicas, 1, + ARRAY_SIZE(fs_usage->persistent_reserved)); + + fs_usage->reserved += sectors; + fs_usage->persistent_reserved[replicas - 1] += sectors; + preempt_enable(); + + return 0; } -static int __bch2_mark_reflink_p(struct bch_fs *c, - struct bkey_s_c_reflink_p p, - u64 idx, unsigned sectors, - unsigned front_frag, - unsigned back_frag, - unsigned flags, - size_t *r_idx) +static s64 __bch2_mark_reflink_p(struct bch_fs *c, struct bkey_s_c_reflink_p p, + u64 *idx, unsigned flags, size_t r_idx) { struct reflink_gc *r; int add = !(flags & BTREE_TRIGGER_OVERWRITE) ? 
1 : -1; - int frags_referenced; - - while (1) { - if (*r_idx >= c->reflink_gc_nr) - goto not_found; - r = genradix_ptr(&c->reflink_gc_table, *r_idx); - BUG_ON(!r); + s64 ret = 0; - if (r->offset > idx) - break; - (*r_idx)++; - } + if (r_idx >= c->reflink_gc_nr) + goto not_found; - frags_referenced = - __reflink_p_frag_references(p, 0, front_frag, - r->offset - r->size, r->offset) + - __reflink_p_frag_references(p, back_frag, p.k->size, - r->offset - r->size, r->offset); - - if (frags_referenced == 2) { - BUG_ON(!(flags & BTREE_TRIGGER_OVERWRITE_SPLIT)); - add = -add; - } else if (frags_referenced == 1) { - BUG_ON(!(flags & BTREE_TRIGGER_OVERWRITE)); - add = 0; - } + r = genradix_ptr(&c->reflink_gc_table, r_idx); + if (*idx < r->offset - r->size) + goto not_found; BUG_ON((s64) r->refcount + add < 0); r->refcount += add; - return min_t(u64, sectors, r->offset - idx); + *idx = r->offset; + return 0; not_found: - bch2_fs_inconsistent(c, - "%llu:%llu len %u points to nonexistent indirect extent %llu", - p.k->p.inode, p.k->p.offset, p.k->size, idx); - bch2_inconsistent_error(c); - return -EIO; + *idx = U64_MAX; + ret = -EIO; + + /* + * XXX: we're replacing the entire reflink pointer with an error + * key, we should just be replacing the part that was missing: + */ + if (fsck_err(c, "%llu:%llu len %u points to nonexistent indirect extent %llu", + p.k->p.inode, p.k->p.offset, p.k->size, *idx)) { + struct bkey_i_error *new; + + new = kmalloc(sizeof(*new), GFP_KERNEL); + if (!new) { + bch_err(c, "%s: error allocating new key", __func__); + return -ENOMEM; + } + + bkey_init(&new->k); + new->k.type = KEY_TYPE_error; + new->k.p = p.k->p; + new->k.size = p.k->size; + ret = bch2_journal_key_insert(c, BTREE_ID_extents, 0, &new->k_i); + } +fsck_err: + return ret; } -static int bch2_mark_reflink_p(struct bch_fs *c, - struct bkey_s_c_reflink_p p, unsigned offset, - s64 sectors, unsigned flags) +static int bch2_mark_reflink_p(struct btree_trans *trans, + struct bkey_s_c old, struct bkey_s_c new, + unsigned flags) { - u64 idx = le64_to_cpu(p.v->idx) + offset; + struct bch_fs *c = trans->c; + struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE ? old: new; + struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); struct reflink_gc *ref; size_t l, r, m; - unsigned front_frag, back_frag; - s64 ret = 0; - - if (sectors < 0) - sectors = -sectors; - - BUG_ON(offset + sectors > p.k->size); + u64 idx = le64_to_cpu(p.v->idx); + u64 end = le64_to_cpu(p.v->idx) + p.k->size; + int ret = 0; - front_frag = offset; - back_frag = offset + sectors; + if (c->sb.version >= bcachefs_metadata_version_reflink_p_fix) { + idx -= le32_to_cpu(p.v->front_pad); + end += le32_to_cpu(p.v->back_pad); + } l = 0; r = c->reflink_gc_nr; @@ -1184,203 +1235,89 @@ static int bch2_mark_reflink_p(struct bch_fs *c, r = m; } - while (sectors) { - ret = __bch2_mark_reflink_p(c, p, idx, sectors, - front_frag, back_frag, flags, &l); - if (ret < 0) - return ret; + while (idx < end && !ret) + ret = __bch2_mark_reflink_p(c, p, &idx, flags, l++); - idx += ret; - sectors -= ret; - } - - return 0; + return ret; } -static int bch2_mark_key_locked(struct bch_fs *c, +static int bch2_mark_key_locked(struct btree_trans *trans, struct bkey_s_c old, struct bkey_s_c new, - unsigned offset, s64 sectors, - struct bch_fs_usage *fs_usage, - u64 journal_seq, unsigned flags) + unsigned flags) { - struct bkey_s_c k = flags & BTREE_TRIGGER_INSERT ? 
new : old; - int ret = 0; - - BUG_ON(!(flags & (BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE))); - - preempt_disable(); - - if (!fs_usage || (flags & BTREE_TRIGGER_GC)) - fs_usage = fs_usage_ptr(c, journal_seq, - flags & BTREE_TRIGGER_GC); + struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE ? old: new; switch (k.k->type) { case KEY_TYPE_alloc: case KEY_TYPE_alloc_v2: - ret = bch2_mark_alloc(c, old, new, fs_usage, journal_seq, flags); - break; + case KEY_TYPE_alloc_v3: + return bch2_mark_alloc(trans, old, new, flags); case KEY_TYPE_btree_ptr: case KEY_TYPE_btree_ptr_v2: - sectors = !(flags & BTREE_TRIGGER_OVERWRITE) - ? c->opts.btree_node_size - : -c->opts.btree_node_size; - - ret = bch2_mark_extent(c, old, new, offset, sectors, - BCH_DATA_btree, fs_usage, journal_seq, flags); - break; case KEY_TYPE_extent: case KEY_TYPE_reflink_v: - ret = bch2_mark_extent(c, old, new, offset, sectors, - BCH_DATA_user, fs_usage, journal_seq, flags); - break; + return bch2_mark_extent(trans, old, new, flags); case KEY_TYPE_stripe: - ret = bch2_mark_stripe(c, old, new, fs_usage, journal_seq, flags); - break; + return bch2_mark_stripe(trans, old, new, flags); case KEY_TYPE_inode: - fs_usage->nr_inodes += new.k->type == KEY_TYPE_inode; - fs_usage->nr_inodes -= old.k->type == KEY_TYPE_inode; - break; - case KEY_TYPE_reservation: { - unsigned replicas = bkey_s_c_to_reservation(k).v->nr_replicas; - - sectors *= replicas; - replicas = clamp_t(unsigned, replicas, 1, - ARRAY_SIZE(fs_usage->persistent_reserved)); - - fs_usage->reserved += sectors; - fs_usage->persistent_reserved[replicas - 1] += sectors; - break; - } + case KEY_TYPE_inode_v2: + return bch2_mark_inode(trans, old, new, flags); + case KEY_TYPE_reservation: + return bch2_mark_reservation(trans, old, new, flags); case KEY_TYPE_reflink_p: - ret = bch2_mark_reflink_p(c, bkey_s_c_to_reflink_p(k), - offset, sectors, flags); - break; + return bch2_mark_reflink_p(trans, old, new, flags); + case KEY_TYPE_snapshot: + return bch2_mark_snapshot(trans, old, new, flags); + default: + return 0; } - - preempt_enable(); - - return ret; } -int bch2_mark_key(struct bch_fs *c, struct bkey_s_c new, - unsigned offset, s64 sectors, - struct bch_fs_usage *fs_usage, - u64 journal_seq, unsigned flags) +int bch2_mark_key(struct btree_trans *trans, struct bkey_s_c new, unsigned flags) { - struct bkey deleted; + struct bch_fs *c = trans->c; + struct bkey deleted = KEY(0, 0, 0); struct bkey_s_c old = (struct bkey_s_c) { &deleted, NULL }; int ret; - bkey_init(&deleted); + deleted.p = new.k->p; percpu_down_read(&c->mark_lock); - ret = bch2_mark_key_locked(c, old, new, offset, sectors, - fs_usage, journal_seq, - BTREE_TRIGGER_INSERT|flags); + ret = bch2_mark_key_locked(trans, old, new, flags); percpu_up_read(&c->mark_lock); return ret; } -int bch2_mark_update(struct btree_trans *trans, - struct btree_iter *iter, - struct bkey_i *new, - struct bch_fs_usage *fs_usage, - unsigned flags) +int bch2_mark_update(struct btree_trans *trans, struct btree_path *path, + struct bkey_i *new, unsigned flags) { - struct bch_fs *c = trans->c; + struct bkey _deleted = KEY(0, 0, 0); + struct bkey_s_c deleted = (struct bkey_s_c) { &_deleted, NULL }; struct bkey_s_c old; struct bkey unpacked; - int ret = 0; + int ret; + + _deleted.p = path->pos; if (unlikely(flags & BTREE_TRIGGER_NORUN)) return 0; - if (!btree_node_type_needs_gc(iter->btree_id)) + if (!btree_node_type_needs_gc(path->btree_id)) return 0; - bkey_init(&unpacked); - old = (struct bkey_s_c) { &unpacked, NULL }; + old = 
bch2_btree_path_peek_slot(path, &unpacked); - if (!btree_node_type_is_extents(iter->btree_id)) { - /* iterators should be uptodate, shouldn't get errors here: */ - if (btree_iter_type(iter) != BTREE_ITER_CACHED) { - old = bch2_btree_iter_peek_slot(iter); - BUG_ON(bkey_err(old)); - } else { - struct bkey_cached *ck = (void *) iter->l[0].b; - - if (ck->valid) - old = bkey_i_to_s_c(ck->k); - } - - if (old.k->type == new->k.type) { - bch2_mark_key_locked(c, old, bkey_i_to_s_c(new), 0, 0, - fs_usage, trans->journal_res.seq, + if (old.k->type == new->k.type && + ((1U << old.k->type) & BTREE_TRIGGER_WANTS_OLD_AND_NEW)) { + ret = bch2_mark_key_locked(trans, old, bkey_i_to_s_c(new), BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE|flags); - - } else { - bch2_mark_key_locked(c, old, bkey_i_to_s_c(new), 0, 0, - fs_usage, trans->journal_res.seq, - BTREE_TRIGGER_INSERT|flags); - bch2_mark_key_locked(c, old, bkey_i_to_s_c(new), 0, 0, - fs_usage, trans->journal_res.seq, - BTREE_TRIGGER_OVERWRITE|flags); - } } else { - struct btree_iter *copy; - - BUG_ON(btree_iter_type(iter) == BTREE_ITER_CACHED); - bch2_mark_key_locked(c, old, bkey_i_to_s_c(new), - 0, new->k.size, - fs_usage, trans->journal_res.seq, - BTREE_TRIGGER_INSERT|flags); - - copy = bch2_trans_copy_iter(trans, iter); - - for_each_btree_key_continue(copy, 0, old, ret) { - unsigned offset = 0; - s64 sectors = -((s64) old.k->size); - - flags |= BTREE_TRIGGER_OVERWRITE; - - if (bkey_cmp(new->k.p, bkey_start_pos(old.k)) <= 0) - break; - - switch (bch2_extent_overlap(&new->k, old.k)) { - case BCH_EXTENT_OVERLAP_ALL: - offset = 0; - sectors = -((s64) old.k->size); - break; - case BCH_EXTENT_OVERLAP_BACK: - offset = bkey_start_offset(&new->k) - - bkey_start_offset(old.k); - sectors = bkey_start_offset(&new->k) - - old.k->p.offset; - break; - case BCH_EXTENT_OVERLAP_FRONT: - offset = 0; - sectors = bkey_start_offset(old.k) - - new->k.p.offset; - break; - case BCH_EXTENT_OVERLAP_MIDDLE: - offset = bkey_start_offset(&new->k) - - bkey_start_offset(old.k); - sectors = -((s64) new->k.size); - flags |= BTREE_TRIGGER_OVERWRITE_SPLIT; - break; - } - - BUG_ON(sectors >= 0); - - ret = bch2_mark_key_locked(c, old, bkey_i_to_s_c(new), - offset, sectors, fs_usage, - trans->journal_res.seq, flags) ?: 1; - if (ret <= 0) - break; - } - bch2_trans_iter_put(trans, copy); + ret = bch2_mark_key_locked(trans, deleted, bkey_i_to_s_c(new), + BTREE_TRIGGER_INSERT|flags) ?: + bch2_mark_key_locked(trans, old, deleted, + BTREE_TRIGGER_OVERWRITE|flags); } return ret; @@ -1404,23 +1341,14 @@ void fs_usage_apply_warn(struct btree_trans *trans, pr_err("%s", buf); pr_err("overlapping with"); - if (btree_iter_type(i->iter) != BTREE_ITER_CACHED) { - struct btree_iter *copy = bch2_trans_copy_iter(trans, i->iter); - struct bkey_s_c k; - int ret; + if (!i->cached) { + struct bkey u; + struct bkey_s_c k = bch2_btree_path_peek_slot(i->path, &u); - for_each_btree_key_continue(copy, 0, k, ret) { - if (btree_node_type_is_extents(i->iter->btree_id) - ? 
bkey_cmp(i->k->k.p, bkey_start_pos(k.k)) <= 0 - : bkey_cmp(i->k->k.p, k.k->p)) - break; - - bch2_bkey_val_to_text(&PBUF(buf), c, k); - pr_err("%s", buf); - } - bch2_trans_iter_put(trans, copy); + bch2_bkey_val_to_text(&PBUF(buf), c, k); + pr_err("%s", buf); } else { - struct bkey_cached *ck = (void *) i->iter->l[0].b; + struct bkey_cached *ck = (void *) i->path->l[0].b; if (ck->valid) { bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(ck->k)); @@ -1457,7 +1385,7 @@ void bch2_trans_fs_usage_apply(struct btree_trans *trans, added += d->delta; } - BUG_ON(update_replicas(c, dst, &d->r, d->delta)); + BUG_ON(__update_replicas(c, dst, &d->r, d->delta)); } dst->nr_inodes += deltas->nr_inodes; @@ -1474,7 +1402,14 @@ void bch2_trans_fs_usage_apply(struct btree_trans *trans, */ should_not_have_added = added - (s64) disk_res_sectors; if (unlikely(should_not_have_added > 0)) { - atomic64_sub(should_not_have_added, &c->sectors_available); + u64 old, new, v = atomic64_read(&c->sectors_available); + + do { + old = v; + new = max_t(s64, 0, old - should_not_have_added); + } while ((v = atomic64_cmpxchg(&c->sectors_available, + old, new)) != old); + added -= should_not_have_added; warn = true; } @@ -1492,54 +1427,8 @@ void bch2_trans_fs_usage_apply(struct btree_trans *trans, /* trans_mark: */ -static struct btree_iter *trans_get_update(struct btree_trans *trans, - enum btree_id btree_id, struct bpos pos, - struct bkey_s_c *k) -{ - struct btree_insert_entry *i; - - trans_for_each_update(trans, i) - if (i->iter->btree_id == btree_id && - (btree_node_type_is_extents(btree_id) - ? bkey_cmp(pos, bkey_start_pos(&i->k->k)) >= 0 && - bkey_cmp(pos, i->k->k.p) < 0 - : !bkey_cmp(pos, i->iter->pos))) { - *k = bkey_i_to_s_c(i->k); - - /* ugly hack.. */ - BUG_ON(btree_iter_live(trans, i->iter)); - trans->iters_live |= 1ULL << i->iter->idx; - return i->iter; - } - - return NULL; -} - -static int trans_get_key(struct btree_trans *trans, - enum btree_id btree_id, struct bpos pos, - struct btree_iter **iter, - struct bkey_s_c *k) -{ - unsigned flags = btree_id != BTREE_ID_alloc - ? 
BTREE_ITER_SLOTS - : BTREE_ITER_CACHED; - int ret; - - *iter = trans_get_update(trans, btree_id, pos, k); - if (*iter) - return 1; - - *iter = bch2_trans_get_iter(trans, btree_id, pos, - flags|BTREE_ITER_INTENT); - *k = __bch2_btree_iter_peek(*iter, flags); - ret = bkey_err(*k); - if (ret) - bch2_trans_iter_put(trans, *iter); - return ret; -} - static struct bkey_alloc_buf * -bch2_trans_start_alloc_update(struct btree_trans *trans, struct btree_iter **_iter, +bch2_trans_start_alloc_update(struct btree_trans *trans, struct btree_iter *iter, const struct bch_extent_ptr *ptr, struct bkey_alloc_unpacked *u) { @@ -1547,36 +1436,33 @@ bch2_trans_start_alloc_update(struct btree_trans *trans, struct btree_iter **_it struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); struct bpos pos = POS(ptr->dev, PTR_BUCKET_NR(ca, ptr)); struct bucket *g; - struct btree_iter *iter; - struct bkey_s_c k; struct bkey_alloc_buf *a; + struct bkey_i *update = btree_trans_peek_updates(trans, BTREE_ID_alloc, pos); int ret; a = bch2_trans_kmalloc(trans, sizeof(struct bkey_alloc_buf)); if (IS_ERR(a)) return a; - iter = trans_get_update(trans, BTREE_ID_alloc, pos, &k); - if (iter) { - *u = bch2_alloc_unpack(k); - } else { - iter = bch2_trans_get_iter(trans, BTREE_ID_alloc, pos, - BTREE_ITER_CACHED| - BTREE_ITER_CACHED_NOFILL| - BTREE_ITER_INTENT); - ret = bch2_btree_iter_traverse(iter); - if (ret) { - bch2_trans_iter_put(trans, iter); - return ERR_PTR(ret); - } + bch2_trans_iter_init(trans, iter, BTREE_ID_alloc, pos, + BTREE_ITER_CACHED| + BTREE_ITER_CACHED_NOFILL| + BTREE_ITER_INTENT); + ret = bch2_btree_iter_traverse(iter); + if (ret) { + bch2_trans_iter_exit(trans, iter); + return ERR_PTR(ret); + } + if (update && !bpos_cmp(update->k.p, pos)) { + *u = bch2_alloc_unpack(bkey_i_to_s_c(update)); + } else { percpu_down_read(&c->mark_lock); g = bucket(ca, pos.offset); *u = alloc_mem_to_key(iter, g, READ_ONCE(g->mark)); percpu_up_read(&c->mark_lock); } - *_iter = iter; return a; } @@ -1585,7 +1471,7 @@ static int bch2_trans_mark_pointer(struct btree_trans *trans, s64 sectors, enum bch_data_type data_type) { struct bch_fs *c = trans->c; - struct btree_iter *iter; + struct btree_iter iter; struct bkey_alloc_unpacked u; struct bkey_alloc_buf *a; int ret; @@ -1594,15 +1480,16 @@ static int bch2_trans_mark_pointer(struct btree_trans *trans, if (IS_ERR(a)) return PTR_ERR(a); - ret = __mark_pointer(c, k, &p.ptr, sectors, data_type, u.gen, &u.data_type, + ret = __mark_pointer(trans, k, &p.ptr, sectors, data_type, + u.gen, &u.data_type, &u.dirty_sectors, &u.cached_sectors); if (ret) goto out; bch2_alloc_pack(c, a, u); - bch2_trans_update(trans, iter, &a->k, 0); + bch2_trans_update(trans, &iter, &a->k, 0); out: - bch2_trans_iter_put(trans, iter); + bch2_trans_iter_exit(trans, &iter); return ret; } @@ -1611,15 +1498,19 @@ static int bch2_trans_mark_stripe_ptr(struct btree_trans *trans, s64 sectors, enum bch_data_type data_type) { struct bch_fs *c = trans->c; - struct btree_iter *iter; + struct btree_iter iter; struct bkey_s_c k; struct bkey_i_stripe *s; struct bch_replicas_padded r; int ret = 0; - ret = trans_get_key(trans, BTREE_ID_stripes, POS(0, p.ec.idx), &iter, &k); - if (ret < 0) - return ret; + bch2_trans_iter_init(trans, &iter, BTREE_ID_stripes, POS(0, p.ec.idx), + BTREE_ITER_INTENT| + BTREE_ITER_WITH_UPDATES); + k = bch2_btree_iter_peek_slot(&iter); + ret = bkey_err(k); + if (ret) + goto err; if (k.k->type != KEY_TYPE_stripe) { bch2_fs_inconsistent(c, @@ -1627,7 +1518,7 @@ static int bch2_trans_mark_stripe_ptr(struct 
btree_trans *trans, (u64) p.ec.idx); bch2_inconsistent_error(c); ret = -EIO; - goto out; + goto err; } if (!bch2_ptr_matches_stripe(bkey_s_c_to_stripe(k).v, p)) { @@ -1635,37 +1526,42 @@ static int bch2_trans_mark_stripe_ptr(struct btree_trans *trans, "stripe pointer doesn't match stripe %llu", (u64) p.ec.idx); ret = -EIO; - goto out; + goto err; } s = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); ret = PTR_ERR_OR_ZERO(s); if (ret) - goto out; + goto err; bkey_reassemble(&s->k_i, k); stripe_blockcount_set(&s->v, p.ec.block, stripe_blockcount_get(&s->v, p.ec.block) + sectors); - bch2_trans_update(trans, iter, &s->k_i, 0); + bch2_trans_update(trans, &iter, &s->k_i, 0); bch2_bkey_to_replicas(&r.e, bkey_i_to_s_c(&s->k_i)); r.e.data_type = data_type; update_replicas_list(trans, &r.e, sectors); -out: - bch2_trans_iter_put(trans, iter); +err: + bch2_trans_iter_exit(trans, &iter); return ret; } static int bch2_trans_mark_extent(struct btree_trans *trans, - struct bkey_s_c k, unsigned offset, - s64 sectors, unsigned flags, - enum bch_data_type data_type) + struct bkey_s_c k, unsigned flags) { + struct bch_fs *c = trans->c; struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); const union bch_extent_entry *entry; struct extent_ptr_decoded p; struct bch_replicas_padded r; + enum bch_data_type data_type = bkey_is_btree_ptr(k.k) + ? BCH_DATA_btree + : BCH_DATA_user; + s64 sectors = bkey_is_btree_ptr(k.k) + ? c->opts.btree_node_size + : k.k->size; s64 dirty_sectors = 0; bool stale; int ret; @@ -1674,15 +1570,14 @@ static int bch2_trans_mark_extent(struct btree_trans *trans, r.e.nr_devs = 0; r.e.nr_required = 1; - BUG_ON(!sectors); - bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { - s64 disk_sectors = data_type == BCH_DATA_btree - ? sectors - : ptr_disk_sectors_delta(p, offset, sectors, flags); + s64 disk_sectors = ptr_disk_sectors(sectors, p); + + if (flags & BTREE_TRIGGER_OVERWRITE) + disk_sectors = -disk_sectors; - ret = bch2_trans_mark_pointer(trans, k, p, disk_sectors, - data_type); + ret = bch2_trans_mark_pointer(trans, k, p, + disk_sectors, data_type); if (ret < 0) return ret; @@ -1718,7 +1613,7 @@ static int bch2_trans_mark_stripe_alloc_ref(struct btree_trans *trans, struct bch_fs *c = trans->c; const struct bch_extent_ptr *ptr = &s.v->ptrs[idx]; struct bkey_alloc_buf *a; - struct btree_iter *iter; + struct btree_iter iter; struct bkey_alloc_unpacked u; bool parity = idx >= s.v->nr_blocks - s.v->nr_redundant; int ret = 0; @@ -1742,7 +1637,7 @@ static int bch2_trans_mark_stripe_alloc_ref(struct btree_trans *trans, if (!deleting) { if (bch2_fs_inconsistent_on(u.stripe && u.stripe != s.k->p.offset, c, "bucket %llu:%llu gen %u: multiple stripes using same bucket (%u, %llu)", - iter->pos.inode, iter->pos.offset, u.gen, + iter.pos.inode, iter.pos.offset, u.gen, u.stripe, s.k->p.offset)) { ret = -EIO; goto err; @@ -1756,9 +1651,9 @@ static int bch2_trans_mark_stripe_alloc_ref(struct btree_trans *trans, } bch2_alloc_pack(c, a, u); - bch2_trans_update(trans, iter, &a->k, 0); + bch2_trans_update(trans, &iter, &a->k, 0); err: - bch2_trans_iter_put(trans, iter); + bch2_trans_iter_exit(trans, &iter); return ret; } @@ -1818,40 +1713,62 @@ static int bch2_trans_mark_stripe(struct btree_trans *trans, return ret; } +static int bch2_trans_mark_inode(struct btree_trans *trans, + struct bkey_s_c old, + struct bkey_s_c new, + unsigned flags) +{ + int nr = bkey_is_inode(new.k) - bkey_is_inode(old.k); + + if (nr) { + struct replicas_delta_list *d = + replicas_deltas_realloc(trans, 0); + d->nr_inodes += nr; + } + + return 0; +} 
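The trigger rework that ends with bch2_trans_mark_inode() above replaces the old offset/sectors plumbing with signed deltas: BTREE_TRIGGER_OVERWRITE simply negates whatever the new ptr_disk_sectors() computed, and inode counts become new-minus-old. A sketch of both idioms under hypothetical simplified types (the real ptr_disk_sectors() also treats BCH_COMPRESSION_TYPE_incompressible as uncompressed, which is skipped here):

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    #define DIV_ROUND_UP_ULL(n, d) (((n) + (d) - 1) / (d))

    /* hypothetical stand-in for the decoded crc fields used above */
    struct crc { unsigned compression_type, compressed_size, uncompressed_size; };

    /* compressed extents charge their on-disk footprint, scaled from size */
    static int64_t disk_sectors(int64_t sectors, struct crc c, bool overwrite)
    {
        int64_t d = c.compression_type
            ? (int64_t)DIV_ROUND_UP_ULL((uint64_t)sectors * c.compressed_size,
                                        c.uncompressed_size)
            : sectors;
        return overwrite ? -d : d; /* BTREE_TRIGGER_OVERWRITE negates */
    }

    int main(void)
    {
        struct crc c = { 1, 32, 128 }; /* 4:1 compressed extent */

        printf("insert:    %+lld\n", (long long)disk_sectors(128, c, false)); /* +32 */
        printf("overwrite: %+lld\n", (long long)disk_sectors(128, c, true));  /* -32 */

        /* inode triggers record a signed count delta the same way: */
        bool old_is_inode = false, new_is_inode = true;
        printf("nr_inodes: %+d\n", (int)new_is_inode - (int)old_is_inode);    /* +1 */
        return 0;
    }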
+ +static int bch2_trans_mark_reservation(struct btree_trans *trans, + struct bkey_s_c k, unsigned flags) +{ + unsigned replicas = bkey_s_c_to_reservation(k).v->nr_replicas; + s64 sectors = (s64) k.k->size; + struct replicas_delta_list *d; + + if (flags & BTREE_TRIGGER_OVERWRITE) + sectors = -sectors; + sectors *= replicas; + + d = replicas_deltas_realloc(trans, 0); + + replicas = clamp_t(unsigned, replicas, 1, + ARRAY_SIZE(d->persistent_reserved)); + + d->persistent_reserved[replicas - 1] += sectors; + return 0; +} + static int __bch2_trans_mark_reflink_p(struct btree_trans *trans, struct bkey_s_c_reflink_p p, - u64 idx, unsigned sectors, - unsigned front_frag, - unsigned back_frag, - unsigned flags) + u64 *idx, unsigned flags) { struct bch_fs *c = trans->c; - struct btree_iter *iter; + struct btree_iter iter; struct bkey_s_c k; struct bkey_i *n; __le64 *refcount; int add = !(flags & BTREE_TRIGGER_OVERWRITE) ? 1 : -1; - int frags_referenced; - s64 ret; - - ret = trans_get_key(trans, BTREE_ID_reflink, - POS(0, idx), &iter, &k); - if (ret < 0) - return ret; - - sectors = min_t(u64, sectors, k.k->p.offset - idx); - - frags_referenced = - reflink_p_frag_references(p, 0, front_frag, k) + - reflink_p_frag_references(p, back_frag, p.k->size, k); + char buf[200]; + int ret; - if (frags_referenced == 2) { - BUG_ON(!(flags & BTREE_TRIGGER_OVERWRITE_SPLIT)); - add = -add; - } else if (frags_referenced == 1) { - BUG_ON(!(flags & BTREE_TRIGGER_OVERWRITE)); - goto out; - } + bch2_trans_iter_init(trans, &iter, BTREE_ID_reflink, POS(0, *idx), + BTREE_ITER_INTENT| + BTREE_ITER_WITH_UPDATES); + k = bch2_btree_iter_peek_slot(&iter); + ret = bkey_err(k); + if (ret) + goto err; n = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); ret = PTR_ERR_OR_ZERO(n); @@ -1862,15 +1779,38 @@ static int __bch2_trans_mark_reflink_p(struct btree_trans *trans, refcount = bkey_refcount(n); if (!refcount) { + bch2_bkey_val_to_text(&PBUF(buf), c, p.s_c); bch2_fs_inconsistent(c, - "%llu:%llu len %u points to nonexistent indirect extent %llu", - p.k->p.inode, p.k->p.offset, p.k->size, idx); - bch2_inconsistent_error(c); + "nonexistent indirect extent at %llu while marking\n %s", + *idx, buf); ret = -EIO; goto err; } - BUG_ON(!*refcount && (flags & BTREE_TRIGGER_OVERWRITE)); + if (!*refcount && (flags & BTREE_TRIGGER_OVERWRITE)) { + bch2_bkey_val_to_text(&PBUF(buf), c, p.s_c); + bch2_fs_inconsistent(c, + "indirect extent refcount underflow at %llu while marking\n %s", + *idx, buf); + ret = -EIO; + goto err; + } + + if (flags & BTREE_TRIGGER_INSERT) { + struct bch_reflink_p *v = (struct bch_reflink_p *) p.v; + u64 pad; + + pad = max_t(s64, le32_to_cpu(v->front_pad), + le64_to_cpu(v->idx) - bkey_start_offset(k.k)); + BUG_ON(pad > U32_MAX); + v->front_pad = cpu_to_le32(pad); + + pad = max_t(s64, le32_to_cpu(v->back_pad), + k.k->p.offset - p.k->size - le64_to_cpu(v->idx)); + BUG_ON(pad > U32_MAX); + v->back_pad = cpu_to_le32(pad); + } + le64_add_cpu(refcount, add); if (!*refcount) { @@ -1878,210 +1818,72 @@ static int __bch2_trans_mark_reflink_p(struct btree_trans *trans, set_bkey_val_u64s(&n->k, 0); } - bch2_btree_iter_set_pos(iter, bkey_start_pos(k.k)); - ret = bch2_trans_update(trans, iter, n, 0); + bch2_btree_iter_set_pos_to_extent_start(&iter); + ret = bch2_trans_update(trans, &iter, n, 0); if (ret) goto err; -out: - ret = sectors; + + *idx = k.k->p.offset; err: - bch2_trans_iter_put(trans, iter); + bch2_trans_iter_exit(trans, &iter); return ret; } static int bch2_trans_mark_reflink_p(struct btree_trans *trans, - struct 
bkey_s_c_reflink_p p, unsigned offset, - s64 sectors, unsigned flags) + struct bkey_s_c k, unsigned flags) { - u64 idx = le64_to_cpu(p.v->idx) + offset; - unsigned front_frag, back_frag; - s64 ret = 0; - - if (sectors < 0) - sectors = -sectors; + struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); + u64 idx, end_idx; + int ret = 0; - BUG_ON(offset + sectors > p.k->size); + if (flags & BTREE_TRIGGER_INSERT) { + struct bch_reflink_p *v = (struct bch_reflink_p *) p.v; - front_frag = offset; - back_frag = offset + sectors; + v->front_pad = v->back_pad = 0; + } - while (sectors) { - ret = __bch2_trans_mark_reflink_p(trans, p, idx, sectors, - front_frag, back_frag, flags); - if (ret < 0) - return ret; + idx = le64_to_cpu(p.v->idx) - le32_to_cpu(p.v->front_pad); + end_idx = le64_to_cpu(p.v->idx) + p.k->size + + le32_to_cpu(p.v->back_pad); - idx += ret; - sectors -= ret; - } + while (idx < end_idx && !ret) + ret = __bch2_trans_mark_reflink_p(trans, p, &idx, flags); - return 0; + return ret; } -int bch2_trans_mark_key(struct btree_trans *trans, - struct bkey_s_c old, - struct bkey_s_c new, - unsigned offset, s64 sectors, unsigned flags) +int bch2_trans_mark_key(struct btree_trans *trans, struct bkey_s_c old, + struct bkey_s_c new, unsigned flags) { - struct bch_fs *c = trans->c; - struct bkey_s_c k = flags & BTREE_TRIGGER_INSERT ? new : old; - struct replicas_delta_list *d; - - BUG_ON(!(flags & (BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE))); + struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE ? old: new; switch (k.k->type) { case KEY_TYPE_btree_ptr: case KEY_TYPE_btree_ptr_v2: - sectors = !(flags & BTREE_TRIGGER_OVERWRITE) - ? c->opts.btree_node_size - : -c->opts.btree_node_size; - - return bch2_trans_mark_extent(trans, k, offset, sectors, - flags, BCH_DATA_btree); case KEY_TYPE_extent: case KEY_TYPE_reflink_v: - return bch2_trans_mark_extent(trans, k, offset, sectors, - flags, BCH_DATA_user); + return bch2_trans_mark_extent(trans, k, flags); case KEY_TYPE_stripe: return bch2_trans_mark_stripe(trans, old, new, flags); - case KEY_TYPE_inode: { - int nr = (new.k->type == KEY_TYPE_inode) - - (old.k->type == KEY_TYPE_inode); - - if (nr) { - d = replicas_deltas_realloc(trans, 0); - d->nr_inodes += nr; - } - - return 0; - } - case KEY_TYPE_reservation: { - unsigned replicas = bkey_s_c_to_reservation(k).v->nr_replicas; - - d = replicas_deltas_realloc(trans, 0); - - sectors *= replicas; - replicas = clamp_t(unsigned, replicas, 1, - ARRAY_SIZE(d->persistent_reserved)); - - d->persistent_reserved[replicas - 1] += sectors; - return 0; - } + case KEY_TYPE_inode: + case KEY_TYPE_inode_v2: + return bch2_trans_mark_inode(trans, old, new, flags); + case KEY_TYPE_reservation: + return bch2_trans_mark_reservation(trans, k, flags); case KEY_TYPE_reflink_p: - return bch2_trans_mark_reflink_p(trans, - bkey_s_c_to_reflink_p(k), - offset, sectors, flags); + return bch2_trans_mark_reflink_p(trans, k, flags); default: return 0; } } -int bch2_trans_mark_update(struct btree_trans *trans, - struct btree_iter *iter, - struct bkey_i *new, - unsigned flags) -{ - struct bkey_s_c old; - int ret; - - if (unlikely(flags & BTREE_TRIGGER_NORUN)) - return 0; - - if (!btree_node_type_needs_gc(iter->btree_id)) - return 0; - - if (!btree_node_type_is_extents(iter->btree_id)) { - if (btree_iter_type(iter) != BTREE_ITER_CACHED) { - old = bch2_btree_iter_peek_slot(iter); - ret = bkey_err(old); - if (ret) - return ret; - } else { - struct bkey_cached *ck = (void *) iter->l[0].b; - - BUG_ON(!ck->valid); - old = bkey_i_to_s_c(ck->k); - 
} - - if (old.k->type == new->k.type) { - ret = bch2_trans_mark_key(trans, old, bkey_i_to_s_c(new), 0, 0, - BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE|flags); - } else { - ret = bch2_trans_mark_key(trans, old, bkey_i_to_s_c(new), 0, 0, - BTREE_TRIGGER_INSERT|flags) ?: - bch2_trans_mark_key(trans, old, bkey_i_to_s_c(new), 0, 0, - BTREE_TRIGGER_OVERWRITE|flags); - } - } else { - struct btree_iter *copy; - struct bkey _old; - - EBUG_ON(btree_iter_type(iter) == BTREE_ITER_CACHED); - - bkey_init(&_old); - old = (struct bkey_s_c) { &_old, NULL }; - - ret = bch2_trans_mark_key(trans, old, bkey_i_to_s_c(new), - 0, new->k.size, - BTREE_TRIGGER_INSERT); - if (ret) - return ret; - - copy = bch2_trans_copy_iter(trans, iter); - - for_each_btree_key_continue(copy, 0, old, ret) { - unsigned offset = 0; - s64 sectors = -((s64) old.k->size); - - flags |= BTREE_TRIGGER_OVERWRITE; - - if (bkey_cmp(new->k.p, bkey_start_pos(old.k)) <= 0) - break; - - switch (bch2_extent_overlap(&new->k, old.k)) { - case BCH_EXTENT_OVERLAP_ALL: - offset = 0; - sectors = -((s64) old.k->size); - break; - case BCH_EXTENT_OVERLAP_BACK: - offset = bkey_start_offset(&new->k) - - bkey_start_offset(old.k); - sectors = bkey_start_offset(&new->k) - - old.k->p.offset; - break; - case BCH_EXTENT_OVERLAP_FRONT: - offset = 0; - sectors = bkey_start_offset(old.k) - - new->k.p.offset; - break; - case BCH_EXTENT_OVERLAP_MIDDLE: - offset = bkey_start_offset(&new->k) - - bkey_start_offset(old.k); - sectors = -((s64) new->k.size); - flags |= BTREE_TRIGGER_OVERWRITE_SPLIT; - break; - } - - BUG_ON(sectors >= 0); - - ret = bch2_trans_mark_key(trans, old, bkey_i_to_s_c(new), - offset, sectors, flags); - if (ret) - break; - } - bch2_trans_iter_put(trans, copy); - } - - return ret; -} - static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans, struct bch_dev *ca, size_t b, enum bch_data_type type, unsigned sectors) { struct bch_fs *c = trans->c; - struct btree_iter *iter; + struct btree_iter iter; struct bkey_alloc_unpacked u; struct bkey_alloc_buf *a; struct bch_extent_ptr ptr = { @@ -2104,7 +1906,7 @@ static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans, bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, "bucket %llu:%llu gen %u different types of data in same bucket: %s, %s\n" "while marking %s", - iter->pos.inode, iter->pos.offset, u.gen, + iter.pos.inode, iter.pos.offset, u.gen, bch2_data_types[u.data_type], bch2_data_types[type], bch2_data_types[type]); @@ -2116,9 +1918,9 @@ static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans, u.dirty_sectors = sectors; bch2_alloc_pack(c, a, u); - bch2_trans_update(trans, iter, &a->k, 0); + bch2_trans_update(trans, &iter, &a->k, 0); out: - bch2_trans_iter_put(trans, iter); + bch2_trans_iter_exit(trans, &iter); return ret; } diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h index 04a2a9310cdd..5ed9441cb115 100644 --- a/fs/bcachefs/buckets.h +++ b/fs/bcachefs/buckets.h @@ -125,20 +125,6 @@ static inline u8 ptr_stale(struct bch_dev *ca, return gen_after(ptr_bucket_mark(ca, ptr).gen, ptr->gen); } -static inline s64 __ptr_disk_sectors(struct extent_ptr_decoded p, - unsigned live_size) -{ - return live_size && p.crc.compression_type - ? 
max(1U, DIV_ROUND_UP(live_size * p.crc.compressed_size, - p.crc.uncompressed_size)) - : live_size; -} - -static inline s64 ptr_disk_sectors(struct extent_ptr_decoded p) -{ - return __ptr_disk_sectors(p, p.crc.live_size); -} - /* bucket gc marks */ static inline unsigned bucket_sectors_used(struct bucket_mark mark) @@ -240,16 +226,13 @@ void bch2_mark_metadata_bucket(struct bch_fs *, struct bch_dev *, size_t, enum bch_data_type, unsigned, struct gc_pos, unsigned); -int bch2_mark_key(struct bch_fs *, struct bkey_s_c, unsigned, - s64, struct bch_fs_usage *, u64, unsigned); +int bch2_mark_key(struct btree_trans *, struct bkey_s_c, unsigned); -int bch2_mark_update(struct btree_trans *, struct btree_iter *, - struct bkey_i *, struct bch_fs_usage *, unsigned); +int bch2_mark_update(struct btree_trans *, struct btree_path *, + struct bkey_i *, unsigned); -int bch2_trans_mark_key(struct btree_trans *, struct bkey_s_c, struct bkey_s_c, - unsigned, s64, unsigned); -int bch2_trans_mark_update(struct btree_trans *, struct btree_iter *iter, - struct bkey_i *insert, unsigned); +int bch2_trans_mark_key(struct btree_trans *, struct bkey_s_c, + struct bkey_s_c, unsigned); void bch2_trans_fs_usage_apply(struct btree_trans *, struct replicas_delta_list *); int bch2_trans_mark_metadata_bucket(struct btree_trans *, struct bch_dev *, diff --git a/fs/bcachefs/chardev.c b/fs/bcachefs/chardev.c index c29f8272e682..db68a78276cf 100644 --- a/fs/bcachefs/chardev.c +++ b/fs/bcachefs/chardev.c @@ -157,6 +157,9 @@ static long bch2_ioctl_query_uuid(struct bch_fs *c, #if 0 static long bch2_ioctl_start(struct bch_fs *c, struct bch_ioctl_start arg) { + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + if (arg.flags || arg.pad) return -EINVAL; @@ -165,6 +168,9 @@ static long bch2_ioctl_start(struct bch_fs *c, struct bch_ioctl_start arg) static long bch2_ioctl_stop(struct bch_fs *c) { + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + bch2_fs_stop(c); return 0; } @@ -175,6 +181,9 @@ static long bch2_ioctl_disk_add(struct bch_fs *c, struct bch_ioctl_disk arg) char *path; int ret; + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + if (arg.flags || arg.pad) return -EINVAL; @@ -192,6 +201,9 @@ static long bch2_ioctl_disk_remove(struct bch_fs *c, struct bch_ioctl_disk arg) { struct bch_dev *ca; + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + if ((arg.flags & ~(BCH_FORCE_IF_DATA_LOST| BCH_FORCE_IF_METADATA_LOST| BCH_FORCE_IF_DEGRADED| @@ -211,6 +223,9 @@ static long bch2_ioctl_disk_online(struct bch_fs *c, struct bch_ioctl_disk arg) char *path; int ret; + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + if (arg.flags || arg.pad) return -EINVAL; @@ -228,6 +243,9 @@ static long bch2_ioctl_disk_offline(struct bch_fs *c, struct bch_ioctl_disk arg) struct bch_dev *ca; int ret; + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + if ((arg.flags & ~(BCH_FORCE_IF_DATA_LOST| BCH_FORCE_IF_METADATA_LOST| BCH_FORCE_IF_DEGRADED| @@ -250,11 +268,15 @@ static long bch2_ioctl_disk_set_state(struct bch_fs *c, struct bch_dev *ca; int ret; + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + if ((arg.flags & ~(BCH_FORCE_IF_DATA_LOST| BCH_FORCE_IF_METADATA_LOST| BCH_FORCE_IF_DEGRADED| BCH_BY_INDEX)) || - arg.pad[0] || arg.pad[1] || arg.pad[2]) + arg.pad[0] || arg.pad[1] || arg.pad[2] || + arg.new_state >= BCH_MEMBER_STATE_NR) return -EINVAL; ca = bch2_device_lookup(c, arg.dev, arg.flags); @@ -331,6 +353,9 @@ static long bch2_ioctl_data(struct bch_fs *c, unsigned flags = O_RDONLY|O_CLOEXEC|O_NONBLOCK; int ret, fd = -1; + if (!capable(CAP_SYS_ADMIN)) + return 
-EPERM; + if (arg.op >= BCH_DATA_OP_NR || arg.flags) return -EINVAL; @@ -497,6 +522,9 @@ static long bch2_ioctl_read_super(struct bch_fs *c, struct bch_sb *sb; int ret = 0; + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + if ((arg.flags & ~(BCH_BY_INDEX|BCH_READ_DEV)) || arg.pad) return -EINVAL; @@ -537,6 +565,9 @@ static long bch2_ioctl_disk_get_idx(struct bch_fs *c, struct bch_dev *ca; unsigned i; + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + for_each_online_member(ca, c, i) if (ca->disk_sb.bdev->bd_dev == dev) { percpu_ref_put(&ca->io_ref); @@ -552,6 +583,9 @@ static long bch2_ioctl_disk_resize(struct bch_fs *c, struct bch_dev *ca; int ret; + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + if ((arg.flags & ~BCH_BY_INDEX) || arg.pad) return -EINVAL; @@ -572,6 +606,9 @@ static long bch2_ioctl_disk_resize_journal(struct bch_fs *c, struct bch_dev *ca; int ret; + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + if ((arg.flags & ~BCH_BY_INDEX) || arg.pad) return -EINVAL; @@ -597,7 +634,6 @@ do { \ long bch2_fs_ioctl(struct bch_fs *c, unsigned cmd, void __user *arg) { - /* ioctls that don't require admin cap: */ switch (cmd) { case BCH_IOCTL_QUERY_UUID: return bch2_ioctl_query_uuid(c, arg); @@ -605,12 +641,6 @@ long bch2_fs_ioctl(struct bch_fs *c, unsigned cmd, void __user *arg) return bch2_ioctl_fs_usage(c, arg); case BCH_IOCTL_DEV_USAGE: return bch2_ioctl_dev_usage(c, arg); - } - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - switch (cmd) { #if 0 case BCH_IOCTL_START: BCH_IOCTL(start, struct bch_ioctl_start); @@ -626,7 +656,6 @@ long bch2_fs_ioctl(struct bch_fs *c, unsigned cmd, void __user *arg) if (!test_bit(BCH_FS_STARTED, &c->flags)) return -EINVAL; - /* ioctls that do require admin cap: */ switch (cmd) { case BCH_IOCTL_DISK_ADD: BCH_IOCTL(disk_add, struct bch_ioctl_disk); diff --git a/fs/bcachefs/checksum.c b/fs/bcachefs/checksum.c index a01073e54a33..fbe8603cfb30 100644 --- a/fs/bcachefs/checksum.c +++ b/fs/bcachefs/checksum.c @@ -6,83 +6,108 @@ #include <linux/crc32c.h> #include <linux/crypto.h> +#include <linux/xxhash.h> #include <linux/key.h> #include <linux/random.h> #include <linux/scatterlist.h> #include <crypto/algapi.h> -#include <crypto/chacha20.h> +#include <crypto/chacha.h> #include <crypto/hash.h> #include <crypto/poly1305.h> #include <crypto/skcipher.h> #include <keys/user-type.h> -static u64 bch2_checksum_init(unsigned type) +/* + * bch2_checksum state is an abstraction of the checksum state calculated over different pages. + * it features page merging without having the checksum algorithm lose its state. + * for native checksum algorithms (like crc), a default seed value will do.
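The seed-carrying property this comment relies on — and the reason xxhash needs more than a u64 — is easy to demonstrate with any CRC: the running CRC value is the complete state, so feeding pages one at a time with the previous result as the seed matches a one-shot checksum. A minimal sketch using zlib's crc32 as a stand-in for the kernel's crc32c (an assumption for illustration only; build with -lz):

    #include <assert.h>
    #include <zlib.h>

    int main(void)
    {
        const unsigned char buf[] = "bcachefs checksum state";
        size_t n = sizeof(buf) - 1, split = 9;

        /* whole buffer in one call */
        uLong whole = crc32(crc32(0L, Z_NULL, 0), buf, (uInt)n);

        /* same bytes fed as two "pages", carrying the result as the seed */
        uLong state = crc32(0L, Z_NULL, 0);                    /* init */
        state = crc32(state, buf, (uInt)split);                /* update */
        state = crc32(state, buf + split, (uInt)(n - split));  /* update */

        assert(state == whole);                                /* final */
        return 0;
    }

An xxh64 digest, by contrast, cannot be used to resume hashing — hence the union in the new struct keeping a full xxh64_state for hash-style algorithms.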
+ * for hash-like algorithms, a state needs to be stored + */ + +struct bch2_checksum_state { + union { + u64 seed; + struct xxh64_state h64state; + }; + unsigned int type; +}; + +static void bch2_checksum_init(struct bch2_checksum_state *state) { - switch (type) { - case BCH_CSUM_NONE: - return 0; - case BCH_CSUM_CRC32C_NONZERO: - return U32_MAX; - case BCH_CSUM_CRC64_NONZERO: - return U64_MAX; - case BCH_CSUM_CRC32C: - return 0; - case BCH_CSUM_CRC64: - return 0; + switch (state->type) { + case BCH_CSUM_none: + case BCH_CSUM_crc32c: + case BCH_CSUM_crc64: + state->seed = 0; + break; + case BCH_CSUM_crc32c_nonzero: + state->seed = U32_MAX; + break; + case BCH_CSUM_crc64_nonzero: + state->seed = U64_MAX; + break; + case BCH_CSUM_xxhash: + xxh64_reset(&state->h64state, 0); + break; default: BUG(); } } -static u64 bch2_checksum_final(unsigned type, u64 crc) +static u64 bch2_checksum_final(const struct bch2_checksum_state *state) { - switch (type) { - case BCH_CSUM_NONE: - return 0; - case BCH_CSUM_CRC32C_NONZERO: - return crc ^ U32_MAX; - case BCH_CSUM_CRC64_NONZERO: - return crc ^ U64_MAX; - case BCH_CSUM_CRC32C: - return crc; - case BCH_CSUM_CRC64: - return crc; + switch (state->type) { + case BCH_CSUM_none: + case BCH_CSUM_crc32c: + case BCH_CSUM_crc64: + return state->seed; + case BCH_CSUM_crc32c_nonzero: + return state->seed ^ U32_MAX; + case BCH_CSUM_crc64_nonzero: + return state->seed ^ U64_MAX; + case BCH_CSUM_xxhash: + return xxh64_digest(&state->h64state); default: BUG(); } } -static u64 bch2_checksum_update(unsigned type, u64 crc, const void *data, size_t len) +static void bch2_checksum_update(struct bch2_checksum_state *state, const void *data, size_t len) { - switch (type) { - case BCH_CSUM_NONE: - return 0; - case BCH_CSUM_CRC32C_NONZERO: - case BCH_CSUM_CRC32C: - return crc32c(crc, data, len); - case BCH_CSUM_CRC64_NONZERO: - case BCH_CSUM_CRC64: - return crc64_be(crc, data, len); + switch (state->type) { + case BCH_CSUM_none: + return; + case BCH_CSUM_crc32c_nonzero: + case BCH_CSUM_crc32c: + state->seed = crc32c(state->seed, data, len); + break; + case BCH_CSUM_crc64_nonzero: + case BCH_CSUM_crc64: + state->seed = crc64_be(state->seed, data, len); + break; + case BCH_CSUM_xxhash: + xxh64_update(&state->h64state, data, len); + break; default: BUG(); } } -static inline void do_encrypt_sg(struct crypto_skcipher *tfm, +static inline void do_encrypt_sg(struct crypto_sync_skcipher *tfm, struct nonce nonce, struct scatterlist *sg, size_t len) { - SKCIPHER_REQUEST_ON_STACK(req, tfm); + SYNC_SKCIPHER_REQUEST_ON_STACK(req, tfm); int ret; - skcipher_request_set_tfm(req, tfm); + skcipher_request_set_sync_tfm(req, tfm); skcipher_request_set_crypt(req, sg, sg, len, nonce.d); ret = crypto_skcipher_encrypt(req); BUG_ON(ret); } -static inline void do_encrypt(struct crypto_skcipher *tfm, +static inline void do_encrypt(struct crypto_sync_skcipher *tfm, struct nonce nonce, void *buf, size_t len) { @@ -95,8 +120,8 @@ static inline void do_encrypt(struct crypto_skcipher *tfm, int bch2_chacha_encrypt_key(struct bch_key *key, struct nonce nonce, void *buf, size_t len) { - struct crypto_skcipher *chacha20 = - crypto_alloc_skcipher("chacha20", 0, 0); + struct crypto_sync_skcipher *chacha20 = + crypto_alloc_sync_skcipher("chacha20", 0, 0); int ret; if (!chacha20) { @@ -104,7 +129,8 @@ int bch2_chacha_encrypt_key(struct bch_key *key, struct nonce nonce, return PTR_ERR(chacha20); } - ret = crypto_skcipher_setkey(chacha20, (void *) key, sizeof(*key)); + ret = crypto_skcipher_setkey(&chacha20->base, + 
(void *) key, sizeof(*key)); if (ret) { pr_err("crypto_skcipher_setkey() error: %i", ret); goto err; @@ -112,7 +138,7 @@ int bch2_chacha_encrypt_key(struct bch_key *key, struct nonce nonce, do_encrypt(chacha20, nonce, buf, len); err: - crypto_free_skcipher(chacha20); + crypto_free_sync_skcipher(chacha20); return ret; } @@ -135,21 +161,24 @@ struct bch_csum bch2_checksum(struct bch_fs *c, unsigned type, struct nonce nonce, const void *data, size_t len) { switch (type) { - case BCH_CSUM_NONE: - case BCH_CSUM_CRC32C_NONZERO: - case BCH_CSUM_CRC64_NONZERO: - case BCH_CSUM_CRC32C: - case BCH_CSUM_CRC64: { - u64 crc = bch2_checksum_init(type); + case BCH_CSUM_none: + case BCH_CSUM_crc32c_nonzero: + case BCH_CSUM_crc64_nonzero: + case BCH_CSUM_crc32c: + case BCH_CSUM_xxhash: + case BCH_CSUM_crc64: { + struct bch2_checksum_state state; + + state.type = type; - crc = bch2_checksum_update(type, crc, data, len); - crc = bch2_checksum_final(type, crc); + bch2_checksum_init(&state); + bch2_checksum_update(&state, data, len); - return (struct bch_csum) { .lo = cpu_to_le64(crc) }; + return (struct bch_csum) { .lo = cpu_to_le64(bch2_checksum_final(&state)) }; } - case BCH_CSUM_CHACHA20_POLY1305_80: - case BCH_CSUM_CHACHA20_POLY1305_128: { + case BCH_CSUM_chacha20_poly1305_80: + case BCH_CSUM_chacha20_poly1305_128: { SHASH_DESC_ON_STACK(desc, c->poly1305); u8 digest[POLY1305_DIGEST_SIZE]; struct bch_csum ret = { 0 }; @@ -183,33 +212,34 @@ static struct bch_csum __bch2_checksum_bio(struct bch_fs *c, unsigned type, struct bio_vec bv; switch (type) { - case BCH_CSUM_NONE: + case BCH_CSUM_none: return (struct bch_csum) { 0 }; - case BCH_CSUM_CRC32C_NONZERO: - case BCH_CSUM_CRC64_NONZERO: - case BCH_CSUM_CRC32C: - case BCH_CSUM_CRC64: { - u64 crc = bch2_checksum_init(type); + case BCH_CSUM_crc32c_nonzero: + case BCH_CSUM_crc64_nonzero: + case BCH_CSUM_crc32c: + case BCH_CSUM_xxhash: + case BCH_CSUM_crc64: { + struct bch2_checksum_state state; + + state.type = type; + bch2_checksum_init(&state); #ifdef CONFIG_HIGHMEM __bio_for_each_segment(bv, bio, *iter, *iter) { void *p = kmap_atomic(bv.bv_page) + bv.bv_offset; - crc = bch2_checksum_update(type, - crc, p, bv.bv_len); + bch2_checksum_update(&state, p, bv.bv_len); kunmap_atomic(p); } #else - __bio_for_each_contig_segment(bv, bio, *iter, *iter) - crc = bch2_checksum_update(type, crc, - page_address(bv.bv_page) + bv.bv_offset, + __bio_for_each_bvec(bv, bio, *iter, *iter) + bch2_checksum_update(&state, page_address(bv.bv_page) + bv.bv_offset, bv.bv_len); #endif - crc = bch2_checksum_final(type, crc); - return (struct bch_csum) { .lo = cpu_to_le64(crc) }; + return (struct bch_csum) { .lo = cpu_to_le64(bch2_checksum_final(&state)) }; } - case BCH_CSUM_CHACHA20_POLY1305_80: - case BCH_CSUM_CHACHA20_POLY1305_128: { + case BCH_CSUM_chacha20_poly1305_80: + case BCH_CSUM_chacha20_poly1305_128: { SHASH_DESC_ON_STACK(desc, c->poly1305); u8 digest[POLY1305_DIGEST_SIZE]; struct bch_csum ret = { 0 }; @@ -224,7 +254,7 @@ static struct bch_csum __bch2_checksum_bio(struct bch_fs *c, unsigned type, kunmap_atomic(p); } #else - __bio_for_each_contig_segment(bv, bio, *iter, *iter) + __bio_for_each_bvec(bv, bio, *iter, *iter) crypto_shash_update(desc, page_address(bv.bv_page) + bv.bv_offset, bv.bv_len); @@ -283,16 +313,22 @@ void bch2_encrypt_bio(struct bch_fs *c, unsigned type, struct bch_csum bch2_checksum_merge(unsigned type, struct bch_csum a, struct bch_csum b, size_t b_len) { + struct bch2_checksum_state state; + + state.type = type; + bch2_checksum_init(&state); + state.seed = 
a.lo; + BUG_ON(!bch2_checksum_mergeable(type)); while (b_len) { unsigned b = min_t(unsigned, b_len, PAGE_SIZE); - a.lo = bch2_checksum_update(type, a.lo, + bch2_checksum_update(&state, page_address(ZERO_PAGE(0)), b); b_len -= b; } - + a.lo = bch2_checksum_final(&state); a.lo ^= b.lo; a.hi ^= b.hi; return a; @@ -463,7 +499,7 @@ err: static int bch2_alloc_ciphers(struct bch_fs *c) { if (!c->chacha20) - c->chacha20 = crypto_alloc_skcipher("chacha20", 0, 0); + c->chacha20 = crypto_alloc_sync_skcipher("chacha20", 0, 0); if (IS_ERR(c->chacha20)) { bch_err(c, "error requesting chacha20 module: %li", PTR_ERR(c->chacha20)); @@ -546,7 +582,7 @@ int bch2_enable_encryption(struct bch_fs *c, bool keyed) goto err; } - ret = crypto_skcipher_setkey(c->chacha20, + ret = crypto_skcipher_setkey(&c->chacha20->base, (void *) &key.key, sizeof(key.key)); if (ret) goto err; @@ -574,7 +610,7 @@ void bch2_fs_encryption_exit(struct bch_fs *c) if (!IS_ERR_OR_NULL(c->poly1305)) crypto_free_shash(c->poly1305); if (!IS_ERR_OR_NULL(c->chacha20)) - crypto_free_skcipher(c->chacha20); + crypto_free_sync_skcipher(c->chacha20); if (!IS_ERR_OR_NULL(c->sha256)) crypto_free_shash(c->sha256); } @@ -606,7 +642,7 @@ int bch2_fs_encryption_init(struct bch_fs *c) if (ret) goto out; - ret = crypto_skcipher_setkey(c->chacha20, + ret = crypto_skcipher_setkey(&c->chacha20->base, (void *) &key.key, sizeof(key.key)); if (ret) goto out; diff --git a/fs/bcachefs/checksum.h b/fs/bcachefs/checksum.h index a8af0603c2c0..f5c1a609c5c4 100644 --- a/fs/bcachefs/checksum.h +++ b/fs/bcachefs/checksum.h @@ -7,15 +7,15 @@ #include "super-io.h" #include <linux/crc64.h> -#include <crypto/chacha20.h> +#include <crypto/chacha.h> static inline bool bch2_checksum_mergeable(unsigned type) { switch (type) { - case BCH_CSUM_NONE: - case BCH_CSUM_CRC32C: - case BCH_CSUM_CRC64: + case BCH_CSUM_none: + case BCH_CSUM_crc32c: + case BCH_CSUM_crc64: return true; default: return false; @@ -78,11 +78,13 @@ static inline enum bch_csum_type bch2_csum_opt_to_type(enum bch_csum_opts type, { switch (type) { case BCH_CSUM_OPT_none: - return BCH_CSUM_NONE; + return BCH_CSUM_none; case BCH_CSUM_OPT_crc32c: - return data ? BCH_CSUM_CRC32C : BCH_CSUM_CRC32C_NONZERO; + return data ? BCH_CSUM_crc32c : BCH_CSUM_crc32c_nonzero; case BCH_CSUM_OPT_crc64: - return data ? BCH_CSUM_CRC64 : BCH_CSUM_CRC64_NONZERO; + return data ? BCH_CSUM_crc64 : BCH_CSUM_crc64_nonzero; + case BCH_CSUM_OPT_xxhash: + return BCH_CSUM_xxhash; default: BUG(); } @@ -93,8 +95,8 @@ static inline enum bch_csum_type bch2_data_checksum_type(struct bch_fs *c, { if (c->sb.encryption_type) return c->opts.wide_macs - ? BCH_CSUM_CHACHA20_POLY1305_128 - : BCH_CSUM_CHACHA20_POLY1305_80; + ? 
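
The zero-page loop in bch2_checksum_merge() above works because these CRCs are linear over GF(2) when seeded with zero: crc(A ++ B) == crc(A ++ zeroes(len(B))) ^ crc(B). That identity does not hold for the *_nonzero variants or keyed hashes, which is why bch2_checksum_mergeable() admits only BCH_CSUM_none/crc32c/crc64. The same idea expressed with the kernel's crc32c() directly (illustrative sketch; `a`, `b`, and the all-zero buffer `zeroes` are hypothetical):

	u32 crc_a = crc32c(0, a, len_a);
	u32 crc_b = crc32c(0, b, len_b);
	u32 merged;

	/* extend crc_a across len_b zero bytes, then xor in crc_b: */
	merged = crc32c(crc_a, zeroes, len_b) ^ crc_b;

	/* merged == crc32c(0, concat(a, b), len_a + len_b) */
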
BCH_CSUM_chacha20_poly1305_128 + : BCH_CSUM_chacha20_poly1305_80; return bch2_csum_opt_to_type(opt, true); } @@ -102,7 +104,7 @@ static inline enum bch_csum_type bch2_data_checksum_type(struct bch_fs *c, static inline enum bch_csum_type bch2_meta_checksum_type(struct bch_fs *c) { if (c->sb.encryption_type) - return BCH_CSUM_CHACHA20_POLY1305_128; + return BCH_CSUM_chacha20_poly1305_128; return bch2_csum_opt_to_type(c->opts.metadata_checksum, false); } @@ -138,9 +140,9 @@ static inline bool bch2_crc_cmp(struct bch_csum l, struct bch_csum r) /* for skipping ahead and encrypting/decrypting at an offset: */ static inline struct nonce nonce_add(struct nonce nonce, unsigned offset) { - EBUG_ON(offset & (CHACHA20_BLOCK_SIZE - 1)); + EBUG_ON(offset & (CHACHA_BLOCK_SIZE - 1)); - le32_add_cpu(&nonce.d[0], offset / CHACHA20_BLOCK_SIZE); + le32_add_cpu(&nonce.d[0], offset / CHACHA_BLOCK_SIZE); return nonce; } diff --git a/fs/bcachefs/compress.c b/fs/bcachefs/compress.c index 773cf87812ad..f63651d291e5 100644 --- a/fs/bcachefs/compress.c +++ b/fs/bcachefs/compress.c @@ -45,7 +45,7 @@ static bool bio_phys_contig(struct bio *bio, struct bvec_iter start) struct bvec_iter iter; void *expected_start = NULL; - __bio_for_each_segment(bv, bio, iter, start) { + __bio_for_each_bvec(bv, bio, iter, start) { if (expected_start && expected_start != page_address(bv.bv_page) + bv.bv_offset) return false; diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c index 4215c119e0a2..294e4baf4deb 100644 --- a/fs/bcachefs/debug.c +++ b/fs/bcachefs/debug.c @@ -133,7 +133,7 @@ void __bch2_btree_verify(struct bch_fs *c, struct btree *b) if (c->opts.nochanges) return; - btree_node_io_lock(b); + bch2_btree_node_io_lock(b); mutex_lock(&c->verify_lock); if (!c->verify_ondisk) { @@ -176,7 +176,7 @@ void __bch2_btree_verify(struct bch_fs *c, struct btree *b) } out: mutex_unlock(&c->verify_lock); - btree_node_io_unlock(b); + bch2_btree_node_io_unlock(b); } #ifdef CONFIG_DEBUG_FS @@ -243,7 +243,7 @@ static ssize_t bch2_read_btree(struct file *file, char __user *buf, { struct dump_iter *i = file->private_data; struct btree_trans trans; - struct btree_iter *iter; + struct btree_iter iter; struct bkey_s_c k; int err; @@ -260,10 +260,10 @@ static ssize_t bch2_read_btree(struct file *file, char __user *buf, bch2_trans_init(&trans, i->c, 0, 0); - iter = bch2_trans_get_iter(&trans, i->id, i->from, - BTREE_ITER_PREFETCH| - BTREE_ITER_ALL_SNAPSHOTS); - k = bch2_btree_iter_peek(iter); + bch2_trans_iter_init(&trans, &iter, i->id, i->from, + BTREE_ITER_PREFETCH| + BTREE_ITER_ALL_SNAPSHOTS); + k = bch2_btree_iter_peek(&iter); while (k.k && !(err = bkey_err(k))) { bch2_bkey_val_to_text(&PBUF(i->buf), i->c, k); @@ -272,8 +272,8 @@ static ssize_t bch2_read_btree(struct file *file, char __user *buf, i->buf[i->bytes] = '\n'; i->bytes++; - k = bch2_btree_iter_next(iter); - i->from = iter->pos; + k = bch2_btree_iter_next(&iter); + i->from = iter.pos; err = flush_buf(i); if (err) @@ -282,7 +282,7 @@ static ssize_t bch2_read_btree(struct file *file, char __user *buf, if (!i->size) break; } - bch2_trans_iter_put(&trans, iter); + bch2_trans_iter_exit(&trans, &iter); bch2_trans_exit(&trans); @@ -301,7 +301,7 @@ static ssize_t bch2_read_btree_formats(struct file *file, char __user *buf, { struct dump_iter *i = file->private_data; struct btree_trans trans; - struct btree_iter *iter; + struct btree_iter iter; struct btree *b; int err; @@ -313,12 +313,12 @@ static ssize_t bch2_read_btree_formats(struct file *file, char __user *buf, if (err) return err; - if 
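
From here on, the series replaces heap-allocated iterators (bch2_trans_get_iter() returning a pointer, paired with bch2_trans_iter_put()) with caller-owned iterators initialized in place, so errors travel through bkey_err() and return codes instead of ERR_PTR iterators. The resulting lifecycle, as a minimal sketch with a placeholder btree ID and position:

	struct btree_trans trans;
	struct btree_iter iter;
	struct bkey_s_c k;
	int ret;

	bch2_trans_init(&trans, c, 0, 0);
	bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents, POS_MIN, 0);

	k = bch2_btree_iter_peek(&iter);
	ret = bkey_err(k);

	bch2_trans_iter_exit(&trans, &iter);	/* always pairs with _init, even on error */
	bch2_trans_exit(&trans);
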
(!i->size || !bpos_cmp(POS_MAX, i->from)) + if (!i->size || !bpos_cmp(SPOS_MAX, i->from)) return i->ret; bch2_trans_init(&trans, i->c, 0, 0); - for_each_btree_node(&trans, iter, i->id, i->from, 0, b) { + for_each_btree_node(&trans, iter, i->id, i->from, 0, b, err) { bch2_btree_node_to_text(&PBUF(i->buf), i->c, b); i->bytes = strlen(i->buf); err = flush_buf(i); @@ -329,14 +329,14 @@ static ssize_t bch2_read_btree_formats(struct file *file, char __user *buf, * can't easily correctly restart a btree node traversal across * all nodes, meh */ - i->from = bpos_cmp(POS_MAX, b->key.k.p) + i->from = bpos_cmp(SPOS_MAX, b->key.k.p) ? bpos_successor(b->key.k.p) : b->key.k.p; if (!i->size) break; } - bch2_trans_iter_put(&trans, iter); + bch2_trans_iter_exit(&trans, &iter); bch2_trans_exit(&trans); @@ -355,7 +355,7 @@ static ssize_t bch2_read_bfloat_failed(struct file *file, char __user *buf, { struct dump_iter *i = file->private_data; struct btree_trans trans; - struct btree_iter *iter; + struct btree_iter iter; struct bkey_s_c k; struct btree *prev_node = NULL; int err; @@ -373,11 +373,11 @@ static ssize_t bch2_read_bfloat_failed(struct file *file, char __user *buf, bch2_trans_init(&trans, i->c, 0, 0); - iter = bch2_trans_get_iter(&trans, i->id, i->from, BTREE_ITER_PREFETCH); + bch2_trans_iter_init(&trans, &iter, i->id, i->from, BTREE_ITER_PREFETCH); - while ((k = bch2_btree_iter_peek(iter)).k && + while ((k = bch2_btree_iter_peek(&iter)).k && !(err = bkey_err(k))) { - struct btree_iter_level *l = &iter->l[0]; + struct btree_path_level *l = &iter.path->l[0]; struct bkey_packed *_k = bch2_btree_node_iter_peek(&l->iter, l->b); @@ -396,8 +396,8 @@ static ssize_t bch2_read_bfloat_failed(struct file *file, char __user *buf, if (err) break; - bch2_btree_iter_advance(iter); - i->from = iter->pos; + bch2_btree_iter_advance(&iter); + i->from = iter.pos; err = flush_buf(i); if (err) diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c index 3bf6379cefe6..fe4a85a6a8cb 100644 --- a/fs/bcachefs/dirent.c +++ b/fs/bcachefs/dirent.c @@ -8,6 +8,7 @@ #include "fs.h" #include "keylist.h" #include "str_hash.h" +#include "subvolume.h" #include <linux/dcache.h> @@ -63,6 +64,15 @@ static bool dirent_cmp_bkey(struct bkey_s_c _l, struct bkey_s_c _r) return l_len - r_len ?: memcmp(l.v->d_name, r.v->d_name, l_len); } +static bool dirent_is_visible(subvol_inum inum, struct bkey_s_c k) +{ + struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k); + + if (d.v->d_type == DT_SUBVOL) + return le32_to_cpu(d.v->d_parent_subvol) == inum.subvol; + return true; +} + const struct bch_hash_desc bch2_dirent_hash_desc = { .btree_id = BTREE_ID_dirents, .key_type = KEY_TYPE_dirent, @@ -70,6 +80,7 @@ const struct bch_hash_desc bch2_dirent_hash_desc = { .hash_bkey = dirent_hash_bkey, .cmp_key = dirent_cmp_key, .cmp_bkey = dirent_cmp_bkey, + .is_visible = dirent_is_visible, }; const char *bch2_dirent_invalid(const struct bch_fs *c, struct bkey_s_c k) @@ -99,7 +110,8 @@ const char *bch2_dirent_invalid(const struct bch_fs *c, struct bkey_s_c k) if (memchr(d.v->d_name, '/', len)) return "invalid name"; - if (le64_to_cpu(d.v->d_inum) == d.k->p.inode) + if (d.v->d_type != DT_SUBVOL && + le64_to_cpu(d.v->d_inum) == d.k->p.inode) return "dirent points to own directory"; return NULL; @@ -112,11 +124,16 @@ void bch2_dirent_to_text(struct printbuf *out, struct bch_fs *c, bch_scnmemcpy(out, d.v->d_name, bch2_dirent_name_bytes(d)); - pr_buf(out, " -> %llu type %u", d.v->d_inum, d.v->d_type); + pr_buf(out, " -> %llu type %s", + d.v->d_type != DT_SUBVOL + ? 
le64_to_cpu(d.v->d_inum) + : le32_to_cpu(d.v->d_child_subvol), + bch2_d_type_str(d.v->d_type)); } static struct bkey_i_dirent *dirent_create_key(struct btree_trans *trans, - u8 type, const struct qstr *name, u64 dst) + subvol_inum dir, u8 type, + const struct qstr *name, u64 dst) { struct bkey_i_dirent *dirent; unsigned u64s = BKEY_U64s + dirent_val_u64s(name->len); @@ -132,7 +149,14 @@ static struct bkey_i_dirent *dirent_create_key(struct btree_trans *trans, bkey_dirent_init(&dirent->k_i); dirent->k.u64s = u64s; - dirent->v.d_inum = cpu_to_le64(dst); + + if (type != DT_SUBVOL) { + dirent->v.d_inum = cpu_to_le64(dst); + } else { + dirent->v.d_parent_subvol = cpu_to_le32(dir.subvol); + dirent->v.d_child_subvol = cpu_to_le32(dst); + } + dirent->v.d_type = type; memcpy(dirent->v.d_name, name->name, name->len); @@ -146,21 +170,21 @@ static struct bkey_i_dirent *dirent_create_key(struct btree_trans *trans, return dirent; } -int bch2_dirent_create(struct btree_trans *trans, - u64 dir_inum, const struct bch_hash_info *hash_info, +int bch2_dirent_create(struct btree_trans *trans, subvol_inum dir, + const struct bch_hash_info *hash_info, u8 type, const struct qstr *name, u64 dst_inum, u64 *dir_offset, int flags) { struct bkey_i_dirent *dirent; int ret; - dirent = dirent_create_key(trans, type, name, dst_inum); + dirent = dirent_create_key(trans, dir, type, name, dst_inum); ret = PTR_ERR_OR_ZERO(dirent); if (ret) return ret; ret = bch2_hash_set(trans, bch2_dirent_hash_desc, hash_info, - dir_inum, &dirent->k_i, flags); + dir, &dirent->k_i, flags); *dir_offset = dirent->k.p.offset; return ret; @@ -173,75 +197,130 @@ static void dirent_copy_target(struct bkey_i_dirent *dst, dst->v.d_type = src.v->d_type; } +int bch2_dirent_read_target(struct btree_trans *trans, subvol_inum dir, + struct bkey_s_c_dirent d, subvol_inum *target) +{ + struct bch_subvolume s; + int ret = 0; + + if (d.v->d_type == DT_SUBVOL && + d.v->d_parent_subvol != dir.subvol) + return 1; + + if (likely(d.v->d_type != DT_SUBVOL)) { + target->subvol = dir.subvol; + target->inum = le64_to_cpu(d.v->d_inum); + } else { + target->subvol = le32_to_cpu(d.v->d_child_subvol); + + ret = bch2_subvolume_get(trans, target->subvol, true, BTREE_ITER_CACHED, &s); + + target->inum = le64_to_cpu(s.inode); + } + + return ret; +} + int bch2_dirent_rename(struct btree_trans *trans, - u64 src_dir, struct bch_hash_info *src_hash, - u64 dst_dir, struct bch_hash_info *dst_hash, - const struct qstr *src_name, u64 *src_inum, u64 *src_offset, - const struct qstr *dst_name, u64 *dst_inum, u64 *dst_offset, - enum bch_rename_mode mode) + subvol_inum src_dir, struct bch_hash_info *src_hash, + subvol_inum dst_dir, struct bch_hash_info *dst_hash, + const struct qstr *src_name, subvol_inum *src_inum, u64 *src_offset, + const struct qstr *dst_name, subvol_inum *dst_inum, u64 *dst_offset, + enum bch_rename_mode mode) { - struct btree_iter *src_iter = NULL, *dst_iter = NULL; + struct btree_iter src_iter = { NULL }; + struct btree_iter dst_iter = { NULL }; struct bkey_s_c old_src, old_dst; struct bkey_i_dirent *new_src = NULL, *new_dst = NULL; struct bpos dst_pos = - POS(dst_dir, bch2_dirent_hash(dst_hash, dst_name)); + POS(dst_dir.inum, bch2_dirent_hash(dst_hash, dst_name)); + unsigned src_type = 0, dst_type = 0, src_update_flags = 0; int ret = 0; - *src_inum = *dst_inum = 0; + if (src_dir.subvol != dst_dir.subvol) + return -EXDEV; - /* - * Lookup dst: - * - * Note that in BCH_RENAME mode, we're _not_ checking if - * the target already exists - we're relying on the VFS - * 
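
bch2_dirent_read_target() above is the one place that turns an on-disk dirent into a (subvolume, inode) pair: ordinary entries inherit the parent directory's subvolume, while DT_SUBVOL entries name a child subvolume whose inode is fetched via bch2_subvolume_get(). Reduced to its two branches (excerpted sketch; `d` is a decoded dirent, `dir` the parent's subvol_inum):

	if (likely(d.v->d_type != DT_SUBVOL)) {
		target->subvol = dir.subvol;		/* same subvolume as the parent */
		target->inum   = le64_to_cpu(d.v->d_inum);
	} else {
		struct bch_subvolume s;

		target->subvol = le32_to_cpu(d.v->d_child_subvol);
		ret = bch2_subvolume_get(trans, target->subvol, true,
					 BTREE_ITER_CACHED, &s);
		target->inum   = le64_to_cpu(s.inode);
	}
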
to do that check for us for correctness: - */ - dst_iter = mode == BCH_RENAME - ? bch2_hash_hole(trans, bch2_dirent_hash_desc, - dst_hash, dst_dir, dst_name) - : bch2_hash_lookup(trans, bch2_dirent_hash_desc, - dst_hash, dst_dir, dst_name, - BTREE_ITER_INTENT); - ret = PTR_ERR_OR_ZERO(dst_iter); + memset(src_inum, 0, sizeof(*src_inum)); + memset(dst_inum, 0, sizeof(*dst_inum)); + + /* Lookup src: */ + ret = bch2_hash_lookup(trans, &src_iter, bch2_dirent_hash_desc, + src_hash, src_dir, src_name, + BTREE_ITER_INTENT); if (ret) goto out; - old_dst = bch2_btree_iter_peek_slot(dst_iter); - - if (mode != BCH_RENAME) - *dst_inum = le64_to_cpu(bkey_s_c_to_dirent(old_dst).v->d_inum); - if (mode != BCH_RENAME_EXCHANGE) - *src_offset = dst_iter->pos.offset; + old_src = bch2_btree_iter_peek_slot(&src_iter); + ret = bkey_err(old_src); + if (ret) + goto out; - /* Lookup src: */ - src_iter = bch2_hash_lookup(trans, bch2_dirent_hash_desc, - src_hash, src_dir, src_name, - BTREE_ITER_INTENT); - ret = PTR_ERR_OR_ZERO(src_iter); + ret = bch2_dirent_read_target(trans, src_dir, + bkey_s_c_to_dirent(old_src), src_inum); if (ret) goto out; - old_src = bch2_btree_iter_peek_slot(src_iter); - *src_inum = le64_to_cpu(bkey_s_c_to_dirent(old_src).v->d_inum); + src_type = bkey_s_c_to_dirent(old_src).v->d_type; + + if (src_type == DT_SUBVOL && mode == BCH_RENAME_EXCHANGE) + return -EOPNOTSUPP; + + + /* Lookup dst: */ + if (mode == BCH_RENAME) { + /* + * Note that we're _not_ checking if the target already exists - + * we're relying on the VFS to do that check for us for + * correctness: + */ + ret = bch2_hash_hole(trans, &dst_iter, bch2_dirent_hash_desc, + dst_hash, dst_dir, dst_name); + if (ret) + goto out; + } else { + ret = bch2_hash_lookup(trans, &dst_iter, bch2_dirent_hash_desc, + dst_hash, dst_dir, dst_name, + BTREE_ITER_INTENT); + if (ret) + goto out; + + old_dst = bch2_btree_iter_peek_slot(&dst_iter); + ret = bkey_err(old_dst); + if (ret) + goto out; + + ret = bch2_dirent_read_target(trans, dst_dir, + bkey_s_c_to_dirent(old_dst), dst_inum); + if (ret) + goto out; + + dst_type = bkey_s_c_to_dirent(old_dst).v->d_type; + + if (dst_type == DT_SUBVOL) + return -EOPNOTSUPP; + } + + if (mode != BCH_RENAME_EXCHANGE) + *src_offset = dst_iter.pos.offset; /* Create new dst key: */ - new_dst = dirent_create_key(trans, 0, dst_name, 0); + new_dst = dirent_create_key(trans, dst_dir, 0, dst_name, 0); ret = PTR_ERR_OR_ZERO(new_dst); if (ret) goto out; dirent_copy_target(new_dst, bkey_s_c_to_dirent(old_src)); - new_dst->k.p = dst_iter->pos; + new_dst->k.p = dst_iter.pos; /* Create new src key: */ if (mode == BCH_RENAME_EXCHANGE) { - new_src = dirent_create_key(trans, 0, src_name, 0); + new_src = dirent_create_key(trans, src_dir, 0, src_name, 0); ret = PTR_ERR_OR_ZERO(new_src); if (ret) goto out; dirent_copy_target(new_src, bkey_s_c_to_dirent(old_dst)); - new_src->k.p = src_iter->pos; + new_src->k.p = src_iter.pos; } else { new_src = bch2_trans_kmalloc(trans, sizeof(struct bkey_i)); ret = PTR_ERR_OR_ZERO(new_src); @@ -249,10 +328,10 @@ int bch2_dirent_rename(struct btree_trans *trans, goto out; bkey_init(&new_src->k); - new_src->k.p = src_iter->pos; + new_src->k.p = src_iter.pos; - if (bkey_cmp(dst_pos, src_iter->pos) <= 0 && - bkey_cmp(src_iter->pos, dst_iter->pos) < 0) { + if (bkey_cmp(dst_pos, src_iter.pos) <= 0 && + bkey_cmp(src_iter.pos, dst_iter.pos) < 0) { /* * We have a hash collision for the new dst key, * and new_src - the key we're deleting - is between @@ -265,10 +344,9 @@ int bch2_dirent_rename(struct btree_trans 
*trans, * If we're not overwriting, we can just insert * new_dst at the src position: */ - new_dst->k.p = src_iter->pos; - bch2_trans_update(trans, src_iter, - &new_dst->k_i, 0); - goto out_set_offset; + new_src = new_dst; + new_src->k.p = src_iter.pos; + goto out_set_src; } else { /* If we're overwriting, we can't insert new_dst * at a different slot because it has to @@ -280,7 +358,7 @@ int bch2_dirent_rename(struct btree_trans *trans, } else { /* Check if we need a whiteout to delete src: */ ret = bch2_hash_needs_whiteout(trans, bch2_dirent_hash_desc, - src_hash, src_iter); + src_hash, &src_iter); if (ret < 0) goto out; @@ -289,70 +367,108 @@ int bch2_dirent_rename(struct btree_trans *trans, } } - bch2_trans_update(trans, src_iter, &new_src->k_i, 0); - bch2_trans_update(trans, dst_iter, &new_dst->k_i, 0); -out_set_offset: + bch2_trans_update(trans, &dst_iter, &new_dst->k_i, 0); +out_set_src: + + /* + * If we're deleting a subvolume, we need to really delete the dirent, + * not just emit a whiteout in the current snapshot: + */ + if (src_type == DT_SUBVOL) { + bch2_btree_iter_set_snapshot(&src_iter, old_src.k->p.snapshot); + ret = bch2_btree_iter_traverse(&src_iter); + if (ret) + goto out; + + new_src->k.p = src_iter.pos; + src_update_flags |= BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE; + } + + bch2_trans_update(trans, &src_iter, &new_src->k_i, src_update_flags); + if (mode == BCH_RENAME_EXCHANGE) *src_offset = new_src->k.p.offset; *dst_offset = new_dst->k.p.offset; out: - bch2_trans_iter_put(trans, src_iter); - bch2_trans_iter_put(trans, dst_iter); + bch2_trans_iter_exit(trans, &src_iter); + bch2_trans_iter_exit(trans, &dst_iter); return ret; } -int bch2_dirent_delete_at(struct btree_trans *trans, - const struct bch_hash_info *hash_info, - struct btree_iter *iter) +int __bch2_dirent_lookup_trans(struct btree_trans *trans, + struct btree_iter *iter, + subvol_inum dir, + const struct bch_hash_info *hash_info, + const struct qstr *name, subvol_inum *inum, + unsigned flags) { - return bch2_hash_delete_at(trans, bch2_dirent_hash_desc, - hash_info, iter); -} + struct bkey_s_c k; + struct bkey_s_c_dirent d; + u32 snapshot; + int ret; -struct btree_iter * -__bch2_dirent_lookup_trans(struct btree_trans *trans, u64 dir_inum, - const struct bch_hash_info *hash_info, - const struct qstr *name, unsigned flags) -{ - return bch2_hash_lookup(trans, bch2_dirent_hash_desc, - hash_info, dir_inum, name, flags); + ret = bch2_subvolume_get_snapshot(trans, dir.subvol, &snapshot); + if (ret) + return ret; + + ret = bch2_hash_lookup(trans, iter, bch2_dirent_hash_desc, + hash_info, dir, name, flags); + if (ret) + return ret; + + k = bch2_btree_iter_peek_slot(iter); + ret = bkey_err(k); + if (ret) + goto err; + + d = bkey_s_c_to_dirent(k); + + ret = bch2_dirent_read_target(trans, dir, d, inum); + if (ret > 0) + ret = -ENOENT; +err: + if (ret) + bch2_trans_iter_exit(trans, iter); + + return ret; } -u64 bch2_dirent_lookup(struct bch_fs *c, u64 dir_inum, +u64 bch2_dirent_lookup(struct bch_fs *c, subvol_inum dir, const struct bch_hash_info *hash_info, - const struct qstr *name) + const struct qstr *name, subvol_inum *inum) { struct btree_trans trans; - struct btree_iter *iter; - struct bkey_s_c k; - u64 inum = 0; + struct btree_iter iter; + int ret; bch2_trans_init(&trans, c, 0, 0); - - iter = __bch2_dirent_lookup_trans(&trans, dir_inum, - hash_info, name, 0); - if (IS_ERR(iter)) { - BUG_ON(PTR_ERR(iter) == -EINTR); - goto out; - } - - k = bch2_btree_iter_peek_slot(iter); - inum = 
le64_to_cpu(bkey_s_c_to_dirent(k).v->d_inum); - bch2_trans_iter_put(&trans, iter); -out: +retry: + bch2_trans_begin(&trans); + + ret = __bch2_dirent_lookup_trans(&trans, &iter, dir, hash_info, + name, inum, 0); + if (ret == -EINTR) + goto retry; + if (!ret) + bch2_trans_iter_exit(&trans, &iter); bch2_trans_exit(&trans); - return inum; + return ret; } -int bch2_empty_dir_trans(struct btree_trans *trans, u64 dir_inum) +int bch2_empty_dir_trans(struct btree_trans *trans, subvol_inum dir) { - struct btree_iter *iter; + struct btree_iter iter; struct bkey_s_c k; + u32 snapshot; int ret; - for_each_btree_key(trans, iter, BTREE_ID_dirents, - POS(dir_inum, 0), 0, k, ret) { - if (k.k->p.inode > dir_inum) + ret = bch2_subvolume_get_snapshot(trans, dir.subvol, &snapshot); + if (ret) + return ret; + + for_each_btree_key_norestart(trans, iter, BTREE_ID_dirents, + SPOS(dir.inum, 0, snapshot), 0, k, ret) { + if (k.k->p.inode > dir.inum) break; if (k.k->type == KEY_TYPE_dirent) { @@ -360,24 +476,32 @@ int bch2_empty_dir_trans(struct btree_trans *trans, u64 dir_inum) break; } } - bch2_trans_iter_put(trans, iter); + bch2_trans_iter_exit(trans, &iter); return ret; } -int bch2_readdir(struct bch_fs *c, u64 inum, struct dir_context *ctx) +int bch2_readdir(struct bch_fs *c, subvol_inum inum, struct dir_context *ctx) { struct btree_trans trans; - struct btree_iter *iter; + struct btree_iter iter; struct bkey_s_c k; struct bkey_s_c_dirent dirent; + subvol_inum target; + u32 snapshot; int ret; bch2_trans_init(&trans, c, 0, 0); +retry: + bch2_trans_begin(&trans); - for_each_btree_key(&trans, iter, BTREE_ID_dirents, - POS(inum, ctx->pos), 0, k, ret) { - if (k.k->p.inode > inum) + ret = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot); + if (ret) + goto err; + + for_each_btree_key_norestart(&trans, iter, BTREE_ID_dirents, + SPOS(inum.inum, ctx->pos, snapshot), 0, k, ret) { + if (k.k->p.inode > inum.inum) break; if (k.k->type != KEY_TYPE_dirent) @@ -385,6 +509,12 @@ int bch2_readdir(struct bch_fs *c, u64 inum, struct dir_context *ctx) dirent = bkey_s_c_to_dirent(k); + ret = bch2_dirent_read_target(&trans, inum, dirent, &target); + if (ret < 0) + break; + if (ret) + continue; + /* * XXX: dir_emit() can fault and block, while we're holding * locks @@ -392,14 +522,25 @@ int bch2_readdir(struct bch_fs *c, u64 inum, struct dir_context *ctx) ctx->pos = dirent.k->p.offset; if (!dir_emit(ctx, dirent.v->d_name, bch2_dirent_name_bytes(dirent), - le64_to_cpu(dirent.v->d_inum), - dirent.v->d_type)) + target.inum, + vfs_d_type(dirent.v->d_type))) break; ctx->pos = dirent.k->p.offset + 1; + + /* + * read_target looks up subvolumes, we can overflow paths if the + * directory has many subvolumes in it + */ + ret = btree_trans_too_many_iters(&trans); + if (ret) + break; } - bch2_trans_iter_put(&trans, iter); + bch2_trans_iter_exit(&trans, &iter); +err: + if (ret == -EINTR) + goto retry; - ret = bch2_trans_exit(&trans) ?: ret; + bch2_trans_exit(&trans); return ret; } diff --git a/fs/bcachefs/dirent.h b/fs/bcachefs/dirent.h index e1d8ce377d43..1bb4d802bc1d 100644 --- a/fs/bcachefs/dirent.h +++ b/fs/bcachefs/dirent.h @@ -29,13 +29,17 @@ static inline unsigned dirent_val_u64s(unsigned len) sizeof(u64)); } -int bch2_dirent_create(struct btree_trans *, u64, +int bch2_dirent_read_target(struct btree_trans *, subvol_inum, + struct bkey_s_c_dirent, subvol_inum *); + +int bch2_dirent_create(struct btree_trans *, subvol_inum, const struct bch_hash_info *, u8, const struct qstr *, u64, u64 *, int); -int bch2_dirent_delete_at(struct 
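
The retry loop in bch2_dirent_lookup() above is the standard shape for outer transactions in this series: -EINTR means the whole transaction was restarted for locking reasons, and the only correct response is to loop back to bch2_trans_begin() and redo the work from the top. Generic skeleton (the operation name is a hypothetical placeholder):

	bch2_trans_init(&trans, c, 0, 0);
retry:
	bch2_trans_begin(&trans);

	ret = do_something_transactional(&trans);	/* hypothetical helper */
	if (ret == -EINTR)
		goto retry;

	bch2_trans_exit(&trans);
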
btree_trans *, - const struct bch_hash_info *, - struct btree_iter *); +static inline unsigned vfs_d_type(unsigned type) +{ + return type == DT_SUBVOL ? DT_DIR : type; +} enum bch_rename_mode { BCH_RENAME, @@ -44,20 +48,20 @@ enum bch_rename_mode { }; int bch2_dirent_rename(struct btree_trans *, - u64, struct bch_hash_info *, - u64, struct bch_hash_info *, - const struct qstr *, u64 *, u64 *, - const struct qstr *, u64 *, u64 *, + subvol_inum, struct bch_hash_info *, + subvol_inum, struct bch_hash_info *, + const struct qstr *, subvol_inum *, u64 *, + const struct qstr *, subvol_inum *, u64 *, enum bch_rename_mode); -struct btree_iter * -__bch2_dirent_lookup_trans(struct btree_trans *, u64, - const struct bch_hash_info *, - const struct qstr *, unsigned); -u64 bch2_dirent_lookup(struct bch_fs *, u64, const struct bch_hash_info *, - const struct qstr *); +int __bch2_dirent_lookup_trans(struct btree_trans *, struct btree_iter *, + subvol_inum, const struct bch_hash_info *, + const struct qstr *, subvol_inum *, unsigned); +u64 bch2_dirent_lookup(struct bch_fs *, subvol_inum, + const struct bch_hash_info *, + const struct qstr *, subvol_inum *); -int bch2_empty_dir_trans(struct btree_trans *, u64); -int bch2_readdir(struct bch_fs *, u64, struct dir_context *); +int bch2_empty_dir_trans(struct btree_trans *, subvol_inum); +int bch2_readdir(struct bch_fs *, subvol_inum, struct dir_context *); #endif /* _BCACHEFS_DIRENT_H */ diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index db6e4f6cac37..bca1b8a7b673 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -392,7 +392,7 @@ static void ec_block_io(struct bch_fs *c, struct ec_stripe_buf *buf, this_cpu_add(ca->io_done->sectors[rw][data_type], buf->size); while (offset < bytes) { - unsigned nr_iovecs = min_t(size_t, BIO_MAX_PAGES, + unsigned nr_iovecs = min_t(size_t, BIO_MAX_VECS, DIV_ROUND_UP(bytes, PAGE_SIZE)); unsigned b = min_t(size_t, bytes - offset, nr_iovecs << PAGE_SHIFT); @@ -429,13 +429,14 @@ static void ec_block_io(struct bch_fs *c, struct ec_stripe_buf *buf, static int get_stripe_key(struct bch_fs *c, u64 idx, struct ec_stripe_buf *stripe) { struct btree_trans trans; - struct btree_iter *iter; + struct btree_iter iter; struct bkey_s_c k; int ret; bch2_trans_init(&trans, c, 0, 0); - iter = bch2_trans_get_iter(&trans, BTREE_ID_stripes, POS(0, idx), BTREE_ITER_SLOTS); - k = bch2_btree_iter_peek_slot(iter); + bch2_trans_iter_init(&trans, &iter, BTREE_ID_stripes, + POS(0, idx), BTREE_ITER_SLOTS); + k = bch2_btree_iter_peek_slot(&iter); ret = bkey_err(k); if (ret) goto err; @@ -445,6 +446,7 @@ static int get_stripe_key(struct bch_fs *c, u64 idx, struct ec_stripe_buf *strip } bkey_reassemble(&stripe->key.k_i, k); err: + bch2_trans_iter_exit(&trans, &iter); bch2_trans_exit(&trans); return ret; } @@ -552,19 +554,19 @@ static int __ec_stripe_mem_alloc(struct bch_fs *c, size_t idx, gfp_t gfp) return 0; } -static int ec_stripe_mem_alloc(struct bch_fs *c, +static int ec_stripe_mem_alloc(struct btree_trans *trans, struct btree_iter *iter) { size_t idx = iter->pos.offset; int ret = 0; - if (!__ec_stripe_mem_alloc(c, idx, GFP_NOWAIT|__GFP_NOWARN)) + if (!__ec_stripe_mem_alloc(trans->c, idx, GFP_NOWAIT|__GFP_NOWARN)) return ret; - bch2_trans_unlock(iter->trans); + bch2_trans_unlock(trans); ret = -EINTR; - if (!__ec_stripe_mem_alloc(c, idx, GFP_KERNEL)) + if (!__ec_stripe_mem_alloc(trans->c, idx, GFP_KERNEL)) return ret; return -ENOMEM; @@ -704,7 +706,7 @@ static int ec_stripe_bkey_insert(struct bch_fs *c, struct disk_reservation *res) { struct 
btree_trans trans; - struct btree_iter *iter; + struct btree_iter iter; struct bkey_s_c k; struct bpos min_pos = POS(0, 1); struct bpos start_pos = bpos_max(min_pos, POS(0, c->ec_stripe_hint)); @@ -719,7 +721,7 @@ retry: if (bkey_cmp(k.k->p, POS(0, U32_MAX)) > 0) { if (start_pos.offset) { start_pos = min_pos; - bch2_btree_iter_set_pos(iter, start_pos); + bch2_btree_iter_set_pos(&iter, start_pos); continue; } @@ -733,19 +735,19 @@ retry: goto err; found_slot: - start_pos = iter->pos; + start_pos = iter.pos; - ret = ec_stripe_mem_alloc(c, iter); + ret = ec_stripe_mem_alloc(&trans, &iter); if (ret) goto err; - stripe->k.p = iter->pos; + stripe->k.p = iter.pos; - ret = bch2_trans_update(&trans, iter, &stripe->k_i, 0) ?: + ret = bch2_trans_update(&trans, &iter, &stripe->k_i, 0) ?: bch2_trans_commit(&trans, res, NULL, BTREE_INSERT_NOFAIL); err: - bch2_trans_iter_put(&trans, iter); + bch2_trans_iter_exit(&trans, &iter); if (ret == -EINTR) goto retry; @@ -759,15 +761,15 @@ err: static int ec_stripe_bkey_update(struct btree_trans *trans, struct bkey_i_stripe *new) { - struct btree_iter *iter; + struct btree_iter iter; struct bkey_s_c k; const struct bch_stripe *existing; unsigned i; int ret; - iter = bch2_trans_get_iter(trans, BTREE_ID_stripes, - new->k.p, BTREE_ITER_INTENT); - k = bch2_btree_iter_peek_slot(iter); + bch2_trans_iter_init(trans, &iter, BTREE_ID_stripes, + new->k.p, BTREE_ITER_INTENT); + k = bch2_btree_iter_peek_slot(&iter); ret = bkey_err(k); if (ret) goto err; @@ -790,9 +792,9 @@ static int ec_stripe_bkey_update(struct btree_trans *trans, stripe_blockcount_set(&new->v, i, stripe_blockcount_get(existing, i)); - ret = bch2_trans_update(trans, iter, &new->k_i, 0); + ret = bch2_trans_update(trans, &iter, &new->k_i, 0); err: - bch2_trans_iter_put(trans, iter); + bch2_trans_iter_exit(trans, &iter); return ret; } @@ -820,10 +822,11 @@ static int ec_stripe_update_ptrs(struct bch_fs *c, struct bkey *pos) { struct btree_trans trans; - struct btree_iter *iter; + struct btree_iter iter; struct bkey_s_c k; struct bkey_s_extent e; struct bkey_buf sk; + struct bpos next_pos; int ret = 0, dev, block; bch2_bkey_buf_init(&sk); @@ -831,23 +834,24 @@ static int ec_stripe_update_ptrs(struct bch_fs *c, /* XXX this doesn't support the reflink btree */ - iter = bch2_trans_get_iter(&trans, BTREE_ID_extents, - bkey_start_pos(pos), - BTREE_ITER_INTENT); - - while ((k = bch2_btree_iter_peek(iter)).k && + bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents, + bkey_start_pos(pos), + BTREE_ITER_INTENT); +retry: + while (bch2_trans_begin(&trans), + (k = bch2_btree_iter_peek(&iter)).k && !(ret = bkey_err(k)) && bkey_cmp(bkey_start_pos(k.k), pos->p) < 0) { struct bch_extent_ptr *ptr, *ec_ptr = NULL; if (extent_has_stripe_ptr(k, s->key.k.p.offset)) { - bch2_btree_iter_advance(iter); + bch2_btree_iter_advance(&iter); continue; } block = bkey_matches_stripe(&s->key.v, k); if (block < 0) { - bch2_btree_iter_advance(iter); + bch2_btree_iter_advance(&iter); continue; } @@ -862,16 +866,21 @@ static int ec_stripe_update_ptrs(struct bch_fs *c, extent_stripe_ptr_add(e, s, ec_ptr, block); - bch2_btree_iter_set_pos(iter, bkey_start_pos(&sk.k->k)); - ret = bch2_trans_update(&trans, iter, sk.k, 0) ?: + bch2_btree_iter_set_pos(&iter, bkey_start_pos(&sk.k->k)); + next_pos = sk.k->k.p; + + ret = bch2_btree_iter_traverse(&iter) ?: + bch2_trans_update(&trans, &iter, sk.k, 0) ?: bch2_trans_commit(&trans, NULL, NULL, BTREE_INSERT_NOFAIL); - if (ret == -EINTR) - ret = 0; + if (!ret) + bch2_btree_iter_set_pos(&iter, next_pos); if (ret) 
break; } - bch2_trans_iter_put(&trans, iter); + if (ret == -EINTR) + goto retry; + bch2_trans_iter_exit(&trans, &iter); bch2_trans_exit(&trans); bch2_bkey_buf_exit(&sk, c); @@ -1061,16 +1070,14 @@ void *bch2_writepoint_ec_buf(struct bch_fs *c, struct write_point *wp) return ob->ec->new_stripe.data[ob->ec_idx] + (offset << 9); } -void bch2_ec_add_backpointer(struct bch_fs *c, struct write_point *wp, - struct bpos pos, unsigned sectors) +void bch2_ob_add_backpointer(struct bch_fs *c, struct open_bucket *ob, + struct bkey *k) { - struct open_bucket *ob = ec_open_bucket(c, &wp->ptrs); - struct ec_stripe_new *ec; + struct ec_stripe_new *ec = ob->ec; - if (!ob) + if (!ec) return; - ec = ob->ec; mutex_lock(&ec->lock); if (bch2_keylist_realloc(&ec->keys, ec->inline_keys, @@ -1080,8 +1087,8 @@ void bch2_ec_add_backpointer(struct bch_fs *c, struct write_point *wp, } bkey_init(&ec->keys.top->k); - ec->keys.top->k.p = pos; - bch2_key_resize(&ec->keys.top->k, sectors); + ec->keys.top->k.p = k->p; + ec->keys.top->k.size = k->size; bch2_keylist_push(&ec->keys); mutex_unlock(&ec->lock); @@ -1147,7 +1154,7 @@ static void ec_stripe_key_init(struct bch_fs *c, s->v.nr_blocks = nr_data + nr_parity; s->v.nr_redundant = nr_parity; s->v.csum_granularity_bits = ilog2(c->sb.encoded_extent_max); - s->v.csum_type = BCH_CSUM_CRC32C; + s->v.csum_type = BCH_CSUM_crc32c; s->v.pad = 0; while ((u64s = stripe_val_u64s(&s->v)) > BKEY_VAL_U64s_MAX) { @@ -1592,7 +1599,7 @@ write: int bch2_stripes_write(struct bch_fs *c, unsigned flags) { struct btree_trans trans; - struct btree_iter *iter; + struct btree_iter iter; struct genradix_iter giter; struct bkey_i_stripe *new_key; struct stripe *m; @@ -1603,8 +1610,8 @@ int bch2_stripes_write(struct bch_fs *c, unsigned flags) bch2_trans_init(&trans, c, 0, 0); - iter = bch2_trans_get_iter(&trans, BTREE_ID_stripes, POS_MIN, - BTREE_ITER_SLOTS|BTREE_ITER_INTENT); + bch2_trans_iter_init(&trans, &iter, BTREE_ID_stripes, POS_MIN, + BTREE_ITER_SLOTS|BTREE_ITER_INTENT); genradix_for_each(&c->stripes[0], giter, m) { if (!m->alive) @@ -1612,13 +1619,13 @@ int bch2_stripes_write(struct bch_fs *c, unsigned flags) ret = __bch2_trans_do(&trans, NULL, NULL, BTREE_INSERT_NOFAIL|flags, - __bch2_stripe_write_key(&trans, iter, m, + __bch2_stripe_write_key(&trans, &iter, m, giter.pos, new_key)); if (ret) break; } - bch2_trans_iter_put(&trans, iter); + bch2_trans_iter_exit(&trans, &iter); bch2_trans_exit(&trans); @@ -1627,13 +1634,14 @@ int bch2_stripes_write(struct bch_fs *c, unsigned flags) return ret; } -static int bch2_stripes_read_fn(struct bch_fs *c, struct bkey_s_c k) +static int bch2_stripes_read_fn(struct btree_trans *trans, struct bkey_s_c k) { + struct bch_fs *c = trans->c; int ret = 0; if (k.k->type == KEY_TYPE_stripe) ret = __ec_stripe_mem_alloc(c, k.k->p.offset, GFP_KERNEL) ?: - bch2_mark_key(c, k, 0, 0, NULL, 0, + bch2_mark_key(trans, k, BTREE_TRIGGER_NOATOMIC); return ret; @@ -1641,8 +1649,13 @@ static int bch2_stripes_read_fn(struct bch_fs *c, struct bkey_s_c k) int bch2_stripes_read(struct bch_fs *c) { - int ret = bch2_btree_and_journal_walk(c, BTREE_ID_stripes, - bch2_stripes_read_fn); + struct btree_trans trans; + int ret; + + bch2_trans_init(&trans, c, 0, 0); + ret = bch2_btree_and_journal_walk(&trans, BTREE_ID_stripes, + bch2_stripes_read_fn); + bch2_trans_exit(&trans); if (ret) bch_err(c, "error reading stripes: %i", ret); @@ -1652,20 +1665,21 @@ int bch2_stripes_read(struct bch_fs *c) int bch2_ec_mem_alloc(struct bch_fs *c, bool gc) { struct btree_trans trans; - struct btree_iter 
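
ec_stripe_update_ptrs() above also adopts the commit-then-advance idiom: capture the extent's end position before committing, advance the iterator only once the commit succeeded, and let an -EINTR restart re-examine the same extent. Simplified loop skeleton (the real loop trims and rewrites the key first; `new_key` stands in for that):

retry:
	while (bch2_trans_begin(&trans),
	       (k = bch2_btree_iter_peek(&iter)).k &&
	       !(ret = bkey_err(k))) {
		next_pos = k.k->p;

		ret = bch2_btree_iter_traverse(&iter) ?:
		      bch2_trans_update(&trans, &iter, new_key, 0) ?:
		      bch2_trans_commit(&trans, NULL, NULL, BTREE_INSERT_NOFAIL);
		if (!ret)
			bch2_btree_iter_set_pos(&iter, next_pos);
		if (ret)
			break;
	}
	if (ret == -EINTR)
		goto retry;
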
*iter; + struct btree_iter iter; struct bkey_s_c k; size_t i, idx = 0; int ret = 0; bch2_trans_init(&trans, c, 0, 0); - iter = bch2_trans_get_iter(&trans, BTREE_ID_stripes, POS(0, U64_MAX), 0); + bch2_trans_iter_init(&trans, &iter, BTREE_ID_stripes, POS(0, U64_MAX), 0); - k = bch2_btree_iter_prev(iter); - if (!IS_ERR_OR_NULL(k.k)) + k = bch2_btree_iter_prev(&iter); + ret = bkey_err(k); + if (!ret && k.k) idx = k.k->p.offset + 1; - bch2_trans_iter_put(&trans, iter); - ret = bch2_trans_exit(&trans); + bch2_trans_iter_exit(&trans, &iter); + bch2_trans_exit(&trans); if (ret) return ret; diff --git a/fs/bcachefs/ec.h b/fs/bcachefs/ec.h index e79626b59509..eb16e140e2c8 100644 --- a/fs/bcachefs/ec.h +++ b/fs/bcachefs/ec.h @@ -193,8 +193,8 @@ struct ec_stripe_head { int bch2_ec_read_extent(struct bch_fs *, struct bch_read_bio *); void *bch2_writepoint_ec_buf(struct bch_fs *, struct write_point *); -void bch2_ec_add_backpointer(struct bch_fs *, struct write_point *, - struct bpos, unsigned); +void bch2_ob_add_backpointer(struct bch_fs *, struct open_bucket *, + struct bkey *); void bch2_ec_bucket_written(struct bch_fs *, struct open_bucket *); void bch2_ec_bucket_cancel(struct bch_fs *, struct open_bucket *); diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c index 90c3b986c264..2cea694575e9 100644 --- a/fs/bcachefs/error.c +++ b/fs/bcachefs/error.c @@ -111,6 +111,7 @@ found: list_move(&s->list, &c->fsck_errors); s->nr++; if (c->opts.ratelimit_errors && + !(flags & FSCK_NO_RATELIMIT) && s->nr >= FSCK_ERR_RATELIMIT_NR) { if (s->nr == FSCK_ERR_RATELIMIT_NR) suppressing = true; diff --git a/fs/bcachefs/error.h b/fs/bcachefs/error.h index d8cd19b3f63c..986938298adc 100644 --- a/fs/bcachefs/error.h +++ b/fs/bcachefs/error.h @@ -104,6 +104,7 @@ struct fsck_err_state { #define FSCK_CAN_FIX (1 << 0) #define FSCK_CAN_IGNORE (1 << 1) #define FSCK_NEED_FSCK (1 << 2) +#define FSCK_NO_RATELIMIT (1 << 3) __printf(3, 4) __cold enum fsck_err_ret bch2_fsck_err(struct bch_fs *, diff --git a/fs/bcachefs/extent_update.c b/fs/bcachefs/extent_update.c index bb4b2b4352e0..58b2c96f450c 100644 --- a/fs/bcachefs/extent_update.c +++ b/fs/bcachefs/extent_update.c @@ -58,10 +58,10 @@ static int count_iters_for_insert(struct btree_trans *trans, u64 idx = le64_to_cpu(p.v->idx); unsigned sectors = bpos_min(*end, p.k->p).offset - bkey_start_offset(p.k); - struct btree_iter *iter; + struct btree_iter iter; struct bkey_s_c r_k; - for_each_btree_key(trans, iter, + for_each_btree_key_norestart(trans, iter, BTREE_ID_reflink, POS(0, idx + offset), BTREE_ITER_SLOTS, r_k, ret2) { if (bkey_cmp(bkey_start_pos(r_k.k), @@ -83,8 +83,8 @@ static int count_iters_for_insert(struct btree_trans *trans, break; } } + bch2_trans_iter_exit(trans, &iter); - bch2_trans_iter_put(trans, iter); break; } } @@ -94,16 +94,20 @@ static int count_iters_for_insert(struct btree_trans *trans, #define EXTENT_ITERS_MAX (BTREE_ITER_MAX / 3) -int bch2_extent_atomic_end(struct btree_iter *iter, +int bch2_extent_atomic_end(struct btree_trans *trans, + struct btree_iter *iter, struct bkey_i *insert, struct bpos *end) { - struct btree_trans *trans = iter->trans; - struct btree_iter *copy; + struct btree_iter copy; struct bkey_s_c k; unsigned nr_iters = 0; int ret; + ret = bch2_btree_iter_traverse(iter); + if (ret) + return ret; + *end = insert->k.p; /* extent_update_to_keys(): */ @@ -114,9 +118,9 @@ int bch2_extent_atomic_end(struct btree_iter *iter, if (ret < 0) return ret; - copy = bch2_trans_copy_iter(trans, iter); + bch2_trans_copy_iter(©, iter); - 
for_each_btree_key_continue(copy, 0, k, ret) { + for_each_btree_key_continue_norestart(copy, 0, k, ret) { unsigned offset = 0; if (bkey_cmp(bkey_start_pos(k.k), *end) >= 0) @@ -145,66 +149,21 @@ int bch2_extent_atomic_end(struct btree_iter *iter, break; } - bch2_trans_iter_put(trans, copy); + bch2_trans_iter_exit(trans, ©); return ret < 0 ? ret : 0; } -int bch2_extent_trim_atomic(struct bkey_i *k, struct btree_iter *iter) +int bch2_extent_trim_atomic(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_i *k) { struct bpos end; int ret; - ret = bch2_extent_atomic_end(iter, k, &end); + ret = bch2_extent_atomic_end(trans, iter, k, &end); if (ret) return ret; bch2_cut_back(end, k); return 0; } - -int bch2_extent_is_atomic(struct bkey_i *k, struct btree_iter *iter) -{ - struct bpos end; - int ret; - - ret = bch2_extent_atomic_end(iter, k, &end); - if (ret) - return ret; - - return !bkey_cmp(end, k->k.p); -} - -enum btree_insert_ret -bch2_extent_can_insert(struct btree_trans *trans, - struct btree_iter *iter, - struct bkey_i *insert) -{ - struct bkey_s_c k; - int ret, sectors; - - k = bch2_btree_iter_peek_slot(iter); - ret = bkey_err(k); - if (ret) - return ret; - - /* Check if we're splitting a compressed extent: */ - - if (bkey_cmp(bkey_start_pos(&insert->k), bkey_start_pos(k.k)) > 0 && - bkey_cmp(insert->k.p, k.k->p) < 0 && - (sectors = bch2_bkey_sectors_compressed(k))) { - int flags = trans->flags & BTREE_INSERT_NOFAIL - ? BCH_DISK_RESERVATION_NOFAIL : 0; - - switch (bch2_disk_reservation_add(trans->c, trans->disk_res, - sectors, flags)) { - case 0: - break; - case -ENOSPC: - return BTREE_INSERT_ENOSPC; - default: - BUG(); - } - } - - return BTREE_INSERT_OK; -} diff --git a/fs/bcachefs/extent_update.h b/fs/bcachefs/extent_update.h index 38dc084627d2..6f5cf449361a 100644 --- a/fs/bcachefs/extent_update.h +++ b/fs/bcachefs/extent_update.h @@ -4,13 +4,9 @@ #include "bcachefs.h" -int bch2_extent_atomic_end(struct btree_iter *, struct bkey_i *, - struct bpos *); -int bch2_extent_trim_atomic(struct bkey_i *, struct btree_iter *); -int bch2_extent_is_atomic(struct bkey_i *, struct btree_iter *); - -enum btree_insert_ret -bch2_extent_can_insert(struct btree_trans *, struct btree_iter *, - struct bkey_i *); +int bch2_extent_atomic_end(struct btree_trans *, struct btree_iter *, + struct bkey_i *, struct bpos *); +int bch2_extent_trim_atomic(struct btree_trans *, struct btree_iter *, + struct bkey_i *); #endif /* _BCACHEFS_EXTENT_UPDATE_H */ diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index b07d39555eb6..89b5be907eea 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -192,9 +192,10 @@ void bch2_btree_ptr_v2_to_text(struct printbuf *out, struct bch_fs *c, { struct bkey_s_c_btree_ptr_v2 bp = bkey_s_c_to_btree_ptr_v2(k); - pr_buf(out, "seq %llx written %u min_key ", + pr_buf(out, "seq %llx written %u min_key %s", le64_to_cpu(bp.v->seq), - le16_to_cpu(bp.v->sectors_written)); + le16_to_cpu(bp.v->sectors_written), + BTREE_PTR_RANGE_UPDATED(bp.v) ? 
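
bch2_extent_trim_atomic() is now the single entry point for clamping an extent insert to what one transaction can commit atomically; the is_atomic/can_insert variants above are removed. Its effect, as a caller-side sketch where `insert` is a hypothetical extent key about to be updated:

	struct bpos end;

	ret = bch2_extent_atomic_end(trans, iter, insert, &end);
	if (ret)
		return ret;

	/* trim the insert to the furthest position one commit can handle: */
	bch2_cut_back(end, insert);	/* this pair is what bch2_extent_trim_atomic() does */
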
"R " : ""); bch2_bpos_to_text(out, bp.v->min_key); pr_buf(out, " "); @@ -230,112 +231,134 @@ void bch2_extent_to_text(struct printbuf *out, struct bch_fs *c, bch2_bkey_ptrs_to_text(out, c, k); } -enum merge_result bch2_extent_merge(struct bch_fs *c, - struct bkey_s _l, struct bkey_s _r) +bool bch2_extent_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r) { - struct bkey_s_extent l = bkey_s_to_extent(_l); - struct bkey_s_extent r = bkey_s_to_extent(_r); - union bch_extent_entry *en_l = l.v->start; - union bch_extent_entry *en_r = r.v->start; - struct bch_extent_crc_unpacked crc_l, crc_r; - - if (bkey_val_u64s(l.k) != bkey_val_u64s(r.k)) - return BCH_MERGE_NOMERGE; - - crc_l = bch2_extent_crc_unpack(l.k, NULL); - - extent_for_each_entry(l, en_l) { - en_r = vstruct_idx(r.v, (u64 *) en_l - l.v->_data); + struct bkey_ptrs l_ptrs = bch2_bkey_ptrs(l); + struct bkey_ptrs_c r_ptrs = bch2_bkey_ptrs_c(r); + union bch_extent_entry *en_l; + const union bch_extent_entry *en_r; + struct extent_ptr_decoded lp, rp; + bool use_right_ptr; + struct bch_dev *ca; + en_l = l_ptrs.start; + en_r = r_ptrs.start; + while (en_l < l_ptrs.end && en_r < r_ptrs.end) { if (extent_entry_type(en_l) != extent_entry_type(en_r)) - return BCH_MERGE_NOMERGE; - - switch (extent_entry_type(en_l)) { - case BCH_EXTENT_ENTRY_ptr: { - const struct bch_extent_ptr *lp = &en_l->ptr; - const struct bch_extent_ptr *rp = &en_r->ptr; - struct bch_dev *ca; - - if (lp->offset + crc_l.compressed_size != rp->offset || - lp->dev != rp->dev || - lp->gen != rp->gen) - return BCH_MERGE_NOMERGE; - - /* We don't allow extents to straddle buckets: */ - ca = bch_dev_bkey_exists(c, lp->dev); - - if (PTR_BUCKET_NR(ca, lp) != PTR_BUCKET_NR(ca, rp)) - return BCH_MERGE_NOMERGE; - - break; - } - case BCH_EXTENT_ENTRY_stripe_ptr: - if (en_l->stripe_ptr.block != en_r->stripe_ptr.block || - en_l->stripe_ptr.idx != en_r->stripe_ptr.idx) - return BCH_MERGE_NOMERGE; - break; - case BCH_EXTENT_ENTRY_crc32: - case BCH_EXTENT_ENTRY_crc64: - case BCH_EXTENT_ENTRY_crc128: - crc_l = bch2_extent_crc_unpack(l.k, entry_to_crc(en_l)); - crc_r = bch2_extent_crc_unpack(r.k, entry_to_crc(en_r)); - - if (crc_l.csum_type != crc_r.csum_type || - crc_l.compression_type != crc_r.compression_type || - crc_l.nonce != crc_r.nonce) - return BCH_MERGE_NOMERGE; - - if (crc_l.offset + crc_l.live_size != crc_l.compressed_size || - crc_r.offset) - return BCH_MERGE_NOMERGE; + return false; - if (!bch2_checksum_mergeable(crc_l.csum_type)) - return BCH_MERGE_NOMERGE; - - if (crc_is_compressed(crc_l)) - return BCH_MERGE_NOMERGE; + en_l = extent_entry_next(en_l); + en_r = extent_entry_next(en_r); + } - if (crc_l.csum_type && - crc_l.uncompressed_size + - crc_r.uncompressed_size > c->sb.encoded_extent_max) - return BCH_MERGE_NOMERGE; + if (en_l < l_ptrs.end || en_r < r_ptrs.end) + return false; - if (crc_l.uncompressed_size + crc_r.uncompressed_size > + en_l = l_ptrs.start; + en_r = r_ptrs.start; + lp.crc = bch2_extent_crc_unpack(l.k, NULL); + rp.crc = bch2_extent_crc_unpack(r.k, NULL); + + while (__bkey_ptr_next_decode(l.k, l_ptrs.end, lp, en_l) && + __bkey_ptr_next_decode(r.k, r_ptrs.end, rp, en_r)) { + if (lp.ptr.offset + lp.crc.offset + lp.crc.live_size != + rp.ptr.offset + rp.crc.offset || + lp.ptr.dev != rp.ptr.dev || + lp.ptr.gen != rp.ptr.gen || + lp.has_ec != rp.has_ec) + return false; + + /* Extents may not straddle buckets: */ + ca = bch_dev_bkey_exists(c, lp.ptr.dev); + if (PTR_BUCKET_NR(ca, &lp.ptr) != PTR_BUCKET_NR(ca, &rp.ptr)) + return false; + + if (lp.has_ec != rp.has_ec 
|| + (lp.has_ec && + (lp.ec.block != rp.ec.block || + lp.ec.redundancy != rp.ec.redundancy || + lp.ec.idx != rp.ec.idx))) + return false; + + if (lp.crc.compression_type != rp.crc.compression_type || + lp.crc.nonce != rp.crc.nonce) + return false; + + if (lp.crc.offset + lp.crc.live_size + rp.crc.live_size <= + lp.crc.uncompressed_size) { + /* can use left extent's crc entry */ + } else if (lp.crc.live_size <= rp.crc.offset ) { + /* can use right extent's crc entry */ + } else { + /* check if checksums can be merged: */ + if (lp.crc.csum_type != rp.crc.csum_type || + lp.crc.nonce != rp.crc.nonce || + crc_is_compressed(lp.crc) || + !bch2_checksum_mergeable(lp.crc.csum_type)) + return false; + + if (lp.crc.offset + lp.crc.live_size != lp.crc.compressed_size || + rp.crc.offset) + return false; + + if (lp.crc.csum_type && + lp.crc.uncompressed_size + + rp.crc.uncompressed_size > c->sb.encoded_extent_max) + return false; + + if (lp.crc.uncompressed_size + rp.crc.uncompressed_size > bch2_crc_field_size_max[extent_entry_type(en_l)]) - return BCH_MERGE_NOMERGE; - - break; - default: - return BCH_MERGE_NOMERGE; + return false; } - } - - extent_for_each_entry(l, en_l) { - struct bch_extent_crc_unpacked crc_l, crc_r; - - en_r = vstruct_idx(r.v, (u64 *) en_l - l.v->_data); - - if (!extent_entry_is_crc(en_l)) - continue; - - crc_l = bch2_extent_crc_unpack(l.k, entry_to_crc(en_l)); - crc_r = bch2_extent_crc_unpack(r.k, entry_to_crc(en_r)); - crc_l.csum = bch2_checksum_merge(crc_l.csum_type, - crc_l.csum, - crc_r.csum, - crc_r.uncompressed_size << 9); + en_l = extent_entry_next(en_l); + en_r = extent_entry_next(en_r); + } - crc_l.uncompressed_size += crc_r.uncompressed_size; - crc_l.compressed_size += crc_r.compressed_size; + use_right_ptr = false; + en_l = l_ptrs.start; + en_r = r_ptrs.start; + while (en_l < l_ptrs.end) { + if (extent_entry_type(en_l) == BCH_EXTENT_ENTRY_ptr && + use_right_ptr) + en_l->ptr = en_r->ptr; + + if (extent_entry_is_crc(en_l)) { + struct bch_extent_crc_unpacked crc_l = + bch2_extent_crc_unpack(l.k, entry_to_crc(en_l)); + struct bch_extent_crc_unpacked crc_r = + bch2_extent_crc_unpack(r.k, entry_to_crc(en_r)); + + use_right_ptr = false; + + if (crc_l.offset + crc_l.live_size + crc_r.live_size <= + crc_l.uncompressed_size) { + /* can use left extent's crc entry */ + } else if (crc_l.live_size <= crc_r.offset ) { + /* can use right extent's crc entry */ + crc_r.offset -= crc_l.live_size; + bch2_extent_crc_pack(entry_to_crc(en_l), crc_r, + extent_entry_type(en_l)); + use_right_ptr = true; + } else { + crc_l.csum = bch2_checksum_merge(crc_l.csum_type, + crc_l.csum, + crc_r.csum, + crc_r.uncompressed_size << 9); + + crc_l.uncompressed_size += crc_r.uncompressed_size; + crc_l.compressed_size += crc_r.compressed_size; + bch2_extent_crc_pack(entry_to_crc(en_l), crc_l, + extent_entry_type(en_l)); + } + } - bch2_extent_crc_pack(entry_to_crc(en_l), crc_l, - extent_entry_type(en_l)); + en_l = extent_entry_next(en_l); + en_r = extent_entry_next(en_r); } bch2_key_resize(l.k, l.k->size + r.k->size); - - return BCH_MERGE_MERGE; + return true; } /* KEY_TYPE_reservation: */ @@ -363,25 +386,17 @@ void bch2_reservation_to_text(struct printbuf *out, struct bch_fs *c, r.v->nr_replicas); } -enum merge_result bch2_reservation_merge(struct bch_fs *c, - struct bkey_s _l, struct bkey_s _r) +bool bch2_reservation_merge(struct bch_fs *c, struct bkey_s _l, struct bkey_s_c _r) { struct bkey_s_reservation l = bkey_s_to_reservation(_l); - struct bkey_s_reservation r = bkey_s_to_reservation(_r); + struct 
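
Merging is also simplified from the old tri-state enum (NOMERGE/PARTIAL/MERGE) to a plain bool: a merge now either succeeds whole, leaving the left key covering both ranges, or does nothing, so the partial-merge path and its KEY_SIZE_MAX splitting (removed from the reservation code just below) are gone. Caller shape, as a sketch with `l` and `r` hypothetical adjacent keys of the same type:

	if (bch2_extent_merge(c, bkey_i_to_s(l), bkey_i_to_s_c(r))) {
		/* l now spans both extents; r is dead and can be dropped */
	}
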
bkey_s_c_reservation r = bkey_s_c_to_reservation(_r); if (l.v->generation != r.v->generation || l.v->nr_replicas != r.v->nr_replicas) - return BCH_MERGE_NOMERGE; - - if ((u64) l.k->size + r.k->size > KEY_SIZE_MAX) { - bch2_key_resize(l.k, KEY_SIZE_MAX); - bch2_cut_front_s(l.k->p, r.s); - return BCH_MERGE_PARTIAL; - } + return false; bch2_key_resize(l.k, l.k->size + r.k->size); - - return BCH_MERGE_MERGE; + return true; } /* Extent checksum entries: */ @@ -465,7 +480,7 @@ restart_narrow_pointers: bkey_for_each_ptr_decode(&k->k, ptrs, p, i) if (can_narrow_crc(p.crc, n)) { - bch2_bkey_drop_ptr(bkey_i_to_s(k), &i->ptr); + __bch2_bkey_drop_ptr(bkey_i_to_s(k), &i->ptr); p.ptr.offset += p.crc.offset; p.crc = n; bch2_extent_ptr_decoded_append(k, &p); @@ -597,38 +612,6 @@ bool bch2_bkey_is_incompressible(struct bkey_s_c k) return false; } -bool bch2_check_range_allocated(struct bch_fs *c, struct bpos pos, u64 size, - unsigned nr_replicas, bool compressed) -{ - struct btree_trans trans; - struct btree_iter *iter; - struct bpos end = pos; - struct bkey_s_c k; - bool ret = true; - int err; - - end.offset += size; - - bch2_trans_init(&trans, c, 0, 0); - - for_each_btree_key(&trans, iter, BTREE_ID_extents, pos, - BTREE_ITER_SLOTS, k, err) { - if (bkey_cmp(bkey_start_pos(k.k), end) >= 0) - break; - - if (nr_replicas > bch2_bkey_replicas(c, k) || - (!compressed && bch2_bkey_sectors_compressed(k))) { - ret = false; - break; - } - } - bch2_trans_iter_put(&trans, iter); - - bch2_trans_exit(&trans); - - return ret; -} - unsigned bch2_bkey_replicas(struct bch_fs *c, struct bkey_s_c k) { struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); @@ -802,41 +785,85 @@ static union bch_extent_entry *extent_entry_prev(struct bkey_ptrs ptrs, return i; } -union bch_extent_entry *bch2_bkey_drop_ptr(struct bkey_s k, - struct bch_extent_ptr *ptr) +static void extent_entry_drop(struct bkey_s k, union bch_extent_entry *entry) +{ + union bch_extent_entry *next = extent_entry_next(entry); + + /* stripes have ptrs, but their layout doesn't work with this code */ + BUG_ON(k.k->type == KEY_TYPE_stripe); + + memmove_u64s_down(entry, next, + (u64 *) bkey_val_end(k) - (u64 *) next); + k.k->u64s -= (u64 *) next - (u64 *) entry; +} + +/* + * Returns pointer to the next entry after the one being dropped: + */ +union bch_extent_entry *__bch2_bkey_drop_ptr(struct bkey_s k, + struct bch_extent_ptr *ptr) { struct bkey_ptrs ptrs = bch2_bkey_ptrs(k); - union bch_extent_entry *dst, *src, *prev; + union bch_extent_entry *entry = to_entry(ptr), *next; + union bch_extent_entry *ret = entry; bool drop_crc = true; EBUG_ON(ptr < &ptrs.start->ptr || ptr >= &ptrs.end->ptr); EBUG_ON(ptr->type != 1 << BCH_EXTENT_ENTRY_ptr); - src = extent_entry_next(to_entry(ptr)); - if (src != ptrs.end && - !extent_entry_is_crc(src)) - drop_crc = false; - - dst = to_entry(ptr); - while ((prev = extent_entry_prev(ptrs, dst))) { - if (extent_entry_is_ptr(prev)) + for (next = extent_entry_next(entry); + next != ptrs.end; + next = extent_entry_next(next)) { + if (extent_entry_is_crc(next)) { break; - - if (extent_entry_is_crc(prev)) { - if (drop_crc) - dst = prev; + } else if (extent_entry_is_ptr(next)) { + drop_crc = false; break; } + } - dst = prev; + extent_entry_drop(k, entry); + + while ((entry = extent_entry_prev(ptrs, entry))) { + if (extent_entry_is_ptr(entry)) + break; + + if ((extent_entry_is_crc(entry) && drop_crc) || + extent_entry_is_stripe_ptr(entry)) { + ret = (void *) ret - extent_entry_bytes(entry); + extent_entry_drop(k, entry); + } } - memmove_u64s_down(dst, 
src, - (u64 *) ptrs.end - (u64 *) src); - k.k->u64s -= (u64 *) src - (u64 *) dst; + return ret; +} + +union bch_extent_entry *bch2_bkey_drop_ptr(struct bkey_s k, + struct bch_extent_ptr *ptr) +{ + bool have_dirty = bch2_bkey_dirty_devs(k.s_c).nr; + union bch_extent_entry *ret = + __bch2_bkey_drop_ptr(k, ptr); + + /* + * If we deleted all the dirty pointers and there's still cached + * pointers, we could set the cached pointers to dirty if they're not + * stale - but to do that correctly we'd need to grab an open_bucket + * reference so that we don't race with bucket reuse: + */ + if (have_dirty && + !bch2_bkey_dirty_devs(k.s_c).nr) { + k.k->type = KEY_TYPE_error; + set_bkey_val_u64s(k.k, 0); + ret = NULL; + } else if (!bch2_bkey_nr_ptrs(k.s_c)) { + k.k->type = KEY_TYPE_deleted; + set_bkey_val_u64s(k.k, 0); + ret = NULL; + } - return dst; + return ret; } void bch2_bkey_drop_device(struct bkey_s k, unsigned dev) @@ -906,10 +933,6 @@ bool bch2_extent_normalize(struct bch_fs *c, struct bkey_s k) ptr->cached && ptr_stale(bch_dev_bkey_exists(c, ptr->dev), ptr)); - /* will only happen if all pointers were cached: */ - if (!bch2_bkey_nr_ptrs(k.s_c)) - k.k->type = KEY_TYPE_deleted; - return bkey_deleted(k.k); } @@ -946,12 +969,12 @@ void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c, case BCH_EXTENT_ENTRY_crc128: crc = bch2_extent_crc_unpack(k.k, entry_to_crc(entry)); - pr_buf(out, "crc: c_size %u size %u offset %u nonce %u csum %u compress %u", + pr_buf(out, "crc: c_size %u size %u offset %u nonce %u csum %s compress %s", crc.compressed_size, crc.uncompressed_size, crc.offset, crc.nonce, - crc.csum_type, - crc.compression_type); + bch2_csum_types[crc.csum_type], + bch2_compression_types[crc.compression_type]); break; case BCH_EXTENT_ENTRY_stripe_ptr: ec = &entry->stripe_ptr; diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h index 9999805f955e..9c2567274a2b 100644 --- a/fs/bcachefs/extents.h +++ b/fs/bcachefs/extents.h @@ -78,12 +78,12 @@ static inline size_t extent_entry_u64s(const union bch_extent_entry *entry) static inline bool extent_entry_is_ptr(const union bch_extent_entry *e) { - switch (extent_entry_type(e)) { - case BCH_EXTENT_ENTRY_ptr: - return true; - default: - return false; - } + return extent_entry_type(e) == BCH_EXTENT_ENTRY_ptr; +} + +static inline bool extent_entry_is_stripe_ptr(const union bch_extent_entry *e) +{ + return extent_entry_type(e) == BCH_EXTENT_ENTRY_stripe_ptr; } static inline bool extent_entry_is_crc(const union bch_extent_entry *e) @@ -394,8 +394,7 @@ void bch2_btree_ptr_v2_compat(enum btree_id, unsigned, unsigned, const char *bch2_extent_invalid(const struct bch_fs *, struct bkey_s_c); void bch2_extent_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); -enum merge_result bch2_extent_merge(struct bch_fs *, - struct bkey_s, struct bkey_s); +bool bch2_extent_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c); #define bch2_bkey_ops_extent (struct bkey_ops) { \ .key_invalid = bch2_extent_invalid, \ @@ -409,8 +408,7 @@ enum merge_result bch2_extent_merge(struct bch_fs *, const char *bch2_reservation_invalid(const struct bch_fs *, struct bkey_s_c); void bch2_reservation_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); -enum merge_result bch2_reservation_merge(struct bch_fs *, - struct bkey_s, struct bkey_s); +bool bch2_reservation_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c); #define bch2_bkey_ops_reservation (struct bkey_ops) { \ .key_invalid = bch2_reservation_invalid, \ @@ -428,6 +426,17 @@ void 
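
bch2_bkey_drop_ptr() now also normalizes the key when pointers run out, instead of leaving that to bch2_extent_normalize(): if the last dirty pointer goes away while cached pointers remain, the key becomes KEY_TYPE_error, so reads fail explicitly rather than racing against bucket reuse; with no pointers left at all it becomes deleted. The caller-visible effect (sketch):

	bch2_bkey_drop_ptr(k, ptr);

	if (k.k->type == KEY_TYPE_error) {
		/* only cached, possibly-stale copies remained:
		 * this extent now returns an explicit read error */
	} else if (bkey_deleted(k.k)) {
		/* nothing points anywhere anymore; the key can be dropped */
	}
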
bch2_extent_crc_append(struct bkey_i *, /* Generic code for keys with pointers: */ +static inline bool bkey_is_btree_ptr(const struct bkey *k) +{ + switch (k->type) { + case KEY_TYPE_btree_ptr: + case KEY_TYPE_btree_ptr_v2: + return true; + default: + return false; + } +} + static inline bool bkey_extent_is_direct_data(const struct bkey *k) { switch (k->type) { @@ -558,7 +567,6 @@ unsigned bch2_bkey_nr_ptrs_allocated(struct bkey_s_c); unsigned bch2_bkey_nr_ptrs_fully_allocated(struct bkey_s_c); bool bch2_bkey_is_incompressible(struct bkey_s_c); unsigned bch2_bkey_sectors_compressed(struct bkey_s_c); -bool bch2_check_range_allocated(struct bch_fs *, struct bpos, u64, unsigned, bool); unsigned bch2_bkey_replicas(struct bch_fs *, struct bkey_s_c); unsigned bch2_bkey_durability(struct bch_fs *, struct bkey_s_c); @@ -570,6 +578,8 @@ void bch2_bkey_extent_entry_drop(struct bkey_i *, union bch_extent_entry *); void bch2_bkey_append_ptr(struct bkey_i *, struct bch_extent_ptr); void bch2_extent_ptr_decoded_append(struct bkey_i *, struct extent_ptr_decoded *); +union bch_extent_entry *__bch2_bkey_drop_ptr(struct bkey_s, + struct bch_extent_ptr *); union bch_extent_entry *bch2_bkey_drop_ptr(struct bkey_s, struct bch_extent_ptr *); diff --git a/fs/bcachefs/fs-common.c b/fs/bcachefs/fs-common.c index 00a63fecb976..5f3429e99115 100644 --- a/fs/bcachefs/fs-common.c +++ b/fs/bcachefs/fs-common.c @@ -6,113 +6,207 @@ #include "dirent.h" #include "fs-common.h" #include "inode.h" +#include "subvolume.h" #include "xattr.h" #include <linux/posix_acl.h> -int bch2_create_trans(struct btree_trans *trans, u64 dir_inum, +static inline int is_subdir_for_nlink(struct bch_inode_unpacked *inode) +{ + return S_ISDIR(inode->bi_mode) && !inode->bi_subvol; +} + +int bch2_create_trans(struct btree_trans *trans, + subvol_inum dir, struct bch_inode_unpacked *dir_u, struct bch_inode_unpacked *new_inode, const struct qstr *name, uid_t uid, gid_t gid, umode_t mode, dev_t rdev, struct posix_acl *default_acl, - struct posix_acl *acl) + struct posix_acl *acl, + subvol_inum snapshot_src, + unsigned flags) { struct bch_fs *c = trans->c; - struct btree_iter *dir_iter = NULL; - struct btree_iter *inode_iter = NULL; - struct bch_hash_info hash = bch2_hash_info_init(c, new_inode); + struct btree_iter dir_iter = { NULL }; + struct btree_iter inode_iter = { NULL }; + subvol_inum new_inum = dir; u64 now = bch2_current_time(c); u64 cpu = raw_smp_processor_id(); - u64 dir_offset = 0; + u64 dir_target; + u32 snapshot; + unsigned dir_type = mode_to_type(mode); int ret; - dir_iter = bch2_inode_peek(trans, dir_u, dir_inum, BTREE_ITER_INTENT); - ret = PTR_ERR_OR_ZERO(dir_iter); + ret = bch2_subvolume_get_snapshot(trans, dir.subvol, &snapshot); if (ret) goto err; - bch2_inode_init_late(new_inode, now, uid, gid, mode, rdev, dir_u); - - if (!name) - new_inode->bi_flags |= BCH_INODE_UNLINKED; - - inode_iter = bch2_inode_create(trans, new_inode, U32_MAX, cpu); - ret = PTR_ERR_OR_ZERO(inode_iter); + ret = bch2_inode_peek(trans, &dir_iter, dir_u, dir, BTREE_ITER_INTENT); if (ret) goto err; - if (default_acl) { - ret = bch2_set_acl_trans(trans, new_inode, &hash, - default_acl, ACL_TYPE_DEFAULT); + if (!(flags & BCH_CREATE_SNAPSHOT)) { + /* Normal create path - allocate a new inode: */ + bch2_inode_init_late(new_inode, now, uid, gid, mode, rdev, dir_u); + + if (flags & BCH_CREATE_TMPFILE) + new_inode->bi_flags |= BCH_INODE_UNLINKED; + + ret = bch2_inode_create(trans, &inode_iter, new_inode, snapshot, cpu); if (ret) goto err; + + snapshot_src = (subvol_inum) 
{ 0 }; + } else { + /* + * Creating a snapshot - we're not allocating a new inode, but + * we do have to lookup the root inode of the subvolume we're + * snapshotting and update it (in the new snapshot): + */ + + if (!snapshot_src.inum) { + /* Inode wasn't specified, just snapshot: */ + struct bch_subvolume s; + + ret = bch2_subvolume_get(trans, snapshot_src.subvol, true, + BTREE_ITER_CACHED, &s); + if (ret) + goto err; + + snapshot_src.inum = le64_to_cpu(s.inode); + } + + ret = bch2_inode_peek(trans, &inode_iter, new_inode, snapshot_src, + BTREE_ITER_INTENT); + if (ret) + goto err; + + if (new_inode->bi_subvol != snapshot_src.subvol) { + /* Not a subvolume root: */ + ret = -EINVAL; + goto err; + } + + /* + * If we're not root, we have to own the subvolume being + * snapshotted: + */ + if (uid && new_inode->bi_uid != uid) { + ret = -EPERM; + goto err; + } + + flags |= BCH_CREATE_SUBVOL; } - if (acl) { - ret = bch2_set_acl_trans(trans, new_inode, &hash, - acl, ACL_TYPE_ACCESS); + new_inum.inum = new_inode->bi_inum; + dir_target = new_inode->bi_inum; + + if (flags & BCH_CREATE_SUBVOL) { + u32 new_subvol, dir_snapshot; + + ret = bch2_subvolume_create(trans, new_inode->bi_inum, + snapshot_src.subvol, + &new_subvol, &snapshot, + (flags & BCH_CREATE_SNAPSHOT_RO) != 0); + if (ret) + goto err; + + new_inode->bi_parent_subvol = dir.subvol; + new_inode->bi_subvol = new_subvol; + new_inum.subvol = new_subvol; + dir_target = new_subvol; + dir_type = DT_SUBVOL; + + ret = bch2_subvolume_get_snapshot(trans, dir.subvol, &dir_snapshot); + if (ret) + goto err; + + bch2_btree_iter_set_snapshot(&dir_iter, dir_snapshot); + ret = bch2_btree_iter_traverse(&dir_iter); if (ret) goto err; } - if (name) { + if (!(flags & BCH_CREATE_SNAPSHOT)) { + if (default_acl) { + ret = bch2_set_acl_trans(trans, new_inum, new_inode, + default_acl, ACL_TYPE_DEFAULT); + if (ret) + goto err; + } + + if (acl) { + ret = bch2_set_acl_trans(trans, new_inum, new_inode, + acl, ACL_TYPE_ACCESS); + if (ret) + goto err; + } + } + + if (!(flags & BCH_CREATE_TMPFILE)) { struct bch_hash_info dir_hash = bch2_hash_info_init(c, dir_u); - dir_u->bi_mtime = dir_u->bi_ctime = now; + u64 dir_offset; - if (S_ISDIR(new_inode->bi_mode)) + if (is_subdir_for_nlink(new_inode)) dir_u->bi_nlink++; + dir_u->bi_mtime = dir_u->bi_ctime = now; - ret = bch2_inode_write(trans, dir_iter, dir_u); + ret = bch2_inode_write(trans, &dir_iter, dir_u); if (ret) goto err; - ret = bch2_dirent_create(trans, dir_inum, &dir_hash, - mode_to_type(new_inode->bi_mode), - name, new_inode->bi_inum, + ret = bch2_dirent_create(trans, dir, &dir_hash, + dir_type, + name, + dir_target, &dir_offset, BCH_HASH_SET_MUST_CREATE); if (ret) goto err; - } - if (c->sb.version >= bcachefs_metadata_version_inode_backpointers) { - new_inode->bi_dir = dir_u->bi_inum; - new_inode->bi_dir_offset = dir_offset; + if (c->sb.version >= bcachefs_metadata_version_inode_backpointers) { + new_inode->bi_dir = dir_u->bi_inum; + new_inode->bi_dir_offset = dir_offset; + } } - /* XXX use bch2_btree_iter_set_snapshot() */ - inode_iter->snapshot = U32_MAX; - bch2_btree_iter_set_pos(inode_iter, SPOS(0, new_inode->bi_inum, U32_MAX)); + inode_iter.flags &= ~BTREE_ITER_ALL_SNAPSHOTS; + bch2_btree_iter_set_snapshot(&inode_iter, snapshot); - ret = bch2_inode_write(trans, inode_iter, new_inode); + ret = bch2_btree_iter_traverse(&inode_iter) ?: + bch2_inode_write(trans, &inode_iter, new_inode); err: - bch2_trans_iter_put(trans, inode_iter); - bch2_trans_iter_put(trans, dir_iter); + bch2_trans_iter_exit(trans, &inode_iter); + 
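/*
 * [Editor's aside -- an illustrative sketch, not part of the patch] The
 * pattern above recurs through this whole merge: btree iterators become
 * caller-owned stack objects initialized to { NULL } and torn down with
 * bch2_trans_iter_exit() instead of the old bch2_trans_iter_put(), lookup
 * helpers return an int instead of an iterator pointer, and every lookup
 * carries a subvol_inum plus a snapshot ID fetched inside the retry loop.
 * A minimal lookup in the new style, assuming only helpers that appear
 * elsewhere in this diff (example_scan itself is hypothetical):
 */
static int example_scan(struct bch_fs *c, subvol_inum inum)
{
	struct btree_trans trans;
	struct btree_iter iter = { NULL };
	struct bkey_s_c k;
	u32 snapshot;
	int ret;

	bch2_trans_init(&trans, c, 0, 0);
retry:
	bch2_trans_begin(&trans);

	/* snapshot IDs must be re-fetched after a transaction restart: */
	ret = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot);
	if (ret)
		goto err;

	for_each_btree_key_norestart(&trans, iter, BTREE_ID_extents,
				     SPOS(inum.inum, 0, snapshot),
				     BTREE_ITER_SLOTS, k, ret)
		;	/* inspect k here */

	bch2_trans_iter_exit(&trans, &iter);
err:
	if (ret == -EINTR)
		goto retry;
	bch2_trans_exit(&trans);
	return ret;
}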
bch2_trans_iter_exit(trans, &dir_iter); return ret; } -int bch2_link_trans(struct btree_trans *trans, u64 dir_inum, - u64 inum, struct bch_inode_unpacked *dir_u, - struct bch_inode_unpacked *inode_u, const struct qstr *name) +int bch2_link_trans(struct btree_trans *trans, + subvol_inum dir, struct bch_inode_unpacked *dir_u, + subvol_inum inum, struct bch_inode_unpacked *inode_u, + const struct qstr *name) { struct bch_fs *c = trans->c; - struct btree_iter *dir_iter = NULL, *inode_iter = NULL; + struct btree_iter dir_iter = { NULL }; + struct btree_iter inode_iter = { NULL }; struct bch_hash_info dir_hash; u64 now = bch2_current_time(c); u64 dir_offset = 0; int ret; - inode_iter = bch2_inode_peek(trans, inode_u, inum, BTREE_ITER_INTENT); - ret = PTR_ERR_OR_ZERO(inode_iter); + if (dir.subvol != inum.subvol) + return -EXDEV; + + ret = bch2_inode_peek(trans, &inode_iter, inode_u, inum, BTREE_ITER_INTENT); if (ret) goto err; inode_u->bi_ctime = now; bch2_inode_nlink_inc(inode_u); - dir_iter = bch2_inode_peek(trans, dir_u, dir_inum, 0); - ret = PTR_ERR_OR_ZERO(dir_iter); + ret = bch2_inode_peek(trans, &dir_iter, dir_u, dir, BTREE_ITER_INTENT); if (ret) goto err; @@ -120,80 +214,110 @@ int bch2_link_trans(struct btree_trans *trans, u64 dir_inum, dir_hash = bch2_hash_info_init(c, dir_u); - ret = bch2_dirent_create(trans, dir_inum, &dir_hash, + ret = bch2_dirent_create(trans, dir, &dir_hash, mode_to_type(inode_u->bi_mode), - name, inum, &dir_offset, + name, inum.inum, &dir_offset, BCH_HASH_SET_MUST_CREATE); if (ret) goto err; if (c->sb.version >= bcachefs_metadata_version_inode_backpointers) { - inode_u->bi_dir = dir_inum; + inode_u->bi_dir = dir.inum; inode_u->bi_dir_offset = dir_offset; } - ret = bch2_inode_write(trans, dir_iter, dir_u) ?: - bch2_inode_write(trans, inode_iter, inode_u); + ret = bch2_inode_write(trans, &dir_iter, dir_u) ?: + bch2_inode_write(trans, &inode_iter, inode_u); err: - bch2_trans_iter_put(trans, dir_iter); - bch2_trans_iter_put(trans, inode_iter); + bch2_trans_iter_exit(trans, &dir_iter); + bch2_trans_iter_exit(trans, &inode_iter); return ret; } int bch2_unlink_trans(struct btree_trans *trans, - u64 dir_inum, struct bch_inode_unpacked *dir_u, + subvol_inum dir, + struct bch_inode_unpacked *dir_u, struct bch_inode_unpacked *inode_u, - const struct qstr *name) + const struct qstr *name, + bool deleting_snapshot) { struct bch_fs *c = trans->c; - struct btree_iter *dir_iter = NULL, *dirent_iter = NULL, - *inode_iter = NULL; + struct btree_iter dir_iter = { NULL }; + struct btree_iter dirent_iter = { NULL }; + struct btree_iter inode_iter = { NULL }; struct bch_hash_info dir_hash; - u64 inum, now = bch2_current_time(c); + subvol_inum inum; + u64 now = bch2_current_time(c); struct bkey_s_c k; int ret; - dir_iter = bch2_inode_peek(trans, dir_u, dir_inum, BTREE_ITER_INTENT); - ret = PTR_ERR_OR_ZERO(dir_iter); + ret = bch2_inode_peek(trans, &dir_iter, dir_u, dir, BTREE_ITER_INTENT); if (ret) goto err; dir_hash = bch2_hash_info_init(c, dir_u); - dirent_iter = __bch2_dirent_lookup_trans(trans, dir_inum, &dir_hash, - name, BTREE_ITER_INTENT); - ret = PTR_ERR_OR_ZERO(dirent_iter); + ret = __bch2_dirent_lookup_trans(trans, &dirent_iter, dir, &dir_hash, + name, &inum, BTREE_ITER_INTENT); if (ret) goto err; - k = bch2_btree_iter_peek_slot(dirent_iter); - inum = le64_to_cpu(bkey_s_c_to_dirent(k).v->d_inum); - - inode_iter = bch2_inode_peek(trans, inode_u, inum, BTREE_ITER_INTENT); - ret = PTR_ERR_OR_ZERO(inode_iter); + ret = bch2_inode_peek(trans, &inode_iter, inode_u, inum, + 
BTREE_ITER_INTENT); if (ret) goto err; - if (inode_u->bi_dir == k.k->p.inode && - inode_u->bi_dir_offset == k.k->p.offset) { + if (!deleting_snapshot && S_ISDIR(inode_u->bi_mode)) { + ret = bch2_empty_dir_trans(trans, inum); + if (ret) + goto err; + } + + if (deleting_snapshot && !inode_u->bi_subvol) { + ret = -ENOENT; + goto err; + } + + if (deleting_snapshot || inode_u->bi_subvol) { + ret = bch2_subvolume_unlink(trans, inode_u->bi_subvol); + if (ret) + goto err; + + k = bch2_btree_iter_peek_slot(&dirent_iter); + ret = bkey_err(k); + if (ret) + goto err; + + /* + * If we're deleting a subvolume, we need to really delete the + * dirent, not just emit a whiteout in the current snapshot: + */ + bch2_btree_iter_set_snapshot(&dirent_iter, k.k->p.snapshot); + ret = bch2_btree_iter_traverse(&dirent_iter); + if (ret) + goto err; + } else { + bch2_inode_nlink_dec(inode_u); + } + + if (inode_u->bi_dir == dirent_iter.pos.inode && + inode_u->bi_dir_offset == dirent_iter.pos.offset) { inode_u->bi_dir = 0; inode_u->bi_dir_offset = 0; } dir_u->bi_mtime = dir_u->bi_ctime = inode_u->bi_ctime = now; - dir_u->bi_nlink -= S_ISDIR(inode_u->bi_mode); - bch2_inode_nlink_dec(inode_u); - - ret = (S_ISDIR(inode_u->bi_mode) - ? bch2_empty_dir_trans(trans, inum) - : 0) ?: - bch2_dirent_delete_at(trans, &dir_hash, dirent_iter) ?: - bch2_inode_write(trans, dir_iter, dir_u) ?: - bch2_inode_write(trans, inode_iter, inode_u); + dir_u->bi_nlink -= is_subdir_for_nlink(inode_u); + + ret = bch2_hash_delete_at(trans, bch2_dirent_hash_desc, + &dir_hash, &dirent_iter, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?: + bch2_inode_write(trans, &dir_iter, dir_u) ?: + bch2_inode_write(trans, &inode_iter, inode_u); err: - bch2_trans_iter_put(trans, inode_iter); - bch2_trans_iter_put(trans, dirent_iter); - bch2_trans_iter_put(trans, dir_iter); + bch2_trans_iter_exit(trans, &inode_iter); + bch2_trans_iter_exit(trans, &dirent_iter); + bch2_trans_iter_exit(trans, &dir_iter); return ret; } @@ -222,8 +346,8 @@ bool bch2_reinherit_attrs(struct bch_inode_unpacked *dst_u, } int bch2_rename_trans(struct btree_trans *trans, - u64 src_dir, struct bch_inode_unpacked *src_dir_u, - u64 dst_dir, struct bch_inode_unpacked *dst_dir_u, + subvol_inum src_dir, struct bch_inode_unpacked *src_dir_u, + subvol_inum dst_dir, struct bch_inode_unpacked *dst_dir_u, struct bch_inode_unpacked *src_inode_u, struct bch_inode_unpacked *dst_inode_u, const struct qstr *src_name, @@ -231,25 +355,27 @@ int bch2_rename_trans(struct btree_trans *trans, enum bch_rename_mode mode) { struct bch_fs *c = trans->c; - struct btree_iter *src_dir_iter = NULL, *dst_dir_iter = NULL; - struct btree_iter *src_inode_iter = NULL, *dst_inode_iter = NULL; + struct btree_iter src_dir_iter = { NULL }; + struct btree_iter dst_dir_iter = { NULL }; + struct btree_iter src_inode_iter = { NULL }; + struct btree_iter dst_inode_iter = { NULL }; struct bch_hash_info src_hash, dst_hash; - u64 src_inode, src_offset, dst_inode, dst_offset; + subvol_inum src_inum, dst_inum; + u64 src_offset, dst_offset; u64 now = bch2_current_time(c); int ret; - src_dir_iter = bch2_inode_peek(trans, src_dir_u, src_dir, - BTREE_ITER_INTENT); - ret = PTR_ERR_OR_ZERO(src_dir_iter); + ret = bch2_inode_peek(trans, &src_dir_iter, src_dir_u, src_dir, + BTREE_ITER_INTENT); if (ret) goto err; src_hash = bch2_hash_info_init(c, src_dir_u); - if (dst_dir != src_dir) { - dst_dir_iter = bch2_inode_peek(trans, dst_dir_u, dst_dir, - BTREE_ITER_INTENT); - ret = PTR_ERR_OR_ZERO(dst_dir_iter); + if (dst_dir.inum != src_dir.inum || + 
dst_dir.subvol != src_dir.subvol) { + ret = bch2_inode_peek(trans, &dst_dir_iter, dst_dir_u, dst_dir, + BTREE_ITER_INTENT); if (ret) goto err; @@ -262,22 +388,20 @@ int bch2_rename_trans(struct btree_trans *trans, ret = bch2_dirent_rename(trans, src_dir, &src_hash, dst_dir, &dst_hash, - src_name, &src_inode, &src_offset, - dst_name, &dst_inode, &dst_offset, + src_name, &src_inum, &src_offset, + dst_name, &dst_inum, &dst_offset, mode); if (ret) goto err; - src_inode_iter = bch2_inode_peek(trans, src_inode_u, src_inode, - BTREE_ITER_INTENT); - ret = PTR_ERR_OR_ZERO(src_inode_iter); + ret = bch2_inode_peek(trans, &src_inode_iter, src_inode_u, src_inum, + BTREE_ITER_INTENT); if (ret) goto err; - if (dst_inode) { - dst_inode_iter = bch2_inode_peek(trans, dst_inode_u, dst_inode, - BTREE_ITER_INTENT); - ret = PTR_ERR_OR_ZERO(dst_inode_iter); + if (dst_inum.inum) { + ret = bch2_inode_peek(trans, &dst_inode_iter, dst_inode_u, dst_inum, + BTREE_ITER_INTENT); if (ret) goto err; } @@ -307,7 +431,7 @@ int bch2_rename_trans(struct btree_trans *trans, } if (S_ISDIR(dst_inode_u->bi_mode) && - bch2_empty_dir_trans(trans, dst_inode)) { + bch2_empty_dir_trans(trans, dst_inum)) { ret = -ENOTEMPTY; goto err; } @@ -326,12 +450,12 @@ int bch2_rename_trans(struct btree_trans *trans, goto err; } - if (S_ISDIR(src_inode_u->bi_mode)) { + if (is_subdir_for_nlink(src_inode_u)) { src_dir_u->bi_nlink--; dst_dir_u->bi_nlink++; } - if (dst_inode && S_ISDIR(dst_inode_u->bi_mode)) { + if (dst_inum.inum && is_subdir_for_nlink(dst_inode_u)) { dst_dir_u->bi_nlink--; src_dir_u->bi_nlink += mode == BCH_RENAME_EXCHANGE; } @@ -342,28 +466,28 @@ int bch2_rename_trans(struct btree_trans *trans, src_dir_u->bi_mtime = now; src_dir_u->bi_ctime = now; - if (src_dir != dst_dir) { + if (src_dir.inum != dst_dir.inum) { dst_dir_u->bi_mtime = now; dst_dir_u->bi_ctime = now; } src_inode_u->bi_ctime = now; - if (dst_inode) + if (dst_inum.inum) dst_inode_u->bi_ctime = now; - ret = bch2_inode_write(trans, src_dir_iter, src_dir_u) ?: - (src_dir != dst_dir - ? bch2_inode_write(trans, dst_dir_iter, dst_dir_u) + ret = bch2_inode_write(trans, &src_dir_iter, src_dir_u) ?: + (src_dir.inum != dst_dir.inum + ? bch2_inode_write(trans, &dst_dir_iter, dst_dir_u) : 0 ) ?: - bch2_inode_write(trans, src_inode_iter, src_inode_u) ?: - (dst_inode - ? bch2_inode_write(trans, dst_inode_iter, dst_inode_u) + bch2_inode_write(trans, &src_inode_iter, src_inode_u) ?: + (dst_inum.inum + ? 
bch2_inode_write(trans, &dst_inode_iter, dst_inode_u) : 0 ); err: - bch2_trans_iter_put(trans, dst_inode_iter); - bch2_trans_iter_put(trans, src_inode_iter); - bch2_trans_iter_put(trans, dst_dir_iter); - bch2_trans_iter_put(trans, src_dir_iter); + bch2_trans_iter_exit(trans, &dst_inode_iter); + bch2_trans_iter_exit(trans, &src_inode_iter); + bch2_trans_iter_exit(trans, &dst_dir_iter); + bch2_trans_iter_exit(trans, &src_dir_iter); return ret; } diff --git a/fs/bcachefs/fs-common.h b/fs/bcachefs/fs-common.h index 2273b7961c9b..dde237859514 100644 --- a/fs/bcachefs/fs-common.h +++ b/fs/bcachefs/fs-common.h @@ -4,27 +4,33 @@ struct posix_acl; -int bch2_create_trans(struct btree_trans *, u64, +#define BCH_CREATE_TMPFILE (1U << 0) +#define BCH_CREATE_SUBVOL (1U << 1) +#define BCH_CREATE_SNAPSHOT (1U << 2) +#define BCH_CREATE_SNAPSHOT_RO (1U << 3) + +int bch2_create_trans(struct btree_trans *, subvol_inum, struct bch_inode_unpacked *, struct bch_inode_unpacked *, const struct qstr *, uid_t, gid_t, umode_t, dev_t, struct posix_acl *, - struct posix_acl *); + struct posix_acl *, + subvol_inum, unsigned); -int bch2_link_trans(struct btree_trans *, u64, - u64, struct bch_inode_unpacked *, - struct bch_inode_unpacked *, +int bch2_link_trans(struct btree_trans *, + subvol_inum, struct bch_inode_unpacked *, + subvol_inum, struct bch_inode_unpacked *, const struct qstr *); -int bch2_unlink_trans(struct btree_trans *, - u64, struct bch_inode_unpacked *, +int bch2_unlink_trans(struct btree_trans *, subvol_inum, + struct bch_inode_unpacked *, struct bch_inode_unpacked *, - const struct qstr *); + const struct qstr *, bool); int bch2_rename_trans(struct btree_trans *, - u64, struct bch_inode_unpacked *, - u64, struct bch_inode_unpacked *, + subvol_inum, struct bch_inode_unpacked *, + subvol_inum, struct bch_inode_unpacked *, struct bch_inode_unpacked *, struct bch_inode_unpacked *, const struct qstr *, diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index e6916bbc25eb..5bcdfe3c5890 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -99,8 +99,7 @@ static int write_invalidate_inode_pages_range(struct address_space *mapping, * is continually redirtying a specific page */ do { - if (!mapping->nrpages && - !mapping->nrexceptional) + if (!mapping->nrpages) return 0; ret = filemap_write_and_wait_range(mapping, start, end); @@ -224,6 +223,9 @@ static void i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode, return; mutex_lock(&inode->ei_quota_lock); + BUG_ON((s64) inode->v.i_blocks + sectors < 0); + inode->v.i_blocks += sectors; + #ifdef CONFIG_BCACHEFS_QUOTA if (quota_res && sectors > 0) { BUG_ON(sectors > quota_res->sectors); @@ -235,7 +237,6 @@ static void i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode, bch2_quota_acct(c, inode->ei_qid, Q_SPC, sectors, KEY_TYPE_QUOTA_WARN); } #endif - inode->v.i_blocks += sectors; mutex_unlock(&inode->ei_quota_lock); } @@ -244,24 +245,26 @@ static void i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode, /* stored in page->private: */ struct bch_page_sector { - /* Uncompressed, fully allocated replicas: */ - unsigned nr_replicas:3; + /* Uncompressed, fully allocated replicas (or on disk reservation): */ + unsigned nr_replicas:4; - /* Owns PAGE_SECTORS * replicas_reserved sized reservation: */ - unsigned replicas_reserved:3; + /* Owns PAGE_SECTORS * replicas_reserved sized in memory reservation: */ + unsigned replicas_reserved:4; /* i_sectors: */ enum { SECTOR_UNALLOCATED, SECTOR_RESERVED, SECTOR_DIRTY, + SECTOR_DIRTY_RESERVED, 
SECTOR_ALLOCATED, - } state:2; + } state:8; }; struct bch_page_state { spinlock_t lock; atomic_t write_count; + bool uptodate; struct bch_page_sector s[PAGE_SECTORS]; }; @@ -282,28 +285,13 @@ static inline struct bch_page_state *bch2_page_state(struct page *page) /* for newly allocated pages: */ static void __bch2_page_state_release(struct page *page) { - struct bch_page_state *s = __bch2_page_state(page); - - if (!s) - return; - - ClearPagePrivate(page); - set_page_private(page, 0); - put_page(page); - kfree(s); + kfree(detach_page_private(page)); } static void bch2_page_state_release(struct page *page) { - struct bch_page_state *s = bch2_page_state(page); - - if (!s) - return; - - ClearPagePrivate(page); - set_page_private(page, 0); - put_page(page); - kfree(s); + EBUG_ON(!PageLocked(page)); + __bch2_page_state_release(page); } /* for newly allocated pages: */ @@ -317,13 +305,7 @@ static struct bch_page_state *__bch2_page_state_create(struct page *page, return NULL; spin_lock_init(&s->lock); - /* - * migrate_page_move_mapping() assumes that pages with private data - * have their count elevated by 1. - */ - get_page(page); - set_page_private(page, (unsigned long) s); - SetPagePrivate(page); + attach_page_private(page, s); return s; } @@ -333,6 +315,212 @@ static struct bch_page_state *bch2_page_state_create(struct page *page, return bch2_page_state(page) ?: __bch2_page_state_create(page, gfp); } +static unsigned bkey_to_sector_state(const struct bkey *k) +{ + if (k->type == KEY_TYPE_reservation) + return SECTOR_RESERVED; + if (bkey_extent_is_allocation(k)) + return SECTOR_ALLOCATED; + return SECTOR_UNALLOCATED; +} + +static void __bch2_page_state_set(struct page *page, + unsigned pg_offset, unsigned pg_len, + unsigned nr_ptrs, unsigned state) +{ + struct bch_page_state *s = bch2_page_state_create(page, __GFP_NOFAIL); + unsigned i; + + BUG_ON(pg_offset >= PAGE_SECTORS); + BUG_ON(pg_offset + pg_len > PAGE_SECTORS); + + spin_lock(&s->lock); + + for (i = pg_offset; i < pg_offset + pg_len; i++) { + s->s[i].nr_replicas = nr_ptrs; + s->s[i].state = state; + } + + if (i == PAGE_SECTORS) + s->uptodate = true; + + spin_unlock(&s->lock); +} + +static int bch2_page_state_set(struct bch_fs *c, subvol_inum inum, + struct page **pages, unsigned nr_pages) +{ + struct btree_trans trans; + struct btree_iter iter; + struct bkey_s_c k; + u64 offset = pages[0]->index << PAGE_SECTORS_SHIFT; + unsigned pg_idx = 0; + u32 snapshot; + int ret; + + bch2_trans_init(&trans, c, 0, 0); +retry: + bch2_trans_begin(&trans); + + ret = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot); + if (ret) + goto err; + + for_each_btree_key_norestart(&trans, iter, BTREE_ID_extents, + SPOS(inum.inum, offset, snapshot), + BTREE_ITER_SLOTS, k, ret) { + unsigned nr_ptrs = bch2_bkey_nr_ptrs_fully_allocated(k); + unsigned state = bkey_to_sector_state(k.k); + + while (pg_idx < nr_pages) { + struct page *page = pages[pg_idx]; + u64 pg_start = page->index << PAGE_SECTORS_SHIFT; + u64 pg_end = (page->index + 1) << PAGE_SECTORS_SHIFT; + unsigned pg_offset = max(bkey_start_offset(k.k), pg_start) - pg_start; + unsigned pg_len = min(k.k->p.offset, pg_end) - pg_offset - pg_start; + + BUG_ON(k.k->p.offset < pg_start); + BUG_ON(bkey_start_offset(k.k) > pg_end); + + if (!bch2_page_state_create(page, __GFP_NOFAIL)->uptodate) + __bch2_page_state_set(page, pg_offset, pg_len, nr_ptrs, state); + + if (k.k->p.offset < pg_end) + break; + pg_idx++; + } + + if (pg_idx == nr_pages) + break; + } + + offset = iter.pos.offset; + 
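/*
 * [Editor's aside -- an illustrative sketch, not part of the patch] With
 * SECTOR_DIRTY_RESERVED added, the per-sector states form a small state
 * machine: mark_pagecache_reserved() below moves UNALLOCATED->RESERVED and
 * DIRTY->DIRTY_RESERVED, bch2_set_page_dirty() moves UNALLOCATED->DIRTY and
 * RESERVED->DIRTY_RESERVED, and bch2_clear_page_bits() undoes the dirtying
 * while keeping any reservation. The two dirty-side transitions, restated
 * as pure functions (the helper names are hypothetical):
 */
static unsigned example_state_on_dirty(unsigned state)
{
	switch (state) {
	case SECTOR_UNALLOCATED:	/* new dirty data, counted in i_sectors */
		return SECTOR_DIRTY;
	case SECTOR_RESERVED:		/* dirty data backed by a reservation */
		return SECTOR_DIRTY_RESERVED;
	default:
		return state;
	}
}

static unsigned example_state_on_clean(unsigned state)
{
	switch (state) {
	case SECTOR_DIRTY:		/* dropped without being written out */
		return SECTOR_UNALLOCATED;
	case SECTOR_DIRTY_RESERVED:	/* the reservation survives cleaning */
		return SECTOR_RESERVED;
	default:
		return state;
	}
}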
bch2_trans_iter_exit(&trans, &iter); +err: + if (ret == -EINTR) + goto retry; + bch2_trans_exit(&trans); + + return ret; +} + +static void bch2_bio_page_state_set(struct bio *bio, struct bkey_s_c k) +{ + struct bvec_iter iter; + struct bio_vec bv; + unsigned nr_ptrs = k.k->type == KEY_TYPE_reflink_v + ? 0 : bch2_bkey_nr_ptrs_fully_allocated(k); + unsigned state = bkey_to_sector_state(k.k); + + bio_for_each_segment(bv, bio, iter) + __bch2_page_state_set(bv.bv_page, bv.bv_offset >> 9, + bv.bv_len >> 9, nr_ptrs, state); +} + +static void mark_pagecache_unallocated(struct bch_inode_info *inode, + u64 start, u64 end) +{ + pgoff_t index = start >> PAGE_SECTORS_SHIFT; + pgoff_t end_index = (end - 1) >> PAGE_SECTORS_SHIFT; + struct pagevec pvec; + + if (end <= start) + return; + + pagevec_init(&pvec); + + do { + unsigned nr_pages, i, j; + + nr_pages = pagevec_lookup_range(&pvec, inode->v.i_mapping, + &index, end_index); + for (i = 0; i < nr_pages; i++) { + struct page *page = pvec.pages[i]; + u64 pg_start = page->index << PAGE_SECTORS_SHIFT; + u64 pg_end = (page->index + 1) << PAGE_SECTORS_SHIFT; + unsigned pg_offset = max(start, pg_start) - pg_start; + unsigned pg_len = min(end, pg_end) - pg_offset - pg_start; + struct bch_page_state *s; + + BUG_ON(end <= pg_start); + BUG_ON(pg_offset >= PAGE_SECTORS); + BUG_ON(pg_offset + pg_len > PAGE_SECTORS); + + lock_page(page); + s = bch2_page_state(page); + + if (s) { + spin_lock(&s->lock); + for (j = pg_offset; j < pg_offset + pg_len; j++) + s->s[j].nr_replicas = 0; + spin_unlock(&s->lock); + } + + unlock_page(page); + } + pagevec_release(&pvec); + } while (index <= end_index); +} + +static void mark_pagecache_reserved(struct bch_inode_info *inode, + u64 start, u64 end) +{ + struct bch_fs *c = inode->v.i_sb->s_fs_info; + pgoff_t index = start >> PAGE_SECTORS_SHIFT; + pgoff_t end_index = (end - 1) >> PAGE_SECTORS_SHIFT; + struct pagevec pvec; + s64 i_sectors_delta = 0; + + if (end <= start) + return; + + pagevec_init(&pvec); + + do { + unsigned nr_pages, i, j; + + nr_pages = pagevec_lookup_range(&pvec, inode->v.i_mapping, + &index, end_index); + for (i = 0; i < nr_pages; i++) { + struct page *page = pvec.pages[i]; + u64 pg_start = page->index << PAGE_SECTORS_SHIFT; + u64 pg_end = (page->index + 1) << PAGE_SECTORS_SHIFT; + unsigned pg_offset = max(start, pg_start) - pg_start; + unsigned pg_len = min(end, pg_end) - pg_offset - pg_start; + struct bch_page_state *s; + + BUG_ON(end <= pg_start); + BUG_ON(pg_offset >= PAGE_SECTORS); + BUG_ON(pg_offset + pg_len > PAGE_SECTORS); + + lock_page(page); + s = bch2_page_state(page); + + if (s) { + spin_lock(&s->lock); + for (j = pg_offset; j < pg_offset + pg_len; j++) + switch (s->s[j].state) { + case SECTOR_UNALLOCATED: + s->s[j].state = SECTOR_RESERVED; + break; + case SECTOR_DIRTY: + s->s[j].state = SECTOR_DIRTY_RESERVED; + i_sectors_delta--; + break; + default: + break; + } + spin_unlock(&s->lock); + } + + unlock_page(page); + } + pagevec_release(&pvec); + } while (index <= end_index); + + i_sectors_acct(c, inode, NULL, i_sectors_delta); +} + static inline unsigned inode_nr_replicas(struct bch_fs *c, struct bch_inode_info *inode) { /* XXX: this should not be open coded */ @@ -417,6 +605,8 @@ static int bch2_page_reservation_get(struct bch_fs *c, if (!s) return -ENOMEM; + BUG_ON(!s->uptodate); + for (i = round_down(offset, block_bytes(c)) >> 9; i < round_up(offset + len, block_bytes(c)) >> 9; i++) { @@ -471,16 +661,22 @@ static void bch2_clear_page_bits(struct page *page) disk_res.sectors += 
s->s[i].replicas_reserved; s->s[i].replicas_reserved = 0; - if (s->s[i].state == SECTOR_DIRTY) { - dirty_sectors++; + switch (s->s[i].state) { + case SECTOR_DIRTY: s->s[i].state = SECTOR_UNALLOCATED; + --dirty_sectors; + break; + case SECTOR_DIRTY_RESERVED: + s->s[i].state = SECTOR_RESERVED; + break; + default: + break; } } bch2_disk_reservation_put(c, &disk_res); - if (dirty_sectors) - i_sectors_acct(c, inode, NULL, -dirty_sectors); + i_sectors_acct(c, inode, NULL, dirty_sectors); bch2_page_state_release(page); } @@ -513,16 +709,22 @@ static void bch2_set_page_dirty(struct bch_fs *c, s->s[i].replicas_reserved += sectors; res->disk.sectors -= sectors; - if (s->s[i].state == SECTOR_UNALLOCATED) + switch (s->s[i].state) { + case SECTOR_UNALLOCATED: + s->s[i].state = SECTOR_DIRTY; dirty_sectors++; - - s->s[i].state = max_t(unsigned, s->s[i].state, SECTOR_DIRTY); + break; + case SECTOR_RESERVED: + s->s[i].state = SECTOR_DIRTY_RESERVED; + break; + default: + break; + } } spin_unlock(&s->lock); - if (dirty_sectors) - i_sectors_acct(c, inode, &res->quota, dirty_sectors); + i_sectors_acct(c, inode, &res->quota, dirty_sectors); if (!PageDirty(page)) __set_page_dirty_nobuffers(page); @@ -576,7 +778,7 @@ vm_fault_t bch2_page_mkwrite(struct vm_fault *vmf) struct bch2_page_reservation res; unsigned len; loff_t isize; - int ret = VM_FAULT_LOCKED; + int ret; bch2_page_reservation_init(c, inode, &res); @@ -602,6 +804,14 @@ vm_fault_t bch2_page_mkwrite(struct vm_fault *vmf) len = min_t(loff_t, PAGE_SIZE, isize - page_offset(page)); + if (!bch2_page_state_create(page, __GFP_NOFAIL)->uptodate) { + if (bch2_page_state_set(c, inode_inum(inode), &page, 1)) { + unlock_page(page); + ret = VM_FAULT_SIGBUS; + goto out; + } + } + if (bch2_page_reservation_get(c, inode, page, &res, 0, len, true)) { unlock_page(page); ret = VM_FAULT_SIGBUS; @@ -612,6 +822,7 @@ vm_fault_t bch2_page_mkwrite(struct vm_fault *vmf) bch2_page_reservation_put(c, inode, &res); wait_for_stable_page(page); + ret = VM_FAULT_LOCKED; out: bch2_pagecache_add_put(&inode->ei_pagecache_lock); sb_end_pagefault(inode->v.i_sb); @@ -646,18 +857,12 @@ int bch2_migrate_page(struct address_space *mapping, struct page *newpage, EBUG_ON(!PageLocked(page)); EBUG_ON(!PageLocked(newpage)); - ret = migrate_page_move_mapping(mapping, newpage, page, NULL, mode, 0); + ret = migrate_page_move_mapping(mapping, newpage, page, 0); if (ret != MIGRATEPAGE_SUCCESS) return ret; - if (PagePrivate(page)) { - ClearPagePrivate(page); - get_page(newpage); - set_page_private(newpage, page_private(page)); - set_page_private(page, 0); - put_page(page); - SetPagePrivate(newpage); - } + if (PagePrivate(page)) + attach_page_private(newpage, detach_page_private(page)); if (mode != MIGRATE_SYNC_NO_COPY) migrate_page_copy(newpage, page); @@ -671,10 +876,10 @@ int bch2_migrate_page(struct address_space *mapping, struct page *newpage, static void bch2_readpages_end_io(struct bio *bio) { + struct bvec_iter_all iter; struct bio_vec *bv; - unsigned i; - bio_for_each_segment_all(bv, bio, i) { + bio_for_each_segment_all(bv, bio, iter) { struct page *page = bv->bv_page; if (!bio->bi_status) { @@ -693,31 +898,29 @@ struct readpages_iter { struct address_space *mapping; struct page **pages; unsigned nr_pages; - unsigned nr_added; unsigned idx; pgoff_t offset; }; static int readpages_iter_init(struct readpages_iter *iter, - struct address_space *mapping, - struct list_head *pages, unsigned nr_pages) + struct readahead_control *ractl) { + unsigned i, nr_pages = readahead_count(ractl); + memset(iter, 
0, sizeof(*iter)); - iter->mapping = mapping; - iter->offset = list_last_entry(pages, struct page, lru)->index; + iter->mapping = ractl->mapping; + iter->offset = readahead_index(ractl); + iter->nr_pages = nr_pages; iter->pages = kmalloc_array(nr_pages, sizeof(struct page *), GFP_NOFS); if (!iter->pages) return -ENOMEM; - while (!list_empty(pages)) { - struct page *page = list_last_entry(pages, struct page, lru); - - __bch2_page_state_create(page, __GFP_NOFAIL); - - iter->pages[iter->nr_pages++] = page; - list_del(&page->lru); + nr_pages = __readahead_batch(ractl, iter->pages, nr_pages); + for (i = 0; i < nr_pages; i++) { + __bch2_page_state_create(iter->pages[i], __GFP_NOFAIL); + put_page(iter->pages[i]); } return 0; @@ -725,69 +928,14 @@ static int readpages_iter_init(struct readpages_iter *iter, static inline struct page *readpage_iter_next(struct readpages_iter *iter) { - struct page *page; - unsigned i; - int ret; - - BUG_ON(iter->idx > iter->nr_added); - BUG_ON(iter->nr_added > iter->nr_pages); - - if (iter->idx < iter->nr_added) - goto out; - - while (1) { - if (iter->idx == iter->nr_pages) - return NULL; - - ret = add_to_page_cache_lru_vec(iter->mapping, - iter->pages + iter->nr_added, - iter->nr_pages - iter->nr_added, - iter->offset + iter->nr_added, - GFP_NOFS); - if (ret > 0) - break; - - page = iter->pages[iter->nr_added]; - iter->idx++; - iter->nr_added++; - - __bch2_page_state_release(page); - put_page(page); - } - - iter->nr_added += ret; + if (iter->idx >= iter->nr_pages) + return NULL; - for (i = iter->idx; i < iter->nr_added; i++) - put_page(iter->pages[i]); -out: EBUG_ON(iter->pages[iter->idx]->index != iter->offset + iter->idx); return iter->pages[iter->idx]; } -static void bch2_add_page_sectors(struct bio *bio, struct bkey_s_c k) -{ - struct bvec_iter iter; - struct bio_vec bv; - unsigned nr_ptrs = k.k->type == KEY_TYPE_reflink_v - ? 0 : bch2_bkey_nr_ptrs_fully_allocated(k); - unsigned state = k.k->type == KEY_TYPE_reservation - ? 
SECTOR_RESERVED - : SECTOR_ALLOCATED; - - bio_for_each_segment(bv, bio, iter) { - struct bch_page_state *s = bch2_page_state(bv.bv_page); - unsigned i; - - for (i = bv.bv_offset >> 9; - i < (bv.bv_offset + bv.bv_len) >> 9; - i++) { - s->s[i].nr_replicas = nr_ptrs; - s->s[i].state = state; - } - } -} - static bool extent_partial_reads_expensive(struct bkey_s_c k) { struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); @@ -807,7 +955,7 @@ static void readpage_bio_extend(struct readpages_iter *iter, { while (bio_sectors(bio) < sectors_this_extent && bio->bi_vcnt < bio->bi_max_vecs) { - pgoff_t page_offset = bio_end_sector(bio) >> PAGE_SECTOR_SHIFT; + pgoff_t page_offset = bio_end_sector(bio) >> PAGE_SECTORS_SHIFT; struct page *page = readpage_iter_next(iter); int ret; @@ -820,11 +968,8 @@ static void readpage_bio_extend(struct readpages_iter *iter, if (!get_more) break; - rcu_read_lock(); - page = radix_tree_lookup(&iter->mapping->i_pages, page_offset); - rcu_read_unlock(); - - if (page && !radix_tree_exceptional_entry(page)) + page = xa_load(&iter->mapping->i_pages, page_offset); + if (page && !xa_is_value(page)) break; page = __page_cache_alloc(readahead_gfp_mask(iter->mapping)); @@ -851,35 +996,58 @@ static void readpage_bio_extend(struct readpages_iter *iter, } } -static void bchfs_read(struct btree_trans *trans, struct btree_iter *iter, - struct bch_read_bio *rbio, u64 inum, +static void bchfs_read(struct btree_trans *trans, + struct bch_read_bio *rbio, + subvol_inum inum, struct readpages_iter *readpages_iter) { struct bch_fs *c = trans->c; + struct btree_iter iter; struct bkey_buf sk; int flags = BCH_READ_RETRY_IF_STALE| BCH_READ_MAY_PROMOTE; + u32 snapshot; int ret = 0; rbio->c = c; rbio->start_time = local_clock(); + rbio->subvol = inum.subvol; bch2_bkey_buf_init(&sk); retry: + bch2_trans_begin(trans); + iter = (struct btree_iter) { NULL }; + + ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); + if (ret) + goto err; + + bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, + SPOS(inum.inum, rbio->bio.bi_iter.bi_sector, snapshot), + BTREE_ITER_SLOTS|BTREE_ITER_FILTER_SNAPSHOTS); while (1) { struct bkey_s_c k; unsigned bytes, sectors, offset_into_extent; enum btree_id data_btree = BTREE_ID_extents; - bch2_btree_iter_set_pos(iter, - POS(inum, rbio->bio.bi_iter.bi_sector)); + /* + * read_extent -> io_time_reset may cause a transaction restart + * without returning an error, we need to check for that here: + */ + if (!bch2_trans_relock(trans)) { + ret = -EINTR; + break; + } - k = bch2_btree_iter_peek_slot(iter); + bch2_btree_iter_set_pos(&iter, + POS(inum.inum, rbio->bio.bi_iter.bi_sector)); + + k = bch2_btree_iter_peek_slot(&iter); ret = bkey_err(k); if (ret) break; - offset_into_extent = iter->pos.offset - + offset_into_extent = iter.pos.offset - bkey_start_offset(k.k); sectors = k.k->size - offset_into_extent; @@ -906,10 +1074,9 @@ retry: if (rbio->bio.bi_iter.bi_size == bytes) flags |= BCH_READ_LAST_FRAGMENT; - if (bkey_extent_is_allocation(k.k)) - bch2_add_page_sectors(&rbio->bio, k); + bch2_bio_page_state_set(&rbio->bio, k); - bch2_read_extent(trans, rbio, iter->pos, + bch2_read_extent(trans, rbio, iter.pos, data_btree, k, offset_into_extent, flags); if (flags & BCH_READ_LAST_FRAGMENT) @@ -917,13 +1084,19 @@ retry: swap(rbio->bio.bi_iter.bi_size, bytes); bio_advance(&rbio->bio, bytes); + + ret = btree_trans_too_many_iters(trans); + if (ret) + break; } +err: + bch2_trans_iter_exit(trans, &iter); if (ret == -EINTR) goto retry; if (ret) { - bch_err_inum_ratelimited(c, inum, + 
bch_err_inum_ratelimited(c, inum.inum, "read error %i from btree lookup", ret); rbio->bio.bi_status = BLK_STS_IOERR; bio_endio(&rbio->bio); @@ -932,24 +1105,20 @@ retry: bch2_bkey_buf_exit(&sk, c); } -int bch2_readpages(struct file *file, struct address_space *mapping, - struct list_head *pages, unsigned nr_pages) +void bch2_readahead(struct readahead_control *ractl) { - struct bch_inode_info *inode = to_bch_ei(mapping->host); + struct bch_inode_info *inode = to_bch_ei(ractl->mapping->host); struct bch_fs *c = inode->v.i_sb->s_fs_info; struct bch_io_opts opts = io_opts(c, &inode->ei_inode); struct btree_trans trans; - struct btree_iter *iter; struct page *page; struct readpages_iter readpages_iter; int ret; - ret = readpages_iter_init(&readpages_iter, mapping, pages, nr_pages); + ret = readpages_iter_init(&readpages_iter, ractl); BUG_ON(ret); bch2_trans_init(&trans, c, 0, 0); - iter = bch2_trans_get_iter(&trans, BTREE_ID_extents, POS_MIN, - BTREE_ITER_SLOTS); bch2_pagecache_add_get(&inode->ei_pagecache_lock); @@ -958,7 +1127,7 @@ int bch2_readpages(struct file *file, struct address_space *mapping, unsigned n = min_t(unsigned, readpages_iter.nr_pages - readpages_iter.idx, - BIO_MAX_PAGES); + BIO_MAX_VECS); struct bch_read_bio *rbio = rbio_init(bio_alloc_bioset(GFP_NOFS, n, &c->bio_read), opts); @@ -966,43 +1135,34 @@ int bch2_readpages(struct file *file, struct address_space *mapping, readpages_iter.idx++; bio_set_op_attrs(&rbio->bio, REQ_OP_READ, 0); - rbio->bio.bi_iter.bi_sector = (sector_t) index << PAGE_SECTOR_SHIFT; + rbio->bio.bi_iter.bi_sector = (sector_t) index << PAGE_SECTORS_SHIFT; rbio->bio.bi_end_io = bch2_readpages_end_io; BUG_ON(!bio_add_page(&rbio->bio, page, PAGE_SIZE, 0)); - bchfs_read(&trans, iter, rbio, inode->v.i_ino, + bchfs_read(&trans, rbio, inode_inum(inode), &readpages_iter); } bch2_pagecache_add_put(&inode->ei_pagecache_lock); - bch2_trans_iter_put(&trans, iter); bch2_trans_exit(&trans); kfree(readpages_iter.pages); - - return 0; } static void __bchfs_readpage(struct bch_fs *c, struct bch_read_bio *rbio, - u64 inum, struct page *page) + subvol_inum inum, struct page *page) { struct btree_trans trans; - struct btree_iter *iter; bch2_page_state_create(page, __GFP_NOFAIL); bio_set_op_attrs(&rbio->bio, REQ_OP_READ, REQ_SYNC); rbio->bio.bi_iter.bi_sector = - (sector_t) page->index << PAGE_SECTOR_SHIFT; + (sector_t) page->index << PAGE_SECTORS_SHIFT; BUG_ON(!bio_add_page(&rbio->bio, page, PAGE_SIZE, 0)); bch2_trans_init(&trans, c, 0, 0); - iter = bch2_trans_get_iter(&trans, BTREE_ID_extents, POS_MIN, - BTREE_ITER_SLOTS); - - bchfs_read(&trans, iter, rbio, inum, NULL); - - bch2_trans_iter_put(&trans, iter); + bchfs_read(&trans, rbio, inum, NULL); bch2_trans_exit(&trans); } @@ -1016,7 +1176,7 @@ int bch2_readpage(struct file *file, struct page *page) rbio = rbio_init(bio_alloc_bioset(GFP_NOFS, 1, &c->bio_read), opts); rbio->bio.bi_end_io = bch2_readpages_end_io; - __bchfs_readpage(c, rbio, inode->v.i_ino, page); + __bchfs_readpage(c, rbio, inode_inum(inode), page); return 0; } @@ -1039,7 +1199,7 @@ static int bch2_read_single_page(struct page *page, rbio->bio.bi_private = &done; rbio->bio.bi_end_io = bch2_read_single_page_end_io; - __bchfs_readpage(c, rbio, inode->v.i_ino, page); + __bchfs_readpage(c, rbio, inode_inum(inode), page); wait_for_completion(&done); ret = blk_status_to_errno(rbio->bio.bi_status); @@ -1081,36 +1241,37 @@ static void bch2_writepage_io_done(struct closure *cl) struct bch_writepage_io, cl); struct bch_fs *c = io->op.c; struct bio *bio = 
&io->op.wbio.bio; + struct bvec_iter_all iter; struct bio_vec *bvec; - unsigned i, j; + unsigned i; up(&io->op.c->io_in_flight); if (io->op.error) { set_bit(EI_INODE_ERROR, &io->inode->ei_flags); - bio_for_each_segment_all(bvec, bio, i) { + bio_for_each_segment_all(bvec, bio, iter) { struct bch_page_state *s; SetPageError(bvec->bv_page); - mapping_set_error(io->inode->v.i_mapping, -EIO); + mapping_set_error(bvec->bv_page->mapping, -EIO); s = __bch2_page_state(bvec->bv_page); spin_lock(&s->lock); - for (j = 0; j < PAGE_SECTORS; j++) - s->s[j].nr_replicas = 0; + for (i = 0; i < PAGE_SECTORS; i++) + s->s[i].nr_replicas = 0; spin_unlock(&s->lock); } } if (io->op.flags & BCH_WRITE_WROTE_DATA_INLINE) { - bio_for_each_segment_all(bvec, bio, i) { + bio_for_each_segment_all(bvec, bio, iter) { struct bch_page_state *s; s = __bch2_page_state(bvec->bv_page); spin_lock(&s->lock); - for (j = 0; j < PAGE_SECTORS; j++) - s->s[j].nr_replicas = 0; + for (i = 0; i < PAGE_SECTORS; i++) + s->s[i].nr_replicas = 0; spin_unlock(&s->lock); } } @@ -1134,7 +1295,7 @@ static void bch2_writepage_io_done(struct closure *cl) */ i_sectors_acct(c, io->inode, NULL, io->op.i_sectors_delta); - bio_for_each_segment_all(bvec, bio, i) { + bio_for_each_segment_all(bvec, bio, iter) { struct bch_page_state *s = __bch2_page_state(bvec->bv_page); if (atomic_dec_and_test(&s->write_count)) @@ -1168,8 +1329,7 @@ static void bch2_writepage_io_alloc(struct bch_fs *c, { struct bch_write_op *op; - w->io = container_of(bio_alloc_bioset(GFP_NOFS, - BIO_MAX_PAGES, + w->io = container_of(bio_alloc_bioset(GFP_NOFS, BIO_MAX_VECS, &c->writepage_bioset), struct bch_writepage_io, op.wbio.bio); @@ -1179,10 +1339,10 @@ static void bch2_writepage_io_alloc(struct bch_fs *c, op = &w->io->op; bch2_write_op_init(op, c, w->opts); op->target = w->opts.foreground_target; - op_journal_seq_set(op, &inode->ei_journal_seq); op->nr_replicas = nr_replicas; op->res.nr_replicas = nr_replicas; op->write_point = writepoint_hashed(inode->ei_last_dirtied); + op->subvol = inode->ei_subvol; op->pos = POS(inode->v.i_ino, sector); op->wbio.bio.bi_iter.bi_sector = sector; op->wbio.bio.bi_opf = wbc_to_write_flags(wbc); @@ -1225,16 +1385,16 @@ static int __bch2_writepage(struct page *page, do_io: s = bch2_page_state_create(page, __GFP_NOFAIL); - ret = bch2_get_page_disk_reservation(c, inode, page, true); - if (ret) { - SetPageError(page); - mapping_set_error(page->mapping, ret); - unlock_page(page); - return 0; - } + /* + * Things get really hairy with errors during writeback: + */ + ret = bch2_get_page_disk_reservation(c, inode, page, false); + BUG_ON(ret); /* Before unlocking the page, get copy of reservations: */ + spin_lock(&s->lock); orig = *s; + spin_unlock(&s->lock); for (i = 0; i < PAGE_SECTORS; i++) { if (s->s[i].state < SECTOR_DIRTY) @@ -1267,7 +1427,7 @@ do_io: offset = 0; while (1) { - unsigned sectors = 1, dirty_sectors = 0, reserved_sectors = 0; + unsigned sectors = 0, dirty_sectors = 0, reserved_sectors = 0; u64 sector; while (offset < PAGE_SECTORS && @@ -1277,22 +1437,21 @@ do_io: if (offset == PAGE_SECTORS) break; - sector = ((u64) page->index << PAGE_SECTOR_SHIFT) + offset; - while (offset + sectors < PAGE_SECTORS && - orig.s[offset + sectors].state >= SECTOR_DIRTY) + orig.s[offset + sectors].state >= SECTOR_DIRTY) { + reserved_sectors += orig.s[offset + sectors].replicas_reserved; + dirty_sectors += orig.s[offset + sectors].state == SECTOR_DIRTY; sectors++; - - for (i = offset; i < offset + sectors; i++) { - reserved_sectors += orig.s[i].replicas_reserved; - 
dirty_sectors += orig.s[i].state == SECTOR_DIRTY; } + BUG_ON(!sectors); + + sector = ((u64) page->index << PAGE_SECTORS_SHIFT) + offset; if (w->io && (w->io->op.res.nr_replicas != nr_replicas_this_write || - bio_full(&w->io->op.wbio.bio) || + bio_full(&w->io->op.wbio.bio, PAGE_SIZE) || w->io->op.wbio.bio.bi_iter.bi_size + (sectors << 9) >= - (BIO_MAX_PAGES * PAGE_SIZE) || + (BIO_MAX_VECS * PAGE_SIZE) || bio_end_sector(&w->io->op.wbio.bio) != sector)) bch2_writepage_do_io(w); @@ -1403,6 +1562,12 @@ readpage: if (ret) goto err; out: + if (!bch2_page_state_create(page, __GFP_NOFAIL)->uptodate) { + ret = bch2_page_state_set(c, inode_inum(inode), &page, 1); + if (ret) + goto out; + } + ret = bch2_page_reservation_get(c, inode, page, res, offset, len, true); if (ret) { @@ -1532,20 +1697,21 @@ static int __bch2_buffered_write(struct bch_inode_info *inode, } while (reserved < len) { - struct page *page = pages[(offset + reserved) >> PAGE_SHIFT]; + unsigned i = (offset + reserved) >> PAGE_SHIFT; + struct page *page = pages[i]; unsigned pg_offset = (offset + reserved) & (PAGE_SIZE - 1); unsigned pg_len = min_t(unsigned, len - reserved, PAGE_SIZE - pg_offset); -retry_reservation: - ret = bch2_page_reservation_get(c, inode, page, &res, - pg_offset, pg_len, true); - if (ret && !PageUptodate(page)) { - ret = bch2_read_single_page(page, mapping); - if (!ret) - goto retry_reservation; + if (!bch2_page_state_create(page, __GFP_NOFAIL)->uptodate) { + ret = bch2_page_state_set(c, inode_inum(inode), + pages + i, nr_pages - i); + if (ret) + goto out; } + ret = bch2_page_reservation_get(c, inode, page, &res, + pg_offset, pg_len, true); if (ret) goto out; @@ -1561,8 +1727,8 @@ retry_reservation: unsigned pg_offset = (offset + copied) & (PAGE_SIZE - 1); unsigned pg_len = min_t(unsigned, len - copied, PAGE_SIZE - pg_offset); - unsigned pg_copied = iov_iter_copy_from_user_atomic(page, - iter, pg_offset, pg_len); + unsigned pg_copied = copy_page_from_iter_atomic(page, + pg_offset, pg_len,iter); if (!pg_copied) break; @@ -1575,7 +1741,6 @@ retry_reservation: } flush_dcache_page(page); - iov_iter_advance(iter, pg_copied); copied += pg_copied; if (pg_copied != pg_len) @@ -1693,18 +1858,6 @@ again: /* O_DIRECT reads */ -static void bio_release_pages(struct bio *bio, bool mark_dirty) -{ - struct bio_vec *bvec; - unsigned i; - - bio_for_each_segment_all(bvec, bio, i) { - if (mark_dirty && !PageCompound(bvec->bv_page)) - set_page_dirty_lock(bvec->bv_page); - put_page(bvec->bv_page); - } -} - static void bio_check_or_release(struct bio *bio, bool check_dirty) { if (check_dirty) { @@ -1768,7 +1921,7 @@ static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter) iter->count -= shorten; bio = bio_alloc_bioset(GFP_KERNEL, - iov_iter_npages(iter, BIO_MAX_PAGES), + iov_iter_npages(iter, BIO_MAX_VECS), &c->dio_read_bioset); bio->bi_end_io = bch2_direct_IO_read_endio; @@ -1803,7 +1956,7 @@ static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter) goto start; while (iter->count) { bio = bio_alloc_bioset(GFP_KERNEL, - iov_iter_npages(iter, BIO_MAX_PAGES), + iov_iter_npages(iter, BIO_MAX_VECS), &c->bio_read); bio->bi_end_io = bch2_direct_IO_read_split_endio; start: @@ -1827,7 +1980,7 @@ start: if (iter->count) closure_get(&dio->cl); - bch2_read(c, rbio_init(bio, opts), inode->v.i_ino); + bch2_read(c, rbio_init(bio, opts), inode_inum(inode)); } iter->count += shorten; @@ -1882,6 +2035,50 @@ ssize_t bch2_read_iter(struct kiocb *iocb, struct iov_iter *iter) /* O_DIRECT writes */ +static bool 
bch2_check_range_allocated(struct bch_fs *c, subvol_inum inum, + u64 offset, u64 size, + unsigned nr_replicas, bool compressed) +{ + struct btree_trans trans; + struct btree_iter iter; + struct bkey_s_c k; + u64 end = offset + size; + u32 snapshot; + bool ret = true; + int err; + + bch2_trans_init(&trans, c, 0, 0); +retry: + bch2_trans_begin(&trans); + + err = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot); + if (err) + goto err; + + for_each_btree_key_norestart(&trans, iter, BTREE_ID_extents, + SPOS(inum.inum, offset, snapshot), + BTREE_ITER_SLOTS, k, err) { + if (bkey_cmp(bkey_start_pos(k.k), POS(inum.inum, end)) >= 0) + break; + + if (k.k->p.snapshot != snapshot || + nr_replicas > bch2_bkey_replicas(c, k) || + (!compressed && bch2_bkey_sectors_compressed(k))) { + ret = false; + break; + } + } + + offset = iter.pos.offset; + bch2_trans_iter_exit(&trans, &iter); +err: + if (err == -EINTR) + goto retry; + bch2_trans_exit(&trans); + + return err ? false : ret; +} + static void bch2_dio_write_loop_async(struct bch_write_op *); static long bch2_dio_write_loop(struct dio_write *dio) @@ -1892,8 +2089,9 @@ static long bch2_dio_write_loop(struct dio_write *dio) struct bch_inode_info *inode = file_bch_inode(req->ki_filp); struct bch_fs *c = inode->v.i_sb->s_fs_info; struct bio *bio = &dio->op.wbio.bio; + struct bvec_iter_all iter; struct bio_vec *bv; - unsigned i, unaligned, iter_count; + unsigned unaligned, iter_count; bool sync = dio->sync, dropped_locks; long ret; @@ -1906,7 +2104,7 @@ static long bch2_dio_write_loop(struct dio_write *dio) iter_count = dio->iter.count; if (kthread) - use_mm(dio->mm); + kthread_use_mm(dio->mm); BUG_ON(current->faults_disabled_mapping); current->faults_disabled_mapping = mapping; @@ -1916,7 +2114,7 @@ static long bch2_dio_write_loop(struct dio_write *dio) current->faults_disabled_mapping = NULL; if (kthread) - unuse_mm(dio->mm); + kthread_unuse_mm(dio->mm); /* * If the fault handler returned an error but also signalled @@ -1949,8 +2147,6 @@ static long bch2_dio_write_loop(struct dio_write *dio) * bio_iov_iter_get_pages was only able to get < * blocksize worth of pages: */ - bio_for_each_segment_all(bv, bio, i) - put_page(bv->bv_page); ret = -EFAULT; goto err; } @@ -1958,9 +2154,9 @@ static long bch2_dio_write_loop(struct dio_write *dio) bch2_write_op_init(&dio->op, c, io_opts(c, &inode->ei_inode)); dio->op.end_io = bch2_dio_write_loop_async; dio->op.target = dio->op.opts.foreground_target; - op_journal_seq_set(&dio->op, &inode->ei_journal_seq); dio->op.write_point = writepoint_hashed((unsigned long) current); dio->op.nr_replicas = dio->op.opts.data_replicas; + dio->op.subvol = inode->ei_subvol; dio->op.pos = POS(inode->v.i_ino, (u64) req->ki_pos >> 9); if ((req->ki_flags & IOCB_DSYNC) && @@ -1971,8 +2167,8 @@ static long bch2_dio_write_loop(struct dio_write *dio) ret = bch2_disk_reservation_get(c, &dio->op.res, bio_sectors(bio), dio->op.opts.data_replicas, 0); if (unlikely(ret) && - !bch2_check_range_allocated(c, dio->op.pos, - bio_sectors(bio), + !bch2_check_range_allocated(c, inode_inum(inode), + dio->op.pos.offset, bio_sectors(bio), dio->op.opts.data_replicas, dio->op.opts.compression != 0)) goto err; @@ -2015,8 +2211,10 @@ loop: i_size_write(&inode->v, req->ki_pos); spin_unlock(&inode->v.i_lock); - bio_for_each_segment_all(bv, bio, i) - put_page(bv->bv_page); + if (likely(!bio_flagged(bio, BIO_NO_PAGE_REF))) + bio_for_each_segment_all(bv, bio, iter) + put_page(bv->bv_page); + bio->bi_vcnt = 0; if (dio->op.error) { set_bit(EI_INODE_ERROR, 
&inode->ei_flags); @@ -2039,6 +2237,9 @@ err: if (dio->free_iov) kfree(dio->iter.iov); + if (likely(!bio_flagged(bio, BIO_NO_PAGE_REF))) + bio_for_each_segment_all(bv, bio, iter) + put_page(bv->bv_page); bio_put(bio); /* inode->i_dio_count is our ref on inode and thus bch_fs */ @@ -2105,7 +2306,9 @@ ssize_t bch2_direct_write(struct kiocb *req, struct iov_iter *iter) } bio = bio_alloc_bioset(GFP_KERNEL, - iov_iter_npages(iter, BIO_MAX_PAGES), + iov_iter_is_bvec(iter) + ? 0 + : iov_iter_npages(iter, BIO_MAX_VECS), &c->dio_write_bioset); dio = container_of(bio, struct dio_write, op.wbio.bio); init_completion(&dio->done); @@ -2182,45 +2385,58 @@ unlock: /* fsync: */ -int bch2_fsync(struct file *file, loff_t start, loff_t end, int datasync) +/* + * inode->ei_inode.bi_journal_seq won't be up to date since it's set in an + * insert trigger: look up the btree inode instead + */ +static int bch2_flush_inode(struct bch_fs *c, subvol_inum inum) { - struct bch_inode_info *inode = file_bch_inode(file); - struct bch_fs *c = inode->v.i_sb->s_fs_info; - int ret, ret2; + struct bch_inode_unpacked inode; + int ret; - ret = file_write_and_wait_range(file, start, end); + if (c->opts.journal_flush_disabled) + return 0; + + ret = bch2_inode_find_by_inum(c, inum, &inode); if (ret) return ret; - if (datasync && !(inode->v.i_state & I_DIRTY_DATASYNC)) - goto out; + return bch2_journal_flush_seq(&c->journal, inode.bi_journal_seq); +} - ret = sync_inode_metadata(&inode->v, 1); - if (ret) - return ret; -out: - if (!c->opts.journal_flush_disabled) - ret = bch2_journal_flush_seq(&c->journal, - inode->ei_journal_seq); - ret2 = file_check_and_advance_wb_err(file); +int bch2_fsync(struct file *file, loff_t start, loff_t end, int datasync) +{ + struct bch_inode_info *inode = file_bch_inode(file); + struct bch_fs *c = inode->v.i_sb->s_fs_info; + int ret, ret2, ret3; - return ret ?: ret2; + ret = file_write_and_wait_range(file, start, end); + ret2 = sync_inode_metadata(&inode->v, 1); + ret3 = bch2_flush_inode(c, inode_inum(inode)); + + return ret ?: ret2 ?: ret3; } /* truncate: */ -static inline int range_has_data(struct bch_fs *c, - struct bpos start, - struct bpos end) +static inline int range_has_data(struct bch_fs *c, u32 subvol, + struct bpos start, + struct bpos end) { struct btree_trans trans; - struct btree_iter *iter; + struct btree_iter iter; struct bkey_s_c k; int ret = 0; bch2_trans_init(&trans, c, 0, 0); +retry: + bch2_trans_begin(&trans); - for_each_btree_key(&trans, iter, BTREE_ID_extents, start, 0, k, ret) { + ret = bch2_subvolume_get_snapshot(&trans, subvol, &start.snapshot); + if (ret) + goto err; + + for_each_btree_key_norestart(&trans, iter, BTREE_ID_extents, start, 0, k, ret) { if (bkey_cmp(bkey_start_pos(k.k), end) >= 0) break; @@ -2229,9 +2445,14 @@ static inline int range_has_data(struct bch_fs *c, break; } } - bch2_trans_iter_put(&trans, iter); + start = iter.pos; + bch2_trans_iter_exit(&trans, &iter); +err: + if (ret == -EINTR) + goto retry; - return bch2_trans_exit(&trans) ?: ret; + bch2_trans_exit(&trans); + return ret; } static int __bch2_truncate_page(struct bch_inode_info *inode, @@ -2244,6 +2465,7 @@ static int __bch2_truncate_page(struct bch_inode_info *inode, unsigned end_offset = ((end - 1) & (PAGE_SIZE - 1)) + 1; unsigned i; struct page *page; + s64 i_sectors_delta = 0; int ret = 0; /* Page boundary? 
Nothing to do */ @@ -2261,9 +2483,9 @@ static int __bch2_truncate_page(struct bch_inode_info *inode, * XXX: we're doing two index lookups when we end up reading the * page */ - ret = range_has_data(c, - POS(inode->v.i_ino, index << PAGE_SECTOR_SHIFT), - POS(inode->v.i_ino, (index + 1) << PAGE_SECTOR_SHIFT)); + ret = range_has_data(c, inode->ei_subvol, + POS(inode->v.i_ino, index << PAGE_SECTORS_SHIFT), + POS(inode->v.i_ino, (index + 1) << PAGE_SECTORS_SHIFT)); if (ret <= 0) return ret; @@ -2295,9 +2517,21 @@ static int __bch2_truncate_page(struct bch_inode_info *inode, i < round_down(end_offset, block_bytes(c)) >> 9; i++) { s->s[i].nr_replicas = 0; + if (s->s[i].state == SECTOR_DIRTY) + i_sectors_delta--; s->s[i].state = SECTOR_UNALLOCATED; } + i_sectors_acct(c, inode, NULL, i_sectors_delta); + + /* + * Caller needs to know whether this page will be written out by + * writeback - doing an i_size update if necessary - or whether it will + * be responsible for the i_size update: + */ + ret = s->s[(min_t(u64, inode->v.i_size - (index << PAGE_SHIFT), + PAGE_SIZE) - 1) >> 9].state >= SECTOR_DIRTY; + zero_user_segment(page, start_offset, end_offset); /* @@ -2306,8 +2540,7 @@ static int __bch2_truncate_page(struct bch_inode_info *inode, * XXX: because we aren't currently tracking whether the page has actual * data in it (vs. just 0s, or only partially written) this wrong. ick. */ - ret = bch2_get_page_disk_reservation(c, inode, page, false); - BUG_ON(ret); + BUG_ON(bch2_get_page_disk_reservation(c, inode, page, false)); /* * This removes any writeable userspace mappings; we need to force @@ -2329,11 +2562,25 @@ static int bch2_truncate_page(struct bch_inode_info *inode, loff_t from) from, round_up(from, PAGE_SIZE)); } -static int bch2_extend(struct bch_inode_info *inode, +static int bch2_truncate_pages(struct bch_inode_info *inode, + loff_t start, loff_t end) +{ + int ret = __bch2_truncate_page(inode, start >> PAGE_SHIFT, + start, end); + + if (ret >= 0 && + start >> PAGE_SHIFT != end >> PAGE_SHIFT) + ret = __bch2_truncate_page(inode, + end >> PAGE_SHIFT, + start, end); + return ret; +} + +static int bch2_extend(struct user_namespace *mnt_userns, + struct bch_inode_info *inode, struct bch_inode_unpacked *inode_u, struct iattr *iattr) { - struct bch_fs *c = inode->v.i_sb->s_fs_info; struct address_space *mapping = inode->v.i_mapping; int ret; @@ -2347,24 +2594,15 @@ static int bch2_extend(struct bch_inode_info *inode, return ret; truncate_setsize(&inode->v, iattr->ia_size); - setattr_copy(&inode->v, iattr); - mutex_lock(&inode->ei_update_lock); - ret = bch2_write_inode_size(c, inode, inode->v.i_size, - ATTR_MTIME|ATTR_CTIME); - mutex_unlock(&inode->ei_update_lock); - - return ret; + return bch2_setattr_nonsize(mnt_userns, inode, iattr); } static int bch2_truncate_finish_fn(struct bch_inode_info *inode, struct bch_inode_unpacked *bi, void *p) { - struct bch_fs *c = inode->v.i_sb->s_fs_info; - bi->bi_flags &= ~BCH_INODE_I_SIZE_DIRTY; - bi->bi_mtime = bi->bi_ctime = bch2_current_time(c); return 0; } @@ -2378,30 +2616,33 @@ static int bch2_truncate_start_fn(struct bch_inode_info *inode, return 0; } -int bch2_truncate(struct bch_inode_info *inode, struct iattr *iattr) +int bch2_truncate(struct user_namespace *mnt_userns, + struct bch_inode_info *inode, struct iattr *iattr) { struct bch_fs *c = inode->v.i_sb->s_fs_info; struct address_space *mapping = inode->v.i_mapping; struct bch_inode_unpacked inode_u; - struct btree_trans trans; - struct btree_iter *iter; u64 new_i_size = iattr->ia_size; s64 
i_sectors_delta = 0;
 	int ret = 0;

-	inode_dio_wait(&inode->v);
-	bch2_pagecache_block_get(&inode->ei_pagecache_lock);
-
 	/*
-	 * fetch current on disk i_size: inode is locked, i_size can only
-	 * increase underneath us:
+	 * If the truncate call will change the size of the file, the
+	 * cmtimes should be updated. If the size will not change, we
+	 * do not need to update the cmtimes.
 	 */
-	bch2_trans_init(&trans, c, 0, 0);
-	iter = bch2_inode_peek(&trans, &inode_u, inode->v.i_ino, 0);
-	ret = PTR_ERR_OR_ZERO(iter);
-	bch2_trans_iter_put(&trans, iter);
-	bch2_trans_exit(&trans);
+	if (iattr->ia_size != inode->v.i_size) {
+		if (!(iattr->ia_valid & ATTR_MTIME))
+			ktime_get_coarse_real_ts64(&iattr->ia_mtime);
+		if (!(iattr->ia_valid & ATTR_CTIME))
+			ktime_get_coarse_real_ts64(&iattr->ia_ctime);
+		iattr->ia_valid |= ATTR_MTIME|ATTR_CTIME;
+	}
+	inode_dio_wait(&inode->v);
+	bch2_pagecache_block_get(&inode->ei_pagecache_lock);
+
+	ret = bch2_inode_find_by_inum(c, inode_inum(inode), &inode_u);
 	if (ret)
 		goto err;
@@ -2418,12 +2659,14 @@ int bch2_truncate(struct bch_inode_info *inode, struct iattr *iattr)
 		inode->v.i_size < inode_u.bi_size);

 	if (iattr->ia_size > inode->v.i_size) {
-		ret = bch2_extend(inode, &inode_u, iattr);
+		ret = bch2_extend(mnt_userns, inode, &inode_u, iattr);
 		goto err;
 	}

+	iattr->ia_valid &= ~ATTR_SIZE;
+
 	ret = bch2_truncate_page(inode, iattr->ia_size);
-	if (unlikely(ret))
+	if (unlikely(ret < 0))
 		goto err;

 	/*
@@ -2457,20 +2700,21 @@ int bch2_truncate(struct bch_inode_info *inode, struct iattr *iattr)

 	truncate_setsize(&inode->v, iattr->ia_size);

-	ret = bch2_fpunch(c, inode->v.i_ino,
+	ret = bch2_fpunch(c, inode_inum(inode),
 			round_up(iattr->ia_size, block_bytes(c)) >> 9,
-			U64_MAX, &inode->ei_journal_seq, &i_sectors_delta);
+			U64_MAX, &i_sectors_delta);
 	i_sectors_acct(c, inode, NULL, i_sectors_delta);

+	BUG_ON(!inode->v.i_size && inode->v.i_blocks);
+
 	if (unlikely(ret))
 		goto err;

-	setattr_copy(&inode->v, iattr);
-
 	mutex_lock(&inode->ei_update_lock);
-	ret = bch2_write_inode(c, inode, bch2_truncate_finish_fn, NULL,
-			       ATTR_MTIME|ATTR_CTIME);
+	ret = bch2_write_inode(c, inode, bch2_truncate_finish_fn, NULL, 0);
 	mutex_unlock(&inode->ei_update_lock);
+
+	ret = bch2_setattr_nonsize(mnt_userns, inode, iattr);
 err:
 	bch2_pagecache_block_put(&inode->ei_pagecache_lock);
 	return ret;
@@ -2490,49 +2734,39 @@ static int inode_update_times_fn(struct bch_inode_info *inode,

 static long bchfs_fpunch(struct bch_inode_info *inode, loff_t offset, loff_t len)
 {
 	struct bch_fs *c = inode->v.i_sb->s_fs_info;
-	u64 discard_start = round_up(offset, block_bytes(c)) >> 9;
-	u64 discard_end = round_down(offset + len, block_bytes(c)) >> 9;
+	u64 end = offset + len;
+	u64 block_start = round_up(offset, block_bytes(c));
+	u64 block_end = round_down(end, block_bytes(c));
+	bool truncated_last_page;
 	int ret = 0;

-	inode_lock(&inode->v);
-	inode_dio_wait(&inode->v);
-	bch2_pagecache_block_get(&inode->ei_pagecache_lock);
-
-	ret = __bch2_truncate_page(inode,
-				   offset >> PAGE_SHIFT,
-				   offset, offset + len);
-	if (unlikely(ret))
+	ret = bch2_truncate_pages(inode, offset, end);
+	if (unlikely(ret < 0))
 		goto err;

-	if (offset >> PAGE_SHIFT !=
-	    (offset + len) >> PAGE_SHIFT) {
-		ret = __bch2_truncate_page(inode,
-					   (offset + len) >> PAGE_SHIFT,
-					   offset, offset + len);
-		if (unlikely(ret))
-			goto err;
-	}
+	truncated_last_page = ret;

-	truncate_pagecache_range(&inode->v, offset, offset + len - 1);
+	truncate_pagecache_range(&inode->v, offset, end - 1);

-	if (discard_start < discard_end) {
+	if (block_start < block_end) {
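/*
 * Editor's note, placed at the start of the block-aligned punch branch: the
 * rewritten bchfs_fpunch() only punches whole filesystem blocks out of the
 * extents btree, and the unaligned head and tail of the range are zeroed
 * through the page cache by bch2_truncate_pages(). Below is a minimal
 * userspace sketch of that alignment arithmetic; the helper names are
 * illustrative stand-ins, not the kernel's round_up()/round_down() macros
 * or any bcachefs API.
 */
#include <stdint.h>
#include <stdio.h>

static uint64_t round_up_to(uint64_t v, uint64_t b)   { return (v + b - 1) / b * b; }
static uint64_t round_down_to(uint64_t v, uint64_t b) { return v / b * b; }

int main(void)
{
	uint64_t block_bytes = 4096, offset = 5000, len = 20000;
	uint64_t end = offset + len;
	uint64_t block_start = round_up_to(offset, block_bytes);
	uint64_t block_end = round_down_to(end, block_bytes);

	/* Partial head/tail blocks are zeroed in the page cache, not punched: */
	printf("zero head:  [%llu, %llu)\n",
	       (unsigned long long) offset, (unsigned long long) block_start);
	if (block_start < block_end)
		printf("punch sectors: [%llu, %llu)\n",
		       (unsigned long long) (block_start >> 9),
		       (unsigned long long) (block_end >> 9));
	printf("zero tail:  [%llu, %llu)\n",
	       (unsigned long long) block_end, (unsigned long long) end);
	return 0;
}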
s64 i_sectors_delta = 0; - ret = bch2_fpunch(c, inode->v.i_ino, - discard_start, discard_end, - &inode->ei_journal_seq, + ret = bch2_fpunch(c, inode_inum(inode), + block_start >> 9, block_end >> 9, &i_sectors_delta); i_sectors_acct(c, inode, NULL, i_sectors_delta); } mutex_lock(&inode->ei_update_lock); - ret = bch2_write_inode(c, inode, inode_update_times_fn, NULL, - ATTR_MTIME|ATTR_CTIME) ?: ret; + if (end >= inode->v.i_size && !truncated_last_page) { + ret = bch2_write_inode_size(c, inode, inode->v.i_size, + ATTR_MTIME|ATTR_CTIME); + } else { + ret = bch2_write_inode(c, inode, inode_update_times_fn, NULL, + ATTR_MTIME|ATTR_CTIME); + } mutex_unlock(&inode->ei_update_lock); err: - bch2_pagecache_block_put(&inode->ei_pagecache_lock); - inode_unlock(&inode->v); - return ret; } @@ -2544,7 +2778,7 @@ static long bchfs_fcollapse_finsert(struct bch_inode_info *inode, struct address_space *mapping = inode->v.i_mapping; struct bkey_buf copy; struct btree_trans trans; - struct btree_iter *src, *dst, *del; + struct btree_iter src, dst, del; loff_t shift, new_size; u64 src_start; int ret = 0; @@ -2552,31 +2786,18 @@ static long bchfs_fcollapse_finsert(struct bch_inode_info *inode, if ((offset | len) & (block_bytes(c) - 1)) return -EINVAL; - /* - * We need i_mutex to keep the page cache consistent with the extents - * btree, and the btree consistent with i_size - we don't need outside - * locking for the extents btree itself, because we're using linked - * iterators - */ - inode_lock(&inode->v); - inode_dio_wait(&inode->v); - bch2_pagecache_block_get(&inode->ei_pagecache_lock); - if (insert) { - ret = -EFBIG; if (inode->v.i_sb->s_maxbytes - inode->v.i_size < len) - goto err; + return -EFBIG; - ret = -EINVAL; if (offset >= inode->v.i_size) - goto err; + return -EINVAL; src_start = U64_MAX; shift = len; } else { - ret = -EINVAL; if (offset + len >= inode->v.i_size) - goto err; + return -EINVAL; src_start = offset + len; shift = -len; @@ -2586,7 +2807,7 @@ static long bchfs_fcollapse_finsert(struct bch_inode_info *inode, ret = write_invalidate_inode_pages_range(mapping, offset, LLONG_MAX); if (ret) - goto err; + return ret; if (insert) { i_size_write(&inode->v, new_size); @@ -2597,23 +2818,22 @@ static long bchfs_fcollapse_finsert(struct bch_inode_info *inode, } else { s64 i_sectors_delta = 0; - ret = bch2_fpunch(c, inode->v.i_ino, + ret = bch2_fpunch(c, inode_inum(inode), offset >> 9, (offset + len) >> 9, - &inode->ei_journal_seq, &i_sectors_delta); i_sectors_acct(c, inode, NULL, i_sectors_delta); if (ret) - goto err; + return ret; } bch2_bkey_buf_init(©); bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024); - src = bch2_trans_get_iter(&trans, BTREE_ID_extents, + bch2_trans_iter_init(&trans, &src, BTREE_ID_extents, POS(inode->v.i_ino, src_start >> 9), BTREE_ITER_INTENT); - dst = bch2_trans_copy_iter(&trans, src); - del = bch2_trans_copy_iter(&trans, src); + bch2_trans_copy_iter(&dst, &src); + bch2_trans_copy_iter(&del, &src); while (ret == 0 || ret == -EINTR) { struct disk_reservation disk_res = @@ -2624,10 +2844,24 @@ static long bchfs_fcollapse_finsert(struct bch_inode_info *inode, struct bpos move_pos = POS(inode->v.i_ino, offset >> 9); struct bpos atomic_end; unsigned trigger_flags = 0; + u32 snapshot; + + bch2_trans_begin(&trans); + + ret = bch2_subvolume_get_snapshot(&trans, + inode->ei_subvol, &snapshot); + if (ret) + continue; + + bch2_btree_iter_set_snapshot(&src, snapshot); + bch2_btree_iter_set_snapshot(&dst, snapshot); + bch2_btree_iter_set_snapshot(&del, snapshot); + + 
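/*
 * Editor's note: the collapse/insert loop above now follows the pattern this
 * commit applies throughout the file: every iteration re-begins the btree
 * transaction, re-resolves the subvolume to a snapshot ID, and treats -EINTR
 * as a transaction restart rather than a fatal error. A simplified userspace
 * model of that control flow follows; do_one_step() is an invented stand-in
 * for the real btree work, not part of the bcachefs API.
 */
#include <errno.h>
#include <stdio.h>

static int attempts;

/* Pretend the first two attempts hit a lock restart: */
static int do_one_step(void)
{
	return ++attempts < 3 ? -EINTR : 0;
}

int main(void)
{
	int ret;

	do {
		/* corresponds to bch2_trans_begin(): restart from a clean slate */
		ret = do_one_step();
		/* here -EINTR means "deadlock-avoidance restart", not a signal */
	} while (ret == -EINTR);

	printf("done after %d attempts, ret = %d\n", attempts, ret);
	return ret;
}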
bch2_trans_begin(&trans); k = insert - ? bch2_btree_iter_peek_prev(src) - : bch2_btree_iter_peek(src); + ? bch2_btree_iter_peek_prev(&src) + : bch2_btree_iter_peek(&src); if ((ret = bkey_err(k))) continue; @@ -2645,9 +2879,9 @@ reassemble: bch2_cut_front(move_pos, copy.k); copy.k->k.p.offset += shift >> 9; - bch2_btree_iter_set_pos(dst, bkey_start_pos(©.k->k)); + bch2_btree_iter_set_pos(&dst, bkey_start_pos(©.k->k)); - ret = bch2_extent_atomic_end(dst, copy.k, &atomic_end); + ret = bch2_extent_atomic_end(&trans, &dst, copy.k, &atomic_end); if (ret) continue; @@ -2665,7 +2899,7 @@ reassemble: delete.k.p = copy.k->k.p; delete.k.size = copy.k->k.size; delete.k.p.offset -= shift >> 9; - bch2_btree_iter_set_pos(del, bkey_start_pos(&delete.k)); + bch2_btree_iter_set_pos(&del, bkey_start_pos(&delete.k)); next_pos = insert ? bkey_start_pos(&delete.k) : delete.k.p; @@ -2686,35 +2920,36 @@ reassemble: BUG_ON(ret); } - ret = bch2_trans_update(&trans, del, &delete, trigger_flags) ?: - bch2_trans_update(&trans, dst, copy.k, trigger_flags) ?: - bch2_trans_commit(&trans, &disk_res, - &inode->ei_journal_seq, + ret = bch2_btree_iter_traverse(&del) ?: + bch2_trans_update(&trans, &del, &delete, trigger_flags) ?: + bch2_trans_update(&trans, &dst, copy.k, trigger_flags) ?: + bch2_trans_commit(&trans, &disk_res, NULL, BTREE_INSERT_NOFAIL); bch2_disk_reservation_put(c, &disk_res); if (!ret) - bch2_btree_iter_set_pos(src, next_pos); + bch2_btree_iter_set_pos(&src, next_pos); } - bch2_trans_iter_put(&trans, del); - bch2_trans_iter_put(&trans, dst); - bch2_trans_iter_put(&trans, src); + bch2_trans_iter_exit(&trans, &del); + bch2_trans_iter_exit(&trans, &dst); + bch2_trans_iter_exit(&trans, &src); bch2_trans_exit(&trans); bch2_bkey_buf_exit(©, c); if (ret) - goto err; + return ret; + mutex_lock(&inode->ei_update_lock); if (!insert) { i_size_write(&inode->v, new_size); - mutex_lock(&inode->ei_update_lock); ret = bch2_write_inode_size(c, inode, new_size, ATTR_MTIME|ATTR_CTIME); - mutex_unlock(&inode->ei_update_lock); + } else { + /* We need an inode update to update bi_journal_seq for fsync: */ + ret = bch2_write_inode(c, inode, inode_update_times_fn, NULL, + ATTR_MTIME|ATTR_CTIME); } -err: - bch2_pagecache_block_put(&inode->ei_pagecache_lock); - inode_unlock(&inode->v); + mutex_unlock(&inode->ei_update_lock); return ret; } @@ -2723,41 +2958,49 @@ static int __bchfs_fallocate(struct bch_inode_info *inode, int mode, { struct bch_fs *c = inode->v.i_sb->s_fs_info; struct btree_trans trans; - struct btree_iter *iter; + struct btree_iter iter; struct bpos end_pos = POS(inode->v.i_ino, end_sector); unsigned replicas = io_opts(c, &inode->ei_inode).data_replicas; int ret = 0; bch2_trans_init(&trans, c, BTREE_ITER_MAX, 512); - iter = bch2_trans_get_iter(&trans, BTREE_ID_extents, + bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents, POS(inode->v.i_ino, start_sector), BTREE_ITER_SLOTS|BTREE_ITER_INTENT); - while (!ret && bkey_cmp(iter->pos, end_pos) < 0) { + while (!ret && bkey_cmp(iter.pos, end_pos) < 0) { s64 i_sectors_delta = 0; struct disk_reservation disk_res = { 0 }; struct quota_res quota_res = { 0 }; struct bkey_i_reservation reservation; struct bkey_s_c k; unsigned sectors; + u32 snapshot; bch2_trans_begin(&trans); - k = bch2_btree_iter_peek_slot(iter); + ret = bch2_subvolume_get_snapshot(&trans, + inode->ei_subvol, &snapshot); + if (ret) + goto bkey_err; + + bch2_btree_iter_set_snapshot(&iter, snapshot); + + k = bch2_btree_iter_peek_slot(&iter); if ((ret = bkey_err(k))) goto bkey_err; /* already reserved */ if 
(k.k->type == KEY_TYPE_reservation && bkey_s_c_to_reservation(k).v->nr_replicas >= replicas) { - bch2_btree_iter_next_slot(iter); + bch2_btree_iter_advance(&iter); continue; } if (bkey_extent_is_data(k.k) && !(mode & FALLOC_FL_ZERO_RANGE)) { - bch2_btree_iter_next_slot(iter); + bch2_btree_iter_advance(&iter); continue; } @@ -2766,7 +3009,7 @@ static int __bchfs_fallocate(struct bch_inode_info *inode, int mode, reservation.k.p = k.k->p; reservation.k.size = k.k->size; - bch2_cut_front(iter->pos, &reservation.k_i); + bch2_cut_front(iter.pos, &reservation.k_i); bch2_cut_back(end_pos, &reservation.k_i); sectors = reservation.k.size; @@ -2790,9 +3033,12 @@ static int __bchfs_fallocate(struct bch_inode_info *inode, int mode, reservation.v.nr_replicas = disk_res.nr_replicas; } - ret = bch2_extent_update(&trans, iter, &reservation.k_i, - &disk_res, &inode->ei_journal_seq, + ret = bch2_extent_update(&trans, inode_inum(inode), &iter, + &reservation.k_i, + &disk_res, NULL, 0, &i_sectors_delta, true); + if (ret) + goto bkey_err; i_sectors_acct(c, inode, "a_res, i_sectors_delta); bkey_err: bch2_quota_reservation_put(c, inode, "a_res); @@ -2800,7 +3046,21 @@ bkey_err: if (ret == -EINTR) ret = 0; } - bch2_trans_iter_put(&trans, iter); + + bch2_trans_unlock(&trans); /* lock ordering, before taking pagecache locks: */ + mark_pagecache_reserved(inode, start_sector, iter.pos.offset); + + if (ret == -ENOSPC && (mode & FALLOC_FL_ZERO_RANGE)) { + struct quota_res quota_res = { 0 }; + s64 i_sectors_delta = 0; + + bch2_fpunch_at(&trans, &iter, inode_inum(inode), + end_sector, &i_sectors_delta); + i_sectors_acct(c, inode, "a_res, i_sectors_delta); + bch2_quota_reservation_put(c, inode, "a_res); + } + + bch2_trans_iter_exit(&trans, &iter); bch2_trans_exit(&trans); return ret; } @@ -2808,77 +3068,58 @@ bkey_err: static long bchfs_fallocate(struct bch_inode_info *inode, int mode, loff_t offset, loff_t len) { - struct address_space *mapping = inode->v.i_mapping; struct bch_fs *c = inode->v.i_sb->s_fs_info; - loff_t end = offset + len; - loff_t block_start = round_down(offset, block_bytes(c)); - loff_t block_end = round_up(end, block_bytes(c)); - int ret; - - inode_lock(&inode->v); - inode_dio_wait(&inode->v); - bch2_pagecache_block_get(&inode->ei_pagecache_lock); + u64 end = offset + len; + u64 block_start = round_down(offset, block_bytes(c)); + u64 block_end = round_up(end, block_bytes(c)); + bool truncated_last_page = false; + int ret, ret2 = 0; if (!(mode & FALLOC_FL_KEEP_SIZE) && end > inode->v.i_size) { ret = inode_newsize_ok(&inode->v, end); if (ret) - goto err; + return ret; } if (mode & FALLOC_FL_ZERO_RANGE) { - ret = __bch2_truncate_page(inode, - offset >> PAGE_SHIFT, - offset, end); - - if (!ret && - offset >> PAGE_SHIFT != end >> PAGE_SHIFT) - ret = __bch2_truncate_page(inode, - end >> PAGE_SHIFT, - offset, end); + ret = bch2_truncate_pages(inode, offset, end); + if (unlikely(ret < 0)) + return ret; - if (unlikely(ret)) - goto err; + truncated_last_page = ret; truncate_pagecache_range(&inode->v, offset, end - 1); + + block_start = round_up(offset, block_bytes(c)); + block_end = round_down(end, block_bytes(c)); } ret = __bchfs_fallocate(inode, mode, block_start >> 9, block_end >> 9); - if (ret) - goto err; /* - * Do we need to extend the file? 
- * - * If we zeroed up to the end of the file, we dropped whatever writes - * were going to write out the current i_size, so we have to extend - * manually even if FL_KEEP_SIZE was set: + * On -ENOSPC in ZERO_RANGE mode, we still want to do the inode update, + * so that the VFS cache i_size is consistent with the btree i_size: */ - if (end >= inode->v.i_size && - (!(mode & FALLOC_FL_KEEP_SIZE) || - (mode & FALLOC_FL_ZERO_RANGE))) { + if (ret && + !(ret == -ENOSPC && (mode & FALLOC_FL_ZERO_RANGE))) + return ret; - /* - * Sync existing appends before extending i_size, - * as in bch2_extend(): - */ - ret = filemap_write_and_wait_range(mapping, - inode->ei_inode.bi_size, S64_MAX); - if (ret) - goto err; + if (mode & FALLOC_FL_KEEP_SIZE && end > inode->v.i_size) + end = inode->v.i_size; - if (mode & FALLOC_FL_KEEP_SIZE) - end = inode->v.i_size; - else - i_size_write(&inode->v, end); + if (end >= inode->v.i_size && + (((mode & FALLOC_FL_ZERO_RANGE) && !truncated_last_page) || + !(mode & FALLOC_FL_KEEP_SIZE))) { + spin_lock(&inode->v.i_lock); + i_size_write(&inode->v, end); + spin_unlock(&inode->v.i_lock); mutex_lock(&inode->ei_update_lock); - ret = bch2_write_inode_size(c, inode, end, 0); + ret2 = bch2_write_inode_size(c, inode, end, 0); mutex_unlock(&inode->ei_update_lock); } -err: - bch2_pagecache_block_put(&inode->ei_pagecache_lock); - inode_unlock(&inode->v); - return ret; + + return ret ?: ret2; } long bch2_fallocate_dispatch(struct file *file, int mode, @@ -2891,6 +3132,10 @@ long bch2_fallocate_dispatch(struct file *file, int mode, if (!percpu_ref_tryget(&c->writes)) return -EROFS; + inode_lock(&inode->v); + inode_dio_wait(&inode->v); + bch2_pagecache_block_get(&inode->ei_pagecache_lock); + if (!(mode & ~(FALLOC_FL_KEEP_SIZE|FALLOC_FL_ZERO_RANGE))) ret = bchfs_fallocate(inode, mode, offset, len); else if (mode == (FALLOC_FL_PUNCH_HOLE|FALLOC_FL_KEEP_SIZE)) @@ -2902,277 +3147,14 @@ long bch2_fallocate_dispatch(struct file *file, int mode, else ret = -EOPNOTSUPP; + + bch2_pagecache_block_put(&inode->ei_pagecache_lock); + inode_unlock(&inode->v); percpu_ref_put(&c->writes); return ret; } -static void mark_range_unallocated(struct bch_inode_info *inode, - loff_t start, loff_t end) -{ - pgoff_t index = start >> PAGE_SHIFT; - pgoff_t end_index = (end - 1) >> PAGE_SHIFT; - struct pagevec pvec; - - pagevec_init(&pvec); - - do { - unsigned nr_pages, i, j; - - nr_pages = pagevec_lookup_range(&pvec, inode->v.i_mapping, - &index, end_index); - if (nr_pages == 0) - break; - - for (i = 0; i < nr_pages; i++) { - struct page *page = pvec.pages[i]; - struct bch_page_state *s; - - lock_page(page); - s = bch2_page_state(page); - - if (s) { - spin_lock(&s->lock); - for (j = 0; j < PAGE_SECTORS; j++) - s->s[j].nr_replicas = 0; - spin_unlock(&s->lock); - } - - unlock_page(page); - } - pagevec_release(&pvec); - } while (index <= end_index); -} - -static int generic_access_check_limits(struct file *file, loff_t pos, - loff_t *count) -{ - struct inode *inode = file->f_mapping->host; - loff_t max_size = inode->i_sb->s_maxbytes; - - if (!(file->f_flags & O_LARGEFILE)) - max_size = MAX_NON_LFS; - - if (unlikely(pos >= max_size)) - return -EFBIG; - *count = min(*count, max_size - pos); - return 0; -} - -static int generic_write_check_limits(struct file *file, loff_t pos, - loff_t *count) -{ - loff_t limit = rlimit(RLIMIT_FSIZE); - - if (limit != RLIM_INFINITY) { - if (pos >= limit) { - send_sig(SIGXFSZ, current, 0); - return -EFBIG; - } - *count = min(*count, limit - pos); - } - - return 
generic_access_check_limits(file, pos, count); -} - -static int generic_remap_checks(struct file *file_in, loff_t pos_in, - struct file *file_out, loff_t pos_out, - loff_t *req_count, unsigned int remap_flags) -{ - struct inode *inode_in = file_in->f_mapping->host; - struct inode *inode_out = file_out->f_mapping->host; - uint64_t count = *req_count; - uint64_t bcount; - loff_t size_in, size_out; - loff_t bs = inode_out->i_sb->s_blocksize; - int ret; - - /* The start of both ranges must be aligned to an fs block. */ - if (!IS_ALIGNED(pos_in, bs) || !IS_ALIGNED(pos_out, bs)) - return -EINVAL; - - /* Ensure offsets don't wrap. */ - if (pos_in + count < pos_in || pos_out + count < pos_out) - return -EINVAL; - - size_in = i_size_read(inode_in); - size_out = i_size_read(inode_out); - - /* Dedupe requires both ranges to be within EOF. */ - if ((remap_flags & REMAP_FILE_DEDUP) && - (pos_in >= size_in || pos_in + count > size_in || - pos_out >= size_out || pos_out + count > size_out)) - return -EINVAL; - - /* Ensure the infile range is within the infile. */ - if (pos_in >= size_in) - return -EINVAL; - count = min(count, size_in - (uint64_t)pos_in); - - ret = generic_access_check_limits(file_in, pos_in, &count); - if (ret) - return ret; - - ret = generic_write_check_limits(file_out, pos_out, &count); - if (ret) - return ret; - - /* - * If the user wanted us to link to the infile's EOF, round up to the - * next block boundary for this check. - * - * Otherwise, make sure the count is also block-aligned, having - * already confirmed the starting offsets' block alignment. - */ - if (pos_in + count == size_in) { - bcount = ALIGN(size_in, bs) - pos_in; - } else { - if (!IS_ALIGNED(count, bs)) - count = ALIGN_DOWN(count, bs); - bcount = count; - } - - /* Don't allow overlapped cloning within the same file. */ - if (inode_in == inode_out && - pos_out + bcount > pos_in && - pos_out < pos_in + bcount) - return -EINVAL; - - /* - * We shortened the request but the caller can't deal with that, so - * bounce the request back to userspace. - */ - if (*req_count != count && !(remap_flags & REMAP_FILE_CAN_SHORTEN)) - return -EINVAL; - - *req_count = count; - return 0; -} - -static int generic_remap_check_len(struct inode *inode_in, - struct inode *inode_out, - loff_t pos_out, - loff_t *len, - unsigned int remap_flags) -{ - u64 blkmask = i_blocksize(inode_in) - 1; - loff_t new_len = *len; - - if ((*len & blkmask) == 0) - return 0; - - if ((remap_flags & REMAP_FILE_DEDUP) || - pos_out + *len < i_size_read(inode_out)) - new_len &= ~blkmask; - - if (new_len == *len) - return 0; - - if (remap_flags & REMAP_FILE_CAN_SHORTEN) { - *len = new_len; - return 0; - } - - return (remap_flags & REMAP_FILE_DEDUP) ? -EBADE : -EINVAL; -} - -static int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in, - struct file *file_out, loff_t pos_out, - loff_t *len, unsigned int remap_flags) -{ - struct inode *inode_in = file_inode(file_in); - struct inode *inode_out = file_inode(file_out); - bool same_inode = (inode_in == inode_out); - int ret; - - /* Don't touch certain kinds of inodes */ - if (IS_IMMUTABLE(inode_out)) - return -EPERM; - - if (IS_SWAPFILE(inode_in) || IS_SWAPFILE(inode_out)) - return -ETXTBSY; - - /* Don't reflink dirs, pipes, sockets... */ - if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode)) - return -EISDIR; - if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode)) - return -EINVAL; - - /* Zero length dedupe exits immediately; reflink goes to EOF. 
*/ - if (*len == 0) { - loff_t isize = i_size_read(inode_in); - - if ((remap_flags & REMAP_FILE_DEDUP) || pos_in == isize) - return 0; - if (pos_in > isize) - return -EINVAL; - *len = isize - pos_in; - if (*len == 0) - return 0; - } - - /* Check that we don't violate system file offset limits. */ - ret = generic_remap_checks(file_in, pos_in, file_out, pos_out, len, - remap_flags); - if (ret) - return ret; - - /* Wait for the completion of any pending IOs on both files */ - inode_dio_wait(inode_in); - if (!same_inode) - inode_dio_wait(inode_out); - - ret = filemap_write_and_wait_range(inode_in->i_mapping, - pos_in, pos_in + *len - 1); - if (ret) - return ret; - - ret = filemap_write_and_wait_range(inode_out->i_mapping, - pos_out, pos_out + *len - 1); - if (ret) - return ret; - - /* - * Check that the extents are the same. - */ - if (remap_flags & REMAP_FILE_DEDUP) { - bool is_same = false; - - ret = vfs_dedupe_file_range_compare(inode_in, pos_in, - inode_out, pos_out, *len, &is_same); - if (ret) - return ret; - if (!is_same) - return -EBADE; - } - - ret = generic_remap_check_len(inode_in, inode_out, pos_out, len, - remap_flags); - if (ret) - return ret; - - /* If can't alter the file contents, we're done. */ - if (!(remap_flags & REMAP_FILE_DEDUP)) { - /* Update the timestamps, since we can alter file contents. */ - if (!(file_out->f_mode & FMODE_NOCMTIME)) { - ret = file_update_time(file_out); - if (ret) - return ret; - } - - /* - * Clear the security bits if the process is not being run by - * root. This keeps people from modifying setuid and setgid - * binaries. - */ - ret = file_remove_privs(file_out); - if (ret) - return ret; - } - - return 0; -} - loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src, struct file *file_dst, loff_t pos_dst, loff_t len, unsigned remap_flags) @@ -3218,13 +3200,13 @@ loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src, if (ret) goto err; - mark_range_unallocated(src, pos_src, pos_src + aligned_len); + mark_pagecache_unallocated(src, pos_src >> 9, + (pos_src + aligned_len) >> 9); ret = bch2_remap_range(c, - POS(dst->v.i_ino, pos_dst >> 9), - POS(src->v.i_ino, pos_src >> 9), + inode_inum(dst), pos_dst >> 9, + inode_inum(src), pos_src >> 9, aligned_len >> 9, - &dst->ei_journal_seq, pos_dst + len, &i_sectors_delta); if (ret < 0) goto err; @@ -3242,10 +3224,9 @@ loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src, i_size_write(&dst->v, pos_dst + ret); spin_unlock(&dst->v.i_lock); - if (((file_dst->f_flags & (__O_SYNC | O_DSYNC)) || - IS_SYNC(file_inode(file_dst))) && - !c->opts.journal_flush_disabled) - ret = bch2_journal_flush_seq(&c->journal, dst->ei_journal_seq); + if ((file_dst->f_flags & (__O_SYNC | O_DSYNC)) || + IS_SYNC(file_inode(file_dst))) + ret = bch2_flush_inode(c, inode_inum(dst)); err: bch2_unlock_inodes(INODE_LOCK|INODE_PAGECACHE_BLOCK, src, dst); @@ -3311,9 +3292,11 @@ static loff_t bch2_seek_data(struct file *file, u64 offset) struct bch_inode_info *inode = file_bch_inode(file); struct bch_fs *c = inode->v.i_sb->s_fs_info; struct btree_trans trans; - struct btree_iter *iter; + struct btree_iter iter; struct bkey_s_c k; + subvol_inum inum = inode_inum(inode); u64 isize, next_data = MAX_LFS_FILESIZE; + u32 snapshot; int ret; isize = i_size_read(&inode->v); @@ -3321,9 +3304,15 @@ static loff_t bch2_seek_data(struct file *file, u64 offset) return -ENXIO; bch2_trans_init(&trans, c, 0, 0); +retry: + bch2_trans_begin(&trans); - for_each_btree_key(&trans, iter, BTREE_ID_extents, - POS(inode->v.i_ino, offset 
>> 9), 0, k, ret) { + ret = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot); + if (ret) + goto err; + + for_each_btree_key_norestart(&trans, iter, BTREE_ID_extents, + SPOS(inode->v.i_ino, offset >> 9, snapshot), 0, k, ret) { if (k.k->p.inode != inode->v.i_ino) { break; } else if (bkey_extent_is_data(k.k)) { @@ -3332,9 +3321,12 @@ static loff_t bch2_seek_data(struct file *file, u64 offset) } else if (k.k->p.offset >> 9 > isize) break; } - bch2_trans_iter_put(&trans, iter); + bch2_trans_iter_exit(&trans, &iter); +err: + if (ret == -EINTR) + goto retry; - ret = bch2_trans_exit(&trans) ?: ret; + bch2_trans_exit(&trans); if (ret) return ret; @@ -3407,9 +3399,11 @@ static loff_t bch2_seek_hole(struct file *file, u64 offset) struct bch_inode_info *inode = file_bch_inode(file); struct bch_fs *c = inode->v.i_sb->s_fs_info; struct btree_trans trans; - struct btree_iter *iter; + struct btree_iter iter; struct bkey_s_c k; + subvol_inum inum = inode_inum(inode); u64 isize, next_hole = MAX_LFS_FILESIZE; + u32 snapshot; int ret; isize = i_size_read(&inode->v); @@ -3417,9 +3411,15 @@ static loff_t bch2_seek_hole(struct file *file, u64 offset) return -ENXIO; bch2_trans_init(&trans, c, 0, 0); +retry: + bch2_trans_begin(&trans); + + ret = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot); + if (ret) + goto err; - for_each_btree_key(&trans, iter, BTREE_ID_extents, - POS(inode->v.i_ino, offset >> 9), + for_each_btree_key_norestart(&trans, iter, BTREE_ID_extents, + SPOS(inode->v.i_ino, offset >> 9, snapshot), BTREE_ITER_SLOTS, k, ret) { if (k.k->p.inode != inode->v.i_ino) { next_hole = bch2_seek_pagecache_hole(&inode->v, @@ -3436,9 +3436,12 @@ static loff_t bch2_seek_hole(struct file *file, u64 offset) offset = max(offset, bkey_start_offset(k.k) << 9); } } - bch2_trans_iter_put(&trans, iter); + bch2_trans_iter_exit(&trans, &iter); +err: + if (ret == -EINTR) + goto retry; - ret = bch2_trans_exit(&trans) ?: ret; + bch2_trans_exit(&trans); if (ret) return ret; diff --git a/fs/bcachefs/fs-io.h b/fs/bcachefs/fs-io.h index 1b593ea707d5..b24efeaf343e 100644 --- a/fs/bcachefs/fs-io.h +++ b/fs/bcachefs/fs-io.h @@ -19,8 +19,7 @@ int bch2_writepage(struct page *, struct writeback_control *); int bch2_readpage(struct file *, struct page *); int bch2_writepages(struct address_space *, struct writeback_control *); -int bch2_readpages(struct file *, struct address_space *, - struct list_head *, unsigned); +void bch2_readahead(struct readahead_control *); int bch2_write_begin(struct file *, struct address_space *, loff_t, unsigned, unsigned, struct page **, void **); @@ -32,13 +31,10 @@ ssize_t bch2_write_iter(struct kiocb *, struct iov_iter *); int bch2_fsync(struct file *, loff_t, loff_t, int); -int bch2_truncate(struct bch_inode_info *, struct iattr *); +int bch2_truncate(struct user_namespace *, + struct bch_inode_info *, struct iattr *); long bch2_fallocate_dispatch(struct file *, int, loff_t, loff_t); -#define REMAP_FILE_ADVISORY (0) -#define REMAP_FILE_DEDUP (1 << 0) -#define REMAP_FILE_CAN_SHORTEN (1 << 1) - loff_t bch2_remap_file_range(struct file *, loff_t, struct file *, loff_t, loff_t, unsigned); diff --git a/fs/bcachefs/fs-ioctl.c b/fs/bcachefs/fs-ioctl.c index 258f7f967dbb..9f329a624c12 100644 --- a/fs/bcachefs/fs-ioctl.c +++ b/fs/bcachefs/fs-ioctl.c @@ -10,7 +10,11 @@ #include "quota.h" #include <linux/compat.h> +#include <linux/fsnotify.h> #include <linux/mount.h> +#include <linux/namei.h> +#include <linux/security.h> +#include <linux/writeback.h> #define FS_IOC_GOINGDOWN _IOR('X', 125, 
__u32) #define FSOP_GOING_FLAGS_DEFAULT 0x0 /* going down */ @@ -81,7 +85,7 @@ static int bch2_ioc_setflags(struct bch_fs *c, return ret; inode_lock(&inode->v); - if (!inode_owner_or_capable(&inode->v)) { + if (!inode_owner_or_capable(file_mnt_user_ns(file), &inode->v)) { ret = -EACCES; goto setflags_out; } @@ -152,7 +156,7 @@ static int bch2_ioc_fssetxattr(struct bch_fs *c, return ret; inode_lock(&inode->v); - if (!inode_owner_or_capable(&inode->v)) { + if (!inode_owner_or_capable(file_mnt_user_ns(file), &inode->v)) { ret = -EACCES; goto err; } @@ -192,7 +196,7 @@ static int bch2_ioc_reinherit_attrs(struct bch_fs *c, char *kname = NULL; struct qstr qstr; int ret = 0; - u64 inum; + subvol_inum inum; kname = kmalloc(BCH_NAME_MAX + 1, GFP_KERNEL); if (!kname) @@ -205,10 +209,8 @@ static int bch2_ioc_reinherit_attrs(struct bch_fs *c, qstr.len = ret; qstr.name = kname; - ret = -ENOENT; - inum = bch2_dirent_lookup(c, src->v.i_ino, &hash, - &qstr); - if (!inum) + ret = bch2_dirent_lookup(c, inode_inum(src), &hash, &qstr, &inum); + if (ret) goto err1; vinode = bch2_vfs_inode_get(c, inum); @@ -266,22 +268,20 @@ static int bch2_ioc_goingdown(struct bch_fs *c, u32 __user *arg) down_write(&c->vfs_sb->s_umount); switch (flags) { - case FSOP_GOING_FLAGS_DEFAULT: { - struct super_block *sb = freeze_bdev(c->vfs_sb->s_bdev); + case FSOP_GOING_FLAGS_DEFAULT: + ret = freeze_bdev(c->vfs_sb->s_bdev); if (ret) goto err; - if (sb && !IS_ERR(sb)) { - bch2_journal_flush(&c->journal); - c->vfs_sb->s_flags |= SB_RDONLY; - bch2_fs_emergency_read_only(c); - thaw_bdev(c->vfs_sb->s_bdev, sb); - } + bch2_journal_flush(&c->journal); + c->vfs_sb->s_flags |= SB_RDONLY; + bch2_fs_emergency_read_only(c); + thaw_bdev(c->vfs_sb->s_bdev); break; - } case FSOP_GOING_FLAGS_LOGFLUSH: bch2_journal_flush(&c->journal); + fallthrough; case FSOP_GOING_FLAGS_NOLOGFLUSH: c->vfs_sb->s_flags |= SB_RDONLY; @@ -296,6 +296,161 @@ err: return ret; } +static long bch2_ioctl_subvolume_create(struct bch_fs *c, struct file *filp, + struct bch_ioctl_subvolume arg) +{ + struct inode *dir; + struct bch_inode_info *inode; + struct user_namespace *s_user_ns; + struct dentry *dst_dentry; + struct path src_path, dst_path; + int how = LOOKUP_FOLLOW; + int error; + subvol_inum snapshot_src = { 0 }; + unsigned lookup_flags = 0; + unsigned create_flags = BCH_CREATE_SUBVOL; + + if (arg.flags & ~(BCH_SUBVOL_SNAPSHOT_CREATE| + BCH_SUBVOL_SNAPSHOT_RO)) + return -EINVAL; + + if (!(arg.flags & BCH_SUBVOL_SNAPSHOT_CREATE) && + (arg.src_ptr || + (arg.flags & BCH_SUBVOL_SNAPSHOT_RO))) + return -EINVAL; + + if (arg.flags & BCH_SUBVOL_SNAPSHOT_CREATE) + create_flags |= BCH_CREATE_SNAPSHOT; + + if (arg.flags & BCH_SUBVOL_SNAPSHOT_RO) + create_flags |= BCH_CREATE_SNAPSHOT_RO; + + /* why do we need this lock? 
*/ + down_read(&c->vfs_sb->s_umount); + + if (arg.flags & BCH_SUBVOL_SNAPSHOT_CREATE) + sync_inodes_sb(c->vfs_sb); +retry: + if (arg.src_ptr) { + error = user_path_at(arg.dirfd, + (const char __user *)(unsigned long)arg.src_ptr, + how, &src_path); + if (error) + goto err1; + + if (src_path.dentry->d_sb->s_fs_info != c) { + path_put(&src_path); + error = -EXDEV; + goto err1; + } + + snapshot_src = inode_inum(to_bch_ei(src_path.dentry->d_inode)); + } + + dst_dentry = user_path_create(arg.dirfd, + (const char __user *)(unsigned long)arg.dst_ptr, + &dst_path, lookup_flags); + error = PTR_ERR_OR_ZERO(dst_dentry); + if (error) + goto err2; + + if (dst_dentry->d_sb->s_fs_info != c) { + error = -EXDEV; + goto err3; + } + + if (dst_dentry->d_inode) { + error = -EEXIST; + goto err3; + } + + dir = dst_path.dentry->d_inode; + if (IS_DEADDIR(dir)) { + error = -ENOENT; + goto err3; + } + + s_user_ns = dir->i_sb->s_user_ns; + if (!kuid_has_mapping(s_user_ns, current_fsuid()) || + !kgid_has_mapping(s_user_ns, current_fsgid())) { + error = -EOVERFLOW; + goto err3; + } + + error = inode_permission(file_mnt_user_ns(filp), + dir, MAY_WRITE | MAY_EXEC); + if (error) + goto err3; + + if (!IS_POSIXACL(dir)) + arg.mode &= ~current_umask(); + + error = security_path_mkdir(&dst_path, dst_dentry, arg.mode); + if (error) + goto err3; + + if ((arg.flags & BCH_SUBVOL_SNAPSHOT_CREATE) && + !arg.src_ptr) + snapshot_src.subvol = to_bch_ei(dir)->ei_inode.bi_subvol; + + inode = __bch2_create(file_mnt_user_ns(filp), to_bch_ei(dir), + dst_dentry, arg.mode|S_IFDIR, + 0, snapshot_src, create_flags); + error = PTR_ERR_OR_ZERO(inode); + if (error) + goto err3; + + d_instantiate(dst_dentry, &inode->v); + fsnotify_mkdir(dir, dst_dentry); +err3: + done_path_create(&dst_path, dst_dentry); +err2: + if (arg.src_ptr) + path_put(&src_path); + + if (retry_estale(error, lookup_flags)) { + lookup_flags |= LOOKUP_REVAL; + goto retry; + } +err1: + up_read(&c->vfs_sb->s_umount); + + return error; +} + +static long bch2_ioctl_subvolume_destroy(struct bch_fs *c, struct file *filp, + struct bch_ioctl_subvolume arg) +{ + struct path path; + struct inode *dir; + int ret = 0; + + if (arg.flags) + return -EINVAL; + + ret = user_path_at(arg.dirfd, + (const char __user *)(unsigned long)arg.dst_ptr, + LOOKUP_FOLLOW, &path); + if (ret) + return ret; + + if (path.dentry->d_sb->s_fs_info != c) { + path_put(&path); + return -EXDEV; + } + + dir = path.dentry->d_parent->d_inode; + + ret = __bch2_unlink(dir, path.dentry, true); + if (!ret) { + fsnotify_rmdir(dir, path.dentry); + d_delete(path.dentry); + } + path_put(&path); + + return ret; +} + long bch2_fs_file_ioctl(struct file *file, unsigned cmd, unsigned long arg) { struct bch_inode_info *inode = file_bch_inode(file); @@ -326,6 +481,22 @@ long bch2_fs_file_ioctl(struct file *file, unsigned cmd, unsigned long arg) case FS_IOC_GOINGDOWN: return bch2_ioc_goingdown(c, (u32 __user *) arg); + case BCH_IOCTL_SUBVOLUME_CREATE: { + struct bch_ioctl_subvolume i; + + if (copy_from_user(&i, (void __user *) arg, sizeof(i))) + return -EFAULT; + return bch2_ioctl_subvolume_create(c, file, i); + } + + case BCH_IOCTL_SUBVOLUME_DESTROY: { + struct bch_ioctl_subvolume i; + + if (copy_from_user(&i, (void __user *) arg, sizeof(i))) + return -EFAULT; + return bch2_ioctl_subvolume_destroy(c, file, i); + } + default: return bch2_fs_ioctl(c, cmd, (void __user *) arg); } diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 7899feb68399..7eb33da9c253 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -27,6 +27,7 @@ #include 
<linux/exportfs.h> #include <linux/fiemap.h> #include <linux/module.h> +#include <linux/pagemap.h> #include <linux/posix_acl.h> #include <linux/random.h> #include <linux/statfs.h> @@ -35,30 +36,10 @@ static struct kmem_cache *bch2_inode_cache; -static void bch2_vfs_inode_init(struct bch_fs *, +static void bch2_vfs_inode_init(struct btree_trans *, subvol_inum, struct bch_inode_info *, - struct bch_inode_unpacked *); - -static void journal_seq_copy(struct bch_fs *c, - struct bch_inode_info *dst, - u64 journal_seq) -{ - /* - * atomic64_cmpxchg has a fallback for archs that don't support it, - * cmpxchg does not: - */ - atomic64_t *dst_seq = (void *) &dst->ei_journal_seq; - u64 old, v = READ_ONCE(dst->ei_journal_seq); - - do { - old = v; - - if (old >= journal_seq) - break; - } while ((v = atomic64_cmpxchg(dst_seq, old, journal_seq)) != old); - - bch2_journal_set_has_inum(&c->journal, dst->v.i_ino, journal_seq); -} + struct bch_inode_unpacked *, + struct bch_subvolume *); static void __pagecache_lock_put(struct pagecache_lock *lock, long i) { @@ -112,11 +93,19 @@ void bch2_pagecache_block_get(struct pagecache_lock *lock) __pagecache_lock_get(lock, -1); } -void bch2_inode_update_after_write(struct bch_fs *c, +void bch2_inode_update_after_write(struct btree_trans *trans, struct bch_inode_info *inode, struct bch_inode_unpacked *bi, unsigned fields) { + struct bch_fs *c = trans->c; + + BUG_ON(bi->bi_inum != inode->v.i_ino); + + bch2_assert_pos_locked(trans, BTREE_ID_inodes, + POS(0, bi->bi_inum), + 0 && c->opts.inodes_use_key_cache); + set_nlink(&inode->v, bch2_inode_nlink_get(bi)); i_uid_write(&inode->v, bi->bi_uid); i_gid_write(&inode->v, bi->bi_gid); @@ -140,32 +129,29 @@ int __must_check bch2_write_inode(struct bch_fs *c, void *p, unsigned fields) { struct btree_trans trans; - struct btree_iter *iter; + struct btree_iter iter = { NULL }; struct bch_inode_unpacked inode_u; int ret; bch2_trans_init(&trans, c, 0, 512); + trans.ip = _RET_IP_; retry: bch2_trans_begin(&trans); - iter = bch2_inode_peek(&trans, &inode_u, inode->v.i_ino, - BTREE_ITER_INTENT); - ret = PTR_ERR_OR_ZERO(iter) ?: + ret = bch2_inode_peek(&trans, &iter, &inode_u, inode_inum(inode), + BTREE_ITER_INTENT) ?: (set ? 
set(inode, &inode_u, p) : 0) ?: - bch2_inode_write(&trans, iter, &inode_u) ?: - bch2_trans_commit(&trans, NULL, - &inode->ei_journal_seq, - BTREE_INSERT_NOUNLOCK| - BTREE_INSERT_NOFAIL); + bch2_inode_write(&trans, &iter, &inode_u) ?: + bch2_trans_commit(&trans, NULL, NULL, BTREE_INSERT_NOFAIL); /* * the btree node lock protects inode->ei_inode, not ei_update_lock; * this is important for inode updates via bchfs_write_index_update */ if (!ret) - bch2_inode_update_after_write(c, inode, &inode_u, fields); + bch2_inode_update_after_write(&trans, inode, &inode_u, fields); - bch2_trans_iter_put(&trans, iter); + bch2_trans_iter_exit(&trans, &iter); if (ret == -EINTR) goto retry; @@ -209,51 +195,82 @@ int bch2_fs_quota_transfer(struct bch_fs *c, return ret; } -struct inode *bch2_vfs_inode_get(struct bch_fs *c, u64 inum) +static int bch2_iget5_test(struct inode *vinode, void *p) +{ + struct bch_inode_info *inode = to_bch_ei(vinode); + subvol_inum *inum = p; + + return inode->ei_subvol == inum->subvol && + inode->ei_inode.bi_inum == inum->inum; +} + +static int bch2_iget5_set(struct inode *vinode, void *p) +{ + struct bch_inode_info *inode = to_bch_ei(vinode); + subvol_inum *inum = p; + + inode->v.i_ino = inum->inum; + inode->ei_subvol = inum->subvol; + inode->ei_inode.bi_inum = inum->inum; + return 0; +} + +static unsigned bch2_inode_hash(subvol_inum inum) +{ + return jhash_3words(inum.subvol, inum.inum >> 32, inum.inum, JHASH_INITVAL); +} + +struct inode *bch2_vfs_inode_get(struct bch_fs *c, subvol_inum inum) { struct bch_inode_unpacked inode_u; struct bch_inode_info *inode; + struct btree_trans trans; + struct bch_subvolume subvol; int ret; - inode = to_bch_ei(iget_locked(c->vfs_sb, inum)); + inode = to_bch_ei(iget5_locked(c->vfs_sb, + bch2_inode_hash(inum), + bch2_iget5_test, + bch2_iget5_set, + &inum)); if (unlikely(!inode)) return ERR_PTR(-ENOMEM); if (!(inode->v.i_state & I_NEW)) return &inode->v; - ret = bch2_inode_find_by_inum(c, inum, &inode_u); + bch2_trans_init(&trans, c, 8, 0); + ret = lockrestart_do(&trans, + bch2_subvolume_get(&trans, inum.subvol, true, 0, &subvol) ?: + bch2_inode_find_by_inum_trans(&trans, inum, &inode_u)); + + if (!ret) + bch2_vfs_inode_init(&trans, inum, inode, &inode_u, &subvol); + bch2_trans_exit(&trans); + if (ret) { iget_failed(&inode->v); return ERR_PTR(ret); } - bch2_vfs_inode_init(c, inode, &inode_u); - - inode->ei_journal_seq = bch2_inode_journal_seq(&c->journal, inum); - unlock_new_inode(&inode->v); return &inode->v; } -static int inum_test(struct inode *inode, void *p) -{ - unsigned long *ino = p; - - return *ino == inode->i_ino; -} - -static struct bch_inode_info * -__bch2_create(struct bch_inode_info *dir, struct dentry *dentry, - umode_t mode, dev_t rdev, bool tmpfile) +struct bch_inode_info * +__bch2_create(struct user_namespace *mnt_userns, + struct bch_inode_info *dir, struct dentry *dentry, + umode_t mode, dev_t rdev, subvol_inum snapshot_src, + unsigned flags) { struct bch_fs *c = dir->v.i_sb->s_fs_info; - struct user_namespace *ns = dir->v.i_sb->s_user_ns; struct btree_trans trans; struct bch_inode_unpacked dir_u; struct bch_inode_info *inode, *old; struct bch_inode_unpacked inode_u; struct posix_acl *default_acl = NULL, *acl = NULL; + subvol_inum inum; + struct bch_subvolume subvol; u64 journal_seq = 0; int ret; @@ -274,27 +291,34 @@ __bch2_create(struct bch_inode_info *dir, struct dentry *dentry, bch2_inode_init_early(c, &inode_u); - if (!tmpfile) + if (!(flags & BCH_CREATE_TMPFILE)) mutex_lock(&dir->ei_update_lock); bch2_trans_init(&trans, c, 
8, - 2048 + (!tmpfile ? dentry->d_name.len : 0)); + 2048 + (!(flags & BCH_CREATE_TMPFILE) + ? dentry->d_name.len : 0)); retry: bch2_trans_begin(&trans); - ret = bch2_create_trans(&trans, dir->v.i_ino, &dir_u, &inode_u, - !tmpfile ? &dentry->d_name : NULL, - from_kuid(ns, current_fsuid()), - from_kgid(ns, current_fsgid()), + ret = bch2_create_trans(&trans, + inode_inum(dir), &dir_u, &inode_u, + !(flags & BCH_CREATE_TMPFILE) + ? &dentry->d_name : NULL, + from_kuid(mnt_userns, current_fsuid()), + from_kgid(mnt_userns, current_fsgid()), mode, rdev, - default_acl, acl) ?: + default_acl, acl, snapshot_src, flags) ?: bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, 1, KEY_TYPE_QUOTA_PREALLOC); if (unlikely(ret)) goto err_before_quota; - ret = bch2_trans_commit(&trans, NULL, &journal_seq, - BTREE_INSERT_NOUNLOCK); + inum.subvol = inode_u.bi_subvol ?: dir->ei_subvol; + inum.inum = inode_u.bi_inum; + + ret = bch2_subvolume_get(&trans, inum.subvol, true, + BTREE_ITER_WITH_UPDATES, &subvol) ?: + bch2_trans_commit(&trans, NULL, &journal_seq, 0); if (unlikely(ret)) { bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, -1, KEY_TYPE_QUOTA_WARN); @@ -304,15 +328,14 @@ err_before_quota: goto err_trans; } - if (!tmpfile) { - bch2_inode_update_after_write(c, dir, &dir_u, + if (!(flags & BCH_CREATE_TMPFILE)) { + bch2_inode_update_after_write(&trans, dir, &dir_u, ATTR_MTIME|ATTR_CTIME); - journal_seq_copy(c, dir, journal_seq); mutex_unlock(&dir->ei_update_lock); } - bch2_vfs_inode_init(c, inode, &inode_u); - journal_seq_copy(c, inode, journal_seq); + bch2_iget5_set(&inode->v, &inum); + bch2_vfs_inode_init(&trans, inum, inode, &inode_u, &subvol); set_cached_acl(&inode->v, ACL_TYPE_ACCESS, acl); set_cached_acl(&inode->v, ACL_TYPE_DEFAULT, default_acl); @@ -324,8 +347,12 @@ err_before_quota: */ inode->v.i_state |= I_CREATING; - old = to_bch_ei(inode_insert5(&inode->v, inode->v.i_ino, - inum_test, NULL, &inode->v.i_ino)); + + old = to_bch_ei(inode_insert5(&inode->v, + bch2_inode_hash(inum), + bch2_iget5_test, + bch2_iget5_set, + &inum)); BUG_ON(!old); if (unlikely(old != inode)) { @@ -333,7 +360,6 @@ err_before_quota: * We raced, another process pulled the new inode into cache * before us: */ - journal_seq_copy(c, old, journal_seq); make_bad_inode(&inode->v); iput(&inode->v); @@ -352,7 +378,7 @@ err: posix_acl_release(acl); return inode; err_trans: - if (!tmpfile) + if (!(flags & BCH_CREATE_TMPFILE)) mutex_unlock(&dir->ei_update_lock); bch2_trans_exit(&trans); @@ -371,22 +397,25 @@ static struct dentry *bch2_lookup(struct inode *vdir, struct dentry *dentry, struct bch_inode_info *dir = to_bch_ei(vdir); struct bch_hash_info hash = bch2_hash_info_init(c, &dir->ei_inode); struct inode *vinode = NULL; - u64 inum; + subvol_inum inum = { .subvol = 1 }; + int ret; - inum = bch2_dirent_lookup(c, dir->v.i_ino, &hash, - &dentry->d_name); + ret = bch2_dirent_lookup(c, inode_inum(dir), &hash, + &dentry->d_name, &inum); - if (inum) + if (!ret) vinode = bch2_vfs_inode_get(c, inum); return d_splice_alias(vinode, dentry); } -static int bch2_mknod(struct inode *vdir, struct dentry *dentry, +static int bch2_mknod(struct user_namespace *mnt_userns, + struct inode *vdir, struct dentry *dentry, umode_t mode, dev_t rdev) { struct bch_inode_info *inode = - __bch2_create(to_bch_ei(vdir), dentry, mode, rdev, false); + __bch2_create(mnt_userns, to_bch_ei(vdir), dentry, mode, rdev, + (subvol_inum) { 0 }, 0); if (IS_ERR(inode)) return PTR_ERR(inode); @@ -395,10 +424,11 @@ static int bch2_mknod(struct inode *vdir, struct dentry *dentry, return 0; } 
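/*
 * Editor's note: with subvolumes, a VFS inode is now identified by the pair
 * (subvolume, inode number), so the hunks above replace iget_locked() with
 * iget5_locked() plus test/set callbacks, hashed over both fields (the
 * kernel side uses jhash_3words()). A stand-in userspace sketch of the idea;
 * the mixing function below is illustrative only, not the kernel's jhash.
 */
#include <stdint.h>
#include <stdio.h>

struct subvol_inum_demo {
	uint32_t subvol;
	uint64_t inum;
};

static uint32_t mix32(uint32_t h, uint32_t v)
{
	h ^= v;
	h *= 0x9e3779b1u;	/* a common multiplicative mixer */
	return h ^ (h >> 16);
}

static uint32_t inode_hash_demo(struct subvol_inum_demo k)
{
	uint32_t h = 0;

	h = mix32(h, k.subvol);
	h = mix32(h, (uint32_t) (k.inum >> 32));
	h = mix32(h, (uint32_t) k.inum);
	return h;
}

int main(void)
{
	struct subvol_inum_demo a = { .subvol = 1, .inum = 4096 };
	struct subvol_inum_demo b = { .subvol = 2, .inum = 4096 };

	/* The same inode number in two subvolumes must hash independently: */
	printf("%08x %08x\n", inode_hash_demo(a), inode_hash_demo(b));
	return 0;
}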
-static int bch2_create(struct inode *vdir, struct dentry *dentry, +static int bch2_create(struct user_namespace *mnt_userns, + struct inode *vdir, struct dentry *dentry, umode_t mode, bool excl) { - return bch2_mknod(vdir, dentry, mode|S_IFREG, 0); + return bch2_mknod(mnt_userns, vdir, dentry, mode|S_IFREG, 0); } static int __bch2_link(struct bch_fs *c, @@ -413,20 +443,16 @@ static int __bch2_link(struct bch_fs *c, mutex_lock(&inode->ei_update_lock); bch2_trans_init(&trans, c, 4, 1024); - ret = __bch2_trans_do(&trans, NULL, &inode->ei_journal_seq, - BTREE_INSERT_NOUNLOCK, + ret = __bch2_trans_do(&trans, NULL, NULL, 0, bch2_link_trans(&trans, - dir->v.i_ino, - inode->v.i_ino, &dir_u, &inode_u, + inode_inum(dir), &dir_u, + inode_inum(inode), &inode_u, &dentry->d_name)); if (likely(!ret)) { - BUG_ON(inode_u.bi_inum != inode->v.i_ino); - - journal_seq_copy(c, inode, dir->ei_journal_seq); - bch2_inode_update_after_write(c, dir, &dir_u, + bch2_inode_update_after_write(&trans, dir, &dir_u, ATTR_MTIME|ATTR_CTIME); - bch2_inode_update_after_write(c, inode, &inode_u, ATTR_CTIME); + bch2_inode_update_after_write(&trans, inode, &inode_u, ATTR_CTIME); } bch2_trans_exit(&trans); @@ -453,7 +479,8 @@ static int bch2_link(struct dentry *old_dentry, struct inode *vdir, return 0; } -static int bch2_unlink(struct inode *vdir, struct dentry *dentry) +int __bch2_unlink(struct inode *vdir, struct dentry *dentry, + bool deleting_snapshot) { struct bch_fs *c = vdir->i_sb->s_fs_info; struct bch_inode_info *dir = to_bch_ei(vdir); @@ -465,20 +492,17 @@ static int bch2_unlink(struct inode *vdir, struct dentry *dentry) bch2_lock_inodes(INODE_UPDATE_LOCK, dir, inode); bch2_trans_init(&trans, c, 4, 1024); - ret = __bch2_trans_do(&trans, NULL, &dir->ei_journal_seq, - BTREE_INSERT_NOUNLOCK| + ret = __bch2_trans_do(&trans, NULL, NULL, BTREE_INSERT_NOFAIL, bch2_unlink_trans(&trans, - dir->v.i_ino, &dir_u, - &inode_u, &dentry->d_name)); + inode_inum(dir), &dir_u, + &inode_u, &dentry->d_name, + deleting_snapshot)); if (likely(!ret)) { - BUG_ON(inode_u.bi_inum != inode->v.i_ino); - - journal_seq_copy(c, inode, dir->ei_journal_seq); - bch2_inode_update_after_write(c, dir, &dir_u, + bch2_inode_update_after_write(&trans, dir, &dir_u, ATTR_MTIME|ATTR_CTIME); - bch2_inode_update_after_write(c, inode, &inode_u, + bch2_inode_update_after_write(&trans, inode, &inode_u, ATTR_MTIME); } @@ -488,14 +512,21 @@ static int bch2_unlink(struct inode *vdir, struct dentry *dentry) return ret; } -static int bch2_symlink(struct inode *vdir, struct dentry *dentry, +static int bch2_unlink(struct inode *vdir, struct dentry *dentry) +{ + return __bch2_unlink(vdir, dentry, false); +} + +static int bch2_symlink(struct user_namespace *mnt_userns, + struct inode *vdir, struct dentry *dentry, const char *symname) { struct bch_fs *c = vdir->i_sb->s_fs_info; struct bch_inode_info *dir = to_bch_ei(vdir), *inode; int ret; - inode = __bch2_create(dir, dentry, S_IFLNK|S_IRWXUGO, 0, true); + inode = __bch2_create(mnt_userns, dir, dentry, S_IFLNK|S_IRWXUGO, 0, + (subvol_inum) { 0 }, BCH_CREATE_TMPFILE); if (unlikely(IS_ERR(inode))) return PTR_ERR(inode); @@ -510,8 +541,6 @@ static int bch2_symlink(struct inode *vdir, struct dentry *dentry, if (unlikely(ret)) goto err; - journal_seq_copy(c, dir, inode->ei_journal_seq); - ret = __bch2_link(c, inode, dir, dentry); if (unlikely(ret)) goto err; @@ -523,12 +552,14 @@ err: return ret; } -static int bch2_mkdir(struct inode *vdir, struct dentry *dentry, umode_t mode) +static int bch2_mkdir(struct user_namespace 
*mnt_userns, + struct inode *vdir, struct dentry *dentry, umode_t mode) { - return bch2_mknod(vdir, dentry, mode|S_IFDIR, 0); + return bch2_mknod(mnt_userns, vdir, dentry, mode|S_IFDIR, 0); } -static int bch2_rename2(struct inode *src_vdir, struct dentry *src_dentry, +static int bch2_rename2(struct user_namespace *mnt_userns, + struct inode *src_vdir, struct dentry *src_dentry, struct inode *dst_vdir, struct dentry *dst_dentry, unsigned flags) { @@ -544,7 +575,6 @@ static int bch2_rename2(struct inode *src_vdir, struct dentry *src_dentry, ? BCH_RENAME_EXCHANGE : dst_dentry->d_inode ? BCH_RENAME_OVERWRITE : BCH_RENAME; - u64 journal_seq = 0; int ret; if (flags & ~(RENAME_NOREPLACE|RENAME_EXCHANGE)) @@ -584,11 +614,10 @@ static int bch2_rename2(struct inode *src_vdir, struct dentry *src_dentry, goto err; } - ret = __bch2_trans_do(&trans, NULL, &journal_seq, - BTREE_INSERT_NOUNLOCK, + ret = __bch2_trans_do(&trans, NULL, NULL, 0, bch2_rename_trans(&trans, - src_dir->v.i_ino, &src_dir_u, - dst_dir->v.i_ino, &dst_dir_u, + inode_inum(src_dir), &src_dir_u, + inode_inum(dst_dir), &dst_dir_u, &src_inode_u, &dst_inode_u, &src_dentry->d_name, @@ -601,25 +630,19 @@ static int bch2_rename2(struct inode *src_vdir, struct dentry *src_dentry, BUG_ON(dst_inode && dst_inode->v.i_ino != dst_inode_u.bi_inum); - bch2_inode_update_after_write(c, src_dir, &src_dir_u, + bch2_inode_update_after_write(&trans, src_dir, &src_dir_u, ATTR_MTIME|ATTR_CTIME); - journal_seq_copy(c, src_dir, journal_seq); - if (src_dir != dst_dir) { - bch2_inode_update_after_write(c, dst_dir, &dst_dir_u, + if (src_dir != dst_dir) + bch2_inode_update_after_write(&trans, dst_dir, &dst_dir_u, ATTR_MTIME|ATTR_CTIME); - journal_seq_copy(c, dst_dir, journal_seq); - } - bch2_inode_update_after_write(c, src_inode, &src_inode_u, + bch2_inode_update_after_write(&trans, src_inode, &src_inode_u, ATTR_CTIME); - journal_seq_copy(c, src_inode, journal_seq); - if (dst_inode) { - bch2_inode_update_after_write(c, dst_inode, &dst_inode_u, + if (dst_inode) + bch2_inode_update_after_write(&trans, dst_inode, &dst_inode_u, ATTR_CTIME); - journal_seq_copy(c, dst_inode, journal_seq); - } err: bch2_trans_exit(&trans); @@ -642,17 +665,21 @@ err: return ret; } -void bch2_setattr_copy(struct bch_inode_info *inode, - struct bch_inode_unpacked *bi, - struct iattr *attr) +static void bch2_setattr_copy(struct user_namespace *mnt_userns, + struct bch_inode_info *inode, + struct bch_inode_unpacked *bi, + struct iattr *attr) { struct bch_fs *c = inode->v.i_sb->s_fs_info; unsigned int ia_valid = attr->ia_valid; if (ia_valid & ATTR_UID) - bi->bi_uid = from_kuid(c->vfs_sb->s_user_ns, attr->ia_uid); + bi->bi_uid = from_kuid(mnt_userns, attr->ia_uid); if (ia_valid & ATTR_GID) - bi->bi_gid = from_kgid(c->vfs_sb->s_user_ns, attr->ia_gid); + bi->bi_gid = from_kgid(mnt_userns, attr->ia_gid); + + if (ia_valid & ATTR_SIZE) + bi->bi_size = attr->ia_size; if (ia_valid & ATTR_ATIME) bi->bi_atime = timespec_to_bch2_time(c, attr->ia_atime); @@ -668,19 +695,20 @@ void bch2_setattr_copy(struct bch_inode_info *inode, : inode->v.i_gid; if (!in_group_p(gid) && - !capable_wrt_inode_uidgid(&inode->v, CAP_FSETID)) + !capable_wrt_inode_uidgid(mnt_userns, &inode->v, CAP_FSETID)) mode &= ~S_ISGID; bi->bi_mode = mode; } } -static int bch2_setattr_nonsize(struct bch_inode_info *inode, - struct iattr *attr) +int bch2_setattr_nonsize(struct user_namespace *mnt_userns, + struct bch_inode_info *inode, + struct iattr *attr) { struct bch_fs *c = inode->v.i_sb->s_fs_info; struct bch_qid qid; struct btree_trans 
trans; - struct btree_iter *inode_iter; + struct btree_iter inode_iter = { NULL }; struct bch_inode_unpacked inode_u; struct posix_acl *acl = NULL; int ret; @@ -706,34 +734,32 @@ retry: kfree(acl); acl = NULL; - inode_iter = bch2_inode_peek(&trans, &inode_u, inode->v.i_ino, - BTREE_ITER_INTENT); - ret = PTR_ERR_OR_ZERO(inode_iter); + ret = bch2_inode_peek(&trans, &inode_iter, &inode_u, inode_inum(inode), + BTREE_ITER_INTENT); if (ret) goto btree_err; - bch2_setattr_copy(inode, &inode_u, attr); + bch2_setattr_copy(mnt_userns, inode, &inode_u, attr); if (attr->ia_valid & ATTR_MODE) { - ret = bch2_acl_chmod(&trans, &inode_u, inode_u.bi_mode, &acl); + ret = bch2_acl_chmod(&trans, inode_inum(inode), &inode_u, + inode_u.bi_mode, &acl); if (ret) goto btree_err; } - ret = bch2_inode_write(&trans, inode_iter, &inode_u) ?: - bch2_trans_commit(&trans, NULL, - &inode->ei_journal_seq, - BTREE_INSERT_NOUNLOCK| + ret = bch2_inode_write(&trans, &inode_iter, &inode_u) ?: + bch2_trans_commit(&trans, NULL, NULL, BTREE_INSERT_NOFAIL); btree_err: - bch2_trans_iter_put(&trans, inode_iter); + bch2_trans_iter_exit(&trans, &inode_iter); if (ret == -EINTR) goto retry; if (unlikely(ret)) goto err_trans; - bch2_inode_update_after_write(c, inode, &inode_u, attr->ia_valid); + bch2_inode_update_after_write(&trans, inode, &inode_u, attr->ia_valid); if (acl) set_cached_acl(&inode->v, ACL_TYPE_ACCESS, acl); @@ -745,7 +771,8 @@ err: return ret; } -static int bch2_getattr(const struct path *path, struct kstat *stat, +static int bch2_getattr(struct user_namespace *mnt_userns, + const struct path *path, struct kstat *stat, u32 request_mask, unsigned query_flags) { struct bch_inode_info *inode = to_bch_ei(d_inode(path->dentry)); @@ -785,26 +812,29 @@ static int bch2_getattr(const struct path *path, struct kstat *stat, return 0; } -static int bch2_setattr(struct dentry *dentry, struct iattr *iattr) +static int bch2_setattr(struct user_namespace *mnt_userns, + struct dentry *dentry, struct iattr *iattr) { struct bch_inode_info *inode = to_bch_ei(dentry->d_inode); int ret; lockdep_assert_held(&inode->v.i_rwsem); - ret = setattr_prepare(dentry, iattr); + ret = setattr_prepare(mnt_userns, dentry, iattr); if (ret) return ret; return iattr->ia_valid & ATTR_SIZE - ? bch2_truncate(inode, iattr) - : bch2_setattr_nonsize(inode, iattr); + ? 
bch2_truncate(mnt_userns, inode, iattr) + : bch2_setattr_nonsize(mnt_userns, inode, iattr); } -static int bch2_tmpfile(struct inode *vdir, struct dentry *dentry, umode_t mode) +static int bch2_tmpfile(struct user_namespace *mnt_userns, + struct inode *vdir, struct dentry *dentry, umode_t mode) { struct bch_inode_info *inode = - __bch2_create(to_bch_ei(vdir), dentry, mode, 0, true); + __bch2_create(mnt_userns, to_bch_ei(vdir), dentry, mode, 0, + (subvol_inum) { 0 }, BCH_CREATE_TMPFILE); if (IS_ERR(inode)) return PTR_ERR(inode); @@ -873,36 +903,49 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info, struct bch_fs *c = vinode->i_sb->s_fs_info; struct bch_inode_info *ei = to_bch_ei(vinode); struct btree_trans trans; - struct btree_iter *iter; + struct btree_iter iter; struct bkey_s_c k; struct bkey_buf cur, prev; struct bpos end = POS(ei->v.i_ino, (start + len) >> 9); unsigned offset_into_extent, sectors; bool have_extent = false; + u32 snapshot; int ret = 0; + ret = fiemap_prep(&ei->v, info, start, &len, FIEMAP_FLAG_SYNC); + if (ret) + return ret; + if (start + len < start) return -EINVAL; + start >>= 9; + bch2_bkey_buf_init(&cur); bch2_bkey_buf_init(&prev); bch2_trans_init(&trans, c, 0, 0); - - iter = bch2_trans_get_iter(&trans, BTREE_ID_extents, - POS(ei->v.i_ino, start >> 9), 0); retry: - while ((k = bch2_btree_iter_peek(iter)).k && + bch2_trans_begin(&trans); + + ret = bch2_subvolume_get_snapshot(&trans, ei->ei_subvol, &snapshot); + if (ret) + goto err; + + bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents, + SPOS(ei->v.i_ino, start, snapshot), 0); + + while ((k = bch2_btree_iter_peek(&iter)).k && !(ret = bkey_err(k)) && - bkey_cmp(iter->pos, end) < 0) { + bkey_cmp(iter.pos, end) < 0) { enum btree_id data_btree = BTREE_ID_extents; if (!bkey_extent_is_data(k.k) && k.k->type != KEY_TYPE_reservation) { - bch2_btree_iter_advance(iter); + bch2_btree_iter_advance(&iter); continue; } - offset_into_extent = iter->pos.offset - + offset_into_extent = iter.pos.offset - bkey_start_offset(k.k); sectors = k.k->size - offset_into_extent; @@ -923,7 +966,7 @@ retry: offset_into_extent), cur.k); bch2_key_resize(&cur.k->k, sectors); - cur.k->k.p = iter->pos; + cur.k->k.p = iter.pos; cur.k->k.p.offset += cur.k->k.size; if (have_extent) { @@ -936,10 +979,12 @@ retry: bkey_copy(prev.k, cur.k); have_extent = true; - bch2_btree_iter_set_pos(iter, - POS(iter->pos.inode, iter->pos.offset + sectors)); + bch2_btree_iter_set_pos(&iter, + POS(iter.pos.inode, iter.pos.offset + sectors)); } - + start = iter.pos.offset; + bch2_trans_iter_exit(&trans, &iter); +err: if (ret == -EINTR) goto retry; @@ -947,8 +992,7 @@ retry: ret = bch2_fill_extent(c, info, bkey_i_to_s_c(prev.k), FIEMAP_EXTENT_LAST); - bch2_trans_iter_put(&trans, iter); - ret = bch2_trans_exit(&trans) ?: ret; + bch2_trans_exit(&trans); bch2_bkey_buf_exit(&cur, c); bch2_bkey_buf_exit(&prev, c); return ret < 0 ? 
ret : 0; @@ -984,16 +1028,7 @@ static int bch2_vfs_readdir(struct file *file, struct dir_context *ctx) if (!dir_emit_dots(file, ctx)) return 0; - return bch2_readdir(c, inode->v.i_ino, ctx); -} - -static int bch2_clone_file_range(struct file *file_src, loff_t pos_src, - struct file *file_dst, loff_t pos_dst, - u64 len) -{ - return bch2_remap_file_range(file_src, pos_src, - file_dst, pos_dst, - len, 0); + return bch2_readdir(c, inode_inum(inode), ctx); } static const struct file_operations bch_file_operations = { @@ -1010,7 +1045,7 @@ static const struct file_operations bch_file_operations = { #ifdef CONFIG_COMPAT .compat_ioctl = bch2_compat_fs_ioctl, #endif - .clone_file_range = bch2_clone_file_range, + .remap_file_range = bch2_remap_file_range, }; static const struct inode_operations bch_file_inode_operations = { @@ -1080,7 +1115,7 @@ static const struct address_space_operations bch_address_space_operations = { .writepage = bch2_writepage, .readpage = bch2_readpage, .writepages = bch2_writepages, - .readpages = bch2_readpages, + .readahead = bch2_readahead, .set_page_dirty = __set_page_dirty_nobuffers, .write_begin = bch2_write_begin, .write_end = bch2_write_end, @@ -1093,51 +1128,243 @@ static const struct address_space_operations bch_address_space_operations = { .error_remove_page = generic_error_remove_page, }; -static struct inode *bch2_nfs_get_inode(struct super_block *sb, - u64 ino, u32 generation) +struct bcachefs_fid { + u64 inum; + u32 subvol; + u32 gen; +} __packed; + +struct bcachefs_fid_with_parent { + struct bcachefs_fid fid; + struct bcachefs_fid dir; +} __packed; + +static int bcachefs_fid_valid(int fh_len, int fh_type) { - struct bch_fs *c = sb->s_fs_info; - struct inode *vinode; + switch (fh_type) { + case FILEID_BCACHEFS_WITHOUT_PARENT: + return fh_len == sizeof(struct bcachefs_fid) / sizeof(u32); + case FILEID_BCACHEFS_WITH_PARENT: + return fh_len == sizeof(struct bcachefs_fid_with_parent) / sizeof(u32); + default: + return false; + } +} + +static struct bcachefs_fid bch2_inode_to_fid(struct bch_inode_info *inode) +{ + return (struct bcachefs_fid) { + .inum = inode->ei_inode.bi_inum, + .subvol = inode->ei_subvol, + .gen = inode->ei_inode.bi_generation, + }; +} + +static int bch2_encode_fh(struct inode *vinode, u32 *fh, int *len, + struct inode *vdir) +{ + struct bch_inode_info *inode = to_bch_ei(vinode); + struct bch_inode_info *dir = to_bch_ei(vdir); + + if (*len < sizeof(struct bcachefs_fid_with_parent) / sizeof(u32)) + return FILEID_INVALID; + + if (!S_ISDIR(inode->v.i_mode) && dir) { + struct bcachefs_fid_with_parent *fid = (void *) fh; + + fid->fid = bch2_inode_to_fid(inode); + fid->dir = bch2_inode_to_fid(dir); + + *len = sizeof(*fid) / sizeof(u32); + return FILEID_BCACHEFS_WITH_PARENT; + } else { + struct bcachefs_fid *fid = (void *) fh; + + *fid = bch2_inode_to_fid(inode); - if (ino < BCACHEFS_ROOT_INO) - return ERR_PTR(-ESTALE); + *len = sizeof(*fid) / sizeof(u32); + return FILEID_BCACHEFS_WITHOUT_PARENT; + } +} - vinode = bch2_vfs_inode_get(c, ino); - if (IS_ERR(vinode)) - return ERR_CAST(vinode); - if (generation && vinode->i_generation != generation) { - /* we didn't find the right inode.. 
*/ +static struct inode *bch2_nfs_get_inode(struct super_block *sb, + struct bcachefs_fid fid) +{ + struct bch_fs *c = sb->s_fs_info; + struct inode *vinode = bch2_vfs_inode_get(c, (subvol_inum) { + .subvol = fid.subvol, + .inum = fid.inum, + }); + if (!IS_ERR(vinode) && vinode->i_generation != fid.gen) { iput(vinode); - return ERR_PTR(-ESTALE); + vinode = ERR_PTR(-ESTALE); } return vinode; } -static struct dentry *bch2_fh_to_dentry(struct super_block *sb, struct fid *fid, +static struct dentry *bch2_fh_to_dentry(struct super_block *sb, struct fid *_fid, int fh_len, int fh_type) { - return generic_fh_to_dentry(sb, fid, fh_len, fh_type, - bch2_nfs_get_inode); + struct bcachefs_fid *fid = (void *) _fid; + + if (!bcachefs_fid_valid(fh_len, fh_type)) + return NULL; + + return d_obtain_alias(bch2_nfs_get_inode(sb, *fid)); } -static struct dentry *bch2_fh_to_parent(struct super_block *sb, struct fid *fid, +static struct dentry *bch2_fh_to_parent(struct super_block *sb, struct fid *_fid, int fh_len, int fh_type) { - return generic_fh_to_parent(sb, fid, fh_len, fh_type, - bch2_nfs_get_inode); + struct bcachefs_fid_with_parent *fid = (void *) _fid; + + if (!bcachefs_fid_valid(fh_len, fh_type) || + fh_type != FILEID_BCACHEFS_WITH_PARENT) + return NULL; + + return d_obtain_alias(bch2_nfs_get_inode(sb, fid->dir)); +} + +static struct dentry *bch2_get_parent(struct dentry *child) +{ + struct bch_inode_info *inode = to_bch_ei(child->d_inode); + struct bch_fs *c = inode->v.i_sb->s_fs_info; + subvol_inum parent_inum = { + .subvol = inode->ei_inode.bi_parent_subvol ?: + inode->ei_subvol, + .inum = inode->ei_inode.bi_dir, + }; + + if (!parent_inum.inum) + return NULL; + + return d_obtain_alias(bch2_vfs_inode_get(c, parent_inum)); +} + +static int bch2_get_name(struct dentry *parent, char *name, struct dentry *child) +{ + struct bch_inode_info *inode = to_bch_ei(child->d_inode); + struct bch_inode_info *dir = to_bch_ei(parent->d_inode); + struct bch_fs *c = inode->v.i_sb->s_fs_info; + struct btree_trans trans; + struct btree_iter iter1; + struct btree_iter iter2; + struct bkey_s_c k; + struct bkey_s_c_dirent d; + struct bch_inode_unpacked inode_u; + subvol_inum target; + u32 snapshot; + unsigned name_len; + int ret; + + if (!S_ISDIR(dir->v.i_mode)) + return -EINVAL; + + bch2_trans_init(&trans, c, 0, 0); + + bch2_trans_iter_init(&trans, &iter1, BTREE_ID_dirents, + POS(dir->ei_inode.bi_inum, 0), 0); + bch2_trans_iter_init(&trans, &iter2, BTREE_ID_dirents, + POS(dir->ei_inode.bi_inum, 0), 0); +retry: + bch2_trans_begin(&trans); + + ret = bch2_subvolume_get_snapshot(&trans, dir->ei_subvol, &snapshot); + if (ret) + goto err; + + bch2_btree_iter_set_snapshot(&iter1, snapshot); + bch2_btree_iter_set_snapshot(&iter2, snapshot); + + ret = bch2_inode_find_by_inum_trans(&trans, inode_inum(inode), &inode_u); + if (ret) + goto err; + + if (inode_u.bi_dir == dir->ei_inode.bi_inum) { + bch2_btree_iter_set_pos(&iter1, POS(inode_u.bi_dir, inode_u.bi_dir_offset)); + + k = bch2_btree_iter_peek_slot(&iter1); + ret = bkey_err(k); + if (ret) + goto err; + + if (k.k->type != KEY_TYPE_dirent) { + ret = -ENOENT; + goto err; + } + + d = bkey_s_c_to_dirent(k); + ret = bch2_dirent_read_target(&trans, inode_inum(dir), d, &target); + if (ret > 0) + ret = -ENOENT; + if (ret) + goto err; + + if (target.subvol == inode->ei_subvol && + target.inum == inode->ei_inode.bi_inum) + goto found; + } else { + /* + * File with multiple hardlinks and our backref is to the wrong + * directory - linear search: + */ + 
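/*
 * Note: candidates can't be matched on the raw d_inum here; with
 * subvolumes a dirent may point at a subvolume rather than a plain
 * inode, so each candidate is resolved with bch2_dirent_read_target()
 * to a (subvol, inum) pair before being compared against the child:
 */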
for_each_btree_key_continue_norestart(iter2, 0, k, ret) { + if (k.k->p.inode > dir->ei_inode.bi_inum) + break; + + if (k.k->type != KEY_TYPE_dirent) + continue; + + d = bkey_s_c_to_dirent(k); + ret = bch2_dirent_read_target(&trans, inode_inum(dir), d, &target); + if (ret < 0) + break; + if (ret) + continue; + + if (target.subvol == inode->ei_subvol && + target.inum == inode->ei_inode.bi_inum) + goto found; + } + } + + ret = -ENOENT; + goto err; +found: + name_len = min_t(unsigned, bch2_dirent_name_bytes(d), NAME_MAX); + + memcpy(name, d.v->d_name, name_len); + name[name_len] = '\0'; +err: + if (ret == -EINTR) + goto retry; + + bch2_trans_iter_exit(&trans, &iter1); + bch2_trans_iter_exit(&trans, &iter2); + bch2_trans_exit(&trans); + + return ret; } static const struct export_operations bch_export_ops = { + .encode_fh = bch2_encode_fh, .fh_to_dentry = bch2_fh_to_dentry, .fh_to_parent = bch2_fh_to_parent, - //.get_parent = bch2_get_parent, + .get_parent = bch2_get_parent, + .get_name = bch2_get_name, }; -static void bch2_vfs_inode_init(struct bch_fs *c, +static void bch2_vfs_inode_init(struct btree_trans *trans, subvol_inum inum, struct bch_inode_info *inode, - struct bch_inode_unpacked *bi) + struct bch_inode_unpacked *bi, + struct bch_subvolume *subvol) { - bch2_inode_update_after_write(c, inode, bi, ~0); + bch2_inode_update_after_write(trans, inode, bi, ~0); + + if (BCH_SUBVOLUME_SNAP(subvol)) + set_bit(EI_INODE_SNAPSHOT, &inode->ei_flags); + else + clear_bit(EI_INODE_SNAPSHOT, &inode->ei_flags); inode->v.i_blocks = bi->bi_sectors; inode->v.i_ino = bi->bi_inum; @@ -1146,9 +1373,9 @@ static void bch2_vfs_inode_init(struct bch_fs *c, inode->v.i_size = bi->bi_size; inode->ei_flags = 0; - inode->ei_journal_seq = 0; inode->ei_quota_reserved = 0; inode->ei_qid = bch_qid(bi); + inode->ei_subvol = inum.subvol; inode->v.i_mapping->a_ops = &bch_address_space_operations; @@ -1184,7 +1411,6 @@ static struct inode *bch2_alloc_inode(struct super_block *sb) mutex_init(&inode->ei_update_lock); pagecache_lock_init(&inode->ei_pagecache_lock); mutex_init(&inode->ei_quota_lock); - inode->ei_journal_seq = 0; return &inode->v; } @@ -1246,10 +1472,57 @@ static void bch2_evict_inode(struct inode *vinode) KEY_TYPE_QUOTA_WARN); bch2_quota_acct(c, inode->ei_qid, Q_INO, -1, KEY_TYPE_QUOTA_WARN); - bch2_inode_rm(c, inode->v.i_ino, true); + bch2_inode_rm(c, inode_inum(inode), true); } } +void bch2_evict_subvolume_inodes(struct bch_fs *c, + struct snapshot_id_list *s) +{ + struct super_block *sb = c->vfs_sb; + struct inode *inode; + + spin_lock(&sb->s_inode_list_lock); + list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { + if (!snapshot_list_has_id(s, to_bch_ei(inode)->ei_subvol) || + (inode->i_state & I_FREEING)) + continue; + + d_mark_dontcache(inode); + d_prune_aliases(inode); + } + spin_unlock(&sb->s_inode_list_lock); +again: + cond_resched(); + spin_lock(&sb->s_inode_list_lock); + list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { + if (!snapshot_list_has_id(s, to_bch_ei(inode)->ei_subvol) || + (inode->i_state & I_FREEING)) + continue; + + if (!(inode->i_state & I_DONTCACHE)) { + d_mark_dontcache(inode); + d_prune_aliases(inode); + } + + spin_lock(&inode->i_lock); + if (snapshot_list_has_id(s, to_bch_ei(inode)->ei_subvol) && + !(inode->i_state & I_FREEING)) { + wait_queue_head_t *wq = bit_waitqueue(&inode->i_state, __I_NEW); + DEFINE_WAIT_BIT(wait, &inode->i_state, __I_NEW); + prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE); + spin_unlock(&inode->i_lock); + spin_unlock(&sb->s_inode_list_lock); + 
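/*
 * prepare_to_wait() has already published the wait entry on the __I_NEW
 * waitqueue, so both locks can safely be dropped before sleeping; inode
 * eviction wakes __I_NEW waiters, and the walk restarts from the top of
 * the list because s_inode_list_lock was released:
 */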
schedule(); + finish_wait(wq, &wait.wq_entry); + goto again; + } + + spin_unlock(&inode->i_lock); + } + spin_unlock(&sb->s_inode_list_lock); +} + static int bch2_statfs(struct dentry *dentry, struct kstatfs *buf) { struct super_block *sb = dentry->d_sb; @@ -1299,13 +1572,14 @@ static int bch2_sync_fs(struct super_block *sb, int wait) static struct bch_fs *bch2_path_to_fs(const char *path) { struct bch_fs *c; - struct block_device *bdev = lookup_bdev(path); + dev_t dev; + int ret; - if (IS_ERR(bdev)) - return ERR_CAST(bdev); + ret = lookup_bdev(path, &dev); + if (ret) + return ERR_PTR(ret); - c = bch2_dev_to_fs(bdev->bd_dev); - bdput(bdev); + c = bch2_dev_to_fs(dev); if (c) closure_put(&c->cl); return c ?: ERR_PTR(-ENOENT); @@ -1316,9 +1590,6 @@ static char **split_devs(const char *_dev_name, unsigned *nr) char *dev_name = NULL, **devs = NULL, *s; size_t i, nr_devs = 0; - if (strlen(_dev_name) == 0) - return NULL; - dev_name = kstrdup(_dev_name, GFP_KERNEL); if (!dev_name) return NULL; @@ -1494,6 +1765,9 @@ static struct dentry *bch2_mount(struct file_system_type *fs_type, if (ret) return ERR_PTR(ret); + if (!dev_name || strlen(dev_name) == 0) + return ERR_PTR(-EINVAL); + devs = split_devs(dev_name, &nr_devs); if (!devs) return ERR_PTR(-ENOMEM); @@ -1561,6 +1835,8 @@ got_sb: sb->s_xattr = bch2_xattr_handlers; sb->s_magic = BCACHEFS_STATFS_MAGIC; sb->s_time_gran = c->sb.nsec_per_time_unit; + sb->s_time_min = div_s64(S64_MIN, c->sb.time_units_per_sec) + 1; + sb->s_time_max = div_s64(S64_MAX, c->sb.time_units_per_sec); c->vfs_sb = sb; strlcpy(sb->s_id, c->name, sizeof(sb->s_id)); @@ -1568,9 +1844,7 @@ got_sb: if (ret) goto err_put_super; - sb->s_bdi->congested_fn = bch2_congested; - sb->s_bdi->congested_data = c; - sb->s_bdi->ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_SIZE; + sb->s_bdi->ra_pages = VM_READAHEAD_PAGES; for_each_online_member(ca, c, i) { struct block_device *bdev = ca->disk_sb.bdev; @@ -1589,7 +1863,9 @@ got_sb: sb->s_flags |= SB_POSIXACL; #endif - vinode = bch2_vfs_inode_get(c, BCACHEFS_ROOT_INO); + sb->s_shrink.seeks = 0; + + vinode = bch2_vfs_inode_get(c, BCACHEFS_ROOT_SUBVOL_INUM); if (IS_ERR(vinode)) { bch_err(c, "error mounting: error getting root inode %i", (int) PTR_ERR(vinode)); diff --git a/fs/bcachefs/fs.h b/fs/bcachefs/fs.h index 2d82ed7dd740..b2211ec7f302 100644 --- a/fs/bcachefs/fs.h +++ b/fs/bcachefs/fs.h @@ -36,7 +36,6 @@ struct bch_inode_info { unsigned long ei_flags; struct mutex ei_update_lock; - u64 ei_journal_seq; u64 ei_quota_reserved; unsigned long ei_last_dirtied; @@ -45,16 +44,32 @@ struct bch_inode_info { struct mutex ei_quota_lock; struct bch_qid ei_qid; + u32 ei_subvol; + /* copy of inode in btree: */ struct bch_inode_unpacked ei_inode; }; +static inline subvol_inum inode_inum(struct bch_inode_info *inode) +{ + return (subvol_inum) { + .subvol = inode->ei_subvol, + .inum = inode->ei_inode.bi_inum, + }; +} + /* * Set if we've gotten a btree error for this inode, and thus the vfs inode and * btree inode may be inconsistent: */ #define EI_INODE_ERROR 0 +/* + * Set if the inode is in a snapshot subvolume - we don't do quota accounting in + * those: + */ +#define EI_INODE_SNAPSHOT 1 + #define to_bch_ei(_inode) \ container_of_or_null(_inode, struct bch_inode_info, v) @@ -135,6 +150,10 @@ struct bch_inode_unpacked; #ifndef NO_BCACHEFS_FS +struct bch_inode_info * +__bch2_create(struct user_namespace *, struct bch_inode_info *, + struct dentry *, umode_t, dev_t, subvol_inum, unsigned); + int bch2_fs_quota_transfer(struct bch_fs *, struct bch_inode_info *, struct bch_qid,
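The inode_inum() helper above captures the central type change in this series: a VFS inode is now addressed by a (subvolume, inode number) pair rather than a bare u64. A minimal userspace sketch of the idea, using illustrative type names rather than the kernel's:

	#include <stdint.h>
	#include <stdio.h>

	/* illustrative stand-in for the kernel's subvol_inum pair */
	struct subvol_inum_ex {
		uint32_t subvol;
		uint64_t inum;
	};

	/* two keys name the same file only if both halves match */
	static int same_file(struct subvol_inum_ex a, struct subvol_inum_ex b)
	{
		return a.subvol == b.subvol && a.inum == b.inum;
	}

	int main(void)
	{
		struct subvol_inum_ex orig = { .subvol = 1, .inum = 4096 };
		struct subvol_inum_ex snap = { .subvol = 2, .inum = 4096 };

		/* same inode number, different subvolume: distinct files */
		printf("%d\n", same_file(orig, snap));	/* prints 0 */
		return 0;
	}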
@@ -154,24 +173,33 @@ static inline int bch2_set_projid(struct bch_fs *c, KEY_TYPE_QUOTA_PREALLOC); } -struct inode *bch2_vfs_inode_get(struct bch_fs *, u64); +struct inode *bch2_vfs_inode_get(struct bch_fs *, subvol_inum); /* returns 0 if we want to do the update, or error is passed up */ typedef int (*inode_set_fn)(struct bch_inode_info *, struct bch_inode_unpacked *, void *); -void bch2_inode_update_after_write(struct bch_fs *, +void bch2_inode_update_after_write(struct btree_trans *, struct bch_inode_info *, struct bch_inode_unpacked *, unsigned); int __must_check bch2_write_inode(struct bch_fs *, struct bch_inode_info *, inode_set_fn, void *, unsigned); +int bch2_setattr_nonsize(struct user_namespace *, + struct bch_inode_info *, + struct iattr *); +int __bch2_unlink(struct inode *, struct dentry *, bool); + +void bch2_evict_subvolume_inodes(struct bch_fs *, struct snapshot_id_list *); + void bch2_vfs_exit(void); int bch2_vfs_init(void); #else +static inline void bch2_evict_subvolume_inodes(struct bch_fs *c, + struct snapshot_id_list *s) {} static inline void bch2_vfs_exit(void) {} static inline int bch2_vfs_init(void) { return 0; } diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 86a6189503e4..361dbf338023 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -9,6 +9,7 @@ #include "fsck.h" #include "inode.h" #include "keylist.h" +#include "subvolume.h" #include "super.h" #include "xattr.h" @@ -17,15 +18,16 @@ #define QSTR(n) { { { .len = strlen(n) } }, .name = n } -static s64 bch2_count_inode_sectors(struct btree_trans *trans, u64 inum) +static s64 bch2_count_inode_sectors(struct btree_trans *trans, u64 inum, + u32 snapshot) { - struct btree_iter *iter; + struct btree_iter iter; struct bkey_s_c k; u64 sectors = 0; int ret; for_each_btree_key(trans, iter, BTREE_ID_extents, - POS(inum, 0), 0, k, ret) { + SPOS(inum, 0, snapshot), 0, k, ret) { if (k.k->p.inode != inum) break; @@ -33,33 +35,138 @@ static s64 bch2_count_inode_sectors(struct btree_trans *trans, u64 inum) sectors += k.k->size; } - bch2_trans_iter_free(trans, iter); + bch2_trans_iter_exit(trans, &iter); return ret ?: sectors; } +static s64 bch2_count_subdirs(struct btree_trans *trans, u64 inum, + u32 snapshot) +{ + struct btree_iter iter; + struct bkey_s_c k; + struct bkey_s_c_dirent d; + u64 subdirs = 0; + int ret; + + for_each_btree_key(trans, iter, BTREE_ID_dirents, + SPOS(inum, 0, snapshot), 0, k, ret) { + if (k.k->p.inode != inum) + break; + + if (k.k->type != KEY_TYPE_dirent) + continue; + + d = bkey_s_c_to_dirent(k); + if (d.v->d_type == DT_DIR) + subdirs++; + } + + bch2_trans_iter_exit(trans, &iter); + + return ret ?: subdirs; +} + +static int __snapshot_lookup_subvol(struct btree_trans *trans, u32 snapshot, + u32 *subvol) +{ + struct btree_iter iter; + struct bkey_s_c k; + int ret; + + bch2_trans_iter_init(trans, &iter, BTREE_ID_snapshots, + POS(0, snapshot), 0); + k = bch2_btree_iter_peek_slot(&iter); + ret = bkey_err(k); + if (ret) + goto err; + + if (k.k->type != KEY_TYPE_snapshot) { + bch_err(trans->c, "snapshot %u not found", snapshot); + ret = -ENOENT; + goto err; + } + + *subvol = le32_to_cpu(bkey_s_c_to_snapshot(k).v->subvol); +err: + bch2_trans_iter_exit(trans, &iter); + return ret; + +} + +static int __subvol_lookup(struct btree_trans *trans, u32 subvol, + u32 *snapshot, u64 *inum) +{ + struct bch_subvolume s; + int ret; + + ret = bch2_subvolume_get(trans, subvol, false, 0, &s); + + *snapshot = le32_to_cpu(s.snapshot); + *inum = le64_to_cpu(s.inode); + return ret; +} + +static int
subvol_lookup(struct btree_trans *trans, u32 subvol, + u32 *snapshot, u64 *inum) +{ + return lockrestart_do(trans, __subvol_lookup(trans, subvol, snapshot, inum)); +} + +static int lookup_first_inode(struct btree_trans *trans, u64 inode_nr, + struct bch_inode_unpacked *inode) +{ + struct btree_iter iter; + struct bkey_s_c k; + int ret; + + bch2_trans_iter_init(trans, &iter, BTREE_ID_inodes, + POS(0, inode_nr), + BTREE_ITER_ALL_SNAPSHOTS); + k = bch2_btree_iter_peek(&iter); + ret = bkey_err(k); + if (ret) + goto err; + + if (!k.k || bkey_cmp(k.k->p, POS(0, inode_nr))) { + ret = -ENOENT; + goto err; + } + + ret = bch2_inode_unpack(k, inode); +err: + if (ret && ret != -EINTR) + bch_err(trans->c, "error %i fetching inode %llu", + ret, inode_nr); + bch2_trans_iter_exit(trans, &iter); + return ret; +} + static int __lookup_inode(struct btree_trans *trans, u64 inode_nr, struct bch_inode_unpacked *inode, u32 *snapshot) { - struct btree_iter *iter; + struct btree_iter iter; struct bkey_s_c k; int ret; - iter = bch2_trans_get_iter(trans, BTREE_ID_inodes, - POS(0, inode_nr), 0); - k = bch2_btree_iter_peek_slot(iter); + bch2_trans_iter_init(trans, &iter, BTREE_ID_inodes, + SPOS(0, inode_nr, *snapshot), 0); + k = bch2_btree_iter_peek_slot(&iter); ret = bkey_err(k); if (ret) goto err; - if (snapshot) - *snapshot = iter->pos.snapshot; - ret = k.k->type == KEY_TYPE_inode - ? bch2_inode_unpack(bkey_s_c_to_inode(k), inode) + ret = bkey_is_inode(k.k) + ? bch2_inode_unpack(k, inode) : -ENOENT; + if (!ret) + *snapshot = iter.pos.snapshot; err: - bch2_trans_iter_free(trans, iter); + if (ret && ret != -EINTR) + bch_err(trans->c, "error %i fetching inode %llu:%u", + ret, inode_nr, *snapshot); + bch2_trans_iter_exit(trans, &iter); return ret; } @@ -70,16 +177,41 @@ static int lookup_inode(struct btree_trans *trans, u64 inode_nr, return lockrestart_do(trans, __lookup_inode(trans, inode_nr, inode, snapshot)); } +static int __lookup_dirent(struct btree_trans *trans, + struct bch_hash_info hash_info, + subvol_inum dir, struct qstr *name, + u64 *target, unsigned *type) +{ + struct btree_iter iter; + struct bkey_s_c_dirent d; + int ret; + + ret = bch2_hash_lookup(trans, &iter, bch2_dirent_hash_desc, + &hash_info, dir, name, 0); + if (ret) + return ret; + + d = bkey_s_c_to_dirent(bch2_btree_iter_peek_slot(&iter)); + *target = le64_to_cpu(d.v->d_inum); + *type = d.v->d_type; + bch2_trans_iter_exit(trans, &iter); + return 0; +} + static int __write_inode(struct btree_trans *trans, struct bch_inode_unpacked *inode, u32 snapshot) { - struct btree_iter *inode_iter = - bch2_trans_get_iter(trans, BTREE_ID_inodes, - SPOS(0, inode->bi_inum, snapshot), - BTREE_ITER_INTENT); - int ret = bch2_inode_write(trans, inode_iter, inode); - bch2_trans_iter_put(trans, inode_iter); + struct btree_iter iter; + int ret; + + bch2_trans_iter_init(trans, &iter, BTREE_ID_inodes, + SPOS(0, inode->bi_inum, snapshot), + BTREE_ITER_INTENT); + + ret = bch2_btree_iter_traverse(&iter) ?: + bch2_inode_write(trans, &iter, inode); + bch2_trans_iter_exit(trans, &iter); return ret; } @@ -96,110 +228,176 @@ static int write_inode(struct btree_trans *trans, return ret; } +static int fsck_inode_rm(struct btree_trans *trans, u64 inum, u32 snapshot) +{ + struct btree_iter iter = { NULL }; + struct bkey_i_inode_generation delete; + struct bch_inode_unpacked inode_u; + struct bkey_s_c k; + int ret; + + ret = bch2_btree_delete_range_trans(trans, BTREE_ID_extents, + SPOS(inum, 0, snapshot), + SPOS(inum, U64_MAX, snapshot), + 0, NULL) ?: + 
bch2_btree_delete_range_trans(trans, BTREE_ID_dirents, + SPOS(inum, 0, snapshot), + SPOS(inum, U64_MAX, snapshot), + 0, NULL) ?: + bch2_btree_delete_range_trans(trans, BTREE_ID_xattrs, + SPOS(inum, 0, snapshot), + SPOS(inum, U64_MAX, snapshot), + 0, NULL); + if (ret) + goto err; +retry: + bch2_trans_begin(trans); + + bch2_trans_iter_init(trans, &iter, BTREE_ID_inodes, + SPOS(0, inum, snapshot), BTREE_ITER_INTENT); + k = bch2_btree_iter_peek_slot(&iter); + + ret = bkey_err(k); + if (ret) + goto err; + + if (!bkey_is_inode(k.k)) { + bch2_fs_inconsistent(trans->c, + "inode %llu:%u not found when deleting", + inum, snapshot); + ret = -EIO; + goto err; + } + + bch2_inode_unpack(k, &inode_u); + + /* Subvolume root? */ + if (inode_u.bi_subvol) { + ret = bch2_subvolume_delete(trans, inode_u.bi_subvol); + if (ret) + goto err; + } + + bkey_inode_generation_init(&delete.k_i); + delete.k.p = iter.pos; + delete.v.bi_generation = cpu_to_le32(inode_u.bi_generation + 1); + + ret = bch2_trans_update(trans, &iter, &delete.k_i, 0) ?: + bch2_trans_commit(trans, NULL, NULL, + BTREE_INSERT_NOFAIL); +err: + bch2_trans_iter_exit(trans, &iter); + if (ret == -EINTR) + goto retry; + + return ret; +} + static int __remove_dirent(struct btree_trans *trans, struct bpos pos) { struct bch_fs *c = trans->c; - struct btree_iter *iter; + struct btree_iter iter; struct bch_inode_unpacked dir_inode; struct bch_hash_info dir_hash_info; int ret; - ret = lookup_inode(trans, pos.inode, &dir_inode, NULL); + ret = lookup_first_inode(trans, pos.inode, &dir_inode); if (ret) return ret; dir_hash_info = bch2_hash_info_init(c, &dir_inode); - iter = bch2_trans_get_iter(trans, BTREE_ID_dirents, pos, BTREE_ITER_INTENT); + bch2_trans_iter_init(trans, &iter, BTREE_ID_dirents, pos, BTREE_ITER_INTENT); ret = bch2_hash_delete_at(trans, bch2_dirent_hash_desc, - &dir_hash_info, iter); - bch2_trans_iter_put(trans, iter); - return ret; -} - -static int remove_dirent(struct btree_trans *trans, struct bpos pos) -{ - int ret = __bch2_trans_do(trans, NULL, NULL, - BTREE_INSERT_NOFAIL| - BTREE_INSERT_LAZY_RW, - __remove_dirent(trans, pos)); - if (ret) - bch_err(trans->c, "remove_dirent: err %i deleting dirent", ret); + &dir_hash_info, &iter, 0); + bch2_trans_iter_exit(trans, &iter); return ret; } /* Get lost+found, create if it doesn't exist: */ -static int lookup_lostfound(struct btree_trans *trans, +static int lookup_lostfound(struct btree_trans *trans, u32 subvol, struct bch_inode_unpacked *lostfound) { struct bch_fs *c = trans->c; struct bch_inode_unpacked root; struct bch_hash_info root_hash_info; struct qstr lostfound_str = QSTR("lost+found"); - u64 inum; + subvol_inum root_inum = { .subvol = subvol }; + u64 inum = 0; + unsigned d_type = 0; u32 snapshot; int ret; - ret = lookup_inode(trans, BCACHEFS_ROOT_INO, &root, &snapshot); - if (ret && ret != -ENOENT) + ret = __subvol_lookup(trans, subvol, &snapshot, &root_inum.inum); + if (ret) + return ret; + + ret = __lookup_inode(trans, root_inum.inum, &root, &snapshot); + if (ret) return ret; root_hash_info = bch2_hash_info_init(c, &root); - inum = bch2_dirent_lookup(c, BCACHEFS_ROOT_INO, &root_hash_info, - &lostfound_str); - if (!inum) { + + ret = __lookup_dirent(trans, root_hash_info, root_inum, + &lostfound_str, &inum, &d_type); + if (ret == -ENOENT) { bch_notice(c, "creating lost+found"); goto create_lostfound; } - ret = lookup_inode(trans, inum, lostfound, &snapshot); - if (ret && ret != -ENOENT) { - /* - * The check_dirents pass has already run, dangling dirents - * shouldn't exist here: - */ + if 
(ret && ret != -EINTR) bch_err(c, "error looking up lost+found: %i", ret); + if (ret) return ret; - } - if (ret == -ENOENT) { -create_lostfound: - bch2_inode_init_early(c, lostfound); - - ret = __bch2_trans_do(trans, NULL, NULL, - BTREE_INSERT_NOFAIL| - BTREE_INSERT_LAZY_RW, - bch2_create_trans(trans, - BCACHEFS_ROOT_INO, &root, - lostfound, - &lostfound_str, - 0, 0, S_IFDIR|0700, 0, NULL, NULL)); - if (ret) - bch_err(c, "error creating lost+found: %i", ret); + if (d_type != DT_DIR) { + bch_err(c, "error looking up lost+found: not a directory"); + return -ENOENT; } - return 0; + /* + * The check_dirents pass has already run, dangling dirents + * shouldn't exist here: + */ + return __lookup_inode(trans, inum, lostfound, &snapshot); + +create_lostfound: + bch2_inode_init_early(c, lostfound); + + ret = bch2_create_trans(trans, root_inum, &root, + lostfound, &lostfound_str, + 0, 0, S_IFDIR|0700, 0, NULL, NULL, + (subvol_inum) { }, 0); + if (ret && ret != -EINTR) + bch_err(c, "error creating lost+found: %i", ret); + return ret; } -static int reattach_inode(struct btree_trans *trans, - struct bch_inode_unpacked *inode) +static int __reattach_inode(struct btree_trans *trans, + struct bch_inode_unpacked *inode, + u32 inode_snapshot) { struct bch_hash_info dir_hash; struct bch_inode_unpacked lostfound; char name_buf[20]; struct qstr name; u64 dir_offset = 0; + u32 subvol; int ret; - ret = lookup_lostfound(trans, &lostfound); + ret = __snapshot_lookup_subvol(trans, inode_snapshot, &subvol); + if (ret) + return ret; + + ret = lookup_lostfound(trans, subvol, &lostfound); if (ret) return ret; if (S_ISDIR(inode->bi_mode)) { lostfound.bi_nlink++; - ret = write_inode(trans, &lostfound, U32_MAX); + ret = __write_inode(trans, &lostfound, U32_MAX); if (ret) return ret; } @@ -209,33 +407,51 @@ static int reattach_inode(struct btree_trans *trans, snprintf(name_buf, sizeof(name_buf), "%llu", inode->bi_inum); name = (struct qstr) QSTR(name_buf); - ret = __bch2_trans_do(trans, NULL, NULL, BTREE_INSERT_LAZY_RW, - bch2_dirent_create(trans, lostfound.bi_inum, &dir_hash, - mode_to_type(inode->bi_mode), - &name, inode->bi_inum, &dir_offset, - BCH_HASH_SET_MUST_CREATE)); + ret = bch2_dirent_create(trans, + (subvol_inum) { + .subvol = subvol, + .inum = lostfound.bi_inum, + }, + &dir_hash, + inode_d_type(inode), + &name, inode->bi_inum, &dir_offset, + BCH_HASH_SET_MUST_CREATE); + if (ret) + return ret; + + inode->bi_dir = lostfound.bi_inum; + inode->bi_dir_offset = dir_offset; + + return __write_inode(trans, inode, inode_snapshot); +} + +static int reattach_inode(struct btree_trans *trans, + struct bch_inode_unpacked *inode, + u32 inode_snapshot) +{ + int ret = __bch2_trans_do(trans, NULL, NULL, + BTREE_INSERT_LAZY_RW| + BTREE_INSERT_NOFAIL, + __reattach_inode(trans, inode, inode_snapshot)); if (ret) { bch_err(trans->c, "error %i reattaching inode %llu", ret, inode->bi_inum); return ret; } - inode->bi_dir = lostfound.bi_inum; - inode->bi_dir_offset = dir_offset; - - return write_inode(trans, inode, U32_MAX); + return ret; } static int remove_backpointer(struct btree_trans *trans, struct bch_inode_unpacked *inode) { - struct btree_iter *iter; + struct btree_iter iter; struct bkey_s_c k; int ret; - iter = bch2_trans_get_iter(trans, BTREE_ID_dirents, - POS(inode->bi_dir, inode->bi_dir_offset), 0); - k = bch2_btree_iter_peek_slot(iter); + bch2_trans_iter_init(trans, &iter, BTREE_ID_dirents, + POS(inode->bi_dir, inode->bi_dir_offset), 0); + k = bch2_btree_iter_peek_slot(&iter); ret = bkey_err(k); if (ret) goto out;
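remove_backpointer() above relies on the back reference every inode carries to the dirent that points at it (bi_dir, bi_dir_offset). A rough standalone sketch of that relationship, with illustrative struct and field names rather than the kernel's:

	#include <stdbool.h>
	#include <stdint.h>
	#include <stdio.h>

	/* illustrative: a dirent's position is (directory inode, offset) */
	struct dirent_pos_ex {
		uint64_t dir_inum;
		uint64_t offset;
	};

	/* illustrative inode that remembers the dirent pointing at it */
	struct inode_ex {
		uint64_t inum;
		uint64_t bi_dir;		/* directory of the backpointer dirent */
		uint64_t bi_dir_offset;		/* its offset within that directory */
	};

	static bool backpointer_matches(const struct inode_ex *inode,
					struct dirent_pos_ex d)
	{
		return inode->bi_dir == d.dir_inum &&
		       inode->bi_dir_offset == d.offset;
	}

	int main(void)
	{
		struct inode_ex ino = { .inum = 42, .bi_dir = 1, .bi_dir_offset = 1000 };

		/* the dirent the inode claims as its primary link matches */
		printf("%d\n", backpointer_matches(&ino,
			(struct dirent_pos_ex) { 1, 1000 }));	/* 1 */
		/* a second hardlink elsewhere does not */
		printf("%d\n", backpointer_matches(&ino,
			(struct dirent_pos_ex) { 7, 2525 }));	/* 0 */
		return 0;
	}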
@@ -244,45 +460,251 @@ static int remove_backpointer(struct btree_trans *trans, goto out; } - ret = remove_dirent(trans, k.k->p); + ret = __remove_dirent(trans, k.k->p); out: - bch2_trans_iter_put(trans, iter); + bch2_trans_iter_exit(trans, &iter); return ret; } +static int snapshots_seen_update(struct bch_fs *c, struct snapshots_seen *s, struct bpos pos) +{ + pos.snapshot = snapshot_t(c, pos.snapshot)->equiv; + + if (bkey_cmp(s->pos, pos)) + s->nr = 0; + s->pos = pos; + + /* Might get called multiple times due to lock restarts */ + if (s->nr && s->d[s->nr - 1] == pos.snapshot) + return 0; + + return snapshots_seen_add(c, s, pos.snapshot); +} + +/** + * key_visible_in_snapshot - returns true if @id is a descendant of @ancestor, + * and @ancestor hasn't been overwritten in @seen + * + * That is, returns whether key in @ancestor snapshot is visible in @id snapshot + */ +static bool key_visible_in_snapshot(struct bch_fs *c, struct snapshots_seen *seen, + u32 id, u32 ancestor) +{ + ssize_t i; + + BUG_ON(id > ancestor); + + id = snapshot_t(c, id)->equiv; + ancestor = snapshot_t(c, ancestor)->equiv; + + /* @ancestor should be the snapshot most recently added to @seen */ + BUG_ON(!seen->nr || seen->d[seen->nr - 1] != ancestor); + BUG_ON(seen->pos.snapshot != ancestor); + + if (id == ancestor) + return true; + + if (!bch2_snapshot_is_ancestor(c, id, ancestor)) + return false; + + for (i = seen->nr - 2; + i >= 0 && seen->d[i] >= id; + --i) + if (bch2_snapshot_is_ancestor(c, id, seen->d[i]) && + bch2_snapshot_is_ancestor(c, seen->d[i], ancestor)) + return false; + + return true; +} + +/** + * ref_visible - given a key with snapshot id @src that points to a key with + * snapshot id @dst, test whether there is some snapshot in which @dst is + * visible. + * + * This assumes we're visiting @src keys in natural key order. + * + * @s - list of snapshot IDs already seen at @src + * @src - snapshot ID of src key + * @dst - snapshot ID of dst key + */ +static int ref_visible(struct bch_fs *c, struct snapshots_seen *s, + u32 src, u32 dst) +{ + return dst <= src + ?
key_visible_in_snapshot(c, s, dst, src) + : bch2_snapshot_is_ancestor(c, src, dst); +} + +#define for_each_visible_inode(_c, _s, _w, _snapshot, _i) \ + for (_i = (_w)->d; _i < (_w)->d + (_w)->nr && (_i)->snapshot <= (_snapshot); _i++)\ + if (key_visible_in_snapshot(_c, _s, _i->snapshot, _snapshot)) + struct inode_walker { - bool first_this_inode; - bool have_inode; - u64 cur_inum; - u32 snapshot; - struct bch_inode_unpacked inode; + bool first_this_inode; + u64 cur_inum; + + size_t nr; + size_t size; + struct inode_walker_entry { + struct bch_inode_unpacked inode; + u32 snapshot; + u64 count; + } *d; }; +static void inode_walker_exit(struct inode_walker *w) +{ + kfree(w->d); + w->d = NULL; +} + static struct inode_walker inode_walker_init(void) { - return (struct inode_walker) { - .cur_inum = -1, - .have_inode = false, + return (struct inode_walker) { 0, }; +} + +static int inode_walker_realloc(struct inode_walker *w) +{ + if (w->nr == w->size) { + size_t new_size = max_t(size_t, 8UL, w->size * 2); + void *d = krealloc(w->d, new_size * sizeof(w->d[0]), + GFP_KERNEL); + if (!d) + return -ENOMEM; + + w->d = d; + w->size = new_size; + } + + return 0; +} + +static int add_inode(struct bch_fs *c, struct inode_walker *w, + struct bkey_s_c inode) +{ + struct bch_inode_unpacked u; + int ret; + + ret = inode_walker_realloc(w); + if (ret) + return ret; + + BUG_ON(bch2_inode_unpack(inode, &u)); + + w->d[w->nr++] = (struct inode_walker_entry) { + .inode = u, + .snapshot = snapshot_t(c, inode.k->p.snapshot)->equiv, }; + + return 0; } -static int walk_inode(struct btree_trans *trans, - struct inode_walker *w, u64 inum) +static int __walk_inode(struct btree_trans *trans, + struct inode_walker *w, struct bpos pos) { - if (inum != w->cur_inum) { - int ret = lookup_inode(trans, inum, &w->inode, &w->snapshot); + struct bch_fs *c = trans->c; + struct btree_iter iter; + struct bkey_s_c k; + unsigned i, ancestor_pos; + int ret; - if (ret && ret != -ENOENT) - return ret; + pos.snapshot = snapshot_t(c, pos.snapshot)->equiv; - w->have_inode = !ret; - w->cur_inum = inum; - w->first_this_inode = true; - } else { + if (pos.inode == w->cur_inum) { w->first_this_inode = false; + goto lookup_snapshot; } - return 0; + w->nr = 0; + + for_each_btree_key(trans, iter, BTREE_ID_inodes, POS(0, pos.inode), + BTREE_ITER_ALL_SNAPSHOTS, k, ret) { + if (k.k->p.offset != pos.inode) + break; + + if (bkey_is_inode(k.k)) + add_inode(c, w, k); + } + bch2_trans_iter_exit(trans, &iter); + + if (ret) + return ret; + + w->cur_inum = pos.inode; + w->first_this_inode = true; +lookup_snapshot: + for (i = 0; i < w->nr; i++) + if (bch2_snapshot_is_ancestor(c, pos.snapshot, w->d[i].snapshot)) + goto found; + return INT_MAX; +found: + BUG_ON(pos.snapshot > w->d[i].snapshot); + + if (pos.snapshot != w->d[i].snapshot) { + ancestor_pos = i; + + while (i && w->d[i - 1].snapshot > pos.snapshot) + --i; + + ret = inode_walker_realloc(w); + if (ret) + return ret; + + array_insert_item(w->d, w->nr, i, w->d[ancestor_pos]); + w->d[i].snapshot = pos.snapshot; + w->d[i].count = 0; + } + + return i; +} + +static int __get_visible_inodes(struct btree_trans *trans, + struct inode_walker *w, + struct snapshots_seen *s, + u64 inum) +{ + struct bch_fs *c = trans->c; + struct btree_iter iter; + struct bkey_s_c k; + int ret; + + w->nr = 0; + + for_each_btree_key(trans, iter, BTREE_ID_inodes, POS(0, inum), + BTREE_ITER_ALL_SNAPSHOTS, k, ret) { + if (k.k->p.offset != inum) + break; + + if (!bkey_is_inode(k.k)) + continue; + + if (ref_visible(c, s, s->pos.snapshot, 
k.k->p.snapshot)) { + add_inode(c, w, k); + if (k.k->p.snapshot >= s->pos.snapshot) + break; + } + } + bch2_trans_iter_exit(trans, &iter); + + return ret; +} + +static int check_key_has_snapshot(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_s_c k) +{ + struct bch_fs *c = trans->c; + char buf[200]; + int ret = 0; + + if (mustfix_fsck_err_on(!snapshot_t(c, k.k->p.snapshot)->equiv, c, + "key in missing snapshot: %s", + (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf))) + return bch2_btree_delete_at(trans, iter, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?: 1; +fsck_err: + return ret; } static int hash_redo_key(struct btree_trans *trans, @@ -290,6 +712,9 @@ static int hash_redo_key(struct btree_trans *trans, struct bch_hash_info *hash_info, struct btree_iter *k_iter, struct bkey_s_c k) { + bch_err(trans->c, "hash_redo_key() not implemented yet"); + return -EINVAL; +#if 0 struct bkey_i *delete; struct bkey_i *tmp; @@ -305,28 +730,10 @@ static int hash_redo_key(struct btree_trans *trans, bkey_init(&delete->k); delete->k.p = k_iter->pos; - return bch2_trans_update(trans, k_iter, delete, 0) ?: + return bch2_btree_iter_traverse(k_iter) ?: + bch2_trans_update(trans, k_iter, delete, 0) ?: bch2_hash_set(trans, desc, hash_info, k_iter->pos.inode, tmp, 0); -} - -static int fsck_hash_delete_at(struct btree_trans *trans, - const struct bch_hash_desc desc, - struct bch_hash_info *info, - struct btree_iter *iter) -{ - int ret; -retry: - ret = bch2_hash_delete_at(trans, desc, info, iter) ?: - bch2_trans_commit(trans, NULL, NULL, - BTREE_INSERT_NOFAIL| - BTREE_INSERT_LAZY_RW); - if (ret == -EINTR) { - ret = bch2_btree_iter_traverse(iter); - if (!ret) - goto retry; - } - - return ret; +#endif } static int hash_check_key(struct btree_trans *trans, @@ -335,7 +742,7 @@ static int hash_check_key(struct btree_trans *trans, struct btree_iter *k_iter, struct bkey_s_c hash_k) { struct bch_fs *c = trans->c; - struct btree_iter *iter = NULL; + struct btree_iter iter = { NULL }; char buf[200]; struct bkey_s_c k; u64 hash; @@ -362,20 +769,17 @@ static int hash_check_key(struct btree_trans *trans, "duplicate hash table keys:\n%s", (bch2_bkey_val_to_text(&PBUF(buf), c, hash_k), buf))) { - ret = fsck_hash_delete_at(trans, desc, hash_info, k_iter); - if (ret) - return ret; - ret = 1; + ret = bch2_hash_delete_at(trans, desc, hash_info, k_iter, 0) ?: 1; break; } if (bkey_deleted(k.k)) { - bch2_trans_iter_free(trans, iter); + bch2_trans_iter_exit(trans, &iter); goto bad_hash; } } - bch2_trans_iter_free(trans, iter); + bch2_trans_iter_exit(trans, &iter); return ret; bad_hash: if (fsck_err(c, "hash table key at wrong offset: btree %u inode %llu offset %llu, " @@ -384,9 +788,7 @@ bad_hash: (bch2_bkey_val_to_text(&PBUF(buf), c, hash_k), buf)) == FSCK_ERR_IGNORE) return 0; - ret = __bch2_trans_do(trans, NULL, NULL, - BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW, - hash_redo_key(trans, desc, hash_info, k_iter, hash_k)); + ret = hash_redo_key(trans, desc, hash_info, k_iter, hash_k); if (ret) { bch_err(c, "hash_redo_key err %i", ret); return ret; @@ -398,30 +800,64 @@ fsck_err: static int check_inode(struct btree_trans *trans, struct btree_iter *iter, - struct bkey_s_c_inode inode) + struct bch_inode_unpacked *prev, + bool full) { struct bch_fs *c = trans->c; + struct bkey_s_c k; struct bch_inode_unpacked u; bool do_update = false; - int ret = 0; + int ret; - ret = bch2_inode_unpack(inode, &u); + k = bch2_btree_iter_peek(iter); + if (!k.k) + return 0; - if (bch2_fs_inconsistent_on(ret, c, - "error unpacking inode %llu in 
fsck", - inode.k->p.inode)) + ret = bkey_err(k); + if (ret) return ret; + ret = check_key_has_snapshot(trans, iter, k); + if (ret) + return ret < 0 ? ret : 0; + + /* + * if snapshot id isn't a leaf node, skip it - deletion in + * particular is not atomic, so on the internal snapshot nodes + * we can see inodes marked for deletion after a clean shutdown + */ + if (bch2_snapshot_internal_node(c, k.k->p.snapshot)) + return 0; + + if (!bkey_is_inode(k.k)) + return 0; + + BUG_ON(bch2_inode_unpack(k, &u)); + + if (!full && + !(u.bi_flags & (BCH_INODE_I_SIZE_DIRTY| + BCH_INODE_I_SECTORS_DIRTY| + BCH_INODE_UNLINKED))) + return 0; + + if (prev->bi_inum != u.bi_inum) + *prev = u; + + if (fsck_err_on(prev->bi_hash_seed != u.bi_hash_seed || + inode_d_type(prev) != inode_d_type(&u), c, + "inodes in different snapshots don't match")) { + bch_err(c, "repair not implemented yet"); + return -EINVAL; + } + if (u.bi_flags & BCH_INODE_UNLINKED && (!c->sb.clean || fsck_err(c, "filesystem marked clean, but inode %llu unlinked", u.bi_inum))) { - bch_verbose(c, "deleting inode %llu", u.bi_inum); - bch2_trans_unlock(trans); bch2_fs_lazy_rw(c); - ret = bch2_inode_rm(c, u.bi_inum, false); + ret = fsck_inode_rm(trans, u.bi_inum, iter->pos.snapshot); if (ret) bch_err(c, "error in fsck: error %i while deleting inode", ret); return ret; @@ -441,9 +877,10 @@ static int check_inode(struct btree_trans *trans, * just switch units to bytes and that issue goes away */ ret = bch2_btree_delete_range_trans(trans, BTREE_ID_extents, - POS(u.bi_inum, round_up(u.bi_size, block_bytes(c)) >> 9), + SPOS(u.bi_inum, round_up(u.bi_size, block_bytes(c)) >> 9, + iter->pos.snapshot), POS(u.bi_inum, U64_MAX), - NULL); + 0, NULL); if (ret) { bch_err(c, "error in fsck: error %i truncating inode", ret); return ret; @@ -468,7 +905,7 @@ static int check_inode(struct btree_trans *trans, bch_verbose(c, "recounting sectors for inode %llu", u.bi_inum); - sectors = bch2_count_inode_sectors(trans, u.bi_inum); + sectors = bch2_count_inode_sectors(trans, u.bi_inum, iter->pos.snapshot); if (sectors < 0) { bch_err(c, "error in fsck: error %i recounting inode sectors", (int) sectors); @@ -488,10 +925,7 @@ static int check_inode(struct btree_trans *trans, } if (do_update) { - ret = __bch2_trans_do(trans, NULL, NULL, - BTREE_INSERT_NOFAIL| - BTREE_INSERT_LAZY_RW, - bch2_inode_write(trans, iter, &u)); + ret = write_inode(trans, &u, iter->pos.snapshot); if (ret) bch_err(c, "error in fsck: error %i " "updating inode", ret); @@ -504,41 +938,99 @@ noinline_for_stack static int check_inodes(struct bch_fs *c, bool full) { struct btree_trans trans; - struct btree_iter *iter; - struct bkey_s_c k; - struct bkey_s_c_inode inode; + struct btree_iter iter; + struct bch_inode_unpacked prev = { 0 }; int ret; bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); - for_each_btree_key(&trans, iter, BTREE_ID_inodes, POS_MIN, - BTREE_ITER_INTENT| - BTREE_ITER_PREFETCH, k, ret) { - if (k.k->type != KEY_TYPE_inode) - continue; + bch2_trans_iter_init(&trans, &iter, BTREE_ID_inodes, POS_MIN, + BTREE_ITER_INTENT| + BTREE_ITER_PREFETCH| + BTREE_ITER_ALL_SNAPSHOTS); - inode = bkey_s_c_to_inode(k); + do { + ret = __bch2_trans_do(&trans, NULL, NULL, + BTREE_INSERT_LAZY_RW| + BTREE_INSERT_NOFAIL, + check_inode(&trans, &iter, &prev, full)); + if (ret) + break; + } while (bch2_btree_iter_advance(&iter)); + bch2_trans_iter_exit(&trans, &iter); - if (full || - (inode.v->bi_flags & (BCH_INODE_I_SIZE_DIRTY| - BCH_INODE_I_SECTORS_DIRTY| - BCH_INODE_UNLINKED))) { - ret = check_inode(&trans, iter, inode); 
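/*
 * The do/while above replaces the old for_each_btree_key() walk: each key
 * is now checked and committed inside its own __bch2_trans_do(), so a
 * transaction restart (-EINTR) retries only the current inode, and the
 * iterator is advanced only once that commit succeeds:
 */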
- if (ret) - break; - } + bch2_trans_exit(&trans); + return ret; +} + +static int check_subvol(struct btree_trans *trans, + struct btree_iter *iter) +{ + struct bkey_s_c k; + struct bkey_s_c_subvolume subvol; + int ret; + + k = bch2_btree_iter_peek(iter); + if (!k.k) + return 0; + + ret = bkey_err(k); + if (ret) + return ret; + + if (k.k->type != KEY_TYPE_subvolume) + return 0; + + subvol = bkey_s_c_to_subvolume(k); + + if (BCH_SUBVOLUME_UNLINKED(subvol.v)) { + ret = bch2_subvolume_delete(trans, iter->pos.offset); + if (ret && ret != -EINTR) + bch_err(trans->c, "error deleting subvolume %llu: %i", + iter->pos.offset, ret); + if (ret) + return ret; } - bch2_trans_iter_put(&trans, iter); - BUG_ON(ret == -EINTR); + return 0; +} + +noinline_for_stack +static int check_subvols(struct bch_fs *c) +{ + struct btree_trans trans; + struct btree_iter iter; + int ret; + + bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); + + bch2_trans_iter_init(&trans, &iter, BTREE_ID_subvolumes, + POS_MIN, + BTREE_ITER_INTENT| + BTREE_ITER_PREFETCH); + + do { + ret = __bch2_trans_do(&trans, NULL, NULL, + BTREE_INSERT_LAZY_RW| + BTREE_INSERT_NOFAIL, + check_subvol(&trans, &iter)); + if (ret) + break; + } while (bch2_btree_iter_advance(&iter)); + bch2_trans_iter_exit(&trans, &iter); - return bch2_trans_exit(&trans) ?: ret; + bch2_trans_exit(&trans); + return ret; } +/* + * Checking for overlapping extents needs to be reimplemented + */ +#if 0 static int fix_overlapping_extent(struct btree_trans *trans, struct bkey_s_c k, struct bpos cut_at) { - struct btree_iter *iter; + struct btree_iter iter; struct bkey_i *u; int ret; @@ -558,45 +1050,208 @@ static int fix_overlapping_extent(struct btree_trans *trans, * assume things about extent overwrites - we should be running the * triggers manually here */ - iter = bch2_trans_get_iter(trans, BTREE_ID_extents, u->k.p, - BTREE_ITER_INTENT|BTREE_ITER_NOT_EXTENTS); + bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, u->k.p, + BTREE_ITER_INTENT|BTREE_ITER_NOT_EXTENTS); - BUG_ON(iter->flags & BTREE_ITER_IS_EXTENTS); - ret = bch2_trans_update(trans, iter, u, BTREE_TRIGGER_NORUN) ?: + BUG_ON(iter.flags & BTREE_ITER_IS_EXTENTS); + ret = bch2_btree_iter_traverse(&iter) ?: + bch2_trans_update(trans, &iter, u, BTREE_TRIGGER_NORUN) ?: bch2_trans_commit(trans, NULL, NULL, BTREE_INSERT_NOFAIL| BTREE_INSERT_LAZY_RW); - bch2_trans_iter_put(trans, iter); + bch2_trans_iter_exit(trans, &iter); return ret; } +#endif -static int inode_backpointer_exists(struct btree_trans *trans, - struct bch_inode_unpacked *inode) +static struct bkey_s_c_dirent dirent_get_by_pos(struct btree_trans *trans, + struct btree_iter *iter, + struct bpos pos) { - struct btree_iter *iter; struct bkey_s_c k; int ret; - iter = bch2_trans_get_iter(trans, BTREE_ID_dirents, - POS(inode->bi_dir, inode->bi_dir_offset), 0); + bch2_trans_iter_init(trans, iter, BTREE_ID_dirents, pos, 0); k = bch2_btree_iter_peek_slot(iter); ret = bkey_err(k); + if (!ret && k.k->type != KEY_TYPE_dirent) + ret = -ENOENT; + if (ret) { + bch2_trans_iter_exit(trans, iter); + return (struct bkey_s_c_dirent) { .k = ERR_PTR(ret) }; + } + + return bkey_s_c_to_dirent(k); +} + +static bool inode_points_to_dirent(struct bch_inode_unpacked *inode, + struct bkey_s_c_dirent d) +{ + return inode->bi_dir == d.k->p.inode && + inode->bi_dir_offset == d.k->p.offset; +} + +static bool dirent_points_to_inode(struct bkey_s_c_dirent d, + struct bch_inode_unpacked *inode) +{ + return d.v->d_type == DT_SUBVOL + ? 
le32_to_cpu(d.v->d_child_subvol) == inode->bi_subvol + : le64_to_cpu(d.v->d_inum) == inode->bi_inum; +} + +static int inode_backpointer_exists(struct btree_trans *trans, + struct bch_inode_unpacked *inode, + u32 snapshot) +{ + struct btree_iter iter; + struct bkey_s_c_dirent d; + int ret; + + d = dirent_get_by_pos(trans, &iter, + SPOS(inode->bi_dir, inode->bi_dir_offset, snapshot)); + ret = bkey_err(d.s_c); if (ret) - goto out; - if (k.k->type != KEY_TYPE_dirent) - goto out; + return ret; - ret = le64_to_cpu(bkey_s_c_to_dirent(k).v->d_inum) == inode->bi_inum; -out: - bch2_trans_iter_free(trans, iter); + ret = dirent_points_to_inode(d, inode); + bch2_trans_iter_exit(trans, &iter); return ret; } -static bool inode_backpointer_matches(struct bkey_s_c_dirent d, - struct bch_inode_unpacked *inode) +static int check_i_sectors(struct btree_trans *trans, struct inode_walker *w) { - return d.k->p.inode == inode->bi_dir && - d.k->p.offset == inode->bi_dir_offset; + struct bch_fs *c = trans->c; + struct inode_walker_entry *i; + int ret = 0, ret2 = 0; + s64 count2; + + for (i = w->d; i < w->d + w->nr; i++) { + if (i->inode.bi_sectors == i->count) + continue; + + count2 = lockrestart_do(trans, + bch2_count_inode_sectors(trans, w->cur_inum, i->snapshot)); + + if (i->count != count2) { + bch_err(c, "fsck counted i_sectors wrong: got %llu should be %llu", + i->count, count2); + i->count = count2; + if (i->inode.bi_sectors == i->count) + continue; + } + + if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_I_SECTORS_DIRTY), c, + "inode %llu:%u has incorrect i_sectors: got %llu, should be %llu", + w->cur_inum, i->snapshot, + i->inode.bi_sectors, i->count) == FSCK_ERR_IGNORE) + continue; + + i->inode.bi_sectors = i->count; + ret = write_inode(trans, &i->inode, i->snapshot); + if (ret) + break; + ret2 = -EINTR; + } +fsck_err: + return ret ?: ret2; +} + +static int check_extent(struct btree_trans *trans, struct btree_iter *iter, + struct inode_walker *inode, + struct snapshots_seen *s) +{ + struct bch_fs *c = trans->c; + struct bkey_s_c k; + struct inode_walker_entry *i; + char buf[200]; + int ret = 0; + + k = bch2_btree_iter_peek(iter); + if (!k.k) + return 0; + + ret = bkey_err(k); + if (ret) + return ret; + + ret = check_key_has_snapshot(trans, iter, k); + if (ret) + return ret < 0 ? 
ret : 0; + + ret = snapshots_seen_update(c, s, k.k->p); + if (ret) + return ret; + + if (k.k->type == KEY_TYPE_whiteout) + return 0; + + if (inode->cur_inum != k.k->p.inode) { + ret = check_i_sectors(trans, inode); + if (ret) + return ret; + } +#if 0 + if (bkey_cmp(prev.k->k.p, bkey_start_pos(k.k)) > 0) { + char buf1[200]; + char buf2[200]; + + bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(prev.k)); + bch2_bkey_val_to_text(&PBUF(buf2), c, k); + + if (fsck_err(c, "overlapping extents:\n%s\n%s", buf1, buf2)) + return fix_overlapping_extent(trans, k, prev.k->k.p) ?: -EINTR; + } +#endif + ret = __walk_inode(trans, inode, k.k->p); + if (ret < 0) + return ret; + + if (fsck_err_on(ret == INT_MAX, c, + "extent in missing inode:\n %s", + (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf))) + return bch2_btree_delete_at(trans, iter, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); + + if (ret == INT_MAX) + return 0; + + i = inode->d + ret; + ret = 0; + + if (fsck_err_on(!S_ISREG(i->inode.bi_mode) && + !S_ISLNK(i->inode.bi_mode), c, + "extent in non regular inode mode %o:\n %s", + i->inode.bi_mode, + (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf))) + return bch2_btree_delete_at(trans, iter, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); + + if (!bch2_snapshot_internal_node(c, k.k->p.snapshot)) { + for_each_visible_inode(c, s, inode, k.k->p.snapshot, i) { + if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_I_SIZE_DIRTY) && + k.k->type != KEY_TYPE_reservation && + k.k->p.offset > round_up(i->inode.bi_size, block_bytes(c)) >> 9, c, + "extent type %u offset %llu past end of inode %llu, i_size %llu", + k.k->type, k.k->p.offset, k.k->p.inode, i->inode.bi_size)) { + bch2_fs_lazy_rw(c); + return bch2_btree_delete_range_trans(trans, BTREE_ID_extents, + SPOS(k.k->p.inode, round_up(i->inode.bi_size, block_bytes(c)) >> 9, + k.k->p.snapshot), + POS(k.k->p.inode, U64_MAX), + 0, NULL) ?: -EINTR; + } + } + } + + if (bkey_extent_is_allocation(k.k)) + for_each_visible_inode(c, s, inode, k.k->p.snapshot, i) + i->count += k.k->size; +#if 0 + bch2_bkey_buf_reassemble(&prev, c, k); +#endif + +fsck_err: + return ret; } /* @@ -607,303 +1262,429 @@ noinline_for_stack static int check_extents(struct bch_fs *c) { struct inode_walker w = inode_walker_init(); + struct snapshots_seen s; struct btree_trans trans; - struct btree_iter *iter; - struct bkey_s_c k; - struct bkey_buf prev; - u64 i_sectors = 0; + struct btree_iter iter; int ret = 0; +#if 0 + struct bkey_buf prev; bch2_bkey_buf_init(&prev); prev.k->k = KEY(0, 0, 0); +#endif + snapshots_seen_init(&s); bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); bch_verbose(c, "checking extents"); - iter = bch2_trans_get_iter(&trans, BTREE_ID_extents, - POS(BCACHEFS_ROOT_INO, 0), - BTREE_ITER_INTENT| - BTREE_ITER_PREFETCH); -retry: - while ((k = bch2_btree_iter_peek(iter)).k && - !(ret = bkey_err(k))) { - if (w.have_inode && - w.cur_inum != k.k->p.inode && - !(w.inode.bi_flags & BCH_INODE_I_SECTORS_DIRTY) && - fsck_err_on(w.inode.bi_sectors != i_sectors, c, - "inode %llu has incorrect i_sectors: got %llu, should be %llu", - w.inode.bi_inum, - w.inode.bi_sectors, i_sectors)) { - w.inode.bi_sectors = i_sectors; - - ret = write_inode(&trans, &w.inode, w.snapshot); + bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents, + POS(BCACHEFS_ROOT_INO, 0), + BTREE_ITER_INTENT| + BTREE_ITER_PREFETCH| + BTREE_ITER_ALL_SNAPSHOTS); + + do { + ret = __bch2_trans_do(&trans, NULL, NULL, + BTREE_INSERT_LAZY_RW| + BTREE_INSERT_NOFAIL, + check_extent(&trans, &iter, &w, &s)); + if (ret) + break; + } while 
(bch2_btree_iter_advance(&iter)); + bch2_trans_iter_exit(&trans, &iter); +#if 0 + bch2_bkey_buf_exit(&prev, c); +#endif + inode_walker_exit(&w); + bch2_trans_exit(&trans); + snapshots_seen_exit(&s); + + return ret; +} + +static int check_subdir_count(struct btree_trans *trans, struct inode_walker *w) +{ + struct bch_fs *c = trans->c; + struct inode_walker_entry *i; + int ret = 0, ret2 = 0; + s64 count2; + + for (i = w->d; i < w->d + w->nr; i++) { + if (i->inode.bi_nlink == i->count) + continue; + + count2 = lockrestart_do(trans, + bch2_count_subdirs(trans, w->cur_inum, i->snapshot)); + + if (i->count != count2) { + bch_err(c, "fsck counted subdirectories wrong: got %llu should be %llu", + i->count, count2); + i->count = count2; + if (i->inode.bi_nlink == i->count) + continue; + } + + if (fsck_err_on(i->inode.bi_nlink != i->count, c, + "directory %llu:%u with wrong i_nlink: got %u, should be %llu", + w->cur_inum, i->snapshot, i->inode.bi_nlink, i->count)) { + i->inode.bi_nlink = i->count; + ret = write_inode(trans, &i->inode, i->snapshot); if (ret) break; + ret2 = -EINTR; } + } +fsck_err: + return ret ?: ret2; +} - if (bkey_cmp(prev.k->k.p, bkey_start_pos(k.k)) > 0) { - char buf1[200]; - char buf2[200]; +static int check_dirent_target(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_s_c_dirent d, + struct bch_inode_unpacked *target, + u32 target_snapshot) +{ + struct bch_fs *c = trans->c; + struct bkey_i_dirent *n; + bool backpointer_exists = true; + char buf[200]; + int ret = 0; - bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(prev.k)); - bch2_bkey_val_to_text(&PBUF(buf2), c, k); + if (!target->bi_dir && + !target->bi_dir_offset) { + target->bi_dir = d.k->p.inode; + target->bi_dir_offset = d.k->p.offset; + + ret = __write_inode(trans, target, target_snapshot); + if (ret) + goto err; + } + + if (!inode_points_to_dirent(target, d)) { + ret = inode_backpointer_exists(trans, target, d.k->p.snapshot); + if (ret < 0) + goto err; - if (fsck_err(c, "overlapping extents:\n%s\n%s", buf1, buf2)) - return fix_overlapping_extent(&trans, k, prev.k->k.p) ?: -EINTR; + backpointer_exists = ret; + ret = 0; + + if (fsck_err_on(S_ISDIR(target->bi_mode) && + backpointer_exists, c, + "directory %llu with multiple links", + target->bi_inum)) { + ret = __remove_dirent(trans, d.k->p); + if (ret) + goto err; + return 0; } - ret = walk_inode(&trans, &w, k.k->p.inode); - if (ret) - break; + if (fsck_err_on(backpointer_exists && + !target->bi_nlink, c, + "inode %llu has multiple links but i_nlink 0", + target->bi_inum)) { + target->bi_nlink++; + target->bi_flags &= ~BCH_INODE_UNLINKED; - if (w.first_this_inode) - i_sectors = 0; - - if (fsck_err_on(!w.have_inode, c, - "extent type %u for missing inode %llu", - k.k->type, k.k->p.inode) || - fsck_err_on(w.have_inode && - !S_ISREG(w.inode.bi_mode) && !S_ISLNK(w.inode.bi_mode), c, - "extent type %u for non regular file, inode %llu mode %o", - k.k->type, k.k->p.inode, w.inode.bi_mode)) { - bch2_fs_lazy_rw(c); - return bch2_btree_delete_range_trans(&trans, BTREE_ID_extents, - POS(k.k->p.inode, 0), - POS(k.k->p.inode, U64_MAX), - NULL) ?: -EINTR; + ret = __write_inode(trans, target, target_snapshot); + if (ret) + goto err; } - if (fsck_err_on(w.have_inode && - !(w.inode.bi_flags & BCH_INODE_I_SIZE_DIRTY) && - k.k->type != KEY_TYPE_reservation && - k.k->p.offset > round_up(w.inode.bi_size, block_bytes(c)) >> 9, c, - "extent type %u offset %llu past end of inode %llu, i_size %llu", - k.k->type, k.k->p.offset, k.k->p.inode, w.inode.bi_size)) { - 
bch2_fs_lazy_rw(c); - return bch2_btree_delete_range_trans(&trans, BTREE_ID_extents, - POS(k.k->p.inode, round_up(w.inode.bi_size, block_bytes(c)) >> 9), - POS(k.k->p.inode, U64_MAX), - NULL) ?: -EINTR; + if (fsck_err_on(!backpointer_exists, c, + "inode %llu:%u has wrong backpointer:\n" + "got %llu:%llu\n" + "should be %llu:%llu", + target->bi_inum, target_snapshot, + target->bi_dir, + target->bi_dir_offset, + d.k->p.inode, + d.k->p.offset)) { + target->bi_dir = d.k->p.inode; + target->bi_dir_offset = d.k->p.offset; + + ret = __write_inode(trans, target, target_snapshot); + if (ret) + goto err; } + } - if (bkey_extent_is_allocation(k.k)) - i_sectors += k.k->size; - bch2_bkey_buf_reassemble(&prev, c, k); + if (fsck_err_on(d.v->d_type != inode_d_type(target), c, + "incorrect d_type: got %s, should be %s:\n%s", + bch2_d_type_str(d.v->d_type), + bch2_d_type_str(inode_d_type(target)), + (bch2_bkey_val_to_text(&PBUF(buf), c, d.s_c), buf))) { + n = bch2_trans_kmalloc(trans, bkey_bytes(d.k)); + ret = PTR_ERR_OR_ZERO(n); + if (ret) + return ret; + + bkey_reassemble(&n->k_i, d.s_c); + n->v.d_type = inode_d_type(target); + + ret = bch2_trans_update(trans, iter, &n->k_i, 0); + if (ret) + return ret; - bch2_btree_iter_advance(iter); + d = dirent_i_to_s_c(n); } + + if (d.v->d_type == DT_SUBVOL && + target->bi_parent_subvol != le32_to_cpu(d.v->d_parent_subvol) && + (c->sb.version < bcachefs_metadata_version_subvol_dirent || + fsck_err(c, "dirent has wrong d_parent_subvol field: got %u, should be %u", + le32_to_cpu(d.v->d_parent_subvol), + target->bi_parent_subvol))) { + n = bch2_trans_kmalloc(trans, bkey_bytes(d.k)); + ret = PTR_ERR_OR_ZERO(n); + if (ret) + return ret; + + bkey_reassemble(&n->k_i, d.s_c); + n->v.d_parent_subvol = cpu_to_le32(target->bi_parent_subvol); + + ret = bch2_trans_update(trans, iter, &n->k_i, 0); + if (ret) + return ret; + + d = dirent_i_to_s_c(n); + } +err: fsck_err: - if (ret == -EINTR) - goto retry; - bch2_trans_iter_put(&trans, iter); - bch2_bkey_buf_exit(&prev, c); - return bch2_trans_exit(&trans) ?: ret; + return ret; } -/* - * Walk dirents: verify that they all have a corresponding S_ISDIR inode, - * validate d_type - */ -noinline_for_stack -static int check_dirents(struct bch_fs *c) +static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, + struct bch_hash_info *hash_info, + struct inode_walker *dir, + struct inode_walker *target, + struct snapshots_seen *s) { - struct inode_walker w = inode_walker_init(); - struct bch_hash_info hash_info; - struct btree_trans trans; - struct btree_iter *iter; + struct bch_fs *c = trans->c; struct bkey_s_c k; + struct bkey_s_c_dirent d; + struct inode_walker_entry *i; char buf[200]; - unsigned nr_subdirs = 0; - int ret = 0; + int ret; - bch_verbose(c, "checking dirents"); + k = bch2_btree_iter_peek(iter); + if (!k.k) + return 0; - bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); + ret = bkey_err(k); + if (ret) + return ret; - iter = bch2_trans_get_iter(&trans, BTREE_ID_dirents, - POS(BCACHEFS_ROOT_INO, 0), - BTREE_ITER_INTENT| - BTREE_ITER_PREFETCH); -retry: - while ((k = bch2_btree_iter_peek(iter)).k && - !(ret = bkey_err(k))) { - struct bkey_s_c_dirent d; - struct bch_inode_unpacked target; - u32 target_snapshot; - bool have_target; - bool backpointer_exists = true; - u64 d_inum; - - if (w.have_inode && - w.cur_inum != k.k->p.inode && - fsck_err_on(w.inode.bi_nlink != nr_subdirs, c, - "directory %llu with wrong i_nlink: got %u, should be %u", - w.inode.bi_inum, w.inode.bi_nlink, nr_subdirs)) { - w.inode.bi_nlink = 
nr_subdirs; - ret = write_inode(&trans, &w.inode, w.snapshot); - if (ret) - break; - } + ret = check_key_has_snapshot(trans, iter, k); + if (ret) + return ret < 0 ? ret : 0; + + ret = snapshots_seen_update(c, s, k.k->p); + if (ret) + return ret; - ret = walk_inode(&trans, &w, k.k->p.inode); + if (k.k->type == KEY_TYPE_whiteout) + return 0; + + if (dir->cur_inum != k.k->p.inode) { + ret = check_subdir_count(trans, dir); if (ret) - break; + return ret; + } - if (w.first_this_inode) - nr_subdirs = 0; + ret = __walk_inode(trans, dir, k.k->p); + if (ret < 0) + return ret; - if (fsck_err_on(!w.have_inode, c, - "dirent in nonexisting directory:\n%s", - (bch2_bkey_val_to_text(&PBUF(buf), c, - k), buf)) || - fsck_err_on(!S_ISDIR(w.inode.bi_mode), c, - "dirent in non directory inode type %u:\n%s", - mode_to_type(w.inode.bi_mode), - (bch2_bkey_val_to_text(&PBUF(buf), c, - k), buf))) { - ret = lockrestart_do(&trans, - bch2_btree_delete_at(&trans, iter, 0)); - if (ret) - goto err; - goto next; - } + if (fsck_err_on(ret == INT_MAX, c, + "dirent in nonexisting directory:\n%s", + (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf))) + return bch2_btree_delete_at(trans, iter, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); - if (!w.have_inode) - goto next; + if (ret == INT_MAX) + return 0; - if (w.first_this_inode) - hash_info = bch2_hash_info_init(c, &w.inode); + i = dir->d + ret; + ret = 0; - ret = hash_check_key(&trans, bch2_dirent_hash_desc, - &hash_info, iter, k); - if (ret > 0) { - ret = 0; - goto next; - } - if (ret) - goto fsck_err; + if (fsck_err_on(!S_ISDIR(i->inode.bi_mode), c, + "dirent in non directory inode type %s:\n%s", + bch2_d_type_str(inode_d_type(&i->inode)), + (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf))) + return bch2_btree_delete_at(trans, iter, 0); - if (k.k->type != KEY_TYPE_dirent) - goto next; + if (dir->first_this_inode) + *hash_info = bch2_hash_info_init(c, &dir->d[0].inode); - d = bkey_s_c_to_dirent(k); - d_inum = le64_to_cpu(d.v->d_inum); + ret = hash_check_key(trans, bch2_dirent_hash_desc, + hash_info, iter, k); + if (ret < 0) + return ret; + if (ret) /* dirent has been deleted */ + return 0; + + if (k.k->type != KEY_TYPE_dirent) + return 0; + + d = bkey_s_c_to_dirent(k); - ret = lookup_inode(&trans, d_inum, &target, &target_snapshot); + if (d.v->d_type == DT_SUBVOL) { + struct bch_inode_unpacked subvol_root; + u32 target_subvol = le32_to_cpu(d.v->d_child_subvol); + u32 target_snapshot; + u64 target_inum; + + ret = __subvol_lookup(trans, target_subvol, + &target_snapshot, &target_inum); if (ret && ret != -ENOENT) - break; + return ret; - have_target = !ret; - ret = 0; + if (fsck_err_on(ret, c, + "dirent points to missing subvolume %u", + le32_to_cpu(d.v->d_child_subvol))) + return __remove_dirent(trans, d.k->p); + + ret = __lookup_inode(trans, target_inum, + &subvol_root, &target_snapshot); + if (ret && ret != -ENOENT) + return ret; + + if (fsck_err_on(ret, c, + "subvolume %u points to missing subvolume root %llu", + target_subvol, + target_inum)) { + bch_err(c, "repair not implemented yet"); + return -EINVAL; + } - if (fsck_err_on(!have_target, c, + if (fsck_err_on(subvol_root.bi_subvol != target_subvol, c, + "subvol root %llu has wrong bi_subvol field: got %u, should be %u", + target_inum, + subvol_root.bi_subvol, target_subvol)) { + subvol_root.bi_subvol = target_subvol; + ret = __write_inode(trans, &subvol_root, target_snapshot); + if (ret) + return ret; + } + + ret = check_dirent_target(trans, iter, d, &subvol_root, + target_snapshot); + if (ret) + return ret; + } else { + ret
= __get_visible_inodes(trans, target, s, le64_to_cpu(d.v->d_inum)); + if (ret) + return ret; + + if (fsck_err_on(!target->nr, c, "dirent points to missing inode:\n%s", (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf))) { - ret = remove_dirent(&trans, d.k->p); + ret = __remove_dirent(trans, d.k->p); if (ret) - goto err; - goto next; + return ret; } - if (!have_target) - goto next; - - if (!target.bi_dir && - !target.bi_dir_offset) { - target.bi_dir = k.k->p.inode; - target.bi_dir_offset = k.k->p.offset; - - ret = write_inode(&trans, &target, target_snapshot); + for (i = target->d; i < target->d + target->nr; i++) { + ret = check_dirent_target(trans, iter, d, + &i->inode, i->snapshot); if (ret) - goto err; + return ret; } + } - if (!inode_backpointer_matches(d, &target)) { - ret = inode_backpointer_exists(&trans, &target); - if (ret < 0) - goto err; + if (d.v->d_type == DT_DIR) + for_each_visible_inode(c, s, dir, d.k->p.snapshot, i) + i->count++; - backpointer_exists = ret; - ret = 0; +fsck_err: + return ret; +} - if (fsck_err_on(S_ISDIR(target.bi_mode) && - backpointer_exists, c, - "directory %llu with multiple links", - target.bi_inum)) { - ret = remove_dirent(&trans, d.k->p); - if (ret) - goto err; - continue; - } +/* + * Walk dirents: verify that they all have a corresponding S_ISDIR inode, + * validate d_type + */ +noinline_for_stack +static int check_dirents(struct bch_fs *c) +{ + struct inode_walker dir = inode_walker_init(); + struct inode_walker target = inode_walker_init(); + struct snapshots_seen s; + struct bch_hash_info hash_info; + struct btree_trans trans; + struct btree_iter iter; + int ret = 0; - if (fsck_err_on(backpointer_exists && - !target.bi_nlink, c, - "inode %llu has multiple links but i_nlink 0", - d_inum)) { - target.bi_nlink++; - target.bi_flags &= ~BCH_INODE_UNLINKED; + bch_verbose(c, "checking dirents"); - ret = write_inode(&trans, &target, target_snapshot); - if (ret) - goto err; - } + snapshots_seen_init(&s); + bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); - if (fsck_err_on(!backpointer_exists, c, - "inode %llu has wrong backpointer:\n" - "got %llu:%llu\n" - "should be %llu:%llu", - d_inum, - target.bi_dir, - target.bi_dir_offset, - k.k->p.inode, - k.k->p.offset)) { - target.bi_dir = k.k->p.inode; - target.bi_dir_offset = k.k->p.offset; - - ret = write_inode(&trans, &target, target_snapshot); - if (ret) - goto err; - } - } + bch2_trans_iter_init(&trans, &iter, BTREE_ID_dirents, + POS(BCACHEFS_ROOT_INO, 0), + BTREE_ITER_INTENT| + BTREE_ITER_PREFETCH| + BTREE_ITER_ALL_SNAPSHOTS); - if (fsck_err_on(d.v->d_type != mode_to_type(target.bi_mode), c, - "incorrect d_type: should be %u:\n%s", - mode_to_type(target.bi_mode), - (bch2_bkey_val_to_text(&PBUF(buf), c, - k), buf))) { - struct bkey_i_dirent *n; + do { + ret = __bch2_trans_do(&trans, NULL, NULL, + BTREE_INSERT_LAZY_RW| + BTREE_INSERT_NOFAIL, + check_dirent(&trans, &iter, &hash_info, + &dir, &target, &s)); + if (ret) + break; + } while (bch2_btree_iter_advance(&iter)); + bch2_trans_iter_exit(&trans, &iter); - n = kmalloc(bkey_bytes(d.k), GFP_KERNEL); - if (!n) { - ret = -ENOMEM; - goto err; - } + bch2_trans_exit(&trans); + snapshots_seen_exit(&s); + inode_walker_exit(&dir); + inode_walker_exit(&target); + return ret; +} - bkey_reassemble(&n->k_i, d.s_c); - n->v.d_type = mode_to_type(target.bi_mode); +static int check_xattr(struct btree_trans *trans, struct btree_iter *iter, + struct bch_hash_info *hash_info, + struct inode_walker *inode) +{ + struct bch_fs *c = trans->c; + struct bkey_s_c k; + int ret; - ret = 
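The check_dirents() driver above, like check_xattrs() below, wraps the per-key check in __bch2_trans_do(), which re-runs the callback whenever the btree transaction is restarted. The pattern reduces to a retry loop keyed on the restart code; a standalone sketch, using -EINTR the way this code does to mean "transaction restarted, run again" (trans_do and op are stand-in names, not the kernel helper):

    #include <errno.h>

    /* Re-run op() until it succeeds or fails with a real error:
     * -EINTR signals a restarted transaction and is not an error. */
    static int trans_do(int (*op)(void *), void *arg)
    {
            int ret;

            do {
                    ret = op(arg);
            } while (ret == -EINTR);

            return ret;
    }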
__bch2_trans_do(&trans, NULL, NULL, - BTREE_INSERT_NOFAIL| - BTREE_INSERT_LAZY_RW, - bch2_trans_update(&trans, iter, &n->k_i, 0)); - kfree(n); - if (ret) - goto err; + k = bch2_btree_iter_peek(iter); + if (!k.k) + return 0; - } + ret = bkey_err(k); + if (ret) + return ret; - nr_subdirs += d.v->d_type == DT_DIR; -next: - bch2_btree_iter_advance(iter); - } -err: -fsck_err: - if (ret == -EINTR) - goto retry; + ret = check_key_has_snapshot(trans, iter, k); + if (ret) + return ret; - bch2_trans_iter_put(&trans, iter); - return bch2_trans_exit(&trans) ?: ret; + ret = __walk_inode(trans, inode, k.k->p); + if (ret < 0) + return ret; + + if (fsck_err_on(ret == INT_MAX, c, + "xattr for missing inode %llu", + k.k->p.inode)) + return bch2_btree_delete_at(trans, iter, 0); + + if (ret == INT_MAX) + return 0; + + ret = 0; + + if (inode->first_this_inode) + *hash_info = bch2_hash_info_init(c, &inode->d[0].inode); + + ret = hash_check_key(trans, bch2_xattr_hash_desc, hash_info, iter, k); +fsck_err: + return ret; } /* @@ -912,90 +1693,101 @@ fsck_err: noinline_for_stack static int check_xattrs(struct bch_fs *c) { - struct inode_walker w = inode_walker_init(); + struct inode_walker inode = inode_walker_init(); struct bch_hash_info hash_info; struct btree_trans trans; - struct btree_iter *iter; - struct bkey_s_c k; + struct btree_iter iter; int ret = 0; bch_verbose(c, "checking xattrs"); bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); - iter = bch2_trans_get_iter(&trans, BTREE_ID_xattrs, - POS(BCACHEFS_ROOT_INO, 0), - BTREE_ITER_INTENT| - BTREE_ITER_PREFETCH); -retry: - while ((k = bch2_btree_iter_peek(iter)).k && - !(ret = bkey_err(k))) { - ret = walk_inode(&trans, &w, k.k->p.inode); - if (ret) - break; - - if (fsck_err_on(!w.have_inode, c, - "xattr for missing inode %llu", - k.k->p.inode)) { - ret = bch2_btree_delete_at(&trans, iter, 0); - if (ret) - break; - continue; - } + bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs, + POS(BCACHEFS_ROOT_INO, 0), + BTREE_ITER_INTENT| + BTREE_ITER_PREFETCH| + BTREE_ITER_ALL_SNAPSHOTS); - if (w.first_this_inode && w.have_inode) - hash_info = bch2_hash_info_init(c, &w.inode); - - ret = hash_check_key(&trans, bch2_xattr_hash_desc, - &hash_info, iter, k); + do { + ret = __bch2_trans_do(&trans, NULL, NULL, + BTREE_INSERT_LAZY_RW| + BTREE_INSERT_NOFAIL, + check_xattr(&trans, &iter, &hash_info, + &inode)); if (ret) break; + } while (bch2_btree_iter_advance(&iter)); + bch2_trans_iter_exit(&trans, &iter); - bch2_btree_iter_advance(iter); - } -fsck_err: - if (ret == -EINTR) - goto retry; - - bch2_trans_iter_put(&trans, iter); - return bch2_trans_exit(&trans) ?: ret; + bch2_trans_exit(&trans); + return ret; } -/* Get root directory, create if it doesn't exist: */ -static int check_root(struct bch_fs *c, struct bch_inode_unpacked *root_inode) +static int check_root_trans(struct btree_trans *trans) { - struct bkey_inode_buf packed; + struct bch_fs *c = trans->c; + struct bch_inode_unpacked root_inode; u32 snapshot; + u64 inum; int ret; - bch_verbose(c, "checking root directory"); - - ret = bch2_trans_do(c, NULL, NULL, 0, - lookup_inode(&trans, BCACHEFS_ROOT_INO, root_inode, &snapshot)); + ret = __subvol_lookup(trans, BCACHEFS_ROOT_SUBVOL, &snapshot, &inum); if (ret && ret != -ENOENT) return ret; - if (fsck_err_on(ret, c, "root directory missing")) - goto create_root; + if (mustfix_fsck_err_on(ret, c, "root subvol missing")) { + struct bkey_i_subvolume root_subvol; - if (fsck_err_on(!S_ISDIR(root_inode->bi_mode), c, - "root inode not a directory")) - goto create_root; + snapshot = 
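check_xattr() above recomputes the string-hash seed only on the first key of each inode (inode->first_this_inode), relying on keys arriving in sorted order so that per-inode derived state can be cached across consecutive keys. The memoization in isolation, as a sketch with hypothetical types (derive_seed stands in for bch2_hash_info_init()):

    struct scan_state {
            unsigned long long cur_inum;    /* inode the cached seed belongs to */
            unsigned           hash_seed;   /* derived state for cur_inum */
    };

    unsigned derive_seed(unsigned long long inum);  /* assumed provided */

    static unsigned seed_for_key(struct scan_state *s, unsigned long long inum)
    {
            if (s->cur_inum != inum) {      /* first key of a new inode */
                    s->hash_seed = derive_seed(inum);
                    s->cur_inum  = inum;
            }
            return s->hash_seed;
    }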
U32_MAX; + inum = BCACHEFS_ROOT_INO; - return 0; + bkey_subvolume_init(&root_subvol.k_i); + root_subvol.k.p.offset = BCACHEFS_ROOT_SUBVOL; + root_subvol.v.flags = 0; + root_subvol.v.snapshot = cpu_to_le32(snapshot); + root_subvol.v.inode = cpu_to_le64(inum); + ret = __bch2_trans_do(trans, NULL, NULL, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_LAZY_RW, + __bch2_btree_insert(trans, BTREE_ID_subvolumes, &root_subvol.k_i)); + if (ret) { + bch_err(c, "error writing root subvol: %i", ret); + goto err; + } + + } + + ret = __lookup_inode(trans, BCACHEFS_ROOT_INO, &root_inode, &snapshot); + if (ret && ret != -ENOENT) + return ret; + + if (mustfix_fsck_err_on(ret, c, "root directory missing") || + mustfix_fsck_err_on(!S_ISDIR(root_inode.bi_mode), c, + "root inode not a directory")) { + bch2_inode_init(c, &root_inode, 0, 0, S_IFDIR|0755, + 0, NULL); + root_inode.bi_inum = inum; + + ret = __write_inode(trans, &root_inode, snapshot); + if (ret) + bch_err(c, "error writing root inode: %i", ret); + } +err: fsck_err: return ret; -create_root: - bch2_inode_init(c, root_inode, 0, 0, S_IFDIR|0755, - 0, NULL); - root_inode->bi_inum = BCACHEFS_ROOT_INO; +} - bch2_inode_pack(c, &packed, root_inode); +/* Get root directory, create if it doesn't exist: */ +noinline_for_stack +static int check_root(struct bch_fs *c) +{ + bch_verbose(c, "checking root directory"); - return bch2_btree_insert(c, BTREE_ID_inodes, &packed.inode.k_i, - NULL, NULL, - BTREE_INSERT_NOFAIL| - BTREE_INSERT_LAZY_RW); + return bch2_trans_do(c, NULL, NULL, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_LAZY_RW, + check_root_trans(&trans)); } struct pathbuf { @@ -1004,10 +1796,23 @@ struct pathbuf { struct pathbuf_entry { u64 inum; + u32 snapshot; } *entries; }; -static int path_down(struct pathbuf *p, u64 inum) +static bool path_is_dup(struct pathbuf *p, u64 inum, u32 snapshot) +{ + struct pathbuf_entry *i; + + for (i = p->entries; i < p->entries + p->nr; i++) + if (i->inum == inum && + i->snapshot == snapshot) + return true; + + return false; +} + +static int path_down(struct pathbuf *p, u64 inum, u32 snapshot) { if (p->nr == p->size) { size_t new_size = max_t(size_t, 256UL, p->size * 2); @@ -1023,73 +1828,109 @@ static int path_down(struct pathbuf *p, u64 inum) }; p->entries[p->nr++] = (struct pathbuf_entry) { - .inum = inum, + .inum = inum, + .snapshot = snapshot, }; return 0; } +/* + * Check that a given inode is reachable from the root: + * + * XXX: we should also be verifying that inodes are in the right subvolumes + */ static int check_path(struct btree_trans *trans, struct pathbuf *p, - struct bch_inode_unpacked *inode) + struct bch_inode_unpacked *inode, + u32 snapshot) { struct bch_fs *c = trans->c; - u32 snapshot; - size_t i; int ret = 0; + snapshot = snapshot_t(c, snapshot)->equiv; p->nr = 0; - while (inode->bi_inum != BCACHEFS_ROOT_INO) { + while (!(inode->bi_inum == BCACHEFS_ROOT_INO && + inode->bi_subvol == BCACHEFS_ROOT_SUBVOL)) { + struct btree_iter dirent_iter; + struct bkey_s_c_dirent d; + u32 parent_snapshot = snapshot; + + if (inode->bi_subvol) { + u64 inum; + + ret = subvol_lookup(trans, inode->bi_parent_subvol, + &parent_snapshot, &inum); + if (ret) + break; + } + ret = lockrestart_do(trans, - inode_backpointer_exists(trans, inode)); - if (ret < 0) + PTR_ERR_OR_ZERO((d = dirent_get_by_pos(trans, &dirent_iter, + SPOS(inode->bi_dir, inode->bi_dir_offset, + parent_snapshot))).k)); + if (ret && ret != -ENOENT) break; - if (!ret) { - if (fsck_err(c, "unreachable inode %llu, type %u nlink %u backptr %llu:%llu", - inode->bi_inum, - 
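check_root_trans() applies the same lookup-or-create shape twice: look the object up, treat any error other than -ENOENT as fatal, and on -ENOENT recreate it with default contents (first the root subvolume, then the root inode). The skeleton as a hedged sketch, with stand-in lookup()/create_default() helpers rather than the bcachefs calls:

    #include <errno.h>

    int lookup(unsigned long long id, void *out);          /* assumed */
    int create_default(unsigned long long id, void *out);  /* assumed */

    static int lookup_or_create(unsigned long long id, void *out)
    {
            int ret = lookup(id, out);

            if (ret && ret != -ENOENT)
                    return ret;                     /* real error: propagate */
            if (ret == -ENOENT)
                    ret = create_default(id, out);  /* missing: repair */
            return ret;
    }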
mode_to_type(inode->bi_mode), + if (!ret && !dirent_points_to_inode(d, inode)) { + bch2_trans_iter_exit(trans, &dirent_iter); + ret = -ENOENT; + } + + if (ret == -ENOENT) { + if (fsck_err(c, "unreachable inode %llu:%u, type %s nlink %u backptr %llu:%llu", + inode->bi_inum, snapshot, + bch2_d_type_str(inode_d_type(inode)), inode->bi_nlink, inode->bi_dir, inode->bi_dir_offset)) - ret = reattach_inode(trans, inode); + ret = reattach_inode(trans, inode, snapshot); break; } - ret = 0; + + bch2_trans_iter_exit(trans, &dirent_iter); if (!S_ISDIR(inode->bi_mode)) break; - ret = path_down(p, inode->bi_inum); + ret = path_down(p, inode->bi_inum, snapshot); if (ret) { bch_err(c, "memory allocation failure"); return ret; } - for (i = 0; i < p->nr; i++) { - if (inode->bi_dir != p->entries[i].inum) - continue; + snapshot = parent_snapshot; + + ret = lookup_inode(trans, inode->bi_dir, inode, &snapshot); + if (ret) { + /* Should have been caught in dirents pass */ + bch_err(c, "error looking up parent directory: %i", ret); + break; + } + + if (path_is_dup(p, inode->bi_inum, snapshot)) { + struct pathbuf_entry *i; /* XXX print path */ + bch_err(c, "directory structure loop"); + + for (i = p->entries; i < p->entries + p->nr; i++) + pr_err("%llu:%u", i->inum, i->snapshot); + pr_err("%llu:%u", inode->bi_inum, snapshot); + if (!fsck_err(c, "directory structure loop")) return 0; - ret = lockrestart_do(trans, - remove_backpointer(trans, inode)); + ret = __bch2_trans_do(trans, NULL, NULL, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_LAZY_RW, + remove_backpointer(trans, inode)); if (ret) { bch_err(c, "error removing dirent: %i", ret); break; } - ret = reattach_inode(trans, inode); - break; - } - - ret = lookup_inode(trans, inode->bi_dir, inode, &snapshot); - if (ret) { - /* Should have been caught in dirents pass */ - bch_err(c, "error looking up parent directory: %i", ret); - break; + ret = reattach_inode(trans, inode, snapshot); } } fsck_err: @@ -1103,10 +1944,11 @@ fsck_err: * After check_dirents(), if an inode backpointer doesn't exist that means it's * unreachable: */ +noinline_for_stack static int check_directory_structure(struct bch_fs *c) { struct btree_trans trans; - struct btree_iter *iter; + struct btree_iter iter; struct bkey_s_c k; struct bch_inode_unpacked u; struct pathbuf path = { 0, 0, NULL }; @@ -1116,28 +1958,33 @@ static int check_directory_structure(struct bch_fs *c) for_each_btree_key(&trans, iter, BTREE_ID_inodes, POS_MIN, BTREE_ITER_INTENT| - BTREE_ITER_PREFETCH, k, ret) { - if (k.k->type != KEY_TYPE_inode) + BTREE_ITER_PREFETCH| + BTREE_ITER_ALL_SNAPSHOTS, k, ret) { + if (!bkey_is_inode(k.k)) continue; - ret = bch2_inode_unpack(bkey_s_c_to_inode(k), &u); + ret = bch2_inode_unpack(k, &u); if (ret) { /* Should have been caught earlier in fsck: */ bch_err(c, "error unpacking inode %llu: %i", k.k->p.offset, ret); break; } - ret = check_path(&trans, &path, &u); + if (u.bi_flags & BCH_INODE_UNLINKED) + continue; + + ret = check_path(&trans, &path, &u, iter.pos.snapshot); if (ret) break; } - bch2_trans_iter_put(&trans, iter); + bch2_trans_iter_exit(&trans, &iter); BUG_ON(ret == -EINTR); kfree(path.entries); - return bch2_trans_exit(&trans) ?: ret; + bch2_trans_exit(&trans); + return ret; } struct nlink_table { @@ -1185,8 +2032,9 @@ static int nlink_cmp(const void *_l, const void *_r) return cmp_int(l->inum, r->inum) ?: cmp_int(l->snapshot, r->snapshot); } -static void inc_link(struct bch_fs *c, struct nlink_table *links, - u64 range_start, u64 range_end, u64 inum) +static void inc_link(struct bch_fs *c, 
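check_path() climbs from an inode toward the root via bi_dir backpointers, recording every (inum, snapshot) pair it visits; seeing a pair twice proves a loop in the directory graph. The detection is the linear scan used above, reproduced here as a self-contained sketch:

    #include <stdbool.h>
    #include <stddef.h>

    struct visited { unsigned long long inum; unsigned snapshot; };

    /* True if (inum, snapshot) already occurs in the first n entries. */
    static bool path_is_dup(const struct visited *v, size_t n,
                            unsigned long long inum, unsigned snapshot)
    {
            for (size_t i = 0; i < n; i++)
                    if (v[i].inum == inum && v[i].snapshot == snapshot)
                            return true;
            return false;
    }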
struct snapshots_seen *s, + struct nlink_table *links, + u64 range_start, u64 range_end, u64 inum, u32 snapshot) { struct nlink *link, key = { .inum = inum, .snapshot = U32_MAX, @@ -1195,10 +2043,20 @@ static void inc_link(struct bch_fs *c, struct nlink_table *links, if (inum < range_start || inum >= range_end) return; - link = bsearch(&key, links->d, links->nr, - sizeof(links->d[0]), nlink_cmp); - if (link) - link->count++; + link = __inline_bsearch(&key, links->d, links->nr, + sizeof(links->d[0]), nlink_cmp); + if (!link) + return; + + while (link > links->d && link[0].inum == link[-1].inum) + --link; + + for (; link < links->d + links->nr && link->inum == inum; link++) + if (ref_visible(c, s, snapshot, link->snapshot)) { + link->count++; + if (link->snapshot >= snapshot) + break; + } } noinline_for_stack @@ -1207,9 +2065,8 @@ static int check_nlinks_find_hardlinks(struct bch_fs *c, u64 start, u64 *end) { struct btree_trans trans; - struct btree_iter *iter; + struct btree_iter iter; struct bkey_s_c k; - struct bkey_s_c_inode inode; struct bch_inode_unpacked u; int ret = 0; @@ -1218,22 +2075,21 @@ static int check_nlinks_find_hardlinks(struct bch_fs *c, for_each_btree_key(&trans, iter, BTREE_ID_inodes, POS(0, start), BTREE_ITER_INTENT| - BTREE_ITER_PREFETCH, k, ret) { - if (k.k->type != KEY_TYPE_inode) + BTREE_ITER_PREFETCH| + BTREE_ITER_ALL_SNAPSHOTS, k, ret) { + if (!bkey_is_inode(k.k)) continue; - inode = bkey_s_c_to_inode(k); + /* Should never fail, checked by bch2_inode_invalid: */ + BUG_ON(bch2_inode_unpack(k, &u)); /* * Backpointer and directory structure checks are sufficient for * directories, since they can't have hardlinks: */ - if (S_ISDIR(le16_to_cpu(inode.v->bi_mode))) + if (S_ISDIR(le16_to_cpu(u.bi_mode))) continue; - /* Should never fail, checked by bch2_inode_invalid: */ - BUG_ON(bch2_inode_unpack(inode, &u)); - if (!u.bi_nlink) continue; @@ -1245,7 +2101,7 @@ static int check_nlinks_find_hardlinks(struct bch_fs *c, } } - bch2_trans_iter_put(&trans, iter); + bch2_trans_iter_exit(&trans, &iter); bch2_trans_exit(&trans); if (ret) @@ -1259,34 +2115,43 @@ static int check_nlinks_walk_dirents(struct bch_fs *c, struct nlink_table *links u64 range_start, u64 range_end) { struct btree_trans trans; - struct btree_iter *iter; + struct snapshots_seen s; + struct btree_iter iter; struct bkey_s_c k; struct bkey_s_c_dirent d; int ret; + snapshots_seen_init(&s); + bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); for_each_btree_key(&trans, iter, BTREE_ID_dirents, POS_MIN, BTREE_ITER_INTENT| - BTREE_ITER_PREFETCH, k, ret) { + BTREE_ITER_PREFETCH| + BTREE_ITER_ALL_SNAPSHOTS, k, ret) { + ret = snapshots_seen_update(c, &s, k.k->p); + if (ret) + break; + switch (k.k->type) { case KEY_TYPE_dirent: d = bkey_s_c_to_dirent(k); - if (d.v->d_type != DT_DIR) - inc_link(c, links, range_start, range_end, - le64_to_cpu(d.v->d_inum)); + if (d.v->d_type != DT_DIR && + d.v->d_type != DT_SUBVOL) + inc_link(c, &s, links, range_start, range_end, + le64_to_cpu(d.v->d_inum), + d.k->p.snapshot); break; } - - bch2_trans_cond_resched(&trans); } - bch2_trans_iter_put(&trans, iter); + bch2_trans_iter_exit(&trans, &iter); - ret = bch2_trans_exit(&trans) ?: ret; if (ret) bch_err(c, "error in fsck: btree error %i while walking dirents", ret); + bch2_trans_exit(&trans); + snapshots_seen_exit(&s); return ret; } @@ -1296,9 +2161,8 @@ static int check_nlinks_update_hardlinks(struct bch_fs *c, u64 range_start, u64 range_end) { struct btree_trans trans; - struct btree_iter *iter; + struct btree_iter iter; struct bkey_s_c k; - 
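With snapshots, one inode number can occupy several consecutive slots in the sorted nlink table, so inc_link() can no longer trust bsearch() to land on any particular duplicate: it rewinds to the first entry with that inode number and then walks forward over all of them. The same technique in standalone form (snapshot visibility filtering elided):

    #include <stdlib.h>

    struct nlink { unsigned long long inum; unsigned snapshot; unsigned count; };

    static int nlink_cmp(const void *_l, const void *_r)
    {
            const struct nlink *l = _l, *r = _r;
            return l->inum < r->inum ? -1 : l->inum > r->inum;
    }

    static void inc_link(struct nlink *d, size_t nr, unsigned long long inum)
    {
            struct nlink key = { .inum = inum };
            struct nlink *link = bsearch(&key, d, nr, sizeof(*d), nlink_cmp);

            if (!link)
                    return;

            while (link > d && link[-1].inum == inum)       /* rewind to first */
                    --link;
            for (; link < d + nr && link->inum == inum; link++)
                    link->count++;                          /* visit every dup */
    }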
struct bkey_s_c_inode inode; struct bch_inode_unpacked u; struct nlink *link = links->d; int ret = 0; @@ -1308,23 +2172,24 @@ static int check_nlinks_update_hardlinks(struct bch_fs *c, for_each_btree_key(&trans, iter, BTREE_ID_inodes, POS(0, range_start), BTREE_ITER_INTENT| - BTREE_ITER_PREFETCH, k, ret) { + BTREE_ITER_PREFETCH| + BTREE_ITER_ALL_SNAPSHOTS, k, ret) { if (k.k->p.offset >= range_end) break; - if (k.k->type != KEY_TYPE_inode) + if (!bkey_is_inode(k.k)) continue; - inode = bkey_s_c_to_inode(k); - if (S_ISDIR(le16_to_cpu(inode.v->bi_mode))) - continue; + BUG_ON(bch2_inode_unpack(k, &u)); - BUG_ON(bch2_inode_unpack(inode, &u)); + if (S_ISDIR(le16_to_cpu(u.bi_mode))) + continue; if (!u.bi_nlink) continue; - while (link->inum < k.k->p.offset) { + while ((cmp_int(link->inum, k.k->p.offset) ?: + cmp_int(link->snapshot, k.k->p.snapshot)) < 0) { link++; BUG_ON(link >= links->d + links->nr); } @@ -1335,16 +2200,13 @@ static int check_nlinks_update_hardlinks(struct bch_fs *c, bch2_inode_nlink_get(&u), link->count)) { bch2_inode_nlink_set(&u, link->count); - ret = __bch2_trans_do(&trans, NULL, NULL, - BTREE_INSERT_NOFAIL| - BTREE_INSERT_LAZY_RW, - bch2_inode_write(&trans, iter, &u)); + ret = write_inode(&trans, &u, k.k->p.snapshot); if (ret) bch_err(c, "error in fsck: error %i updating inode", ret); } } fsck_err: - bch2_trans_iter_put(&trans, iter); + bch2_trans_iter_exit(&trans, &iter); bch2_trans_exit(&trans); if (ret) @@ -1390,21 +2252,91 @@ static int check_nlinks(struct bch_fs *c) return ret; } +static int fix_reflink_p_key(struct btree_trans *trans, struct btree_iter *iter) +{ + struct bkey_s_c k; + struct bkey_s_c_reflink_p p; + struct bkey_i_reflink_p *u; + int ret; + + k = bch2_btree_iter_peek(iter); + if (!k.k) + return 0; + + ret = bkey_err(k); + if (ret) + return ret; + + if (k.k->type != KEY_TYPE_reflink_p) + return 0; + + p = bkey_s_c_to_reflink_p(k); + + if (!p.v->front_pad && !p.v->back_pad) + return 0; + + u = bch2_trans_kmalloc(trans, sizeof(*u)); + ret = PTR_ERR_OR_ZERO(u); + if (ret) + return ret; + + bkey_reassemble(&u->k_i, k); + u->v.front_pad = 0; + u->v.back_pad = 0; + + return bch2_trans_update(trans, iter, &u->k_i, BTREE_TRIGGER_NORUN); +} + +noinline_for_stack +static int fix_reflink_p(struct bch_fs *c) +{ + struct btree_trans trans; + struct btree_iter iter; + struct bkey_s_c k; + int ret; + + if (c->sb.version >= bcachefs_metadata_version_reflink_p_fix) + return 0; + + bch_verbose(c, "fixing reflink_p keys"); + + bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); + + for_each_btree_key(&trans, iter, BTREE_ID_extents, POS_MIN, + BTREE_ITER_INTENT| + BTREE_ITER_PREFETCH| + BTREE_ITER_ALL_SNAPSHOTS, k, ret) { + if (k.k->type == KEY_TYPE_reflink_p) { + ret = __bch2_trans_do(&trans, NULL, NULL, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_LAZY_RW, + fix_reflink_p_key(&trans, &iter)); + if (ret) + break; + } + } + bch2_trans_iter_exit(&trans, &iter); + + bch2_trans_exit(&trans); + return ret; +} + /* * Checks for inconsistencies that shouldn't happen, unless we have a bug. 
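check_nlinks_update_hardlinks() advances through the link table in lockstep with the sorted inode scan, a classic merge join: the cursor moves while the table position is less than the scan position, comparing inum first and snapshot second exactly as the cmp_int() chain above expresses. The cursor step in isolation, as a sketch:

    struct pos { unsigned long long inum; unsigned snapshot; };

    static int pos_cmp(struct pos a, struct pos b)
    {
            if (a.inum != b.inum)
                    return a.inum < b.inum ? -1 : 1;
            if (a.snapshot != b.snapshot)
                    return a.snapshot < b.snapshot ? -1 : 1;
            return 0;
    }

    /* Advance cursor to the first table entry >= the scan position. */
    static struct pos *advance_to(struct pos *cur, struct pos *end, struct pos at)
    {
            while (cur < end && pos_cmp(*cur, at) < 0)
                    cur++;
            return cur;
    }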
* Doesn't fix them yet, mainly because they haven't yet been observed: */ int bch2_fsck_full(struct bch_fs *c) { - struct bch_inode_unpacked root_inode; - - return check_inodes(c, true) ?: + return bch2_fs_snapshots_check(c) ?: + check_inodes(c, true) ?: + check_subvols(c) ?: check_extents(c) ?: check_dirents(c) ?: check_xattrs(c) ?: - check_root(c, &root_inode) ?: + check_root(c) ?: check_directory_structure(c) ?: - check_nlinks(c); + check_nlinks(c) ?: + fix_reflink_p(c); } int bch2_fsck_walk_inodes_only(struct bch_fs *c) diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index 17d8eb5223cd..ffce68a80490 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -6,8 +6,10 @@ #include "btree_update.h" #include "error.h" #include "extents.h" +#include "extent_update.h" #include "inode.h" #include "str_hash.h" +#include "subvolume.h" #include "varint.h" #include <linux/random.h> @@ -33,29 +35,6 @@ static const u8 bits_table[8] = { 13 * 8 - 8, }; -static int inode_encode_field(u8 *out, u8 *end, u64 hi, u64 lo) -{ - __be64 in[2] = { cpu_to_be64(hi), cpu_to_be64(lo), }; - unsigned shift, bytes, bits = likely(!hi) - ? fls64(lo) - : fls64(hi) + 64; - - for (shift = 1; shift <= 8; shift++) - if (bits < bits_table[shift - 1]) - goto got_shift; - - BUG(); -got_shift: - bytes = byte_table[shift - 1]; - - BUG_ON(out + bytes > end); - - memcpy(out, (u8 *) in + 16 - bytes, bytes); - *out |= (1 << 8) >> shift; - - return bytes; -} - static int inode_decode_field(const u8 *in, const u8 *end, u64 out[2], unsigned *out_bits) { @@ -90,42 +69,11 @@ static int inode_decode_field(const u8 *in, const u8 *end, return bytes; } -static noinline void bch2_inode_pack_v1(struct bkey_inode_buf *packed, - const struct bch_inode_unpacked *inode) -{ - struct bkey_i_inode *k = &packed->inode; - u8 *out = k->v.fields; - u8 *end = (void *) &packed[1]; - u8 *last_nonzero_field = out; - unsigned nr_fields = 0, last_nonzero_fieldnr = 0; - unsigned bytes; - -#define x(_name, _bits) \ - out += inode_encode_field(out, end, 0, inode->_name); \ - nr_fields++; \ - \ - if (inode->_name) { \ - last_nonzero_field = out; \ - last_nonzero_fieldnr = nr_fields; \ - } - - BCH_INODE_FIELDS() -#undef x - - out = last_nonzero_field; - nr_fields = last_nonzero_fieldnr; - - bytes = out - (u8 *) &packed->inode.v; - set_bkey_val_bytes(&packed->inode.k, bytes); - memset_u64s_tail(&packed->inode.v, 0, bytes); - - SET_INODE_NR_FIELDS(&k->v, nr_fields); -} - -static void bch2_inode_pack_v2(struct bkey_inode_buf *packed, - const struct bch_inode_unpacked *inode) +void bch2_inode_pack(struct bch_fs *c, + struct bkey_inode_buf *packed, + const struct bch_inode_unpacked *inode) { - struct bkey_i_inode *k = &packed->inode; + struct bkey_i_inode_v2 *k = &packed->inode; u8 *out = k->v.fields; u8 *end = (void *) &packed[1]; u8 *last_nonzero_field = out; @@ -133,11 +81,19 @@ static void bch2_inode_pack_v2(struct bkey_inode_buf *packed, unsigned bytes; int ret; + bkey_inode_v2_init(&packed->inode.k_i); + packed->inode.k.p.offset = inode->bi_inum; + packed->inode.v.bi_journal_seq = cpu_to_le64(inode->bi_journal_seq); + packed->inode.v.bi_hash_seed = inode->bi_hash_seed; + packed->inode.v.bi_flags = cpu_to_le64(inode->bi_flags); + packed->inode.v.bi_flags = cpu_to_le64(inode->bi_flags); + packed->inode.v.bi_mode = cpu_to_le16(inode->bi_mode); + #define x(_name, _bits) \ nr_fields++; \ \ if (inode->_name) { \ - ret = bch2_varint_encode(out, inode->_name); \ + ret = bch2_varint_encode_fast(out, inode->_name); \ out += ret; \ \ if (_bits > 64) \ @@ -163,30 
+119,12 @@ static void bch2_inode_pack_v2(struct bkey_inode_buf *packed, set_bkey_val_bytes(&packed->inode.k, bytes); memset_u64s_tail(&packed->inode.v, 0, bytes); - SET_INODE_NR_FIELDS(&k->v, nr_fields); -} - -void bch2_inode_pack(struct bch_fs *c, - struct bkey_inode_buf *packed, - const struct bch_inode_unpacked *inode) -{ - bkey_inode_init(&packed->inode.k_i); - packed->inode.k.p.offset = inode->bi_inum; - packed->inode.v.bi_hash_seed = inode->bi_hash_seed; - packed->inode.v.bi_flags = cpu_to_le32(inode->bi_flags); - packed->inode.v.bi_mode = cpu_to_le16(inode->bi_mode); - - if (c->sb.features & (1ULL << BCH_FEATURE_new_varint)) { - SET_INODE_NEW_VARINT(&packed->inode.v, true); - bch2_inode_pack_v2(packed, inode); - } else { - bch2_inode_pack_v1(packed, inode); - } + SET_INODEv2_NR_FIELDS(&k->v, nr_fields); if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) { struct bch_inode_unpacked unpacked; - int ret = bch2_inode_unpack(inode_i_to_s_c(&packed->inode), + int ret = bch2_inode_unpack(bkey_i_to_s_c(&packed->inode.k_i), &unpacked); BUG_ON(ret); BUG_ON(unpacked.bi_inum != inode->bi_inum); @@ -235,24 +173,23 @@ static noinline int bch2_inode_unpack_v1(struct bkey_s_c_inode inode, return 0; } -static int bch2_inode_unpack_v2(struct bkey_s_c_inode inode, - struct bch_inode_unpacked *unpacked) +static int bch2_inode_unpack_v2(struct bch_inode_unpacked *unpacked, + const u8 *in, const u8 *end, + unsigned nr_fields) { - const u8 *in = inode.v->fields; - const u8 *end = bkey_val_end(inode); unsigned fieldnr = 0; int ret; u64 v[2]; #define x(_name, _bits) \ - if (fieldnr < INODE_NR_FIELDS(inode.v)) { \ - ret = bch2_varint_decode(in, end, &v[0]); \ + if (fieldnr < nr_fields) { \ + ret = bch2_varint_decode_fast(in, end, &v[0]); \ if (ret < 0) \ return ret; \ in += ret; \ \ if (_bits > 64) { \ - ret = bch2_varint_decode(in, end, &v[1]); \ + ret = bch2_varint_decode_fast(in, end, &v[1]); \ if (ret < 0) \ return ret; \ in += ret; \ @@ -275,50 +212,81 @@ static int bch2_inode_unpack_v2(struct bkey_s_c_inode inode, return 0; } -int bch2_inode_unpack(struct bkey_s_c_inode inode, +int bch2_inode_unpack(struct bkey_s_c k, struct bch_inode_unpacked *unpacked) { - unpacked->bi_inum = inode.k->p.offset; - unpacked->bi_hash_seed = inode.v->bi_hash_seed; - unpacked->bi_flags = le32_to_cpu(inode.v->bi_flags); - unpacked->bi_mode = le16_to_cpu(inode.v->bi_mode); + switch (k.k->type) { + case KEY_TYPE_inode: { + struct bkey_s_c_inode inode = bkey_s_c_to_inode(k); - if (INODE_NEW_VARINT(inode.v)) { - return bch2_inode_unpack_v2(inode, unpacked); - } else { - return bch2_inode_unpack_v1(inode, unpacked); + unpacked->bi_inum = inode.k->p.offset; + unpacked->bi_journal_seq= 0; + unpacked->bi_hash_seed = inode.v->bi_hash_seed; + unpacked->bi_flags = le32_to_cpu(inode.v->bi_flags); + unpacked->bi_mode = le16_to_cpu(inode.v->bi_mode); + + if (INODE_NEW_VARINT(inode.v)) { + return bch2_inode_unpack_v2(unpacked, inode.v->fields, + bkey_val_end(inode), + INODE_NR_FIELDS(inode.v)); + } else { + return bch2_inode_unpack_v1(inode, unpacked); + } + break; + } + case KEY_TYPE_inode_v2: { + struct bkey_s_c_inode_v2 inode = bkey_s_c_to_inode_v2(k); + + unpacked->bi_inum = inode.k->p.offset; + unpacked->bi_journal_seq= le64_to_cpu(inode.v->bi_journal_seq); + unpacked->bi_hash_seed = inode.v->bi_hash_seed; + unpacked->bi_flags = le64_to_cpu(inode.v->bi_flags); + unpacked->bi_mode = le16_to_cpu(inode.v->bi_mode); + + return bch2_inode_unpack_v2(unpacked, inode.v->fields, + bkey_val_end(inode), + INODEv2_NR_FIELDS(inode.v)); + } + default: + 
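The pack/unpack paths above store each inode field as a variable-length integer so small or zero values stay compact, and trailing all-zero fields are dropped entirely via the last_nonzero_field bookkeeping. For orientation only, here is a generic continuation-bit varint; this is NOT the bch2_varint_encode_fast()/decode_fast() on-disk format, just the general idea:

    #include <stdint.h>

    /* 7 payload bits per byte; MSB set means "more bytes follow". */
    static int varint_encode(uint8_t *out, uint64_t v)
    {
            int bytes = 0;
            do {
                    uint8_t b = v & 0x7f;
                    v >>= 7;
                    out[bytes++] = b | (v ? 0x80 : 0);
            } while (v);
            return bytes;
    }

    /* Returns bytes consumed, or -1 on truncated/oversized input. */
    static int varint_decode(const uint8_t *in, const uint8_t *end, uint64_t *v)
    {
            uint64_t r = 0;
            int shift = 0, bytes = 0;

            while (in + bytes < end && shift < 64) {
                    uint8_t b = in[bytes++];
                    r |= (uint64_t)(b & 0x7f) << shift;
                    shift += 7;
                    if (!(b & 0x80)) {
                            *v = r;
                            return bytes;
                    }
            }
            return -1;
    }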
BUG(); } - - return 0; } -struct btree_iter *bch2_inode_peek(struct btree_trans *trans, - struct bch_inode_unpacked *inode, - u64 inum, unsigned flags) +int bch2_inode_peek(struct btree_trans *trans, + struct btree_iter *iter, + struct bch_inode_unpacked *inode, + subvol_inum inum, unsigned flags) { - struct btree_iter *iter; struct bkey_s_c k; + u32 snapshot; int ret; - iter = bch2_trans_get_iter(trans, BTREE_ID_inodes, POS(0, inum), - BTREE_ITER_CACHED|flags); - k = bch2_btree_iter_peek_cached(iter); + if (0 && trans->c->opts.inodes_use_key_cache) + flags |= BTREE_ITER_CACHED; + + ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); + if (ret) + return ret; + + bch2_trans_iter_init(trans, iter, BTREE_ID_inodes, + SPOS(0, inum.inum, snapshot), flags); + k = bch2_btree_iter_peek_slot(iter); ret = bkey_err(k); if (ret) goto err; - ret = k.k->type == KEY_TYPE_inode ? 0 : -ENOENT; + ret = bkey_is_inode(k.k) ? 0 : -ENOENT; if (ret) goto err; - ret = bch2_inode_unpack(bkey_s_c_to_inode(k), inode); + ret = bch2_inode_unpack(k, inode); if (ret) goto err; - return iter; + return 0; err: - bch2_trans_iter_put(trans, iter); - return ERR_PTR(ret); + bch2_trans_iter_exit(trans, iter); + return ret; } int bch2_inode_write(struct btree_trans *trans, @@ -338,8 +306,8 @@ int bch2_inode_write(struct btree_trans *trans, const char *bch2_inode_invalid(const struct bch_fs *c, struct bkey_s_c k) { - struct bkey_s_c_inode inode = bkey_s_c_to_inode(k); - struct bch_inode_unpacked unpacked; + struct bkey_s_c_inode inode = bkey_s_c_to_inode(k); + struct bch_inode_unpacked unpacked; if (k.k->p.inode) return "nonzero k.p.inode"; @@ -353,7 +321,7 @@ const char *bch2_inode_invalid(const struct bch_fs *c, struct bkey_s_c k) if (INODE_STR_HASH(inode.v) >= BCH_STR_HASH_NR) return "invalid str hash type"; - if (bch2_inode_unpack(inode, &unpacked)) + if (bch2_inode_unpack(k, &unpacked)) return "invalid variable length fields"; if (unpacked.bi_data_checksum >= BCH_CSUM_OPT_NR + 1) @@ -366,28 +334,79 @@ const char *bch2_inode_invalid(const struct bch_fs *c, struct bkey_s_c k) unpacked.bi_nlink != 0) return "flagged as unlinked but bi_nlink != 0"; + if (unpacked.bi_subvol && !S_ISDIR(unpacked.bi_mode)) + return "subvolume root but not a directory"; + return NULL; } -void bch2_inode_to_text(struct printbuf *out, struct bch_fs *c, - struct bkey_s_c k) +const char *bch2_inode_v2_invalid(const struct bch_fs *c, struct bkey_s_c k) { - struct bkey_s_c_inode inode = bkey_s_c_to_inode(k); + struct bkey_s_c_inode_v2 inode = bkey_s_c_to_inode_v2(k); struct bch_inode_unpacked unpacked; - if (bch2_inode_unpack(inode, &unpacked)) { - pr_buf(out, "(unpack error)"); - return; - } + if (k.k->p.inode) + return "nonzero k.p.inode"; + + if (bkey_val_bytes(k.k) < sizeof(struct bch_inode)) + return "incorrect value size"; + + if (k.k->p.offset < BLOCKDEV_INODE_MAX) + return "fs inode in blockdev range"; - pr_buf(out, "mode: %o ", unpacked.bi_mode); + if (INODEv2_STR_HASH(inode.v) >= BCH_STR_HASH_NR) + return "invalid str hash type"; + + if (bch2_inode_unpack(k, &unpacked)) + return "invalid variable length fields"; + + if (unpacked.bi_data_checksum >= BCH_CSUM_OPT_NR + 1) + return "invalid data checksum type"; + + if (unpacked.bi_compression >= BCH_COMPRESSION_OPT_NR + 1) + return "invalid data checksum type"; + + if ((unpacked.bi_flags & BCH_INODE_UNLINKED) && + unpacked.bi_nlink != 0) + return "flagged as unlinked but bi_nlink != 0"; + + if (unpacked.bi_subvol && !S_ISDIR(unpacked.bi_mode)) + return "subvolume root but not a 
directory"; + + return NULL; +} + +static void __bch2_inode_unpacked_to_text(struct printbuf *out, struct bch_inode_unpacked *inode) +{ + pr_buf(out, "mode %o flags %x journal_seq %llu", + inode->bi_mode, inode->bi_flags, + inode->bi_journal_seq); #define x(_name, _bits) \ - pr_buf(out, #_name ": %llu ", (u64) unpacked._name); + pr_buf(out, " "#_name " %llu", (u64) inode->_name); BCH_INODE_FIELDS() #undef x } +void bch2_inode_unpacked_to_text(struct printbuf *out, struct bch_inode_unpacked *inode) +{ + pr_buf(out, "inum: %llu ", inode->bi_inum); + __bch2_inode_unpacked_to_text(out, inode); +} + +void bch2_inode_to_text(struct printbuf *out, struct bch_fs *c, + struct bkey_s_c k) +{ + struct bch_inode_unpacked inode; + + if (bch2_inode_unpack(k, &inode)) { + pr_buf(out, "(unpack error)"); + return; + } + + __bch2_inode_unpacked_to_text(out, &inode); +} + const char *bch2_inode_generation_invalid(const struct bch_fs *c, struct bkey_s_c k) { @@ -461,6 +480,7 @@ static inline u32 bkey_generation(struct bkey_s_c k) { switch (k.k->type) { case KEY_TYPE_inode: + case KEY_TYPE_inode_v2: BUG(); case KEY_TYPE_inode_generation: return le32_to_cpu(bkey_s_c_to_inode_generation(k).v->bi_generation); @@ -469,12 +489,15 @@ static inline u32 bkey_generation(struct bkey_s_c k) } } -struct btree_iter *bch2_inode_create(struct btree_trans *trans, - struct bch_inode_unpacked *inode_u, - u32 snapshot, u64 cpu) +/* + * This just finds an empty slot: + */ +int bch2_inode_create(struct btree_trans *trans, + struct btree_iter *iter, + struct bch_inode_unpacked *inode_u, + u32 snapshot, u64 cpu) { struct bch_fs *c = trans->c; - struct btree_iter *iter = NULL; struct bkey_s_c k; u64 min, max, start, pos, *hint; int ret = 0; @@ -500,9 +523,9 @@ struct btree_iter *bch2_inode_create(struct btree_trans *trans, start = min; pos = start; - iter = bch2_trans_get_iter(trans, BTREE_ID_inodes, POS(0, pos), - BTREE_ITER_ALL_SNAPSHOTS| - BTREE_ITER_INTENT); + bch2_trans_iter_init(trans, iter, BTREE_ID_inodes, POS(0, pos), + BTREE_ITER_ALL_SNAPSHOTS| + BTREE_ITER_INTENT); again: while ((k = bch2_btree_iter_peek(iter)).k && !(ret = bkey_err(k)) && @@ -515,9 +538,9 @@ again: } if (k.k->p.snapshot == snapshot && - k.k->type != KEY_TYPE_inode && + !bkey_is_inode(k.k) && !bch2_btree_key_cache_find(c, BTREE_ID_inodes, SPOS(0, pos, snapshot))) { - bch2_btree_iter_next(iter); + bch2_btree_iter_advance(iter); continue; } @@ -540,8 +563,8 @@ again: ret = -ENOSPC; if (ret) { - bch2_trans_iter_put(trans, iter); - return ERR_PTR(ret); + bch2_trans_iter_exit(trans, iter); + return ret; } /* Retry from start */ @@ -553,32 +576,94 @@ found_slot: k = bch2_btree_iter_peek_slot(iter); ret = bkey_err(k); if (ret) { - bch2_trans_iter_put(trans, iter); - return ERR_PTR(ret); + bch2_trans_iter_exit(trans, iter); + return ret; } /* We may have raced while the iterator wasn't pointing at pos: */ - if (k.k->type == KEY_TYPE_inode || + if (bkey_is_inode(k.k) || bch2_btree_key_cache_find(c, BTREE_ID_inodes, k.k->p)) goto again; *hint = k.k->p.offset; inode_u->bi_inum = k.k->p.offset; inode_u->bi_generation = bkey_generation(k); - return iter; + return 0; +} + +static int bch2_inode_delete_keys(struct btree_trans *trans, + subvol_inum inum, enum btree_id id) +{ + u64 offset = 0; + int ret = 0; + + while (!ret || ret == -EINTR) { + struct btree_iter iter; + struct bkey_s_c k; + struct bkey_i delete; + u32 snapshot; + + bch2_trans_begin(trans); + + ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); + if (ret) + continue; + + 
bch2_trans_iter_init(trans, &iter, id,
+				SPOS(inum.inum, offset, snapshot),
+				BTREE_ITER_INTENT);
+		k = bch2_btree_iter_peek(&iter);
+
+		if (!k.k || iter.pos.inode != inum.inum) {
+			bch2_trans_iter_exit(trans, &iter);
+			break;
+		}
+
+		ret = bkey_err(k);
+		if (ret)
+			goto err;
+
+		bkey_init(&delete.k);
+		delete.k.p = iter.pos;
+
+		if (btree_node_type_is_extents(iter.btree_id)) {
+			unsigned max_sectors =
+				min_t(u64, U64_MAX - iter.pos.offset,
+				      KEY_SIZE_MAX & (~0 << trans->c->block_bits));
+
+			/* create the biggest key we can */
+			bch2_key_resize(&delete.k, max_sectors);
+
+			ret = bch2_extent_trim_atomic(trans, &iter, &delete);
+			if (ret)
+				goto err;
+		}
+
+		ret = bch2_trans_update(trans, &iter, &delete, 0) ?:
+		      bch2_trans_commit(trans, NULL, NULL,
+					BTREE_INSERT_NOFAIL);
+err:
+		offset = iter.pos.offset;
+		bch2_trans_iter_exit(trans, &iter);
+	}
+
+	return ret;
 }
 
-int bch2_inode_rm(struct bch_fs *c, u64 inode_nr, bool cached)
+int bch2_inode_rm(struct bch_fs *c, subvol_inum inum, bool cached)
 {
 	struct btree_trans trans;
-	struct btree_iter *iter = NULL;
+	struct btree_iter iter = { NULL };
 	struct bkey_i_inode_generation delete;
-	struct bpos start = POS(inode_nr, 0);
-	struct bpos end = POS(inode_nr + 1, 0);
 	struct bch_inode_unpacked inode_u;
 	struct bkey_s_c k;
+	unsigned iter_flags = BTREE_ITER_INTENT;
+	u32 snapshot;
 	int ret;
 
+	if (0 && cached && c->opts.inodes_use_key_cache)
+		iter_flags |= BTREE_ITER_CACHED;
+
 	bch2_trans_init(&trans, c, 0, 1024);
 
 	/*
@@ -589,50 +674,48 @@ int bch2_inode_rm(struct bch_fs *c, u64 inode_nr, bool cached)
 	 * XXX: the dirent code could ideally delete whiteouts when they're no
 	 * longer needed
 	 */
-	ret = bch2_btree_delete_range_trans(&trans, BTREE_ID_extents,
-					    start, end, NULL) ?:
-	      bch2_btree_delete_range_trans(&trans, BTREE_ID_xattrs,
-					    start, end, NULL) ?:
-	      bch2_btree_delete_range_trans(&trans, BTREE_ID_dirents,
-					    start, end, NULL);
+	ret = bch2_inode_delete_keys(&trans, inum, BTREE_ID_extents) ?:
+	      bch2_inode_delete_keys(&trans, inum, BTREE_ID_xattrs) ?:
+	      bch2_inode_delete_keys(&trans, inum, BTREE_ID_dirents);
 	if (ret)
 		goto err;
 retry:
 	bch2_trans_begin(&trans);
 
-	if (cached) {
-		iter = bch2_trans_get_iter(&trans, BTREE_ID_inodes, POS(0, inode_nr),
-					   BTREE_ITER_CACHED|BTREE_ITER_INTENT);
-		k = bch2_btree_iter_peek_cached(iter);
-	} else {
-		iter = bch2_trans_get_iter(&trans, BTREE_ID_inodes, POS(0, inode_nr),
-					   BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
-		k = bch2_btree_iter_peek_slot(iter);
-	}
+	ret = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot);
+	if (ret)
+		goto err;
+
+	bch2_trans_iter_init(&trans, &iter, BTREE_ID_inodes,
+			     SPOS(0, inum.inum, snapshot), iter_flags);
+	k = bch2_btree_iter_peek_slot(&iter);
 
 	ret = bkey_err(k);
 	if (ret)
 		goto err;
 
-	if (k.k->type != KEY_TYPE_inode) {
+	if (!bkey_is_inode(k.k)) {
 		bch2_fs_inconsistent(trans.c,
 				     "inode %llu not found when deleting",
-				     inode_nr);
+				     inum.inum);
 		ret = -EIO;
 		goto err;
 	}
 
-	bch2_inode_unpack(bkey_s_c_to_inode(k), &inode_u);
+	bch2_inode_unpack(k, &inode_u);
+
+	/* Subvolume root? 
*/ + BUG_ON(inode_u.bi_subvol); bkey_inode_generation_init(&delete.k_i); - delete.k.p = iter->pos; + delete.k.p = iter.pos; delete.v.bi_generation = cpu_to_le32(inode_u.bi_generation + 1); - ret = bch2_trans_update(&trans, iter, &delete.k_i, 0) ?: + ret = bch2_trans_update(&trans, &iter, &delete.k_i, 0) ?: bch2_trans_commit(&trans, NULL, NULL, BTREE_INSERT_NOFAIL); err: - bch2_trans_iter_put(&trans, iter); + bch2_trans_iter_exit(&trans, &iter); if (ret == -EINTR) goto retry; @@ -640,21 +723,22 @@ err: return ret; } -static int bch2_inode_find_by_inum_trans(struct btree_trans *trans, u64 inode_nr, - struct bch_inode_unpacked *inode) +int bch2_inode_find_by_inum_trans(struct btree_trans *trans, + subvol_inum inum, + struct bch_inode_unpacked *inode) { - struct btree_iter *iter; + struct btree_iter iter; int ret; - iter = bch2_inode_peek(trans, inode, inode_nr, 0); - ret = PTR_ERR_OR_ZERO(iter); - bch2_trans_iter_put(trans, iter); + ret = bch2_inode_peek(trans, &iter, inode, inum, 0); + if (!ret) + bch2_trans_iter_exit(trans, &iter); return ret; } -int bch2_inode_find_by_inum(struct bch_fs *c, u64 inode_nr, +int bch2_inode_find_by_inum(struct bch_fs *c, subvol_inum inum, struct bch_inode_unpacked *inode) { return bch2_trans_do(c, NULL, NULL, 0, - bch2_inode_find_by_inum_trans(&trans, inode_nr, inode)); + bch2_inode_find_by_inum_trans(&trans, inum, inode)); } diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h index 2cb081ae44d9..723186d8afb6 100644 --- a/fs/bcachefs/inode.h +++ b/fs/bcachefs/inode.h @@ -7,6 +7,7 @@ extern const char * const bch2_inode_opts[]; const char *bch2_inode_invalid(const struct bch_fs *, struct bkey_s_c); +const char *bch2_inode_v2_invalid(const struct bch_fs *, struct bkey_s_c); void bch2_inode_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); #define bch2_bkey_ops_inode (struct bkey_ops) { \ @@ -14,6 +15,17 @@ void bch2_inode_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); .val_to_text = bch2_inode_to_text, \ } +#define bch2_bkey_ops_inode_v2 (struct bkey_ops) { \ + .key_invalid = bch2_inode_v2_invalid, \ + .val_to_text = bch2_inode_to_text, \ +} + +static inline bool bkey_is_inode(const struct bkey *k) +{ + return k->type == KEY_TYPE_inode || + k->type == KEY_TYPE_inode_v2; +} + const char *bch2_inode_generation_invalid(const struct bch_fs *, struct bkey_s_c); void bch2_inode_generation_to_text(struct printbuf *, struct bch_fs *, @@ -34,6 +46,7 @@ typedef u64 u96; struct bch_inode_unpacked { u64 bi_inum; + u64 bi_journal_seq; __le64 bi_hash_seed; u32 bi_flags; u16 bi_mode; @@ -44,7 +57,7 @@ struct bch_inode_unpacked { }; struct bkey_inode_buf { - struct bkey_i_inode inode; + struct bkey_i_inode_v2 inode; #define x(_name, _bits) + 8 + _bits / 8 u8 _pad[0 + BCH_INODE_FIELDS()]; @@ -53,10 +66,12 @@ struct bkey_inode_buf { void bch2_inode_pack(struct bch_fs *, struct bkey_inode_buf *, const struct bch_inode_unpacked *); -int bch2_inode_unpack(struct bkey_s_c_inode, struct bch_inode_unpacked *); +int bch2_inode_unpack(struct bkey_s_c, struct bch_inode_unpacked *); + +void bch2_inode_unpacked_to_text(struct printbuf *, struct bch_inode_unpacked *); -struct btree_iter *bch2_inode_peek(struct btree_trans *, - struct bch_inode_unpacked *, u64, unsigned); +int bch2_inode_peek(struct btree_trans *, struct btree_iter *, + struct bch_inode_unpacked *, subvol_inum, unsigned); int bch2_inode_write(struct btree_trans *, struct btree_iter *, struct bch_inode_unpacked *); @@ -69,12 +84,15 @@ void bch2_inode_init(struct bch_fs *, struct bch_inode_unpacked 
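Note that the deletion above does not leave an empty slot: it writes a KEY_TYPE_inode_generation key carrying bi_generation + 1, and bch2_inode_create() seeds new inodes from that key, so a stale handle embedding the old generation can be recognized after the inode number is reused. A toy model of that lifecycle, purely illustrative and not the on-disk representation:

    #include <assert.h>

    struct slot {                   /* one inode number's key position */
            int      live;          /* is an inode currently stored here? */
            unsigned generation;
    };

    static void inode_rm(struct slot *s)
    {
            assert(s->live);
            s->live = 0;
            s->generation += 1;     /* tombstone records old generation + 1 */
    }

    static unsigned inode_create(struct slot *s)
    {
            assert(!s->live);
            s->live = 1;
            return s->generation;   /* new inode starts past every old one */
    }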
*, uid_t, gid_t, umode_t, dev_t, struct bch_inode_unpacked *); -struct btree_iter *bch2_inode_create(struct btree_trans *, - struct bch_inode_unpacked *, u32, u64); +int bch2_inode_create(struct btree_trans *, struct btree_iter *, + struct bch_inode_unpacked *, u32, u64); -int bch2_inode_rm(struct bch_fs *, u64, bool); +int bch2_inode_rm(struct bch_fs *, subvol_inum, bool); -int bch2_inode_find_by_inum(struct bch_fs *, u64, struct bch_inode_unpacked *); +int bch2_inode_find_by_inum_trans(struct btree_trans *, subvol_inum, + struct bch_inode_unpacked *); +int bch2_inode_find_by_inum(struct bch_fs *, subvol_inum, + struct bch_inode_unpacked *); static inline struct bch_io_opts bch2_inode_opts_get(struct bch_inode_unpacked *inode) { @@ -131,6 +149,11 @@ static inline u8 mode_to_type(umode_t mode) return (mode >> 12) & 15; } +static inline u8 inode_d_type(struct bch_inode_unpacked *inode) +{ + return inode->bi_subvol ? DT_SUBVOL : mode_to_type(inode->bi_mode); +} + /* i_nlink: */ static inline unsigned nlink_bias(umode_t mode) diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index ddb85f9d474e..5a3c9eff1b50 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -27,6 +27,7 @@ #include "keylist.h" #include "move.h" #include "rebalance.h" +#include "subvolume.h" #include "super.h" #include "super-io.h" @@ -135,10 +136,10 @@ void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw) void bch2_bio_free_pages_pool(struct bch_fs *c, struct bio *bio) { + struct bvec_iter_all iter; struct bio_vec *bv; - unsigned i; - bio_for_each_segment_all(bv, bio, i) + bio_for_each_segment_all(bv, bio, iter) if (bv->bv_page != ZERO_PAGE(0)) mempool_free(bv->bv_page, &c->bio_bounce_pages); bio->bi_vcnt = 0; @@ -186,26 +187,24 @@ void bch2_bio_alloc_pages_pool(struct bch_fs *c, struct bio *bio, int bch2_sum_sector_overwrites(struct btree_trans *trans, struct btree_iter *extent_iter, struct bkey_i *new, - bool *maybe_extending, bool *usage_increasing, s64 *i_sectors_delta, s64 *disk_sectors_delta) { struct bch_fs *c = trans->c; - struct btree_iter *iter; + struct btree_iter iter; struct bkey_s_c old; unsigned new_replicas = bch2_bkey_replicas(c, bkey_i_to_s_c(new)); bool new_compressed = bch2_bkey_sectors_compressed(bkey_i_to_s_c(new)); int ret = 0; - *maybe_extending = true; *usage_increasing = false; *i_sectors_delta = 0; *disk_sectors_delta = 0; - iter = bch2_trans_copy_iter(trans, extent_iter); + bch2_trans_copy_iter(&iter, extent_iter); - for_each_btree_key_continue(iter, BTREE_ITER_SLOTS, old, ret) { + for_each_btree_key_continue_norestart(iter, BTREE_ITER_SLOTS, old, ret) { s64 sectors = min(new->k.p.offset, old.k->p.offset) - max(bkey_start_offset(&new->k), bkey_start_offset(old.k)); @@ -220,38 +219,21 @@ int bch2_sum_sector_overwrites(struct btree_trans *trans, : 0; if (!*usage_increasing && - (new_replicas > bch2_bkey_replicas(c, old) || + (new->k.p.snapshot != old.k->p.snapshot || + new_replicas > bch2_bkey_replicas(c, old) || (!new_compressed && bch2_bkey_sectors_compressed(old)))) *usage_increasing = true; - if (bkey_cmp(old.k->p, new->k.p) >= 0) { - /* - * Check if there's already data above where we're - * going to be writing to - this means we're definitely - * not extending the file: - * - * Note that it's not sufficient to check if there's - * data up to the sector offset we're going to be - * writing to, because i_size could be up to one block - * less: - */ - if (!bkey_cmp(old.k->p, new->k.p)) - old = bch2_btree_iter_next(iter); - - if (old.k && !bkey_err(old) && - old.k->p.inode == 
extent_iter->pos.inode && - bkey_extent_is_data(old.k)) - *maybe_extending = false; - + if (bkey_cmp(old.k->p, new->k.p) >= 0) break; - } } - bch2_trans_iter_put(trans, iter); + bch2_trans_iter_exit(trans, &iter); return ret; } int bch2_extent_update(struct btree_trans *trans, + subvol_inum inum, struct btree_iter *iter, struct bkey_i *k, struct disk_reservation *disk_res, @@ -260,214 +242,208 @@ int bch2_extent_update(struct btree_trans *trans, s64 *i_sectors_delta_total, bool check_enospc) { - /* this must live until after bch2_trans_commit(): */ - struct bkey_inode_buf inode_p; - bool extending = false, usage_increasing; + struct btree_iter inode_iter; + struct bch_inode_unpacked inode_u; + struct bpos next_pos; + bool usage_increasing; s64 i_sectors_delta = 0, disk_sectors_delta = 0; int ret; - ret = bch2_extent_trim_atomic(k, iter); + /* + * This traverses us the iterator without changing iter->path->pos to + * search_key() (which is pos + 1 for extents): we want there to be a + * path already traversed at iter->pos because + * bch2_trans_extent_update() will use it to attempt extent merging + */ + ret = __bch2_btree_iter_traverse(iter); + if (ret) + return ret; + + ret = bch2_extent_trim_atomic(trans, iter, k); if (ret) return ret; + new_i_size = min(k->k.p.offset << 9, new_i_size); + next_pos = k->k.p; + ret = bch2_sum_sector_overwrites(trans, iter, k, - &extending, &usage_increasing, &i_sectors_delta, &disk_sectors_delta); if (ret) return ret; - if (!usage_increasing) - check_enospc = false; - if (disk_res && disk_sectors_delta > (s64) disk_res->sectors) { ret = bch2_disk_reservation_add(trans->c, disk_res, disk_sectors_delta - disk_res->sectors, - !check_enospc + !check_enospc || !usage_increasing ? BCH_DISK_RESERVATION_NOFAIL : 0); if (ret) return ret; } - new_i_size = extending - ? min(k->k.p.offset << 9, new_i_size) - : 0; - - if (i_sectors_delta || new_i_size) { - struct btree_iter *inode_iter; - struct bch_inode_unpacked inode_u; - - inode_iter = bch2_inode_peek(trans, &inode_u, - k->k.p.inode, BTREE_ITER_INTENT); - ret = PTR_ERR_OR_ZERO(inode_iter); - if (ret) - return ret; - - /* - * XXX: - * writeback can race a bit with truncate, because truncate - * first updates the inode then truncates the pagecache. This is - * ugly, but lets us preserve the invariant that the in memory - * i_size is always >= the on disk i_size. 
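bch2_sum_sector_overwrites() accumulates the i_sectors delta by intersecting the new extent with each existing key it overlaps; per key, the contribution is min of the end positions minus max of the start positions, exactly the expression at the top of the loop. In isolation:

    #include <stdint.h>

    /* Sectors shared by [new_start, new_end) and [old_start, old_end). */
    static int64_t overlap_sectors(uint64_t new_start, uint64_t new_end,
                                   uint64_t old_start, uint64_t old_end)
    {
            uint64_t start = old_start > new_start ? old_start : new_start;
            uint64_t end   = old_end   < new_end   ? old_end   : new_end;

            return end > start ? (int64_t)(end - start) : 0;
    }

(The kernel loop can omit the final clamp because the iterator only yields keys that actually overlap the new extent; the guard here just keeps the sketch self-contained.)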
- * - BUG_ON(new_i_size > inode_u.bi_size && - (inode_u.bi_flags & BCH_INODE_I_SIZE_DIRTY)); - */ - BUG_ON(new_i_size > inode_u.bi_size && !extending); - - if (!(inode_u.bi_flags & BCH_INODE_I_SIZE_DIRTY) && - new_i_size > inode_u.bi_size) - inode_u.bi_size = new_i_size; - else - new_i_size = 0; - - inode_u.bi_sectors += i_sectors_delta; - - if (i_sectors_delta || new_i_size) { - bch2_inode_pack(trans->c, &inode_p, &inode_u); + ret = bch2_inode_peek(trans, &inode_iter, &inode_u, inum, + BTREE_ITER_INTENT); + if (ret) + return ret; - inode_p.inode.k.p.snapshot = iter->snapshot; + if (!(inode_u.bi_flags & BCH_INODE_I_SIZE_DIRTY) && + new_i_size > inode_u.bi_size) + inode_u.bi_size = new_i_size; - ret = bch2_trans_update(trans, inode_iter, - &inode_p.inode.k_i, 0); - } - - bch2_trans_iter_put(trans, inode_iter); - - if (ret) - return ret; - } + inode_u.bi_sectors += i_sectors_delta; ret = bch2_trans_update(trans, iter, k, 0) ?: + bch2_inode_write(trans, &inode_iter, &inode_u) ?: bch2_trans_commit(trans, disk_res, journal_seq, BTREE_INSERT_NOCHECK_RW| BTREE_INSERT_NOFAIL); - BUG_ON(ret == -ENOSPC); + bch2_trans_iter_exit(trans, &inode_iter); + if (ret) return ret; if (i_sectors_delta_total) *i_sectors_delta_total += i_sectors_delta; + bch2_btree_iter_set_pos(iter, next_pos); + return 0; } +/* + * Returns -EINTR if we had to drop locks: + */ int bch2_fpunch_at(struct btree_trans *trans, struct btree_iter *iter, - struct bpos end, u64 *journal_seq, + subvol_inum inum, u64 end, s64 *i_sectors_delta) { struct bch_fs *c = trans->c; unsigned max_sectors = KEY_SIZE_MAX & (~0 << c->block_bits); + struct bpos end_pos = POS(inum.inum, end); struct bkey_s_c k; int ret = 0, ret2 = 0; + u32 snapshot; - while ((k = bch2_btree_iter_peek(iter)).k && - bkey_cmp(iter->pos, end) < 0) { + while (!ret || ret == -EINTR) { struct disk_reservation disk_res = bch2_disk_reservation_init(c, 0); struct bkey_i delete; + if (ret) + ret2 = ret; + bch2_trans_begin(trans); + ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); + if (ret) + continue; + + bch2_btree_iter_set_snapshot(iter, snapshot); + + k = bch2_btree_iter_peek(iter); + if (bkey_cmp(iter->pos, end_pos) >= 0) { + bch2_btree_iter_set_pos(iter, end_pos); + break; + } + ret = bkey_err(k); if (ret) - goto btree_err; + continue; bkey_init(&delete.k); delete.k.p = iter->pos; /* create the biggest key we can */ bch2_key_resize(&delete.k, max_sectors); - bch2_cut_back(end, &delete); + bch2_cut_back(end_pos, &delete); - ret = bch2_extent_update(trans, iter, &delete, - &disk_res, journal_seq, + ret = bch2_extent_update(trans, inum, iter, &delete, + &disk_res, NULL, 0, i_sectors_delta, false); bch2_disk_reservation_put(c, &disk_res); -btree_err: - if (ret == -EINTR) { - ret2 = ret; - ret = 0; - } - if (ret) - break; - } - - if (bkey_cmp(iter->pos, end) > 0) { - bch2_btree_iter_set_pos(iter, end); - ret = bch2_btree_iter_traverse(iter); } return ret ?: ret2; } -int bch2_fpunch(struct bch_fs *c, u64 inum, u64 start, u64 end, - u64 *journal_seq, s64 *i_sectors_delta) +int bch2_fpunch(struct bch_fs *c, subvol_inum inum, u64 start, u64 end, + s64 *i_sectors_delta) { struct btree_trans trans; - struct btree_iter *iter; - int ret = 0; + struct btree_iter iter; + int ret; bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024); - iter = bch2_trans_get_iter(&trans, BTREE_ID_extents, - POS(inum, start), - BTREE_ITER_INTENT); + bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents, + POS(inum.inum, start), + BTREE_ITER_INTENT); - ret = bch2_fpunch_at(&trans, iter, POS(inum, 
end), - journal_seq, i_sectors_delta); + ret = bch2_fpunch_at(&trans, &iter, inum, end, i_sectors_delta); - bch2_trans_iter_put(&trans, iter); + bch2_trans_iter_exit(&trans, &iter); bch2_trans_exit(&trans); - if (ret == -EINTR) - ret = 0; - - return ret; + return ret == -EINTR ? 0 : ret; } int bch2_write_index_default(struct bch_write_op *op) { struct bch_fs *c = op->c; struct bkey_buf sk; + struct open_bucket *ec_ob = ec_open_bucket(c, &op->open_buckets); struct keylist *keys = &op->insert_keys; struct bkey_i *k = bch2_keylist_front(keys); struct btree_trans trans; - struct btree_iter *iter; + struct btree_iter iter; + subvol_inum inum = { + .subvol = op->subvol, + .inum = k->k.p.inode, + }; int ret; + BUG_ON(!inum.subvol); + bch2_bkey_buf_init(&sk); bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024); - iter = bch2_trans_get_iter(&trans, BTREE_ID_extents, - bkey_start_pos(&k->k), - BTREE_ITER_SLOTS|BTREE_ITER_INTENT); - do { bch2_trans_begin(&trans); k = bch2_keylist_front(keys); + bch2_bkey_buf_copy(&sk, c, k); - k->k.p.snapshot = iter->snapshot; + ret = bch2_subvolume_get_snapshot(&trans, inum.subvol, + &sk.k->k.p.snapshot); + if (ret == -EINTR) + continue; + if (ret) + break; - bch2_bkey_buf_realloc(&sk, c, k->k.u64s); - bkey_copy(sk.k, k); - bch2_cut_front(iter->pos, sk.k); + bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents, + bkey_start_pos(&sk.k->k), + BTREE_ITER_SLOTS|BTREE_ITER_INTENT); - ret = bch2_extent_update(&trans, iter, sk.k, + ret = bch2_extent_update(&trans, inum, &iter, sk.k, &op->res, op_journal_seq(op), op->new_i_size, &op->i_sectors_delta, op->flags & BCH_WRITE_CHECK_ENOSPC); + bch2_trans_iter_exit(&trans, &iter); + if (ret == -EINTR) continue; if (ret) break; - if (bkey_cmp(iter->pos, k->k.p) >= 0) - bch2_keylist_pop_front(keys); + if (ec_ob) + bch2_ob_add_backpointer(c, ec_ob, &sk.k->k); + + if (bkey_cmp(iter.pos, k->k.p) >= 0) + bch2_keylist_pop_front(&op->insert_keys); + else + bch2_cut_front(iter.pos, k); } while (!bch2_keylist_empty(keys)); - bch2_trans_iter_put(&trans, iter); bch2_trans_exit(&trans); bch2_bkey_buf_exit(&sk, c); @@ -741,6 +717,8 @@ static struct bio *bch2_write_bio_alloc(struct bch_fs *c, ? ((unsigned long) buf & (PAGE_SIZE - 1)) : 0), PAGE_SIZE); + pages = min(pages, BIO_MAX_VECS); + bio = bio_alloc_bioset(GFP_NOIO, pages, &c->bio_write); wbio = wbio_init(bio); wbio->put_bio = true; @@ -906,7 +884,6 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp, struct bio *src = &op->wbio.bio, *dst = src; struct bvec_iter saved_iter; void *ec_buf; - struct bpos ec_pos = op->pos; unsigned total_output = 0, total_input = 0; bool bounce = false; bool page_alloc_failed = false; @@ -923,7 +900,6 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp, ret = -EIO; goto err; case PREP_ENCODED_CHECKSUM_ERR: - BUG(); goto csum_err; case PREP_ENCODED_DO_WRITE: /* XXX look for bug here */ @@ -1077,9 +1053,6 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp, dst->bi_iter.bi_size = total_output; do_write: - /* might have done a realloc... 
*/ - bch2_ec_add_backpointer(c, wp, ec_pos, total_input >> 9); - *_dst = dst; return more; csum_err: @@ -1628,12 +1601,12 @@ static void bch2_rbio_done(struct bch_read_bio *rbio) } static void bch2_read_retry_nodecode(struct bch_fs *c, struct bch_read_bio *rbio, - struct bvec_iter bvec_iter, u64 inode, + struct bvec_iter bvec_iter, struct bch_io_failures *failed, unsigned flags) { struct btree_trans trans; - struct btree_iter *iter; + struct btree_iter iter; struct bkey_buf sk; struct bkey_s_c k; int ret; @@ -1644,12 +1617,12 @@ static void bch2_read_retry_nodecode(struct bch_fs *c, struct bch_read_bio *rbio bch2_bkey_buf_init(&sk); bch2_trans_init(&trans, c, 0, 0); - iter = bch2_trans_get_iter(&trans, rbio->data_btree, - rbio->read_pos, BTREE_ITER_SLOTS); + bch2_trans_iter_init(&trans, &iter, rbio->data_btree, + rbio->read_pos, BTREE_ITER_SLOTS); retry: rbio->bio.bi_status = 0; - k = bch2_btree_iter_peek_slot(iter); + k = bch2_btree_iter_peek_slot(&iter); if (bkey_err(k)) goto err; @@ -1676,7 +1649,7 @@ retry: goto err; out: bch2_rbio_done(rbio); - bch2_trans_iter_put(&trans, iter); + bch2_trans_iter_exit(&trans, &iter); bch2_trans_exit(&trans); bch2_bkey_buf_exit(&sk, c); return; @@ -1692,7 +1665,10 @@ static void bch2_rbio_retry(struct work_struct *work) struct bch_fs *c = rbio->c; struct bvec_iter iter = rbio->bvec_iter; unsigned flags = rbio->flags; - u64 inode = rbio->read_pos.inode; + subvol_inum inum = { + .subvol = rbio->subvol, + .inum = rbio->read_pos.inode, + }; struct bch_io_failures failed = { .nr = 0 }; trace_read_retry(&rbio->bio); @@ -1708,12 +1684,12 @@ static void bch2_rbio_retry(struct work_struct *work) flags &= ~BCH_READ_MAY_PROMOTE; if (flags & BCH_READ_NODECODE) { - bch2_read_retry_nodecode(c, rbio, iter, inode, &failed, flags); + bch2_read_retry_nodecode(c, rbio, iter, &failed, flags); } else { flags &= ~BCH_READ_LAST_FRAGMENT; flags |= BCH_READ_MUST_CLONE; - __bch2_read(c, rbio, iter, inode, &failed, flags); + __bch2_read(c, rbio, iter, inum, &failed, flags); } } @@ -1742,7 +1718,7 @@ static int __bch2_rbio_narrow_crcs(struct btree_trans *trans, struct bch_fs *c = rbio->c; u64 data_offset = rbio->data_pos.offset - rbio->pick.crc.offset; struct bch_extent_crc_unpacked new_crc; - struct btree_iter *iter = NULL; + struct btree_iter iter; struct bkey_i *new; struct bkey_s_c k; int ret = 0; @@ -1750,9 +1726,9 @@ static int __bch2_rbio_narrow_crcs(struct btree_trans *trans, if (crc_is_compressed(rbio->pick.crc)) return 0; - iter = bch2_trans_get_iter(trans, rbio->data_btree, rbio->data_pos, - BTREE_ITER_SLOTS|BTREE_ITER_INTENT); - k = bch2_btree_iter_peek_slot(iter); + bch2_trans_iter_init(trans, &iter, rbio->data_btree, rbio->data_pos, + BTREE_ITER_SLOTS|BTREE_ITER_INTENT); + k = bch2_btree_iter_peek_slot(&iter); if ((ret = bkey_err(k))) goto out; @@ -1787,9 +1763,10 @@ static int __bch2_rbio_narrow_crcs(struct btree_trans *trans, if (!bch2_bkey_narrow_crcs(new, new_crc)) goto out; - ret = bch2_trans_update(trans, iter, new, 0); + ret = bch2_trans_update(trans, &iter, new, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); out: - bch2_trans_iter_put(trans, iter); + bch2_trans_iter_exit(trans, &iter); return ret; } @@ -1811,8 +1788,11 @@ static void __bch2_read_endio(struct work_struct *work) struct bvec_iter dst_iter = rbio->bvec_iter; struct bch_extent_crc_unpacked crc = rbio->pick.crc; struct nonce nonce = extent_nonce(rbio->version, crc); + unsigned nofs_flags; struct bch_csum csum; + nofs_flags = memalloc_nofs_save(); + /* Reset iterator for checksumming and copying bounced 
data: */ if (rbio->bounce) { src->bi_iter.bi_size = crc.compressed_size << 9; @@ -1877,6 +1857,8 @@ nodecode: rbio = bch2_rbio_free(rbio); bch2_rbio_done(rbio); } +out: + memalloc_nofs_restore(nofs_flags); return; csum_err: /* @@ -1887,7 +1869,7 @@ csum_err: if (!rbio->bounce && (rbio->flags & BCH_READ_USER_MAPPED)) { rbio->flags |= BCH_READ_MUST_BOUNCE; bch2_rbio_error(rbio, READ_RETRY, BLK_STS_IOERR); - return; + goto out; } bch2_dev_inum_io_error(ca, rbio->read_pos.inode, (u64) rbio->bvec_iter.bi_sector, @@ -1895,12 +1877,12 @@ csum_err: rbio->pick.crc.csum.hi, rbio->pick.crc.csum.lo, csum.hi, csum.lo, crc.csum_type); bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR); - return; + goto out; decompression_err: bch_err_inum_ratelimited(c, rbio->read_pos.inode, "decompression error"); bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR); - return; + goto out; } static void bch2_read_endio(struct bio *bio) @@ -1955,7 +1937,7 @@ int __bch2_read_indirect_extent(struct btree_trans *trans, unsigned *offset_into_extent, struct bkey_buf *orig_k) { - struct btree_iter *iter; + struct btree_iter iter; struct bkey_s_c k; u64 reflink_offset; int ret; @@ -1963,10 +1945,10 @@ int __bch2_read_indirect_extent(struct btree_trans *trans, reflink_offset = le64_to_cpu(bkey_i_to_reflink_p(orig_k->k)->v.idx) + *offset_into_extent; - iter = bch2_trans_get_iter(trans, BTREE_ID_reflink, - POS(0, reflink_offset), - BTREE_ITER_SLOTS); - k = bch2_btree_iter_peek_slot(iter); + bch2_trans_iter_init(trans, &iter, BTREE_ID_reflink, + POS(0, reflink_offset), + BTREE_ITER_SLOTS); + k = bch2_btree_iter_peek_slot(&iter); ret = bkey_err(k); if (ret) goto err; @@ -1983,10 +1965,10 @@ int __bch2_read_indirect_extent(struct btree_trans *trans, goto err; } - *offset_into_extent = iter->pos.offset - bkey_start_offset(k.k); + *offset_into_extent = iter.pos.offset - bkey_start_offset(k.k); bch2_bkey_buf_reassemble(orig_k, trans->c, k); err: - bch2_trans_iter_put(trans, iter); + bch2_trans_iter_exit(trans, &iter); return ret; } @@ -2057,7 +2039,7 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, EBUG_ON(offset_into_extent + bvec_iter_sectors(iter) > k.k->size); if (crc_is_compressed(pick.crc) || - (pick.crc.csum_type != BCH_CSUM_NONE && + (pick.crc.csum_type != BCH_CSUM_none && (bvec_iter_sectors(iter) != pick.crc.uncompressed_size || (bch2_csum_type_is_encryption(pick.crc.csum_type) && (flags & BCH_READ_USER_MAPPED)) || @@ -2150,6 +2132,7 @@ get_bio: /* XXX: only initialize this if needed */ rbio->devs_have = bch2_bkey_devs(k); rbio->pick = pick; + rbio->subvol = orig->subvol; rbio->read_pos = read_pos; rbio->data_btree = data_btree; rbio->data_pos = data_pos; @@ -2252,13 +2235,14 @@ out_read_done: } void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, - struct bvec_iter bvec_iter, u64 inode, + struct bvec_iter bvec_iter, subvol_inum inum, struct bch_io_failures *failed, unsigned flags) { struct btree_trans trans; - struct btree_iter *iter; + struct btree_iter iter; struct bkey_buf sk; struct bkey_s_c k; + u32 snapshot; int ret; BUG_ON(flags & BCH_READ_NODECODE); @@ -2267,23 +2251,37 @@ void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, bch2_trans_init(&trans, c, 0, 0); retry: bch2_trans_begin(&trans); + iter = (struct btree_iter) { NULL }; + + ret = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot); + if (ret) + goto err; - iter = bch2_trans_get_iter(&trans, BTREE_ID_extents, - POS(inode, bvec_iter.bi_sector), - BTREE_ITER_SLOTS); + bch2_trans_iter_init(&trans, &iter, 
BTREE_ID_extents, + SPOS(inum.inum, bvec_iter.bi_sector, snapshot), + BTREE_ITER_SLOTS|BTREE_ITER_FILTER_SNAPSHOTS); while (1) { unsigned bytes, sectors, offset_into_extent; enum btree_id data_btree = BTREE_ID_extents; - bch2_btree_iter_set_pos(iter, - POS(inode, bvec_iter.bi_sector)); + /* + * read_extent -> io_time_reset may cause a transaction restart + * without returning an error, we need to check for that here: + */ + if (!bch2_trans_relock(&trans)) { + ret = -EINTR; + break; + } + + bch2_btree_iter_set_pos(&iter, + POS(inum.inum, bvec_iter.bi_sector)); - k = bch2_btree_iter_peek_slot(iter); + k = bch2_btree_iter_peek_slot(&iter); ret = bkey_err(k); if (ret) break; - offset_into_extent = iter->pos.offset - + offset_into_extent = iter.pos.offset - bkey_start_offset(k.k); sectors = k.k->size - offset_into_extent; @@ -2314,7 +2312,7 @@ retry: if (bvec_iter.bi_size == bytes) flags |= BCH_READ_LAST_FRAGMENT; - ret = __bch2_read_extent(&trans, rbio, bvec_iter, iter->pos, + ret = __bch2_read_extent(&trans, rbio, bvec_iter, iter.pos, data_btree, k, offset_into_extent, failed, flags); if (ret) @@ -2325,20 +2323,26 @@ retry: swap(bvec_iter.bi_size, bytes); bio_advance_iter(&rbio->bio, &bvec_iter, bytes); + + ret = btree_trans_too_many_iters(&trans); + if (ret) + break; } - bch2_trans_iter_put(&trans, iter); +err: + bch2_trans_iter_exit(&trans, &iter); if (ret == -EINTR || ret == READ_RETRY || ret == READ_RETRY_AVOID) goto retry; + bch2_trans_exit(&trans); + bch2_bkey_buf_exit(&sk, c); + if (ret) { - bch_err_inum_ratelimited(c, inode, + bch_err_inum_ratelimited(c, inum.inum, "read error %i from btree lookup", ret); rbio->bio.bi_status = BLK_STS_IOERR; bch2_rbio_done(rbio); } - bch2_trans_exit(&trans); - bch2_bkey_buf_exit(&sk, c); } void bch2_fs_io_exit(struct bch_fs *c) diff --git a/fs/bcachefs/io.h b/fs/bcachefs/io.h index bc0a0bd6f849..1aa422dccef7 100644 --- a/fs/bcachefs/io.h +++ b/fs/bcachefs/io.h @@ -48,12 +48,6 @@ static inline u64 *op_journal_seq(struct bch_write_op *op) ? 
op->journal_seq_p : &op->journal_seq; } -static inline void op_journal_seq_set(struct bch_write_op *op, u64 *journal_seq) -{ - op->journal_seq_p = journal_seq; - op->flags |= BCH_WRITE_JOURNAL_SEQ_PTR; -} - static inline struct workqueue_struct *index_update_wq(struct bch_write_op *op) { return op->alloc_reserve == RESERVE_MOVINGGC @@ -62,13 +56,14 @@ static inline struct workqueue_struct *index_update_wq(struct bch_write_op *op) } int bch2_sum_sector_overwrites(struct btree_trans *, struct btree_iter *, - struct bkey_i *, bool *, bool *, s64 *, s64 *); -int bch2_extent_update(struct btree_trans *, struct btree_iter *, - struct bkey_i *, struct disk_reservation *, - u64 *, u64, s64 *, bool); + struct bkey_i *, bool *, s64 *, s64 *); +int bch2_extent_update(struct btree_trans *, subvol_inum, + struct btree_iter *, struct bkey_i *, + struct disk_reservation *, u64 *, u64, s64 *, bool); + int bch2_fpunch_at(struct btree_trans *, struct btree_iter *, - struct bpos, u64 *, s64 *); -int bch2_fpunch(struct bch_fs *c, u64, u64, u64, u64 *, s64 *); + subvol_inum, u64, s64 *); +int bch2_fpunch(struct bch_fs *c, subvol_inum, u64, u64, s64 *); int bch2_write_index_default(struct bch_write_op *); @@ -90,6 +85,7 @@ static inline void bch2_write_op_init(struct bch_write_op *op, struct bch_fs *c, op->devs_have.nr = 0; op->target = 0; op->opts = opts; + op->subvol = 0; op->pos = POS_MAX; op->version = ZERO_VERSION; op->write_point = (struct write_point_specifier) { 0 }; @@ -157,10 +153,10 @@ static inline void bch2_read_extent(struct btree_trans *trans, } void __bch2_read(struct bch_fs *, struct bch_read_bio *, struct bvec_iter, - u64, struct bch_io_failures *, unsigned flags); + subvol_inum, struct bch_io_failures *, unsigned flags); static inline void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, - u64 inode) + subvol_inum inum) { struct bch_io_failures failed = { .nr = 0 }; @@ -168,8 +164,9 @@ static inline void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, rbio->c = c; rbio->start_time = local_clock(); + rbio->subvol = inum.subvol; - __bch2_read(c, rbio, rbio->bio.bi_iter, inode, &failed, + __bch2_read(c, rbio, rbio->bio.bi_iter, inum, &failed, BCH_READ_RETRY_IF_STALE| BCH_READ_MAY_PROMOTE| BCH_READ_USER_MAPPED); diff --git a/fs/bcachefs/io_types.h b/fs/bcachefs/io_types.h index e7aca7c9823a..78bff13d36f2 100644 --- a/fs/bcachefs/io_types.h +++ b/fs/bcachefs/io_types.h @@ -62,6 +62,7 @@ struct bch_read_bio { /* * pos we read from - different from data_pos for indirect extents: */ + u32 subvol; struct bpos read_pos; /* @@ -94,7 +95,8 @@ struct bch_write_bio { bounce:1, put_bio:1, have_ioref:1, - used_mempool:1; + used_mempool:1, + first_btree_write:1; struct bio bio; }; @@ -121,6 +123,7 @@ struct bch_write_op { u16 nonce; struct bch_io_opts opts; + u32 subvol; struct bpos pos; struct bversion version; diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index d714779a28d0..14bea8a2535e 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -88,8 +88,6 @@ static void bch2_journal_buf_init(struct journal *j) buf->must_flush = false; buf->separate_flush = false; - memset(buf->has_inode, 0, sizeof(buf->has_inode)); - memset(buf->data, 0, sizeof(*buf->data)); buf->data->seq = cpu_to_le64(journal_cur_seq(j)); buf->data->u64s = 0; @@ -109,7 +107,12 @@ void bch2_journal_halt(struct journal *j) } while ((v = atomic64_cmpxchg(&j->reservations.counter, old.v, new.v)) != old.v); - j->err_seq = journal_cur_seq(j); + /* + * XXX: we're not using j->lock here because this can be 
called from + * interrupt context, this can race with journal_write_done() + */ + if (!j->err_seq) + j->err_seq = journal_cur_seq(j); journal_wake(j); closure_wake_up(&journal_cur_buf(j)->wait); } @@ -335,55 +338,6 @@ static void journal_write_work(struct work_struct *work) journal_entry_close(j); } -/* - * Given an inode number, if that inode number has data in the journal that - * hasn't yet been flushed, return the journal sequence number that needs to be - * flushed: - */ -u64 bch2_inode_journal_seq(struct journal *j, u64 inode) -{ - size_t h = hash_64(inode, ilog2(sizeof(j->buf[0].has_inode) * 8)); - union journal_res_state s; - unsigned i; - u64 seq; - - - spin_lock(&j->lock); - seq = journal_cur_seq(j); - s = READ_ONCE(j->reservations); - i = s.idx; - - while (1) { - if (test_bit(h, j->buf[i].has_inode)) - goto out; - - if (i == s.unwritten_idx) - break; - - i = (i - 1) & JOURNAL_BUF_MASK; - seq--; - } - - seq = 0; -out: - spin_unlock(&j->lock); - - return seq; -} - -void bch2_journal_set_has_inum(struct journal *j, u64 inode, u64 seq) -{ - size_t h = hash_64(inode, ilog2(sizeof(j->buf[0].has_inode) * 8)); - struct journal_buf *buf; - - spin_lock(&j->lock); - - if ((buf = journal_seq_to_buf(j, seq))) - set_bit(h, buf->has_inode); - - spin_unlock(&j->lock); -} - static int __journal_res_get(struct journal *j, struct journal_res *res, unsigned flags) { @@ -602,7 +556,10 @@ int bch2_journal_flush_seq_async(struct journal *j, u64 seq, spin_lock(&j->lock); - BUG_ON(seq > journal_cur_seq(j)); + if (WARN_ONCE(seq > journal_cur_seq(j), + "requested to flush journal seq %llu, but currently at %llu", + seq, journal_cur_seq(j))) + goto out; /* Recheck under lock: */ if (j->err_seq && seq >= j->err_seq) { @@ -1071,7 +1028,7 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq, bch2_journal_space_available(j); spin_unlock(&j->lock); - return 0; + return bch2_journal_reclaim_start(j); } /* init/exit: */ diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h index 145c0edf9cf3..c39cbbf1bccd 100644 --- a/fs/bcachefs/journal.h +++ b/fs/bcachefs/journal.h @@ -141,7 +141,6 @@ static inline u64 journal_cur_seq(struct journal *j) return j->pin.back - 1; } -u64 bch2_inode_journal_seq(struct journal *, u64); void bch2_journal_set_has_inum(struct journal *, u64, u64); static inline int journal_state_count(union journal_res_state s, int idx) @@ -163,18 +162,6 @@ static inline void journal_state_inc(union journal_res_state *s) s->buf3_count += s->idx == 3; } -static inline void bch2_journal_set_has_inode(struct journal *j, - struct journal_res *res, - u64 inum) -{ - struct journal_buf *buf = &j->buf[res->idx]; - unsigned long bit = hash_64(inum, ilog2(sizeof(buf->has_inode) * 8)); - - /* avoid atomic op if possible */ - if (unlikely(!test_bit(bit, buf->has_inode))) - set_bit(bit, buf->has_inode); -} - /* * Amount of space that will be taken up by some keys in the journal (i.e. 
* including the jset header) @@ -291,7 +278,7 @@ static inline void bch2_journal_res_put(struct journal *j, if (!res->ref) return; - lock_release(&j->res_map, 0, _THIS_IP_); + lock_release(&j->res_map, _THIS_IP_); while (res->u64s) bch2_journal_add_entry(j, res, @@ -446,6 +433,7 @@ static inline int bch2_journal_preres_get_fast(struct journal *j, ret = 0; if ((flags & JOURNAL_RES_GET_RESERVED) || + test_bit(JOURNAL_NOCHANGES, &j->flags) || new.reserved + d < new.remaining) { new.reserved += d; ret = 1; diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index 2da6839fcdc0..5c8304e05abd 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -450,7 +450,7 @@ static int journal_entry_validate_dev_usage(struct bch_fs *c, struct jset_entry_dev_usage *u = container_of(entry, struct jset_entry_dev_usage, entry); unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64); - unsigned expected = sizeof(*u) + sizeof(u->d[0]) * 7; /* Current value of BCH_DATA_NR */ + unsigned expected = sizeof(*u); unsigned dev; int ret = 0; @@ -1259,14 +1259,15 @@ static void journal_write_done(struct closure *cl) if (seq >= j->pin.front) journal_seq_pin(j, seq)->devs = w->devs_written; - j->seq_ondisk = seq; - if (err && (!j->err_seq || seq < j->err_seq)) - j->err_seq = seq; + if (!err) { + j->seq_ondisk = seq; - if (!JSET_NO_FLUSH(w->data)) { - j->flushed_seq_ondisk = seq; - j->last_seq_ondisk = w->last_seq; - } + if (!JSET_NO_FLUSH(w->data)) { + j->flushed_seq_ondisk = seq; + j->last_seq_ondisk = w->last_seq; + } + } else if (!j->err_seq || seq < j->err_seq) + j->err_seq = seq; /* * Updating last_seq_ondisk may let bch2_journal_reclaim_work() discard @@ -1515,7 +1516,7 @@ retry_alloc: w->devs_written = bch2_bkey_devs(bkey_i_to_s_c(&w->key)); - if (c->opts.nochanges) + if (test_bit(JOURNAL_NOCHANGES, &j->flags)) goto no_io; for_each_rw_member(ca, c, i) diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c index 70f896717537..ca482c6743c3 100644 --- a/fs/bcachefs/journal_reclaim.c +++ b/fs/bcachefs/journal_reclaim.c @@ -11,7 +11,6 @@ #include <linux/kthread.h> #include <linux/sched/mm.h> -#include <linux/sched/task.h> #include <trace/events/bcachefs.h> /* Free space calculations: */ @@ -35,8 +34,10 @@ unsigned bch2_journal_dev_buckets_available(struct journal *j, struct journal_device *ja, enum journal_space_from from) { - unsigned available = (journal_space_from(ja, from) - - ja->cur_idx - 1 + ja->nr) % ja->nr; + unsigned available = !test_bit(JOURNAL_NOCHANGES, &j->flags) + ? 
((journal_space_from(ja, from) - + ja->cur_idx - 1 + ja->nr) % ja->nr) + : ja->nr; /* * Don't use the last bucket unless writing the new last_seq @@ -645,6 +646,9 @@ static int __bch2_journal_reclaim(struct journal *j, bool direct) if (fifo_free(&j->pin) <= 32) min_nr = 1; + if (atomic_read(&c->btree_cache.dirty) * 2 > c->btree_cache.used) + min_nr = 1; + trace_journal_reclaim_start(c, min_nr, j->prereserved.reserved, @@ -654,7 +658,7 @@ static int __bch2_journal_reclaim(struct journal *j, bool direct) atomic_long_read(&c->btree_key_cache.nr_dirty), atomic_long_read(&c->btree_key_cache.nr_keys)); - min_key_cache = min(bch2_nr_btree_keys_need_flush(c), 128UL); + min_key_cache = min(bch2_nr_btree_keys_need_flush(c), (size_t) 128); nr_flushed = journal_flush_pins(j, seq_to_flush, min_nr, min_key_cache); diff --git a/fs/bcachefs/journal_seq_blacklist.c b/fs/bcachefs/journal_seq_blacklist.c index f2060f903cbc..79bc0e49389b 100644 --- a/fs/bcachefs/journal_seq_blacklist.c +++ b/fs/bcachefs/journal_seq_blacklist.c @@ -250,19 +250,28 @@ void bch2_blacklist_entries_gc(struct work_struct *work) bch2_trans_init(&trans, c, 0, 0); for (i = 0; i < BTREE_ID_NR; i++) { - struct btree_iter *iter; + struct btree_iter iter; struct btree *b; - for_each_btree_node(&trans, iter, i, POS_MIN, - BTREE_ITER_PREFETCH, b) - if (test_bit(BCH_FS_STOPPING, &c->flags)) { - bch2_trans_exit(&trans); - return; - } - bch2_trans_iter_free(&trans, iter); + bch2_trans_node_iter_init(&trans, &iter, i, POS_MIN, + 0, 0, BTREE_ITER_PREFETCH); +retry: + bch2_trans_begin(&trans); + + b = bch2_btree_iter_peek_node(&iter); + + while (!(ret = PTR_ERR_OR_ZERO(b)) && + b && + !test_bit(BCH_FS_STOPPING, &c->flags)) + b = bch2_btree_iter_next_node(&iter); + + if (ret == -EINTR) + goto retry; + + bch2_trans_iter_exit(&trans, &iter); } - ret = bch2_trans_exit(&trans); + bch2_trans_exit(&trans); if (ret) return; diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h index 61674ae1ab5f..d484513289aa 100644 --- a/fs/bcachefs/journal_types.h +++ b/fs/bcachefs/journal_types.h @@ -34,8 +34,6 @@ struct journal_buf { bool noflush; /* write has already been kicked off, and was noflush */ bool must_flush; /* something wants a flush */ bool separate_flush; - /* bloom filter: */ - unsigned long has_inode[1024 / sizeof(unsigned long)]; }; /* @@ -154,6 +152,7 @@ enum { JOURNAL_NEED_WRITE, JOURNAL_MAY_GET_UNRESERVED, JOURNAL_MAY_SKIP_FLUSH, + JOURNAL_NOCHANGES, }; /* Embedded in struct bch_fs */ diff --git a/fs/bcachefs/migrate.c b/fs/bcachefs/migrate.c index 6ebe49ba2248..6defc33322b3 100644 --- a/fs/bcachefs/migrate.c +++ b/fs/bcachefs/migrate.c @@ -39,7 +39,7 @@ static int __bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags enum btree_id btree_id) { struct btree_trans trans; - struct btree_iter *iter; + struct btree_iter iter; struct bkey_s_c k; struct bkey_buf sk; int ret = 0; @@ -47,13 +47,15 @@ static int __bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags bch2_bkey_buf_init(&sk); bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); - iter = bch2_trans_get_iter(&trans, btree_id, POS_MIN, - BTREE_ITER_PREFETCH); + bch2_trans_iter_init(&trans, &iter, btree_id, POS_MIN, + BTREE_ITER_PREFETCH| + BTREE_ITER_ALL_SNAPSHOTS); - while ((k = bch2_btree_iter_peek(iter)).k && + while ((bch2_trans_begin(&trans), + (k = bch2_btree_iter_peek(&iter)).k) && !(ret = bkey_err(k))) { if (!bch2_bkey_has_device(k, dev_idx)) { - bch2_btree_iter_advance(iter); + bch2_btree_iter_advance(&iter); continue; } @@ -71,9 +73,18 @@ 
static int __bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags */ bch2_extent_normalize(c, bkey_i_to_s(sk.k)); - bch2_btree_iter_set_pos(iter, bkey_start_pos(&sk.k->k)); + /* + * Since we're not inserting through an extent iterator + * (BTREE_ITER_ALL_SNAPSHOTS iterators aren't extent iterators), + * we aren't using the extent overwrite path to delete, we're + * just using the normal key deletion path: + */ + if (bkey_deleted(&sk.k->k)) + sk.k->k.size = 0; - ret = bch2_trans_update(&trans, iter, sk.k, 0) ?: + ret = bch2_btree_iter_traverse(&iter) ?: + bch2_trans_update(&trans, &iter, sk.k, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?: bch2_trans_commit(&trans, NULL, NULL, BTREE_INSERT_NOFAIL); @@ -87,9 +98,9 @@ static int __bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags if (ret) break; } - bch2_trans_iter_put(&trans, iter); + bch2_trans_iter_exit(&trans, &iter); - ret = bch2_trans_exit(&trans) ?: ret; + bch2_trans_exit(&trans); bch2_bkey_buf_exit(&sk, c); BUG_ON(ret == -EINTR); @@ -106,7 +117,7 @@ static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags) static int bch2_dev_metadata_drop(struct bch_fs *c, unsigned dev_idx, int flags) { struct btree_trans trans; - struct btree_iter *iter; + struct btree_iter iter; struct closure cl; struct btree *b; struct bkey_buf k; @@ -122,12 +133,16 @@ static int bch2_dev_metadata_drop(struct bch_fs *c, unsigned dev_idx, int flags) closure_init_stack(&cl); for (id = 0; id < BTREE_ID_NR; id++) { - for_each_btree_node(&trans, iter, id, POS_MIN, - BTREE_ITER_PREFETCH, b) { + bch2_trans_node_iter_init(&trans, &iter, id, POS_MIN, 0, 0, + BTREE_ITER_PREFETCH); retry: + ret = 0; + while (bch2_trans_begin(&trans), + (b = bch2_btree_iter_peek_node(&iter)) && + !(ret = PTR_ERR_OR_ZERO(b))) { if (!bch2_bkey_has_device(bkey_i_to_s_c(&b->key), dev_idx)) - continue; + goto next; bch2_bkey_buf_copy(&k, c, &b->key); @@ -138,18 +153,23 @@ retry: break; } - ret = bch2_btree_node_update_key(c, iter, b, k.k); + ret = bch2_btree_node_update_key(&trans, &iter, b, k.k, false); if (ret == -EINTR) { - b = bch2_btree_iter_peek_node(iter); ret = 0; - goto retry; + continue; } + if (ret) { bch_err(c, "Error updating btree node key: %i", ret); break; } +next: + bch2_btree_iter_next_node(&iter); } - bch2_trans_iter_free(&trans, iter); + if (ret == -EINTR) + goto retry; + + bch2_trans_iter_exit(&trans, &iter); if (ret) goto err; @@ -161,7 +181,7 @@ retry: ret = 0; err: - ret = bch2_trans_exit(&trans) ?: ret; + bch2_trans_exit(&trans); bch2_bkey_buf_exit(&k, c); BUG_ON(ret == -EINTR); diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index abcc852731a6..64e39c10e34b 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -8,11 +8,13 @@ #include "btree_update_interior.h" #include "buckets.h" #include "disk_groups.h" +#include "ec.h" #include "inode.h" #include "io.h" #include "journal_reclaim.h" #include "move.h" #include "replicas.h" +#include "subvolume.h" #include "super-io.h" #include "keylist.h" @@ -53,13 +55,89 @@ struct moving_context { wait_queue_head_t wait; }; +static int insert_snapshot_whiteouts(struct btree_trans *trans, + enum btree_id id, + struct bpos old_pos, + struct bpos new_pos) +{ + struct bch_fs *c = trans->c; + struct btree_iter iter, update_iter; + struct bkey_s_c k; + struct snapshots_seen s; + int ret; + + if (!btree_type_has_snapshots(id)) + return 0; + + snapshots_seen_init(&s); + + if (!bkey_cmp(old_pos, new_pos)) + return 0; + + if (!snapshot_t(c, old_pos.snapshot)->children[0]) + return 0; + 
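+	/*
+	 * old_pos.snapshot has children, so descendant snapshots may have
+	 * their own keys at old_pos. Walk backwards over every snapshot
+	 * version at old_pos; for each descendant of old_pos.snapshot not
+	 * already covered (tracked in snapshots_seen), queue a whiteout at
+	 * new_pos, so the move doesn't make the extent newly visible there.
+	 */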
+ bch2_trans_iter_init(trans, &iter, id, old_pos, + BTREE_ITER_NOT_EXTENTS| + BTREE_ITER_ALL_SNAPSHOTS); + while (1) { +next: + k = bch2_btree_iter_prev(&iter); + ret = bkey_err(k); + if (ret) + break; + + if (bkey_cmp(old_pos, k.k->p)) + break; + + if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, old_pos.snapshot)) { + struct bkey_i *update; + size_t i; + + for (i = 0; i < s.nr; i++) + if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, s.d[i])) + goto next; + + update = bch2_trans_kmalloc(trans, sizeof(struct bkey_i)); + + ret = PTR_ERR_OR_ZERO(update); + if (ret) + break; + + bkey_init(&update->k); + update->k.p = new_pos; + update->k.p.snapshot = k.k->p.snapshot; + + bch2_trans_iter_init(trans, &update_iter, id, update->k.p, + BTREE_ITER_NOT_EXTENTS| + BTREE_ITER_ALL_SNAPSHOTS| + BTREE_ITER_INTENT); + ret = bch2_btree_iter_traverse(&update_iter) ?: + bch2_trans_update(trans, &update_iter, update, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); + bch2_trans_iter_exit(trans, &update_iter); + if (ret) + break; + + ret = snapshots_seen_add(c, &s, k.k->p.snapshot); + if (ret) + break; + } + } + bch2_trans_iter_exit(trans, &iter); + kfree(s.d); + + return ret; +} + static int bch2_migrate_index_update(struct bch_write_op *op) { struct bch_fs *c = op->c; struct btree_trans trans; - struct btree_iter *iter; + struct btree_iter iter; struct migrate_write *m = container_of(op, struct migrate_write, op); + struct open_bucket *ec_ob = ec_open_bucket(c, &op->open_buckets); struct keylist *keys = &op->insert_keys; struct bkey_buf _new, _insert; int ret = 0; @@ -70,9 +148,9 @@ static int bch2_migrate_index_update(struct bch_write_op *op) bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024); - iter = bch2_trans_get_iter(&trans, m->btree_id, - bkey_start_pos(&bch2_keylist_front(keys)->k), - BTREE_ITER_SLOTS|BTREE_ITER_INTENT); + bch2_trans_iter_init(&trans, &iter, m->btree_id, + bkey_start_pos(&bch2_keylist_front(keys)->k), + BTREE_ITER_SLOTS|BTREE_ITER_INTENT); while (1) { struct bkey_s_c k; @@ -80,13 +158,14 @@ static int bch2_migrate_index_update(struct bch_write_op *op) struct bkey_i_extent *new; const union bch_extent_entry *entry; struct extent_ptr_decoded p; + struct bpos next_pos; bool did_work = false; - bool extending = false, should_check_enospc; + bool should_check_enospc; s64 i_sectors_delta = 0, disk_sectors_delta = 0; - bch2_trans_reset(&trans, 0); + bch2_trans_begin(&trans); - k = bch2_btree_iter_peek_slot(iter); + k = bch2_btree_iter_peek_slot(&iter); ret = bkey_err(k); if (ret) goto err; @@ -102,9 +181,9 @@ static int bch2_migrate_index_update(struct bch_write_op *op) bch2_bkey_buf_copy(&_new, c, bch2_keylist_front(keys)); new = bkey_i_to_extent(_new.k); - bch2_cut_front(iter->pos, &new->k_i); + bch2_cut_front(iter.pos, &new->k_i); - bch2_cut_front(iter->pos, insert); + bch2_cut_front(iter.pos, insert); bch2_cut_back(new->k.p, insert); bch2_cut_back(insert->k.p, &new->k_i); @@ -119,7 +198,7 @@ static int bch2_migrate_index_update(struct bch_write_op *op) extent_for_each_ptr(extent_i_to_s(new), new_ptr) new_ptr->cached = true; - bch2_bkey_drop_ptr(bkey_i_to_s(insert), old_ptr); + __bch2_bkey_drop_ptr(bkey_i_to_s(insert), old_ptr); } extent_for_each_ptr_decode(extent_i_to_s(new), p, entry) { @@ -146,8 +225,7 @@ static int bch2_migrate_index_update(struct bch_write_op *op) op->opts.background_target, op->opts.data_replicas); - ret = bch2_sum_sector_overwrites(&trans, iter, insert, - &extending, + ret = bch2_sum_sector_overwrites(&trans, &iter, insert, &should_check_enospc, &i_sectors_delta, 
&disk_sectors_delta); @@ -163,20 +241,29 @@ static int bch2_migrate_index_update(struct bch_write_op *op) goto out; } - ret = bch2_trans_update(&trans, iter, insert, 0) ?: + next_pos = insert->k.p; + + ret = insert_snapshot_whiteouts(&trans, m->btree_id, + k.k->p, insert->k.p) ?: + bch2_trans_update(&trans, &iter, insert, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?: bch2_trans_commit(&trans, &op->res, op_journal_seq(op), BTREE_INSERT_NOFAIL| m->data_opts.btree_insert_flags); -err: - if (!ret) + if (!ret) { + bch2_btree_iter_set_pos(&iter, next_pos); atomic_long_inc(&c->extent_migrate_done); + if (ec_ob) + bch2_ob_add_backpointer(c, ec_ob, &insert->k); + } +err: if (ret == -EINTR) ret = 0; if (ret) break; next: - while (bkey_cmp(iter->pos, bch2_keylist_front(keys)->k.p) >= 0) { + while (bkey_cmp(iter.pos, bch2_keylist_front(keys)->k.p) >= 0) { bch2_keylist_pop_front(keys); if (bch2_keylist_empty(keys)) goto out; @@ -184,18 +271,18 @@ next: continue; nomatch: if (m->ctxt) { - BUG_ON(k.k->p.offset <= iter->pos.offset); + BUG_ON(k.k->p.offset <= iter.pos.offset); atomic64_inc(&m->ctxt->stats->keys_raced); - atomic64_add(k.k->p.offset - iter->pos.offset, + atomic64_add(k.k->p.offset - iter.pos.offset, &m->ctxt->stats->sectors_raced); } atomic_long_inc(&c->extent_migrate_raced); trace_move_race(&new->k); - bch2_btree_iter_next_slot(iter); + bch2_btree_iter_advance(&iter); goto next; } out: - bch2_trans_iter_put(&trans, iter); + bch2_trans_iter_exit(&trans, &iter); bch2_trans_exit(&trans); bch2_bkey_buf_exit(&_insert, c); bch2_bkey_buf_exit(&_new, c); @@ -216,11 +303,6 @@ void bch2_migrate_read_done(struct migrate_write *m, struct bch_read_bio *rbio) m->op.crc = rbio->pick.crc; m->op.wbio.bio.bi_iter.bi_size = m->op.crc.compressed_size << 9; - if (bch2_csum_type_is_encryption(m->op.crc.csum_type)) { - m->op.nonce = m->op.crc.nonce + m->op.crc.offset; - m->op.csum_type = m->op.crc.csum_type; - } - if (m->data_cmd == DATA_REWRITE) bch2_dev_list_drop_dev(&m->op.devs_have, m->data_opts.rewrite_dev); } @@ -235,6 +317,7 @@ int bch2_migrate_write_init(struct bch_fs *c, struct migrate_write *m, { struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); const union bch_extent_entry *entry; + struct bch_extent_crc_unpacked crc; struct extent_ptr_decoded p; int ret; @@ -255,6 +338,18 @@ int bch2_migrate_write_init(struct bch_fs *c, struct migrate_write *m, m->op.target = data_opts.target, m->op.write_point = wp; + /* + * op->csum_type is normally initialized from the fs/file's current + * options - but if an extent is encrypted, we require that it stays + * encrypted: + */ + bkey_for_each_crc(k.k, ptrs, crc, entry) + if (bch2_csum_type_is_encryption(crc.csum_type)) { + m->op.nonce = crc.nonce + crc.offset; + m->op.csum_type = crc.csum_type; + break; + } + if (m->data_opts.btree_insert_flags & BTREE_INSERT_USE_RESERVE) { m->op.alloc_reserve = RESERVE_MOVINGGC; m->op.flags |= BCH_WRITE_ALLOC_NOWAIT; @@ -328,12 +423,12 @@ static void move_free(struct closure *cl) { struct moving_io *io = container_of(cl, struct moving_io, cl); struct moving_context *ctxt = io->write.ctxt; + struct bvec_iter_all iter; struct bio_vec *bv; - unsigned i; bch2_disk_reservation_put(io->write.op.c, &io->write.op.res); - bio_for_each_segment_all(bv, &io->write.op.wbio.bio, i) + bio_for_each_segment_all(bv, &io->write.op.wbio.bio, iter) if (bv->bv_page) __free_page(bv->bv_page); @@ -511,13 +606,13 @@ err: static int lookup_inode(struct btree_trans *trans, struct bpos pos, struct bch_inode_unpacked *inode) { - struct btree_iter *iter; + struct 
btree_iter iter; struct bkey_s_c k; int ret; - iter = bch2_trans_get_iter(trans, BTREE_ID_inodes, pos, - BTREE_ITER_ALL_SNAPSHOTS); - k = bch2_btree_iter_peek(iter); + bch2_trans_iter_init(trans, &iter, BTREE_ID_inodes, pos, + BTREE_ITER_ALL_SNAPSHOTS); + k = bch2_btree_iter_peek(&iter); ret = bkey_err(k); if (ret) goto err; @@ -527,15 +622,15 @@ static int lookup_inode(struct btree_trans *trans, struct bpos pos, goto err; } - ret = k.k->type == KEY_TYPE_inode ? 0 : -EIO; + ret = bkey_is_inode(k.k) ? 0 : -EIO; if (ret) goto err; - ret = bch2_inode_unpack(bkey_s_c_to_inode(k), inode); + ret = bch2_inode_unpack(k, inode); if (ret) goto err; err: - bch2_trans_iter_put(trans, iter); + bch2_trans_iter_exit(trans, &iter); return ret; } @@ -553,7 +648,7 @@ static int __bch2_move_data(struct bch_fs *c, struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts); struct bkey_buf sk; struct btree_trans trans; - struct btree_iter *iter; + struct btree_iter iter; struct bkey_s_c k; struct data_opts data_opts; enum data_cmd data_cmd; @@ -567,8 +662,9 @@ static int __bch2_move_data(struct bch_fs *c, stats->btree_id = btree_id; stats->pos = start; - iter = bch2_trans_get_iter(&trans, btree_id, start, - BTREE_ITER_PREFETCH); + bch2_trans_iter_init(&trans, &iter, btree_id, start, + BTREE_ITER_PREFETCH| + BTREE_ITER_ALL_SNAPSHOTS); if (rate) bch2_ratelimit_reset(rate); @@ -597,9 +693,11 @@ static int __bch2_move_data(struct bch_fs *c, } } while (delay); - k = bch2_btree_iter_peek(iter); + bch2_trans_begin(&trans); + + k = bch2_btree_iter_peek(&iter); - stats->pos = iter->pos; + stats->pos = iter.pos; if (!k.k) break; @@ -652,8 +750,7 @@ static int __bch2_move_data(struct bch_fs *c, data_cmd, data_opts); if (ret2) { if (ret2 == -EINTR) { - bch2_trans_reset(&trans, 0); - bch2_trans_cond_resched(&trans); + bch2_trans_begin(&trans); continue; } @@ -673,18 +770,41 @@ next: atomic64_add(k.k->size * bch2_bkey_nr_ptrs_allocated(k), &stats->sectors_seen); next_nondata: - bch2_btree_iter_advance(iter); - bch2_trans_cond_resched(&trans); + bch2_btree_iter_advance(&iter); } out: - bch2_trans_iter_put(&trans, iter); - ret = bch2_trans_exit(&trans) ?: ret; + bch2_trans_iter_exit(&trans, &iter); + bch2_trans_exit(&trans); bch2_bkey_buf_exit(&sk, c); return ret; } +inline void bch_move_stats_init(struct bch_move_stats *stats, char *name) +{ + memset(stats, 0, sizeof(*stats)); + + scnprintf(stats->name, sizeof(stats->name), + "%s", name); +} + +static inline void progress_list_add(struct bch_fs *c, + struct bch_move_stats *stats) +{ + mutex_lock(&c->data_progress_lock); + list_add(&stats->list, &c->data_progress_list); + mutex_unlock(&c->data_progress_lock); +} + +static inline void progress_list_del(struct bch_fs *c, + struct bch_move_stats *stats) +{ + mutex_lock(&c->data_progress_lock); + list_del(&stats->list); + mutex_unlock(&c->data_progress_lock); +} + int bch2_move_data(struct bch_fs *c, enum btree_id start_btree_id, struct bpos start_pos, enum btree_id end_btree_id, struct bpos end_pos, @@ -697,6 +817,7 @@ int bch2_move_data(struct bch_fs *c, enum btree_id id; int ret; + progress_list_add(c, stats); closure_init_stack(&ctxt.cl); INIT_LIST_HEAD(&ctxt.reads); init_waitqueue_head(&ctxt.wait); @@ -730,6 +851,7 @@ int bch2_move_data(struct bch_fs *c, atomic64_read(&stats->sectors_moved), atomic64_read(&stats->keys_moved)); + progress_list_del(c, stats); return ret; } @@ -746,7 +868,7 @@ static int bch2_move_btree(struct bch_fs *c, bool kthread = (current->flags & PF_KTHREAD) != 0; struct bch_io_opts io_opts = 
bch2_opts_to_inode_opts(c->opts); struct btree_trans trans; - struct btree_iter *iter; + struct btree_iter iter; struct btree *b; enum btree_id id; struct data_opts data_opts; @@ -754,6 +876,7 @@ static int bch2_move_btree(struct bch_fs *c, int ret = 0; bch2_trans_init(&trans, c, 0, 0); + progress_list_add(c, stats); stats->data_type = BCH_DATA_btree; @@ -762,17 +885,21 @@ static int bch2_move_btree(struct bch_fs *c, id++) { stats->btree_id = id; - for_each_btree_node(&trans, iter, id, - id == start_btree_id ? start_pos : POS_MIN, - BTREE_ITER_PREFETCH, b) { + bch2_trans_node_iter_init(&trans, &iter, id, POS_MIN, 0, 0, + BTREE_ITER_PREFETCH); +retry: + ret = 0; + while (bch2_trans_begin(&trans), + (b = bch2_btree_iter_peek_node(&iter)) && + !(ret = PTR_ERR_OR_ZERO(b))) { if (kthread && kthread_should_stop()) break; if ((cmp_int(id, end_btree_id) ?: - bkey_cmp(b->key.k.p, end_pos)) > 0) + bpos_cmp(b->key.k.p, end_pos)) > 0) break; - stats->pos = iter->pos; + stats->pos = iter.pos; switch ((cmd = pred(c, arg, b, &io_opts, &data_opts))) { case DATA_SKIP: @@ -786,13 +913,19 @@ static int bch2_move_btree(struct bch_fs *c, BUG(); } - ret = bch2_btree_node_rewrite(c, iter, - b->data->keys.seq, 0) ?: ret; + ret = bch2_btree_node_rewrite(&trans, &iter, b, 0) ?: ret; + if (ret == -EINTR) + continue; + if (ret) + break; next: - bch2_trans_cond_resched(&trans); + bch2_btree_iter_next_node(&iter); } + if (ret == -EINTR) + goto retry; + + bch2_trans_iter_exit(&trans, &iter); - ret = bch2_trans_iter_free(&trans, iter) ?: ret; if (kthread && kthread_should_stop()) break; } @@ -802,6 +935,11 @@ next: if (ret) bch_err(c, "error %i in bch2_move_btree", ret); + /* flush relevant btree updates */ + closure_wait_event(&c->btree_interior_update_wait, + !bch2_btree_interior_updates_nr_pending(c)); + + progress_list_del(c, stats); return ret; } @@ -821,16 +959,9 @@ static enum data_cmd rereplicate_pred(struct bch_fs *c, void *arg, struct data_opts *data_opts) { unsigned nr_good = bch2_bkey_durability(c, k); - unsigned replicas = 0; - - switch (k.k->type) { - case KEY_TYPE_btree_ptr: - replicas = c->opts.metadata_replicas; - break; - case KEY_TYPE_extent: - replicas = io_opts->data_replicas; - break; - } + unsigned replicas = bkey_is_btree_ptr(k.k) + ? 
c->opts.metadata_replicas + : io_opts->data_replicas; if (!nr_good || nr_good >= replicas) return DATA_SKIP; @@ -921,7 +1052,7 @@ int bch2_scan_old_btree_nodes(struct bch_fs *c, struct bch_move_stats *stats) ret = bch2_move_btree(c, 0, POS_MIN, - BTREE_ID_NR, POS_MAX, + BTREE_ID_NR, SPOS_MAX, rewrite_old_nodes_pred, c, stats); if (!ret) { mutex_lock(&c->sb_lock); @@ -943,6 +1074,7 @@ int bch2_data_job(struct bch_fs *c, switch (op.op) { case BCH_DATA_OP_REREPLICATE: + bch_move_stats_init(stats, "rereplicate"); stats->data_type = BCH_DATA_journal; ret = bch2_journal_flush_device_pins(&c->journal, -1); @@ -950,10 +1082,6 @@ int bch2_data_job(struct bch_fs *c, op.start_btree, op.start_pos, op.end_btree, op.end_pos, rereplicate_btree_pred, c, stats) ?: ret; - - closure_wait_event(&c->btree_interior_update_wait, - !bch2_btree_interior_updates_nr_pending(c)); - ret = bch2_replicas_gc2(c) ?: ret; ret = bch2_move_data(c, @@ -967,6 +1095,7 @@ int bch2_data_job(struct bch_fs *c, if (op.migrate.dev >= c->sb.nr_devices) return -EINVAL; + bch_move_stats_init(stats, "migrate"); stats->data_type = BCH_DATA_journal; ret = bch2_journal_flush_device_pins(&c->journal, op.migrate.dev); @@ -984,6 +1113,7 @@ int bch2_data_job(struct bch_fs *c, ret = bch2_replicas_gc2(c) ?: ret; break; case BCH_DATA_OP_REWRITE_OLD_NODES: + bch_move_stats_init(stats, "rewrite_old_nodes"); ret = bch2_scan_old_btree_nodes(c, stats); break; default: diff --git a/fs/bcachefs/move.h b/fs/bcachefs/move.h index 5076153689d1..2a789a1158ca 100644 --- a/fs/bcachefs/move.h +++ b/fs/bcachefs/move.h @@ -66,4 +66,8 @@ int bch2_data_job(struct bch_fs *, struct bch_move_stats *, struct bch_ioctl_data); +inline void bch_move_stats_init(struct bch_move_stats *stats, + char *name); + + #endif /* _BCACHEFS_MOVE_H */ diff --git a/fs/bcachefs/move_types.h b/fs/bcachefs/move_types.h index fc0de165af9f..9df6d18137a5 100644 --- a/fs/bcachefs/move_types.h +++ b/fs/bcachefs/move_types.h @@ -6,6 +6,8 @@ struct bch_move_stats { enum bch_data_type data_type; enum btree_id btree_id; struct bpos pos; + struct list_head list; + char name[32]; atomic64_t keys_moved; atomic64_t keys_raced; diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c index 2acca0ddb6fd..5c9eafc026c9 100644 --- a/fs/bcachefs/movinggc.c +++ b/fs/bcachefs/movinggc.c @@ -85,6 +85,7 @@ static enum data_cmd copygc_pred(struct bch_fs *c, void *arg, BUG_ON(i != j); #endif if (i >= 0 && + p.ptr.dev == h->data[i].dev && p.ptr.offset < h->data[i].offset + ca->mi.bucket_size && p.ptr.gen == h->data[i].gen) { /* @@ -146,7 +147,8 @@ static int bch2_copygc(struct bch_fs *c) size_t b, heap_size = 0; int ret; - memset(&move_stats, 0, sizeof(move_stats)); + bch_move_stats_init(&move_stats, "copygc"); + /* * Find buckets with lowest sector counts, skipping completely * empty buckets, by building a maxheap sorted by sector count, diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c index 64bf5a382d63..a955ef2008c9 100644 --- a/fs/bcachefs/opts.c +++ b/fs/bcachefs/opts.c @@ -31,17 +31,32 @@ const char * const bch2_btree_ids[] = { NULL }; +const char * const bch2_csum_types[] = { + BCH_CSUM_TYPES() + NULL +}; + const char * const bch2_csum_opts[] = { BCH_CSUM_OPTS() NULL }; +const char * const bch2_compression_types[] = { + BCH_COMPRESSION_TYPES() + NULL +}; + const char * const bch2_compression_opts[] = { BCH_COMPRESSION_OPTS() NULL }; const char * const bch2_str_hash_types[] = { + BCH_STR_HASH_TYPES() + NULL +}; + +const char * const bch2_str_hash_opts[] = { BCH_STR_HASH_OPTS() NULL }; @@ -63,6 +78,19 
@@ const char * const bch2_member_states[] = { #undef x +const char * const bch2_d_types[BCH_DT_MAX] = { + [DT_UNKNOWN] = "unknown", + [DT_FIFO] = "fifo", + [DT_CHR] = "chr", + [DT_DIR] = "dir", + [DT_BLK] = "blk", + [DT_REG] = "reg", + [DT_LNK] = "lnk", + [DT_SOCK] = "sock", + [DT_WHT] = "whiteout", + [DT_SUBVOL] = "subvol", +}; + void bch2_opts_apply(struct bch_opts *dst, struct bch_opts src) { #define x(_name, ...) \ diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h index 1e2fc5de5ca4..afb1bb2a62d2 100644 --- a/fs/bcachefs/opts.h +++ b/fs/bcachefs/opts.h @@ -12,12 +12,21 @@ extern const char * const bch2_error_actions[]; extern const char * const bch2_sb_features[]; extern const char * const bch2_sb_compat[]; extern const char * const bch2_btree_ids[]; +extern const char * const bch2_csum_types[]; extern const char * const bch2_csum_opts[]; +extern const char * const bch2_compression_types[]; extern const char * const bch2_compression_opts[]; extern const char * const bch2_str_hash_types[]; +extern const char * const bch2_str_hash_opts[]; extern const char * const bch2_data_types[]; extern const char * const bch2_cache_replacement_policies[]; extern const char * const bch2_member_states[]; +extern const char * const bch2_d_types[]; + +static inline const char *bch2_d_type_str(unsigned d_type) +{ + return (d_type < BCH_DT_MAX ? bch2_d_types[d_type] : NULL) ?: "(bad d_type)"; +} /* * Mount options; we also store defaults in the superblock. @@ -134,7 +143,7 @@ enum opt_type { NULL, NULL) \ x(str_hash, u8, \ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ - OPT_STR(bch2_str_hash_types), \ + OPT_STR(bch2_str_hash_opts), \ BCH_SB_STR_HASH_TYPE, BCH_STR_HASH_OPT_siphash, \ NULL, "Hash function for directory entries and xattrs")\ x(metadata_target, u16, \ @@ -170,8 +179,18 @@ enum opt_type { x(shard_inode_numbers, u8, \ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ OPT_BOOL(), \ - BCH_SB_SHARD_INUMS, false, \ + BCH_SB_SHARD_INUMS, true, \ NULL, "Shard new inode numbers by CPU id") \ + x(inodes_use_key_cache, u8, \ + OPT_FORMAT|OPT_MOUNT, \ + OPT_BOOL(), \ + BCH_SB_INODES_USE_KEY_CACHE, true, \ + NULL, "Use the btree key cache for the inodes btree") \ + x(btree_node_mem_ptr_optimization, u8, \ + OPT_MOUNT|OPT_RUNTIME, \ + OPT_BOOL(), \ + NO_SB_OPT, true, \ + NULL, "Stash pointer to in memory btree node in btree ptr")\ x(gc_reserve_percent, u8, \ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ OPT_UINT(5, 21), \ diff --git a/fs/bcachefs/quota.c b/fs/bcachefs/quota.c index 35b409e0f366..8f8f4b0accd6 100644 --- a/fs/bcachefs/quota.c +++ b/fs/bcachefs/quota.c @@ -3,6 +3,7 @@ #include "btree_update.h" #include "inode.h" #include "quota.h" +#include "subvolume.h" #include "super-io.h" static const char *bch2_sb_validate_quota(struct bch_sb *sb, @@ -357,7 +358,7 @@ static int __bch2_quota_set(struct bch_fs *c, struct bkey_s_c k) static int bch2_quota_init_type(struct bch_fs *c, enum quota_types type) { struct btree_trans trans; - struct btree_iter *iter; + struct btree_iter iter; struct bkey_s_c k; int ret = 0; @@ -372,9 +373,10 @@ static int bch2_quota_init_type(struct bch_fs *c, enum quota_types type) if (ret) break; } - bch2_trans_iter_put(&trans, iter); + bch2_trans_iter_exit(&trans, &iter); - return bch2_trans_exit(&trans) ?: ret; + bch2_trans_exit(&trans); + return ret; } void bch2_fs_quota_exit(struct bch_fs *c) @@ -414,14 +416,55 @@ static void bch2_sb_quota_read(struct bch_fs *c) } } +static int bch2_fs_quota_read_inode(struct btree_trans *trans, + struct btree_iter *iter) +{ + struct bch_fs *c = trans->c; + struct 
bch_inode_unpacked u; + struct bch_subvolume subvolume; + struct bkey_s_c k; + int ret; + + k = bch2_btree_iter_peek(iter); + ret = bkey_err(k); + if (ret) + return ret; + + if (!k.k) + return 1; + + ret = bch2_snapshot_get_subvol(trans, k.k->p.snapshot, &subvolume); + if (ret) + return ret; + + /* + * We don't do quota accounting in snapshots: + */ + if (BCH_SUBVOLUME_SNAP(&subvolume)) + goto advance; + + if (!bkey_is_inode(k.k)) + goto advance; + + ret = bch2_inode_unpack(k, &u); + if (ret) + return ret; + + bch2_quota_acct(c, bch_qid(&u), Q_SPC, u.bi_sectors, + KEY_TYPE_QUOTA_NOCHECK); + bch2_quota_acct(c, bch_qid(&u), Q_INO, 1, + KEY_TYPE_QUOTA_NOCHECK); +advance: + bch2_btree_iter_set_pos(iter, POS(iter->pos.inode, iter->pos.offset + 1)); + return 0; +} + int bch2_fs_quota_read(struct bch_fs *c) { unsigned i, qtypes = enabled_qtypes(c); struct bch_memquota_type *q; struct btree_trans trans; - struct btree_iter *iter; - struct bch_inode_unpacked u; - struct bkey_s_c k; + struct btree_iter iter; int ret; mutex_lock(&c->sb_lock); @@ -436,23 +479,18 @@ int bch2_fs_quota_read(struct bch_fs *c) bch2_trans_init(&trans, c, 0, 0); - for_each_btree_key(&trans, iter, BTREE_ID_inodes, POS_MIN, - BTREE_ITER_PREFETCH, k, ret) { - switch (k.k->type) { - case KEY_TYPE_inode: - ret = bch2_inode_unpack(bkey_s_c_to_inode(k), &u); - if (ret) - return ret; - - bch2_quota_acct(c, bch_qid(&u), Q_SPC, u.bi_sectors, - KEY_TYPE_QUOTA_NOCHECK); - bch2_quota_acct(c, bch_qid(&u), Q_INO, 1, - KEY_TYPE_QUOTA_NOCHECK); - } - } - bch2_trans_iter_put(&trans, iter); - - return bch2_trans_exit(&trans) ?: ret; + bch2_trans_iter_init(&trans, &iter, BTREE_ID_inodes, POS_MIN, + BTREE_ITER_INTENT| + BTREE_ITER_PREFETCH| + BTREE_ITER_ALL_SNAPSHOTS); + do { + ret = lockrestart_do(&trans, + bch2_fs_quota_read_inode(&trans, &iter)); + } while (!ret); + bch2_trans_iter_exit(&trans, &iter); + + bch2_trans_exit(&trans); + return ret < 0 ? 
ret : 0; } /* Enable/disable/delete quotas for an entire filesystem: */ @@ -717,13 +755,13 @@ static int bch2_set_quota_trans(struct btree_trans *trans, struct bkey_i_quota *new_quota, struct qc_dqblk *qdq) { - struct btree_iter *iter; + struct btree_iter iter; struct bkey_s_c k; int ret; - iter = bch2_trans_get_iter(trans, BTREE_ID_quotas, new_quota->k.p, - BTREE_ITER_SLOTS|BTREE_ITER_INTENT); - k = bch2_btree_iter_peek_slot(iter); + bch2_trans_iter_init(trans, &iter, BTREE_ID_quotas, new_quota->k.p, + BTREE_ITER_SLOTS|BTREE_ITER_INTENT); + k = bch2_btree_iter_peek_slot(&iter); ret = bkey_err(k); if (unlikely(ret)) @@ -742,8 +780,8 @@ static int bch2_set_quota_trans(struct btree_trans *trans, if (qdq->d_fieldmask & QC_INO_HARD) new_quota->v.c[Q_INO].hardlimit = cpu_to_le64(qdq->d_ino_hardlimit); - ret = bch2_trans_update(trans, iter, &new_quota->k_i, 0); - bch2_trans_iter_put(trans, iter); + ret = bch2_trans_update(trans, &iter, &new_quota->k_i, 0); + bch2_trans_iter_exit(trans, &iter); return ret; } @@ -760,7 +798,7 @@ static int bch2_set_quota(struct super_block *sb, struct kqid qid, bkey_quota_init(&new_quota.k_i); new_quota.k.p = POS(qid.type, from_kqid(&init_user_ns, qid)); - ret = bch2_trans_do(c, NULL, NULL, BTREE_INSERT_NOUNLOCK, + ret = bch2_trans_do(c, NULL, NULL, 0, bch2_set_quota_trans(&trans, &new_quota, qdq)) ?: __bch2_quota_set(c, bkey_i_to_s_c(&new_quota.k_i)); diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c index a0dbf41d1d37..a573fede05b1 100644 --- a/fs/bcachefs/rebalance.c +++ b/fs/bcachefs/rebalance.c @@ -166,6 +166,7 @@ static int bch2_rebalance_thread(void *arg) struct bch_fs_rebalance *r = &c->rebalance; struct io_clock *clock = &c->io_clock[WRITE]; struct rebalance_work w, p; + struct bch_move_stats move_stats; unsigned long start, prev_start; unsigned long prev_run_time, prev_run_cputime; unsigned long cputime, prev_cputime; @@ -179,6 +180,7 @@ static int bch2_rebalance_thread(void *arg) prev_start = jiffies; prev_cputime = curr_cputime(); + bch_move_stats_init(&move_stats, "rebalance"); while (!kthread_wait_freezable(r->enabled)) { cond_resched(); @@ -235,7 +237,7 @@ static int bch2_rebalance_thread(void *arg) prev_cputime = cputime; r->state = REBALANCE_RUNNING; - memset(&r->move_stats, 0, sizeof(r->move_stats)); + memset(&move_stats, 0, sizeof(move_stats)); rebalance_work_reset(c); bch2_move_data(c, @@ -245,7 +247,7 @@ static int bch2_rebalance_thread(void *arg) NULL, /* &r->pd.rate, */ writepoint_ptr(&c->rebalance_write_point), rebalance_pred, NULL, - &r->move_stats); + &move_stats); } return 0; @@ -281,10 +283,7 @@ void bch2_rebalance_work_to_text(struct printbuf *out, struct bch_fs *c) h1); break; case REBALANCE_RUNNING: - pr_buf(out, "running\n" - "pos "); - bch2_bpos_to_text(out, r->move_stats.pos); - pr_buf(out, "\n"); + pr_buf(out, "running\n"); break; } } diff --git a/fs/bcachefs/rebalance_types.h b/fs/bcachefs/rebalance_types.h index 2f62a643c39f..7462a92e9598 100644 --- a/fs/bcachefs/rebalance_types.h +++ b/fs/bcachefs/rebalance_types.h @@ -19,7 +19,6 @@ struct bch_fs_rebalance { enum rebalance_state state; u64 throttled_until_iotime; unsigned long throttled_until_cputime; - struct bch_move_stats move_stats; unsigned enabled:1; }; diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 9bd6348842e0..c3b4d116275c 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -20,6 +20,7 @@ #include "quota.h" #include "recovery.h" #include "replicas.h" +#include "subvolume.h" #include "super-io.h" #include <linux/sort.h> 
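The conversions above (acl.c, io.c, quota.c, migrate.c, move.c) and the recovery.c changes below all follow one idiom: heap-allocated iterators (struct btree_iter *) become on-stack ones paired with bch2_trans_iter_exit(), and -EINTR from a lock restart is handled by rerunning the whole transaction from bch2_trans_begin(). A minimal sketch of that idiom, using only helpers that appear in this patch — the wrapper name example_lookup and its use of BTREE_ID_extents are illustrative, not part of the patch:

static int example_lookup(struct bch_fs *c, struct bpos pos)
{
	struct btree_trans trans;
	struct btree_iter iter;
	struct bkey_s_c k;
	int ret;

	bch2_trans_init(&trans, c, 0, 0);
retry:
	bch2_trans_begin(&trans);

	/* iterators are on-stack now: init here, exit before any restart */
	bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents, pos,
			     BTREE_ITER_SLOTS);
	k = bch2_btree_iter_peek_slot(&iter);
	ret = bkey_err(k);
	bch2_trans_iter_exit(&trans, &iter);

	if (ret == -EINTR)
		goto retry;	/* lock restart: rerun the whole transaction */

	bch2_trans_exit(&trans);
	return ret;
}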
@@ -39,6 +40,20 @@ static void drop_alloc_keys(struct journal_keys *keys) keys->nr = dst; } +/* + * Btree node pointers have a field to stack a pointer to the in memory btree + * node; we need to zero out this field when reading in btree nodes, or when + * reading in keys from the journal: + */ +static void zero_out_btree_mem_ptr(struct journal_keys *keys) +{ + struct journal_key *i; + + for (i = keys->d; i < keys->d + keys->nr; i++) + if (i->k->k.type == KEY_TYPE_btree_ptr_v2) + bkey_i_to_btree_ptr_v2(i->k)->v.mem_ptr = 0; +} + /* iterate over keys read from the journal: */ static int __journal_key_cmp(enum btree_id l_btree_id, @@ -312,7 +327,7 @@ static void btree_and_journal_iter_prefetch(struct bch_fs *c, struct btree *b, (k = bch2_btree_and_journal_iter_peek(&iter)).k) { bch2_bkey_buf_reassemble(&tmp, c, k); - bch2_btree_node_prefetch(c, NULL, tmp.k, + bch2_btree_node_prefetch(c, NULL, NULL, tmp.k, b->c.btree_id, b->c.level - 1); bch2_btree_and_journal_iter_advance(&iter); @@ -322,10 +337,11 @@ static void btree_and_journal_iter_prefetch(struct bch_fs *c, struct btree *b, bch2_bkey_buf_exit(&tmp, c); } -static int bch2_btree_and_journal_walk_recurse(struct bch_fs *c, struct btree *b, +static int bch2_btree_and_journal_walk_recurse(struct btree_trans *trans, struct btree *b, enum btree_id btree_id, btree_walk_key_fn key_fn) { + struct bch_fs *c = trans->c; struct btree_and_journal_iter iter; struct bkey_s_c k; struct bkey_buf tmp; @@ -349,11 +365,11 @@ static int bch2_btree_and_journal_walk_recurse(struct bch_fs *c, struct btree *b btree_and_journal_iter_prefetch(c, b, iter); - ret = bch2_btree_and_journal_walk_recurse(c, child, + ret = bch2_btree_and_journal_walk_recurse(trans, child, btree_id, key_fn); six_unlock_read(&child->c.lock); } else { - ret = key_fn(c, k); + ret = key_fn(trans, k); } if (ret) @@ -367,9 +383,10 @@ static int bch2_btree_and_journal_walk_recurse(struct bch_fs *c, struct btree *b return ret; } -int bch2_btree_and_journal_walk(struct bch_fs *c, enum btree_id btree_id, +int bch2_btree_and_journal_walk(struct btree_trans *trans, enum btree_id btree_id, btree_walk_key_fn key_fn) { + struct bch_fs *c = trans->c; struct btree *b = c->btree_roots[btree_id].b; int ret = 0; @@ -377,7 +394,7 @@ int bch2_btree_and_journal_walk(struct bch_fs *c, enum btree_id btree_id, return 0; six_lock_read(&b->c.lock, NULL, NULL); - ret = bch2_btree_and_journal_walk_recurse(c, b, btree_id, key_fn); + ret = bch2_btree_and_journal_walk_recurse(trans, b, btree_id, key_fn); six_unlock_read(&b->c.lock); return ret; @@ -501,64 +518,38 @@ static void replay_now_at(struct journal *j, u64 seq) } static int __bch2_journal_replay_key(struct btree_trans *trans, - enum btree_id id, unsigned level, - struct bkey_i *k) + struct journal_key *k) { - struct btree_iter *iter; + struct btree_iter iter; + unsigned iter_flags = + BTREE_ITER_INTENT| + BTREE_ITER_NOT_EXTENTS; int ret; - iter = bch2_trans_get_node_iter(trans, id, k->k.p, - BTREE_MAX_DEPTH, level, - BTREE_ITER_INTENT); + if (!k->level && k->btree_id == BTREE_ID_alloc) + iter_flags |= BTREE_ITER_CACHED|BTREE_ITER_CACHED_NOFILL; - /* - * iter->flags & BTREE_ITER_IS_EXTENTS triggers the update path to run - * extent_handle_overwrites() and extent_update_to_keys() - but we don't - * want that here, journal replay is supposed to treat extents like - * regular keys: - */ - BUG_ON(iter->flags & BTREE_ITER_IS_EXTENTS); - - ret = bch2_btree_iter_traverse(iter) ?: - bch2_trans_update(trans, iter, k, BTREE_TRIGGER_NORUN); - bch2_trans_iter_put(trans, iter); 
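+	/*
+	 * Keys are replayed at their original btree level; leaf alloc keys
+	 * additionally go through the btree key cache (iter_flags above),
+	 * which is what makes the separate __bch2_alloc_replay_key() path
+	 * removed below redundant.
+	 */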
+ bch2_trans_node_iter_init(trans, &iter, k->btree_id, k->k->k.p, + BTREE_MAX_DEPTH, k->level, + iter_flags); + ret = bch2_btree_iter_traverse(&iter) ?: + bch2_trans_update(trans, &iter, k->k, BTREE_TRIGGER_NORUN); + bch2_trans_iter_exit(trans, &iter); return ret; } static int bch2_journal_replay_key(struct bch_fs *c, struct journal_key *k) { - unsigned commit_flags = BTREE_INSERT_NOFAIL| - BTREE_INSERT_LAZY_RW; + unsigned commit_flags = + BTREE_INSERT_LAZY_RW| + BTREE_INSERT_NOFAIL| + BTREE_INSERT_JOURNAL_RESERVED; if (!k->allocated) commit_flags |= BTREE_INSERT_JOURNAL_REPLAY; return bch2_trans_do(c, NULL, NULL, commit_flags, - __bch2_journal_replay_key(&trans, k->btree_id, k->level, k->k)); -} - -static int __bch2_alloc_replay_key(struct btree_trans *trans, struct bkey_i *k) -{ - struct btree_iter *iter; - int ret; - - iter = bch2_trans_get_iter(trans, BTREE_ID_alloc, k->k.p, - BTREE_ITER_CACHED| - BTREE_ITER_CACHED_NOFILL| - BTREE_ITER_INTENT); - ret = bch2_trans_update(trans, iter, k, BTREE_TRIGGER_NORUN); - bch2_trans_iter_put(trans, iter); - return ret; -} - -static int bch2_alloc_replay_key(struct bch_fs *c, struct bkey_i *k) -{ - return bch2_trans_do(c, NULL, NULL, - BTREE_INSERT_NOFAIL| - BTREE_INSERT_USE_RESERVE| - BTREE_INSERT_LAZY_RW| - BTREE_INSERT_JOURNAL_REPLAY, - __bch2_alloc_replay_key(&trans, k)); + __bch2_journal_replay_key(&trans, k)); } static int journal_sort_seq_cmp(const void *_l, const void *_r) @@ -596,7 +587,7 @@ static int bch2_journal_replay(struct bch_fs *c, if (!i->level && i->btree_id == BTREE_ID_alloc) { j->replay_journal_seq = keys.journal_seq_base + i->journal_seq; - ret = bch2_alloc_replay_key(c, i->k); + ret = bch2_journal_replay_key(c, i); if (ret) goto err; } @@ -725,7 +716,7 @@ static int journal_replay_entry_early(struct bch_fs *c, ca->usage_base->buckets_ec = le64_to_cpu(u->buckets_ec); ca->usage_base->buckets_unavailable = le64_to_cpu(u->buckets_unavailable); - for (i = 0; i < nr_types; i++) { + for (i = 0; i < min_t(unsigned, nr_types, BCH_DATA_NR); i++) { ca->usage_base->d[i].buckets = le64_to_cpu(u->d[i].buckets); ca->usage_base->d[i].sectors = le64_to_cpu(u->d[i].sectors); ca->usage_base->d[i].fragmented = le64_to_cpu(u->d[i].fragmented); @@ -954,6 +945,74 @@ fsck_err: return ret; } +static int bch2_fs_initialize_subvolumes(struct bch_fs *c) +{ + struct bkey_i_snapshot root_snapshot; + struct bkey_i_subvolume root_volume; + int ret; + + bkey_snapshot_init(&root_snapshot.k_i); + root_snapshot.k.p.offset = U32_MAX; + root_snapshot.v.flags = 0; + root_snapshot.v.parent = 0; + root_snapshot.v.subvol = BCACHEFS_ROOT_SUBVOL; + root_snapshot.v.pad = 0; + SET_BCH_SNAPSHOT_SUBVOL(&root_snapshot.v, true); + + ret = bch2_btree_insert(c, BTREE_ID_snapshots, + &root_snapshot.k_i, + NULL, NULL, 0); + if (ret) + return ret; + + + bkey_subvolume_init(&root_volume.k_i); + root_volume.k.p.offset = BCACHEFS_ROOT_SUBVOL; + root_volume.v.flags = 0; + root_volume.v.snapshot = cpu_to_le32(U32_MAX); + root_volume.v.inode = cpu_to_le64(BCACHEFS_ROOT_INO); + + ret = bch2_btree_insert(c, BTREE_ID_subvolumes, + &root_volume.k_i, + NULL, NULL, 0); + if (ret) + return ret; + + return 0; +} + +static int bch2_fs_upgrade_for_subvolumes(struct btree_trans *trans) +{ + struct bch_fs *c = trans->c; + struct btree_iter iter; + struct bkey_s_c k; + struct bch_inode_unpacked inode; + int ret; + + bch2_trans_iter_init(trans, &iter, BTREE_ID_inodes, + SPOS(0, BCACHEFS_ROOT_INO, U32_MAX), 0); + k = bch2_btree_iter_peek_slot(&iter); + ret = bkey_err(k); + if (ret) + goto err; + + if 
(!bkey_is_inode(k.k)) { + bch_err(c, "root inode not found"); + ret = -ENOENT; + goto err; + } + + ret = bch2_inode_unpack(k, &inode); + BUG_ON(ret); + + inode.bi_subvol = BCACHEFS_ROOT_SUBVOL; + + ret = bch2_inode_write(trans, &iter, &inode); +err: + bch2_trans_iter_exit(trans, &iter); + return ret; +} + int bch2_fs_recovery(struct bch_fs *c) { const char *err = "cannot allocate memory"; @@ -972,6 +1031,8 @@ int bch2_fs_recovery(struct bch_fs *c) if (c->sb.clean) bch_info(c, "recovering from clean shutdown, journal seq %llu", le64_to_cpu(clean->journal_seq)); + else + bch_info(c, "recovering from unclean shutdown"); if (!(c->sb.features & (1ULL << BCH_FEATURE_new_extent_overwrite))) { bch_err(c, "feature new_extent_overwrite not set, filesystem no longer supported"); @@ -990,7 +1051,6 @@ int bch2_fs_recovery(struct bch_fs *c) bch_err(c, "filesystem may have incompatible bkey formats; run fsck from the compat branch to fix"); ret = -EINVAL; goto err; - } if (!(c->sb.features & (1ULL << BCH_FEATURE_alloc_v2))) { @@ -1005,11 +1065,20 @@ int bch2_fs_recovery(struct bch_fs *c) set_bit(BCH_FS_REBUILD_REPLICAS, &c->flags); } - if (c->sb.version < bcachefs_metadata_version_inode_backpointers) { - bch_info(c, "version prior to inode backpointers, upgrade and fsck required"); - c->opts.version_upgrade = true; - c->opts.fsck = true; - c->opts.fix_errors = FSCK_OPT_YES; + if (!c->opts.nochanges) { + if (c->sb.version < bcachefs_metadata_version_inode_backpointers) { + bch_info(c, "version prior to inode backpointers, upgrade and fsck required"); + c->opts.version_upgrade = true; + c->opts.fsck = true; + c->opts.fix_errors = FSCK_OPT_YES; + } else if (c->sb.version < bcachefs_metadata_version_subvol_dirent) { + bch_info(c, "filesystem version is prior to subvol_dirent - upgrading"); + c->opts.version_upgrade = true; + c->opts.fsck = true; + } else if (c->sb.version < bcachefs_metadata_version_inode_v2) { + bch_info(c, "filesystem version is prior to inode_v2 - upgrading"); + c->opts.version_upgrade = true; + } } ret = bch2_blacklist_table_initialize(c); @@ -1074,6 +1143,8 @@ use_clean: drop_alloc_keys(&c->journal_keys); } + zero_out_btree_mem_ptr(&c->journal_keys); + ret = journal_replay_early(c, clean, &c->journal_entries); if (ret) goto err; @@ -1176,6 +1247,31 @@ use_clean: bch_verbose(c, "alloc write done"); } + if (c->sb.version < bcachefs_metadata_version_snapshot_2) { + bch2_fs_lazy_rw(c); + + err = "error creating root snapshot node"; + ret = bch2_fs_initialize_subvolumes(c); + if (ret) + goto err; + } + + bch_verbose(c, "reading snapshots table"); + err = "error reading snapshots table"; + ret = bch2_fs_snapshots_start(c); + if (ret) + goto err; + bch_verbose(c, "reading snapshots done"); + + if (c->sb.version < bcachefs_metadata_version_snapshot_2) { + /* set bi_subvol on root inode */ + err = "error upgrade root inode for subvolumes"; + ret = bch2_trans_do(c, NULL, NULL, BTREE_INSERT_LAZY_RW, + bch2_fs_upgrade_for_subvolumes(&trans)); + if (ret) + goto err; + } + if (c->opts.fsck) { bch_info(c, "starting fsck"); err = "error in fsck"; @@ -1202,7 +1298,9 @@ use_clean: if (!(c->sb.compat & (1ULL << BCH_COMPAT_extents_above_btree_updates_done)) || !(c->sb.compat & (1ULL << BCH_COMPAT_bformat_overflow_done))) { - struct bch_move_stats stats = { 0 }; + struct bch_move_stats stats; + + bch_move_stats_init(&stats, "recovery"); bch_info(c, "scanning for old btree nodes"); ret = bch2_fs_read_write(c); @@ -1334,9 +1432,22 @@ int bch2_fs_initialize(struct bch_fs *c) } } + err = "error creating root 
snapshot node"; + ret = bch2_fs_initialize_subvolumes(c); + if (ret) + goto err; + + bch_verbose(c, "reading snapshots table"); + err = "error reading snapshots table"; + ret = bch2_fs_snapshots_start(c); + if (ret) + goto err; + bch_verbose(c, "reading snapshots done"); + bch2_inode_init(c, &root_inode, 0, 0, S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO, 0, NULL); - root_inode.bi_inum = BCACHEFS_ROOT_INO; + root_inode.bi_inum = BCACHEFS_ROOT_INO; + root_inode.bi_subvol = BCACHEFS_ROOT_SUBVOL; bch2_inode_pack(c, &packed_inode, &root_inode); packed_inode.inode.k.p.snapshot = U32_MAX; @@ -1351,11 +1462,12 @@ int bch2_fs_initialize(struct bch_fs *c) err = "error creating lost+found"; ret = bch2_trans_do(c, NULL, NULL, 0, - bch2_create_trans(&trans, BCACHEFS_ROOT_INO, + bch2_create_trans(&trans, + BCACHEFS_ROOT_SUBVOL_INUM, &root_inode, &lostfound_inode, &lostfound, 0, 0, S_IFDIR|0700, 0, - NULL, NULL)); + NULL, NULL, (subvol_inum) { 0 }, 0)); if (ret) { bch_err(c, "error creating lost+found"); goto err; @@ -1368,7 +1480,7 @@ int bch2_fs_initialize(struct bch_fs *c) } err = "error writing first journal entry"; - ret = bch2_journal_meta(&c->journal); + ret = bch2_journal_flush(&c->journal); if (ret) goto err; diff --git a/fs/bcachefs/recovery.h b/fs/bcachefs/recovery.h index e5565e4f335a..e45c70b3693f 100644 --- a/fs/bcachefs/recovery.h +++ b/fs/bcachefs/recovery.h @@ -45,9 +45,9 @@ void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *, struct bch_fs *, struct btree *); -typedef int (*btree_walk_key_fn)(struct bch_fs *c, struct bkey_s_c k); +typedef int (*btree_walk_key_fn)(struct btree_trans *, struct bkey_s_c); -int bch2_btree_and_journal_walk(struct bch_fs *, enum btree_id, btree_walk_key_fn); +int bch2_btree_and_journal_walk(struct btree_trans *, enum btree_id, btree_walk_key_fn); void bch2_journal_keys_free(struct journal_keys *); void bch2_journal_entries_free(struct list_head *); diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c index a420729288d4..8dcac7815c9f 100644 --- a/fs/bcachefs/reflink.c +++ b/fs/bcachefs/reflink.c @@ -7,6 +7,7 @@ #include "inode.h" #include "io.h" #include "reflink.h" +#include "subvolume.h" #include <linux/sched/signal.h> @@ -31,6 +32,10 @@ const char *bch2_reflink_p_invalid(const struct bch_fs *c, struct bkey_s_c k) if (bkey_val_bytes(p.k) != sizeof(*p.v)) return "incorrect value size"; + if (c->sb.version >= bcachefs_metadata_version_reflink_p_fix && + le64_to_cpu(p.v->idx) < le32_to_cpu(p.v->front_pad)) + return "idx < front_pad"; + return NULL; } @@ -39,27 +44,28 @@ void bch2_reflink_p_to_text(struct printbuf *out, struct bch_fs *c, { struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); - pr_buf(out, "idx %llu", le64_to_cpu(p.v->idx)); + pr_buf(out, "idx %llu front_pad %u back_pad %u", + le64_to_cpu(p.v->idx), + le32_to_cpu(p.v->front_pad), + le32_to_cpu(p.v->back_pad)); } -enum merge_result bch2_reflink_p_merge(struct bch_fs *c, - struct bkey_s _l, struct bkey_s _r) +bool bch2_reflink_p_merge(struct bch_fs *c, struct bkey_s _l, struct bkey_s_c _r) { struct bkey_s_reflink_p l = bkey_s_to_reflink_p(_l); - struct bkey_s_reflink_p r = bkey_s_to_reflink_p(_r); + struct bkey_s_c_reflink_p r = bkey_s_c_to_reflink_p(_r); - if (le64_to_cpu(l.v->idx) + l.k->size != le64_to_cpu(r.v->idx)) - return BCH_MERGE_NOMERGE; + /* + * Disabled for now, the triggers code needs to be reworked for merging + * of reflink pointers to work: + */ + return false; - if ((u64) l.k->size + r.k->size > KEY_SIZE_MAX) { - bch2_key_resize(l.k, KEY_SIZE_MAX); - 
bch2_cut_front_s(l.k->p, _r); - return BCH_MERGE_PARTIAL; - } + if (le64_to_cpu(l.v->idx) + l.k->size != le64_to_cpu(r.v->idx)) + return false; bch2_key_resize(l.k, l.k->size + r.k->size); - - return BCH_MERGE_MERGE; + return true; } /* indirect extents */ @@ -84,6 +90,14 @@ void bch2_reflink_v_to_text(struct printbuf *out, struct bch_fs *c, bch2_bkey_ptrs_to_text(out, c, k); } +bool bch2_reflink_v_merge(struct bch_fs *c, struct bkey_s _l, struct bkey_s_c _r) +{ + struct bkey_s_reflink_v l = bkey_s_to_reflink_v(_l); + struct bkey_s_c_reflink_v r = bkey_s_c_to_reflink_v(_r); + + return l.v->refcount == r.v->refcount && bch2_extent_merge(c, _l, _r); +} + /* indirect inline data */ const char *bch2_indirect_inline_data_invalid(const struct bch_fs *c, @@ -110,7 +124,7 @@ static int bch2_make_extent_indirect(struct btree_trans *trans, struct bkey_i *orig) { struct bch_fs *c = trans->c; - struct btree_iter *reflink_iter; + struct btree_iter reflink_iter = { NULL }; struct bkey_s_c k; struct bkey_i *r_v; struct bkey_i_reflink_p *r_p; @@ -120,11 +134,11 @@ static int bch2_make_extent_indirect(struct btree_trans *trans, if (orig->k.type == KEY_TYPE_inline_data) bch2_check_set_feature(c, BCH_FEATURE_reflink_inline_data); - for_each_btree_key(trans, reflink_iter, BTREE_ID_reflink, + for_each_btree_key_norestart(trans, reflink_iter, BTREE_ID_reflink, POS(0, c->reflink_hint), BTREE_ITER_INTENT|BTREE_ITER_SLOTS, k, ret) { - if (reflink_iter->pos.inode) { - bch2_btree_iter_set_pos(reflink_iter, POS_MIN); + if (reflink_iter.pos.inode) { + bch2_btree_iter_set_pos(&reflink_iter, POS_MIN); continue; } @@ -136,16 +150,16 @@ static int bch2_make_extent_indirect(struct btree_trans *trans, goto err; /* rewind iter to start of hole, if necessary: */ - bch2_btree_iter_set_pos(reflink_iter, bkey_start_pos(k.k)); + bch2_btree_iter_set_pos_to_extent_start(&reflink_iter); - r_v = bch2_trans_kmalloc(trans, sizeof(__le64) + bkey_val_bytes(&orig->k)); + r_v = bch2_trans_kmalloc(trans, sizeof(__le64) + bkey_bytes(&orig->k)); ret = PTR_ERR_OR_ZERO(r_v); if (ret) goto err; bkey_init(&r_v->k); r_v->k.type = bkey_type_to_indirect(&orig->k); - r_v->k.p = reflink_iter->pos; + r_v->k.p = reflink_iter.pos; bch2_key_resize(&r_v->k, orig->k.size); r_v->k.version = orig->k.version; @@ -155,26 +169,25 @@ static int bch2_make_extent_indirect(struct btree_trans *trans, *refcount = 0; memcpy(refcount + 1, &orig->v, bkey_val_bytes(&orig->k)); - ret = bch2_trans_update(trans, reflink_iter, r_v, 0); + ret = bch2_trans_update(trans, &reflink_iter, r_v, 0); if (ret) goto err; - r_p = bch2_trans_kmalloc(trans, sizeof(*r_p)); - if (IS_ERR(r_p)) { - ret = PTR_ERR(r_p); - goto err; - } - + /* + * orig is in a bkey_buf which statically allocates 5 64s for the val, + * so we know it will be big enough: + */ orig->k.type = KEY_TYPE_reflink_p; r_p = bkey_i_to_reflink_p(orig); set_bkey_val_bytes(&r_p->k, sizeof(r_p->v)); + memset(&r_p->v, 0, sizeof(r_p->v)); + r_p->v.idx = cpu_to_le64(bkey_start_offset(&r_v->k)); ret = bch2_trans_update(trans, extent_iter, &r_p->k_i, 0); err: - if (!IS_ERR(reflink_iter)) - c->reflink_hint = reflink_iter->pos.offset; - bch2_trans_iter_put(trans, reflink_iter); + c->reflink_hint = reflink_iter.pos.offset; + bch2_trans_iter_exit(trans, &reflink_iter); return ret; } @@ -184,7 +197,7 @@ static struct bkey_s_c get_next_src(struct btree_iter *iter, struct bpos end) struct bkey_s_c k; int ret; - for_each_btree_key_continue(iter, 0, k, ret) { + for_each_btree_key_continue_norestart(*iter, 0, k, ret) { if (bkey_cmp(iter->pos, 
end) >= 0) break; @@ -192,22 +205,27 @@ static struct bkey_s_c get_next_src(struct btree_iter *iter, struct bpos end) return k; } - bch2_btree_iter_set_pos(iter, end); - return bkey_s_c_null; + if (bkey_cmp(iter->pos, end) >= 0) + bch2_btree_iter_set_pos(iter, end); + return ret ? bkey_s_c_err(ret) : bkey_s_c_null; } s64 bch2_remap_range(struct bch_fs *c, - struct bpos dst_start, struct bpos src_start, - u64 remap_sectors, u64 *journal_seq, + subvol_inum dst_inum, u64 dst_offset, + subvol_inum src_inum, u64 src_offset, + u64 remap_sectors, u64 new_i_size, s64 *i_sectors_delta) { struct btree_trans trans; - struct btree_iter *dst_iter, *src_iter; + struct btree_iter dst_iter, src_iter; struct bkey_s_c src_k; struct bkey_buf new_dst, new_src; + struct bpos dst_start = POS(dst_inum.inum, dst_offset); + struct bpos src_start = POS(src_inum.inum, src_offset); struct bpos dst_end = dst_start, src_end = src_start; struct bpos src_want; u64 dst_done; + u32 dst_snapshot, src_snapshot; int ret = 0, ret2 = 0; if (!percpu_ref_tryget(&c->writes)) @@ -222,13 +240,13 @@ s64 bch2_remap_range(struct bch_fs *c, bch2_bkey_buf_init(&new_src); bch2_trans_init(&trans, c, BTREE_ITER_MAX, 4096); - src_iter = bch2_trans_get_iter(&trans, BTREE_ID_extents, src_start, - BTREE_ITER_INTENT); - dst_iter = bch2_trans_get_iter(&trans, BTREE_ID_extents, dst_start, - BTREE_ITER_INTENT); + bch2_trans_iter_init(&trans, &src_iter, BTREE_ID_extents, src_start, + BTREE_ITER_INTENT); + bch2_trans_iter_init(&trans, &dst_iter, BTREE_ID_extents, dst_start, + BTREE_ITER_INTENT); while ((ret == 0 || ret == -EINTR) && - bkey_cmp(dst_iter->pos, dst_end) < 0) { + bkey_cmp(dst_iter.pos, dst_end) < 0) { struct disk_reservation disk_res = { 0 }; bch2_trans_begin(&trans); @@ -238,31 +256,45 @@ s64 bch2_remap_range(struct bch_fs *c, break; } - dst_done = dst_iter->pos.offset - dst_start.offset; + ret = bch2_subvolume_get_snapshot(&trans, src_inum.subvol, + &src_snapshot); + if (ret) + continue; + + bch2_btree_iter_set_snapshot(&src_iter, src_snapshot); + + ret = bch2_subvolume_get_snapshot(&trans, dst_inum.subvol, + &dst_snapshot); + if (ret) + continue; + + bch2_btree_iter_set_snapshot(&dst_iter, dst_snapshot); + + dst_done = dst_iter.pos.offset - dst_start.offset; src_want = POS(src_start.inode, src_start.offset + dst_done); - bch2_btree_iter_set_pos(src_iter, src_want); + bch2_btree_iter_set_pos(&src_iter, src_want); - src_k = get_next_src(src_iter, src_end); + src_k = get_next_src(&src_iter, src_end); ret = bkey_err(src_k); if (ret) continue; - if (bkey_cmp(src_want, src_iter->pos) < 0) { - ret = bch2_fpunch_at(&trans, dst_iter, - bpos_min(dst_end, - POS(dst_iter->pos.inode, dst_iter->pos.offset + - src_iter->pos.offset - src_want.offset)), - journal_seq, i_sectors_delta); + if (bkey_cmp(src_want, src_iter.pos) < 0) { + ret = bch2_fpunch_at(&trans, &dst_iter, dst_inum, + min(dst_end.offset, + dst_iter.pos.offset + + src_iter.pos.offset - src_want.offset), + i_sectors_delta); continue; } if (src_k.k->type != KEY_TYPE_reflink_p) { + bch2_btree_iter_set_pos_to_extent_start(&src_iter); + bch2_bkey_buf_reassemble(&new_src, c, src_k); src_k = bkey_i_to_s_c(new_src.k); - bch2_btree_iter_set_pos(src_iter, bkey_start_pos(src_k.k)); - - ret = bch2_make_extent_indirect(&trans, src_iter, + ret = bch2_make_extent_indirect(&trans, &src_iter, new_src.k); if (ret) continue; @@ -285,46 +317,47 @@ s64 bch2_remap_range(struct bch_fs *c, BUG(); } - new_dst.k->k.p = dst_iter->pos; + new_dst.k->k.p = dst_iter.pos; bch2_key_resize(&new_dst.k->k, 
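
When the source file has a hole (the next source extent starts past src_want), bch2_remap_range() punches the corresponding range in the destination rather than copying anything. A worked example of the offset arithmetic above, with made-up sector numbers:

        #include <stdint.h>
        #include <stdio.h>

        static uint64_t min_u64(uint64_t a, uint64_t b) { return a < b ? a : b; }

        int main(void)
        {
                /* made-up sector offsets: */
                uint64_t dst_end  = 1000;  /* end of the remap in dst */
                uint64_t dst_pos  = 300;   /* dst_iter.pos.offset */
                uint64_t src_want = 550;   /* src offset matching dst_pos */
                uint64_t src_pos  = 700;   /* next src extent starts here */

                /* the hole [src_want, src_pos) maps to this span in dst: */
                uint64_t punch_end = min_u64(dst_end,
                                             dst_pos + (src_pos - src_want));

                printf("punch dst [%llu, %llu)\n",
                       (unsigned long long) dst_pos,
                       (unsigned long long) punch_end);  /* [300, 450) */
                return 0;
        }
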
min(src_k.k->p.offset - src_want.offset, - dst_end.offset - dst_iter->pos.offset)); - ret = bch2_extent_update(&trans, dst_iter, new_dst.k, - &disk_res, journal_seq, + dst_end.offset - dst_iter.pos.offset)); + + ret = bch2_extent_update(&trans, dst_inum, &dst_iter, + new_dst.k, &disk_res, NULL, new_i_size, i_sectors_delta, true); bch2_disk_reservation_put(c, &disk_res); } - bch2_trans_iter_put(&trans, dst_iter); - bch2_trans_iter_put(&trans, src_iter); + bch2_trans_iter_exit(&trans, &dst_iter); + bch2_trans_iter_exit(&trans, &src_iter); - BUG_ON(!ret && bkey_cmp(dst_iter->pos, dst_end)); - BUG_ON(bkey_cmp(dst_iter->pos, dst_end) > 0); + BUG_ON(!ret && bkey_cmp(dst_iter.pos, dst_end)); + BUG_ON(bkey_cmp(dst_iter.pos, dst_end) > 0); - dst_done = dst_iter->pos.offset - dst_start.offset; - new_i_size = min(dst_iter->pos.offset << 9, new_i_size); - - bch2_trans_begin(&trans); + dst_done = dst_iter.pos.offset - dst_start.offset; + new_i_size = min(dst_iter.pos.offset << 9, new_i_size); do { struct bch_inode_unpacked inode_u; - struct btree_iter *inode_iter; + struct btree_iter inode_iter = { NULL }; + + bch2_trans_begin(&trans); - inode_iter = bch2_inode_peek(&trans, &inode_u, - dst_start.inode, BTREE_ITER_INTENT); - ret2 = PTR_ERR_OR_ZERO(inode_iter); + ret2 = bch2_inode_peek(&trans, &inode_iter, &inode_u, + dst_inum, BTREE_ITER_INTENT); if (!ret2 && inode_u.bi_size < new_i_size) { inode_u.bi_size = new_i_size; - ret2 = bch2_inode_write(&trans, inode_iter, &inode_u) ?: - bch2_trans_commit(&trans, NULL, journal_seq, 0); + ret2 = bch2_inode_write(&trans, &inode_iter, &inode_u) ?: + bch2_trans_commit(&trans, NULL, NULL, + BTREE_INSERT_NOFAIL); } - bch2_trans_iter_put(&trans, inode_iter); + bch2_trans_iter_exit(&trans, &inode_iter); } while (ret2 == -EINTR); - ret = bch2_trans_exit(&trans) ?: ret; + bch2_trans_exit(&trans); bch2_bkey_buf_exit(&new_src, c); bch2_bkey_buf_exit(&new_dst, c); diff --git a/fs/bcachefs/reflink.h b/fs/bcachefs/reflink.h index bfc785619ee8..3745873fd88d 100644 --- a/fs/bcachefs/reflink.h +++ b/fs/bcachefs/reflink.h @@ -5,8 +5,7 @@ const char *bch2_reflink_p_invalid(const struct bch_fs *, struct bkey_s_c); void bch2_reflink_p_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); -enum merge_result bch2_reflink_p_merge(struct bch_fs *, - struct bkey_s, struct bkey_s); +bool bch2_reflink_p_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c); #define bch2_bkey_ops_reflink_p (struct bkey_ops) { \ .key_invalid = bch2_reflink_p_invalid, \ @@ -58,7 +57,7 @@ static inline __le64 *bkey_refcount(struct bkey_i *k) } } -s64 bch2_remap_range(struct bch_fs *, struct bpos, struct bpos, - u64, u64 *, u64, s64 *); +s64 bch2_remap_range(struct bch_fs *, subvol_inum, u64, + subvol_inum, u64, u64, u64, s64 *); #endif /* _BCACHEFS_REFLINK_H */ diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c index dbbbcc6dcec6..002006593044 100644 --- a/fs/bcachefs/replicas.c +++ b/fs/bcachefs/replicas.c @@ -1010,6 +1010,9 @@ bool bch2_have_enough_devs(struct bch_fs *c, struct bch_devs_mask devs, unsigned i, nr_online = 0, nr_failed = 0, dflags = 0; bool metadata = e->data_type < BCH_DATA_user; + if (e->data_type == BCH_DATA_cached) + continue; + for (i = 0; i < e->nr_devs; i++) { struct bch_dev *ca = bch_dev_bkey_exists(c, e->devs[i]); diff --git a/fs/bcachefs/str_hash.h b/fs/bcachefs/str_hash.h index b8492b51a262..57d636740d2f 100644 --- a/fs/bcachefs/str_hash.h +++ b/fs/bcachefs/str_hash.h @@ -8,24 +8,25 @@ #include "error.h" #include "inode.h" #include "siphash.h" +#include 
"subvolume.h" #include "super.h" #include <linux/crc32c.h> #include <crypto/hash.h> -#include <crypto/sha.h> +#include <crypto/sha2.h> static inline enum bch_str_hash_type bch2_str_hash_opt_to_type(struct bch_fs *c, enum bch_str_hash_opts opt) { switch (opt) { case BCH_STR_HASH_OPT_crc32c: - return BCH_STR_HASH_CRC32C; + return BCH_STR_HASH_crc32c; case BCH_STR_HASH_OPT_crc64: - return BCH_STR_HASH_CRC64; + return BCH_STR_HASH_crc64; case BCH_STR_HASH_OPT_siphash: return c->sb.features & (1ULL << BCH_FEATURE_new_siphash) - ? BCH_STR_HASH_SIPHASH - : BCH_STR_HASH_SIPHASH_OLD; + ? BCH_STR_HASH_siphash + : BCH_STR_HASH_siphash_old; default: BUG(); } @@ -50,7 +51,7 @@ bch2_hash_info_init(struct bch_fs *c, const struct bch_inode_unpacked *bi) .siphash_key = { .k0 = bi->bi_hash_seed } }; - if (unlikely(info.type == BCH_STR_HASH_SIPHASH_OLD)) { + if (unlikely(info.type == BCH_STR_HASH_siphash_old)) { SHASH_DESC_ON_STACK(desc, c->sha256); u8 digest[SHA256_DIGEST_SIZE]; @@ -76,16 +77,16 @@ static inline void bch2_str_hash_init(struct bch_str_hash_ctx *ctx, const struct bch_hash_info *info) { switch (info->type) { - case BCH_STR_HASH_CRC32C: + case BCH_STR_HASH_crc32c: ctx->crc32c = crc32c(~0, &info->siphash_key.k0, sizeof(info->siphash_key.k0)); break; - case BCH_STR_HASH_CRC64: + case BCH_STR_HASH_crc64: ctx->crc64 = crc64_be(~0, &info->siphash_key.k0, sizeof(info->siphash_key.k0)); break; - case BCH_STR_HASH_SIPHASH_OLD: - case BCH_STR_HASH_SIPHASH: + case BCH_STR_HASH_siphash_old: + case BCH_STR_HASH_siphash: SipHash24_Init(&ctx->siphash, &info->siphash_key); break; default: @@ -98,14 +99,14 @@ static inline void bch2_str_hash_update(struct bch_str_hash_ctx *ctx, const void *data, size_t len) { switch (info->type) { - case BCH_STR_HASH_CRC32C: + case BCH_STR_HASH_crc32c: ctx->crc32c = crc32c(ctx->crc32c, data, len); break; - case BCH_STR_HASH_CRC64: + case BCH_STR_HASH_crc64: ctx->crc64 = crc64_be(ctx->crc64, data, len); break; - case BCH_STR_HASH_SIPHASH_OLD: - case BCH_STR_HASH_SIPHASH: + case BCH_STR_HASH_siphash_old: + case BCH_STR_HASH_siphash: SipHash24_Update(&ctx->siphash, data, len); break; default: @@ -117,12 +118,12 @@ static inline u64 bch2_str_hash_end(struct bch_str_hash_ctx *ctx, const struct bch_hash_info *info) { switch (info->type) { - case BCH_STR_HASH_CRC32C: + case BCH_STR_HASH_crc32c: return ctx->crc32c; - case BCH_STR_HASH_CRC64: + case BCH_STR_HASH_crc64: return ctx->crc64 >> 1; - case BCH_STR_HASH_SIPHASH_OLD: - case BCH_STR_HASH_SIPHASH: + case BCH_STR_HASH_siphash_old: + case BCH_STR_HASH_siphash: return SipHash24_End(&ctx->siphash) >> 1; default: BUG(); @@ -137,28 +138,40 @@ struct bch_hash_desc { u64 (*hash_bkey)(const struct bch_hash_info *, struct bkey_s_c); bool (*cmp_key)(struct bkey_s_c, const void *); bool (*cmp_bkey)(struct bkey_s_c, struct bkey_s_c); + bool (*is_visible)(subvol_inum inum, struct bkey_s_c); }; -static __always_inline struct btree_iter * +static inline bool is_visible_key(struct bch_hash_desc desc, subvol_inum inum, struct bkey_s_c k) +{ + return k.k->type == desc.key_type && + (!desc.is_visible || desc.is_visible(inum, k)); +} + +static __always_inline int bch2_hash_lookup(struct btree_trans *trans, + struct btree_iter *iter, const struct bch_hash_desc desc, const struct bch_hash_info *info, - u64 inode, const void *key, + subvol_inum inum, const void *key, unsigned flags) { - struct btree_iter *iter; struct bkey_s_c k; + u32 snapshot; int ret; - for_each_btree_key(trans, iter, desc.btree_id, - POS(inode, desc.hash_key(info, key)), + ret = 
bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); + if (ret) + return ret; + + for_each_btree_key_norestart(trans, *iter, desc.btree_id, + SPOS(inum.inum, desc.hash_key(info, key), snapshot), BTREE_ITER_SLOTS|flags, k, ret) { - if (iter->pos.inode != inode) + if (iter->pos.inode != inum.inum) break; - if (k.k->type == desc.key_type) { + if (is_visible_key(desc, inum, k)) { if (!desc.cmp_key(k, key)) - return iter; + return 0; } else if (k.k->type == KEY_TYPE_hash_whiteout) { ; } else { @@ -166,35 +179,38 @@ bch2_hash_lookup(struct btree_trans *trans, break; } } - bch2_trans_iter_put(trans, iter); + bch2_trans_iter_exit(trans, iter); - return ERR_PTR(ret ?: -ENOENT); + return ret ?: -ENOENT; } -static __always_inline struct btree_iter * +static __always_inline int bch2_hash_hole(struct btree_trans *trans, + struct btree_iter *iter, const struct bch_hash_desc desc, const struct bch_hash_info *info, - u64 inode, const void *key) + subvol_inum inum, const void *key) { - struct btree_iter *iter; struct bkey_s_c k; + u32 snapshot; int ret; - for_each_btree_key(trans, iter, desc.btree_id, - POS(inode, desc.hash_key(info, key)), + ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); + if (ret) + return ret; + + for_each_btree_key_norestart(trans, *iter, desc.btree_id, + SPOS(inum.inum, desc.hash_key(info, key), snapshot), BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) { - if (iter->pos.inode != inode) + if (iter->pos.inode != inum.inum) break; - if (k.k->type != desc.key_type) - return iter; + if (!is_visible_key(desc, inum, k)) + return 0; } + bch2_trans_iter_exit(trans, iter); - iter->flags |= BTREE_ITER_KEEP_UNTIL_COMMIT; - bch2_trans_iter_put(trans, iter); - - return ERR_PTR(ret ?: -ENOSPC); + return ret ?: -ENOSPC; } static __always_inline @@ -203,28 +219,27 @@ int bch2_hash_needs_whiteout(struct btree_trans *trans, const struct bch_hash_info *info, struct btree_iter *start) { - struct btree_iter *iter; + struct btree_iter iter; struct bkey_s_c k; int ret; - iter = bch2_trans_copy_iter(trans, start); + bch2_trans_copy_iter(&iter, start); - bch2_btree_iter_next_slot(iter); + bch2_btree_iter_advance(&iter); - for_each_btree_key_continue(iter, BTREE_ITER_SLOTS, k, ret) { + for_each_btree_key_continue_norestart(iter, BTREE_ITER_SLOTS, k, ret) { if (k.k->type != desc.key_type && k.k->type != KEY_TYPE_hash_whiteout) break; if (k.k->type == desc.key_type && desc.hash_bkey(info, k) <= start->pos.offset) { - iter->flags |= BTREE_ITER_KEEP_UNTIL_COMMIT; ret = 1; break; } } - bch2_trans_iter_put(trans, iter); + bch2_trans_iter_exit(trans, &iter); return ret; } @@ -232,20 +247,28 @@ static __always_inline int bch2_hash_set(struct btree_trans *trans, const struct bch_hash_desc desc, const struct bch_hash_info *info, - u64 inode, struct bkey_i *insert, int flags) + subvol_inum inum, + struct bkey_i *insert, int flags) { - struct btree_iter *iter, *slot = NULL; + struct btree_iter iter, slot = { NULL }; struct bkey_s_c k; bool found = false; + u32 snapshot; int ret; - for_each_btree_key(trans, iter, desc.btree_id, - POS(inode, desc.hash_bkey(info, bkey_i_to_s_c(insert))), + ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); + if (ret) + return ret; + + for_each_btree_key_norestart(trans, iter, desc.btree_id, + SPOS(inum.inum, + desc.hash_bkey(info, bkey_i_to_s_c(insert)), + snapshot), BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) { - if (iter->pos.inode != inode) + if (iter.pos.inode != inum.inum) break; - if (k.k->type == desc.key_type) { + if (is_visible_key(desc, inum, 
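
bch2_hash_lookup() now iterates within a single snapshot (via SPOS) and filters candidates through is_visible_key(). A userspace model of that predicate; the dirent_visible() rule below is a hypothetical example of an is_visible hook, not the kernel's actual dirent visibility logic:

        #include <stdbool.h>
        #include <stdint.h>
        #include <stdio.h>

        struct subvol_inum { uint32_t subvol; uint64_t inum; };
        struct key { uint8_t type; uint32_t subvol; };

        struct hash_desc {
                uint8_t key_type;
                bool (*is_visible)(struct subvol_inum, const struct key *);
        };

        /* hypothetical hook: key only visible from the subvolume owning it */
        static bool dirent_visible(struct subvol_inum inum, const struct key *k)
        {
                return k->subvol == inum.subvol;
        }

        static bool is_visible_key(const struct hash_desc *desc,
                                   struct subvol_inum inum, const struct key *k)
        {
                return k->type == desc->key_type &&
                        (!desc->is_visible || desc->is_visible(inum, k));
        }

        int main(void)
        {
                struct hash_desc desc = { .key_type = 1,
                                          .is_visible = dirent_visible };
                struct subvol_inum inum = { .subvol = 7, .inum = 42 };
                struct key a = { .type = 1, .subvol = 7 };
                struct key b = { .type = 1, .subvol = 8 };

                printf("a visible: %d, b visible: %d\n",
                       is_visible_key(&desc, inum, &a),
                       is_visible_key(&desc, inum, &b));  /* 1, 0 */
                return 0;
        }
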
k)) { if (!desc.cmp_bkey(k, bkey_i_to_s_c(insert))) goto found; @@ -253,9 +276,9 @@ int bch2_hash_set(struct btree_trans *trans, continue; } - if (!slot && + if (!slot.path && !(flags & BCH_HASH_SET_MUST_REPLACE)) - slot = bch2_trans_copy_iter(trans, iter); + bch2_trans_copy_iter(&slot, &iter); if (k.k->type != KEY_TYPE_hash_whiteout) goto not_found; @@ -264,8 +287,8 @@ int bch2_hash_set(struct btree_trans *trans, if (!ret) ret = -ENOSPC; out: - bch2_trans_iter_put(trans, slot); - bch2_trans_iter_put(trans, iter); + bch2_trans_iter_exit(trans, &slot); + bch2_trans_iter_exit(trans, &iter); return ret; found: @@ -277,11 +300,11 @@ not_found: } else if (found && (flags & BCH_HASH_SET_MUST_CREATE)) { ret = -EEXIST; } else { - if (!found && slot) + if (!found && slot.path) swap(iter, slot); - insert->k.p = iter->pos; - ret = bch2_trans_update(trans, iter, insert, 0); + insert->k.p = iter.pos; + ret = bch2_trans_update(trans, &iter, insert, 0); } goto out; @@ -291,7 +314,8 @@ static __always_inline int bch2_hash_delete_at(struct btree_trans *trans, const struct bch_hash_desc desc, const struct bch_hash_info *info, - struct btree_iter *iter) + struct btree_iter *iter, + unsigned update_flags) { struct bkey_i *delete; int ret; @@ -309,25 +333,25 @@ int bch2_hash_delete_at(struct btree_trans *trans, delete->k.p = iter->pos; delete->k.type = ret ? KEY_TYPE_hash_whiteout : KEY_TYPE_deleted; - return bch2_trans_update(trans, iter, delete, 0); + return bch2_trans_update(trans, iter, delete, update_flags); } static __always_inline int bch2_hash_delete(struct btree_trans *trans, const struct bch_hash_desc desc, const struct bch_hash_info *info, - u64 inode, const void *key) + subvol_inum inum, const void *key) { - struct btree_iter *iter; + struct btree_iter iter; int ret; - iter = bch2_hash_lookup(trans, desc, info, inode, key, + ret = bch2_hash_lookup(trans, &iter, desc, info, inum, key, BTREE_ITER_INTENT); - if (IS_ERR(iter)) - return PTR_ERR(iter); + if (ret) + return ret; - ret = bch2_hash_delete_at(trans, desc, info, iter); - bch2_trans_iter_put(trans, iter); + ret = bch2_hash_delete_at(trans, desc, info, &iter, 0); + bch2_trans_iter_exit(trans, &iter); return ret; } diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c new file mode 100644 index 000000000000..7e909a118189 --- /dev/null +++ b/fs/bcachefs/subvolume.c @@ -0,0 +1,1084 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" +#include "btree_key_cache.h" +#include "btree_update.h" +#include "error.h" +#include "fs.h" +#include "subvolume.h" + +/* Snapshot tree: */ + +static void bch2_delete_dead_snapshots_work(struct work_struct *); +static void bch2_delete_dead_snapshots(struct bch_fs *); + +void bch2_snapshot_to_text(struct printbuf *out, struct bch_fs *c, + struct bkey_s_c k) +{ + struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(k); + + pr_buf(out, "is_subvol %llu deleted %llu parent %u children %u %u subvol %u", + BCH_SNAPSHOT_SUBVOL(s.v), + BCH_SNAPSHOT_DELETED(s.v), + le32_to_cpu(s.v->parent), + le32_to_cpu(s.v->children[0]), + le32_to_cpu(s.v->children[1]), + le32_to_cpu(s.v->subvol)); +} + +const char *bch2_snapshot_invalid(const struct bch_fs *c, struct bkey_s_c k) +{ + struct bkey_s_c_snapshot s; + u32 i, id; + + if (bkey_cmp(k.k->p, POS(0, U32_MAX)) > 0 || + bkey_cmp(k.k->p, POS(0, 1)) < 0) + return "bad pos"; + + if (bkey_val_bytes(k.k) != sizeof(struct bch_snapshot)) + return "bad val size"; + + s = bkey_s_c_to_snapshot(k); + + id = le32_to_cpu(s.v->parent); + if (id && id <= k.k->p.offset) + return "bad 
parent node"; + + if (le32_to_cpu(s.v->children[0]) < le32_to_cpu(s.v->children[1])) + return "children not normalized"; + + if (s.v->children[0] && + s.v->children[0] == s.v->children[1]) + return "duplicate child nodes"; + + for (i = 0; i < 2; i++) { + id = le32_to_cpu(s.v->children[i]); + + if (id >= k.k->p.offset) + return "bad child node"; + } + + return NULL; +} + +int bch2_mark_snapshot(struct btree_trans *trans, + struct bkey_s_c old, struct bkey_s_c new, + unsigned flags) +{ + struct bch_fs *c = trans->c; + struct snapshot_t *t; + + t = genradix_ptr_alloc(&c->snapshots, + U32_MAX - new.k->p.offset, + GFP_KERNEL); + if (!t) + return -ENOMEM; + + if (new.k->type == KEY_TYPE_snapshot) { + struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(new); + + t->parent = le32_to_cpu(s.v->parent); + t->children[0] = le32_to_cpu(s.v->children[0]); + t->children[1] = le32_to_cpu(s.v->children[1]); + t->subvol = BCH_SNAPSHOT_SUBVOL(s.v) ? le32_to_cpu(s.v->subvol) : 0; + } else { + t->parent = 0; + t->children[0] = 0; + t->children[1] = 0; + t->subvol = 0; + } + + return 0; +} + +static int snapshot_lookup(struct btree_trans *trans, u32 id, + struct bch_snapshot *s) +{ + struct btree_iter iter; + struct bkey_s_c k; + int ret; + + bch2_trans_iter_init(trans, &iter, BTREE_ID_snapshots, POS(0, id), + BTREE_ITER_WITH_UPDATES); + k = bch2_btree_iter_peek_slot(&iter); + ret = bkey_err(k) ?: k.k->type == KEY_TYPE_snapshot ? 0 : -ENOENT; + + if (!ret) + *s = *bkey_s_c_to_snapshot(k).v; + + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +static int snapshot_live(struct btree_trans *trans, u32 id) +{ + struct bch_snapshot v; + int ret; + + if (!id) + return 0; + + ret = lockrestart_do(trans, snapshot_lookup(trans, id, &v)); + if (ret == -ENOENT) + bch_err(trans->c, "snapshot node %u not found", id); + if (ret) + return ret; + + return !BCH_SNAPSHOT_DELETED(&v); +} + +static int bch2_snapshots_set_equiv(struct btree_trans *trans) +{ + struct bch_fs *c = trans->c; + struct btree_iter iter; + struct bkey_s_c k; + struct bkey_s_c_snapshot snap; + unsigned i; + int ret; + + for_each_btree_key(trans, iter, BTREE_ID_snapshots, + POS_MIN, 0, k, ret) { + u32 id = k.k->p.offset, child[2]; + unsigned nr_live = 0, live_idx; + + if (k.k->type != KEY_TYPE_snapshot) + continue; + + snap = bkey_s_c_to_snapshot(k); + child[0] = le32_to_cpu(snap.v->children[0]); + child[1] = le32_to_cpu(snap.v->children[1]); + + for (i = 0; i < 2; i++) { + ret = snapshot_live(trans, child[i]); + if (ret < 0) + break; + + if (ret) + live_idx = i; + nr_live += ret; + } + + snapshot_t(c, id)->equiv = nr_live == 1 + ? 
snapshot_t(c, child[live_idx])->equiv + : id; + } + bch2_trans_iter_exit(trans, &iter); + + if (ret) + bch_err(c, "error walking snapshots: %i", ret); + + return ret; +} + +/* fsck: */ +static int bch2_snapshot_check(struct btree_trans *trans, + struct bkey_s_c_snapshot s) +{ + struct bch_subvolume subvol; + struct bch_snapshot v; + u32 i, id; + int ret; + + id = le32_to_cpu(s.v->subvol); + ret = lockrestart_do(trans, bch2_subvolume_get(trans, id, 0, false, &subvol)); + if (ret == -ENOENT) + bch_err(trans->c, "snapshot node %llu has nonexistent subvolume %u", + s.k->p.offset, id); + if (ret) + return ret; + + if (BCH_SNAPSHOT_SUBVOL(s.v) != (le32_to_cpu(subvol.snapshot) == s.k->p.offset)) { + bch_err(trans->c, "snapshot node %llu has wrong BCH_SNAPSHOT_SUBVOL", + s.k->p.offset); + return -EINVAL; + } + + id = le32_to_cpu(s.v->parent); + if (id) { + ret = lockrestart_do(trans, snapshot_lookup(trans, id, &v)); + if (ret == -ENOENT) + bch_err(trans->c, "snapshot node %llu has nonexistent parent %u", + s.k->p.offset, id); + if (ret) + return ret; + + if (le32_to_cpu(v.children[0]) != s.k->p.offset && + le32_to_cpu(v.children[1]) != s.k->p.offset) { + bch_err(trans->c, "snapshot parent %u missing pointer to child %llu", + id, s.k->p.offset); + return -EINVAL; + } + } + + for (i = 0; i < 2 && s.v->children[i]; i++) { + id = le32_to_cpu(s.v->children[i]); + + ret = lockrestart_do(trans, snapshot_lookup(trans, id, &v)); + if (ret == -ENOENT) + bch_err(trans->c, "snapshot node %llu has nonexistent child %u", + s.k->p.offset, id); + if (ret) + return ret; + + if (le32_to_cpu(v.parent) != s.k->p.offset) { + bch_err(trans->c, "snapshot child %u has wrong parent (got %u should be %llu)", + id, le32_to_cpu(v.parent), s.k->p.offset); + return -EINVAL; + } + } + + return 0; +} + +int bch2_fs_snapshots_check(struct bch_fs *c) +{ + struct btree_trans trans; + struct btree_iter iter; + struct bkey_s_c k; + struct bch_snapshot s; + unsigned id; + int ret; + + bch2_trans_init(&trans, c, 0, 0); + + for_each_btree_key(&trans, iter, BTREE_ID_snapshots, + POS_MIN, 0, k, ret) { + if (k.k->type != KEY_TYPE_snapshot) + continue; + + ret = bch2_snapshot_check(&trans, bkey_s_c_to_snapshot(k)); + if (ret) + break; + } + bch2_trans_iter_exit(&trans, &iter); + + if (ret) { + bch_err(c, "error %i checking snapshots", ret); + goto err; + } + + for_each_btree_key(&trans, iter, BTREE_ID_subvolumes, + POS_MIN, 0, k, ret) { + if (k.k->type != KEY_TYPE_subvolume) + continue; +again_2: + id = le32_to_cpu(bkey_s_c_to_subvolume(k).v->snapshot); + ret = snapshot_lookup(&trans, id, &s); + + if (ret == -EINTR) { + k = bch2_btree_iter_peek(&iter); + goto again_2; + } else if (ret == -ENOENT) + bch_err(c, "subvolume %llu points to nonexistent snapshot %u", + k.k->p.offset, id); + else if (ret) + break; + } + bch2_trans_iter_exit(&trans, &iter); +err: + bch2_trans_exit(&trans); + return ret; +} + +void bch2_fs_snapshots_exit(struct bch_fs *c) +{ + genradix_free(&c->snapshots); +} + +int bch2_fs_snapshots_start(struct bch_fs *c) +{ + struct btree_trans trans; + struct btree_iter iter; + struct bkey_s_c k; + bool have_deleted = false; + int ret = 0; + + bch2_trans_init(&trans, c, 0, 0); + + for_each_btree_key(&trans, iter, BTREE_ID_snapshots, + POS_MIN, 0, k, ret) { + if (bkey_cmp(k.k->p, POS(0, U32_MAX)) > 0) + break; + + if (k.k->type != KEY_TYPE_snapshot) { + bch_err(c, "found wrong key type %u in snapshot node table", + k.k->type); + continue; + } + + if (BCH_SNAPSHOT_DELETED(bkey_s_c_to_snapshot(k).v)) + have_deleted = true; + + ret = 
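
bch2_snapshots_set_equiv() collapses chains of interior nodes with a single live child into one equivalence class. Because children always have smaller IDs than their parent, the ascending key walk visits children before parents, so a parent can inherit its only live child's already-computed equiv. A toy model with three nodes:

        #include <stdbool.h>
        #include <stdint.h>
        #include <stdio.h>

        struct snap {
                uint32_t id, children[2], equiv;
                bool deleted;
        };

        /* toy table, sorted by increasing id; 0 == no child */
        static struct snap snaps[] = {
                { .id = 3 },                         /* live leaf */
                { .id = 5, .deleted = true },        /* dead leaf */
                { .id = 10, .children = { 5, 3 } },  /* interior node */
        };

        static struct snap *snap_get(uint32_t id)
        {
                for (unsigned i = 0; i < sizeof(snaps) / sizeof(snaps[0]); i++)
                        if (snaps[i].id == id)
                                return &snaps[i];
                return NULL;
        }

        int main(void)
        {
                for (unsigned i = 0; i < sizeof(snaps) / sizeof(snaps[0]); i++) {
                        struct snap *s = &snaps[i];
                        unsigned nr_live = 0;
                        uint32_t live = 0;

                        for (int c = 0; c < 2; c++) {
                                struct snap *child = s->children[c]
                                        ? snap_get(s->children[c]) : NULL;
                                if (child && !child->deleted) {
                                        nr_live++;
                                        live = child->equiv;
                                }
                        }
                        s->equiv = nr_live == 1 ? live : s->id;
                        printf("snapshot %u -> equiv %u\n", s->id, s->equiv);
                }
                return 0;   /* prints 3->3, 5->5, 10->3 */
        }
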
bch2_mark_snapshot(&trans, bkey_s_c_null, k, 0); + if (ret) + break; + } + bch2_trans_iter_exit(&trans, &iter); + + if (ret) + goto err; + + ret = bch2_snapshots_set_equiv(&trans); + if (ret) + goto err; +err: + bch2_trans_exit(&trans); + + if (!ret && have_deleted) { + bch_info(c, "restarting deletion of dead snapshots"); + if (c->opts.fsck) { + bch2_delete_dead_snapshots_work(&c->snapshot_delete_work); + } else { + bch2_delete_dead_snapshots(c); + } + } + + return ret; +} + +/* + * Mark a snapshot as deleted, for future cleanup: + */ +static int bch2_snapshot_node_set_deleted(struct btree_trans *trans, u32 id) +{ + struct btree_iter iter; + struct bkey_s_c k; + struct bkey_i_snapshot *s; + int ret = 0; + + bch2_trans_iter_init(trans, &iter, BTREE_ID_snapshots, POS(0, id), + BTREE_ITER_INTENT); + k = bch2_btree_iter_peek_slot(&iter); + ret = bkey_err(k); + if (ret) + goto err; + + if (k.k->type != KEY_TYPE_snapshot) { + bch2_fs_inconsistent(trans->c, "missing snapshot %u", id); + ret = -ENOENT; + goto err; + } + + /* already deleted? */ + if (BCH_SNAPSHOT_DELETED(bkey_s_c_to_snapshot(k).v)) + goto err; + + s = bch2_trans_kmalloc(trans, sizeof(*s)); + ret = PTR_ERR_OR_ZERO(s); + if (ret) + goto err; + + bkey_reassemble(&s->k_i, k); + + SET_BCH_SNAPSHOT_DELETED(&s->v, true); + ret = bch2_trans_update(trans, &iter, &s->k_i, 0); + if (ret) + goto err; +err: + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +static int bch2_snapshot_node_delete(struct btree_trans *trans, u32 id) +{ + struct btree_iter iter, p_iter = (struct btree_iter) { NULL }; + struct bkey_s_c k; + struct bkey_s_c_snapshot s; + struct bkey_i_snapshot *parent; + u32 parent_id; + unsigned i; + int ret = 0; + + bch2_trans_iter_init(trans, &iter, BTREE_ID_snapshots, POS(0, id), + BTREE_ITER_INTENT); + k = bch2_btree_iter_peek_slot(&iter); + ret = bkey_err(k); + if (ret) + goto err; + + if (k.k->type != KEY_TYPE_snapshot) { + bch2_fs_inconsistent(trans->c, "missing snapshot %u", id); + ret = -ENOENT; + goto err; + } + + s = bkey_s_c_to_snapshot(k); + + BUG_ON(!BCH_SNAPSHOT_DELETED(s.v)); + parent_id = le32_to_cpu(s.v->parent); + + if (parent_id) { + bch2_trans_iter_init(trans, &p_iter, BTREE_ID_snapshots, + POS(0, parent_id), + BTREE_ITER_INTENT); + k = bch2_btree_iter_peek_slot(&p_iter); + ret = bkey_err(k); + if (ret) + goto err; + + if (k.k->type != KEY_TYPE_snapshot) { + bch2_fs_inconsistent(trans->c, "missing snapshot %u", parent_id); + ret = -ENOENT; + goto err; + } + + parent = bch2_trans_kmalloc(trans, sizeof(*parent)); + ret = PTR_ERR_OR_ZERO(parent); + if (ret) + goto err; + + bkey_reassemble(&parent->k_i, k); + + for (i = 0; i < 2; i++) + if (le32_to_cpu(parent->v.children[i]) == id) + break; + + if (i == 2) + bch_err(trans->c, "snapshot %u missing child pointer to %u", + parent_id, id); + else + parent->v.children[i] = 0; + + if (le32_to_cpu(parent->v.children[0]) < + le32_to_cpu(parent->v.children[1])) + swap(parent->v.children[0], + parent->v.children[1]); + + ret = bch2_trans_update(trans, &p_iter, &parent->k_i, 0); + if (ret) + goto err; + } + + ret = bch2_btree_delete_at(trans, &iter, 0); +err: + bch2_trans_iter_exit(trans, &p_iter); + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +static int bch2_snapshot_node_create(struct btree_trans *trans, u32 parent, + u32 *new_snapids, + u32 *snapshot_subvols, + unsigned nr_snapids) +{ + struct btree_iter iter; + struct bkey_i_snapshot *n; + struct bkey_s_c k; + unsigned i; + int ret = 0; + + bch2_trans_iter_init(trans, &iter, BTREE_ID_snapshots, + 
POS_MIN, BTREE_ITER_INTENT); + k = bch2_btree_iter_peek(&iter); + ret = bkey_err(k); + if (ret) + goto err; + + for (i = 0; i < nr_snapids; i++) { + k = bch2_btree_iter_prev_slot(&iter); + ret = bkey_err(k); + if (ret) + goto err; + + if (!k.k || !k.k->p.offset) { + ret = -ENOSPC; + goto err; + } + + n = bch2_trans_kmalloc(trans, sizeof(*n)); + ret = PTR_ERR_OR_ZERO(n); + if (ret) + return ret; + + bkey_snapshot_init(&n->k_i); + n->k.p = iter.pos; + n->v.flags = 0; + n->v.parent = cpu_to_le32(parent); + n->v.subvol = cpu_to_le32(snapshot_subvols[i]); + n->v.pad = 0; + SET_BCH_SNAPSHOT_SUBVOL(&n->v, true); + + bch2_trans_update(trans, &iter, &n->k_i, 0); + + ret = bch2_mark_snapshot(trans, bkey_s_c_null, bkey_i_to_s_c(&n->k_i), 0); + if (ret) + break; + + new_snapids[i] = iter.pos.offset; + } + + if (parent) { + bch2_btree_iter_set_pos(&iter, POS(0, parent)); + k = bch2_btree_iter_peek(&iter); + ret = bkey_err(k); + if (ret) + goto err; + + if (k.k->type != KEY_TYPE_snapshot) { + bch_err(trans->c, "snapshot %u not found", parent); + ret = -ENOENT; + goto err; + } + + n = bch2_trans_kmalloc(trans, sizeof(*n)); + ret = PTR_ERR_OR_ZERO(n); + if (ret) + return ret; + + bkey_reassemble(&n->k_i, k); + + if (n->v.children[0] || n->v.children[1]) { + bch_err(trans->c, "Trying to add child snapshot nodes to parent that already has children"); + ret = -EINVAL; + goto err; + } + + n->v.children[0] = cpu_to_le32(new_snapids[0]); + n->v.children[1] = cpu_to_le32(new_snapids[1]); + SET_BCH_SNAPSHOT_SUBVOL(&n->v, false); + bch2_trans_update(trans, &iter, &n->k_i, 0); + } +err: + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +static int snapshot_id_add(struct snapshot_id_list *s, u32 id) +{ + BUG_ON(snapshot_list_has_id(s, id)); + + if (s->nr == s->size) { + size_t new_size = max(8U, s->size * 2); + void *n = krealloc(s->d, + new_size * sizeof(s->d[0]), + GFP_KERNEL); + if (!n) { + pr_err("error allocating snapshot ID list"); + return -ENOMEM; + } + + s->d = n; + s->size = new_size; + }; + + s->d[s->nr++] = id; + return 0; +} + +static int bch2_snapshot_delete_keys_btree(struct btree_trans *trans, + struct snapshot_id_list *deleted, + enum btree_id btree_id) +{ + struct bch_fs *c = trans->c; + struct btree_iter iter; + struct bkey_s_c k; + struct snapshot_id_list equiv_seen = { 0 }; + struct bpos last_pos = POS_MIN; + int ret = 0; + + /* + * XXX: We should also delete whiteouts that no longer overwrite + * anything + */ + + bch2_trans_iter_init(trans, &iter, btree_id, POS_MIN, + BTREE_ITER_INTENT| + BTREE_ITER_PREFETCH| + BTREE_ITER_NOT_EXTENTS| + BTREE_ITER_ALL_SNAPSHOTS); + + while ((bch2_trans_begin(trans), + (k = bch2_btree_iter_peek(&iter)).k) && + !(ret = bkey_err(k))) { + u32 equiv = snapshot_t(c, k.k->p.snapshot)->equiv; + + if (bkey_cmp(k.k->p, last_pos)) + equiv_seen.nr = 0; + last_pos = k.k->p; + + if (snapshot_list_has_id(deleted, k.k->p.snapshot) || + snapshot_list_has_id(&equiv_seen, equiv)) { + if (btree_id == BTREE_ID_inodes && + bch2_btree_key_cache_flush(trans, btree_id, iter.pos)) + continue; + + ret = __bch2_trans_do(trans, NULL, NULL, + BTREE_INSERT_NOFAIL, + bch2_btree_iter_traverse(&iter) ?: + bch2_btree_delete_at(trans, &iter, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE)); + if (ret) + break; + } else { + ret = snapshot_id_add(&equiv_seen, equiv); + if (ret) + break; + } + + bch2_btree_iter_advance(&iter); + } + bch2_trans_iter_exit(trans, &iter); + + kfree(equiv_seen.d); + + return ret; +} + +static void bch2_delete_dead_snapshots_work(struct work_struct *work) +{ + struct 
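
bch2_snapshot_delete_keys_btree() walks each btree in (pos, snapshot) order and drops keys that belong to a deleted snapshot, plus keys made redundant because an earlier key at the same position already covered the same equivalence class. A compact model of that filter, with made-up keys:

        #include <stdbool.h>
        #include <stdint.h>
        #include <stdio.h>

        struct key { uint64_t pos; uint32_t snapshot, equiv; };

        static bool list_has(const uint32_t *d, unsigned nr, uint32_t id)
        {
                for (unsigned i = 0; i < nr; i++)
                        if (d[i] == id)
                                return true;
                return false;
        }

        int main(void)
        {
                const uint32_t deleted[] = { 5 };
                /* keys sorted by (pos, snapshot); equiv computed as above */
                const struct key keys[] = {
                        { .pos = 1, .snapshot = 3,  .equiv = 3 },
                        { .pos = 1, .snapshot = 10, .equiv = 3 }, /* redundant */
                        { .pos = 2, .snapshot = 5,  .equiv = 5 }, /* dead snap */
                        { .pos = 2, .snapshot = 10, .equiv = 3 },
                };
                uint32_t seen[8];
                unsigned nr_seen = 0;
                uint64_t last_pos = ~0ULL;

                for (unsigned i = 0; i < sizeof(keys) / sizeof(keys[0]); i++) {
                        const struct key *k = &keys[i];

                        if (k->pos != last_pos)  /* new position: reset */
                                nr_seen = 0;
                        last_pos = k->pos;

                        if (list_has(deleted, 1, k->snapshot) ||
                            list_has(seen, nr_seen, k->equiv))
                                printf("delete pos %llu snapshot %u\n",
                                       (unsigned long long) k->pos, k->snapshot);
                        else
                                seen[nr_seen++] = k->equiv;
                }
                return 0;
        }
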
bch_fs *c = container_of(work, struct bch_fs, snapshot_delete_work); + struct btree_trans trans; + struct btree_iter iter; + struct bkey_s_c k; + struct bkey_s_c_snapshot snap; + struct snapshot_id_list deleted = { 0 }; + u32 i, id, children[2]; + int ret = 0; + + bch2_trans_init(&trans, c, 0, 0); + + /* + * For every snapshot node: If we have no live children and it's not + * pointed to by a subvolume, delete it: + */ + for_each_btree_key(&trans, iter, BTREE_ID_snapshots, + POS_MIN, 0, k, ret) { + if (k.k->type != KEY_TYPE_snapshot) + continue; + + snap = bkey_s_c_to_snapshot(k); + if (BCH_SNAPSHOT_DELETED(snap.v) || + BCH_SNAPSHOT_SUBVOL(snap.v)) + continue; + + children[0] = le32_to_cpu(snap.v->children[0]); + children[1] = le32_to_cpu(snap.v->children[1]); + + ret = snapshot_live(&trans, children[0]) ?: + snapshot_live(&trans, children[1]); + if (ret < 0) + break; + if (ret) + continue; + + ret = __bch2_trans_do(&trans, NULL, NULL, 0, + bch2_snapshot_node_set_deleted(&trans, iter.pos.offset)); + if (ret) { + bch_err(c, "error deleting snapshot %llu: %i", iter.pos.offset, ret); + break; + } + } + bch2_trans_iter_exit(&trans, &iter); + + if (ret) { + bch_err(c, "error walking snapshots: %i", ret); + goto err; + } + + ret = bch2_snapshots_set_equiv(&trans); + if (ret) + goto err; + + for_each_btree_key(&trans, iter, BTREE_ID_snapshots, + POS_MIN, 0, k, ret) { + if (k.k->type != KEY_TYPE_snapshot) + continue; + + snap = bkey_s_c_to_snapshot(k); + if (BCH_SNAPSHOT_DELETED(snap.v)) { + ret = snapshot_id_add(&deleted, k.k->p.offset); + if (ret) + break; + } + } + bch2_trans_iter_exit(&trans, &iter); + + if (ret) { + bch_err(c, "error walking snapshots: %i", ret); + goto err; + } + + for (id = 0; id < BTREE_ID_NR; id++) { + if (!btree_type_has_snapshots(id)) + continue; + + ret = bch2_snapshot_delete_keys_btree(&trans, &deleted, id); + if (ret) { + bch_err(c, "error deleting snapshot keys: %i", ret); + goto err; + } + } + + for (i = 0; i < deleted.nr; i++) { + ret = __bch2_trans_do(&trans, NULL, NULL, 0, + bch2_snapshot_node_delete(&trans, deleted.d[i])); + if (ret) { + bch_err(c, "error deleting snapshot %u: %i", + deleted.d[i], ret); + goto err; + } + } +err: + kfree(deleted.d); + bch2_trans_exit(&trans); + percpu_ref_put(&c->writes); +} + +static void bch2_delete_dead_snapshots(struct bch_fs *c) +{ + if (unlikely(!percpu_ref_tryget(&c->writes))) + return; + + if (!queue_work(system_long_wq, &c->snapshot_delete_work)) + percpu_ref_put(&c->writes); +} + +static int bch2_delete_dead_snapshots_hook(struct btree_trans *trans, + struct btree_trans_commit_hook *h) +{ + bch2_delete_dead_snapshots(trans->c); + return 0; +} + +/* Subvolumes: */ + +const char *bch2_subvolume_invalid(const struct bch_fs *c, struct bkey_s_c k) +{ + if (bkey_cmp(k.k->p, SUBVOL_POS_MIN) < 0) + return "invalid pos"; + + if (bkey_cmp(k.k->p, SUBVOL_POS_MAX) > 0) + return "invalid pos"; + + if (bkey_val_bytes(k.k) != sizeof(struct bch_subvolume)) + return "bad val size"; + + return NULL; +} + +void bch2_subvolume_to_text(struct printbuf *out, struct bch_fs *c, + struct bkey_s_c k) +{ + struct bkey_s_c_subvolume s = bkey_s_c_to_subvolume(k); + + pr_buf(out, "root %llu snapshot id %u", + le64_to_cpu(s.v->inode), + le32_to_cpu(s.v->snapshot)); +} + +int bch2_subvolume_get(struct btree_trans *trans, unsigned subvol, + bool inconsistent_if_not_found, + int iter_flags, + struct bch_subvolume *s) +{ + struct btree_iter iter; + struct bkey_s_c k; + int ret; + + bch2_trans_iter_init(trans, &iter, BTREE_ID_subvolumes, POS(0, subvol), 
+ iter_flags); + k = bch2_btree_iter_peek_slot(&iter); + ret = bkey_err(k) ?: k.k->type == KEY_TYPE_subvolume ? 0 : -ENOENT; + + if (ret == -ENOENT && inconsistent_if_not_found) + bch2_fs_inconsistent(trans->c, "missing subvolume %u", subvol); + if (!ret) + *s = *bkey_s_c_to_subvolume(k).v; + + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +int bch2_snapshot_get_subvol(struct btree_trans *trans, u32 snapshot, + struct bch_subvolume *subvol) +{ + struct bch_snapshot snap; + + return snapshot_lookup(trans, snapshot, &snap) ?: + bch2_subvolume_get(trans, le32_to_cpu(snap.subvol), true, 0, subvol); +} + +int bch2_subvolume_get_snapshot(struct btree_trans *trans, u32 subvol, + u32 *snapid) +{ + struct bch_subvolume s; + int ret; + + ret = bch2_subvolume_get(trans, subvol, true, + BTREE_ITER_CACHED| + BTREE_ITER_WITH_UPDATES, + &s); + + *snapid = le32_to_cpu(s.snapshot); + return ret; +} + +/* + * Delete subvolume, mark snapshot ID as deleted, queue up snapshot + * deletion/cleanup: + */ +int bch2_subvolume_delete(struct btree_trans *trans, u32 subvolid) +{ + struct btree_iter iter; + struct bkey_s_c k; + struct bkey_s_c_subvolume subvol; + struct btree_trans_commit_hook *h; + struct bkey_i *delete; + u32 snapid; + int ret = 0; + + bch2_trans_iter_init(trans, &iter, BTREE_ID_subvolumes, + POS(0, subvolid), + BTREE_ITER_CACHED| + BTREE_ITER_INTENT); + k = bch2_btree_iter_peek_slot(&iter); + ret = bkey_err(k); + if (ret) + goto err; + + if (k.k->type != KEY_TYPE_subvolume) { + bch2_fs_inconsistent(trans->c, "missing subvolume %u", subvolid); + ret = -EIO; + goto err; + } + + subvol = bkey_s_c_to_subvolume(k); + snapid = le32_to_cpu(subvol.v->snapshot); + + delete = bch2_trans_kmalloc(trans, sizeof(*delete)); + ret = PTR_ERR_OR_ZERO(delete); + if (ret) + goto err; + + bkey_init(&delete->k); + delete->k.p = iter.pos; + ret = bch2_trans_update(trans, &iter, delete, 0); + if (ret) + goto err; + + ret = bch2_snapshot_node_set_deleted(trans, snapid); + + h = bch2_trans_kmalloc(trans, sizeof(*h)); + ret = PTR_ERR_OR_ZERO(h); + if (ret) + goto err; + + h->fn = bch2_delete_dead_snapshots_hook; + bch2_trans_commit_hook(trans, h); +err: + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +void bch2_subvolume_wait_for_pagecache_and_delete(struct work_struct *work) +{ + struct bch_fs *c = container_of(work, struct bch_fs, + snapshot_wait_for_pagecache_and_delete_work); + struct snapshot_id_list s; + u32 *id; + int ret = 0; + + while (!ret) { + mutex_lock(&c->snapshots_unlinked_lock); + s = c->snapshots_unlinked; + memset(&c->snapshots_unlinked, 0, sizeof(c->snapshots_unlinked)); + mutex_unlock(&c->snapshots_unlinked_lock); + + if (!s.nr) + break; + + bch2_evict_subvolume_inodes(c, &s); + + for (id = s.d; id < s.d + s.nr; id++) { + ret = bch2_trans_do(c, NULL, NULL, BTREE_INSERT_NOFAIL, + bch2_subvolume_delete(&trans, *id)); + if (ret) { + bch_err(c, "error %i deleting subvolume %u", ret, *id); + break; + } + } + + kfree(s.d); + } + + percpu_ref_put(&c->writes); +} + +struct subvolume_unlink_hook { + struct btree_trans_commit_hook h; + u32 subvol; +}; + +int bch2_subvolume_wait_for_pagecache_and_delete_hook(struct btree_trans *trans, + struct btree_trans_commit_hook *_h) +{ + struct subvolume_unlink_hook *h = container_of(_h, struct subvolume_unlink_hook, h); + struct bch_fs *c = trans->c; + int ret = 0; + + mutex_lock(&c->snapshots_unlinked_lock); + if (!snapshot_list_has_id(&c->snapshots_unlinked, h->subvol)) + ret = snapshot_id_add(&c->snapshots_unlinked, h->subvol); + 
mutex_unlock(&c->snapshots_unlinked_lock); + + if (ret) + return ret; + + if (unlikely(!percpu_ref_tryget(&c->writes))) + return -EROFS; + + if (!queue_work(system_long_wq, &c->snapshot_wait_for_pagecache_and_delete_work)) + percpu_ref_put(&c->writes); + return 0; +} + +int bch2_subvolume_unlink(struct btree_trans *trans, u32 subvolid) +{ + struct btree_iter iter; + struct bkey_s_c k; + struct bkey_i_subvolume *n; + struct subvolume_unlink_hook *h; + int ret = 0; + + bch2_trans_iter_init(trans, &iter, BTREE_ID_subvolumes, + POS(0, subvolid), + BTREE_ITER_CACHED| + BTREE_ITER_INTENT); + k = bch2_btree_iter_peek_slot(&iter); + ret = bkey_err(k); + if (ret) + goto err; + + if (k.k->type != KEY_TYPE_subvolume) { + bch2_fs_inconsistent(trans->c, "missing subvolume %u", subvolid); + ret = -EIO; + goto err; + } + + n = bch2_trans_kmalloc(trans, sizeof(*n)); + ret = PTR_ERR_OR_ZERO(n); + if (ret) + goto err; + + bkey_reassemble(&n->k_i, k); + SET_BCH_SUBVOLUME_UNLINKED(&n->v, true); + + ret = bch2_trans_update(trans, &iter, &n->k_i, 0); + if (ret) + goto err; + + h = bch2_trans_kmalloc(trans, sizeof(*h)); + ret = PTR_ERR_OR_ZERO(h); + if (ret) + goto err; + + h->h.fn = bch2_subvolume_wait_for_pagecache_and_delete_hook; + h->subvol = subvolid; + bch2_trans_commit_hook(trans, &h->h); +err: + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +int bch2_subvolume_create(struct btree_trans *trans, u64 inode, + u32 src_subvolid, + u32 *new_subvolid, + u32 *new_snapshotid, + bool ro) +{ + struct bch_fs *c = trans->c; + struct btree_iter dst_iter, src_iter = (struct btree_iter) { NULL }; + struct bkey_i_subvolume *new_subvol = NULL; + struct bkey_i_subvolume *src_subvol = NULL; + struct bkey_s_c k; + u32 parent = 0, new_nodes[2], snapshot_subvols[2]; + int ret = 0; + + for_each_btree_key(trans, dst_iter, BTREE_ID_subvolumes, SUBVOL_POS_MIN, + BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) { + if (bkey_cmp(k.k->p, SUBVOL_POS_MAX) > 0) + break; + + /* + * bch2_subvolume_delete() doesn't flush the btree key cache - + * ideally it would but that's tricky + */ + if (bkey_deleted(k.k) && + !bch2_btree_key_cache_find(c, BTREE_ID_subvolumes, dst_iter.pos)) + goto found_slot; + } + + if (!ret) + ret = -ENOSPC; + goto err; +found_slot: + snapshot_subvols[0] = dst_iter.pos.offset; + snapshot_subvols[1] = src_subvolid; + + if (src_subvolid) { + /* Creating a snapshot: */ + src_subvol = bch2_trans_kmalloc(trans, sizeof(*src_subvol)); + ret = PTR_ERR_OR_ZERO(src_subvol); + if (ret) + goto err; + + bch2_trans_iter_init(trans, &src_iter, BTREE_ID_subvolumes, + POS(0, src_subvolid), + BTREE_ITER_CACHED| + BTREE_ITER_INTENT); + k = bch2_btree_iter_peek_slot(&src_iter); + ret = bkey_err(k); + if (ret) + goto err; + + if (k.k->type != KEY_TYPE_subvolume) { + bch_err(c, "subvolume %u not found", src_subvolid); + ret = -ENOENT; + goto err; + } + + bkey_reassemble(&src_subvol->k_i, k); + parent = le32_to_cpu(src_subvol->v.snapshot); + } + + ret = bch2_snapshot_node_create(trans, parent, new_nodes, + snapshot_subvols, + src_subvolid ? 
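
The unlink commit hook only appends a subvolume to snapshots_unlinked when it is not already present; reading that as making hook replay (for instance around a transaction restart) harmless is my interpretation, though the membership check itself is straight from the code. A single-threaded sketch (the kernel version holds snapshots_unlinked_lock):

        #include <stdint.h>
        #include <stdio.h>

        static uint32_t unlinked[16];
        static unsigned nr_unlinked;

        static void queue_unlink(uint32_t subvol)
        {
                for (unsigned i = 0; i < nr_unlinked; i++)
                        if (unlinked[i] == subvol)
                                return;          /* already queued */
                unlinked[nr_unlinked++] = subvol;
                /* kick the background deletion work here */
        }

        int main(void)
        {
                queue_unlink(7);
                queue_unlink(7);   /* replaying the hook is a no-op */
                printf("%u queued\n", nr_unlinked);  /* 1 */
                return 0;
        }
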
2 : 1); + if (ret) + goto err; + + if (src_subvolid) { + src_subvol->v.snapshot = cpu_to_le32(new_nodes[1]); + bch2_trans_update(trans, &src_iter, &src_subvol->k_i, 0); + } + + new_subvol = bch2_trans_kmalloc(trans, sizeof(*new_subvol)); + ret = PTR_ERR_OR_ZERO(new_subvol); + if (ret) + goto err; + + bkey_subvolume_init(&new_subvol->k_i); + new_subvol->v.flags = 0; + new_subvol->v.snapshot = cpu_to_le32(new_nodes[0]); + new_subvol->v.inode = cpu_to_le64(inode); + SET_BCH_SUBVOLUME_RO(&new_subvol->v, ro); + SET_BCH_SUBVOLUME_SNAP(&new_subvol->v, src_subvolid != 0); + new_subvol->k.p = dst_iter.pos; + bch2_trans_update(trans, &dst_iter, &new_subvol->k_i, 0); + + *new_subvolid = new_subvol->k.p.offset; + *new_snapshotid = new_nodes[0]; +err: + bch2_trans_iter_exit(trans, &src_iter); + bch2_trans_iter_exit(trans, &dst_iter); + return ret; +} + +int bch2_fs_subvolumes_init(struct bch_fs *c) +{ + INIT_WORK(&c->snapshot_delete_work, bch2_delete_dead_snapshots_work); + INIT_WORK(&c->snapshot_wait_for_pagecache_and_delete_work, + bch2_subvolume_wait_for_pagecache_and_delete); + mutex_init(&c->snapshots_unlinked_lock); + return 0; +} diff --git a/fs/bcachefs/subvolume.h b/fs/bcachefs/subvolume.h new file mode 100644 index 000000000000..e4c3fdcdf22f --- /dev/null +++ b/fs/bcachefs/subvolume.h @@ -0,0 +1,132 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_SUBVOLUME_H +#define _BCACHEFS_SUBVOLUME_H + +#include "subvolume_types.h" + +void bch2_snapshot_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); +const char *bch2_snapshot_invalid(const struct bch_fs *, struct bkey_s_c); + +#define bch2_bkey_ops_snapshot (struct bkey_ops) { \ + .key_invalid = bch2_snapshot_invalid, \ + .val_to_text = bch2_snapshot_to_text, \ +} + +int bch2_mark_snapshot(struct btree_trans *, struct bkey_s_c, + struct bkey_s_c, unsigned); + +static inline struct snapshot_t *snapshot_t(struct bch_fs *c, u32 id) +{ + return genradix_ptr(&c->snapshots, U32_MAX - id); +} + +static inline u32 bch2_snapshot_parent(struct bch_fs *c, u32 id) +{ + return snapshot_t(c, id)->parent; +} + +static inline u32 bch2_snapshot_internal_node(struct bch_fs *c, u32 id) +{ + struct snapshot_t *s = snapshot_t(c, id); + + return s->children[0] || s->children[1]; +} + +static inline u32 bch2_snapshot_sibling(struct bch_fs *c, u32 id) +{ + struct snapshot_t *s; + u32 parent = bch2_snapshot_parent(c, id); + + if (!parent) + return 0; + + s = snapshot_t(c, bch2_snapshot_parent(c, id)); + if (id == s->children[0]) + return s->children[1]; + if (id == s->children[1]) + return s->children[0]; + return 0; +} + +static inline bool bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ancestor) +{ + while (id && id < ancestor) + id = bch2_snapshot_parent(c, id); + + return id == ancestor; +} + +struct snapshots_seen { + struct bpos pos; + size_t nr; + size_t size; + u32 *d; +}; + +static inline void snapshots_seen_exit(struct snapshots_seen *s) +{ + kfree(s->d); + s->d = NULL; +} + +static inline void snapshots_seen_init(struct snapshots_seen *s) +{ + memset(s, 0, sizeof(*s)); +} + +static inline int snapshots_seen_add(struct bch_fs *c, struct snapshots_seen *s, u32 id) +{ + if (s->nr == s->size) { + size_t new_size = max(s->size, (size_t) 128) * 2; + u32 *d = krealloc(s->d, new_size * sizeof(s->d[0]), GFP_KERNEL); + + if (!d) { + bch_err(c, "error reallocating snapshots_seen table (new size %zu)", + new_size); + return -ENOMEM; + } + + s->size = new_size; + s->d = d; + } + + s->d[s->nr++] = id; + return 0; +} + +static inline bool 
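
bch2_snapshot_is_ancestor() relies on the same ID discipline: since snapshot IDs are handed out downward from U32_MAX (which is also why snapshot_t() indexes the genradix by U32_MAX - id), walking parent pointers while id < ancestor either lands on the ancestor or overshoots it. Userspace model with a toy three-node chain:

        #include <stdbool.h>
        #include <stdint.h>
        #include <stdio.h>

        static uint32_t parent_of(uint32_t id)
        {
                switch (id) {   /* toy chain: 3 -> 10 -> U32_MAX */
                case 3:  return 10;
                case 10: return UINT32_MAX;
                default: return 0;
                }
        }

        static bool is_ancestor(uint32_t id, uint32_t ancestor)
        {
                while (id && id < ancestor)
                        id = parent_of(id);
                return id == ancestor;   /* a node is its own ancestor */
        }

        int main(void)
        {
                printf("%d %d %d\n",
                       is_ancestor(3, 10),          /* 1 */
                       is_ancestor(3, UINT32_MAX),  /* 1 */
                       is_ancestor(10, 3));         /* 0 */
                return 0;
        }
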
snapshot_list_has_id(struct snapshot_id_list *s, u32 id) +{ + unsigned i; + + for (i = 0; i < s->nr; i++) + if (id == s->d[i]) + return true; + return false; +} + +int bch2_fs_snapshots_check(struct bch_fs *); +void bch2_fs_snapshots_exit(struct bch_fs *); +int bch2_fs_snapshots_start(struct bch_fs *); + +const char *bch2_subvolume_invalid(const struct bch_fs *, struct bkey_s_c); +void bch2_subvolume_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); + +#define bch2_bkey_ops_subvolume (struct bkey_ops) { \ + .key_invalid = bch2_subvolume_invalid, \ + .val_to_text = bch2_subvolume_to_text, \ +} + +int bch2_subvolume_get(struct btree_trans *, unsigned, + bool, int, struct bch_subvolume *); +int bch2_snapshot_get_subvol(struct btree_trans *, u32, + struct bch_subvolume *); +int bch2_subvolume_get_snapshot(struct btree_trans *, u32, u32 *); + +int bch2_subvolume_delete(struct btree_trans *, u32); +int bch2_subvolume_unlink(struct btree_trans *, u32); +int bch2_subvolume_create(struct btree_trans *, u64, u32, + u32 *, u32 *, bool); + +int bch2_fs_subvolumes_init(struct bch_fs *); + +#endif /* _BCACHEFS_SUBVOLUME_H */ diff --git a/fs/bcachefs/subvolume_types.h b/fs/bcachefs/subvolume_types.h new file mode 100644 index 000000000000..9410b9587591 --- /dev/null +++ b/fs/bcachefs/subvolume_types.h @@ -0,0 +1,11 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_SUBVOLUME_TYPES_H +#define _BCACHEFS_SUBVOLUME_TYPES_H + +struct snapshot_id_list { + u32 nr; + u32 size; + u32 *d; +}; + +#endif /* _BCACHEFS_SUBVOLUME_TYPES_H */ diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index 977885166d55..88a8e54fbd7a 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -439,10 +439,8 @@ int bch2_sb_to_fs(struct bch_fs *c, struct bch_sb *src) __copy_super(&c->disk_sb, src); - if (BCH_SB_HAS_ERRORS(c->disk_sb.sb)) - set_bit(BCH_FS_ERROR, &c->flags); - if (BCH_SB_HAS_TOPOLOGY_ERRORS(c->disk_sb.sb)) - set_bit(BCH_FS_TOPOLOGY_ERROR, &c->flags); + if (BCH_SB_INITIALIZED(c->disk_sb.sb)) + set_bit(BCH_FS_INITIALIZED, &c->flags); ret = bch2_sb_replicas_to_cpu_replicas(c); if (ret) @@ -680,7 +678,7 @@ static void write_one_super(struct bch_fs *c, struct bch_dev *ca, unsigned idx) sb->offset = sb->layout.sb_offset[idx]; - SET_BCH_SB_CSUM_TYPE(sb, c->opts.metadata_checksum); + SET_BCH_SB_CSUM_TYPE(sb, bch2_csum_opt_to_type(c->opts.metadata_checksum, false)); sb->csum = csum_vstruct(c, BCH_SB_CSUM_TYPE(sb), null_nonce(), sb); @@ -807,7 +805,8 @@ int bch2_write_super(struct bch_fs *c) !can_mount_with_written || (can_mount_without_written && !can_mount_with_written), c, - "Unable to write superblock to sufficient devices")) + "Unable to write superblock to sufficient devices (from %ps)", + (void *) _RET_IP_)) ret = -1; out: /* Make new options visible after they're persistent: */ @@ -983,6 +982,7 @@ int bch2_fs_mark_dirty(struct bch_fs *c) mutex_lock(&c->sb_lock); SET_BCH_SB_CLEAN(c->disk_sb.sb, false); c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_SB_FEATURES_ALWAYS); + c->disk_sb.sb->compat[0] &= cpu_to_le64((1ULL << BCH_COMPAT_NR) - 1); ret = bch2_write_super(c); mutex_unlock(&c->sb_lock); diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 155cefd4b460..3744b6d519a7 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -39,6 +39,7 @@ #include "rebalance.h" #include "recovery.h" #include "replicas.h" +#include "subvolume.h" #include "super.h" #include "super-io.h" #include "sysfs.h" @@ -165,44 +166,6 @@ static void bch2_dev_usage_journal_reserve(struct bch_fs 
*c) &c->dev_usage_journal_res, u64s * nr); } -int bch2_congested(void *data, int bdi_bits) -{ - struct bch_fs *c = data; - struct backing_dev_info *bdi; - struct bch_dev *ca; - unsigned i; - int ret = 0; - - rcu_read_lock(); - if (bdi_bits & (1 << WB_sync_congested)) { - /* Reads - check all devices: */ - for_each_readable_member(ca, c, i) { - bdi = ca->disk_sb.bdev->bd_bdi; - - if (bdi_congested(bdi, bdi_bits)) { - ret = 1; - break; - } - } - } else { - const struct bch_devs_mask *devs = - bch2_target_to_mask(c, c->opts.foreground_target) ?: - &c->rw_devs[BCH_DATA_user]; - - for_each_member_device_rcu(ca, c, i, devs) { - bdi = ca->disk_sb.bdev->bd_bdi; - - if (bdi_congested(bdi, bdi_bits)) { - ret = 1; - break; - } - } - } - rcu_read_unlock(); - - return ret; -} - /* Filesystem RO/RW: */ /* @@ -307,7 +270,7 @@ static void bch2_writes_disabled(struct percpu_ref *writes) void bch2_fs_read_only(struct bch_fs *c) { if (!test_bit(BCH_FS_RW, &c->flags)) { - BUG_ON(c->journal.reclaim_thread); + bch2_journal_reclaim_stop(&c->journal); return; } @@ -442,13 +405,6 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early) if (ret) goto err; - /* - * We need to write out a journal entry before we start doing btree - * updates, to ensure that on unclean shutdown new journal blacklist - * entries are created: - */ - bch2_journal_meta(&c->journal); - clear_bit(BCH_FS_ALLOC_CLEAN, &c->flags); for_each_rw_member(ca, c, i) @@ -469,12 +425,6 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early) for_each_rw_member(ca, c, i) bch2_wake_allocator(ca); - ret = bch2_journal_reclaim_start(&c->journal); - if (ret) { - bch_err(c, "error starting journal reclaim: %i", ret); - return ret; - } - if (!early) { ret = bch2_fs_read_write_late(c); if (ret) @@ -512,6 +462,7 @@ static void __bch2_fs_free(struct bch_fs *c) for (i = 0; i < BCH_TIME_STAT_NR; i++) bch2_time_stats_exit(&c->times[i]); + bch2_fs_snapshots_exit(c); bch2_fs_quota_exit(c); bch2_fs_fsio_exit(c); bch2_fs_ec_exit(c); @@ -530,12 +481,12 @@ static void __bch2_fs_free(struct bch_fs *c) bch2_journal_entries_free(&c->journal_entries); percpu_free_rwsem(&c->mark_lock); - if (c->btree_iters_bufs) + if (c->btree_paths_bufs) for_each_possible_cpu(cpu) - kfree(per_cpu_ptr(c->btree_iters_bufs, cpu)->iter); + kfree(per_cpu_ptr(c->btree_paths_bufs, cpu)->path); free_percpu(c->online_reserved); - free_percpu(c->btree_iters_bufs); + free_percpu(c->btree_paths_bufs); free_percpu(c->pcpu); mempool_exit(&c->large_bkey_pool); mempool_exit(&c->btree_bounce_pool); @@ -551,8 +502,8 @@ static void __bch2_fs_free(struct bch_fs *c) destroy_workqueue(c->io_complete_wq ); if (c->copygc_wq) destroy_workqueue(c->copygc_wq); - if (c->btree_error_wq) - destroy_workqueue(c->btree_error_wq); + if (c->btree_io_complete_wq) + destroy_workqueue(c->btree_io_complete_wq); if (c->btree_update_wq) destroy_workqueue(c->btree_update_wq); @@ -586,8 +537,7 @@ void __bch2_fs_stop(struct bch_fs *c) for_each_member_device(ca, c, i) if (ca->kobj.state_in_sysfs && ca->disk_sb.bdev) - sysfs_remove_link(&part_to_dev(ca->disk_sb.bdev->bd_part)->kobj, - "bcachefs"); + sysfs_remove_link(bdev_kobj(ca->disk_sb.bdev), "bcachefs"); if (c->kobj.state_in_sysfs) kobject_del(&c->kobj); @@ -605,7 +555,6 @@ void __bch2_fs_stop(struct bch_fs *c) for_each_member_device(ca, c, i) cancel_work_sync(&ca->io_error_work); - cancel_work_sync(&c->btree_write_error_work); cancel_work_sync(&c->read_only_work); for (i = 0; i < c->sb.nr_devices; i++) @@ -639,48 +588,53 @@ void bch2_fs_stop(struct bch_fs *c) 
bch2_fs_free(c); } -static const char *bch2_fs_online(struct bch_fs *c) +static int bch2_fs_online(struct bch_fs *c) { struct bch_dev *ca; - const char *err = NULL; unsigned i; - int ret; + int ret = 0; lockdep_assert_held(&bch_fs_list_lock); - if (!list_empty(&c->list)) - return NULL; - - if (__bch2_uuid_to_fs(c->sb.uuid)) - return "filesystem UUID already open"; + if (__bch2_uuid_to_fs(c->sb.uuid)) { + bch_err(c, "filesystem UUID already open"); + return -EINVAL; + } ret = bch2_fs_chardev_init(c); - if (ret) - return "error creating character device"; + if (ret) { + bch_err(c, "error creating character device"); + return ret; + } bch2_fs_debug_init(c); - if (kobject_add(&c->kobj, NULL, "%pU", c->sb.user_uuid.b) || - kobject_add(&c->internal, &c->kobj, "internal") || - kobject_add(&c->opts_dir, &c->kobj, "options") || - kobject_add(&c->time_stats, &c->kobj, "time_stats") || - bch2_opts_create_sysfs_files(&c->opts_dir)) - return "error creating sysfs objects"; + ret = kobject_add(&c->kobj, NULL, "%pU", c->sb.user_uuid.b) ?: + kobject_add(&c->internal, &c->kobj, "internal") ?: + kobject_add(&c->opts_dir, &c->kobj, "options") ?: + kobject_add(&c->time_stats, &c->kobj, "time_stats") ?: + bch2_opts_create_sysfs_files(&c->opts_dir); + if (ret) { + bch_err(c, "error creating sysfs objects"); + return ret; + } down_write(&c->state_lock); - err = "error creating sysfs objects"; - for_each_member_device(ca, c, i) - if (bch2_dev_sysfs_online(c, ca)) { + for_each_member_device(ca, c, i) { + ret = bch2_dev_sysfs_online(c, ca); + if (ret) { + bch_err(c, "error creating sysfs objects"); percpu_ref_put(&ca->ref); goto err; } + } + BUG_ON(!list_empty(&c->list)); list_add(&c->list, &bch_fs_list); - err = NULL; err: up_write(&c->state_lock); - return err; + return ret; } static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) @@ -688,13 +642,15 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) struct bch_sb_field_members *mi; struct bch_fs *c; unsigned i, iter_size; - const char *err; + int ret = 0; pr_verbose_init(opts, ""); c = kvpmalloc(sizeof(struct bch_fs), GFP_KERNEL|__GFP_ZERO); - if (!c) + if (!c) { + c = ERR_PTR(-ENOMEM); goto out; + } __module_get(THIS_MODULE); @@ -732,10 +688,9 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) mutex_init(&c->usage_scratch_lock); mutex_init(&c->bio_bounce_pages_lock); + mutex_init(&c->snapshot_table_lock); - bio_list_init(&c->btree_write_error_list); spin_lock_init(&c->btree_write_error_lock); - INIT_WORK(&c->btree_write_error_work, bch2_btree_write_error_work); INIT_WORK(&c->journal_seq_blacklist_gc_work, bch2_blacklist_entries_gc); @@ -752,6 +707,9 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) INIT_LIST_HEAD(&c->ec_stripe_new_list); mutex_init(&c->ec_stripe_new_lock); + INIT_LIST_HEAD(&c->data_progress_list); + mutex_init(&c->data_progress_lock); + spin_lock_init(&c->ec_stripes_heap_lock); seqcount_init(&c->gc_pos_lock); @@ -773,17 +731,16 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) mutex_init(&c->sectors_available_lock); - if (percpu_init_rwsem(&c->mark_lock)) + ret = percpu_init_rwsem(&c->mark_lock); + if (ret) goto err; mutex_lock(&c->sb_lock); + ret = bch2_sb_to_fs(c, sb); + mutex_unlock(&c->sb_lock); - if (bch2_sb_to_fs(c, sb)) { - mutex_unlock(&c->sb_lock); + if (ret) goto err; - } - - mutex_unlock(&c->sb_lock); scnprintf(c->name, sizeof(c->name), "%pU", &c->sb.user_uuid); @@ -794,8 +751,11 @@ static struct 
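/*
 * [Editor's note -- distilled from the bch2_fs_online() conversion
 * above.] The series replaces the old "return an error string"
 * convention with "log at the failure site, return an errno":
 *
 *	old:	if (thing_init(c))
 *			return "error initializing thing";
 *
 *	new:	ret = thing_init(c);
 *		if (ret) {
 *			bch_err(c, "error initializing thing");
 *			return ret;
 *		}
 *
 * thing_init() is a placeholder name; the payoff is that callers get a
 * real error code to propagate instead of a string they can only print.
 */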
bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) c->block_bits = ilog2(c->opts.block_size); c->btree_foreground_merge_threshold = BTREE_FOREGROUND_MERGE_THRESHOLD(c); - if (bch2_fs_init_fault("fs_alloc")) + if (bch2_fs_init_fault("fs_alloc")) { + bch_err(c, "fs_alloc fault injected"); + ret = -EFAULT; goto err; + } iter_size = sizeof(struct sort_iter) + (btree_blocks(c) + 1) * 2 * @@ -805,7 +765,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) if (!(c->btree_update_wq = alloc_workqueue("bcachefs", WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 1)) || - !(c->btree_error_wq = alloc_workqueue("bcachefs_error", + !(c->btree_io_complete_wq = alloc_workqueue("bcachefs_btree_io", WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 1)) || !(c->copygc_wq = alloc_workqueue("bcachefs_copygc", WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 1)) || @@ -819,33 +779,44 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) offsetof(struct btree_write_bio, wbio.bio)), BIOSET_NEED_BVECS) || !(c->pcpu = alloc_percpu(struct bch_fs_pcpu)) || - !(c->btree_iters_bufs = alloc_percpu(struct btree_iter_buf)) || + !(c->btree_paths_bufs = alloc_percpu(struct btree_path_buf)) || !(c->online_reserved = alloc_percpu(u64)) || mempool_init_kvpmalloc_pool(&c->btree_bounce_pool, 1, btree_bytes(c)) || mempool_init_kmalloc_pool(&c->large_bkey_pool, 1, 2048) || !(c->unused_inode_hints = kcalloc(1U << c->inode_shard_bits, - sizeof(u64), GFP_KERNEL)) || - bch2_io_clock_init(&c->io_clock[READ]) || - bch2_io_clock_init(&c->io_clock[WRITE]) || - bch2_fs_journal_init(&c->journal) || - bch2_fs_replicas_init(c) || - bch2_fs_btree_cache_init(c) || - bch2_fs_btree_key_cache_init(&c->btree_key_cache) || - bch2_fs_btree_iter_init(c) || - bch2_fs_btree_interior_update_init(c) || - bch2_fs_io_init(c) || - bch2_fs_encryption_init(c) || - bch2_fs_compress_init(c) || - bch2_fs_ec_init(c) || - bch2_fs_fsio_init(c)) + sizeof(u64), GFP_KERNEL))) { + ret = -ENOMEM; goto err; + } + + ret = bch2_io_clock_init(&c->io_clock[READ]) ?: + bch2_io_clock_init(&c->io_clock[WRITE]) ?: + bch2_fs_journal_init(&c->journal) ?: + bch2_fs_replicas_init(c) ?: + bch2_fs_btree_cache_init(c) ?: + bch2_fs_btree_key_cache_init(&c->btree_key_cache) ?: + bch2_fs_btree_iter_init(c) ?: + bch2_fs_btree_interior_update_init(c) ?: + bch2_fs_subvolumes_init(c) ?: + bch2_fs_io_init(c) ?: + bch2_fs_encryption_init(c) ?: + bch2_fs_compress_init(c) ?: + bch2_fs_ec_init(c) ?: + bch2_fs_fsio_init(c); + if (ret) + goto err; + + if (c->opts.nochanges) + set_bit(JOURNAL_NOCHANGES, &c->journal.flags); mi = bch2_sb_get_members(c->disk_sb.sb); for (i = 0; i < c->sb.nr_devices; i++) if (bch2_dev_exists(c->disk_sb.sb, mi, i) && - bch2_dev_alloc(c, i)) + bch2_dev_alloc(c, i)) { + ret = -EEXIST; goto err; + } bch2_journal_entry_res_resize(&c->journal, &c->btree_root_journal_res, @@ -856,18 +827,17 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) (sizeof(struct jset_entry_clock) / sizeof(u64)) * 2); mutex_lock(&bch_fs_list_lock); - err = bch2_fs_online(c); + ret = bch2_fs_online(c); mutex_unlock(&bch_fs_list_lock); - if (err) { - bch_err(c, "bch2_fs_online() error: %s", err); + + if (ret) goto err; - } out: - pr_verbose_init(opts, "ret %i", c ? 
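/*
 * [Editor's note -- illustrative sketch.] The init chain above uses the
 * GNU C "elvis" operator: `x ?: y` yields x when x is nonzero and
 * evaluates y only when x is zero, so a chain of 0-on-success
 * initializers stops at the first failure with its errno in `ret`:
 */
static int demo_init_a(struct bch_fs *), demo_init_b(struct bch_fs *),
	   demo_init_c(struct bch_fs *);	/* placeholder prototypes */

static int demo_init(struct bch_fs *c)
{
	return demo_init_a(c) ?:	/* runs first */
	       demo_init_b(c) ?:	/* only if a returned 0 */
	       demo_init_c(c);		/* only if a and b returned 0 */
}
/*
 * Plain allocations, by contrast, stay grouped in a single `||` chain,
 * since every one of them maps to the same -ENOMEM.
 */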
0 : -ENOMEM); + pr_verbose_init(opts, "ret %i", PTR_ERR_OR_ZERO(c)); return c; err: bch2_fs_free(c); - c = NULL; + c = ERR_PTR(ret); goto out; } @@ -907,7 +877,6 @@ static void print_mount_opts(struct bch_fs *c) int bch2_fs_start(struct bch_fs *c) { - const char *err = "cannot allocate memory"; struct bch_sb_field_members *mi; struct bch_dev *ca; time64_t now = ktime_get_real_seconds(); @@ -943,10 +912,11 @@ int bch2_fs_start(struct bch_fs *c) if (ret) goto err; - err = "dynamic fault"; ret = -EINVAL; - if (bch2_fs_init_fault("fs_start")) + if (bch2_fs_init_fault("fs_start")) { + bch_err(c, "fs_start fault injected"); goto err; + } set_bit(BCH_FS_STARTED, &c->flags); @@ -967,7 +937,6 @@ int bch2_fs_start(struct bch_fs *c) if (c->opts.read_only || c->opts.nochanges) { bch2_fs_read_only(c); } else { - err = "error going read write"; ret = !test_bit(BCH_FS_RW, &c->flags) ? bch2_fs_read_write(c) : bch2_fs_read_write_late(c); @@ -985,25 +954,22 @@ err: case BCH_FSCK_ERRORS_NOT_FIXED: bch_err(c, "filesystem contains errors: please report this to the developers"); pr_cont("mount with -o fix_errors to repair\n"); - err = "fsck error"; break; case BCH_FSCK_REPAIR_UNIMPLEMENTED: bch_err(c, "filesystem contains errors: please report this to the developers"); pr_cont("repair unimplemented: inform the developers so that it can be added\n"); - err = "fsck error"; break; case BCH_FSCK_REPAIR_IMPOSSIBLE: bch_err(c, "filesystem contains errors, but repair impossible"); - err = "fsck error"; break; case BCH_FSCK_UNKNOWN_VERSION: - err = "unknown metadata version";; + bch_err(c, "unknown metadata version"); break; case -ENOMEM: - err = "cannot allocate memory"; + bch_err(c, "cannot allocate memory"); break; case -EIO: - err = "IO error"; + bch_err(c, "IO error"); break; } @@ -1065,8 +1031,7 @@ static void bch2_dev_free(struct bch_dev *ca) if (ca->kobj.state_in_sysfs && ca->disk_sb.bdev) - sysfs_remove_link(&part_to_dev(ca->disk_sb.bdev->bd_part)->kobj, - "bcachefs"); + sysfs_remove_link(bdev_kobj(ca->disk_sb.bdev), "bcachefs"); if (ca->kobj.state_in_sysfs) kobject_del(&ca->kobj); @@ -1102,10 +1067,7 @@ static void __bch2_dev_offline(struct bch_fs *c, struct bch_dev *ca) wait_for_completion(&ca->io_ref_completion); if (ca->kobj.state_in_sysfs) { - struct kobject *block = - &part_to_dev(ca->disk_sb.bdev->bd_part)->kobj; - - sysfs_remove_link(block, "bcachefs"); + sysfs_remove_link(bdev_kobj(ca->disk_sb.bdev), "bcachefs"); sysfs_remove_link(&ca->kobj, "block"); } @@ -1142,12 +1104,12 @@ static int bch2_dev_sysfs_online(struct bch_fs *c, struct bch_dev *ca) } if (ca->disk_sb.bdev) { - struct kobject *block = - &part_to_dev(ca->disk_sb.bdev->bd_part)->kobj; + struct kobject *block = bdev_kobj(ca->disk_sb.bdev); ret = sysfs_create_link(block, &ca->kobj, "bcachefs"); if (ret) return ret; + ret = sysfs_create_link(&ca->kobj, block, "block"); if (ret) return ret; @@ -1427,7 +1389,7 @@ static void __bch2_dev_read_only(struct bch_fs *c, struct bch_dev *ca) bch2_copygc_start(c); } -static const char *__bch2_dev_read_write(struct bch_fs *c, struct bch_dev *ca) +static int __bch2_dev_read_write(struct bch_fs *c, struct bch_dev *ca) { lockdep_assert_held(&c->state_lock); @@ -1436,10 +1398,7 @@ static const char *__bch2_dev_read_write(struct bch_fs *c, struct bch_dev *ca) bch2_dev_allocator_add(c, ca); bch2_recalc_capacity(c); - if (bch2_dev_allocator_start(ca)) - return "error starting allocator thread"; - - return NULL; + return bch2_dev_allocator_start(ca); } int __bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca, 
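/*
 * [Editor's note -- illustrative sketch.] bch2_fs_alloc() now reports
 * *why* it failed: it returns ERR_PTR(ret) instead of NULL, and the
 * verbose-init message uses PTR_ERR_OR_ZERO() to print 0 or the errno.
 * Caller-side shape (demo_open() is a placeholder):
 */
static struct bch_fs *demo_open(struct bch_sb *sb, struct bch_opts opts)
{
	struct bch_fs *c = bch2_fs_alloc(sb, opts);

	if (IS_ERR(c))
		return c;	/* PTR_ERR(c) holds the errno */

	/* ... attach devices, start the filesystem ... */
	return c;
}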
@@ -1465,9 +1424,8 @@ int __bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca, bch2_write_super(c); mutex_unlock(&c->sb_lock); - if (new_state == BCH_MEMBER_STATE_rw && - __bch2_dev_read_write(c, ca)) - ret = -ENOMEM; + if (new_state == BCH_MEMBER_STATE_rw) + ret = __bch2_dev_read_write(c, ca); rebalance_wakeup(c); @@ -1497,15 +1455,18 @@ static int bch2_dev_remove_alloc(struct bch_fs *c, struct bch_dev *ca) bch2_trans_init(&trans, c, 0, 0); for (i = 0; i < ca->mi.nbuckets; i++) { - ret = bch2_btree_key_cache_flush(&trans, - BTREE_ID_alloc, POS(ca->dev_idx, i)); + ret = lockrestart_do(&trans, + bch2_btree_key_cache_flush(&trans, + BTREE_ID_alloc, POS(ca->dev_idx, i))); if (ret) break; } bch2_trans_exit(&trans); - if (ret) + if (ret) { + bch_err(c, "error %i removing dev alloc info", ret); return ret; + } return bch2_btree_delete_range(c, BTREE_ID_alloc, POS(ca->dev_idx, 0), @@ -1627,6 +1588,8 @@ int bch2_dev_add(struct bch_fs *c, const char *path) struct bch_dev *ca = NULL; struct bch_sb_field_members *mi; struct bch_member dev_mi; + struct bucket_array *buckets; + struct bucket *g; unsigned dev_idx, nr_devices, u64s; int ret; @@ -1730,14 +1693,24 @@ have_slot: bch2_dev_usage_journal_reserve(c); + /* + * Clear marks before marking transactionally in the btree, so that + * per-device accounting gets done correctly: + */ + down_read(&ca->bucket_lock); + buckets = bucket_array(ca); + for_each_bucket(g, buckets) + atomic64_set(&g->_mark.v, 0); + up_read(&ca->bucket_lock); + err = "error marking superblock"; ret = bch2_trans_mark_dev_sb(c, ca); if (ret) goto err_late; if (ca->mi.state == BCH_MEMBER_STATE_rw) { - err = __bch2_dev_read_write(c, ca); - if (err) + ret = __bch2_dev_read_write(c, ca); + if (ret) goto err_late; } @@ -1781,24 +1754,27 @@ int bch2_dev_online(struct bch_fs *c, const char *path) dev_idx = sb.sb->dev_idx; err = bch2_dev_in_fs(c->disk_sb.sb, sb.sb); - if (err) + if (err) { + bch_err(c, "error bringing %s online: %s", path, err); goto err; + } - if (bch2_dev_attach_bdev(c, &sb)) { - err = "bch2_dev_attach_bdev() error"; + ret = bch2_dev_attach_bdev(c, &sb); + if (ret) goto err; - } ca = bch_dev_locked(c, dev_idx); - if (bch2_trans_mark_dev_sb(c, ca)) { - err = "bch2_trans_mark_dev_sb() error"; + ret = bch2_trans_mark_dev_sb(c, ca); + if (ret) { + bch_err(c, "error bringing %s online: error %i from bch2_trans_mark_dev_sb", + path, ret); goto err; } if (ca->mi.state == BCH_MEMBER_STATE_rw) { - err = __bch2_dev_read_write(c, ca); - if (err) + ret = __bch2_dev_read_write(c, ca); + if (ret) goto err; } @@ -1816,7 +1792,6 @@ int bch2_dev_online(struct bch_fs *c, const char *path) err: up_write(&c->state_lock); bch2_free_super(&sb); - bch_err(c, "error bringing %s online: %s", path, err); return -EINVAL; } @@ -1890,20 +1865,23 @@ err: /* return with ref on ca->ref: */ struct bch_dev *bch2_dev_lookup(struct bch_fs *c, const char *path) { - struct block_device *bdev = lookup_bdev(path); struct bch_dev *ca; + dev_t dev; unsigned i; + int ret; - if (IS_ERR(bdev)) - return ERR_CAST(bdev); + ret = lookup_bdev(path, &dev); + if (ret) + return ERR_PTR(ret); - for_each_member_device(ca, c, i) - if (ca->disk_sb.bdev == bdev) + rcu_read_lock(); + for_each_member_device_rcu(ca, c, i, NULL) + if (ca->disk_sb.bdev->bd_dev == dev) goto found; - ca = ERR_PTR(-ENOENT); found: - bdput(bdev); + rcu_read_unlock(); + return ca; } @@ -1917,7 +1895,7 @@ struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices, struct bch_sb_field_members *mi; unsigned i, best_sb = 0; const char *err; 
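/*
 * [Editor's note.] The bch2_dev_lookup() hunk above adapts to the block
 * layer change (circa v5.11) where lookup_bdev() stopped returning a
 * referenced struct block_device * and now just resolves a path to a
 * dev_t:
 *
 *	old:	struct block_device *bdev = lookup_bdev(path);
 *		... compare bdev pointers, then bdput(bdev);
 *
 *	new:	dev_t dev;
 *		int ret = lookup_bdev(path, &dev);
 *		if (ret)
 *			return ERR_PTR(ret);
 *		... compare ca->disk_sb.bdev->bd_dev == dev;
 *
 * Since no block_device reference is taken anymore, the member-device
 * walk happens under rcu_read_lock() instead.
 */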
- int ret = -ENOMEM; + int ret = 0; pr_verbose_init(opts, ""); @@ -1932,8 +1910,10 @@ struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices, } sb = kcalloc(nr_devices, sizeof(*sb), GFP_KERNEL); - if (!sb) + if (!sb) { + ret = -ENOMEM; goto err; + } for (i = 0; i < nr_devices; i++) { ret = bch2_read_super(devices[i], &opts, &sb[i]); @@ -1970,18 +1950,20 @@ struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices, i++; } - ret = -ENOMEM; c = bch2_fs_alloc(sb[best_sb].sb, opts); - if (!c) + if (IS_ERR(c)) { + ret = PTR_ERR(c); goto err; + } - err = "bch2_dev_online() error"; down_write(&c->state_lock); - for (i = 0; i < nr_devices; i++) - if (bch2_dev_attach_bdev(c, &sb[i])) { + for (i = 0; i < nr_devices; i++) { + ret = bch2_dev_attach_bdev(c, &sb[i]); + if (ret) { up_write(&c->state_lock); - goto err_print; + goto err; } + } up_write(&c->state_lock); err = "insufficient devices"; @@ -2004,10 +1986,11 @@ err_print: devices[0], err); ret = -EINVAL; err: - if (c) + if (!IS_ERR_OR_NULL(c)) bch2_fs_stop(c); - for (i = 0; i < nr_devices; i++) - bch2_free_super(&sb[i]); + if (sb) + for (i = 0; i < nr_devices; i++) + bch2_free_super(&sb[i]); c = ERR_PTR(ret); goto out; } @@ -2033,12 +2016,12 @@ static const char *__bch2_fs_open_incremental(struct bch_sb_handle *sb, if (err) goto err; } else { + allocated_fs = true; c = bch2_fs_alloc(sb->sb, opts); - err = "cannot allocate memory"; - if (!c) - goto err; - allocated_fs = true; + err = "bch2_fs_alloc() error"; + if (IS_ERR(c)) + goto err; } err = "bch2_dev_online() error"; @@ -2064,7 +2047,7 @@ static const char *__bch2_fs_open_incremental(struct bch_sb_handle *sb, err: mutex_unlock(&bch_fs_list_lock); - if (allocated_fs) + if (allocated_fs && !IS_ERR(c)) bch2_fs_stop(c); else if (c) closure_put(&c->cl); diff --git a/fs/bcachefs/super.h b/fs/bcachefs/super.h index 5cee064995af..739e8fd18176 100644 --- a/fs/bcachefs/super.h +++ b/fs/bcachefs/super.h @@ -196,7 +196,6 @@ static inline struct bch_devs_mask bch2_online_devs(struct bch_fs *c) struct bch_fs *bch2_dev_to_fs(dev_t); struct bch_fs *bch2_uuid_to_fs(uuid_le); -int bch2_congested(void *, int); bool bch2_dev_state_allowed(struct bch_fs *, struct bch_dev *, enum bch_member_state, int); diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index 84a7acb04d01..864be8601868 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -155,11 +155,6 @@ read_attribute(congested); read_attribute(btree_avg_write_size); -read_attribute(bucket_quantiles_last_read); -read_attribute(bucket_quantiles_last_write); -read_attribute(bucket_quantiles_fragmentation); -read_attribute(bucket_quantiles_oldest_gen); - read_attribute(reserve_stats); read_attribute(btree_cache_size); read_attribute(compression_stats); @@ -171,6 +166,7 @@ read_attribute(btree_cache); read_attribute(btree_key_cache); read_attribute(btree_transactions); read_attribute(stripes_heap); +read_attribute(open_buckets); read_attribute(internal_uuid); @@ -202,6 +198,8 @@ read_attribute(new_stripes); read_attribute(io_timers_read); read_attribute(io_timers_write); +read_attribute(data_op_data_progress); + #ifdef CONFIG_BCACHEFS_TESTS write_attribute(perf_test); #endif /* CONFIG_BCACHEFS_TESTS */ @@ -238,6 +236,37 @@ static size_t bch2_btree_avg_write_size(struct bch_fs *c) return nr ? 
div64_u64(sectors, nr) : 0; } +static long stats_to_text(struct printbuf *out, struct bch_fs *c, + struct bch_move_stats *stats) +{ + pr_buf(out, "%s: data type %s btree_id %s position: ", + stats->name, + bch2_data_types[stats->data_type], + bch2_btree_ids[stats->btree_id]); + bch2_bpos_to_text(out, stats->pos); + pr_buf(out, "%s", "\n"); + + return 0; +} + +static long data_progress_to_text(struct printbuf *out, struct bch_fs *c) +{ + long ret = 0; + struct bch_move_stats *iter; + + mutex_lock(&c->data_progress_lock); + + if (list_empty(&c->data_progress_list)) + pr_buf(out, "%s", "no progress to report\n"); + else + list_for_each_entry(iter, &c->data_progress_list, list) { + stats_to_text(out, c, iter); + } + + mutex_unlock(&c->data_progress_lock); + return ret; +} + static int fs_alloc_debug_to_text(struct printbuf *out, struct bch_fs *c) { struct bch_fs_usage_online *fs_usage = bch2_fs_usage_read(c); @@ -256,7 +285,7 @@ static int fs_alloc_debug_to_text(struct printbuf *out, struct bch_fs *c) static int bch2_compression_stats_to_text(struct printbuf *out, struct bch_fs *c) { struct btree_trans trans; - struct btree_iter *iter; + struct btree_iter iter; struct bkey_s_c k; u64 nr_uncompressed_extents = 0, uncompressed_sectors = 0, nr_compressed_extents = 0, @@ -291,8 +320,9 @@ static int bch2_compression_stats_to_text(struct printbuf *out, struct bch_fs *c break; } } + bch2_trans_iter_exit(&trans, &iter); - ret = bch2_trans_exit(&trans) ?: ret; + bch2_trans_exit(&trans); if (ret) return ret; @@ -409,6 +439,11 @@ SHOW(bch2_fs) return out.pos - buf; } + if (attr == &sysfs_open_buckets) { + bch2_open_buckets_to_text(&out, c); + return out.pos - buf; + } + if (attr == &sysfs_compression_stats) { bch2_compression_stats_to_text(&out, c); return out.pos - buf; @@ -428,6 +463,11 @@ SHOW(bch2_fs) return out.pos - buf; } + if (attr == &sysfs_data_op_data_progress) { + data_progress_to_text(&out, c); + return out.pos - buf; + } + return 0; } @@ -567,6 +607,7 @@ struct attribute *bch2_fs_internal_files[] = { &sysfs_btree_key_cache, &sysfs_btree_transactions, &sysfs_stripes_heap, + &sysfs_open_buckets, &sysfs_read_realloc_races, &sysfs_extent_migrate_done, @@ -589,6 +630,8 @@ struct attribute *bch2_fs_internal_files[] = { &sysfs_io_timers_read, &sysfs_io_timers_write, + &sysfs_data_op_data_progress, + &sysfs_internal_uuid, NULL }; @@ -703,76 +746,6 @@ struct attribute *bch2_fs_time_stats_files[] = { NULL }; -typedef unsigned (bucket_map_fn)(struct bch_fs *, struct bch_dev *, - size_t, void *); - -static unsigned bucket_last_io_fn(struct bch_fs *c, struct bch_dev *ca, - size_t b, void *private) -{ - int rw = (private ? 
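/*
 * [Editor's note -- illustrative sketch.] data_progress_to_text() above
 * just walks a mutex-protected list, so a long-running data job shows
 * up in sysfs by linking its bch_move_stats in for the duration. The
 * helpers below sketch the registration side; the names are assumptions
 * (the real registration lives with the move code):
 */
static void progress_list_add(struct bch_fs *c, struct bch_move_stats *stats)
{
	mutex_lock(&c->data_progress_lock);
	list_add(&stats->list, &c->data_progress_list);
	mutex_unlock(&c->data_progress_lock);
}

static void progress_list_del(struct bch_fs *c, struct bch_move_stats *stats)
{
	mutex_lock(&c->data_progress_lock);
	list_del(&stats->list);
	mutex_unlock(&c->data_progress_lock);
}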
1 : 0); - - return atomic64_read(&c->io_clock[rw].now) - bucket(ca, b)->io_time[rw]; -} - -static unsigned bucket_sectors_used_fn(struct bch_fs *c, struct bch_dev *ca, - size_t b, void *private) -{ - struct bucket *g = bucket(ca, b); - return bucket_sectors_used(g->mark); -} - -static unsigned bucket_oldest_gen_fn(struct bch_fs *c, struct bch_dev *ca, - size_t b, void *private) -{ - return bucket_gc_gen(bucket(ca, b)); -} - -static int unsigned_cmp(const void *_l, const void *_r) -{ - const unsigned *l = _l; - const unsigned *r = _r; - - return cmp_int(*l, *r); -} - -static int quantiles_to_text(struct printbuf *out, - struct bch_fs *c, struct bch_dev *ca, - bucket_map_fn *fn, void *private) -{ - size_t i, n; - /* Compute 31 quantiles */ - unsigned q[31], *p; - - down_read(&ca->bucket_lock); - n = ca->mi.nbuckets; - - p = vzalloc(n * sizeof(unsigned)); - if (!p) { - up_read(&ca->bucket_lock); - return -ENOMEM; - } - - for (i = ca->mi.first_bucket; i < n; i++) - p[i] = fn(c, ca, i, private); - - sort(p, n, sizeof(unsigned), unsigned_cmp, NULL); - up_read(&ca->bucket_lock); - - while (n && - !p[n - 1]) - --n; - - for (i = 0; i < ARRAY_SIZE(q); i++) - q[i] = p[n * (i + 1) / (ARRAY_SIZE(q) + 1)]; - - vfree(p); - - for (i = 0; i < ARRAY_SIZE(q); i++) - pr_buf(out, "%u ", q[i]); - pr_buf(out, "\n"); - return 0; -} - static void reserve_stats_to_text(struct printbuf *out, struct bch_dev *ca) { enum alloc_reserve i; @@ -934,15 +907,6 @@ SHOW(bch2_dev) clamp(atomic_read(&ca->congested), 0, CONGESTED_MAX) * 100 / CONGESTED_MAX); - if (attr == &sysfs_bucket_quantiles_last_read) - return quantiles_to_text(&out, c, ca, bucket_last_io_fn, (void *) 0) ?: out.pos - buf; - if (attr == &sysfs_bucket_quantiles_last_write) - return quantiles_to_text(&out, c, ca, bucket_last_io_fn, (void *) 1) ?: out.pos - buf; - if (attr == &sysfs_bucket_quantiles_fragmentation) - return quantiles_to_text(&out, c, ca, bucket_sectors_used_fn, NULL) ?: out.pos - buf; - if (attr == &sysfs_bucket_quantiles_oldest_gen) - return quantiles_to_text(&out, c, ca, bucket_oldest_gen_fn, NULL) ?: out.pos - buf; - if (attr == &sysfs_reserve_stats) { reserve_stats_to_text(&out, ca); return out.pos - buf; @@ -1034,12 +998,6 @@ struct attribute *bch2_dev_files[] = { &sysfs_io_latency_stats_write, &sysfs_congested, - /* alloc info - other stats: */ - &sysfs_bucket_quantiles_last_read, - &sysfs_bucket_quantiles_last_write, - &sysfs_bucket_quantiles_fragmentation, - &sysfs_bucket_quantiles_oldest_gen, - &sysfs_reserve_stats, /* debug: */ diff --git a/fs/bcachefs/tests.c b/fs/bcachefs/tests.c index 63f4a83ad1de..d5a74f4db64d 100644 --- a/fs/bcachefs/tests.c +++ b/fs/bcachefs/tests.c @@ -29,45 +29,44 @@ static void delete_test_keys(struct bch_fs *c) static int test_delete(struct bch_fs *c, u64 nr) { struct btree_trans trans; - struct btree_iter *iter; + struct btree_iter iter; struct bkey_i_cookie k; int ret; bkey_cookie_init(&k.k_i); + k.k.p.snapshot = U32_MAX; bch2_trans_init(&trans, c, 0, 0); - - iter = bch2_trans_get_iter(&trans, BTREE_ID_xattrs, k.k.p, - BTREE_ITER_INTENT); - - ret = bch2_btree_iter_traverse(iter); - if (ret) { - bch_err(c, "lookup error in test_delete: %i", ret); - goto err; - } + bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs, k.k.p, + BTREE_ITER_INTENT); ret = __bch2_trans_do(&trans, NULL, NULL, 0, - bch2_trans_update(&trans, iter, &k.k_i, 0)); + bch2_btree_iter_traverse(&iter) ?: + bch2_trans_update(&trans, &iter, &k.k_i, 0)); if (ret) { bch_err(c, "update error in test_delete: %i", ret); goto err; } 
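/*
 * [Editor's note.] The tests.c churn that follows is all one API
 * migration: btree iterators used to be allocated out of the
 * transaction and handed back as pointers; they are now plain stack
 * objects the caller initializes and tears down. Distilled:
 *
 *	old:	struct btree_iter *iter;
 *		iter = bch2_trans_get_iter(&trans, id, pos, flags);
 *		...
 *		bch2_trans_iter_put(&trans, iter);
 *
 *	new:	struct btree_iter iter;
 *		bch2_trans_iter_init(&trans, &iter, id, pos, flags);
 *		...
 *		bch2_trans_iter_exit(&trans, &iter);
 *
 * Note also that every test key now sets k.k.p.snapshot = U32_MAX:
 * with subvolumes/snapshots in this series, a key's position carries a
 * snapshot field that must be populated.
 */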
pr_info("deleting once"); - ret = bch2_btree_delete_at(&trans, iter, 0); + ret = __bch2_trans_do(&trans, NULL, NULL, 0, + bch2_btree_iter_traverse(&iter) ?: + bch2_btree_delete_at(&trans, &iter, 0)); if (ret) { bch_err(c, "delete error (first) in test_delete: %i", ret); goto err; } pr_info("deleting twice"); - ret = bch2_btree_delete_at(&trans, iter, 0); + ret = __bch2_trans_do(&trans, NULL, NULL, 0, + bch2_btree_iter_traverse(&iter) ?: + bch2_btree_delete_at(&trans, &iter, 0)); if (ret) { bch_err(c, "delete error (second) in test_delete: %i", ret); goto err; } err: - bch2_trans_iter_put(&trans, iter); + bch2_trans_iter_exit(&trans, &iter); bch2_trans_exit(&trans); return ret; } @@ -75,39 +74,38 @@ err: static int test_delete_written(struct bch_fs *c, u64 nr) { struct btree_trans trans; - struct btree_iter *iter; + struct btree_iter iter; struct bkey_i_cookie k; int ret; bkey_cookie_init(&k.k_i); + k.k.p.snapshot = U32_MAX; bch2_trans_init(&trans, c, 0, 0); - iter = bch2_trans_get_iter(&trans, BTREE_ID_xattrs, k.k.p, - BTREE_ITER_INTENT); - - ret = bch2_btree_iter_traverse(iter); - if (ret) { - bch_err(c, "lookup error in test_delete_written: %i", ret); - goto err; - } + bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs, k.k.p, + BTREE_ITER_INTENT); ret = __bch2_trans_do(&trans, NULL, NULL, 0, - bch2_trans_update(&trans, iter, &k.k_i, 0)); + bch2_btree_iter_traverse(&iter) ?: + bch2_trans_update(&trans, &iter, &k.k_i, 0)); if (ret) { bch_err(c, "update error in test_delete_written: %i", ret); goto err; } + bch2_trans_unlock(&trans); bch2_journal_flush_all_pins(&c->journal); - ret = bch2_btree_delete_at(&trans, iter, 0); + ret = __bch2_trans_do(&trans, NULL, NULL, 0, + bch2_btree_iter_traverse(&iter) ?: + bch2_btree_delete_at(&trans, &iter, 0)); if (ret) { bch_err(c, "delete error in test_delete_written: %i", ret); goto err; } err: - bch2_trans_iter_put(&trans, iter); + bch2_trans_iter_exit(&trans, &iter); bch2_trans_exit(&trans); return ret; } @@ -115,7 +113,7 @@ err: static int test_iterate(struct bch_fs *c, u64 nr) { struct btree_trans trans; - struct btree_iter *iter = NULL; + struct btree_iter iter = { NULL }; struct bkey_s_c k; u64 i; int ret = 0; @@ -131,6 +129,7 @@ static int test_iterate(struct bch_fs *c, u64 nr) bkey_cookie_init(&k.k_i); k.k.p.offset = i; + k.k.p.snapshot = U32_MAX; ret = bch2_btree_insert(c, BTREE_ID_xattrs, &k.k_i, NULL, NULL, 0); @@ -156,12 +155,12 @@ static int test_iterate(struct bch_fs *c, u64 nr) pr_info("iterating backwards"); - while (!IS_ERR_OR_NULL((k = bch2_btree_iter_prev(iter)).k)) + while (!IS_ERR_OR_NULL((k = bch2_btree_iter_prev(&iter)).k)) BUG_ON(k.k->p.offset != --i); BUG_ON(i); err: - bch2_trans_iter_put(&trans, iter); + bch2_trans_iter_exit(&trans, &iter); bch2_trans_exit(&trans); return ret; } @@ -169,7 +168,7 @@ err: static int test_iterate_extents(struct bch_fs *c, u64 nr) { struct btree_trans trans; - struct btree_iter *iter = NULL; + struct btree_iter iter = { NULL }; struct bkey_s_c k; u64 i; int ret = 0; @@ -185,6 +184,7 @@ static int test_iterate_extents(struct bch_fs *c, u64 nr) bkey_cookie_init(&k.k_i); k.k.p.offset = i + 8; + k.k.p.snapshot = U32_MAX; k.k.size = 8; ret = bch2_btree_insert(c, BTREE_ID_extents, &k.k_i, @@ -209,14 +209,14 @@ static int test_iterate_extents(struct bch_fs *c, u64 nr) pr_info("iterating backwards"); - while (!IS_ERR_OR_NULL((k = bch2_btree_iter_prev(iter)).k)) { + while (!IS_ERR_OR_NULL((k = bch2_btree_iter_prev(&iter)).k)) { BUG_ON(k.k->p.offset != i); i = bkey_start_offset(k.k); } BUG_ON(i); err: - 
bch2_trans_iter_put(&trans, iter); + bch2_trans_iter_exit(&trans, &iter); bch2_trans_exit(&trans); return ret; } @@ -224,7 +224,7 @@ err: static int test_iterate_slots(struct bch_fs *c, u64 nr) { struct btree_trans trans; - struct btree_iter *iter; + struct btree_iter iter = { NULL }; struct bkey_s_c k; u64 i; int ret = 0; @@ -240,6 +240,7 @@ static int test_iterate_slots(struct bch_fs *c, u64 nr) bkey_cookie_init(&k.k_i); k.k.p.offset = i * 2; + k.k.p.snapshot = U32_MAX; ret = bch2_btree_insert(c, BTREE_ID_xattrs, &k.k_i, NULL, NULL, 0); @@ -261,7 +262,7 @@ static int test_iterate_slots(struct bch_fs *c, u64 nr) BUG_ON(k.k->p.offset != i); i += 2; } - bch2_trans_iter_put(&trans, iter); + bch2_trans_iter_exit(&trans, &iter); BUG_ON(i != nr * 2); @@ -278,7 +279,7 @@ static int test_iterate_slots(struct bch_fs *c, u64 nr) if (i == nr * 2) break; } - bch2_trans_iter_put(&trans, iter); + bch2_trans_iter_exit(&trans, &iter); err: bch2_trans_exit(&trans); return ret; @@ -287,7 +288,7 @@ err: static int test_iterate_slots_extents(struct bch_fs *c, u64 nr) { struct btree_trans trans; - struct btree_iter *iter; + struct btree_iter iter = { NULL }; struct bkey_s_c k; u64 i; int ret = 0; @@ -303,6 +304,7 @@ static int test_iterate_slots_extents(struct bch_fs *c, u64 nr) bkey_cookie_init(&k.k_i); k.k.p.offset = i + 16; + k.k.p.snapshot = U32_MAX; k.k.size = 8; ret = bch2_btree_insert(c, BTREE_ID_extents, &k.k_i, @@ -323,7 +325,7 @@ static int test_iterate_slots_extents(struct bch_fs *c, u64 nr) BUG_ON(k.k->size != 8); i += 16; } - bch2_trans_iter_put(&trans, iter); + bch2_trans_iter_exit(&trans, &iter); BUG_ON(i != nr); @@ -342,7 +344,7 @@ static int test_iterate_slots_extents(struct bch_fs *c, u64 nr) if (i == nr) break; } - bch2_trans_iter_put(&trans, iter); + bch2_trans_iter_exit(&trans, &iter); err: bch2_trans_exit(&trans); return 0; @@ -355,21 +357,19 @@ err: static int test_peek_end(struct bch_fs *c, u64 nr) { struct btree_trans trans; - struct btree_iter *iter; + struct btree_iter iter; struct bkey_s_c k; bch2_trans_init(&trans, c, 0, 0); + bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs, POS_MIN, 0); - iter = bch2_trans_get_iter(&trans, BTREE_ID_xattrs, POS_MIN, 0); - - k = bch2_btree_iter_peek(iter); + k = bch2_btree_iter_peek(&iter); BUG_ON(k.k); - k = bch2_btree_iter_peek(iter); + k = bch2_btree_iter_peek(&iter); BUG_ON(k.k); - bch2_trans_iter_put(&trans, iter); - + bch2_trans_iter_exit(&trans, &iter); bch2_trans_exit(&trans); return 0; } @@ -377,21 +377,19 @@ static int test_peek_end(struct bch_fs *c, u64 nr) static int test_peek_end_extents(struct bch_fs *c, u64 nr) { struct btree_trans trans; - struct btree_iter *iter; + struct btree_iter iter; struct bkey_s_c k; bch2_trans_init(&trans, c, 0, 0); + bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents, POS_MIN, 0); - iter = bch2_trans_get_iter(&trans, BTREE_ID_extents, POS_MIN, 0); - - k = bch2_btree_iter_peek(iter); + k = bch2_btree_iter_peek(&iter); BUG_ON(k.k); - k = bch2_btree_iter_peek(iter); + k = bch2_btree_iter_peek(&iter); BUG_ON(k.k); - bch2_trans_iter_put(&trans, iter); - + bch2_trans_iter_exit(&trans, &iter); bch2_trans_exit(&trans); return 0; } @@ -410,6 +408,7 @@ static int insert_test_extent(struct bch_fs *c, bkey_cookie_init(&k.k_i); k.k_i.k.p.offset = end; + k.k_i.k.p.snapshot = U32_MAX; k.k_i.k.size = end - start; k.k_i.k.version.lo = test_version++; @@ -536,18 +535,18 @@ static int rand_insert_multi(struct bch_fs *c, u64 nr) static int rand_lookup(struct bch_fs *c, u64 nr) { struct btree_trans trans; - struct 
btree_iter *iter; + struct btree_iter iter; struct bkey_s_c k; int ret = 0; u64 i; bch2_trans_init(&trans, c, 0, 0); - iter = bch2_trans_get_iter(&trans, BTREE_ID_xattrs, POS_MIN, 0); + bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs, POS_MIN, 0); for (i = 0; i < nr; i++) { - bch2_btree_iter_set_pos(iter, POS(0, test_rand())); + bch2_btree_iter_set_pos(&iter, POS(0, test_rand())); - k = bch2_btree_iter_peek(iter); + k = bch2_btree_iter_peek(&iter); ret = bkey_err(k); if (ret) { bch_err(c, "error in rand_lookup: %i", ret); @@ -555,62 +554,73 @@ static int rand_lookup(struct bch_fs *c, u64 nr) } } - bch2_trans_iter_put(&trans, iter); + bch2_trans_iter_exit(&trans, &iter); bch2_trans_exit(&trans); return ret; } +static int rand_mixed_trans(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_i_cookie *cookie, + u64 i, u64 pos) +{ + struct bkey_s_c k; + int ret; + + bch2_btree_iter_set_pos(iter, POS(0, pos)); + + k = bch2_btree_iter_peek(iter); + ret = bkey_err(k); + if (ret && ret != -EINTR) + bch_err(trans->c, "lookup error in rand_mixed: %i", ret); + if (ret) + return ret; + + if (!(i & 3) && k.k) { + bkey_cookie_init(&cookie->k_i); + cookie->k.p = iter->pos; + bch2_trans_update(trans, iter, &cookie->k_i, 0); + } + + return 0; +} + static int rand_mixed(struct bch_fs *c, u64 nr) { struct btree_trans trans; - struct btree_iter *iter; - struct bkey_s_c k; + struct btree_iter iter; + struct bkey_i_cookie cookie; int ret = 0; - u64 i; + u64 i, rand; bch2_trans_init(&trans, c, 0, 0); - iter = bch2_trans_get_iter(&trans, BTREE_ID_xattrs, POS_MIN, 0); + bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs, POS_MIN, 0); for (i = 0; i < nr; i++) { - bch2_btree_iter_set_pos(iter, POS(0, test_rand())); - - k = bch2_btree_iter_peek(iter); - ret = bkey_err(k); + rand = test_rand(); + ret = __bch2_trans_do(&trans, NULL, NULL, 0, + rand_mixed_trans(&trans, &iter, &cookie, i, rand)); if (ret) { - bch_err(c, "lookup error in rand_mixed: %i", ret); + bch_err(c, "update error in rand_mixed: %i", ret); break; } - - if (!(i & 3) && k.k) { - struct bkey_i_cookie k; - - bkey_cookie_init(&k.k_i); - k.k.p = iter->pos; - - ret = __bch2_trans_do(&trans, NULL, NULL, 0, - bch2_trans_update(&trans, iter, &k.k_i, 0)); - if (ret) { - bch_err(c, "update error in rand_mixed: %i", ret); - break; - } - } } - bch2_trans_iter_put(&trans, iter); + bch2_trans_iter_exit(&trans, &iter); bch2_trans_exit(&trans); return ret; } static int __do_delete(struct btree_trans *trans, struct bpos pos) { - struct btree_iter *iter; + struct btree_iter iter; struct bkey_i delete; struct bkey_s_c k; int ret = 0; - iter = bch2_trans_get_iter(trans, BTREE_ID_xattrs, pos, - BTREE_ITER_INTENT); - k = bch2_btree_iter_peek(iter); + bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs, pos, + BTREE_ITER_INTENT); + k = bch2_btree_iter_peek(&iter); ret = bkey_err(k); if (ret) goto err; @@ -621,9 +631,9 @@ static int __do_delete(struct btree_trans *trans, struct bpos pos) bkey_init(&delete.k); delete.k.p = k.k->p; - ret = bch2_trans_update(trans, iter, &delete, 0); + ret = bch2_trans_update(trans, &iter, &delete, 0); err: - bch2_trans_iter_put(trans, iter); + bch2_trans_iter_exit(trans, &iter); return ret; } @@ -653,7 +663,7 @@ static int rand_delete(struct bch_fs *c, u64 nr) static int seq_insert(struct bch_fs *c, u64 nr) { struct btree_trans trans; - struct btree_iter *iter; + struct btree_iter iter; struct bkey_s_c k; struct bkey_i_cookie insert; int ret = 0; @@ -665,10 +675,11 @@ static int seq_insert(struct bch_fs *c, u64 nr) 
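/*
 * [Editor's note.] rand_mixed() above shows the restart idiom this
 * series pushes everywhere: the transactional body -- here
 * rand_mixed_trans() -- is split into its own function that may fail
 * with -EINTR on a lock restart, and the caller wraps it:
 *
 *	ret = __bch2_trans_do(&trans, NULL, NULL, 0,
 *			      rand_mixed_trans(&trans, &iter, &cookie, i, rand));
 *
 * __bch2_trans_do() re-begins the transaction and re-evaluates the
 * expression whenever it fails with -EINTR, so the body must be safe
 * to run more than once.
 */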
for_each_btree_key(&trans, iter, BTREE_ID_xattrs, POS_MIN, BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) { - insert.k.p = iter->pos; + insert.k.p = iter.pos; ret = __bch2_trans_do(&trans, NULL, NULL, 0, - bch2_trans_update(&trans, iter, &insert.k_i, 0)); + bch2_btree_iter_traverse(&iter) ?: + bch2_trans_update(&trans, &iter, &insert.k_i, 0)); if (ret) { bch_err(c, "error in seq_insert: %i", ret); break; @@ -677,7 +688,7 @@ static int seq_insert(struct bch_fs *c, u64 nr) if (++i == nr) break; } - bch2_trans_iter_put(&trans, iter); + bch2_trans_iter_exit(&trans, &iter); bch2_trans_exit(&trans); return ret; @@ -686,7 +697,7 @@ static int seq_insert(struct bch_fs *c, u64 nr) static int seq_lookup(struct bch_fs *c, u64 nr) { struct btree_trans trans; - struct btree_iter *iter; + struct btree_iter iter; struct bkey_s_c k; int ret = 0; @@ -694,7 +705,7 @@ static int seq_lookup(struct bch_fs *c, u64 nr) for_each_btree_key(&trans, iter, BTREE_ID_xattrs, POS_MIN, 0, k, ret) ; - bch2_trans_iter_put(&trans, iter); + bch2_trans_iter_exit(&trans, &iter); bch2_trans_exit(&trans); return ret; @@ -703,7 +714,7 @@ static int seq_lookup(struct bch_fs *c, u64 nr) static int seq_overwrite(struct bch_fs *c, u64 nr) { struct btree_trans trans; - struct btree_iter *iter; + struct btree_iter iter; struct bkey_s_c k; int ret = 0; @@ -716,13 +727,14 @@ static int seq_overwrite(struct bch_fs *c, u64 nr) bkey_reassemble(&u.k_i, k); ret = __bch2_trans_do(&trans, NULL, NULL, 0, - bch2_trans_update(&trans, iter, &u.k_i, 0)); + bch2_btree_iter_traverse(&iter) ?: + bch2_trans_update(&trans, &iter, &u.k_i, 0)); if (ret) { bch_err(c, "error in seq_overwrite: %i", ret); break; } } - bch2_trans_iter_put(&trans, iter); + bch2_trans_iter_exit(&trans, &iter); bch2_trans_exit(&trans); return ret; @@ -771,7 +783,7 @@ static int btree_perf_test_thread(void *data) wait_event(j->ready_wait, !atomic_read(&j->ready)); } - ret = j->fn(j->c, j->nr / j->nr_threads); + ret = j->fn(j->c, div64_u64(j->nr, j->nr_threads)); if (ret) j->ret = ret; @@ -847,11 +859,11 @@ int bch2_btree_perf_test(struct bch_fs *c, const char *testname, scnprintf(name_buf, sizeof(name_buf), "%s:", testname); bch2_hprint(&PBUF(nr_buf), nr); - bch2_hprint(&PBUF(per_sec_buf), nr * NSEC_PER_SEC / time); + bch2_hprint(&PBUF(per_sec_buf), div64_u64(nr * NSEC_PER_SEC, time)); printk(KERN_INFO "%-12s %s with %u threads in %5llu sec, %5llu nsec per iter, %5s per sec\n", name_buf, nr_buf, nr_threads, - time / NSEC_PER_SEC, - time * nr_threads / nr, + div_u64(time, NSEC_PER_SEC), + div_u64(time * nr_threads, nr), per_sec_buf); return j.ret; } diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c index e3ad26e244ab..52de7c49cacb 100644 --- a/fs/bcachefs/util.c +++ b/fs/bcachefs/util.c @@ -525,7 +525,11 @@ int bch2_bio_alloc_pages(struct bio *bio, size_t size, gfp_t gfp_mask) if (!page) return -ENOMEM; - BUG_ON(!bio_add_page(bio, page, len, 0)); + if (unlikely(!bio_add_page(bio, page, len, 0))) { + __free_page(page); + break; + } + size -= len; } @@ -890,6 +894,7 @@ u64 *bch2_acc_percpu_u64s(u64 __percpu *p, unsigned nr) u64 *ret; int cpu; + /* access to pcpu vars has to be blocked by other locking */ preempt_disable(); ret = this_cpu_ptr(p); preempt_enable(); diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h index 4c67ea2866c6..80402b398442 100644 --- a/fs/bcachefs/util.h +++ b/fs/bcachefs/util.h @@ -18,9 +18,6 @@ #include <linux/vmalloc.h> #include <linux/workqueue.h> -#define PAGE_SECTOR_SHIFT (PAGE_SHIFT - 9) -#define PAGE_SECTORS (1UL << PAGE_SECTOR_SHIFT) - struct closure; 
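/*
 * [Editor's note.] The div64_u64()/div_u64() conversions above matter
 * on 32-bit architectures, where a plain `/` on 64-bit operands makes
 * the compiler emit a call to a libgcc helper the kernel does not
 * provide. Minimal illustration of the helpers' shapes:
 */
static u64 demo_ns_per_iter(u64 time_ns, u64 nr)
{
	return nr ? div64_u64(time_ns, nr) : 0;	/* u64 / u64 */
}

static u64 demo_seconds(u64 time_ns)
{
	return div_u64(time_ns, NSEC_PER_SEC);	/* u64 / u32, cheaper */
}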
#ifdef CONFIG_BCACHEFS_DEBUG @@ -88,7 +85,7 @@ static inline void *vpmalloc(size_t size, gfp_t gfp_mask) { return (void *) __get_free_pages(gfp_mask|__GFP_NOWARN, get_order(size)) ?: - __vmalloc(size, gfp_mask, PAGE_KERNEL); + __vmalloc(size, gfp_mask); } static inline void kvpfree(void *p, size_t size) @@ -653,35 +650,6 @@ static inline void memset_u64s_tail(void *s, int c, unsigned bytes) memset(s + bytes, c, rem); } -static inline struct bio_vec next_contig_bvec(struct bio *bio, - struct bvec_iter *iter) -{ - struct bio_vec bv = bio_iter_iovec(bio, *iter); - - bio_advance_iter(bio, iter, bv.bv_len); -#ifndef CONFIG_HIGHMEM - while (iter->bi_size) { - struct bio_vec next = bio_iter_iovec(bio, *iter); - - if (page_address(bv.bv_page) + bv.bv_offset + bv.bv_len != - page_address(next.bv_page) + next.bv_offset) - break; - - bv.bv_len += next.bv_len; - bio_advance_iter(bio, iter, next.bv_len); - } -#endif - return bv; -} - -#define __bio_for_each_contig_segment(bv, bio, iter, start) \ - for (iter = (start); \ - (iter).bi_size && \ - ((bv = next_contig_bvec((bio), &(iter))), 1);) - -#define bio_for_each_contig_segment(bv, bio, iter) \ - __bio_for_each_contig_segment(bv, bio, iter, (bio)->bi_iter) - void sort_cmp_size(void *base, size_t num, size_t size, int (*cmp_func)(const void *, const void *, size_t), void (*swap_func)(void *, void *, size_t)); @@ -741,10 +709,7 @@ static inline void percpu_u64_set(u64 __percpu *dst, u64 src) for_each_possible_cpu(cpu) *per_cpu_ptr(dst, cpu) = 0; - - preempt_disable(); - *this_cpu_ptr(dst) = src; - preempt_enable(); + this_cpu_write(*dst, src); } static inline void acc_u64s(u64 *acc, const u64 *src, unsigned nr) diff --git a/fs/bcachefs/varint.c b/fs/bcachefs/varint.c index a3d252c741c8..a2d6bb7136c7 100644 --- a/fs/bcachefs/varint.c +++ b/fs/bcachefs/varint.c @@ -1,10 +1,22 @@ // SPDX-License-Identifier: GPL-2.0 #include <linux/bitops.h> +#include <linux/string.h> #include <asm/unaligned.h> +#ifdef CONFIG_VALGRIND +#include <valgrind/memcheck.h> +#endif + #include "varint.h" +/** + * bch2_varint_encode - encode a variable length integer + * @out - destination to encode to + * @v - unsigned integer to encode + * + * Returns the size in bytes of the encoded integer - at most 9 bytes + */ int bch2_varint_encode(u8 *out, u64 v) { unsigned bits = fls64(v|1); @@ -13,19 +25,85 @@ int bch2_varint_encode(u8 *out, u64 v) if (likely(bytes < 9)) { v <<= bytes; v |= ~(~0 << (bytes - 1)); + v = cpu_to_le64(v); + memcpy(out, &v, bytes); } else { *out++ = 255; bytes = 9; + put_unaligned_le64(v, out); } - put_unaligned_le64(v, out); return bytes; } +/** + * bch2_varint_decode - encode a variable length integer + * @in - varint to decode + * @end - end of buffer to decode from + * @out - on success, decoded integer + * + * Returns the size in bytes of the decoded integer - or -1 on failure (would + * have read past the end of the buffer) + */ int bch2_varint_decode(const u8 *in, const u8 *end, u64 *out) { + unsigned bytes = likely(in < end) + ? ffz(*in & 255) + 1 + : 1; + u64 v; + + if (unlikely(in + bytes > end)) + return -1; + + if (likely(bytes < 9)) { + v = 0; + memcpy(&v, in, bytes); + v = le64_to_cpu(v); + v >>= bytes; + } else { + v = get_unaligned_le64(++in); + } + + *out = v; + return bytes; +} + +/** + * bch2_varint_encode_fast - fast version of bch2_varint_encode + * + * This version assumes it's always safe to write 8 bytes to @out, even if the + * encoded integer would be smaller. 
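/*
 * [Editor's note -- worked example of the varint format above.] The low
 * bits of the first byte encode the length as (bytes - 1) one bits
 * followed by a zero, and the value is shifted up by `bytes`. For
 * v = 300:
 *
 *	fls64(300) = 9 bits	     -> bytes = DIV_ROUND_UP(9, 7) = 2
 *	(300 << 2) | 0b01 = 0x4b1    -> little-endian output: 0xb1 0x04
 *
 * and decoding recovers the length from the first byte alone:
 *
 *	ffz(0xb1) = 1 -> bytes = 2;  0x04b1 >> 2 = 300
 *
 * The memcpy()-based versions above fix the older code, which did
 * unconditional 8-byte loads/stores and so could touch memory past the
 * end of short buffers; the _fast variants below deliberately keep the
 * 8-byte accesses for callers that can guarantee the slack.
 */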
+ */ +int bch2_varint_encode_fast(u8 *out, u64 v) +{ + unsigned bits = fls64(v|1); + unsigned bytes = DIV_ROUND_UP(bits, 7); + + if (likely(bytes < 9)) { + v <<= bytes; + v |= ~(~0 << (bytes - 1)); + } else { + *out++ = 255; + bytes = 9; + } + + put_unaligned_le64(v, out); + return bytes; +} + +/** + * bch2_varint_decode_fast - fast version of bch2_varint_decode + * + * This version assumes that it is safe to read at most 8 bytes past the end of + * @end (we still return an error if the varint extends past @end). + */ +int bch2_varint_decode_fast(const u8 *in, const u8 *end, u64 *out) +{ +#ifdef CONFIG_VALGRIND + VALGRIND_MAKE_MEM_DEFINED(in, 8); +#endif u64 v = get_unaligned_le64(in); - unsigned bytes = ffz(v & 255) + 1; + unsigned bytes = ffz(*in) + 1; if (unlikely(in + bytes > end)) return -1; diff --git a/fs/bcachefs/varint.h b/fs/bcachefs/varint.h index 8daf813576b7..92a182fb3d7a 100644 --- a/fs/bcachefs/varint.h +++ b/fs/bcachefs/varint.h @@ -5,4 +5,7 @@ int bch2_varint_encode(u8 *, u64); int bch2_varint_decode(const u8 *, const u8 *, u64 *); +int bch2_varint_encode_fast(u8 *, u64); +int bch2_varint_decode_fast(const u8 *, const u8 *, u64 *); + #endif /* _BCACHEFS_VARINT_H */ diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c index e7b40b3ca4aa..464ed68318e7 100644 --- a/fs/bcachefs/xattr.c +++ b/fs/bcachefs/xattr.c @@ -118,26 +118,28 @@ void bch2_xattr_to_text(struct printbuf *out, struct bch_fs *c, le16_to_cpu(xattr.v->x_val_len)); } -int bch2_xattr_get(struct bch_fs *c, struct bch_inode_info *inode, - const char *name, void *buffer, size_t size, int type) +static int bch2_xattr_get_trans(struct btree_trans *trans, struct bch_inode_info *inode, + const char *name, void *buffer, size_t size, int type) { - struct bch_hash_info hash = bch2_hash_info_init(c, &inode->ei_inode); - struct btree_trans trans; - struct btree_iter *iter; + struct bch_hash_info hash = bch2_hash_info_init(trans->c, &inode->ei_inode); + struct btree_iter iter; struct bkey_s_c_xattr xattr; + struct bkey_s_c k; int ret; - bch2_trans_init(&trans, c, 0, 0); + ret = bch2_hash_lookup(trans, &iter, bch2_xattr_hash_desc, &hash, + inode_inum(inode), + &X_SEARCH(type, name, strlen(name)), + 0); + if (ret) + goto err1; - iter = bch2_hash_lookup(&trans, bch2_xattr_hash_desc, &hash, - inode->v.i_ino, - &X_SEARCH(type, name, strlen(name)), - 0); - ret = PTR_ERR_OR_ZERO(iter); + k = bch2_btree_iter_peek_slot(&iter); + ret = bkey_err(k); if (ret) - goto err; + goto err2; - xattr = bkey_s_c_to_xattr(bch2_btree_iter_peek_slot(iter)); + xattr = bkey_s_c_to_xattr(k); ret = le16_to_cpu(xattr.v->x_val_len); if (buffer) { if (ret > size) @@ -145,21 +147,42 @@ int bch2_xattr_get(struct bch_fs *c, struct bch_inode_info *inode, else memcpy(buffer, xattr_val(xattr.v), ret); } - bch2_trans_iter_put(&trans, iter); -err: - bch2_trans_exit(&trans); - - BUG_ON(ret == -EINTR); +err2: + bch2_trans_iter_exit(trans, &iter); +err1: return ret == -ENOENT ? 
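/*
 * [Editor's note.] The _fast variants above trade bounds safety for
 * speed: encode may store a full 8 bytes even for a 1-byte varint, and
 * decode (note the VALGRIND_MAKE_MEM_DEFINED) loads 8 bytes before it
 * knows the length. They are only safe when the caller guarantees
 * slack around the buffer, e.g. (sizes are assumptions for
 * illustration):
 */
struct demo_varint_buf {
	u8	data[64];
	u8	pad[8];		/* slack so the _fast variants may overrun safely */
};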
-ENODATA : ret; } -int bch2_xattr_set(struct btree_trans *trans, u64 inum, +int bch2_xattr_get(struct bch_fs *c, struct bch_inode_info *inode, + const char *name, void *buffer, size_t size, int type) +{ + return bch2_trans_do(c, NULL, NULL, 0, + bch2_xattr_get_trans(&trans, inode, name, buffer, size, type)); +} + +int bch2_xattr_set(struct btree_trans *trans, subvol_inum inum, const struct bch_hash_info *hash_info, const char *name, const void *value, size_t size, int type, int flags) { + struct btree_iter inode_iter = { NULL }; + struct bch_inode_unpacked inode_u; int ret; + /* + * We need to do an inode update so that bi_journal_sync gets updated + * and fsync works: + * + * Perhaps we should be updating bi_mtime too? + */ + + ret = bch2_inode_peek(trans, &inode_iter, &inode_u, inum, BTREE_ITER_INTENT) ?: + bch2_inode_write(trans, &inode_iter, &inode_u); + bch2_trans_iter_exit(trans, &inode_iter); + + if (ret) + return ret; + if (value) { struct bkey_i_xattr *xattr; unsigned namelen = strlen(name); @@ -272,16 +295,24 @@ ssize_t bch2_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size) struct bch_fs *c = dentry->d_sb->s_fs_info; struct bch_inode_info *inode = to_bch_ei(dentry->d_inode); struct btree_trans trans; - struct btree_iter *iter; + struct btree_iter iter; struct bkey_s_c k; struct xattr_buf buf = { .buf = buffer, .len = buffer_size }; - u64 inum = dentry->d_inode->i_ino; + u64 offset = 0, inum = inode->ei_inode.bi_inum; + u32 snapshot; int ret; bch2_trans_init(&trans, c, 0, 0); +retry: + bch2_trans_begin(&trans); + iter = (struct btree_iter) { NULL }; - for_each_btree_key(&trans, iter, BTREE_ID_xattrs, - POS(inum, 0), 0, k, ret) { + ret = bch2_subvolume_get_snapshot(&trans, inode->ei_subvol, &snapshot); + if (ret) + goto err; + + for_each_btree_key_norestart(&trans, iter, BTREE_ID_xattrs, + SPOS(inum, offset, snapshot), 0, k, ret) { BUG_ON(k.k->p.inode < inum); if (k.k->p.inode > inum) @@ -294,9 +325,14 @@ ssize_t bch2_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size) if (ret) break; } - bch2_trans_iter_put(&trans, iter); - ret = bch2_trans_exit(&trans) ?: ret; + offset = iter.pos.offset; + bch2_trans_iter_exit(&trans, &iter); +err: + if (ret == -EINTR) + goto retry; + + bch2_trans_exit(&trans); if (ret) return ret; @@ -323,6 +359,7 @@ static int bch2_xattr_get_handler(const struct xattr_handler *handler, } static int bch2_xattr_set_handler(const struct xattr_handler *handler, + struct user_namespace *mnt_userns, struct dentry *dentry, struct inode *vinode, const char *name, const void *value, size_t size, int flags) @@ -331,8 +368,8 @@ static int bch2_xattr_set_handler(const struct xattr_handler *handler, struct bch_fs *c = inode->v.i_sb->s_fs_info; struct bch_hash_info hash = bch2_hash_info_init(c, &inode->ei_inode); - return bch2_trans_do(c, NULL, &inode->ei_journal_seq, 0, - bch2_xattr_set(&trans, inode->v.i_ino, &hash, + return bch2_trans_do(c, NULL, NULL, 0, + bch2_xattr_set(&trans, inode_inum(inode), &hash, name, value, size, handler->flags, flags)); } @@ -455,6 +492,7 @@ static int inode_opt_set_fn(struct bch_inode_info *inode, } static int bch2_xattr_bcachefs_set(const struct xattr_handler *handler, + struct user_namespace *mnt_userns, struct dentry *dentry, struct inode *vinode, const char *name, const void *value, size_t size, int flags) diff --git a/fs/bcachefs/xattr.h b/fs/bcachefs/xattr.h index 4151065ab853..f4f896545e1c 100644 --- a/fs/bcachefs/xattr.h +++ b/fs/bcachefs/xattr.h @@ -39,7 +39,8 @@ struct bch_inode_info; int 
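/*
 * [Editor's note.] bch2_xattr_list() above gains the canonical loop for
 * whole-transaction restarts. Distilled shape:
 *
 *	retry:
 *		bch2_trans_begin(&trans);
 *		ret = ...;			// get snapshot, iterate keys
 *	err:
 *		if (ret == -EINTR)
 *			goto retry;
 *
 * `offset` is carried across restarts so the listing resumes where it
 * left off instead of rescanning the inode from the start, and the walk
 * uses for_each_btree_key_norestart() so that -EINTR surfaces here
 * rather than being retried at a finer granularity inside the loop.
 */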
bch2_xattr_get(struct bch_fs *, struct bch_inode_info *, const char *, void *, size_t, int); -int bch2_xattr_set(struct btree_trans *, u64, const struct bch_hash_info *, +int bch2_xattr_set(struct btree_trans *, subvol_inum, + const struct bch_hash_info *, const char *, const void *, size_t, int, int); ssize_t bch2_xattr_list(struct dentry *, char *, size_t); diff --git a/include/linux/exportfs.h b/include/linux/exportfs.h index 0d3037419bc7..8b6359083b9c 100644 --- a/include/linux/exportfs.h +++ b/include/linux/exportfs.h @@ -99,6 +99,12 @@ enum fid_type { FILEID_FAT_WITH_PARENT = 0x72, /* + * 64 bit inode number, 32 bit subvolume, 32 bit generation number: + */ + FILEID_BCACHEFS_WITHOUT_PARENT = 0x80, + FILEID_BCACHEFS_WITH_PARENT = 0x81, + + /* * 128 bit child FID (struct lu_fid) * 128 bit parent FID (struct lu_fid) */ diff --git a/include/trace/events/bcachefs.h b/include/trace/events/bcachefs.h index b5fcda9e65d8..fce3146378f9 100644 --- a/include/trace/events/bcachefs.h +++ b/include/trace/events/bcachefs.h @@ -71,10 +71,10 @@ DECLARE_EVENT_CLASS(bio, ), TP_fast_assign( - __entry->dev = bio->bi_disk ? bio_dev(bio) : 0; + __entry->dev = bio->bi_bdev ? bio_dev(bio) : 0; __entry->sector = bio->bi_iter.bi_sector; __entry->nr_sector = bio->bi_iter.bi_size >> 9; - blk_fill_rwbs(__entry->rwbs, bio->bi_opf, bio->bi_iter.bi_size); + blk_fill_rwbs(__entry->rwbs, bio->bi_opf); ), TP_printk("%d,%d %s %llu + %u", @@ -298,28 +298,6 @@ TRACE_EVENT(btree_reserve_get_fail, __entry->required, __entry->cl) ); -TRACE_EVENT(btree_insert_key, - TP_PROTO(struct bch_fs *c, struct btree *b, struct bkey_i *k), - TP_ARGS(c, b, k), - - TP_STRUCT__entry( - __field(u8, id ) - __field(u64, inode ) - __field(u64, offset ) - __field(u32, size ) - ), - - TP_fast_assign( - __entry->id = b->c.btree_id; - __entry->inode = k->k.p.inode; - __entry->offset = k->k.p.offset; - __entry->size = k->k.size; - ), - - TP_printk("btree %u: %llu:%llu len %u", __entry->id, - __entry->inode, __entry->offset, __entry->size) -); - DEFINE_EVENT(btree_node, btree_split, TP_PROTO(struct bch_fs *c, struct btree *b), TP_ARGS(c, b) @@ -540,69 +518,6 @@ TRACE_EVENT(copygc_wait, __entry->wait_amount, __entry->until) ); -TRACE_EVENT(trans_get_iter, - TP_PROTO(unsigned long trans_ip, - unsigned long caller_ip, - enum btree_id btree_id, - struct bpos *got_pos, - unsigned got_locks, - unsigned got_uptodate, - struct bpos *src_pos, - unsigned src_locks, - unsigned src_uptodate), - TP_ARGS(trans_ip, caller_ip, btree_id, - got_pos, got_locks, got_uptodate, - src_pos, src_locks, src_uptodate), - - TP_STRUCT__entry( - __field(unsigned long, trans_ip ) - __field(unsigned long, caller_ip ) - __field(u8, btree_id ) - __field(u64, got_pos_inode ) - __field(u64, got_pos_offset ) - __field(u32, got_pos_snapshot ) - __field(u8, got_locks ) - __field(u8, got_uptodate ) - __field(u64, src_pos_inode ) - __field(u64, src_pos_offset ) - __field(u32, src_pos_snapshot ) - __field(u8, src_locks ) - __field(u8, src_uptodate ) - ), - - TP_fast_assign( - __entry->trans_ip = trans_ip; - __entry->caller_ip = caller_ip; - __entry->btree_id = btree_id; - __entry->got_pos_inode = got_pos->inode; - __entry->got_pos_offset = got_pos->offset; - __entry->got_pos_snapshot = got_pos->snapshot; - __entry->got_locks = got_locks; - __entry->got_uptodate = got_uptodate; - __entry->src_pos_inode = src_pos->inode; - __entry->src_pos_offset = src_pos->offset; - __entry->src_pos_snapshot = src_pos->snapshot; - __entry->src_locks = src_locks; - __entry->src_uptodate = src_uptodate; - ), - - 
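/*
 * [Editor's note -- sketch of the layout implied by the exportfs
 * comment above; the struct and field names here are illustrative, not
 * necessarily what fs/bcachefs defines:]
 */
struct demo_bcachefs_fid {
	u64	inum;		/* 64 bit inode number */
	u32	subvol;		/* 32 bit subvolume */
	u32	gen;		/* 32 bit generation number */
} __packed;
/*
 * FILEID_BCACHEFS_WITH_PARENT would carry two of these back to back,
 * the second identifying the parent directory, following the pattern of
 * the other *_WITH_PARENT fid types in this enum.
 */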
TP_printk("%ps %pS btree %u got %llu:%llu:%u l %u u %u " - "src %llu:%llu:%u l %u u %u", - (void *) __entry->trans_ip, - (void *) __entry->caller_ip, - __entry->btree_id, - __entry->got_pos_inode, - __entry->got_pos_offset, - __entry->got_pos_snapshot, - __entry->got_locks, - __entry->got_uptodate, - __entry->src_pos_inode, - __entry->src_pos_offset, - __entry->src_pos_snapshot, - __entry->src_locks, - __entry->src_uptodate) -); - TRACE_EVENT(transaction_restart_ip, TP_PROTO(unsigned long caller, unsigned long ip), TP_ARGS(caller, ip), @@ -772,92 +687,6 @@ DEFINE_EVENT(transaction_restart_iter, trans_restart_traverse, TP_ARGS(trans_ip, caller_ip, btree_id, pos) ); -TRACE_EVENT(iter_traverse, - TP_PROTO(unsigned long trans_ip, - unsigned long caller_ip, - enum btree_id btree_id, - struct bpos *pos, - int ret), - TP_ARGS(trans_ip, caller_ip, btree_id, pos, ret), - - TP_STRUCT__entry( - __field(unsigned long, trans_ip ) - __field(unsigned long, caller_ip ) - __field(u8, btree_id ) - __field(u64, pos_inode ) - __field(u64, pos_offset ) - __field(u32, pos_snapshot ) - __field(s32, ret ) - ), - - TP_fast_assign( - __entry->trans_ip = trans_ip; - __entry->caller_ip = caller_ip; - __entry->btree_id = btree_id; - __entry->pos_inode = pos->inode; - __entry->pos_offset = pos->offset; - __entry->pos_snapshot = pos->snapshot; - __entry->ret = ret; - ), - - TP_printk("%ps %pS pos %u %llu:%llu:%u ret %i", - (void *) __entry->trans_ip, - (void *) __entry->caller_ip, - __entry->btree_id, - __entry->pos_inode, - __entry->pos_offset, - __entry->pos_snapshot, - __entry->ret) -); - -TRACE_EVENT(iter_set_search_pos, - TP_PROTO(unsigned long trans_ip, - unsigned long caller_ip, - enum btree_id btree_id, - struct bpos *old_pos, - struct bpos *new_pos, - unsigned good_level), - TP_ARGS(trans_ip, caller_ip, btree_id, old_pos, new_pos, good_level), - - TP_STRUCT__entry( - __field(unsigned long, trans_ip ) - __field(unsigned long, caller_ip ) - __field(u8, btree_id ) - __field(u64, old_pos_inode ) - __field(u64, old_pos_offset ) - __field(u32, old_pos_snapshot ) - __field(u64, new_pos_inode ) - __field(u64, new_pos_offset ) - __field(u32, new_pos_snapshot ) - __field(u8, good_level ) - ), - - TP_fast_assign( - __entry->trans_ip = trans_ip; - __entry->caller_ip = caller_ip; - __entry->btree_id = btree_id; - __entry->old_pos_inode = old_pos->inode; - __entry->old_pos_offset = old_pos->offset; - __entry->old_pos_snapshot = old_pos->snapshot; - __entry->new_pos_inode = new_pos->inode; - __entry->new_pos_offset = new_pos->offset; - __entry->new_pos_snapshot = new_pos->snapshot; - __entry->good_level = good_level; - ), - - TP_printk("%ps %pS btree %u old pos %llu:%llu:%u new pos %llu:%llu:%u l %u", - (void *) __entry->trans_ip, - (void *) __entry->caller_ip, - __entry->btree_id, - __entry->old_pos_inode, - __entry->old_pos_offset, - __entry->old_pos_snapshot, - __entry->new_pos_inode, - __entry->new_pos_offset, - __entry->new_pos_snapshot, - __entry->good_level) -); - TRACE_EVENT(trans_restart_would_deadlock, TP_PROTO(unsigned long trans_ip, unsigned long caller_ip, @@ -927,93 +756,42 @@ TRACE_EVENT(trans_restart_would_deadlock, __entry->want_pos_snapshot) ); -TRACE_EVENT(trans_restart_mem_realloced, - TP_PROTO(unsigned long trans_ip, unsigned long caller_ip, - unsigned long bytes), - TP_ARGS(trans_ip, caller_ip, bytes), +TRACE_EVENT(trans_restart_would_deadlock_write, + TP_PROTO(unsigned long trans_ip), + TP_ARGS(trans_ip), TP_STRUCT__entry( __field(unsigned long, trans_ip ) - __field(unsigned long, caller_ip ) - 
__field(unsigned long, bytes ) ), TP_fast_assign( __entry->trans_ip = trans_ip; - __entry->caller_ip = caller_ip; - __entry->bytes = bytes; ), - TP_printk("%ps %pS bytes %lu", - (void *) __entry->trans_ip, - (void *) __entry->caller_ip, - __entry->bytes) + TP_printk("%ps", (void *) __entry->trans_ip) ); -DECLARE_EVENT_CLASS(node_lock_fail, - TP_PROTO(unsigned long trans_ip, - unsigned long caller_ip, - enum btree_id btree_id, - struct bpos *pos, - unsigned level, u32 iter_seq, unsigned node, u32 node_seq), - TP_ARGS(trans_ip, caller_ip, btree_id, pos, - level, iter_seq, node, node_seq), +TRACE_EVENT(trans_restart_mem_realloced, + TP_PROTO(unsigned long trans_ip, unsigned long caller_ip, + unsigned long bytes), + TP_ARGS(trans_ip, caller_ip, bytes), TP_STRUCT__entry( __field(unsigned long, trans_ip ) __field(unsigned long, caller_ip ) - __field(u8, btree_id ) - __field(u64, pos_inode ) - __field(u64, pos_offset ) - __field(u32, pos_snapshot ) - __field(u32, level ) - __field(u32, iter_seq ) - __field(u32, node ) - __field(u32, node_seq ) + __field(unsigned long, bytes ) ), TP_fast_assign( - __entry->trans_ip = trans_ip; - __entry->caller_ip = caller_ip; - __entry->btree_id = btree_id; - __entry->pos_inode = pos->inode; - __entry->pos_offset = pos->offset; - __entry->pos_snapshot = pos->snapshot; - __entry->level = level; - __entry->iter_seq = iter_seq; - __entry->node = node; - __entry->node_seq = node_seq; + __entry->trans_ip = trans_ip; + __entry->caller_ip = caller_ip; + __entry->bytes = bytes; ), - TP_printk("%ps %pS btree %u pos %llu:%llu:%u level %u iter seq %u node %u node seq %u", + TP_printk("%ps %pS bytes %lu", (void *) __entry->trans_ip, (void *) __entry->caller_ip, - __entry->btree_id, - __entry->pos_inode, - __entry->pos_offset, - __entry->pos_snapshot, - __entry->level, __entry->iter_seq, - __entry->node, __entry->node_seq) -); - -DEFINE_EVENT(node_lock_fail, node_upgrade_fail, - TP_PROTO(unsigned long trans_ip, - unsigned long caller_ip, - enum btree_id btree_id, - struct bpos *pos, - unsigned level, u32 iter_seq, unsigned node, u32 node_seq), - TP_ARGS(trans_ip, caller_ip, btree_id, pos, - level, iter_seq, node, node_seq) -); - -DEFINE_EVENT(node_lock_fail, node_relock_fail, - TP_PROTO(unsigned long trans_ip, - unsigned long caller_ip, - enum btree_id btree_id, - struct bpos *pos, - unsigned level, u32 iter_seq, unsigned node, u32 node_seq), - TP_ARGS(trans_ip, caller_ip, btree_id, pos, - level, iter_seq, node, node_seq) + __entry->bytes) ); #endif /* _TRACE_BCACHE_H */ diff --git a/kernel/locking/six.c b/kernel/locking/six.c index 75a735acd11b..fca1208720b6 100644 --- a/kernel/locking/six.c +++ b/kernel/locking/six.c @@ -17,7 +17,7 @@ #endif #define six_acquire(l, t) lock_acquire(l, 0, t, 0, 0, NULL, _RET_IP_) -#define six_release(l) lock_release(l, 0, _RET_IP_) +#define six_release(l) lock_release(l, _RET_IP_) struct six_lock_vals { /* Value we add to the lock in order to take the lock: */ |
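/*
 * [Editor's note.] TRACE_EVENT(trans_restart_would_deadlock_write, ...)
 * above generates a trace_trans_restart_would_deadlock_write() hook;
 * with only the transaction ip recorded, a call site in the btree
 * update path would look like (illustrative, not quoted from the
 * patch):
 *
 *	trace_trans_restart_would_deadlock_write(trans->ip);
 *
 * The kernel/locking/six.c hunk at the end is a separate API catch-up:
 * lockdep's lock_release() dropped its `nested` argument upstream, so
 * six_release() now passes just the lockdep map and _RET_IP_.
 */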