author    | Kent Overstreet <kent.overstreet@gmail.com> | 2020-11-07 11:36:03 -0500
committer | Kent Overstreet <kent.overstreet@gmail.com> | 2020-11-07 13:11:03 -0500
commit    | 76fb8739030612bd51ef2c9573d2bf849e9910d4 (patch)
tree      | 1e8d34c2e30e39265e5d0e627b51fd03a1436cbf
parent    | 7d679fb274b1a05bf06894c681e7ab6393f4742f (diff)
Merge with 1d669389f7: bcachefs: use a radix tree for inum bitmap in fsck
68 files changed, 1774 insertions(+), 1527 deletions(-)
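
The fsck change named in the merge subject is not among the hunks shown below. As a minimal sketch of the technique only — assuming the kernel's generic radix tree API from <linux/generic-radix-tree.h> (which bcachefs already uses), and with illustrative helper names rather than the merged commit's actual code — the inum bitmap becomes a sparse bitmap whose backing pages are allocated on demand, so memory scales with the inode numbers actually present rather than with the largest inum:

```c
#include <linux/bitops.h>
#include <linux/generic-radix-tree.h>

/* One radix-tree leaf word per BITS_PER_LONG inode numbers */
typedef GENRADIX(unsigned long) inode_bitmap;

static inline bool inode_bitmap_test(inode_bitmap *b, size_t nr)
{
	/* genradix_ptr() returns NULL for never-allocated regions: bit is 0 */
	unsigned long *w = genradix_ptr(b, nr / BITS_PER_LONG);

	return w ? test_bit(nr & (BITS_PER_LONG - 1), w) : false;
}

static inline int inode_bitmap_set(inode_bitmap *b, size_t nr)
{
	/* allocate the backing word on first touch */
	unsigned long *w = genradix_ptr_alloc(b, nr / BITS_PER_LONG, GFP_KERNEL);

	if (!w)
		return -ENOMEM;
	__set_bit(nr & (BITS_PER_LONG - 1), w);
	return 0;
}
```

A caller would genradix_init() the map before walking the inodes btree and genradix_free() it afterwards. Similarly, one headline change that is visible in the hunks below adds a one-slot per-CPU cache of btree iterator buffers in front of the iterator mempool (struct btree_iter_buf and c->btree_iters_bufs in bcachefs.h and btree_iter.c, which also drop the on-stack iters_onstack arrays); distilled from those hunks, again with hypothetical helper names:

```c
/* take the CPU-local buffer if one is cached, else fall back to the mempool */
static void *btree_iter_buf_get(struct bch_fs *c)
{
	return this_cpu_xchg(c->btree_iters_bufs->iter, NULL) ?:
		mempool_alloc(&c->btree_iters_pool, GFP_NOFS);
}

/* stash the buffer back into the CPU-local slot; free only if it was full */
static void btree_iter_buf_put(struct bch_fs *c, struct btree_iter *iters)
{
	iters = this_cpu_xchg(c->btree_iters_bufs->iter, iters);
	if (iters)
		mempool_free(iters, &c->btree_iters_pool);
}
```
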
diff --git a/fs/bcachefs/Kconfig b/fs/bcachefs/Kconfig index 10abddae6a80..5594af719b2a 100644 --- a/fs/bcachefs/Kconfig +++ b/fs/bcachefs/Kconfig @@ -20,7 +20,7 @@ config BCACHEFS_FS select SIXLOCKS select RAID6_PQ select XOR_BLOCKS - ---help--- + help The bcachefs filesystem - a modern, copy on write filesystem, with support for multiple devices, compression, checksumming, etc. @@ -37,7 +37,7 @@ config BCACHEFS_POSIX_ACL config BCACHEFS_DEBUG bool "bcachefs debugging" depends on BCACHEFS_FS - ---help--- + help Enables many extra debugging checks and assertions. The resulting code will be significantly slower than normal; you @@ -46,5 +46,5 @@ config BCACHEFS_DEBUG config BCACHEFS_TESTS bool "bcachefs unit and performance tests" depends on BCACHEFS_FS - ---help--- + help Include some unit and performance tests for the core btree code diff --git a/fs/bcachefs/Makefile b/fs/bcachefs/Makefile index d85ced62c0dd..2fbf978424ed 100644 --- a/fs/bcachefs/Makefile +++ b/fs/bcachefs/Makefile @@ -56,4 +56,5 @@ bcachefs-y := \ tests.o \ trace.o \ util.o \ + varint.o \ xattr.o diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 9aa0b42b26b6..97508de9f721 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -209,10 +209,25 @@ void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, static int bch2_alloc_read_fn(struct bch_fs *c, enum btree_id id, unsigned level, struct bkey_s_c k) { - if (!level) - bch2_mark_key(c, k, 0, 0, NULL, 0, - BTREE_TRIGGER_ALLOC_READ| - BTREE_TRIGGER_NOATOMIC); + struct bch_dev *ca; + struct bucket *g; + struct bkey_alloc_unpacked u; + + if (level || k.k->type != KEY_TYPE_alloc) + return 0; + + ca = bch_dev_bkey_exists(c, k.k->p.inode); + g = __bucket(ca, k.k->p.offset, 0); + u = bch2_alloc_unpack(k); + + g->_mark.gen = u.gen; + g->_mark.data_type = u.data_type; + g->_mark.dirty_sectors = u.dirty_sectors; + g->_mark.cached_sectors = u.cached_sectors; + g->io_time[READ] = u.read_time; + g->io_time[WRITE] = u.write_time; + g->oldest_gen = u.oldest_gen; + g->gen_valid = 1; return 0; } @@ -223,8 +238,11 @@ int bch2_alloc_read(struct bch_fs *c, struct journal_keys *journal_keys) unsigned i; int ret = 0; + down_read(&c->gc_lock); ret = bch2_btree_and_journal_walk(c, journal_keys, BTREE_ID_ALLOC, NULL, bch2_alloc_read_fn); + up_read(&c->gc_lock); + if (ret) { bch_err(c, "error reading alloc info: %i", ret); return ret; @@ -253,12 +271,6 @@ int bch2_alloc_read(struct bch_fs *c, struct journal_keys *journal_keys) return 0; } -enum alloc_write_ret { - ALLOC_WROTE, - ALLOC_NOWROTE, - ALLOC_END, -}; - static int bch2_alloc_write_key(struct btree_trans *trans, struct btree_iter *iter, unsigned flags) @@ -288,26 +300,17 @@ retry: old_u = bch2_alloc_unpack(k); - if (iter->pos.inode >= c->sb.nr_devices || - !c->devs[iter->pos.inode]) - return ALLOC_END; - percpu_down_read(&c->mark_lock); ca = bch_dev_bkey_exists(c, iter->pos.inode); ba = bucket_array(ca); - if (iter->pos.offset >= ba->nbuckets) { - percpu_up_read(&c->mark_lock); - return ALLOC_END; - } - g = &ba->b[iter->pos.offset]; m = READ_ONCE(g->mark); new_u = alloc_mem_to_key(g, m); percpu_up_read(&c->mark_lock); if (!bkey_alloc_unpacked_cmp(old_u, new_u)) - return ALLOC_NOWROTE; + return 0; a = bkey_alloc_init(&alloc_key.k); a->k.p = iter->pos; @@ -325,50 +328,55 @@ err: return ret; } -int bch2_alloc_write(struct bch_fs *c, unsigned flags, bool *wrote) +int bch2_dev_alloc_write(struct bch_fs *c, struct bch_dev *ca, unsigned flags) { struct btree_trans trans; struct 
btree_iter *iter; - struct bch_dev *ca; - unsigned i; + u64 first_bucket, nbuckets; int ret = 0; + percpu_down_read(&c->mark_lock); + first_bucket = bucket_array(ca)->first_bucket; + nbuckets = bucket_array(ca)->nbuckets; + percpu_up_read(&c->mark_lock); + BUG_ON(BKEY_ALLOC_VAL_U64s_MAX > 8); bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); - iter = bch2_trans_get_iter(&trans, BTREE_ID_ALLOC, POS_MIN, + iter = bch2_trans_get_iter(&trans, BTREE_ID_ALLOC, + POS(ca->dev_idx, first_bucket), BTREE_ITER_SLOTS|BTREE_ITER_INTENT); - for_each_rw_member(ca, c, i) { - unsigned first_bucket; + while (iter->pos.offset < nbuckets) { + bch2_trans_cond_resched(&trans); - percpu_down_read(&c->mark_lock); - first_bucket = bucket_array(ca)->first_bucket; - percpu_up_read(&c->mark_lock); + ret = bch2_alloc_write_key(&trans, iter, flags); + if (ret) + break; + bch2_btree_iter_next_slot(iter); + } - bch2_btree_iter_set_pos(iter, POS(i, first_bucket)); + bch2_trans_exit(&trans); - while (1) { - bch2_trans_cond_resched(&trans); + return ret; +} - ret = bch2_alloc_write_key(&trans, iter, flags); - if (ret < 0 || ret == ALLOC_END) - break; - if (ret == ALLOC_WROTE) - *wrote = true; - bch2_btree_iter_next_slot(iter); - } +int bch2_alloc_write(struct bch_fs *c, unsigned flags) +{ + struct bch_dev *ca; + unsigned i; + int ret = 0; - if (ret < 0) { + for_each_rw_member(ca, c, i) { + bch2_dev_alloc_write(c, ca, flags); + if (ret) { percpu_ref_put(&ca->io_ref); break; } } - bch2_trans_exit(&trans); - - return ret < 0 ? ret : 0; + return ret; } /* Bucket IO clocks: */ @@ -481,6 +489,53 @@ static void bch2_bucket_clock_init(struct bch_fs *c, int rw) mutex_init(&clock->lock); } +int bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev, + size_t bucket_nr, int rw) +{ + struct bch_fs *c = trans->c; + struct bch_dev *ca = bch_dev_bkey_exists(c, dev); + struct btree_iter *iter; + struct bucket *g; + struct bkey_i_alloc *a; + struct bkey_alloc_unpacked u; + u16 *time; + int ret = 0; + + iter = bch2_trans_get_iter(trans, BTREE_ID_ALLOC, POS(dev, bucket_nr), + BTREE_ITER_CACHED| + BTREE_ITER_CACHED_NOFILL| + BTREE_ITER_INTENT); + if (IS_ERR(iter)) + return PTR_ERR(iter); + + a = bch2_trans_kmalloc(trans, BKEY_ALLOC_U64s_MAX * 8); + ret = PTR_ERR_OR_ZERO(a); + if (ret) + goto out; + + percpu_down_read(&c->mark_lock); + g = bucket(ca, bucket_nr); + u = alloc_mem_to_key(g, READ_ONCE(g->mark)); + percpu_up_read(&c->mark_lock); + + bkey_alloc_init(&a->k_i); + a->k.p = iter->pos; + + time = rw == READ ? &u.read_time : &u.write_time; + if (*time == c->bucket_clock[rw].hand) + goto out; + + *time = c->bucket_clock[rw].hand; + + bch2_alloc_pack(a, u); + + ret = bch2_trans_update(trans, iter, &a->k_i, 0) ?: + bch2_trans_commit(trans, NULL, NULL, 0); +out: + bch2_trans_iter_put(trans, iter); + return ret; +} + /* Background allocator thread: */ /* @@ -489,8 +544,6 @@ static void bch2_bucket_clock_init(struct bch_fs *c, int rw) * commands to the newly free buckets, then puts them on the various freelists. 
*/ -#define BUCKET_GC_GEN_MAX 96U - /** * wait_buckets_available - wait on reclaimable buckets * @@ -1259,18 +1312,6 @@ void bch2_recalc_capacity(struct bch_fs *c) c->bucket_size_max = bucket_size_max; - if (c->capacity) { - bch2_io_timer_add(&c->io_clock[READ], - &c->bucket_clock[READ].rescale); - bch2_io_timer_add(&c->io_clock[WRITE], - &c->bucket_clock[WRITE].rescale); - } else { - bch2_io_timer_del(&c->io_clock[READ], - &c->bucket_clock[READ].rescale); - bch2_io_timer_del(&c->io_clock[WRITE], - &c->bucket_clock[WRITE].rescale); - } - /* Wake up case someone was waiting for buckets */ closure_wake_up(&c->freelist_wait); } diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h index f6b9f27f0713..d10ff56e4de1 100644 --- a/fs/bcachefs/alloc_background.h +++ b/fs/bcachefs/alloc_background.h @@ -13,6 +13,9 @@ struct bkey_alloc_unpacked { #undef x }; +/* How out of date a pointer gen is allowed to be: */ +#define BUCKET_GC_GEN_MAX 96U + /* returns true if not equal */ static inline bool bkey_alloc_unpacked_cmp(struct bkey_alloc_unpacked l, struct bkey_alloc_unpacked r) @@ -28,6 +31,8 @@ struct bkey_alloc_unpacked bch2_alloc_unpack(struct bkey_s_c); void bch2_alloc_pack(struct bkey_i_alloc *, const struct bkey_alloc_unpacked); +int bch2_bucket_io_time_reset(struct btree_trans *, unsigned, size_t, int); + static inline struct bkey_alloc_unpacked alloc_mem_to_key(struct bucket *g, struct bucket_mark m) { @@ -61,15 +66,17 @@ static inline void bch2_wake_allocator(struct bch_dev *ca) rcu_read_lock(); p = rcu_dereference(ca->alloc_thread); - if (p) + if (p) { wake_up_process(p); + ca->allocator_state = ALLOCATOR_RUNNING; + } rcu_read_unlock(); } static inline void verify_not_on_freelist(struct bch_fs *c, struct bch_dev *ca, size_t bucket) { - if (expensive_debug_checks(c)) { + if (bch2_expensive_debug_checks) { size_t iter; long i; unsigned j; @@ -91,7 +98,8 @@ void bch2_dev_allocator_quiesce(struct bch_fs *, struct bch_dev *); void bch2_dev_allocator_stop(struct bch_dev *); int bch2_dev_allocator_start(struct bch_dev *); -int bch2_alloc_write(struct bch_fs *, unsigned, bool *); +int bch2_dev_alloc_write(struct bch_fs *, struct bch_dev *, unsigned); +int bch2_alloc_write(struct bch_fs *, unsigned); void bch2_fs_allocator_background_init(struct bch_fs *); #endif /* _BCACHEFS_ALLOC_BACKGROUND_H */ diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index 4a048828869b..7a92e3d53254 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -309,8 +309,6 @@ out: .dev = ca->dev_idx, }; - bucket_io_clock_reset(c, ca, bucket, READ); - bucket_io_clock_reset(c, ca, bucket, WRITE); spin_unlock(&ob->lock); if (c->blocked_allocate_open_bucket) { diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 68e150fb8510..35311dbb189c 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -265,6 +265,8 @@ do { \ BCH_DEBUG_PARAM(debug_check_bkeys, \ "Run bkey_debugcheck (primarily checking GC/allocation "\ "information) when iterating over keys") \ + BCH_DEBUG_PARAM(debug_check_btree_accounting, \ + "Verify btree accounting for keys within a node") \ BCH_DEBUG_PARAM(verify_btree_ondisk, \ "Reread btree nodes at various points to verify the " \ "mergesort in the read path against modifications " \ @@ -295,6 +297,16 @@ do { \ #define BCH_DEBUG_PARAMS() BCH_DEBUG_PARAMS_ALWAYS() #endif +#define BCH_DEBUG_PARAM(name, description) extern bool bch2_##name; +BCH_DEBUG_PARAMS() +#undef BCH_DEBUG_PARAM + +#ifndef 
CONFIG_BCACHEFS_DEBUG +#define BCH_DEBUG_PARAM(name, description) static const bool bch2_##name; +BCH_DEBUG_PARAMS_DEBUG() +#undef BCH_DEBUG_PARAM +#endif + #define BCH_TIME_STATS() \ x(btree_node_mem_alloc) \ x(btree_node_split) \ @@ -491,7 +503,6 @@ enum { BCH_FS_ERRORS_FIXED, /* misc: */ - BCH_FS_BDEV_MOUNTED, BCH_FS_FIXED_GENS, BCH_FS_ALLOC_WRITTEN, BCH_FS_REBUILD_REPLICAS, @@ -530,6 +541,10 @@ struct journal_keys { u64 journal_seq_base; }; +struct btree_iter_buf { + struct btree_iter *iter; +}; + struct bch_fs { struct closure cl; @@ -625,6 +640,7 @@ struct bch_fs { struct mutex btree_trans_lock; struct list_head btree_trans_list; mempool_t btree_iters_pool; + struct btree_iter_buf __percpu *btree_iters_bufs; struct btree_key_cache btree_key_cache; @@ -735,7 +751,7 @@ struct bch_fs { ZSTD_parameters zstd_params; struct crypto_shash *sha256; - struct crypto_skcipher *chacha20; + struct crypto_sync_skcipher *chacha20; struct crypto_shash *poly1305; atomic64_t key_version; @@ -802,7 +818,8 @@ struct bch_fs { struct mutex verify_lock; #endif - u64 unused_inode_hint; + u64 *unused_inode_hints; + unsigned inode_shard_bits; /* * A btree node on disk could have too many bsets for an iterator to fit @@ -827,10 +844,6 @@ struct bch_fs { unsigned copy_gc_enabled:1; bool promote_whole_extents; -#define BCH_DEBUG_PARAM(name, description) bool name; - BCH_DEBUG_PARAMS_ALL() -#undef BCH_DEBUG_PARAM - struct time_stats times[BCH_TIME_STAT_NR]; }; diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index d5a2230e403c..94b5418587e3 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -340,7 +340,8 @@ static inline void bkey_init(struct bkey *k) x(reflink_p, 15) \ x(reflink_v, 16) \ x(inline_data, 17) \ - x(btree_ptr_v2, 18) + x(btree_ptr_v2, 18) \ + x(indirect_inline_data, 19) enum bch_bkey_type { #define x(name, nr) KEY_TYPE_##name = nr, @@ -668,10 +669,10 @@ struct bch_inode_generation { } __attribute__((packed, aligned(8))); #define BCH_INODE_FIELDS() \ - x(bi_atime, 64) \ - x(bi_ctime, 64) \ - x(bi_mtime, 64) \ - x(bi_otime, 64) \ + x(bi_atime, 96) \ + x(bi_ctime, 96) \ + x(bi_mtime, 96) \ + x(bi_otime, 96) \ x(bi_size, 64) \ x(bi_sectors, 64) \ x(bi_uid, 32) \ @@ -738,7 +739,8 @@ enum { #define BCH_INODE_UNLINKED (1 << __BCH_INODE_UNLINKED) LE32_BITMASK(INODE_STR_HASH, struct bch_inode, bi_flags, 20, 24); -LE32_BITMASK(INODE_NR_FIELDS, struct bch_inode, bi_flags, 24, 32); +LE32_BITMASK(INODE_NR_FIELDS, struct bch_inode, bi_flags, 24, 31); +LE32_BITMASK(INODE_NEW_VARINT, struct bch_inode, bi_flags, 31, 32); /* Dirents */ @@ -886,6 +888,12 @@ struct bch_reflink_v { __u64 _data[0]; }; +struct bch_indirect_inline_data { + struct bch_val v; + __le64 refcount; + u8 data[0]; +}; + /* Inline data */ struct bch_inline_data { @@ -1032,7 +1040,8 @@ LE64_BITMASK(BCH_KDF_SCRYPT_P, struct bch_sb_field_crypt, kdf_flags, 32, 48); x(journal, 2) \ x(btree, 3) \ x(user, 4) \ - x(cached, 5) + x(cached, 5) \ + x(parity, 6) enum bch_data_type { #define x(t, n) BCH_DATA_##t, @@ -1321,13 +1330,16 @@ LE64_BITMASK(BCH_SB_ERASURE_CODE, struct bch_sb, flags[3], 0, 16); x(incompressible, 10) \ x(btree_ptr_v2, 11) \ x(extents_above_btree_updates, 12) \ - x(btree_updates_journalled, 13) + x(btree_updates_journalled, 13) \ + x(reflink_inline_data, 14) \ + x(new_varint, 15) #define BCH_SB_FEATURES_ALL \ ((1ULL << BCH_FEATURE_new_siphash)| \ (1ULL << BCH_FEATURE_new_extent_overwrite)| \ (1ULL << BCH_FEATURE_btree_ptr_v2)| \ - (1ULL << BCH_FEATURE_extents_above_btree_updates)) + 
(1ULL << BCH_FEATURE_extents_above_btree_updates)|\ + (1ULL << BCH_FEATURE_new_varint))\ enum bch_sb_feature { #define x(f, n) BCH_FEATURE_##f, diff --git a/fs/bcachefs/bkey.c b/fs/bcachefs/bkey.c index 4d0c9129cd4a..c06d0a965be1 100644 --- a/fs/bcachefs/bkey.c +++ b/fs/bcachefs/bkey.c @@ -411,7 +411,7 @@ static bool bkey_packed_successor(struct bkey_packed *out, if ((*p & mask) != mask) { *p += 1ULL << offset; - EBUG_ON(bkey_cmp_packed(b, out, &k) <= 0); + EBUG_ON(bch2_bkey_cmp_packed(b, out, &k) <= 0); return true; } @@ -1054,9 +1054,9 @@ int __bch2_bkey_cmp_left_packed_format_checked(const struct btree *b, } __pure __flatten -int __bch2_bkey_cmp_packed(const struct bkey_packed *l, - const struct bkey_packed *r, - const struct btree *b) +int bch2_bkey_cmp_packed(const struct btree *b, + const struct bkey_packed *l, + const struct bkey_packed *r) { struct bkey unpacked; diff --git a/fs/bcachefs/bkey.h b/fs/bcachefs/bkey.h index cbcfbd26bc58..2d2c640305e2 100644 --- a/fs/bcachefs/bkey.h +++ b/fs/bcachefs/bkey.h @@ -67,13 +67,6 @@ static inline void set_bkey_val_bytes(struct bkey *k, unsigned bytes) #define bkey_whiteout(_k) \ ((_k)->type == KEY_TYPE_deleted || (_k)->type == KEY_TYPE_discard) -#define bkey_packed_typecheck(_k) \ -({ \ - BUILD_BUG_ON(!type_is(_k, struct bkey *) && \ - !type_is(_k, struct bkey_packed *)); \ - type_is(_k, struct bkey_packed *); \ -}) - enum bkey_lr_packed { BKEY_PACKED_BOTH, BKEY_PACKED_RIGHT, @@ -81,9 +74,6 @@ enum bkey_lr_packed { BKEY_PACKED_NONE, }; -#define bkey_lr_packed_typecheck(_l, _r) \ - (!bkey_packed_typecheck(_l) + ((!bkey_packed_typecheck(_r)) << 1)) - #define bkey_lr_packed(_l, _r) \ ((_l)->format + ((_r)->format << 1)) @@ -132,9 +122,9 @@ int __bch2_bkey_cmp_left_packed_format_checked(const struct btree *, const struct bpos *); __pure -int __bch2_bkey_cmp_packed(const struct bkey_packed *, - const struct bkey_packed *, - const struct btree *); +int bch2_bkey_cmp_packed(const struct btree *, + const struct bkey_packed *, + const struct bkey_packed *); __pure int __bch2_bkey_cmp_left_packed(const struct btree *, @@ -160,37 +150,6 @@ static inline int bkey_cmp_left_packed_byval(const struct btree *b, return bkey_cmp_left_packed(b, l, &r); } -/* - * If @_l or @_r are struct bkey * (not bkey_packed *), uses type information to - * skip dispatching on k->format: - */ -#define bkey_cmp_packed(_b, _l, _r) \ -({ \ - int _cmp; \ - \ - switch (bkey_lr_packed_typecheck(_l, _r)) { \ - case BKEY_PACKED_NONE: \ - _cmp = bkey_cmp(((struct bkey *) (_l))->p, \ - ((struct bkey *) (_r))->p); \ - break; \ - case BKEY_PACKED_LEFT: \ - _cmp = bkey_cmp_left_packed((_b), \ - (struct bkey_packed *) (_l), \ - &((struct bkey *) (_r))->p); \ - break; \ - case BKEY_PACKED_RIGHT: \ - _cmp = -bkey_cmp_left_packed((_b), \ - (struct bkey_packed *) (_r), \ - &((struct bkey *) (_l))->p); \ - break; \ - case BKEY_PACKED_BOTH: \ - _cmp = __bch2_bkey_cmp_packed((void *) (_l), \ - (void *) (_r), (_b)); \ - break; \ - } \ - _cmp; \ -}) - #if 1 static __always_inline int bkey_cmp(struct bpos l, struct bpos r) { @@ -565,6 +524,7 @@ BKEY_VAL_ACCESSORS(reflink_p); BKEY_VAL_ACCESSORS(reflink_v); BKEY_VAL_ACCESSORS(inline_data); BKEY_VAL_ACCESSORS(btree_ptr_v2); +BKEY_VAL_ACCESSORS(indirect_inline_data); /* byte order helpers */ diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c index 36e0c5152b47..99b7fce2bfd3 100644 --- a/fs/bcachefs/bkey_methods.c +++ b/fs/bcachefs/bkey_methods.c @@ -72,7 +72,11 @@ static const char *key_type_inline_data_invalid(const struct bch_fs *c, 
static void key_type_inline_data_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) { - pr_buf(out, "(%zu bytes)", bkey_val_bytes(k.k)); + struct bkey_s_c_inline_data d = bkey_s_c_to_inline_data(k); + unsigned datalen = bkey_inline_data_bytes(k.k); + + pr_buf(out, "datalen %u: %*phN", + datalen, min(datalen, 32U), d.v->data); } #define bch2_bkey_ops_inline_data (struct bkey_ops) { \ @@ -232,7 +236,7 @@ enum merge_result bch2_bkey_merge(struct bch_fs *c, const struct bkey_ops *ops = &bch2_bkey_ops[l.k->type]; enum merge_result ret; - if (key_merging_disabled(c) || + if (bch2_key_merging_disabled || !ops->key_merge || l.k->type != r.k->type || bversion_cmp(l.k->version, r.k->version) || diff --git a/fs/bcachefs/bkey_sort.c b/fs/bcachefs/bkey_sort.c index 839e78d1dc35..99e0a4011fae 100644 --- a/fs/bcachefs/bkey_sort.c +++ b/fs/bcachefs/bkey_sort.c @@ -86,7 +86,7 @@ static inline int key_sort_fix_overlapping_cmp(struct btree *b, struct bkey_packed *l, struct bkey_packed *r) { - return bkey_cmp_packed(b, l, r) ?: + return bch2_bkey_cmp_packed(b, l, r) ?: cmp_int((unsigned long) l, (unsigned long) r); } @@ -98,7 +98,7 @@ static inline bool should_drop_next_key(struct sort_iter *iter) * and should be dropped. */ return iter->used >= 2 && - !bkey_cmp_packed(iter->b, + !bch2_bkey_cmp_packed(iter->b, iter->data[0].k, iter->data[1].k); } @@ -223,7 +223,7 @@ static inline int sort_keys_cmp(struct btree *b, struct bkey_packed *l, struct bkey_packed *r) { - return bkey_cmp_packed(b, l, r) ?: + return bch2_bkey_cmp_packed(b, l, r) ?: (int) bkey_deleted(r) - (int) bkey_deleted(l) ?: (int) l->needs_whiteout - (int) r->needs_whiteout; } @@ -245,7 +245,7 @@ unsigned bch2_sort_keys(struct bkey_packed *dst, continue; while ((next = sort_iter_peek(iter)) && - !bkey_cmp_packed(iter->b, in, next)) { + !bch2_bkey_cmp_packed(iter->b, in, next)) { BUG_ON(in->needs_whiteout && next->needs_whiteout); needs_whiteout |= in->needs_whiteout; @@ -406,7 +406,7 @@ static inline int sort_extents_cmp(struct btree *b, struct bkey_packed *l, struct bkey_packed *r) { - return bkey_cmp_packed(b, l, r) ?: + return bch2_bkey_cmp_packed(b, l, r) ?: (int) bkey_deleted(l) - (int) bkey_deleted(r); } diff --git a/fs/bcachefs/bset.c b/fs/bcachefs/bset.c index f7c2841ed8a7..26716657453f 100644 --- a/fs/bcachefs/bset.c +++ b/fs/bcachefs/bset.c @@ -369,10 +369,10 @@ static struct bkey_float *bkey_float(const struct btree *b, return ro_aux_tree_base(b, t)->f + idx; } -static void bset_aux_tree_verify(struct btree *b) +static void bset_aux_tree_verify(const struct btree *b) { #ifdef CONFIG_BCACHEFS_DEBUG - struct bset_tree *t; + const struct bset_tree *t; for_each_bset(b, t) { if (t->aux_data_offset == U16_MAX) @@ -388,15 +388,13 @@ static void bset_aux_tree_verify(struct btree *b) #endif } -void bch2_btree_keys_init(struct btree *b, bool *expensive_debug_checks) +void bch2_btree_keys_init(struct btree *b) { unsigned i; b->nsets = 0; memset(&b->nr, 0, sizeof(b->nr)); -#ifdef CONFIG_BCACHEFS_DEBUG - b->expensive_debug_checks = expensive_debug_checks; -#endif + for (i = 0; i < MAX_BSETS; i++) b->set[i].data_offset = U16_MAX; @@ -522,7 +520,7 @@ static void bch2_bset_verify_rw_aux_tree(struct btree *b, struct bkey_packed *k = btree_bkey_first(b, t); unsigned j = 0; - if (!btree_keys_expensive_checks(b)) + if (!bch2_expensive_debug_checks) return; BUG_ON(bset_has_ro_aux_tree(t)); @@ -710,20 +708,20 @@ static void make_bfloat(struct btree *b, struct bset_tree *t, } /* bytes remaining - only valid for last bset: */ -static unsigned 
__bset_tree_capacity(struct btree *b, struct bset_tree *t) +static unsigned __bset_tree_capacity(const struct btree *b, const struct bset_tree *t) { bset_aux_tree_verify(b); return btree_aux_data_bytes(b) - t->aux_data_offset * sizeof(u64); } -static unsigned bset_ro_tree_capacity(struct btree *b, struct bset_tree *t) +static unsigned bset_ro_tree_capacity(const struct btree *b, const struct bset_tree *t) { return __bset_tree_capacity(b, t) / (sizeof(struct bkey_float) + sizeof(u8)); } -static unsigned bset_rw_tree_capacity(struct btree *b, struct bset_tree *t) +static unsigned bset_rw_tree_capacity(const struct btree *b, const struct bset_tree *t) { return __bset_tree_capacity(b, t) / sizeof(struct rw_aux_tree); } @@ -922,7 +920,7 @@ struct bkey_packed *bch2_bkey_prev_filter(struct btree *b, k = p; } - if (btree_keys_expensive_checks(b)) { + if (bch2_expensive_debug_checks) { BUG_ON(ret >= orig_k); for (i = ret @@ -1227,8 +1225,8 @@ static inline bool bkey_mantissa_bits_dropped(const struct btree *b, __flatten static struct bkey_packed *bset_search_tree(const struct btree *b, - struct bset_tree *t, - struct bpos *search, + const struct bset_tree *t, + const struct bpos *search, const struct bkey_packed *packed_search) { struct ro_aux_tree *base = ro_aux_tree_base(b, t); @@ -1345,7 +1343,7 @@ struct bkey_packed *bch2_bset_search_linear(struct btree *b, bkey_iter_pos_cmp(b, m, search) < 0) m = bkey_next_skip_noops(m, btree_bkey_last(b, t)); - if (btree_keys_expensive_checks(b)) { + if (bch2_expensive_debug_checks) { struct bkey_packed *prev = bch2_bkey_prev_all(b, t, m); BUG_ON(prev && @@ -1601,7 +1599,7 @@ static inline void __bch2_btree_node_iter_advance(struct btree_node_iter *iter, void bch2_btree_node_iter_advance(struct btree_node_iter *iter, struct btree *b) { - if (btree_keys_expensive_checks(b)) { + if (bch2_expensive_debug_checks) { bch2_btree_node_iter_verify(iter, b); bch2_btree_node_iter_next_check(iter, b); } @@ -1620,7 +1618,7 @@ struct bkey_packed *bch2_btree_node_iter_prev_all(struct btree_node_iter *iter, struct bset_tree *t; unsigned end = 0; - if (btree_keys_expensive_checks(b)) + if (bch2_expensive_debug_checks) bch2_btree_node_iter_verify(iter, b); for_each_bset(b, t) { @@ -1656,7 +1654,7 @@ found: iter->data[0].k = __btree_node_key_to_offset(b, prev); iter->data[0].end = end; - if (btree_keys_expensive_checks(b)) + if (bch2_expensive_debug_checks) bch2_btree_node_iter_verify(iter, b); return prev; } diff --git a/fs/bcachefs/bset.h b/fs/bcachefs/bset.h index 5921cf689105..469294cc716c 100644 --- a/fs/bcachefs/bset.h +++ b/fs/bcachefs/bset.h @@ -5,7 +5,7 @@ #include <linux/kernel.h> #include <linux/types.h> -#include "bcachefs_format.h" +#include "bcachefs.h" #include "bkey.h" #include "bkey_methods.h" #include "btree_types.h" @@ -147,17 +147,6 @@ * first key in that range of bytes again. 
*/ -extern bool bch2_expensive_debug_checks; - -static inline bool btree_keys_expensive_checks(const struct btree *b) -{ -#ifdef CONFIG_BCACHEFS_DEBUG - return bch2_expensive_debug_checks || *b->expensive_debug_checks; -#else - return false; -#endif -} - enum bset_aux_tree_type { BSET_NO_AUX_TREE, BSET_RO_AUX_TREE, @@ -201,17 +190,17 @@ static inline enum bset_aux_tree_type bset_aux_tree_type(const struct bset_tree #define BSET_CACHELINE 128 -static inline size_t btree_keys_cachelines(struct btree *b) +static inline size_t btree_keys_cachelines(const struct btree *b) { return (1U << b->byte_order) / BSET_CACHELINE; } -static inline size_t btree_aux_data_bytes(struct btree *b) +static inline size_t btree_aux_data_bytes(const struct btree *b) { return btree_keys_cachelines(b) * 8; } -static inline size_t btree_aux_data_u64s(struct btree *b) +static inline size_t btree_aux_data_u64s(const struct btree *b) { return btree_aux_data_bytes(b) / sizeof(u64); } @@ -228,7 +217,7 @@ __bkey_unpack_key_format_checked(const struct btree *b, compiled_unpack_fn unpack_fn = b->aux_data; unpack_fn(dst, src); - if (btree_keys_expensive_checks(b)) { + if (bch2_expensive_debug_checks) { struct bkey dst2 = __bch2_bkey_unpack_key(&b->format, src); BUG_ON(memcmp(dst, &dst2, sizeof(*dst))); @@ -366,7 +355,7 @@ static inline struct bset *bset_next_set(struct btree *b, return ((void *) i) + round_up(vstruct_bytes(i), block_bytes); } -void bch2_btree_keys_init(struct btree *, bool *); +void bch2_btree_keys_init(struct btree *); void bch2_bset_init_first(struct btree *, struct bset *); void bch2_bset_init_next(struct bch_fs *, struct btree *, @@ -477,7 +466,7 @@ static inline int bkey_iter_cmp(const struct btree *b, const struct bkey_packed *l, const struct bkey_packed *r) { - return bkey_cmp_packed(b, l, r) + return bch2_bkey_cmp_packed(b, l, r) ?: (int) bkey_deleted(r) - (int) bkey_deleted(l) ?: cmp_int(l, r); } @@ -654,7 +643,7 @@ static inline void bch2_verify_insert_pos(struct btree *b, static inline void bch2_verify_btree_nr_keys(struct btree *b) { - if (btree_keys_expensive_checks(b)) + if (bch2_debug_check_btree_accounting) __bch2_verify_btree_nr_keys(b); } diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index 29a2065ad414..325a16615a06 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -81,8 +81,7 @@ static int btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp) if (!b->data) return -ENOMEM; - b->aux_data = __vmalloc(btree_aux_data_bytes(b), gfp, - PAGE_KERNEL_EXEC); + b->aux_data = vmalloc_exec(btree_aux_data_bytes(b), gfp); if (!b->aux_data) { kvpfree(b->data, btree_bytes(c)); b->data = NULL; @@ -212,7 +211,7 @@ static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush) * - unless btree verify mode is enabled, since it runs out of * the post write cleanup: */ - if (verify_btree_ondisk(c)) + if (bch2_verify_btree_ondisk) bch2_btree_node_write(c, b, SIX_LOCK_intent); else __bch2_btree_node_write(c, b, SIX_LOCK_read); @@ -253,9 +252,9 @@ static unsigned long bch2_btree_cache_scan(struct shrinker *shrink, unsigned long can_free; unsigned long touched = 0; unsigned long freed = 0; - unsigned i; + unsigned i, flags; - if (btree_shrinker_disabled(c)) + if (bch2_btree_shrinker_disabled) return SHRINK_STOP; /* Return -1 if we can't do anything right now */ @@ -264,6 +263,8 @@ static unsigned long bch2_btree_cache_scan(struct shrinker *shrink, else if (!mutex_trylock(&bc->lock)) return -1; + flags = memalloc_nofs_save(); + /* * It's _really_ 
critical that we don't free too many btree nodes - we * have to always leave ourselves a reserve. The reserve is how we @@ -327,6 +328,7 @@ restart: clear_btree_node_accessed(b); } + memalloc_nofs_restore(flags); mutex_unlock(&bc->lock); out: return (unsigned long) freed * btree_pages(c); @@ -339,7 +341,7 @@ static unsigned long bch2_btree_cache_count(struct shrinker *shrink, btree_cache.shrink); struct btree_cache *bc = &c->btree_cache; - if (btree_shrinker_disabled(c)) + if (bch2_btree_shrinker_disabled) return 0; return btree_cache_can_free(bc) * btree_pages(c); @@ -349,11 +351,13 @@ void bch2_fs_btree_cache_exit(struct bch_fs *c) { struct btree_cache *bc = &c->btree_cache; struct btree *b; - unsigned i; + unsigned i, flags; if (bc->shrink.list.next) unregister_shrinker(&bc->shrink); + /* vfree() can allocate memory: */ + flags = memalloc_nofs_save(); mutex_lock(&bc->lock); #ifdef CONFIG_BCACHEFS_DEBUG @@ -389,6 +393,7 @@ void bch2_fs_btree_cache_exit(struct bch_fs *c) } mutex_unlock(&bc->lock); + memalloc_nofs_restore(flags); if (bc->table_init_done) rhashtable_destroy(&bc->table); @@ -585,7 +590,7 @@ out: b->sib_u64s[0] = 0; b->sib_u64s[1] = 0; b->whiteout_u64s = 0; - bch2_btree_keys_init(b, &c->expensive_debug_checks); + bch2_btree_keys_init(b); bch2_time_stats_update(&c->times[BCH_TIME_btree_node_mem_alloc], start_time); @@ -700,7 +705,8 @@ static int lock_node_check_fn(struct six_lock *lock, void *p) */ struct btree *bch2_btree_node_get(struct bch_fs *c, struct btree_iter *iter, const struct bkey_i *k, unsigned level, - enum six_lock_type lock_type) + enum six_lock_type lock_type, + unsigned long trace_ip) { struct btree_cache *bc = &c->btree_cache; struct btree *b; @@ -762,7 +768,7 @@ lock_node: btree_node_unlock(iter, level + 1); if (!btree_node_lock(b, k->k.p, level, iter, lock_type, - lock_node_check_fn, (void *) k)) { + lock_node_check_fn, (void *) k, trace_ip)) { if (b->hash_val != btree_ptr_hash_val(k)) goto retry; return ERR_PTR(-EINTR); @@ -930,7 +936,7 @@ struct btree *bch2_btree_node_get_sibling(struct bch_fs *c, bch2_bkey_unpack(parent, &tmp.k, k); ret = bch2_btree_node_get(c, iter, &tmp.k, level, - SIX_LOCK_intent); + SIX_LOCK_intent, _THIS_IP_); if (PTR_ERR_OR_ZERO(ret) == -EINTR && !trans->nounlock) { struct btree_iter *linked; @@ -943,14 +949,14 @@ struct btree *bch2_btree_node_get_sibling(struct bch_fs *c, * holding other locks that would cause us to deadlock: */ trans_for_each_iter(trans, linked) - if (btree_iter_cmp(iter, linked) < 0) + if (btree_iter_lock_cmp(iter, linked) < 0) __bch2_btree_iter_unlock(linked); if (sib == btree_prev_sib) btree_node_unlock(iter, level); ret = bch2_btree_node_get(c, iter, &tmp.k, level, - SIX_LOCK_intent); + SIX_LOCK_intent, _THIS_IP_); /* * before btree_iter_relock() calls btree_iter_verify_locks(): diff --git a/fs/bcachefs/btree_cache.h b/fs/bcachefs/btree_cache.h index d0d3a85bb8be..8a19e60e9258 100644 --- a/fs/bcachefs/btree_cache.h +++ b/fs/bcachefs/btree_cache.h @@ -23,7 +23,7 @@ struct btree *bch2_btree_node_mem_alloc(struct bch_fs *); struct btree *bch2_btree_node_get(struct bch_fs *, struct btree_iter *, const struct bkey_i *, unsigned, - enum six_lock_type); + enum six_lock_type, unsigned long); struct btree *bch2_btree_node_get_noiter(struct bch_fs *, const struct bkey_i *, enum btree_id, unsigned); diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 4f581130270c..ba4acc112ed3 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -8,6 +8,7 @@ #include "alloc_background.h" #include 
"alloc_foreground.h" #include "bkey_methods.h" +#include "bkey_on_stack.h" #include "btree_locking.h" #include "btree_update_interior.h" #include "btree_io.h" @@ -36,9 +37,11 @@ static inline void __gc_pos_set(struct bch_fs *c, struct gc_pos new_pos) { + preempt_disable(); write_seqcount_begin(&c->gc_pos_lock); c->gc_pos = new_pos; write_seqcount_end(&c->gc_pos_lock); + preempt_enable(); } static inline void gc_pos_set(struct bch_fs *c, struct gc_pos new_pos) @@ -98,7 +101,7 @@ static int bch2_gc_mark_key(struct bch_fs *c, struct bkey_s_c k, int ret = 0; if (initial) { - BUG_ON(journal_seq_verify(c) && + BUG_ON(bch2_journal_seq_verify && k.k->version.lo > journal_cur_seq(&c->journal)); /* XXX change to fsck check */ @@ -206,7 +209,7 @@ static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id, struct btree_iter *iter; struct btree *b; unsigned depth = metadata_only ? 1 - : expensive_debug_checks(c) ? 0 + : bch2_expensive_debug_checks ? 0 : !btree_node_type_needs_gc(btree_id) ? 1 : 0; u8 max_stale = 0; @@ -233,8 +236,8 @@ static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id, BTREE_INSERT_USE_RESERVE| BTREE_INSERT_NOWAIT| BTREE_INSERT_GC_LOCK_HELD); - else if (!btree_gc_rewrite_disabled(c) && - (btree_gc_always_rewrite(c) || max_stale > 16)) + else if (!bch2_btree_gc_rewrite_disabled && + (bch2_btree_gc_always_rewrite || max_stale > 16)) bch2_btree_node_rewrite(c, iter, b->data->keys.seq, BTREE_INSERT_NOWAIT| @@ -325,7 +328,7 @@ static int bch2_gc_btree_init(struct bch_fs *c, { struct btree *b; unsigned target_depth = metadata_only ? 1 - : expensive_debug_checks(c) ? 0 + : bch2_expensive_debug_checks ? 0 : !btree_node_type_needs_gc(btree_id) ? 1 : 0; u8 max_stale = 0; @@ -567,6 +570,7 @@ static int bch2_gc_done(struct bch_fs *c, fsck_err(c, _msg ": got %llu, should be %llu" \ , ##__VA_ARGS__, dst->_f, src->_f); \ dst->_f = src->_f; \ + ret = 1; \ } #define copy_stripe_field(_f, _msg, ...) \ if (dst->_f != src->_f) { \ @@ -577,6 +581,7 @@ static int bch2_gc_done(struct bch_fs *c, dst->_f, src->_f); \ dst->_f = src->_f; \ dst->dirty = true; \ + ret = 1; \ } #define copy_bucket_field(_f) \ if (dst->b[b].mark._f != src->b[b].mark._f) { \ @@ -587,6 +592,7 @@ static int bch2_gc_done(struct bch_fs *c, bch2_data_types[dst->b[b].mark.data_type],\ dst->b[b].mark._f, src->b[b].mark._f); \ dst->b[b]._mark._f = src->b[b].mark._f; \ + ret = 1; \ } #define copy_dev_field(_f, _msg, ...) 
\ copy_field(_f, "dev %u has wrong " _msg, i, ##__VA_ARGS__) @@ -829,7 +835,7 @@ again: out: if (!ret && (test_bit(BCH_FS_FIXED_GENS, &c->flags) || - (!iter && test_restart_gc(c)))) { + (!iter && bch2_test_restart_gc))) { /* * XXX: make sure gens we fixed got saved */ @@ -888,40 +894,77 @@ out: return ret; } +static bool gc_btree_gens_key(struct bch_fs *c, struct bkey_s_c k) +{ + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const struct bch_extent_ptr *ptr; + + percpu_down_read(&c->mark_lock); + bkey_for_each_ptr(ptrs, ptr) { + struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); + struct bucket *g = PTR_BUCKET(ca, ptr, false); + + if (gen_after(g->mark.gen, ptr->gen) > 16) { + percpu_up_read(&c->mark_lock); + return true; + } + } + + bkey_for_each_ptr(ptrs, ptr) { + struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); + struct bucket *g = PTR_BUCKET(ca, ptr, false); + + if (gen_after(g->gc_gen, ptr->gen)) + g->gc_gen = ptr->gen; + } + percpu_up_read(&c->mark_lock); + + return false; +} + /* * For recalculating oldest gen, we only need to walk keys in leaf nodes; btree * node pointers currently never have cached pointers that can become stale: */ -static int bch2_gc_btree_gens(struct bch_fs *c, enum btree_id id) +static int bch2_gc_btree_gens(struct bch_fs *c, enum btree_id btree_id) { struct btree_trans trans; struct btree_iter *iter; struct bkey_s_c k; - int ret; + struct bkey_on_stack sk; + int ret = 0; + bkey_on_stack_init(&sk); bch2_trans_init(&trans, c, 0, 0); - for_each_btree_key(&trans, iter, id, POS_MIN, BTREE_ITER_PREFETCH, k, ret) { - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - const struct bch_extent_ptr *ptr; + iter = bch2_trans_get_iter(&trans, btree_id, POS_MIN, + BTREE_ITER_PREFETCH); - percpu_down_read(&c->mark_lock); - bkey_for_each_ptr(ptrs, ptr) { - struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); - struct bucket *g = PTR_BUCKET(ca, ptr, false); + while ((k = bch2_btree_iter_peek(iter)).k && + !(ret = bkey_err(k))) { + if (gc_btree_gens_key(c, k)) { + bkey_on_stack_reassemble(&sk, c, k); + bch2_extent_normalize(c, bkey_i_to_s(sk.k)); - if (gen_after(g->gc_gen, ptr->gen)) - g->gc_gen = ptr->gen; + bch2_btree_iter_set_pos(iter, bkey_start_pos(&sk.k->k)); - if (gen_after(g->mark.gen, ptr->gen) > 32) { - /* rewrite btree node */ + bch2_trans_update(&trans, iter, sk.k, 0); + ret = bch2_trans_commit(&trans, NULL, NULL, + BTREE_INSERT_NOFAIL); + if (ret == -EINTR) + continue; + if (ret) { + break; } } - percpu_up_read(&c->mark_lock); + + bch2_btree_iter_next(iter); } bch2_trans_exit(&trans); + bkey_on_stack_exit(&sk, c); + return ret; } @@ -1356,7 +1399,7 @@ static int bch2_gc_thread(void *arg) #else ret = bch2_gc_gens(c); #endif - if (ret) + if (ret < 0) bch_err(c, "btree gc failed: %i", ret); debug_check_no_locks_held(); diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index 2f5097218f9c..10a00085cdd6 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -42,7 +42,7 @@ static void verify_no_dups(struct btree *b, BUG_ON(extents ? bkey_cmp(l.p, bkey_start_pos(&r)) > 0 : bkey_cmp(l.p, bkey_start_pos(&r)) >= 0); - //BUG_ON(bkey_cmp_packed(&b->format, p, k) >= 0); + //BUG_ON(bch2_bkey_cmp_packed(&b->format, p, k) >= 0); } #endif } @@ -102,14 +102,14 @@ static void sort_bkey_ptrs(const struct btree *bt, break; for (b = a; c = 2 * b + 1, (d = c + 1) < n;) - b = bkey_cmp_packed(bt, + b = bch2_bkey_cmp_packed(bt, ptrs[c], ptrs[d]) >= 0 ? 
c : d; if (d == n) b = c; while (b != a && - bkey_cmp_packed(bt, + bch2_bkey_cmp_packed(bt, ptrs[a], ptrs[b]) >= 0) b = (b - 1) / 2; @@ -750,7 +750,9 @@ static int validate_bset(struct bch_fs *c, struct btree *b, btree_err_on(bkey_cmp(bn->max_key, b->key.k.p), BTREE_ERR_MUST_RETRY, c, b, i, - "incorrect max key"); + "incorrect max key %llu:%llu", + bn->max_key.inode, + bn->max_key.offset); if (write) compat_btree_node(b->c.level, b->c.btree_id, version, @@ -930,7 +932,8 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry btree_err_on(!bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i)), BTREE_ERR_WANT_RETRY, c, b, i, - "unknown checksum type"); + "unknown checksum type %llu", + BSET_CSUM_TYPE(i)); nonce = btree_nonce(i, b->written << 9); csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, b->data); @@ -957,7 +960,8 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry btree_err_on(!bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i)), BTREE_ERR_WANT_RETRY, c, b, i, - "unknown checksum type"); + "unknown checksum type %llu", + BSET_CSUM_TYPE(i)); nonce = btree_nonce(i, b->written << 9); csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne); @@ -1040,7 +1044,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry const char *invalid = bch2_bkey_val_invalid(c, u.s_c); if (invalid || - (inject_invalid_keys(c) && + (bch2_inject_invalid_keys && !bversion_cmp(u.k->version, MAX_VERSION))) { char buf[160]; diff --git a/fs/bcachefs/btree_io.h b/fs/bcachefs/btree_io.h index b859a067c78b..626d0f071b70 100644 --- a/fs/bcachefs/btree_io.h +++ b/fs/bcachefs/btree_io.h @@ -104,7 +104,7 @@ static inline void bset_encrypt(struct bch_fs *c, struct bset *i, unsigned offse bch2_encrypt(c, BSET_CSUM_TYPE(i), nonce, &bn->flags, bytes); - nonce = nonce_add(nonce, round_up(bytes, CHACHA20_BLOCK_SIZE)); + nonce = nonce_add(nonce, round_up(bytes, CHACHA_BLOCK_SIZE)); } bch2_encrypt(c, BSET_CSUM_TYPE(i), nonce, i->_data, diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 6fab76c3220c..58f1a3dd97d3 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -197,13 +197,13 @@ static struct bpos btree_node_pos(struct btree_bkey_cached_common *_b, bool __bch2_btree_node_lock(struct btree *b, struct bpos pos, unsigned level, struct btree_iter *iter, enum six_lock_type type, - six_lock_should_sleep_fn should_sleep_fn, - void *p) + six_lock_should_sleep_fn should_sleep_fn, void *p, + unsigned long ip) { struct btree_trans *trans = iter->trans; - struct btree_iter *linked; + struct btree_iter *linked, *deadlock_iter = NULL; u64 start_time = local_clock(); - bool ret = true; + unsigned reason = 9; /* Check if it's safe to block: */ trans_for_each_iter(trans, linked) { @@ -228,11 +228,34 @@ bool __bch2_btree_node_lock(struct btree *b, struct bpos pos, linked->locks_want = max_t(unsigned, linked->locks_want, __fls(linked->nodes_locked) + 1); - if (!btree_iter_get_locks(linked, true, false)) - ret = false; + if (!btree_iter_get_locks(linked, true, false)) { + deadlock_iter = linked; + reason = 1; + } } else { - ret = false; + deadlock_iter = linked; + reason = 2; + } + } + + if (linked->btree_id != iter->btree_id) { + if (linked->btree_id > iter->btree_id) { + deadlock_iter = linked; + reason = 3; + } + continue; + } + + /* + * Within the same btree, cached iterators come before non + * cached iterators: + */ + if (btree_iter_is_cached(linked) != btree_iter_is_cached(iter)) { + if (btree_iter_is_cached(iter)) { + 
deadlock_iter = linked; + reason = 4; } + continue; } /* @@ -240,30 +263,29 @@ bool __bch2_btree_node_lock(struct btree *b, struct bpos pos, * another iterator has possible descendants locked of the node * we're about to lock, it must have the ancestors locked too: */ - if (linked->btree_id == iter->btree_id && - level > __fls(linked->nodes_locked)) { + if (level > __fls(linked->nodes_locked)) { if (!(trans->nounlock)) { linked->locks_want = max(level + 1, max_t(unsigned, linked->locks_want, iter->locks_want)); - if (!btree_iter_get_locks(linked, true, false)) - ret = false; + if (!btree_iter_get_locks(linked, true, false)) { + deadlock_iter = linked; + reason = 5; + } } else { - ret = false; + deadlock_iter = linked; + reason = 6; } } /* Must lock btree nodes in key order: */ - if ((cmp_int(iter->btree_id, linked->btree_id) ?: - -cmp_int(btree_iter_type(iter), btree_iter_type(linked))) < 0) - ret = false; - - if (iter->btree_id == linked->btree_id && - btree_node_locked(linked, level) && + if (btree_node_locked(linked, level) && bkey_cmp(pos, btree_node_pos((void *) linked->l[level].b, - btree_iter_type(linked))) <= 0) - ret = false; + btree_iter_type(linked))) <= 0) { + deadlock_iter = linked; + reason = 7; + } /* * Recheck if this is a node we already have locked - since one @@ -277,8 +299,13 @@ bool __bch2_btree_node_lock(struct btree *b, struct bpos pos, } } - if (unlikely(!ret)) { - trace_trans_restart_would_deadlock(iter->trans->ip); + if (unlikely(deadlock_iter)) { + trace_trans_restart_would_deadlock(iter->trans->ip, ip, + reason, + deadlock_iter->btree_id, + btree_iter_type(deadlock_iter), + iter->btree_id, + btree_iter_type(iter)); return false; } @@ -471,7 +498,7 @@ static void bch2_btree_iter_verify_level(struct btree_iter *iter, char buf1[100], buf2[100]; const char *msg; - if (!debug_check_iterators(iter->trans->c)) + if (!bch2_debug_check_iterators) return; if (btree_iter_type(iter) == BTREE_ITER_CACHED) { @@ -567,7 +594,7 @@ void bch2_btree_trans_verify_iters(struct btree_trans *trans, struct btree *b) { struct btree_iter *iter; - if (!debug_check_iterators(trans->c)) + if (!bch2_debug_check_iterators) return; trans_for_each_iter_with_node(trans, b, iter) @@ -739,7 +766,7 @@ void bch2_btree_node_iter_fix(struct btree_iter *iter, __bch2_btree_node_iter_fix(iter, b, node_iter, t, where, clobber_u64s, new_u64s); - if (debug_check_iterators(iter->trans->c)) + if (bch2_debug_check_iterators) bch2_btree_node_iter_verify(node_iter, b); } @@ -769,7 +796,7 @@ static inline struct bkey_s_c __btree_iter_unpack(struct btree_iter *iter, ret = bkey_disassemble(l->b, k, u); - if (debug_check_bkeys(iter->trans->c)) + if (bch2_debug_check_bkeys) bch2_bkey_debugcheck(iter->trans->c, l->b, ret); return ret; @@ -945,7 +972,8 @@ static int lock_root_check_fn(struct six_lock *lock, void *p) } static inline int btree_iter_lock_root(struct btree_iter *iter, - unsigned depth_want) + unsigned depth_want, + unsigned long trace_ip) { struct bch_fs *c = iter->trans->c; struct btree *b, **rootp = &c->btree_roots[iter->btree_id].b; @@ -974,7 +1002,8 @@ static inline int btree_iter_lock_root(struct btree_iter *iter, lock_type = __btree_lock_want(iter, iter->level); if (unlikely(!btree_node_lock(b, POS_MAX, iter->level, iter, lock_type, - lock_root_check_fn, rootp))) + lock_root_check_fn, rootp, + trace_ip))) return -EINTR; if (likely(b == READ_ONCE(*rootp) && @@ -1046,7 +1075,8 @@ static noinline void btree_node_mem_ptr_set(struct btree_iter *iter, btree_node_unlock(iter, plevel); } -static 
__always_inline int btree_iter_down(struct btree_iter *iter) +static __always_inline int btree_iter_down(struct btree_iter *iter, + unsigned long trace_ip) { struct bch_fs *c = iter->trans->c; struct btree_iter_level *l = &iter->l[iter->level]; @@ -1060,7 +1090,7 @@ static __always_inline int btree_iter_down(struct btree_iter *iter) bch2_bkey_unpack(l->b, &tmp.k, bch2_btree_node_iter_peek(&l->iter, l->b)); - b = bch2_btree_node_get(c, iter, &tmp.k, level, lock_type); + b = bch2_btree_node_get(c, iter, &tmp.k, level, lock_type, trace_ip); if (unlikely(IS_ERR(b))) return PTR_ERR(b); @@ -1084,7 +1114,7 @@ static void btree_iter_up(struct btree_iter *iter) btree_node_unlock(iter, iter->level++); } -static int btree_iter_traverse_one(struct btree_iter *); +static int btree_iter_traverse_one(struct btree_iter *, unsigned long); static int __btree_iter_traverse_all(struct btree_trans *trans, int ret) { @@ -1104,11 +1134,12 @@ retry_all: sorted[nr_sorted++] = iter->idx; #define btree_iter_cmp_by_idx(_l, _r) \ - btree_iter_cmp(&trans->iters[_l], &trans->iters[_r]) + btree_iter_lock_cmp(&trans->iters[_l], &trans->iters[_r]) bubble_sort(sorted, nr_sorted, btree_iter_cmp_by_idx); #undef btree_iter_cmp_by_idx bch2_trans_unlock(trans); + cond_resched(); if (unlikely(ret == -ENOMEM)) { struct closure cl; @@ -1139,7 +1170,7 @@ retry_all: if (!(trans->iters_linked & (1ULL << idx))) continue; - ret = btree_iter_traverse_one(&trans->iters[idx]); + ret = btree_iter_traverse_one(&trans->iters[idx], _THIS_IP_); if (ret) goto retry_all; } @@ -1202,7 +1233,8 @@ static inline unsigned btree_iter_up_until_good_node(struct btree_iter *iter, * On error, caller (peek_node()/peek_key()) must return NULL; the error is * stashed in the iterator and returned from bch2_trans_exit(). */ -static int btree_iter_traverse_one(struct btree_iter *iter) +static int btree_iter_traverse_one(struct btree_iter *iter, + unsigned long trace_ip) { unsigned depth_want = iter->level; @@ -1249,8 +1281,8 @@ static int btree_iter_traverse_one(struct btree_iter *iter) */ while (iter->level > depth_want) { int ret = btree_iter_node(iter, iter->level) - ? btree_iter_down(iter) - : btree_iter_lock_root(iter, depth_want); + ? 
btree_iter_down(iter, trace_ip) + : btree_iter_lock_root(iter, depth_want, trace_ip); if (unlikely(ret)) { if (ret == 1) return 0; @@ -1281,7 +1313,7 @@ int __must_check __bch2_btree_iter_traverse(struct btree_iter *iter) int ret; ret = bch2_trans_cond_resched(trans) ?: - btree_iter_traverse_one(iter); + btree_iter_traverse_one(iter, _RET_IP_); if (unlikely(ret)) ret = __btree_iter_traverse_all(trans, ret); @@ -1545,13 +1577,13 @@ static inline struct bkey_s_c btree_iter_peek_uptodate(struct btree_iter *iter) ret.v = bkeyp_val(&l->b->format, _k); - if (debug_check_iterators(iter->trans->c)) { + if (bch2_debug_check_iterators) { struct bkey k = bkey_unpack_key(l->b, _k); BUG_ON(memcmp(&k, &iter->k, sizeof(k))); } - if (debug_check_bkeys(iter->trans->c)) + if (bch2_debug_check_bkeys) bch2_bkey_debugcheck(iter->trans->c, l->b, ret); } @@ -1970,6 +2002,7 @@ int bch2_trans_iter_free(struct btree_trans *trans, return bch2_trans_iter_put(trans, iter); } +#if 0 static int bch2_trans_realloc_iters(struct btree_trans *trans, unsigned new_size) { @@ -2018,8 +2051,7 @@ success: sizeof(struct btree_iter) * trans->nr_iters + sizeof(struct btree_insert_entry) * trans->nr_iters); - if (trans->iters != trans->iters_onstack) - kfree(trans->iters); + kfree(trans->iters); trans->iters = new_iters; trans->updates = new_updates; @@ -2033,6 +2065,7 @@ success: return 0; } +#endif static struct btree_iter *btree_trans_iter_alloc(struct btree_trans *trans) { @@ -2042,28 +2075,27 @@ static struct btree_iter *btree_trans_iter_alloc(struct btree_trans *trans) goto got_slot; if (trans->nr_iters == trans->size) { - int ret; - - if (trans->nr_iters >= BTREE_ITER_MAX) { - struct btree_iter *iter; - - trans_for_each_iter(trans, iter) { - pr_err("iter: btree %s pos %llu:%llu%s%s%s %ps", - bch2_btree_ids[iter->btree_id], - iter->pos.inode, - iter->pos.offset, - (trans->iters_live & (1ULL << iter->idx)) ? " live" : "", - (trans->iters_touched & (1ULL << iter->idx)) ? " touched" : "", - iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT ? " keep" : "", - (void *) iter->ip_allocated); - } + struct btree_iter *iter; - panic("trans iter oveflow\n"); + BUG_ON(trans->size < BTREE_ITER_MAX); + + trans_for_each_iter(trans, iter) { + pr_err("iter: btree %s pos %llu:%llu%s%s%s %ps", + bch2_btree_ids[iter->btree_id], + iter->pos.inode, + iter->pos.offset, + (trans->iters_live & (1ULL << iter->idx)) ? " live" : "", + (trans->iters_touched & (1ULL << iter->idx)) ? " touched" : "", + iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT ? 
" keep" : "", + (void *) iter->ip_allocated); } + panic("trans iter oveflow\n"); +#if 0 ret = bch2_trans_realloc_iters(trans, trans->size * 2); if (ret) return ERR_PTR(ret); +#endif } idx = trans->nr_iters++; @@ -2305,28 +2337,37 @@ void bch2_trans_reset(struct btree_trans *trans, unsigned flags) bch2_btree_iter_traverse_all(trans); } +static void bch2_trans_alloc_iters(struct btree_trans *trans, struct bch_fs *c) +{ + unsigned new_size = BTREE_ITER_MAX; + size_t iters_bytes = sizeof(struct btree_iter) * new_size; + size_t updates_bytes = sizeof(struct btree_insert_entry) * new_size; + void *p; + + BUG_ON(trans->used_mempool); + + p = this_cpu_xchg(c->btree_iters_bufs->iter, NULL) ?: + mempool_alloc(&trans->c->btree_iters_pool, GFP_NOFS); + + trans->iters = p; p += iters_bytes; + trans->updates = p; p += updates_bytes; + trans->updates2 = p; p += updates_bytes; + trans->size = new_size; +} + void bch2_trans_init(struct btree_trans *trans, struct bch_fs *c, unsigned expected_nr_iters, size_t expected_mem_bytes) { - memset(trans, 0, offsetof(struct btree_trans, iters_onstack)); + memset(trans, 0, sizeof(*trans)); + trans->c = c; + trans->ip = _RET_IP_; /* * reallocating iterators currently completely breaks - * bch2_trans_iter_put(): + * bch2_trans_iter_put(), we always allocate the max: */ - expected_nr_iters = BTREE_ITER_MAX; - - trans->c = c; - trans->ip = _RET_IP_; - trans->size = ARRAY_SIZE(trans->iters_onstack); - trans->iters = trans->iters_onstack; - trans->updates = trans->updates_onstack; - trans->updates2 = trans->updates2_onstack; - trans->fs_usage_deltas = NULL; - - if (expected_nr_iters > trans->size) - bch2_trans_realloc_iters(trans, expected_nr_iters); + bch2_trans_alloc_iters(trans, c); if (expected_mem_bytes) bch2_trans_preload_mem(trans, expected_mem_bytes); @@ -2341,6 +2382,8 @@ void bch2_trans_init(struct btree_trans *trans, struct bch_fs *c, int bch2_trans_exit(struct btree_trans *trans) { + struct bch_fs *c = trans->c; + bch2_trans_unlock(trans); #ifdef CONFIG_BCACHEFS_DEBUG @@ -2353,19 +2396,21 @@ int bch2_trans_exit(struct btree_trans *trans) kfree(trans->fs_usage_deltas); kfree(trans->mem); - if (trans->used_mempool) + + trans->iters = this_cpu_xchg(c->btree_iters_bufs->iter, trans->iters); + if (trans->iters) mempool_free(trans->iters, &trans->c->btree_iters_pool); - else if (trans->iters != trans->iters_onstack) - kfree(trans->iters); + trans->mem = (void *) 0x1; trans->iters = (void *) 0x1; return trans->error ? 
-EIO : 0; } -static void bch2_btree_iter_node_to_text(struct printbuf *out, - struct btree_bkey_cached_common *_b, - enum btree_iter_type type) +static void __maybe_unused +bch2_btree_iter_node_to_text(struct printbuf *out, + struct btree_bkey_cached_common *_b, + enum btree_iter_type type) { pr_buf(out, " %px l=%u %s:", _b, _b->level, bch2_btree_ids[_b->btree_id]); diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index bd9ec3ec9a92..f7a73619c85b 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -177,11 +177,12 @@ void bch2_btree_iter_set_pos_same_leaf(struct btree_iter *, struct bpos); void __bch2_btree_iter_set_pos(struct btree_iter *, struct bpos, bool); void bch2_btree_iter_set_pos(struct btree_iter *, struct bpos); -static inline int btree_iter_cmp(const struct btree_iter *l, - const struct btree_iter *r) +/* Sort order for locking btree iterators: */ +static inline int btree_iter_lock_cmp(const struct btree_iter *l, + const struct btree_iter *r) { return cmp_int(l->btree_id, r->btree_id) ?: - -cmp_int(btree_iter_type(l), btree_iter_type(r)) ?: + -cmp_int(btree_iter_is_cached(l), btree_iter_is_cached(r)) ?: bkey_cmp(l->pos, r->pos); } diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c index 61662750dfc0..0ee4f78ce67a 100644 --- a/fs/bcachefs/btree_key_cache.c +++ b/fs/bcachefs/btree_key_cache.c @@ -29,8 +29,8 @@ static const struct rhashtable_params bch2_btree_key_cache_params = { }; __flatten -static inline struct bkey_cached * -btree_key_cache_find(struct bch_fs *c, enum btree_id btree_id, struct bpos pos) +inline struct bkey_cached * +bch2_btree_key_cache_find(struct bch_fs *c, enum btree_id btree_id, struct bpos pos) { struct bkey_cached_key key = { .btree_id = btree_id, @@ -204,6 +204,7 @@ static int bkey_cached_check_fn(struct six_lock *lock, void *p) !bkey_cmp(ck->key.pos, iter->pos) ? 
0 : -1; } +__flatten int bch2_btree_iter_traverse_cached(struct btree_iter *iter) { struct btree_trans *trans = iter->trans; @@ -218,7 +219,7 @@ int bch2_btree_iter_traverse_cached(struct btree_iter *iter) goto fill; } retry: - ck = btree_key_cache_find(c, iter->btree_id, iter->pos); + ck = bch2_btree_key_cache_find(c, iter->btree_id, iter->pos); if (!ck) { if (iter->flags & BTREE_ITER_CACHED_NOCREATE) { iter->l[0].b = NULL; @@ -242,7 +243,7 @@ retry: enum six_lock_type lock_want = __btree_lock_want(iter, 0); if (!btree_node_lock((void *) ck, iter->pos, 0, iter, lock_want, - bkey_cached_check_fn, iter)) { + bkey_cached_check_fn, iter, _THIS_IP_)) { if (ck->key.btree_id != iter->btree_id || bkey_cmp(ck->key.pos, iter->pos)) { goto retry; @@ -415,7 +416,7 @@ int bch2_btree_key_cache_flush(struct btree_trans *trans, struct bkey_cached_key key = { id, pos }; /* Fastpath - assume it won't be found: */ - if (!btree_key_cache_find(c, id, pos)) + if (!bch2_btree_key_cache_find(c, id, pos)) return 0; return btree_key_cache_flush_pos(trans, key, 0, true); @@ -462,7 +463,7 @@ bool bch2_btree_insert_key_cached(struct btree_trans *trans, void bch2_btree_key_cache_verify_clean(struct btree_trans *trans, enum btree_id id, struct bpos pos) { - BUG_ON(btree_key_cache_find(trans->c, id, pos)); + BUG_ON(bch2_btree_key_cache_find(trans->c, id, pos)); } #endif diff --git a/fs/bcachefs/btree_key_cache.h b/fs/bcachefs/btree_key_cache.h index b1756c6c622c..d448264abcc8 100644 --- a/fs/bcachefs/btree_key_cache.h +++ b/fs/bcachefs/btree_key_cache.h @@ -1,6 +1,9 @@ #ifndef _BCACHEFS_BTREE_KEY_CACHE_H #define _BCACHEFS_BTREE_KEY_CACHE_H +struct bkey_cached * +bch2_btree_key_cache_find(struct bch_fs *, enum btree_id, struct bpos); + int bch2_btree_iter_traverse_cached(struct btree_iter *); bool bch2_btree_insert_key_cached(struct btree_trans *, diff --git a/fs/bcachefs/btree_locking.h b/fs/bcachefs/btree_locking.h index 81fbf3e18647..38323e32731f 100644 --- a/fs/bcachefs/btree_locking.h +++ b/fs/bcachefs/btree_locking.h @@ -176,13 +176,15 @@ static inline bool btree_node_lock_increment(struct btree_trans *trans, bool __bch2_btree_node_lock(struct btree *, struct bpos, unsigned, struct btree_iter *, enum six_lock_type, - six_lock_should_sleep_fn, void *); + six_lock_should_sleep_fn, void *, + unsigned long); static inline bool btree_node_lock(struct btree *b, struct bpos pos, unsigned level, struct btree_iter *iter, enum six_lock_type type, - six_lock_should_sleep_fn should_sleep_fn, void *p) + six_lock_should_sleep_fn should_sleep_fn, void *p, + unsigned long ip) { struct btree_trans *trans = iter->trans; bool ret; @@ -200,7 +202,7 @@ static inline bool btree_node_lock(struct btree *b, ret = likely(six_trylock_type(&b->c.lock, type)) || btree_node_lock_increment(trans, b, level, type) || __bch2_btree_node_lock(b, pos, level, iter, type, - should_sleep_fn, p); + should_sleep_fn, p, ip); #ifdef CONFIG_BCACHEFS_DEBUG trans->locking = NULL; diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index 683b416ef427..93721fbc7794 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -130,10 +130,6 @@ struct btree { struct btree_write writes[2]; -#ifdef CONFIG_BCACHEFS_DEBUG - bool *expensive_debug_checks; -#endif - /* Key/pointer for this btree node */ __BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX); }; @@ -283,6 +279,11 @@ btree_iter_type(const struct btree_iter *iter) return iter->flags & BTREE_ITER_TYPE; } +static inline bool btree_iter_is_cached(const struct btree_iter *iter) +{ + return 
btree_iter_type(iter) == BTREE_ITER_CACHED; +} + static inline struct btree_iter_level *iter_l(struct btree_iter *iter) { return iter->l + iter->level; @@ -380,10 +381,6 @@ struct btree_trans { unsigned journal_u64s; unsigned journal_preres_u64s; struct replicas_delta_list *fs_usage_deltas; - - struct btree_iter iters_onstack[2]; - struct btree_insert_entry updates_onstack[2]; - struct btree_insert_entry updates2_onstack[2]; }; #define BTREE_FLAG(flag) \ @@ -591,6 +588,7 @@ static inline bool btree_iter_is_extents(struct btree_iter *iter) #define BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS \ ((1U << BKEY_TYPE_EXTENTS)| \ (1U << BKEY_TYPE_INODES)| \ + (1U << BKEY_TYPE_EC)| \ (1U << BKEY_TYPE_REFLINK)) enum btree_trigger_flags { @@ -602,7 +600,6 @@ enum btree_trigger_flags { __BTREE_TRIGGER_GC, __BTREE_TRIGGER_BUCKET_INVALIDATE, - __BTREE_TRIGGER_ALLOC_READ, __BTREE_TRIGGER_NOATOMIC, }; @@ -614,7 +611,6 @@ enum btree_trigger_flags { #define BTREE_TRIGGER_GC (1U << __BTREE_TRIGGER_GC) #define BTREE_TRIGGER_BUCKET_INVALIDATE (1U << __BTREE_TRIGGER_BUCKET_INVALIDATE) -#define BTREE_TRIGGER_ALLOC_READ (1U << __BTREE_TRIGGER_ALLOC_READ) #define BTREE_TRIGGER_NOATOMIC (1U << __BTREE_TRIGGER_NOATOMIC) static inline bool btree_node_type_needs_gc(enum btree_node_type type) diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index a2604b0ce2d8..4ddd1697ffde 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -1313,7 +1313,7 @@ bch2_btree_insert_keys_interior(struct btree_update *as, struct btree *b, * the node the iterator points to: */ while ((k = bch2_btree_node_iter_prev_all(&node_iter, b)) && - (bkey_cmp_packed(b, k, &insert->k) >= 0)) + (bkey_cmp_left_packed(b, k, &insert->k.p) >= 0)) ; for_each_keylist_key(keys, insert) diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index cd699c257244..e386f8ed3922 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -72,7 +72,7 @@ bool bch2_btree_bset_insert_key(struct btree_iter *iter, EBUG_ON(iter->flags & BTREE_ITER_IS_EXTENTS); k = bch2_btree_node_iter_peek_all(node_iter, b); - if (k && bkey_cmp_packed(b, k, &insert->k)) + if (k && bkey_cmp_left_packed(b, k, &insert->k.p)) k = NULL; /* @k is the key being overwritten/deleted, if any: */ @@ -220,7 +220,7 @@ static inline void btree_insert_entry_checks(struct btree_trans *trans, struct bch_fs *c = trans->c; BUG_ON(bkey_cmp(insert->k.p, iter->pos)); - BUG_ON(debug_check_bkeys(c) && + BUG_ON(bch2_debug_check_bkeys && bch2_bkey_invalid(c, bkey_i_to_s_c(insert), __btree_node_type(iter->level, iter->btree_id))); } @@ -337,8 +337,9 @@ static inline bool iter_has_trans_triggers(struct btree_iter *iter) static inline bool iter_has_nontrans_triggers(struct btree_iter *iter) { - return (BTREE_NODE_TYPE_HAS_TRIGGERS & - ~BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS) & + return (((BTREE_NODE_TYPE_HAS_TRIGGERS & + ~BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS)) | + (1U << BTREE_ID_EC)) & (1U << iter->btree_id); } @@ -439,10 +440,10 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, */ if (!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY)) { - if (journal_seq_verify(c)) + if (bch2_journal_seq_verify) trans_for_each_update2(trans, i) i->k->k.version.lo = trans->journal_res.seq; - else if (inject_invalid_keys(c)) + else if (bch2_inject_invalid_keys) trans_for_each_update2(trans, i) i->k->k.version = MAX_VERSION; } @@ -679,6 +680,13 @@ bch2_trans_commit_get_rw_cold(struct btree_trans *trans) return 0; } 
+static inline int btree_iter_pos_cmp(const struct btree_iter *l, + const struct btree_iter *r) +{ + return cmp_int(l->btree_id, r->btree_id) ?: + bkey_cmp(l->pos, r->pos); +} + static void bch2_trans_update2(struct btree_trans *trans, struct btree_iter *iter, struct bkey_i *insert) @@ -696,12 +704,12 @@ static void bch2_trans_update2(struct btree_trans *trans, iter->flags |= BTREE_ITER_KEEP_UNTIL_COMMIT; trans_for_each_update2(trans, i) { - if (btree_iter_cmp(n.iter, i->iter) == 0) { + if (btree_iter_pos_cmp(n.iter, i->iter) == 0) { *i = n; return; } - if (btree_iter_cmp(n.iter, i->iter) <= 0) + if (btree_iter_pos_cmp(n.iter, i->iter) <= 0) break; } @@ -985,7 +993,7 @@ int bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter, * Pending updates are kept sorted: first, find position of new update: */ trans_for_each_update(trans, i) - if (btree_iter_cmp(iter, i->iter) <= 0) + if (btree_iter_pos_cmp(iter, i->iter) <= 0) break; /* diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 97a8af31ded1..82f1cc4ca693 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -77,6 +77,26 @@ #include <linux/preempt.h> #include <trace/events/bcachefs.h> +static inline void fs_usage_data_type_to_base(struct bch_fs_usage *fs_usage, + enum bch_data_type data_type, + s64 sectors) +{ + switch (data_type) { + case BCH_DATA_btree: + fs_usage->btree += sectors; + break; + case BCH_DATA_user: + case BCH_DATA_parity: + fs_usage->data += sectors; + break; + case BCH_DATA_cached: + fs_usage->cached += sectors; + break; + default: + break; + } +} + /* * Clear journal_seq_valid for buckets for which it's not needed, to prevent * wraparound: @@ -132,17 +152,7 @@ void bch2_fs_usage_initialize(struct bch_fs *c) struct bch_replicas_entry *e = cpu_replicas_entry(&c->replicas, i); - switch (e->data_type) { - case BCH_DATA_btree: - usage->btree += usage->replicas[i]; - break; - case BCH_DATA_user: - usage->data += usage->replicas[i]; - break; - case BCH_DATA_cached: - usage->cached += usage->replicas[i]; - break; - } + fs_usage_data_type_to_base(usage, e->data_type, usage->replicas[i]); } percpu_up_write(&c->mark_lock); @@ -254,6 +264,7 @@ void bch2_fs_usage_acc_to_base(struct bch_fs *c, unsigned idx) BUG_ON(idx >= 2); + preempt_disable(); write_seqcount_begin(&c->usage_lock); acc_u64s_percpu((u64 *) c->usage_base, @@ -261,6 +272,7 @@ void bch2_fs_usage_acc_to_base(struct bch_fs *c, unsigned idx) percpu_memset(c->usage[idx], 0, u64s * sizeof(u64)); write_seqcount_end(&c->usage_lock); + preempt_enable(); } void bch2_fs_usage_to_text(struct printbuf *out, @@ -374,9 +386,14 @@ static inline int is_fragmented_bucket(struct bucket_mark m, return 0; } +static inline int is_stripe_data_bucket(struct bucket_mark m) +{ + return m.stripe && m.data_type != BCH_DATA_parity; +} + static inline int bucket_stripe_sectors(struct bucket_mark m) { - return m.stripe ? m.dirty_sectors : 0; + return is_stripe_data_bucket(m) ? m.dirty_sectors : 0; } static inline enum bch_data_type bucket_type(struct bucket_mark m) @@ -410,8 +427,8 @@ int bch2_fs_usage_apply(struct bch_fs *c, */ should_not_have_added = added - (s64) (disk_res ? disk_res->sectors : 0); if (WARN_ONCE(should_not_have_added > 0, - "disk usage increased by %lli without a reservation", - should_not_have_added)) { + "disk usage increased by %lli more than reservation of %llu", + added, disk_res ? 
disk_res->sectors : 0)) { atomic64_sub(should_not_have_added, &c->sectors_available); added -= should_not_have_added; ret = -1; @@ -482,6 +499,7 @@ static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca, bch2_wake_allocator(ca); } +__flatten void bch2_dev_usage_from_buckets(struct bch_fs *c) { struct bch_dev *ca; @@ -519,17 +537,7 @@ static inline int update_replicas(struct bch_fs *c, if (!fs_usage) return 0; - switch (r->data_type) { - case BCH_DATA_btree: - fs_usage->btree += sectors; - break; - case BCH_DATA_user: - fs_usage->data += sectors; - break; - case BCH_DATA_cached: - fs_usage->cached += sectors; - break; - } + fs_usage_data_type_to_base(fs_usage, r->data_type, sectors); fs_usage->replicas[idx] += sectors; return 0; } @@ -755,8 +763,7 @@ static int bch2_mark_alloc(struct bch_fs *c, } })); - if (!(flags & BTREE_TRIGGER_ALLOC_READ)) - bch2_dev_usage_update(c, ca, fs_usage, old_m, m, gc); + bch2_dev_usage_update(c, ca, fs_usage, old_m, m, gc); g->io_time[READ] = u.read_time; g->io_time[WRITE] = u.write_time; @@ -882,124 +889,155 @@ static s64 ptr_disk_sectors_delta(struct extent_ptr_decoded p, p.crc.uncompressed_size); } -static void bucket_set_stripe(struct bch_fs *c, - const struct bch_extent_ptr *ptr, - struct bch_fs_usage *fs_usage, - u64 journal_seq, - unsigned flags, - bool enabled) -{ - bool gc = flags & BTREE_TRIGGER_GC; - struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); - struct bucket *g = PTR_BUCKET(ca, ptr, gc); - struct bucket_mark new, old; - - old = bucket_cmpxchg(g, new, ({ - new.stripe = enabled; - if (journal_seq) { - new.journal_seq_valid = 1; - new.journal_seq = journal_seq; - } - })); - - bch2_dev_usage_update(c, ca, fs_usage, old, new, gc); - - /* - * XXX write repair code for these, flag stripe as possibly bad - */ - if (old.gen != ptr->gen) - bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, - "stripe with stale pointer"); -#if 0 - /* - * We'd like to check for these, but these checks don't work - * yet: - */ - if (old.stripe && enabled) - bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, - "multiple stripes using same bucket"); - - if (!old.stripe && !enabled) - bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, - "deleting stripe but bucket not marked as stripe bucket"); -#endif -} - -static int __mark_pointer(struct bch_fs *c, struct bkey_s_c k, - struct extent_ptr_decoded p, - s64 sectors, enum bch_data_type ptr_data_type, - u8 bucket_gen, u8 *bucket_data_type, - u16 *dirty_sectors, u16 *cached_sectors) +static int check_bucket_ref(struct bch_fs *c, struct bkey_s_c k, + const struct bch_extent_ptr *ptr, + s64 sectors, enum bch_data_type ptr_data_type, + u8 bucket_gen, u8 bucket_data_type, + u16 dirty_sectors, u16 cached_sectors) { - u16 *dst_sectors = !p.ptr.cached + size_t bucket_nr = PTR_BUCKET_NR(bch_dev_bkey_exists(c, ptr->dev), ptr); + u16 bucket_sectors = !ptr->cached ? 
dirty_sectors : cached_sectors; - u16 orig_sectors = *dst_sectors; char buf[200]; - if (gen_after(p.ptr.gen, bucket_gen)) { + if (gen_after(ptr->gen, bucket_gen)) { bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, "bucket %u:%zu gen %u data type %s: ptr gen %u newer than bucket gen\n" "while marking %s", - p.ptr.dev, PTR_BUCKET_NR(bch_dev_bkey_exists(c, p.ptr.dev), &p.ptr), - bucket_gen, - bch2_data_types[*bucket_data_type ?: ptr_data_type], - p.ptr.gen, + ptr->dev, bucket_nr, bucket_gen, + bch2_data_types[bucket_data_type ?: ptr_data_type], + ptr->gen, (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)); return -EIO; } - if (gen_cmp(bucket_gen, p.ptr.gen) >= 96U) { + if (gen_cmp(bucket_gen, ptr->gen) > BUCKET_GC_GEN_MAX) { bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, "bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n" "while marking %s", - p.ptr.dev, PTR_BUCKET_NR(bch_dev_bkey_exists(c, p.ptr.dev), &p.ptr), - bucket_gen, - bch2_data_types[*bucket_data_type ?: ptr_data_type], - p.ptr.gen, + ptr->dev, bucket_nr, bucket_gen, + bch2_data_types[bucket_data_type ?: ptr_data_type], + ptr->gen, (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)); return -EIO; } - if (bucket_gen != p.ptr.gen && !p.ptr.cached) { + if (bucket_gen != ptr->gen && !ptr->cached) { bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, "bucket %u:%zu gen %u data type %s: stale dirty ptr (gen %u)\n" "while marking %s", - p.ptr.dev, PTR_BUCKET_NR(bch_dev_bkey_exists(c, p.ptr.dev), &p.ptr), - bucket_gen, - bch2_data_types[*bucket_data_type ?: ptr_data_type], - p.ptr.gen, + ptr->dev, bucket_nr, bucket_gen, + bch2_data_types[bucket_data_type ?: ptr_data_type], + ptr->gen, (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)); return -EIO; } - if (bucket_gen != p.ptr.gen) + if (bucket_gen != ptr->gen) return 1; - if (*bucket_data_type && *bucket_data_type != ptr_data_type) { + if (bucket_data_type && ptr_data_type && + bucket_data_type != ptr_data_type) { bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, "bucket %u:%zu gen %u different types of data in same bucket: %s, %s\n" "while marking %s", - p.ptr.dev, PTR_BUCKET_NR(bch_dev_bkey_exists(c, p.ptr.dev), &p.ptr), - bucket_gen, - bch2_data_types[*bucket_data_type], + ptr->dev, bucket_nr, bucket_gen, + bch2_data_types[bucket_data_type], bch2_data_types[ptr_data_type], (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)); return -EIO; } - if (checked_add(*dst_sectors, sectors)) { + if ((unsigned) (bucket_sectors + sectors) > U16_MAX) { bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, "bucket %u:%zu gen %u data type %s sector count overflow: %u + %lli > U16_MAX\n" "while marking %s", - p.ptr.dev, PTR_BUCKET_NR(bch_dev_bkey_exists(c, p.ptr.dev), &p.ptr), - bucket_gen, - bch2_data_types[*bucket_data_type ?: ptr_data_type], - orig_sectors, sectors, + ptr->dev, bucket_nr, bucket_gen, + bch2_data_types[bucket_data_type ?: ptr_data_type], + bucket_sectors, sectors, (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)); return -EIO; } + return 0; +} + +static int bucket_set_stripe(struct bch_fs *c, struct bkey_s_c k, + unsigned ptr_idx, + struct bch_fs_usage *fs_usage, + u64 journal_seq, unsigned flags, + bool enabled) +{ + const struct bch_stripe *s = bkey_s_c_to_stripe(k).v; + unsigned nr_data = s->nr_blocks - s->nr_redundant; + bool parity = ptr_idx >= nr_data; + const struct bch_extent_ptr *ptr = s->ptrs + ptr_idx; + bool gc = flags & BTREE_TRIGGER_GC; + struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); + struct bucket *g = PTR_BUCKET(ca, ptr, gc); + struct bucket_mark new, old; + char buf[200]; + 
int ret; + + if (enabled) + g->ec_redundancy = s->nr_redundant; + + old = bucket_cmpxchg(g, new, ({ + ret = check_bucket_ref(c, k, ptr, 0, 0, new.gen, new.data_type, + new.dirty_sectors, new.cached_sectors); + if (ret) + return ret; + + if (new.stripe && enabled) + bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, + "bucket %u:%zu gen %u: multiple stripes using same bucket\n%s", + ptr->dev, PTR_BUCKET_NR(ca, ptr), new.gen, + (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)); + + if (!new.stripe && !enabled) + bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, + "bucket %u:%zu gen %u: deleting stripe but not marked\n%s", + ptr->dev, PTR_BUCKET_NR(ca, ptr), new.gen, + (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)); + + new.stripe = enabled; + + if ((flags & BTREE_TRIGGER_GC) && parity) { + new.data_type = enabled ? BCH_DATA_parity : 0; + new.dirty_sectors = enabled ? le16_to_cpu(s->sectors): 0; + } + + if (journal_seq) { + new.journal_seq_valid = 1; + new.journal_seq = journal_seq; + } + })); + + if (!enabled) + g->ec_redundancy = 0; + + bch2_dev_usage_update(c, ca, fs_usage, old, new, gc); + return 0; +} + +static int __mark_pointer(struct bch_fs *c, struct bkey_s_c k, + const struct bch_extent_ptr *ptr, + s64 sectors, enum bch_data_type ptr_data_type, + u8 bucket_gen, u8 *bucket_data_type, + u16 *dirty_sectors, u16 *cached_sectors) +{ + u16 *dst_sectors = !ptr->cached + ? dirty_sectors + : cached_sectors; + int ret = check_bucket_ref(c, k, ptr, sectors, ptr_data_type, + bucket_gen, *bucket_data_type, + *dirty_sectors, *cached_sectors); + + if (ret) + return ret; + + *dst_sectors += sectors; *bucket_data_type = *dirty_sectors || *cached_sectors ? ptr_data_type : 0; return 0; @@ -1024,7 +1062,7 @@ static int bch2_mark_pointer(struct bch_fs *c, struct bkey_s_c k, new.v.counter = old.v.counter = v; bucket_data_type = new.data_type; - ret = __mark_pointer(c, k, p, sectors, data_type, new.gen, + ret = __mark_pointer(c, k, &p.ptr, sectors, data_type, new.gen, &bucket_data_type, &new.dirty_sectors, &new.cached_sectors); @@ -1057,12 +1095,10 @@ static int bch2_mark_stripe_ptr(struct bch_fs *c, struct bch_extent_stripe_ptr p, enum bch_data_type data_type, struct bch_fs_usage *fs_usage, - s64 sectors, unsigned flags, - struct bch_replicas_padded *r, - unsigned *nr_data, - unsigned *nr_parity) + s64 sectors, unsigned flags) { bool gc = flags & BTREE_TRIGGER_GC; + struct bch_replicas_padded r; struct stripe *m; unsigned i, blocks_nonempty = 0; @@ -1077,14 +1113,10 @@ static int bch2_mark_stripe_ptr(struct bch_fs *c, return -EIO; } - BUG_ON(m->r.e.data_type != data_type); - - *nr_data = m->nr_blocks - m->nr_redundant; - *nr_parity = m->nr_redundant; - *r = m->r; - m->block_sectors[p.block] += sectors; + r = m->r; + for (i = 0; i < m->nr_blocks; i++) blocks_nonempty += m->block_sectors[i] != 0; @@ -1096,6 +1128,9 @@ static int bch2_mark_stripe_ptr(struct bch_fs *c, spin_unlock(&c->ec_stripes_heap_lock); + r.e.data_type = data_type; + update_replicas(c, fs_usage, &r.e, sectors); + return 0; } @@ -1141,25 +1176,11 @@ static int bch2_mark_extent(struct bch_fs *c, dirty_sectors += disk_sectors; r.e.devs[r.e.nr_devs++] = p.ptr.dev; } else { - struct bch_replicas_padded ec_r; - unsigned nr_data, nr_parity; - s64 parity_sectors; - ret = bch2_mark_stripe_ptr(c, p.ec, data_type, - fs_usage, disk_sectors, flags, - &ec_r, &nr_data, &nr_parity); + fs_usage, disk_sectors, flags); if (ret) return ret; - parity_sectors = - __ptr_disk_sectors_delta(p.crc.live_size, - offset, sectors, flags, - p.crc.compressed_size * 
nr_parity, - p.crc.uncompressed_size * nr_data); - - update_replicas(c, fs_usage, &ec_r.e, - disk_sectors + parity_sectors); - /* * There may be other dirty pointers in this extent, but * if so they're not required for mounting if we have an @@ -1188,6 +1209,7 @@ static int bch2_mark_stripe(struct bch_fs *c, ? bkey_s_c_to_stripe(new).v : NULL; struct stripe *m = genradix_ptr(&c->stripes[gc], idx); unsigned i; + int ret; if (!m || (old_s && !m->alive)) { bch_err_ratelimited(c, "error marking nonexistent stripe %zu", @@ -1197,9 +1219,12 @@ static int bch2_mark_stripe(struct bch_fs *c, if (!new_s) { /* Deleting: */ - for (i = 0; i < old_s->nr_blocks; i++) - bucket_set_stripe(c, old_s->ptrs + i, fs_usage, - journal_seq, flags, false); + for (i = 0; i < old_s->nr_blocks; i++) { + ret = bucket_set_stripe(c, old, i, fs_usage, + journal_seq, flags, false); + if (ret) + return ret; + } if (!gc && m->on_heap) { spin_lock(&c->ec_stripes_heap_lock); @@ -1207,6 +1232,10 @@ static int bch2_mark_stripe(struct bch_fs *c, spin_unlock(&c->ec_stripes_heap_lock); } + if (gc) + update_replicas(c, fs_usage, &m->r.e, + -((s64) m->sectors * m->nr_redundant)); + memset(m, 0, sizeof(*m)); } else { BUG_ON(old_s && new_s->nr_blocks != old_s->nr_blocks); @@ -1218,11 +1247,16 @@ static int bch2_mark_stripe(struct bch_fs *c, old_s->ptrs + i, sizeof(struct bch_extent_ptr))) { - if (old_s) - bucket_set_stripe(c, old_s->ptrs + i, fs_usage, + if (old_s) { + bucket_set_stripe(c, old, i, fs_usage, journal_seq, flags, false); - bucket_set_stripe(c, new_s->ptrs + i, fs_usage, - journal_seq, flags, true); + if (ret) + return ret; + } + ret = bucket_set_stripe(c, new, i, fs_usage, + journal_seq, flags, true); + if (ret) + return ret; } } @@ -1231,19 +1265,23 @@ static int bch2_mark_stripe(struct bch_fs *c, m->algorithm = new_s->algorithm; m->nr_blocks = new_s->nr_blocks; m->nr_redundant = new_s->nr_redundant; + m->blocks_nonempty = 0; - bch2_bkey_to_replicas(&m->r.e, new); + for (i = 0; i < new_s->nr_blocks; i++) { + m->block_sectors[i] = + stripe_blockcount_get(new_s, i); + m->blocks_nonempty += !!m->block_sectors[i]; + } - /* gc recalculates these fields: */ - if (!(flags & BTREE_TRIGGER_GC)) { - m->blocks_nonempty = 0; + if (gc && old_s) + update_replicas(c, fs_usage, &m->r.e, + -((s64) m->sectors * m->nr_redundant)); - for (i = 0; i < new_s->nr_blocks; i++) { - m->block_sectors[i] = - stripe_blockcount_get(new_s, i); - m->blocks_nonempty += !!m->block_sectors[i]; - } - } + bch2_bkey_to_replicas(&m->r.e, new); + + if (gc) + update_replicas(c, fs_usage, &m->r.e, + ((s64) m->sectors * m->nr_redundant)); if (!gc) { spin_lock(&c->ec_stripes_heap_lock); @@ -1548,23 +1586,21 @@ static int trans_get_key(struct btree_trans *trans, return ret; } -static int bch2_trans_mark_pointer(struct btree_trans *trans, - struct bkey_s_c k, struct extent_ptr_decoded p, - s64 sectors, enum bch_data_type data_type) +static int bch2_trans_start_alloc_update(struct btree_trans *trans, struct btree_iter **_iter, + const struct bch_extent_ptr *ptr, + struct bkey_alloc_unpacked *u) { struct bch_fs *c = trans->c; - struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev); - struct bpos pos = POS(p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr)); - struct btree_iter *iter; - struct bkey_s_c k_a; - struct bkey_alloc_unpacked u; - struct bkey_i_alloc *a; + struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); + struct bpos pos = POS(ptr->dev, PTR_BUCKET_NR(ca, ptr)); struct bucket *g; + struct btree_iter *iter; + struct bkey_s_c k; int ret; - iter = 
trans_get_update(trans, BTREE_ID_ALLOC, pos, &k_a); + iter = trans_get_update(trans, BTREE_ID_ALLOC, pos, &k); if (iter) { - u = bch2_alloc_unpack(k_a); + *u = bch2_alloc_unpack(k); } else { iter = bch2_trans_get_iter(trans, BTREE_ID_ALLOC, pos, BTREE_ITER_CACHED| @@ -1574,16 +1610,36 @@ static int bch2_trans_mark_pointer(struct btree_trans *trans, return PTR_ERR(iter); ret = bch2_btree_iter_traverse(iter); - if (ret) - goto out; + if (ret) { + bch2_trans_iter_put(trans, iter); + return ret; + } percpu_down_read(&c->mark_lock); g = bucket(ca, pos.offset); - u = alloc_mem_to_key(g, READ_ONCE(g->mark)); + *u = alloc_mem_to_key(g, READ_ONCE(g->mark)); percpu_up_read(&c->mark_lock); } - ret = __mark_pointer(c, k, p, sectors, data_type, u.gen, &u.data_type, + *_iter = iter; + return 0; +} + +static int bch2_trans_mark_pointer(struct btree_trans *trans, + struct bkey_s_c k, struct extent_ptr_decoded p, + s64 sectors, enum bch_data_type data_type) +{ + struct bch_fs *c = trans->c; + struct btree_iter *iter; + struct bkey_alloc_unpacked u; + struct bkey_i_alloc *a; + int ret; + + ret = bch2_trans_start_alloc_update(trans, &iter, &p.ptr, &u); + if (ret) + return ret; + + ret = __mark_pointer(c, k, &p.ptr, sectors, data_type, u.gen, &u.data_type, &u.dirty_sectors, &u.cached_sectors); if (ret) goto out; @@ -1594,7 +1650,7 @@ static int bch2_trans_mark_pointer(struct btree_trans *trans, goto out; bkey_alloc_init(&a->k_i); - a->k.p = pos; + a->k.p = iter->pos; bch2_alloc_pack(a, u); bch2_trans_update(trans, iter, &a->k_i, 0); out: @@ -1604,15 +1660,13 @@ out: static int bch2_trans_mark_stripe_ptr(struct btree_trans *trans, struct bch_extent_stripe_ptr p, - s64 sectors, enum bch_data_type data_type, - struct bch_replicas_padded *r, - unsigned *nr_data, - unsigned *nr_parity) + s64 sectors, enum bch_data_type data_type) { struct bch_fs *c = trans->c; struct btree_iter *iter; struct bkey_s_c k; struct bkey_i_stripe *s; + struct bch_replicas_padded r; int ret = 0; ret = trans_get_key(trans, BTREE_ID_EC, POS(0, p.idx), &iter, &k); @@ -1633,15 +1687,14 @@ static int bch2_trans_mark_stripe_ptr(struct btree_trans *trans, goto out; bkey_reassemble(&s->k_i, k); - stripe_blockcount_set(&s->v, p.block, stripe_blockcount_get(&s->v, p.block) + sectors); - - *nr_data = s->v.nr_blocks - s->v.nr_redundant; - *nr_parity = s->v.nr_redundant; - bch2_bkey_to_replicas(&r->e, bkey_i_to_s_c(&s->k_i)); bch2_trans_update(trans, iter, &s->k_i, 0); + + bch2_bkey_to_replicas(&r.e, bkey_i_to_s_c(&s->k_i)); + r.e.data_type = data_type; + update_replicas_list(trans, &r.e, sectors); out: bch2_trans_iter_put(trans, iter); return ret; @@ -1686,25 +1739,11 @@ static int bch2_trans_mark_extent(struct btree_trans *trans, dirty_sectors += disk_sectors; r.e.devs[r.e.nr_devs++] = p.ptr.dev; } else { - struct bch_replicas_padded ec_r; - unsigned nr_data, nr_parity; - s64 parity_sectors; - ret = bch2_trans_mark_stripe_ptr(trans, p.ec, - disk_sectors, data_type, - &ec_r, &nr_data, &nr_parity); + disk_sectors, data_type); if (ret) return ret; - parity_sectors = - __ptr_disk_sectors_delta(p.crc.live_size, - offset, sectors, flags, - p.crc.compressed_size * nr_parity, - p.crc.uncompressed_size * nr_data); - - update_replicas_list(trans, &ec_r.e, - disk_sectors + parity_sectors); - r.e.nr_required = 0; } } @@ -1715,6 +1754,76 @@ static int bch2_trans_mark_extent(struct btree_trans *trans, return 0; } +static int bch2_trans_mark_stripe(struct btree_trans *trans, + struct bkey_s_c k, + unsigned flags) +{ + const struct bch_stripe *s = 
bkey_s_c_to_stripe(k).v; + unsigned nr_data = s->nr_blocks - s->nr_redundant; + struct bch_replicas_padded r; + struct bkey_alloc_unpacked u; + struct bkey_i_alloc *a; + struct btree_iter *iter; + bool deleting = flags & BTREE_TRIGGER_OVERWRITE; + s64 sectors = le16_to_cpu(s->sectors); + unsigned i; + int ret = 0; + + if (deleting) + sectors = -sectors; + + bch2_bkey_to_replicas(&r.e, k); + update_replicas_list(trans, &r.e, sectors * s->nr_redundant); + + /* + * The allocator code doesn't necessarily update bucket gens in the + * btree when incrementing them, right before handing out new buckets - + * we just need to persist those updates here along with the new stripe: + */ + + for (i = 0; i < s->nr_blocks && !ret; i++) { + bool parity = i >= nr_data; + + ret = bch2_trans_start_alloc_update(trans, &iter, + &s->ptrs[i], &u); + if (ret) + break; + + if (parity) { + u.dirty_sectors += sectors; + u.data_type = u.dirty_sectors + ? BCH_DATA_parity + : 0; + } + + a = bch2_trans_kmalloc(trans, BKEY_ALLOC_U64s_MAX * 8); + ret = PTR_ERR_OR_ZERO(a); + if (ret) + goto put_iter; + + bkey_alloc_init(&a->k_i); + a->k.p = iter->pos; + bch2_alloc_pack(a, u); + bch2_trans_update(trans, iter, &a->k_i, 0); +put_iter: + bch2_trans_iter_put(trans, iter); + } + + return ret; +} + +static __le64 *bkey_refcount(struct bkey_i *k) +{ + switch (k->k.type) { + case KEY_TYPE_reflink_v: + return &bkey_i_to_reflink_v(k)->v.refcount; + case KEY_TYPE_indirect_inline_data: + return &bkey_i_to_indirect_inline_data(k)->v.refcount; + default: + return NULL; + } +} + static int __bch2_trans_mark_reflink_p(struct btree_trans *trans, struct bkey_s_c_reflink_p p, u64 idx, unsigned sectors, @@ -1723,7 +1832,8 @@ static int __bch2_trans_mark_reflink_p(struct btree_trans *trans, struct bch_fs *c = trans->c; struct btree_iter *iter; struct bkey_s_c k; - struct bkey_i_reflink_v *r_v; + struct bkey_i *n; + __le64 *refcount; s64 ret; ret = trans_get_key(trans, BTREE_ID_REFLINK, @@ -1731,14 +1841,6 @@ static int __bch2_trans_mark_reflink_p(struct btree_trans *trans, if (ret < 0) return ret; - if (k.k->type != KEY_TYPE_reflink_v) { - bch2_fs_inconsistent(c, - "%llu:%llu len %u points to nonexistent indirect extent %llu", - p.k->p.inode, p.k->p.offset, p.k->size, idx); - ret = -EIO; - goto err; - } - if ((flags & BTREE_TRIGGER_OVERWRITE) && (bkey_start_offset(k.k) < idx || k.k->p.offset > idx + sectors)) @@ -1746,25 +1848,33 @@ static int __bch2_trans_mark_reflink_p(struct btree_trans *trans, sectors = k.k->p.offset - idx; - r_v = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); - ret = PTR_ERR_OR_ZERO(r_v); + n = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); + ret = PTR_ERR_OR_ZERO(n); if (ret) goto err; - bkey_reassemble(&r_v->k_i, k); + bkey_reassemble(n, k); - le64_add_cpu(&r_v->v.refcount, - !(flags & BTREE_TRIGGER_OVERWRITE) ? 1 : -1); + refcount = bkey_refcount(n); + if (!refcount) { + bch2_fs_inconsistent(c, + "%llu:%llu len %u points to nonexistent indirect extent %llu", + p.k->p.inode, p.k->p.offset, p.k->size, idx); + ret = -EIO; + goto err; + } + + le64_add_cpu(refcount, !(flags & BTREE_TRIGGER_OVERWRITE) ? 
1 : -1); - if (!r_v->v.refcount) { - r_v->k.type = KEY_TYPE_deleted; - set_bkey_val_u64s(&r_v->k, 0); + if (!*refcount) { + n->k.type = KEY_TYPE_deleted; + set_bkey_val_u64s(&n->k, 0); } bch2_btree_iter_set_pos(iter, bkey_start_pos(k.k)); BUG_ON(iter->uptodate > BTREE_ITER_NEED_PEEK); - bch2_trans_update(trans, iter, &r_v->k_i, 0); + bch2_trans_update(trans, iter, n, 0); out: ret = sectors; err: @@ -1814,6 +1924,8 @@ int bch2_trans_mark_key(struct btree_trans *trans, struct bkey_s_c k, case KEY_TYPE_reflink_v: return bch2_trans_mark_extent(trans, k, offset, sectors, flags, BCH_DATA_user); + case KEY_TYPE_stripe: + return bch2_trans_mark_stripe(trans, k, flags); case KEY_TYPE_inode: d = replicas_deltas_realloc(trans, 0); diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h index 653f6761862e..a3873becbb70 100644 --- a/fs/bcachefs/buckets.h +++ b/fs/bcachefs/buckets.h @@ -58,12 +58,6 @@ static inline struct bucket *bucket(struct bch_dev *ca, size_t b) return __bucket(ca, b, false); } -static inline void bucket_io_clock_reset(struct bch_fs *c, struct bch_dev *ca, - size_t b, int rw) -{ - bucket(ca, b)->io_time[rw] = c->bucket_clock[rw].hand; -} - static inline u16 bucket_last_io(struct bch_fs *c, struct bucket *g, int rw) { return c->bucket_clock[rw].hand - g->io_time[rw]; diff --git a/fs/bcachefs/buckets_types.h b/fs/bcachefs/buckets_types.h index d5215b14d7d9..d6057d22b18e 100644 --- a/fs/bcachefs/buckets_types.h +++ b/fs/bcachefs/buckets_types.h @@ -41,6 +41,7 @@ struct bucket { u8 oldest_gen; u8 gc_gen; unsigned gen_valid:1; + u8 ec_redundancy; }; struct bucket_array { @@ -125,6 +126,7 @@ struct disk_reservation { struct copygc_heap_entry { u8 dev; u8 gen; + u8 replicas; u16 fragmentation; u32 sectors; u64 offset; diff --git a/fs/bcachefs/checksum.c b/fs/bcachefs/checksum.c index a01073e54a33..3d88719ba86c 100644 --- a/fs/bcachefs/checksum.c +++ b/fs/bcachefs/checksum.c @@ -10,7 +10,7 @@ #include <linux/random.h> #include <linux/scatterlist.h> #include <crypto/algapi.h> -#include <crypto/chacha20.h> +#include <crypto/chacha.h> #include <crypto/hash.h> #include <crypto/poly1305.h> #include <crypto/skcipher.h> @@ -68,21 +68,21 @@ static u64 bch2_checksum_update(unsigned type, u64 crc, const void *data, size_t } } -static inline void do_encrypt_sg(struct crypto_skcipher *tfm, +static inline void do_encrypt_sg(struct crypto_sync_skcipher *tfm, struct nonce nonce, struct scatterlist *sg, size_t len) { - SKCIPHER_REQUEST_ON_STACK(req, tfm); + SYNC_SKCIPHER_REQUEST_ON_STACK(req, tfm); int ret; - skcipher_request_set_tfm(req, tfm); + skcipher_request_set_sync_tfm(req, tfm); skcipher_request_set_crypt(req, sg, sg, len, nonce.d); ret = crypto_skcipher_encrypt(req); BUG_ON(ret); } -static inline void do_encrypt(struct crypto_skcipher *tfm, +static inline void do_encrypt(struct crypto_sync_skcipher *tfm, struct nonce nonce, void *buf, size_t len) { @@ -95,8 +95,8 @@ static inline void do_encrypt(struct crypto_skcipher *tfm, int bch2_chacha_encrypt_key(struct bch_key *key, struct nonce nonce, void *buf, size_t len) { - struct crypto_skcipher *chacha20 = - crypto_alloc_skcipher("chacha20", 0, 0); + struct crypto_sync_skcipher *chacha20 = + crypto_alloc_sync_skcipher("chacha20", 0, 0); int ret; if (!chacha20) { @@ -104,7 +104,8 @@ int bch2_chacha_encrypt_key(struct bch_key *key, struct nonce nonce, return PTR_ERR(chacha20); } - ret = crypto_skcipher_setkey(chacha20, (void *) key, sizeof(*key)); + ret = crypto_skcipher_setkey(&chacha20->base, + (void *) key, sizeof(*key)); if (ret) { 
pr_err("crypto_skcipher_setkey() error: %i", ret); goto err; @@ -112,7 +113,7 @@ int bch2_chacha_encrypt_key(struct bch_key *key, struct nonce nonce, do_encrypt(chacha20, nonce, buf, len); err: - crypto_free_skcipher(chacha20); + crypto_free_sync_skcipher(chacha20); return ret; } @@ -199,7 +200,7 @@ static struct bch_csum __bch2_checksum_bio(struct bch_fs *c, unsigned type, kunmap_atomic(p); } #else - __bio_for_each_contig_segment(bv, bio, *iter, *iter) + __bio_for_each_bvec(bv, bio, *iter, *iter) crc = bch2_checksum_update(type, crc, page_address(bv.bv_page) + bv.bv_offset, bv.bv_len); @@ -224,7 +225,7 @@ static struct bch_csum __bch2_checksum_bio(struct bch_fs *c, unsigned type, kunmap_atomic(p); } #else - __bio_for_each_contig_segment(bv, bio, *iter, *iter) + __bio_for_each_bvec(bv, bio, *iter, *iter) crypto_shash_update(desc, page_address(bv.bv_page) + bv.bv_offset, bv.bv_len); @@ -463,7 +464,7 @@ err: static int bch2_alloc_ciphers(struct bch_fs *c) { if (!c->chacha20) - c->chacha20 = crypto_alloc_skcipher("chacha20", 0, 0); + c->chacha20 = crypto_alloc_sync_skcipher("chacha20", 0, 0); if (IS_ERR(c->chacha20)) { bch_err(c, "error requesting chacha20 module: %li", PTR_ERR(c->chacha20)); @@ -546,7 +547,7 @@ int bch2_enable_encryption(struct bch_fs *c, bool keyed) goto err; } - ret = crypto_skcipher_setkey(c->chacha20, + ret = crypto_skcipher_setkey(&c->chacha20->base, (void *) &key.key, sizeof(key.key)); if (ret) goto err; @@ -574,7 +575,7 @@ void bch2_fs_encryption_exit(struct bch_fs *c) if (!IS_ERR_OR_NULL(c->poly1305)) crypto_free_shash(c->poly1305); if (!IS_ERR_OR_NULL(c->chacha20)) - crypto_free_skcipher(c->chacha20); + crypto_free_sync_skcipher(c->chacha20); if (!IS_ERR_OR_NULL(c->sha256)) crypto_free_shash(c->sha256); } @@ -606,7 +607,7 @@ int bch2_fs_encryption_init(struct bch_fs *c) if (ret) goto out; - ret = crypto_skcipher_setkey(c->chacha20, + ret = crypto_skcipher_setkey(&c->chacha20->base, (void *) &key.key, sizeof(key.key)); if (ret) goto out; diff --git a/fs/bcachefs/checksum.h b/fs/bcachefs/checksum.h index 833537cc8fd0..24dee8039d57 100644 --- a/fs/bcachefs/checksum.h +++ b/fs/bcachefs/checksum.h @@ -7,7 +7,7 @@ #include "super-io.h" #include <linux/crc64.h> -#include <crypto/chacha20.h> +#include <crypto/chacha.h> static inline bool bch2_checksum_mergeable(unsigned type) { @@ -138,9 +138,9 @@ static inline bool bch2_crc_cmp(struct bch_csum l, struct bch_csum r) /* for skipping ahead and encrypting/decrypting at an offset: */ static inline struct nonce nonce_add(struct nonce nonce, unsigned offset) { - EBUG_ON(offset & (CHACHA20_BLOCK_SIZE - 1)); + EBUG_ON(offset & (CHACHA_BLOCK_SIZE - 1)); - le32_add_cpu(&nonce.d[0], offset / CHACHA20_BLOCK_SIZE); + le32_add_cpu(&nonce.d[0], offset / CHACHA_BLOCK_SIZE); return nonce; } diff --git a/fs/bcachefs/compress.c b/fs/bcachefs/compress.c index 47838fd2db06..aebf46bb1d21 100644 --- a/fs/bcachefs/compress.c +++ b/fs/bcachefs/compress.c @@ -45,7 +45,7 @@ static bool bio_phys_contig(struct bio *bio, struct bvec_iter start) struct bvec_iter iter; void *expected_start = NULL; - __bio_for_each_segment(bv, bio, iter, start) { + __bio_for_each_bvec(bv, bio, iter, start) { if (expected_start && expected_start != page_address(bv.bv_page) + bv.bv_offset) return false; @@ -70,7 +70,7 @@ static struct bbuf __bio_map_or_bounce(struct bch_fs *c, struct bio *bio, BUG_ON(bvec_iter_sectors(start) > c->sb.encoded_extent_max); - if (!IS_ENABLED(CONFIG_HIGHMEM) && + if (!PageHighMem(bio_iter_page(bio, start)) && bio_phys_contig(bio, start)) return 
(struct bbuf) { .b = page_address(bio_iter_page(bio, start)) + diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c index aa10591a3b1a..bbe3fefa2651 100644 --- a/fs/bcachefs/debug.c +++ b/fs/bcachefs/debug.c @@ -54,7 +54,7 @@ void __bch2_btree_verify(struct bch_fs *c, struct btree *b) v->written = 0; v->c.level = b->c.level; v->c.btree_id = b->c.btree_id; - bch2_btree_keys_init(v, &c->expensive_debug_checks); + bch2_btree_keys_init(v); if (bch2_bkey_pick_read_device(c, bkey_i_to_s_c(&b->key), NULL, &pick) <= 0) diff --git a/fs/bcachefs/debug.h b/fs/bcachefs/debug.h index 56c2d1ab5f63..7ac1615e9447 100644 --- a/fs/bcachefs/debug.h +++ b/fs/bcachefs/debug.h @@ -8,44 +8,15 @@ struct bio; struct btree; struct bch_fs; -#define BCH_DEBUG_PARAM(name, description) extern bool bch2_##name; -BCH_DEBUG_PARAMS() -#undef BCH_DEBUG_PARAM - -#define BCH_DEBUG_PARAM(name, description) \ - static inline bool name(struct bch_fs *c) \ - { return bch2_##name || c->name; } -BCH_DEBUG_PARAMS_ALWAYS() -#undef BCH_DEBUG_PARAM - #ifdef CONFIG_BCACHEFS_DEBUG - -#define BCH_DEBUG_PARAM(name, description) \ - static inline bool name(struct bch_fs *c) \ - { return bch2_##name || c->name; } -BCH_DEBUG_PARAMS_DEBUG() -#undef BCH_DEBUG_PARAM - void __bch2_btree_verify(struct bch_fs *, struct btree *); - -#define bypass_torture_test(d) ((d)->bypass_torture_test) - -#else /* DEBUG */ - -#define BCH_DEBUG_PARAM(name, description) \ - static inline bool name(struct bch_fs *c) { return false; } -BCH_DEBUG_PARAMS_DEBUG() -#undef BCH_DEBUG_PARAM - +#else static inline void __bch2_btree_verify(struct bch_fs *c, struct btree *b) {} - -#define bypass_torture_test(d) 0 - #endif static inline void bch2_btree_verify(struct bch_fs *c, struct btree *b) { - if (verify_btree_ondisk(c)) + if (bch2_verify_btree_ondisk) __bch2_btree_verify(c, b); } diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index 5514f65378ad..d7ba0e7fc3b3 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -343,12 +343,17 @@ static void ec_block_io(struct bch_fs *c, struct ec_stripe_buf *buf, unsigned offset = 0, bytes = buf->size << 9; struct bch_extent_ptr *ptr = &v->ptrs[idx]; struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); + enum bch_data_type data_type = idx < buf->key.v.nr_blocks - buf->key.v.nr_redundant + ? BCH_DATA_user + : BCH_DATA_parity; if (!bch2_dev_get_ioref(ca, rw)) { clear_bit(idx, buf->valid); return; } + this_cpu_add(ca->io_done->sectors[rw][data_type], buf->size); + while (offset < bytes) { unsigned nr_iovecs = min_t(size_t, BIO_MAX_PAGES, DIV_ROUND_UP(bytes, PAGE_SIZE)); @@ -670,6 +675,7 @@ static void ec_stripe_delete_work(struct work_struct *work) /* stripe creation: */ static int ec_stripe_bkey_insert(struct bch_fs *c, + struct ec_stripe_new *s, struct bkey_i_stripe *stripe) { struct btree_trans trans; @@ -711,7 +717,7 @@ found_slot: bch2_trans_update(&trans, iter, &stripe->k_i, 0); - ret = bch2_trans_commit(&trans, NULL, NULL, + ret = bch2_trans_commit(&trans, &s->res, NULL, BTREE_INSERT_NOFAIL); err: bch2_trans_iter_put(&trans, iter); @@ -858,8 +864,8 @@ static void ec_stripe_create(struct ec_stripe_new *s) ret = s->existing_stripe ? 
bch2_btree_insert(c, BTREE_ID_EC, &s->stripe.key.k_i, - NULL, NULL, BTREE_INSERT_NOFAIL) - : ec_stripe_bkey_insert(c, &s->stripe.key); + &s->res, NULL, BTREE_INSERT_NOFAIL) + : ec_stripe_bkey_insert(c, s, &s->stripe.key); if (ret) { bch_err(c, "error creating stripe: error creating stripe key"); goto err_put_writes; @@ -886,6 +892,8 @@ static void ec_stripe_create(struct ec_stripe_new *s) err_put_writes: percpu_ref_put(&c->writes); err: + bch2_disk_reservation_put(c, &s->res); + open_bucket_for_each(c, &s->blocks, ob, i) { ob->ec = NULL; __bch2_open_bucket_put(c, ob); @@ -1325,6 +1333,7 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct bch_fs *c, struct open_bucket *ob; unsigned i, data_idx = 0; s64 idx; + int ret; closure_init_stack(&cl); @@ -1356,6 +1365,22 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct bch_fs *c, } } + if (!h->s->existing_stripe && + !h->s->res.sectors) { + ret = bch2_disk_reservation_get(c, &h->s->res, + h->blocksize, + h->s->nr_parity, 0); + if (ret) { + /* What should we do here? */ + bch_err(c, "unable to create new stripe: %i", ret); + bch2_ec_stripe_head_put(c, h); + h = NULL; + goto out; + + } + + } + if (new_stripe_alloc_buckets(c, h)) { bch2_ec_stripe_head_put(c, h); h = NULL; @@ -1448,7 +1473,7 @@ static int __bch2_stripe_write_key(struct btree_trans *trans, return 0; } -int bch2_stripes_write(struct bch_fs *c, unsigned flags, bool *wrote) +int bch2_stripes_write(struct bch_fs *c, unsigned flags) { struct btree_trans trans; struct btree_iter *iter; @@ -1476,8 +1501,6 @@ int bch2_stripes_write(struct bch_fs *c, unsigned flags, bool *wrote) if (ret) break; - - *wrote = true; } bch2_trans_exit(&trans); @@ -1497,7 +1520,6 @@ static int bch2_stripes_read_fn(struct bch_fs *c, enum btree_id id, ret = __ec_stripe_mem_alloc(c, k.k->p.offset, GFP_KERNEL) ?: bch2_mark_key(c, k, 0, 0, NULL, 0, - BTREE_TRIGGER_ALLOC_READ| BTREE_TRIGGER_NOATOMIC); if (ret) return ret; @@ -1564,7 +1586,7 @@ void bch2_stripes_heap_to_text(struct printbuf *out, struct bch_fs *c) size_t i; spin_lock(&c->ec_stripes_heap_lock); - for (i = 0; i < min(h->used, 20UL); i++) { + for (i = 0; i < min_t(size_t, h->used, 20); i++) { m = genradix_ptr(&c->stripes[0], h->data[i].idx); pr_buf(out, "%zu %u/%u+%u\n", h->data[i].idx, diff --git a/fs/bcachefs/ec.h b/fs/bcachefs/ec.h index f8fc3d616cd7..15f751fc2a35 100644 --- a/fs/bcachefs/ec.h +++ b/fs/bcachefs/ec.h @@ -3,6 +3,7 @@ #define _BCACHEFS_EC_H #include "ec_types.h" +#include "buckets_types.h" #include "keylist_types.h" const char *bch2_stripe_invalid(const struct bch_fs *, struct bkey_s_c); @@ -105,6 +106,7 @@ struct ec_stripe_new { struct open_buckets blocks; u8 data_block_idx[EC_STRIPE_MAX]; struct open_buckets parity; + struct disk_reservation res; struct keylist keys; u64 inline_keys[BKEY_U64s * 8]; @@ -156,7 +158,7 @@ void bch2_ec_flush_new_stripes(struct bch_fs *); struct journal_keys; int bch2_stripes_read(struct bch_fs *, struct journal_keys *); -int bch2_stripes_write(struct bch_fs *, unsigned, bool *); +int bch2_stripes_write(struct bch_fs *, unsigned); int bch2_ec_mem_alloc(struct bch_fs *, bool); diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index 568f039edcff..7fae6a4ba26f 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -89,7 +89,7 @@ static inline bool ptr_better(struct bch_fs *c, return bch2_rand_range(l1 + l2) > l1; } - if (force_reconstruct_read(c)) + if (bch2_force_reconstruct_read) return p1.idx > p2.idx; return p1.idx < p2.idx; @@ -137,7 +137,7 @@ int bch2_bkey_pick_read_device(struct 
bch_fs *c, struct bkey_s_c k, !bch2_dev_is_readable(ca)) p.idx++; - if (force_reconstruct_read(c) && + if (bch2_force_reconstruct_read && !p.idx && p.has_ec) p.idx++; @@ -1200,14 +1200,14 @@ int bch2_cut_front_s(struct bpos where, struct bkey_s k) le64_add_cpu(&p.v->idx, sub); break; } - case KEY_TYPE_inline_data: { - struct bkey_s_inline_data d = bkey_s_to_inline_data(k); + case KEY_TYPE_inline_data: + case KEY_TYPE_indirect_inline_data: { + void *p = bkey_inline_data_p(k); + unsigned bytes = bkey_inline_data_bytes(k.k); - sub = min_t(u64, sub << 9, bkey_val_bytes(d.k)); + sub = min_t(u64, sub << 9, bytes); - memmove(d.v->data, - d.v->data + sub, - bkey_val_bytes(d.k) - sub); + memmove(p, p + sub, bytes - sub); new_val_u64s -= sub >> 3; break; @@ -1245,7 +1245,9 @@ int bch2_cut_back_s(struct bpos where, struct bkey_s k) switch (k.k->type) { case KEY_TYPE_inline_data: - new_val_u64s = min(new_val_u64s, k.k->size << 6); + case KEY_TYPE_indirect_inline_data: + new_val_u64s = (bkey_inline_data_offset(k.k) + + min(bkey_inline_data_bytes(k.k), k.k->size << 9)) >> 3; break; } diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h index 29b15365d19c..74c7bb8f9104 100644 --- a/fs/bcachefs/extents.h +++ b/fs/bcachefs/extents.h @@ -445,10 +445,35 @@ static inline bool bkey_extent_is_direct_data(const struct bkey *k) } } +static inline bool bkey_extent_is_inline_data(const struct bkey *k) +{ + return k->type == KEY_TYPE_inline_data || + k->type == KEY_TYPE_indirect_inline_data; +} + +static inline unsigned bkey_inline_data_offset(const struct bkey *k) +{ + switch (k->type) { + case KEY_TYPE_inline_data: + return sizeof(struct bch_inline_data); + case KEY_TYPE_indirect_inline_data: + return sizeof(struct bch_indirect_inline_data); + default: + BUG(); + } +} + +static inline unsigned bkey_inline_data_bytes(const struct bkey *k) +{ + return bkey_val_bytes(k) - bkey_inline_data_offset(k); +} + +#define bkey_inline_data_p(_k) (((void *) (_k).v) + bkey_inline_data_offset((_k).k)) + static inline bool bkey_extent_is_data(const struct bkey *k) { - return bkey_extent_is_direct_data(k) || - k->type == KEY_TYPE_inline_data || + return bkey_extent_is_direct_data(k) || + bkey_extent_is_inline_data(k) || k->type == KEY_TYPE_reflink_p; } @@ -463,6 +488,7 @@ static inline bool bkey_extent_is_allocation(const struct bkey *k) case KEY_TYPE_reflink_p: case KEY_TYPE_reflink_v: case KEY_TYPE_inline_data: + case KEY_TYPE_indirect_inline_data: return true; default: return false; diff --git a/fs/bcachefs/fs-common.c b/fs/bcachefs/fs-common.c index 878419d40992..503ce1920f39 100644 --- a/fs/bcachefs/fs-common.c +++ b/fs/bcachefs/fs-common.c @@ -34,9 +34,7 @@ int bch2_create_trans(struct btree_trans *trans, u64 dir_inum, if (!name) new_inode->bi_flags |= BCH_INODE_UNLINKED; - ret = bch2_inode_create(trans, new_inode, - BLOCKDEV_INODE_MAX, 0, - &c->unused_inode_hint); + ret = bch2_inode_create(trans, new_inode); if (ret) goto err; diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index 2d08263f3a42..1eb69ed38b10 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -26,6 +26,7 @@ #include <linux/migrate.h> #include <linux/mmu_context.h> #include <linux/pagevec.h> +#include <linux/rmap.h> #include <linux/sched/signal.h> #include <linux/task_io_accounting_ops.h> #include <linux/uio.h> @@ -264,28 +265,13 @@ static inline struct bch_page_state *bch2_page_state(struct page *page) /* for newly allocated pages: */ static void __bch2_page_state_release(struct page *page) { - struct bch_page_state *s = 
__bch2_page_state(page); - - if (!s) - return; - - ClearPagePrivate(page); - set_page_private(page, 0); - put_page(page); - kfree(s); + kfree(detach_page_private(page)); } static void bch2_page_state_release(struct page *page) { - struct bch_page_state *s = bch2_page_state(page); - - if (!s) - return; - - ClearPagePrivate(page); - set_page_private(page, 0); - put_page(page); - kfree(s); + EBUG_ON(!PageLocked(page)); + __bch2_page_state_release(page); } /* for newly allocated pages: */ @@ -299,13 +285,7 @@ static struct bch_page_state *__bch2_page_state_create(struct page *page, return NULL; spin_lock_init(&s->lock); - /* - * migrate_page_move_mapping() assumes that pages with private data - * have their count elevated by 1. - */ - get_page(page); - set_page_private(page, (unsigned long) s); - SetPagePrivate(page); + attach_page_private(page, s); return s; } @@ -603,18 +583,12 @@ int bch2_migrate_page(struct address_space *mapping, struct page *newpage, EBUG_ON(!PageLocked(page)); EBUG_ON(!PageLocked(newpage)); - ret = migrate_page_move_mapping(mapping, newpage, page, NULL, mode, 0); + ret = migrate_page_move_mapping(mapping, newpage, page, 0); if (ret != MIGRATEPAGE_SUCCESS) return ret; - if (PagePrivate(page)) { - ClearPagePrivate(page); - get_page(newpage); - set_page_private(newpage, page_private(page)); - set_page_private(page, 0); - put_page(page); - SetPagePrivate(newpage); - } + if (PagePrivate(page)) + attach_page_private(newpage, detach_page_private(page)); if (mode != MIGRATE_SYNC_NO_COPY) migrate_page_copy(newpage, page); @@ -628,10 +602,10 @@ int bch2_migrate_page(struct address_space *mapping, struct page *newpage, static void bch2_readpages_end_io(struct bio *bio) { + struct bvec_iter_all iter; struct bio_vec *bv; - unsigned i; - bio_for_each_segment_all(bv, bio, i) { + bio_for_each_segment_all(bv, bio, iter) { struct page *page = bv->bv_page; if (!bio->bi_status) { @@ -646,41 +620,33 @@ static void bch2_readpages_end_io(struct bio *bio) bio_put(bio); } -static inline void page_state_init_for_read(struct page *page) -{ - SetPagePrivate(page); - page->private = 0; -} - struct readpages_iter { struct address_space *mapping; struct page **pages; unsigned nr_pages; - unsigned nr_added; unsigned idx; pgoff_t offset; }; static int readpages_iter_init(struct readpages_iter *iter, - struct address_space *mapping, - struct list_head *pages, unsigned nr_pages) + struct readahead_control *ractl) { + unsigned i, nr_pages = readahead_count(ractl); + memset(iter, 0, sizeof(*iter)); - iter->mapping = mapping; - iter->offset = list_last_entry(pages, struct page, lru)->index; + iter->mapping = ractl->mapping; + iter->offset = readahead_index(ractl); + iter->nr_pages = nr_pages; iter->pages = kmalloc_array(nr_pages, sizeof(struct page *), GFP_NOFS); if (!iter->pages) return -ENOMEM; - while (!list_empty(pages)) { - struct page *page = list_last_entry(pages, struct page, lru); - - __bch2_page_state_create(page, __GFP_NOFAIL); - - iter->pages[iter->nr_pages++] = page; - list_del(&page->lru); + __readahead_batch(ractl, iter->pages, nr_pages); + for (i = 0; i < nr_pages; i++) { + __bch2_page_state_create(iter->pages[i], __GFP_NOFAIL); + put_page(iter->pages[i]); } return 0; @@ -688,41 +654,9 @@ static int readpages_iter_init(struct readpages_iter *iter, static inline struct page *readpage_iter_next(struct readpages_iter *iter) { - struct page *page; - unsigned i; - int ret; - - BUG_ON(iter->idx > iter->nr_added); - BUG_ON(iter->nr_added > iter->nr_pages); - - if (iter->idx < iter->nr_added) - goto 
out; - - while (1) { - if (iter->idx == iter->nr_pages) - return NULL; - - ret = add_to_page_cache_lru_vec(iter->mapping, - iter->pages + iter->nr_added, - iter->nr_pages - iter->nr_added, - iter->offset + iter->nr_added, - GFP_NOFS); - if (ret > 0) - break; - - page = iter->pages[iter->nr_added]; - iter->idx++; - iter->nr_added++; - - __bch2_page_state_release(page); - put_page(page); - } - - iter->nr_added += ret; + if (iter->idx >= iter->nr_pages) + return NULL; - for (i = iter->idx; i < iter->nr_added; i++) - put_page(iter->pages[i]); -out: EBUG_ON(iter->pages[iter->idx]->index != iter->offset + iter->idx); return iter->pages[iter->idx]; @@ -783,11 +717,8 @@ static void readpage_bio_extend(struct readpages_iter *iter, if (!get_more) break; - rcu_read_lock(); - page = radix_tree_lookup(&iter->mapping->i_pages, page_offset); - rcu_read_unlock(); - - if (page && !radix_tree_exceptional_entry(page)) + page = xa_load(&iter->mapping->i_pages, page_offset); + if (page && !xa_is_value(page)) break; page = __page_cache_alloc(readahead_gfp_mask(iter->mapping)); @@ -841,18 +772,19 @@ retry: if (ret) break; - bkey_on_stack_reassemble(&sk, c, k); - k = bkey_i_to_s_c(sk.k); - offset_into_extent = iter->pos.offset - bkey_start_offset(k.k); sectors = k.k->size - offset_into_extent; + bkey_on_stack_reassemble(&sk, c, k); + ret = bch2_read_indirect_extent(trans, &offset_into_extent, &sk); if (ret) break; + k = bkey_i_to_s_c(sk.k); + sectors = min(sectors, k.k->size - offset_into_extent); bch2_trans_unlock(trans); @@ -870,7 +802,7 @@ retry: if (bkey_extent_is_allocation(k.k)) bch2_add_page_sectors(&rbio->bio, k); - bch2_read_extent(c, rbio, k, offset_into_extent, flags); + bch2_read_extent(trans, rbio, k, offset_into_extent, flags); if (flags & BCH_READ_LAST_FRAGMENT) break; @@ -890,10 +822,9 @@ retry: bkey_on_stack_exit(&sk, c); } -int bch2_readpages(struct file *file, struct address_space *mapping, - struct list_head *pages, unsigned nr_pages) +void bch2_readahead(struct readahead_control *ractl) { - struct bch_inode_info *inode = to_bch_ei(mapping->host); + struct bch_inode_info *inode = to_bch_ei(ractl->mapping->host); struct bch_fs *c = inode->v.i_sb->s_fs_info; struct bch_io_opts opts = io_opts(c, &inode->ei_inode); struct btree_trans trans; @@ -902,7 +833,7 @@ int bch2_readpages(struct file *file, struct address_space *mapping, struct readpages_iter readpages_iter; int ret; - ret = readpages_iter_init(&readpages_iter, mapping, pages, nr_pages); + ret = readpages_iter_init(&readpages_iter, ractl); BUG_ON(ret); bch2_trans_init(&trans, c, 0, 0); @@ -937,8 +868,6 @@ int bch2_readpages(struct file *file, struct address_space *mapping, bch2_trans_exit(&trans); kfree(readpages_iter.pages); - - return 0; } static void __bchfs_readpage(struct bch_fs *c, struct bch_read_bio *rbio, @@ -1038,32 +967,33 @@ static void bch2_writepage_io_done(struct closure *cl) struct bch_writepage_io, cl); struct bch_fs *c = io->op.c; struct bio *bio = &io->op.wbio.bio; + struct bvec_iter_all iter; struct bio_vec *bvec; - unsigned i, j; + unsigned i; if (io->op.error) { - bio_for_each_segment_all(bvec, bio, i) { + bio_for_each_segment_all(bvec, bio, iter) { struct bch_page_state *s; SetPageError(bvec->bv_page); - mapping_set_error(io->inode->v.i_mapping, -EIO); + mapping_set_error(bvec->bv_page->mapping, -EIO); s = __bch2_page_state(bvec->bv_page); spin_lock(&s->lock); - for (j = 0; j < PAGE_SECTORS; j++) - s->s[j].nr_replicas = 0; + for (i = 0; i < PAGE_SECTORS; i++) + s->s[i].nr_replicas = 0; spin_unlock(&s->lock); } } if 
(io->op.flags & BCH_WRITE_WROTE_DATA_INLINE) { - bio_for_each_segment_all(bvec, bio, i) { + bio_for_each_segment_all(bvec, bio, iter) { struct bch_page_state *s; s = __bch2_page_state(bvec->bv_page); spin_lock(&s->lock); - for (j = 0; j < PAGE_SECTORS; j++) - s->s[j].nr_replicas = 0; + for (i = 0; i < PAGE_SECTORS; i++) + s->s[i].nr_replicas = 0; spin_unlock(&s->lock); } } @@ -1087,7 +1017,7 @@ static void bch2_writepage_io_done(struct closure *cl) */ i_sectors_acct(c, io->inode, NULL, io->op.i_sectors_delta); - bio_for_each_segment_all(bvec, bio, i) { + bio_for_each_segment_all(bvec, bio, iter) { struct bch_page_state *s = __bch2_page_state(bvec->bv_page); if (atomic_dec_and_test(&s->write_count)) @@ -1241,7 +1171,7 @@ do_io: if (w->io && (w->io->op.res.nr_replicas != nr_replicas_this_write || - bio_full(&w->io->op.wbio.bio) || + bio_full(&w->io->op.wbio.bio, PAGE_SIZE) || w->io->op.wbio.bio.bi_iter.bi_size + (sectors << 9) >= (BIO_MAX_PAGES * PAGE_SIZE) || bio_end_sector(&w->io->op.wbio.bio) != sector)) @@ -1810,8 +1740,9 @@ static long bch2_dio_write_loop(struct dio_write *dio) struct bch_inode_info *inode = file_bch_inode(req->ki_filp); struct bch_fs *c = inode->v.i_sb->s_fs_info; struct bio *bio = &dio->op.wbio.bio; + struct bvec_iter_all iter; struct bio_vec *bv; - unsigned i, unaligned; + unsigned unaligned; bool sync = dio->sync; long ret; @@ -1820,7 +1751,7 @@ static long bch2_dio_write_loop(struct dio_write *dio) while (1) { if (kthread) - use_mm(dio->mm); + kthread_use_mm(dio->mm); BUG_ON(current->faults_disabled_mapping); current->faults_disabled_mapping = mapping; @@ -1828,7 +1759,7 @@ static long bch2_dio_write_loop(struct dio_write *dio) current->faults_disabled_mapping = NULL; if (kthread) - unuse_mm(dio->mm); + kthread_unuse_mm(dio->mm); if (unlikely(ret < 0)) goto err; @@ -1842,7 +1773,7 @@ static long bch2_dio_write_loop(struct dio_write *dio) * bio_iov_iter_get_pages was only able to get < * blocksize worth of pages: */ - bio_for_each_segment_all(bv, bio, i) + bio_for_each_segment_all(bv, bio, iter) put_page(bv->bv_page); ret = -EFAULT; goto err; @@ -1905,7 +1836,7 @@ loop: i_size_write(&inode->v, req->ki_pos); spin_unlock(&inode->v.i_lock); - bio_for_each_segment_all(bv, bio, i) + bio_for_each_segment_all(bv, bio, iter) put_page(bv->bv_page); if (!dio->iter.count || dio->op.error) break; @@ -2191,6 +2122,12 @@ static int __bch2_truncate_page(struct bch_inode_info *inode, ret = bch2_get_page_disk_reservation(c, inode, page, false); BUG_ON(ret); + /* + * This removes any writeable userspace mappings; we need to force + * .page_mkwrite to be called again before any mmapped writes, to + * redirty the full page: + */ + page_mkclean(page); __set_page_dirty_nobuffers(page); unlock: unlock_page(page); @@ -2816,235 +2753,6 @@ static void mark_range_unallocated(struct bch_inode_info *inode, } while (index <= end_index); } -static int generic_access_check_limits(struct file *file, loff_t pos, - loff_t *count) -{ - struct inode *inode = file->f_mapping->host; - loff_t max_size = inode->i_sb->s_maxbytes; - - if (!(file->f_flags & O_LARGEFILE)) - max_size = MAX_NON_LFS; - - if (unlikely(pos >= max_size)) - return -EFBIG; - *count = min(*count, max_size - pos); - return 0; -} - -static int generic_write_check_limits(struct file *file, loff_t pos, - loff_t *count) -{ - loff_t limit = rlimit(RLIMIT_FSIZE); - - if (limit != RLIM_INFINITY) { - if (pos >= limit) { - send_sig(SIGXFSZ, current, 0); - return -EFBIG; - } - *count = min(*count, limit - pos); - } - - return 
generic_access_check_limits(file, pos, count); -} - -static int generic_remap_checks(struct file *file_in, loff_t pos_in, - struct file *file_out, loff_t pos_out, - loff_t *req_count, unsigned int remap_flags) -{ - struct inode *inode_in = file_in->f_mapping->host; - struct inode *inode_out = file_out->f_mapping->host; - uint64_t count = *req_count; - uint64_t bcount; - loff_t size_in, size_out; - loff_t bs = inode_out->i_sb->s_blocksize; - int ret; - - /* The start of both ranges must be aligned to an fs block. */ - if (!IS_ALIGNED(pos_in, bs) || !IS_ALIGNED(pos_out, bs)) - return -EINVAL; - - /* Ensure offsets don't wrap. */ - if (pos_in + count < pos_in || pos_out + count < pos_out) - return -EINVAL; - - size_in = i_size_read(inode_in); - size_out = i_size_read(inode_out); - - /* Dedupe requires both ranges to be within EOF. */ - if ((remap_flags & REMAP_FILE_DEDUP) && - (pos_in >= size_in || pos_in + count > size_in || - pos_out >= size_out || pos_out + count > size_out)) - return -EINVAL; - - /* Ensure the infile range is within the infile. */ - if (pos_in >= size_in) - return -EINVAL; - count = min(count, size_in - (uint64_t)pos_in); - - ret = generic_access_check_limits(file_in, pos_in, &count); - if (ret) - return ret; - - ret = generic_write_check_limits(file_out, pos_out, &count); - if (ret) - return ret; - - /* - * If the user wanted us to link to the infile's EOF, round up to the - * next block boundary for this check. - * - * Otherwise, make sure the count is also block-aligned, having - * already confirmed the starting offsets' block alignment. - */ - if (pos_in + count == size_in) { - bcount = ALIGN(size_in, bs) - pos_in; - } else { - if (!IS_ALIGNED(count, bs)) - count = ALIGN_DOWN(count, bs); - bcount = count; - } - - /* Don't allow overlapped cloning within the same file. */ - if (inode_in == inode_out && - pos_out + bcount > pos_in && - pos_out < pos_in + bcount) - return -EINVAL; - - /* - * We shortened the request but the caller can't deal with that, so - * bounce the request back to userspace. - */ - if (*req_count != count && !(remap_flags & REMAP_FILE_CAN_SHORTEN)) - return -EINVAL; - - *req_count = count; - return 0; -} - -static int generic_remap_check_len(struct inode *inode_in, - struct inode *inode_out, - loff_t pos_out, - loff_t *len, - unsigned int remap_flags) -{ - u64 blkmask = i_blocksize(inode_in) - 1; - loff_t new_len = *len; - - if ((*len & blkmask) == 0) - return 0; - - if ((remap_flags & REMAP_FILE_DEDUP) || - pos_out + *len < i_size_read(inode_out)) - new_len &= ~blkmask; - - if (new_len == *len) - return 0; - - if (remap_flags & REMAP_FILE_CAN_SHORTEN) { - *len = new_len; - return 0; - } - - return (remap_flags & REMAP_FILE_DEDUP) ? -EBADE : -EINVAL; -} - -static int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in, - struct file *file_out, loff_t pos_out, - loff_t *len, unsigned int remap_flags) -{ - struct inode *inode_in = file_inode(file_in); - struct inode *inode_out = file_inode(file_out); - bool same_inode = (inode_in == inode_out); - int ret; - - /* Don't touch certain kinds of inodes */ - if (IS_IMMUTABLE(inode_out)) - return -EPERM; - - if (IS_SWAPFILE(inode_in) || IS_SWAPFILE(inode_out)) - return -ETXTBSY; - - /* Don't reflink dirs, pipes, sockets... */ - if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode)) - return -EISDIR; - if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode)) - return -EINVAL; - - /* Zero length dedupe exits immediately; reflink goes to EOF. 
*/ - if (*len == 0) { - loff_t isize = i_size_read(inode_in); - - if ((remap_flags & REMAP_FILE_DEDUP) || pos_in == isize) - return 0; - if (pos_in > isize) - return -EINVAL; - *len = isize - pos_in; - if (*len == 0) - return 0; - } - - /* Check that we don't violate system file offset limits. */ - ret = generic_remap_checks(file_in, pos_in, file_out, pos_out, len, - remap_flags); - if (ret) - return ret; - - /* Wait for the completion of any pending IOs on both files */ - inode_dio_wait(inode_in); - if (!same_inode) - inode_dio_wait(inode_out); - - ret = filemap_write_and_wait_range(inode_in->i_mapping, - pos_in, pos_in + *len - 1); - if (ret) - return ret; - - ret = filemap_write_and_wait_range(inode_out->i_mapping, - pos_out, pos_out + *len - 1); - if (ret) - return ret; - - /* - * Check that the extents are the same. - */ - if (remap_flags & REMAP_FILE_DEDUP) { - bool is_same = false; - - ret = vfs_dedupe_file_range_compare(inode_in, pos_in, - inode_out, pos_out, *len, &is_same); - if (ret) - return ret; - if (!is_same) - return -EBADE; - } - - ret = generic_remap_check_len(inode_in, inode_out, pos_out, len, - remap_flags); - if (ret) - return ret; - - /* If can't alter the file contents, we're done. */ - if (!(remap_flags & REMAP_FILE_DEDUP)) { - /* Update the timestamps, since we can alter file contents. */ - if (!(file_out->f_mode & FMODE_NOCMTIME)) { - ret = file_update_time(file_out); - if (ret) - return ret; - } - - /* - * Clear the security bits if the process is not being run by - * root. This keeps people from modifying setuid and setgid - * binaries. - */ - ret = file_remove_privs(file_out); - if (ret) - return ret; - } - - return 0; -} - loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src, struct file *file_dst, loff_t pos_dst, loff_t len, unsigned remap_flags) @@ -3240,7 +2948,7 @@ static loff_t page_hole_offset(struct address_space *mapping, loff_t offset) loff_t ret = -1; page = find_lock_entry(mapping, index); - if (!page || radix_tree_exception(page)) + if (!page || xa_is_value(page)) return offset; pg_offset = __page_hole_offset(page, offset & (PAGE_SIZE - 1)); diff --git a/fs/bcachefs/fs-io.h b/fs/bcachefs/fs-io.h index 1b593ea707d5..2537a3d25ede 100644 --- a/fs/bcachefs/fs-io.h +++ b/fs/bcachefs/fs-io.h @@ -19,8 +19,7 @@ int bch2_writepage(struct page *, struct writeback_control *); int bch2_readpage(struct file *, struct page *); int bch2_writepages(struct address_space *, struct writeback_control *); -int bch2_readpages(struct file *, struct address_space *, - struct list_head *, unsigned); +void bch2_readahead(struct readahead_control *); int bch2_write_begin(struct file *, struct address_space *, loff_t, unsigned, unsigned, struct page **, void **); @@ -35,10 +34,6 @@ int bch2_fsync(struct file *, loff_t, loff_t, int); int bch2_truncate(struct bch_inode_info *, struct iattr *); long bch2_fallocate_dispatch(struct file *, int, loff_t, loff_t); -#define REMAP_FILE_ADVISORY (0) -#define REMAP_FILE_DEDUP (1 << 0) -#define REMAP_FILE_CAN_SHORTEN (1 << 1) - loff_t bch2_remap_file_range(struct file *, loff_t, struct file *, loff_t, loff_t, unsigned); diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 071c3a3de98d..3ac57ba29e9f 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -38,9 +38,15 @@ static void bch2_vfs_inode_init(struct bch_fs *, struct bch_inode_info *, struct bch_inode_unpacked *); -static void journal_seq_copy(struct bch_inode_info *dst, +static void journal_seq_copy(struct bch_fs *c, + struct bch_inode_info *dst, u64 journal_seq) { 
+ /* + * atomic64_cmpxchg has a fallback for archs that don't support it, + * cmpxchg does not: + */ + atomic64_t *dst_seq = (void *) &dst->ei_journal_seq; u64 old, v = READ_ONCE(dst->ei_journal_seq); do { @@ -48,7 +54,9 @@ static void journal_seq_copy(struct bch_inode_info *dst, if (old >= journal_seq) break; - } while ((v = cmpxchg(&dst->ei_journal_seq, old, journal_seq)) != old); + } while ((v = atomic64_cmpxchg(dst_seq, old, journal_seq)) != old); + + bch2_journal_set_has_inum(&c->journal, dst->v.i_ino, journal_seq); } static void __pagecache_lock_put(struct pagecache_lock *lock, long i) @@ -222,6 +230,13 @@ struct inode *bch2_vfs_inode_get(struct bch_fs *c, u64 inum) return &inode->v; } +static int inum_test(struct inode *inode, void *p) +{ + unsigned long *ino = p; + + return *ino == inode->i_ino; +} + static struct bch_inode_info * __bch2_create(struct bch_inode_info *dir, struct dentry *dentry, umode_t mode, dev_t rdev, bool tmpfile) @@ -285,12 +300,12 @@ err_before_quota: if (!tmpfile) { bch2_inode_update_after_write(c, dir, &dir_u, ATTR_MTIME|ATTR_CTIME); - journal_seq_copy(dir, journal_seq); + journal_seq_copy(c, dir, journal_seq); mutex_unlock(&dir->ei_update_lock); } bch2_vfs_inode_init(c, inode, &inode_u); - journal_seq_copy(inode, journal_seq); + journal_seq_copy(c, inode, journal_seq); set_cached_acl(&inode->v, ACL_TYPE_ACCESS, acl); set_cached_acl(&inode->v, ACL_TYPE_DEFAULT, default_acl); @@ -301,13 +316,17 @@ err_before_quota: * thread pulling the inode in and modifying it: */ - old = to_bch_ei(insert_inode_locked2(&inode->v)); - if (unlikely(old)) { + inode->v.i_state |= I_CREATING; + old = to_bch_ei(inode_insert5(&inode->v, inode->v.i_ino, + inum_test, NULL, &inode->v.i_ino)); + BUG_ON(!old); + + if (unlikely(old != inode)) { /* * We raced, another process pulled the new inode into cache * before us: */ - journal_seq_copy(old, journal_seq); + journal_seq_copy(c, old, journal_seq); make_bad_inode(&inode->v); iput(&inode->v); @@ -401,7 +420,7 @@ static int __bch2_link(struct bch_fs *c, if (likely(!ret)) { BUG_ON(inode_u.bi_inum != inode->v.i_ino); - journal_seq_copy(inode, dir->ei_journal_seq); + journal_seq_copy(c, inode, dir->ei_journal_seq); bch2_inode_update_after_write(c, dir, &dir_u, ATTR_MTIME|ATTR_CTIME); bch2_inode_update_after_write(c, inode, &inode_u, ATTR_CTIME); @@ -458,7 +477,7 @@ static int bch2_unlink(struct inode *vdir, struct dentry *dentry) if (likely(!ret)) { BUG_ON(inode_u.bi_inum != inode->v.i_ino); - journal_seq_copy(inode, dir->ei_journal_seq); + journal_seq_copy(c, inode, dir->ei_journal_seq); bch2_inode_update_after_write(c, dir, &dir_u, ATTR_MTIME|ATTR_CTIME); bch2_inode_update_after_write(c, inode, &inode_u, @@ -493,7 +512,7 @@ static int bch2_symlink(struct inode *vdir, struct dentry *dentry, if (unlikely(ret)) goto err; - journal_seq_copy(dir, inode->ei_journal_seq); + journal_seq_copy(c, dir, inode->ei_journal_seq); ret = __bch2_link(c, inode, dir, dentry); if (unlikely(ret)) @@ -591,22 +610,22 @@ retry: bch2_inode_update_after_write(c, src_dir, &src_dir_u, ATTR_MTIME|ATTR_CTIME); - journal_seq_copy(src_dir, journal_seq); + journal_seq_copy(c, src_dir, journal_seq); if (src_dir != dst_dir) { bch2_inode_update_after_write(c, dst_dir, &dst_dir_u, ATTR_MTIME|ATTR_CTIME); - journal_seq_copy(dst_dir, journal_seq); + journal_seq_copy(c, dst_dir, journal_seq); } bch2_inode_update_after_write(c, src_inode, &src_inode_u, ATTR_CTIME); - journal_seq_copy(src_inode, journal_seq); + journal_seq_copy(c, src_inode, journal_seq); if (dst_inode) { 
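
An aside on the __bch2_create() hunk above: the new code inserts the freshly allocated inode into the inode hash with inode_insert5(); if another thread raced it and inserted the same inum first, the winner is returned instead, and the loser hands its journal sequence to the winner before discarding its own copy. A loose userspace analogy of that insert-or-adopt pattern, using a toy fixed-size table (nothing below is kernel API):

	#include <pthread.h>
	#include <stddef.h>
	#include <stdint.h>

	struct obj { uint64_t key; uint64_t seq; };

	static pthread_mutex_t table_lock = PTHREAD_MUTEX_INITIALIZER;
	static struct obj *table[256];	/* toy: one slot per key, ignoring collisions */

	/*
	 * Insert o, or adopt the existing entry if someone beat us to it; a
	 * loser passes its seq on to the winner, much like the
	 * journal_seq_copy(c, old, journal_seq) call above.
	 */
	static struct obj *insert_or_adopt(struct obj *o)
	{
		size_t slot = (size_t) (o->key % 256);
		struct obj *ret;

		pthread_mutex_lock(&table_lock);
		ret = table[slot] ?: (table[slot] = o);
		if (ret != o && ret->seq < o->seq)
			ret->seq = o->seq;
		pthread_mutex_unlock(&table_lock);
		return ret;
	}

A caller that gets back ret != o frees its own object, mirroring the make_bad_inode()/iput() branch in the patch.
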
bch2_inode_update_after_write(c, dst_inode, &dst_inode_u, ATTR_CTIME); - journal_seq_copy(dst_inode, journal_seq); + journal_seq_copy(c, dst_inode, journal_seq); } err: bch2_trans_exit(&trans); @@ -804,7 +823,7 @@ static int bch2_fill_extent(struct bch_fs *c, struct fiemap_extent_info *info, struct bkey_s_c k, unsigned flags) { - if (bkey_extent_is_data(k.k)) { + if (bkey_extent_is_direct_data(k.k)) { struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); const union bch_extent_entry *entry; struct extent_ptr_decoded p; @@ -835,6 +854,12 @@ static int bch2_fill_extent(struct bch_fs *c, } return 0; + } else if (bkey_extent_is_inline_data(k.k)) { + return fiemap_fill_next_extent(info, + bkey_start_offset(k.k) << 9, + 0, k.k->size << 9, + flags| + FIEMAP_EXTENT_DATA_INLINE); } else if (k.k->type == KEY_TYPE_reservation) { return fiemap_fill_next_extent(info, bkey_start_offset(k.k) << 9, @@ -861,6 +886,10 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info, bool have_extent = false; int ret = 0; + ret = fiemap_prep(&ei->v, info, start, &len, FIEMAP_FLAG_SYNC); + if (ret) + return ret; + if (start + len < start) return -EINVAL; @@ -880,27 +909,26 @@ retry: continue; } - bkey_on_stack_realloc(&cur, c, k.k->u64s); - bkey_on_stack_realloc(&prev, c, k.k->u64s); - bkey_reassemble(cur.k, k); - k = bkey_i_to_s_c(cur.k); - offset_into_extent = iter->pos.offset - bkey_start_offset(k.k); sectors = k.k->size - offset_into_extent; + bkey_on_stack_reassemble(&cur, c, k); + ret = bch2_read_indirect_extent(&trans, &offset_into_extent, &cur); if (ret) break; + k = bkey_i_to_s_c(cur.k); + bkey_on_stack_realloc(&prev, c, k.k->u64s); + sectors = min(sectors, k.k->size - offset_into_extent); - if (offset_into_extent) - bch2_cut_front(POS(k.k->p.inode, - bkey_start_offset(k.k) + - offset_into_extent), - cur.k); + bch2_cut_front(POS(k.k->p.inode, + bkey_start_offset(k.k) + + offset_into_extent), + cur.k); bch2_key_resize(&cur.k->k, sectors); cur.k->k.p = iter->pos; cur.k->k.p.offset += cur.k->k.size; @@ -915,10 +943,8 @@ retry: bkey_copy(prev.k, cur.k); have_extent = true; - if (k.k->type == KEY_TYPE_reflink_v) - bch2_btree_iter_set_pos(iter, k.k->p); - else - bch2_btree_iter_next(iter); + bch2_btree_iter_set_pos(iter, + POS(iter->pos.inode, iter->pos.offset + sectors)); } if (ret == -EINTR) @@ -967,15 +993,6 @@ static int bch2_vfs_readdir(struct file *file, struct dir_context *ctx) return bch2_readdir(c, inode->v.i_ino, ctx); } -static int bch2_clone_file_range(struct file *file_src, loff_t pos_src, - struct file *file_dst, loff_t pos_dst, - u64 len) -{ - return bch2_remap_file_range(file_src, pos_src, - file_dst, pos_dst, - len, 0); -} - static const struct file_operations bch_file_operations = { .llseek = bch2_llseek, .read_iter = bch2_read_iter, @@ -993,7 +1010,7 @@ static const struct file_operations bch_file_operations = { #ifdef CONFIG_COMPAT .compat_ioctl = bch2_compat_fs_ioctl, #endif - .clone_file_range = bch2_clone_file_range, + .remap_file_range = bch2_remap_file_range, }; static const struct inode_operations bch_file_inode_operations = { @@ -1063,7 +1080,7 @@ static const struct address_space_operations bch_address_space_operations = { .writepage = bch2_writepage, .readpage = bch2_readpage, .writepages = bch2_writepages, - .readpages = bch2_readpages, + .readahead = bch2_readahead, .set_page_dirty = __set_page_dirty_nobuffers, .write_begin = bch2_write_begin, .write_end = bch2_write_end, @@ -1239,6 +1256,11 @@ static int bch2_statfs(struct dentry *dentry, struct kstatfs *buf) struct 
bch_fs *c = sb->s_fs_info; struct bch_fs_usage_short usage = bch2_fs_usage_read_short(c); unsigned shift = sb->s_blocksize_bits - 9; + /* + * this assumes inodes take up 64 bytes, which is a decent average + * number: + */ + u64 avail_inodes = ((usage.capacity - usage.used) << 3); u64 fsid; buf->f_type = BCACHEFS_STATFS_MAGIC; @@ -1246,8 +1268,9 @@ static int bch2_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_blocks = usage.capacity >> shift; buf->f_bfree = (usage.capacity - usage.used) >> shift; buf->f_bavail = buf->f_bfree; - buf->f_files = 0; - buf->f_ffree = 0; + + buf->f_files = usage.nr_inodes + avail_inodes; + buf->f_ffree = avail_inodes; fsid = le64_to_cpup((void *) c->sb.user_uuid.b) ^ le64_to_cpup((void *) c->sb.user_uuid.b + sizeof(u64)); @@ -1283,91 +1306,36 @@ static struct bch_fs *bch2_path_to_fs(const char *dev) c = bch2_bdev_to_fs(bdev); bdput(bdev); - return c ?: ERR_PTR(-ENOENT); -} - -static struct bch_fs *__bch2_open_as_blockdevs(const char *dev_name, char * const *devs, - unsigned nr_devs, struct bch_opts opts) -{ - struct bch_fs *c, *c1, *c2; - size_t i; - - if (!nr_devs) - return ERR_PTR(-EINVAL); - - c = bch2_fs_open(devs, nr_devs, opts); - - if (IS_ERR(c) && PTR_ERR(c) == -EBUSY) { - /* - * Already open? - * Look up each block device, make sure they all belong to a - * filesystem and they all belong to the _same_ filesystem - */ - - c1 = bch2_path_to_fs(devs[0]); - if (IS_ERR(c1)) - return c; - - for (i = 1; i < nr_devs; i++) { - c2 = bch2_path_to_fs(devs[i]); - if (!IS_ERR(c2)) - closure_put(&c2->cl); - - if (c1 != c2) { - closure_put(&c1->cl); - return c; - } - } - - c = c1; - } - - if (IS_ERR(c)) - return c; - - down_write(&c->state_lock); - - if (!test_bit(BCH_FS_STARTED, &c->flags)) { - up_write(&c->state_lock); + if (c) closure_put(&c->cl); - pr_err("err mounting %s: incomplete filesystem", dev_name); - return ERR_PTR(-EINVAL); - } - - up_write(&c->state_lock); - - set_bit(BCH_FS_BDEV_MOUNTED, &c->flags); - return c; + return c ?: ERR_PTR(-ENOENT); } -static struct bch_fs *bch2_open_as_blockdevs(const char *_dev_name, - struct bch_opts opts) +static char **split_devs(const char *_dev_name, unsigned *nr) { char *dev_name = NULL, **devs = NULL, *s; - struct bch_fs *c = ERR_PTR(-ENOMEM); size_t i, nr_devs = 0; dev_name = kstrdup(_dev_name, GFP_KERNEL); if (!dev_name) - goto err; + return NULL; for (s = dev_name; s; s = strchr(s + 1, ':')) nr_devs++; - devs = kcalloc(nr_devs, sizeof(const char *), GFP_KERNEL); - if (!devs) - goto err; + devs = kcalloc(nr_devs + 1, sizeof(const char *), GFP_KERNEL); + if (!devs) { + kfree(dev_name); + return NULL; + } for (i = 0, s = dev_name; s; (s = strchr(s, ':')) && (*s++ = '\0')) devs[i++] = s; - c = __bch2_open_as_blockdevs(_dev_name, devs, nr_devs, opts); -err: - kfree(devs); - kfree(dev_name); - return c; + *nr = nr_devs; + return devs; } static int bch2_remount(struct super_block *sb, int *flags, char *data) @@ -1378,7 +1346,7 @@ static int bch2_remount(struct super_block *sb, int *flags, char *data) opt_set(opts, read_only, (*flags & SB_RDONLY) != 0); - ret = bch2_parse_mount_opts(&opts, data); + ret = bch2_parse_mount_opts(c, &opts, data); if (ret) return ret; @@ -1454,6 +1422,13 @@ static int bch2_show_options(struct seq_file *seq, struct dentry *root) return 0; } +static void bch2_put_super(struct super_block *sb) +{ + struct bch_fs *c = sb->s_fs_info; + + __bch2_fs_stop(c); +} + static const struct super_operations bch_super_operations = { .alloc_inode = bch2_alloc_inode, .destroy_inode = 
bch2_destroy_inode, @@ -1464,24 +1439,39 @@ static const struct super_operations bch_super_operations = { .show_devname = bch2_show_devname, .show_options = bch2_show_options, .remount_fs = bch2_remount, -#if 0 .put_super = bch2_put_super, +#if 0 .freeze_fs = bch2_freeze, .unfreeze_fs = bch2_unfreeze, #endif }; -static int bch2_test_super(struct super_block *s, void *data) -{ - return s->s_fs_info == data; -} - static int bch2_set_super(struct super_block *s, void *data) { s->s_fs_info = data; return 0; } +static int bch2_noset_super(struct super_block *s, void *data) +{ + return -EBUSY; +} + +static int bch2_test_super(struct super_block *s, void *data) +{ + struct bch_fs *c = s->s_fs_info; + struct bch_fs **devs = data; + unsigned i; + + if (!c) + return false; + + for (i = 0; devs[i]; i++) + if (c != devs[i]) + return false; + return true; +} + static struct dentry *bch2_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data) { @@ -1490,30 +1480,65 @@ static struct dentry *bch2_mount(struct file_system_type *fs_type, struct super_block *sb; struct inode *vinode; struct bch_opts opts = bch2_opts_empty(); - unsigned i; + char **devs; + struct bch_fs **devs_to_fs = NULL; + unsigned i, nr_devs; int ret; opt_set(opts, read_only, (flags & SB_RDONLY) != 0); - ret = bch2_parse_mount_opts(&opts, data); + ret = bch2_parse_mount_opts(NULL, &opts, data); if (ret) return ERR_PTR(ret); - c = bch2_open_as_blockdevs(dev_name, opts); - if (IS_ERR(c)) - return ERR_CAST(c); + devs = split_devs(dev_name, &nr_devs); + if (!devs) + return ERR_PTR(-ENOMEM); - sb = sget(fs_type, bch2_test_super, bch2_set_super, flags|SB_NOSEC, c); - if (IS_ERR(sb)) { - closure_put(&c->cl); - return ERR_CAST(sb); + devs_to_fs = kcalloc(nr_devs + 1, sizeof(void *), GFP_KERNEL); + if (!devs_to_fs) { + sb = ERR_PTR(-ENOMEM); + goto got_sb; } - BUG_ON(sb->s_fs_info != c); + for (i = 0; i < nr_devs; i++) + devs_to_fs[i] = bch2_path_to_fs(devs[i]); - if (sb->s_root) { - closure_put(&c->cl); + sb = sget(fs_type, bch2_test_super, bch2_noset_super, + flags|SB_NOSEC, devs_to_fs); + if (!IS_ERR(sb)) + goto got_sb; + c = bch2_fs_open(devs, nr_devs, opts); + if (IS_ERR(c)) { + sb = ERR_CAST(c); + goto got_sb; + } + + /* Some options can't be parsed until after the fs is started: */ + ret = bch2_parse_mount_opts(c, &opts, data); + if (ret) { + bch2_fs_stop(c); + sb = ERR_PTR(ret); + goto got_sb; + } + + bch2_opts_apply(&c->opts, opts); + + sb = sget(fs_type, NULL, bch2_set_super, flags|SB_NOSEC, c); + if (IS_ERR(sb)) + bch2_fs_stop(c); +got_sb: + kfree(devs_to_fs); + kfree(devs[0]); + kfree(devs); + + if (IS_ERR(sb)) + return ERR_CAST(sb); + + c = sb->s_fs_info; + + if (sb->s_root) { if ((flags ^ sb->s_flags) & SB_RDONLY) { ret = -EBUSY; goto err_put_super; @@ -1540,9 +1565,7 @@ static struct dentry *bch2_mount(struct file_system_type *fs_type, if (ret) goto err_put_super; - sb->s_bdi->congested_fn = bch2_congested; - sb->s_bdi->congested_data = c; - sb->s_bdi->ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_SIZE; + sb->s_bdi->ra_pages = VM_READAHEAD_PAGES; for_each_online_member(ca, c, i) { struct block_device *bdev = ca->disk_sb.bdev; @@ -1588,11 +1611,7 @@ static void bch2_kill_sb(struct super_block *sb) struct bch_fs *c = sb->s_fs_info; generic_shutdown_super(sb); - - if (test_bit(BCH_FS_BDEV_MOUNTED, &c->flags)) - bch2_fs_stop(c); - else - closure_put(&c->cl); + bch2_fs_free(c); } static struct file_system_type bcache_fs_type = { diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 5a6df3d1973a..0c5035270846 
100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -537,7 +537,7 @@ retry: bch2_trans_unlock(&trans); - bch2_inode_pack(&p, &w.inode); + bch2_inode_pack(c, &p, &w.inode); ret = bch2_btree_insert(c, BTREE_ID_INODES, &p.inode.k_i, NULL, NULL, @@ -808,7 +808,7 @@ create_root: 0, NULL); root_inode->bi_inum = BCACHEFS_ROOT_INO; - bch2_inode_pack(&packed, root_inode); + bch2_inode_pack(c, &packed, root_inode); return bch2_btree_insert(c, BTREE_ID_INODES, &packed.inode.k_i, NULL, NULL, @@ -866,36 +866,22 @@ create_lostfound: return ret; } -struct inode_bitmap { - unsigned long *bits; - size_t size; -}; +typedef GENRADIX(unsigned long) inode_bitmap; -static inline bool inode_bitmap_test(struct inode_bitmap *b, size_t nr) +static inline bool inode_bitmap_test(inode_bitmap *b, size_t nr) { - return nr < b->size ? test_bit(nr, b->bits) : false; + unsigned long *w = genradix_ptr(b, nr / BITS_PER_LONG); + return w ? test_bit(nr & (BITS_PER_LONG - 1), w) : false; } -static inline int inode_bitmap_set(struct inode_bitmap *b, size_t nr) +static inline int inode_bitmap_set(inode_bitmap *b, size_t nr) { - if (nr >= b->size) { - size_t new_size = max_t(size_t, max_t(size_t, - PAGE_SIZE * 8, - b->size * 2), - nr + 1); - void *n; - - new_size = roundup_pow_of_two(new_size); - n = krealloc(b->bits, new_size / 8, GFP_KERNEL|__GFP_ZERO); - if (!n) { - return -ENOMEM; - } + unsigned long *w = genradix_ptr_alloc(b, nr / BITS_PER_LONG, GFP_KERNEL); - b->bits = n; - b->size = new_size; - } + if (!w) + return -ENOMEM; - __set_bit(nr, b->bits); + *w |= 1UL << (nr & (BITS_PER_LONG - 1)); return 0; } @@ -934,7 +920,7 @@ noinline_for_stack static int check_directory_structure(struct bch_fs *c, struct bch_inode_unpacked *lostfound_inode) { - struct inode_bitmap dirs_done = { NULL, 0 }; + inode_bitmap dirs_done; struct pathbuf path = { 0, 0, NULL }; struct pathbuf_entry *e; struct btree_trans trans; @@ -951,6 +937,7 @@ static int check_directory_structure(struct bch_fs *c, /* DFS: */ restart_dfs: + genradix_init(&dirs_done); had_unreachable = false; ret = inode_bitmap_set(&dirs_done, BCACHEFS_ROOT_INO); @@ -1057,7 +1044,7 @@ retry: if (had_unreachable) { bch_info(c, "reattached unreachable directories, restarting pass to check for loops"); - kfree(dirs_done.bits); + genradix_free(&dirs_done); kfree(path.entries); memset(&dirs_done, 0, sizeof(dirs_done)); memset(&path, 0, sizeof(path)); @@ -1066,7 +1053,7 @@ retry: err: fsck_err: ret = bch2_trans_exit(&trans) ?: ret; - kfree(dirs_done.bits); + genradix_free(&dirs_done); kfree(path.entries); return ret; } @@ -1326,7 +1313,7 @@ static int check_inode(struct btree_trans *trans, if (do_update) { struct bkey_inode_buf p; - bch2_inode_pack(&p, &u); + bch2_inode_pack(c, &p, &u); ret = __bch2_trans_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL| diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index 7d20f082ad45..42371de7f72a 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -1,12 +1,14 @@ // SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" +#include "btree_key_cache.h" #include "bkey_methods.h" #include "btree_update.h" #include "error.h" #include "extents.h" #include "inode.h" #include "str_hash.h" +#include "varint.h" #include <linux/random.h> @@ -88,22 +90,17 @@ static int inode_decode_field(const u8 *in, const u8 *end, return bytes; } -void bch2_inode_pack(struct bkey_inode_buf *packed, - const struct bch_inode_unpacked *inode) +static noinline void bch2_inode_pack_v1(struct bkey_inode_buf *packed, + const struct bch_inode_unpacked *inode) { - u8 
*out = packed->inode.v.fields; + struct bkey_i_inode *k = &packed->inode; + u8 *out = k->v.fields; u8 *end = (void *) &packed[1]; u8 *last_nonzero_field = out; unsigned nr_fields = 0, last_nonzero_fieldnr = 0; unsigned bytes; - bkey_inode_init(&packed->inode.k_i); - packed->inode.k.p.offset = inode->bi_inum; - packed->inode.v.bi_hash_seed = inode->bi_hash_seed; - packed->inode.v.bi_flags = cpu_to_le32(inode->bi_flags); - packed->inode.v.bi_mode = cpu_to_le16(inode->bi_mode); - -#define x(_name, _bits) \ +#define x(_name, _bits) \ out += inode_encode_field(out, end, 0, inode->_name); \ nr_fields++; \ \ @@ -122,7 +119,69 @@ void bch2_inode_pack(struct bkey_inode_buf *packed, set_bkey_val_bytes(&packed->inode.k, bytes); memset_u64s_tail(&packed->inode.v, 0, bytes); - SET_INODE_NR_FIELDS(&packed->inode.v, nr_fields); + SET_INODE_NR_FIELDS(&k->v, nr_fields); +} + +static void bch2_inode_pack_v2(struct bkey_inode_buf *packed, + const struct bch_inode_unpacked *inode) +{ + struct bkey_i_inode *k = &packed->inode; + u8 *out = k->v.fields; + u8 *end = (void *) &packed[1]; + u8 *last_nonzero_field = out; + unsigned nr_fields = 0, last_nonzero_fieldnr = 0; + unsigned bytes; + int ret; + +#define x(_name, _bits) \ + nr_fields++; \ + \ + if (inode->_name) { \ + ret = bch2_varint_encode(out, inode->_name); \ + out += ret; \ + \ + if (_bits > 64) \ + *out++ = 0; \ + \ + last_nonzero_field = out; \ + last_nonzero_fieldnr = nr_fields; \ + } else { \ + *out++ = 0; \ + \ + if (_bits > 64) \ + *out++ = 0; \ + } + + BCH_INODE_FIELDS() +#undef x + BUG_ON(out > end); + + out = last_nonzero_field; + nr_fields = last_nonzero_fieldnr; + + bytes = out - (u8 *) &packed->inode.v; + set_bkey_val_bytes(&packed->inode.k, bytes); + memset_u64s_tail(&packed->inode.v, 0, bytes); + + SET_INODE_NR_FIELDS(&k->v, nr_fields); +} + +void bch2_inode_pack(struct bch_fs *c, + struct bkey_inode_buf *packed, + const struct bch_inode_unpacked *inode) +{ + bkey_inode_init(&packed->inode.k_i); + packed->inode.k.p.offset = inode->bi_inum; + packed->inode.v.bi_hash_seed = inode->bi_hash_seed; + packed->inode.v.bi_flags = cpu_to_le32(inode->bi_flags); + packed->inode.v.bi_mode = cpu_to_le16(inode->bi_mode); + + if (c->sb.features & (1ULL << BCH_FEATURE_new_varint)) { + SET_INODE_NEW_VARINT(&packed->inode.v, true); + bch2_inode_pack_v2(packed, inode); + } else { + bch2_inode_pack_v1(packed, inode); + } if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) { struct bch_inode_unpacked unpacked; @@ -134,26 +193,23 @@ void bch2_inode_pack(struct bkey_inode_buf *packed, BUG_ON(unpacked.bi_hash_seed != inode->bi_hash_seed); BUG_ON(unpacked.bi_mode != inode->bi_mode); -#define x(_name, _bits) BUG_ON(unpacked._name != inode->_name); +#define x(_name, _bits) if (unpacked._name != inode->_name) \ + panic("unpacked %llu should be %llu", \ + (u64) unpacked._name, (u64) inode->_name); BCH_INODE_FIELDS() #undef x } } -int bch2_inode_unpack(struct bkey_s_c_inode inode, - struct bch_inode_unpacked *unpacked) +static noinline int bch2_inode_unpack_v1(struct bkey_s_c_inode inode, + struct bch_inode_unpacked *unpacked) { const u8 *in = inode.v->fields; - const u8 *end = (void *) inode.v + bkey_val_bytes(inode.k); + const u8 *end = bkey_val_end(inode); u64 field[2]; unsigned fieldnr = 0, field_bits; int ret; - unpacked->bi_inum = inode.k->p.offset; - unpacked->bi_hash_seed = inode.v->bi_hash_seed; - unpacked->bi_flags = le32_to_cpu(inode.v->bi_flags); - unpacked->bi_mode = le16_to_cpu(inode.v->bi_mode); - #define x(_name, _bits) \ if (fieldnr++ == INODE_NR_FIELDS(inode.v)) { 
\ memset(&unpacked->_name, 0, \ @@ -176,6 +232,62 @@ int bch2_inode_unpack(struct bkey_s_c_inode inode, #undef x /* XXX: signal if there were more fields than expected? */ + return 0; +} + +static int bch2_inode_unpack_v2(struct bkey_s_c_inode inode, + struct bch_inode_unpacked *unpacked) +{ + const u8 *in = inode.v->fields; + const u8 *end = bkey_val_end(inode); + unsigned fieldnr = 0; + int ret; + u64 v[2]; + +#define x(_name, _bits) \ + if (fieldnr < INODE_NR_FIELDS(inode.v)) { \ + ret = bch2_varint_decode(in, end, &v[0]); \ + if (ret < 0) \ + return ret; \ + in += ret; \ + \ + if (_bits > 64) { \ + ret = bch2_varint_decode(in, end, &v[1]); \ + if (ret < 0) \ + return ret; \ + in += ret; \ + } else { \ + v[1] = 0; \ + } \ + } else { \ + v[0] = v[1] = 0; \ + } \ + \ + unpacked->_name = v[0]; \ + if (v[1] || v[0] != unpacked->_name) \ + return -1; \ + fieldnr++; + + BCH_INODE_FIELDS() +#undef x + + /* XXX: signal if there were more fields than expected? */ + return 0; +} + +int bch2_inode_unpack(struct bkey_s_c_inode inode, + struct bch_inode_unpacked *unpacked) +{ + unpacked->bi_inum = inode.k->p.offset; + unpacked->bi_hash_seed = inode.v->bi_hash_seed; + unpacked->bi_flags = le32_to_cpu(inode.v->bi_flags); + unpacked->bi_mode = le16_to_cpu(inode.v->bi_mode); + + if (INODE_NEW_VARINT(inode.v)) { + return bch2_inode_unpack_v2(inode, unpacked); + } else { + return bch2_inode_unpack_v1(inode, unpacked); + } return 0; } @@ -189,11 +301,11 @@ struct btree_iter *bch2_inode_peek(struct btree_trans *trans, int ret; iter = bch2_trans_get_iter(trans, BTREE_ID_INODES, POS(0, inum), - BTREE_ITER_SLOTS|flags); + BTREE_ITER_CACHED|flags); if (IS_ERR(iter)) return iter; - k = bch2_btree_iter_peek_slot(iter); + k = bch2_btree_iter_peek_cached(iter); ret = bkey_err(k); if (ret) goto err; @@ -222,7 +334,7 @@ int bch2_inode_write(struct btree_trans *trans, if (IS_ERR(inode_p)) return PTR_ERR(inode_p); - bch2_inode_pack(inode_p, inode); + bch2_inode_pack(trans->c, inode_p, inode); bch2_trans_update(trans, iter, &inode_p->inode.k_i, 0); return 0; } @@ -271,6 +383,8 @@ void bch2_inode_to_text(struct printbuf *out, struct bch_fs *c, return; } + pr_buf(out, "mode: %o ", unpacked.bi_mode); + #define x(_name, _bits) \ pr_buf(out, #_name ": %llu ", (u64) unpacked._name); BCH_INODE_FIELDS() @@ -359,20 +473,24 @@ static inline u32 bkey_generation(struct bkey_s_c k) } int bch2_inode_create(struct btree_trans *trans, - struct bch_inode_unpacked *inode_u, - u64 min, u64 max, u64 *hint) + struct bch_inode_unpacked *inode_u) { + struct bch_fs *c = trans->c; struct bkey_inode_buf *inode_p; struct btree_iter *iter = NULL; struct bkey_s_c k; - u64 start; + u64 min, max, start, *hint; int ret; - if (!max) - max = ULLONG_MAX; + unsigned cpu = raw_smp_processor_id(); + unsigned bits = (c->opts.inodes_32bit + ? 31 : 63) - c->inode_shard_bits; - if (trans->c->opts.inodes_32bit) - max = min_t(u64, max, U32_MAX); + min = (cpu << bits); + max = (cpu << bits) | ~(ULLONG_MAX << bits); + + min = max_t(u64, min, BLOCKDEV_INODE_MAX); + hint = c->unused_inode_hints + cpu; start = READ_ONCE(*hint); @@ -388,7 +506,17 @@ again: if (bkey_cmp(iter->pos, POS(0, max)) > 0) break; - if (k.k->type != KEY_TYPE_inode) + /* + * There's a potential cache coherency issue with the btree key + * cache code here - we're iterating over the btree, skipping + * that cache. 
We should never see an empty slot that isn't + * actually empty due to a pending update in the key cache + * because the update that creates the inode isn't done with a + * cached iterator, but - better safe than sorry, check the + * cache before using a slot: + */ + if (k.k->type != KEY_TYPE_inode && + !bch2_btree_key_cache_find(c, BTREE_ID_INODES, iter->pos)) goto found_slot; } @@ -409,10 +537,7 @@ found_slot: inode_u->bi_inum = k.k->p.offset; inode_u->bi_generation = bkey_generation(k); - bch2_inode_pack(inode_p, inode_u); - bch2_trans_update(trans, iter, &inode_p->inode.k_i, 0); - bch2_trans_iter_put(trans, iter); - return 0; + return bch2_inode_write(trans, iter, inode_u); } int bch2_inode_rm(struct bch_fs *c, u64 inode_nr) @@ -422,6 +547,8 @@ int bch2_inode_rm(struct bch_fs *c, u64 inode_nr) struct bkey_i_inode_generation delete; struct bpos start = POS(inode_nr, 0); struct bpos end = POS(inode_nr + 1, 0); + struct bkey_s_c k; + u64 bi_generation; int ret; /* @@ -442,51 +569,62 @@ int bch2_inode_rm(struct bch_fs *c, u64 inode_nr) return ret; bch2_trans_init(&trans, c, 0, 0); +retry: + bch2_trans_begin(&trans); + + bi_generation = 0; + + ret = bch2_btree_key_cache_flush(&trans, BTREE_ID_INODES, POS(0, inode_nr)); + if (ret) { + if (ret != -EINTR) + bch_err(c, "error flushing btree key cache: %i", ret); + goto err; + } iter = bch2_trans_get_iter(&trans, BTREE_ID_INODES, POS(0, inode_nr), BTREE_ITER_SLOTS|BTREE_ITER_INTENT); - do { - struct bkey_s_c k = bch2_btree_iter_peek_slot(iter); - u32 bi_generation = 0; + k = bch2_btree_iter_peek_slot(iter); - ret = bkey_err(k); - if (ret) - break; + ret = bkey_err(k); + if (ret) + goto err; - bch2_fs_inconsistent_on(k.k->type != KEY_TYPE_inode, c, - "inode %llu not found when deleting", - inode_nr); + bch2_fs_inconsistent_on(k.k->type != KEY_TYPE_inode, c, + "inode %llu not found when deleting", + inode_nr); - switch (k.k->type) { - case KEY_TYPE_inode: { - struct bch_inode_unpacked inode_u; + switch (k.k->type) { + case KEY_TYPE_inode: { + struct bch_inode_unpacked inode_u; - if (!bch2_inode_unpack(bkey_s_c_to_inode(k), &inode_u)) - bi_generation = inode_u.bi_generation + 1; - break; - } - case KEY_TYPE_inode_generation: { - struct bkey_s_c_inode_generation g = - bkey_s_c_to_inode_generation(k); - bi_generation = le32_to_cpu(g.v->bi_generation); - break; - } - } + if (!bch2_inode_unpack(bkey_s_c_to_inode(k), &inode_u)) + bi_generation = inode_u.bi_generation + 1; + break; + } + case KEY_TYPE_inode_generation: { + struct bkey_s_c_inode_generation g = + bkey_s_c_to_inode_generation(k); + bi_generation = le32_to_cpu(g.v->bi_generation); + break; + } + } - if (!bi_generation) { - bkey_init(&delete.k); - delete.k.p.offset = inode_nr; - } else { - bkey_inode_generation_init(&delete.k_i); - delete.k.p.offset = inode_nr; - delete.v.bi_generation = cpu_to_le32(bi_generation); - } + if (!bi_generation) { + bkey_init(&delete.k); + delete.k.p.offset = inode_nr; + } else { + bkey_inode_generation_init(&delete.k_i); + delete.k.p.offset = inode_nr; + delete.v.bi_generation = cpu_to_le32(bi_generation); + } - bch2_trans_update(&trans, iter, &delete.k_i, 0); + bch2_trans_update(&trans, iter, &delete.k_i, 0); - ret = bch2_trans_commit(&trans, NULL, NULL, - BTREE_INSERT_NOFAIL); - } while (ret == -EINTR); + ret = bch2_trans_commit(&trans, NULL, NULL, + BTREE_INSERT_NOFAIL); +err: + if (ret == -EINTR) + goto retry; bch2_trans_exit(&trans); return ret; @@ -500,11 +638,11 @@ int bch2_inode_find_by_inum_trans(struct btree_trans *trans, u64 inode_nr, int ret; iter = 
bch2_trans_get_iter(trans, BTREE_ID_INODES, - POS(0, inode_nr), BTREE_ITER_SLOTS); + POS(0, inode_nr), BTREE_ITER_CACHED); if (IS_ERR(iter)) return PTR_ERR(iter); - k = bch2_btree_iter_peek_slot(iter); + k = bch2_btree_iter_peek_cached(iter); ret = bkey_err(k); if (ret) goto err; @@ -523,32 +661,3 @@ int bch2_inode_find_by_inum(struct bch_fs *c, u64 inode_nr, return bch2_trans_do(c, NULL, NULL, 0, bch2_inode_find_by_inum_trans(&trans, inode_nr, inode)); } - -#ifdef CONFIG_BCACHEFS_DEBUG -void bch2_inode_pack_test(void) -{ - struct bch_inode_unpacked *u, test_inodes[] = { - { - .bi_atime = U64_MAX, - .bi_ctime = U64_MAX, - .bi_mtime = U64_MAX, - .bi_otime = U64_MAX, - .bi_size = U64_MAX, - .bi_sectors = U64_MAX, - .bi_uid = U32_MAX, - .bi_gid = U32_MAX, - .bi_nlink = U32_MAX, - .bi_generation = U32_MAX, - .bi_dev = U32_MAX, - }, - }; - - for (u = test_inodes; - u < test_inodes + ARRAY_SIZE(test_inodes); - u++) { - struct bkey_inode_buf p; - - bch2_inode_pack(&p, u); - } -} -#endif diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h index bb759a46dc41..ef7e885dce0c 100644 --- a/fs/bcachefs/inode.h +++ b/fs/bcachefs/inode.h @@ -24,6 +24,14 @@ void bch2_inode_generation_to_text(struct printbuf *, struct bch_fs *, .val_to_text = bch2_inode_generation_to_text, \ } +#if 0 +typedef struct { + u64 lo; + u32 hi; +} __packed __aligned(4) u96; +#endif +typedef u64 u96; + struct bch_inode_unpacked { u64 bi_inum; __le64 bi_hash_seed; @@ -43,7 +51,8 @@ struct bkey_inode_buf { #undef x } __attribute__((packed, aligned(8))); -void bch2_inode_pack(struct bkey_inode_buf *, const struct bch_inode_unpacked *); +void bch2_inode_pack(struct bch_fs *, struct bkey_inode_buf *, + const struct bch_inode_unpacked *); int bch2_inode_unpack(struct bkey_s_c_inode, struct bch_inode_unpacked *); struct btree_iter *bch2_inode_peek(struct btree_trans *, @@ -60,9 +69,7 @@ void bch2_inode_init(struct bch_fs *, struct bch_inode_unpacked *, uid_t, gid_t, umode_t, dev_t, struct bch_inode_unpacked *); -int bch2_inode_create(struct btree_trans *, - struct bch_inode_unpacked *, - u64, u64, u64 *); +int bch2_inode_create(struct btree_trans *, struct bch_inode_unpacked *); int bch2_inode_rm(struct bch_fs *, u64); @@ -168,10 +175,4 @@ static inline void bch2_inode_nlink_set(struct bch_inode_unpacked *bi, } } -#ifdef CONFIG_BCACHEFS_DEBUG -void bch2_inode_pack_test(void); -#else -static inline void bch2_inode_pack_test(void) {} -#endif - #endif /* _BCACHEFS_INODE_H */ diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index 409c59c219df..21087d1193dc 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -7,6 +7,7 @@ */ #include "bcachefs.h" +#include "alloc_background.h" #include "alloc_foreground.h" #include "bkey_on_stack.h" #include "bset.h" @@ -134,10 +135,10 @@ void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw) void bch2_bio_free_pages_pool(struct bch_fs *c, struct bio *bio) { + struct bvec_iter_all iter; struct bio_vec *bv; - unsigned i; - bio_for_each_segment_all(bv, bio, i) + bio_for_each_segment_all(bv, bio, iter) if (bv->bv_page != ZERO_PAGE(0)) mempool_free(bv->bv_page, &c->bio_bounce_pages); bio->bi_vcnt = 0; @@ -170,7 +171,7 @@ void bch2_bio_alloc_pages_pool(struct bch_fs *c, struct bio *bio, while (size) { struct page *page = __bio_alloc_page_pool(c, &using_mempool); - unsigned len = min(PAGE_SIZE, size); + unsigned len = min_t(size_t, PAGE_SIZE, size); BUG_ON(!bio_add_page(bio, page, len, 0)); size -= len; @@ -300,7 +301,7 @@ int bch2_extent_update(struct btree_trans *trans, inode_u.bi_sectors += 
delta; if (delta || new_i_size) { - bch2_inode_pack(&inode_p, &inode_u); + bch2_inode_pack(trans->c, &inode_p, &inode_u); bch2_trans_update(trans, inode_iter, &inode_p.inode.k_i, 0); } @@ -1474,7 +1475,8 @@ static struct promote_op *__promote_alloc(struct bch_fs *c, opts, DATA_PROMOTE, (struct data_opts) { - .target = opts.promote_target + .target = opts.promote_target, + .nr_replicas = 1, }, btree_id, k); BUG_ON(ret); @@ -1635,7 +1637,7 @@ retry: goto out; } - ret = __bch2_read_extent(c, rbio, bvec_iter, k, 0, failed, flags); + ret = __bch2_read_extent(&trans, rbio, bvec_iter, k, 0, failed, flags); if (ret == READ_RETRY) goto retry; if (ret) @@ -1674,7 +1676,6 @@ retry: unsigned bytes, sectors, offset_into_extent; bkey_on_stack_reassemble(&sk, c, k); - k = bkey_i_to_s_c(sk.k); offset_into_extent = iter->pos.offset - bkey_start_offset(k.k); @@ -1685,6 +1686,8 @@ retry: if (ret) break; + k = bkey_i_to_s_c(sk.k); + sectors = min(sectors, k.k->size - offset_into_extent); bch2_trans_unlock(&trans); @@ -1692,7 +1695,7 @@ retry: bytes = min(sectors, bvec_iter_sectors(bvec_iter)) << 9; swap(bvec_iter.bi_size, bytes); - ret = __bch2_read_extent(c, rbio, bvec_iter, k, + ret = __bch2_read_extent(&trans, rbio, bvec_iter, k, offset_into_extent, failed, flags); switch (ret) { case READ_RETRY: @@ -2006,7 +2009,8 @@ int __bch2_read_indirect_extent(struct btree_trans *trans, if (ret) goto err; - if (k.k->type != KEY_TYPE_reflink_v) { + if (k.k->type != KEY_TYPE_reflink_v && + k.k->type != KEY_TYPE_indirect_inline_data) { __bcache_io_error(trans->c, "pointer to nonexistent indirect extent"); ret = -EIO; @@ -2020,11 +2024,12 @@ err: return ret; } -int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig, +int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, struct bvec_iter iter, struct bkey_s_c k, unsigned offset_into_extent, struct bch_io_failures *failed, unsigned flags) { + struct bch_fs *c = trans->c; struct extent_ptr_decoded pick; struct bch_read_bio *rbio = NULL; struct bch_dev *ca; @@ -2033,13 +2038,12 @@ int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig, struct bpos pos = bkey_start_pos(k.k); int pick_ret; - if (k.k->type == KEY_TYPE_inline_data) { - struct bkey_s_c_inline_data d = bkey_s_c_to_inline_data(k); + if (bkey_extent_is_inline_data(k.k)) { unsigned bytes = min_t(unsigned, iter.bi_size, - bkey_val_bytes(d.k)); + bkey_inline_data_bytes(k.k)); swap(iter.bi_size, bytes); - memcpy_to_bio(&orig->bio, iter, d.v->data); + memcpy_to_bio(&orig->bio, iter, bkey_inline_data_p(k)); swap(iter.bi_size, bytes); bio_advance_iter(&orig->bio, &iter, bytes); zero_fill_bio_iter(&orig->bio, iter); @@ -2192,9 +2196,9 @@ get_bio: bch2_increment_clock(c, bio_sectors(&rbio->bio), READ); - rcu_read_lock(); - bucket_io_clock_reset(c, ca, PTR_BUCKET_NR(ca, &pick.ptr), READ); - rcu_read_unlock(); + if (pick.ptr.cached) + bch2_bucket_io_time_reset(trans, pick.ptr.dev, + PTR_BUCKET_NR(ca, &pick.ptr), READ); if (!(flags & (BCH_READ_IN_RETRY|BCH_READ_LAST_FRAGMENT))) { bio_inc_remaining(&orig->bio); @@ -2311,13 +2315,14 @@ retry: sectors = k.k->size - offset_into_extent; bkey_on_stack_reassemble(&sk, c, k); - k = bkey_i_to_s_c(sk.k); ret = bch2_read_indirect_extent(&trans, &offset_into_extent, &sk); if (ret) goto err; + k = bkey_i_to_s_c(sk.k); + /* * With indirect extents, the amount of data to read is the min * of the original extent and the indirect extent: @@ -2336,7 +2341,7 @@ retry: if (rbio->bio.bi_iter.bi_size == bytes) flags |= BCH_READ_LAST_FRAGMENT; - 
bch2_read_extent(c, rbio, k, offset_into_extent, flags); + bch2_read_extent(&trans, rbio, k, offset_into_extent, flags); if (flags & BCH_READ_LAST_FRAGMENT) break; diff --git a/fs/bcachefs/io.h b/fs/bcachefs/io.h index ded468d70f09..e6aac594f3e6 100644 --- a/fs/bcachefs/io.h +++ b/fs/bcachefs/io.h @@ -136,17 +136,17 @@ enum bch_read_flags { BCH_READ_IN_RETRY = 1 << 7, }; -int __bch2_read_extent(struct bch_fs *, struct bch_read_bio *, +int __bch2_read_extent(struct btree_trans *, struct bch_read_bio *, struct bvec_iter, struct bkey_s_c, unsigned, struct bch_io_failures *, unsigned); -static inline void bch2_read_extent(struct bch_fs *c, +static inline void bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *rbio, struct bkey_s_c k, unsigned offset_into_extent, unsigned flags) { - __bch2_read_extent(c, rbio, rbio->bio.bi_iter, k, + __bch2_read_extent(trans, rbio, rbio->bio.bi_iter, k, offset_into_extent, NULL, flags); } diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index 210ad1b0c469..c2cafd3892a4 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -18,6 +18,8 @@ #include <trace/events/bcachefs.h> +static inline struct journal_buf *journal_seq_to_buf(struct journal *, u64); + static bool __journal_entry_is_open(union journal_res_state state) { return state.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL; @@ -305,6 +307,19 @@ u64 bch2_inode_journal_seq(struct journal *j, u64 inode) return seq; } +void bch2_journal_set_has_inum(struct journal *j, u64 inode, u64 seq) +{ + size_t h = hash_64(inode, ilog2(sizeof(j->buf[0].has_inode) * 8)); + struct journal_buf *buf; + + spin_lock(&j->lock); + + if ((buf = journal_seq_to_buf(j, seq))) + set_bit(h, buf->has_inode); + + spin_unlock(&j->lock); +} + static int __journal_res_get(struct journal *j, struct journal_res *res, unsigned flags) { @@ -965,9 +980,11 @@ void bch2_fs_journal_stop(struct journal *j) wait_event(j->wait, journal_entry_close(j)); - /* do we need to write another journal entry? */ - if (test_bit(JOURNAL_NOT_EMPTY, &j->flags)) - bch2_journal_meta(j); + /* + * Always write a new journal entry, to make sure the clock hands are up + * to date (and match the superblock) + */ + bch2_journal_meta(j); journal_quiesce(j); diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h index 1dde0b5d963f..f60bc964ee1f 100644 --- a/fs/bcachefs/journal.h +++ b/fs/bcachefs/journal.h @@ -147,6 +147,7 @@ static inline u64 journal_cur_seq(struct journal *j) } u64 bch2_inode_journal_seq(struct journal *, u64); +void bch2_journal_set_has_inum(struct journal *, u64, u64); static inline int journal_state_count(union journal_res_state s, int idx) { @@ -281,7 +282,7 @@ static inline void bch2_journal_res_put(struct journal *j, if (!res->ref) return; - lock_release(&j->res_map, 0, _THIS_IP_); + lock_release(&j->res_map, _THIS_IP_); while (res->u64s) bch2_journal_add_entry(j, res, diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c index 57591983eebd..18e45296e7de 100644 --- a/fs/bcachefs/journal_reclaim.c +++ b/fs/bcachefs/journal_reclaim.c @@ -465,34 +465,12 @@ static bool journal_flush_pins(struct journal *j, u64 seq_to_flush, return ret; } -/** - * bch2_journal_reclaim - free up journal buckets - * - * Background journal reclaim writes out btree nodes. It should be run - * early enough so that we never completely run out of journal buckets. 
- * - * High watermarks for triggering background reclaim: - * - FIFO has fewer than 512 entries left - * - fewer than 25% journal buckets free - * - * Background reclaim runs until low watermarks are reached: - * - FIFO has more than 1024 entries left - * - more than 50% journal buckets free - * - * As long as a reclaim can complete in the time it takes to fill up - * 512 journal entries or 25% of all journal buckets, then - * journal_next_bucket() should not stall. - */ -void bch2_journal_reclaim(struct journal *j) +static u64 journal_seq_to_flush(struct journal *j) { struct bch_fs *c = container_of(j, struct bch_fs, journal); struct bch_dev *ca; - unsigned iter, min_nr = 0; u64 seq_to_flush = 0; - - lockdep_assert_held(&j->reclaim_lock); - - bch2_journal_do_discards(j); + unsigned iter; spin_lock(&j->lock); @@ -524,20 +502,52 @@ void bch2_journal_reclaim(struct journal *j) (j->pin.size >> 1)); spin_unlock(&j->lock); - /* - * If it's been longer than j->reclaim_delay_ms since we last flushed, - * make sure to flush at least one journal pin: - */ - if (time_after(jiffies, j->last_flushed + - msecs_to_jiffies(j->reclaim_delay_ms))) - min_nr = 1; + return seq_to_flush; +} - if (j->prereserved.reserved * 2 > j->prereserved.remaining) { - seq_to_flush = max(seq_to_flush, journal_last_seq(j)); - min_nr = 1; - } +/** + * bch2_journal_reclaim - free up journal buckets + * + * Background journal reclaim writes out btree nodes. It should be run + * early enough so that we never completely run out of journal buckets. + * + * High watermarks for triggering background reclaim: + * - FIFO has fewer than 512 entries left + * - fewer than 25% journal buckets free + * + * Background reclaim runs until low watermarks are reached: + * - FIFO has more than 1024 entries left + * - more than 50% journal buckets free + * + * As long as a reclaim can complete in the time it takes to fill up + * 512 journal entries or 25% of all journal buckets, then + * journal_next_bucket() should not stall. 
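
Stepping back from the hunk in progress: the rewrite turns bch2_journal_reclaim() from a single pass into a loop that discards, recomputes the watermark, and flushes, repeating for as long as a flush pass makes progress. A compilable skeleton of that control flow, with stand-in helpers that are not the kernel functions:

	#include <stdbool.h>
	#include <stdint.h>

	/* Stand-ins, just enough to compile: */
	static void	do_discards(void)		{ }
	static uint64_t	compute_seq_to_flush(void)	{ return 0; }
	static bool	flush_timeout_expired(void)	{ return false; }
	static bool	prereserved_low(void)		{ return false; }
	static bool	flush_pins(uint64_t seq_to_flush, unsigned min_nr)
	{
		(void) seq_to_flush;
		(void) min_nr;
		return false;	/* true would mean "made progress, go again" */
	}

	static void reclaim_loop(void)
	{
		unsigned min_nr;

		do {
			do_discards();
			/* the watermark is recomputed on every pass: */
			min_nr = (flush_timeout_expired() ||
				  prereserved_low()) ? 1 : 0;
		} while (flush_pins(compute_seq_to_flush(), min_nr));
	}
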
+ */ +void bch2_journal_reclaim(struct journal *j) +{ + struct bch_fs *c = container_of(j, struct bch_fs, journal); + unsigned min_nr = 0; + u64 seq_to_flush = 0; + + lockdep_assert_held(&j->reclaim_lock); + + do { + bch2_journal_do_discards(j); + + seq_to_flush = journal_seq_to_flush(j); + min_nr = 0; + + /* + * If it's been longer than j->reclaim_delay_ms since we last flushed, + * make sure to flush at least one journal pin: + */ + if (time_after(jiffies, j->last_flushed + + msecs_to_jiffies(j->reclaim_delay_ms))) + min_nr = 1; - journal_flush_pins(j, seq_to_flush, min_nr); + if (j->prereserved.reserved * 2 > j->prereserved.remaining) + min_nr = 1; + } while (journal_flush_pins(j, seq_to_flush, min_nr)); if (!bch2_journal_error(j)) queue_delayed_work(c->journal_reclaim_wq, &j->reclaim_work, diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index 4a2c4debd3f0..6633d21f604a 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -95,10 +95,6 @@ static int bch2_migrate_index_update(struct bch_write_op *op) !bch2_bkey_matches_ptr(c, k, m->ptr, m->offset)) goto nomatch; - if (m->data_cmd == DATA_REWRITE && - !bch2_bkey_has_device(k, m->data_opts.rewrite_dev)) - goto nomatch; - bkey_reassemble(&_insert.k, k); insert = &_insert.k; @@ -110,9 +106,19 @@ static int bch2_migrate_index_update(struct bch_write_op *op) bch2_cut_back(new->k.p, insert); bch2_cut_back(insert->k.p, &new->k_i); - if (m->data_cmd == DATA_REWRITE) - bch2_bkey_drop_device(bkey_i_to_s(insert), - m->data_opts.rewrite_dev); + if (m->data_cmd == DATA_REWRITE) { + struct bch_extent_ptr *new_ptr, *old_ptr = (void *) + bch2_bkey_has_device(bkey_i_to_s_c(insert), + m->data_opts.rewrite_dev); + if (!old_ptr) + goto nomatch; + + if (old_ptr->cached) + extent_for_each_ptr(extent_i_to_s(new), new_ptr) + new_ptr->cached = true; + + bch2_bkey_drop_ptr(bkey_i_to_s(insert), old_ptr); + } extent_for_each_ptr_decode(extent_i_to_s(new), p, entry) { if (bch2_bkey_has_device(bkey_i_to_s_c(insert), p.ptr.dev)) { @@ -260,8 +266,8 @@ int bch2_migrate_write_init(struct bch_fs *c, struct migrate_write *m, BCH_WRITE_DATA_ENCODED| BCH_WRITE_FROM_INTERNAL; - m->op.nr_replicas = 1; - m->op.nr_replicas_required = 1; + m->op.nr_replicas = data_opts.nr_replicas; + m->op.nr_replicas_required = data_opts.nr_replicas; m->op.index_update_fn = bch2_migrate_index_update; switch (data_cmd) { @@ -291,14 +297,14 @@ int bch2_migrate_write_init(struct bch_fs *c, struct migrate_write *m, unsigned compressed_sectors = 0; bkey_for_each_ptr_decode(k.k, ptrs, p, entry) - if (!p.ptr.cached && - crc_is_compressed(p.crc) && - bch2_dev_in_target(c, p.ptr.dev, data_opts.target)) + if (p.ptr.dev == data_opts.rewrite_dev && + !p.ptr.cached && + crc_is_compressed(p.crc)) compressed_sectors += p.crc.compressed_size; if (compressed_sectors) { ret = bch2_disk_reservation_add(c, &m->op.res, - compressed_sectors, + k.k->size * m->op.nr_replicas, BCH_DISK_RESERVATION_NOFAIL); if (ret) return ret; @@ -320,12 +326,12 @@ static void move_free(struct closure *cl) { struct moving_io *io = container_of(cl, struct moving_io, cl); struct moving_context *ctxt = io->write.ctxt; + struct bvec_iter_all iter; struct bio_vec *bv; - unsigned i; bch2_disk_reservation_put(io->write.op.c, &io->write.op.res); - bio_for_each_segment_all(bv, &io->write.op.wbio.bio, i) + bio_for_each_segment_all(bv, &io->write.op.wbio.bio, iter) if (bv->bv_page) __free_page(bv->bv_page); @@ -409,7 +415,7 @@ static void bch2_move_ctxt_wait_for_io(struct moving_context *ctxt) atomic_read(&ctxt->write_sectors) != 
sectors_pending); } -static int bch2_move_extent(struct bch_fs *c, +static int bch2_move_extent(struct btree_trans *trans, struct moving_context *ctxt, struct write_point_specifier wp, struct bch_io_opts io_opts, @@ -418,6 +424,7 @@ static int bch2_move_extent(struct bch_fs *c, enum data_cmd data_cmd, struct data_opts data_opts) { + struct bch_fs *c = trans->c; struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); struct moving_io *io; const union bch_extent_entry *entry; @@ -484,7 +491,7 @@ static int bch2_move_extent(struct bch_fs *c, * ctxt when doing wakeup */ closure_get(&ctxt->cl); - bch2_read_extent(c, &io->rbio, k, 0, + bch2_read_extent(trans, &io->rbio, k, 0, BCH_READ_NODECODE| BCH_READ_LAST_FRAGMENT); return 0; @@ -602,7 +609,7 @@ peek: k = bkey_i_to_s_c(sk.k); bch2_trans_unlock(&trans); - ret2 = bch2_move_extent(c, ctxt, wp, io_opts, btree_id, k, + ret2 = bch2_move_extent(&trans, ctxt, wp, io_opts, btree_id, k, data_cmd, data_opts); if (ret2) { if (ret2 == -ENOMEM) { @@ -749,6 +756,7 @@ static enum data_cmd rereplicate_pred(struct bch_fs *c, void *arg, return DATA_SKIP; data_opts->target = 0; + data_opts->nr_replicas = 1; data_opts->btree_insert_flags = 0; return DATA_ADD_REPLICAS; } @@ -764,6 +772,7 @@ static enum data_cmd migrate_pred(struct bch_fs *c, void *arg, return DATA_SKIP; data_opts->target = 0; + data_opts->nr_replicas = 1; data_opts->btree_insert_flags = 0; data_opts->rewrite_dev = op->migrate.dev; return DATA_REWRITE; diff --git a/fs/bcachefs/move.h b/fs/bcachefs/move.h index 0acd1720d4f8..b04bc669226d 100644 --- a/fs/bcachefs/move.h +++ b/fs/bcachefs/move.h @@ -20,7 +20,8 @@ enum data_cmd { struct data_opts { u16 target; - unsigned rewrite_dev; + u8 rewrite_dev; + u8 nr_replicas; int btree_insert_flags; }; diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c index de0a7974ec9f..ddfda1ef8a79 100644 --- a/fs/bcachefs/movinggc.c +++ b/fs/bcachefs/movinggc.c @@ -53,17 +53,21 @@ static int bucket_offset_cmp(const void *_l, const void *_r, size_t size) cmp_int(l->offset, r->offset); } -static int __copygc_pred(struct bch_fs *c, struct bkey_s_c k) +static enum data_cmd copygc_pred(struct bch_fs *c, void *arg, + struct bkey_s_c k, + struct bch_io_opts *io_opts, + struct data_opts *data_opts) { copygc_heap *h = &c->copygc_heap; struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - const struct bch_extent_ptr *ptr; + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; - bkey_for_each_ptr(ptrs, ptr) { - struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { + struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev); struct copygc_heap_entry search = { - .dev = ptr->dev, - .offset = ptr->offset + .dev = p.ptr.dev, + .offset = p.ptr.offset, }; ssize_t i = eytzinger0_find_le(h->data, h->used, @@ -81,27 +85,24 @@ static int __copygc_pred(struct bch_fs *c, struct bkey_s_c k) BUG_ON(i != j); #endif if (i >= 0 && - ptr->offset < h->data[i].offset + ca->mi.bucket_size && - ptr->gen == h->data[i].gen) - return ptr->dev; - } + p.ptr.offset < h->data[i].offset + ca->mi.bucket_size && + p.ptr.gen == h->data[i].gen) { + data_opts->target = io_opts->background_target; + data_opts->nr_replicas = 1; + data_opts->btree_insert_flags = BTREE_INSERT_USE_RESERVE; + data_opts->rewrite_dev = p.ptr.dev; - return -1; -} + if (p.has_ec) { + struct stripe *m = genradix_ptr(&c->stripes[0], p.ec.idx); -static enum data_cmd copygc_pred(struct bch_fs *c, void *arg, - struct bkey_s_c k, - struct bch_io_opts *io_opts, - struct data_opts *data_opts) 
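
An aside on the movinggc hunk above: copygc_pred() now decides whether an extent pointer lands in a bucket chosen for evacuation by searching a sorted array of candidate buckets for the last entry at or before (dev, offset), then checking that the pointer sits inside that bucket and that the generations match. The kernel does the search over an eytzinger0 layout; sketched here is the plain sorted-array equivalent of the same find-le query (illustrative only):

	#include <stddef.h>
	#include <stdint.h>

	struct candidate { uint32_t dev; uint64_t offset; };

	/* Index of the last element <= (dev, offset), or -1 if none. */
	static ptrdiff_t find_le(const struct candidate *a, size_t n,
				 uint32_t dev, uint64_t offset)
	{
		ptrdiff_t lo = 0, hi = (ptrdiff_t) n - 1, ret = -1;

		while (lo <= hi) {
			ptrdiff_t mid = lo + (hi - lo) / 2;
			int cmp = a[mid].dev != dev
				? (a[mid].dev < dev ? -1 : 1)
				: (a[mid].offset < offset ? -1 :
				   a[mid].offset > offset ? 1 : 0);

			if (cmp <= 0) {
				ret = mid;	/* keep looking further right */
				lo = mid + 1;
			} else {
				hi = mid - 1;
			}
		}
		return ret;
	}

A hit at index i is then a real match only if the pointer's offset is below a[i].offset plus the bucket size and the bucket generation agrees, which is exactly the check the patch performs before returning DATA_REWRITE.
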
-{ - int dev_idx = __copygc_pred(c, k); - if (dev_idx < 0) - return DATA_SKIP; - - data_opts->target = io_opts->background_target; - data_opts->btree_insert_flags = BTREE_INSERT_USE_RESERVE; - data_opts->rewrite_dev = dev_idx; - return DATA_REWRITE; + data_opts->nr_replicas += m->nr_redundant; + } + + return DATA_REWRITE; + } + } + + return DATA_SKIP; } static bool have_copygc_reserve(struct bch_dev *ca) @@ -168,7 +169,8 @@ static int bch2_copygc(struct bch_fs *c) buckets = bucket_array(ca); for (b = buckets->first_bucket; b < buckets->nbuckets; b++) { - struct bucket_mark m = READ_ONCE(buckets->b[b].mark); + struct bucket *g = buckets->b + b; + struct bucket_mark m = READ_ONCE(g->mark); struct copygc_heap_entry e; if (m.owned_by_allocator || @@ -177,9 +179,12 @@ static int bch2_copygc(struct bch_fs *c) bucket_sectors_used(m) >= ca->mi.bucket_size) continue; + WARN_ON(m.stripe && !g->ec_redundancy); + e = (struct copygc_heap_entry) { .dev = dev_idx, .gen = m.gen, + .replicas = 1 + g->ec_redundancy, .fragmentation = bucket_sectors_used(m) * (1U << 15) / ca->mi.bucket_size, .sectors = bucket_sectors_used(m), @@ -196,11 +201,11 @@ static int bch2_copygc(struct bch_fs *c) } for (i = h->data; i < h->data + h->used; i++) - sectors_to_move += i->sectors; + sectors_to_move += i->sectors * i->replicas; while (sectors_to_move > sectors_reserved) { BUG_ON(!heap_pop(h, e, -fragmentation_cmp, NULL)); - sectors_to_move -= e.sectors; + sectors_to_move -= e.sectors * e.replicas; } buckets_to_move = h->used; diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c index afe25cd26c06..97a36ac0beea 100644 --- a/fs/bcachefs/opts.c +++ b/fs/bcachefs/opts.c @@ -247,7 +247,7 @@ int bch2_opt_parse(struct bch_fs *c, const struct bch_option *opt, break; case BCH_OPT_FN: if (!c) - return -EINVAL; + return 0; return opt->parse(c, val, res); } @@ -325,7 +325,8 @@ int bch2_opts_check_may_set(struct bch_fs *c) return 0; } -int bch2_parse_mount_opts(struct bch_opts *opts, char *options) +int bch2_parse_mount_opts(struct bch_fs *c, struct bch_opts *opts, + char *options) { char *opt, *name, *val; int ret, id; @@ -340,7 +341,7 @@ int bch2_parse_mount_opts(struct bch_opts *opts, char *options) if (id < 0) goto bad_opt; - ret = bch2_opt_parse(NULL, &bch2_opt_table[id], val, &v); + ret = bch2_opt_parse(c, &bch2_opt_table[id], val, &v); if (ret < 0) goto bad_val; } else { diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h index 014c608ca0c6..710a7ee67039 100644 --- a/fs/bcachefs/opts.h +++ b/fs/bcachefs/opts.h @@ -185,7 +185,7 @@ enum opt_type { x(inline_data, u8, \ OPT_MOUNT|OPT_RUNTIME, \ OPT_BOOL(), \ - NO_SB_OPT, false, \ + NO_SB_OPT, true, \ NULL, "Enable inline data extents") \ x(acl, u8, \ OPT_FORMAT|OPT_MOUNT, \ @@ -418,7 +418,7 @@ void bch2_opt_to_text(struct printbuf *, struct bch_fs *, int bch2_opt_check_may_set(struct bch_fs *, int, u64); int bch2_opts_check_may_set(struct bch_fs *); -int bch2_parse_mount_opts(struct bch_opts *, char *); +int bch2_parse_mount_opts(struct bch_fs *, struct bch_opts *, char *); /* inode opts: */ diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c index 56a1f761271f..44d2651be970 100644 --- a/fs/bcachefs/rebalance.c +++ b/fs/bcachefs/rebalance.c @@ -73,6 +73,7 @@ static enum data_cmd rebalance_pred(struct bch_fs *c, void *arg, { if (__bch2_rebalance_pred(c, k, io_opts) >= 0) { data_opts->target = io_opts->background_target; + data_opts->nr_replicas = 1; data_opts->btree_insert_flags = 0; return DATA_ADD_REPLICAS; } else { diff --git a/fs/bcachefs/recovery.c 
b/fs/bcachefs/recovery.c index 6e829bf0a31f..1745cfac6b26 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -25,6 +25,18 @@ #define QSTR(n) { { { .len = strlen(n) } }, .name = n } +/* for -o reconstruct_alloc: */ +static void drop_alloc_keys(struct journal_keys *keys) +{ + size_t src, dst; + + for (src = 0, dst = 0; src < keys->nr; src++) + if (keys->d[src].btree_id != BTREE_ID_ALLOC) + keys->d[dst++] = keys->d[src]; + + keys->nr = dst; +} + /* iterate over keys read from the journal: */ static struct journal_key *journal_key_search(struct journal_keys *journal_keys, @@ -845,9 +857,11 @@ static int verify_superblock_clean(struct bch_fs *c, } mustfix_fsck_err_on(j->read_clock != clean->read_clock, c, - "superblock read clock doesn't match journal after clean shutdown"); + "superblock read clock %u doesn't match journal %u after clean shutdown", + clean->read_clock, j->read_clock); mustfix_fsck_err_on(j->write_clock != clean->write_clock, c, - "superblock read clock doesn't match journal after clean shutdown"); + "superblock write clock %u doesn't match journal %u after clean shutdown", + clean->write_clock, j->write_clock); for (i = 0; i < BTREE_ID_NR; i++) { char buf1[200], buf2[200]; @@ -928,7 +942,6 @@ static int read_btree_roots(struct bch_fs *c) continue; } - if (r->error) { __fsck_err(c, i == BTREE_ID_ALLOC ? FSCK_CAN_IGNORE : 0, @@ -961,7 +974,7 @@ int bch2_fs_recovery(struct bch_fs *c) const char *err = "cannot allocate memory"; struct bch_sb_field_clean *clean = NULL; u64 journal_seq; - bool wrote = false, write_sb = false; + bool write_sb = false, need_write_alloc = false; int ret; if (c->sb.clean) @@ -1025,6 +1038,11 @@ int bch2_fs_recovery(struct bch_fs *c) goto err; } + if (c->opts.reconstruct_alloc) { + c->sb.compat &= ~(1ULL << BCH_COMPAT_FEAT_ALLOC_INFO); + drop_alloc_keys(&c->journal_keys); + } + ret = journal_replay_early(c, clean, &c->journal_entries); if (ret) goto err; @@ -1090,8 +1108,10 @@ int bch2_fs_recovery(struct bch_fs *c) bch_info(c, "starting metadata mark and sweep"); err = "error in mark and sweep"; ret = bch2_gc(c, &c->journal_keys, true, true); - if (ret) + if (ret < 0) goto err; + if (ret) + need_write_alloc = true; bch_verbose(c, "mark and sweep done"); } @@ -1101,8 +1121,10 @@ int bch2_fs_recovery(struct bch_fs *c) bch_info(c, "starting mark and sweep"); err = "error in mark and sweep"; ret = bch2_gc(c, &c->journal_keys, true, false); - if (ret) + if (ret < 0) goto err; + if (ret) + need_write_alloc = true; bch_verbose(c, "mark and sweep done"); } @@ -1126,7 +1148,7 @@ int bch2_fs_recovery(struct bch_fs *c) goto err; bch_verbose(c, "journal replay done"); - if (!c->opts.nochanges) { + if (need_write_alloc && !c->opts.nochanges) { /* * note that even when filesystem was clean there might be work * to do here, if we ran gc (because of fsck) which recalculated @@ -1134,8 +1156,8 @@ int bch2_fs_recovery(struct bch_fs *c) */ bch_verbose(c, "writing allocation info"); err = "error writing out alloc info"; - ret = bch2_stripes_write(c, BTREE_INSERT_LAZY_RW, &wrote) ?: - bch2_alloc_write(c, BTREE_INSERT_LAZY_RW, &wrote); + ret = bch2_stripes_write(c, BTREE_INSERT_LAZY_RW) ?: + bch2_alloc_write(c, BTREE_INSERT_LAZY_RW); if (ret) { bch_err(c, "error writing alloc info"); goto err; @@ -1281,15 +1303,29 @@ int bch2_fs_initialize(struct bch_fs *c) bch2_fs_journal_start(&c->journal, 1, &journal); bch2_journal_set_replay_done(&c->journal); + err = "error going read-write"; + ret = bch2_fs_read_write_early(c); + if (ret) + goto err; + + /* + * Write 
out the superblock and journal buckets, now that we can do + * btree updates + */ + err = "error writing alloc info"; + ret = bch2_alloc_write(c, 0); + if (ret) + goto err; + bch2_inode_init(c, &root_inode, 0, 0, S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO, 0, NULL); root_inode.bi_inum = BCACHEFS_ROOT_INO; - bch2_inode_pack(&packed_inode, &root_inode); + bch2_inode_pack(c, &packed_inode, &root_inode); err = "error creating root directory"; ret = bch2_btree_insert(c, BTREE_ID_INODES, &packed_inode.inode.k_i, - NULL, NULL, BTREE_INSERT_LAZY_RW); + NULL, NULL, 0); if (ret) goto err; diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c index 3c473f1380a6..8abcbfb3bd64 100644 --- a/fs/bcachefs/reflink.c +++ b/fs/bcachefs/reflink.c @@ -9,6 +9,18 @@ #include <linux/sched/signal.h> +static inline unsigned bkey_type_to_indirect(const struct bkey *k) +{ + switch (k->type) { + case KEY_TYPE_extent: + return KEY_TYPE_reflink_v; + case KEY_TYPE_inline_data: + return KEY_TYPE_indirect_inline_data; + default: + return 0; + } +} + /* reflink pointers */ const char *bch2_reflink_p_invalid(const struct bch_fs *c, struct bkey_s_c k) @@ -71,17 +83,42 @@ void bch2_reflink_v_to_text(struct printbuf *out, struct bch_fs *c, bch2_bkey_ptrs_to_text(out, c, k); } +/* indirect inline data */ + +const char *bch2_indirect_inline_data_invalid(const struct bch_fs *c, + struct bkey_s_c k) +{ + if (bkey_val_bytes(k.k) < sizeof(struct bch_indirect_inline_data)) + return "incorrect value size"; + return NULL; +} + +void bch2_indirect_inline_data_to_text(struct printbuf *out, + struct bch_fs *c, struct bkey_s_c k) +{ + struct bkey_s_c_indirect_inline_data d = bkey_s_c_to_indirect_inline_data(k); + unsigned datalen = bkey_inline_data_bytes(k.k); + + pr_buf(out, "refcount %llu datalen %u: %*phN", + le64_to_cpu(d.v->refcount), datalen, + min(datalen, 32U), d.v->data); +} + static int bch2_make_extent_indirect(struct btree_trans *trans, struct btree_iter *extent_iter, - struct bkey_i_extent *e) + struct bkey_i *orig) { struct bch_fs *c = trans->c; struct btree_iter *reflink_iter; struct bkey_s_c k; - struct bkey_i_reflink_v *r_v; + struct bkey_i *r_v; struct bkey_i_reflink_p *r_p; + __le64 *refcount; int ret; + if (orig->k.type == KEY_TYPE_inline_data) + bch2_check_set_feature(c, BCH_FEATURE_reflink_inline_data); + for_each_btree_key(trans, reflink_iter, BTREE_ID_REFLINK, POS(0, c->reflink_hint), BTREE_ITER_INTENT|BTREE_ITER_SLOTS, k, ret) { @@ -90,7 +127,7 @@ static int bch2_make_extent_indirect(struct btree_trans *trans, continue; } - if (bkey_deleted(k.k) && e->k.size <= k.k->size) + if (bkey_deleted(k.k) && orig->k.size <= k.k->size) break; } @@ -100,29 +137,31 @@ static int bch2_make_extent_indirect(struct btree_trans *trans, /* rewind iter to start of hole, if necessary: */ bch2_btree_iter_set_pos(reflink_iter, bkey_start_pos(k.k)); - r_v = bch2_trans_kmalloc(trans, sizeof(*r_v) + bkey_val_bytes(&e->k)); + r_v = bch2_trans_kmalloc(trans, sizeof(__le64) + bkey_val_bytes(&orig->k)); ret = PTR_ERR_OR_ZERO(r_v); if (ret) goto err; - bkey_reflink_v_init(&r_v->k_i); + bkey_init(&r_v->k); + r_v->k.type = bkey_type_to_indirect(&orig->k); r_v->k.p = reflink_iter->pos; - bch2_key_resize(&r_v->k, e->k.size); - r_v->k.version = e->k.version; + bch2_key_resize(&r_v->k, orig->k.size); + r_v->k.version = orig->k.version; + + set_bkey_val_bytes(&r_v->k, sizeof(__le64) + bkey_val_bytes(&orig->k)); - set_bkey_val_u64s(&r_v->k, bkey_val_u64s(&r_v->k) + - bkey_val_u64s(&e->k)); - r_v->v.refcount = 0; - memcpy(r_v->v.start, e->v.start, 
bkey_val_bytes(&e->k)); + refcount = (void *) &r_v->v; + *refcount = 0; + memcpy(refcount + 1, &orig->v, bkey_val_bytes(&orig->k)); - bch2_trans_update(trans, reflink_iter, &r_v->k_i, 0); + bch2_trans_update(trans, reflink_iter, r_v, 0); r_p = bch2_trans_kmalloc(trans, sizeof(*r_p)); if (IS_ERR(r_p)) return PTR_ERR(r_p); - e->k.type = KEY_TYPE_reflink_p; - r_p = bkey_i_to_reflink_p(&e->k_i); + orig->k.type = KEY_TYPE_reflink_p; + r_p = bkey_i_to_reflink_p(orig); set_bkey_val_bytes(&r_p->k, sizeof(r_p->v)); r_p->v.idx = cpu_to_le64(bkey_start_offset(&r_v->k)); @@ -144,8 +183,7 @@ static struct bkey_s_c get_next_src(struct btree_iter *iter, struct bpos end) if (bkey_cmp(iter->pos, end) >= 0) return bkey_s_c_null; - if (k.k->type == KEY_TYPE_extent || - k.k->type == KEY_TYPE_reflink_p) + if (bkey_extent_is_data(k.k)) break; } @@ -218,7 +256,7 @@ s64 bch2_remap_range(struct bch_fs *c, if (!bkey_cmp(dst_iter->pos, dst_end)) break; - if (src_k.k->type == KEY_TYPE_extent) { + if (src_k.k->type != KEY_TYPE_reflink_p) { bkey_on_stack_reassemble(&new_src, c, src_k); src_k = bkey_i_to_s_c(new_src.k); @@ -226,7 +264,7 @@ s64 bch2_remap_range(struct bch_fs *c, bch2_cut_back(src_end, new_src.k); ret = bch2_make_extent_indirect(&trans, src_iter, - bkey_i_to_extent(new_src.k)); + new_src.k); if (ret) goto btree_err; diff --git a/fs/bcachefs/reflink.h b/fs/bcachefs/reflink.h index 5445c1cf0797..9d5e7dc58f2b 100644 --- a/fs/bcachefs/reflink.h +++ b/fs/bcachefs/reflink.h @@ -18,13 +18,22 @@ const char *bch2_reflink_v_invalid(const struct bch_fs *, struct bkey_s_c); void bch2_reflink_v_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); - #define bch2_bkey_ops_reflink_v (struct bkey_ops) { \ .key_invalid = bch2_reflink_v_invalid, \ .val_to_text = bch2_reflink_v_to_text, \ .swab = bch2_ptr_swab, \ } +const char *bch2_indirect_inline_data_invalid(const struct bch_fs *, + struct bkey_s_c); +void bch2_indirect_inline_data_to_text(struct printbuf *, + struct bch_fs *, struct bkey_s_c); + +#define bch2_bkey_ops_indirect_inline_data (struct bkey_ops) { \ + .key_invalid = bch2_indirect_inline_data_invalid, \ + .val_to_text = bch2_indirect_inline_data_to_text, \ +} + s64 bch2_remap_range(struct bch_fs *, struct bpos, struct bpos, u64, u64 *, u64, s64 *); diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c index 6b6506c68609..91518c0d6794 100644 --- a/fs/bcachefs/replicas.c +++ b/fs/bcachefs/replicas.c @@ -122,7 +122,7 @@ void bch2_bkey_to_replicas(struct bch_replicas_entry *e, extent_to_replicas(k, e); break; case KEY_TYPE_stripe: - e->data_type = BCH_DATA_user; + e->data_type = BCH_DATA_parity; stripe_to_replicas(k, e); break; } @@ -446,7 +446,23 @@ static int __bch2_mark_bkey_replicas(struct bch_fs *c, struct bkey_s_c k, bch2_bkey_to_replicas(&search.e, k); - return __bch2_mark_replicas(c, &search.e, check); + ret = __bch2_mark_replicas(c, &search.e, check); + if (ret) + return ret; + + if (search.e.data_type == BCH_DATA_parity) { + search.e.data_type = BCH_DATA_cached; + ret = __bch2_mark_replicas(c, &search.e, check); + if (ret) + return ret; + + search.e.data_type = BCH_DATA_user; + ret = __bch2_mark_replicas(c, &search.e, check); + if (ret) + return ret; + } + + return 0; } bool bch2_bkey_replicas_marked(struct bch_fs *c, diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 30be083b09bf..8673e9744ce1 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -149,44 +149,6 @@ struct bch_fs *bch2_uuid_to_fs(uuid_le uuid) return c; } -int bch2_congested(void *data, int bdi_bits) -{ - 
struct bch_fs *c = data; - struct backing_dev_info *bdi; - struct bch_dev *ca; - unsigned i; - int ret = 0; - - rcu_read_lock(); - if (bdi_bits & (1 << WB_sync_congested)) { - /* Reads - check all devices: */ - for_each_readable_member(ca, c, i) { - bdi = ca->disk_sb.bdev->bd_bdi; - - if (bdi_congested(bdi, bdi_bits)) { - ret = 1; - break; - } - } - } else { - const struct bch_devs_mask *devs = - bch2_target_to_mask(c, c->opts.foreground_target) ?: - &c->rw_devs[BCH_DATA_user]; - - for_each_member_device_rcu(ca, c, i, devs) { - bdi = ca->disk_sb.bdev->bd_bdi; - - if (bdi_congested(bdi, bdi_bits)) { - ret = 1; - break; - } - } - } - rcu_read_unlock(); - - return ret; -} - /* Filesystem RO/RW: */ /* @@ -207,14 +169,15 @@ int bch2_congested(void *data, int bdi_bits) static void __bch2_fs_read_only(struct bch_fs *c) { struct bch_dev *ca; - bool wrote = false; unsigned i, clean_passes = 0; - int ret; bch2_rebalance_stop(c); bch2_copygc_stop(c); bch2_gc_thread_stop(c); + bch2_io_timer_del(&c->io_clock[READ], &c->bucket_clock[READ].rescale); + bch2_io_timer_del(&c->io_clock[WRITE], &c->bucket_clock[WRITE].rescale); + /* * Flush journal before stopping allocators, because flushing journal * blacklist entries involves allocating new btree nodes: @@ -228,20 +191,6 @@ static void __bch2_fs_read_only(struct bch_fs *c) if (!test_bit(BCH_FS_ALLOCATOR_RUNNING, &c->flags)) goto nowrote_alloc; - bch_verbose(c, "writing alloc info"); - /* - * This should normally just be writing the bucket read/write clocks: - */ - ret = bch2_stripes_write(c, BTREE_INSERT_NOCHECK_RW, &wrote) ?: - bch2_alloc_write(c, BTREE_INSERT_NOCHECK_RW, &wrote); - bch_verbose(c, "writing alloc info complete"); - - if (ret && !test_bit(BCH_FS_EMERGENCY_RO, &c->flags)) - bch2_fs_inconsistent(c, "error writing out alloc info %i", ret); - - if (ret) - goto nowrote_alloc; - bch_verbose(c, "flushing journal and stopping allocators"); bch2_journal_flush_all_pins(&c->journal); @@ -454,6 +403,9 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early) bch2_dev_allocator_add(c, ca); bch2_recalc_capacity(c); + bch2_io_timer_add(&c->io_clock[READ], &c->bucket_clock[READ].rescale); + bch2_io_timer_add(&c->io_clock[WRITE], &c->bucket_clock[WRITE].rescale); + for_each_rw_member(ca, c, i) { ret = bch2_dev_allocator_start(ca); if (ret) { @@ -496,9 +448,10 @@ int bch2_fs_read_write_early(struct bch_fs *c) /* Filesystem startup/shutdown: */ -static void bch2_fs_free(struct bch_fs *c) +static void __bch2_fs_free(struct bch_fs *c) { unsigned i; + int cpu; for (i = 0; i < BCH_TIME_STAT_NR; i++) bch2_time_stats_exit(&c->times[i]); @@ -523,6 +476,12 @@ static void bch2_fs_free(struct bch_fs *c) free_percpu(c->usage[1]); free_percpu(c->usage[0]); kfree(c->usage_base); + + if (c->btree_iters_bufs) + for_each_possible_cpu(cpu) + kfree(per_cpu_ptr(c->btree_iters_bufs, cpu)->iter); + + free_percpu(c->btree_iters_bufs); free_percpu(c->pcpu); mempool_exit(&c->large_bkey_pool); mempool_exit(&c->btree_bounce_pool); @@ -533,6 +492,7 @@ static void bch2_fs_free(struct bch_fs *c) kfree(c->replicas_gc.entries); kfree(rcu_dereference_protected(c->disk_groups, 1)); kfree(c->journal_seq_blacklist_table); + kfree(c->unused_inode_hints); free_heap(&c->copygc_heap); if (c->journal_reclaim_wq) @@ -552,10 +512,10 @@ static void bch2_fs_release(struct kobject *kobj) { struct bch_fs *c = container_of(kobj, struct bch_fs, kobj); - bch2_fs_free(c); + __bch2_fs_free(c); } -void bch2_fs_stop(struct bch_fs *c) +void __bch2_fs_stop(struct bch_fs *c) { struct bch_dev *ca; unsigned 
i; @@ -586,13 +546,6 @@ void bch2_fs_stop(struct bch_fs *c) kobject_put(&c->opts_dir); kobject_put(&c->internal); - mutex_lock(&bch_fs_list_lock); - list_del(&c->list); - mutex_unlock(&bch_fs_list_lock); - - closure_sync(&c->cl); - closure_debug_destroy(&c->cl); - /* btree prefetch might have kicked off reads in the background: */ bch2_btree_flush_all_reads(c); @@ -605,6 +558,22 @@ void bch2_fs_stop(struct bch_fs *c) for (i = 0; i < c->sb.nr_devices; i++) if (c->devs[i]) + bch2_free_super(&c->devs[i]->disk_sb); +} + +void bch2_fs_free(struct bch_fs *c) +{ + unsigned i; + + mutex_lock(&bch_fs_list_lock); + list_del(&c->list); + mutex_unlock(&bch_fs_list_lock); + + closure_sync(&c->cl); + closure_debug_destroy(&c->cl); + + for (i = 0; i < c->sb.nr_devices; i++) + if (c->devs[i]) bch2_dev_free(rcu_dereference_protected(c->devs[i], 1)); bch_verbose(c, "shutdown complete"); @@ -612,6 +581,12 @@ void bch2_fs_stop(struct bch_fs *c) kobject_put(&c->kobj); } +void bch2_fs_stop(struct bch_fs *c) +{ + __bch2_fs_stop(c); + bch2_fs_free(c); +} + static const char *bch2_fs_online(struct bch_fs *c) { struct bch_dev *ca; @@ -669,6 +644,14 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) __module_get(THIS_MODULE); + closure_init(&c->cl, NULL); + + c->kobj.kset = bcachefs_kset; + kobject_init(&c->kobj, &bch2_fs_ktype); + kobject_init(&c->internal, &bch2_fs_internal_ktype); + kobject_init(&c->opts_dir, &bch2_fs_opts_dir_ktype); + kobject_init(&c->time_stats, &bch2_fs_time_stats_ktype); + c->minor = -1; c->disk_sb.fs_sb = true; @@ -761,11 +744,13 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) (btree_blocks(c) + 1) * 2 * sizeof(struct sort_iter_set); + c->inode_shard_bits = ilog2(roundup_pow_of_two(num_possible_cpus())); + if (!(c->wq = alloc_workqueue("bcachefs", WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 1)) || - !(c->copygc_wq = alloc_workqueue("bcache_copygc", + !(c->copygc_wq = alloc_workqueue("bcachefs_copygc", WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 1)) || - !(c->journal_reclaim_wq = alloc_workqueue("bcache_journal", + !(c->journal_reclaim_wq = alloc_workqueue("bcachefs_journal_reclaim", WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_HIGHPRI, 1)) || percpu_ref_init(&c->writes, bch2_writes_disabled, PERCPU_REF_INIT_DEAD, GFP_KERNEL) || @@ -775,9 +760,12 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) offsetof(struct btree_write_bio, wbio.bio)), BIOSET_NEED_BVECS) || !(c->pcpu = alloc_percpu(struct bch_fs_pcpu)) || + !(c->btree_iters_bufs = alloc_percpu(struct btree_iter_buf)) || mempool_init_kvpmalloc_pool(&c->btree_bounce_pool, 1, btree_bytes(c)) || mempool_init_kmalloc_pool(&c->large_bkey_pool, 1, 2048) || + !(c->unused_inode_hints = kcalloc(1U << c->inode_shard_bits, + sizeof(u64), GFP_KERNEL)) || bch2_io_clock_init(&c->io_clock[READ]) || bch2_io_clock_init(&c->io_clock[WRITE]) || bch2_fs_journal_init(&c->journal) || @@ -799,18 +787,6 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) bch2_dev_alloc(c, i)) goto err; - /* - * Now that all allocations have succeeded, init various refcounty - * things that let us shutdown: - */ - closure_init(&c->cl, NULL); - - c->kobj.kset = bcachefs_kset; - kobject_init(&c->kobj, &bch2_fs_ktype); - kobject_init(&c->internal, &bch2_fs_internal_ktype); - kobject_init(&c->opts_dir, &bch2_fs_opts_dir_ktype); - kobject_init(&c->time_stats, &bch2_fs_time_stats_ktype); - mutex_lock(&bch_fs_list_lock); err = bch2_fs_online(c); mutex_unlock(&bch_fs_list_lock); 
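Aside on the unused_inode_hints sizing in the hunk above: ilog2(roundup_pow_of_two(num_possible_cpus())) is simply ceil(log2(ncpus)), so the kcalloc() gets one u64 hint slot per possible CPU, rounded up to the next power of two. A minimal userspace sketch of the same arithmetic (the two helpers are portable stand-ins for the kernel's ilog2() and roundup_pow_of_two(); the cpu count is a made-up example):

	#include <stdio.h>
	#include <stdint.h>

	static unsigned ilog2_u32(uint32_t v)	/* floor(log2(v)), v > 0 */
	{
		unsigned r = 0;

		while (v >>= 1)
			r++;
		return r;
	}

	static uint32_t roundup_pow_of_two_u32(uint32_t v)
	{
		uint32_t r = 1;

		while (r < v)
			r <<= 1;
		return r;
	}

	int main(void)
	{
		unsigned ncpus = 6;	/* stand-in for num_possible_cpus() */
		unsigned shard_bits = ilog2_u32(roundup_pow_of_two_u32(ncpus));
		unsigned nhints = 1U << shard_bits;

		/* mirrors kcalloc(1U << c->inode_shard_bits, sizeof(u64), ...) */
		printf("%u cpus -> %u shard bits -> %u slots\n",
		       ncpus, shard_bits, nhints);	/* 6 cpus -> 3 shard bits -> 8 slots */
		return 0;
	}

A power-of-two slot count lets a caller pick a shard with hash & (nhints - 1); whether that is the exact indexing bcachefs uses is not visible in this hunk.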
@@ -906,6 +882,13 @@ int bch2_fs_start(struct bch_fs *c) set_bit(BCH_FS_STARTED, &c->flags); + /* + * Allocator threads don't start filling copygc reserve until after we + * set BCH_FS_STARTED - wake them now: + */ + for_each_online_member(ca, c, i) + bch2_wake_allocator(ca); + if (c->opts.read_only || c->opts.nochanges) { bch2_fs_read_only(c); } else { @@ -1683,6 +1666,11 @@ have_slot: bch2_write_super(c); mutex_unlock(&c->sb_lock); + err = "alloc write failed"; + ret = bch2_dev_alloc_write(c, ca, 0); + if (ret) + goto err; + if (ca->mi.state == BCH_MEMBER_STATE_RW) { err = __bch2_dev_read_write(c, ca); if (err) @@ -2037,7 +2025,6 @@ static void bcachefs_exit(void) static int __init bcachefs_init(void) { bch2_bkey_pack_test(); - bch2_inode_pack_test(); if (!(bcachefs_kset = kset_create_and_add("bcachefs", NULL, fs_kobj)) || bch2_chardev_init() || diff --git a/fs/bcachefs/super.h b/fs/bcachefs/super.h index fffee96726ce..02c81f3555c3 100644 --- a/fs/bcachefs/super.h +++ b/fs/bcachefs/super.h @@ -199,7 +199,6 @@ static inline struct bch_devs_mask bch2_online_devs(struct bch_fs *c) struct bch_fs *bch2_bdev_to_fs(struct block_device *); struct bch_fs *bch2_uuid_to_fs(uuid_le); -int bch2_congested(void *, int); bool bch2_dev_state_allowed(struct bch_fs *, struct bch_dev *, enum bch_member_state, int); @@ -231,6 +230,8 @@ static inline void bch2_fs_lazy_rw(struct bch_fs *c) bch2_fs_read_write_early(c); } +void __bch2_fs_stop(struct bch_fs *); +void bch2_fs_free(struct bch_fs *); void bch2_fs_stop(struct bch_fs *); int bch2_fs_start(struct bch_fs *); diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index 0cb29f43d99d..d7ad293aff4d 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -208,12 +208,6 @@ read_attribute(io_timers_write); write_attribute(perf_test); #endif /* CONFIG_BCACHEFS_TESTS */ -#define BCH_DEBUG_PARAM(name, description) \ - rw_attribute(name); - - BCH_DEBUG_PARAMS() -#undef BCH_DEBUG_PARAM - #define x(_name) \ static struct attribute sysfs_time_stat_##_name = \ { .name = #_name, .mode = S_IRUGO }; @@ -414,10 +408,6 @@ SHOW(bch2_fs) return out.pos - buf; } -#define BCH_DEBUG_PARAM(name, description) sysfs_print(name, c->name); - BCH_DEBUG_PARAMS() -#undef BCH_DEBUG_PARAM - return 0; } @@ -462,10 +452,6 @@ STORE(bch2_fs) /* Debugging: */ -#define BCH_DEBUG_PARAM(name, description) sysfs_strtoul(name, c->name); - BCH_DEBUG_PARAMS() -#undef BCH_DEBUG_PARAM - if (!test_bit(BCH_FS_STARTED, &c->flags)) return -EPERM; @@ -590,11 +576,6 @@ struct attribute *bch2_fs_internal_files[] = { &sysfs_io_timers_write, &sysfs_internal_uuid, - -#define BCH_DEBUG_PARAM(name, description) &sysfs_##name, - BCH_DEBUG_PARAMS() -#undef BCH_DEBUG_PARAM - NULL }; diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c index fd4044a6a08f..2709163e02b5 100644 --- a/fs/bcachefs/util.c +++ b/fs/bcachefs/util.c @@ -520,7 +520,7 @@ int bch2_bio_alloc_pages(struct bio *bio, size_t size, gfp_t gfp_mask) { while (size) { struct page *page = alloc_page(gfp_mask); - unsigned len = min(PAGE_SIZE, size); + unsigned len = min_t(size_t, PAGE_SIZE, size); if (!page) return -ENOMEM; diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h index 119c86122023..6e5335440b4b 100644 --- a/fs/bcachefs/util.h +++ b/fs/bcachefs/util.h @@ -37,17 +37,6 @@ struct closure; #define atomic64_sub_bug(i, v) BUG_ON(atomic64_sub_return(i, v) < 0) #define atomic64_add_bug(i, v) BUG_ON(atomic64_add_return(i, v) < 0) -#define memcpy(dst, src, len) \ -({ \ - void *_dst = (dst); \ - const void *_src = (src); \ - size_t _len = (len); \ - 
\ - BUG_ON(!((void *) (_dst) >= (void *) (_src) + (_len) || \ - (void *) (_dst) + (_len) <= (void *) (_src))); \ - memcpy(_dst, _src, _len); \ -}) - #else /* DEBUG */ #define EBUG_ON(cond) @@ -99,7 +88,7 @@ static inline void *vpmalloc(size_t size, gfp_t gfp_mask) { return (void *) __get_free_pages(gfp_mask|__GFP_NOWARN, get_order(size)) ?: - __vmalloc(size, gfp_mask, PAGE_KERNEL); + __vmalloc(size, gfp_mask); } static inline void kvpfree(void *p, size_t size) @@ -664,35 +653,6 @@ static inline void memset_u64s_tail(void *s, int c, unsigned bytes) memset(s + bytes, c, rem); } -static inline struct bio_vec next_contig_bvec(struct bio *bio, - struct bvec_iter *iter) -{ - struct bio_vec bv = bio_iter_iovec(bio, *iter); - - bio_advance_iter(bio, iter, bv.bv_len); -#ifndef CONFIG_HIGHMEM - while (iter->bi_size) { - struct bio_vec next = bio_iter_iovec(bio, *iter); - - if (page_address(bv.bv_page) + bv.bv_offset + bv.bv_len != - page_address(next.bv_page) + next.bv_offset) - break; - - bv.bv_len += next.bv_len; - bio_advance_iter(bio, iter, next.bv_len); - } -#endif - return bv; -} - -#define __bio_for_each_contig_segment(bv, bio, iter, start) \ - for (iter = (start); \ - (iter).bi_size && \ - ((bv = next_contig_bvec((bio), &(iter))), 1);) - -#define bio_for_each_contig_segment(bv, bio, iter) \ - __bio_for_each_contig_segment(bv, bio, iter, (bio)->bi_iter) - void sort_cmp_size(void *base, size_t num, size_t size, int (*cmp_func)(const void *, const void *, size_t), void (*swap_func)(void *, void *, size_t)); diff --git a/fs/bcachefs/varint.c b/fs/bcachefs/varint.c new file mode 100644 index 000000000000..a3d252c741c8 --- /dev/null +++ b/fs/bcachefs/varint.c @@ -0,0 +1,42 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <linux/bitops.h> +#include <asm/unaligned.h> + +#include "varint.h" + +int bch2_varint_encode(u8 *out, u64 v) +{ + unsigned bits = fls64(v|1); + unsigned bytes = DIV_ROUND_UP(bits, 7); + + if (likely(bytes < 9)) { + v <<= bytes; + v |= ~(~0 << (bytes - 1)); + } else { + *out++ = 255; + bytes = 9; + } + + put_unaligned_le64(v, out); + return bytes; +} + +int bch2_varint_decode(const u8 *in, const u8 *end, u64 *out) +{ + u64 v = get_unaligned_le64(in); + unsigned bytes = ffz(v & 255) + 1; + + if (unlikely(in + bytes > end)) + return -1; + + if (likely(bytes < 9)) { + v >>= bytes; + v &= ~(~0ULL << (7 * bytes)); + } else { + v = get_unaligned_le64(++in); + } + + *out = v; + return bytes; +} diff --git a/fs/bcachefs/varint.h b/fs/bcachefs/varint.h new file mode 100644 index 000000000000..8daf813576b7 --- /dev/null +++ b/fs/bcachefs/varint.h @@ -0,0 +1,8 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_VARINT_H +#define _BCACHEFS_VARINT_H + +int bch2_varint_encode(u8 *, u64); +int bch2_varint_decode(const u8 *, const u8 *, u64 *); + +#endif /* _BCACHEFS_VARINT_H */ diff --git a/include/trace/events/bcachefs.h b/include/trace/events/bcachefs.h index 9b4e8295ed75..ba2c55559796 100644 --- a/include/trace/events/bcachefs.h +++ b/include/trace/events/bcachefs.h @@ -536,9 +536,46 @@ DEFINE_EVENT(transaction_restart, trans_restart_btree_node_reused, TP_ARGS(ip) ); -DEFINE_EVENT(transaction_restart, trans_restart_would_deadlock, - TP_PROTO(unsigned long ip), - TP_ARGS(ip) +TRACE_EVENT(trans_restart_would_deadlock, + TP_PROTO(unsigned long trans_ip, + unsigned long caller_ip, + unsigned reason, + enum btree_id have_btree_id, + unsigned have_iter_type, + enum btree_id want_btree_id, + unsigned want_iter_type), + TP_ARGS(trans_ip, caller_ip, reason, + have_btree_id, 
have_iter_type, + want_btree_id, want_iter_type), + + TP_STRUCT__entry( + __field(unsigned long, trans_ip ) + __field(unsigned long, caller_ip ) + __field(u8, reason ) + __field(u8, have_btree_id ) + __field(u8, have_iter_type ) + __field(u8, want_btree_id ) + __field(u8, want_iter_type ) + ), + + TP_fast_assign( + __entry->trans_ip = trans_ip; + __entry->caller_ip = caller_ip; + __entry->reason = reason; + __entry->have_btree_id = have_btree_id; + __entry->have_iter_type = have_iter_type; + __entry->want_btree_id = want_btree_id; + __entry->want_iter_type = want_iter_type; + ), + + TP_printk("%pF %pF because %u have %u:%u want %u:%u", + (void *) __entry->trans_ip, + (void *) __entry->caller_ip, + __entry->reason, + __entry->have_btree_id, + __entry->have_iter_type, + __entry->want_btree_id, + __entry->want_iter_type) ); TRACE_EVENT(trans_restart_iters_realloced,
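A note on the varint format added in varint.c above: this is a prefix varint, not LEB128. A value needing b encoded bytes (b <= 8) is shifted left by b and its low b - 1 bits are set to ones, so the length is recoverable from the first byte alone (ffz(v & 255) + 1 in the decoder) with one ffz instead of a per-byte continuation-bit loop; anything larger gets an all-ones 0xff tag byte followed by the raw little-endian u64, nine bytes total. The kernel version leans on fls64(), ffz() and unaligned 8-byte loads (hence the mask after the shift in decode, which clears bits pulled in past the encoded length); the round-trip sketch below is a portable userspace re-implementation of the same scheme for illustration, not the commit's code:

	#include <assert.h>
	#include <stdint.h>
	#include <stdio.h>

	/* Encode v into out (room for 9 bytes); returns bytes written, 1..9. */
	static int varint_encode(uint8_t *out, uint64_t v)
	{
		unsigned bits = 0, bytes, i;
		uint64_t t = v | 1;

		while (t) {			/* fls64(v|1) */
			bits++;
			t >>= 1;
		}
		bytes = (bits + 6) / 7;		/* DIV_ROUND_UP(bits, 7) */

		if (bytes < 9) {
			v = (v << bytes) | ((1ULL << (bytes - 1)) - 1);
			for (i = 0; i < bytes; i++)	/* little endian */
				out[i] = v >> (8 * i);
		} else {
			out[0] = 255;		/* all-ones tag: 9-byte escape */
			for (i = 0; i < 8; i++)
				out[i + 1] = v >> (8 * i);
			bytes = 9;
		}
		return bytes;
	}

	/* Decode from [in, end); returns bytes consumed, or -1 if too short. */
	static int varint_decode(const uint8_t *in, const uint8_t *end,
				 uint64_t *out)
	{
		unsigned bytes = 1, i;
		uint64_t v = 0;

		if (in >= end)
			return -1;
		while (bytes <= 8 && (in[0] & (1u << (bytes - 1))))
			bytes++;		/* ffz(in[0]) + 1 */
		if (in + bytes > end)
			return -1;

		if (bytes < 9) {
			for (i = 0; i < bytes; i++)
				v |= (uint64_t) in[i] << (8 * i);
			v >>= bytes;		/* drop the length tag */
		} else {
			for (i = 0; i < 8; i++)
				v |= (uint64_t) in[i + 1] << (8 * i);
		}
		*out = v;
		return bytes;
	}

	int main(void)
	{
		uint64_t samples[] = { 0, 1, 127, 128, 1ULL << 56, UINT64_MAX };
		uint8_t buf[9];
		unsigned i;

		for (i = 0; i < sizeof(samples) / sizeof(samples[0]); i++) {
			uint64_t r;
			int n = varint_encode(buf, samples[i]);

			assert(varint_decode(buf, buf + n, &r) == n);
			assert(r == samples[i]);
			printf("%llu -> %d byte(s)\n",
			       (unsigned long long) samples[i], n);
		}
		return 0;
	}

Because the length is known after one byte, decode branches once per value rather than once per byte; presumably that is why this layout was chosen, likely in support of the new inode packing (note the bch2_inode_pack(c, ...) signature change and the dropped bch2_inode_pack_test() call elsewhere in this merge).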