author		Kent Overstreet <kent.overstreet@gmail.com>	2020-12-10 18:10:06 -0500
committer	Kent Overstreet <kent.overstreet@gmail.com>	2020-12-10 18:28:28 -0500
commit		73f27dc83d8bd75a25b5d60f8806b0c36d3d8ed2 (patch)
tree		a33e6a2edc8f7fc4134b29f7446686b47c986cf1
parent		d61d8760d1142bdaf250e656892f341a1fb18dcd (diff)

Merge with 55e26c434e bcachefs: Always check if we need disk res in extent update path

59 files changed, 2458 insertions, 1937 deletions
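The recurring themes of this merge are visible in the diff below: a new SRCU barrier (c->btree_trans_barrier) protecting btree transactions, a real shrinker plus nr_freed/nr_keys/nr_dirty accounting for the btree key cache, journal-reclaim back-pressure on key cache updates (BTREE_INSERT_NEED_JOURNAL_RECLAIM), and non-flush journal writes (JSET_NO_FLUSH). As a reading aid, here is a minimal sketch of the SRCU deferred-free pattern the key cache now relies on; the demo_* names are invented for illustration, and only the SRCU calls themselves (srcu_read_lock(), start_poll_synchronize_srcu(), poll_state_synchronize_srcu()) match what the patch actually uses:

#include <linux/srcu.h>
#include <linux/list.h>
#include <linux/slab.h>

/* Plays the role of c->btree_trans_barrier: */
static DEFINE_SRCU(demo_barrier);

struct demo_entry {
	struct list_head	list;
	/* grace-period cookie, as ck->btree_trans_barrier_seq in the patch: */
	unsigned long		barrier_seq;
};

/* Freed-but-not-yet-reclaimable entries; caller serializes list access
 * (bc->lock in the patch): */
static LIST_HEAD(demo_freed);

/* Reader side, as in bch2_trans_init()/bch2_trans_exit(): hold the SRCU
 * read lock for the lifetime of the transaction: */
static void demo_read(void)
{
	int idx = srcu_read_lock(&demo_barrier);

	/* ...may safely dereference entries that were freed but whose
	 * grace period has not yet elapsed... */

	srcu_read_unlock(&demo_barrier, idx);
}

/* Free side, as in bkey_cached_free(): don't free immediately, record a
 * cookie for the current grace period and queue the entry: */
static void demo_free(struct demo_entry *e)
{
	e->barrier_seq = start_poll_synchronize_srcu(&demo_barrier);
	list_add_tail(&e->list, &demo_freed);
}

/* Reclaim side, as in the new bch2_btree_key_cache_scan(): oldest
 * entries sit at the head, so stop at the first entry whose grace
 * period hasn't elapsed - everything after it is newer: */
static void demo_reclaim(void)
{
	struct demo_entry *e, *t;

	list_for_each_entry_safe(e, t, &demo_freed, list) {
		if (!poll_state_synchronize_srcu(&demo_barrier, e->barrier_seq))
			break;
		list_del(&e->list);
		kfree(e);
	}
}

Because readers only pin a grace period rather than hold a lock, the key cache shrinker can poll and free entries without blocking running btree transactions; it simply bails out at the first entry that is still too new to free.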
diff --git a/fs/bcachefs/Kconfig b/fs/bcachefs/Kconfig index 5594af719b2a..57c5d58c2d87 100644 --- a/fs/bcachefs/Kconfig +++ b/fs/bcachefs/Kconfig @@ -20,6 +20,7 @@ config BCACHEFS_FS select SIXLOCKS select RAID6_PQ select XOR_BLOCKS + select SRCU help The bcachefs filesystem - a modern, copy on write filesystem, with support for multiple devices, compression, checksumming, etc. diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 97508de9f721..09a7f8c8583a 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -505,9 +505,6 @@ int bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev, BTREE_ITER_CACHED| BTREE_ITER_CACHED_NOFILL| BTREE_ITER_INTENT); - if (IS_ERR(iter)) - return PTR_ERR(iter); - a = bch2_trans_kmalloc(trans, BKEY_ALLOC_U64s_MAX * 8); ret = PTR_ERR_OR_ZERO(a); if (ret) @@ -1456,7 +1453,7 @@ int bch2_dev_allocator_start(struct bch_dev *ca) return 0; p = kthread_create(bch2_allocator_thread, ca, - "bch_alloc[%s]", ca->name); + "bch-alloc/%s", ca->name); if (IS_ERR(p)) return PTR_ERR(p); diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 9749cde23cf3..eb5b40804773 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -193,6 +193,7 @@ #include <linux/semaphore.h> #include <linux/seqlock.h> #include <linux/shrinker.h> +#include <linux/srcu.h> #include <linux/types.h> #include <linux/workqueue.h> #include <linux/zstd.h> @@ -213,9 +214,11 @@ dynamic_fault("bcachefs:meta:write:" name) #ifdef __KERNEL__ -#define bch2_fmt(_c, fmt) "bcachefs (%s): " fmt "\n", ((_c)->name) +#define bch2_fmt(_c, fmt) "bcachefs (%s): " fmt "\n", ((_c)->name) +#define bch2_fmt_inum(_c, _inum, fmt) "bcachefs (%s inum %llu): " fmt "\n", ((_c)->name), (_inum) #else -#define bch2_fmt(_c, fmt) fmt "\n" +#define bch2_fmt(_c, fmt) fmt "\n" +#define bch2_fmt_inum(_c, _inum, fmt) "inum %llu: " fmt "\n", (_inum) #endif #define bch_info(c, fmt, ...) \ @@ -228,8 +231,11 @@ printk_ratelimited(KERN_WARNING bch2_fmt(c, fmt), ##__VA_ARGS__) #define bch_err(c, fmt, ...) \ printk(KERN_ERR bch2_fmt(c, fmt), ##__VA_ARGS__) + #define bch_err_ratelimited(c, fmt, ...) \ printk_ratelimited(KERN_ERR bch2_fmt(c, fmt), ##__VA_ARGS__) +#define bch_err_inum_ratelimited(c, _inum, fmt, ...) \ + printk_ratelimited(KERN_ERR bch2_fmt_inum(c, _inum, fmt), ##__VA_ARGS__) #define bch_verbose(c, fmt, ...) \ do { \ @@ -642,12 +648,13 @@ struct bch_fs { mempool_t btree_iters_pool; struct btree_iter_buf __percpu *btree_iters_bufs; + struct srcu_struct btree_trans_barrier; + struct btree_key_cache btree_key_cache; struct workqueue_struct *wq; /* copygc needs its own workqueue for index updates.. 
*/ struct workqueue_struct *copygc_wq; - struct workqueue_struct *journal_reclaim_wq; /* ALLOCATION */ struct delayed_work pd_controllers_update; @@ -666,6 +673,7 @@ struct bch_fs { unsigned bucket_size_max; atomic64_t sectors_available; + struct mutex sectors_available_lock; struct bch_fs_pcpu __percpu *pcpu; @@ -673,7 +681,7 @@ struct bch_fs { seqcount_t usage_lock; struct bch_fs_usage *usage_base; - struct bch_fs_usage __percpu *usage[2]; + struct bch_fs_usage __percpu *usage[JOURNAL_BUF_NR]; struct bch_fs_usage __percpu *usage_gc; /* single element mempool: */ @@ -751,7 +759,7 @@ struct bch_fs { ZSTD_parameters zstd_params; struct crypto_shash *sha256; - struct crypto_skcipher *chacha20; + struct crypto_sync_skcipher *chacha20; struct crypto_shash *poly1305; atomic64_t key_version; diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index 94b5418587e3..02a76c3d3acb 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -1332,14 +1332,16 @@ LE64_BITMASK(BCH_SB_ERASURE_CODE, struct bch_sb, flags[3], 0, 16); x(extents_above_btree_updates, 12) \ x(btree_updates_journalled, 13) \ x(reflink_inline_data, 14) \ - x(new_varint, 15) + x(new_varint, 15) \ + x(journal_no_flush, 16) #define BCH_SB_FEATURES_ALL \ ((1ULL << BCH_FEATURE_new_siphash)| \ (1ULL << BCH_FEATURE_new_extent_overwrite)| \ (1ULL << BCH_FEATURE_btree_ptr_v2)| \ (1ULL << BCH_FEATURE_extents_above_btree_updates)|\ - (1ULL << BCH_FEATURE_new_varint))\ + (1ULL << BCH_FEATURE_new_varint)| \ + (1ULL << BCH_FEATURE_journal_no_flush)) enum bch_sb_feature { #define x(f, n) BCH_FEATURE_##f, @@ -1575,6 +1577,7 @@ struct jset { LE32_BITMASK(JSET_CSUM_TYPE, struct jset, flags, 0, 4); LE32_BITMASK(JSET_BIG_ENDIAN, struct jset, flags, 4, 5); +LE32_BITMASK(JSET_NO_FLUSH, struct jset, flags, 5, 6); #define BCH_JOURNAL_BUCKETS_MIN 8 diff --git a/fs/bcachefs/bcachefs_ioctl.h b/fs/bcachefs/bcachefs_ioctl.h index d71157a3e073..0e626b098d91 100644 --- a/fs/bcachefs/bcachefs_ioctl.h +++ b/fs/bcachefs/bcachefs_ioctl.h @@ -73,6 +73,7 @@ struct bch_ioctl_incremental { #define BCH_IOCTL_READ_SUPER _IOW(0xbc, 12, struct bch_ioctl_read_super) #define BCH_IOCTL_DISK_GET_IDX _IOW(0xbc, 13, struct bch_ioctl_disk_get_idx) #define BCH_IOCTL_DISK_RESIZE _IOW(0xbc, 14, struct bch_ioctl_disk_resize) +#define BCH_IOCTL_DISK_RESIZE_JOURNAL _IOW(0xbc,15, struct bch_ioctl_disk_resize_journal) /* ioctl below act on a particular file, not the filesystem as a whole: */ @@ -329,4 +330,17 @@ struct bch_ioctl_disk_resize { __u64 nbuckets; }; +/* + * BCH_IOCTL_DISK_RESIZE_JOURNAL: resize journal on a device + * + * @dev - member to resize + * @nbuckets - new number of buckets + */ +struct bch_ioctl_disk_resize_journal { + __u32 flags; + __u32 pad; + __u64 dev; + __u64 nbuckets; +}; + #endif /* _BCACHEFS_IOCTL_H */ diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c index 99b7fce2bfd3..f5779795a4b2 100644 --- a/fs/bcachefs/bkey_methods.c +++ b/fs/bcachefs/bkey_methods.c @@ -181,8 +181,12 @@ void bch2_bpos_to_text(struct printbuf *out, struct bpos pos) void bch2_bkey_to_text(struct printbuf *out, const struct bkey *k) { if (k) { - pr_buf(out, "u64s %u type %s ", k->u64s, - bch2_bkey_types[k->type]); + pr_buf(out, "u64s %u type ", k->u64s); + + if (k->type < KEY_TYPE_MAX) + pr_buf(out, "%s ", bch2_bkey_types[k->type]); + else + pr_buf(out, "%u ", k->type); bch2_bpos_to_text(out, k->p); @@ -196,10 +200,14 @@ void bch2_bkey_to_text(struct printbuf *out, const struct bkey *k) void bch2_val_to_text(struct printbuf 
*out, struct bch_fs *c, struct bkey_s_c k) { - const struct bkey_ops *ops = &bch2_bkey_ops[k.k->type]; + if (k.k->type < KEY_TYPE_MAX) { + const struct bkey_ops *ops = &bch2_bkey_ops[k.k->type]; - if (likely(ops->val_to_text)) - ops->val_to_text(out, c, k); + if (likely(ops->val_to_text)) + ops->val_to_text(out, c, k); + } else { + pr_buf(out, "(invalid type %u)", k.k->type); + } } void bch2_bkey_val_to_text(struct printbuf *out, struct bch_fs *c, diff --git a/fs/bcachefs/bset.c b/fs/bcachefs/bset.c index 26716657453f..1c7318c6e46f 100644 --- a/fs/bcachefs/bset.c +++ b/fs/bcachefs/bset.c @@ -604,53 +604,23 @@ static inline unsigned bkey_mantissa(const struct bkey_packed *k, return (u16) v; } -static void make_bfloat(struct btree *b, struct bset_tree *t, - unsigned j, - struct bkey_packed *min_key, - struct bkey_packed *max_key) +__always_inline +static inline void __make_bfloat(struct btree *b, struct bset_tree *t, + unsigned j, + struct bkey_packed *min_key, + struct bkey_packed *max_key) { struct bkey_float *f = bkey_float(b, t, j); struct bkey_packed *m = tree_to_bkey(b, t, j); - struct bkey_packed *l, *r; + struct bkey_packed *l = is_power_of_2(j) + ? min_key + : tree_to_prev_bkey(b, t, j >> ffs(j)); + struct bkey_packed *r = is_power_of_2(j + 1) + ? max_key + : tree_to_bkey(b, t, j >> (ffz(j) + 1)); unsigned mantissa; int shift, exponent, high_bit; - if (is_power_of_2(j)) { - l = min_key; - - if (!l->u64s) { - if (!bkey_pack_pos(l, b->data->min_key, b)) { - struct bkey_i tmp; - - bkey_init(&tmp.k); - tmp.k.p = b->data->min_key; - bkey_copy(l, &tmp); - } - } - } else { - l = tree_to_prev_bkey(b, t, j >> ffs(j)); - - EBUG_ON(m < l); - } - - if (is_power_of_2(j + 1)) { - r = max_key; - - if (!r->u64s) { - if (!bkey_pack_pos(r, t->max_key, b)) { - struct bkey_i tmp; - - bkey_init(&tmp.k); - tmp.k.p = t->max_key; - bkey_copy(r, &tmp); - } - } - } else { - r = tree_to_bkey(b, t, j >> (ffz(j) + 1)); - - EBUG_ON(m > r); - } - /* * for failed bfloats, the lookup code falls back to comparing against * the original key. 
@@ -707,6 +677,30 @@ static void make_bfloat(struct btree *b, struct bset_tree *t, f->mantissa = mantissa; } +static void make_bfloat(struct btree *b, struct bset_tree *t, + unsigned j, + struct bkey_packed *min_key, + struct bkey_packed *max_key) +{ + struct bkey_i *k; + + if (is_power_of_2(j) && + !min_key->u64s) { + k = (void *) min_key; + bkey_init(&k->k); + k->k.p = b->data->min_key; + } + + if (is_power_of_2(j + 1) && + !max_key->u64s) { + k = (void *) max_key; + bkey_init(&k->k); + k->k.p = t->max_key; + } + + __make_bfloat(b, t, j, min_key, max_key); +} + /* bytes remaining - only valid for last bset: */ static unsigned __bset_tree_capacity(const struct btree *b, const struct bset_tree *t) { @@ -726,7 +720,7 @@ static unsigned bset_rw_tree_capacity(const struct btree *b, const struct bset_t return __bset_tree_capacity(b, t) / sizeof(struct rw_aux_tree); } -static void __build_rw_aux_tree(struct btree *b, struct bset_tree *t) +static noinline void __build_rw_aux_tree(struct btree *b, struct bset_tree *t) { struct bkey_packed *k; @@ -745,15 +739,12 @@ static void __build_rw_aux_tree(struct btree *b, struct bset_tree *t) } } -static void __build_ro_aux_tree(struct btree *b, struct bset_tree *t) +static noinline void __build_ro_aux_tree(struct btree *b, struct bset_tree *t) { struct bkey_packed *prev = NULL, *k = btree_bkey_first(b, t); - struct bkey_packed min_key, max_key; + struct bkey_i min_key, max_key; unsigned j, cacheline = 1; - /* signal to make_bfloat() that they're uninitialized: */ - min_key.u64s = max_key.u64s = 0; - t->size = min(bkey_to_cacheline(b, t, btree_bkey_last(b, t)), bset_ro_tree_capacity(b, t)); retry: @@ -789,9 +780,16 @@ retry: t->max_key = bkey_unpack_pos(b, prev); + bkey_init(&min_key.k); + min_key.k.p = b->data->min_key; + bkey_init(&max_key.k); + max_key.k.p = t->max_key; + /* Then we build the tree */ eytzinger1_for_each(j, t->size) - make_bfloat(b, t, j, &min_key, &max_key); + __make_bfloat(b, t, j, + bkey_to_packed(&min_key), + bkey_to_packed(&max_key)); } static void bset_alloc_tree(struct btree *b, struct bset_tree *t) diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index 50ea92feae0f..09774f56f11c 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -81,8 +81,7 @@ static int btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp) if (!b->data) return -ENOMEM; - b->aux_data = __vmalloc(btree_aux_data_bytes(b), gfp, - PAGE_KERNEL_EXEC); + b->aux_data = vmalloc_exec(btree_aux_data_bytes(b), gfp); if (!b->aux_data) { kvpfree(b->data, btree_bytes(c)); b->data = NULL; @@ -329,9 +328,9 @@ restart: clear_btree_node_accessed(b); } - memalloc_nofs_restore(flags); mutex_unlock(&bc->lock); out: + memalloc_nofs_restore(flags); return (unsigned long) freed * btree_pages(c); } @@ -382,11 +381,13 @@ void bch2_fs_btree_cache_exit(struct bch_fs *c) if (btree_node_dirty(b)) bch2_btree_complete_write(c, b, btree_current_write(b)); - clear_btree_node_dirty(b); + clear_btree_node_dirty(c, b); btree_node_data_free(c, b); } + BUG_ON(atomic_read(&c->btree_cache.dirty)); + while (!list_empty(&bc->freed)) { b = list_first_entry(&bc->freed, struct btree, list); list_del(&b->list); @@ -446,7 +447,7 @@ int bch2_fs_btree_cache_init(struct bch_fs *c) bc->shrink.scan_objects = bch2_btree_cache_scan; bc->shrink.seeks = 4; bc->shrink.batch = btree_pages(c) * 2; - register_shrinker(&bc->shrink); + ret = register_shrinker(&bc->shrink); out: pr_verbose_init(c->opts, "ret %i", ret); return ret; @@ -1063,3 +1064,9 @@ void 
bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c, stats.floats, stats.failed); } + +void bch2_btree_cache_to_text(struct printbuf *out, struct bch_fs *c) +{ + pr_buf(out, "nr nodes:\t%u\n", c->btree_cache.used); + pr_buf(out, "nr dirty:\t%u\n", atomic_read(&c->btree_cache.dirty)); +} diff --git a/fs/bcachefs/btree_cache.h b/fs/bcachefs/btree_cache.h index 8a19e60e9258..e766ef552ce7 100644 --- a/fs/bcachefs/btree_cache.h +++ b/fs/bcachefs/btree_cache.h @@ -100,5 +100,6 @@ static inline unsigned btree_blocks(struct bch_fs *c) void bch2_btree_node_to_text(struct printbuf *, struct bch_fs *, struct btree *); +void bch2_btree_cache_to_text(struct printbuf *, struct bch_fs *); #endif /* _BCACHEFS_BTREE_CACHE_H */ diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index ba4acc112ed3..6268ea637d19 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -603,7 +603,6 @@ static int bch2_gc_done(struct bch_fs *c, struct genradix_iter dst_iter = genradix_iter_init(&c->stripes[0], 0); struct genradix_iter src_iter = genradix_iter_init(&c->stripes[1], 0); struct stripe *dst, *src; - unsigned i; c->ec_stripes_heap.used = 0; @@ -651,8 +650,8 @@ static int bch2_gc_done(struct bch_fs *c, } }; - bch2_fs_usage_acc_to_base(c, 0); - bch2_fs_usage_acc_to_base(c, 1); + for (i = 0; i < ARRAY_SIZE(c->usage); i++) + bch2_fs_usage_acc_to_base(c, i); bch2_dev_usage_from_buckets(c); @@ -1427,7 +1426,7 @@ int bch2_gc_thread_start(struct bch_fs *c) BUG_ON(c->gc_thread); - p = kthread_create(bch2_gc_thread, c, "bch_gc"); + p = kthread_create(bch2_gc_thread, c, "bch-gc/%s", c->name); if (IS_ERR(p)) return PTR_ERR(p); diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index 10a00085cdd6..87f97ccb3f1f 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -597,18 +597,25 @@ void bch2_btree_init_next(struct bch_fs *c, struct btree *b, bch2_btree_iter_reinit_node(iter, b); } +static void btree_pos_to_text(struct printbuf *out, struct bch_fs *c, + struct btree *b) +{ + pr_buf(out, "%s level %u/%u\n ", + bch2_btree_ids[b->c.btree_id], + b->c.level, + c->btree_roots[b->c.btree_id].level); + bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(&b->key)); +} + static void btree_err_msg(struct printbuf *out, struct bch_fs *c, struct btree *b, struct bset *i, unsigned offset, int write) { - pr_buf(out, "error validating btree node %sat btree %u level %u/%u\n" - "pos ", - write ? "before write " : "", - b->c.btree_id, b->c.level, - c->btree_roots[b->c.btree_id].level); - bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(&b->key)); + pr_buf(out, "error validating btree node %sat btree ", + write ? 
"before write " : "");
+	btree_pos_to_text(out, c, b);
-	pr_buf(out, " node offset %u", b->written);
+	pr_buf(out, "\n node offset %u", b->written);
 	if (i)
 		pr_buf(out, " bset u64s %u", le16_to_cpu(i->u64s));
 }
@@ -628,21 +635,26 @@ enum btree_validate_ret {
 ({									\
	__label__ out;							\
	char _buf[300];							\
+	char *buf = _buf;						\
	struct printbuf out = PBUF(_buf);				\
									\
+	buf = kmalloc(4096, GFP_ATOMIC);				\
+	if (buf)							\
+		out = _PBUF(buf, 4096);					\
+									\
	btree_err_msg(&out, c, b, i, b->written, write);		\
	pr_buf(&out, ": " msg, ##__VA_ARGS__);				\
									\
	if (type == BTREE_ERR_FIXABLE &&				\
	    write == READ &&						\
	    !test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags)) {		\
-		mustfix_fsck_err(c, "%s", _buf);			\
+		mustfix_fsck_err(c, "%s", buf);				\
		goto out;						\
	}								\
									\
	switch (write) {						\
	case READ:							\
-		bch_err(c, "%s", _buf);					\
+		bch_err(c, "%s", buf);					\
									\
		switch (type) {						\
		case BTREE_ERR_FIXABLE:					\
@@ -663,7 +675,7 @@ enum btree_validate_ret {
		}							\
		break;							\
	case WRITE:							\
-		bch_err(c, "corrupt metadata before write: %s", _buf);	\
+		bch_err(c, "corrupt metadata before write: %s", buf);	\
									\
		if (bch2_fs_inconsistent(c)) {				\
			ret = BCH_FSCK_ERRORS_NOT_FIXED;		\
@@ -672,6 +684,8 @@
		break;							\
	}								\
out:									\
+	if (buf != _buf)						\
+		kfree(buf);						\
	true;								\
 })
@@ -1104,6 +1118,8 @@ static void btree_node_read_work(struct work_struct *work)
 	struct btree *b		= rb->bio.bi_private;
 	struct bio *bio		= &rb->bio;
 	struct bch_io_failures failed = { .nr = 0 };
+	char buf[200];
+	struct printbuf out;
 	bool can_retry;

 	goto start;
@@ -1123,8 +1139,10 @@ static void btree_node_read_work(struct work_struct *work)
 			bio->bi_status = BLK_STS_REMOVED;
 	}
 start:
-	bch2_dev_io_err_on(bio->bi_status, ca, "btree read: %s",
-			   bch2_blk_status_to_str(bio->bi_status));
+	out = PBUF(buf);
+	btree_pos_to_text(&out, c, b);
+	bch2_dev_io_err_on(bio->bi_status, ca, "btree read error %s for %s",
+			   bch2_blk_status_to_str(bio->bi_status), buf);
 	if (rb->have_ioref)
 		percpu_ref_put(&ca->io_ref);
 	rb->have_ioref = false;
@@ -1408,7 +1426,7 @@ static void btree_node_write_endio(struct bio *bio)
 	if (wbio->have_ioref)
 		bch2_latency_acct(ca, wbio->submit_time, WRITE);

-	if (bch2_dev_io_err_on(bio->bi_status, ca, "btree write: %s",
+	if (bch2_dev_io_err_on(bio->bi_status, ca, "btree write error: %s",
			       bch2_blk_status_to_str(bio->bi_status)) ||
	    bch2_meta_write_fault("btree")) {
		spin_lock_irqsave(&c->btree_write_error_lock, flags);
@@ -1442,8 +1460,10 @@ static int validate_bset_for_write(struct bch_fs *c, struct btree *b,
 	ret = validate_bset(c, b, i, sectors, WRITE, false) ?:
 		validate_bset_keys(c, b, i, &whiteout_u64s, WRITE, false);
-	if (ret)
+	if (ret) {
 		bch2_inconsistent_error(c);
+		dump_stack();
+	}

 	return ret;
 }
@@ -1486,6 +1506,9 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b,
 		if (!btree_node_may_write(b))
 			return;

+		if (old & (1 << BTREE_NODE_never_write))
+			return;
+
 		if (old & (1 << BTREE_NODE_write_in_flight)) {
 			btree_node_wait_on_io(b);
 			continue;
 		}
@@ -1498,6 +1521,8 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b,
 		new ^=  (1 << BTREE_NODE_write_idx);
 	} while (cmpxchg_acquire(&b->flags, old, new) != old);

+	atomic_dec(&c->btree_cache.dirty);
+
 	BUG_ON(btree_node_fake(b));
 	BUG_ON((b->will_make_reachable != 0) != !b->written);
@@ -1530,6 +1555,11 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b,
 		seq = max(seq, le64_to_cpu(i->journal_seq));
 	}

+	BUG_ON(b->written && !seq);
+
+	/* bch2_varint_decode may read up to 7 bytes past the end of the buffer: */
+	bytes += 8;
+
 	data = btree_bounce_alloc(c, bytes, &used_mempool);

 	if
(!b->written) { diff --git a/fs/bcachefs/btree_io.h b/fs/bcachefs/btree_io.h index b859a067c78b..1a4b11e99cc4 100644 --- a/fs/bcachefs/btree_io.h +++ b/fs/bcachefs/btree_io.h @@ -14,6 +14,23 @@ struct btree_write; struct btree; struct btree_iter; +static inline bool btree_node_dirty(struct btree *b) +{ + return test_bit(BTREE_NODE_dirty, &b->flags); +} + +static inline void set_btree_node_dirty(struct bch_fs *c, struct btree *b) +{ + if (!test_and_set_bit(BTREE_NODE_dirty, &b->flags)) + atomic_inc(&c->btree_cache.dirty); +} + +static inline void clear_btree_node_dirty(struct bch_fs *c, struct btree *b) +{ + if (test_and_clear_bit(BTREE_NODE_dirty, &b->flags)) + atomic_dec(&c->btree_cache.dirty); +} + struct btree_read_bio { struct bch_fs *c; u64 start_time; @@ -104,7 +121,7 @@ static inline void bset_encrypt(struct bch_fs *c, struct bset *i, unsigned offse bch2_encrypt(c, BSET_CSUM_TYPE(i), nonce, &bn->flags, bytes); - nonce = nonce_add(nonce, round_up(bytes, CHACHA20_BLOCK_SIZE)); + nonce = nonce_add(nonce, round_up(bytes, CHACHA_BLOCK_SIZE)); } bch2_encrypt(c, BSET_CSUM_TYPE(i), nonce, i->_data, diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 58f1a3dd97d3..21253be5aab6 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -346,7 +346,7 @@ void bch2_btree_trans_verify_locks(struct btree_trans *trans) { struct btree_iter *iter; - trans_for_each_iter_all(trans, iter) + trans_for_each_iter(trans, iter) bch2_btree_iter_verify_locks(iter); } #else @@ -875,9 +875,19 @@ static void btree_iter_verify_new_node(struct btree_iter *iter, struct btree *b) char buf[100]; struct bkey uk = bkey_unpack_key(b, k); + bch2_dump_btree_node(iter->trans->c, l->b); bch2_bkey_to_text(&PBUF(buf), &uk); - panic("parent iter doesn't point to new node:\n%s\n%llu:%llu\n", - buf, b->key.k.p.inode, b->key.k.p.offset); + panic("parent iter doesn't point to new node:\n" + "iter pos %s %llu:%llu\n" + "iter key %s\n" + "new node %llu:%llu-%llu:%llu\n", + bch2_btree_ids[iter->btree_id], + iter->pos.inode, + iter->pos.offset, + buf, + b->data->min_key.inode, + b->data->min_key.offset, + b->key.k.p.inode, b->key.k.p.offset); } if (!parent_locked) @@ -2002,110 +2012,46 @@ int bch2_trans_iter_free(struct btree_trans *trans, return bch2_trans_iter_put(trans, iter); } -#if 0 -static int bch2_trans_realloc_iters(struct btree_trans *trans, - unsigned new_size) +noinline __cold +static void btree_trans_iter_alloc_fail(struct btree_trans *trans) { - void *p, *new_iters, *new_updates, *new_updates2; - size_t iters_bytes; - size_t updates_bytes; - - new_size = roundup_pow_of_two(new_size); - - BUG_ON(new_size > BTREE_ITER_MAX); - if (new_size <= trans->size) - return 0; - - BUG_ON(trans->used_mempool); - - bch2_trans_unlock(trans); - - iters_bytes = sizeof(struct btree_iter) * new_size; - updates_bytes = sizeof(struct btree_insert_entry) * new_size; - - p = kmalloc(iters_bytes + - updates_bytes + - updates_bytes, GFP_NOFS); - if (p) - goto success; - - p = mempool_alloc(&trans->c->btree_iters_pool, GFP_NOFS); - new_size = BTREE_ITER_MAX; + struct btree_iter *iter; + struct btree_insert_entry *i; - trans->used_mempool = true; -success: - new_iters = p; p += iters_bytes; - new_updates = p; p += updates_bytes; - new_updates2 = p; p += updates_bytes; - - memcpy(new_iters, trans->iters, - sizeof(struct btree_iter) * trans->nr_iters); - memcpy(new_updates, trans->updates, - sizeof(struct btree_insert_entry) * trans->nr_updates); - memcpy(new_updates2, trans->updates2, - sizeof(struct 
btree_insert_entry) * trans->nr_updates2);
-
-	if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG))
-		memset(trans->iters, POISON_FREE,
-		       sizeof(struct btree_iter) * trans->nr_iters +
-		       sizeof(struct btree_insert_entry) * trans->nr_iters);
-
-	kfree(trans->iters);
-
-	trans->iters = new_iters;
-	trans->updates = new_updates;
-	trans->updates2 = new_updates2;
-	trans->size = new_size;
-
-	if (trans->iters_live) {
-		trace_trans_restart_iters_realloced(trans->ip, trans->size);
-		return -EINTR;
+	trans_for_each_iter(trans, iter)
+		printk(KERN_ERR "iter: btree %s pos %llu:%llu%s%s%s %ps\n",
+		       bch2_btree_ids[iter->btree_id],
+		       iter->pos.inode,
+		       iter->pos.offset,
+		       (trans->iters_live & (1ULL << iter->idx)) ? " live" : "",
+		       (trans->iters_touched & (1ULL << iter->idx)) ? " touched" : "",
+		       iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT ? " keep" : "",
+		       (void *) iter->ip_allocated);
+
+	trans_for_each_update(trans, i) {
+		char buf[300];
+
+		bch2_bkey_val_to_text(&PBUF(buf), trans->c, bkey_i_to_s_c(i->k));
+		printk(KERN_ERR "update: btree %s %s\n",
+		       bch2_btree_ids[i->iter->btree_id], buf);
 	}
-
-	return 0;
+	panic("trans iter overflow\n");
 }
-#endif

 static struct btree_iter *btree_trans_iter_alloc(struct btree_trans *trans)
 {
-	unsigned idx = __ffs64(~trans->iters_linked);
-
-	if (idx < trans->nr_iters)
-		goto got_slot;
+	unsigned idx;

-	if (trans->nr_iters == trans->size) {
-		struct btree_iter *iter;
+	if (unlikely(trans->iters_linked ==
+		     ~((~0ULL << 1) << (BTREE_ITER_MAX - 1))))
+		btree_trans_iter_alloc_fail(trans);

-		BUG_ON(trans->size < BTREE_ITER_MAX);
+	idx = __ffs64(~trans->iters_linked);

-		trans_for_each_iter(trans, iter) {
-			pr_err("iter: btree %s pos %llu:%llu%s%s%s %ps",
-			       bch2_btree_ids[iter->btree_id],
-			       iter->pos.inode,
-			       iter->pos.offset,
-			       (trans->iters_live & (1ULL << iter->idx)) ? " live" : "",
-			       (trans->iters_touched & (1ULL << iter->idx)) ? " touched" : "",
-			       iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT ?
" keep" : "", - (void *) iter->ip_allocated); - } - - panic("trans iter oveflow\n"); -#if 0 - ret = bch2_trans_realloc_iters(trans, trans->size * 2); - if (ret) - return ERR_PTR(ret); -#endif - } - - idx = trans->nr_iters++; - BUG_ON(trans->nr_iters > trans->size); - - trans->iters[idx].idx = idx; -got_slot: - BUG_ON(trans->iters_linked & (1ULL << idx)); - trans->iters_linked |= 1ULL << idx; - trans->iters[idx].flags = 0; + trans->iters_linked |= 1ULL << idx; + trans->iters[idx].idx = idx; + trans->iters[idx].flags = 0; return &trans->iters[idx]; } @@ -2141,8 +2087,6 @@ static struct btree_iter *__btree_trans_get_iter(struct btree_trans *trans, { struct btree_iter *iter, *best = NULL; - BUG_ON(trans->nr_iters > BTREE_ITER_MAX); - trans_for_each_iter(trans, iter) { if (btree_iter_type(iter) != (flags & BTREE_ITER_TYPE)) continue; @@ -2160,16 +2104,10 @@ static struct btree_iter *__btree_trans_get_iter(struct btree_trans *trans, if (!best) { iter = btree_trans_iter_alloc(trans); - if (IS_ERR(iter)) - return iter; - bch2_btree_iter_init(trans, iter, btree_id, pos, flags); } else if ((trans->iters_live & (1ULL << best->idx)) || (best->flags & BTREE_ITER_KEEP_UNTIL_COMMIT)) { iter = btree_trans_iter_alloc(trans); - if (IS_ERR(iter)) - return iter; - btree_iter_copy(iter, best); } else { iter = best; @@ -2203,9 +2141,8 @@ struct btree_iter *__bch2_trans_get_iter(struct btree_trans *trans, struct btree_iter *iter = __btree_trans_get_iter(trans, btree_id, pos, flags); - if (!IS_ERR(iter)) - __bch2_btree_iter_set_pos(iter, pos, - btree_node_type_is_extents(btree_id)); + __bch2_btree_iter_set_pos(iter, pos, + btree_node_type_is_extents(btree_id)); return iter; } @@ -2221,7 +2158,6 @@ struct btree_iter *bch2_trans_get_node_iter(struct btree_trans *trans, flags|BTREE_ITER_NODES); unsigned i; - BUG_ON(IS_ERR(iter)); BUG_ON(bkey_cmp(iter->pos, pos)); iter->locks_want = locks_want; @@ -2241,9 +2177,6 @@ struct btree_iter *__bch2_trans_copy_iter(struct btree_trans *trans, struct btree_iter *iter; iter = btree_trans_iter_alloc(trans); - if (IS_ERR(iter)) - return iter; - btree_iter_copy(iter, src); trans->iters_live |= 1ULL << iter->idx; @@ -2318,7 +2251,6 @@ void bch2_trans_reset(struct btree_trans *trans, unsigned flags) trans->iters_touched &= trans->iters_live; - trans->need_reset = 0; trans->nr_updates = 0; trans->nr_updates2 = 0; trans->mem_top = 0; @@ -2339,20 +2271,21 @@ void bch2_trans_reset(struct btree_trans *trans, unsigned flags) static void bch2_trans_alloc_iters(struct btree_trans *trans, struct bch_fs *c) { - unsigned new_size = BTREE_ITER_MAX; - size_t iters_bytes = sizeof(struct btree_iter) * new_size; - size_t updates_bytes = sizeof(struct btree_insert_entry) * new_size; - void *p; + size_t iters_bytes = sizeof(struct btree_iter) * BTREE_ITER_MAX; + size_t updates_bytes = sizeof(struct btree_insert_entry) * BTREE_ITER_MAX; + void *p = NULL; BUG_ON(trans->used_mempool); - p = this_cpu_xchg(c->btree_iters_bufs->iter, NULL) ?: - mempool_alloc(&trans->c->btree_iters_pool, GFP_NOFS); +#ifdef __KERNEL__ + p = this_cpu_xchg(c->btree_iters_bufs->iter, NULL); +#endif + if (!p) + p = mempool_alloc(&trans->c->btree_iters_pool, GFP_NOFS); trans->iters = p; p += iters_bytes; trans->updates = p; p += updates_bytes; trans->updates2 = p; p += updates_bytes; - trans->size = new_size; } void bch2_trans_init(struct btree_trans *trans, struct bch_fs *c, @@ -2369,8 +2302,12 @@ void bch2_trans_init(struct btree_trans *trans, struct bch_fs *c, */ bch2_trans_alloc_iters(trans, c); - if (expected_mem_bytes) - 
bch2_trans_preload_mem(trans, expected_mem_bytes); + if (expected_mem_bytes) { + trans->mem_bytes = roundup_pow_of_two(expected_mem_bytes); + trans->mem = kmalloc(trans->mem_bytes, GFP_KERNEL|__GFP_NOFAIL); + } + + trans->srcu_idx = srcu_read_lock(&c->btree_trans_barrier); #ifdef CONFIG_BCACHEFS_DEBUG trans->pid = current->pid; @@ -2392,12 +2329,19 @@ int bch2_trans_exit(struct btree_trans *trans) mutex_unlock(&trans->c->btree_trans_lock); #endif + srcu_read_unlock(&c->btree_trans_barrier, trans->srcu_idx); + bch2_journal_preres_put(&trans->c->journal, &trans->journal_preres); kfree(trans->fs_usage_deltas); kfree(trans->mem); +#ifdef __KERNEL__ + /* + * Userspace doesn't have a real percpu implementation: + */ trans->iters = this_cpu_xchg(c->btree_iters_bufs->iter, trans->iters); +#endif if (trans->iters) mempool_free(trans->iters, &trans->c->btree_iters_pool); @@ -2474,6 +2418,7 @@ void bch2_btree_trans_to_text(struct printbuf *out, struct bch_fs *c) void bch2_fs_btree_iter_exit(struct bch_fs *c) { mempool_exit(&c->btree_iters_pool); + cleanup_srcu_struct(&c->btree_trans_barrier); } int bch2_fs_btree_iter_init(struct bch_fs *c) @@ -2483,7 +2428,8 @@ int bch2_fs_btree_iter_init(struct bch_fs *c) INIT_LIST_HEAD(&c->btree_trans_list); mutex_init(&c->btree_trans_lock); - return mempool_init_kmalloc_pool(&c->btree_iters_pool, 1, + return init_srcu_struct(&c->btree_trans_barrier) ?: + mempool_init_kmalloc_pool(&c->btree_iters_pool, 1, sizeof(struct btree_iter) * nr + sizeof(struct btree_insert_entry) * nr + sizeof(struct btree_insert_entry) * nr); diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index f7a73619c85b..9a7f8d0197ec 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -48,21 +48,22 @@ static inline int btree_iter_err(const struct btree_iter *iter) /* Iterate over iters within a transaction: */ -#define trans_for_each_iter_all(_trans, _iter) \ - for (_iter = (_trans)->iters; \ - _iter < (_trans)->iters + (_trans)->nr_iters; \ - _iter++) - static inline struct btree_iter * __trans_next_iter(struct btree_trans *trans, unsigned idx) { - EBUG_ON(idx < trans->nr_iters && trans->iters[idx].idx != idx); + u64 l; + + if (idx == BTREE_ITER_MAX) + return NULL; - for (; idx < trans->nr_iters; idx++) - if (trans->iters_linked & (1ULL << idx)) - return &trans->iters[idx]; + l = trans->iters_linked >> idx; + if (!l) + return NULL; - return NULL; + idx += __ffs64(l); + EBUG_ON(idx >= BTREE_ITER_MAX); + EBUG_ON(trans->iters[idx].idx != idx); + return &trans->iters[idx]; } #define trans_for_each_iter(_trans, _iter) \ @@ -240,10 +241,9 @@ static inline int bkey_err(struct bkey_s_c k) #define for_each_btree_key(_trans, _iter, _btree_id, \ _start, _flags, _k, _ret) \ - for ((_ret) = PTR_ERR_OR_ZERO((_iter) = \ - bch2_trans_get_iter((_trans), (_btree_id), \ - (_start), (_flags))) ?: \ - PTR_ERR_OR_ZERO(((_k) = \ + for ((_iter) = bch2_trans_get_iter((_trans), (_btree_id), \ + (_start), (_flags)), \ + (_ret) = PTR_ERR_OR_ZERO(((_k) = \ __bch2_btree_iter_peek(_iter, _flags)).k); \ !_ret && (_k).k; \ (_ret) = PTR_ERR_OR_ZERO(((_k) = \ @@ -270,9 +270,7 @@ bch2_trans_get_iter(struct btree_trans *trans, enum btree_id btree_id, { struct btree_iter *iter = __bch2_trans_get_iter(trans, btree_id, pos, flags); - - if (!IS_ERR(iter)) - iter->ip_allocated = _THIS_IP_; + iter->ip_allocated = _THIS_IP_; return iter; } @@ -284,10 +282,8 @@ bch2_trans_copy_iter(struct btree_trans *trans, struct btree_iter *src) struct btree_iter *iter = __bch2_trans_copy_iter(trans, src); - if 
(!IS_ERR(iter)) - iter->ip_allocated = _THIS_IP_; + iter->ip_allocated = _THIS_IP_; return iter; - } struct btree_iter *bch2_trans_get_node_iter(struct btree_trans *, diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c index 0ee4f78ce67a..244c5dbcd3e9 100644 --- a/fs/bcachefs/btree_key_cache.c +++ b/fs/bcachefs/btree_key_cache.c @@ -9,8 +9,11 @@ #include "journal.h" #include "journal_reclaim.h" +#include <linux/sched/mm.h> #include <trace/events/bcachefs.h> +static struct kmem_cache *bch2_key_cache; + static int bch2_btree_key_cache_cmp_fn(struct rhashtable_compare_arg *arg, const void *obj) { @@ -66,12 +69,22 @@ static void bkey_cached_evict(struct btree_key_cache *c, BUG_ON(rhashtable_remove_fast(&c->table, &ck->hash, bch2_btree_key_cache_params)); memset(&ck->key, ~0, sizeof(ck->key)); + + c->nr_keys--; } -static void bkey_cached_free(struct btree_key_cache *c, +static void bkey_cached_free(struct btree_key_cache *bc, struct bkey_cached *ck) { - list_move(&ck->list, &c->freed); + struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache); + + BUG_ON(test_bit(BKEY_CACHED_DIRTY, &ck->flags)); + + ck->btree_trans_barrier_seq = + start_poll_synchronize_srcu(&c->btree_trans_barrier); + + list_move_tail(&ck->list, &bc->freed); + bc->nr_freed++; kfree(ck->k); ck->k = NULL; @@ -86,9 +99,20 @@ bkey_cached_alloc(struct btree_key_cache *c) { struct bkey_cached *ck; - list_for_each_entry(ck, &c->freed, list) - if (bkey_cached_lock_for_evict(ck)) + list_for_each_entry_reverse(ck, &c->freed, list) + if (bkey_cached_lock_for_evict(ck)) { + c->nr_freed--; return ck; + } + + ck = kmem_cache_alloc(bch2_key_cache, GFP_NOFS|__GFP_ZERO); + if (likely(ck)) { + INIT_LIST_HEAD(&ck->list); + six_lock_init(&ck->c.lock); + BUG_ON(!six_trylock_intent(&ck->c.lock)); + BUG_ON(!six_trylock_write(&ck->c.lock)); + return ck; + } list_for_each_entry(ck, &c->clean, list) if (bkey_cached_lock_for_evict(ck)) { @@ -96,16 +120,7 @@ bkey_cached_alloc(struct btree_key_cache *c) return ck; } - ck = kzalloc(sizeof(*ck), GFP_NOFS); - if (!ck) - return NULL; - - INIT_LIST_HEAD(&ck->list); - six_lock_init(&ck->c.lock); - BUG_ON(!six_trylock_intent(&ck->c.lock)); - BUG_ON(!six_trylock_write(&ck->c.lock)); - - return ck; + return NULL; } static struct bkey_cached * @@ -124,8 +139,7 @@ btree_key_cache_create(struct btree_key_cache *c, ck->key.btree_id = btree_id; ck->key.pos = pos; ck->valid = false; - - BUG_ON(ck->flags); + ck->flags = 1U << BKEY_CACHED_ACCESSED; if (rhashtable_lookup_insert_fast(&c->table, &ck->hash, @@ -135,6 +149,8 @@ btree_key_cache_create(struct btree_key_cache *c, return NULL; } + c->nr_keys++; + list_move(&ck->list, &c->clean); six_unlock_write(&ck->c.lock); @@ -153,9 +169,6 @@ static int btree_key_cache_fill(struct btree_trans *trans, iter = bch2_trans_get_iter(trans, ck->key.btree_id, ck->key.pos, BTREE_ITER_SLOTS); - if (IS_ERR(iter)) - return PTR_ERR(iter); - k = bch2_btree_iter_peek_slot(iter); ret = bkey_err(k); if (ret) { @@ -280,6 +293,9 @@ fill: goto err; } + if (!test_bit(BKEY_CACHED_ACCESSED, &ck->flags)) + set_bit(BKEY_CACHED_ACCESSED, &ck->flags); + iter->uptodate = BTREE_ITER_NEED_PEEK; bch2_btree_iter_downgrade(iter); return ret; @@ -300,24 +316,17 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans, struct bch_fs *c = trans->c; struct journal *j = &c->journal; struct btree_iter *c_iter = NULL, *b_iter = NULL; - struct bkey_cached *ck; + struct bkey_cached *ck = NULL; int ret; b_iter = bch2_trans_get_iter(trans, key.btree_id, key.pos, 
BTREE_ITER_SLOTS| BTREE_ITER_INTENT); - ret = PTR_ERR_OR_ZERO(b_iter); - if (ret) - goto out; - c_iter = bch2_trans_get_iter(trans, key.btree_id, key.pos, BTREE_ITER_CACHED| BTREE_ITER_CACHED_NOFILL| BTREE_ITER_CACHED_NOCREATE| BTREE_ITER_INTENT); - ret = PTR_ERR_OR_ZERO(c_iter); - if (ret) - goto out; retry: ret = bch2_btree_iter_traverse(c_iter); if (ret) @@ -348,17 +357,22 @@ err: if (ret == -EINTR) goto retry; - BUG_ON(ret && !bch2_journal_error(j)); - - if (ret) + if (ret) { + bch2_fs_fatal_err_on(!bch2_journal_error(j), c, + "error flushing key cache: %i", ret); goto out; + } bch2_journal_pin_drop(j, &ck->journal); bch2_journal_preres_put(j, &ck->res); - clear_bit(BKEY_CACHED_DIRTY, &ck->flags); if (!evict) { mutex_lock(&c->btree_key_cache.lock); + if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { + clear_bit(BKEY_CACHED_DIRTY, &ck->flags); + c->btree_key_cache.nr_dirty--; + } + list_move_tail(&ck->list, &c->btree_key_cache.clean); mutex_unlock(&c->btree_key_cache.lock); } else { @@ -371,6 +385,11 @@ evict: six_lock_write(&ck->c.lock, NULL, NULL); mutex_lock(&c->btree_key_cache.lock); + if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { + clear_bit(BKEY_CACHED_DIRTY, &ck->flags); + c->btree_key_cache.nr_dirty--; + } + bkey_cached_evict(&c->btree_key_cache, ck); bkey_cached_free(&c->btree_key_cache, ck); mutex_unlock(&c->btree_key_cache.lock); @@ -391,19 +410,23 @@ static void btree_key_cache_journal_flush(struct journal *j, struct bkey_cached_key key; struct btree_trans trans; + int srcu_idx = srcu_read_lock(&c->btree_trans_barrier); + six_lock_read(&ck->c.lock, NULL, NULL); key = ck->key; if (ck->journal.seq != seq || !test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { six_unlock_read(&ck->c.lock); - return; + goto unlock; } six_unlock_read(&ck->c.lock); bch2_trans_init(&trans, c, 0, 0); btree_key_cache_flush_pos(&trans, key, seq, false); bch2_trans_exit(&trans); +unlock: + srcu_read_unlock(&c->btree_trans_barrier, srcu_idx); } /* @@ -428,6 +451,7 @@ bool bch2_btree_insert_key_cached(struct btree_trans *trans, { struct bch_fs *c = trans->c; struct bkey_cached *ck = (void *) iter->l[0].b; + bool kick_reclaim = false; BUG_ON(insert->u64s > ck->u64s); @@ -448,14 +472,22 @@ bool bch2_btree_insert_key_cached(struct btree_trans *trans, if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { mutex_lock(&c->btree_key_cache.lock); - list_del_init(&ck->list); + list_move(&ck->list, &c->btree_key_cache.dirty); set_bit(BKEY_CACHED_DIRTY, &ck->flags); + c->btree_key_cache.nr_dirty++; + + if (bch2_nr_btree_keys_need_flush(c)) + kick_reclaim = true; + mutex_unlock(&c->btree_key_cache.lock); } bch2_journal_pin_update(&c->journal, trans->journal_res.seq, &ck->journal, btree_key_cache_journal_flush); + + if (kick_reclaim) + journal_reclaim_kick(&c->journal); return true; } @@ -467,20 +499,107 @@ void bch2_btree_key_cache_verify_clean(struct btree_trans *trans, } #endif -void bch2_fs_btree_key_cache_exit(struct btree_key_cache *c) +static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink, + struct shrink_control *sc) +{ + struct bch_fs *c = container_of(shrink, struct bch_fs, + btree_key_cache.shrink); + struct btree_key_cache *bc = &c->btree_key_cache; + struct bkey_cached *ck, *t; + size_t scanned = 0, freed = 0, nr = sc->nr_to_scan; + unsigned flags; + + /* Return -1 if we can't do anything right now */ + if (sc->gfp_mask & __GFP_FS) + mutex_lock(&bc->lock); + else if (!mutex_trylock(&bc->lock)) + return -1; + + flags = memalloc_nofs_save(); + + /* + * Newest freed entries are at the end of the list - once 
we hit one + * that's too new to be freed, we can bail out: + */ + list_for_each_entry_safe(ck, t, &bc->freed, list) { + if (!poll_state_synchronize_srcu(&c->btree_trans_barrier, + ck->btree_trans_barrier_seq)) + break; + + list_del(&ck->list); + kmem_cache_free(bch2_key_cache, ck); + bc->nr_freed--; + scanned++; + freed++; + } + + if (scanned >= nr) + goto out; + + list_for_each_entry_safe(ck, t, &bc->clean, list) { + if (test_bit(BKEY_CACHED_ACCESSED, &ck->flags)) + clear_bit(BKEY_CACHED_ACCESSED, &ck->flags); + else if (bkey_cached_lock_for_evict(ck)) { + bkey_cached_evict(bc, ck); + bkey_cached_free(bc, ck); + } + + scanned++; + if (scanned >= nr) { + if (&t->list != &bc->clean) + list_move_tail(&bc->clean, &t->list); + goto out; + } + } +out: + memalloc_nofs_restore(flags); + mutex_unlock(&bc->lock); + + return freed; +} + +static unsigned long bch2_btree_key_cache_count(struct shrinker *shrink, + struct shrink_control *sc) +{ + struct bch_fs *c = container_of(shrink, struct bch_fs, + btree_key_cache.shrink); + struct btree_key_cache *bc = &c->btree_key_cache; + + return bc->nr_keys; +} + +void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc) { + struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache); struct bkey_cached *ck, *n; - mutex_lock(&c->lock); - list_for_each_entry_safe(ck, n, &c->clean, list) { + if (bc->shrink.list.next) + unregister_shrinker(&bc->shrink); + + mutex_lock(&bc->lock); + list_splice(&bc->dirty, &bc->clean); + + list_for_each_entry_safe(ck, n, &bc->clean, list) { + bch2_journal_pin_drop(&c->journal, &ck->journal); + bch2_journal_preres_put(&c->journal, &ck->res); + kfree(ck->k); - kfree(ck); + list_del(&ck->list); + kmem_cache_free(bch2_key_cache, ck); + bc->nr_keys--; } - list_for_each_entry_safe(ck, n, &c->freed, list) - kfree(ck); - mutex_unlock(&c->lock); - rhashtable_destroy(&c->table); + BUG_ON(bc->nr_dirty && !bch2_journal_error(&c->journal)); + BUG_ON(bc->nr_keys); + + list_for_each_entry_safe(ck, n, &bc->freed, list) { + list_del(&ck->list); + kmem_cache_free(bch2_key_cache, ck); + } + mutex_unlock(&bc->lock); + + if (bc->table_init_done) + rhashtable_destroy(&bc->table); } void bch2_fs_btree_key_cache_init_early(struct btree_key_cache *c) @@ -488,33 +607,47 @@ void bch2_fs_btree_key_cache_init_early(struct btree_key_cache *c) mutex_init(&c->lock); INIT_LIST_HEAD(&c->freed); INIT_LIST_HEAD(&c->clean); + INIT_LIST_HEAD(&c->dirty); } int bch2_fs_btree_key_cache_init(struct btree_key_cache *c) { - return rhashtable_init(&c->table, &bch2_btree_key_cache_params); + int ret; + + c->shrink.seeks = 1; + c->shrink.count_objects = bch2_btree_key_cache_count; + c->shrink.scan_objects = bch2_btree_key_cache_scan; + + ret = register_shrinker(&c->shrink); + if (ret) + return ret; + + ret = rhashtable_init(&c->table, &bch2_btree_key_cache_params); + if (ret) + return ret; + + c->table_init_done = true; + return 0; } void bch2_btree_key_cache_to_text(struct printbuf *out, struct btree_key_cache *c) { - struct bucket_table *tbl; - struct bkey_cached *ck; - struct rhash_head *pos; - size_t i; + pr_buf(out, "nr_freed:\t%zu\n", c->nr_freed); + pr_buf(out, "nr_keys:\t%zu\n", c->nr_keys); + pr_buf(out, "nr_dirty:\t%zu\n", c->nr_dirty); +} - mutex_lock(&c->lock); - tbl = rht_dereference_rcu(c->table.tbl, &c->table); +void bch2_btree_key_cache_exit(void) +{ + if (bch2_key_cache) + kmem_cache_destroy(bch2_key_cache); +} - for (i = 0; i < tbl->size; i++) { - rht_for_each_entry_rcu(ck, pos, tbl, i, hash) { - pr_buf(out, "%s:", - 
bch2_btree_ids[ck->key.btree_id]); - bch2_bpos_to_text(out, ck->key.pos); +int __init bch2_btree_key_cache_init(void) +{ + bch2_key_cache = KMEM_CACHE(bkey_cached, 0); + if (!bch2_key_cache) + return -ENOMEM; - if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) - pr_buf(out, " journal seq %llu", ck->journal.seq); - pr_buf(out, "\n"); - } - } - mutex_unlock(&c->lock); + return 0; } diff --git a/fs/bcachefs/btree_key_cache.h b/fs/bcachefs/btree_key_cache.h index d448264abcc8..dad3e344dcf9 100644 --- a/fs/bcachefs/btree_key_cache.h +++ b/fs/bcachefs/btree_key_cache.h @@ -1,6 +1,24 @@ #ifndef _BCACHEFS_BTREE_KEY_CACHE_H #define _BCACHEFS_BTREE_KEY_CACHE_H +static inline size_t bch2_nr_btree_keys_need_flush(struct bch_fs *c) +{ + size_t nr_dirty = READ_ONCE(c->btree_key_cache.nr_dirty); + size_t nr_keys = READ_ONCE(c->btree_key_cache.nr_keys); + size_t max_dirty = 1024 + nr_keys / 2; + + return max_t(ssize_t, 0, nr_dirty - max_dirty); +} + +static inline bool bch2_btree_key_cache_must_wait(struct bch_fs *c) +{ + size_t nr_dirty = READ_ONCE(c->btree_key_cache.nr_dirty); + size_t nr_keys = READ_ONCE(c->btree_key_cache.nr_keys); + size_t max_dirty = 4096 + (nr_keys * 3) / 4; + + return nr_dirty > max_dirty; +} + struct bkey_cached * bch2_btree_key_cache_find(struct bch_fs *, enum btree_id, struct bpos); @@ -25,4 +43,7 @@ int bch2_fs_btree_key_cache_init(struct btree_key_cache *); void bch2_btree_key_cache_to_text(struct printbuf *, struct btree_key_cache *); +void bch2_btree_key_cache_exit(void); +int __init bch2_btree_key_cache_init(void); + #endif /* _BCACHEFS_BTREE_KEY_CACHE_H */ diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index 93721fbc7794..dc7de27112c6 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -158,6 +158,7 @@ struct btree_cache { /* Number of elements in live + freeable lists */ unsigned used; unsigned reserve; + atomic_t dirty; struct shrinker shrink; /* @@ -292,8 +293,15 @@ static inline struct btree_iter_level *iter_l(struct btree_iter *iter) struct btree_key_cache { struct mutex lock; struct rhashtable table; + bool table_init_done; struct list_head freed; struct list_head clean; + struct list_head dirty; + struct shrinker shrink; + + size_t nr_freed; + size_t nr_keys; + size_t nr_dirty; }; struct bkey_cached_key { @@ -301,7 +309,8 @@ struct bkey_cached_key { struct bpos pos; } __attribute__((packed, aligned(4))); -#define BKEY_CACHED_DIRTY 0 +#define BKEY_CACHED_ACCESSED 0 +#define BKEY_CACHED_DIRTY 1 struct bkey_cached { struct btree_bkey_cached_common c; @@ -309,6 +318,7 @@ struct bkey_cached { unsigned long flags; u8 u64s; bool valid; + u32 btree_trans_barrier_seq; struct bkey_cached_key key; struct rhash_head hash; @@ -345,21 +355,19 @@ struct btree_trans { pid_t pid; #endif unsigned long ip; + int srcu_idx; - u64 iters_linked; - u64 iters_live; - u64 iters_touched; - - u8 nr_iters; u8 nr_updates; u8 nr_updates2; - u8 size; unsigned used_mempool:1; unsigned error:1; unsigned nounlock:1; - unsigned need_reset:1; unsigned in_traverse_all:1; + u64 iters_linked; + u64 iters_live; + u64 iters_touched; + unsigned mem_top; unsigned mem_bytes; void *mem; @@ -407,11 +415,11 @@ enum btree_flags { BTREE_NODE_fake, BTREE_NODE_old_extent_overwrite, BTREE_NODE_need_rewrite, + BTREE_NODE_never_write, }; BTREE_FLAG(read_in_flight); BTREE_FLAG(read_error); -BTREE_FLAG(dirty); BTREE_FLAG(need_write); BTREE_FLAG(noevict); BTREE_FLAG(write_idx); @@ -422,6 +430,7 @@ BTREE_FLAG(dying); BTREE_FLAG(fake); BTREE_FLAG(old_extent_overwrite); 
BTREE_FLAG(need_rewrite); +BTREE_FLAG(never_write); static inline struct btree_write *btree_current_write(struct btree *b) { @@ -640,6 +649,7 @@ enum btree_insert_ret { BTREE_INSERT_ENOSPC, BTREE_INSERT_NEED_MARK_REPLICAS, BTREE_INSERT_NEED_JOURNAL_RES, + BTREE_INSERT_NEED_JOURNAL_RECLAIM, }; enum btree_gc_coalesce_fail_reason { diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h index e0b1bde37484..adb07043cbb3 100644 --- a/fs/bcachefs/btree_update.h +++ b/fs/bcachefs/btree_update.h @@ -67,8 +67,8 @@ int __bch2_btree_insert(struct btree_trans *, enum btree_id, struct bkey_i *); int bch2_btree_insert(struct bch_fs *, enum btree_id, struct bkey_i *, struct disk_reservation *, u64 *, int flags); -int bch2_btree_delete_at_range(struct btree_trans *, struct btree_iter *, - struct bpos, u64 *); +int bch2_btree_delete_range_trans(struct btree_trans *, enum btree_id, + struct bpos, struct bpos, u64 *); int bch2_btree_delete_range(struct bch_fs *, enum btree_id, struct bpos, struct bpos, u64 *); diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 4ddd1697ffde..8f96756ba648 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -11,6 +11,7 @@ #include "btree_iter.h" #include "btree_locking.h" #include "buckets.h" +#include "error.h" #include "extents.h" #include "journal.h" #include "journal_reclaim.h" @@ -48,12 +49,27 @@ static void btree_node_interior_verify(struct bch_fs *c, struct btree *b) break; bp = bkey_s_c_to_btree_ptr_v2(k); - BUG_ON(bkey_cmp(next_node, bp.v->min_key)); + if (bkey_cmp(next_node, bp.v->min_key)) { + bch2_dump_btree_node(c, b); + panic("expected next min_key %llu:%llu got %llu:%llu\n", + next_node.inode, + next_node.offset, + bp.v->min_key.inode, + bp.v->min_key.offset); + } bch2_btree_node_iter_advance(&iter, b); if (bch2_btree_node_iter_end(&iter)) { - BUG_ON(bkey_cmp(k.k->p, b->key.k.p)); + + if (bkey_cmp(k.k->p, b->key.k.p)) { + bch2_dump_btree_node(c, b); + panic("expected end %llu:%llu got %llu:%llu\n", + b->key.k.p.inode, + b->key.k.p.offset, + k.k->p.inode, + k.k->p.offset); + } break; } @@ -149,7 +165,7 @@ void bch2_btree_node_free_never_inserted(struct bch_fs *c, struct btree *b) b->ob.nr = 0; - clear_btree_node_dirty(b); + clear_btree_node_dirty(c, b); btree_node_lock_type(c, b, SIX_LOCK_write); __btree_node_free(c, b); @@ -264,7 +280,7 @@ static struct btree *bch2_btree_node_alloc(struct btree_update *as, unsigned lev b = as->prealloc_nodes[--as->nr_prealloc_nodes]; set_btree_node_accessed(b); - set_btree_node_dirty(b); + set_btree_node_dirty(c, b); set_btree_node_need_write(b); bch2_bset_init_first(b, &b->data->keys); @@ -503,14 +519,18 @@ static int btree_update_nodes_written_trans(struct btree_trans *trans, trans->journal_pin = &as->journal; for_each_keylist_key(&as->new_keys, k) { - ret = bch2_trans_mark_key(trans, bkey_i_to_s_c(k), + ret = bch2_trans_mark_key(trans, + bkey_s_c_null, + bkey_i_to_s_c(k), 0, 0, BTREE_TRIGGER_INSERT); if (ret) return ret; } for_each_keylist_key(&as->old_keys, k) { - ret = bch2_trans_mark_key(trans, bkey_i_to_s_c(k), + ret = bch2_trans_mark_key(trans, + bkey_i_to_s_c(k), + bkey_s_c_null, 0, 0, BTREE_TRIGGER_OVERWRITE); if (ret) return ret; @@ -523,11 +543,25 @@ static void btree_update_nodes_written(struct btree_update *as) { struct bch_fs *c = as->c; struct btree *b = as->b; + struct btree_trans trans; u64 journal_seq = 0; unsigned i; int ret; /* + * If we're already in an error state, it might be because a btree node + * was never 
written, and we might be trying to free that same btree + * node here, but it won't have been marked as allocated and we'll see + * spurious disk usage inconsistencies in the transactional part below + * if we don't skip it: + */ + ret = bch2_journal_error(&c->journal); + if (ret) + goto err; + + BUG_ON(!journal_pin_active(&as->journal)); + + /* * We did an update to a parent node where the pointers we added pointed * to child nodes that weren't written yet: now, the child nodes have * been written so we can write out the update to the interior node. @@ -540,16 +574,20 @@ static void btree_update_nodes_written(struct btree_update *as) * journal reclaim does btree updates when flushing bkey_cached entries, * which may require allocations as well. */ - ret = bch2_trans_do(c, &as->disk_res, &journal_seq, - BTREE_INSERT_NOFAIL| - BTREE_INSERT_USE_RESERVE| - BTREE_INSERT_USE_ALLOC_RESERVE| - BTREE_INSERT_NOCHECK_RW| - BTREE_INSERT_JOURNAL_RECLAIM| - BTREE_INSERT_JOURNAL_RESERVED, - btree_update_nodes_written_trans(&trans, as)); - BUG_ON(ret && !bch2_journal_error(&c->journal)); - + bch2_trans_init(&trans, c, 0, 512); + ret = __bch2_trans_do(&trans, &as->disk_res, &journal_seq, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_USE_RESERVE| + BTREE_INSERT_USE_ALLOC_RESERVE| + BTREE_INSERT_NOCHECK_RW| + BTREE_INSERT_JOURNAL_RECLAIM| + BTREE_INSERT_JOURNAL_RESERVED, + btree_update_nodes_written_trans(&trans, as)); + bch2_trans_exit(&trans); + + bch2_fs_fatal_err_on(ret && !bch2_journal_error(&c->journal), c, + "error %i in btree_update_nodes_written()", ret); +err: if (b) { /* * @b is the node we did the final insert into: @@ -569,17 +607,30 @@ static void btree_update_nodes_written(struct btree_update *as) list_del(&as->write_blocked_list); - if (!ret && as->b == b) { + /* + * Node might have been freed, recheck under + * btree_interior_update_lock: + */ + if (as->b == b) { struct bset *i = btree_bset_last(b); BUG_ON(!b->c.level); BUG_ON(!btree_node_dirty(b)); - i->journal_seq = cpu_to_le64( - max(journal_seq, - le64_to_cpu(i->journal_seq))); - - bch2_btree_add_journal_pin(c, b, journal_seq); + if (!ret) { + i->journal_seq = cpu_to_le64( + max(journal_seq, + le64_to_cpu(i->journal_seq))); + + bch2_btree_add_journal_pin(c, b, journal_seq); + } else { + /* + * If we didn't get a journal sequence number we + * can't write this btree node, because recovery + * won't know to ignore this write: + */ + set_btree_node_never_write(b); + } } mutex_unlock(&c->btree_interior_update_lock); @@ -680,17 +731,7 @@ static void btree_update_reparent(struct btree_update *as, child->b = NULL; child->mode = BTREE_INTERIOR_UPDATING_AS; - /* - * When we write a new btree root, we have to drop our journal pin - * _before_ the new nodes are technically reachable; see - * btree_update_nodes_written(). - * - * This goes for journal pins that are recursively blocked on us - so, - * just transfer the journal pin to the new interior update so - * btree_update_nodes_written() can drop it. 
- */ bch2_journal_pin_copy(&c->journal, &as->journal, &child->journal, NULL); - bch2_journal_pin_drop(&c->journal, &child->journal); } static void btree_update_updated_root(struct btree_update *as, struct btree *b) @@ -827,7 +868,7 @@ void bch2_btree_interior_update_will_free_node(struct btree_update *as, closure_wake_up(&c->btree_interior_update_wait); } - clear_btree_node_dirty(b); + clear_btree_node_dirty(c, b); clear_btree_node_need_write(b); /* @@ -937,6 +978,10 @@ bch2_btree_update_start(struct btree_trans *trans, enum btree_id id, if (ret) goto err; + bch2_journal_pin_add(&c->journal, + atomic64_read(&c->journal.seq), + &as->journal, NULL); + mutex_lock(&c->btree_interior_update_lock); list_add_tail(&as->list, &c->btree_interior_update_list); mutex_unlock(&c->btree_interior_update_lock); @@ -1018,7 +1063,19 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as, struct btree *b struct bkey_i *insert, struct btree_node_iter *node_iter) { + struct bch_fs *c = as->c; struct bkey_packed *k; + const char *invalid; + + invalid = bch2_bkey_invalid(c, bkey_i_to_s_c(insert), btree_node_type(b)) ?: + bch2_bkey_in_btree_node(b, bkey_i_to_s_c(insert)); + if (invalid) { + char buf[160]; + + bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(insert)); + bch2_fs_inconsistent(c, "inserting invalid bkey %s: %s", buf, invalid); + dump_stack(); + } BUG_ON(as->journal_u64s + jset_u64s(insert->k.u64s) > ARRAY_SIZE(as->journal_entries)); @@ -1034,7 +1091,7 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as, struct btree *b bch2_btree_node_iter_advance(node_iter, b); bch2_btree_bset_insert_key(iter, b, node_iter, insert); - set_btree_node_dirty(b); + set_btree_node_dirty(c, b); set_btree_node_need_write(b); } @@ -1353,9 +1410,6 @@ void bch2_btree_insert_node(struct btree_update *as, struct btree *b, BUG_ON(!as || as->b); bch2_verify_keylist_sorted(keys); - if (as->must_rewrite) - goto split; - bch2_btree_node_lock_for_insert(c, b, iter); if (!bch2_btree_node_insert_fits(c, b, bch2_keylist_u64s(keys))) { @@ -1363,6 +1417,8 @@ void bch2_btree_insert_node(struct btree_update *as, struct btree *b, goto split; } + btree_node_interior_verify(c, b); + bch2_btree_insert_keys_interior(as, b, iter, keys); live_u64s_added = (int) b->nr.live_u64s - old_live_u64s; diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h index 7668225e72c6..45d212730fd7 100644 --- a/fs/bcachefs/btree_update_interior.h +++ b/fs/bcachefs/btree_update_interior.h @@ -47,7 +47,6 @@ struct btree_update { BTREE_INTERIOR_UPDATING_AS, } mode; - unsigned must_rewrite:1; unsigned nodes_written:1; enum btree_id btree_id; @@ -237,6 +236,9 @@ static inline ssize_t __bch_btree_u64s_remaining(struct bch_fs *c, b->whiteout_u64s; ssize_t total = c->opts.btree_node_size << 6; + /* Always leave one extra u64 for bch2_varint_decode: */ + used++; + return total - used; } diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index e386f8ed3922..e7816afe4a08 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -191,7 +191,7 @@ static bool btree_insert_key_leaf(struct btree_trans *trans, bch2_btree_add_journal_pin(c, b, trans->journal_res.seq); if (unlikely(!btree_node_dirty(b))) - set_btree_node_dirty(b); + set_btree_node_dirty(c, b); live_u64s_added = (int) b->nr.live_u64s - old_live_u64s; u64s_added = (int) bset_u64s(t) - old_u64s; @@ -286,6 +286,10 @@ btree_key_can_insert_cached(struct btree_trans *trans, BUG_ON(iter->level); + if 
(!test_bit(BKEY_CACHED_DIRTY, &ck->flags) && + bch2_btree_key_cache_must_wait(trans->c)) + return BTREE_INSERT_NEED_JOURNAL_RECLAIM; + if (u64s <= ck->u64s) return BTREE_INSERT_OK; @@ -642,20 +646,24 @@ int bch2_trans_commit_error(struct btree_trans *trans, trace_trans_restart_journal_res_get(trans->ip); ret = -EINTR; break; - default: - BUG_ON(ret >= 0); - break; - } + case BTREE_INSERT_NEED_JOURNAL_RECLAIM: + bch2_trans_unlock(trans); - if (ret == -EINTR) { - int ret2 = bch2_btree_iter_traverse_all(trans); + do { + mutex_lock(&c->journal.reclaim_lock); + ret = bch2_journal_reclaim(&c->journal); + mutex_unlock(&c->journal.reclaim_lock); + } while (!ret && bch2_btree_key_cache_must_wait(c)); - if (ret2) { - trace_trans_restart_traverse(trans->ip); - return ret2; - } + if (!ret && bch2_trans_relock(trans)) + return 0; - trace_trans_restart_atomic(trans->ip); + trace_trans_restart_journal_reclaim(trans->ip); + ret = -EINTR; + break; + default: + BUG_ON(ret >= 0); + break; } return ret; @@ -699,7 +707,7 @@ static void bch2_trans_update2(struct btree_trans *trans, BUG_ON(iter->uptodate > BTREE_ITER_NEED_PEEK); - EBUG_ON(trans->nr_updates2 >= trans->nr_iters); + EBUG_ON(trans->nr_updates2 >= BTREE_ITER_MAX); iter->flags |= BTREE_ITER_KEEP_UNTIL_COMMIT; @@ -732,8 +740,6 @@ static int extent_update_to_keys(struct btree_trans *trans, return 0; iter = bch2_trans_copy_iter(trans, orig_iter); - if (IS_ERR(iter)) - return PTR_ERR(iter); iter->flags |= BTREE_ITER_INTENT; __bch2_btree_iter_set_pos(iter, insert->k.p, false); @@ -752,10 +758,6 @@ static int extent_handle_overwrites(struct btree_trans *trans, int ret = 0; iter = bch2_trans_get_iter(trans, btree_id, start, BTREE_ITER_INTENT); - ret = PTR_ERR_OR_ZERO(iter); - if (ret) - return ret; - k = bch2_btree_iter_peek_with_updates(iter); while (k.k && !(ret = bkey_err(k))) { @@ -764,8 +766,6 @@ static int extent_handle_overwrites(struct btree_trans *trans, if (bkey_cmp(bkey_start_pos(k.k), start) < 0) { update_iter = bch2_trans_copy_iter(trans, iter); - if ((ret = PTR_ERR_OR_ZERO(update_iter))) - goto err; update = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); if ((ret = PTR_ERR_OR_ZERO(update))) @@ -781,8 +781,6 @@ static int extent_handle_overwrites(struct btree_trans *trans, if (bkey_cmp(k.k->p, end) > 0) { update_iter = bch2_trans_copy_iter(trans, iter); - if ((ret = PTR_ERR_OR_ZERO(update_iter))) - goto err; update = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); if ((ret = PTR_ERR_OR_ZERO(update))) @@ -796,8 +794,6 @@ static int extent_handle_overwrites(struct btree_trans *trans, bch2_trans_iter_put(trans, update_iter); } else { update_iter = bch2_trans_copy_iter(trans, iter); - if ((ret = PTR_ERR_OR_ZERO(update_iter))) - goto err; update = bch2_trans_kmalloc(trans, sizeof(struct bkey)); if ((ret = PTR_ERR_OR_ZERO(update))) @@ -829,8 +825,6 @@ int __bch2_trans_commit(struct btree_trans *trans) unsigned u64s; int ret = 0; - BUG_ON(trans->need_reset); - if (!trans->nr_updates) goto out_noupdates; @@ -1023,10 +1017,6 @@ int bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter, */ if (trans->iters_live & (1ULL << i->iter->idx)) { i->iter = bch2_trans_copy_iter(trans, i->iter); - if (IS_ERR(i->iter)) { - trans->need_reset = true; - return PTR_ERR(i->iter); - } i->iter->flags |= BTREE_ITER_KEEP_UNTIL_COMMIT; bch2_trans_iter_put(trans, i->iter); @@ -1036,7 +1026,7 @@ int bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter, bch2_btree_iter_set_pos(i->iter, n.k->k.p); } - EBUG_ON(trans->nr_updates >= trans->nr_iters); + 
EBUG_ON(trans->nr_updates >= BTREE_ITER_MAX); array_insert_item(trans->updates, trans->nr_updates, i - trans->updates, n); @@ -1051,8 +1041,6 @@ int __bch2_btree_insert(struct btree_trans *trans, iter = bch2_trans_get_iter(trans, id, bkey_start_pos(&k->k), BTREE_ITER_INTENT); - if (IS_ERR(iter)) - return PTR_ERR(iter); ret = bch2_btree_iter_traverse(iter) ?: bch2_trans_update(trans, iter, k, 0); @@ -1076,13 +1064,29 @@ int bch2_btree_insert(struct bch_fs *c, enum btree_id id, __bch2_btree_insert(&trans, id, k)); } -int bch2_btree_delete_at_range(struct btree_trans *trans, - struct btree_iter *iter, - struct bpos end, - u64 *journal_seq) +int bch2_btree_delete_at(struct btree_trans *trans, + struct btree_iter *iter, unsigned flags) { + struct bkey_i k; + + bkey_init(&k.k); + k.k.p = iter->pos; + + bch2_trans_update(trans, iter, &k, 0); + return bch2_trans_commit(trans, NULL, NULL, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_USE_RESERVE|flags); +} + +int bch2_btree_delete_range_trans(struct btree_trans *trans, enum btree_id id, + struct bpos start, struct bpos end, + u64 *journal_seq) +{ + struct btree_iter *iter; struct bkey_s_c k; int ret = 0; + + iter = bch2_trans_get_iter(trans, id, start, BTREE_ITER_INTENT); retry: while ((k = bch2_btree_iter_peek(iter)).k && !(ret = bkey_err(k)) && @@ -1094,6 +1098,10 @@ retry: bkey_init(&delete.k); /* + * This could probably be more efficient for extents: + */ + + /* * For extents, iter.pos won't necessarily be the same as * bkey_start_pos(k.k) (for non extents they always will be the * same). It's important that we delete starting from iter.pos @@ -1132,22 +1140,8 @@ retry: goto retry; } + bch2_trans_iter_put(trans, iter); return ret; - -} - -int bch2_btree_delete_at(struct btree_trans *trans, - struct btree_iter *iter, unsigned flags) -{ - struct bkey_i k; - - bkey_init(&k.k); - k.k.p = iter->pos; - - bch2_trans_update(trans, iter, &k, 0); - return bch2_trans_commit(trans, NULL, NULL, - BTREE_INSERT_NOFAIL| - BTREE_INSERT_USE_RESERVE|flags); } /* @@ -1159,21 +1153,6 @@ int bch2_btree_delete_range(struct bch_fs *c, enum btree_id id, struct bpos start, struct bpos end, u64 *journal_seq) { - struct btree_trans trans; - struct btree_iter *iter; - int ret = 0; - - /* - * XXX: whether we need mem/more iters depends on whether this btree id - * has triggers - */ - bch2_trans_init(&trans, c, BTREE_ITER_MAX, 512); - - iter = bch2_trans_get_iter(&trans, id, start, BTREE_ITER_INTENT); - - ret = bch2_btree_delete_at_range(&trans, iter, end, journal_seq); - ret = bch2_trans_exit(&trans) ?: ret; - - BUG_ON(ret == -EINTR); - return ret; + return bch2_trans_do(c, NULL, journal_seq, 0, + bch2_btree_delete_range_trans(&trans, id, start, end, journal_seq)); } diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 82f1cc4ca693..1934b845ea15 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -142,8 +142,8 @@ void bch2_fs_usage_initialize(struct bch_fs *c) percpu_down_write(&c->mark_lock); usage = c->usage_base; - bch2_fs_usage_acc_to_base(c, 0); - bch2_fs_usage_acc_to_base(c, 1); + for (i = 0; i < ARRAY_SIZE(c->usage); i++) + bch2_fs_usage_acc_to_base(c, i); for (i = 0; i < BCH_REPLICAS_MAX; i++) usage->reserved += usage->persistent_reserved[i]; @@ -207,13 +207,13 @@ static inline struct bch_fs_usage *fs_usage_ptr(struct bch_fs *c, { return this_cpu_ptr(gc ? 
c->usage_gc - : c->usage[journal_seq & 1]); + : c->usage[journal_seq & JOURNAL_BUF_MASK]); } u64 bch2_fs_usage_read_one(struct bch_fs *c, u64 *v) { ssize_t offset = v - (u64 *) c->usage_base; - unsigned seq; + unsigned i, seq; u64 ret; BUG_ON(offset < 0 || offset >= fs_usage_u64s(c)); @@ -221,9 +221,10 @@ u64 bch2_fs_usage_read_one(struct bch_fs *c, u64 *v) do { seq = read_seqcount_begin(&c->usage_lock); - ret = *v + - percpu_u64_get((u64 __percpu *) c->usage[0] + offset) + - percpu_u64_get((u64 __percpu *) c->usage[1] + offset); + ret = *v; + + for (i = 0; i < ARRAY_SIZE(c->usage); i++) + ret += percpu_u64_get((u64 __percpu *) c->usage[i] + offset); } while (read_seqcount_retry(&c->usage_lock, seq)); return ret; @@ -232,7 +233,7 @@ u64 bch2_fs_usage_read_one(struct bch_fs *c, u64 *v) struct bch_fs_usage *bch2_fs_usage_read(struct bch_fs *c) { struct bch_fs_usage *ret; - unsigned seq, v, u64s = fs_usage_u64s(c); + unsigned seq, i, v, u64s = fs_usage_u64s(c); retry: ret = kmalloc(u64s * sizeof(u64), GFP_NOFS); if (unlikely(!ret)) @@ -251,8 +252,8 @@ retry: do { seq = read_seqcount_begin(&c->usage_lock); memcpy(ret, c->usage_base, u64s * sizeof(u64)); - acc_u64s_percpu((u64 *) ret, (u64 __percpu *) c->usage[0], u64s); - acc_u64s_percpu((u64 *) ret, (u64 __percpu *) c->usage[1], u64s); + for (i = 0; i < ARRAY_SIZE(c->usage); i++) + acc_u64s_percpu((u64 *) ret, (u64 __percpu *) c->usage[i], u64s); } while (read_seqcount_retry(&c->usage_lock, seq)); return ret; @@ -262,7 +263,7 @@ void bch2_fs_usage_acc_to_base(struct bch_fs *c, unsigned idx) { unsigned u64s = fs_usage_u64s(c); - BUG_ON(idx >= 2); + BUG_ON(idx >= ARRAY_SIZE(c->usage)); preempt_disable(); write_seqcount_begin(&c->usage_lock); @@ -323,7 +324,7 @@ static u64 reserve_factor(u64 r) static u64 avail_factor(u64 r) { - return (r << RESERVE_FACTOR) / ((1 << RESERVE_FACTOR) + 1); + return div_u64(r << RESERVE_FACTOR, (1 << RESERVE_FACTOR) + 1); } u64 bch2_fs_sectors_used(struct bch_fs *c, struct bch_fs_usage *fs_usage) @@ -1333,10 +1334,8 @@ static int bch2_mark_key_locked(struct bch_fs *c, ret = bch2_mark_stripe(c, old, new, fs_usage, journal_seq, flags); break; case KEY_TYPE_inode: - if (!(flags & BTREE_TRIGGER_OVERWRITE)) - fs_usage->nr_inodes++; - else - fs_usage->nr_inodes--; + fs_usage->nr_inodes += new.k->type == KEY_TYPE_inode; + fs_usage->nr_inodes -= old.k->type == KEY_TYPE_inode; break; case KEY_TYPE_reservation: { unsigned replicas = bkey_s_c_to_reservation(k).v->nr_replicas; @@ -1400,10 +1399,10 @@ int bch2_mark_update(struct btree_trans *trans, old = (struct bkey_s_c) { &unpacked, NULL }; if (!btree_node_type_is_extents(iter->btree_id)) { + /* iterators should be uptodate, shouldn't get errors here: */ if (btree_iter_type(iter) != BTREE_ITER_CACHED) { - _old = bch2_btree_node_iter_peek(&node_iter, b); - if (_old) - old = bkey_disassemble(b, _old, &unpacked); + old = bch2_btree_iter_peek_slot(iter); + BUG_ON(bkey_err(old)); } else { struct bkey_cached *ck = (void *) iter->l[0].b; @@ -1576,9 +1575,6 @@ static int trans_get_key(struct btree_trans *trans, *iter = bch2_trans_get_iter(trans, btree_id, pos, flags|BTREE_ITER_INTENT); - if (IS_ERR(*iter)) - return PTR_ERR(*iter); - *k = __bch2_btree_iter_peek(*iter, flags); ret = bkey_err(*k); if (ret) @@ -1606,9 +1602,6 @@ static int bch2_trans_start_alloc_update(struct btree_trans *trans, struct btree BTREE_ITER_CACHED| BTREE_ITER_CACHED_NOFILL| BTREE_ITER_INTENT); - if (IS_ERR(iter)) - return PTR_ERR(iter); - ret = bch2_btree_iter_traverse(iter); if (ret) { 
bch2_trans_iter_put(trans, iter); @@ -1754,59 +1747,92 @@ static int bch2_trans_mark_extent(struct btree_trans *trans, return 0; } +static int bch2_trans_mark_stripe_alloc_ref(struct btree_trans *trans, + const struct bch_extent_ptr *ptr, + s64 sectors, bool parity) +{ + struct bkey_i_alloc *a; + struct btree_iter *iter; + struct bkey_alloc_unpacked u; + int ret; + + ret = bch2_trans_start_alloc_update(trans, &iter, ptr, &u); + if (ret) + return ret; + + if (parity) { + u.dirty_sectors += sectors; + u.data_type = u.dirty_sectors + ? BCH_DATA_parity + : 0; + } + + a = bch2_trans_kmalloc(trans, BKEY_ALLOC_U64s_MAX * 8); + ret = PTR_ERR_OR_ZERO(a); + if (ret) + goto err; + + bkey_alloc_init(&a->k_i); + a->k.p = iter->pos; + bch2_alloc_pack(a, u); + bch2_trans_update(trans, iter, &a->k_i, 0); +err: + bch2_trans_iter_put(trans, iter); + return ret; +} + static int bch2_trans_mark_stripe(struct btree_trans *trans, - struct bkey_s_c k, + struct bkey_s_c old, struct bkey_s_c new, unsigned flags) { - const struct bch_stripe *s = bkey_s_c_to_stripe(k).v; - unsigned nr_data = s->nr_blocks - s->nr_redundant; + const struct bch_stripe *old_s = old.k->type == KEY_TYPE_stripe + ? bkey_s_c_to_stripe(old).v : NULL; + const struct bch_stripe *new_s = new.k->type == KEY_TYPE_stripe + ? bkey_s_c_to_stripe(new).v : NULL; struct bch_replicas_padded r; - struct bkey_alloc_unpacked u; - struct bkey_i_alloc *a; - struct btree_iter *iter; - bool deleting = flags & BTREE_TRIGGER_OVERWRITE; - s64 sectors = le16_to_cpu(s->sectors); unsigned i; int ret = 0; - if (deleting) - sectors = -sectors; - - bch2_bkey_to_replicas(&r.e, k); - update_replicas_list(trans, &r.e, sectors * s->nr_redundant); - /* - * The allocator code doesn't necessarily update bucket gens in the - * btree when incrementing them, right before handing out new buckets - - * we just need to persist those updates here along with the new stripe: + * If the pointers aren't changing, we don't need to do anything: */ + if (new_s && old_s && + !memcmp(old_s->ptrs, new_s->ptrs, + new_s->nr_blocks * sizeof(struct bch_extent_ptr))) + return 0; - for (i = 0; i < s->nr_blocks && !ret; i++) { - bool parity = i >= nr_data; + if (new_s) { + unsigned nr_data = new_s->nr_blocks - new_s->nr_redundant; + s64 sectors = le16_to_cpu(new_s->sectors); - ret = bch2_trans_start_alloc_update(trans, &iter, - &s->ptrs[i], &u); - if (ret) - break; + bch2_bkey_to_replicas(&r.e, new); + update_replicas_list(trans, &r.e, sectors * new_s->nr_redundant); + + for (i = 0; i < new_s->nr_blocks; i++) { + bool parity = i >= nr_data; - if (parity) { - u.dirty_sectors += sectors; - u.data_type = u.dirty_sectors - ? 
BCH_DATA_parity - : 0; + ret = bch2_trans_mark_stripe_alloc_ref(trans, + &new_s->ptrs[i], sectors, parity); + if (ret) + return ret; } + } - a = bch2_trans_kmalloc(trans, BKEY_ALLOC_U64s_MAX * 8); - ret = PTR_ERR_OR_ZERO(a); - if (ret) - goto put_iter; - - bkey_alloc_init(&a->k_i); - a->k.p = iter->pos; - bch2_alloc_pack(a, u); - bch2_trans_update(trans, iter, &a->k_i, 0); -put_iter: - bch2_trans_iter_put(trans, iter); + if (old_s) { + unsigned nr_data = old_s->nr_blocks - old_s->nr_redundant; + s64 sectors = -((s64) le16_to_cpu(old_s->sectors)); + + bch2_bkey_to_replicas(&r.e, old); + update_replicas_list(trans, &r.e, sectors * old_s->nr_redundant); + + for (i = 0; i < old_s->nr_blocks; i++) { + bool parity = i >= nr_data; + + ret = bch2_trans_mark_stripe_alloc_ref(trans, + &old_s->ptrs[i], sectors, parity); + if (ret) + return ret; + } } return ret; @@ -1905,11 +1931,16 @@ static int bch2_trans_mark_reflink_p(struct btree_trans *trans, return ret; } -int bch2_trans_mark_key(struct btree_trans *trans, struct bkey_s_c k, +int bch2_trans_mark_key(struct btree_trans *trans, + struct bkey_s_c old, + struct bkey_s_c new, unsigned offset, s64 sectors, unsigned flags) { - struct replicas_delta_list *d; struct bch_fs *c = trans->c; + struct bkey_s_c k = flags & BTREE_TRIGGER_INSERT ? new : old; + struct replicas_delta_list *d; + + BUG_ON(!(flags & (BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE))); switch (k.k->type) { case KEY_TYPE_btree_ptr: @@ -1925,15 +1956,18 @@ int bch2_trans_mark_key(struct btree_trans *trans, struct bkey_s_c k, return bch2_trans_mark_extent(trans, k, offset, sectors, flags, BCH_DATA_user); case KEY_TYPE_stripe: - return bch2_trans_mark_stripe(trans, k, flags); - case KEY_TYPE_inode: - d = replicas_deltas_realloc(trans, 0); + return bch2_trans_mark_stripe(trans, old, new, flags); + case KEY_TYPE_inode: { + int nr = (new.k->type == KEY_TYPE_inode) - + (old.k->type == KEY_TYPE_inode); + + if (nr) { + d = replicas_deltas_realloc(trans, 0); + d->nr_inodes += nr; + } - if (!(flags & BTREE_TRIGGER_OVERWRITE)) - d->nr_inodes++; - else - d->nr_inodes--; return 0; + } case KEY_TYPE_reservation: { unsigned replicas = bkey_s_c_to_reservation(k).v->nr_replicas; @@ -1957,12 +1991,10 @@ int bch2_trans_mark_key(struct btree_trans *trans, struct bkey_s_c k, int bch2_trans_mark_update(struct btree_trans *trans, struct btree_iter *iter, - struct bkey_i *insert, + struct bkey_i *new, unsigned flags) { - struct btree *b = iter_l(iter)->b; - struct btree_node_iter node_iter = iter_l(iter)->iter; - struct bkey_packed *_k; + struct bkey_s_c old; int ret; if (unlikely(flags & BTREE_TRIGGER_NORUN)) @@ -1971,79 +2003,97 @@ int bch2_trans_mark_update(struct btree_trans *trans, if (!btree_node_type_needs_gc(iter->btree_id)) return 0; - ret = bch2_trans_mark_key(trans, bkey_i_to_s_c(insert), - 0, insert->k.size, BTREE_TRIGGER_INSERT); - if (ret) - return ret; - - if (btree_iter_type(iter) == BTREE_ITER_CACHED) { - struct bkey_cached *ck = (void *) iter->l[0].b; + if (!btree_node_type_is_extents(iter->btree_id)) { + /* iterators should be uptodate, shouldn't get errors here: */ + if (btree_iter_type(iter) != BTREE_ITER_CACHED) { + old = bch2_btree_iter_peek_slot(iter); + BUG_ON(bkey_err(old)); + } else { + struct bkey_cached *ck = (void *) iter->l[0].b; - return bch2_trans_mark_key(trans, bkey_i_to_s_c(ck->k), - 0, 0, BTREE_TRIGGER_OVERWRITE); - } + BUG_ON(!ck->valid); + old = bkey_i_to_s_c(ck->k); + } - while ((_k = bch2_btree_node_iter_peek(&node_iter, b))) { + if (old.k->type == new->k.type) { + ret = 
bch2_trans_mark_key(trans, old, bkey_i_to_s_c(new), 0, 0, + BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE|flags); + } else { + ret = bch2_trans_mark_key(trans, old, bkey_i_to_s_c(new), 0, 0, + BTREE_TRIGGER_INSERT|flags) ?: + bch2_trans_mark_key(trans, old, bkey_i_to_s_c(new), 0, 0, + BTREE_TRIGGER_OVERWRITE|flags); + } + } else { + struct btree *b = iter_l(iter)->b; + struct btree_node_iter node_iter = iter_l(iter)->iter; + struct bkey_packed *_old; struct bkey unpacked; - struct bkey_s_c k; - unsigned offset = 0; - s64 sectors = 0; - unsigned flags = BTREE_TRIGGER_OVERWRITE; - k = bkey_disassemble(b, _k, &unpacked); + EBUG_ON(btree_iter_type(iter) == BTREE_ITER_CACHED); - if (btree_node_is_extents(b) - ? bkey_cmp(insert->k.p, bkey_start_pos(k.k)) <= 0 - : bkey_cmp(insert->k.p, k.k->p)) - break; + bkey_init(&unpacked); + old = (struct bkey_s_c) { &unpacked, NULL }; + + ret = bch2_trans_mark_key(trans, old, bkey_i_to_s_c(new), + 0, new->k.size, + BTREE_TRIGGER_INSERT); + if (ret) + return ret; + + while ((_old = bch2_btree_node_iter_peek(&node_iter, b))) { + unsigned flags = BTREE_TRIGGER_OVERWRITE; + unsigned offset = 0; + s64 sectors; + + old = bkey_disassemble(b, _old, &unpacked); + sectors = -((s64) old.k->size); + + flags |= BTREE_TRIGGER_OVERWRITE; + + if (bkey_cmp(new->k.p, bkey_start_pos(old.k)) <= 0) + return 0; - if (btree_node_is_extents(b)) { - switch (bch2_extent_overlap(&insert->k, k.k)) { + switch (bch2_extent_overlap(&new->k, old.k)) { case BCH_EXTENT_OVERLAP_ALL: offset = 0; - sectors = -((s64) k.k->size); + sectors = -((s64) old.k->size); break; case BCH_EXTENT_OVERLAP_BACK: - offset = bkey_start_offset(&insert->k) - - bkey_start_offset(k.k); - sectors = bkey_start_offset(&insert->k) - - k.k->p.offset; + offset = bkey_start_offset(&new->k) - + bkey_start_offset(old.k); + sectors = bkey_start_offset(&new->k) - + old.k->p.offset; break; case BCH_EXTENT_OVERLAP_FRONT: offset = 0; - sectors = bkey_start_offset(k.k) - - insert->k.p.offset; + sectors = bkey_start_offset(old.k) - + new->k.p.offset; break; case BCH_EXTENT_OVERLAP_MIDDLE: - offset = bkey_start_offset(&insert->k) - - bkey_start_offset(k.k); - sectors = -((s64) insert->k.size); + offset = bkey_start_offset(&new->k) - + bkey_start_offset(old.k); + sectors = -((s64) new->k.size); flags |= BTREE_TRIGGER_OVERWRITE_SPLIT; break; } BUG_ON(sectors >= 0); - } - ret = bch2_trans_mark_key(trans, k, offset, sectors, flags); - if (ret) - return ret; + ret = bch2_trans_mark_key(trans, old, bkey_i_to_s_c(new), + offset, sectors, flags); + if (ret) + return ret; - bch2_btree_node_iter_advance(&node_iter, b); + bch2_btree_node_iter_advance(&node_iter, b); + } } - return 0; + return ret; } /* Disk reservations: */ -static u64 bch2_recalc_sectors_available(struct bch_fs *c) -{ - percpu_u64_set(&c->pcpu->sectors_available, 0); - - return avail_factor(__bch2_fs_usage_read_short(c).free); -} - void __bch2_disk_reservation_put(struct bch_fs *c, struct disk_reservation *res) { percpu_down_read(&c->mark_lock); @@ -2078,7 +2128,6 @@ int bch2_disk_reservation_add(struct bch_fs *c, struct disk_reservation *res, if (get < sectors) { preempt_enable(); - percpu_up_read(&c->mark_lock); goto recalculate; } } while ((v = atomic64_cmpxchg(&c->sectors_available, @@ -2096,9 +2145,10 @@ out: return 0; recalculate: - percpu_down_write(&c->mark_lock); + mutex_lock(&c->sectors_available_lock); - sectors_available = bch2_recalc_sectors_available(c); + percpu_u64_set(&c->pcpu->sectors_available, 0); + sectors_available = 
avail_factor(__bch2_fs_usage_read_short(c).free); if (sectors <= sectors_available || (flags & BCH_DISK_RESERVATION_NOFAIL)) { @@ -2112,7 +2162,8 @@ recalculate: ret = -ENOSPC; } - percpu_up_write(&c->mark_lock); + mutex_unlock(&c->sectors_available_lock); + percpu_up_read(&c->mark_lock); return ret; } diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h index a3873becbb70..3a5ed1fcaf78 100644 --- a/fs/bcachefs/buckets.h +++ b/fs/bcachefs/buckets.h @@ -264,7 +264,7 @@ int bch2_mark_update(struct btree_trans *, struct btree_iter *, int bch2_replicas_delta_list_apply(struct bch_fs *, struct bch_fs_usage *, struct replicas_delta_list *); -int bch2_trans_mark_key(struct btree_trans *, struct bkey_s_c, +int bch2_trans_mark_key(struct btree_trans *, struct bkey_s_c, struct bkey_s_c, unsigned, s64, unsigned); int bch2_trans_mark_update(struct btree_trans *, struct btree_iter *iter, struct bkey_i *insert, unsigned); diff --git a/fs/bcachefs/chardev.c b/fs/bcachefs/chardev.c index 0377f9018d27..e7c8969aaad1 100644 --- a/fs/bcachefs/chardev.c +++ b/fs/bcachefs/chardev.c @@ -5,6 +5,7 @@ #include "bcachefs_ioctl.h" #include "buckets.h" #include "chardev.h" +#include "journal.h" #include "move.h" #include "replicas.h" #include "super.h" @@ -340,7 +341,8 @@ static long bch2_ioctl_data(struct bch_fs *c, ctx->c = c; ctx->arg = arg; - ctx->thread = kthread_create(bch2_data_thread, ctx, "[bcachefs]"); + ctx->thread = kthread_create(bch2_data_thread, ctx, + "bch-data/%s", c->name); if (IS_ERR(ctx->thread)) { ret = PTR_ERR(ctx->thread); goto err; @@ -563,6 +565,26 @@ static long bch2_ioctl_disk_resize(struct bch_fs *c, return ret; } +static long bch2_ioctl_disk_resize_journal(struct bch_fs *c, + struct bch_ioctl_disk_resize_journal arg) +{ + struct bch_dev *ca; + int ret; + + if ((arg.flags & ~BCH_BY_INDEX) || + arg.pad) + return -EINVAL; + + ca = bch2_device_lookup(c, arg.dev, arg.flags); + if (IS_ERR(ca)) + return PTR_ERR(ca); + + ret = bch2_set_nr_journal_buckets(c, ca, arg.nbuckets); + + percpu_ref_put(&ca->ref); + return ret; +} + #define BCH_IOCTL(_name, _argtype) \ do { \ _argtype i; \ @@ -619,6 +641,8 @@ long bch2_fs_ioctl(struct bch_fs *c, unsigned cmd, void __user *arg) BCH_IOCTL(data, struct bch_ioctl_data); case BCH_IOCTL_DISK_RESIZE: BCH_IOCTL(disk_resize, struct bch_ioctl_disk_resize); + case BCH_IOCTL_DISK_RESIZE_JOURNAL: + BCH_IOCTL(disk_resize_journal, struct bch_ioctl_disk_resize_journal); default: return -ENOTTY; diff --git a/fs/bcachefs/checksum.c b/fs/bcachefs/checksum.c index a01073e54a33..3d88719ba86c 100644 --- a/fs/bcachefs/checksum.c +++ b/fs/bcachefs/checksum.c @@ -10,7 +10,7 @@ #include <linux/random.h> #include <linux/scatterlist.h> #include <crypto/algapi.h> -#include <crypto/chacha20.h> +#include <crypto/chacha.h> #include <crypto/hash.h> #include <crypto/poly1305.h> #include <crypto/skcipher.h> @@ -68,21 +68,21 @@ static u64 bch2_checksum_update(unsigned type, u64 crc, const void *data, size_t } } -static inline void do_encrypt_sg(struct crypto_skcipher *tfm, +static inline void do_encrypt_sg(struct crypto_sync_skcipher *tfm, struct nonce nonce, struct scatterlist *sg, size_t len) { - SKCIPHER_REQUEST_ON_STACK(req, tfm); + SYNC_SKCIPHER_REQUEST_ON_STACK(req, tfm); int ret; - skcipher_request_set_tfm(req, tfm); + skcipher_request_set_sync_tfm(req, tfm); skcipher_request_set_crypt(req, sg, sg, len, nonce.d); ret = crypto_skcipher_encrypt(req); BUG_ON(ret); } -static inline void do_encrypt(struct crypto_skcipher *tfm, +static inline void do_encrypt(struct 
crypto_sync_skcipher *tfm, struct nonce nonce, void *buf, size_t len) { @@ -95,8 +95,8 @@ static inline void do_encrypt(struct crypto_skcipher *tfm, int bch2_chacha_encrypt_key(struct bch_key *key, struct nonce nonce, void *buf, size_t len) { - struct crypto_skcipher *chacha20 = - crypto_alloc_skcipher("chacha20", 0, 0); + struct crypto_sync_skcipher *chacha20 = + crypto_alloc_sync_skcipher("chacha20", 0, 0); int ret; if (!chacha20) { @@ -104,7 +104,8 @@ int bch2_chacha_encrypt_key(struct bch_key *key, struct nonce nonce, return PTR_ERR(chacha20); } - ret = crypto_skcipher_setkey(chacha20, (void *) key, sizeof(*key)); + ret = crypto_skcipher_setkey(&chacha20->base, + (void *) key, sizeof(*key)); if (ret) { pr_err("crypto_skcipher_setkey() error: %i", ret); goto err; @@ -112,7 +113,7 @@ int bch2_chacha_encrypt_key(struct bch_key *key, struct nonce nonce, do_encrypt(chacha20, nonce, buf, len); err: - crypto_free_skcipher(chacha20); + crypto_free_sync_skcipher(chacha20); return ret; } @@ -199,7 +200,7 @@ static struct bch_csum __bch2_checksum_bio(struct bch_fs *c, unsigned type, kunmap_atomic(p); } #else - __bio_for_each_contig_segment(bv, bio, *iter, *iter) + __bio_for_each_bvec(bv, bio, *iter, *iter) crc = bch2_checksum_update(type, crc, page_address(bv.bv_page) + bv.bv_offset, bv.bv_len); @@ -224,7 +225,7 @@ static struct bch_csum __bch2_checksum_bio(struct bch_fs *c, unsigned type, kunmap_atomic(p); } #else - __bio_for_each_contig_segment(bv, bio, *iter, *iter) + __bio_for_each_bvec(bv, bio, *iter, *iter) crypto_shash_update(desc, page_address(bv.bv_page) + bv.bv_offset, bv.bv_len); @@ -463,7 +464,7 @@ err: static int bch2_alloc_ciphers(struct bch_fs *c) { if (!c->chacha20) - c->chacha20 = crypto_alloc_skcipher("chacha20", 0, 0); + c->chacha20 = crypto_alloc_sync_skcipher("chacha20", 0, 0); if (IS_ERR(c->chacha20)) { bch_err(c, "error requesting chacha20 module: %li", PTR_ERR(c->chacha20)); @@ -546,7 +547,7 @@ int bch2_enable_encryption(struct bch_fs *c, bool keyed) goto err; } - ret = crypto_skcipher_setkey(c->chacha20, + ret = crypto_skcipher_setkey(&c->chacha20->base, (void *) &key.key, sizeof(key.key)); if (ret) goto err; @@ -574,7 +575,7 @@ void bch2_fs_encryption_exit(struct bch_fs *c) if (!IS_ERR_OR_NULL(c->poly1305)) crypto_free_shash(c->poly1305); if (!IS_ERR_OR_NULL(c->chacha20)) - crypto_free_skcipher(c->chacha20); + crypto_free_sync_skcipher(c->chacha20); if (!IS_ERR_OR_NULL(c->sha256)) crypto_free_shash(c->sha256); } @@ -606,7 +607,7 @@ int bch2_fs_encryption_init(struct bch_fs *c) if (ret) goto out; - ret = crypto_skcipher_setkey(c->chacha20, + ret = crypto_skcipher_setkey(&c->chacha20->base, (void *) &key.key, sizeof(key.key)); if (ret) goto out; diff --git a/fs/bcachefs/checksum.h b/fs/bcachefs/checksum.h index 833537cc8fd0..24dee8039d57 100644 --- a/fs/bcachefs/checksum.h +++ b/fs/bcachefs/checksum.h @@ -7,7 +7,7 @@ #include "super-io.h" #include <linux/crc64.h> -#include <crypto/chacha20.h> +#include <crypto/chacha.h> static inline bool bch2_checksum_mergeable(unsigned type) { @@ -138,9 +138,9 @@ static inline bool bch2_crc_cmp(struct bch_csum l, struct bch_csum r) /* for skipping ahead and encrypting/decrypting at an offset: */ static inline struct nonce nonce_add(struct nonce nonce, unsigned offset) { - EBUG_ON(offset & (CHACHA20_BLOCK_SIZE - 1)); + EBUG_ON(offset & (CHACHA_BLOCK_SIZE - 1)); - le32_add_cpu(&nonce.d[0], offset / CHACHA20_BLOCK_SIZE); + le32_add_cpu(&nonce.d[0], offset / CHACHA_BLOCK_SIZE); return nonce; } diff --git a/fs/bcachefs/compress.c 
b/fs/bcachefs/compress.c index 0d68a277cfd7..aebf46bb1d21 100644 --- a/fs/bcachefs/compress.c +++ b/fs/bcachefs/compress.c @@ -45,7 +45,7 @@ static bool bio_phys_contig(struct bio *bio, struct bvec_iter start) struct bvec_iter iter; void *expected_start = NULL; - __bio_for_each_segment(bv, bio, iter, start) { + __bio_for_each_bvec(bv, bio, iter, start) { if (expected_start && expected_start != page_address(bv.bv_page) + bv.bv_offset) return false; diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index d7ba0e7fc3b3..eb03adc2d533 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -264,7 +264,7 @@ static void ec_validate_checksums(struct bch_fs *c, struct ec_stripe_buf *buf) len << 9); if (memcmp(stripe_csum(v, i, j), &csum, csum_bytes)) { - __bcache_io_error(c, + bch_err_ratelimited(c, "checksum error while doing reconstruct read (%u:%u)", i, j); clear_bit(i, buf->valid); @@ -305,7 +305,7 @@ static int ec_do_recov(struct bch_fs *c, struct ec_stripe_buf *buf) unsigned bytes = buf->size << 9; if (ec_nr_failed(buf) > v->nr_redundant) { - __bcache_io_error(c, + bch_err_ratelimited(c, "error doing reconstruct read: unable to read enough blocks"); return -1; } @@ -326,7 +326,7 @@ static void ec_block_endio(struct bio *bio) struct bch_dev *ca = ec_bio->ca; struct closure *cl = bio->bi_private; - if (bch2_dev_io_err_on(bio->bi_status, ca, "erasure coding %s: %s", + if (bch2_dev_io_err_on(bio->bi_status, ca, "erasure coding %s error: %s", bio_data_dir(bio) ? "write" : "read", bch2_blk_status_to_str(bio->bi_status))) clear_bit(ec_bio->idx, ec_bio->buf->valid); @@ -420,7 +420,7 @@ int bch2_ec_read_extent(struct bch_fs *c, struct bch_read_bio *rbio) BTREE_ITER_SLOTS); k = bch2_btree_iter_peek_slot(iter); if (bkey_err(k) || k.k->type != KEY_TYPE_stripe) { - __bcache_io_error(c, + bch_err_ratelimited(c, "error doing reconstruct read: stripe not found"); kfree(buf); return bch2_trans_exit(&trans) ?: -EIO; @@ -462,7 +462,7 @@ int bch2_ec_read_extent(struct bch_fs *c, struct bch_read_bio *rbio) struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); if (ptr_stale(ca, ptr)) { - __bcache_io_error(c, + bch_err_ratelimited(c, "error doing reconstruct read: stale pointer"); clear_bit(i, buf->valid); continue; @@ -474,7 +474,7 @@ int bch2_ec_read_extent(struct bch_fs *c, struct bch_read_bio *rbio) closure_sync(&cl); if (ec_nr_failed(buf) > v->nr_redundant) { - __bcache_io_error(c, + bch_err_ratelimited(c, "error doing reconstruct read: unable to read enough blocks"); ret = -EIO; goto err; @@ -874,7 +874,7 @@ static void ec_stripe_create(struct ec_stripe_new *s) for_each_keylist_key(&s->keys, k) { ret = ec_stripe_update_ptrs(c, &s->stripe, &k->k); if (ret) { - bch_err(c, "error creating stripe: error updating pointers"); + bch_err(c, "error creating stripe: error %i updating pointers", ret); break; } } @@ -1341,16 +1341,14 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct bch_fs *c, if (!h) return NULL; - if (!h->s && ec_new_stripe_alloc(c, h)) { - bch2_ec_stripe_head_put(c, h); - return NULL; - } - - if (!h->s->allocated) { - if (!h->s->existing_stripe && - (idx = get_existing_stripe(c, target, algo, redundancy)) >= 0) { - //pr_info("got existing stripe %llu", idx); + if (!h->s) { + if (ec_new_stripe_alloc(c, h)) { + bch2_ec_stripe_head_put(c, h); + return NULL; + } + idx = get_existing_stripe(c, target, algo, redundancy); + if (idx >= 0) { h->s->existing_stripe = true; h->s->existing_stripe_idx = idx; if (get_stripe_key(c, idx, &h->s->stripe)) { @@ -1364,7 +1362,9 @@ struct ec_stripe_head 
*bch2_ec_stripe_head_get(struct bch_fs *c, ec_block_io(c, &h->s->stripe, READ, i, &cl); } } + } + if (!h->s->allocated) { if (!h->s->existing_stripe && !h->s->res.sectors) { ret = bch2_disk_reservation_get(c, &h->s->res, diff --git a/fs/bcachefs/error.h b/fs/bcachefs/error.h index 94b53312fbbd..0e49fd728e44 100644 --- a/fs/bcachefs/error.h +++ b/fs/bcachefs/error.h @@ -181,12 +181,18 @@ void bch2_io_error(struct bch_dev *); /* Logs message and handles the error: */ #define bch2_dev_io_error(ca, fmt, ...) \ do { \ - printk_ratelimited(KERN_ERR bch2_fmt((ca)->fs, \ - "IO error on %s for " fmt), \ + printk_ratelimited(KERN_ERR "bcachefs (%s): " fmt, \ (ca)->name, ##__VA_ARGS__); \ bch2_io_error(ca); \ } while (0) +#define bch2_dev_inum_io_error(ca, _inum, _offset, fmt, ...) \ +do { \ + printk_ratelimited(KERN_ERR "bcachefs (%s inum %llu offset %llu): " fmt,\ + (ca)->name, (_inum), (_offset), ##__VA_ARGS__); \ + bch2_io_error(ca); \ +} while (0) + #define bch2_dev_io_err_on(cond, ca, ...) \ ({ \ bool _ret = (cond); \ @@ -196,16 +202,13 @@ do { \ _ret; \ }) -/* kill? */ - -#define __bcache_io_error(c, fmt, ...) \ - printk_ratelimited(KERN_ERR bch2_fmt(c, \ - "IO error: " fmt), ##__VA_ARGS__) - -#define bcache_io_error(c, bio, fmt, ...) \ -do { \ - __bcache_io_error(c, fmt, ##__VA_ARGS__); \ - (bio)->bi_status = BLK_STS_IOERR; \ -} while (0) +#define bch2_dev_inum_io_err_on(cond, ca, _inum, _offset, ...) \ +({ \ + bool _ret = (cond); \ + \ + if (_ret) \ + bch2_dev_inum_io_error(ca, _inum, _offset, __VA_ARGS__);\ + _ret; \ +}) #endif /* _BCACHEFS_ERROR_H */ diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index 7262e320ce25..dc16a7731e38 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -35,6 +35,22 @@ #include <trace/events/bcachefs.h> #include <trace/events/writeback.h> +static inline struct address_space *faults_disabled_mapping(void) +{ + return (void *) (((unsigned long) current->faults_disabled_mapping) & ~1UL); +} + +static inline void set_fdm_dropped_locks(void) +{ + current->faults_disabled_mapping = + (void *) (((unsigned long) current->faults_disabled_mapping)|1); +} + +static inline bool fdm_dropped_locks(void) +{ + return ((unsigned long) current->faults_disabled_mapping) & 1; +} + struct quota_res { u64 sectors; }; @@ -71,6 +87,24 @@ struct dio_read { struct bch_read_bio rbio; }; +/* stub version */ +static int add_to_page_cache_lru_vec(struct address_space *mapping, + struct page **pages, + unsigned nr_pages, + pgoff_t offset, gfp_t gfp_mask) +{ + int i, err = 0; + + for (i = 0; i < nr_pages; i++) { + err = add_to_page_cache_lru(pages[i], mapping, + offset + i, gfp_mask); + if (err) + break; + } + + return i ?: err; +} + /* pagecache_block must be held */ static int write_invalidate_inode_pages_range(struct address_space *mapping, loff_t start, loff_t end) @@ -265,28 +299,13 @@ static inline struct bch_page_state *bch2_page_state(struct page *page) /* for newly allocated pages: */ static void __bch2_page_state_release(struct page *page) { - struct bch_page_state *s = __bch2_page_state(page); - - if (!s) - return; - - ClearPagePrivate(page); - set_page_private(page, 0); - put_page(page); - kfree(s); + kfree(detach_page_private(page)); } static void bch2_page_state_release(struct page *page) { - struct bch_page_state *s = bch2_page_state(page); - - if (!s) - return; - - ClearPagePrivate(page); - set_page_private(page, 0); - put_page(page); - kfree(s); + EBUG_ON(!PageLocked(page)); + __bch2_page_state_release(page); } /* for newly allocated pages: */ @@ -300,13 
+319,7 @@ static struct bch_page_state *__bch2_page_state_create(struct page *page, return NULL; spin_lock_init(&s->lock); - /* - * migrate_page_move_mapping() assumes that pages with private data - * have their count elevated by 1. - */ - get_page(page); - set_page_private(page, (unsigned long) s); - SetPagePrivate(page); + attach_page_private(page, s); return s; } @@ -514,10 +527,35 @@ static void bch2_set_page_dirty(struct bch_fs *c, vm_fault_t bch2_page_fault(struct vm_fault *vmf) { struct file *file = vmf->vma->vm_file; + struct address_space *mapping = file->f_mapping; + struct address_space *fdm = faults_disabled_mapping(); struct bch_inode_info *inode = file_bch_inode(file); int ret; + if (fdm == mapping) + return VM_FAULT_SIGBUS; + + /* Lock ordering: */ + if (fdm > mapping) { + struct bch_inode_info *fdm_host = to_bch_ei(fdm->host); + + if (bch2_pagecache_add_tryget(&inode->ei_pagecache_lock)) + goto got_lock; + + bch2_pagecache_block_put(&fdm_host->ei_pagecache_lock); + + bch2_pagecache_add_get(&inode->ei_pagecache_lock); + bch2_pagecache_add_put(&inode->ei_pagecache_lock); + + bch2_pagecache_block_get(&fdm_host->ei_pagecache_lock); + + /* Signal that lock has been dropped: */ + set_fdm_dropped_locks(); + return VM_FAULT_SIGBUS; + } + bch2_pagecache_add_get(&inode->ei_pagecache_lock); +got_lock: ret = filemap_fault(vmf); bch2_pagecache_add_put(&inode->ei_pagecache_lock); @@ -604,18 +642,12 @@ int bch2_migrate_page(struct address_space *mapping, struct page *newpage, EBUG_ON(!PageLocked(page)); EBUG_ON(!PageLocked(newpage)); - ret = migrate_page_move_mapping(mapping, newpage, page, NULL, mode, 0); + ret = migrate_page_move_mapping(mapping, newpage, page, 0); if (ret != MIGRATEPAGE_SUCCESS) return ret; - if (PagePrivate(page)) { - ClearPagePrivate(page); - get_page(newpage); - set_page_private(newpage, page_private(page)); - set_page_private(page, 0); - put_page(page); - SetPagePrivate(newpage); - } + if (PagePrivate(page)) + attach_page_private(newpage, detach_page_private(page)); if (mode != MIGRATE_SYNC_NO_COPY) migrate_page_copy(newpage, page); @@ -629,10 +661,10 @@ int bch2_migrate_page(struct address_space *mapping, struct page *newpage, static void bch2_readpages_end_io(struct bio *bio) { + struct bvec_iter_all iter; struct bio_vec *bv; - unsigned i; - bio_for_each_segment_all(bv, bio, i) { + bio_for_each_segment_all(bv, bio, iter) { struct page *page = bv->bv_page; if (!bio->bi_status) { @@ -651,31 +683,29 @@ struct readpages_iter { struct address_space *mapping; struct page **pages; unsigned nr_pages; - unsigned nr_added; unsigned idx; pgoff_t offset; }; static int readpages_iter_init(struct readpages_iter *iter, - struct address_space *mapping, - struct list_head *pages, unsigned nr_pages) + struct readahead_control *ractl) { + unsigned i, nr_pages = readahead_count(ractl); + memset(iter, 0, sizeof(*iter)); - iter->mapping = mapping; - iter->offset = list_last_entry(pages, struct page, lru)->index; + iter->mapping = ractl->mapping; + iter->offset = readahead_index(ractl); + iter->nr_pages = nr_pages; iter->pages = kmalloc_array(nr_pages, sizeof(struct page *), GFP_NOFS); if (!iter->pages) return -ENOMEM; - while (!list_empty(pages)) { - struct page *page = list_last_entry(pages, struct page, lru); - - __bch2_page_state_create(page, __GFP_NOFAIL); - - iter->pages[iter->nr_pages++] = page; - list_del(&page->lru); + nr_pages = __readahead_batch(ractl, iter->pages, nr_pages); + for (i = 0; i < nr_pages; i++) { + __bch2_page_state_create(iter->pages[i], __GFP_NOFAIL); + 
put_page(iter->pages[i]); } return 0; @@ -683,41 +713,9 @@ static int readpages_iter_init(struct readpages_iter *iter, static inline struct page *readpage_iter_next(struct readpages_iter *iter) { - struct page *page; - unsigned i; - int ret; - - BUG_ON(iter->idx > iter->nr_added); - BUG_ON(iter->nr_added > iter->nr_pages); - - if (iter->idx < iter->nr_added) - goto out; - - while (1) { - if (iter->idx == iter->nr_pages) - return NULL; - - ret = add_to_page_cache_lru_vec(iter->mapping, - iter->pages + iter->nr_added, - iter->nr_pages - iter->nr_added, - iter->offset + iter->nr_added, - GFP_NOFS); - if (ret > 0) - break; - - page = iter->pages[iter->nr_added]; - iter->idx++; - iter->nr_added++; - - __bch2_page_state_release(page); - put_page(page); - } - - iter->nr_added += ret; + if (iter->idx >= iter->nr_pages) + return NULL; - for (i = iter->idx; i < iter->nr_added; i++) - put_page(iter->pages[i]); -out: EBUG_ON(iter->pages[iter->idx]->index != iter->offset + iter->idx); return iter->pages[iter->idx]; @@ -778,11 +776,8 @@ static void readpage_bio_extend(struct readpages_iter *iter, if (!get_more) break; - rcu_read_lock(); - page = radix_tree_lookup(&iter->mapping->i_pages, page_offset); - rcu_read_unlock(); - - if (page && !radix_tree_exceptional_entry(page)) + page = xa_load(&iter->mapping->i_pages, page_offset); + if (page && !xa_is_value(page)) break; page = __page_cache_alloc(readahead_gfp_mask(iter->mapping)); @@ -879,17 +874,18 @@ retry: goto retry; if (ret) { - bcache_io_error(c, &rbio->bio, "btree IO error %i", ret); + bch_err_inum_ratelimited(c, inum, + "read error %i from btree lookup", ret); + rbio->bio.bi_status = BLK_STS_IOERR; bio_endio(&rbio->bio); } bkey_on_stack_exit(&sk, c); } -int bch2_readpages(struct file *file, struct address_space *mapping, - struct list_head *pages, unsigned nr_pages) +void bch2_readahead(struct readahead_control *ractl) { - struct bch_inode_info *inode = to_bch_ei(mapping->host); + struct bch_inode_info *inode = to_bch_ei(ractl->mapping->host); struct bch_fs *c = inode->v.i_sb->s_fs_info; struct bch_io_opts opts = io_opts(c, &inode->ei_inode); struct btree_trans trans; @@ -898,7 +894,7 @@ int bch2_readpages(struct file *file, struct address_space *mapping, struct readpages_iter readpages_iter; int ret; - ret = readpages_iter_init(&readpages_iter, mapping, pages, nr_pages); + ret = readpages_iter_init(&readpages_iter, ractl); BUG_ON(ret); bch2_trans_init(&trans, c, 0, 0); @@ -933,8 +929,6 @@ int bch2_readpages(struct file *file, struct address_space *mapping, bch2_trans_exit(&trans); kfree(readpages_iter.pages); - - return 0; } static void __bchfs_readpage(struct bch_fs *c, struct bch_read_bio *rbio, @@ -1034,32 +1028,35 @@ static void bch2_writepage_io_done(struct closure *cl) struct bch_writepage_io, cl); struct bch_fs *c = io->op.c; struct bio *bio = &io->op.wbio.bio; + struct bvec_iter_all iter; struct bio_vec *bvec; - unsigned i, j; + unsigned i; if (io->op.error) { - bio_for_each_segment_all(bvec, bio, i) { + set_bit(EI_INODE_ERROR, &io->inode->ei_flags); + + bio_for_each_segment_all(bvec, bio, iter) { struct bch_page_state *s; SetPageError(bvec->bv_page); - mapping_set_error(io->inode->v.i_mapping, -EIO); + mapping_set_error(bvec->bv_page->mapping, -EIO); s = __bch2_page_state(bvec->bv_page); spin_lock(&s->lock); - for (j = 0; j < PAGE_SECTORS; j++) - s->s[j].nr_replicas = 0; + for (i = 0; i < PAGE_SECTORS; i++) + s->s[i].nr_replicas = 0; spin_unlock(&s->lock); } } if (io->op.flags & BCH_WRITE_WROTE_DATA_INLINE) { - 
bio_for_each_segment_all(bvec, bio, i) { + bio_for_each_segment_all(bvec, bio, iter) { struct bch_page_state *s; s = __bch2_page_state(bvec->bv_page); spin_lock(&s->lock); - for (j = 0; j < PAGE_SECTORS; j++) - s->s[j].nr_replicas = 0; + for (i = 0; i < PAGE_SECTORS; i++) + s->s[i].nr_replicas = 0; spin_unlock(&s->lock); } } @@ -1083,7 +1080,7 @@ static void bch2_writepage_io_done(struct closure *cl) */ i_sectors_acct(c, io->inode, NULL, io->op.i_sectors_delta); - bio_for_each_segment_all(bvec, bio, i) { + bio_for_each_segment_all(bvec, bio, iter) { struct bch_page_state *s = __bch2_page_state(bvec->bv_page); if (atomic_dec_and_test(&s->write_count)) @@ -1237,7 +1234,7 @@ do_io: if (w->io && (w->io->op.res.nr_replicas != nr_replicas_this_write || - bio_full(&w->io->op.wbio.bio) || + bio_full(&w->io->op.wbio.bio, PAGE_SIZE) || w->io->op.wbio.bio.bi_iter.bi_size + (sectors << 9) >= (BIO_MAX_PAGES * PAGE_SIZE) || bio_end_sector(&w->io->op.wbio.bio) != sector)) @@ -1806,29 +1803,53 @@ static long bch2_dio_write_loop(struct dio_write *dio) struct bch_inode_info *inode = file_bch_inode(req->ki_filp); struct bch_fs *c = inode->v.i_sb->s_fs_info; struct bio *bio = &dio->op.wbio.bio; + struct bvec_iter_all iter; struct bio_vec *bv; - unsigned i, unaligned; - bool sync = dio->sync; + unsigned unaligned, iter_count; + bool sync = dio->sync, dropped_locks; long ret; if (dio->loop) goto loop; while (1) { + iter_count = dio->iter.count; + if (kthread) - use_mm(dio->mm); + kthread_use_mm(dio->mm); BUG_ON(current->faults_disabled_mapping); current->faults_disabled_mapping = mapping; ret = bio_iov_iter_get_pages(bio, &dio->iter); + dropped_locks = fdm_dropped_locks(); + current->faults_disabled_mapping = NULL; if (kthread) - unuse_mm(dio->mm); + kthread_unuse_mm(dio->mm); + + /* + * If the fault handler returned an error but also signalled + * that it dropped & retook ei_pagecache_lock, we just need to + * re-shoot down the page cache and retry: + */ + if (dropped_locks && ret) + ret = 0; if (unlikely(ret < 0)) goto err; + if (unlikely(dropped_locks)) { + ret = write_invalidate_inode_pages_range(mapping, + req->ki_pos, + req->ki_pos + iter_count - 1); + if (unlikely(ret)) + goto err; + + if (!bio->bi_iter.bi_size) + continue; + } + unaligned = bio->bi_iter.bi_size & (block_bytes(c) - 1); bio->bi_iter.bi_size -= unaligned; iov_iter_revert(&dio->iter, unaligned); @@ -1838,7 +1859,7 @@ static long bch2_dio_write_loop(struct dio_write *dio) * bio_iov_iter_get_pages was only able to get < * blocksize worth of pages: */ - bio_for_each_segment_all(bv, bio, i) + bio_for_each_segment_all(bv, bio, iter) put_page(bv->bv_page); ret = -EFAULT; goto err; @@ -1901,9 +1922,15 @@ loop: i_size_write(&inode->v, req->ki_pos); spin_unlock(&inode->v.i_lock); - bio_for_each_segment_all(bv, bio, i) + bio_for_each_segment_all(bv, bio, iter) put_page(bv->bv_page); - if (!dio->iter.count || dio->op.error) + + if (dio->op.error) { + set_bit(EI_INODE_ERROR, &inode->ei_flags); + break; + } + + if (!dio->iter.count) break; bio_reset(bio); @@ -2291,7 +2318,8 @@ int bch2_truncate(struct bch_inode_info *inode, struct iattr *iattr) if (ret) goto err; - BUG_ON(inode->v.i_size < inode_u.bi_size); + WARN_ON(!test_bit(EI_INODE_ERROR, &inode->ei_flags) && + inode->v.i_size < inode_u.bi_size); if (iattr->ia_size > inode->v.i_size) { ret = bch2_extend(inode, &inode_u, iattr); @@ -2475,10 +2503,7 @@ static long bchfs_fcollapse_finsert(struct bch_inode_info *inode, src = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, POS(inode->v.i_ino, src_start >> 
9), BTREE_ITER_INTENT); - BUG_ON(IS_ERR_OR_NULL(src)); - dst = bch2_trans_copy_iter(&trans, src); - BUG_ON(IS_ERR_OR_NULL(dst)); while (1) { struct disk_reservation disk_res = @@ -2818,235 +2843,6 @@ static void mark_range_unallocated(struct bch_inode_info *inode, } while (index <= end_index); } -static int generic_access_check_limits(struct file *file, loff_t pos, - loff_t *count) -{ - struct inode *inode = file->f_mapping->host; - loff_t max_size = inode->i_sb->s_maxbytes; - - if (!(file->f_flags & O_LARGEFILE)) - max_size = MAX_NON_LFS; - - if (unlikely(pos >= max_size)) - return -EFBIG; - *count = min(*count, max_size - pos); - return 0; -} - -static int generic_write_check_limits(struct file *file, loff_t pos, - loff_t *count) -{ - loff_t limit = rlimit(RLIMIT_FSIZE); - - if (limit != RLIM_INFINITY) { - if (pos >= limit) { - send_sig(SIGXFSZ, current, 0); - return -EFBIG; - } - *count = min(*count, limit - pos); - } - - return generic_access_check_limits(file, pos, count); -} - -static int generic_remap_checks(struct file *file_in, loff_t pos_in, - struct file *file_out, loff_t pos_out, - loff_t *req_count, unsigned int remap_flags) -{ - struct inode *inode_in = file_in->f_mapping->host; - struct inode *inode_out = file_out->f_mapping->host; - uint64_t count = *req_count; - uint64_t bcount; - loff_t size_in, size_out; - loff_t bs = inode_out->i_sb->s_blocksize; - int ret; - - /* The start of both ranges must be aligned to an fs block. */ - if (!IS_ALIGNED(pos_in, bs) || !IS_ALIGNED(pos_out, bs)) - return -EINVAL; - - /* Ensure offsets don't wrap. */ - if (pos_in + count < pos_in || pos_out + count < pos_out) - return -EINVAL; - - size_in = i_size_read(inode_in); - size_out = i_size_read(inode_out); - - /* Dedupe requires both ranges to be within EOF. */ - if ((remap_flags & REMAP_FILE_DEDUP) && - (pos_in >= size_in || pos_in + count > size_in || - pos_out >= size_out || pos_out + count > size_out)) - return -EINVAL; - - /* Ensure the infile range is within the infile. */ - if (pos_in >= size_in) - return -EINVAL; - count = min(count, size_in - (uint64_t)pos_in); - - ret = generic_access_check_limits(file_in, pos_in, &count); - if (ret) - return ret; - - ret = generic_write_check_limits(file_out, pos_out, &count); - if (ret) - return ret; - - /* - * If the user wanted us to link to the infile's EOF, round up to the - * next block boundary for this check. - * - * Otherwise, make sure the count is also block-aligned, having - * already confirmed the starting offsets' block alignment. - */ - if (pos_in + count == size_in) { - bcount = ALIGN(size_in, bs) - pos_in; - } else { - if (!IS_ALIGNED(count, bs)) - count = ALIGN_DOWN(count, bs); - bcount = count; - } - - /* Don't allow overlapped cloning within the same file. */ - if (inode_in == inode_out && - pos_out + bcount > pos_in && - pos_out < pos_in + bcount) - return -EINVAL; - - /* - * We shortened the request but the caller can't deal with that, so - * bounce the request back to userspace. 
- */ - if (*req_count != count && !(remap_flags & REMAP_FILE_CAN_SHORTEN)) - return -EINVAL; - - *req_count = count; - return 0; -} - -static int generic_remap_check_len(struct inode *inode_in, - struct inode *inode_out, - loff_t pos_out, - loff_t *len, - unsigned int remap_flags) -{ - u64 blkmask = i_blocksize(inode_in) - 1; - loff_t new_len = *len; - - if ((*len & blkmask) == 0) - return 0; - - if ((remap_flags & REMAP_FILE_DEDUP) || - pos_out + *len < i_size_read(inode_out)) - new_len &= ~blkmask; - - if (new_len == *len) - return 0; - - if (remap_flags & REMAP_FILE_CAN_SHORTEN) { - *len = new_len; - return 0; - } - - return (remap_flags & REMAP_FILE_DEDUP) ? -EBADE : -EINVAL; -} - -static int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in, - struct file *file_out, loff_t pos_out, - loff_t *len, unsigned int remap_flags) -{ - struct inode *inode_in = file_inode(file_in); - struct inode *inode_out = file_inode(file_out); - bool same_inode = (inode_in == inode_out); - int ret; - - /* Don't touch certain kinds of inodes */ - if (IS_IMMUTABLE(inode_out)) - return -EPERM; - - if (IS_SWAPFILE(inode_in) || IS_SWAPFILE(inode_out)) - return -ETXTBSY; - - /* Don't reflink dirs, pipes, sockets... */ - if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode)) - return -EISDIR; - if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode)) - return -EINVAL; - - /* Zero length dedupe exits immediately; reflink goes to EOF. */ - if (*len == 0) { - loff_t isize = i_size_read(inode_in); - - if ((remap_flags & REMAP_FILE_DEDUP) || pos_in == isize) - return 0; - if (pos_in > isize) - return -EINVAL; - *len = isize - pos_in; - if (*len == 0) - return 0; - } - - /* Check that we don't violate system file offset limits. */ - ret = generic_remap_checks(file_in, pos_in, file_out, pos_out, len, - remap_flags); - if (ret) - return ret; - - /* Wait for the completion of any pending IOs on both files */ - inode_dio_wait(inode_in); - if (!same_inode) - inode_dio_wait(inode_out); - - ret = filemap_write_and_wait_range(inode_in->i_mapping, - pos_in, pos_in + *len - 1); - if (ret) - return ret; - - ret = filemap_write_and_wait_range(inode_out->i_mapping, - pos_out, pos_out + *len - 1); - if (ret) - return ret; - - /* - * Check that the extents are the same. - */ - if (remap_flags & REMAP_FILE_DEDUP) { - bool is_same = false; - - ret = vfs_dedupe_file_range_compare(inode_in, pos_in, - inode_out, pos_out, *len, &is_same); - if (ret) - return ret; - if (!is_same) - return -EBADE; - } - - ret = generic_remap_check_len(inode_in, inode_out, pos_out, len, - remap_flags); - if (ret) - return ret; - - /* If can't alter the file contents, we're done. */ - if (!(remap_flags & REMAP_FILE_DEDUP)) { - /* Update the timestamps, since we can alter file contents. */ - if (!(file_out->f_mode & FMODE_NOCMTIME)) { - ret = file_update_time(file_out); - if (ret) - return ret; - } - - /* - * Clear the security bits if the process is not being run by - * root. This keeps people from modifying setuid and setgid - * binaries. 
- */ - ret = file_remove_privs(file_out); - if (ret) - return ret; - } - - return 0; -} - loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src, struct file *file_dst, loff_t pos_dst, loff_t len, unsigned remap_flags) @@ -3241,8 +3037,8 @@ static loff_t page_hole_offset(struct address_space *mapping, loff_t offset) int pg_offset; loff_t ret = -1; - page = find_lock_entry(mapping, index); - if (!page || radix_tree_exception(page)) + page = find_lock_page(mapping, index); + if (!page) return offset; pg_offset = __page_hole_offset(page, offset & (PAGE_SIZE - 1)); diff --git a/fs/bcachefs/fs-io.h b/fs/bcachefs/fs-io.h index 1b593ea707d5..2537a3d25ede 100644 --- a/fs/bcachefs/fs-io.h +++ b/fs/bcachefs/fs-io.h @@ -19,8 +19,7 @@ int bch2_writepage(struct page *, struct writeback_control *); int bch2_readpage(struct file *, struct page *); int bch2_writepages(struct address_space *, struct writeback_control *); -int bch2_readpages(struct file *, struct address_space *, - struct list_head *, unsigned); +void bch2_readahead(struct readahead_control *); int bch2_write_begin(struct file *, struct address_space *, loff_t, unsigned, unsigned, struct page **, void **); @@ -35,10 +34,6 @@ int bch2_fsync(struct file *, loff_t, loff_t, int); int bch2_truncate(struct bch_inode_info *, struct iattr *); long bch2_fallocate_dispatch(struct file *, int, loff_t, loff_t); -#define REMAP_FILE_ADVISORY (0) -#define REMAP_FILE_DEDUP (1 << 0) -#define REMAP_FILE_CAN_SHORTEN (1 << 1) - loff_t bch2_remap_file_range(struct file *, loff_t, struct file *, loff_t, loff_t, unsigned); diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index c04d90b17622..e3edca4d265b 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -91,6 +91,11 @@ void bch2_pagecache_add_put(struct pagecache_lock *lock) __pagecache_lock_put(lock, 1); } +bool bch2_pagecache_add_tryget(struct pagecache_lock *lock) +{ + return __pagecache_lock_tryget(lock, 1); +} + void bch2_pagecache_add_get(struct pagecache_lock *lock) { __pagecache_lock_get(lock, 1); @@ -271,7 +276,8 @@ __bch2_create(struct bch_inode_info *dir, struct dentry *dentry, if (!tmpfile) mutex_lock(&dir->ei_update_lock); - bch2_trans_init(&trans, c, 8, 1024); + bch2_trans_init(&trans, c, 8, + 2048 + (!tmpfile ? 
dentry->d_name.len : 0));
retry:
 	bch2_trans_begin(&trans);
 
@@ -886,6 +892,10 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info,
 	bool have_extent = false;
 	int ret = 0;
 
+	ret = fiemap_prep(&ei->v, info, start, &len, FIEMAP_FLAG_SYNC);
+	if (ret)
+		return ret;
+
 	if (start + len < start)
 		return -EINVAL;
 
@@ -989,15 +999,6 @@ static int bch2_vfs_readdir(struct file *file, struct dir_context *ctx)
 	return bch2_readdir(c, inode->v.i_ino, ctx);
 }
 
-static int bch2_clone_file_range(struct file *file_src, loff_t pos_src,
-				 struct file *file_dst, loff_t pos_dst,
-				 u64 len)
-{
-	return bch2_remap_file_range(file_src, pos_src,
-				     file_dst, pos_dst,
-				     len, 0);
-}
-
 static const struct file_operations bch_file_operations = {
 	.llseek		= bch2_llseek,
 	.read_iter	= bch2_read_iter,
@@ -1015,7 +1016,7 @@ static const struct file_operations bch_file_operations = {
 #ifdef CONFIG_COMPAT
 	.compat_ioctl	= bch2_compat_fs_ioctl,
 #endif
-	.clone_file_range = bch2_clone_file_range,
+	.remap_file_range = bch2_remap_file_range,
 };
 
 static const struct inode_operations bch_file_inode_operations = {
@@ -1085,7 +1086,7 @@ static const struct address_space_operations bch_address_space_operations = {
 	.writepage	= bch2_writepage,
 	.readpage	= bch2_readpage,
 	.writepages	= bch2_writepages,
-	.readpages	= bch2_readpages,
+	.readahead	= bch2_readahead,
 	.set_page_dirty	= __set_page_dirty_nobuffers,
 	.write_begin	= bch2_write_begin,
 	.write_end	= bch2_write_end,
@@ -1150,6 +1151,7 @@ static void bch2_vfs_inode_init(struct bch_fs *c,
 	inode->v.i_generation	= bi->bi_generation;
 	inode->v.i_size		= bi->bi_size;
 
+	inode->ei_flags		= 0;
 	inode->ei_journal_seq	= 0;
 	inode->ei_quota_reserved = 0;
 	inode->ei_str_hash	= bch2_hash_info_init(c, bi);
@@ -1251,7 +1253,7 @@ static void bch2_evict_inode(struct inode *vinode)
 				KEY_TYPE_QUOTA_WARN);
 		bch2_quota_acct(c, inode->ei_qid, Q_INO, -1,
 				KEY_TYPE_QUOTA_WARN);
-		bch2_inode_rm(c, inode->v.i_ino);
+		bch2_inode_rm(c, inode->v.i_ino, true);
 	}
 }
 
@@ -1570,9 +1572,7 @@ got_sb:
 	if (ret)
 		goto err_put_super;
 
-	sb->s_bdi->congested_fn		= bch2_congested;
-	sb->s_bdi->congested_data	= c;
-	sb->s_bdi->ra_pages		= VM_MAX_READAHEAD * 1024 / PAGE_SIZE;
+	sb->s_bdi->ra_pages		= VM_READAHEAD_PAGES;
 
 	for_each_online_member(ca, c, i) {
 		struct block_device *bdev = ca->disk_sb.bdev;
diff --git a/fs/bcachefs/fs.h b/fs/bcachefs/fs.h
index eda903a45325..3df85ffb450c 100644
--- a/fs/bcachefs/fs.h
+++ b/fs/bcachefs/fs.h
@@ -26,12 +26,14 @@ static inline void pagecache_lock_init(struct pagecache_lock *lock)
 }
 
 void bch2_pagecache_add_put(struct pagecache_lock *);
+bool bch2_pagecache_add_tryget(struct pagecache_lock *);
 void bch2_pagecache_add_get(struct pagecache_lock *);
 void bch2_pagecache_block_put(struct pagecache_lock *);
 void bch2_pagecache_block_get(struct pagecache_lock *);
 
 struct bch_inode_info {
 	struct inode		v;
+	unsigned long		ei_flags;
 
 	struct mutex		ei_update_lock;
 	u64			ei_journal_seq;
@@ -49,6 +51,12 @@ struct bch_inode_info {
 	struct bch_inode_unpacked ei_inode;
 };
 
+/*
+ * Set if we've gotten a btree error for this inode, and thus the vfs inode and
+ * btree inode may be inconsistent:
+ */
+#define EI_INODE_ERROR			0
+
 #define to_bch_ei(_inode)					\
 	container_of_or_null(_inode, struct bch_inode_info, v)
 
diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c
index 0c5035270846..39f872de0c18 100644
--- a/fs/bcachefs/fsck.c
+++ b/fs/bcachefs/fsck.c
@@ -230,7 +230,6 @@ static int hash_check_duplicates(struct btree_trans *trans,
 		return 0;
 
 	iter = bch2_trans_copy_iter(trans, h->chain);
-	BUG_ON(IS_ERR(iter));
 
 	for_each_btree_key_continue(iter, 0, k2, ret) {
 		if (bkey_cmp(k2.k->p, k.k->p) >= 0)
@@ -265,10 +264,8 @@ static void hash_set_chain_start(struct btree_trans *trans,
 	hash_stop_chain(trans, h);
 
 	if (!hole) {
-		if (!h->chain) {
+		if (!h->chain)
 			h->chain = bch2_trans_copy_iter(trans, k_iter);
-			BUG_ON(IS_ERR(h->chain));
-		}
 
 		h->chain_end = k.k->p.offset;
 	}
@@ -440,9 +437,6 @@ static int bch2_fix_overlapping_extent(struct btree_trans *trans,
 	bch2_cut_front(cut_at, u);
 
 	u_iter = bch2_trans_copy_iter(trans, iter);
-	ret = PTR_ERR_OR_ZERO(u_iter);
-	if (ret)
-		return ret;
 
 	/*
 	 * We don't want to go through the
@@ -485,7 +479,11 @@ static int check_extents(struct bch_fs *c)
 			   BTREE_ITER_INTENT);
retry:
 	for_each_btree_key_continue(iter, 0, k, ret) {
-		if (bkey_cmp(prev.k->k.p, bkey_start_pos(k.k)) > 0) {
+		/*
+		 * due to retry errors we might see the same extent twice:
+		 */
+		if (bkey_cmp(prev.k->k.p, k.k->p) &&
+		    bkey_cmp(prev.k->k.p, bkey_start_pos(k.k)) > 0) {
 			char buf1[200];
 			char buf2[200];
 
@@ -1254,7 +1252,7 @@ static int check_inode(struct btree_trans *trans,
 
 		bch2_fs_lazy_rw(c);
 
-		ret = bch2_inode_rm(c, u.bi_inum);
+		ret = bch2_inode_rm(c, u.bi_inum, false);
 		if (ret)
 			bch_err(c, "error in fsck: error %i while deleting inode", ret);
 		return ret;
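A pattern repeated throughout this merge (fsck.c above, inode.c below): bch2_trans_get_iter() and bch2_trans_copy_iter() no longer return errors, so the IS_ERR()/PTR_ERR_OR_ZERO() checks after every iterator allocation are simply deleted. The calling convention changes roughly like this; a sketch distilled from the surrounding hunks, not a complete function:

	/* Before: every iterator allocation could fail and had to be checked. */
	iter = bch2_trans_copy_iter(trans, h->chain);
	if (IS_ERR(iter))
		return PTR_ERR(iter);

	/* After: callers just use the iterator. */
	iter = bch2_trans_copy_iter(trans, h->chain);

Why the allocation can no longer fail is not spelled out in the patch itself; the EBUG_ON() changes in btree_update_leaf.c, which now bound update counts by the fixed BTREE_ITER_MAX rather than a per-transaction nr_iters, suggest iterators now come from a fixed-size per-transaction table.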
diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c
index 42371de7f72a..bf1c7319669c 100644
--- a/fs/bcachefs/inode.c
+++ b/fs/bcachefs/inode.c
@@ -302,9 +302,6 @@ struct btree_iter *bch2_inode_peek(struct btree_trans *trans,
 	iter = bch2_trans_get_iter(trans, BTREE_ID_INODES, POS(0, inum),
 				   BTREE_ITER_CACHED|flags);
-	if (IS_ERR(iter))
-		return iter;
-
 	k = bch2_btree_iter_peek_cached(iter);
 	ret = bkey_err(k);
 	if (ret)
@@ -537,10 +534,12 @@ found_slot:
 	inode_u->bi_inum	= k.k->p.offset;
 	inode_u->bi_generation	= bkey_generation(k);
 
-	return bch2_inode_write(trans, iter, inode_u);
+	ret = bch2_inode_write(trans, iter, inode_u);
+	bch2_trans_iter_put(trans, iter);
+	return ret;
 }
 
-int bch2_inode_rm(struct bch_fs *c, u64 inode_nr)
+int bch2_inode_rm(struct bch_fs *c, u64 inode_nr, bool cached)
 {
 	struct btree_trans trans;
 	struct btree_iter *iter;
@@ -551,6 +550,8 @@ int bch2_inode_rm(struct bch_fs *c, u64 inode_nr)
 	u64 bi_generation;
 	int ret;
 
+	bch2_trans_init(&trans, c, 0, 0);
+
 	/*
 	 * If this was a directory, there shouldn't be any real dirents left -
 	 * but there could be whiteouts (from hash collisions) that we should
@@ -559,37 +560,34 @@ int bch2_inode_rm(struct bch_fs *c, u64 inode_nr)
 	 * XXX: the dirent code ideally would delete whiteouts when they're no
 	 * longer needed
 	 */
-	ret = bch2_btree_delete_range(c, BTREE_ID_EXTENTS,
-				      start, end, NULL) ?:
-	      bch2_btree_delete_range(c, BTREE_ID_XATTRS,
-				      start, end, NULL) ?:
-	      bch2_btree_delete_range(c, BTREE_ID_DIRENTS,
-				      start, end, NULL);
+	ret = bch2_btree_delete_range_trans(&trans, BTREE_ID_EXTENTS,
+					    start, end, NULL) ?:
+	      bch2_btree_delete_range_trans(&trans, BTREE_ID_XATTRS,
+					    start, end, NULL) ?:
+	      bch2_btree_delete_range_trans(&trans, BTREE_ID_DIRENTS,
+					    start, end, NULL);
 	if (ret)
-		return ret;
-
-	bch2_trans_init(&trans, c, 0, 0);
+		goto err;
retry:
 	bch2_trans_begin(&trans);
 
 	bi_generation = 0;
 
-	ret = bch2_btree_key_cache_flush(&trans, BTREE_ID_INODES, POS(0, inode_nr));
-	if (ret) {
-		if (ret != -EINTR)
-			bch_err(c, "error flushing btree key cache: %i", ret);
-		goto err;
+	if (cached) {
+		iter = bch2_trans_get_iter(&trans, BTREE_ID_INODES, POS(0, inode_nr),
+					   BTREE_ITER_CACHED|BTREE_ITER_INTENT);
+		k = bch2_btree_iter_peek_cached(iter);
+	} else {
+		iter = bch2_trans_get_iter(&trans, BTREE_ID_INODES, POS(0, inode_nr),
+					   BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
+		k = bch2_btree_iter_peek_slot(iter);
 	}
 
-	iter = bch2_trans_get_iter(&trans, BTREE_ID_INODES, POS(0, inode_nr),
-				   BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
-	k = bch2_btree_iter_peek_slot(iter);
-
 	ret = bkey_err(k);
 	if (ret)
 		goto err;
 
-	bch2_fs_inconsistent_on(k.k->type != KEY_TYPE_inode, c,
+	bch2_fs_inconsistent_on(k.k->type != KEY_TYPE_inode, trans.c,
 				"inode %llu not found when deleting",
 				inode_nr);
 
@@ -639,9 +637,6 @@ int bch2_inode_find_by_inum_trans(struct btree_trans *trans, u64 inode_nr,
 	iter = bch2_trans_get_iter(trans, BTREE_ID_INODES,
 			POS(0, inode_nr), BTREE_ITER_CACHED);
-	if (IS_ERR(iter))
-		return PTR_ERR(iter);
-
 	k = bch2_btree_iter_peek_cached(iter);
 	ret = bkey_err(k);
 	if (ret)
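bch2_inode_rm() now takes a third argument selecting how the inode key is looked up: true goes through the btree key cache (BTREE_ITER_CACHED), false reads the btree directly (BTREE_ITER_SLOTS). The two call sites in this merge show the split:

	/* fs.c, bch2_evict_inode(): the inode was just live in the vfs, so cached */
	bch2_inode_rm(c, inode->v.i_ino, true);

	/* fsck.c, check_inode(): walking the btree cold, so uncached */
	ret = bch2_inode_rm(c, u.bi_inum, false);

The reading of why each caller picks its flag is an inference from context, not something the patch states explicitly.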
diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h
index ef7e885dce0c..dbdfcf63d079 100644
--- a/fs/bcachefs/inode.h
+++ b/fs/bcachefs/inode.h
@@ -71,7 +71,7 @@ void bch2_inode_init(struct bch_fs *, struct bch_inode_unpacked *,
 
 int bch2_inode_create(struct btree_trans *, struct bch_inode_unpacked *);
 
-int bch2_inode_rm(struct bch_fs *, u64);
+int bch2_inode_rm(struct bch_fs *, u64, bool);
 
 int bch2_inode_find_by_inum_trans(struct btree_trans *, u64,
 				  struct bch_inode_unpacked *);
diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c
index 5251e1983c72..15b58a33c8ff 100644
--- a/fs/bcachefs/io.c
+++ b/fs/bcachefs/io.c
@@ -135,10 +135,10 @@ void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw)
 
 void bch2_bio_free_pages_pool(struct bch_fs *c, struct bio *bio)
 {
+	struct bvec_iter_all iter;
 	struct bio_vec *bv;
-	unsigned i;
 
-	bio_for_each_segment_all(bv, bio, i)
+	bio_for_each_segment_all(bv, bio, iter)
 		if (bv->bv_page != ZERO_PAGE(0))
 			mempool_free(bv->bv_page, &c->bio_bounce_pages);
 	bio->bi_vcnt = 0;
@@ -186,36 +186,33 @@ void bch2_bio_alloc_pages_pool(struct bch_fs *c, struct bio *bio,
 static int sum_sector_overwrites(struct btree_trans *trans,
 				 struct btree_iter *extent_iter,
 				 struct bkey_i *new,
-				 bool may_allocate,
 				 bool *maybe_extending,
-				 s64 *delta)
+				 s64 *i_sectors_delta,
+				 s64 *disk_sectors_delta)
 {
 	struct btree_iter *iter;
 	struct bkey_s_c old;
 	int ret = 0;
 
-	*maybe_extending = true;
-	*delta = 0;
+	*maybe_extending	= true;
+	*i_sectors_delta	= 0;
+	*disk_sectors_delta	= 0;
 
 	iter = bch2_trans_copy_iter(trans, extent_iter);
-	if (IS_ERR(iter))
-		return PTR_ERR(iter);
 
 	for_each_btree_key_continue(iter, BTREE_ITER_SLOTS, old, ret) {
-		if (!may_allocate &&
-		    bch2_bkey_nr_ptrs_fully_allocated(old) <
-		    bch2_bkey_nr_ptrs_allocated(bkey_i_to_s_c(new))) {
-			ret = -ENOSPC;
-			break;
-		}
+		s64 sectors = min(new->k.p.offset, old.k->p.offset) -
+			max(bkey_start_offset(&new->k),
+			    bkey_start_offset(old.k));
 
-		*delta += (min(new->k.p.offset,
-			       old.k->p.offset) -
-			   max(bkey_start_offset(&new->k),
-			       bkey_start_offset(old.k))) *
+		*i_sectors_delta += sectors *
 			(bkey_extent_is_allocation(&new->k) -
 			 bkey_extent_is_allocation(old.k));
 
+		*disk_sectors_delta += sectors *
+			(int) (bch2_bkey_nr_ptrs_allocated(bkey_i_to_s_c(new)) -
+			       bch2_bkey_nr_ptrs_fully_allocated(old));
+
 		if (bkey_cmp(old.k->p, new->k.p) >= 0) {
 			/*
 			 * Check if there's already data above where we're
@@ -249,12 +246,12 @@ int bch2_extent_update(struct btree_trans *trans,
 		       struct disk_reservation *disk_res,
 		       u64 *journal_seq,
 		       u64 new_i_size,
-		       s64 *i_sectors_delta)
+		       s64 *i_sectors_delta_total)
 {
 	/* this must live until after bch2_trans_commit(): */
 	struct bkey_inode_buf inode_p;
 	bool extending = false;
-	s64 delta = 0;
+	s64 i_sectors_delta = 0, disk_sectors_delta = 0;
 	int ret;
 
 	ret =
bch2_extent_trim_atomic(k, iter); @@ -262,16 +259,30 @@ int bch2_extent_update(struct btree_trans *trans, return ret; ret = sum_sector_overwrites(trans, iter, k, - disk_res && disk_res->sectors != 0, - &extending, &delta); + &extending, + &i_sectors_delta, + &disk_sectors_delta); if (ret) return ret; + if (disk_res && + disk_sectors_delta > (s64) disk_res->sectors) { + pr_info("disk_sectors_delta %lli disk_res %llu", + disk_sectors_delta, + disk_res->sectors); + + ret = bch2_disk_reservation_add(trans->c, disk_res, + disk_sectors_delta - disk_res->sectors, + 0); + if (ret) + return ret; + } + new_i_size = extending ? min(k->k.p.offset << 9, new_i_size) : 0; - if (delta || new_i_size) { + if (i_sectors_delta || new_i_size) { struct btree_iter *inode_iter; struct bch_inode_unpacked inode_u; @@ -298,9 +309,9 @@ int bch2_extent_update(struct btree_trans *trans, else new_i_size = 0; - inode_u.bi_sectors += delta; + inode_u.bi_sectors += i_sectors_delta; - if (delta || new_i_size) { + if (i_sectors_delta || new_i_size) { bch2_inode_pack(trans->c, &inode_p, &inode_u); bch2_trans_update(trans, inode_iter, &inode_p.inode.k_i, 0); @@ -315,10 +326,12 @@ int bch2_extent_update(struct btree_trans *trans, BTREE_INSERT_NOCHECK_RW| BTREE_INSERT_NOFAIL| BTREE_INSERT_USE_RESERVE); - if (!ret && i_sectors_delta) - *i_sectors_delta += delta; + if (ret) + return ret; - return ret; + if (i_sectors_delta_total) + *i_sectors_delta_total += i_sectors_delta; + return 0; } int bch2_fpunch_at(struct btree_trans *trans, struct btree_iter *iter, @@ -578,7 +591,8 @@ static void __bch2_write_index(struct bch_write_op *op) op->written += sectors_start - keylist_sectors(keys); if (ret) { - __bcache_io_error(c, "btree IO error %i", ret); + bch_err_inum_ratelimited(c, op->pos.inode, + "write error %i from btree update", ret); op->error = ret; } } @@ -623,7 +637,10 @@ static void bch2_write_endio(struct bio *bio) struct bch_fs *c = wbio->c; struct bch_dev *ca = bch_dev_bkey_exists(c, wbio->dev); - if (bch2_dev_io_err_on(bio->bi_status, ca, "data write: %s", + if (bch2_dev_inum_io_err_on(bio->bi_status, ca, + op->pos.inode, + op->pos.offset - bio_sectors(bio), /* XXX definitely wrong */ + "data write error: %s", bch2_blk_status_to_str(bio->bi_status))) set_bit(wbio->dev, op->failed.d); @@ -1281,15 +1298,14 @@ void bch2_write(struct closure *cl) wbio_init(bio)->put_bio = false; if (bio_sectors(bio) & (c->opts.block_size - 1)) { - __bcache_io_error(c, "misaligned write"); + bch_err_inum_ratelimited(c, op->pos.inode, + "misaligned write"); op->error = -EIO; goto err; } if (c->opts.nochanges || !percpu_ref_tryget(&c->writes)) { - if (!(op->flags & BCH_WRITE_FROM_INTERNAL)) - __bcache_io_error(c, "read only"); op->error = -EROFS; goto err; } @@ -1718,7 +1734,8 @@ retry: * reading a btree node */ BUG_ON(!ret); - __bcache_io_error(c, "btree IO error: %i", ret); + bch_err_inum_ratelimited(c, inode, + "read error %i from btree lookup", ret); err: rbio->bio.bi_status = BLK_STS_IOERR; out: @@ -1790,9 +1807,6 @@ static int __bch2_rbio_narrow_crcs(struct btree_trans *trans, iter = bch2_trans_get_iter(trans, BTREE_ID_EXTENTS, rbio->pos, BTREE_ITER_SLOTS|BTREE_ITER_INTENT); - if ((ret = PTR_ERR_OR_ZERO(iter))) - goto out; - k = bch2_btree_iter_peek_slot(iter); if ((ret = bkey_err(k))) goto out; @@ -1925,17 +1939,15 @@ csum_err: return; } - bch2_dev_io_error(ca, - "data checksum error, inode %llu offset %llu: expected %0llx:%0llx got %0llx:%0llx (type %u)", - rbio->pos.inode, (u64) rbio->bvec_iter.bi_sector, + bch2_dev_inum_io_error(ca, 
rbio->pos.inode, (u64) rbio->bvec_iter.bi_sector, + "data checksum error: expected %0llx:%0llx got %0llx:%0llx (type %u)", rbio->pick.crc.csum.hi, rbio->pick.crc.csum.lo, csum.hi, csum.lo, crc.csum_type); bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR); return; decompression_err: - __bcache_io_error(c, "decompression error, inode %llu offset %llu", - rbio->pos.inode, - (u64) rbio->bvec_iter.bi_sector); + bch_err_inum_ratelimited(c, rbio->pos.inode, + "decompression error"); bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR); return; } @@ -1957,7 +1969,14 @@ static void bch2_read_endio(struct bio *bio) if (!rbio->split) rbio->bio.bi_end_io = rbio->end_io; - if (bch2_dev_io_err_on(bio->bi_status, ca, "data read; %s", + /* + * XXX: rbio->pos is not what we want here when reading from indirect + * extents + */ + if (bch2_dev_inum_io_err_on(bio->bi_status, ca, + rbio->pos.inode, + rbio->pos.offset, + "data read error: %s", bch2_blk_status_to_str(bio->bi_status))) { bch2_rbio_error(rbio, READ_RETRY_AVOID, bio->bi_status); return; @@ -2000,10 +2019,6 @@ int __bch2_read_indirect_extent(struct btree_trans *trans, iter = bch2_trans_get_iter(trans, BTREE_ID_REFLINK, POS(0, reflink_offset), BTREE_ITER_SLOTS); - ret = PTR_ERR_OR_ZERO(iter); - if (ret) - return ret; - k = bch2_btree_iter_peek_slot(iter); ret = bkey_err(k); if (ret) @@ -2011,7 +2026,7 @@ int __bch2_read_indirect_extent(struct btree_trans *trans, if (k.k->type != KEY_TYPE_reflink_v && k.k->type != KEY_TYPE_indirect_inline_data) { - __bcache_io_error(trans->c, + bch_err_inum_ratelimited(trans->c, orig_k->k->k.p.inode, "pointer to nonexistent indirect extent"); ret = -EIO; goto err; @@ -2057,7 +2072,8 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, goto hole; if (pick_ret < 0) { - __bcache_io_error(c, "no device to read from"); + bch_err_inum_ratelimited(c, k.k->p.inode, + "no device to read from"); goto err; } @@ -2207,7 +2223,8 @@ get_bio: if (!rbio->pick.idx) { if (!rbio->have_ioref) { - __bcache_io_error(c, "no device to read from"); + bch_err_inum_ratelimited(c, k.k->p.inode, + "no device to read from"); bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR); goto out; } @@ -2357,7 +2374,9 @@ err: if (ret == -EINTR) goto retry; - bcache_io_error(c, &rbio->bio, "btree IO error: %i", ret); + bch_err_inum_ratelimited(c, inode, + "read error %i from btree lookup", ret); + rbio->bio.bi_status = BLK_STS_IOERR; bch2_rbio_done(rbio); goto out; } diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index c2cafd3892a4..d54424829378 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -18,7 +18,19 @@ #include <trace/events/bcachefs.h> -static inline struct journal_buf *journal_seq_to_buf(struct journal *, u64); +static u64 last_unwritten_seq(struct journal *j) +{ + union journal_res_state s = READ_ONCE(j->reservations); + + lockdep_assert_held(&j->lock); + + return journal_cur_seq(j) - ((s.idx - s.unwritten_idx) & JOURNAL_BUF_MASK); +} + +static inline bool journal_seq_unwritten(struct journal *j, u64 seq) +{ + return seq >= last_unwritten_seq(j); +} static bool __journal_entry_is_open(union journal_res_state state) { @@ -30,6 +42,22 @@ static bool journal_entry_is_open(struct journal *j) return __journal_entry_is_open(j->reservations); } +static inline struct journal_buf * +journal_seq_to_buf(struct journal *j, u64 seq) +{ + struct journal_buf *buf = NULL; + + EBUG_ON(seq > journal_cur_seq(j)); + EBUG_ON(seq == journal_cur_seq(j) && + j->reservations.cur_entry_offset == 
JOURNAL_ENTRY_CLOSED_VAL); + + if (journal_seq_unwritten(j, seq)) { + buf = j->buf + (seq & JOURNAL_BUF_MASK); + EBUG_ON(le64_to_cpu(buf->data->seq) != seq); + } + return buf; +} + static void journal_pin_new_entry(struct journal *j, int count) { struct journal_entry_pin_list *p; @@ -51,6 +79,10 @@ static void bch2_journal_buf_init(struct journal *j) { struct journal_buf *buf = journal_cur_buf(j); + bkey_extent_init(&buf->key); + buf->noflush = false; + buf->must_flush = false; + memset(buf->has_inode, 0, sizeof(buf->has_inode)); memset(buf->data, 0, sizeof(*buf->data)); @@ -72,21 +104,15 @@ void bch2_journal_halt(struct journal *j) } while ((v = atomic64_cmpxchg(&j->reservations.counter, old.v, new.v)) != old.v); + j->err_seq = journal_cur_seq(j); journal_wake(j); closure_wake_up(&journal_cur_buf(j)->wait); } /* journal entry close/open: */ -void __bch2_journal_buf_put(struct journal *j, bool need_write_just_set) +void __bch2_journal_buf_put(struct journal *j) { - if (!need_write_just_set && - test_bit(JOURNAL_NEED_WRITE, &j->flags)) - bch2_time_stats_update(j->delay_time, - j->need_write_time); - - clear_bit(JOURNAL_NEED_WRITE, &j->flags); - closure_call(&j->io, bch2_journal_write, system_highpri_wq, NULL); } @@ -99,7 +125,6 @@ static bool __journal_entry_close(struct journal *j) struct journal_buf *buf = journal_cur_buf(j); union journal_res_state old, new; u64 v = atomic64_read(&j->reservations.counter); - bool set_need_write = false; unsigned sectors; lockdep_assert_held(&j->lock); @@ -118,15 +143,13 @@ static bool __journal_entry_close(struct journal *j) if (!test_bit(JOURNAL_NEED_WRITE, &j->flags)) { set_bit(JOURNAL_NEED_WRITE, &j->flags); j->need_write_time = local_clock(); - set_need_write = true; } - if (new.prev_buf_unwritten) - return false; - new.cur_entry_offset = JOURNAL_ENTRY_CLOSED_VAL; new.idx++; - new.prev_buf_unwritten = 1; + + if (new.idx == new.unwritten_idx) + return false; BUG_ON(journal_state_count(new, new.idx)); } while ((v = atomic64_cmpxchg(&j->reservations.counter, @@ -139,8 +162,6 @@ static bool __journal_entry_close(struct journal *j) BUG_ON(sectors > buf->sectors); buf->sectors = sectors; - bkey_extent_init(&buf->key); - /* * We have to set last_seq here, _before_ opening a new journal entry: * @@ -162,29 +183,44 @@ static bool __journal_entry_close(struct journal *j) */ buf->data->last_seq = cpu_to_le64(journal_last_seq(j)); - if (journal_entry_empty(buf->data)) - clear_bit(JOURNAL_NOT_EMPTY, &j->flags); - else - set_bit(JOURNAL_NOT_EMPTY, &j->flags); + __bch2_journal_pin_put(j, le64_to_cpu(buf->data->seq)); journal_pin_new_entry(j, 1); bch2_journal_buf_init(j); cancel_delayed_work(&j->write_work); + clear_bit(JOURNAL_NEED_WRITE, &j->flags); bch2_journal_space_available(j); - bch2_journal_buf_put(j, old.idx, set_need_write); + bch2_journal_buf_put(j, old.idx); return true; } +static bool journal_entry_want_write(struct journal *j) +{ + union journal_res_state s = READ_ONCE(j->reservations); + bool ret = false; + + /* + * Don't close it yet if we already have a write in flight, but do set + * NEED_WRITE: + */ + if (s.idx != s.unwritten_idx) + set_bit(JOURNAL_NEED_WRITE, &j->flags); + else + ret = __journal_entry_close(j); + + return ret; +} + static bool journal_entry_close(struct journal *j) { bool ret; spin_lock(&j->lock); - ret = __journal_entry_close(j); + ret = journal_entry_want_write(j); spin_unlock(&j->lock); return ret; @@ -202,16 +238,19 @@ static bool journal_entry_close(struct journal *j) */ static int journal_entry_open(struct journal *j) { + 
struct bch_fs *c = container_of(j, struct bch_fs, journal); struct journal_buf *buf = journal_cur_buf(j); union journal_res_state old, new; int u64s; u64 v; + BUG_ON(BCH_SB_CLEAN(c->disk_sb.sb)); + lockdep_assert_held(&j->lock); BUG_ON(journal_entry_is_open(j)); if (j->blocked) - return -EAGAIN; + return cur_entry_blocked; if (j->cur_entry_error) return j->cur_entry_error; @@ -227,7 +266,7 @@ static int journal_entry_open(struct journal *j) u64s = clamp_t(int, u64s, 0, JOURNAL_ENTRY_CLOSED_VAL - 1); if (u64s <= le32_to_cpu(buf->data->u64s)) - return -ENOSPC; + return cur_entry_journal_full; /* * Must be set before marking the journal entry as open: @@ -239,7 +278,7 @@ static int journal_entry_open(struct journal *j) old.v = new.v = v; if (old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL) - return -EROFS; + return cur_entry_insufficient_devices; /* Handle any already added entries */ new.cur_entry_offset = le32_to_cpu(buf->data->u64s); @@ -263,8 +302,8 @@ static int journal_entry_open(struct journal *j) static bool journal_quiesced(struct journal *j) { - union journal_res_state state = READ_ONCE(j->reservations); - bool ret = !state.prev_buf_unwritten && !__journal_entry_is_open(state); + union journal_res_state s = READ_ONCE(j->reservations); + bool ret = s.idx == s.unwritten_idx && !__journal_entry_is_open(s); if (!ret) journal_entry_close(j); @@ -291,17 +330,29 @@ static void journal_write_work(struct work_struct *work) u64 bch2_inode_journal_seq(struct journal *j, u64 inode) { size_t h = hash_64(inode, ilog2(sizeof(j->buf[0].has_inode) * 8)); - u64 seq = 0; + union journal_res_state s; + unsigned i; + u64 seq; - if (!test_bit(h, j->buf[0].has_inode) && - !test_bit(h, j->buf[1].has_inode)) - return 0; spin_lock(&j->lock); - if (test_bit(h, journal_cur_buf(j)->has_inode)) - seq = journal_cur_seq(j); - else if (test_bit(h, journal_prev_buf(j)->has_inode)) - seq = journal_cur_seq(j) - 1; + seq = journal_cur_seq(j); + s = READ_ONCE(j->reservations); + i = s.idx; + + while (1) { + if (test_bit(h, j->buf[i].has_inode)) + goto out; + + if (i == s.unwritten_idx) + break; + + i = (i - 1) & JOURNAL_BUF_MASK; + seq--; + } + + seq = 0; +out: spin_unlock(&j->lock); return seq; @@ -352,7 +403,7 @@ retry: * Don't want to close current journal entry, just need to * invoke reclaim: */ - ret = -ENOSPC; + ret = cur_entry_journal_full; goto unlock; } @@ -375,14 +426,16 @@ retry: * there's still a previous one in flight: */ trace_journal_entry_full(c); - ret = -EAGAIN; + ret = cur_entry_blocked; } else { ret = journal_entry_open(j); } unlock: - if ((ret == -EAGAIN || ret == -ENOSPC) && - !j->res_get_blocked_start) + if ((ret && ret != cur_entry_insufficient_devices) && + !j->res_get_blocked_start) { j->res_get_blocked_start = local_clock() ?: 1; + trace_journal_full(c); + } can_discard = j->can_discard; spin_unlock(&j->lock); @@ -390,32 +443,25 @@ unlock: if (!ret) goto retry; - if (ret == -ENOSPC) { - WARN_ONCE(!can_discard && (flags & JOURNAL_RES_GET_RESERVED), - "JOURNAL_RES_GET_RESERVED set but journal full"); - - /* - * Journal is full - can't rely on reclaim from work item due to - * freezing: - */ - trace_journal_full(c); - - if (!(flags & JOURNAL_RES_GET_NONBLOCK)) { - if (can_discard) { - bch2_journal_do_discards(j); - goto retry; - } - - if (mutex_trylock(&j->reclaim_lock)) { - bch2_journal_reclaim(j); - mutex_unlock(&j->reclaim_lock); - } + /* + * Journal is full - can't rely on reclaim from work item due to + * freezing: + */ + if ((ret == cur_entry_journal_full || + ret == 
cur_entry_journal_pin_full) && + !(flags & JOURNAL_RES_GET_NONBLOCK)) { + if (can_discard) { + bch2_journal_do_discards(j); + goto retry; } - ret = -EAGAIN; + if (mutex_trylock(&j->reclaim_lock)) { + bch2_journal_reclaim(j); + mutex_unlock(&j->reclaim_lock); + } } - return ret; + return ret == cur_entry_insufficient_devices ? -EROFS : -EAGAIN; } /* @@ -448,8 +494,10 @@ static bool journal_preres_available(struct journal *j, { bool ret = bch2_journal_preres_get_fast(j, res, new_u64s, flags); - if (!ret) - bch2_journal_reclaim_work(&j->reclaim_work.work); + if (!ret && mutex_trylock(&j->reclaim_lock)) { + bch2_journal_reclaim(j); + mutex_unlock(&j->reclaim_lock); + } return ret; } @@ -503,168 +551,80 @@ out: /* journal flushing: */ -u64 bch2_journal_last_unwritten_seq(struct journal *j) -{ - u64 seq; - - spin_lock(&j->lock); - seq = journal_cur_seq(j); - if (j->reservations.prev_buf_unwritten) - seq--; - spin_unlock(&j->lock); - - return seq; -} - /** - * bch2_journal_open_seq_async - try to open a new journal entry if @seq isn't - * open yet, or wait if we cannot + * bch2_journal_flush_seq_async - wait for a journal entry to be written * - * used by the btree interior update machinery, when it needs to write a new - * btree root - every journal entry contains the roots of all the btrees, so it - * doesn't need to bother with getting a journal reservation + * like bch2_journal_wait_on_seq, except that it triggers a write immediately if + * necessary */ -int bch2_journal_open_seq_async(struct journal *j, u64 seq, struct closure *cl) +int bch2_journal_flush_seq_async(struct journal *j, u64 seq, + struct closure *parent) { - struct bch_fs *c = container_of(j, struct bch_fs, journal); - int ret; - - spin_lock(&j->lock); - - /* - * Can't try to open more than one sequence number ahead: - */ - BUG_ON(journal_cur_seq(j) < seq && !journal_entry_is_open(j)); + struct journal_buf *buf; + int ret = 0; - if (journal_cur_seq(j) > seq || - journal_entry_is_open(j)) { - spin_unlock(&j->lock); - return 0; - } + if (seq <= j->flushed_seq_ondisk) + return 1; - if (journal_cur_seq(j) < seq && - !__journal_entry_close(j)) { - /* haven't finished writing out the previous one: */ - trace_journal_entry_full(c); - ret = -EAGAIN; - } else { - BUG_ON(journal_cur_seq(j) != seq); + spin_lock(&j->lock); - ret = journal_entry_open(j); + /* Recheck under lock: */ + if (j->err_seq && seq >= j->err_seq) { + ret = -EIO; + goto out; } - if ((ret == -EAGAIN || ret == -ENOSPC) && - !j->res_get_blocked_start) - j->res_get_blocked_start = local_clock() ?: 1; - - if (ret == -EAGAIN || ret == -ENOSPC) - closure_wait(&j->async_wait, cl); - - spin_unlock(&j->lock); - - if (ret == -ENOSPC) { - trace_journal_full(c); - bch2_journal_reclaim_work(&j->reclaim_work.work); - ret = -EAGAIN; + if (seq <= j->flushed_seq_ondisk) { + ret = 1; + goto out; } - return ret; -} - -static int journal_seq_error(struct journal *j, u64 seq) -{ - union journal_res_state state = READ_ONCE(j->reservations); - - if (seq == journal_cur_seq(j)) - return bch2_journal_error(j); - - if (seq + 1 == journal_cur_seq(j) && - !state.prev_buf_unwritten && - seq > j->seq_ondisk) - return -EIO; - - return 0; -} + /* if seq was written, but not flushed - flush a newer one instead */ + seq = max(seq, last_unwritten_seq(j)); -static inline struct journal_buf * -journal_seq_to_buf(struct journal *j, u64 seq) -{ - /* seq should be for a journal entry that has been opened: */ - BUG_ON(seq > journal_cur_seq(j)); - BUG_ON(seq == journal_cur_seq(j) && - 
j->reservations.cur_entry_offset == JOURNAL_ENTRY_CLOSED_VAL); +recheck_need_open: + if (seq == journal_cur_seq(j) && !journal_entry_is_open(j)) { + struct journal_res res = { 0 }; - if (seq == journal_cur_seq(j)) - return journal_cur_buf(j); - if (seq + 1 == journal_cur_seq(j) && - j->reservations.prev_buf_unwritten) - return journal_prev_buf(j); - return NULL; -} + spin_unlock(&j->lock); -/** - * bch2_journal_wait_on_seq - wait for a journal entry to be written - * - * does _not_ cause @seq to be written immediately - if there is no other - * activity to cause the relevant journal entry to be filled up or flushed it - * can wait for an arbitrary amount of time (up to @j->write_delay_ms, which is - * configurable). - */ -void bch2_journal_wait_on_seq(struct journal *j, u64 seq, - struct closure *parent) -{ - struct journal_buf *buf; + ret = bch2_journal_res_get(j, &res, jset_u64s(0), 0); + if (ret) + return ret; - spin_lock(&j->lock); + seq = res.seq; + buf = j->buf + (seq & JOURNAL_BUF_MASK); + buf->must_flush = true; + set_bit(JOURNAL_NEED_WRITE, &j->flags); - if ((buf = journal_seq_to_buf(j, seq))) { - if (!closure_wait(&buf->wait, parent)) + if (parent && !closure_wait(&buf->wait, parent)) BUG(); - if (seq == journal_cur_seq(j)) { - smp_mb(); - if (bch2_journal_error(j)) - closure_wake_up(&buf->wait); - } - } - - spin_unlock(&j->lock); -} - -/** - * bch2_journal_flush_seq_async - wait for a journal entry to be written - * - * like bch2_journal_wait_on_seq, except that it triggers a write immediately if - * necessary - */ -void bch2_journal_flush_seq_async(struct journal *j, u64 seq, - struct closure *parent) -{ - struct journal_buf *buf; - - spin_lock(&j->lock); + bch2_journal_res_put(j, &res); - if (parent && - (buf = journal_seq_to_buf(j, seq))) - if (!closure_wait(&buf->wait, parent)) - BUG(); - - if (seq == journal_cur_seq(j)) - __journal_entry_close(j); - spin_unlock(&j->lock); -} + spin_lock(&j->lock); + goto want_write; + } -static int journal_seq_flushed(struct journal *j, u64 seq) -{ - int ret; + /* + * if write was kicked off without a flush, flush the next sequence + * number instead + */ + buf = journal_seq_to_buf(j, seq); + if (buf->noflush) { + seq++; + goto recheck_need_open; + } - spin_lock(&j->lock); - ret = seq <= j->seq_ondisk ? 1 : journal_seq_error(j, seq); + buf->must_flush = true; + if (parent && !closure_wait(&buf->wait, parent)) + BUG(); +want_write: if (seq == journal_cur_seq(j)) - __journal_entry_close(j); + journal_entry_want_write(j); +out: spin_unlock(&j->lock); - return ret; } @@ -673,28 +633,13 @@ int bch2_journal_flush_seq(struct journal *j, u64 seq) u64 start_time = local_clock(); int ret, ret2; - ret = wait_event_killable(j->wait, (ret2 = journal_seq_flushed(j, seq))); + ret = wait_event_killable(j->wait, (ret2 = bch2_journal_flush_seq_async(j, seq, NULL))); bch2_time_stats_update(j->flush_seq_time, start_time); return ret ?: ret2 < 0 ? 
ret2 : 0; } -/** - * bch2_journal_meta_async - force a journal entry to be written - */ -void bch2_journal_meta_async(struct journal *j, struct closure *parent) -{ - struct journal_res res; - - memset(&res, 0, sizeof(res)); - - bch2_journal_res_get(j, &res, jset_u64s(0), 0); - bch2_journal_res_put(j, &res); - - bch2_journal_flush_seq_async(j, res.seq, parent); -} - int bch2_journal_meta(struct journal *j) { struct journal_res res; @@ -790,16 +735,19 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, if (nr <= ja->nr) return 0; - ret = -ENOMEM; new_buckets = kzalloc(nr * sizeof(u64), GFP_KERNEL); new_bucket_seq = kzalloc(nr * sizeof(u64), GFP_KERNEL); - if (!new_buckets || !new_bucket_seq) + if (!new_buckets || !new_bucket_seq) { + ret = -ENOMEM; goto err; + } journal_buckets = bch2_sb_resize_journal(&ca->disk_sb, - nr + sizeof(*journal_buckets) / sizeof(u64)); - if (!journal_buckets) + nr + sizeof(*journal_buckets) / sizeof(u64)); + if (!journal_buckets) { + ret = -ENOSPC; goto err; + } /* * We may be called from the device add path, before the new device has @@ -828,8 +776,10 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, goto err; } } else { + rcu_read_lock(); ob = bch2_bucket_alloc(c, ca, RESERVE_ALLOC, false, cl); + rcu_read_unlock(); if (IS_ERR(ob)) { ret = cl ? -EAGAIN : -ENOSPC; goto err; @@ -843,6 +793,12 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, spin_lock(&c->journal.lock); } + /* + * XXX + * For resize at runtime, we should be writing the new + * superblock before inserting into the journal array + */ + pos = ja->nr ? (ja->cur_idx + 1) % ja->nr : 0; __array_insert_item(ja->buckets, ja->nr, pos); __array_insert_item(ja->bucket_seq, ja->nr, pos); @@ -875,9 +831,9 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, if (!new_fs) bch2_open_bucket_put(c, ob); } - - ret = 0; err: + bch2_sb_resize_journal(&ca->disk_sb, + ja->nr + sizeof(*journal_buckets) / sizeof(u64)); kfree(new_bucket_seq); kfree(new_buckets); @@ -955,15 +911,18 @@ int bch2_dev_journal_alloc(struct bch_dev *ca) static bool bch2_journal_writing_to_device(struct journal *j, unsigned dev_idx) { union journal_res_state state; - struct journal_buf *w; - bool ret; + bool ret = false; + unsigned i; spin_lock(&j->lock); state = READ_ONCE(j->reservations); - w = j->buf + !state.idx; + i = state.idx; - ret = state.prev_buf_unwritten && - bch2_bkey_has_device(bkey_i_to_s_c(&w->key), dev_idx); + while (i != state.unwritten_idx) { + i = (i - 1) & JOURNAL_BUF_MASK; + if (bch2_bkey_has_device(bkey_i_to_s_c(&j->buf[i].key), dev_idx)) + ret = true; + } spin_unlock(&j->lock); return ret; @@ -989,10 +948,11 @@ void bch2_fs_journal_stop(struct journal *j) journal_quiesce(j); BUG_ON(!bch2_journal_error(j) && - test_bit(JOURNAL_NOT_EMPTY, &j->flags)); + (journal_entry_is_open(j) || + j->last_empty_seq + 1 != journal_cur_seq(j))); cancel_delayed_work_sync(&j->write_work); - cancel_delayed_work_sync(&j->reclaim_work); + bch2_journal_reclaim_stop(j); } int bch2_fs_journal_start(struct journal *j, u64 cur_seq, @@ -1045,8 +1005,12 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq, spin_lock(&j->lock); set_bit(JOURNAL_STARTED, &j->flags); + j->last_flush_write = jiffies; journal_pin_new_entry(j, 1); + + j->reservations.idx = j->reservations.unwritten_idx = journal_cur_seq(j); + bch2_journal_buf_init(j); c->last_bucket_seq_cleanup = journal_cur_seq(j); @@ -1100,8 +1064,10 @@ int bch2_dev_journal_init(struct bch_dev *ca, 
struct bch_sb *sb) void bch2_fs_journal_exit(struct journal *j) { - kvpfree(j->buf[1].data, j->buf[1].buf_size); - kvpfree(j->buf[0].data, j->buf[0].buf_size); + unsigned i; + + for (i = 0; i < ARRAY_SIZE(j->buf); i++) + kvpfree(j->buf[i].data, j->buf[i].buf_size); free_fifo(&j->pin); } @@ -1109,6 +1075,7 @@ int bch2_fs_journal_init(struct journal *j) { struct bch_fs *c = container_of(j, struct bch_fs, journal); static struct lock_class_key res_key; + unsigned i; int ret = 0; pr_verbose_init(c->opts, ""); @@ -1117,15 +1084,12 @@ int bch2_fs_journal_init(struct journal *j) spin_lock_init(&j->err_lock); init_waitqueue_head(&j->wait); INIT_DELAYED_WORK(&j->write_work, journal_write_work); - INIT_DELAYED_WORK(&j->reclaim_work, bch2_journal_reclaim_work); init_waitqueue_head(&j->pin_flush_wait); mutex_init(&j->reclaim_lock); mutex_init(&j->discard_lock); lockdep_init_map(&j->res_map, "journal res", &res_key, 0); - j->buf[0].buf_size = JOURNAL_ENTRY_SIZE_MIN; - j->buf[1].buf_size = JOURNAL_ENTRY_SIZE_MIN; j->write_delay_ms = 1000; j->reclaim_delay_ms = 100; @@ -1137,13 +1101,20 @@ int bch2_fs_journal_init(struct journal *j) ((union journal_res_state) { .cur_entry_offset = JOURNAL_ENTRY_CLOSED_VAL }).v); - if (!(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL)) || - !(j->buf[0].data = kvpmalloc(j->buf[0].buf_size, GFP_KERNEL)) || - !(j->buf[1].data = kvpmalloc(j->buf[1].buf_size, GFP_KERNEL))) { + if (!(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL))) { ret = -ENOMEM; goto out; } + for (i = 0; i < ARRAY_SIZE(j->buf); i++) { + j->buf[i].buf_size = JOURNAL_ENTRY_SIZE_MIN; + j->buf[i].data = kvpmalloc(j->buf[i].buf_size, GFP_KERNEL); + if (!j->buf[i].data) { + ret = -ENOMEM; + goto out; + } + } + j->pin.front = j->pin.back = 1; out: pr_verbose_init(c->opts, "ret %i", ret); @@ -1152,15 +1123,14 @@ out: /* debug: */ -void bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) +void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) { struct bch_fs *c = container_of(j, struct bch_fs, journal); union journal_res_state s; struct bch_dev *ca; - unsigned iter; + unsigned i; rcu_read_lock(); - spin_lock(&j->lock); s = READ_ONCE(j->reservations); pr_buf(out, @@ -1169,7 +1139,12 @@ void bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) "last_seq:\t\t%llu\n" "last_seq_ondisk:\t%llu\n" "prereserved:\t\t%u/%u\n" + "nr flush writes:\t%llu\n" + "nr noflush writes:\t%llu\n" + "nr direct reclaim:\t%llu\n" + "nr background reclaim:\t%llu\n" "current entry sectors:\t%u\n" + "current entry error:\t%u\n" "current entry:\t\t", fifo_used(&j->pin), journal_cur_seq(j), @@ -1177,7 +1152,12 @@ void bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) j->last_seq_ondisk, j->prereserved.reserved, j->prereserved.remaining, - j->cur_entry_sectors); + j->nr_flush_writes, + j->nr_noflush_writes, + j->nr_direct_reclaim, + j->nr_background_reclaim, + j->cur_entry_sectors, + j->cur_entry_error); switch (s.cur_entry_offset) { case JOURNAL_ENTRY_ERROR_VAL: @@ -1194,16 +1174,16 @@ void bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) } pr_buf(out, - "current entry refs:\t%u\n" - "prev entry unwritten:\t", - journal_state_count(s, s.idx)); - - if (s.prev_buf_unwritten) - pr_buf(out, "yes, ref %u sectors %u\n", - journal_state_count(s, !s.idx), - journal_prev_buf(j)->sectors); - else - pr_buf(out, "no\n"); + "current entry:\t\tidx %u refcount %u\n", + s.idx, journal_state_count(s, s.idx)); + + i = s.idx; + while (i != s.unwritten_idx) { + i = (i - 1) & 
JOURNAL_BUF_MASK; + + pr_buf(out, "unwritten entry:\tidx %u refcount %u sectors %u\n", + i, journal_state_count(s, i), j->buf[i].sectors); + } pr_buf(out, "need write:\t\t%i\n" @@ -1211,7 +1191,21 @@ void bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) test_bit(JOURNAL_NEED_WRITE, &j->flags), test_bit(JOURNAL_REPLAY_DONE, &j->flags)); - for_each_member_device_rcu(ca, c, iter, + pr_buf(out, "space:\n"); + pr_buf(out, "\tdiscarded\t%u:%u\n", + j->space[journal_space_discarded].next_entry, + j->space[journal_space_discarded].total); + pr_buf(out, "\tclean ondisk\t%u:%u\n", + j->space[journal_space_clean_ondisk].next_entry, + j->space[journal_space_clean_ondisk].total); + pr_buf(out, "\tclean\t\t%u:%u\n", + j->space[journal_space_clean].next_entry, + j->space[journal_space_clean].total); + pr_buf(out, "\ttotal\t\t%u:%u\n", + j->space[journal_space_total].next_entry, + j->space[journal_space_total].total); + + for_each_member_device_rcu(ca, c, i, &c->rw_devs[BCH_DATA_journal]) { struct journal_device *ja = &ca->journal; @@ -1221,12 +1215,13 @@ void bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) pr_buf(out, "dev %u:\n" "\tnr\t\t%u\n" + "\tbucket size\t%u\n" "\tavailable\t%u:%u\n" - "\tdiscard_idx\t\t%u\n" - "\tdirty_idx_ondisk\t%u (seq %llu)\n" - "\tdirty_idx\t\t%u (seq %llu)\n" + "\tdiscard_idx\t%u\n" + "\tdirty_ondisk\t%u (seq %llu)\n" + "\tdirty_idx\t%u (seq %llu)\n" "\tcur_idx\t\t%u (seq %llu)\n", - iter, ja->nr, + i, ja->nr, ca->mi.bucket_size, bch2_journal_dev_buckets_available(j, ja, journal_space_discarded), ja->sectors_free, ja->discard_idx, @@ -1235,10 +1230,16 @@ void bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) ja->cur_idx, ja->bucket_seq[ja->cur_idx]); } - spin_unlock(&j->lock); rcu_read_unlock(); } +void bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) +{ + spin_lock(&j->lock); + __bch2_journal_debug_to_text(out, j); + spin_unlock(&j->lock); +} + void bch2_journal_pins_to_text(struct printbuf *out, struct journal *j) { struct journal_entry_pin_list *pin_list; diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h index 0cbe9df384f9..1db1f190a168 100644 --- a/fs/bcachefs/journal.h +++ b/fs/bcachefs/journal.h @@ -127,11 +127,6 @@ static inline struct journal_buf *journal_cur_buf(struct journal *j) return j->buf + j->reservations.idx; } -static inline struct journal_buf *journal_prev_buf(struct journal *j) -{ - return j->buf + !j->reservations.idx; -} - /* Sequence number of oldest dirty journal entry */ static inline u64 journal_last_seq(struct journal *j) @@ -141,7 +136,7 @@ static inline u64 journal_last_seq(struct journal *j) static inline u64 journal_cur_seq(struct journal *j) { - BUG_ON(j->pin.back - 1 != atomic64_read(&j->seq)); + EBUG_ON(j->pin.back - 1 != atomic64_read(&j->seq)); return j->pin.back - 1; } @@ -151,13 +146,21 @@ void bch2_journal_set_has_inum(struct journal *, u64, u64); static inline int journal_state_count(union journal_res_state s, int idx) { - return idx == 0 ? 
s.buf0_count : s.buf1_count; + switch (idx) { + case 0: return s.buf0_count; + case 1: return s.buf1_count; + case 2: return s.buf2_count; + case 3: return s.buf3_count; + } + BUG(); } static inline void journal_state_inc(union journal_res_state *s) { s->buf0_count += s->idx == 0; s->buf1_count += s->idx == 1; + s->buf2_count += s->idx == 2; + s->buf3_count += s->idx == 3; } static inline void bch2_journal_set_has_inode(struct journal *j, @@ -255,21 +258,24 @@ static inline bool journal_entry_empty(struct jset *j) return true; } -void __bch2_journal_buf_put(struct journal *, bool); +void __bch2_journal_buf_put(struct journal *); -static inline void bch2_journal_buf_put(struct journal *j, unsigned idx, - bool need_write_just_set) +static inline void bch2_journal_buf_put(struct journal *j, unsigned idx) { union journal_res_state s; s.v = atomic64_sub_return(((union journal_res_state) { .buf0_count = idx == 0, .buf1_count = idx == 1, + .buf2_count = idx == 2, + .buf3_count = idx == 3, }).v, &j->reservations.counter); - if (!journal_state_count(s, idx)) { - EBUG_ON(s.idx == idx || !s.prev_buf_unwritten); - __bch2_journal_buf_put(j, need_write_just_set); - } + + EBUG_ON(((s.idx - idx) & 3) > + ((s.idx - s.unwritten_idx) & 3)); + + if (!journal_state_count(s, idx) && idx == s.unwritten_idx) + __bch2_journal_buf_put(j); } /* @@ -282,14 +288,14 @@ static inline void bch2_journal_res_put(struct journal *j, if (!res->ref) return; - lock_release(&j->res_map, 0, _THIS_IP_); + lock_release(&j->res_map, _THIS_IP_); while (res->u64s) bch2_journal_add_entry(j, res, BCH_JSET_ENTRY_btree_keys, 0, 0, NULL, 0); - bch2_journal_buf_put(j, res->idx, false); + bch2_journal_buf_put(j, res->idx); res->ref = 0; } @@ -325,11 +331,18 @@ static inline int journal_res_get_fast(struct journal *j, !test_bit(JOURNAL_MAY_GET_UNRESERVED, &j->flags)) return 0; - if (flags & JOURNAL_RES_GET_CHECK) - return 1; - new.cur_entry_offset += res->u64s; journal_state_inc(&new); + + /* + * If the refcount would overflow, we have to wait: + * XXX - tracepoint this: + */ + if (!journal_state_count(new, new.idx)) + return 0; + + if (flags & JOURNAL_RES_GET_CHECK) + return 1; } while ((v = atomic64_cmpxchg(&j->reservations.counter, old.v, new.v)) != old.v); @@ -371,7 +384,7 @@ out: static inline bool journal_check_may_get_unreserved(struct journal *j) { union journal_preres_state s = READ_ONCE(j->prereserved); - bool ret = s.reserved <= s.remaining && + bool ret = s.reserved < s.remaining && fifo_free(&j->pin) > 8; lockdep_assert_held(&j->lock); @@ -464,13 +477,8 @@ void bch2_journal_entry_res_resize(struct journal *, struct journal_entry_res *, unsigned); -u64 bch2_journal_last_unwritten_seq(struct journal *); -int bch2_journal_open_seq_async(struct journal *, u64, struct closure *); - -void bch2_journal_wait_on_seq(struct journal *, u64, struct closure *); -void bch2_journal_flush_seq_async(struct journal *, u64, struct closure *); +int bch2_journal_flush_seq_async(struct journal *, u64, struct closure *); void bch2_journal_flush_async(struct journal *, struct closure *); -void bch2_journal_meta_async(struct journal *, struct closure *); int bch2_journal_flush_seq(struct journal *, u64); int bch2_journal_flush(struct journal *); @@ -500,6 +508,7 @@ static inline void bch2_journal_set_replay_done(struct journal *j) void bch2_journal_unblock(struct journal *); void bch2_journal_block(struct journal *); +void __bch2_journal_debug_to_text(struct printbuf *, struct journal *); void bch2_journal_debug_to_text(struct printbuf *, struct journal 
*); void bch2_journal_pins_to_text(struct printbuf *, struct journal *); diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index bd0e6b371701..0e6fbe2f6a75 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -10,10 +10,27 @@ #include "journal.h" #include "journal_io.h" #include "journal_reclaim.h" +#include "journal_seq_blacklist.h" #include "replicas.h" #include <trace/events/bcachefs.h> +static void __journal_replay_free(struct journal_replay *i) +{ + list_del(&i->list); + kvpfree(i, offsetof(struct journal_replay, j) + + vstruct_bytes(&i->j)); + +} + +static void journal_replay_free(struct bch_fs *c, struct journal_replay *i) +{ + i->ignore = true; + + if (!c->opts.read_entire_journal) + __journal_replay_free(i); +} + struct journal_list { struct closure cl; struct mutex lock; @@ -36,28 +53,29 @@ static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca, struct bch_devs_list devs = { .nr = 0 }; struct list_head *where; size_t bytes = vstruct_bytes(j); - __le64 last_seq; + u64 last_seq = 0; int ret; - last_seq = !list_empty(jlist->head) - ? list_last_entry(jlist->head, struct journal_replay, - list)->j.last_seq - : 0; - - if (!c->opts.read_entire_journal) { - /* Is this entry older than the range we need? */ - if (le64_to_cpu(j->seq) < le64_to_cpu(last_seq)) { - ret = JOURNAL_ENTRY_ADD_OUT_OF_RANGE; - goto out; + list_for_each_entry_reverse(i, jlist->head, list) { + if (!JSET_NO_FLUSH(&i->j)) { + last_seq = le64_to_cpu(i->j.last_seq); + break; } + } + + /* Is this entry older than the range we need? */ + if (!c->opts.read_entire_journal && + le64_to_cpu(j->seq) < last_seq) { + ret = JOURNAL_ENTRY_ADD_OUT_OF_RANGE; + goto out; + } - /* Drop entries we don't need anymore */ + /* Drop entries we don't need anymore */ + if (!JSET_NO_FLUSH(j)) { list_for_each_entry_safe(i, pos, jlist->head, list) { if (le64_to_cpu(i->j.seq) >= le64_to_cpu(j->last_seq)) break; - list_del(&i->list); - kvpfree(i, offsetof(struct journal_replay, j) + - vstruct_bytes(&i->j)); + journal_replay_free(c, i); } } @@ -81,9 +99,7 @@ add: if (i && le64_to_cpu(j->seq) == le64_to_cpu(i->j.seq)) { if (i->bad) { devs = i->devs; - list_del(&i->list); - kvpfree(i, offsetof(struct journal_replay, j) + - vstruct_bytes(&i->j)); + __journal_replay_free(i); } else if (bad) { goto found; } else { @@ -105,6 +121,7 @@ add: list_add(&i->list, where); i->devs = devs; i->bad = bad; + i->ignore = false; memcpy(&i->j, j, bytes); found: if (!bch2_dev_list_has_dev(i->devs, ca->dev_idx)) @@ -161,6 +178,8 @@ static void journal_entry_null_range(void *start, void *end) #define journal_entry_err_on(cond, c, msg, ...) \ ((cond) ? 
journal_entry_err(c, msg, ##__VA_ARGS__) : false) +#define FSCK_DELETED_KEY 5 + static int journal_validate_key(struct bch_fs *c, struct jset *jset, struct jset_entry *entry, unsigned level, enum btree_id btree_id, @@ -173,28 +192,42 @@ static int journal_validate_key(struct bch_fs *c, struct jset *jset, int ret = 0; if (journal_entry_err_on(!k->k.u64s, c, - "invalid %s in journal: k->u64s 0", type)) { + "invalid %s in jset %llu offset %zi/%u entry offset %zi/%u: k->u64s 0", + type, le64_to_cpu(jset->seq), + (u64 *) entry - jset->_data, + le32_to_cpu(jset->u64s), + (u64 *) k - entry->_data, + le16_to_cpu(entry->u64s))) { entry->u64s = cpu_to_le16((u64 *) k - entry->_data); journal_entry_null_range(vstruct_next(entry), next); - return 0; + return FSCK_DELETED_KEY; } if (journal_entry_err_on((void *) bkey_next(k) > (void *) vstruct_next(entry), c, - "invalid %s in journal: extends past end of journal entry", - type)) { + "invalid %s in jset %llu offset %zi/%u entry offset %zi/%u: extends past end of journal entry", + type, le64_to_cpu(jset->seq), + (u64 *) entry - jset->_data, + le32_to_cpu(jset->u64s), + (u64 *) k - entry->_data, + le16_to_cpu(entry->u64s))) { entry->u64s = cpu_to_le16((u64 *) k - entry->_data); journal_entry_null_range(vstruct_next(entry), next); - return 0; + return FSCK_DELETED_KEY; } if (journal_entry_err_on(k->k.format != KEY_FORMAT_CURRENT, c, - "invalid %s in journal: bad format %u", - type, k->k.format)) { - le16_add_cpu(&entry->u64s, -k->k.u64s); + "invalid %s in jset %llu offset %zi/%u entry offset %zi/%u: bad format %u", + type, le64_to_cpu(jset->seq), + (u64 *) entry - jset->_data, + le32_to_cpu(jset->u64s), + (u64 *) k - entry->_data, + le16_to_cpu(entry->u64s), + k->k.format)) { + le16_add_cpu(&entry->u64s, -((u16) k->k.u64s)); memmove(k, bkey_next(k), next - (void *) bkey_next(k)); journal_entry_null_range(vstruct_next(entry), next); - return 0; + return FSCK_DELETED_KEY; } if (!write) @@ -208,13 +241,18 @@ static int journal_validate_key(struct bch_fs *c, struct jset *jset, char buf[160]; bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(k)); - mustfix_fsck_err(c, "invalid %s in journal: %s\n%s", - type, invalid, buf); - - le16_add_cpu(&entry->u64s, -k->k.u64s); + mustfix_fsck_err(c, "invalid %s in jset %llu offset %zi/%u entry offset %zi/%u: %s\n%s", + type, le64_to_cpu(jset->seq), + (u64 *) entry - jset->_data, + le32_to_cpu(jset->u64s), + (u64 *) k - entry->_data, + le16_to_cpu(entry->u64s), + invalid, buf); + + le16_add_cpu(&entry->u64s, -((u16) k->k.u64s)); memmove(k, bkey_next(k), next - (void *) bkey_next(k)); journal_entry_null_range(vstruct_next(entry), next); - return 0; + return FSCK_DELETED_KEY; } if (write) @@ -230,15 +268,17 @@ static int journal_entry_validate_btree_keys(struct bch_fs *c, struct jset_entry *entry, int write) { - struct bkey_i *k; + struct bkey_i *k = entry->start; - vstruct_for_each(entry, k) { + while (k != vstruct_last(entry)) { int ret = journal_validate_key(c, jset, entry, entry->level, entry->btree_id, k, "key", write); - if (ret) - return ret; + if (ret == FSCK_DELETED_KEY) + continue; + + k = bkey_next(k); } return 0; @@ -432,46 +472,45 @@ static int jset_validate(struct bch_fs *c, "%s sector %llu seq %llu: unknown journal entry version %u", ca->name, sector, le64_to_cpu(jset->seq), version)) { - /* XXX: note we might have missing journal entries */ - return JOURNAL_ENTRY_BAD; + /* don't try to continue: */ + return EINVAL; } + if (bytes > (sectors_read << 9) && + sectors_read < bucket_sectors_left) + return 
JOURNAL_ENTRY_REREAD; + if (journal_entry_err_on(bytes > bucket_sectors_left << 9, c, "%s sector %llu seq %llu: journal entry too big (%zu bytes)", ca->name, sector, le64_to_cpu(jset->seq), bytes)) { - /* XXX: note we might have missing journal entries */ - return JOURNAL_ENTRY_BAD; + ret = JOURNAL_ENTRY_BAD; + le32_add_cpu(&jset->u64s, + -((bytes - (bucket_sectors_left << 9)) / 8)); } - if (bytes > sectors_read << 9) - return JOURNAL_ENTRY_REREAD; - if (fsck_err_on(!bch2_checksum_type_valid(c, JSET_CSUM_TYPE(jset)), c, "%s sector %llu seq %llu: journal entry with unknown csum type %llu", ca->name, sector, le64_to_cpu(jset->seq), - JSET_CSUM_TYPE(jset))) - return JOURNAL_ENTRY_BAD; + JSET_CSUM_TYPE(jset))) { + ret = JOURNAL_ENTRY_BAD; + goto bad_csum_type; + } csum = csum_vstruct(c, JSET_CSUM_TYPE(jset), journal_nonce(jset), jset); if (journal_entry_err_on(bch2_crc_cmp(csum, jset->csum), c, "%s sector %llu seq %llu: journal checksum bad", - ca->name, sector, le64_to_cpu(jset->seq))) { - /* XXX: retry IO, when we start retrying checksum errors */ - /* XXX: note we might have missing journal entries */ - return JOURNAL_ENTRY_BAD; - } + ca->name, sector, le64_to_cpu(jset->seq))) + ret = JOURNAL_ENTRY_BAD; bch2_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset), jset->encrypted_start, vstruct_end(jset) - (void *) jset->encrypted_start); - +bad_csum_type: if (journal_entry_err_on(le64_to_cpu(jset->last_seq) > le64_to_cpu(jset->seq), c, "invalid journal entry: last_seq > seq")) { jset->last_seq = jset->seq; return JOURNAL_ENTRY_BAD; } - - return 0; fsck_err: return ret; } @@ -536,7 +575,7 @@ reread: bio_put(bio); if (bch2_dev_io_err_on(ret, ca, - "journal read from sector %llu", + "journal read error: sector %llu", offset) || bch2_meta_read_fault("journal")) return -EIO; @@ -677,14 +716,16 @@ err: goto out; } -int bch2_journal_read(struct bch_fs *c, struct list_head *list) +int bch2_journal_read(struct bch_fs *c, struct list_head *list, + u64 *blacklist_seq, u64 *start_seq) { struct journal_list jlist; - struct journal_replay *i; + struct journal_replay *i, *t; struct bch_dev *ca; unsigned iter; size_t keys = 0, entries = 0; bool degraded = false; + u64 seq, last_seq = 0; int ret = 0; closure_init_stack(&jlist.cl); @@ -713,12 +754,97 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list) if (jlist.ret) return jlist.ret; + if (list_empty(list)) { + bch_info(c, "journal read done, but no entries found"); + return 0; + } + + i = list_last_entry(list, struct journal_replay, list); + *start_seq = le64_to_cpu(i->j.seq) + 1; + + /* + * Find most recent flush entry, and ignore newer non flush entries - + * those entries will be blacklisted: + */ + list_for_each_entry_safe_reverse(i, t, list, list) { + if (i->ignore) + continue; + + if (!JSET_NO_FLUSH(&i->j)) { + last_seq = le64_to_cpu(i->j.last_seq); + *blacklist_seq = le64_to_cpu(i->j.seq) + 1; + break; + } + + journal_replay_free(c, i); + } + + if (!last_seq) { + fsck_err(c, "journal read done, but no entries found after dropping non-flushes"); + return -1; + } + + /* Drop blacklisted entries and entries older than last_seq: */ + list_for_each_entry_safe(i, t, list, list) { + if (i->ignore) + continue; + + seq = le64_to_cpu(i->j.seq); + if (seq < last_seq) { + journal_replay_free(c, i); + continue; + } + + if (bch2_journal_seq_is_blacklisted(c, seq, true)) { + fsck_err_on(!JSET_NO_FLUSH(&i->j), c, + "found blacklisted journal entry %llu", seq); + + journal_replay_free(c, i); + } + } + + /* Check for missing entries: */ + seq = last_seq; 
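/*
 * Illustrative aside, not part of the patch: the gap-detection loop that
 * follows walks seq upward from last_seq and treats a sequence number as
 * accounted for when it is either present in the journal list or
 * blacklisted; every other seq is reported as a missing range. A minimal
 * standalone sketch of that logic, where present[] stands in for the
 * replay list and is_blacklisted() for bch2_journal_seq_is_blacklisted():
 */
#include <stdbool.h>
#include <stdio.h>

/* hypothetical stand-in for bch2_journal_seq_is_blacklisted(): */
static bool is_blacklisted(unsigned long long seq)
{
	return seq == 7;
}

int main(void)
{
	/* seqs of the entries we actually read, sorted ascending: */
	unsigned long long present[] = { 4, 5, 6, 8, 11 };
	unsigned long long seq = 4;	/* last_seq */
	size_t i, n = sizeof(present) / sizeof(present[0]);

	for (i = 0; i < n; i++) {
		while (seq < present[i]) {
			unsigned long long missing_start, missing_end;

			/* blacklisted seqs are expected to be absent: */
			while (seq < present[i] && is_blacklisted(seq))
				seq++;
			if (seq == present[i])
				break;

			missing_start = seq;
			while (seq < present[i] && !is_blacklisted(seq))
				seq++;
			missing_end = seq - 1;

			/* prints "journal entries 9-10 missing!" for this data: */
			printf("journal entries %llu-%llu missing!\n",
			       missing_start, missing_end);
		}
		seq = present[i] + 1;
	}
	return 0;
}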
+ list_for_each_entry(i, list, list) { + if (i->ignore) + continue; + + BUG_ON(seq > le64_to_cpu(i->j.seq)); + + while (seq < le64_to_cpu(i->j.seq)) { + u64 missing_start, missing_end; + + while (seq < le64_to_cpu(i->j.seq) && + bch2_journal_seq_is_blacklisted(c, seq, false)) + seq++; + + if (seq == le64_to_cpu(i->j.seq)) + break; + + missing_start = seq; + + while (seq < le64_to_cpu(i->j.seq) && + !bch2_journal_seq_is_blacklisted(c, seq, false)) + seq++; + + missing_end = seq - 1; + fsck_err(c, "journal entries %llu-%llu missing! (replaying %llu-%llu)", + missing_start, missing_end, + last_seq, *blacklist_seq - 1); + } + + seq++; + } + list_for_each_entry(i, list, list) { struct jset_entry *entry; struct bkey_i *k, *_n; struct bch_replicas_padded replicas; char buf[80]; + if (i->ignore) + continue; + ret = jset_validate_entries(c, &i->j, READ); if (ret) goto fsck_err; @@ -746,12 +872,12 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list) entries++; } - if (!list_empty(list)) { - i = list_last_entry(list, struct journal_replay, list); + bch_info(c, "journal read done, %zu keys in %zu entries, seq %llu", + keys, entries, *start_seq); - bch_info(c, "journal read done, %zu keys in %zu entries, seq %llu", - keys, entries, le64_to_cpu(i->j.seq)); - } + if (*start_seq != *blacklist_seq) + bch_info(c, "dropped unflushed entries %llu-%llu", + *blacklist_seq, *start_seq - 1); fsck_err: return ret; } @@ -929,36 +1055,51 @@ static void journal_buf_realloc(struct journal *j, struct journal_buf *buf) buf->buf_size = new_size; } +static inline struct journal_buf *journal_last_unwritten_buf(struct journal *j) +{ + return j->buf + j->reservations.unwritten_idx; +} + static void journal_write_done(struct closure *cl) { struct journal *j = container_of(cl, struct journal, io); struct bch_fs *c = container_of(j, struct bch_fs, journal); - struct journal_buf *w = journal_prev_buf(j); + struct journal_buf *w = journal_last_unwritten_buf(j); struct bch_devs_list devs = bch2_bkey_devs(bkey_i_to_s_c(&w->key)); struct bch_replicas_padded replicas; + union journal_res_state old, new; u64 seq = le64_to_cpu(w->data->seq); u64 last_seq = le64_to_cpu(w->data->last_seq); + u64 v; + int err = 0; bch2_time_stats_update(j->write_time, j->write_start_time); if (!devs.nr) { bch_err(c, "unable to write journal to sufficient devices"); - goto err; + err = -EIO; + } else { + bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal, devs); + if (bch2_mark_replicas(c, &replicas.e)) + err = -EIO; } - bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal, devs); - - if (bch2_mark_replicas(c, &replicas.e)) - goto err; + if (err) + bch2_fatal_error(c); spin_lock(&j->lock); if (seq >= j->pin.front) journal_seq_pin(j, seq)->devs = devs; j->seq_ondisk = seq; - j->last_seq_ondisk = last_seq; - bch2_journal_space_available(j); + if (err && (!j->err_seq || seq < j->err_seq)) + j->err_seq = seq; + + if (!w->noflush) { + j->flushed_seq_ondisk = seq; + j->last_seq_ondisk = last_seq; + } /* * Updating last_seq_ondisk may let bch2_journal_reclaim_work() discard @@ -967,14 +1108,21 @@ static void journal_write_done(struct closure *cl) * Must come before signaling write completion, for * bch2_fs_journal_stop(): */ - mod_delayed_work(c->journal_reclaim_wq, &j->reclaim_work, 0); -out: + journal_reclaim_kick(&c->journal); + /* also must come before signalling write completion: */ closure_debug_destroy(cl); - BUG_ON(!j->reservations.prev_buf_unwritten); - atomic64_sub(((union journal_res_state) { .prev_buf_unwritten = 1 }).v, - 
&j->reservations.counter); + v = atomic64_read(&j->reservations.counter); + do { + old.v = new.v = v; + BUG_ON(new.idx == new.unwritten_idx); + + new.unwritten_idx++; + } while ((v = atomic64_cmpxchg(&j->reservations.counter, + old.v, new.v)) != old.v); + + bch2_journal_space_available(j); closure_wake_up(&w->wait); journal_wake(j); @@ -982,11 +1130,10 @@ out: if (test_bit(JOURNAL_NEED_WRITE, &j->flags)) mod_delayed_work(system_freezable_wq, &j->write_work, 0); spin_unlock(&j->lock); - return; -err: - bch2_fatal_error(c); - spin_lock(&j->lock); - goto out; + + if (new.unwritten_idx != new.idx && + !journal_state_count(new, new.unwritten_idx)) + closure_call(&j->io, bch2_journal_write, system_highpri_wq, NULL); } static void journal_write_endio(struct bio *bio) @@ -994,10 +1141,10 @@ static void journal_write_endio(struct bio *bio) struct bch_dev *ca = bio->bi_private; struct journal *j = &ca->fs->journal; - if (bch2_dev_io_err_on(bio->bi_status, ca, "journal write: %s", + if (bch2_dev_io_err_on(bio->bi_status, ca, "journal write error: %s", bch2_blk_status_to_str(bio->bi_status)) || bch2_meta_write_fault("journal")) { - struct journal_buf *w = journal_prev_buf(j); + struct journal_buf *w = journal_last_unwritten_buf(j); unsigned long flags; spin_lock_irqsave(&j->err_lock, flags); @@ -1014,7 +1161,7 @@ void bch2_journal_write(struct closure *cl) struct journal *j = container_of(cl, struct journal, io); struct bch_fs *c = container_of(j, struct bch_fs, journal); struct bch_dev *ca; - struct journal_buf *w = journal_prev_buf(j); + struct journal_buf *w = journal_last_unwritten_buf(j); struct jset_entry *start, *end; struct jset *jset; struct bio *bio; @@ -1023,13 +1170,29 @@ void bch2_journal_write(struct closure *cl) unsigned i, sectors, bytes, u64s; int ret; - bch2_journal_pin_put(j, le64_to_cpu(w->data->seq)); + BUG_ON(BCH_SB_CLEAN(c->disk_sb.sb)); journal_buf_realloc(j, w); jset = w->data; j->write_start_time = local_clock(); + spin_lock(&j->lock); + if (c->sb.features & (1ULL << BCH_FEATURE_journal_no_flush) && + !w->must_flush && + (jiffies - j->last_flush_write) < msecs_to_jiffies(j->write_delay_ms) && + test_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags)) { + w->noflush = true; + SET_JSET_NO_FLUSH(jset, true); + jset->last_seq = cpu_to_le64(j->last_seq_ondisk); + + j->nr_noflush_writes++; + } else { + j->last_flush_write = jiffies; + j->nr_flush_writes++; + } + spin_unlock(&j->lock); + /* * New btree roots are set by journalling them; when the journal entry * gets written we have to propagate them to c->btree_roots @@ -1067,6 +1230,9 @@ void bch2_journal_write(struct closure *cl) SET_JSET_BIG_ENDIAN(jset, CPU_BIG_ENDIAN); SET_JSET_CSUM_TYPE(jset, bch2_meta_checksum_type(c)); + if (journal_entry_empty(jset)) + j->last_empty_seq = le64_to_cpu(jset->seq); + if (bch2_csum_type_is_encryption(JSET_CSUM_TYPE(jset))) validate_before_checksum = true; @@ -1148,8 +1314,9 @@ retry_alloc: bio->bi_iter.bi_sector = ptr->offset; bio->bi_end_io = journal_write_endio; bio->bi_private = ca; - bio_set_op_attrs(bio, REQ_OP_WRITE, - REQ_SYNC|REQ_META|REQ_PREFLUSH|REQ_FUA); + bio->bi_opf = REQ_OP_WRITE|REQ_SYNC|REQ_META; + if (!JSET_NO_FLUSH(jset)) + bio->bi_opf |= REQ_PREFLUSH|REQ_FUA; bch2_bio_map(bio, jset, sectors << 9); trace_journal_write(bio); @@ -1158,20 +1325,21 @@ retry_alloc: ca->journal.bucket_seq[ca->journal.cur_idx] = le64_to_cpu(jset->seq); } - for_each_rw_member(ca, c, i) - if (journal_flushes_device(ca) && - !bch2_bkey_has_device(bkey_i_to_s_c(&w->key), i)) { - percpu_ref_get(&ca->io_ref); - - bio 
= ca->journal.bio; - bio_reset(bio); - bio_set_dev(bio, ca->disk_sb.bdev); - bio->bi_opf = REQ_OP_FLUSH; - bio->bi_end_io = journal_write_endio; - bio->bi_private = ca; - closure_bio_submit(bio, cl); - } - + if (!JSET_NO_FLUSH(jset)) { + for_each_rw_member(ca, c, i) + if (journal_flushes_device(ca) && + !bch2_bkey_has_device(bkey_i_to_s_c(&w->key), i)) { + percpu_ref_get(&ca->io_ref); + + bio = ca->journal.bio; + bio_reset(bio); + bio_set_dev(bio, ca->disk_sb.bdev); + bio->bi_opf = REQ_OP_FLUSH; + bio->bi_end_io = journal_write_endio; + bio->bi_private = ca; + closure_bio_submit(bio, cl); + } + } no_io: bch2_bucket_seq_cleanup(c); diff --git a/fs/bcachefs/journal_io.h b/fs/bcachefs/journal_io.h index 6958ee0f8cf2..6b4c80968f52 100644 --- a/fs/bcachefs/journal_io.h +++ b/fs/bcachefs/journal_io.h @@ -11,6 +11,7 @@ struct journal_replay { struct bch_devs_list devs; /* checksum error, but we may want to try using it anyways: */ bool bad; + bool ignore; /* must be last: */ struct jset j; }; @@ -37,7 +38,7 @@ static inline struct jset_entry *__jset_entry_type_next(struct jset *jset, for_each_jset_entry_type(entry, jset, BCH_JSET_ENTRY_btree_keys) \ vstruct_for_each_safe(entry, k, _n) -int bch2_journal_read(struct bch_fs *, struct list_head *); +int bch2_journal_read(struct bch_fs *, struct list_head *, u64 *, u64 *); void bch2_journal_write(struct closure *); diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c index 18e45296e7de..b77d4e7f42d6 100644 --- a/fs/bcachefs/journal_reclaim.c +++ b/fs/bcachefs/journal_reclaim.c @@ -1,12 +1,18 @@ // SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" +#include "btree_key_cache.h" +#include "error.h" #include "journal.h" #include "journal_io.h" #include "journal_reclaim.h" #include "replicas.h" #include "super.h" +#include <linux/kthread.h> +#include <linux/sched/mm.h> +#include <trace/events/bcachefs.h> + /* Free space calculations: */ static unsigned journal_space_from(struct journal_device *ja, @@ -53,82 +59,108 @@ static void journal_set_remaining(struct journal *j, unsigned u64s_remaining) old.v, new.v)) != old.v); } -static struct journal_space { - unsigned next_entry; - unsigned remaining; -} __journal_space_available(struct journal *j, unsigned nr_devs_want, - enum journal_space_from from) +static inline unsigned get_unwritten_sectors(struct journal *j, unsigned *idx) { - struct bch_fs *c = container_of(j, struct bch_fs, journal); - struct bch_dev *ca; - unsigned sectors_next_entry = UINT_MAX; - unsigned sectors_total = UINT_MAX; - unsigned i, nr_devs = 0; - unsigned unwritten_sectors = j->reservations.prev_buf_unwritten - ? 
journal_prev_buf(j)->sectors - : 0; + unsigned sectors = 0; - rcu_read_lock(); - for_each_member_device_rcu(ca, c, i, - &c->rw_devs[BCH_DATA_journal]) { - struct journal_device *ja = &ca->journal; - unsigned buckets_this_device, sectors_this_device; + while (!sectors && *idx != j->reservations.idx) { + sectors = j->buf[*idx].sectors; - if (!ja->nr) - continue; + *idx = (*idx + 1) & JOURNAL_BUF_MASK; + } - buckets_this_device = bch2_journal_dev_buckets_available(j, ja, from); - sectors_this_device = ja->sectors_free; + return sectors; +} - /* - * We that we don't allocate the space for a journal entry - * until we write it out - thus, account for it here: - */ - if (unwritten_sectors >= sectors_this_device) { - if (!buckets_this_device) - continue; +static struct journal_space +journal_dev_space_available(struct journal *j, struct bch_dev *ca, + enum journal_space_from from) +{ + struct journal_device *ja = &ca->journal; + unsigned sectors, buckets, unwritten, idx = j->reservations.unwritten_idx; - buckets_this_device--; - sectors_this_device = ca->mi.bucket_size; - } + if (from == journal_space_total) + return (struct journal_space) { + .next_entry = ca->mi.bucket_size, + .total = ca->mi.bucket_size * ja->nr, + }; - sectors_this_device -= unwritten_sectors; + buckets = bch2_journal_dev_buckets_available(j, ja, from); + sectors = ja->sectors_free; - if (sectors_this_device < ca->mi.bucket_size && - buckets_this_device) { - buckets_this_device--; - sectors_this_device = ca->mi.bucket_size; + /* + * We that we don't allocate the space for a journal entry + * until we write it out - thus, account for it here: + */ + while ((unwritten = get_unwritten_sectors(j, &idx))) { + if (unwritten >= sectors) { + if (!buckets) { + sectors = 0; + break; + } + + buckets--; + sectors = ca->mi.bucket_size; } - if (!sectors_this_device) + sectors -= unwritten; + } + + if (sectors < ca->mi.bucket_size && buckets) { + buckets--; + sectors = ca->mi.bucket_size; + } + + return (struct journal_space) { + .next_entry = sectors, + .total = sectors + buckets * ca->mi.bucket_size, + }; +} + +static struct journal_space __journal_space_available(struct journal *j, unsigned nr_devs_want, + enum journal_space_from from) +{ + struct bch_fs *c = container_of(j, struct bch_fs, journal); + struct bch_dev *ca; + unsigned i, pos, nr_devs = 0; + struct journal_space space, dev_space[BCH_SB_MEMBERS_MAX]; + + BUG_ON(nr_devs_want > ARRAY_SIZE(dev_space)); + + rcu_read_lock(); + for_each_member_device_rcu(ca, c, i, + &c->rw_devs[BCH_DATA_journal]) { + if (!ca->journal.nr) continue; - sectors_next_entry = min(sectors_next_entry, - sectors_this_device); + space = journal_dev_space_available(j, ca, from); + if (!space.next_entry) + continue; - sectors_total = min(sectors_total, - buckets_this_device * ca->mi.bucket_size + - sectors_this_device); + for (pos = 0; pos < nr_devs; pos++) + if (space.total > dev_space[pos].total) + break; - nr_devs++; + array_insert_item(dev_space, nr_devs, pos, space); } rcu_read_unlock(); if (nr_devs < nr_devs_want) return (struct journal_space) { 0, 0 }; - return (struct journal_space) { - .next_entry = sectors_next_entry, - .remaining = max_t(int, 0, sectors_total - sectors_next_entry), - }; + /* + * We sorted largest to smallest, and we want the smallest out of the + * @nr_devs_want largest devices: + */ + return dev_space[nr_devs_want - 1]; } void bch2_journal_space_available(struct journal *j) { struct bch_fs *c = container_of(j, struct bch_fs, journal); struct bch_dev *ca; - struct journal_space 
discarded, clean_ondisk, clean; - unsigned overhead, u64s_remaining = 0; + unsigned clean, clean_ondisk, total; + s64 u64s_remaining = 0; unsigned max_entry_size = min(j->buf[0].buf_size >> 9, j->buf[1].buf_size >> 9); unsigned i, nr_online = 0, nr_devs_want; @@ -164,31 +196,53 @@ void bch2_journal_space_available(struct journal *j) j->can_discard = can_discard; if (nr_online < c->opts.metadata_replicas_required) { - ret = -EROFS; - goto out; - } - - if (!fifo_free(&j->pin)) { - ret = -ENOSPC; + ret = cur_entry_insufficient_devices; goto out; } nr_devs_want = min_t(unsigned, nr_online, c->opts.metadata_replicas); - discarded = __journal_space_available(j, nr_devs_want, journal_space_discarded); - clean_ondisk = __journal_space_available(j, nr_devs_want, journal_space_clean_ondisk); - clean = __journal_space_available(j, nr_devs_want, journal_space_clean); + for (i = 0; i < journal_space_nr; i++) + j->space[i] = __journal_space_available(j, nr_devs_want, i); - if (!discarded.next_entry) - ret = -ENOSPC; + clean_ondisk = j->space[journal_space_clean_ondisk].total; + clean = j->space[journal_space_clean].total; + total = j->space[journal_space_total].total; - overhead = DIV_ROUND_UP(clean.remaining, max_entry_size) * - journal_entry_overhead(j); - u64s_remaining = clean.remaining << 6; - u64s_remaining = max_t(int, 0, u64s_remaining - overhead); - u64s_remaining /= 4; + if (!clean_ondisk && + j->reservations.idx == + j->reservations.unwritten_idx) { + char *buf = kmalloc(4096, GFP_ATOMIC); + + bch_err(c, "journal stuck"); + if (buf) { + __bch2_journal_debug_to_text(&_PBUF(buf, 4096), j); + pr_err("\n%s", buf); + kfree(buf); + } + + bch2_fatal_error(c); + ret = cur_entry_journal_stuck; + } else if (!j->space[journal_space_discarded].next_entry) + ret = cur_entry_journal_full; + else if (!fifo_free(&j->pin)) + ret = cur_entry_journal_pin_full; + + if ((j->space[journal_space_clean_ondisk].next_entry < + j->space[journal_space_clean_ondisk].total) && + (clean - clean_ondisk <= total / 8) && + (clean_ondisk * 2 > clean )) + set_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags); + else + clear_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags); + + u64s_remaining = (u64) clean << 6; + u64s_remaining -= (u64) total << 3; + u64s_remaining = max(0LL, u64s_remaining); + u64s_remaining /= 2; + u64s_remaining = min_t(u64, u64s_remaining, U32_MAX); out: - j->cur_entry_sectors = !ret ? discarded.next_entry : 0; + j->cur_entry_sectors = !ret ? 
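/*
 * Worked example of the JOURNAL_MAY_SKIP_FLUSH heuristic above, with
 * made-up numbers: total = 1024 sectors, clean = 900, clean_ondisk = 860.
 * Then clean - clean_ondisk = 40 <= total/8 = 128 and clean_ondisk * 2 =
 * 1720 > clean, so noflush journal writes stay enabled: only a small
 * fraction of the journal is tied up in entries written but not yet
 * known durable. Once that fraction passes total/8 the bit is cleared
 * and the next write is a flush write, advancing last_seq_ondisk and
 * freeing that space. The u64s budget below has the same shape:
 * (clean << 6) converts 512-byte sectors to u64s, (total << 3) holds
 * back 1/8 of the journal as slack, and half of what remains may be
 * handed out as reservations.
 */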
j->space[journal_space_discarded].next_entry : 0; j->cur_entry_error = ret; journal_set_remaining(j, u64s_remaining); journal_check_may_get_unreserved(j); @@ -263,6 +317,7 @@ static void bch2_journal_reclaim_fast(struct journal *j) while (!fifo_empty(&j->pin) && !atomic_read(&fifo_peek_front(&j->pin).count)) { BUG_ON(!list_empty(&fifo_peek_front(&j->pin).list)); + BUG_ON(!list_empty(&fifo_peek_front(&j->pin).flushed)); BUG_ON(!fifo_pop(&j->pin, temp)); popped = true; } @@ -271,6 +326,14 @@ static void bch2_journal_reclaim_fast(struct journal *j) bch2_journal_space_available(j); } +void __bch2_journal_pin_put(struct journal *j, u64 seq) +{ + struct journal_entry_pin_list *pin_list = journal_seq_pin(j, seq); + + if (atomic_dec_and_test(&pin_list->count)) + bch2_journal_reclaim_fast(j); +} + void bch2_journal_pin_put(struct journal *j, u64 seq) { struct journal_entry_pin_list *pin_list = journal_seq_pin(j, seq); @@ -314,11 +377,14 @@ void bch2_journal_pin_drop(struct journal *j, spin_unlock(&j->lock); } -static void bch2_journal_pin_add_locked(struct journal *j, u64 seq, - struct journal_entry_pin *pin, - journal_pin_flush_fn flush_fn) +void bch2_journal_pin_set(struct journal *j, u64 seq, + struct journal_entry_pin *pin, + journal_pin_flush_fn flush_fn) { - struct journal_entry_pin_list *pin_list = journal_seq_pin(j, seq); + struct journal_entry_pin_list *pin_list; + + spin_lock(&j->lock); + pin_list = journal_seq_pin(j, seq); __journal_pin_drop(j, pin); @@ -329,45 +395,6 @@ static void bch2_journal_pin_add_locked(struct journal *j, u64 seq, pin->flush = flush_fn; list_add(&pin->list, flush_fn ? &pin_list->list : &pin_list->flushed); -} - -void __bch2_journal_pin_add(struct journal *j, u64 seq, - struct journal_entry_pin *pin, - journal_pin_flush_fn flush_fn) -{ - spin_lock(&j->lock); - bch2_journal_pin_add_locked(j, seq, pin, flush_fn); - spin_unlock(&j->lock); - - /* - * If the journal is currently full, we might want to call flush_fn - * immediately: - */ - journal_wake(j); -} - -void bch2_journal_pin_update(struct journal *j, u64 seq, - struct journal_entry_pin *pin, - journal_pin_flush_fn flush_fn) -{ - if (journal_pin_active(pin) && pin->seq < seq) - return; - - spin_lock(&j->lock); - - if (pin->seq != seq) { - bch2_journal_pin_add_locked(j, seq, pin, flush_fn); - } else { - struct journal_entry_pin_list *pin_list = - journal_seq_pin(j, seq); - - /* - * If the pin is already pinning the right sequence number, it - * still might've already been flushed: - */ - list_move(&pin->list, &pin_list->list); - } - spin_unlock(&j->lock); /* @@ -377,20 +404,6 @@ void bch2_journal_pin_update(struct journal *j, u64 seq, journal_wake(j); } -void bch2_journal_pin_copy(struct journal *j, - struct journal_entry_pin *dst, - struct journal_entry_pin *src, - journal_pin_flush_fn flush_fn) -{ - spin_lock(&j->lock); - - if (journal_pin_active(src) && - (!journal_pin_active(dst) || src->seq < dst->seq)) - bch2_journal_pin_add_locked(j, src->seq, dst, flush_fn); - - spin_unlock(&j->lock); -} - /** * bch2_journal_pin_flush: ensure journal pin callback is no longer running */ @@ -431,7 +444,6 @@ journal_get_next_pin(struct journal *j, u64 max_seq, u64 *seq) list_move(&ret->list, &pin_list->flushed); BUG_ON(j->flush_in_progress); j->flush_in_progress = ret; - j->last_flushed = jiffies; } spin_unlock(&j->lock); @@ -440,17 +452,24 @@ journal_get_next_pin(struct journal *j, u64 max_seq, u64 *seq) } /* returns true if we did work */ -static bool journal_flush_pins(struct journal *j, u64 seq_to_flush, - unsigned 
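/*
 * Usage sketch for the consolidated pin API above (hypothetical caller;
 * the flush-hook signature is assumed from context). A dirty in-memory
 * object pins the sequence number it was journalled at; reclaim invokes
 * the hook in sequence order, and the hook drops the pin once the
 * object is persisted, letting last_seq advance past that entry:
 */
static void example_flush(struct journal *j,
                          struct journal_entry_pin *pin, u64 seq)
{
        /* ...write out whatever 'pin' was keeping dirty, then: */
        bch2_journal_pin_drop(j, pin);
}

static void example_mark_dirty(struct journal *j, u64 seq,
                               struct journal_entry_pin *pin)
{
        /* no-op if 'pin' already pins an equal or older seq: */
        bch2_journal_pin_add(j, seq, pin, example_flush);
}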
min_nr) +static u64 journal_flush_pins(struct journal *j, u64 seq_to_flush, + unsigned min_nr) { struct journal_entry_pin *pin; - bool ret = false; - u64 seq; + u64 seq, ret = 0; lockdep_assert_held(&j->reclaim_lock); - while ((pin = journal_get_next_pin(j, min_nr - ? U64_MAX : seq_to_flush, &seq))) { + while (1) { + cond_resched(); + + j->last_flushed = jiffies; + + pin = journal_get_next_pin(j, min_nr + ? U64_MAX : seq_to_flush, &seq); + if (!pin) + break; + if (min_nr) min_nr--; @@ -459,7 +478,7 @@ static bool journal_flush_pins(struct journal *j, u64 seq_to_flush, BUG_ON(j->flush_in_progress != pin); j->flush_in_progress = NULL; wake_up(&j->pin_flush_wait); - ret = true; + ret++; } return ret; @@ -523,15 +542,33 @@ static u64 journal_seq_to_flush(struct journal *j) * 512 journal entries or 25% of all journal buckets, then * journal_next_bucket() should not stall. */ -void bch2_journal_reclaim(struct journal *j) +static int __bch2_journal_reclaim(struct journal *j, bool direct) { struct bch_fs *c = container_of(j, struct bch_fs, journal); - unsigned min_nr = 0; - u64 seq_to_flush = 0; + bool kthread = (current->flags & PF_KTHREAD) != 0; + u64 seq_to_flush, nr_flushed = 0; + size_t min_nr; + unsigned flags; + int ret = 0; + /* + * We can't invoke memory reclaim while holding the reclaim_lock - + * journal reclaim is required to make progress for memory reclaim + * (cleaning the caches), so we can't get stuck in memory reclaim while + * we're holding the reclaim lock: + */ lockdep_assert_held(&j->reclaim_lock); + flags = memalloc_noreclaim_save(); do { + if (kthread && kthread_should_stop()) + break; + + if (bch2_journal_error(j)) { + ret = -EIO; + break; + } + bch2_journal_do_discards(j); seq_to_flush = journal_seq_to_flush(j); @@ -547,21 +584,110 @@ void bch2_journal_reclaim(struct journal *j) if (j->prereserved.reserved * 2 > j->prereserved.remaining) min_nr = 1; - } while (journal_flush_pins(j, seq_to_flush, min_nr)); - if (!bch2_journal_error(j)) - queue_delayed_work(c->journal_reclaim_wq, &j->reclaim_work, - msecs_to_jiffies(j->reclaim_delay_ms)); + if (atomic_read(&c->btree_cache.dirty) * 4 > + c->btree_cache.used * 3) + min_nr = 1; + + if (fifo_free(&j->pin) <= 32) + min_nr = 1; + + min_nr = max(min_nr, bch2_nr_btree_keys_need_flush(c)); + + trace_journal_reclaim_start(c, + min_nr, + j->prereserved.reserved, + j->prereserved.remaining, + atomic_read(&c->btree_cache.dirty), + c->btree_cache.used, + c->btree_key_cache.nr_dirty, + c->btree_key_cache.nr_keys); + + nr_flushed = journal_flush_pins(j, seq_to_flush, min_nr); + + if (direct) + j->nr_direct_reclaim += nr_flushed; + else + j->nr_background_reclaim += nr_flushed; + trace_journal_reclaim_finish(c, nr_flushed); + } while (min_nr && nr_flushed); + + memalloc_noreclaim_restore(flags); + + return ret; } -void bch2_journal_reclaim_work(struct work_struct *work) +int bch2_journal_reclaim(struct journal *j) { - struct journal *j = container_of(to_delayed_work(work), - struct journal, reclaim_work); + return __bch2_journal_reclaim(j, true); +} - mutex_lock(&j->reclaim_lock); - bch2_journal_reclaim(j); - mutex_unlock(&j->reclaim_lock); +static int bch2_journal_reclaim_thread(void *arg) +{ + struct journal *j = arg; + unsigned long next; + int ret = 0; + + set_freezable(); + + kthread_wait_freezable(test_bit(JOURNAL_RECLAIM_STARTED, &j->flags)); + + while (!ret && !kthread_should_stop()) { + j->reclaim_kicked = false; + + mutex_lock(&j->reclaim_lock); + ret = __bch2_journal_reclaim(j, false); + mutex_unlock(&j->reclaim_lock); + + next = 
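/*
 * The PF_MEMALLOC dance above, in isolation: journal reclaim is itself
 * what makes memory reclaim possible (it cleans the btree caches), so
 * any allocation made while reclaiming must not recurse into direct
 * reclaim and wait on itself. Sketch of the pattern with the real
 * <linux/sched/mm.h> helpers:
 */
#include <linux/sched/mm.h>

static void reclaim_critical_section(void)
{
        unsigned int flags = memalloc_noreclaim_save();

        /* allocations here behave as if PF_MEMALLOC were set and will
         * not enter direct reclaim */

        memalloc_noreclaim_restore(flags);
}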
j->last_flushed + msecs_to_jiffies(j->reclaim_delay_ms); + + while (1) { + set_current_state(TASK_INTERRUPTIBLE); + if (kthread_should_stop()) + break; + if (j->reclaim_kicked) + break; + if (time_after_eq(jiffies, next)) + break; + schedule_timeout(next - jiffies); + try_to_freeze(); + + } + __set_current_state(TASK_RUNNING); + } + + return 0; +} + +void bch2_journal_reclaim_stop(struct journal *j) +{ + struct task_struct *p = j->reclaim_thread; + + j->reclaim_thread = NULL; + + if (p) { + kthread_stop(p); + put_task_struct(p); + } +} + +int bch2_journal_reclaim_start(struct journal *j) +{ + struct bch_fs *c = container_of(j, struct bch_fs, journal); + struct task_struct *p; + + if (j->reclaim_thread) + return 0; + + p = kthread_create(bch2_journal_reclaim_thread, j, + "bch-reclaim/%s", c->name); + if (IS_ERR(p)) + return PTR_ERR(p); + + get_task_struct(p); + j->reclaim_thread = p; + wake_up_process(p); + return 0; } static int journal_flush_done(struct journal *j, u64 seq_to_flush, @@ -575,7 +701,7 @@ static int journal_flush_done(struct journal *j, u64 seq_to_flush, mutex_lock(&j->reclaim_lock); - *did_work = journal_flush_pins(j, seq_to_flush, 0); + *did_work = journal_flush_pins(j, seq_to_flush, 0) != 0; spin_lock(&j->lock); /* diff --git a/fs/bcachefs/journal_reclaim.h b/fs/bcachefs/journal_reclaim.h index 8128907a7623..f02caa3d49ea 100644 --- a/fs/bcachefs/journal_reclaim.h +++ b/fs/bcachefs/journal_reclaim.h @@ -4,11 +4,16 @@ #define JOURNAL_PIN (32 * 1024) -enum journal_space_from { - journal_space_discarded, - journal_space_clean_ondisk, - journal_space_clean, -}; +static inline void journal_reclaim_kick(struct journal *j) +{ + struct task_struct *p = READ_ONCE(j->reclaim_thread); + + if (p && !j->reclaim_kicked) { + j->reclaim_kicked = true; + if (p) + wake_up_process(p); + } +} unsigned bch2_journal_dev_buckets_available(struct journal *, struct journal_device *, @@ -28,34 +33,45 @@ journal_seq_pin(struct journal *j, u64 seq) return &j->pin.data[seq & j->pin.mask]; } +void __bch2_journal_pin_put(struct journal *, u64); void bch2_journal_pin_put(struct journal *, u64); void bch2_journal_pin_drop(struct journal *, struct journal_entry_pin *); -void __bch2_journal_pin_add(struct journal *, u64, struct journal_entry_pin *, - journal_pin_flush_fn); +void bch2_journal_pin_set(struct journal *, u64, struct journal_entry_pin *, + journal_pin_flush_fn); static inline void bch2_journal_pin_add(struct journal *j, u64 seq, struct journal_entry_pin *pin, journal_pin_flush_fn flush_fn) { if (unlikely(!journal_pin_active(pin) || pin->seq > seq)) - __bch2_journal_pin_add(j, seq, pin, flush_fn); + bch2_journal_pin_set(j, seq, pin, flush_fn); } -void bch2_journal_pin_update(struct journal *, u64, - struct journal_entry_pin *, - journal_pin_flush_fn); +static inline void bch2_journal_pin_copy(struct journal *j, + struct journal_entry_pin *dst, + struct journal_entry_pin *src, + journal_pin_flush_fn flush_fn) +{ + if (journal_pin_active(src)) + bch2_journal_pin_add(j, src->seq, dst, flush_fn); +} -void bch2_journal_pin_copy(struct journal *, - struct journal_entry_pin *, - struct journal_entry_pin *, - journal_pin_flush_fn); +static inline void bch2_journal_pin_update(struct journal *j, u64 seq, + struct journal_entry_pin *pin, + journal_pin_flush_fn flush_fn) +{ + if (unlikely(!journal_pin_active(pin) || pin->seq < seq)) + bch2_journal_pin_set(j, seq, pin, flush_fn); +} void bch2_journal_pin_flush(struct journal *, struct journal_entry_pin *); void bch2_journal_do_discards(struct journal *); 
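/*
 * The start/stop pattern used by bch2_journal_reclaim_start/stop()
 * above, reduced to its essentials (real kernel calls, hypothetical
 * worker): taking a task_struct reference with get_task_struct() keeps
 * the task valid for kthread_stop() even if the thread exits first.
 */
#include <linux/kthread.h>
#include <linux/sched/task.h>

static int start_worker(struct task_struct **thr,
                        int (*fn)(void *), void *arg)
{
        struct task_struct *p = kthread_create(fn, arg, "example-worker");

        if (IS_ERR(p))
                return PTR_ERR(p);
        get_task_struct(p);
        *thr = p;
        wake_up_process(p);
        return 0;
}

static void stop_worker(struct task_struct **thr)
{
        struct task_struct *p = *thr;

        *thr = NULL;
        if (p) {
                kthread_stop(p);
                put_task_struct(p);
        }
}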
-void bch2_journal_reclaim(struct journal *); -void bch2_journal_reclaim_work(struct work_struct *); +int bch2_journal_reclaim(struct journal *); + +void bch2_journal_reclaim_stop(struct journal *); +int bch2_journal_reclaim_start(struct journal *); bool bch2_journal_flush_pins(struct journal *, u64); diff --git a/fs/bcachefs/journal_seq_blacklist.c b/fs/bcachefs/journal_seq_blacklist.c index d0f1bbf8f6a7..e1b63f3879f4 100644 --- a/fs/bcachefs/journal_seq_blacklist.c +++ b/fs/bcachefs/journal_seq_blacklist.c @@ -118,7 +118,7 @@ out_write_sb: out: mutex_unlock(&c->sb_lock); - return ret; + return ret ?: bch2_blacklist_table_initialize(c); } static int journal_seq_blacklist_table_cmp(const void *_l, @@ -164,8 +164,6 @@ int bch2_blacklist_table_initialize(struct bch_fs *c) struct journal_seq_blacklist_table *t; unsigned i, nr = blacklist_nr_entries(bl); - BUG_ON(c->journal_seq_blacklist_table); - if (!bl) return 0; @@ -187,6 +185,7 @@ int bch2_blacklist_table_initialize(struct bch_fs *c) journal_seq_blacklist_table_cmp, NULL); + kfree(c->journal_seq_blacklist_table); c->journal_seq_blacklist_table = t; return 0; } diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h index 154b51b891d3..67ee47eb17a7 100644 --- a/fs/bcachefs/journal_types.h +++ b/fs/bcachefs/journal_types.h @@ -9,11 +9,13 @@ #include "super_types.h" #include "fifo.h" -struct journal_res; +#define JOURNAL_BUF_BITS 2 +#define JOURNAL_BUF_NR (1U << JOURNAL_BUF_BITS) +#define JOURNAL_BUF_MASK (JOURNAL_BUF_NR - 1) /* - * We put two of these in struct journal; we used them for writes to the - * journal that are being staged or in flight. + * We put JOURNAL_BUF_NR of these in struct journal; we used them for writes to + * the journal that are being staged or in flight. */ struct journal_buf { struct jset *data; @@ -27,6 +29,8 @@ struct journal_buf { unsigned disk_sectors; /* maximum size entry could have been, if buf_size was bigger */ unsigned u64s_reserved; + bool noflush; /* write has already been kicked off, and was noflush */ + bool must_flush; /* something wants a flush */ /* bloom filter: */ unsigned long has_inode[1024 / sizeof(unsigned long)]; }; @@ -81,10 +85,12 @@ union journal_res_state { struct { u64 cur_entry_offset:20, - idx:1, - prev_buf_unwritten:1, - buf0_count:21, - buf1_count:21; + idx:2, + unwritten_idx:2, + buf0_count:10, + buf1_count:10, + buf2_count:10, + buf3_count:10; }; }; @@ -116,6 +122,20 @@ union journal_preres_state { #define JOURNAL_ENTRY_CLOSED_VAL (JOURNAL_ENTRY_OFFSET_MAX - 1) #define JOURNAL_ENTRY_ERROR_VAL (JOURNAL_ENTRY_OFFSET_MAX) +struct journal_space { + /* Units of 512 bytes sectors: */ + unsigned next_entry; /* How big the next journal entry can be */ + unsigned total; +}; + +enum journal_space_from { + journal_space_discarded, + journal_space_clean_ondisk, + journal_space_clean, + journal_space_total, + journal_space_nr, +}; + /* * JOURNAL_NEED_WRITE - current (pending) journal entry should be written ASAP, * either because something's waiting on the write to complete or because it's @@ -127,8 +147,8 @@ enum { JOURNAL_STARTED, JOURNAL_RECLAIM_STARTED, JOURNAL_NEED_WRITE, - JOURNAL_NOT_EMPTY, JOURNAL_MAY_GET_UNRESERVED, + JOURNAL_MAY_SKIP_FLUSH, }; /* Embedded in struct bch_fs */ @@ -147,7 +167,14 @@ struct journal { * 0, or -ENOSPC if waiting on journal reclaim, or -EROFS if * insufficient devices: */ - int cur_entry_error; + enum { + cur_entry_ok, + cur_entry_blocked, + cur_entry_journal_full, + cur_entry_journal_pin_full, + cur_entry_journal_stuck, + 
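/*
 * Bit-budget check for the widened union journal_res_state below:
 * cur_entry_offset(20) + idx(2) + unwritten_idx(2) + four 10-bit buffer
 * counts = 64 bits, so the whole state still fits in one u64 and can be
 * updated with a single cmpxchg, exactly as with the old two-buffer
 * layout (20 + 1 + 1 + 21 + 21). The cost of four buffers is that each
 * buffer's reservation count shrinks from 21 to 10 bits, i.e. at most
 * 1023 concurrent reservations per buffer. Standalone model (bitfield
 * layout is implementation-defined; the kernel relies on its supported
 * compilers here):
 */
#include <stdint.h>

union res_state {
        uint64_t v;                        /* cmpxchg'd as one word */
        struct {
                uint64_t cur_entry_offset:20;
                uint64_t idx:2;            /* buffer open for writers       */
                uint64_t unwritten_idx:2;  /* oldest buffer not yet on disk */
                uint64_t buf0_count:10;
                uint64_t buf1_count:10;
                uint64_t buf2_count:10;
                uint64_t buf3_count:10;
        };
};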
cur_entry_insufficient_devices, + } cur_entry_error; union journal_preres_state prereserved; @@ -160,7 +187,7 @@ struct journal { * Two journal entries -- one is currently open for new entries, the * other is possibly being written out. */ - struct journal_buf buf[2]; + struct journal_buf buf[JOURNAL_BUF_NR]; spinlock_t lock; @@ -180,7 +207,10 @@ struct journal { /* seq, last_seq from the most recent journal entry successfully written */ u64 seq_ondisk; + u64 flushed_seq_ondisk; u64 last_seq_ondisk; + u64 err_seq; + u64 last_empty_seq; /* * FIFO of journal entries whose btree updates have not yet been @@ -203,14 +233,20 @@ struct journal { struct journal_entry_pin_list *data; } pin; + struct journal_space space[journal_space_nr]; + u64 replay_journal_seq; u64 replay_journal_seq_end; struct write_point wp; spinlock_t err_lock; - struct delayed_work reclaim_work; struct mutex reclaim_lock; + struct task_struct *reclaim_thread; + bool reclaim_kicked; + u64 nr_direct_reclaim; + u64 nr_background_reclaim; + unsigned long last_flushed; struct journal_entry_pin *flush_in_progress; wait_queue_head_t pin_flush_wait; @@ -221,11 +257,15 @@ struct journal { unsigned write_delay_ms; unsigned reclaim_delay_ms; + unsigned long last_flush_write; u64 res_get_blocked_start; u64 need_write_time; u64 write_start_time; + u64 nr_flush_writes; + u64 nr_noflush_writes; + struct time_stats *write_time; struct time_stats *delay_time; struct time_stats *blocked_time; diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index fbeaa3b67326..6633d21f604a 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -326,12 +326,12 @@ static void move_free(struct closure *cl) { struct moving_io *io = container_of(cl, struct moving_io, cl); struct moving_context *ctxt = io->write.ctxt; + struct bvec_iter_all iter; struct bio_vec *bv; - unsigned i; bch2_disk_reservation_put(io->write.op.c, &io->write.op.res); - bio_for_each_segment_all(bv, &io->write.op.wbio.bio, i) + bio_for_each_segment_all(bv, &io->write.op.wbio.bio, iter) if (bv->bv_page) __free_page(bv->bv_page); diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c index ddfda1ef8a79..2c5daed58aca 100644 --- a/fs/bcachefs/movinggc.c +++ b/fs/bcachefs/movinggc.c @@ -61,7 +61,7 @@ static enum data_cmd copygc_pred(struct bch_fs *c, void *arg, copygc_heap *h = &c->copygc_heap; struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); const union bch_extent_entry *entry; - struct extent_ptr_decoded p; + struct extent_ptr_decoded p = { 0 }; bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev); @@ -345,7 +345,7 @@ int bch2_copygc_start(struct bch_fs *c) if (bch2_fs_init_fault("copygc_start")) return -ENOMEM; - t = kthread_create(bch2_copygc_thread, c, "bch_copygc"); + t = kthread_create(bch2_copygc_thread, c, "bch-copygc/%s", c->name); if (IS_ERR(t)) return PTR_ERR(t); diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c index 44d2651be970..c3373c48fa81 100644 --- a/fs/bcachefs/rebalance.c +++ b/fs/bcachefs/rebalance.c @@ -314,7 +314,7 @@ int bch2_rebalance_start(struct bch_fs *c) if (c->opts.nochanges) return 0; - p = kthread_create(bch2_rebalance_thread, c, "bch_rebalance"); + p = kthread_create(bch2_rebalance_thread, c, "bch-rebalance/%s", c->name); if (IS_ERR(p)) return PTR_ERR(p); diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 1745cfac6b26..1883a1faf380 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -187,7 +187,7 @@ void bch2_btree_and_journal_iter_init(struct 
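/*
 * The sequence-number compression in journal_keys_sort() below, in
 * isolation: sorted keys carry a 32-bit offset from the first
 * non-ignored entry's seq rather than the full 64-bit seq, which is why
 * the BUG_ON() rejects journals spanning more than U32_MAX entries
 * (standalone model):
 */
#include <assert.h>
#include <stdint.h>

static uint32_t seq_to_offset(uint64_t seq, uint64_t base)
{
        assert(seq >= base && seq - base <= UINT32_MAX);
        return (uint32_t)(seq - base);
}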
btree_and_journal_iter *iter, { memset(iter, 0, sizeof(*iter)); - iter->btree = bch2_trans_get_iter(trans, id, pos, 0); + iter->btree = bch2_trans_get_iter(trans, id, pos, BTREE_ITER_PREFETCH); bch2_journal_iter_init(&iter->journal, journal_keys, id, 0, pos); } @@ -313,7 +313,7 @@ void bch2_journal_keys_free(struct journal_keys *keys) static struct journal_keys journal_keys_sort(struct list_head *journal_entries) { - struct journal_replay *p; + struct journal_replay *i; struct jset_entry *entry; struct bkey_i *k, *_n; struct journal_keys keys = { NULL }; @@ -323,35 +323,35 @@ static struct journal_keys journal_keys_sort(struct list_head *journal_entries) if (list_empty(journal_entries)) return keys; - keys.journal_seq_base = - le64_to_cpu(list_last_entry(journal_entries, - struct journal_replay, list)->j.last_seq); - - list_for_each_entry(p, journal_entries, list) { - if (le64_to_cpu(p->j.seq) < keys.journal_seq_base) + list_for_each_entry(i, journal_entries, list) { + if (i->ignore) continue; - for_each_jset_key(k, _n, entry, &p->j) + if (!keys.journal_seq_base) + keys.journal_seq_base = le64_to_cpu(i->j.seq); + + for_each_jset_key(k, _n, entry, &i->j) nr_keys++; } - keys.d = kvmalloc(sizeof(keys.d[0]) * nr_keys, GFP_KERNEL); if (!keys.d) goto err; - list_for_each_entry(p, journal_entries, list) { - if (le64_to_cpu(p->j.seq) < keys.journal_seq_base) + list_for_each_entry(i, journal_entries, list) { + if (i->ignore) continue; - for_each_jset_key(k, _n, entry, &p->j) + BUG_ON(le64_to_cpu(i->j.seq) - keys.journal_seq_base > U32_MAX); + + for_each_jset_key(k, _n, entry, &i->j) keys.d[keys.nr++] = (struct journal_key) { .btree_id = entry->btree_id, .level = entry->level, .k = k, - .journal_seq = le64_to_cpu(p->j.seq) - + .journal_seq = le64_to_cpu(i->j.seq) - keys.journal_seq_base, - .journal_offset = k->_data - p->j._data, + .journal_offset = k->_data - i->j._data, }; } @@ -443,9 +443,6 @@ retry: bch2_cut_back(atomic_end, split); split_iter = bch2_trans_copy_iter(&trans, iter); - ret = PTR_ERR_OR_ZERO(split_iter); - if (ret) - goto err; /* * It's important that we don't go through the @@ -456,11 +453,14 @@ retry: __bch2_btree_iter_set_pos(split_iter, split->k.p, false); bch2_trans_update(&trans, split_iter, split, BTREE_TRIGGER_NORUN); + bch2_trans_iter_put(&trans, split_iter); bch2_btree_iter_set_pos(iter, split->k.p); if (remark) { - ret = bch2_trans_mark_key(&trans, bkey_i_to_s_c(split), + ret = bch2_trans_mark_key(&trans, + bkey_s_c_null, + bkey_i_to_s_c(split), 0, split->k.size, BTREE_TRIGGER_INSERT); if (ret) @@ -469,7 +469,9 @@ retry: } while (bkey_cmp(iter->pos, k->k.p) < 0); if (remark) { - ret = bch2_trans_mark_key(&trans, bkey_i_to_s_c(k), + ret = bch2_trans_mark_key(&trans, + bkey_i_to_s_c(k), + bkey_s_c_null, 0, -((s64) k->k.size), BTREE_TRIGGER_OVERWRITE); if (ret) @@ -481,6 +483,8 @@ retry: BTREE_INSERT_LAZY_RW| BTREE_INSERT_JOURNAL_REPLAY); err: + bch2_trans_iter_put(&trans, iter); + if (ret == -EINTR) goto retry; @@ -499,8 +503,6 @@ static int __bch2_journal_replay_key(struct btree_trans *trans, iter = bch2_trans_get_node_iter(trans, id, k->k.p, BTREE_MAX_DEPTH, level, BTREE_ITER_INTENT); - if (IS_ERR(iter)) - return PTR_ERR(iter); /* * iter->flags & BTREE_ITER_IS_EXTENTS triggers the update path to run @@ -535,8 +537,7 @@ static int __bch2_alloc_replay_key(struct btree_trans *trans, struct bkey_i *k) BTREE_ITER_CACHED| BTREE_ITER_CACHED_NOFILL| BTREE_ITER_INTENT); - ret = PTR_ERR_OR_ZERO(iter) ?: - bch2_trans_update(trans, iter, k, BTREE_TRIGGER_NORUN); + ret = 
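/*
 * Note on the bch2_trans_mark_key() calls above: the function now takes
 * both the key being overwritten and the key being inserted, with
 * bkey_s_c_null standing in for whichever side is absent - (null, new)
 * for the INSERT trigger, (old, null) for the OVERWRITE trigger - so a
 * single trigger path can see both sides of an update.
 */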
bch2_trans_update(trans, iter, k, BTREE_TRIGGER_NORUN); bch2_trans_iter_put(trans, iter); return ret; } @@ -613,6 +614,7 @@ static int bch2_journal_replay(struct bch_fs *c, */ set_bit(BCH_FS_BTREE_INTERIOR_REPLAY_DONE, &c->flags); set_bit(JOURNAL_RECLAIM_STARTED, &j->flags); + journal_reclaim_kick(j); j->replay_journal_seq = seq; @@ -645,46 +647,6 @@ err: return ret; } -static bool journal_empty(struct list_head *journal) -{ - return list_empty(journal) || - journal_entry_empty(&list_last_entry(journal, - struct journal_replay, list)->j); -} - -static int -verify_journal_entries_not_blacklisted_or_missing(struct bch_fs *c, - struct list_head *journal) -{ - struct journal_replay *i = - list_last_entry(journal, struct journal_replay, list); - u64 start_seq = le64_to_cpu(i->j.last_seq); - u64 end_seq = le64_to_cpu(i->j.seq); - u64 seq = start_seq; - int ret = 0; - - list_for_each_entry(i, journal, list) { - if (le64_to_cpu(i->j.seq) < start_seq) - continue; - - fsck_err_on(seq != le64_to_cpu(i->j.seq), c, - "journal entries %llu-%llu missing! (replaying %llu-%llu)", - seq, le64_to_cpu(i->j.seq) - 1, - start_seq, end_seq); - - seq = le64_to_cpu(i->j.seq); - - fsck_err_on(bch2_journal_seq_is_blacklisted(c, seq, false), c, - "found blacklisted journal entry %llu", seq); - - do { - seq++; - } while (bch2_journal_seq_is_blacklisted(c, seq, false)); - } -fsck_err: - return ret; -} - /* journal replay early: */ static int journal_replay_entry_early(struct bch_fs *c, @@ -769,6 +731,7 @@ static int journal_replay_early(struct bch_fs *c, struct bch_sb_field_clean *clean, struct list_head *journal) { + struct journal_replay *i; struct jset_entry *entry; int ret; @@ -784,18 +747,19 @@ static int journal_replay_early(struct bch_fs *c, return ret; } } else { - struct journal_replay *i = - list_last_entry(journal, struct journal_replay, list); + list_for_each_entry(i, journal, list) { + if (i->ignore) + continue; - c->bucket_clock[READ].hand = le16_to_cpu(i->j.read_clock); - c->bucket_clock[WRITE].hand = le16_to_cpu(i->j.write_clock); + c->bucket_clock[READ].hand = le16_to_cpu(i->j.read_clock); + c->bucket_clock[WRITE].hand = le16_to_cpu(i->j.write_clock); - list_for_each_entry(i, journal, list) vstruct_for_each(&i->j, entry) { ret = journal_replay_entry_early(c, entry); if (ret) return ret; } + } } bch2_fs_usage_initialize(c); @@ -844,9 +808,6 @@ static int verify_superblock_clean(struct bch_fs *c, struct bch_sb_field_clean *clean = *cleanp; int ret = 0; - if (!c->sb.clean || !j) - return 0; - if (mustfix_fsck_err_on(j->seq != clean->journal_seq, c, "superblock journal seq (%llu) doesn't match journal (%llu) after clean shutdown", le64_to_cpu(clean->journal_seq), @@ -973,7 +934,8 @@ int bch2_fs_recovery(struct bch_fs *c) { const char *err = "cannot allocate memory"; struct bch_sb_field_clean *clean = NULL; - u64 journal_seq; + struct jset *last_journal_entry = NULL; + u64 blacklist_seq, journal_seq; bool write_sb = false, need_write_alloc = false; int ret; @@ -993,24 +955,38 @@ int bch2_fs_recovery(struct bch_fs *c) set_bit(BCH_FS_REBUILD_REPLICAS, &c->flags); } + ret = bch2_blacklist_table_initialize(c); + if (ret) { + bch_err(c, "error initializing blacklist table"); + goto err; + } + if (!c->sb.clean || c->opts.fsck || c->opts.keep_journal) { - struct jset *j; + struct journal_replay *i; - ret = bch2_journal_read(c, &c->journal_entries); + ret = bch2_journal_read(c, &c->journal_entries, + &blacklist_seq, &journal_seq); if (ret) goto err; - if (mustfix_fsck_err_on(c->sb.clean && 
!journal_empty(&c->journal_entries), c, + list_for_each_entry_reverse(i, &c->journal_entries, list) + if (!i->ignore) { + last_journal_entry = &i->j; + break; + } + + if (mustfix_fsck_err_on(c->sb.clean && + last_journal_entry && + !journal_entry_empty(last_journal_entry), c, "filesystem marked clean but journal not empty")) { c->sb.compat &= ~(1ULL << BCH_COMPAT_FEAT_ALLOC_INFO); SET_BCH_SB_CLEAN(c->disk_sb.sb, false); c->sb.clean = false; } - if (!c->sb.clean && list_empty(&c->journal_entries)) { - bch_err(c, "no journal entries found"); - ret = BCH_FSCK_REPAIR_IMPOSSIBLE; - goto err; + if (!last_journal_entry) { + fsck_err_on(!c->sb.clean, c, "no journal entries found"); + goto use_clean; } c->journal_keys = journal_keys_sort(&c->journal_entries); @@ -1019,16 +995,21 @@ int bch2_fs_recovery(struct bch_fs *c) goto err; } - j = &list_last_entry(&c->journal_entries, - struct journal_replay, list)->j; - - ret = verify_superblock_clean(c, &clean, j); - if (ret) + if (c->sb.clean && last_journal_entry) { + ret = verify_superblock_clean(c, &clean, + last_journal_entry); + if (ret) + goto err; + } + } else { +use_clean: + if (!clean) { + bch_err(c, "no superblock clean section found"); + ret = BCH_FSCK_REPAIR_IMPOSSIBLE; goto err; - journal_seq = le64_to_cpu(j->seq) + 1; - } else { - journal_seq = le64_to_cpu(clean->journal_seq) + 1; + } + blacklist_seq = journal_seq = le64_to_cpu(clean->journal_seq) + 1; } if (!c->sb.clean && @@ -1047,30 +1028,23 @@ int bch2_fs_recovery(struct bch_fs *c) if (ret) goto err; - if (!c->sb.clean) { + /* + * After an unclean shutdown, skip then next few journal sequence + * numbers as they may have been referenced by btree writes that + * happened before their corresponding journal writes - those btree + * writes need to be ignored, by skipping and blacklisting the next few + * journal sequence numbers: + */ + if (!c->sb.clean) + journal_seq += 8; + + if (blacklist_seq != journal_seq) { ret = bch2_journal_seq_blacklist_add(c, - journal_seq, - journal_seq + 4); + blacklist_seq, journal_seq); if (ret) { bch_err(c, "error creating new journal seq blacklist entry"); goto err; } - - journal_seq += 4; - - /* - * The superblock needs to be written before we do any btree - * node writes: it will be in the read_write() path - */ - } - - ret = bch2_blacklist_table_initialize(c); - - if (!list_empty(&c->journal_entries)) { - ret = verify_journal_entries_not_blacklisted_or_missing(c, - &c->journal_entries); - if (ret) - goto err; } ret = bch2_fs_journal_start(&c->journal, journal_seq, diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c index 91518c0d6794..00a197b65e0b 100644 --- a/fs/bcachefs/replicas.c +++ b/fs/bcachefs/replicas.c @@ -275,53 +275,55 @@ static void __replicas_table_update_pcpu(struct bch_fs_usage __percpu *dst_p, static int replicas_table_update(struct bch_fs *c, struct bch_replicas_cpu *new_r) { - struct bch_fs_usage __percpu *new_usage[2] = { NULL, NULL }; + struct bch_fs_usage __percpu *new_usage[JOURNAL_BUF_NR]; struct bch_fs_usage *new_scratch = NULL; struct bch_fs_usage __percpu *new_gc = NULL; struct bch_fs_usage *new_base = NULL; - unsigned bytes = sizeof(struct bch_fs_usage) + + unsigned i, bytes = sizeof(struct bch_fs_usage) + sizeof(u64) * new_r->nr; - int ret = -ENOMEM; + int ret = 0; + + memset(new_usage, 0, sizeof(new_usage)); + + for (i = 0; i < ARRAY_SIZE(new_usage); i++) + if (!(new_usage[i] = __alloc_percpu_gfp(bytes, + sizeof(u64), GFP_NOIO))) + goto err; if (!(new_base = kzalloc(bytes, GFP_NOIO)) || - !(new_usage[0] = 
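/*
 * Worked example of the blacklisting above: suppose the newest good
 * entry read from the journal was seq 1000, so blacklist_seq =
 * journal_seq = 1001. After an unclean shutdown journal_seq becomes
 * 1009 and seqs 1001..1008 are blacklisted: btree node writes may have
 * recorded those seqs before the matching journal writes ever reached
 * disk, so any such entry found on a future mount must be ignored
 * rather than replayed.
 */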
__alloc_percpu_gfp(bytes, sizeof(u64), - GFP_NOIO)) || - !(new_usage[1] = __alloc_percpu_gfp(bytes, sizeof(u64), - GFP_NOIO)) || !(new_scratch = kmalloc(bytes, GFP_NOIO)) || (c->usage_gc && - !(new_gc = __alloc_percpu_gfp(bytes, sizeof(u64), GFP_NOIO)))) { - bch_err(c, "error updating replicas table: memory allocation failure"); + !(new_gc = __alloc_percpu_gfp(bytes, sizeof(u64), GFP_NOIO)))) goto err; - } + for (i = 0; i < ARRAY_SIZE(new_usage); i++) + if (c->usage[i]) + __replicas_table_update_pcpu(new_usage[i], new_r, + c->usage[i], &c->replicas); if (c->usage_base) __replicas_table_update(new_base, new_r, c->usage_base, &c->replicas); - if (c->usage[0]) - __replicas_table_update_pcpu(new_usage[0], new_r, - c->usage[0], &c->replicas); - if (c->usage[1]) - __replicas_table_update_pcpu(new_usage[1], new_r, - c->usage[1], &c->replicas); if (c->usage_gc) __replicas_table_update_pcpu(new_gc, new_r, c->usage_gc, &c->replicas); + for (i = 0; i < ARRAY_SIZE(new_usage); i++) + swap(c->usage[i], new_usage[i]); swap(c->usage_base, new_base); - swap(c->usage[0], new_usage[0]); - swap(c->usage[1], new_usage[1]); swap(c->usage_scratch, new_scratch); swap(c->usage_gc, new_gc); swap(c->replicas, *new_r); - ret = 0; -err: +out: free_percpu(new_gc); kfree(new_scratch); free_percpu(new_usage[1]); free_percpu(new_usage[0]); kfree(new_base); return ret; +err: + bch_err(c, "error updating replicas table: memory allocation failure"); + ret = -ENOMEM; + goto out; } static unsigned reserve_journal_replicas(struct bch_fs *c, @@ -496,9 +498,7 @@ int bch2_replicas_gc_end(struct bch_fs *c, int ret) struct bch_replicas_cpu n; if (!__replicas_has_entry(&c->replicas_gc, e) && - (c->usage_base->replicas[i] || - percpu_u64_get(&c->usage[0]->replicas[i]) || - percpu_u64_get(&c->usage[1]->replicas[i]))) { + bch2_fs_usage_read_one(c, &c->usage_base->replicas[i])) { n = cpu_replicas_add_entry(&c->replicas_gc, e); if (!n.entries) { ret = -ENOSPC; @@ -603,9 +603,7 @@ retry: cpu_replicas_entry(&c->replicas, i); if (e->data_type == BCH_DATA_journal || - c->usage_base->replicas[i] || - percpu_u64_get(&c->usage[0]->replicas[i]) || - percpu_u64_get(&c->usage[1]->replicas[i])) + bch2_fs_usage_read_one(c, &c->usage_base->replicas[i])) memcpy(cpu_replicas_entry(&new, new.nr++), e, new.entry_size); } diff --git a/fs/bcachefs/str_hash.h b/fs/bcachefs/str_hash.h index dea9b7252b88..1ecf72c9487c 100644 --- a/fs/bcachefs/str_hash.h +++ b/fs/bcachefs/str_hash.h @@ -205,8 +205,6 @@ int bch2_hash_needs_whiteout(struct btree_trans *trans, int ret; iter = bch2_trans_copy_iter(trans, start); - if (IS_ERR(iter)) - return PTR_ERR(iter); bch2_btree_iter_next_slot(iter); @@ -253,11 +251,8 @@ int bch2_hash_set(struct btree_trans *trans, } if (!slot && - !(flags & BCH_HASH_SET_MUST_REPLACE)) { + !(flags & BCH_HASH_SET_MUST_REPLACE)) slot = bch2_trans_copy_iter(trans, iter); - if (IS_ERR(slot)) - return PTR_ERR(slot); - } if (k.k->type != KEY_TYPE_whiteout) goto not_found; diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index cee6cc938734..78835bd2d6bc 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -614,9 +614,6 @@ got_super: bdev_logical_block_size(sb->bdev)) goto err; - if (sb->mode & FMODE_WRITE) - bdev_get_queue(sb->bdev)->backing_dev_info->capabilities - |= BDI_CAP_STABLE_WRITES; ret = 0; sb->have_layout = true; out: @@ -636,7 +633,7 @@ static void write_super_endio(struct bio *bio) /* XXX: return errors directly */ - if (bch2_dev_io_err_on(bio->bi_status, ca, "superblock write: %s", + if 
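/*
 * The deleted IS_ERR()/PTR_ERR_OR_ZERO() checks here (and in
 * recovery.c and str_hash.h elsewhere in this patch) all follow from
 * one change: bch2_trans_get_iter() and bch2_trans_copy_iter() no
 * longer return errors - transaction iterators are presumably
 * guaranteed by preallocation - so error-checking every acquisition
 * became dead code.
 */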
(bch2_dev_io_err_on(bio->bi_status, ca, "superblock write error: %s", bch2_blk_status_to_str(bio->bi_status))) ca->sb_write_error = 1; @@ -995,10 +992,10 @@ bch2_journal_super_entries_add_common(struct bch_fs *c, percpu_down_write(&c->mark_lock); if (!journal_seq) { - bch2_fs_usage_acc_to_base(c, 0); - bch2_fs_usage_acc_to_base(c, 1); + for (i = 0; i < ARRAY_SIZE(c->usage); i++) + bch2_fs_usage_acc_to_base(c, i); } else { - bch2_fs_usage_acc_to_base(c, journal_seq & 1); + bch2_fs_usage_acc_to_base(c, journal_seq & JOURNAL_BUF_MASK); } { diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index f606de540895..651fbc5d52b1 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -49,7 +49,6 @@ #include <linux/device.h> #include <linux/genhd.h> #include <linux/idr.h> -#include <linux/kthread.h> #include <linux/module.h> #include <linux/percpu.h> #include <linux/random.h> @@ -149,44 +148,6 @@ struct bch_fs *bch2_uuid_to_fs(uuid_le uuid) return c; } -int bch2_congested(void *data, int bdi_bits) -{ - struct bch_fs *c = data; - struct backing_dev_info *bdi; - struct bch_dev *ca; - unsigned i; - int ret = 0; - - rcu_read_lock(); - if (bdi_bits & (1 << WB_sync_congested)) { - /* Reads - check all devices: */ - for_each_readable_member(ca, c, i) { - bdi = ca->disk_sb.bdev->bd_bdi; - - if (bdi_congested(bdi, bdi_bits)) { - ret = 1; - break; - } - } - } else { - const struct bch_devs_mask *devs = - bch2_target_to_mask(c, c->opts.foreground_target) ?: - &c->rw_devs[BCH_DATA_user]; - - for_each_member_device_rcu(ca, c, i, devs) { - bdi = ca->disk_sb.bdev->bd_bdi; - - if (bdi_congested(bdi, bdi_bits)) { - ret = 1; - break; - } - } - } - rcu_read_unlock(); - - return ret; -} - /* Filesystem RO/RW: */ /* @@ -297,7 +258,7 @@ static void bch2_writes_disabled(struct percpu_ref *writes) void bch2_fs_read_only(struct bch_fs *c) { if (!test_bit(BCH_FS_RW, &c->flags)) { - cancel_delayed_work_sync(&c->journal.reclaim_work); + BUG_ON(c->journal.reclaim_thread); return; } @@ -455,6 +416,12 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early) set_bit(BCH_FS_ALLOCATOR_RUNNING, &c->flags); + ret = bch2_journal_reclaim_start(&c->journal); + if (ret) { + bch_err(c, "error starting journal reclaim: %i", ret); + return ret; + } + if (!early) { ret = bch2_fs_read_write_late(c); if (ret) @@ -463,9 +430,6 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early) percpu_ref_reinit(&c->writes); set_bit(BCH_FS_RW, &c->flags); - - queue_delayed_work(c->journal_reclaim_wq, - &c->journal.reclaim_work, 0); return 0; err: __bch2_fs_read_only(c); @@ -511,8 +475,8 @@ static void __bch2_fs_free(struct bch_fs *c) bch2_journal_entries_free(&c->journal_entries); percpu_free_rwsem(&c->mark_lock); kfree(c->usage_scratch); - free_percpu(c->usage[1]); - free_percpu(c->usage[0]); + for (i = 0; i < ARRAY_SIZE(c->usage); i++) + free_percpu(c->usage[i]); kfree(c->usage_base); if (c->btree_iters_bufs) @@ -533,8 +497,6 @@ static void __bch2_fs_free(struct bch_fs *c) kfree(c->unused_inode_hints); free_heap(&c->copygc_heap); - if (c->journal_reclaim_wq) - destroy_workqueue(c->journal_reclaim_wq); if (c->copygc_wq) destroy_workqueue(c->copygc_wq); if (c->wq) @@ -754,6 +716,8 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) bch2_fs_btree_cache_init_early(&c->btree_cache); + mutex_init(&c->sectors_available_lock); + if (percpu_init_rwsem(&c->mark_lock)) goto err; @@ -788,8 +752,6 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) 
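/*
 * Standalone model of the usage-array indexing above: disk-usage deltas
 * are staged per journal buffer and folded into the base totals once
 * that buffer's journal write is durable; with JOURNAL_BUF_NR = 4 the
 * buffer for a given entry is just seq & 3 (names hypothetical):
 */
#include <stdint.h>

#define BUF_NR   4u
#define BUF_MASK (BUF_NR - 1u)

struct usage { int64_t sectors; };

static void stage_delta(struct usage per_buf[BUF_NR],
                        uint64_t journal_seq, int64_t delta)
{
        per_buf[journal_seq & BUF_MASK].sectors += delta;
}

static void acc_to_base(struct usage *base,
                        struct usage per_buf[BUF_NR], unsigned idx)
{
        base->sectors += per_buf[idx].sectors;  /* now durable */
        per_buf[idx].sectors = 0;
}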
WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 1)) || !(c->copygc_wq = alloc_workqueue("bcachefs_copygc", WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 1)) || - !(c->journal_reclaim_wq = alloc_workqueue("bcachefs_journal_reclaim", - WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_HIGHPRI, 1)) || percpu_ref_init(&c->writes, bch2_writes_disabled, PERCPU_REF_INIT_DEAD, GFP_KERNEL) || mempool_init_kmalloc_pool(&c->fill_iter, 1, iter_size) || @@ -2056,6 +2018,7 @@ static void bcachefs_exit(void) bch2_debug_exit(); bch2_vfs_exit(); bch2_chardev_exit(); + bch2_btree_key_cache_exit(); if (bcachefs_kset) kset_unregister(bcachefs_kset); } @@ -2065,6 +2028,7 @@ static int __init bcachefs_init(void) bch2_bkey_pack_test(); if (!(bcachefs_kset = kset_create_and_add("bcachefs", NULL, fs_kobj)) || + bch2_btree_key_cache_init() || bch2_chardev_init() || bch2_vfs_init() || bch2_debug_init()) diff --git a/fs/bcachefs/super.h b/fs/bcachefs/super.h index 048ffec622af..02c81f3555c3 100644 --- a/fs/bcachefs/super.h +++ b/fs/bcachefs/super.h @@ -199,7 +199,6 @@ static inline struct bch_devs_mask bch2_online_devs(struct bch_fs *c) struct bch_fs *bch2_bdev_to_fs(struct block_device *); struct bch_fs *bch2_uuid_to_fs(uuid_le); -int bch2_congested(void *, int); bool bch2_dev_state_allowed(struct bch_fs *, struct bch_dev *, enum bch_member_state, int); diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index d7ad293aff4d..cc13fc258115 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -165,6 +165,7 @@ read_attribute(journal_debug); read_attribute(journal_pins); read_attribute(btree_updates); read_attribute(dirty_btree_nodes); +read_attribute(btree_cache); read_attribute(btree_key_cache); read_attribute(btree_transactions); read_attribute(stripes_heap); @@ -374,6 +375,11 @@ SHOW(bch2_fs) return out.pos - buf; } + if (attr == &sysfs_btree_cache) { + bch2_btree_cache_to_text(&out, c); + return out.pos - buf; + } + if (attr == &sysfs_btree_key_cache) { bch2_btree_key_cache_to_text(&out, &c->btree_key_cache); return out.pos - buf; @@ -458,7 +464,7 @@ STORE(bch2_fs) /* Debugging: */ if (attr == &sysfs_trigger_journal_flush) - bch2_journal_meta_async(&c->journal, NULL); + bch2_journal_meta(&c->journal); if (attr == &sysfs_trigger_btree_coalesce) bch2_coalesce(c); @@ -497,10 +503,11 @@ STORE(bch2_fs) if (threads_str && !(ret = kstrtouint(threads_str, 10, &threads)) && !(ret = bch2_strtoull_h(nr_str, &nr))) - bch2_btree_perf_test(c, test, nr, threads); - else - size = ret; + ret = bch2_btree_perf_test(c, test, nr, threads); kfree(tmp); + + if (ret) + size = ret; } #endif return size; @@ -550,6 +557,7 @@ struct attribute *bch2_fs_internal_files[] = { &sysfs_journal_pins, &sysfs_btree_updates, &sysfs_dirty_btree_nodes, + &sysfs_btree_cache, &sysfs_btree_key_cache, &sysfs_btree_transactions, &sysfs_stripes_heap, diff --git a/fs/bcachefs/tests.c b/fs/bcachefs/tests.c index 4dcace650416..f1d09e3ada09 100644 --- a/fs/bcachefs/tests.c +++ b/fs/bcachefs/tests.c @@ -26,7 +26,7 @@ static void delete_test_keys(struct bch_fs *c) /* unit tests */ -static void test_delete(struct bch_fs *c, u64 nr) +static int test_delete(struct bch_fs *c, u64 nr) { struct btree_trans trans; struct btree_iter *iter; @@ -41,24 +41,37 @@ static void test_delete(struct bch_fs *c, u64 nr) BTREE_ITER_INTENT); ret = bch2_btree_iter_traverse(iter); - BUG_ON(ret); + if (ret) { + bch_err(c, "lookup error in test_delete: %i", ret); + goto err; + } ret = __bch2_trans_do(&trans, NULL, NULL, 0, bch2_trans_update(&trans, iter, &k.k_i, 0)); - BUG_ON(ret); + if (ret) { + 
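/*
 * The tests.c churn that follows is one mechanical conversion: every
 * unit and perf test goes from void with BUG_ON(ret) to returning int
 * and logging via bch_err(), so a failure propagates out through
 * bch2_btree_perf_test() and back to the sysfs write that started the
 * run instead of panicking the kernel.
 */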
bch_err(c, "update error in test_delete: %i", ret); + goto err; + } pr_info("deleting once"); ret = bch2_btree_delete_at(&trans, iter, 0); - BUG_ON(ret); + if (ret) { + bch_err(c, "delete error (first) in test_delete: %i", ret); + goto err; + } pr_info("deleting twice"); ret = bch2_btree_delete_at(&trans, iter, 0); - BUG_ON(ret); - + if (ret) { + bch_err(c, "delete error (second) in test_delete: %i", ret); + goto err; + } +err: bch2_trans_exit(&trans); + return ret; } -static void test_delete_written(struct bch_fs *c, u64 nr) +static int test_delete_written(struct bch_fs *c, u64 nr) { struct btree_trans trans; struct btree_iter *iter; @@ -73,27 +86,37 @@ static void test_delete_written(struct bch_fs *c, u64 nr) BTREE_ITER_INTENT); ret = bch2_btree_iter_traverse(iter); - BUG_ON(ret); + if (ret) { + bch_err(c, "lookup error in test_delete_written: %i", ret); + goto err; + } ret = __bch2_trans_do(&trans, NULL, NULL, 0, bch2_trans_update(&trans, iter, &k.k_i, 0)); - BUG_ON(ret); + if (ret) { + bch_err(c, "update error in test_delete_written: %i", ret); + goto err; + } bch2_journal_flush_all_pins(&c->journal); ret = bch2_btree_delete_at(&trans, iter, 0); - BUG_ON(ret); - + if (ret) { + bch_err(c, "delete error in test_delete_written: %i", ret); + goto err; + } +err: bch2_trans_exit(&trans); + return ret; } -static void test_iterate(struct bch_fs *c, u64 nr) +static int test_iterate(struct bch_fs *c, u64 nr) { struct btree_trans trans; struct btree_iter *iter; struct bkey_s_c k; u64 i; - int ret; + int ret = 0; bch2_trans_init(&trans, c, 0, 0); @@ -109,7 +132,10 @@ static void test_iterate(struct bch_fs *c, u64 nr) ret = bch2_btree_insert(c, BTREE_ID_XATTRS, &k.k_i, NULL, NULL, 0); - BUG_ON(ret); + if (ret) { + bch_err(c, "insert error in test_iterate: %i", ret); + goto err; + } } pr_info("iterating forwards"); @@ -132,17 +158,18 @@ static void test_iterate(struct bch_fs *c, u64 nr) BUG_ON(k.k->p.offset != --i); BUG_ON(i); - +err: bch2_trans_exit(&trans); + return ret; } -static void test_iterate_extents(struct bch_fs *c, u64 nr) +static int test_iterate_extents(struct bch_fs *c, u64 nr) { struct btree_trans trans; struct btree_iter *iter; struct bkey_s_c k; u64 i; - int ret; + int ret = 0; bch2_trans_init(&trans, c, 0, 0); @@ -159,7 +186,10 @@ static void test_iterate_extents(struct bch_fs *c, u64 nr) ret = bch2_btree_insert(c, BTREE_ID_EXTENTS, &k.k_i, NULL, NULL, 0); - BUG_ON(ret); + if (ret) { + bch_err(c, "insert error in test_iterate_extents: %i", ret); + goto err; + } } pr_info("iterating forwards"); @@ -182,17 +212,18 @@ static void test_iterate_extents(struct bch_fs *c, u64 nr) } BUG_ON(i); - +err: bch2_trans_exit(&trans); + return ret; } -static void test_iterate_slots(struct bch_fs *c, u64 nr) +static int test_iterate_slots(struct bch_fs *c, u64 nr) { struct btree_trans trans; struct btree_iter *iter; struct bkey_s_c k; u64 i; - int ret; + int ret = 0; bch2_trans_init(&trans, c, 0, 0); @@ -208,7 +239,10 @@ static void test_iterate_slots(struct bch_fs *c, u64 nr) ret = bch2_btree_insert(c, BTREE_ID_XATTRS, &k.k_i, NULL, NULL, 0); - BUG_ON(ret); + if (ret) { + bch_err(c, "insert error in test_iterate_slots: %i", ret); + goto err; + } } pr_info("iterating forwards"); @@ -240,17 +274,18 @@ static void test_iterate_slots(struct bch_fs *c, u64 nr) if (i == nr * 2) break; } - +err: bch2_trans_exit(&trans); + return ret; } -static void test_iterate_slots_extents(struct bch_fs *c, u64 nr) +static int test_iterate_slots_extents(struct bch_fs *c, u64 nr) { struct btree_trans trans; struct 
btree_iter *iter; struct bkey_s_c k; u64 i; - int ret; + int ret = 0; bch2_trans_init(&trans, c, 0, 0); @@ -267,7 +302,10 @@ static void test_iterate_slots_extents(struct bch_fs *c, u64 nr) ret = bch2_btree_insert(c, BTREE_ID_EXTENTS, &k.k_i, NULL, NULL, 0); - BUG_ON(ret); + if (ret) { + bch_err(c, "insert error in test_iterate_slots_extents: %i", ret); + goto err; + } } pr_info("iterating forwards"); @@ -299,15 +337,16 @@ static void test_iterate_slots_extents(struct bch_fs *c, u64 nr) if (i == nr) break; } - +err: bch2_trans_exit(&trans); + return 0; } /* * XXX: we really want to make sure we've got a btree with depth > 0 for these * tests */ -static void test_peek_end(struct bch_fs *c, u64 nr) +static int test_peek_end(struct bch_fs *c, u64 nr) { struct btree_trans trans; struct btree_iter *iter; @@ -324,9 +363,10 @@ static void test_peek_end(struct bch_fs *c, u64 nr) BUG_ON(k.k); bch2_trans_exit(&trans); + return 0; } -static void test_peek_end_extents(struct bch_fs *c, u64 nr) +static int test_peek_end_extents(struct bch_fs *c, u64 nr) { struct btree_trans trans; struct btree_iter *iter; @@ -343,14 +383,15 @@ static void test_peek_end_extents(struct bch_fs *c, u64 nr) BUG_ON(k.k); bch2_trans_exit(&trans); + return 0; } /* extent unit tests */ u64 test_version; -static void insert_test_extent(struct bch_fs *c, - u64 start, u64 end) +static int insert_test_extent(struct bch_fs *c, + u64 start, u64 end) { struct bkey_i_cookie k; int ret; @@ -364,42 +405,47 @@ static void insert_test_extent(struct bch_fs *c, ret = bch2_btree_insert(c, BTREE_ID_EXTENTS, &k.k_i, NULL, NULL, 0); - BUG_ON(ret); + if (ret) + bch_err(c, "insert error in insert_test_extent: %i", ret); + return ret; } -static void __test_extent_overwrite(struct bch_fs *c, +static int __test_extent_overwrite(struct bch_fs *c, u64 e1_start, u64 e1_end, u64 e2_start, u64 e2_end) { - insert_test_extent(c, e1_start, e1_end); - insert_test_extent(c, e2_start, e2_end); + int ret; + + ret = insert_test_extent(c, e1_start, e1_end) ?: + insert_test_extent(c, e2_start, e2_end); delete_test_keys(c); + return ret; } -static void test_extent_overwrite_front(struct bch_fs *c, u64 nr) +static int test_extent_overwrite_front(struct bch_fs *c, u64 nr) { - __test_extent_overwrite(c, 0, 64, 0, 32); - __test_extent_overwrite(c, 8, 64, 0, 32); + return __test_extent_overwrite(c, 0, 64, 0, 32) ?: + __test_extent_overwrite(c, 8, 64, 0, 32); } -static void test_extent_overwrite_back(struct bch_fs *c, u64 nr) +static int test_extent_overwrite_back(struct bch_fs *c, u64 nr) { - __test_extent_overwrite(c, 0, 64, 32, 64); - __test_extent_overwrite(c, 0, 64, 32, 72); + return __test_extent_overwrite(c, 0, 64, 32, 64) ?: + __test_extent_overwrite(c, 0, 64, 32, 72); } -static void test_extent_overwrite_middle(struct bch_fs *c, u64 nr) +static int test_extent_overwrite_middle(struct bch_fs *c, u64 nr) { - __test_extent_overwrite(c, 0, 64, 32, 40); + return __test_extent_overwrite(c, 0, 64, 32, 40); } -static void test_extent_overwrite_all(struct bch_fs *c, u64 nr) +static int test_extent_overwrite_all(struct bch_fs *c, u64 nr) { - __test_extent_overwrite(c, 32, 64, 0, 64); - __test_extent_overwrite(c, 32, 64, 0, 128); - __test_extent_overwrite(c, 32, 64, 32, 64); - __test_extent_overwrite(c, 32, 64, 32, 128); + return __test_extent_overwrite(c, 32, 64, 0, 64) ?: + __test_extent_overwrite(c, 32, 64, 0, 128) ?: + __test_extent_overwrite(c, 32, 64, 32, 64) ?: + __test_extent_overwrite(c, 32, 64, 32, 128); } /* perf tests */ @@ -415,11 +461,11 @@ static u64 
test_rand(void) return v; } -static void rand_insert(struct bch_fs *c, u64 nr) +static int rand_insert(struct bch_fs *c, u64 nr) { struct btree_trans trans; struct bkey_i_cookie k; - int ret; + int ret = 0; u64 i; bch2_trans_init(&trans, c, 0, 0); @@ -430,48 +476,63 @@ static void rand_insert(struct bch_fs *c, u64 nr) ret = __bch2_trans_do(&trans, NULL, NULL, 0, __bch2_btree_insert(&trans, BTREE_ID_XATTRS, &k.k_i)); - - BUG_ON(ret); + if (ret) { + bch_err(c, "error in rand_insert: %i", ret); + break; + } } bch2_trans_exit(&trans); + return ret; } -static void rand_lookup(struct bch_fs *c, u64 nr) +static int rand_lookup(struct bch_fs *c, u64 nr) { struct btree_trans trans; struct btree_iter *iter; struct bkey_s_c k; + int ret = 0; u64 i; bch2_trans_init(&trans, c, 0, 0); + iter = bch2_trans_get_iter(&trans, BTREE_ID_XATTRS, POS_MIN, 0); for (i = 0; i < nr; i++) { - iter = bch2_trans_get_iter(&trans, BTREE_ID_XATTRS, - POS(0, test_rand()), 0); + bch2_btree_iter_set_pos(iter, POS(0, test_rand())); k = bch2_btree_iter_peek(iter); - bch2_trans_iter_free(&trans, iter); + ret = bkey_err(k); + if (ret) { + bch_err(c, "error in rand_lookup: %i", ret); + break; + } } + bch2_trans_iter_free(&trans, iter); bch2_trans_exit(&trans); + return ret; } -static void rand_mixed(struct bch_fs *c, u64 nr) +static int rand_mixed(struct bch_fs *c, u64 nr) { struct btree_trans trans; struct btree_iter *iter; struct bkey_s_c k; - int ret; + int ret = 0; u64 i; bch2_trans_init(&trans, c, 0, 0); + iter = bch2_trans_get_iter(&trans, BTREE_ID_XATTRS, POS_MIN, 0); for (i = 0; i < nr; i++) { - iter = bch2_trans_get_iter(&trans, BTREE_ID_XATTRS, - POS(0, test_rand()), 0); + bch2_btree_iter_set_pos(iter, POS(0, test_rand())); k = bch2_btree_iter_peek(iter); + ret = bkey_err(k); + if (ret) { + bch_err(c, "lookup error in rand_mixed: %i", ret); + break; + } if (!(i & 3) && k.k) { struct bkey_i_cookie k; @@ -481,14 +542,16 @@ static void rand_mixed(struct bch_fs *c, u64 nr) ret = __bch2_trans_do(&trans, NULL, NULL, 0, bch2_trans_update(&trans, iter, &k.k_i, 0)); - - BUG_ON(ret); + if (ret) { + bch_err(c, "update error in rand_mixed: %i", ret); + break; + } } - - bch2_trans_iter_free(&trans, iter); } + bch2_trans_iter_free(&trans, iter); bch2_trans_exit(&trans); + return ret; } static int __do_delete(struct btree_trans *trans, struct bpos pos) @@ -500,15 +563,14 @@ static int __do_delete(struct btree_trans *trans, struct bpos pos) iter = bch2_trans_get_iter(trans, BTREE_ID_XATTRS, pos, BTREE_ITER_INTENT); - ret = PTR_ERR_OR_ZERO(iter); - if (ret) - goto err; - k = bch2_btree_iter_peek(iter); ret = bkey_err(k); if (ret) goto err; + if (!k.k) + goto err; + bkey_init(&delete.k); delete.k.p = k.k->p; @@ -518,10 +580,10 @@ err: return ret; } -static void rand_delete(struct bch_fs *c, u64 nr) +static int rand_delete(struct bch_fs *c, u64 nr) { struct btree_trans trans; - int ret; + int ret = 0; u64 i; bch2_trans_init(&trans, c, 0, 0); @@ -531,19 +593,23 @@ static void rand_delete(struct bch_fs *c, u64 nr) ret = __bch2_trans_do(&trans, NULL, NULL, 0, __do_delete(&trans, pos)); - BUG_ON(ret); + if (ret) { + bch_err(c, "error in rand_delete: %i", ret); + break; + } } bch2_trans_exit(&trans); + return ret; } -static void seq_insert(struct bch_fs *c, u64 nr) +static int seq_insert(struct bch_fs *c, u64 nr) { struct btree_trans trans; struct btree_iter *iter; struct bkey_s_c k; struct bkey_i_cookie insert; - int ret; + int ret = 0; u64 i = 0; bkey_cookie_init(&insert.k_i); @@ -556,35 +622,39 @@ static void seq_insert(struct bch_fs *c, 
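/*
 * Design note on rand_lookup()/rand_mixed() above: instead of
 * allocating and freeing an iterator on every loop iteration, one
 * iterator is taken before the loop and repositioned with
 * bch2_btree_iter_set_pos(). The perf numbers then measure lookups and
 * updates rather than iterator churn.
 */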
u64 nr) ret = __bch2_trans_do(&trans, NULL, NULL, 0, bch2_trans_update(&trans, iter, &insert.k_i, 0)); - - BUG_ON(ret); + if (ret) { + bch_err(c, "error in seq_insert: %i", ret); + break; + } if (++i == nr) break; } bch2_trans_exit(&trans); + return ret; } -static void seq_lookup(struct bch_fs *c, u64 nr) +static int seq_lookup(struct bch_fs *c, u64 nr) { struct btree_trans trans; struct btree_iter *iter; struct bkey_s_c k; - int ret; + int ret = 0; bch2_trans_init(&trans, c, 0, 0); for_each_btree_key(&trans, iter, BTREE_ID_XATTRS, POS_MIN, 0, k, ret) ; bch2_trans_exit(&trans); + return ret; } -static void seq_overwrite(struct bch_fs *c, u64 nr) +static int seq_overwrite(struct bch_fs *c, u64 nr) { struct btree_trans trans; struct btree_iter *iter; struct bkey_s_c k; - int ret; + int ret = 0; bch2_trans_init(&trans, c, 0, 0); @@ -596,23 +666,28 @@ static void seq_overwrite(struct bch_fs *c, u64 nr) ret = __bch2_trans_do(&trans, NULL, NULL, 0, bch2_trans_update(&trans, iter, &u.k_i, 0)); - - BUG_ON(ret); + if (ret) { + bch_err(c, "error in seq_overwrite: %i", ret); + break; + } } bch2_trans_exit(&trans); + return ret; } -static void seq_delete(struct bch_fs *c, u64 nr) +static int seq_delete(struct bch_fs *c, u64 nr) { int ret; ret = bch2_btree_delete_range(c, BTREE_ID_XATTRS, POS(0, 0), POS(0, U64_MAX), NULL); - BUG_ON(ret); + if (ret) + bch_err(c, "error in seq_delete: %i", ret); + return ret; } -typedef void (*perf_test_fn)(struct bch_fs *, u64); +typedef int (*perf_test_fn)(struct bch_fs *, u64); struct test_job { struct bch_fs *c; @@ -628,11 +703,13 @@ struct test_job { u64 start; u64 finish; + int ret; }; static int btree_perf_test_thread(void *data) { struct test_job *j = data; + int ret; if (atomic_dec_and_test(&j->ready)) { wake_up(&j->ready_wait); @@ -641,7 +718,9 @@ static int btree_perf_test_thread(void *data) wait_event(j->ready_wait, !atomic_read(&j->ready)); } - j->fn(j->c, j->nr / j->nr_threads); + ret = j->fn(j->c, j->nr / j->nr_threads); + if (ret) + j->ret = ret; if (atomic_dec_and_test(&j->done)) { j->finish = sched_clock(); @@ -651,8 +730,8 @@ static int btree_perf_test_thread(void *data) return 0; } -void bch2_btree_perf_test(struct bch_fs *c, const char *testname, - u64 nr, unsigned nr_threads) +int bch2_btree_perf_test(struct bch_fs *c, const char *testname, + u64 nr, unsigned nr_threads) { struct test_job j = { .c = c, .nr = nr, .nr_threads = nr_threads }; char name_buf[20], nr_buf[20], per_sec_buf[20]; @@ -695,7 +774,7 @@ void bch2_btree_perf_test(struct bch_fs *c, const char *testname, if (!j.fn) { pr_err("unknown test %s", testname); - return; + return -EINVAL; } //pr_info("running test %s:", testname); @@ -720,6 +799,7 @@ void bch2_btree_perf_test(struct bch_fs *c, const char *testname, time / NSEC_PER_SEC, time * nr_threads / nr, per_sec_buf); + return j.ret; } #endif /* CONFIG_BCACHEFS_TESTS */ diff --git a/fs/bcachefs/tests.h b/fs/bcachefs/tests.h index 551d0764225e..c73b18aea7e0 100644 --- a/fs/bcachefs/tests.h +++ b/fs/bcachefs/tests.h @@ -6,7 +6,7 @@ struct bch_fs; #ifdef CONFIG_BCACHEFS_TESTS -void bch2_btree_perf_test(struct bch_fs *, const char *, u64, unsigned); +int bch2_btree_perf_test(struct bch_fs *, const char *, u64, unsigned); #else diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h index e8a7df61ff5c..6e5335440b4b 100644 --- a/fs/bcachefs/util.h +++ b/fs/bcachefs/util.h @@ -88,7 +88,7 @@ static inline void *vpmalloc(size_t size, gfp_t gfp_mask) { return (void *) __get_free_pages(gfp_mask|__GFP_NOWARN, get_order(size)) ?: - __vmalloc(size, 
gfp_mask, PAGE_KERNEL); + __vmalloc(size, gfp_mask); } static inline void kvpfree(void *p, size_t size) @@ -653,35 +653,6 @@ static inline void memset_u64s_tail(void *s, int c, unsigned bytes) memset(s + bytes, c, rem); } -static inline struct bio_vec next_contig_bvec(struct bio *bio, - struct bvec_iter *iter) -{ - struct bio_vec bv = bio_iter_iovec(bio, *iter); - - bio_advance_iter(bio, iter, bv.bv_len); -#ifndef CONFIG_HIGHMEM - while (iter->bi_size) { - struct bio_vec next = bio_iter_iovec(bio, *iter); - - if (page_address(bv.bv_page) + bv.bv_offset + bv.bv_len != - page_address(next.bv_page) + next.bv_offset) - break; - - bv.bv_len += next.bv_len; - bio_advance_iter(bio, iter, next.bv_len); - } -#endif - return bv; -} - -#define __bio_for_each_contig_segment(bv, bio, iter, start) \ - for (iter = (start); \ - (iter).bi_size && \ - ((bv = next_contig_bvec((bio), &(iter))), 1);) - -#define bio_for_each_contig_segment(bv, bio, iter) \ - __bio_for_each_contig_segment(bv, bio, iter, (bio)->bi_iter) - void sort_cmp_size(void *base, size_t num, size_t size, int (*cmp_func)(const void *, const void *, size_t), void (*swap_func)(void *, void *, size_t)); diff --git a/include/trace/events/bcachefs.h b/include/trace/events/bcachefs.h index ba2c55559796..d4cb7a298cc2 100644 --- a/include/trace/events/bcachefs.h +++ b/include/trace/events/bcachefs.h @@ -121,6 +121,65 @@ DEFINE_EVENT(bio, journal_write, TP_ARGS(bio) ); +TRACE_EVENT(journal_reclaim_start, + TP_PROTO(struct bch_fs *c, u64 min_nr, + u64 prereserved, u64 prereserved_total, + u64 btree_cache_dirty, u64 btree_cache_total, + u64 btree_key_cache_dirty, u64 btree_key_cache_total), + TP_ARGS(c, min_nr, prereserved, prereserved_total, + btree_cache_dirty, btree_cache_total, + btree_key_cache_dirty, btree_key_cache_total), + + TP_STRUCT__entry( + __array(char, uuid, 16 ) + __field(u64, min_nr ) + __field(u64, prereserved ) + __field(u64, prereserved_total ) + __field(u64, btree_cache_dirty ) + __field(u64, btree_cache_total ) + __field(u64, btree_key_cache_dirty ) + __field(u64, btree_key_cache_total ) + ), + + TP_fast_assign( + memcpy(__entry->uuid, c->sb.user_uuid.b, 16); + __entry->min_nr = min_nr; + __entry->prereserved = prereserved; + __entry->prereserved_total = prereserved_total; + __entry->btree_cache_dirty = btree_cache_dirty; + __entry->btree_cache_total = btree_cache_total; + __entry->btree_key_cache_dirty = btree_key_cache_dirty; + __entry->btree_key_cache_total = btree_key_cache_total; + ), + + TP_printk("%pU min %llu prereserved %llu/%llu btree cache %llu/%llu key cache %llu/%llu", + __entry->uuid, + __entry->min_nr, + __entry->prereserved, + __entry->prereserved_total, + __entry->btree_cache_dirty, + __entry->btree_cache_total, + __entry->btree_key_cache_dirty, + __entry->btree_key_cache_total) +); + +TRACE_EVENT(journal_reclaim_finish, + TP_PROTO(struct bch_fs *c, u64 nr_flushed), + TP_ARGS(c, nr_flushed), + + TP_STRUCT__entry( + __array(char, uuid, 16 ) + __field(u64, nr_flushed ) + ), + + TP_fast_assign( + memcpy(__entry->uuid, c->sb.user_uuid.b, 16); + __entry->nr_flushed = nr_flushed; + ), + + TP_printk("%pU flushed %llu", __entry->uuid, __entry->nr_flushed) +); + /* bset.c: */ DEFINE_EVENT(bpos, bkey_pack_pos_fail, @@ -513,7 +572,7 @@ TRACE_EVENT(transaction_restart_ip, __entry->ip = ip; ), - TP_printk("%pF %pF", (void *) __entry->caller, (void *) __entry->ip) + TP_printk("%ps %pS", (void *) __entry->caller, (void *) __entry->ip) ); DECLARE_EVENT_CLASS(transaction_restart, @@ -528,7 +587,7 @@ 
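/*
 * Two pieces of upstream API churn handled above and below: __vmalloc()
 * lost its pgprot argument when mainline removed it (all callers used
 * PAGE_KERNEL), and the %pf/%pF printk specifiers were retired in
 * favour of %ps/%pS, which resolve a plain code address to a symbol
 * name - %pS including the offset, e.g. "bch2_journal_reclaim+0x42".
 */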
DECLARE_EVENT_CLASS(transaction_restart, __entry->ip = ip; ), - TP_printk("%pf", (void *) __entry->ip) + TP_printk("%ps", (void *) __entry->ip) ); DEFINE_EVENT(transaction_restart, trans_restart_btree_node_reused, @@ -568,7 +627,7 @@ TRACE_EVENT(trans_restart_would_deadlock, __entry->want_iter_type = want_iter_type; ), - TP_printk("%pF %pF because %u have %u:%u want %u:%u", + TP_printk("%ps %pS because %u have %u:%u want %u:%u", (void *) __entry->trans_ip, (void *) __entry->caller_ip, __entry->reason, @@ -592,7 +651,7 @@ TRACE_EVENT(trans_restart_iters_realloced, __entry->nr = nr; ), - TP_printk("%pf nr %u", (void *) __entry->ip, __entry->nr) + TP_printk("%ps nr %u", (void *) __entry->ip, __entry->nr) ); TRACE_EVENT(trans_restart_mem_realloced, @@ -609,7 +668,7 @@ TRACE_EVENT(trans_restart_mem_realloced, __entry->bytes = bytes; ), - TP_printk("%pf bytes %lu", (void *) __entry->ip, __entry->bytes) + TP_printk("%ps bytes %lu", (void *) __entry->ip, __entry->bytes) ); DEFINE_EVENT(transaction_restart, trans_restart_journal_res_get, @@ -622,6 +681,11 @@ DEFINE_EVENT(transaction_restart, trans_restart_journal_preres_get, TP_ARGS(ip) ); +DEFINE_EVENT(transaction_restart, trans_restart_journal_reclaim, + TP_PROTO(unsigned long ip), + TP_ARGS(ip) +); + DEFINE_EVENT(transaction_restart, trans_restart_mark_replicas, TP_PROTO(unsigned long ip), TP_ARGS(ip) @@ -657,11 +721,6 @@ DEFINE_EVENT(transaction_restart, trans_restart_traverse, TP_ARGS(ip) ); -DEFINE_EVENT(transaction_restart, trans_restart_atomic, - TP_PROTO(unsigned long ip), - TP_ARGS(ip) -); - DECLARE_EVENT_CLASS(node_lock_fail, TP_PROTO(unsigned level, u32 iter_seq, unsigned node, u32 node_seq), TP_ARGS(level, iter_seq, node, node_seq),