diff options
author | Kent Overstreet <kent.overstreet@gmail.com> | 2020-07-27 14:24:31 -0400 |
---|---|---|
committer | Kent Overstreet <kent.overstreet@gmail.com> | 2020-07-27 14:29:49 -0400 |
commit | 014287bbf6f0046d4092f20be2ff0c3385e5df3e (patch) | |
tree | f3613171cfd567b012d6d2252a85d22157c4d09a | |
parent | 0511e1ea598d18d603d4478030c1e5893d5b2598 (diff) |
Merge with 6288f1b609 bcachefs: Convert various code to printbuf
59 files changed, 1546 insertions, 1583 deletions
diff --git a/block/blk-core.c b/block/blk-core.c index ea33d6abdcfc..f3e31bab4f9d 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -249,6 +249,16 @@ int blk_status_to_errno(blk_status_t status) } EXPORT_SYMBOL_GPL(blk_status_to_errno); +const char *blk_status_to_str(blk_status_t status) +{ + int idx = (__force int)status; + + if (WARN_ON_ONCE(idx >= ARRAY_SIZE(blk_errors))) + return "(invalid error)"; + return blk_errors[idx].name; +} +EXPORT_SYMBOL_GPL(blk_status_to_str); + static void print_req_error(struct request *req, blk_status_t status) { int idx = (__force int)status; diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index cb720ee04b86..43b9f99194b9 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -41,29 +41,26 @@ static void pd_controllers_update(struct work_struct *work) struct bch_fs, pd_controllers_update); struct bch_dev *ca; + s64 free = 0, fragmented = 0; unsigned i; for_each_member_device(ca, c, i) { - struct bch_dev_usage stats = bch2_dev_usage_read(c, ca); + struct bch_dev_usage stats = bch2_dev_usage_read(ca); - u64 free = bucket_to_sector(ca, + free += bucket_to_sector(ca, __dev_buckets_free(ca, stats)) << 9; /* * Bytes of internal fragmentation, which can be * reclaimed by copy GC */ - s64 fragmented = (bucket_to_sector(ca, - stats.buckets[BCH_DATA_USER] + - stats.buckets[BCH_DATA_CACHED]) - - (stats.sectors[BCH_DATA_USER] + - stats.sectors[BCH_DATA_CACHED])) << 9; - - fragmented = max(0LL, fragmented); - - bch2_pd_controller_update(&ca->copygc_pd, - free, fragmented, -1); + fragmented += max_t(s64, 0, (bucket_to_sector(ca, + stats.buckets[BCH_DATA_user] + + stats.buckets[BCH_DATA_cached]) - + (stats.sectors[BCH_DATA_user] + + stats.sectors[BCH_DATA_cached])) << 9); } + bch2_pd_controller_update(&c->copygc_pd, free, fragmented, -1); schedule_delayed_work(&c->pd_controllers_update, c->pd_controllers_update_seconds * HZ); } @@ -517,11 +514,13 @@ static int 
wait_buckets_available(struct bch_fs *c, struct bch_dev *ca) if (gc_count != c->gc_count) ca->inc_gen_really_needs_gc = 0; - available = max_t(s64, 0, dev_buckets_available(c, ca) - + available = max_t(s64, 0, dev_buckets_available(ca) - ca->inc_gen_really_needs_gc); if (available > fifo_free(&ca->free_inc) || - (available && !fifo_full(&ca->free[RESERVE_BTREE]))) + (available && + (!fifo_full(&ca->free[RESERVE_BTREE]) || + !fifo_full(&ca->free[RESERVE_MOVINGGC])))) break; up_read(&c->gc_lock); @@ -1191,7 +1190,7 @@ stop: void bch2_recalc_capacity(struct bch_fs *c) { struct bch_dev *ca; - u64 capacity = 0, reserved_sectors = 0, gc_reserve; + u64 capacity = 0, reserved_sectors = 0, gc_reserve, copygc_threshold = 0; unsigned bucket_size_max = 0; unsigned long ra_pages = 0; unsigned i, j; @@ -1234,7 +1233,7 @@ void bch2_recalc_capacity(struct bch_fs *c) dev_reserve *= ca->mi.bucket_size; - ca->copygc_threshold = dev_reserve; + copygc_threshold += dev_reserve; capacity += bucket_to_sector(ca, ca->mi.nbuckets - ca->mi.first_bucket); @@ -1253,6 +1252,7 @@ void bch2_recalc_capacity(struct bch_fs *c) reserved_sectors = min(reserved_sectors, capacity); + c->copygc_threshold = copygc_threshold; c->capacity = capacity - reserved_sectors; c->bucket_size_max = bucket_size_max; @@ -1312,7 +1312,7 @@ void bch2_dev_allocator_remove(struct bch_fs *c, struct bch_dev *ca) for (i = 0; i < ARRAY_SIZE(c->write_points); i++) bch2_writepoint_stop(c, ca, &c->write_points[i]); - bch2_writepoint_stop(c, ca, &ca->copygc_write_point); + bch2_writepoint_stop(c, ca, &c->copygc_write_point); bch2_writepoint_stop(c, ca, &c->rebalance_write_point); bch2_writepoint_stop(c, ca, &c->btree_write_point); diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index 979aba30bc9d..4a048828869b 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -70,12 +70,6 @@ #include <linux/rcupdate.h> #include <trace/events/bcachefs.h> -enum bucket_alloc_ret { - 
ALLOC_SUCCESS, - OPEN_BUCKETS_EMPTY, - FREELIST_EMPTY, /* Allocator thread not keeping up */ -}; - /* * Open buckets represent a bucket that's currently being allocated from. They * serve two purposes: @@ -150,12 +144,13 @@ static struct open_bucket *bch2_open_bucket_alloc(struct bch_fs *c) } static void open_bucket_free_unused(struct bch_fs *c, - struct open_bucket *ob, - bool may_realloc) + struct write_point *wp, + struct open_bucket *ob) { struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev); + bool may_realloc = wp->type == BCH_DATA_user; - BUG_ON(ca->open_buckets_partial_nr >= + BUG_ON(ca->open_buckets_partial_nr > ARRAY_SIZE(ca->open_buckets_partial)); if (ca->open_buckets_partial_nr < @@ -234,13 +229,22 @@ struct open_bucket *bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca, spin_lock(&c->freelist_lock); - if (may_alloc_partial && - ca->open_buckets_partial_nr) { - ob = c->open_buckets + - ca->open_buckets_partial[--ca->open_buckets_partial_nr]; - ob->on_partial_list = false; - spin_unlock(&c->freelist_lock); - return ob; + if (may_alloc_partial) { + int i; + + for (i = ca->open_buckets_partial_nr - 1; i >= 0; --i) { + ob = c->open_buckets + ca->open_buckets_partial[i]; + + if (reserve <= ob->alloc_reserve) { + array_remove_item(ca->open_buckets_partial, + ca->open_buckets_partial_nr, + i); + ob->on_partial_list = false; + ob->alloc_reserve = reserve; + spin_unlock(&c->freelist_lock); + return ob; + } + } } if (unlikely(c->open_buckets_nr_free <= open_buckets_reserved(reserve))) { @@ -297,6 +301,7 @@ out: ob->valid = true; ob->sectors_free = ca->mi.bucket_size; + ob->alloc_reserve = reserve; ob->ptr = (struct bch_extent_ptr) { .type = 1 << BCH_EXTENT_ENTRY_ptr, .gen = buckets->b[bucket].mark.gen, @@ -344,21 +349,20 @@ struct dev_alloc_list bch2_dev_alloc_list(struct bch_fs *c, struct bch_devs_mask *devs) { struct dev_alloc_list ret = { .nr = 0 }; - struct bch_dev *ca; unsigned i; - for_each_member_device_rcu(ca, c, i, devs) + for_each_set_bit(i, 
devs->d, BCH_SB_MEMBERS_MAX) ret.devs[ret.nr++] = i; bubble_sort(ret.devs, ret.nr, dev_stripe_cmp); return ret; } -void bch2_dev_stripe_increment(struct bch_fs *c, struct bch_dev *ca, +void bch2_dev_stripe_increment(struct bch_dev *ca, struct dev_stripe_state *stripe) { u64 *v = stripe->next_alloc + ca->dev_idx; - u64 free_space = dev_buckets_free(c, ca); + u64 free_space = dev_buckets_free(ca); u64 free_space_inv = free_space ? div64_u64(1ULL << 48, free_space) : 1ULL << 48; @@ -396,21 +400,22 @@ static void add_new_bucket(struct bch_fs *c, ob_push(c, ptrs, ob); } -static int bch2_bucket_alloc_set(struct bch_fs *c, - struct open_buckets *ptrs, - struct dev_stripe_state *stripe, - struct bch_devs_mask *devs_may_alloc, - unsigned nr_replicas, - unsigned *nr_effective, - bool *have_cache, - enum alloc_reserve reserve, - unsigned flags, - struct closure *cl) +enum bucket_alloc_ret +bch2_bucket_alloc_set(struct bch_fs *c, + struct open_buckets *ptrs, + struct dev_stripe_state *stripe, + struct bch_devs_mask *devs_may_alloc, + unsigned nr_replicas, + unsigned *nr_effective, + bool *have_cache, + enum alloc_reserve reserve, + unsigned flags, + struct closure *cl) { struct dev_alloc_list devs_sorted = bch2_dev_alloc_list(c, stripe, devs_may_alloc); struct bch_dev *ca; - bool alloc_failure = false; + enum bucket_alloc_ret ret = INSUFFICIENT_DEVICES; unsigned i; BUG_ON(*nr_effective >= nr_replicas); @@ -428,102 +433,28 @@ static int bch2_bucket_alloc_set(struct bch_fs *c, ob = bch2_bucket_alloc(c, ca, reserve, flags & BUCKET_MAY_ALLOC_PARTIAL, cl); if (IS_ERR(ob)) { - enum bucket_alloc_ret ret = -PTR_ERR(ob); - - WARN_ON(reserve == RESERVE_MOVINGGC && - ret != OPEN_BUCKETS_EMPTY); + ret = -PTR_ERR(ob); if (cl) - return -EAGAIN; - if (ret == OPEN_BUCKETS_EMPTY) - return -ENOSPC; - alloc_failure = true; + return ret; continue; } add_new_bucket(c, ptrs, devs_may_alloc, nr_effective, have_cache, flags, ob); - bch2_dev_stripe_increment(c, ca, stripe); + 
bch2_dev_stripe_increment(ca, stripe); if (*nr_effective >= nr_replicas) - return 0; + return ALLOC_SUCCESS; } - return alloc_failure ? -ENOSPC : -EROFS; + return ret; } /* Allocate from stripes: */ /* - * XXX: use a higher watermark for allocating open buckets here: - */ -static int ec_stripe_alloc(struct bch_fs *c, struct ec_stripe_head *h) -{ - struct bch_devs_mask devs; - struct open_bucket *ob; - unsigned i, nr_have = 0, nr_data = - min_t(unsigned, h->nr_active_devs, - EC_STRIPE_MAX) - h->redundancy; - bool have_cache = true; - int ret = 0; - - BUG_ON(h->blocks.nr > nr_data); - BUG_ON(h->parity.nr > h->redundancy); - - devs = h->devs; - - open_bucket_for_each(c, &h->parity, ob, i) - __clear_bit(ob->ptr.dev, devs.d); - open_bucket_for_each(c, &h->blocks, ob, i) - __clear_bit(ob->ptr.dev, devs.d); - - percpu_down_read(&c->mark_lock); - rcu_read_lock(); - - if (h->parity.nr < h->redundancy) { - nr_have = h->parity.nr; - - ret = bch2_bucket_alloc_set(c, &h->parity, - &h->parity_stripe, - &devs, - h->redundancy, - &nr_have, - &have_cache, - RESERVE_NONE, - 0, - NULL); - if (ret) - goto err; - } - - if (h->blocks.nr < nr_data) { - nr_have = h->blocks.nr; - - ret = bch2_bucket_alloc_set(c, &h->blocks, - &h->block_stripe, - &devs, - nr_data, - &nr_have, - &have_cache, - RESERVE_NONE, - 0, - NULL); - if (ret) - goto err; - } - - rcu_read_unlock(); - percpu_up_read(&c->mark_lock); - - return bch2_ec_stripe_new_alloc(c, h); -err: - rcu_read_unlock(); - percpu_up_read(&c->mark_lock); - return -1; -} - -/* * if we can't allocate a new stripe because there are already too many * partially filled stripes, force allocating from an existing stripe even when * it's to a device we don't want: @@ -555,34 +486,30 @@ static void bucket_alloc_from_stripe(struct bch_fs *c, if (ec_open_bucket(c, ptrs)) return; - h = bch2_ec_stripe_head_get(c, target, erasure_code, nr_replicas - 1); + h = bch2_ec_stripe_head_get(c, target, 0, nr_replicas - 1); if (!h) return; - if (!h->s && 
ec_stripe_alloc(c, h)) - goto out_put_head; - - rcu_read_lock(); devs_sorted = bch2_dev_alloc_list(c, &wp->stripe, devs_may_alloc); - rcu_read_unlock(); for (i = 0; i < devs_sorted.nr; i++) open_bucket_for_each(c, &h->s->blocks, ob, ec_idx) if (ob->ptr.dev == devs_sorted.devs[i] && - !test_and_set_bit(ec_idx, h->s->blocks_allocated)) + !test_and_set_bit(h->s->data_block_idx[ec_idx], + h->s->blocks_allocated)) goto got_bucket; goto out_put_head; got_bucket: ca = bch_dev_bkey_exists(c, ob->ptr.dev); - ob->ec_idx = ec_idx; + ob->ec_idx = h->s->data_block_idx[ec_idx]; ob->ec = h->s; add_new_bucket(c, ptrs, devs_may_alloc, nr_effective, have_cache, flags, ob); atomic_inc(&h->s->pin); out_put_head: - bch2_ec_stripe_head_put(h); + bch2_ec_stripe_head_put(c, h); } /* Sector allocator */ @@ -607,7 +534,7 @@ static void get_buckets_from_writepoint(struct bch_fs *c, if (*nr_effective < nr_replicas && test_bit(ob->ptr.dev, devs_may_alloc->d) && (ca->mi.durability || - (wp->type == BCH_DATA_USER && !*have_cache)) && + (wp->type == BCH_DATA_user && !*have_cache)) && (ob->ec || !need_ec)) { add_new_bucket(c, ptrs, devs_may_alloc, nr_effective, have_cache, @@ -619,24 +546,25 @@ static void get_buckets_from_writepoint(struct bch_fs *c, wp->ptrs = ptrs_skip; } -static int open_bucket_add_buckets(struct bch_fs *c, - struct open_buckets *ptrs, - struct write_point *wp, - struct bch_devs_list *devs_have, - u16 target, - unsigned erasure_code, - unsigned nr_replicas, - unsigned *nr_effective, - bool *have_cache, - enum alloc_reserve reserve, - unsigned flags, - struct closure *_cl) +static enum bucket_alloc_ret +open_bucket_add_buckets(struct bch_fs *c, + struct open_buckets *ptrs, + struct write_point *wp, + struct bch_devs_list *devs_have, + u16 target, + unsigned erasure_code, + unsigned nr_replicas, + unsigned *nr_effective, + bool *have_cache, + enum alloc_reserve reserve, + unsigned flags, + struct closure *_cl) { struct bch_devs_mask devs; struct open_bucket *ob; struct closure 
*cl = NULL; + enum bucket_alloc_ret ret; unsigned i; - int ret; rcu_read_lock(); devs = target_rw_devs(c, wp->type, target); @@ -650,18 +578,22 @@ static int open_bucket_add_buckets(struct bch_fs *c, __clear_bit(ob->ptr.dev, devs.d); if (erasure_code) { - get_buckets_from_writepoint(c, ptrs, wp, &devs, - nr_replicas, nr_effective, - have_cache, flags, true); - if (*nr_effective >= nr_replicas) - return 0; + if (!ec_open_bucket(c, ptrs)) { + get_buckets_from_writepoint(c, ptrs, wp, &devs, + nr_replicas, nr_effective, + have_cache, flags, true); + if (*nr_effective >= nr_replicas) + return 0; + } - bucket_alloc_from_stripe(c, ptrs, wp, &devs, - target, erasure_code, - nr_replicas, nr_effective, - have_cache, flags); - if (*nr_effective >= nr_replicas) - return 0; + if (!ec_open_bucket(c, ptrs)) { + bucket_alloc_from_stripe(c, ptrs, wp, &devs, + target, erasure_code, + nr_replicas, nr_effective, + have_cache, flags); + if (*nr_effective >= nr_replicas) + return 0; + } } get_buckets_from_writepoint(c, ptrs, wp, &devs, @@ -681,7 +613,7 @@ retry_blocking: ret = bch2_bucket_alloc_set(c, ptrs, &wp->stripe, &devs, nr_replicas, nr_effective, have_cache, reserve, flags, cl); - if (ret && ret != -EROFS && !cl && _cl) { + if (ret && ret != INSUFFICIENT_DEVICES && !cl && _cl) { cl = _cl; goto retry_blocking; } @@ -872,7 +804,8 @@ struct write_point *bch2_alloc_sectors_start(struct bch_fs *c, unsigned nr_effective, write_points_nr; unsigned ob_flags = 0; bool have_cache; - int ret, i; + enum bucket_alloc_ret ret; + int i; if (!(flags & BCH_WRITE_ONLY_SPECIFIED_DEVS)) ob_flags |= BUCKET_ALLOC_USE_DURABILITY; @@ -886,11 +819,11 @@ retry: wp = writepoint_find(c, write_point.v); - if (wp->type == BCH_DATA_USER) + if (wp->type == BCH_DATA_user) ob_flags |= BUCKET_MAY_ALLOC_PARTIAL; /* metadata may not allocate on cache devices: */ - if (wp->type != BCH_DATA_USER) + if (wp->type != BCH_DATA_user) have_cache = true; if (!target || (flags & BCH_WRITE_ONLY_SPECIFIED_DEVS)) { @@ -920,7 
+853,7 @@ alloc_done: if (erasure_code && !ec_open_bucket(c, &ptrs)) pr_debug("failed to get ec bucket: ret %u", ret); - if (ret == -EROFS && + if (ret == INSUFFICIENT_DEVICES && nr_effective >= nr_replicas_required) ret = 0; @@ -929,7 +862,7 @@ alloc_done: /* Free buckets we didn't use: */ open_bucket_for_each(c, &wp->ptrs, ob, i) - open_bucket_free_unused(c, ob, wp->type == BCH_DATA_USER); + open_bucket_free_unused(c, wp, ob); wp->ptrs = ptrs; @@ -948,17 +881,24 @@ err: if (ptrs.nr < ARRAY_SIZE(ptrs.v)) ob_push(c, &ptrs, ob); else - open_bucket_free_unused(c, ob, - wp->type == BCH_DATA_USER); + open_bucket_free_unused(c, wp, ob); wp->ptrs = ptrs; mutex_unlock(&wp->lock); - if (ret == -ENOSPC && + if (ret == FREELIST_EMPTY && try_decrease_writepoints(c, write_points_nr)) goto retry; - return ERR_PTR(ret); + switch (ret) { + case OPEN_BUCKETS_EMPTY: + case FREELIST_EMPTY: + return cl ? ERR_PTR(-EAGAIN) : ERR_PTR(-ENOSPC); + case INSUFFICIENT_DEVICES: + return ERR_PTR(-EROFS); + default: + BUG(); + } } /* @@ -980,7 +920,7 @@ void bch2_alloc_sectors_append_ptrs(struct bch_fs *c, struct write_point *wp, struct bch_extent_ptr tmp = ob->ptr; tmp.cached = !ca->mi.durability && - wp->type == BCH_DATA_USER; + wp->type == BCH_DATA_user; tmp.offset += ca->mi.bucket_size - ob->sectors_free; bch2_bkey_append_ptr(k, tmp); @@ -1009,6 +949,13 @@ void bch2_alloc_sectors_done(struct bch_fs *c, struct write_point *wp) bch2_open_buckets_put(c, &ptrs); } +static inline void writepoint_init(struct write_point *wp, + enum bch_data_type type) +{ + mutex_init(&wp->lock); + wp->type = type; +} + void bch2_fs_allocator_foreground_init(struct bch_fs *c) { struct open_bucket *ob; @@ -1029,12 +976,13 @@ void bch2_fs_allocator_foreground_init(struct bch_fs *c) c->open_buckets_freelist = ob - c->open_buckets; } - writepoint_init(&c->btree_write_point, BCH_DATA_BTREE); - writepoint_init(&c->rebalance_write_point, BCH_DATA_USER); + writepoint_init(&c->btree_write_point, BCH_DATA_btree); + 
writepoint_init(&c->rebalance_write_point, BCH_DATA_user); + writepoint_init(&c->copygc_write_point, BCH_DATA_user); for (wp = c->write_points; wp < c->write_points + c->write_points_nr; wp++) { - writepoint_init(wp, BCH_DATA_USER); + writepoint_init(wp, BCH_DATA_user); wp->last_used = sched_clock(); wp->write_point = (unsigned long) wp; diff --git a/fs/bcachefs/alloc_foreground.h b/fs/bcachefs/alloc_foreground.h index 687f973e4b3a..c658295cb8e0 100644 --- a/fs/bcachefs/alloc_foreground.h +++ b/fs/bcachefs/alloc_foreground.h @@ -12,6 +12,13 @@ struct bch_dev; struct bch_fs; struct bch_devs_List; +enum bucket_alloc_ret { + ALLOC_SUCCESS, + OPEN_BUCKETS_EMPTY, + FREELIST_EMPTY, /* Allocator thread not keeping up */ + INSUFFICIENT_DEVICES, +}; + struct dev_alloc_list { unsigned nr; u8 devs[BCH_SB_MEMBERS_MAX]; @@ -20,8 +27,7 @@ struct dev_alloc_list { struct dev_alloc_list bch2_dev_alloc_list(struct bch_fs *, struct dev_stripe_state *, struct bch_devs_mask *); -void bch2_dev_stripe_increment(struct bch_fs *, struct bch_dev *, - struct dev_stripe_state *); +void bch2_dev_stripe_increment(struct bch_dev *, struct dev_stripe_state *); long bch2_bucket_alloc_new_fs(struct bch_dev *); @@ -92,6 +98,12 @@ static inline void bch2_open_bucket_get(struct bch_fs *c, } } +enum bucket_alloc_ret +bch2_bucket_alloc_set(struct bch_fs *, struct open_buckets *, + struct dev_stripe_state *, struct bch_devs_mask *, + unsigned, unsigned *, bool *, enum alloc_reserve, + unsigned, struct closure *); + struct write_point *bch2_alloc_sectors_start(struct bch_fs *, unsigned, unsigned, struct write_point_specifier, @@ -121,13 +133,6 @@ static inline struct write_point_specifier writepoint_ptr(struct write_point *wp return (struct write_point_specifier) { .v = (unsigned long) wp }; } -static inline void writepoint_init(struct write_point *wp, - enum bch_data_type type) -{ - mutex_init(&wp->lock); - wp->type = type; -} - void bch2_fs_allocator_foreground_init(struct bch_fs *); #endif /* 
_BCACHEFS_ALLOC_FOREGROUND_H */ diff --git a/fs/bcachefs/alloc_types.h b/fs/bcachefs/alloc_types.h index 4f1465077994..20705460bb0a 100644 --- a/fs/bcachefs/alloc_types.h +++ b/fs/bcachefs/alloc_types.h @@ -66,6 +66,7 @@ struct open_bucket { u8 type; unsigned valid:1; unsigned on_partial_list:1; + int alloc_reserve:3; unsigned sectors_free; struct bch_extent_ptr ptr; struct ec_stripe_new *ec; diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index dbc714db6a3d..90303b6a3d99 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -451,13 +451,6 @@ struct bch_dev { alloc_heap alloc_heap; - /* Copying GC: */ - struct task_struct *copygc_thread; - copygc_heap copygc_heap; - struct bch_pd_controller copygc_pd; - struct write_point copygc_write_point; - u64 copygc_threshold; - atomic64_t rebalance_work; struct journal_device journal; @@ -741,7 +734,7 @@ struct bch_fs { ZSTD_parameters zstd_params; struct crypto_shash *sha256; - struct crypto_skcipher *chacha20; + struct crypto_sync_skcipher *chacha20; struct crypto_shash *poly1305; atomic64_t key_version; @@ -751,16 +744,27 @@ struct bch_fs { /* REBALANCE */ struct bch_fs_rebalance rebalance; + /* COPYGC */ + struct task_struct *copygc_thread; + copygc_heap copygc_heap; + struct bch_pd_controller copygc_pd; + struct write_point copygc_write_point; + u64 copygc_threshold; + /* STRIPES: */ GENRADIX(struct stripe) stripes[2]; - struct mutex ec_stripe_create_lock; ec_stripes_heap ec_stripes_heap; spinlock_t ec_stripes_heap_lock; /* ERASURE CODING */ - struct list_head ec_new_stripe_list; - struct mutex ec_new_stripe_lock; + struct list_head ec_stripe_head_list; + struct mutex ec_stripe_head_lock; + + struct list_head ec_stripe_new_list; + struct mutex ec_stripe_new_lock; + + struct work_struct ec_stripe_create_work; u64 ec_stripe_hint; struct bio_set ec_bioset; diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index f808e63a713d..d5a2230e403c 100644 --- 
a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -1026,14 +1026,19 @@ LE64_BITMASK(BCH_KDF_SCRYPT_P, struct bch_sb_field_crypt, kdf_flags, 32, 48); /* BCH_SB_FIELD_replicas: */ +#define BCH_DATA_TYPES() \ + x(none, 0) \ + x(sb, 1) \ + x(journal, 2) \ + x(btree, 3) \ + x(user, 4) \ + x(cached, 5) + enum bch_data_type { - BCH_DATA_NONE = 0, - BCH_DATA_SB = 1, - BCH_DATA_JOURNAL = 2, - BCH_DATA_BTREE = 3, - BCH_DATA_USER = 4, - BCH_DATA_CACHED = 5, - BCH_DATA_NR = 6, +#define x(t, n) BCH_DATA_##t, + BCH_DATA_TYPES() +#undef x + BCH_DATA_NR }; struct bch_replicas_entry_v0 { diff --git a/fs/bcachefs/bset.c b/fs/bcachefs/bset.c index 6fc91e6a35e8..f7c2841ed8a7 100644 --- a/fs/bcachefs/bset.c +++ b/fs/bcachefs/bset.c @@ -313,44 +313,6 @@ struct rw_aux_tree { struct bpos k; }; -/* - * BSET_CACHELINE was originally intended to match the hardware cacheline size - - * it used to be 64, but I realized the lookup code would touch slightly less - * memory if it was 128. - * - * It definites the number of bytes (in struct bset) per struct bkey_float in - * the auxiliar search tree - when we're done searching the bset_float tree we - * have this many bytes left that we do a linear search over. - * - * Since (after level 5) every level of the bset_tree is on a new cacheline, - * we're touching one fewer cacheline in the bset tree in exchange for one more - * cacheline in the linear search - but the linear search might stop before it - * gets to the second cacheline. 
- */ - -#define BSET_CACHELINE 128 - -/* Space required for the btree node keys */ -static inline size_t btree_keys_bytes(struct btree *b) -{ - return PAGE_SIZE << b->page_order; -} - -static inline size_t btree_keys_cachelines(struct btree *b) -{ - return btree_keys_bytes(b) / BSET_CACHELINE; -} - -static inline size_t btree_aux_data_bytes(struct btree *b) -{ - return btree_keys_cachelines(b) * 8; -} - -static inline size_t btree_aux_data_u64s(struct btree *b) -{ - return btree_aux_data_bytes(b) / sizeof(u64); -} - static unsigned bset_aux_tree_buf_end(const struct bset_tree *t) { BUG_ON(t->aux_data_offset == U16_MAX); @@ -426,29 +388,6 @@ static void bset_aux_tree_verify(struct btree *b) #endif } -/* Memory allocation */ - -void bch2_btree_keys_free(struct btree *b) -{ - vfree(b->aux_data); - b->aux_data = NULL; -} - -#ifndef PAGE_KERNEL_EXEC -# define PAGE_KERNEL_EXEC PAGE_KERNEL -#endif - -int bch2_btree_keys_alloc(struct btree *b, unsigned page_order, gfp_t gfp) -{ - b->page_order = page_order; - b->aux_data = __vmalloc(btree_aux_data_bytes(b), gfp, - PAGE_KERNEL_EXEC); - if (!b->aux_data) - return -ENOMEM; - - return 0; -} - void bch2_btree_keys_init(struct btree *b, bool *expensive_debug_checks) { unsigned i; diff --git a/fs/bcachefs/bset.h b/fs/bcachefs/bset.h index 652ffed4adfb..5921cf689105 100644 --- a/fs/bcachefs/bset.h +++ b/fs/bcachefs/bset.h @@ -184,6 +184,38 @@ static inline enum bset_aux_tree_type bset_aux_tree_type(const struct bset_tree } } +/* + * BSET_CACHELINE was originally intended to match the hardware cacheline size - + * it used to be 64, but I realized the lookup code would touch slightly less + * memory if it was 128. + * + * It definites the number of bytes (in struct bset) per struct bkey_float in + * the auxiliar search tree - when we're done searching the bset_float tree we + * have this many bytes left that we do a linear search over. 
+ * + * Since (after level 5) every level of the bset_tree is on a new cacheline, + * we're touching one fewer cacheline in the bset tree in exchange for one more + * cacheline in the linear search - but the linear search might stop before it + * gets to the second cacheline. + */ + +#define BSET_CACHELINE 128 + +static inline size_t btree_keys_cachelines(struct btree *b) +{ + return (1U << b->byte_order) / BSET_CACHELINE; +} + +static inline size_t btree_aux_data_bytes(struct btree *b) +{ + return btree_keys_cachelines(b) * 8; +} + +static inline size_t btree_aux_data_u64s(struct btree *b) +{ + return btree_aux_data_bytes(b) / sizeof(u64); +} + typedef void (*compiled_unpack_fn)(struct bkey *, const struct bkey_packed *); static inline void @@ -334,8 +366,6 @@ static inline struct bset *bset_next_set(struct btree *b, return ((void *) i) + round_up(vstruct_bytes(i), block_bytes); } -void bch2_btree_keys_free(struct btree *); -int bch2_btree_keys_alloc(struct btree *, unsigned, gfp_t); void bch2_btree_keys_init(struct btree *, bool *); void bch2_bset_init_first(struct btree *, struct bset *); diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index d3addd3a8964..a0d570f3adf0 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -44,7 +44,8 @@ static void __btree_node_data_free(struct bch_fs *c, struct btree *b) kvpfree(b->data, btree_bytes(c)); b->data = NULL; - bch2_btree_keys_free(b); + vfree(b->aux_data); + b->aux_data = NULL; } static void btree_node_data_free(struct bch_fs *c, struct btree *b) @@ -72,7 +73,11 @@ static const struct rhashtable_params bch_btree_cache_params = { .obj_cmpfn = bch2_btree_cache_cmp_fn, }; -static int __btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp) +#ifndef PAGE_KERNEL_EXEC +# define PAGE_KERNEL_EXEC PAGE_KERNEL +#endif + +static int btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp) { BUG_ON(b->data || b->aux_data); @@ -80,7 +85,9 @@ static int 
__btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp) if (!b->data) return -ENOMEM; - if (bch2_btree_keys_alloc(b, btree_page_order(c), gfp)) { + b->aux_data = __vmalloc(btree_aux_data_bytes(b), gfp, + PAGE_KERNEL_EXEC); + if (!b->aux_data) { kvpfree(b->data, btree_bytes(c)); b->data = NULL; return -ENOMEM; @@ -89,21 +96,9 @@ static int __btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp) return 0; } -static void btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp) -{ - struct btree_cache *bc = &c->btree_cache; - - if (!__btree_node_data_alloc(c, b, gfp)) { - bc->used++; - list_move(&b->list, &bc->freeable); - } else { - list_move(&b->list, &bc->freed); - } -} - -static struct btree *btree_node_mem_alloc(struct bch_fs *c, gfp_t gfp) +static struct btree *__btree_node_mem_alloc(struct bch_fs *c) { - struct btree *b = kzalloc(sizeof(struct btree), gfp); + struct btree *b = kzalloc(sizeof(struct btree), GFP_KERNEL); if (!b) return NULL; @@ -111,9 +106,25 @@ static struct btree *btree_node_mem_alloc(struct bch_fs *c, gfp_t gfp) six_lock_init(&b->c.lock); INIT_LIST_HEAD(&b->list); INIT_LIST_HEAD(&b->write_blocked); + b->byte_order = ilog2(btree_bytes(c)); + return b; +} - btree_node_data_alloc(c, b, gfp); - return b->data ? 
b : NULL; +static struct btree *btree_node_mem_alloc(struct bch_fs *c) +{ + struct btree_cache *bc = &c->btree_cache; + struct btree *b = __btree_node_mem_alloc(c); + if (!b) + return NULL; + + if (btree_node_data_alloc(c, b, GFP_KERNEL)) { + kfree(b); + return NULL; + } + + bc->used++; + list_add(&b->list, &bc->freeable); + return b; } /* Btree in memory cache - hash table */ @@ -124,6 +135,8 @@ void bch2_btree_node_hash_remove(struct btree_cache *bc, struct btree *b) /* Cause future lookups for this node to fail: */ b->hash_val = 0; + + six_lock_wakeup_all(&b->c.lock); } int __bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b) @@ -402,7 +415,7 @@ int bch2_fs_btree_cache_init(struct bch_fs *c) bch2_recalc_btree_reserve(c); for (i = 0; i < bc->reserve; i++) - if (!btree_node_mem_alloc(c, GFP_KERNEL)) { + if (!btree_node_mem_alloc(c)) { ret = -ENOMEM; goto out; } @@ -418,7 +431,7 @@ int bch2_fs_btree_cache_init(struct bch_fs *c) goto out; } - c->verify_data = btree_node_mem_alloc(c, GFP_KERNEL); + c->verify_data = btree_node_mem_alloc(c); if (!c->verify_data) { ret = -ENOMEM; goto out; @@ -550,21 +563,16 @@ got_node: mutex_unlock(&bc->lock); if (!b) { - b = kzalloc(sizeof(struct btree), GFP_KERNEL); + b = __btree_node_mem_alloc(c); if (!b) goto err; - bkey_btree_ptr_init(&b->key); - six_lock_init(&b->c.lock); - INIT_LIST_HEAD(&b->list); - INIT_LIST_HEAD(&b->write_blocked); - BUG_ON(!six_trylock_intent(&b->c.lock)); BUG_ON(!six_trylock_write(&b->c.lock)); } if (!b->data) { - if (__btree_node_data_alloc(c, b, __GFP_NOWARN|GFP_KERNEL)) + if (btree_node_data_alloc(c, b, __GFP_NOWARN|GFP_KERNEL)) goto err; mutex_lock(&bc->lock); diff --git a/fs/bcachefs/btree_cache.h b/fs/bcachefs/btree_cache.h index 2160012c734f..d0d3a85bb8be 100644 --- a/fs/bcachefs/btree_cache.h +++ b/fs/bcachefs/btree_cache.h @@ -79,14 +79,9 @@ static inline size_t btree_max_u64s(struct bch_fs *c) return (btree_bytes(c) - sizeof(struct btree_node)) / sizeof(u64); } -static inline 
size_t btree_page_order(struct bch_fs *c) -{ - return get_order(btree_bytes(c)); -} - static inline size_t btree_pages(struct bch_fs *c) { - return 1 << btree_page_order(c); + return btree_bytes(c) / PAGE_SIZE; } static inline unsigned btree_blocks(struct bch_fs *c) diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 8771ef1f07cc..4f581130270c 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -109,7 +109,7 @@ static int bch2_gc_mark_key(struct bch_fs *c, struct bkey_s_c k, atomic64_set(&c->key_version, k.k->version.lo); if (test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) || - fsck_err_on(!bch2_bkey_replicas_marked(c, k, false), c, + fsck_err_on(!bch2_bkey_replicas_marked(c, k), c, "superblock not marked as containing replicas (type %u)", k.k->type)) { ret = bch2_mark_bkey_replicas(c, k); @@ -433,16 +433,16 @@ void bch2_mark_dev_superblock(struct bch_fs *c, struct bch_dev *ca, if (offset == BCH_SB_SECTOR) mark_metadata_sectors(c, ca, 0, BCH_SB_SECTOR, - BCH_DATA_SB, flags); + BCH_DATA_sb, flags); mark_metadata_sectors(c, ca, offset, offset + (1 << layout->sb_max_size_bits), - BCH_DATA_SB, flags); + BCH_DATA_sb, flags); } for (i = 0; i < ca->journal.nr; i++) { b = ca->journal.buckets[i]; - bch2_mark_metadata_bucket(c, ca, b, BCH_DATA_JOURNAL, + bch2_mark_metadata_bucket(c, ca, b, BCH_DATA_journal, ca->mi.bucket_size, gc_phase(GC_PHASE_SB), flags); } @@ -617,8 +617,11 @@ static int bch2_gc_done(struct bch_fs *c, copy_stripe_field(block_sectors[i], "block_sectors[%u]", i); - if (dst->alive) + if (dst->alive) { + spin_lock(&c->ec_stripes_heap_lock); bch2_stripes_heap_insert(c, dst, dst_iter.pos); + spin_unlock(&c->ec_stripes_heap_lock); + } genradix_iter_advance(&dst_iter, &c->stripes[0]); genradix_iter_advance(&src_iter, &c->stripes[1]); @@ -673,8 +676,8 @@ static int bch2_gc_done(struct bch_fs *c, char buf[80]; if (metadata_only && - (e->data_type == BCH_DATA_USER || - e->data_type == BCH_DATA_CACHED)) + (e->data_type == BCH_DATA_user || 
+ e->data_type == BCH_DATA_cached)) continue; bch2_replicas_entry_to_text(&PBUF(buf), e); @@ -759,8 +762,8 @@ static int bch2_gc_start(struct bch_fs *c, d->gen_valid = s->gen_valid; if (metadata_only && - (s->mark.data_type == BCH_DATA_USER || - s->mark.data_type == BCH_DATA_CACHED)) { + (s->mark.data_type == BCH_DATA_user || + s->mark.data_type == BCH_DATA_cached)) { d->_mark = s->mark; d->_mark.owned_by_allocator = 0; } @@ -949,8 +952,10 @@ int bch2_gc_gens(struct bch_fs *c) for (i = 0; i < BTREE_ID_NR; i++) if (btree_node_type_needs_gc(i)) { ret = bch2_gc_btree_gens(c, i); - if (ret) + if (ret) { + bch_err(c, "error recalculating oldest_gen: %i", ret); goto err; + } } for_each_member_device(ca, c, i) { @@ -961,6 +966,8 @@ int bch2_gc_gens(struct bch_fs *c) g->oldest_gen = g->gc_gen; up_read(&ca->bucket_lock); } + + c->gc_count++; err: up_read(&c->gc_lock); return ret; diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index c81783ed9400..887e40574c93 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -57,25 +57,25 @@ static void set_needs_whiteout(struct bset *i, int v) k->needs_whiteout = v; } -static void btree_bounce_free(struct bch_fs *c, unsigned order, +static void btree_bounce_free(struct bch_fs *c, size_t size, bool used_mempool, void *p) { if (used_mempool) mempool_free(p, &c->btree_bounce_pool); else - vpfree(p, PAGE_SIZE << order); + vpfree(p, size); } -static void *btree_bounce_alloc(struct bch_fs *c, unsigned order, +static void *btree_bounce_alloc(struct bch_fs *c, size_t size, bool *used_mempool) { unsigned flags = memalloc_nofs_save(); void *p; - BUG_ON(order > btree_page_order(c)); + BUG_ON(size > btree_bytes(c)); *used_mempool = false; - p = (void *) __get_free_pages(__GFP_NOWARN|GFP_NOWAIT, order); + p = vpmalloc(size, __GFP_NOWARN|GFP_NOWAIT); if (!p) { *used_mempool = true; p = mempool_alloc(&c->btree_bounce_pool, GFP_NOIO); @@ -125,16 +125,14 @@ static void bch2_sort_whiteouts(struct bch_fs *c, struct btree *b) { 
struct bkey_packed *new_whiteouts, **ptrs, **ptrs_end, *k; bool used_mempool = false; - unsigned order; + size_t bytes = b->whiteout_u64s * sizeof(u64); if (!b->whiteout_u64s) return; - order = get_order(b->whiteout_u64s * sizeof(u64)); + new_whiteouts = btree_bounce_alloc(c, bytes, &used_mempool); - new_whiteouts = btree_bounce_alloc(c, order, &used_mempool); - - ptrs = ptrs_end = ((void *) new_whiteouts + (PAGE_SIZE << order)); + ptrs = ptrs_end = ((void *) new_whiteouts + bytes); for (k = unwritten_whiteouts_start(c, b); k != unwritten_whiteouts_end(c, b); @@ -158,7 +156,7 @@ static void bch2_sort_whiteouts(struct bch_fs *c, struct btree *b) memcpy_u64s(unwritten_whiteouts_start(c, b), new_whiteouts, b->whiteout_u64s); - btree_bounce_free(c, order, used_mempool, new_whiteouts); + btree_bounce_free(c, bytes, used_mempool, new_whiteouts); } static bool should_compact_bset(struct btree *b, struct bset_tree *t, @@ -187,7 +185,7 @@ static bool bch2_compact_extent_whiteouts(struct bch_fs *c, struct bkey_packed *whiteouts = NULL; struct bkey_packed *u_start, *u_pos; struct sort_iter sort_iter; - unsigned order, whiteout_u64s = 0, u64s; + unsigned bytes, whiteout_u64s = 0, u64s; bool used_mempool, compacting = false; BUG_ON(!btree_node_is_extents(b)); @@ -204,9 +202,9 @@ static bool bch2_compact_extent_whiteouts(struct bch_fs *c, sort_iter_init(&sort_iter, b); whiteout_u64s += b->whiteout_u64s; - order = get_order(whiteout_u64s * sizeof(u64)); + bytes = whiteout_u64s * sizeof(u64); - whiteouts = btree_bounce_alloc(c, order, &used_mempool); + whiteouts = btree_bounce_alloc(c, bytes, &used_mempool); u_start = u_pos = whiteouts; memcpy_u64s(u_pos, unwritten_whiteouts_start(c, b), @@ -306,7 +304,7 @@ static bool bch2_compact_extent_whiteouts(struct bch_fs *c, unwritten_whiteouts_end(c, b), true); - btree_bounce_free(c, order, used_mempool, whiteouts); + btree_bounce_free(c, bytes, used_mempool, whiteouts); bch2_btree_build_aux_trees(b); @@ -401,7 +399,7 @@ static void 
btree_node_sort(struct bch_fs *c, struct btree *b, struct bset *start_bset = bset(b, &b->set[start_idx]); bool used_mempool = false; u64 start_time, seq = 0; - unsigned i, u64s = 0, order, shift = end_idx - start_idx - 1; + unsigned i, u64s = 0, bytes, shift = end_idx - start_idx - 1; bool sorting_entire_node = start_idx == 0 && end_idx == b->nsets; @@ -416,11 +414,11 @@ static void btree_node_sort(struct bch_fs *c, struct btree *b, btree_bkey_last(b, t)); } - order = sorting_entire_node - ? btree_page_order(c) - : get_order(__vstruct_bytes(struct btree_node, u64s)); + bytes = sorting_entire_node + ? btree_bytes(c) + : __vstruct_bytes(struct btree_node, u64s); - out = btree_bounce_alloc(c, order, &used_mempool); + out = btree_bounce_alloc(c, bytes, &used_mempool); start_time = local_clock(); @@ -435,7 +433,7 @@ static void btree_node_sort(struct bch_fs *c, struct btree *b, out->keys.u64s = cpu_to_le16(u64s); - BUG_ON(vstruct_end(&out->keys) > (void *) out + (PAGE_SIZE << order)); + BUG_ON(vstruct_end(&out->keys) > (void *) out + bytes); if (sorting_entire_node) bch2_time_stats_update(&c->times[BCH_TIME_btree_node_sort], @@ -449,7 +447,7 @@ static void btree_node_sort(struct bch_fs *c, struct btree *b, if (sorting_entire_node) { unsigned u64s = le16_to_cpu(out->keys.u64s); - BUG_ON(order != btree_page_order(c)); + BUG_ON(bytes != btree_bytes(c)); /* * Our temporary buffer is the same size as the btree node's @@ -484,7 +482,7 @@ static void btree_node_sort(struct bch_fs *c, struct btree *b, set_btree_bset_end(b, &b->set[start_idx]); bch2_bset_set_no_aux_tree(b, &b->set[start_idx]); - btree_bounce_free(c, order, used_mempool, out); + btree_bounce_free(c, bytes, used_mempool, out); bch2_verify_btree_nr_keys(b); } @@ -620,7 +618,7 @@ static void bset_encrypt(struct bch_fs *c, struct bset *i, unsigned offset) bch2_encrypt(c, BSET_CSUM_TYPE(i), nonce, &bn->flags, bytes); - nonce = nonce_add(nonce, round_up(bytes, CHACHA20_BLOCK_SIZE)); + nonce = nonce_add(nonce, 
round_up(bytes, CHACHA_BLOCK_SIZE)); } bch2_encrypt(c, BSET_CSUM_TYPE(i), nonce, i->_data, @@ -917,6 +915,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry struct sort_iter *iter; struct btree_node *sorted; struct bkey_packed *k; + struct bch_extent_ptr *ptr; struct bset *i; bool used_mempool, blacklisted; unsigned u64s; @@ -971,8 +970,10 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry bset_encrypt(c, i, b->written << 9); if (btree_node_is_extents(b) && - !BTREE_NODE_NEW_EXTENT_OVERWRITE(b->data)) + !BTREE_NODE_NEW_EXTENT_OVERWRITE(b->data)) { set_btree_node_old_extent_overwrite(b); + set_btree_node_need_rewrite(b); + } sectors = vstruct_sectors(b->data, c->block_bits); } else { @@ -1040,7 +1041,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry BTREE_ERR_WANT_RETRY, c, b, NULL, "found bset signature after last bset"); - sorted = btree_bounce_alloc(c, btree_page_order(c), &used_mempool); + sorted = btree_bounce_alloc(c, btree_bytes(c), &used_mempool); sorted->keys.u64s = 0; set_btree_bset(b, b->set, &b->data->keys); @@ -1058,7 +1059,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry BUG_ON(b->nr.live_u64s != u64s); - btree_bounce_free(c, btree_page_order(c), used_mempool, sorted); + btree_bounce_free(c, btree_bytes(c), used_mempool, sorted); i = &b->data->keys; for (k = i->start; k != vstruct_last(i);) { @@ -1098,6 +1099,13 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry set_needs_whiteout(btree_bset_first(b), true); btree_node_reset_sib_u64s(b); + + bkey_for_each_ptr(bch2_bkey_ptrs(bkey_i_to_s(&b->key)), ptr) { + struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); + + if (ca->mi.state != BCH_MEMBER_STATE_RW) + set_btree_node_need_rewrite(b); + } out: mempool_free(iter, &c->fill_iter); return retry_read; @@ -1139,7 +1147,8 @@ static void btree_node_read_work(struct work_struct *work) 
bio->bi_status = BLK_STS_REMOVED; } start: - bch2_dev_io_err_on(bio->bi_status, ca, "btree read"); + bch2_dev_io_err_on(bio->bi_status, ca, "btree read: %s", + bch2_blk_status_to_str(bio->bi_status)); if (rb->have_ioref) percpu_ref_put(&ca->io_ref); rb->have_ioref = false; @@ -1220,7 +1229,7 @@ void bch2_btree_node_read(struct bch_fs *c, struct btree *b, set_btree_node_read_in_flight(b); if (rb->have_ioref) { - this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_BTREE], + this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_btree], bio_sectors(bio)); bio_set_dev(bio, ca->disk_sb.bdev); @@ -1392,7 +1401,7 @@ static void btree_node_write_work(struct work_struct *work) struct btree *b = wbio->wbio.bio.bi_private; btree_bounce_free(c, - wbio->wbio.order, + wbio->bytes, wbio->wbio.used_mempool, wbio->data); @@ -1423,8 +1432,8 @@ static void btree_node_write_endio(struct bio *bio) if (wbio->have_ioref) bch2_latency_acct(ca, wbio->submit_time, WRITE); - if (bio->bi_status == BLK_STS_REMOVED || - bch2_dev_io_err_on(bio->bi_status, ca, "btree write") || + if (bch2_dev_io_err_on(bio->bi_status, ca, "btree write: %s", + bch2_blk_status_to_str(bio->bi_status)) || bch2_meta_write_fault("btree")) { spin_lock_irqsave(&c->btree_write_error_lock, flags); bch2_dev_list_add_dev(&orig->failed, wbio->dev); @@ -1475,7 +1484,7 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, struct bch_extent_ptr *ptr; struct sort_iter sort_iter; struct nonce nonce; - unsigned bytes_to_write, sectors_to_write, order, bytes, u64s; + unsigned bytes_to_write, sectors_to_write, bytes, u64s; u64 seq = 0; bool used_mempool; unsigned long old, new; @@ -1545,8 +1554,7 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, seq = max(seq, le64_to_cpu(i->journal_seq)); } - order = get_order(bytes); - data = btree_bounce_alloc(c, order, &used_mempool); + data = btree_bounce_alloc(c, bytes, &used_mempool); if (!b->written) { bn = data; @@ -1658,7 +1666,7 @@ void __bch2_btree_node_write(struct 
bch_fs *c, struct btree *b, struct btree_write_bio, wbio.bio); wbio_init(&wbio->wbio.bio); wbio->data = data; - wbio->wbio.order = order; + wbio->bytes = bytes; wbio->wbio.used_mempool = used_mempool; wbio->wbio.bio.bi_opf = REQ_OP_WRITE|REQ_META; wbio->wbio.bio.bi_end_io = btree_node_write_endio; @@ -1689,13 +1697,13 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, b->written += sectors_to_write; /* XXX: submitting IO with btree locks held: */ - bch2_submit_wbio_replicas(&wbio->wbio, c, BCH_DATA_BTREE, &k.key); + bch2_submit_wbio_replicas(&wbio->wbio, c, BCH_DATA_btree, &k.key); return; err: set_btree_node_noevict(b); b->written += sectors_to_write; nowrite: - btree_bounce_free(c, order, used_mempool, data); + btree_bounce_free(c, bytes, used_mempool, data); btree_node_write_done(c, b); } @@ -1826,9 +1834,8 @@ void bch2_btree_verify_flushed(struct bch_fs *c) rcu_read_unlock(); } -ssize_t bch2_dirty_btree_nodes_print(struct bch_fs *c, char *buf) +void bch2_dirty_btree_nodes_to_text(struct printbuf *out, struct bch_fs *c) { - struct printbuf out = _PBUF(buf, PAGE_SIZE); struct bucket_table *tbl; struct rhash_head *pos; struct btree *b; @@ -1841,7 +1848,7 @@ ssize_t bch2_dirty_btree_nodes_print(struct bch_fs *c, char *buf) if (!(flags & (1 << BTREE_NODE_dirty))) continue; - pr_buf(&out, "%p d %u n %u l %u w %u b %u r %u:%lu\n", + pr_buf(out, "%p d %u n %u l %u w %u b %u r %u:%lu\n", b, (flags & (1 << BTREE_NODE_dirty)) != 0, (flags & (1 << BTREE_NODE_need_write)) != 0, @@ -1852,6 +1859,4 @@ ssize_t bch2_dirty_btree_nodes_print(struct bch_fs *c, char *buf) b->will_make_reachable & 1); } rcu_read_unlock(); - - return out.pos - buf; } diff --git a/fs/bcachefs/btree_io.h b/fs/bcachefs/btree_io.h index f3d7ec749b61..66ebdd39f5b3 100644 --- a/fs/bcachefs/btree_io.h +++ b/fs/bcachefs/btree_io.h @@ -23,8 +23,9 @@ struct btree_read_bio { }; struct btree_write_bio { - void *data; struct work_struct work; + void *data; + unsigned bytes; struct bch_write_bio 
wbio; }; @@ -139,7 +140,7 @@ do { \ void bch2_btree_flush_all_reads(struct bch_fs *); void bch2_btree_flush_all_writes(struct bch_fs *); void bch2_btree_verify_flushed(struct bch_fs *); -ssize_t bch2_dirty_btree_nodes_print(struct bch_fs *, char *); +void bch2_dirty_btree_nodes_to_text(struct printbuf *, struct bch_fs *); static inline void compat_bformat(unsigned level, enum btree_id btree_id, unsigned version, unsigned big_endian, diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index 16c4d058358b..683b416ef427 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -94,7 +94,7 @@ struct btree { struct btree_nr_keys nr; u16 sib_u64s[2]; u16 whiteout_u64s; - u8 page_order; + u8 byte_order; u8 unpack_fn_len; /* @@ -409,6 +409,7 @@ enum btree_flags { BTREE_NODE_dying, BTREE_NODE_fake, BTREE_NODE_old_extent_overwrite, + BTREE_NODE_need_rewrite, }; BTREE_FLAG(read_in_flight); @@ -423,6 +424,7 @@ BTREE_FLAG(just_written); BTREE_FLAG(dying); BTREE_FLAG(fake); BTREE_FLAG(old_extent_overwrite); +BTREE_FLAG(need_rewrite); static inline struct btree_write *btree_current_write(struct btree *b) { @@ -593,7 +595,6 @@ static inline bool btree_iter_is_extents(struct btree_iter *iter) enum btree_trigger_flags { __BTREE_TRIGGER_NORUN, /* Don't run triggers at all */ - __BTREE_TRIGGER_NOOVERWRITES, /* Don't run triggers on overwrites */ __BTREE_TRIGGER_INSERT, __BTREE_TRIGGER_OVERWRITE, @@ -606,7 +607,6 @@ enum btree_trigger_flags { }; #define BTREE_TRIGGER_NORUN (1U << __BTREE_TRIGGER_NORUN) -#define BTREE_TRIGGER_NOOVERWRITES (1U << __BTREE_TRIGGER_NOOVERWRITES) #define BTREE_TRIGGER_INSERT (1U << __BTREE_TRIGGER_INSERT) #define BTREE_TRIGGER_OVERWRITE (1U << __BTREE_TRIGGER_OVERWRITE) diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index a8cd6ffb6c7c..a2604b0ce2d8 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -26,7 +26,7 @@ /* * Verify that child nodes 
correctly span parent node's range: */ -static void btree_node_interior_verify(struct btree *b) +static void btree_node_interior_verify(struct bch_fs *c, struct btree *b) { #ifdef CONFIG_BCACHEFS_DEBUG struct bpos next_node = b->data->min_key; @@ -37,6 +37,9 @@ static void btree_node_interior_verify(struct btree *b) BUG_ON(!b->c.level); + if (!test_bit(BCH_FS_BTREE_INTERIOR_REPLAY_DONE, &c->flags)) + return; + bch2_btree_node_iter_init_from_start(&iter, b); while (1) { @@ -135,8 +138,6 @@ static void __btree_node_free(struct bch_fs *c, struct btree *b) bch2_btree_node_hash_remove(&c->btree_cache, b); - six_lock_wakeup_all(&b->c.lock); - mutex_lock(&c->btree_cache.lock); list_move(&b->list, &c->btree_cache.freeable); mutex_unlock(&c->btree_cache.lock); @@ -290,8 +291,10 @@ static struct btree *bch2_btree_node_alloc(struct btree_update *as, unsigned lev SET_BTREE_NODE_NEW_EXTENT_OVERWRITE(b->data, true); if (btree_node_is_extents(b) && - !BTREE_NODE_NEW_EXTENT_OVERWRITE(b->data)) + !BTREE_NODE_NEW_EXTENT_OVERWRITE(b->data)) { set_btree_node_old_extent_overwrite(b); + set_btree_node_need_rewrite(b); + } bch2_btree_build_aux_trees(b); @@ -1118,8 +1121,8 @@ static struct btree *__btree_split_node(struct btree_update *as, bch2_verify_btree_nr_keys(n2); if (n1->c.level) { - btree_node_interior_verify(n1); - btree_node_interior_verify(n2); + btree_node_interior_verify(as->c, n1); + btree_node_interior_verify(as->c, n2); } return n2; @@ -1178,7 +1181,7 @@ static void btree_split_insert_keys(struct btree_update *as, struct btree *b, BUG_ON(b->nsets != 1 || b->nr.live_u64s != le16_to_cpu(btree_bset_first(b)->u64s)); - btree_node_interior_verify(b); + btree_node_interior_verify(as->c, b); } static void btree_split(struct btree_update *as, struct btree *b, @@ -1376,7 +1379,7 @@ void bch2_btree_insert_node(struct btree_update *as, struct btree *b, bch2_btree_node_unlock_write(b, iter); - btree_node_interior_verify(b); + btree_node_interior_verify(c, b); /* * when called from the 
btree_split path the new nodes aren't added to @@ -1864,7 +1867,7 @@ int bch2_btree_node_update_key(struct bch_fs *c, struct btree_iter *iter, new_hash = bch2_btree_node_mem_alloc(c); } - +retry: as = bch2_btree_update_start(iter->trans, iter->btree_id, parent ? btree_update_reserve_required(c, parent) : 0, BTREE_INSERT_NOFAIL| @@ -1877,16 +1880,17 @@ int bch2_btree_node_update_key(struct bch_fs *c, struct btree_iter *iter, if (ret == -EAGAIN) ret = -EINTR; - if (ret != -EINTR) - goto err; + if (ret == -EINTR) { + bch2_trans_unlock(iter->trans); + up_read(&c->gc_lock); + closure_sync(&cl); + down_read(&c->gc_lock); - bch2_trans_unlock(iter->trans); - up_read(&c->gc_lock); - closure_sync(&cl); - down_read(&c->gc_lock); + if (bch2_trans_relock(iter->trans)) + goto retry; + } - if (!bch2_trans_relock(iter->trans)) - goto err; + goto err; } ret = bch2_mark_bkey_replicas(c, bkey_i_to_s_c(new_key)); @@ -1943,6 +1947,7 @@ void bch2_btree_root_alloc(struct bch_fs *c, enum btree_id id) bch2_btree_cache_cannibalize_unlock(c); set_btree_node_fake(b); + set_btree_node_need_rewrite(b); b->c.level = 0; b->c.btree_id = id; @@ -1969,22 +1974,19 @@ void bch2_btree_root_alloc(struct bch_fs *c, enum btree_id id) six_unlock_intent(&b->c.lock); } -ssize_t bch2_btree_updates_print(struct bch_fs *c, char *buf) +void bch2_btree_updates_to_text(struct printbuf *out, struct bch_fs *c) { - struct printbuf out = _PBUF(buf, PAGE_SIZE); struct btree_update *as; mutex_lock(&c->btree_interior_update_lock); list_for_each_entry(as, &c->btree_interior_update_list, list) - pr_buf(&out, "%p m %u w %u r %u j %llu\n", + pr_buf(out, "%p m %u w %u r %u j %llu\n", as, as->mode, as->nodes_written, atomic_read(&as->cl.remaining) & CLOSURE_REMAINING_MASK, as->journal.seq); mutex_unlock(&c->btree_interior_update_lock); - - return out.pos - buf; } size_t bch2_btree_interior_updates_nr_pending(struct bch_fs *c) diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h index 
4a5b9dcfbdd0..7668225e72c6 100644 --- a/fs/bcachefs/btree_update_interior.h +++ b/fs/bcachefs/btree_update_interior.h @@ -311,13 +311,13 @@ static inline void push_whiteout(struct bch_fs *c, struct btree *b, static inline bool bch2_btree_node_insert_fits(struct bch_fs *c, struct btree *b, unsigned u64s) { - if (unlikely(btree_node_fake(b))) + if (unlikely(btree_node_need_rewrite(b))) return false; return u64s <= bch_btree_keys_u64s_remaining(c, b); } -ssize_t bch2_btree_updates_print(struct bch_fs *, char *); +void bch2_btree_updates_to_text(struct printbuf *, struct bch_fs *); size_t bch2_btree_interior_updates_nr_pending(struct bch_fs *); diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 6e9688d0bb77..cd699c257244 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -264,30 +264,12 @@ static inline int bch2_trans_journal_res_get(struct btree_trans *trans, static enum btree_insert_ret btree_key_can_insert(struct btree_trans *trans, struct btree_iter *iter, - struct bkey_i *insert, - unsigned *u64s) + unsigned u64s) { struct bch_fs *c = trans->c; struct btree *b = iter_l(iter)->b; - static enum btree_insert_ret ret; - if (unlikely(btree_node_fake(b))) - return BTREE_INSERT_BTREE_NODE_FULL; - - /* - * old bch2_extent_sort_fix_overlapping() algorithm won't work with new - * style extent updates: - */ - if (unlikely(btree_node_old_extent_overwrite(b))) - return BTREE_INSERT_BTREE_NODE_FULL; - - ret = !btree_iter_is_extents(iter) - ? 
BTREE_INSERT_OK - : bch2_extent_can_insert(trans, iter, insert); - if (ret) - return ret; - - if (*u64s > bch_btree_keys_u64s_remaining(c, b)) + if (!bch2_btree_node_insert_fits(c, b, u64s)) return BTREE_INSERT_BTREE_NODE_FULL; return BTREE_INSERT_OK; @@ -296,8 +278,7 @@ btree_key_can_insert(struct btree_trans *trans, static enum btree_insert_ret btree_key_can_insert_cached(struct btree_trans *trans, struct btree_iter *iter, - struct bkey_i *insert, - unsigned *u64s) + unsigned u64s) { struct bkey_cached *ck = (void *) iter->l[0].b; unsigned new_u64s; @@ -305,10 +286,10 @@ btree_key_can_insert_cached(struct btree_trans *trans, BUG_ON(iter->level); - if (*u64s <= ck->u64s) + if (u64s <= ck->u64s) return BTREE_INSERT_OK; - new_u64s = roundup_pow_of_two(*u64s); + new_u64s = roundup_pow_of_two(u64s); new_k = krealloc(ck->k, new_u64s * sizeof(u64), GFP_NOFS); if (!new_k) return -ENOMEM; @@ -414,8 +395,8 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, u64s += i->k->k.u64s; ret = btree_iter_type(i->iter) != BTREE_ITER_CACHED - ? btree_key_can_insert(trans, i->iter, i->k, &u64s) - : btree_key_can_insert_cached(trans, i->iter, i->k, &u64s); + ? 
btree_key_can_insert(trans, i->iter, u64s) + : btree_key_can_insert_cached(trans, i->iter, u64s); if (ret) { *stopped_at = i; return ret; @@ -733,6 +714,11 @@ static int extent_update_to_keys(struct btree_trans *trans, struct bkey_i *insert) { struct btree_iter *iter; + int ret; + + ret = bch2_extent_can_insert(trans, orig_iter, insert); + if (ret) + return ret; if (bkey_deleted(&insert->k)) return 0; diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 0ec194b93c71..97a8af31ded1 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -133,13 +133,13 @@ void bch2_fs_usage_initialize(struct bch_fs *c) cpu_replicas_entry(&c->replicas, i); switch (e->data_type) { - case BCH_DATA_BTREE: + case BCH_DATA_btree: usage->btree += usage->replicas[i]; break; - case BCH_DATA_USER: + case BCH_DATA_user: usage->data += usage->replicas[i]; break; - case BCH_DATA_CACHED: + case BCH_DATA_cached: usage->cached += usage->replicas[i]; break; } @@ -179,7 +179,7 @@ out_pool: return ret; } -struct bch_dev_usage bch2_dev_usage_read(struct bch_fs *c, struct bch_dev *ca) +struct bch_dev_usage bch2_dev_usage_read(struct bch_dev *ca) { struct bch_dev_usage ret; @@ -367,7 +367,7 @@ static inline int is_fragmented_bucket(struct bucket_mark m, struct bch_dev *ca) { if (!m.owned_by_allocator && - m.data_type == BCH_DATA_USER && + m.data_type == BCH_DATA_user && bucket_sectors_used(m)) return max_t(int, 0, (int) ca->mi.bucket_size - bucket_sectors_used(m)); @@ -382,7 +382,7 @@ static inline int bucket_stripe_sectors(struct bucket_mark m) static inline enum bch_data_type bucket_type(struct bucket_mark m) { return m.cached_sectors && !m.dirty_sectors - ? BCH_DATA_CACHED + ? 
BCH_DATA_cached : m.data_type; } @@ -435,7 +435,7 @@ static inline void account_bucket(struct bch_fs_usage *fs_usage, enum bch_data_type type, int nr, s64 size) { - if (type == BCH_DATA_SB || type == BCH_DATA_JOURNAL) + if (type == BCH_DATA_sb || type == BCH_DATA_journal) fs_usage->hidden += size; dev_usage->buckets[type] += nr; @@ -472,7 +472,7 @@ static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca, u->sectors[old.data_type] -= old.dirty_sectors; u->sectors[new.data_type] += new.dirty_sectors; - u->sectors[BCH_DATA_CACHED] += + u->sectors[BCH_DATA_cached] += (int) new.cached_sectors - (int) old.cached_sectors; u->sectors_fragmented += is_fragmented_bucket(new, ca) - is_fragmented_bucket(old, ca); @@ -520,13 +520,13 @@ static inline int update_replicas(struct bch_fs *c, return 0; switch (r->data_type) { - case BCH_DATA_BTREE: + case BCH_DATA_btree: fs_usage->btree += sectors; break; - case BCH_DATA_USER: + case BCH_DATA_user: fs_usage->data += sectors; break; - case BCH_DATA_CACHED: + case BCH_DATA_cached: fs_usage->cached += sectors; break; } @@ -713,7 +713,8 @@ void bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca, preempt_enable(); } -static int bch2_mark_alloc(struct bch_fs *c, struct bkey_s_c k, +static int bch2_mark_alloc(struct bch_fs *c, + struct bkey_s_c old, struct bkey_s_c new, struct bch_fs_usage *fs_usage, u64 journal_seq, unsigned flags) { @@ -721,7 +722,11 @@ static int bch2_mark_alloc(struct bch_fs *c, struct bkey_s_c k, struct bkey_alloc_unpacked u; struct bch_dev *ca; struct bucket *g; - struct bucket_mark old, m; + struct bucket_mark old_m, m; + + /* We don't do anything for deletions - do we?: */ + if (new.k->type != KEY_TYPE_alloc) + return 0; /* * alloc btree is read in by bch2_alloc_read, not gc: @@ -730,15 +735,15 @@ static int bch2_mark_alloc(struct bch_fs *c, struct bkey_s_c k, !(flags & BTREE_TRIGGER_BUCKET_INVALIDATE)) return 0; - ca = bch_dev_bkey_exists(c, k.k->p.inode); + ca = bch_dev_bkey_exists(c, 
new.k->p.inode); - if (k.k->p.offset >= ca->mi.nbuckets) + if (new.k->p.offset >= ca->mi.nbuckets) return 0; - g = __bucket(ca, k.k->p.offset, gc); - u = bch2_alloc_unpack(k); + g = __bucket(ca, new.k->p.offset, gc); + u = bch2_alloc_unpack(new); - old = bucket_cmpxchg(g, m, ({ + old_m = bucket_cmpxchg(g, m, ({ m.gen = u.gen; m.data_type = u.data_type; m.dirty_sectors = u.dirty_sectors; @@ -751,7 +756,7 @@ static int bch2_mark_alloc(struct bch_fs *c, struct bkey_s_c k, })); if (!(flags & BTREE_TRIGGER_ALLOC_READ)) - bch2_dev_usage_update(c, ca, fs_usage, old, m, gc); + bch2_dev_usage_update(c, ca, fs_usage, old_m, m, gc); g->io_time[READ] = u.read_time; g->io_time[WRITE] = u.write_time; @@ -764,11 +769,11 @@ static int bch2_mark_alloc(struct bch_fs *c, struct bkey_s_c k, */ if ((flags & BTREE_TRIGGER_BUCKET_INVALIDATE) && - old.cached_sectors) { + old_m.cached_sectors) { update_cached_sectors(c, fs_usage, ca->dev_idx, - -old.cached_sectors); - trace_invalidate(ca, bucket_to_sector(ca, k.k->p.offset), - old.cached_sectors); + -old_m.cached_sectors); + trace_invalidate(ca, bucket_to_sector(ca, new.k->p.offset), + old_m.cached_sectors); } return 0; @@ -792,8 +797,8 @@ static int __bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca, struct bucket_mark old, new; bool overflow; - BUG_ON(data_type != BCH_DATA_SB && - data_type != BCH_DATA_JOURNAL); + BUG_ON(data_type != BCH_DATA_sb && + data_type != BCH_DATA_journal); old = bucket_cmpxchg(g, new, ({ new.data_type = data_type; @@ -824,8 +829,8 @@ void bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca, unsigned sectors, struct gc_pos pos, unsigned flags) { - BUG_ON(type != BCH_DATA_SB && - type != BCH_DATA_JOURNAL); + BUG_ON(type != BCH_DATA_sb && + type != BCH_DATA_journal); preempt_disable(); @@ -878,51 +883,46 @@ static s64 ptr_disk_sectors_delta(struct extent_ptr_decoded p, } static void bucket_set_stripe(struct bch_fs *c, - const struct bch_stripe *v, + const struct bch_extent_ptr *ptr, struct 
bch_fs_usage *fs_usage, u64 journal_seq, - unsigned flags) + unsigned flags, + bool enabled) { - bool enabled = !(flags & BTREE_TRIGGER_OVERWRITE); bool gc = flags & BTREE_TRIGGER_GC; - unsigned i; + struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); + struct bucket *g = PTR_BUCKET(ca, ptr, gc); + struct bucket_mark new, old; - for (i = 0; i < v->nr_blocks; i++) { - const struct bch_extent_ptr *ptr = v->ptrs + i; - struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); - struct bucket *g = PTR_BUCKET(ca, ptr, gc); - struct bucket_mark new, old; - - old = bucket_cmpxchg(g, new, ({ - new.stripe = enabled; - if (journal_seq) { - new.journal_seq_valid = 1; - new.journal_seq = journal_seq; - } - })); + old = bucket_cmpxchg(g, new, ({ + new.stripe = enabled; + if (journal_seq) { + new.journal_seq_valid = 1; + new.journal_seq = journal_seq; + } + })); - bch2_dev_usage_update(c, ca, fs_usage, old, new, gc); + bch2_dev_usage_update(c, ca, fs_usage, old, new, gc); - /* - * XXX write repair code for these, flag stripe as possibly bad - */ - if (old.gen != ptr->gen) - bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, - "stripe with stale pointer"); + /* + * XXX write repair code for these, flag stripe as possibly bad + */ + if (old.gen != ptr->gen) + bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, + "stripe with stale pointer"); #if 0 - /* - * We'd like to check for these, but these checks don't work - * yet: - */ - if (old.stripe && enabled) - bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, - "multiple stripes using same bucket"); - - if (!old.stripe && !enabled) - bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, - "deleting stripe but bucket not marked as stripe bucket"); + /* + * We'd like to check for these, but these checks don't work + * yet: + */ + if (old.stripe && enabled) + bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, + "multiple stripes using same bucket"); + + if (!old.stripe && !enabled) + bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, + "deleting 
stripe but bucket not marked as stripe bucket"); #endif - } } static int __mark_pointer(struct bch_fs *c, struct bkey_s_c k, @@ -1064,8 +1064,7 @@ static int bch2_mark_stripe_ptr(struct bch_fs *c, { bool gc = flags & BTREE_TRIGGER_GC; struct stripe *m; - unsigned old, new; - int blocks_nonempty_delta; + unsigned i, blocks_nonempty = 0; m = genradix_ptr(&c->stripes[gc], p.idx); @@ -1084,31 +1083,30 @@ static int bch2_mark_stripe_ptr(struct bch_fs *c, *nr_parity = m->nr_redundant; *r = m->r; - old = m->block_sectors[p.block]; m->block_sectors[p.block] += sectors; - new = m->block_sectors[p.block]; - blocks_nonempty_delta = (int) !!new - (int) !!old; - if (blocks_nonempty_delta) { - m->blocks_nonempty += blocks_nonempty_delta; + for (i = 0; i < m->nr_blocks; i++) + blocks_nonempty += m->block_sectors[i] != 0; + if (m->blocks_nonempty != blocks_nonempty) { + m->blocks_nonempty = blocks_nonempty; if (!gc) bch2_stripes_heap_update(c, m, p.idx); } - m->dirty = true; - spin_unlock(&c->ec_stripes_heap_lock); return 0; } -static int bch2_mark_extent(struct bch_fs *c, struct bkey_s_c k, +static int bch2_mark_extent(struct bch_fs *c, + struct bkey_s_c old, struct bkey_s_c new, unsigned offset, s64 sectors, enum bch_data_type data_type, struct bch_fs_usage *fs_usage, unsigned journal_seq, unsigned flags) { + struct bkey_s_c k = flags & BTREE_TRIGGER_INSERT ? new : old; struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); const union bch_extent_entry *entry; struct extent_ptr_decoded p; @@ -1124,7 +1122,7 @@ static int bch2_mark_extent(struct bch_fs *c, struct bkey_s_c k, BUG_ON(!sectors); bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { - s64 disk_sectors = data_type == BCH_DATA_BTREE + s64 disk_sectors = data_type == BCH_DATA_btree ? 
sectors : ptr_disk_sectors_delta(p, offset, sectors, flags); @@ -1177,72 +1175,98 @@ static int bch2_mark_extent(struct bch_fs *c, struct bkey_s_c k, return 0; } -static int bch2_mark_stripe(struct bch_fs *c, struct bkey_s_c k, +static int bch2_mark_stripe(struct bch_fs *c, + struct bkey_s_c old, struct bkey_s_c new, struct bch_fs_usage *fs_usage, u64 journal_seq, unsigned flags) { bool gc = flags & BTREE_TRIGGER_GC; - struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k); - size_t idx = s.k->p.offset; + size_t idx = new.k->p.offset; + const struct bch_stripe *old_s = old.k->type == KEY_TYPE_stripe + ? bkey_s_c_to_stripe(old).v : NULL; + const struct bch_stripe *new_s = new.k->type == KEY_TYPE_stripe + ? bkey_s_c_to_stripe(new).v : NULL; struct stripe *m = genradix_ptr(&c->stripes[gc], idx); unsigned i; - spin_lock(&c->ec_stripes_heap_lock); - - if (!m || ((flags & BTREE_TRIGGER_OVERWRITE) && !m->alive)) { - spin_unlock(&c->ec_stripes_heap_lock); + if (!m || (old_s && !m->alive)) { bch_err_ratelimited(c, "error marking nonexistent stripe %zu", idx); return -1; } - if (!(flags & BTREE_TRIGGER_OVERWRITE)) { - m->sectors = le16_to_cpu(s.v->sectors); - m->algorithm = s.v->algorithm; - m->nr_blocks = s.v->nr_blocks; - m->nr_redundant = s.v->nr_redundant; + if (!new_s) { + /* Deleting: */ + for (i = 0; i < old_s->nr_blocks; i++) + bucket_set_stripe(c, old_s->ptrs + i, fs_usage, + journal_seq, flags, false); - bch2_bkey_to_replicas(&m->r.e, k); + if (!gc && m->on_heap) { + spin_lock(&c->ec_stripes_heap_lock); + bch2_stripes_heap_del(c, m, idx); + spin_unlock(&c->ec_stripes_heap_lock); + } - /* - * XXX: account for stripes somehow here - */ -#if 0 - update_replicas(c, fs_usage, &m->r.e, stripe_sectors); -#endif + memset(m, 0, sizeof(*m)); + } else { + BUG_ON(old_s && new_s->nr_blocks != old_s->nr_blocks); + BUG_ON(old_s && new_s->nr_redundant != old_s->nr_redundant); + + for (i = 0; i < new_s->nr_blocks; i++) { + if (!old_s || + memcmp(new_s->ptrs + i, + old_s->ptrs + i, + 
sizeof(struct bch_extent_ptr))) { + + if (old_s) + bucket_set_stripe(c, old_s->ptrs + i, fs_usage, + journal_seq, flags, false); + bucket_set_stripe(c, new_s->ptrs + i, fs_usage, + journal_seq, flags, true); + } + } + + m->alive = true; + m->sectors = le16_to_cpu(new_s->sectors); + m->algorithm = new_s->algorithm; + m->nr_blocks = new_s->nr_blocks; + m->nr_redundant = new_s->nr_redundant; + + bch2_bkey_to_replicas(&m->r.e, new); /* gc recalculates these fields: */ if (!(flags & BTREE_TRIGGER_GC)) { - for (i = 0; i < s.v->nr_blocks; i++) { + m->blocks_nonempty = 0; + + for (i = 0; i < new_s->nr_blocks; i++) { m->block_sectors[i] = - stripe_blockcount_get(s.v, i); + stripe_blockcount_get(new_s, i); m->blocks_nonempty += !!m->block_sectors[i]; } } - if (!gc) + if (!gc) { + spin_lock(&c->ec_stripes_heap_lock); bch2_stripes_heap_update(c, m, idx); - m->alive = true; - } else { - if (!gc) - bch2_stripes_heap_del(c, m, idx); - memset(m, 0, sizeof(*m)); + spin_unlock(&c->ec_stripes_heap_lock); + } } - spin_unlock(&c->ec_stripes_heap_lock); - - bucket_set_stripe(c, s.v, fs_usage, 0, flags); return 0; } static int bch2_mark_key_locked(struct bch_fs *c, - struct bkey_s_c k, + struct bkey_s_c old, + struct bkey_s_c new, unsigned offset, s64 sectors, struct bch_fs_usage *fs_usage, u64 journal_seq, unsigned flags) { + struct bkey_s_c k = flags & BTREE_TRIGGER_INSERT ? new : old; int ret = 0; + BUG_ON(!(flags & (BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE))); + preempt_disable(); if (!fs_usage || (flags & BTREE_TRIGGER_GC)) @@ -1251,7 +1275,7 @@ static int bch2_mark_key_locked(struct bch_fs *c, switch (k.k->type) { case KEY_TYPE_alloc: - ret = bch2_mark_alloc(c, k, fs_usage, journal_seq, flags); + ret = bch2_mark_alloc(c, old, new, fs_usage, journal_seq, flags); break; case KEY_TYPE_btree_ptr: case KEY_TYPE_btree_ptr_v2: @@ -1259,16 +1283,16 @@ static int bch2_mark_key_locked(struct bch_fs *c, ? 
c->opts.btree_node_size : -c->opts.btree_node_size; - ret = bch2_mark_extent(c, k, offset, sectors, BCH_DATA_BTREE, - fs_usage, journal_seq, flags); + ret = bch2_mark_extent(c, old, new, offset, sectors, + BCH_DATA_btree, fs_usage, journal_seq, flags); break; case KEY_TYPE_extent: case KEY_TYPE_reflink_v: - ret = bch2_mark_extent(c, k, offset, sectors, BCH_DATA_USER, - fs_usage, journal_seq, flags); + ret = bch2_mark_extent(c, old, new, offset, sectors, + BCH_DATA_user, fs_usage, journal_seq, flags); break; case KEY_TYPE_stripe: - ret = bch2_mark_stripe(c, k, fs_usage, journal_seq, flags); + ret = bch2_mark_stripe(c, old, new, fs_usage, journal_seq, flags); break; case KEY_TYPE_inode: if (!(flags & BTREE_TRIGGER_OVERWRITE)) @@ -1294,82 +1318,38 @@ static int bch2_mark_key_locked(struct bch_fs *c, return ret; } -int bch2_mark_key(struct bch_fs *c, struct bkey_s_c k, +int bch2_mark_key(struct bch_fs *c, struct bkey_s_c new, unsigned offset, s64 sectors, struct bch_fs_usage *fs_usage, u64 journal_seq, unsigned flags) { + struct bkey deleted; + struct bkey_s_c old = (struct bkey_s_c) { &deleted, NULL }; int ret; + bkey_init(&deleted); + percpu_down_read(&c->mark_lock); - ret = bch2_mark_key_locked(c, k, offset, sectors, - fs_usage, journal_seq, flags); + ret = bch2_mark_key_locked(c, old, new, offset, sectors, + fs_usage, journal_seq, + BTREE_TRIGGER_INSERT|flags); percpu_up_read(&c->mark_lock); return ret; } -inline int bch2_mark_overwrite(struct btree_trans *trans, - struct btree_iter *iter, - struct bkey_s_c old, - struct bkey_i *new, - struct bch_fs_usage *fs_usage, - unsigned flags, - bool is_extents) -{ - struct bch_fs *c = trans->c; - unsigned offset = 0; - s64 sectors = -((s64) old.k->size); - - flags |= BTREE_TRIGGER_OVERWRITE; - - if (is_extents - ? 
bkey_cmp(new->k.p, bkey_start_pos(old.k)) <= 0 - : bkey_cmp(new->k.p, old.k->p)) - return 0; - - if (is_extents) { - switch (bch2_extent_overlap(&new->k, old.k)) { - case BCH_EXTENT_OVERLAP_ALL: - offset = 0; - sectors = -((s64) old.k->size); - break; - case BCH_EXTENT_OVERLAP_BACK: - offset = bkey_start_offset(&new->k) - - bkey_start_offset(old.k); - sectors = bkey_start_offset(&new->k) - - old.k->p.offset; - break; - case BCH_EXTENT_OVERLAP_FRONT: - offset = 0; - sectors = bkey_start_offset(old.k) - - new->k.p.offset; - break; - case BCH_EXTENT_OVERLAP_MIDDLE: - offset = bkey_start_offset(&new->k) - - bkey_start_offset(old.k); - sectors = -((s64) new->k.size); - flags |= BTREE_TRIGGER_OVERWRITE_SPLIT; - break; - } - - BUG_ON(sectors >= 0); - } - - return bch2_mark_key_locked(c, old, offset, sectors, fs_usage, - trans->journal_res.seq, flags) ?: 1; -} - int bch2_mark_update(struct btree_trans *trans, struct btree_iter *iter, - struct bkey_i *insert, + struct bkey_i *new, struct bch_fs_usage *fs_usage, unsigned flags) { struct bch_fs *c = trans->c; struct btree *b = iter_l(iter)->b; struct btree_node_iter node_iter = iter_l(iter)->iter; - struct bkey_packed *_k; + struct bkey_packed *_old; + struct bkey_s_c old; + struct bkey unpacked; int ret = 0; if (unlikely(flags & BTREE_TRIGGER_NORUN)) @@ -1378,34 +1358,87 @@ int bch2_mark_update(struct btree_trans *trans, if (!btree_node_type_needs_gc(iter->btree_id)) return 0; - bch2_mark_key_locked(c, bkey_i_to_s_c(insert), - 0, insert->k.size, - fs_usage, trans->journal_res.seq, - BTREE_TRIGGER_INSERT|flags); + bkey_init(&unpacked); + old = (struct bkey_s_c) { &unpacked, NULL }; - if (unlikely(flags & BTREE_TRIGGER_NOOVERWRITES)) - return 0; + if (!btree_node_type_is_extents(iter->btree_id)) { + if (btree_iter_type(iter) != BTREE_ITER_CACHED) { + _old = bch2_btree_node_iter_peek(&node_iter, b); + if (_old) + old = bkey_disassemble(b, _old, &unpacked); + } else { + struct bkey_cached *ck = (void *) iter->l[0].b; - /* - * 
For non extents, we only mark the new key, not the key being - * overwritten - unless we're actually deleting: - */ - if ((iter->btree_id == BTREE_ID_ALLOC || - iter->btree_id == BTREE_ID_EC) && - !bkey_deleted(&insert->k)) - return 0; + if (ck->valid) + old = bkey_i_to_s_c(ck->k); + } - while ((_k = bch2_btree_node_iter_peek(&node_iter, b))) { - struct bkey unpacked; - struct bkey_s_c k = bkey_disassemble(b, _k, &unpacked); + if (old.k->type == new->k.type) { + bch2_mark_key_locked(c, old, bkey_i_to_s_c(new), 0, 0, + fs_usage, trans->journal_res.seq, + BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE|flags); - ret = bch2_mark_overwrite(trans, iter, k, insert, - fs_usage, flags, - btree_node_type_is_extents(iter->btree_id)); - if (ret <= 0) - break; + } else { + bch2_mark_key_locked(c, old, bkey_i_to_s_c(new), 0, 0, + fs_usage, trans->journal_res.seq, + BTREE_TRIGGER_INSERT|flags); + bch2_mark_key_locked(c, old, bkey_i_to_s_c(new), 0, 0, + fs_usage, trans->journal_res.seq, + BTREE_TRIGGER_OVERWRITE|flags); + } + } else { + BUG_ON(btree_iter_type(iter) == BTREE_ITER_CACHED); + bch2_mark_key_locked(c, old, bkey_i_to_s_c(new), + 0, new->k.size, + fs_usage, trans->journal_res.seq, + BTREE_TRIGGER_INSERT|flags); - bch2_btree_node_iter_advance(&node_iter, b); + while ((_old = bch2_btree_node_iter_peek(&node_iter, b))) { + unsigned offset = 0; + s64 sectors; + + old = bkey_disassemble(b, _old, &unpacked); + sectors = -((s64) old.k->size); + + flags |= BTREE_TRIGGER_OVERWRITE; + + if (bkey_cmp(new->k.p, bkey_start_pos(old.k)) <= 0) + return 0; + + switch (bch2_extent_overlap(&new->k, old.k)) { + case BCH_EXTENT_OVERLAP_ALL: + offset = 0; + sectors = -((s64) old.k->size); + break; + case BCH_EXTENT_OVERLAP_BACK: + offset = bkey_start_offset(&new->k) - + bkey_start_offset(old.k); + sectors = bkey_start_offset(&new->k) - + old.k->p.offset; + break; + case BCH_EXTENT_OVERLAP_FRONT: + offset = 0; + sectors = bkey_start_offset(old.k) - + new->k.p.offset; + break; + case 
BCH_EXTENT_OVERLAP_MIDDLE: + offset = bkey_start_offset(&new->k) - + bkey_start_offset(old.k); + sectors = -((s64) new->k.size); + flags |= BTREE_TRIGGER_OVERWRITE_SPLIT; + break; + } + + BUG_ON(sectors >= 0); + + ret = bch2_mark_key_locked(c, old, bkey_i_to_s_c(new), + offset, sectors, fs_usage, + trans->journal_res.seq, flags) ?: 1; + if (ret <= 0) + break; + + bch2_btree_node_iter_advance(&node_iter, b); + } } return ret; @@ -1460,8 +1493,10 @@ void bch2_trans_fs_usage_apply(struct btree_trans *trans, } else { struct bkey_cached *ck = (void *) i->iter->l[0].b; - bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(ck->k)); - pr_err("%s", buf); + if (ck->valid) { + bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(ck->k)); + pr_err("%s", buf); + } } } } @@ -1632,7 +1667,7 @@ static int bch2_trans_mark_extent(struct btree_trans *trans, BUG_ON(!sectors); bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { - s64 disk_sectors = data_type == BCH_DATA_BTREE + s64 disk_sectors = data_type == BCH_DATA_btree ? 
sectors : ptr_disk_sectors_delta(p, offset, sectors, flags); @@ -1774,11 +1809,11 @@ int bch2_trans_mark_key(struct btree_trans *trans, struct bkey_s_c k, : -c->opts.btree_node_size; return bch2_trans_mark_extent(trans, k, offset, sectors, - flags, BCH_DATA_BTREE); + flags, BCH_DATA_btree); case KEY_TYPE_extent: case KEY_TYPE_reflink_v: return bch2_trans_mark_extent(trans, k, offset, sectors, - flags, BCH_DATA_USER); + flags, BCH_DATA_user); case KEY_TYPE_inode: d = replicas_deltas_realloc(trans, 0); @@ -1829,9 +1864,6 @@ int bch2_trans_mark_update(struct btree_trans *trans, if (ret) return ret; - if (unlikely(flags & BTREE_TRIGGER_NOOVERWRITES)) - return 0; - if (btree_iter_type(iter) == BTREE_ITER_CACHED) { struct bkey_cached *ck = (void *) iter->l[0].b; @@ -1992,7 +2024,6 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) alloc_fifo free[RESERVE_NR]; alloc_fifo free_inc; alloc_heap alloc_heap; - copygc_heap copygc_heap; size_t btree_reserve = DIV_ROUND_UP(BTREE_NODE_RESERVE, ca->mi.bucket_size / c->opts.btree_node_size); @@ -2001,15 +2032,13 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) size_t copygc_reserve = max_t(size_t, 2, nbuckets >> 7); size_t free_inc_nr = max(max_t(size_t, 1, nbuckets >> 12), btree_reserve * 2); - bool resize = ca->buckets[0] != NULL, - start_copygc = ca->copygc_thread != NULL; + bool resize = ca->buckets[0] != NULL; int ret = -ENOMEM; unsigned i; memset(&free, 0, sizeof(free)); memset(&free_inc, 0, sizeof(free_inc)); memset(&alloc_heap, 0, sizeof(alloc_heap)); - memset(&copygc_heap, 0, sizeof(copygc_heap)); if (!(buckets = kvpmalloc(sizeof(struct bucket_array) + nbuckets * sizeof(struct bucket), @@ -2022,14 +2051,13 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) copygc_reserve, GFP_KERNEL) || !init_fifo(&free[RESERVE_NONE], reserve_none, GFP_KERNEL) || !init_fifo(&free_inc, free_inc_nr, GFP_KERNEL) || - !init_heap(&alloc_heap, 
ALLOC_SCAN_BATCH(ca) << 1, GFP_KERNEL) || - !init_heap(&copygc_heap, copygc_reserve, GFP_KERNEL)) + !init_heap(&alloc_heap, ALLOC_SCAN_BATCH(ca) << 1, GFP_KERNEL)) goto err; buckets->first_bucket = ca->mi.first_bucket; buckets->nbuckets = nbuckets; - bch2_copygc_stop(ca); + bch2_copygc_stop(c); if (resize) { down_write(&c->gc_lock); @@ -2072,21 +2100,13 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) /* with gc lock held, alloc_heap can't be in use: */ swap(ca->alloc_heap, alloc_heap); - /* and we shut down copygc: */ - swap(ca->copygc_heap, copygc_heap); - nbuckets = ca->mi.nbuckets; if (resize) up_write(&ca->bucket_lock); - if (start_copygc && - bch2_copygc_start(c, ca)) - bch_err(ca, "error restarting copygc thread"); - ret = 0; err: - free_heap(&copygc_heap); free_heap(&alloc_heap); free_fifo(&free_inc); for (i = 0; i < RESERVE_NR; i++) @@ -2103,7 +2123,6 @@ void bch2_dev_buckets_free(struct bch_dev *ca) { unsigned i; - free_heap(&ca->copygc_heap); free_heap(&ca->alloc_heap); free_fifo(&ca->free_inc); for (i = 0; i < RESERVE_NR; i++) diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h index 97265fe90e96..653f6761862e 100644 --- a/fs/bcachefs/buckets.h +++ b/fs/bcachefs/buckets.h @@ -99,9 +99,9 @@ static inline enum bch_data_type ptr_data_type(const struct bkey *k, { if (k->type == KEY_TYPE_btree_ptr || k->type == KEY_TYPE_btree_ptr_v2) - return BCH_DATA_BTREE; + return BCH_DATA_btree; - return ptr->cached ? BCH_DATA_CACHED : BCH_DATA_USER; + return ptr->cached ? 
BCH_DATA_cached : BCH_DATA_user; } static inline struct bucket_mark ptr_bucket_mark(struct bch_dev *ca, @@ -182,7 +182,7 @@ static inline bool bucket_needs_journal_commit(struct bucket_mark m, /* Device usage: */ -struct bch_dev_usage bch2_dev_usage_read(struct bch_fs *, struct bch_dev *); +struct bch_dev_usage bch2_dev_usage_read(struct bch_dev *); void bch2_dev_usage_from_buckets(struct bch_fs *); @@ -202,9 +202,9 @@ static inline u64 __dev_buckets_available(struct bch_dev *ca, /* * Number of reclaimable buckets - only for use by the allocator thread: */ -static inline u64 dev_buckets_available(struct bch_fs *c, struct bch_dev *ca) +static inline u64 dev_buckets_available(struct bch_dev *ca) { - return __dev_buckets_available(ca, bch2_dev_usage_read(c, ca)); + return __dev_buckets_available(ca, bch2_dev_usage_read(ca)); } static inline u64 __dev_buckets_free(struct bch_dev *ca, @@ -215,9 +215,9 @@ static inline u64 __dev_buckets_free(struct bch_dev *ca, fifo_used(&ca->free_inc); } -static inline u64 dev_buckets_free(struct bch_fs *c, struct bch_dev *ca) +static inline u64 dev_buckets_free(struct bch_dev *ca) { - return __dev_buckets_free(ca, bch2_dev_usage_read(c, ca)); + return __dev_buckets_free(ca, bch2_dev_usage_read(ca)); } /* Filesystem usage: */ @@ -259,14 +259,11 @@ void bch2_mark_metadata_bucket(struct bch_fs *, struct bch_dev *, size_t, enum bch_data_type, unsigned, struct gc_pos, unsigned); -int bch2_mark_key(struct bch_fs *, struct bkey_s_c, unsigned, s64, - struct bch_fs_usage *, u64, unsigned); +int bch2_mark_key(struct bch_fs *, struct bkey_s_c, unsigned, + s64, struct bch_fs_usage *, u64, unsigned); int bch2_fs_usage_apply(struct bch_fs *, struct bch_fs_usage *, struct disk_reservation *, unsigned); -int bch2_mark_overwrite(struct btree_trans *, struct btree_iter *, - struct bkey_s_c, struct bkey_i *, - struct bch_fs_usage *, unsigned, bool); int bch2_mark_update(struct btree_trans *, struct btree_iter *, struct bkey_i *, struct bch_fs_usage *, 
unsigned); diff --git a/fs/bcachefs/buckets_types.h b/fs/bcachefs/buckets_types.h index 53f22726893d..4ebe80b05ffc 100644 --- a/fs/bcachefs/buckets_types.h +++ b/fs/bcachefs/buckets_types.h @@ -123,6 +123,7 @@ struct disk_reservation { }; struct copygc_heap_entry { + u8 dev; u8 gen; u32 sectors; u64 offset; diff --git a/fs/bcachefs/chardev.c b/fs/bcachefs/chardev.c index 3af521947502..0377f9018d27 100644 --- a/fs/bcachefs/chardev.c +++ b/fs/bcachefs/chardev.c @@ -468,7 +468,7 @@ static long bch2_ioctl_dev_usage(struct bch_fs *c, if (IS_ERR(ca)) return PTR_ERR(ca); - src = bch2_dev_usage_read(c, ca); + src = bch2_dev_usage_read(ca); arg.state = ca->mi.state; arg.bucket_size = ca->mi.bucket_size; diff --git a/fs/bcachefs/checksum.c b/fs/bcachefs/checksum.c index a01073e54a33..3d88719ba86c 100644 --- a/fs/bcachefs/checksum.c +++ b/fs/bcachefs/checksum.c @@ -10,7 +10,7 @@ #include <linux/random.h> #include <linux/scatterlist.h> #include <crypto/algapi.h> -#include <crypto/chacha20.h> +#include <crypto/chacha.h> #include <crypto/hash.h> #include <crypto/poly1305.h> #include <crypto/skcipher.h> @@ -68,21 +68,21 @@ static u64 bch2_checksum_update(unsigned type, u64 crc, const void *data, size_t } } -static inline void do_encrypt_sg(struct crypto_skcipher *tfm, +static inline void do_encrypt_sg(struct crypto_sync_skcipher *tfm, struct nonce nonce, struct scatterlist *sg, size_t len) { - SKCIPHER_REQUEST_ON_STACK(req, tfm); + SYNC_SKCIPHER_REQUEST_ON_STACK(req, tfm); int ret; - skcipher_request_set_tfm(req, tfm); + skcipher_request_set_sync_tfm(req, tfm); skcipher_request_set_crypt(req, sg, sg, len, nonce.d); ret = crypto_skcipher_encrypt(req); BUG_ON(ret); } -static inline void do_encrypt(struct crypto_skcipher *tfm, +static inline void do_encrypt(struct crypto_sync_skcipher *tfm, struct nonce nonce, void *buf, size_t len) { @@ -95,8 +95,8 @@ static inline void do_encrypt(struct crypto_skcipher *tfm, int bch2_chacha_encrypt_key(struct bch_key *key, struct nonce nonce, void 
*buf, size_t len) { - struct crypto_skcipher *chacha20 = - crypto_alloc_skcipher("chacha20", 0, 0); + struct crypto_sync_skcipher *chacha20 = + crypto_alloc_sync_skcipher("chacha20", 0, 0); int ret; if (!chacha20) { @@ -104,7 +104,8 @@ int bch2_chacha_encrypt_key(struct bch_key *key, struct nonce nonce, return PTR_ERR(chacha20); } - ret = crypto_skcipher_setkey(chacha20, (void *) key, sizeof(*key)); + ret = crypto_skcipher_setkey(&chacha20->base, + (void *) key, sizeof(*key)); if (ret) { pr_err("crypto_skcipher_setkey() error: %i", ret); goto err; @@ -112,7 +113,7 @@ int bch2_chacha_encrypt_key(struct bch_key *key, struct nonce nonce, do_encrypt(chacha20, nonce, buf, len); err: - crypto_free_skcipher(chacha20); + crypto_free_sync_skcipher(chacha20); return ret; } @@ -199,7 +200,7 @@ static struct bch_csum __bch2_checksum_bio(struct bch_fs *c, unsigned type, kunmap_atomic(p); } #else - __bio_for_each_contig_segment(bv, bio, *iter, *iter) + __bio_for_each_bvec(bv, bio, *iter, *iter) crc = bch2_checksum_update(type, crc, page_address(bv.bv_page) + bv.bv_offset, bv.bv_len); @@ -224,7 +225,7 @@ static struct bch_csum __bch2_checksum_bio(struct bch_fs *c, unsigned type, kunmap_atomic(p); } #else - __bio_for_each_contig_segment(bv, bio, *iter, *iter) + __bio_for_each_bvec(bv, bio, *iter, *iter) crypto_shash_update(desc, page_address(bv.bv_page) + bv.bv_offset, bv.bv_len); @@ -463,7 +464,7 @@ err: static int bch2_alloc_ciphers(struct bch_fs *c) { if (!c->chacha20) - c->chacha20 = crypto_alloc_skcipher("chacha20", 0, 0); + c->chacha20 = crypto_alloc_sync_skcipher("chacha20", 0, 0); if (IS_ERR(c->chacha20)) { bch_err(c, "error requesting chacha20 module: %li", PTR_ERR(c->chacha20)); @@ -546,7 +547,7 @@ int bch2_enable_encryption(struct bch_fs *c, bool keyed) goto err; } - ret = crypto_skcipher_setkey(c->chacha20, + ret = crypto_skcipher_setkey(&c->chacha20->base, (void *) &key.key, sizeof(key.key)); if (ret) goto err; @@ -574,7 +575,7 @@ void bch2_fs_encryption_exit(struct 
bch_fs *c) if (!IS_ERR_OR_NULL(c->poly1305)) crypto_free_shash(c->poly1305); if (!IS_ERR_OR_NULL(c->chacha20)) - crypto_free_skcipher(c->chacha20); + crypto_free_sync_skcipher(c->chacha20); if (!IS_ERR_OR_NULL(c->sha256)) crypto_free_shash(c->sha256); } @@ -606,7 +607,7 @@ int bch2_fs_encryption_init(struct bch_fs *c) if (ret) goto out; - ret = crypto_skcipher_setkey(c->chacha20, + ret = crypto_skcipher_setkey(&c->chacha20->base, (void *) &key.key, sizeof(key.key)); if (ret) goto out; diff --git a/fs/bcachefs/checksum.h b/fs/bcachefs/checksum.h index 833537cc8fd0..24dee8039d57 100644 --- a/fs/bcachefs/checksum.h +++ b/fs/bcachefs/checksum.h @@ -7,7 +7,7 @@ #include "super-io.h" #include <linux/crc64.h> -#include <crypto/chacha20.h> +#include <crypto/chacha.h> static inline bool bch2_checksum_mergeable(unsigned type) { @@ -138,9 +138,9 @@ static inline bool bch2_crc_cmp(struct bch_csum l, struct bch_csum r) /* for skipping ahead and encrypting/decrypting at an offset: */ static inline struct nonce nonce_add(struct nonce nonce, unsigned offset) { - EBUG_ON(offset & (CHACHA20_BLOCK_SIZE - 1)); + EBUG_ON(offset & (CHACHA_BLOCK_SIZE - 1)); - le32_add_cpu(&nonce.d[0], offset / CHACHA20_BLOCK_SIZE); + le32_add_cpu(&nonce.d[0], offset / CHACHA_BLOCK_SIZE); return nonce; } diff --git a/fs/bcachefs/clock.c b/fs/bcachefs/clock.c index a9f5d5696622..1d1590de55e8 100644 --- a/fs/bcachefs/clock.c +++ b/fs/bcachefs/clock.c @@ -152,9 +152,8 @@ void __bch2_increment_clock(struct io_clock *clock, unsigned sectors) timer->fn(timer); } -ssize_t bch2_io_timers_show(struct io_clock *clock, char *buf) +void bch2_io_timers_to_text(struct printbuf *out, struct io_clock *clock) { - struct printbuf out = _PBUF(buf, PAGE_SIZE); unsigned long now; unsigned i; @@ -162,12 +161,10 @@ ssize_t bch2_io_timers_show(struct io_clock *clock, char *buf) now = atomic_long_read(&clock->now); for (i = 0; i < clock->timers.used; i++) - pr_buf(&out, "%ps:\t%li\n", + pr_buf(out, "%ps:\t%li\n", 
clock->timers.data[i]->fn, clock->timers.data[i]->expire - now); spin_unlock(&clock->timer_lock); - - return out.pos - buf; } void bch2_io_clock_exit(struct io_clock *clock) diff --git a/fs/bcachefs/clock.h b/fs/bcachefs/clock.h index da50afe206cc..70a0f7436c84 100644 --- a/fs/bcachefs/clock.h +++ b/fs/bcachefs/clock.h @@ -30,7 +30,7 @@ void bch2_io_clock_schedule_timeout(struct io_clock *, unsigned long); __ret; \ }) -ssize_t bch2_io_timers_show(struct io_clock *, char *); +void bch2_io_timers_to_text(struct printbuf *, struct io_clock *); void bch2_io_clock_exit(struct io_clock *); int bch2_io_clock_init(struct io_clock *); diff --git a/fs/bcachefs/compress.c b/fs/bcachefs/compress.c index ddff52de2e97..b50d2b0d5fd3 100644 --- a/fs/bcachefs/compress.c +++ b/fs/bcachefs/compress.c @@ -7,7 +7,6 @@ #include "super-io.h" #include <linux/lz4.h> -#include <linux/sched/mm.h> #include <linux/zlib.h> #include <linux/zstd.h> @@ -46,7 +45,7 @@ static bool bio_phys_contig(struct bio *bio, struct bvec_iter start) struct bvec_iter iter; void *expected_start = NULL; - __bio_for_each_segment(bv, bio, iter, start) { + __bio_for_each_bvec(bv, bio, iter, start) { if (expected_start && expected_start != page_address(bv.bv_page) + bv.bv_offset) return false; @@ -64,7 +63,7 @@ static struct bbuf __bio_map_or_bounce(struct bch_fs *c, struct bio *bio, struct bbuf ret; struct bio_vec bv; struct bvec_iter iter; - unsigned nr_pages = 0, flags; + unsigned nr_pages = 0; struct page *stack_pages[16]; struct page **pages = NULL; void *data; @@ -104,10 +103,7 @@ static struct bbuf __bio_map_or_bounce(struct bch_fs *c, struct bio *bio, __bio_for_each_segment(bv, bio, iter, start) pages[nr_pages++] = bv.bv_page; - flags = memalloc_nofs_save(); data = vmap(pages, nr_pages, VM_MAP, PAGE_KERNEL); - memalloc_nofs_restore(flags); - if (pages != stack_pages) kfree(pages); diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index 9442d6e4041c..425b0b806cee 100644 --- a/fs/bcachefs/ec.c +++ 
b/fs/bcachefs/ec.c @@ -200,40 +200,6 @@ static bool extent_has_stripe_ptr(struct bkey_s_c k, u64 idx) return false; } -static void ec_stripe_key_init(struct bch_fs *c, - struct bkey_i_stripe *s, - struct open_buckets *blocks, - struct open_buckets *parity, - unsigned stripe_size) -{ - struct open_bucket *ob; - unsigned i, u64s; - - bkey_stripe_init(&s->k_i); - s->v.sectors = cpu_to_le16(stripe_size); - s->v.algorithm = 0; - s->v.nr_blocks = parity->nr + blocks->nr; - s->v.nr_redundant = parity->nr; - s->v.csum_granularity_bits = ilog2(c->sb.encoded_extent_max); - s->v.csum_type = BCH_CSUM_CRC32C; - s->v.pad = 0; - - open_bucket_for_each(c, blocks, ob, i) - s->v.ptrs[i] = ob->ptr; - - open_bucket_for_each(c, parity, ob, i) - s->v.ptrs[blocks->nr + i] = ob->ptr; - - while ((u64s = stripe_val_u64s(&s->v)) > BKEY_VAL_U64s_MAX) { - BUG_ON(1 << s->v.csum_granularity_bits >= - le16_to_cpu(s->v.sectors) || - s->v.csum_granularity_bits == U8_MAX); - s->v.csum_granularity_bits++; - } - - set_bkey_val_u64s(&s->k, u64s); -} - /* Checksumming: */ static void ec_generate_checksums(struct ec_stripe_buf *buf) @@ -360,7 +326,9 @@ static void ec_block_endio(struct bio *bio) struct bch_dev *ca = ec_bio->ca; struct closure *cl = bio->bi_private; - if (bch2_dev_io_err_on(bio->bi_status, ca, "erasure coding")) + if (bch2_dev_io_err_on(bio->bi_status, ca, "erasure coding %s: %s", + bio_data_dir(bio) ? 
"write" : "read", + bch2_blk_status_to_str(bio->bi_status))) clear_bit(ec_bio->idx, ec_bio->buf->valid); bio_put(&ec_bio->bio); @@ -605,39 +573,16 @@ static void heap_verify_backpointer(struct bch_fs *c, size_t idx) BUG_ON(h->data[m->heap_idx].idx != idx); } -void bch2_stripes_heap_update(struct bch_fs *c, - struct stripe *m, size_t idx) -{ - ec_stripes_heap *h = &c->ec_stripes_heap; - size_t i; - - if (m->alive) { - heap_verify_backpointer(c, idx); - - h->data[m->heap_idx].blocks_nonempty = m->blocks_nonempty; - - i = m->heap_idx; - heap_sift_up(h, i, ec_stripes_heap_cmp, - ec_stripes_heap_set_backpointer); - heap_sift_down(h, i, ec_stripes_heap_cmp, - ec_stripes_heap_set_backpointer); - - heap_verify_backpointer(c, idx); - } else { - bch2_stripes_heap_insert(c, m, idx); - } - - if (stripe_idx_to_delete(c) >= 0 && - !percpu_ref_is_dying(&c->writes)) - schedule_work(&c->ec_stripe_delete_work); -} - void bch2_stripes_heap_del(struct bch_fs *c, struct stripe *m, size_t idx) { + if (!m->on_heap) + return; + + m->on_heap = false; + heap_verify_backpointer(c, idx); - m->alive = false; heap_del(&c->ec_stripes_heap, m->heap_idx, ec_stripes_heap_cmp, ec_stripes_heap_set_backpointer); @@ -646,23 +591,54 @@ void bch2_stripes_heap_del(struct bch_fs *c, void bch2_stripes_heap_insert(struct bch_fs *c, struct stripe *m, size_t idx) { + if (m->on_heap) + return; + BUG_ON(heap_full(&c->ec_stripes_heap)); + m->on_heap = true; + heap_add(&c->ec_stripes_heap, ((struct ec_stripe_heap_entry) { .idx = idx, .blocks_nonempty = m->blocks_nonempty, }), ec_stripes_heap_cmp, ec_stripes_heap_set_backpointer); - m->alive = true; heap_verify_backpointer(c, idx); } +void bch2_stripes_heap_update(struct bch_fs *c, + struct stripe *m, size_t idx) +{ + ec_stripes_heap *h = &c->ec_stripes_heap; + size_t i; + + if (!m->on_heap) + return; + + heap_verify_backpointer(c, idx); + + h->data[m->heap_idx].blocks_nonempty = m->blocks_nonempty; + + i = m->heap_idx; + heap_sift_up(h, i, ec_stripes_heap_cmp, + 
ec_stripes_heap_set_backpointer); + heap_sift_down(h, i, ec_stripes_heap_cmp, + ec_stripes_heap_set_backpointer); + + heap_verify_backpointer(c, idx); + + if (stripe_idx_to_delete(c) >= 0 && + !percpu_ref_is_dying(&c->writes)) + schedule_work(&c->ec_stripe_delete_work); +} + /* stripe deletion */ static int ec_stripe_delete(struct bch_fs *c, size_t idx) { + //pr_info("deleting stripe %zu", idx); return bch2_btree_delete_range(c, BTREE_ID_EC, POS(0, idx), POS(0, idx + 1), @@ -675,23 +651,20 @@ static void ec_stripe_delete_work(struct work_struct *work) container_of(work, struct bch_fs, ec_stripe_delete_work); ssize_t idx; - down_read(&c->gc_lock); - mutex_lock(&c->ec_stripe_create_lock); - while (1) { spin_lock(&c->ec_stripes_heap_lock); idx = stripe_idx_to_delete(c); - spin_unlock(&c->ec_stripes_heap_lock); - - if (idx < 0) + if (idx < 0) { + spin_unlock(&c->ec_stripes_heap_lock); break; + } + + bch2_stripes_heap_del(c, genradix_ptr(&c->stripes[0], idx), idx); + spin_unlock(&c->ec_stripes_heap_lock); if (ec_stripe_delete(c, idx)) break; } - - mutex_unlock(&c->ec_stripe_create_lock); - up_read(&c->gc_lock); } /* stripe creation: */ @@ -784,6 +757,8 @@ static int ec_stripe_update_ptrs(struct bch_fs *c, bkey_on_stack_init(&sk); bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); + /* XXX this doesn't support the reflink btree */ + iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, bkey_start_pos(pos), BTREE_ITER_INTENT); @@ -809,12 +784,9 @@ static int ec_stripe_update_ptrs(struct bch_fs *c, bkey_on_stack_reassemble(&sk, c, k); e = bkey_i_to_s_extent(sk.k); - extent_for_each_ptr(e, ptr) { - if (ptr->dev == dev) - ec_ptr = ptr; - else - ptr->cached = true; - } + bch2_bkey_drop_ptrs(e.s, ptr, ptr->dev != dev); + ec_ptr = (void *) bch2_bkey_has_device(e.s_c, dev); + BUG_ON(!ec_ptr); extent_stripe_ptr_add(e, s, ec_ptr, idx); @@ -844,6 +816,7 @@ static void ec_stripe_create(struct ec_stripe_new *s) struct bch_fs *c = s->c; struct open_bucket *ob; struct bkey_i *k; + struct 
stripe *m; struct bch_stripe *v = &s->stripe.key.v; unsigned i, nr_data = v->nr_blocks - v->nr_redundant; struct closure cl; @@ -854,10 +827,13 @@ static void ec_stripe_create(struct ec_stripe_new *s) closure_init_stack(&cl); if (s->err) { - bch_err(c, "error creating stripe: error writing data buckets"); + if (s->err != -EROFS) + bch_err(c, "error creating stripe: error writing data buckets"); goto err; } + BUG_ON(!s->allocated); + if (!percpu_ref_tryget(&c->writes)) goto err; @@ -880,22 +856,33 @@ static void ec_stripe_create(struct ec_stripe_new *s) goto err_put_writes; } - mutex_lock(&c->ec_stripe_create_lock); - - ret = ec_stripe_bkey_insert(c, &s->stripe.key); + ret = s->existing_stripe + ? bch2_btree_insert(c, BTREE_ID_EC, &s->stripe.key.k_i, + NULL, NULL, BTREE_INSERT_NOFAIL) + : ec_stripe_bkey_insert(c, &s->stripe.key); if (ret) { bch_err(c, "error creating stripe: error creating stripe key"); - goto err_unlock; + goto err_put_writes; } for_each_keylist_key(&s->keys, k) { ret = ec_stripe_update_ptrs(c, &s->stripe, &k->k); - if (ret) + if (ret) { + bch_err(c, "error creating stripe: error updating pointers"); break; + } } -err_unlock: - mutex_unlock(&c->ec_stripe_create_lock); + spin_lock(&c->ec_stripes_heap_lock); + m = genradix_ptr(&c->stripes[0], s->stripe.key.k.p.offset); +#if 0 + pr_info("created a %s stripe %llu", + s->existing_stripe ? 
"existing" : "new", + s->stripe.key.k.p.offset); +#endif + BUG_ON(m->on_heap); + bch2_stripes_heap_insert(c, m, s->stripe.key.k.p.offset); + spin_unlock(&c->ec_stripes_heap_lock); err_put_writes: percpu_ref_put(&c->writes); err: @@ -908,30 +895,52 @@ err: bch2_keylist_free(&s->keys, s->inline_keys); - mutex_lock(&s->h->lock); - list_del(&s->list); - mutex_unlock(&s->h->lock); - for (i = 0; i < s->stripe.key.v.nr_blocks; i++) kvpfree(s->stripe.data[i], s->stripe.size << 9); kfree(s); } -static struct ec_stripe_new *ec_stripe_set_pending(struct ec_stripe_head *h) +static void ec_stripe_create_work(struct work_struct *work) { - struct ec_stripe_new *s = h->s; + struct bch_fs *c = container_of(work, + struct bch_fs, ec_stripe_create_work); + struct ec_stripe_new *s, *n; +restart: + mutex_lock(&c->ec_stripe_new_lock); + list_for_each_entry_safe(s, n, &c->ec_stripe_new_list, list) + if (!atomic_read(&s->pin)) { + list_del(&s->list); + mutex_unlock(&c->ec_stripe_new_lock); + ec_stripe_create(s); + goto restart; + } + mutex_unlock(&c->ec_stripe_new_lock); +} - list_add(&s->list, &h->stripes); - h->s = NULL; +static void ec_stripe_new_put(struct bch_fs *c, struct ec_stripe_new *s) +{ + BUG_ON(atomic_read(&s->pin) <= 0); - return s; + if (atomic_dec_and_test(&s->pin)) { + BUG_ON(!s->pending); + queue_work(system_long_wq, &c->ec_stripe_create_work); + } } -static void ec_stripe_new_put(struct ec_stripe_new *s) +static void ec_stripe_set_pending(struct bch_fs *c, struct ec_stripe_head *h) { - BUG_ON(atomic_read(&s->pin) <= 0); - if (atomic_dec_and_test(&s->pin)) - ec_stripe_create(s); + struct ec_stripe_new *s = h->s; + + BUG_ON(!s->allocated && !s->err); + + h->s = NULL; + s->pending = true; + + mutex_lock(&c->ec_stripe_new_lock); + list_add(&s->list, &c->ec_stripe_new_list); + mutex_unlock(&c->ec_stripe_new_lock); + + ec_stripe_new_put(c, s); } /* have a full bucket - hand it off to be erasure coded: */ @@ -942,7 +951,7 @@ void bch2_ec_bucket_written(struct bch_fs *c, struct 
open_bucket *ob) if (ob->sectors_free) s->err = -1; - ec_stripe_new_put(s); + ec_stripe_new_put(c, s); } void bch2_ec_bucket_cancel(struct bch_fs *c, struct open_bucket *ob) @@ -976,6 +985,8 @@ void bch2_ec_add_backpointer(struct bch_fs *c, struct write_point *wp, if (!ob) return; + //pr_info("adding backpointer at %llu:%llu", pos.inode, pos.offset); + ec = ob->ec; mutex_lock(&ec->lock); @@ -1034,14 +1045,43 @@ static unsigned pick_blocksize(struct bch_fs *c, return best.size; } -int bch2_ec_stripe_new_alloc(struct bch_fs *c, struct ec_stripe_head *h) +static bool may_create_new_stripe(struct bch_fs *c) +{ + return false; +} + +static void ec_stripe_key_init(struct bch_fs *c, + struct bkey_i_stripe *s, + unsigned nr_data, + unsigned nr_parity, + unsigned stripe_size) +{ + unsigned u64s; + + bkey_stripe_init(&s->k_i); + s->v.sectors = cpu_to_le16(stripe_size); + s->v.algorithm = 0; + s->v.nr_blocks = nr_data + nr_parity; + s->v.nr_redundant = nr_parity; + s->v.csum_granularity_bits = ilog2(c->sb.encoded_extent_max); + s->v.csum_type = BCH_CSUM_CRC32C; + s->v.pad = 0; + + while ((u64s = stripe_val_u64s(&s->v)) > BKEY_VAL_U64s_MAX) { + BUG_ON(1 << s->v.csum_granularity_bits >= + le16_to_cpu(s->v.sectors) || + s->v.csum_granularity_bits == U8_MAX); + s->v.csum_granularity_bits++; + } + + set_bkey_val_u64s(&s->k, u64s); +} + +static int ec_new_stripe_alloc(struct bch_fs *c, struct ec_stripe_head *h) { struct ec_stripe_new *s; unsigned i; - BUG_ON(h->parity.nr != h->redundancy); - BUG_ON(!h->blocks.nr); - BUG_ON(h->parity.nr + h->blocks.nr > EC_STRIPE_MAX); lockdep_assert_held(&h->lock); s = kzalloc(sizeof(*s), GFP_KERNEL); @@ -1052,11 +1092,9 @@ int bch2_ec_stripe_new_alloc(struct bch_fs *c, struct ec_stripe_head *h) atomic_set(&s->pin, 1); s->c = c; s->h = h; - s->blocks = h->blocks; - s->parity = h->parity; - - memset(&h->blocks, 0, sizeof(h->blocks)); - memset(&h->parity, 0, sizeof(h->parity)); + s->nr_data = min_t(unsigned, h->nr_active_devs, + EC_STRIPE_MAX) - 
h->redundancy; + s->nr_parity = h->redundancy; bch2_keylist_init(&s->keys, s->inline_keys); @@ -1064,9 +1102,8 @@ int bch2_ec_stripe_new_alloc(struct bch_fs *c, struct ec_stripe_head *h) s->stripe.size = h->blocksize; memset(s->stripe.valid, 0xFF, sizeof(s->stripe.valid)); - ec_stripe_key_init(c, &s->stripe.key, - &s->blocks, &s->parity, - h->blocksize); + ec_stripe_key_init(c, &s->stripe.key, s->nr_data, + s->nr_parity, h->blocksize); for (i = 0; i < s->stripe.key.v.nr_blocks; i++) { s->stripe.data[i] = kvpmalloc(s->stripe.size << 9, GFP_KERNEL); @@ -1098,14 +1135,13 @@ ec_new_stripe_head_alloc(struct bch_fs *c, unsigned target, mutex_init(&h->lock); mutex_lock(&h->lock); - INIT_LIST_HEAD(&h->stripes); h->target = target; h->algo = algo; h->redundancy = redundancy; rcu_read_lock(); - h->devs = target_rw_devs(c, BCH_DATA_USER, target); + h->devs = target_rw_devs(c, BCH_DATA_user, target); for_each_member_device_rcu(ca, c, i, &h->devs) if (!ca->mi.durability) @@ -1118,26 +1154,22 @@ ec_new_stripe_head_alloc(struct bch_fs *c, unsigned target, h->nr_active_devs++; rcu_read_unlock(); - list_add(&h->list, &c->ec_new_stripe_list); + list_add(&h->list, &c->ec_stripe_head_list); return h; } -void bch2_ec_stripe_head_put(struct ec_stripe_head *h) +void bch2_ec_stripe_head_put(struct bch_fs *c, struct ec_stripe_head *h) { - struct ec_stripe_new *s = NULL; - if (h->s && + h->s->allocated && bitmap_weight(h->s->blocks_allocated, h->s->blocks.nr) == h->s->blocks.nr) - s = ec_stripe_set_pending(h); + ec_stripe_set_pending(c, h); mutex_unlock(&h->lock); - - if (s) - ec_stripe_new_put(s); } -struct ec_stripe_head *bch2_ec_stripe_head_get(struct bch_fs *c, +struct ec_stripe_head *__bch2_ec_stripe_head_get(struct bch_fs *c, unsigned target, unsigned algo, unsigned redundancy) @@ -1147,8 +1179,8 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct bch_fs *c, if (!redundancy) return NULL; - mutex_lock(&c->ec_new_stripe_lock); - list_for_each_entry(h, &c->ec_new_stripe_list, list) 
+ mutex_lock(&c->ec_stripe_head_lock); + list_for_each_entry(h, &c->ec_stripe_head_list, list) if (h->target == target && h->algo == algo && h->redundancy == redundancy) { @@ -1158,7 +1190,196 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct bch_fs *c, h = ec_new_stripe_head_alloc(c, target, algo, redundancy); found: - mutex_unlock(&c->ec_new_stripe_lock); + mutex_unlock(&c->ec_stripe_head_lock); + return h; +} + +/* + * XXX: use a higher watermark for allocating open buckets here: + */ +static int new_stripe_alloc_buckets(struct bch_fs *c, struct ec_stripe_head *h) +{ + struct bch_devs_mask devs; + struct open_bucket *ob; + unsigned i, nr_have, nr_data = + min_t(unsigned, h->nr_active_devs, + EC_STRIPE_MAX) - h->redundancy; + bool have_cache = true; + int ret = 0; + + devs = h->devs; + + for_each_set_bit(i, h->s->blocks_allocated, EC_STRIPE_MAX) { + __clear_bit(h->s->stripe.key.v.ptrs[i].dev, devs.d); + --nr_data; + } + + BUG_ON(h->s->blocks.nr > nr_data); + BUG_ON(h->s->parity.nr > h->redundancy); + + open_bucket_for_each(c, &h->s->parity, ob, i) + __clear_bit(ob->ptr.dev, devs.d); + open_bucket_for_each(c, &h->s->blocks, ob, i) + __clear_bit(ob->ptr.dev, devs.d); + + percpu_down_read(&c->mark_lock); + rcu_read_lock(); + + if (h->s->parity.nr < h->redundancy) { + nr_have = h->s->parity.nr; + + ret = bch2_bucket_alloc_set(c, &h->s->parity, + &h->parity_stripe, + &devs, + h->redundancy, + &nr_have, + &have_cache, + RESERVE_NONE, + 0, + NULL); + if (ret) + goto err; + } + + if (h->s->blocks.nr < nr_data) { + nr_have = h->s->blocks.nr; + + ret = bch2_bucket_alloc_set(c, &h->s->blocks, + &h->block_stripe, + &devs, + nr_data, + &nr_have, + &have_cache, + RESERVE_NONE, + 0, + NULL); + if (ret) + goto err; + } +err: + rcu_read_unlock(); + percpu_up_read(&c->mark_lock); + return ret; +} + +/* XXX: doesn't obey target: */ +static s64 get_existing_stripe(struct bch_fs *c, + unsigned target, + unsigned algo, + unsigned redundancy) +{ + ec_stripes_heap *h = 
&c->ec_stripes_heap; + struct stripe *m; + size_t heap_idx; + u64 stripe_idx; + + if (may_create_new_stripe(c)) + return -1; + + spin_lock(&c->ec_stripes_heap_lock); + for (heap_idx = 0; heap_idx < h->used; heap_idx++) { + if (!h->data[heap_idx].blocks_nonempty) + continue; + + stripe_idx = h->data[heap_idx].idx; + m = genradix_ptr(&c->stripes[0], stripe_idx); + + if (m->algorithm == algo && + m->nr_redundant == redundancy && + m->blocks_nonempty < m->nr_blocks - m->nr_redundant) { + bch2_stripes_heap_del(c, m, stripe_idx); + spin_unlock(&c->ec_stripes_heap_lock); + return stripe_idx; + } + } + + spin_unlock(&c->ec_stripes_heap_lock); + return -1; +} + +static int get_stripe_key(struct bch_fs *c, u64 idx, struct ec_stripe_buf *stripe) +{ + struct btree_trans trans; + struct btree_iter *iter; + struct bkey_s_c k; + int ret; + + bch2_trans_init(&trans, c, 0, 0); + iter = bch2_trans_get_iter(&trans, BTREE_ID_EC, POS(0, idx), BTREE_ITER_SLOTS); + k = bch2_btree_iter_peek_slot(iter); + ret = bkey_err(k); + if (!ret) + bkey_reassemble(&stripe->key.k_i, k); + bch2_trans_exit(&trans); + + return ret; +} + +struct ec_stripe_head *bch2_ec_stripe_head_get(struct bch_fs *c, + unsigned target, + unsigned algo, + unsigned redundancy) +{ + struct closure cl; + struct ec_stripe_head *h; + struct open_bucket *ob; + unsigned i, data_idx = 0; + s64 idx; + + closure_init_stack(&cl); + + h = __bch2_ec_stripe_head_get(c, target, algo, redundancy); + if (!h) + return NULL; + + if (!h->s && ec_new_stripe_alloc(c, h)) { + bch2_ec_stripe_head_put(c, h); + return NULL; + } + + if (!h->s->allocated) { + if (!h->s->existing_stripe && + (idx = get_existing_stripe(c, target, algo, redundancy)) >= 0) { + //pr_info("got existing stripe %llu", idx); + + h->s->existing_stripe = true; + h->s->existing_stripe_idx = idx; + if (get_stripe_key(c, idx, &h->s->stripe)) { + /* btree error */ + BUG(); + } + + for (i = 0; i < h->s->stripe.key.v.nr_blocks; i++) + if (stripe_blockcount_get(&h->s->stripe.key.v, 
i)) { + __set_bit(i, h->s->blocks_allocated); + ec_block_io(c, &h->s->stripe, READ, i, &cl); + } + } + + if (new_stripe_alloc_buckets(c, h)) { + bch2_ec_stripe_head_put(c, h); + h = NULL; + goto out; + } + + open_bucket_for_each(c, &h->s->blocks, ob, i) { + data_idx = find_next_zero_bit(h->s->blocks_allocated, + h->s->nr_data, data_idx); + BUG_ON(data_idx >= h->s->nr_data); + + h->s->stripe.key.v.ptrs[data_idx] = ob->ptr; + h->s->data_block_idx[i] = data_idx; + data_idx++; + } + + open_bucket_for_each(c, &h->s->parity, ob, i) + h->s->stripe.key.v.ptrs[h->s->nr_data + i] = ob->ptr; + + //pr_info("new stripe, blocks_allocated %lx", h->s->blocks_allocated[0]); + h->s->allocated = true; + } +out: + closure_sync(&cl); return h; } @@ -1168,14 +1389,10 @@ void bch2_ec_stop_dev(struct bch_fs *c, struct bch_dev *ca) struct open_bucket *ob; unsigned i; - mutex_lock(&c->ec_new_stripe_lock); - list_for_each_entry(h, &c->ec_new_stripe_list, list) { - struct ec_stripe_new *s = NULL; + mutex_lock(&c->ec_stripe_head_lock); + list_for_each_entry(h, &c->ec_stripe_head_list, list) { mutex_lock(&h->lock); - bch2_open_buckets_stop_dev(c, ca, &h->blocks); - bch2_open_buckets_stop_dev(c, ca, &h->parity); - if (!h->s) goto unlock; @@ -1187,15 +1404,12 @@ void bch2_ec_stop_dev(struct bch_fs *c, struct bch_dev *ca) goto found; goto unlock; found: - h->s->err = -1; - s = ec_stripe_set_pending(h); + h->s->err = -EROFS; + ec_stripe_set_pending(c, h); unlock: mutex_unlock(&h->lock); - - if (s) - ec_stripe_new_put(s); } - mutex_unlock(&c->ec_new_stripe_lock); + mutex_unlock(&c->ec_stripe_head_lock); } static int __bch2_stripe_write_key(struct btree_trans *trans, @@ -1278,11 +1492,21 @@ static int bch2_stripes_read_fn(struct bch_fs *c, enum btree_id id, { int ret = 0; - if (k.k->type == KEY_TYPE_stripe) + if (k.k->type == KEY_TYPE_stripe) { + struct stripe *m; + ret = __ec_stripe_mem_alloc(c, k.k->p.offset, GFP_KERNEL) ?: bch2_mark_key(c, k, 0, 0, NULL, 0, BTREE_TRIGGER_ALLOC_READ| 
BTREE_TRIGGER_NOATOMIC); + if (ret) + return ret; + + spin_lock(&c->ec_stripes_heap_lock); + m = genradix_ptr(&c->stripes[0], k.k->p.offset); + bch2_stripes_heap_insert(c, m, k.k->p.offset); + spin_unlock(&c->ec_stripes_heap_lock); + } return ret; } @@ -1333,25 +1557,73 @@ int bch2_ec_mem_alloc(struct bch_fs *c, bool gc) return 0; } +void bch2_stripes_heap_to_text(struct printbuf *out, struct bch_fs *c) +{ + ec_stripes_heap *h = &c->ec_stripes_heap; + struct stripe *m; + size_t i; + + spin_lock(&c->ec_stripes_heap_lock); + for (i = 0; i < min(h->used, 20UL); i++) { + m = genradix_ptr(&c->stripes[0], h->data[i].idx); + + pr_buf(out, "%zu %u/%u+%u\n", h->data[i].idx, + h->data[i].blocks_nonempty, + m->nr_blocks - m->nr_redundant, + m->nr_redundant); + } + spin_unlock(&c->ec_stripes_heap_lock); +} + +void bch2_new_stripes_to_text(struct printbuf *out, struct bch_fs *c) +{ + struct ec_stripe_head *h; + struct ec_stripe_new *s; + + mutex_lock(&c->ec_stripe_head_lock); + list_for_each_entry(h, &c->ec_stripe_head_list, list) { + pr_buf(out, "target %u algo %u redundancy %u:\n", + h->target, h->algo, h->redundancy); + + if (h->s) + pr_buf(out, "\tpending: blocks %u allocated %u\n", + h->s->blocks.nr, + bitmap_weight(h->s->blocks_allocated, + h->s->blocks.nr)); + } + mutex_unlock(&c->ec_stripe_head_lock); + + mutex_lock(&c->ec_stripe_new_lock); + list_for_each_entry(h, &c->ec_stripe_new_list, list) { + pr_buf(out, "\tin flight: blocks %u allocated %u pin %u\n", + s->blocks.nr, + bitmap_weight(s->blocks_allocated, + s->blocks.nr), + atomic_read(&s->pin)); + } + mutex_unlock(&c->ec_stripe_new_lock); +} + void bch2_fs_ec_exit(struct bch_fs *c) { struct ec_stripe_head *h; while (1) { - mutex_lock(&c->ec_new_stripe_lock); - h = list_first_entry_or_null(&c->ec_new_stripe_list, + mutex_lock(&c->ec_stripe_head_lock); + h = list_first_entry_or_null(&c->ec_stripe_head_list, struct ec_stripe_head, list); if (h) list_del(&h->list); - mutex_unlock(&c->ec_new_stripe_lock); + 
mutex_unlock(&c->ec_stripe_head_lock); if (!h) break; BUG_ON(h->s); - BUG_ON(!list_empty(&h->stripes)); kfree(h); } + BUG_ON(!list_empty(&c->ec_stripe_new_list)); + free_heap(&c->ec_stripes_heap); genradix_free(&c->stripes[0]); bioset_exit(&c->ec_bioset); @@ -1359,6 +1631,7 @@ void bch2_fs_ec_exit(struct bch_fs *c) int bch2_fs_ec_init(struct bch_fs *c) { + INIT_WORK(&c->ec_stripe_create_work, ec_stripe_create_work); INIT_WORK(&c->ec_stripe_delete_work, ec_stripe_delete_work); return bioset_init(&c->ec_bioset, 1, offsetof(struct ec_bio, bio), diff --git a/fs/bcachefs/ec.h b/fs/bcachefs/ec.h index 4dfaac034886..f8fc3d616cd7 100644 --- a/fs/bcachefs/ec.h +++ b/fs/bcachefs/ec.h @@ -93,9 +93,17 @@ struct ec_stripe_new { int err; + u8 nr_data; + u8 nr_parity; + bool allocated; + bool pending; + bool existing_stripe; + u64 existing_stripe_idx; + unsigned long blocks_allocated[BITS_TO_LONGS(EC_STRIPE_MAX)]; struct open_buckets blocks; + u8 data_block_idx[EC_STRIPE_MAX]; struct open_buckets parity; struct keylist keys; @@ -108,8 +116,6 @@ struct ec_stripe_head { struct list_head list; struct mutex lock; - struct list_head stripes; - unsigned target; unsigned algo; unsigned redundancy; @@ -122,9 +128,6 @@ struct ec_stripe_head { struct dev_stripe_state block_stripe; struct dev_stripe_state parity_stripe; - struct open_buckets blocks; - struct open_buckets parity; - struct ec_stripe_new *s; }; @@ -139,7 +142,7 @@ void bch2_ec_bucket_cancel(struct bch_fs *, struct open_bucket *); int bch2_ec_stripe_new_alloc(struct bch_fs *, struct ec_stripe_head *); -void bch2_ec_stripe_head_put(struct ec_stripe_head *); +void bch2_ec_stripe_head_put(struct bch_fs *, struct ec_stripe_head *); struct ec_stripe_head *bch2_ec_stripe_head_get(struct bch_fs *, unsigned, unsigned, unsigned); @@ -157,6 +160,9 @@ int bch2_stripes_write(struct bch_fs *, unsigned, bool *); int bch2_ec_mem_alloc(struct bch_fs *, bool); +void bch2_stripes_heap_to_text(struct printbuf *, struct bch_fs *); +void 
bch2_new_stripes_to_text(struct printbuf *, struct bch_fs *); + void bch2_fs_ec_exit(struct bch_fs *); int bch2_fs_ec_init(struct bch_fs *); diff --git a/fs/bcachefs/ec_types.h b/fs/bcachefs/ec_types.h index 5c3f77c8aac7..e4d633fca5bf 100644 --- a/fs/bcachefs/ec_types.h +++ b/fs/bcachefs/ec_types.h @@ -22,6 +22,7 @@ struct stripe { unsigned alive:1; unsigned dirty:1; + unsigned on_heap:1; u8 blocks_nonempty; u16 block_sectors[EC_STRIPE_MAX]; diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index 251d4af773a5..568f039edcff 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -179,11 +179,6 @@ void bch2_btree_ptr_debugcheck(struct bch_fs *c, struct bkey_s_c k) if (!percpu_down_read_trylock(&c->mark_lock)) return; - bch2_fs_inconsistent_on(!test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) && - !bch2_bkey_replicas_marked_locked(c, k, false), c, - "btree key bad (replicas not marked in superblock):\n%s", - (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)); - bkey_for_each_ptr(ptrs, ptr) { ca = bch_dev_bkey_exists(c, ptr->dev); @@ -194,7 +189,7 @@ void bch2_btree_ptr_debugcheck(struct bch_fs *c, struct bkey_s_c k) goto err; err = "inconsistent"; - if (mark.data_type != BCH_DATA_BTREE || + if (mark.data_type != BCH_DATA_btree || mark.dirty_sectors < c->opts.btree_node_size) goto err; } @@ -267,11 +262,6 @@ void bch2_extent_debugcheck(struct bch_fs *c, struct bkey_s_c k) if (!percpu_down_read_trylock(&c->mark_lock)) return; - bch2_fs_inconsistent_on(!test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) && - !bch2_bkey_replicas_marked_locked(c, e.s_c, false), c, - "extent key bad (replicas not marked in superblock):\n%s", - (bch2_bkey_val_to_text(&PBUF(buf), c, e.s_c), buf)); - extent_for_each_ptr_decode(e, p, entry) { struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev); struct bucket_mark mark = ptr_bucket_mark(ca, &p.ptr); @@ -289,7 +279,7 @@ void bch2_extent_debugcheck(struct bch_fs *c, struct bkey_s_c k) "key too stale: %i", stale); 
bch2_fs_inconsistent_on(!stale && - (mark.data_type != BCH_DATA_USER || + (mark.data_type != BCH_DATA_user || mark_sectors < disk_sectors), c, "extent pointer not marked: %s:\n" "type %u sectors %u < %u", @@ -724,7 +714,7 @@ static unsigned bch2_extent_ptr_durability(struct bch_fs *c, if (WARN_ON(!s)) goto out; - durability = max_t(unsigned, durability, s->nr_redundant); + durability += s->nr_redundant; } out: return durability; diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index 4411883ab7b8..951a436195ee 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -603,7 +603,7 @@ int bch2_migrate_page(struct address_space *mapping, struct page *newpage, EBUG_ON(!PageLocked(page)); EBUG_ON(!PageLocked(newpage)); - ret = migrate_page_move_mapping(mapping, newpage, page, NULL, mode, 0); + ret = migrate_page_move_mapping(mapping, newpage, page, 0); if (ret != MIGRATEPAGE_SUCCESS) return ret; @@ -628,10 +628,10 @@ int bch2_migrate_page(struct address_space *mapping, struct page *newpage, static void bch2_readpages_end_io(struct bio *bio) { + struct bvec_iter_all iter; struct bio_vec *bv; - unsigned i; - bio_for_each_segment_all(bv, bio, i) { + bio_for_each_segment_all(bv, bio, iter) { struct page *page = bv->bv_page; if (!bio->bi_status) { @@ -783,11 +783,8 @@ static void readpage_bio_extend(struct readpages_iter *iter, if (!get_more) break; - rcu_read_lock(); - page = radix_tree_lookup(&iter->mapping->i_pages, page_offset); - rcu_read_unlock(); - - if (page && !radix_tree_exceptional_entry(page)) + page = xa_load(&iter->mapping->i_pages, page_offset); + if (page && !xa_is_value(page)) break; page = __page_cache_alloc(readahead_gfp_mask(iter->mapping)); @@ -1038,32 +1035,33 @@ static void bch2_writepage_io_done(struct closure *cl) struct bch_writepage_io, cl); struct bch_fs *c = io->op.c; struct bio *bio = &io->op.wbio.bio; + struct bvec_iter_all iter; struct bio_vec *bvec; - unsigned i, j; + unsigned i; if (io->op.error) { - bio_for_each_segment_all(bvec, 
bio, i) { + bio_for_each_segment_all(bvec, bio, iter) { struct bch_page_state *s; SetPageError(bvec->bv_page); - mapping_set_error(io->inode->v.i_mapping, -EIO); + mapping_set_error(bvec->bv_page->mapping, -EIO); s = __bch2_page_state(bvec->bv_page); spin_lock(&s->lock); - for (j = 0; j < PAGE_SECTORS; j++) - s->s[j].nr_replicas = 0; + for (i = 0; i < PAGE_SECTORS; i++) + s->s[i].nr_replicas = 0; spin_unlock(&s->lock); } } if (io->op.flags & BCH_WRITE_WROTE_DATA_INLINE) { - bio_for_each_segment_all(bvec, bio, i) { + bio_for_each_segment_all(bvec, bio, iter) { struct bch_page_state *s; s = __bch2_page_state(bvec->bv_page); spin_lock(&s->lock); - for (j = 0; j < PAGE_SECTORS; j++) - s->s[j].nr_replicas = 0; + for (i = 0; i < PAGE_SECTORS; i++) + s->s[i].nr_replicas = 0; spin_unlock(&s->lock); } } @@ -1087,7 +1085,7 @@ static void bch2_writepage_io_done(struct closure *cl) */ i_sectors_acct(c, io->inode, NULL, io->op.i_sectors_delta); - bio_for_each_segment_all(bvec, bio, i) { + bio_for_each_segment_all(bvec, bio, iter) { struct bch_page_state *s = __bch2_page_state(bvec->bv_page); if (atomic_dec_and_test(&s->write_count)) @@ -1241,7 +1239,7 @@ do_io: if (w->io && (w->io->op.res.nr_replicas != nr_replicas_this_write || - bio_full(&w->io->op.wbio.bio) || + bio_full(&w->io->op.wbio.bio, PAGE_SIZE) || w->io->op.wbio.bio.bi_iter.bi_size + (sectors << 9) >= (BIO_MAX_PAGES * PAGE_SIZE) || bio_end_sector(&w->io->op.wbio.bio) != sector)) @@ -1518,24 +1516,24 @@ retry_reservation: if (!pg_copied) break; + if (!PageUptodate(page) && + pg_copied != PAGE_SIZE && + pos + copied + pg_copied < inode->v.i_size) { + zero_user(page, 0, PAGE_SIZE); + break; + } + flush_dcache_page(page); iov_iter_advance(iter, pg_copied); copied += pg_copied; + + if (pg_copied != pg_len) + break; } if (!copied) goto out; - if (copied < len && - ((offset + copied) & (PAGE_SIZE - 1))) { - struct page *page = pages[(offset + copied) >> PAGE_SHIFT]; - - if (!PageUptodate(page)) { - zero_user(page, 0, 
PAGE_SIZE); - copied -= (offset + copied) & (PAGE_SIZE - 1); - } - } - spin_lock(&inode->v.i_lock); if (pos + copied > inode->v.i_size) i_size_write(&inode->v, pos + copied); @@ -1632,6 +1630,7 @@ again: } pos += ret; written += ret; + ret = 0; balance_dirty_pages_ratelimited(mapping); } while (iov_iter_count(iter)); @@ -1809,8 +1808,9 @@ static long bch2_dio_write_loop(struct dio_write *dio) struct bch_inode_info *inode = file_bch_inode(req->ki_filp); struct bch_fs *c = inode->v.i_sb->s_fs_info; struct bio *bio = &dio->op.wbio.bio; + struct bvec_iter_all iter; struct bio_vec *bv; - unsigned i, unaligned; + unsigned unaligned; bool sync = dio->sync; long ret; @@ -1818,22 +1818,12 @@ static long bch2_dio_write_loop(struct dio_write *dio) goto loop; while (1) { - size_t extra = dio->iter.count - - min(BIO_MAX_PAGES * PAGE_SIZE, dio->iter.count); - if (kthread) use_mm(dio->mm); BUG_ON(current->faults_disabled_mapping); current->faults_disabled_mapping = mapping; - /* - * Don't issue more than 2MB at once, the bcachefs io path in - * io.c can't bounce more than that: - */ - - dio->iter.count -= extra; ret = bio_iov_iter_get_pages(bio, &dio->iter); - dio->iter.count += extra; current->faults_disabled_mapping = NULL; if (kthread) @@ -1851,7 +1841,7 @@ static long bch2_dio_write_loop(struct dio_write *dio) * bio_iov_iter_get_pages was only able to get < * blocksize worth of pages: */ - bio_for_each_segment_all(bv, bio, i) + bio_for_each_segment_all(bv, bio, iter) put_page(bv->bv_page); ret = -EFAULT; goto err; @@ -1914,7 +1904,7 @@ loop: i_size_write(&inode->v, req->ki_pos); spin_unlock(&inode->v.i_lock); - bio_for_each_segment_all(bv, bio, i) + bio_for_each_segment_all(bv, bio, iter) put_page(bv->bv_page); if (!dio->iter.count || dio->op.error) break; @@ -2825,235 +2815,6 @@ static void mark_range_unallocated(struct bch_inode_info *inode, } while (index <= end_index); } -static int generic_access_check_limits(struct file *file, loff_t pos, - loff_t *count) -{ - struct 
inode *inode = file->f_mapping->host; - loff_t max_size = inode->i_sb->s_maxbytes; - - if (!(file->f_flags & O_LARGEFILE)) - max_size = MAX_NON_LFS; - - if (unlikely(pos >= max_size)) - return -EFBIG; - *count = min(*count, max_size - pos); - return 0; -} - -static int generic_write_check_limits(struct file *file, loff_t pos, - loff_t *count) -{ - loff_t limit = rlimit(RLIMIT_FSIZE); - - if (limit != RLIM_INFINITY) { - if (pos >= limit) { - send_sig(SIGXFSZ, current, 0); - return -EFBIG; - } - *count = min(*count, limit - pos); - } - - return generic_access_check_limits(file, pos, count); -} - -static int generic_remap_checks(struct file *file_in, loff_t pos_in, - struct file *file_out, loff_t pos_out, - loff_t *req_count, unsigned int remap_flags) -{ - struct inode *inode_in = file_in->f_mapping->host; - struct inode *inode_out = file_out->f_mapping->host; - uint64_t count = *req_count; - uint64_t bcount; - loff_t size_in, size_out; - loff_t bs = inode_out->i_sb->s_blocksize; - int ret; - - /* The start of both ranges must be aligned to an fs block. */ - if (!IS_ALIGNED(pos_in, bs) || !IS_ALIGNED(pos_out, bs)) - return -EINVAL; - - /* Ensure offsets don't wrap. */ - if (pos_in + count < pos_in || pos_out + count < pos_out) - return -EINVAL; - - size_in = i_size_read(inode_in); - size_out = i_size_read(inode_out); - - /* Dedupe requires both ranges to be within EOF. */ - if ((remap_flags & REMAP_FILE_DEDUP) && - (pos_in >= size_in || pos_in + count > size_in || - pos_out >= size_out || pos_out + count > size_out)) - return -EINVAL; - - /* Ensure the infile range is within the infile. 
*/ - if (pos_in >= size_in) - return -EINVAL; - count = min(count, size_in - (uint64_t)pos_in); - - ret = generic_access_check_limits(file_in, pos_in, &count); - if (ret) - return ret; - - ret = generic_write_check_limits(file_out, pos_out, &count); - if (ret) - return ret; - - /* - * If the user wanted us to link to the infile's EOF, round up to the - * next block boundary for this check. - * - * Otherwise, make sure the count is also block-aligned, having - * already confirmed the starting offsets' block alignment. - */ - if (pos_in + count == size_in) { - bcount = ALIGN(size_in, bs) - pos_in; - } else { - if (!IS_ALIGNED(count, bs)) - count = ALIGN_DOWN(count, bs); - bcount = count; - } - - /* Don't allow overlapped cloning within the same file. */ - if (inode_in == inode_out && - pos_out + bcount > pos_in && - pos_out < pos_in + bcount) - return -EINVAL; - - /* - * We shortened the request but the caller can't deal with that, so - * bounce the request back to userspace. - */ - if (*req_count != count && !(remap_flags & REMAP_FILE_CAN_SHORTEN)) - return -EINVAL; - - *req_count = count; - return 0; -} - -static int generic_remap_check_len(struct inode *inode_in, - struct inode *inode_out, - loff_t pos_out, - loff_t *len, - unsigned int remap_flags) -{ - u64 blkmask = i_blocksize(inode_in) - 1; - loff_t new_len = *len; - - if ((*len & blkmask) == 0) - return 0; - - if ((remap_flags & REMAP_FILE_DEDUP) || - pos_out + *len < i_size_read(inode_out)) - new_len &= ~blkmask; - - if (new_len == *len) - return 0; - - if (remap_flags & REMAP_FILE_CAN_SHORTEN) { - *len = new_len; - return 0; - } - - return (remap_flags & REMAP_FILE_DEDUP) ? 
-EBADE : -EINVAL; -} - -static int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in, - struct file *file_out, loff_t pos_out, - loff_t *len, unsigned int remap_flags) -{ - struct inode *inode_in = file_inode(file_in); - struct inode *inode_out = file_inode(file_out); - bool same_inode = (inode_in == inode_out); - int ret; - - /* Don't touch certain kinds of inodes */ - if (IS_IMMUTABLE(inode_out)) - return -EPERM; - - if (IS_SWAPFILE(inode_in) || IS_SWAPFILE(inode_out)) - return -ETXTBSY; - - /* Don't reflink dirs, pipes, sockets... */ - if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode)) - return -EISDIR; - if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode)) - return -EINVAL; - - /* Zero length dedupe exits immediately; reflink goes to EOF. */ - if (*len == 0) { - loff_t isize = i_size_read(inode_in); - - if ((remap_flags & REMAP_FILE_DEDUP) || pos_in == isize) - return 0; - if (pos_in > isize) - return -EINVAL; - *len = isize - pos_in; - if (*len == 0) - return 0; - } - - /* Check that we don't violate system file offset limits. */ - ret = generic_remap_checks(file_in, pos_in, file_out, pos_out, len, - remap_flags); - if (ret) - return ret; - - /* Wait for the completion of any pending IOs on both files */ - inode_dio_wait(inode_in); - if (!same_inode) - inode_dio_wait(inode_out); - - ret = filemap_write_and_wait_range(inode_in->i_mapping, - pos_in, pos_in + *len - 1); - if (ret) - return ret; - - ret = filemap_write_and_wait_range(inode_out->i_mapping, - pos_out, pos_out + *len - 1); - if (ret) - return ret; - - /* - * Check that the extents are the same. 
- */ - if (remap_flags & REMAP_FILE_DEDUP) { - bool is_same = false; - - ret = vfs_dedupe_file_range_compare(inode_in, pos_in, - inode_out, pos_out, *len, &is_same); - if (ret) - return ret; - if (!is_same) - return -EBADE; - } - - ret = generic_remap_check_len(inode_in, inode_out, pos_out, len, - remap_flags); - if (ret) - return ret; - - /* If can't alter the file contents, we're done. */ - if (!(remap_flags & REMAP_FILE_DEDUP)) { - /* Update the timestamps, since we can alter file contents. */ - if (!(file_out->f_mode & FMODE_NOCMTIME)) { - ret = file_update_time(file_out); - if (ret) - return ret; - } - - /* - * Clear the security bits if the process is not being run by - * root. This keeps people from modifying setuid and setgid - * binaries. - */ - ret = file_remove_privs(file_out); - if (ret) - return ret; - } - - return 0; -} - loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src, struct file *file_dst, loff_t pos_dst, loff_t len, unsigned remap_flags) @@ -3249,7 +3010,7 @@ static loff_t page_hole_offset(struct address_space *mapping, loff_t offset) loff_t ret = -1; page = find_lock_entry(mapping, index); - if (!page || radix_tree_exception(page)) + if (!page || xa_is_value(page)) return offset; pg_offset = __page_hole_offset(page, offset & (PAGE_SIZE - 1)); diff --git a/fs/bcachefs/fs-io.h b/fs/bcachefs/fs-io.h index 1b593ea707d5..7063556d289b 100644 --- a/fs/bcachefs/fs-io.h +++ b/fs/bcachefs/fs-io.h @@ -35,10 +35,6 @@ int bch2_fsync(struct file *, loff_t, loff_t, int); int bch2_truncate(struct bch_inode_info *, struct iattr *); long bch2_fallocate_dispatch(struct file *, int, loff_t, loff_t); -#define REMAP_FILE_ADVISORY (0) -#define REMAP_FILE_DEDUP (1 << 0) -#define REMAP_FILE_CAN_SHORTEN (1 << 1) - loff_t bch2_remap_file_range(struct file *, loff_t, struct file *, loff_t, loff_t, unsigned); diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index ba73e5258e8d..a47923d67f7a 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -966,15 
+966,6 @@ static int bch2_vfs_readdir(struct file *file, struct dir_context *ctx) return bch2_readdir(c, inode->v.i_ino, ctx); } -static int bch2_clone_file_range(struct file *file_src, loff_t pos_src, - struct file *file_dst, loff_t pos_dst, - u64 len) -{ - return bch2_remap_file_range(file_src, pos_src, - file_dst, pos_dst, - len, 0); -} - static const struct file_operations bch_file_operations = { .llseek = bch2_llseek, .read_iter = bch2_read_iter, @@ -992,7 +983,7 @@ static const struct file_operations bch_file_operations = { #ifdef CONFIG_COMPAT .compat_ioctl = bch2_compat_fs_ioctl, #endif - .clone_file_range = bch2_clone_file_range, + .remap_file_range = bch2_remap_file_range, }; static const struct inode_operations bch_file_inode_operations = { @@ -1523,7 +1514,7 @@ static struct dentry *bch2_mount(struct file_system_type *fs_type, sb->s_bdi->congested_fn = bch2_congested; sb->s_bdi->congested_data = c; - sb->s_bdi->ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_SIZE; + sb->s_bdi->ra_pages = VM_READAHEAD_PAGES; for_each_online_member(ca, c, i) { struct block_device *bdev = ca->disk_sb.bdev; diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index 1aca92cacdfc..19b79e60126a 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -31,9 +31,17 @@ #include <linux/blkdev.h> #include <linux/random.h> +#include <linux/sched/mm.h> #include <trace/events/bcachefs.h> +const char *bch2_blk_status_to_str(blk_status_t status) +{ + if (status == BLK_STS_REMOVED) + return "device removed"; + return blk_status_to_str(status); +} + static bool bch2_target_congested(struct bch_fs *c, u16 target) { const struct bch_devs_mask *devs; @@ -124,10 +132,10 @@ void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw) void bch2_bio_free_pages_pool(struct bch_fs *c, struct bio *bio) { + struct bvec_iter_all iter; struct bio_vec *bv; - unsigned i; - bio_for_each_segment_all(bv, bio, i) + bio_for_each_segment_all(bv, bio, iter) if (bv->bv_page != ZERO_PAGE(0)) 
mempool_free(bv->bv_page, &c->bio_bounce_pages); bio->bi_vcnt = 0; @@ -611,7 +619,8 @@ static void bch2_write_endio(struct bio *bio) struct bch_fs *c = wbio->c; struct bch_dev *ca = bch_dev_bkey_exists(c, wbio->dev); - if (bch2_dev_io_err_on(bio->bi_status, ca, "data write")) + if (bch2_dev_io_err_on(bio->bi_status, ca, "data write: %s", + bch2_blk_status_to_str(bio->bi_status))) set_bit(wbio->dev, op->failed.d); if (wbio->have_ioref) { @@ -1053,7 +1062,10 @@ static void __bch2_write(struct closure *cl) struct write_point *wp; struct bio *bio; bool skip_put = true; + unsigned nofs_flags; int ret; + + nofs_flags = memalloc_nofs_save(); again: memset(&op->failed, 0, sizeof(op->failed)); @@ -1100,6 +1112,16 @@ again: goto flush_io; } + /* + * It's possible for the allocator to fail, put us on the + * freelist waitlist, and then succeed in one of various retry + * paths: if that happens, we need to disable the skip_put + * optimization because otherwise there won't necessarily be a + * barrier before we free the bch_write_op: + */ + if (atomic_read(&cl->remaining) & CLOSURE_WAITING) + skip_put = false; + bch2_open_bucket_get(c, wp, &op->open_buckets); ret = bch2_write_extent(op, wp, &bio); bch2_alloc_sectors_done(c, wp); @@ -1129,19 +1151,21 @@ again: key_to_write = (void *) (op->insert_keys.keys_p + key_to_write_offset); - bch2_submit_wbio_replicas(to_wbio(bio), c, BCH_DATA_USER, + bch2_submit_wbio_replicas(to_wbio(bio), c, BCH_DATA_user, key_to_write); } while (ret); if (!skip_put) continue_at(cl, bch2_write_index, index_update_wq(op)); +out: + memalloc_nofs_restore(nofs_flags); return; err: op->error = ret; op->flags |= BCH_WRITE_DONE; continue_at(cl, bch2_write_index, index_update_wq(op)); - return; + goto out; flush_io: /* * If the write can't all be submitted at once, we generally want to @@ -1152,7 +1176,7 @@ flush_io: */ if (current->flags & PF_WQ_WORKER) { continue_at(cl, bch2_write_index, index_update_wq(op)); - return; + goto out; } closure_sync(cl); @@ 
-1163,7 +1187,7 @@ flush_io: if (op->error) { op->flags |= BCH_WRITE_DONE; continue_at_nobarrier(cl, bch2_write_done, NULL); - return; + goto out; } } @@ -1921,7 +1945,8 @@ static void bch2_read_endio(struct bio *bio) if (!rbio->split) rbio->bio.bi_end_io = rbio->end_io; - if (bch2_dev_io_err_on(bio->bi_status, ca, "data read")) { + if (bch2_dev_io_err_on(bio->bi_status, ca, "data read; %s", + bch2_blk_status_to_str(bio->bi_status))) { bch2_rbio_error(rbio, READ_RETRY_AVOID, bio->bi_status); return; } @@ -2174,7 +2199,7 @@ get_bio: goto out; } - this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_USER], + this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_user], bio_sectors(&rbio->bio)); bio_set_dev(&rbio->bio, ca->disk_sb.bdev); diff --git a/fs/bcachefs/io.h b/fs/bcachefs/io.h index 0ad293bd6295..ded468d70f09 100644 --- a/fs/bcachefs/io.h +++ b/fs/bcachefs/io.h @@ -22,6 +22,8 @@ void bch2_submit_wbio_replicas(struct bch_write_bio *, struct bch_fs *, #define BLK_STS_REMOVED ((__force blk_status_t)128) +const char *bch2_blk_status_to_str(blk_status_t); + enum bch_write_flags { BCH_WRITE_ALLOC_NOWAIT = (1 << 0), BCH_WRITE_CACHED = (1 << 1), diff --git a/fs/bcachefs/io_types.h b/fs/bcachefs/io_types.h index 684e4c9a5d98..b23727d212b9 100644 --- a/fs/bcachefs/io_types.h +++ b/fs/bcachefs/io_types.h @@ -78,7 +78,6 @@ struct bch_write_bio { u64 submit_time; struct bch_devs_list failed; - u8 order; u8 dev; unsigned split:1, diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index b4f7b61ba9ac..210ad1b0c469 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -847,7 +847,7 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, if (pos <= ja->cur_idx) ja->cur_idx = (ja->cur_idx + 1) % ja->nr; - bch2_mark_metadata_bucket(c, ca, bucket, BCH_DATA_JOURNAL, + bch2_mark_metadata_bucket(c, ca, bucket, BCH_DATA_journal, ca->mi.bucket_size, gc_phase(GC_PHASE_SB), 0); @@ -1135,9 +1135,8 @@ out: /* debug: */ -ssize_t 
bch2_journal_print_debug(struct journal *j, char *buf) +void bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) { - struct printbuf out = _PBUF(buf, PAGE_SIZE); struct bch_fs *c = container_of(j, struct bch_fs, journal); union journal_res_state s; struct bch_dev *ca; @@ -1147,7 +1146,7 @@ ssize_t bch2_journal_print_debug(struct journal *j, char *buf) spin_lock(&j->lock); s = READ_ONCE(j->reservations); - pr_buf(&out, + pr_buf(out, "active journal entries:\t%llu\n" "seq:\t\t\t%llu\n" "last_seq:\t\t%llu\n" @@ -1165,44 +1164,44 @@ ssize_t bch2_journal_print_debug(struct journal *j, char *buf) switch (s.cur_entry_offset) { case JOURNAL_ENTRY_ERROR_VAL: - pr_buf(&out, "error\n"); + pr_buf(out, "error\n"); break; case JOURNAL_ENTRY_CLOSED_VAL: - pr_buf(&out, "closed\n"); + pr_buf(out, "closed\n"); break; default: - pr_buf(&out, "%u/%u\n", + pr_buf(out, "%u/%u\n", s.cur_entry_offset, j->cur_entry_u64s); break; } - pr_buf(&out, + pr_buf(out, "current entry refs:\t%u\n" "prev entry unwritten:\t", journal_state_count(s, s.idx)); if (s.prev_buf_unwritten) - pr_buf(&out, "yes, ref %u sectors %u\n", + pr_buf(out, "yes, ref %u sectors %u\n", journal_state_count(s, !s.idx), journal_prev_buf(j)->sectors); else - pr_buf(&out, "no\n"); + pr_buf(out, "no\n"); - pr_buf(&out, + pr_buf(out, "need write:\t\t%i\n" "replay done:\t\t%i\n", test_bit(JOURNAL_NEED_WRITE, &j->flags), test_bit(JOURNAL_REPLAY_DONE, &j->flags)); for_each_member_device_rcu(ca, c, iter, - &c->rw_devs[BCH_DATA_JOURNAL]) { + &c->rw_devs[BCH_DATA_journal]) { struct journal_device *ja = &ca->journal; if (!ja->nr) continue; - pr_buf(&out, + pr_buf(out, "dev %u:\n" "\tnr\t\t%u\n" "\tavailable\t%u:%u\n" @@ -1221,34 +1220,29 @@ ssize_t bch2_journal_print_debug(struct journal *j, char *buf) spin_unlock(&j->lock); rcu_read_unlock(); - - return out.pos - buf; } -ssize_t bch2_journal_print_pins(struct journal *j, char *buf) +void bch2_journal_pins_to_text(struct printbuf *out, struct journal *j) { - struct 
printbuf out = _PBUF(buf, PAGE_SIZE); struct journal_entry_pin_list *pin_list; struct journal_entry_pin *pin; u64 i; spin_lock(&j->lock); fifo_for_each_entry_ptr(pin_list, &j->pin, i) { - pr_buf(&out, "%llu: count %u\n", + pr_buf(out, "%llu: count %u\n", i, atomic_read(&pin_list->count)); list_for_each_entry(pin, &pin_list->list, list) - pr_buf(&out, "\t%px %ps\n", + pr_buf(out, "\t%px %ps\n", pin, pin->flush); if (!list_empty(&pin_list->flushed)) - pr_buf(&out, "flushed:\n"); + pr_buf(out, "flushed:\n"); list_for_each_entry(pin, &pin_list->flushed, list) - pr_buf(&out, "\t%px %ps\n", + pr_buf(out, "\t%px %ps\n", pin, pin->flush); } spin_unlock(&j->lock); - - return out.pos - buf; } diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h index 9c286f58c854..56438840efd7 100644 --- a/fs/bcachefs/journal.h +++ b/fs/bcachefs/journal.h @@ -281,7 +281,7 @@ static inline void bch2_journal_res_put(struct journal *j, if (!res->ref) return; - lock_release(&j->res_map, 0, _THIS_IP_); + lock_release(&j->res_map, _THIS_IP_); while (res->u64s) bch2_journal_add_entry(j, res, @@ -499,8 +499,8 @@ static inline void bch2_journal_set_replay_done(struct journal *j) void bch2_journal_unblock(struct journal *); void bch2_journal_block(struct journal *); -ssize_t bch2_journal_print_debug(struct journal *, char *); -ssize_t bch2_journal_print_pins(struct journal *, char *); +void bch2_journal_debug_to_text(struct printbuf *, struct journal *); +void bch2_journal_pins_to_text(struct printbuf *, struct journal *); int bch2_set_nr_journal_buckets(struct bch_fs *, struct bch_dev *, unsigned nr); diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index b7625285b3ad..89585833c846 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -6,6 +6,7 @@ #include "buckets.h" #include "checksum.h" #include "error.h" +#include "io.h" #include "journal.h" #include "journal_io.h" #include "journal_reclaim.h" @@ -661,7 +662,7 @@ int bch2_journal_read(struct bch_fs *c, 
struct list_head *list) for_each_member_device(ca, c, iter) { if (!test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) && - !(bch2_dev_has_data(c, ca) & (1 << BCH_DATA_JOURNAL))) + !(bch2_dev_has_data(c, ca) & (1 << BCH_DATA_journal))) continue; if ((ca->mi.state == BCH_MEMBER_STATE_RW || @@ -695,11 +696,11 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list) * the devices - this is wrong: */ - bch2_devlist_to_replicas(&replicas.e, BCH_DATA_JOURNAL, i->devs); + bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal, i->devs); if (!degraded && (test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) || - fsck_err_on(!bch2_replicas_marked(c, &replicas.e, false), c, + fsck_err_on(!bch2_replicas_marked(c, &replicas.e), c, "superblock not marked as containing replicas %s", (bch2_replicas_entry_to_text(&PBUF(buf), &replicas.e), buf)))) { @@ -759,7 +760,7 @@ static void __journal_write_alloc(struct journal *j, sectors > ja->sectors_free) continue; - bch2_dev_stripe_increment(c, ca, &j->wp.stripe); + bch2_dev_stripe_increment(ca, &j->wp.stripe); bch2_bkey_append_ptr(&w->key, (struct bch_extent_ptr) { @@ -796,7 +797,7 @@ static int journal_write_alloc(struct journal *j, struct journal_buf *w, rcu_read_lock(); devs_sorted = bch2_dev_alloc_list(c, &j->wp.stripe, - &c->rw_devs[BCH_DATA_JOURNAL]); + &c->rw_devs[BCH_DATA_journal]); __journal_write_alloc(j, w, &devs_sorted, sectors, &replicas, replicas_want); @@ -914,7 +915,7 @@ static void journal_write_done(struct closure *cl) goto err; } - bch2_devlist_to_replicas(&replicas.e, BCH_DATA_JOURNAL, devs); + bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal, devs); if (bch2_mark_replicas(c, &replicas.e)) goto err; @@ -961,7 +962,8 @@ static void journal_write_endio(struct bio *bio) struct bch_dev *ca = bio->bi_private; struct journal *j = &ca->fs->journal; - if (bch2_dev_io_err_on(bio->bi_status, ca, "journal write") || + if (bch2_dev_io_err_on(bio->bi_status, ca, "journal write: %s", + bch2_blk_status_to_str(bio->bi_status)) || 
bch2_meta_write_fault("journal")) { struct journal_buf *w = journal_prev_buf(j); unsigned long flags; @@ -1105,7 +1107,7 @@ retry_alloc: continue; } - this_cpu_add(ca->io_done->sectors[WRITE][BCH_DATA_JOURNAL], + this_cpu_add(ca->io_done->sectors[WRITE][BCH_DATA_journal], sectors); bio = ca->journal.bio; diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c index 4811ab9f879e..57591983eebd 100644 --- a/fs/bcachefs/journal_reclaim.c +++ b/fs/bcachefs/journal_reclaim.c @@ -70,7 +70,7 @@ static struct journal_space { rcu_read_lock(); for_each_member_device_rcu(ca, c, i, - &c->rw_devs[BCH_DATA_JOURNAL]) { + &c->rw_devs[BCH_DATA_journal]) { struct journal_device *ja = &ca->journal; unsigned buckets_this_device, sectors_this_device; @@ -139,7 +139,7 @@ void bch2_journal_space_available(struct journal *j) rcu_read_lock(); for_each_member_device_rcu(ca, c, i, - &c->rw_devs[BCH_DATA_JOURNAL]) { + &c->rw_devs[BCH_DATA_journal]) { struct journal_device *ja = &ca->journal; if (!ja->nr) @@ -618,7 +618,7 @@ int bch2_journal_flush_device_pins(struct journal *j, int dev_idx) return ret; mutex_lock(&c->replicas_gc_lock); - bch2_replicas_gc_start(c, 1 << BCH_DATA_JOURNAL); + bch2_replicas_gc_start(c, 1 << BCH_DATA_journal); seq = 0; @@ -627,7 +627,7 @@ int bch2_journal_flush_device_pins(struct journal *j, int dev_idx) struct bch_replicas_padded replicas; seq = max(seq, journal_last_seq(j)); - bch2_devlist_to_replicas(&replicas.e, BCH_DATA_JOURNAL, + bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal, journal_seq_pin(j, seq)->devs); seq++; diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index dd2011e295cc..2f3be487ef65 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -247,11 +247,15 @@ int bch2_migrate_write_init(struct bch_fs *c, struct migrate_write *m, m->op.target = data_opts.target, m->op.write_point = wp; - if (m->data_opts.btree_insert_flags & BTREE_INSERT_USE_RESERVE) + if (m->data_opts.btree_insert_flags & BTREE_INSERT_USE_RESERVE) { 
m->op.alloc_reserve = RESERVE_MOVINGGC; + m->op.flags |= BCH_WRITE_ALLOC_NOWAIT; + } else { + /* XXX: this should probably be passed in */ + m->op.flags |= BCH_WRITE_ONLY_SPECIFIED_DEVS; + } - m->op.flags |= BCH_WRITE_ONLY_SPECIFIED_DEVS| - BCH_WRITE_PAGES_STABLE| + m->op.flags |= BCH_WRITE_PAGES_STABLE| BCH_WRITE_PAGES_OWNED| BCH_WRITE_DATA_ENCODED| BCH_WRITE_FROM_INTERNAL; @@ -316,12 +320,12 @@ static void move_free(struct closure *cl) { struct moving_io *io = container_of(cl, struct moving_io, cl); struct moving_context *ctxt = io->write.ctxt; + struct bvec_iter_all iter; struct bio_vec *bv; - unsigned i; bch2_disk_reservation_put(io->write.op.c, &io->write.op.res); - bio_for_each_segment_all(bv, &io->write.op.wbio.bio, i) + bio_for_each_segment_all(bv, &io->write.op.wbio.bio, iter) if (bv->bv_page) __free_page(bv->bv_page); @@ -517,7 +521,7 @@ static int __bch2_move_data(struct bch_fs *c, bkey_on_stack_init(&sk); bch2_trans_init(&trans, c, 0, 0); - stats->data_type = BCH_DATA_USER; + stats->data_type = BCH_DATA_user; stats->btree_id = btree_id; stats->pos = POS_MIN; @@ -642,7 +646,7 @@ int bch2_move_data(struct bch_fs *c, INIT_LIST_HEAD(&ctxt.reads); init_waitqueue_head(&ctxt.wait); - stats->data_type = BCH_DATA_USER; + stats->data_type = BCH_DATA_user; ret = __bch2_move_data(c, &ctxt, rate, wp, start, end, pred, arg, stats, BTREE_ID_EXTENTS) ?: @@ -677,7 +681,7 @@ static int bch2_move_btree(struct bch_fs *c, bch2_trans_init(&trans, c, 0, 0); - stats->data_type = BCH_DATA_BTREE; + stats->data_type = BCH_DATA_btree; for (id = 0; id < BTREE_ID_NR; id++) { stats->btree_id = id; @@ -773,7 +777,7 @@ int bch2_data_job(struct bch_fs *c, switch (op.op) { case BCH_DATA_OP_REREPLICATE: - stats->data_type = BCH_DATA_JOURNAL; + stats->data_type = BCH_DATA_journal; ret = bch2_journal_flush_device_pins(&c->journal, -1); ret = bch2_move_btree(c, rereplicate_pred, c, stats) ?: ret; @@ -794,7 +798,7 @@ int bch2_data_job(struct bch_fs *c, if (op.migrate.dev >= c->sb.nr_devices) 
return -EINVAL; - stats->data_type = BCH_DATA_JOURNAL; + stats->data_type = BCH_DATA_journal; ret = bch2_journal_flush_device_pins(&c->journal, op.migrate.dev); ret = bch2_move_btree(c, migrate_pred, &op, stats) ?: ret; diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c index 0a87cd7405dd..55aa463f992f 100644 --- a/fs/bcachefs/movinggc.c +++ b/fs/bcachefs/movinggc.c @@ -12,6 +12,7 @@ #include "buckets.h" #include "clock.h" #include "disk_groups.h" +#include "error.h" #include "extents.h" #include "eytzinger.h" #include "io.h" @@ -43,13 +44,6 @@ #define COPYGC_BUCKETS_PER_ITER(ca) \ ((ca)->free[RESERVE_MOVINGGC].size / 2) -/* - * Max sectors to move per iteration: Have to take into account internal - * fragmentation from the multiple write points for each generation: - */ -#define COPYGC_SECTORS_PER_ITER(ca) \ - ((ca)->mi.bucket_size * COPYGC_BUCKETS_PER_ITER(ca)) - static inline int sectors_used_cmp(copygc_heap *heap, struct copygc_heap_entry l, struct copygc_heap_entry r) @@ -62,18 +56,22 @@ static int bucket_offset_cmp(const void *_l, const void *_r, size_t size) const struct copygc_heap_entry *l = _l; const struct copygc_heap_entry *r = _r; - return cmp_int(l->offset, r->offset); + return cmp_int(l->dev, r->dev) ?: + cmp_int(l->offset, r->offset); } -static bool __copygc_pred(struct bch_dev *ca, - struct bkey_s_c k) +static int __copygc_pred(struct bch_fs *c, struct bkey_s_c k) { - copygc_heap *h = &ca->copygc_heap; - const struct bch_extent_ptr *ptr = - bch2_bkey_has_device(k, ca->dev_idx); - - if (ptr) { - struct copygc_heap_entry search = { .offset = ptr->offset }; + copygc_heap *h = &c->copygc_heap; + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const struct bch_extent_ptr *ptr; + + bkey_for_each_ptr(ptrs, ptr) { + struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); + struct copygc_heap_entry search = { + .dev = ptr->dev, + .offset = ptr->offset + }; ssize_t i = eytzinger0_find_le(h->data, h->used, sizeof(h->data[0]), @@ -89,12 +87,13 @@ 
static bool __copygc_pred(struct bch_dev *ca, BUG_ON(i != j); #endif - return (i >= 0 && - ptr->offset < h->data[i].offset + ca->mi.bucket_size && - ptr->gen == h->data[i].gen); + if (i >= 0 && + ptr->offset < h->data[i].offset + ca->mi.bucket_size && + ptr->gen == h->data[i].gen) + return ptr->dev; } - return false; + return -1; } static enum data_cmd copygc_pred(struct bch_fs *c, void *arg, @@ -102,14 +101,13 @@ static enum data_cmd copygc_pred(struct bch_fs *c, void *arg, struct bch_io_opts *io_opts, struct data_opts *data_opts) { - struct bch_dev *ca = arg; - - if (!__copygc_pred(ca, k)) + int dev_idx = __copygc_pred(c, k); + if (dev_idx < 0) return DATA_SKIP; - data_opts->target = dev_to_target(ca->dev_idx); + data_opts->target = io_opts->background_target; data_opts->btree_insert_flags = BTREE_INSERT_USE_RESERVE; - data_opts->rewrite_dev = ca->dev_idx; + data_opts->rewrite_dev = dev_idx; return DATA_REWRITE; } @@ -125,20 +123,21 @@ static bool have_copygc_reserve(struct bch_dev *ca) return ret; } -static void bch2_copygc(struct bch_fs *c, struct bch_dev *ca) +static int bch2_copygc(struct bch_fs *c) { - copygc_heap *h = &ca->copygc_heap; + copygc_heap *h = &c->copygc_heap; struct copygc_heap_entry e, *i; struct bucket_array *buckets; struct bch_move_stats move_stats; u64 sectors_to_move = 0, sectors_not_moved = 0; + u64 sectors_reserved = 0; u64 buckets_to_move, buckets_not_moved = 0; - size_t b; + struct bch_dev *ca; + unsigned dev_idx; + size_t b, heap_size = 0; int ret; memset(&move_stats, 0, sizeof(move_stats)); - closure_wait_event(&c->freelist_wait, have_copygc_reserve(ca)); - /* * Find buckets with lowest sector counts, skipping completely * empty buckets, by building a maxheap sorted by sector count, @@ -147,38 +146,57 @@ static void bch2_copygc(struct bch_fs *c, struct bch_dev *ca) */ h->used = 0; - /* - * We need bucket marks to be up to date - gc can't be recalculating - * them: - */ - down_read(&c->gc_lock); - down_read(&ca->bucket_lock); - 
buckets = bucket_array(ca); - - for (b = buckets->first_bucket; b < buckets->nbuckets; b++) { - struct bucket_mark m = READ_ONCE(buckets->b[b].mark); - struct copygc_heap_entry e; - - if (m.owned_by_allocator || - m.data_type != BCH_DATA_USER || - !bucket_sectors_used(m) || - bucket_sectors_used(m) >= ca->mi.bucket_size) - continue; + for_each_rw_member(ca, c, dev_idx) + heap_size += ca->mi.nbuckets >> 7; - e = (struct copygc_heap_entry) { - .gen = m.gen, - .sectors = bucket_sectors_used(m), - .offset = bucket_to_sector(ca, b), - }; - heap_add_or_replace(h, e, -sectors_used_cmp, NULL); + if (h->size < heap_size) { + free_heap(&c->copygc_heap); + if (!init_heap(&c->copygc_heap, heap_size, GFP_KERNEL)) { + bch_err(c, "error allocating copygc heap"); + return 0; + } + } + + for_each_rw_member(ca, c, dev_idx) { + closure_wait_event(&c->freelist_wait, have_copygc_reserve(ca)); + + spin_lock(&ca->fs->freelist_lock); + sectors_reserved += fifo_used(&ca->free[RESERVE_MOVINGGC]) * ca->mi.bucket_size; + spin_unlock(&ca->fs->freelist_lock); + + down_read(&ca->bucket_lock); + buckets = bucket_array(ca); + + for (b = buckets->first_bucket; b < buckets->nbuckets; b++) { + struct bucket_mark m = READ_ONCE(buckets->b[b].mark); + struct copygc_heap_entry e; + + if (m.owned_by_allocator || + m.data_type != BCH_DATA_user || + !bucket_sectors_used(m) || + bucket_sectors_used(m) >= ca->mi.bucket_size) + continue; + + e = (struct copygc_heap_entry) { + .dev = dev_idx, + .gen = m.gen, + .sectors = bucket_sectors_used(m), + .offset = bucket_to_sector(ca, b), + }; + heap_add_or_replace(h, e, -sectors_used_cmp, NULL); + } + up_read(&ca->bucket_lock); + } + + if (!sectors_reserved) { + bch2_fs_fatal_error(c, "stuck, ran out of copygc reserve!"); + return -1; } - up_read(&ca->bucket_lock); - up_read(&c->gc_lock); for (i = h->data; i < h->data + h->used; i++) sectors_to_move += i->sectors; - while (sectors_to_move > COPYGC_SECTORS_PER_ITER(ca)) { + while (sectors_to_move > sectors_reserved) { 
BUG_ON(!heap_pop(h, e, -sectors_used_cmp, NULL)); sectors_to_move -= e.sectors; } @@ -186,30 +204,39 @@ static void bch2_copygc(struct bch_fs *c, struct bch_dev *ca) buckets_to_move = h->used; if (!buckets_to_move) - return; + return 0; eytzinger0_sort(h->data, h->used, sizeof(h->data[0]), bucket_offset_cmp, NULL); - ret = bch2_move_data(c, &ca->copygc_pd.rate, - writepoint_ptr(&ca->copygc_write_point), + ret = bch2_move_data(c, &c->copygc_pd.rate, + writepoint_ptr(&c->copygc_write_point), POS_MIN, POS_MAX, - copygc_pred, ca, + copygc_pred, NULL, &move_stats); - down_read(&ca->bucket_lock); - buckets = bucket_array(ca); - for (i = h->data; i < h->data + h->used; i++) { - size_t b = sector_to_bucket(ca, i->offset); - struct bucket_mark m = READ_ONCE(buckets->b[b].mark); + for_each_rw_member(ca, c, dev_idx) { + down_read(&ca->bucket_lock); + buckets = bucket_array(ca); + for (i = h->data; i < h->data + h->used; i++) { + struct bucket_mark m; + size_t b; + + if (i->dev != dev_idx) + continue; - if (i->gen == m.gen && bucket_sectors_used(m)) { - sectors_not_moved += bucket_sectors_used(m); - buckets_not_moved++; + b = sector_to_bucket(ca, i->offset); + m = READ_ONCE(buckets->b[b].mark); + + if (i->gen == m.gen && + bucket_sectors_used(m)) { + sectors_not_moved += bucket_sectors_used(m); + buckets_not_moved++; + } } + up_read(&ca->bucket_lock); } - up_read(&ca->bucket_lock); if (sectors_not_moved && !ret) bch_warn_ratelimited(c, @@ -220,9 +247,10 @@ static void bch2_copygc(struct bch_fs *c, struct bch_dev *ca) atomic64_read(&move_stats.keys_raced), atomic64_read(&move_stats.sectors_raced)); - trace_copygc(ca, + trace_copygc(c, atomic64_read(&move_stats.sectors_moved), sectors_not_moved, buckets_to_move, buckets_not_moved); + return 0; } /* @@ -239,20 +267,27 @@ static void bch2_copygc(struct bch_fs *c, struct bch_dev *ca) * often and continually reduce the amount of fragmented space as the device * fills up. So, we increase the threshold by half the current free space. 
*/ -unsigned long bch2_copygc_wait_amount(struct bch_dev *ca) +unsigned long bch2_copygc_wait_amount(struct bch_fs *c) { - struct bch_fs *c = ca->fs; - struct bch_dev_usage usage = bch2_dev_usage_read(c, ca); - u64 fragmented_allowed = ca->copygc_threshold + - ((__dev_buckets_available(ca, usage) * ca->mi.bucket_size) >> 1); + struct bch_dev *ca; + unsigned dev_idx; + u64 fragmented_allowed = c->copygc_threshold; + u64 fragmented = 0; + + for_each_rw_member(ca, c, dev_idx) { + struct bch_dev_usage usage = bch2_dev_usage_read(ca); - return max_t(s64, 0, fragmented_allowed - usage.sectors_fragmented); + fragmented_allowed += ((__dev_buckets_available(ca, usage) * + ca->mi.bucket_size) >> 1); + fragmented += usage.sectors_fragmented; + } + + return max_t(s64, 0, fragmented_allowed - fragmented); } static int bch2_copygc_thread(void *arg) { - struct bch_dev *ca = arg; - struct bch_fs *c = ca->fs; + struct bch_fs *c = arg; struct io_clock *clock = &c->io_clock[WRITE]; unsigned long last, wait; @@ -263,7 +298,7 @@ static int bch2_copygc_thread(void *arg) break; last = atomic_long_read(&clock->now); - wait = bch2_copygc_wait_amount(ca); + wait = bch2_copygc_wait_amount(c); if (wait > clock->max_slop) { bch2_kthread_io_clock_wait(clock, last + wait, @@ -271,29 +306,30 @@ static int bch2_copygc_thread(void *arg) continue; } - bch2_copygc(c, ca); + if (bch2_copygc(c)) + break; } return 0; } -void bch2_copygc_stop(struct bch_dev *ca) +void bch2_copygc_stop(struct bch_fs *c) { - ca->copygc_pd.rate.rate = UINT_MAX; - bch2_ratelimit_reset(&ca->copygc_pd.rate); + c->copygc_pd.rate.rate = UINT_MAX; + bch2_ratelimit_reset(&c->copygc_pd.rate); - if (ca->copygc_thread) { - kthread_stop(ca->copygc_thread); - put_task_struct(ca->copygc_thread); + if (c->copygc_thread) { + kthread_stop(c->copygc_thread); + put_task_struct(c->copygc_thread); } - ca->copygc_thread = NULL; + c->copygc_thread = NULL; } -int bch2_copygc_start(struct bch_fs *c, struct bch_dev *ca) +int 
bch2_copygc_start(struct bch_fs *c) { struct task_struct *t; - if (ca->copygc_thread) + if (c->copygc_thread) return 0; if (c->opts.nochanges) @@ -302,21 +338,20 @@ int bch2_copygc_start(struct bch_fs *c, struct bch_dev *ca) if (bch2_fs_init_fault("copygc_start")) return -ENOMEM; - t = kthread_create(bch2_copygc_thread, ca, - "bch_copygc[%s]", ca->name); + t = kthread_create(bch2_copygc_thread, c, "bch_copygc"); if (IS_ERR(t)) return PTR_ERR(t); get_task_struct(t); - ca->copygc_thread = t; - wake_up_process(ca->copygc_thread); + c->copygc_thread = t; + wake_up_process(c->copygc_thread); return 0; } -void bch2_dev_copygc_init(struct bch_dev *ca) +void bch2_fs_copygc_init(struct bch_fs *c) { - bch2_pd_controller_init(&ca->copygc_pd); - ca->copygc_pd.d_term = 0; + bch2_pd_controller_init(&c->copygc_pd); + c->copygc_pd.d_term = 0; } diff --git a/fs/bcachefs/movinggc.h b/fs/bcachefs/movinggc.h index dcd479632cf1..922738247d03 100644 --- a/fs/bcachefs/movinggc.h +++ b/fs/bcachefs/movinggc.h @@ -2,8 +2,8 @@ #ifndef _BCACHEFS_MOVINGGC_H #define _BCACHEFS_MOVINGGC_H -void bch2_copygc_stop(struct bch_dev *); -int bch2_copygc_start(struct bch_fs *, struct bch_dev *); -void bch2_dev_copygc_init(struct bch_dev *); +void bch2_copygc_stop(struct bch_fs *); +int bch2_copygc_start(struct bch_fs *); +void bch2_fs_copygc_init(struct bch_fs *); #endif /* _BCACHEFS_MOVINGGC_H */ diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c index 94d6c044a27d..afe25cd26c06 100644 --- a/fs/bcachefs/opts.c +++ b/fs/bcachefs/opts.c @@ -45,12 +45,9 @@ const char * const bch2_str_hash_types[] = { }; const char * const bch2_data_types[] = { - "none", - "sb", - "journal", - "btree", - "data", - "cached", +#define x(t, n) #t, + BCH_DATA_TYPES() +#undef x NULL }; diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h index 3b051e7a8f1d..d6a832a38b20 100644 --- a/fs/bcachefs/opts.h +++ b/fs/bcachefs/opts.h @@ -260,6 +260,11 @@ enum opt_type { OPT_BOOL(), \ NO_SB_OPT, false, \ NULL, "Don't replay the 
journal") \ + x(rebuild_replicas, u8, \ + OPT_MOUNT, \ + OPT_BOOL(), \ + NO_SB_OPT, false, \ + NULL, "Rebuild the superblock replicas section") \ x(keep_journal, u8, \ OPT_MOUNT, \ OPT_BOOL(), \ diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c index e15a2b1dc5d0..56a1f761271f 100644 --- a/fs/bcachefs/rebalance.c +++ b/fs/bcachefs/rebalance.c @@ -249,45 +249,42 @@ static int bch2_rebalance_thread(void *arg) return 0; } -ssize_t bch2_rebalance_work_show(struct bch_fs *c, char *buf) +void bch2_rebalance_work_to_text(struct printbuf *out, struct bch_fs *c) { - struct printbuf out = _PBUF(buf, PAGE_SIZE); struct bch_fs_rebalance *r = &c->rebalance; struct rebalance_work w = rebalance_work(c); char h1[21], h2[21]; bch2_hprint(&PBUF(h1), w.dev_most_full_work << 9); bch2_hprint(&PBUF(h2), w.dev_most_full_capacity << 9); - pr_buf(&out, "fullest_dev (%i):\t%s/%s\n", + pr_buf(out, "fullest_dev (%i):\t%s/%s\n", w.dev_most_full_idx, h1, h2); bch2_hprint(&PBUF(h1), w.total_work << 9); bch2_hprint(&PBUF(h2), c->capacity << 9); - pr_buf(&out, "total work:\t\t%s/%s\n", h1, h2); + pr_buf(out, "total work:\t\t%s/%s\n", h1, h2); - pr_buf(&out, "rate:\t\t\t%u\n", r->pd.rate.rate); + pr_buf(out, "rate:\t\t\t%u\n", r->pd.rate.rate); switch (r->state) { case REBALANCE_WAITING: - pr_buf(&out, "waiting\n"); + pr_buf(out, "waiting\n"); break; case REBALANCE_THROTTLED: bch2_hprint(&PBUF(h1), (r->throttled_until_iotime - atomic_long_read(&c->io_clock[WRITE].now)) << 9); - pr_buf(&out, "throttled for %lu sec or %s io\n", + pr_buf(out, "throttled for %lu sec or %s io\n", (r->throttled_until_cputime - jiffies) / HZ, h1); break; case REBALANCE_RUNNING: - pr_buf(&out, "running\n"); - pr_buf(&out, "pos %llu:%llu\n", + pr_buf(out, "running\n"); + pr_buf(out, "pos %llu:%llu\n", r->move_stats.pos.inode, r->move_stats.pos.offset); break; } - - return out.pos - buf; } void bch2_rebalance_stop(struct bch_fs *c) diff --git a/fs/bcachefs/rebalance.h b/fs/bcachefs/rebalance.h index 
99e2a1fb6084..7ade0bb81cce 100644 --- a/fs/bcachefs/rebalance.h +++ b/fs/bcachefs/rebalance.h @@ -19,7 +19,7 @@ void bch2_rebalance_add_key(struct bch_fs *, struct bkey_s_c, struct bch_io_opts *); void bch2_rebalance_add_work(struct bch_fs *, u64); -ssize_t bch2_rebalance_work_show(struct bch_fs *, char *); +void bch2_rebalance_work_to_text(struct printbuf *, struct bch_fs *); void bch2_rebalance_stop(struct bch_fs *); int bch2_rebalance_start(struct bch_fs *); diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 41b864dcdc39..28972f30e198 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -442,11 +442,18 @@ retry: * regular keys */ __bch2_btree_iter_set_pos(split_iter, split->k.p, false); - bch2_trans_update(&trans, split_iter, split, !remark - ? BTREE_TRIGGER_NORUN - : BTREE_TRIGGER_NOOVERWRITES); + bch2_trans_update(&trans, split_iter, split, + BTREE_TRIGGER_NORUN); bch2_btree_iter_set_pos(iter, split->k.p); + + if (remark) { + ret = bch2_trans_mark_key(&trans, bkey_i_to_s_c(split), + 0, split->k.size, + BTREE_TRIGGER_INSERT); + if (ret) + goto err; + } } while (bkey_cmp(iter->pos, k->k.p) < 0); if (remark) { @@ -967,7 +974,8 @@ int bch2_fs_recovery(struct bch_fs *c) bch_info(c, "recovering from clean shutdown, journal seq %llu", le64_to_cpu(clean->journal_seq)); - if (!c->replicas.entries) { + if (!c->replicas.entries || + c->opts.rebuild_replicas) { bch_info(c, "building replicas info"); set_bit(BCH_FS_REBUILD_REPLICAS, &c->flags); } diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c index 67a7128fd9af..6b6506c68609 100644 --- a/fs/bcachefs/replicas.c +++ b/fs/bcachefs/replicas.c @@ -113,16 +113,16 @@ void bch2_bkey_to_replicas(struct bch_replicas_entry *e, switch (k.k->type) { case KEY_TYPE_btree_ptr: case KEY_TYPE_btree_ptr_v2: - e->data_type = BCH_DATA_BTREE; + e->data_type = BCH_DATA_btree; extent_to_replicas(k, e); break; case KEY_TYPE_extent: case KEY_TYPE_reflink_v: - e->data_type = BCH_DATA_USER; + e->data_type 
= BCH_DATA_user; extent_to_replicas(k, e); break; case KEY_TYPE_stripe: - e->data_type = BCH_DATA_USER; + e->data_type = BCH_DATA_user; stripe_to_replicas(k, e); break; } @@ -137,7 +137,7 @@ void bch2_devlist_to_replicas(struct bch_replicas_entry *e, unsigned i; BUG_ON(!data_type || - data_type == BCH_DATA_SB || + data_type == BCH_DATA_sb || data_type >= BCH_DATA_NR); e->data_type = data_type; @@ -213,29 +213,20 @@ static bool __replicas_has_entry(struct bch_replicas_cpu *r, return __replicas_entry_idx(r, search) >= 0; } -static bool bch2_replicas_marked_locked(struct bch_fs *c, - struct bch_replicas_entry *search, - bool check_gc_replicas) +bool bch2_replicas_marked(struct bch_fs *c, + struct bch_replicas_entry *search) { + bool marked; + if (!search->nr_devs) return true; verify_replicas_entry(search); - return __replicas_has_entry(&c->replicas, search) && - (!check_gc_replicas || - likely((!c->replicas_gc.entries)) || - __replicas_has_entry(&c->replicas_gc, search)); -} - -bool bch2_replicas_marked(struct bch_fs *c, - struct bch_replicas_entry *search, - bool check_gc_replicas) -{ - bool marked; - percpu_down_read(&c->mark_lock); - marked = bch2_replicas_marked_locked(c, search, check_gc_replicas); + marked = __replicas_has_entry(&c->replicas, search) && + (likely((!c->replicas_gc.entries)) || + __replicas_has_entry(&c->replicas_gc, search)); percpu_up_read(&c->mark_lock); return marked; @@ -423,66 +414,50 @@ err: goto out; } -int bch2_mark_replicas(struct bch_fs *c, - struct bch_replicas_entry *r) +static int __bch2_mark_replicas(struct bch_fs *c, + struct bch_replicas_entry *r, + bool check) { - return likely(bch2_replicas_marked(c, r, true)) - ? 0 + return likely(bch2_replicas_marked(c, r)) ? 0 + : check ? 
-1 : bch2_mark_replicas_slowpath(c, r); } -bool bch2_bkey_replicas_marked_locked(struct bch_fs *c, - struct bkey_s_c k, - bool check_gc_replicas) +int bch2_mark_replicas(struct bch_fs *c, struct bch_replicas_entry *r) +{ + return __bch2_mark_replicas(c, r, false); +} + +static int __bch2_mark_bkey_replicas(struct bch_fs *c, struct bkey_s_c k, + bool check) { struct bch_replicas_padded search; struct bch_devs_list cached = bch2_bkey_cached_devs(k); unsigned i; + int ret; for (i = 0; i < cached.nr; i++) { bch2_replicas_entry_cached(&search.e, cached.devs[i]); - if (!bch2_replicas_marked_locked(c, &search.e, - check_gc_replicas)) - return false; + ret = __bch2_mark_replicas(c, &search.e, check); + if (ret) + return ret; } bch2_bkey_to_replicas(&search.e, k); - return bch2_replicas_marked_locked(c, &search.e, check_gc_replicas); + return __bch2_mark_replicas(c, &search.e, check); } bool bch2_bkey_replicas_marked(struct bch_fs *c, - struct bkey_s_c k, - bool check_gc_replicas) + struct bkey_s_c k) { - bool marked; - - percpu_down_read(&c->mark_lock); - marked = bch2_bkey_replicas_marked_locked(c, k, check_gc_replicas); - percpu_up_read(&c->mark_lock); - - return marked; + return __bch2_mark_bkey_replicas(c, k, true) == 0; } int bch2_mark_bkey_replicas(struct bch_fs *c, struct bkey_s_c k) { - struct bch_replicas_padded search; - struct bch_devs_list cached = bch2_bkey_cached_devs(k); - unsigned i; - int ret; - - for (i = 0; i < cached.nr; i++) { - bch2_replicas_entry_cached(&search.e, cached.devs[i]); - - ret = bch2_mark_replicas(c, &search.e); - if (ret) - return ret; - } - - bch2_bkey_to_replicas(&search.e, k); - - return bch2_mark_replicas(c, &search.e); + return __bch2_mark_bkey_replicas(c, k, false); } int bch2_replicas_gc_end(struct bch_fs *c, int ret) @@ -611,7 +586,7 @@ retry: struct bch_replicas_entry *e = cpu_replicas_entry(&c->replicas, i); - if (e->data_type == BCH_DATA_JOURNAL || + if (e->data_type == BCH_DATA_journal || c->usage_base->replicas[i] || 
percpu_u64_get(&c->usage[0]->replicas[i]) || percpu_u64_get(&c->usage[1]->replicas[i])) @@ -1037,13 +1012,13 @@ static bool have_enough_devs(struct replicas_status s, bool bch2_have_enough_devs(struct replicas_status s, unsigned flags) { - return (have_enough_devs(s, BCH_DATA_JOURNAL, + return (have_enough_devs(s, BCH_DATA_journal, flags & BCH_FORCE_IF_METADATA_DEGRADED, flags & BCH_FORCE_IF_METADATA_LOST) && - have_enough_devs(s, BCH_DATA_BTREE, + have_enough_devs(s, BCH_DATA_btree, flags & BCH_FORCE_IF_METADATA_DEGRADED, flags & BCH_FORCE_IF_METADATA_LOST) && - have_enough_devs(s, BCH_DATA_USER, + have_enough_devs(s, BCH_DATA_user, flags & BCH_FORCE_IF_DATA_DEGRADED, flags & BCH_FORCE_IF_DATA_LOST)); } @@ -1053,9 +1028,9 @@ int bch2_replicas_online(struct bch_fs *c, bool meta) struct replicas_status s = bch2_replicas_status(c); return (meta - ? min(s.replicas[BCH_DATA_JOURNAL].redundancy, - s.replicas[BCH_DATA_BTREE].redundancy) - : s.replicas[BCH_DATA_USER].redundancy) + 1; + ? min(s.replicas[BCH_DATA_journal].redundancy, + s.replicas[BCH_DATA_btree].redundancy) + : s.replicas[BCH_DATA_user].redundancy) + 1; } unsigned bch2_dev_has_data(struct bch_fs *c, struct bch_dev *ca) diff --git a/fs/bcachefs/replicas.h b/fs/bcachefs/replicas.h index 8527d82841bb..8b95164fbb56 100644 --- a/fs/bcachefs/replicas.h +++ b/fs/bcachefs/replicas.h @@ -21,22 +21,18 @@ int bch2_replicas_entry_idx(struct bch_fs *, void bch2_devlist_to_replicas(struct bch_replicas_entry *, enum bch_data_type, struct bch_devs_list); -bool bch2_replicas_marked(struct bch_fs *, - struct bch_replicas_entry *, bool); +bool bch2_replicas_marked(struct bch_fs *, struct bch_replicas_entry *); int bch2_mark_replicas(struct bch_fs *, struct bch_replicas_entry *); -bool bch2_bkey_replicas_marked_locked(struct bch_fs *, - struct bkey_s_c, bool); void bch2_bkey_to_replicas(struct bch_replicas_entry *, struct bkey_s_c); -bool bch2_bkey_replicas_marked(struct bch_fs *, - struct bkey_s_c, bool); +bool 
bch2_bkey_replicas_marked(struct bch_fs *, struct bkey_s_c); int bch2_mark_bkey_replicas(struct bch_fs *, struct bkey_s_c); static inline void bch2_replicas_entry_cached(struct bch_replicas_entry *e, unsigned dev) { - e->data_type = BCH_DATA_CACHED; + e->data_type = BCH_DATA_cached; e->nr_devs = 1; e->nr_required = 1; e->devs[0] = dev; diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index f2be64c869df..cee6cc938734 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -636,7 +636,8 @@ static void write_super_endio(struct bio *bio) /* XXX: return errors directly */ - if (bch2_dev_io_err_on(bio->bi_status, ca, "superblock write")) + if (bch2_dev_io_err_on(bio->bi_status, ca, "superblock write: %s", + bch2_blk_status_to_str(bio->bi_status))) ca->sb_write_error = 1; closure_put(&ca->fs->sb_write); @@ -656,7 +657,7 @@ static void read_back_super(struct bch_fs *c, struct bch_dev *ca) bio_set_op_attrs(bio, REQ_OP_READ, REQ_SYNC|REQ_META); bch2_bio_map(bio, ca->sb_read_scratch, PAGE_SIZE); - this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_SB], + this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_sb], bio_sectors(bio)); percpu_ref_get(&ca->io_ref); @@ -684,7 +685,7 @@ static void write_one_super(struct bch_fs *c, struct bch_dev *ca, unsigned idx) roundup((size_t) vstruct_bytes(sb), bdev_logical_block_size(ca->disk_sb.bdev))); - this_cpu_add(ca->io_done->sectors[WRITE][BCH_DATA_SB], + this_cpu_add(ca->io_done->sectors[WRITE][BCH_DATA_sb], bio_sectors(bio)); percpu_ref_get(&ca->io_ref); diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 0cdf285e4ffd..1d9a6bfa8c13 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -172,7 +172,7 @@ int bch2_congested(void *data, int bdi_bits) unsigned target = READ_ONCE(c->opts.foreground_target); const struct bch_devs_mask *devs = target ? 
bch2_target_to_mask(c, target) - : &c->rw_devs[BCH_DATA_USER]; + : &c->rw_devs[BCH_DATA_user]; for_each_member_device_rcu(ca, c, i, devs) { bdi = ca->disk_sb.bdev->bd_bdi; @@ -213,10 +213,7 @@ static void __bch2_fs_read_only(struct bch_fs *c) int ret; bch2_rebalance_stop(c); - - for_each_member_device(ca, c, i) - bch2_copygc_stop(ca); - + bch2_copygc_stop(c); bch2_gc_thread_stop(c); /* @@ -396,8 +393,6 @@ bool bch2_fs_emergency_read_only(struct bch_fs *c) static int bch2_fs_read_write_late(struct bch_fs *c) { - struct bch_dev *ca; - unsigned i; int ret; ret = bch2_gc_thread_start(c); @@ -406,13 +401,10 @@ static int bch2_fs_read_write_late(struct bch_fs *c) return ret; } - for_each_rw_member(ca, c, i) { - ret = bch2_copygc_start(c, ca); - if (ret) { - bch_err(c, "error starting copygc threads"); - percpu_ref_put(&ca->io_ref); - return ret; - } + ret = bch2_copygc_start(c); + if (ret) { + bch_err(c, "error starting copygc thread"); + return ret; } ret = bch2_rebalance_start(c); @@ -535,6 +527,7 @@ static void bch2_fs_free(struct bch_fs *c) kfree(c->replicas_gc.entries); kfree(rcu_dereference_protected(c->disk_groups, 1)); kfree(c->journal_seq_blacklist_table); + free_heap(&c->copygc_heap); if (c->journal_reclaim_wq) destroy_workqueue(c->journal_reclaim_wq); @@ -684,6 +677,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) for (i = 0; i < BCH_TIME_STAT_NR; i++) bch2_time_stats_init(&c->times[i]); + bch2_fs_copygc_init(c); bch2_fs_btree_key_cache_init_early(&c->btree_key_cache); bch2_fs_allocator_background_init(c); bch2_fs_allocator_foreground_init(c); @@ -708,9 +702,12 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) INIT_LIST_HEAD(&c->fsck_errors); mutex_init(&c->fsck_error_lock); - INIT_LIST_HEAD(&c->ec_new_stripe_list); - mutex_init(&c->ec_new_stripe_lock); - mutex_init(&c->ec_stripe_create_lock); + INIT_LIST_HEAD(&c->ec_stripe_head_list); + mutex_init(&c->ec_stripe_head_lock); + + 
INIT_LIST_HEAD(&c->ec_stripe_new_list); + mutex_init(&c->ec_stripe_new_lock); + spin_lock_init(&c->ec_stripes_heap_lock); seqcount_init(&c->gc_pos_lock); @@ -1108,10 +1105,6 @@ static struct bch_dev *__bch2_dev_alloc(struct bch_fs *c, init_rwsem(&ca->bucket_lock); - writepoint_init(&ca->copygc_write_point, BCH_DATA_USER); - - bch2_dev_copygc_init(ca); - INIT_WORK(&ca->io_error_work, bch2_io_error_work); bch2_time_stats_init(&ca->io_latency[READ]); @@ -1241,7 +1234,7 @@ static int bch2_dev_attach_bdev(struct bch_fs *c, struct bch_sb_handle *sb) return ret; if (test_bit(BCH_FS_ALLOC_READ_DONE, &c->flags) && - !percpu_u64_get(&ca->usage[0]->buckets[BCH_DATA_SB])) { + !percpu_u64_get(&ca->usage[0]->buckets[BCH_DATA_sb])) { mutex_lock(&c->sb_lock); bch2_mark_dev_superblock(ca->fs, ca, 0); mutex_unlock(&c->sb_lock); @@ -1352,7 +1345,11 @@ static bool bch2_fs_may_start(struct bch_fs *c) static void __bch2_dev_read_only(struct bch_fs *c, struct bch_dev *ca) { - bch2_copygc_stop(ca); + /* + * Device going read only means the copygc reserve get smaller, so we + * don't want that happening while copygc is in progress: + */ + bch2_copygc_stop(c); /* * The allocator thread itself allocates btree nodes, so stop it first: @@ -1360,6 +1357,8 @@ static void __bch2_dev_read_only(struct bch_fs *c, struct bch_dev *ca) bch2_dev_allocator_stop(ca); bch2_dev_allocator_remove(c, ca); bch2_dev_journal_stop(&c->journal, ca); + + bch2_copygc_start(c); } static const char *__bch2_dev_read_write(struct bch_fs *c, struct bch_dev *ca) @@ -1374,9 +1373,6 @@ static const char *__bch2_dev_read_write(struct bch_fs *c, struct bch_dev *ca) if (bch2_dev_allocator_start(ca)) return "error starting allocator thread"; - if (bch2_copygc_start(c, ca)) - return "error starting copygc thread"; - return NULL; } diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index c169d282a1f9..0cb29f43d99d 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -75,7 +75,6 @@ do { \ #define sysfs_hprint(file, 
val) \ do { \ if (attr == &sysfs_ ## file) { \ - struct printbuf out = _PBUF(buf, PAGE_SIZE); \ bch2_hprint(&out, val); \ pr_buf(&out, "\n"); \ return out.pos - buf; \ @@ -168,6 +167,7 @@ read_attribute(btree_updates); read_attribute(dirty_btree_nodes); read_attribute(btree_key_cache); read_attribute(btree_transactions); +read_attribute(stripes_heap); read_attribute(internal_uuid); @@ -238,24 +238,22 @@ static size_t bch2_btree_cache_size(struct bch_fs *c) return ret; } -static ssize_t show_fs_alloc_debug(struct bch_fs *c, char *buf) +static int fs_alloc_debug_to_text(struct printbuf *out, struct bch_fs *c) { - struct printbuf out = _PBUF(buf, PAGE_SIZE); struct bch_fs_usage *fs_usage = bch2_fs_usage_read(c); if (!fs_usage) return -ENOMEM; - bch2_fs_usage_to_text(&out, c, fs_usage); + bch2_fs_usage_to_text(out, c, fs_usage); percpu_up_read(&c->mark_lock); kfree(fs_usage); - - return out.pos - buf; + return 0; } -static ssize_t bch2_compression_stats(struct bch_fs *c, char *buf) +static int bch2_compression_stats_to_text(struct printbuf *out, struct bch_fs *c) { struct btree_trans trans; struct btree_iter *iter; @@ -298,59 +296,26 @@ static ssize_t bch2_compression_stats(struct bch_fs *c, char *buf) if (ret) return ret; - return scnprintf(buf, PAGE_SIZE, - "uncompressed data:\n" - " nr extents: %llu\n" - " size (bytes): %llu\n" - "compressed data:\n" - " nr extents: %llu\n" - " compressed size (bytes): %llu\n" - " uncompressed size (bytes): %llu\n", - nr_uncompressed_extents, - uncompressed_sectors << 9, - nr_compressed_extents, - compressed_sectors_compressed << 9, - compressed_sectors_uncompressed << 9); -} - -static ssize_t bch2_new_stripes(struct bch_fs *c, char *buf) -{ - char *out = buf, *end = buf + PAGE_SIZE; - struct ec_stripe_head *h; - struct ec_stripe_new *s; - - mutex_lock(&c->ec_new_stripe_lock); - list_for_each_entry(h, &c->ec_new_stripe_list, list) { - out += scnprintf(out, end - out, - "target %u algo %u redundancy %u:\n", - h->target, h->algo, 
h->redundancy); - - if (h->s) - out += scnprintf(out, end - out, - "\tpending: blocks %u allocated %u\n", - h->s->blocks.nr, - bitmap_weight(h->s->blocks_allocated, - h->s->blocks.nr)); - - mutex_lock(&h->lock); - list_for_each_entry(s, &h->stripes, list) - out += scnprintf(out, end - out, - "\tin flight: blocks %u allocated %u pin %u\n", - s->blocks.nr, - bitmap_weight(s->blocks_allocated, - s->blocks.nr), - atomic_read(&s->pin)); - mutex_unlock(&h->lock); - - } - mutex_unlock(&c->ec_new_stripe_lock); - - return out - buf; + pr_buf(out, + "uncompressed data:\n" + " nr extents: %llu\n" + " size (bytes): %llu\n" + "compressed data:\n" + " nr extents: %llu\n" + " compressed size (bytes): %llu\n" + " uncompressed size (bytes): %llu\n", + nr_uncompressed_extents, + uncompressed_sectors << 9, + nr_compressed_extents, + compressed_sectors_compressed << 9, + compressed_sectors_uncompressed << 9); + return 0; } SHOW(bch2_fs) { struct bch_fs *c = container_of(kobj, struct bch_fs, kobj); + struct printbuf out = _PBUF(buf, PAGE_SIZE); sysfs_print(minor, c->minor); sysfs_printf(internal_uuid, "%pU", c->sb.uuid.b); @@ -378,9 +343,12 @@ SHOW(bch2_fs) sysfs_printf(rebalance_enabled, "%i", c->rebalance.enabled); sysfs_pd_controller_show(rebalance, &c->rebalance.pd); /* XXX */ + sysfs_pd_controller_show(copy_gc, &c->copygc_pd); - if (attr == &sysfs_rebalance_work) - return bch2_rebalance_work_show(c, buf); + if (attr == &sysfs_rebalance_work) { + bch2_rebalance_work_to_text(&out, c); + return out.pos - buf; + } sysfs_print(promote_whole_extents, c->promote_whole_extents); @@ -390,44 +358,61 @@ SHOW(bch2_fs) /* Debugging: */ if (attr == &sysfs_alloc_debug) - return show_fs_alloc_debug(c, buf); + return fs_alloc_debug_to_text(&out, c) ?: out.pos - buf; - if (attr == &sysfs_journal_debug) - return bch2_journal_print_debug(&c->journal, buf); + if (attr == &sysfs_journal_debug) { + bch2_journal_debug_to_text(&out, &c->journal); + return out.pos - buf; + } - if (attr == 
&sysfs_journal_pins) - return bch2_journal_print_pins(&c->journal, buf); + if (attr == &sysfs_journal_pins) { + bch2_journal_pins_to_text(&out, &c->journal); + return out.pos - buf; + } - if (attr == &sysfs_btree_updates) - return bch2_btree_updates_print(c, buf); + if (attr == &sysfs_btree_updates) { + bch2_btree_updates_to_text(&out, c); + return out.pos - buf; + } - if (attr == &sysfs_dirty_btree_nodes) - return bch2_dirty_btree_nodes_print(c, buf); + if (attr == &sysfs_dirty_btree_nodes) { + bch2_dirty_btree_nodes_to_text(&out, c); + return out.pos - buf; + } if (attr == &sysfs_btree_key_cache) { - struct printbuf out = _PBUF(buf, PAGE_SIZE); - bch2_btree_key_cache_to_text(&out, &c->btree_key_cache); return out.pos - buf; } if (attr == &sysfs_btree_transactions) { - struct printbuf out = _PBUF(buf, PAGE_SIZE); - bch2_btree_trans_to_text(&out, c); return out.pos - buf; } - if (attr == &sysfs_compression_stats) - return bch2_compression_stats(c, buf); + if (attr == &sysfs_stripes_heap) { + bch2_stripes_heap_to_text(&out, c); + return out.pos - buf; + } + + if (attr == &sysfs_compression_stats) { + bch2_compression_stats_to_text(&out, c); + return out.pos - buf; + } - if (attr == &sysfs_new_stripes) - return bch2_new_stripes(c, buf); + if (attr == &sysfs_new_stripes) { + bch2_new_stripes_to_text(&out, c); + return out.pos - buf; + } - if (attr == &sysfs_io_timers_read) - return bch2_io_timers_show(&c->io_clock[READ], buf); - if (attr == &sysfs_io_timers_write) - return bch2_io_timers_show(&c->io_clock[WRITE], buf); + if (attr == &sysfs_io_timers_read) { + bch2_io_timers_to_text(&out, &c->io_clock[READ]); + return out.pos - buf; + } + if (attr == &sysfs_io_timers_write) { + bch2_io_timers_to_text(&out, &c->io_clock[WRITE]); + return out.pos - buf; + } #define BCH_DEBUG_PARAM(name, description) sysfs_print(name, c->name); BCH_DEBUG_PARAMS() @@ -452,14 +437,11 @@ STORE(bch2_fs) } if (attr == &sysfs_copy_gc_enabled) { - struct bch_dev *ca; - unsigned i; ssize_t ret = 
strtoul_safe(buf, c->copy_gc_enabled) ?: (ssize_t) size; - for_each_member_device(ca, c, i) - if (ca->copygc_thread) - wake_up_process(ca->copygc_thread); + if (c->copygc_thread) + wake_up_process(c->copygc_thread); return ret; } @@ -474,6 +456,7 @@ STORE(bch2_fs) sysfs_strtoul(pd_controllers_update_seconds, c->pd_controllers_update_seconds); sysfs_pd_controller_store(rebalance, &c->rebalance.pd); + sysfs_pd_controller_store(copy_gc, &c->copygc_pd); sysfs_strtoul(promote_whole_extents, c->promote_whole_extents); @@ -583,6 +566,7 @@ struct attribute *bch2_fs_internal_files[] = { &sysfs_dirty_btree_nodes, &sysfs_btree_key_cache, &sysfs_btree_transactions, + &sysfs_stripes_heap, &sysfs_read_realloc_races, &sysfs_extent_migrate_done, @@ -598,6 +582,7 @@ struct attribute *bch2_fs_internal_files[] = { &sysfs_rebalance_enabled, &sysfs_rebalance_work, sysfs_pd_controller_files(rebalance), + sysfs_pd_controller_files(copy_gc), &sysfs_new_stripes, @@ -696,11 +681,13 @@ int bch2_opts_create_sysfs_files(struct kobject *kobj) SHOW(bch2_fs_time_stats) { struct bch_fs *c = container_of(kobj, struct bch_fs, time_stats); + struct printbuf out = _PBUF(buf, PAGE_SIZE); -#define x(name) \ - if (attr == &sysfs_time_stat_##name) \ - return bch2_time_stats_print(&c->times[BCH_TIME_##name],\ - buf, PAGE_SIZE); +#define x(name) \ + if (attr == &sysfs_time_stat_##name) { \ + bch2_time_stats_to_text(&out, &c->times[BCH_TIME_##name]);\ + return out.pos - buf; \ + } BCH_TIME_STATS() #undef x @@ -753,13 +740,13 @@ static int unsigned_cmp(const void *_l, const void *_r) return cmp_int(*l, *r); } -static ssize_t show_quantiles(struct bch_fs *c, struct bch_dev *ca, - char *buf, bucket_map_fn *fn, void *private) +static int quantiles_to_text(struct printbuf *out, + struct bch_fs *c, struct bch_dev *ca, + bucket_map_fn *fn, void *private) { size_t i, n; /* Compute 31 quantiles */ unsigned q[31], *p; - ssize_t ret = 0; down_read(&ca->bucket_lock); n = ca->mi.nbuckets; @@ -786,38 +773,33 @@ static 
ssize_t show_quantiles(struct bch_fs *c, struct bch_dev *ca, vfree(p); for (i = 0; i < ARRAY_SIZE(q); i++) - ret += scnprintf(buf + ret, PAGE_SIZE - ret, - "%u ", q[i]); - buf[ret - 1] = '\n'; - - return ret; + pr_buf(out, "%u ", q[i]); + pr_buf(out, "\n"); + return 0; } -static ssize_t show_reserve_stats(struct bch_dev *ca, char *buf) +static void reserve_stats_to_text(struct printbuf *out, struct bch_dev *ca) { - struct printbuf out = _PBUF(buf, PAGE_SIZE); enum alloc_reserve i; spin_lock(&ca->fs->freelist_lock); - pr_buf(&out, "free_inc:\t%zu\t%zu\n", + pr_buf(out, "free_inc:\t%zu\t%zu\n", fifo_used(&ca->free_inc), ca->free_inc.size); for (i = 0; i < RESERVE_NR; i++) - pr_buf(&out, "free[%u]:\t%zu\t%zu\n", i, + pr_buf(out, "free[%u]:\t%zu\t%zu\n", i, fifo_used(&ca->free[i]), ca->free[i].size); spin_unlock(&ca->fs->freelist_lock); - - return out.pos - buf; } -static ssize_t show_dev_alloc_debug(struct bch_dev *ca, char *buf) +static void dev_alloc_debug_to_text(struct printbuf *out, struct bch_dev *ca) { struct bch_fs *c = ca->fs; - struct bch_dev_usage stats = bch2_dev_usage_read(c, ca); + struct bch_dev_usage stats = bch2_dev_usage_read(ca); unsigned i, nr[BCH_DATA_NR]; memset(nr, 0, sizeof(nr)); @@ -825,7 +807,7 @@ static ssize_t show_dev_alloc_debug(struct bch_dev *ca, char *buf) for (i = 0; i < ARRAY_SIZE(c->open_buckets); i++) nr[c->open_buckets[i].type]++; - return scnprintf(buf, PAGE_SIZE, + pr_buf(out, "free_inc: %zu/%zu\n" "free[RESERVE_BTREE]: %zu/%zu\n" "free[RESERVE_MOVINGGC]: %zu/%zu\n" @@ -861,27 +843,27 @@ static ssize_t show_dev_alloc_debug(struct bch_dev *ca, char *buf) fifo_used(&ca->free[RESERVE_NONE]), ca->free[RESERVE_NONE].size, ca->mi.nbuckets - ca->mi.first_bucket, stats.buckets_alloc, - stats.buckets[BCH_DATA_SB], - stats.buckets[BCH_DATA_JOURNAL], - stats.buckets[BCH_DATA_BTREE], - stats.buckets[BCH_DATA_USER], - stats.buckets[BCH_DATA_CACHED], + stats.buckets[BCH_DATA_sb], + stats.buckets[BCH_DATA_journal], + 
stats.buckets[BCH_DATA_btree], + stats.buckets[BCH_DATA_user], + stats.buckets[BCH_DATA_cached], stats.buckets_ec, - ca->mi.nbuckets - ca->mi.first_bucket - stats.buckets_unavailable, - stats.sectors[BCH_DATA_SB], - stats.sectors[BCH_DATA_JOURNAL], - stats.sectors[BCH_DATA_BTREE], - stats.sectors[BCH_DATA_USER], - stats.sectors[BCH_DATA_CACHED], + __dev_buckets_available(ca, stats), + stats.sectors[BCH_DATA_sb], + stats.sectors[BCH_DATA_journal], + stats.sectors[BCH_DATA_btree], + stats.sectors[BCH_DATA_user], + stats.sectors[BCH_DATA_cached], stats.sectors_ec, stats.sectors_fragmented, - ca->copygc_threshold, + c->copygc_threshold, c->freelist_wait.list.first ? "waiting" : "empty", c->open_buckets_nr_free, OPEN_BUCKETS_COUNT, BTREE_NODE_OPEN_BUCKET_RESERVE, c->open_buckets_wait.list.first ? "waiting" : "empty", - nr[BCH_DATA_BTREE], - nr[BCH_DATA_USER], + nr[BCH_DATA_btree], + nr[BCH_DATA_user], c->btree_reserve_cache_nr); } @@ -891,21 +873,18 @@ static const char * const bch2_rw[] = { NULL }; -static ssize_t show_dev_iodone(struct bch_dev *ca, char *buf) +static void dev_iodone_to_text(struct printbuf *out, struct bch_dev *ca) { - struct printbuf out = _PBUF(buf, PAGE_SIZE); int rw, i; for (rw = 0; rw < 2; rw++) { - pr_buf(&out, "%s:\n", bch2_rw[rw]); + pr_buf(out, "%s:\n", bch2_rw[rw]); for (i = 1; i < BCH_DATA_NR; i++) - pr_buf(&out, "%-12s:%12llu\n", + pr_buf(out, "%-12s:%12llu\n", bch2_data_types[i], percpu_u64_get(&ca->io_done->sectors[rw][i]) << 9); } - - return out.pos - buf; } SHOW(bch2_dev) @@ -942,8 +921,6 @@ SHOW(bch2_dev) return out.pos - buf; } - sysfs_pd_controller_show(copy_gc, &ca->copygc_pd); - if (attr == &sysfs_cache_replacement_policy) { bch2_string_opt_to_text(&out, bch2_cache_replacement_policies, @@ -959,34 +936,44 @@ SHOW(bch2_dev) return out.pos - buf; } - if (attr == &sysfs_iodone) - return show_dev_iodone(ca, buf); + if (attr == &sysfs_iodone) { + dev_iodone_to_text(&out, ca); + return out.pos - buf; + } sysfs_print(io_latency_read, 
atomic64_read(&ca->cur_latency[READ])); sysfs_print(io_latency_write, atomic64_read(&ca->cur_latency[WRITE])); - if (attr == &sysfs_io_latency_stats_read) - return bch2_time_stats_print(&ca->io_latency[READ], buf, PAGE_SIZE); - if (attr == &sysfs_io_latency_stats_write) - return bch2_time_stats_print(&ca->io_latency[WRITE], buf, PAGE_SIZE); + if (attr == &sysfs_io_latency_stats_read) { + bch2_time_stats_to_text(&out, &ca->io_latency[READ]); + return out.pos - buf; + } + if (attr == &sysfs_io_latency_stats_write) { + bch2_time_stats_to_text(&out, &ca->io_latency[WRITE]); + return out.pos - buf; + } sysfs_printf(congested, "%u%%", clamp(atomic_read(&ca->congested), 0, CONGESTED_MAX) * 100 / CONGESTED_MAX); if (attr == &sysfs_bucket_quantiles_last_read) - return show_quantiles(c, ca, buf, bucket_last_io_fn, (void *) 0); + return quantiles_to_text(&out, c, ca, bucket_last_io_fn, (void *) 0) ?: out.pos - buf; if (attr == &sysfs_bucket_quantiles_last_write) - return show_quantiles(c, ca, buf, bucket_last_io_fn, (void *) 1); + return quantiles_to_text(&out, c, ca, bucket_last_io_fn, (void *) 1) ?: out.pos - buf; if (attr == &sysfs_bucket_quantiles_fragmentation) - return show_quantiles(c, ca, buf, bucket_sectors_used_fn, NULL); + return quantiles_to_text(&out, c, ca, bucket_sectors_used_fn, NULL) ?: out.pos - buf; if (attr == &sysfs_bucket_quantiles_oldest_gen) - return show_quantiles(c, ca, buf, bucket_oldest_gen_fn, NULL); + return quantiles_to_text(&out, c, ca, bucket_oldest_gen_fn, NULL) ?: out.pos - buf; - if (attr == &sysfs_reserve_stats) - return show_reserve_stats(ca, buf); - if (attr == &sysfs_alloc_debug) - return show_dev_alloc_debug(ca, buf); + if (attr == &sysfs_reserve_stats) { + reserve_stats_to_text(&out, ca); + return out.pos - buf; + } + if (attr == &sysfs_alloc_debug) { + dev_alloc_debug_to_text(&out, ca); + return out.pos - buf; + } return 0; } @@ -997,8 +984,6 @@ STORE(bch2_dev) struct bch_fs *c = ca->fs; struct bch_member *mi; - 
sysfs_pd_controller_store(copy_gc, &ca->copygc_pd); - if (attr == &sysfs_discard) { bool v = strtoul_or_return(buf); @@ -1083,8 +1068,6 @@ struct attribute *bch2_dev_files[] = { /* debug: */ &sysfs_alloc_debug, &sysfs_wake_allocator, - - sysfs_pd_controller_files(copy_gc), NULL }; diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c index e69d03d1109f..fd4044a6a08f 100644 --- a/fs/bcachefs/util.c +++ b/fs/bcachefs/util.c @@ -318,43 +318,40 @@ static void pr_time_units(struct printbuf *out, u64 ns) pr_buf(out, "%llu %s", div_u64(ns, u->nsecs), u->name); } -size_t bch2_time_stats_print(struct time_stats *stats, char *buf, size_t len) +void bch2_time_stats_to_text(struct printbuf *out, struct time_stats *stats) { - struct printbuf out = _PBUF(buf, len); const struct time_unit *u; u64 freq = READ_ONCE(stats->average_frequency); u64 q, last_q = 0; int i; - pr_buf(&out, "count:\t\t%llu\n", + pr_buf(out, "count:\t\t%llu\n", stats->count); - pr_buf(&out, "rate:\t\t%llu/sec\n", + pr_buf(out, "rate:\t\t%llu/sec\n", freq ? div64_u64(NSEC_PER_SEC, freq) : 0); - pr_buf(&out, "frequency:\t"); - pr_time_units(&out, freq); + pr_buf(out, "frequency:\t"); + pr_time_units(out, freq); - pr_buf(&out, "\navg duration:\t"); - pr_time_units(&out, stats->average_duration); + pr_buf(out, "\navg duration:\t"); + pr_time_units(out, stats->average_duration); - pr_buf(&out, "\nmax duration:\t"); - pr_time_units(&out, stats->max_duration); + pr_buf(out, "\nmax duration:\t"); + pr_time_units(out, stats->max_duration); i = eytzinger0_first(NR_QUANTILES); u = pick_time_units(stats->quantiles.entries[i].m); - pr_buf(&out, "\nquantiles (%s):\t", u->name); + pr_buf(out, "\nquantiles (%s):\t", u->name); eytzinger0_for_each(i, NR_QUANTILES) { bool is_last = eytzinger0_next(i, NR_QUANTILES) == -1; q = max(stats->quantiles.entries[i].m, last_q); - pr_buf(&out, "%llu%s", + pr_buf(out, "%llu%s", div_u64(q, u->nsecs), is_last ? 
"\n" : " "); last_q = q; } - - return out.pos - buf; } void bch2_time_stats_exit(struct time_stats *stats) diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h index 2b19a0038045..4dcd28456e00 100644 --- a/fs/bcachefs/util.h +++ b/fs/bcachefs/util.h @@ -398,7 +398,7 @@ static inline void bch2_time_stats_update(struct time_stats *stats, u64 start) __bch2_time_stats_update(stats, start, local_clock()); } -size_t bch2_time_stats_print(struct time_stats *, char *, size_t); +void bch2_time_stats_to_text(struct printbuf *, struct time_stats *); void bch2_time_stats_exit(struct time_stats *); void bch2_time_stats_init(struct time_stats *); @@ -664,35 +664,6 @@ static inline void memset_u64s_tail(void *s, int c, unsigned bytes) memset(s + bytes, c, rem); } -static inline struct bio_vec next_contig_bvec(struct bio *bio, - struct bvec_iter *iter) -{ - struct bio_vec bv = bio_iter_iovec(bio, *iter); - - bio_advance_iter(bio, iter, bv.bv_len); -#ifndef CONFIG_HIGHMEM - while (iter->bi_size) { - struct bio_vec next = bio_iter_iovec(bio, *iter); - - if (page_address(bv.bv_page) + bv.bv_offset + bv.bv_len != - page_address(next.bv_page) + next.bv_offset) - break; - - bv.bv_len += next.bv_len; - bio_advance_iter(bio, iter, next.bv_len); - } -#endif - return bv; -} - -#define __bio_for_each_contig_segment(bv, bio, iter, start) \ - for (iter = (start); \ - (iter).bi_size && \ - ((bv = next_contig_bvec((bio), &(iter))), 1);) - -#define bio_for_each_contig_segment(bv, bio, iter) \ - __bio_for_each_contig_segment(bv, bio, iter, (bio)->bi_iter) - void sort_cmp_size(void *base, size_t num, size_t size, int (*cmp_func)(const void *, const void *, size_t), void (*swap_func)(void *, void *, size_t)); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 745b2d0dcf78..09887c0f9a03 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1013,6 +1013,7 @@ extern void blk_execute_rq_nowait(struct request_queue *, struct gendisk *, int blk_status_to_errno(blk_status_t 
status); blk_status_t errno_to_blk_status(int errno); +const char *blk_status_to_str(blk_status_t status); bool blk_poll(struct request_queue *q, blk_qc_t cookie); diff --git a/include/trace/events/bcachefs.h b/include/trace/events/bcachefs.h index bafbccafae30..9b4e8295ed75 100644 --- a/include/trace/events/bcachefs.h +++ b/include/trace/events/bcachefs.h @@ -470,10 +470,10 @@ TRACE_EVENT(move_data, ); TRACE_EVENT(copygc, - TP_PROTO(struct bch_dev *ca, + TP_PROTO(struct bch_fs *c, u64 sectors_moved, u64 sectors_not_moved, u64 buckets_moved, u64 buckets_not_moved), - TP_ARGS(ca, + TP_ARGS(c, sectors_moved, sectors_not_moved, buckets_moved, buckets_not_moved), @@ -486,7 +486,7 @@ TRACE_EVENT(copygc, ), TP_fast_assign( - memcpy(__entry->uuid, ca->uuid.b, 16); + memcpy(__entry->uuid, c->sb.user_uuid.b, 16); __entry->sectors_moved = sectors_moved; __entry->sectors_not_moved = sectors_not_moved; __entry->buckets_moved = buckets_moved; diff --git a/kernel/locking/six.c b/kernel/locking/six.c index 3acee748e052..49d46ed2e18e 100644 --- a/kernel/locking/six.c +++ b/kernel/locking/six.c @@ -15,7 +15,7 @@ #endif #define six_acquire(l, t) lock_acquire(l, 0, t, 0, 0, NULL, _RET_IP_) -#define six_release(l) lock_release(l, 0, _RET_IP_) +#define six_release(l) lock_release(l, _RET_IP_) struct six_lock_vals { /* Value we add to the lock in order to take the lock: */ |