| author    | Kent Overstreet <kent.overstreet@gmail.com> | 2021-04-04 22:34:05 -0400 |
|---|---|---|
| committer | Kent Overstreet <kent.overstreet@gmail.com> | 2021-04-04 22:34:54 -0400 |
| commit    | 2625a7f0726213283750cd65043363f679fda9e8 | |
| tree      | 69f1d080efbbceb74b329d7285f581a438df8f05 | |
| parent    | bd6be71b2d23f021dccdba0103826de7632c8e63 | |
Merge with 8eb434efa5 bcachefs: Use x-macros for compat feature bits
Signed-off-by: Kent Overstreet <kent.overstreet@gmail.com>
78 files changed, 3804 insertions, 3162 deletions
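The merge subject refers to the x-macro idiom that the alloc changes below lean on heavily (`BCH_ALLOC_FIELDS_V1()`/`BCH_ALLOC_FIELDS_V2()`): a single field list expanded several times so that parallel definitions cannot drift apart. Here is a minimal, self-contained sketch of the pattern; the `DEMO_*` names are illustrative and are not bcachefs code.

```c
/*
 * Sketch of the x-macro pattern (illustrative names, not from bcachefs):
 * one list macro generates the enum of field indices and the parallel
 * table of per-field sizes, keeping them in sync by construction.
 */
#include <stdio.h>

#define DEMO_FIELDS()            \
    x(read_time,      16)        \
    x(write_time,     16)        \
    x(dirty_sectors,  16)

/* First expansion: the enum of field indices. */
enum demo_field {
#define x(name, bits)    DEMO_FIELD_##name,
    DEMO_FIELDS()
#undef x
    DEMO_FIELD_NR
};

/* Second expansion: per-field size in bytes, indexed by the enum above. */
static const unsigned demo_field_bytes[] = {
#define x(name, bits)    [DEMO_FIELD_##name] = bits / 8,
    DEMO_FIELDS()
#undef x
};

int main(void)
{
    /* The two expansions always agree on count and order: */
    for (unsigned i = 0; i < DEMO_FIELD_NR; i++)
        printf("field %u: %u bytes\n", i, demo_field_bytes[i]);
    return 0;
}
```

Adding a field to the list updates the enum, the size table, and every other expansion site in one place, which is how the diff below can introduce `stripe` and `stripe_redundancy` fields without touching each consumer by hand.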
diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 09a7f8c8583a..8e9cbc0625f7 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -14,6 +14,7 @@ #include "ec.h" #include "error.h" #include "recovery.h" +#include "varint.h" #include <linux/kthread.h> #include <linux/math64.h> @@ -24,15 +25,12 @@ #include <linux/sort.h> #include <trace/events/bcachefs.h> -static const char * const bch2_alloc_field_names[] = { -#define x(name, bytes) #name, - BCH_ALLOC_FIELDS() +static const unsigned BCH_ALLOC_V1_FIELD_BYTES[] = { +#define x(name, bits) [BCH_ALLOC_FIELD_V1_##name] = bits / 8, + BCH_ALLOC_FIELDS_V1() #undef x - NULL }; -static void bch2_recalc_oldest_io(struct bch_fs *, struct bch_dev *, int); - /* Ratelimiting/PD controllers */ static void pd_controllers_update(struct work_struct *work) @@ -54,10 +52,10 @@ static void pd_controllers_update(struct work_struct *work) * reclaimed by copy GC */ fragmented += max_t(s64, 0, (bucket_to_sector(ca, - stats.buckets[BCH_DATA_user] + - stats.buckets[BCH_DATA_cached]) - - (stats.sectors[BCH_DATA_user] + - stats.sectors[BCH_DATA_cached])) << 9); + stats.d[BCH_DATA_user].buckets + + stats.d[BCH_DATA_cached].buckets) - + (stats.d[BCH_DATA_user].sectors + + stats.d[BCH_DATA_cached].sectors)) << 9); } bch2_pd_controller_update(&c->copygc_pd, free, fragmented, -1); @@ -67,10 +65,10 @@ static void pd_controllers_update(struct work_struct *work) /* Persistent alloc info: */ -static inline u64 get_alloc_field(const struct bch_alloc *a, - const void **p, unsigned field) +static inline u64 alloc_field_v1_get(const struct bch_alloc *a, + const void **p, unsigned field) { - unsigned bytes = BCH_ALLOC_FIELD_BYTES[field]; + unsigned bytes = BCH_ALLOC_V1_FIELD_BYTES[field]; u64 v; if (!(a->fields & (1 << field))) @@ -97,10 +95,10 @@ static inline u64 get_alloc_field(const struct bch_alloc *a, return v; } -static inline void put_alloc_field(struct bkey_i_alloc *a, void **p, - unsigned field, u64 v) +static inline void alloc_field_v1_put(struct bkey_i_alloc *a, void **p, + unsigned field, u64 v) { - unsigned bytes = BCH_ALLOC_FIELD_BYTES[field]; + unsigned bytes = BCH_ALLOC_V1_FIELD_BYTES[field]; if (!v) return; @@ -127,55 +125,149 @@ static inline void put_alloc_field(struct bkey_i_alloc *a, void **p, *p += bytes; } -struct bkey_alloc_unpacked bch2_alloc_unpack(struct bkey_s_c k) +static void bch2_alloc_unpack_v1(struct bkey_alloc_unpacked *out, + struct bkey_s_c k) { - struct bkey_alloc_unpacked ret = { .gen = 0 }; + const struct bch_alloc *in = bkey_s_c_to_alloc(k).v; + const void *d = in->data; + unsigned idx = 0; - if (k.k->type == KEY_TYPE_alloc) { - const struct bch_alloc *a = bkey_s_c_to_alloc(k).v; - const void *d = a->data; - unsigned idx = 0; + out->gen = in->gen; - ret.gen = a->gen; +#define x(_name, _bits) out->_name = alloc_field_v1_get(in, &d, idx++); + BCH_ALLOC_FIELDS_V1() +#undef x +} + +static void bch2_alloc_pack_v1(struct bkey_alloc_buf *dst, + const struct bkey_alloc_unpacked src) +{ + struct bkey_i_alloc *a = bkey_alloc_init(&dst->k); + void *d = a->v.data; + unsigned bytes, idx = 0; -#define x(_name, _bits) ret._name = get_alloc_field(a, &d, idx++); - BCH_ALLOC_FIELDS() + a->k.p = POS(src.dev, src.bucket); + a->v.fields = 0; + a->v.gen = src.gen; + +#define x(_name, _bits) alloc_field_v1_put(a, &d, idx++, src._name); + BCH_ALLOC_FIELDS_V1() #undef x - } - return ret; + bytes = (void *) d - (void *) &a->v; + set_bkey_val_bytes(&a->k, bytes); + memset_u64s_tail(&a->v, 0, bytes); } -void 
bch2_alloc_pack(struct bkey_i_alloc *dst, - const struct bkey_alloc_unpacked src) +static int bch2_alloc_unpack_v2(struct bkey_alloc_unpacked *out, + struct bkey_s_c k) { - unsigned idx = 0; - void *d = dst->v.data; + struct bkey_s_c_alloc_v2 a = bkey_s_c_to_alloc_v2(k); + const u8 *in = a.v->data; + const u8 *end = bkey_val_end(a); + unsigned fieldnr = 0; + int ret; + u64 v; + + out->gen = a.v->gen; + out->oldest_gen = a.v->oldest_gen; + out->data_type = a.v->data_type; + +#define x(_name, _bits) \ + if (fieldnr < a.v->nr_fields) { \ + ret = bch2_varint_decode(in, end, &v); \ + if (ret < 0) \ + return ret; \ + in += ret; \ + } else { \ + v = 0; \ + } \ + out->_name = v; \ + if (v != out->_name) \ + return -1; \ + fieldnr++; + + BCH_ALLOC_FIELDS_V2() +#undef x + return 0; +} + +static void bch2_alloc_pack_v2(struct bkey_alloc_buf *dst, + const struct bkey_alloc_unpacked src) +{ + struct bkey_i_alloc_v2 *a = bkey_alloc_v2_init(&dst->k); + unsigned nr_fields = 0, last_nonzero_fieldnr = 0; + u8 *out = a->v.data; + u8 *end = (void *) &dst[1]; + u8 *last_nonzero_field = out; unsigned bytes; - dst->v.fields = 0; - dst->v.gen = src.gen; + a->k.p = POS(src.dev, src.bucket); + a->v.gen = src.gen; + a->v.oldest_gen = src.oldest_gen; + a->v.data_type = src.data_type; + +#define x(_name, _bits) \ + nr_fields++; \ + \ + if (src._name) { \ + out += bch2_varint_encode(out, src._name); \ + \ + last_nonzero_field = out; \ + last_nonzero_fieldnr = nr_fields; \ + } else { \ + *out++ = 0; \ + } -#define x(_name, _bits) put_alloc_field(dst, &d, idx++, src._name); - BCH_ALLOC_FIELDS() + BCH_ALLOC_FIELDS_V2() #undef x + BUG_ON(out > end); + + out = last_nonzero_field; + a->v.nr_fields = last_nonzero_fieldnr; + + bytes = (u8 *) out - (u8 *) &a->v; + set_bkey_val_bytes(&a->k, bytes); + memset_u64s_tail(&a->v, 0, bytes); +} + +struct bkey_alloc_unpacked bch2_alloc_unpack(struct bkey_s_c k) +{ + struct bkey_alloc_unpacked ret = { + .dev = k.k->p.inode, + .bucket = k.k->p.offset, + .gen = 0, + }; + + if (k.k->type == KEY_TYPE_alloc_v2) + bch2_alloc_unpack_v2(&ret, k); + else if (k.k->type == KEY_TYPE_alloc) + bch2_alloc_unpack_v1(&ret, k); - bytes = (void *) d - (void *) &dst->v; - set_bkey_val_bytes(&dst->k, bytes); - memset_u64s_tail(&dst->v, 0, bytes); + return ret; +} + +void bch2_alloc_pack(struct bch_fs *c, + struct bkey_alloc_buf *dst, + const struct bkey_alloc_unpacked src) +{ + if (c->sb.features & (1ULL << BCH_FEATURE_alloc_v2)) + bch2_alloc_pack_v2(dst, src); + else + bch2_alloc_pack_v1(dst, src); } static unsigned bch_alloc_val_u64s(const struct bch_alloc *a) { unsigned i, bytes = offsetof(struct bch_alloc, data); - for (i = 0; i < ARRAY_SIZE(BCH_ALLOC_FIELD_BYTES); i++) + for (i = 0; i < ARRAY_SIZE(BCH_ALLOC_V1_FIELD_BYTES); i++) if (a->fields & (1 << i)) - bytes += BCH_ALLOC_FIELD_BYTES[i]; + bytes += BCH_ALLOC_V1_FIELD_BYTES[i]; return DIV_ROUND_UP(bytes, sizeof(u64)); } -const char *bch2_alloc_invalid(const struct bch_fs *c, struct bkey_s_c k) +const char *bch2_alloc_v1_invalid(const struct bch_fs *c, struct bkey_s_c k) { struct bkey_s_c_alloc a = bkey_s_c_to_alloc(k); @@ -190,20 +282,30 @@ const char *bch2_alloc_invalid(const struct bch_fs *c, struct bkey_s_c k) return NULL; } -void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, - struct bkey_s_c k) +const char *bch2_alloc_v2_invalid(const struct bch_fs *c, struct bkey_s_c k) { - struct bkey_s_c_alloc a = bkey_s_c_to_alloc(k); - const void *d = a.v->data; - unsigned i; + struct bkey_alloc_unpacked u; + + if (k.k->p.inode >= 
c->sb.nr_devices || + !c->devs[k.k->p.inode]) + return "invalid device"; - pr_buf(out, "gen %u", a.v->gen); + if (bch2_alloc_unpack_v2(&u, k)) + return "unpack error"; - for (i = 0; i < BCH_ALLOC_FIELD_NR; i++) - if (a.v->fields & (1 << i)) - pr_buf(out, " %s %llu", - bch2_alloc_field_names[i], - get_alloc_field(a.v, &d, i)); + return NULL; +} + +void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, + struct bkey_s_c k) +{ + struct bkey_alloc_unpacked u = bch2_alloc_unpack(k); + + pr_buf(out, "gen %u oldest_gen %u data_type %u", + u.gen, u.oldest_gen, u.data_type); +#define x(_name, ...) pr_buf(out, #_name " %llu ", (u64) u._name); + BCH_ALLOC_FIELDS_V2() +#undef x } static int bch2_alloc_read_fn(struct bch_fs *c, enum btree_id id, @@ -213,11 +315,13 @@ static int bch2_alloc_read_fn(struct bch_fs *c, enum btree_id id, struct bucket *g; struct bkey_alloc_unpacked u; - if (level || k.k->type != KEY_TYPE_alloc) + if (level || + (k.k->type != KEY_TYPE_alloc && + k.k->type != KEY_TYPE_alloc_v2)) return 0; ca = bch_dev_bkey_exists(c, k.k->p.inode); - g = __bucket(ca, k.k->p.offset, 0); + g = bucket(ca, k.k->p.offset); u = bch2_alloc_unpack(k); g->_mark.gen = u.gen; @@ -234,9 +338,7 @@ static int bch2_alloc_read_fn(struct bch_fs *c, enum btree_id id, int bch2_alloc_read(struct bch_fs *c, struct journal_keys *journal_keys) { - struct bch_dev *ca; - unsigned i; - int ret = 0; + int ret; down_read(&c->gc_lock); ret = bch2_btree_and_journal_walk(c, journal_keys, BTREE_ID_ALLOC, @@ -248,26 +350,6 @@ int bch2_alloc_read(struct bch_fs *c, struct journal_keys *journal_keys) return ret; } - percpu_down_write(&c->mark_lock); - bch2_dev_usage_from_buckets(c); - percpu_up_write(&c->mark_lock); - - mutex_lock(&c->bucket_clock[READ].lock); - for_each_member_device(ca, c, i) { - down_read(&ca->bucket_lock); - bch2_recalc_oldest_io(c, ca, READ); - up_read(&ca->bucket_lock); - } - mutex_unlock(&c->bucket_clock[READ].lock); - - mutex_lock(&c->bucket_clock[WRITE].lock); - for_each_member_device(ca, c, i) { - down_read(&ca->bucket_lock); - bch2_recalc_oldest_io(c, ca, WRITE); - up_read(&ca->bucket_lock); - } - mutex_unlock(&c->bucket_clock[WRITE].lock); - return 0; } @@ -278,12 +360,10 @@ static int bch2_alloc_write_key(struct btree_trans *trans, struct bch_fs *c = trans->c; struct bkey_s_c k; struct bch_dev *ca; - struct bucket_array *ba; struct bucket *g; struct bucket_mark m; struct bkey_alloc_unpacked old_u, new_u; - __BKEY_PADDED(k, 8) alloc_key; /* hack: */ - struct bkey_i_alloc *a; + struct bkey_alloc_buf a; int ret; retry: bch2_trans_begin(trans); @@ -302,193 +382,60 @@ retry: percpu_down_read(&c->mark_lock); ca = bch_dev_bkey_exists(c, iter->pos.inode); - ba = bucket_array(ca); - - g = &ba->b[iter->pos.offset]; + g = bucket(ca, iter->pos.offset); m = READ_ONCE(g->mark); - new_u = alloc_mem_to_key(g, m); + new_u = alloc_mem_to_key(iter, g, m); percpu_up_read(&c->mark_lock); if (!bkey_alloc_unpacked_cmp(old_u, new_u)) return 0; - a = bkey_alloc_init(&alloc_key.k); - a->k.p = iter->pos; - bch2_alloc_pack(a, new_u); - - bch2_trans_update(trans, iter, &a->k_i, + bch2_alloc_pack(c, &a, new_u); + bch2_trans_update(trans, iter, &a.k, BTREE_TRIGGER_NORUN); ret = bch2_trans_commit(trans, NULL, NULL, - BTREE_INSERT_NOFAIL| - BTREE_INSERT_USE_RESERVE| - flags); + BTREE_INSERT_NOFAIL|flags); err: if (ret == -EINTR) goto retry; return ret; } -int bch2_dev_alloc_write(struct bch_fs *c, struct bch_dev *ca, unsigned flags) +int bch2_alloc_write(struct bch_fs *c, unsigned flags) { struct btree_trans trans; struct 
btree_iter *iter; - u64 first_bucket, nbuckets; + struct bch_dev *ca; + unsigned i; int ret = 0; - percpu_down_read(&c->mark_lock); - first_bucket = bucket_array(ca)->first_bucket; - nbuckets = bucket_array(ca)->nbuckets; - percpu_up_read(&c->mark_lock); - - BUG_ON(BKEY_ALLOC_VAL_U64s_MAX > 8); - bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); - iter = bch2_trans_get_iter(&trans, BTREE_ID_ALLOC, - POS(ca->dev_idx, first_bucket), + iter = bch2_trans_get_iter(&trans, BTREE_ID_ALLOC, POS_MIN, BTREE_ITER_SLOTS|BTREE_ITER_INTENT); - while (iter->pos.offset < nbuckets) { - bch2_trans_cond_resched(&trans); - - ret = bch2_alloc_write_key(&trans, iter, flags); - if (ret) - break; - bch2_btree_iter_next_slot(iter); - } - - bch2_trans_exit(&trans); - - return ret; -} + for_each_member_device(ca, c, i) { + bch2_btree_iter_set_pos(iter, + POS(ca->dev_idx, ca->mi.first_bucket)); -int bch2_alloc_write(struct bch_fs *c, unsigned flags) -{ - struct bch_dev *ca; - unsigned i; - int ret = 0; + while (iter->pos.offset < ca->mi.nbuckets) { + bch2_trans_cond_resched(&trans); - for_each_rw_member(ca, c, i) { - bch2_dev_alloc_write(c, ca, flags); - if (ret) { - percpu_ref_put(&ca->io_ref); - break; + ret = bch2_alloc_write_key(&trans, iter, flags); + if (ret) { + percpu_ref_put(&ca->io_ref); + goto err; + } + bch2_btree_iter_next_slot(iter); } } - +err: + bch2_trans_exit(&trans); return ret; } /* Bucket IO clocks: */ -static void bch2_recalc_oldest_io(struct bch_fs *c, struct bch_dev *ca, int rw) -{ - struct bucket_clock *clock = &c->bucket_clock[rw]; - struct bucket_array *buckets = bucket_array(ca); - struct bucket *g; - u16 max_last_io = 0; - unsigned i; - - lockdep_assert_held(&c->bucket_clock[rw].lock); - - /* Recalculate max_last_io for this device: */ - for_each_bucket(g, buckets) - max_last_io = max(max_last_io, bucket_last_io(c, g, rw)); - - ca->max_last_bucket_io[rw] = max_last_io; - - /* Recalculate global max_last_io: */ - max_last_io = 0; - - for_each_member_device(ca, c, i) - max_last_io = max(max_last_io, ca->max_last_bucket_io[rw]); - - clock->max_last_io = max_last_io; -} - -static void bch2_rescale_bucket_io_times(struct bch_fs *c, int rw) -{ - struct bucket_clock *clock = &c->bucket_clock[rw]; - struct bucket_array *buckets; - struct bch_dev *ca; - struct bucket *g; - unsigned i; - - trace_rescale_prios(c); - - for_each_member_device(ca, c, i) { - down_read(&ca->bucket_lock); - buckets = bucket_array(ca); - - for_each_bucket(g, buckets) - g->io_time[rw] = clock->hand - - bucket_last_io(c, g, rw) / 2; - - bch2_recalc_oldest_io(c, ca, rw); - - up_read(&ca->bucket_lock); - } -} - -static inline u64 bucket_clock_freq(u64 capacity) -{ - return max(capacity >> 10, 2028ULL); -} - -static void bch2_inc_clock_hand(struct io_timer *timer) -{ - struct bucket_clock *clock = container_of(timer, - struct bucket_clock, rescale); - struct bch_fs *c = container_of(clock, - struct bch_fs, bucket_clock[clock->rw]); - struct bch_dev *ca; - u64 capacity; - unsigned i; - - mutex_lock(&clock->lock); - - /* if clock cannot be advanced more, rescale prio */ - if (clock->max_last_io >= U16_MAX - 2) - bch2_rescale_bucket_io_times(c, clock->rw); - - BUG_ON(clock->max_last_io >= U16_MAX - 2); - - for_each_member_device(ca, c, i) - ca->max_last_bucket_io[clock->rw]++; - clock->max_last_io++; - clock->hand++; - - mutex_unlock(&clock->lock); - - capacity = READ_ONCE(c->capacity); - - if (!capacity) - return; - - /* - * we only increment when 0.1% of the filesystem capacity has been read - * or written too, this determines if 
it's time - * - * XXX: we shouldn't really be going off of the capacity of devices in - * RW mode (that will be 0 when we're RO, yet we can still service - * reads) - */ - timer->expire += bucket_clock_freq(capacity); - - bch2_io_timer_add(&c->io_clock[clock->rw], timer); -} - -static void bch2_bucket_clock_init(struct bch_fs *c, int rw) -{ - struct bucket_clock *clock = &c->bucket_clock[rw]; - - clock->hand = 1; - clock->rw = rw; - clock->rescale.fn = bch2_inc_clock_hand; - clock->rescale.expire = bucket_clock_freq(c->capacity); - mutex_init(&clock->lock); -} - int bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev, size_t bucket_nr, int rw) { @@ -496,37 +443,38 @@ int bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev, struct bch_dev *ca = bch_dev_bkey_exists(c, dev); struct btree_iter *iter; struct bucket *g; - struct bkey_i_alloc *a; + struct bkey_alloc_buf *a; struct bkey_alloc_unpacked u; - u16 *time; + u64 *time, now; int ret = 0; iter = bch2_trans_get_iter(trans, BTREE_ID_ALLOC, POS(dev, bucket_nr), BTREE_ITER_CACHED| BTREE_ITER_CACHED_NOFILL| BTREE_ITER_INTENT); - a = bch2_trans_kmalloc(trans, BKEY_ALLOC_U64s_MAX * 8); + ret = bch2_btree_iter_traverse(iter); + if (ret) + goto out; + + a = bch2_trans_kmalloc(trans, sizeof(struct bkey_alloc_buf)); ret = PTR_ERR_OR_ZERO(a); if (ret) goto out; percpu_down_read(&c->mark_lock); g = bucket(ca, bucket_nr); - u = alloc_mem_to_key(g, READ_ONCE(g->mark)); + u = alloc_mem_to_key(iter, g, READ_ONCE(g->mark)); percpu_up_read(&c->mark_lock); - bkey_alloc_init(&a->k_i); - a->k.p = iter->pos; - time = rw == READ ? &u.read_time : &u.write_time; - if (*time == c->bucket_clock[rw].hand) + now = atomic64_read(&c->io_clock[rw].now); + if (*time == now) goto out; - *time = c->bucket_clock[rw].hand; + *time = now; - bch2_alloc_pack(a, u); - - ret = bch2_trans_update(trans, iter, &a->k_i, 0) ?: + bch2_alloc_pack(c, a, u); + ret = bch2_trans_update(trans, iter, &a->k, 0) ?: bch2_trans_commit(trans, NULL, NULL, 0); out: bch2_trans_iter_put(trans, iter); @@ -550,7 +498,8 @@ out: static int wait_buckets_available(struct bch_fs *c, struct bch_dev *ca) { unsigned long gc_count = c->gc_count; - u64 available; + s64 available; + unsigned i; int ret = 0; ca->allocator_state = ALLOCATOR_BLOCKED; @@ -566,13 +515,19 @@ static int wait_buckets_available(struct bch_fs *c, struct bch_dev *ca) if (gc_count != c->gc_count) ca->inc_gen_really_needs_gc = 0; - available = max_t(s64, 0, dev_buckets_available(ca) - - ca->inc_gen_really_needs_gc); + available = dev_buckets_available(ca); + available -= ca->inc_gen_really_needs_gc; + + spin_lock(&c->freelist_lock); + for (i = 0; i < RESERVE_NR; i++) + available -= fifo_used(&ca->free[i]); + spin_unlock(&c->freelist_lock); + + available = max(available, 0LL); if (available > fifo_free(&ca->free_inc) || (available && - (!fifo_full(&ca->free[RESERVE_BTREE]) || - !fifo_full(&ca->free[RESERVE_MOVINGGC])))) + !fifo_full(&ca->free[RESERVE_MOVINGGC]))) break; up_read(&c->gc_lock); @@ -588,20 +543,22 @@ static int wait_buckets_available(struct bch_fs *c, struct bch_dev *ca) return ret; } -static bool bch2_can_invalidate_bucket(struct bch_dev *ca, - size_t bucket, - struct bucket_mark mark) +static bool bch2_can_invalidate_bucket(struct bch_dev *ca, size_t b, + struct bucket_mark m) { u8 gc_gen; - if (!is_available_bucket(mark)) + if (!is_available_bucket(m)) + return false; + + if (m.owned_by_allocator) return false; if (ca->buckets_nouse && - test_bit(bucket, ca->buckets_nouse)) + test_bit(b, 
ca->buckets_nouse)) return false; - gc_gen = bucket_gc_gen(ca, bucket); + gc_gen = bucket_gc_gen(bucket(ca, b)); if (gc_gen >= BUCKET_GC_GEN_MAX / 2) ca->inc_gen_needs_gc++; @@ -615,43 +572,33 @@ static bool bch2_can_invalidate_bucket(struct bch_dev *ca, /* * Determines what order we're going to reuse buckets, smallest bucket_key() * first. - * - * - * - We take into account the read prio of the bucket, which gives us an - * indication of how hot the data is -- we scale the prio so that the prio - * farthest from the clock is worth 1/8th of the closest. - * - * - The number of sectors of cached data in the bucket, which gives us an - * indication of the cost in cache misses this eviction will cause. - * - * - If hotness * sectors used compares equal, we pick the bucket with the - * smallest bucket_gc_gen() - since incrementing the same bucket's generation - * number repeatedly forces us to run mark and sweep gc to avoid generation - * number wraparound. */ -static unsigned long bucket_sort_key(struct bch_fs *c, struct bch_dev *ca, - size_t b, struct bucket_mark m) +static unsigned bucket_sort_key(struct bucket *g, struct bucket_mark m, + u64 now, u64 last_seq_ondisk) { - unsigned last_io = bucket_last_io(c, bucket(ca, b), READ); - unsigned max_last_io = ca->max_last_bucket_io[READ]; + unsigned used = bucket_sectors_used(m); - /* - * Time since last read, scaled to [0, 8) where larger value indicates - * more recently read data: - */ - unsigned long hotness = (max_last_io - last_io) * 7 / max_last_io; - - /* How much we want to keep the data in this bucket: */ - unsigned long data_wantness = - (hotness + 1) * bucket_sectors_used(m); - - unsigned long needs_journal_commit = - bucket_needs_journal_commit(m, c->journal.last_seq_ondisk); + if (used) { + /* + * Prefer to keep buckets that have been read more recently, and + * buckets that have more data in them: + */ + u64 last_read = max_t(s64, 0, now - g->io_time[READ]); + u32 last_read_scaled = max_t(u64, U32_MAX, div_u64(last_read, used)); - return (data_wantness << 9) | - (needs_journal_commit << 8) | - (bucket_gc_gen(ca, b) / 16); + return -last_read_scaled; + } else { + /* + * Prefer to use buckets with smaller gc_gen so that we don't + * have to walk the btree and recalculate oldest_gen - but shift + * off the low bits so that buckets will still have equal sort + * keys when there's only a small difference, so that we can + * keep sequential buckets together: + */ + return (bucket_needs_journal_commit(m, last_seq_ondisk) << 4)| + (bucket_gc_gen(g) >> 4); + } } static inline int bucket_alloc_cmp(alloc_heap *h, @@ -674,16 +621,15 @@ static void find_reclaimable_buckets_lru(struct bch_fs *c, struct bch_dev *ca) { struct bucket_array *buckets; struct alloc_heap_entry e = { 0 }; + u64 now, last_seq_ondisk; size_t b, i, nr = 0; - ca->alloc_heap.used = 0; - - mutex_lock(&c->bucket_clock[READ].lock); down_read(&ca->bucket_lock); buckets = bucket_array(ca); - - bch2_recalc_oldest_io(c, ca, READ); + ca->alloc_heap.used = 0; + now = atomic64_read(&c->io_clock[READ].now); + last_seq_ondisk = c->journal.last_seq_ondisk; /* * Find buckets with lowest read priority, by building a maxheap sorted @@ -691,8 +637,9 @@ static void find_reclaimable_buckets_lru(struct bch_fs *c, struct bch_dev *ca) * all buckets have been visited. 
*/ for (b = ca->mi.first_bucket; b < ca->mi.nbuckets; b++) { - struct bucket_mark m = READ_ONCE(buckets->b[b].mark); - unsigned long key = bucket_sort_key(c, ca, b, m); + struct bucket *g = &buckets->b[b]; + struct bucket_mark m = READ_ONCE(g->mark); + unsigned key = bucket_sort_key(g, m, now, last_seq_ondisk); if (!bch2_can_invalidate_bucket(ca, b, m)) continue; @@ -727,7 +674,6 @@ static void find_reclaimable_buckets_lru(struct bch_fs *c, struct bch_dev *ca) } up_read(&ca->bucket_lock); - mutex_unlock(&c->bucket_clock[READ].lock); } static void find_reclaimable_buckets_fifo(struct bch_fs *c, struct bch_dev *ca) @@ -872,14 +818,8 @@ static int bch2_invalidate_one_bucket2(struct btree_trans *trans, struct btree_iter *iter, u64 *journal_seq, unsigned flags) { -#if 0 - __BKEY_PADDED(k, BKEY_ALLOC_VAL_U64s_MAX) alloc_key; -#else - /* hack: */ - __BKEY_PADDED(k, 8) alloc_key; -#endif struct bch_fs *c = trans->c; - struct bkey_i_alloc *a; + struct bkey_alloc_buf a; struct bkey_alloc_unpacked u; struct bucket *g; struct bucket_mark m; @@ -893,34 +833,33 @@ static int bch2_invalidate_one_bucket2(struct btree_trans *trans, /* first, put on free_inc and mark as owned by allocator: */ percpu_down_read(&c->mark_lock); - spin_lock(&c->freelist_lock); - - verify_not_on_freelist(c, ca, b); - - BUG_ON(!fifo_push(&ca->free_inc, b)); - g = bucket(ca, b); m = READ_ONCE(g->mark); - invalidating_cached_data = m.cached_sectors != 0; + BUG_ON(m.dirty_sectors); + + bch2_mark_alloc_bucket(c, ca, b, true, gc_pos_alloc(c, NULL), 0); + + spin_lock(&c->freelist_lock); + verify_not_on_freelist(c, ca, b); + BUG_ON(!fifo_push(&ca->free_inc, b)); + spin_unlock(&c->freelist_lock); /* * If we're not invalidating cached data, we only increment the bucket * gen in memory here, the incremented gen will be updated in the btree * by bch2_trans_mark_pointer(): */ + if (!m.cached_sectors && + !bucket_needs_journal_commit(m, c->journal.last_seq_ondisk)) { + BUG_ON(m.data_type); + bucket_cmpxchg(g, m, m.gen++); + percpu_up_read(&c->mark_lock); + goto out; + } - if (!invalidating_cached_data) - bch2_invalidate_bucket(c, ca, b, &m); - else - bch2_mark_alloc_bucket(c, ca, b, true, gc_pos_alloc(c, NULL), 0); - - spin_unlock(&c->freelist_lock); percpu_up_read(&c->mark_lock); - if (!invalidating_cached_data) - goto out; - /* * If the read-only path is trying to shut down, we can't be generating * new btree updates: @@ -930,8 +869,6 @@ static int bch2_invalidate_one_bucket2(struct btree_trans *trans, goto out; } - BUG_ON(BKEY_ALLOC_VAL_U64s_MAX > 8); - bch2_btree_iter_set_pos(iter, POS(ca->dev_idx, b)); retry: ret = bch2_btree_iter_traverse(iter); @@ -941,7 +878,7 @@ retry: percpu_down_read(&c->mark_lock); g = bucket(ca, iter->pos.offset); m = READ_ONCE(g->mark); - u = alloc_mem_to_key(g, m); + u = alloc_mem_to_key(iter, g, m); percpu_up_read(&c->mark_lock); @@ -951,14 +888,11 @@ retry: u.data_type = 0; u.dirty_sectors = 0; u.cached_sectors = 0; - u.read_time = c->bucket_clock[READ].hand; - u.write_time = c->bucket_clock[WRITE].hand; - - a = bkey_alloc_init(&alloc_key.k); - a->k.p = iter->pos; - bch2_alloc_pack(a, u); + u.read_time = atomic64_read(&c->io_clock[READ].now); + u.write_time = atomic64_read(&c->io_clock[WRITE].now); - bch2_trans_update(trans, iter, &a->k_i, + bch2_alloc_pack(c, &a, u); + bch2_trans_update(trans, iter, &a.k, BTREE_TRIGGER_BUCKET_INVALIDATE); /* @@ -973,8 +907,7 @@ retry: BTREE_INSERT_NOUNLOCK| BTREE_INSERT_NOCHECK_RW| BTREE_INSERT_NOFAIL| - BTREE_INSERT_USE_RESERVE| - BTREE_INSERT_USE_ALLOC_RESERVE| + 
BTREE_INSERT_JOURNAL_RESERVED| flags); if (ret == -EINTR) goto retry; @@ -1135,6 +1068,12 @@ static int discard_invalidated_buckets(struct bch_fs *c, struct bch_dev *ca) return 0; } +static inline bool allocator_thread_running(struct bch_dev *ca) +{ + return ca->mi.state == BCH_MEMBER_STATE_RW && + test_bit(BCH_FS_ALLOCATOR_RUNNING, &ca->fs->flags); +} + /** * bch_allocator_thread - move buckets from free_inc to reserves * @@ -1151,9 +1090,16 @@ static int bch2_allocator_thread(void *arg) int ret; set_freezable(); - ca->allocator_state = ALLOCATOR_RUNNING; while (1) { + if (!allocator_thread_running(ca)) { + ca->allocator_state = ALLOCATOR_STOPPED; + if (kthread_wait_freezable(allocator_thread_running(ca))) + break; + } + + ca->allocator_state = ALLOCATOR_RUNNING; + cond_resched(); if (kthread_should_stop()) break; @@ -1454,8 +1400,11 @@ int bch2_dev_allocator_start(struct bch_dev *ca) p = kthread_create(bch2_allocator_thread, ca, "bch-alloc/%s", ca->name); - if (IS_ERR(p)) + if (IS_ERR(p)) { + bch_err(ca->fs, "error creating allocator thread: %li", + PTR_ERR(p)); return PTR_ERR(p); + } get_task_struct(p); rcu_assign_pointer(ca->alloc_thread, p); @@ -1466,8 +1415,6 @@ int bch2_dev_allocator_start(struct bch_dev *ca) void bch2_fs_allocator_background_init(struct bch_fs *c) { spin_lock_init(&c->freelist_lock); - bch2_bucket_clock_init(c, READ); - bch2_bucket_clock_init(c, WRITE); c->pd_controllers_update_seconds = 5; INIT_DELAYED_WORK(&c->pd_controllers_update, pd_controllers_update); diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h index d10ff56e4de1..6fededcd9f86 100644 --- a/fs/bcachefs/alloc_background.h +++ b/fs/bcachefs/alloc_background.h @@ -7,12 +7,33 @@ #include "debug.h" struct bkey_alloc_unpacked { + u64 bucket; + u8 dev; u8 gen; + u8 oldest_gen; + u8 data_type; #define x(_name, _bits) u##_bits _name; - BCH_ALLOC_FIELDS() + BCH_ALLOC_FIELDS_V2() #undef x }; +struct bkey_alloc_buf { + struct bkey_i k; + + union { + struct { +#define x(_name, _bits) + _bits / 8 + u8 _pad[8 + BCH_ALLOC_FIELDS_V1()]; +#undef x + } _v1; + struct { +#define x(_name, _bits) + 8 + _bits / 8 + u8 _pad[8 + BCH_ALLOC_FIELDS_V2()]; +#undef x + } _v2; + }; +} __attribute__((packed, aligned(8))); + /* How out of date a pointer gen is allowed to be: */ #define BUCKET_GC_GEN_MAX 96U @@ -20,23 +41,28 @@ struct bkey_alloc_unpacked { static inline bool bkey_alloc_unpacked_cmp(struct bkey_alloc_unpacked l, struct bkey_alloc_unpacked r) { - return l.gen != r.gen -#define x(_name, _bits) || l._name != r._name - BCH_ALLOC_FIELDS() + return l.gen != r.gen || + l.oldest_gen != r.oldest_gen || + l.data_type != r.data_type +#define x(_name, ...) 
|| l._name != r._name + BCH_ALLOC_FIELDS_V2() #undef x ; } struct bkey_alloc_unpacked bch2_alloc_unpack(struct bkey_s_c); -void bch2_alloc_pack(struct bkey_i_alloc *, +void bch2_alloc_pack(struct bch_fs *, struct bkey_alloc_buf *, const struct bkey_alloc_unpacked); int bch2_bucket_io_time_reset(struct btree_trans *, unsigned, size_t, int); static inline struct bkey_alloc_unpacked -alloc_mem_to_key(struct bucket *g, struct bucket_mark m) +alloc_mem_to_key(struct btree_iter *iter, + struct bucket *g, struct bucket_mark m) { return (struct bkey_alloc_unpacked) { + .dev = iter->pos.inode, + .bucket = iter->pos.offset, .gen = m.gen, .oldest_gen = g->oldest_gen, .data_type = m.data_type, @@ -49,11 +75,17 @@ alloc_mem_to_key(struct bucket *g, struct bucket_mark m) #define ALLOC_SCAN_BATCH(ca) max_t(size_t, 1, (ca)->mi.nbuckets >> 9) -const char *bch2_alloc_invalid(const struct bch_fs *, struct bkey_s_c); +const char *bch2_alloc_v1_invalid(const struct bch_fs *, struct bkey_s_c); +const char *bch2_alloc_v2_invalid(const struct bch_fs *, struct bkey_s_c); void bch2_alloc_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); #define bch2_bkey_ops_alloc (struct bkey_ops) { \ - .key_invalid = bch2_alloc_invalid, \ + .key_invalid = bch2_alloc_v1_invalid, \ + .val_to_text = bch2_alloc_to_text, \ +} + +#define bch2_bkey_ops_alloc_v2 (struct bkey_ops) { \ + .key_invalid = bch2_alloc_v2_invalid, \ .val_to_text = bch2_alloc_to_text, \ } @@ -98,7 +130,6 @@ void bch2_dev_allocator_quiesce(struct bch_fs *, struct bch_dev *); void bch2_dev_allocator_stop(struct bch_dev *); int bch2_dev_allocator_start(struct bch_dev *); -int bch2_dev_alloc_write(struct bch_fs *, struct bch_dev *, unsigned); int bch2_alloc_write(struct bch_fs *, unsigned); void bch2_fs_allocator_background_init(struct bch_fs *); diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index 7a92e3d53254..8f0b94f591be 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -192,8 +192,9 @@ long bch2_bucket_alloc_new_fs(struct bch_dev *ca) rcu_read_lock(); buckets = bucket_array(ca); - for (b = ca->mi.first_bucket; b < ca->mi.nbuckets; b++) - if (is_available_bucket(buckets->b[b].mark)) + for (b = buckets->first_bucket; b < buckets->nbuckets; b++) + if (is_available_bucket(buckets->b[b].mark) && + !buckets->b[b].mark.owned_by_allocator) goto success; b = -1; success: @@ -204,9 +205,10 @@ success: static inline unsigned open_buckets_reserved(enum alloc_reserve reserve) { switch (reserve) { - case RESERVE_ALLOC: - return 0; case RESERVE_BTREE: + case RESERVE_BTREE_MOVINGGC: + return 0; + case RESERVE_MOVINGGC: return OPEN_BUCKETS_COUNT / 4; default: return OPEN_BUCKETS_COUNT / 2; @@ -223,9 +225,8 @@ struct open_bucket *bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca, bool may_alloc_partial, struct closure *cl) { - struct bucket_array *buckets; struct open_bucket *ob; - long bucket = 0; + long b = 0; spin_lock(&c->freelist_lock); @@ -259,22 +260,13 @@ struct open_bucket *bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca, return ERR_PTR(-OPEN_BUCKETS_EMPTY); } - if (likely(fifo_pop(&ca->free[RESERVE_NONE], bucket))) + if (likely(fifo_pop(&ca->free[RESERVE_NONE], b))) goto out; switch (reserve) { - case RESERVE_ALLOC: - if (fifo_pop(&ca->free[RESERVE_BTREE], bucket)) - goto out; - break; - case RESERVE_BTREE: - if (fifo_used(&ca->free[RESERVE_BTREE]) * 2 >= - ca->free[RESERVE_BTREE].size && - fifo_pop(&ca->free[RESERVE_BTREE], bucket)) - goto out; - break; + case 
RESERVE_BTREE_MOVINGGC: case RESERVE_MOVINGGC: - if (fifo_pop(&ca->free[RESERVE_MOVINGGC], bucket)) + if (fifo_pop(&ca->free[RESERVE_MOVINGGC], b)) goto out; break; default: @@ -292,20 +284,19 @@ struct open_bucket *bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca, trace_bucket_alloc_fail(ca, reserve); return ERR_PTR(-FREELIST_EMPTY); out: - verify_not_on_freelist(c, ca, bucket); + verify_not_on_freelist(c, ca, b); ob = bch2_open_bucket_alloc(c); spin_lock(&ob->lock); - buckets = bucket_array(ca); ob->valid = true; ob->sectors_free = ca->mi.bucket_size; ob->alloc_reserve = reserve; ob->ptr = (struct bch_extent_ptr) { .type = 1 << BCH_EXTENT_ENTRY_ptr, - .gen = buckets->b[bucket].mark.gen, - .offset = bucket_to_sector(ca, bucket), + .gen = bucket(ca, b)->mark.gen, + .offset = bucket_to_sector(ca, b), .dev = ca->dev_idx, }; @@ -458,16 +449,18 @@ bch2_bucket_alloc_set(struct bch_fs *c, * it's to a device we don't want: */ -static void bucket_alloc_from_stripe(struct bch_fs *c, - struct open_buckets *ptrs, - struct write_point *wp, - struct bch_devs_mask *devs_may_alloc, - u16 target, - unsigned erasure_code, - unsigned nr_replicas, - unsigned *nr_effective, - bool *have_cache, - unsigned flags) +static enum bucket_alloc_ret +bucket_alloc_from_stripe(struct bch_fs *c, + struct open_buckets *ptrs, + struct write_point *wp, + struct bch_devs_mask *devs_may_alloc, + u16 target, + unsigned erasure_code, + unsigned nr_replicas, + unsigned *nr_effective, + bool *have_cache, + unsigned flags, + struct closure *cl) { struct dev_alloc_list devs_sorted; struct ec_stripe_head *h; @@ -476,31 +469,39 @@ static void bucket_alloc_from_stripe(struct bch_fs *c, unsigned i, ec_idx; if (!erasure_code) - return; + return 0; if (nr_replicas < 2) - return; + return 0; if (ec_open_bucket(c, ptrs)) - return; + return 0; - h = bch2_ec_stripe_head_get(c, target, 0, nr_replicas - 1); + h = bch2_ec_stripe_head_get(c, target, 0, nr_replicas - 1, + wp == &c->copygc_write_point, + cl); + if (IS_ERR(h)) + return -PTR_ERR(h); if (!h) - return; + return 0; devs_sorted = bch2_dev_alloc_list(c, &wp->stripe, devs_may_alloc); for (i = 0; i < devs_sorted.nr; i++) - open_bucket_for_each(c, &h->s->blocks, ob, ec_idx) + for (ec_idx = 0; ec_idx < h->s->nr_data; ec_idx++) { + if (!h->s->blocks[ec_idx]) + continue; + + ob = c->open_buckets + h->s->blocks[ec_idx]; if (ob->ptr.dev == devs_sorted.devs[i] && - !test_and_set_bit(h->s->data_block_idx[ec_idx], - h->s->blocks_allocated)) + !test_and_set_bit(ec_idx, h->s->blocks_allocated)) goto got_bucket; + } goto out_put_head; got_bucket: ca = bch_dev_bkey_exists(c, ob->ptr.dev); - ob->ec_idx = h->s->data_block_idx[ec_idx]; + ob->ec_idx = ec_idx; ob->ec = h->s; add_new_bucket(c, ptrs, devs_may_alloc, @@ -508,6 +509,7 @@ got_bucket: atomic_inc(&h->s->pin); out_put_head: bch2_ec_stripe_head_put(c, h); + return 0; } /* Sector allocator */ @@ -585,10 +587,13 @@ open_bucket_add_buckets(struct bch_fs *c, } if (!ec_open_bucket(c, ptrs)) { - bucket_alloc_from_stripe(c, ptrs, wp, &devs, + ret = bucket_alloc_from_stripe(c, ptrs, wp, &devs, target, erasure_code, nr_replicas, nr_effective, - have_cache, flags); + have_cache, flags, _cl); + if (ret == FREELIST_EMPTY || + ret == OPEN_BUCKETS_EMPTY) + return ret; if (*nr_effective >= nr_replicas) return 0; } @@ -634,10 +639,13 @@ void bch2_open_buckets_stop_dev(struct bch_fs *c, struct bch_dev *ca, if (!drop && ob->ec) { mutex_lock(&ob->ec->lock); - open_bucket_for_each(c, &ob->ec->blocks, ob2, j) - drop |= ob2->ptr.dev == ca->dev_idx; - 
open_bucket_for_each(c, &ob->ec->parity, ob2, j) + for (j = 0; j < ob->ec->new_stripe.key.v.nr_blocks; j++) { + if (!ob->ec->blocks[j]) + continue; + + ob2 = c->open_buckets + ob->ec->blocks[j]; drop |= ob2->ptr.dev == ca->dev_idx; + } mutex_unlock(&ob->ec->lock); } diff --git a/fs/bcachefs/alloc_types.h b/fs/bcachefs/alloc_types.h index 20705460bb0a..be164d6108bb 100644 --- a/fs/bcachefs/alloc_types.h +++ b/fs/bcachefs/alloc_types.h @@ -10,38 +10,12 @@ struct ec_bucket_buf; -/* There's two of these clocks, one for reads and one for writes: */ -struct bucket_clock { - /* - * "now" in (read/write) IO time - incremented whenever we do X amount - * of reads or writes. - * - * Goes with the bucket read/write prios: when we read or write to a - * bucket we reset the bucket's prio to the current hand; thus hand - - * prio = time since bucket was last read/written. - * - * The units are some amount (bytes/sectors) of data read/written, and - * the units can change on the fly if we need to rescale to fit - * everything in a u16 - your only guarantee is that the units are - * consistent. - */ - u16 hand; - u16 max_last_io; - - int rw; - - struct io_timer rescale; - struct mutex lock; -}; - -/* There is one reserve for each type of btree, one for prios and gens - * and one for moving GC */ enum alloc_reserve { - RESERVE_ALLOC = -1, - RESERVE_BTREE = 0, - RESERVE_MOVINGGC = 1, - RESERVE_NONE = 2, - RESERVE_NR = 3, + RESERVE_BTREE_MOVINGGC = -2, + RESERVE_BTREE = -1, + RESERVE_MOVINGGC = 0, + RESERVE_NONE = 1, + RESERVE_NR = 2, }; typedef FIFO(long) alloc_fifo; @@ -89,7 +63,6 @@ struct write_point { u64 last_used; unsigned long write_point; enum bch_data_type type; - bool is_ec; /* calculated based on how many pointers we're actually going to use: */ unsigned sectors_free; diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index d3d67cc71faa..8e363e2fa8c4 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -429,7 +429,9 @@ struct bch_dev { unsigned long *buckets_nouse; struct rw_semaphore bucket_lock; - struct bch_dev_usage __percpu *usage[2]; + struct bch_dev_usage *usage_base; + struct bch_dev_usage __percpu *usage[JOURNAL_BUF_NR]; + struct bch_dev_usage __percpu *usage_gc; /* Allocator: */ struct task_struct __rcu *alloc_thread; @@ -451,9 +453,6 @@ struct bch_dev { size_t fifo_last_bucket; - /* last calculated minimum prio */ - u16 max_last_bucket_io[2]; - size_t inc_gen_needs_gc; size_t inc_gen_really_needs_gc; @@ -473,6 +472,7 @@ struct bch_dev { atomic64_t rebalance_work; struct journal_device journal; + u64 prev_journal_sector; struct work_struct io_error_work; @@ -509,8 +509,9 @@ enum { BCH_FS_ERRORS_FIXED, /* misc: */ - BCH_FS_FIXED_GENS, - BCH_FS_ALLOC_WRITTEN, + BCH_FS_NEED_ANOTHER_GC, + BCH_FS_DELETED_NODES, + BCH_FS_NEED_ALLOC_WRITE, BCH_FS_REBUILD_REPLICAS, BCH_FS_HOLD_BTREE_WRITES, }; @@ -539,11 +540,13 @@ struct journal_keys { struct journal_key { enum btree_id btree_id:8; unsigned level:8; + bool allocated; struct bkey_i *k; u32 journal_seq; u32 journal_offset; } *d; size_t nr; + size_t size; u64 journal_seq_base; }; @@ -579,7 +582,10 @@ struct bch_fs { struct bch_replicas_cpu replicas_gc; struct mutex replicas_gc_lock; + struct journal_entry_res btree_root_journal_res; struct journal_entry_res replicas_journal_res; + struct journal_entry_res clock_journal_res; + struct journal_entry_res dev_usage_journal_res; struct bch_disk_groups_cpu __rcu *disk_groups; @@ -688,14 +694,6 @@ struct bch_fs { struct mutex usage_scratch_lock; struct bch_fs_usage 
*usage_scratch; - /* - * When we invalidate buckets, we use both the priority and the amount - * of good data to determine which buckets to reuse first - to weight - * those together consistently we keep track of the smallest nonzero - * priority of any bucket. - */ - struct bucket_clock bucket_clock[2]; - struct io_clock io_clock[2]; /* JOURNAL SEQ BLACKLIST */ @@ -759,7 +757,7 @@ struct bch_fs { ZSTD_parameters zstd_params; struct crypto_shash *sha256; - struct crypto_skcipher *chacha20; + struct crypto_sync_skcipher *chacha20; struct crypto_shash *poly1305; atomic64_t key_version; @@ -840,6 +838,7 @@ struct bch_fs { struct journal journal; struct list_head journal_entries; struct journal_keys journal_keys; + struct list_head journal_iters; u64 last_bucket_seq_cleanup; diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index 02a76c3d3acb..73eeeb10472a 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -341,7 +341,8 @@ static inline void bkey_init(struct bkey *k) x(reflink_v, 16) \ x(inline_data, 17) \ x(btree_ptr_v2, 18) \ - x(indirect_inline_data, 19) + x(indirect_inline_data, 19) \ + x(alloc_v2, 20) enum bch_bkey_type { #define x(name, nr) KEY_TYPE_##name = nr, @@ -551,9 +552,11 @@ struct bch_extent_stripe_ptr { #if defined(__LITTLE_ENDIAN_BITFIELD) __u64 type:5, block:8, - idx:51; + redundancy:4, + idx:47; #elif defined (__BIG_ENDIAN_BITFIELD) - __u64 idx:51, + __u64 idx:47, + redundancy:4, block:8, type:5; #endif @@ -603,13 +606,14 @@ struct bch_btree_ptr_v2 { __u64 mem_ptr; __le64 seq; __le16 sectors_written; - /* In case we ever decide to do variable size btree nodes: */ - __le16 sectors; + __le16 flags; struct bpos min_key; struct bch_extent_ptr start[0]; __u64 _data[0]; } __attribute__((packed, aligned(8))); +LE16_BITMASK(BTREE_PTR_RANGE_UPDATED, struct bch_btree_ptr_v2, flags, 0, 1); + struct bch_extent { struct bch_val v; @@ -634,8 +638,6 @@ struct bch_reservation { #define BKEY_EXTENT_VAL_U64s_MAX \ (1 + BKEY_EXTENT_PTR_U64s_MAX * (BCH_REPLICAS_MAX + 1)) -#define BKEY_PADDED(key) __BKEY_PADDED(key, BKEY_EXTENT_VAL_U64s_MAX) - /* * Maximum possible size of an entire extent, key + value: */ #define BKEY_EXTENT_U64s_MAX (BKEY_U64s + BKEY_EXTENT_VAL_U64s_MAX) @@ -800,35 +802,40 @@ struct bch_alloc { __u8 data[]; } __attribute__((packed, aligned(8))); -#define BCH_ALLOC_FIELDS() \ +#define BCH_ALLOC_FIELDS_V1() \ x(read_time, 16) \ x(write_time, 16) \ x(data_type, 8) \ x(dirty_sectors, 16) \ x(cached_sectors, 16) \ - x(oldest_gen, 8) + x(oldest_gen, 8) \ + x(stripe, 32) \ + x(stripe_redundancy, 8) + +struct bch_alloc_v2 { + struct bch_val v; + __u8 nr_fields; + __u8 gen; + __u8 oldest_gen; + __u8 data_type; + __u8 data[]; +} __attribute__((packed, aligned(8))); + +#define BCH_ALLOC_FIELDS_V2() \ + x(read_time, 64) \ + x(write_time, 64) \ + x(dirty_sectors, 16) \ + x(cached_sectors, 16) \ + x(stripe, 32) \ + x(stripe_redundancy, 8) enum { -#define x(name, bytes) BCH_ALLOC_FIELD_##name, - BCH_ALLOC_FIELDS() +#define x(name, _bits) BCH_ALLOC_FIELD_V1_##name, + BCH_ALLOC_FIELDS_V1() #undef x BCH_ALLOC_FIELD_NR }; -static const unsigned BCH_ALLOC_FIELD_BYTES[] = { -#define x(name, bits) [BCH_ALLOC_FIELD_##name] = bits / 8, - BCH_ALLOC_FIELDS() -#undef x -}; - -#define x(name, bits) + (bits / 8) -static const unsigned BKEY_ALLOC_VAL_U64s_MAX = - DIV_ROUND_UP(offsetof(struct bch_alloc, data) - BCH_ALLOC_FIELDS(), sizeof(u64)); -#undef x - -#define BKEY_ALLOC_U64s_MAX (BKEY_U64s + BKEY_ALLOC_VAL_U64s_MAX) - /* Quotas: */ enum 
quota_types { @@ -1132,8 +1139,8 @@ struct bch_sb_field_clean { struct bch_sb_field field; __le32 flags; - __le16 read_clock; - __le16 write_clock; + __le16 _read_clock; /* no longer used */ + __le16 _write_clock; __le64 journal_seq; union { @@ -1306,6 +1313,7 @@ LE64_BITMASK(BCH_SB_BACKGROUND_COMPRESSION_TYPE, LE64_BITMASK(BCH_SB_GC_RESERVE_BYTES, struct bch_sb, flags[2], 4, 64); LE64_BITMASK(BCH_SB_ERASURE_CODE, struct bch_sb, flags[3], 0, 16); +LE64_BITMASK(BCH_SB_METADATA_TARGET, struct bch_sb, flags[3], 16, 28); /* * Features: @@ -1333,15 +1341,23 @@ LE64_BITMASK(BCH_SB_ERASURE_CODE, struct bch_sb, flags[3], 0, 16); x(btree_updates_journalled, 13) \ x(reflink_inline_data, 14) \ x(new_varint, 15) \ - x(journal_no_flush, 16) + x(journal_no_flush, 16) \ + x(alloc_v2, 17) \ + x(extents_across_btree_nodes, 18) + +#define BCH_SB_FEATURES_ALWAYS \ + ((1ULL << BCH_FEATURE_new_extent_overwrite)| \ + (1ULL << BCH_FEATURE_extents_above_btree_updates)|\ + (1ULL << BCH_FEATURE_btree_updates_journalled)|\ + (1ULL << BCH_FEATURE_extents_across_btree_nodes)) #define BCH_SB_FEATURES_ALL \ - ((1ULL << BCH_FEATURE_new_siphash)| \ - (1ULL << BCH_FEATURE_new_extent_overwrite)| \ + (BCH_SB_FEATURES_ALWAYS| \ + (1ULL << BCH_FEATURE_new_siphash)| \ (1ULL << BCH_FEATURE_btree_ptr_v2)| \ - (1ULL << BCH_FEATURE_extents_above_btree_updates)|\ (1ULL << BCH_FEATURE_new_varint)| \ - (1ULL << BCH_FEATURE_journal_no_flush)) + (1ULL << BCH_FEATURE_journal_no_flush)| \ + (1ULL << BCH_FEATURE_alloc_v2)) enum bch_sb_feature { #define x(f, n) BCH_FEATURE_##f, @@ -1350,15 +1366,25 @@ enum bch_sb_feature { BCH_FEATURE_NR, }; +#define BCH_SB_COMPAT() \ + x(alloc_info, 0) \ + x(alloc_metadata, 1) \ + x(extents_above_btree_updates_done, 2) \ + x(bformat_overflow_done, 3) + enum bch_sb_compat { - BCH_COMPAT_FEAT_ALLOC_INFO = 0, - BCH_COMPAT_FEAT_ALLOC_METADATA = 1, +#define x(f, n) BCH_COMPAT_##f, + BCH_SB_COMPAT() +#undef x + BCH_COMPAT_NR, }; /* options: */ #define BCH_REPLICAS_MAX 4U +#define BCH_BKEY_PTRS_MAX 16U + enum bch_error_actions { BCH_ON_ERROR_CONTINUE = 0, BCH_ON_ERROR_RO = 1, @@ -1492,7 +1518,9 @@ static inline __u64 __bset_magic(struct bch_sb *sb) x(blacklist, 3) \ x(blacklist_v2, 4) \ x(usage, 5) \ - x(data_usage, 6) + x(data_usage, 6) \ + x(clock, 7) \ + x(dev_usage, 8) enum { #define x(f, nr) BCH_JSET_ENTRY_##f = nr, @@ -1540,6 +1568,30 @@ struct jset_entry_data_usage { struct bch_replicas_entry r; } __attribute__((packed)); +struct jset_entry_clock { + struct jset_entry entry; + __u8 rw; + __u8 pad[7]; + __le64 time; +} __attribute__((packed)); + +struct jset_entry_dev_usage_type { + __le64 buckets; + __le64 sectors; + __le64 fragmented; +} __attribute__((packed)); + +struct jset_entry_dev_usage { + struct jset_entry entry; + __le32 dev; + __u32 pad; + + __le64 buckets_ec; + __le64 buckets_unavailable; + + struct jset_entry_dev_usage_type d[]; +} __attribute__((packed)); + /* * On disk format for a journal entry: * seq is monotonically increasing; every journal entry has its own unique @@ -1562,8 +1614,8 @@ struct jset { __u8 encrypted_start[0]; - __le16 read_clock; - __le16 write_clock; + __le16 _read_clock; /* no longer used */ + __le16 _write_clock; /* Sequence number of oldest dirty journal entry */ __le64 last_seq; diff --git a/fs/bcachefs/bcachefs_ioctl.h b/fs/bcachefs/bcachefs_ioctl.h index 0e626b098d91..f679fc2151bc 100644 --- a/fs/bcachefs/bcachefs_ioctl.h +++ b/fs/bcachefs/bcachefs_ioctl.h @@ -14,6 +14,9 @@ #define BCH_FORCE_IF_DATA_DEGRADED (1 << 2) #define BCH_FORCE_IF_METADATA_DEGRADED (1 << 3) 
+#define BCH_FORCE_IF_LOST \ + (BCH_FORCE_IF_DATA_LOST| \ + BCH_FORCE_IF_METADATA_LOST) #define BCH_FORCE_IF_DEGRADED \ (BCH_FORCE_IF_DATA_DEGRADED| \ BCH_FORCE_IF_METADATA_DEGRADED) @@ -168,10 +171,11 @@ struct bch_ioctl_disk_set_state { }; enum bch_data_ops { - BCH_DATA_OP_SCRUB = 0, - BCH_DATA_OP_REREPLICATE = 1, - BCH_DATA_OP_MIGRATE = 2, - BCH_DATA_OP_NR = 3, + BCH_DATA_OP_SCRUB = 0, + BCH_DATA_OP_REREPLICATE = 1, + BCH_DATA_OP_MIGRATE = 2, + BCH_DATA_OP_REWRITE_OLD_NODES = 3, + BCH_DATA_OP_NR = 4, }; /* @@ -184,11 +188,13 @@ enum bch_data_ops { * job. The file descriptor is O_CLOEXEC. */ struct bch_ioctl_data { - __u32 op; + __u16 op; + __u8 start_btree; + __u8 end_btree; __u32 flags; - struct bpos start; - struct bpos end; + struct bpos start_pos; + struct bpos end_pos; union { struct { diff --git a/fs/bcachefs/bkey.c b/fs/bcachefs/bkey.c index c06d0a965be1..e1906f257ef2 100644 --- a/fs/bcachefs/bkey.c +++ b/fs/bcachefs/bkey.c @@ -551,7 +551,12 @@ void bch2_bkey_format_add_pos(struct bkey_format_state *s, struct bpos p) static void set_format_field(struct bkey_format *f, enum bch_bkey_fields i, unsigned bits, u64 offset) { - offset = bits == 64 ? 0 : min(offset, U64_MAX - ((1ULL << bits) - 1)); + unsigned unpacked_bits = bch2_bkey_format_current.bits_per_field[i]; + u64 unpacked_max = ~((~0ULL << 1) << (unpacked_bits - 1)); + + bits = min(bits, unpacked_bits); + + offset = bits == unpacked_bits ? 0 : min(offset, unpacked_max - ((1ULL << bits) - 1)); f->bits_per_field[i] = bits; f->field_offset[i] = cpu_to_le64(offset); diff --git a/fs/bcachefs/bkey.h b/fs/bcachefs/bkey.h index 2d2c640305e2..48821f6c09aa 100644 --- a/fs/bcachefs/bkey.h +++ b/fs/bcachefs/bkey.h @@ -170,6 +170,11 @@ static inline struct bpos bpos_min(struct bpos l, struct bpos r) return bkey_cmp(l, r) < 0 ? l : r; } +static inline struct bpos bpos_max(struct bpos l, struct bpos r) +{ + return bkey_cmp(l, r) > 0 ? 
l : r; +} + void bch2_bpos_swab(struct bpos *); void bch2_bkey_swab_key(const struct bkey_format *, struct bkey_packed *); @@ -525,6 +530,7 @@ BKEY_VAL_ACCESSORS(reflink_v); BKEY_VAL_ACCESSORS(inline_data); BKEY_VAL_ACCESSORS(btree_ptr_v2); BKEY_VAL_ACCESSORS(indirect_inline_data); +BKEY_VAL_ACCESSORS(alloc_v2); /* byte order helpers */ diff --git a/fs/bcachefs/bkey_buf.h b/fs/bcachefs/bkey_buf.h new file mode 100644 index 000000000000..0d7c67a959af --- /dev/null +++ b/fs/bcachefs/bkey_buf.h @@ -0,0 +1,60 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_BKEY_BUF_H +#define _BCACHEFS_BKEY_BUF_H + +#include "bcachefs.h" + +struct bkey_buf { + struct bkey_i *k; + u64 onstack[12]; +}; + +static inline void bch2_bkey_buf_realloc(struct bkey_buf *s, + struct bch_fs *c, unsigned u64s) +{ + if (s->k == (void *) s->onstack && + u64s > ARRAY_SIZE(s->onstack)) { + s->k = mempool_alloc(&c->large_bkey_pool, GFP_NOFS); + memcpy(s->k, s->onstack, sizeof(s->onstack)); + } +} + +static inline void bch2_bkey_buf_reassemble(struct bkey_buf *s, + struct bch_fs *c, + struct bkey_s_c k) +{ + bch2_bkey_buf_realloc(s, c, k.k->u64s); + bkey_reassemble(s->k, k); +} + +static inline void bch2_bkey_buf_copy(struct bkey_buf *s, + struct bch_fs *c, + struct bkey_i *src) +{ + bch2_bkey_buf_realloc(s, c, src->k.u64s); + bkey_copy(s->k, src); +} + +static inline void bch2_bkey_buf_unpack(struct bkey_buf *s, + struct bch_fs *c, + struct btree *b, + struct bkey_packed *src) +{ + bch2_bkey_buf_realloc(s, c, BKEY_U64s + + bkeyp_val_u64s(&b->format, src)); + bch2_bkey_unpack(b, s->k, src); +} + +static inline void bch2_bkey_buf_init(struct bkey_buf *s) +{ + s->k = (void *) s->onstack; +} + +static inline void bch2_bkey_buf_exit(struct bkey_buf *s, struct bch_fs *c) +{ + if (s->k != (void *) s->onstack) + mempool_free(s->k, &c->large_bkey_pool); + s->k = NULL; +} + +#endif /* _BCACHEFS_BKEY_BUF_H */ diff --git a/fs/bcachefs/bkey_on_stack.h b/fs/bcachefs/bkey_on_stack.h deleted file mode 100644 index f607a0cb37ed..000000000000 --- a/fs/bcachefs/bkey_on_stack.h +++ /dev/null @@ -1,43 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_BKEY_ON_STACK_H -#define _BCACHEFS_BKEY_ON_STACK_H - -#include "bcachefs.h" - -struct bkey_on_stack { - struct bkey_i *k; - u64 onstack[12]; -}; - -static inline void bkey_on_stack_realloc(struct bkey_on_stack *s, - struct bch_fs *c, unsigned u64s) -{ - if (s->k == (void *) s->onstack && - u64s > ARRAY_SIZE(s->onstack)) { - s->k = mempool_alloc(&c->large_bkey_pool, GFP_NOFS); - memcpy(s->k, s->onstack, sizeof(s->onstack)); - } -} - -static inline void bkey_on_stack_reassemble(struct bkey_on_stack *s, - struct bch_fs *c, - struct bkey_s_c k) -{ - bkey_on_stack_realloc(s, c, k.k->u64s); - bkey_reassemble(s->k, k); -} - -static inline void bkey_on_stack_init(struct bkey_on_stack *s) -{ - s->k = (void *) s->onstack; -} - -static inline void bkey_on_stack_exit(struct bkey_on_stack *s, - struct bch_fs *c) -{ - if (s->k != (void *) s->onstack) - mempool_free(s->k, &c->large_bkey_pool); - s->k = NULL; -} - -#endif /* _BCACHEFS_BKEY_ON_STACK_H */ diff --git a/fs/bcachefs/bkey_sort.c b/fs/bcachefs/bkey_sort.c index 99e0a4011fae..2e1d9cd65f43 100644 --- a/fs/bcachefs/bkey_sort.c +++ b/fs/bcachefs/bkey_sort.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" -#include "bkey_on_stack.h" +#include "bkey_buf.h" #include "bkey_sort.h" #include "bset.h" #include "extents.h" @@ -187,11 +187,11 @@ bch2_sort_repack_merge(struct bch_fs *c, bool filter_whiteouts) { 
struct bkey_packed *out = vstruct_last(dst), *k_packed; - struct bkey_on_stack k; + struct bkey_buf k; struct btree_nr_keys nr; memset(&nr, 0, sizeof(nr)); - bkey_on_stack_init(&k); + bch2_bkey_buf_init(&k); while ((k_packed = bch2_btree_node_iter_next_all(iter, src))) { if (filter_whiteouts && bkey_whiteout(k_packed)) @@ -204,7 +204,7 @@ bch2_sort_repack_merge(struct bch_fs *c, * node; we have to make a copy of the entire key before calling * normalize */ - bkey_on_stack_realloc(&k, c, k_packed->u64s + BKEY_U64s); + bch2_bkey_buf_realloc(&k, c, k_packed->u64s + BKEY_U64s); bch2_bkey_unpack(src, k.k, k_packed); if (filter_whiteouts && @@ -215,7 +215,7 @@ bch2_sort_repack_merge(struct bch_fs *c, } dst->u64s = cpu_to_le16((u64 *) out - dst->_data); - bkey_on_stack_exit(&k, c); + bch2_bkey_buf_exit(&k, c); return nr; } @@ -315,11 +315,11 @@ bch2_extent_sort_fix_overlapping(struct bch_fs *c, struct bset *dst, struct bkey l_unpacked, r_unpacked; struct bkey_s l, r; struct btree_nr_keys nr; - struct bkey_on_stack split; + struct bkey_buf split; unsigned i; memset(&nr, 0, sizeof(nr)); - bkey_on_stack_init(&split); + bch2_bkey_buf_init(&split); sort_iter_sort(iter, extent_sort_fix_overlapping_cmp); for (i = 0; i < iter->used;) { @@ -379,7 +379,7 @@ bch2_extent_sort_fix_overlapping(struct bch_fs *c, struct bset *dst, /* * r wins, but it overlaps in the middle of l - split l: */ - bkey_on_stack_reassemble(&split, c, l.s_c); + bch2_bkey_buf_reassemble(&split, c, l.s_c); bch2_cut_back(bkey_start_pos(r.k), split.k); bch2_cut_front_s(r.k->p, l); @@ -398,7 +398,7 @@ bch2_extent_sort_fix_overlapping(struct bch_fs *c, struct bset *dst, dst->u64s = cpu_to_le16((u64 *) out - dst->_data); - bkey_on_stack_exit(&split, c); + bch2_bkey_buf_exit(&split, c); return nr; } diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index 1dc1f7460f9a..19c219cb317b 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -1,11 +1,13 @@ // SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" +#include "bkey_buf.h" #include "btree_cache.h" #include "btree_io.h" #include "btree_iter.h" #include "btree_locking.h" #include "debug.h" +#include "error.h" #include <linux/prefetch.h> #include <linux/sched/mm.h> @@ -81,8 +83,7 @@ static int btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp) if (!b->data) return -ENOMEM; - b->aux_data = __vmalloc(btree_aux_data_bytes(b), gfp, - PAGE_KERNEL_EXEC); + b->aux_data = vmalloc_exec(btree_aux_data_bytes(b), gfp); if (!b->aux_data) { kvpfree(b->data, btree_bytes(c)); b->data = NULL; @@ -812,9 +813,12 @@ lock_node: return ERR_PTR(-EIO); } - EBUG_ON(b->c.btree_id != iter->btree_id || - BTREE_NODE_LEVEL(b->data) != level || - bkey_cmp(b->data->max_key, k->k.p)); + EBUG_ON(b->c.btree_id != iter->btree_id); + EBUG_ON(BTREE_NODE_LEVEL(b->data) != level); + EBUG_ON(bkey_cmp(b->data->max_key, k->k.p)); + EBUG_ON(b->key.k.type == KEY_TYPE_btree_ptr_v2 && + bkey_cmp(b->data->min_key, + bkey_i_to_btree_ptr_v2(&b->key)->v.min_key)); return b; } @@ -822,7 +826,8 @@ lock_node: struct btree *bch2_btree_node_get_noiter(struct bch_fs *c, const struct bkey_i *k, enum btree_id btree_id, - unsigned level) + unsigned level, + bool nofill) { struct btree_cache *bc = &c->btree_cache; struct btree *b; @@ -837,6 +842,9 @@ struct btree *bch2_btree_node_get_noiter(struct bch_fs *c, retry: b = btree_cache_find(bc, k); if (unlikely(!b)) { + if (nofill) + goto out; + b = bch2_btree_node_fill(c, NULL, k, btree_id, level, SIX_LOCK_read, true); @@ -844,8 +852,12 @@ retry: if 
(!b) goto retry; + if (IS_ERR(b) && + !bch2_btree_cache_cannibalize_lock(c, NULL)) + goto retry; + if (IS_ERR(b)) - return b; + goto out; } else { lock_node: ret = six_lock_read(&b->c.lock, lock_node_check_fn, (void *) k); @@ -880,13 +892,18 @@ lock_node: if (unlikely(btree_node_read_error(b))) { six_unlock_read(&b->c.lock); - return ERR_PTR(-EIO); + b = ERR_PTR(-EIO); + goto out; } - EBUG_ON(b->c.btree_id != btree_id || - BTREE_NODE_LEVEL(b->data) != level || - bkey_cmp(b->data->max_key, k->k.p)); - + EBUG_ON(b->c.btree_id != btree_id); + EBUG_ON(BTREE_NODE_LEVEL(b->data) != level); + EBUG_ON(bkey_cmp(b->data->max_key, k->k.p)); + EBUG_ON(b->key.k.type == KEY_TYPE_btree_ptr_v2 && + bkey_cmp(b->data->min_key, + bkey_i_to_btree_ptr_v2(&b->key)->v.min_key)); +out: + bch2_btree_cache_cannibalize_unlock(c); return b; } @@ -899,10 +916,12 @@ struct btree *bch2_btree_node_get_sibling(struct bch_fs *c, struct btree *parent; struct btree_node_iter node_iter; struct bkey_packed *k; - BKEY_PADDED(k) tmp; + struct bkey_buf tmp; struct btree *ret = NULL; unsigned level = b->c.level; + bch2_bkey_buf_init(&tmp); + parent = btree_iter_node(iter, level + 1); if (!parent) return NULL; @@ -936,9 +955,9 @@ struct btree *bch2_btree_node_get_sibling(struct bch_fs *c, if (!k) goto out; - bch2_bkey_unpack(parent, &tmp.k, k); + bch2_bkey_buf_unpack(&tmp, c, parent, k); - ret = bch2_btree_node_get(c, iter, &tmp.k, level, + ret = bch2_btree_node_get(c, iter, tmp.k, level, SIX_LOCK_intent, _THIS_IP_); if (PTR_ERR_OR_ZERO(ret) == -EINTR && !trans->nounlock) { @@ -958,7 +977,7 @@ struct btree *bch2_btree_node_get_sibling(struct bch_fs *c, if (sib == btree_prev_sib) btree_node_unlock(iter, level); - ret = bch2_btree_node_get(c, iter, &tmp.k, level, + ret = bch2_btree_node_get(c, iter, tmp.k, level, SIX_LOCK_intent, _THIS_IP_); /* @@ -993,30 +1012,46 @@ out: if (sib != btree_prev_sib) swap(n1, n2); - BUG_ON(bkey_cmp(bkey_successor(n1->key.k.p), - n2->data->min_key)); + if (bkey_cmp(bkey_successor(n1->key.k.p), + n2->data->min_key)) { + char buf1[200], buf2[200]; + + bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(&n1->key)); + bch2_bkey_val_to_text(&PBUF(buf2), c, bkey_i_to_s_c(&n2->key)); + + bch2_fs_inconsistent(c, "btree topology error at btree %s level %u:\n" + "prev: %s\n" + "next: %s\n", + bch2_btree_ids[iter->btree_id], level, + buf1, buf2); + + six_unlock_intent(&ret->c.lock); + ret = NULL; + } } bch2_btree_trans_verify_locks(trans); + bch2_bkey_buf_exit(&tmp, c); + return ret; } void bch2_btree_node_prefetch(struct bch_fs *c, struct btree_iter *iter, - const struct bkey_i *k, unsigned level) + const struct bkey_i *k, + enum btree_id btree_id, unsigned level) { struct btree_cache *bc = &c->btree_cache; struct btree *b; - BUG_ON(!btree_node_locked(iter, level + 1)); + BUG_ON(iter && !btree_node_locked(iter, level + 1)); BUG_ON(level >= BTREE_MAX_DEPTH); b = btree_cache_find(bc, k); if (b) return; - bch2_btree_node_fill(c, iter, k, iter->btree_id, - level, SIX_LOCK_read, false); + bch2_btree_node_fill(c, iter, k, btree_id, level, SIX_LOCK_read, false); } void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c, @@ -1068,6 +1103,7 @@ void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c, void bch2_btree_cache_to_text(struct printbuf *out, struct bch_fs *c) { - pr_buf(out, "nr nodes:\t%u\n", c->btree_cache.used); - pr_buf(out, "nr dirty:\t%u\n", atomic_read(&c->btree_cache.dirty)); + pr_buf(out, "nr nodes:\t\t%u\n", c->btree_cache.used); + pr_buf(out, "nr dirty:\t\t%u\n", 
atomic_read(&c->btree_cache.dirty)); + pr_buf(out, "cannibalize lock:\t%p\n", c->btree_cache.alloc_lock); } diff --git a/fs/bcachefs/btree_cache.h b/fs/bcachefs/btree_cache.h index e766ef552ce7..5fffae92effb 100644 --- a/fs/bcachefs/btree_cache.h +++ b/fs/bcachefs/btree_cache.h @@ -26,13 +26,13 @@ struct btree *bch2_btree_node_get(struct bch_fs *, struct btree_iter *, enum six_lock_type, unsigned long); struct btree *bch2_btree_node_get_noiter(struct bch_fs *, const struct bkey_i *, - enum btree_id, unsigned); + enum btree_id, unsigned, bool); struct btree *bch2_btree_node_get_sibling(struct bch_fs *, struct btree_iter *, struct btree *, enum btree_node_sibling); void bch2_btree_node_prefetch(struct bch_fs *, struct btree_iter *, - const struct bkey_i *, unsigned); + const struct bkey_i *, enum btree_id, unsigned); void bch2_fs_btree_cache_exit(struct bch_fs *); int bch2_fs_btree_cache_init(struct bch_fs *); diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 6268ea637d19..3a1518999c4b 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -8,7 +8,7 @@ #include "alloc_background.h" #include "alloc_foreground.h" #include "bkey_methods.h" -#include "bkey_on_stack.h" +#include "bkey_buf.h" #include "btree_locking.h" #include "btree_update_interior.h" #include "btree_io.h" @@ -50,39 +50,228 @@ static inline void gc_pos_set(struct bch_fs *c, struct gc_pos new_pos) __gc_pos_set(c, new_pos); } +/* + * Missing: if an interior btree node is empty, we need to do something - + * perhaps just kill it + */ static int bch2_gc_check_topology(struct bch_fs *c, - struct bkey_s_c k, - struct bpos *expected_start, - struct bpos expected_end, + struct btree *b, + struct bkey_buf *prev, + struct bkey_buf cur, bool is_last) { + struct bpos node_start = b->data->min_key; + struct bpos node_end = b->data->max_key; + struct bpos expected_start = bkey_deleted(&prev->k->k) + ? 
node_start + : bkey_successor(prev->k->k.p); + char buf1[200], buf2[200]; + bool update_min = false; + bool update_max = false; int ret = 0; - if (k.k->type == KEY_TYPE_btree_ptr_v2) { - struct bkey_s_c_btree_ptr_v2 bp = bkey_s_c_to_btree_ptr_v2(k); + if (cur.k->k.type == KEY_TYPE_btree_ptr_v2) { + struct bkey_i_btree_ptr_v2 *bp = bkey_i_to_btree_ptr_v2(cur.k); + + if (bkey_deleted(&prev->k->k)) + scnprintf(buf1, sizeof(buf1), "start of node: %llu:%llu", + node_start.inode, + node_start.offset); + else + bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(prev->k)); + + if (fsck_err_on(bkey_cmp(expected_start, bp->v.min_key), c, + "btree node with incorrect min_key at btree %s level %u:\n" + " prev %s\n" + " cur %s", + bch2_btree_ids[b->c.btree_id], b->c.level, + buf1, + (bch2_bkey_val_to_text(&PBUF(buf2), c, bkey_i_to_s_c(cur.k)), buf2))) + update_min = true; + } + + if (fsck_err_on(is_last && + bkey_cmp(cur.k->k.p, node_end), c, + "btree node with incorrect max_key at btree %s level %u:\n" + " %s\n" + " expected %s", + bch2_btree_ids[b->c.btree_id], b->c.level, + (bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(cur.k)), buf1), + (bch2_bpos_to_text(&PBUF(buf2), node_end), buf2))) + update_max = true; + + bch2_bkey_buf_copy(prev, c, cur.k); + + if (update_min || update_max) { + struct bkey_i *new; + struct bkey_i_btree_ptr_v2 *bp = NULL; + struct btree *n; + + if (update_max) { + ret = bch2_journal_key_delete(c, b->c.btree_id, + b->c.level, cur.k->k.p); + if (ret) + return ret; + } + + new = kmalloc(bkey_bytes(&cur.k->k), GFP_KERNEL); + if (!new) { + bch_err(c, "%s: error allocating new key", __func__); + return -ENOMEM; + } + + bkey_copy(new, cur.k); + + if (new->k.type == KEY_TYPE_btree_ptr_v2) + bp = bkey_i_to_btree_ptr_v2(new); + + if (update_min) + bp->v.min_key = expected_start; + if (update_max) + new->k.p = node_end; + if (bp) + SET_BTREE_PTR_RANGE_UPDATED(&bp->v, true); + + ret = bch2_journal_key_insert(c, b->c.btree_id, b->c.level, new); + if (ret) { + kfree(new); + return ret; + } - if (fsck_err_on(bkey_cmp(*expected_start, bp.v->min_key), c, - "btree node with incorrect min_key: got %llu:%llu, should be %llu:%llu", - bp.v->min_key.inode, - bp.v->min_key.offset, - expected_start->inode, - expected_start->offset)) { - BUG(); + n = bch2_btree_node_get_noiter(c, cur.k, b->c.btree_id, + b->c.level - 1, true); + if (n) { + mutex_lock(&c->btree_cache.lock); + bch2_btree_node_hash_remove(&c->btree_cache, n); + + bkey_copy(&n->key, new); + if (update_min) + n->data->min_key = expected_start; + if (update_max) + n->data->max_key = node_end; + + ret = __bch2_btree_node_hash_insert(&c->btree_cache, n); + BUG_ON(ret); + mutex_unlock(&c->btree_cache.lock); + six_unlock_read(&n->c.lock); } } +fsck_err: + return ret; +} - *expected_start = bkey_cmp(k.k->p, POS_MAX) - ? 
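
/*
 * Sketch of the expected_start computation in bch2_gc_check_topology()
 * above (hypothetical helper): the first child must begin at the
 * parent's min_key, each later child immediately after the previous
 * child's max key.
 */
static struct bpos expected_child_start(struct bpos node_min,
					const struct bkey *prev)
{
	return bkey_deleted(prev) ? node_min : bkey_successor(prev->p);
}
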
bkey_successor(k.k->p) - : k.k->p; +static int bch2_check_fix_ptrs(struct bch_fs *c, enum btree_id btree_id, + unsigned level, bool is_root, + struct bkey_s_c *k) +{ + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(*k); + const union bch_extent_entry *entry; + struct extent_ptr_decoded p = { 0 }; + bool do_update = false; + int ret = 0; - if (fsck_err_on(is_last && - bkey_cmp(k.k->p, expected_end), c, - "btree node with incorrect max_key: got %llu:%llu, should be %llu:%llu", - k.k->p.inode, - k.k->p.offset, - expected_end.inode, - expected_end.offset)) { - BUG(); + bkey_for_each_ptr_decode(k->k, ptrs, p, entry) { + struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev); + struct bucket *g = PTR_BUCKET(ca, &p.ptr, true); + struct bucket *g2 = PTR_BUCKET(ca, &p.ptr, false); + + if (fsck_err_on(!g->gen_valid, c, + "bucket %u:%zu data type %s ptr gen %u missing in alloc btree", + p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), + bch2_data_types[ptr_data_type(k->k, &p.ptr)], + p.ptr.gen)) { + if (p.ptr.cached) { + g2->_mark.gen = g->_mark.gen = p.ptr.gen; + g2->gen_valid = g->gen_valid = true; + set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags); + } else { + do_update = true; + } + } + + if (fsck_err_on(gen_cmp(p.ptr.gen, g->mark.gen) > 0, c, + "bucket %u:%zu data type %s ptr gen in the future: %u > %u", + p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), + bch2_data_types[ptr_data_type(k->k, &p.ptr)], + p.ptr.gen, g->mark.gen)) { + if (p.ptr.cached) { + g2->_mark.gen = g->_mark.gen = p.ptr.gen; + g2->gen_valid = g->gen_valid = true; + g2->_mark.data_type = 0; + g2->_mark.dirty_sectors = 0; + g2->_mark.cached_sectors = 0; + set_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags); + set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags); + } else { + do_update = true; + } + } + + if (fsck_err_on(!p.ptr.cached && + gen_cmp(p.ptr.gen, g->mark.gen) < 0, c, + "bucket %u:%zu data type %s stale dirty ptr: %u < %u", + p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), + bch2_data_types[ptr_data_type(k->k, &p.ptr)], + p.ptr.gen, g->mark.gen)) + do_update = true; + + if (p.has_ec) { + struct stripe *m = genradix_ptr(&c->stripes[true], p.ec.idx); + + if (fsck_err_on(!m || !m->alive, c, + "pointer to nonexistent stripe %llu", + (u64) p.ec.idx)) + do_update = true; + } + } + + if (do_update) { + struct bkey_ptrs ptrs; + union bch_extent_entry *entry; + struct bch_extent_ptr *ptr; + struct bkey_i *new; + + if (is_root) { + bch_err(c, "cannot update btree roots yet"); + return -EINVAL; + } + + new = kmalloc(bkey_bytes(k->k), GFP_KERNEL); + if (!new) { + bch_err(c, "%s: error allocating new key", __func__); + return -ENOMEM; + } + + bkey_reassemble(new, *k); + + bch2_bkey_drop_ptrs(bkey_i_to_s(new), ptr, ({ + struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); + struct bucket *g = PTR_BUCKET(ca, ptr, true); + + (ptr->cached && + (!g->gen_valid || gen_cmp(ptr->gen, g->mark.gen) > 0)) || + (!ptr->cached && + gen_cmp(ptr->gen, g->mark.gen) < 0); + })); +again: + ptrs = bch2_bkey_ptrs(bkey_i_to_s(new)); + bkey_extent_entry_for_each(ptrs, entry) { + if (extent_entry_type(entry) == BCH_EXTENT_ENTRY_stripe_ptr) { + struct stripe *m = genradix_ptr(&c->stripes[true], + entry->stripe_ptr.idx); + + if (!m || !m->alive) { + bch2_bkey_extent_entry_drop(new, entry); + goto again; + } + } + } + + ret = bch2_journal_key_insert(c, btree_id, level, new); + if (ret) + kfree(new); + else + *k = bkey_i_to_s_c(new); } fsck_err: return ret; @@ -90,7 +279,9 @@ fsck_err: /* marking of btree keys/nodes: */ -static int bch2_gc_mark_key(struct bch_fs *c, struct bkey_s_c k, +static int 
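
/*
 * The fsck checks in bch2_check_fix_ptrs() above classify each pointer
 * against its bucket's generation; a sketch, assuming gen_cmp() is the
 * wrapping signed comparison used here (enum and helper hypothetical):
 */
enum ptr_gen_state { PTR_GEN_OK, PTR_GEN_FUTURE, PTR_GEN_STALE };

static enum ptr_gen_state classify_ptr_gen(u8 ptr_gen, u8 bucket_gen)
{
	if (gen_cmp(ptr_gen, bucket_gen) > 0)
		return PTR_GEN_FUTURE;	/* alloc info behind: repaired, may need another GC pass */
	if (gen_cmp(ptr_gen, bucket_gen) < 0)
		return PTR_GEN_STALE;	/* stale dirty pointers get dropped via do_update */
	return PTR_GEN_OK;
}
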
bch2_gc_mark_key(struct bch_fs *c, enum btree_id btree_id, + unsigned level, bool is_root, + struct bkey_s_c k, u8 *max_stale, bool initial) { struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); @@ -104,7 +295,6 @@ static int bch2_gc_mark_key(struct bch_fs *c, struct bkey_s_c k, BUG_ON(bch2_journal_seq_verify && k.k->version.lo > journal_cur_seq(&c->journal)); - /* XXX change to fsck check */ if (fsck_err_on(k.k->version.lo > atomic64_read(&c->key_version), c, "key version number higher than recorded: %llu > %llu", k.k->version.lo, @@ -116,37 +306,13 @@ static int bch2_gc_mark_key(struct bch_fs *c, struct bkey_s_c k, "superblock not marked as containing replicas (type %u)", k.k->type)) { ret = bch2_mark_bkey_replicas(c, k); - if (ret) - return ret; - } - - bkey_for_each_ptr(ptrs, ptr) { - struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); - struct bucket *g = PTR_BUCKET(ca, ptr, true); - struct bucket *g2 = PTR_BUCKET(ca, ptr, false); - - if (mustfix_fsck_err_on(!g->gen_valid, c, - "bucket %u:%zu data type %s ptr gen %u missing in alloc btree", - ptr->dev, PTR_BUCKET_NR(ca, ptr), - bch2_data_types[ptr_data_type(k.k, ptr)], - ptr->gen)) { - g2->_mark.gen = g->_mark.gen = ptr->gen; - g2->gen_valid = g->gen_valid = true; - } - - if (mustfix_fsck_err_on(gen_cmp(ptr->gen, g->mark.gen) > 0, c, - "bucket %u:%zu data type %s ptr gen in the future: %u > %u", - ptr->dev, PTR_BUCKET_NR(ca, ptr), - bch2_data_types[ptr_data_type(k.k, ptr)], - ptr->gen, g->mark.gen)) { - g2->_mark.gen = g->_mark.gen = ptr->gen; - g2->gen_valid = g->gen_valid = true; - g2->_mark.data_type = 0; - g2->_mark.dirty_sectors = 0; - g2->_mark.cached_sectors = 0; - set_bit(BCH_FS_FIXED_GENS, &c->flags); + if (ret) { + bch_err(c, "error marking bkey replicas: %i", ret); + goto err; } } + + ret = bch2_check_fix_ptrs(c, btree_id, level, is_root, &k); } bkey_for_each_ptr(ptrs, ptr) { @@ -161,16 +327,19 @@ static int bch2_gc_mark_key(struct bch_fs *c, struct bkey_s_c k, bch2_mark_key(c, k, 0, k.k->size, NULL, 0, flags); fsck_err: +err: + if (ret) + bch_err(c, "%s: ret %i", __func__, ret); return ret; } static int btree_gc_mark_node(struct bch_fs *c, struct btree *b, u8 *max_stale, bool initial) { - struct bpos next_node_start = b->data->min_key; struct btree_node_iter iter; struct bkey unpacked; struct bkey_s_c k; + struct bkey_buf prev, cur; int ret = 0; *max_stale = 0; @@ -179,37 +348,42 @@ static int btree_gc_mark_node(struct bch_fs *c, struct btree *b, u8 *max_stale, return 0; bch2_btree_node_iter_init_from_start(&iter, b); + bch2_bkey_buf_init(&prev); + bch2_bkey_buf_init(&cur); + bkey_init(&prev.k->k); while ((k = bch2_btree_node_iter_peek_unpack(&iter, b, &unpacked)).k) { bch2_bkey_debugcheck(c, b, k); - ret = bch2_gc_mark_key(c, k, max_stale, initial); + ret = bch2_gc_mark_key(c, b->c.btree_id, b->c.level, false, + k, max_stale, initial); if (ret) break; bch2_btree_node_iter_advance(&iter, b); if (b->c.level) { - ret = bch2_gc_check_topology(c, k, - &next_node_start, - b->data->max_key, + bch2_bkey_buf_reassemble(&cur, c, k); + + ret = bch2_gc_check_topology(c, b, &prev, cur, bch2_btree_node_iter_end(&iter)); if (ret) break; } } + bch2_bkey_buf_exit(&cur, c); + bch2_bkey_buf_exit(&prev, c); return ret; } static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id, - bool initial, bool metadata_only) + bool initial) { struct btree_trans trans; struct btree_iter *iter; struct btree *b; - unsigned depth = metadata_only ? 1 - : bch2_expensive_debug_checks ? 0 + unsigned depth = bch2_expensive_debug_checks ? 
0 : !btree_node_type_needs_gc(btree_id) ? 1 : 0; u8 max_stale = 0; @@ -233,7 +407,6 @@ static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id, if (max_stale > 64) bch2_btree_node_rewrite(c, iter, b->data->keys.seq, - BTREE_INSERT_USE_RESERVE| BTREE_INSERT_NOWAIT| BTREE_INSERT_GC_LOCK_HELD); else if (!bch2_btree_gc_rewrite_disabled && @@ -253,7 +426,8 @@ static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id, mutex_lock(&c->btree_root_lock); b = c->btree_roots[btree_id].b; if (!btree_node_fake(b)) - ret = bch2_gc_mark_key(c, bkey_i_to_s_c(&b->key), + ret = bch2_gc_mark_key(c, b->c.btree_id, b->c.level, true, + bkey_i_to_s_c(&b->key), &max_stale, initial); gc_pos_set(c, gc_pos_btree_root(b->c.btree_id)); mutex_unlock(&c->btree_root_lock); @@ -262,16 +436,18 @@ static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id, } static int bch2_gc_btree_init_recurse(struct bch_fs *c, struct btree *b, - struct journal_keys *journal_keys, unsigned target_depth) { struct btree_and_journal_iter iter; struct bkey_s_c k; - struct bpos next_node_start = b->data->min_key; + struct bkey_buf cur, prev; u8 max_stale = 0; int ret = 0; - bch2_btree_and_journal_iter_init_node_iter(&iter, journal_keys, b); + bch2_btree_and_journal_iter_init_node_iter(&iter, c, b); + bch2_bkey_buf_init(&prev); + bch2_bkey_buf_init(&cur); + bkey_init(&prev.k->k); while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) { bch2_bkey_debugcheck(c, b, k); @@ -279,57 +455,82 @@ static int bch2_gc_btree_init_recurse(struct bch_fs *c, struct btree *b, BUG_ON(bkey_cmp(k.k->p, b->data->min_key) < 0); BUG_ON(bkey_cmp(k.k->p, b->data->max_key) > 0); - ret = bch2_gc_mark_key(c, k, &max_stale, true); - if (ret) + ret = bch2_gc_mark_key(c, b->c.btree_id, b->c.level, false, + k, &max_stale, true); + if (ret) { + bch_err(c, "%s: error %i from bch2_gc_mark_key", __func__, ret); break; + } if (b->c.level) { - struct btree *child; - BKEY_PADDED(k) tmp; - - bkey_reassemble(&tmp.k, k); - k = bkey_i_to_s_c(&tmp.k); + bch2_bkey_buf_reassemble(&cur, c, k); + k = bkey_i_to_s_c(cur.k); bch2_btree_and_journal_iter_advance(&iter); - ret = bch2_gc_check_topology(c, k, - &next_node_start, - b->data->max_key, + ret = bch2_gc_check_topology(c, b, + &prev, cur, !bch2_btree_and_journal_iter_peek(&iter).k); if (ret) break; + } else { + bch2_btree_and_journal_iter_advance(&iter); + } + } - if (b->c.level > target_depth) { - child = bch2_btree_node_get_noiter(c, &tmp.k, - b->c.btree_id, b->c.level - 1); - ret = PTR_ERR_OR_ZERO(child); - if (ret) - break; + if (b->c.level > target_depth) { + bch2_btree_and_journal_iter_exit(&iter); + bch2_btree_and_journal_iter_init_node_iter(&iter, c, b); + + while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) { + struct btree *child; - ret = bch2_gc_btree_init_recurse(c, child, - journal_keys, target_depth); - six_unlock_read(&child->c.lock); + bch2_bkey_buf_reassemble(&cur, c, k); + bch2_btree_and_journal_iter_advance(&iter); + child = bch2_btree_node_get_noiter(c, cur.k, + b->c.btree_id, b->c.level - 1, + false); + ret = PTR_ERR_OR_ZERO(child); + + if (fsck_err_on(ret == -EIO, c, + "unreadable btree node")) { + ret = bch2_journal_key_delete(c, b->c.btree_id, + b->c.level, cur.k->k.p); if (ret) - break; + return ret; + + set_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags); + continue; } - } else { - bch2_btree_and_journal_iter_advance(&iter); + + if (ret) { + bch_err(c, "%s: error %i getting btree node", + __func__, ret); + break; + } + + ret = bch2_gc_btree_init_recurse(c, child, + target_depth); + 
six_unlock_read(&child->c.lock); + + if (ret) + break; } } - +fsck_err: + bch2_bkey_buf_exit(&cur, c); + bch2_bkey_buf_exit(&prev, c); + bch2_btree_and_journal_iter_exit(&iter); return ret; } static int bch2_gc_btree_init(struct bch_fs *c, - struct journal_keys *journal_keys, - enum btree_id btree_id, - bool metadata_only) + enum btree_id btree_id) { struct btree *b; - unsigned target_depth = metadata_only ? 1 - : bch2_expensive_debug_checks ? 0 - : !btree_node_type_needs_gc(btree_id) ? 1 + unsigned target_depth = bch2_expensive_debug_checks ? 0 + : !btree_node_type_needs_gc(btree_id) ? 1 : 0; u8 max_stale = 0; int ret = 0; @@ -355,15 +556,17 @@ static int bch2_gc_btree_init(struct bch_fs *c, } if (b->c.level >= target_depth) - ret = bch2_gc_btree_init_recurse(c, b, - journal_keys, target_depth); + ret = bch2_gc_btree_init_recurse(c, b, target_depth); if (!ret) - ret = bch2_gc_mark_key(c, bkey_i_to_s_c(&b->key), + ret = bch2_gc_mark_key(c, b->c.btree_id, b->c.level, true, + bkey_i_to_s_c(&b->key), &max_stale, true); fsck_err: six_unlock_read(&b->c.lock); + if (ret) + bch_err(c, "%s: ret %i", __func__, ret); return ret; } @@ -373,8 +576,7 @@ static inline int btree_id_gc_phase_cmp(enum btree_id l, enum btree_id r) (int) btree_id_to_gc_phase(r); } -static int bch2_gc_btrees(struct bch_fs *c, struct journal_keys *journal_keys, - bool initial, bool metadata_only) +static int bch2_gc_btrees(struct bch_fs *c, bool initial) { enum btree_id ids[BTREE_ID_NR]; unsigned i; @@ -386,11 +588,12 @@ static int bch2_gc_btrees(struct bch_fs *c, struct journal_keys *journal_keys, for (i = 0; i < BTREE_ID_NR; i++) { enum btree_id id = ids[i]; int ret = initial - ? bch2_gc_btree_init(c, journal_keys, - id, metadata_only) - : bch2_gc_btree(c, id, initial, metadata_only); - if (ret) + ? bch2_gc_btree_init(c, id) + : bch2_gc_btree(c, id, initial); + if (ret) { + bch_err(c, "%s: ret %i", __func__, ret); return ret; + } } return 0; @@ -546,8 +749,8 @@ static void bch2_gc_free(struct bch_fs *c) ca->mi.nbuckets * sizeof(struct bucket)); ca->buckets[1] = NULL; - free_percpu(ca->usage[1]); - ca->usage[1] = NULL; + free_percpu(ca->usage_gc); + ca->usage_gc = NULL; } free_percpu(c->usage_gc); @@ -555,13 +758,12 @@ static void bch2_gc_free(struct bch_fs *c) } static int bch2_gc_done(struct bch_fs *c, - bool initial, bool metadata_only) + bool initial) { struct bch_dev *ca; - bool verify = !metadata_only && - (!initial || - (c->sb.compat & (1ULL << BCH_COMPAT_FEAT_ALLOC_INFO))); - unsigned i; + bool verify = (!initial || + (c->sb.compat & (1ULL << BCH_COMPAT_alloc_info))); + unsigned i, dev; int ret = 0; #define copy_field(_f, _msg, ...) \ @@ -570,18 +772,17 @@ static int bch2_gc_done(struct bch_fs *c, fsck_err(c, _msg ": got %llu, should be %llu" \ , ##__VA_ARGS__, dst->_f, src->_f); \ dst->_f = src->_f; \ - ret = 1; \ + set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags); \ } #define copy_stripe_field(_f, _msg, ...) 
\ if (dst->_f != src->_f) { \ if (verify) \ fsck_err(c, "stripe %zu has wrong "_msg \ ": got %u, should be %u", \ - dst_iter.pos, ##__VA_ARGS__, \ + iter.pos, ##__VA_ARGS__, \ dst->_f, src->_f); \ dst->_f = src->_f; \ - dst->dirty = true; \ - ret = 1; \ + set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags); \ } #define copy_bucket_field(_f) \ if (dst->b[b].mark._f != src->b[b].mark._f) { \ @@ -592,48 +793,46 @@ static int bch2_gc_done(struct bch_fs *c, bch2_data_types[dst->b[b].mark.data_type],\ dst->b[b].mark._f, src->b[b].mark._f); \ dst->b[b]._mark._f = src->b[b].mark._f; \ - ret = 1; \ + set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags); \ } #define copy_dev_field(_f, _msg, ...) \ copy_field(_f, "dev %u has wrong " _msg, i, ##__VA_ARGS__) #define copy_fs_field(_f, _msg, ...) \ copy_field(_f, "fs has wrong " _msg, ##__VA_ARGS__) - if (!metadata_only) { - struct genradix_iter dst_iter = genradix_iter_init(&c->stripes[0], 0); - struct genradix_iter src_iter = genradix_iter_init(&c->stripes[1], 0); + { + struct genradix_iter iter = genradix_iter_init(&c->stripes[1], 0); struct stripe *dst, *src; - c->ec_stripes_heap.used = 0; - - while ((dst = genradix_iter_peek(&dst_iter, &c->stripes[0])) && - (src = genradix_iter_peek(&src_iter, &c->stripes[1]))) { - BUG_ON(src_iter.pos != dst_iter.pos); - - copy_stripe_field(alive, "alive"); - copy_stripe_field(sectors, "sectors"); - copy_stripe_field(algorithm, "algorithm"); - copy_stripe_field(nr_blocks, "nr_blocks"); - copy_stripe_field(nr_redundant, "nr_redundant"); - copy_stripe_field(blocks_nonempty, - "blocks_nonempty"); + while ((src = genradix_iter_peek(&iter, &c->stripes[1]))) { + dst = genradix_ptr_alloc(&c->stripes[0], iter.pos, GFP_KERNEL); + + if (dst->alive != src->alive || + dst->sectors != src->sectors || + dst->algorithm != src->algorithm || + dst->nr_blocks != src->nr_blocks || + dst->nr_redundant != src->nr_redundant) { + bch_err(c, "unexpected stripe inconsistency at bch2_gc_done, confused"); + ret = -EINVAL; + goto fsck_err; + } for (i = 0; i < ARRAY_SIZE(dst->block_sectors); i++) copy_stripe_field(block_sectors[i], "block_sectors[%u]", i); - if (dst->alive) { - spin_lock(&c->ec_stripes_heap_lock); - bch2_stripes_heap_insert(c, dst, dst_iter.pos); - spin_unlock(&c->ec_stripes_heap_lock); - } + dst->blocks_nonempty = 0; + for (i = 0; i < dst->nr_blocks; i++) + dst->blocks_nonempty += dst->block_sectors[i] != 0; - genradix_iter_advance(&dst_iter, &c->stripes[0]); - genradix_iter_advance(&src_iter, &c->stripes[1]); + genradix_iter_advance(&iter, &c->stripes[1]); } } - for_each_member_device(ca, c, i) { + for (i = 0; i < ARRAY_SIZE(c->usage); i++) + bch2_fs_usage_acc_to_base(c, i); + + for_each_member_device(ca, c, dev) { struct bucket_array *dst = __bucket_array(ca, 0); struct bucket_array *src = __bucket_array(ca, 1); size_t b; @@ -648,12 +847,23 @@ static int bch2_gc_done(struct bch_fs *c, dst->b[b].oldest_gen = src->b[b].oldest_gen; } - }; - for (i = 0; i < ARRAY_SIZE(c->usage); i++) - bch2_fs_usage_acc_to_base(c, i); + { + struct bch_dev_usage *dst = ca->usage_base; + struct bch_dev_usage *src = (void *) + bch2_acc_percpu_u64s((void *) ca->usage_gc, + dev_usage_u64s()); - bch2_dev_usage_from_buckets(c); + copy_dev_field(buckets_ec, "buckets_ec"); + copy_dev_field(buckets_unavailable, "buckets_unavailable"); + + for (i = 0; i < BCH_DATA_NR; i++) { + copy_dev_field(d[i].buckets, "%s buckets", bch2_data_types[i]); + copy_dev_field(d[i].sectors, "%s sectors", bch2_data_types[i]); + copy_dev_field(d[i].fragmented, "%s fragmented", 
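
/*
 * Every copy_*_field() macro above expands to the same pattern: when the
 * GC-recomputed counter disagrees with the in-memory one, report it,
 * adopt the GC value, and flag the alloc info for rewriting.  Expanded
 * sketch for one u64 field (bch_err() standing in for the fsck_err()
 * machinery to keep the sketch self-contained):
 */
static void copy_counter_sketch(struct bch_fs *c, bool verify,
				u64 *dst, u64 src, const char *name)
{
	if (*dst != src) {
		if (verify)
			bch_err(c, "%s: got %llu, should be %llu",
				name, *dst, src);
		*dst = src;
		set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags);
	}
}
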
bch2_data_types[i]); + } + } + }; { unsigned nr = fs_usage_u64s(c); @@ -663,28 +873,20 @@ static int bch2_gc_done(struct bch_fs *c, copy_fs_field(hidden, "hidden"); copy_fs_field(btree, "btree"); + copy_fs_field(data, "data"); + copy_fs_field(cached, "cached"); + copy_fs_field(reserved, "reserved"); + copy_fs_field(nr_inodes,"nr_inodes"); - if (!metadata_only) { - copy_fs_field(data, "data"); - copy_fs_field(cached, "cached"); - copy_fs_field(reserved, "reserved"); - copy_fs_field(nr_inodes,"nr_inodes"); - - for (i = 0; i < BCH_REPLICAS_MAX; i++) - copy_fs_field(persistent_reserved[i], - "persistent_reserved[%i]", i); - } + for (i = 0; i < BCH_REPLICAS_MAX; i++) + copy_fs_field(persistent_reserved[i], + "persistent_reserved[%i]", i); for (i = 0; i < c->replicas.nr; i++) { struct bch_replicas_entry *e = cpu_replicas_entry(&c->replicas, i); char buf[80]; - if (metadata_only && - (e->data_type == BCH_DATA_user || - e->data_type == BCH_DATA_cached)) - continue; - bch2_replicas_entry_to_text(&PBUF(buf), e); copy_fs_field(replicas[i], "%s", buf); @@ -697,11 +899,12 @@ static int bch2_gc_done(struct bch_fs *c, #undef copy_stripe_field #undef copy_field fsck_err: + if (ret) + bch_err(c, "%s: ret %i", __func__, ret); return ret; } -static int bch2_gc_start(struct bch_fs *c, - bool metadata_only) +static int bch2_gc_start(struct bch_fs *c) { struct bch_dev *ca; unsigned i; @@ -718,7 +921,7 @@ static int bch2_gc_start(struct bch_fs *c, for_each_member_device(ca, c, i) { BUG_ON(ca->buckets[1]); - BUG_ON(ca->usage[1]); + BUG_ON(ca->usage_gc); ca->buckets[1] = kvpmalloc(sizeof(struct bucket_array) + ca->mi.nbuckets * sizeof(struct bucket), @@ -729,9 +932,9 @@ static int bch2_gc_start(struct bch_fs *c, return -ENOMEM; } - ca->usage[1] = alloc_percpu(struct bch_dev_usage); - if (!ca->usage[1]) { - bch_err(c, "error allocating ca->usage[gc]"); + ca->usage_gc = alloc_percpu(struct bch_dev_usage); + if (!ca->usage_gc) { + bch_err(c, "error allocating ca->usage_gc"); percpu_ref_put(&ca->ref); return -ENOMEM; } @@ -765,13 +968,6 @@ static int bch2_gc_start(struct bch_fs *c, d->_mark.gen = dst->b[b].oldest_gen = s->mark.gen; d->gen_valid = s->gen_valid; - - if (metadata_only && - (s->mark.data_type == BCH_DATA_user || - s->mark.data_type == BCH_DATA_cached)) { - d->_mark = s->mark; - d->_mark.owned_by_allocator = 0; - } } }; @@ -798,8 +994,7 @@ static int bch2_gc_start(struct bch_fs *c, * move around - if references move backwards in the ordering GC * uses, GC could skip past them */ -int bch2_gc(struct bch_fs *c, struct journal_keys *journal_keys, - bool initial, bool metadata_only) +int bch2_gc(struct bch_fs *c, bool initial) { struct bch_dev *ca; u64 start_time = local_clock(); @@ -815,13 +1010,13 @@ int bch2_gc(struct bch_fs *c, struct journal_keys *journal_keys, closure_wait_event(&c->btree_interior_update_wait, !bch2_btree_interior_updates_nr_pending(c)); again: - ret = bch2_gc_start(c, metadata_only); + ret = bch2_gc_start(c); if (ret) goto out; bch2_mark_superblocks(c); - ret = bch2_gc_btrees(c, journal_keys, initial, metadata_only); + ret = bch2_gc_btrees(c, initial); if (ret) goto out; @@ -831,16 +1026,15 @@ again: bch2_mark_allocator_buckets(c); c->gc_count++; -out: - if (!ret && - (test_bit(BCH_FS_FIXED_GENS, &c->flags) || - (!iter && bch2_test_restart_gc))) { + + if (test_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags) || + (!iter && bch2_test_restart_gc)) { /* * XXX: make sure gens we fixed got saved */ if (iter++ <= 2) { - bch_info(c, "Fixed gens, restarting mark and sweep:"); - 
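
/*
 * Control-flow sketch of the bounded restart in bch2_gc() above;
 * run_one_gc_pass() is hypothetical shorthand for the mark-and-sweep
 * body:
 */
static int gc_retry_sketch(struct bch_fs *c)
{
	int iter = 0, ret;
again:
	ret = run_one_gc_pass(c);
	if (!ret && test_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags)) {
		if (iter++ > 2)
			return -EINVAL;	/* failed to converge */
		clear_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags);
		goto again;
	}
	return ret;
}
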
clear_bit(BCH_FS_FIXED_GENS, &c->flags); + bch_info(c, "Second GC pass needed, restarting:"); + clear_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags); __gc_pos_set(c, gc_phase(GC_PHASE_NOT_RUNNING)); percpu_down_write(&c->mark_lock); @@ -855,12 +1049,12 @@ out: bch_info(c, "Unable to fix bucket gens, looping"); ret = -EINVAL; } - +out: if (!ret) { bch2_journal_block(&c->journal); percpu_down_write(&c->mark_lock); - ret = bch2_gc_done(c, initial, metadata_only); + ret = bch2_gc_done(c, initial); bch2_journal_unblock(&c->journal); } else { @@ -930,10 +1124,10 @@ static int bch2_gc_btree_gens(struct bch_fs *c, enum btree_id btree_id) struct btree_trans trans; struct btree_iter *iter; struct bkey_s_c k; - struct bkey_on_stack sk; + struct bkey_buf sk; int ret = 0; - bkey_on_stack_init(&sk); + bch2_bkey_buf_init(&sk); bch2_trans_init(&trans, c, 0, 0); iter = bch2_trans_get_iter(&trans, btree_id, POS_MIN, @@ -942,7 +1136,7 @@ static int bch2_gc_btree_gens(struct bch_fs *c, enum btree_id btree_id) while ((k = bch2_btree_iter_peek(iter)).k && !(ret = bkey_err(k))) { if (gc_btree_gens_key(c, k)) { - bkey_on_stack_reassemble(&sk, c, k); + bch2_bkey_buf_reassemble(&sk, c, k); bch2_extent_normalize(c, bkey_i_to_s(sk.k)); bch2_btree_iter_set_pos(iter, bkey_start_pos(&sk.k->k)); @@ -962,7 +1156,7 @@ static int bch2_gc_btree_gens(struct bch_fs *c, enum btree_id btree_id) } bch2_trans_exit(&trans); - bkey_on_stack_exit(&sk, c); + bch2_bkey_buf_exit(&sk, c); return ret; } @@ -1074,7 +1268,7 @@ static void bch2_coalesce_nodes(struct bch_fs *c, struct btree_iter *iter, } if (bch2_keylist_realloc(&keylist, NULL, 0, - (BKEY_U64s + BKEY_EXTENT_U64s_MAX) * nr_old_nodes)) { + BKEY_BTREE_PTR_U64s_MAX * nr_old_nodes)) { trace_btree_gc_coalesce_fail(c, BTREE_GC_COALESCE_FAIL_KEYLIST_REALLOC); return; @@ -1354,7 +1548,7 @@ static int bch2_gc_thread(void *arg) { struct bch_fs *c = arg; struct io_clock *clock = &c->io_clock[WRITE]; - unsigned long last = atomic_long_read(&clock->now); + unsigned long last = atomic64_read(&clock->now); unsigned last_kick = atomic_read(&c->kick_gc); int ret; @@ -1375,7 +1569,7 @@ static int bch2_gc_thread(void *arg) if (c->btree_gc_periodic) { unsigned long next = last + c->capacity / 16; - if (atomic_long_read(&clock->now) >= next) + if (atomic64_read(&clock->now) >= next) break; bch2_io_clock_schedule_timeout(clock, next); @@ -1387,14 +1581,14 @@ static int bch2_gc_thread(void *arg) } __set_current_state(TASK_RUNNING); - last = atomic_long_read(&clock->now); + last = atomic64_read(&clock->now); last_kick = atomic_read(&c->kick_gc); /* * Full gc is currently incompatible with btree key cache: */ #if 0 - ret = bch2_gc(c, NULL, false, false); + ret = bch2_gc(c, false, false); #else ret = bch2_gc_gens(c); #endif @@ -1424,11 +1618,14 @@ int bch2_gc_thread_start(struct bch_fs *c) { struct task_struct *p; - BUG_ON(c->gc_thread); + if (c->gc_thread) + return 0; p = kthread_create(bch2_gc_thread, c, "bch-gc/%s", c->name); - if (IS_ERR(p)) + if (IS_ERR(p)) { + bch_err(c, "error creating gc thread: %li", PTR_ERR(p)); return PTR_ERR(p); + } get_task_struct(p); c->gc_thread = p; diff --git a/fs/bcachefs/btree_gc.h b/fs/bcachefs/btree_gc.h index 3694a3df62a8..fa604efc70cc 100644 --- a/fs/bcachefs/btree_gc.h +++ b/fs/bcachefs/btree_gc.h @@ -6,8 +6,7 @@ void bch2_coalesce(struct bch_fs *); -struct journal_keys; -int bch2_gc(struct bch_fs *, struct journal_keys *, bool, bool); +int bch2_gc(struct bch_fs *, bool); int bch2_gc_gens(struct bch_fs *); void bch2_gc_thread_stop(struct bch_fs *); int 
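
/*
 * The gc thread above now reads the IO clock with atomic64_read();
 * scheduling sketch (hypothetical predicate): run again once capacity/16
 * of IO-clock time has elapsed since the last pass.
 */
static bool gc_due(struct bch_fs *c, unsigned long last)
{
	unsigned long next = last + c->capacity / 16;

	return atomic64_read(&c->io_clock[WRITE].now) >= next;
}
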
bch2_gc_thread_start(struct bch_fs *); diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index 87f97ccb3f1f..a84a473101dc 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -608,11 +608,16 @@ static void btree_pos_to_text(struct printbuf *out, struct bch_fs *c, } static void btree_err_msg(struct printbuf *out, struct bch_fs *c, + struct bch_dev *ca, struct btree *b, struct bset *i, unsigned offset, int write) { - pr_buf(out, "error validating btree node %sat btree ", - write ? "before write " : ""); + pr_buf(out, "error validating btree node "); + if (write) + pr_buf(out, "before write "); + if (ca) + pr_buf(out, "on %s ", ca->name); + pr_buf(out, "at btree "); btree_pos_to_text(out, c, b); pr_buf(out, "\n node offset %u", b->written); @@ -631,30 +636,30 @@ enum btree_validate_ret { BTREE_RETRY_READ = 64, }; -#define btree_err(type, c, b, i, msg, ...) \ +#define btree_err(type, c, ca, b, i, msg, ...) \ ({ \ __label__ out; \ char _buf[300]; \ - char *buf = _buf; \ + char *buf2 = _buf; \ struct printbuf out = PBUF(_buf); \ \ - buf = kmalloc(4096, GFP_ATOMIC); \ - if (buf) \ - out = _PBUF(buf, 4986); \ + buf2 = kmalloc(4096, GFP_ATOMIC); \ + if (buf2) \ + out = _PBUF(buf2, 4986); \ \ - btree_err_msg(&out, c, b, i, b->written, write); \ + btree_err_msg(&out, c, ca, b, i, b->written, write); \ pr_buf(&out, ": " msg, ##__VA_ARGS__); \ \ if (type == BTREE_ERR_FIXABLE && \ write == READ && \ !test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags)) { \ - mustfix_fsck_err(c, "%s", buf); \ + mustfix_fsck_err(c, "%s", buf2); \ goto out; \ } \ \ switch (write) { \ case READ: \ - bch_err(c, "%s", buf); \ + bch_err(c, "%s", buf2); \ \ switch (type) { \ case BTREE_ERR_FIXABLE: \ @@ -675,7 +680,7 @@ enum btree_validate_ret { } \ break; \ case WRITE: \ - bch_err(c, "corrupt metadata before write: %s", buf); \ + bch_err(c, "corrupt metadata before write: %s", buf2); \ \ if (bch2_fs_inconsistent(c)) { \ ret = BCH_FSCK_ERRORS_NOT_FIXED; \ @@ -684,16 +689,16 @@ enum btree_validate_ret { break; \ } \ out: \ - if (buf != _buf) \ - kfree(buf); \ + if (buf2 != _buf) \ + kfree(buf2); \ true; \ }) #define btree_err_on(cond, ...) ((cond) ? 
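
/*
 * How the btree_err() severities in the macro above resolve on the read
 * side, roughly in order of escalation (summary, not the macro itself):
 *
 *   BTREE_ERR_FIXABLE    - repaired in place when fsck repair is allowed
 *   BTREE_ERR_WANT_RETRY - worth retrying from another replica
 *   BTREE_ERR_MUST_RETRY - wrong node entirely, must reread
 *   BTREE_ERR_FATAL      - unrecoverable, fail the read
 */
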
btree_err(__VA_ARGS__) : false) -static int validate_bset(struct bch_fs *c, struct btree *b, - struct bset *i, unsigned sectors, - int write, bool have_retry) +static int validate_bset(struct bch_fs *c, struct bch_dev *ca, + struct btree *b, struct bset *i, + unsigned sectors, int write, bool have_retry) { unsigned version = le16_to_cpu(i->version); const char *err; @@ -702,18 +707,18 @@ static int validate_bset(struct bch_fs *c, struct btree *b, btree_err_on((version != BCH_BSET_VERSION_OLD && version < bcachefs_metadata_version_min) || version >= bcachefs_metadata_version_max, - BTREE_ERR_FATAL, c, b, i, + BTREE_ERR_FATAL, c, ca, b, i, "unsupported bset version"); if (btree_err_on(b->written + sectors > c->opts.btree_node_size, - BTREE_ERR_FIXABLE, c, b, i, + BTREE_ERR_FIXABLE, c, ca, b, i, "bset past end of btree node")) { i->u64s = 0; return 0; } btree_err_on(b->written && !i->u64s, - BTREE_ERR_FIXABLE, c, b, i, + BTREE_ERR_FIXABLE, c, ca, b, i, "empty bset"); if (!b->written) { @@ -727,16 +732,16 @@ static int validate_bset(struct bch_fs *c, struct btree *b, /* XXX endianness */ btree_err_on(bp->seq != bn->keys.seq, - BTREE_ERR_MUST_RETRY, c, b, NULL, + BTREE_ERR_MUST_RETRY, c, ca, b, NULL, "incorrect sequence number (wrong btree node)"); } btree_err_on(BTREE_NODE_ID(bn) != b->c.btree_id, - BTREE_ERR_MUST_RETRY, c, b, i, + BTREE_ERR_MUST_RETRY, c, ca, b, i, "incorrect btree id"); btree_err_on(BTREE_NODE_LEVEL(bn) != b->c.level, - BTREE_ERR_MUST_RETRY, c, b, i, + BTREE_ERR_MUST_RETRY, c, ca, b, i, "incorrect level"); if (BSET_BIG_ENDIAN(i) != CPU_BIG_ENDIAN) { @@ -753,8 +758,13 @@ static int validate_bset(struct bch_fs *c, struct btree *b, struct bch_btree_ptr_v2 *bp = &bkey_i_to_btree_ptr_v2(&b->key)->v; + if (BTREE_PTR_RANGE_UPDATED(bp)) { + b->data->min_key = bp->min_key; + b->data->max_key = b->key.k.p; + } + btree_err_on(bkey_cmp(b->data->min_key, bp->min_key), - BTREE_ERR_MUST_RETRY, c, b, NULL, + BTREE_ERR_MUST_RETRY, c, ca, b, NULL, "incorrect min_key: got %llu:%llu should be %llu:%llu", b->data->min_key.inode, b->data->min_key.offset, @@ -763,7 +773,7 @@ static int validate_bset(struct bch_fs *c, struct btree *b, } btree_err_on(bkey_cmp(bn->max_key, b->key.k.p), - BTREE_ERR_MUST_RETRY, c, b, i, + BTREE_ERR_MUST_RETRY, c, ca, b, i, "incorrect max key %llu:%llu", bn->max_key.inode, bn->max_key.offset); @@ -788,7 +798,7 @@ static int validate_bset(struct bch_fs *c, struct btree *b, #endif err = bch2_bkey_format_validate(&bn->format); btree_err_on(err, - BTREE_ERR_FATAL, c, b, i, + BTREE_ERR_FATAL, c, ca, b, i, "invalid bkey format: %s", err); compat_bformat(b->c.level, b->c.btree_id, version, @@ -820,14 +830,14 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b, const char *invalid; if (btree_err_on(bkey_next(k) > vstruct_last(i), - BTREE_ERR_FIXABLE, c, b, i, + BTREE_ERR_FIXABLE, c, NULL, b, i, "key extends past end of bset")) { i->u64s = cpu_to_le16((u64 *) k - i->_data); break; } if (btree_err_on(k->format > KEY_FORMAT_CURRENT, - BTREE_ERR_FIXABLE, c, b, i, + BTREE_ERR_FIXABLE, c, NULL, b, i, "invalid bkey format %u", k->format)) { i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s); memmove_u64s_down(k, bkey_next(k), @@ -850,8 +860,8 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b, char buf[160]; bch2_bkey_val_to_text(&PBUF(buf), c, u.s_c); - btree_err(BTREE_ERR_FIXABLE, c, b, i, - "invalid bkey:\n%s\n%s", invalid, buf); + btree_err(BTREE_ERR_FIXABLE, c, NULL, b, i, + "invalid bkey: %s\n%s", invalid, buf); i->u64s = 
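
/*
 * Note the tie-in with the GC topology repair earlier in this commit:
 * when GC rewrote a btree_ptr_v2 with BTREE_PTR_RANGE_UPDATED set,
 * validate_bset() above adopts the key's min_key/max_key in place of
 * the on-disk node header rather than failing the read.
 */
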
cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s); memmove_u64s_down(k, bkey_next(k), @@ -884,7 +894,7 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b, bch2_bkey_to_text(&PBUF(buf2), u.k); bch2_dump_bset(c, b, i, 0); - btree_err(BTREE_ERR_FATAL, c, b, i, + btree_err(BTREE_ERR_FATAL, c, NULL, b, i, "keys out of order: %s > %s", buf1, buf2); /* XXX: repair this */ @@ -897,7 +907,8 @@ fsck_err: return ret; } -int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry) +int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, + struct btree *b, bool have_retry) { struct btree_node_entry *bne; struct sort_iter *iter; @@ -909,20 +920,22 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry unsigned u64s; int ret, retry_read = 0, write = READ; + b->version_ondisk = U16_MAX; + iter = mempool_alloc(&c->fill_iter, GFP_NOIO); sort_iter_init(iter, b); iter->size = (btree_blocks(c) + 1) * 2; if (bch2_meta_read_fault("btree")) - btree_err(BTREE_ERR_MUST_RETRY, c, b, NULL, + btree_err(BTREE_ERR_MUST_RETRY, c, ca, b, NULL, "dynamic fault"); btree_err_on(le64_to_cpu(b->data->magic) != bset_magic(c), - BTREE_ERR_MUST_RETRY, c, b, NULL, + BTREE_ERR_MUST_RETRY, c, ca, b, NULL, "bad magic"); btree_err_on(!b->data->keys.seq, - BTREE_ERR_MUST_RETRY, c, b, NULL, + BTREE_ERR_MUST_RETRY, c, ca, b, NULL, "bad btree header"); if (b->key.k.type == KEY_TYPE_btree_ptr_v2) { @@ -930,7 +943,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry &bkey_i_to_btree_ptr_v2(&b->key)->v; btree_err_on(b->data->keys.seq != bp->seq, - BTREE_ERR_MUST_RETRY, c, b, NULL, + BTREE_ERR_MUST_RETRY, c, ca, b, NULL, "got wrong btree node (seq %llx want %llx)", b->data->keys.seq, bp->seq); } @@ -945,7 +958,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry i = &b->data->keys; btree_err_on(!bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i)), - BTREE_ERR_WANT_RETRY, c, b, i, + BTREE_ERR_WANT_RETRY, c, ca, b, i, "unknown checksum type %llu", BSET_CSUM_TYPE(i)); @@ -953,7 +966,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, b->data); btree_err_on(bch2_crc_cmp(csum, b->data->csum), - BTREE_ERR_WANT_RETRY, c, b, i, + BTREE_ERR_WANT_RETRY, c, ca, b, i, "invalid checksum"); bset_encrypt(c, i, b->written << 9); @@ -973,7 +986,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry break; btree_err_on(!bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i)), - BTREE_ERR_WANT_RETRY, c, b, i, + BTREE_ERR_WANT_RETRY, c, ca, b, i, "unknown checksum type %llu", BSET_CSUM_TYPE(i)); @@ -981,7 +994,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne); btree_err_on(bch2_crc_cmp(csum, bne->csum), - BTREE_ERR_WANT_RETRY, c, b, i, + BTREE_ERR_WANT_RETRY, c, ca, b, i, "invalid checksum"); bset_encrypt(c, i, b->written << 9); @@ -989,7 +1002,10 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry sectors = vstruct_sectors(bne, c->block_bits); } - ret = validate_bset(c, b, i, sectors, + b->version_ondisk = min(b->version_ondisk, + le16_to_cpu(i->version)); + + ret = validate_bset(c, ca, b, i, sectors, READ, have_retry); if (ret) goto fsck_err; @@ -1011,7 +1027,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry true); btree_err_on(blacklisted && first, - BTREE_ERR_FIXABLE, c, 
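
/*
 * Condensed sketch of the per-bset checksum validation sequence above
 * (btree_nonce() is assumed here, as used elsewhere in btree_io.c):
 */
static int check_bset_csum_sketch(struct bch_fs *c, struct btree *b,
				  struct btree_node_entry *bne)
{
	struct bset *i = &bne->keys;
	struct nonce nonce = btree_nonce(i, b->written << 9);
	struct bch_csum csum;

	if (!bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i)))
		return -EINVAL;

	csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne);
	if (bch2_crc_cmp(csum, bne->csum))
		return -EIO;	/* caller retries from another replica */

	bset_encrypt(c, i, b->written << 9);	/* decrypts in place */
	return 0;
}
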
b, i, + BTREE_ERR_FIXABLE, c, ca, b, i, "first btree node bset has blacklisted journal seq"); if (blacklisted && !first) continue; @@ -1028,7 +1044,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry bset_byte_offset(b, bne) < btree_bytes(c); bne = (void *) bne + block_bytes(c)) btree_err_on(bne->keys.seq == b->data->keys.seq, - BTREE_ERR_WANT_RETRY, c, b, NULL, + BTREE_ERR_WANT_RETRY, c, ca, b, NULL, "found bset signature after last bset"); sorted = btree_bounce_alloc(c, btree_bytes(c), &used_mempool); @@ -1063,7 +1079,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry char buf[160]; bch2_bkey_val_to_text(&PBUF(buf), c, u.s_c); - btree_err(BTREE_ERR_FIXABLE, c, b, i, + btree_err(BTREE_ERR_FIXABLE, c, NULL, b, i, "invalid bkey %s: %s", buf, invalid); btree_keys_account_key_drop(&b->nr, 0, k); @@ -1154,7 +1170,7 @@ start: &failed, &rb->pick) > 0; if (!bio->bi_status && - !bch2_btree_node_read_done(c, b, can_retry)) + !bch2_btree_node_read_done(c, ca, b, can_retry)) break; if (!can_retry) { @@ -1320,12 +1336,13 @@ static void bch2_btree_node_write_error(struct bch_fs *c, struct btree_write_bio *wbio) { struct btree *b = wbio->wbio.bio.bi_private; - __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX) tmp; + struct bkey_buf k; struct bch_extent_ptr *ptr; struct btree_trans trans; struct btree_iter *iter; int ret; + bch2_bkey_buf_init(&k); bch2_trans_init(&trans, c, 0, 0); iter = bch2_trans_get_node_iter(&trans, b->c.btree_id, b->key.k.p, @@ -1344,21 +1361,22 @@ retry: BUG_ON(!btree_node_hashed(b)); - bkey_copy(&tmp.k, &b->key); + bch2_bkey_buf_copy(&k, c, &b->key); - bch2_bkey_drop_ptrs(bkey_i_to_s(&tmp.k), ptr, + bch2_bkey_drop_ptrs(bkey_i_to_s(k.k), ptr, bch2_dev_list_has_dev(wbio->wbio.failed, ptr->dev)); - if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(&tmp.k))) + if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(k.k))) goto err; - ret = bch2_btree_node_update_key(c, iter, b, &tmp.k); + ret = bch2_btree_node_update_key(c, iter, b, k.k); if (ret == -EINTR) goto retry; if (ret) goto err; out: bch2_trans_exit(&trans); + bch2_bkey_buf_exit(&k, c); bio_put(&wbio->wbio.bio); btree_node_write_done(c, b); return; @@ -1458,7 +1476,7 @@ static int validate_bset_for_write(struct bch_fs *c, struct btree *b, if (bch2_bkey_invalid(c, bkey_i_to_s_c(&b->key), BKEY_TYPE_BTREE)) return -1; - ret = validate_bset(c, b, i, sectors, WRITE, false) ?: + ret = validate_bset(c, NULL, b, i, sectors, WRITE, false) ?: validate_bset_keys(c, b, i, &whiteout_u64s, WRITE, false); if (ret) { bch2_inconsistent_error(c); @@ -1476,7 +1494,7 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, struct bset *i; struct btree_node *bn = NULL; struct btree_node_entry *bne = NULL; - BKEY_PADDED(key) k; + struct bkey_buf k; struct bch_extent_ptr *ptr; struct sort_iter sort_iter; struct nonce nonce; @@ -1487,6 +1505,8 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, bool validate_before_checksum = false; void *data; + bch2_bkey_buf_init(&k); + if (test_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags)) return; @@ -1620,7 +1640,7 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, validate_before_checksum = true; /* validate_bset will be modifying: */ - if (le16_to_cpu(i->version) < bcachefs_metadata_version_max) + if (le16_to_cpu(i->version) <= bcachefs_metadata_version_inode_btree_change) validate_before_checksum = true; /* if we're going to be encrypting, check metadata validity first: */ @@ -1695,15 +1715,16 @@ void __bch2_btree_node_write(struct bch_fs 
*c, struct btree *b, * just make all btree node writes FUA to keep things sane. */ - bkey_copy(&k.key, &b->key); + bch2_bkey_buf_copy(&k, c, &b->key); - bkey_for_each_ptr(bch2_bkey_ptrs(bkey_i_to_s(&k.key)), ptr) + bkey_for_each_ptr(bch2_bkey_ptrs(bkey_i_to_s(k.k)), ptr) ptr->offset += b->written; b->written += sectors_to_write; /* XXX: submitting IO with btree locks held: */ - bch2_submit_wbio_replicas(&wbio->wbio, c, BCH_DATA_btree, &k.key); + bch2_submit_wbio_replicas(&wbio->wbio, c, BCH_DATA_btree, k.k); + bch2_bkey_buf_exit(&k, c); return; err: set_btree_node_noevict(b); @@ -1823,23 +1844,6 @@ void bch2_btree_flush_all_writes(struct bch_fs *c) __bch2_btree_flush_all(c, BTREE_NODE_write_in_flight); } -void bch2_btree_verify_flushed(struct bch_fs *c) -{ - struct bucket_table *tbl; - struct rhash_head *pos; - struct btree *b; - unsigned i; - - rcu_read_lock(); - for_each_cached_btree(b, c, tbl, i, pos) { - unsigned long flags = READ_ONCE(b->flags); - - BUG_ON((flags & (1 << BTREE_NODE_dirty)) || - (flags & (1 << BTREE_NODE_write_in_flight))); - } - rcu_read_unlock(); -} - void bch2_dirty_btree_nodes_to_text(struct printbuf *out, struct bch_fs *c) { struct bucket_table *tbl; diff --git a/fs/bcachefs/btree_io.h b/fs/bcachefs/btree_io.h index 6de92263af6c..89685bd57fc0 100644 --- a/fs/bcachefs/btree_io.h +++ b/fs/bcachefs/btree_io.h @@ -121,7 +121,7 @@ static inline void bset_encrypt(struct bch_fs *c, struct bset *i, unsigned offse bch2_encrypt(c, BSET_CSUM_TYPE(i), nonce, &bn->flags, bytes); - nonce = nonce_add(nonce, round_up(bytes, CHACHA20_BLOCK_SIZE)); + nonce = nonce_add(nonce, round_up(bytes, CHACHA_BLOCK_SIZE)); } bch2_encrypt(c, BSET_CSUM_TYPE(i), nonce, i->_data, @@ -134,7 +134,8 @@ void bch2_btree_build_aux_trees(struct btree *); void bch2_btree_init_next(struct bch_fs *, struct btree *, struct btree_iter *); -int bch2_btree_node_read_done(struct bch_fs *, struct btree *, bool); +int bch2_btree_node_read_done(struct bch_fs *, struct bch_dev *, + struct btree *, bool); void bch2_btree_node_read(struct bch_fs *, struct btree *, bool); int bch2_btree_root_read(struct bch_fs *, enum btree_id, const struct bkey_i *, unsigned); @@ -185,7 +186,6 @@ do { \ void bch2_btree_flush_all_reads(struct bch_fs *); void bch2_btree_flush_all_writes(struct bch_fs *); -void bch2_btree_verify_flushed(struct bch_fs *); void bch2_dirty_btree_nodes_to_text(struct printbuf *, struct bch_fs *); static inline void compat_bformat(unsigned level, enum btree_id btree_id, diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 21253be5aab6..c41fe4e0bc00 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -2,6 +2,7 @@ #include "bcachefs.h" #include "bkey_methods.h" +#include "bkey_buf.h" #include "btree_cache.h" #include "btree_iter.h" #include "btree_key_cache.h" @@ -33,13 +34,13 @@ static inline struct bpos btree_iter_search_key(struct btree_iter *iter) static inline bool btree_iter_pos_before_node(struct btree_iter *iter, struct btree *b) { - return bkey_cmp(btree_iter_search_key(iter), b->data->min_key) < 0; + return bkey_cmp(iter->real_pos, b->data->min_key) < 0; } static inline bool btree_iter_pos_after_node(struct btree_iter *iter, struct btree *b) { - return bkey_cmp(b->key.k.p, btree_iter_search_key(iter)) < 0; + return bkey_cmp(b->key.k.p, iter->real_pos) < 0; } static inline bool btree_iter_pos_in_node(struct btree_iter *iter, @@ -490,7 +491,6 @@ static void bch2_btree_iter_verify_cached(struct btree_iter *iter) static void bch2_btree_iter_verify_level(struct 
btree_iter *iter, unsigned level) { - struct bpos pos = btree_iter_search_key(iter); struct btree_iter_level *l = &iter->l[level]; struct btree_node_iter tmp = l->iter; bool locked = btree_node_locked(iter, level); @@ -515,12 +515,7 @@ static void bch2_btree_iter_verify_level(struct btree_iter *iter, if (!bch2_btree_node_relock(iter, level)) return; - /* - * Ideally this invariant would always be true, and hopefully in the - * future it will be, but for now set_pos_same_leaf() breaks it: - */ - BUG_ON(iter->uptodate < BTREE_ITER_NEED_TRAVERSE && - !btree_iter_pos_in_node(iter, l->b)); + BUG_ON(!btree_iter_pos_in_node(iter, l->b)); /* * node iterators don't use leaf node iterator: @@ -543,12 +538,12 @@ static void bch2_btree_iter_verify_level(struct btree_iter *iter, : bch2_btree_node_iter_prev_all(&tmp, l->b); k = bch2_btree_node_iter_peek_all(&l->iter, l->b); - if (p && bkey_iter_pos_cmp(l->b, p, &pos) >= 0) { + if (p && bkey_iter_pos_cmp(l->b, p, &iter->real_pos) >= 0) { msg = "before"; goto err; } - if (k && bkey_iter_pos_cmp(l->b, k, &pos) < 0) { + if (k && bkey_iter_pos_cmp(l->b, k, &iter->real_pos) < 0) { msg = "after"; goto err; } @@ -571,12 +566,11 @@ err: } panic("iterator should be %s key at level %u:\n" - "iter pos %s %llu:%llu\n" + "iter pos %llu:%llu\n" "prev key %s\n" "cur key %s\n", msg, level, - iter->flags & BTREE_ITER_IS_EXTENTS ? ">" : "=>", - iter->pos.inode, iter->pos.offset, + iter->real_pos.inode, iter->real_pos.offset, buf1, buf2); } @@ -584,12 +578,24 @@ static void bch2_btree_iter_verify(struct btree_iter *iter) { unsigned i; - bch2_btree_trans_verify_locks(iter->trans); + EBUG_ON(iter->btree_id >= BTREE_ID_NR); + + bch2_btree_iter_verify_locks(iter); for (i = 0; i < BTREE_MAX_DEPTH; i++) bch2_btree_iter_verify_level(iter, i); } +static void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter) +{ + enum btree_iter_type type = btree_iter_type(iter); + + BUG_ON((type == BTREE_ITER_KEYS || + type == BTREE_ITER_CACHED) && + (bkey_cmp(iter->pos, bkey_start_pos(&iter->k)) < 0 || + bkey_cmp(iter->pos, iter->k.p) > 0)); +} + void bch2_btree_trans_verify_iters(struct btree_trans *trans, struct btree *b) { struct btree_iter *iter; @@ -605,6 +611,7 @@ void bch2_btree_trans_verify_iters(struct btree_trans *trans, struct btree *b) static inline void bch2_btree_iter_verify_level(struct btree_iter *iter, unsigned l) {} static inline void bch2_btree_iter_verify(struct btree_iter *iter) {} +static inline void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter) {} #endif @@ -630,12 +637,11 @@ static void __bch2_btree_iter_fix_key_modified(struct btree_iter *iter, struct bkey_packed *where) { struct btree_iter_level *l = &iter->l[b->c.level]; - struct bpos pos = btree_iter_search_key(iter); if (where != bch2_btree_node_iter_peek_all(&l->iter, l->b)) return; - if (bkey_iter_pos_cmp(l->b, where, &pos) < 0) + if (bkey_iter_pos_cmp(l->b, where, &iter->real_pos) < 0) bch2_btree_node_iter_advance(&l->iter, l->b); btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK); @@ -670,7 +676,6 @@ static void __bch2_btree_node_iter_fix(struct btree_iter *iter, bool iter_current_key_modified = orig_iter_pos >= offset && orig_iter_pos <= offset + clobber_u64s; - struct bpos iter_pos = btree_iter_search_key(iter); btree_node_iter_for_each(node_iter, set) if (set->end == old_end) @@ -678,7 +683,7 @@ static void __bch2_btree_node_iter_fix(struct btree_iter *iter, /* didn't find the bset in the iterator - might have to readd it: */ if (new_u64s && - bkey_iter_pos_cmp(b, where, &iter_pos) >= 0) { + 
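
/*
 * Predicate form of the new bch2_btree_iter_verify_entry_exit() check
 * above (hypothetical helper): a keys/cached iterator's public position
 * must lie within the key it points at.
 */
static bool iter_pos_in_key(struct btree_iter *iter)
{
	return bkey_cmp(iter->pos, bkey_start_pos(&iter->k)) >= 0 &&
	       bkey_cmp(iter->pos, iter->k.p) <= 0;
}
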
bkey_iter_pos_cmp(b, where, &iter->real_pos) >= 0) { bch2_btree_node_iter_push(node_iter, b, where, end); goto fixup_done; } else { @@ -693,7 +698,7 @@ found: return; if (new_u64s && - bkey_iter_pos_cmp(b, where, &iter_pos) >= 0) { + bkey_iter_pos_cmp(b, where, &iter->real_pos) >= 0) { set->k = offset; } else if (set->k < offset + clobber_u64s) { set->k = offset + new_u64s; @@ -829,12 +834,11 @@ static inline bool btree_iter_advance_to_pos(struct btree_iter *iter, struct btree_iter_level *l, int max_advance) { - struct bpos pos = btree_iter_search_key(iter); struct bkey_packed *k; int nr_advanced = 0; while ((k = bch2_btree_node_iter_peek_all(&l->iter, l->b)) && - bkey_iter_pos_cmp(l->b, k, &pos) < 0) { + bkey_iter_pos_cmp(l->b, k, &iter->real_pos) < 0) { if (max_advance > 0 && nr_advanced >= max_advance) return false; @@ -897,10 +901,16 @@ static void btree_iter_verify_new_node(struct btree_iter *iter, struct btree *b) static inline void __btree_iter_init(struct btree_iter *iter, unsigned level) { - struct bpos pos = btree_iter_search_key(iter); struct btree_iter_level *l = &iter->l[level]; - bch2_btree_node_iter_init(&l->iter, l->b, &pos); + bch2_btree_node_iter_init(&l->iter, l->b, &iter->real_pos); + + /* + * Iterators to interior nodes should always be pointed at the first non + * whiteout: + */ + if (level) + bch2_btree_node_iter_peek(&l->iter, l->b); btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK); } @@ -1041,27 +1051,32 @@ static void btree_iter_prefetch(struct btree_iter *iter) struct btree_iter_level *l = &iter->l[iter->level]; struct btree_node_iter node_iter = l->iter; struct bkey_packed *k; - BKEY_PADDED(k) tmp; + struct bkey_buf tmp; unsigned nr = test_bit(BCH_FS_STARTED, &c->flags) ? (iter->level > 1 ? 0 : 2) : (iter->level > 1 ? 1 : 16); bool was_locked = btree_node_locked(iter, iter->level); + bch2_bkey_buf_init(&tmp); + while (nr) { if (!bch2_btree_node_relock(iter, iter->level)) - return; + break; bch2_btree_node_iter_advance(&node_iter, l->b); k = bch2_btree_node_iter_peek(&node_iter, l->b); if (!k) break; - bch2_bkey_unpack(l->b, &tmp.k, k); - bch2_btree_node_prefetch(c, iter, &tmp.k, iter->level - 1); + bch2_bkey_buf_unpack(&tmp, c, l->b, k); + bch2_btree_node_prefetch(c, iter, tmp.k, iter->btree_id, + iter->level - 1); } if (!was_locked) btree_node_unlock(iter, iter->level); + + bch2_bkey_buf_exit(&tmp, c); } static noinline void btree_node_mem_ptr_set(struct btree_iter *iter, @@ -1093,30 +1108,34 @@ static __always_inline int btree_iter_down(struct btree_iter *iter, struct btree *b; unsigned level = iter->level - 1; enum six_lock_type lock_type = __btree_lock_want(iter, level); - BKEY_PADDED(k) tmp; + struct bkey_buf tmp; + int ret; EBUG_ON(!btree_node_locked(iter, iter->level)); - bch2_bkey_unpack(l->b, &tmp.k, + bch2_bkey_buf_init(&tmp); + bch2_bkey_buf_unpack(&tmp, c, l->b, bch2_btree_node_iter_peek(&l->iter, l->b)); - b = bch2_btree_node_get(c, iter, &tmp.k, level, lock_type, trace_ip); - if (unlikely(IS_ERR(b))) - return PTR_ERR(b); + b = bch2_btree_node_get(c, iter, tmp.k, level, lock_type, trace_ip); + ret = PTR_ERR_OR_ZERO(b); + if (unlikely(ret)) + goto err; mark_btree_node_locked(iter, level, lock_type); btree_iter_node_set(iter, b); - if (tmp.k.k.type == KEY_TYPE_btree_ptr_v2 && - unlikely(b != btree_node_mem_ptr(&tmp.k))) + if (tmp.k->k.type == KEY_TYPE_btree_ptr_v2 && + unlikely(b != btree_node_mem_ptr(tmp.k))) btree_node_mem_ptr_set(iter, level + 1, b); if (iter->flags & BTREE_ITER_PREFETCH) btree_iter_prefetch(iter); iter->level = level; - - return 
0; +err: + bch2_bkey_buf_exit(&tmp, c); + return ret; } static void btree_iter_up(struct btree_iter *iter) @@ -1330,21 +1349,6 @@ int __must_check __bch2_btree_iter_traverse(struct btree_iter *iter) return ret; } -static inline void bch2_btree_iter_checks(struct btree_iter *iter) -{ - enum btree_iter_type type = btree_iter_type(iter); - - EBUG_ON(iter->btree_id >= BTREE_ID_NR); - - BUG_ON((type == BTREE_ITER_KEYS || - type == BTREE_ITER_CACHED) && - (bkey_cmp(iter->pos, bkey_start_pos(&iter->k)) < 0 || - bkey_cmp(iter->pos, iter->k.p) > 0)); - - bch2_btree_iter_verify_locks(iter); - bch2_btree_iter_verify_level(iter, iter->level); -} - /* Iterate across nodes (leaf and interior nodes) */ struct btree *bch2_btree_iter_peek_node(struct btree_iter *iter) @@ -1353,7 +1357,7 @@ struct btree *bch2_btree_iter_peek_node(struct btree_iter *iter) int ret; EBUG_ON(btree_iter_type(iter) != BTREE_ITER_NODES); - bch2_btree_iter_checks(iter); + bch2_btree_iter_verify(iter); if (iter->uptodate == BTREE_ITER_UPTODATE) return iter->l[iter->level].b; @@ -1368,7 +1372,7 @@ struct btree *bch2_btree_iter_peek_node(struct btree_iter *iter) BUG_ON(bkey_cmp(b->key.k.p, iter->pos) < 0); - iter->pos = b->key.k.p; + iter->pos = iter->real_pos = b->key.k.p; iter->uptodate = BTREE_ITER_UPTODATE; bch2_btree_iter_verify(iter); @@ -1382,7 +1386,7 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter) int ret; EBUG_ON(btree_iter_type(iter) != BTREE_ITER_NODES); - bch2_btree_iter_checks(iter); + bch2_btree_iter_verify(iter); /* already got to end? */ if (!btree_iter_node(iter, iter->level)) @@ -1419,7 +1423,7 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter) if (btree_node_read_locked(iter, iter->level)) btree_node_unlock(iter, iter->level); - iter->pos = bkey_successor(iter->pos); + iter->pos = iter->real_pos = bkey_successor(iter->pos); iter->level = iter->min_depth; btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE); @@ -1430,7 +1434,7 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter) b = iter->l[iter->level].b; } - iter->pos = b->key.k.p; + iter->pos = iter->real_pos = b->key.k.p; iter->uptodate = BTREE_ITER_UPTODATE; bch2_btree_iter_verify(iter); @@ -1440,36 +1444,6 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter) /* Iterate across keys (in leaf nodes only) */ -void bch2_btree_iter_set_pos_same_leaf(struct btree_iter *iter, struct bpos new_pos) -{ - struct btree_iter_level *l = &iter->l[0]; - - EBUG_ON(iter->level != 0); - EBUG_ON(bkey_cmp(new_pos, iter->pos) < 0); - EBUG_ON(!btree_node_locked(iter, 0)); - EBUG_ON(bkey_cmp(new_pos, l->b->key.k.p) > 0); - - bkey_init(&iter->k); - iter->k.p = iter->pos = new_pos; - btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK); - - btree_iter_advance_to_pos(iter, l, -1); - - /* - * XXX: - * keeping a node locked that's outside (even just outside) iter->pos - * breaks __bch2_btree_node_lock(). This seems to only affect - * bch2_btree_node_get_sibling so for now it's fixed there, but we - * should try to get rid of this corner case. 
- * - * (this behaviour is currently needed for BTREE_INSERT_NOUNLOCK) - */ - - if (bch2_btree_node_iter_end(&l->iter) && - btree_iter_pos_after_node(iter, l->b)) - btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE); -} - static void btree_iter_pos_changed(struct btree_iter *iter, int cmp) { unsigned l = iter->level; @@ -1508,67 +1482,85 @@ out: btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK); } -void __bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos, - bool strictly_greater) +static void btree_iter_set_search_pos(struct btree_iter *iter, struct bpos new_pos) { - struct bpos old = btree_iter_search_key(iter); - int cmp; + int cmp = bkey_cmp(new_pos, iter->real_pos); - iter->flags &= ~BTREE_ITER_IS_EXTENTS; - iter->flags |= strictly_greater ? BTREE_ITER_IS_EXTENTS : 0; + iter->real_pos = new_pos; + btree_iter_pos_changed(iter, cmp); + + bch2_btree_iter_verify(iter); +} + +void __bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos, + bool strictly_greater) +{ bkey_init(&iter->k); iter->k.p = iter->pos = new_pos; - cmp = bkey_cmp(btree_iter_search_key(iter), old); + iter->flags &= ~BTREE_ITER_IS_EXTENTS; + iter->flags |= strictly_greater ? BTREE_ITER_IS_EXTENTS : 0; - btree_iter_pos_changed(iter, cmp); + btree_iter_set_search_pos(iter, btree_iter_search_key(iter)); } void bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos) { - int cmp = bkey_cmp(new_pos, iter->pos); + __bch2_btree_iter_set_pos(iter, new_pos, + (iter->flags & BTREE_ITER_IS_EXTENTS) != 0); +} - bkey_init(&iter->k); - iter->k.p = iter->pos = new_pos; +static inline bool bch2_btree_iter_advance_pos(struct btree_iter *iter) +{ + struct bpos pos = iter->k.p; + bool ret = bkey_cmp(pos, POS_MAX) != 0; - btree_iter_pos_changed(iter, cmp); + if (ret && !(iter->flags & BTREE_ITER_IS_EXTENTS)) + pos = bkey_successor(pos); + bch2_btree_iter_set_pos(iter, pos); + return ret; } -static inline bool btree_iter_set_pos_to_next_leaf(struct btree_iter *iter) +static inline bool bch2_btree_iter_rewind_pos(struct btree_iter *iter) { - struct btree_iter_level *l = &iter->l[0]; - bool ret; - - bkey_init(&iter->k); - iter->k.p = iter->pos = l->b->key.k.p; + struct bpos pos = bkey_start_pos(&iter->k); + bool ret = bkey_cmp(pos, POS_MIN) != 0; - ret = bkey_cmp(iter->pos, POS_MAX) != 0; if (ret && !(iter->flags & BTREE_ITER_IS_EXTENTS)) - iter->k.p = iter->pos = bkey_successor(iter->pos); - - btree_iter_pos_changed(iter, 1); + pos = bkey_predecessor(pos); + bch2_btree_iter_set_pos(iter, pos); return ret; } -static inline bool btree_iter_set_pos_to_prev_leaf(struct btree_iter *iter) +static inline bool btree_iter_set_pos_to_next_leaf(struct btree_iter *iter) { - struct btree_iter_level *l = &iter->l[0]; - bool ret; + struct bpos next_pos = iter->l[0].b->key.k.p; + bool ret = bkey_cmp(next_pos, POS_MAX) != 0; - bkey_init(&iter->k); - iter->k.p = iter->pos = l->b->data->min_key; - iter->uptodate = BTREE_ITER_NEED_TRAVERSE; + /* + * Typically, we don't want to modify iter->pos here, since that + * indicates where we searched from - unless we got to the end of the + * btree, in that case we want iter->pos to reflect that: + */ + if (ret) + btree_iter_set_search_pos(iter, bkey_successor(next_pos)); + else + bch2_btree_iter_set_pos(iter, POS_MAX); - ret = bkey_cmp(iter->pos, POS_MIN) != 0; - if (ret) { - iter->k.p = iter->pos = bkey_predecessor(iter->pos); + return ret; +} - if (iter->flags & BTREE_ITER_IS_EXTENTS) - iter->k.p = iter->pos = bkey_predecessor(iter->pos); - } +static inline bool 
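
/*
 * The advance/rewind helpers above encode the extents asymmetry: extent
 * keys are half-open, so the next search position is the current key's
 * end itself; for regular keys it is the successor.  Condensed sketch
 * (the POS_MAX end-of-btree check is elided):
 */
static struct bpos next_search_pos(struct btree_iter *iter)
{
	struct bpos pos = iter->k.p;

	if (!(iter->flags & BTREE_ITER_IS_EXTENTS))
		pos = bkey_successor(pos);
	return pos;
}
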
btree_iter_set_pos_to_prev_leaf(struct btree_iter *iter) +{ + struct bpos next_pos = iter->l[0].b->data->min_key; + bool ret = bkey_cmp(next_pos, POS_MIN) != 0; + + if (ret) + btree_iter_set_search_pos(iter, bkey_predecessor(next_pos)); + else + bch2_btree_iter_set_pos(iter, POS_MIN); - btree_iter_pos_changed(iter, -1); return ret; } @@ -1611,7 +1603,10 @@ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) int ret; EBUG_ON(btree_iter_type(iter) != BTREE_ITER_KEYS); - bch2_btree_iter_checks(iter); + bch2_btree_iter_verify(iter); + bch2_btree_iter_verify_entry_exit(iter); + + btree_iter_set_search_pos(iter, btree_iter_search_key(iter)); if (iter->uptodate == BTREE_ITER_UPTODATE && !bkey_deleted(&iter->k)) @@ -1634,13 +1629,15 @@ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) * iter->pos should always be equal to the key we just * returned - except extents can straddle iter->pos: */ - if (!(iter->flags & BTREE_ITER_IS_EXTENTS) || - bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0) + if (bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0) iter->pos = bkey_start_pos(k.k); + iter->real_pos = k.k->p; + iter->uptodate = BTREE_ITER_UPTODATE; - bch2_btree_iter_verify_level(iter, 0); + bch2_btree_iter_verify_entry_exit(iter); + bch2_btree_iter_verify(iter); return k; } @@ -1650,14 +1647,9 @@ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) */ struct bkey_s_c bch2_btree_iter_next(struct btree_iter *iter) { - if (unlikely(!bkey_cmp(iter->k.p, POS_MAX))) + if (!bch2_btree_iter_advance_pos(iter)) return bkey_s_c_null; - bch2_btree_iter_set_pos(iter, - (iter->flags & BTREE_ITER_IS_EXTENTS) - ? iter->k.p - : bkey_successor(iter->k.p)); - return bch2_btree_iter_peek(iter); } @@ -1699,7 +1691,7 @@ struct bkey_s_c bch2_btree_iter_peek_with_updates(struct btree_iter *iter) int ret; EBUG_ON(btree_iter_type(iter) != BTREE_ITER_KEYS); - bch2_btree_iter_checks(iter); + bch2_btree_iter_verify(iter); while (1) { ret = bch2_btree_iter_traverse(iter); @@ -1709,10 +1701,7 @@ struct bkey_s_c bch2_btree_iter_peek_with_updates(struct btree_iter *iter) k = __bch2_btree_iter_peek_with_updates(iter); if (k.k && bkey_deleted(k.k)) { - bch2_btree_iter_set_pos(iter, - (iter->flags & BTREE_ITER_IS_EXTENTS) - ? iter->k.p - : bkey_successor(iter->k.p)); + bch2_btree_iter_advance_pos(iter); continue; } @@ -1724,11 +1713,10 @@ struct bkey_s_c bch2_btree_iter_peek_with_updates(struct btree_iter *iter) } /* - * iter->pos should always be equal to the key we just - * returned - except extents can straddle iter->pos: + * iter->pos should be monotonically increasing, and always be equal to + * the key we just returned - except extents can straddle iter->pos: */ - if (!(iter->flags & BTREE_ITER_IS_EXTENTS) || - bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0) + if (bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0) iter->pos = bkey_start_pos(k.k); iter->uptodate = BTREE_ITER_UPTODATE; @@ -1737,14 +1725,9 @@ struct bkey_s_c bch2_btree_iter_peek_with_updates(struct btree_iter *iter) struct bkey_s_c bch2_btree_iter_next_with_updates(struct btree_iter *iter) { - if (unlikely(!bkey_cmp(iter->k.p, POS_MAX))) + if (!bch2_btree_iter_advance_pos(iter)) return bkey_s_c_null; - bch2_btree_iter_set_pos(iter, - (iter->flags & BTREE_ITER_IS_EXTENTS) - ? 
iter->k.p - : bkey_successor(iter->k.p)); - return bch2_btree_iter_peek_with_updates(iter); } @@ -1760,7 +1743,10 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) int ret; EBUG_ON(btree_iter_type(iter) != BTREE_ITER_KEYS); - bch2_btree_iter_checks(iter); + bch2_btree_iter_verify(iter); + bch2_btree_iter_verify_entry_exit(iter); + + btree_iter_set_search_pos(iter, iter->pos); if (iter->uptodate == BTREE_ITER_UPTODATE && !bkey_deleted(&iter->k)) @@ -1768,24 +1754,49 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) while (1) { ret = bch2_btree_iter_traverse(iter); - if (unlikely(ret)) - return bkey_s_c_err(ret); + if (unlikely(ret)) { + k = bkey_s_c_err(ret); + goto no_key; + } k = __btree_iter_peek(iter, l); - if (!k.k || bkey_cmp(bkey_start_pos(k.k), pos) > 0) + if (!k.k || + ((iter->flags & BTREE_ITER_IS_EXTENTS) + ? bkey_cmp(bkey_start_pos(k.k), pos) >= 0 + : bkey_cmp(bkey_start_pos(k.k), pos) > 0)) k = __btree_iter_prev(iter, l); if (likely(k.k)) break; - if (!btree_iter_set_pos_to_prev_leaf(iter)) - return bkey_s_c_null; + if (!btree_iter_set_pos_to_prev_leaf(iter)) { + k = bkey_s_c_null; + goto no_key; + } } EBUG_ON(bkey_cmp(bkey_start_pos(k.k), pos) > 0); - iter->pos = bkey_start_pos(k.k); - iter->uptodate = BTREE_ITER_UPTODATE; + + /* Extents can straddle iter->pos: */ + if (bkey_cmp(k.k->p, pos) < 0) + iter->pos = k.k->p; + iter->real_pos = k.k->p; + iter->uptodate = BTREE_ITER_UPTODATE; +out: + bch2_btree_iter_verify_entry_exit(iter); + bch2_btree_iter_verify(iter); return k; +no_key: + /* + * __btree_iter_peek() may have set iter->k to a key we didn't want, and + * then we errored going to the previous leaf - make sure it's + * consistent with iter->pos: + */ + BUG_ON(bkey_cmp(pos, iter->pos) && + bkey_cmp(iter->pos, POS_MIN)); + bkey_init(&iter->k); + iter->k.p = iter->pos; + goto out; } /** @@ -1794,27 +1805,17 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) */ struct bkey_s_c bch2_btree_iter_prev(struct btree_iter *iter) { - struct bpos pos = bkey_start_pos(&iter->k); - - EBUG_ON(btree_iter_type(iter) != BTREE_ITER_KEYS); - bch2_btree_iter_checks(iter); - - if (unlikely(!bkey_cmp(pos, POS_MIN))) + if (!bch2_btree_iter_rewind_pos(iter)) return bkey_s_c_null; - bch2_btree_iter_set_pos(iter, bkey_predecessor(pos)); - return bch2_btree_iter_peek_prev(iter); } static inline struct bkey_s_c __bch2_btree_iter_peek_slot_extents(struct btree_iter *iter) { - struct btree_iter_level *l = &iter->l[0]; - struct btree_node_iter node_iter; struct bkey_s_c k; - struct bkey n; - int ret; + struct bpos pos, next_start; /* keys & holes can't span inode numbers: */ if (iter->pos.offset == KEY_OFFSET_MAX) { @@ -1822,53 +1823,36 @@ __bch2_btree_iter_peek_slot_extents(struct btree_iter *iter) return bkey_s_c_null; bch2_btree_iter_set_pos(iter, bkey_successor(iter->pos)); - - ret = bch2_btree_iter_traverse(iter); - if (unlikely(ret)) - return bkey_s_c_err(ret); } - /* - * iterator is now at the correct position for inserting at iter->pos, - * but we need to keep iterating until we find the first non whiteout so - * we know how big a hole we have, if any: - */ - - node_iter = l->iter; - k = __btree_iter_unpack(iter, l, &iter->k, - bch2_btree_node_iter_peek(&node_iter, l->b)); + pos = iter->pos; + k = bch2_btree_iter_peek(iter); + iter->pos = pos; - if (k.k && bkey_cmp(bkey_start_pos(k.k), iter->pos) <= 0) { - /* - * We're not setting iter->uptodate because the node iterator - * doesn't necessarily point at the key we're returning: - */ - - 
EBUG_ON(bkey_cmp(k.k->p, iter->pos) <= 0); - bch2_btree_iter_verify_level(iter, 0); + if (bkey_err(k)) return k; - } - /* hole */ + if (k.k && bkey_cmp(bkey_start_pos(k.k), iter->pos) <= 0) + return k; - if (!k.k) - k.k = &l->b->key.k; + next_start = k.k ? bkey_start_pos(k.k) : POS_MAX; - bkey_init(&n); - n.p = iter->pos; - bch2_key_resize(&n, + bkey_init(&iter->k); + iter->k.p = iter->pos; + bch2_key_resize(&iter->k, min_t(u64, KEY_SIZE_MAX, - (k.k->p.inode == n.p.inode - ? bkey_start_offset(k.k) + (next_start.inode == iter->pos.inode + ? next_start.offset : KEY_OFFSET_MAX) - - n.p.offset)); + iter->pos.offset)); - EBUG_ON(!n.size); + EBUG_ON(!iter->k.size); - iter->k = n; iter->uptodate = BTREE_ITER_UPTODATE; - bch2_btree_iter_verify_level(iter, 0); + bch2_btree_iter_verify_entry_exit(iter); + bch2_btree_iter_verify(iter); + return (struct bkey_s_c) { &iter->k, NULL }; } @@ -1879,18 +1863,21 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) int ret; EBUG_ON(btree_iter_type(iter) != BTREE_ITER_KEYS); - bch2_btree_iter_checks(iter); + bch2_btree_iter_verify(iter); + bch2_btree_iter_verify_entry_exit(iter); + + btree_iter_set_search_pos(iter, btree_iter_search_key(iter)); if (iter->uptodate == BTREE_ITER_UPTODATE) return btree_iter_peek_uptodate(iter); + if (iter->flags & BTREE_ITER_IS_EXTENTS) + return __bch2_btree_iter_peek_slot_extents(iter); + ret = bch2_btree_iter_traverse(iter); if (unlikely(ret)) return bkey_s_c_err(ret); - if (iter->flags & BTREE_ITER_IS_EXTENTS) - return __bch2_btree_iter_peek_slot_extents(iter); - k = __btree_iter_peek_all(iter, l, &iter->k); EBUG_ON(k.k && bkey_deleted(k.k) && bkey_cmp(k.k->p, iter->pos) == 0); @@ -1903,20 +1890,16 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) } iter->uptodate = BTREE_ITER_UPTODATE; - bch2_btree_iter_verify_level(iter, 0); + bch2_btree_iter_verify_entry_exit(iter); + bch2_btree_iter_verify(iter); return k; } struct bkey_s_c bch2_btree_iter_next_slot(struct btree_iter *iter) { - if (unlikely(!bkey_cmp(iter->k.p, POS_MAX))) + if (!bch2_btree_iter_advance_pos(iter)) return bkey_s_c_null; - bch2_btree_iter_set_pos(iter, - (iter->flags & BTREE_ITER_IS_EXTENTS) - ? 
iter->k.p - : bkey_successor(iter->k.p)); - return bch2_btree_iter_peek_slot(iter); } @@ -1926,7 +1909,7 @@ struct bkey_s_c bch2_btree_iter_peek_cached(struct btree_iter *iter) int ret; EBUG_ON(btree_iter_type(iter) != BTREE_ITER_CACHED); - bch2_btree_iter_checks(iter); + bch2_btree_iter_verify(iter); ret = bch2_btree_iter_traverse(iter); if (unlikely(ret)) @@ -1957,6 +1940,7 @@ static inline void bch2_btree_iter_init(struct btree_trans *trans, bkey_init(&iter->k); iter->k.p = pos; iter->flags = flags; + iter->real_pos = btree_iter_search_key(iter); iter->uptodate = BTREE_ITER_NEED_TRAVERSE; iter->btree_id = btree_id; iter->level = 0; @@ -2096,7 +2080,7 @@ static struct btree_iter *__btree_trans_get_iter(struct btree_trans *trans, if (best && bkey_cmp(bpos_diff(best->pos, pos), - bpos_diff(iter->pos, pos)) < 0) + bpos_diff(iter->real_pos, pos)) < 0) continue; best = iter; @@ -2117,9 +2101,12 @@ static struct btree_iter *__btree_trans_get_iter(struct btree_trans *trans, iter->flags &= ~BTREE_ITER_USER_FLAGS; iter->flags |= flags & BTREE_ITER_USER_FLAGS; - if (iter->flags & BTREE_ITER_INTENT) - bch2_btree_iter_upgrade(iter, 1); - else + if (iter->flags & BTREE_ITER_INTENT) { + if (!iter->locks_want) { + __bch2_btree_iter_unlock(iter); + iter->locks_want = 1; + } + } else bch2_btree_iter_downgrade(iter); BUG_ON(iter->btree_id != btree_id); diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index 9a7f8d0197ec..12c519ae2a60 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -174,7 +174,6 @@ struct bkey_s_c bch2_btree_iter_next_slot(struct btree_iter *); struct bkey_s_c bch2_btree_iter_peek_cached(struct btree_iter *); -void bch2_btree_iter_set_pos_same_leaf(struct btree_iter *, struct bpos); void __bch2_btree_iter_set_pos(struct btree_iter *, struct bpos, bool); void bch2_btree_iter_set_pos(struct btree_iter *, struct bpos); diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c index 244c5dbcd3e9..4357aefdb668 100644 --- a/fs/bcachefs/btree_key_cache.c +++ b/fs/bcachefs/btree_key_cache.c @@ -349,8 +349,6 @@ retry: BTREE_INSERT_NOUNLOCK| BTREE_INSERT_NOCHECK_RW| BTREE_INSERT_NOFAIL| - BTREE_INSERT_USE_RESERVE| - BTREE_INSERT_USE_ALLOC_RESERVE| BTREE_INSERT_JOURNAL_RESERVED| BTREE_INSERT_JOURNAL_RECLAIM); err: @@ -580,6 +578,8 @@ void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc) list_splice(&bc->dirty, &bc->clean); list_for_each_entry_safe(ck, n, &bc->clean, list) { + cond_resched(); + bch2_journal_pin_drop(&c->journal, &ck->journal); bch2_journal_preres_put(&c->journal, &ck->res); @@ -593,6 +593,8 @@ void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc) BUG_ON(bc->nr_keys); list_for_each_entry_safe(ck, n, &bc->freed, list) { + cond_resched(); + list_del(&ck->list); kmem_cache_free(bch2_key_cache, ck); } diff --git a/fs/bcachefs/btree_key_cache.h b/fs/bcachefs/btree_key_cache.h index dad3e344dcf9..2f8b5521718a 100644 --- a/fs/bcachefs/btree_key_cache.h +++ b/fs/bcachefs/btree_key_cache.h @@ -16,7 +16,8 @@ static inline bool bch2_btree_key_cache_must_wait(struct bch_fs *c) size_t nr_keys = READ_ONCE(c->btree_key_cache.nr_keys); size_t max_dirty = 4096 + (nr_keys * 3) / 4; - return nr_dirty > max_dirty; + return nr_dirty > max_dirty && + test_bit(JOURNAL_RECLAIM_STARTED, &c->journal.flags); } struct bkey_cached * diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index dc7de27112c6..03894e923037 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -57,7 +57,7 @@ struct btree_write { 
struct btree_alloc { struct open_buckets ob; - BKEY_PADDED(k); + __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX); }; struct btree_bkey_cached_common { @@ -76,6 +76,7 @@ struct btree { u16 written; u8 nsets; u8 nr_key_bits; + u16 version_ondisk; struct bkey_format format; @@ -247,6 +248,8 @@ enum btree_iter_uptodate { struct btree_iter { struct btree_trans *trans; struct bpos pos; + /* what we're searching for/what the iterator actually points to: */ + struct bpos real_pos; struct bpos pos_after_commit; u16 flags; diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h index adb07043cbb3..a25138080169 100644 --- a/fs/bcachefs/btree_update.h +++ b/fs/bcachefs/btree_update.h @@ -20,7 +20,6 @@ enum btree_insert_flags { __BTREE_INSERT_NOCHECK_RW, __BTREE_INSERT_LAZY_RW, __BTREE_INSERT_USE_RESERVE, - __BTREE_INSERT_USE_ALLOC_RESERVE, __BTREE_INSERT_JOURNAL_REPLAY, __BTREE_INSERT_JOURNAL_RESERVED, __BTREE_INSERT_JOURNAL_RECLAIM, @@ -43,7 +42,6 @@ enum btree_insert_flags { /* for copygc, or when merging btree nodes */ #define BTREE_INSERT_USE_RESERVE (1 << __BTREE_INSERT_USE_RESERVE) -#define BTREE_INSERT_USE_ALLOC_RESERVE (1 << __BTREE_INSERT_USE_ALLOC_RESERVE) /* Insert is for journal replay - don't get journal reservations: */ #define BTREE_INSERT_JOURNAL_REPLAY (1 << __BTREE_INSERT_JOURNAL_REPLAY) diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 8f96756ba648..275dcabbbdd6 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -195,21 +195,18 @@ static struct btree *__bch2_btree_node_alloc(struct bch_fs *c, { struct write_point *wp; struct btree *b; - BKEY_PADDED(k) tmp; + __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX) tmp; struct open_buckets ob = { .nr = 0 }; struct bch_devs_list devs_have = (struct bch_devs_list) { 0 }; unsigned nr_reserve; enum alloc_reserve alloc_reserve; - if (flags & BTREE_INSERT_USE_ALLOC_RESERVE) { + if (flags & BTREE_INSERT_USE_RESERVE) { nr_reserve = 0; - alloc_reserve = RESERVE_ALLOC; - } else if (flags & BTREE_INSERT_USE_RESERVE) { - nr_reserve = BTREE_NODE_RESERVE / 2; - alloc_reserve = RESERVE_BTREE; + alloc_reserve = RESERVE_BTREE_MOVINGGC; } else { nr_reserve = BTREE_NODE_RESERVE; - alloc_reserve = RESERVE_NONE; + alloc_reserve = RESERVE_BTREE; } mutex_lock(&c->btree_reserve_cache_lock); @@ -225,7 +222,10 @@ static struct btree *__bch2_btree_node_alloc(struct bch_fs *c, mutex_unlock(&c->btree_reserve_cache_lock); retry: - wp = bch2_alloc_sectors_start(c, c->opts.foreground_target, 0, + wp = bch2_alloc_sectors_start(c, + c->opts.metadata_target ?: + c->opts.foreground_target, + 0, writepoint_ptr(&c->btree_write_point), &devs_have, res->nr_replicas, @@ -286,6 +286,7 @@ static struct btree *bch2_btree_node_alloc(struct btree_update *as, unsigned lev bch2_bset_init_first(b, &b->data->keys); b->c.level = level; b->c.btree_id = as->btree_id; + b->version_ondisk = c->sb.version; memset(&b->nr, 0, sizeof(b->nr)); b->data->magic = cpu_to_le64(bset_magic(c)); @@ -300,7 +301,6 @@ static struct btree *bch2_btree_node_alloc(struct btree_update *as, unsigned lev bp->v.mem_ptr = 0; bp->v.seq = b->data->keys.seq; bp->v.sectors_written = 0; - bp->v.sectors = cpu_to_le16(c->opts.btree_node_size); } if (c->sb.features & (1ULL << BCH_FEATURE_new_extent_overwrite)) @@ -577,8 +577,6 @@ static void btree_update_nodes_written(struct btree_update *as) bch2_trans_init(&trans, c, 0, 512); ret = __bch2_trans_do(&trans, &as->disk_res, &journal_seq, BTREE_INSERT_NOFAIL| - BTREE_INSERT_USE_RESERVE| - 
BTREE_INSERT_USE_ALLOC_RESERVE| BTREE_INSERT_NOCHECK_RW| BTREE_INSERT_JOURNAL_RECLAIM| BTREE_INSERT_JOURNAL_RESERVED, @@ -1232,6 +1230,9 @@ static void btree_split_insert_keys(struct btree_update *as, struct btree *b, src = n; } + /* Also clear out the unwritten whiteouts area: */ + b->whiteout_u64s = 0; + i->u64s = cpu_to_le16((u64 *) dst - i->_data); set_btree_bset_end(b, b->set); @@ -1457,15 +1458,6 @@ int bch2_btree_split_leaf(struct bch_fs *c, struct btree_iter *iter, struct btree_update *as; struct closure cl; int ret = 0; - struct btree_insert_entry *i; - - /* - * We already have a disk reservation and open buckets pinned; this - * allocation must not block: - */ - trans_for_each_update(trans, i) - if (btree_node_type_needs_gc(i->iter->btree_id)) - flags |= BTREE_INSERT_USE_RESERVE; closure_init_stack(&cl); @@ -1926,10 +1918,7 @@ int bch2_btree_node_update_key(struct bch_fs *c, struct btree_iter *iter, retry: as = bch2_btree_update_start(iter->trans, iter->btree_id, parent ? btree_update_reserve_required(c, parent) : 0, - BTREE_INSERT_NOFAIL| - BTREE_INSERT_USE_RESERVE| - BTREE_INSERT_USE_ALLOC_RESERVE, - &cl); + BTREE_INSERT_NOFAIL, &cl); if (IS_ERR(as)) { ret = PTR_ERR(as); diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index e7816afe4a08..53ea91b32fd5 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -62,9 +62,6 @@ bool bch2_btree_bset_insert_key(struct btree_iter *iter, EBUG_ON(btree_node_just_written(b)); EBUG_ON(bset_written(b, btree_bset_last(b))); EBUG_ON(bkey_deleted(&insert->k) && bkey_val_u64s(&insert->k)); - EBUG_ON(bkey_cmp(b->data->min_key, POS_MIN) && - bkey_cmp(bkey_start_pos(&insert->k), - bkey_predecessor(b->data->min_key)) < 0); EBUG_ON(bkey_cmp(insert->k.p, b->data->min_key) < 0); EBUG_ON(bkey_cmp(insert->k.p, b->data->max_key) > 0); EBUG_ON(insert->k.u64s > @@ -219,7 +216,7 @@ static inline void btree_insert_entry_checks(struct btree_trans *trans, { struct bch_fs *c = trans->c; - BUG_ON(bkey_cmp(insert->k.p, iter->pos)); + BUG_ON(bkey_cmp(insert->k.p, iter->real_pos)); BUG_ON(bch2_debug_check_bkeys && bch2_bkey_invalid(c, bkey_i_to_s_c(insert), __btree_node_type(iter->level, iter->btree_id))); @@ -287,7 +284,8 @@ btree_key_can_insert_cached(struct btree_trans *trans, BUG_ON(iter->level); if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags) && - bch2_btree_key_cache_must_wait(trans->c)) + bch2_btree_key_cache_must_wait(trans->c) && + !(trans->flags & BTREE_INSERT_JOURNAL_RECLAIM)) return BTREE_INSERT_NEED_JOURNAL_RECLAIM; if (u64s <= ck->u64s) @@ -508,6 +506,10 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, /* * Can't be holding any read locks when we go to take write locks: + * another thread could be holding an intent lock on the same node we + * have a read lock on, and it'll block trying to take a write lock + * (because we hold a read lock) and it could be blocking us by holding + * its own read lock (while we're trying to take write locks). 
* * note - this must be done after bch2_trans_journal_preres_get_cold() * or anything else that might call bch2_trans_relock(), since that @@ -515,9 +517,15 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, */ trans_for_each_iter(trans, iter) { if (iter->nodes_locked != iter->nodes_intent_locked) { - EBUG_ON(iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT); - EBUG_ON(trans->iters_live & (1ULL << iter->idx)); - bch2_btree_iter_unlock_noinline(iter); + if ((iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT) || + (trans->iters_live & (1ULL << iter->idx))) { + if (!bch2_btree_iter_upgrade(iter, 1)) { + trace_trans_restart_upgrade(trans->ip); + return -EINTR; + } + } else { + bch2_btree_iter_unlock_noinline(iter); + } } } @@ -695,26 +703,31 @@ static inline int btree_iter_pos_cmp(const struct btree_iter *l, bkey_cmp(l->pos, r->pos); } -static void bch2_trans_update2(struct btree_trans *trans, +static int bch2_trans_update2(struct btree_trans *trans, struct btree_iter *iter, struct bkey_i *insert) { struct btree_insert_entry *i, n = (struct btree_insert_entry) { .iter = iter, .k = insert }; + int ret; btree_insert_entry_checks(trans, n.iter, n.k); - BUG_ON(iter->uptodate > BTREE_ITER_NEED_PEEK); - EBUG_ON(trans->nr_updates2 >= BTREE_ITER_MAX); + ret = bch2_btree_iter_traverse(iter); + if (unlikely(ret)) + return ret; + + BUG_ON(iter->uptodate > BTREE_ITER_NEED_PEEK); + iter->flags |= BTREE_ITER_KEEP_UNTIL_COMMIT; trans_for_each_update2(trans, i) { if (btree_iter_pos_cmp(n.iter, i->iter) == 0) { *i = n; - return; + return 0; } if (btree_iter_pos_cmp(n.iter, i->iter) <= 0) @@ -723,6 +736,7 @@ static void bch2_trans_update2(struct btree_trans *trans, array_insert_item(trans->updates2, trans->nr_updates2, i - trans->updates2, n); + return 0; } static int extent_update_to_keys(struct btree_trans *trans, @@ -743,9 +757,9 @@ static int extent_update_to_keys(struct btree_trans *trans, iter->flags |= BTREE_ITER_INTENT; __bch2_btree_iter_set_pos(iter, insert->k.p, false); - bch2_trans_update2(trans, iter, insert); + ret = bch2_trans_update2(trans, iter, insert); bch2_trans_iter_put(trans, iter); - return 0; + return ret; } static int extent_handle_overwrites(struct btree_trans *trans, @@ -775,8 +789,10 @@ static int extent_handle_overwrites(struct btree_trans *trans, bch2_cut_back(start, update); __bch2_btree_iter_set_pos(update_iter, update->k.p, false); - bch2_trans_update2(trans, update_iter, update); + ret = bch2_trans_update2(trans, update_iter, update); bch2_trans_iter_put(trans, update_iter); + if (ret) + goto err; } if (bkey_cmp(k.k->p, end) > 0) { @@ -790,8 +806,10 @@ static int extent_handle_overwrites(struct btree_trans *trans, bch2_cut_front(end, update); __bch2_btree_iter_set_pos(update_iter, update->k.p, false); - bch2_trans_update2(trans, update_iter, update); + ret = bch2_trans_update2(trans, update_iter, update); bch2_trans_iter_put(trans, update_iter); + if (ret) + goto err; } else { update_iter = bch2_trans_copy_iter(trans, iter); @@ -805,8 +823,10 @@ static int extent_handle_overwrites(struct btree_trans *trans, update->k.size = 0; __bch2_btree_iter_set_pos(update_iter, update->k.p, false); - bch2_trans_update2(trans, update_iter, update); + ret = bch2_trans_update2(trans, update_iter, update); bch2_trans_iter_put(trans, update_iter); + if (ret) + goto err; } k = bch2_btree_iter_next_with_updates(iter); @@ -826,7 +846,7 @@ int __bch2_trans_commit(struct btree_trans *trans) int ret = 0; if (!trans->nr_updates) - goto out_noupdates; + goto out_reset; if (trans->flags & 
BTREE_INSERT_GC_LOCK_HELD) lockdep_assert_held(&trans->c->gc_lock); @@ -840,7 +860,7 @@ int __bch2_trans_commit(struct btree_trans *trans) unlikely(!percpu_ref_tryget(&trans->c->writes))) { ret = bch2_trans_commit_get_rw_cold(trans); if (ret) - return ret; + goto out_reset; } #ifdef CONFIG_BCACHEFS_DEBUG @@ -859,8 +879,8 @@ int __bch2_trans_commit(struct btree_trans *trans) trans_trigger_run = false; trans_for_each_update(trans, i) { - if (unlikely(i->iter->uptodate > BTREE_ITER_NEED_PEEK && - (ret = bch2_btree_iter_traverse(i->iter)))) { + ret = bch2_btree_iter_traverse(i->iter); + if (unlikely(ret)) { trace_trans_restart_traverse(trans->ip); goto out; } @@ -869,8 +889,8 @@ int __bch2_trans_commit(struct btree_trans *trans) * We're not using bch2_btree_iter_upgrade here because * we know trans->nounlock can't be set: */ - if (unlikely(i->iter->locks_want < 1 && - !__bch2_btree_iter_upgrade(i->iter, 1))) { + if (unlikely(!btree_node_intent_locked(i->iter, i->iter->level) && + !__bch2_btree_iter_upgrade(i->iter, i->iter->level + 1))) { trace_trans_restart_upgrade(trans->ip); ret = -EINTR; goto out; @@ -911,17 +931,22 @@ int __bch2_trans_commit(struct btree_trans *trans) trans_for_each_update(trans, i) { if (i->iter->flags & BTREE_ITER_IS_EXTENTS) { ret = extent_update_to_keys(trans, i->iter, i->k); - if (ret) - goto out; } else { - bch2_trans_update2(trans, i->iter, i->k); + ret = bch2_trans_update2(trans, i->iter, i->k); } + if (ret) + goto out; } trans_for_each_update2(trans, i) { - BUG_ON(i->iter->uptodate > BTREE_ITER_NEED_PEEK); BUG_ON(i->iter->locks_want < 1); + ret = bch2_btree_iter_traverse(i->iter); + if (unlikely(ret)) { + trace_trans_restart_traverse(trans->ip); + goto out; + } + u64s = jset_u64s(i->k->k.u64s); if (btree_iter_type(i->iter) == BTREE_ITER_CACHED && likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))) @@ -941,18 +966,14 @@ retry: trans_for_each_iter(trans, iter) if ((trans->iters_live & (1ULL << iter->idx)) && - (iter->flags & BTREE_ITER_SET_POS_AFTER_COMMIT)) { - if (trans->flags & BTREE_INSERT_NOUNLOCK) - bch2_btree_iter_set_pos_same_leaf(iter, iter->pos_after_commit); - else - bch2_btree_iter_set_pos(iter, iter->pos_after_commit); - } + (iter->flags & BTREE_ITER_SET_POS_AFTER_COMMIT)) + bch2_btree_iter_set_pos(iter, iter->pos_after_commit); out: bch2_journal_preres_put(&trans->c->journal, &trans->journal_preres); if (likely(!(trans->flags & BTREE_INSERT_NOCHECK_RW))) percpu_ref_put(&trans->c->writes); -out_noupdates: +out_reset: bch2_trans_reset(trans, !ret ? TRANS_RESET_NOTRAVERSE : 0); return ret; @@ -971,10 +992,22 @@ int bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter, .trigger_flags = flags, .iter = iter, .k = k }; - EBUG_ON(bkey_cmp(iter->pos, - (iter->flags & BTREE_ITER_IS_EXTENTS) - ? bkey_start_pos(&k->k) - : k->k.p)); +#ifdef CONFIG_BCACHEFS_DEBUG + BUG_ON(bkey_cmp(iter->pos, + (iter->flags & BTREE_ITER_IS_EXTENTS) + ? bkey_start_pos(&k->k) + : k->k.p)); + + trans_for_each_update(trans, i) { + BUG_ON(bkey_cmp(i->iter->pos, + (i->iter->flags & BTREE_ITER_IS_EXTENTS) + ? 
bkey_start_pos(&i->k->k) + : i->k->k.p)); + + BUG_ON(i != trans->updates && + btree_iter_pos_cmp(i[-1].iter, i[0].iter) >= 0); + } +#endif iter->flags |= BTREE_ITER_KEEP_UNTIL_COMMIT; @@ -1074,8 +1107,7 @@ int bch2_btree_delete_at(struct btree_trans *trans, bch2_trans_update(trans, iter, &k, 0); return bch2_trans_commit(trans, NULL, NULL, - BTREE_INSERT_NOFAIL| - BTREE_INSERT_USE_RESERVE|flags); + BTREE_INSERT_NOFAIL|flags); } int bch2_btree_delete_range_trans(struct btree_trans *trans, enum btree_id id, diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 1934b845ea15..ba7a472a1bb7 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -137,6 +137,7 @@ void bch2_bucket_seq_cleanup(struct bch_fs *c) void bch2_fs_usage_initialize(struct bch_fs *c) { struct bch_fs_usage *usage; + struct bch_dev *ca; unsigned i; percpu_down_write(&c->mark_lock); @@ -155,6 +156,14 @@ void bch2_fs_usage_initialize(struct bch_fs *c) fs_usage_data_type_to_base(usage, e->data_type, usage->replicas[i]); } + for_each_member_device(ca, c, i) { + struct bch_dev_usage dev = bch2_dev_usage_read(ca); + + usage->hidden += (dev.d[BCH_DATA_sb].buckets + + dev.d[BCH_DATA_journal].buckets) * + ca->mi.bucket_size; + } + percpu_up_write(&c->mark_lock); } @@ -189,14 +198,27 @@ out_pool: return ret; } +static inline struct bch_dev_usage *dev_usage_ptr(struct bch_dev *ca, + unsigned journal_seq, + bool gc) +{ + return this_cpu_ptr(gc + ? ca->usage_gc + : ca->usage[journal_seq & JOURNAL_BUF_MASK]); +} + struct bch_dev_usage bch2_dev_usage_read(struct bch_dev *ca) { + struct bch_fs *c = ca->fs; struct bch_dev_usage ret; + unsigned seq, i, u64s = dev_usage_u64s(); - memset(&ret, 0, sizeof(ret)); - acc_u64s_percpu((u64 *) &ret, - (u64 __percpu *) ca->usage[0], - sizeof(ret) / sizeof(u64)); + do { + seq = read_seqcount_begin(&c->usage_lock); + memcpy(&ret, ca->usage_base, u64s * sizeof(u64)); + for (i = 0; i < ARRAY_SIZE(ca->usage); i++) + acc_u64s_percpu((u64 *) &ret, (u64 __percpu *) ca->usage[i], u64s); + } while (read_seqcount_retry(&c->usage_lock, seq)); return ret; } @@ -261,7 +283,8 @@ retry: void bch2_fs_usage_acc_to_base(struct bch_fs *c, unsigned idx) { - unsigned u64s = fs_usage_u64s(c); + struct bch_dev *ca; + unsigned i, u64s = fs_usage_u64s(c); BUG_ON(idx >= ARRAY_SIZE(c->usage)); @@ -272,6 +295,16 @@ void bch2_fs_usage_acc_to_base(struct bch_fs *c, unsigned idx) (u64 __percpu *) c->usage[idx], u64s); percpu_memset(c->usage[idx], 0, u64s * sizeof(u64)); + rcu_read_lock(); + for_each_member_device_rcu(ca, c, i, NULL) { + u64s = dev_usage_u64s(); + + acc_u64s_percpu((u64 *) ca->usage_base, + (u64 __percpu *) ca->usage[idx], u64s); + percpu_memset(ca->usage[idx], 0, u64s * sizeof(u64)); + } + rcu_read_unlock(); + write_seqcount_end(&c->usage_lock); preempt_enable(); } @@ -376,15 +409,12 @@ static inline int is_unavailable_bucket(struct bucket_mark m) return !is_available_bucket(m); } -static inline int is_fragmented_bucket(struct bucket_mark m, - struct bch_dev *ca) +static inline int bucket_sectors_fragmented(struct bch_dev *ca, + struct bucket_mark m) { - if (!m.owned_by_allocator && - m.data_type == BCH_DATA_user && - bucket_sectors_used(m)) - return max_t(int, 0, (int) ca->mi.bucket_size - - bucket_sectors_used(m)); - return 0; + return bucket_sectors_used(m) + ? 
max(0, (int) ca->mi.bucket_size - (int) bucket_sectors_used(m)) + : 0; } static inline int is_stripe_data_bucket(struct bucket_mark m) @@ -392,11 +422,6 @@ static inline int is_stripe_data_bucket(struct bucket_mark m) return m.stripe && m.data_type != BCH_DATA_parity; } -static inline int bucket_stripe_sectors(struct bucket_mark m) -{ - return is_stripe_data_bucket(m) ? m.dirty_sectors : 0; -} - static inline enum bch_data_type bucket_type(struct bucket_mark m) { return m.cached_sectors && !m.dirty_sectors @@ -456,20 +481,20 @@ static inline void account_bucket(struct bch_fs_usage *fs_usage, if (type == BCH_DATA_sb || type == BCH_DATA_journal) fs_usage->hidden += size; - dev_usage->buckets[type] += nr; + dev_usage->d[type].buckets += nr; } static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca, struct bch_fs_usage *fs_usage, struct bucket_mark old, struct bucket_mark new, - bool gc) + u64 journal_seq, bool gc) { struct bch_dev_usage *u; percpu_rwsem_assert_held(&c->mark_lock); preempt_disable(); - u = this_cpu_ptr(ca->usage[gc]); + u = dev_usage_ptr(ca, journal_seq, gc); if (bucket_type(old)) account_bucket(fs_usage, u, bucket_type(old), @@ -481,50 +506,24 @@ static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca, u->buckets_alloc += (int) new.owned_by_allocator - (int) old.owned_by_allocator; + u->buckets_ec += (int) new.stripe - (int) old.stripe; u->buckets_unavailable += is_unavailable_bucket(new) - is_unavailable_bucket(old); - u->buckets_ec += (int) new.stripe - (int) old.stripe; - u->sectors_ec += bucket_stripe_sectors(new) - - bucket_stripe_sectors(old); - - u->sectors[old.data_type] -= old.dirty_sectors; - u->sectors[new.data_type] += new.dirty_sectors; - u->sectors[BCH_DATA_cached] += + u->d[old.data_type].sectors -= old.dirty_sectors; + u->d[new.data_type].sectors += new.dirty_sectors; + u->d[BCH_DATA_cached].sectors += (int) new.cached_sectors - (int) old.cached_sectors; - u->sectors_fragmented += - is_fragmented_bucket(new, ca) - is_fragmented_bucket(old, ca); + + u->d[old.data_type].fragmented -= bucket_sectors_fragmented(ca, old); + u->d[new.data_type].fragmented += bucket_sectors_fragmented(ca, new); + preempt_enable(); if (!is_available_bucket(old) && is_available_bucket(new)) bch2_wake_allocator(ca); } -__flatten -void bch2_dev_usage_from_buckets(struct bch_fs *c) -{ - struct bch_dev *ca; - struct bucket_mark old = { .v.counter = 0 }; - struct bucket_array *buckets; - struct bucket *g; - unsigned i; - int cpu; - - c->usage_base->hidden = 0; - - for_each_member_device(ca, c, i) { - for_each_possible_cpu(cpu) - memset(per_cpu_ptr(ca->usage[0], cpu), 0, - sizeof(*ca->usage[0])); - - buckets = bucket_array(ca); - - for_each_bucket(g, buckets) - bch2_dev_usage_update(c, ca, c->usage_base, - old, g->mark, false); - } -} - static inline int update_replicas(struct bch_fs *c, struct bch_fs_usage *fs_usage, struct bch_replicas_entry *r, @@ -650,46 +649,6 @@ unwind: ret; \ }) -static int __bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca, - size_t b, struct bucket_mark *ret, - bool gc) -{ - struct bch_fs_usage *fs_usage = fs_usage_ptr(c, 0, gc); - struct bucket *g = __bucket(ca, b, gc); - struct bucket_mark old, new; - - old = bucket_cmpxchg(g, new, ({ - BUG_ON(!is_available_bucket(new)); - - new.owned_by_allocator = true; - new.data_type = 0; - new.cached_sectors = 0; - new.dirty_sectors = 0; - new.gen++; - })); - - bch2_dev_usage_update(c, ca, fs_usage, old, new, gc); - - if (old.cached_sectors) - update_cached_sectors(c, fs_usage, 
ca->dev_idx, - -((s64) old.cached_sectors)); - - if (!gc) - *ret = old; - return 0; -} - -void bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca, - size_t b, struct bucket_mark *old) -{ - do_mark_fn(__bch2_invalidate_bucket, c, gc_phase(GC_PHASE_START), 0, - ca, b, old); - - if (!old->owned_by_allocator && old->cached_sectors) - trace_invalidate(ca, bucket_to_sector(ca, b), - old->cached_sectors); -} - static int __bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca, size_t b, bool owned_by_allocator, bool gc) @@ -702,7 +661,12 @@ static int __bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca, new.owned_by_allocator = owned_by_allocator; })); - bch2_dev_usage_update(c, ca, fs_usage, old, new, gc); + /* + * XXX: this is wrong, this means we'll be doing updates to the percpu + * buckets_alloc counter that don't have an open journal buffer and + * we'll race with the machinery that accumulates that to ca->usage_base + */ + bch2_dev_usage_update(c, ca, fs_usage, old, new, 0, gc); BUG_ON(!gc && !owned_by_allocator && !old.owned_by_allocator); @@ -734,7 +698,8 @@ static int bch2_mark_alloc(struct bch_fs *c, struct bucket_mark old_m, m; /* We don't do anything for deletions - do we?: */ - if (new.k->type != KEY_TYPE_alloc) + if (new.k->type != KEY_TYPE_alloc && + new.k->type != KEY_TYPE_alloc_v2) return 0; /* @@ -757,6 +722,7 @@ static int bch2_mark_alloc(struct bch_fs *c, m.data_type = u.data_type; m.dirty_sectors = u.dirty_sectors; m.cached_sectors = u.cached_sectors; + m.stripe = u.stripe != 0; if (journal_seq) { m.journal_seq_valid = 1; @@ -764,12 +730,14 @@ static int bch2_mark_alloc(struct bch_fs *c, } })); - bch2_dev_usage_update(c, ca, fs_usage, old_m, m, gc); + bch2_dev_usage_update(c, ca, fs_usage, old_m, m, journal_seq, gc); g->io_time[READ] = u.read_time; g->io_time[WRITE] = u.write_time; g->oldest_gen = u.oldest_gen; g->gen_valid = 1; + g->stripe = u.stripe; + g->stripe_redundancy = u.stripe_redundancy; /* * need to know if we're getting called from the invalidate path or @@ -827,7 +795,7 @@ static int __bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca, if (c) bch2_dev_usage_update(c, ca, fs_usage_ptr(c, 0, gc), - old, new, gc); + old, new, 0, gc); return 0; } @@ -964,11 +932,10 @@ static int check_bucket_ref(struct bch_fs *c, struct bkey_s_c k, return 0; } -static int bucket_set_stripe(struct bch_fs *c, struct bkey_s_c k, +static int mark_stripe_bucket(struct bch_fs *c, struct bkey_s_c k, unsigned ptr_idx, struct bch_fs_usage *fs_usage, - u64 journal_seq, unsigned flags, - bool enabled) + u64 journal_seq, unsigned flags) { const struct bch_stripe *s = bkey_s_c_to_stripe(k).v; unsigned nr_data = s->nr_blocks - s->nr_redundant; @@ -981,8 +948,13 @@ static int bucket_set_stripe(struct bch_fs *c, struct bkey_s_c k, char buf[200]; int ret; - if (enabled) - g->ec_redundancy = s->nr_redundant; + if (g->stripe && g->stripe != k.k->p.offset) { + bch2_fs_inconsistent(c, + "bucket %u:%zu gen %u: multiple stripes using same bucket\n%s", + ptr->dev, PTR_BUCKET_NR(ca, ptr), new.gen, + (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)); + return -EINVAL; + } old = bucket_cmpxchg(g, new, ({ ret = check_bucket_ref(c, k, ptr, 0, 0, new.gen, new.data_type, @@ -990,23 +962,9 @@ static int bucket_set_stripe(struct bch_fs *c, struct bkey_s_c k, if (ret) return ret; - if (new.stripe && enabled) - bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, - "bucket %u:%zu gen %u: multiple stripes using same bucket\n%s", - ptr->dev, PTR_BUCKET_NR(ca, ptr), new.gen, - 
(bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)); - - if (!new.stripe && !enabled) - bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, - "bucket %u:%zu gen %u: deleting stripe but not marked\n%s", - ptr->dev, PTR_BUCKET_NR(ca, ptr), new.gen, - (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)); - - new.stripe = enabled; - - if ((flags & BTREE_TRIGGER_GC) && parity) { - new.data_type = enabled ? BCH_DATA_parity : 0; - new.dirty_sectors = enabled ? le16_to_cpu(s->sectors): 0; + if (parity) { + new.data_type = BCH_DATA_parity; + new.dirty_sectors = le16_to_cpu(s->sectors); } if (journal_seq) { @@ -1015,10 +973,10 @@ static int bucket_set_stripe(struct bch_fs *c, struct bkey_s_c k, } })); - if (!enabled) - g->ec_redundancy = 0; + g->stripe = k.k->p.offset; + g->stripe_redundancy = s->nr_redundant; - bch2_dev_usage_update(c, ca, fs_usage, old, new, gc); + bch2_dev_usage_update(c, ca, fs_usage, old, new, journal_seq, gc); return 0; } @@ -1085,7 +1043,7 @@ static int bch2_mark_pointer(struct bch_fs *c, struct bkey_s_c k, old.v.counter, new.v.counter)) != old.v.counter); - bch2_dev_usage_update(c, ca, fs_usage, old, new, gc); + bch2_dev_usage_update(c, ca, fs_usage, old, new, journal_seq, gc); BUG_ON(!gc && bucket_became_unavailable(old, new)); @@ -1212,6 +1170,8 @@ static int bch2_mark_stripe(struct bch_fs *c, unsigned i; int ret; + BUG_ON(gc && old_s); + if (!m || (old_s && !m->alive)) { bch_err_ratelimited(c, "error marking nonexistent stripe %zu", idx); @@ -1219,48 +1179,12 @@ static int bch2_mark_stripe(struct bch_fs *c, } if (!new_s) { - /* Deleting: */ - for (i = 0; i < old_s->nr_blocks; i++) { - ret = bucket_set_stripe(c, old, i, fs_usage, - journal_seq, flags, false); - if (ret) - return ret; - } - - if (!gc && m->on_heap) { - spin_lock(&c->ec_stripes_heap_lock); - bch2_stripes_heap_del(c, m, idx); - spin_unlock(&c->ec_stripes_heap_lock); - } - - if (gc) - update_replicas(c, fs_usage, &m->r.e, - -((s64) m->sectors * m->nr_redundant)); + spin_lock(&c->ec_stripes_heap_lock); + bch2_stripes_heap_del(c, m, idx); + spin_unlock(&c->ec_stripes_heap_lock); memset(m, 0, sizeof(*m)); } else { - BUG_ON(old_s && new_s->nr_blocks != old_s->nr_blocks); - BUG_ON(old_s && new_s->nr_redundant != old_s->nr_redundant); - - for (i = 0; i < new_s->nr_blocks; i++) { - if (!old_s || - memcmp(new_s->ptrs + i, - old_s->ptrs + i, - sizeof(struct bch_extent_ptr))) { - - if (old_s) { - bucket_set_stripe(c, old, i, fs_usage, - journal_seq, flags, false); - if (ret) - return ret; - } - ret = bucket_set_stripe(c, new, i, fs_usage, - journal_seq, flags, true); - if (ret) - return ret; - } - } - m->alive = true; m->sectors = le16_to_cpu(new_s->sectors); m->algorithm = new_s->algorithm; @@ -1274,16 +1198,8 @@ static int bch2_mark_stripe(struct bch_fs *c, m->blocks_nonempty += !!m->block_sectors[i]; } - if (gc && old_s) - update_replicas(c, fs_usage, &m->r.e, - -((s64) m->sectors * m->nr_redundant)); - bch2_bkey_to_replicas(&m->r.e, new); - if (gc) - update_replicas(c, fs_usage, &m->r.e, - ((s64) m->sectors * m->nr_redundant)); - if (!gc) { spin_lock(&c->ec_stripes_heap_lock); bch2_stripes_heap_update(c, m, idx); @@ -1291,6 +1207,25 @@ static int bch2_mark_stripe(struct bch_fs *c, } } + if (gc) { + /* + * gc recalculates this field from stripe ptr + * references: + */ + memset(m->block_sectors, 0, sizeof(m->block_sectors)); + m->blocks_nonempty = 0; + + for (i = 0; i < new_s->nr_blocks; i++) { + ret = mark_stripe_bucket(c, new, i, fs_usage, + journal_seq, flags); + if (ret) + return ret; + } + + update_replicas(c, fs_usage, 
&m->r.e, + ((s64) m->sectors * m->nr_redundant)); + } + return 0; } @@ -1314,6 +1249,7 @@ static int bch2_mark_key_locked(struct bch_fs *c, switch (k.k->type) { case KEY_TYPE_alloc: + case KEY_TYPE_alloc_v2: ret = bch2_mark_alloc(c, old, new, fs_usage, journal_seq, flags); break; case KEY_TYPE_btree_ptr: @@ -1382,9 +1318,6 @@ int bch2_mark_update(struct btree_trans *trans, unsigned flags) { struct bch_fs *c = trans->c; - struct btree *b = iter_l(iter)->b; - struct btree_node_iter node_iter = iter_l(iter)->iter; - struct bkey_packed *_old; struct bkey_s_c old; struct bkey unpacked; int ret = 0; @@ -1424,23 +1357,24 @@ int bch2_mark_update(struct btree_trans *trans, BTREE_TRIGGER_OVERWRITE|flags); } } else { + struct btree_iter *copy; + BUG_ON(btree_iter_type(iter) == BTREE_ITER_CACHED); bch2_mark_key_locked(c, old, bkey_i_to_s_c(new), 0, new->k.size, fs_usage, trans->journal_res.seq, BTREE_TRIGGER_INSERT|flags); - while ((_old = bch2_btree_node_iter_peek(&node_iter, b))) { - unsigned offset = 0; - s64 sectors; + copy = bch2_trans_copy_iter(trans, iter); - old = bkey_disassemble(b, _old, &unpacked); - sectors = -((s64) old.k->size); + for_each_btree_key_continue(copy, 0, old, ret) { + unsigned offset = 0; + s64 sectors = -((s64) old.k->size); flags |= BTREE_TRIGGER_OVERWRITE; if (bkey_cmp(new->k.p, bkey_start_pos(old.k)) <= 0) - return 0; + break; switch (bch2_extent_overlap(&new->k, old.k)) { case BCH_EXTENT_OVERLAP_ALL: @@ -1473,9 +1407,8 @@ int bch2_mark_update(struct btree_trans *trans, trans->journal_res.seq, flags) ?: 1; if (ret <= 0) break; - - bch2_btree_node_iter_advance(&node_iter, b); } + bch2_trans_iter_put(trans, copy); } return ret; @@ -1506,27 +1439,20 @@ void bch2_trans_fs_usage_apply(struct btree_trans *trans, pr_err("overlapping with"); if (btree_iter_type(i->iter) != BTREE_ITER_CACHED) { - struct btree *b = iter_l(i->iter)->b; - struct btree_node_iter node_iter = iter_l(i->iter)->iter; - struct bkey_packed *_k; - - while ((_k = bch2_btree_node_iter_peek(&node_iter, b))) { - struct bkey unpacked; - struct bkey_s_c k; + struct btree_iter *copy = bch2_trans_copy_iter(trans, i->iter); + struct bkey_s_c k; + int ret; - pr_info("_k %px format %u", _k, _k->format); - k = bkey_disassemble(b, _k, &unpacked); - - if (btree_node_is_extents(b) + for_each_btree_key_continue(copy, 0, k, ret) { + if (btree_node_type_is_extents(i->iter->btree_id) ? 
bkey_cmp(i->k->k.p, bkey_start_pos(k.k)) <= 0 : bkey_cmp(i->k->k.p, k.k->p)) break; bch2_bkey_val_to_text(&PBUF(buf), c, k); pr_err("%s", buf); - - bch2_btree_node_iter_advance(&node_iter, b); } + bch2_trans_iter_put(trans, copy); } else { struct bkey_cached *ck = (void *) i->iter->l[0].b; @@ -1582,9 +1508,10 @@ static int trans_get_key(struct btree_trans *trans, return ret; } -static int bch2_trans_start_alloc_update(struct btree_trans *trans, struct btree_iter **_iter, - const struct bch_extent_ptr *ptr, - struct bkey_alloc_unpacked *u) +static struct bkey_alloc_buf * +bch2_trans_start_alloc_update(struct btree_trans *trans, struct btree_iter **_iter, + const struct bch_extent_ptr *ptr, + struct bkey_alloc_unpacked *u) { struct bch_fs *c = trans->c; struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); @@ -1592,8 +1519,13 @@ static int bch2_trans_start_alloc_update(struct btree_trans *trans, struct btree struct bucket *g; struct btree_iter *iter; struct bkey_s_c k; + struct bkey_alloc_buf *a; int ret; + a = bch2_trans_kmalloc(trans, sizeof(struct bkey_alloc_buf)); + if (IS_ERR(a)) + return a; + iter = trans_get_update(trans, BTREE_ID_ALLOC, pos, &k); if (iter) { *u = bch2_alloc_unpack(k); @@ -1605,17 +1537,17 @@ static int bch2_trans_start_alloc_update(struct btree_trans *trans, struct btree ret = bch2_btree_iter_traverse(iter); if (ret) { bch2_trans_iter_put(trans, iter); - return ret; + return ERR_PTR(ret); } percpu_down_read(&c->mark_lock); g = bucket(ca, pos.offset); - *u = alloc_mem_to_key(g, READ_ONCE(g->mark)); + *u = alloc_mem_to_key(iter, g, READ_ONCE(g->mark)); percpu_up_read(&c->mark_lock); } *_iter = iter; - return 0; + return a; } static int bch2_trans_mark_pointer(struct btree_trans *trans, @@ -1625,34 +1557,27 @@ static int bch2_trans_mark_pointer(struct btree_trans *trans, struct bch_fs *c = trans->c; struct btree_iter *iter; struct bkey_alloc_unpacked u; - struct bkey_i_alloc *a; + struct bkey_alloc_buf *a; int ret; - ret = bch2_trans_start_alloc_update(trans, &iter, &p.ptr, &u); - if (ret) - return ret; + a = bch2_trans_start_alloc_update(trans, &iter, &p.ptr, &u); + if (IS_ERR(a)) + return PTR_ERR(a); ret = __mark_pointer(c, k, &p.ptr, sectors, data_type, u.gen, &u.data_type, &u.dirty_sectors, &u.cached_sectors); if (ret) goto out; - a = bch2_trans_kmalloc(trans, BKEY_ALLOC_U64s_MAX * 8); - ret = PTR_ERR_OR_ZERO(a); - if (ret) - goto out; - - bkey_alloc_init(&a->k_i); - a->k.p = iter->pos; - bch2_alloc_pack(a, u); - bch2_trans_update(trans, iter, &a->k_i, 0); + bch2_alloc_pack(c, a, u); + bch2_trans_update(trans, iter, &a->k, 0); out: bch2_trans_iter_put(trans, iter); return ret; } static int bch2_trans_mark_stripe_ptr(struct btree_trans *trans, - struct bch_extent_stripe_ptr p, + struct extent_ptr_decoded p, s64 sectors, enum bch_data_type data_type) { struct bch_fs *c = trans->c; @@ -1662,14 +1587,22 @@ static int bch2_trans_mark_stripe_ptr(struct btree_trans *trans, struct bch_replicas_padded r; int ret = 0; - ret = trans_get_key(trans, BTREE_ID_EC, POS(0, p.idx), &iter, &k); + ret = trans_get_key(trans, BTREE_ID_EC, POS(0, p.ec.idx), &iter, &k); if (ret < 0) return ret; if (k.k->type != KEY_TYPE_stripe) { bch2_fs_inconsistent(c, "pointer to nonexistent stripe %llu", - (u64) p.idx); + (u64) p.ec.idx); + ret = -EIO; + goto out; + } + + if (!bch2_ptr_matches_stripe(bkey_s_c_to_stripe(k).v, p)) { + bch2_fs_inconsistent(c, + "stripe pointer doesn't match stripe %llu", + (u64) p.ec.idx); ret = -EIO; goto out; } @@ -1680,8 +1613,8 @@ static int 
bch2_trans_mark_stripe_ptr(struct btree_trans *trans, goto out; bkey_reassemble(&s->k_i, k); - stripe_blockcount_set(&s->v, p.block, - stripe_blockcount_get(&s->v, p.block) + + stripe_blockcount_set(&s->v, p.ec.block, + stripe_blockcount_get(&s->v, p.ec.block) + sectors); bch2_trans_update(trans, iter, &s->k_i, 0); @@ -1732,7 +1665,7 @@ static int bch2_trans_mark_extent(struct btree_trans *trans, dirty_sectors += disk_sectors; r.e.devs[r.e.nr_devs++] = p.ptr.dev; } else { - ret = bch2_trans_mark_stripe_ptr(trans, p.ec, + ret = bch2_trans_mark_stripe_ptr(trans, p, disk_sectors, data_type); if (ret) return ret; @@ -1748,34 +1681,51 @@ static int bch2_trans_mark_extent(struct btree_trans *trans, } static int bch2_trans_mark_stripe_alloc_ref(struct btree_trans *trans, - const struct bch_extent_ptr *ptr, - s64 sectors, bool parity) + struct bkey_s_c_stripe s, + unsigned idx, bool deleting) { - struct bkey_i_alloc *a; + struct bch_fs *c = trans->c; + const struct bch_extent_ptr *ptr = &s.v->ptrs[idx]; + struct bkey_alloc_buf *a; struct btree_iter *iter; struct bkey_alloc_unpacked u; - int ret; + bool parity = idx >= s.v->nr_blocks - s.v->nr_redundant; + int ret = 0; - ret = bch2_trans_start_alloc_update(trans, &iter, ptr, &u); - if (ret) - return ret; + a = bch2_trans_start_alloc_update(trans, &iter, ptr, &u); + if (IS_ERR(a)) + return PTR_ERR(a); if (parity) { + s64 sectors = le16_to_cpu(s.v->sectors); + + if (deleting) + sectors = -sectors; + u.dirty_sectors += sectors; u.data_type = u.dirty_sectors ? BCH_DATA_parity : 0; } - a = bch2_trans_kmalloc(trans, BKEY_ALLOC_U64s_MAX * 8); - ret = PTR_ERR_OR_ZERO(a); - if (ret) - goto err; + if (!deleting) { + if (bch2_fs_inconsistent_on(u.stripe && u.stripe != s.k->p.offset, c, + "bucket %llu:%llu gen %u: multiple stripes using same bucket (%u, %llu)", + iter->pos.inode, iter->pos.offset, u.gen, + u.stripe, s.k->p.offset)) { + ret = -EIO; + goto err; + } - bkey_alloc_init(&a->k_i); - a->k.p = iter->pos; - bch2_alloc_pack(a, u); - bch2_trans_update(trans, iter, &a->k_i, 0); + u.stripe = s.k->p.offset; + u.stripe_redundancy = s.v->nr_redundant; + } else { + u.stripe = 0; + u.stripe_redundancy = 0; + } + + bch2_alloc_pack(c, a, u); + bch2_trans_update(trans, iter, &a->k, 0); err: bch2_trans_iter_put(trans, iter); return ret; @@ -1785,51 +1735,50 @@ static int bch2_trans_mark_stripe(struct btree_trans *trans, struct bkey_s_c old, struct bkey_s_c new, unsigned flags) { - const struct bch_stripe *old_s = old.k->type == KEY_TYPE_stripe - ? bkey_s_c_to_stripe(old).v : NULL; - const struct bch_stripe *new_s = new.k->type == KEY_TYPE_stripe - ? 
bkey_s_c_to_stripe(new).v : NULL; + struct bkey_s_c_stripe old_s = { NULL }; + struct bkey_s_c_stripe new_s = { NULL }; struct bch_replicas_padded r; unsigned i; int ret = 0; + if (old.k->type == KEY_TYPE_stripe) + old_s = bkey_s_c_to_stripe(old); + if (new.k->type == KEY_TYPE_stripe) + new_s = bkey_s_c_to_stripe(new); + /* * If the pointers aren't changing, we don't need to do anything: */ - if (new_s && old_s && - !memcmp(old_s->ptrs, new_s->ptrs, - new_s->nr_blocks * sizeof(struct bch_extent_ptr))) + if (new_s.k && old_s.k && + new_s.v->nr_blocks == old_s.v->nr_blocks && + new_s.v->nr_redundant == old_s.v->nr_redundant && + !memcmp(old_s.v->ptrs, new_s.v->ptrs, + new_s.v->nr_blocks * sizeof(struct bch_extent_ptr))) return 0; - if (new_s) { - unsigned nr_data = new_s->nr_blocks - new_s->nr_redundant; - s64 sectors = le16_to_cpu(new_s->sectors); + if (new_s.k) { + s64 sectors = le16_to_cpu(new_s.v->sectors); bch2_bkey_to_replicas(&r.e, new); - update_replicas_list(trans, &r.e, sectors * new_s->nr_redundant); + update_replicas_list(trans, &r.e, sectors * new_s.v->nr_redundant); - for (i = 0; i < new_s->nr_blocks; i++) { - bool parity = i >= nr_data; - - ret = bch2_trans_mark_stripe_alloc_ref(trans, - &new_s->ptrs[i], sectors, parity); + for (i = 0; i < new_s.v->nr_blocks; i++) { + ret = bch2_trans_mark_stripe_alloc_ref(trans, new_s, + i, false); if (ret) return ret; } } - if (old_s) { - unsigned nr_data = old_s->nr_blocks - old_s->nr_redundant; - s64 sectors = -((s64) le16_to_cpu(old_s->sectors)); + if (old_s.k) { + s64 sectors = -((s64) le16_to_cpu(old_s.v->sectors)); bch2_bkey_to_replicas(&r.e, old); - update_replicas_list(trans, &r.e, sectors * old_s->nr_redundant); - - for (i = 0; i < old_s->nr_blocks; i++) { - bool parity = i >= nr_data; + update_replicas_list(trans, &r.e, sectors * old_s.v->nr_redundant); - ret = bch2_trans_mark_stripe_alloc_ref(trans, - &old_s->ptrs[i], sectors, parity); + for (i = 0; i < old_s.v->nr_blocks; i++) { + ret = bch2_trans_mark_stripe_alloc_ref(trans, old_s, + i, true); if (ret) return ret; } @@ -1898,8 +1847,6 @@ static int __bch2_trans_mark_reflink_p(struct btree_trans *trans, } bch2_btree_iter_set_pos(iter, bkey_start_pos(k.k)); - BUG_ON(iter->uptodate > BTREE_ITER_NEED_PEEK); - bch2_trans_update(trans, iter, n, 0); out: ret = sectors; @@ -2025,15 +1972,13 @@ int bch2_trans_mark_update(struct btree_trans *trans, BTREE_TRIGGER_OVERWRITE|flags); } } else { - struct btree *b = iter_l(iter)->b; - struct btree_node_iter node_iter = iter_l(iter)->iter; - struct bkey_packed *_old; - struct bkey unpacked; + struct btree_iter *copy; + struct bkey _old; EBUG_ON(btree_iter_type(iter) == BTREE_ITER_CACHED); - bkey_init(&unpacked); - old = (struct bkey_s_c) { &unpacked, NULL }; + bkey_init(&_old); + old = (struct bkey_s_c) { &_old, NULL }; ret = bch2_trans_mark_key(trans, old, bkey_i_to_s_c(new), 0, new->k.size, @@ -2041,18 +1986,16 @@ int bch2_trans_mark_update(struct btree_trans *trans, if (ret) return ret; - while ((_old = bch2_btree_node_iter_peek(&node_iter, b))) { - unsigned flags = BTREE_TRIGGER_OVERWRITE; - unsigned offset = 0; - s64 sectors; + copy = bch2_trans_copy_iter(trans, iter); - old = bkey_disassemble(b, _old, &unpacked); - sectors = -((s64) old.k->size); + for_each_btree_key_continue(copy, 0, old, ret) { + unsigned offset = 0; + s64 sectors = -((s64) old.k->size); flags |= BTREE_TRIGGER_OVERWRITE; if (bkey_cmp(new->k.p, bkey_start_pos(old.k)) <= 0) - return 0; + break; switch (bch2_extent_overlap(&new->k, old.k)) { case BCH_EXTENT_OVERLAP_ALL: @@ 
-2083,15 +2026,169 @@ int bch2_trans_mark_update(struct btree_trans *trans, ret = bch2_trans_mark_key(trans, old, bkey_i_to_s_c(new), offset, sectors, flags); if (ret) - return ret; - - bch2_btree_node_iter_advance(&node_iter, b); + break; } + bch2_trans_iter_put(trans, copy); } return ret; } +static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans, + struct bch_dev *ca, size_t b, + enum bch_data_type type, + unsigned sectors) +{ + struct bch_fs *c = trans->c; + struct btree_iter *iter; + struct bkey_alloc_unpacked u; + struct bkey_alloc_buf *a; + struct bch_extent_ptr ptr = { + .dev = ca->dev_idx, + .offset = bucket_to_sector(ca, b), + }; + int ret = 0; + + a = bch2_trans_start_alloc_update(trans, &iter, &ptr, &u); + if (IS_ERR(a)) + return PTR_ERR(a); + + if (u.data_type && u.data_type != type) { + bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, + "bucket %llu:%llu gen %u different types of data in same bucket: %s, %s\n" + "while marking %s", + iter->pos.inode, iter->pos.offset, u.gen, + bch2_data_types[u.data_type], + bch2_data_types[type], + bch2_data_types[type]); + ret = -EIO; + goto out; + } + + if ((unsigned) (u.dirty_sectors + sectors) > ca->mi.bucket_size) { + bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, + "bucket %llu:%llu gen %u data type %s sector count overflow: %u + %u > %u\n" + "while marking %s", + iter->pos.inode, iter->pos.offset, u.gen, + bch2_data_types[u.data_type ?: type], + u.dirty_sectors, sectors, ca->mi.bucket_size, + bch2_data_types[type]); + ret = -EIO; + goto out; + } + + if (u.data_type == type && + u.dirty_sectors == sectors) + goto out; + + u.data_type = type; + u.dirty_sectors = sectors; + + bch2_alloc_pack(c, a, u); + bch2_trans_update(trans, iter, &a->k, 0); +out: + bch2_trans_iter_put(trans, iter); + return ret; +} + +int bch2_trans_mark_metadata_bucket(struct btree_trans *trans, + struct disk_reservation *res, + struct bch_dev *ca, size_t b, + enum bch_data_type type, + unsigned sectors) +{ + return __bch2_trans_do(trans, res, NULL, 0, + __bch2_trans_mark_metadata_bucket(trans, ca, b, type, + sectors)); + +} + +static int bch2_trans_mark_metadata_sectors(struct btree_trans *trans, + struct disk_reservation *res, + struct bch_dev *ca, + u64 start, u64 end, + enum bch_data_type type, + u64 *bucket, unsigned *bucket_sectors) +{ + int ret = 0; + + do { + u64 b = sector_to_bucket(ca, start); + unsigned sectors = + min_t(u64, bucket_to_sector(ca, b + 1), end) - start; + + if (b != *bucket) { + if (*bucket_sectors) { + ret = bch2_trans_mark_metadata_bucket(trans, res, ca, + *bucket, type, *bucket_sectors); + if (ret) + return ret; + } + + *bucket = b; + *bucket_sectors = 0; + } + + *bucket_sectors += sectors; + start += sectors; + } while (!ret && start < end); + + return 0; +} + +static int __bch2_trans_mark_dev_sb(struct btree_trans *trans, + struct disk_reservation *res, + struct bch_dev *ca) +{ + struct bch_sb_layout *layout = &ca->disk_sb.sb->layout; + u64 bucket = 0; + unsigned i, bucket_sectors = 0; + int ret; + + for (i = 0; i < layout->nr_superblocks; i++) { + u64 offset = le64_to_cpu(layout->sb_offset[i]); + + if (offset == BCH_SB_SECTOR) { + ret = bch2_trans_mark_metadata_sectors(trans, res, ca, + 0, BCH_SB_SECTOR, + BCH_DATA_sb, &bucket, &bucket_sectors); + if (ret) + return ret; + } + + ret = bch2_trans_mark_metadata_sectors(trans, res, ca, offset, + offset + (1 << layout->sb_max_size_bits), + BCH_DATA_sb, &bucket, &bucket_sectors); + if (ret) + return ret; + } + + if (bucket_sectors) { + ret = 
bch2_trans_mark_metadata_bucket(trans, res, ca, + bucket, BCH_DATA_sb, bucket_sectors); + if (ret) + return ret; + } + + for (i = 0; i < ca->journal.nr; i++) { + ret = bch2_trans_mark_metadata_bucket(trans, res, ca, + ca->journal.buckets[i], + BCH_DATA_journal, ca->mi.bucket_size); + if (ret) + return ret; + } + + return 0; +} + +int bch2_trans_mark_dev_sb(struct bch_fs *c, + struct disk_reservation *res, + struct bch_dev *ca) +{ + return bch2_trans_do(c, res, NULL, 0, + __bch2_trans_mark_dev_sb(&trans, res, ca)); +} + /* Disk reservations: */ void __bch2_disk_reservation_put(struct bch_fs *c, struct disk_reservation *res) @@ -2107,7 +2204,7 @@ void __bch2_disk_reservation_put(struct bch_fs *c, struct disk_reservation *res) #define SECTORS_CACHE 1024 int bch2_disk_reservation_add(struct bch_fs *c, struct disk_reservation *res, - unsigned sectors, int flags) + u64 sectors, int flags) { struct bch_fs_pcpu *pcpu; u64 old, v, get; @@ -2192,7 +2289,7 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) ca->mi.bucket_size / c->opts.btree_node_size); /* XXX: these should be tunable */ size_t reserve_none = max_t(size_t, 1, nbuckets >> 9); - size_t copygc_reserve = max_t(size_t, 2, nbuckets >> 7); + size_t copygc_reserve = max_t(size_t, 2, nbuckets >> 6); size_t free_inc_nr = max(max_t(size_t, 1, nbuckets >> 12), btree_reserve * 2); bool resize = ca->buckets[0] != NULL; @@ -2209,7 +2306,6 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) !(buckets_nouse = kvpmalloc(BITS_TO_LONGS(nbuckets) * sizeof(unsigned long), GFP_KERNEL|__GFP_ZERO)) || - !init_fifo(&free[RESERVE_BTREE], btree_reserve, GFP_KERNEL) || !init_fifo(&free[RESERVE_MOVINGGC], copygc_reserve, GFP_KERNEL) || !init_fifo(&free[RESERVE_NONE], reserve_none, GFP_KERNEL) || @@ -2296,13 +2392,24 @@ void bch2_dev_buckets_free(struct bch_dev *ca) sizeof(struct bucket_array) + ca->mi.nbuckets * sizeof(struct bucket)); - free_percpu(ca->usage[0]); + for (i = 0; i < ARRAY_SIZE(ca->usage); i++) + free_percpu(ca->usage[i]); + kfree(ca->usage_base); } int bch2_dev_buckets_alloc(struct bch_fs *c, struct bch_dev *ca) { - if (!(ca->usage[0] = alloc_percpu(struct bch_dev_usage))) + unsigned i; + + ca->usage_base = kzalloc(sizeof(struct bch_dev_usage), GFP_KERNEL); + if (!ca->usage_base) return -ENOMEM; + for (i = 0; i < ARRAY_SIZE(ca->usage); i++) { + ca->usage[i] = alloc_percpu(struct bch_dev_usage); + if (!ca->usage[i]) + return -ENOMEM; + } + return bch2_dev_buckets_resize(c, ca, ca->mi.nbuckets); } diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h index 3a5ed1fcaf78..6d15c455e7cc 100644 --- a/fs/bcachefs/buckets.h +++ b/fs/bcachefs/buckets.h @@ -58,20 +58,13 @@ static inline struct bucket *bucket(struct bch_dev *ca, size_t b) return __bucket(ca, b, false); } -static inline u16 bucket_last_io(struct bch_fs *c, struct bucket *g, int rw) -{ - return c->bucket_clock[rw].hand - g->io_time[rw]; -} - /* * bucket_gc_gen() returns the difference between the bucket's current gen and * the oldest gen of any pointer into that bucket in the btree. 
*/ -static inline u8 bucket_gc_gen(struct bch_dev *ca, size_t b) +static inline u8 bucket_gc_gen(struct bucket *g) { - struct bucket *g = bucket(ca, b); - return g->mark.gen - g->oldest_gen; } @@ -153,18 +146,9 @@ static inline unsigned bucket_sectors_used(struct bucket_mark mark) return mark.dirty_sectors + mark.cached_sectors; } -static inline bool bucket_unused(struct bucket_mark mark) -{ - return !mark.owned_by_allocator && - !mark.data_type && - !bucket_sectors_used(mark); -} - static inline bool is_available_bucket(struct bucket_mark mark) { - return (!mark.owned_by_allocator && - !mark.dirty_sectors && - !mark.stripe); + return !mark.dirty_sectors && !mark.stripe; } static inline bool bucket_needs_journal_commit(struct bucket_mark m, @@ -178,8 +162,6 @@ static inline bool bucket_needs_journal_commit(struct bucket_mark m, struct bch_dev_usage bch2_dev_usage_read(struct bch_dev *); -void bch2_dev_usage_from_buckets(struct bch_fs *); - static inline u64 __dev_buckets_available(struct bch_dev *ca, struct bch_dev_usage stats) { @@ -223,6 +205,11 @@ static inline unsigned fs_usage_u64s(struct bch_fs *c) READ_ONCE(c->replicas.nr); } +static inline unsigned dev_usage_u64s(void) +{ + return sizeof(struct bch_dev_usage) / sizeof(u64); +} + void bch2_fs_usage_scratch_put(struct bch_fs *, struct bch_fs_usage *); struct bch_fs_usage *bch2_fs_usage_scratch_get(struct bch_fs *); @@ -245,8 +232,6 @@ bch2_fs_usage_read_short(struct bch_fs *); void bch2_bucket_seq_cleanup(struct bch_fs *); void bch2_fs_usage_initialize(struct bch_fs *); -void bch2_invalidate_bucket(struct bch_fs *, struct bch_dev *, - size_t, struct bucket_mark *); void bch2_mark_alloc_bucket(struct bch_fs *, struct bch_dev *, size_t, bool, struct gc_pos, unsigned); void bch2_mark_metadata_bucket(struct bch_fs *, struct bch_dev *, @@ -270,6 +255,12 @@ int bch2_trans_mark_update(struct btree_trans *, struct btree_iter *iter, struct bkey_i *insert, unsigned); void bch2_trans_fs_usage_apply(struct btree_trans *, struct bch_fs_usage *); +int bch2_trans_mark_metadata_bucket(struct btree_trans *, + struct disk_reservation *, struct bch_dev *, + size_t, enum bch_data_type, unsigned); +int bch2_trans_mark_dev_sb(struct bch_fs *, struct disk_reservation *, + struct bch_dev *); + /* disk reservations: */ void __bch2_disk_reservation_put(struct bch_fs *, struct disk_reservation *); @@ -284,8 +275,8 @@ static inline void bch2_disk_reservation_put(struct bch_fs *c, #define BCH_DISK_RESERVATION_NOFAIL (1 << 0) int bch2_disk_reservation_add(struct bch_fs *, - struct disk_reservation *, - unsigned, int); + struct disk_reservation *, + u64, int); static inline struct disk_reservation bch2_disk_reservation_init(struct bch_fs *c, unsigned nr_replicas) @@ -302,8 +293,7 @@ bch2_disk_reservation_init(struct bch_fs *c, unsigned nr_replicas) static inline int bch2_disk_reservation_get(struct bch_fs *c, struct disk_reservation *res, - unsigned sectors, - unsigned nr_replicas, + u64 sectors, unsigned nr_replicas, int flags) { *res = bch2_disk_reservation_init(c, nr_replicas); diff --git a/fs/bcachefs/buckets_types.h b/fs/bcachefs/buckets_types.h index d6057d22b18e..404c89a7a264 100644 --- a/fs/bcachefs/buckets_types.h +++ b/fs/bcachefs/buckets_types.h @@ -37,11 +37,12 @@ struct bucket { const struct bucket_mark mark; }; - u16 io_time[2]; + u64 io_time[2]; u8 oldest_gen; u8 gc_gen; unsigned gen_valid:1; - u8 ec_redundancy; + u8 stripe_redundancy; + u32 stripe; }; struct bucket_array { @@ -52,16 +53,15 @@ struct bucket_array { }; struct bch_dev_usage { - u64 
buckets[BCH_DATA_NR]; u64 buckets_alloc; + u64 buckets_ec; u64 buckets_unavailable; - /* _compressed_ sectors: */ - u64 sectors[BCH_DATA_NR]; - u64 sectors_fragmented; - - u64 buckets_ec; - u64 sectors_ec; + struct { + u64 buckets; + u64 sectors; /* _compressed_ sectors: */ + u64 fragmented; + } d[BCH_DATA_NR]; }; struct bch_fs_usage { diff --git a/fs/bcachefs/chardev.c b/fs/bcachefs/chardev.c index e7c8969aaad1..49842ec88390 100644 --- a/fs/bcachefs/chardev.c +++ b/fs/bcachefs/chardev.c @@ -477,11 +477,11 @@ static long bch2_ioctl_dev_usage(struct bch_fs *c, arg.nr_buckets = ca->mi.nbuckets - ca->mi.first_bucket; arg.available_buckets = arg.nr_buckets - src.buckets_unavailable; arg.ec_buckets = src.buckets_ec; - arg.ec_sectors = src.sectors_ec; + arg.ec_sectors = 0; for (i = 0; i < BCH_DATA_NR; i++) { - arg.buckets[i] = src.buckets[i]; - arg.sectors[i] = src.sectors[i]; + arg.buckets[i] = src.d[i].buckets; + arg.sectors[i] = src.d[i].sectors; } percpu_ref_put(&ca->ref); diff --git a/fs/bcachefs/checksum.c b/fs/bcachefs/checksum.c index a01073e54a33..3d88719ba86c 100644 --- a/fs/bcachefs/checksum.c +++ b/fs/bcachefs/checksum.c @@ -10,7 +10,7 @@ #include <linux/random.h> #include <linux/scatterlist.h> #include <crypto/algapi.h> -#include <crypto/chacha20.h> +#include <crypto/chacha.h> #include <crypto/hash.h> #include <crypto/poly1305.h> #include <crypto/skcipher.h> @@ -68,21 +68,21 @@ static u64 bch2_checksum_update(unsigned type, u64 crc, const void *data, size_t } } -static inline void do_encrypt_sg(struct crypto_skcipher *tfm, +static inline void do_encrypt_sg(struct crypto_sync_skcipher *tfm, struct nonce nonce, struct scatterlist *sg, size_t len) { - SKCIPHER_REQUEST_ON_STACK(req, tfm); + SYNC_SKCIPHER_REQUEST_ON_STACK(req, tfm); int ret; - skcipher_request_set_tfm(req, tfm); + skcipher_request_set_sync_tfm(req, tfm); skcipher_request_set_crypt(req, sg, sg, len, nonce.d); ret = crypto_skcipher_encrypt(req); BUG_ON(ret); } -static inline void do_encrypt(struct crypto_skcipher *tfm, +static inline void do_encrypt(struct crypto_sync_skcipher *tfm, struct nonce nonce, void *buf, size_t len) { @@ -95,8 +95,8 @@ static inline void do_encrypt(struct crypto_skcipher *tfm, int bch2_chacha_encrypt_key(struct bch_key *key, struct nonce nonce, void *buf, size_t len) { - struct crypto_skcipher *chacha20 = - crypto_alloc_skcipher("chacha20", 0, 0); + struct crypto_sync_skcipher *chacha20 = + crypto_alloc_sync_skcipher("chacha20", 0, 0); int ret; if (!chacha20) { @@ -104,7 +104,8 @@ int bch2_chacha_encrypt_key(struct bch_key *key, struct nonce nonce, return PTR_ERR(chacha20); } - ret = crypto_skcipher_setkey(chacha20, (void *) key, sizeof(*key)); + ret = crypto_skcipher_setkey(&chacha20->base, + (void *) key, sizeof(*key)); if (ret) { pr_err("crypto_skcipher_setkey() error: %i", ret); goto err; @@ -112,7 +113,7 @@ int bch2_chacha_encrypt_key(struct bch_key *key, struct nonce nonce, do_encrypt(chacha20, nonce, buf, len); err: - crypto_free_skcipher(chacha20); + crypto_free_sync_skcipher(chacha20); return ret; } @@ -199,7 +200,7 @@ static struct bch_csum __bch2_checksum_bio(struct bch_fs *c, unsigned type, kunmap_atomic(p); } #else - __bio_for_each_contig_segment(bv, bio, *iter, *iter) + __bio_for_each_bvec(bv, bio, *iter, *iter) crc = bch2_checksum_update(type, crc, page_address(bv.bv_page) + bv.bv_offset, bv.bv_len); @@ -224,7 +225,7 @@ static struct bch_csum __bch2_checksum_bio(struct bch_fs *c, unsigned type, kunmap_atomic(p); } #else - __bio_for_each_contig_segment(bv, bio, *iter, *iter) + 
__bio_for_each_bvec(bv, bio, *iter, *iter) crypto_shash_update(desc, page_address(bv.bv_page) + bv.bv_offset, bv.bv_len); @@ -463,7 +464,7 @@ err: static int bch2_alloc_ciphers(struct bch_fs *c) { if (!c->chacha20) - c->chacha20 = crypto_alloc_skcipher("chacha20", 0, 0); + c->chacha20 = crypto_alloc_sync_skcipher("chacha20", 0, 0); if (IS_ERR(c->chacha20)) { bch_err(c, "error requesting chacha20 module: %li", PTR_ERR(c->chacha20)); @@ -546,7 +547,7 @@ int bch2_enable_encryption(struct bch_fs *c, bool keyed) goto err; } - ret = crypto_skcipher_setkey(c->chacha20, + ret = crypto_skcipher_setkey(&c->chacha20->base, (void *) &key.key, sizeof(key.key)); if (ret) goto err; @@ -574,7 +575,7 @@ void bch2_fs_encryption_exit(struct bch_fs *c) if (!IS_ERR_OR_NULL(c->poly1305)) crypto_free_shash(c->poly1305); if (!IS_ERR_OR_NULL(c->chacha20)) - crypto_free_skcipher(c->chacha20); + crypto_free_sync_skcipher(c->chacha20); if (!IS_ERR_OR_NULL(c->sha256)) crypto_free_shash(c->sha256); } @@ -606,7 +607,7 @@ int bch2_fs_encryption_init(struct bch_fs *c) if (ret) goto out; - ret = crypto_skcipher_setkey(c->chacha20, + ret = crypto_skcipher_setkey(&c->chacha20->base, (void *) &key.key, sizeof(key.key)); if (ret) goto out; diff --git a/fs/bcachefs/checksum.h b/fs/bcachefs/checksum.h index 833537cc8fd0..24dee8039d57 100644 --- a/fs/bcachefs/checksum.h +++ b/fs/bcachefs/checksum.h @@ -7,7 +7,7 @@ #include "super-io.h" #include <linux/crc64.h> -#include <crypto/chacha20.h> +#include <crypto/chacha.h> static inline bool bch2_checksum_mergeable(unsigned type) { @@ -138,9 +138,9 @@ static inline bool bch2_crc_cmp(struct bch_csum l, struct bch_csum r) /* for skipping ahead and encrypting/decrypting at an offset: */ static inline struct nonce nonce_add(struct nonce nonce, unsigned offset) { - EBUG_ON(offset & (CHACHA20_BLOCK_SIZE - 1)); + EBUG_ON(offset & (CHACHA_BLOCK_SIZE - 1)); - le32_add_cpu(&nonce.d[0], offset / CHACHA20_BLOCK_SIZE); + le32_add_cpu(&nonce.d[0], offset / CHACHA_BLOCK_SIZE); return nonce; } diff --git a/fs/bcachefs/clock.c b/fs/bcachefs/clock.c index 1d1590de55e8..4324cfe7eed0 100644 --- a/fs/bcachefs/clock.c +++ b/fs/bcachefs/clock.c @@ -19,7 +19,7 @@ void bch2_io_timer_add(struct io_clock *clock, struct io_timer *timer) spin_lock(&clock->timer_lock); - if (time_after_eq((unsigned long) atomic_long_read(&clock->now), + if (time_after_eq((unsigned long) atomic64_read(&clock->now), timer->expire)) { spin_unlock(&clock->timer_lock); timer->fn(timer); @@ -146,7 +146,7 @@ static struct io_timer *get_expired_timer(struct io_clock *clock, void __bch2_increment_clock(struct io_clock *clock, unsigned sectors) { struct io_timer *timer; - unsigned long now = atomic_long_add_return(sectors, &clock->now); + unsigned long now = atomic64_add_return(sectors, &clock->now); while ((timer = get_expired_timer(clock, now))) timer->fn(timer); @@ -158,7 +158,7 @@ void bch2_io_timers_to_text(struct printbuf *out, struct io_clock *clock) unsigned i; spin_lock(&clock->timer_lock); - now = atomic_long_read(&clock->now); + now = atomic64_read(&clock->now); for (i = 0; i < clock->timers.used; i++) pr_buf(out, "%ps:\t%li\n", @@ -175,7 +175,7 @@ void bch2_io_clock_exit(struct io_clock *clock) int bch2_io_clock_init(struct io_clock *clock) { - atomic_long_set(&clock->now, 0); + atomic64_set(&clock->now, 0); spin_lock_init(&clock->timer_lock); clock->max_slop = IO_CLOCK_PCPU_SECTORS * num_possible_cpus(); diff --git a/fs/bcachefs/clock_types.h b/fs/bcachefs/clock_types.h index 92c740a47565..5fae0012d808 100644 --- 
a/fs/bcachefs/clock_types.h +++ b/fs/bcachefs/clock_types.h @@ -26,7 +26,7 @@ struct io_timer { typedef HEAP(struct io_timer *) io_timer_heap; struct io_clock { - atomic_long_t now; + atomic64_t now; u16 __percpu *pcpu_buf; unsigned max_slop; diff --git a/fs/bcachefs/compress.c b/fs/bcachefs/compress.c index 0d68a277cfd7..f63651d291e5 100644 --- a/fs/bcachefs/compress.c +++ b/fs/bcachefs/compress.c @@ -45,7 +45,7 @@ static bool bio_phys_contig(struct bio *bio, struct bvec_iter start) struct bvec_iter iter; void *expected_start = NULL; - __bio_for_each_segment(bv, bio, iter, start) { + __bio_for_each_bvec(bv, bio, iter, start) { if (expected_start && expected_start != page_address(bv.bv_page) + bv.bv_offset) return false; @@ -336,8 +336,19 @@ static int attempt_compress(struct bch_fs *c, ZSTD_CCtx *ctx = ZSTD_initCCtx(workspace, ZSTD_CCtxWorkspaceBound(c->zstd_params.cParams)); + /* + * ZSTD requires that when we decompress we pass in the exact + * compressed size - rounding it up to the nearest sector + * doesn't work, so we use the first 4 bytes of the buffer for + * that. + * + * Additionally, the ZSTD code seems to have a bug where it will + * write just past the end of the buffer - so subtract a fudge + * factor (7 bytes) from the dst buffer size to account for + * that. + */ size_t len = ZSTD_compressCCtx(ctx, - dst + 4, dst_len - 4, + dst + 4, dst_len - 4 - 7, src, src_len, c->zstd_params); if (ZSTD_isError(len)) diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c index bbe3fefa2651..06dbca32e189 100644 --- a/fs/bcachefs/debug.c +++ b/fs/bcachefs/debug.c @@ -79,7 +79,7 @@ void __bch2_btree_verify(struct bch_fs *c, struct btree *b) memcpy(n_ondisk, n_sorted, btree_bytes(c)); - if (bch2_btree_node_read_done(c, v, false)) + if (bch2_btree_node_read_done(c, ca, v, false)) goto out; n_sorted = c->verify_data->data; diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index eb03adc2d533..ec871b5eb992 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -4,7 +4,7 @@ #include "bcachefs.h" #include "alloc_foreground.h" -#include "bkey_on_stack.h" +#include "bkey_buf.h" #include "bset.h" #include "btree_gc.h" #include "btree_update.h" @@ -105,6 +105,9 @@ const char *bch2_stripe_invalid(const struct bch_fs *c, struct bkey_s_c k) { const struct bch_stripe *s = bkey_s_c_to_stripe(k).v; + if (!bkey_cmp(k.k->p, POS_MIN)) + return "stripe at pos 0"; + if (k.k->p.inode) return "invalid stripe key"; @@ -138,44 +141,18 @@ void bch2_stripe_to_text(struct printbuf *out, struct bch_fs *c, stripe_blockcount_get(s, i)); } -static int ptr_matches_stripe(struct bch_fs *c, - struct bch_stripe *v, - const struct bch_extent_ptr *ptr) +/* returns blocknr in stripe that we matched: */ +static int bkey_matches_stripe(struct bch_stripe *s, + struct bkey_s_c k) { - unsigned i; + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const struct bch_extent_ptr *ptr; + unsigned i, nr_data = s->nr_blocks - s->nr_redundant; - for (i = 0; i < v->nr_blocks - v->nr_redundant; i++) { - const struct bch_extent_ptr *ptr2 = v->ptrs + i; - - if (ptr->dev == ptr2->dev && - ptr->gen == ptr2->gen && - ptr->offset >= ptr2->offset && - ptr->offset < ptr2->offset + le16_to_cpu(v->sectors)) - return i; - } - - return -1; -} - -static int extent_matches_stripe(struct bch_fs *c, - struct bch_stripe *v, - struct bkey_s_c k) -{ - - switch (k.k->type) { - case KEY_TYPE_extent: { - struct bkey_s_c_extent e = bkey_s_c_to_extent(k); - const struct bch_extent_ptr *ptr; - int idx; - - extent_for_each_ptr(e, ptr) { - idx = ptr_matches_stripe(c, v, 
ptr); - if (idx >= 0) - return idx; - } - break; - } - } + bkey_for_each_ptr(ptrs, ptr) + for (i = 0; i < nr_data; i++) + if (__bch2_ptr_matches_stripe(s, ptr, i)) + return i; return -1; } @@ -200,46 +177,95 @@ static bool extent_has_stripe_ptr(struct bkey_s_c k, u64 idx) return false; } +/* Stripe bufs: */ + +static void ec_stripe_buf_exit(struct ec_stripe_buf *buf) +{ + unsigned i; + + for (i = 0; i < buf->key.v.nr_blocks; i++) { + kvpfree(buf->data[i], buf->size << 9); + buf->data[i] = NULL; + } +} + +static int ec_stripe_buf_init(struct ec_stripe_buf *buf, + unsigned offset, unsigned size) +{ + struct bch_stripe *v = &buf->key.v; + unsigned csum_granularity = 1U << v->csum_granularity_bits; + unsigned end = offset + size; + unsigned i; + + BUG_ON(end > le16_to_cpu(v->sectors)); + + offset = round_down(offset, csum_granularity); + end = min_t(unsigned, le16_to_cpu(v->sectors), + round_up(end, csum_granularity)); + + buf->offset = offset; + buf->size = end - offset; + + memset(buf->valid, 0xFF, sizeof(buf->valid)); + + for (i = 0; i < buf->key.v.nr_blocks; i++) { + buf->data[i] = kvpmalloc(buf->size << 9, GFP_KERNEL); + if (!buf->data[i]) + goto err; + } + + return 0; +err: + ec_stripe_buf_exit(buf); + return -ENOMEM; +} + /* Checksumming: */ -static void ec_generate_checksums(struct ec_stripe_buf *buf) +static struct bch_csum ec_block_checksum(struct ec_stripe_buf *buf, + unsigned block, unsigned offset) { struct bch_stripe *v = &buf->key.v; unsigned csum_granularity = 1 << v->csum_granularity_bits; - unsigned csums_per_device = stripe_csums_per_device(v); - unsigned csum_bytes = bch_crc_bytes[v->csum_type]; - unsigned i, j; + unsigned end = buf->offset + buf->size; + unsigned len = min(csum_granularity, end - offset); + + BUG_ON(offset >= end); + BUG_ON(offset < buf->offset); + BUG_ON(offset & (csum_granularity - 1)); + BUG_ON(offset + len != le16_to_cpu(v->sectors) && + (len & (csum_granularity - 1))); + + return bch2_checksum(NULL, v->csum_type, + null_nonce(), + buf->data[block] + ((offset - buf->offset) << 9), + len << 9); +} - if (!csum_bytes) +static void ec_generate_checksums(struct ec_stripe_buf *buf) +{ + struct bch_stripe *v = &buf->key.v; + unsigned i, j, csums_per_device = stripe_csums_per_device(v); + + if (!v->csum_type) return; BUG_ON(buf->offset); BUG_ON(buf->size != le16_to_cpu(v->sectors)); - for (i = 0; i < v->nr_blocks; i++) { - for (j = 0; j < csums_per_device; j++) { - unsigned offset = j << v->csum_granularity_bits; - unsigned len = min(csum_granularity, buf->size - offset); - - struct bch_csum csum = - bch2_checksum(NULL, v->csum_type, - null_nonce(), - buf->data[i] + (offset << 9), - len << 9); - - memcpy(stripe_csum(v, i, j), &csum, csum_bytes); - } - } + for (i = 0; i < v->nr_blocks; i++) + for (j = 0; j < csums_per_device; j++) + stripe_csum_set(v, i, j, + ec_block_checksum(buf, i, j << v->csum_granularity_bits)); } static void ec_validate_checksums(struct bch_fs *c, struct ec_stripe_buf *buf) { struct bch_stripe *v = &buf->key.v; unsigned csum_granularity = 1 << v->csum_granularity_bits; - unsigned csum_bytes = bch_crc_bytes[v->csum_type]; unsigned i; - if (!csum_bytes) + if (!v->csum_type) return; for (i = 0; i < v->nr_blocks; i++) { @@ -252,21 +278,18 @@ static void ec_validate_checksums(struct bch_fs *c, struct ec_stripe_buf *buf) while (offset < end) { unsigned j = offset >> v->csum_granularity_bits; unsigned len = min(csum_granularity, end - offset); - struct bch_csum csum; + struct bch_csum want = stripe_csum_get(v, i, j); + struct bch_csum got = 
ec_block_checksum(buf, i, offset); - BUG_ON(offset & (csum_granularity - 1)); - BUG_ON(offset + len != le16_to_cpu(v->sectors) && - ((offset + len) & (csum_granularity - 1))); + if (bch2_crc_cmp(want, got)) { + char buf2[200]; - csum = bch2_checksum(NULL, v->csum_type, - null_nonce(), - buf->data[i] + ((offset - buf->offset) << 9), - len << 9); + bch2_bkey_val_to_text(&PBUF(buf2), c, bkey_i_to_s_c(&buf->key.k_i)); - if (memcmp(stripe_csum(v, i, j), &csum, csum_bytes)) { bch_err_ratelimited(c, - "checksum error while doing reconstruct read (%u:%u)", - i, j); + "stripe checksum error for %ps at %u:%u: csum type %u, expected %llx got %llx\n%s", + (void *) _RET_IP_, i, j, v->csum_type, + want.lo, got.lo, buf2); clear_bit(i, buf->valid); break; } @@ -287,20 +310,16 @@ static void ec_generate_ec(struct ec_stripe_buf *buf) raid_gen(nr_data, v->nr_redundant, bytes, buf->data); } -static unsigned __ec_nr_failed(struct ec_stripe_buf *buf, unsigned nr) -{ - return nr - bitmap_weight(buf->valid, nr); -} - static unsigned ec_nr_failed(struct ec_stripe_buf *buf) { - return __ec_nr_failed(buf, buf->key.v.nr_blocks); + return buf->key.v.nr_blocks - + bitmap_weight(buf->valid, buf->key.v.nr_blocks); } static int ec_do_recov(struct bch_fs *c, struct ec_stripe_buf *buf) { struct bch_stripe *v = &buf->key.v; - unsigned i, failed[EC_STRIPE_MAX], nr_failed = 0; + unsigned i, failed[BCH_BKEY_PTRS_MAX], nr_failed = 0; unsigned nr_data = v->nr_blocks - v->nr_redundant; unsigned bytes = buf->size << 9; @@ -323,6 +342,8 @@ static int ec_do_recov(struct bch_fs *c, struct ec_stripe_buf *buf) static void ec_block_endio(struct bio *bio) { struct ec_bio *ec_bio = container_of(bio, struct ec_bio, bio); + struct bch_stripe *v = &ec_bio->buf->key.v; + struct bch_extent_ptr *ptr = &v->ptrs[ec_bio->idx]; struct bch_dev *ca = ec_bio->ca; struct closure *cl = bio->bi_private; @@ -331,6 +352,13 @@ static void ec_block_endio(struct bio *bio) bch2_blk_status_to_str(bio->bi_status))) clear_bit(ec_bio->idx, ec_bio->buf->valid); + if (ptr_stale(ca, ptr)) { + bch_err_ratelimited(ca->fs, + "error %s stripe: stale pointer after io", + bio_data_dir(bio) == READ ? "reading from" : "writing to"); + clear_bit(ec_bio->idx, ec_bio->buf->valid); + } + bio_put(&ec_bio->bio); percpu_ref_put(&ca->io_ref); closure_put(cl); @@ -347,6 +375,14 @@ static void ec_block_io(struct bch_fs *c, struct ec_stripe_buf *buf, ? BCH_DATA_user : BCH_DATA_parity; + if (ptr_stale(ca, ptr)) { + bch_err_ratelimited(c, + "error %s stripe: stale pointer", + rw == READ ? 
"reading from" : "writing to"); + clear_bit(idx, buf->valid); + return; + } + if (!bch2_dev_get_ioref(ca, rw)) { clear_bit(idx, buf->valid); return; @@ -389,87 +425,77 @@ static void ec_block_io(struct bch_fs *c, struct ec_stripe_buf *buf, percpu_ref_put(&ca->io_ref); } -/* recovery read path: */ -int bch2_ec_read_extent(struct bch_fs *c, struct bch_read_bio *rbio) +static int get_stripe_key(struct bch_fs *c, u64 idx, struct ec_stripe_buf *stripe) { struct btree_trans trans; struct btree_iter *iter; + struct bkey_s_c k; + int ret; + + bch2_trans_init(&trans, c, 0, 0); + iter = bch2_trans_get_iter(&trans, BTREE_ID_EC, POS(0, idx), BTREE_ITER_SLOTS); + k = bch2_btree_iter_peek_slot(iter); + ret = bkey_err(k); + if (ret) + goto err; + if (k.k->type != KEY_TYPE_stripe) { + ret = -ENOENT; + goto err; + } + bkey_reassemble(&stripe->key.k_i, k); +err: + bch2_trans_exit(&trans); + return ret; +} + +/* recovery read path: */ +int bch2_ec_read_extent(struct bch_fs *c, struct bch_read_bio *rbio) +{ struct ec_stripe_buf *buf; struct closure cl; - struct bkey_s_c k; struct bch_stripe *v; - unsigned stripe_idx; - unsigned offset, end; - unsigned i, nr_data, csum_granularity; - int ret = 0, idx; + unsigned i, offset; + int ret = 0; closure_init_stack(&cl); BUG_ON(!rbio->pick.has_ec); - stripe_idx = rbio->pick.ec.idx; - buf = kzalloc(sizeof(*buf), GFP_NOIO); if (!buf) return -ENOMEM; - bch2_trans_init(&trans, c, 0, 0); - - iter = bch2_trans_get_iter(&trans, BTREE_ID_EC, - POS(0, stripe_idx), - BTREE_ITER_SLOTS); - k = bch2_btree_iter_peek_slot(iter); - if (bkey_err(k) || k.k->type != KEY_TYPE_stripe) { + ret = get_stripe_key(c, rbio->pick.ec.idx, buf); + if (ret) { bch_err_ratelimited(c, - "error doing reconstruct read: stripe not found"); + "error doing reconstruct read: error %i looking up stripe", ret); kfree(buf); - return bch2_trans_exit(&trans) ?: -EIO; + return -EIO; } - bkey_reassemble(&buf->key.k_i, k); - bch2_trans_exit(&trans); - v = &buf->key.v; - nr_data = v->nr_blocks - v->nr_redundant; - - idx = ptr_matches_stripe(c, v, &rbio->pick.ptr); - BUG_ON(idx < 0); - - csum_granularity = 1U << v->csum_granularity_bits; - - offset = rbio->bio.bi_iter.bi_sector - v->ptrs[idx].offset; - end = offset + bio_sectors(&rbio->bio); - - BUG_ON(end > le16_to_cpu(v->sectors)); - - buf->offset = round_down(offset, csum_granularity); - buf->size = min_t(unsigned, le16_to_cpu(v->sectors), - round_up(end, csum_granularity)) - buf->offset; - - for (i = 0; i < v->nr_blocks; i++) { - buf->data[i] = kmalloc(buf->size << 9, GFP_NOIO); - if (!buf->data[i]) { - ret = -ENOMEM; - goto err; - } + if (!bch2_ptr_matches_stripe(v, rbio->pick)) { + bch_err_ratelimited(c, + "error doing reconstruct read: pointer doesn't match stripe"); + ret = -EIO; + goto err; } - memset(buf->valid, 0xFF, sizeof(buf->valid)); - - for (i = 0; i < v->nr_blocks; i++) { - struct bch_extent_ptr *ptr = v->ptrs + i; - struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); + offset = rbio->bio.bi_iter.bi_sector - v->ptrs[rbio->pick.ec.block].offset; + if (offset + bio_sectors(&rbio->bio) > le16_to_cpu(v->sectors)) { + bch_err_ratelimited(c, + "error doing reconstruct read: read is bigger than stripe"); + ret = -EIO; + goto err; + } - if (ptr_stale(ca, ptr)) { - bch_err_ratelimited(c, - "error doing reconstruct read: stale pointer"); - clear_bit(i, buf->valid); - continue; - } + ret = ec_stripe_buf_init(buf, offset, bio_sectors(&rbio->bio)); + if (ret) + goto err; + for (i = 0; i < v->nr_blocks; i++) ec_block_io(c, buf, REQ_OP_READ, i, &cl); - } 
closure_sync(&cl); @@ -487,10 +513,9 @@ int bch2_ec_read_extent(struct bch_fs *c, struct bch_read_bio *rbio) goto err; memcpy_to_bio(&rbio->bio, rbio->bio.bi_iter, - buf->data[idx] + ((offset - buf->offset) << 9)); + buf->data[rbio->pick.ec.block] + ((offset - buf->offset) << 9)); err: - for (i = 0; i < v->nr_blocks; i++) - kfree(buf->data[i]); + ec_stripe_buf_exit(buf); kfree(buf); return ret; } @@ -643,7 +668,6 @@ void bch2_stripes_heap_update(struct bch_fs *c, static int ec_stripe_delete(struct bch_fs *c, size_t idx) { - //pr_info("deleting stripe %zu", idx); return bch2_btree_delete_range(c, BTREE_ID_EC, POS(0, idx), POS(0, idx + 1), @@ -675,13 +699,14 @@ static void ec_stripe_delete_work(struct work_struct *work) /* stripe creation: */ static int ec_stripe_bkey_insert(struct bch_fs *c, - struct ec_stripe_new *s, - struct bkey_i_stripe *stripe) + struct bkey_i_stripe *stripe, + struct disk_reservation *res) { struct btree_trans trans; struct btree_iter *iter; struct bkey_s_c k; - struct bpos start_pos = POS(0, c->ec_stripe_hint); + struct bpos min_pos = POS(0, 1); + struct bpos start_pos = bpos_max(min_pos, POS(0, c->ec_stripe_hint)); int ret; bch2_trans_init(&trans, c, 0, 0); @@ -692,7 +717,7 @@ retry: BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) { if (bkey_cmp(k.k->p, POS(0, U32_MAX)) > 0) { if (start_pos.offset) { - start_pos = POS_MIN; + start_pos = min_pos; bch2_btree_iter_set_pos(iter, start_pos); continue; } @@ -717,7 +742,7 @@ found_slot: bch2_trans_update(&trans, iter, &stripe->k_i, 0); - ret = bch2_trans_commit(&trans, &s->res, NULL, + ret = bch2_trans_commit(&trans, res, NULL, BTREE_INSERT_NOFAIL); err: bch2_trans_iter_put(&trans, iter); @@ -731,6 +756,46 @@ err: return ret; } +static int ec_stripe_bkey_update(struct btree_trans *trans, + struct bkey_i_stripe *new) +{ + struct btree_iter *iter; + struct bkey_s_c k; + const struct bch_stripe *existing; + unsigned i; + int ret; + + iter = bch2_trans_get_iter(trans, BTREE_ID_EC, + new->k.p, BTREE_ITER_INTENT); + k = bch2_btree_iter_peek_slot(iter); + ret = bkey_err(k); + if (ret) + goto err; + + if (!k.k || k.k->type != KEY_TYPE_stripe) { + bch_err(trans->c, "error updating stripe: not found"); + ret = -ENOENT; + goto err; + } + + existing = bkey_s_c_to_stripe(k).v; + + if (existing->nr_blocks != new->v.nr_blocks) { + bch_err(trans->c, "error updating stripe: nr_blocks does not match"); + ret = -EINVAL; + goto err; + } + + for (i = 0; i < new->v.nr_blocks; i++) + stripe_blockcount_set(&new->v, i, + stripe_blockcount_get(existing, i)); + + bch2_trans_update(trans, iter, &new->k_i, 0); +err: + bch2_trans_iter_put(trans, iter); + return ret; +} + static void extent_stripe_ptr_add(struct bkey_s_extent e, struct ec_stripe_buf *s, struct bch_extent_ptr *ptr, @@ -745,6 +810,7 @@ static void extent_stripe_ptr_add(struct bkey_s_extent e, *dst = (struct bch_extent_stripe_ptr) { .type = 1 << BCH_EXTENT_ENTRY_stripe_ptr, .block = block, + .redundancy = s->key.v.nr_redundant, .idx = s->key.k.p.offset, }; } @@ -757,10 +823,10 @@ static int ec_stripe_update_ptrs(struct bch_fs *c, struct btree_iter *iter; struct bkey_s_c k; struct bkey_s_extent e; - struct bkey_on_stack sk; - int ret = 0, dev, idx; + struct bkey_buf sk; + int ret = 0, dev, block; - bkey_on_stack_init(&sk); + bch2_bkey_buf_init(&sk); bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); /* XXX this doesn't support the reflink btree */ @@ -779,29 +845,28 @@ static int ec_stripe_update_ptrs(struct bch_fs *c, continue; } - idx = extent_matches_stripe(c, &s->key.v, k); - if (idx < 0) { + 
block = bkey_matches_stripe(&s->key.v, k); + if (block < 0) { bch2_btree_iter_next(iter); continue; } - dev = s->key.v.ptrs[idx].dev; + dev = s->key.v.ptrs[block].dev; - bkey_on_stack_reassemble(&sk, c, k); + bch2_bkey_buf_reassemble(&sk, c, k); e = bkey_i_to_s_extent(sk.k); bch2_bkey_drop_ptrs(e.s, ptr, ptr->dev != dev); ec_ptr = (void *) bch2_bkey_has_device(e.s_c, dev); BUG_ON(!ec_ptr); - extent_stripe_ptr_add(e, s, ec_ptr, idx); + extent_stripe_ptr_add(e, s, ec_ptr, block); bch2_btree_iter_set_pos(iter, bkey_start_pos(&sk.k->k)); bch2_trans_update(&trans, iter, sk.k, 0); ret = bch2_trans_commit(&trans, NULL, NULL, - BTREE_INSERT_NOFAIL| - BTREE_INSERT_USE_RESERVE); + BTREE_INSERT_NOFAIL); if (ret == -EINTR) ret = 0; if (ret) @@ -809,7 +874,7 @@ static int ec_stripe_update_ptrs(struct bch_fs *c, } bch2_trans_exit(&trans); - bkey_on_stack_exit(&sk, c); + bch2_bkey_buf_exit(&sk, c); return ret; } @@ -823,14 +888,13 @@ static void ec_stripe_create(struct ec_stripe_new *s) struct open_bucket *ob; struct bkey_i *k; struct stripe *m; - struct bch_stripe *v = &s->stripe.key.v; + struct bch_stripe *v = &s->new_stripe.key.v; unsigned i, nr_data = v->nr_blocks - v->nr_redundant; - struct closure cl; int ret; BUG_ON(s->h->s == s); - closure_init_stack(&cl); + closure_sync(&s->iodone); if (s->err) { if (s->err != -EROFS) @@ -838,41 +902,52 @@ static void ec_stripe_create(struct ec_stripe_new *s) goto err; } + if (s->have_existing_stripe) { + ec_validate_checksums(c, &s->existing_stripe); + + if (ec_do_recov(c, &s->existing_stripe)) { + bch_err(c, "error creating stripe: error reading existing stripe"); + goto err; + } + + for (i = 0; i < nr_data; i++) + if (stripe_blockcount_get(&s->existing_stripe.key.v, i)) + swap(s->new_stripe.data[i], + s->existing_stripe.data[i]); + + ec_stripe_buf_exit(&s->existing_stripe); + } + BUG_ON(!s->allocated); if (!percpu_ref_tryget(&c->writes)) goto err; - BUG_ON(bitmap_weight(s->blocks_allocated, - s->blocks.nr) != s->blocks.nr); - - ec_generate_ec(&s->stripe); + ec_generate_ec(&s->new_stripe); - ec_generate_checksums(&s->stripe); + ec_generate_checksums(&s->new_stripe); /* write p/q: */ for (i = nr_data; i < v->nr_blocks; i++) - ec_block_io(c, &s->stripe, REQ_OP_WRITE, i, &cl); - - closure_sync(&cl); + ec_block_io(c, &s->new_stripe, REQ_OP_WRITE, i, &s->iodone); + closure_sync(&s->iodone); - for (i = nr_data; i < v->nr_blocks; i++) - if (!test_bit(i, s->stripe.valid)) { - bch_err(c, "error creating stripe: error writing redundancy buckets"); - goto err_put_writes; - } + if (ec_nr_failed(&s->new_stripe)) { + bch_err(c, "error creating stripe: error writing redundancy buckets"); + goto err_put_writes; + } - ret = s->existing_stripe - ? bch2_btree_insert(c, BTREE_ID_EC, &s->stripe.key.k_i, - &s->res, NULL, BTREE_INSERT_NOFAIL) - : ec_stripe_bkey_insert(c, s, &s->stripe.key); + ret = s->have_existing_stripe + ? 
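/* reusing an existing stripe: update its key in place so the on-disk block counts are preserved */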
bch2_trans_do(c, &s->res, NULL, BTREE_INSERT_NOFAIL, + ec_stripe_bkey_update(&trans, &s->new_stripe.key)) + : ec_stripe_bkey_insert(c, &s->new_stripe.key, &s->res); if (ret) { bch_err(c, "error creating stripe: error creating stripe key"); goto err_put_writes; } for_each_keylist_key(&s->keys, k) { - ret = ec_stripe_update_ptrs(c, &s->stripe, &k->k); + ret = ec_stripe_update_ptrs(c, &s->new_stripe, &k->k); if (ret) { bch_err(c, "error creating stripe: error %i updating pointers", ret); break; @@ -880,31 +955,33 @@ static void ec_stripe_create(struct ec_stripe_new *s) } spin_lock(&c->ec_stripes_heap_lock); - m = genradix_ptr(&c->stripes[0], s->stripe.key.k.p.offset); -#if 0 - pr_info("created a %s stripe %llu", - s->existing_stripe ? "existing" : "new", - s->stripe.key.k.p.offset); -#endif + m = genradix_ptr(&c->stripes[0], s->new_stripe.key.k.p.offset); + BUG_ON(m->on_heap); - bch2_stripes_heap_insert(c, m, s->stripe.key.k.p.offset); + bch2_stripes_heap_insert(c, m, s->new_stripe.key.k.p.offset); spin_unlock(&c->ec_stripes_heap_lock); err_put_writes: percpu_ref_put(&c->writes); err: bch2_disk_reservation_put(c, &s->res); - open_bucket_for_each(c, &s->blocks, ob, i) { - ob->ec = NULL; - __bch2_open_bucket_put(c, ob); - } - - bch2_open_buckets_put(c, &s->parity); + for (i = 0; i < v->nr_blocks; i++) + if (s->blocks[i]) { + ob = c->open_buckets + s->blocks[i]; + + if (i < nr_data) { + ob->ec = NULL; + __bch2_open_bucket_put(c, ob); + } else { + bch2_open_bucket_put(c, ob); + } + } bch2_keylist_free(&s->keys, s->inline_keys); - for (i = 0; i < s->stripe.key.v.nr_blocks; i++) - kvpfree(s->stripe.data[i], s->stripe.size << 9); + ec_stripe_buf_exit(&s->existing_stripe); + ec_stripe_buf_exit(&s->new_stripe); + closure_debug_destroy(&s->iodone); kfree(s); } @@ -981,7 +1058,7 @@ void *bch2_writepoint_ec_buf(struct bch_fs *c, struct write_point *wp) ca = bch_dev_bkey_exists(c, ob->ptr.dev); offset = ca->mi.bucket_size - ob->sectors_free; - return ob->ec->stripe.data[ob->ec_idx] + (offset << 9); + return ob->ec->new_stripe.data[ob->ec_idx] + (offset << 9); } void bch2_ec_add_backpointer(struct bch_fs *c, struct write_point *wp, @@ -993,8 +1070,6 @@ void bch2_ec_add_backpointer(struct bch_fs *c, struct write_point *wp, if (!ob) return; - //pr_info("adding backpointer at %llu:%llu", pos.inode, pos.offset); - ec = ob->ec; mutex_lock(&ec->lock); @@ -1088,7 +1163,6 @@ static void ec_stripe_key_init(struct bch_fs *c, static int ec_new_stripe_alloc(struct bch_fs *c, struct ec_stripe_head *h) { struct ec_stripe_new *s; - unsigned i; lockdep_assert_held(&h->lock); @@ -1097,41 +1171,27 @@ static int ec_new_stripe_alloc(struct bch_fs *c, struct ec_stripe_head *h) return -ENOMEM; mutex_init(&s->lock); + closure_init(&s->iodone, NULL); atomic_set(&s->pin, 1); s->c = c; s->h = h; s->nr_data = min_t(unsigned, h->nr_active_devs, - EC_STRIPE_MAX) - h->redundancy; + BCH_BKEY_PTRS_MAX) - h->redundancy; s->nr_parity = h->redundancy; bch2_keylist_init(&s->keys, s->inline_keys); - s->stripe.offset = 0; - s->stripe.size = h->blocksize; - memset(s->stripe.valid, 0xFF, sizeof(s->stripe.valid)); - - ec_stripe_key_init(c, &s->stripe.key, s->nr_data, + ec_stripe_key_init(c, &s->new_stripe.key, s->nr_data, s->nr_parity, h->blocksize); - for (i = 0; i < s->stripe.key.v.nr_blocks; i++) { - s->stripe.data[i] = kvpmalloc(s->stripe.size << 9, GFP_KERNEL); - if (!s->stripe.data[i]) - goto err; - } - h->s = s; - return 0; -err: - for (i = 0; i < s->stripe.key.v.nr_blocks; i++) - kvpfree(s->stripe.data[i], s->stripe.size << 9); - 
kfree(s); - return -ENOMEM; } static struct ec_stripe_head * ec_new_stripe_head_alloc(struct bch_fs *c, unsigned target, - unsigned algo, unsigned redundancy) + unsigned algo, unsigned redundancy, + bool copygc) { struct ec_stripe_head *h; struct bch_dev *ca; @@ -1147,6 +1207,7 @@ ec_new_stripe_head_alloc(struct bch_fs *c, unsigned target, h->target = target; h->algo = algo; h->redundancy = redundancy; + h->copygc = copygc; rcu_read_lock(); h->devs = target_rw_devs(c, BCH_DATA_user, target); @@ -1171,16 +1232,17 @@ void bch2_ec_stripe_head_put(struct bch_fs *c, struct ec_stripe_head *h) if (h->s && h->s->allocated && bitmap_weight(h->s->blocks_allocated, - h->s->blocks.nr) == h->s->blocks.nr) + h->s->nr_data) == h->s->nr_data) ec_stripe_set_pending(c, h); mutex_unlock(&h->lock); } struct ec_stripe_head *__bch2_ec_stripe_head_get(struct bch_fs *c, - unsigned target, - unsigned algo, - unsigned redundancy) + unsigned target, + unsigned algo, + unsigned redundancy, + bool copygc) { struct ec_stripe_head *h; @@ -1191,76 +1253,98 @@ struct ec_stripe_head *__bch2_ec_stripe_head_get(struct bch_fs *c, list_for_each_entry(h, &c->ec_stripe_head_list, list) if (h->target == target && h->algo == algo && - h->redundancy == redundancy) { + h->redundancy == redundancy && + h->copygc == copygc) { mutex_lock(&h->lock); goto found; } - h = ec_new_stripe_head_alloc(c, target, algo, redundancy); + h = ec_new_stripe_head_alloc(c, target, algo, redundancy, copygc); found: mutex_unlock(&c->ec_stripe_head_lock); return h; } -/* - * XXX: use a higher watermark for allocating open buckets here: - */ -static int new_stripe_alloc_buckets(struct bch_fs *c, struct ec_stripe_head *h) +static enum bucket_alloc_ret +new_stripe_alloc_buckets(struct bch_fs *c, struct ec_stripe_head *h, + struct closure *cl) { - struct bch_devs_mask devs; + struct bch_devs_mask devs = h->devs; struct open_bucket *ob; - unsigned i, nr_have, nr_data = - min_t(unsigned, h->nr_active_devs, - EC_STRIPE_MAX) - h->redundancy; + struct open_buckets buckets; + unsigned i, j, nr_have_parity = 0, nr_have_data = 0; bool have_cache = true; - int ret = 0; - - devs = h->devs; - - for_each_set_bit(i, h->s->blocks_allocated, EC_STRIPE_MAX) { - __clear_bit(h->s->stripe.key.v.ptrs[i].dev, devs.d); - --nr_data; + enum bucket_alloc_ret ret = ALLOC_SUCCESS; + + for (i = 0; i < h->s->new_stripe.key.v.nr_blocks; i++) { + if (test_bit(i, h->s->blocks_gotten)) { + __clear_bit(h->s->new_stripe.key.v.ptrs[i].dev, devs.d); + if (i < h->s->nr_data) + nr_have_data++; + else + nr_have_parity++; + } } - BUG_ON(h->s->blocks.nr > nr_data); - BUG_ON(h->s->parity.nr > h->redundancy); - - open_bucket_for_each(c, &h->s->parity, ob, i) - __clear_bit(ob->ptr.dev, devs.d); - open_bucket_for_each(c, &h->s->blocks, ob, i) - __clear_bit(ob->ptr.dev, devs.d); + BUG_ON(nr_have_data > h->s->nr_data); + BUG_ON(nr_have_parity > h->s->nr_parity); percpu_down_read(&c->mark_lock); rcu_read_lock(); - if (h->s->parity.nr < h->redundancy) { - nr_have = h->s->parity.nr; - - ret = bch2_bucket_alloc_set(c, &h->s->parity, + buckets.nr = 0; + if (nr_have_parity < h->s->nr_parity) { + ret = bch2_bucket_alloc_set(c, &buckets, &h->parity_stripe, &devs, - h->redundancy, - &nr_have, + h->s->nr_parity, + &nr_have_parity, &have_cache, - RESERVE_NONE, + h->copygc + ? 
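/* stripes created on behalf of copygc allocate from the movinggc reserve, so copygc can always make forward progress */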
RESERVE_MOVINGGC + : RESERVE_NONE, 0, - NULL); + cl); + + open_bucket_for_each(c, &buckets, ob, i) { + j = find_next_zero_bit(h->s->blocks_gotten, + h->s->nr_data + h->s->nr_parity, + h->s->nr_data); + BUG_ON(j >= h->s->nr_data + h->s->nr_parity); + + h->s->blocks[j] = buckets.v[i]; + h->s->new_stripe.key.v.ptrs[j] = ob->ptr; + __set_bit(j, h->s->blocks_gotten); + } + if (ret) goto err; } - if (h->s->blocks.nr < nr_data) { - nr_have = h->s->blocks.nr; - - ret = bch2_bucket_alloc_set(c, &h->s->blocks, + buckets.nr = 0; + if (nr_have_data < h->s->nr_data) { + ret = bch2_bucket_alloc_set(c, &buckets, &h->block_stripe, &devs, - nr_data, - &nr_have, + h->s->nr_data, + &nr_have_data, &have_cache, - RESERVE_NONE, + h->copygc + ? RESERVE_MOVINGGC + : RESERVE_NONE, 0, - NULL); + cl); + + open_bucket_for_each(c, &buckets, ob, i) { + j = find_next_zero_bit(h->s->blocks_gotten, + h->s->nr_data, 0); + BUG_ON(j >= h->s->nr_data); + + h->s->blocks[j] = buckets.v[i]; + h->s->new_stripe.key.v.ptrs[j] = ob->ptr; + __set_bit(j, h->s->blocks_gotten); + } + if (ret) goto err; } @@ -1272,53 +1356,101 @@ err: /* XXX: doesn't obey target: */ static s64 get_existing_stripe(struct bch_fs *c, - unsigned target, - unsigned algo, - unsigned redundancy) + struct ec_stripe_head *head) { ec_stripes_heap *h = &c->ec_stripes_heap; struct stripe *m; size_t heap_idx; u64 stripe_idx; + s64 ret = -1; if (may_create_new_stripe(c)) return -1; spin_lock(&c->ec_stripes_heap_lock); for (heap_idx = 0; heap_idx < h->used; heap_idx++) { + /* No blocks worth reusing, stripe will just be deleted: */ if (!h->data[heap_idx].blocks_nonempty) continue; stripe_idx = h->data[heap_idx].idx; m = genradix_ptr(&c->stripes[0], stripe_idx); - if (m->algorithm == algo && - m->nr_redundant == redundancy && + if (m->algorithm == head->algo && + m->nr_redundant == head->redundancy && + m->sectors == head->blocksize && m->blocks_nonempty < m->nr_blocks - m->nr_redundant) { bch2_stripes_heap_del(c, m, stripe_idx); - spin_unlock(&c->ec_stripes_heap_lock); - return stripe_idx; + ret = stripe_idx; + break; } } - spin_unlock(&c->ec_stripes_heap_lock); - return -1; + return ret; } -static int get_stripe_key(struct bch_fs *c, u64 idx, struct ec_stripe_buf *stripe) +static int __bch2_ec_stripe_head_reuse(struct bch_fs *c, + struct ec_stripe_head *h) { - struct btree_trans trans; - struct btree_iter *iter; - struct bkey_s_c k; + unsigned i; + s64 idx; int ret; - bch2_trans_init(&trans, c, 0, 0); - iter = bch2_trans_get_iter(&trans, BTREE_ID_EC, POS(0, idx), BTREE_ITER_SLOTS); - k = bch2_btree_iter_peek_slot(iter); - ret = bkey_err(k); - if (!ret) - bkey_reassemble(&stripe->key.k_i, k); - bch2_trans_exit(&trans); + idx = get_existing_stripe(c, h); + if (idx < 0) { + bch_err(c, "failed to find an existing stripe"); + return -ENOSPC; + } + + h->s->have_existing_stripe = true; + ret = get_stripe_key(c, idx, &h->s->existing_stripe); + if (ret) { + bch2_fs_fatal_error(c, "error reading stripe key: %i", ret); + return ret; + } + + if (ec_stripe_buf_init(&h->s->existing_stripe, 0, h->blocksize)) { + /* + * this is a problem: we have deleted from the + * stripes heap already + */ + BUG(); + } + + BUG_ON(h->s->existing_stripe.size != h->blocksize); + BUG_ON(h->s->existing_stripe.size != h->s->existing_stripe.key.v.sectors); + + for (i = 0; i < h->s->existing_stripe.key.v.nr_blocks; i++) { + if (stripe_blockcount_get(&h->s->existing_stripe.key.v, i)) { + __set_bit(i, h->s->blocks_gotten); + __set_bit(i, h->s->blocks_allocated); + } + + ec_block_io(c, 
&h->s->existing_stripe, READ, i, &h->s->iodone); + } + + bkey_copy(&h->s->new_stripe.key.k_i, + &h->s->existing_stripe.key.k_i); + + return 0; +} + +static int __bch2_ec_stripe_head_reserve(struct bch_fs *c, + struct ec_stripe_head *h) +{ + int ret; + + ret = bch2_disk_reservation_get(c, &h->s->res, + h->blocksize, + h->s->nr_parity, 0); + + if (ret) { + /* + * This means we need to wait for copygc to + * empty out buckets from existing stripes: + */ + bch_err(c, "failed to reserve stripe"); + } return ret; } @@ -1326,86 +1458,58 @@ static int get_stripe_key(struct bch_fs *c, u64 idx, struct ec_stripe_buf *strip struct ec_stripe_head *bch2_ec_stripe_head_get(struct bch_fs *c, unsigned target, unsigned algo, - unsigned redundancy) + unsigned redundancy, + bool copygc, + struct closure *cl) { - struct closure cl; struct ec_stripe_head *h; - struct open_bucket *ob; - unsigned i, data_idx = 0; - s64 idx; int ret; + bool needs_stripe_new; - closure_init_stack(&cl); - - h = __bch2_ec_stripe_head_get(c, target, algo, redundancy); - if (!h) + h = __bch2_ec_stripe_head_get(c, target, algo, redundancy, copygc); + if (!h) { + bch_err(c, "no stripe head"); return NULL; + } - if (!h->s) { + needs_stripe_new = !h->s; + if (needs_stripe_new) { if (ec_new_stripe_alloc(c, h)) { - bch2_ec_stripe_head_put(c, h); - return NULL; + ret = -ENOMEM; + bch_err(c, "failed to allocate new stripe"); + goto err; } - idx = get_existing_stripe(c, target, algo, redundancy); - if (idx >= 0) { - h->s->existing_stripe = true; - h->s->existing_stripe_idx = idx; - if (get_stripe_key(c, idx, &h->s->stripe)) { - /* btree error */ - BUG(); - } - - for (i = 0; i < h->s->stripe.key.v.nr_blocks; i++) - if (stripe_blockcount_get(&h->s->stripe.key.v, i)) { - __set_bit(i, h->s->blocks_allocated); - ec_block_io(c, &h->s->stripe, READ, i, &cl); - } - } + if (ec_stripe_buf_init(&h->s->new_stripe, 0, h->blocksize)) + BUG(); } - if (!h->s->allocated) { - if (!h->s->existing_stripe && - !h->s->res.sectors) { - ret = bch2_disk_reservation_get(c, &h->s->res, - h->blocksize, - h->s->nr_parity, 0); - if (ret) { - /* What should we do here? */ - bch_err(c, "unable to create new stripe: %i", ret); - bch2_ec_stripe_head_put(c, h); - h = NULL; - goto out; - - } - - } - - if (new_stripe_alloc_buckets(c, h)) { - bch2_ec_stripe_head_put(c, h); - h = NULL; - goto out; - } - - open_bucket_for_each(c, &h->s->blocks, ob, i) { - data_idx = find_next_zero_bit(h->s->blocks_allocated, - h->s->nr_data, data_idx); - BUG_ON(data_idx >= h->s->nr_data); - - h->s->stripe.key.v.ptrs[data_idx] = ob->ptr; - h->s->data_block_idx[i] = data_idx; - data_idx++; - } + /* + * Try reserve a new stripe before reusing an + * existing stripe. This will prevent unnecessary + * read amplification during write oriented workloads. 
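+ * (Reusing a stripe that already has data means first reading back its live blocks so parity can be recomputed over old and new data together.)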
+ */ + ret = 0; + if (!h->s->allocated && !h->s->res.sectors && !h->s->have_existing_stripe) + ret = __bch2_ec_stripe_head_reserve(c, h); + if (ret && needs_stripe_new) + ret = __bch2_ec_stripe_head_reuse(c, h); + if (ret) + goto err; - open_bucket_for_each(c, &h->s->parity, ob, i) - h->s->stripe.key.v.ptrs[h->s->nr_data + i] = ob->ptr; + if (!h->s->allocated) { + ret = new_stripe_alloc_buckets(c, h, cl); + if (ret) + goto err; - //pr_info("new stripe, blocks_allocated %lx", h->s->blocks_allocated[0]); h->s->allocated = true; } -out: - closure_sync(&cl); + return h; + +err: + bch2_ec_stripe_head_put(c, h); + return ERR_PTR(-ret); } void bch2_ec_stop_dev(struct bch_fs *c, struct bch_dev *ca) @@ -1421,12 +1525,14 @@ void bch2_ec_stop_dev(struct bch_fs *c, struct bch_dev *ca) if (!h->s) goto unlock; - open_bucket_for_each(c, &h->s->blocks, ob, i) - if (ob->ptr.dev == ca->dev_idx) - goto found; - open_bucket_for_each(c, &h->s->parity, ob, i) + for (i = 0; i < h->s->new_stripe.key.v.nr_blocks; i++) { + if (!h->s->blocks[i]) + continue; + + ob = c->open_buckets + h->s->blocks[i]; if (ob->ptr.dev == ca->dev_idx) goto found; + } goto unlock; found: h->s->err = -EROFS; @@ -1437,13 +1543,23 @@ unlock: mutex_unlock(&c->ec_stripe_head_lock); } +void bch2_stripes_heap_start(struct bch_fs *c) +{ + struct genradix_iter iter; + struct stripe *m; + + genradix_for_each(&c->stripes[0], iter, m) + if (m->alive) + bch2_stripes_heap_insert(c, m, iter.pos); +} + static int __bch2_stripe_write_key(struct btree_trans *trans, struct btree_iter *iter, struct stripe *m, size_t idx, struct bkey_i_stripe *new_key) { - struct bch_fs *c = trans->c; + const struct bch_stripe *v; struct bkey_s_c k; unsigned i; int ret; @@ -1458,16 +1574,17 @@ static int __bch2_stripe_write_key(struct btree_trans *trans, if (k.k->type != KEY_TYPE_stripe) return -EIO; + v = bkey_s_c_to_stripe(k).v; + for (i = 0; i < v->nr_blocks; i++) + if (m->block_sectors[i] != stripe_blockcount_get(v, i)) + goto write; + return 0; +write: bkey_reassemble(&new_key->k_i, k); - spin_lock(&c->ec_stripes_heap_lock); - for (i = 0; i < new_key->v.nr_blocks; i++) stripe_blockcount_set(&new_key->v, i, m->block_sectors[i]); - m->dirty = false; - - spin_unlock(&c->ec_stripes_heap_lock); bch2_trans_update(trans, iter, &new_key->k_i, 0); return 0; @@ -1491,7 +1608,7 @@ int bch2_stripes_write(struct bch_fs *c, unsigned flags) BTREE_ITER_SLOTS|BTREE_ITER_INTENT); genradix_for_each(&c->stripes[0], giter, m) { - if (!m->dirty) + if (!m->alive) continue; ret = __bch2_trans_do(&trans, NULL, NULL, @@ -1516,18 +1633,11 @@ static int bch2_stripes_read_fn(struct bch_fs *c, enum btree_id id, int ret = 0; if (k.k->type == KEY_TYPE_stripe) { - struct stripe *m; - ret = __ec_stripe_mem_alloc(c, k.k->p.offset, GFP_KERNEL) ?: bch2_mark_key(c, k, 0, 0, NULL, 0, BTREE_TRIGGER_NOATOMIC); if (ret) return ret; - - spin_lock(&c->ec_stripes_heap_lock); - m = genradix_ptr(&c->stripes[0], k.k->p.offset); - bch2_stripes_heap_insert(c, m, k.k->p.offset); - spin_unlock(&c->ec_stripes_heap_lock); } return ret; @@ -1608,19 +1718,17 @@ void bch2_new_stripes_to_text(struct printbuf *out, struct bch_fs *c) h->target, h->algo, h->redundancy); if (h->s) - pr_buf(out, "\tpending: blocks %u allocated %u\n", - h->s->blocks.nr, + pr_buf(out, "\tpending: blocks %u+%u allocated %u\n", + h->s->nr_data, h->s->nr_parity, bitmap_weight(h->s->blocks_allocated, - h->s->blocks.nr)); + h->s->nr_data)); } mutex_unlock(&c->ec_stripe_head_lock); mutex_lock(&c->ec_stripe_new_lock); list_for_each_entry(s, 
&c->ec_stripe_new_list, list) { - pr_buf(out, "\tin flight: blocks %u allocated %u pin %u\n", - s->blocks.nr, - bitmap_weight(s->blocks_allocated, - s->blocks.nr), + pr_buf(out, "\tin flight: blocks %u+%u pin %u\n", + s->nr_data, s->nr_parity, atomic_read(&s->pin)); } mutex_unlock(&c->ec_stripe_new_lock); diff --git a/fs/bcachefs/ec.h b/fs/bcachefs/ec.h index 15f751fc2a35..765baa9d9264 100644 --- a/fs/bcachefs/ec.h +++ b/fs/bcachefs/ec.h @@ -60,9 +60,51 @@ static inline unsigned stripe_val_u64s(const struct bch_stripe *s) } static inline void *stripe_csum(struct bch_stripe *s, - unsigned dev, unsigned csum_idx) + unsigned block, unsigned csum_idx) { - return (void *) s + stripe_csum_offset(s, dev, csum_idx); + EBUG_ON(block >= s->nr_blocks); + EBUG_ON(csum_idx >= stripe_csums_per_device(s)); + + return (void *) s + stripe_csum_offset(s, block, csum_idx); +} + +static inline struct bch_csum stripe_csum_get(struct bch_stripe *s, + unsigned block, unsigned csum_idx) +{ + struct bch_csum csum = { 0 }; + + memcpy(&csum, stripe_csum(s, block, csum_idx), bch_crc_bytes[s->csum_type]); + return csum; +} + +static inline void stripe_csum_set(struct bch_stripe *s, + unsigned block, unsigned csum_idx, + struct bch_csum csum) +{ + memcpy(stripe_csum(s, block, csum_idx), &csum, bch_crc_bytes[s->csum_type]); +} + +static inline bool __bch2_ptr_matches_stripe(const struct bch_stripe *s, + const struct bch_extent_ptr *ptr, + unsigned block) +{ + unsigned nr_data = s->nr_blocks - s->nr_redundant; + + if (block >= nr_data) + return false; + + return ptr->dev == s->ptrs[block].dev && + ptr->gen == s->ptrs[block].gen && + ptr->offset >= s->ptrs[block].offset && + ptr->offset < s->ptrs[block].offset + le16_to_cpu(s->sectors); +} + +static inline bool bch2_ptr_matches_stripe(const struct bch_stripe *s, + struct extent_ptr_decoded p) +{ + BUG_ON(!p.has_ec); + + return __bch2_ptr_matches_stripe(s, &p.ptr, p.ec.block); } struct bch_read_bio; @@ -71,9 +113,9 @@ struct ec_stripe_buf { /* might not be buffering the entire stripe: */ unsigned offset; unsigned size; - unsigned long valid[BITS_TO_LONGS(EC_STRIPE_MAX)]; + unsigned long valid[BITS_TO_LONGS(BCH_BKEY_PTRS_MAX)]; - void *data[EC_STRIPE_MAX]; + void *data[BCH_BKEY_PTRS_MAX]; union { struct bkey_i_stripe key; @@ -88,6 +130,7 @@ struct ec_stripe_new { struct ec_stripe_head *h; struct mutex lock; struct list_head list; + struct closure iodone; /* counts in flight writes, stripe is created when pin == 0 */ atomic_t pin; @@ -98,20 +141,18 @@ struct ec_stripe_new { u8 nr_parity; bool allocated; bool pending; - bool existing_stripe; - u64 existing_stripe_idx; + bool have_existing_stripe; - unsigned long blocks_allocated[BITS_TO_LONGS(EC_STRIPE_MAX)]; - - struct open_buckets blocks; - u8 data_block_idx[EC_STRIPE_MAX]; - struct open_buckets parity; + unsigned long blocks_gotten[BITS_TO_LONGS(BCH_BKEY_PTRS_MAX)]; + unsigned long blocks_allocated[BITS_TO_LONGS(BCH_BKEY_PTRS_MAX)]; + open_bucket_idx_t blocks[BCH_BKEY_PTRS_MAX]; struct disk_reservation res; struct keylist keys; u64 inline_keys[BKEY_U64s * 8]; - struct ec_stripe_buf stripe; + struct ec_stripe_buf new_stripe; + struct ec_stripe_buf existing_stripe; }; struct ec_stripe_head { @@ -121,6 +162,7 @@ struct ec_stripe_head { unsigned target; unsigned algo; unsigned redundancy; + bool copygc; struct bch_devs_mask devs; unsigned nr_active_devs; @@ -145,8 +187,8 @@ void bch2_ec_bucket_cancel(struct bch_fs *, struct open_bucket *); int bch2_ec_stripe_new_alloc(struct bch_fs *, struct ec_stripe_head *); void 
bch2_ec_stripe_head_put(struct bch_fs *, struct ec_stripe_head *); -struct ec_stripe_head *bch2_ec_stripe_head_get(struct bch_fs *, unsigned, - unsigned, unsigned); +struct ec_stripe_head *bch2_ec_stripe_head_get(struct bch_fs *, + unsigned, unsigned, unsigned, bool, struct closure *); void bch2_stripes_heap_update(struct bch_fs *, struct stripe *, size_t); void bch2_stripes_heap_del(struct bch_fs *, struct stripe *, size_t); @@ -156,6 +198,8 @@ void bch2_ec_stop_dev(struct bch_fs *, struct bch_dev *); void bch2_ec_flush_new_stripes(struct bch_fs *); +void bch2_stripes_heap_start(struct bch_fs *); + struct journal_keys; int bch2_stripes_read(struct bch_fs *, struct journal_keys *); int bch2_stripes_write(struct bch_fs *, unsigned); diff --git a/fs/bcachefs/ec_types.h b/fs/bcachefs/ec_types.h index e4d633fca5bf..847770166223 100644 --- a/fs/bcachefs/ec_types.h +++ b/fs/bcachefs/ec_types.h @@ -4,11 +4,9 @@ #include <linux/llist.h> -#define EC_STRIPE_MAX 16 - struct bch_replicas_padded { struct bch_replicas_entry e; - u8 pad[EC_STRIPE_MAX]; + u8 pad[BCH_BKEY_PTRS_MAX]; }; struct stripe { @@ -20,11 +18,10 @@ struct stripe { u8 nr_blocks; u8 nr_redundant; - unsigned alive:1; - unsigned dirty:1; + unsigned alive:1; /* does a corresponding key exist in stripes btree? */ unsigned on_heap:1; u8 blocks_nonempty; - u16 block_sectors[EC_STRIPE_MAX]; + u16 block_sectors[BCH_BKEY_PTRS_MAX]; struct bch_replicas_padded r; }; diff --git a/fs/bcachefs/extent_update.c b/fs/bcachefs/extent_update.c index fd011df3cb99..16d2bca8a662 100644 --- a/fs/bcachefs/extent_update.c +++ b/fs/bcachefs/extent_update.c @@ -1,6 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" -#include "bkey_on_stack.h" #include "btree_update.h" #include "btree_update_interior.h" #include "buckets.h" @@ -100,24 +99,12 @@ int bch2_extent_atomic_end(struct btree_iter *iter, struct bpos *end) { struct btree_trans *trans = iter->trans; - struct btree *b; - struct btree_node_iter node_iter; - struct bkey_packed *_k; - unsigned nr_iters = 0; + struct btree_iter *copy; + struct bkey_s_c k; + unsigned nr_iters = 0; int ret; - ret = bch2_btree_iter_traverse(iter); - if (ret) - return ret; - - b = iter->l[0].b; - node_iter = iter->l[0].iter; - - BUG_ON(bkey_cmp(b->data->min_key, POS_MIN) && - bkey_cmp(bkey_start_pos(&insert->k), - bkey_predecessor(b->data->min_key)) < 0); - - *end = bpos_min(insert->k.p, b->key.k.p); + *end = insert->k.p; /* extent_update_to_keys(): */ nr_iters += 1; @@ -127,9 +114,9 @@ int bch2_extent_atomic_end(struct btree_iter *iter, if (ret < 0) return ret; - while ((_k = bch2_btree_node_iter_peek(&node_iter, b))) { - struct bkey unpacked; - struct bkey_s_c k = bkey_disassemble(b, _k, &unpacked); + copy = bch2_trans_copy_iter(trans, iter); + + for_each_btree_key_continue(copy, 0, k, ret) { unsigned offset = 0; if (bkey_cmp(bkey_start_pos(k.k), *end) >= 0) @@ -156,10 +143,9 @@ int bch2_extent_atomic_end(struct btree_iter *iter, &nr_iters, EXTENT_ITERS_MAX); if (ret) break; - - bch2_btree_node_iter_advance(&node_iter, b); } + bch2_trans_iter_put(trans, copy); return ret < 0 ? 
ret : 0; } @@ -193,18 +179,13 @@ bch2_extent_can_insert(struct btree_trans *trans, struct btree_iter *iter, struct bkey_i *insert) { - struct btree_iter_level *l = &iter->l[0]; - struct btree_node_iter node_iter = l->iter; - struct bkey_packed *_k; struct bkey_s_c k; - struct bkey unpacked; - int sectors; - - _k = bch2_btree_node_iter_peek(&node_iter, l->b); - if (!_k) - return BTREE_INSERT_OK; + int ret, sectors; - k = bkey_disassemble(l->b, _k, &unpacked); + k = bch2_btree_iter_peek_slot(iter); + ret = bkey_err(k); + if (ret) + return ret; /* Check if we're splitting a compressed extent: */ diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index 7fae6a4ba26f..595dd0add509 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -215,9 +215,8 @@ void bch2_btree_ptr_v2_to_text(struct printbuf *out, struct bch_fs *c, { struct bkey_s_c_btree_ptr_v2 bp = bkey_s_c_to_btree_ptr_v2(k); - pr_buf(out, "seq %llx sectors %u written %u min_key ", + pr_buf(out, "seq %llx written %u min_key ", le64_to_cpu(bp.v->seq), - le16_to_cpu(bp.v->sectors), le16_to_cpu(bp.v->sectors_written)); bch2_bpos_to_text(out, bp.v->min_key); @@ -665,7 +664,7 @@ bool bch2_bkey_is_incompressible(struct bkey_s_c k) } bool bch2_check_range_allocated(struct bch_fs *c, struct bpos pos, u64 size, - unsigned nr_replicas) + unsigned nr_replicas, bool compressed) { struct btree_trans trans; struct btree_iter *iter; @@ -683,7 +682,8 @@ bool bch2_check_range_allocated(struct bch_fs *c, struct bpos pos, u64 size, if (bkey_cmp(bkey_start_pos(k.k), end) >= 0) break; - if (nr_replicas > bch2_bkey_nr_ptrs_fully_allocated(k)) { + if (nr_replicas > bch2_bkey_replicas(c, k) || + (!compressed && bch2_bkey_sectors_compressed(k))) { ret = false; break; } @@ -693,6 +693,27 @@ bool bch2_check_range_allocated(struct bch_fs *c, struct bpos pos, u64 size, return ret; } +unsigned bch2_bkey_replicas(struct bch_fs *c, struct bkey_s_c k) +{ + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const union bch_extent_entry *entry; + struct extent_ptr_decoded p = { 0 }; + unsigned replicas = 0; + + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { + if (p.ptr.cached) + continue; + + if (p.has_ec) + replicas += p.ec.redundancy; + + replicas++; + + } + + return replicas; +} + static unsigned bch2_extent_ptr_durability(struct bch_fs *c, struct extent_ptr_decoded p) { @@ -707,16 +728,9 @@ static unsigned bch2_extent_ptr_durability(struct bch_fs *c, if (ca->mi.state != BCH_MEMBER_STATE_FAILED) durability = max_t(unsigned, durability, ca->mi.durability); - if (p.has_ec) { - struct stripe *s = - genradix_ptr(&c->stripes[0], p.ec.idx); - - if (WARN_ON(!s)) - goto out; + if (p.has_ec) + durability += p.ec.redundancy; - durability += s->nr_redundant; - } -out: return durability; } @@ -764,6 +778,15 @@ void bch2_bkey_mark_replicas_cached(struct bch_fs *c, struct bkey_s k, } } +void bch2_bkey_extent_entry_drop(struct bkey_i *k, union bch_extent_entry *entry) +{ + union bch_extent_entry *end = bkey_val_end(bkey_i_to_s(k)); + union bch_extent_entry *next = extent_entry_next(entry); + + memmove_u64s(entry, next, (u64 *) end - (u64 *) next); + k->k.u64s -= extent_entry_u64s(entry); +} + void bch2_bkey_append_ptr(struct bkey_i *k, struct bch_extent_ptr ptr) { @@ -1046,16 +1069,17 @@ static const char *extent_ptr_invalid(const struct bch_fs *c, const char *bch2_bkey_ptrs_invalid(const struct bch_fs *c, struct bkey_s_c k) { struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + struct bch_devs_list devs; const union bch_extent_entry *entry; struct 
bch_extent_crc_unpacked crc; unsigned size_ondisk = k.k->size; const char *reason; unsigned nonce = UINT_MAX; + unsigned i; - if (k.k->type == KEY_TYPE_btree_ptr) + if (k.k->type == KEY_TYPE_btree_ptr || + k.k->type == KEY_TYPE_btree_ptr_v2) size_ondisk = c->opts.btree_node_size; - if (k.k->type == KEY_TYPE_btree_ptr_v2) - size_ondisk = le16_to_cpu(bkey_s_c_to_btree_ptr_v2(k).v->sectors); bkey_extent_entry_for_each(ptrs, entry) { if (__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX) @@ -1101,6 +1125,12 @@ const char *bch2_bkey_ptrs_invalid(const struct bch_fs *c, struct bkey_s_c k) } } + devs = bch2_bkey_devs(k); + bubble_sort(devs.devs, devs.nr, u8_cmp); + for (i = 0; i + 1 < devs.nr; i++) + if (devs.devs[i] == devs.devs[i + 1]) + return "multiple ptrs to same device"; + return NULL; } diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h index 74c7bb8f9104..3988315fc404 100644 --- a/fs/bcachefs/extents.h +++ b/fs/bcachefs/extents.h @@ -538,12 +538,15 @@ unsigned bch2_bkey_nr_ptrs_allocated(struct bkey_s_c); unsigned bch2_bkey_nr_ptrs_fully_allocated(struct bkey_s_c); bool bch2_bkey_is_incompressible(struct bkey_s_c); unsigned bch2_bkey_sectors_compressed(struct bkey_s_c); -bool bch2_check_range_allocated(struct bch_fs *, struct bpos, u64, unsigned); +bool bch2_check_range_allocated(struct bch_fs *, struct bpos, u64, unsigned, bool); + +unsigned bch2_bkey_replicas(struct bch_fs *, struct bkey_s_c); unsigned bch2_bkey_durability(struct bch_fs *, struct bkey_s_c); void bch2_bkey_mark_replicas_cached(struct bch_fs *, struct bkey_s, unsigned, unsigned); +void bch2_bkey_extent_entry_drop(struct bkey_i *, union bch_extent_entry *); void bch2_bkey_append_ptr(struct bkey_i *, struct bch_extent_ptr); void bch2_extent_ptr_decoded_append(struct bkey_i *, struct extent_ptr_decoded *); diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index 93ad6dbe8c16..56cfb0d60c03 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -3,7 +3,7 @@ #include "bcachefs.h" #include "alloc_foreground.h" -#include "bkey_on_stack.h" +#include "bkey_buf.h" #include "btree_update.h" #include "buckets.h" #include "clock.h" @@ -84,6 +84,7 @@ struct dio_read { struct closure cl; struct kiocb *req; long ret; + bool should_dirty; struct bch_read_bio rbio; }; @@ -281,28 +282,13 @@ static inline struct bch_page_state *bch2_page_state(struct page *page) /* for newly allocated pages: */ static void __bch2_page_state_release(struct page *page) { - struct bch_page_state *s = __bch2_page_state(page); - - if (!s) - return; - - ClearPagePrivate(page); - set_page_private(page, 0); - put_page(page); - kfree(s); + kfree(detach_page_private(page)); } static void bch2_page_state_release(struct page *page) { - struct bch_page_state *s = bch2_page_state(page); - - if (!s) - return; - - ClearPagePrivate(page); - set_page_private(page, 0); - put_page(page); - kfree(s); + EBUG_ON(!PageLocked(page)); + __bch2_page_state_release(page); } /* for newly allocated pages: */ @@ -316,13 +302,7 @@ static struct bch_page_state *__bch2_page_state_create(struct page *page, return NULL; spin_lock_init(&s->lock); - /* - * migrate_page_move_mapping() assumes that pages with private data - * have their count elevated by 1. 
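/*
 * The new duplicate-device check in bch2_bkey_ptrs_invalid() is
 * sort-then-scan-adjacent: with at most BCH_BKEY_PTRS_MAX device ids, the
 * O(n^2) bubble_sort is perfectly adequate, and it operates on the copy
 * returned by bch2_bkey_devs(), so the key itself is untouched. The same
 * idiom in isolation, assuming a plain array of ids:
 *
 *	static bool has_duplicate_devs(u8 *devs, unsigned nr)
 *	{
 *		unsigned i;
 *
 *		bubble_sort(devs, nr, u8_cmp);
 *		for (i = 0; i + 1 < nr; i++)
 *			if (devs[i] == devs[i + 1])
 *				return true;
 *		return false;
 *	}
 */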
- */ - get_page(page); - set_page_private(page, (unsigned long) s); - SetPagePrivate(page); + attach_page_private(page, s); return s; } @@ -645,18 +625,12 @@ int bch2_migrate_page(struct address_space *mapping, struct page *newpage, EBUG_ON(!PageLocked(page)); EBUG_ON(!PageLocked(newpage)); - ret = migrate_page_move_mapping(mapping, newpage, page, NULL, mode, 0); + ret = migrate_page_move_mapping(mapping, newpage, page, 0); if (ret != MIGRATEPAGE_SUCCESS) return ret; - if (PagePrivate(page)) { - ClearPagePrivate(page); - get_page(newpage); - set_page_private(newpage, page_private(page)); - set_page_private(page, 0); - put_page(page); - SetPagePrivate(newpage); - } + if (PagePrivate(page)) + attach_page_private(newpage, detach_page_private(page)); if (mode != MIGRATE_SYNC_NO_COPY) migrate_page_copy(newpage, page); @@ -670,10 +644,10 @@ int bch2_migrate_page(struct address_space *mapping, struct page *newpage, static void bch2_readpages_end_io(struct bio *bio) { + struct bvec_iter_all iter; struct bio_vec *bv; - unsigned i; - bio_for_each_segment_all(bv, bio, i) { + bio_for_each_segment_all(bv, bio, iter) { struct page *page = bv->bv_page; if (!bio->bi_status) { @@ -692,31 +666,29 @@ struct readpages_iter { struct address_space *mapping; struct page **pages; unsigned nr_pages; - unsigned nr_added; unsigned idx; pgoff_t offset; }; static int readpages_iter_init(struct readpages_iter *iter, - struct address_space *mapping, - struct list_head *pages, unsigned nr_pages) + struct readahead_control *ractl) { + unsigned i, nr_pages = readahead_count(ractl); + memset(iter, 0, sizeof(*iter)); - iter->mapping = mapping; - iter->offset = list_last_entry(pages, struct page, lru)->index; + iter->mapping = ractl->mapping; + iter->offset = readahead_index(ractl); + iter->nr_pages = nr_pages; iter->pages = kmalloc_array(nr_pages, sizeof(struct page *), GFP_NOFS); if (!iter->pages) return -ENOMEM; - while (!list_empty(pages)) { - struct page *page = list_last_entry(pages, struct page, lru); - - __bch2_page_state_create(page, __GFP_NOFAIL); - - iter->pages[iter->nr_pages++] = page; - list_del(&page->lru); + nr_pages = __readahead_batch(ractl, iter->pages, nr_pages); + for (i = 0; i < nr_pages; i++) { + __bch2_page_state_create(iter->pages[i], __GFP_NOFAIL); + put_page(iter->pages[i]); } return 0; @@ -724,41 +696,9 @@ static int readpages_iter_init(struct readpages_iter *iter, static inline struct page *readpage_iter_next(struct readpages_iter *iter) { - struct page *page; - unsigned i; - int ret; - - BUG_ON(iter->idx > iter->nr_added); - BUG_ON(iter->nr_added > iter->nr_pages); - - if (iter->idx < iter->nr_added) - goto out; - - while (1) { - if (iter->idx == iter->nr_pages) - return NULL; - - ret = add_to_page_cache_lru_vec(iter->mapping, - iter->pages + iter->nr_added, - iter->nr_pages - iter->nr_added, - iter->offset + iter->nr_added, - GFP_NOFS); - if (ret > 0) - break; - - page = iter->pages[iter->nr_added]; - iter->idx++; - iter->nr_added++; - - __bch2_page_state_release(page); - put_page(page); - } - - iter->nr_added += ret; + if (iter->idx >= iter->nr_pages) + return NULL; - for (i = iter->idx; i < iter->nr_added; i++) - put_page(iter->pages[i]); -out: EBUG_ON(iter->pages[iter->idx]->index != iter->offset + iter->idx); return iter->pages[iter->idx]; @@ -819,11 +759,8 @@ static void readpage_bio_extend(struct readpages_iter *iter, if (!get_more) break; - rcu_read_lock(); - page = radix_tree_lookup(&iter->mapping->i_pages, page_offset); - rcu_read_unlock(); - - if (page && 
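/*
 * The open-coded get_page()/SetPagePrivate()/set_page_private() dances
 * above are replaced with the attach_page_private()/detach_page_private()
 * helpers. On kernels that provide them (v5.8+) they are roughly:
 *
 *	static inline void attach_page_private(struct page *page, void *data)
 *	{
 *		get_page(page);
 *		set_page_private(page, (unsigned long) data);
 *		SetPagePrivate(page);
 *	}
 *
 *	static inline void *detach_page_private(struct page *page)
 *	{
 *		void *data = (void *) page_private(page);
 *
 *		if (!PagePrivate(page))
 *			return NULL;
 *		ClearPagePrivate(page);
 *		set_page_private(page, 0);
 *		put_page(page);
 *		return data;
 *	}
 *
 * i.e. the page's refcount stays elevated while private data is attached,
 * which is the invariant migrate_page_move_mapping() relies on.
 */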
!radix_tree_exceptional_entry(page)) + page = xa_load(&iter->mapping->i_pages, page_offset); + if (page && !xa_is_value(page)) break; page = __page_cache_alloc(readahead_gfp_mask(iter->mapping)); @@ -855,7 +792,7 @@ static void bchfs_read(struct btree_trans *trans, struct btree_iter *iter, struct readpages_iter *readpages_iter) { struct bch_fs *c = trans->c; - struct bkey_on_stack sk; + struct bkey_buf sk; int flags = BCH_READ_RETRY_IF_STALE| BCH_READ_MAY_PROMOTE; int ret = 0; @@ -863,7 +800,7 @@ static void bchfs_read(struct btree_trans *trans, struct btree_iter *iter, rbio->c = c; rbio->start_time = local_clock(); - bkey_on_stack_init(&sk); + bch2_bkey_buf_init(&sk); retry: while (1) { struct bkey_s_c k; @@ -881,7 +818,7 @@ retry: bkey_start_offset(k.k); sectors = k.k->size - offset_into_extent; - bkey_on_stack_reassemble(&sk, c, k); + bch2_bkey_buf_reassemble(&sk, c, k); ret = bch2_read_indirect_extent(trans, &offset_into_extent, &sk); @@ -926,13 +863,12 @@ retry: bio_endio(&rbio->bio); } - bkey_on_stack_exit(&sk, c); + bch2_bkey_buf_exit(&sk, c); } -int bch2_readpages(struct file *file, struct address_space *mapping, - struct list_head *pages, unsigned nr_pages) +void bch2_readahead(struct readahead_control *ractl) { - struct bch_inode_info *inode = to_bch_ei(mapping->host); + struct bch_inode_info *inode = to_bch_ei(ractl->mapping->host); struct bch_fs *c = inode->v.i_sb->s_fs_info; struct bch_io_opts opts = io_opts(c, &inode->ei_inode); struct btree_trans trans; @@ -941,7 +877,7 @@ int bch2_readpages(struct file *file, struct address_space *mapping, struct readpages_iter readpages_iter; int ret; - ret = readpages_iter_init(&readpages_iter, mapping, pages, nr_pages); + ret = readpages_iter_init(&readpages_iter, ractl); BUG_ON(ret); bch2_trans_init(&trans, c, 0, 0); @@ -976,8 +912,6 @@ int bch2_readpages(struct file *file, struct address_space *mapping, bch2_trans_exit(&trans); kfree(readpages_iter.pages); - - return 0; } static void __bchfs_readpage(struct bch_fs *c, struct bch_read_bio *rbio, @@ -1077,34 +1011,35 @@ static void bch2_writepage_io_done(struct closure *cl) struct bch_writepage_io, cl); struct bch_fs *c = io->op.c; struct bio *bio = &io->op.wbio.bio; + struct bvec_iter_all iter; struct bio_vec *bvec; - unsigned i, j; + unsigned i; if (io->op.error) { set_bit(EI_INODE_ERROR, &io->inode->ei_flags); - bio_for_each_segment_all(bvec, bio, i) { + bio_for_each_segment_all(bvec, bio, iter) { struct bch_page_state *s; SetPageError(bvec->bv_page); - mapping_set_error(io->inode->v.i_mapping, -EIO); + mapping_set_error(bvec->bv_page->mapping, -EIO); s = __bch2_page_state(bvec->bv_page); spin_lock(&s->lock); - for (j = 0; j < PAGE_SECTORS; j++) - s->s[j].nr_replicas = 0; + for (i = 0; i < PAGE_SECTORS; i++) + s->s[i].nr_replicas = 0; spin_unlock(&s->lock); } } if (io->op.flags & BCH_WRITE_WROTE_DATA_INLINE) { - bio_for_each_segment_all(bvec, bio, i) { + bio_for_each_segment_all(bvec, bio, iter) { struct bch_page_state *s; s = __bch2_page_state(bvec->bv_page); spin_lock(&s->lock); - for (j = 0; j < PAGE_SECTORS; j++) - s->s[j].nr_replicas = 0; + for (i = 0; i < PAGE_SECTORS; i++) + s->s[i].nr_replicas = 0; spin_unlock(&s->lock); } } @@ -1128,7 +1063,7 @@ static void bch2_writepage_io_done(struct closure *cl) */ i_sectors_acct(c, io->inode, NULL, io->op.i_sectors_delta); - bio_for_each_segment_all(bvec, bio, i) { + bio_for_each_segment_all(bvec, bio, iter) { struct bch_page_state *s = __bch2_page_state(bvec->bv_page); if (atomic_dec_and_test(&s->write_count)) @@ -1282,7 +1217,7 @@ 
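/*
 * Loops throughout this file move from the old three-argument
 * bio_for_each_segment_all(bv, bio, i), where @i was a bare integer index,
 * to the v5.1+ form that iterates with a struct bvec_iter_all and handles
 * multi-page bvecs:
 *
 *	struct bvec_iter_all iter;
 *	struct bio_vec *bv;
 *
 *	bio_for_each_segment_all(bv, bio, iter)
 *		put_page(bv->bv_page);
 *
 * This also frees the integer counters for other use; note how the
 * nr_replicas loops above now reuse @i for PAGE_SECTORS instead of @j.
 */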
do_io: if (w->io && (w->io->op.res.nr_replicas != nr_replicas_this_write || - bio_full(&w->io->op.wbio.bio) || + bio_full(&w->io->op.wbio.bio, PAGE_SIZE) || w->io->op.wbio.bio.bi_iter.bi_size + (sectors << 9) >= (BIO_MAX_PAGES * PAGE_SIZE) || bio_end_sector(&w->io->op.wbio.bio) != sector)) @@ -1685,12 +1620,22 @@ again: /* O_DIRECT reads */ +static void bio_check_or_release(struct bio *bio, bool check_dirty) +{ + if (check_dirty) { + bio_check_pages_dirty(bio); + } else { + bio_release_pages(bio, false); + bio_put(bio); + } +} + static void bch2_dio_read_complete(struct closure *cl) { struct dio_read *dio = container_of(cl, struct dio_read, cl); dio->req->ki_complete(dio->req, dio->ret, 0); - bio_check_pages_dirty(&dio->rbio.bio); /* transfers ownership */ + bio_check_or_release(&dio->rbio.bio, dio->should_dirty); } static void bch2_direct_IO_read_endio(struct bio *bio) @@ -1705,8 +1650,11 @@ static void bch2_direct_IO_read_endio(struct bio *bio) static void bch2_direct_IO_read_split_endio(struct bio *bio) { + struct dio_read *dio = bio->bi_private; + bool should_dirty = dio->should_dirty; + bch2_direct_IO_read_endio(bio); - bio_check_pages_dirty(bio); /* transfers ownership */ + bio_check_or_release(bio, should_dirty); } static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter) @@ -1760,6 +1708,12 @@ static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter) dio->req = req; dio->ret = ret; + /* + * This is one of the sketchier things I've encountered: we have to skip + * the dirtying of requests that are internal from the kernel (i.e. from + * loopback), because we'll deadlock on page_lock. + */ + dio->should_dirty = iter_is_iovec(iter); goto start; while (iter->count) { @@ -1781,7 +1735,9 @@ start: } offset += bio->bi_iter.bi_size; - bio_set_pages_dirty(bio); + + if (dio->should_dirty) + bio_set_pages_dirty(bio); if (iter->count) closure_get(&dio->cl); @@ -1795,7 +1751,7 @@ start: closure_sync(&dio->cl); closure_debug_destroy(&dio->cl); ret = dio->ret; - bio_check_pages_dirty(&dio->rbio.bio); /* transfers ownership */ + bio_check_or_release(&dio->rbio.bio, dio->should_dirty); return ret; } else { return -EIOCBQUEUED; @@ -1851,8 +1807,9 @@ static long bch2_dio_write_loop(struct dio_write *dio) struct bch_inode_info *inode = file_bch_inode(req->ki_filp); struct bch_fs *c = inode->v.i_sb->s_fs_info; struct bio *bio = &dio->op.wbio.bio; + struct bvec_iter_all iter; struct bio_vec *bv; - unsigned i, unaligned, iter_count; + unsigned unaligned, iter_count; bool sync = dio->sync, dropped_locks; long ret; @@ -1863,7 +1820,7 @@ static long bch2_dio_write_loop(struct dio_write *dio) iter_count = dio->iter.count; if (kthread) - use_mm(dio->mm); + kthread_use_mm(dio->mm); BUG_ON(current->faults_disabled_mapping); current->faults_disabled_mapping = mapping; @@ -1873,7 +1830,7 @@ static long bch2_dio_write_loop(struct dio_write *dio) current->faults_disabled_mapping = NULL; if (kthread) - unuse_mm(dio->mm); + kthread_unuse_mm(dio->mm); /* * If the fault handler returned an error but also signalled @@ -1906,7 +1863,7 @@ static long bch2_dio_write_loop(struct dio_write *dio) * bio_iov_iter_get_pages was only able to get < * blocksize worth of pages: */ - bio_for_each_segment_all(bv, bio, i) + bio_for_each_segment_all(bv, bio, iter) put_page(bv->bv_page); ret = -EFAULT; goto err; @@ -1928,7 +1885,9 @@ static long bch2_dio_write_loop(struct dio_write *dio) dio->op.opts.data_replicas, 0); if (unlikely(ret) && !bch2_check_range_allocated(c, dio->op.pos, - bio_sectors(bio), 
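/*
 * (This bch2_check_range_allocated() call gains two arguments; the list
 * continues below.) The reasoning for the new "compressed" flag: when a
 * dio write's disk reservation fails, the write may still proceed if
 * overwriting the range cannot consume net new space. Fully allocated but
 * compressed extents break that guarantee when the incoming write does not
 * itself compress: e.g. 128 sectors stored compressed in 32 on-disk
 * sectors, rewritten uncompressed, need 96 sectors the reservation never
 * covered. Hence the call passes dio->op.opts.compression != 0, and
 * bch2_check_range_allocated() rejects the fast path on compressed extents
 * when that is false.
 */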
dio->op.opts.data_replicas)) + bio_sectors(bio), + dio->op.opts.data_replicas, + dio->op.opts.compression != 0)) goto err; task_io_account_write(bio->bi_iter.bi_size); @@ -1969,7 +1928,7 @@ loop: i_size_write(&inode->v, req->ki_pos); spin_unlock(&inode->v.i_lock); - bio_for_each_segment_all(bv, bio, i) + bio_for_each_segment_all(bv, bio, iter) put_page(bv->bv_page); if (dio->op.error) { @@ -2479,9 +2438,9 @@ static long bchfs_fcollapse_finsert(struct bch_inode_info *inode, { struct bch_fs *c = inode->v.i_sb->s_fs_info; struct address_space *mapping = inode->v.i_mapping; - struct bkey_on_stack copy; + struct bkey_buf copy; struct btree_trans trans; - struct btree_iter *src, *dst; + struct btree_iter *src, *dst, *del; loff_t shift, new_size; u64 src_start; int ret; @@ -2489,7 +2448,7 @@ static long bchfs_fcollapse_finsert(struct bch_inode_info *inode, if ((offset | len) & (block_bytes(c) - 1)) return -EINVAL; - bkey_on_stack_init(©); + bch2_bkey_buf_init(©); bch2_trans_init(&trans, c, BTREE_ITER_MAX, 256); /* @@ -2551,6 +2510,7 @@ static long bchfs_fcollapse_finsert(struct bch_inode_info *inode, POS(inode->v.i_ino, src_start >> 9), BTREE_ITER_INTENT); dst = bch2_trans_copy_iter(&trans, src); + del = bch2_trans_copy_iter(&trans, src); while (1) { struct disk_reservation disk_res = @@ -2571,13 +2531,11 @@ static long bchfs_fcollapse_finsert(struct bch_inode_info *inode, if (!k.k || k.k->p.inode != inode->v.i_ino) break; - BUG_ON(bkey_cmp(src->pos, bkey_start_pos(k.k))); - if (insert && bkey_cmp(k.k->p, POS(inode->v.i_ino, offset >> 9)) <= 0) break; reassemble: - bkey_on_stack_reassemble(©, c, k); + bch2_bkey_buf_reassemble(©, c, k); if (insert && bkey_cmp(bkey_start_pos(k.k), move_pos) < 0) @@ -2604,6 +2562,7 @@ reassemble: delete.k.p = copy.k->k.p; delete.k.size = copy.k->k.size; delete.k.p.offset -= shift >> 9; + bch2_btree_iter_set_pos(del, bkey_start_pos(&delete.k)); next_pos = insert ? 
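/*
 * The third iterator (@del) in bchfs_fcollapse_finsert() above exists
 * because the delete and the copy are now queued at two different
 * positions within the same transaction commit: @src has to keep its
 * position for the next loop iteration, so rather than repositioning it
 * (the old bch2_btree_iter_set_pos(src, ...) call is dropped below), @del
 * is pointed at the start of the key being deleted:
 *
 *	bch2_btree_iter_set_pos(del, bkey_start_pos(&delete.k));
 *	ret   = bch2_trans_update(&trans, del, &delete, trigger_flags) ?:
 *		bch2_trans_update(&trans, dst, copy.k, trigger_flags) ?:
 *		bch2_trans_commit(&trans, &disk_res, &inode->ei_journal_seq, ...);
 */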
bkey_start_pos(&delete.k) : delete.k.p; @@ -2624,9 +2583,7 @@ reassemble: BUG_ON(ret); } - bch2_btree_iter_set_pos(src, bkey_start_pos(&delete.k)); - - ret = bch2_trans_update(&trans, src, &delete, trigger_flags) ?: + ret = bch2_trans_update(&trans, del, &delete, trigger_flags) ?: bch2_trans_update(&trans, dst, copy.k, trigger_flags) ?: bch2_trans_commit(&trans, &disk_res, &inode->ei_journal_seq, @@ -2654,7 +2611,7 @@ bkey_err: } err: bch2_trans_exit(&trans); - bkey_on_stack_exit(©, c); + bch2_bkey_buf_exit(©, c); bch2_pagecache_block_put(&inode->ei_pagecache_lock); inode_unlock(&inode->v); return ret; @@ -2890,235 +2847,6 @@ static void mark_range_unallocated(struct bch_inode_info *inode, } while (index <= end_index); } -static int generic_access_check_limits(struct file *file, loff_t pos, - loff_t *count) -{ - struct inode *inode = file->f_mapping->host; - loff_t max_size = inode->i_sb->s_maxbytes; - - if (!(file->f_flags & O_LARGEFILE)) - max_size = MAX_NON_LFS; - - if (unlikely(pos >= max_size)) - return -EFBIG; - *count = min(*count, max_size - pos); - return 0; -} - -static int generic_write_check_limits(struct file *file, loff_t pos, - loff_t *count) -{ - loff_t limit = rlimit(RLIMIT_FSIZE); - - if (limit != RLIM_INFINITY) { - if (pos >= limit) { - send_sig(SIGXFSZ, current, 0); - return -EFBIG; - } - *count = min(*count, limit - pos); - } - - return generic_access_check_limits(file, pos, count); -} - -static int generic_remap_checks(struct file *file_in, loff_t pos_in, - struct file *file_out, loff_t pos_out, - loff_t *req_count, unsigned int remap_flags) -{ - struct inode *inode_in = file_in->f_mapping->host; - struct inode *inode_out = file_out->f_mapping->host; - uint64_t count = *req_count; - uint64_t bcount; - loff_t size_in, size_out; - loff_t bs = inode_out->i_sb->s_blocksize; - int ret; - - /* The start of both ranges must be aligned to an fs block. */ - if (!IS_ALIGNED(pos_in, bs) || !IS_ALIGNED(pos_out, bs)) - return -EINVAL; - - /* Ensure offsets don't wrap. */ - if (pos_in + count < pos_in || pos_out + count < pos_out) - return -EINVAL; - - size_in = i_size_read(inode_in); - size_out = i_size_read(inode_out); - - /* Dedupe requires both ranges to be within EOF. */ - if ((remap_flags & REMAP_FILE_DEDUP) && - (pos_in >= size_in || pos_in + count > size_in || - pos_out >= size_out || pos_out + count > size_out)) - return -EINVAL; - - /* Ensure the infile range is within the infile. */ - if (pos_in >= size_in) - return -EINVAL; - count = min(count, size_in - (uint64_t)pos_in); - - ret = generic_access_check_limits(file_in, pos_in, &count); - if (ret) - return ret; - - ret = generic_write_check_limits(file_out, pos_out, &count); - if (ret) - return ret; - - /* - * If the user wanted us to link to the infile's EOF, round up to the - * next block boundary for this check. - * - * Otherwise, make sure the count is also block-aligned, having - * already confirmed the starting offsets' block alignment. - */ - if (pos_in + count == size_in) { - bcount = ALIGN(size_in, bs) - pos_in; - } else { - if (!IS_ALIGNED(count, bs)) - count = ALIGN_DOWN(count, bs); - bcount = count; - } - - /* Don't allow overlapped cloning within the same file. */ - if (inode_in == inode_out && - pos_out + bcount > pos_in && - pos_out < pos_in + bcount) - return -EINVAL; - - /* - * We shortened the request but the caller can't deal with that, so - * bounce the request back to userspace. 
- */ - if (*req_count != count && !(remap_flags & REMAP_FILE_CAN_SHORTEN)) - return -EINVAL; - - *req_count = count; - return 0; -} - -static int generic_remap_check_len(struct inode *inode_in, - struct inode *inode_out, - loff_t pos_out, - loff_t *len, - unsigned int remap_flags) -{ - u64 blkmask = i_blocksize(inode_in) - 1; - loff_t new_len = *len; - - if ((*len & blkmask) == 0) - return 0; - - if ((remap_flags & REMAP_FILE_DEDUP) || - pos_out + *len < i_size_read(inode_out)) - new_len &= ~blkmask; - - if (new_len == *len) - return 0; - - if (remap_flags & REMAP_FILE_CAN_SHORTEN) { - *len = new_len; - return 0; - } - - return (remap_flags & REMAP_FILE_DEDUP) ? -EBADE : -EINVAL; -} - -static int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in, - struct file *file_out, loff_t pos_out, - loff_t *len, unsigned int remap_flags) -{ - struct inode *inode_in = file_inode(file_in); - struct inode *inode_out = file_inode(file_out); - bool same_inode = (inode_in == inode_out); - int ret; - - /* Don't touch certain kinds of inodes */ - if (IS_IMMUTABLE(inode_out)) - return -EPERM; - - if (IS_SWAPFILE(inode_in) || IS_SWAPFILE(inode_out)) - return -ETXTBSY; - - /* Don't reflink dirs, pipes, sockets... */ - if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode)) - return -EISDIR; - if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode)) - return -EINVAL; - - /* Zero length dedupe exits immediately; reflink goes to EOF. */ - if (*len == 0) { - loff_t isize = i_size_read(inode_in); - - if ((remap_flags & REMAP_FILE_DEDUP) || pos_in == isize) - return 0; - if (pos_in > isize) - return -EINVAL; - *len = isize - pos_in; - if (*len == 0) - return 0; - } - - /* Check that we don't violate system file offset limits. */ - ret = generic_remap_checks(file_in, pos_in, file_out, pos_out, len, - remap_flags); - if (ret) - return ret; - - /* Wait for the completion of any pending IOs on both files */ - inode_dio_wait(inode_in); - if (!same_inode) - inode_dio_wait(inode_out); - - ret = filemap_write_and_wait_range(inode_in->i_mapping, - pos_in, pos_in + *len - 1); - if (ret) - return ret; - - ret = filemap_write_and_wait_range(inode_out->i_mapping, - pos_out, pos_out + *len - 1); - if (ret) - return ret; - - /* - * Check that the extents are the same. - */ - if (remap_flags & REMAP_FILE_DEDUP) { - bool is_same = false; - - ret = vfs_dedupe_file_range_compare(inode_in, pos_in, - inode_out, pos_out, *len, &is_same); - if (ret) - return ret; - if (!is_same) - return -EBADE; - } - - ret = generic_remap_check_len(inode_in, inode_out, pos_out, len, - remap_flags); - if (ret) - return ret; - - /* If can't alter the file contents, we're done. */ - if (!(remap_flags & REMAP_FILE_DEDUP)) { - /* Update the timestamps, since we can alter file contents. */ - if (!(file_out->f_mode & FMODE_NOCMTIME)) { - ret = file_update_time(file_out); - if (ret) - return ret; - } - - /* - * Clear the security bits if the process is not being run by - * root. This keeps people from modifying setuid and setgid - * binaries. 
- */ - ret = file_remove_privs(file_out); - if (ret) - return ret; - } - - return 0; -} - loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src, struct file *file_dst, loff_t pos_dst, loff_t len, unsigned remap_flags) diff --git a/fs/bcachefs/fs-io.h b/fs/bcachefs/fs-io.h index 1b593ea707d5..2537a3d25ede 100644 --- a/fs/bcachefs/fs-io.h +++ b/fs/bcachefs/fs-io.h @@ -19,8 +19,7 @@ int bch2_writepage(struct page *, struct writeback_control *); int bch2_readpage(struct file *, struct page *); int bch2_writepages(struct address_space *, struct writeback_control *); -int bch2_readpages(struct file *, struct address_space *, - struct list_head *, unsigned); +void bch2_readahead(struct readahead_control *); int bch2_write_begin(struct file *, struct address_space *, loff_t, unsigned, unsigned, struct page **, void **); @@ -35,10 +34,6 @@ int bch2_fsync(struct file *, loff_t, loff_t, int); int bch2_truncate(struct bch_inode_info *, struct iattr *); long bch2_fallocate_dispatch(struct file *, int, loff_t, loff_t); -#define REMAP_FILE_ADVISORY (0) -#define REMAP_FILE_DEDUP (1 << 0) -#define REMAP_FILE_CAN_SHORTEN (1 << 1) - loff_t bch2_remap_file_range(struct file *, loff_t, struct file *, loff_t, loff_t, unsigned); diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index cc0a4b0f0e5b..a2654c862b7b 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -3,7 +3,7 @@ #include "bcachefs.h" #include "acl.h" -#include "bkey_on_stack.h" +#include "bkey_buf.h" #include "btree_update.h" #include "buckets.h" #include "chardev.h" @@ -886,17 +886,21 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info, struct btree_trans trans; struct btree_iter *iter; struct bkey_s_c k; - struct bkey_on_stack cur, prev; + struct bkey_buf cur, prev; struct bpos end = POS(ei->v.i_ino, (start + len) >> 9); unsigned offset_into_extent, sectors; bool have_extent = false; int ret = 0; + ret = fiemap_prep(&ei->v, info, start, &len, FIEMAP_FLAG_SYNC); + if (ret) + return ret; + if (start + len < start) return -EINVAL; - bkey_on_stack_init(&cur); - bkey_on_stack_init(&prev); + bch2_bkey_buf_init(&cur); + bch2_bkey_buf_init(&prev); bch2_trans_init(&trans, c, 0, 0); iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, @@ -915,7 +919,7 @@ retry: bkey_start_offset(k.k); sectors = k.k->size - offset_into_extent; - bkey_on_stack_reassemble(&cur, c, k); + bch2_bkey_buf_reassemble(&cur, c, k); ret = bch2_read_indirect_extent(&trans, &offset_into_extent, &cur); @@ -923,7 +927,7 @@ retry: break; k = bkey_i_to_s_c(cur.k); - bkey_on_stack_realloc(&prev, c, k.k->u64s); + bch2_bkey_buf_realloc(&prev, c, k.k->u64s); sectors = min(sectors, k.k->size - offset_into_extent); @@ -957,8 +961,8 @@ retry: FIEMAP_EXTENT_LAST); ret = bch2_trans_exit(&trans) ?: ret; - bkey_on_stack_exit(&cur, c); - bkey_on_stack_exit(&prev, c); + bch2_bkey_buf_exit(&cur, c); + bch2_bkey_buf_exit(&prev, c); return ret < 0 ? 
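/*
 * On the bch2_fiemap() change above: fiemap_prep() (v5.8+) centralizes the
 * boilerplate every ->fiemap implementation used to open-code. It rejects
 * flags outside the mask the filesystem supports (here FIEMAP_FLAG_SYNC),
 * clamps the requested length against s_maxbytes, and writes back dirty
 * pagecache first when FIEMAP_FLAG_SYNC is set:
 *
 *	ret = fiemap_prep(&ei->v, info, start, &len, FIEMAP_FLAG_SYNC);
 *	if (ret)
 *		return ret;
 *
 * (The "start + len < start" overflow check that follows is arguably
 * redundant after this, but harmless.)
 */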
ret : 0; } @@ -995,15 +999,6 @@ static int bch2_vfs_readdir(struct file *file, struct dir_context *ctx) return bch2_readdir(c, inode->v.i_ino, ctx); } -static int bch2_clone_file_range(struct file *file_src, loff_t pos_src, - struct file *file_dst, loff_t pos_dst, - u64 len) -{ - return bch2_remap_file_range(file_src, pos_src, - file_dst, pos_dst, - len, 0); -} - static const struct file_operations bch_file_operations = { .llseek = bch2_llseek, .read_iter = bch2_read_iter, @@ -1012,16 +1007,16 @@ static const struct file_operations bch_file_operations = { .open = generic_file_open, .fsync = bch2_fsync, .splice_read = generic_file_splice_read, - /* - * Broken, on v5.3: +#if 0 + /* Busted: */ .splice_write = iter_file_splice_write, - */ +#endif .fallocate = bch2_fallocate_dispatch, .unlocked_ioctl = bch2_fs_file_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = bch2_compat_fs_ioctl, #endif - .clone_file_range = bch2_clone_file_range, + .remap_file_range = bch2_remap_file_range, }; static const struct inode_operations bch_file_inode_operations = { @@ -1091,7 +1086,7 @@ static const struct address_space_operations bch_address_space_operations = { .writepage = bch2_writepage, .readpage = bch2_readpage, .writepages = bch2_writepages, - .readpages = bch2_readpages, + .readahead = bch2_readahead, .set_page_dirty = __set_page_dirty_nobuffers, .write_begin = bch2_write_begin, .write_end = bch2_write_end, @@ -1577,9 +1572,7 @@ got_sb: if (ret) goto err_put_super; - sb->s_bdi->congested_fn = bch2_congested; - sb->s_bdi->congested_data = c; - sb->s_bdi->ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_SIZE; + sb->s_bdi->ra_pages = VM_READAHEAD_PAGES; for_each_online_member(ca, c, i) { struct block_device *bdev = ca->disk_sb.bdev; diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 39f872de0c18..66c9dad2ef3e 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" -#include "bkey_on_stack.h" +#include "bkey_buf.h" #include "btree_update.h" #include "dirent.h" #include "error.h" @@ -58,7 +58,7 @@ static int __remove_dirent(struct btree_trans *trans, buf[name.len] = '\0'; name.name = buf; - ret = bch2_inode_find_by_inum_trans(trans, dir_inum, &dir_inode); + ret = __bch2_inode_find_by_inum_trans(trans, dir_inum, &dir_inode, 0); if (ret && ret != -EINTR) bch_err(c, "remove_dirent: err %i looking up directory inode", ret); if (ret) @@ -126,8 +126,8 @@ static int walk_inode(struct btree_trans *trans, struct inode_walker *w, u64 inum) { if (inum != w->cur_inum) { - int ret = bch2_inode_find_by_inum_trans(trans, inum, - &w->inode); + int ret = __bch2_inode_find_by_inum_trans(trans, inum, + &w->inode, 0); if (ret && ret != -ENOENT) return ret; @@ -193,7 +193,7 @@ static int hash_redo_key(const struct bch_hash_desc desc, bch2_trans_update(trans, k_iter, &delete, 0); return bch2_hash_set(trans, desc, &h->info, k_iter->pos.inode, - tmp, BCH_HASH_SET_MUST_CREATE); + tmp, 0); } static int fsck_hash_delete_at(struct btree_trans *trans, @@ -464,11 +464,11 @@ static int check_extents(struct bch_fs *c) struct btree_trans trans; struct btree_iter *iter; struct bkey_s_c k; - struct bkey_on_stack prev; + struct bkey_buf prev; u64 i_sectors; int ret = 0; - bkey_on_stack_init(&prev); + bch2_bkey_buf_init(&prev); prev.k->k = KEY(0, 0, 0); bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); @@ -500,7 +500,7 @@ retry: goto err; } } - bkey_on_stack_reassemble(&prev, c, k); + bch2_bkey_buf_reassemble(&prev, c, k); ret = walk_inode(&trans, &w, k.k->p.inode); if (ret) @@ 
-569,7 +569,7 @@ err: fsck_err: if (ret == -EINTR) goto retry; - bkey_on_stack_exit(&prev, c); + bch2_bkey_buf_exit(&prev, c); return bch2_trans_exit(&trans) ?: ret; } @@ -673,7 +673,7 @@ retry: continue; } - ret = bch2_inode_find_by_inum_trans(&trans, d_inum, &target); + ret = __bch2_inode_find_by_inum_trans(&trans, d_inum, &target, 0); if (ret && ret != -ENOENT) break; @@ -787,7 +787,9 @@ static int check_root(struct bch_fs *c, struct bch_inode_unpacked *root_inode) bch_verbose(c, "checking root directory"); - ret = bch2_inode_find_by_inum(c, BCACHEFS_ROOT_INO, root_inode); + ret = bch2_trans_do(c, NULL, NULL, 0, + __bch2_inode_find_by_inum_trans(&trans, BCACHEFS_ROOT_INO, + root_inode, 0)); if (ret && ret != -ENOENT) return ret; @@ -834,7 +836,8 @@ static int check_lostfound(struct bch_fs *c, goto create_lostfound; } - ret = bch2_inode_find_by_inum(c, inum, lostfound_inode); + ret = bch2_trans_do(c, NULL, NULL, 0, + __bch2_inode_find_by_inum_trans(&trans, inum, lostfound_inode, 0)); if (ret && ret != -ENOENT) return ret; @@ -1072,6 +1075,11 @@ static void inc_link(struct bch_fs *c, nlink_table *links, if (inum < range_start || inum >= *range_end) return; + if (inum - range_start >= SIZE_MAX / sizeof(struct nlink)) { + *range_end = inum; + return; + } + link = genradix_ptr_alloc(links, inum - range_start, GFP_KERNEL); if (!link) { bch_verbose(c, "allocation failed during fsck - will need another pass"); @@ -1346,23 +1354,25 @@ static int bch2_gc_walk_inodes(struct bch_fs *c, nlinks_iter = genradix_iter_init(links, 0); while ((k = bch2_btree_iter_peek(iter)).k && - !(ret2 = bkey_err(k))) { + !(ret2 = bkey_err(k)) && + iter->pos.offset < range_end) { peek_nlinks: link = genradix_iter_peek(&nlinks_iter, links); if (!link && (!k.k || iter->pos.offset >= range_end)) break; nlinks_pos = range_start + nlinks_iter.pos; - if (iter->pos.offset > nlinks_pos) { + + if (link && nlinks_pos < iter->pos.offset) { /* Should have been caught by dirents pass: */ - need_fsck_err_on(link && link->count, c, + need_fsck_err_on(link->count, c, "missing inode %llu (nlink %u)", nlinks_pos, link->count); genradix_iter_advance(&nlinks_iter, links); goto peek_nlinks; } - if (iter->pos.offset < nlinks_pos || !link) + if (!link || nlinks_pos > iter->pos.offset) link = &zero_links; if (k.k && k.k->type == KEY_TYPE_inode) { diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index bf1c7319669c..81feb47fe8f9 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -479,7 +479,7 @@ int bch2_inode_create(struct btree_trans *trans, u64 min, max, start, *hint; int ret; - unsigned cpu = raw_smp_processor_id(); + u64 cpu = raw_smp_processor_id(); unsigned bits = (c->opts.inodes_32bit ? 31 : 63) - c->inode_shard_bits; @@ -628,16 +628,19 @@ err: return ret; } -int bch2_inode_find_by_inum_trans(struct btree_trans *trans, u64 inode_nr, - struct bch_inode_unpacked *inode) +int __bch2_inode_find_by_inum_trans(struct btree_trans *trans, u64 inode_nr, + struct bch_inode_unpacked *inode, + unsigned flags) { struct btree_iter *iter; struct bkey_s_c k; int ret; iter = bch2_trans_get_iter(trans, BTREE_ID_INODES, - POS(0, inode_nr), BTREE_ITER_CACHED); - k = bch2_btree_iter_peek_cached(iter); + POS(0, inode_nr), flags); + k = (flags & BTREE_ITER_TYPE) == BTREE_ITER_CACHED + ? 
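/*
 * The new guard in inc_link() above is an overflow check: the index handed
 * to genradix_ptr_alloc() is scaled by sizeof(struct nlink) internally, so
 * an inode number far beyond range_start could wrap a size_t. Rather than
 * erroring out, fsck shrinks the current window (*range_end = inum) and
 * picks the remaining inodes up on a later pass, the same strategy already
 * used when the allocation itself fails. The generic idiom:
 *
 *	if (idx >= SIZE_MAX / elem_size)
 *		return -EOVERFLOW;	// idx * elem_size would wrap
 */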
bch2_btree_iter_peek_cached(iter) + : bch2_btree_iter_peek_slot(iter); ret = bkey_err(k); if (ret) goto err; @@ -650,6 +653,14 @@ err: return ret; } +int bch2_inode_find_by_inum_trans(struct btree_trans *trans, u64 inode_nr, + struct bch_inode_unpacked *inode) +{ + return __bch2_inode_find_by_inum_trans(trans, inode_nr, + inode, BTREE_ITER_CACHED); + +} + int bch2_inode_find_by_inum(struct bch_fs *c, u64 inode_nr, struct bch_inode_unpacked *inode) { diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h index dbdfcf63d079..1caf036ae928 100644 --- a/fs/bcachefs/inode.h +++ b/fs/bcachefs/inode.h @@ -73,6 +73,8 @@ int bch2_inode_create(struct btree_trans *, struct bch_inode_unpacked *); int bch2_inode_rm(struct bch_fs *, u64, bool); +int __bch2_inode_find_by_inum_trans(struct btree_trans *, u64, + struct bch_inode_unpacked *, unsigned); int bch2_inode_find_by_inum_trans(struct btree_trans *, u64, struct bch_inode_unpacked *); int bch2_inode_find_by_inum(struct bch_fs *, u64, struct bch_inode_unpacked *); diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index d31a1d449c03..5f74583f5c61 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -9,7 +9,7 @@ #include "bcachefs.h" #include "alloc_background.h" #include "alloc_foreground.h" -#include "bkey_on_stack.h" +#include "bkey_buf.h" #include "bset.h" #include "btree_update.h" #include "buckets.h" @@ -135,10 +135,10 @@ void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw) void bch2_bio_free_pages_pool(struct bch_fs *c, struct bio *bio) { + struct bvec_iter_all iter; struct bio_vec *bv; - unsigned i; - bio_for_each_segment_all(bv, bio, i) + bio_for_each_segment_all(bv, bio, iter) if (bv->bv_page != ZERO_PAGE(0)) mempool_free(bv->bv_page, &c->bio_bounce_pages); bio->bi_vcnt = 0; @@ -183,18 +183,23 @@ void bch2_bio_alloc_pages_pool(struct bch_fs *c, struct bio *bio, /* Extent update path: */ -static int sum_sector_overwrites(struct btree_trans *trans, - struct btree_iter *extent_iter, - struct bkey_i *new, - bool *maybe_extending, - s64 *i_sectors_delta, - s64 *disk_sectors_delta) +int bch2_sum_sector_overwrites(struct btree_trans *trans, + struct btree_iter *extent_iter, + struct bkey_i *new, + bool *maybe_extending, + bool *should_check_enospc, + s64 *i_sectors_delta, + s64 *disk_sectors_delta) { + struct bch_fs *c = trans->c; struct btree_iter *iter; struct bkey_s_c old; + unsigned new_replicas = bch2_bkey_replicas(c, bkey_i_to_s_c(new)); + bool new_compressed = bch2_bkey_sectors_compressed(bkey_i_to_s_c(new)); int ret = 0; *maybe_extending = true; + *should_check_enospc = false; *i_sectors_delta = 0; *disk_sectors_delta = 0; @@ -213,6 +218,11 @@ static int sum_sector_overwrites(struct btree_trans *trans, (int) (bch2_bkey_nr_ptrs_allocated(bkey_i_to_s_c(new)) - bch2_bkey_nr_ptrs_fully_allocated(old)); + if (!*should_check_enospc && + (new_replicas > bch2_bkey_replicas(c, old) || + (!new_compressed && bch2_bkey_sectors_compressed(old)))) + *should_check_enospc = true; + if (bkey_cmp(old.k->p, new->k.p) >= 0) { /* * Check if there's already data above where we're @@ -250,7 +260,7 @@ int bch2_extent_update(struct btree_trans *trans, { /* this must live until after bch2_trans_commit(): */ struct bkey_inode_buf inode_p; - bool extending = false; + bool extending = false, should_check_enospc; s64 i_sectors_delta = 0, disk_sectors_delta = 0; int ret; @@ -258,8 +268,9 @@ int bch2_extent_update(struct btree_trans *trans, if (ret) return ret; - ret = sum_sector_overwrites(trans, iter, k, + ret = bch2_sum_sector_overwrites(trans, iter, k, 
&extending, + &should_check_enospc, &i_sectors_delta, &disk_sectors_delta); if (ret) @@ -269,7 +280,8 @@ int bch2_extent_update(struct btree_trans *trans, disk_sectors_delta > (s64) disk_res->sectors) { ret = bch2_disk_reservation_add(trans->c, disk_res, disk_sectors_delta - disk_res->sectors, - 0); + !should_check_enospc + ? BCH_DISK_RESERVATION_NOFAIL : 0); if (ret) return ret; } @@ -320,8 +332,7 @@ int bch2_extent_update(struct btree_trans *trans, ret = bch2_trans_commit(trans, disk_res, journal_seq, BTREE_INSERT_NOCHECK_RW| - BTREE_INSERT_NOFAIL| - BTREE_INSERT_USE_RESERVE); + BTREE_INSERT_NOFAIL); if (ret) return ret; @@ -404,14 +415,14 @@ int bch2_fpunch(struct bch_fs *c, u64 inum, u64 start, u64 end, int bch2_write_index_default(struct bch_write_op *op) { struct bch_fs *c = op->c; - struct bkey_on_stack sk; + struct bkey_buf sk; struct keylist *keys = &op->insert_keys; struct bkey_i *k = bch2_keylist_front(keys); struct btree_trans trans; struct btree_iter *iter; int ret; - bkey_on_stack_init(&sk); + bch2_bkey_buf_init(&sk); bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024); iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, @@ -423,7 +434,7 @@ int bch2_write_index_default(struct bch_write_op *op) k = bch2_keylist_front(keys); - bkey_on_stack_realloc(&sk, c, k->k.u64s); + bch2_bkey_buf_realloc(&sk, c, k->k.u64s); bkey_copy(sk.k, k); bch2_cut_front(iter->pos, sk.k); @@ -440,7 +451,7 @@ int bch2_write_index_default(struct bch_write_op *op) } while (!bch2_keylist_empty(keys)); bch2_trans_exit(&trans); - bkey_on_stack_exit(&sk, c); + bch2_bkey_buf_exit(&sk, c); return ret; } @@ -488,9 +499,6 @@ void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c, n->submit_time = local_clock(); n->bio.bi_iter.bi_sector = ptr->offset; - if (!journal_flushes_device(ca)) - n->bio.bi_opf |= REQ_FUA; - if (likely(n->have_ioref)) { this_cpu_add(ca->io_done->sectors[WRITE][type], bio_sectors(&n->bio)); @@ -1617,14 +1625,14 @@ static void bch2_read_retry_nodecode(struct bch_fs *c, struct bch_read_bio *rbio { struct btree_trans trans; struct btree_iter *iter; - struct bkey_on_stack sk; + struct bkey_buf sk; struct bkey_s_c k; int ret; flags &= ~BCH_READ_LAST_FRAGMENT; flags |= BCH_READ_MUST_CLONE; - bkey_on_stack_init(&sk); + bch2_bkey_buf_init(&sk); bch2_trans_init(&trans, c, 0, 0); iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, @@ -1636,7 +1644,7 @@ retry: if (bkey_err(k)) goto err; - bkey_on_stack_reassemble(&sk, c, k); + bch2_bkey_buf_reassemble(&sk, c, k); k = bkey_i_to_s_c(sk.k); bch2_trans_unlock(&trans); @@ -1657,7 +1665,7 @@ retry: out: bch2_rbio_done(rbio); bch2_trans_exit(&trans); - bkey_on_stack_exit(&sk, c); + bch2_bkey_buf_exit(&sk, c); return; err: rbio->bio.bi_status = BLK_STS_IOERR; @@ -1670,14 +1678,14 @@ static void bch2_read_retry(struct bch_fs *c, struct bch_read_bio *rbio, { struct btree_trans trans; struct btree_iter *iter; - struct bkey_on_stack sk; + struct bkey_buf sk; struct bkey_s_c k; int ret; flags &= ~BCH_READ_LAST_FRAGMENT; flags |= BCH_READ_MUST_CLONE; - bkey_on_stack_init(&sk); + bch2_bkey_buf_init(&sk); bch2_trans_init(&trans, c, 0, 0); retry: bch2_trans_begin(&trans); @@ -1687,7 +1695,7 @@ retry: BTREE_ITER_SLOTS, k, ret) { unsigned bytes, sectors, offset_into_extent; - bkey_on_stack_reassemble(&sk, c, k); + bch2_bkey_buf_reassemble(&sk, c, k); offset_into_extent = iter->pos.offset - bkey_start_offset(k.k); @@ -1736,7 +1744,7 @@ err: rbio->bio.bi_status = BLK_STS_IOERR; out: bch2_trans_exit(&trans); - bkey_on_stack_exit(&sk, c); + 
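/*
 * The should_check_enospc plumbing above is the other half of the
 * compressed-overwrite fix: bch2_sum_sector_overwrites() marks an update
 * as needing a real ENOSPC check only if the new key wants more replicas
 * than something it overwrites, or is uncompressed where the old data was
 * compressed. When neither holds, the overwrite cannot consume net new
 * space, so bch2_extent_update() takes its disk reservation with
 * BCH_DISK_RESERVATION_NOFAIL and the reservation is pure accounting.
 * Worked example: rewriting a 2x-replicated extent with 2x-replicated,
 * equally compressible data goes through NOFAIL; bumping the same range to
 * 3 replicas may legitimately see -ENOSPC.
 */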
bch2_bkey_buf_exit(&sk, c); bch2_rbio_done(rbio); } @@ -1807,17 +1815,6 @@ static int __bch2_rbio_narrow_crcs(struct btree_trans *trans, if ((ret = bkey_err(k))) goto out; - /* - * going to be temporarily appending another checksum entry: - */ - new = bch2_trans_kmalloc(trans, bkey_bytes(k.k) + - BKEY_EXTENT_U64s_MAX * 8); - if ((ret = PTR_ERR_OR_ZERO(new))) - goto out; - - bkey_reassemble(new, k); - k = bkey_i_to_s_c(new); - if (bversion_cmp(k.k->version, rbio->version) || !bch2_bkey_matches_ptr(c, k, rbio->pick.ptr, data_offset)) goto out; @@ -1836,6 +1833,16 @@ static int __bch2_rbio_narrow_crcs(struct btree_trans *trans, goto out; } + /* + * going to be temporarily appending another checksum entry: + */ + new = bch2_trans_kmalloc(trans, bkey_bytes(k.k) + + sizeof(struct bch_extent_crc128)); + if ((ret = PTR_ERR_OR_ZERO(new))) + goto out; + + bkey_reassemble(new, k); + if (!bch2_bkey_narrow_crcs(new, new_crc)) goto out; @@ -2002,7 +2009,7 @@ static void bch2_read_endio(struct bio *bio) int __bch2_read_indirect_extent(struct btree_trans *trans, unsigned *offset_into_extent, - struct bkey_on_stack *orig_k) + struct bkey_buf *orig_k) { struct btree_iter *iter; struct bkey_s_c k; @@ -2029,7 +2036,7 @@ int __bch2_read_indirect_extent(struct btree_trans *trans, } *offset_into_extent = iter->pos.offset - bkey_start_offset(k.k); - bkey_on_stack_reassemble(orig_k, trans->c, k); + bch2_bkey_buf_reassemble(orig_k, trans->c, k); err: bch2_trans_iter_put(trans, iter); return ret; @@ -2208,7 +2215,11 @@ get_bio: bch2_increment_clock(c, bio_sectors(&rbio->bio), READ); - if (pick.ptr.cached) + /* + * If it's being moved internally, we don't want to flag it as a cache + * hit: + */ + if (pick.ptr.cached && !(flags & BCH_READ_NODECODE)) bch2_bucket_io_time_reset(trans, pick.ptr.dev, PTR_BUCKET_NR(ca, &pick.ptr), READ); @@ -2290,7 +2301,7 @@ void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, u64 inode) { struct btree_trans trans; struct btree_iter *iter; - struct bkey_on_stack sk; + struct bkey_buf sk; struct bkey_s_c k; unsigned flags = BCH_READ_RETRY_IF_STALE| BCH_READ_MAY_PROMOTE| @@ -2304,7 +2315,7 @@ void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, u64 inode) rbio->c = c; rbio->start_time = local_clock(); - bkey_on_stack_init(&sk); + bch2_bkey_buf_init(&sk); bch2_trans_init(&trans, c, 0, 0); retry: bch2_trans_begin(&trans); @@ -2327,7 +2338,7 @@ retry: bkey_start_offset(k.k); sectors = k.k->size - offset_into_extent; - bkey_on_stack_reassemble(&sk, c, k); + bch2_bkey_buf_reassemble(&sk, c, k); ret = bch2_read_indirect_extent(&trans, &offset_into_extent, &sk); @@ -2364,7 +2375,7 @@ retry: } out: bch2_trans_exit(&trans); - bkey_on_stack_exit(&sk, c); + bch2_bkey_buf_exit(&sk, c); return; err: if (ret == -EINTR) diff --git a/fs/bcachefs/io.h b/fs/bcachefs/io.h index e6aac594f3e6..04f6baa1daf7 100644 --- a/fs/bcachefs/io.h +++ b/fs/bcachefs/io.h @@ -3,7 +3,7 @@ #define _BCACHEFS_IO_H #include "checksum.h" -#include "bkey_on_stack.h" +#include "bkey_buf.h" #include "io_types.h" #define to_wbio(_bio) \ @@ -60,6 +60,8 @@ static inline struct workqueue_struct *index_update_wq(struct bch_write_op *op) : op->c->wq; } +int bch2_sum_sector_overwrites(struct btree_trans *, struct btree_iter *, + struct bkey_i *, bool *, bool *, s64 *, s64 *); int bch2_extent_update(struct btree_trans *, struct btree_iter *, struct bkey_i *, struct disk_reservation *, u64 *, u64, s64 *); @@ -112,11 +114,11 @@ struct cache_promote_op; struct extent_ptr_decoded; int __bch2_read_indirect_extent(struct btree_trans 
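/*
 * Two independent read-path fixes above: __bch2_rbio_narrow_crcs() now
 * performs its version/pointer/crc checks before allocating the
 * replacement key rather than after, and the allocation shrank from
 * BKEY_EXTENT_U64s_MAX * 8 to bkey_bytes(k.k) + sizeof(struct
 * bch_extent_crc128), since narrowing appends at most one crc entry and
 * crc128 is the largest crc entry format. Separately, bucket IO times are
 * no longer reset for BCH_READ_NODECODE reads, so internal moves (copygc,
 * rebalance) reading via a cached pointer don't make the bucket look
 * recently used.
 */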
*, unsigned *, - struct bkey_on_stack *); + struct bkey_buf *); static inline int bch2_read_indirect_extent(struct btree_trans *trans, unsigned *offset_into_extent, - struct bkey_on_stack *k) + struct bkey_buf *k) { return k->k->k.type == KEY_TYPE_reflink_p ? __bch2_read_indirect_extent(trans, offset_into_extent, k) diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index d54424829378..395021b5ac8e 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -9,6 +9,7 @@ #include "alloc_foreground.h" #include "bkey_methods.h" #include "btree_gc.h" +#include "btree_update.h" #include "buckets.h" #include "journal.h" #include "journal_io.h" @@ -82,6 +83,7 @@ static void bch2_journal_buf_init(struct journal *j) bkey_extent_init(&buf->key); buf->noflush = false; buf->must_flush = false; + buf->separate_flush = false; memset(buf->has_inode, 0, sizeof(buf->has_inode)); @@ -118,6 +120,9 @@ void __bch2_journal_buf_put(struct journal *j) /* * Returns true if journal entry is now closed: + * + * We don't close a journal_buf until the next journal_buf is finished writing, + * and can be opened again - this also initializes the next journal_buf: */ static bool __journal_entry_close(struct journal *j) { @@ -155,6 +160,7 @@ static bool __journal_entry_close(struct journal *j) } while ((v = atomic64_cmpxchg(&j->reservations.counter, old.v, new.v)) != old.v); + /* Close out old buffer: */ buf->data->u64s = cpu_to_le32(old.cur_entry_offset); sectors = vstruct_blocks_plus(buf->data, c->block_bits, @@ -185,6 +191,7 @@ static bool __journal_entry_close(struct journal *j) __bch2_journal_pin_put(j, le64_to_cpu(buf->data->seq)); + /* Initialize new buffer: */ journal_pin_new_entry(j, 1); bch2_journal_buf_init(j); @@ -568,6 +575,8 @@ int bch2_journal_flush_seq_async(struct journal *j, u64 seq, spin_lock(&j->lock); + BUG_ON(seq > journal_cur_seq(j)); + /* Recheck under lock: */ if (j->err_seq && seq >= j->err_seq) { ret = -EIO; @@ -633,9 +642,10 @@ int bch2_journal_flush_seq(struct journal *j, u64 seq) u64 start_time = local_clock(); int ret, ret2; - ret = wait_event_killable(j->wait, (ret2 = bch2_journal_flush_seq_async(j, seq, NULL))); + ret = wait_event_interruptible(j->wait, (ret2 = bch2_journal_flush_seq_async(j, seq, NULL))); - bch2_time_stats_update(j->flush_seq_time, start_time); + if (!ret) + bch2_time_stats_update(j->flush_seq_time, start_time); return ret ?: ret2 < 0 ? 
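/*
 * Two small behavioural changes to bch2_journal_flush_seq() here: the wait
 * is now plain interruptible rather than killable-only, and (the likelier
 * motivation) the flush-time histogram is only updated when the wait
 * actually completed; a wait that returned early on a signal would
 * otherwise record a sample for a flush that never finished:
 *
 *	ret = wait_event_interruptible(j->wait,
 *		(ret2 = bch2_journal_flush_seq_async(j, seq, NULL)));
 *	if (!ret)
 *		bch2_time_stats_update(j->flush_seq_time, start_time);
 */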
ret2 : 0; } @@ -777,7 +787,7 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, } } else { rcu_read_lock(); - ob = bch2_bucket_alloc(c, ca, RESERVE_ALLOC, + ob = bch2_bucket_alloc(c, ca, RESERVE_NONE, false, cl); rcu_read_unlock(); if (IS_ERR(ob)) { @@ -818,18 +828,28 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, if (pos <= ja->cur_idx) ja->cur_idx = (ja->cur_idx + 1) % ja->nr; - bch2_mark_metadata_bucket(c, ca, bucket, BCH_DATA_journal, - ca->mi.bucket_size, - gc_phase(GC_PHASE_SB), - 0); + if (!c || new_fs) + bch2_mark_metadata_bucket(c, ca, bucket, BCH_DATA_journal, + ca->mi.bucket_size, + gc_phase(GC_PHASE_SB), + 0); if (c) { spin_unlock(&c->journal.lock); percpu_up_read(&c->mark_lock); } + if (c && !new_fs) + ret = bch2_trans_do(c, NULL, NULL, BTREE_INSERT_NOFAIL, + bch2_trans_mark_metadata_bucket(&trans, NULL, ca, + bucket, BCH_DATA_journal, + ca->mi.bucket_size)); + if (!new_fs) bch2_open_bucket_put(c, ob); + + if (ret) + goto err; } err: bch2_sb_resize_journal(&ca->disk_sb, @@ -948,6 +968,7 @@ void bch2_fs_journal_stop(struct journal *j) journal_quiesce(j); BUG_ON(!bch2_journal_error(j) && + test_bit(JOURNAL_REPLAY_DONE, &j->flags) && (journal_entry_is_open(j) || j->last_empty_seq + 1 != journal_cur_seq(j))); @@ -993,13 +1014,19 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq, } list_for_each_entry(i, journal_entries, list) { + unsigned ptr; + seq = le64_to_cpu(i->j.seq); BUG_ON(seq >= cur_seq); if (seq < last_seq) continue; - journal_seq_pin(j, seq)->devs = i->devs; + p = journal_seq_pin(j, seq); + + p->devs.nr = 0; + for (ptr = 0; ptr < i->nr_ptrs; ptr++) + bch2_dev_list_add_dev(&p->devs, i->ptrs[ptr].dev); } spin_lock(&j->lock); @@ -1093,10 +1120,6 @@ int bch2_fs_journal_init(struct journal *j) j->write_delay_ms = 1000; j->reclaim_delay_ms = 100; - /* Btree roots: */ - j->entry_u64s_reserved += - BTREE_ID_NR * (JSET_KEYS_U64s + BKEY_EXTENT_U64s_MAX); - atomic64_set(&j->reservations.counter, ((union journal_res_state) { .cur_entry_offset = JOURNAL_ENTRY_CLOSED_VAL }).v); @@ -1138,6 +1161,7 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) "seq:\t\t\t%llu\n" "last_seq:\t\t%llu\n" "last_seq_ondisk:\t%llu\n" + "flushed_seq_ondisk:\t%llu\n" "prereserved:\t\t%u/%u\n" "nr flush writes:\t%llu\n" "nr noflush writes:\t%llu\n" @@ -1150,6 +1174,7 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) journal_cur_seq(j), journal_last_seq(j), j->last_seq_ondisk, + j->flushed_seq_ondisk, j->prereserved.reserved, j->prereserved.remaining, j->nr_flush_writes, diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h index 34ef66994b73..bda8cb97d321 100644 --- a/fs/bcachefs/journal.h +++ b/fs/bcachefs/journal.h @@ -288,7 +288,7 @@ static inline void bch2_journal_res_put(struct journal *j, if (!res->ref) return; - lock_release(&j->res_map, 0, _THIS_IP_); + lock_release(&j->res_map, _THIS_IP_); while (res->u64s) bch2_journal_add_entry(j, res, @@ -494,11 +494,6 @@ static inline int bch2_journal_error(struct journal *j) struct bch_dev; -static inline bool journal_flushes_device(struct bch_dev *ca) -{ - return true; -} - static inline void bch2_journal_set_replay_done(struct journal *j) { BUG_ON(!test_bit(JOURNAL_STARTED, &j->flags)); diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index 0e6fbe2f6a75..0d361f5c39b5 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -5,6 +5,7 @@ #include "btree_update_interior.h" #include "buckets.h" #include 
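/*
 * Journal bucket allocation above now updates allocation info through the
 * btree for an existing filesystem: bch2_trans_mark_metadata_bucket() runs
 * as a transactional trigger via bch2_trans_do(), while the old in-memory
 * bch2_mark_metadata_bucket() call is kept only for the new-fs/no-fs cases
 * where there is no btree to update yet:
 *
 *	if (!c || new_fs)
 *		bch2_mark_metadata_bucket(c, ca, bucket, BCH_DATA_journal,
 *					  ca->mi.bucket_size,
 *					  gc_phase(GC_PHASE_SB), 0);
 *	...
 *	if (c && !new_fs)
 *		ret = bch2_trans_do(c, NULL, NULL, BTREE_INSERT_NOFAIL,
 *			bch2_trans_mark_metadata_bucket(&trans, NULL, ca,
 *					bucket, BCH_DATA_journal,
 *					ca->mi.bucket_size));
 */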
"checksum.h" +#include "disk_groups.h" #include "error.h" #include "io.h" #include "journal.h" @@ -46,15 +47,16 @@ struct journal_list { * be replayed: */ static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca, + struct bch_extent_ptr entry_ptr, struct journal_list *jlist, struct jset *j, bool bad) { - struct journal_replay *i, *pos; - struct bch_devs_list devs = { .nr = 0 }; + struct journal_replay *i, *pos, *dup = NULL; + struct bch_extent_ptr *ptr; struct list_head *where; size_t bytes = vstruct_bytes(j); u64 last_seq = 0; - int ret; + int ret = JOURNAL_ENTRY_ADD_OK; list_for_each_entry_reverse(i, jlist->head, list) { if (!JSET_NO_FLUSH(&i->j)) { @@ -88,28 +90,31 @@ static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca, where = jlist->head; add: - i = where->next != jlist->head + dup = where->next != jlist->head ? container_of(where->next, struct journal_replay, list) : NULL; + if (dup && le64_to_cpu(j->seq) != le64_to_cpu(dup->j.seq)) + dup = NULL; + /* * Duplicate journal entries? If so we want the one that didn't have a * checksum error: */ - if (i && le64_to_cpu(j->seq) == le64_to_cpu(i->j.seq)) { - if (i->bad) { - devs = i->devs; - __journal_replay_free(i); + if (dup) { + if (dup->bad) { + /* we'll replace @dup: */ } else if (bad) { + i = dup; goto found; } else { - fsck_err_on(bytes != vstruct_bytes(&i->j) || - memcmp(j, &i->j, bytes), c, + fsck_err_on(bytes != vstruct_bytes(&dup->j) || + memcmp(j, &dup->j, bytes), c, "found duplicate but non identical journal entries (seq %llu)", le64_to_cpu(j->seq)); + i = dup; goto found; } - } i = kvpmalloc(offsetof(struct journal_replay, j) + bytes, GFP_KERNEL); @@ -118,17 +123,34 @@ add: goto out; } - list_add(&i->list, where); - i->devs = devs; - i->bad = bad; - i->ignore = false; + i->nr_ptrs = 0; + i->bad = bad; + i->ignore = false; memcpy(&i->j, j, bytes); + + if (dup) { + i->nr_ptrs = dup->nr_ptrs; + memcpy(i->ptrs, dup->ptrs, sizeof(dup->ptrs)); + __journal_replay_free(dup); + } + + list_add(&i->list, where); found: - if (!bch2_dev_list_has_dev(i->devs, ca->dev_idx)) - bch2_dev_list_add_dev(&i->devs, ca->dev_idx); - else - fsck_err_on(1, c, "duplicate journal entries on same device"); - ret = JOURNAL_ENTRY_ADD_OK; + for (ptr = i->ptrs; ptr < i->ptrs + i->nr_ptrs; ptr++) { + if (ptr->dev == ca->dev_idx) { + bch_err(c, "duplicate journal entry %llu on same device", + le64_to_cpu(i->j.seq)); + goto out; + } + } + + if (i->nr_ptrs >= ARRAY_SIZE(i->ptrs)) { + bch_err(c, "found too many copies of journal entry %llu", + le64_to_cpu(i->j.seq)); + goto out; + } + + i->ptrs[i->nr_ptrs++] = entry_ptr; out: fsck_err: return ret; @@ -405,6 +427,69 @@ fsck_err: return ret; } +static int journal_entry_validate_clock(struct bch_fs *c, + struct jset *jset, + struct jset_entry *entry, + int write) +{ + struct jset_entry_clock *clock = + container_of(entry, struct jset_entry_clock, entry); + unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64); + int ret = 0; + + if (journal_entry_err_on(bytes != sizeof(*clock), + c, "invalid journal entry clock: bad size")) { + journal_entry_null_range(entry, vstruct_next(entry)); + return ret; + } + + if (journal_entry_err_on(clock->rw > 1, + c, "invalid journal entry clock: bad rw")) { + journal_entry_null_range(entry, vstruct_next(entry)); + return ret; + } + +fsck_err: + return ret; +} + +static int journal_entry_validate_dev_usage(struct bch_fs *c, + struct jset *jset, + struct jset_entry *entry, + int write) +{ + struct jset_entry_dev_usage *u = + container_of(entry, struct 
jset_entry_dev_usage, entry); + unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64); + unsigned expected = sizeof(*u) + sizeof(u->d[0]) * 7; /* Current value of BCH_DATA_NR */ + unsigned dev; + int ret = 0; + + if (journal_entry_err_on(bytes < expected, + c, "invalid journal entry dev usage: bad size (%u < %u)", + bytes, expected)) { + journal_entry_null_range(entry, vstruct_next(entry)); + return ret; + } + + dev = le32_to_cpu(u->dev); + + if (journal_entry_err_on(!bch2_dev_exists2(c, dev), + c, "invalid journal entry dev usage: bad dev")) { + journal_entry_null_range(entry, vstruct_next(entry)); + return ret; + } + + if (journal_entry_err_on(u->pad, + c, "invalid journal entry dev usage: bad pad")) { + journal_entry_null_range(entry, vstruct_next(entry)); + return ret; + } + +fsck_err: + return ret; +} + struct jset_entry_ops { int (*validate)(struct bch_fs *, struct jset *, struct jset_entry *, int); @@ -470,7 +555,8 @@ static int jset_validate(struct bch_fs *c, version < bcachefs_metadata_version_min) || version >= bcachefs_metadata_version_max, c, "%s sector %llu seq %llu: unknown journal entry version %u", - ca->name, sector, le64_to_cpu(jset->seq), + ca ? ca->name : c->name, + sector, le64_to_cpu(jset->seq), version)) { /* don't try to continue: */ return EINVAL; @@ -482,32 +568,42 @@ static int jset_validate(struct bch_fs *c, if (journal_entry_err_on(bytes > bucket_sectors_left << 9, c, "%s sector %llu seq %llu: journal entry too big (%zu bytes)", - ca->name, sector, le64_to_cpu(jset->seq), bytes)) { + ca ? ca->name : c->name, + sector, le64_to_cpu(jset->seq), bytes)) { ret = JOURNAL_ENTRY_BAD; le32_add_cpu(&jset->u64s, -((bytes - (bucket_sectors_left << 9)) / 8)); } - if (fsck_err_on(!bch2_checksum_type_valid(c, JSET_CSUM_TYPE(jset)), c, + if (journal_entry_err_on(!bch2_checksum_type_valid(c, JSET_CSUM_TYPE(jset)), c, "%s sector %llu seq %llu: journal entry with unknown csum type %llu", - ca->name, sector, le64_to_cpu(jset->seq), + ca ? ca->name : c->name, + sector, le64_to_cpu(jset->seq), JSET_CSUM_TYPE(jset))) { ret = JOURNAL_ENTRY_BAD; - goto bad_csum_type; + goto csum_done; } + if (write) + goto csum_done; + csum = csum_vstruct(c, JSET_CSUM_TYPE(jset), journal_nonce(jset), jset); if (journal_entry_err_on(bch2_crc_cmp(csum, jset->csum), c, "%s sector %llu seq %llu: journal checksum bad", - ca->name, sector, le64_to_cpu(jset->seq))) + ca ? 
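/*
 * jset_validate() can now run with ca == NULL: that is the write-side
 * caller added just below, which validates an entry we are about to write
 * rather than one read from a device; hence "ca ? ca->name : c->name" in
 * every message, and the write path jumping to csum_done before comparing
 * a checksum that has not been computed yet:
 *
 *	static int jset_validate_for_write(struct bch_fs *c, struct jset *jset)
 *	{
 *		unsigned sectors = vstruct_sectors(jset, c->block_bits);
 *
 *		return  jset_validate(c, NULL, jset, 0, sectors, sectors, WRITE) ?:
 *			jset_validate_entries(c, jset, WRITE);
 *	}
 */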
ca->name : c->name, + sector, le64_to_cpu(jset->seq))) ret = JOURNAL_ENTRY_BAD; bch2_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset), jset->encrypted_start, vstruct_end(jset) - (void *) jset->encrypted_start); -bad_csum_type: - if (journal_entry_err_on(le64_to_cpu(jset->last_seq) > le64_to_cpu(jset->seq), c, - "invalid journal entry: last_seq > seq")) { +csum_done: + /* last_seq is ignored when JSET_NO_FLUSH is true */ + if (journal_entry_err_on(!JSET_NO_FLUSH(jset) && + le64_to_cpu(jset->last_seq) > le64_to_cpu(jset->seq), c, + "invalid journal entry: last_seq > seq (%llu > %llu)", + le64_to_cpu(jset->last_seq), + le64_to_cpu(jset->seq))) { jset->last_seq = jset->seq; return JOURNAL_ENTRY_BAD; } @@ -515,6 +611,14 @@ fsck_err: return ret; } +static int jset_validate_for_write(struct bch_fs *c, struct jset *jset) +{ + unsigned sectors = vstruct_sectors(jset, c->block_bits); + + return jset_validate(c, NULL, jset, 0, sectors, sectors, WRITE) ?: + jset_validate_entries(c, jset, WRITE); +} + struct journal_read_buf { void *data; size_t size; @@ -577,8 +681,15 @@ reread: if (bch2_dev_io_err_on(ret, ca, "journal read error: sector %llu", offset) || - bch2_meta_read_fault("journal")) - return -EIO; + bch2_meta_read_fault("journal")) { + /* + * We don't error out of the recovery process + * here, since the relevant journal entry may be + * found on a different device, and missing or + * no journal entries will be handled later + */ + return 0; + } j = buf->data; } @@ -628,7 +739,10 @@ reread: ja->bucket_seq[bucket] = le64_to_cpu(j->seq); mutex_lock(&jlist->lock); - ret = journal_entry_add(c, ca, jlist, j, ret != 0); + ret = journal_entry_add(c, ca, (struct bch_extent_ptr) { + .dev = ca->dev_idx, + .offset = offset, + }, jlist, j, ret != 0); mutex_unlock(&jlist->lock); switch (ret) { @@ -716,6 +830,25 @@ err: goto out; } +static void bch2_journal_ptrs_to_text(struct printbuf *out, struct bch_fs *c, + struct journal_replay *j) +{ + unsigned i; + + for (i = 0; i < j->nr_ptrs; i++) { + struct bch_dev *ca = c->devs[j->ptrs[i].dev]; + u64 offset; + + div64_u64_rem(j->ptrs[i].offset, ca->mi.bucket_size, &offset); + + if (i) + pr_buf(out, " "); + pr_buf(out, "%u:%llu (offset %llu)", + j->ptrs[i].dev, + (u64) j->ptrs[i].offset, offset); + } +} + int bch2_journal_read(struct bch_fs *c, struct list_head *list, u64 *blacklist_seq, u64 *start_seq) { @@ -813,6 +946,7 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list, while (seq < le64_to_cpu(i->j.seq)) { u64 missing_start, missing_end; + char buf1[200], buf2[200]; while (seq < le64_to_cpu(i->j.seq) && bch2_journal_seq_is_blacklisted(c, seq, false)) @@ -827,10 +961,23 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list, !bch2_journal_seq_is_blacklisted(c, seq, false)) seq++; + if (i->list.prev != list) { + struct printbuf out = PBUF(buf1); + struct journal_replay *p = list_prev_entry(i, list); + + bch2_journal_ptrs_to_text(&out, c, p); + pr_buf(&out, " size %llu", vstruct_sectors(&p->j, c->block_bits)); + } else + sprintf(buf1, "(none)"); + bch2_journal_ptrs_to_text(&PBUF(buf2), c, i); + missing_end = seq - 1; - fsck_err(c, "journal entries %llu-%llu missing! (replaying %llu-%llu)", + fsck_err(c, "journal entries %llu-%llu missing! 
(replaying %llu-%llu)\n" + " prev at %s\n" + " next at %s", missing_start, missing_end, - last_seq, *blacklist_seq - 1); + last_seq, *blacklist_seq - 1, + buf1, buf2); } seq++; @@ -839,7 +986,11 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list, list_for_each_entry(i, list, list) { struct jset_entry *entry; struct bkey_i *k, *_n; - struct bch_replicas_padded replicas; + struct bch_replicas_padded replicas = { + .e.data_type = BCH_DATA_journal, + .e.nr_required = 1, + }; + unsigned ptr; char buf[80]; if (i->ignore) @@ -849,13 +1000,16 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list, if (ret) goto fsck_err; + for (ptr = 0; ptr < i->nr_ptrs; ptr++) + replicas.e.devs[replicas.e.nr_devs++] = i->ptrs[ptr].dev; + + bch2_replicas_entry_sort(&replicas.e); + /* * If we're mounting in degraded mode - if we didn't read all * the devices - this is wrong: */ - bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal, i->devs); - if (!degraded && (test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) || fsck_err_on(!bch2_replicas_marked(c, &replicas.e), c, @@ -946,16 +1100,20 @@ static int journal_write_alloc(struct journal *j, struct journal_buf *w, unsigned sectors) { struct bch_fs *c = container_of(j, struct bch_fs, journal); + struct bch_devs_mask devs; struct journal_device *ja; struct bch_dev *ca; struct dev_alloc_list devs_sorted; + unsigned target = c->opts.metadata_target ?: + c->opts.foreground_target; unsigned i, replicas = 0, replicas_want = READ_ONCE(c->opts.metadata_replicas); rcu_read_lock(); +retry: + devs = target_rw_devs(c, BCH_DATA_journal, target); - devs_sorted = bch2_dev_alloc_list(c, &j->wp.stripe, - &c->rw_devs[BCH_DATA_journal]); + devs_sorted = bch2_dev_alloc_list(c, &j->wp.stripe, &devs); __journal_write_alloc(j, w, &devs_sorted, sectors, &replicas, replicas_want); @@ -987,9 +1145,17 @@ static int journal_write_alloc(struct journal *j, struct journal_buf *w, __journal_write_alloc(j, w, &devs_sorted, sectors, &replicas, replicas_want); + + if (replicas < replicas_want && target) { + /* Retry from all devices: */ + target = 0; + goto retry; + } done: rcu_read_unlock(); + BUG_ON(bkey_val_u64s(&w->key.k) > BCH_REPLICAS_MAX); + return replicas >= c->opts.metadata_replicas_required ? 
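journal_write_alloc() above now tries the configured metadata target first (falling back to foreground_target via the same ?: idiom) and only widens the device mask to every rw device when the target cannot supply replicas_want. A rough standalone sketch of that two-pass allocation, with a plain bitmask standing in for bch_devs_mask and alloc_from_mask() entirely hypothetical:

#include <stdio.h>

/* Hypothetical: try to place nr_want replicas on the devices in mask. */
static unsigned alloc_from_mask(unsigned long mask, unsigned nr_want)
{
	unsigned got = 0, dev;

	for (dev = 0; dev < sizeof(mask) * 8 && got < nr_want; dev++)
		if (mask & (1UL << dev))
			got++;	/* stand-in for actually reserving space */
	return got;
}

static int write_alloc(unsigned long target_mask, unsigned long all_mask,
		       unsigned nr_want, unsigned nr_required)
{
	unsigned long mask = target_mask ?: all_mask;	/* mirrors the ?: above */
	unsigned got;
retry:
	got = alloc_from_mask(mask, nr_want);
	if (got < nr_want && mask != all_mask) {
		mask = all_mask;	/* target exhausted: retry from all devices */
		goto retry;
	}
	return got >= nr_required ? 0 : -1;	/* -EROFS in the real code */
}

int main(void)
{
	printf("%d\n", write_alloc(0x1, 0xf, 2, 2));	/* falls back, succeeds */
	return 0;
}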
0 : -EROFS; } @@ -1050,9 +1216,13 @@ static void journal_buf_realloc(struct journal *j, struct journal_buf *buf) return; memcpy(new_buf, buf->data, buf->buf_size); - kvpfree(buf->data, buf->buf_size); - buf->data = new_buf; - buf->buf_size = new_size; + + spin_lock(&j->lock); + swap(buf->data, new_buf); + swap(buf->buf_size, new_size); + spin_unlock(&j->lock); + + kvpfree(new_buf, new_size); } static inline struct journal_buf *journal_last_unwritten_buf(struct journal *j) @@ -1069,9 +1239,7 @@ static void journal_write_done(struct closure *cl) bch2_bkey_devs(bkey_i_to_s_c(&w->key)); struct bch_replicas_padded replicas; union journal_res_state old, new; - u64 seq = le64_to_cpu(w->data->seq); - u64 last_seq = le64_to_cpu(w->data->last_seq); - u64 v; + u64 v, seq, last_seq; int err = 0; bch2_time_stats_update(j->write_time, j->write_start_time); @@ -1089,6 +1257,9 @@ static void journal_write_done(struct closure *cl) bch2_fatal_error(c); spin_lock(&j->lock); + seq = le64_to_cpu(w->data->seq); + last_seq = le64_to_cpu(w->data->last_seq); + if (seq >= j->pin.front) journal_seq_pin(j, seq)->devs = devs; @@ -1096,7 +1267,7 @@ static void journal_write_done(struct closure *cl) if (err && (!j->err_seq || seq < j->err_seq)) j->err_seq = seq; - if (!w->noflush) { + if (!JSET_NO_FLUSH(w->data)) { j->flushed_seq_ondisk = seq; j->last_seq_ondisk = last_seq; } @@ -1156,6 +1327,56 @@ static void journal_write_endio(struct bio *bio) percpu_ref_put(&ca->io_ref); } +static void do_journal_write(struct closure *cl) +{ + struct journal *j = container_of(cl, struct journal, io); + struct bch_fs *c = container_of(j, struct bch_fs, journal); + struct bch_dev *ca; + struct journal_buf *w = journal_last_unwritten_buf(j); + struct bch_extent_ptr *ptr; + struct bio *bio; + unsigned sectors = vstruct_sectors(w->data, c->block_bits); + + extent_for_each_ptr(bkey_i_to_s_extent(&w->key), ptr) { + ca = bch_dev_bkey_exists(c, ptr->dev); + if (!percpu_ref_tryget(&ca->io_ref)) { + /* XXX: fix this */ + bch_err(c, "missing device for journal write\n"); + continue; + } + + this_cpu_add(ca->io_done->sectors[WRITE][BCH_DATA_journal], + sectors); + + bio = ca->journal.bio; + bio_reset(bio); + bio_set_dev(bio, ca->disk_sb.bdev); + bio->bi_iter.bi_sector = ptr->offset; + bio->bi_end_io = journal_write_endio; + bio->bi_private = ca; + bio->bi_opf = REQ_OP_WRITE|REQ_SYNC|REQ_META; + + BUG_ON(bio->bi_iter.bi_sector == ca->prev_journal_sector); + ca->prev_journal_sector = bio->bi_iter.bi_sector; + + if (!JSET_NO_FLUSH(w->data)) + bio->bi_opf |= REQ_FUA; + if (!JSET_NO_FLUSH(w->data) && !w->separate_flush) + bio->bi_opf |= REQ_PREFLUSH; + + bch2_bio_map(bio, w->data, sectors << 9); + + trace_journal_write(bio); + closure_bio_submit(bio, cl); + + ca->journal.bucket_seq[ca->journal.cur_idx] = + le64_to_cpu(w->data->seq); + } + + continue_at(cl, journal_write_done, system_highpri_wq); + return; +} + void bch2_journal_write(struct closure *cl) { struct journal *j = container_of(cl, struct journal, io); @@ -1165,9 +1386,9 @@ void bch2_journal_write(struct closure *cl) struct jset_entry *start, *end; struct jset *jset; struct bio *bio; - struct bch_extent_ptr *ptr; + char *journal_debug_buf = NULL; bool validate_before_checksum = false; - unsigned i, sectors, bytes, u64s; + unsigned i, sectors, bytes, u64s, nr_rw_members = 0; int ret; BUG_ON(BCH_SB_CLEAN(c->disk_sb.sb)); @@ -1184,7 +1405,7 @@ void bch2_journal_write(struct closure *cl) test_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags)) { w->noflush = true; SET_JSET_NO_FLUSH(jset, true); - 
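journal_buf_realloc() above no longer frees and reassigns in place: it copies into the new buffer first, swaps pointer and size under j->lock so concurrent readers always observe a matching pair, and frees the old buffer only after unlocking. The same shape in user space, with a pthread mutex standing in for the spinlock:

#include <pthread.h>
#include <stdlib.h>
#include <string.h>

struct buf {
	pthread_mutex_t	lock;
	void		*data;
	size_t		size;
};

static int buf_realloc(struct buf *b, size_t new_size)
{
	void *new_data = malloc(new_size);
	void *old;

	if (!new_data)
		return -1;

	memcpy(new_data, b->data, b->size < new_size ? b->size : new_size);

	/* publish pointer and size together, under the lock: */
	pthread_mutex_lock(&b->lock);
	old = b->data;
	b->data = new_data;
	b->size = new_size;
	pthread_mutex_unlock(&b->lock);

	free(old);	/* old buffer freed outside the critical section */
	return 0;
}

int main(void)
{
	struct buf b = { PTHREAD_MUTEX_INITIALIZER, calloc(1, 16), 16 };

	buf_realloc(&b, 64);
	free(b.data);
	return 0;
}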
jset->last_seq = cpu_to_le64(j->last_seq_ondisk); + jset->last_seq = 0; j->nr_noflush_writes++; } else { @@ -1209,8 +1430,8 @@ void bch2_journal_write(struct closure *cl) end = bch2_btree_roots_to_journal_entries(c, jset->start, end); - end = bch2_journal_super_entries_add_common(c, end, - le64_to_cpu(jset->seq)); + bch2_journal_super_entries_add_common(c, &end, + le64_to_cpu(jset->seq)); u64s = (u64 *) end - (u64 *) start; BUG_ON(u64s > j->entry_u64s_reserved); @@ -1219,10 +1440,7 @@ void bch2_journal_write(struct closure *cl) journal_write_compact(jset); - jset->read_clock = cpu_to_le16(c->bucket_clock[READ].hand); - jset->write_clock = cpu_to_le16(c->bucket_clock[WRITE].hand); jset->magic = cpu_to_le64(jset_magic(c)); - jset->version = c->sb.version < bcachefs_metadata_version_new_versioning ? cpu_to_le32(BCH_JSET_VERSION_OLD) : cpu_to_le32(c->sb.version); @@ -1236,11 +1454,11 @@ void bch2_journal_write(struct closure *cl) if (bch2_csum_type_is_encryption(JSET_CSUM_TYPE(jset))) validate_before_checksum = true; - if (le32_to_cpu(jset->version) < bcachefs_metadata_version_max) + if (le32_to_cpu(jset->version) <= bcachefs_metadata_version_inode_btree_change) validate_before_checksum = true; if (validate_before_checksum && - jset_validate_entries(c, jset, WRITE)) + jset_validate_for_write(c, jset)) goto err; bch2_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset), @@ -1251,7 +1469,7 @@ void bch2_journal_write(struct closure *cl) journal_nonce(jset), jset); if (!validate_before_checksum && - jset_validate_entries(c, jset, WRITE)) + jset_validate_for_write(c, jset)) goto err; sectors = vstruct_sectors(jset, c->block_bits); @@ -1270,6 +1488,12 @@ retry_alloc: goto retry_alloc; } + if (ret) { + journal_debug_buf = kmalloc(4096, GFP_ATOMIC); + if (journal_debug_buf) + __bch2_journal_debug_to_text(&_PBUF(journal_debug_buf, 4096), j); + } + /* * write is allocated, no longer need to account for it in * bch2_journal_space_available(): @@ -1284,7 +1508,9 @@ retry_alloc: spin_unlock(&j->lock); if (ret) { - bch_err(c, "Unable to allocate journal write"); + bch_err(c, "Unable to allocate journal write:\n%s", + journal_debug_buf); + kfree(journal_debug_buf); bch2_fatal_error(c); continue_at(cl, journal_write_done, system_highpri_wq); return; @@ -1297,49 +1523,30 @@ retry_alloc: if (c->opts.nochanges) goto no_io; - extent_for_each_ptr(bkey_i_to_s_extent(&w->key), ptr) { - ca = bch_dev_bkey_exists(c, ptr->dev); - if (!percpu_ref_tryget(&ca->io_ref)) { - /* XXX: fix this */ - bch_err(c, "missing device for journal write\n"); - continue; - } + for_each_rw_member(ca, c, i) + nr_rw_members++; - this_cpu_add(ca->io_done->sectors[WRITE][BCH_DATA_journal], - sectors); + if (nr_rw_members > 1) + w->separate_flush = true; - bio = ca->journal.bio; - bio_reset(bio); - bio_set_dev(bio, ca->disk_sb.bdev); - bio->bi_iter.bi_sector = ptr->offset; - bio->bi_end_io = journal_write_endio; - bio->bi_private = ca; - bio->bi_opf = REQ_OP_WRITE|REQ_SYNC|REQ_META; - if (!JSET_NO_FLUSH(jset)) - bio->bi_opf |= REQ_PREFLUSH|REQ_FUA; - bch2_bio_map(bio, jset, sectors << 9); + if (!JSET_NO_FLUSH(jset) && w->separate_flush) { + for_each_rw_member(ca, c, i) { + percpu_ref_get(&ca->io_ref); - trace_journal_write(bio); - closure_bio_submit(bio, cl); - - ca->journal.bucket_seq[ca->journal.cur_idx] = le64_to_cpu(jset->seq); + bio = ca->journal.bio; + bio_reset(bio); + bio_set_dev(bio, ca->disk_sb.bdev); + bio->bi_opf = REQ_OP_FLUSH; + bio->bi_end_io = journal_write_endio; + bio->bi_private = ca; + closure_bio_submit(bio, cl); + } } - if 
(!JSET_NO_FLUSH(jset)) { - for_each_rw_member(ca, c, i) - if (journal_flushes_device(ca) && - !bch2_bkey_has_device(bkey_i_to_s_c(&w->key), i)) { - percpu_ref_get(&ca->io_ref); - - bio = ca->journal.bio; - bio_reset(bio); - bio_set_dev(bio, ca->disk_sb.bdev); - bio->bi_opf = REQ_OP_FLUSH; - bio->bi_end_io = journal_write_endio; - bio->bi_private = ca; - closure_bio_submit(bio, cl); - } - } + bch2_bucket_seq_cleanup(c); + + continue_at(cl, do_journal_write, system_highpri_wq); + return; no_io: bch2_bucket_seq_cleanup(c); diff --git a/fs/bcachefs/journal_io.h b/fs/bcachefs/journal_io.h index 6b4c80968f52..a4931ab93a68 100644 --- a/fs/bcachefs/journal_io.h +++ b/fs/bcachefs/journal_io.h @@ -8,7 +8,9 @@ */ struct journal_replay { struct list_head list; - struct bch_devs_list devs; + struct bch_extent_ptr ptrs[BCH_REPLICAS_MAX]; + unsigned nr_ptrs; + /* checksum error, but we may want to try using it anyways: */ bool bad; bool ignore; diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c index 61bcd77190a7..bbf8e5ad8aa0 100644 --- a/fs/bcachefs/journal_reclaim.c +++ b/fs/bcachefs/journal_reclaim.c @@ -11,7 +11,6 @@ #include <linux/kthread.h> #include <linux/sched/mm.h> -#include <linux/sched/task.h> #include <trace/events/bcachefs.h> /* Free space calculations: */ @@ -385,12 +384,22 @@ void bch2_journal_pin_set(struct journal *j, u64 seq, struct journal_entry_pin_list *pin_list; spin_lock(&j->lock); + + if (seq < journal_last_seq(j)) { + /* + * bch2_journal_pin_copy() raced with bch2_journal_pin_drop() on + * the src pin - with the pin dropped, the entry to pin might no + * longer exist, but that means there's no longer anything to + * copy and we can bail out here: + */ + spin_unlock(&j->lock); + return; + } + pin_list = journal_seq_pin(j, seq); __journal_pin_drop(j, pin); - BUG_ON(!atomic_read(&pin_list->count) && seq == journal_last_seq(j)); - atomic_inc(&pin_list->count); pin->seq = seq; pin->flush = flush_fn; @@ -682,8 +691,10 @@ int bch2_journal_reclaim_start(struct journal *j) p = kthread_create(bch2_journal_reclaim_thread, j, "bch-reclaim/%s", c->name); - if (IS_ERR(p)) + if (IS_ERR(p)) { + bch_err(c, "error creating journal reclaim thread: %li", PTR_ERR(p)); return PTR_ERR(p); + } get_task_struct(p); j->reclaim_thread = p; diff --git a/fs/bcachefs/journal_reclaim.h b/fs/bcachefs/journal_reclaim.h index f02caa3d49ea..adf1f5c981cd 100644 --- a/fs/bcachefs/journal_reclaim.h +++ b/fs/bcachefs/journal_reclaim.h @@ -53,8 +53,11 @@ static inline void bch2_journal_pin_copy(struct journal *j, struct journal_entry_pin *src, journal_pin_flush_fn flush_fn) { - if (journal_pin_active(src)) - bch2_journal_pin_add(j, src->seq, dst, flush_fn); + /* Guard against racing with journal_pin_drop(src): */ + u64 seq = READ_ONCE(src->seq); + + if (seq) + bch2_journal_pin_add(j, seq, dst, flush_fn); } static inline void bch2_journal_pin_update(struct journal *j, u64 seq, diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h index 67ee47eb17a7..d17a1ff82a18 100644 --- a/fs/bcachefs/journal_types.h +++ b/fs/bcachefs/journal_types.h @@ -20,7 +20,7 @@ struct journal_buf { struct jset *data; - BKEY_PADDED(key); + __BKEY_PADDED(key, BCH_REPLICAS_MAX); struct closure_waitlist wait; @@ -31,6 +31,7 @@ struct journal_buf { unsigned u64s_reserved; bool noflush; /* write has already been kicked off, and was noflush */ bool must_flush; /* something wants a flush */ + bool separate_flush; /* bloom filter: */ unsigned long has_inode[1024 / sizeof(unsigned long)]; }; diff --git 
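Taken together, the READ_ONCE() added to bch2_journal_pin_copy() and the seq < journal_last_seq() recheck added to bch2_journal_pin_set() are a snapshot-then-revalidate pattern: read the racy field once, then confirm under the lock that it still refers to a live entry, and bail out if not. A minimal sketch of that shape (types invented, a mutex in place of the spinlock):

#include <pthread.h>
#include <stdbool.h>
#include <stdint.h>

struct journal {
	pthread_mutex_t	lock;
	uint64_t	last_seq;	/* pins below this no longer exist */
};

/* Returns false if seq already fell off the front of the journal. */
static bool pin_set(struct journal *j, uint64_t seq)
{
	bool ok;

	pthread_mutex_lock(&j->lock);
	ok = seq >= j->last_seq;	/* revalidate under the lock */
	if (ok) {
		/* ...take the pin... */
	}
	pthread_mutex_unlock(&j->lock);
	return ok;
}

static void pin_copy(struct journal *j, const uint64_t *src_seq)
{
	/* one racy snapshot; 0 means the src pin was already dropped */
	uint64_t seq = __atomic_load_n(src_seq, __ATOMIC_RELAXED);

	if (seq)
		pin_set(j, seq);
}

int main(void)
{
	struct journal j = { PTHREAD_MUTEX_INITIALIZER, 10 };
	uint64_t src_seq = 12;

	pin_copy(&j, &src_seq);	/* 12 >= last_seq 10: pin taken */
	return 0;
}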
a/fs/bcachefs/migrate.c b/fs/bcachefs/migrate.c index 96c8690adc5b..6241ff0c129f 100644 --- a/fs/bcachefs/migrate.c +++ b/fs/bcachefs/migrate.c @@ -4,7 +4,7 @@ */ #include "bcachefs.h" -#include "bkey_on_stack.h" +#include "bkey_buf.h" #include "btree_update.h" #include "btree_update_interior.h" #include "buckets.h" @@ -41,10 +41,10 @@ static int __bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags struct btree_trans trans; struct btree_iter *iter; struct bkey_s_c k; - struct bkey_on_stack sk; + struct bkey_buf sk; int ret = 0; - bkey_on_stack_init(&sk); + bch2_bkey_buf_init(&sk); bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); iter = bch2_trans_get_iter(&trans, btree_id, POS_MIN, @@ -57,7 +57,7 @@ static int __bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags continue; } - bkey_on_stack_reassemble(&sk, c, k); + bch2_bkey_buf_reassemble(&sk, c, k); ret = drop_dev_ptrs(c, bkey_i_to_s(sk.k), dev_idx, flags, false); @@ -90,7 +90,7 @@ static int __bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags } ret = bch2_trans_exit(&trans) ?: ret; - bkey_on_stack_exit(&sk, c); + bch2_bkey_buf_exit(&sk, c); BUG_ON(ret == -EINTR); @@ -109,6 +109,7 @@ static int bch2_dev_metadata_drop(struct bch_fs *c, unsigned dev_idx, int flags) struct btree_iter *iter; struct closure cl; struct btree *b; + struct bkey_buf k; unsigned id; int ret; @@ -116,28 +117,28 @@ static int bch2_dev_metadata_drop(struct bch_fs *c, unsigned dev_idx, int flags) if (flags & BCH_FORCE_IF_METADATA_LOST) return -EINVAL; + bch2_bkey_buf_init(&k); bch2_trans_init(&trans, c, 0, 0); closure_init_stack(&cl); for (id = 0; id < BTREE_ID_NR; id++) { for_each_btree_node(&trans, iter, id, POS_MIN, BTREE_ITER_PREFETCH, b) { - __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX) tmp; retry: if (!bch2_bkey_has_device(bkey_i_to_s_c(&b->key), dev_idx)) continue; - bkey_copy(&tmp.k, &b->key); + bch2_bkey_buf_copy(&k, c, &b->key); - ret = drop_dev_ptrs(c, bkey_i_to_s(&tmp.k), + ret = drop_dev_ptrs(c, bkey_i_to_s(k.k), dev_idx, flags, true); if (ret) { bch_err(c, "Cannot drop device without losing data"); goto err; } - ret = bch2_btree_node_update_key(c, iter, b, &tmp.k); + ret = bch2_btree_node_update_key(c, iter, b, k.k); if (ret == -EINTR) { b = bch2_btree_iter_peek_node(iter); goto retry; @@ -157,6 +158,7 @@ retry: ret = 0; err: ret = bch2_trans_exit(&trans) ?: ret; + bch2_bkey_buf_exit(&k, c); BUG_ON(ret == -EINTR); diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index fbeaa3b67326..75b7046d6042 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -2,7 +2,7 @@ #include "bcachefs.h" #include "alloc_foreground.h" -#include "bkey_on_stack.h" +#include "bkey_buf.h" #include "btree_gc.h" #include "btree_update.h" #include "btree_update_interior.h" @@ -61,8 +61,13 @@ static int bch2_migrate_index_update(struct bch_write_op *op) struct migrate_write *m = container_of(op, struct migrate_write, op); struct keylist *keys = &op->insert_keys; + struct bkey_buf _new, _insert; int ret = 0; + bch2_bkey_buf_init(&_new); + bch2_bkey_buf_init(&_insert); + bch2_bkey_buf_realloc(&_insert, c, U8_MAX); + bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); iter = bch2_trans_get_iter(&trans, m->btree_id, @@ -73,21 +78,18 @@ static int bch2_migrate_index_update(struct bch_write_op *op) struct bkey_s_c k; struct bkey_i *insert; struct bkey_i_extent *new; - BKEY_PADDED(k) _new, _insert; const union bch_extent_entry *entry; struct extent_ptr_decoded p; bool did_work = false; - int nr; + bool extending = false, 
should_check_enospc; + s64 i_sectors_delta = 0, disk_sectors_delta = 0; bch2_trans_reset(&trans, 0); k = bch2_btree_iter_peek_slot(iter); ret = bkey_err(k); - if (ret) { - if (ret == -EINTR) - continue; - break; - } + if (ret) + goto err; new = bkey_i_to_extent(bch2_keylist_front(keys)); @@ -95,11 +97,11 @@ static int bch2_migrate_index_update(struct bch_write_op *op) !bch2_bkey_matches_ptr(c, k, m->ptr, m->offset)) goto nomatch; - bkey_reassemble(&_insert.k, k); - insert = &_insert.k; + bkey_reassemble(_insert.k, k); + insert = _insert.k; - bkey_copy(&_new.k, bch2_keylist_front(keys)); - new = bkey_i_to_extent(&_new.k); + bch2_bkey_buf_copy(&_new, c, bch2_keylist_front(keys)); + new = bkey_i_to_extent(_new.k); bch2_cut_front(iter->pos, &new->k_i); bch2_cut_front(iter->pos, insert); @@ -144,23 +146,21 @@ static int bch2_migrate_index_update(struct bch_write_op *op) op->opts.background_target, op->opts.data_replicas); - /* - * If we're not fully overwriting @k, and it's compressed, we - * need a reservation for all the pointers in @insert - */ - nr = bch2_bkey_nr_ptrs_allocated(bkey_i_to_s_c(insert)) - - m->nr_ptrs_reserved; + ret = bch2_sum_sector_overwrites(&trans, iter, insert, + &extending, + &should_check_enospc, + &i_sectors_delta, + &disk_sectors_delta); + if (ret) + goto err; - if (insert->k.size < k.k->size && - bch2_bkey_sectors_compressed(k) && - nr > 0) { + if (disk_sectors_delta > (s64) op->res.sectors) { ret = bch2_disk_reservation_add(c, &op->res, - keylist_sectors(keys) * nr, 0); + disk_sectors_delta - op->res.sectors, + !should_check_enospc + ? BCH_DISK_RESERVATION_NOFAIL : 0); if (ret) goto out; - - m->nr_ptrs_reserved += nr; - goto next; } bch2_trans_update(&trans, iter, insert, 0); @@ -168,8 +168,8 @@ static int bch2_migrate_index_update(struct bch_write_op *op) ret = bch2_trans_commit(&trans, &op->res, op_journal_seq(op), BTREE_INSERT_NOFAIL| - BTREE_INSERT_USE_RESERVE| m->data_opts.btree_insert_flags); +err: if (!ret) atomic_long_inc(&c->extent_migrate_done); if (ret == -EINTR) @@ -197,6 +197,8 @@ nomatch: } out: bch2_trans_exit(&trans); + bch2_bkey_buf_exit(&_insert, c); + bch2_bkey_buf_exit(&_new, c); BUG_ON(ret == -EINTR); return ret; } @@ -326,12 +328,12 @@ static void move_free(struct closure *cl) { struct moving_io *io = container_of(cl, struct moving_io, cl); struct moving_context *ctxt = io->write.ctxt; + struct bvec_iter_all iter; struct bio_vec *bv; - unsigned i; bch2_disk_reservation_put(io->write.op.c, &io->write.op.res); - bio_for_each_segment_all(bv, &io->write.op.wbio.bio, i) + bio_for_each_segment_all(bv, &io->write.op.wbio.bio, iter) if (bv->bv_page) __free_page(bv->bv_page); @@ -516,7 +518,7 @@ static int __bch2_move_data(struct bch_fs *c, { bool kthread = (current->flags & PF_KTHREAD) != 0; struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts); - struct bkey_on_stack sk; + struct bkey_buf sk; struct btree_trans trans; struct btree_iter *iter; struct bkey_s_c k; @@ -525,12 +527,12 @@ static int __bch2_move_data(struct bch_fs *c, u64 delay, cur_inum = U64_MAX; int ret = 0, ret2; - bkey_on_stack_init(&sk); + bch2_bkey_buf_init(&sk); bch2_trans_init(&trans, c, 0, 0); stats->data_type = BCH_DATA_user; stats->btree_id = btree_id; - stats->pos = POS_MIN; + stats->pos = start; iter = bch2_trans_get_iter(&trans, btree_id, start, BTREE_ITER_PREFETCH); @@ -605,13 +607,19 @@ peek: } /* unlock before doing IO: */ - bkey_on_stack_reassemble(&sk, c, k); + bch2_bkey_buf_reassemble(&sk, c, k); k = bkey_i_to_s_c(sk.k); bch2_trans_unlock(&trans); ret2 = 
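The rewritten path above asks bch2_sum_sector_overwrites() for the net on-disk delta of the pending update and then reserves only the shortfall beyond what op->res already holds, replacing the old per-pointer estimate. The top-up arithmetic in isolation — reservation_add() here is a toy that always succeeds, where the real call can fail with -ENOSPC:

#include <stdint.h>
#include <stdio.h>

struct reservation { int64_t sectors; };

/* Hypothetical pool grant; always succeeds in this sketch. */
static int reservation_add(struct reservation *res, int64_t sectors)
{
	res->sectors += sectors;
	return 0;
}

static int maybe_top_up(struct reservation *res, int64_t disk_sectors_delta)
{
	if (disk_sectors_delta <= res->sectors)
		return 0;	/* already covered */

	/* reserve only the difference, not the whole delta: */
	return reservation_add(res, disk_sectors_delta - res->sectors);
}

int main(void)
{
	struct reservation res = { .sectors = 8 };

	maybe_top_up(&res, 20);	/* adds 12 */
	printf("%lld\n", (long long) res.sectors);	/* 20 */
	return 0;
}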
bch2_move_extent(&trans, ctxt, wp, io_opts, btree_id, k, data_cmd, data_opts); if (ret2) { + if (ret2 == -EINTR) { + bch2_trans_reset(&trans, 0); + bch2_trans_cond_resched(&trans); + continue; + } + if (ret2 == -ENOMEM) { /* memory allocation failure, wait for some IO to finish */ bch2_move_ctxt_wait_for_io(ctxt); @@ -633,20 +641,21 @@ next_nondata: } out: ret = bch2_trans_exit(&trans) ?: ret; - bkey_on_stack_exit(&sk, c); + bch2_bkey_buf_exit(&sk, c); return ret; } int bch2_move_data(struct bch_fs *c, + enum btree_id start_btree_id, struct bpos start_pos, + enum btree_id end_btree_id, struct bpos end_pos, struct bch_ratelimit *rate, struct write_point_specifier wp, - struct bpos start, - struct bpos end, move_pred_fn pred, void *arg, struct bch_move_stats *stats) { struct moving_context ctxt = { .stats = stats }; + enum btree_id id; int ret; closure_init_stack(&ctxt.cl); @@ -655,10 +664,23 @@ int bch2_move_data(struct bch_fs *c, stats->data_type = BCH_DATA_user; - ret = __bch2_move_data(c, &ctxt, rate, wp, start, end, - pred, arg, stats, BTREE_ID_EXTENTS) ?: - __bch2_move_data(c, &ctxt, rate, wp, start, end, - pred, arg, stats, BTREE_ID_REFLINK); + for (id = start_btree_id; + id <= min_t(unsigned, end_btree_id, BTREE_ID_NR - 1); + id++) { + stats->btree_id = id; + + if (id != BTREE_ID_EXTENTS && + id != BTREE_ID_REFLINK) + continue; + + ret = __bch2_move_data(c, &ctxt, rate, wp, + id == start_btree_id ? start_pos : POS_MIN, + id == end_btree_id ? end_pos : POS_MAX, + pred, arg, stats, id); + if (ret) + break; + } + move_ctxt_wait_event(&ctxt, list_empty(&ctxt.reads)); closure_sync(&ctxt.cl); @@ -672,16 +694,22 @@ int bch2_move_data(struct bch_fs *c, return ret; } +typedef enum data_cmd (*move_btree_pred)(struct bch_fs *, void *, + struct btree *, struct bch_io_opts *, + struct data_opts *); + static int bch2_move_btree(struct bch_fs *c, - move_pred_fn pred, - void *arg, + enum btree_id start_btree_id, struct bpos start_pos, + enum btree_id end_btree_id, struct bpos end_pos, + move_btree_pred pred, void *arg, struct bch_move_stats *stats) { + bool kthread = (current->flags & PF_KTHREAD) != 0; struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts); struct btree_trans trans; struct btree_iter *iter; struct btree *b; - unsigned id; + enum btree_id id; struct data_opts data_opts; enum data_cmd cmd; int ret = 0; @@ -690,16 +718,24 @@ static int bch2_move_btree(struct bch_fs *c, stats->data_type = BCH_DATA_btree; - for (id = 0; id < BTREE_ID_NR; id++) { + for (id = start_btree_id; + id <= min_t(unsigned, end_btree_id, BTREE_ID_NR - 1); + id++) { stats->btree_id = id; - for_each_btree_node(&trans, iter, id, POS_MIN, + for_each_btree_node(&trans, iter, id, + id == start_btree_id ? 
start_pos : POS_MIN, BTREE_ITER_PREFETCH, b) { + if (kthread && kthread_should_stop()) + goto out; + + if ((cmp_int(id, end_btree_id) ?: + bkey_cmp(b->key.k.p, end_pos)) > 0) + break; + stats->pos = iter->pos; - switch ((cmd = pred(c, arg, - bkey_i_to_s_c(&b->key), - &io_opts, &data_opts))) { + switch ((cmd = pred(c, arg, b, &io_opts, &data_opts))) { case DATA_SKIP: goto next; case DATA_SCRUB: @@ -719,9 +755,12 @@ next: ret = bch2_trans_iter_free(&trans, iter) ?: ret; } - +out: bch2_trans_exit(&trans); + if (ret) + bch_err(c, "error %i in bch2_move_btree", ret); + return ret; } @@ -778,6 +817,83 @@ static enum data_cmd migrate_pred(struct bch_fs *c, void *arg, return DATA_REWRITE; } +static enum data_cmd rereplicate_btree_pred(struct bch_fs *c, void *arg, + struct btree *b, + struct bch_io_opts *io_opts, + struct data_opts *data_opts) +{ + return rereplicate_pred(c, arg, bkey_i_to_s_c(&b->key), io_opts, data_opts); +} + +static enum data_cmd migrate_btree_pred(struct bch_fs *c, void *arg, + struct btree *b, + struct bch_io_opts *io_opts, + struct data_opts *data_opts) +{ + return migrate_pred(c, arg, bkey_i_to_s_c(&b->key), io_opts, data_opts); +} + +static bool bformat_needs_redo(struct bkey_format *f) +{ + unsigned i; + + for (i = 0; i < f->nr_fields; i++) { + unsigned unpacked_bits = bch2_bkey_format_current.bits_per_field[i]; + u64 unpacked_mask = ~((~0ULL << 1) << (unpacked_bits - 1)); + u64 field_offset = le64_to_cpu(f->field_offset[i]); + + if (f->bits_per_field[i] > unpacked_bits) + return true; + + if ((f->bits_per_field[i] == unpacked_bits) && field_offset) + return true; + + if (((field_offset + ((1ULL << f->bits_per_field[i]) - 1)) & + unpacked_mask) < + field_offset) + return true; + } + + return false; +} + +static enum data_cmd rewrite_old_nodes_pred(struct bch_fs *c, void *arg, + struct btree *b, + struct bch_io_opts *io_opts, + struct data_opts *data_opts) +{ + if (b->version_ondisk != c->sb.version || + btree_node_need_rewrite(b) || + bformat_needs_redo(&b->format)) { + data_opts->target = 0; + data_opts->nr_replicas = 1; + data_opts->btree_insert_flags = 0; + return DATA_REWRITE; + } + + return DATA_SKIP; +} + +int bch2_scan_old_btree_nodes(struct bch_fs *c, struct bch_move_stats *stats) +{ + int ret; + + ret = bch2_move_btree(c, + 0, POS_MIN, + BTREE_ID_NR, POS_MAX, + rewrite_old_nodes_pred, c, stats); + if (!ret) { + mutex_lock(&c->sb_lock); + c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_extents_above_btree_updates_done; + c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_bformat_overflow_done; + c->disk_sb.sb->version_min = c->disk_sb.sb->version; + bch2_write_super(c); + mutex_unlock(&c->sb_lock); + } + + return ret; +} + int bch2_data_job(struct bch_fs *c, struct bch_move_stats *stats, struct bch_ioctl_data op) @@ -789,17 +905,20 @@ int bch2_data_job(struct bch_fs *c, stats->data_type = BCH_DATA_journal; ret = bch2_journal_flush_device_pins(&c->journal, -1); - ret = bch2_move_btree(c, rereplicate_pred, c, stats) ?: ret; + ret = bch2_move_btree(c, + op.start_btree, op.start_pos, + op.end_btree, op.end_pos, + rereplicate_btree_pred, c, stats) ?: ret; closure_wait_event(&c->btree_interior_update_wait, !bch2_btree_interior_updates_nr_pending(c)); ret = bch2_replicas_gc2(c) ?: ret; - ret = bch2_move_data(c, NULL, - writepoint_hashed((unsigned long) current), - op.start, - op.end, + ret = bch2_move_data(c, + op.start_btree, op.start_pos, + op.end_btree, op.end_pos, + NULL, writepoint_hashed((unsigned long) current), rereplicate_pred, c, stats) ?: ret; ret = 
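bformat_needs_redo() above decides whether a node's packed key format predates the bformat-overflow fix: a field needs rewriting if its packed size exceeds the unpacked size, if equal sizes come with a redundant offset, or if offset plus the largest packable value wraps past the unpacked field's mask. The same test for a single field, standalone (assumes 1 <= unpacked_bits <= 64):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static bool field_overflows(unsigned packed_bits, unsigned unpacked_bits,
			    uint64_t field_offset)
{
	/* mask of unpacked_bits ones, written so unpacked_bits == 64 is UB-free */
	uint64_t unpacked_mask = ~((~0ULL << 1) << (unpacked_bits - 1));
	uint64_t max_packed;

	if (packed_bits > unpacked_bits)
		return true;
	if (packed_bits == unpacked_bits)
		return field_offset != 0;	/* offset is redundant here */

	max_packed = (1ULL << packed_bits) - 1;	/* packed_bits < 64 here */

	/* offset + biggest packable value wraps the unpacked field: */
	return ((field_offset + max_packed) & unpacked_mask) < field_offset;
}

int main(void)
{
	printf("%d\n", field_overflows(32, 64, 0));	/* 0 */
	printf("%d\n", field_overflows(32, 32, 1));	/* 1 */
	printf("%d\n", field_overflows(8, 16, 0xfff0));	/* 1: wraps */
	return 0;
}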
bch2_replicas_gc2(c) ?: ret; break; @@ -810,16 +929,22 @@ int bch2_data_job(struct bch_fs *c, stats->data_type = BCH_DATA_journal; ret = bch2_journal_flush_device_pins(&c->journal, op.migrate.dev); - ret = bch2_move_btree(c, migrate_pred, &op, stats) ?: ret; + ret = bch2_move_btree(c, + op.start_btree, op.start_pos, + op.end_btree, op.end_pos, + migrate_btree_pred, &op, stats) ?: ret; ret = bch2_replicas_gc2(c) ?: ret; - ret = bch2_move_data(c, NULL, - writepoint_hashed((unsigned long) current), - op.start, - op.end, + ret = bch2_move_data(c, + op.start_btree, op.start_pos, + op.end_btree, op.end_pos, + NULL, writepoint_hashed((unsigned long) current), migrate_pred, &op, stats) ?: ret; ret = bch2_replicas_gc2(c) ?: ret; break; + case BCH_DATA_OP_REWRITE_OLD_NODES: + ret = bch2_scan_old_btree_nodes(c, stats); + break; default: ret = -EINVAL; } diff --git a/fs/bcachefs/move.h b/fs/bcachefs/move.h index b04bc669226d..5076153689d1 100644 --- a/fs/bcachefs/move.h +++ b/fs/bcachefs/move.h @@ -52,9 +52,13 @@ typedef enum data_cmd (*move_pred_fn)(struct bch_fs *, void *, struct bkey_s_c, struct bch_io_opts *, struct data_opts *); -int bch2_move_data(struct bch_fs *, struct bch_ratelimit *, +int bch2_scan_old_btree_nodes(struct bch_fs *, struct bch_move_stats *); + +int bch2_move_data(struct bch_fs *, + enum btree_id, struct bpos, + enum btree_id, struct bpos, + struct bch_ratelimit *, struct write_point_specifier, - struct bpos, struct bpos, move_pred_fn, void *, struct bch_move_stats *); diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c index 2c5daed58aca..03668e481f7a 100644 --- a/fs/bcachefs/movinggc.c +++ b/fs/bcachefs/movinggc.c @@ -92,11 +92,8 @@ static enum data_cmd copygc_pred(struct bch_fs *c, void *arg, data_opts->btree_insert_flags = BTREE_INSERT_USE_RESERVE; data_opts->rewrite_dev = p.ptr.dev; - if (p.has_ec) { - struct stripe *m = genradix_ptr(&c->stripes[0], p.ec.idx); - - data_opts->nr_replicas += m->nr_redundant; - } + if (p.has_ec) + data_opts->nr_replicas += p.ec.redundancy; return DATA_REWRITE; } @@ -179,12 +176,12 @@ static int bch2_copygc(struct bch_fs *c) bucket_sectors_used(m) >= ca->mi.bucket_size) continue; - WARN_ON(m.stripe && !g->ec_redundancy); + WARN_ON(m.stripe && !g->stripe_redundancy); e = (struct copygc_heap_entry) { .dev = dev_idx, .gen = m.gen, - .replicas = 1 + g->ec_redundancy, + .replicas = 1 + g->stripe_redundancy, .fragmentation = bucket_sectors_used(m) * (1U << 15) / ca->mi.bucket_size, .sectors = bucket_sectors_used(m), @@ -200,6 +197,11 @@ static int bch2_copygc(struct bch_fs *c) return -1; } + /* + * Our btree node allocations also come out of RESERVE_MOVINGGC: + */ + sectors_to_move = (sectors_to_move * 3) / 4; + for (i = h->data; i < h->data + h->used; i++) sectors_to_move += i->sectors * i->replicas; @@ -217,9 +219,11 @@ static int bch2_copygc(struct bch_fs *c) sizeof(h->data[0]), bucket_offset_cmp, NULL); - ret = bch2_move_data(c, &c->copygc_pd.rate, + ret = bch2_move_data(c, + 0, POS_MIN, + BTREE_ID_NR, POS_MAX, + &c->copygc_pd.rate, writepoint_ptr(&c->copygc_write_point), - POS_MIN, POS_MAX, copygc_pred, NULL, &move_stats); @@ -286,7 +290,7 @@ unsigned long bch2_copygc_wait_amount(struct bch_fs *c) fragmented_allowed += ((__dev_buckets_available(ca, usage) * ca->mi.bucket_size) >> 1); - fragmented += usage.sectors_fragmented; + fragmented += usage.d[BCH_DATA_user].fragmented; } return max_t(s64, 0, fragmented_allowed - fragmented); @@ -296,7 +300,7 @@ static int bch2_copygc_thread(void *arg) { struct bch_fs *c = arg; struct io_clock 
*clock = &c->io_clock[WRITE]; - unsigned long last, wait; + u64 last, wait; set_freezable(); @@ -304,7 +308,7 @@ static int bch2_copygc_thread(void *arg) if (kthread_wait_freezable(c->copy_gc_enabled)) break; - last = atomic_long_read(&clock->now); + last = atomic64_read(&clock->now); wait = bch2_copygc_wait_amount(c); if (wait > clock->max_slop) { @@ -346,8 +350,10 @@ int bch2_copygc_start(struct bch_fs *c) return -ENOMEM; t = kthread_create(bch2_copygc_thread, c, "bch-copygc/%s", c->name); - if (IS_ERR(t)) + if (IS_ERR(t)) { + bch_err(c, "error creating copygc thread: %li", PTR_ERR(t)); return PTR_ERR(t); + } get_task_struct(t); diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c index 97a36ac0beea..d53b6dccd161 100644 --- a/fs/bcachefs/opts.c +++ b/fs/bcachefs/opts.c @@ -23,6 +23,13 @@ const char * const bch2_sb_features[] = { NULL }; +const char * const bch2_sb_compat[] = { +#define x(f, n) #f, + BCH_SB_COMPAT() +#undef x + NULL +}; + const char * const bch2_csum_opts[] = { "none", "crc32c", diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h index 710a7ee67039..7ce2b3adb8d7 100644 --- a/fs/bcachefs/opts.h +++ b/fs/bcachefs/opts.h @@ -10,6 +10,7 @@ extern const char * const bch2_error_actions[]; extern const char * const bch2_sb_features[]; +extern const char * const bch2_sb_compat[]; extern const char * const bch2_csum_opts[]; extern const char * const bch2_compression_opts[]; extern const char * const bch2_str_hash_types[]; @@ -136,6 +137,11 @@ enum opt_type { OPT_STR(bch2_str_hash_types), \ BCH_SB_STR_HASH_TYPE, BCH_STR_HASH_OPT_SIPHASH, \ NULL, "Hash function for directory entries and xattrs")\ + x(metadata_target, u16, \ + OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE, \ + OPT_FN(bch2_opt_target), \ + BCH_SB_METADATA_TARGET, 0, \ + "(target)", "Device or disk group for metadata writes") \ x(foreground_target, u16, \ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE, \ OPT_FN(bch2_opt_target), \ @@ -217,6 +223,11 @@ enum opt_type { OPT_BOOL(), \ NO_SB_OPT, false, \ NULL, "Allow mounting in degraded mode") \ + x(very_degraded, u8, \ + OPT_MOUNT, \ + OPT_BOOL(), \ + NO_SB_OPT, false, \ + NULL, "Allow mounting when data will be missing") \ x(discard, u8, \ OPT_MOUNT|OPT_DEVICE, \ OPT_BOOL(), \ diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c index c3373c48fa81..aa9bbdbfa65e 100644 --- a/fs/bcachefs/rebalance.c +++ b/fs/bcachefs/rebalance.c @@ -169,12 +169,12 @@ static int bch2_rebalance_thread(void *arg) unsigned long start, prev_start; unsigned long prev_run_time, prev_run_cputime; unsigned long cputime, prev_cputime; - unsigned long io_start; + u64 io_start; long throttle; set_freezable(); - io_start = atomic_long_read(&clock->now); + io_start = atomic64_read(&clock->now); p = rebalance_work(c); prev_start = jiffies; prev_cputime = curr_cputime(); @@ -210,7 +210,7 @@ static int bch2_rebalance_thread(void *arg) (20 - w.dev_most_full_percent), 50); - if (atomic_long_read(&clock->now) + clock->max_slop < + if (atomic64_read(&clock->now) + clock->max_slop < r->throttled_until_iotime) { r->throttled_until_cputime = start + throttle; r->state = REBALANCE_THROTTLED; @@ -229,7 +229,7 @@ static int bch2_rebalance_thread(void *arg) max(p.dev_most_full_percent, 1U) / max(w.dev_most_full_percent, 1U)); - io_start = atomic_long_read(&clock->now); + io_start = atomic64_read(&clock->now); p = w; prev_start = start; prev_cputime = cputime; @@ -239,10 +239,11 @@ static int bch2_rebalance_thread(void *arg) rebalance_work_reset(c); bch2_move_data(c, + 0, POS_MIN, + BTREE_ID_NR, POS_MAX, /* 
ratelimiting disabled for now */ NULL, /* &r->pd.rate, */ writepoint_ptr(&c->rebalance_write_point), - POS_MIN, POS_MAX, rebalance_pred, NULL, &r->move_stats); } @@ -274,7 +275,7 @@ void bch2_rebalance_work_to_text(struct printbuf *out, struct bch_fs *c) case REBALANCE_THROTTLED: bch2_hprint(&PBUF(h1), (r->throttled_until_iotime - - atomic_long_read(&c->io_clock[WRITE].now)) << 9); + atomic64_read(&c->io_clock[WRITE].now)) << 9); pr_buf(out, "throttled for %lu sec or %s io\n", (r->throttled_until_cputime - jiffies) / HZ, h1); @@ -311,12 +312,17 @@ int bch2_rebalance_start(struct bch_fs *c) { struct task_struct *p; + if (c->rebalance.thread) + return 0; + if (c->opts.nochanges) return 0; p = kthread_create(bch2_rebalance_thread, c, "bch-rebalance/%s", c->name); - if (IS_ERR(p)) + if (IS_ERR(p)) { + bch_err(c, "error creating rebalance thread: %li", PTR_ERR(p)); return PTR_ERR(p); + } get_task_struct(p); rcu_assign_pointer(c->rebalance.thread, p); diff --git a/fs/bcachefs/rebalance_types.h b/fs/bcachefs/rebalance_types.h index 192c6be20ced..2f62a643c39f 100644 --- a/fs/bcachefs/rebalance_types.h +++ b/fs/bcachefs/rebalance_types.h @@ -17,7 +17,7 @@ struct bch_fs_rebalance { atomic64_t work_unknown_dev; enum rebalance_state state; - unsigned long throttled_until_iotime; + u64 throttled_until_iotime; unsigned long throttled_until_cputime; struct bch_move_stats move_stats; diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 1883a1faf380..c42919277c72 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" +#include "bkey_buf.h" #include "alloc_background.h" #include "btree_gc.h" #include "btree_update.h" @@ -15,6 +16,7 @@ #include "journal_io.h" #include "journal_reclaim.h" #include "journal_seq_blacklist.h" +#include "move.h" #include "quota.h" #include "recovery.h" #include "replicas.h" @@ -39,78 +41,174 @@ static void drop_alloc_keys(struct journal_keys *keys) /* iterate over keys read from the journal: */ -static struct journal_key *journal_key_search(struct journal_keys *journal_keys, - enum btree_id id, unsigned level, - struct bpos pos) +static int __journal_key_cmp(enum btree_id l_btree_id, + unsigned l_level, + struct bpos l_pos, + struct journal_key *r) +{ + return (cmp_int(l_btree_id, r->btree_id) ?: + cmp_int(l_level, r->level) ?: + bkey_cmp(l_pos, r->k->k.p)); +} + +static int journal_key_cmp(struct journal_key *l, struct journal_key *r) +{ + return (cmp_int(l->btree_id, r->btree_id) ?: + cmp_int(l->level, r->level) ?: + bkey_cmp(l->k->k.p, r->k->k.p)); +} + +static size_t journal_key_search(struct journal_keys *journal_keys, + enum btree_id id, unsigned level, + struct bpos pos) { size_t l = 0, r = journal_keys->nr, m; while (l < r) { m = l + ((r - l) >> 1); - if ((cmp_int(id, journal_keys->d[m].btree_id) ?: - cmp_int(level, journal_keys->d[m].level) ?: - bkey_cmp(pos, journal_keys->d[m].k->k.p)) > 0) + if (__journal_key_cmp(id, level, pos, &journal_keys->d[m]) > 0) l = m + 1; else r = m; } BUG_ON(l < journal_keys->nr && - (cmp_int(id, journal_keys->d[l].btree_id) ?: - cmp_int(level, journal_keys->d[l].level) ?: - bkey_cmp(pos, journal_keys->d[l].k->k.p)) > 0); + __journal_key_cmp(id, level, pos, &journal_keys->d[l]) > 0); BUG_ON(l && - (cmp_int(id, journal_keys->d[l - 1].btree_id) ?: - cmp_int(level, journal_keys->d[l - 1].level) ?: - bkey_cmp(pos, journal_keys->d[l - 1].k->k.p)) <= 0); + __journal_key_cmp(id, level, pos, &journal_keys->d[l - 1]) <= 0); + + return l; +} + +static 
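journal_key_search() above is a lower-bound binary search over keys sorted by (btree_id, level, pos); returning the index rather than a pointer is what lets the new insert path below reuse it as an insertion position. The same loop in miniature over plain ints:

#include <stddef.h>
#include <stdio.h>

/* First index whose element is >= target; d must be sorted ascending. */
static size_t lower_bound(const int *d, size_t nr, int target)
{
	size_t l = 0, r = nr, m;

	while (l < r) {
		m = l + ((r - l) >> 1);
		if (d[m] < target)	/* cmp(target, d[m]) > 0 in the original */
			l = m + 1;
		else
			r = m;
	}
	return l;	/* may equal nr: insert position at the end */
}

int main(void)
{
	int d[] = { 1, 3, 3, 7 };

	printf("%zu %zu %zu\n",
	       lower_bound(d, 4, 3),	/* 1 */
	       lower_bound(d, 4, 4),	/* 3 */
	       lower_bound(d, 4, 9));	/* 4 */
	return 0;
}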
void journal_iter_fix(struct bch_fs *c, struct journal_iter *iter, unsigned idx) +{ + struct bkey_i *n = iter->keys->d[idx].k; + struct btree_and_journal_iter *biter = + container_of(iter, struct btree_and_journal_iter, journal); + + if (iter->idx > idx || + (iter->idx == idx && + biter->last && + bkey_cmp(n->k.p, biter->unpacked.p) <= 0)) + iter->idx++; +} + +int bch2_journal_key_insert(struct bch_fs *c, enum btree_id id, + unsigned level, struct bkey_i *k) +{ + struct journal_key n = { + .btree_id = id, + .level = level, + .k = k, + .allocated = true + }; + struct journal_keys *keys = &c->journal_keys; + struct journal_iter *iter; + unsigned idx = journal_key_search(keys, id, level, k->k.p); + + if (idx < keys->nr && + journal_key_cmp(&n, &keys->d[idx]) == 0) { + if (keys->d[idx].allocated) + kfree(keys->d[idx].k); + keys->d[idx] = n; + return 0; + } + + if (keys->nr == keys->size) { + struct journal_keys new_keys = { + .nr = keys->nr, + .size = keys->size * 2, + .journal_seq_base = keys->journal_seq_base, + }; + + new_keys.d = kvmalloc(sizeof(new_keys.d[0]) * new_keys.size, GFP_KERNEL); + if (!new_keys.d) { + bch_err(c, "%s: error allocating new key array (size %zu)", + __func__, new_keys.size); + return -ENOMEM; + } - return l < journal_keys->nr ? journal_keys->d + l : NULL; + memcpy(new_keys.d, keys->d, sizeof(keys->d[0]) * keys->nr); + kvfree(keys->d); + *keys = new_keys; + } + + array_insert_item(keys->d, keys->nr, idx, n); + + list_for_each_entry(iter, &c->journal_iters, list) + journal_iter_fix(c, iter, idx); + + return 0; +} + +int bch2_journal_key_delete(struct bch_fs *c, enum btree_id id, + unsigned level, struct bpos pos) +{ + struct bkey_i *whiteout = + kmalloc(sizeof(struct bkey), GFP_KERNEL); + int ret; + + if (!whiteout) { + bch_err(c, "%s: error allocating new key", __func__); + return -ENOMEM; + } + + bkey_init(&whiteout->k); + whiteout->k.p = pos; + + ret = bch2_journal_key_insert(c, id, level, whiteout); + if (ret) + kfree(whiteout); + return ret; } static struct bkey_i *bch2_journal_iter_peek(struct journal_iter *iter) { - if (iter->k && - iter->k < iter->keys->d + iter->keys->nr && - iter->k->btree_id == iter->btree_id && - iter->k->level == iter->level) - return iter->k->k; + struct journal_key *k = iter->idx - iter->keys->nr + ? iter->keys->d + iter->idx : NULL; + + if (k && + k->btree_id == iter->btree_id && + k->level == iter->level) + return k->k; - iter->k = NULL; + iter->idx = iter->keys->nr; return NULL; } static void bch2_journal_iter_advance(struct journal_iter *iter) { - if (iter->k) - iter->k++; + if (iter->idx < iter->keys->nr) + iter->idx++; +} + +static void bch2_journal_iter_exit(struct journal_iter *iter) +{ + list_del(&iter->list); } -static void bch2_journal_iter_init(struct journal_iter *iter, - struct journal_keys *journal_keys, +static void bch2_journal_iter_init(struct bch_fs *c, + struct journal_iter *iter, enum btree_id id, unsigned level, struct bpos pos) { iter->btree_id = id; iter->level = level; - iter->keys = journal_keys; - iter->k = journal_key_search(journal_keys, id, level, pos); + iter->keys = &c->journal_keys; + iter->idx = journal_key_search(&c->journal_keys, id, level, pos); + list_add(&iter->list, &c->journal_iters); } static struct bkey_s_c bch2_journal_iter_peek_btree(struct btree_and_journal_iter *iter) { - return iter->btree - ? 
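bch2_journal_key_insert() above either replaces an equal key in place or doubles the backing array and shifts the tail up to open a slot at the searched index. A self-contained sketch of that insert path — the real code also keeps live iterators in sync via journal_iter_fix() and allocates with kvmalloc, both omitted here:

#include <stdlib.h>
#include <string.h>
#include <stdio.h>

struct keys {
	int	*d;
	size_t	nr, size;
};

/* Insert v at sorted position idx, doubling the backing array when full. */
static int keys_insert(struct keys *k, size_t idx, int v)
{
	if (k->nr == k->size) {
		size_t new_size = k->size ? k->size * 2 : 8;
		int *n = malloc(new_size * sizeof(*n));

		if (!n)
			return -1;
		memcpy(n, k->d, k->nr * sizeof(*n));
		free(k->d);
		k->d = n;
		k->size = new_size;
	}

	memmove(&k->d[idx + 1], &k->d[idx], (k->nr - idx) * sizeof(*k->d));
	k->d[idx] = v;
	k->nr++;
	return 0;
}

int main(void)
{
	struct keys k = { 0 };

	keys_insert(&k, 0, 7);
	keys_insert(&k, 0, 3);	/* -> 3 7 */
	printf("%d %d\n", k.d[0], k.d[1]);
	free(k.d);
	return 0;
}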
bch2_btree_iter_peek(iter->btree) - : bch2_btree_node_iter_peek_unpack(&iter->node_iter, - iter->b, &iter->unpacked); + return bch2_btree_node_iter_peek_unpack(&iter->node_iter, + iter->b, &iter->unpacked); } static void bch2_journal_iter_advance_btree(struct btree_and_journal_iter *iter) { - if (iter->btree) - bch2_btree_iter_next(iter->btree); - else - bch2_btree_node_iter_advance(&iter->node_iter, iter->b); + bch2_btree_node_iter_advance(&iter->node_iter, iter->b); } void bch2_btree_and_journal_iter_advance(struct btree_and_journal_iter *iter) @@ -159,7 +257,7 @@ struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter * if (iter->b && bkey_cmp(ret.k->p, iter->b->data->max_key) > 0) { - iter->journal.k = NULL; + iter->journal.idx = iter->journal.keys->nr; iter->last = none; return bkey_s_c_null; } @@ -180,31 +278,50 @@ struct bkey_s_c bch2_btree_and_journal_iter_next(struct btree_and_journal_iter * return bch2_btree_and_journal_iter_peek(iter); } -void bch2_btree_and_journal_iter_init(struct btree_and_journal_iter *iter, - struct btree_trans *trans, - struct journal_keys *journal_keys, - enum btree_id id, struct bpos pos) +void bch2_btree_and_journal_iter_exit(struct btree_and_journal_iter *iter) { - memset(iter, 0, sizeof(*iter)); - - iter->btree = bch2_trans_get_iter(trans, id, pos, BTREE_ITER_PREFETCH); - bch2_journal_iter_init(&iter->journal, journal_keys, id, 0, pos); + bch2_journal_iter_exit(&iter->journal); } void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *iter, - struct journal_keys *journal_keys, + struct bch_fs *c, struct btree *b) { memset(iter, 0, sizeof(*iter)); iter->b = b; bch2_btree_node_iter_init_from_start(&iter->node_iter, iter->b); - bch2_journal_iter_init(&iter->journal, journal_keys, + bch2_journal_iter_init(c, &iter->journal, b->c.btree_id, b->c.level, b->data->min_key); } /* Walk btree, overlaying keys from the journal: */ +static void btree_and_journal_iter_prefetch(struct bch_fs *c, struct btree *b, + struct btree_and_journal_iter iter) +{ + unsigned i = 0, nr = b->c.level > 1 ? 
2 : 16; + struct bkey_s_c k; + struct bkey_buf tmp; + + BUG_ON(!b->c.level); + + bch2_bkey_buf_init(&tmp); + + while (i < nr && + (k = bch2_btree_and_journal_iter_peek(&iter)).k) { + bch2_bkey_buf_reassemble(&tmp, c, k); + + bch2_btree_node_prefetch(c, NULL, tmp.k, + b->c.btree_id, b->c.level - 1); + + bch2_btree_and_journal_iter_advance(&iter); + i++; + } + + bch2_bkey_buf_exit(&tmp, c); +} + static int bch2_btree_and_journal_walk_recurse(struct bch_fs *c, struct btree *b, struct journal_keys *journal_keys, enum btree_id btree_id, @@ -213,9 +330,12 @@ static int bch2_btree_and_journal_walk_recurse(struct bch_fs *c, struct btree *b { struct btree_and_journal_iter iter; struct bkey_s_c k; + struct bkey_buf tmp; + struct btree *child; int ret = 0; - bch2_btree_and_journal_iter_init_node_iter(&iter, journal_keys, b); + bch2_bkey_buf_init(&tmp); + bch2_btree_and_journal_iter_init_node_iter(&iter, c, b); while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) { ret = key_fn(c, btree_id, b->c.level, k); @@ -223,34 +343,34 @@ static int bch2_btree_and_journal_walk_recurse(struct bch_fs *c, struct btree *b break; if (b->c.level) { - struct btree *child; - BKEY_PADDED(k) tmp; - - bkey_reassemble(&tmp.k, k); - k = bkey_i_to_s_c(&tmp.k); + bch2_bkey_buf_reassemble(&tmp, c, k); bch2_btree_and_journal_iter_advance(&iter); - if (b->c.level > 0) { - child = bch2_btree_node_get_noiter(c, &tmp.k, - b->c.btree_id, b->c.level - 1); - ret = PTR_ERR_OR_ZERO(child); - if (ret) - break; + child = bch2_btree_node_get_noiter(c, tmp.k, + b->c.btree_id, b->c.level - 1, + false); + + ret = PTR_ERR_OR_ZERO(child); + if (ret) + break; - ret = (node_fn ? node_fn(c, b) : 0) ?: - bch2_btree_and_journal_walk_recurse(c, child, - journal_keys, btree_id, node_fn, key_fn); - six_unlock_read(&child->c.lock); + btree_and_journal_iter_prefetch(c, b, iter); - if (ret) - break; - } + ret = (node_fn ? node_fn(c, b) : 0) ?: + bch2_btree_and_journal_walk_recurse(c, child, + journal_keys, btree_id, node_fn, key_fn); + six_unlock_read(&child->c.lock); + + if (ret) + break; } else { bch2_btree_and_journal_iter_advance(&iter); } } + bch2_btree_and_journal_iter_exit(&iter); + bch2_bkey_buf_exit(&tmp, c); return ret; } @@ -306,6 +426,12 @@ static int journal_sort_key_cmp(const void *_l, const void *_r) void bch2_journal_keys_free(struct journal_keys *keys) { + struct journal_key *i; + + for (i = keys->d; i < keys->d + keys->nr; i++) + if (i->allocated) + kfree(i->k); + kvfree(keys->d); keys->d = NULL; keys->nr = 0; @@ -334,7 +460,9 @@ static struct journal_keys journal_keys_sort(struct list_head *journal_entries) nr_keys++; } - keys.d = kvmalloc(sizeof(keys.d[0]) * nr_keys, GFP_KERNEL); + keys.size = roundup_pow_of_two(nr_keys); + + keys.d = kvmalloc(sizeof(keys.d[0]) * keys.size, GFP_KERNEL); if (!keys.d) goto err; @@ -384,115 +512,6 @@ static void replay_now_at(struct journal *j, u64 seq) bch2_journal_pin_put(j, j->replay_journal_seq++); } -static int bch2_extent_replay_key(struct bch_fs *c, enum btree_id btree_id, - struct bkey_i *k) -{ - struct btree_trans trans; - struct btree_iter *iter, *split_iter; - /* - * We might cause compressed extents to be split, so we need to pass in - * a disk_reservation: - */ - struct disk_reservation disk_res = - bch2_disk_reservation_init(c, 0); - struct bkey_i *split; - struct bpos atomic_end; - /* - * Some extents aren't equivalent - w.r.t. 
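btree_and_journal_iter_prefetch() above receives its iterator by value, so it can walk ahead and issue bch2_btree_node_prefetch() for the next few children — 16 just above the leaves, 2 higher up — without disturbing the caller's position. The by-value trick in miniature:

#include <stdio.h>

struct iter { const int *d; unsigned idx, nr; };

/* Taking 'it' by value: advancing it here cannot move the caller's iterator. */
static void prefetch_next(struct iter it, unsigned nr)
{
	unsigned i;

	for (i = 0; i < nr && it.idx < it.nr; i++, it.idx++)
		printf("prefetch %d\n", it.d[it.idx]);	/* stand-in for the IO */
}

int main(void)
{
	int d[] = { 10, 20, 30 };
	struct iter it = { d, 0, 3 };

	prefetch_next(it, 2);	/* prefetches 10, 20 */
	printf("caller still at %u\n", it.idx);	/* 0 */
	return 0;
}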
what the triggers do - * - if they're split: - */ - bool remark_if_split = bch2_bkey_sectors_compressed(bkey_i_to_s_c(k)) || - k->k.type == KEY_TYPE_reflink_p; - bool remark = false; - int ret; - - bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); -retry: - bch2_trans_begin(&trans); - - iter = bch2_trans_get_iter(&trans, btree_id, - bkey_start_pos(&k->k), - BTREE_ITER_INTENT); - - do { - ret = bch2_btree_iter_traverse(iter); - if (ret) - goto err; - - atomic_end = bpos_min(k->k.p, iter->l[0].b->key.k.p); - - split = bch2_trans_kmalloc(&trans, bkey_bytes(&k->k)); - ret = PTR_ERR_OR_ZERO(split); - if (ret) - goto err; - - if (!remark && - remark_if_split && - bkey_cmp(atomic_end, k->k.p) < 0) { - ret = bch2_disk_reservation_add(c, &disk_res, - k->k.size * - bch2_bkey_nr_ptrs_allocated(bkey_i_to_s_c(k)), - BCH_DISK_RESERVATION_NOFAIL); - BUG_ON(ret); - - remark = true; - } - - bkey_copy(split, k); - bch2_cut_front(iter->pos, split); - bch2_cut_back(atomic_end, split); - - split_iter = bch2_trans_copy_iter(&trans, iter); - - /* - * It's important that we don't go through the - * extent_handle_overwrites() and extent_update_to_keys() path - * here: journal replay is supposed to treat extents like - * regular keys - */ - __bch2_btree_iter_set_pos(split_iter, split->k.p, false); - bch2_trans_update(&trans, split_iter, split, - BTREE_TRIGGER_NORUN); - bch2_trans_iter_put(&trans, split_iter); - - bch2_btree_iter_set_pos(iter, split->k.p); - - if (remark) { - ret = bch2_trans_mark_key(&trans, - bkey_s_c_null, - bkey_i_to_s_c(split), - 0, split->k.size, - BTREE_TRIGGER_INSERT); - if (ret) - goto err; - } - } while (bkey_cmp(iter->pos, k->k.p) < 0); - - if (remark) { - ret = bch2_trans_mark_key(&trans, - bkey_i_to_s_c(k), - bkey_s_c_null, - 0, -((s64) k->k.size), - BTREE_TRIGGER_OVERWRITE); - if (ret) - goto err; - } - - ret = bch2_trans_commit(&trans, &disk_res, NULL, - BTREE_INSERT_NOFAIL| - BTREE_INSERT_LAZY_RW| - BTREE_INSERT_JOURNAL_REPLAY); -err: - bch2_trans_iter_put(&trans, iter); - - if (ret == -EINTR) - goto retry; - - bch2_disk_reservation_put(c, &disk_res); - - return bch2_trans_exit(&trans) ?: ret; -} - static int __bch2_journal_replay_key(struct btree_trans *trans, enum btree_id id, unsigned level, struct bkey_i *k) @@ -518,14 +537,16 @@ static int __bch2_journal_replay_key(struct btree_trans *trans, return ret; } -static int bch2_journal_replay_key(struct bch_fs *c, enum btree_id id, - unsigned level, struct bkey_i *k) +static int bch2_journal_replay_key(struct bch_fs *c, struct journal_key *k) { - return bch2_trans_do(c, NULL, NULL, - BTREE_INSERT_NOFAIL| - BTREE_INSERT_LAZY_RW| - BTREE_INSERT_JOURNAL_REPLAY, - __bch2_journal_replay_key(&trans, id, level, k)); + unsigned commit_flags = BTREE_INSERT_NOFAIL| + BTREE_INSERT_LAZY_RW; + + if (!k->allocated) + commit_flags |= BTREE_INSERT_JOURNAL_REPLAY; + + return bch2_trans_do(c, NULL, NULL, commit_flags, + __bch2_journal_replay_key(&trans, k->btree_id, k->level, k->k)); } static int __bch2_alloc_replay_key(struct btree_trans *trans, struct bkey_i *k) @@ -601,7 +622,7 @@ static int bch2_journal_replay(struct bch_fs *c, if (i->level) { j->replay_journal_seq = keys.journal_seq_base + i->journal_seq; - ret = bch2_journal_replay_key(c, i->btree_id, i->level, i->k); + ret = bch2_journal_replay_key(c, i); if (ret) goto err; } @@ -629,9 +650,7 @@ static int bch2_journal_replay(struct bch_fs *c, replay_now_at(j, keys.journal_seq_base + i->journal_seq); - ret = i->k->k.size - ? 
bch2_extent_replay_key(c, i->btree_id, i->k) - : bch2_journal_replay_key(c, i->btree_id, i->level, i->k); + ret = bch2_journal_replay_key(c, i); if (ret) goto err; } @@ -643,7 +662,8 @@ static int bch2_journal_replay(struct bch_fs *c, bch2_journal_flush_all_pins(j); return bch2_journal_error(j); err: - bch_err(c, "journal replay: error %d while replaying key", ret); + bch_err(c, "journal replay: error %d while replaying key at btree %s level %u", + ret, bch2_btree_ids[i->btree_id], i->level); return ret; } @@ -700,10 +720,31 @@ static int journal_replay_entry_early(struct bch_fs *c, case BCH_JSET_ENTRY_data_usage: { struct jset_entry_data_usage *u = container_of(entry, struct jset_entry_data_usage, entry); + ret = bch2_replicas_set_usage(c, &u->r, le64_to_cpu(u->v)); break; } + case BCH_JSET_ENTRY_dev_usage: { + struct jset_entry_dev_usage *u = + container_of(entry, struct jset_entry_dev_usage, entry); + struct bch_dev *ca = bch_dev_bkey_exists(c, u->dev); + unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64); + unsigned nr_types = (bytes - sizeof(struct jset_entry_dev_usage)) / + sizeof(struct jset_entry_dev_usage_type); + unsigned i; + + ca->usage_base->buckets_ec = le64_to_cpu(u->buckets_ec); + ca->usage_base->buckets_unavailable = le64_to_cpu(u->buckets_unavailable); + + for (i = 0; i < nr_types; i++) { + ca->usage_base->d[i].buckets = le64_to_cpu(u->d[i].buckets); + ca->usage_base->d[i].sectors = le64_to_cpu(u->d[i].sectors); + ca->usage_base->d[i].fragmented = le64_to_cpu(u->d[i].fragmented); + } + + break; + } case BCH_JSET_ENTRY_blacklist: { struct jset_entry_blacklist *bl_entry = container_of(entry, struct jset_entry_blacklist, entry); @@ -722,6 +763,12 @@ static int journal_replay_entry_early(struct bch_fs *c, le64_to_cpu(bl_entry->end) + 1); break; } + case BCH_JSET_ENTRY_clock: { + struct jset_entry_clock *clock = + container_of(entry, struct jset_entry_clock, entry); + + atomic64_set(&c->io_clock[clock->rw].now, clock->time); + } } return ret; @@ -736,9 +783,6 @@ static int journal_replay_early(struct bch_fs *c, int ret; if (clean) { - c->bucket_clock[READ].hand = le16_to_cpu(clean->read_clock); - c->bucket_clock[WRITE].hand = le16_to_cpu(clean->write_clock); - for (entry = clean->start; entry != vstruct_end(&clean->field); entry = vstruct_next(entry)) { @@ -751,9 +795,6 @@ static int journal_replay_early(struct bch_fs *c, if (i->ignore) continue; - c->bucket_clock[READ].hand = le16_to_cpu(i->j.read_clock); - c->bucket_clock[WRITE].hand = le16_to_cpu(i->j.write_clock); - vstruct_for_each(&i->j, entry) { ret = journal_replay_entry_early(c, entry); if (ret) @@ -817,13 +858,6 @@ static int verify_superblock_clean(struct bch_fs *c, return 0; } - mustfix_fsck_err_on(j->read_clock != clean->read_clock, c, - "superblock read clock %u doesn't match journal %u after clean shutdown", - clean->read_clock, j->read_clock); - mustfix_fsck_err_on(j->write_clock != clean->write_clock, c, - "superblock write clock %u doesn't match journal %u after clean shutdown", - clean->write_clock, j->write_clock); - for (i = 0; i < BTREE_ID_NR; i++) { char buf1[200], buf2[200]; struct bkey_i *k1, *k2; @@ -899,7 +933,7 @@ static int read_btree_roots(struct bch_fs *c) if (i == BTREE_ID_ALLOC && c->opts.reconstruct_alloc) { - c->sb.compat &= ~(1ULL << BCH_COMPAT_FEAT_ALLOC_INFO); + c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info); continue; } @@ -909,7 +943,7 @@ static int read_btree_roots(struct bch_fs *c) "invalid btree root %s", bch2_btree_ids[i]); if (i == BTREE_ID_ALLOC) - c->sb.compat &= 
~(1ULL << BCH_COMPAT_FEAT_ALLOC_INFO); + c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info); } ret = bch2_btree_root_read(c, i, &r->key, r->level); @@ -919,7 +953,7 @@ static int read_btree_roots(struct bch_fs *c) "error reading btree root %s", bch2_btree_ids[i]); if (i == BTREE_ID_ALLOC) - c->sb.compat &= ~(1ULL << BCH_COMPAT_FEAT_ALLOC_INFO); + c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info); } } @@ -936,7 +970,7 @@ int bch2_fs_recovery(struct bch_fs *c) struct bch_sb_field_clean *clean = NULL; struct jset *last_journal_entry = NULL; u64 blacklist_seq, journal_seq; - bool write_sb = false, need_write_alloc = false; + bool write_sb = false; int ret; if (c->sb.clean) @@ -949,6 +983,13 @@ int bch2_fs_recovery(struct bch_fs *c) bch_info(c, "recovering from clean shutdown, journal seq %llu", le64_to_cpu(clean->journal_seq)); + if (!(c->sb.features & (1ULL << BCH_FEATURE_alloc_v2))) { + bch_info(c, "alloc_v2 feature bit not set, fsck required"); + c->opts.fsck = true; + c->opts.fix_errors = FSCK_OPT_YES; + c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_alloc_v2; + } + if (!c->replicas.entries || c->opts.rebuild_replicas) { bch_info(c, "building replicas info"); @@ -979,7 +1020,7 @@ int bch2_fs_recovery(struct bch_fs *c) last_journal_entry && !journal_entry_empty(last_journal_entry), c, "filesystem marked clean but journal not empty")) { - c->sb.compat &= ~(1ULL << BCH_COMPAT_FEAT_ALLOC_INFO); + c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info); SET_BCH_SB_CLEAN(c->disk_sb.sb, false); c->sb.clean = false; } @@ -1020,7 +1061,7 @@ use_clean: } if (c->opts.reconstruct_alloc) { - c->sb.compat &= ~(1ULL << BCH_COMPAT_FEAT_ALLOC_INFO); + c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info); drop_alloc_keys(&c->journal_keys); } @@ -1072,36 +1113,20 @@ use_clean: set_bit(BCH_FS_ALLOC_READ_DONE, &c->flags); - if ((c->sb.compat & (1ULL << BCH_COMPAT_FEAT_ALLOC_INFO)) && - !(c->sb.compat & (1ULL << BCH_COMPAT_FEAT_ALLOC_METADATA))) { - /* - * interior btree node updates aren't consistent with the - * journal; after an unclean shutdown we have to walk all - * pointers to metadata: - */ - bch_info(c, "starting metadata mark and sweep"); - err = "error in mark and sweep"; - ret = bch2_gc(c, &c->journal_keys, true, true); - if (ret < 0) - goto err; - if (ret) - need_write_alloc = true; - bch_verbose(c, "mark and sweep done"); - } - if (c->opts.fsck || - !(c->sb.compat & (1ULL << BCH_COMPAT_FEAT_ALLOC_INFO)) || + !(c->sb.compat & (1ULL << BCH_COMPAT_alloc_info)) || + !(c->sb.compat & (1ULL << BCH_COMPAT_alloc_metadata)) || test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags)) { bch_info(c, "starting mark and sweep"); err = "error in mark and sweep"; - ret = bch2_gc(c, &c->journal_keys, true, false); - if (ret < 0) - goto err; + ret = bch2_gc(c, true); if (ret) - need_write_alloc = true; + goto err; bch_verbose(c, "mark and sweep done"); } + bch2_stripes_heap_start(c); + clear_bit(BCH_FS_REBUILD_REPLICAS, &c->flags); set_bit(BCH_FS_INITIAL_GC_DONE, &c->flags); @@ -1122,7 +1147,8 @@ use_clean: goto err; bch_verbose(c, "journal replay done"); - if (need_write_alloc && !c->opts.nochanges) { + if (test_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags) && + !c->opts.nochanges) { /* * note that even when filesystem was clean there might be work * to do here, if we ran gc (because of fsck) which recalculated @@ -1137,8 +1163,6 @@ use_clean: goto err; } bch_verbose(c, "alloc write done"); - - set_bit(BCH_FS_ALLOC_WRITTEN, &c->flags); } if (!c->sb.clean) { @@ -1177,6 +1201,21 @@ use_clean: bch_verbose(c, "quotas done"); } + if 
(!(c->sb.compat & (1ULL << BCH_COMPAT_extents_above_btree_updates_done)) || + !(c->sb.compat & (1ULL << BCH_COMPAT_bformat_overflow_done))) { + struct bch_move_stats stats = { 0 }; + + bch_info(c, "scanning for old btree nodes"); + ret = bch2_fs_read_write(c); + if (ret) + goto err; + + ret = bch2_scan_old_btree_nodes(c, &stats); + if (ret) + goto err; + bch_info(c, "scanning for old btree nodes done"); + } + mutex_lock(&c->sb_lock); if (c->opts.version_upgrade) { if (c->sb.version < bcachefs_metadata_version_new_versioning) @@ -1188,7 +1227,7 @@ use_clean: } if (!test_bit(BCH_FS_ERROR, &c->flags)) { - c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_FEAT_ALLOC_INFO; + c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_alloc_info; write_sb = true; } @@ -1248,6 +1287,8 @@ int bch2_fs_initialize(struct bch_fs *c) le16_to_cpu(bcachefs_metadata_version_current); c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_atomic_nlink; c->disk_sb.sb->features[0] |= BCH_SB_FEATURES_ALL; + c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_extents_above_btree_updates_done; + c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_bformat_overflow_done; bch2_write_super(c); mutex_unlock(&c->sb_lock); @@ -1312,8 +1353,10 @@ int bch2_fs_initialize(struct bch_fs *c) &lostfound, 0, 0, S_IFDIR|0700, 0, NULL, NULL)); - if (ret) + if (ret) { + bch_err(c, "error creating lost+found"); goto err; + } if (enabled_qtypes(c)) { ret = bch2_fs_quota_read(c); diff --git a/fs/bcachefs/recovery.h b/fs/bcachefs/recovery.h index a66827c9addf..fa91851b9ed7 100644 --- a/fs/bcachefs/recovery.h +++ b/fs/bcachefs/recovery.h @@ -6,10 +6,11 @@ for (i = (keys).d; i < (keys).d + (keys).nr; (i)++) struct journal_iter { + struct list_head list; enum btree_id btree_id; unsigned level; + size_t idx; struct journal_keys *keys; - struct journal_key *k; }; /* @@ -17,8 +18,6 @@ struct journal_iter { */ struct btree_and_journal_iter { - struct btree_iter *btree; - struct btree *b; struct btree_node_iter node_iter; struct bkey unpacked; @@ -32,16 +31,18 @@ struct btree_and_journal_iter { } last; }; +int bch2_journal_key_insert(struct bch_fs *, enum btree_id, + unsigned, struct bkey_i *); +int bch2_journal_key_delete(struct bch_fs *, enum btree_id, + unsigned, struct bpos); + void bch2_btree_and_journal_iter_advance(struct btree_and_journal_iter *); struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *); struct bkey_s_c bch2_btree_and_journal_iter_next(struct btree_and_journal_iter *); -void bch2_btree_and_journal_iter_init(struct btree_and_journal_iter *, - struct btree_trans *, - struct journal_keys *, - enum btree_id, struct bpos); +void bch2_btree_and_journal_iter_exit(struct btree_and_journal_iter *); void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *, - struct journal_keys *, + struct bch_fs *, struct btree *); typedef int (*btree_walk_node_fn)(struct bch_fs *c, struct btree *b); diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c index 8abcbfb3bd64..930547de3309 100644 --- a/fs/bcachefs/reflink.c +++ b/fs/bcachefs/reflink.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" -#include "bkey_on_stack.h" +#include "bkey_buf.h" #include "btree_update.h" #include "extents.h" #include "inode.h" @@ -198,8 +198,7 @@ s64 bch2_remap_range(struct bch_fs *c, struct btree_trans trans; struct btree_iter *dst_iter, *src_iter; struct bkey_s_c src_k; - BKEY_PADDED(k) new_dst; - struct bkey_on_stack new_src; + struct bkey_buf new_dst, new_src; struct bpos dst_end = dst_start, src_end = 
src_start; struct bpos dst_want, src_want; u64 src_done, dst_done; @@ -216,7 +215,8 @@ s64 bch2_remap_range(struct bch_fs *c, dst_end.offset += remap_sectors; src_end.offset += remap_sectors; - bkey_on_stack_init(&new_src); + bch2_bkey_buf_init(&new_dst); + bch2_bkey_buf_init(&new_src); bch2_trans_init(&trans, c, BTREE_ITER_MAX, 4096); src_iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, src_start, @@ -257,7 +257,7 @@ s64 bch2_remap_range(struct bch_fs *c, break; if (src_k.k->type != KEY_TYPE_reflink_p) { - bkey_on_stack_reassemble(&new_src, c, src_k); + bch2_bkey_buf_reassemble(&new_src, c, src_k); src_k = bkey_i_to_s_c(new_src.k); bch2_cut_front(src_iter->pos, new_src.k); @@ -275,7 +275,7 @@ s64 bch2_remap_range(struct bch_fs *c, struct bkey_s_c_reflink_p src_p = bkey_s_c_to_reflink_p(src_k); struct bkey_i_reflink_p *dst_p = - bkey_reflink_p_init(&new_dst.k); + bkey_reflink_p_init(new_dst.k); u64 offset = le64_to_cpu(src_p.v->idx) + (src_iter->pos.offset - @@ -286,12 +286,12 @@ s64 bch2_remap_range(struct bch_fs *c, BUG(); } - new_dst.k.k.p = dst_iter->pos; - bch2_key_resize(&new_dst.k.k, + new_dst.k->k.p = dst_iter->pos; + bch2_key_resize(&new_dst.k->k, min(src_k.k->p.offset - src_iter->pos.offset, dst_end.offset - dst_iter->pos.offset)); - ret = bch2_extent_update(&trans, dst_iter, &new_dst.k, + ret = bch2_extent_update(&trans, dst_iter, new_dst.k, NULL, journal_seq, new_i_size, i_sectors_delta); if (ret) @@ -333,7 +333,8 @@ err: } while (ret2 == -EINTR); ret = bch2_trans_exit(&trans) ?: ret; - bkey_on_stack_exit(&new_src, c); + bch2_bkey_buf_exit(&new_src, c); + bch2_bkey_buf_exit(&new_dst, c); percpu_ref_put(&c->writes); diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c index 00a197b65e0b..be73b458e4f6 100644 --- a/fs/bcachefs/replicas.c +++ b/fs/bcachefs/replicas.c @@ -11,11 +11,6 @@ static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *, /* Replicas tracking - in memory: */ -static inline int u8_cmp(u8 l, u8 r) -{ - return cmp_int(l, r); -} - static void verify_replicas_entry(struct bch_replicas_entry *e) { #ifdef CONFIG_BCACHEFS_DEBUG @@ -31,7 +26,7 @@ static void verify_replicas_entry(struct bch_replicas_entry *e) #endif } -static void replicas_entry_sort(struct bch_replicas_entry *e) +void bch2_replicas_entry_sort(struct bch_replicas_entry *e) { bubble_sort(e->devs, e->nr_devs, u8_cmp); } @@ -127,7 +122,7 @@ void bch2_bkey_to_replicas(struct bch_replicas_entry *e, break; } - replicas_entry_sort(e); + bch2_replicas_entry_sort(e); } void bch2_devlist_to_replicas(struct bch_replicas_entry *e, @@ -147,7 +142,7 @@ void bch2_devlist_to_replicas(struct bch_replicas_entry *e, for (i = 0; i < devs.nr; i++) e->devs[e->nr_devs++] = devs.devs[i]; - replicas_entry_sort(e); + bch2_replicas_entry_sort(e); } static struct bch_replicas_cpu @@ -164,7 +159,7 @@ cpu_replicas_add_entry(struct bch_replicas_cpu *old, BUG_ON(!new_entry->data_type); verify_replicas_entry(new_entry); - new.entries = kcalloc(new.nr, new.entry_size, GFP_NOIO); + new.entries = kcalloc(new.nr, new.entry_size, GFP_KERNEL); if (!new.entries) return new; @@ -202,7 +197,7 @@ static inline int __replicas_entry_idx(struct bch_replicas_cpu *r, int bch2_replicas_entry_idx(struct bch_fs *c, struct bch_replicas_entry *search) { - replicas_entry_sort(search); + bch2_replicas_entry_sort(search); return __replicas_entry_idx(&c->replicas, search); } @@ -287,13 +282,13 @@ static int replicas_table_update(struct bch_fs *c, for (i = 0; i < ARRAY_SIZE(new_usage); i++) if (!(new_usage[i] = __alloc_percpu_gfp(bytes, - 
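/*
 * Review note: replicas.c switches every GFP_NOIO allocation to
 * GFP_KERNEL in this and the following hunks.  These allocations
 * happen under sb_lock/mark_lock rather than in the I/O submission
 * path, so the GFP_NOIO restriction (no I/O during reclaim) is
 * presumably unnecessary here and GFP_KERNEL lets reclaim make full
 * progress.
 */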
sizeof(u64), GFP_NOIO))) + sizeof(u64), GFP_KERNEL))) goto err; - if (!(new_base = kzalloc(bytes, GFP_NOIO)) || - !(new_scratch = kmalloc(bytes, GFP_NOIO)) || + if (!(new_base = kzalloc(bytes, GFP_KERNEL)) || + !(new_scratch = kmalloc(bytes, GFP_KERNEL)) || (c->usage_gc && - !(new_gc = __alloc_percpu_gfp(bytes, sizeof(u64), GFP_NOIO)))) + !(new_gc = __alloc_percpu_gfp(bytes, sizeof(u64), GFP_KERNEL)))) goto err; for (i = 0; i < ARRAY_SIZE(new_usage); i++) @@ -553,7 +548,7 @@ int bch2_replicas_gc_start(struct bch_fs *c, unsigned typemask) c->replicas_gc.entries = kcalloc(c->replicas_gc.nr, c->replicas_gc.entry_size, - GFP_NOIO); + GFP_KERNEL); if (!c->replicas_gc.entries) { mutex_unlock(&c->sb_lock); bch_err(c, "error allocating c->replicas_gc"); @@ -603,7 +598,11 @@ retry: cpu_replicas_entry(&c->replicas, i); if (e->data_type == BCH_DATA_journal || - bch2_fs_usage_read_one(c, &c->usage_base->replicas[i])) + c->usage_base->replicas[i] || + percpu_u64_get(&c->usage[0]->replicas[i]) || + percpu_u64_get(&c->usage[1]->replicas[i]) || + percpu_u64_get(&c->usage[2]->replicas[i]) || + percpu_u64_get(&c->usage[3]->replicas[i])) memcpy(cpu_replicas_entry(&new, new.nr++), e, new.entry_size); } @@ -672,7 +671,7 @@ __bch2_sb_replicas_to_cpu_replicas(struct bch_sb_field_replicas *sb_r, nr++; } - cpu_r->entries = kcalloc(nr, entry_size, GFP_NOIO); + cpu_r->entries = kcalloc(nr, entry_size, GFP_KERNEL); if (!cpu_r->entries) return -ENOMEM; @@ -682,7 +681,7 @@ __bch2_sb_replicas_to_cpu_replicas(struct bch_sb_field_replicas *sb_r, for_each_replicas_entry(sb_r, e) { dst = cpu_replicas_entry(cpu_r, idx++); memcpy(dst, e, replicas_entry_bytes(e)); - replicas_entry_sort(dst); + bch2_replicas_entry_sort(dst); } return 0; @@ -704,7 +703,7 @@ __bch2_sb_replicas_v0_to_cpu_replicas(struct bch_sb_field_replicas_v0 *sb_r, entry_size += sizeof(struct bch_replicas_entry) - sizeof(struct bch_replicas_entry_v0); - cpu_r->entries = kcalloc(nr, entry_size, GFP_NOIO); + cpu_r->entries = kcalloc(nr, entry_size, GFP_KERNEL); if (!cpu_r->entries) return -ENOMEM; @@ -719,7 +718,7 @@ __bch2_sb_replicas_v0_to_cpu_replicas(struct bch_sb_field_replicas_v0 *sb_r, dst->nr_devs = e->nr_devs; dst->nr_required = 1; memcpy(dst->devs, e->devs, e->nr_devs); - replicas_entry_sort(dst); + bch2_replicas_entry_sort(dst); } return 0; @@ -959,94 +958,48 @@ const struct bch_sb_field_ops bch_sb_field_ops_replicas_v0 = { /* Query replicas: */ -struct replicas_status __bch2_replicas_status(struct bch_fs *c, - struct bch_devs_mask online_devs) +bool bch2_have_enough_devs(struct bch_fs *c, struct bch_devs_mask devs, + unsigned flags, bool print) { - struct bch_sb_field_members *mi; struct bch_replicas_entry *e; - unsigned i, nr_online, nr_offline; - struct replicas_status ret; - - memset(&ret, 0, sizeof(ret)); - - for (i = 0; i < ARRAY_SIZE(ret.replicas); i++) - ret.replicas[i].redundancy = INT_MAX; - - mi = bch2_sb_get_members(c->disk_sb.sb); + bool ret = true; percpu_down_read(&c->mark_lock); - for_each_cpu_replicas_entry(&c->replicas, e) { - if (e->data_type >= ARRAY_SIZE(ret.replicas)) - panic("e %p data_type %u\n", e, e->data_type); + unsigned i, nr_online = 0, dflags = 0; + bool metadata = e->data_type < BCH_DATA_user; - nr_online = nr_offline = 0; + for (i = 0; i < e->nr_devs; i++) + nr_online += test_bit(e->devs[i], devs.d); - for (i = 0; i < e->nr_devs; i++) { - BUG_ON(!bch2_dev_exists(c->disk_sb.sb, mi, - e->devs[i])); + if (nr_online < e->nr_required) + dflags |= metadata + ? 
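/*
 * Review note: bch2_have_enough_devs() is rewritten to answer the
 * question per replicas entry instead of via the old aggregated
 * struct replicas_status.  For each entry it counts online member
 * devices, derives which degraded-mount flags that entry would
 * require, and fails (optionally printing the offending entry) when
 * the caller did not pass them.  Roughly:
 *
 *     if (nr_online < e->nr_required) dflags |= ..._LOST;
 *     if (nr_online < e->nr_devs)     dflags |= ..._DEGRADED;
 *     if (dflags & ~flags)            -> not enough devices
 */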
BCH_FORCE_IF_METADATA_LOST + : BCH_FORCE_IF_DATA_LOST; - if (test_bit(e->devs[i], online_devs.d)) - nr_online++; - else - nr_offline++; - } + if (nr_online < e->nr_devs) + dflags |= metadata + ? BCH_FORCE_IF_METADATA_DEGRADED + : BCH_FORCE_IF_DATA_DEGRADED; - ret.replicas[e->data_type].redundancy = - min(ret.replicas[e->data_type].redundancy, - (int) nr_online - (int) e->nr_required); + if (dflags & ~flags) { + if (print) { + char buf[100]; - ret.replicas[e->data_type].nr_offline = - max(ret.replicas[e->data_type].nr_offline, - nr_offline); - } + bch2_replicas_entry_to_text(&PBUF(buf), e); + bch_err(c, "insufficient devices online (%u) for replicas entry %s", + nr_online, buf); + } + ret = false; + break; + } + } percpu_up_read(&c->mark_lock); - for (i = 0; i < ARRAY_SIZE(ret.replicas); i++) - if (ret.replicas[i].redundancy == INT_MAX) - ret.replicas[i].redundancy = 0; - return ret; } -struct replicas_status bch2_replicas_status(struct bch_fs *c) -{ - return __bch2_replicas_status(c, bch2_online_devs(c)); -} - -static bool have_enough_devs(struct replicas_status s, - enum bch_data_type type, - bool force_if_degraded, - bool force_if_lost) -{ - return (!s.replicas[type].nr_offline || force_if_degraded) && - (s.replicas[type].redundancy >= 0 || force_if_lost); -} - -bool bch2_have_enough_devs(struct replicas_status s, unsigned flags) -{ - return (have_enough_devs(s, BCH_DATA_journal, - flags & BCH_FORCE_IF_METADATA_DEGRADED, - flags & BCH_FORCE_IF_METADATA_LOST) && - have_enough_devs(s, BCH_DATA_btree, - flags & BCH_FORCE_IF_METADATA_DEGRADED, - flags & BCH_FORCE_IF_METADATA_LOST) && - have_enough_devs(s, BCH_DATA_user, - flags & BCH_FORCE_IF_DATA_DEGRADED, - flags & BCH_FORCE_IF_DATA_LOST)); -} - -int bch2_replicas_online(struct bch_fs *c, bool meta) -{ - struct replicas_status s = bch2_replicas_status(c); - - return (meta - ? 
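/*
 * Review note: with bch2_have_enough_devs() taking a device mask and
 * flags directly, struct replicas_status and its consumers --
 * __bch2_replicas_status(), bch2_replicas_status(), and the
 * bch2_replicas_online() helper being deleted here -- go away rather
 * than being reimplemented; the corresponding sysfs
 * meta/data_replicas_have attributes are removed in sysfs.c below.
 */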
min(s.replicas[BCH_DATA_journal].redundancy, - s.replicas[BCH_DATA_btree].redundancy) - : s.replicas[BCH_DATA_user].redundancy) + 1; -} - unsigned bch2_dev_has_data(struct bch_fs *c, struct bch_dev *ca) { struct bch_replicas_entry *e; @@ -1066,8 +1019,9 @@ unsigned bch2_dev_has_data(struct bch_fs *c, struct bch_dev *ca) int bch2_fs_replicas_init(struct bch_fs *c) { - c->journal.entry_u64s_reserved += - reserve_journal_replicas(c, &c->replicas); + bch2_journal_entry_res_resize(&c->journal, + &c->replicas_journal_res, + reserve_journal_replicas(c, &c->replicas)); return replicas_table_update(c, &c->replicas); } diff --git a/fs/bcachefs/replicas.h b/fs/bcachefs/replicas.h index 8b95164fbb56..9c8fd3d98247 100644 --- a/fs/bcachefs/replicas.h +++ b/fs/bcachefs/replicas.h @@ -5,6 +5,7 @@ #include "eytzinger.h" #include "replicas_types.h" +void bch2_replicas_entry_sort(struct bch_replicas_entry *); void bch2_replicas_entry_to_text(struct printbuf *, struct bch_replicas_entry *); void bch2_cpu_replicas_to_text(struct printbuf *, struct bch_replicas_cpu *); @@ -38,19 +39,9 @@ static inline void bch2_replicas_entry_cached(struct bch_replicas_entry *e, e->devs[0] = dev; } -struct replicas_status { - struct { - int redundancy; - unsigned nr_offline; - } replicas[BCH_DATA_NR]; -}; +bool bch2_have_enough_devs(struct bch_fs *, struct bch_devs_mask, + unsigned, bool); -struct replicas_status __bch2_replicas_status(struct bch_fs *, - struct bch_devs_mask); -struct replicas_status bch2_replicas_status(struct bch_fs *); -bool bch2_have_enough_devs(struct replicas_status, unsigned); - -int bch2_replicas_online(struct bch_fs *, bool); unsigned bch2_dev_has_data(struct bch_fs *, struct bch_dev *); int bch2_replicas_gc_end(struct bch_fs *, int); diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index 78835bd2d6bc..86f1feff3aaa 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -276,19 +276,19 @@ const char *bch2_sb_validate(struct bch_sb_handle *disk_sb) return "Bad number of member devices"; if (!BCH_SB_META_REPLICAS_WANT(sb) || - BCH_SB_META_REPLICAS_WANT(sb) >= BCH_REPLICAS_MAX) + BCH_SB_META_REPLICAS_WANT(sb) > BCH_REPLICAS_MAX) return "Invalid number of metadata replicas"; if (!BCH_SB_META_REPLICAS_REQ(sb) || - BCH_SB_META_REPLICAS_REQ(sb) >= BCH_REPLICAS_MAX) + BCH_SB_META_REPLICAS_REQ(sb) > BCH_REPLICAS_MAX) return "Invalid number of metadata replicas"; if (!BCH_SB_DATA_REPLICAS_WANT(sb) || - BCH_SB_DATA_REPLICAS_WANT(sb) >= BCH_REPLICAS_MAX) + BCH_SB_DATA_REPLICAS_WANT(sb) > BCH_REPLICAS_MAX) return "Invalid number of data replicas"; if (!BCH_SB_DATA_REPLICAS_REQ(sb) || - BCH_SB_DATA_REPLICAS_REQ(sb) >= BCH_REPLICAS_MAX) + BCH_SB_DATA_REPLICAS_REQ(sb) > BCH_REPLICAS_MAX) return "Invalid number of data replicas"; if (BCH_SB_META_CSUM_TYPE(sb) >= BCH_CSUM_OPT_NR) @@ -375,7 +375,6 @@ static void bch2_sb_update(struct bch_fs *c) ca->mi = bch2_mi_to_cpu(mi->members + i); } -/* doesn't copy member info */ static void __copy_super(struct bch_sb_handle *dst_handle, struct bch_sb *src) { struct bch_sb_field *src_f, *dst_f; @@ -767,15 +766,13 @@ int bch2_write_super(struct bch_fs *c) nr_wrote = dev_mask_nr(&sb_written); can_mount_with_written = - bch2_have_enough_devs(__bch2_replicas_status(c, sb_written), - BCH_FORCE_IF_DEGRADED); + bch2_have_enough_devs(c, sb_written, BCH_FORCE_IF_DEGRADED, false); for (i = 0; i < ARRAY_SIZE(sb_written.d); i++) sb_written.d[i] = ~sb_written.d[i]; can_mount_without_written = - bch2_have_enough_devs(__bch2_replicas_status(c, sb_written), - 
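/*
 * Review note: earlier in super-io.c the replicas-count validation
 * bound is relaxed from >= BCH_REPLICAS_MAX to > BCH_REPLICAS_MAX,
 * making the maximum value itself legal.  Here in bch2_write_super()
 * the new helper is evaluated on two masks: the devices the
 * superblock actually reached, and their complement (built by
 * inverting sb_written).  The fatal-error condition is also
 * tightened: it now fires whenever we could not mount with the
 * written devices at all, not only in the old "could mount without
 * but not with" case.
 */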
BCH_FORCE_IF_DEGRADED); + bch2_have_enough_devs(c, sb_written, BCH_FORCE_IF_DEGRADED, false); /* * If we would be able to mount _without_ the devices we successfully @@ -786,6 +783,7 @@ int bch2_write_super(struct bch_fs *c) * mount with the devices we did successfully write to: */ if (bch2_fs_fatal_err_on(!nr_wrote || + !can_mount_with_written || (can_mount_without_written && !can_mount_with_written), c, "Unable to write superblock to sufficient devices")) @@ -954,40 +952,35 @@ int bch2_fs_mark_dirty(struct bch_fs *c) mutex_lock(&c->sb_lock); SET_BCH_SB_CLEAN(c->disk_sb.sb, false); - c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_new_extent_overwrite; - c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_extents_above_btree_updates; - c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_btree_updates_journalled; + c->disk_sb.sb->features[0] |= BCH_SB_FEATURES_ALWAYS; ret = bch2_write_super(c); mutex_unlock(&c->sb_lock); return ret; } -static void -entry_init_u64s(struct jset_entry *entry, unsigned u64s) +static struct jset_entry *jset_entry_init(struct jset_entry **end, size_t size) { - memset(entry, 0, u64s * sizeof(u64)); + struct jset_entry *entry = *end; + unsigned u64s = DIV_ROUND_UP(size, sizeof(u64)); + memset(entry, 0, u64s * sizeof(u64)); /* * The u64s field counts from the start of data, ignoring the shared * fields. */ entry->u64s = u64s - 1; -} -static void -entry_init_size(struct jset_entry *entry, size_t size) -{ - unsigned u64s = DIV_ROUND_UP(size, sizeof(u64)); - entry_init_u64s(entry, u64s); + *end = vstruct_next(*end); + return entry; } -struct jset_entry * -bch2_journal_super_entries_add_common(struct bch_fs *c, - struct jset_entry *entry, - u64 journal_seq) +void bch2_journal_super_entries_add_common(struct bch_fs *c, + struct jset_entry **end, + u64 journal_seq) { - unsigned i; + struct bch_dev *ca; + unsigned i, dev; percpu_down_write(&c->mark_lock); @@ -1000,58 +993,77 @@ bch2_journal_super_entries_add_common(struct bch_fs *c, { struct jset_entry_usage *u = - container_of(entry, struct jset_entry_usage, entry); + container_of(jset_entry_init(end, sizeof(*u)), + struct jset_entry_usage, entry); - entry_init_size(entry, sizeof(*u)); u->entry.type = BCH_JSET_ENTRY_usage; u->entry.btree_id = FS_USAGE_INODES; u->v = cpu_to_le64(c->usage_base->nr_inodes); - - entry = vstruct_next(entry); } { struct jset_entry_usage *u = - container_of(entry, struct jset_entry_usage, entry); + container_of(jset_entry_init(end, sizeof(*u)), + struct jset_entry_usage, entry); - entry_init_size(entry, sizeof(*u)); u->entry.type = BCH_JSET_ENTRY_usage; u->entry.btree_id = FS_USAGE_KEY_VERSION; u->v = cpu_to_le64(atomic64_read(&c->key_version)); - - entry = vstruct_next(entry); } for (i = 0; i < BCH_REPLICAS_MAX; i++) { struct jset_entry_usage *u = - container_of(entry, struct jset_entry_usage, entry); + container_of(jset_entry_init(end, sizeof(*u)), + struct jset_entry_usage, entry); - entry_init_size(entry, sizeof(*u)); u->entry.type = BCH_JSET_ENTRY_usage; u->entry.btree_id = FS_USAGE_RESERVED; u->entry.level = i; u->v = cpu_to_le64(c->usage_base->persistent_reserved[i]); - - entry = vstruct_next(entry); } for (i = 0; i < c->replicas.nr; i++) { struct bch_replicas_entry *e = cpu_replicas_entry(&c->replicas, i); struct jset_entry_data_usage *u = - container_of(entry, struct jset_entry_data_usage, entry); + container_of(jset_entry_init(end, sizeof(*u) + e->nr_devs), + struct jset_entry_data_usage, entry); - entry_init_size(entry, sizeof(*u) + e->nr_devs); u->entry.type = 
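/*
 * Review note: jset_entry_init() folds the old entry_init_u64s() /
 * entry_init_size() pair and the repeated entry = vstruct_next(entry)
 * stepping into one helper: it zeroes the entry, sets entry->u64s to
 * DIV_ROUND_UP(size, sizeof(u64)) - 1 (u64s counts from the start of
 * the data, ignoring the shared header, hence the "- 1"), and
 * advances *end past the entry.  Each usage block below then only
 * fills in its type and payload.
 */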
BCH_JSET_ENTRY_data_usage; u->v = cpu_to_le64(c->usage_base->replicas[i]); memcpy(&u->r, e, replicas_entry_bytes(e)); + } - entry = vstruct_next(entry); + for_each_member_device(ca, c, dev) { + unsigned b = sizeof(struct jset_entry_dev_usage) + + sizeof(struct jset_entry_dev_usage_type) * BCH_DATA_NR; + struct jset_entry_dev_usage *u = + container_of(jset_entry_init(end, b), + struct jset_entry_dev_usage, entry); + + u->entry.type = BCH_JSET_ENTRY_dev_usage; + u->dev = cpu_to_le32(dev); + u->buckets_ec = cpu_to_le64(ca->usage_base->buckets_ec); + u->buckets_unavailable = cpu_to_le64(ca->usage_base->buckets_unavailable); + + for (i = 0; i < BCH_DATA_NR; i++) { + u->d[i].buckets = cpu_to_le64(ca->usage_base->d[i].buckets); + u->d[i].sectors = cpu_to_le64(ca->usage_base->d[i].sectors); + u->d[i].fragmented = cpu_to_le64(ca->usage_base->d[i].fragmented); + } } percpu_up_write(&c->mark_lock); - return entry; + for (i = 0; i < 2; i++) { + struct jset_entry_clock *clock = + container_of(jset_entry_init(end, sizeof(*clock)), + struct jset_entry_clock, entry); + + clock->entry.type = BCH_JSET_ENTRY_clock; + clock->rw = i; + clock->time = atomic64_read(&c->io_clock[i].now); + } } void bch2_fs_mark_clean(struct bch_fs *c) @@ -1066,8 +1078,8 @@ void bch2_fs_mark_clean(struct bch_fs *c) SET_BCH_SB_CLEAN(c->disk_sb.sb, true); - c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_FEAT_ALLOC_INFO; - c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_FEAT_ALLOC_METADATA; + c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_alloc_info; + c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_alloc_metadata; c->disk_sb.sb->features[0] &= ~(1ULL << BCH_FEATURE_extents_above_btree_updates); c->disk_sb.sb->features[0] &= ~(1ULL << BCH_FEATURE_btree_updates_journalled); @@ -1080,15 +1092,13 @@ void bch2_fs_mark_clean(struct bch_fs *c) } sb_clean->flags = 0; - sb_clean->read_clock = cpu_to_le16(c->bucket_clock[READ].hand); - sb_clean->write_clock = cpu_to_le16(c->bucket_clock[WRITE].hand); sb_clean->journal_seq = cpu_to_le64(journal_cur_seq(&c->journal) - 1); /* Trying to catch outstanding bug: */ BUG_ON(le64_to_cpu(sb_clean->journal_seq) > S64_MAX); entry = sb_clean->start; - entry = bch2_journal_super_entries_add_common(c, entry, 0); + bch2_journal_super_entries_add_common(c, &entry, 0); entry = bch2_btree_roots_to_journal_entries(c, entry, entry); BUG_ON((void *) entry > vstruct_end(&sb_clean->field)); diff --git a/fs/bcachefs/super-io.h b/fs/bcachefs/super-io.h index 7a068158efca..1a35124f5f47 100644 --- a/fs/bcachefs/super-io.h +++ b/fs/bcachefs/super-io.h @@ -122,9 +122,8 @@ static inline struct bch_member_cpu bch2_mi_to_cpu(struct bch_member *mi) /* BCH_SB_FIELD_clean: */ -struct jset_entry * -bch2_journal_super_entries_add_common(struct bch_fs *, - struct jset_entry *, u64); +void bch2_journal_super_entries_add_common(struct bch_fs *, + struct jset_entry **, u64); void bch2_sb_clean_renumber(struct bch_sb_field_clean *, int); diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 2d5a9b1c6a9d..b94bbca42446 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -148,42 +148,21 @@ struct bch_fs *bch2_uuid_to_fs(uuid_le uuid) return c; } -int bch2_congested(void *data, int bdi_bits) +static void bch2_dev_usage_journal_reserve(struct bch_fs *c) { - struct bch_fs *c = data; - struct backing_dev_info *bdi; struct bch_dev *ca; - unsigned i; - int ret = 0; + unsigned i, nr = 0, u64s = + ((sizeof(struct jset_entry_dev_usage) + + sizeof(struct jset_entry_dev_usage_type) * BCH_DATA_NR)) / + sizeof(u64); rcu_read_lock(); - 
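/*
 * Review note: bch2_congested() is deleted outright, presumably
 * because the bdi congestion interface it wrapped is on its way out
 * of the block layer.  In its place, bch2_dev_usage_journal_reserve()
 * sizes a journal entry reservation: one jset_entry_dev_usage plus
 * BCH_DATA_NR jset_entry_dev_usage_type payloads per member device,
 * converted to u64 units, so per-device usage written by
 * bch2_journal_super_entries_add_common() always has room.
 */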
if (bdi_bits & (1 << WB_sync_congested)) { - /* Reads - check all devices: */ - for_each_readable_member(ca, c, i) { - bdi = ca->disk_sb.bdev->bd_bdi; - - if (bdi_congested(bdi, bdi_bits)) { - ret = 1; - break; - } - } - } else { - const struct bch_devs_mask *devs = - bch2_target_to_mask(c, c->opts.foreground_target) ?: - &c->rw_devs[BCH_DATA_user]; - - for_each_member_device_rcu(ca, c, i, devs) { - bdi = ca->disk_sb.bdev->bd_bdi; - - if (bdi_congested(bdi, bdi_bits)) { - ret = 1; - break; - } - } - } + for_each_member_device_rcu(ca, c, i, NULL) + nr++; rcu_read_unlock(); - return ret; + bch2_journal_entry_res_resize(&c->journal, + &c->dev_usage_journal_res, u64s * nr); } /* Filesystem RO/RW: */ @@ -212,9 +191,6 @@ static void __bch2_fs_read_only(struct bch_fs *c) bch2_copygc_stop(c); bch2_gc_thread_stop(c); - bch2_io_timer_del(&c->io_clock[READ], &c->bucket_clock[READ].rescale); - bch2_io_timer_del(&c->io_clock[WRITE], &c->bucket_clock[WRITE].rescale); - /* * Flush journal before stopping allocators, because flushing journal * blacklist entries involves allocating new btree nodes: @@ -273,10 +249,7 @@ nowrote_alloc: * the journal kicks off btree writes via reclaim - wait for in flight * writes after stopping journal: */ - if (test_bit(BCH_FS_EMERGENCY_RO, &c->flags)) - bch2_btree_flush_all_writes(c); - else - bch2_btree_verify_flushed(c); + bch2_btree_flush_all_writes(c); /* * After stopping journal: @@ -440,9 +413,6 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early) bch2_dev_allocator_add(c, ca); bch2_recalc_capacity(c); - bch2_io_timer_add(&c->io_clock[READ], &c->bucket_clock[READ].rescale); - bch2_io_timer_add(&c->io_clock[WRITE], &c->bucket_clock[WRITE].rescale); - for_each_rw_member(ca, c, i) { ret = bch2_dev_allocator_start(ca); if (ret) { @@ -454,6 +424,9 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early) set_bit(BCH_FS_ALLOCATOR_RUNNING, &c->flags); + for_each_rw_member(ca, c, i) + bch2_wake_allocator(ca); + ret = bch2_journal_reclaim_start(&c->journal); if (ret) { bch_err(c, "error starting journal reclaim: %i", ret); @@ -725,6 +698,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) bch2_blacklist_entries_gc); INIT_LIST_HEAD(&c->journal_entries); + INIT_LIST_HEAD(&c->journal_iters); INIT_LIST_HEAD(&c->fsck_errors); mutex_init(&c->fsck_error_lock); @@ -825,6 +799,14 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) bch2_dev_alloc(c, i)) goto err; + bch2_journal_entry_res_resize(&c->journal, + &c->btree_root_journal_res, + BTREE_ID_NR * (JSET_KEYS_U64s + BKEY_BTREE_PTR_U64s_MAX)); + bch2_dev_usage_journal_reserve(c); + bch2_journal_entry_res_resize(&c->journal, + &c->clock_journal_res, + (sizeof(struct jset_entry_clock) / sizeof(u64)) * 2); + mutex_lock(&bch_fs_list_lock); err = bch2_fs_online(c); mutex_unlock(&bch_fs_list_lock); @@ -1022,6 +1004,8 @@ static void bch2_dev_release(struct kobject *kobj) static void bch2_dev_free(struct bch_dev *ca) { + bch2_dev_allocator_stop(ca); + cancel_work_sync(&ca->io_error_work); if (ca->kobj.state_in_sysfs && @@ -1190,6 +1174,14 @@ static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx) if (!ca) goto err; + ca->fs = c; + + if (ca->mi.state == BCH_MEMBER_STATE_RW && + bch2_dev_allocator_start(ca)) { + bch2_dev_free(ca); + goto err; + } + bch2_dev_attach(c, ca, dev_idx); out: pr_verbose_init(c->opts, "ret %i", ret); @@ -1260,13 +1252,6 @@ static int bch2_dev_attach_bdev(struct bch_fs *c, struct bch_sb_handle *sb) if (ret) return ret; - if 
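/*
 * Review note: the ad-hoc fixup removed here -- remarking the
 * superblock on attach if its buckets looked unmarked -- is replaced
 * by bch2_trans_mark_dev_sb() calls in the device add/online paths
 * further down, so superblock and journal buckets are now accounted
 * through the transactional path instead of a direct
 * bch2_mark_dev_superblock() call.
 */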
(test_bit(BCH_FS_ALLOC_READ_DONE, &c->flags) && - !percpu_u64_get(&ca->usage[0]->buckets[BCH_DATA_sb])) { - mutex_lock(&c->sb_lock); - bch2_mark_dev_superblock(ca->fs, ca, 0); - mutex_unlock(&c->sb_lock); - } - bch2_dev_sysfs_online(c, ca); if (c->sb.nr_devices == 1) @@ -1292,7 +1277,6 @@ bool bch2_dev_state_allowed(struct bch_fs *c, struct bch_dev *ca, enum bch_member_state new_state, int flags) { struct bch_devs_mask new_online_devs; - struct replicas_status s; struct bch_dev *ca2; int i, nr_rw = 0, required; @@ -1328,9 +1312,7 @@ bool bch2_dev_state_allowed(struct bch_fs *c, struct bch_dev *ca, new_online_devs = bch2_online_devs(c); __clear_bit(ca->dev_idx, new_online_devs.d); - s = __bch2_replicas_status(c, new_online_devs); - - return bch2_have_enough_devs(s, flags); + return bch2_have_enough_devs(c, new_online_devs, flags, false); default: BUG(); } @@ -1338,14 +1320,18 @@ bool bch2_dev_state_allowed(struct bch_fs *c, struct bch_dev *ca, static bool bch2_fs_may_start(struct bch_fs *c) { - struct replicas_status s; struct bch_sb_field_members *mi; struct bch_dev *ca; - unsigned i, flags = c->opts.degraded - ? BCH_FORCE_IF_DEGRADED - : 0; + unsigned i, flags = 0; + + if (c->opts.very_degraded) + flags |= BCH_FORCE_IF_DEGRADED|BCH_FORCE_IF_LOST; - if (!c->opts.degraded) { + if (c->opts.degraded) + flags |= BCH_FORCE_IF_DEGRADED; + + if (!c->opts.degraded && + !c->opts.very_degraded) { mutex_lock(&c->sb_lock); mi = bch2_sb_get_members(c->disk_sb.sb); @@ -1365,9 +1351,7 @@ static bool bch2_fs_may_start(struct bch_fs *c) mutex_unlock(&c->sb_lock); } - s = bch2_replicas_status(c); - - return bch2_have_enough_devs(s, flags); + return bch2_have_enough_devs(c, bch2_online_devs(c), flags, true); } static void __bch2_dev_read_only(struct bch_fs *c, struct bch_dev *ca) @@ -1568,6 +1552,8 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags) mutex_unlock(&c->sb_lock); up_write(&c->state_lock); + + bch2_dev_usage_journal_reserve(c); return 0; err: if (ca->mi.state == BCH_MEMBER_STATE_RW && @@ -1577,19 +1563,6 @@ err: return ret; } -static void dev_usage_clear(struct bch_dev *ca) -{ - struct bucket_array *buckets; - - percpu_memset(ca->usage[0], 0, sizeof(*ca->usage[0])); - - down_read(&ca->bucket_lock); - buckets = bucket_array(ca); - - memset(buckets->b, 0, sizeof(buckets->b[0]) * buckets->nbuckets); - up_read(&ca->bucket_lock); -} - /* Add new device to running filesystem: */ int bch2_dev_add(struct bch_fs *c, const char *path) { @@ -1640,15 +1613,13 @@ int bch2_dev_add(struct bch_fs *c, const char *path) * allocate the journal, reset all the marks, then remark after we * attach... 
*/ - bch2_mark_dev_superblock(ca->fs, ca, 0); + bch2_mark_dev_superblock(NULL, ca, 0); err = "journal alloc failed"; ret = bch2_dev_journal_alloc(ca); if (ret) goto err; - dev_usage_clear(ca); - down_write(&c->state_lock); mutex_lock(&c->sb_lock); @@ -1699,15 +1670,15 @@ have_slot: ca->disk_sb.sb->dev_idx = dev_idx; bch2_dev_attach(c, ca, dev_idx); - bch2_mark_dev_superblock(c, ca, 0); - bch2_write_super(c); mutex_unlock(&c->sb_lock); - err = "alloc write failed"; - ret = bch2_dev_alloc_write(c, ca, 0); + bch2_dev_usage_journal_reserve(c); + + err = "error marking superblock"; + ret = bch2_trans_mark_dev_sb(c, NULL, ca); if (ret) - goto err; + goto err_late; if (ca->mi.state == BCH_MEMBER_STATE_RW) { err = __bch2_dev_read_write(c, ca); @@ -1728,6 +1699,7 @@ err: bch_err(c, "Unable to add device: %s", err); return ret; err_late: + up_write(&c->state_lock); bch_err(c, "Error going rw after adding device: %s", err); return -EINVAL; } @@ -1763,6 +1735,12 @@ int bch2_dev_online(struct bch_fs *c, const char *path) } ca = bch_dev_locked(c, dev_idx); + + if (bch2_trans_mark_dev_sb(c, NULL, ca)) { + err = "bch2_trans_mark_dev_sb() error"; + goto err; + } + if (ca->mi.state == BCH_MEMBER_STATE_RW) { err = __bch2_dev_read_write(c, ca); if (err) diff --git a/fs/bcachefs/super.h b/fs/bcachefs/super.h index 048ffec622af..02c81f3555c3 100644 --- a/fs/bcachefs/super.h +++ b/fs/bcachefs/super.h @@ -199,7 +199,6 @@ static inline struct bch_devs_mask bch2_online_devs(struct bch_fs *c) struct bch_fs *bch2_bdev_to_fs(struct block_device *); struct bch_fs *bch2_uuid_to_fs(uuid_le); -int bch2_congested(void *, int); bool bch2_dev_state_allowed(struct bch_fs *, struct bch_dev *, enum bch_member_state, int); diff --git a/fs/bcachefs/super_types.h b/fs/bcachefs/super_types.h index 20406ebd6f5b..069973a38f12 100644 --- a/fs/bcachefs/super_types.h +++ b/fs/bcachefs/super_types.h @@ -20,7 +20,7 @@ struct bch_devs_mask { struct bch_devs_list { u8 nr; - u8 devs[BCH_REPLICAS_MAX + 1]; + u8 devs[BCH_BKEY_PTRS_MAX]; }; struct bch_member_cpu { diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index cc13fc258115..bc4c3a77ea62 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -199,9 +199,6 @@ read_attribute(new_stripes); rw_attribute(pd_controllers_update_seconds); -read_attribute(meta_replicas_have); -read_attribute(data_replicas_have); - read_attribute(io_timers_read); read_attribute(io_timers_write); @@ -347,9 +344,6 @@ SHOW(bch2_fs) sysfs_print(promote_whole_extents, c->promote_whole_extents); - sysfs_printf(meta_replicas_have, "%i", bch2_replicas_online(c, true)); - sysfs_printf(data_replicas_have, "%i", bch2_replicas_online(c, false)); - /* Debugging: */ if (attr == &sysfs_alloc_debug) @@ -475,7 +469,7 @@ STORE(bch2_fs) */ #if 0 down_read(&c->state_lock); - bch2_gc(c, NULL, false, false); + bch2_gc(c, false, false); up_read(&c->state_lock); #else bch2_gc_gens(c); @@ -520,9 +514,6 @@ struct attribute *bch2_fs_files[] = { &sysfs_btree_node_size, &sysfs_btree_cache_size, - &sysfs_meta_replicas_have, - &sysfs_data_replicas_have, - &sysfs_journal_write_delay_ms, &sysfs_journal_reclaim_delay_ms, @@ -705,7 +696,7 @@ static unsigned bucket_last_io_fn(struct bch_fs *c, struct bch_dev *ca, { int rw = (private ? 
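/*
 * Review note: bucket "last I/O" ages are now computed directly as
 * the distance between the filesystem-wide I/O clock and the bucket's
 * recorded io_time:
 *
 *     atomic64_read(&c->io_clock[rw].now) - bucket(ca, b)->io_time[rw]
 *
 * replacing the bucket_last_io() helper and matching the removal of
 * the per-device bucket_clock rescale timers from super.c above.
 */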
1 : 0); - return bucket_last_io(c, bucket(ca, b), rw); + return atomic64_read(&c->io_clock[rw].now) - bucket(ca, b)->io_time[rw]; } static unsigned bucket_sectors_used_fn(struct bch_fs *c, struct bch_dev *ca, @@ -718,7 +709,7 @@ static unsigned bucket_sectors_used_fn(struct bch_fs *c, struct bch_dev *ca, static unsigned bucket_oldest_gen_fn(struct bch_fs *c, struct bch_dev *ca, size_t b, void *private) { - return bucket_gc_gen(ca, b); + return bucket_gc_gen(bucket(ca, b)); } static int unsigned_cmp(const void *_l, const void *_r) @@ -797,63 +788,42 @@ static void dev_alloc_debug_to_text(struct printbuf *out, struct bch_dev *ca) nr[c->open_buckets[i].type]++; pr_buf(out, - "free_inc: %zu/%zu\n" - "free[RESERVE_BTREE]: %zu/%zu\n" - "free[RESERVE_MOVINGGC]: %zu/%zu\n" - "free[RESERVE_NONE]: %zu/%zu\n" - "buckets:\n" - " capacity: %llu\n" - " alloc: %llu\n" - " sb: %llu\n" - " journal: %llu\n" - " meta: %llu\n" - " user: %llu\n" - " cached: %llu\n" - " erasure coded: %llu\n" - " available: %lli\n" - "sectors:\n" - " sb: %llu\n" - " journal: %llu\n" - " meta: %llu\n" - " user: %llu\n" - " cached: %llu\n" - " erasure coded: %llu\n" - " fragmented: %llu\n" - " copygc threshold: %llu\n" - "freelist_wait: %s\n" - "open buckets: %u/%u (reserved %u)\n" - "open_buckets_wait: %s\n" - "open_buckets_btree: %u\n" - "open_buckets_user: %u\n" - "btree reserve cache: %u\n", - fifo_used(&ca->free_inc), ca->free_inc.size, - fifo_used(&ca->free[RESERVE_BTREE]), ca->free[RESERVE_BTREE].size, - fifo_used(&ca->free[RESERVE_MOVINGGC]), ca->free[RESERVE_MOVINGGC].size, - fifo_used(&ca->free[RESERVE_NONE]), ca->free[RESERVE_NONE].size, - ca->mi.nbuckets - ca->mi.first_bucket, - stats.buckets_alloc, - stats.buckets[BCH_DATA_sb], - stats.buckets[BCH_DATA_journal], - stats.buckets[BCH_DATA_btree], - stats.buckets[BCH_DATA_user], - stats.buckets[BCH_DATA_cached], - stats.buckets_ec, - __dev_buckets_available(ca, stats), - stats.sectors[BCH_DATA_sb], - stats.sectors[BCH_DATA_journal], - stats.sectors[BCH_DATA_btree], - stats.sectors[BCH_DATA_user], - stats.sectors[BCH_DATA_cached], - stats.sectors_ec, - stats.sectors_fragmented, - c->copygc_threshold, - c->freelist_wait.list.first ? "waiting" : "empty", - c->open_buckets_nr_free, OPEN_BUCKETS_COUNT, - BTREE_NODE_OPEN_BUCKET_RESERVE, - c->open_buckets_wait.list.first ? "waiting" : "empty", - nr[BCH_DATA_btree], - nr[BCH_DATA_user], - c->btree_reserve_cache_nr); + "\t\t buckets\t sectors fragmented\n" + "capacity%16llu\n", + ca->mi.nbuckets - ca->mi.first_bucket); + + for (i = 1; i < BCH_DATA_NR; i++) + pr_buf(out, "%-8s%16llu%16llu%16llu\n", + bch2_data_types[i], stats.d[i].buckets, + stats.d[i].sectors, stats.d[i].fragmented); + + pr_buf(out, + "ec\t%16llu\n" + "available%15llu\n" + "alloc\t%16llu\n" + "\n" + "free_inc\t\t%zu/%zu\n" + "free[RESERVE_MOVINGGC]\t%zu/%zu\n" + "free[RESERVE_NONE]\t%zu/%zu\n" + "freelist_wait\t\t%s\n" + "open buckets\t\t%u/%u (reserved %u)\n" + "open_buckets_wait\t%s\n" + "open_buckets_btree\t%u\n" + "open_buckets_user\t%u\n" + "btree reserve cache\t%u\n", + stats.buckets_ec, + __dev_buckets_available(ca, stats), + stats.buckets_alloc, + fifo_used(&ca->free_inc), ca->free_inc.size, + fifo_used(&ca->free[RESERVE_MOVINGGC]), ca->free[RESERVE_MOVINGGC].size, + fifo_used(&ca->free[RESERVE_NONE]), ca->free[RESERVE_NONE].size, + c->freelist_wait.list.first ? "waiting" : "empty", + c->open_buckets_nr_free, OPEN_BUCKETS_COUNT, + BTREE_NODE_OPEN_BUCKET_RESERVE, + c->open_buckets_wait.list.first ? 
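/*
 * Review note: the per-device allocation debug output is rewritten
 * from one giant format string into a loop over BCH_DATA_NR data
 * types, printing buckets/sectors/fragmented columns from the new
 * consolidated stats.d[] layout instead of the old parallel
 * stats.buckets[] and stats.sectors[] arrays.
 */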
"waiting" : "empty", + nr[BCH_DATA_btree], + nr[BCH_DATA_user], + c->btree_reserve_cache_nr); } static const char * const bch2_rw[] = { diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h index e8a7df61ff5c..c69b05deec41 100644 --- a/fs/bcachefs/util.h +++ b/fs/bcachefs/util.h @@ -88,7 +88,7 @@ static inline void *vpmalloc(size_t size, gfp_t gfp_mask) { return (void *) __get_free_pages(gfp_mask|__GFP_NOWARN, get_order(size)) ?: - __vmalloc(size, gfp_mask, PAGE_KERNEL); + __vmalloc(size, gfp_mask); } static inline void kvpfree(void *p, size_t size) @@ -653,35 +653,6 @@ static inline void memset_u64s_tail(void *s, int c, unsigned bytes) memset(s + bytes, c, rem); } -static inline struct bio_vec next_contig_bvec(struct bio *bio, - struct bvec_iter *iter) -{ - struct bio_vec bv = bio_iter_iovec(bio, *iter); - - bio_advance_iter(bio, iter, bv.bv_len); -#ifndef CONFIG_HIGHMEM - while (iter->bi_size) { - struct bio_vec next = bio_iter_iovec(bio, *iter); - - if (page_address(bv.bv_page) + bv.bv_offset + bv.bv_len != - page_address(next.bv_page) + next.bv_offset) - break; - - bv.bv_len += next.bv_len; - bio_advance_iter(bio, iter, next.bv_len); - } -#endif - return bv; -} - -#define __bio_for_each_contig_segment(bv, bio, iter, start) \ - for (iter = (start); \ - (iter).bi_size && \ - ((bv = next_contig_bvec((bio), &(iter))), 1);) - -#define bio_for_each_contig_segment(bv, bio, iter) \ - __bio_for_each_contig_segment(bv, bio, iter, (bio)->bi_iter) - void sort_cmp_size(void *base, size_t num, size_t size, int (*cmp_func)(const void *, const void *, size_t), void (*swap_func)(void *, void *, size_t)); @@ -776,4 +747,9 @@ u64 *bch2_acc_percpu_u64s(u64 __percpu *, unsigned); #define cmp_int(l, r) ((l > r) - (l < r)) +static inline int u8_cmp(u8 l, u8 r) +{ + return cmp_int(l, r); +} + #endif /* _BCACHEFS_UTIL_H */ diff --git a/kernel/locking/six.c b/kernel/locking/six.c index 3acee748e052..49d46ed2e18e 100644 --- a/kernel/locking/six.c +++ b/kernel/locking/six.c @@ -15,7 +15,7 @@ #endif #define six_acquire(l, t) lock_acquire(l, 0, t, 0, 0, NULL, _RET_IP_) -#define six_release(l) lock_release(l, 0, _RET_IP_) +#define six_release(l) lock_release(l, _RET_IP_) struct six_lock_vals { /* Value we add to the lock in order to take the lock: */ |