Diffstat (limited to 'fs/bcachefs/btree_io.c')
-rw-r--r-- | fs/bcachefs/btree_io.c | 686
1 file changed, 150 insertions, 536 deletions
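The bulk of the deletions below are the open-coded merge-sort machinery (sort_iter, sort_keys, sort_extents, sort_repack, sort_repack_merge), which moves into the shared bkey_sort.c helpers now pulled in via "bkey_sort.h". The removed sort_iter is a small k-way merge: each bset contributes a [k, end) range of already-sorted keys, the ranges are kept ordered by their head key, and the smallest head is repeatedly emitted while the winning range is advanced and sifted back into place. Below is a standalone sketch of that technique, using plain ints in place of struct bkey_packed and a hardcoded comparison in place of sort_cmp_fn; all names here are illustrative, not bcachefs API.

	/*
	 * Sketch of the sort_iter k-way merge removed in this diff:
	 * plain ints stand in for struct bkey_packed.
	 */
	#include <stdio.h>

	#define MAX_SETS 4	/* the real code bounds this by MAX_BSETS + 1 */

	struct sort_iter_set { const int *k, *end; };

	struct sort_iter {
		unsigned used;
		struct sort_iter_set data[MAX_SETS];
	};

	static void sort_iter_add(struct sort_iter *iter,
				  const int *k, const int *end)
	{
		if (k != end)
			iter->data[iter->used++] =
				(struct sort_iter_set) { k, end };
	}

	/* Bubble data[from] back until the heads are in ascending order: */
	static void sort_iter_sift(struct sort_iter *iter, unsigned from)
	{
		unsigned i;

		for (i = from;
		     i + 1 < iter->used &&
		     *iter->data[i].k > *iter->data[i + 1].k;
		     i++) {
			struct sort_iter_set t = iter->data[i];

			iter->data[i]     = iter->data[i + 1];
			iter->data[i + 1] = t;
		}
	}

	static void sort_iter_sort(struct sort_iter *iter)
	{
		unsigned i = iter->used;

		while (i--)
			sort_iter_sift(iter, i);
	}

	/* Emit the smallest head, advance its range, restore the invariant: */
	static const int *sort_iter_next(struct sort_iter *iter)
	{
		const int *ret;
		unsigned i;

		if (!iter->used)
			return NULL;

		ret = iter->data[0].k++;

		if (iter->data[0].k == iter->data[0].end) {
			/* range exhausted: drop it, keeping the rest in order */
			for (i = 0; i + 1 < iter->used; i++)
				iter->data[i] = iter->data[i + 1];
			iter->used--;
		} else {
			sort_iter_sift(iter, 0);
		}

		return ret;
	}

	int main(void)
	{
		int a[] = { 1, 4, 9 }, b[] = { 2, 3, 10 }, c[] = { 5 };
		struct sort_iter iter = { 0 };
		const int *k;

		sort_iter_add(&iter, a, a + 3);
		sort_iter_add(&iter, b, b + 3);
		sort_iter_add(&iter, c, c + 1);
		sort_iter_sort(&iter);

		while ((k = sort_iter_next(&iter)))
			printf("%d ", *k);	/* prints: 1 2 3 4 5 9 10 */
		printf("\n");
		return 0;
	}

With only a handful of ranges in play, the linear sift per advance stays cheap and avoids heap bookkeeping; the heap-based btree_node_iter_large variant, also deleted below, covered the same job for the read path.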
diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c
index 74ffad4c38f3..52c3f51f02cb 100644
--- a/fs/bcachefs/btree_io.c
+++ b/fs/bcachefs/btree_io.c
@@ -1,6 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0
 #include "bcachefs.h"
 #include "bkey_methods.h"
+#include "bkey_sort.h"
 #include "btree_cache.h"
 #include "btree_io.h"
 #include "btree_iter.h"
@@ -19,43 +21,6 @@
 
 #include <trace/events/bcachefs.h>
 
-/* btree_node_iter_large: */
-
-#define btree_node_iter_cmp_heap(h, _l, _r)				\
-	__btree_node_iter_cmp((iter)->is_extents, b,			\
-			      __btree_node_offset_to_key(b, (_l).k),	\
-			      __btree_node_offset_to_key(b, (_r).k))
-
-void bch2_btree_node_iter_large_push(struct btree_node_iter_large *iter,
-				     struct btree *b,
-				     const struct bkey_packed *k,
-				     const struct bkey_packed *end)
-{
-	if (k != end) {
-		struct btree_node_iter_set n =
-			((struct btree_node_iter_set) {
-				__btree_node_key_to_offset(b, k),
-				__btree_node_key_to_offset(b, end)
-			});
-
-		__heap_add(iter, n, btree_node_iter_cmp_heap);
-	}
-}
-
-void bch2_btree_node_iter_large_advance(struct btree_node_iter_large *iter,
-					struct btree *b)
-{
-	iter->data->k += __btree_node_offset_to_key(b, iter->data->k)->u64s;
-
-	EBUG_ON(!iter->used);
-	EBUG_ON(iter->data->k > iter->data->end);
-
-	if (iter->data->k == iter->data->end)
-		heap_del(iter, 0, btree_node_iter_cmp_heap);
-	else
-		heap_sift_down(iter, 0, btree_node_iter_cmp_heap);
-}
-
 static void verify_no_dups(struct btree *b,
 			   struct bkey_packed *start,
 			   struct bkey_packed *end)
@@ -116,190 +81,6 @@ static void *btree_bounce_alloc(struct bch_fs *c, unsigned order,
 	return mempool_alloc(&c->btree_bounce_pool, GFP_NOIO);
 }
 
-typedef int (*sort_cmp_fn)(struct btree *,
-			   struct bkey_packed *,
-			   struct bkey_packed *);
-
-struct sort_iter {
-	struct btree	*b;
-	unsigned	used;
-
-	struct sort_iter_set {
-		struct bkey_packed *k, *end;
-	} data[MAX_BSETS + 1];
-};
-
-static void sort_iter_init(struct sort_iter *iter, struct btree *b)
-{
-	memset(iter, 0, sizeof(*iter));
-	iter->b = b;
-}
-
-static inline void __sort_iter_sift(struct sort_iter *iter,
-				    unsigned from,
-				    sort_cmp_fn cmp)
-{
-	unsigned i;
-
-	for (i = from;
-	     i + 1 < iter->used &&
-	     cmp(iter->b, iter->data[i].k, iter->data[i + 1].k) > 0;
-	     i++)
-		swap(iter->data[i], iter->data[i + 1]);
-}
-
-static inline void sort_iter_sift(struct sort_iter *iter, sort_cmp_fn cmp)
-{
-
-	__sort_iter_sift(iter, 0, cmp);
-}
-
-static inline void sort_iter_sort(struct sort_iter *iter, sort_cmp_fn cmp)
-{
-	unsigned i = iter->used;
-
-	while (i--)
-		__sort_iter_sift(iter, i, cmp);
-}
-
-static void sort_iter_add(struct sort_iter *iter,
-			  struct bkey_packed *k,
-			  struct bkey_packed *end)
-{
-	BUG_ON(iter->used >= ARRAY_SIZE(iter->data));
-
-	if (k != end)
-		iter->data[iter->used++] = (struct sort_iter_set) { k, end };
-}
-
-static inline struct bkey_packed *sort_iter_peek(struct sort_iter *iter)
-{
-	return iter->used ? iter->data->k : NULL;
-}
-
-static inline void sort_iter_advance(struct sort_iter *iter, sort_cmp_fn cmp)
-{
-	iter->data->k = bkey_next(iter->data->k);
-
-	BUG_ON(iter->data->k > iter->data->end);
-
-	if (iter->data->k == iter->data->end)
-		array_remove_item(iter->data, iter->used, 0);
-	else
-		sort_iter_sift(iter, cmp);
-}
-
-static inline struct bkey_packed *sort_iter_next(struct sort_iter *iter,
-						 sort_cmp_fn cmp)
-{
-	struct bkey_packed *ret = sort_iter_peek(iter);
-
-	if (ret)
-		sort_iter_advance(iter, cmp);
-
-	return ret;
-}
-
-static inline int sort_key_whiteouts_cmp(struct btree *b,
-					 struct bkey_packed *l,
-					 struct bkey_packed *r)
-{
-	return bkey_cmp_packed(b, l, r);
-}
-
-static unsigned sort_key_whiteouts(struct bkey_packed *dst,
-				   struct sort_iter *iter)
-{
-	struct bkey_packed *in, *out = dst;
-
-	sort_iter_sort(iter, sort_key_whiteouts_cmp);
-
-	while ((in = sort_iter_next(iter, sort_key_whiteouts_cmp))) {
-		bkey_copy(out, in);
-		out = bkey_next(out);
-	}
-
-	return (u64 *) out - (u64 *) dst;
-}
-
-static inline int sort_extent_whiteouts_cmp(struct btree *b,
-					    struct bkey_packed *l,
-					    struct bkey_packed *r)
-{
-	struct bkey ul = bkey_unpack_key(b, l);
-	struct bkey ur = bkey_unpack_key(b, r);
-
-	return bkey_cmp(bkey_start_pos(&ul), bkey_start_pos(&ur));
-}
-
-static unsigned sort_extent_whiteouts(struct bkey_packed *dst,
-				      struct sort_iter *iter)
-{
-	const struct bkey_format *f = &iter->b->format;
-	struct bkey_packed *in, *out = dst;
-	struct bkey_i l, r;
-	bool prev = false, l_packed = false;
-	u64 max_packed_size	= bkey_field_max(f, BKEY_FIELD_SIZE);
-	u64 max_packed_offset	= bkey_field_max(f, BKEY_FIELD_OFFSET);
-	u64 new_size;
-
-	max_packed_size = min_t(u64, max_packed_size, KEY_SIZE_MAX);
-
-	sort_iter_sort(iter, sort_extent_whiteouts_cmp);
-
-	while ((in = sort_iter_next(iter, sort_extent_whiteouts_cmp))) {
-		EBUG_ON(bkeyp_val_u64s(f, in));
-		EBUG_ON(in->type != KEY_TYPE_DISCARD);
-
-		r.k = bkey_unpack_key(iter->b, in);
-
-		if (prev &&
-		    bkey_cmp(l.k.p, bkey_start_pos(&r.k)) >= 0) {
-			if (bkey_cmp(l.k.p, r.k.p) >= 0)
-				continue;
-
-			new_size = l_packed
-				? min(max_packed_size, max_packed_offset -
-				      bkey_start_offset(&l.k))
-				: KEY_SIZE_MAX;
-
-			new_size = min(new_size, r.k.p.offset -
-				       bkey_start_offset(&l.k));
-
-			BUG_ON(new_size < l.k.size);
-
-			bch2_key_resize(&l.k, new_size);
-
-			if (bkey_cmp(l.k.p, r.k.p) >= 0)
-				continue;
-
-			bch2_cut_front(l.k.p, &r);
-		}
-
-		if (prev) {
-			if (!bch2_bkey_pack(out, &l, f)) {
-				BUG_ON(l_packed);
-				bkey_copy(out, &l);
-			}
-			out = bkey_next(out);
-		}
-
-		l = r;
-		prev = true;
-		l_packed = bkey_packed(in);
-	}
-
-	if (prev) {
-		if (!bch2_bkey_pack(out, &l, f)) {
-			BUG_ON(l_packed);
-			bkey_copy(out, &l);
-		}
-		out = bkey_next(out);
-	}
-
-	return (u64 *) out - (u64 *) dst;
-}
-
 static unsigned should_compact_bset(struct btree *b, struct bset_tree *t,
 				    bool compacting,
 				    enum compact_mode mode)
@@ -309,7 +90,7 @@ static unsigned should_compact_bset(struct btree *b, struct bset_tree *t,
 
 	if (mode == COMPACT_LAZY) {
 		if (should_compact_bset_lazy(b, t) ||
-		    (compacting && bset_unwritten(b, bset(b, t))))
+		    (compacting && !bset_written(b, bset(b, t))))
 			return dead_u64s;
 	} else {
 		if (bset_written(b, bset(b, t)))
@@ -356,7 +137,7 @@ bool __bch2_compact_whiteouts(struct bch_fs *c, struct btree *b,
 		struct bkey_packed *k, *n, *out, *start, *end;
 		struct btree_node_entry *src = NULL, *dst = NULL;
 
-		if (t != b->set && bset_unwritten(b, i)) {
+		if (t != b->set && !bset_written(b, i)) {
 			src = container_of(i, struct btree_node_entry, keys);
 			dst = max(write_block(b),
 				  (void *) btree_bkey_last(b, t -1));
@@ -396,7 +177,7 @@ bool __bch2_compact_whiteouts(struct bch_fs *c, struct btree *b,
 				continue;
 
 			if (bkey_whiteout(k)) {
-				unreserve_whiteout(b, t, k);
+				unreserve_whiteout(b, k);
 				memcpy_u64s(u_pos, k, bkeyp_key_u64s(f, k));
 				set_bkeyp_val_u64s(f, u_pos, 0);
 				u_pos = bkey_next(u_pos);
@@ -420,11 +201,10 @@ bool __bch2_compact_whiteouts(struct bch_fs *c, struct btree *b,
 	BUG_ON((void *) unwritten_whiteouts_start(c, b) <
 	       (void *) btree_bkey_last(b, bset_tree_last(b)));
 
-	u64s = btree_node_is_extents(b)
-		? sort_extent_whiteouts(unwritten_whiteouts_start(c, b),
-					&sort_iter)
-		: sort_key_whiteouts(unwritten_whiteouts_start(c, b),
-				     &sort_iter);
+	u64s = (btree_node_is_extents(b)
+		? bch2_sort_extent_whiteouts
+		: bch2_sort_key_whiteouts)(unwritten_whiteouts_start(c, b),
+					   &sort_iter);
 
 	BUG_ON(u64s > b->whiteout_u64s);
 	BUG_ON(u64s != b->whiteout_u64s && !btree_node_is_extents(b));
@@ -467,7 +247,7 @@ static bool bch2_drop_whiteouts(struct btree *b)
 		start	= btree_bkey_first(b, t);
 		end	= btree_bkey_last(b, t);
 
-		if (bset_unwritten(b, i) &&
+		if (!bset_written(b, i) &&
 		    t != b->set) {
 			struct bset *dst =
 				max_t(struct bset *, write_block(b),
@@ -499,87 +279,6 @@ static bool bch2_drop_whiteouts(struct btree *b)
 	return ret;
 }
 
-static inline int sort_keys_cmp(struct btree *b,
-				struct bkey_packed *l,
-				struct bkey_packed *r)
-{
-	return bkey_cmp_packed(b, l, r) ?:
-		(int) bkey_whiteout(r) - (int) bkey_whiteout(l) ?:
-		(int) l->needs_whiteout - (int) r->needs_whiteout;
-}
-
-static unsigned sort_keys(struct bkey_packed *dst,
-			  struct sort_iter *iter,
-			  bool filter_whiteouts)
-{
-	const struct bkey_format *f = &iter->b->format;
-	struct bkey_packed *in, *next, *out = dst;
-
-	sort_iter_sort(iter, sort_keys_cmp);
-
-	while ((in = sort_iter_next(iter, sort_keys_cmp))) {
-		if (bkey_whiteout(in) &&
-		    (filter_whiteouts || !in->needs_whiteout))
-			continue;
-
-		if (bkey_whiteout(in) &&
-		    (next = sort_iter_peek(iter)) &&
-		    !bkey_cmp_packed(iter->b, in, next)) {
-			BUG_ON(in->needs_whiteout &&
-			       next->needs_whiteout);
-			/*
-			 * XXX racy, called with read lock from write path
-			 *
-			 * leads to spurious BUG_ON() in bkey_unpack_key() in
-			 * debug mode
-			 */
-			next->needs_whiteout |= in->needs_whiteout;
-			continue;
-		}
-
-		if (bkey_whiteout(in)) {
-			memcpy_u64s(out, in, bkeyp_key_u64s(f, in));
-			set_bkeyp_val_u64s(f, out, 0);
-		} else {
-			bkey_copy(out, in);
-		}
-		out = bkey_next(out);
-	}
-
-	return (u64 *) out - (u64 *) dst;
-}
-
-static inline int sort_extents_cmp(struct btree *b,
-				   struct bkey_packed *l,
-				   struct bkey_packed *r)
-{
-	return bkey_cmp_packed(b, l, r) ?:
-		(int) bkey_deleted(l) - (int) bkey_deleted(r);
-}
-
-static unsigned sort_extents(struct bkey_packed *dst,
-			     struct sort_iter *iter,
-			     bool filter_whiteouts)
-{
-	struct bkey_packed *in, *out = dst;
-
-	sort_iter_sort(iter, sort_extents_cmp);
-
-	while ((in = sort_iter_next(iter, sort_extents_cmp))) {
-		if (bkey_deleted(in))
-			continue;
-
-		if (bkey_whiteout(in) &&
-		    (filter_whiteouts || !in->needs_whiteout))
-			continue;
-
-		bkey_copy(out, in);
-		out = bkey_next(out);
-	}
-
-	return (u64 *) out - (u64 *) dst;
-}
-
 static void btree_node_sort(struct bch_fs *c, struct btree *b,
 			    struct btree_iter *iter,
 			    unsigned start_idx,
@@ -618,16 +317,18 @@ static void btree_node_sort(struct bch_fs *c, struct btree *b,
 	if (btree_node_is_extents(b))
 		filter_whiteouts = bset_written(b, start_bset);
 
-	u64s = btree_node_is_extents(b)
-		? sort_extents(out->keys.start, &sort_iter, filter_whiteouts)
-		: sort_keys(out->keys.start, &sort_iter, filter_whiteouts);
+	u64s = (btree_node_is_extents(b)
+		? bch2_sort_extents
+		: bch2_sort_keys)(out->keys.start,
+				  &sort_iter,
+				  filter_whiteouts);
 
 	out->keys.u64s = cpu_to_le16(u64s);
 
 	BUG_ON(vstruct_end(&out->keys) > (void *) out + (PAGE_SIZE << order));
 
 	if (sorting_entire_node)
-		bch2_time_stats_update(&c->times[BCH_TIME_btree_sort],
+		bch2_time_stats_update(&c->times[BCH_TIME_btree_node_sort],
 				       start_time);
 
 	/* Make sure we preserve bset journal_seq: */
@@ -678,101 +379,6 @@ static void btree_node_sort(struct bch_fs *c, struct btree *b,
 	bch2_verify_btree_nr_keys(b);
 }
 
-/* Sort + repack in a new format: */
-static struct btree_nr_keys sort_repack(struct bset *dst,
-					struct btree *src,
-					struct btree_node_iter *src_iter,
-					struct bkey_format *out_f,
-					bool filter_whiteouts)
-{
-	struct bkey_format *in_f = &src->format;
-	struct bkey_packed *in, *out = vstruct_last(dst);
-	struct btree_nr_keys nr;
-
-	memset(&nr, 0, sizeof(nr));
-
-	while ((in = bch2_btree_node_iter_next_all(src_iter, src))) {
-		if (filter_whiteouts && bkey_whiteout(in))
-			continue;
-
-		if (bch2_bkey_transform(out_f, out, bkey_packed(in)
-				       ? in_f : &bch2_bkey_format_current, in))
-			out->format = KEY_FORMAT_LOCAL_BTREE;
-		else
-			bch2_bkey_unpack(src, (void *) out, in);
-
-		btree_keys_account_key_add(&nr, 0, out);
-		out = bkey_next(out);
-	}
-
-	dst->u64s = cpu_to_le16((u64 *) out - dst->_data);
-	return nr;
-}
-
-/* Sort, repack, and merge: */
-static struct btree_nr_keys sort_repack_merge(struct bch_fs *c,
-					      struct bset *dst,
-					      struct btree *src,
-					      struct btree_node_iter *iter,
-					      struct bkey_format *out_f,
-					      bool filter_whiteouts,
-					      key_filter_fn filter,
-					      key_merge_fn merge)
-{
-	struct bkey_packed *k, *prev = NULL, *out;
-	struct btree_nr_keys nr;
-	BKEY_PADDED(k) tmp;
-
-	memset(&nr, 0, sizeof(nr));
-
-	while ((k = bch2_btree_node_iter_next_all(iter, src))) {
-		if (filter_whiteouts && bkey_whiteout(k))
-			continue;
-
-		/*
-		 * The filter might modify pointers, so we have to unpack the
-		 * key and values to &tmp.k:
-		 */
-		bch2_bkey_unpack(src, &tmp.k, k);
-
-		if (filter && filter(c, src, bkey_i_to_s(&tmp.k)))
-			continue;
-
-		/* prev is always unpacked, for key merging: */
-
-		if (prev &&
-		    merge &&
-		    merge(c, src, (void *) prev, &tmp.k) == BCH_MERGE_MERGE)
-			continue;
-
-		/*
-		 * the current key becomes the new prev: advance prev, then
-		 * copy the current key - but first pack prev (in place):
-		 */
-		if (prev) {
-			bch2_bkey_pack(prev, (void *) prev, out_f);
-
-			btree_keys_account_key_add(&nr, 0, prev);
-			prev = bkey_next(prev);
-		} else {
-			prev = vstruct_last(dst);
-		}
-
-		bkey_copy(prev, &tmp.k);
-	}
-
-	if (prev) {
-		bch2_bkey_pack(prev, (void *) prev, out_f);
-		btree_keys_account_key_add(&nr, 0, prev);
-		out = bkey_next(prev);
-	} else {
-		out = vstruct_last(dst);
-	}
-
-	dst->u64s = cpu_to_le16((u64 *) out - dst->_data);
-	return nr;
-}
-
 void bch2_btree_sort_into(struct bch_fs *c,
 			 struct btree *dst,
 			 struct btree *src)
@@ -785,24 +391,21 @@ void bch2_btree_sort_into(struct bch_fs *c,
 
 	bch2_bset_set_no_aux_tree(dst, dst->set);
 
-	bch2_btree_node_iter_init_from_start(&src_iter, src,
-					     btree_node_is_extents(src));
+	bch2_btree_node_iter_init_from_start(&src_iter, src);
 
-	if (btree_node_ops(src)->key_normalize ||
-	    btree_node_ops(src)->key_merge)
-		nr = sort_repack_merge(c, btree_bset_first(dst),
+	if (btree_node_is_extents(src))
+		nr = bch2_sort_repack_merge(c, btree_bset_first(dst),
 				src, &src_iter,
 				&dst->format,
-				true,
-				btree_node_ops(src)->key_normalize,
-				btree_node_ops(src)->key_merge);
+				true);
 	else
-		nr = sort_repack(btree_bset_first(dst),
+		nr = bch2_sort_repack(btree_bset_first(dst),
 				src, &src_iter,
 				&dst->format,
 				true);
 
-	bch2_time_stats_update(&c->times[BCH_TIME_btree_sort], start_time);
+	bch2_time_stats_update(&c->times[BCH_TIME_btree_node_sort],
+			       start_time);
 
 	set_btree_bset_end(dst, dst->set);
@@ -829,7 +432,7 @@ static bool btree_node_compact(struct bch_fs *c, struct btree *b,
 	for (unwritten_idx = 0;
 	     unwritten_idx < b->nsets;
 	     unwritten_idx++)
-		if (bset_unwritten(b, bset(b, &b->set[unwritten_idx])))
+		if (!bset_written(b, bset(b, &b->set[unwritten_idx])))
 			break;
 
 	if (b->nsets - unwritten_idx > 1) {
@@ -852,7 +455,7 @@ void bch2_btree_build_aux_trees(struct btree *b)
 
 	for_each_bset(b, t)
 		bch2_bset_build_aux_tree(b, t,
-				bset_unwritten(b, bset(b, t)) &&
+				!bset_written(b, bset(b, t)) &&
 				t == bset_tree_last(b));
 }
@@ -907,33 +510,27 @@ static void bset_encrypt(struct bch_fs *c, struct bset *i, unsigned offset)
 
 		bch2_encrypt(c, BSET_CSUM_TYPE(i), nonce, &bn->flags, bytes);
 
-		nonce = nonce_add(nonce, round_up(bytes, CHACHA20_BLOCK_SIZE));
+		nonce = nonce_add(nonce, round_up(bytes, CHACHA_BLOCK_SIZE));
 	}
 
 	bch2_encrypt(c, BSET_CSUM_TYPE(i), nonce, i->_data,
 		     vstruct_end(i) - (void *) i->_data);
 }
 
-static int btree_err_msg(struct bch_fs *c, struct btree *b, struct bset *i,
-			 unsigned offset, int write, char *buf, size_t len)
+static void btree_err_msg(struct printbuf *out, struct bch_fs *c,
+			  struct btree *b, struct bset *i,
+			  unsigned offset, int write)
 {
-	char *out = buf, *end = buf + len;
-
-	out += scnprintf(out, end - out,
-			 "error validating btree node %s "
-			 "at btree %u level %u/%u\n"
-			 "pos %llu:%llu node offset %u",
-			 write ? "before write " : "",
-			 b->btree_id, b->level,
-			 c->btree_roots[b->btree_id].level,
-			 b->key.k.p.inode, b->key.k.p.offset,
-			 b->written);
+	pr_buf(out, "error validating btree node %s"
+	       "at btree %u level %u/%u\n"
+	       "pos %llu:%llu node offset %u",
+	       write ? "before write " : "",
+	       b->btree_id, b->level,
+	       c->btree_roots[b->btree_id].level,
+	       b->key.k.p.inode, b->key.k.p.offset,
+	       b->written);
 
 	if (i)
-		out += scnprintf(out, end - out,
-				 " bset u64s %u",
-				 le16_to_cpu(i->u64s));
-
-	return out - buf;
+		pr_buf(out, " bset u64s %u", le16_to_cpu(i->u64s));
 }
 
 enum btree_err_type {
@@ -950,10 +547,11 @@ enum btree_validate_ret {
 #define btree_err(type, c, b, i, msg, ...)				\
 ({									\
 	__label__ out;							\
-	char _buf[300], *out = _buf, *end = out + sizeof(_buf);	\
+	char _buf[300];							\
+	struct printbuf out = PBUF(_buf);				\
 									\
-	out += btree_err_msg(c, b, i, b->written, write, out, end - out);\
-	out += scnprintf(out, end - out, ": " msg, ##__VA_ARGS__);	\
+	btree_err_msg(&out, c, b, i, b->written, write);		\
+	pr_buf(&out, ": " msg, ##__VA_ARGS__);				\
 									\
 	if (type == BTREE_ERR_FIXABLE &&				\
 	    write == READ &&						\
@@ -1006,8 +604,8 @@ static int validate_bset(struct bch_fs *c, struct btree *b,
 {
 	struct bkey_packed *k, *prev = NULL;
 	struct bpos prev_pos = POS_MIN;
-	enum bkey_type type = btree_node_type(b);
 	bool seen_non_whiteout = false;
+	unsigned version;
 	const char *err;
 	int ret = 0;
 
@@ -1053,13 +651,12 @@ static int validate_bset(struct bch_fs *c, struct btree *b,
 			 "invalid bkey format: %s", err);
 	}
 
-	if (btree_err_on(le16_to_cpu(i->version) != BCACHE_BSET_VERSION,
-			 BTREE_ERR_FIXABLE, c, b, i,
-			 "unsupported bset version")) {
-		i->version = cpu_to_le16(BCACHE_BSET_VERSION);
-		i->u64s = 0;
-		return 0;
-	}
+	version = le16_to_cpu(i->version);
+	btree_err_on((version != BCH_BSET_VERSION_OLD &&
+		      version < bcachefs_metadata_version_min) ||
+		     version >= bcachefs_metadata_version_max,
+		     BTREE_ERR_FATAL, c, b, i,
+		     "unsupported bset version");
 
 	if (btree_err_on(b->written + sectors > c->opts.btree_node_size,
 			 BTREE_ERR_FIXABLE, c, b, i,
@@ -1108,19 +705,23 @@ static int validate_bset(struct bch_fs *c, struct btree *b,
 		}
 
 		if (BSET_BIG_ENDIAN(i) != CPU_BIG_ENDIAN)
-			bch2_bkey_swab(type, &b->format, k);
+			bch2_bkey_swab(&b->format, k);
+
+		if (!write &&
+		    version < bcachefs_metadata_version_bkey_renumber)
+			bch2_bkey_renumber(btree_node_type(b), k, write);
 
 		u = bkey_disassemble(b, k, &tmp);
 
-		invalid = __bch2_bkey_invalid(c, type, u) ?:
+		invalid = __bch2_bkey_invalid(c, u, btree_node_type(b)) ?:
 			bch2_bkey_in_btree_node(b, u) ?:
-			(write ? bch2_bkey_val_invalid(c, type, u) : NULL);
+			(write ? bch2_bkey_val_invalid(c, u) : NULL);
 		if (invalid) {
 			char buf[160];
 
-			bch2_bkey_val_to_text(c, type, buf, sizeof(buf), u);
+			bch2_bkey_val_to_text(&PBUF(buf), c, u);
 			btree_err(BTREE_ERR_FIXABLE, c, b, i,
-				  "invalid bkey:\n%s\n%s", buf, invalid);
+				  "invalid bkey:\n%s\n%s", invalid, buf);
 
 			i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s);
 			memmove_u64s_down(k, bkey_next(k),
@@ -1128,6 +729,10 @@ static int validate_bset(struct bch_fs *c, struct btree *b,
 			continue;
 		}
 
+		if (write &&
+		    version < bcachefs_metadata_version_bkey_renumber)
+			bch2_bkey_renumber(btree_node_type(b), k, write);
+
 		/*
 		 * with the separate whiteouts thing (used for extents), the
 		 * second set of keys actually can have whiteouts too, so we
@@ -1166,12 +771,12 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry
 	struct btree_node *sorted;
 	struct bkey_packed *k;
 	struct bset *i;
-	bool used_mempool;
+	bool used_mempool, blacklisted;
 	unsigned u64s;
 	int ret, retry_read = 0, write = READ;
 
 	iter = mempool_alloc(&c->fill_iter, GFP_NOIO);
-	__bch2_btree_node_iter_large_init(iter, btree_node_is_extents(b));
+	iter->used = 0;
 
 	if (bch2_meta_read_fault("btree"))
 		btree_err(BTREE_ERR_MUST_RETRY, c, b, NULL,
@@ -1240,20 +845,15 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry
 
 		b->written += sectors;
 
-		ret = bch2_journal_seq_should_ignore(c, le64_to_cpu(i->journal_seq), b);
-		if (ret < 0) {
-			btree_err(BTREE_ERR_FATAL, c, b, i,
-				  "insufficient memory");
-			goto err;
-		}
+		blacklisted = bch2_journal_seq_is_blacklisted(c,
+					le64_to_cpu(i->journal_seq),
+					true);
 
-		if (ret) {
-			btree_err_on(first,
-				     BTREE_ERR_FIXABLE, c, b, i,
-				     "first btree node bset has blacklisted journal seq");
-			if (!first)
-				continue;
-		}
+		btree_err_on(blacklisted && first,
+			     BTREE_ERR_FIXABLE, c, b, i,
+			     "first btree node bset has blacklisted journal seq");
+		if (blacklisted && !first)
+			continue;
 
 		bch2_btree_node_iter_large_push(iter, b,
 						i->start,
@@ -1293,15 +893,16 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry
 
 	i = &b->data->keys;
 	for (k = i->start; k != vstruct_last(i);) {
-		enum bkey_type type = btree_node_type(b);
 		struct bkey tmp;
 		struct bkey_s_c u = bkey_disassemble(b, k, &tmp);
-		const char *invalid = bch2_bkey_val_invalid(c, type, u);
+		const char *invalid = bch2_bkey_val_invalid(c, u);
 
-		if (invalid) {
+		if (invalid ||
+		    (inject_invalid_keys(c) &&
+		     !bversion_cmp(u.k->version, MAX_VERSION))) {
 			char buf[160];
 
-			bch2_bkey_val_to_text(c, type, buf, sizeof(buf), u);
+			bch2_bkey_val_to_text(&PBUF(buf), c, u);
 			btree_err(BTREE_ERR_FIXABLE, c, b, i,
 				  "invalid bkey %s: %s", buf, invalid);
@@ -1310,6 +911,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry
 			i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s);
 			memmove_u64s_down(k, bkey_next(k),
 					  (u64 *) vstruct_end(i) - (u64 *) k);
+			set_btree_bset_end(b, b->set);
 			continue;
 		}
 
@@ -1324,7 +926,6 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry
 out:
 	mempool_free(iter, &c->fill_iter);
 	return retry_read;
-err:
 fsck_err:
 	if (ret == BTREE_RETRY_READ) {
 		retry_read = 1;
@@ -1343,11 +944,9 @@ static void btree_node_read_work(struct work_struct *work)
 	struct bch_dev *ca	= bch_dev_bkey_exists(c, rb->pick.ptr.dev);
 	struct btree *b		= rb->bio.bi_private;
 	struct bio *bio		= &rb->bio;
-	struct bch_devs_mask avoid;
+	struct bch_io_failures failed = { .nr = 0 };
 	bool can_retry;
 
-	memset(&avoid, 0, sizeof(avoid));
-
 	goto start;
 	while (1) {
 		bch_info(c, "retrying read");
@@ -1370,8 +969,11 @@ start:
 		percpu_ref_put(&ca->io_ref);
 	rb->have_ioref = false;
 
-	__set_bit(rb->pick.ptr.dev, avoid.d);
-	can_retry = bch2_btree_pick_ptr(c, b, &avoid, &rb->pick) > 0;
+	bch2_mark_io_failure(&failed, &rb->pick);
+
+	can_retry = bch2_bkey_pick_read_device(c,
+					bkey_i_to_s_c(&b->key),
+					&failed, &rb->pick) > 0;
 
 	if (!bio->bi_status &&
 	    !bch2_btree_node_read_done(c, b, can_retry))
@@ -1383,7 +985,8 @@ start:
 		}
 	}
 
-	bch2_time_stats_update(&c->times[BCH_TIME_btree_read], rb->start_time);
+	bch2_time_stats_update(&c->times[BCH_TIME_btree_node_read],
+			       rb->start_time);
 	bio_put(&rb->bio);
 	clear_btree_node_read_in_flight(b);
 	wake_up_bit(&b->flags, BTREE_NODE_read_in_flight);
@@ -1406,7 +1009,7 @@ static void btree_node_read_endio(struct bio *bio)
 void bch2_btree_node_read(struct bch_fs *c, struct btree *b,
 			  bool sync)
 {
-	struct extent_pick_ptr pick;
+	struct extent_ptr_decoded pick;
 	struct btree_read_bio *rb;
 	struct bch_dev *ca;
 	struct bio *bio;
@@ -1414,7 +1017,8 @@ void bch2_btree_node_read(struct bch_fs *c, struct btree *b,
 
 	trace_btree_read(c, b);
 
-	ret = bch2_btree_pick_ptr(c, b, NULL, &pick);
+	ret = bch2_bkey_pick_read_device(c, bkey_i_to_s_c(&b->key),
+					 NULL, &pick);
 	if (bch2_fs_fatal_err_on(ret <= 0, c,
 			"btree node read error: no device to read from")) {
 		set_btree_node_read_error(b);
@@ -1423,7 +1027,9 @@ void bch2_btree_node_read(struct bch_fs *c, struct btree *b,
 
 	ca = bch_dev_bkey_exists(c, pick.ptr.dev);
 
-	bio = bio_alloc_bioset(GFP_NOIO, btree_pages(c), &c->btree_bio);
+	bio = bio_alloc_bioset(GFP_NOIO, buf_pages(b->data,
+						   btree_bytes(c)),
+			       &c->btree_bio);
 	rb = container_of(bio, struct btree_read_bio, bio);
 	rb->c = c;
 	rb->start_time = local_clock();
@@ -1539,22 +1145,24 @@ static void bch2_btree_node_write_error(struct bch_fs *c,
 {
 	struct btree *b		= wbio->wbio.bio.bi_private;
 	__BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX) tmp;
-	struct bkey_i_extent *new_key;
-	struct bkey_s_extent e;
+	struct bkey_i_btree_ptr *new_key;
+	struct bkey_s_btree_ptr bp;
 	struct bch_extent_ptr *ptr;
-	struct btree_iter iter;
+	struct btree_trans trans;
+	struct btree_iter *iter;
 	int ret;
 
-	__bch2_btree_iter_init(&iter, c, b->btree_id, b->key.k.p,
-			       BTREE_MAX_DEPTH,
-			       b->level, 0);
+	bch2_trans_init(&trans, c, 0, 0);
+
+	iter = bch2_trans_get_node_iter(&trans, b->btree_id, b->key.k.p,
+					BTREE_MAX_DEPTH, b->level, 0);
retry:
-	ret = bch2_btree_iter_traverse(&iter);
+	ret = bch2_btree_iter_traverse(iter);
 	if (ret)
 		goto err;
 
 	/* has node been freed? */
-	if (iter.l[b->level].b != b) {
+	if (iter->l[b->level].b != b) {
 		/* node has been freed: */
 		BUG_ON(!btree_node_dying(b));
 		goto out;
@@ -1564,22 +1172,22 @@ retry:
 
 	bkey_copy(&tmp.k, &b->key);
 
-	new_key = bkey_i_to_extent(&tmp.k);
-	e = extent_i_to_s(new_key);
-	extent_for_each_ptr_backwards(e, ptr)
-		if (bch2_dev_list_has_dev(wbio->wbio.failed, ptr->dev))
-			bch2_extent_drop_ptr(e, ptr);
+	new_key = bkey_i_to_btree_ptr(&tmp.k);
+	bp = btree_ptr_i_to_s(new_key);
+
+	bch2_bkey_drop_ptrs(bkey_i_to_s(&tmp.k), ptr,
+		bch2_dev_list_has_dev(wbio->wbio.failed, ptr->dev));
 
-	if (!bch2_extent_nr_ptrs(e.c))
+	if (!bch2_bkey_nr_ptrs(bp.s_c))
 		goto err;
 
-	ret = bch2_btree_node_update_key(c, &iter, b, new_key);
+	ret = bch2_btree_node_update_key(c, iter, b, new_key);
 	if (ret == -EINTR)
 		goto retry;
 	if (ret)
 		goto err;
out:
-	bch2_btree_iter_unlock(&iter);
+	bch2_trans_exit(&trans);
 	bio_put(&wbio->wbio.bio);
 	btree_node_write_done(c, b);
 	return;
@@ -1673,12 +1281,11 @@ static void btree_node_write_endio(struct bio *bio)
 static int validate_bset_for_write(struct bch_fs *c, struct btree *b,
 				   struct bset *i, unsigned sectors)
 {
-	const struct bch_extent_ptr *ptr;
 	unsigned whiteout_u64s = 0;
 	int ret;
 
-	extent_for_each_ptr(bkey_i_to_s_c_extent(&b->key), ptr)
-		break;
+	if (bch2_bkey_invalid(c, bkey_i_to_s_c(&b->key), BKEY_TYPE_BTREE))
+		return -1;
 
 	ret = validate_bset(c, b, i, sectors, &whiteout_u64s, WRITE, false);
 	if (ret)
@@ -1696,7 +1303,6 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b,
 	struct btree_node *bn = NULL;
 	struct btree_node_entry *bne = NULL;
 	BKEY_PADDED(key) k;
-	struct bkey_s_extent e;
 	struct bch_extent_ptr *ptr;
 	struct sort_iter sort_iter;
 	struct nonce nonce;
@@ -1704,6 +1310,7 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b,
 	u64 seq = 0;
 	bool used_mempool;
 	unsigned long old, new;
+	bool validate_before_checksum = false;
 	void *data;
 
 	if (test_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags))
@@ -1722,8 +1329,7 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b,
 		if (!(old & (1 << BTREE_NODE_dirty)))
 			return;
 
-		if (b->written &&
-		    !btree_node_may_write(b))
+		if (!btree_node_may_write(b))
 			return;
 
 		if (old & (1 << BTREE_NODE_write_in_flight)) {
@@ -1739,7 +1345,6 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b,
 	} while (cmpxchg_acquire(&b->flags, old, new) != old);
 
 	BUG_ON(btree_node_fake(b));
-	BUG_ON(!list_empty(&b->write_blocked));
 	BUG_ON((b->will_make_reachable != 0) != !b->written);
 
 	BUG_ON(b->written >= c->opts.btree_node_size);
@@ -1817,8 +1422,8 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b,
 	b->whiteout_u64s = 0;
 
 	u64s = btree_node_is_extents(b)
-		? sort_extents(vstruct_last(i), &sort_iter, false)
-		: sort_keys(i->start, &sort_iter, false);
+		? bch2_sort_extents(vstruct_last(i), &sort_iter, false)
+		: bch2_sort_keys(i->start, &sort_iter, false);
 	le16_add_cpu(&i->u64s, u64s);
 
 	clear_needs_whiteout(i);
@@ -1837,11 +1442,21 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b,
 	BUG_ON(BSET_BIG_ENDIAN(i) != CPU_BIG_ENDIAN);
 	BUG_ON(i->seq != b->data->keys.seq);
 
-	i->version = cpu_to_le16(BCACHE_BSET_VERSION);
+	i->version = c->sb.version < bcachefs_metadata_version_new_versioning
+		? cpu_to_le16(BCH_BSET_VERSION_OLD)
+		: cpu_to_le16(c->sb.version);
 	SET_BSET_CSUM_TYPE(i, bch2_meta_checksum_type(c));
 
+	if (bch2_csum_type_is_encryption(BSET_CSUM_TYPE(i)))
+		validate_before_checksum = true;
+
+	/* validate_bset will be modifying: */
+	if (le16_to_cpu(i->version) <
+	    bcachefs_metadata_version_bkey_renumber)
+		validate_before_checksum = true;
+
 	/* if we're going to be encrypting, check metadata validity first: */
-	if (bch2_csum_type_is_encryption(BSET_CSUM_TYPE(i)) &&
+	if (validate_before_checksum &&
 	    validate_bset_for_write(c, b, i, sectors_to_write))
 		goto err;
@@ -1855,7 +1470,7 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b,
 		bne->csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne);
 
 	/* if we're not encrypting, check metadata after checksumming: */
-	if (!bch2_csum_type_is_encryption(BSET_CSUM_TYPE(i)) &&
+	if (!validate_before_checksum &&
 	    validate_bset_for_write(c, b, i, sectors_to_write))
 		goto err;
@@ -1878,7 +1493,9 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b,
 
 	trace_btree_write(b, bytes_to_write, sectors_to_write);
 
-	wbio = container_of(bio_alloc_bioset(GFP_NOIO, 1 << order, &c->btree_bio),
+	wbio = container_of(bio_alloc_bioset(GFP_NOIO,
+				buf_pages(data, sectors_to_write << 9),
+				&c->btree_bio),
 			    struct btree_write_bio, wbio.bio);
 	wbio_init(&wbio->wbio.bio);
 	wbio->data = data;
@@ -1907,9 +1524,8 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b,
 	 */
 
 	bkey_copy(&k.key, &b->key);
-	e = bkey_i_to_s_extent(&k.key);
 
-	extent_for_each_ptr(e, ptr)
+	bkey_for_each_ptr(bch2_bkey_ptrs(bkey_i_to_s(&k.key)), ptr)
 		ptr->offset += b->written;
 
 	b->written += sectors_to_write;
@@ -1942,9 +1558,9 @@ bool bch2_btree_post_write_cleanup(struct bch_fs *c, struct btree *b)
 	clear_btree_node_just_written(b);
 
 	/*
-	 * Note: immediately after write, bset_unwritten()/bset_written() don't
-	 * work - the amount of data we had to write after compaction might have
-	 * been smaller than the offset of the last bset.
+	 * Note: immediately after write, bset_written() doesn't work - the
+	 * amount of data we had to write after compaction might have been
+	 * smaller than the offset of the last bset.
 	 *
 	 * However, we know that all bsets have been written here, as long as
 	 * we're still holding the write lock:
 	 */
@@ -2054,7 +1670,7 @@ void bch2_btree_verify_flushed(struct bch_fs *c)
 
 ssize_t bch2_dirty_btree_nodes_print(struct bch_fs *c, char *buf)
 {
-	char *out = buf, *end = buf + PAGE_SIZE;
+	struct printbuf out = _PBUF(buf, PAGE_SIZE);
 	struct bucket_table *tbl;
 	struct rhash_head *pos;
 	struct btree *b;
@@ -2065,24 +1681,22 @@ ssize_t bch2_dirty_btree_nodes_print(struct bch_fs *c, char *buf)
 		unsigned long flags = READ_ONCE(b->flags);
 		unsigned idx = (flags & (1 << BTREE_NODE_write_idx)) != 0;
 
-		if (//!(flags & (1 << BTREE_NODE_dirty)) &&
-		    !b->writes[0].wait.list.first &&
-		    !b->writes[1].wait.list.first &&
-		    !(b->will_make_reachable & 1))
+		if (!(flags & (1 << BTREE_NODE_dirty)))
 			continue;
 
-		out += scnprintf(out, end - out, "%p d %u l %u w %u b %u r %u:%lu c %u p %u\n",
-				 b,
-				 (flags & (1 << BTREE_NODE_dirty)) != 0,
-				 b->level,
-				 b->written,
-				 !list_empty_careful(&b->write_blocked),
-				 b->will_make_reachable != 0,
-				 b->will_make_reachable & 1,
-				 b->writes[ idx].wait.list.first != NULL,
-				 b->writes[!idx].wait.list.first != NULL);
+		pr_buf(&out, "%p d %u n %u l %u w %u b %u r %u:%lu c %u p %u\n",
+		       b,
+		       (flags & (1 << BTREE_NODE_dirty)) != 0,
+		       (flags & (1 << BTREE_NODE_need_write)) != 0,
+		       b->level,
+		       b->written,
+		       !list_empty_careful(&b->write_blocked),
+		       b->will_make_reachable != 0,
+		       b->will_make_reachable & 1,
+		       b->writes[ idx].wait.list.first != NULL,
+		       b->writes[!idx].wait.list.first != NULL);
 	}
 	rcu_read_unlock();
 
-	return out - buf;
+	return out.pos - buf;
 }
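Several hunks above convert hand-rolled buffer cursors (char *out, *end; out += scnprintf(out, end - out, ...)) to the printbuf helpers (PBUF()/_PBUF() plus pr_buf()), e.g. in btree_err_msg(), the btree_err() macro, and bch2_dirty_btree_nodes_print(). The win is that the cursor and the bound travel together, so call sites stop repeating the end - out arithmetic. Below is a minimal sketch of the pattern, assuming a simplified struct with just a pos/end pair; the real bcachefs printbuf differs, and the names here are illustrative only.

	/*
	 * Sketch of the printbuf pattern adopted in this diff; not the
	 * actual bcachefs API. The cursor (pos) and the bound (end) are
	 * carried together, so appends can never run past the buffer.
	 */
	#include <stdarg.h>
	#include <stdio.h>

	struct printbuf {
		char *pos;
		char *end;
	};

	/* Analogous in spirit to PBUF(): only valid for true arrays. */
	#define PRINTBUF_INIT(_buf)					\
		((struct printbuf) {					\
			.pos = (_buf),					\
			.end = (_buf) + sizeof(_buf),			\
		})

	static void pr_buf_sketch(struct printbuf *out, const char *fmt, ...)
	{
		va_list args;

		va_start(args, fmt);
		/* vsnprintf() returns the would-be length, so clamp: */
		out->pos += vsnprintf(out->pos, out->end - out->pos,
				      fmt, args);
		va_end(args);

		if (out->pos > out->end)
			out->pos = out->end;
	}

	int main(void)
	{
		char buf[64];
		struct printbuf out = PRINTBUF_INIT(buf);

		pr_buf_sketch(&out, "btree %u level %u/%u", 1, 0, 1);
		pr_buf_sketch(&out, " node offset %u", 8);

		printf("%s (%d bytes)\n", buf, (int) (out.pos - buf));
		return 0;
	}

Compared with the removed scnprintf() style, a truncated append can no longer smear the cursor past the bound or feed a stale length into the next call, which is exactly the class of bookkeeping the converted hunks delete.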