author     Kent Overstreet <kent.overstreet@gmail.com>   2020-04-07 15:26:03 -0400
committer  Kent Overstreet <kent.overstreet@gmail.com>   2020-05-06 17:14:18 -0400
commit     683ee194ef320abff86e8d1ced7993406e63896d
tree       48fa4bfaa1e33e6ef659e4862445fb35fbd300b4
parent     880fedbfc7edc4f1a0debc800a17d751cd71e87b
Merge with e4871e8f27 bcachefs: Fix a deadlock on starting an interior btree update
Signed-off-by: Kent Overstreet <kent.overstreet@gmail.com>
60 files changed, 2748 insertions(+), 2503 deletions(-)
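
The central change running through this merge is the combined btree-and-journal iterator: during recovery, keys that were committed to the journal but not yet written into the btree are overlaid on the btree contents as they are iterated, replacing the old two-pass scheme (note the `for_each_journal_key()` loop deleted from `bch2_alloc_read()` below, replaced by a `bch2_btree_and_journal_iter_peek()`/`_advance()` loop). A minimal standalone sketch of that overlay pattern, using deliberately simplified, hypothetical types rather than the actual bcachefs API:

```c
#include <stdio.h>
#include <stdint.h>

/* Hypothetical simplified key: a position plus a payload. */
struct skey {
	uint64_t pos;
	int val;
};

/*
 * Merged iterator over two position-sorted streams; on equal positions the
 * journal key wins, since it is newer than what is in the btree.
 */
struct merge_iter {
	const struct skey *btree, *btree_end;
	const struct skey *journal, *journal_end;
};

static const struct skey *merge_iter_peek(struct merge_iter *it)
{
	/* Skip btree keys that a journal key at the same position overrides: */
	while (it->btree < it->btree_end &&
	       it->journal < it->journal_end &&
	       it->btree->pos == it->journal->pos)
		it->btree++;

	if (it->btree < it->btree_end &&
	    (it->journal >= it->journal_end ||
	     it->btree->pos < it->journal->pos))
		return it->btree;

	return it->journal < it->journal_end ? it->journal : NULL;
}

static void merge_iter_advance(struct merge_iter *it)
{
	const struct skey *k = merge_iter_peek(it);

	if (k == it->btree)
		it->btree++;
	else if (k == it->journal)
		it->journal++;
}

int main(void)
{
	const struct skey btree[]   = { { 1, 10 }, { 2, 20 }, { 4, 40 } };
	const struct skey journal[] = { { 2, 99 }, { 3, 30 } };
	struct merge_iter it = {
		btree,   btree   + 3,
		journal, journal + 2,
	};
	const struct skey *k;

	/* Prints 1:10, 2:99 (journal overrides btree), 3:30, 4:40 */
	while ((k = merge_iter_peek(&it))) {
		printf("%llu:%d\n", (unsigned long long) k->pos, k->val);
		merge_iter_advance(&it);
	}
	return 0;
}
```

The real iterator follows the same peek/advance shape, but walks a btree node iterator merged with the sorted journal keys collected at recovery time; the `struct journal_keys` added to `bcachefs.h` in this patch is what carries those keys.
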
diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index c57df50168e0..b2d1b8f9c9b8 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -211,33 +211,31 @@ void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, int bch2_alloc_read(struct bch_fs *c, struct journal_keys *journal_keys) { struct btree_trans trans; - struct btree_iter *iter; + struct btree_and_journal_iter iter; struct bkey_s_c k; struct bch_dev *ca; - struct journal_key *j; unsigned i; - int ret; + int ret = 0; bch2_trans_init(&trans, c, 0, 0); - for_each_btree_key(&trans, iter, BTREE_ID_ALLOC, POS_MIN, 0, k, ret) + bch2_btree_and_journal_iter_init(&iter, &trans, journal_keys, + BTREE_ID_ALLOC, POS_MIN); + + while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) { bch2_mark_key(c, k, 0, 0, NULL, 0, BTREE_TRIGGER_ALLOC_READ| BTREE_TRIGGER_NOATOMIC); + bch2_btree_and_journal_iter_advance(&iter); + } + ret = bch2_trans_exit(&trans) ?: ret; if (ret) { bch_err(c, "error reading alloc info: %i", ret); return ret; } - for_each_journal_key(*journal_keys, j) - if (j->btree_id == BTREE_ID_ALLOC) - bch2_mark_key(c, bkey_i_to_s_c(j->k), - 0, 0, NULL, 0, - BTREE_TRIGGER_ALLOC_READ| - BTREE_TRIGGER_NOATOMIC); - percpu_down_write(&c->mark_lock); bch2_dev_usage_from_buckets(c); percpu_up_write(&c->mark_lock); diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index be2c1ed9fcb2..fa9593764f0c 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -521,6 +521,18 @@ struct journal_seq_blacklist_table { } entries[0]; }; +struct journal_keys { + struct journal_key { + enum btree_id btree_id:8; + unsigned level:8; + struct bkey_i *k; + u32 journal_seq; + u32 journal_offset; + } *d; + size_t nr; + u64 journal_seq_base; +}; + struct bch_fs { struct closure cl; @@ -608,6 +620,7 @@ struct bch_fs { mempool_t btree_interior_update_pool; struct list_head btree_interior_update_list; + struct list_head btree_interior_updates_unwritten; struct mutex btree_interior_update_lock; struct closure_waitlist btree_interior_update_wait; @@ -719,7 +732,7 @@ struct bch_fs { ZSTD_parameters zstd_params; struct crypto_shash *sha256; - struct crypto_skcipher *chacha20; + struct crypto_sync_skcipher *chacha20; struct crypto_shash *poly1305; atomic64_t key_version; @@ -786,6 +799,8 @@ struct bch_fs { mempool_t btree_bounce_pool; struct journal journal; + struct list_head journal_entries; + struct journal_keys journal_keys; u64 last_bucket_seq_cleanup; diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index f6141fde830b..616863ef77d4 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -339,7 +339,8 @@ static inline void bkey_init(struct bkey *k) x(stripe, 14) \ x(reflink_p, 15) \ x(reflink_v, 16) \ - x(inline_data, 17) + x(inline_data, 17) \ + x(btree_ptr_v2, 18) enum bch_bkey_type { #define x(name, nr) KEY_TYPE_##name = nr, @@ -595,6 +596,19 @@ struct bch_btree_ptr { __u64 _data[0]; } __attribute__((packed, aligned(8))); +struct bch_btree_ptr_v2 { + struct bch_val v; + + __u64 mem_ptr; + __le64 seq; + __le16 sectors_written; + /* In case we ever decide to do variable size btree nodes: */ + __le16 sectors; + struct bpos min_key; + struct bch_extent_ptr start[0]; + __u64 _data[0]; +} __attribute__((packed, aligned(8))); + struct bch_extent { struct bch_val v; @@ -626,7 +640,8 @@ struct bch_reservation { /* Btree pointers don't carry around checksums: */ #define BKEY_BTREE_PTR_VAL_U64s_MAX \ - ((sizeof(struct bch_extent_ptr)) / 
sizeof(u64) * BCH_REPLICAS_MAX) + ((sizeof(struct bch_btree_ptr_v2) + \ + sizeof(struct bch_extent_ptr) * BCH_REPLICAS_MAX) / sizeof(u64)) #define BKEY_BTREE_PTR_U64s_MAX \ (BKEY_U64s + BKEY_BTREE_PTR_VAL_U64s_MAX) @@ -1141,7 +1156,8 @@ enum bcachefs_metadata_version { bcachefs_metadata_version_min = 9, bcachefs_metadata_version_new_versioning = 10, bcachefs_metadata_version_bkey_renumber = 10, - bcachefs_metadata_version_max = 11, + bcachefs_metadata_version_inode_btree_change = 11, + bcachefs_metadata_version_max = 12, }; #define bcachefs_metadata_version_current (bcachefs_metadata_version_max - 1) @@ -1294,7 +1310,17 @@ LE64_BITMASK(BCH_SB_ERASURE_CODE, struct bch_sb, flags[3], 0, 16); x(reflink, 6) \ x(new_siphash, 7) \ x(inline_data, 8) \ - x(new_extent_overwrite, 9) + x(new_extent_overwrite, 9) \ + x(incompressible, 10) \ + x(btree_ptr_v2, 11) \ + x(extents_above_btree_updates, 12) \ + x(btree_updates_journalled, 13) + +#define BCH_SB_FEATURES_ALL \ + ((1ULL << BCH_FEATURE_new_siphash)| \ + (1ULL << BCH_FEATURE_new_extent_overwrite)| \ + (1ULL << BCH_FEATURE_btree_ptr_v2)| \ + (1ULL << BCH_FEATURE_extents_above_btree_updates)) enum bch_sb_feature { #define x(f, n) BCH_FEATURE_##f, @@ -1374,11 +1400,12 @@ enum bch_csum_opts { }; #define BCH_COMPRESSION_TYPES() \ - x(none, 0) \ - x(lz4_old, 1) \ - x(gzip, 2) \ - x(lz4, 3) \ - x(zstd, 4) + x(none, 0) \ + x(lz4_old, 1) \ + x(gzip, 2) \ + x(lz4, 3) \ + x(zstd, 4) \ + x(incompressible, 5) enum bch_compression_type { #define x(t, n) BCH_COMPRESSION_TYPE_##t, diff --git a/fs/bcachefs/bkey.h b/fs/bcachefs/bkey.h index f2d5f3009b21..cbcfbd26bc58 100644 --- a/fs/bcachefs/bkey.h +++ b/fs/bcachefs/bkey.h @@ -400,9 +400,8 @@ static inline int bch2_compile_bkey_format(const struct bkey_format *format, static inline void bkey_reassemble(struct bkey_i *dst, struct bkey_s_c src) { - BUG_ON(bkey_packed(src.k)); dst->k = *src.k; - memcpy_u64s(&dst->v, src.v, bkey_val_u64s(src.k)); + memcpy_u64s_small(&dst->v, src.v, bkey_val_u64s(src.k)); } #define bkey_s_null ((struct bkey_s) { .k = NULL }) @@ -565,6 +564,7 @@ BKEY_VAL_ACCESSORS(stripe); BKEY_VAL_ACCESSORS(reflink_p); BKEY_VAL_ACCESSORS(reflink_v); BKEY_VAL_ACCESSORS(inline_data); +BKEY_VAL_ACCESSORS(btree_ptr_v2); /* byte order helpers */ diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c index 320e17d108d2..c97e1e9002cb 100644 --- a/fs/bcachefs/bkey_methods.c +++ b/fs/bcachefs/bkey_methods.c @@ -134,7 +134,7 @@ const char *bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k, const char *bch2_bkey_in_btree_node(struct btree *b, struct bkey_s_c k) { - if (bkey_cmp(bkey_start_pos(k.k), b->data->min_key) < 0) + if (bkey_cmp(k.k->p, b->data->min_key) < 0) return "key before start of btree node"; if (bkey_cmp(k.k->p, b->data->max_key) > 0) @@ -202,15 +202,12 @@ void bch2_bkey_val_to_text(struct printbuf *out, struct bch_fs *c, bch2_val_to_text(out, c, k); } -void bch2_bkey_swab(const struct bkey_format *f, - struct bkey_packed *k) +void bch2_bkey_swab_val(struct bkey_s k) { - const struct bkey_ops *ops = &bch2_bkey_ops[k->type]; - - bch2_bkey_swab_key(f, k); + const struct bkey_ops *ops = &bch2_bkey_ops[k.k->type]; if (ops->swab) - ops->swab(f, k); + ops->swab(k); } bool bch2_bkey_normalize(struct bch_fs *c, struct bkey_s k) @@ -276,3 +273,59 @@ void bch2_bkey_renumber(enum btree_node_type btree_node_type, break; } } + +void __bch2_bkey_compat(unsigned level, enum btree_id btree_id, + unsigned version, unsigned big_endian, + int write, + struct bkey_format *f, + struct bkey_packed *k) 
+{ + const struct bkey_ops *ops; + struct bkey uk; + struct bkey_s u; + + if (big_endian != CPU_BIG_ENDIAN) + bch2_bkey_swab_key(f, k); + + if (version < bcachefs_metadata_version_bkey_renumber) + bch2_bkey_renumber(__btree_node_type(level, btree_id), k, write); + + if (version < bcachefs_metadata_version_inode_btree_change && + btree_id == BTREE_ID_INODES) { + if (!bkey_packed(k)) { + struct bkey_i *u = packed_to_bkey(k); + swap(u->k.p.inode, u->k.p.offset); + } else if (f->bits_per_field[BKEY_FIELD_INODE] && + f->bits_per_field[BKEY_FIELD_OFFSET]) { + struct bkey_format tmp = *f, *in = f, *out = &tmp; + + swap(tmp.bits_per_field[BKEY_FIELD_INODE], + tmp.bits_per_field[BKEY_FIELD_OFFSET]); + swap(tmp.field_offset[BKEY_FIELD_INODE], + tmp.field_offset[BKEY_FIELD_OFFSET]); + + if (!write) + swap(in, out); + + uk = __bch2_bkey_unpack_key(in, k); + swap(uk.p.inode, uk.p.offset); + BUG_ON(!bch2_bkey_pack_key(k, &uk, out)); + } + } + + if (!bkey_packed(k)) { + u = bkey_i_to_s(packed_to_bkey(k)); + } else { + uk = __bch2_bkey_unpack_key(f, k); + u.k = &uk; + u.v = bkeyp_val(f, k); + } + + if (big_endian != CPU_BIG_ENDIAN) + bch2_bkey_swab_val(u); + + ops = &bch2_bkey_ops[k->type]; + + if (ops->compat) + ops->compat(btree_id, version, big_endian, write, u); +} diff --git a/fs/bcachefs/bkey_methods.h b/fs/bcachefs/bkey_methods.h index 8568b65c1ed2..0bca725ae3b8 100644 --- a/fs/bcachefs/bkey_methods.h +++ b/fs/bcachefs/bkey_methods.h @@ -29,10 +29,13 @@ struct bkey_ops { void (*key_debugcheck)(struct bch_fs *, struct bkey_s_c); void (*val_to_text)(struct printbuf *, struct bch_fs *, struct bkey_s_c); - void (*swab)(const struct bkey_format *, struct bkey_packed *); + void (*swab)(struct bkey_s); bool (*key_normalize)(struct bch_fs *, struct bkey_s); enum merge_result (*key_merge)(struct bch_fs *, struct bkey_s, struct bkey_s); + void (*compat)(enum btree_id id, unsigned version, + unsigned big_endian, int write, + struct bkey_s); }; const char *bch2_bkey_val_invalid(struct bch_fs *, struct bkey_s_c); @@ -51,7 +54,7 @@ void bch2_val_to_text(struct printbuf *, struct bch_fs *, void bch2_bkey_val_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); -void bch2_bkey_swab(const struct bkey_format *, struct bkey_packed *); +void bch2_bkey_swab_val(struct bkey_s); bool bch2_bkey_normalize(struct bch_fs *, struct bkey_s); @@ -60,4 +63,20 @@ enum merge_result bch2_bkey_merge(struct bch_fs *, void bch2_bkey_renumber(enum btree_node_type, struct bkey_packed *, int); +void __bch2_bkey_compat(unsigned, enum btree_id, unsigned, unsigned, + int, struct bkey_format *, struct bkey_packed *); + +static inline void bch2_bkey_compat(unsigned level, enum btree_id btree_id, + unsigned version, unsigned big_endian, + int write, + struct bkey_format *f, + struct bkey_packed *k) +{ + if (version < bcachefs_metadata_version_current || + big_endian != CPU_BIG_ENDIAN) + __bch2_bkey_compat(level, btree_id, version, + big_endian, write, f, k); + +} + #endif /* _BCACHEFS_BKEY_METHODS_H */ diff --git a/fs/bcachefs/bkey_sort.c b/fs/bcachefs/bkey_sort.c index 7cbb57042af1..839e78d1dc35 100644 --- a/fs/bcachefs/bkey_sort.c +++ b/fs/bcachefs/bkey_sort.c @@ -130,44 +130,21 @@ bch2_key_sort_fix_overlapping(struct bch_fs *c, struct bset *dst, return nr; } -static void extent_sort_advance_prev(struct bkey_format *f, - struct btree_nr_keys *nr, - struct bkey_packed *start, - struct bkey_packed **prev) -{ - if (*prev) { - bch2_bkey_pack(*prev, (void *) *prev, f); - - btree_keys_account_key_add(nr, 0, *prev); - *prev = 
bkey_next(*prev); - } else { - *prev = start; - } -} - static void extent_sort_append(struct bch_fs *c, struct bkey_format *f, struct btree_nr_keys *nr, - struct bkey_packed *start, - struct bkey_packed **prev, + struct bkey_packed **out, struct bkey_s k) { - if (bkey_whiteout(k.k)) - return; - - /* - * prev is always unpacked, for key merging - until right before we - * advance it: - */ + if (!bkey_whiteout(k.k)) { + if (!bch2_bkey_pack_key(*out, k.k, f)) + memcpy_u64s_small(*out, k.k, BKEY_U64s); - if (*prev && - bch2_bkey_merge(c, bkey_i_to_s((void *) *prev), k) == - BCH_MERGE_MERGE) - return; + memcpy_u64s_small(bkeyp_val(f, *out), k.v, bkey_val_u64s(k.k)); - extent_sort_advance_prev(f, nr, start, prev); - - bkey_reassemble((void *) *prev, k.s_c); + btree_keys_account_key_add(nr, 0, *out); + *out = bkey_next(*out); + } } /* Sort + repack in a new format: */ @@ -201,7 +178,7 @@ bch2_sort_repack(struct bset *dst, struct btree *src, return nr; } -/* Sort, repack, and merge: */ +/* Sort, repack, and call bch2_bkey_normalize() to drop stale pointers: */ struct btree_nr_keys bch2_sort_repack_merge(struct bch_fs *c, struct bset *dst, struct btree *src, @@ -209,7 +186,7 @@ bch2_sort_repack_merge(struct bch_fs *c, struct bkey_format *out_f, bool filter_whiteouts) { - struct bkey_packed *prev = NULL, *k_packed; + struct bkey_packed *out = vstruct_last(dst), *k_packed; struct bkey_on_stack k; struct btree_nr_keys nr; @@ -234,13 +211,10 @@ bch2_sort_repack_merge(struct bch_fs *c, bch2_bkey_normalize(c, bkey_i_to_s(k.k))) continue; - extent_sort_append(c, out_f, &nr, vstruct_last(dst), - &prev, bkey_i_to_s(k.k)); + extent_sort_append(c, out_f, &nr, &out, bkey_i_to_s(k.k)); } - extent_sort_advance_prev(out_f, &nr, vstruct_last(dst), &prev); - - dst->u64s = cpu_to_le16((u64 *) prev - dst->_data); + dst->u64s = cpu_to_le16((u64 *) out - dst->_data); bkey_on_stack_exit(&k, c); return nr; } @@ -311,6 +285,25 @@ static inline int extent_sort_fix_overlapping_cmp(struct btree *b, cmp_int((unsigned long) r, (unsigned long) l); } +/* + * The algorithm in extent_sort_fix_overlapping() relies on keys in the same + * bset being ordered by start offset - but 0 size whiteouts (which are always + * KEY_TYPE_deleted) break this ordering, so we need to skip over them: + */ +static void extent_iter_advance(struct sort_iter *iter, unsigned idx) +{ + struct sort_iter_set *i = iter->data + idx; + + do { + i->k = bkey_next_skip_noops(i->k, i->end); + } while (i->k != i->end && bkey_deleted(i->k)); + + if (i->k == i->end) + array_remove_item(iter->data, iter->used, idx); + else + __sort_iter_sift(iter, idx, extent_sort_fix_overlapping_cmp); +} + struct btree_nr_keys bch2_extent_sort_fix_overlapping(struct bch_fs *c, struct bset *dst, struct sort_iter *iter) @@ -318,24 +311,31 @@ bch2_extent_sort_fix_overlapping(struct bch_fs *c, struct bset *dst, struct btree *b = iter->b; struct bkey_format *f = &b->format; struct sort_iter_set *_l = iter->data, *_r = iter->data + 1; - struct bkey_packed *prev = NULL; + struct bkey_packed *out = dst->start; struct bkey l_unpacked, r_unpacked; struct bkey_s l, r; struct btree_nr_keys nr; struct bkey_on_stack split; + unsigned i; memset(&nr, 0, sizeof(nr)); bkey_on_stack_init(&split); sort_iter_sort(iter, extent_sort_fix_overlapping_cmp); + for (i = 0; i < iter->used;) { + if (bkey_deleted(iter->data[i].k)) + __sort_iter_advance(iter, i, + extent_sort_fix_overlapping_cmp); + else + i++; + } while (!sort_iter_end(iter)) { l = __bkey_disassemble(b, _l->k, &l_unpacked); if (iter->used == 1) { 
- extent_sort_append(c, f, &nr, dst->start, &prev, l); - sort_iter_advance(iter, - extent_sort_fix_overlapping_cmp); + extent_sort_append(c, f, &nr, &out, l); + extent_iter_advance(iter, 0); continue; } @@ -343,16 +343,14 @@ bch2_extent_sort_fix_overlapping(struct bch_fs *c, struct bset *dst, /* If current key and next key don't overlap, just append */ if (bkey_cmp(l.k->p, bkey_start_pos(r.k)) <= 0) { - extent_sort_append(c, f, &nr, dst->start, &prev, l); - sort_iter_advance(iter, - extent_sort_fix_overlapping_cmp); + extent_sort_append(c, f, &nr, &out, l); + extent_iter_advance(iter, 0); continue; } /* Skip 0 size keys */ if (!r.k->size) { - __sort_iter_advance(iter, 1, - extent_sort_fix_overlapping_cmp); + extent_iter_advance(iter, 1); continue; } @@ -369,8 +367,7 @@ bch2_extent_sort_fix_overlapping(struct bch_fs *c, struct bset *dst, if (_l->k > _r->k) { /* l wins, trim r */ if (bkey_cmp(l.k->p, r.k->p) >= 0) { - __sort_iter_advance(iter, 1, - extent_sort_fix_overlapping_cmp); + extent_iter_advance(iter, 1); } else { bch2_cut_front_s(l.k->p, r); extent_save(b, _r->k, r.k); @@ -391,17 +388,15 @@ bch2_extent_sort_fix_overlapping(struct bch_fs *c, struct bset *dst, __sort_iter_sift(iter, 0, extent_sort_fix_overlapping_cmp); - extent_sort_append(c, f, &nr, dst->start, - &prev, bkey_i_to_s(split.k)); + extent_sort_append(c, f, &nr, &out, + bkey_i_to_s(split.k)); } else { bch2_cut_back_s(bkey_start_pos(r.k), l); extent_save(b, _l->k, l.k); } } - extent_sort_advance_prev(f, &nr, dst->start, &prev); - - dst->u64s = cpu_to_le16((u64 *) prev - dst->_data); + dst->u64s = cpu_to_le16((u64 *) out - dst->_data); bkey_on_stack_exit(&split, c); return nr; diff --git a/fs/bcachefs/bset.c b/fs/bcachefs/bset.c index cf8fa59fada1..6360b2e8cf73 100644 --- a/fs/bcachefs/bset.c +++ b/fs/bcachefs/bset.c @@ -79,8 +79,8 @@ void bch2_dump_bset(struct btree *b, struct bset *i, unsigned set) _n = bkey_next_skip_noops(_k, vstruct_last(i)); bch2_bkey_to_text(&PBUF(buf), &k); - printk(KERN_ERR "block %u key %5u: %s\n", set, - __btree_node_key_to_offset(b, _k), buf); + printk(KERN_ERR "block %u key %5zu: %s\n", set, + _k->_data - i->_data, buf); if (_n == vstruct_last(i)) continue; @@ -1206,7 +1206,8 @@ void bch2_bset_insert(struct btree *b, memcpy_u64s(bkeyp_val(f, where), &insert->v, bkeyp_val_u64s(f, src)); - bch2_bset_fix_lookup_table(b, t, where, clobber_u64s, src->u64s); + if (src->u64s != clobber_u64s) + bch2_bset_fix_lookup_table(b, t, where, clobber_u64s, src->u64s); bch2_verify_btree_nr_keys(b); } @@ -1681,7 +1682,8 @@ struct bkey_packed *bch2_btree_node_iter_prev_all(struct btree_node_iter *iter, struct bset_tree *t; unsigned end = 0; - bch2_btree_node_iter_verify(iter, b); + if (btree_keys_expensive_checks(b)) + bch2_btree_node_iter_verify(iter, b); for_each_bset(b, t) { k = bch2_bkey_prev_all(b, t, @@ -1716,7 +1718,8 @@ found: iter->data[0].k = __btree_node_key_to_offset(b, prev); iter->data[0].end = end; - bch2_btree_node_iter_verify(iter, b); + if (btree_keys_expensive_checks(b)) + bch2_btree_node_iter_verify(iter, b); return prev; } diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index 0c737f35f430..c12f8a6b5205 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -62,13 +62,13 @@ static int bch2_btree_cache_cmp_fn(struct rhashtable_compare_arg *arg, const struct btree *b = obj; const u64 *v = arg->key; - return PTR_HASH(&b->key) == *v ? 0 : 1; + return b->hash_val == *v ? 
0 : 1; } static const struct rhashtable_params bch_btree_cache_params = { .head_offset = offsetof(struct btree, hash), - .key_offset = offsetof(struct btree, key.v), - .key_len = sizeof(struct bch_extent_ptr), + .key_offset = offsetof(struct btree, hash_val), + .key_len = sizeof(u64), .obj_cmpfn = bch2_btree_cache_cmp_fn, }; @@ -114,11 +114,14 @@ void bch2_btree_node_hash_remove(struct btree_cache *bc, struct btree *b) rhashtable_remove_fast(&bc->table, &b->hash, bch_btree_cache_params); /* Cause future lookups for this node to fail: */ - PTR_HASH(&b->key) = 0; + b->hash_val = 0; } int __bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b) { + BUG_ON(b->hash_val); + b->hash_val = btree_ptr_hash_val(&b->key); + return rhashtable_lookup_insert_fast(&bc->table, &b->hash, bch_btree_cache_params); } @@ -144,8 +147,9 @@ __flatten static inline struct btree *btree_cache_find(struct btree_cache *bc, const struct bkey_i *k) { - return rhashtable_lookup_fast(&bc->table, &PTR_HASH(k), - bch_btree_cache_params); + u64 v = btree_ptr_hash_val(k); + + return rhashtable_lookup_fast(&bc->table, &v, bch_btree_cache_params); } /* @@ -199,7 +203,7 @@ static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush) btree_node_wait_on_io(b); } out: - if (PTR_HASH(&b->key) && !ret) + if (b->hash_val && !ret) trace_btree_node_reap(c, b); return ret; out_unlock: @@ -584,6 +588,7 @@ err: static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c, struct btree_iter *iter, const struct bkey_i *k, + enum btree_id btree_id, unsigned level, enum six_lock_type lock_type, bool sync) @@ -591,23 +596,24 @@ static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c, struct btree_cache *bc = &c->btree_cache; struct btree *b; + BUG_ON(level + 1 >= BTREE_MAX_DEPTH); /* * Parent node must be locked, else we could read in a btree node that's * been freed: */ - BUG_ON(!btree_node_locked(iter, level + 1)); - BUG_ON(level >= BTREE_MAX_DEPTH); + if (iter && !bch2_btree_node_relock(iter, level + 1)) + return ERR_PTR(-EINTR); b = bch2_btree_node_mem_alloc(c); if (IS_ERR(b)) return b; bkey_copy(&b->key, k); - if (bch2_btree_node_hash_insert(bc, b, level, iter->btree_id)) { + if (bch2_btree_node_hash_insert(bc, b, level, btree_id)) { /* raced with another fill: */ /* mark as unhashed... */ - PTR_HASH(&b->key) = 0; + b->hash_val = 0; mutex_lock(&bc->lock); list_add(&b->list, &bc->freeable); @@ -619,15 +625,11 @@ static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c, } /* - * If the btree node wasn't cached, we can't drop our lock on - * the parent until after it's added to the cache - because - * otherwise we could race with a btree_split() freeing the node - * we're trying to lock. 
+ * Unlock before doing IO: * - * But the deadlock described below doesn't exist in this case, - * so it's safe to not drop the parent lock until here: + * XXX: ideally should be dropping all btree node locks here */ - if (btree_node_read_locked(iter, level + 1)) + if (iter && btree_node_read_locked(iter, level + 1)) btree_node_unlock(iter, level + 1); bch2_btree_node_read(c, b, sync); @@ -662,16 +664,11 @@ struct btree *bch2_btree_node_get(struct bch_fs *c, struct btree_iter *iter, struct btree *b; struct bset_tree *t; - /* - * XXX: locking optimization - * - * we can make the locking looser here - caller can drop lock on parent - * node before locking child node (and potentially blocking): we just - * have to have bch2_btree_node_fill() call relock on the parent and - * return -EINTR if that fails - */ - EBUG_ON(!btree_node_locked(iter, level + 1)); EBUG_ON(level >= BTREE_MAX_DEPTH); + + b = btree_node_mem_ptr(k); + if (b) + goto lock_node; retry: b = btree_cache_find(bc, k); if (unlikely(!b)) { @@ -680,7 +677,8 @@ retry: * else we could read in a btree node from disk that's been * freed: */ - b = bch2_btree_node_fill(c, iter, k, level, lock_type, true); + b = bch2_btree_node_fill(c, iter, k, iter->btree_id, + level, lock_type, true); /* We raced and found the btree node in the cache */ if (!b) @@ -689,6 +687,7 @@ retry: if (IS_ERR(b)) return b; } else { +lock_node: /* * There's a potential deadlock with splits and insertions into * interior nodes we have to avoid: @@ -710,7 +709,7 @@ retry: * free it: * * To guard against this, btree nodes are evicted from the cache - * when they're freed - and PTR_HASH() is zeroed out, which we + * when they're freed - and b->hash_val is zeroed out, which we * check for after we lock the node. * * Then, bch2_btree_node_relock() on the parent will fail - because @@ -723,7 +722,7 @@ retry: if (!btree_node_lock(b, k->k.p, level, iter, lock_type)) return ERR_PTR(-EINTR); - if (unlikely(PTR_HASH(&b->key) != PTR_HASH(k) || + if (unlikely(b->hash_val != btree_ptr_hash_val(k) || b->level != level || race_fault())) { six_unlock_type(&b->lock, lock_type); @@ -735,6 +734,7 @@ retry: } } + /* XXX: waiting on IO with btree locks held: */ wait_on_bit_io(&b->flags, BTREE_NODE_read_in_flight, TASK_UNINTERRUPTIBLE); @@ -749,7 +749,7 @@ retry: } /* avoid atomic set bit if it's not needed: */ - if (btree_node_accessed(b)) + if (!btree_node_accessed(b)) set_btree_node_accessed(b); if (unlikely(btree_node_read_error(b))) { @@ -764,6 +764,74 @@ retry: return b; } +struct btree *bch2_btree_node_get_noiter(struct bch_fs *c, + const struct bkey_i *k, + enum btree_id btree_id, + unsigned level) +{ + struct btree_cache *bc = &c->btree_cache; + struct btree *b; + struct bset_tree *t; + + EBUG_ON(level >= BTREE_MAX_DEPTH); + + b = btree_node_mem_ptr(k); + if (b) + goto lock_node; +retry: + b = btree_cache_find(bc, k); + if (unlikely(!b)) { + b = bch2_btree_node_fill(c, NULL, k, btree_id, + level, SIX_LOCK_read, true); + + /* We raced and found the btree node in the cache */ + if (!b) + goto retry; + + if (IS_ERR(b)) + return b; + } else { +lock_node: + six_lock_read(&b->lock); + + if (unlikely(b->hash_val != btree_ptr_hash_val(k) || + b->btree_id != btree_id || + b->level != level)) { + six_unlock_read(&b->lock); + goto retry; + } + } + + /* XXX: waiting on IO with btree locks held: */ + wait_on_bit_io(&b->flags, BTREE_NODE_read_in_flight, + TASK_UNINTERRUPTIBLE); + + prefetch(b->aux_data); + + for_each_bset(b, t) { + void *p = (u64 *) b->aux_data + t->aux_data_offset; + + 
prefetch(p + L1_CACHE_BYTES * 0); + prefetch(p + L1_CACHE_BYTES * 1); + prefetch(p + L1_CACHE_BYTES * 2); + } + + /* avoid atomic set bit if it's not needed: */ + if (!btree_node_accessed(b)) + set_btree_node_accessed(b); + + if (unlikely(btree_node_read_error(b))) { + six_unlock_read(&b->lock); + return ERR_PTR(-EIO); + } + + EBUG_ON(b->btree_id != btree_id || + BTREE_NODE_LEVEL(b->data) != level || + bkey_cmp(b->data->max_key, k->k.p)); + + return b; +} + struct btree *bch2_btree_node_get_sibling(struct bch_fs *c, struct btree_iter *iter, struct btree *b, @@ -855,8 +923,7 @@ out: if (sib != btree_prev_sib) swap(n1, n2); - BUG_ON(bkey_cmp(btree_type_successor(n1->btree_id, - n1->key.k.p), + BUG_ON(bkey_cmp(bkey_successor(n1->key.k.p), n2->data->min_key)); } @@ -878,7 +945,8 @@ void bch2_btree_node_prefetch(struct bch_fs *c, struct btree_iter *iter, if (b) return; - bch2_btree_node_fill(c, iter, k, level, SIX_LOCK_read, false); + bch2_btree_node_fill(c, iter, k, iter->btree_id, + level, SIX_LOCK_read, false); } void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c, diff --git a/fs/bcachefs/btree_cache.h b/fs/bcachefs/btree_cache.h index 83358d6a4df8..132cc95a4c02 100644 --- a/fs/bcachefs/btree_cache.h +++ b/fs/bcachefs/btree_cache.h @@ -25,6 +25,9 @@ struct btree *bch2_btree_node_get(struct bch_fs *, struct btree_iter *, const struct bkey_i *, unsigned, enum six_lock_type); +struct btree *bch2_btree_node_get_noiter(struct bch_fs *, const struct bkey_i *, + enum btree_id, unsigned); + struct btree *bch2_btree_node_get_sibling(struct bch_fs *, struct btree_iter *, struct btree *, enum btree_node_sibling); @@ -35,13 +38,29 @@ void bch2_fs_btree_cache_exit(struct bch_fs *); int bch2_fs_btree_cache_init(struct bch_fs *); void bch2_fs_btree_cache_init_early(struct btree_cache *); -#define PTR_HASH(_k) *((u64 *) &bkey_i_to_btree_ptr_c(_k)->v) +static inline u64 btree_ptr_hash_val(const struct bkey_i *k) +{ + switch (k->k.type) { + case KEY_TYPE_btree_ptr: + return *((u64 *) bkey_i_to_btree_ptr_c(k)->v.start); + case KEY_TYPE_btree_ptr_v2: + return bkey_i_to_btree_ptr_v2_c(k)->v.seq; + default: + return 0; + } +} + +static inline struct btree *btree_node_mem_ptr(const struct bkey_i *k) +{ + return k->k.type == KEY_TYPE_btree_ptr_v2 + ? (void *)(unsigned long)bkey_i_to_btree_ptr_v2_c(k)->v.mem_ptr + : NULL; +} /* is btree node in hash table? 
*/ static inline bool btree_node_hashed(struct btree *b) { - return b->key.k.type == KEY_TYPE_btree_ptr && - PTR_HASH(&b->key); + return b->hash_val != 0; } #define for_each_cached_btree(_b, _c, _tbl, _iter, _pos) \ diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 05879b66d6af..cef8e148f784 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -47,65 +47,42 @@ static inline void gc_pos_set(struct bch_fs *c, struct gc_pos new_pos) __gc_pos_set(c, new_pos); } -/* range_checks - for validating min/max pos of each btree node: */ - -struct range_checks { - struct range_level { - struct bpos min; - struct bpos max; - } l[BTREE_MAX_DEPTH]; - unsigned depth; -}; - -static void btree_node_range_checks_init(struct range_checks *r, unsigned depth) +static int bch2_gc_check_topology(struct bch_fs *c, + struct bkey_s_c k, + struct bpos *expected_start, + struct bpos expected_end, + bool is_last) { - unsigned i; - - for (i = 0; i < BTREE_MAX_DEPTH; i++) - r->l[i].min = r->l[i].max = POS_MIN; - r->depth = depth; -} - -static void btree_node_range_checks(struct bch_fs *c, struct btree *b, - struct range_checks *r) -{ - struct range_level *l = &r->l[b->level]; - - struct bpos expected_min = bkey_cmp(l->min, l->max) - ? btree_type_successor(b->btree_id, l->max) - : l->max; - - bch2_fs_inconsistent_on(bkey_cmp(b->data->min_key, expected_min), c, - "btree node has incorrect min key: %llu:%llu != %llu:%llu", - b->data->min_key.inode, - b->data->min_key.offset, - expected_min.inode, - expected_min.offset); - - l->max = b->data->max_key; + int ret = 0; - if (b->level > r->depth) { - l = &r->l[b->level - 1]; + if (k.k->type == KEY_TYPE_btree_ptr_v2) { + struct bkey_s_c_btree_ptr_v2 bp = bkey_s_c_to_btree_ptr_v2(k); - bch2_fs_inconsistent_on(bkey_cmp(b->data->min_key, l->min), c, - "btree node min doesn't match min of child nodes: %llu:%llu != %llu:%llu", - b->data->min_key.inode, - b->data->min_key.offset, - l->min.inode, - l->min.offset); + if (fsck_err_on(bkey_cmp(*expected_start, bp.v->min_key), c, + "btree node with incorrect min_key: got %llu:%llu, should be %llu:%llu", + bp.v->min_key.inode, + bp.v->min_key.offset, + expected_start->inode, + expected_start->offset)) { + BUG(); + } + } - bch2_fs_inconsistent_on(bkey_cmp(b->data->max_key, l->max), c, - "btree node max doesn't match max of child nodes: %llu:%llu != %llu:%llu", - b->data->max_key.inode, - b->data->max_key.offset, - l->max.inode, - l->max.offset); - - if (bkey_cmp(b->data->max_key, POS_MAX)) - l->min = l->max = - btree_type_successor(b->btree_id, - b->data->max_key); + *expected_start = bkey_cmp(k.k->p, POS_MAX) + ? 
bkey_successor(k.k->p) + : k.k->p; + + if (fsck_err_on(is_last && + bkey_cmp(k.k->p, expected_end), c, + "btree node with incorrect max_key: got %llu:%llu, should be %llu:%llu", + k.k->p.inode, + k.k->p.offset, + expected_end.inode, + expected_end.offset)) { + BUG(); } +fsck_err: + return ret; } /* marking of btree keys/nodes: */ @@ -124,7 +101,11 @@ static int bch2_gc_mark_key(struct bch_fs *c, struct bkey_s_c k, BUG_ON(journal_seq_verify(c) && k.k->version.lo > journal_cur_seq(&c->journal)); - if (k.k->version.lo > atomic64_read(&c->key_version)) + /* XXX change to fsck check */ + if (fsck_err_on(k.k->version.lo > atomic64_read(&c->key_version), c, + "key version number higher than recorded: %llu > %llu", + k.k->version.lo, + atomic64_read(&c->key_version))) atomic64_set(&c->key_version, k.k->version.lo); if (test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) || @@ -180,9 +161,10 @@ fsck_err: return ret; } -static int btree_gc_mark_node(struct bch_fs *c, struct btree *b, - u8 *max_stale, bool initial) +static int btree_gc_mark_node(struct bch_fs *c, struct btree *b, u8 *max_stale, + bool initial) { + struct bpos next_node_start = b->data->min_key; struct btree_node_iter iter; struct bkey unpacked; struct bkey_s_c k; @@ -193,13 +175,25 @@ static int btree_gc_mark_node(struct bch_fs *c, struct btree *b, if (!btree_node_type_needs_gc(btree_node_type(b))) return 0; - for_each_btree_node_key_unpack(b, k, &iter, - &unpacked) { + bch2_btree_node_iter_init_from_start(&iter, b); + + while ((k = bch2_btree_node_iter_peek_unpack(&iter, b, &unpacked)).k) { bch2_bkey_debugcheck(c, b, k); ret = bch2_gc_mark_key(c, k, max_stale, initial); if (ret) break; + + bch2_btree_node_iter_advance(&iter, b); + + if (b->level) { + ret = bch2_gc_check_topology(c, k, + &next_node_start, + b->data->max_key, + bch2_btree_node_iter_end(&iter)); + if (ret) + break; + } } return ret; @@ -211,7 +205,6 @@ static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id, struct btree_trans trans; struct btree_iter *iter; struct btree *b; - struct range_checks r; unsigned depth = metadata_only ? 1 : expensive_debug_checks(c) ? 0 : !btree_node_type_needs_gc(btree_id) ? 
1 @@ -223,12 +216,8 @@ static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id, gc_pos_set(c, gc_pos_btree(btree_id, POS_MIN, 0)); - btree_node_range_checks_init(&r, depth); - __for_each_btree_node(&trans, iter, btree_id, POS_MIN, 0, depth, BTREE_ITER_PREFETCH, b) { - btree_node_range_checks(c, b, &r); - bch2_verify_btree_nr_keys(b); gc_pos_set(c, gc_pos_btree_node(b)); @@ -269,40 +258,116 @@ static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id, return ret; } -static inline int btree_id_gc_phase_cmp(enum btree_id l, enum btree_id r) +static int bch2_gc_btree_init_recurse(struct bch_fs *c, struct btree *b, + struct journal_keys *journal_keys, + unsigned target_depth) { - return (int) btree_id_to_gc_phase(l) - - (int) btree_id_to_gc_phase(r); + struct btree_and_journal_iter iter; + struct bkey_s_c k; + struct bpos next_node_start = b->data->min_key; + u8 max_stale = 0; + int ret = 0; + + bch2_btree_and_journal_iter_init_node_iter(&iter, journal_keys, b); + + while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) { + bch2_bkey_debugcheck(c, b, k); + + BUG_ON(bkey_cmp(k.k->p, b->data->min_key) < 0); + BUG_ON(bkey_cmp(k.k->p, b->data->max_key) > 0); + + ret = bch2_gc_mark_key(c, k, &max_stale, true); + if (ret) + break; + + if (b->level) { + struct btree *child; + BKEY_PADDED(k) tmp; + + bkey_reassemble(&tmp.k, k); + k = bkey_i_to_s_c(&tmp.k); + + bch2_btree_and_journal_iter_advance(&iter); + + ret = bch2_gc_check_topology(c, k, + &next_node_start, + b->data->max_key, + !bch2_btree_and_journal_iter_peek(&iter).k); + if (ret) + break; + + if (b->level > target_depth) { + child = bch2_btree_node_get_noiter(c, &tmp.k, + b->btree_id, b->level - 1); + ret = PTR_ERR_OR_ZERO(child); + if (ret) + break; + + ret = bch2_gc_btree_init_recurse(c, child, + journal_keys, target_depth); + six_unlock_read(&child->lock); + + if (ret) + break; + } + } else { + bch2_btree_and_journal_iter_advance(&iter); + } + } + + return ret; } -static int mark_journal_key(struct bch_fs *c, enum btree_id id, - struct bkey_i *insert) +static int bch2_gc_btree_init(struct bch_fs *c, + struct journal_keys *journal_keys, + enum btree_id btree_id, + bool metadata_only) { - struct btree_trans trans; - struct btree_iter *iter; - struct bkey_s_c k; - u8 max_stale; + struct btree *b; + unsigned target_depth = metadata_only ? 1 + : expensive_debug_checks(c) ? 0 + : !btree_node_type_needs_gc(btree_id) ? 
1 + : 0; + u8 max_stale = 0; int ret = 0; - ret = bch2_gc_mark_key(c, bkey_i_to_s_c(insert), &max_stale, true); - if (ret) - return ret; + b = c->btree_roots[btree_id].b; - bch2_trans_init(&trans, c, 0, 0); + if (btree_node_fake(b)) + return 0; - for_each_btree_key(&trans, iter, id, bkey_start_pos(&insert->k), - BTREE_ITER_SLOTS, k, ret) { - percpu_down_read(&c->mark_lock); - ret = bch2_mark_overwrite(&trans, iter, k, insert, NULL, - BTREE_TRIGGER_GC| - BTREE_TRIGGER_NOATOMIC); - percpu_up_read(&c->mark_lock); + six_lock_read(&b->lock); + if (fsck_err_on(bkey_cmp(b->data->min_key, POS_MIN), c, + "btree root with incorrect min_key: %llu:%llu", + b->data->min_key.inode, + b->data->min_key.offset)) { + BUG(); + } - if (!ret) - break; + if (fsck_err_on(bkey_cmp(b->data->max_key, POS_MAX), c, + "btree root with incorrect min_key: %llu:%llu", + b->data->max_key.inode, + b->data->max_key.offset)) { + BUG(); } - return bch2_trans_exit(&trans) ?: ret; + if (b->level >= target_depth) + ret = bch2_gc_btree_init_recurse(c, b, + journal_keys, target_depth); + + if (!ret) + ret = bch2_gc_mark_key(c, bkey_i_to_s_c(&b->key), + &max_stale, true); +fsck_err: + six_unlock_read(&b->lock); + + return ret; +} + +static inline int btree_id_gc_phase_cmp(enum btree_id l, enum btree_id r) +{ + return (int) btree_id_to_gc_phase(l) - + (int) btree_id_to_gc_phase(r); } static int bch2_gc_btrees(struct bch_fs *c, struct journal_keys *journal_keys, @@ -317,24 +382,12 @@ static int bch2_gc_btrees(struct bch_fs *c, struct journal_keys *journal_keys, for (i = 0; i < BTREE_ID_NR; i++) { enum btree_id id = ids[i]; - enum btree_node_type type = __btree_node_type(0, id); - - int ret = bch2_gc_btree(c, id, initial, metadata_only); + int ret = initial + ? bch2_gc_btree_init(c, journal_keys, + id, metadata_only) + : bch2_gc_btree(c, id, initial, metadata_only); if (ret) return ret; - - if (journal_keys && !metadata_only && - btree_node_type_needs_gc(type)) { - struct journal_key *j; - int ret; - - for_each_journal_key(*journal_keys, j) - if (j->btree_id == id) { - ret = mark_journal_key(c, id, j->k); - if (ret) - return ret; - } - } } return 0; @@ -880,7 +933,7 @@ static void bch2_coalesce_nodes(struct bch_fs *c, struct btree_iter *iter, return; } - as = bch2_btree_update_start(c, iter->btree_id, + as = bch2_btree_update_start(iter->trans, iter->btree_id, btree_update_reserve_required(c, parent) + nr_old_nodes, BTREE_INSERT_NOFAIL| BTREE_INSERT_USE_RESERVE, @@ -951,9 +1004,7 @@ static void bch2_coalesce_nodes(struct bch_fs *c, struct btree_iter *iter, n1->key.k.p = n1->data->max_key = bkey_unpack_pos(n1, last); - n2->data->min_key = - btree_type_successor(iter->btree_id, - n1->data->max_key); + n2->data->min_key = bkey_successor(n1->data->max_key); memcpy_u64s(vstruct_last(s1), s2->start, u64s); diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index c126985b6ef5..ac8b98861aae 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -19,6 +19,7 @@ #include "journal_seq_blacklist.h" #include "super-io.h" +#include <linux/sched/mm.h> #include <trace/events/bcachefs.h> static void verify_no_dups(struct btree *b, @@ -68,17 +69,19 @@ static void btree_bounce_free(struct bch_fs *c, unsigned order, static void *btree_bounce_alloc(struct bch_fs *c, unsigned order, bool *used_mempool) { + unsigned flags = memalloc_nofs_save(); void *p; BUG_ON(order > btree_page_order(c)); *used_mempool = false; p = (void *) __get_free_pages(__GFP_NOWARN|GFP_NOWAIT, order); - if (p) - return p; - - *used_mempool = true; - return 
mempool_alloc(&c->btree_bounce_pool, GFP_NOIO); + if (!p) { + *used_mempool = true; + p = mempool_alloc(&c->btree_bounce_pool, GFP_NOIO); + } + memalloc_nofs_restore(flags); + return p; } static void sort_bkey_ptrs(const struct btree *bt, @@ -617,7 +620,7 @@ static void bset_encrypt(struct bch_fs *c, struct bset *i, unsigned offset) bch2_encrypt(c, BSET_CSUM_TYPE(i), nonce, &bn->flags, bytes); - nonce = nonce_add(nonce, round_up(bytes, CHACHA20_BLOCK_SIZE)); + nonce = nonce_add(nonce, round_up(bytes, CHACHA_BLOCK_SIZE)); } bch2_encrypt(c, BSET_CSUM_TYPE(i), nonce, i->_data, @@ -706,76 +709,107 @@ out: \ static int validate_bset(struct bch_fs *c, struct btree *b, struct bset *i, unsigned sectors, - unsigned *whiteout_u64s, int write, - bool have_retry) + int write, bool have_retry) { - struct bkey_packed *k, *prev = NULL; - struct bpos prev_pos = POS_MIN; - struct bpos prev_data = POS_MIN; - bool seen_non_whiteout = false; - unsigned version; + unsigned version = le16_to_cpu(i->version); const char *err; int ret = 0; - if (i == &b->data->keys) { + btree_err_on((version != BCH_BSET_VERSION_OLD && + version < bcachefs_metadata_version_min) || + version >= bcachefs_metadata_version_max, + BTREE_ERR_FATAL, c, b, i, + "unsupported bset version"); + + if (btree_err_on(b->written + sectors > c->opts.btree_node_size, + BTREE_ERR_FIXABLE, c, b, i, + "bset past end of btree node")) { + i->u64s = 0; + return 0; + } + + btree_err_on(b->written && !i->u64s, + BTREE_ERR_FIXABLE, c, b, i, + "empty bset"); + + if (!b->written) { + struct btree_node *bn = + container_of(i, struct btree_node, keys); /* These indicate that we read the wrong btree node: */ - btree_err_on(BTREE_NODE_ID(b->data) != b->btree_id, + btree_err_on(BTREE_NODE_ID(bn) != b->btree_id, BTREE_ERR_MUST_RETRY, c, b, i, "incorrect btree id"); - btree_err_on(BTREE_NODE_LEVEL(b->data) != b->level, + btree_err_on(BTREE_NODE_LEVEL(bn) != b->level, BTREE_ERR_MUST_RETRY, c, b, i, "incorrect level"); if (BSET_BIG_ENDIAN(i) != CPU_BIG_ENDIAN) { - u64 *p = (u64 *) &b->data->ptr; + u64 *p = (u64 *) &bn->ptr; *p = swab64(*p); - bch2_bpos_swab(&b->data->min_key); - bch2_bpos_swab(&b->data->max_key); } - btree_err_on(bkey_cmp(b->data->max_key, b->key.k.p), + if (!write) + compat_btree_node(b->level, b->btree_id, version, + BSET_BIG_ENDIAN(i), write, bn); + + if (b->key.k.type == KEY_TYPE_btree_ptr_v2) { + struct bch_btree_ptr_v2 *bp = + &bkey_i_to_btree_ptr_v2(&b->key)->v; + + btree_err_on(bkey_cmp(b->data->min_key, bp->min_key), + BTREE_ERR_MUST_RETRY, c, b, NULL, + "incorrect min_key: got %llu:%llu should be %llu:%llu", + b->data->min_key.inode, + b->data->min_key.offset, + bp->min_key.inode, + bp->min_key.offset); + } + + btree_err_on(bkey_cmp(bn->max_key, b->key.k.p), BTREE_ERR_MUST_RETRY, c, b, i, "incorrect max key"); + if (write) + compat_btree_node(b->level, b->btree_id, version, + BSET_BIG_ENDIAN(i), write, bn); + /* XXX: ideally we would be validating min_key too */ #if 0 /* * not correct anymore, due to btree node write error * handling * - * need to add b->data->seq to btree keys and verify + * need to add bn->seq to btree keys and verify * against that */ btree_err_on(!extent_contains_ptr(bkey_i_to_s_c_extent(&b->key), - b->data->ptr), + bn->ptr), BTREE_ERR_FATAL, c, b, i, "incorrect backpointer"); #endif - err = bch2_bkey_format_validate(&b->data->format); + err = bch2_bkey_format_validate(&bn->format); btree_err_on(err, BTREE_ERR_FATAL, c, b, i, "invalid bkey format: %s", err); - } - version = le16_to_cpu(i->version); - 
btree_err_on((version != BCH_BSET_VERSION_OLD && - version < bcachefs_metadata_version_min) || - version >= bcachefs_metadata_version_max, - BTREE_ERR_FATAL, c, b, i, - "unsupported bset version"); - - if (btree_err_on(b->written + sectors > c->opts.btree_node_size, - BTREE_ERR_FIXABLE, c, b, i, - "bset past end of btree node")) { - i->u64s = 0; - return 0; + compat_bformat(b->level, b->btree_id, version, + BSET_BIG_ENDIAN(i), write, + &bn->format); } +fsck_err: + return ret; +} - btree_err_on(b->written && !i->u64s, - BTREE_ERR_FIXABLE, c, b, i, - "empty bset"); +static int validate_bset_keys(struct bch_fs *c, struct btree *b, + struct bset *i, unsigned *whiteout_u64s, + int write, bool have_retry) +{ + unsigned version = le16_to_cpu(i->version); + struct bkey_packed *k, *prev = NULL; + bool seen_non_whiteout = false; + int ret = 0; if (!BSET_SEPARATE_WHITEOUTS(i)) { seen_non_whiteout = true; @@ -784,7 +818,7 @@ static int validate_bset(struct bch_fs *c, struct btree *b, for (k = i->start; k != vstruct_last(i);) { - struct bkey_s_c u; + struct bkey_s u; struct bkey tmp; const char *invalid; @@ -804,22 +838,21 @@ static int validate_bset(struct bch_fs *c, struct btree *b, continue; } - if (BSET_BIG_ENDIAN(i) != CPU_BIG_ENDIAN) - bch2_bkey_swab(&b->format, k); - - if (!write && - version < bcachefs_metadata_version_bkey_renumber) - bch2_bkey_renumber(btree_node_type(b), k, write); + /* XXX: validate k->u64s */ + if (!write) + bch2_bkey_compat(b->level, b->btree_id, version, + BSET_BIG_ENDIAN(i), write, + &b->format, k); - u = bkey_disassemble(b, k, &tmp); + u = __bkey_disassemble(b, k, &tmp); - invalid = __bch2_bkey_invalid(c, u, btree_node_type(b)) ?: - bch2_bkey_in_btree_node(b, u) ?: - (write ? bch2_bkey_val_invalid(c, u) : NULL); + invalid = __bch2_bkey_invalid(c, u.s_c, btree_node_type(b)) ?: + bch2_bkey_in_btree_node(b, u.s_c) ?: + (write ? 
bch2_bkey_val_invalid(c, u.s_c) : NULL); if (invalid) { char buf[160]; - bch2_bkey_val_to_text(&PBUF(buf), c, u); + bch2_bkey_val_to_text(&PBUF(buf), c, u.s_c); btree_err(BTREE_ERR_FIXABLE, c, b, i, "invalid bkey:\n%s\n%s", invalid, buf); @@ -829,9 +862,10 @@ static int validate_bset(struct bch_fs *c, struct btree *b, continue; } - if (write && - version < bcachefs_metadata_version_bkey_renumber) - bch2_bkey_renumber(btree_node_type(b), k, write); + if (write) + bch2_bkey_compat(b->level, b->btree_id, version, + BSET_BIG_ENDIAN(i), write, + &b->format, k); /* * with the separate whiteouts thing (used for extents), the @@ -841,29 +875,27 @@ static int validate_bset(struct bch_fs *c, struct btree *b, if (!seen_non_whiteout && (!bkey_whiteout(k) || - (bkey_cmp(prev_pos, bkey_start_pos(u.k)) > 0))) { + (prev && bkey_iter_cmp(b, prev, k) > 0))) { *whiteout_u64s = k->_data - i->_data; seen_non_whiteout = true; - } else if (bkey_cmp(prev_data, bkey_start_pos(u.k)) > 0 || - bkey_cmp(prev_pos, u.k->p) > 0) { + } else if (prev && bkey_iter_cmp(b, prev, k) > 0) { + char buf1[80]; + char buf2[80]; + struct bkey up = bkey_unpack_key(b, prev); + + bch2_bkey_to_text(&PBUF(buf1), &up); + bch2_bkey_to_text(&PBUF(buf2), u.k); + + bch2_dump_bset(b, i, 0); btree_err(BTREE_ERR_FATAL, c, b, i, - "keys out of order: %llu:%llu > %llu:%llu", - prev_pos.inode, - prev_pos.offset, - u.k->p.inode, - bkey_start_offset(u.k)); + "keys out of order: %s > %s", + buf1, buf2); /* XXX: repair this */ } - if (!bkey_deleted(u.k)) - prev_data = u.k->p; - prev_pos = u.k->p; - prev = k; k = bkey_next_skip_noops(k, vstruct_last(i)); } - - SET_BSET_BIG_ENDIAN(i, CPU_BIG_ENDIAN); fsck_err: return ret; } @@ -895,6 +927,15 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry BTREE_ERR_MUST_RETRY, c, b, NULL, "bad btree header"); + if (b->key.k.type == KEY_TYPE_btree_ptr_v2) { + struct bch_btree_ptr_v2 *bp = + &bkey_i_to_btree_ptr_v2(&b->key)->v; + + btree_err_on(b->data->keys.seq != bp->seq, + BTREE_ERR_MUST_RETRY, c, b, NULL, + "got wrong btree node"); + } + while (b->written < c->opts.btree_node_size) { unsigned sectors, whiteout_u64s = 0; struct nonce nonce; @@ -922,8 +963,6 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry set_btree_node_old_extent_overwrite(b); sectors = vstruct_sectors(b->data, c->block_bits); - - btree_node_set_format(b, b->data->format); } else { bne = write_block(b); i = &bne->keys; @@ -947,11 +986,21 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry sectors = vstruct_sectors(bne, c->block_bits); } - ret = validate_bset(c, b, i, sectors, &whiteout_u64s, + ret = validate_bset(c, b, i, sectors, READ, have_retry); if (ret) goto fsck_err; + if (!b->written) + btree_node_set_format(b, b->data->format); + + ret = validate_bset_keys(c, b, i, &whiteout_u64s, + READ, have_retry); + if (ret) + goto fsck_err; + + SET_BSET_BIG_ENDIAN(i, CPU_BIG_ENDIAN); + b->written += sectors; blacklisted = bch2_journal_seq_is_blacklisted(c, @@ -1002,15 +1051,15 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry i = &b->data->keys; for (k = i->start; k != vstruct_last(i);) { struct bkey tmp; - struct bkey_s_c u = bkey_disassemble(b, k, &tmp); - const char *invalid = bch2_bkey_val_invalid(c, u); + struct bkey_s u = __bkey_disassemble(b, k, &tmp); + const char *invalid = bch2_bkey_val_invalid(c, u.s_c); if (invalid || (inject_invalid_keys(c) && !bversion_cmp(u.k->version, MAX_VERSION))) { char buf[160]; - 
bch2_bkey_val_to_text(&PBUF(buf), c, u); + bch2_bkey_val_to_text(&PBUF(buf), c, u.s_c); btree_err(BTREE_ERR_FIXABLE, c, b, i, "invalid bkey %s: %s", buf, invalid); @@ -1023,6 +1072,12 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry continue; } + if (u.k->type == KEY_TYPE_btree_ptr_v2) { + struct bkey_s_btree_ptr_v2 bp = bkey_s_to_btree_ptr_v2(u); + + bp.v->mem_ptr = 0; + } + k = bkey_next_skip_noops(k, vstruct_last(i)); } @@ -1236,7 +1291,6 @@ void bch2_btree_complete_write(struct bch_fs *c, struct btree *b, closure_put(&((struct btree_update *) new)->cl); bch2_journal_pin_drop(&c->journal, &w->journal); - closure_wake_up(&w->wait); } static void btree_node_write_done(struct bch_fs *c, struct btree *b) @@ -1252,8 +1306,6 @@ static void bch2_btree_node_write_error(struct bch_fs *c, { struct btree *b = wbio->wbio.bio.bi_private; __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX) tmp; - struct bkey_i_btree_ptr *new_key; - struct bkey_s_btree_ptr bp; struct bch_extent_ptr *ptr; struct btree_trans trans; struct btree_iter *iter; @@ -1279,16 +1331,13 @@ retry: bkey_copy(&tmp.k, &b->key); - new_key = bkey_i_to_btree_ptr(&tmp.k); - bp = btree_ptr_i_to_s(new_key); - bch2_bkey_drop_ptrs(bkey_i_to_s(&tmp.k), ptr, bch2_dev_list_has_dev(wbio->wbio.failed, ptr->dev)); - if (!bch2_bkey_nr_ptrs(bp.s_c)) + if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(&tmp.k))) goto err; - ret = bch2_btree_node_update_key(c, iter, b, new_key); + ret = bch2_btree_node_update_key(c, iter, b, &tmp.k); if (ret == -EINTR) goto retry; if (ret) @@ -1394,7 +1443,8 @@ static int validate_bset_for_write(struct bch_fs *c, struct btree *b, if (bch2_bkey_invalid(c, bkey_i_to_s_c(&b->key), BKEY_TYPE_BTREE)) return -1; - ret = validate_bset(c, b, i, sectors, &whiteout_u64s, WRITE, false); + ret = validate_bset(c, b, i, sectors, WRITE, false) ?: + validate_bset_keys(c, b, i, &whiteout_u64s, WRITE, false); if (ret) bch2_inconsistent_error(c); @@ -1544,8 +1594,7 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, validate_before_checksum = true; /* validate_bset will be modifying: */ - if (le16_to_cpu(i->version) < - bcachefs_metadata_version_bkey_renumber) + if (le16_to_cpu(i->version) < bcachefs_metadata_version_max) validate_before_checksum = true; /* if we're going to be encrypting, check metadata validity first: */ @@ -1598,9 +1647,6 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, wbio->wbio.bio.bi_end_io = btree_node_write_endio; wbio->wbio.bio.bi_private = b; - if (b->level || !b->written) - wbio->wbio.bio.bi_opf |= REQ_FUA; - bch2_bio_map(&wbio->wbio.bio, data, sectors_to_write << 9); /* @@ -1625,6 +1671,7 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, b->written += sectors_to_write; + /* XXX: submitting IO with btree locks held: */ bch2_submit_wbio_replicas(&wbio->wbio, c, BCH_DATA_BTREE, &k.key); return; err: @@ -1773,12 +1820,11 @@ ssize_t bch2_dirty_btree_nodes_print(struct bch_fs *c, char *buf) rcu_read_lock(); for_each_cached_btree(b, c, tbl, i, pos) { unsigned long flags = READ_ONCE(b->flags); - unsigned idx = (flags & (1 << BTREE_NODE_write_idx)) != 0; if (!(flags & (1 << BTREE_NODE_dirty))) continue; - pr_buf(&out, "%p d %u n %u l %u w %u b %u r %u:%lu c %u p %u\n", + pr_buf(&out, "%p d %u n %u l %u w %u b %u r %u:%lu\n", b, (flags & (1 << BTREE_NODE_dirty)) != 0, (flags & (1 << BTREE_NODE_need_write)) != 0, @@ -1786,9 +1832,7 @@ ssize_t bch2_dirty_btree_nodes_print(struct bch_fs *c, char *buf) b->written, !list_empty_careful(&b->write_blocked), 
b->will_make_reachable != 0, - b->will_make_reachable & 1, - b->writes[ idx].wait.list.first != NULL, - b->writes[!idx].wait.list.first != NULL); + b->will_make_reachable & 1); } rcu_read_unlock(); diff --git a/fs/bcachefs/btree_io.h b/fs/bcachefs/btree_io.h index e90e89eee273..337d2bdd29e8 100644 --- a/fs/bcachefs/btree_io.h +++ b/fs/bcachefs/btree_io.h @@ -2,6 +2,7 @@ #ifndef _BCACHEFS_BTREE_IO_H #define _BCACHEFS_BTREE_IO_H +#include "bkey_methods.h" #include "bset.h" #include "btree_locking.h" #include "extents.h" @@ -102,19 +103,20 @@ bool bch2_btree_post_write_cleanup(struct bch_fs *, struct btree *); void bch2_btree_node_write(struct bch_fs *, struct btree *, enum six_lock_type); -static inline void btree_node_write_if_need(struct bch_fs *c, struct btree *b) +static inline void btree_node_write_if_need(struct bch_fs *c, struct btree *b, + enum six_lock_type lock_held) { while (b->written && btree_node_need_write(b) && btree_node_may_write(b)) { if (!btree_node_write_in_flight(b)) { - bch2_btree_node_write(c, b, SIX_LOCK_read); + bch2_btree_node_write(c, b, lock_held); break; } - six_unlock_read(&b->lock); + six_unlock_type(&b->lock, lock_held); btree_node_wait_on_io(b); - btree_node_lock_type(c, b, SIX_LOCK_read); + btree_node_lock_type(c, b, lock_held); } } @@ -131,7 +133,7 @@ do { \ new |= (1 << BTREE_NODE_need_write); \ } while ((v = cmpxchg(&(_b)->flags, old, new)) != old); \ \ - btree_node_write_if_need(_c, _b); \ + btree_node_write_if_need(_c, _b, SIX_LOCK_read); \ } while (0) void bch2_btree_flush_all_reads(struct bch_fs *); @@ -139,4 +141,50 @@ void bch2_btree_flush_all_writes(struct bch_fs *); void bch2_btree_verify_flushed(struct bch_fs *); ssize_t bch2_dirty_btree_nodes_print(struct bch_fs *, char *); +static inline void compat_bformat(unsigned level, enum btree_id btree_id, + unsigned version, unsigned big_endian, + int write, struct bkey_format *f) +{ + if (version < bcachefs_metadata_version_inode_btree_change && + btree_id == BTREE_ID_INODES) { + swap(f->bits_per_field[BKEY_FIELD_INODE], + f->bits_per_field[BKEY_FIELD_OFFSET]); + swap(f->field_offset[BKEY_FIELD_INODE], + f->field_offset[BKEY_FIELD_OFFSET]); + } +} + +static inline void compat_bpos(unsigned level, enum btree_id btree_id, + unsigned version, unsigned big_endian, + int write, struct bpos *p) +{ + if (big_endian != CPU_BIG_ENDIAN) + bch2_bpos_swab(p); + + if (version < bcachefs_metadata_version_inode_btree_change && + btree_id == BTREE_ID_INODES) + swap(p->inode, p->offset); +} + +static inline void compat_btree_node(unsigned level, enum btree_id btree_id, + unsigned version, unsigned big_endian, + int write, + struct btree_node *bn) +{ + if (version < bcachefs_metadata_version_inode_btree_change && + btree_node_type_is_extents(btree_id) && + bkey_cmp(bn->min_key, POS_MIN) && + write) + bn->min_key = bkey_predecessor(bn->min_key); + + compat_bpos(level, btree_id, version, big_endian, write, &bn->min_key); + compat_bpos(level, btree_id, version, big_endian, write, &bn->max_key); + + if (version < bcachefs_metadata_version_inode_btree_change && + btree_node_type_is_extents(btree_id) && + bkey_cmp(bn->min_key, POS_MIN) && + !write) + bn->min_key = bkey_successor(bn->min_key); +} + #endif /* _BCACHEFS_BTREE_IO_H */ diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index ea0555b806f0..5528ba0f1d44 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -5,6 +5,7 @@ #include "btree_cache.h" #include "btree_iter.h" #include "btree_locking.h" +#include "btree_update.h" #include 
"debug.h" #include "extents.h" @@ -35,6 +36,26 @@ static inline struct bpos btree_iter_search_key(struct btree_iter *iter) return pos; } +static inline bool btree_iter_pos_before_node(struct btree_iter *iter, + struct btree *b) +{ + return bkey_cmp(btree_iter_search_key(iter), b->data->min_key) < 0; +} + +static inline bool btree_iter_pos_after_node(struct btree_iter *iter, + struct btree *b) +{ + return bkey_cmp(b->key.k.p, btree_iter_search_key(iter)) < 0; +} + +static inline bool btree_iter_pos_in_node(struct btree_iter *iter, + struct btree *b) +{ + return iter->btree_id == b->btree_id && + !btree_iter_pos_before_node(iter, b) && + !btree_iter_pos_after_node(iter, b); +} + /* Btree node locking: */ void bch2_btree_node_unlock_write(struct btree *b, struct btree_iter *iter) @@ -241,7 +262,7 @@ bool __bch2_btree_node_lock(struct btree *b, struct bpos pos, /* Btree iterator locking: */ #ifdef CONFIG_BCACHEFS_DEBUG -void bch2_btree_iter_verify_locks(struct btree_iter *iter) +static void bch2_btree_iter_verify_locks(struct btree_iter *iter) { unsigned l; @@ -262,6 +283,8 @@ void bch2_btree_trans_verify_locks(struct btree_trans *trans) trans_for_each_iter(trans, iter) bch2_btree_iter_verify_locks(iter); } +#else +static inline void bch2_btree_iter_verify_locks(struct btree_iter *iter) {} #endif __flatten @@ -385,21 +408,43 @@ void bch2_trans_unlock(struct btree_trans *trans) #ifdef CONFIG_BCACHEFS_DEBUG -static void __bch2_btree_iter_verify(struct btree_iter *iter, - struct btree *b) +static void bch2_btree_iter_verify_level(struct btree_iter *iter, + unsigned level) { struct bpos pos = btree_iter_search_key(iter); - struct btree_iter_level *l = &iter->l[b->level]; + struct btree_iter_level *l = &iter->l[level]; struct btree_node_iter tmp = l->iter; - struct bkey_packed *k; + bool locked = btree_node_locked(iter, level); + struct bkey_packed *p, *k; + char buf1[100], buf2[100]; + const char *msg; if (!debug_check_iterators(iter->trans->c)) return; - if (iter->uptodate > BTREE_ITER_NEED_PEEK) + BUG_ON(iter->level < iter->min_depth); + + if (!btree_iter_node(iter, level)) return; - bch2_btree_node_iter_verify(&l->iter, b); + if (!bch2_btree_node_relock(iter, level)) + return; + + /* + * Ideally this invariant would always be true, and hopefully in the + * future it will be, but for now set_pos_same_leaf() breaks it: + */ + BUG_ON(iter->uptodate < BTREE_ITER_NEED_TRAVERSE && + !btree_iter_pos_in_node(iter, l->b)); + + /* + * node iterators don't use leaf node iterator: + */ + if (btree_iter_type(iter) == BTREE_ITER_NODES && + level <= iter->min_depth) + goto unlock; + + bch2_btree_node_iter_verify(&l->iter, l->b); /* * For interior nodes, the iterator will have skipped past @@ -408,46 +453,73 @@ static void __bch2_btree_iter_verify(struct btree_iter *iter, * For extents, the iterator may have skipped past deleted keys (but not * whiteouts) */ - k = b->level || iter->flags & BTREE_ITER_IS_EXTENTS - ? bch2_btree_node_iter_prev_filter(&tmp, b, KEY_TYPE_discard) - : bch2_btree_node_iter_prev_all(&tmp, b); - if (k && bkey_iter_pos_cmp(b, k, &pos) >= 0) { - char buf[100]; - struct bkey uk = bkey_unpack_key(b, k); + p = level || btree_node_type_is_extents(iter->btree_id) + ? 
bch2_btree_node_iter_prev_filter(&tmp, l->b, KEY_TYPE_discard) + : bch2_btree_node_iter_prev_all(&tmp, l->b); + k = bch2_btree_node_iter_peek_all(&l->iter, l->b); - bch2_bkey_to_text(&PBUF(buf), &uk); - panic("iterator should be before prev key:\n%s\n%llu:%llu\n", - buf, iter->pos.inode, iter->pos.offset); + if (p && bkey_iter_pos_cmp(l->b, p, &pos) >= 0) { + msg = "before"; + goto err; } - k = bch2_btree_node_iter_peek_all(&l->iter, b); - if (k && bkey_iter_pos_cmp(b, k, &pos) < 0) { - char buf[100]; - struct bkey uk = bkey_unpack_key(b, k); + if (k && bkey_iter_pos_cmp(l->b, k, &pos) < 0) { + msg = "after"; + goto err; + } +unlock: + if (!locked) + btree_node_unlock(iter, level); + return; +err: + strcpy(buf1, "(none)"); + strcpy(buf2, "(none)"); + + if (p) { + struct bkey uk = bkey_unpack_key(l->b, p); + bch2_bkey_to_text(&PBUF(buf1), &uk); + } - bch2_bkey_to_text(&PBUF(buf), &uk); - panic("iter should be after current key:\n" - "iter pos %llu:%llu\n" - "cur key %s\n", - iter->pos.inode, iter->pos.offset, buf); + if (k) { + struct bkey uk = bkey_unpack_key(l->b, k); + bch2_bkey_to_text(&PBUF(buf2), &uk); } + + panic("iterator should be %s key at level %u:\n" + "iter pos %s %llu:%llu\n" + "prev key %s\n" + "cur key %s\n", + msg, level, + iter->flags & BTREE_ITER_IS_EXTENTS ? ">" : "=>", + iter->pos.inode, iter->pos.offset, + buf1, buf2); } -void bch2_btree_iter_verify(struct btree_iter *iter, struct btree *b) +static void bch2_btree_iter_verify(struct btree_iter *iter) { - struct btree_iter *linked; + unsigned i; - if (!debug_check_iterators(iter->trans->c)) + bch2_btree_trans_verify_locks(iter->trans); + + for (i = 0; i < BTREE_MAX_DEPTH; i++) + bch2_btree_iter_verify_level(iter, i); +} + +void bch2_btree_trans_verify_iters(struct btree_trans *trans, struct btree *b) +{ + struct btree_iter *iter; + + if (!debug_check_iterators(trans->c)) return; - trans_for_each_iter_with_node(iter->trans, b, linked) - __bch2_btree_iter_verify(linked, b); + trans_for_each_iter_with_node(trans, b, iter) + bch2_btree_iter_verify_level(iter, b->level); } #else -static inline void __bch2_btree_iter_verify(struct btree_iter *iter, - struct btree *b) {} +static inline void bch2_btree_iter_verify_level(struct btree_iter *iter, unsigned l) {} +static inline void bch2_btree_iter_verify(struct btree_iter *iter) {} #endif @@ -492,7 +564,7 @@ void bch2_btree_iter_fix_key_modified(struct btree_iter *iter, trans_for_each_iter_with_node(iter->trans, b, linked) { __bch2_btree_iter_fix_key_modified(linked, b, where); - __bch2_btree_iter_verify(linked, b); + bch2_btree_iter_verify_level(linked, b->level); } } @@ -563,7 +635,7 @@ fixup_done: if (!bch2_btree_node_iter_end(node_iter) && iter_current_key_modified && (b->level || - (iter->flags & BTREE_ITER_IS_EXTENTS))) { + btree_node_type_is_extents(iter->btree_id))) { struct bset_tree *t; struct bkey_packed *k, *k2, *p; @@ -591,19 +663,8 @@ fixup_done: if (!b->level && node_iter == &iter->l[0].iter && - iter_current_key_modified) { - struct bkey_packed *k = - bch2_btree_node_iter_peek_all(node_iter, b); - - if (likely(k)) { - bkey_disassemble(b, k, &iter->k); - } else { - /* XXX: for extents, calculate size of hole? 
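Stripped of key unpacking and locking, the invariant bch2_btree_iter_verify_level() enforces here is a two-sided ordering check around the search position: the key before the node iterator must sort strictly before the position, and the key at the iterator must sort at or after it. A minimal standalone sketch of that check, assuming plain ints stand in for struct bpos and an array index stands in for the node iterator (node_check_pos is illustrative, not bcachefs API):

#include <assert.h>
#include <stddef.h>

/* prev key (if any) < pos, cur key (if any) >= pos: */
static void node_check_pos(const int *keys, size_t nr, size_t idx, int pos)
{
        assert(idx == 0  || keys[idx - 1] < pos);
        assert(idx == nr || keys[idx] >= pos);
}

int main(void)
{
        const int keys[] = { 1, 3, 5, 7 };

        node_check_pos(keys, 4, 2, 4);  /* iterator at 5, searching for 4 */
        node_check_pos(keys, 4, 2, 5);  /* exact match */
        node_check_pos(keys, 4, 4, 8);  /* iterator at end of node */
        return 0;
}

The "before" and "after" messages in the panic correspond to the first and second check failing, respectively.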
*/ - iter->k.type = KEY_TYPE_deleted; - } - + iter_current_key_modified) btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK); - } } void bch2_btree_node_iter_fix(struct btree_iter *iter, @@ -619,14 +680,16 @@ void bch2_btree_node_iter_fix(struct btree_iter *iter, if (node_iter != &iter->l[b->level].iter) { __bch2_btree_node_iter_fix(iter, b, node_iter, t, where, clobber_u64s, new_u64s); - bch2_btree_node_iter_verify(node_iter, b); + + if (debug_check_iterators(iter->trans->c)) + bch2_btree_node_iter_verify(node_iter, b); } trans_for_each_iter_with_node(iter->trans, b, linked) { __bch2_btree_node_iter_fix(linked, b, &linked->l[b->level].iter, t, where, clobber_u64s, new_u64s); - __bch2_btree_iter_verify(linked, b); + bch2_btree_iter_verify_level(linked, b->level); } } @@ -736,26 +799,6 @@ static void btree_iter_verify_new_node(struct btree_iter *iter, struct btree *b) btree_node_unlock(iter, b->level + 1); } -static inline bool btree_iter_pos_before_node(struct btree_iter *iter, - struct btree *b) -{ - return bkey_cmp(iter->pos, b->data->min_key) < 0; -} - -static inline bool btree_iter_pos_after_node(struct btree_iter *iter, - struct btree *b) -{ - return bkey_cmp(b->key.k.p, btree_iter_search_key(iter)) < 0; -} - -static inline bool btree_iter_pos_in_node(struct btree_iter *iter, - struct btree *b) -{ - return iter->btree_id == b->btree_id && - !btree_iter_pos_before_node(iter, b) && - !btree_iter_pos_after_node(iter, b); -} - static inline void __btree_iter_init(struct btree_iter *iter, unsigned level) { @@ -912,6 +955,27 @@ static void btree_iter_prefetch(struct btree_iter *iter) btree_node_unlock(iter, iter->level); } +static noinline void btree_node_mem_ptr_set(struct btree_iter *iter, + unsigned plevel, struct btree *b) +{ + struct btree_iter_level *l = &iter->l[plevel]; + bool locked = btree_node_locked(iter, plevel); + struct bkey_packed *k; + struct bch_btree_ptr_v2 *bp; + + if (!bch2_btree_node_relock(iter, plevel)) + return; + + k = bch2_btree_node_iter_peek_all(&l->iter, l->b); + BUG_ON(k->type != KEY_TYPE_btree_ptr_v2); + + bp = (void *) bkeyp_val(&l->b->format, k); + bp->mem_ptr = (unsigned long)b; + + if (!locked) + btree_node_unlock(iter, plevel); +} + static __always_inline int btree_iter_down(struct btree_iter *iter) { struct bch_fs *c = iter->trans->c; @@ -933,6 +997,10 @@ static __always_inline int btree_iter_down(struct btree_iter *iter) mark_btree_node_locked(iter, level, lock_type); btree_iter_node_set(iter, b); + if (tmp.k.k.type == KEY_TYPE_btree_ptr_v2 && + unlikely(b != btree_node_mem_ptr(&tmp.k))) + btree_node_mem_ptr_set(iter, level + 1, b); + if (iter->flags & BTREE_ITER_PREFETCH) btree_iter_prefetch(iter); @@ -1000,7 +1068,14 @@ retry_all: goto retry_all; } - ret = hweight64(trans->iters_live) > 1 ? 
-EINTR : 0; + if (hweight64(trans->iters_live) > 1) + ret = -EINTR; + else + trans_for_each_iter(trans, iter) + if (iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT) { + ret = -EINTR; + break; + } out: bch2_btree_cache_cannibalize_unlock(c); return ret; @@ -1107,9 +1182,7 @@ static int btree_iter_traverse_one(struct btree_iter *iter) iter->uptodate = BTREE_ITER_NEED_PEEK; - bch2_btree_trans_verify_locks(iter->trans); - if (btree_iter_node(iter, iter->level)) - __bch2_btree_iter_verify(iter, iter->l[iter->level].b); + bch2_btree_iter_verify(iter); return 0; } @@ -1129,12 +1202,14 @@ static inline void bch2_btree_iter_checks(struct btree_iter *iter, enum btree_iter_type type) { EBUG_ON(iter->btree_id >= BTREE_ID_NR); - EBUG_ON(!!(iter->flags & BTREE_ITER_IS_EXTENTS) != - (btree_node_type_is_extents(iter->btree_id) && - type != BTREE_ITER_NODES)); EBUG_ON(btree_iter_type(iter) != type); - bch2_btree_trans_verify_locks(iter->trans); + BUG_ON(type == BTREE_ITER_KEYS && + (bkey_cmp(iter->pos, bkey_start_pos(&iter->k)) < 0 || + bkey_cmp(iter->pos, iter->k.p) > 0)); + + bch2_btree_iter_verify_locks(iter); + bch2_btree_iter_verify_level(iter, iter->level); } /* Iterate across nodes (leaf and interior nodes) */ @@ -1162,10 +1237,12 @@ struct btree *bch2_btree_iter_peek_node(struct btree_iter *iter) iter->pos = b->key.k.p; iter->uptodate = BTREE_ITER_UPTODATE; + bch2_btree_iter_verify(iter); + return b; } -struct btree *bch2_btree_iter_next_node(struct btree_iter *iter, unsigned depth) +struct btree *bch2_btree_iter_next_node(struct btree_iter *iter) { struct btree *b; int ret; @@ -1207,11 +1284,8 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter, unsigned depth) if (btree_node_read_locked(iter, iter->level)) btree_node_unlock(iter, iter->level); - /* ick: */ - iter->pos = iter->btree_id == BTREE_ID_INODES - ? 
btree_type_successor(iter->btree_id, iter->pos) - : bkey_successor(iter->pos); - iter->level = depth; + iter->pos = bkey_successor(iter->pos); + iter->level = iter->min_depth; btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE); ret = bch2_btree_iter_traverse(iter); @@ -1224,6 +1298,8 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter, unsigned depth) iter->pos = b->key.k.p; iter->uptodate = BTREE_ITER_UPTODATE; + bch2_btree_iter_verify(iter); + return b; } @@ -1238,7 +1314,8 @@ void bch2_btree_iter_set_pos_same_leaf(struct btree_iter *iter, struct bpos new_ EBUG_ON(!btree_node_locked(iter, 0)); EBUG_ON(bkey_cmp(new_pos, l->b->key.k.p) > 0); - iter->pos = new_pos; + bkey_init(&iter->k); + iter->k.p = iter->pos = new_pos; btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK); btree_iter_advance_to_pos(iter, l, -1); @@ -1248,9 +1325,14 @@ void bch2_btree_iter_set_pos_same_leaf(struct btree_iter *iter, struct bpos new_ btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE); } -static unsigned btree_iter_pos_changed(struct btree_iter *iter, int cmp) +static void btree_iter_pos_changed(struct btree_iter *iter, int cmp) { - unsigned l = btree_iter_up_until_good_node(iter, cmp); + unsigned l = iter->level; + + if (!cmp) + goto out; + + l = btree_iter_up_until_good_node(iter, cmp); if (btree_iter_node(iter, l)) { /* @@ -1267,64 +1349,81 @@ static unsigned btree_iter_pos_changed(struct btree_iter *iter, int cmp) if (btree_lock_want(iter, l) == BTREE_NODE_UNLOCKED) btree_node_unlock(iter, l); } +out: + if (l != iter->level) + btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE); + else + btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK); +} - return l; +void __bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos, + bool strictly_greater) +{ + struct bpos old = btree_iter_search_key(iter); + int cmp; + + iter->flags &= ~BTREE_ITER_IS_EXTENTS; + iter->flags |= strictly_greater ? 
BTREE_ITER_IS_EXTENTS : 0; + + bkey_init(&iter->k); + iter->k.p = iter->pos = new_pos; + + cmp = bkey_cmp(btree_iter_search_key(iter), old); + + btree_iter_pos_changed(iter, cmp); } void bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos) { int cmp = bkey_cmp(new_pos, iter->pos); - unsigned l; - - if (!cmp) - return; - - iter->pos = new_pos; - l = btree_iter_pos_changed(iter, cmp); + bkey_init(&iter->k); + iter->k.p = iter->pos = new_pos; - if (l != iter->level) - btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE); - else - btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK); + btree_iter_pos_changed(iter, cmp); } static inline bool btree_iter_set_pos_to_next_leaf(struct btree_iter *iter) { struct btree_iter_level *l = &iter->l[0]; + bool ret; - iter->pos = l->b->key.k.p; - iter->uptodate = BTREE_ITER_NEED_TRAVERSE; + bkey_init(&iter->k); + iter->k.p = iter->pos = l->b->key.k.p; - if (!bkey_cmp(iter->pos, POS_MAX)) { - bkey_init(&iter->k); - iter->k.p = POS_MAX; - return false; - } + ret = bkey_cmp(iter->pos, POS_MAX) != 0; + if (ret && !(iter->flags & BTREE_ITER_IS_EXTENTS)) + iter->k.p = iter->pos = bkey_successor(iter->pos); - iter->pos = btree_type_successor(iter->btree_id, iter->pos); btree_iter_pos_changed(iter, 1); - return true; + return ret; } static inline bool btree_iter_set_pos_to_prev_leaf(struct btree_iter *iter) { struct btree_iter_level *l = &iter->l[0]; + bool ret; - iter->pos = l->b->data->min_key; + bkey_init(&iter->k); + iter->k.p = iter->pos = l->b->data->min_key; iter->uptodate = BTREE_ITER_NEED_TRAVERSE; - if (!bkey_cmp(iter->pos, POS_MIN)) { - bkey_init(&iter->k); - iter->k.p = POS_MIN; - return false; + ret = bkey_cmp(iter->pos, POS_MIN) != 0; + if (ret) { + iter->k.p = iter->pos = bkey_predecessor(iter->pos); + + if (iter->flags & BTREE_ITER_IS_EXTENTS) + iter->k.p = iter->pos = bkey_predecessor(iter->pos); } - iter->pos = btree_type_predecessor(iter->btree_id, iter->pos); btree_iter_pos_changed(iter, -1); - return true; + return ret; } +/** + * btree_iter_peek_uptodate - given an iterator that is uptodate, return the key + * it currently points to + */ static inline struct bkey_s_c btree_iter_peek_uptodate(struct btree_iter *iter) { struct btree_iter_level *l = &iter->l[0]; @@ -1361,7 +1460,8 @@ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) bch2_btree_iter_checks(iter, BTREE_ITER_KEYS); - if (iter->uptodate == BTREE_ITER_UPTODATE) + if (iter->uptodate == BTREE_ITER_UPTODATE && + !bkey_deleted(&iter->k)) return btree_iter_peek_uptodate(iter); while (1) { @@ -1386,6 +1486,8 @@ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) iter->pos = bkey_start_pos(k.k); iter->uptodate = BTREE_ITER_UPTODATE; + + bch2_btree_iter_verify_level(iter, 0); return k; } @@ -1395,52 +1497,101 @@ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) */ struct bkey_s_c bch2_btree_iter_next(struct btree_iter *iter) { + if (unlikely(!bkey_cmp(iter->k.p, POS_MAX))) + return bkey_s_c_null; + + bch2_btree_iter_set_pos(iter, + (iter->flags & BTREE_ITER_IS_EXTENTS) + ? 
iter->k.p + : bkey_successor(iter->k.p)); + + return bch2_btree_iter_peek(iter); +} + +static struct bkey_s_c __btree_trans_updates_peek(struct btree_iter *iter) +{ + struct bpos pos = btree_iter_search_key(iter); + struct btree_trans *trans = iter->trans; + struct btree_insert_entry *i; + + trans_for_each_update2(trans, i) + if ((cmp_int(iter->btree_id, i->iter->btree_id) ?: + bkey_cmp(pos, i->k->k.p)) <= 0) + break; + + return i < trans->updates2 + trans->nr_updates2 && + iter->btree_id == i->iter->btree_id + ? bkey_i_to_s_c(i->k) + : bkey_s_c_null; +} + +static struct bkey_s_c __bch2_btree_iter_peek_with_updates(struct btree_iter *iter) +{ struct btree_iter_level *l = &iter->l[0]; - struct bkey_packed *p; + struct bkey_s_c k = __btree_iter_peek(iter, l); + struct bkey_s_c u = __btree_trans_updates_peek(iter); + + if (k.k && (!u.k || bkey_cmp(k.k->p, u.k->p) < 0)) + return k; + if (u.k && bkey_cmp(u.k->p, l->b->key.k.p) <= 0) { + iter->k = *u.k; + return u; + } + return bkey_s_c_null; +} + +struct bkey_s_c bch2_btree_iter_peek_with_updates(struct btree_iter *iter) +{ struct bkey_s_c k; + int ret; bch2_btree_iter_checks(iter, BTREE_ITER_KEYS); - if (unlikely(iter->uptodate != BTREE_ITER_UPTODATE)) { - if (unlikely(!bkey_cmp(iter->k.p, POS_MAX))) - return bkey_s_c_null; + while (1) { + ret = bch2_btree_iter_traverse(iter); + if (unlikely(ret)) + return bkey_s_c_err(ret); - /* - * XXX: when we just need to relock we should be able to avoid - * calling traverse, but we need to kill BTREE_ITER_NEED_PEEK - * for that to work - */ - iter->uptodate = BTREE_ITER_NEED_TRAVERSE; + k = __bch2_btree_iter_peek_with_updates(iter); - bch2_btree_iter_set_pos(iter, - btree_type_successor(iter->btree_id, iter->k.p)); + if (k.k && bkey_deleted(k.k)) { + bch2_btree_iter_set_pos(iter, + (iter->flags & BTREE_ITER_IS_EXTENTS) + ? iter->k.p + : bkey_successor(iter->k.p)); + continue; + } - return bch2_btree_iter_peek(iter); - } + if (likely(k.k)) + break; - if (unlikely(bkey_deleted(&iter->k))) { - /* - * we're currently pointed at a hole, because previously we were - * iterating over slots: - */ - return bch2_btree_iter_peek(iter); + if (!btree_iter_set_pos_to_next_leaf(iter)) + return bkey_s_c_null; } - do { - bch2_btree_node_iter_advance(&l->iter, l->b); - p = bch2_btree_node_iter_peek_all(&l->iter, l->b); - } while (likely(p) && bkey_whiteout(p)); + /* + * iter->pos should always be equal to the key we just + * returned - except extents can straddle iter->pos: + */ + if (!(iter->flags & BTREE_ITER_IS_EXTENTS) || + bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0) + iter->pos = bkey_start_pos(k.k); - if (unlikely(!p)) - return btree_iter_set_pos_to_next_leaf(iter) - ? bch2_btree_iter_peek(iter) - : bkey_s_c_null; + iter->uptodate = BTREE_ITER_UPTODATE; + return k; +} - k = __btree_iter_unpack(iter, l, &iter->k, p); +struct bkey_s_c bch2_btree_iter_next_with_updates(struct btree_iter *iter) +{ + if (unlikely(!bkey_cmp(iter->k.p, POS_MAX))) + return bkey_s_c_null; - EBUG_ON(bkey_cmp(bkey_start_pos(k.k), iter->pos) < 0); - iter->pos = bkey_start_pos(k.k); - return k; + bch2_btree_iter_set_pos(iter, + (iter->flags & BTREE_ITER_IS_EXTENTS) + ? 
iter->k.p + : bkey_successor(iter->k.p)); + + return bch2_btree_iter_peek_with_updates(iter); } /** @@ -1449,13 +1600,15 @@ struct bkey_s_c bch2_btree_iter_next(struct btree_iter *iter) */ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) { + struct bpos pos = iter->pos; struct btree_iter_level *l = &iter->l[0]; struct bkey_s_c k; int ret; bch2_btree_iter_checks(iter, BTREE_ITER_KEYS); - if (iter->uptodate == BTREE_ITER_UPTODATE) + if (iter->uptodate == BTREE_ITER_UPTODATE && + !bkey_deleted(&iter->k)) return btree_iter_peek_uptodate(iter); while (1) { @@ -1464,8 +1617,7 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) return bkey_s_c_err(ret); k = __btree_iter_peek(iter, l); - if (!k.k || - bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0) + if (!k.k || bkey_cmp(bkey_start_pos(k.k), pos) > 0) k = __btree_iter_prev(iter, l); if (likely(k.k)) @@ -1475,7 +1627,7 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) return bkey_s_c_null; } - EBUG_ON(bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0); + EBUG_ON(bkey_cmp(bkey_start_pos(k.k), pos) > 0); iter->pos = bkey_start_pos(k.k); iter->uptodate = BTREE_ITER_UPTODATE; return k; @@ -1487,33 +1639,16 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) */ struct bkey_s_c bch2_btree_iter_prev(struct btree_iter *iter) { - struct btree_iter_level *l = &iter->l[0]; - struct bkey_s_c k; + struct bpos pos = bkey_start_pos(&iter->k); bch2_btree_iter_checks(iter, BTREE_ITER_KEYS); - if (unlikely(iter->uptodate != BTREE_ITER_UPTODATE)) { - /* - * XXX: when we just need to relock we should be able to avoid - * calling traverse, but we need to kill BTREE_ITER_NEED_PEEK - * for that to work - */ - iter->pos = btree_type_predecessor(iter->btree_id, - iter->pos); - iter->uptodate = BTREE_ITER_NEED_TRAVERSE; - - return bch2_btree_iter_peek_prev(iter); - } + if (unlikely(!bkey_cmp(pos, POS_MIN))) + return bkey_s_c_null; - k = __btree_iter_prev(iter, l); - if (unlikely(!k.k)) - return btree_iter_set_pos_to_prev_leaf(iter) - ? 
bch2_btree_iter_peek(iter) - : bkey_s_c_null; + bch2_btree_iter_set_pos(iter, bkey_predecessor(pos)); - EBUG_ON(bkey_cmp(bkey_start_pos(k.k), iter->pos) >= 0); - iter->pos = bkey_start_pos(k.k); - return k; + return bch2_btree_iter_peek_prev(iter); } static inline struct bkey_s_c @@ -1525,8 +1660,17 @@ __bch2_btree_iter_peek_slot_extents(struct btree_iter *iter) struct bkey n; int ret; -recheck: - btree_iter_advance_to_pos(iter, l, -1); + /* keys & holes can't span inode numbers: */ + if (iter->pos.offset == KEY_OFFSET_MAX) { + if (iter->pos.inode == KEY_INODE_MAX) + return bkey_s_c_null; + + bch2_btree_iter_set_pos(iter, bkey_successor(iter->pos)); + + ret = bch2_btree_iter_traverse(iter); + if (unlikely(ret)) + return bkey_s_c_err(ret); + } /* * iterator is now at the correct position for inserting at iter->pos, @@ -1540,47 +1684,17 @@ recheck: if (k.k && bkey_cmp(bkey_start_pos(k.k), iter->pos) <= 0) { /* - * If there wasn't actually a hole, want the iterator to be - * pointed at the key we found: - * - * XXX: actually, we shouldn't be changing the iterator here: - * the iterator needs to be correct for inserting at iter->pos, - * and there may be whiteouts between iter->pos and what this - * iterator points at: + * We're not setting iter->uptodate because the node iterator + * doesn't necessarily point at the key we're returning: */ - l->iter = node_iter; EBUG_ON(bkey_cmp(k.k->p, iter->pos) <= 0); - iter->uptodate = BTREE_ITER_UPTODATE; - - __bch2_btree_iter_verify(iter, l->b); + bch2_btree_iter_verify_level(iter, 0); return k; } - /* - * If we got to the end of the node, check if we need to traverse to the - * next node: - */ - if (unlikely(!k.k && btree_iter_pos_after_node(iter, l->b))) { - btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE); - ret = bch2_btree_iter_traverse(iter); - if (unlikely(ret)) - return bkey_s_c_err(ret); - - goto recheck; - } - /* hole */ - /* holes can't span inode numbers: */ - if (iter->pos.offset == KEY_OFFSET_MAX) { - if (iter->pos.inode == KEY_INODE_MAX) - return bkey_s_c_null; - - iter->pos = bkey_successor(iter->pos); - goto recheck; - } - if (!k.k) k.k = &l->b->key.k; @@ -1598,42 +1712,33 @@ recheck: iter->k = n; iter->uptodate = BTREE_ITER_UPTODATE; - __bch2_btree_iter_verify(iter, l->b); + bch2_btree_iter_verify_level(iter, 0); return (struct bkey_s_c) { &iter->k, NULL }; } -static inline struct bkey_s_c -__bch2_btree_iter_peek_slot(struct btree_iter *iter) +struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) { struct btree_iter_level *l = &iter->l[0]; struct bkey_s_c k; int ret; + bch2_btree_iter_checks(iter, BTREE_ITER_KEYS); + + if (iter->uptodate == BTREE_ITER_UPTODATE) + return btree_iter_peek_uptodate(iter); + + ret = bch2_btree_iter_traverse(iter); + if (unlikely(ret)) + return bkey_s_c_err(ret); + if (iter->flags & BTREE_ITER_IS_EXTENTS) return __bch2_btree_iter_peek_slot_extents(iter); -recheck: - while ((k = __btree_iter_peek_all(iter, l, &iter->k)).k && - bkey_deleted(k.k) && - bkey_cmp(k.k->p, iter->pos) == 0) - bch2_btree_node_iter_advance(&l->iter, l->b); - - /* - * If we got to the end of the node, check if we need to traverse to the - * next node: - */ - if (unlikely(!k.k && btree_iter_pos_after_node(iter, l->b))) { - btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE); - ret = bch2_btree_iter_traverse(iter); - if (unlikely(ret)) - return bkey_s_c_err(ret); + k = __btree_iter_peek_all(iter, l, &iter->k); - goto recheck; - } + EBUG_ON(k.k && bkey_deleted(k.k) && bkey_cmp(k.k->p, iter->pos) == 0); - if (!k.k || - 
bkey_deleted(k.k) || - bkey_cmp(iter->pos, k.k->p)) { + if (!k.k || bkey_cmp(iter->pos, k.k->p)) { /* hole */ bkey_init(&iter->k); iter->k.p = iter->pos; @@ -1641,49 +1746,21 @@ recheck: } iter->uptodate = BTREE_ITER_UPTODATE; - __bch2_btree_iter_verify(iter, l->b); + bch2_btree_iter_verify_level(iter, 0); return k; } -struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) -{ - int ret; - - bch2_btree_iter_checks(iter, BTREE_ITER_KEYS); - - if (iter->uptodate == BTREE_ITER_UPTODATE) - return btree_iter_peek_uptodate(iter); - - ret = bch2_btree_iter_traverse(iter); - if (unlikely(ret)) - return bkey_s_c_err(ret); - - return __bch2_btree_iter_peek_slot(iter); -} - struct bkey_s_c bch2_btree_iter_next_slot(struct btree_iter *iter) { - bch2_btree_iter_checks(iter, BTREE_ITER_KEYS); - - iter->pos = btree_type_successor(iter->btree_id, iter->k.p); - - if (unlikely(iter->uptodate != BTREE_ITER_UPTODATE)) { - /* - * XXX: when we just need to relock we should be able to avoid - * calling traverse, but we need to kill BTREE_ITER_NEED_PEEK - * for that to work - */ - btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE); - - return bch2_btree_iter_peek_slot(iter); - } + if (unlikely(!bkey_cmp(iter->k.p, POS_MAX))) + return bkey_s_c_null; - if (!bkey_deleted(&iter->k)) - bch2_btree_node_iter_advance(&iter->l[0].iter, iter->l[0].b); + bch2_btree_iter_set_pos(iter, + (iter->flags & BTREE_ITER_IS_EXTENTS) + ? iter->k.p + : bkey_successor(iter->k.p)); - btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK); - - return __bch2_btree_iter_peek_slot(iter); + return bch2_btree_iter_peek_slot(iter); } static inline void bch2_btree_iter_init(struct btree_trans *trans, @@ -1705,12 +1782,12 @@ static inline void bch2_btree_iter_init(struct btree_trans *trans, iter->uptodate = BTREE_ITER_NEED_TRAVERSE; iter->btree_id = btree_id; iter->level = 0; + iter->min_depth = 0; iter->locks_want = flags & BTREE_ITER_INTENT ? 
1 : 0; iter->nodes_locked = 0; iter->nodes_intent_locked = 0; for (i = 0; i < ARRAY_SIZE(iter->l); i++) - iter->l[i].b = NULL; - iter->l[iter->level].b = BTREE_ITER_NO_NODE_INIT; + iter->l[i].b = BTREE_ITER_NO_NODE_INIT; prefetch(c->btree_roots[btree_id].b); } @@ -1729,7 +1806,14 @@ static inline void __bch2_trans_iter_free(struct btree_trans *trans, int bch2_trans_iter_put(struct btree_trans *trans, struct btree_iter *iter) { - int ret = btree_iter_err(iter); + int ret; + + if (IS_ERR_OR_NULL(iter)) + return 0; + + BUG_ON(trans->iters + iter->idx != iter); + + ret = btree_iter_err(iter); if (!(trans->iters_touched & (1ULL << iter->idx)) && !(iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT)) @@ -1742,6 +1826,9 @@ int bch2_trans_iter_put(struct btree_trans *trans, int bch2_trans_iter_free(struct btree_trans *trans, struct btree_iter *iter) { + if (IS_ERR_OR_NULL(iter)) + return 0; + trans->iters_touched &= ~(1ULL << iter->idx); return bch2_trans_iter_put(trans, iter); @@ -1750,7 +1837,7 @@ int bch2_trans_iter_free(struct btree_trans *trans, static int bch2_trans_realloc_iters(struct btree_trans *trans, unsigned new_size) { - void *new_iters, *new_updates; + void *p, *new_iters, *new_updates, *new_updates2; size_t iters_bytes; size_t updates_bytes; @@ -1768,21 +1855,27 @@ static int bch2_trans_realloc_iters(struct btree_trans *trans, iters_bytes = sizeof(struct btree_iter) * new_size; updates_bytes = sizeof(struct btree_insert_entry) * new_size; - new_iters = kmalloc(iters_bytes + updates_bytes, GFP_NOFS); - if (new_iters) + p = kmalloc(iters_bytes + + updates_bytes + + updates_bytes, GFP_NOFS); + if (p) goto success; - new_iters = mempool_alloc(&trans->c->btree_iters_pool, GFP_NOFS); + p = mempool_alloc(&trans->c->btree_iters_pool, GFP_NOFS); new_size = BTREE_ITER_MAX; trans->used_mempool = true; success: - new_updates = new_iters + iters_bytes; + new_iters = p; p += iters_bytes; + new_updates = p; p += updates_bytes; + new_updates2 = p; p += updates_bytes; memcpy(new_iters, trans->iters, sizeof(struct btree_iter) * trans->nr_iters); memcpy(new_updates, trans->updates, sizeof(struct btree_insert_entry) * trans->nr_updates); + memcpy(new_updates2, trans->updates2, + sizeof(struct btree_insert_entry) * trans->nr_updates2); if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) memset(trans->iters, POISON_FREE, @@ -1794,6 +1887,7 @@ success: trans->iters = new_iters; trans->updates = new_updates; + trans->updates2 = new_updates2; trans->size = new_size; if (trans->iters_live) { @@ -1818,13 +1912,14 @@ static struct btree_iter *btree_trans_iter_alloc(struct btree_trans *trans) struct btree_iter *iter; trans_for_each_iter(trans, iter) { - pr_err("iter: btree %s pos %llu:%llu%s%s%s", + pr_err("iter: btree %s pos %llu:%llu%s%s%s %pf", bch2_btree_ids[iter->btree_id], iter->pos.inode, iter->pos.offset, (trans->iters_live & (1ULL << iter->idx)) ? " live" : "", (trans->iters_touched & (1ULL << iter->idx)) ? " touched" : "", - iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT ? " keep" : ""); + iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT ? 
" keep" : "", + (void *) iter->ip_allocated); } panic("trans iter oveflow\n"); @@ -1931,15 +2026,16 @@ static struct btree_iter *__btree_trans_get_iter(struct btree_trans *trans, return iter; } -struct btree_iter *bch2_trans_get_iter(struct btree_trans *trans, - enum btree_id btree_id, - struct bpos pos, unsigned flags) +struct btree_iter *__bch2_trans_get_iter(struct btree_trans *trans, + enum btree_id btree_id, + struct bpos pos, unsigned flags) { struct btree_iter *iter = __btree_trans_get_iter(trans, btree_id, pos, flags); if (!IS_ERR(iter)) - bch2_btree_iter_set_pos(iter, pos); + __bch2_btree_iter_set_pos(iter, pos, + btree_node_type_is_extents(btree_id)); return iter; } @@ -1960,6 +2056,7 @@ struct btree_iter *bch2_trans_get_node_iter(struct btree_trans *trans, iter->locks_want = locks_want; iter->level = depth; + iter->min_depth = depth; for (i = 0; i < ARRAY_SIZE(iter->l); i++) iter->l[i].b = NULL; @@ -1968,7 +2065,7 @@ struct btree_iter *bch2_trans_get_node_iter(struct btree_trans *trans, return iter; } -struct btree_iter *bch2_trans_copy_iter(struct btree_trans *trans, +struct btree_iter *__bch2_trans_copy_iter(struct btree_trans *trans, struct btree_iter *src) { struct btree_iter *iter; @@ -1981,8 +2078,8 @@ struct btree_iter *bch2_trans_copy_iter(struct btree_trans *trans, trans->iters_live |= 1ULL << iter->idx; /* - * Don't mark it as touched, we don't need to preserve this iter since - * it's cheap to copy it again: + * We don't need to preserve this iter since it's cheap to copy it + * again - this will cause trans_iter_put() to free it right away: */ trans->iters_touched &= ~(1ULL << iter->idx); @@ -2049,16 +2146,12 @@ void bch2_trans_reset(struct btree_trans *trans, unsigned flags) bch2_trans_unlink_iters(trans); - if (flags & TRANS_RESET_ITERS) - trans->iters_live = 0; - trans->iters_touched &= trans->iters_live; trans->need_reset = 0; trans->nr_updates = 0; - - if (flags & TRANS_RESET_MEM) - trans->mem_top = 0; + trans->nr_updates2 = 0; + trans->mem_top = 0; if (trans->fs_usage_deltas) { trans->fs_usage_deltas->used = 0; @@ -2077,11 +2170,18 @@ void bch2_trans_init(struct btree_trans *trans, struct bch_fs *c, { memset(trans, 0, offsetof(struct btree_trans, iters_onstack)); + /* + * reallocating iterators currently completely breaks + * bch2_trans_iter_put(): + */ + expected_nr_iters = BTREE_ITER_MAX; + trans->c = c; trans->ip = _RET_IP_; trans->size = ARRAY_SIZE(trans->iters_onstack); trans->iters = trans->iters_onstack; trans->updates = trans->updates_onstack; + trans->updates2 = trans->updates2_onstack; trans->fs_usage_deltas = NULL; if (expected_nr_iters > trans->size) @@ -2119,5 +2219,5 @@ int bch2_fs_btree_iter_init(struct bch_fs *c) return mempool_init_kmalloc_pool(&c->btree_iters_pool, 1, sizeof(struct btree_iter) * nr + sizeof(struct btree_insert_entry) * nr + - sizeof(u8) * nr); + sizeof(struct btree_insert_entry) * nr); } diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index 962380925511..6456787a8f77 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -96,11 +96,11 @@ __trans_next_iter_with_node(struct btree_trans *trans, struct btree *b, (_iter)->idx + 1)) #ifdef CONFIG_BCACHEFS_DEBUG -void bch2_btree_iter_verify(struct btree_iter *, struct btree *); +void bch2_btree_trans_verify_iters(struct btree_trans *, struct btree *); void bch2_btree_trans_verify_locks(struct btree_trans *); #else -static inline void bch2_btree_iter_verify(struct btree_iter *iter, - struct btree *b) {} +static inline void 
bch2_btree_trans_verify_iters(struct btree_trans *trans, + struct btree *b) {} static inline void bch2_btree_trans_verify_locks(struct btree_trans *iter) {} #endif @@ -154,11 +154,14 @@ bch2_btree_iter_traverse(struct btree_iter *iter) int bch2_btree_iter_traverse_all(struct btree_trans *); struct btree *bch2_btree_iter_peek_node(struct btree_iter *); -struct btree *bch2_btree_iter_next_node(struct btree_iter *, unsigned); +struct btree *bch2_btree_iter_next_node(struct btree_iter *); struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *); struct bkey_s_c bch2_btree_iter_next(struct btree_iter *); +struct bkey_s_c bch2_btree_iter_peek_with_updates(struct btree_iter *); +struct bkey_s_c bch2_btree_iter_next_with_updates(struct btree_iter *); + struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *); struct bkey_s_c bch2_btree_iter_prev(struct btree_iter *); @@ -166,41 +169,14 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *); struct bkey_s_c bch2_btree_iter_next_slot(struct btree_iter *); void bch2_btree_iter_set_pos_same_leaf(struct btree_iter *, struct bpos); +void __bch2_btree_iter_set_pos(struct btree_iter *, struct bpos, bool); void bch2_btree_iter_set_pos(struct btree_iter *, struct bpos); -static inline struct bpos btree_type_successor(enum btree_id id, - struct bpos pos) -{ - if (id == BTREE_ID_INODES) { - pos.inode++; - pos.offset = 0; - } else if (!btree_node_type_is_extents(id)) { - pos = bkey_successor(pos); - } - - return pos; -} - -static inline struct bpos btree_type_predecessor(enum btree_id id, - struct bpos pos) -{ - if (id == BTREE_ID_INODES) { - --pos.inode; - pos.offset = 0; - } else { - pos = bkey_predecessor(pos); - } - - return pos; -} - static inline int __btree_iter_cmp(enum btree_id id, struct bpos pos, const struct btree_iter *r) { - if (id != r->btree_id) - return id < r->btree_id ? 
-1 : 1; - return bkey_cmp(pos, r->pos); + return cmp_int(id, r->btree_id) ?: bkey_cmp(pos, r->pos); } static inline int btree_iter_cmp(const struct btree_iter *l, @@ -230,7 +206,7 @@ static inline int bch2_trans_cond_resched(struct btree_trans *trans) _start, _locks_want, _depth, _flags), \ _b = bch2_btree_iter_peek_node(_iter); \ (_b); \ - (_b) = bch2_btree_iter_next_node(_iter, _depth)) + (_b) = bch2_btree_iter_next_node(_iter)) #define for_each_btree_node(_trans, _iter, _btree_id, _start, \ _flags, _b) \ @@ -281,23 +257,46 @@ int bch2_trans_iter_free(struct btree_trans *, struct btree_iter *); void bch2_trans_unlink_iters(struct btree_trans *); -struct btree_iter *bch2_trans_get_iter(struct btree_trans *, enum btree_id, - struct bpos, unsigned); -struct btree_iter *bch2_trans_copy_iter(struct btree_trans *, +struct btree_iter *__bch2_trans_get_iter(struct btree_trans *, enum btree_id, + struct bpos, unsigned); + +static inline struct btree_iter * +bch2_trans_get_iter(struct btree_trans *trans, enum btree_id btree_id, + struct bpos pos, unsigned flags) +{ + struct btree_iter *iter = + __bch2_trans_get_iter(trans, btree_id, pos, flags); + + if (!IS_ERR(iter)) + iter->ip_allocated = _THIS_IP_; + return iter; +} + +struct btree_iter *__bch2_trans_copy_iter(struct btree_trans *, struct btree_iter *); +static inline struct btree_iter * +bch2_trans_copy_iter(struct btree_trans *trans, struct btree_iter *src) +{ + struct btree_iter *iter = + __bch2_trans_copy_iter(trans, src); + + if (!IS_ERR(iter)) + iter->ip_allocated = _THIS_IP_; + return iter; + +} + struct btree_iter *bch2_trans_get_node_iter(struct btree_trans *, enum btree_id, struct bpos, unsigned, unsigned, unsigned); -#define TRANS_RESET_ITERS (1 << 0) -#define TRANS_RESET_MEM (1 << 1) -#define TRANS_RESET_NOTRAVERSE (1 << 2) +#define TRANS_RESET_NOTRAVERSE (1 << 0) void bch2_trans_reset(struct btree_trans *, unsigned); static inline void bch2_trans_begin(struct btree_trans *trans) { - return bch2_trans_reset(trans, TRANS_RESET_ITERS|TRANS_RESET_MEM); + return bch2_trans_reset(trans, 0); } void *bch2_trans_kmalloc(struct btree_trans *, size_t); diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index b7af88e05837..732cdc35aa7c 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -53,7 +53,6 @@ struct bset_tree { struct btree_write { struct journal_entry_pin journal; - struct closure_waitlist wait; }; struct btree_alloc { @@ -64,9 +63,7 @@ struct btree_alloc { struct btree { /* Hottest entries first */ struct rhash_head hash; - - /* Key/pointer for this btree node */ - __BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX); + u64 hash_val; struct six_lock lock; @@ -133,6 +130,9 @@ struct btree { #ifdef CONFIG_BCACHEFS_DEBUG bool *expensive_debug_checks; #endif + + /* Key/pointer for this btree node */ + __BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX); }; struct btree_cache { @@ -234,9 +234,10 @@ struct btree_iter { u16 flags; u8 idx; - enum btree_iter_uptodate uptodate:4; enum btree_id btree_id:4; + enum btree_iter_uptodate uptodate:4; unsigned level:4, + min_depth:4, locks_want:4, nodes_locked:4, nodes_intent_locked:4; @@ -252,6 +253,7 @@ struct btree_iter { * bch2_btree_iter_next_slot() can correctly advance pos. 
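The cmp_int() ?: chain that replaces __btree_iter_cmp()'s old if/else is a common kernel idiom: each comparison evaluates to -1/0/1, and with the GNU "a ?: b" extension the first nonzero result short-circuits out, yielding a lexicographic (btree_id, pos) compare. A self-contained sketch, with iter_pos and iter_pos_cmp as illustrative stand-ins for the bcachefs types:

#include <stdio.h>

#define cmp_int(l, r)   (((l) > (r)) - ((l) < (r)))

struct iter_pos {
        unsigned           btree_id;
        unsigned long long offset;
};

static int iter_pos_cmp(struct iter_pos a, struct iter_pos b)
{
        /* first nonzero comparison decides the ordering: */
        return cmp_int(a.btree_id, b.btree_id) ?:
                cmp_int(a.offset, b.offset);
}

int main(void)
{
        struct iter_pos a = { 1, 200 }, b = { 2, 100 };

        printf("%d\n", iter_pos_cmp(a, b));     /* -1: btree_id decides */
        return 0;
}

The same shape appears in __btree_trans_updates_peek() earlier in this patch, which relies on the updates2 list being kept sorted by (btree_id, pos).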
*/ struct bkey k; + unsigned long ip_allocated; }; static inline enum btree_iter_type btree_iter_type(struct btree_iter *iter) @@ -259,6 +261,11 @@ static inline enum btree_iter_type btree_iter_type(struct btree_iter *iter) return iter->flags & BTREE_ITER_TYPE; } +static inline struct btree_iter_level *iter_l(struct btree_iter *iter) +{ + return iter->l + iter->level; +} + struct btree_insert_entry { unsigned trigger_flags; unsigned trans_triggers_run:1; @@ -266,7 +273,11 @@ struct btree_insert_entry { struct btree_iter *iter; }; +#ifndef CONFIG_LOCKDEP #define BTREE_ITER_MAX 64 +#else +#define BTREE_ITER_MAX 32 +#endif struct btree_trans { struct bch_fs *c; @@ -278,6 +289,7 @@ struct btree_trans { u8 nr_iters; u8 nr_updates; + u8 nr_updates2; u8 size; unsigned used_mempool:1; unsigned error:1; @@ -290,6 +302,7 @@ struct btree_trans { struct btree_iter *iters; struct btree_insert_entry *updates; + struct btree_insert_entry *updates2; /* update path: */ struct journal_res journal_res; @@ -303,6 +316,7 @@ struct btree_trans { struct btree_iter iters_onstack[2]; struct btree_insert_entry updates_onstack[2]; + struct btree_insert_entry updates2_onstack[2]; }; #define BTREE_FLAG(flag) \ @@ -534,8 +548,6 @@ static inline bool btree_node_type_needs_gc(enum btree_node_type type) struct btree_root { struct btree *b; - struct btree_update *as; - /* On disk root - see async splits: */ __BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX); u8 level; diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h index 2c34bae64281..11f7d02de622 100644 --- a/fs/bcachefs/btree_update.h +++ b/fs/bcachefs/btree_update.h @@ -12,8 +12,7 @@ void bch2_btree_node_lock_for_insert(struct bch_fs *, struct btree *, struct btree_iter *); bool bch2_btree_bset_insert_key(struct btree_iter *, struct btree *, struct btree_node_iter *, struct bkey_i *); -void bch2_btree_journal_key(struct btree_trans *, struct btree_iter *, - struct bkey_i *); +void bch2_btree_add_journal_pin(struct bch_fs *, struct btree *, u64); enum btree_insert_flags { __BTREE_INSERT_NOUNLOCK, @@ -59,6 +58,7 @@ enum btree_insert_flags { int bch2_btree_delete_at(struct btree_trans *, struct btree_iter *, unsigned); +int __bch2_btree_insert(struct btree_trans *, enum btree_id, struct bkey_i *); int bch2_btree_insert(struct bch_fs *, enum btree_id, struct bkey_i *, struct disk_reservation *, u64 *, int flags); @@ -70,7 +70,7 @@ int bch2_btree_delete_range(struct bch_fs *, enum btree_id, int bch2_btree_node_rewrite(struct bch_fs *c, struct btree_iter *, __le64, unsigned); int bch2_btree_node_update_key(struct bch_fs *, struct btree_iter *, - struct btree *, struct bkey_i_btree_ptr *); + struct btree *, struct bkey_i *); int bch2_trans_update(struct btree_trans *, struct btree_iter *, struct bkey_i *, enum btree_trigger_flags); @@ -98,17 +98,17 @@ static inline int bch2_trans_commit(struct btree_trans *trans, return __bch2_trans_commit(trans); } -#define __bch2_trans_do(_trans, _disk_res, _journal_seq, \ - _flags, _reset_flags, _do) \ +#define __bch2_trans_do(_trans, _disk_res, _journal_seq, _flags, _do) \ ({ \ int _ret; \ \ - do { \ - bch2_trans_reset(_trans, _reset_flags); \ - \ + while (1) { \ _ret = (_do) ?: bch2_trans_commit(_trans, (_disk_res), \ (_journal_seq), (_flags)); \ - } while (_ret == -EINTR); \ + if (_ret != -EINTR) \ + break; \ + bch2_trans_reset(_trans, 0); \ + } \ \ _ret; \ }) @@ -120,7 +120,7 @@ static inline int bch2_trans_commit(struct btree_trans *trans, \ bch2_trans_init(&trans, (_c), 0, 0); \ _ret = __bch2_trans_do(&trans, 
_disk_res, _journal_seq, _flags, \ - TRANS_RESET_MEM|TRANS_RESET_ITERS, _do); \ + _do); \ _ret2 = bch2_trans_exit(&trans); \ \ _ret ?: _ret2; \ @@ -131,4 +131,9 @@ static inline int bch2_trans_commit(struct btree_trans *trans, (_i) < (_trans)->updates + (_trans)->nr_updates; \ (_i)++) +#define trans_for_each_update2(_trans, _i) \ + for ((_i) = (_trans)->updates2; \ + (_i) < (_trans)->updates2 + (_trans)->nr_updates2; \ + (_i)++) + #endif /* _BCACHEFS_BTREE_UPDATE_H */ diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 748e6356f3d6..82b66a667e35 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -24,47 +24,42 @@ static void btree_node_will_make_reachable(struct btree_update *, struct btree *); static void btree_update_drop_new_node(struct bch_fs *, struct btree *); -static void bch2_btree_set_root_ondisk(struct bch_fs *, struct btree *, int); /* Debug code: */ +/* + * Verify that child nodes correctly span parent node's range: + */ static void btree_node_interior_verify(struct btree *b) { +#ifdef CONFIG_BCACHEFS_DEBUG + struct bpos next_node = b->data->min_key; struct btree_node_iter iter; - struct bkey_packed *k; + struct bkey_s_c k; + struct bkey_s_c_btree_ptr_v2 bp; + struct bkey unpacked; BUG_ON(!b->level); - bch2_btree_node_iter_init(&iter, b, &b->key.k.p); -#if 1 - BUG_ON(!(k = bch2_btree_node_iter_peek(&iter, b)) || - bkey_cmp_left_packed(b, k, &b->key.k.p)); + bch2_btree_node_iter_init_from_start(&iter, b); - BUG_ON((bch2_btree_node_iter_advance(&iter, b), - !bch2_btree_node_iter_end(&iter))); -#else - const char *msg; + while (1) { + k = bch2_btree_node_iter_peek_unpack(&iter, b, &unpacked); + if (k.k->type != KEY_TYPE_btree_ptr_v2) + break; + bp = bkey_s_c_to_btree_ptr_v2(k); - msg = "not found"; - k = bch2_btree_node_iter_peek(&iter, b); - if (!k) - goto err; + BUG_ON(bkey_cmp(next_node, bp.v->min_key)); - msg = "isn't what it should be"; - if (bkey_cmp_left_packed(b, k, &b->key.k.p)) - goto err; + bch2_btree_node_iter_advance(&iter, b); - bch2_btree_node_iter_advance(&iter, b); + if (bch2_btree_node_iter_end(&iter)) { + BUG_ON(bkey_cmp(k.k->p, b->key.k.p)); + break; + } - msg = "isn't last key"; - if (!bch2_btree_node_iter_end(&iter)) - goto err; - return; -err: - bch2_dump_btree_node(b); - printk(KERN_ERR "last key %llu:%llu %s\n", b->key.k.p.inode, - b->key.k.p.offset, msg); - BUG(); + next_node = bkey_successor(k.k->p); + } #endif } @@ -260,16 +255,17 @@ void bch2_btree_node_free_inmem(struct bch_fs *c, struct btree *b, } static void bch2_btree_node_free_ondisk(struct bch_fs *c, - struct pending_btree_node_free *pending) + struct pending_btree_node_free *pending, + u64 journal_seq) { BUG_ON(!pending->index_update_done); bch2_mark_key(c, bkey_i_to_s_c(&pending->key), - 0, 0, NULL, 0, BTREE_TRIGGER_OVERWRITE); + 0, 0, NULL, journal_seq, BTREE_TRIGGER_OVERWRITE); if (gc_visited(c, gc_phase(GC_PHASE_PENDING_DELETE))) bch2_mark_key(c, bkey_i_to_s_c(&pending->key), - 0, 0, NULL, 0, + 0, 0, NULL, journal_seq, BTREE_TRIGGER_OVERWRITE| BTREE_TRIGGER_GC); } @@ -332,7 +328,11 @@ retry: goto retry; } - bkey_btree_ptr_init(&tmp.k); + if (c->sb.features & (1ULL << BCH_FEATURE_btree_ptr_v2)) + bkey_btree_ptr_v2_init(&tmp.k); + else + bkey_btree_ptr_init(&tmp.k); + bch2_alloc_sectors_append_ptrs(c, wp, &tmp.k, c->opts.btree_node_size); bch2_open_bucket_get(c, wp, &ob); @@ -354,25 +354,36 @@ static struct btree *bch2_btree_node_alloc(struct btree_update *as, unsigned lev { struct bch_fs *c = as->c; struct 
btree *b; + int ret; BUG_ON(level >= BTREE_MAX_DEPTH); BUG_ON(!as->reserve->nr); b = as->reserve->b[--as->reserve->nr]; - BUG_ON(bch2_btree_node_hash_insert(&c->btree_cache, b, level, as->btree_id)); - set_btree_node_accessed(b); set_btree_node_dirty(b); set_btree_node_need_write(b); bch2_bset_init_first(b, &b->data->keys); + b->level = level; + b->btree_id = as->btree_id; + memset(&b->nr, 0, sizeof(b->nr)); b->data->magic = cpu_to_le64(bset_magic(c)); b->data->flags = 0; SET_BTREE_NODE_ID(b->data, as->btree_id); SET_BTREE_NODE_LEVEL(b->data, level); - b->data->ptr = bkey_i_to_btree_ptr(&b->key)->v.start[0]; + b->data->ptr = bch2_bkey_ptrs_c(bkey_i_to_s_c(&b->key)).start->ptr; + + if (b->key.k.type == KEY_TYPE_btree_ptr_v2) { + struct bkey_i_btree_ptr_v2 *bp = bkey_i_to_btree_ptr_v2(&b->key); + + bp->v.mem_ptr = 0; + bp->v.seq = b->data->keys.seq; + bp->v.sectors_written = 0; + bp->v.sectors = cpu_to_le16(c->opts.btree_node_size); + } if (c->sb.features & (1ULL << BCH_FEATURE_new_extent_overwrite)) SET_BTREE_NODE_NEW_EXTENT_OVERWRITE(b->data, true); @@ -385,10 +396,26 @@ static struct btree *bch2_btree_node_alloc(struct btree_update *as, unsigned lev btree_node_will_make_reachable(as, b); + ret = bch2_btree_node_hash_insert(&c->btree_cache, b, level, as->btree_id); + BUG_ON(ret); + trace_btree_node_alloc(c, b); return b; } +static void btree_set_min(struct btree *b, struct bpos pos) +{ + if (b->key.k.type == KEY_TYPE_btree_ptr_v2) + bkey_i_to_btree_ptr_v2(&b->key)->v.min_key = pos; + b->data->min_key = pos; +} + +static void btree_set_max(struct btree *b, struct bpos pos) +{ + b->key.k.p = pos; + b->data->max_key = pos; +} + struct btree *__bch2_btree_node_alloc_replacement(struct btree_update *as, struct btree *b, struct bkey_format format) @@ -397,11 +424,12 @@ struct btree *__bch2_btree_node_alloc_replacement(struct btree_update *as, n = bch2_btree_node_alloc(as, b->level); - n->data->min_key = b->data->min_key; - n->data->max_key = b->data->max_key; - n->data->format = format; SET_BTREE_NODE_SEQ(n->data, BTREE_NODE_SEQ(b->data) + 1); + btree_set_min(n, b->data->min_key); + btree_set_max(n, b->data->max_key); + + n->data->format = format; btree_node_set_format(n, format); bch2_btree_sort_into(as->c, n, b); @@ -431,10 +459,9 @@ static struct btree *__btree_root_alloc(struct btree_update *as, unsigned level) { struct btree *b = bch2_btree_node_alloc(as, level); - b->data->min_key = POS_MIN; - b->data->max_key = POS_MAX; + btree_set_min(b, POS_MIN); + btree_set_max(b, POS_MAX); b->data->format = bch2_btree_calc_format(b); - b->key.k.p = POS_MAX; btree_node_set_format(b, b->data->format); bch2_btree_build_aux_trees(b); @@ -550,43 +577,47 @@ err_free: /* Asynchronous interior node update machinery */ -static void bch2_btree_update_free(struct btree_update *as) +static void __bch2_btree_update_free(struct btree_update *as) { struct bch_fs *c = as->c; + bch2_journal_preres_put(&c->journal, &as->journal_preres); + + bch2_journal_pin_drop(&c->journal, &as->journal); bch2_journal_pin_flush(&c->journal, &as->journal); - BUG_ON(as->nr_new_nodes); - BUG_ON(as->nr_pending); + BUG_ON((as->nr_new_nodes || as->nr_pending) && + !bch2_journal_error(&c->journal));; if (as->reserve) bch2_btree_reserve_put(c, as->reserve); - mutex_lock(&c->btree_interior_update_lock); list_del(&as->list); closure_debug_destroy(&as->cl); mempool_free(as, &c->btree_interior_update_pool); closure_wake_up(&c->btree_interior_update_wait); - mutex_unlock(&c->btree_interior_update_lock); } -static void 
btree_update_nodes_reachable(struct closure *cl) +static void bch2_btree_update_free(struct btree_update *as) { - struct btree_update *as = container_of(cl, struct btree_update, cl); struct bch_fs *c = as->c; - bch2_journal_pin_drop(&c->journal, &as->journal); - mutex_lock(&c->btree_interior_update_lock); + __bch2_btree_update_free(as); + mutex_unlock(&c->btree_interior_update_lock); +} + +static void btree_update_nodes_reachable(struct btree_update *as, u64 seq) +{ + struct bch_fs *c = as->c; while (as->nr_new_nodes) { struct btree *b = as->new_nodes[--as->nr_new_nodes]; BUG_ON(b->will_make_reachable != (unsigned long) as); b->will_make_reachable = 0; - mutex_unlock(&c->btree_interior_update_lock); /* * b->will_make_reachable prevented it from being written, so @@ -595,150 +626,128 @@ static void btree_update_nodes_reachable(struct closure *cl) btree_node_lock_type(c, b, SIX_LOCK_read); bch2_btree_node_write_cond(c, b, btree_node_need_write(b)); six_unlock_read(&b->lock); - mutex_lock(&c->btree_interior_update_lock); } while (as->nr_pending) - bch2_btree_node_free_ondisk(c, &as->pending[--as->nr_pending]); - - mutex_unlock(&c->btree_interior_update_lock); - - closure_wake_up(&as->wait); - - bch2_btree_update_free(as); -} - -static void btree_update_wait_on_journal(struct closure *cl) -{ - struct btree_update *as = container_of(cl, struct btree_update, cl); - struct bch_fs *c = as->c; - int ret; - - ret = bch2_journal_open_seq_async(&c->journal, as->journal_seq, cl); - if (ret == -EAGAIN) { - continue_at(cl, btree_update_wait_on_journal, system_wq); - return; - } - if (ret < 0) - goto err; - - bch2_journal_flush_seq_async(&c->journal, as->journal_seq, cl); -err: - continue_at(cl, btree_update_nodes_reachable, system_wq); + bch2_btree_node_free_ondisk(c, &as->pending[--as->nr_pending], + seq); } static void btree_update_nodes_written(struct closure *cl) { struct btree_update *as = container_of(cl, struct btree_update, cl); + struct journal_res res = { 0 }; struct bch_fs *c = as->c; struct btree *b; + struct bset *i; + int ret; /* * We did an update to a parent node where the pointers we added pointed * to child nodes that weren't written yet: now, the child nodes have * been written so we can write out the update to the interior node. 
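The rewrite that follows replaces the per-update closure chains with the single c->btree_interior_updates_unwritten list: completed updates queue up FIFO, and btree_update_nodes_written() only ever processes the list head, so an update whose new nodes haven't hit disk yet holds back every update queued behind it. That is what preserves the parent-after-children ordering the comment above describes. A toy model of the draining loop, with all types illustrative:

#include <stdbool.h>
#include <stdio.h>

struct update {
        const char    *name;
        bool           nodes_written;
        struct update *next;
};

/* complete updates strictly from the head of the unwritten list: */
static void drain_unwritten(struct update **head)
{
        while (*head && (*head)->nodes_written) {
                struct update *as = *head;

                *head = as->next;
                printf("completing %s\n", as->name);
        }
}

int main(void)
{
        struct update b = { "b", true,  NULL };
        struct update a = { "a", false, &b };
        struct update *head = &a;

        drain_unwritten(&head); /* prints nothing: a still blocks b */
        a.nodes_written = true;
        drain_unwritten(&head); /* completes a, then b, in order */
        return 0;
}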
*/ -retry: mutex_lock(&c->btree_interior_update_lock); as->nodes_written = true; +again: + as = list_first_entry_or_null(&c->btree_interior_updates_unwritten, + struct btree_update, unwritten_list); + if (!as || !as->nodes_written) { + mutex_unlock(&c->btree_interior_update_lock); + return; + } + + b = as->b; + if (b && !six_trylock_intent(&b->lock)) { + mutex_unlock(&c->btree_interior_update_lock); + btree_node_lock_type(c, b, SIX_LOCK_intent); + six_unlock_intent(&b->lock); + mutex_lock(&c->btree_interior_update_lock); + goto again; + } + + list_del(&as->unwritten_list); + + ret = bch2_journal_res_get(&c->journal, &res, as->journal_u64s, + JOURNAL_RES_GET_RESERVED); + if (ret) { + BUG_ON(!bch2_journal_error(&c->journal)); + /* can't unblock btree writes */ + goto free_update; + } + + { + struct journal_buf *buf = &c->journal.buf[res.idx]; + struct jset_entry *entry = vstruct_idx(buf->data, res.offset); + + res.offset += as->journal_u64s; + res.u64s -= as->journal_u64s; + memcpy_u64s(entry, as->journal_entries, as->journal_u64s); + } switch (as->mode) { case BTREE_INTERIOR_NO_UPDATE: BUG(); case BTREE_INTERIOR_UPDATING_NODE: - /* The usual case: */ - b = READ_ONCE(as->b); - - if (!six_trylock_read(&b->lock)) { - mutex_unlock(&c->btree_interior_update_lock); - btree_node_lock_type(c, b, SIX_LOCK_read); - six_unlock_read(&b->lock); - goto retry; - } - - BUG_ON(!btree_node_dirty(b)); - closure_wait(&btree_current_write(b)->wait, cl); + /* @b is the node we did the final insert into: */ + BUG_ON(!res.ref); + six_lock_write(&b->lock); list_del(&as->write_blocked_list); - /* - * for flush_held_btree_writes() waiting on updates to flush or - * nodes to be writeable: - */ - closure_wake_up(&c->btree_interior_update_wait); - mutex_unlock(&c->btree_interior_update_lock); + i = btree_bset_last(b); + i->journal_seq = cpu_to_le64( + max(res.seq, + le64_to_cpu(i->journal_seq))); - /* - * b->write_blocked prevented it from being written, so - * write it now if it needs to be written: - */ - bch2_btree_node_write_cond(c, b, true); - six_unlock_read(&b->lock); + bch2_btree_add_journal_pin(c, b, res.seq); + six_unlock_write(&b->lock); break; case BTREE_INTERIOR_UPDATING_AS: - /* - * The btree node we originally updated has been freed and is - * being rewritten - so we need to write anything here, we just - * need to signal to that btree_update that it's ok to make the - * new replacement node visible: - */ - closure_put(&as->parent_as->cl); - - /* - * and then we have to wait on that btree_update to finish: - */ - closure_wait(&as->parent_as->wait, cl); - mutex_unlock(&c->btree_interior_update_lock); + BUG_ON(b); break; - case BTREE_INTERIOR_UPDATING_ROOT: - /* b is the new btree root: */ - b = READ_ONCE(as->b); - - if (!six_trylock_read(&b->lock)) { - mutex_unlock(&c->btree_interior_update_lock); - btree_node_lock_type(c, b, SIX_LOCK_read); - six_unlock_read(&b->lock); - goto retry; - } + case BTREE_INTERIOR_UPDATING_ROOT: { + struct btree_root *r = &c->btree_roots[as->btree_id]; - BUG_ON(c->btree_roots[b->btree_id].as != as); - c->btree_roots[b->btree_id].as = NULL; + BUG_ON(b); - bch2_btree_set_root_ondisk(c, b, WRITE); + mutex_lock(&c->btree_root_lock); + bkey_copy(&r->key, as->parent_keys.keys); + r->level = as->level; + r->alive = true; + c->btree_roots_dirty = true; + mutex_unlock(&c->btree_root_lock); + break; + } + } - /* - * We don't have to wait anything anything here (before - * btree_update_nodes_reachable frees the old nodes - * ondisk) - we've ensured that the very next journal write will - 
* have the pointer to the new root, and before the allocator - * can reuse the old nodes it'll have to do a journal commit: - */ - six_unlock_read(&b->lock); - mutex_unlock(&c->btree_interior_update_lock); + bch2_journal_pin_drop(&c->journal, &as->journal); + bch2_journal_res_put(&c->journal, &res); + bch2_journal_preres_put(&c->journal, &as->journal_preres); +free_update: + /* Do btree write after dropping journal res: */ + if (b) { /* - * Bit of funny circularity going on here we have to break: - * - * We have to drop our journal pin before writing the journal - * entry that points to the new btree root: else, we could - * deadlock if the journal currently happens to be full. - * - * This mean we're dropping the journal pin _before_ the new - * nodes are technically reachable - but this is safe, because - * after the bch2_btree_set_root_ondisk() call above they will - * be reachable as of the very next journal write: + * b->write_blocked prevented it from being written, so + * write it now if it needs to be written: */ - bch2_journal_pin_drop(&c->journal, &as->journal); - - as->journal_seq = bch2_journal_last_unwritten_seq(&c->journal); - - btree_update_wait_on_journal(cl); - return; + btree_node_write_if_need(c, b, SIX_LOCK_intent); + six_unlock_intent(&b->lock); } - continue_at(cl, btree_update_nodes_reachable, system_wq); + if (!ret) + btree_update_nodes_reachable(as, res.seq); + + __bch2_btree_update_free(as); + /* + * for flush_held_btree_writes() waiting on updates to flush or + * nodes to be writeable: + */ + closure_wake_up(&c->btree_interior_update_wait); + goto again; } /* @@ -750,52 +759,17 @@ static void btree_update_updated_node(struct btree_update *as, struct btree *b) struct bch_fs *c = as->c; mutex_lock(&c->btree_interior_update_lock); + list_add_tail(&as->unwritten_list, &c->btree_interior_updates_unwritten); BUG_ON(as->mode != BTREE_INTERIOR_NO_UPDATE); BUG_ON(!btree_node_dirty(b)); - as->mode = BTREE_INTERIOR_UPDATING_NODE; - as->b = b; + as->mode = BTREE_INTERIOR_UPDATING_NODE; + as->b = b; + as->level = b->level; list_add(&as->write_blocked_list, &b->write_blocked); mutex_unlock(&c->btree_interior_update_lock); - - /* - * In general, when you're staging things in a journal that will later - * be written elsewhere, and you also want to guarantee ordering: that - * is, if you have updates a, b, c, after a crash you should never see c - * and not a or b - there's a problem: - * - * If the final destination of the update(s) (i.e. btree node) can be - * written/flushed _before_ the relevant journal entry - oops, that - * breaks ordering, since the various leaf nodes can be written in any - * order. - * - * Normally we use bset->journal_seq to deal with this - if during - * recovery we find a btree node write that's newer than the newest - * journal entry, we just ignore it - we don't need it, anything we're - * supposed to have (that we reported as completed via fsync()) will - * still be in the journal, and as far as the state of the journal is - * concerned that btree node write never happened. - * - * That breaks when we're rewriting/splitting/merging nodes, since we're - * mixing btree node writes that haven't happened yet with previously - * written data that has been reported as completed to the journal. - * - * Thus, before making the new nodes reachable, we have to wait the - * newest journal sequence number we have data for to be written (if it - * hasn't been yet). 
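The comment deleted here is worth restating: recovery keeps the journal authoritative by ignoring any btree node write whose journal_seq is newer than the newest journal entry that actually reached disk, which is also why the new completion path stamps the bset with max(res.seq, journal_seq) rather than overwriting it. A minimal model of the recovery-side rule, assuming bare u64 sequence numbers:

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

/*
 * bset_seq: newest journal entry the keys in this bset were also
 * journalled in (what the on-disk bset->journal_seq records).
 */
static bool bset_usable(uint64_t bset_seq, uint64_t newest_journal_seq)
{
        /*
         * If that journal entry never made it to disk, drop the bset:
         * replaying the journal entries that did make it reconstructs
         * everything being thrown away here.
         */
        return bset_seq <= newest_journal_seq;
}

int main(void)
{
        assert(bset_usable(10, 10));    /* journal reached seq 10 */
        assert(!bset_usable(12, 10));   /* node write from the future: ignored */
        return 0;
}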
- */ - bch2_journal_wait_on_seq(&c->journal, as->journal_seq, &as->cl); -} - -static void interior_update_flush(struct journal *j, - struct journal_entry_pin *pin, u64 seq) -{ - struct btree_update *as = - container_of(pin, struct btree_update, journal); - - bch2_journal_flush_seq_async(j, as->journal_seq, NULL); } static void btree_update_reparent(struct btree_update *as, @@ -803,10 +777,10 @@ static void btree_update_reparent(struct btree_update *as, { struct bch_fs *c = as->c; + lockdep_assert_held(&c->btree_interior_update_lock); + child->b = NULL; child->mode = BTREE_INTERIOR_UPDATING_AS; - child->parent_as = as; - closure_get(&as->cl); /* * When we write a new btree root, we have to drop our journal pin @@ -817,45 +791,24 @@ static void btree_update_reparent(struct btree_update *as, * just transfer the journal pin to the new interior update so * btree_update_nodes_written() can drop it. */ - bch2_journal_pin_add_if_older(&c->journal, &child->journal, - &as->journal, interior_update_flush); + bch2_journal_pin_copy(&c->journal, &as->journal, &child->journal, NULL); bch2_journal_pin_drop(&c->journal, &child->journal); - - as->journal_seq = max(as->journal_seq, child->journal_seq); } -static void btree_update_updated_root(struct btree_update *as) +static void btree_update_updated_root(struct btree_update *as, struct btree *b) { struct bch_fs *c = as->c; - struct btree_root *r = &c->btree_roots[as->btree_id]; - - mutex_lock(&c->btree_interior_update_lock); BUG_ON(as->mode != BTREE_INTERIOR_NO_UPDATE); + BUG_ON(!bch2_keylist_empty(&as->parent_keys)); - /* - * Old root might not be persistent yet - if so, redirect its - * btree_update operation to point to us: - */ - if (r->as) - btree_update_reparent(as, r->as); - - as->mode = BTREE_INTERIOR_UPDATING_ROOT; - as->b = r->b; - r->as = as; + mutex_lock(&c->btree_interior_update_lock); + list_add_tail(&as->unwritten_list, &c->btree_interior_updates_unwritten); + as->mode = BTREE_INTERIOR_UPDATING_ROOT; + as->level = b->level; + bch2_keylist_add(&as->parent_keys, &b->key); mutex_unlock(&c->btree_interior_update_lock); - - /* - * When we're rewriting nodes and updating interior nodes, there's an - * issue with updates that haven't been written in the journal getting - * mixed together with older data - see btree_update_updated_node() - * for the explanation. - * - * However, this doesn't affect us when we're writing a new btree root - - * because to make that new root reachable we have to write out a new - * journal entry, which must necessarily be newer than as->journal_seq. - */ } static void btree_node_will_make_reachable(struct btree_update *as, @@ -932,10 +885,8 @@ void bch2_btree_interior_update_will_free_node(struct btree_update *as, struct btree *b) { struct bch_fs *c = as->c; - struct closure *cl, *cl_n; struct btree_update *p, *n; struct btree_write *w; - struct bset_tree *t; set_btree_node_dying(b); @@ -944,18 +895,6 @@ void bch2_btree_interior_update_will_free_node(struct btree_update *as, btree_interior_update_add_node_reference(as, b); - /* - * Does this node have data that hasn't been written in the journal? 
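bch2_journal_pin_copy() followed by bch2_journal_pin_drop(), as used in btree_update_reparent() and in bch2_btree_interior_update_will_free_node() below, is a hand-off: the btree_update takes its own reference on the pinned journal entry before the old holder lets go, so the pin count never touches zero mid-transfer and the entry can't be reclaimed out from under the update. A sketch of the ordering, with a bare count standing in for struct journal_entry_pin (a simplification of the real API):

#include <assert.h>
#include <stdbool.h>

struct entry { int pin_count; };

static void pin_copy(struct entry *e, bool *dst, const bool *src)
{
        assert(*src);           /* source must still hold its pin... */
        e->pin_count++;         /* ...so the new holder pins first */
        *dst = true;
}

static void pin_drop(struct entry *e, bool *pin)
{
        if (*pin) {
                *pin = false;
                e->pin_count--;
        }
        assert(e->pin_count >= 0);
}

int main(void)
{
        struct entry e = { .pin_count = 1 };
        bool child_pin = true, as_pin = false;

        pin_copy(&e, &as_pin, &child_pin);      /* count 2: both hold it */
        pin_drop(&e, &child_pin);               /* count 1: never hit zero */
        return 0;
}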
- * - * If so, we have to wait for the corresponding journal entry to be - * written before making the new nodes reachable - we can't just carry - * over the bset->journal_seq tracking, since we'll be mixing those keys - * in with keys that aren't in the journal anymore: - */ - for_each_bset(b, t) - as->journal_seq = max(as->journal_seq, - le64_to_cpu(bset(b, t)->journal_seq)); - mutex_lock(&c->btree_interior_update_lock); /* @@ -979,16 +918,6 @@ void bch2_btree_interior_update_will_free_node(struct btree_update *as, clear_btree_node_dirty(b); clear_btree_node_need_write(b); - w = btree_current_write(b); - - /* - * Does this node have any btree_update operations waiting on this node - * to be written? - * - * If so, wake them up when this btree_update operation is reachable: - */ - llist_for_each_entry_safe(cl, cl_n, llist_del_all(&w->wait.list), list) - llist_add(&cl->list, &as->wait.list); /* * Does this node have unwritten data that has a pin on the journal? @@ -998,13 +927,12 @@ void bch2_btree_interior_update_will_free_node(struct btree_update *as, * oldest pin of any of the nodes we're freeing. We'll release the pin * when the new nodes are persistent and reachable on disk: */ - bch2_journal_pin_add_if_older(&c->journal, &w->journal, - &as->journal, interior_update_flush); + w = btree_current_write(b); + bch2_journal_pin_copy(&c->journal, &as->journal, &w->journal, NULL); bch2_journal_pin_drop(&c->journal, &w->journal); w = btree_prev_write(b); - bch2_journal_pin_add_if_older(&c->journal, &w->journal, - &as->journal, interior_update_flush); + bch2_journal_pin_copy(&c->journal, &as->journal, &w->journal, NULL); bch2_journal_pin_drop(&c->journal, &w->journal); mutex_unlock(&c->btree_interior_update_lock); @@ -1021,12 +949,33 @@ void bch2_btree_update_done(struct btree_update *as) } struct btree_update * -bch2_btree_update_start(struct bch_fs *c, enum btree_id id, +bch2_btree_update_start(struct btree_trans *trans, enum btree_id id, unsigned nr_nodes, unsigned flags, struct closure *cl) { + struct bch_fs *c = trans->c; + struct journal_preres journal_preres = { 0 }; struct btree_reserve *reserve; struct btree_update *as; + int ret; + + ret = bch2_journal_preres_get(&c->journal, &journal_preres, + BTREE_UPDATE_JOURNAL_RES, + JOURNAL_RES_GET_NONBLOCK); + if (ret == -EAGAIN) { + bch2_trans_unlock(trans); + + ret = bch2_journal_preres_get(&c->journal, &journal_preres, + BTREE_UPDATE_JOURNAL_RES, + JOURNAL_RES_GET_NONBLOCK); + if (ret) + return ERR_PTR(ret); + + if (!bch2_trans_relock(trans)) { + bch2_journal_preres_put(&c->journal, &journal_preres); + return ERR_PTR(-EINTR); + } + } reserve = bch2_btree_reserve_get(c, nr_nodes, flags, cl); if (IS_ERR(reserve)) @@ -1040,6 +989,7 @@ bch2_btree_update_start(struct bch_fs *c, enum btree_id id, as->btree_id = id; as->reserve = reserve; INIT_LIST_HEAD(&as->write_blocked_list); + as->journal_preres = journal_preres; bch2_keylist_init(&as->parent_keys, as->inline_keys); @@ -1102,22 +1052,6 @@ static void bch2_btree_set_root_inmem(struct btree_update *as, struct btree *b) mutex_unlock(&c->btree_interior_update_lock); } -static void bch2_btree_set_root_ondisk(struct bch_fs *c, struct btree *b, int rw) -{ - struct btree_root *r = &c->btree_roots[b->btree_id]; - - mutex_lock(&c->btree_root_lock); - - BUG_ON(b != r->b); - bkey_copy(&r->key, &b->key); - r->level = b->level; - r->alive = true; - if (rw == WRITE) - c->btree_roots_dirty = true; - - mutex_unlock(&c->btree_root_lock); -} - /** * bch_btree_set_root - update the root in memory and on disk * @@ 
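
The nonblocking-then-blocking reservation dance in the new bch2_btree_update_start() above is a standard deadlock-avoidance shape: try to reserve without blocking while btree locks are held, otherwise unlock, wait, and hand a restart back to the caller if relocking fails. A self-contained sketch with invented stand-ins for the journal preres and transaction locks:

#include <errno.h>
#include <stdbool.h>

static bool reserved;

static bool try_reserve(void)		/* JOURNAL_RES_GET_NONBLOCK */
{
	if (reserved)
		return false;
	reserved = true;
	return true;
}

static void reserve_blocking(void)	{ reserved = true; }
static void unreserve(void)		{ reserved = false; }
static void trans_unlock(void)		{ }
static bool trans_relock(void)		{ return true; }

static int toy_get_preres(void)
{
	if (try_reserve())
		return 0;		/* fast path: no waiting needed */

	trans_unlock();			/* never block while holding locks */
	reserve_blocking();

	if (!trans_relock()) {
		unreserve();
		return -EINTR;		/* caller restarts the transaction */
	}
	return 0;
}
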
-1150,7 +1084,7 @@ static void bch2_btree_set_root(struct btree_update *as, struct btree *b, bch2_btree_set_root_inmem(as, b); - btree_update_updated_root(as); + btree_update_updated_root(as, b); /* * Unlock old root after new root is visible: @@ -1171,10 +1105,21 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as, struct btree *b { struct bch_fs *c = as->c; struct bch_fs_usage *fs_usage; + struct jset_entry *entry; struct bkey_packed *k; struct bkey tmp; - BUG_ON(insert->k.u64s > bch_btree_keys_u64s_remaining(c, b)); + BUG_ON(as->journal_u64s + jset_u64s(insert->k.u64s) > + ARRAY_SIZE(as->journal_entries)); + + entry = (void *) &as->journal_entries[as->journal_u64s]; + memset(entry, 0, sizeof(*entry)); + entry->u64s = cpu_to_le16(insert->k.u64s); + entry->type = BCH_JSET_ENTRY_btree_keys; + entry->btree_id = b->btree_id; + entry->level = b->level; + memcpy_u64s_small(entry->_data, insert, insert->k.u64s); + as->journal_u64s += jset_u64s(insert->k.u64s); mutex_lock(&c->btree_interior_update_lock); percpu_down_read(&c->mark_lock); @@ -1263,10 +1208,8 @@ static struct btree *__btree_split_node(struct btree_update *as, BUG_ON(!prev); - n1->key.k.p = bkey_unpack_pos(n1, prev); - n1->data->max_key = n1->key.k.p; - n2->data->min_key = - btree_type_successor(n1->btree_id, n1->key.k.p); + btree_set_max(n1, bkey_unpack_pos(n1, prev)); + btree_set_min(n2, bkey_successor(n1->key.k.p)); set2->u64s = cpu_to_le16((u64 *) vstruct_end(set1) - (u64 *) k); set1->u64s = cpu_to_le16(le16_to_cpu(set1->u64s) - le16_to_cpu(set2->u64s)); @@ -1325,6 +1268,14 @@ static void btree_split_insert_keys(struct btree_update *as, struct btree *b, struct bkey_packed *src, *dst, *n; struct bset *i; + /* + * XXX + * + * these updates must be journalled + * + * oops + */ + BUG_ON(btree_node_type(b) != BKEY_TYPE_BTREE); bch2_btree_node_iter_init(&node_iter, b, &k->k.p); @@ -1332,11 +1283,6 @@ static void btree_split_insert_keys(struct btree_update *as, struct btree *b, while (!bch2_keylist_empty(keys)) { k = bch2_keylist_front(keys); - BUG_ON(bch_keylist_u64s(keys) > - bch_btree_keys_u64s_remaining(as->c, b)); - BUG_ON(bkey_cmp(k->k.p, b->data->min_key) < 0); - BUG_ON(bkey_cmp(k->k.p, b->data->max_key) > 0); - bch2_insert_fixup_btree_ptr(as, b, iter, k, &node_iter); bch2_keylist_pop_front(keys); } @@ -1422,7 +1368,8 @@ static void btree_split(struct btree_update *as, struct btree *b, bch2_btree_build_aux_trees(n1); six_unlock_write(&n1->lock); - bch2_keylist_add(&as->parent_keys, &n1->key); + if (parent) + bch2_keylist_add(&as->parent_keys, &n1->key); } bch2_btree_node_write(c, n1, SIX_LOCK_intent); @@ -1496,19 +1443,15 @@ bch2_btree_insert_keys_interior(struct btree_update *as, struct btree *b, (bkey_cmp_packed(b, k, &insert->k) >= 0)) ; - while (!bch2_keylist_empty(keys)) { - insert = bch2_keylist_front(keys); - + for_each_keylist_key(keys, insert) bch2_insert_fixup_btree_ptr(as, b, iter, insert, &node_iter); - bch2_keylist_pop_front(keys); - } btree_update_updated_node(as, b); trans_for_each_iter_with_node(iter->trans, b, linked) bch2_btree_node_iter_peek(&linked->l[b->level].iter, b); - bch2_btree_iter_verify(iter, b); + bch2_btree_trans_verify_iters(iter->trans, b); } /** @@ -1581,7 +1524,7 @@ int bch2_btree_split_leaf(struct bch_fs *c, struct btree_iter *iter, unsigned flags) { struct btree_trans *trans = iter->trans; - struct btree *b = iter->l[0].b; + struct btree *b = iter_l(iter)->b; struct btree_update *as; struct closure cl; int ret = 0; @@ -1620,7 +1563,7 @@ int bch2_btree_split_leaf(struct bch_fs 
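
bch2_insert_fixup_btree_ptr() above now stages each interior-node key as a jset entry in the update's own journal_entries buffer, guarded by a BUG_ON against the preallocated size. A hedged sketch of that append, assuming (purely for illustration) a one-u64 entry header so that jset_u64s(n) = n + 1:

#include <stdint.h>
#include <string.h>

#define HDR_U64S 1u	/* assumed header size, illustration only */

static unsigned jset_u64s_sketch(unsigned key_u64s)
{
	return key_u64s + HDR_U64S;
}

/* returns 0 on success, -1 where the real code's BUG_ON would fire */
static int toy_stage_key(uint64_t *buf, unsigned *used, unsigned cap,
			 const uint64_t *key, unsigned key_u64s)
{
	if (*used + jset_u64s_sketch(key_u64s) > cap)
		return -1;

	buf[*used] = key_u64s;			/* toy entry header     */
	memcpy(buf + *used + HDR_U64S, key,	/* memcpy_u64s_small()  */
	       key_u64s * sizeof(*key));
	*used += jset_u64s_sketch(key_u64s);
	return 0;
}
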
*c, struct btree_iter *iter, goto out; } - as = bch2_btree_update_start(c, iter->btree_id, + as = bch2_btree_update_start(trans, iter->btree_id, btree_update_reserve_required(c, b), flags, !(flags & BTREE_INSERT_NOUNLOCK) ? &cl : NULL); if (IS_ERR(as)) { @@ -1732,7 +1675,7 @@ retry: goto err_unlock; } - as = bch2_btree_update_start(c, iter->btree_id, + as = bch2_btree_update_start(trans, iter->btree_id, btree_update_reserve_required(c, parent) + 1, BTREE_INSERT_NOFAIL| BTREE_INSERT_USE_RESERVE, @@ -1749,10 +1692,9 @@ retry: n = bch2_btree_node_alloc(as, b->level); - n->data->min_key = prev->data->min_key; - n->data->max_key = next->data->max_key; + btree_set_min(n, prev->data->min_key); + btree_set_max(n, next->data->max_key); n->data->format = new_f; - n->key.k.p = next->key.k.p; btree_node_set_format(n, new_f); @@ -1779,7 +1721,7 @@ retry: bch2_btree_iter_node_replace(iter, n); - bch2_btree_iter_verify(iter, n); + bch2_btree_trans_verify_iters(trans, n); bch2_btree_node_free_inmem(c, b, iter); bch2_btree_node_free_inmem(c, m, iter); @@ -1846,7 +1788,7 @@ static int __btree_node_rewrite(struct bch_fs *c, struct btree_iter *iter, struct btree *n, *parent = btree_node_parent(iter, b); struct btree_update *as; - as = bch2_btree_update_start(c, iter->btree_id, + as = bch2_btree_update_start(iter->trans, iter->btree_id, (parent ? btree_update_reserve_required(c, parent) : 0) + 1, @@ -1944,7 +1886,7 @@ static void __bch2_btree_node_update_key(struct bch_fs *c, struct btree_update *as, struct btree_iter *iter, struct btree *b, struct btree *new_hash, - struct bkey_i_btree_ptr *new_key) + struct bkey_i *new_key) { struct btree *parent; int ret; @@ -1989,20 +1931,20 @@ static void __bch2_btree_node_update_key(struct bch_fs *c, */ ret = bch2_disk_reservation_add(c, &as->reserve->disk_res, c->opts.btree_node_size * - bch2_bkey_nr_ptrs(bkey_i_to_s_c(&new_key->k_i)), + bch2_bkey_nr_ptrs(bkey_i_to_s_c(new_key)), BCH_DISK_RESERVATION_NOFAIL); BUG_ON(ret); parent = btree_node_parent(iter, b); if (parent) { if (new_hash) { - bkey_copy(&new_hash->key, &new_key->k_i); + bkey_copy(&new_hash->key, new_key); ret = bch2_btree_node_hash_insert(&c->btree_cache, new_hash, b->level, b->btree_id); BUG_ON(ret); } - bch2_keylist_add(&as->parent_keys, &new_key->k_i); + bch2_keylist_add(&as->parent_keys, new_key); bch2_btree_insert_node(as, parent, iter, &as->parent_keys, 0); if (new_hash) { @@ -2011,12 +1953,12 @@ static void __bch2_btree_node_update_key(struct bch_fs *c, bch2_btree_node_hash_remove(&c->btree_cache, b); - bkey_copy(&b->key, &new_key->k_i); + bkey_copy(&b->key, new_key); ret = __bch2_btree_node_hash_insert(&c->btree_cache, b); BUG_ON(ret); mutex_unlock(&c->btree_cache.lock); } else { - bkey_copy(&b->key, &new_key->k_i); + bkey_copy(&b->key, new_key); } } else { struct bch_fs_usage *fs_usage; @@ -2029,11 +1971,11 @@ static void __bch2_btree_node_update_key(struct bch_fs *c, percpu_down_read(&c->mark_lock); fs_usage = bch2_fs_usage_scratch_get(c); - bch2_mark_key_locked(c, bkey_i_to_s_c(&new_key->k_i), + bch2_mark_key_locked(c, bkey_i_to_s_c(new_key), 0, 0, fs_usage, 0, BTREE_TRIGGER_INSERT); if (gc_visited(c, gc_pos_btree_root(b->btree_id))) - bch2_mark_key_locked(c, bkey_i_to_s_c(&new_key->k_i), + bch2_mark_key_locked(c, bkey_i_to_s_c(new_key), 0, 0, NULL, 0, BTREE_TRIGGER_INSERT|| BTREE_TRIGGER_GC); @@ -2047,19 +1989,19 @@ static void __bch2_btree_node_update_key(struct bch_fs *c, percpu_up_read(&c->mark_lock); mutex_unlock(&c->btree_interior_update_lock); - if (PTR_HASH(&new_key->k_i) != 
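
The PTR_HASH() to btree_ptr_hash_val() switch in the update-key path above caches the hash input in b->hash_val; judging by the root-alloc hunk further down, which stores U64_MAX - id through v.start, the value hashed appears to be the first pointer word of the key. A toy version under that assumption:

#include <stdint.h>

struct toy_btree_ptr { uint64_t ptrs[4]; unsigned nr; };

/* assumption: the hash input is the first pointer, 0 if there is none */
static uint64_t toy_btree_ptr_hash_val(const struct toy_btree_ptr *v)
{
	return v->nr ? v->ptrs[0] : 0;
}
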
PTR_HASH(&b->key)) { + if (btree_ptr_hash_val(new_key) != b->hash_val) { mutex_lock(&c->btree_cache.lock); bch2_btree_node_hash_remove(&c->btree_cache, b); - bkey_copy(&b->key, &new_key->k_i); + bkey_copy(&b->key, new_key); ret = __bch2_btree_node_hash_insert(&c->btree_cache, b); BUG_ON(ret); mutex_unlock(&c->btree_cache.lock); } else { - bkey_copy(&b->key, &new_key->k_i); + bkey_copy(&b->key, new_key); } - btree_update_updated_root(as); + btree_update_updated_root(as, b); bch2_btree_node_unlock_write(b, iter); } @@ -2068,7 +2010,7 @@ static void __bch2_btree_node_update_key(struct bch_fs *c, int bch2_btree_node_update_key(struct bch_fs *c, struct btree_iter *iter, struct btree *b, - struct bkey_i_btree_ptr *new_key) + struct bkey_i *new_key) { struct btree *parent = btree_node_parent(iter, b); struct btree_update *as = NULL; @@ -2091,8 +2033,11 @@ int bch2_btree_node_update_key(struct bch_fs *c, struct btree_iter *iter, } } - /* check PTR_HASH() after @b is locked by btree_iter_traverse(): */ - if (PTR_HASH(&new_key->k_i) != PTR_HASH(&b->key)) { + /* + * check btree_ptr_hash_val() after @b is locked by + * btree_iter_traverse(): + */ + if (btree_ptr_hash_val(new_key) != b->hash_val) { /* bch2_btree_reserve_get will unlock */ ret = bch2_btree_cache_cannibalize_lock(c, &cl); if (ret) { @@ -2110,7 +2055,7 @@ int bch2_btree_node_update_key(struct bch_fs *c, struct btree_iter *iter, new_hash = bch2_btree_node_mem_alloc(c); } - as = bch2_btree_update_start(c, iter->btree_id, + as = bch2_btree_update_start(iter->trans, iter->btree_id, parent ? btree_update_reserve_required(c, parent) : 0, BTREE_INSERT_NOFAIL| BTREE_INSERT_USE_RESERVE| @@ -2134,7 +2079,7 @@ int bch2_btree_node_update_key(struct bch_fs *c, struct btree_iter *iter, goto err; } - ret = bch2_mark_bkey_replicas(c, bkey_i_to_s_c(&new_key->k_i)); + ret = bch2_mark_bkey_replicas(c, bkey_i_to_s_c(new_key)); if (ret) goto err_free_update; @@ -2193,14 +2138,14 @@ void bch2_btree_root_alloc(struct bch_fs *c, enum btree_id id) bkey_btree_ptr_init(&b->key); b->key.k.p = POS_MAX; - PTR_HASH(&b->key) = U64_MAX - id; + *((u64 *) bkey_i_to_btree_ptr(&b->key)->v.start) = U64_MAX - id; bch2_bset_init_first(b, &b->data->keys); bch2_btree_build_aux_trees(b); b->data->flags = 0; - b->data->min_key = POS_MIN; - b->data->max_key = POS_MAX; + btree_set_min(b, POS_MIN); + btree_set_max(b, POS_MAX); b->data->format = bch2_btree_calc_format(b); btree_node_set_format(b, b->data->format); diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h index 2d8e0b7f3aaf..2fddf5d31eb9 100644 --- a/fs/bcachefs/btree_update_interior.h +++ b/fs/bcachefs/btree_update_interior.h @@ -32,6 +32,9 @@ struct pending_btree_node_free { __BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX); }; +#define BTREE_UPDATE_JOURNAL_RES \ + ((BKEY_BTREE_PTR_U64s_MAX + 1) * (BTREE_MAX_DEPTH - 1) * 2) + /* * Tracks an in progress split/rewrite of a btree node and the update to the * parent node: @@ -55,6 +58,7 @@ struct btree_update { struct bch_fs *c; struct list_head list; + struct list_head unwritten_list; /* What kind of update are we doing? 
*/ enum { @@ -68,8 +72,10 @@ struct btree_update { unsigned nodes_written:1; enum btree_id btree_id; + u8 level; struct btree_reserve *reserve; + struct journal_preres journal_preres; /* * BTREE_INTERIOR_UPDATING_NODE: @@ -83,18 +89,6 @@ struct btree_update { struct list_head write_blocked_list; /* - * BTREE_INTERIOR_UPDATING_AS: btree node we updated was freed, so now - * we're now blocking another btree_update - * @parent_as - btree_update that's waiting on our nodes to finish - * writing, before it can make new nodes visible on disk - * @wait - list of child btree_updates that are waiting on this - * btree_update to make all the new nodes visible before they can free - * their old btree nodes - */ - struct btree_update *parent_as; - struct closure_waitlist wait; - - /* * We may be freeing nodes that were dirty, and thus had journal entries * pinned: we need to transfer the oldest of those pins to the * btree_update operation, and release it when the new node(s) @@ -102,8 +96,6 @@ struct btree_update { */ struct journal_entry_pin journal; - u64 journal_seq; - /* * Nodes being freed: * Protected by c->btree_node_pending_free_lock @@ -115,6 +107,9 @@ struct btree_update { struct btree *new_nodes[BTREE_MAX_DEPTH * 2 + GC_MERGE_NODES]; unsigned nr_new_nodes; + unsigned journal_u64s; + u64 journal_entries[BTREE_UPDATE_JOURNAL_RES]; + /* Only here to reduce stack usage on recursive splits: */ struct keylist parent_keys; /* @@ -139,7 +134,7 @@ struct btree *__bch2_btree_node_alloc_replacement(struct btree_update *, void bch2_btree_update_done(struct btree_update *); struct btree_update * -bch2_btree_update_start(struct bch_fs *, enum btree_id, unsigned, +bch2_btree_update_start(struct btree_trans *, enum btree_id, unsigned, unsigned, struct closure *); void bch2_btree_interior_update_will_free_node(struct btree_update *, @@ -302,18 +297,23 @@ static inline struct btree_node_entry *want_new_bset(struct bch_fs *c, } static inline void push_whiteout(struct bch_fs *c, struct btree *b, - struct bkey_packed *k) + struct bpos pos) { - unsigned u64s = bkeyp_key_u64s(&b->format, k); - struct bkey_packed *dst; + struct bkey_packed k; + + BUG_ON(bch_btree_keys_u64s_remaining(c, b) < BKEY_U64s); + + if (!bkey_pack_pos(&k, pos, b)) { + struct bkey *u = (void *) &k; + + bkey_init(u); + u->p = pos; + } - BUG_ON(u64s > bch_btree_keys_u64s_remaining(c, b)); + k.needs_whiteout = true; - b->whiteout_u64s += bkeyp_key_u64s(&b->format, k); - dst = unwritten_whiteouts_start(c, b); - memcpy_u64s(dst, k, u64s); - dst->u64s = u64s; - dst->type = KEY_TYPE_deleted; + b->whiteout_u64s += k.u64s; + bkey_copy(unwritten_whiteouts_start(c, b), &k); } /* diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index afd2086edeff..7faf98fd2f64 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -23,11 +23,10 @@ static inline bool same_leaf_as_prev(struct btree_trans *trans, struct btree_insert_entry *i) { - return i != trans->updates && - i[0].iter->l[0].b == i[-1].iter->l[0].b; + return i != trans->updates2 && + iter_l(i[0].iter)->b == iter_l(i[-1].iter)->b; } - inline void bch2_btree_node_lock_for_insert(struct bch_fs *c, struct btree *b, struct btree_iter *iter) { @@ -53,45 +52,45 @@ bool bch2_btree_bset_insert_key(struct btree_iter *iter, struct btree_node_iter *node_iter, struct bkey_i *insert) { - const struct bkey_format *f = &b->format; struct bkey_packed *k; - unsigned clobber_u64s; + unsigned clobber_u64s = 0, new_u64s = 0; EBUG_ON(btree_node_just_written(b)); 
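
The rewritten push_whiteout() above packs the whiteout's position itself, falling back to an unpacked key when the position does not fit the node's key format. The pack-or-fall-back shape, sketched with a toy format in which only small offsets pack:

#include <stdbool.h>
#include <stdint.h>

struct toy_pos { uint64_t inode, offset; };
struct toy_key { bool packed; bool needs_whiteout; struct toy_pos p; };

static bool toy_pack_pos(struct toy_key *k, struct toy_pos pos)
{
	if (pos.offset > UINT32_MAX)	/* toy format: 32-bit offsets only */
		return false;
	k->packed = true;
	k->p = pos;
	return true;
}

static void toy_make_whiteout(struct toy_key *k, struct toy_pos pos)
{
	if (!toy_pack_pos(k, pos)) {
		k->packed = false;	/* bkey_init() + set pos, unpacked */
		k->p = pos;
	}
	k->needs_whiteout = true;
}
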
EBUG_ON(bset_written(b, btree_bset_last(b))); EBUG_ON(bkey_deleted(&insert->k) && bkey_val_u64s(&insert->k)); - EBUG_ON(bkey_cmp(bkey_start_pos(&insert->k), b->data->min_key) < 0 || - bkey_cmp(insert->k.p, b->data->max_key) > 0); + EBUG_ON(bkey_cmp(b->data->min_key, POS_MIN) && + bkey_cmp(bkey_start_pos(&insert->k), + bkey_predecessor(b->data->min_key)) < 0); + EBUG_ON(bkey_cmp(insert->k.p, b->data->min_key) < 0); + EBUG_ON(bkey_cmp(insert->k.p, b->data->max_key) > 0); + EBUG_ON(insert->k.u64s > + bch_btree_keys_u64s_remaining(iter->trans->c, b)); + EBUG_ON(iter->flags & BTREE_ITER_IS_EXTENTS); k = bch2_btree_node_iter_peek_all(node_iter, b); if (k && bkey_cmp_packed(b, k, &insert->k)) k = NULL; /* @k is the key being overwritten/deleted, if any: */ - EBUG_ON(k && bkey_whiteout(k)); + /* Deleting, but not found? nothing to do: */ + if (bkey_whiteout(&insert->k) && !k) + return false; + if (bkey_whiteout(&insert->k)) { /* Deleting: */ - - /* Not found? Nothing to do: */ - if (!k) - return false; - btree_account_key_drop(b, k); k->type = KEY_TYPE_deleted; - if (k->needs_whiteout) { - push_whiteout(iter->trans->c, b, k); - k->needs_whiteout = false; - } + if (k->needs_whiteout) + push_whiteout(iter->trans->c, b, insert->k.p); + k->needs_whiteout = false; if (k >= btree_bset_last(b)->start) { clobber_u64s = k->u64s; - bch2_bset_delete(b, k, clobber_u64s); - bch2_btree_node_iter_fix(iter, b, node_iter, k, - clobber_u64s, 0); + goto fix_iter; } else { bch2_btree_iter_fix_key_modified(iter, b, k); } @@ -101,14 +100,6 @@ bool bch2_btree_bset_insert_key(struct btree_iter *iter, if (k) { /* Overwriting: */ - if (!bkey_written(b, k) && - bkey_val_u64s(&insert->k) == bkeyp_val_u64s(f, k)) { - k->type = insert->k.type; - memcpy_u64s(bkeyp_val(f, k), &insert->v, - bkey_val_u64s(&insert->k)); - return true; - } - btree_account_key_drop(b, k); k->type = KEY_TYPE_deleted; @@ -124,11 +115,13 @@ bool bch2_btree_bset_insert_key(struct btree_iter *iter, } k = bch2_btree_node_iter_bset_pos(node_iter, b, bset_tree_last(b)); - clobber_u64s = 0; overwrite: bch2_bset_insert(b, node_iter, k, insert, clobber_u64s); - bch2_btree_node_iter_fix(iter, b, node_iter, k, - clobber_u64s, k->u64s); + new_u64s = k->u64s; +fix_iter: + if (clobber_u64s != new_u64s) + bch2_btree_node_iter_fix(iter, b, node_iter, k, + clobber_u64s, new_u64s); return true; } @@ -155,6 +148,17 @@ static void btree_node_flush1(struct journal *j, struct journal_entry_pin *pin, return __btree_node_flush(j, pin, 1, seq); } +inline void bch2_btree_add_journal_pin(struct bch_fs *c, + struct btree *b, u64 seq) +{ + struct btree_write *w = btree_current_write(b); + + bch2_journal_pin_add(&c->journal, seq, &w->journal, + btree_node_write_idx(b) == 0 + ? 
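
The new bch2_btree_add_journal_pin() helper here picks the flush callback from the node's current write index: even-indexed writes flush via one function, odd via the other. A minimal illustration, assuming the write index simply alternates:

typedef void (*toy_flush_fn)(void);

static void toy_flush0(void) { /* flush keys staged for write buffer 0 */ }
static void toy_flush1(void) { /* flush keys staged for write buffer 1 */ }

static toy_flush_fn toy_pick_flush(unsigned write_idx)
{
	return (write_idx & 1) == 0 ? toy_flush0 : toy_flush1;
}
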
btree_node_flush0 + : btree_node_flush1); +} + static inline void __btree_journal_key(struct btree_trans *trans, enum btree_id btree_id, struct bkey_i *insert) @@ -176,16 +180,14 @@ static inline void __btree_journal_key(struct btree_trans *trans, *trans->journal_seq = seq; } -void bch2_btree_journal_key(struct btree_trans *trans, - struct btree_iter *iter, - struct bkey_i *insert) +static void bch2_btree_journal_key(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_i *insert) { struct bch_fs *c = trans->c; struct journal *j = &c->journal; - struct btree *b = iter->l[0].b; - struct btree_write *w = btree_current_write(b); + struct btree *b = iter_l(iter)->b; - EBUG_ON(iter->level || b->level); EBUG_ON(trans->journal_res.ref != !(trans->flags & BTREE_INSERT_JOURNAL_REPLAY)); @@ -195,35 +197,15 @@ void bch2_btree_journal_key(struct btree_trans *trans, cpu_to_le64(trans->journal_res.seq); } - if (unlikely(!journal_pin_active(&w->journal))) { - u64 seq = likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY)) + bch2_btree_add_journal_pin(c, b, + likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY)) ? trans->journal_res.seq - : j->replay_journal_seq; - - bch2_journal_pin_add(j, seq, &w->journal, - btree_node_write_idx(b) == 0 - ? btree_node_flush0 - : btree_node_flush1); - } + : j->replay_journal_seq); if (unlikely(!btree_node_dirty(b))) set_btree_node_dirty(b); } -static void bch2_insert_fixup_key(struct btree_trans *trans, - struct btree_iter *iter, - struct bkey_i *insert) -{ - struct btree_iter_level *l = &iter->l[0]; - - EBUG_ON(iter->level); - EBUG_ON(insert->k.u64s > - bch_btree_keys_u64s_remaining(trans->c, l->b)); - - if (likely(bch2_btree_bset_insert_key(iter, l->b, &l->iter, insert))) - bch2_btree_journal_key(trans, iter, insert); -} - /** * btree_insert_key - insert a key one key into a leaf node */ @@ -232,7 +214,7 @@ static void btree_insert_key_leaf(struct btree_trans *trans, struct bkey_i *insert) { struct bch_fs *c = trans->c; - struct btree *b = iter->l[0].b; + struct btree *b = iter_l(iter)->b; struct bset_tree *t = bset_tree_last(b); int old_u64s = bset_u64s(t); int old_live_u64s = b->nr.live_u64s; @@ -240,10 +222,8 @@ static void btree_insert_key_leaf(struct btree_trans *trans, insert->k.needs_whiteout = false; - if (!btree_node_is_extents(b)) - bch2_insert_fixup_key(trans, iter, insert); - else - bch2_insert_fixup_extent(trans, iter, insert); + if (likely(bch2_btree_bset_insert_key(iter, b, &iter_l(iter)->iter, insert))) + bch2_btree_journal_key(trans, iter, insert); live_u64s_added = (int) b->nr.live_u64s - old_live_u64s; u64s_added = (int) bset_u64s(t) - old_u64s; @@ -268,14 +248,10 @@ static inline void btree_insert_entry_checks(struct btree_trans *trans, { struct bch_fs *c = trans->c; - BUG_ON(iter->level); - BUG_ON(bkey_cmp(bkey_start_pos(&insert->k), iter->pos)); - EBUG_ON((iter->flags & BTREE_ITER_IS_EXTENTS) && - bkey_cmp(insert->k.p, iter->l[0].b->key.k.p) > 0); - + BUG_ON(bkey_cmp(insert->k.p, iter->pos)); BUG_ON(debug_check_bkeys(c) && - !bkey_deleted(&insert->k) && - bch2_bkey_invalid(c, bkey_i_to_s_c(insert), iter->btree_id)); + bch2_bkey_invalid(c, bkey_i_to_s_c(insert), + __btree_node_type(iter->level, iter->btree_id))); } static noinline int @@ -321,15 +297,22 @@ btree_key_can_insert(struct btree_trans *trans, unsigned *u64s) { struct bch_fs *c = trans->c; - struct btree *b = iter->l[0].b; + struct btree *b = iter_l(iter)->b; static enum btree_insert_ret ret; if (unlikely(btree_node_fake(b))) return BTREE_INSERT_BTREE_NODE_FULL; - ret = 
!btree_node_is_extents(b) + /* + * old bch2_extent_sort_fix_overlapping() algorithm won't work with new + * style extent updates: + */ + if (unlikely(btree_node_old_extent_overwrite(b))) + return BTREE_INSERT_BTREE_NODE_FULL; + + ret = !(iter->flags & BTREE_ITER_IS_EXTENTS) ? BTREE_INSERT_OK - : bch2_extent_can_insert(trans, iter, insert, u64s); + : bch2_extent_can_insert(trans, iter, insert); if (ret) return ret; @@ -369,7 +352,7 @@ static noinline void bch2_trans_mark_gc(struct btree_trans *trans) struct btree_insert_entry *i; trans_for_each_update(trans, i) - if (gc_visited(c, gc_pos_btree_node(i->iter->l[0].b))) + if (gc_visited(c, gc_pos_btree_node(iter_l(i->iter)->b))) bch2_mark_update(trans, i->iter, i->k, NULL, i->trigger_flags|BTREE_TRIGGER_GC); } @@ -398,7 +381,7 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, prefetch(&trans->c->journal.flags); - trans_for_each_update(trans, i) { + trans_for_each_update2(trans, i) { /* Multiple inserts might go to same leaf: */ if (!same_leaf_as_prev(trans, i)) u64s = 0; @@ -437,10 +420,10 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, if (!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY)) { if (journal_seq_verify(c)) - trans_for_each_update(trans, i) + trans_for_each_update2(trans, i) i->k->k.version.lo = trans->journal_res.seq; else if (inject_invalid_keys(c)) - trans_for_each_update(trans, i) + trans_for_each_update2(trans, i) i->k->k.version = MAX_VERSION; } @@ -463,7 +446,7 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, if (unlikely(c->gc_pos.phase)) bch2_trans_mark_gc(trans); - trans_for_each_update(trans, i) + trans_for_each_update2(trans, i) do_btree_insert_one(trans, i->iter, i->k); err: if (marking) { @@ -484,8 +467,8 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, struct btree_iter *iter; int ret; - trans_for_each_update(trans, i) - BUG_ON(!btree_node_intent_locked(i->iter, 0)); + trans_for_each_update2(trans, i) + BUG_ON(!btree_node_intent_locked(i->iter, i->iter->level)); ret = bch2_journal_preres_get(&trans->c->journal, &trans->journal_preres, trans->journal_preres_u64s, @@ -512,20 +495,20 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, } if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) - trans_for_each_update(trans, i) + trans_for_each_update2(trans, i) btree_insert_entry_checks(trans, i->iter, i->k); bch2_btree_trans_verify_locks(trans); - trans_for_each_update(trans, i) + trans_for_each_update2(trans, i) if (!same_leaf_as_prev(trans, i)) bch2_btree_node_lock_for_insert(trans->c, - i->iter->l[0].b, i->iter); + iter_l(i->iter)->b, i->iter); ret = bch2_trans_commit_write_locked(trans, stopped_at); - trans_for_each_update(trans, i) + trans_for_each_update2(trans, i) if (!same_leaf_as_prev(trans, i)) - bch2_btree_node_unlock_write_inlined(i->iter->l[0].b, + bch2_btree_node_unlock_write_inlined(iter_l(i->iter)->b, i->iter); /* @@ -540,14 +523,14 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, if (trans->flags & BTREE_INSERT_NOUNLOCK) trans->nounlock = true; - trans_for_each_update(trans, i) + trans_for_each_update2(trans, i) if (!same_leaf_as_prev(trans, i)) bch2_foreground_maybe_merge(trans->c, i->iter, 0, trans->flags); trans->nounlock = false; - trans_for_each_update(trans, i) + trans_for_each_update2(trans, i) bch2_btree_iter_downgrade(i->iter); return 0; @@ -670,6 +653,135 @@ bch2_trans_commit_get_rw_cold(struct btree_trans *trans) return 0; } +static void bch2_trans_update2(struct btree_trans *trans, + struct btree_iter *iter, + struct 
bkey_i *insert) +{ + struct btree_insert_entry *i, n = (struct btree_insert_entry) { + .iter = iter, .k = insert + }; + + btree_insert_entry_checks(trans, n.iter, n.k); + + BUG_ON(iter->uptodate > BTREE_ITER_NEED_PEEK); + + EBUG_ON(trans->nr_updates2 >= trans->nr_iters); + + iter->flags |= BTREE_ITER_KEEP_UNTIL_COMMIT; + + trans_for_each_update2(trans, i) { + if (btree_iter_cmp(n.iter, i->iter) == 0) { + *i = n; + return; + } + + if (btree_iter_cmp(n.iter, i->iter) <= 0) + break; + } + + array_insert_item(trans->updates2, trans->nr_updates2, + i - trans->updates2, n); +} + +static int extent_update_to_keys(struct btree_trans *trans, + struct btree_iter *orig_iter, + struct bkey_i *insert) +{ + struct btree_iter *iter; + + if (bkey_deleted(&insert->k)) + return 0; + + iter = bch2_trans_copy_iter(trans, orig_iter); + if (IS_ERR(iter)) + return PTR_ERR(iter); + + iter->flags |= BTREE_ITER_INTENT; + __bch2_btree_iter_set_pos(iter, insert->k.p, false); + bch2_trans_update2(trans, iter, insert); + bch2_trans_iter_put(trans, iter); + return 0; +} + +static int extent_handle_overwrites(struct btree_trans *trans, + enum btree_id btree_id, + struct bpos start, struct bpos end) +{ + struct btree_iter *iter = NULL, *update_iter; + struct bkey_i *update; + struct bkey_s_c k; + int ret = 0; + + iter = bch2_trans_get_iter(trans, btree_id, start, BTREE_ITER_INTENT); + ret = PTR_ERR_OR_ZERO(iter); + if (ret) + return ret; + + k = bch2_btree_iter_peek_with_updates(iter); + + while (k.k && !(ret = bkey_err(k))) { + if (bkey_cmp(end, bkey_start_pos(k.k)) <= 0) + break; + + if (bkey_cmp(bkey_start_pos(k.k), start) < 0) { + update_iter = bch2_trans_copy_iter(trans, iter); + if ((ret = PTR_ERR_OR_ZERO(update_iter))) + goto err; + + update = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); + if ((ret = PTR_ERR_OR_ZERO(update))) + goto err; + + bkey_reassemble(update, k); + bch2_cut_back(start, update); + + __bch2_btree_iter_set_pos(update_iter, update->k.p, false); + bch2_trans_update2(trans, update_iter, update); + bch2_trans_iter_put(trans, update_iter); + } + + if (bkey_cmp(k.k->p, end) > 0) { + update_iter = bch2_trans_copy_iter(trans, iter); + if ((ret = PTR_ERR_OR_ZERO(update_iter))) + goto err; + + update = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); + if ((ret = PTR_ERR_OR_ZERO(update))) + goto err; + + bkey_reassemble(update, k); + bch2_cut_front(end, update); + + __bch2_btree_iter_set_pos(update_iter, update->k.p, false); + bch2_trans_update2(trans, update_iter, update); + bch2_trans_iter_put(trans, update_iter); + } else { + update_iter = bch2_trans_copy_iter(trans, iter); + if ((ret = PTR_ERR_OR_ZERO(update_iter))) + goto err; + + update = bch2_trans_kmalloc(trans, sizeof(struct bkey)); + if ((ret = PTR_ERR_OR_ZERO(update))) + goto err; + + update->k = *k.k; + set_bkey_val_u64s(&update->k, 0); + update->k.type = KEY_TYPE_deleted; + update->k.size = 0; + + __bch2_btree_iter_set_pos(update_iter, update->k.p, false); + bch2_trans_update2(trans, update_iter, update); + bch2_trans_iter_put(trans, update_iter); + } + + k = bch2_btree_iter_next_with_updates(iter); + } +err: + if (!IS_ERR_OR_NULL(iter)) + bch2_trans_iter_put(trans, iter); + return ret; +} + int __bch2_trans_commit(struct btree_trans *trans) { struct btree_insert_entry *i = NULL; @@ -739,7 +851,36 @@ int __bch2_trans_commit(struct btree_trans *trans) } } while (trans_trigger_run); + /* Turn extents updates into keys: */ + trans_for_each_update(trans, i) + if (i->iter->flags & BTREE_ITER_IS_EXTENTS) { + struct bpos start = bkey_start_pos(&i->k->k); 
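
extent_handle_overwrites() above reduces extent-style updates to plain key updates: an existing extent overlapping the update range [start,end) keeps a trimmed front copy, a trimmed back copy, or is whited out entirely. The case analysis on half-open ranges, as a runnable sketch (it assumes the old extent actually overlaps the update, as the real iteration guarantees); note both trims can fire for one key, which is the middle-split case:

#include <stdio.h>

struct toy_ext { unsigned long long start, end; };

static void toy_handle_overwrite(struct toy_ext old, struct toy_ext upd)
{
	if (old.start < upd.start)	/* front of old survives: cut_back */
		printf("update: [%llu,%llu)\n", old.start, upd.start);

	if (old.end > upd.end)		/* back of old survives: cut_front */
		printf("update: [%llu,%llu)\n", upd.end, old.end);
	else				/* covered past start: whiteout */
		printf("delete at %llu\n", old.end);
}
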
+ + while (i + 1 < trans->updates + trans->nr_updates && + i[0].iter->btree_id == i[1].iter->btree_id && + !bkey_cmp(i[0].k->k.p, bkey_start_pos(&i[1].k->k))) + i++; + + ret = extent_handle_overwrites(trans, i->iter->btree_id, + start, i->k->k.p); + if (ret) + goto out; + } + trans_for_each_update(trans, i) { + if (i->iter->flags & BTREE_ITER_IS_EXTENTS) { + ret = extent_update_to_keys(trans, i->iter, i->k); + if (ret) + goto out; + } else { + bch2_trans_update2(trans, i->iter, i->k); + } + } + + trans_for_each_update2(trans, i) { + BUG_ON(i->iter->uptodate > BTREE_ITER_NEED_PEEK); + BUG_ON(i->iter->locks_want < 1); + u64s = jset_u64s(i->k->k.u64s); if (0) trans->journal_preres_u64s += u64s; @@ -770,7 +911,7 @@ out: if (likely(!(trans->flags & BTREE_INSERT_NOCHECK_RW))) percpu_ref_put(&trans->c->writes); out_noupdates: - bch2_trans_reset(trans, TRANS_RESET_MEM|TRANS_RESET_NOTRAVERSE); + bch2_trans_reset(trans, !ret ? TRANS_RESET_NOTRAVERSE : 0); return ret; err: @@ -788,11 +929,14 @@ int bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter, .trigger_flags = flags, .iter = iter, .k = k }; - EBUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&k->k))); + EBUG_ON(bkey_cmp(iter->pos, + (iter->flags & BTREE_ITER_IS_EXTENTS) + ? bkey_start_pos(&k->k) + : k->k.p)); iter->flags |= BTREE_ITER_KEEP_UNTIL_COMMIT; - if (iter->flags & BTREE_ITER_IS_EXTENTS) { + if (btree_node_type_is_extents(iter->btree_id)) { iter->pos_after_commit = k->k.p; iter->flags |= BTREE_ITER_SET_POS_AFTER_COMMIT; } @@ -851,18 +995,21 @@ int bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter, return 0; } -static int __bch2_btree_insert(struct btree_trans *trans, - enum btree_id id, struct bkey_i *k) +int __bch2_btree_insert(struct btree_trans *trans, + enum btree_id id, struct bkey_i *k) { struct btree_iter *iter; + int ret; iter = bch2_trans_get_iter(trans, id, bkey_start_pos(&k->k), BTREE_ITER_INTENT); if (IS_ERR(iter)) return PTR_ERR(iter); - bch2_trans_update(trans, iter, k, 0); - return 0; + ret = bch2_btree_iter_traverse(iter) ?: + bch2_trans_update(trans, iter, k, 0); + bch2_trans_iter_put(trans, iter); + return ret; } /** @@ -894,7 +1041,7 @@ retry: bkey_cmp(iter->pos, end) < 0) { struct bkey_i delete; - bch2_trans_reset(trans, TRANS_RESET_MEM); + bch2_trans_begin(trans); bkey_init(&delete.k); @@ -910,7 +1057,7 @@ retry: */ delete.k.p = iter->pos; - if (iter->flags & BTREE_ITER_IS_EXTENTS) { + if (btree_node_type_is_extents(iter->btree_id)) { unsigned max_sectors = KEY_SIZE_MAX & (~0 << trans->c->block_bits); diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 731b93255876..2e1df04c760d 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -1194,6 +1194,7 @@ int bch2_mark_key_locked(struct bch_fs *c, ret = bch2_mark_alloc(c, k, fs_usage, journal_seq, flags); break; case KEY_TYPE_btree_ptr: + case KEY_TYPE_btree_ptr_v2: sectors = !(flags & BTREE_TRIGGER_OVERWRITE) ? c->opts.btree_node_size : -c->opts.btree_node_size; @@ -1253,21 +1254,21 @@ inline int bch2_mark_overwrite(struct btree_trans *trans, struct bkey_s_c old, struct bkey_i *new, struct bch_fs_usage *fs_usage, - unsigned flags) + unsigned flags, + bool is_extents) { struct bch_fs *c = trans->c; - struct btree *b = iter->l[0].b; unsigned offset = 0; - s64 sectors = 0; + s64 sectors = -((s64) old.k->size); flags |= BTREE_TRIGGER_OVERWRITE; - if (btree_node_is_extents(b) + if (is_extents ? 
bkey_cmp(new->k.p, bkey_start_pos(old.k)) <= 0 : bkey_cmp(new->k.p, old.k->p)) return 0; - if (btree_node_is_extents(b)) { + if (is_extents) { switch (bch2_extent_overlap(&new->k, old.k)) { case BCH_EXTENT_OVERLAP_ALL: offset = 0; @@ -1334,13 +1335,13 @@ int bch2_mark_update(struct btree_trans *trans, !bkey_deleted(&insert->k)) return 0; - while ((_k = bch2_btree_node_iter_peek_filter(&node_iter, b, - KEY_TYPE_discard))) { + while ((_k = bch2_btree_node_iter_peek(&node_iter, b))) { struct bkey unpacked; struct bkey_s_c k = bkey_disassemble(b, _k, &unpacked); ret = bch2_mark_overwrite(trans, iter, k, insert, - fs_usage, flags); + fs_usage, flags, + btree_node_type_is_extents(iter->btree_id)); if (ret <= 0) break; @@ -1380,8 +1381,7 @@ void bch2_trans_fs_usage_apply(struct btree_trans *trans, pr_err("overlapping with"); node_iter = iter->l[0].iter; - while ((_k = bch2_btree_node_iter_peek_filter(&node_iter, b, - KEY_TYPE_discard))) { + while ((_k = bch2_btree_node_iter_peek(&node_iter, b))) { struct bkey unpacked; struct bkey_s_c k; @@ -1443,8 +1443,7 @@ static int bch2_trans_mark_pointer(struct btree_trans *trans, struct bkey_s_c k; struct bkey_alloc_unpacked u; struct bkey_i_alloc *a; - u16 *dst_sectors; - bool overflow; + u16 *dst_sectors, orig_sectors; int ret; ret = trans_get_key(trans, BTREE_ID_ALLOC, @@ -1501,13 +1500,12 @@ static int bch2_trans_mark_pointer(struct btree_trans *trans, dst_sectors = !p.ptr.cached ? &u.dirty_sectors : &u.cached_sectors; + orig_sectors = *dst_sectors; - overflow = checked_add(*dst_sectors, sectors); - - if (overflow) { + if (checked_add(*dst_sectors, sectors)) { bch2_fs_inconsistent(c, "bucket sector count overflow: %u + %lli > U16_MAX", - *dst_sectors, sectors); + orig_sectors, sectors); /* return an error indicating that we need full fsck */ ret = -EIO; goto out; @@ -1672,8 +1670,7 @@ static int __bch2_trans_mark_reflink_p(struct btree_trans *trans, k.k->p.offset > idx + sectors)) goto out; - bch2_btree_iter_set_pos(iter, bkey_start_pos(k.k)); - BUG_ON(iter->uptodate > BTREE_ITER_NEED_PEEK); + sectors = k.k->p.offset - idx; r_v = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); ret = PTR_ERR_OR_ZERO(r_v); @@ -1690,9 +1687,12 @@ static int __bch2_trans_mark_reflink_p(struct btree_trans *trans, set_bkey_val_u64s(&r_v->k, 0); } + bch2_btree_iter_set_pos(iter, bkey_start_pos(k.k)); + BUG_ON(iter->uptodate > BTREE_ITER_NEED_PEEK); + bch2_trans_update(trans, iter, &r_v->k_i, 0); out: - ret = k.k->p.offset - idx; + ret = sectors; err: bch2_trans_iter_put(trans, iter); return ret; @@ -1729,6 +1729,7 @@ int bch2_trans_mark_key(struct btree_trans *trans, struct bkey_s_c k, switch (k.k->type) { case KEY_TYPE_btree_ptr: + case KEY_TYPE_btree_ptr_v2: sectors = !(flags & BTREE_TRIGGER_OVERWRITE) ? 
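
The bucket-accounting hunk above saves orig_sectors before calling checked_add(), so the inconsistency message reports the pre-add value rather than a possibly mutated counter. A sketch of such an overflow-checked add for a u16 sector count (the helper name and whether the destination is written on overflow are assumptions; here it is left untouched):

#include <stdbool.h>
#include <stdint.h>

/* returns true on overflow, like the checked_add() used above */
static bool toy_checked_add_u16(uint16_t *dst, int64_t v)
{
	int64_t r = (int64_t) *dst + v;

	if (r < 0 || r > UINT16_MAX)
		return true;
	*dst = (uint16_t) r;
	return false;
}
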
c->opts.btree_node_size : -c->opts.btree_node_size; @@ -1792,8 +1793,7 @@ int bch2_trans_mark_update(struct btree_trans *trans, if (unlikely(flags & BTREE_TRIGGER_NOOVERWRITES)) return 0; - while ((_k = bch2_btree_node_iter_peek_filter(&node_iter, b, - KEY_TYPE_discard))) { + while ((_k = bch2_btree_node_iter_peek(&node_iter, b))) { struct bkey unpacked; struct bkey_s_c k; unsigned offset = 0; diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h index 4717a1a6f568..765650ce9d0a 100644 --- a/fs/bcachefs/buckets.h +++ b/fs/bcachefs/buckets.h @@ -97,7 +97,8 @@ static inline struct bucket *PTR_BUCKET(struct bch_dev *ca, static inline enum bch_data_type ptr_data_type(const struct bkey *k, const struct bch_extent_ptr *ptr) { - if (k->type == KEY_TYPE_btree_ptr) + if (k->type == KEY_TYPE_btree_ptr || + k->type == KEY_TYPE_btree_ptr_v2) return BCH_DATA_BTREE; return ptr->cached ? BCH_DATA_CACHED : BCH_DATA_USER; @@ -267,7 +268,7 @@ int bch2_fs_usage_apply(struct bch_fs *, struct bch_fs_usage *, int bch2_mark_overwrite(struct btree_trans *, struct btree_iter *, struct bkey_s_c, struct bkey_i *, - struct bch_fs_usage *, unsigned); + struct bch_fs_usage *, unsigned, bool); int bch2_mark_update(struct btree_trans *, struct btree_iter *, struct bkey_i *, struct bch_fs_usage *, unsigned); diff --git a/fs/bcachefs/checksum.c b/fs/bcachefs/checksum.c index ad6993b7565a..6f1afa4a3119 100644 --- a/fs/bcachefs/checksum.c +++ b/fs/bcachefs/checksum.c @@ -10,7 +10,7 @@ #include <linux/random.h> #include <linux/scatterlist.h> #include <crypto/algapi.h> -#include <crypto/chacha20.h> +#include <crypto/chacha.h> #include <crypto/hash.h> #include <crypto/poly1305.h> #include <keys/user-type.h> @@ -67,21 +67,21 @@ static u64 bch2_checksum_update(unsigned type, u64 crc, const void *data, size_t } } -static inline void do_encrypt_sg(struct crypto_skcipher *tfm, +static inline void do_encrypt_sg(struct crypto_sync_skcipher *tfm, struct nonce nonce, struct scatterlist *sg, size_t len) { - SKCIPHER_REQUEST_ON_STACK(req, tfm); + SYNC_SKCIPHER_REQUEST_ON_STACK(req, tfm); int ret; - skcipher_request_set_tfm(req, tfm); + skcipher_request_set_sync_tfm(req, tfm); skcipher_request_set_crypt(req, sg, sg, len, nonce.d); ret = crypto_skcipher_encrypt(req); BUG_ON(ret); } -static inline void do_encrypt(struct crypto_skcipher *tfm, +static inline void do_encrypt(struct crypto_sync_skcipher *tfm, struct nonce nonce, void *buf, size_t len) { @@ -94,8 +94,8 @@ static inline void do_encrypt(struct crypto_skcipher *tfm, int bch2_chacha_encrypt_key(struct bch_key *key, struct nonce nonce, void *buf, size_t len) { - struct crypto_skcipher *chacha20 = - crypto_alloc_skcipher("chacha20", 0, 0); + struct crypto_sync_skcipher *chacha20 = + crypto_alloc_sync_skcipher("chacha20", 0, 0); int ret; if (!chacha20) { @@ -103,7 +103,8 @@ int bch2_chacha_encrypt_key(struct bch_key *key, struct nonce nonce, return PTR_ERR(chacha20); } - ret = crypto_skcipher_setkey(chacha20, (void *) key, sizeof(*key)); + ret = crypto_skcipher_setkey(&chacha20->base, + (void *) key, sizeof(*key)); if (ret) { pr_err("crypto_skcipher_setkey() error: %i", ret); goto err; @@ -111,7 +112,7 @@ int bch2_chacha_encrypt_key(struct bch_key *key, struct nonce nonce, do_encrypt(chacha20, nonce, buf, len); err: - crypto_free_skcipher(chacha20); + crypto_free_sync_skcipher(chacha20); return ret; } @@ -198,7 +199,7 @@ static struct bch_csum __bch2_checksum_bio(struct bch_fs *c, unsigned type, kunmap_atomic(p); } #else - __bio_for_each_contig_segment(bv, bio, *iter, *iter) + 
__bio_for_each_bvec(bv, bio, *iter, *iter) crc = bch2_checksum_update(type, crc, page_address(bv.bv_page) + bv.bv_offset, bv.bv_len); @@ -223,7 +224,7 @@ static struct bch_csum __bch2_checksum_bio(struct bch_fs *c, unsigned type, kunmap_atomic(p); } #else - __bio_for_each_contig_segment(bv, bio, *iter, *iter) + __bio_for_each_bvec(bv, bio, *iter, *iter) crypto_shash_update(desc, page_address(bv.bv_page) + bv.bv_offset, bv.bv_len); @@ -324,7 +325,7 @@ int bch2_rechecksum_bio(struct bch_fs *c, struct bio *bio, BUG_ON(len_a + len_b > bio_sectors(bio)); BUG_ON(crc_old.uncompressed_size != bio_sectors(bio)); - BUG_ON(crc_old.compression_type); + BUG_ON(crc_is_compressed(crc_old)); BUG_ON(bch2_csum_type_is_encryption(crc_old.csum_type) != bch2_csum_type_is_encryption(new_csum_type)); @@ -353,6 +354,7 @@ int bch2_rechecksum_bio(struct bch_fs *c, struct bio *bio, if (i->crc) *i->crc = (struct bch_extent_crc_unpacked) { .csum_type = i->csum_type, + .compression_type = crc_old.compression_type, .compressed_size = i->len, .uncompressed_size = i->len, .offset = 0, @@ -461,7 +463,7 @@ err: static int bch2_alloc_ciphers(struct bch_fs *c) { if (!c->chacha20) - c->chacha20 = crypto_alloc_skcipher("chacha20", 0, 0); + c->chacha20 = crypto_alloc_sync_skcipher("chacha20", 0, 0); if (IS_ERR(c->chacha20)) { bch_err(c, "error requesting chacha20 module: %li", PTR_ERR(c->chacha20)); @@ -544,7 +546,7 @@ int bch2_enable_encryption(struct bch_fs *c, bool keyed) goto err; } - ret = crypto_skcipher_setkey(c->chacha20, + ret = crypto_skcipher_setkey(&c->chacha20->base, (void *) &key.key, sizeof(key.key)); if (ret) goto err; @@ -572,7 +574,7 @@ void bch2_fs_encryption_exit(struct bch_fs *c) if (!IS_ERR_OR_NULL(c->poly1305)) crypto_free_shash(c->poly1305); if (!IS_ERR_OR_NULL(c->chacha20)) - crypto_free_skcipher(c->chacha20); + crypto_free_sync_skcipher(c->chacha20); if (!IS_ERR_OR_NULL(c->sha256)) crypto_free_shash(c->sha256); } @@ -604,7 +606,7 @@ int bch2_fs_encryption_init(struct bch_fs *c) if (ret) goto out; - ret = crypto_skcipher_setkey(c->chacha20, + ret = crypto_skcipher_setkey(&c->chacha20->base, (void *) &key.key, sizeof(key.key)); if (ret) goto out; diff --git a/fs/bcachefs/checksum.h b/fs/bcachefs/checksum.h index 1963cbfaaa05..24dee8039d57 100644 --- a/fs/bcachefs/checksum.h +++ b/fs/bcachefs/checksum.h @@ -7,7 +7,7 @@ #include "super-io.h" #include <linux/crc64.h> -#include <crypto/chacha20.h> +#include <crypto/chacha.h> static inline bool bch2_checksum_mergeable(unsigned type) { @@ -138,9 +138,9 @@ static inline bool bch2_crc_cmp(struct bch_csum l, struct bch_csum r) /* for skipping ahead and encrypting/decrypting at an offset: */ static inline struct nonce nonce_add(struct nonce nonce, unsigned offset) { - EBUG_ON(offset & (CHACHA20_BLOCK_SIZE - 1)); + EBUG_ON(offset & (CHACHA_BLOCK_SIZE - 1)); - le32_add_cpu(&nonce.d[0], offset / CHACHA20_BLOCK_SIZE); + le32_add_cpu(&nonce.d[0], offset / CHACHA_BLOCK_SIZE); return nonce; } @@ -155,13 +155,16 @@ static inline struct nonce null_nonce(void) static inline struct nonce extent_nonce(struct bversion version, struct bch_extent_crc_unpacked crc) { - unsigned size = crc.compression_type ? crc.uncompressed_size : 0; + unsigned compression_type = crc_is_compressed(crc) + ? crc.compression_type + : 0; + unsigned size = compression_type ? 
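
nonce_add() in the checksum hunks above advances the ChaCha20 keystream by whole blocks: the byte offset must be a multiple of CHACHA_BLOCK_SIZE (64 bytes), and the first 32-bit nonce word serves as the block counter. Sketched outside the kernel crypto API:

#include <assert.h>
#include <stdint.h>

#define TOY_CHACHA_BLOCK_SIZE 64u

static void toy_nonce_add(uint32_t nonce[4], unsigned offset)
{
	assert(!(offset & (TOY_CHACHA_BLOCK_SIZE - 1)));
	/* the real code does le32_add_cpu() on nonce.d[0]: */
	nonce[0] += offset / TOY_CHACHA_BLOCK_SIZE;
}
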
crc.uncompressed_size : 0; struct nonce nonce = (struct nonce) {{ [0] = cpu_to_le32(size << 22), [1] = cpu_to_le32(version.lo), [2] = cpu_to_le32(version.lo >> 32), [3] = cpu_to_le32(version.hi| - (crc.compression_type << 24))^BCH_NONCE_EXTENT, + (compression_type << 24))^BCH_NONCE_EXTENT, }}; return nonce_add(nonce, crc.nonce << 9); diff --git a/fs/bcachefs/compress.c b/fs/bcachefs/compress.c index b6b4ec48dccc..0713286d7999 100644 --- a/fs/bcachefs/compress.c +++ b/fs/bcachefs/compress.c @@ -17,7 +17,6 @@ struct bbuf { BB_NONE, BB_VMAP, BB_KMALLOC, - BB_VMALLOC, BB_MEMPOOL, } type; int rw; @@ -33,17 +32,7 @@ static struct bbuf __bounce_alloc(struct bch_fs *c, unsigned size, int rw) if (b) return (struct bbuf) { .b = b, .type = BB_KMALLOC, .rw = rw }; - b = mempool_alloc(&c->compression_bounce[rw], GFP_NOWAIT); - b = b ? page_address(b) : NULL; - if (b) - return (struct bbuf) { .b = b, .type = BB_MEMPOOL, .rw = rw }; - - b = vmalloc(size); - if (b) - return (struct bbuf) { .b = b, .type = BB_VMALLOC, .rw = rw }; - b = mempool_alloc(&c->compression_bounce[rw], GFP_NOIO); - b = b ? page_address(b) : NULL; if (b) return (struct bbuf) { .b = b, .type = BB_MEMPOOL, .rw = rw }; @@ -66,7 +55,7 @@ static struct bbuf __bio_map_or_bounce(struct bch_fs *c, struct bio *bio, BUG_ON(bvec_iter_sectors(start) > c->sb.encoded_extent_max); #ifndef CONFIG_HIGHMEM - __bio_for_each_contig_segment(bv, bio, iter, start) { + __bio_for_each_bvec(bv, bio, iter, start) { if (bv.bv_len == start.bi_size) return (struct bbuf) { .b = page_address(bv.bv_page) + bv.bv_offset, @@ -129,12 +118,8 @@ static void bio_unmap_or_unbounce(struct bch_fs *c, struct bbuf buf) case BB_KMALLOC: kfree(buf.b); break; - case BB_VMALLOC: - vfree(buf.b); - break; case BB_MEMPOOL: - mempool_free(virt_to_page(buf.b), - &c->compression_bounce[buf.rw]); + mempool_free(buf.b, &c->compression_bounce[buf.rw]); break; } } @@ -434,7 +419,7 @@ out: bio_unmap_or_unbounce(c, dst_data); return compression_type; err: - compression_type = 0; + compression_type = BCH_COMPRESSION_TYPE_incompressible; goto out; } @@ -561,15 +546,15 @@ static int __bch2_fs_compress_init(struct bch_fs *c, u64 features) have_compressed: if (!mempool_initialized(&c->compression_bounce[READ])) { - ret = mempool_init_page_pool(&c->compression_bounce[READ], - 1, order); + ret = mempool_init_kvpmalloc_pool(&c->compression_bounce[READ], + 1, order); if (ret) goto out; } if (!mempool_initialized(&c->compression_bounce[WRITE])) { - ret = mempool_init_page_pool(&c->compression_bounce[WRITE], - 1, order); + ret = mempool_init_kvpmalloc_pool(&c->compression_bounce[WRITE], + 1, order); if (ret) goto out; } diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c index 623b6c3eda95..ae5c9fd8d9f7 100644 --- a/fs/bcachefs/dirent.c +++ b/fs/bcachefs/dirent.c @@ -169,12 +169,12 @@ int bch2_dirent_rename(struct btree_trans *trans, const struct qstr *dst_name, u64 *dst_inum, enum bch_rename_mode mode) { - struct btree_iter *src_iter, *dst_iter; + struct btree_iter *src_iter = NULL, *dst_iter = NULL; struct bkey_s_c old_src, old_dst; struct bkey_i_dirent *new_src = NULL, *new_dst = NULL; struct bpos dst_pos = POS(dst_dir, bch2_dirent_hash(dst_hash, dst_name)); - int ret; + int ret = 0; *src_inum = *dst_inum = 0; @@ -191,8 +191,10 @@ int bch2_dirent_rename(struct btree_trans *trans, : bch2_hash_lookup(trans, bch2_dirent_hash_desc, dst_hash, dst_dir, dst_name, BTREE_ITER_INTENT); - if (IS_ERR(dst_iter)) - return PTR_ERR(dst_iter); + ret = PTR_ERR_OR_ZERO(dst_iter); + if (ret) + goto out; + old_dst 
= bch2_btree_iter_peek_slot(dst_iter); if (mode != BCH_RENAME) @@ -202,15 +204,18 @@ int bch2_dirent_rename(struct btree_trans *trans, src_iter = bch2_hash_lookup(trans, bch2_dirent_hash_desc, src_hash, src_dir, src_name, BTREE_ITER_INTENT); - if (IS_ERR(src_iter)) - return PTR_ERR(src_iter); + ret = PTR_ERR_OR_ZERO(src_iter); + if (ret) + goto out; + old_src = bch2_btree_iter_peek_slot(src_iter); *src_inum = le64_to_cpu(bkey_s_c_to_dirent(old_src).v->d_inum); /* Create new dst key: */ new_dst = dirent_create_key(trans, 0, dst_name, 0); - if (IS_ERR(new_dst)) - return PTR_ERR(new_dst); + ret = PTR_ERR_OR_ZERO(new_dst); + if (ret) + goto out; dirent_copy_target(new_dst, bkey_s_c_to_dirent(old_src)); new_dst->k.p = dst_iter->pos; @@ -218,15 +223,18 @@ int bch2_dirent_rename(struct btree_trans *trans, /* Create new src key: */ if (mode == BCH_RENAME_EXCHANGE) { new_src = dirent_create_key(trans, 0, src_name, 0); - if (IS_ERR(new_src)) - return PTR_ERR(new_src); + ret = PTR_ERR_OR_ZERO(new_src); + if (ret) + goto out; dirent_copy_target(new_src, bkey_s_c_to_dirent(old_dst)); new_src->k.p = src_iter->pos; } else { new_src = bch2_trans_kmalloc(trans, sizeof(struct bkey_i)); - if (IS_ERR(new_src)) - return PTR_ERR(new_src); + ret = PTR_ERR_OR_ZERO(new_src); + if (ret) + goto out; + bkey_init(&new_src->k); new_src->k.p = src_iter->pos; @@ -247,7 +255,7 @@ int bch2_dirent_rename(struct btree_trans *trans, new_dst->k.p = src_iter->pos; bch2_trans_update(trans, src_iter, &new_dst->k_i, 0); - return 0; + goto out; } else { /* If we're overwriting, we can't insert new_dst * at a different slot because it has to @@ -261,7 +269,7 @@ int bch2_dirent_rename(struct btree_trans *trans, ret = bch2_hash_needs_whiteout(trans, bch2_dirent_hash_desc, src_hash, src_iter); if (ret < 0) - return ret; + goto out; if (ret) new_src->k.type = KEY_TYPE_whiteout; @@ -270,7 +278,10 @@ int bch2_dirent_rename(struct btree_trans *trans, bch2_trans_update(trans, src_iter, &new_src->k_i, 0); bch2_trans_update(trans, dst_iter, &new_dst->k_i, 0); - return 0; +out: + bch2_trans_iter_put(trans, src_iter); + bch2_trans_iter_put(trans, dst_iter); + return ret; } int bch2_dirent_delete_at(struct btree_trans *trans, @@ -331,9 +342,7 @@ int bch2_empty_dir_trans(struct btree_trans *trans, u64 dir_inum) break; } } - - if (!IS_ERR(iter)) - bch2_trans_iter_put(trans, iter); + bch2_trans_iter_put(trans, iter); return ret; } diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index a49d0745c720..933945b65925 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -741,6 +741,8 @@ found_slot: ret = bch2_trans_commit(&trans, NULL, NULL, BTREE_INSERT_NOFAIL); err: + bch2_trans_iter_put(&trans, iter); + if (ret == -EINTR) goto retry; @@ -802,8 +804,6 @@ static int ec_stripe_update_ptrs(struct bch_fs *c, continue; } - bch2_btree_iter_set_pos(iter, bkey_start_pos(k.k)); - dev = s->key.v.ptrs[idx].dev; bkey_on_stack_reassemble(&sk, c, k); @@ -818,6 +818,7 @@ static int ec_stripe_update_ptrs(struct bch_fs *c, extent_stripe_ptr_add(e, s, ec_ptr, idx); + bch2_btree_iter_set_pos(iter, bkey_start_pos(&sk.k->k)); bch2_trans_update(&trans, iter, sk.k, 0); ret = bch2_trans_commit(&trans, NULL, NULL, @@ -1201,8 +1202,7 @@ static int __bch2_stripe_write_key(struct btree_trans *trans, struct btree_iter *iter, struct stripe *m, size_t idx, - struct bkey_i_stripe *new_key, - unsigned flags) + struct bkey_i_stripe *new_key) { struct bch_fs *c = trans->c; struct bkey_s_c k; @@ -1231,9 +1231,7 @@ static int __bch2_stripe_write_key(struct btree_trans *trans, 
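
The dirent rename hunks above convert every early return into a goto out that puts both iterators; this works because both are initialized to NULL and the put is assumed safe on NULL. The same single-exit pattern in plain C, with free() standing in for bch2_trans_iter_put():

#include <errno.h>
#include <stdlib.h>

struct toy_iter { int unused; };

static int toy_rename(void)
{
	struct toy_iter *src = NULL, *dst = NULL;
	int ret = 0;

	dst = malloc(sizeof(*dst));
	if (!dst) {
		ret = -ENOMEM;
		goto out;
	}

	src = malloc(sizeof(*src));
	if (!src) {
		ret = -ENOMEM;
		goto out;
	}

	/* ... build the new keys and queue the updates ... */
out:
	free(src);	/* free(NULL) is a no-op, like putting a NULL iter */
	free(dst);
	return ret;
}
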
spin_unlock(&c->ec_stripes_heap_lock); bch2_trans_update(trans, iter, &new_key->k_i, 0); - - return bch2_trans_commit(trans, NULL, NULL, - BTREE_INSERT_NOFAIL|flags); + return 0; } int bch2_stripes_write(struct bch_fs *c, unsigned flags, bool *wrote) @@ -1257,12 +1255,10 @@ int bch2_stripes_write(struct bch_fs *c, unsigned flags, bool *wrote) if (!m->dirty) continue; - do { - bch2_trans_reset(&trans, TRANS_RESET_MEM); - - ret = __bch2_stripe_write_key(&trans, iter, m, - giter.pos, new_key, flags); - } while (ret == -EINTR); + ret = __bch2_trans_do(&trans, NULL, NULL, + BTREE_INSERT_NOFAIL|flags, + __bch2_stripe_write_key(&trans, iter, m, + giter.pos, new_key)); if (ret) break; @@ -1280,9 +1276,8 @@ int bch2_stripes_write(struct bch_fs *c, unsigned flags, bool *wrote) int bch2_stripes_read(struct bch_fs *c, struct journal_keys *journal_keys) { struct btree_trans trans; - struct btree_iter *btree_iter; - struct journal_iter journal_iter; - struct bkey_s_c btree_k, journal_k; + struct btree_and_journal_iter iter; + struct bkey_s_c k; int ret; ret = bch2_fs_ec_start(c); @@ -1291,38 +1286,16 @@ int bch2_stripes_read(struct bch_fs *c, struct journal_keys *journal_keys) bch2_trans_init(&trans, c, 0, 0); - btree_iter = bch2_trans_get_iter(&trans, BTREE_ID_EC, POS_MIN, 0); - journal_iter = bch2_journal_iter_init(journal_keys, BTREE_ID_EC); + bch2_btree_and_journal_iter_init(&iter, &trans, journal_keys, + BTREE_ID_EC, POS_MIN); - btree_k = bch2_btree_iter_peek(btree_iter); - journal_k = bch2_journal_iter_peek(&journal_iter); - while (1) { - bool btree; - - if (btree_k.k && journal_k.k) { - int cmp = bkey_cmp(btree_k.k->p, journal_k.k->p); - - if (!cmp) - btree_k = bch2_btree_iter_next(btree_iter); - btree = cmp < 0; - } else if (btree_k.k) { - btree = true; - } else if (journal_k.k) { - btree = false; - } else { - break; - } - - bch2_mark_key(c, btree ? 
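
The removed two-iterator loop just below (and its twin in the alloc_background.c hunk) is what bch2_btree_and_journal_iter now encapsulates: merge two position-sorted streams, and on a tie the journal entry wins because it is newer than what is in the btree. The tie-breaking core, sketched over arrays of positions with invented types:

#include <stddef.h>

struct toy_stream { const unsigned long long *pos; size_t nr, i; };

/* returns 0: take btree, 1: take journal, -1: both exhausted */
static int toy_merge_pick(struct toy_stream *b, struct toy_stream *j)
{
	if (b->i < b->nr && j->i < j->nr && b->pos[b->i] == j->pos[j->i])
		b->i++;				/* tie: journal overrides */

	if (b->i >= b->nr && j->i >= j->nr)
		return -1;
	if (b->i >= b->nr)
		return 1;
	if (j->i >= j->nr)
		return 0;
	return b->pos[b->i] < j->pos[j->i] ? 0 : 1;
}
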
btree_k : journal_k, - 0, 0, NULL, 0, + while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) { + bch2_mark_key(c, k, 0, 0, NULL, 0, BTREE_TRIGGER_ALLOC_READ| BTREE_TRIGGER_NOATOMIC); - if (btree) - btree_k = bch2_btree_iter_next(btree_iter); - else - journal_k = bch2_journal_iter_next(&journal_iter); + bch2_btree_and_journal_iter_advance(&iter); } ret = bch2_trans_exit(&trans) ?: ret; diff --git a/fs/bcachefs/ec.h b/fs/bcachefs/ec.h index 8d9fbfd19f66..cf67abd48490 100644 --- a/fs/bcachefs/ec.h +++ b/fs/bcachefs/ec.h @@ -12,6 +12,7 @@ void bch2_stripe_to_text(struct printbuf *, struct bch_fs *, #define bch2_bkey_ops_stripe (struct bkey_ops) { \ .key_invalid = bch2_stripe_invalid, \ .val_to_text = bch2_stripe_to_text, \ + .swab = bch2_ptr_swab, \ } static inline unsigned stripe_csums_per_device(const struct bch_stripe *s) diff --git a/fs/bcachefs/extent_update.c b/fs/bcachefs/extent_update.c index 846d77dc2530..2a7d913bdda3 100644 --- a/fs/bcachefs/extent_update.c +++ b/fs/bcachefs/extent_update.c @@ -39,6 +39,16 @@ static int count_iters_for_insert(struct btree_trans *trans, { int ret = 0; + /* + * The extent update path requires an _additional_ iterator for each + * extent we're inserting and overwriting: + */ + *nr_iters += 1; + if (*nr_iters >= max_iters) { + *end = bpos_min(*end, k.k->p); + ret = 1; + } + switch (k.k->type) { case KEY_TYPE_extent: case KEY_TYPE_reflink_v: @@ -105,7 +115,9 @@ int bch2_extent_atomic_end(struct btree_iter *iter, b = iter->l[0].b; node_iter = iter->l[0].iter; - BUG_ON(bkey_cmp(bkey_start_pos(&insert->k), b->data->min_key) < 0); + BUG_ON(bkey_cmp(b->data->min_key, POS_MIN) && + bkey_cmp(bkey_start_pos(&insert->k), + bkey_predecessor(b->data->min_key)) < 0); *end = bpos_min(insert->k.p, b->key.k.p); @@ -114,8 +126,7 @@ int bch2_extent_atomic_end(struct btree_iter *iter, if (ret < 0) return ret; - while ((_k = bch2_btree_node_iter_peek_filter(&node_iter, b, - KEY_TYPE_discard))) { + while ((_k = bch2_btree_node_iter_peek(&node_iter, b))) { struct bkey unpacked; struct bkey_s_c k = bkey_disassemble(b, _k, &unpacked); unsigned offset = 0; @@ -167,402 +178,39 @@ int bch2_extent_is_atomic(struct bkey_i *k, struct btree_iter *iter) enum btree_insert_ret bch2_extent_can_insert(struct btree_trans *trans, struct btree_iter *iter, - struct bkey_i *insert, - unsigned *u64s) + struct bkey_i *insert) { struct btree_iter_level *l = &iter->l[0]; struct btree_node_iter node_iter = l->iter; struct bkey_packed *_k; + struct bkey_s_c k; struct bkey unpacked; int sectors; - while ((_k = bch2_btree_node_iter_peek_filter(&node_iter, l->b, - KEY_TYPE_discard))) { - struct bkey_s_c k = bkey_disassemble(l->b, _k, &unpacked); - enum bch_extent_overlap overlap = - bch2_extent_overlap(&insert->k, k.k); - - if (bkey_cmp(bkey_start_pos(k.k), insert->k.p) >= 0) - break; - - overlap = bch2_extent_overlap(&insert->k, k.k); - - /* - * If we're overwriting an existing extent, we may need to emit - * a whiteout - unless we're inserting a new extent at the same - * position: - */ - if (k.k->needs_whiteout && - (!bkey_whiteout(&insert->k) || - bkey_cmp(k.k->p, insert->k.p))) - *u64s += BKEY_U64s; - - /* - * If we're partially overwriting an existing extent which has - * been written out to disk, we'll need to emit a new version of - * that extent: - */ - if (bkey_written(l->b, _k) && - overlap != BCH_EXTENT_OVERLAP_ALL) - *u64s += _k->u64s; - - /* And we may be splitting an existing extent: */ - if (overlap == BCH_EXTENT_OVERLAP_MIDDLE) - *u64s += _k->u64s; - - if (overlap == 
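
count_iters_for_insert() above now charges one additional iterator for every key the extent update touches; once the budget is reached it clamps *end so the commit stays atomic up to that point. The clamping logic in isolation (the real function sets a flag and keeps classifying the key; this sketch just returns it):

static int toy_count_iters(unsigned *nr_iters, unsigned max_iters,
			   unsigned long long *end, unsigned long long k_p)
{
	*nr_iters += 1;
	if (*nr_iters >= max_iters) {
		if (k_p < *end)
			*end = k_p;	/* bpos_min(*end, k.k->p) */
		return 1;		/* stop scanning here */
	}
	return 0;
}
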
BCH_EXTENT_OVERLAP_MIDDLE && - (sectors = bch2_bkey_sectors_compressed(k))) { - int flags = trans->flags & BTREE_INSERT_NOFAIL - ? BCH_DISK_RESERVATION_NOFAIL : 0; - - switch (bch2_disk_reservation_add(trans->c, - trans->disk_res, - sectors, flags)) { - case 0: - break; - case -ENOSPC: - return BTREE_INSERT_ENOSPC; - default: - BUG(); - } - } - - if (overlap == BCH_EXTENT_OVERLAP_FRONT || - overlap == BCH_EXTENT_OVERLAP_MIDDLE) - break; - - bch2_btree_node_iter_advance(&node_iter, l->b); - } - - return BTREE_INSERT_OK; -} - -static void verify_extent_nonoverlapping(struct bch_fs *c, - struct btree *b, - struct btree_node_iter *_iter, - struct bkey_i *insert) -{ -#ifdef CONFIG_BCACHEFS_DEBUG - struct btree_node_iter iter; - struct bkey_packed *k; - struct bkey uk; - - if (!expensive_debug_checks(c)) - return; - - iter = *_iter; - k = bch2_btree_node_iter_prev_filter(&iter, b, KEY_TYPE_discard); - BUG_ON(k && - (uk = bkey_unpack_key(b, k), - bkey_cmp(uk.p, bkey_start_pos(&insert->k)) > 0)); - - iter = *_iter; - k = bch2_btree_node_iter_peek_filter(&iter, b, KEY_TYPE_discard); -#if 0 - BUG_ON(k && - (uk = bkey_unpack_key(b, k), - bkey_cmp(insert->k.p, bkey_start_pos(&uk))) > 0); -#else - if (k && - (uk = bkey_unpack_key(b, k), - bkey_cmp(insert->k.p, bkey_start_pos(&uk))) > 0) { - char buf1[100]; - char buf2[100]; - - bch2_bkey_to_text(&PBUF(buf1), &insert->k); - bch2_bkey_to_text(&PBUF(buf2), &uk); - - bch2_dump_btree_node(b); - panic("insert > next :\n" - "insert %s\n" - "next %s\n", - buf1, buf2); - } -#endif - -#endif -} - -static void extent_bset_insert(struct bch_fs *c, struct btree_iter *iter, - struct bkey_i *insert) -{ - struct btree_iter_level *l = &iter->l[0]; - struct bkey_packed *k = - bch2_btree_node_iter_bset_pos(&l->iter, l->b, bset_tree_last(l->b)); - - BUG_ON(insert->k.u64s > bch_btree_keys_u64s_remaining(c, l->b)); - - EBUG_ON(bkey_deleted(&insert->k) || !insert->k.size); - verify_extent_nonoverlapping(c, l->b, &l->iter, insert); - - if (debug_check_bkeys(c)) - bch2_bkey_debugcheck(c, l->b, bkey_i_to_s_c(insert)); - - bch2_bset_insert(l->b, &l->iter, k, insert, 0); - bch2_btree_node_iter_fix(iter, l->b, &l->iter, k, 0, k->u64s); -} - -static void pack_push_whiteout(struct bch_fs *c, struct btree *b, - struct bpos pos) -{ - struct bkey_packed k; - - if (!bkey_pack_pos(&k, pos, b)) { - struct bkey_i tmp; + _k = bch2_btree_node_iter_peek(&node_iter, l->b); + if (!_k) + return BTREE_INSERT_OK; - bkey_init(&tmp.k); - tmp.k.p = pos; - bkey_copy(&k, &tmp); - } + k = bkey_disassemble(l->b, _k, &unpacked); - k.needs_whiteout = true; - push_whiteout(c, b, &k); -} + /* Check if we're splitting a compressed extent: */ -static void -extent_drop(struct bch_fs *c, struct btree_iter *iter, - struct bkey_packed *_k, struct bkey_s k) -{ - struct btree_iter_level *l = &iter->l[0]; - - if (!bkey_whiteout(k.k)) - btree_account_key_drop(l->b, _k); - - k.k->size = 0; - k.k->type = KEY_TYPE_deleted; - - if (!btree_node_old_extent_overwrite(l->b) && - k.k->needs_whiteout) { - pack_push_whiteout(c, l->b, k.k->p); - k.k->needs_whiteout = false; - } - - if (_k >= btree_bset_last(l->b)->start) { - unsigned u64s = _k->u64s; - - bch2_bset_delete(l->b, _k, _k->u64s); - bch2_btree_node_iter_fix(iter, l->b, &l->iter, _k, u64s, 0); - } else { - extent_save(l->b, _k, k.k); - bch2_btree_iter_fix_key_modified(iter, l->b, _k); - } -} - -static void -extent_squash(struct bch_fs *c, struct btree_iter *iter, - struct bkey_i *insert, - struct bkey_packed *_k, struct bkey_s k, - enum bch_extent_overlap overlap) -{ - 
struct btree_iter_level *l = &iter->l[0]; - struct bkey_on_stack tmp, split; + if (bkey_cmp(bkey_start_pos(&insert->k), bkey_start_pos(k.k)) > 0 && + bkey_cmp(insert->k.p, k.k->p) < 0 && + (sectors = bch2_bkey_sectors_compressed(k))) { + int flags = trans->flags & BTREE_INSERT_NOFAIL + ? BCH_DISK_RESERVATION_NOFAIL : 0; - bkey_on_stack_init(&tmp); - bkey_on_stack_init(&split); - - if (!btree_node_old_extent_overwrite(l->b)) { - if (!bkey_whiteout(&insert->k) && - !bkey_cmp(k.k->p, insert->k.p)) { - insert->k.needs_whiteout = k.k->needs_whiteout; - k.k->needs_whiteout = false; - } - } else { - insert->k.needs_whiteout |= k.k->needs_whiteout; - } - - switch (overlap) { - case BCH_EXTENT_OVERLAP_FRONT: - if (bkey_written(l->b, _k)) { - bkey_on_stack_reassemble(&tmp, c, k.s_c); - bch2_cut_front(insert->k.p, tmp.k); - - /* - * needs_whiteout was propagated to new version of @k, - * @tmp: - */ - if (!btree_node_old_extent_overwrite(l->b)) - k.k->needs_whiteout = false; - - extent_drop(c, iter, _k, k); - extent_bset_insert(c, iter, tmp.k); - } else { - btree_keys_account_val_delta(l->b, _k, - bch2_cut_front_s(insert->k.p, k)); - - extent_save(l->b, _k, k.k); - /* - * No need to call bset_fix_invalidated_key, start of - * extent changed but extents are indexed by where they - * end - */ - bch2_btree_iter_fix_key_modified(iter, l->b, _k); - } - break; - case BCH_EXTENT_OVERLAP_BACK: - if (bkey_written(l->b, _k)) { - bkey_on_stack_reassemble(&tmp, c, k.s_c); - bch2_cut_back(bkey_start_pos(&insert->k), tmp.k); - - /* - * @tmp has different position than @k, needs_whiteout - * should not be propagated: - */ - if (!btree_node_old_extent_overwrite(l->b)) - tmp.k->k.needs_whiteout = false; - - extent_drop(c, iter, _k, k); - extent_bset_insert(c, iter, tmp.k); - } else { - /* - * position of @k is changing, emit a whiteout if - * needs_whiteout is set: - */ - if (!btree_node_old_extent_overwrite(l->b) && - k.k->needs_whiteout) { - pack_push_whiteout(c, l->b, k.k->p); - k.k->needs_whiteout = false; - } - - btree_keys_account_val_delta(l->b, _k, - bch2_cut_back_s(bkey_start_pos(&insert->k), k)); - extent_save(l->b, _k, k.k); - - bch2_bset_fix_invalidated_key(l->b, _k); - bch2_btree_node_iter_fix(iter, l->b, &l->iter, - _k, _k->u64s, _k->u64s); - } - break; - case BCH_EXTENT_OVERLAP_ALL: - extent_drop(c, iter, _k, k); - break; - case BCH_EXTENT_OVERLAP_MIDDLE: - bkey_on_stack_reassemble(&split, c, k.s_c); - bch2_cut_back(bkey_start_pos(&insert->k), split.k); - - if (!btree_node_old_extent_overwrite(l->b)) - split.k->k.needs_whiteout = false; - - /* this is identical to BCH_EXTENT_OVERLAP_FRONT: */ - if (bkey_written(l->b, _k)) { - bkey_on_stack_reassemble(&tmp, c, k.s_c); - bch2_cut_front(insert->k.p, tmp.k); - - if (!btree_node_old_extent_overwrite(l->b)) - k.k->needs_whiteout = false; - - extent_drop(c, iter, _k, k); - extent_bset_insert(c, iter, tmp.k); - } else { - btree_keys_account_val_delta(l->b, _k, - bch2_cut_front_s(insert->k.p, k)); - - extent_save(l->b, _k, k.k); - bch2_btree_iter_fix_key_modified(iter, l->b, _k); - } - - extent_bset_insert(c, iter, split.k); - break; - } - - bkey_on_stack_exit(&split, c); - bkey_on_stack_exit(&tmp, c); -} - -/** - * bch_extent_insert_fixup - insert a new extent and deal with overlaps - * - * this may result in not actually doing the insert, or inserting some subset - * of the insert key. For cmpxchg operations this is where that logic lives. - * - * All subsets of @insert that need to be inserted are inserted using - * bch2_btree_insert_and_journal(). 
If @b or @res fills up, this function - * returns false, setting @iter->pos for the prefix of @insert that actually got - * inserted. - * - * BSET INVARIANTS: this function is responsible for maintaining all the - * invariants for bsets of extents in memory. things get really hairy with 0 - * size extents - * - * within one bset: - * - * bkey_start_pos(bkey_next(k)) >= k - * or bkey_start_offset(bkey_next(k)) >= k->offset - * - * i.e. strict ordering, no overlapping extents. - * - * multiple bsets (i.e. full btree node): - * - * ∀ k, j - * k.size != 0 ∧ j.size != 0 → - * ¬ (k > bkey_start_pos(j) ∧ k < j) - * - * i.e. no two overlapping keys _of nonzero size_ - * - * We can't realistically maintain this invariant for zero size keys because of - * the key merging done in bch2_btree_insert_key() - for two mergeable keys k, j - * there may be another 0 size key between them in another bset, and it will - * thus overlap with the merged key. - * - * In addition, the end of iter->pos indicates how much has been processed. - * If the end of iter->pos is not the same as the end of insert, then - * key insertion needs to continue/be retried. - */ -void bch2_insert_fixup_extent(struct btree_trans *trans, - struct btree_iter *iter, - struct bkey_i *insert) -{ - struct bch_fs *c = trans->c; - struct btree_iter_level *l = &iter->l[0]; - struct btree_node_iter node_iter = l->iter; - bool do_update = !bkey_whiteout(&insert->k); - struct bkey_packed *_k; - struct bkey unpacked; - - EBUG_ON(iter->level); - EBUG_ON(!insert->k.size); - EBUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&insert->k))); - - while ((_k = bch2_btree_node_iter_peek_filter(&l->iter, l->b, - KEY_TYPE_discard))) { - struct bkey_s k = __bkey_disassemble(l->b, _k, &unpacked); - enum bch_extent_overlap overlap = - bch2_extent_overlap(&insert->k, k.k); - - if (bkey_cmp(bkey_start_pos(k.k), insert->k.p) >= 0) + switch (bch2_disk_reservation_add(trans->c, trans->disk_res, + sectors, flags)) { + case 0: break; - - if (!bkey_whiteout(k.k)) - do_update = true; - - if (!do_update) { - struct bpos cur_end = bpos_min(insert->k.p, k.k->p); - - bch2_cut_front(cur_end, insert); - bch2_btree_iter_set_pos_same_leaf(iter, cur_end); - } else { - extent_squash(c, iter, insert, _k, k, overlap); + case -ENOSPC: + return BTREE_INSERT_ENOSPC; + default: + BUG(); } - - node_iter = l->iter; - - if (overlap == BCH_EXTENT_OVERLAP_FRONT || - overlap == BCH_EXTENT_OVERLAP_MIDDLE) - break; } - l->iter = node_iter; - bch2_btree_iter_set_pos_same_leaf(iter, insert->k.p); - - if (do_update) { - if (insert->k.type == KEY_TYPE_deleted) - insert->k.type = KEY_TYPE_discard; - - if (!bkey_whiteout(&insert->k) || - btree_node_old_extent_overwrite(l->b)) - extent_bset_insert(c, iter, insert); - - bch2_btree_journal_key(trans, iter, insert); - } - - bch2_cut_front(insert->k.p, insert); + return BTREE_INSERT_OK; } diff --git a/fs/bcachefs/extent_update.h b/fs/bcachefs/extent_update.h index e9dc8091ba3f..38dc084627d2 100644 --- a/fs/bcachefs/extent_update.h +++ b/fs/bcachefs/extent_update.h @@ -11,9 +11,6 @@ int bch2_extent_is_atomic(struct bkey_i *, struct btree_iter *); enum btree_insert_ret bch2_extent_can_insert(struct btree_trans *, struct btree_iter *, - struct bkey_i *, unsigned *); -void bch2_insert_fixup_extent(struct btree_trans *, - struct btree_iter *, - struct bkey_i *); + struct bkey_i *); #endif /* _BCACHEFS_EXTENT_UPDATE_H */ diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index c4b0b9e15a8f..3f66457d2272 100644 --- a/fs/bcachefs/extents.c +++ 
b/fs/bcachefs/extents.c @@ -9,6 +9,7 @@ #include "bcachefs.h" #include "bkey_methods.h" #include "btree_gc.h" +#include "btree_io.h" #include "btree_iter.h" #include "buckets.h" #include "checksum.h" @@ -214,6 +215,37 @@ void bch2_btree_ptr_to_text(struct printbuf *out, struct bch_fs *c, bch2_bkey_ptrs_to_text(out, c, k); } +void bch2_btree_ptr_v2_to_text(struct printbuf *out, struct bch_fs *c, + struct bkey_s_c k) +{ + struct bkey_s_c_btree_ptr_v2 bp = bkey_s_c_to_btree_ptr_v2(k); + + pr_buf(out, "seq %llu sectors %u written %u min_key ", + le64_to_cpu(bp.v->seq), + le16_to_cpu(bp.v->sectors), + le16_to_cpu(bp.v->sectors_written)); + + bch2_bpos_to_text(out, bp.v->min_key); + pr_buf(out, " "); + bch2_bkey_ptrs_to_text(out, c, k); +} + +void bch2_btree_ptr_v2_compat(enum btree_id btree_id, unsigned version, + unsigned big_endian, int write, + struct bkey_s k) +{ + struct bkey_s_btree_ptr_v2 bp = bkey_s_to_btree_ptr_v2(k); + + compat_bpos(0, btree_id, version, big_endian, write, &bp.v->min_key); + + if (version < bcachefs_metadata_version_inode_btree_change && + btree_node_type_is_extents(btree_id) && + bkey_cmp(bp.v->min_key, POS_MIN)) + bp.v->min_key = write + ? bkey_predecessor(bp.v->min_key) + : bkey_successor(bp.v->min_key); +} + /* KEY_TYPE_extent: */ const char *bch2_extent_invalid(const struct bch_fs *c, struct bkey_s_c k) @@ -337,7 +369,7 @@ enum merge_result bch2_extent_merge(struct bch_fs *c, if (!bch2_checksum_mergeable(crc_l.csum_type)) return BCH_MERGE_NOMERGE; - if (crc_l.compression_type) + if (crc_is_compressed(crc_l)) return BCH_MERGE_NOMERGE; if (crc_l.csum_type && @@ -345,7 +377,7 @@ enum merge_result bch2_extent_merge(struct bch_fs *c, crc_r.uncompressed_size > c->sb.encoded_extent_max) return BCH_MERGE_NOMERGE; - if (crc_l.uncompressed_size + crc_r.uncompressed_size - 1 > + if (crc_l.uncompressed_size + crc_r.uncompressed_size > bch2_crc_field_size_max[extent_entry_type(en_l)]) return BCH_MERGE_NOMERGE; @@ -448,7 +480,7 @@ static inline bool bch2_crc_unpacked_cmp(struct bch_extent_crc_unpacked l, static inline bool can_narrow_crc(struct bch_extent_crc_unpacked u, struct bch_extent_crc_unpacked n) { - return !u.compression_type && + return !crc_is_compressed(u) && u.csum_type && u.uncompressed_size > u.live_size && bch2_csum_type_is_encryption(u.csum_type) == @@ -492,7 +524,7 @@ bool bch2_bkey_narrow_crcs(struct bkey_i *k, struct bch_extent_crc_unpacked n) /* Find a checksum entry that covers only live data: */ if (!n.csum_type) { bkey_for_each_crc(&k->k, ptrs, u, i) - if (!u.compression_type && + if (!crc_is_compressed(u) && u.csum_type && u.live_size == u.uncompressed_size) { n = u; @@ -501,7 +533,7 @@ bool bch2_bkey_narrow_crcs(struct bkey_i *k, struct bch_extent_crc_unpacked n) return false; } found: - BUG_ON(n.compression_type); + BUG_ON(crc_is_compressed(n)); BUG_ON(n.offset); BUG_ON(n.live_size != k->k.size); @@ -563,15 +595,15 @@ void bch2_extent_crc_append(struct bkey_i *k, enum bch_extent_entry_type type; if (bch_crc_bytes[new.csum_type] <= 4 && - new.uncompressed_size - 1 <= CRC32_SIZE_MAX && + new.uncompressed_size <= CRC32_SIZE_MAX && new.nonce <= CRC32_NONCE_MAX) type = BCH_EXTENT_ENTRY_crc32; else if (bch_crc_bytes[new.csum_type] <= 10 && - new.uncompressed_size - 1 <= CRC64_SIZE_MAX && + new.uncompressed_size <= CRC64_SIZE_MAX && new.nonce <= CRC64_NONCE_MAX) type = BCH_EXTENT_ENTRY_crc64; else if (bch_crc_bytes[new.csum_type] <= 16 && - new.uncompressed_size - 1 <= CRC128_SIZE_MAX && + new.uncompressed_size <= CRC128_SIZE_MAX && new.nonce <= 
CRC128_NONCE_MAX) type = BCH_EXTENT_ENTRY_crc128; else @@ -610,8 +642,7 @@ unsigned bch2_bkey_nr_ptrs_fully_allocated(struct bkey_s_c k) struct extent_ptr_decoded p; bkey_for_each_ptr_decode(k.k, ptrs, p, entry) - ret += !p.ptr.cached && - p.crc.compression_type == BCH_COMPRESSION_TYPE_none; + ret += !p.ptr.cached && !crc_is_compressed(p.crc); } return ret; @@ -625,13 +656,24 @@ unsigned bch2_bkey_sectors_compressed(struct bkey_s_c k) unsigned ret = 0; bkey_for_each_ptr_decode(k.k, ptrs, p, entry) - if (!p.ptr.cached && - p.crc.compression_type != BCH_COMPRESSION_TYPE_none) + if (!p.ptr.cached && crc_is_compressed(p.crc)) ret += p.crc.compressed_size; return ret; } +bool bch2_bkey_is_incompressible(struct bkey_s_c k) +{ + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const union bch_extent_entry *entry; + struct bch_extent_crc_unpacked crc; + + bkey_for_each_crc(k.k, ptrs, crc, entry) + if (crc.compression_type == BCH_COMPRESSION_TYPE_incompressible) + return true; + return false; +} + bool bch2_check_range_allocated(struct bch_fs *c, struct bpos pos, u64 size, unsigned nr_replicas) { @@ -739,6 +781,7 @@ void bch2_bkey_append_ptr(struct bkey_i *k, switch (k->k.type) { case KEY_TYPE_btree_ptr: + case KEY_TYPE_btree_ptr_v2: case KEY_TYPE_extent: EBUG_ON(bkey_val_u64s(&k->k) >= BKEY_EXTENT_VAL_U64s_MAX); @@ -1021,6 +1064,8 @@ const char *bch2_bkey_ptrs_invalid(const struct bch_fs *c, struct bkey_s_c k) if (k.k->type == KEY_TYPE_btree_ptr) size_ondisk = c->opts.btree_node_size; + if (k.k->type == KEY_TYPE_btree_ptr_v2) + size_ondisk = le16_to_cpu(bkey_s_c_to_btree_ptr_v2(k).v->sectors); bkey_extent_entry_for_each(ptrs, entry) { if (__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX) @@ -1069,17 +1114,19 @@ const char *bch2_bkey_ptrs_invalid(const struct bch_fs *c, struct bkey_s_c k) return NULL; } -void bch2_ptr_swab(const struct bkey_format *f, struct bkey_packed *k) +void bch2_ptr_swab(struct bkey_s k) { + struct bkey_ptrs ptrs = bch2_bkey_ptrs(k); union bch_extent_entry *entry; - u64 *d = (u64 *) bkeyp_val(f, k); - unsigned i; + u64 *d; - for (i = 0; i < bkeyp_val_u64s(f, k); i++) - d[i] = swab64(d[i]); + for (d = (u64 *) ptrs.start; + d != (u64 *) ptrs.end; + d++) + *d = swab64(*d); - for (entry = (union bch_extent_entry *) d; - entry < (union bch_extent_entry *) (d + bkeyp_val_u64s(f, k)); + for (entry = ptrs.start; + entry < ptrs.end; entry = extent_entry_next(entry)) { switch (extent_entry_type(entry)) { case BCH_EXTENT_ENTRY_ptr: diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h index 7c5a41e6d79d..29b15365d19c 100644 --- a/fs/bcachefs/extents.h +++ b/fs/bcachefs/extents.h @@ -175,6 +175,12 @@ bch2_extent_crc_unpack(const struct bkey *k, const union bch_extent_crc *crc) #undef common_fields } +static inline bool crc_is_compressed(struct bch_extent_crc_unpacked crc) +{ + return (crc.compression_type != BCH_COMPRESSION_TYPE_none && + crc.compression_type != BCH_COMPRESSION_TYPE_incompressible); +} + /* bkey_ptrs: generically over any key type that has ptrs */ struct bkey_ptrs_c { @@ -219,6 +225,13 @@ static inline struct bkey_ptrs_c bch2_bkey_ptrs_c(struct bkey_s_c k) bkey_val_end(r), }; } + case KEY_TYPE_btree_ptr_v2: { + struct bkey_s_c_btree_ptr_v2 e = bkey_s_c_to_btree_ptr_v2(k); + return (struct bkey_ptrs_c) { + to_entry(&e.v->start[0]), + to_entry(extent_entry_last(e)) + }; + } default: return (struct bkey_ptrs_c) { NULL, NULL }; } @@ -359,6 +372,11 @@ void bch2_btree_ptr_debugcheck(struct bch_fs *, struct bkey_s_c); void bch2_btree_ptr_to_text(struct printbuf *, struct 
bch_fs *, struct bkey_s_c); +void bch2_btree_ptr_v2_to_text(struct printbuf *, struct bch_fs *, + struct bkey_s_c); +void bch2_btree_ptr_v2_compat(enum btree_id, unsigned, unsigned, + int, struct bkey_s); + #define bch2_bkey_ops_btree_ptr (struct bkey_ops) { \ .key_invalid = bch2_btree_ptr_invalid, \ .key_debugcheck = bch2_btree_ptr_debugcheck, \ @@ -366,6 +384,14 @@ void bch2_btree_ptr_to_text(struct printbuf *, struct bch_fs *, .swab = bch2_ptr_swab, \ } +#define bch2_bkey_ops_btree_ptr_v2 (struct bkey_ops) { \ + .key_invalid = bch2_btree_ptr_invalid, \ + .key_debugcheck = bch2_btree_ptr_debugcheck, \ + .val_to_text = bch2_btree_ptr_v2_to_text, \ + .swab = bch2_ptr_swab, \ + .compat = bch2_btree_ptr_v2_compat, \ +} + /* KEY_TYPE_extent: */ const char *bch2_extent_invalid(const struct bch_fs *, struct bkey_s_c); @@ -410,6 +436,7 @@ static inline bool bkey_extent_is_direct_data(const struct bkey *k) { switch (k->type) { case KEY_TYPE_btree_ptr: + case KEY_TYPE_btree_ptr_v2: case KEY_TYPE_extent: case KEY_TYPE_reflink_v: return true; @@ -483,6 +510,7 @@ static inline struct bch_devs_list bch2_bkey_cached_devs(struct bkey_s_c k) unsigned bch2_bkey_nr_ptrs(struct bkey_s_c); unsigned bch2_bkey_nr_ptrs_allocated(struct bkey_s_c); unsigned bch2_bkey_nr_ptrs_fully_allocated(struct bkey_s_c); +bool bch2_bkey_is_incompressible(struct bkey_s_c); unsigned bch2_bkey_sectors_compressed(struct bkey_s_c); bool bch2_check_range_allocated(struct bch_fs *, struct bpos, u64, unsigned); unsigned bch2_bkey_durability(struct bch_fs *, struct bkey_s_c); @@ -525,7 +553,7 @@ void bch2_bkey_ptrs_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); const char *bch2_bkey_ptrs_invalid(const struct bch_fs *, struct bkey_s_c); -void bch2_ptr_swab(const struct bkey_format *, struct bkey_packed *); +void bch2_ptr_swab(struct bkey_s); /* Generic extent code: */ diff --git a/fs/bcachefs/fs-common.c b/fs/bcachefs/fs-common.c index 96f7bbe0a3ed..878419d40992 100644 --- a/fs/bcachefs/fs-common.c +++ b/fs/bcachefs/fs-common.c @@ -19,14 +19,15 @@ int bch2_create_trans(struct btree_trans *trans, u64 dir_inum, struct posix_acl *acl) { struct bch_fs *c = trans->c; - struct btree_iter *dir_iter; + struct btree_iter *dir_iter = NULL; struct bch_hash_info hash = bch2_hash_info_init(c, new_inode); u64 now = bch2_current_time(trans->c); int ret; dir_iter = bch2_inode_peek(trans, dir_u, dir_inum, BTREE_ITER_INTENT); - if (IS_ERR(dir_iter)) - return PTR_ERR(dir_iter); + ret = PTR_ERR_OR_ZERO(dir_iter); + if (ret) + goto err; bch2_inode_init_late(new_inode, now, uid, gid, mode, rdev, dir_u); @@ -37,20 +38,20 @@ int bch2_create_trans(struct btree_trans *trans, u64 dir_inum, BLOCKDEV_INODE_MAX, 0, &c->unused_inode_hint); if (ret) - return ret; + goto err; if (default_acl) { ret = bch2_set_acl_trans(trans, new_inode, &hash, default_acl, ACL_TYPE_DEFAULT); if (ret) - return ret; + goto err; } if (acl) { ret = bch2_set_acl_trans(trans, new_inode, &hash, acl, ACL_TYPE_ACCESS); if (ret) - return ret; + goto err; } if (name) { @@ -62,48 +63,55 @@ int bch2_create_trans(struct btree_trans *trans, u64 dir_inum, ret = bch2_inode_write(trans, dir_iter, dir_u); if (ret) - return ret; + goto err; ret = bch2_dirent_create(trans, dir_inum, &dir_hash, mode_to_type(new_inode->bi_mode), name, new_inode->bi_inum, BCH_HASH_SET_MUST_CREATE); if (ret) - return ret; + goto err; } - - return 0; +err: + bch2_trans_iter_put(trans, dir_iter); + return ret; } int bch2_link_trans(struct btree_trans *trans, u64 dir_inum, u64 inum, struct bch_inode_unpacked 
*dir_u, struct bch_inode_unpacked *inode_u, const struct qstr *name) { - struct btree_iter *dir_iter, *inode_iter; + struct btree_iter *dir_iter = NULL, *inode_iter = NULL; struct bch_hash_info dir_hash; u64 now = bch2_current_time(trans->c); + int ret; inode_iter = bch2_inode_peek(trans, inode_u, inum, BTREE_ITER_INTENT); - if (IS_ERR(inode_iter)) - return PTR_ERR(inode_iter); + ret = PTR_ERR_OR_ZERO(inode_iter); + if (ret) + goto err; inode_u->bi_ctime = now; bch2_inode_nlink_inc(inode_u); dir_iter = bch2_inode_peek(trans, dir_u, dir_inum, 0); - if (IS_ERR(dir_iter)) - return PTR_ERR(dir_iter); + ret = PTR_ERR_OR_ZERO(dir_iter); + if (ret) + goto err; dir_u->bi_mtime = dir_u->bi_ctime = now; dir_hash = bch2_hash_info_init(trans->c, dir_u); - bch2_trans_iter_put(trans, dir_iter); - return bch2_dirent_create(trans, dir_inum, &dir_hash, + ret = bch2_dirent_create(trans, dir_inum, &dir_hash, mode_to_type(inode_u->bi_mode), name, inum, BCH_HASH_SET_MUST_CREATE) ?: bch2_inode_write(trans, dir_iter, dir_u) ?: bch2_inode_write(trans, inode_iter, inode_u); +err: + bch2_trans_iter_put(trans, dir_iter); + bch2_trans_iter_put(trans, inode_iter); + return ret; } int bch2_unlink_trans(struct btree_trans *trans, @@ -111,39 +119,49 @@ int bch2_unlink_trans(struct btree_trans *trans, struct bch_inode_unpacked *inode_u, const struct qstr *name) { - struct btree_iter *dir_iter, *dirent_iter, *inode_iter; + struct btree_iter *dir_iter = NULL, *dirent_iter = NULL, + *inode_iter = NULL; struct bch_hash_info dir_hash; u64 inum, now = bch2_current_time(trans->c); struct bkey_s_c k; + int ret; dir_iter = bch2_inode_peek(trans, dir_u, dir_inum, BTREE_ITER_INTENT); - if (IS_ERR(dir_iter)) - return PTR_ERR(dir_iter); + ret = PTR_ERR_OR_ZERO(dir_iter); + if (ret) + goto err; dir_hash = bch2_hash_info_init(trans->c, dir_u); dirent_iter = __bch2_dirent_lookup_trans(trans, dir_inum, &dir_hash, name, BTREE_ITER_INTENT); - if (IS_ERR(dirent_iter)) - return PTR_ERR(dirent_iter); + ret = PTR_ERR_OR_ZERO(dirent_iter); + if (ret) + goto err; k = bch2_btree_iter_peek_slot(dirent_iter); inum = le64_to_cpu(bkey_s_c_to_dirent(k).v->d_inum); inode_iter = bch2_inode_peek(trans, inode_u, inum, BTREE_ITER_INTENT); - if (IS_ERR(inode_iter)) - return PTR_ERR(inode_iter); + ret = PTR_ERR_OR_ZERO(inode_iter); + if (ret) + goto err; dir_u->bi_mtime = dir_u->bi_ctime = inode_u->bi_ctime = now; dir_u->bi_nlink -= S_ISDIR(inode_u->bi_mode); bch2_inode_nlink_dec(inode_u); - return (S_ISDIR(inode_u->bi_mode) + ret = (S_ISDIR(inode_u->bi_mode) ? 
bch2_empty_dir_trans(trans, inum) : 0) ?: bch2_dirent_delete_at(trans, &dir_hash, dirent_iter) ?: bch2_inode_write(trans, dir_iter, dir_u) ?: bch2_inode_write(trans, inode_iter, inode_u); +err: + bch2_trans_iter_put(trans, inode_iter); + bch2_trans_iter_put(trans, dirent_iter); + bch2_trans_iter_put(trans, dir_iter); + return ret; } bool bch2_reinherit_attrs(struct bch_inode_unpacked *dst_u, @@ -179,24 +197,26 @@ int bch2_rename_trans(struct btree_trans *trans, const struct qstr *dst_name, enum bch_rename_mode mode) { - struct btree_iter *src_dir_iter, *dst_dir_iter = NULL; - struct btree_iter *src_inode_iter, *dst_inode_iter = NULL; + struct btree_iter *src_dir_iter = NULL, *dst_dir_iter = NULL; + struct btree_iter *src_inode_iter = NULL, *dst_inode_iter = NULL; struct bch_hash_info src_hash, dst_hash; u64 src_inode, dst_inode, now = bch2_current_time(trans->c); int ret; src_dir_iter = bch2_inode_peek(trans, src_dir_u, src_dir, BTREE_ITER_INTENT); - if (IS_ERR(src_dir_iter)) - return PTR_ERR(src_dir_iter); + ret = PTR_ERR_OR_ZERO(src_dir_iter); + if (ret) + goto err; src_hash = bch2_hash_info_init(trans->c, src_dir_u); if (dst_dir != src_dir) { dst_dir_iter = bch2_inode_peek(trans, dst_dir_u, dst_dir, BTREE_ITER_INTENT); - if (IS_ERR(dst_dir_iter)) - return PTR_ERR(dst_dir_iter); + ret = PTR_ERR_OR_ZERO(dst_dir_iter); + if (ret) + goto err; dst_hash = bch2_hash_info_init(trans->c, dst_dir_u); } else { @@ -211,38 +231,48 @@ int bch2_rename_trans(struct btree_trans *trans, dst_name, &dst_inode, mode); if (ret) - return ret; + goto err; src_inode_iter = bch2_inode_peek(trans, src_inode_u, src_inode, BTREE_ITER_INTENT); - if (IS_ERR(src_inode_iter)) - return PTR_ERR(src_inode_iter); + ret = PTR_ERR_OR_ZERO(src_inode_iter); + if (ret) + goto err; if (dst_inode) { dst_inode_iter = bch2_inode_peek(trans, dst_inode_u, dst_inode, BTREE_ITER_INTENT); - if (IS_ERR(dst_inode_iter)) - return PTR_ERR(dst_inode_iter); + ret = PTR_ERR_OR_ZERO(dst_inode_iter); + if (ret) + goto err; } if (mode == BCH_RENAME_OVERWRITE) { if (S_ISDIR(src_inode_u->bi_mode) != - S_ISDIR(dst_inode_u->bi_mode)) - return -ENOTDIR; + S_ISDIR(dst_inode_u->bi_mode)) { + ret = -ENOTDIR; + goto err; + } if (S_ISDIR(dst_inode_u->bi_mode) && - bch2_empty_dir_trans(trans, dst_inode)) - return -ENOTEMPTY; + bch2_empty_dir_trans(trans, dst_inode)) { + ret = -ENOTEMPTY; + goto err; + } } if (bch2_reinherit_attrs(src_inode_u, dst_dir_u) && - S_ISDIR(src_inode_u->bi_mode)) - return -EXDEV; + S_ISDIR(src_inode_u->bi_mode)) { + ret = -EXDEV; + goto err; + } if (mode == BCH_RENAME_EXCHANGE && bch2_reinherit_attrs(dst_inode_u, src_dir_u) && - S_ISDIR(dst_inode_u->bi_mode)) - return -EXDEV; + S_ISDIR(dst_inode_u->bi_mode)) { + ret = -EXDEV; + goto err; + } if (S_ISDIR(src_inode_u->bi_mode)) { src_dir_u->bi_nlink--; @@ -270,7 +300,7 @@ int bch2_rename_trans(struct btree_trans *trans, if (dst_inode) dst_inode_u->bi_ctime = now; - return bch2_inode_write(trans, src_dir_iter, src_dir_u) ?: + ret = bch2_inode_write(trans, src_dir_iter, src_dir_u) ?: (src_dir != dst_dir ? bch2_inode_write(trans, dst_dir_iter, dst_dir_u) : 0 ) ?: @@ -278,4 +308,10 @@ int bch2_rename_trans(struct btree_trans *trans, (dst_inode ? 
bch2_inode_write(trans, dst_inode_iter, dst_inode_u) : 0 ); +err: + bch2_trans_iter_put(trans, dst_inode_iter); + bch2_trans_iter_put(trans, src_inode_iter); + bch2_trans_iter_put(trans, dst_dir_iter); + bch2_trans_iter_put(trans, src_dir_iter); + return ret; } diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index c0f8cd8942e4..0aa3afade4ea 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -602,7 +602,7 @@ int bch2_migrate_page(struct address_space *mapping, struct page *newpage, EBUG_ON(!PageLocked(page)); EBUG_ON(!PageLocked(newpage)); - ret = migrate_page_move_mapping(mapping, newpage, page, NULL, mode, 0); + ret = migrate_page_move_mapping(mapping, newpage, page, 0); if (ret != MIGRATEPAGE_SUCCESS) return ret; @@ -627,10 +627,10 @@ int bch2_migrate_page(struct address_space *mapping, struct page *newpage, static void bch2_readpages_end_io(struct bio *bio) { + struct bvec_iter_all iter; struct bio_vec *bv; - unsigned i; - bio_for_each_segment_all(bv, bio, i) { + bio_for_each_segment_all(bv, bio, iter) { struct page *page = bv->bv_page; if (!bio->bi_status) { @@ -782,11 +782,8 @@ static void readpage_bio_extend(struct readpages_iter *iter, if (!get_more) break; - rcu_read_lock(); - page = radix_tree_lookup(&iter->mapping->i_pages, page_offset); - rcu_read_unlock(); - - if (page && !radix_tree_exceptional_entry(page)) + page = xa_load(&iter->mapping->i_pages, page_offset); + if (page && !xa_is_value(page)) break; page = __page_cache_alloc(readahead_gfp_mask(iter->mapping)); @@ -1037,32 +1034,33 @@ static void bch2_writepage_io_done(struct closure *cl) struct bch_writepage_io, cl); struct bch_fs *c = io->op.c; struct bio *bio = &io->op.wbio.bio; + struct bvec_iter_all iter; struct bio_vec *bvec; - unsigned i, j; + unsigned i; if (io->op.error) { - bio_for_each_segment_all(bvec, bio, i) { + bio_for_each_segment_all(bvec, bio, iter) { struct bch_page_state *s; SetPageError(bvec->bv_page); - mapping_set_error(io->inode->v.i_mapping, -EIO); + mapping_set_error(bvec->bv_page->mapping, -EIO); s = __bch2_page_state(bvec->bv_page); spin_lock(&s->lock); - for (j = 0; j < PAGE_SECTORS; j++) - s->s[j].nr_replicas = 0; + for (i = 0; i < PAGE_SECTORS; i++) + s->s[i].nr_replicas = 0; spin_unlock(&s->lock); } } if (io->op.flags & BCH_WRITE_WROTE_DATA_INLINE) { - bio_for_each_segment_all(bvec, bio, i) { + bio_for_each_segment_all(bvec, bio, iter) { struct bch_page_state *s; s = __bch2_page_state(bvec->bv_page); spin_lock(&s->lock); - for (j = 0; j < PAGE_SECTORS; j++) - s->s[j].nr_replicas = 0; + for (i = 0; i < PAGE_SECTORS; i++) + s->s[i].nr_replicas = 0; spin_unlock(&s->lock); } } @@ -1086,7 +1084,7 @@ static void bch2_writepage_io_done(struct closure *cl) */ i_sectors_acct(c, io->inode, NULL, io->op.i_sectors_delta); - bio_for_each_segment_all(bvec, bio, i) { + bio_for_each_segment_all(bvec, bio, iter) { struct bch_page_state *s = __bch2_page_state(bvec->bv_page); if (atomic_dec_and_test(&s->write_count)) @@ -1240,7 +1238,7 @@ do_io: if (w->io && (w->io->op.res.nr_replicas != nr_replicas_this_write || - bio_full(&w->io->op.wbio.bio) || + bio_full(&w->io->op.wbio.bio, PAGE_SIZE) || w->io->op.wbio.bio.bi_iter.bi_size >= (256U << 20) || bio_end_sector(&w->io->op.wbio.bio) != sector)) bch2_writepage_do_io(w); @@ -1805,10 +1803,11 @@ static long bch2_dio_write_loop(struct dio_write *dio) struct address_space *mapping = req->ki_filp->f_mapping; struct bch_inode_info *inode = file_bch_inode(req->ki_filp); struct bio *bio = &dio->op.wbio.bio; + struct bvec_iter_all iter; struct bio_vec *bv; 
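/*
 * Editor's sketch (not part of this patch): the fs-common.c hunks above all
 * apply one error-path pattern. Iterators start out NULL, each early
 * "return ret" becomes "goto err", and a single exit label puts every
 * iterator; the patch relies on bch2_trans_iter_put() accepting NULL and
 * error-pointer values. In miniature, with a hypothetical helper name:
 */
static int inode_op_sketch(struct btree_trans *trans, u64 inum,
			   struct bch_inode_unpacked *u)
{
	struct btree_iter *iter = NULL;
	int ret;

	iter = bch2_inode_peek(trans, u, inum, BTREE_ITER_INTENT);
	ret = PTR_ERR_OR_ZERO(iter);	/* folds IS_ERR()/PTR_ERR() into one */
	if (ret)
		goto err;

	/* ... use the iterator, goto err on any failure ... */
err:
	bch2_trans_iter_put(trans, iter);
	return ret;
}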
- unsigned i, unaligned; + unsigned unaligned; u64 new_i_size; - bool sync; + bool sync = dio->sync; long ret; if (dio->loop) @@ -1838,7 +1837,7 @@ static long bch2_dio_write_loop(struct dio_write *dio) * bio_iov_iter_get_pages was only able to get < * blocksize worth of pages: */ - bio_for_each_segment_all(bv, bio, i) + bio_for_each_segment_all(bv, bio, iter) put_page(bv->bv_page); ret = -EFAULT; goto err; @@ -1856,7 +1855,7 @@ static long bch2_dio_write_loop(struct dio_write *dio) iov = kmalloc(dio->iter.nr_segs * sizeof(*iov), GFP_KERNEL); if (unlikely(!iov)) { - dio->sync = true; + dio->sync = sync = true; goto do_io; } @@ -1870,7 +1869,7 @@ do_io: dio->loop = true; closure_call(&dio->op.cl, bch2_write, NULL, NULL); - if (dio->sync) + if (sync) wait_for_completion(&dio->done); else return -EIOCBQUEUED; @@ -1886,7 +1885,7 @@ loop: i_size_write(&inode->v, new_i_size); spin_unlock(&inode->v.i_lock); - bio_for_each_segment_all(bv, bio, i) + bio_for_each_segment_all(bv, bio, iter) put_page(bv->bv_page); if (!dio->iter.count || dio->op.error) break; @@ -1904,7 +1903,6 @@ err: if (dio->free_iov) kfree(dio->iter.iov); - sync = dio->sync; bio_put(bio); /* inode->i_dio_count is our ref on inode and thus bch_fs */ @@ -2514,10 +2512,8 @@ reassemble: bkey_on_stack_reassemble(©, c, k); if (insert && - bkey_cmp(bkey_start_pos(k.k), move_pos) < 0) { + bkey_cmp(bkey_start_pos(k.k), move_pos) < 0) bch2_cut_front(move_pos, copy.k); - bch2_btree_iter_set_pos(src, bkey_start_pos(©.k->k)); - } copy.k->k.p.offset += shift >> 9; bch2_btree_iter_set_pos(dst, bkey_start_pos(©.k->k)); @@ -2537,8 +2533,9 @@ reassemble: } bkey_init(&delete.k); - delete.k.p = src->pos; - bch2_key_resize(&delete.k, copy.k->k.size); + delete.k.p = copy.k->k.p; + delete.k.size = copy.k->k.size; + delete.k.p.offset -= shift >> 9; next_pos = insert ? 
bkey_start_pos(&delete.k) : delete.k.p; @@ -2559,6 +2556,8 @@ reassemble: BUG_ON(ret); } + bch2_btree_iter_set_pos(src, bkey_start_pos(&delete.k)); + ret = bch2_trans_update(&trans, src, &delete, trigger_flags) ?: bch2_trans_update(&trans, dst, copy.k, trigger_flags) ?: bch2_trans_commit(&trans, &disk_res, @@ -2649,7 +2648,7 @@ static long bchfs_fallocate(struct bch_inode_info *inode, int mode, struct bkey_i_reservation reservation; struct bkey_s_c k; - bch2_trans_reset(&trans, TRANS_RESET_MEM); + bch2_trans_begin(&trans); k = bch2_btree_iter_peek_slot(iter); if ((ret = bkey_err(k))) @@ -2823,235 +2822,6 @@ static void mark_range_unallocated(struct bch_inode_info *inode, } while (index <= end_index); } -static int generic_access_check_limits(struct file *file, loff_t pos, - loff_t *count) -{ - struct inode *inode = file->f_mapping->host; - loff_t max_size = inode->i_sb->s_maxbytes; - - if (!(file->f_flags & O_LARGEFILE)) - max_size = MAX_NON_LFS; - - if (unlikely(pos >= max_size)) - return -EFBIG; - *count = min(*count, max_size - pos); - return 0; -} - -static int generic_write_check_limits(struct file *file, loff_t pos, - loff_t *count) -{ - loff_t limit = rlimit(RLIMIT_FSIZE); - - if (limit != RLIM_INFINITY) { - if (pos >= limit) { - send_sig(SIGXFSZ, current, 0); - return -EFBIG; - } - *count = min(*count, limit - pos); - } - - return generic_access_check_limits(file, pos, count); -} - -static int generic_remap_checks(struct file *file_in, loff_t pos_in, - struct file *file_out, loff_t pos_out, - loff_t *req_count, unsigned int remap_flags) -{ - struct inode *inode_in = file_in->f_mapping->host; - struct inode *inode_out = file_out->f_mapping->host; - uint64_t count = *req_count; - uint64_t bcount; - loff_t size_in, size_out; - loff_t bs = inode_out->i_sb->s_blocksize; - int ret; - - /* The start of both ranges must be aligned to an fs block. */ - if (!IS_ALIGNED(pos_in, bs) || !IS_ALIGNED(pos_out, bs)) - return -EINVAL; - - /* Ensure offsets don't wrap. */ - if (pos_in + count < pos_in || pos_out + count < pos_out) - return -EINVAL; - - size_in = i_size_read(inode_in); - size_out = i_size_read(inode_out); - - /* Dedupe requires both ranges to be within EOF. */ - if ((remap_flags & REMAP_FILE_DEDUP) && - (pos_in >= size_in || pos_in + count > size_in || - pos_out >= size_out || pos_out + count > size_out)) - return -EINVAL; - - /* Ensure the infile range is within the infile. */ - if (pos_in >= size_in) - return -EINVAL; - count = min(count, size_in - (uint64_t)pos_in); - - ret = generic_access_check_limits(file_in, pos_in, &count); - if (ret) - return ret; - - ret = generic_write_check_limits(file_out, pos_out, &count); - if (ret) - return ret; - - /* - * If the user wanted us to link to the infile's EOF, round up to the - * next block boundary for this check. - * - * Otherwise, make sure the count is also block-aligned, having - * already confirmed the starting offsets' block alignment. - */ - if (pos_in + count == size_in) { - bcount = ALIGN(size_in, bs) - pos_in; - } else { - if (!IS_ALIGNED(count, bs)) - count = ALIGN_DOWN(count, bs); - bcount = count; - } - - /* Don't allow overlapped cloning within the same file. */ - if (inode_in == inode_out && - pos_out + bcount > pos_in && - pos_out < pos_in + bcount) - return -EINVAL; - - /* - * We shortened the request but the caller can't deal with that, so - * bounce the request back to userspace. 
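/*
 * Editor's note (not part of this patch): generic_access_check_limits(),
 * generic_write_check_limits(), generic_remap_checks(), and, continuing
 * below, generic_remap_check_len() and generic_remap_file_range_prep()
 * are private copies of VFS helpers carried for older kernels, deleted
 * here. Assuming a kernel that provides them, bch2_remap_file_range()
 * can call the stock prep helper directly, roughly:
 */
	ret = generic_remap_file_range_prep(file_src, pos_src,
					    file_dst, pos_dst,
					    &len, remap_flags);
	if (ret < 0 || len == 0)
		return ret;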
- */ - if (*req_count != count && !(remap_flags & REMAP_FILE_CAN_SHORTEN)) - return -EINVAL; - - *req_count = count; - return 0; -} - -static int generic_remap_check_len(struct inode *inode_in, - struct inode *inode_out, - loff_t pos_out, - loff_t *len, - unsigned int remap_flags) -{ - u64 blkmask = i_blocksize(inode_in) - 1; - loff_t new_len = *len; - - if ((*len & blkmask) == 0) - return 0; - - if ((remap_flags & REMAP_FILE_DEDUP) || - pos_out + *len < i_size_read(inode_out)) - new_len &= ~blkmask; - - if (new_len == *len) - return 0; - - if (remap_flags & REMAP_FILE_CAN_SHORTEN) { - *len = new_len; - return 0; - } - - return (remap_flags & REMAP_FILE_DEDUP) ? -EBADE : -EINVAL; -} - -static int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in, - struct file *file_out, loff_t pos_out, - loff_t *len, unsigned int remap_flags) -{ - struct inode *inode_in = file_inode(file_in); - struct inode *inode_out = file_inode(file_out); - bool same_inode = (inode_in == inode_out); - int ret; - - /* Don't touch certain kinds of inodes */ - if (IS_IMMUTABLE(inode_out)) - return -EPERM; - - if (IS_SWAPFILE(inode_in) || IS_SWAPFILE(inode_out)) - return -ETXTBSY; - - /* Don't reflink dirs, pipes, sockets... */ - if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode)) - return -EISDIR; - if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode)) - return -EINVAL; - - /* Zero length dedupe exits immediately; reflink goes to EOF. */ - if (*len == 0) { - loff_t isize = i_size_read(inode_in); - - if ((remap_flags & REMAP_FILE_DEDUP) || pos_in == isize) - return 0; - if (pos_in > isize) - return -EINVAL; - *len = isize - pos_in; - if (*len == 0) - return 0; - } - - /* Check that we don't violate system file offset limits. */ - ret = generic_remap_checks(file_in, pos_in, file_out, pos_out, len, - remap_flags); - if (ret) - return ret; - - /* Wait for the completion of any pending IOs on both files */ - inode_dio_wait(inode_in); - if (!same_inode) - inode_dio_wait(inode_out); - - ret = filemap_write_and_wait_range(inode_in->i_mapping, - pos_in, pos_in + *len - 1); - if (ret) - return ret; - - ret = filemap_write_and_wait_range(inode_out->i_mapping, - pos_out, pos_out + *len - 1); - if (ret) - return ret; - - /* - * Check that the extents are the same. - */ - if (remap_flags & REMAP_FILE_DEDUP) { - bool is_same = false; - - ret = vfs_dedupe_file_range_compare(inode_in, pos_in, - inode_out, pos_out, *len, &is_same); - if (ret) - return ret; - if (!is_same) - return -EBADE; - } - - ret = generic_remap_check_len(inode_in, inode_out, pos_out, len, - remap_flags); - if (ret) - return ret; - - /* If can't alter the file contents, we're done. */ - if (!(remap_flags & REMAP_FILE_DEDUP)) { - /* Update the timestamps, since we can alter file contents. */ - if (!(file_out->f_mode & FMODE_NOCMTIME)) { - ret = file_update_time(file_out); - if (ret) - return ret; - } - - /* - * Clear the security bits if the process is not being run by - * root. This keeps people from modifying setuid and setgid - * binaries. 
- */ - ret = file_remove_privs(file_out); - if (ret) - return ret; - } - - return 0; -} - loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src, struct file *file_dst, loff_t pos_dst, loff_t len, unsigned remap_flags) @@ -3244,7 +3014,7 @@ static loff_t page_hole_offset(struct address_space *mapping, loff_t offset) loff_t ret = -1; page = find_lock_entry(mapping, index); - if (!page || radix_tree_exception(page)) + if (!page || xa_is_value(page)) return offset; pg_offset = __page_hole_offset(page, offset & (PAGE_SIZE - 1)); diff --git a/fs/bcachefs/fs-io.h b/fs/bcachefs/fs-io.h index 1b593ea707d5..7063556d289b 100644 --- a/fs/bcachefs/fs-io.h +++ b/fs/bcachefs/fs-io.h @@ -35,10 +35,6 @@ int bch2_fsync(struct file *, loff_t, loff_t, int); int bch2_truncate(struct bch_inode_info *, struct iattr *); long bch2_fallocate_dispatch(struct file *, int, loff_t, loff_t); -#define REMAP_FILE_ADVISORY (0) -#define REMAP_FILE_DEDUP (1 << 0) -#define REMAP_FILE_CAN_SHORTEN (1 << 1) - loff_t bch2_remap_file_range(struct file *, loff_t, struct file *, loff_t, loff_t, unsigned); diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 13b8bbcdb694..1c89a1b2c2d0 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -142,8 +142,6 @@ retry: &inode->ei_journal_seq, BTREE_INSERT_NOUNLOCK| BTREE_INSERT_NOFAIL); - if (ret == -EINTR) - goto retry; /* * the btree node lock protects inode->ei_inode, not ei_update_lock; @@ -152,6 +150,11 @@ retry: if (!ret) bch2_inode_update_after_write(c, inode, &inode_u, fields); + bch2_trans_iter_put(&trans, iter); + + if (ret == -EINTR) + goto retry; + bch2_trans_exit(&trans); return ret < 0 ? ret : 0; } @@ -963,15 +966,6 @@ static int bch2_vfs_readdir(struct file *file, struct dir_context *ctx) return bch2_readdir(c, inode->v.i_ino, ctx); } -static int bch2_clone_file_range(struct file *file_src, loff_t pos_src, - struct file *file_dst, loff_t pos_dst, - u64 len) -{ - return bch2_remap_file_range(file_src, pos_src, - file_dst, pos_dst, - len, 0); -} - static const struct file_operations bch_file_operations = { .llseek = bch2_llseek, .read_iter = bch2_read_iter, @@ -989,7 +983,7 @@ static const struct file_operations bch_file_operations = { #ifdef CONFIG_COMPAT .compat_ioctl = bch2_compat_fs_ioctl, #endif - .clone_file_range = bch2_clone_file_range, + .remap_file_range = bch2_remap_file_range, }; static const struct inode_operations bch_file_inode_operations = { @@ -1520,7 +1514,7 @@ static struct dentry *bch2_mount(struct file_system_type *fs_type, sb->s_bdi->congested_fn = bch2_congested; sb->s_bdi->congested_data = c; - sb->s_bdi->ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_SIZE; + sb->s_bdi->ra_pages = VM_READAHEAD_PAGES; for_each_online_member(ca, c, i) { struct block_device *bdev = ca->disk_sb.bdev; diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 9ef532d875e8..3ab621c62c43 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" +#include "bkey_on_stack.h" #include "btree_update.h" #include "dirent.h" #include "error.h" @@ -81,7 +82,6 @@ static int remove_dirent(struct btree_trans *trans, return __bch2_trans_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL| BTREE_INSERT_LAZY_RW, - TRANS_RESET_MEM, __remove_dirent(trans, dirent)); } @@ -182,8 +182,6 @@ static int hash_redo_key(const struct bch_hash_desc desc, struct bkey_i delete; struct bkey_i *tmp; - bch2_trans_reset(trans, TRANS_RESET_MEM); - tmp = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); if (IS_ERR(tmp)) return PTR_ERR(tmp); @@ 
-194,11 +192,8 @@ static int hash_redo_key(const struct bch_hash_desc desc, delete.k.p = k_iter->pos; bch2_trans_update(trans, k_iter, &delete, 0); - return bch2_hash_set(trans, desc, &h->info, k_iter->pos.inode, - tmp, BCH_HASH_SET_MUST_CREATE) ?: - bch2_trans_commit(trans, NULL, NULL, - BTREE_INSERT_NOFAIL| - BTREE_INSERT_LAZY_RW); + return bch2_hash_set(trans, desc, &h->info, k_iter->pos.inode, + tmp, BCH_HASH_SET_MUST_CREATE); } static int fsck_hash_delete_at(struct btree_trans *trans, @@ -320,10 +315,9 @@ static int hash_check_key(struct btree_trans *trans, desc.btree_id, k.k->p.offset, hashed, h->chain->pos.offset, (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf))) { - do { - ret = hash_redo_key(desc, trans, h, k_iter, k, hashed); - } while (ret == -EINTR); - + ret = __bch2_trans_do(trans, NULL, NULL, + BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW, + hash_redo_key(desc, trans, h, k_iter, k, hashed)); if (ret) { bch_err(c, "hash_redo_key err %i", ret); return ret; @@ -387,7 +381,6 @@ static int check_dirent_hash(struct btree_trans *trans, struct hash_check *h, ret = __bch2_trans_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL| BTREE_INSERT_LAZY_RW, - TRANS_RESET_MEM, (bch2_trans_update(trans, iter, &d->k_i, 0), 0)); if (ret) goto err; @@ -410,11 +403,10 @@ err_redo: k->k->p.offset, hash, h->chain->pos.offset, (bch2_bkey_val_to_text(&PBUF(buf), c, *k), buf))) { - do { - ret = hash_redo_key(bch2_dirent_hash_desc, trans, - h, iter, *k, hash); - } while (ret == -EINTR); - + ret = __bch2_trans_do(trans, NULL, NULL, + BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW, + hash_redo_key(bch2_dirent_hash_desc, trans, + h, iter, *k, hash)); if (ret) bch_err(c, "hash_redo_key err %i", ret); else @@ -431,6 +423,42 @@ static int bch2_inode_truncate(struct bch_fs *c, u64 inode_nr, u64 new_size) POS(inode_nr + 1, 0), NULL); } +static int bch2_fix_overlapping_extent(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_s_c k, struct bpos cut_at) +{ + struct btree_iter *u_iter; + struct bkey_i *u; + int ret; + + u = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); + ret = PTR_ERR_OR_ZERO(u); + if (ret) + return ret; + + bkey_reassemble(u, k); + bch2_cut_front(cut_at, u); + + u_iter = bch2_trans_copy_iter(trans, iter); + ret = PTR_ERR_OR_ZERO(u_iter); + if (ret) + return ret; + + /* + * We don't want to go through the + * extent_handle_overwrites path: + */ + __bch2_btree_iter_set_pos(u_iter, u->k.p, false); + + /* + * XXX: this is going to leave disk space + * accounting slightly wrong + */ + ret = bch2_trans_update(trans, u_iter, u, 0); + bch2_trans_iter_put(trans, u_iter); + return ret; +} + /* * Walk extents: verify that extents have a corresponding S_ISREG inode, and * that i_size an i_sectors are consistent @@ -442,17 +470,40 @@ static int check_extents(struct bch_fs *c) struct btree_trans trans; struct btree_iter *iter; struct bkey_s_c k; + struct bkey_on_stack prev; u64 i_sectors; int ret = 0; + bkey_on_stack_init(&prev); + prev.k->k = KEY(0, 0, 0); bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); bch_verbose(c, "checking extents"); iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, - POS(BCACHEFS_ROOT_INO, 0), 0); + POS(BCACHEFS_ROOT_INO, 0), + BTREE_ITER_INTENT); retry: for_each_btree_key_continue(iter, 0, k, ret) { + if (bkey_cmp(prev.k->k.p, bkey_start_pos(k.k)) > 0) { + char buf1[200]; + char buf2[200]; + + bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(prev.k)); + bch2_bkey_val_to_text(&PBUF(buf2), c, k); + + if (fsck_err(c, "overlapping extents:\n%s\n%s", buf1, buf2)) { + ret = 
__bch2_trans_do(&trans, NULL, NULL, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_LAZY_RW, + bch2_fix_overlapping_extent(&trans, + iter, k, prev.k->k.p)); + if (ret) + goto err; + } + } + bkey_on_stack_reassemble(&prev, c, k); + ret = walk_inode(&trans, &w, k.k->p.inode); if (ret) break; @@ -477,7 +528,8 @@ retry: !(w.inode.bi_flags & BCH_INODE_I_SECTORS_DIRTY) && w.inode.bi_sectors != (i_sectors = bch2_count_inode_sectors(&trans, w.cur_inum)), - c, "i_sectors wrong: got %llu, should be %llu", + c, "inode %llu has incorrect i_sectors: got %llu, should be %llu", + w.inode.bi_inum, w.inode.bi_sectors, i_sectors)) { struct bkey_inode_buf p; @@ -519,6 +571,7 @@ err: fsck_err: if (ret == -EINTR) goto retry; + bkey_on_stack_exit(&prev, c); return bch2_trans_exit(&trans) ?: ret; } @@ -660,7 +713,6 @@ retry: ret = __bch2_trans_do(&trans, NULL, NULL, BTREE_INSERT_NOFAIL| BTREE_INSERT_LAZY_RW, - TRANS_RESET_MEM, (bch2_trans_update(&trans, iter, &n->k_i, 0), 0)); kfree(n); if (ret) @@ -986,12 +1038,12 @@ retry: if (!ret) continue; - if (fsck_err_on(!inode_bitmap_test(&dirs_done, k.k->p.inode), c, + if (fsck_err_on(!inode_bitmap_test(&dirs_done, k.k->p.offset), c, "unreachable directory found (inum %llu)", - k.k->p.inode)) { + k.k->p.offset)) { bch2_trans_unlock(&trans); - ret = reattach_inode(c, lostfound_inode, k.k->p.inode); + ret = reattach_inode(c, lostfound_inode, k.k->p.offset); if (ret) { goto err; } @@ -1275,7 +1327,6 @@ static int check_inode(struct btree_trans *trans, ret = __bch2_trans_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL| BTREE_INSERT_LAZY_RW, - TRANS_RESET_MEM, (bch2_trans_update(trans, iter, &p.inode.k_i, 0), 0)); if (ret) bch_err(c, "error in fsck: error %i " @@ -1302,18 +1353,18 @@ static int bch2_gc_walk_inodes(struct bch_fs *c, bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); iter = bch2_trans_get_iter(&trans, BTREE_ID_INODES, - POS(range_start, 0), 0); + POS(0, range_start), 0); nlinks_iter = genradix_iter_init(links, 0); while ((k = bch2_btree_iter_peek(iter)).k && !(ret2 = bkey_err(k))) { peek_nlinks: link = genradix_iter_peek(&nlinks_iter, links); - if (!link && (!k.k || iter->pos.inode >= range_end)) + if (!link && (!k.k || iter->pos.offset >= range_end)) break; nlinks_pos = range_start + nlinks_iter.pos; - if (iter->pos.inode > nlinks_pos) { + if (iter->pos.offset > nlinks_pos) { /* Should have been caught by dirents pass: */ need_fsck_err_on(link && link->count, c, "missing inode %llu (nlink %u)", @@ -1322,7 +1373,7 @@ peek_nlinks: link = genradix_iter_peek(&nlinks_iter, links); goto peek_nlinks; } - if (iter->pos.inode < nlinks_pos || !link) + if (iter->pos.offset < nlinks_pos || !link) link = &zero_links; if (k.k && k.k->type == KEY_TYPE_inode) { @@ -1338,7 +1389,7 @@ peek_nlinks: link = genradix_iter_peek(&nlinks_iter, links); nlinks_pos, link->count); } - if (nlinks_pos == iter->pos.inode) + if (nlinks_pos == iter->pos.offset) genradix_iter_advance(&nlinks_iter, links); bch2_btree_iter_next(iter); diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index e811b98d0f03..7d20f082ad45 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -98,7 +98,7 @@ void bch2_inode_pack(struct bkey_inode_buf *packed, unsigned bytes; bkey_inode_init(&packed->inode.k_i); - packed->inode.k.p.inode = inode->bi_inum; + packed->inode.k.p.offset = inode->bi_inum; packed->inode.v.bi_hash_seed = inode->bi_hash_seed; packed->inode.v.bi_flags = cpu_to_le32(inode->bi_flags); packed->inode.v.bi_mode = cpu_to_le16(inode->bi_mode); @@ -149,7 +149,7 @@ int bch2_inode_unpack(struct bkey_s_c_inode 
inode, unsigned fieldnr = 0, field_bits; int ret; - unpacked->bi_inum = inode.k->p.inode; + unpacked->bi_inum = inode.k->p.offset; unpacked->bi_hash_seed = inode.v->bi_hash_seed; unpacked->bi_flags = le32_to_cpu(inode.v->bi_flags); unpacked->bi_mode = le16_to_cpu(inode.v->bi_mode); @@ -188,7 +188,7 @@ struct btree_iter *bch2_inode_peek(struct btree_trans *trans, struct bkey_s_c k; int ret; - iter = bch2_trans_get_iter(trans, BTREE_ID_INODES, POS(inum, 0), + iter = bch2_trans_get_iter(trans, BTREE_ID_INODES, POS(0, inum), BTREE_ITER_SLOTS|flags); if (IS_ERR(iter)) return iter; @@ -232,13 +232,13 @@ const char *bch2_inode_invalid(const struct bch_fs *c, struct bkey_s_c k) struct bkey_s_c_inode inode = bkey_s_c_to_inode(k); struct bch_inode_unpacked unpacked; - if (k.k->p.offset) - return "nonzero offset"; + if (k.k->p.inode) + return "nonzero k.p.inode"; if (bkey_val_bytes(k.k) < sizeof(struct bch_inode)) return "incorrect value size"; - if (k.k->p.inode < BLOCKDEV_INODE_MAX) + if (k.k->p.offset < BLOCKDEV_INODE_MAX) return "fs inode in blockdev range"; if (INODE_STR_HASH(inode.v) >= BCH_STR_HASH_NR) @@ -280,8 +280,8 @@ void bch2_inode_to_text(struct printbuf *out, struct bch_fs *c, const char *bch2_inode_generation_invalid(const struct bch_fs *c, struct bkey_s_c k) { - if (k.k->p.offset) - return "nonzero offset"; + if (k.k->p.inode) + return "nonzero k.p.inode"; if (bkey_val_bytes(k.k) != sizeof(struct bch_inode_generation)) return "incorrect value size"; @@ -362,16 +362,16 @@ int bch2_inode_create(struct btree_trans *trans, struct bch_inode_unpacked *inode_u, u64 min, u64 max, u64 *hint) { - struct bch_fs *c = trans->c; struct bkey_inode_buf *inode_p; - struct btree_iter *iter; + struct btree_iter *iter = NULL; + struct bkey_s_c k; u64 start; int ret; if (!max) max = ULLONG_MAX; - if (c->opts.inodes_32bit) + if (trans->c->opts.inodes_32bit) max = min_t(u64, max, U32_MAX); start = READ_ONCE(*hint); @@ -382,48 +382,37 @@ int bch2_inode_create(struct btree_trans *trans, inode_p = bch2_trans_kmalloc(trans, sizeof(*inode_p)); if (IS_ERR(inode_p)) return PTR_ERR(inode_p); - - iter = bch2_trans_get_iter(trans, - BTREE_ID_INODES, POS(start, 0), - BTREE_ITER_SLOTS|BTREE_ITER_INTENT); - if (IS_ERR(iter)) - return PTR_ERR(iter); again: - while (1) { - struct bkey_s_c k = bch2_btree_iter_peek_slot(iter); - - ret = bkey_err(k); - if (ret) - return ret; + for_each_btree_key(trans, iter, BTREE_ID_INODES, POS(0, start), + BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) { + if (bkey_cmp(iter->pos, POS(0, max)) > 0) + break; - switch (k.k->type) { - case KEY_TYPE_inode: - /* slot used */ - if (iter->pos.inode >= max) - goto out; + if (k.k->type != KEY_TYPE_inode) + goto found_slot; + } - bch2_btree_iter_next_slot(iter); - break; + bch2_trans_iter_put(trans, iter); - default: - *hint = k.k->p.inode; - inode_u->bi_inum = k.k->p.inode; - inode_u->bi_generation = bkey_generation(k); + if (ret) + return ret; - bch2_inode_pack(inode_p, inode_u); - bch2_trans_update(trans, iter, &inode_p->inode.k_i, 0); - return 0; - } - } -out: if (start != min) { /* Retry from start */ start = min; - bch2_btree_iter_set_pos(iter, POS(start, 0)); goto again; } return -ENOSPC; +found_slot: + *hint = k.k->p.offset; + inode_u->bi_inum = k.k->p.offset; + inode_u->bi_generation = bkey_generation(k); + + bch2_inode_pack(inode_p, inode_u); + bch2_trans_update(trans, iter, &inode_p->inode.k_i, 0); + bch2_trans_iter_put(trans, iter); + return 0; } int bch2_inode_rm(struct bch_fs *c, u64 inode_nr) @@ -454,7 +443,7 @@ int bch2_inode_rm(struct 
bch_fs *c, u64 inode_nr) bch2_trans_init(&trans, c, 0, 0); - iter = bch2_trans_get_iter(&trans, BTREE_ID_INODES, POS(inode_nr, 0), + iter = bch2_trans_get_iter(&trans, BTREE_ID_INODES, POS(0, inode_nr), BTREE_ITER_SLOTS|BTREE_ITER_INTENT); do { struct bkey_s_c k = bch2_btree_iter_peek_slot(iter); @@ -486,10 +475,10 @@ int bch2_inode_rm(struct bch_fs *c, u64 inode_nr) if (!bi_generation) { bkey_init(&delete.k); - delete.k.p.inode = inode_nr; + delete.k.p.offset = inode_nr; } else { bkey_inode_generation_init(&delete.k_i); - delete.k.p.inode = inode_nr; + delete.k.p.offset = inode_nr; delete.v.bi_generation = cpu_to_le32(bi_generation); } @@ -511,21 +500,20 @@ int bch2_inode_find_by_inum_trans(struct btree_trans *trans, u64 inode_nr, int ret; iter = bch2_trans_get_iter(trans, BTREE_ID_INODES, - POS(inode_nr, 0), BTREE_ITER_SLOTS); + POS(0, inode_nr), BTREE_ITER_SLOTS); if (IS_ERR(iter)) return PTR_ERR(iter); k = bch2_btree_iter_peek_slot(iter); ret = bkey_err(k); if (ret) - return ret; + goto err; ret = k.k->type == KEY_TYPE_inode ? bch2_inode_unpack(bkey_s_c_to_inode(k), inode) : -ENOENT; - +err: bch2_trans_iter_put(trans, iter); - return ret; } diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index f2a2c45a02ad..19059702428a 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -124,10 +124,10 @@ void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw) void bch2_bio_free_pages_pool(struct bch_fs *c, struct bio *bio) { + struct bvec_iter_all iter; struct bio_vec *bv; - unsigned i; - bio_for_each_segment_all(bv, bio, i) + bio_for_each_segment_all(bv, bio, iter) if (bv->bv_page != ZERO_PAGE(0)) mempool_free(bv->bv_page, &c->bio_bounce_pages); bio->bi_vcnt = 0; @@ -325,7 +325,7 @@ int bch2_fpunch_at(struct btree_trans *trans, struct btree_iter *iter, bch2_disk_reservation_init(c, 0); struct bkey_i delete; - bch2_trans_reset(trans, TRANS_RESET_MEM); + bch2_trans_begin(trans); ret = bkey_err(k); if (ret) @@ -399,7 +399,7 @@ int bch2_write_index_default(struct bch_write_op *op) BTREE_ITER_SLOTS|BTREE_ITER_INTENT); do { - bch2_trans_reset(&trans, TRANS_RESET_MEM); + bch2_trans_begin(&trans); k = bch2_keylist_front(keys); @@ -546,9 +546,14 @@ static void __bch2_write_index(struct bch_write_op *op) * particularly want to plumb io_opts all the way through the btree * update stack right now */ - for_each_keylist_key(keys, k) + for_each_keylist_key(keys, k) { bch2_rebalance_add_key(c, bkey_i_to_s_c(k), &op->opts); + if (bch2_bkey_is_incompressible(bkey_i_to_s_c(k))) + bch2_check_set_feature(op->c, BCH_FEATURE_incompressible); + + } + if (!bch2_keylist_empty(keys)) { u64 sectors_start = keylist_sectors(keys); int ret = op->index_update_fn(op); @@ -784,8 +789,9 @@ static enum prep_encoded_ret { /* Can we just write the entire extent as is? 
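/*
 * Editor's note (not part of this patch): the compression checks in this
 * file now go through crc_is_compressed() from the extents.h hunk earlier,
 * i.e. compression_type is neither BCH_COMPRESSION_TYPE_none nor the new
 * BCH_COMPRESSION_TYPE_incompressible. "Incompressible" records that
 * compression was already attempted and the payload is stored as-is, so
 * such extents must not be treated as compressed data. On the write path
 * below, this shows up as a three-way choice when filling in the crc:
 */
	crc.compression_type = op->incompressible
		? BCH_COMPRESSION_TYPE_incompressible
		: op->compression_type
		? bch2_bio_compress(c, dst, &dst_len, src, &src_len,
				    op->compression_type)
		: 0;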
*/ if (op->crc.uncompressed_size == op->crc.live_size && op->crc.compressed_size <= wp->sectors_free && - op->crc.compression_type == op->compression_type) { - if (!op->crc.compression_type && + (op->crc.compression_type == op->compression_type || + op->incompressible)) { + if (!crc_is_compressed(op->crc) && op->csum_type != op->crc.csum_type && bch2_write_rechecksum(c, op, op->csum_type)) return PREP_ENCODED_CHECKSUM_ERR; @@ -797,7 +803,7 @@ static enum prep_encoded_ret { * If the data is compressed and we couldn't write the entire extent as * is, we have to decompress it: */ - if (op->crc.compression_type) { + if (crc_is_compressed(op->crc)) { struct bch_csum csum; if (bch2_write_decrypt(op)) @@ -864,6 +870,7 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp, ret = -EIO; goto err; case PREP_ENCODED_CHECKSUM_ERR: + BUG(); goto csum_err; case PREP_ENCODED_DO_WRITE: /* XXX look for bug here */ @@ -908,11 +915,13 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp, bch2_csum_type_is_encryption(op->crc.csum_type)); BUG_ON(op->compression_type && !bounce); - crc.compression_type = op->compression_type - ? bch2_bio_compress(c, dst, &dst_len, src, &src_len, - op->compression_type) + crc.compression_type = op->incompressible + ? BCH_COMPRESSION_TYPE_incompressible + : op->compression_type + ? bch2_bio_compress(c, dst, &dst_len, src, &src_len, + op->compression_type) : 0; - if (!crc.compression_type) { + if (!crc_is_compressed(crc)) { dst_len = min(dst->bi_iter.bi_size, src->bi_iter.bi_size); dst_len = min_t(unsigned, dst_len, wp->sectors_free << 9); @@ -933,7 +942,7 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp, if (bch2_csum_type_is_encryption(op->csum_type)) { if (bversion_zero(version)) { - version.lo = atomic64_inc_return(&c->key_version) + 1; + version.lo = atomic64_inc_return(&c->key_version); } else { crc.nonce = op->nonce; op->nonce += src_len >> 9; @@ -941,7 +950,7 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp, } if ((op->flags & BCH_WRITE_DATA_ENCODED) && - !crc.compression_type && + !crc_is_compressed(crc) && bch2_csum_type_is_encryption(op->crc.csum_type) == bch2_csum_type_is_encryption(op->csum_type)) { /* @@ -1060,6 +1069,12 @@ again: BKEY_EXTENT_U64s_MAX)) goto flush_io; + if ((op->flags & BCH_WRITE_FROM_INTERNAL) && + percpu_ref_is_dying(&c->writes)) { + ret = -EROFS; + goto err; + } + wp = bch2_alloc_sectors_start(c, op->target, op->opts.erasure_code, @@ -1212,7 +1227,8 @@ void bch2_write(struct closure *cl) if (c->opts.nochanges || !percpu_ref_tryget(&c->writes)) { - __bcache_io_error(c, "read only"); + if (!(op->flags & BCH_WRITE_FROM_INTERNAL)) + __bcache_io_error(c, "read only"); op->error = -EROFS; goto err; } @@ -1338,6 +1354,7 @@ static void promote_start(struct promote_op *op, struct bch_read_bio *rbio) static struct promote_op *__promote_alloc(struct bch_fs *c, enum btree_id btree_id, + struct bkey_s_c k, struct bpos pos, struct extent_ptr_decoded *pick, struct bch_io_opts opts, @@ -1394,8 +1411,7 @@ static struct promote_op *__promote_alloc(struct bch_fs *c, (struct data_opts) { .target = opts.promote_target }, - btree_id, - bkey_s_c_null); + btree_id, k); BUG_ON(ret); return op; @@ -1437,7 +1453,7 @@ static struct promote_op *promote_alloc(struct bch_fs *c, k.k->type == KEY_TYPE_reflink_v ? 
BTREE_ID_REFLINK : BTREE_ID_EXTENTS, - pos, pick, opts, sectors, rbio); + k, pos, pick, opts, sectors, rbio); if (!promote) return NULL; @@ -1690,33 +1706,39 @@ static void bch2_rbio_error(struct bch_read_bio *rbio, int retry, } } -static void bch2_rbio_narrow_crcs(struct bch_read_bio *rbio) +static int __bch2_rbio_narrow_crcs(struct btree_trans *trans, + struct bch_read_bio *rbio) { struct bch_fs *c = rbio->c; - struct btree_trans trans; - struct btree_iter *iter; - struct bkey_s_c k; - struct bkey_on_stack new; - struct bch_extent_crc_unpacked new_crc; u64 data_offset = rbio->pos.offset - rbio->pick.crc.offset; - int ret; - - if (rbio->pick.crc.compression_type) - return; + struct bch_extent_crc_unpacked new_crc; + struct btree_iter *iter = NULL; + struct bkey_i *new; + struct bkey_s_c k; + int ret = 0; - bkey_on_stack_init(&new); - bch2_trans_init(&trans, c, 0, 0); -retry: - bch2_trans_begin(&trans); + if (crc_is_compressed(rbio->pick.crc)) + return 0; - iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, rbio->pos, + iter = bch2_trans_get_iter(trans, BTREE_ID_EXTENTS, rbio->pos, BTREE_ITER_SLOTS|BTREE_ITER_INTENT); + if ((ret = PTR_ERR_OR_ZERO(iter))) + goto out; + k = bch2_btree_iter_peek_slot(iter); - if (IS_ERR_OR_NULL(k.k)) + if ((ret = bkey_err(k))) goto out; - bkey_on_stack_reassemble(&new, c, k); - k = bkey_i_to_s_c(new.k); + /* + * going to be temporarily appending another checksum entry: + */ + new = bch2_trans_kmalloc(trans, bkey_bytes(k.k) + + BKEY_EXTENT_U64s_MAX * 8); + if ((ret = PTR_ERR_OR_ZERO(new))) + goto out; + + bkey_reassemble(new, k); + k = bkey_i_to_s_c(new); if (bversion_cmp(k.k->version, rbio->version) || !bch2_bkey_matches_ptr(c, k, rbio->pick.ptr, data_offset)) @@ -1732,21 +1754,23 @@ retry: bkey_start_offset(k.k) - data_offset, k.k->size, rbio->pick.crc.csum_type)) { bch_err(c, "error verifying existing checksum while narrowing checksum (memory corruption?)"); + ret = 0; goto out; } - if (!bch2_bkey_narrow_crcs(new.k, new_crc)) + if (!bch2_bkey_narrow_crcs(new, new_crc)) goto out; - bch2_trans_update(&trans, iter, new.k, 0); - ret = bch2_trans_commit(&trans, NULL, NULL, - BTREE_INSERT_NOFAIL| - BTREE_INSERT_NOWAIT); - if (ret == -EINTR) - goto retry; + bch2_trans_update(trans, iter, new, 0); out: - bch2_trans_exit(&trans); - bkey_on_stack_exit(&new, c); + bch2_trans_iter_put(trans, iter); + return ret; +} + +static noinline void bch2_rbio_narrow_crcs(struct bch_read_bio *rbio) +{ + bch2_trans_do(rbio->c, NULL, NULL, BTREE_INSERT_NOFAIL, + __bch2_rbio_narrow_crcs(&trans, rbio)); } /* Inner part that may run in process context */ @@ -1786,7 +1810,7 @@ static void __bch2_read_endio(struct work_struct *work) crc.offset += rbio->offset_into_extent; crc.live_size = bvec_iter_sectors(rbio->bvec_iter); - if (crc.compression_type != BCH_COMPRESSION_TYPE_none) { + if (crc_is_compressed(crc)) { bch2_encrypt_bio(c, crc.csum_type, nonce, src); if (bch2_bio_uncompress(c, src, dst, dst_iter, crc)) goto decompression_err; @@ -1883,7 +1907,7 @@ static void bch2_read_endio(struct bio *bio) } if (rbio->narrow_crcs || - rbio->pick.crc.compression_type || + crc_is_compressed(rbio->pick.crc) || bch2_csum_type_is_encryption(rbio->pick.crc.csum_type)) context = RBIO_CONTEXT_UNBOUND, wq = system_unbound_wq; else if (rbio->pick.crc.csum_type) @@ -1994,7 +2018,7 @@ int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig, EBUG_ON(offset_into_extent + bvec_iter_sectors(iter) > k.k->size); - if (pick.crc.compression_type != BCH_COMPRESSION_TYPE_none || + if 
(crc_is_compressed(pick.crc) || (pick.crc.csum_type != BCH_CSUM_NONE && (bvec_iter_sectors(iter) != pick.crc.uncompressed_size || (bch2_csum_type_is_encryption(pick.crc.csum_type) && @@ -2009,7 +2033,7 @@ int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig, &rbio, &bounce, &read_full); if (!read_full) { - EBUG_ON(pick.crc.compression_type); + EBUG_ON(crc_is_compressed(pick.crc)); EBUG_ON(pick.crc.csum_type && (bvec_iter_sectors(iter) != pick.crc.uncompressed_size || bvec_iter_sectors(iter) != pick.crc.live_size || diff --git a/fs/bcachefs/io.h b/fs/bcachefs/io.h index 45c950942d78..e45dcf9635ae 100644 --- a/fs/bcachefs/io.h +++ b/fs/bcachefs/io.h @@ -31,10 +31,11 @@ enum bch_write_flags { BCH_WRITE_ONLY_SPECIFIED_DEVS = (1 << 6), BCH_WRITE_NOPUT_RESERVATION = (1 << 7), BCH_WRITE_WROTE_DATA_INLINE = (1 << 8), + BCH_WRITE_FROM_INTERNAL = (1 << 9), /* Internal: */ - BCH_WRITE_JOURNAL_SEQ_PTR = (1 << 9), - BCH_WRITE_SKIP_CLOSURE_PUT = (1 << 10), + BCH_WRITE_JOURNAL_SEQ_PTR = (1 << 10), + BCH_WRITE_SKIP_CLOSURE_PUT = (1 << 11), }; static inline u64 *op_journal_seq(struct bch_write_op *op) @@ -78,6 +79,7 @@ static inline void bch2_write_op_init(struct bch_write_op *op, struct bch_fs *c, op->nr_replicas = 0; op->nr_replicas_required = c->opts.data_replicas_required; op->alloc_reserve = RESERVE_NONE; + op->incompressible = 0; op->open_buckets.nr = 0; op->devs_have.nr = 0; op->target = 0; diff --git a/fs/bcachefs/io_types.h b/fs/bcachefs/io_types.h index c37b7d7401e9..684e4c9a5d98 100644 --- a/fs/bcachefs/io_types.h +++ b/fs/bcachefs/io_types.h @@ -104,7 +104,8 @@ struct bch_write_op { unsigned compression_type:4; unsigned nr_replicas:4; unsigned nr_replicas_required:4; - unsigned alloc_reserve:4; + unsigned alloc_reserve:3; + unsigned incompressible:1; struct bch_devs_list devs_have; u16 target; diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index 9f03a479c9a2..0a4538b3dc60 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -376,7 +376,8 @@ unlock: goto retry; if (ret == -ENOSPC) { - BUG_ON(!can_discard && (flags & JOURNAL_RES_GET_RESERVED)); + WARN_ONCE(!can_discard && (flags & JOURNAL_RES_GET_RESERVED), + "JOURNAL_RES_GET_RESERVED set but journal full"); /* * Journal is full - can't rely on reclaim from work item due to diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index 7112a25d0600..39bb2154cce1 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" #include "alloc_foreground.h" +#include "btree_io.h" #include "buckets.h" #include "checksum.h" #include "error.h" @@ -138,7 +139,8 @@ static void journal_entry_null_range(void *start, void *end) static int journal_validate_key(struct bch_fs *c, struct jset *jset, struct jset_entry *entry, - struct bkey_i *k, enum btree_node_type key_type, + unsigned level, enum btree_id btree_id, + struct bkey_i *k, const char *type, int write) { void *next = vstruct_next(entry); @@ -171,14 +173,13 @@ static int journal_validate_key(struct bch_fs *c, struct jset *jset, return 0; } - if (JSET_BIG_ENDIAN(jset) != CPU_BIG_ENDIAN) - bch2_bkey_swab(NULL, bkey_to_packed(k)); + if (!write) + bch2_bkey_compat(level, btree_id, version, + JSET_BIG_ENDIAN(jset), write, + NULL, bkey_to_packed(k)); - if (!write && - version < bcachefs_metadata_version_bkey_renumber) - bch2_bkey_renumber(key_type, bkey_to_packed(k), write); - - invalid = bch2_bkey_invalid(c, bkey_i_to_s_c(k), key_type); + invalid = bch2_bkey_invalid(c, 
bkey_i_to_s_c(k), + __btree_node_type(level, btree_id)); if (invalid) { char buf[160]; @@ -192,9 +193,10 @@ static int journal_validate_key(struct bch_fs *c, struct jset *jset, return 0; } - if (write && - version < bcachefs_metadata_version_bkey_renumber) - bch2_bkey_renumber(key_type, bkey_to_packed(k), write); + if (write) + bch2_bkey_compat(level, btree_id, version, + JSET_BIG_ENDIAN(jset), write, + NULL, bkey_to_packed(k)); fsck_err: return ret; } @@ -207,10 +209,10 @@ static int journal_entry_validate_btree_keys(struct bch_fs *c, struct bkey_i *k; vstruct_for_each(entry, k) { - int ret = journal_validate_key(c, jset, entry, k, - __btree_node_type(entry->level, - entry->btree_id), - "key", write); + int ret = journal_validate_key(c, jset, entry, + entry->level, + entry->btree_id, + k, "key", write); if (ret) return ret; } @@ -240,7 +242,7 @@ static int journal_entry_validate_btree_root(struct bch_fs *c, return 0; } - return journal_validate_key(c, jset, entry, k, BKEY_TYPE_BTREE, + return journal_validate_key(c, jset, entry, 1, entry->btree_id, k, "btree root", write); fsck_err: return ret; @@ -1016,8 +1018,7 @@ void bch2_journal_write(struct closure *cl) if (bch2_csum_type_is_encryption(JSET_CSUM_TYPE(jset))) validate_before_checksum = true; - if (le32_to_cpu(jset->version) < - bcachefs_metadata_version_bkey_renumber) + if (le32_to_cpu(jset->version) < bcachefs_metadata_version_max) validate_before_checksum = true; if (validate_before_checksum && @@ -1041,9 +1042,16 @@ void bch2_journal_write(struct closure *cl) bytes = vstruct_bytes(jset); memset((void *) jset + bytes, 0, (sectors << 9) - bytes); +retry_alloc: spin_lock(&j->lock); ret = journal_write_alloc(j, w, sectors); + if (ret && j->can_discard) { + spin_unlock(&j->lock); + bch2_journal_do_discards(j); + goto retry_alloc; + } + /* * write is allocated, no longer need to account for it in * bch2_journal_space_available(): diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c index 695b2c8ba03b..db3afd908474 100644 --- a/fs/bcachefs/journal_reclaim.c +++ b/fs/bcachefs/journal_reclaim.c @@ -290,38 +290,6 @@ void bch2_journal_pin_put(struct journal *j, u64 seq) } } -static inline void __journal_pin_add(struct journal *j, - u64 seq, - struct journal_entry_pin *pin, - journal_pin_flush_fn flush_fn) -{ - struct journal_entry_pin_list *pin_list = journal_seq_pin(j, seq); - - BUG_ON(journal_pin_active(pin)); - BUG_ON(!atomic_read(&pin_list->count)); - - atomic_inc(&pin_list->count); - pin->seq = seq; - pin->flush = flush_fn; - - list_add(&pin->list, flush_fn ? 
&pin_list->list : &pin_list->flushed); - - /* - * If the journal is currently full, we might want to call flush_fn - * immediately: - */ - journal_wake(j); -} - -void bch2_journal_pin_add(struct journal *j, u64 seq, - struct journal_entry_pin *pin, - journal_pin_flush_fn flush_fn) -{ - spin_lock(&j->lock); - __journal_pin_add(j, seq, pin, flush_fn); - spin_unlock(&j->lock); -} - static inline void __journal_pin_drop(struct journal *j, struct journal_entry_pin *pin) { @@ -354,42 +322,46 @@ void bch2_journal_pin_drop(struct journal *j, spin_unlock(&j->lock); } -void bch2_journal_pin_update(struct journal *j, u64 seq, - struct journal_entry_pin *pin, - journal_pin_flush_fn flush_fn) +void __bch2_journal_pin_add(struct journal *j, u64 seq, + struct journal_entry_pin *pin, + journal_pin_flush_fn flush_fn) { + struct journal_entry_pin_list *pin_list = journal_seq_pin(j, seq); + spin_lock(&j->lock); - if (pin->seq != seq) { - __journal_pin_drop(j, pin); - __journal_pin_add(j, seq, pin, flush_fn); - } else { - struct journal_entry_pin_list *pin_list = - journal_seq_pin(j, seq); + __journal_pin_drop(j, pin); + + BUG_ON(!atomic_read(&pin_list->count)); - list_move(&pin->list, &pin_list->list); - } + atomic_inc(&pin_list->count); + pin->seq = seq; + pin->flush = flush_fn; + + list_add(&pin->list, flush_fn ? &pin_list->list : &pin_list->flushed); spin_unlock(&j->lock); + + /* + * If the journal is currently full, we might want to call flush_fn + * immediately: + */ + journal_wake(j); } -void bch2_journal_pin_add_if_older(struct journal *j, - struct journal_entry_pin *src_pin, - struct journal_entry_pin *pin, - journal_pin_flush_fn flush_fn) +void bch2_journal_pin_copy(struct journal *j, + struct journal_entry_pin *dst, + struct journal_entry_pin *src, + journal_pin_flush_fn flush_fn) { - spin_lock(&j->lock); - - if (journal_pin_active(src_pin) && - (!journal_pin_active(pin) || - src_pin->seq < pin->seq)) { - __journal_pin_drop(j, pin); - __journal_pin_add(j, src_pin->seq, pin, flush_fn); - } - - spin_unlock(&j->lock); + if (journal_pin_active(src) && + (!journal_pin_active(dst) || src->seq < dst->seq)) + __bch2_journal_pin_add(j, src->seq, dst, flush_fn); } +/** + * bch2_journal_pin_flush: ensure journal pin callback is no longer running + */ void bch2_journal_pin_flush(struct journal *j, struct journal_entry_pin *pin) { BUG_ON(journal_pin_active(pin)); diff --git a/fs/bcachefs/journal_reclaim.h b/fs/bcachefs/journal_reclaim.h index 9bf982a17797..883a0a5680af 100644 --- a/fs/bcachefs/journal_reclaim.h +++ b/fs/bcachefs/journal_reclaim.h @@ -29,16 +29,24 @@ journal_seq_pin(struct journal *j, u64 seq) } void bch2_journal_pin_put(struct journal *, u64); - -void bch2_journal_pin_add(struct journal *, u64, struct journal_entry_pin *, - journal_pin_flush_fn); -void bch2_journal_pin_update(struct journal *, u64, struct journal_entry_pin *, - journal_pin_flush_fn); void bch2_journal_pin_drop(struct journal *, struct journal_entry_pin *); -void bch2_journal_pin_add_if_older(struct journal *, - struct journal_entry_pin *, - struct journal_entry_pin *, - journal_pin_flush_fn); + +void __bch2_journal_pin_add(struct journal *, u64, struct journal_entry_pin *, + journal_pin_flush_fn); + +static inline void bch2_journal_pin_add(struct journal *j, u64 seq, + struct journal_entry_pin *pin, + journal_pin_flush_fn flush_fn) +{ + if (unlikely(!journal_pin_active(pin))) + __bch2_journal_pin_add(j, seq, pin, flush_fn); +} + +void bch2_journal_pin_copy(struct journal *, + struct journal_entry_pin *, + struct 
journal_entry_pin *, + journal_pin_flush_fn); + void bch2_journal_pin_flush(struct journal *, struct journal_entry_pin *); void bch2_journal_do_discards(struct journal *); diff --git a/fs/bcachefs/migrate.c b/fs/bcachefs/migrate.c index 1ef62a189e33..e26fa1608f39 100644 --- a/fs/bcachefs/migrate.c +++ b/fs/bcachefs/migrate.c @@ -123,23 +123,21 @@ static int bch2_dev_metadata_drop(struct bch_fs *c, unsigned dev_idx, int flags) for_each_btree_node(&trans, iter, id, POS_MIN, BTREE_ITER_PREFETCH, b) { __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX) tmp; - struct bkey_i_btree_ptr *new_key; retry: if (!bch2_bkey_has_device(bkey_i_to_s_c(&b->key), dev_idx)) continue; bkey_copy(&tmp.k, &b->key); - new_key = bkey_i_to_btree_ptr(&tmp.k); - ret = drop_dev_ptrs(c, bkey_i_to_s(&new_key->k_i), + ret = drop_dev_ptrs(c, bkey_i_to_s(&tmp.k), dev_idx, flags, true); if (ret) { bch_err(c, "Cannot drop device without losing data"); goto err; } - ret = bch2_btree_node_update_key(c, iter, b, new_key); + ret = bch2_btree_node_update_key(c, iter, b, &tmp.k); if (ret == -EINTR) { b = bch2_btree_iter_peek_node(iter); goto retry; diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index 20885b605b50..4afda95f4017 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -215,6 +215,9 @@ int bch2_migrate_write_init(struct bch_fs *c, struct migrate_write *m, enum btree_id btree_id, struct bkey_s_c k) { + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; int ret; m->btree_id = btree_id; @@ -223,9 +226,14 @@ int bch2_migrate_write_init(struct bch_fs *c, struct migrate_write *m, m->nr_ptrs_reserved = 0; bch2_write_op_init(&m->op, c, io_opts); - m->op.compression_type = - bch2_compression_opt_to_type[io_opts.background_compression ?: - io_opts.compression]; + + if (!bch2_bkey_is_incompressible(k)) + m->op.compression_type = + bch2_compression_opt_to_type[io_opts.background_compression ?: + io_opts.compression]; + else + m->op.incompressible = true; + m->op.target = data_opts.target, m->op.write_point = wp; @@ -235,7 +243,8 @@ int bch2_migrate_write_init(struct bch_fs *c, struct migrate_write *m, m->op.flags |= BCH_WRITE_ONLY_SPECIFIED_DEVS| BCH_WRITE_PAGES_STABLE| BCH_WRITE_PAGES_OWNED| - BCH_WRITE_DATA_ENCODED; + BCH_WRITE_DATA_ENCODED| + BCH_WRITE_FROM_INTERNAL; m->op.nr_replicas = 1; m->op.nr_replicas_required = 1; @@ -265,14 +274,11 @@ int bch2_migrate_write_init(struct bch_fs *c, struct migrate_write *m, break; } case DATA_REWRITE: { - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - const union bch_extent_entry *entry; - struct extent_ptr_decoded p; unsigned compressed_sectors = 0; bkey_for_each_ptr_decode(k.k, ptrs, p, entry) if (!p.ptr.cached && - p.crc.compression_type != BCH_COMPRESSION_TYPE_none && + crc_is_compressed(p.crc) && bch2_dev_in_target(c, p.ptr.dev, data_opts.target)) compressed_sectors += p.crc.compressed_size; @@ -300,12 +306,12 @@ static void move_free(struct closure *cl) { struct moving_io *io = container_of(cl, struct moving_io, cl); struct moving_context *ctxt = io->write.ctxt; + struct bvec_iter_all iter; struct bio_vec *bv; - unsigned i; bch2_disk_reservation_put(io->write.op.c, &io->write.op.res); - bio_for_each_segment_all(bv, &io->write.op.wbio.bio, i) + bio_for_each_segment_all(bv, &io->write.op.wbio.bio, iter) if (bv->bv_page) __free_page(bv->bv_page); diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h index 1c05effa71e6..ba4903352343 100644 --- a/fs/bcachefs/opts.h +++ b/fs/bcachefs/opts.h @@ -255,6 +255,11 @@ enum 
opt_type { OPT_BOOL(), \ NO_SB_OPT, false, \ NULL, "Don't replay the journal") \ + x(keep_journal, u8, \ + OPT_MOUNT, \ + OPT_BOOL(), \ + NO_SB_OPT, false, \ + NULL, "Don't free journal entries/keys after startup")\ x(noexcl, u8, \ OPT_MOUNT, \ OPT_BOOL(), \ diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c index 612385e9d4e4..ab1934325948 100644 --- a/fs/bcachefs/rebalance.c +++ b/fs/bcachefs/rebalance.c @@ -17,50 +17,52 @@ #include <linux/sched/cputime.h> #include <trace/events/bcachefs.h> -static inline bool rebalance_ptr_pred(struct bch_fs *c, - struct extent_ptr_decoded p, - struct bch_io_opts *io_opts) +/* + * Check if an extent should be moved: + * returns -1 if it should not be moved, or + * device of pointer that should be moved, if known, or INT_MAX if unknown + */ +static int __bch2_rebalance_pred(struct bch_fs *c, + struct bkey_s_c k, + struct bch_io_opts *io_opts) { - if (io_opts->background_target && - !bch2_dev_in_target(c, p.ptr.dev, io_opts->background_target) && - !p.ptr.cached) - return true; + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; if (io_opts->background_compression && - p.crc.compression_type != - bch2_compression_opt_to_type[io_opts->background_compression]) - return true; - - return false; + !bch2_bkey_is_incompressible(k)) + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) + if (!p.ptr.cached && + p.crc.compression_type != + bch2_compression_opt_to_type[io_opts->background_compression]) + return p.ptr.dev; + + if (io_opts->background_target) + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) + if (!p.ptr.cached && + !bch2_dev_in_target(c, p.ptr.dev, io_opts->background_target)) + return p.ptr.dev; + + return -1; } void bch2_rebalance_add_key(struct bch_fs *c, struct bkey_s_c k, struct bch_io_opts *io_opts) { - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - const union bch_extent_entry *entry; - struct extent_ptr_decoded p; + atomic64_t *counter; + int dev; - if (!io_opts->background_target && - !io_opts->background_compression) + dev = __bch2_rebalance_pred(c, k, io_opts); + if (dev < 0) return; - bkey_for_each_ptr_decode(k.k, ptrs, p, entry) - if (rebalance_ptr_pred(c, p, io_opts)) { - struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev); + counter = dev < INT_MAX + ? 
&bch_dev_bkey_exists(c, dev)->rebalance_work + : &c->rebalance.work_unknown_dev; - if (atomic64_add_return(p.crc.compressed_size, - &ca->rebalance_work) == - p.crc.compressed_size) - rebalance_wakeup(c); - } -} - -void bch2_rebalance_add_work(struct bch_fs *c, u64 sectors) -{ - if (atomic64_add_return(sectors, &c->rebalance.work_unknown_dev) == - sectors) + if (atomic64_add_return(k.k->size, counter) == k.k->size) rebalance_wakeup(c); } @@ -69,26 +71,20 @@ static enum data_cmd rebalance_pred(struct bch_fs *c, void *arg, struct bch_io_opts *io_opts, struct data_opts *data_opts) { - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - const union bch_extent_entry *entry; - struct extent_ptr_decoded p; - unsigned nr_replicas = 0; - - bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { - nr_replicas += !p.ptr.cached; - - if (rebalance_ptr_pred(c, p, io_opts)) - goto found; + if (__bch2_rebalance_pred(c, k, io_opts) >= 0) { + data_opts->target = io_opts->background_target; + data_opts->btree_insert_flags = 0; + return DATA_ADD_REPLICAS; + } else { + return DATA_SKIP; } +} - if (nr_replicas < io_opts->data_replicas) - goto found; - - return DATA_SKIP; -found: - data_opts->target = io_opts->background_target; - data_opts->btree_insert_flags = 0; - return DATA_ADD_REPLICAS; +void bch2_rebalance_add_work(struct bch_fs *c, u64 sectors) +{ + if (atomic64_add_return(sectors, &c->rebalance.work_unknown_dev) == + sectors) + rebalance_wakeup(c); } struct rebalance_work { diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 8ecd4abc8eeb..a4d0eec2ea3e 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -27,43 +27,173 @@ /* iterate over keys read from the journal: */ -struct journal_iter bch2_journal_iter_init(struct journal_keys *keys, - enum btree_id id) +static struct journal_key *journal_key_search(struct journal_keys *journal_keys, + enum btree_id id, unsigned level, + struct bpos pos) { - return (struct journal_iter) { - .keys = keys, - .k = keys->d, - .btree_id = id, - }; + size_t l = 0, r = journal_keys->nr, m; + + while (l < r) { + m = l + ((r - l) >> 1); + if ((cmp_int(id, journal_keys->d[m].btree_id) ?: + cmp_int(level, journal_keys->d[m].level) ?: + bkey_cmp(pos, journal_keys->d[m].k->k.p)) > 0) + l = m + 1; + else + r = m; + } + + BUG_ON(l < journal_keys->nr && + (cmp_int(id, journal_keys->d[l].btree_id) ?: + cmp_int(level, journal_keys->d[l].level) ?: + bkey_cmp(pos, journal_keys->d[l].k->k.p)) > 0); + + BUG_ON(l && + (cmp_int(id, journal_keys->d[l - 1].btree_id) ?: + cmp_int(level, journal_keys->d[l - 1].level) ?: + bkey_cmp(pos, journal_keys->d[l - 1].k->k.p)) <= 0); + + return l < journal_keys->nr ? 
journal_keys->d + l : NULL; +} + +static struct bkey_i *bch2_journal_iter_peek(struct journal_iter *iter) +{ + if (iter->k && + iter->k < iter->keys->d + iter->keys->nr && + iter->k->btree_id == iter->btree_id && + iter->k->level == iter->level) + return iter->k->k; + + iter->k = NULL; + return NULL; } -struct bkey_s_c bch2_journal_iter_peek(struct journal_iter *iter) +static void bch2_journal_iter_advance(struct journal_iter *iter) { + if (iter->k) + iter->k++; +} + +static void bch2_journal_iter_init(struct journal_iter *iter, + struct journal_keys *journal_keys, + enum btree_id id, unsigned level, + struct bpos pos) +{ + iter->btree_id = id; + iter->level = level; + iter->keys = journal_keys; + iter->k = journal_key_search(journal_keys, id, level, pos); +} + +static struct bkey_s_c bch2_journal_iter_peek_btree(struct btree_and_journal_iter *iter) +{ + return iter->btree + ? bch2_btree_iter_peek(iter->btree) + : bch2_btree_node_iter_peek_unpack(&iter->node_iter, + iter->b, &iter->unpacked); +} + +static void bch2_journal_iter_advance_btree(struct btree_and_journal_iter *iter) +{ + if (iter->btree) + bch2_btree_iter_next(iter->btree); + else + bch2_btree_node_iter_advance(&iter->node_iter, iter->b); +} + +void bch2_btree_and_journal_iter_advance(struct btree_and_journal_iter *iter) +{ + switch (iter->last) { + case none: + break; + case btree: + bch2_journal_iter_advance_btree(iter); + break; + case journal: + bch2_journal_iter_advance(&iter->journal); + break; + } + + iter->last = none; +} + +struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *iter) +{ + struct bkey_s_c ret; + while (1) { - if (iter->k == iter->keys->d + iter->keys->nr) + struct bkey_s_c btree_k = + bch2_journal_iter_peek_btree(iter); + struct bkey_s_c journal_k = + bkey_i_to_s_c(bch2_journal_iter_peek(&iter->journal)); + + if (btree_k.k && journal_k.k) { + int cmp = bkey_cmp(btree_k.k->p, journal_k.k->p); + + if (!cmp) + bch2_journal_iter_advance_btree(iter); + + iter->last = cmp < 0 ? btree : journal; + } else if (btree_k.k) { + iter->last = btree; + } else if (journal_k.k) { + iter->last = journal; + } else { + iter->last = none; return bkey_s_c_null; + } - if (iter->k->btree_id == iter->btree_id) - return bkey_i_to_s_c(iter->k->k); + ret = iter->last == journal ? 
journal_k : btree_k; - iter->k++; + if (iter->b && + bkey_cmp(ret.k->p, iter->b->data->max_key) > 0) { + iter->journal.k = NULL; + iter->last = none; + return bkey_s_c_null; + } + + if (!bkey_deleted(ret.k)) + break; + + bch2_btree_and_journal_iter_advance(iter); } - return bkey_s_c_null; + return ret; +} + +struct bkey_s_c bch2_btree_and_journal_iter_next(struct btree_and_journal_iter *iter) +{ + bch2_btree_and_journal_iter_advance(iter); + + return bch2_btree_and_journal_iter_peek(iter); +} + +void bch2_btree_and_journal_iter_init(struct btree_and_journal_iter *iter, + struct btree_trans *trans, + struct journal_keys *journal_keys, + enum btree_id id, struct bpos pos) +{ + memset(iter, 0, sizeof(*iter)); + + iter->btree = bch2_trans_get_iter(trans, id, pos, 0); + bch2_journal_iter_init(&iter->journal, journal_keys, id, 0, pos); } -struct bkey_s_c bch2_journal_iter_next(struct journal_iter *iter) +void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *iter, + struct journal_keys *journal_keys, + struct btree *b) { - if (iter->k == iter->keys->d + iter->keys->nr) - return bkey_s_c_null; + memset(iter, 0, sizeof(*iter)); - iter->k++; - return bch2_journal_iter_peek(iter); + iter->b = b; + bch2_btree_node_iter_init_from_start(&iter->node_iter, iter->b); + bch2_journal_iter_init(&iter->journal, journal_keys, + b->btree_id, b->level, b->data->min_key); } /* sort and dedup all keys in the journal: */ -static void journal_entries_free(struct list_head *list) +void bch2_journal_entries_free(struct list_head *list) { while (!list_empty(list)) { @@ -75,13 +205,17 @@ static void journal_entries_free(struct list_head *list) } } +/* + * When keys compare equal, oldest compares first: + */ static int journal_sort_key_cmp(const void *_l, const void *_r) { const struct journal_key *l = _l; const struct journal_key *r = _r; - return cmp_int(l->btree_id, r->btree_id) ?: - bkey_cmp(l->pos, r->pos) ?: + return cmp_int(l->btree_id, r->btree_id) ?: + cmp_int(l->level, r->level) ?: + bkey_cmp(l->k->k.p, r->k->k.p) ?: cmp_int(l->journal_seq, r->journal_seq) ?: cmp_int(l->journal_offset, r->journal_offset); } @@ -91,27 +225,14 @@ static int journal_sort_seq_cmp(const void *_l, const void *_r) const struct journal_key *l = _l; const struct journal_key *r = _r; - return cmp_int(l->journal_seq, r->journal_seq) ?: - cmp_int(l->btree_id, r->btree_id) ?: - bkey_cmp(l->pos, r->pos); -} - -static void journal_keys_sift(struct journal_keys *keys, struct journal_key *i) -{ - while (i + 1 < keys->d + keys->nr && - journal_sort_key_cmp(i, i + 1) > 0) { - swap(i[0], i[1]); - i++; - } + return cmp_int(r->level, l->level) ?: + cmp_int(l->journal_seq, r->journal_seq) ?: + cmp_int(l->btree_id, r->btree_id) ?: + bkey_cmp(l->k->k.p, r->k->k.p); } -static void journal_keys_free(struct journal_keys *keys) +void bch2_journal_keys_free(struct journal_keys *keys) { - struct journal_key *i; - - for_each_journal_key(*keys, i) - if (i->allocated) - kfree(i->k); kvfree(keys->d); keys->d = NULL; keys->nr = 0; @@ -122,15 +243,15 @@ static struct journal_keys journal_keys_sort(struct list_head *journal_entries) struct journal_replay *p; struct jset_entry *entry; struct bkey_i *k, *_n; - struct journal_keys keys = { NULL }, keys_deduped = { NULL }; - struct journal_key *i; + struct journal_keys keys = { NULL }; + struct journal_key *src, *dst; size_t nr_keys = 0; list_for_each_entry(p, journal_entries, list) for_each_jset_key(k, _n, entry, &p->j) nr_keys++; - keys.journal_seq_base = keys_deduped.journal_seq_base = + 
keys.journal_seq_base = le64_to_cpu(list_first_entry(journal_entries, struct journal_replay, list)->j.seq); @@ -139,91 +260,33 @@ static struct journal_keys journal_keys_sort(struct list_head *journal_entries) if (!keys.d) goto err; - keys_deduped.d = kvmalloc(sizeof(keys.d[0]) * nr_keys * 2, GFP_KERNEL); - if (!keys_deduped.d) - goto err; - list_for_each_entry(p, journal_entries, list) for_each_jset_key(k, _n, entry, &p->j) keys.d[keys.nr++] = (struct journal_key) { .btree_id = entry->btree_id, - .pos = bkey_start_pos(&k->k), + .level = entry->level, .k = k, .journal_seq = le64_to_cpu(p->j.seq) - keys.journal_seq_base, .journal_offset = k->_data - p->j._data, }; - sort(keys.d, nr_keys, sizeof(keys.d[0]), journal_sort_key_cmp, NULL); - - i = keys.d; - while (i < keys.d + keys.nr) { - if (i + 1 < keys.d + keys.nr && - i[0].btree_id == i[1].btree_id && - !bkey_cmp(i[0].pos, i[1].pos)) { - if (bkey_cmp(i[0].k->k.p, i[1].k->k.p) <= 0) { - i++; - } else { - bch2_cut_front(i[1].k->k.p, i[0].k); - i[0].pos = i[1].k->k.p; - journal_keys_sift(&keys, i); - } - continue; - } + sort(keys.d, keys.nr, sizeof(keys.d[0]), journal_sort_key_cmp, NULL); - if (i + 1 < keys.d + keys.nr && - i[0].btree_id == i[1].btree_id && - bkey_cmp(i[0].k->k.p, bkey_start_pos(&i[1].k->k)) > 0) { - if ((cmp_int(i[0].journal_seq, i[1].journal_seq) ?: - cmp_int(i[0].journal_offset, i[1].journal_offset)) < 0) { - if (bkey_cmp(i[0].k->k.p, i[1].k->k.p) <= 0) { - bch2_cut_back(bkey_start_pos(&i[1].k->k), i[0].k); - } else { - struct bkey_i *split = - kmalloc(bkey_bytes(i[0].k), GFP_KERNEL); - - if (!split) - goto err; - - bkey_copy(split, i[0].k); - bch2_cut_back(bkey_start_pos(&i[1].k->k), split); - keys_deduped.d[keys_deduped.nr++] = (struct journal_key) { - .btree_id = i[0].btree_id, - .allocated = true, - .pos = bkey_start_pos(&split->k), - .k = split, - .journal_seq = i[0].journal_seq, - .journal_offset = i[0].journal_offset, - }; - - bch2_cut_front(i[1].k->k.p, i[0].k); - i[0].pos = i[1].k->k.p; - journal_keys_sift(&keys, i); - continue; - } - } else { - if (bkey_cmp(i[0].k->k.p, i[1].k->k.p) >= 0) { - i[1] = i[0]; - i++; - continue; - } else { - bch2_cut_front(i[0].k->k.p, i[1].k); - i[1].pos = i[0].k->k.p; - journal_keys_sift(&keys, i + 1); - continue; - } - } - } + src = dst = keys.d; + while (src < keys.d + keys.nr) { + while (src + 1 < keys.d + keys.nr && + src[0].btree_id == src[1].btree_id && + src[0].level == src[1].level && + !bkey_cmp(src[0].k->k.p, src[1].k->k.p)) + src++; - keys_deduped.d[keys_deduped.nr++] = *i++; + *dst++ = *src++; } - kvfree(keys.d); - return keys_deduped; + keys.nr = dst - keys.d; err: - journal_keys_free(&keys_deduped); - kvfree(keys.d); - return (struct journal_keys) { NULL }; + return keys; } /* journal replay: */ @@ -274,11 +337,6 @@ retry: atomic_end = bpos_min(k->k.p, iter->l[0].b->key.k.p); - split_iter = bch2_trans_copy_iter(&trans, iter); - ret = PTR_ERR_OR_ZERO(split_iter); - if (ret) - goto err; - split = bch2_trans_kmalloc(&trans, bkey_bytes(&k->k)); ret = PTR_ERR_OR_ZERO(split); if (ret) @@ -297,12 +355,25 @@ retry: } bkey_copy(split, k); - bch2_cut_front(split_iter->pos, split); + bch2_cut_front(iter->pos, split); bch2_cut_back(atomic_end, split); + split_iter = bch2_trans_copy_iter(&trans, iter); + ret = PTR_ERR_OR_ZERO(split_iter); + if (ret) + goto err; + + /* + * It's important that we don't go through the + * extent_handle_overwrites() and extent_update_to_keys() path + * here: journal replay is supposed to treat extents like + * regular keys + */ + 
__bch2_btree_iter_set_pos(split_iter, split->k.p, false); bch2_trans_update(&trans, split_iter, split, !remark ? BTREE_TRIGGER_NORUN : BTREE_TRIGGER_NOOVERWRITES); + bch2_btree_iter_set_pos(iter, split->k.p); } while (bkey_cmp(iter->pos, k->k.p) < 0); @@ -328,27 +399,40 @@ err: } static int __bch2_journal_replay_key(struct btree_trans *trans, - enum btree_id id, struct bkey_i *k) + enum btree_id id, unsigned level, + struct bkey_i *k) { struct btree_iter *iter; + int ret; - iter = bch2_trans_get_iter(trans, id, bkey_start_pos(&k->k), - BTREE_ITER_INTENT); + iter = bch2_trans_get_node_iter(trans, id, k->k.p, + BTREE_MAX_DEPTH, level, + BTREE_ITER_INTENT); if (IS_ERR(iter)) return PTR_ERR(iter); - bch2_trans_update(trans, iter, k, BTREE_TRIGGER_NORUN); - return 0; + /* + * iter->flags & BTREE_ITER_IS_EXTENTS triggers the update path to run + * extent_handle_overwrites() and extent_update_to_keys() - but we don't + * want that here, journal replay is supposed to treat extents like + * regular keys: + */ + __bch2_btree_iter_set_pos(iter, k->k.p, false); + + ret = bch2_btree_iter_traverse(iter) ?: + bch2_trans_update(trans, iter, k, BTREE_TRIGGER_NORUN); + bch2_trans_iter_put(trans, iter); + return ret; } static int bch2_journal_replay_key(struct bch_fs *c, enum btree_id id, - struct bkey_i *k) + unsigned level, struct bkey_i *k) { return bch2_trans_do(c, NULL, NULL, BTREE_INSERT_NOFAIL| BTREE_INSERT_LAZY_RW| BTREE_INSERT_JOURNAL_REPLAY, - __bch2_journal_replay_key(&trans, id, k)); + __bch2_journal_replay_key(&trans, id, level, k)); } static int bch2_journal_replay(struct bch_fs *c, @@ -360,15 +444,21 @@ static int bch2_journal_replay(struct bch_fs *c, sort(keys.d, keys.nr, sizeof(keys.d[0]), journal_sort_seq_cmp, NULL); + if (keys.nr) + replay_now_at(j, keys.journal_seq_base); + for_each_journal_key(keys, i) { - replay_now_at(j, keys.journal_seq_base + i->journal_seq); + if (!i->level) + replay_now_at(j, keys.journal_seq_base + i->journal_seq); + if (i->level) + ret = bch2_journal_replay_key(c, i->btree_id, i->level, i->k); if (i->btree_id == BTREE_ID_ALLOC) ret = bch2_alloc_replay_key(c, i->k); - else if (btree_node_type_is_extents(i->btree_id)) + else if (i->k->k.size) ret = bch2_extent_replay_key(c, i->btree_id, i->k); else - ret = bch2_journal_replay_key(c, i->btree_id, i->k); + ret = bch2_journal_replay_key(c, i->btree_id, i->level, i->k); if (ret) { bch_err(c, "journal replay: error %d while replaying key", @@ -707,8 +797,6 @@ int bch2_fs_recovery(struct bch_fs *c) const char *err = "cannot allocate memory"; struct bch_sb_field_clean *clean = NULL; u64 journal_seq; - LIST_HEAD(journal_entries); - struct journal_keys journal_keys = { NULL }; bool wrote = false, write_sb = false; int ret; @@ -727,33 +815,33 @@ int bch2_fs_recovery(struct bch_fs *c) set_bit(BCH_FS_REBUILD_REPLICAS, &c->flags); } - if (!c->sb.clean || c->opts.fsck) { + if (!c->sb.clean || c->opts.fsck || c->opts.keep_journal) { struct jset *j; - ret = bch2_journal_read(c, &journal_entries); + ret = bch2_journal_read(c, &c->journal_entries); if (ret) goto err; - if (mustfix_fsck_err_on(c->sb.clean && !journal_empty(&journal_entries), c, + if (mustfix_fsck_err_on(c->sb.clean && !journal_empty(&c->journal_entries), c, "filesystem marked clean but journal not empty")) { c->sb.compat &= ~(1ULL << BCH_COMPAT_FEAT_ALLOC_INFO); SET_BCH_SB_CLEAN(c->disk_sb.sb, false); c->sb.clean = false; } - if (!c->sb.clean && list_empty(&journal_entries)) { + if (!c->sb.clean && list_empty(&c->journal_entries)) { bch_err(c, "no journal entries 
found"); ret = BCH_FSCK_REPAIR_IMPOSSIBLE; goto err; } - journal_keys = journal_keys_sort(&journal_entries); - if (!journal_keys.d) { + c->journal_keys = journal_keys_sort(&c->journal_entries); + if (!c->journal_keys.d) { ret = -ENOMEM; goto err; } - j = &list_last_entry(&journal_entries, + j = &list_last_entry(&c->journal_entries, struct journal_replay, list)->j; ret = verify_superblock_clean(c, &clean, j); @@ -765,7 +853,14 @@ int bch2_fs_recovery(struct bch_fs *c) journal_seq = le64_to_cpu(clean->journal_seq) + 1; } - ret = journal_replay_early(c, clean, &journal_entries); + if (!c->sb.clean && + !(c->sb.features & (1ULL << BCH_FEATURE_extents_above_btree_updates))) { + bch_err(c, "filesystem needs recovery from older version; run fsck from older bcachefs-tools to fix"); + ret = -EINVAL; + goto err; + } + + ret = journal_replay_early(c, clean, &c->journal_entries); if (ret) goto err; @@ -783,15 +878,15 @@ int bch2_fs_recovery(struct bch_fs *c) ret = bch2_blacklist_table_initialize(c); - if (!list_empty(&journal_entries)) { + if (!list_empty(&c->journal_entries)) { ret = verify_journal_entries_not_blacklisted_or_missing(c, - &journal_entries); + &c->journal_entries); if (ret) goto err; } ret = bch2_fs_journal_start(&c->journal, journal_seq, - &journal_entries); + &c->journal_entries); if (ret) goto err; @@ -801,14 +896,14 @@ int bch2_fs_recovery(struct bch_fs *c) bch_verbose(c, "starting alloc read"); err = "error reading allocation information"; - ret = bch2_alloc_read(c, &journal_keys); + ret = bch2_alloc_read(c, &c->journal_keys); if (ret) goto err; bch_verbose(c, "alloc read done"); bch_verbose(c, "starting stripes_read"); err = "error reading stripes"; - ret = bch2_stripes_read(c, &journal_keys); + ret = bch2_stripes_read(c, &c->journal_keys); if (ret) goto err; bch_verbose(c, "stripes_read done"); @@ -824,7 +919,7 @@ int bch2_fs_recovery(struct bch_fs *c) */ bch_info(c, "starting metadata mark and sweep"); err = "error in mark and sweep"; - ret = bch2_gc(c, NULL, true, true); + ret = bch2_gc(c, &c->journal_keys, true, true); if (ret) goto err; bch_verbose(c, "mark and sweep done"); @@ -835,7 +930,7 @@ int bch2_fs_recovery(struct bch_fs *c) test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags)) { bch_info(c, "starting mark and sweep"); err = "error in mark and sweep"; - ret = bch2_gc(c, &journal_keys, true, false); + ret = bch2_gc(c, &c->journal_keys, true, false); if (ret) goto err; bch_verbose(c, "mark and sweep done"); @@ -856,7 +951,7 @@ int bch2_fs_recovery(struct bch_fs *c) bch_verbose(c, "starting journal replay"); err = "journal replay failed"; - ret = bch2_journal_replay(c, journal_keys); + ret = bch2_journal_replay(c, c->journal_keys); if (ret) goto err; bch_verbose(c, "journal replay done"); @@ -922,8 +1017,7 @@ int bch2_fs_recovery(struct bch_fs *c) c->disk_sb.sb->version_min = le16_to_cpu(bcachefs_metadata_version_min); c->disk_sb.sb->version = le16_to_cpu(bcachefs_metadata_version_current); - c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_new_siphash; - c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_new_extent_overwrite; + c->disk_sb.sb->features[0] |= BCH_SB_FEATURES_ALL; write_sb = true; } @@ -953,8 +1047,10 @@ fsck_err: set_bit(BCH_FS_FSCK_DONE, &c->flags); bch2_flush_fsck_errs(c); - journal_keys_free(&journal_keys); - journal_entries_free(&journal_entries); + if (!c->opts.keep_journal) { + bch2_journal_keys_free(&c->journal_keys); + bch2_journal_entries_free(&c->journal_entries); + } kfree(clean); if (ret) bch_err(c, "Error in recovery: %s (%i)", err, ret); @@ -1042,8 
+1138,7 @@ int bch2_fs_initialize(struct bch_fs *c) c->disk_sb.sb->version = c->disk_sb.sb->version_min = le16_to_cpu(bcachefs_metadata_version_current); c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_atomic_nlink; - c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_new_siphash; - c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_new_extent_overwrite; + c->disk_sb.sb->features[0] |= BCH_SB_FEATURES_ALL; SET_BCH_SB_INITIALIZED(c->disk_sb.sb, true); SET_BCH_SB_CLEAN(c->disk_sb.sb, false); diff --git a/fs/bcachefs/recovery.h b/fs/bcachefs/recovery.h index 479ea46f8dcb..19f2f172a26b 100644 --- a/fs/bcachefs/recovery.h +++ b/fs/bcachefs/recovery.h @@ -2,32 +2,50 @@ #ifndef _BCACHEFS_RECOVERY_H #define _BCACHEFS_RECOVERY_H -struct journal_keys { - struct journal_key { - enum btree_id btree_id:8; - unsigned allocated:1; - struct bpos pos; - struct bkey_i *k; - u32 journal_seq; - u32 journal_offset; - } *d; - size_t nr; - u64 journal_seq_base; -}; - #define for_each_journal_key(keys, i) \ for (i = (keys).d; i < (keys).d + (keys).nr; (i)++) struct journal_iter { + enum btree_id btree_id; + unsigned level; struct journal_keys *keys; struct journal_key *k; - enum btree_id btree_id; }; -struct journal_iter bch2_journal_iter_init(struct journal_keys *, - enum btree_id); -struct bkey_s_c bch2_journal_iter_peek(struct journal_iter *); -struct bkey_s_c bch2_journal_iter_next(struct journal_iter *); +/* + * Iterate over keys in the btree, with keys from the journal overlaid on top: + */ + +struct btree_and_journal_iter { + struct btree_iter *btree; + + struct btree *b; + struct btree_node_iter node_iter; + struct bkey unpacked; + + struct journal_iter journal; + + enum last_key_returned { + none, + btree, + journal, + } last; +}; + +void bch2_btree_and_journal_iter_advance(struct btree_and_journal_iter *); +struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *); +struct bkey_s_c bch2_btree_and_journal_iter_next(struct btree_and_journal_iter *); + +void bch2_btree_and_journal_iter_init(struct btree_and_journal_iter *, + struct btree_trans *, + struct journal_keys *, + enum btree_id, struct bpos); +void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *, + struct journal_keys *, + struct btree *); + +void bch2_journal_keys_free(struct journal_keys *); +void bch2_journal_entries_free(struct list_head *); int bch2_fs_recovery(struct bch_fs *); int bch2_fs_initialize(struct bch_fs *); diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c index 3b8c74ca3725..2f223be74926 100644 --- a/fs/bcachefs/reflink.c +++ b/fs/bcachefs/reflink.c @@ -128,10 +128,9 @@ static int bch2_make_extent_indirect(struct btree_trans *trans, bch2_trans_update(trans, extent_iter, &r_p->k_i, 0); err: - if (!IS_ERR(reflink_iter)) { + if (!IS_ERR(reflink_iter)) c->reflink_hint = reflink_iter->pos.offset; - bch2_trans_iter_put(trans, reflink_iter); - } + bch2_trans_iter_put(trans, reflink_iter); return ret; } @@ -185,7 +184,7 @@ s64 bch2_remap_range(struct bch_fs *c, BTREE_ITER_INTENT); while (1) { - bch2_trans_reset(&trans, TRANS_RESET_MEM); + bch2_trans_begin(&trans); trans.mem_top = 0; diff --git a/fs/bcachefs/reflink.h b/fs/bcachefs/reflink.h index ac23b855858c..5445c1cf0797 100644 --- a/fs/bcachefs/reflink.h +++ b/fs/bcachefs/reflink.h @@ -22,6 +22,7 @@ void bch2_reflink_v_to_text(struct printbuf *, struct bch_fs *, #define bch2_bkey_ops_reflink_v (struct bkey_ops) { \ .key_invalid = bch2_reflink_v_invalid, \ .val_to_text = bch2_reflink_v_to_text, \ + .swab = bch2_ptr_swab, \ } 
s64 bch2_remap_range(struct bch_fs *, struct bpos, struct bpos, diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c index 366888b1b36d..be4908575f72 100644 --- a/fs/bcachefs/replicas.c +++ b/fs/bcachefs/replicas.c @@ -112,6 +112,7 @@ void bch2_bkey_to_replicas(struct bch_replicas_entry *e, switch (k.k->type) { case KEY_TYPE_btree_ptr: + case KEY_TYPE_btree_ptr_v2: e->data_type = BCH_DATA_BTREE; extent_to_replicas(k, e); break; diff --git a/fs/bcachefs/str_hash.h b/fs/bcachefs/str_hash.h index f2779159a6b8..dea9b7252b88 100644 --- a/fs/bcachefs/str_hash.h +++ b/fs/bcachefs/str_hash.h @@ -163,6 +163,7 @@ bch2_hash_lookup(struct btree_trans *trans, break; } } + bch2_trans_iter_put(trans, iter); return ERR_PTR(ret ?: -ENOENT); } @@ -187,6 +188,9 @@ bch2_hash_hole(struct btree_trans *trans, return iter; } + iter->flags |= BTREE_ITER_KEEP_UNTIL_COMMIT; + bch2_trans_iter_put(trans, iter); + return ERR_PTR(ret ?: -ENOSPC); } @@ -262,10 +266,8 @@ int bch2_hash_set(struct btree_trans *trans, if (!ret) ret = -ENOSPC; out: - if (!IS_ERR_OR_NULL(slot)) - bch2_trans_iter_put(trans, slot); - if (!IS_ERR_OR_NULL(iter)) - bch2_trans_iter_put(trans, iter); + bch2_trans_iter_put(trans, slot); + bch2_trans_iter_put(trans, iter); return ret; found: @@ -319,13 +321,16 @@ int bch2_hash_delete(struct btree_trans *trans, u64 inode, const void *key) { struct btree_iter *iter; + int ret; iter = bch2_hash_lookup(trans, desc, info, inode, key, BTREE_ITER_INTENT); if (IS_ERR(iter)) return PTR_ERR(iter); - return bch2_hash_delete_at(trans, desc, info, iter); + ret = bch2_hash_delete_at(trans, desc, info, iter); + bch2_trans_iter_put(trans, iter); + return ret; } #endif /* _BCACHEFS_STR_HASH_H */ diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index 43927853210a..6596764c8421 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -956,6 +956,9 @@ int bch2_fs_mark_dirty(struct bch_fs *c) mutex_lock(&c->sb_lock); SET_BCH_SB_CLEAN(c->disk_sb.sb, false); c->disk_sb.sb->compat[0] &= ~(1ULL << BCH_COMPAT_FEAT_ALLOC_METADATA); + c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_new_extent_overwrite; + c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_extents_above_btree_updates; + c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_btree_updates_journalled; ret = bch2_write_super(c); mutex_unlock(&c->sb_lock); @@ -1086,6 +1089,8 @@ void bch2_fs_mark_clean(struct bch_fs *c) c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_FEAT_ALLOC_INFO; c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_FEAT_ALLOC_METADATA; + c->disk_sb.sb->features[0] &= ~(1ULL << BCH_FEATURE_extents_above_btree_updates); + c->disk_sb.sb->features[0] &= ~(1ULL << BCH_FEATURE_btree_updates_journalled); u64s = sizeof(*sb_clean) / sizeof(u64) + c->journal.entry_u64s_reserved; diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 38920fff4500..d2c275ce79ab 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -500,6 +500,8 @@ static void bch2_fs_free(struct bch_fs *c) bch2_io_clock_exit(&c->io_clock[WRITE]); bch2_io_clock_exit(&c->io_clock[READ]); bch2_fs_compress_exit(c); + bch2_journal_keys_free(&c->journal_keys); + bch2_journal_entries_free(&c->journal_entries); percpu_free_rwsem(&c->mark_lock); kfree(c->usage_scratch); free_percpu(c->usage[1]); @@ -549,6 +551,10 @@ void bch2_fs_stop(struct bch_fs *c) cancel_work_sync(&c->journal_seq_blacklist_gc_work); + mutex_lock(&c->state_lock); + bch2_fs_read_only(c); + mutex_unlock(&c->state_lock); + for_each_member_device(ca, c, i) if (ca->kobj.state_in_sysfs && 
ca->disk_sb.bdev) @@ -572,10 +578,6 @@ void bch2_fs_stop(struct bch_fs *c) closure_sync(&c->cl); closure_debug_destroy(&c->cl); - mutex_lock(&c->state_lock); - bch2_fs_read_only(c); - mutex_unlock(&c->state_lock); - /* btree prefetch might have kicked off reads in the background: */ bch2_btree_flush_all_reads(c); @@ -674,6 +676,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) INIT_LIST_HEAD(&c->list); INIT_LIST_HEAD(&c->btree_interior_update_list); + INIT_LIST_HEAD(&c->btree_interior_updates_unwritten); mutex_init(&c->btree_reserve_cache_lock); mutex_init(&c->btree_interior_update_lock); @@ -688,6 +691,8 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) INIT_WORK(&c->journal_seq_blacklist_gc_work, bch2_blacklist_entries_gc); + INIT_LIST_HEAD(&c->journal_entries); + INIT_LIST_HEAD(&c->fsck_errors); mutex_init(&c->fsck_error_lock); diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index 602def1ee95a..d78ffcc0e8a4 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -276,7 +276,7 @@ static ssize_t bch2_compression_stats(struct bch_fs *c, char *buf) struct extent_ptr_decoded p; extent_for_each_ptr_decode(e, p, entry) { - if (p.crc.compression_type == BCH_COMPRESSION_TYPE_none) { + if (!crc_is_compressed(p.crc)) { nr_uncompressed_extents++; uncompressed_sectors += e.k->size; } else { diff --git a/fs/bcachefs/tests.c b/fs/bcachefs/tests.c index 8f9b0cca17da..4dcace650416 100644 --- a/fs/bcachefs/tests.c +++ b/fs/bcachefs/tests.c @@ -18,7 +18,7 @@ static void delete_test_keys(struct bch_fs *c) NULL); BUG_ON(ret); - ret = bch2_btree_delete_range(c, BTREE_ID_DIRENTS, + ret = bch2_btree_delete_range(c, BTREE_ID_XATTRS, POS(0, 0), POS(0, U64_MAX), NULL); BUG_ON(ret); @@ -37,14 +37,14 @@ static void test_delete(struct bch_fs *c, u64 nr) bch2_trans_init(&trans, c, 0, 0); - iter = bch2_trans_get_iter(&trans, BTREE_ID_DIRENTS, k.k.p, + iter = bch2_trans_get_iter(&trans, BTREE_ID_XATTRS, k.k.p, BTREE_ITER_INTENT); ret = bch2_btree_iter_traverse(iter); BUG_ON(ret); - bch2_trans_update(&trans, iter, &k.k_i, 0); - ret = bch2_trans_commit(&trans, NULL, NULL, 0); + ret = __bch2_trans_do(&trans, NULL, NULL, 0, + bch2_trans_update(&trans, iter, &k.k_i, 0)); BUG_ON(ret); pr_info("deleting once"); @@ -69,14 +69,14 @@ static void test_delete_written(struct bch_fs *c, u64 nr) bch2_trans_init(&trans, c, 0, 0); - iter = bch2_trans_get_iter(&trans, BTREE_ID_DIRENTS, k.k.p, + iter = bch2_trans_get_iter(&trans, BTREE_ID_XATTRS, k.k.p, BTREE_ITER_INTENT); ret = bch2_btree_iter_traverse(iter); BUG_ON(ret); - bch2_trans_update(&trans, iter, &k.k_i, 0); - ret = bch2_trans_commit(&trans, NULL, NULL, 0); + ret = __bch2_trans_do(&trans, NULL, NULL, 0, + bch2_trans_update(&trans, iter, &k.k_i, 0)); BUG_ON(ret); bch2_journal_flush_all_pins(&c->journal); @@ -107,7 +107,7 @@ static void test_iterate(struct bch_fs *c, u64 nr) bkey_cookie_init(&k.k_i); k.k.p.offset = i; - ret = bch2_btree_insert(c, BTREE_ID_DIRENTS, &k.k_i, + ret = bch2_btree_insert(c, BTREE_ID_XATTRS, &k.k_i, NULL, NULL, 0); BUG_ON(ret); } @@ -116,9 +116,13 @@ static void test_iterate(struct bch_fs *c, u64 nr) i = 0; - for_each_btree_key(&trans, iter, BTREE_ID_DIRENTS, - POS_MIN, 0, k, ret) + for_each_btree_key(&trans, iter, BTREE_ID_XATTRS, + POS_MIN, 0, k, ret) { + if (k.k->p.inode) + break; + BUG_ON(k.k->p.offset != i++); + } BUG_ON(i != nr); @@ -202,7 +206,7 @@ static void test_iterate_slots(struct bch_fs *c, u64 nr) bkey_cookie_init(&k.k_i); k.k.p.offset = i * 2; - ret = 
bch2_btree_insert(c, BTREE_ID_DIRENTS, &k.k_i, + ret = bch2_btree_insert(c, BTREE_ID_XATTRS, &k.k_i, NULL, NULL, 0); BUG_ON(ret); } @@ -211,8 +215,11 @@ static void test_iterate_slots(struct bch_fs *c, u64 nr) i = 0; - for_each_btree_key(&trans, iter, BTREE_ID_DIRENTS, POS_MIN, + for_each_btree_key(&trans, iter, BTREE_ID_XATTRS, POS_MIN, 0, k, ret) { + if (k.k->p.inode) + break; + BUG_ON(k.k->p.offset != i); i += 2; } @@ -224,11 +231,12 @@ static void test_iterate_slots(struct bch_fs *c, u64 nr) i = 0; - for_each_btree_key(&trans, iter, BTREE_ID_DIRENTS, POS_MIN, + for_each_btree_key(&trans, iter, BTREE_ID_XATTRS, POS_MIN, BTREE_ITER_SLOTS, k, ret) { + BUG_ON(k.k->p.offset != i); BUG_ON(bkey_deleted(k.k) != (i & 1)); - BUG_ON(k.k->p.offset != i++); + i++; if (i == nr * 2) break; } @@ -307,7 +315,7 @@ static void test_peek_end(struct bch_fs *c, u64 nr) bch2_trans_init(&trans, c, 0, 0); - iter = bch2_trans_get_iter(&trans, BTREE_ID_DIRENTS, POS_MIN, 0); + iter = bch2_trans_get_iter(&trans, BTREE_ID_XATTRS, POS_MIN, 0); k = bch2_btree_iter_peek(iter); BUG_ON(k.k); @@ -409,18 +417,24 @@ static u64 test_rand(void) static void rand_insert(struct bch_fs *c, u64 nr) { + struct btree_trans trans; struct bkey_i_cookie k; int ret; u64 i; + bch2_trans_init(&trans, c, 0, 0); + for (i = 0; i < nr; i++) { bkey_cookie_init(&k.k_i); k.k.p.offset = test_rand(); - ret = bch2_btree_insert(c, BTREE_ID_DIRENTS, &k.k_i, - NULL, NULL, 0); + ret = __bch2_trans_do(&trans, NULL, NULL, 0, + __bch2_btree_insert(&trans, BTREE_ID_XATTRS, &k.k_i)); + BUG_ON(ret); } + + bch2_trans_exit(&trans); } static void rand_lookup(struct bch_fs *c, u64 nr) @@ -433,7 +447,7 @@ static void rand_lookup(struct bch_fs *c, u64 nr) bch2_trans_init(&trans, c, 0, 0); for (i = 0; i < nr; i++) { - iter = bch2_trans_get_iter(&trans, BTREE_ID_DIRENTS, + iter = bch2_trans_get_iter(&trans, BTREE_ID_XATTRS, POS(0, test_rand()), 0); k = bch2_btree_iter_peek(iter); @@ -454,7 +468,7 @@ static void rand_mixed(struct bch_fs *c, u64 nr) bch2_trans_init(&trans, c, 0, 0); for (i = 0; i < nr; i++) { - iter = bch2_trans_get_iter(&trans, BTREE_ID_DIRENTS, + iter = bch2_trans_get_iter(&trans, BTREE_ID_XATTRS, POS(0, test_rand()), 0); k = bch2_btree_iter_peek(iter); @@ -465,8 +479,9 @@ static void rand_mixed(struct bch_fs *c, u64 nr) bkey_cookie_init(&k.k_i); k.k.p = iter->pos; - bch2_trans_update(&trans, iter, &k.k_i, 0); - ret = bch2_trans_commit(&trans, NULL, NULL, 0); + ret = __bch2_trans_do(&trans, NULL, NULL, 0, + bch2_trans_update(&trans, iter, &k.k_i, 0)); + BUG_ON(ret); } @@ -476,20 +491,50 @@ static void rand_mixed(struct bch_fs *c, u64 nr) bch2_trans_exit(&trans); } +static int __do_delete(struct btree_trans *trans, struct bpos pos) +{ + struct btree_iter *iter; + struct bkey_i delete; + struct bkey_s_c k; + int ret = 0; + + iter = bch2_trans_get_iter(trans, BTREE_ID_XATTRS, pos, + BTREE_ITER_INTENT); + ret = PTR_ERR_OR_ZERO(iter); + if (ret) + goto err; + + k = bch2_btree_iter_peek(iter); + ret = bkey_err(k); + if (ret) + goto err; + + bkey_init(&delete.k); + delete.k.p = k.k->p; + + bch2_trans_update(trans, iter, &delete, 0); +err: + bch2_trans_iter_put(trans, iter); + return ret; +} + static void rand_delete(struct bch_fs *c, u64 nr) { - struct bkey_i k; + struct btree_trans trans; int ret; u64 i; + bch2_trans_init(&trans, c, 0, 0); + for (i = 0; i < nr; i++) { - bkey_init(&k.k); - k.k.p.offset = test_rand(); + struct bpos pos = POS(0, test_rand()); - ret = bch2_btree_insert(c, BTREE_ID_DIRENTS, &k, - NULL, NULL, 0); + ret = 
__bch2_trans_do(&trans, NULL, NULL, 0,
+				      __do_delete(&trans, pos));
 		BUG_ON(ret);
 	}
+
+	bch2_trans_exit(&trans);
 }
 
 static void seq_insert(struct bch_fs *c, u64 nr)
@@ -505,12 +550,13 @@
 
 	bch2_trans_init(&trans, c, 0, 0);
 
-	for_each_btree_key(&trans, iter, BTREE_ID_DIRENTS, POS_MIN,
+	for_each_btree_key(&trans, iter, BTREE_ID_XATTRS, POS_MIN,
 			   BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) {
 		insert.k.p = iter->pos;
 
-		bch2_trans_update(&trans, iter, &insert.k_i, 0);
-		ret = bch2_trans_commit(&trans, NULL, NULL, 0);
+		ret = __bch2_trans_do(&trans, NULL, NULL, 0,
+			bch2_trans_update(&trans, iter, &insert.k_i, 0));
+
 		BUG_ON(ret);
 
 		if (++i == nr)
@@ -528,7 +574,7 @@ static void seq_lookup(struct bch_fs *c, u64 nr)
 
 	bch2_trans_init(&trans, c, 0, 0);
 
-	for_each_btree_key(&trans, iter, BTREE_ID_DIRENTS, POS_MIN, 0, k, ret)
+	for_each_btree_key(&trans, iter, BTREE_ID_XATTRS, POS_MIN, 0, k, ret)
 		;
 	bch2_trans_exit(&trans);
 }
@@ -542,14 +588,15 @@ static void seq_overwrite(struct bch_fs *c, u64 nr)
 
 	bch2_trans_init(&trans, c, 0, 0);
 
-	for_each_btree_key(&trans, iter, BTREE_ID_DIRENTS, POS_MIN,
+	for_each_btree_key(&trans, iter, BTREE_ID_XATTRS, POS_MIN,
 			   BTREE_ITER_INTENT, k, ret) {
 		struct bkey_i_cookie u;
 
 		bkey_reassemble(&u.k_i, k);
 
-		bch2_trans_update(&trans, iter, &u.k_i, 0);
-		ret = bch2_trans_commit(&trans, NULL, NULL, 0);
+		ret = __bch2_trans_do(&trans, NULL, NULL, 0,
+			bch2_trans_update(&trans, iter, &u.k_i, 0));
+
 		BUG_ON(ret);
 	}
 	bch2_trans_exit(&trans);
@@ -559,7 +606,7 @@ static void seq_delete(struct bch_fs *c, u64 nr)
 {
 	int ret;
 
-	ret = bch2_btree_delete_range(c, BTREE_ID_DIRENTS,
+	ret = bch2_btree_delete_range(c, BTREE_ID_XATTRS,
 				      POS(0, 0), POS(0, U64_MAX),
 				      NULL);
 	BUG_ON(ret);
diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h
index 2b19a0038045..0128daba5970 100644
--- a/fs/bcachefs/util.h
+++ b/fs/bcachefs/util.h
@@ -664,35 +664,6 @@ static inline void memset_u64s_tail(void *s, int c, unsigned bytes)
 	memset(s + bytes, c, rem);
 }
 
-static inline struct bio_vec next_contig_bvec(struct bio *bio,
-					      struct bvec_iter *iter)
-{
-	struct bio_vec bv = bio_iter_iovec(bio, *iter);
-
-	bio_advance_iter(bio, iter, bv.bv_len);
-#ifndef CONFIG_HIGHMEM
-	while (iter->bi_size) {
-		struct bio_vec next = bio_iter_iovec(bio, *iter);
-
-		if (page_address(bv.bv_page) + bv.bv_offset + bv.bv_len !=
-		    page_address(next.bv_page) + next.bv_offset)
-			break;
-
-		bv.bv_len += next.bv_len;
-		bio_advance_iter(bio, iter, next.bv_len);
-	}
-#endif
-	return bv;
-}
-
-#define __bio_for_each_contig_segment(bv, bio, iter, start)		\
-	for (iter = (start);						\
-	     (iter).bi_size &&						\
-		((bv = next_contig_bvec((bio), &(iter))), 1);)
-
-#define bio_for_each_contig_segment(bv, bio, iter)			\
-	__bio_for_each_contig_segment(bv, bio, iter, (bio)->bi_iter)
-
 void sort_cmp_size(void *base, size_t num, size_t size,
 		   int (*cmp_func)(const void *, const void *, size_t),
 		   void (*swap_func)(void *, void *, size_t));
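Note on the io.c/move.c/sysfs.c hunks above: the open-coded `crc.compression_type` tests are replaced with `crc_is_compressed()` so that the new BCH_COMPRESSION_TYPE_incompressible marker is treated like uncompressed data rather than like a real compression type. The helper's definition lands in fs/bcachefs/extents.h and is not visible in this diff; a sketch of its expected shape:

	static inline bool crc_is_compressed(struct bch_extent_crc_unpacked crc)
	{
		return crc.compression_type != BCH_COMPRESSION_TYPE_none &&
		       crc.compression_type != BCH_COMPRESSION_TYPE_incompressible;
	}

This is what lets bch2_write_extent() tag extents as incompressible without running the compressor, and why the read path, rebalance and the sysfs compression stats now treat such extents exactly like plain uncompressed data.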
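Note on the journal_reclaim.c changes: __journal_pin_add() and bch2_journal_pin_update() are folded into __bch2_journal_pin_add(), which now unconditionally drops any existing pin before re-adding it, and bch2_journal_pin_add_if_older() becomes bch2_journal_pin_copy() with the same "only ever move a pin backwards" semantics (it re-pins dst at src's sequence number only if src is active and older). The new inline bch2_journal_pin_add() wrapper in journal_reclaim.h is the fast path: when the pin is already active it skips taking j->lock entirely.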
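Note on journal_keys_sort() in recovery.c: the old implementation kept every journal key and resolved overlaps with cut_front/cut_back plus the sifting pass. The rewrite relies purely on ordering: journal_sort_key_cmp() sorts by (btree_id, level, pos) with equal positions ordered oldest-first, so the dedup loop's inner while skips past the older duplicates and `*dst++ = *src++` copies only the newest version of each key. For example, three updates to the same pos logged at journal seq 10, 12 and 15 end up adjacent after the sort, and only the seq-15 key survives into keys.d.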
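The btree_and_journal_iter declared in recovery.h above is the piece that replaces the old journal_iter: it merges a btree walk with the sorted journal keys, and on a position tie the peek path advances the btree side and returns the journal key, so journal keys shadow what is on disk. A minimal usage sketch follows; walk_one_btree() is hypothetical and only the bch2_btree_and_journal_iter_*() calls come from this patch:

	/* Hypothetical caller: walk one btree with journal keys overlaid. */
	static void walk_one_btree(struct btree_trans *trans,
				   struct journal_keys *journal_keys,
				   enum btree_id id)
	{
		struct btree_and_journal_iter iter;
		struct bkey_s_c k;

		bch2_btree_and_journal_iter_init(&iter, trans, journal_keys,
						 id, POS_MIN);

		while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) {
			/* k is the newest version of the key at this pos */
			bch2_btree_and_journal_iter_advance(&iter);
		}
	}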
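Note on the tests.c conversions: every open-coded bch2_trans_update() + bch2_trans_commit() pair becomes a __bch2_trans_do() call. The macro itself lives in btree_update.h, outside this diff; the sketch below shows only its assumed shape, a retry loop that re-runs the update expression and the commit until the transaction stops restarting with -EINTR:

	/* Sketch of the assumed retry-loop shape; see btree_update.h. */
	#define __bch2_trans_do(_trans, _disk_res, _journal_seq, _flags, _do)	\
	({									\
		int _ret;							\
										\
		while (1) {							\
			_ret = (_do) ?: bch2_trans_commit(_trans, (_disk_res),	\
						(_journal_seq), (_flags));	\
			if (_ret != -EINTR)					\
				break;						\
			bch2_trans_begin(_trans);				\
		}								\
		_ret;								\
	})

This is why the BUG_ON(ret) assertions in rand_mixed()/seq_insert() are safe: lock restarts are retried inside the macro instead of leaking -EINTR to the caller.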