author    Kent Overstreet <kent.overstreet@gmail.com>  2020-04-07 15:26:03 -0400
committer Kent Overstreet <kent.overstreet@gmail.com>  2020-05-06 17:14:18 -0400
commit    683ee194ef320abff86e8d1ced7993406e63896d
tree      48fa4bfaa1e33e6ef659e4862445fb35fbd300b4
parent    880fedbfc7edc4f1a0debc800a17d751cd71e87b

    Merge with e4871e8f27 bcachefs: Fix a deadlock on starting an interior btree update

    Signed-off-by: Kent Overstreet <kent.overstreet@gmail.com>
-rw-r--r--  fs/bcachefs/alloc_background.c      |  20
-rw-r--r--  fs/bcachefs/bcachefs.h              |  17
-rw-r--r--  fs/bcachefs/bcachefs_format.h       |  45
-rw-r--r--  fs/bcachefs/bkey.h                  |   4
-rw-r--r--  fs/bcachefs/bkey_methods.c          |  67
-rw-r--r--  fs/bcachefs/bkey_methods.h          |  23
-rw-r--r--  fs/bcachefs/bkey_sort.c             | 103
-rw-r--r--  fs/bcachefs/bset.c                  |  13
-rw-r--r--  fs/bcachefs/btree_cache.c           | 136
-rw-r--r--  fs/bcachefs/btree_cache.h           |  25
-rw-r--r--  fs/bcachefs/btree_gc.c              | 265
-rw-r--r--  fs/bcachefs/btree_io.c              | 230
-rw-r--r--  fs/bcachefs/btree_io.h              |  58
-rw-r--r--  fs/bcachefs/btree_iter.c            | 680
-rw-r--r--  fs/bcachefs/btree_iter.h            |  81
-rw-r--r--  fs/bcachefs/btree_types.h           |  26
-rw-r--r--  fs/bcachefs/btree_update.h          |  25
-rw-r--r--  fs/bcachefs/btree_update_interior.c | 575
-rw-r--r--  fs/bcachefs/btree_update_interior.h |  48
-rw-r--r--  fs/bcachefs/btree_update_leaf.c     | 343
-rw-r--r--  fs/bcachefs/buckets.c               |  42
-rw-r--r--  fs/bcachefs/buckets.h               |   5
-rw-r--r--  fs/bcachefs/checksum.c              |  34
-rw-r--r--  fs/bcachefs/checksum.h              |  13
-rw-r--r--  fs/bcachefs/compress.c              |  29
-rw-r--r--  fs/bcachefs/dirent.c                |  45
-rw-r--r--  fs/bcachefs/ec.c                    |  59
-rw-r--r--  fs/bcachefs/ec.h                    |   1
-rw-r--r--  fs/bcachefs/extent_update.c         | 420
-rw-r--r--  fs/bcachefs/extent_update.h         |   5
-rw-r--r--  fs/bcachefs/extents.c               |  85
-rw-r--r--  fs/bcachefs/extents.h               |  30
-rw-r--r--  fs/bcachefs/fs-common.c             | 126
-rw-r--r--  fs/bcachefs/fs-io.c                 | 292
-rw-r--r--  fs/bcachefs/fs-io.h                 |   4
-rw-r--r--  fs/bcachefs/fs.c                    |  20
-rw-r--r--  fs/bcachefs/fsck.c                  | 111
-rw-r--r--  fs/bcachefs/inode.c                 |  84
-rw-r--r--  fs/bcachefs/io.c                    | 122
-rw-r--r--  fs/bcachefs/io.h                    |   6
-rw-r--r--  fs/bcachefs/io_types.h              |   3
-rw-r--r--  fs/bcachefs/journal.c               |   3
-rw-r--r--  fs/bcachefs/journal_io.c            |  44
-rw-r--r--  fs/bcachefs/journal_reclaim.c       |  86
-rw-r--r--  fs/bcachefs/journal_reclaim.h       |  26
-rw-r--r--  fs/bcachefs/migrate.c               |   6
-rw-r--r--  fs/bcachefs/move.c                  |  26
-rw-r--r--  fs/bcachefs/opts.h                  |   5
-rw-r--r--  fs/bcachefs/rebalance.c             |  94
-rw-r--r--  fs/bcachefs/recovery.c              | 399
-rw-r--r--  fs/bcachefs/recovery.h              |  54
-rw-r--r--  fs/bcachefs/reflink.c               |   7
-rw-r--r--  fs/bcachefs/reflink.h               |   1
-rw-r--r--  fs/bcachefs/replicas.c              |   1
-rw-r--r--  fs/bcachefs/str_hash.h              |  15
-rw-r--r--  fs/bcachefs/super-io.c              |   5
-rw-r--r--  fs/bcachefs/super.c                 |  13
-rw-r--r--  fs/bcachefs/sysfs.c                 |   2
-rw-r--r--  fs/bcachefs/tests.c                 | 115
-rw-r--r--  fs/bcachefs/util.h                  |  29
60 files changed, 2748 insertions(+), 2503 deletions(-)
diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c
index c57df50168e0..b2d1b8f9c9b8 100644
--- a/fs/bcachefs/alloc_background.c
+++ b/fs/bcachefs/alloc_background.c
@@ -211,33 +211,31 @@ void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c,
int bch2_alloc_read(struct bch_fs *c, struct journal_keys *journal_keys)
{
struct btree_trans trans;
- struct btree_iter *iter;
+ struct btree_and_journal_iter iter;
struct bkey_s_c k;
struct bch_dev *ca;
- struct journal_key *j;
unsigned i;
- int ret;
+ int ret = 0;
bch2_trans_init(&trans, c, 0, 0);
- for_each_btree_key(&trans, iter, BTREE_ID_ALLOC, POS_MIN, 0, k, ret)
+ bch2_btree_and_journal_iter_init(&iter, &trans, journal_keys,
+ BTREE_ID_ALLOC, POS_MIN);
+
+ while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) {
bch2_mark_key(c, k, 0, 0, NULL, 0,
BTREE_TRIGGER_ALLOC_READ|
BTREE_TRIGGER_NOATOMIC);
+ bch2_btree_and_journal_iter_advance(&iter);
+ }
+
ret = bch2_trans_exit(&trans) ?: ret;
if (ret) {
bch_err(c, "error reading alloc info: %i", ret);
return ret;
}
- for_each_journal_key(*journal_keys, j)
- if (j->btree_id == BTREE_ID_ALLOC)
- bch2_mark_key(c, bkey_i_to_s_c(j->k),
- 0, 0, NULL, 0,
- BTREE_TRIGGER_ALLOC_READ|
- BTREE_TRIGGER_NOATOMIC);
-
percpu_down_write(&c->mark_lock);
bch2_dev_usage_from_buckets(c);
percpu_up_write(&c->mark_lock);
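
The hunk above is representative of the wider change in this merge: instead of reading the btree and then making a second pass over keys still sitting in the journal, callers walk a single iterator that merges the two position-sorted streams, preferring the journal copy when both sources have a key at the same position, since the journal copy is newer. A rough, self-contained sketch of that merge step follows; the types and helpers are illustrative, not the bcachefs API.

/*
 * Sketch: merging two position-sorted key streams, preferring the second
 * ("journal") stream on a tie because its copy is newer.  Illustrative
 * types and helpers only - not the bcachefs API.
 */
#include <stddef.h>

struct demo_pos { unsigned long long inode, offset; };
struct demo_key { struct demo_pos p; int val; };

static int demo_pos_cmp(struct demo_pos a, struct demo_pos b)
{
        if (a.inode != b.inode)
                return a.inode < b.inode ? -1 : 1;
        if (a.offset != b.offset)
                return a.offset < b.offset ? -1 : 1;
        return 0;
}

/* Return the next key to process, advancing whichever stream supplied it
 * (both on a tie, so the superseded btree copy is skipped): */
static const struct demo_key *
demo_merged_next(const struct demo_key *btree, size_t *bi, size_t bn,
                 const struct demo_key *journal, size_t *ji, size_t jn)
{
        const struct demo_key *b = *bi < bn ? &btree[*bi]   : NULL;
        const struct demo_key *j = *ji < jn ? &journal[*ji] : NULL;
        int cmp;

        if (!b && !j)
                return NULL;
        if (!j)
                return &btree[(*bi)++];
        if (!b)
                return &journal[(*ji)++];

        cmp = demo_pos_cmp(b->p, j->p);
        if (cmp < 0)
                return &btree[(*bi)++];
        if (!cmp)
                (*bi)++;        /* journal key overrides the btree copy */
        return &journal[(*ji)++];
}

The real bch2_btree_and_journal_iter presumably walks a btree iterator together with the sorted journal_keys array added to struct bch_fs below, but the shape of the loop in bch2_alloc_read() is the same: peek, mark, advance.
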
diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h
index be2c1ed9fcb2..fa9593764f0c 100644
--- a/fs/bcachefs/bcachefs.h
+++ b/fs/bcachefs/bcachefs.h
@@ -521,6 +521,18 @@ struct journal_seq_blacklist_table {
} entries[0];
};
+struct journal_keys {
+ struct journal_key {
+ enum btree_id btree_id:8;
+ unsigned level:8;
+ struct bkey_i *k;
+ u32 journal_seq;
+ u32 journal_offset;
+ } *d;
+ size_t nr;
+ u64 journal_seq_base;
+};
+
struct bch_fs {
struct closure cl;
@@ -608,6 +620,7 @@ struct bch_fs {
mempool_t btree_interior_update_pool;
struct list_head btree_interior_update_list;
+ struct list_head btree_interior_updates_unwritten;
struct mutex btree_interior_update_lock;
struct closure_waitlist btree_interior_update_wait;
@@ -719,7 +732,7 @@ struct bch_fs {
ZSTD_parameters zstd_params;
struct crypto_shash *sha256;
- struct crypto_skcipher *chacha20;
+ struct crypto_sync_skcipher *chacha20;
struct crypto_shash *poly1305;
atomic64_t key_version;
@@ -786,6 +799,8 @@ struct bch_fs {
mempool_t btree_bounce_pool;
struct journal journal;
+ struct list_head journal_entries;
+ struct journal_keys journal_keys;
u64 last_bucket_seq_cleanup;
diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h
index f6141fde830b..616863ef77d4 100644
--- a/fs/bcachefs/bcachefs_format.h
+++ b/fs/bcachefs/bcachefs_format.h
@@ -339,7 +339,8 @@ static inline void bkey_init(struct bkey *k)
x(stripe, 14) \
x(reflink_p, 15) \
x(reflink_v, 16) \
- x(inline_data, 17)
+ x(inline_data, 17) \
+ x(btree_ptr_v2, 18)
enum bch_bkey_type {
#define x(name, nr) KEY_TYPE_##name = nr,
@@ -595,6 +596,19 @@ struct bch_btree_ptr {
__u64 _data[0];
} __attribute__((packed, aligned(8)));
+struct bch_btree_ptr_v2 {
+ struct bch_val v;
+
+ __u64 mem_ptr;
+ __le64 seq;
+ __le16 sectors_written;
+ /* In case we ever decide to do variable size btree nodes: */
+ __le16 sectors;
+ struct bpos min_key;
+ struct bch_extent_ptr start[0];
+ __u64 _data[0];
+} __attribute__((packed, aligned(8)));
+
struct bch_extent {
struct bch_val v;
@@ -626,7 +640,8 @@ struct bch_reservation {
/* Btree pointers don't carry around checksums: */
#define BKEY_BTREE_PTR_VAL_U64s_MAX \
- ((sizeof(struct bch_extent_ptr)) / sizeof(u64) * BCH_REPLICAS_MAX)
+ ((sizeof(struct bch_btree_ptr_v2) + \
+ sizeof(struct bch_extent_ptr) * BCH_REPLICAS_MAX) / sizeof(u64))
#define BKEY_BTREE_PTR_U64s_MAX \
(BKEY_U64s + BKEY_BTREE_PTR_VAL_U64s_MAX)
@@ -1141,7 +1156,8 @@ enum bcachefs_metadata_version {
bcachefs_metadata_version_min = 9,
bcachefs_metadata_version_new_versioning = 10,
bcachefs_metadata_version_bkey_renumber = 10,
- bcachefs_metadata_version_max = 11,
+ bcachefs_metadata_version_inode_btree_change = 11,
+ bcachefs_metadata_version_max = 12,
};
#define bcachefs_metadata_version_current (bcachefs_metadata_version_max - 1)
@@ -1294,7 +1310,17 @@ LE64_BITMASK(BCH_SB_ERASURE_CODE, struct bch_sb, flags[3], 0, 16);
x(reflink, 6) \
x(new_siphash, 7) \
x(inline_data, 8) \
- x(new_extent_overwrite, 9)
+ x(new_extent_overwrite, 9) \
+ x(incompressible, 10) \
+ x(btree_ptr_v2, 11) \
+ x(extents_above_btree_updates, 12) \
+ x(btree_updates_journalled, 13)
+
+#define BCH_SB_FEATURES_ALL \
+ ((1ULL << BCH_FEATURE_new_siphash)| \
+ (1ULL << BCH_FEATURE_new_extent_overwrite)| \
+ (1ULL << BCH_FEATURE_btree_ptr_v2)| \
+ (1ULL << BCH_FEATURE_extents_above_btree_updates))
enum bch_sb_feature {
#define x(f, n) BCH_FEATURE_##f,
@@ -1374,11 +1400,12 @@ enum bch_csum_opts {
};
#define BCH_COMPRESSION_TYPES() \
- x(none, 0) \
- x(lz4_old, 1) \
- x(gzip, 2) \
- x(lz4, 3) \
- x(zstd, 4)
+ x(none, 0) \
+ x(lz4_old, 1) \
+ x(gzip, 2) \
+ x(lz4, 3) \
+ x(zstd, 4) \
+ x(incompressible, 5)
enum bch_compression_type {
#define x(t, n) BCH_COMPRESSION_TYPE_##t,
diff --git a/fs/bcachefs/bkey.h b/fs/bcachefs/bkey.h
index f2d5f3009b21..cbcfbd26bc58 100644
--- a/fs/bcachefs/bkey.h
+++ b/fs/bcachefs/bkey.h
@@ -400,9 +400,8 @@ static inline int bch2_compile_bkey_format(const struct bkey_format *format,
static inline void bkey_reassemble(struct bkey_i *dst,
struct bkey_s_c src)
{
- BUG_ON(bkey_packed(src.k));
dst->k = *src.k;
- memcpy_u64s(&dst->v, src.v, bkey_val_u64s(src.k));
+ memcpy_u64s_small(&dst->v, src.v, bkey_val_u64s(src.k));
}
#define bkey_s_null ((struct bkey_s) { .k = NULL })
@@ -565,6 +564,7 @@ BKEY_VAL_ACCESSORS(stripe);
BKEY_VAL_ACCESSORS(reflink_p);
BKEY_VAL_ACCESSORS(reflink_v);
BKEY_VAL_ACCESSORS(inline_data);
+BKEY_VAL_ACCESSORS(btree_ptr_v2);
/* byte order helpers */
diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c
index 320e17d108d2..c97e1e9002cb 100644
--- a/fs/bcachefs/bkey_methods.c
+++ b/fs/bcachefs/bkey_methods.c
@@ -134,7 +134,7 @@ const char *bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k,
const char *bch2_bkey_in_btree_node(struct btree *b, struct bkey_s_c k)
{
- if (bkey_cmp(bkey_start_pos(k.k), b->data->min_key) < 0)
+ if (bkey_cmp(k.k->p, b->data->min_key) < 0)
return "key before start of btree node";
if (bkey_cmp(k.k->p, b->data->max_key) > 0)
@@ -202,15 +202,12 @@ void bch2_bkey_val_to_text(struct printbuf *out, struct bch_fs *c,
bch2_val_to_text(out, c, k);
}
-void bch2_bkey_swab(const struct bkey_format *f,
- struct bkey_packed *k)
+void bch2_bkey_swab_val(struct bkey_s k)
{
- const struct bkey_ops *ops = &bch2_bkey_ops[k->type];
-
- bch2_bkey_swab_key(f, k);
+ const struct bkey_ops *ops = &bch2_bkey_ops[k.k->type];
if (ops->swab)
- ops->swab(f, k);
+ ops->swab(k);
}
bool bch2_bkey_normalize(struct bch_fs *c, struct bkey_s k)
@@ -276,3 +273,59 @@ void bch2_bkey_renumber(enum btree_node_type btree_node_type,
break;
}
}
+
+void __bch2_bkey_compat(unsigned level, enum btree_id btree_id,
+ unsigned version, unsigned big_endian,
+ int write,
+ struct bkey_format *f,
+ struct bkey_packed *k)
+{
+ const struct bkey_ops *ops;
+ struct bkey uk;
+ struct bkey_s u;
+
+ if (big_endian != CPU_BIG_ENDIAN)
+ bch2_bkey_swab_key(f, k);
+
+ if (version < bcachefs_metadata_version_bkey_renumber)
+ bch2_bkey_renumber(__btree_node_type(level, btree_id), k, write);
+
+ if (version < bcachefs_metadata_version_inode_btree_change &&
+ btree_id == BTREE_ID_INODES) {
+ if (!bkey_packed(k)) {
+ struct bkey_i *u = packed_to_bkey(k);
+ swap(u->k.p.inode, u->k.p.offset);
+ } else if (f->bits_per_field[BKEY_FIELD_INODE] &&
+ f->bits_per_field[BKEY_FIELD_OFFSET]) {
+ struct bkey_format tmp = *f, *in = f, *out = &tmp;
+
+ swap(tmp.bits_per_field[BKEY_FIELD_INODE],
+ tmp.bits_per_field[BKEY_FIELD_OFFSET]);
+ swap(tmp.field_offset[BKEY_FIELD_INODE],
+ tmp.field_offset[BKEY_FIELD_OFFSET]);
+
+ if (!write)
+ swap(in, out);
+
+ uk = __bch2_bkey_unpack_key(in, k);
+ swap(uk.p.inode, uk.p.offset);
+ BUG_ON(!bch2_bkey_pack_key(k, &uk, out));
+ }
+ }
+
+ if (!bkey_packed(k)) {
+ u = bkey_i_to_s(packed_to_bkey(k));
+ } else {
+ uk = __bch2_bkey_unpack_key(f, k);
+ u.k = &uk;
+ u.v = bkeyp_val(f, k);
+ }
+
+ if (big_endian != CPU_BIG_ENDIAN)
+ bch2_bkey_swab_val(u);
+
+ ops = &bch2_bkey_ops[k->type];
+
+ if (ops->compat)
+ ops->compat(btree_id, version, big_endian, write, u);
+}
diff --git a/fs/bcachefs/bkey_methods.h b/fs/bcachefs/bkey_methods.h
index 8568b65c1ed2..0bca725ae3b8 100644
--- a/fs/bcachefs/bkey_methods.h
+++ b/fs/bcachefs/bkey_methods.h
@@ -29,10 +29,13 @@ struct bkey_ops {
void (*key_debugcheck)(struct bch_fs *, struct bkey_s_c);
void (*val_to_text)(struct printbuf *, struct bch_fs *,
struct bkey_s_c);
- void (*swab)(const struct bkey_format *, struct bkey_packed *);
+ void (*swab)(struct bkey_s);
bool (*key_normalize)(struct bch_fs *, struct bkey_s);
enum merge_result (*key_merge)(struct bch_fs *,
struct bkey_s, struct bkey_s);
+ void (*compat)(enum btree_id id, unsigned version,
+ unsigned big_endian, int write,
+ struct bkey_s);
};
const char *bch2_bkey_val_invalid(struct bch_fs *, struct bkey_s_c);
@@ -51,7 +54,7 @@ void bch2_val_to_text(struct printbuf *, struct bch_fs *,
void bch2_bkey_val_to_text(struct printbuf *, struct bch_fs *,
struct bkey_s_c);
-void bch2_bkey_swab(const struct bkey_format *, struct bkey_packed *);
+void bch2_bkey_swab_val(struct bkey_s);
bool bch2_bkey_normalize(struct bch_fs *, struct bkey_s);
@@ -60,4 +63,20 @@ enum merge_result bch2_bkey_merge(struct bch_fs *,
void bch2_bkey_renumber(enum btree_node_type, struct bkey_packed *, int);
+void __bch2_bkey_compat(unsigned, enum btree_id, unsigned, unsigned,
+ int, struct bkey_format *, struct bkey_packed *);
+
+static inline void bch2_bkey_compat(unsigned level, enum btree_id btree_id,
+ unsigned version, unsigned big_endian,
+ int write,
+ struct bkey_format *f,
+ struct bkey_packed *k)
+{
+ if (version < bcachefs_metadata_version_current ||
+ big_endian != CPU_BIG_ENDIAN)
+ __bch2_bkey_compat(level, btree_id, version,
+ big_endian, write, f, k);
+
+}
+
#endif /* _BCACHEFS_BKEY_METHODS_H */
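
The bch2_bkey_compat() wrapper above is a gate around the slow path: the full transform in __bch2_bkey_compat() only runs when the data was written by an older metadata version or with the other byte order, so reading freshly written, native-endian bsets costs a single comparison per key. A minimal sketch of that pattern, with illustrative names rather than the real API:

/*
 * Sketch of the fast-path gate: the full transform only runs when the bset
 * was written by an older metadata version or with foreign endianness.
 * Illustrative names only - not the bcachefs API.
 */
#define DEMO_VERSION_CURRENT    12
#define DEMO_CPU_BIG_ENDIAN     0       /* little-endian host assumed for the demo */

struct demo_packed_key;

void demo_bkey_compat_slow(unsigned version, unsigned big_endian, int write,
                           struct demo_packed_key *k);

static inline void demo_bkey_compat(unsigned version, unsigned big_endian,
                                    int write, struct demo_packed_key *k)
{
        if (version < DEMO_VERSION_CURRENT ||
            big_endian != DEMO_CPU_BIG_ENDIAN)
                demo_bkey_compat_slow(version, big_endian, write, k);
}
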
diff --git a/fs/bcachefs/bkey_sort.c b/fs/bcachefs/bkey_sort.c
index 7cbb57042af1..839e78d1dc35 100644
--- a/fs/bcachefs/bkey_sort.c
+++ b/fs/bcachefs/bkey_sort.c
@@ -130,44 +130,21 @@ bch2_key_sort_fix_overlapping(struct bch_fs *c, struct bset *dst,
return nr;
}
-static void extent_sort_advance_prev(struct bkey_format *f,
- struct btree_nr_keys *nr,
- struct bkey_packed *start,
- struct bkey_packed **prev)
-{
- if (*prev) {
- bch2_bkey_pack(*prev, (void *) *prev, f);
-
- btree_keys_account_key_add(nr, 0, *prev);
- *prev = bkey_next(*prev);
- } else {
- *prev = start;
- }
-}
-
static void extent_sort_append(struct bch_fs *c,
struct bkey_format *f,
struct btree_nr_keys *nr,
- struct bkey_packed *start,
- struct bkey_packed **prev,
+ struct bkey_packed **out,
struct bkey_s k)
{
- if (bkey_whiteout(k.k))
- return;
-
- /*
- * prev is always unpacked, for key merging - until right before we
- * advance it:
- */
+ if (!bkey_whiteout(k.k)) {
+ if (!bch2_bkey_pack_key(*out, k.k, f))
+ memcpy_u64s_small(*out, k.k, BKEY_U64s);
- if (*prev &&
- bch2_bkey_merge(c, bkey_i_to_s((void *) *prev), k) ==
- BCH_MERGE_MERGE)
- return;
+ memcpy_u64s_small(bkeyp_val(f, *out), k.v, bkey_val_u64s(k.k));
- extent_sort_advance_prev(f, nr, start, prev);
-
- bkey_reassemble((void *) *prev, k.s_c);
+ btree_keys_account_key_add(nr, 0, *out);
+ *out = bkey_next(*out);
+ }
}
/* Sort + repack in a new format: */
@@ -201,7 +178,7 @@ bch2_sort_repack(struct bset *dst, struct btree *src,
return nr;
}
-/* Sort, repack, and merge: */
+/* Sort, repack, and call bch2_bkey_normalize() to drop stale pointers: */
struct btree_nr_keys
bch2_sort_repack_merge(struct bch_fs *c,
struct bset *dst, struct btree *src,
@@ -209,7 +186,7 @@ bch2_sort_repack_merge(struct bch_fs *c,
struct bkey_format *out_f,
bool filter_whiteouts)
{
- struct bkey_packed *prev = NULL, *k_packed;
+ struct bkey_packed *out = vstruct_last(dst), *k_packed;
struct bkey_on_stack k;
struct btree_nr_keys nr;
@@ -234,13 +211,10 @@ bch2_sort_repack_merge(struct bch_fs *c,
bch2_bkey_normalize(c, bkey_i_to_s(k.k)))
continue;
- extent_sort_append(c, out_f, &nr, vstruct_last(dst),
- &prev, bkey_i_to_s(k.k));
+ extent_sort_append(c, out_f, &nr, &out, bkey_i_to_s(k.k));
}
- extent_sort_advance_prev(out_f, &nr, vstruct_last(dst), &prev);
-
- dst->u64s = cpu_to_le16((u64 *) prev - dst->_data);
+ dst->u64s = cpu_to_le16((u64 *) out - dst->_data);
bkey_on_stack_exit(&k, c);
return nr;
}
@@ -311,6 +285,25 @@ static inline int extent_sort_fix_overlapping_cmp(struct btree *b,
cmp_int((unsigned long) r, (unsigned long) l);
}
+/*
+ * The algorithm in extent_sort_fix_overlapping() relies on keys in the same
+ * bset being ordered by start offset - but 0 size whiteouts (which are always
+ * KEY_TYPE_deleted) break this ordering, so we need to skip over them:
+ */
+static void extent_iter_advance(struct sort_iter *iter, unsigned idx)
+{
+ struct sort_iter_set *i = iter->data + idx;
+
+ do {
+ i->k = bkey_next_skip_noops(i->k, i->end);
+ } while (i->k != i->end && bkey_deleted(i->k));
+
+ if (i->k == i->end)
+ array_remove_item(iter->data, iter->used, idx);
+ else
+ __sort_iter_sift(iter, idx, extent_sort_fix_overlapping_cmp);
+}
+
struct btree_nr_keys
bch2_extent_sort_fix_overlapping(struct bch_fs *c, struct bset *dst,
struct sort_iter *iter)
@@ -318,24 +311,31 @@ bch2_extent_sort_fix_overlapping(struct bch_fs *c, struct bset *dst,
struct btree *b = iter->b;
struct bkey_format *f = &b->format;
struct sort_iter_set *_l = iter->data, *_r = iter->data + 1;
- struct bkey_packed *prev = NULL;
+ struct bkey_packed *out = dst->start;
struct bkey l_unpacked, r_unpacked;
struct bkey_s l, r;
struct btree_nr_keys nr;
struct bkey_on_stack split;
+ unsigned i;
memset(&nr, 0, sizeof(nr));
bkey_on_stack_init(&split);
sort_iter_sort(iter, extent_sort_fix_overlapping_cmp);
+ for (i = 0; i < iter->used;) {
+ if (bkey_deleted(iter->data[i].k))
+ __sort_iter_advance(iter, i,
+ extent_sort_fix_overlapping_cmp);
+ else
+ i++;
+ }
while (!sort_iter_end(iter)) {
l = __bkey_disassemble(b, _l->k, &l_unpacked);
if (iter->used == 1) {
- extent_sort_append(c, f, &nr, dst->start, &prev, l);
- sort_iter_advance(iter,
- extent_sort_fix_overlapping_cmp);
+ extent_sort_append(c, f, &nr, &out, l);
+ extent_iter_advance(iter, 0);
continue;
}
@@ -343,16 +343,14 @@ bch2_extent_sort_fix_overlapping(struct bch_fs *c, struct bset *dst,
/* If current key and next key don't overlap, just append */
if (bkey_cmp(l.k->p, bkey_start_pos(r.k)) <= 0) {
- extent_sort_append(c, f, &nr, dst->start, &prev, l);
- sort_iter_advance(iter,
- extent_sort_fix_overlapping_cmp);
+ extent_sort_append(c, f, &nr, &out, l);
+ extent_iter_advance(iter, 0);
continue;
}
/* Skip 0 size keys */
if (!r.k->size) {
- __sort_iter_advance(iter, 1,
- extent_sort_fix_overlapping_cmp);
+ extent_iter_advance(iter, 1);
continue;
}
@@ -369,8 +367,7 @@ bch2_extent_sort_fix_overlapping(struct bch_fs *c, struct bset *dst,
if (_l->k > _r->k) {
/* l wins, trim r */
if (bkey_cmp(l.k->p, r.k->p) >= 0) {
- __sort_iter_advance(iter, 1,
- extent_sort_fix_overlapping_cmp);
+ extent_iter_advance(iter, 1);
} else {
bch2_cut_front_s(l.k->p, r);
extent_save(b, _r->k, r.k);
@@ -391,17 +388,15 @@ bch2_extent_sort_fix_overlapping(struct bch_fs *c, struct bset *dst,
__sort_iter_sift(iter, 0,
extent_sort_fix_overlapping_cmp);
- extent_sort_append(c, f, &nr, dst->start,
- &prev, bkey_i_to_s(split.k));
+ extent_sort_append(c, f, &nr, &out,
+ bkey_i_to_s(split.k));
} else {
bch2_cut_back_s(bkey_start_pos(r.k), l);
extent_save(b, _l->k, l.k);
}
}
- extent_sort_advance_prev(f, &nr, dst->start, &prev);
-
- dst->u64s = cpu_to_le16((u64 *) prev - dst->_data);
+ dst->u64s = cpu_to_le16((u64 *) out - dst->_data);
bkey_on_stack_exit(&split, c);
return nr;
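
The rewritten sort path above appends keys directly into the destination bset: the key header is packed into the node's key format when it fits and copied unpacked otherwise, the value is copied after it, and the output cursor advances either way (the unpacked "prev" buffer and merge step are gone). A condensed sketch of that pack-or-fallback append, with fixed-size keys for simplicity and illustrative types rather than the real bkey/bset layout:

/*
 * Sketch of the pack-or-fallback append: try to pack the key into the node's
 * format, store it unpacked if it doesn't fit, advance the output cursor
 * either way.  The real code copies the key header and value separately and
 * advances by the key's actual size.  Illustrative types only.
 */
#include <stdbool.h>
#include <string.h>

struct demo_format;
struct demo_key { unsigned u64s; unsigned long long d[8]; };

bool demo_pack_key(struct demo_key *dst, const struct demo_key *src,
                   const struct demo_format *f);

static struct demo_key *demo_append(struct demo_key *out,
                                    const struct demo_key *k,
                                    const struct demo_format *f)
{
        if (!demo_pack_key(out, k, f))
                memcpy(out, k, sizeof(*k));     /* fallback: store unpacked */

        return out + 1;                         /* advance output cursor */
}
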
diff --git a/fs/bcachefs/bset.c b/fs/bcachefs/bset.c
index cf8fa59fada1..6360b2e8cf73 100644
--- a/fs/bcachefs/bset.c
+++ b/fs/bcachefs/bset.c
@@ -79,8 +79,8 @@ void bch2_dump_bset(struct btree *b, struct bset *i, unsigned set)
_n = bkey_next_skip_noops(_k, vstruct_last(i));
bch2_bkey_to_text(&PBUF(buf), &k);
- printk(KERN_ERR "block %u key %5u: %s\n", set,
- __btree_node_key_to_offset(b, _k), buf);
+ printk(KERN_ERR "block %u key %5zu: %s\n", set,
+ _k->_data - i->_data, buf);
if (_n == vstruct_last(i))
continue;
@@ -1206,7 +1206,8 @@ void bch2_bset_insert(struct btree *b,
memcpy_u64s(bkeyp_val(f, where), &insert->v,
bkeyp_val_u64s(f, src));
- bch2_bset_fix_lookup_table(b, t, where, clobber_u64s, src->u64s);
+ if (src->u64s != clobber_u64s)
+ bch2_bset_fix_lookup_table(b, t, where, clobber_u64s, src->u64s);
bch2_verify_btree_nr_keys(b);
}
@@ -1681,7 +1682,8 @@ struct bkey_packed *bch2_btree_node_iter_prev_all(struct btree_node_iter *iter,
struct bset_tree *t;
unsigned end = 0;
- bch2_btree_node_iter_verify(iter, b);
+ if (btree_keys_expensive_checks(b))
+ bch2_btree_node_iter_verify(iter, b);
for_each_bset(b, t) {
k = bch2_bkey_prev_all(b, t,
@@ -1716,7 +1718,8 @@ found:
iter->data[0].k = __btree_node_key_to_offset(b, prev);
iter->data[0].end = end;
- bch2_btree_node_iter_verify(iter, b);
+ if (btree_keys_expensive_checks(b))
+ bch2_btree_node_iter_verify(iter, b);
return prev;
}
diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c
index 0c737f35f430..c12f8a6b5205 100644
--- a/fs/bcachefs/btree_cache.c
+++ b/fs/bcachefs/btree_cache.c
@@ -62,13 +62,13 @@ static int bch2_btree_cache_cmp_fn(struct rhashtable_compare_arg *arg,
const struct btree *b = obj;
const u64 *v = arg->key;
- return PTR_HASH(&b->key) == *v ? 0 : 1;
+ return b->hash_val == *v ? 0 : 1;
}
static const struct rhashtable_params bch_btree_cache_params = {
.head_offset = offsetof(struct btree, hash),
- .key_offset = offsetof(struct btree, key.v),
- .key_len = sizeof(struct bch_extent_ptr),
+ .key_offset = offsetof(struct btree, hash_val),
+ .key_len = sizeof(u64),
.obj_cmpfn = bch2_btree_cache_cmp_fn,
};
@@ -114,11 +114,14 @@ void bch2_btree_node_hash_remove(struct btree_cache *bc, struct btree *b)
rhashtable_remove_fast(&bc->table, &b->hash, bch_btree_cache_params);
/* Cause future lookups for this node to fail: */
- PTR_HASH(&b->key) = 0;
+ b->hash_val = 0;
}
int __bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b)
{
+ BUG_ON(b->hash_val);
+ b->hash_val = btree_ptr_hash_val(&b->key);
+
return rhashtable_lookup_insert_fast(&bc->table, &b->hash,
bch_btree_cache_params);
}
@@ -144,8 +147,9 @@ __flatten
static inline struct btree *btree_cache_find(struct btree_cache *bc,
const struct bkey_i *k)
{
- return rhashtable_lookup_fast(&bc->table, &PTR_HASH(k),
- bch_btree_cache_params);
+ u64 v = btree_ptr_hash_val(k);
+
+ return rhashtable_lookup_fast(&bc->table, &v, bch_btree_cache_params);
}
/*
@@ -199,7 +203,7 @@ static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush)
btree_node_wait_on_io(b);
}
out:
- if (PTR_HASH(&b->key) && !ret)
+ if (b->hash_val && !ret)
trace_btree_node_reap(c, b);
return ret;
out_unlock:
@@ -584,6 +588,7 @@ err:
static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c,
struct btree_iter *iter,
const struct bkey_i *k,
+ enum btree_id btree_id,
unsigned level,
enum six_lock_type lock_type,
bool sync)
@@ -591,23 +596,24 @@ static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c,
struct btree_cache *bc = &c->btree_cache;
struct btree *b;
+ BUG_ON(level + 1 >= BTREE_MAX_DEPTH);
/*
* Parent node must be locked, else we could read in a btree node that's
* been freed:
*/
- BUG_ON(!btree_node_locked(iter, level + 1));
- BUG_ON(level >= BTREE_MAX_DEPTH);
+ if (iter && !bch2_btree_node_relock(iter, level + 1))
+ return ERR_PTR(-EINTR);
b = bch2_btree_node_mem_alloc(c);
if (IS_ERR(b))
return b;
bkey_copy(&b->key, k);
- if (bch2_btree_node_hash_insert(bc, b, level, iter->btree_id)) {
+ if (bch2_btree_node_hash_insert(bc, b, level, btree_id)) {
/* raced with another fill: */
/* mark as unhashed... */
- PTR_HASH(&b->key) = 0;
+ b->hash_val = 0;
mutex_lock(&bc->lock);
list_add(&b->list, &bc->freeable);
@@ -619,15 +625,11 @@ static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c,
}
/*
- * If the btree node wasn't cached, we can't drop our lock on
- * the parent until after it's added to the cache - because
- * otherwise we could race with a btree_split() freeing the node
- * we're trying to lock.
+ * Unlock before doing IO:
*
- * But the deadlock described below doesn't exist in this case,
- * so it's safe to not drop the parent lock until here:
+ * XXX: ideally should be dropping all btree node locks here
*/
- if (btree_node_read_locked(iter, level + 1))
+ if (iter && btree_node_read_locked(iter, level + 1))
btree_node_unlock(iter, level + 1);
bch2_btree_node_read(c, b, sync);
@@ -662,16 +664,11 @@ struct btree *bch2_btree_node_get(struct bch_fs *c, struct btree_iter *iter,
struct btree *b;
struct bset_tree *t;
- /*
- * XXX: locking optimization
- *
- * we can make the locking looser here - caller can drop lock on parent
- * node before locking child node (and potentially blocking): we just
- * have to have bch2_btree_node_fill() call relock on the parent and
- * return -EINTR if that fails
- */
- EBUG_ON(!btree_node_locked(iter, level + 1));
EBUG_ON(level >= BTREE_MAX_DEPTH);
+
+ b = btree_node_mem_ptr(k);
+ if (b)
+ goto lock_node;
retry:
b = btree_cache_find(bc, k);
if (unlikely(!b)) {
@@ -680,7 +677,8 @@ retry:
* else we could read in a btree node from disk that's been
* freed:
*/
- b = bch2_btree_node_fill(c, iter, k, level, lock_type, true);
+ b = bch2_btree_node_fill(c, iter, k, iter->btree_id,
+ level, lock_type, true);
/* We raced and found the btree node in the cache */
if (!b)
@@ -689,6 +687,7 @@ retry:
if (IS_ERR(b))
return b;
} else {
+lock_node:
/*
* There's a potential deadlock with splits and insertions into
* interior nodes we have to avoid:
@@ -710,7 +709,7 @@ retry:
* free it:
*
* To guard against this, btree nodes are evicted from the cache
- * when they're freed - and PTR_HASH() is zeroed out, which we
+ * when they're freed - and b->hash_val is zeroed out, which we
* check for after we lock the node.
*
* Then, bch2_btree_node_relock() on the parent will fail - because
@@ -723,7 +722,7 @@ retry:
if (!btree_node_lock(b, k->k.p, level, iter, lock_type))
return ERR_PTR(-EINTR);
- if (unlikely(PTR_HASH(&b->key) != PTR_HASH(k) ||
+ if (unlikely(b->hash_val != btree_ptr_hash_val(k) ||
b->level != level ||
race_fault())) {
six_unlock_type(&b->lock, lock_type);
@@ -735,6 +734,7 @@ retry:
}
}
+ /* XXX: waiting on IO with btree locks held: */
wait_on_bit_io(&b->flags, BTREE_NODE_read_in_flight,
TASK_UNINTERRUPTIBLE);
@@ -749,7 +749,7 @@ retry:
}
/* avoid atomic set bit if it's not needed: */
- if (btree_node_accessed(b))
+ if (!btree_node_accessed(b))
set_btree_node_accessed(b);
if (unlikely(btree_node_read_error(b))) {
@@ -764,6 +764,74 @@ retry:
return b;
}
+struct btree *bch2_btree_node_get_noiter(struct bch_fs *c,
+ const struct bkey_i *k,
+ enum btree_id btree_id,
+ unsigned level)
+{
+ struct btree_cache *bc = &c->btree_cache;
+ struct btree *b;
+ struct bset_tree *t;
+
+ EBUG_ON(level >= BTREE_MAX_DEPTH);
+
+ b = btree_node_mem_ptr(k);
+ if (b)
+ goto lock_node;
+retry:
+ b = btree_cache_find(bc, k);
+ if (unlikely(!b)) {
+ b = bch2_btree_node_fill(c, NULL, k, btree_id,
+ level, SIX_LOCK_read, true);
+
+ /* We raced and found the btree node in the cache */
+ if (!b)
+ goto retry;
+
+ if (IS_ERR(b))
+ return b;
+ } else {
+lock_node:
+ six_lock_read(&b->lock);
+
+ if (unlikely(b->hash_val != btree_ptr_hash_val(k) ||
+ b->btree_id != btree_id ||
+ b->level != level)) {
+ six_unlock_read(&b->lock);
+ goto retry;
+ }
+ }
+
+ /* XXX: waiting on IO with btree locks held: */
+ wait_on_bit_io(&b->flags, BTREE_NODE_read_in_flight,
+ TASK_UNINTERRUPTIBLE);
+
+ prefetch(b->aux_data);
+
+ for_each_bset(b, t) {
+ void *p = (u64 *) b->aux_data + t->aux_data_offset;
+
+ prefetch(p + L1_CACHE_BYTES * 0);
+ prefetch(p + L1_CACHE_BYTES * 1);
+ prefetch(p + L1_CACHE_BYTES * 2);
+ }
+
+ /* avoid atomic set bit if it's not needed: */
+ if (!btree_node_accessed(b))
+ set_btree_node_accessed(b);
+
+ if (unlikely(btree_node_read_error(b))) {
+ six_unlock_read(&b->lock);
+ return ERR_PTR(-EIO);
+ }
+
+ EBUG_ON(b->btree_id != btree_id ||
+ BTREE_NODE_LEVEL(b->data) != level ||
+ bkey_cmp(b->data->max_key, k->k.p));
+
+ return b;
+}
+
struct btree *bch2_btree_node_get_sibling(struct bch_fs *c,
struct btree_iter *iter,
struct btree *b,
@@ -855,8 +923,7 @@ out:
if (sib != btree_prev_sib)
swap(n1, n2);
- BUG_ON(bkey_cmp(btree_type_successor(n1->btree_id,
- n1->key.k.p),
+ BUG_ON(bkey_cmp(bkey_successor(n1->key.k.p),
n2->data->min_key));
}
@@ -878,7 +945,8 @@ void bch2_btree_node_prefetch(struct bch_fs *c, struct btree_iter *iter,
if (b)
return;
- bch2_btree_node_fill(c, iter, k, level, SIX_LOCK_read, false);
+ bch2_btree_node_fill(c, iter, k, iter->btree_id,
+ level, SIX_LOCK_read, false);
}
void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c,
diff --git a/fs/bcachefs/btree_cache.h b/fs/bcachefs/btree_cache.h
index 83358d6a4df8..132cc95a4c02 100644
--- a/fs/bcachefs/btree_cache.h
+++ b/fs/bcachefs/btree_cache.h
@@ -25,6 +25,9 @@ struct btree *bch2_btree_node_get(struct bch_fs *, struct btree_iter *,
const struct bkey_i *, unsigned,
enum six_lock_type);
+struct btree *bch2_btree_node_get_noiter(struct bch_fs *, const struct bkey_i *,
+ enum btree_id, unsigned);
+
struct btree *bch2_btree_node_get_sibling(struct bch_fs *, struct btree_iter *,
struct btree *, enum btree_node_sibling);
@@ -35,13 +38,29 @@ void bch2_fs_btree_cache_exit(struct bch_fs *);
int bch2_fs_btree_cache_init(struct bch_fs *);
void bch2_fs_btree_cache_init_early(struct btree_cache *);
-#define PTR_HASH(_k) *((u64 *) &bkey_i_to_btree_ptr_c(_k)->v)
+static inline u64 btree_ptr_hash_val(const struct bkey_i *k)
+{
+ switch (k->k.type) {
+ case KEY_TYPE_btree_ptr:
+ return *((u64 *) bkey_i_to_btree_ptr_c(k)->v.start);
+ case KEY_TYPE_btree_ptr_v2:
+ return bkey_i_to_btree_ptr_v2_c(k)->v.seq;
+ default:
+ return 0;
+ }
+}
+
+static inline struct btree *btree_node_mem_ptr(const struct bkey_i *k)
+{
+ return k->k.type == KEY_TYPE_btree_ptr_v2
+ ? (void *)(unsigned long)bkey_i_to_btree_ptr_v2_c(k)->v.mem_ptr
+ : NULL;
+}
/* is btree node in hash table? */
static inline bool btree_node_hashed(struct btree *b)
{
- return b->key.k.type == KEY_TYPE_btree_ptr &&
- PTR_HASH(&b->key);
+ return b->hash_val != 0;
}
#define for_each_cached_btree(_b, _c, _tbl, _iter, _pos) \
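
Two lookup shortcuts fall out of the helpers above: the node cache is now hashed on a single u64 (the first pointer word for KEY_TYPE_btree_ptr, the node sequence number for btree_ptr_v2), and a btree_ptr_v2 key can carry the node's address in mem_ptr so a lookup can skip the hash table entirely. A sketch of that fast path with illustrative names; note that the real code still takes the node lock and re-checks hash_val against the key afterwards, since a cached pointer can go stale when the node is freed.

/*
 * Sketch of the lookup fast path enabled by btree_ptr_v2: try the node
 * address cached in the key before falling back to the hash table, which is
 * keyed on a single u64.  Illustrative names only - not the bcachefs API.
 */
struct demo_node { unsigned long long hash_val; /* ... */ };

struct demo_node_ptr {
        int             is_v2;
        unsigned long   mem_ptr;        /* cached struct demo_node *, may be 0 */
        unsigned long long seq;         /* stable identity, used as hash key */
};

struct demo_node *demo_hash_lookup(unsigned long long hash_val);

static struct demo_node *demo_node_find(const struct demo_node_ptr *k)
{
        if (k->is_v2 && k->mem_ptr)
                return (struct demo_node *) k->mem_ptr; /* skip the hash table */

        return demo_hash_lookup(k->seq);
}
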
diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c
index 05879b66d6af..cef8e148f784 100644
--- a/fs/bcachefs/btree_gc.c
+++ b/fs/bcachefs/btree_gc.c
@@ -47,65 +47,42 @@ static inline void gc_pos_set(struct bch_fs *c, struct gc_pos new_pos)
__gc_pos_set(c, new_pos);
}
-/* range_checks - for validating min/max pos of each btree node: */
-
-struct range_checks {
- struct range_level {
- struct bpos min;
- struct bpos max;
- } l[BTREE_MAX_DEPTH];
- unsigned depth;
-};
-
-static void btree_node_range_checks_init(struct range_checks *r, unsigned depth)
+static int bch2_gc_check_topology(struct bch_fs *c,
+ struct bkey_s_c k,
+ struct bpos *expected_start,
+ struct bpos expected_end,
+ bool is_last)
{
- unsigned i;
-
- for (i = 0; i < BTREE_MAX_DEPTH; i++)
- r->l[i].min = r->l[i].max = POS_MIN;
- r->depth = depth;
-}
-
-static void btree_node_range_checks(struct bch_fs *c, struct btree *b,
- struct range_checks *r)
-{
- struct range_level *l = &r->l[b->level];
-
- struct bpos expected_min = bkey_cmp(l->min, l->max)
- ? btree_type_successor(b->btree_id, l->max)
- : l->max;
-
- bch2_fs_inconsistent_on(bkey_cmp(b->data->min_key, expected_min), c,
- "btree node has incorrect min key: %llu:%llu != %llu:%llu",
- b->data->min_key.inode,
- b->data->min_key.offset,
- expected_min.inode,
- expected_min.offset);
-
- l->max = b->data->max_key;
+ int ret = 0;
- if (b->level > r->depth) {
- l = &r->l[b->level - 1];
+ if (k.k->type == KEY_TYPE_btree_ptr_v2) {
+ struct bkey_s_c_btree_ptr_v2 bp = bkey_s_c_to_btree_ptr_v2(k);
- bch2_fs_inconsistent_on(bkey_cmp(b->data->min_key, l->min), c,
- "btree node min doesn't match min of child nodes: %llu:%llu != %llu:%llu",
- b->data->min_key.inode,
- b->data->min_key.offset,
- l->min.inode,
- l->min.offset);
+ if (fsck_err_on(bkey_cmp(*expected_start, bp.v->min_key), c,
+ "btree node with incorrect min_key: got %llu:%llu, should be %llu:%llu",
+ bp.v->min_key.inode,
+ bp.v->min_key.offset,
+ expected_start->inode,
+ expected_start->offset)) {
+ BUG();
+ }
+ }
- bch2_fs_inconsistent_on(bkey_cmp(b->data->max_key, l->max), c,
- "btree node max doesn't match max of child nodes: %llu:%llu != %llu:%llu",
- b->data->max_key.inode,
- b->data->max_key.offset,
- l->max.inode,
- l->max.offset);
-
- if (bkey_cmp(b->data->max_key, POS_MAX))
- l->min = l->max =
- btree_type_successor(b->btree_id,
- b->data->max_key);
+ *expected_start = bkey_cmp(k.k->p, POS_MAX)
+ ? bkey_successor(k.k->p)
+ : k.k->p;
+
+ if (fsck_err_on(is_last &&
+ bkey_cmp(k.k->p, expected_end), c,
+ "btree node with incorrect max_key: got %llu:%llu, should be %llu:%llu",
+ k.k->p.inode,
+ k.k->p.offset,
+ expected_end.inode,
+ expected_end.offset)) {
+ BUG();
}
+fsck_err:
+ return ret;
}
/* marking of btree keys/nodes: */
@@ -124,7 +101,11 @@ static int bch2_gc_mark_key(struct bch_fs *c, struct bkey_s_c k,
BUG_ON(journal_seq_verify(c) &&
k.k->version.lo > journal_cur_seq(&c->journal));
- if (k.k->version.lo > atomic64_read(&c->key_version))
+ /* XXX change to fsck check */
+ if (fsck_err_on(k.k->version.lo > atomic64_read(&c->key_version), c,
+ "key version number higher than recorded: %llu > %llu",
+ k.k->version.lo,
+ atomic64_read(&c->key_version)))
atomic64_set(&c->key_version, k.k->version.lo);
if (test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) ||
@@ -180,9 +161,10 @@ fsck_err:
return ret;
}
-static int btree_gc_mark_node(struct bch_fs *c, struct btree *b,
- u8 *max_stale, bool initial)
+static int btree_gc_mark_node(struct bch_fs *c, struct btree *b, u8 *max_stale,
+ bool initial)
{
+ struct bpos next_node_start = b->data->min_key;
struct btree_node_iter iter;
struct bkey unpacked;
struct bkey_s_c k;
@@ -193,13 +175,25 @@ static int btree_gc_mark_node(struct bch_fs *c, struct btree *b,
if (!btree_node_type_needs_gc(btree_node_type(b)))
return 0;
- for_each_btree_node_key_unpack(b, k, &iter,
- &unpacked) {
+ bch2_btree_node_iter_init_from_start(&iter, b);
+
+ while ((k = bch2_btree_node_iter_peek_unpack(&iter, b, &unpacked)).k) {
bch2_bkey_debugcheck(c, b, k);
ret = bch2_gc_mark_key(c, k, max_stale, initial);
if (ret)
break;
+
+ bch2_btree_node_iter_advance(&iter, b);
+
+ if (b->level) {
+ ret = bch2_gc_check_topology(c, k,
+ &next_node_start,
+ b->data->max_key,
+ bch2_btree_node_iter_end(&iter));
+ if (ret)
+ break;
+ }
}
return ret;
@@ -211,7 +205,6 @@ static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id,
struct btree_trans trans;
struct btree_iter *iter;
struct btree *b;
- struct range_checks r;
unsigned depth = metadata_only ? 1
: expensive_debug_checks(c) ? 0
: !btree_node_type_needs_gc(btree_id) ? 1
@@ -223,12 +216,8 @@ static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id,
gc_pos_set(c, gc_pos_btree(btree_id, POS_MIN, 0));
- btree_node_range_checks_init(&r, depth);
-
__for_each_btree_node(&trans, iter, btree_id, POS_MIN,
0, depth, BTREE_ITER_PREFETCH, b) {
- btree_node_range_checks(c, b, &r);
-
bch2_verify_btree_nr_keys(b);
gc_pos_set(c, gc_pos_btree_node(b));
@@ -269,40 +258,116 @@ static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id,
return ret;
}
-static inline int btree_id_gc_phase_cmp(enum btree_id l, enum btree_id r)
+static int bch2_gc_btree_init_recurse(struct bch_fs *c, struct btree *b,
+ struct journal_keys *journal_keys,
+ unsigned target_depth)
{
- return (int) btree_id_to_gc_phase(l) -
- (int) btree_id_to_gc_phase(r);
+ struct btree_and_journal_iter iter;
+ struct bkey_s_c k;
+ struct bpos next_node_start = b->data->min_key;
+ u8 max_stale = 0;
+ int ret = 0;
+
+ bch2_btree_and_journal_iter_init_node_iter(&iter, journal_keys, b);
+
+ while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) {
+ bch2_bkey_debugcheck(c, b, k);
+
+ BUG_ON(bkey_cmp(k.k->p, b->data->min_key) < 0);
+ BUG_ON(bkey_cmp(k.k->p, b->data->max_key) > 0);
+
+ ret = bch2_gc_mark_key(c, k, &max_stale, true);
+ if (ret)
+ break;
+
+ if (b->level) {
+ struct btree *child;
+ BKEY_PADDED(k) tmp;
+
+ bkey_reassemble(&tmp.k, k);
+ k = bkey_i_to_s_c(&tmp.k);
+
+ bch2_btree_and_journal_iter_advance(&iter);
+
+ ret = bch2_gc_check_topology(c, k,
+ &next_node_start,
+ b->data->max_key,
+ !bch2_btree_and_journal_iter_peek(&iter).k);
+ if (ret)
+ break;
+
+ if (b->level > target_depth) {
+ child = bch2_btree_node_get_noiter(c, &tmp.k,
+ b->btree_id, b->level - 1);
+ ret = PTR_ERR_OR_ZERO(child);
+ if (ret)
+ break;
+
+ ret = bch2_gc_btree_init_recurse(c, child,
+ journal_keys, target_depth);
+ six_unlock_read(&child->lock);
+
+ if (ret)
+ break;
+ }
+ } else {
+ bch2_btree_and_journal_iter_advance(&iter);
+ }
+ }
+
+ return ret;
}
-static int mark_journal_key(struct bch_fs *c, enum btree_id id,
- struct bkey_i *insert)
+static int bch2_gc_btree_init(struct bch_fs *c,
+ struct journal_keys *journal_keys,
+ enum btree_id btree_id,
+ bool metadata_only)
{
- struct btree_trans trans;
- struct btree_iter *iter;
- struct bkey_s_c k;
- u8 max_stale;
+ struct btree *b;
+ unsigned target_depth = metadata_only ? 1
+ : expensive_debug_checks(c) ? 0
+ : !btree_node_type_needs_gc(btree_id) ? 1
+ : 0;
+ u8 max_stale = 0;
int ret = 0;
- ret = bch2_gc_mark_key(c, bkey_i_to_s_c(insert), &max_stale, true);
- if (ret)
- return ret;
+ b = c->btree_roots[btree_id].b;
- bch2_trans_init(&trans, c, 0, 0);
+ if (btree_node_fake(b))
+ return 0;
- for_each_btree_key(&trans, iter, id, bkey_start_pos(&insert->k),
- BTREE_ITER_SLOTS, k, ret) {
- percpu_down_read(&c->mark_lock);
- ret = bch2_mark_overwrite(&trans, iter, k, insert, NULL,
- BTREE_TRIGGER_GC|
- BTREE_TRIGGER_NOATOMIC);
- percpu_up_read(&c->mark_lock);
+ six_lock_read(&b->lock);
+ if (fsck_err_on(bkey_cmp(b->data->min_key, POS_MIN), c,
+ "btree root with incorrect min_key: %llu:%llu",
+ b->data->min_key.inode,
+ b->data->min_key.offset)) {
+ BUG();
+ }
- if (!ret)
- break;
+ if (fsck_err_on(bkey_cmp(b->data->max_key, POS_MAX), c,
+ "btree root with incorrect min_key: %llu:%llu",
+ b->data->max_key.inode,
+ b->data->max_key.offset)) {
+ BUG();
}
- return bch2_trans_exit(&trans) ?: ret;
+ if (b->level >= target_depth)
+ ret = bch2_gc_btree_init_recurse(c, b,
+ journal_keys, target_depth);
+
+ if (!ret)
+ ret = bch2_gc_mark_key(c, bkey_i_to_s_c(&b->key),
+ &max_stale, true);
+fsck_err:
+ six_unlock_read(&b->lock);
+
+ return ret;
+}
+
+static inline int btree_id_gc_phase_cmp(enum btree_id l, enum btree_id r)
+{
+ return (int) btree_id_to_gc_phase(l) -
+ (int) btree_id_to_gc_phase(r);
}
static int bch2_gc_btrees(struct bch_fs *c, struct journal_keys *journal_keys,
@@ -317,24 +382,12 @@ static int bch2_gc_btrees(struct bch_fs *c, struct journal_keys *journal_keys,
for (i = 0; i < BTREE_ID_NR; i++) {
enum btree_id id = ids[i];
- enum btree_node_type type = __btree_node_type(0, id);
-
- int ret = bch2_gc_btree(c, id, initial, metadata_only);
+ int ret = initial
+ ? bch2_gc_btree_init(c, journal_keys,
+ id, metadata_only)
+ : bch2_gc_btree(c, id, initial, metadata_only);
if (ret)
return ret;
-
- if (journal_keys && !metadata_only &&
- btree_node_type_needs_gc(type)) {
- struct journal_key *j;
- int ret;
-
- for_each_journal_key(*journal_keys, j)
- if (j->btree_id == id) {
- ret = mark_journal_key(c, id, j->k);
- if (ret)
- return ret;
- }
- }
}
return 0;
@@ -880,7 +933,7 @@ static void bch2_coalesce_nodes(struct bch_fs *c, struct btree_iter *iter,
return;
}
- as = bch2_btree_update_start(c, iter->btree_id,
+ as = bch2_btree_update_start(iter->trans, iter->btree_id,
btree_update_reserve_required(c, parent) + nr_old_nodes,
BTREE_INSERT_NOFAIL|
BTREE_INSERT_USE_RESERVE,
@@ -951,9 +1004,7 @@ static void bch2_coalesce_nodes(struct bch_fs *c, struct btree_iter *iter,
n1->key.k.p = n1->data->max_key =
bkey_unpack_pos(n1, last);
- n2->data->min_key =
- btree_type_successor(iter->btree_id,
- n1->data->max_key);
+ n2->data->min_key = bkey_successor(n1->data->max_key);
memcpy_u64s(vstruct_last(s1),
s2->start, u64s);
diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c
index c126985b6ef5..ac8b98861aae 100644
--- a/fs/bcachefs/btree_io.c
+++ b/fs/bcachefs/btree_io.c
@@ -19,6 +19,7 @@
#include "journal_seq_blacklist.h"
#include "super-io.h"
+#include <linux/sched/mm.h>
#include <trace/events/bcachefs.h>
static void verify_no_dups(struct btree *b,
@@ -68,17 +69,19 @@ static void btree_bounce_free(struct bch_fs *c, unsigned order,
static void *btree_bounce_alloc(struct bch_fs *c, unsigned order,
bool *used_mempool)
{
+ unsigned flags = memalloc_nofs_save();
void *p;
BUG_ON(order > btree_page_order(c));
*used_mempool = false;
p = (void *) __get_free_pages(__GFP_NOWARN|GFP_NOWAIT, order);
- if (p)
- return p;
-
- *used_mempool = true;
- return mempool_alloc(&c->btree_bounce_pool, GFP_NOIO);
+ if (!p) {
+ *used_mempool = true;
+ p = mempool_alloc(&c->btree_bounce_pool, GFP_NOIO);
+ }
+ memalloc_nofs_restore(flags);
+ return p;
}
static void sort_bkey_ptrs(const struct btree *bt,
@@ -617,7 +620,7 @@ static void bset_encrypt(struct bch_fs *c, struct bset *i, unsigned offset)
bch2_encrypt(c, BSET_CSUM_TYPE(i), nonce, &bn->flags,
bytes);
- nonce = nonce_add(nonce, round_up(bytes, CHACHA20_BLOCK_SIZE));
+ nonce = nonce_add(nonce, round_up(bytes, CHACHA_BLOCK_SIZE));
}
bch2_encrypt(c, BSET_CSUM_TYPE(i), nonce, i->_data,
@@ -706,76 +709,107 @@ out: \
static int validate_bset(struct bch_fs *c, struct btree *b,
struct bset *i, unsigned sectors,
- unsigned *whiteout_u64s, int write,
- bool have_retry)
+ int write, bool have_retry)
{
- struct bkey_packed *k, *prev = NULL;
- struct bpos prev_pos = POS_MIN;
- struct bpos prev_data = POS_MIN;
- bool seen_non_whiteout = false;
- unsigned version;
+ unsigned version = le16_to_cpu(i->version);
const char *err;
int ret = 0;
- if (i == &b->data->keys) {
+ btree_err_on((version != BCH_BSET_VERSION_OLD &&
+ version < bcachefs_metadata_version_min) ||
+ version >= bcachefs_metadata_version_max,
+ BTREE_ERR_FATAL, c, b, i,
+ "unsupported bset version");
+
+ if (btree_err_on(b->written + sectors > c->opts.btree_node_size,
+ BTREE_ERR_FIXABLE, c, b, i,
+ "bset past end of btree node")) {
+ i->u64s = 0;
+ return 0;
+ }
+
+ btree_err_on(b->written && !i->u64s,
+ BTREE_ERR_FIXABLE, c, b, i,
+ "empty bset");
+
+ if (!b->written) {
+ struct btree_node *bn =
+ container_of(i, struct btree_node, keys);
/* These indicate that we read the wrong btree node: */
- btree_err_on(BTREE_NODE_ID(b->data) != b->btree_id,
+ btree_err_on(BTREE_NODE_ID(bn) != b->btree_id,
BTREE_ERR_MUST_RETRY, c, b, i,
"incorrect btree id");
- btree_err_on(BTREE_NODE_LEVEL(b->data) != b->level,
+ btree_err_on(BTREE_NODE_LEVEL(bn) != b->level,
BTREE_ERR_MUST_RETRY, c, b, i,
"incorrect level");
if (BSET_BIG_ENDIAN(i) != CPU_BIG_ENDIAN) {
- u64 *p = (u64 *) &b->data->ptr;
+ u64 *p = (u64 *) &bn->ptr;
*p = swab64(*p);
- bch2_bpos_swab(&b->data->min_key);
- bch2_bpos_swab(&b->data->max_key);
}
- btree_err_on(bkey_cmp(b->data->max_key, b->key.k.p),
+ if (!write)
+ compat_btree_node(b->level, b->btree_id, version,
+ BSET_BIG_ENDIAN(i), write, bn);
+
+ if (b->key.k.type == KEY_TYPE_btree_ptr_v2) {
+ struct bch_btree_ptr_v2 *bp =
+ &bkey_i_to_btree_ptr_v2(&b->key)->v;
+
+ btree_err_on(bkey_cmp(b->data->min_key, bp->min_key),
+ BTREE_ERR_MUST_RETRY, c, b, NULL,
+ "incorrect min_key: got %llu:%llu should be %llu:%llu",
+ b->data->min_key.inode,
+ b->data->min_key.offset,
+ bp->min_key.inode,
+ bp->min_key.offset);
+ }
+
+ btree_err_on(bkey_cmp(bn->max_key, b->key.k.p),
BTREE_ERR_MUST_RETRY, c, b, i,
"incorrect max key");
+ if (write)
+ compat_btree_node(b->level, b->btree_id, version,
+ BSET_BIG_ENDIAN(i), write, bn);
+
/* XXX: ideally we would be validating min_key too */
#if 0
/*
* not correct anymore, due to btree node write error
* handling
*
- * need to add b->data->seq to btree keys and verify
+ * need to add bn->seq to btree keys and verify
* against that
*/
btree_err_on(!extent_contains_ptr(bkey_i_to_s_c_extent(&b->key),
- b->data->ptr),
+ bn->ptr),
BTREE_ERR_FATAL, c, b, i,
"incorrect backpointer");
#endif
- err = bch2_bkey_format_validate(&b->data->format);
+ err = bch2_bkey_format_validate(&bn->format);
btree_err_on(err,
BTREE_ERR_FATAL, c, b, i,
"invalid bkey format: %s", err);
- }
- version = le16_to_cpu(i->version);
- btree_err_on((version != BCH_BSET_VERSION_OLD &&
- version < bcachefs_metadata_version_min) ||
- version >= bcachefs_metadata_version_max,
- BTREE_ERR_FATAL, c, b, i,
- "unsupported bset version");
-
- if (btree_err_on(b->written + sectors > c->opts.btree_node_size,
- BTREE_ERR_FIXABLE, c, b, i,
- "bset past end of btree node")) {
- i->u64s = 0;
- return 0;
+ compat_bformat(b->level, b->btree_id, version,
+ BSET_BIG_ENDIAN(i), write,
+ &bn->format);
}
+fsck_err:
+ return ret;
+}
- btree_err_on(b->written && !i->u64s,
- BTREE_ERR_FIXABLE, c, b, i,
- "empty bset");
+static int validate_bset_keys(struct bch_fs *c, struct btree *b,
+ struct bset *i, unsigned *whiteout_u64s,
+ int write, bool have_retry)
+{
+ unsigned version = le16_to_cpu(i->version);
+ struct bkey_packed *k, *prev = NULL;
+ bool seen_non_whiteout = false;
+ int ret = 0;
if (!BSET_SEPARATE_WHITEOUTS(i)) {
seen_non_whiteout = true;
@@ -784,7 +818,7 @@ static int validate_bset(struct bch_fs *c, struct btree *b,
for (k = i->start;
k != vstruct_last(i);) {
- struct bkey_s_c u;
+ struct bkey_s u;
struct bkey tmp;
const char *invalid;
@@ -804,22 +838,21 @@ static int validate_bset(struct bch_fs *c, struct btree *b,
continue;
}
- if (BSET_BIG_ENDIAN(i) != CPU_BIG_ENDIAN)
- bch2_bkey_swab(&b->format, k);
-
- if (!write &&
- version < bcachefs_metadata_version_bkey_renumber)
- bch2_bkey_renumber(btree_node_type(b), k, write);
+ /* XXX: validate k->u64s */
+ if (!write)
+ bch2_bkey_compat(b->level, b->btree_id, version,
+ BSET_BIG_ENDIAN(i), write,
+ &b->format, k);
- u = bkey_disassemble(b, k, &tmp);
+ u = __bkey_disassemble(b, k, &tmp);
- invalid = __bch2_bkey_invalid(c, u, btree_node_type(b)) ?:
- bch2_bkey_in_btree_node(b, u) ?:
- (write ? bch2_bkey_val_invalid(c, u) : NULL);
+ invalid = __bch2_bkey_invalid(c, u.s_c, btree_node_type(b)) ?:
+ bch2_bkey_in_btree_node(b, u.s_c) ?:
+ (write ? bch2_bkey_val_invalid(c, u.s_c) : NULL);
if (invalid) {
char buf[160];
- bch2_bkey_val_to_text(&PBUF(buf), c, u);
+ bch2_bkey_val_to_text(&PBUF(buf), c, u.s_c);
btree_err(BTREE_ERR_FIXABLE, c, b, i,
"invalid bkey:\n%s\n%s", invalid, buf);
@@ -829,9 +862,10 @@ static int validate_bset(struct bch_fs *c, struct btree *b,
continue;
}
- if (write &&
- version < bcachefs_metadata_version_bkey_renumber)
- bch2_bkey_renumber(btree_node_type(b), k, write);
+ if (write)
+ bch2_bkey_compat(b->level, b->btree_id, version,
+ BSET_BIG_ENDIAN(i), write,
+ &b->format, k);
/*
* with the separate whiteouts thing (used for extents), the
@@ -841,29 +875,27 @@ static int validate_bset(struct bch_fs *c, struct btree *b,
if (!seen_non_whiteout &&
(!bkey_whiteout(k) ||
- (bkey_cmp(prev_pos, bkey_start_pos(u.k)) > 0))) {
+ (prev && bkey_iter_cmp(b, prev, k) > 0))) {
*whiteout_u64s = k->_data - i->_data;
seen_non_whiteout = true;
- } else if (bkey_cmp(prev_data, bkey_start_pos(u.k)) > 0 ||
- bkey_cmp(prev_pos, u.k->p) > 0) {
+ } else if (prev && bkey_iter_cmp(b, prev, k) > 0) {
+ char buf1[80];
+ char buf2[80];
+ struct bkey up = bkey_unpack_key(b, prev);
+
+ bch2_bkey_to_text(&PBUF(buf1), &up);
+ bch2_bkey_to_text(&PBUF(buf2), u.k);
+
+ bch2_dump_bset(b, i, 0);
btree_err(BTREE_ERR_FATAL, c, b, i,
- "keys out of order: %llu:%llu > %llu:%llu",
- prev_pos.inode,
- prev_pos.offset,
- u.k->p.inode,
- bkey_start_offset(u.k));
+ "keys out of order: %s > %s",
+ buf1, buf2);
/* XXX: repair this */
}
- if (!bkey_deleted(u.k))
- prev_data = u.k->p;
- prev_pos = u.k->p;
-
prev = k;
k = bkey_next_skip_noops(k, vstruct_last(i));
}
-
- SET_BSET_BIG_ENDIAN(i, CPU_BIG_ENDIAN);
fsck_err:
return ret;
}
@@ -895,6 +927,15 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry
BTREE_ERR_MUST_RETRY, c, b, NULL,
"bad btree header");
+ if (b->key.k.type == KEY_TYPE_btree_ptr_v2) {
+ struct bch_btree_ptr_v2 *bp =
+ &bkey_i_to_btree_ptr_v2(&b->key)->v;
+
+ btree_err_on(b->data->keys.seq != bp->seq,
+ BTREE_ERR_MUST_RETRY, c, b, NULL,
+ "got wrong btree node");
+ }
+
while (b->written < c->opts.btree_node_size) {
unsigned sectors, whiteout_u64s = 0;
struct nonce nonce;
@@ -922,8 +963,6 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry
set_btree_node_old_extent_overwrite(b);
sectors = vstruct_sectors(b->data, c->block_bits);
-
- btree_node_set_format(b, b->data->format);
} else {
bne = write_block(b);
i = &bne->keys;
@@ -947,11 +986,21 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry
sectors = vstruct_sectors(bne, c->block_bits);
}
- ret = validate_bset(c, b, i, sectors, &whiteout_u64s,
+ ret = validate_bset(c, b, i, sectors,
READ, have_retry);
if (ret)
goto fsck_err;
+ if (!b->written)
+ btree_node_set_format(b, b->data->format);
+
+ ret = validate_bset_keys(c, b, i, &whiteout_u64s,
+ READ, have_retry);
+ if (ret)
+ goto fsck_err;
+
+ SET_BSET_BIG_ENDIAN(i, CPU_BIG_ENDIAN);
+
b->written += sectors;
blacklisted = bch2_journal_seq_is_blacklisted(c,
@@ -1002,15 +1051,15 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry
i = &b->data->keys;
for (k = i->start; k != vstruct_last(i);) {
struct bkey tmp;
- struct bkey_s_c u = bkey_disassemble(b, k, &tmp);
- const char *invalid = bch2_bkey_val_invalid(c, u);
+ struct bkey_s u = __bkey_disassemble(b, k, &tmp);
+ const char *invalid = bch2_bkey_val_invalid(c, u.s_c);
if (invalid ||
(inject_invalid_keys(c) &&
!bversion_cmp(u.k->version, MAX_VERSION))) {
char buf[160];
- bch2_bkey_val_to_text(&PBUF(buf), c, u);
+ bch2_bkey_val_to_text(&PBUF(buf), c, u.s_c);
btree_err(BTREE_ERR_FIXABLE, c, b, i,
"invalid bkey %s: %s", buf, invalid);
@@ -1023,6 +1072,12 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry
continue;
}
+ if (u.k->type == KEY_TYPE_btree_ptr_v2) {
+ struct bkey_s_btree_ptr_v2 bp = bkey_s_to_btree_ptr_v2(u);
+
+ bp.v->mem_ptr = 0;
+ }
+
k = bkey_next_skip_noops(k, vstruct_last(i));
}
@@ -1236,7 +1291,6 @@ void bch2_btree_complete_write(struct bch_fs *c, struct btree *b,
closure_put(&((struct btree_update *) new)->cl);
bch2_journal_pin_drop(&c->journal, &w->journal);
- closure_wake_up(&w->wait);
}
static void btree_node_write_done(struct bch_fs *c, struct btree *b)
@@ -1252,8 +1306,6 @@ static void bch2_btree_node_write_error(struct bch_fs *c,
{
struct btree *b = wbio->wbio.bio.bi_private;
__BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX) tmp;
- struct bkey_i_btree_ptr *new_key;
- struct bkey_s_btree_ptr bp;
struct bch_extent_ptr *ptr;
struct btree_trans trans;
struct btree_iter *iter;
@@ -1279,16 +1331,13 @@ retry:
bkey_copy(&tmp.k, &b->key);
- new_key = bkey_i_to_btree_ptr(&tmp.k);
- bp = btree_ptr_i_to_s(new_key);
-
bch2_bkey_drop_ptrs(bkey_i_to_s(&tmp.k), ptr,
bch2_dev_list_has_dev(wbio->wbio.failed, ptr->dev));
- if (!bch2_bkey_nr_ptrs(bp.s_c))
+ if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(&tmp.k)))
goto err;
- ret = bch2_btree_node_update_key(c, iter, b, new_key);
+ ret = bch2_btree_node_update_key(c, iter, b, &tmp.k);
if (ret == -EINTR)
goto retry;
if (ret)
@@ -1394,7 +1443,8 @@ static int validate_bset_for_write(struct bch_fs *c, struct btree *b,
if (bch2_bkey_invalid(c, bkey_i_to_s_c(&b->key), BKEY_TYPE_BTREE))
return -1;
- ret = validate_bset(c, b, i, sectors, &whiteout_u64s, WRITE, false);
+ ret = validate_bset(c, b, i, sectors, WRITE, false) ?:
+ validate_bset_keys(c, b, i, &whiteout_u64s, WRITE, false);
if (ret)
bch2_inconsistent_error(c);
@@ -1544,8 +1594,7 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b,
validate_before_checksum = true;
/* validate_bset will be modifying: */
- if (le16_to_cpu(i->version) <
- bcachefs_metadata_version_bkey_renumber)
+ if (le16_to_cpu(i->version) < bcachefs_metadata_version_max)
validate_before_checksum = true;
/* if we're going to be encrypting, check metadata validity first: */
@@ -1598,9 +1647,6 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b,
wbio->wbio.bio.bi_end_io = btree_node_write_endio;
wbio->wbio.bio.bi_private = b;
- if (b->level || !b->written)
- wbio->wbio.bio.bi_opf |= REQ_FUA;
-
bch2_bio_map(&wbio->wbio.bio, data, sectors_to_write << 9);
/*
@@ -1625,6 +1671,7 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b,
b->written += sectors_to_write;
+ /* XXX: submitting IO with btree locks held: */
bch2_submit_wbio_replicas(&wbio->wbio, c, BCH_DATA_BTREE, &k.key);
return;
err:
@@ -1773,12 +1820,11 @@ ssize_t bch2_dirty_btree_nodes_print(struct bch_fs *c, char *buf)
rcu_read_lock();
for_each_cached_btree(b, c, tbl, i, pos) {
unsigned long flags = READ_ONCE(b->flags);
- unsigned idx = (flags & (1 << BTREE_NODE_write_idx)) != 0;
if (!(flags & (1 << BTREE_NODE_dirty)))
continue;
- pr_buf(&out, "%p d %u n %u l %u w %u b %u r %u:%lu c %u p %u\n",
+ pr_buf(&out, "%p d %u n %u l %u w %u b %u r %u:%lu\n",
b,
(flags & (1 << BTREE_NODE_dirty)) != 0,
(flags & (1 << BTREE_NODE_need_write)) != 0,
@@ -1786,9 +1832,7 @@ ssize_t bch2_dirty_btree_nodes_print(struct bch_fs *c, char *buf)
b->written,
!list_empty_careful(&b->write_blocked),
b->will_make_reachable != 0,
- b->will_make_reachable & 1,
- b->writes[ idx].wait.list.first != NULL,
- b->writes[!idx].wait.list.first != NULL);
+ b->will_make_reachable & 1);
}
rcu_read_unlock();
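
One small but easy-to-miss change in this file is btree_bounce_alloc() taking a memalloc_nofs_save() scope: allocations made while flushing btree data run with NOFS in effect, so any reclaim they trigger cannot re-enter the filesystem. The pattern, condensed into a standalone sketch restating the hunk above:

/*
 * Condensed restatement of the btree_bounce_alloc() hunk: try a direct page
 * allocation, fall back to the mempool, and wrap both in a NOFS scope.
 */
#include <linux/gfp.h>
#include <linux/mempool.h>
#include <linux/sched/mm.h>
#include <linux/types.h>

static void *demo_bounce_alloc(mempool_t *pool, unsigned order, bool *used_mempool)
{
        unsigned flags = memalloc_nofs_save();
        void *p;

        *used_mempool = false;
        p = (void *) __get_free_pages(__GFP_NOWARN|GFP_NOWAIT, order);
        if (!p) {
                *used_mempool = true;
                p = mempool_alloc(pool, GFP_NOIO);
        }
        memalloc_nofs_restore(flags);
        return p;
}
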
diff --git a/fs/bcachefs/btree_io.h b/fs/bcachefs/btree_io.h
index e90e89eee273..337d2bdd29e8 100644
--- a/fs/bcachefs/btree_io.h
+++ b/fs/bcachefs/btree_io.h
@@ -2,6 +2,7 @@
#ifndef _BCACHEFS_BTREE_IO_H
#define _BCACHEFS_BTREE_IO_H
+#include "bkey_methods.h"
#include "bset.h"
#include "btree_locking.h"
#include "extents.h"
@@ -102,19 +103,20 @@ bool bch2_btree_post_write_cleanup(struct bch_fs *, struct btree *);
void bch2_btree_node_write(struct bch_fs *, struct btree *,
enum six_lock_type);
-static inline void btree_node_write_if_need(struct bch_fs *c, struct btree *b)
+static inline void btree_node_write_if_need(struct bch_fs *c, struct btree *b,
+ enum six_lock_type lock_held)
{
while (b->written &&
btree_node_need_write(b) &&
btree_node_may_write(b)) {
if (!btree_node_write_in_flight(b)) {
- bch2_btree_node_write(c, b, SIX_LOCK_read);
+ bch2_btree_node_write(c, b, lock_held);
break;
}
- six_unlock_read(&b->lock);
+ six_unlock_type(&b->lock, lock_held);
btree_node_wait_on_io(b);
- btree_node_lock_type(c, b, SIX_LOCK_read);
+ btree_node_lock_type(c, b, lock_held);
}
}
@@ -131,7 +133,7 @@ do { \
new |= (1 << BTREE_NODE_need_write); \
} while ((v = cmpxchg(&(_b)->flags, old, new)) != old); \
\
- btree_node_write_if_need(_c, _b); \
+ btree_node_write_if_need(_c, _b, SIX_LOCK_read); \
} while (0)
void bch2_btree_flush_all_reads(struct bch_fs *);
@@ -139,4 +141,50 @@ void bch2_btree_flush_all_writes(struct bch_fs *);
void bch2_btree_verify_flushed(struct bch_fs *);
ssize_t bch2_dirty_btree_nodes_print(struct bch_fs *, char *);
+static inline void compat_bformat(unsigned level, enum btree_id btree_id,
+ unsigned version, unsigned big_endian,
+ int write, struct bkey_format *f)
+{
+ if (version < bcachefs_metadata_version_inode_btree_change &&
+ btree_id == BTREE_ID_INODES) {
+ swap(f->bits_per_field[BKEY_FIELD_INODE],
+ f->bits_per_field[BKEY_FIELD_OFFSET]);
+ swap(f->field_offset[BKEY_FIELD_INODE],
+ f->field_offset[BKEY_FIELD_OFFSET]);
+ }
+}
+
+static inline void compat_bpos(unsigned level, enum btree_id btree_id,
+ unsigned version, unsigned big_endian,
+ int write, struct bpos *p)
+{
+ if (big_endian != CPU_BIG_ENDIAN)
+ bch2_bpos_swab(p);
+
+ if (version < bcachefs_metadata_version_inode_btree_change &&
+ btree_id == BTREE_ID_INODES)
+ swap(p->inode, p->offset);
+}
+
+static inline void compat_btree_node(unsigned level, enum btree_id btree_id,
+ unsigned version, unsigned big_endian,
+ int write,
+ struct btree_node *bn)
+{
+ if (version < bcachefs_metadata_version_inode_btree_change &&
+ btree_node_type_is_extents(btree_id) &&
+ bkey_cmp(bn->min_key, POS_MIN) &&
+ write)
+ bn->min_key = bkey_predecessor(bn->min_key);
+
+ compat_bpos(level, btree_id, version, big_endian, write, &bn->min_key);
+ compat_bpos(level, btree_id, version, big_endian, write, &bn->max_key);
+
+ if (version < bcachefs_metadata_version_inode_btree_change &&
+ btree_node_type_is_extents(btree_id) &&
+ bkey_cmp(bn->min_key, POS_MIN) &&
+ !write)
+ bn->min_key = bkey_successor(bn->min_key);
+}
+
#endif /* _BCACHEFS_BTREE_IO_H */
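
The compat helpers above handle the inode-btree renumbering introduced by bcachefs_metadata_version_inode_btree_change: positions in the inode btree written by older versions have the two bpos fields swapped relative to the current layout, so they are swapped on read and swapped again when writing back in the old format (the swap is its own inverse). A minimal sketch with illustrative types:

/*
 * Sketch of the inode-btree pos fixup: old-version positions have the two
 * bpos fields swapped relative to the current layout; applying the same swap
 * in both directions keeps read and write compatible.  Illustrative types
 * only - not the bcachefs API.
 */
struct demo_bpos { unsigned long long inode, offset; };

static void demo_compat_inode_pos(unsigned version,
                                  unsigned version_inode_btree_change,
                                  struct demo_bpos *p)
{
        if (version < version_inode_btree_change) {
                unsigned long long tmp = p->inode;

                p->inode  = p->offset;
                p->offset = tmp;
        }
}
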
diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c
index ea0555b806f0..5528ba0f1d44 100644
--- a/fs/bcachefs/btree_iter.c
+++ b/fs/bcachefs/btree_iter.c
@@ -5,6 +5,7 @@
#include "btree_cache.h"
#include "btree_iter.h"
#include "btree_locking.h"
+#include "btree_update.h"
#include "debug.h"
#include "extents.h"
@@ -35,6 +36,26 @@ static inline struct bpos btree_iter_search_key(struct btree_iter *iter)
return pos;
}
+static inline bool btree_iter_pos_before_node(struct btree_iter *iter,
+ struct btree *b)
+{
+ return bkey_cmp(btree_iter_search_key(iter), b->data->min_key) < 0;
+}
+
+static inline bool btree_iter_pos_after_node(struct btree_iter *iter,
+ struct btree *b)
+{
+ return bkey_cmp(b->key.k.p, btree_iter_search_key(iter)) < 0;
+}
+
+static inline bool btree_iter_pos_in_node(struct btree_iter *iter,
+ struct btree *b)
+{
+ return iter->btree_id == b->btree_id &&
+ !btree_iter_pos_before_node(iter, b) &&
+ !btree_iter_pos_after_node(iter, b);
+}
+
/* Btree node locking: */
void bch2_btree_node_unlock_write(struct btree *b, struct btree_iter *iter)
@@ -241,7 +262,7 @@ bool __bch2_btree_node_lock(struct btree *b, struct bpos pos,
/* Btree iterator locking: */
#ifdef CONFIG_BCACHEFS_DEBUG
-void bch2_btree_iter_verify_locks(struct btree_iter *iter)
+static void bch2_btree_iter_verify_locks(struct btree_iter *iter)
{
unsigned l;
@@ -262,6 +283,8 @@ void bch2_btree_trans_verify_locks(struct btree_trans *trans)
trans_for_each_iter(trans, iter)
bch2_btree_iter_verify_locks(iter);
}
+#else
+static inline void bch2_btree_iter_verify_locks(struct btree_iter *iter) {}
#endif
__flatten
@@ -385,21 +408,43 @@ void bch2_trans_unlock(struct btree_trans *trans)
#ifdef CONFIG_BCACHEFS_DEBUG
-static void __bch2_btree_iter_verify(struct btree_iter *iter,
- struct btree *b)
+static void bch2_btree_iter_verify_level(struct btree_iter *iter,
+ unsigned level)
{
struct bpos pos = btree_iter_search_key(iter);
- struct btree_iter_level *l = &iter->l[b->level];
+ struct btree_iter_level *l = &iter->l[level];
struct btree_node_iter tmp = l->iter;
- struct bkey_packed *k;
+ bool locked = btree_node_locked(iter, level);
+ struct bkey_packed *p, *k;
+ char buf1[100], buf2[100];
+ const char *msg;
if (!debug_check_iterators(iter->trans->c))
return;
- if (iter->uptodate > BTREE_ITER_NEED_PEEK)
+ BUG_ON(iter->level < iter->min_depth);
+
+ if (!btree_iter_node(iter, level))
return;
- bch2_btree_node_iter_verify(&l->iter, b);
+ if (!bch2_btree_node_relock(iter, level))
+ return;
+
+ /*
+ * Ideally this invariant would always be true, and hopefully in the
+ * future it will be, but for now set_pos_same_leaf() breaks it:
+ */
+ BUG_ON(iter->uptodate < BTREE_ITER_NEED_TRAVERSE &&
+ !btree_iter_pos_in_node(iter, l->b));
+
+ /*
+ * node iterators don't use leaf node iterator:
+ */
+ if (btree_iter_type(iter) == BTREE_ITER_NODES &&
+ level <= iter->min_depth)
+ goto unlock;
+
+ bch2_btree_node_iter_verify(&l->iter, l->b);
/*
* For interior nodes, the iterator will have skipped past
@@ -408,46 +453,73 @@ static void __bch2_btree_iter_verify(struct btree_iter *iter,
* For extents, the iterator may have skipped past deleted keys (but not
* whiteouts)
*/
- k = b->level || iter->flags & BTREE_ITER_IS_EXTENTS
- ? bch2_btree_node_iter_prev_filter(&tmp, b, KEY_TYPE_discard)
- : bch2_btree_node_iter_prev_all(&tmp, b);
- if (k && bkey_iter_pos_cmp(b, k, &pos) >= 0) {
- char buf[100];
- struct bkey uk = bkey_unpack_key(b, k);
+ p = level || btree_node_type_is_extents(iter->btree_id)
+ ? bch2_btree_node_iter_prev_filter(&tmp, l->b, KEY_TYPE_discard)
+ : bch2_btree_node_iter_prev_all(&tmp, l->b);
+ k = bch2_btree_node_iter_peek_all(&l->iter, l->b);
- bch2_bkey_to_text(&PBUF(buf), &uk);
- panic("iterator should be before prev key:\n%s\n%llu:%llu\n",
- buf, iter->pos.inode, iter->pos.offset);
+ if (p && bkey_iter_pos_cmp(l->b, p, &pos) >= 0) {
+ msg = "before";
+ goto err;
}
- k = bch2_btree_node_iter_peek_all(&l->iter, b);
- if (k && bkey_iter_pos_cmp(b, k, &pos) < 0) {
- char buf[100];
- struct bkey uk = bkey_unpack_key(b, k);
+ if (k && bkey_iter_pos_cmp(l->b, k, &pos) < 0) {
+ msg = "after";
+ goto err;
+ }
+unlock:
+ if (!locked)
+ btree_node_unlock(iter, level);
+ return;
+err:
+ strcpy(buf1, "(none)");
+ strcpy(buf2, "(none)");
+
+ if (p) {
+ struct bkey uk = bkey_unpack_key(l->b, p);
+ bch2_bkey_to_text(&PBUF(buf1), &uk);
+ }
- bch2_bkey_to_text(&PBUF(buf), &uk);
- panic("iter should be after current key:\n"
- "iter pos %llu:%llu\n"
- "cur key %s\n",
- iter->pos.inode, iter->pos.offset, buf);
+ if (k) {
+ struct bkey uk = bkey_unpack_key(l->b, k);
+ bch2_bkey_to_text(&PBUF(buf2), &uk);
}
+
+ panic("iterator should be %s key at level %u:\n"
+ "iter pos %s %llu:%llu\n"
+ "prev key %s\n"
+ "cur key %s\n",
+ msg, level,
+ iter->flags & BTREE_ITER_IS_EXTENTS ? ">" : "=>",
+ iter->pos.inode, iter->pos.offset,
+ buf1, buf2);
}
-void bch2_btree_iter_verify(struct btree_iter *iter, struct btree *b)
+static void bch2_btree_iter_verify(struct btree_iter *iter)
{
- struct btree_iter *linked;
+ unsigned i;
- if (!debug_check_iterators(iter->trans->c))
+ bch2_btree_trans_verify_locks(iter->trans);
+
+ for (i = 0; i < BTREE_MAX_DEPTH; i++)
+ bch2_btree_iter_verify_level(iter, i);
+}
+
+void bch2_btree_trans_verify_iters(struct btree_trans *trans, struct btree *b)
+{
+ struct btree_iter *iter;
+
+ if (!debug_check_iterators(trans->c))
return;
- trans_for_each_iter_with_node(iter->trans, b, linked)
- __bch2_btree_iter_verify(linked, b);
+ trans_for_each_iter_with_node(trans, b, iter)
+ bch2_btree_iter_verify_level(iter, b->level);
}
#else
-static inline void __bch2_btree_iter_verify(struct btree_iter *iter,
- struct btree *b) {}
+static inline void bch2_btree_iter_verify_level(struct btree_iter *iter, unsigned l) {}
+static inline void bch2_btree_iter_verify(struct btree_iter *iter) {}
#endif
@@ -492,7 +564,7 @@ void bch2_btree_iter_fix_key_modified(struct btree_iter *iter,
trans_for_each_iter_with_node(iter->trans, b, linked) {
__bch2_btree_iter_fix_key_modified(linked, b, where);
- __bch2_btree_iter_verify(linked, b);
+ bch2_btree_iter_verify_level(linked, b->level);
}
}
@@ -563,7 +635,7 @@ fixup_done:
if (!bch2_btree_node_iter_end(node_iter) &&
iter_current_key_modified &&
(b->level ||
- (iter->flags & BTREE_ITER_IS_EXTENTS))) {
+ btree_node_type_is_extents(iter->btree_id))) {
struct bset_tree *t;
struct bkey_packed *k, *k2, *p;
@@ -591,19 +663,8 @@ fixup_done:
if (!b->level &&
node_iter == &iter->l[0].iter &&
- iter_current_key_modified) {
- struct bkey_packed *k =
- bch2_btree_node_iter_peek_all(node_iter, b);
-
- if (likely(k)) {
- bkey_disassemble(b, k, &iter->k);
- } else {
- /* XXX: for extents, calculate size of hole? */
- iter->k.type = KEY_TYPE_deleted;
- }
-
+ iter_current_key_modified)
btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK);
- }
}
void bch2_btree_node_iter_fix(struct btree_iter *iter,
@@ -619,14 +680,16 @@ void bch2_btree_node_iter_fix(struct btree_iter *iter,
if (node_iter != &iter->l[b->level].iter) {
__bch2_btree_node_iter_fix(iter, b, node_iter, t,
where, clobber_u64s, new_u64s);
- bch2_btree_node_iter_verify(node_iter, b);
+
+ if (debug_check_iterators(iter->trans->c))
+ bch2_btree_node_iter_verify(node_iter, b);
}
trans_for_each_iter_with_node(iter->trans, b, linked) {
__bch2_btree_node_iter_fix(linked, b,
&linked->l[b->level].iter, t,
where, clobber_u64s, new_u64s);
- __bch2_btree_iter_verify(linked, b);
+ bch2_btree_iter_verify_level(linked, b->level);
}
}
@@ -736,26 +799,6 @@ static void btree_iter_verify_new_node(struct btree_iter *iter, struct btree *b)
btree_node_unlock(iter, b->level + 1);
}
-static inline bool btree_iter_pos_before_node(struct btree_iter *iter,
- struct btree *b)
-{
- return bkey_cmp(iter->pos, b->data->min_key) < 0;
-}
-
-static inline bool btree_iter_pos_after_node(struct btree_iter *iter,
- struct btree *b)
-{
- return bkey_cmp(b->key.k.p, btree_iter_search_key(iter)) < 0;
-}
-
-static inline bool btree_iter_pos_in_node(struct btree_iter *iter,
- struct btree *b)
-{
- return iter->btree_id == b->btree_id &&
- !btree_iter_pos_before_node(iter, b) &&
- !btree_iter_pos_after_node(iter, b);
-}
-
static inline void __btree_iter_init(struct btree_iter *iter,
unsigned level)
{
@@ -912,6 +955,27 @@ static void btree_iter_prefetch(struct btree_iter *iter)
btree_node_unlock(iter, iter->level);
}
+static noinline void btree_node_mem_ptr_set(struct btree_iter *iter,
+ unsigned plevel, struct btree *b)
+{
+ struct btree_iter_level *l = &iter->l[plevel];
+ bool locked = btree_node_locked(iter, plevel);
+ struct bkey_packed *k;
+ struct bch_btree_ptr_v2 *bp;
+
+ if (!bch2_btree_node_relock(iter, plevel))
+ return;
+
+ k = bch2_btree_node_iter_peek_all(&l->iter, l->b);
+ BUG_ON(k->type != KEY_TYPE_btree_ptr_v2);
+
+ bp = (void *) bkeyp_val(&l->b->format, k);
+ bp->mem_ptr = (unsigned long)b;
+
+ if (!locked)
+ btree_node_unlock(iter, plevel);
+}
+
static __always_inline int btree_iter_down(struct btree_iter *iter)
{
struct bch_fs *c = iter->trans->c;
@@ -933,6 +997,10 @@ static __always_inline int btree_iter_down(struct btree_iter *iter)
mark_btree_node_locked(iter, level, lock_type);
btree_iter_node_set(iter, b);
+ if (tmp.k.k.type == KEY_TYPE_btree_ptr_v2 &&
+ unlikely(b != btree_node_mem_ptr(&tmp.k)))
+ btree_node_mem_ptr_set(iter, level + 1, b);
+
if (iter->flags & BTREE_ITER_PREFETCH)
btree_iter_prefetch(iter);
@@ -1000,7 +1068,14 @@ retry_all:
goto retry_all;
}
- ret = hweight64(trans->iters_live) > 1 ? -EINTR : 0;
+ if (hweight64(trans->iters_live) > 1)
+ ret = -EINTR;
+ else
+ trans_for_each_iter(trans, iter)
+ if (iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT) {
+ ret = -EINTR;
+ break;
+ }
out:
bch2_btree_cache_cannibalize_unlock(c);
return ret;
@@ -1107,9 +1182,7 @@ static int btree_iter_traverse_one(struct btree_iter *iter)
iter->uptodate = BTREE_ITER_NEED_PEEK;
- bch2_btree_trans_verify_locks(iter->trans);
- if (btree_iter_node(iter, iter->level))
- __bch2_btree_iter_verify(iter, iter->l[iter->level].b);
+ bch2_btree_iter_verify(iter);
return 0;
}
@@ -1129,12 +1202,14 @@ static inline void bch2_btree_iter_checks(struct btree_iter *iter,
enum btree_iter_type type)
{
EBUG_ON(iter->btree_id >= BTREE_ID_NR);
- EBUG_ON(!!(iter->flags & BTREE_ITER_IS_EXTENTS) !=
- (btree_node_type_is_extents(iter->btree_id) &&
- type != BTREE_ITER_NODES));
EBUG_ON(btree_iter_type(iter) != type);
- bch2_btree_trans_verify_locks(iter->trans);
+ BUG_ON(type == BTREE_ITER_KEYS &&
+ (bkey_cmp(iter->pos, bkey_start_pos(&iter->k)) < 0 ||
+ bkey_cmp(iter->pos, iter->k.p) > 0));
+
+ bch2_btree_iter_verify_locks(iter);
+ bch2_btree_iter_verify_level(iter, iter->level);
}
/* Iterate across nodes (leaf and interior nodes) */
@@ -1162,10 +1237,12 @@ struct btree *bch2_btree_iter_peek_node(struct btree_iter *iter)
iter->pos = b->key.k.p;
iter->uptodate = BTREE_ITER_UPTODATE;
+ bch2_btree_iter_verify(iter);
+
return b;
}
-struct btree *bch2_btree_iter_next_node(struct btree_iter *iter, unsigned depth)
+struct btree *bch2_btree_iter_next_node(struct btree_iter *iter)
{
struct btree *b;
int ret;
@@ -1207,11 +1284,8 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter, unsigned depth)
if (btree_node_read_locked(iter, iter->level))
btree_node_unlock(iter, iter->level);
- /* ick: */
- iter->pos = iter->btree_id == BTREE_ID_INODES
- ? btree_type_successor(iter->btree_id, iter->pos)
- : bkey_successor(iter->pos);
- iter->level = depth;
+ iter->pos = bkey_successor(iter->pos);
+ iter->level = iter->min_depth;
btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE);
ret = bch2_btree_iter_traverse(iter);
@@ -1224,6 +1298,8 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter, unsigned depth)
iter->pos = b->key.k.p;
iter->uptodate = BTREE_ITER_UPTODATE;
+ bch2_btree_iter_verify(iter);
+
return b;
}
@@ -1238,7 +1314,8 @@ void bch2_btree_iter_set_pos_same_leaf(struct btree_iter *iter, struct bpos new_
EBUG_ON(!btree_node_locked(iter, 0));
EBUG_ON(bkey_cmp(new_pos, l->b->key.k.p) > 0);
- iter->pos = new_pos;
+ bkey_init(&iter->k);
+ iter->k.p = iter->pos = new_pos;
btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK);
btree_iter_advance_to_pos(iter, l, -1);
@@ -1248,9 +1325,14 @@ void bch2_btree_iter_set_pos_same_leaf(struct btree_iter *iter, struct bpos new_
btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE);
}
-static unsigned btree_iter_pos_changed(struct btree_iter *iter, int cmp)
+static void btree_iter_pos_changed(struct btree_iter *iter, int cmp)
{
- unsigned l = btree_iter_up_until_good_node(iter, cmp);
+ unsigned l = iter->level;
+
+ if (!cmp)
+ goto out;
+
+ l = btree_iter_up_until_good_node(iter, cmp);
if (btree_iter_node(iter, l)) {
/*
@@ -1267,64 +1349,81 @@ static unsigned btree_iter_pos_changed(struct btree_iter *iter, int cmp)
if (btree_lock_want(iter, l) == BTREE_NODE_UNLOCKED)
btree_node_unlock(iter, l);
}
+out:
+ if (l != iter->level)
+ btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE);
+ else
+ btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK);
+}
- return l;
+void __bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos,
+ bool strictly_greater)
+{
+ struct bpos old = btree_iter_search_key(iter);
+ int cmp;
+
+ iter->flags &= ~BTREE_ITER_IS_EXTENTS;
+ iter->flags |= strictly_greater ? BTREE_ITER_IS_EXTENTS : 0;
+
+ bkey_init(&iter->k);
+ iter->k.p = iter->pos = new_pos;
+
+ cmp = bkey_cmp(btree_iter_search_key(iter), old);
+
+ btree_iter_pos_changed(iter, cmp);
}
void bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos)
{
int cmp = bkey_cmp(new_pos, iter->pos);
- unsigned l;
-
- if (!cmp)
- return;
-
- iter->pos = new_pos;
- l = btree_iter_pos_changed(iter, cmp);
+ bkey_init(&iter->k);
+ iter->k.p = iter->pos = new_pos;
- if (l != iter->level)
- btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE);
- else
- btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK);
+ btree_iter_pos_changed(iter, cmp);
}
static inline bool btree_iter_set_pos_to_next_leaf(struct btree_iter *iter)
{
struct btree_iter_level *l = &iter->l[0];
+ bool ret;
- iter->pos = l->b->key.k.p;
- iter->uptodate = BTREE_ITER_NEED_TRAVERSE;
+ bkey_init(&iter->k);
+ iter->k.p = iter->pos = l->b->key.k.p;
- if (!bkey_cmp(iter->pos, POS_MAX)) {
- bkey_init(&iter->k);
- iter->k.p = POS_MAX;
- return false;
- }
+ ret = bkey_cmp(iter->pos, POS_MAX) != 0;
+ if (ret && !(iter->flags & BTREE_ITER_IS_EXTENTS))
+ iter->k.p = iter->pos = bkey_successor(iter->pos);
- iter->pos = btree_type_successor(iter->btree_id, iter->pos);
btree_iter_pos_changed(iter, 1);
- return true;
+ return ret;
}
static inline bool btree_iter_set_pos_to_prev_leaf(struct btree_iter *iter)
{
struct btree_iter_level *l = &iter->l[0];
+ bool ret;
- iter->pos = l->b->data->min_key;
+ bkey_init(&iter->k);
+ iter->k.p = iter->pos = l->b->data->min_key;
iter->uptodate = BTREE_ITER_NEED_TRAVERSE;
- if (!bkey_cmp(iter->pos, POS_MIN)) {
- bkey_init(&iter->k);
- iter->k.p = POS_MIN;
- return false;
+ ret = bkey_cmp(iter->pos, POS_MIN) != 0;
+ if (ret) {
+ iter->k.p = iter->pos = bkey_predecessor(iter->pos);
+
+ if (iter->flags & BTREE_ITER_IS_EXTENTS)
+ iter->k.p = iter->pos = bkey_predecessor(iter->pos);
}
- iter->pos = btree_type_predecessor(iter->btree_id, iter->pos);
btree_iter_pos_changed(iter, -1);
- return true;
+ return ret;
}
+/**
+ * btree_iter_peek_uptodate - given an iterator that is uptodate, return the key
+ * it currently points to
+ */
static inline struct bkey_s_c btree_iter_peek_uptodate(struct btree_iter *iter)
{
struct btree_iter_level *l = &iter->l[0];
@@ -1361,7 +1460,8 @@ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter)
bch2_btree_iter_checks(iter, BTREE_ITER_KEYS);
- if (iter->uptodate == BTREE_ITER_UPTODATE)
+ if (iter->uptodate == BTREE_ITER_UPTODATE &&
+ !bkey_deleted(&iter->k))
return btree_iter_peek_uptodate(iter);
while (1) {
@@ -1386,6 +1486,8 @@ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter)
iter->pos = bkey_start_pos(k.k);
iter->uptodate = BTREE_ITER_UPTODATE;
+
+ bch2_btree_iter_verify_level(iter, 0);
return k;
}
@@ -1395,52 +1497,101 @@ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter)
*/
struct bkey_s_c bch2_btree_iter_next(struct btree_iter *iter)
{
+ if (unlikely(!bkey_cmp(iter->k.p, POS_MAX)))
+ return bkey_s_c_null;
+
+ bch2_btree_iter_set_pos(iter,
+ (iter->flags & BTREE_ITER_IS_EXTENTS)
+ ? iter->k.p
+ : bkey_successor(iter->k.p));
+
+ return bch2_btree_iter_peek(iter);
+}
+
+static struct bkey_s_c __btree_trans_updates_peek(struct btree_iter *iter)
+{
+ struct bpos pos = btree_iter_search_key(iter);
+ struct btree_trans *trans = iter->trans;
+ struct btree_insert_entry *i;
+
+ trans_for_each_update2(trans, i)
+ if ((cmp_int(iter->btree_id, i->iter->btree_id) ?:
+ bkey_cmp(pos, i->k->k.p)) <= 0)
+ break;
+
+ return i < trans->updates2 + trans->nr_updates2 &&
+ iter->btree_id == i->iter->btree_id
+ ? bkey_i_to_s_c(i->k)
+ : bkey_s_c_null;
+}
+
+static struct bkey_s_c __bch2_btree_iter_peek_with_updates(struct btree_iter *iter)
+{
struct btree_iter_level *l = &iter->l[0];
- struct bkey_packed *p;
+ struct bkey_s_c k = __btree_iter_peek(iter, l);
+ struct bkey_s_c u = __btree_trans_updates_peek(iter);
+
+ if (k.k && (!u.k || bkey_cmp(k.k->p, u.k->p) < 0))
+ return k;
+ if (u.k && bkey_cmp(u.k->p, l->b->key.k.p) <= 0) {
+ iter->k = *u.k;
+ return u;
+ }
+ return bkey_s_c_null;
+}
+
+struct bkey_s_c bch2_btree_iter_peek_with_updates(struct btree_iter *iter)
+{
struct bkey_s_c k;
+ int ret;
bch2_btree_iter_checks(iter, BTREE_ITER_KEYS);
- if (unlikely(iter->uptodate != BTREE_ITER_UPTODATE)) {
- if (unlikely(!bkey_cmp(iter->k.p, POS_MAX)))
- return bkey_s_c_null;
+ while (1) {
+ ret = bch2_btree_iter_traverse(iter);
+ if (unlikely(ret))
+ return bkey_s_c_err(ret);
- /*
- * XXX: when we just need to relock we should be able to avoid
- * calling traverse, but we need to kill BTREE_ITER_NEED_PEEK
- * for that to work
- */
- iter->uptodate = BTREE_ITER_NEED_TRAVERSE;
+ k = __bch2_btree_iter_peek_with_updates(iter);
- bch2_btree_iter_set_pos(iter,
- btree_type_successor(iter->btree_id, iter->k.p));
+ if (k.k && bkey_deleted(k.k)) {
+ bch2_btree_iter_set_pos(iter,
+ (iter->flags & BTREE_ITER_IS_EXTENTS)
+ ? iter->k.p
+ : bkey_successor(iter->k.p));
+ continue;
+ }
- return bch2_btree_iter_peek(iter);
- }
+ if (likely(k.k))
+ break;
- if (unlikely(bkey_deleted(&iter->k))) {
- /*
- * we're currently pointed at a hole, because previously we were
- * iterating over slots:
- */
- return bch2_btree_iter_peek(iter);
+ if (!btree_iter_set_pos_to_next_leaf(iter))
+ return bkey_s_c_null;
}
- do {
- bch2_btree_node_iter_advance(&l->iter, l->b);
- p = bch2_btree_node_iter_peek_all(&l->iter, l->b);
- } while (likely(p) && bkey_whiteout(p));
+ /*
+ * iter->pos should always be equal to the key we just
+ * returned - except extents can straddle iter->pos:
+ */
+ if (!(iter->flags & BTREE_ITER_IS_EXTENTS) ||
+ bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0)
+ iter->pos = bkey_start_pos(k.k);
- if (unlikely(!p))
- return btree_iter_set_pos_to_next_leaf(iter)
- ? bch2_btree_iter_peek(iter)
- : bkey_s_c_null;
+ iter->uptodate = BTREE_ITER_UPTODATE;
+ return k;
+}
- k = __btree_iter_unpack(iter, l, &iter->k, p);
+struct bkey_s_c bch2_btree_iter_next_with_updates(struct btree_iter *iter)
+{
+ if (unlikely(!bkey_cmp(iter->k.p, POS_MAX)))
+ return bkey_s_c_null;
- EBUG_ON(bkey_cmp(bkey_start_pos(k.k), iter->pos) < 0);
- iter->pos = bkey_start_pos(k.k);
- return k;
+ bch2_btree_iter_set_pos(iter,
+ (iter->flags & BTREE_ITER_IS_EXTENTS)
+ ? iter->k.p
+ : bkey_successor(iter->k.p));
+
+ return bch2_btree_iter_peek_with_updates(iter);
}
/**
@@ -1449,13 +1600,15 @@ struct bkey_s_c bch2_btree_iter_next(struct btree_iter *iter)
*/
struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter)
{
+ struct bpos pos = iter->pos;
struct btree_iter_level *l = &iter->l[0];
struct bkey_s_c k;
int ret;
bch2_btree_iter_checks(iter, BTREE_ITER_KEYS);
- if (iter->uptodate == BTREE_ITER_UPTODATE)
+ if (iter->uptodate == BTREE_ITER_UPTODATE &&
+ !bkey_deleted(&iter->k))
return btree_iter_peek_uptodate(iter);
while (1) {
@@ -1464,8 +1617,7 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter)
return bkey_s_c_err(ret);
k = __btree_iter_peek(iter, l);
- if (!k.k ||
- bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0)
+ if (!k.k || bkey_cmp(bkey_start_pos(k.k), pos) > 0)
k = __btree_iter_prev(iter, l);
if (likely(k.k))
@@ -1475,7 +1627,7 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter)
return bkey_s_c_null;
}
- EBUG_ON(bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0);
+ EBUG_ON(bkey_cmp(bkey_start_pos(k.k), pos) > 0);
iter->pos = bkey_start_pos(k.k);
iter->uptodate = BTREE_ITER_UPTODATE;
return k;
@@ -1487,33 +1639,16 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter)
*/
struct bkey_s_c bch2_btree_iter_prev(struct btree_iter *iter)
{
- struct btree_iter_level *l = &iter->l[0];
- struct bkey_s_c k;
+ struct bpos pos = bkey_start_pos(&iter->k);
bch2_btree_iter_checks(iter, BTREE_ITER_KEYS);
- if (unlikely(iter->uptodate != BTREE_ITER_UPTODATE)) {
- /*
- * XXX: when we just need to relock we should be able to avoid
- * calling traverse, but we need to kill BTREE_ITER_NEED_PEEK
- * for that to work
- */
- iter->pos = btree_type_predecessor(iter->btree_id,
- iter->pos);
- iter->uptodate = BTREE_ITER_NEED_TRAVERSE;
-
- return bch2_btree_iter_peek_prev(iter);
- }
+ if (unlikely(!bkey_cmp(pos, POS_MIN)))
+ return bkey_s_c_null;
- k = __btree_iter_prev(iter, l);
- if (unlikely(!k.k))
- return btree_iter_set_pos_to_prev_leaf(iter)
- ? bch2_btree_iter_peek(iter)
- : bkey_s_c_null;
+ bch2_btree_iter_set_pos(iter, bkey_predecessor(pos));
- EBUG_ON(bkey_cmp(bkey_start_pos(k.k), iter->pos) >= 0);
- iter->pos = bkey_start_pos(k.k);
- return k;
+ return bch2_btree_iter_peek_prev(iter);
}
static inline struct bkey_s_c
@@ -1525,8 +1660,17 @@ __bch2_btree_iter_peek_slot_extents(struct btree_iter *iter)
struct bkey n;
int ret;
-recheck:
- btree_iter_advance_to_pos(iter, l, -1);
+ /* keys & holes can't span inode numbers: */
+ if (iter->pos.offset == KEY_OFFSET_MAX) {
+ if (iter->pos.inode == KEY_INODE_MAX)
+ return bkey_s_c_null;
+
+ bch2_btree_iter_set_pos(iter, bkey_successor(iter->pos));
+
+ ret = bch2_btree_iter_traverse(iter);
+ if (unlikely(ret))
+ return bkey_s_c_err(ret);
+ }
/*
* iterator is now at the correct position for inserting at iter->pos,
@@ -1540,47 +1684,17 @@ recheck:
if (k.k && bkey_cmp(bkey_start_pos(k.k), iter->pos) <= 0) {
/*
- * If there wasn't actually a hole, want the iterator to be
- * pointed at the key we found:
- *
- * XXX: actually, we shouldn't be changing the iterator here:
- * the iterator needs to be correct for inserting at iter->pos,
- * and there may be whiteouts between iter->pos and what this
- * iterator points at:
+ * We're not setting iter->uptodate because the node iterator
+ * doesn't necessarily point at the key we're returning:
*/
- l->iter = node_iter;
EBUG_ON(bkey_cmp(k.k->p, iter->pos) <= 0);
- iter->uptodate = BTREE_ITER_UPTODATE;
-
- __bch2_btree_iter_verify(iter, l->b);
+ bch2_btree_iter_verify_level(iter, 0);
return k;
}
- /*
- * If we got to the end of the node, check if we need to traverse to the
- * next node:
- */
- if (unlikely(!k.k && btree_iter_pos_after_node(iter, l->b))) {
- btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE);
- ret = bch2_btree_iter_traverse(iter);
- if (unlikely(ret))
- return bkey_s_c_err(ret);
-
- goto recheck;
- }
-
/* hole */
- /* holes can't span inode numbers: */
- if (iter->pos.offset == KEY_OFFSET_MAX) {
- if (iter->pos.inode == KEY_INODE_MAX)
- return bkey_s_c_null;
-
- iter->pos = bkey_successor(iter->pos);
- goto recheck;
- }
-
if (!k.k)
k.k = &l->b->key.k;
@@ -1598,42 +1712,33 @@ recheck:
iter->k = n;
iter->uptodate = BTREE_ITER_UPTODATE;
- __bch2_btree_iter_verify(iter, l->b);
+ bch2_btree_iter_verify_level(iter, 0);
return (struct bkey_s_c) { &iter->k, NULL };
}
-static inline struct bkey_s_c
-__bch2_btree_iter_peek_slot(struct btree_iter *iter)
+struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
{
struct btree_iter_level *l = &iter->l[0];
struct bkey_s_c k;
int ret;
+ bch2_btree_iter_checks(iter, BTREE_ITER_KEYS);
+
+ if (iter->uptodate == BTREE_ITER_UPTODATE)
+ return btree_iter_peek_uptodate(iter);
+
+ ret = bch2_btree_iter_traverse(iter);
+ if (unlikely(ret))
+ return bkey_s_c_err(ret);
+
if (iter->flags & BTREE_ITER_IS_EXTENTS)
return __bch2_btree_iter_peek_slot_extents(iter);
-recheck:
- while ((k = __btree_iter_peek_all(iter, l, &iter->k)).k &&
- bkey_deleted(k.k) &&
- bkey_cmp(k.k->p, iter->pos) == 0)
- bch2_btree_node_iter_advance(&l->iter, l->b);
-
- /*
- * If we got to the end of the node, check if we need to traverse to the
- * next node:
- */
- if (unlikely(!k.k && btree_iter_pos_after_node(iter, l->b))) {
- btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE);
- ret = bch2_btree_iter_traverse(iter);
- if (unlikely(ret))
- return bkey_s_c_err(ret);
+ k = __btree_iter_peek_all(iter, l, &iter->k);
- goto recheck;
- }
+ EBUG_ON(k.k && bkey_deleted(k.k) && bkey_cmp(k.k->p, iter->pos) == 0);
- if (!k.k ||
- bkey_deleted(k.k) ||
- bkey_cmp(iter->pos, k.k->p)) {
+ if (!k.k || bkey_cmp(iter->pos, k.k->p)) {
/* hole */
bkey_init(&iter->k);
iter->k.p = iter->pos;
@@ -1641,49 +1746,21 @@ recheck:
}
iter->uptodate = BTREE_ITER_UPTODATE;
- __bch2_btree_iter_verify(iter, l->b);
+ bch2_btree_iter_verify_level(iter, 0);
return k;
}
-struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
-{
- int ret;
-
- bch2_btree_iter_checks(iter, BTREE_ITER_KEYS);
-
- if (iter->uptodate == BTREE_ITER_UPTODATE)
- return btree_iter_peek_uptodate(iter);
-
- ret = bch2_btree_iter_traverse(iter);
- if (unlikely(ret))
- return bkey_s_c_err(ret);
-
- return __bch2_btree_iter_peek_slot(iter);
-}
-
struct bkey_s_c bch2_btree_iter_next_slot(struct btree_iter *iter)
{
- bch2_btree_iter_checks(iter, BTREE_ITER_KEYS);
-
- iter->pos = btree_type_successor(iter->btree_id, iter->k.p);
-
- if (unlikely(iter->uptodate != BTREE_ITER_UPTODATE)) {
- /*
- * XXX: when we just need to relock we should be able to avoid
- * calling traverse, but we need to kill BTREE_ITER_NEED_PEEK
- * for that to work
- */
- btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE);
-
- return bch2_btree_iter_peek_slot(iter);
- }
+ if (unlikely(!bkey_cmp(iter->k.p, POS_MAX)))
+ return bkey_s_c_null;
- if (!bkey_deleted(&iter->k))
- bch2_btree_node_iter_advance(&iter->l[0].iter, iter->l[0].b);
+ bch2_btree_iter_set_pos(iter,
+ (iter->flags & BTREE_ITER_IS_EXTENTS)
+ ? iter->k.p
+ : bkey_successor(iter->k.p));
- btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK);
-
- return __bch2_btree_iter_peek_slot(iter);
+ return bch2_btree_iter_peek_slot(iter);
}
static inline void bch2_btree_iter_init(struct btree_trans *trans,
@@ -1705,12 +1782,12 @@ static inline void bch2_btree_iter_init(struct btree_trans *trans,
iter->uptodate = BTREE_ITER_NEED_TRAVERSE;
iter->btree_id = btree_id;
iter->level = 0;
+ iter->min_depth = 0;
iter->locks_want = flags & BTREE_ITER_INTENT ? 1 : 0;
iter->nodes_locked = 0;
iter->nodes_intent_locked = 0;
for (i = 0; i < ARRAY_SIZE(iter->l); i++)
- iter->l[i].b = NULL;
- iter->l[iter->level].b = BTREE_ITER_NO_NODE_INIT;
+ iter->l[i].b = BTREE_ITER_NO_NODE_INIT;
prefetch(c->btree_roots[btree_id].b);
}
@@ -1729,7 +1806,14 @@ static inline void __bch2_trans_iter_free(struct btree_trans *trans,
int bch2_trans_iter_put(struct btree_trans *trans,
struct btree_iter *iter)
{
- int ret = btree_iter_err(iter);
+ int ret;
+
+ if (IS_ERR_OR_NULL(iter))
+ return 0;
+
+ BUG_ON(trans->iters + iter->idx != iter);
+
+ ret = btree_iter_err(iter);
if (!(trans->iters_touched & (1ULL << iter->idx)) &&
!(iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT))
@@ -1742,6 +1826,9 @@ int bch2_trans_iter_put(struct btree_trans *trans,
int bch2_trans_iter_free(struct btree_trans *trans,
struct btree_iter *iter)
{
+ if (IS_ERR_OR_NULL(iter))
+ return 0;
+
trans->iters_touched &= ~(1ULL << iter->idx);
return bch2_trans_iter_put(trans, iter);
@@ -1750,7 +1837,7 @@ int bch2_trans_iter_free(struct btree_trans *trans,
static int bch2_trans_realloc_iters(struct btree_trans *trans,
unsigned new_size)
{
- void *new_iters, *new_updates;
+ void *p, *new_iters, *new_updates, *new_updates2;
size_t iters_bytes;
size_t updates_bytes;
@@ -1768,21 +1855,27 @@ static int bch2_trans_realloc_iters(struct btree_trans *trans,
iters_bytes = sizeof(struct btree_iter) * new_size;
updates_bytes = sizeof(struct btree_insert_entry) * new_size;
- new_iters = kmalloc(iters_bytes + updates_bytes, GFP_NOFS);
- if (new_iters)
+ p = kmalloc(iters_bytes +
+ updates_bytes +
+ updates_bytes, GFP_NOFS);
+ if (p)
goto success;
- new_iters = mempool_alloc(&trans->c->btree_iters_pool, GFP_NOFS);
+ p = mempool_alloc(&trans->c->btree_iters_pool, GFP_NOFS);
new_size = BTREE_ITER_MAX;
trans->used_mempool = true;
success:
- new_updates = new_iters + iters_bytes;
+ new_iters = p; p += iters_bytes;
+ new_updates = p; p += updates_bytes;
+ new_updates2 = p; p += updates_bytes;
memcpy(new_iters, trans->iters,
sizeof(struct btree_iter) * trans->nr_iters);
memcpy(new_updates, trans->updates,
sizeof(struct btree_insert_entry) * trans->nr_updates);
+ memcpy(new_updates2, trans->updates2,
+ sizeof(struct btree_insert_entry) * trans->nr_updates2);
if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG))
memset(trans->iters, POISON_FREE,
@@ -1794,6 +1887,7 @@ success:
trans->iters = new_iters;
trans->updates = new_updates;
+ trans->updates2 = new_updates2;
trans->size = new_size;
if (trans->iters_live) {
@@ -1818,13 +1912,14 @@ static struct btree_iter *btree_trans_iter_alloc(struct btree_trans *trans)
struct btree_iter *iter;
trans_for_each_iter(trans, iter) {
- pr_err("iter: btree %s pos %llu:%llu%s%s%s",
+ pr_err("iter: btree %s pos %llu:%llu%s%s%s %pf",
bch2_btree_ids[iter->btree_id],
iter->pos.inode,
iter->pos.offset,
(trans->iters_live & (1ULL << iter->idx)) ? " live" : "",
(trans->iters_touched & (1ULL << iter->idx)) ? " touched" : "",
- iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT ? " keep" : "");
+ iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT ? " keep" : "",
+ (void *) iter->ip_allocated);
}
panic("trans iter overflow\n");
@@ -1931,15 +2026,16 @@ static struct btree_iter *__btree_trans_get_iter(struct btree_trans *trans,
return iter;
}
-struct btree_iter *bch2_trans_get_iter(struct btree_trans *trans,
- enum btree_id btree_id,
- struct bpos pos, unsigned flags)
+struct btree_iter *__bch2_trans_get_iter(struct btree_trans *trans,
+ enum btree_id btree_id,
+ struct bpos pos, unsigned flags)
{
struct btree_iter *iter =
__btree_trans_get_iter(trans, btree_id, pos, flags);
if (!IS_ERR(iter))
- bch2_btree_iter_set_pos(iter, pos);
+ __bch2_btree_iter_set_pos(iter, pos,
+ btree_node_type_is_extents(btree_id));
return iter;
}
@@ -1960,6 +2056,7 @@ struct btree_iter *bch2_trans_get_node_iter(struct btree_trans *trans,
iter->locks_want = locks_want;
iter->level = depth;
+ iter->min_depth = depth;
for (i = 0; i < ARRAY_SIZE(iter->l); i++)
iter->l[i].b = NULL;
@@ -1968,7 +2065,7 @@ struct btree_iter *bch2_trans_get_node_iter(struct btree_trans *trans,
return iter;
}
-struct btree_iter *bch2_trans_copy_iter(struct btree_trans *trans,
+struct btree_iter *__bch2_trans_copy_iter(struct btree_trans *trans,
struct btree_iter *src)
{
struct btree_iter *iter;
@@ -1981,8 +2078,8 @@ struct btree_iter *bch2_trans_copy_iter(struct btree_trans *trans,
trans->iters_live |= 1ULL << iter->idx;
/*
- * Don't mark it as touched, we don't need to preserve this iter since
- * it's cheap to copy it again:
+ * We don't need to preserve this iter since it's cheap to copy it
+ * again - this will cause trans_iter_put() to free it right away:
*/
trans->iters_touched &= ~(1ULL << iter->idx);
@@ -2049,16 +2146,12 @@ void bch2_trans_reset(struct btree_trans *trans, unsigned flags)
bch2_trans_unlink_iters(trans);
- if (flags & TRANS_RESET_ITERS)
- trans->iters_live = 0;
-
trans->iters_touched &= trans->iters_live;
trans->need_reset = 0;
trans->nr_updates = 0;
-
- if (flags & TRANS_RESET_MEM)
- trans->mem_top = 0;
+ trans->nr_updates2 = 0;
+ trans->mem_top = 0;
if (trans->fs_usage_deltas) {
trans->fs_usage_deltas->used = 0;
@@ -2077,11 +2170,18 @@ void bch2_trans_init(struct btree_trans *trans, struct bch_fs *c,
{
memset(trans, 0, offsetof(struct btree_trans, iters_onstack));
+ /*
+ * reallocating iterators currently completely breaks
+ * bch2_trans_iter_put():
+ */
+ expected_nr_iters = BTREE_ITER_MAX;
+
trans->c = c;
trans->ip = _RET_IP_;
trans->size = ARRAY_SIZE(trans->iters_onstack);
trans->iters = trans->iters_onstack;
trans->updates = trans->updates_onstack;
+ trans->updates2 = trans->updates2_onstack;
trans->fs_usage_deltas = NULL;
if (expected_nr_iters > trans->size)
@@ -2119,5 +2219,5 @@ int bch2_fs_btree_iter_init(struct bch_fs *c)
return mempool_init_kmalloc_pool(&c->btree_iters_pool, 1,
sizeof(struct btree_iter) * nr +
sizeof(struct btree_insert_entry) * nr +
- sizeof(u8) * nr);
+ sizeof(struct btree_insert_entry) * nr);
}
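Aside: the new bch2_btree_iter_peek_with_updates() path above returns whichever key comes first in key order, either the next committed key in the btree node or the next pending update in the transaction, so a transaction can observe its own uncommitted writes. A rough standalone sketch of that merge, assuming two already-sorted arrays stand in for the btree and for trans->updates2:

#include <stdio.h>
#include <stddef.h>

/* Stand-in "keys": just positions, sorted ascending (assumption). */
static const unsigned btree_keys[]  = { 1, 4, 9 };	/* committed keys */
static const unsigned update_keys[] = { 4, 7 };		/* pending updates */

/*
 * Return the next position >= pos; when both sources have a key at the
 * same position the pending update shadows the committed key.
 */
static int peek_with_updates(unsigned pos, unsigned *out)
{
	const unsigned *b = NULL, *u = NULL;
	size_t i;

	for (i = 0; i < sizeof(btree_keys) / sizeof(btree_keys[0]); i++)
		if (btree_keys[i] >= pos) { b = &btree_keys[i]; break; }
	for (i = 0; i < sizeof(update_keys) / sizeof(update_keys[0]); i++)
		if (update_keys[i] >= pos) { u = &update_keys[i]; break; }

	if (b && (!u || *b < *u)) { *out = *b; return 1; }	/* btree key first */
	if (u)			  { *out = *u; return 2; }	/* update first, or equal */
	return 0;						/* end of both sources */
}

int main(void)
{
	unsigned pos = 0, k;
	int src;

	while ((src = peek_with_updates(pos, &k))) {
		printf("%u from %s\n", k, src == 1 ? "btree" : "updates");
		pos = k + 1;
	}
	return 0;
}

Iterating from position 0 yields 1 from the btree, then 4 and 7 from the updates (the update at 4 shadowing the committed key at 4), then 9 from the btree.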
diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h
index 962380925511..6456787a8f77 100644
--- a/fs/bcachefs/btree_iter.h
+++ b/fs/bcachefs/btree_iter.h
@@ -96,11 +96,11 @@ __trans_next_iter_with_node(struct btree_trans *trans, struct btree *b,
(_iter)->idx + 1))
#ifdef CONFIG_BCACHEFS_DEBUG
-void bch2_btree_iter_verify(struct btree_iter *, struct btree *);
+void bch2_btree_trans_verify_iters(struct btree_trans *, struct btree *);
void bch2_btree_trans_verify_locks(struct btree_trans *);
#else
-static inline void bch2_btree_iter_verify(struct btree_iter *iter,
- struct btree *b) {}
+static inline void bch2_btree_trans_verify_iters(struct btree_trans *trans,
+ struct btree *b) {}
static inline void bch2_btree_trans_verify_locks(struct btree_trans *iter) {}
#endif
@@ -154,11 +154,14 @@ bch2_btree_iter_traverse(struct btree_iter *iter)
int bch2_btree_iter_traverse_all(struct btree_trans *);
struct btree *bch2_btree_iter_peek_node(struct btree_iter *);
-struct btree *bch2_btree_iter_next_node(struct btree_iter *, unsigned);
+struct btree *bch2_btree_iter_next_node(struct btree_iter *);
struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *);
struct bkey_s_c bch2_btree_iter_next(struct btree_iter *);
+struct bkey_s_c bch2_btree_iter_peek_with_updates(struct btree_iter *);
+struct bkey_s_c bch2_btree_iter_next_with_updates(struct btree_iter *);
+
struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *);
struct bkey_s_c bch2_btree_iter_prev(struct btree_iter *);
@@ -166,41 +169,14 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *);
struct bkey_s_c bch2_btree_iter_next_slot(struct btree_iter *);
void bch2_btree_iter_set_pos_same_leaf(struct btree_iter *, struct bpos);
+void __bch2_btree_iter_set_pos(struct btree_iter *, struct bpos, bool);
void bch2_btree_iter_set_pos(struct btree_iter *, struct bpos);
-static inline struct bpos btree_type_successor(enum btree_id id,
- struct bpos pos)
-{
- if (id == BTREE_ID_INODES) {
- pos.inode++;
- pos.offset = 0;
- } else if (!btree_node_type_is_extents(id)) {
- pos = bkey_successor(pos);
- }
-
- return pos;
-}
-
-static inline struct bpos btree_type_predecessor(enum btree_id id,
- struct bpos pos)
-{
- if (id == BTREE_ID_INODES) {
- --pos.inode;
- pos.offset = 0;
- } else {
- pos = bkey_predecessor(pos);
- }
-
- return pos;
-}
-
static inline int __btree_iter_cmp(enum btree_id id,
struct bpos pos,
const struct btree_iter *r)
{
- if (id != r->btree_id)
- return id < r->btree_id ? -1 : 1;
- return bkey_cmp(pos, r->pos);
+ return cmp_int(id, r->btree_id) ?: bkey_cmp(pos, r->pos);
}
static inline int btree_iter_cmp(const struct btree_iter *l,
@@ -230,7 +206,7 @@ static inline int bch2_trans_cond_resched(struct btree_trans *trans)
_start, _locks_want, _depth, _flags), \
_b = bch2_btree_iter_peek_node(_iter); \
(_b); \
- (_b) = bch2_btree_iter_next_node(_iter, _depth))
+ (_b) = bch2_btree_iter_next_node(_iter))
#define for_each_btree_node(_trans, _iter, _btree_id, _start, \
_flags, _b) \
@@ -281,23 +257,46 @@ int bch2_trans_iter_free(struct btree_trans *, struct btree_iter *);
void bch2_trans_unlink_iters(struct btree_trans *);
-struct btree_iter *bch2_trans_get_iter(struct btree_trans *, enum btree_id,
- struct bpos, unsigned);
-struct btree_iter *bch2_trans_copy_iter(struct btree_trans *,
+struct btree_iter *__bch2_trans_get_iter(struct btree_trans *, enum btree_id,
+ struct bpos, unsigned);
+
+static inline struct btree_iter *
+bch2_trans_get_iter(struct btree_trans *trans, enum btree_id btree_id,
+ struct bpos pos, unsigned flags)
+{
+ struct btree_iter *iter =
+ __bch2_trans_get_iter(trans, btree_id, pos, flags);
+
+ if (!IS_ERR(iter))
+ iter->ip_allocated = _THIS_IP_;
+ return iter;
+}
+
+struct btree_iter *__bch2_trans_copy_iter(struct btree_trans *,
struct btree_iter *);
+static inline struct btree_iter *
+bch2_trans_copy_iter(struct btree_trans *trans, struct btree_iter *src)
+{
+ struct btree_iter *iter =
+ __bch2_trans_copy_iter(trans, src);
+
+ if (!IS_ERR(iter))
+ iter->ip_allocated = _THIS_IP_;
+ return iter;
+
+}
+
struct btree_iter *bch2_trans_get_node_iter(struct btree_trans *,
enum btree_id, struct bpos,
unsigned, unsigned, unsigned);
-#define TRANS_RESET_ITERS (1 << 0)
-#define TRANS_RESET_MEM (1 << 1)
-#define TRANS_RESET_NOTRAVERSE (1 << 2)
+#define TRANS_RESET_NOTRAVERSE (1 << 0)
void bch2_trans_reset(struct btree_trans *, unsigned);
static inline void bch2_trans_begin(struct btree_trans *trans)
{
- return bch2_trans_reset(trans, TRANS_RESET_ITERS|TRANS_RESET_MEM);
+ return bch2_trans_reset(trans, 0);
}
void *bch2_trans_kmalloc(struct btree_trans *, size_t);
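Aside: __btree_iter_cmp() above now chains comparisons with the GNU "a ?: b" extension, so the first non-zero comparison decides the ordering. A small standalone illustration of the same idiom; the struct and field names here are invented for the example:

#include <stdio.h>

/* Three-way comparison of two scalars, in the spirit of cmp_int(). */
#define cmp_int(l, r)	((l) > (r) ? 1 : (l) < (r) ? -1 : 0)

struct key { int btree_id; unsigned long long inode, offset; };

/*
 * Compare by btree id first, then by position: the right-hand side of
 * "a ?: b" is only evaluated when the left-hand side is zero.
 */
static int key_cmp(const struct key *l, const struct key *r)
{
	return  cmp_int(l->btree_id, r->btree_id) ?:
		cmp_int(l->inode, r->inode) ?:
		cmp_int(l->offset, r->offset);
}

int main(void)
{
	struct key a = { 1, 10, 0 }, b = { 1, 10, 8 };

	printf("%d\n", key_cmp(&a, &b));	/* -1: same btree and inode, lower offset */
	return 0;
}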
diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h
index b7af88e05837..732cdc35aa7c 100644
--- a/fs/bcachefs/btree_types.h
+++ b/fs/bcachefs/btree_types.h
@@ -53,7 +53,6 @@ struct bset_tree {
struct btree_write {
struct journal_entry_pin journal;
- struct closure_waitlist wait;
};
struct btree_alloc {
@@ -64,9 +63,7 @@ struct btree_alloc {
struct btree {
/* Hottest entries first */
struct rhash_head hash;
-
- /* Key/pointer for this btree node */
- __BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX);
+ u64 hash_val;
struct six_lock lock;
@@ -133,6 +130,9 @@ struct btree {
#ifdef CONFIG_BCACHEFS_DEBUG
bool *expensive_debug_checks;
#endif
+
+ /* Key/pointer for this btree node */
+ __BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX);
};
struct btree_cache {
@@ -234,9 +234,10 @@ struct btree_iter {
u16 flags;
u8 idx;
- enum btree_iter_uptodate uptodate:4;
enum btree_id btree_id:4;
+ enum btree_iter_uptodate uptodate:4;
unsigned level:4,
+ min_depth:4,
locks_want:4,
nodes_locked:4,
nodes_intent_locked:4;
@@ -252,6 +253,7 @@ struct btree_iter {
* bch2_btree_iter_next_slot() can correctly advance pos.
*/
struct bkey k;
+ unsigned long ip_allocated;
};
static inline enum btree_iter_type btree_iter_type(struct btree_iter *iter)
@@ -259,6 +261,11 @@ static inline enum btree_iter_type btree_iter_type(struct btree_iter *iter)
return iter->flags & BTREE_ITER_TYPE;
}
+static inline struct btree_iter_level *iter_l(struct btree_iter *iter)
+{
+ return iter->l + iter->level;
+}
+
struct btree_insert_entry {
unsigned trigger_flags;
unsigned trans_triggers_run:1;
@@ -266,7 +273,11 @@ struct btree_insert_entry {
struct btree_iter *iter;
};
+#ifndef CONFIG_LOCKDEP
#define BTREE_ITER_MAX 64
+#else
+#define BTREE_ITER_MAX 32
+#endif
struct btree_trans {
struct bch_fs *c;
@@ -278,6 +289,7 @@ struct btree_trans {
u8 nr_iters;
u8 nr_updates;
+ u8 nr_updates2;
u8 size;
unsigned used_mempool:1;
unsigned error:1;
@@ -290,6 +302,7 @@ struct btree_trans {
struct btree_iter *iters;
struct btree_insert_entry *updates;
+ struct btree_insert_entry *updates2;
/* update path: */
struct journal_res journal_res;
@@ -303,6 +316,7 @@ struct btree_trans {
struct btree_iter iters_onstack[2];
struct btree_insert_entry updates_onstack[2];
+ struct btree_insert_entry updates2_onstack[2];
};
#define BTREE_FLAG(flag) \
@@ -534,8 +548,6 @@ static inline bool btree_node_type_needs_gc(enum btree_node_type type)
struct btree_root {
struct btree *b;
- struct btree_update *as;
-
/* On disk root - see async splits: */
__BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX);
u8 level;
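Aside: in struct btree above, a hash_val field is added near the front and the large __BKEY_PADDED key moves to the end of the struct, presumably so the fields used by btree cache lookups sit in the leading cache lines. A standalone sketch of why member order matters; the sizes and names below are made up for illustration:

#include <stdio.h>
#include <stddef.h>
#include <stdint.h>

/* Toy layouts (assumption: sizes are illustrative, not the real ones). */
struct node_key_first {
	uint64_t key[20];	/* big, rarely-touched key buffer */
	uint64_t hash_val;	/* hot: checked on every cache lookup */
	uint64_t lock;
};

struct node_key_last {
	uint64_t hash_val;	/* hot fields first ... */
	uint64_t lock;
	uint64_t key[20];	/* ... big key buffer moved to the end */
};

int main(void)
{
	printf("hash_val offset, key first: %zu\n",
	       offsetof(struct node_key_first, hash_val));	/* 160 */
	printf("hash_val offset, key last:  %zu\n",
	       offsetof(struct node_key_last, hash_val));	/* 0 */
	return 0;
}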
diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h
index 2c34bae64281..11f7d02de622 100644
--- a/fs/bcachefs/btree_update.h
+++ b/fs/bcachefs/btree_update.h
@@ -12,8 +12,7 @@ void bch2_btree_node_lock_for_insert(struct bch_fs *, struct btree *,
struct btree_iter *);
bool bch2_btree_bset_insert_key(struct btree_iter *, struct btree *,
struct btree_node_iter *, struct bkey_i *);
-void bch2_btree_journal_key(struct btree_trans *, struct btree_iter *,
- struct bkey_i *);
+void bch2_btree_add_journal_pin(struct bch_fs *, struct btree *, u64);
enum btree_insert_flags {
__BTREE_INSERT_NOUNLOCK,
@@ -59,6 +58,7 @@ enum btree_insert_flags {
int bch2_btree_delete_at(struct btree_trans *, struct btree_iter *, unsigned);
+int __bch2_btree_insert(struct btree_trans *, enum btree_id, struct bkey_i *);
int bch2_btree_insert(struct bch_fs *, enum btree_id, struct bkey_i *,
struct disk_reservation *, u64 *, int flags);
@@ -70,7 +70,7 @@ int bch2_btree_delete_range(struct bch_fs *, enum btree_id,
int bch2_btree_node_rewrite(struct bch_fs *c, struct btree_iter *,
__le64, unsigned);
int bch2_btree_node_update_key(struct bch_fs *, struct btree_iter *,
- struct btree *, struct bkey_i_btree_ptr *);
+ struct btree *, struct bkey_i *);
int bch2_trans_update(struct btree_trans *, struct btree_iter *,
struct bkey_i *, enum btree_trigger_flags);
@@ -98,17 +98,17 @@ static inline int bch2_trans_commit(struct btree_trans *trans,
return __bch2_trans_commit(trans);
}
-#define __bch2_trans_do(_trans, _disk_res, _journal_seq, \
- _flags, _reset_flags, _do) \
+#define __bch2_trans_do(_trans, _disk_res, _journal_seq, _flags, _do) \
({ \
int _ret; \
\
- do { \
- bch2_trans_reset(_trans, _reset_flags); \
- \
+ while (1) { \
_ret = (_do) ?: bch2_trans_commit(_trans, (_disk_res), \
(_journal_seq), (_flags)); \
- } while (_ret == -EINTR); \
+ if (_ret != -EINTR) \
+ break; \
+ bch2_trans_reset(_trans, 0); \
+ } \
\
_ret; \
})
@@ -120,7 +120,7 @@ static inline int bch2_trans_commit(struct btree_trans *trans,
\
bch2_trans_init(&trans, (_c), 0, 0); \
_ret = __bch2_trans_do(&trans, _disk_res, _journal_seq, _flags, \
- TRANS_RESET_MEM|TRANS_RESET_ITERS, _do); \
+ _do); \
_ret2 = bch2_trans_exit(&trans); \
\
_ret ?: _ret2; \
@@ -131,4 +131,9 @@ static inline int bch2_trans_commit(struct btree_trans *trans,
(_i) < (_trans)->updates + (_trans)->nr_updates; \
(_i)++)
+#define trans_for_each_update2(_trans, _i) \
+ for ((_i) = (_trans)->updates2; \
+ (_i) < (_trans)->updates2 + (_trans)->nr_updates2; \
+ (_i)++)
+
#endif /* _BCACHEFS_BTREE_UPDATE_H */
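Aside: __bch2_trans_do() above is now an explicit retry loop: run the update, try to commit, and on -EINTR reset the transaction and start over. A standalone sketch of that shape, with trivial stand-ins for the update and commit steps:

#include <stdio.h>
#include <errno.h>

/* Stand-ins (assumption): a commit that asks to be restarted twice. */
static int attempts;

static void trans_reset(void)  { /* would drop queued updates, reset mem */ }
static int  do_update(void)    { return 0; }
static int  trans_commit(void) { return ++attempts < 3 ? -EINTR : 0; }

/*
 * Shape of the rewritten __bch2_trans_do(): redo the whole update plus
 * commit whenever the commit returns -EINTR, resetting in between.
 */
static int trans_do(void)
{
	int ret;

	while (1) {
		ret = do_update() ?: trans_commit();
		if (ret != -EINTR)
			break;
		trans_reset();
	}
	return ret;
}

int main(void)
{
	printf("ret %d after %d attempts\n", trans_do(), attempts);
	return 0;
}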
diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c
index 748e6356f3d6..82b66a667e35 100644
--- a/fs/bcachefs/btree_update_interior.c
+++ b/fs/bcachefs/btree_update_interior.c
@@ -24,47 +24,42 @@
static void btree_node_will_make_reachable(struct btree_update *,
struct btree *);
static void btree_update_drop_new_node(struct bch_fs *, struct btree *);
-static void bch2_btree_set_root_ondisk(struct bch_fs *, struct btree *, int);
/* Debug code: */
+/*
+ * Verify that child nodes correctly span parent node's range:
+ */
static void btree_node_interior_verify(struct btree *b)
{
+#ifdef CONFIG_BCACHEFS_DEBUG
+ struct bpos next_node = b->data->min_key;
struct btree_node_iter iter;
- struct bkey_packed *k;
+ struct bkey_s_c k;
+ struct bkey_s_c_btree_ptr_v2 bp;
+ struct bkey unpacked;
BUG_ON(!b->level);
- bch2_btree_node_iter_init(&iter, b, &b->key.k.p);
-#if 1
- BUG_ON(!(k = bch2_btree_node_iter_peek(&iter, b)) ||
- bkey_cmp_left_packed(b, k, &b->key.k.p));
+ bch2_btree_node_iter_init_from_start(&iter, b);
- BUG_ON((bch2_btree_node_iter_advance(&iter, b),
- !bch2_btree_node_iter_end(&iter)));
-#else
- const char *msg;
+ while (1) {
+ k = bch2_btree_node_iter_peek_unpack(&iter, b, &unpacked);
+ if (k.k->type != KEY_TYPE_btree_ptr_v2)
+ break;
+ bp = bkey_s_c_to_btree_ptr_v2(k);
- msg = "not found";
- k = bch2_btree_node_iter_peek(&iter, b);
- if (!k)
- goto err;
+ BUG_ON(bkey_cmp(next_node, bp.v->min_key));
- msg = "isn't what it should be";
- if (bkey_cmp_left_packed(b, k, &b->key.k.p))
- goto err;
+ bch2_btree_node_iter_advance(&iter, b);
- bch2_btree_node_iter_advance(&iter, b);
+ if (bch2_btree_node_iter_end(&iter)) {
+ BUG_ON(bkey_cmp(k.k->p, b->key.k.p));
+ break;
+ }
- msg = "isn't last key";
- if (!bch2_btree_node_iter_end(&iter))
- goto err;
- return;
-err:
- bch2_dump_btree_node(b);
- printk(KERN_ERR "last key %llu:%llu %s\n", b->key.k.p.inode,
- b->key.k.p.offset, msg);
- BUG();
+ next_node = bkey_successor(k.k->p);
+ }
#endif
}
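Aside: the rewritten btree_node_interior_verify() above walks the child pointers and checks that each child's min_key is the successor of the previous child's max_key and that the last child ends at the parent's max_key, i.e. the children tile the parent's range exactly. A standalone sketch of the same check over integer ranges (assumption: inclusive [min, max] intervals stand in for struct bpos and bkey_successor()):

#include <stdio.h>
#include <assert.h>
#include <stddef.h>

/* Stand-in child pointer: covers keys in [min, max] inclusive. */
struct child { unsigned min, max; };

/* Check that the children tile [parent_min, parent_max] with no gaps or overlaps. */
static void verify_interior(unsigned parent_min, unsigned parent_max,
			    const struct child *v, size_t nr)
{
	unsigned next = parent_min;
	size_t i;

	for (i = 0; i < nr; i++) {
		assert(v[i].min == next);	/* child starts where the previous one ended */
		next = v[i].max + 1;		/* successor of this child's last key */
	}
	assert(nr && v[nr - 1].max == parent_max);	/* last child ends at the parent's max */
}

int main(void)
{
	const struct child children[] = { { 0, 9 }, { 10, 99 }, { 100, 255 } };

	verify_interior(0, 255, children, 3);
	puts("interior node ranges tile the parent range");
	return 0;
}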
@@ -260,16 +255,17 @@ void bch2_btree_node_free_inmem(struct bch_fs *c, struct btree *b,
}
static void bch2_btree_node_free_ondisk(struct bch_fs *c,
- struct pending_btree_node_free *pending)
+ struct pending_btree_node_free *pending,
+ u64 journal_seq)
{
BUG_ON(!pending->index_update_done);
bch2_mark_key(c, bkey_i_to_s_c(&pending->key),
- 0, 0, NULL, 0, BTREE_TRIGGER_OVERWRITE);
+ 0, 0, NULL, journal_seq, BTREE_TRIGGER_OVERWRITE);
if (gc_visited(c, gc_phase(GC_PHASE_PENDING_DELETE)))
bch2_mark_key(c, bkey_i_to_s_c(&pending->key),
- 0, 0, NULL, 0,
+ 0, 0, NULL, journal_seq,
BTREE_TRIGGER_OVERWRITE|
BTREE_TRIGGER_GC);
}
@@ -332,7 +328,11 @@ retry:
goto retry;
}
- bkey_btree_ptr_init(&tmp.k);
+ if (c->sb.features & (1ULL << BCH_FEATURE_btree_ptr_v2))
+ bkey_btree_ptr_v2_init(&tmp.k);
+ else
+ bkey_btree_ptr_init(&tmp.k);
+
bch2_alloc_sectors_append_ptrs(c, wp, &tmp.k, c->opts.btree_node_size);
bch2_open_bucket_get(c, wp, &ob);
@@ -354,25 +354,36 @@ static struct btree *bch2_btree_node_alloc(struct btree_update *as, unsigned lev
{
struct bch_fs *c = as->c;
struct btree *b;
+ int ret;
BUG_ON(level >= BTREE_MAX_DEPTH);
BUG_ON(!as->reserve->nr);
b = as->reserve->b[--as->reserve->nr];
- BUG_ON(bch2_btree_node_hash_insert(&c->btree_cache, b, level, as->btree_id));
-
set_btree_node_accessed(b);
set_btree_node_dirty(b);
set_btree_node_need_write(b);
bch2_bset_init_first(b, &b->data->keys);
+ b->level = level;
+ b->btree_id = as->btree_id;
+
memset(&b->nr, 0, sizeof(b->nr));
b->data->magic = cpu_to_le64(bset_magic(c));
b->data->flags = 0;
SET_BTREE_NODE_ID(b->data, as->btree_id);
SET_BTREE_NODE_LEVEL(b->data, level);
- b->data->ptr = bkey_i_to_btree_ptr(&b->key)->v.start[0];
+ b->data->ptr = bch2_bkey_ptrs_c(bkey_i_to_s_c(&b->key)).start->ptr;
+
+ if (b->key.k.type == KEY_TYPE_btree_ptr_v2) {
+ struct bkey_i_btree_ptr_v2 *bp = bkey_i_to_btree_ptr_v2(&b->key);
+
+ bp->v.mem_ptr = 0;
+ bp->v.seq = b->data->keys.seq;
+ bp->v.sectors_written = 0;
+ bp->v.sectors = cpu_to_le16(c->opts.btree_node_size);
+ }
if (c->sb.features & (1ULL << BCH_FEATURE_new_extent_overwrite))
SET_BTREE_NODE_NEW_EXTENT_OVERWRITE(b->data, true);
@@ -385,10 +396,26 @@ static struct btree *bch2_btree_node_alloc(struct btree_update *as, unsigned lev
btree_node_will_make_reachable(as, b);
+ ret = bch2_btree_node_hash_insert(&c->btree_cache, b, level, as->btree_id);
+ BUG_ON(ret);
+
trace_btree_node_alloc(c, b);
return b;
}
+static void btree_set_min(struct btree *b, struct bpos pos)
+{
+ if (b->key.k.type == KEY_TYPE_btree_ptr_v2)
+ bkey_i_to_btree_ptr_v2(&b->key)->v.min_key = pos;
+ b->data->min_key = pos;
+}
+
+static void btree_set_max(struct btree *b, struct bpos pos)
+{
+ b->key.k.p = pos;
+ b->data->max_key = pos;
+}
+
struct btree *__bch2_btree_node_alloc_replacement(struct btree_update *as,
struct btree *b,
struct bkey_format format)
@@ -397,11 +424,12 @@ struct btree *__bch2_btree_node_alloc_replacement(struct btree_update *as,
n = bch2_btree_node_alloc(as, b->level);
- n->data->min_key = b->data->min_key;
- n->data->max_key = b->data->max_key;
- n->data->format = format;
SET_BTREE_NODE_SEQ(n->data, BTREE_NODE_SEQ(b->data) + 1);
+ btree_set_min(n, b->data->min_key);
+ btree_set_max(n, b->data->max_key);
+
+ n->data->format = format;
btree_node_set_format(n, format);
bch2_btree_sort_into(as->c, n, b);
@@ -431,10 +459,9 @@ static struct btree *__btree_root_alloc(struct btree_update *as, unsigned level)
{
struct btree *b = bch2_btree_node_alloc(as, level);
- b->data->min_key = POS_MIN;
- b->data->max_key = POS_MAX;
+ btree_set_min(b, POS_MIN);
+ btree_set_max(b, POS_MAX);
b->data->format = bch2_btree_calc_format(b);
- b->key.k.p = POS_MAX;
btree_node_set_format(b, b->data->format);
bch2_btree_build_aux_trees(b);
@@ -550,43 +577,47 @@ err_free:
/* Asynchronous interior node update machinery */
-static void bch2_btree_update_free(struct btree_update *as)
+static void __bch2_btree_update_free(struct btree_update *as)
{
struct bch_fs *c = as->c;
+ bch2_journal_preres_put(&c->journal, &as->journal_preres);
+
+ bch2_journal_pin_drop(&c->journal, &as->journal);
bch2_journal_pin_flush(&c->journal, &as->journal);
- BUG_ON(as->nr_new_nodes);
- BUG_ON(as->nr_pending);
+ BUG_ON((as->nr_new_nodes || as->nr_pending) &&
+ !bch2_journal_error(&c->journal));
if (as->reserve)
bch2_btree_reserve_put(c, as->reserve);
- mutex_lock(&c->btree_interior_update_lock);
list_del(&as->list);
closure_debug_destroy(&as->cl);
mempool_free(as, &c->btree_interior_update_pool);
closure_wake_up(&c->btree_interior_update_wait);
- mutex_unlock(&c->btree_interior_update_lock);
}
-static void btree_update_nodes_reachable(struct closure *cl)
+static void bch2_btree_update_free(struct btree_update *as)
{
- struct btree_update *as = container_of(cl, struct btree_update, cl);
struct bch_fs *c = as->c;
- bch2_journal_pin_drop(&c->journal, &as->journal);
-
mutex_lock(&c->btree_interior_update_lock);
+ __bch2_btree_update_free(as);
+ mutex_unlock(&c->btree_interior_update_lock);
+}
+
+static void btree_update_nodes_reachable(struct btree_update *as, u64 seq)
+{
+ struct bch_fs *c = as->c;
while (as->nr_new_nodes) {
struct btree *b = as->new_nodes[--as->nr_new_nodes];
BUG_ON(b->will_make_reachable != (unsigned long) as);
b->will_make_reachable = 0;
- mutex_unlock(&c->btree_interior_update_lock);
/*
* b->will_make_reachable prevented it from being written, so
@@ -595,150 +626,128 @@ static void btree_update_nodes_reachable(struct closure *cl)
btree_node_lock_type(c, b, SIX_LOCK_read);
bch2_btree_node_write_cond(c, b, btree_node_need_write(b));
six_unlock_read(&b->lock);
- mutex_lock(&c->btree_interior_update_lock);
}
while (as->nr_pending)
- bch2_btree_node_free_ondisk(c, &as->pending[--as->nr_pending]);
-
- mutex_unlock(&c->btree_interior_update_lock);
-
- closure_wake_up(&as->wait);
-
- bch2_btree_update_free(as);
-}
-
-static void btree_update_wait_on_journal(struct closure *cl)
-{
- struct btree_update *as = container_of(cl, struct btree_update, cl);
- struct bch_fs *c = as->c;
- int ret;
-
- ret = bch2_journal_open_seq_async(&c->journal, as->journal_seq, cl);
- if (ret == -EAGAIN) {
- continue_at(cl, btree_update_wait_on_journal, system_wq);
- return;
- }
- if (ret < 0)
- goto err;
-
- bch2_journal_flush_seq_async(&c->journal, as->journal_seq, cl);
-err:
- continue_at(cl, btree_update_nodes_reachable, system_wq);
+ bch2_btree_node_free_ondisk(c, &as->pending[--as->nr_pending],
+ seq);
}
static void btree_update_nodes_written(struct closure *cl)
{
struct btree_update *as = container_of(cl, struct btree_update, cl);
+ struct journal_res res = { 0 };
struct bch_fs *c = as->c;
struct btree *b;
+ struct bset *i;
+ int ret;
/*
* We did an update to a parent node where the pointers we added pointed
* to child nodes that weren't written yet: now, the child nodes have
* been written so we can write out the update to the interior node.
*/
-retry:
mutex_lock(&c->btree_interior_update_lock);
as->nodes_written = true;
+again:
+ as = list_first_entry_or_null(&c->btree_interior_updates_unwritten,
+ struct btree_update, unwritten_list);
+ if (!as || !as->nodes_written) {
+ mutex_unlock(&c->btree_interior_update_lock);
+ return;
+ }
+
+ b = as->b;
+ if (b && !six_trylock_intent(&b->lock)) {
+ mutex_unlock(&c->btree_interior_update_lock);
+ btree_node_lock_type(c, b, SIX_LOCK_intent);
+ six_unlock_intent(&b->lock);
+ mutex_lock(&c->btree_interior_update_lock);
+ goto again;
+ }
+
+ list_del(&as->unwritten_list);
+
+ ret = bch2_journal_res_get(&c->journal, &res, as->journal_u64s,
+ JOURNAL_RES_GET_RESERVED);
+ if (ret) {
+ BUG_ON(!bch2_journal_error(&c->journal));
+ /* can't unblock btree writes */
+ goto free_update;
+ }
+
+ {
+ struct journal_buf *buf = &c->journal.buf[res.idx];
+ struct jset_entry *entry = vstruct_idx(buf->data, res.offset);
+
+ res.offset += as->journal_u64s;
+ res.u64s -= as->journal_u64s;
+ memcpy_u64s(entry, as->journal_entries, as->journal_u64s);
+ }
switch (as->mode) {
case BTREE_INTERIOR_NO_UPDATE:
BUG();
case BTREE_INTERIOR_UPDATING_NODE:
- /* The usual case: */
- b = READ_ONCE(as->b);
-
- if (!six_trylock_read(&b->lock)) {
- mutex_unlock(&c->btree_interior_update_lock);
- btree_node_lock_type(c, b, SIX_LOCK_read);
- six_unlock_read(&b->lock);
- goto retry;
- }
-
- BUG_ON(!btree_node_dirty(b));
- closure_wait(&btree_current_write(b)->wait, cl);
+ /* @b is the node we did the final insert into: */
+ BUG_ON(!res.ref);
+ six_lock_write(&b->lock);
list_del(&as->write_blocked_list);
- /*
- * for flush_held_btree_writes() waiting on updates to flush or
- * nodes to be writeable:
- */
- closure_wake_up(&c->btree_interior_update_wait);
- mutex_unlock(&c->btree_interior_update_lock);
+ i = btree_bset_last(b);
+ i->journal_seq = cpu_to_le64(
+ max(res.seq,
+ le64_to_cpu(i->journal_seq)));
- /*
- * b->write_blocked prevented it from being written, so
- * write it now if it needs to be written:
- */
- bch2_btree_node_write_cond(c, b, true);
- six_unlock_read(&b->lock);
+ bch2_btree_add_journal_pin(c, b, res.seq);
+ six_unlock_write(&b->lock);
break;
case BTREE_INTERIOR_UPDATING_AS:
- /*
- * The btree node we originally updated has been freed and is
- * being rewritten - so we need to write anything here, we just
- * need to signal to that btree_update that it's ok to make the
- * new replacement node visible:
- */
- closure_put(&as->parent_as->cl);
-
- /*
- * and then we have to wait on that btree_update to finish:
- */
- closure_wait(&as->parent_as->wait, cl);
- mutex_unlock(&c->btree_interior_update_lock);
+ BUG_ON(b);
break;
- case BTREE_INTERIOR_UPDATING_ROOT:
- /* b is the new btree root: */
- b = READ_ONCE(as->b);
-
- if (!six_trylock_read(&b->lock)) {
- mutex_unlock(&c->btree_interior_update_lock);
- btree_node_lock_type(c, b, SIX_LOCK_read);
- six_unlock_read(&b->lock);
- goto retry;
- }
+ case BTREE_INTERIOR_UPDATING_ROOT: {
+ struct btree_root *r = &c->btree_roots[as->btree_id];
- BUG_ON(c->btree_roots[b->btree_id].as != as);
- c->btree_roots[b->btree_id].as = NULL;
+ BUG_ON(b);
- bch2_btree_set_root_ondisk(c, b, WRITE);
+ mutex_lock(&c->btree_root_lock);
+ bkey_copy(&r->key, as->parent_keys.keys);
+ r->level = as->level;
+ r->alive = true;
+ c->btree_roots_dirty = true;
+ mutex_unlock(&c->btree_root_lock);
+ break;
+ }
+ }
- /*
- * We don't have to wait anything anything here (before
- * btree_update_nodes_reachable frees the old nodes
- * ondisk) - we've ensured that the very next journal write will
- * have the pointer to the new root, and before the allocator
- * can reuse the old nodes it'll have to do a journal commit:
- */
- six_unlock_read(&b->lock);
- mutex_unlock(&c->btree_interior_update_lock);
+ bch2_journal_pin_drop(&c->journal, &as->journal);
+ bch2_journal_res_put(&c->journal, &res);
+ bch2_journal_preres_put(&c->journal, &as->journal_preres);
+free_update:
+ /* Do btree write after dropping journal res: */
+ if (b) {
/*
- * Bit of funny circularity going on here we have to break:
- *
- * We have to drop our journal pin before writing the journal
- * entry that points to the new btree root: else, we could
- * deadlock if the journal currently happens to be full.
- *
- * This mean we're dropping the journal pin _before_ the new
- * nodes are technically reachable - but this is safe, because
- * after the bch2_btree_set_root_ondisk() call above they will
- * be reachable as of the very next journal write:
+ * b->write_blocked prevented it from being written, so
+ * write it now if it needs to be written:
*/
- bch2_journal_pin_drop(&c->journal, &as->journal);
-
- as->journal_seq = bch2_journal_last_unwritten_seq(&c->journal);
-
- btree_update_wait_on_journal(cl);
- return;
+ btree_node_write_if_need(c, b, SIX_LOCK_intent);
+ six_unlock_intent(&b->lock);
}
- continue_at(cl, btree_update_nodes_reachable, system_wq);
+ if (!ret)
+ btree_update_nodes_reachable(as, res.seq);
+
+ __bch2_btree_update_free(as);
+ /*
+ * for flush_held_btree_writes() waiting on updates to flush or
+ * nodes to be writeable:
+ */
+ closure_wake_up(&c->btree_interior_update_wait);
+ goto again;
}
/*
@@ -750,52 +759,17 @@ static void btree_update_updated_node(struct btree_update *as, struct btree *b)
struct bch_fs *c = as->c;
mutex_lock(&c->btree_interior_update_lock);
+ list_add_tail(&as->unwritten_list, &c->btree_interior_updates_unwritten);
BUG_ON(as->mode != BTREE_INTERIOR_NO_UPDATE);
BUG_ON(!btree_node_dirty(b));
- as->mode = BTREE_INTERIOR_UPDATING_NODE;
- as->b = b;
+ as->mode = BTREE_INTERIOR_UPDATING_NODE;
+ as->b = b;
+ as->level = b->level;
list_add(&as->write_blocked_list, &b->write_blocked);
mutex_unlock(&c->btree_interior_update_lock);
-
- /*
- * In general, when you're staging things in a journal that will later
- * be written elsewhere, and you also want to guarantee ordering: that
- * is, if you have updates a, b, c, after a crash you should never see c
- * and not a or b - there's a problem:
- *
- * If the final destination of the update(s) (i.e. btree node) can be
- * written/flushed _before_ the relevant journal entry - oops, that
- * breaks ordering, since the various leaf nodes can be written in any
- * order.
- *
- * Normally we use bset->journal_seq to deal with this - if during
- * recovery we find a btree node write that's newer than the newest
- * journal entry, we just ignore it - we don't need it, anything we're
- * supposed to have (that we reported as completed via fsync()) will
- * still be in the journal, and as far as the state of the journal is
- * concerned that btree node write never happened.
- *
- * That breaks when we're rewriting/splitting/merging nodes, since we're
- * mixing btree node writes that haven't happened yet with previously
- * written data that has been reported as completed to the journal.
- *
- * Thus, before making the new nodes reachable, we have to wait for the
- * newest journal sequence number we have data for to be written (if it
- * hasn't been yet).
- */
- bch2_journal_wait_on_seq(&c->journal, as->journal_seq, &as->cl);
-}
-
-static void interior_update_flush(struct journal *j,
- struct journal_entry_pin *pin, u64 seq)
-{
- struct btree_update *as =
- container_of(pin, struct btree_update, journal);
-
- bch2_journal_flush_seq_async(j, as->journal_seq, NULL);
}
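
As context for the ordering rule described in the comment removed above (a bset's journal_seq versus the newest journal entry on disk), here is a minimal standalone sketch of that recovery-time check - illustrative only, not the actual bcachefs recovery code, and the function name is invented:

#include <stdbool.h>
#include <stdint.h>

/*
 * A btree node write (bset) whose journal_seq is newer than the newest
 * journal entry on disk is ignored at recovery time: everything it
 * contained must still be replayable from the journal.
 */
static bool bset_usable_at_recovery(uint64_t bset_journal_seq,
				    uint64_t newest_journal_seq)
{
	return bset_journal_seq <= newest_journal_seq;
}

The problem the removed comment describes is that rewritten/split nodes mix keys older than that cutoff with keys that are not, which is why the old code waited on as->journal_seq before making new nodes reachable.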
static void btree_update_reparent(struct btree_update *as,
@@ -803,10 +777,10 @@ static void btree_update_reparent(struct btree_update *as,
{
struct bch_fs *c = as->c;
+ lockdep_assert_held(&c->btree_interior_update_lock);
+
child->b = NULL;
child->mode = BTREE_INTERIOR_UPDATING_AS;
- child->parent_as = as;
- closure_get(&as->cl);
/*
* When we write a new btree root, we have to drop our journal pin
@@ -817,45 +791,24 @@ static void btree_update_reparent(struct btree_update *as,
* just transfer the journal pin to the new interior update so
* btree_update_nodes_written() can drop it.
*/
- bch2_journal_pin_add_if_older(&c->journal, &child->journal,
- &as->journal, interior_update_flush);
+ bch2_journal_pin_copy(&c->journal, &as->journal, &child->journal, NULL);
bch2_journal_pin_drop(&c->journal, &child->journal);
-
- as->journal_seq = max(as->journal_seq, child->journal_seq);
}
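
The two calls above implement a pin hand-off: copy the pin to its new owner before dropping the old one, so the pinned journal sequence number is never momentarily unpinned. A hedged, in-tree-style sketch of that idiom (the helper name is invented here):

static void journal_pin_transfer(struct journal *j,
				 struct journal_entry_pin *dst,
				 struct journal_entry_pin *src)
{
	/* copy first: the pinned seq must never be left unpinned in between */
	bch2_journal_pin_copy(j, dst, src, NULL);
	bch2_journal_pin_drop(j, src);
}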
-static void btree_update_updated_root(struct btree_update *as)
+static void btree_update_updated_root(struct btree_update *as, struct btree *b)
{
struct bch_fs *c = as->c;
- struct btree_root *r = &c->btree_roots[as->btree_id];
-
- mutex_lock(&c->btree_interior_update_lock);
BUG_ON(as->mode != BTREE_INTERIOR_NO_UPDATE);
+ BUG_ON(!bch2_keylist_empty(&as->parent_keys));
- /*
- * Old root might not be persistent yet - if so, redirect its
- * btree_update operation to point to us:
- */
- if (r->as)
- btree_update_reparent(as, r->as);
-
- as->mode = BTREE_INTERIOR_UPDATING_ROOT;
- as->b = r->b;
- r->as = as;
+ mutex_lock(&c->btree_interior_update_lock);
+ list_add_tail(&as->unwritten_list, &c->btree_interior_updates_unwritten);
+ as->mode = BTREE_INTERIOR_UPDATING_ROOT;
+ as->level = b->level;
+ bch2_keylist_add(&as->parent_keys, &b->key);
mutex_unlock(&c->btree_interior_update_lock);
-
- /*
- * When we're rewriting nodes and updating interior nodes, there's an
- * issue with updates that haven't been written in the journal getting
- * mixed together with older data - see btree_update_updated_node()
- * for the explanation.
- *
- * However, this doesn't affect us when we're writing a new btree root -
- * because to make that new root reachable we have to write out a new
- * journal entry, which must necessarily be newer than as->journal_seq.
- */
}
static void btree_node_will_make_reachable(struct btree_update *as,
@@ -932,10 +885,8 @@ void bch2_btree_interior_update_will_free_node(struct btree_update *as,
struct btree *b)
{
struct bch_fs *c = as->c;
- struct closure *cl, *cl_n;
struct btree_update *p, *n;
struct btree_write *w;
- struct bset_tree *t;
set_btree_node_dying(b);
@@ -944,18 +895,6 @@ void bch2_btree_interior_update_will_free_node(struct btree_update *as,
btree_interior_update_add_node_reference(as, b);
- /*
- * Does this node have data that hasn't been written in the journal?
- *
- * If so, we have to wait for the corresponding journal entry to be
- * written before making the new nodes reachable - we can't just carry
- * over the bset->journal_seq tracking, since we'll be mixing those keys
- * in with keys that aren't in the journal anymore:
- */
- for_each_bset(b, t)
- as->journal_seq = max(as->journal_seq,
- le64_to_cpu(bset(b, t)->journal_seq));
-
mutex_lock(&c->btree_interior_update_lock);
/*
@@ -979,16 +918,6 @@ void bch2_btree_interior_update_will_free_node(struct btree_update *as,
clear_btree_node_dirty(b);
clear_btree_node_need_write(b);
- w = btree_current_write(b);
-
- /*
- * Does this node have any btree_update operations waiting on this node
- * to be written?
- *
- * If so, wake them up when this btree_update operation is reachable:
- */
- llist_for_each_entry_safe(cl, cl_n, llist_del_all(&w->wait.list), list)
- llist_add(&cl->list, &as->wait.list);
/*
* Does this node have unwritten data that has a pin on the journal?
@@ -998,13 +927,12 @@ void bch2_btree_interior_update_will_free_node(struct btree_update *as,
* oldest pin of any of the nodes we're freeing. We'll release the pin
* when the new nodes are persistent and reachable on disk:
*/
- bch2_journal_pin_add_if_older(&c->journal, &w->journal,
- &as->journal, interior_update_flush);
+ w = btree_current_write(b);
+ bch2_journal_pin_copy(&c->journal, &as->journal, &w->journal, NULL);
bch2_journal_pin_drop(&c->journal, &w->journal);
w = btree_prev_write(b);
- bch2_journal_pin_add_if_older(&c->journal, &w->journal,
- &as->journal, interior_update_flush);
+ bch2_journal_pin_copy(&c->journal, &as->journal, &w->journal, NULL);
bch2_journal_pin_drop(&c->journal, &w->journal);
mutex_unlock(&c->btree_interior_update_lock);
@@ -1021,12 +949,33 @@ void bch2_btree_update_done(struct btree_update *as)
}
struct btree_update *
-bch2_btree_update_start(struct bch_fs *c, enum btree_id id,
+bch2_btree_update_start(struct btree_trans *trans, enum btree_id id,
unsigned nr_nodes, unsigned flags,
struct closure *cl)
{
+ struct bch_fs *c = trans->c;
+ struct journal_preres journal_preres = { 0 };
struct btree_reserve *reserve;
struct btree_update *as;
+ int ret;
+
+ ret = bch2_journal_preres_get(&c->journal, &journal_preres,
+ BTREE_UPDATE_JOURNAL_RES,
+ JOURNAL_RES_GET_NONBLOCK);
+ if (ret == -EAGAIN) {
+ bch2_trans_unlock(trans);
+
+ ret = bch2_journal_preres_get(&c->journal, &journal_preres,
+ BTREE_UPDATE_JOURNAL_RES,
+ JOURNAL_RES_GET_NONBLOCK);
+ if (ret)
+ return ERR_PTR(ret);
+
+ if (!bch2_trans_relock(trans)) {
+ bch2_journal_preres_put(&c->journal, &journal_preres);
+ return ERR_PTR(-EINTR);
+ }
+ }
reserve = bch2_btree_reserve_get(c, nr_nodes, flags, cl);
if (IS_ERR(reserve))
@@ -1040,6 +989,7 @@ bch2_btree_update_start(struct bch_fs *c, enum btree_id id,
as->btree_id = id;
as->reserve = reserve;
INIT_LIST_HEAD(&as->write_blocked_list);
+ as->journal_preres = journal_preres;
bch2_keylist_init(&as->parent_keys, as->inline_keys);
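
The -EAGAIN branch above appears to be the deadlock fix referenced in the merge subject: the journal pre-reservation is first attempted non-blocking while btree locks are still held, and only after dropping those locks may we wait for journal space. A condensed sketch of the pattern, under the assumption that a flags value of 0 means a blocking bch2_journal_preres_get() (the hunk above retries with JOURNAL_RES_GET_NONBLOCK, so treat this as an illustration, not the exact code):

static int journal_preres_get_relocking(struct btree_trans *trans,
					struct journal_preres *preres,
					unsigned u64s)
{
	struct bch_fs *c = trans->c;
	int ret;

	/* cheap attempt while still holding btree locks: */
	ret = bch2_journal_preres_get(&c->journal, preres, u64s,
				      JOURNAL_RES_GET_NONBLOCK);
	if (ret != -EAGAIN)
		return ret;

	/* drop btree locks before possibly waiting on journal reclaim: */
	bch2_trans_unlock(trans);

	ret = bch2_journal_preres_get(&c->journal, preres, u64s, 0);
	if (ret)
		return ret;

	if (!bch2_trans_relock(trans)) {
		bch2_journal_preres_put(&c->journal, preres);
		return -EINTR;
	}

	return 0;
}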
@@ -1102,22 +1052,6 @@ static void bch2_btree_set_root_inmem(struct btree_update *as, struct btree *b)
mutex_unlock(&c->btree_interior_update_lock);
}
-static void bch2_btree_set_root_ondisk(struct bch_fs *c, struct btree *b, int rw)
-{
- struct btree_root *r = &c->btree_roots[b->btree_id];
-
- mutex_lock(&c->btree_root_lock);
-
- BUG_ON(b != r->b);
- bkey_copy(&r->key, &b->key);
- r->level = b->level;
- r->alive = true;
- if (rw == WRITE)
- c->btree_roots_dirty = true;
-
- mutex_unlock(&c->btree_root_lock);
-}
-
/**
* bch_btree_set_root - update the root in memory and on disk
*
@@ -1150,7 +1084,7 @@ static void bch2_btree_set_root(struct btree_update *as, struct btree *b,
bch2_btree_set_root_inmem(as, b);
- btree_update_updated_root(as);
+ btree_update_updated_root(as, b);
/*
* Unlock old root after new root is visible:
@@ -1171,10 +1105,21 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as, struct btree *b
{
struct bch_fs *c = as->c;
struct bch_fs_usage *fs_usage;
+ struct jset_entry *entry;
struct bkey_packed *k;
struct bkey tmp;
- BUG_ON(insert->k.u64s > bch_btree_keys_u64s_remaining(c, b));
+ BUG_ON(as->journal_u64s + jset_u64s(insert->k.u64s) >
+ ARRAY_SIZE(as->journal_entries));
+
+ entry = (void *) &as->journal_entries[as->journal_u64s];
+ memset(entry, 0, sizeof(*entry));
+ entry->u64s = cpu_to_le16(insert->k.u64s);
+ entry->type = BCH_JSET_ENTRY_btree_keys;
+ entry->btree_id = b->btree_id;
+ entry->level = b->level;
+ memcpy_u64s_small(entry->_data, insert, insert->k.u64s);
+ as->journal_u64s += jset_u64s(insert->k.u64s);
mutex_lock(&c->btree_interior_update_lock);
percpu_down_read(&c->mark_lock);
@@ -1263,10 +1208,8 @@ static struct btree *__btree_split_node(struct btree_update *as,
BUG_ON(!prev);
- n1->key.k.p = bkey_unpack_pos(n1, prev);
- n1->data->max_key = n1->key.k.p;
- n2->data->min_key =
- btree_type_successor(n1->btree_id, n1->key.k.p);
+ btree_set_max(n1, bkey_unpack_pos(n1, prev));
+ btree_set_min(n2, bkey_successor(n1->key.k.p));
set2->u64s = cpu_to_le16((u64 *) vstruct_end(set1) - (u64 *) k);
set1->u64s = cpu_to_le16(le16_to_cpu(set1->u64s) - le16_to_cpu(set2->u64s));
@@ -1325,6 +1268,14 @@ static void btree_split_insert_keys(struct btree_update *as, struct btree *b,
struct bkey_packed *src, *dst, *n;
struct bset *i;
+ /*
+ * XXX
+ *
+ * these updates must be journalled
+ *
+ * oops
+ */
+
BUG_ON(btree_node_type(b) != BKEY_TYPE_BTREE);
bch2_btree_node_iter_init(&node_iter, b, &k->k.p);
@@ -1332,11 +1283,6 @@ static void btree_split_insert_keys(struct btree_update *as, struct btree *b,
while (!bch2_keylist_empty(keys)) {
k = bch2_keylist_front(keys);
- BUG_ON(bch_keylist_u64s(keys) >
- bch_btree_keys_u64s_remaining(as->c, b));
- BUG_ON(bkey_cmp(k->k.p, b->data->min_key) < 0);
- BUG_ON(bkey_cmp(k->k.p, b->data->max_key) > 0);
-
bch2_insert_fixup_btree_ptr(as, b, iter, k, &node_iter);
bch2_keylist_pop_front(keys);
}
@@ -1422,7 +1368,8 @@ static void btree_split(struct btree_update *as, struct btree *b,
bch2_btree_build_aux_trees(n1);
six_unlock_write(&n1->lock);
- bch2_keylist_add(&as->parent_keys, &n1->key);
+ if (parent)
+ bch2_keylist_add(&as->parent_keys, &n1->key);
}
bch2_btree_node_write(c, n1, SIX_LOCK_intent);
@@ -1496,19 +1443,15 @@ bch2_btree_insert_keys_interior(struct btree_update *as, struct btree *b,
(bkey_cmp_packed(b, k, &insert->k) >= 0))
;
- while (!bch2_keylist_empty(keys)) {
- insert = bch2_keylist_front(keys);
-
+ for_each_keylist_key(keys, insert)
bch2_insert_fixup_btree_ptr(as, b, iter, insert, &node_iter);
- bch2_keylist_pop_front(keys);
- }
btree_update_updated_node(as, b);
trans_for_each_iter_with_node(iter->trans, b, linked)
bch2_btree_node_iter_peek(&linked->l[b->level].iter, b);
- bch2_btree_iter_verify(iter, b);
+ bch2_btree_trans_verify_iters(iter->trans, b);
}
/**
@@ -1581,7 +1524,7 @@ int bch2_btree_split_leaf(struct bch_fs *c, struct btree_iter *iter,
unsigned flags)
{
struct btree_trans *trans = iter->trans;
- struct btree *b = iter->l[0].b;
+ struct btree *b = iter_l(iter)->b;
struct btree_update *as;
struct closure cl;
int ret = 0;
@@ -1620,7 +1563,7 @@ int bch2_btree_split_leaf(struct bch_fs *c, struct btree_iter *iter,
goto out;
}
- as = bch2_btree_update_start(c, iter->btree_id,
+ as = bch2_btree_update_start(trans, iter->btree_id,
btree_update_reserve_required(c, b), flags,
!(flags & BTREE_INSERT_NOUNLOCK) ? &cl : NULL);
if (IS_ERR(as)) {
@@ -1732,7 +1675,7 @@ retry:
goto err_unlock;
}
- as = bch2_btree_update_start(c, iter->btree_id,
+ as = bch2_btree_update_start(trans, iter->btree_id,
btree_update_reserve_required(c, parent) + 1,
BTREE_INSERT_NOFAIL|
BTREE_INSERT_USE_RESERVE,
@@ -1749,10 +1692,9 @@ retry:
n = bch2_btree_node_alloc(as, b->level);
- n->data->min_key = prev->data->min_key;
- n->data->max_key = next->data->max_key;
+ btree_set_min(n, prev->data->min_key);
+ btree_set_max(n, next->data->max_key);
n->data->format = new_f;
- n->key.k.p = next->key.k.p;
btree_node_set_format(n, new_f);
@@ -1779,7 +1721,7 @@ retry:
bch2_btree_iter_node_replace(iter, n);
- bch2_btree_iter_verify(iter, n);
+ bch2_btree_trans_verify_iters(trans, n);
bch2_btree_node_free_inmem(c, b, iter);
bch2_btree_node_free_inmem(c, m, iter);
@@ -1846,7 +1788,7 @@ static int __btree_node_rewrite(struct bch_fs *c, struct btree_iter *iter,
struct btree *n, *parent = btree_node_parent(iter, b);
struct btree_update *as;
- as = bch2_btree_update_start(c, iter->btree_id,
+ as = bch2_btree_update_start(iter->trans, iter->btree_id,
(parent
? btree_update_reserve_required(c, parent)
: 0) + 1,
@@ -1944,7 +1886,7 @@ static void __bch2_btree_node_update_key(struct bch_fs *c,
struct btree_update *as,
struct btree_iter *iter,
struct btree *b, struct btree *new_hash,
- struct bkey_i_btree_ptr *new_key)
+ struct bkey_i *new_key)
{
struct btree *parent;
int ret;
@@ -1989,20 +1931,20 @@ static void __bch2_btree_node_update_key(struct bch_fs *c,
*/
ret = bch2_disk_reservation_add(c, &as->reserve->disk_res,
c->opts.btree_node_size *
- bch2_bkey_nr_ptrs(bkey_i_to_s_c(&new_key->k_i)),
+ bch2_bkey_nr_ptrs(bkey_i_to_s_c(new_key)),
BCH_DISK_RESERVATION_NOFAIL);
BUG_ON(ret);
parent = btree_node_parent(iter, b);
if (parent) {
if (new_hash) {
- bkey_copy(&new_hash->key, &new_key->k_i);
+ bkey_copy(&new_hash->key, new_key);
ret = bch2_btree_node_hash_insert(&c->btree_cache,
new_hash, b->level, b->btree_id);
BUG_ON(ret);
}
- bch2_keylist_add(&as->parent_keys, &new_key->k_i);
+ bch2_keylist_add(&as->parent_keys, new_key);
bch2_btree_insert_node(as, parent, iter, &as->parent_keys, 0);
if (new_hash) {
@@ -2011,12 +1953,12 @@ static void __bch2_btree_node_update_key(struct bch_fs *c,
bch2_btree_node_hash_remove(&c->btree_cache, b);
- bkey_copy(&b->key, &new_key->k_i);
+ bkey_copy(&b->key, new_key);
ret = __bch2_btree_node_hash_insert(&c->btree_cache, b);
BUG_ON(ret);
mutex_unlock(&c->btree_cache.lock);
} else {
- bkey_copy(&b->key, &new_key->k_i);
+ bkey_copy(&b->key, new_key);
}
} else {
struct bch_fs_usage *fs_usage;
@@ -2029,11 +1971,11 @@ static void __bch2_btree_node_update_key(struct bch_fs *c,
percpu_down_read(&c->mark_lock);
fs_usage = bch2_fs_usage_scratch_get(c);
- bch2_mark_key_locked(c, bkey_i_to_s_c(&new_key->k_i),
+ bch2_mark_key_locked(c, bkey_i_to_s_c(new_key),
0, 0, fs_usage, 0,
BTREE_TRIGGER_INSERT);
if (gc_visited(c, gc_pos_btree_root(b->btree_id)))
- bch2_mark_key_locked(c, bkey_i_to_s_c(&new_key->k_i),
+ bch2_mark_key_locked(c, bkey_i_to_s_c(new_key),
0, 0, NULL, 0,
BTREE_TRIGGER_INSERT|
BTREE_TRIGGER_GC);
@@ -2047,19 +1989,19 @@ static void __bch2_btree_node_update_key(struct bch_fs *c,
percpu_up_read(&c->mark_lock);
mutex_unlock(&c->btree_interior_update_lock);
- if (PTR_HASH(&new_key->k_i) != PTR_HASH(&b->key)) {
+ if (btree_ptr_hash_val(new_key) != b->hash_val) {
mutex_lock(&c->btree_cache.lock);
bch2_btree_node_hash_remove(&c->btree_cache, b);
- bkey_copy(&b->key, &new_key->k_i);
+ bkey_copy(&b->key, new_key);
ret = __bch2_btree_node_hash_insert(&c->btree_cache, b);
BUG_ON(ret);
mutex_unlock(&c->btree_cache.lock);
} else {
- bkey_copy(&b->key, &new_key->k_i);
+ bkey_copy(&b->key, new_key);
}
- btree_update_updated_root(as);
+ btree_update_updated_root(as, b);
bch2_btree_node_unlock_write(b, iter);
}
@@ -2068,7 +2010,7 @@ static void __bch2_btree_node_update_key(struct bch_fs *c,
int bch2_btree_node_update_key(struct bch_fs *c, struct btree_iter *iter,
struct btree *b,
- struct bkey_i_btree_ptr *new_key)
+ struct bkey_i *new_key)
{
struct btree *parent = btree_node_parent(iter, b);
struct btree_update *as = NULL;
@@ -2091,8 +2033,11 @@ int bch2_btree_node_update_key(struct bch_fs *c, struct btree_iter *iter,
}
}
- /* check PTR_HASH() after @b is locked by btree_iter_traverse(): */
- if (PTR_HASH(&new_key->k_i) != PTR_HASH(&b->key)) {
+ /*
+ * check btree_ptr_hash_val() after @b is locked by
+ * btree_iter_traverse():
+ */
+ if (btree_ptr_hash_val(new_key) != b->hash_val) {
/* bch2_btree_reserve_get will unlock */
ret = bch2_btree_cache_cannibalize_lock(c, &cl);
if (ret) {
@@ -2110,7 +2055,7 @@ int bch2_btree_node_update_key(struct bch_fs *c, struct btree_iter *iter,
new_hash = bch2_btree_node_mem_alloc(c);
}
- as = bch2_btree_update_start(c, iter->btree_id,
+ as = bch2_btree_update_start(iter->trans, iter->btree_id,
parent ? btree_update_reserve_required(c, parent) : 0,
BTREE_INSERT_NOFAIL|
BTREE_INSERT_USE_RESERVE|
@@ -2134,7 +2079,7 @@ int bch2_btree_node_update_key(struct bch_fs *c, struct btree_iter *iter,
goto err;
}
- ret = bch2_mark_bkey_replicas(c, bkey_i_to_s_c(&new_key->k_i));
+ ret = bch2_mark_bkey_replicas(c, bkey_i_to_s_c(new_key));
if (ret)
goto err_free_update;
@@ -2193,14 +2138,14 @@ void bch2_btree_root_alloc(struct bch_fs *c, enum btree_id id)
bkey_btree_ptr_init(&b->key);
b->key.k.p = POS_MAX;
- PTR_HASH(&b->key) = U64_MAX - id;
+ *((u64 *) bkey_i_to_btree_ptr(&b->key)->v.start) = U64_MAX - id;
bch2_bset_init_first(b, &b->data->keys);
bch2_btree_build_aux_trees(b);
b->data->flags = 0;
- b->data->min_key = POS_MIN;
- b->data->max_key = POS_MAX;
+ btree_set_min(b, POS_MIN);
+ btree_set_max(b, POS_MAX);
b->data->format = bch2_btree_calc_format(b);
btree_node_set_format(b, b->data->format);
diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h
index 2d8e0b7f3aaf..2fddf5d31eb9 100644
--- a/fs/bcachefs/btree_update_interior.h
+++ b/fs/bcachefs/btree_update_interior.h
@@ -32,6 +32,9 @@ struct pending_btree_node_free {
__BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX);
};
+#define BTREE_UPDATE_JOURNAL_RES \
+ ((BKEY_BTREE_PTR_U64s_MAX + 1) * (BTREE_MAX_DEPTH - 1) * 2)
+
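
A plausible reading of this constant: each of the (BTREE_MAX_DEPTH - 1) levels below the root can split, a split adds up to two new btree-pointer keys to its parent, and each key is journalled as its own entry costing BKEY_BTREE_PTR_U64s_MAX u64s for the key plus one u64 for the jset_entry header (matching what jset_u64s() is charged per insert in bch2_insert_fixup_btree_ptr()), i.e. (BKEY_BTREE_PTR_U64s_MAX + 1) * (BTREE_MAX_DEPTH - 1) * 2 u64s in total.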
/*
* Tracks an in progress split/rewrite of a btree node and the update to the
* parent node:
@@ -55,6 +58,7 @@ struct btree_update {
struct bch_fs *c;
struct list_head list;
+ struct list_head unwritten_list;
/* What kind of update are we doing? */
enum {
@@ -68,8 +72,10 @@ struct btree_update {
unsigned nodes_written:1;
enum btree_id btree_id;
+ u8 level;
struct btree_reserve *reserve;
+ struct journal_preres journal_preres;
/*
* BTREE_INTERIOR_UPDATING_NODE:
@@ -83,18 +89,6 @@ struct btree_update {
struct list_head write_blocked_list;
/*
- * BTREE_INTERIOR_UPDATING_AS: btree node we updated was freed, so
- * we're now blocking another btree_update
- * @parent_as - btree_update that's waiting on our nodes to finish
- * writing, before it can make new nodes visible on disk
- * @wait - list of child btree_updates that are waiting on this
- * btree_update to make all the new nodes visible before they can free
- * their old btree nodes
- */
- struct btree_update *parent_as;
- struct closure_waitlist wait;
-
- /*
* We may be freeing nodes that were dirty, and thus had journal entries
* pinned: we need to transfer the oldest of those pins to the
* btree_update operation, and release it when the new node(s)
@@ -102,8 +96,6 @@ struct btree_update {
*/
struct journal_entry_pin journal;
- u64 journal_seq;
-
/*
* Nodes being freed:
* Protected by c->btree_node_pending_free_lock
@@ -115,6 +107,9 @@ struct btree_update {
struct btree *new_nodes[BTREE_MAX_DEPTH * 2 + GC_MERGE_NODES];
unsigned nr_new_nodes;
+ unsigned journal_u64s;
+ u64 journal_entries[BTREE_UPDATE_JOURNAL_RES];
+
/* Only here to reduce stack usage on recursive splits: */
struct keylist parent_keys;
/*
@@ -139,7 +134,7 @@ struct btree *__bch2_btree_node_alloc_replacement(struct btree_update *,
void bch2_btree_update_done(struct btree_update *);
struct btree_update *
-bch2_btree_update_start(struct bch_fs *, enum btree_id, unsigned,
+bch2_btree_update_start(struct btree_trans *, enum btree_id, unsigned,
unsigned, struct closure *);
void bch2_btree_interior_update_will_free_node(struct btree_update *,
@@ -302,18 +297,23 @@ static inline struct btree_node_entry *want_new_bset(struct bch_fs *c,
}
static inline void push_whiteout(struct bch_fs *c, struct btree *b,
- struct bkey_packed *k)
+ struct bpos pos)
{
- unsigned u64s = bkeyp_key_u64s(&b->format, k);
- struct bkey_packed *dst;
+ struct bkey_packed k;
+
+ BUG_ON(bch_btree_keys_u64s_remaining(c, b) < BKEY_U64s);
+
+ if (!bkey_pack_pos(&k, pos, b)) {
+ struct bkey *u = (void *) &k;
+
+ bkey_init(u);
+ u->p = pos;
+ }
- BUG_ON(u64s > bch_btree_keys_u64s_remaining(c, b));
+ k.needs_whiteout = true;
- b->whiteout_u64s += bkeyp_key_u64s(&b->format, k);
- dst = unwritten_whiteouts_start(c, b);
- memcpy_u64s(dst, k, u64s);
- dst->u64s = u64s;
- dst->type = KEY_TYPE_deleted;
+ b->whiteout_u64s += k.u64s;
+ bkey_copy(unwritten_whiteouts_start(c, b), &k);
}
/*
diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c
index afd2086edeff..7faf98fd2f64 100644
--- a/fs/bcachefs/btree_update_leaf.c
+++ b/fs/bcachefs/btree_update_leaf.c
@@ -23,11 +23,10 @@
static inline bool same_leaf_as_prev(struct btree_trans *trans,
struct btree_insert_entry *i)
{
- return i != trans->updates &&
- i[0].iter->l[0].b == i[-1].iter->l[0].b;
+ return i != trans->updates2 &&
+ iter_l(i[0].iter)->b == iter_l(i[-1].iter)->b;
}
-
inline void bch2_btree_node_lock_for_insert(struct bch_fs *c, struct btree *b,
struct btree_iter *iter)
{
@@ -53,45 +52,45 @@ bool bch2_btree_bset_insert_key(struct btree_iter *iter,
struct btree_node_iter *node_iter,
struct bkey_i *insert)
{
- const struct bkey_format *f = &b->format;
struct bkey_packed *k;
- unsigned clobber_u64s;
+ unsigned clobber_u64s = 0, new_u64s = 0;
EBUG_ON(btree_node_just_written(b));
EBUG_ON(bset_written(b, btree_bset_last(b)));
EBUG_ON(bkey_deleted(&insert->k) && bkey_val_u64s(&insert->k));
- EBUG_ON(bkey_cmp(bkey_start_pos(&insert->k), b->data->min_key) < 0 ||
- bkey_cmp(insert->k.p, b->data->max_key) > 0);
+ EBUG_ON(bkey_cmp(b->data->min_key, POS_MIN) &&
+ bkey_cmp(bkey_start_pos(&insert->k),
+ bkey_predecessor(b->data->min_key)) < 0);
+ EBUG_ON(bkey_cmp(insert->k.p, b->data->min_key) < 0);
+ EBUG_ON(bkey_cmp(insert->k.p, b->data->max_key) > 0);
+ EBUG_ON(insert->k.u64s >
+ bch_btree_keys_u64s_remaining(iter->trans->c, b));
+ EBUG_ON(iter->flags & BTREE_ITER_IS_EXTENTS);
k = bch2_btree_node_iter_peek_all(node_iter, b);
if (k && bkey_cmp_packed(b, k, &insert->k))
k = NULL;
/* @k is the key being overwritten/deleted, if any: */
-
EBUG_ON(k && bkey_whiteout(k));
+ /* Deleting, but not found? nothing to do: */
+ if (bkey_whiteout(&insert->k) && !k)
+ return false;
+
if (bkey_whiteout(&insert->k)) {
/* Deleting: */
-
- /* Not found? Nothing to do: */
- if (!k)
- return false;
-
btree_account_key_drop(b, k);
k->type = KEY_TYPE_deleted;
- if (k->needs_whiteout) {
- push_whiteout(iter->trans->c, b, k);
- k->needs_whiteout = false;
- }
+ if (k->needs_whiteout)
+ push_whiteout(iter->trans->c, b, insert->k.p);
+ k->needs_whiteout = false;
if (k >= btree_bset_last(b)->start) {
clobber_u64s = k->u64s;
-
bch2_bset_delete(b, k, clobber_u64s);
- bch2_btree_node_iter_fix(iter, b, node_iter, k,
- clobber_u64s, 0);
+ goto fix_iter;
} else {
bch2_btree_iter_fix_key_modified(iter, b, k);
}
@@ -101,14 +100,6 @@ bool bch2_btree_bset_insert_key(struct btree_iter *iter,
if (k) {
/* Overwriting: */
- if (!bkey_written(b, k) &&
- bkey_val_u64s(&insert->k) == bkeyp_val_u64s(f, k)) {
- k->type = insert->k.type;
- memcpy_u64s(bkeyp_val(f, k), &insert->v,
- bkey_val_u64s(&insert->k));
- return true;
- }
-
btree_account_key_drop(b, k);
k->type = KEY_TYPE_deleted;
@@ -124,11 +115,13 @@ bool bch2_btree_bset_insert_key(struct btree_iter *iter,
}
k = bch2_btree_node_iter_bset_pos(node_iter, b, bset_tree_last(b));
- clobber_u64s = 0;
overwrite:
bch2_bset_insert(b, node_iter, k, insert, clobber_u64s);
- bch2_btree_node_iter_fix(iter, b, node_iter, k,
- clobber_u64s, k->u64s);
+ new_u64s = k->u64s;
+fix_iter:
+ if (clobber_u64s != new_u64s)
+ bch2_btree_node_iter_fix(iter, b, node_iter, k,
+ clobber_u64s, new_u64s);
return true;
}
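
For the delete path above, the two decisions are independent: a whiteout only needs to be pushed if the old key appears in an already-written bset (needs_whiteout), and the key can only be physically removed if it lives in the still-unwritten last bset. A trivial standalone sketch of that decision, purely illustrative and not bcachefs code:

#include <stdbool.h>

struct delete_plan {
	bool push_whiteout;	/* old key is visible in a written bset */
	bool delete_from_bset;	/* old key lives in the unwritten last bset */
};

static struct delete_plan plan_delete(bool needs_whiteout, bool in_last_bset)
{
	return (struct delete_plan) {
		.push_whiteout	  = needs_whiteout,
		.delete_from_bset = in_last_bset,
	};
}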
@@ -155,6 +148,17 @@ static void btree_node_flush1(struct journal *j, struct journal_entry_pin *pin,
return __btree_node_flush(j, pin, 1, seq);
}
+inline void bch2_btree_add_journal_pin(struct bch_fs *c,
+ struct btree *b, u64 seq)
+{
+ struct btree_write *w = btree_current_write(b);
+
+ bch2_journal_pin_add(&c->journal, seq, &w->journal,
+ btree_node_write_idx(b) == 0
+ ? btree_node_flush0
+ : btree_node_flush1);
+}
+
static inline void __btree_journal_key(struct btree_trans *trans,
enum btree_id btree_id,
struct bkey_i *insert)
@@ -176,16 +180,14 @@ static inline void __btree_journal_key(struct btree_trans *trans,
*trans->journal_seq = seq;
}
-void bch2_btree_journal_key(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bkey_i *insert)
+static void bch2_btree_journal_key(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bkey_i *insert)
{
struct bch_fs *c = trans->c;
struct journal *j = &c->journal;
- struct btree *b = iter->l[0].b;
- struct btree_write *w = btree_current_write(b);
+ struct btree *b = iter_l(iter)->b;
- EBUG_ON(iter->level || b->level);
EBUG_ON(trans->journal_res.ref !=
!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY));
@@ -195,35 +197,15 @@ void bch2_btree_journal_key(struct btree_trans *trans,
cpu_to_le64(trans->journal_res.seq);
}
- if (unlikely(!journal_pin_active(&w->journal))) {
- u64 seq = likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))
+ bch2_btree_add_journal_pin(c, b,
+ likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))
? trans->journal_res.seq
- : j->replay_journal_seq;
-
- bch2_journal_pin_add(j, seq, &w->journal,
- btree_node_write_idx(b) == 0
- ? btree_node_flush0
- : btree_node_flush1);
- }
+ : j->replay_journal_seq);
if (unlikely(!btree_node_dirty(b)))
set_btree_node_dirty(b);
}
-static void bch2_insert_fixup_key(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bkey_i *insert)
-{
- struct btree_iter_level *l = &iter->l[0];
-
- EBUG_ON(iter->level);
- EBUG_ON(insert->k.u64s >
- bch_btree_keys_u64s_remaining(trans->c, l->b));
-
- if (likely(bch2_btree_bset_insert_key(iter, l->b, &l->iter, insert)))
- bch2_btree_journal_key(trans, iter, insert);
-}
-
/**
* btree_insert_key - insert a key one key into a leaf node
*/
@@ -232,7 +214,7 @@ static void btree_insert_key_leaf(struct btree_trans *trans,
struct bkey_i *insert)
{
struct bch_fs *c = trans->c;
- struct btree *b = iter->l[0].b;
+ struct btree *b = iter_l(iter)->b;
struct bset_tree *t = bset_tree_last(b);
int old_u64s = bset_u64s(t);
int old_live_u64s = b->nr.live_u64s;
@@ -240,10 +222,8 @@ static void btree_insert_key_leaf(struct btree_trans *trans,
insert->k.needs_whiteout = false;
- if (!btree_node_is_extents(b))
- bch2_insert_fixup_key(trans, iter, insert);
- else
- bch2_insert_fixup_extent(trans, iter, insert);
+ if (likely(bch2_btree_bset_insert_key(iter, b, &iter_l(iter)->iter, insert)))
+ bch2_btree_journal_key(trans, iter, insert);
live_u64s_added = (int) b->nr.live_u64s - old_live_u64s;
u64s_added = (int) bset_u64s(t) - old_u64s;
@@ -268,14 +248,10 @@ static inline void btree_insert_entry_checks(struct btree_trans *trans,
{
struct bch_fs *c = trans->c;
- BUG_ON(iter->level);
- BUG_ON(bkey_cmp(bkey_start_pos(&insert->k), iter->pos));
- EBUG_ON((iter->flags & BTREE_ITER_IS_EXTENTS) &&
- bkey_cmp(insert->k.p, iter->l[0].b->key.k.p) > 0);
-
+ BUG_ON(bkey_cmp(insert->k.p, iter->pos));
BUG_ON(debug_check_bkeys(c) &&
- !bkey_deleted(&insert->k) &&
- bch2_bkey_invalid(c, bkey_i_to_s_c(insert), iter->btree_id));
+ bch2_bkey_invalid(c, bkey_i_to_s_c(insert),
+ __btree_node_type(iter->level, iter->btree_id)));
}
static noinline int
@@ -321,15 +297,22 @@ btree_key_can_insert(struct btree_trans *trans,
unsigned *u64s)
{
struct bch_fs *c = trans->c;
- struct btree *b = iter->l[0].b;
+ struct btree *b = iter_l(iter)->b;
static enum btree_insert_ret ret;
if (unlikely(btree_node_fake(b)))
return BTREE_INSERT_BTREE_NODE_FULL;
- ret = !btree_node_is_extents(b)
+ /*
+ * old bch2_extent_sort_fix_overlapping() algorithm won't work with new
+ * style extent updates:
+ */
+ if (unlikely(btree_node_old_extent_overwrite(b)))
+ return BTREE_INSERT_BTREE_NODE_FULL;
+
+ ret = !(iter->flags & BTREE_ITER_IS_EXTENTS)
? BTREE_INSERT_OK
- : bch2_extent_can_insert(trans, iter, insert, u64s);
+ : bch2_extent_can_insert(trans, iter, insert);
if (ret)
return ret;
@@ -369,7 +352,7 @@ static noinline void bch2_trans_mark_gc(struct btree_trans *trans)
struct btree_insert_entry *i;
trans_for_each_update(trans, i)
- if (gc_visited(c, gc_pos_btree_node(i->iter->l[0].b)))
+ if (gc_visited(c, gc_pos_btree_node(iter_l(i->iter)->b)))
bch2_mark_update(trans, i->iter, i->k, NULL,
i->trigger_flags|BTREE_TRIGGER_GC);
}
@@ -398,7 +381,7 @@ bch2_trans_commit_write_locked(struct btree_trans *trans,
prefetch(&trans->c->journal.flags);
- trans_for_each_update(trans, i) {
+ trans_for_each_update2(trans, i) {
/* Multiple inserts might go to same leaf: */
if (!same_leaf_as_prev(trans, i))
u64s = 0;
@@ -437,10 +420,10 @@ bch2_trans_commit_write_locked(struct btree_trans *trans,
if (!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY)) {
if (journal_seq_verify(c))
- trans_for_each_update(trans, i)
+ trans_for_each_update2(trans, i)
i->k->k.version.lo = trans->journal_res.seq;
else if (inject_invalid_keys(c))
- trans_for_each_update(trans, i)
+ trans_for_each_update2(trans, i)
i->k->k.version = MAX_VERSION;
}
@@ -463,7 +446,7 @@ bch2_trans_commit_write_locked(struct btree_trans *trans,
if (unlikely(c->gc_pos.phase))
bch2_trans_mark_gc(trans);
- trans_for_each_update(trans, i)
+ trans_for_each_update2(trans, i)
do_btree_insert_one(trans, i->iter, i->k);
err:
if (marking) {
@@ -484,8 +467,8 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans,
struct btree_iter *iter;
int ret;
- trans_for_each_update(trans, i)
- BUG_ON(!btree_node_intent_locked(i->iter, 0));
+ trans_for_each_update2(trans, i)
+ BUG_ON(!btree_node_intent_locked(i->iter, i->iter->level));
ret = bch2_journal_preres_get(&trans->c->journal,
&trans->journal_preres, trans->journal_preres_u64s,
@@ -512,20 +495,20 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans,
}
if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG))
- trans_for_each_update(trans, i)
+ trans_for_each_update2(trans, i)
btree_insert_entry_checks(trans, i->iter, i->k);
bch2_btree_trans_verify_locks(trans);
- trans_for_each_update(trans, i)
+ trans_for_each_update2(trans, i)
if (!same_leaf_as_prev(trans, i))
bch2_btree_node_lock_for_insert(trans->c,
- i->iter->l[0].b, i->iter);
+ iter_l(i->iter)->b, i->iter);
ret = bch2_trans_commit_write_locked(trans, stopped_at);
- trans_for_each_update(trans, i)
+ trans_for_each_update2(trans, i)
if (!same_leaf_as_prev(trans, i))
- bch2_btree_node_unlock_write_inlined(i->iter->l[0].b,
+ bch2_btree_node_unlock_write_inlined(iter_l(i->iter)->b,
i->iter);
/*
@@ -540,14 +523,14 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans,
if (trans->flags & BTREE_INSERT_NOUNLOCK)
trans->nounlock = true;
- trans_for_each_update(trans, i)
+ trans_for_each_update2(trans, i)
if (!same_leaf_as_prev(trans, i))
bch2_foreground_maybe_merge(trans->c, i->iter,
0, trans->flags);
trans->nounlock = false;
- trans_for_each_update(trans, i)
+ trans_for_each_update2(trans, i)
bch2_btree_iter_downgrade(i->iter);
return 0;
@@ -670,6 +653,135 @@ bch2_trans_commit_get_rw_cold(struct btree_trans *trans)
return 0;
}
+static void bch2_trans_update2(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bkey_i *insert)
+{
+ struct btree_insert_entry *i, n = (struct btree_insert_entry) {
+ .iter = iter, .k = insert
+ };
+
+ btree_insert_entry_checks(trans, n.iter, n.k);
+
+ BUG_ON(iter->uptodate > BTREE_ITER_NEED_PEEK);
+
+ EBUG_ON(trans->nr_updates2 >= trans->nr_iters);
+
+ iter->flags |= BTREE_ITER_KEEP_UNTIL_COMMIT;
+
+ trans_for_each_update2(trans, i) {
+ if (btree_iter_cmp(n.iter, i->iter) == 0) {
+ *i = n;
+ return;
+ }
+
+ if (btree_iter_cmp(n.iter, i->iter) <= 0)
+ break;
+ }
+
+ array_insert_item(trans->updates2, trans->nr_updates2,
+ i - trans->updates2, n);
+}
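
bch2_trans_update2() keeps trans->updates2 sorted in btree_iter_cmp() order and replaces in place when an entry for the same iterator already exists. A standalone sketch of that insert-or-replace-keeping-sorted pattern over a plain int array (illustrative only; capacity is assumed to be reserved by the caller, as the EBUG_ON on nr_updates2 above suggests):

#include <string.h>

static void sorted_insert_or_replace(int *a, unsigned *nr, int v)
{
	unsigned i;

	for (i = 0; i < *nr; i++) {
		if (a[i] == v)		/* same slot: real code overwrites the entry */
			return;
		if (a[i] > v)		/* first element past v: insert here */
			break;
	}

	memmove(&a[i + 1], &a[i], (*nr - i) * sizeof(a[0]));
	a[i] = v;
	(*nr)++;
}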
+
+static int extent_update_to_keys(struct btree_trans *trans,
+ struct btree_iter *orig_iter,
+ struct bkey_i *insert)
+{
+ struct btree_iter *iter;
+
+ if (bkey_deleted(&insert->k))
+ return 0;
+
+ iter = bch2_trans_copy_iter(trans, orig_iter);
+ if (IS_ERR(iter))
+ return PTR_ERR(iter);
+
+ iter->flags |= BTREE_ITER_INTENT;
+ __bch2_btree_iter_set_pos(iter, insert->k.p, false);
+ bch2_trans_update2(trans, iter, insert);
+ bch2_trans_iter_put(trans, iter);
+ return 0;
+}
+
+static int extent_handle_overwrites(struct btree_trans *trans,
+ enum btree_id btree_id,
+ struct bpos start, struct bpos end)
+{
+ struct btree_iter *iter = NULL, *update_iter;
+ struct bkey_i *update;
+ struct bkey_s_c k;
+ int ret = 0;
+
+ iter = bch2_trans_get_iter(trans, btree_id, start, BTREE_ITER_INTENT);
+ ret = PTR_ERR_OR_ZERO(iter);
+ if (ret)
+ return ret;
+
+ k = bch2_btree_iter_peek_with_updates(iter);
+
+ while (k.k && !(ret = bkey_err(k))) {
+ if (bkey_cmp(end, bkey_start_pos(k.k)) <= 0)
+ break;
+
+ if (bkey_cmp(bkey_start_pos(k.k), start) < 0) {
+ update_iter = bch2_trans_copy_iter(trans, iter);
+ if ((ret = PTR_ERR_OR_ZERO(update_iter)))
+ goto err;
+
+ update = bch2_trans_kmalloc(trans, bkey_bytes(k.k));
+ if ((ret = PTR_ERR_OR_ZERO(update)))
+ goto err;
+
+ bkey_reassemble(update, k);
+ bch2_cut_back(start, update);
+
+ __bch2_btree_iter_set_pos(update_iter, update->k.p, false);
+ bch2_trans_update2(trans, update_iter, update);
+ bch2_trans_iter_put(trans, update_iter);
+ }
+
+ if (bkey_cmp(k.k->p, end) > 0) {
+ update_iter = bch2_trans_copy_iter(trans, iter);
+ if ((ret = PTR_ERR_OR_ZERO(update_iter)))
+ goto err;
+
+ update = bch2_trans_kmalloc(trans, bkey_bytes(k.k));
+ if ((ret = PTR_ERR_OR_ZERO(update)))
+ goto err;
+
+ bkey_reassemble(update, k);
+ bch2_cut_front(end, update);
+
+ __bch2_btree_iter_set_pos(update_iter, update->k.p, false);
+ bch2_trans_update2(trans, update_iter, update);
+ bch2_trans_iter_put(trans, update_iter);
+ } else {
+ update_iter = bch2_trans_copy_iter(trans, iter);
+ if ((ret = PTR_ERR_OR_ZERO(update_iter)))
+ goto err;
+
+ update = bch2_trans_kmalloc(trans, sizeof(struct bkey));
+ if ((ret = PTR_ERR_OR_ZERO(update)))
+ goto err;
+
+ update->k = *k.k;
+ set_bkey_val_u64s(&update->k, 0);
+ update->k.type = KEY_TYPE_deleted;
+ update->k.size = 0;
+
+ __bch2_btree_iter_set_pos(update_iter, update->k.p, false);
+ bch2_trans_update2(trans, update_iter, update);
+ bch2_trans_iter_put(trans, update_iter);
+ }
+
+ k = bch2_btree_iter_next_with_updates(iter);
+ }
+err:
+ if (!IS_ERR_OR_NULL(iter))
+ bch2_trans_iter_put(trans, iter);
+ return ret;
+}
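
To make the control flow above concrete, here is a standalone sketch (made-up sector numbers, not bcachefs code) that mirrors the branch structure of extent_handle_overwrites() for a single existing extent [o_start, o_end) being overwritten by an update covering [start, end):

#include <stdio.h>

static void fragment(unsigned o_start, unsigned o_end,
		     unsigned start, unsigned end)
{
	if (end <= o_start || start >= o_end)
		return;					/* no overlap */

	if (o_start < start)				/* keep the front piece */
		printf("front fragment [%u, %u)\n", o_start, start);

	if (o_end > end)				/* keep the back piece */
		printf("back fragment  [%u, %u)\n", end, o_end);
	else						/* old key's position is covered */
		printf("deleted key at old end pos %u\n", o_end);
}

int main(void)
{
	fragment(0, 64, 16, 48);	/* -> front [0,16), back [48,64) */
	fragment(0, 32, 16, 48);	/* -> front [0,16), deleted key at 32 */
	return 0;
}

The keys for the update itself are then queued separately by extent_update_to_keys(), positioned at insert->k.p.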
+
int __bch2_trans_commit(struct btree_trans *trans)
{
struct btree_insert_entry *i = NULL;
@@ -739,7 +851,36 @@ int __bch2_trans_commit(struct btree_trans *trans)
}
} while (trans_trigger_run);
+ /* Turn extents updates into keys: */
+ trans_for_each_update(trans, i)
+ if (i->iter->flags & BTREE_ITER_IS_EXTENTS) {
+ struct bpos start = bkey_start_pos(&i->k->k);
+
+ while (i + 1 < trans->updates + trans->nr_updates &&
+ i[0].iter->btree_id == i[1].iter->btree_id &&
+ !bkey_cmp(i[0].k->k.p, bkey_start_pos(&i[1].k->k)))
+ i++;
+
+ ret = extent_handle_overwrites(trans, i->iter->btree_id,
+ start, i->k->k.p);
+ if (ret)
+ goto out;
+ }
+
trans_for_each_update(trans, i) {
+ if (i->iter->flags & BTREE_ITER_IS_EXTENTS) {
+ ret = extent_update_to_keys(trans, i->iter, i->k);
+ if (ret)
+ goto out;
+ } else {
+ bch2_trans_update2(trans, i->iter, i->k);
+ }
+ }
+
+ trans_for_each_update2(trans, i) {
+ BUG_ON(i->iter->uptodate > BTREE_ITER_NEED_PEEK);
+ BUG_ON(i->iter->locks_want < 1);
+
u64s = jset_u64s(i->k->k.u64s);
if (0)
trans->journal_preres_u64s += u64s;
@@ -770,7 +911,7 @@ out:
if (likely(!(trans->flags & BTREE_INSERT_NOCHECK_RW)))
percpu_ref_put(&trans->c->writes);
out_noupdates:
- bch2_trans_reset(trans, TRANS_RESET_MEM|TRANS_RESET_NOTRAVERSE);
+ bch2_trans_reset(trans, !ret ? TRANS_RESET_NOTRAVERSE : 0);
return ret;
err:
@@ -788,11 +929,14 @@ int bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter,
.trigger_flags = flags, .iter = iter, .k = k
};
- EBUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&k->k)));
+ EBUG_ON(bkey_cmp(iter->pos,
+ (iter->flags & BTREE_ITER_IS_EXTENTS)
+ ? bkey_start_pos(&k->k)
+ : k->k.p));
iter->flags |= BTREE_ITER_KEEP_UNTIL_COMMIT;
- if (iter->flags & BTREE_ITER_IS_EXTENTS) {
+ if (btree_node_type_is_extents(iter->btree_id)) {
iter->pos_after_commit = k->k.p;
iter->flags |= BTREE_ITER_SET_POS_AFTER_COMMIT;
}
@@ -851,18 +995,21 @@ int bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter,
return 0;
}
-static int __bch2_btree_insert(struct btree_trans *trans,
- enum btree_id id, struct bkey_i *k)
+int __bch2_btree_insert(struct btree_trans *trans,
+ enum btree_id id, struct bkey_i *k)
{
struct btree_iter *iter;
+ int ret;
iter = bch2_trans_get_iter(trans, id, bkey_start_pos(&k->k),
BTREE_ITER_INTENT);
if (IS_ERR(iter))
return PTR_ERR(iter);
- bch2_trans_update(trans, iter, k, 0);
- return 0;
+ ret = bch2_btree_iter_traverse(iter) ?:
+ bch2_trans_update(trans, iter, k, 0);
+ bch2_trans_iter_put(trans, iter);
+ return ret;
}
/**
@@ -894,7 +1041,7 @@ retry:
bkey_cmp(iter->pos, end) < 0) {
struct bkey_i delete;
- bch2_trans_reset(trans, TRANS_RESET_MEM);
+ bch2_trans_begin(trans);
bkey_init(&delete.k);
@@ -910,7 +1057,7 @@ retry:
*/
delete.k.p = iter->pos;
- if (iter->flags & BTREE_ITER_IS_EXTENTS) {
+ if (btree_node_type_is_extents(iter->btree_id)) {
unsigned max_sectors =
KEY_SIZE_MAX & (~0 << trans->c->block_bits);
diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c
index 731b93255876..2e1df04c760d 100644
--- a/fs/bcachefs/buckets.c
+++ b/fs/bcachefs/buckets.c
@@ -1194,6 +1194,7 @@ int bch2_mark_key_locked(struct bch_fs *c,
ret = bch2_mark_alloc(c, k, fs_usage, journal_seq, flags);
break;
case KEY_TYPE_btree_ptr:
+ case KEY_TYPE_btree_ptr_v2:
sectors = !(flags & BTREE_TRIGGER_OVERWRITE)
? c->opts.btree_node_size
: -c->opts.btree_node_size;
@@ -1253,21 +1254,21 @@ inline int bch2_mark_overwrite(struct btree_trans *trans,
struct bkey_s_c old,
struct bkey_i *new,
struct bch_fs_usage *fs_usage,
- unsigned flags)
+ unsigned flags,
+ bool is_extents)
{
struct bch_fs *c = trans->c;
- struct btree *b = iter->l[0].b;
unsigned offset = 0;
- s64 sectors = 0;
+ s64 sectors = -((s64) old.k->size);
flags |= BTREE_TRIGGER_OVERWRITE;
- if (btree_node_is_extents(b)
+ if (is_extents
? bkey_cmp(new->k.p, bkey_start_pos(old.k)) <= 0
: bkey_cmp(new->k.p, old.k->p))
return 0;
- if (btree_node_is_extents(b)) {
+ if (is_extents) {
switch (bch2_extent_overlap(&new->k, old.k)) {
case BCH_EXTENT_OVERLAP_ALL:
offset = 0;
@@ -1334,13 +1335,13 @@ int bch2_mark_update(struct btree_trans *trans,
!bkey_deleted(&insert->k))
return 0;
- while ((_k = bch2_btree_node_iter_peek_filter(&node_iter, b,
- KEY_TYPE_discard))) {
+ while ((_k = bch2_btree_node_iter_peek(&node_iter, b))) {
struct bkey unpacked;
struct bkey_s_c k = bkey_disassemble(b, _k, &unpacked);
ret = bch2_mark_overwrite(trans, iter, k, insert,
- fs_usage, flags);
+ fs_usage, flags,
+ btree_node_type_is_extents(iter->btree_id));
if (ret <= 0)
break;
@@ -1380,8 +1381,7 @@ void bch2_trans_fs_usage_apply(struct btree_trans *trans,
pr_err("overlapping with");
node_iter = iter->l[0].iter;
- while ((_k = bch2_btree_node_iter_peek_filter(&node_iter, b,
- KEY_TYPE_discard))) {
+ while ((_k = bch2_btree_node_iter_peek(&node_iter, b))) {
struct bkey unpacked;
struct bkey_s_c k;
@@ -1443,8 +1443,7 @@ static int bch2_trans_mark_pointer(struct btree_trans *trans,
struct bkey_s_c k;
struct bkey_alloc_unpacked u;
struct bkey_i_alloc *a;
- u16 *dst_sectors;
- bool overflow;
+ u16 *dst_sectors, orig_sectors;
int ret;
ret = trans_get_key(trans, BTREE_ID_ALLOC,
@@ -1501,13 +1500,12 @@ static int bch2_trans_mark_pointer(struct btree_trans *trans,
dst_sectors = !p.ptr.cached
? &u.dirty_sectors
: &u.cached_sectors;
+ orig_sectors = *dst_sectors;
- overflow = checked_add(*dst_sectors, sectors);
-
- if (overflow) {
+ if (checked_add(*dst_sectors, sectors)) {
bch2_fs_inconsistent(c,
"bucket sector count overflow: %u + %lli > U16_MAX",
- *dst_sectors, sectors);
+ orig_sectors, sectors);
/* return an error indicating that we need full fsck */
ret = -EIO;
goto out;
@@ -1672,8 +1670,7 @@ static int __bch2_trans_mark_reflink_p(struct btree_trans *trans,
k.k->p.offset > idx + sectors))
goto out;
- bch2_btree_iter_set_pos(iter, bkey_start_pos(k.k));
- BUG_ON(iter->uptodate > BTREE_ITER_NEED_PEEK);
+ sectors = k.k->p.offset - idx;
r_v = bch2_trans_kmalloc(trans, bkey_bytes(k.k));
ret = PTR_ERR_OR_ZERO(r_v);
@@ -1690,9 +1687,12 @@ static int __bch2_trans_mark_reflink_p(struct btree_trans *trans,
set_bkey_val_u64s(&r_v->k, 0);
}
+ bch2_btree_iter_set_pos(iter, bkey_start_pos(k.k));
+ BUG_ON(iter->uptodate > BTREE_ITER_NEED_PEEK);
+
bch2_trans_update(trans, iter, &r_v->k_i, 0);
out:
- ret = k.k->p.offset - idx;
+ ret = sectors;
err:
bch2_trans_iter_put(trans, iter);
return ret;
@@ -1729,6 +1729,7 @@ int bch2_trans_mark_key(struct btree_trans *trans, struct bkey_s_c k,
switch (k.k->type) {
case KEY_TYPE_btree_ptr:
+ case KEY_TYPE_btree_ptr_v2:
sectors = !(flags & BTREE_TRIGGER_OVERWRITE)
? c->opts.btree_node_size
: -c->opts.btree_node_size;
@@ -1792,8 +1793,7 @@ int bch2_trans_mark_update(struct btree_trans *trans,
if (unlikely(flags & BTREE_TRIGGER_NOOVERWRITES))
return 0;
- while ((_k = bch2_btree_node_iter_peek_filter(&node_iter, b,
- KEY_TYPE_discard))) {
+ while ((_k = bch2_btree_node_iter_peek(&node_iter, b))) {
struct bkey unpacked;
struct bkey_s_c k;
unsigned offset = 0;
diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h
index 4717a1a6f568..765650ce9d0a 100644
--- a/fs/bcachefs/buckets.h
+++ b/fs/bcachefs/buckets.h
@@ -97,7 +97,8 @@ static inline struct bucket *PTR_BUCKET(struct bch_dev *ca,
static inline enum bch_data_type ptr_data_type(const struct bkey *k,
const struct bch_extent_ptr *ptr)
{
- if (k->type == KEY_TYPE_btree_ptr)
+ if (k->type == KEY_TYPE_btree_ptr ||
+ k->type == KEY_TYPE_btree_ptr_v2)
return BCH_DATA_BTREE;
return ptr->cached ? BCH_DATA_CACHED : BCH_DATA_USER;
@@ -267,7 +268,7 @@ int bch2_fs_usage_apply(struct bch_fs *, struct bch_fs_usage *,
int bch2_mark_overwrite(struct btree_trans *, struct btree_iter *,
struct bkey_s_c, struct bkey_i *,
- struct bch_fs_usage *, unsigned);
+ struct bch_fs_usage *, unsigned, bool);
int bch2_mark_update(struct btree_trans *, struct btree_iter *,
struct bkey_i *, struct bch_fs_usage *, unsigned);
diff --git a/fs/bcachefs/checksum.c b/fs/bcachefs/checksum.c
index ad6993b7565a..6f1afa4a3119 100644
--- a/fs/bcachefs/checksum.c
+++ b/fs/bcachefs/checksum.c
@@ -10,7 +10,7 @@
#include <linux/random.h>
#include <linux/scatterlist.h>
#include <crypto/algapi.h>
-#include <crypto/chacha20.h>
+#include <crypto/chacha.h>
#include <crypto/hash.h>
#include <crypto/poly1305.h>
#include <keys/user-type.h>
@@ -67,21 +67,21 @@ static u64 bch2_checksum_update(unsigned type, u64 crc, const void *data, size_t
}
}
-static inline void do_encrypt_sg(struct crypto_skcipher *tfm,
+static inline void do_encrypt_sg(struct crypto_sync_skcipher *tfm,
struct nonce nonce,
struct scatterlist *sg, size_t len)
{
- SKCIPHER_REQUEST_ON_STACK(req, tfm);
+ SYNC_SKCIPHER_REQUEST_ON_STACK(req, tfm);
int ret;
- skcipher_request_set_tfm(req, tfm);
+ skcipher_request_set_sync_tfm(req, tfm);
skcipher_request_set_crypt(req, sg, sg, len, nonce.d);
ret = crypto_skcipher_encrypt(req);
BUG_ON(ret);
}
-static inline void do_encrypt(struct crypto_skcipher *tfm,
+static inline void do_encrypt(struct crypto_sync_skcipher *tfm,
struct nonce nonce,
void *buf, size_t len)
{
@@ -94,8 +94,8 @@ static inline void do_encrypt(struct crypto_skcipher *tfm,
int bch2_chacha_encrypt_key(struct bch_key *key, struct nonce nonce,
void *buf, size_t len)
{
- struct crypto_skcipher *chacha20 =
- crypto_alloc_skcipher("chacha20", 0, 0);
+ struct crypto_sync_skcipher *chacha20 =
+ crypto_alloc_sync_skcipher("chacha20", 0, 0);
int ret;
if (!chacha20) {
@@ -103,7 +103,8 @@ int bch2_chacha_encrypt_key(struct bch_key *key, struct nonce nonce,
return PTR_ERR(chacha20);
}
- ret = crypto_skcipher_setkey(chacha20, (void *) key, sizeof(*key));
+ ret = crypto_skcipher_setkey(&chacha20->base,
+ (void *) key, sizeof(*key));
if (ret) {
pr_err("crypto_skcipher_setkey() error: %i", ret);
goto err;
@@ -111,7 +112,7 @@ int bch2_chacha_encrypt_key(struct bch_key *key, struct nonce nonce,
do_encrypt(chacha20, nonce, buf, len);
err:
- crypto_free_skcipher(chacha20);
+ crypto_free_sync_skcipher(chacha20);
return ret;
}
@@ -198,7 +199,7 @@ static struct bch_csum __bch2_checksum_bio(struct bch_fs *c, unsigned type,
kunmap_atomic(p);
}
#else
- __bio_for_each_contig_segment(bv, bio, *iter, *iter)
+ __bio_for_each_bvec(bv, bio, *iter, *iter)
crc = bch2_checksum_update(type, crc,
page_address(bv.bv_page) + bv.bv_offset,
bv.bv_len);
@@ -223,7 +224,7 @@ static struct bch_csum __bch2_checksum_bio(struct bch_fs *c, unsigned type,
kunmap_atomic(p);
}
#else
- __bio_for_each_contig_segment(bv, bio, *iter, *iter)
+ __bio_for_each_bvec(bv, bio, *iter, *iter)
crypto_shash_update(desc,
page_address(bv.bv_page) + bv.bv_offset,
bv.bv_len);
@@ -324,7 +325,7 @@ int bch2_rechecksum_bio(struct bch_fs *c, struct bio *bio,
BUG_ON(len_a + len_b > bio_sectors(bio));
BUG_ON(crc_old.uncompressed_size != bio_sectors(bio));
- BUG_ON(crc_old.compression_type);
+ BUG_ON(crc_is_compressed(crc_old));
BUG_ON(bch2_csum_type_is_encryption(crc_old.csum_type) !=
bch2_csum_type_is_encryption(new_csum_type));
@@ -353,6 +354,7 @@ int bch2_rechecksum_bio(struct bch_fs *c, struct bio *bio,
if (i->crc)
*i->crc = (struct bch_extent_crc_unpacked) {
.csum_type = i->csum_type,
+ .compression_type = crc_old.compression_type,
.compressed_size = i->len,
.uncompressed_size = i->len,
.offset = 0,
@@ -461,7 +463,7 @@ err:
static int bch2_alloc_ciphers(struct bch_fs *c)
{
if (!c->chacha20)
- c->chacha20 = crypto_alloc_skcipher("chacha20", 0, 0);
+ c->chacha20 = crypto_alloc_sync_skcipher("chacha20", 0, 0);
if (IS_ERR(c->chacha20)) {
bch_err(c, "error requesting chacha20 module: %li",
PTR_ERR(c->chacha20));
@@ -544,7 +546,7 @@ int bch2_enable_encryption(struct bch_fs *c, bool keyed)
goto err;
}
- ret = crypto_skcipher_setkey(c->chacha20,
+ ret = crypto_skcipher_setkey(&c->chacha20->base,
(void *) &key.key, sizeof(key.key));
if (ret)
goto err;
@@ -572,7 +574,7 @@ void bch2_fs_encryption_exit(struct bch_fs *c)
if (!IS_ERR_OR_NULL(c->poly1305))
crypto_free_shash(c->poly1305);
if (!IS_ERR_OR_NULL(c->chacha20))
- crypto_free_skcipher(c->chacha20);
+ crypto_free_sync_skcipher(c->chacha20);
if (!IS_ERR_OR_NULL(c->sha256))
crypto_free_shash(c->sha256);
}
@@ -604,7 +606,7 @@ int bch2_fs_encryption_init(struct bch_fs *c)
if (ret)
goto out;
- ret = crypto_skcipher_setkey(c->chacha20,
+ ret = crypto_skcipher_setkey(&c->chacha20->base,
(void *) &key.key, sizeof(key.key));
if (ret)
goto out;
diff --git a/fs/bcachefs/checksum.h b/fs/bcachefs/checksum.h
index 1963cbfaaa05..24dee8039d57 100644
--- a/fs/bcachefs/checksum.h
+++ b/fs/bcachefs/checksum.h
@@ -7,7 +7,7 @@
#include "super-io.h"
#include <linux/crc64.h>
-#include <crypto/chacha20.h>
+#include <crypto/chacha.h>
static inline bool bch2_checksum_mergeable(unsigned type)
{
@@ -138,9 +138,9 @@ static inline bool bch2_crc_cmp(struct bch_csum l, struct bch_csum r)
/* for skipping ahead and encrypting/decrypting at an offset: */
static inline struct nonce nonce_add(struct nonce nonce, unsigned offset)
{
- EBUG_ON(offset & (CHACHA20_BLOCK_SIZE - 1));
+ EBUG_ON(offset & (CHACHA_BLOCK_SIZE - 1));
- le32_add_cpu(&nonce.d[0], offset / CHACHA20_BLOCK_SIZE);
+ le32_add_cpu(&nonce.d[0], offset / CHACHA_BLOCK_SIZE);
return nonce;
}
@@ -155,13 +155,16 @@ static inline struct nonce null_nonce(void)
static inline struct nonce extent_nonce(struct bversion version,
struct bch_extent_crc_unpacked crc)
{
- unsigned size = crc.compression_type ? crc.uncompressed_size : 0;
+ unsigned compression_type = crc_is_compressed(crc)
+ ? crc.compression_type
+ : 0;
+ unsigned size = compression_type ? crc.uncompressed_size : 0;
struct nonce nonce = (struct nonce) {{
[0] = cpu_to_le32(size << 22),
[1] = cpu_to_le32(version.lo),
[2] = cpu_to_le32(version.lo >> 32),
[3] = cpu_to_le32(version.hi|
- (crc.compression_type << 24))^BCH_NONCE_EXTENT,
+ (compression_type << 24))^BCH_NONCE_EXTENT,
}};
return nonce_add(nonce, crc.nonce << 9);
diff --git a/fs/bcachefs/compress.c b/fs/bcachefs/compress.c
index b6b4ec48dccc..0713286d7999 100644
--- a/fs/bcachefs/compress.c
+++ b/fs/bcachefs/compress.c
@@ -17,7 +17,6 @@ struct bbuf {
BB_NONE,
BB_VMAP,
BB_KMALLOC,
- BB_VMALLOC,
BB_MEMPOOL,
} type;
int rw;
@@ -33,17 +32,7 @@ static struct bbuf __bounce_alloc(struct bch_fs *c, unsigned size, int rw)
if (b)
return (struct bbuf) { .b = b, .type = BB_KMALLOC, .rw = rw };
- b = mempool_alloc(&c->compression_bounce[rw], GFP_NOWAIT);
- b = b ? page_address(b) : NULL;
- if (b)
- return (struct bbuf) { .b = b, .type = BB_MEMPOOL, .rw = rw };
-
- b = vmalloc(size);
- if (b)
- return (struct bbuf) { .b = b, .type = BB_VMALLOC, .rw = rw };
-
b = mempool_alloc(&c->compression_bounce[rw], GFP_NOIO);
- b = b ? page_address(b) : NULL;
if (b)
return (struct bbuf) { .b = b, .type = BB_MEMPOOL, .rw = rw };
@@ -66,7 +55,7 @@ static struct bbuf __bio_map_or_bounce(struct bch_fs *c, struct bio *bio,
BUG_ON(bvec_iter_sectors(start) > c->sb.encoded_extent_max);
#ifndef CONFIG_HIGHMEM
- __bio_for_each_contig_segment(bv, bio, iter, start) {
+ __bio_for_each_bvec(bv, bio, iter, start) {
if (bv.bv_len == start.bi_size)
return (struct bbuf) {
.b = page_address(bv.bv_page) + bv.bv_offset,
@@ -129,12 +118,8 @@ static void bio_unmap_or_unbounce(struct bch_fs *c, struct bbuf buf)
case BB_KMALLOC:
kfree(buf.b);
break;
- case BB_VMALLOC:
- vfree(buf.b);
- break;
case BB_MEMPOOL:
- mempool_free(virt_to_page(buf.b),
- &c->compression_bounce[buf.rw]);
+ mempool_free(buf.b, &c->compression_bounce[buf.rw]);
break;
}
}
@@ -434,7 +419,7 @@ out:
bio_unmap_or_unbounce(c, dst_data);
return compression_type;
err:
- compression_type = 0;
+ compression_type = BCH_COMPRESSION_TYPE_incompressible;
goto out;
}
@@ -561,15 +546,15 @@ static int __bch2_fs_compress_init(struct bch_fs *c, u64 features)
have_compressed:
if (!mempool_initialized(&c->compression_bounce[READ])) {
- ret = mempool_init_page_pool(&c->compression_bounce[READ],
- 1, order);
+ ret = mempool_init_kvpmalloc_pool(&c->compression_bounce[READ],
+ 1, order);
if (ret)
goto out;
}
if (!mempool_initialized(&c->compression_bounce[WRITE])) {
- ret = mempool_init_page_pool(&c->compression_bounce[WRITE],
- 1, order);
+ ret = mempool_init_kvpmalloc_pool(&c->compression_bounce[WRITE],
+ 1, order);
if (ret)
goto out;
}
diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c
index 623b6c3eda95..ae5c9fd8d9f7 100644
--- a/fs/bcachefs/dirent.c
+++ b/fs/bcachefs/dirent.c
@@ -169,12 +169,12 @@ int bch2_dirent_rename(struct btree_trans *trans,
const struct qstr *dst_name, u64 *dst_inum,
enum bch_rename_mode mode)
{
- struct btree_iter *src_iter, *dst_iter;
+ struct btree_iter *src_iter = NULL, *dst_iter = NULL;
struct bkey_s_c old_src, old_dst;
struct bkey_i_dirent *new_src = NULL, *new_dst = NULL;
struct bpos dst_pos =
POS(dst_dir, bch2_dirent_hash(dst_hash, dst_name));
- int ret;
+ int ret = 0;
*src_inum = *dst_inum = 0;
@@ -191,8 +191,10 @@ int bch2_dirent_rename(struct btree_trans *trans,
: bch2_hash_lookup(trans, bch2_dirent_hash_desc,
dst_hash, dst_dir, dst_name,
BTREE_ITER_INTENT);
- if (IS_ERR(dst_iter))
- return PTR_ERR(dst_iter);
+ ret = PTR_ERR_OR_ZERO(dst_iter);
+ if (ret)
+ goto out;
+
old_dst = bch2_btree_iter_peek_slot(dst_iter);
if (mode != BCH_RENAME)
@@ -202,15 +204,18 @@ int bch2_dirent_rename(struct btree_trans *trans,
src_iter = bch2_hash_lookup(trans, bch2_dirent_hash_desc,
src_hash, src_dir, src_name,
BTREE_ITER_INTENT);
- if (IS_ERR(src_iter))
- return PTR_ERR(src_iter);
+ ret = PTR_ERR_OR_ZERO(src_iter);
+ if (ret)
+ goto out;
+
old_src = bch2_btree_iter_peek_slot(src_iter);
*src_inum = le64_to_cpu(bkey_s_c_to_dirent(old_src).v->d_inum);
/* Create new dst key: */
new_dst = dirent_create_key(trans, 0, dst_name, 0);
- if (IS_ERR(new_dst))
- return PTR_ERR(new_dst);
+ ret = PTR_ERR_OR_ZERO(new_dst);
+ if (ret)
+ goto out;
dirent_copy_target(new_dst, bkey_s_c_to_dirent(old_src));
new_dst->k.p = dst_iter->pos;
@@ -218,15 +223,18 @@ int bch2_dirent_rename(struct btree_trans *trans,
/* Create new src key: */
if (mode == BCH_RENAME_EXCHANGE) {
new_src = dirent_create_key(trans, 0, src_name, 0);
- if (IS_ERR(new_src))
- return PTR_ERR(new_src);
+ ret = PTR_ERR_OR_ZERO(new_src);
+ if (ret)
+ goto out;
dirent_copy_target(new_src, bkey_s_c_to_dirent(old_dst));
new_src->k.p = src_iter->pos;
} else {
new_src = bch2_trans_kmalloc(trans, sizeof(struct bkey_i));
- if (IS_ERR(new_src))
- return PTR_ERR(new_src);
+ ret = PTR_ERR_OR_ZERO(new_src);
+ if (ret)
+ goto out;
+
bkey_init(&new_src->k);
new_src->k.p = src_iter->pos;
@@ -247,7 +255,7 @@ int bch2_dirent_rename(struct btree_trans *trans,
new_dst->k.p = src_iter->pos;
bch2_trans_update(trans, src_iter,
&new_dst->k_i, 0);
- return 0;
+ goto out;
} else {
/* If we're overwriting, we can't insert new_dst
* at a different slot because it has to
@@ -261,7 +269,7 @@ int bch2_dirent_rename(struct btree_trans *trans,
ret = bch2_hash_needs_whiteout(trans, bch2_dirent_hash_desc,
src_hash, src_iter);
if (ret < 0)
- return ret;
+ goto out;
if (ret)
new_src->k.type = KEY_TYPE_whiteout;
@@ -270,7 +278,10 @@ int bch2_dirent_rename(struct btree_trans *trans,
bch2_trans_update(trans, src_iter, &new_src->k_i, 0);
bch2_trans_update(trans, dst_iter, &new_dst->k_i, 0);
- return 0;
+out:
+ bch2_trans_iter_put(trans, src_iter);
+ bch2_trans_iter_put(trans, dst_iter);
+ return ret;
}
int bch2_dirent_delete_at(struct btree_trans *trans,
@@ -331,9 +342,7 @@ int bch2_empty_dir_trans(struct btree_trans *trans, u64 dir_inum)
break;
}
}
-
- if (!IS_ERR(iter))
- bch2_trans_iter_put(trans, iter);
+ bch2_trans_iter_put(trans, iter);
return ret;
}
diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c
index a49d0745c720..933945b65925 100644
--- a/fs/bcachefs/ec.c
+++ b/fs/bcachefs/ec.c
@@ -741,6 +741,8 @@ found_slot:
ret = bch2_trans_commit(&trans, NULL, NULL,
BTREE_INSERT_NOFAIL);
err:
+ bch2_trans_iter_put(&trans, iter);
+
if (ret == -EINTR)
goto retry;
@@ -802,8 +804,6 @@ static int ec_stripe_update_ptrs(struct bch_fs *c,
continue;
}
- bch2_btree_iter_set_pos(iter, bkey_start_pos(k.k));
-
dev = s->key.v.ptrs[idx].dev;
bkey_on_stack_reassemble(&sk, c, k);
@@ -818,6 +818,7 @@ static int ec_stripe_update_ptrs(struct bch_fs *c,
extent_stripe_ptr_add(e, s, ec_ptr, idx);
+ bch2_btree_iter_set_pos(iter, bkey_start_pos(&sk.k->k));
bch2_trans_update(&trans, iter, sk.k, 0);
ret = bch2_trans_commit(&trans, NULL, NULL,
@@ -1201,8 +1202,7 @@ static int __bch2_stripe_write_key(struct btree_trans *trans,
struct btree_iter *iter,
struct stripe *m,
size_t idx,
- struct bkey_i_stripe *new_key,
- unsigned flags)
+ struct bkey_i_stripe *new_key)
{
struct bch_fs *c = trans->c;
struct bkey_s_c k;
@@ -1231,9 +1231,7 @@ static int __bch2_stripe_write_key(struct btree_trans *trans,
spin_unlock(&c->ec_stripes_heap_lock);
bch2_trans_update(trans, iter, &new_key->k_i, 0);
-
- return bch2_trans_commit(trans, NULL, NULL,
- BTREE_INSERT_NOFAIL|flags);
+ return 0;
}
int bch2_stripes_write(struct bch_fs *c, unsigned flags, bool *wrote)
@@ -1257,12 +1255,10 @@ int bch2_stripes_write(struct bch_fs *c, unsigned flags, bool *wrote)
if (!m->dirty)
continue;
- do {
- bch2_trans_reset(&trans, TRANS_RESET_MEM);
-
- ret = __bch2_stripe_write_key(&trans, iter, m,
- giter.pos, new_key, flags);
- } while (ret == -EINTR);
+ ret = __bch2_trans_do(&trans, NULL, NULL,
+ BTREE_INSERT_NOFAIL|flags,
+ __bch2_stripe_write_key(&trans, iter, m,
+ giter.pos, new_key));
if (ret)
break;
@@ -1280,9 +1276,8 @@ int bch2_stripes_write(struct bch_fs *c, unsigned flags, bool *wrote)
int bch2_stripes_read(struct bch_fs *c, struct journal_keys *journal_keys)
{
struct btree_trans trans;
- struct btree_iter *btree_iter;
- struct journal_iter journal_iter;
- struct bkey_s_c btree_k, journal_k;
+ struct btree_and_journal_iter iter;
+ struct bkey_s_c k;
int ret;
ret = bch2_fs_ec_start(c);
@@ -1291,38 +1286,16 @@ int bch2_stripes_read(struct bch_fs *c, struct journal_keys *journal_keys)
bch2_trans_init(&trans, c, 0, 0);
- btree_iter = bch2_trans_get_iter(&trans, BTREE_ID_EC, POS_MIN, 0);
- journal_iter = bch2_journal_iter_init(journal_keys, BTREE_ID_EC);
+ bch2_btree_and_journal_iter_init(&iter, &trans, journal_keys,
+ BTREE_ID_EC, POS_MIN);
- btree_k = bch2_btree_iter_peek(btree_iter);
- journal_k = bch2_journal_iter_peek(&journal_iter);
- while (1) {
- bool btree;
-
- if (btree_k.k && journal_k.k) {
- int cmp = bkey_cmp(btree_k.k->p, journal_k.k->p);
-
- if (!cmp)
- btree_k = bch2_btree_iter_next(btree_iter);
- btree = cmp < 0;
- } else if (btree_k.k) {
- btree = true;
- } else if (journal_k.k) {
- btree = false;
- } else {
- break;
- }
-
- bch2_mark_key(c, btree ? btree_k : journal_k,
- 0, 0, NULL, 0,
+ while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) {
+ bch2_mark_key(c, k, 0, 0, NULL, 0,
BTREE_TRIGGER_ALLOC_READ|
BTREE_TRIGGER_NOATOMIC);
- if (btree)
- btree_k = bch2_btree_iter_next(btree_iter);
- else
- journal_k = bch2_journal_iter_next(&journal_iter);
+ bch2_btree_and_journal_iter_advance(&iter);
}
ret = bch2_trans_exit(&trans) ?: ret;
diff --git a/fs/bcachefs/ec.h b/fs/bcachefs/ec.h
index 8d9fbfd19f66..cf67abd48490 100644
--- a/fs/bcachefs/ec.h
+++ b/fs/bcachefs/ec.h
@@ -12,6 +12,7 @@ void bch2_stripe_to_text(struct printbuf *, struct bch_fs *,
#define bch2_bkey_ops_stripe (struct bkey_ops) { \
.key_invalid = bch2_stripe_invalid, \
.val_to_text = bch2_stripe_to_text, \
+ .swab = bch2_ptr_swab, \
}
static inline unsigned stripe_csums_per_device(const struct bch_stripe *s)
diff --git a/fs/bcachefs/extent_update.c b/fs/bcachefs/extent_update.c
index 846d77dc2530..2a7d913bdda3 100644
--- a/fs/bcachefs/extent_update.c
+++ b/fs/bcachefs/extent_update.c
@@ -39,6 +39,16 @@ static int count_iters_for_insert(struct btree_trans *trans,
{
int ret = 0;
+ /*
+ * The extent update path requires an _additional_ iterator for each
+ * extent we're inserting and overwriting:
+ */
+ *nr_iters += 1;
+ if (*nr_iters >= max_iters) {
+ *end = bpos_min(*end, k.k->p);
+ ret = 1;
+ }
+
switch (k.k->type) {
case KEY_TYPE_extent:
case KEY_TYPE_reflink_v:
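
The block added above charges one iterator per key visited, before the per-type cases add their own costs; once the budget is hit, *end is clamped to the current key and the caller is expected to clip the insert there and finish the remainder in a later transaction. A small worked illustration, assuming max_iters = 3 and existing extents ending at offsets 8, 16, 24, 32:

        /* k.p =  8: *nr_iters = 1
         * k.p = 16: *nr_iters = 2
         * k.p = 24: *nr_iters = 3  ->  *end = bpos_min(*end, 24), ret = 1
         * the update is truncated at offset 24; the rest is retried later. */
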
@@ -105,7 +115,9 @@ int bch2_extent_atomic_end(struct btree_iter *iter,
b = iter->l[0].b;
node_iter = iter->l[0].iter;
- BUG_ON(bkey_cmp(bkey_start_pos(&insert->k), b->data->min_key) < 0);
+ BUG_ON(bkey_cmp(b->data->min_key, POS_MIN) &&
+ bkey_cmp(bkey_start_pos(&insert->k),
+ bkey_predecessor(b->data->min_key)) < 0);
*end = bpos_min(insert->k.p, b->key.k.p);
@@ -114,8 +126,7 @@ int bch2_extent_atomic_end(struct btree_iter *iter,
if (ret < 0)
return ret;
- while ((_k = bch2_btree_node_iter_peek_filter(&node_iter, b,
- KEY_TYPE_discard))) {
+ while ((_k = bch2_btree_node_iter_peek(&node_iter, b))) {
struct bkey unpacked;
struct bkey_s_c k = bkey_disassemble(b, _k, &unpacked);
unsigned offset = 0;
@@ -167,402 +178,39 @@ int bch2_extent_is_atomic(struct bkey_i *k, struct btree_iter *iter)
enum btree_insert_ret
bch2_extent_can_insert(struct btree_trans *trans,
struct btree_iter *iter,
- struct bkey_i *insert,
- unsigned *u64s)
+ struct bkey_i *insert)
{
struct btree_iter_level *l = &iter->l[0];
struct btree_node_iter node_iter = l->iter;
struct bkey_packed *_k;
+ struct bkey_s_c k;
struct bkey unpacked;
int sectors;
- while ((_k = bch2_btree_node_iter_peek_filter(&node_iter, l->b,
- KEY_TYPE_discard))) {
- struct bkey_s_c k = bkey_disassemble(l->b, _k, &unpacked);
- enum bch_extent_overlap overlap =
- bch2_extent_overlap(&insert->k, k.k);
-
- if (bkey_cmp(bkey_start_pos(k.k), insert->k.p) >= 0)
- break;
-
- overlap = bch2_extent_overlap(&insert->k, k.k);
-
- /*
- * If we're overwriting an existing extent, we may need to emit
- * a whiteout - unless we're inserting a new extent at the same
- * position:
- */
- if (k.k->needs_whiteout &&
- (!bkey_whiteout(&insert->k) ||
- bkey_cmp(k.k->p, insert->k.p)))
- *u64s += BKEY_U64s;
-
- /*
- * If we're partially overwriting an existing extent which has
- * been written out to disk, we'll need to emit a new version of
- * that extent:
- */
- if (bkey_written(l->b, _k) &&
- overlap != BCH_EXTENT_OVERLAP_ALL)
- *u64s += _k->u64s;
-
- /* And we may be splitting an existing extent: */
- if (overlap == BCH_EXTENT_OVERLAP_MIDDLE)
- *u64s += _k->u64s;
-
- if (overlap == BCH_EXTENT_OVERLAP_MIDDLE &&
- (sectors = bch2_bkey_sectors_compressed(k))) {
- int flags = trans->flags & BTREE_INSERT_NOFAIL
- ? BCH_DISK_RESERVATION_NOFAIL : 0;
-
- switch (bch2_disk_reservation_add(trans->c,
- trans->disk_res,
- sectors, flags)) {
- case 0:
- break;
- case -ENOSPC:
- return BTREE_INSERT_ENOSPC;
- default:
- BUG();
- }
- }
-
- if (overlap == BCH_EXTENT_OVERLAP_FRONT ||
- overlap == BCH_EXTENT_OVERLAP_MIDDLE)
- break;
-
- bch2_btree_node_iter_advance(&node_iter, l->b);
- }
-
- return BTREE_INSERT_OK;
-}
-
-static void verify_extent_nonoverlapping(struct bch_fs *c,
- struct btree *b,
- struct btree_node_iter *_iter,
- struct bkey_i *insert)
-{
-#ifdef CONFIG_BCACHEFS_DEBUG
- struct btree_node_iter iter;
- struct bkey_packed *k;
- struct bkey uk;
-
- if (!expensive_debug_checks(c))
- return;
-
- iter = *_iter;
- k = bch2_btree_node_iter_prev_filter(&iter, b, KEY_TYPE_discard);
- BUG_ON(k &&
- (uk = bkey_unpack_key(b, k),
- bkey_cmp(uk.p, bkey_start_pos(&insert->k)) > 0));
-
- iter = *_iter;
- k = bch2_btree_node_iter_peek_filter(&iter, b, KEY_TYPE_discard);
-#if 0
- BUG_ON(k &&
- (uk = bkey_unpack_key(b, k),
- bkey_cmp(insert->k.p, bkey_start_pos(&uk))) > 0);
-#else
- if (k &&
- (uk = bkey_unpack_key(b, k),
- bkey_cmp(insert->k.p, bkey_start_pos(&uk))) > 0) {
- char buf1[100];
- char buf2[100];
-
- bch2_bkey_to_text(&PBUF(buf1), &insert->k);
- bch2_bkey_to_text(&PBUF(buf2), &uk);
-
- bch2_dump_btree_node(b);
- panic("insert > next :\n"
- "insert %s\n"
- "next %s\n",
- buf1, buf2);
- }
-#endif
-
-#endif
-}
-
-static void extent_bset_insert(struct bch_fs *c, struct btree_iter *iter,
- struct bkey_i *insert)
-{
- struct btree_iter_level *l = &iter->l[0];
- struct bkey_packed *k =
- bch2_btree_node_iter_bset_pos(&l->iter, l->b, bset_tree_last(l->b));
-
- BUG_ON(insert->k.u64s > bch_btree_keys_u64s_remaining(c, l->b));
-
- EBUG_ON(bkey_deleted(&insert->k) || !insert->k.size);
- verify_extent_nonoverlapping(c, l->b, &l->iter, insert);
-
- if (debug_check_bkeys(c))
- bch2_bkey_debugcheck(c, l->b, bkey_i_to_s_c(insert));
-
- bch2_bset_insert(l->b, &l->iter, k, insert, 0);
- bch2_btree_node_iter_fix(iter, l->b, &l->iter, k, 0, k->u64s);
-}
-
-static void pack_push_whiteout(struct bch_fs *c, struct btree *b,
- struct bpos pos)
-{
- struct bkey_packed k;
-
- if (!bkey_pack_pos(&k, pos, b)) {
- struct bkey_i tmp;
+ _k = bch2_btree_node_iter_peek(&node_iter, l->b);
+ if (!_k)
+ return BTREE_INSERT_OK;
- bkey_init(&tmp.k);
- tmp.k.p = pos;
- bkey_copy(&k, &tmp);
- }
+ k = bkey_disassemble(l->b, _k, &unpacked);
- k.needs_whiteout = true;
- push_whiteout(c, b, &k);
-}
+ /* Check if we're splitting a compressed extent: */
-static void
-extent_drop(struct bch_fs *c, struct btree_iter *iter,
- struct bkey_packed *_k, struct bkey_s k)
-{
- struct btree_iter_level *l = &iter->l[0];
-
- if (!bkey_whiteout(k.k))
- btree_account_key_drop(l->b, _k);
-
- k.k->size = 0;
- k.k->type = KEY_TYPE_deleted;
-
- if (!btree_node_old_extent_overwrite(l->b) &&
- k.k->needs_whiteout) {
- pack_push_whiteout(c, l->b, k.k->p);
- k.k->needs_whiteout = false;
- }
-
- if (_k >= btree_bset_last(l->b)->start) {
- unsigned u64s = _k->u64s;
-
- bch2_bset_delete(l->b, _k, _k->u64s);
- bch2_btree_node_iter_fix(iter, l->b, &l->iter, _k, u64s, 0);
- } else {
- extent_save(l->b, _k, k.k);
- bch2_btree_iter_fix_key_modified(iter, l->b, _k);
- }
-}
-
-static void
-extent_squash(struct bch_fs *c, struct btree_iter *iter,
- struct bkey_i *insert,
- struct bkey_packed *_k, struct bkey_s k,
- enum bch_extent_overlap overlap)
-{
- struct btree_iter_level *l = &iter->l[0];
- struct bkey_on_stack tmp, split;
+ if (bkey_cmp(bkey_start_pos(&insert->k), bkey_start_pos(k.k)) > 0 &&
+ bkey_cmp(insert->k.p, k.k->p) < 0 &&
+ (sectors = bch2_bkey_sectors_compressed(k))) {
+ int flags = trans->flags & BTREE_INSERT_NOFAIL
+ ? BCH_DISK_RESERVATION_NOFAIL : 0;
- bkey_on_stack_init(&tmp);
- bkey_on_stack_init(&split);
-
- if (!btree_node_old_extent_overwrite(l->b)) {
- if (!bkey_whiteout(&insert->k) &&
- !bkey_cmp(k.k->p, insert->k.p)) {
- insert->k.needs_whiteout = k.k->needs_whiteout;
- k.k->needs_whiteout = false;
- }
- } else {
- insert->k.needs_whiteout |= k.k->needs_whiteout;
- }
-
- switch (overlap) {
- case BCH_EXTENT_OVERLAP_FRONT:
- if (bkey_written(l->b, _k)) {
- bkey_on_stack_reassemble(&tmp, c, k.s_c);
- bch2_cut_front(insert->k.p, tmp.k);
-
- /*
- * needs_whiteout was propagated to new version of @k,
- * @tmp:
- */
- if (!btree_node_old_extent_overwrite(l->b))
- k.k->needs_whiteout = false;
-
- extent_drop(c, iter, _k, k);
- extent_bset_insert(c, iter, tmp.k);
- } else {
- btree_keys_account_val_delta(l->b, _k,
- bch2_cut_front_s(insert->k.p, k));
-
- extent_save(l->b, _k, k.k);
- /*
- * No need to call bset_fix_invalidated_key, start of
- * extent changed but extents are indexed by where they
- * end
- */
- bch2_btree_iter_fix_key_modified(iter, l->b, _k);
- }
- break;
- case BCH_EXTENT_OVERLAP_BACK:
- if (bkey_written(l->b, _k)) {
- bkey_on_stack_reassemble(&tmp, c, k.s_c);
- bch2_cut_back(bkey_start_pos(&insert->k), tmp.k);
-
- /*
- * @tmp has different position than @k, needs_whiteout
- * should not be propagated:
- */
- if (!btree_node_old_extent_overwrite(l->b))
- tmp.k->k.needs_whiteout = false;
-
- extent_drop(c, iter, _k, k);
- extent_bset_insert(c, iter, tmp.k);
- } else {
- /*
- * position of @k is changing, emit a whiteout if
- * needs_whiteout is set:
- */
- if (!btree_node_old_extent_overwrite(l->b) &&
- k.k->needs_whiteout) {
- pack_push_whiteout(c, l->b, k.k->p);
- k.k->needs_whiteout = false;
- }
-
- btree_keys_account_val_delta(l->b, _k,
- bch2_cut_back_s(bkey_start_pos(&insert->k), k));
- extent_save(l->b, _k, k.k);
-
- bch2_bset_fix_invalidated_key(l->b, _k);
- bch2_btree_node_iter_fix(iter, l->b, &l->iter,
- _k, _k->u64s, _k->u64s);
- }
- break;
- case BCH_EXTENT_OVERLAP_ALL:
- extent_drop(c, iter, _k, k);
- break;
- case BCH_EXTENT_OVERLAP_MIDDLE:
- bkey_on_stack_reassemble(&split, c, k.s_c);
- bch2_cut_back(bkey_start_pos(&insert->k), split.k);
-
- if (!btree_node_old_extent_overwrite(l->b))
- split.k->k.needs_whiteout = false;
-
- /* this is identical to BCH_EXTENT_OVERLAP_FRONT: */
- if (bkey_written(l->b, _k)) {
- bkey_on_stack_reassemble(&tmp, c, k.s_c);
- bch2_cut_front(insert->k.p, tmp.k);
-
- if (!btree_node_old_extent_overwrite(l->b))
- k.k->needs_whiteout = false;
-
- extent_drop(c, iter, _k, k);
- extent_bset_insert(c, iter, tmp.k);
- } else {
- btree_keys_account_val_delta(l->b, _k,
- bch2_cut_front_s(insert->k.p, k));
-
- extent_save(l->b, _k, k.k);
- bch2_btree_iter_fix_key_modified(iter, l->b, _k);
- }
-
- extent_bset_insert(c, iter, split.k);
- break;
- }
-
- bkey_on_stack_exit(&split, c);
- bkey_on_stack_exit(&tmp, c);
-}
-
-/**
- * bch_extent_insert_fixup - insert a new extent and deal with overlaps
- *
- * this may result in not actually doing the insert, or inserting some subset
- * of the insert key. For cmpxchg operations this is where that logic lives.
- *
- * All subsets of @insert that need to be inserted are inserted using
- * bch2_btree_insert_and_journal(). If @b or @res fills up, this function
- * returns false, setting @iter->pos for the prefix of @insert that actually got
- * inserted.
- *
- * BSET INVARIANTS: this function is responsible for maintaining all the
- * invariants for bsets of extents in memory. things get really hairy with 0
- * size extents
- *
- * within one bset:
- *
- * bkey_start_pos(bkey_next(k)) >= k
- * or bkey_start_offset(bkey_next(k)) >= k->offset
- *
- * i.e. strict ordering, no overlapping extents.
- *
- * multiple bsets (i.e. full btree node):
- *
- * ∀ k, j
- * k.size != 0 ∧ j.size != 0 →
- * ¬ (k > bkey_start_pos(j) ∧ k < j)
- *
- * i.e. no two overlapping keys _of nonzero size_
- *
- * We can't realistically maintain this invariant for zero size keys because of
- * the key merging done in bch2_btree_insert_key() - for two mergeable keys k, j
- * there may be another 0 size key between them in another bset, and it will
- * thus overlap with the merged key.
- *
- * In addition, the end of iter->pos indicates how much has been processed.
- * If the end of iter->pos is not the same as the end of insert, then
- * key insertion needs to continue/be retried.
- */
-void bch2_insert_fixup_extent(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bkey_i *insert)
-{
- struct bch_fs *c = trans->c;
- struct btree_iter_level *l = &iter->l[0];
- struct btree_node_iter node_iter = l->iter;
- bool do_update = !bkey_whiteout(&insert->k);
- struct bkey_packed *_k;
- struct bkey unpacked;
-
- EBUG_ON(iter->level);
- EBUG_ON(!insert->k.size);
- EBUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&insert->k)));
-
- while ((_k = bch2_btree_node_iter_peek_filter(&l->iter, l->b,
- KEY_TYPE_discard))) {
- struct bkey_s k = __bkey_disassemble(l->b, _k, &unpacked);
- enum bch_extent_overlap overlap =
- bch2_extent_overlap(&insert->k, k.k);
-
- if (bkey_cmp(bkey_start_pos(k.k), insert->k.p) >= 0)
+ switch (bch2_disk_reservation_add(trans->c, trans->disk_res,
+ sectors, flags)) {
+ case 0:
break;
-
- if (!bkey_whiteout(k.k))
- do_update = true;
-
- if (!do_update) {
- struct bpos cur_end = bpos_min(insert->k.p, k.k->p);
-
- bch2_cut_front(cur_end, insert);
- bch2_btree_iter_set_pos_same_leaf(iter, cur_end);
- } else {
- extent_squash(c, iter, insert, _k, k, overlap);
+ case -ENOSPC:
+ return BTREE_INSERT_ENOSPC;
+ default:
+ BUG();
}
-
- node_iter = l->iter;
-
- if (overlap == BCH_EXTENT_OVERLAP_FRONT ||
- overlap == BCH_EXTENT_OVERLAP_MIDDLE)
- break;
}
- l->iter = node_iter;
- bch2_btree_iter_set_pos_same_leaf(iter, insert->k.p);
-
- if (do_update) {
- if (insert->k.type == KEY_TYPE_deleted)
- insert->k.type = KEY_TYPE_discard;
-
- if (!bkey_whiteout(&insert->k) ||
- btree_node_old_extent_overwrite(l->b))
- extent_bset_insert(c, iter, insert);
-
- bch2_btree_journal_key(trans, iter, insert);
- }
-
- bch2_cut_front(insert->k.p, insert);
+ return BTREE_INSERT_OK;
}
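
With the overlap-handling machinery gone from this path, bch2_extent_can_insert() is reduced to a single concern: if the insert lands strictly inside an existing compressed extent, that extent gets split and both halves keep referencing the full compressed payload, which is why a disk reservation for the compressed sectors is taken up front. The middle-overlap test, restated on its own using the same comparisons as the new code:

        /* insert splits k iff it starts after k starts and ends before k ends */
        bool splits_k =
                bkey_cmp(bkey_start_pos(&insert->k), bkey_start_pos(k.k)) > 0 &&
                bkey_cmp(insert->k.p, k.k->p) < 0;
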
diff --git a/fs/bcachefs/extent_update.h b/fs/bcachefs/extent_update.h
index e9dc8091ba3f..38dc084627d2 100644
--- a/fs/bcachefs/extent_update.h
+++ b/fs/bcachefs/extent_update.h
@@ -11,9 +11,6 @@ int bch2_extent_is_atomic(struct bkey_i *, struct btree_iter *);
enum btree_insert_ret
bch2_extent_can_insert(struct btree_trans *, struct btree_iter *,
- struct bkey_i *, unsigned *);
-void bch2_insert_fixup_extent(struct btree_trans *,
- struct btree_iter *,
- struct bkey_i *);
+ struct bkey_i *);
#endif /* _BCACHEFS_EXTENT_UPDATE_H */
diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c
index c4b0b9e15a8f..3f66457d2272 100644
--- a/fs/bcachefs/extents.c
+++ b/fs/bcachefs/extents.c
@@ -9,6 +9,7 @@
#include "bcachefs.h"
#include "bkey_methods.h"
#include "btree_gc.h"
+#include "btree_io.h"
#include "btree_iter.h"
#include "buckets.h"
#include "checksum.h"
@@ -214,6 +215,37 @@ void bch2_btree_ptr_to_text(struct printbuf *out, struct bch_fs *c,
bch2_bkey_ptrs_to_text(out, c, k);
}
+void bch2_btree_ptr_v2_to_text(struct printbuf *out, struct bch_fs *c,
+ struct bkey_s_c k)
+{
+ struct bkey_s_c_btree_ptr_v2 bp = bkey_s_c_to_btree_ptr_v2(k);
+
+ pr_buf(out, "seq %llu sectors %u written %u min_key ",
+ le64_to_cpu(bp.v->seq),
+ le16_to_cpu(bp.v->sectors),
+ le16_to_cpu(bp.v->sectors_written));
+
+ bch2_bpos_to_text(out, bp.v->min_key);
+ pr_buf(out, " ");
+ bch2_bkey_ptrs_to_text(out, c, k);
+}
+
+void bch2_btree_ptr_v2_compat(enum btree_id btree_id, unsigned version,
+ unsigned big_endian, int write,
+ struct bkey_s k)
+{
+ struct bkey_s_btree_ptr_v2 bp = bkey_s_to_btree_ptr_v2(k);
+
+ compat_bpos(0, btree_id, version, big_endian, write, &bp.v->min_key);
+
+ if (version < bcachefs_metadata_version_inode_btree_change &&
+ btree_node_type_is_extents(btree_id) &&
+ bkey_cmp(bp.v->min_key, POS_MIN))
+ bp.v->min_key = write
+ ? bkey_predecessor(bp.v->min_key)
+ : bkey_successor(bp.v->min_key);
+}
+
/* KEY_TYPE_extent: */
const char *bch2_extent_invalid(const struct bch_fs *c, struct bkey_s_c k)
@@ -337,7 +369,7 @@ enum merge_result bch2_extent_merge(struct bch_fs *c,
if (!bch2_checksum_mergeable(crc_l.csum_type))
return BCH_MERGE_NOMERGE;
- if (crc_l.compression_type)
+ if (crc_is_compressed(crc_l))
return BCH_MERGE_NOMERGE;
if (crc_l.csum_type &&
@@ -345,7 +377,7 @@ enum merge_result bch2_extent_merge(struct bch_fs *c,
crc_r.uncompressed_size > c->sb.encoded_extent_max)
return BCH_MERGE_NOMERGE;
- if (crc_l.uncompressed_size + crc_r.uncompressed_size - 1 >
+ if (crc_l.uncompressed_size + crc_r.uncompressed_size >
bch2_crc_field_size_max[extent_entry_type(en_l)])
return BCH_MERGE_NOMERGE;
@@ -448,7 +480,7 @@ static inline bool bch2_crc_unpacked_cmp(struct bch_extent_crc_unpacked l,
static inline bool can_narrow_crc(struct bch_extent_crc_unpacked u,
struct bch_extent_crc_unpacked n)
{
- return !u.compression_type &&
+ return !crc_is_compressed(u) &&
u.csum_type &&
u.uncompressed_size > u.live_size &&
bch2_csum_type_is_encryption(u.csum_type) ==
@@ -492,7 +524,7 @@ bool bch2_bkey_narrow_crcs(struct bkey_i *k, struct bch_extent_crc_unpacked n)
/* Find a checksum entry that covers only live data: */
if (!n.csum_type) {
bkey_for_each_crc(&k->k, ptrs, u, i)
- if (!u.compression_type &&
+ if (!crc_is_compressed(u) &&
u.csum_type &&
u.live_size == u.uncompressed_size) {
n = u;
@@ -501,7 +533,7 @@ bool bch2_bkey_narrow_crcs(struct bkey_i *k, struct bch_extent_crc_unpacked n)
return false;
}
found:
- BUG_ON(n.compression_type);
+ BUG_ON(crc_is_compressed(n));
BUG_ON(n.offset);
BUG_ON(n.live_size != k->k.size);
@@ -563,15 +595,15 @@ void bch2_extent_crc_append(struct bkey_i *k,
enum bch_extent_entry_type type;
if (bch_crc_bytes[new.csum_type] <= 4 &&
- new.uncompressed_size - 1 <= CRC32_SIZE_MAX &&
+ new.uncompressed_size <= CRC32_SIZE_MAX &&
new.nonce <= CRC32_NONCE_MAX)
type = BCH_EXTENT_ENTRY_crc32;
else if (bch_crc_bytes[new.csum_type] <= 10 &&
- new.uncompressed_size - 1 <= CRC64_SIZE_MAX &&
+ new.uncompressed_size <= CRC64_SIZE_MAX &&
new.nonce <= CRC64_NONCE_MAX)
type = BCH_EXTENT_ENTRY_crc64;
else if (bch_crc_bytes[new.csum_type] <= 16 &&
- new.uncompressed_size - 1 <= CRC128_SIZE_MAX &&
+ new.uncompressed_size <= CRC128_SIZE_MAX &&
new.nonce <= CRC128_NONCE_MAX)
type = BCH_EXTENT_ENTRY_crc128;
else
@@ -610,8 +642,7 @@ unsigned bch2_bkey_nr_ptrs_fully_allocated(struct bkey_s_c k)
struct extent_ptr_decoded p;
bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
- ret += !p.ptr.cached &&
- p.crc.compression_type == BCH_COMPRESSION_TYPE_none;
+ ret += !p.ptr.cached && !crc_is_compressed(p.crc);
}
return ret;
@@ -625,13 +656,24 @@ unsigned bch2_bkey_sectors_compressed(struct bkey_s_c k)
unsigned ret = 0;
bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
- if (!p.ptr.cached &&
- p.crc.compression_type != BCH_COMPRESSION_TYPE_none)
+ if (!p.ptr.cached && crc_is_compressed(p.crc))
ret += p.crc.compressed_size;
return ret;
}
+bool bch2_bkey_is_incompressible(struct bkey_s_c k)
+{
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+ const union bch_extent_entry *entry;
+ struct bch_extent_crc_unpacked crc;
+
+ bkey_for_each_crc(k.k, ptrs, crc, entry)
+ if (crc.compression_type == BCH_COMPRESSION_TYPE_incompressible)
+ return true;
+ return false;
+}
+
bool bch2_check_range_allocated(struct bch_fs *c, struct bpos pos, u64 size,
unsigned nr_replicas)
{
@@ -739,6 +781,7 @@ void bch2_bkey_append_ptr(struct bkey_i *k,
switch (k->k.type) {
case KEY_TYPE_btree_ptr:
+ case KEY_TYPE_btree_ptr_v2:
case KEY_TYPE_extent:
EBUG_ON(bkey_val_u64s(&k->k) >= BKEY_EXTENT_VAL_U64s_MAX);
@@ -1021,6 +1064,8 @@ const char *bch2_bkey_ptrs_invalid(const struct bch_fs *c, struct bkey_s_c k)
if (k.k->type == KEY_TYPE_btree_ptr)
size_ondisk = c->opts.btree_node_size;
+ if (k.k->type == KEY_TYPE_btree_ptr_v2)
+ size_ondisk = le16_to_cpu(bkey_s_c_to_btree_ptr_v2(k).v->sectors);
bkey_extent_entry_for_each(ptrs, entry) {
if (__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX)
@@ -1069,17 +1114,19 @@ const char *bch2_bkey_ptrs_invalid(const struct bch_fs *c, struct bkey_s_c k)
return NULL;
}
-void bch2_ptr_swab(const struct bkey_format *f, struct bkey_packed *k)
+void bch2_ptr_swab(struct bkey_s k)
{
+ struct bkey_ptrs ptrs = bch2_bkey_ptrs(k);
union bch_extent_entry *entry;
- u64 *d = (u64 *) bkeyp_val(f, k);
- unsigned i;
+ u64 *d;
- for (i = 0; i < bkeyp_val_u64s(f, k); i++)
- d[i] = swab64(d[i]);
+ for (d = (u64 *) ptrs.start;
+ d != (u64 *) ptrs.end;
+ d++)
+ *d = swab64(*d);
- for (entry = (union bch_extent_entry *) d;
- entry < (union bch_extent_entry *) (d + bkeyp_val_u64s(f, k));
+ for (entry = ptrs.start;
+ entry < ptrs.end;
entry = extent_entry_next(entry)) {
switch (extent_entry_type(entry)) {
case BCH_EXTENT_ENTRY_ptr:
diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h
index 7c5a41e6d79d..29b15365d19c 100644
--- a/fs/bcachefs/extents.h
+++ b/fs/bcachefs/extents.h
@@ -175,6 +175,12 @@ bch2_extent_crc_unpack(const struct bkey *k, const union bch_extent_crc *crc)
#undef common_fields
}
+static inline bool crc_is_compressed(struct bch_extent_crc_unpacked crc)
+{
+ return (crc.compression_type != BCH_COMPRESSION_TYPE_none &&
+ crc.compression_type != BCH_COMPRESSION_TYPE_incompressible);
+}
+
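
crc_is_compressed() deliberately groups the new incompressible marker with uncompressed data: the payload is stored verbatim, and the type only records that compression was attempted and did not shrink the data. The intended mapping, inferred from the callers in this patch:

        /*   compression_type                        crc_is_compressed()
         *   BCH_COMPRESSION_TYPE_none               false
         *   BCH_COMPRESSION_TYPE_incompressible     false  (data stored as-is)
         *   any real compressor (lz4, gzip, ...)    true
         */
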
/* bkey_ptrs: generically over any key type that has ptrs */
struct bkey_ptrs_c {
@@ -219,6 +225,13 @@ static inline struct bkey_ptrs_c bch2_bkey_ptrs_c(struct bkey_s_c k)
bkey_val_end(r),
};
}
+ case KEY_TYPE_btree_ptr_v2: {
+ struct bkey_s_c_btree_ptr_v2 e = bkey_s_c_to_btree_ptr_v2(k);
+ return (struct bkey_ptrs_c) {
+ to_entry(&e.v->start[0]),
+ to_entry(extent_entry_last(e))
+ };
+ }
default:
return (struct bkey_ptrs_c) { NULL, NULL };
}
@@ -359,6 +372,11 @@ void bch2_btree_ptr_debugcheck(struct bch_fs *, struct bkey_s_c);
void bch2_btree_ptr_to_text(struct printbuf *, struct bch_fs *,
struct bkey_s_c);
+void bch2_btree_ptr_v2_to_text(struct printbuf *, struct bch_fs *,
+ struct bkey_s_c);
+void bch2_btree_ptr_v2_compat(enum btree_id, unsigned, unsigned,
+ int, struct bkey_s);
+
#define bch2_bkey_ops_btree_ptr (struct bkey_ops) { \
.key_invalid = bch2_btree_ptr_invalid, \
.key_debugcheck = bch2_btree_ptr_debugcheck, \
@@ -366,6 +384,14 @@ void bch2_btree_ptr_to_text(struct printbuf *, struct bch_fs *,
.swab = bch2_ptr_swab, \
}
+#define bch2_bkey_ops_btree_ptr_v2 (struct bkey_ops) { \
+ .key_invalid = bch2_btree_ptr_invalid, \
+ .key_debugcheck = bch2_btree_ptr_debugcheck, \
+ .val_to_text = bch2_btree_ptr_v2_to_text, \
+ .swab = bch2_ptr_swab, \
+ .compat = bch2_btree_ptr_v2_compat, \
+}
+
/* KEY_TYPE_extent: */
const char *bch2_extent_invalid(const struct bch_fs *, struct bkey_s_c);
@@ -410,6 +436,7 @@ static inline bool bkey_extent_is_direct_data(const struct bkey *k)
{
switch (k->type) {
case KEY_TYPE_btree_ptr:
+ case KEY_TYPE_btree_ptr_v2:
case KEY_TYPE_extent:
case KEY_TYPE_reflink_v:
return true;
@@ -483,6 +510,7 @@ static inline struct bch_devs_list bch2_bkey_cached_devs(struct bkey_s_c k)
unsigned bch2_bkey_nr_ptrs(struct bkey_s_c);
unsigned bch2_bkey_nr_ptrs_allocated(struct bkey_s_c);
unsigned bch2_bkey_nr_ptrs_fully_allocated(struct bkey_s_c);
+bool bch2_bkey_is_incompressible(struct bkey_s_c);
unsigned bch2_bkey_sectors_compressed(struct bkey_s_c);
bool bch2_check_range_allocated(struct bch_fs *, struct bpos, u64, unsigned);
unsigned bch2_bkey_durability(struct bch_fs *, struct bkey_s_c);
@@ -525,7 +553,7 @@ void bch2_bkey_ptrs_to_text(struct printbuf *, struct bch_fs *,
struct bkey_s_c);
const char *bch2_bkey_ptrs_invalid(const struct bch_fs *, struct bkey_s_c);
-void bch2_ptr_swab(const struct bkey_format *, struct bkey_packed *);
+void bch2_ptr_swab(struct bkey_s);
/* Generic extent code: */
diff --git a/fs/bcachefs/fs-common.c b/fs/bcachefs/fs-common.c
index 96f7bbe0a3ed..878419d40992 100644
--- a/fs/bcachefs/fs-common.c
+++ b/fs/bcachefs/fs-common.c
@@ -19,14 +19,15 @@ int bch2_create_trans(struct btree_trans *trans, u64 dir_inum,
struct posix_acl *acl)
{
struct bch_fs *c = trans->c;
- struct btree_iter *dir_iter;
+ struct btree_iter *dir_iter = NULL;
struct bch_hash_info hash = bch2_hash_info_init(c, new_inode);
u64 now = bch2_current_time(trans->c);
int ret;
dir_iter = bch2_inode_peek(trans, dir_u, dir_inum, BTREE_ITER_INTENT);
- if (IS_ERR(dir_iter))
- return PTR_ERR(dir_iter);
+ ret = PTR_ERR_OR_ZERO(dir_iter);
+ if (ret)
+ goto err;
bch2_inode_init_late(new_inode, now, uid, gid, mode, rdev, dir_u);
@@ -37,20 +38,20 @@ int bch2_create_trans(struct btree_trans *trans, u64 dir_inum,
BLOCKDEV_INODE_MAX, 0,
&c->unused_inode_hint);
if (ret)
- return ret;
+ goto err;
if (default_acl) {
ret = bch2_set_acl_trans(trans, new_inode, &hash,
default_acl, ACL_TYPE_DEFAULT);
if (ret)
- return ret;
+ goto err;
}
if (acl) {
ret = bch2_set_acl_trans(trans, new_inode, &hash,
acl, ACL_TYPE_ACCESS);
if (ret)
- return ret;
+ goto err;
}
if (name) {
@@ -62,48 +63,55 @@ int bch2_create_trans(struct btree_trans *trans, u64 dir_inum,
ret = bch2_inode_write(trans, dir_iter, dir_u);
if (ret)
- return ret;
+ goto err;
ret = bch2_dirent_create(trans, dir_inum, &dir_hash,
mode_to_type(new_inode->bi_mode),
name, new_inode->bi_inum,
BCH_HASH_SET_MUST_CREATE);
if (ret)
- return ret;
+ goto err;
}
-
- return 0;
+err:
+ bch2_trans_iter_put(trans, dir_iter);
+ return ret;
}
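
Every helper in this file gets the same treatment: iterators start out NULL, IS_ERR() early returns become PTR_ERR_OR_ZERO() plus goto err, and all paths funnel through one block that puts each iterator. The err paths above rely on bch2_trans_iter_put() tolerating NULL and error pointers. The shape, reduced to a minimal sketch with do_updates() as a hypothetical body:

        struct btree_iter *iter = NULL;
        int ret;

        iter = bch2_inode_peek(trans, u, inum, BTREE_ITER_INTENT);
        ret = PTR_ERR_OR_ZERO(iter);
        if (ret)
                goto err;

        ret = do_updates(trans, iter, u);
err:
        bch2_trans_iter_put(trans, iter);    /* NULL/ERR_PTR assumed to be a no-op */
        return ret;
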
int bch2_link_trans(struct btree_trans *trans, u64 dir_inum,
u64 inum, struct bch_inode_unpacked *dir_u,
struct bch_inode_unpacked *inode_u, const struct qstr *name)
{
- struct btree_iter *dir_iter, *inode_iter;
+ struct btree_iter *dir_iter = NULL, *inode_iter = NULL;
struct bch_hash_info dir_hash;
u64 now = bch2_current_time(trans->c);
+ int ret;
inode_iter = bch2_inode_peek(trans, inode_u, inum, BTREE_ITER_INTENT);
- if (IS_ERR(inode_iter))
- return PTR_ERR(inode_iter);
+ ret = PTR_ERR_OR_ZERO(inode_iter);
+ if (ret)
+ goto err;
inode_u->bi_ctime = now;
bch2_inode_nlink_inc(inode_u);
dir_iter = bch2_inode_peek(trans, dir_u, dir_inum, 0);
- if (IS_ERR(dir_iter))
- return PTR_ERR(dir_iter);
+ ret = PTR_ERR_OR_ZERO(dir_iter);
+ if (ret)
+ goto err;
dir_u->bi_mtime = dir_u->bi_ctime = now;
dir_hash = bch2_hash_info_init(trans->c, dir_u);
- bch2_trans_iter_put(trans, dir_iter);
- return bch2_dirent_create(trans, dir_inum, &dir_hash,
+ ret = bch2_dirent_create(trans, dir_inum, &dir_hash,
mode_to_type(inode_u->bi_mode),
name, inum, BCH_HASH_SET_MUST_CREATE) ?:
bch2_inode_write(trans, dir_iter, dir_u) ?:
bch2_inode_write(trans, inode_iter, inode_u);
+err:
+ bch2_trans_iter_put(trans, dir_iter);
+ bch2_trans_iter_put(trans, inode_iter);
+ return ret;
}
int bch2_unlink_trans(struct btree_trans *trans,
@@ -111,39 +119,49 @@ int bch2_unlink_trans(struct btree_trans *trans,
struct bch_inode_unpacked *inode_u,
const struct qstr *name)
{
- struct btree_iter *dir_iter, *dirent_iter, *inode_iter;
+ struct btree_iter *dir_iter = NULL, *dirent_iter = NULL,
+ *inode_iter = NULL;
struct bch_hash_info dir_hash;
u64 inum, now = bch2_current_time(trans->c);
struct bkey_s_c k;
+ int ret;
dir_iter = bch2_inode_peek(trans, dir_u, dir_inum, BTREE_ITER_INTENT);
- if (IS_ERR(dir_iter))
- return PTR_ERR(dir_iter);
+ ret = PTR_ERR_OR_ZERO(dir_iter);
+ if (ret)
+ goto err;
dir_hash = bch2_hash_info_init(trans->c, dir_u);
dirent_iter = __bch2_dirent_lookup_trans(trans, dir_inum, &dir_hash,
name, BTREE_ITER_INTENT);
- if (IS_ERR(dirent_iter))
- return PTR_ERR(dirent_iter);
+ ret = PTR_ERR_OR_ZERO(dirent_iter);
+ if (ret)
+ goto err;
k = bch2_btree_iter_peek_slot(dirent_iter);
inum = le64_to_cpu(bkey_s_c_to_dirent(k).v->d_inum);
inode_iter = bch2_inode_peek(trans, inode_u, inum, BTREE_ITER_INTENT);
- if (IS_ERR(inode_iter))
- return PTR_ERR(inode_iter);
+ ret = PTR_ERR_OR_ZERO(inode_iter);
+ if (ret)
+ goto err;
dir_u->bi_mtime = dir_u->bi_ctime = inode_u->bi_ctime = now;
dir_u->bi_nlink -= S_ISDIR(inode_u->bi_mode);
bch2_inode_nlink_dec(inode_u);
- return (S_ISDIR(inode_u->bi_mode)
+ ret = (S_ISDIR(inode_u->bi_mode)
? bch2_empty_dir_trans(trans, inum)
: 0) ?:
bch2_dirent_delete_at(trans, &dir_hash, dirent_iter) ?:
bch2_inode_write(trans, dir_iter, dir_u) ?:
bch2_inode_write(trans, inode_iter, inode_u);
+err:
+ bch2_trans_iter_put(trans, inode_iter);
+ bch2_trans_iter_put(trans, dirent_iter);
+ bch2_trans_iter_put(trans, dir_iter);
+ return ret;
}
bool bch2_reinherit_attrs(struct bch_inode_unpacked *dst_u,
@@ -179,24 +197,26 @@ int bch2_rename_trans(struct btree_trans *trans,
const struct qstr *dst_name,
enum bch_rename_mode mode)
{
- struct btree_iter *src_dir_iter, *dst_dir_iter = NULL;
- struct btree_iter *src_inode_iter, *dst_inode_iter = NULL;
+ struct btree_iter *src_dir_iter = NULL, *dst_dir_iter = NULL;
+ struct btree_iter *src_inode_iter = NULL, *dst_inode_iter = NULL;
struct bch_hash_info src_hash, dst_hash;
u64 src_inode, dst_inode, now = bch2_current_time(trans->c);
int ret;
src_dir_iter = bch2_inode_peek(trans, src_dir_u, src_dir,
BTREE_ITER_INTENT);
- if (IS_ERR(src_dir_iter))
- return PTR_ERR(src_dir_iter);
+ ret = PTR_ERR_OR_ZERO(src_dir_iter);
+ if (ret)
+ goto err;
src_hash = bch2_hash_info_init(trans->c, src_dir_u);
if (dst_dir != src_dir) {
dst_dir_iter = bch2_inode_peek(trans, dst_dir_u, dst_dir,
BTREE_ITER_INTENT);
- if (IS_ERR(dst_dir_iter))
- return PTR_ERR(dst_dir_iter);
+ ret = PTR_ERR_OR_ZERO(dst_dir_iter);
+ if (ret)
+ goto err;
dst_hash = bch2_hash_info_init(trans->c, dst_dir_u);
} else {
@@ -211,38 +231,48 @@ int bch2_rename_trans(struct btree_trans *trans,
dst_name, &dst_inode,
mode);
if (ret)
- return ret;
+ goto err;
src_inode_iter = bch2_inode_peek(trans, src_inode_u, src_inode,
BTREE_ITER_INTENT);
- if (IS_ERR(src_inode_iter))
- return PTR_ERR(src_inode_iter);
+ ret = PTR_ERR_OR_ZERO(src_inode_iter);
+ if (ret)
+ goto err;
if (dst_inode) {
dst_inode_iter = bch2_inode_peek(trans, dst_inode_u, dst_inode,
BTREE_ITER_INTENT);
- if (IS_ERR(dst_inode_iter))
- return PTR_ERR(dst_inode_iter);
+ ret = PTR_ERR_OR_ZERO(dst_inode_iter);
+ if (ret)
+ goto err;
}
if (mode == BCH_RENAME_OVERWRITE) {
if (S_ISDIR(src_inode_u->bi_mode) !=
- S_ISDIR(dst_inode_u->bi_mode))
- return -ENOTDIR;
+ S_ISDIR(dst_inode_u->bi_mode)) {
+ ret = -ENOTDIR;
+ goto err;
+ }
if (S_ISDIR(dst_inode_u->bi_mode) &&
- bch2_empty_dir_trans(trans, dst_inode))
- return -ENOTEMPTY;
+ bch2_empty_dir_trans(trans, dst_inode)) {
+ ret = -ENOTEMPTY;
+ goto err;
+ }
}
if (bch2_reinherit_attrs(src_inode_u, dst_dir_u) &&
- S_ISDIR(src_inode_u->bi_mode))
- return -EXDEV;
+ S_ISDIR(src_inode_u->bi_mode)) {
+ ret = -EXDEV;
+ goto err;
+ }
if (mode == BCH_RENAME_EXCHANGE &&
bch2_reinherit_attrs(dst_inode_u, src_dir_u) &&
- S_ISDIR(dst_inode_u->bi_mode))
- return -EXDEV;
+ S_ISDIR(dst_inode_u->bi_mode)) {
+ ret = -EXDEV;
+ goto err;
+ }
if (S_ISDIR(src_inode_u->bi_mode)) {
src_dir_u->bi_nlink--;
@@ -270,7 +300,7 @@ int bch2_rename_trans(struct btree_trans *trans,
if (dst_inode)
dst_inode_u->bi_ctime = now;
- return bch2_inode_write(trans, src_dir_iter, src_dir_u) ?:
+ ret = bch2_inode_write(trans, src_dir_iter, src_dir_u) ?:
(src_dir != dst_dir
? bch2_inode_write(trans, dst_dir_iter, dst_dir_u)
: 0 ) ?:
@@ -278,4 +308,10 @@ int bch2_rename_trans(struct btree_trans *trans,
(dst_inode
? bch2_inode_write(trans, dst_inode_iter, dst_inode_u)
: 0 );
+err:
+ bch2_trans_iter_put(trans, dst_inode_iter);
+ bch2_trans_iter_put(trans, src_inode_iter);
+ bch2_trans_iter_put(trans, dst_dir_iter);
+ bch2_trans_iter_put(trans, src_dir_iter);
+ return ret;
}
diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c
index c0f8cd8942e4..0aa3afade4ea 100644
--- a/fs/bcachefs/fs-io.c
+++ b/fs/bcachefs/fs-io.c
@@ -602,7 +602,7 @@ int bch2_migrate_page(struct address_space *mapping, struct page *newpage,
EBUG_ON(!PageLocked(page));
EBUG_ON(!PageLocked(newpage));
- ret = migrate_page_move_mapping(mapping, newpage, page, NULL, mode, 0);
+ ret = migrate_page_move_mapping(mapping, newpage, page, 0);
if (ret != MIGRATEPAGE_SUCCESS)
return ret;
@@ -627,10 +627,10 @@ int bch2_migrate_page(struct address_space *mapping, struct page *newpage,
static void bch2_readpages_end_io(struct bio *bio)
{
+ struct bvec_iter_all iter;
struct bio_vec *bv;
- unsigned i;
- bio_for_each_segment_all(bv, bio, i) {
+ bio_for_each_segment_all(bv, bio, iter) {
struct page *page = bv->bv_page;
if (!bio->bi_status) {
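
These hunks track an upstream block-layer API change: bio_for_each_segment_all() now takes a struct bvec_iter_all cursor instead of an integer index, which also frees the old index variable for reuse inside the loop bodies below. The new calling convention, in a minimal form:

        struct bvec_iter_all iter;
        struct bio_vec *bv;

        bio_for_each_segment_all(bv, bio, iter)
                put_page(bv->bv_page);
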
@@ -782,11 +782,8 @@ static void readpage_bio_extend(struct readpages_iter *iter,
if (!get_more)
break;
- rcu_read_lock();
- page = radix_tree_lookup(&iter->mapping->i_pages, page_offset);
- rcu_read_unlock();
-
- if (page && !radix_tree_exceptional_entry(page))
+ page = xa_load(&iter->mapping->i_pages, page_offset);
+ if (page && !xa_is_value(page))
break;
page = __page_cache_alloc(readahead_gfp_mask(iter->mapping));
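
Another upstream catch-up: the page cache is an XArray now, so the explicit rcu_read_lock() around radix_tree_lookup() goes away (xa_load() handles RCU internally) and xa_is_value() replaces radix_tree_exceptional_entry() for shadow entries. The lookup pattern, roughly:

        struct page *page = xa_load(&mapping->i_pages, index);

        if (page && !xa_is_value(page))
                return;    /* a real page is already cached at this index */
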
@@ -1037,32 +1034,33 @@ static void bch2_writepage_io_done(struct closure *cl)
struct bch_writepage_io, cl);
struct bch_fs *c = io->op.c;
struct bio *bio = &io->op.wbio.bio;
+ struct bvec_iter_all iter;
struct bio_vec *bvec;
- unsigned i, j;
+ unsigned i;
if (io->op.error) {
- bio_for_each_segment_all(bvec, bio, i) {
+ bio_for_each_segment_all(bvec, bio, iter) {
struct bch_page_state *s;
SetPageError(bvec->bv_page);
- mapping_set_error(io->inode->v.i_mapping, -EIO);
+ mapping_set_error(bvec->bv_page->mapping, -EIO);
s = __bch2_page_state(bvec->bv_page);
spin_lock(&s->lock);
- for (j = 0; j < PAGE_SECTORS; j++)
- s->s[j].nr_replicas = 0;
+ for (i = 0; i < PAGE_SECTORS; i++)
+ s->s[i].nr_replicas = 0;
spin_unlock(&s->lock);
}
}
if (io->op.flags & BCH_WRITE_WROTE_DATA_INLINE) {
- bio_for_each_segment_all(bvec, bio, i) {
+ bio_for_each_segment_all(bvec, bio, iter) {
struct bch_page_state *s;
s = __bch2_page_state(bvec->bv_page);
spin_lock(&s->lock);
- for (j = 0; j < PAGE_SECTORS; j++)
- s->s[j].nr_replicas = 0;
+ for (i = 0; i < PAGE_SECTORS; i++)
+ s->s[i].nr_replicas = 0;
spin_unlock(&s->lock);
}
}
@@ -1086,7 +1084,7 @@ static void bch2_writepage_io_done(struct closure *cl)
*/
i_sectors_acct(c, io->inode, NULL, io->op.i_sectors_delta);
- bio_for_each_segment_all(bvec, bio, i) {
+ bio_for_each_segment_all(bvec, bio, iter) {
struct bch_page_state *s = __bch2_page_state(bvec->bv_page);
if (atomic_dec_and_test(&s->write_count))
@@ -1240,7 +1238,7 @@ do_io:
if (w->io &&
(w->io->op.res.nr_replicas != nr_replicas_this_write ||
- bio_full(&w->io->op.wbio.bio) ||
+ bio_full(&w->io->op.wbio.bio, PAGE_SIZE) ||
w->io->op.wbio.bio.bi_iter.bi_size >= (256U << 20) ||
bio_end_sector(&w->io->op.wbio.bio) != sector))
bch2_writepage_do_io(w);
@@ -1805,10 +1803,11 @@ static long bch2_dio_write_loop(struct dio_write *dio)
struct address_space *mapping = req->ki_filp->f_mapping;
struct bch_inode_info *inode = file_bch_inode(req->ki_filp);
struct bio *bio = &dio->op.wbio.bio;
+ struct bvec_iter_all iter;
struct bio_vec *bv;
- unsigned i, unaligned;
+ unsigned unaligned;
u64 new_i_size;
- bool sync;
+ bool sync = dio->sync;
long ret;
if (dio->loop)
@@ -1838,7 +1837,7 @@ static long bch2_dio_write_loop(struct dio_write *dio)
* bio_iov_iter_get_pages was only able to get <
* blocksize worth of pages:
*/
- bio_for_each_segment_all(bv, bio, i)
+ bio_for_each_segment_all(bv, bio, iter)
put_page(bv->bv_page);
ret = -EFAULT;
goto err;
@@ -1856,7 +1855,7 @@ static long bch2_dio_write_loop(struct dio_write *dio)
iov = kmalloc(dio->iter.nr_segs * sizeof(*iov),
GFP_KERNEL);
if (unlikely(!iov)) {
- dio->sync = true;
+ dio->sync = sync = true;
goto do_io;
}
@@ -1870,7 +1869,7 @@ do_io:
dio->loop = true;
closure_call(&dio->op.cl, bch2_write, NULL, NULL);
- if (dio->sync)
+ if (sync)
wait_for_completion(&dio->done);
else
return -EIOCBQUEUED;
@@ -1886,7 +1885,7 @@ loop:
i_size_write(&inode->v, new_i_size);
spin_unlock(&inode->v.i_lock);
- bio_for_each_segment_all(bv, bio, i)
+ bio_for_each_segment_all(bv, bio, iter)
put_page(bv->bv_page);
if (!dio->iter.count || dio->op.error)
break;
@@ -1904,7 +1903,6 @@ err:
if (dio->free_iov)
kfree(dio->iter.iov);
- sync = dio->sync;
bio_put(bio);
/* inode->i_dio_count is our ref on inode and thus bch_fs */
@@ -2514,10 +2512,8 @@ reassemble:
bkey_on_stack_reassemble(&copy, c, k);
if (insert &&
- bkey_cmp(bkey_start_pos(k.k), move_pos) < 0) {
+ bkey_cmp(bkey_start_pos(k.k), move_pos) < 0)
bch2_cut_front(move_pos, copy.k);
- bch2_btree_iter_set_pos(src, bkey_start_pos(&copy.k->k));
- }
copy.k->k.p.offset += shift >> 9;
bch2_btree_iter_set_pos(dst, bkey_start_pos(&copy.k->k));
@@ -2537,8 +2533,9 @@ reassemble:
}
bkey_init(&delete.k);
- delete.k.p = src->pos;
- bch2_key_resize(&delete.k, copy.k->k.size);
+ delete.k.p = copy.k->k.p;
+ delete.k.size = copy.k->k.size;
+ delete.k.p.offset -= shift >> 9;
next_pos = insert ? bkey_start_pos(&delete.k) : delete.k.p;
@@ -2559,6 +2556,8 @@ reassemble:
BUG_ON(ret);
}
+ bch2_btree_iter_set_pos(src, bkey_start_pos(&delete.k));
+
ret = bch2_trans_update(&trans, src, &delete, trigger_flags) ?:
bch2_trans_update(&trans, dst, copy.k, trigger_flags) ?:
bch2_trans_commit(&trans, &disk_res,
@@ -2649,7 +2648,7 @@ static long bchfs_fallocate(struct bch_inode_info *inode, int mode,
struct bkey_i_reservation reservation;
struct bkey_s_c k;
- bch2_trans_reset(&trans, TRANS_RESET_MEM);
+ bch2_trans_begin(&trans);
k = bch2_btree_iter_peek_slot(iter);
if ((ret = bkey_err(k)))
@@ -2823,235 +2822,6 @@ static void mark_range_unallocated(struct bch_inode_info *inode,
} while (index <= end_index);
}
-static int generic_access_check_limits(struct file *file, loff_t pos,
- loff_t *count)
-{
- struct inode *inode = file->f_mapping->host;
- loff_t max_size = inode->i_sb->s_maxbytes;
-
- if (!(file->f_flags & O_LARGEFILE))
- max_size = MAX_NON_LFS;
-
- if (unlikely(pos >= max_size))
- return -EFBIG;
- *count = min(*count, max_size - pos);
- return 0;
-}
-
-static int generic_write_check_limits(struct file *file, loff_t pos,
- loff_t *count)
-{
- loff_t limit = rlimit(RLIMIT_FSIZE);
-
- if (limit != RLIM_INFINITY) {
- if (pos >= limit) {
- send_sig(SIGXFSZ, current, 0);
- return -EFBIG;
- }
- *count = min(*count, limit - pos);
- }
-
- return generic_access_check_limits(file, pos, count);
-}
-
-static int generic_remap_checks(struct file *file_in, loff_t pos_in,
- struct file *file_out, loff_t pos_out,
- loff_t *req_count, unsigned int remap_flags)
-{
- struct inode *inode_in = file_in->f_mapping->host;
- struct inode *inode_out = file_out->f_mapping->host;
- uint64_t count = *req_count;
- uint64_t bcount;
- loff_t size_in, size_out;
- loff_t bs = inode_out->i_sb->s_blocksize;
- int ret;
-
- /* The start of both ranges must be aligned to an fs block. */
- if (!IS_ALIGNED(pos_in, bs) || !IS_ALIGNED(pos_out, bs))
- return -EINVAL;
-
- /* Ensure offsets don't wrap. */
- if (pos_in + count < pos_in || pos_out + count < pos_out)
- return -EINVAL;
-
- size_in = i_size_read(inode_in);
- size_out = i_size_read(inode_out);
-
- /* Dedupe requires both ranges to be within EOF. */
- if ((remap_flags & REMAP_FILE_DEDUP) &&
- (pos_in >= size_in || pos_in + count > size_in ||
- pos_out >= size_out || pos_out + count > size_out))
- return -EINVAL;
-
- /* Ensure the infile range is within the infile. */
- if (pos_in >= size_in)
- return -EINVAL;
- count = min(count, size_in - (uint64_t)pos_in);
-
- ret = generic_access_check_limits(file_in, pos_in, &count);
- if (ret)
- return ret;
-
- ret = generic_write_check_limits(file_out, pos_out, &count);
- if (ret)
- return ret;
-
- /*
- * If the user wanted us to link to the infile's EOF, round up to the
- * next block boundary for this check.
- *
- * Otherwise, make sure the count is also block-aligned, having
- * already confirmed the starting offsets' block alignment.
- */
- if (pos_in + count == size_in) {
- bcount = ALIGN(size_in, bs) - pos_in;
- } else {
- if (!IS_ALIGNED(count, bs))
- count = ALIGN_DOWN(count, bs);
- bcount = count;
- }
-
- /* Don't allow overlapped cloning within the same file. */
- if (inode_in == inode_out &&
- pos_out + bcount > pos_in &&
- pos_out < pos_in + bcount)
- return -EINVAL;
-
- /*
- * We shortened the request but the caller can't deal with that, so
- * bounce the request back to userspace.
- */
- if (*req_count != count && !(remap_flags & REMAP_FILE_CAN_SHORTEN))
- return -EINVAL;
-
- *req_count = count;
- return 0;
-}
-
-static int generic_remap_check_len(struct inode *inode_in,
- struct inode *inode_out,
- loff_t pos_out,
- loff_t *len,
- unsigned int remap_flags)
-{
- u64 blkmask = i_blocksize(inode_in) - 1;
- loff_t new_len = *len;
-
- if ((*len & blkmask) == 0)
- return 0;
-
- if ((remap_flags & REMAP_FILE_DEDUP) ||
- pos_out + *len < i_size_read(inode_out))
- new_len &= ~blkmask;
-
- if (new_len == *len)
- return 0;
-
- if (remap_flags & REMAP_FILE_CAN_SHORTEN) {
- *len = new_len;
- return 0;
- }
-
- return (remap_flags & REMAP_FILE_DEDUP) ? -EBADE : -EINVAL;
-}
-
-static int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in,
- struct file *file_out, loff_t pos_out,
- loff_t *len, unsigned int remap_flags)
-{
- struct inode *inode_in = file_inode(file_in);
- struct inode *inode_out = file_inode(file_out);
- bool same_inode = (inode_in == inode_out);
- int ret;
-
- /* Don't touch certain kinds of inodes */
- if (IS_IMMUTABLE(inode_out))
- return -EPERM;
-
- if (IS_SWAPFILE(inode_in) || IS_SWAPFILE(inode_out))
- return -ETXTBSY;
-
- /* Don't reflink dirs, pipes, sockets... */
- if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode))
- return -EISDIR;
- if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode))
- return -EINVAL;
-
- /* Zero length dedupe exits immediately; reflink goes to EOF. */
- if (*len == 0) {
- loff_t isize = i_size_read(inode_in);
-
- if ((remap_flags & REMAP_FILE_DEDUP) || pos_in == isize)
- return 0;
- if (pos_in > isize)
- return -EINVAL;
- *len = isize - pos_in;
- if (*len == 0)
- return 0;
- }
-
- /* Check that we don't violate system file offset limits. */
- ret = generic_remap_checks(file_in, pos_in, file_out, pos_out, len,
- remap_flags);
- if (ret)
- return ret;
-
- /* Wait for the completion of any pending IOs on both files */
- inode_dio_wait(inode_in);
- if (!same_inode)
- inode_dio_wait(inode_out);
-
- ret = filemap_write_and_wait_range(inode_in->i_mapping,
- pos_in, pos_in + *len - 1);
- if (ret)
- return ret;
-
- ret = filemap_write_and_wait_range(inode_out->i_mapping,
- pos_out, pos_out + *len - 1);
- if (ret)
- return ret;
-
- /*
- * Check that the extents are the same.
- */
- if (remap_flags & REMAP_FILE_DEDUP) {
- bool is_same = false;
-
- ret = vfs_dedupe_file_range_compare(inode_in, pos_in,
- inode_out, pos_out, *len, &is_same);
- if (ret)
- return ret;
- if (!is_same)
- return -EBADE;
- }
-
- ret = generic_remap_check_len(inode_in, inode_out, pos_out, len,
- remap_flags);
- if (ret)
- return ret;
-
- /* If can't alter the file contents, we're done. */
- if (!(remap_flags & REMAP_FILE_DEDUP)) {
- /* Update the timestamps, since we can alter file contents. */
- if (!(file_out->f_mode & FMODE_NOCMTIME)) {
- ret = file_update_time(file_out);
- if (ret)
- return ret;
- }
-
- /*
- * Clear the security bits if the process is not being run by
- * root. This keeps people from modifying setuid and setgid
- * binaries.
- */
- ret = file_remove_privs(file_out);
- if (ret)
- return ret;
- }
-
- return 0;
-}
-
loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src,
struct file *file_dst, loff_t pos_dst,
loff_t len, unsigned remap_flags)
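
The block removed above was a private copy of the VFS remap/dedupe validation helpers; with the newer VFS they exist in-tree, so the assumption (the call site is not shown in this hunk) is that bch2_remap_file_range() now leans on the generic helper directly, along the lines of:

        ret = generic_remap_file_range_prep(file_src, pos_src,
                                            file_dst, pos_dst,
                                            &len, remap_flags);
        if (ret < 0 || len == 0)
                return ret;
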
@@ -3244,7 +3014,7 @@ static loff_t page_hole_offset(struct address_space *mapping, loff_t offset)
loff_t ret = -1;
page = find_lock_entry(mapping, index);
- if (!page || radix_tree_exception(page))
+ if (!page || xa_is_value(page))
return offset;
pg_offset = __page_hole_offset(page, offset & (PAGE_SIZE - 1));
diff --git a/fs/bcachefs/fs-io.h b/fs/bcachefs/fs-io.h
index 1b593ea707d5..7063556d289b 100644
--- a/fs/bcachefs/fs-io.h
+++ b/fs/bcachefs/fs-io.h
@@ -35,10 +35,6 @@ int bch2_fsync(struct file *, loff_t, loff_t, int);
int bch2_truncate(struct bch_inode_info *, struct iattr *);
long bch2_fallocate_dispatch(struct file *, int, loff_t, loff_t);
-#define REMAP_FILE_ADVISORY (0)
-#define REMAP_FILE_DEDUP (1 << 0)
-#define REMAP_FILE_CAN_SHORTEN (1 << 1)
-
loff_t bch2_remap_file_range(struct file *, loff_t, struct file *,
loff_t, loff_t, unsigned);
diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c
index 13b8bbcdb694..1c89a1b2c2d0 100644
--- a/fs/bcachefs/fs.c
+++ b/fs/bcachefs/fs.c
@@ -142,8 +142,6 @@ retry:
&inode->ei_journal_seq,
BTREE_INSERT_NOUNLOCK|
BTREE_INSERT_NOFAIL);
- if (ret == -EINTR)
- goto retry;
/*
* the btree node lock protects inode->ei_inode, not ei_update_lock;
@@ -152,6 +150,11 @@ retry:
if (!ret)
bch2_inode_update_after_write(c, inode, &inode_u, fields);
+ bch2_trans_iter_put(&trans, iter);
+
+ if (ret == -EINTR)
+ goto retry;
+
bch2_trans_exit(&trans);
return ret < 0 ? ret : 0;
}
@@ -963,15 +966,6 @@ static int bch2_vfs_readdir(struct file *file, struct dir_context *ctx)
return bch2_readdir(c, inode->v.i_ino, ctx);
}
-static int bch2_clone_file_range(struct file *file_src, loff_t pos_src,
- struct file *file_dst, loff_t pos_dst,
- u64 len)
-{
- return bch2_remap_file_range(file_src, pos_src,
- file_dst, pos_dst,
- len, 0);
-}
-
static const struct file_operations bch_file_operations = {
.llseek = bch2_llseek,
.read_iter = bch2_read_iter,
@@ -989,7 +983,7 @@ static const struct file_operations bch_file_operations = {
#ifdef CONFIG_COMPAT
.compat_ioctl = bch2_compat_fs_ioctl,
#endif
- .clone_file_range = bch2_clone_file_range,
+ .remap_file_range = bch2_remap_file_range,
};
static const struct inode_operations bch_file_inode_operations = {
@@ -1520,7 +1514,7 @@ static struct dentry *bch2_mount(struct file_system_type *fs_type,
sb->s_bdi->congested_fn = bch2_congested;
sb->s_bdi->congested_data = c;
- sb->s_bdi->ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_SIZE;
+ sb->s_bdi->ra_pages = VM_READAHEAD_PAGES;
for_each_online_member(ca, c, i) {
struct block_device *bdev = ca->disk_sb.bdev;
diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c
index 9ef532d875e8..3ab621c62c43 100644
--- a/fs/bcachefs/fsck.c
+++ b/fs/bcachefs/fsck.c
@@ -1,6 +1,7 @@
// SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h"
+#include "bkey_on_stack.h"
#include "btree_update.h"
#include "dirent.h"
#include "error.h"
@@ -81,7 +82,6 @@ static int remove_dirent(struct btree_trans *trans,
return __bch2_trans_do(trans, NULL, NULL,
BTREE_INSERT_NOFAIL|
BTREE_INSERT_LAZY_RW,
- TRANS_RESET_MEM,
__remove_dirent(trans, dirent));
}
@@ -182,8 +182,6 @@ static int hash_redo_key(const struct bch_hash_desc desc,
struct bkey_i delete;
struct bkey_i *tmp;
- bch2_trans_reset(trans, TRANS_RESET_MEM);
-
tmp = bch2_trans_kmalloc(trans, bkey_bytes(k.k));
if (IS_ERR(tmp))
return PTR_ERR(tmp);
@@ -194,11 +192,8 @@ static int hash_redo_key(const struct bch_hash_desc desc,
delete.k.p = k_iter->pos;
bch2_trans_update(trans, k_iter, &delete, 0);
- return bch2_hash_set(trans, desc, &h->info, k_iter->pos.inode,
- tmp, BCH_HASH_SET_MUST_CREATE) ?:
- bch2_trans_commit(trans, NULL, NULL,
- BTREE_INSERT_NOFAIL|
- BTREE_INSERT_LAZY_RW);
+ return bch2_hash_set(trans, desc, &h->info, k_iter->pos.inode,
+ tmp, BCH_HASH_SET_MUST_CREATE);
}
static int fsck_hash_delete_at(struct btree_trans *trans,
@@ -320,10 +315,9 @@ static int hash_check_key(struct btree_trans *trans,
desc.btree_id, k.k->p.offset,
hashed, h->chain->pos.offset,
(bch2_bkey_val_to_text(&PBUF(buf), c, k), buf))) {
- do {
- ret = hash_redo_key(desc, trans, h, k_iter, k, hashed);
- } while (ret == -EINTR);
-
+ ret = __bch2_trans_do(trans, NULL, NULL,
+ BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW,
+ hash_redo_key(desc, trans, h, k_iter, k, hashed));
if (ret) {
bch_err(c, "hash_redo_key err %i", ret);
return ret;
@@ -387,7 +381,6 @@ static int check_dirent_hash(struct btree_trans *trans, struct hash_check *h,
ret = __bch2_trans_do(trans, NULL, NULL,
BTREE_INSERT_NOFAIL|
BTREE_INSERT_LAZY_RW,
- TRANS_RESET_MEM,
(bch2_trans_update(trans, iter, &d->k_i, 0), 0));
if (ret)
goto err;
@@ -410,11 +403,10 @@ err_redo:
k->k->p.offset, hash, h->chain->pos.offset,
(bch2_bkey_val_to_text(&PBUF(buf), c,
*k), buf))) {
- do {
- ret = hash_redo_key(bch2_dirent_hash_desc, trans,
- h, iter, *k, hash);
- } while (ret == -EINTR);
-
+ ret = __bch2_trans_do(trans, NULL, NULL,
+ BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW,
+ hash_redo_key(bch2_dirent_hash_desc, trans,
+ h, iter, *k, hash));
if (ret)
bch_err(c, "hash_redo_key err %i", ret);
else
@@ -431,6 +423,42 @@ static int bch2_inode_truncate(struct bch_fs *c, u64 inode_nr, u64 new_size)
POS(inode_nr + 1, 0), NULL);
}
+static int bch2_fix_overlapping_extent(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bkey_s_c k, struct bpos cut_at)
+{
+ struct btree_iter *u_iter;
+ struct bkey_i *u;
+ int ret;
+
+ u = bch2_trans_kmalloc(trans, bkey_bytes(k.k));
+ ret = PTR_ERR_OR_ZERO(u);
+ if (ret)
+ return ret;
+
+ bkey_reassemble(u, k);
+ bch2_cut_front(cut_at, u);
+
+ u_iter = bch2_trans_copy_iter(trans, iter);
+ ret = PTR_ERR_OR_ZERO(u_iter);
+ if (ret)
+ return ret;
+
+ /*
+ * We don't want to go through the
+ * extent_handle_overwrites path:
+ */
+ __bch2_btree_iter_set_pos(u_iter, u->k.p, false);
+
+ /*
+ * XXX: this is going to leave disk space
+ * accounting slightly wrong
+ */
+ ret = bch2_trans_update(trans, u_iter, u, 0);
+ bch2_trans_iter_put(trans, u_iter);
+ return ret;
+}
+
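
The repair keeps the earlier extent intact and trims the later one: the overlapping key is reassembled, its front is cut at the end of the previous extent, and the result is written back through a copied iterator positioned directly at the key so the extent-overwrite path is not involved. For example, with prev covering sectors [0, 100) and k covering [50, 150), passing the end of prev (offset 100) as cut_at rewrites k to cover [100, 150), removing the overlap; per the XXX comment, the sectors dropped from k are not re-accounted here.
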
/*
* Walk extents: verify that extents have a corresponding S_ISREG inode, and
* that i_size and i_sectors are consistent
@@ -442,17 +470,40 @@ static int check_extents(struct bch_fs *c)
struct btree_trans trans;
struct btree_iter *iter;
struct bkey_s_c k;
+ struct bkey_on_stack prev;
u64 i_sectors;
int ret = 0;
+ bkey_on_stack_init(&prev);
+ prev.k->k = KEY(0, 0, 0);
bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
bch_verbose(c, "checking extents");
iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS,
- POS(BCACHEFS_ROOT_INO, 0), 0);
+ POS(BCACHEFS_ROOT_INO, 0),
+ BTREE_ITER_INTENT);
retry:
for_each_btree_key_continue(iter, 0, k, ret) {
+ if (bkey_cmp(prev.k->k.p, bkey_start_pos(k.k)) > 0) {
+ char buf1[200];
+ char buf2[200];
+
+ bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(prev.k));
+ bch2_bkey_val_to_text(&PBUF(buf2), c, k);
+
+ if (fsck_err(c, "overlapping extents:\n%s\n%s", buf1, buf2)) {
+ ret = __bch2_trans_do(&trans, NULL, NULL,
+ BTREE_INSERT_NOFAIL|
+ BTREE_INSERT_LAZY_RW,
+ bch2_fix_overlapping_extent(&trans,
+ iter, k, prev.k->k.p));
+ if (ret)
+ goto err;
+ }
+ }
+ bkey_on_stack_reassemble(&prev, c, k);
+
ret = walk_inode(&trans, &w, k.k->p.inode);
if (ret)
break;
@@ -477,7 +528,8 @@ retry:
!(w.inode.bi_flags & BCH_INODE_I_SECTORS_DIRTY) &&
w.inode.bi_sectors !=
(i_sectors = bch2_count_inode_sectors(&trans, w.cur_inum)),
- c, "i_sectors wrong: got %llu, should be %llu",
+ c, "inode %llu has incorrect i_sectors: got %llu, should be %llu",
+ w.inode.bi_inum,
w.inode.bi_sectors, i_sectors)) {
struct bkey_inode_buf p;
@@ -519,6 +571,7 @@ err:
fsck_err:
if (ret == -EINTR)
goto retry;
+ bkey_on_stack_exit(&prev, c);
return bch2_trans_exit(&trans) ?: ret;
}
@@ -660,7 +713,6 @@ retry:
ret = __bch2_trans_do(&trans, NULL, NULL,
BTREE_INSERT_NOFAIL|
BTREE_INSERT_LAZY_RW,
- TRANS_RESET_MEM,
(bch2_trans_update(&trans, iter, &n->k_i, 0), 0));
kfree(n);
if (ret)
@@ -986,12 +1038,12 @@ retry:
if (!ret)
continue;
- if (fsck_err_on(!inode_bitmap_test(&dirs_done, k.k->p.inode), c,
+ if (fsck_err_on(!inode_bitmap_test(&dirs_done, k.k->p.offset), c,
"unreachable directory found (inum %llu)",
- k.k->p.inode)) {
+ k.k->p.offset)) {
bch2_trans_unlock(&trans);
- ret = reattach_inode(c, lostfound_inode, k.k->p.inode);
+ ret = reattach_inode(c, lostfound_inode, k.k->p.offset);
if (ret) {
goto err;
}
@@ -1275,7 +1327,6 @@ static int check_inode(struct btree_trans *trans,
ret = __bch2_trans_do(trans, NULL, NULL,
BTREE_INSERT_NOFAIL|
BTREE_INSERT_LAZY_RW,
- TRANS_RESET_MEM,
(bch2_trans_update(trans, iter, &p.inode.k_i, 0), 0));
if (ret)
bch_err(c, "error in fsck: error %i "
@@ -1302,18 +1353,18 @@ static int bch2_gc_walk_inodes(struct bch_fs *c,
bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
iter = bch2_trans_get_iter(&trans, BTREE_ID_INODES,
- POS(range_start, 0), 0);
+ POS(0, range_start), 0);
nlinks_iter = genradix_iter_init(links, 0);
while ((k = bch2_btree_iter_peek(iter)).k &&
!(ret2 = bkey_err(k))) {
peek_nlinks: link = genradix_iter_peek(&nlinks_iter, links);
- if (!link && (!k.k || iter->pos.inode >= range_end))
+ if (!link && (!k.k || iter->pos.offset >= range_end))
break;
nlinks_pos = range_start + nlinks_iter.pos;
- if (iter->pos.inode > nlinks_pos) {
+ if (iter->pos.offset > nlinks_pos) {
/* Should have been caught by dirents pass: */
need_fsck_err_on(link && link->count, c,
"missing inode %llu (nlink %u)",
@@ -1322,7 +1373,7 @@ peek_nlinks: link = genradix_iter_peek(&nlinks_iter, links);
goto peek_nlinks;
}
- if (iter->pos.inode < nlinks_pos || !link)
+ if (iter->pos.offset < nlinks_pos || !link)
link = &zero_links;
if (k.k && k.k->type == KEY_TYPE_inode) {
@@ -1338,7 +1389,7 @@ peek_nlinks: link = genradix_iter_peek(&nlinks_iter, links);
nlinks_pos, link->count);
}
- if (nlinks_pos == iter->pos.inode)
+ if (nlinks_pos == iter->pos.offset)
genradix_iter_advance(&nlinks_iter, links);
bch2_btree_iter_next(iter);
diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c
index e811b98d0f03..7d20f082ad45 100644
--- a/fs/bcachefs/inode.c
+++ b/fs/bcachefs/inode.c
@@ -98,7 +98,7 @@ void bch2_inode_pack(struct bkey_inode_buf *packed,
unsigned bytes;
bkey_inode_init(&packed->inode.k_i);
- packed->inode.k.p.inode = inode->bi_inum;
+ packed->inode.k.p.offset = inode->bi_inum;
packed->inode.v.bi_hash_seed = inode->bi_hash_seed;
packed->inode.v.bi_flags = cpu_to_le32(inode->bi_flags);
packed->inode.v.bi_mode = cpu_to_le16(inode->bi_mode);
@@ -149,7 +149,7 @@ int bch2_inode_unpack(struct bkey_s_c_inode inode,
unsigned fieldnr = 0, field_bits;
int ret;
- unpacked->bi_inum = inode.k->p.inode;
+ unpacked->bi_inum = inode.k->p.offset;
unpacked->bi_hash_seed = inode.v->bi_hash_seed;
unpacked->bi_flags = le32_to_cpu(inode.v->bi_flags);
unpacked->bi_mode = le16_to_cpu(inode.v->bi_mode);
@@ -188,7 +188,7 @@ struct btree_iter *bch2_inode_peek(struct btree_trans *trans,
struct bkey_s_c k;
int ret;
- iter = bch2_trans_get_iter(trans, BTREE_ID_INODES, POS(inum, 0),
+ iter = bch2_trans_get_iter(trans, BTREE_ID_INODES, POS(0, inum),
BTREE_ITER_SLOTS|flags);
if (IS_ERR(iter))
return iter;
@@ -232,13 +232,13 @@ const char *bch2_inode_invalid(const struct bch_fs *c, struct bkey_s_c k)
struct bkey_s_c_inode inode = bkey_s_c_to_inode(k);
struct bch_inode_unpacked unpacked;
- if (k.k->p.offset)
- return "nonzero offset";
+ if (k.k->p.inode)
+ return "nonzero k.p.inode";
if (bkey_val_bytes(k.k) < sizeof(struct bch_inode))
return "incorrect value size";
- if (k.k->p.inode < BLOCKDEV_INODE_MAX)
+ if (k.k->p.offset < BLOCKDEV_INODE_MAX)
return "fs inode in blockdev range";
if (INODE_STR_HASH(inode.v) >= BCH_STR_HASH_NR)
@@ -280,8 +280,8 @@ void bch2_inode_to_text(struct printbuf *out, struct bch_fs *c,
const char *bch2_inode_generation_invalid(const struct bch_fs *c,
struct bkey_s_c k)
{
- if (k.k->p.offset)
- return "nonzero offset";
+ if (k.k->p.inode)
+ return "nonzero k.p.inode";
if (bkey_val_bytes(k.k) != sizeof(struct bch_inode_generation))
return "incorrect value size";
@@ -362,16 +362,16 @@ int bch2_inode_create(struct btree_trans *trans,
struct bch_inode_unpacked *inode_u,
u64 min, u64 max, u64 *hint)
{
- struct bch_fs *c = trans->c;
struct bkey_inode_buf *inode_p;
- struct btree_iter *iter;
+ struct btree_iter *iter = NULL;
+ struct bkey_s_c k;
u64 start;
int ret;
if (!max)
max = ULLONG_MAX;
- if (c->opts.inodes_32bit)
+ if (trans->c->opts.inodes_32bit)
max = min_t(u64, max, U32_MAX);
start = READ_ONCE(*hint);
@@ -382,48 +382,37 @@ int bch2_inode_create(struct btree_trans *trans,
inode_p = bch2_trans_kmalloc(trans, sizeof(*inode_p));
if (IS_ERR(inode_p))
return PTR_ERR(inode_p);
-
- iter = bch2_trans_get_iter(trans,
- BTREE_ID_INODES, POS(start, 0),
- BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
- if (IS_ERR(iter))
- return PTR_ERR(iter);
again:
- while (1) {
- struct bkey_s_c k = bch2_btree_iter_peek_slot(iter);
-
- ret = bkey_err(k);
- if (ret)
- return ret;
+ for_each_btree_key(trans, iter, BTREE_ID_INODES, POS(0, start),
+ BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) {
+ if (bkey_cmp(iter->pos, POS(0, max)) > 0)
+ break;
- switch (k.k->type) {
- case KEY_TYPE_inode:
- /* slot used */
- if (iter->pos.inode >= max)
- goto out;
+ if (k.k->type != KEY_TYPE_inode)
+ goto found_slot;
+ }
- bch2_btree_iter_next_slot(iter);
- break;
+ bch2_trans_iter_put(trans, iter);
- default:
- *hint = k.k->p.inode;
- inode_u->bi_inum = k.k->p.inode;
- inode_u->bi_generation = bkey_generation(k);
+ if (ret)
+ return ret;
- bch2_inode_pack(inode_p, inode_u);
- bch2_trans_update(trans, iter, &inode_p->inode.k_i, 0);
- return 0;
- }
- }
-out:
if (start != min) {
/* Retry from start */
start = min;
- bch2_btree_iter_set_pos(iter, POS(start, 0));
goto again;
}
return -ENOSPC;
+found_slot:
+ *hint = k.k->p.offset;
+ inode_u->bi_inum = k.k->p.offset;
+ inode_u->bi_generation = bkey_generation(k);
+
+ bch2_inode_pack(inode_p, inode_u);
+ bch2_trans_update(trans, iter, &inode_p->inode.k_i, 0);
+ bch2_trans_iter_put(trans, iter);
+ return 0;
}
int bch2_inode_rm(struct bch_fs *c, u64 inode_nr)
@@ -454,7 +443,7 @@ int bch2_inode_rm(struct bch_fs *c, u64 inode_nr)
bch2_trans_init(&trans, c, 0, 0);
- iter = bch2_trans_get_iter(&trans, BTREE_ID_INODES, POS(inode_nr, 0),
+ iter = bch2_trans_get_iter(&trans, BTREE_ID_INODES, POS(0, inode_nr),
BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
do {
struct bkey_s_c k = bch2_btree_iter_peek_slot(iter);
@@ -486,10 +475,10 @@ int bch2_inode_rm(struct bch_fs *c, u64 inode_nr)
if (!bi_generation) {
bkey_init(&delete.k);
- delete.k.p.inode = inode_nr;
+ delete.k.p.offset = inode_nr;
} else {
bkey_inode_generation_init(&delete.k_i);
- delete.k.p.inode = inode_nr;
+ delete.k.p.offset = inode_nr;
delete.v.bi_generation = cpu_to_le32(bi_generation);
}
@@ -511,21 +500,20 @@ int bch2_inode_find_by_inum_trans(struct btree_trans *trans, u64 inode_nr,
int ret;
iter = bch2_trans_get_iter(trans, BTREE_ID_INODES,
- POS(inode_nr, 0), BTREE_ITER_SLOTS);
+ POS(0, inode_nr), BTREE_ITER_SLOTS);
if (IS_ERR(iter))
return PTR_ERR(iter);
k = bch2_btree_iter_peek_slot(iter);
ret = bkey_err(k);
if (ret)
- return ret;
+ goto err;
ret = k.k->type == KEY_TYPE_inode
? bch2_inode_unpack(bkey_s_c_to_inode(k), inode)
: -ENOENT;
-
+err:
bch2_trans_iter_put(trans, iter);
-
return ret;
}
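
The rewritten bch2_inode_create() above now walks inode slots with for_each_btree_key() starting at POS(0, start), breaks out once it passes max, and retries once from min before returning -ENOSPC. A minimal sketch of that scan-with-wraparound shape, with a hypothetical boolean slot array standing in for the btree iteration (alloc_slot() and its parameters are illustrative, not bcachefs API):

#include <errno.h>
#include <stdbool.h>
#include <stdint.h>

/* On success, *hint is set to the allocated slot. */
static int alloc_slot(const bool *used, uint64_t min, uint64_t max,
		      uint64_t *hint)
{
	uint64_t start = (*hint >= min && *hint <= max) ? *hint : min;
	uint64_t i;
again:
	for (i = start; i <= max; i++)
		if (!used[i]) {		/* free slot, like k.k->type != KEY_TYPE_inode */
			*hint = i;
			return 0;
		}

	if (start != min) {
		start = min;		/* retry from the bottom of the range */
		goto again;
	}

	return -ENOSPC;
}
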
diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c
index f2a2c45a02ad..19059702428a 100644
--- a/fs/bcachefs/io.c
+++ b/fs/bcachefs/io.c
@@ -124,10 +124,10 @@ void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw)
void bch2_bio_free_pages_pool(struct bch_fs *c, struct bio *bio)
{
+ struct bvec_iter_all iter;
struct bio_vec *bv;
- unsigned i;
- bio_for_each_segment_all(bv, bio, i)
+ bio_for_each_segment_all(bv, bio, iter)
if (bv->bv_page != ZERO_PAGE(0))
mempool_free(bv->bv_page, &c->bio_bounce_pages);
bio->bi_vcnt = 0;
@@ -325,7 +325,7 @@ int bch2_fpunch_at(struct btree_trans *trans, struct btree_iter *iter,
bch2_disk_reservation_init(c, 0);
struct bkey_i delete;
- bch2_trans_reset(trans, TRANS_RESET_MEM);
+ bch2_trans_begin(trans);
ret = bkey_err(k);
if (ret)
@@ -399,7 +399,7 @@ int bch2_write_index_default(struct bch_write_op *op)
BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
do {
- bch2_trans_reset(&trans, TRANS_RESET_MEM);
+ bch2_trans_begin(&trans);
k = bch2_keylist_front(keys);
@@ -546,9 +546,14 @@ static void __bch2_write_index(struct bch_write_op *op)
* particularly want to plumb io_opts all the way through the btree
* update stack right now
*/
- for_each_keylist_key(keys, k)
+ for_each_keylist_key(keys, k) {
bch2_rebalance_add_key(c, bkey_i_to_s_c(k), &op->opts);
+ if (bch2_bkey_is_incompressible(bkey_i_to_s_c(k)))
+ bch2_check_set_feature(op->c, BCH_FEATURE_incompressible);
+
+ }
+
if (!bch2_keylist_empty(keys)) {
u64 sectors_start = keylist_sectors(keys);
int ret = op->index_update_fn(op);
@@ -784,8 +789,9 @@ static enum prep_encoded_ret {
/* Can we just write the entire extent as is? */
if (op->crc.uncompressed_size == op->crc.live_size &&
op->crc.compressed_size <= wp->sectors_free &&
- op->crc.compression_type == op->compression_type) {
- if (!op->crc.compression_type &&
+ (op->crc.compression_type == op->compression_type ||
+ op->incompressible)) {
+ if (!crc_is_compressed(op->crc) &&
op->csum_type != op->crc.csum_type &&
bch2_write_rechecksum(c, op, op->csum_type))
return PREP_ENCODED_CHECKSUM_ERR;
@@ -797,7 +803,7 @@ static enum prep_encoded_ret {
* If the data is compressed and we couldn't write the entire extent as
* is, we have to decompress it:
*/
- if (op->crc.compression_type) {
+ if (crc_is_compressed(op->crc)) {
struct bch_csum csum;
if (bch2_write_decrypt(op))
@@ -864,6 +870,7 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp,
ret = -EIO;
goto err;
case PREP_ENCODED_CHECKSUM_ERR:
+ BUG();
goto csum_err;
case PREP_ENCODED_DO_WRITE:
/* XXX look for splitting extents here */
@@ -908,11 +915,13 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp,
bch2_csum_type_is_encryption(op->crc.csum_type));
BUG_ON(op->compression_type && !bounce);
- crc.compression_type = op->compression_type
- ? bch2_bio_compress(c, dst, &dst_len, src, &src_len,
- op->compression_type)
+ crc.compression_type = op->incompressible
+ ? BCH_COMPRESSION_TYPE_incompressible
+ : op->compression_type
+ ? bch2_bio_compress(c, dst, &dst_len, src, &src_len,
+ op->compression_type)
: 0;
- if (!crc.compression_type) {
+ if (!crc_is_compressed(crc)) {
dst_len = min(dst->bi_iter.bi_size, src->bi_iter.bi_size);
dst_len = min_t(unsigned, dst_len, wp->sectors_free << 9);
@@ -933,7 +942,7 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp,
if (bch2_csum_type_is_encryption(op->csum_type)) {
if (bversion_zero(version)) {
- version.lo = atomic64_inc_return(&c->key_version) + 1;
+ version.lo = atomic64_inc_return(&c->key_version);
} else {
crc.nonce = op->nonce;
op->nonce += src_len >> 9;
@@ -941,7 +950,7 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp,
}
if ((op->flags & BCH_WRITE_DATA_ENCODED) &&
- !crc.compression_type &&
+ !crc_is_compressed(crc) &&
bch2_csum_type_is_encryption(op->crc.csum_type) ==
bch2_csum_type_is_encryption(op->csum_type)) {
/*
@@ -1060,6 +1069,12 @@ again:
BKEY_EXTENT_U64s_MAX))
goto flush_io;
+ if ((op->flags & BCH_WRITE_FROM_INTERNAL) &&
+ percpu_ref_is_dying(&c->writes)) {
+ ret = -EROFS;
+ goto err;
+ }
+
wp = bch2_alloc_sectors_start(c,
op->target,
op->opts.erasure_code,
@@ -1212,7 +1227,8 @@ void bch2_write(struct closure *cl)
if (c->opts.nochanges ||
!percpu_ref_tryget(&c->writes)) {
- __bcache_io_error(c, "read only");
+ if (!(op->flags & BCH_WRITE_FROM_INTERNAL))
+ __bcache_io_error(c, "read only");
op->error = -EROFS;
goto err;
}
@@ -1338,6 +1354,7 @@ static void promote_start(struct promote_op *op, struct bch_read_bio *rbio)
static struct promote_op *__promote_alloc(struct bch_fs *c,
enum btree_id btree_id,
+ struct bkey_s_c k,
struct bpos pos,
struct extent_ptr_decoded *pick,
struct bch_io_opts opts,
@@ -1394,8 +1411,7 @@ static struct promote_op *__promote_alloc(struct bch_fs *c,
(struct data_opts) {
.target = opts.promote_target
},
- btree_id,
- bkey_s_c_null);
+ btree_id, k);
BUG_ON(ret);
return op;
@@ -1437,7 +1453,7 @@ static struct promote_op *promote_alloc(struct bch_fs *c,
k.k->type == KEY_TYPE_reflink_v
? BTREE_ID_REFLINK
: BTREE_ID_EXTENTS,
- pos, pick, opts, sectors, rbio);
+ k, pos, pick, opts, sectors, rbio);
if (!promote)
return NULL;
@@ -1690,33 +1706,39 @@ static void bch2_rbio_error(struct bch_read_bio *rbio, int retry,
}
}
-static void bch2_rbio_narrow_crcs(struct bch_read_bio *rbio)
+static int __bch2_rbio_narrow_crcs(struct btree_trans *trans,
+ struct bch_read_bio *rbio)
{
struct bch_fs *c = rbio->c;
- struct btree_trans trans;
- struct btree_iter *iter;
- struct bkey_s_c k;
- struct bkey_on_stack new;
- struct bch_extent_crc_unpacked new_crc;
u64 data_offset = rbio->pos.offset - rbio->pick.crc.offset;
- int ret;
-
- if (rbio->pick.crc.compression_type)
- return;
+ struct bch_extent_crc_unpacked new_crc;
+ struct btree_iter *iter = NULL;
+ struct bkey_i *new;
+ struct bkey_s_c k;
+ int ret = 0;
- bkey_on_stack_init(&new);
- bch2_trans_init(&trans, c, 0, 0);
-retry:
- bch2_trans_begin(&trans);
+ if (crc_is_compressed(rbio->pick.crc))
+ return 0;
- iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, rbio->pos,
+ iter = bch2_trans_get_iter(trans, BTREE_ID_EXTENTS, rbio->pos,
BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
+ if ((ret = PTR_ERR_OR_ZERO(iter)))
+ goto out;
+
k = bch2_btree_iter_peek_slot(iter);
- if (IS_ERR_OR_NULL(k.k))
+ if ((ret = bkey_err(k)))
goto out;
- bkey_on_stack_reassemble(&new, c, k);
- k = bkey_i_to_s_c(new.k);
+ /*
+ * going to be temporarily appending another checksum entry:
+ */
+ new = bch2_trans_kmalloc(trans, bkey_bytes(k.k) +
+ BKEY_EXTENT_U64s_MAX * 8);
+ if ((ret = PTR_ERR_OR_ZERO(new)))
+ goto out;
+
+ bkey_reassemble(new, k);
+ k = bkey_i_to_s_c(new);
if (bversion_cmp(k.k->version, rbio->version) ||
!bch2_bkey_matches_ptr(c, k, rbio->pick.ptr, data_offset))
@@ -1732,21 +1754,23 @@ retry:
bkey_start_offset(k.k) - data_offset, k.k->size,
rbio->pick.crc.csum_type)) {
bch_err(c, "error verifying existing checksum while narrowing checksum (memory corruption?)");
+ ret = 0;
goto out;
}
- if (!bch2_bkey_narrow_crcs(new.k, new_crc))
+ if (!bch2_bkey_narrow_crcs(new, new_crc))
goto out;
- bch2_trans_update(&trans, iter, new.k, 0);
- ret = bch2_trans_commit(&trans, NULL, NULL,
- BTREE_INSERT_NOFAIL|
- BTREE_INSERT_NOWAIT);
- if (ret == -EINTR)
- goto retry;
+ bch2_trans_update(trans, iter, new, 0);
out:
- bch2_trans_exit(&trans);
- bkey_on_stack_exit(&new, c);
+ bch2_trans_iter_put(trans, iter);
+ return ret;
+}
+
+static noinline void bch2_rbio_narrow_crcs(struct bch_read_bio *rbio)
+{
+ bch2_trans_do(rbio->c, NULL, NULL, BTREE_INSERT_NOFAIL,
+ __bch2_rbio_narrow_crcs(&trans, rbio));
}
/* Inner part that may run in process context */
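
The bch2_rbio_narrow_crcs() rework just above pulls the retry loop out of the function: the body now runs inside a transaction supplied by bch2_trans_do(), which restarts it whenever it fails with -EINTR. A rough sketch of that wrapper shape, with hypothetical txn_begin()/txn_commit()/txn_do() helpers rather than the real bcachefs API:

#include <errno.h>

struct txn { int forced_restarts; };	/* hypothetical transaction handle */

static void txn_begin(struct txn *t)
{
	(void) t;			/* per-iteration state would be reset here */
}

static int txn_commit(struct txn *t)
{
	/* pretend the first few commits lose a lock and have to restart */
	return t->forced_restarts-- > 0 ? -EINTR : 0;
}

static int txn_do(struct txn *t, int (*body)(struct txn *))
{
	int ret;

	do {
		txn_begin(t);
		ret = body(t) ?: txn_commit(t);
	} while (ret == -EINTR);

	return ret;
}

With this split, __bch2_rbio_narrow_crcs() is just the body passed to the wrapper and no longer needs its own retry label or bch2_trans_exit() cleanup.
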
@@ -1786,7 +1810,7 @@ static void __bch2_read_endio(struct work_struct *work)
crc.offset += rbio->offset_into_extent;
crc.live_size = bvec_iter_sectors(rbio->bvec_iter);
- if (crc.compression_type != BCH_COMPRESSION_TYPE_none) {
+ if (crc_is_compressed(crc)) {
bch2_encrypt_bio(c, crc.csum_type, nonce, src);
if (bch2_bio_uncompress(c, src, dst, dst_iter, crc))
goto decompression_err;
@@ -1883,7 +1907,7 @@ static void bch2_read_endio(struct bio *bio)
}
if (rbio->narrow_crcs ||
- rbio->pick.crc.compression_type ||
+ crc_is_compressed(rbio->pick.crc) ||
bch2_csum_type_is_encryption(rbio->pick.crc.csum_type))
context = RBIO_CONTEXT_UNBOUND, wq = system_unbound_wq;
else if (rbio->pick.crc.csum_type)
@@ -1994,7 +2018,7 @@ int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig,
EBUG_ON(offset_into_extent + bvec_iter_sectors(iter) > k.k->size);
- if (pick.crc.compression_type != BCH_COMPRESSION_TYPE_none ||
+ if (crc_is_compressed(pick.crc) ||
(pick.crc.csum_type != BCH_CSUM_NONE &&
(bvec_iter_sectors(iter) != pick.crc.uncompressed_size ||
(bch2_csum_type_is_encryption(pick.crc.csum_type) &&
@@ -2009,7 +2033,7 @@ int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig,
&rbio, &bounce, &read_full);
if (!read_full) {
- EBUG_ON(pick.crc.compression_type);
+ EBUG_ON(crc_is_compressed(pick.crc));
EBUG_ON(pick.crc.csum_type &&
(bvec_iter_sectors(iter) != pick.crc.uncompressed_size ||
bvec_iter_sectors(iter) != pick.crc.live_size ||
diff --git a/fs/bcachefs/io.h b/fs/bcachefs/io.h
index 45c950942d78..e45dcf9635ae 100644
--- a/fs/bcachefs/io.h
+++ b/fs/bcachefs/io.h
@@ -31,10 +31,11 @@ enum bch_write_flags {
BCH_WRITE_ONLY_SPECIFIED_DEVS = (1 << 6),
BCH_WRITE_NOPUT_RESERVATION = (1 << 7),
BCH_WRITE_WROTE_DATA_INLINE = (1 << 8),
+ BCH_WRITE_FROM_INTERNAL = (1 << 9),
/* Internal: */
- BCH_WRITE_JOURNAL_SEQ_PTR = (1 << 9),
- BCH_WRITE_SKIP_CLOSURE_PUT = (1 << 10),
+ BCH_WRITE_JOURNAL_SEQ_PTR = (1 << 10),
+ BCH_WRITE_SKIP_CLOSURE_PUT = (1 << 11),
};
static inline u64 *op_journal_seq(struct bch_write_op *op)
@@ -78,6 +79,7 @@ static inline void bch2_write_op_init(struct bch_write_op *op, struct bch_fs *c,
op->nr_replicas = 0;
op->nr_replicas_required = c->opts.data_replicas_required;
op->alloc_reserve = RESERVE_NONE;
+ op->incompressible = 0;
op->open_buckets.nr = 0;
op->devs_have.nr = 0;
op->target = 0;
diff --git a/fs/bcachefs/io_types.h b/fs/bcachefs/io_types.h
index c37b7d7401e9..684e4c9a5d98 100644
--- a/fs/bcachefs/io_types.h
+++ b/fs/bcachefs/io_types.h
@@ -104,7 +104,8 @@ struct bch_write_op {
unsigned compression_type:4;
unsigned nr_replicas:4;
unsigned nr_replicas_required:4;
- unsigned alloc_reserve:4;
+ unsigned alloc_reserve:3;
+ unsigned incompressible:1;
struct bch_devs_list devs_have;
u16 target;
diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c
index 9f03a479c9a2..0a4538b3dc60 100644
--- a/fs/bcachefs/journal.c
+++ b/fs/bcachefs/journal.c
@@ -376,7 +376,8 @@ unlock:
goto retry;
if (ret == -ENOSPC) {
- BUG_ON(!can_discard && (flags & JOURNAL_RES_GET_RESERVED));
+ WARN_ONCE(!can_discard && (flags & JOURNAL_RES_GET_RESERVED),
+ "JOURNAL_RES_GET_RESERVED set but journal full");
/*
* Journal is full - can't rely on reclaim from work item due to
diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c
index 7112a25d0600..39bb2154cce1 100644
--- a/fs/bcachefs/journal_io.c
+++ b/fs/bcachefs/journal_io.c
@@ -1,6 +1,7 @@
// SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h"
#include "alloc_foreground.h"
+#include "btree_io.h"
#include "buckets.h"
#include "checksum.h"
#include "error.h"
@@ -138,7 +139,8 @@ static void journal_entry_null_range(void *start, void *end)
static int journal_validate_key(struct bch_fs *c, struct jset *jset,
struct jset_entry *entry,
- struct bkey_i *k, enum btree_node_type key_type,
+ unsigned level, enum btree_id btree_id,
+ struct bkey_i *k,
const char *type, int write)
{
void *next = vstruct_next(entry);
@@ -171,14 +173,13 @@ static int journal_validate_key(struct bch_fs *c, struct jset *jset,
return 0;
}
- if (JSET_BIG_ENDIAN(jset) != CPU_BIG_ENDIAN)
- bch2_bkey_swab(NULL, bkey_to_packed(k));
+ if (!write)
+ bch2_bkey_compat(level, btree_id, version,
+ JSET_BIG_ENDIAN(jset), write,
+ NULL, bkey_to_packed(k));
- if (!write &&
- version < bcachefs_metadata_version_bkey_renumber)
- bch2_bkey_renumber(key_type, bkey_to_packed(k), write);
-
- invalid = bch2_bkey_invalid(c, bkey_i_to_s_c(k), key_type);
+ invalid = bch2_bkey_invalid(c, bkey_i_to_s_c(k),
+ __btree_node_type(level, btree_id));
if (invalid) {
char buf[160];
@@ -192,9 +193,10 @@ static int journal_validate_key(struct bch_fs *c, struct jset *jset,
return 0;
}
- if (write &&
- version < bcachefs_metadata_version_bkey_renumber)
- bch2_bkey_renumber(key_type, bkey_to_packed(k), write);
+ if (write)
+ bch2_bkey_compat(level, btree_id, version,
+ JSET_BIG_ENDIAN(jset), write,
+ NULL, bkey_to_packed(k));
fsck_err:
return ret;
}
@@ -207,10 +209,10 @@ static int journal_entry_validate_btree_keys(struct bch_fs *c,
struct bkey_i *k;
vstruct_for_each(entry, k) {
- int ret = journal_validate_key(c, jset, entry, k,
- __btree_node_type(entry->level,
- entry->btree_id),
- "key", write);
+ int ret = journal_validate_key(c, jset, entry,
+ entry->level,
+ entry->btree_id,
+ k, "key", write);
if (ret)
return ret;
}
@@ -240,7 +242,7 @@ static int journal_entry_validate_btree_root(struct bch_fs *c,
return 0;
}
- return journal_validate_key(c, jset, entry, k, BKEY_TYPE_BTREE,
+ return journal_validate_key(c, jset, entry, 1, entry->btree_id, k,
"btree root", write);
fsck_err:
return ret;
@@ -1016,8 +1018,7 @@ void bch2_journal_write(struct closure *cl)
if (bch2_csum_type_is_encryption(JSET_CSUM_TYPE(jset)))
validate_before_checksum = true;
- if (le32_to_cpu(jset->version) <
- bcachefs_metadata_version_bkey_renumber)
+ if (le32_to_cpu(jset->version) < bcachefs_metadata_version_max)
validate_before_checksum = true;
if (validate_before_checksum &&
@@ -1041,9 +1042,16 @@ void bch2_journal_write(struct closure *cl)
bytes = vstruct_bytes(jset);
memset((void *) jset + bytes, 0, (sectors << 9) - bytes);
+retry_alloc:
spin_lock(&j->lock);
ret = journal_write_alloc(j, w, sectors);
+ if (ret && j->can_discard) {
+ spin_unlock(&j->lock);
+ bch2_journal_do_discards(j);
+ goto retry_alloc;
+ }
+
/*
* write is allocated, no longer need to account for it in
* bch2_journal_space_available():
diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c
index 695b2c8ba03b..db3afd908474 100644
--- a/fs/bcachefs/journal_reclaim.c
+++ b/fs/bcachefs/journal_reclaim.c
@@ -290,38 +290,6 @@ void bch2_journal_pin_put(struct journal *j, u64 seq)
}
}
-static inline void __journal_pin_add(struct journal *j,
- u64 seq,
- struct journal_entry_pin *pin,
- journal_pin_flush_fn flush_fn)
-{
- struct journal_entry_pin_list *pin_list = journal_seq_pin(j, seq);
-
- BUG_ON(journal_pin_active(pin));
- BUG_ON(!atomic_read(&pin_list->count));
-
- atomic_inc(&pin_list->count);
- pin->seq = seq;
- pin->flush = flush_fn;
-
- list_add(&pin->list, flush_fn ? &pin_list->list : &pin_list->flushed);
-
- /*
- * If the journal is currently full, we might want to call flush_fn
- * immediately:
- */
- journal_wake(j);
-}
-
-void bch2_journal_pin_add(struct journal *j, u64 seq,
- struct journal_entry_pin *pin,
- journal_pin_flush_fn flush_fn)
-{
- spin_lock(&j->lock);
- __journal_pin_add(j, seq, pin, flush_fn);
- spin_unlock(&j->lock);
-}
-
static inline void __journal_pin_drop(struct journal *j,
struct journal_entry_pin *pin)
{
@@ -354,42 +322,46 @@ void bch2_journal_pin_drop(struct journal *j,
spin_unlock(&j->lock);
}
-void bch2_journal_pin_update(struct journal *j, u64 seq,
- struct journal_entry_pin *pin,
- journal_pin_flush_fn flush_fn)
+void __bch2_journal_pin_add(struct journal *j, u64 seq,
+ struct journal_entry_pin *pin,
+ journal_pin_flush_fn flush_fn)
{
+ struct journal_entry_pin_list *pin_list = journal_seq_pin(j, seq);
+
spin_lock(&j->lock);
- if (pin->seq != seq) {
- __journal_pin_drop(j, pin);
- __journal_pin_add(j, seq, pin, flush_fn);
- } else {
- struct journal_entry_pin_list *pin_list =
- journal_seq_pin(j, seq);
+ __journal_pin_drop(j, pin);
+
+ BUG_ON(!atomic_read(&pin_list->count));
- list_move(&pin->list, &pin_list->list);
- }
+ atomic_inc(&pin_list->count);
+ pin->seq = seq;
+ pin->flush = flush_fn;
+
+ list_add(&pin->list, flush_fn ? &pin_list->list : &pin_list->flushed);
spin_unlock(&j->lock);
+
+ /*
+ * If the journal is currently full, we might want to call flush_fn
+ * immediately:
+ */
+ journal_wake(j);
}
-void bch2_journal_pin_add_if_older(struct journal *j,
- struct journal_entry_pin *src_pin,
- struct journal_entry_pin *pin,
- journal_pin_flush_fn flush_fn)
+void bch2_journal_pin_copy(struct journal *j,
+ struct journal_entry_pin *dst,
+ struct journal_entry_pin *src,
+ journal_pin_flush_fn flush_fn)
{
- spin_lock(&j->lock);
-
- if (journal_pin_active(src_pin) &&
- (!journal_pin_active(pin) ||
- src_pin->seq < pin->seq)) {
- __journal_pin_drop(j, pin);
- __journal_pin_add(j, src_pin->seq, pin, flush_fn);
- }
-
- spin_unlock(&j->lock);
+ if (journal_pin_active(src) &&
+ (!journal_pin_active(dst) || src->seq < dst->seq))
+ __bch2_journal_pin_add(j, src->seq, dst, flush_fn);
}
+/**
+ * bch2_journal_pin_flush: ensure journal pin callback is no longer running
+ */
void bch2_journal_pin_flush(struct journal *j, struct journal_entry_pin *pin)
{
BUG_ON(journal_pin_active(pin));
diff --git a/fs/bcachefs/journal_reclaim.h b/fs/bcachefs/journal_reclaim.h
index 9bf982a17797..883a0a5680af 100644
--- a/fs/bcachefs/journal_reclaim.h
+++ b/fs/bcachefs/journal_reclaim.h
@@ -29,16 +29,24 @@ journal_seq_pin(struct journal *j, u64 seq)
}
void bch2_journal_pin_put(struct journal *, u64);
-
-void bch2_journal_pin_add(struct journal *, u64, struct journal_entry_pin *,
- journal_pin_flush_fn);
-void bch2_journal_pin_update(struct journal *, u64, struct journal_entry_pin *,
- journal_pin_flush_fn);
void bch2_journal_pin_drop(struct journal *, struct journal_entry_pin *);
-void bch2_journal_pin_add_if_older(struct journal *,
- struct journal_entry_pin *,
- struct journal_entry_pin *,
- journal_pin_flush_fn);
+
+void __bch2_journal_pin_add(struct journal *, u64, struct journal_entry_pin *,
+ journal_pin_flush_fn);
+
+static inline void bch2_journal_pin_add(struct journal *j, u64 seq,
+ struct journal_entry_pin *pin,
+ journal_pin_flush_fn flush_fn)
+{
+ if (unlikely(!journal_pin_active(pin)))
+ __bch2_journal_pin_add(j, seq, pin, flush_fn);
+}
+
+void bch2_journal_pin_copy(struct journal *,
+ struct journal_entry_pin *,
+ struct journal_entry_pin *,
+ journal_pin_flush_fn);
+
void bch2_journal_pin_flush(struct journal *, struct journal_entry_pin *);
void bch2_journal_do_discards(struct journal *);
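
The reorganised pin API above has one slow path, __bch2_journal_pin_add(), which always drops and re-attaches the pin under the journal lock, plus an inline fast path that skips the call when the pin is already active; bch2_journal_pin_add_if_older() becomes bch2_journal_pin_copy() with the same "only move to an older sequence number" rule. A simplified sketch with hypothetical types (no locking shown):

#include <stdbool.h>
#include <stdint.h>

struct pin { uint64_t seq; bool active; };

static void slow_pin_add(struct pin *p, uint64_t seq)
{
	/* in the real code: lock, drop any existing pin, attach to @seq, unlock */
	p->seq	  = seq;
	p->active = true;
}

static inline void pin_add(struct pin *p, uint64_t seq)
{
	if (!p->active)			/* common case: already pinned, nothing to do */
		slow_pin_add(p, seq);
}

static inline void pin_copy(struct pin *dst, const struct pin *src)
{
	if (src->active && (!dst->active || src->seq < dst->seq))
		slow_pin_add(dst, src->seq);	/* only ever move dst to an older seq */
}
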
diff --git a/fs/bcachefs/migrate.c b/fs/bcachefs/migrate.c
index 1ef62a189e33..e26fa1608f39 100644
--- a/fs/bcachefs/migrate.c
+++ b/fs/bcachefs/migrate.c
@@ -123,23 +123,21 @@ static int bch2_dev_metadata_drop(struct bch_fs *c, unsigned dev_idx, int flags)
for_each_btree_node(&trans, iter, id, POS_MIN,
BTREE_ITER_PREFETCH, b) {
__BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX) tmp;
- struct bkey_i_btree_ptr *new_key;
retry:
if (!bch2_bkey_has_device(bkey_i_to_s_c(&b->key),
dev_idx))
continue;
bkey_copy(&tmp.k, &b->key);
- new_key = bkey_i_to_btree_ptr(&tmp.k);
- ret = drop_dev_ptrs(c, bkey_i_to_s(&new_key->k_i),
+ ret = drop_dev_ptrs(c, bkey_i_to_s(&tmp.k),
dev_idx, flags, true);
if (ret) {
bch_err(c, "Cannot drop device without losing data");
goto err;
}
- ret = bch2_btree_node_update_key(c, iter, b, new_key);
+ ret = bch2_btree_node_update_key(c, iter, b, &tmp.k);
if (ret == -EINTR) {
b = bch2_btree_iter_peek_node(iter);
goto retry;
diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c
index 20885b605b50..4afda95f4017 100644
--- a/fs/bcachefs/move.c
+++ b/fs/bcachefs/move.c
@@ -215,6 +215,9 @@ int bch2_migrate_write_init(struct bch_fs *c, struct migrate_write *m,
enum btree_id btree_id,
struct bkey_s_c k)
{
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+ const union bch_extent_entry *entry;
+ struct extent_ptr_decoded p;
int ret;
m->btree_id = btree_id;
@@ -223,9 +226,14 @@ int bch2_migrate_write_init(struct bch_fs *c, struct migrate_write *m,
m->nr_ptrs_reserved = 0;
bch2_write_op_init(&m->op, c, io_opts);
- m->op.compression_type =
- bch2_compression_opt_to_type[io_opts.background_compression ?:
- io_opts.compression];
+
+ if (!bch2_bkey_is_incompressible(k))
+ m->op.compression_type =
+ bch2_compression_opt_to_type[io_opts.background_compression ?:
+ io_opts.compression];
+ else
+ m->op.incompressible = true;
+
m->op.target = data_opts.target,
m->op.write_point = wp;
@@ -235,7 +243,8 @@ int bch2_migrate_write_init(struct bch_fs *c, struct migrate_write *m,
m->op.flags |= BCH_WRITE_ONLY_SPECIFIED_DEVS|
BCH_WRITE_PAGES_STABLE|
BCH_WRITE_PAGES_OWNED|
- BCH_WRITE_DATA_ENCODED;
+ BCH_WRITE_DATA_ENCODED|
+ BCH_WRITE_FROM_INTERNAL;
m->op.nr_replicas = 1;
m->op.nr_replicas_required = 1;
@@ -265,14 +274,11 @@ int bch2_migrate_write_init(struct bch_fs *c, struct migrate_write *m,
break;
}
case DATA_REWRITE: {
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
- const union bch_extent_entry *entry;
- struct extent_ptr_decoded p;
unsigned compressed_sectors = 0;
bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
if (!p.ptr.cached &&
- p.crc.compression_type != BCH_COMPRESSION_TYPE_none &&
+ crc_is_compressed(p.crc) &&
bch2_dev_in_target(c, p.ptr.dev, data_opts.target))
compressed_sectors += p.crc.compressed_size;
@@ -300,12 +306,12 @@ static void move_free(struct closure *cl)
{
struct moving_io *io = container_of(cl, struct moving_io, cl);
struct moving_context *ctxt = io->write.ctxt;
+ struct bvec_iter_all iter;
struct bio_vec *bv;
- unsigned i;
bch2_disk_reservation_put(io->write.op.c, &io->write.op.res);
- bio_for_each_segment_all(bv, &io->write.op.wbio.bio, i)
+ bio_for_each_segment_all(bv, &io->write.op.wbio.bio, iter)
if (bv->bv_page)
__free_page(bv->bv_page);
diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h
index 1c05effa71e6..ba4903352343 100644
--- a/fs/bcachefs/opts.h
+++ b/fs/bcachefs/opts.h
@@ -255,6 +255,11 @@ enum opt_type {
OPT_BOOL(), \
NO_SB_OPT, false, \
NULL, "Don't replay the journal") \
+ x(keep_journal, u8, \
+ OPT_MOUNT, \
+ OPT_BOOL(), \
+ NO_SB_OPT, false, \
+ NULL, "Don't free journal entries/keys after startup")\
x(noexcl, u8, \
OPT_MOUNT, \
OPT_BOOL(), \
diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c
index 612385e9d4e4..ab1934325948 100644
--- a/fs/bcachefs/rebalance.c
+++ b/fs/bcachefs/rebalance.c
@@ -17,50 +17,52 @@
#include <linux/sched/cputime.h>
#include <trace/events/bcachefs.h>
-static inline bool rebalance_ptr_pred(struct bch_fs *c,
- struct extent_ptr_decoded p,
- struct bch_io_opts *io_opts)
+/*
+ * Check if an extent should be moved:
+ * returns -1 if it should not be moved, or
+ * device of pointer that should be moved, if known, or INT_MAX if unknown
+ */
+static int __bch2_rebalance_pred(struct bch_fs *c,
+ struct bkey_s_c k,
+ struct bch_io_opts *io_opts)
{
- if (io_opts->background_target &&
- !bch2_dev_in_target(c, p.ptr.dev, io_opts->background_target) &&
- !p.ptr.cached)
- return true;
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+ const union bch_extent_entry *entry;
+ struct extent_ptr_decoded p;
if (io_opts->background_compression &&
- p.crc.compression_type !=
- bch2_compression_opt_to_type[io_opts->background_compression])
- return true;
-
- return false;
+ !bch2_bkey_is_incompressible(k))
+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
+ if (!p.ptr.cached &&
+ p.crc.compression_type !=
+ bch2_compression_opt_to_type[io_opts->background_compression])
+ return p.ptr.dev;
+
+ if (io_opts->background_target)
+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
+ if (!p.ptr.cached &&
+ !bch2_dev_in_target(c, p.ptr.dev, io_opts->background_target))
+ return p.ptr.dev;
+
+ return -1;
}
void bch2_rebalance_add_key(struct bch_fs *c,
struct bkey_s_c k,
struct bch_io_opts *io_opts)
{
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
- const union bch_extent_entry *entry;
- struct extent_ptr_decoded p;
+ atomic64_t *counter;
+ int dev;
- if (!io_opts->background_target &&
- !io_opts->background_compression)
+ dev = __bch2_rebalance_pred(c, k, io_opts);
+ if (dev < 0)
return;
- bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
- if (rebalance_ptr_pred(c, p, io_opts)) {
- struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev);
+ counter = dev < INT_MAX
+ ? &bch_dev_bkey_exists(c, dev)->rebalance_work
+ : &c->rebalance.work_unknown_dev;
- if (atomic64_add_return(p.crc.compressed_size,
- &ca->rebalance_work) ==
- p.crc.compressed_size)
- rebalance_wakeup(c);
- }
-}
-
-void bch2_rebalance_add_work(struct bch_fs *c, u64 sectors)
-{
- if (atomic64_add_return(sectors, &c->rebalance.work_unknown_dev) ==
- sectors)
+ if (atomic64_add_return(k.k->size, counter) == k.k->size)
rebalance_wakeup(c);
}
@@ -69,26 +71,20 @@ static enum data_cmd rebalance_pred(struct bch_fs *c, void *arg,
struct bch_io_opts *io_opts,
struct data_opts *data_opts)
{
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
- const union bch_extent_entry *entry;
- struct extent_ptr_decoded p;
- unsigned nr_replicas = 0;
-
- bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
- nr_replicas += !p.ptr.cached;
-
- if (rebalance_ptr_pred(c, p, io_opts))
- goto found;
+ if (__bch2_rebalance_pred(c, k, io_opts) >= 0) {
+ data_opts->target = io_opts->background_target;
+ data_opts->btree_insert_flags = 0;
+ return DATA_ADD_REPLICAS;
+ } else {
+ return DATA_SKIP;
}
+}
- if (nr_replicas < io_opts->data_replicas)
- goto found;
-
- return DATA_SKIP;
-found:
- data_opts->target = io_opts->background_target;
- data_opts->btree_insert_flags = 0;
- return DATA_ADD_REPLICAS;
+void bch2_rebalance_add_work(struct bch_fs *c, u64 sectors)
+{
+ if (atomic64_add_return(sectors, &c->rebalance.work_unknown_dev) ==
+ sectors)
+ rebalance_wakeup(c);
}
struct rebalance_work {
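
bch2_rebalance_add_key() and bch2_rebalance_add_work() above both rely on the same wakeup rule: add the extent's sectors to a per-device (or unknown-device) work counter and only wake the rebalance thread when that counter goes from zero to nonzero. A small C11 sketch of the rule, using a plain atomic counter instead of the bcachefs types:

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

static _Atomic uint64_t rebalance_work;

static void rebalance_wakeup(void)
{
	printf("waking rebalance thread\n");	/* stand-in for the real wakeup */
}

static void rebalance_add_work(uint64_t sectors)
{
	/* old + sectors == sectors only if the counter was previously zero */
	if (atomic_fetch_add(&rebalance_work, sectors) + sectors == sectors)
		rebalance_wakeup();
}
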
diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c
index 8ecd4abc8eeb..a4d0eec2ea3e 100644
--- a/fs/bcachefs/recovery.c
+++ b/fs/bcachefs/recovery.c
@@ -27,43 +27,173 @@
/* iterate over keys read from the journal: */
-struct journal_iter bch2_journal_iter_init(struct journal_keys *keys,
- enum btree_id id)
+static struct journal_key *journal_key_search(struct journal_keys *journal_keys,
+ enum btree_id id, unsigned level,
+ struct bpos pos)
{
- return (struct journal_iter) {
- .keys = keys,
- .k = keys->d,
- .btree_id = id,
- };
+ size_t l = 0, r = journal_keys->nr, m;
+
+ while (l < r) {
+ m = l + ((r - l) >> 1);
+ if ((cmp_int(id, journal_keys->d[m].btree_id) ?:
+ cmp_int(level, journal_keys->d[m].level) ?:
+ bkey_cmp(pos, journal_keys->d[m].k->k.p)) > 0)
+ l = m + 1;
+ else
+ r = m;
+ }
+
+ BUG_ON(l < journal_keys->nr &&
+ (cmp_int(id, journal_keys->d[l].btree_id) ?:
+ cmp_int(level, journal_keys->d[l].level) ?:
+ bkey_cmp(pos, journal_keys->d[l].k->k.p)) > 0);
+
+ BUG_ON(l &&
+ (cmp_int(id, journal_keys->d[l - 1].btree_id) ?:
+ cmp_int(level, journal_keys->d[l - 1].level) ?:
+ bkey_cmp(pos, journal_keys->d[l - 1].k->k.p)) <= 0);
+
+ return l < journal_keys->nr ? journal_keys->d + l : NULL;
+}
+
+static struct bkey_i *bch2_journal_iter_peek(struct journal_iter *iter)
+{
+ if (iter->k &&
+ iter->k < iter->keys->d + iter->keys->nr &&
+ iter->k->btree_id == iter->btree_id &&
+ iter->k->level == iter->level)
+ return iter->k->k;
+
+ iter->k = NULL;
+ return NULL;
}
-struct bkey_s_c bch2_journal_iter_peek(struct journal_iter *iter)
+static void bch2_journal_iter_advance(struct journal_iter *iter)
{
+ if (iter->k)
+ iter->k++;
+}
+
+static void bch2_journal_iter_init(struct journal_iter *iter,
+ struct journal_keys *journal_keys,
+ enum btree_id id, unsigned level,
+ struct bpos pos)
+{
+ iter->btree_id = id;
+ iter->level = level;
+ iter->keys = journal_keys;
+ iter->k = journal_key_search(journal_keys, id, level, pos);
+}
+
+static struct bkey_s_c bch2_journal_iter_peek_btree(struct btree_and_journal_iter *iter)
+{
+ return iter->btree
+ ? bch2_btree_iter_peek(iter->btree)
+ : bch2_btree_node_iter_peek_unpack(&iter->node_iter,
+ iter->b, &iter->unpacked);
+}
+
+static void bch2_journal_iter_advance_btree(struct btree_and_journal_iter *iter)
+{
+ if (iter->btree)
+ bch2_btree_iter_next(iter->btree);
+ else
+ bch2_btree_node_iter_advance(&iter->node_iter, iter->b);
+}
+
+void bch2_btree_and_journal_iter_advance(struct btree_and_journal_iter *iter)
+{
+ switch (iter->last) {
+ case none:
+ break;
+ case btree:
+ bch2_journal_iter_advance_btree(iter);
+ break;
+ case journal:
+ bch2_journal_iter_advance(&iter->journal);
+ break;
+ }
+
+ iter->last = none;
+}
+
+struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *iter)
+{
+ struct bkey_s_c ret;
+
while (1) {
- if (iter->k == iter->keys->d + iter->keys->nr)
+ struct bkey_s_c btree_k =
+ bch2_journal_iter_peek_btree(iter);
+ struct bkey_s_c journal_k =
+ bkey_i_to_s_c(bch2_journal_iter_peek(&iter->journal));
+
+ if (btree_k.k && journal_k.k) {
+ int cmp = bkey_cmp(btree_k.k->p, journal_k.k->p);
+
+ if (!cmp)
+ bch2_journal_iter_advance_btree(iter);
+
+ iter->last = cmp < 0 ? btree : journal;
+ } else if (btree_k.k) {
+ iter->last = btree;
+ } else if (journal_k.k) {
+ iter->last = journal;
+ } else {
+ iter->last = none;
return bkey_s_c_null;
+ }
- if (iter->k->btree_id == iter->btree_id)
- return bkey_i_to_s_c(iter->k->k);
+ ret = iter->last == journal ? journal_k : btree_k;
- iter->k++;
+ if (iter->b &&
+ bkey_cmp(ret.k->p, iter->b->data->max_key) > 0) {
+ iter->journal.k = NULL;
+ iter->last = none;
+ return bkey_s_c_null;
+ }
+
+ if (!bkey_deleted(ret.k))
+ break;
+
+ bch2_btree_and_journal_iter_advance(iter);
}
- return bkey_s_c_null;
+ return ret;
+}
+
+struct bkey_s_c bch2_btree_and_journal_iter_next(struct btree_and_journal_iter *iter)
+{
+ bch2_btree_and_journal_iter_advance(iter);
+
+ return bch2_btree_and_journal_iter_peek(iter);
+}
+
+void bch2_btree_and_journal_iter_init(struct btree_and_journal_iter *iter,
+ struct btree_trans *trans,
+ struct journal_keys *journal_keys,
+ enum btree_id id, struct bpos pos)
+{
+ memset(iter, 0, sizeof(*iter));
+
+ iter->btree = bch2_trans_get_iter(trans, id, pos, 0);
+ bch2_journal_iter_init(&iter->journal, journal_keys, id, 0, pos);
}
-struct bkey_s_c bch2_journal_iter_next(struct journal_iter *iter)
+void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *iter,
+ struct journal_keys *journal_keys,
+ struct btree *b)
{
- if (iter->k == iter->keys->d + iter->keys->nr)
- return bkey_s_c_null;
+ memset(iter, 0, sizeof(*iter));
- iter->k++;
- return bch2_journal_iter_peek(iter);
+ iter->b = b;
+ bch2_btree_node_iter_init_from_start(&iter->node_iter, iter->b);
+ bch2_journal_iter_init(&iter->journal, journal_keys,
+ b->btree_id, b->level, b->data->min_key);
}
/* sort and dedup all keys in the journal: */
-static void journal_entries_free(struct list_head *list)
+void bch2_journal_entries_free(struct list_head *list)
{
while (!list_empty(list)) {
@@ -75,13 +205,17 @@ static void journal_entries_free(struct list_head *list)
}
}
+/*
+ * When keys compare equal, oldest compares first:
+ */
static int journal_sort_key_cmp(const void *_l, const void *_r)
{
const struct journal_key *l = _l;
const struct journal_key *r = _r;
- return cmp_int(l->btree_id, r->btree_id) ?:
- bkey_cmp(l->pos, r->pos) ?:
+ return cmp_int(l->btree_id, r->btree_id) ?:
+ cmp_int(l->level, r->level) ?:
+ bkey_cmp(l->k->k.p, r->k->k.p) ?:
cmp_int(l->journal_seq, r->journal_seq) ?:
cmp_int(l->journal_offset, r->journal_offset);
}
@@ -91,27 +225,14 @@ static int journal_sort_seq_cmp(const void *_l, const void *_r)
const struct journal_key *l = _l;
const struct journal_key *r = _r;
- return cmp_int(l->journal_seq, r->journal_seq) ?:
- cmp_int(l->btree_id, r->btree_id) ?:
- bkey_cmp(l->pos, r->pos);
-}
-
-static void journal_keys_sift(struct journal_keys *keys, struct journal_key *i)
-{
- while (i + 1 < keys->d + keys->nr &&
- journal_sort_key_cmp(i, i + 1) > 0) {
- swap(i[0], i[1]);
- i++;
- }
+ return cmp_int(r->level, l->level) ?:
+ cmp_int(l->journal_seq, r->journal_seq) ?:
+ cmp_int(l->btree_id, r->btree_id) ?:
+ bkey_cmp(l->k->k.p, r->k->k.p);
}
-static void journal_keys_free(struct journal_keys *keys)
+void bch2_journal_keys_free(struct journal_keys *keys)
{
- struct journal_key *i;
-
- for_each_journal_key(*keys, i)
- if (i->allocated)
- kfree(i->k);
kvfree(keys->d);
keys->d = NULL;
keys->nr = 0;
@@ -122,15 +243,15 @@ static struct journal_keys journal_keys_sort(struct list_head *journal_entries)
struct journal_replay *p;
struct jset_entry *entry;
struct bkey_i *k, *_n;
- struct journal_keys keys = { NULL }, keys_deduped = { NULL };
- struct journal_key *i;
+ struct journal_keys keys = { NULL };
+ struct journal_key *src, *dst;
size_t nr_keys = 0;
list_for_each_entry(p, journal_entries, list)
for_each_jset_key(k, _n, entry, &p->j)
nr_keys++;
- keys.journal_seq_base = keys_deduped.journal_seq_base =
+ keys.journal_seq_base =
le64_to_cpu(list_first_entry(journal_entries,
struct journal_replay,
list)->j.seq);
@@ -139,91 +260,33 @@ static struct journal_keys journal_keys_sort(struct list_head *journal_entries)
if (!keys.d)
goto err;
- keys_deduped.d = kvmalloc(sizeof(keys.d[0]) * nr_keys * 2, GFP_KERNEL);
- if (!keys_deduped.d)
- goto err;
-
list_for_each_entry(p, journal_entries, list)
for_each_jset_key(k, _n, entry, &p->j)
keys.d[keys.nr++] = (struct journal_key) {
.btree_id = entry->btree_id,
- .pos = bkey_start_pos(&k->k),
+ .level = entry->level,
.k = k,
.journal_seq = le64_to_cpu(p->j.seq) -
keys.journal_seq_base,
.journal_offset = k->_data - p->j._data,
};
- sort(keys.d, nr_keys, sizeof(keys.d[0]), journal_sort_key_cmp, NULL);
-
- i = keys.d;
- while (i < keys.d + keys.nr) {
- if (i + 1 < keys.d + keys.nr &&
- i[0].btree_id == i[1].btree_id &&
- !bkey_cmp(i[0].pos, i[1].pos)) {
- if (bkey_cmp(i[0].k->k.p, i[1].k->k.p) <= 0) {
- i++;
- } else {
- bch2_cut_front(i[1].k->k.p, i[0].k);
- i[0].pos = i[1].k->k.p;
- journal_keys_sift(&keys, i);
- }
- continue;
- }
+ sort(keys.d, keys.nr, sizeof(keys.d[0]), journal_sort_key_cmp, NULL);
- if (i + 1 < keys.d + keys.nr &&
- i[0].btree_id == i[1].btree_id &&
- bkey_cmp(i[0].k->k.p, bkey_start_pos(&i[1].k->k)) > 0) {
- if ((cmp_int(i[0].journal_seq, i[1].journal_seq) ?:
- cmp_int(i[0].journal_offset, i[1].journal_offset)) < 0) {
- if (bkey_cmp(i[0].k->k.p, i[1].k->k.p) <= 0) {
- bch2_cut_back(bkey_start_pos(&i[1].k->k), i[0].k);
- } else {
- struct bkey_i *split =
- kmalloc(bkey_bytes(i[0].k), GFP_KERNEL);
-
- if (!split)
- goto err;
-
- bkey_copy(split, i[0].k);
- bch2_cut_back(bkey_start_pos(&i[1].k->k), split);
- keys_deduped.d[keys_deduped.nr++] = (struct journal_key) {
- .btree_id = i[0].btree_id,
- .allocated = true,
- .pos = bkey_start_pos(&split->k),
- .k = split,
- .journal_seq = i[0].journal_seq,
- .journal_offset = i[0].journal_offset,
- };
-
- bch2_cut_front(i[1].k->k.p, i[0].k);
- i[0].pos = i[1].k->k.p;
- journal_keys_sift(&keys, i);
- continue;
- }
- } else {
- if (bkey_cmp(i[0].k->k.p, i[1].k->k.p) >= 0) {
- i[1] = i[0];
- i++;
- continue;
- } else {
- bch2_cut_front(i[0].k->k.p, i[1].k);
- i[1].pos = i[0].k->k.p;
- journal_keys_sift(&keys, i + 1);
- continue;
- }
- }
- }
+ src = dst = keys.d;
+ while (src < keys.d + keys.nr) {
+ while (src + 1 < keys.d + keys.nr &&
+ src[0].btree_id == src[1].btree_id &&
+ src[0].level == src[1].level &&
+ !bkey_cmp(src[0].k->k.p, src[1].k->k.p))
+ src++;
- keys_deduped.d[keys_deduped.nr++] = *i++;
+ *dst++ = *src++;
}
- kvfree(keys.d);
- return keys_deduped;
+ keys.nr = dst - keys.d;
err:
- journal_keys_free(&keys_deduped);
- kvfree(keys.d);
- return (struct journal_keys) { NULL };
+ return keys;
}
/* journal replay: */
@@ -274,11 +337,6 @@ retry:
atomic_end = bpos_min(k->k.p, iter->l[0].b->key.k.p);
- split_iter = bch2_trans_copy_iter(&trans, iter);
- ret = PTR_ERR_OR_ZERO(split_iter);
- if (ret)
- goto err;
-
split = bch2_trans_kmalloc(&trans, bkey_bytes(&k->k));
ret = PTR_ERR_OR_ZERO(split);
if (ret)
@@ -297,12 +355,25 @@ retry:
}
bkey_copy(split, k);
- bch2_cut_front(split_iter->pos, split);
+ bch2_cut_front(iter->pos, split);
bch2_cut_back(atomic_end, split);
+ split_iter = bch2_trans_copy_iter(&trans, iter);
+ ret = PTR_ERR_OR_ZERO(split_iter);
+ if (ret)
+ goto err;
+
+ /*
+ * It's important that we don't go through the
+ * extent_handle_overwrites() and extent_update_to_keys() path
+ * here: journal replay is supposed to treat extents like
+ * regular keys
+ */
+ __bch2_btree_iter_set_pos(split_iter, split->k.p, false);
bch2_trans_update(&trans, split_iter, split, !remark
? BTREE_TRIGGER_NORUN
: BTREE_TRIGGER_NOOVERWRITES);
+
bch2_btree_iter_set_pos(iter, split->k.p);
} while (bkey_cmp(iter->pos, k->k.p) < 0);
@@ -328,27 +399,40 @@ err:
}
static int __bch2_journal_replay_key(struct btree_trans *trans,
- enum btree_id id, struct bkey_i *k)
+ enum btree_id id, unsigned level,
+ struct bkey_i *k)
{
struct btree_iter *iter;
+ int ret;
- iter = bch2_trans_get_iter(trans, id, bkey_start_pos(&k->k),
- BTREE_ITER_INTENT);
+ iter = bch2_trans_get_node_iter(trans, id, k->k.p,
+ BTREE_MAX_DEPTH, level,
+ BTREE_ITER_INTENT);
if (IS_ERR(iter))
return PTR_ERR(iter);
- bch2_trans_update(trans, iter, k, BTREE_TRIGGER_NORUN);
- return 0;
+ /*
+ * iter->flags & BTREE_ITER_IS_EXTENTS triggers the update path to run
+ * extent_handle_overwrites() and extent_update_to_keys() - but we don't
+ * want that here, journal replay is supposed to treat extents like
+ * regular keys:
+ */
+ __bch2_btree_iter_set_pos(iter, k->k.p, false);
+
+ ret = bch2_btree_iter_traverse(iter) ?:
+ bch2_trans_update(trans, iter, k, BTREE_TRIGGER_NORUN);
+ bch2_trans_iter_put(trans, iter);
+ return ret;
}
static int bch2_journal_replay_key(struct bch_fs *c, enum btree_id id,
- struct bkey_i *k)
+ unsigned level, struct bkey_i *k)
{
return bch2_trans_do(c, NULL, NULL,
BTREE_INSERT_NOFAIL|
BTREE_INSERT_LAZY_RW|
BTREE_INSERT_JOURNAL_REPLAY,
- __bch2_journal_replay_key(&trans, id, k));
+ __bch2_journal_replay_key(&trans, id, level, k));
}
static int bch2_journal_replay(struct bch_fs *c,
@@ -360,15 +444,21 @@ static int bch2_journal_replay(struct bch_fs *c,
sort(keys.d, keys.nr, sizeof(keys.d[0]), journal_sort_seq_cmp, NULL);
+ if (keys.nr)
+ replay_now_at(j, keys.journal_seq_base);
+
for_each_journal_key(keys, i) {
- replay_now_at(j, keys.journal_seq_base + i->journal_seq);
+ if (!i->level)
+ replay_now_at(j, keys.journal_seq_base + i->journal_seq);
+ if (i->level)
+ ret = bch2_journal_replay_key(c, i->btree_id, i->level, i->k);
if (i->btree_id == BTREE_ID_ALLOC)
ret = bch2_alloc_replay_key(c, i->k);
- else if (btree_node_type_is_extents(i->btree_id))
+ else if (i->k->k.size)
ret = bch2_extent_replay_key(c, i->btree_id, i->k);
else
- ret = bch2_journal_replay_key(c, i->btree_id, i->k);
+ ret = bch2_journal_replay_key(c, i->btree_id, i->level, i->k);
if (ret) {
bch_err(c, "journal replay: error %d while replaying key",
@@ -707,8 +797,6 @@ int bch2_fs_recovery(struct bch_fs *c)
const char *err = "cannot allocate memory";
struct bch_sb_field_clean *clean = NULL;
u64 journal_seq;
- LIST_HEAD(journal_entries);
- struct journal_keys journal_keys = { NULL };
bool wrote = false, write_sb = false;
int ret;
@@ -727,33 +815,33 @@ int bch2_fs_recovery(struct bch_fs *c)
set_bit(BCH_FS_REBUILD_REPLICAS, &c->flags);
}
- if (!c->sb.clean || c->opts.fsck) {
+ if (!c->sb.clean || c->opts.fsck || c->opts.keep_journal) {
struct jset *j;
- ret = bch2_journal_read(c, &journal_entries);
+ ret = bch2_journal_read(c, &c->journal_entries);
if (ret)
goto err;
- if (mustfix_fsck_err_on(c->sb.clean && !journal_empty(&journal_entries), c,
+ if (mustfix_fsck_err_on(c->sb.clean && !journal_empty(&c->journal_entries), c,
"filesystem marked clean but journal not empty")) {
c->sb.compat &= ~(1ULL << BCH_COMPAT_FEAT_ALLOC_INFO);
SET_BCH_SB_CLEAN(c->disk_sb.sb, false);
c->sb.clean = false;
}
- if (!c->sb.clean && list_empty(&journal_entries)) {
+ if (!c->sb.clean && list_empty(&c->journal_entries)) {
bch_err(c, "no journal entries found");
ret = BCH_FSCK_REPAIR_IMPOSSIBLE;
goto err;
}
- journal_keys = journal_keys_sort(&journal_entries);
- if (!journal_keys.d) {
+ c->journal_keys = journal_keys_sort(&c->journal_entries);
+ if (!c->journal_keys.d) {
ret = -ENOMEM;
goto err;
}
- j = &list_last_entry(&journal_entries,
+ j = &list_last_entry(&c->journal_entries,
struct journal_replay, list)->j;
ret = verify_superblock_clean(c, &clean, j);
@@ -765,7 +853,14 @@ int bch2_fs_recovery(struct bch_fs *c)
journal_seq = le64_to_cpu(clean->journal_seq) + 1;
}
- ret = journal_replay_early(c, clean, &journal_entries);
+ if (!c->sb.clean &&
+ !(c->sb.features & (1ULL << BCH_FEATURE_extents_above_btree_updates))) {
+ bch_err(c, "filesystem needs recovery from older version; run fsck from older bcachefs-tools to fix");
+ ret = -EINVAL;
+ goto err;
+ }
+
+ ret = journal_replay_early(c, clean, &c->journal_entries);
if (ret)
goto err;
@@ -783,15 +878,15 @@ int bch2_fs_recovery(struct bch_fs *c)
ret = bch2_blacklist_table_initialize(c);
- if (!list_empty(&journal_entries)) {
+ if (!list_empty(&c->journal_entries)) {
ret = verify_journal_entries_not_blacklisted_or_missing(c,
- &journal_entries);
+ &c->journal_entries);
if (ret)
goto err;
}
ret = bch2_fs_journal_start(&c->journal, journal_seq,
- &journal_entries);
+ &c->journal_entries);
if (ret)
goto err;
@@ -801,14 +896,14 @@ int bch2_fs_recovery(struct bch_fs *c)
bch_verbose(c, "starting alloc read");
err = "error reading allocation information";
- ret = bch2_alloc_read(c, &journal_keys);
+ ret = bch2_alloc_read(c, &c->journal_keys);
if (ret)
goto err;
bch_verbose(c, "alloc read done");
bch_verbose(c, "starting stripes_read");
err = "error reading stripes";
- ret = bch2_stripes_read(c, &journal_keys);
+ ret = bch2_stripes_read(c, &c->journal_keys);
if (ret)
goto err;
bch_verbose(c, "stripes_read done");
@@ -824,7 +919,7 @@ int bch2_fs_recovery(struct bch_fs *c)
*/
bch_info(c, "starting metadata mark and sweep");
err = "error in mark and sweep";
- ret = bch2_gc(c, NULL, true, true);
+ ret = bch2_gc(c, &c->journal_keys, true, true);
if (ret)
goto err;
bch_verbose(c, "mark and sweep done");
@@ -835,7 +930,7 @@ int bch2_fs_recovery(struct bch_fs *c)
test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags)) {
bch_info(c, "starting mark and sweep");
err = "error in mark and sweep";
- ret = bch2_gc(c, &journal_keys, true, false);
+ ret = bch2_gc(c, &c->journal_keys, true, false);
if (ret)
goto err;
bch_verbose(c, "mark and sweep done");
@@ -856,7 +951,7 @@ int bch2_fs_recovery(struct bch_fs *c)
bch_verbose(c, "starting journal replay");
err = "journal replay failed";
- ret = bch2_journal_replay(c, journal_keys);
+ ret = bch2_journal_replay(c, c->journal_keys);
if (ret)
goto err;
bch_verbose(c, "journal replay done");
@@ -922,8 +1017,7 @@ int bch2_fs_recovery(struct bch_fs *c)
c->disk_sb.sb->version_min =
le16_to_cpu(bcachefs_metadata_version_min);
c->disk_sb.sb->version = le16_to_cpu(bcachefs_metadata_version_current);
- c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_new_siphash;
- c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_new_extent_overwrite;
+ c->disk_sb.sb->features[0] |= BCH_SB_FEATURES_ALL;
write_sb = true;
}
@@ -953,8 +1047,10 @@ fsck_err:
set_bit(BCH_FS_FSCK_DONE, &c->flags);
bch2_flush_fsck_errs(c);
- journal_keys_free(&journal_keys);
- journal_entries_free(&journal_entries);
+ if (!c->opts.keep_journal) {
+ bch2_journal_keys_free(&c->journal_keys);
+ bch2_journal_entries_free(&c->journal_entries);
+ }
kfree(clean);
if (ret)
bch_err(c, "Error in recovery: %s (%i)", err, ret);
@@ -1042,8 +1138,7 @@ int bch2_fs_initialize(struct bch_fs *c)
c->disk_sb.sb->version = c->disk_sb.sb->version_min =
le16_to_cpu(bcachefs_metadata_version_current);
c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_atomic_nlink;
- c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_new_siphash;
- c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_new_extent_overwrite;
+ c->disk_sb.sb->features[0] |= BCH_SB_FEATURES_ALL;
SET_BCH_SB_INITIALIZED(c->disk_sb.sb, true);
SET_BCH_SB_CLEAN(c->disk_sb.sb, false);
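
journal_keys_sort() above no longer splits or trims overlapping extents; it just sorts so that equal positions are adjacent with the oldest first, then keeps only the newest copy of each position. A compressed sketch of that sort-and-dedup step, with a single integer position standing in for (btree_id, level, key position):

#include <stdlib.h>

struct jkey { int pos; int seq; };	/* hypothetical stand-in for journal_key */

static int jkey_cmp(const void *_l, const void *_r)
{
	const struct jkey *l = _l, *r = _r;

	/* equal positions sort oldest first, so the newest ends the run */
	return l->pos != r->pos ? (l->pos > r->pos) - (l->pos < r->pos)
				: (l->seq > r->seq) - (l->seq < r->seq);
}

static size_t sort_and_dedup(struct jkey *d, size_t nr)
{
	struct jkey *src = d, *dst = d;

	qsort(d, nr, sizeof(*d), jkey_cmp);

	while (src < d + nr) {
		while (src + 1 < d + nr && src[0].pos == src[1].pos)
			src++;			/* skip everything but the newest */
		*dst++ = *src++;
	}

	return dst - d;
}
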
diff --git a/fs/bcachefs/recovery.h b/fs/bcachefs/recovery.h
index 479ea46f8dcb..19f2f172a26b 100644
--- a/fs/bcachefs/recovery.h
+++ b/fs/bcachefs/recovery.h
@@ -2,32 +2,50 @@
#ifndef _BCACHEFS_RECOVERY_H
#define _BCACHEFS_RECOVERY_H
-struct journal_keys {
- struct journal_key {
- enum btree_id btree_id:8;
- unsigned allocated:1;
- struct bpos pos;
- struct bkey_i *k;
- u32 journal_seq;
- u32 journal_offset;
- } *d;
- size_t nr;
- u64 journal_seq_base;
-};
-
#define for_each_journal_key(keys, i) \
for (i = (keys).d; i < (keys).d + (keys).nr; (i)++)
struct journal_iter {
+ enum btree_id btree_id;
+ unsigned level;
struct journal_keys *keys;
struct journal_key *k;
- enum btree_id btree_id;
};
-struct journal_iter bch2_journal_iter_init(struct journal_keys *,
- enum btree_id);
-struct bkey_s_c bch2_journal_iter_peek(struct journal_iter *);
-struct bkey_s_c bch2_journal_iter_next(struct journal_iter *);
+/*
+ * Iterate over keys in the btree, with keys from the journal overlaid on top:
+ */
+
+struct btree_and_journal_iter {
+ struct btree_iter *btree;
+
+ struct btree *b;
+ struct btree_node_iter node_iter;
+ struct bkey unpacked;
+
+ struct journal_iter journal;
+
+ enum last_key_returned {
+ none,
+ btree,
+ journal,
+ } last;
+};
+
+void bch2_btree_and_journal_iter_advance(struct btree_and_journal_iter *);
+struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *);
+struct bkey_s_c bch2_btree_and_journal_iter_next(struct btree_and_journal_iter *);
+
+void bch2_btree_and_journal_iter_init(struct btree_and_journal_iter *,
+ struct btree_trans *,
+ struct journal_keys *,
+ enum btree_id, struct bpos);
+void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *,
+ struct journal_keys *,
+ struct btree *);
+
+void bch2_journal_keys_free(struct journal_keys *);
+void bch2_journal_entries_free(struct list_head *);
int bch2_fs_recovery(struct bch_fs *);
int bch2_fs_initialize(struct bch_fs *);
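
The btree_and_journal_iter declared above merges two sorted streams, the keys already in the btree and the keys still only in the journal, with the journal copy winning when both have a key at the same position. A stripped-down sketch of the peek step over two sorted arrays (it omits the deleted-key skipping and the node max_key clamp handled by the real iterator):

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

struct merge_iter {
	const uint64_t *btree;	 size_t btree_nr,   btree_idx;
	const uint64_t *journal; size_t journal_nr, journal_idx;
};

/* Returns true and stores the next key in *out, or false at end. */
static bool merge_peek(struct merge_iter *it, uint64_t *out)
{
	bool have_b = it->btree_idx   < it->btree_nr;
	bool have_j = it->journal_idx < it->journal_nr;

	if (have_b && have_j &&
	    it->btree[it->btree_idx] == it->journal[it->journal_idx]) {
		it->btree_idx++;	/* same position: journal overrides btree */
		have_b = it->btree_idx < it->btree_nr;
	}

	if (!have_b && !have_j)
		return false;

	if (have_b && (!have_j ||
		       it->btree[it->btree_idx] < it->journal[it->journal_idx]))
		*out = it->btree[it->btree_idx];
	else
		*out = it->journal[it->journal_idx];

	return true;
}
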
diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c
index 3b8c74ca3725..2f223be74926 100644
--- a/fs/bcachefs/reflink.c
+++ b/fs/bcachefs/reflink.c
@@ -128,10 +128,9 @@ static int bch2_make_extent_indirect(struct btree_trans *trans,
bch2_trans_update(trans, extent_iter, &r_p->k_i, 0);
err:
- if (!IS_ERR(reflink_iter)) {
+ if (!IS_ERR(reflink_iter))
c->reflink_hint = reflink_iter->pos.offset;
- bch2_trans_iter_put(trans, reflink_iter);
- }
+ bch2_trans_iter_put(trans, reflink_iter);
return ret;
}
@@ -185,7 +184,7 @@ s64 bch2_remap_range(struct bch_fs *c,
BTREE_ITER_INTENT);
while (1) {
- bch2_trans_reset(&trans, TRANS_RESET_MEM);
+ bch2_trans_begin(&trans);
trans.mem_top = 0;
diff --git a/fs/bcachefs/reflink.h b/fs/bcachefs/reflink.h
index ac23b855858c..5445c1cf0797 100644
--- a/fs/bcachefs/reflink.h
+++ b/fs/bcachefs/reflink.h
@@ -22,6 +22,7 @@ void bch2_reflink_v_to_text(struct printbuf *, struct bch_fs *,
#define bch2_bkey_ops_reflink_v (struct bkey_ops) { \
.key_invalid = bch2_reflink_v_invalid, \
.val_to_text = bch2_reflink_v_to_text, \
+ .swab = bch2_ptr_swab, \
}
s64 bch2_remap_range(struct bch_fs *, struct bpos, struct bpos,
diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c
index 366888b1b36d..be4908575f72 100644
--- a/fs/bcachefs/replicas.c
+++ b/fs/bcachefs/replicas.c
@@ -112,6 +112,7 @@ void bch2_bkey_to_replicas(struct bch_replicas_entry *e,
switch (k.k->type) {
case KEY_TYPE_btree_ptr:
+ case KEY_TYPE_btree_ptr_v2:
e->data_type = BCH_DATA_BTREE;
extent_to_replicas(k, e);
break;
diff --git a/fs/bcachefs/str_hash.h b/fs/bcachefs/str_hash.h
index f2779159a6b8..dea9b7252b88 100644
--- a/fs/bcachefs/str_hash.h
+++ b/fs/bcachefs/str_hash.h
@@ -163,6 +163,7 @@ bch2_hash_lookup(struct btree_trans *trans,
break;
}
}
+ bch2_trans_iter_put(trans, iter);
return ERR_PTR(ret ?: -ENOENT);
}
@@ -187,6 +188,9 @@ bch2_hash_hole(struct btree_trans *trans,
return iter;
}
+ iter->flags |= BTREE_ITER_KEEP_UNTIL_COMMIT;
+ bch2_trans_iter_put(trans, iter);
+
return ERR_PTR(ret ?: -ENOSPC);
}
@@ -262,10 +266,8 @@ int bch2_hash_set(struct btree_trans *trans,
if (!ret)
ret = -ENOSPC;
out:
- if (!IS_ERR_OR_NULL(slot))
- bch2_trans_iter_put(trans, slot);
- if (!IS_ERR_OR_NULL(iter))
- bch2_trans_iter_put(trans, iter);
+ bch2_trans_iter_put(trans, slot);
+ bch2_trans_iter_put(trans, iter);
return ret;
found:
@@ -319,13 +321,16 @@ int bch2_hash_delete(struct btree_trans *trans,
u64 inode, const void *key)
{
struct btree_iter *iter;
+ int ret;
iter = bch2_hash_lookup(trans, desc, info, inode, key,
BTREE_ITER_INTENT);
if (IS_ERR(iter))
return PTR_ERR(iter);
- return bch2_hash_delete_at(trans, desc, info, iter);
+ ret = bch2_hash_delete_at(trans, desc, info, iter);
+ bch2_trans_iter_put(trans, iter);
+ return ret;
}
#endif /* _BCACHEFS_STR_HASH_H */
diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c
index 43927853210a..6596764c8421 100644
--- a/fs/bcachefs/super-io.c
+++ b/fs/bcachefs/super-io.c
@@ -956,6 +956,9 @@ int bch2_fs_mark_dirty(struct bch_fs *c)
mutex_lock(&c->sb_lock);
SET_BCH_SB_CLEAN(c->disk_sb.sb, false);
c->disk_sb.sb->compat[0] &= ~(1ULL << BCH_COMPAT_FEAT_ALLOC_METADATA);
+ c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_new_extent_overwrite;
+ c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_extents_above_btree_updates;
+ c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_btree_updates_journalled;
ret = bch2_write_super(c);
mutex_unlock(&c->sb_lock);
@@ -1086,6 +1089,8 @@ void bch2_fs_mark_clean(struct bch_fs *c)
c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_FEAT_ALLOC_INFO;
c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_FEAT_ALLOC_METADATA;
+ c->disk_sb.sb->features[0] &= ~(1ULL << BCH_FEATURE_extents_above_btree_updates);
+ c->disk_sb.sb->features[0] &= ~(1ULL << BCH_FEATURE_btree_updates_journalled);
u64s = sizeof(*sb_clean) / sizeof(u64) + c->journal.entry_u64s_reserved;
diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c
index 38920fff4500..d2c275ce79ab 100644
--- a/fs/bcachefs/super.c
+++ b/fs/bcachefs/super.c
@@ -500,6 +500,8 @@ static void bch2_fs_free(struct bch_fs *c)
bch2_io_clock_exit(&c->io_clock[WRITE]);
bch2_io_clock_exit(&c->io_clock[READ]);
bch2_fs_compress_exit(c);
+ bch2_journal_keys_free(&c->journal_keys);
+ bch2_journal_entries_free(&c->journal_entries);
percpu_free_rwsem(&c->mark_lock);
kfree(c->usage_scratch);
free_percpu(c->usage[1]);
@@ -549,6 +551,10 @@ void bch2_fs_stop(struct bch_fs *c)
cancel_work_sync(&c->journal_seq_blacklist_gc_work);
+ mutex_lock(&c->state_lock);
+ bch2_fs_read_only(c);
+ mutex_unlock(&c->state_lock);
+
for_each_member_device(ca, c, i)
if (ca->kobj.state_in_sysfs &&
ca->disk_sb.bdev)
@@ -572,10 +578,6 @@ void bch2_fs_stop(struct bch_fs *c)
closure_sync(&c->cl);
closure_debug_destroy(&c->cl);
- mutex_lock(&c->state_lock);
- bch2_fs_read_only(c);
- mutex_unlock(&c->state_lock);
-
/* btree prefetch might have kicked off reads in the background: */
bch2_btree_flush_all_reads(c);
@@ -674,6 +676,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
INIT_LIST_HEAD(&c->list);
INIT_LIST_HEAD(&c->btree_interior_update_list);
+ INIT_LIST_HEAD(&c->btree_interior_updates_unwritten);
mutex_init(&c->btree_reserve_cache_lock);
mutex_init(&c->btree_interior_update_lock);
@@ -688,6 +691,8 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
INIT_WORK(&c->journal_seq_blacklist_gc_work,
bch2_blacklist_entries_gc);
+ INIT_LIST_HEAD(&c->journal_entries);
+
INIT_LIST_HEAD(&c->fsck_errors);
mutex_init(&c->fsck_error_lock);
diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c
index 602def1ee95a..d78ffcc0e8a4 100644
--- a/fs/bcachefs/sysfs.c
+++ b/fs/bcachefs/sysfs.c
@@ -276,7 +276,7 @@ static ssize_t bch2_compression_stats(struct bch_fs *c, char *buf)
struct extent_ptr_decoded p;
extent_for_each_ptr_decode(e, p, entry) {
- if (p.crc.compression_type == BCH_COMPRESSION_TYPE_none) {
+ if (!crc_is_compressed(p.crc)) {
nr_uncompressed_extents++;
uncompressed_sectors += e.k->size;
} else {
diff --git a/fs/bcachefs/tests.c b/fs/bcachefs/tests.c
index 8f9b0cca17da..4dcace650416 100644
--- a/fs/bcachefs/tests.c
+++ b/fs/bcachefs/tests.c
@@ -18,7 +18,7 @@ static void delete_test_keys(struct bch_fs *c)
NULL);
BUG_ON(ret);
- ret = bch2_btree_delete_range(c, BTREE_ID_DIRENTS,
+ ret = bch2_btree_delete_range(c, BTREE_ID_XATTRS,
POS(0, 0), POS(0, U64_MAX),
NULL);
BUG_ON(ret);
@@ -37,14 +37,14 @@ static void test_delete(struct bch_fs *c, u64 nr)
bch2_trans_init(&trans, c, 0, 0);
- iter = bch2_trans_get_iter(&trans, BTREE_ID_DIRENTS, k.k.p,
+ iter = bch2_trans_get_iter(&trans, BTREE_ID_XATTRS, k.k.p,
BTREE_ITER_INTENT);
ret = bch2_btree_iter_traverse(iter);
BUG_ON(ret);
- bch2_trans_update(&trans, iter, &k.k_i, 0);
- ret = bch2_trans_commit(&trans, NULL, NULL, 0);
+ ret = __bch2_trans_do(&trans, NULL, NULL, 0,
+ bch2_trans_update(&trans, iter, &k.k_i, 0));
BUG_ON(ret);
pr_info("deleting once");
@@ -69,14 +69,14 @@ static void test_delete_written(struct bch_fs *c, u64 nr)
bch2_trans_init(&trans, c, 0, 0);
- iter = bch2_trans_get_iter(&trans, BTREE_ID_DIRENTS, k.k.p,
+ iter = bch2_trans_get_iter(&trans, BTREE_ID_XATTRS, k.k.p,
BTREE_ITER_INTENT);
ret = bch2_btree_iter_traverse(iter);
BUG_ON(ret);
- bch2_trans_update(&trans, iter, &k.k_i, 0);
- ret = bch2_trans_commit(&trans, NULL, NULL, 0);
+ ret = __bch2_trans_do(&trans, NULL, NULL, 0,
+ bch2_trans_update(&trans, iter, &k.k_i, 0));
BUG_ON(ret);
bch2_journal_flush_all_pins(&c->journal);
@@ -107,7 +107,7 @@ static void test_iterate(struct bch_fs *c, u64 nr)
bkey_cookie_init(&k.k_i);
k.k.p.offset = i;
- ret = bch2_btree_insert(c, BTREE_ID_DIRENTS, &k.k_i,
+ ret = bch2_btree_insert(c, BTREE_ID_XATTRS, &k.k_i,
NULL, NULL, 0);
BUG_ON(ret);
}
@@ -116,9 +116,13 @@ static void test_iterate(struct bch_fs *c, u64 nr)
i = 0;
- for_each_btree_key(&trans, iter, BTREE_ID_DIRENTS,
- POS_MIN, 0, k, ret)
+ for_each_btree_key(&trans, iter, BTREE_ID_XATTRS,
+ POS_MIN, 0, k, ret) {
+ if (k.k->p.inode)
+ break;
+
BUG_ON(k.k->p.offset != i++);
+ }
BUG_ON(i != nr);
@@ -202,7 +206,7 @@ static void test_iterate_slots(struct bch_fs *c, u64 nr)
bkey_cookie_init(&k.k_i);
k.k.p.offset = i * 2;
- ret = bch2_btree_insert(c, BTREE_ID_DIRENTS, &k.k_i,
+ ret = bch2_btree_insert(c, BTREE_ID_XATTRS, &k.k_i,
NULL, NULL, 0);
BUG_ON(ret);
}
@@ -211,8 +215,11 @@ static void test_iterate_slots(struct bch_fs *c, u64 nr)
i = 0;
- for_each_btree_key(&trans, iter, BTREE_ID_DIRENTS, POS_MIN,
+ for_each_btree_key(&trans, iter, BTREE_ID_XATTRS, POS_MIN,
0, k, ret) {
+ if (k.k->p.inode)
+ break;
+
BUG_ON(k.k->p.offset != i);
i += 2;
}
@@ -224,11 +231,12 @@ static void test_iterate_slots(struct bch_fs *c, u64 nr)
i = 0;
- for_each_btree_key(&trans, iter, BTREE_ID_DIRENTS, POS_MIN,
+ for_each_btree_key(&trans, iter, BTREE_ID_XATTRS, POS_MIN,
BTREE_ITER_SLOTS, k, ret) {
+ BUG_ON(k.k->p.offset != i);
BUG_ON(bkey_deleted(k.k) != (i & 1));
- BUG_ON(k.k->p.offset != i++);
+ i++;
if (i == nr * 2)
break;
}
@@ -307,7 +315,7 @@ static void test_peek_end(struct bch_fs *c, u64 nr)
bch2_trans_init(&trans, c, 0, 0);
- iter = bch2_trans_get_iter(&trans, BTREE_ID_DIRENTS, POS_MIN, 0);
+ iter = bch2_trans_get_iter(&trans, BTREE_ID_XATTRS, POS_MIN, 0);
k = bch2_btree_iter_peek(iter);
BUG_ON(k.k);
@@ -409,18 +417,24 @@ static u64 test_rand(void)
static void rand_insert(struct bch_fs *c, u64 nr)
{
+ struct btree_trans trans;
struct bkey_i_cookie k;
int ret;
u64 i;
+ bch2_trans_init(&trans, c, 0, 0);
+
for (i = 0; i < nr; i++) {
bkey_cookie_init(&k.k_i);
k.k.p.offset = test_rand();
- ret = bch2_btree_insert(c, BTREE_ID_DIRENTS, &k.k_i,
- NULL, NULL, 0);
+ ret = __bch2_trans_do(&trans, NULL, NULL, 0,
+ __bch2_btree_insert(&trans, BTREE_ID_XATTRS, &k.k_i));
+
BUG_ON(ret);
}
+
+ bch2_trans_exit(&trans);
}
static void rand_lookup(struct bch_fs *c, u64 nr)
@@ -433,7 +447,7 @@ static void rand_lookup(struct bch_fs *c, u64 nr)
bch2_trans_init(&trans, c, 0, 0);
for (i = 0; i < nr; i++) {
- iter = bch2_trans_get_iter(&trans, BTREE_ID_DIRENTS,
+ iter = bch2_trans_get_iter(&trans, BTREE_ID_XATTRS,
POS(0, test_rand()), 0);
k = bch2_btree_iter_peek(iter);
@@ -454,7 +468,7 @@ static void rand_mixed(struct bch_fs *c, u64 nr)
bch2_trans_init(&trans, c, 0, 0);
for (i = 0; i < nr; i++) {
- iter = bch2_trans_get_iter(&trans, BTREE_ID_DIRENTS,
+ iter = bch2_trans_get_iter(&trans, BTREE_ID_XATTRS,
POS(0, test_rand()), 0);
k = bch2_btree_iter_peek(iter);
@@ -465,8 +479,9 @@ static void rand_mixed(struct bch_fs *c, u64 nr)
bkey_cookie_init(&k.k_i);
k.k.p = iter->pos;
- bch2_trans_update(&trans, iter, &k.k_i, 0);
- ret = bch2_trans_commit(&trans, NULL, NULL, 0);
+ ret = __bch2_trans_do(&trans, NULL, NULL, 0,
+ bch2_trans_update(&trans, iter, &k.k_i, 0));
+
BUG_ON(ret);
}
@@ -476,20 +491,50 @@ static void rand_mixed(struct bch_fs *c, u64 nr)
bch2_trans_exit(&trans);
}
+static int __do_delete(struct btree_trans *trans, struct bpos pos)
+{
+ struct btree_iter *iter;
+ struct bkey_i delete;
+ struct bkey_s_c k;
+ int ret = 0;
+
+ iter = bch2_trans_get_iter(trans, BTREE_ID_XATTRS, pos,
+ BTREE_ITER_INTENT);
+ ret = PTR_ERR_OR_ZERO(iter);
+ if (ret)
+ goto err;
+
+ k = bch2_btree_iter_peek(iter);
+ ret = bkey_err(k);
+ if (ret)
+ goto err;
+
+ bkey_init(&delete.k);
+ delete.k.p = k.k->p;
+
+ bch2_trans_update(trans, iter, &delete, 0);
+err:
+ bch2_trans_iter_put(trans, iter);
+ return ret;
+}
+
static void rand_delete(struct bch_fs *c, u64 nr)
{
- struct bkey_i k;
+ struct btree_trans trans;
int ret;
u64 i;
+ bch2_trans_init(&trans, c, 0, 0);
+
for (i = 0; i < nr; i++) {
- bkey_init(&k.k);
- k.k.p.offset = test_rand();
+ struct bpos pos = POS(0, test_rand());
- ret = bch2_btree_insert(c, BTREE_ID_DIRENTS, &k,
- NULL, NULL, 0);
+ ret = __bch2_trans_do(&trans, NULL, NULL, 0,
+ __do_delete(&trans, pos));
BUG_ON(ret);
}
+
+ bch2_trans_exit(&trans);
}
static void seq_insert(struct bch_fs *c, u64 nr)
@@ -505,12 +550,13 @@ static void seq_insert(struct bch_fs *c, u64 nr)
bch2_trans_init(&trans, c, 0, 0);
- for_each_btree_key(&trans, iter, BTREE_ID_DIRENTS, POS_MIN,
+ for_each_btree_key(&trans, iter, BTREE_ID_XATTRS, POS_MIN,
BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) {
insert.k.p = iter->pos;
- bch2_trans_update(&trans, iter, &insert.k_i, 0);
- ret = bch2_trans_commit(&trans, NULL, NULL, 0);
+ ret = __bch2_trans_do(&trans, NULL, NULL, 0,
+ bch2_trans_update(&trans, iter, &insert.k_i, 0));
+
BUG_ON(ret);
if (++i == nr)
@@ -528,7 +574,7 @@ static void seq_lookup(struct bch_fs *c, u64 nr)
bch2_trans_init(&trans, c, 0, 0);
- for_each_btree_key(&trans, iter, BTREE_ID_DIRENTS, POS_MIN, 0, k, ret)
+ for_each_btree_key(&trans, iter, BTREE_ID_XATTRS, POS_MIN, 0, k, ret)
;
bch2_trans_exit(&trans);
}
@@ -542,14 +588,15 @@ static void seq_overwrite(struct bch_fs *c, u64 nr)
bch2_trans_init(&trans, c, 0, 0);
- for_each_btree_key(&trans, iter, BTREE_ID_DIRENTS, POS_MIN,
+ for_each_btree_key(&trans, iter, BTREE_ID_XATTRS, POS_MIN,
BTREE_ITER_INTENT, k, ret) {
struct bkey_i_cookie u;
bkey_reassemble(&u.k_i, k);
- bch2_trans_update(&trans, iter, &u.k_i, 0);
- ret = bch2_trans_commit(&trans, NULL, NULL, 0);
+ ret = __bch2_trans_do(&trans, NULL, NULL, 0,
+ bch2_trans_update(&trans, iter, &u.k_i, 0));
+
BUG_ON(ret);
}
bch2_trans_exit(&trans);
@@ -559,7 +606,7 @@ static void seq_delete(struct bch_fs *c, u64 nr)
{
int ret;
- ret = bch2_btree_delete_range(c, BTREE_ID_DIRENTS,
+ ret = bch2_btree_delete_range(c, BTREE_ID_XATTRS,
POS(0, 0), POS(0, U64_MAX),
NULL);
BUG_ON(ret);
diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h
index 2b19a0038045..0128daba5970 100644
--- a/fs/bcachefs/util.h
+++ b/fs/bcachefs/util.h
@@ -664,35 +664,6 @@ static inline void memset_u64s_tail(void *s, int c, unsigned bytes)
memset(s + bytes, c, rem);
}
-static inline struct bio_vec next_contig_bvec(struct bio *bio,
- struct bvec_iter *iter)
-{
- struct bio_vec bv = bio_iter_iovec(bio, *iter);
-
- bio_advance_iter(bio, iter, bv.bv_len);
-#ifndef CONFIG_HIGHMEM
- while (iter->bi_size) {
- struct bio_vec next = bio_iter_iovec(bio, *iter);
-
- if (page_address(bv.bv_page) + bv.bv_offset + bv.bv_len !=
- page_address(next.bv_page) + next.bv_offset)
- break;
-
- bv.bv_len += next.bv_len;
- bio_advance_iter(bio, iter, next.bv_len);
- }
-#endif
- return bv;
-}
-
-#define __bio_for_each_contig_segment(bv, bio, iter, start) \
- for (iter = (start); \
- (iter).bi_size && \
- ((bv = next_contig_bvec((bio), &(iter))), 1);)
-
-#define bio_for_each_contig_segment(bv, bio, iter) \
- __bio_for_each_contig_segment(bv, bio, iter, (bio)->bi_iter)
-
void sort_cmp_size(void *base, size_t num, size_t size,
int (*cmp_func)(const void *, const void *, size_t),
void (*swap_func)(void *, void *, size_t));