author		Kent Overstreet <kent.overstreet@linux.dev>	2023-12-20 00:48:07 -0500
committer	Kent Overstreet <kent.overstreet@linux.dev>	2023-12-23 23:40:46 -0500
commit		d78d34661415d81357c229a8628ed1fc8a2c5ba4 (patch)
tree		da5cd2b746a4e35b871fad69a72164629265db3c
parent		1b4fba9837e14617ca187963135ebdea742fcce4 (diff)
delalloc btree nodes, wip
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
-rw-r--r--	fs/bcachefs/bcachefs_format.h		  1
-rw-r--r--	fs/bcachefs/btree_cache.h		  2
-rw-r--r--	fs/bcachefs/btree_io.c			114
-rw-r--r--	fs/bcachefs/btree_io.h			  7
-rw-r--r--	fs/bcachefs/btree_update_interior.c	 48
-rw-r--r--	fs/bcachefs/extents.c			  4
-rw-r--r--	fs/bcachefs/opts.h			  5
7 files changed, 134 insertions, 47 deletions
diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h
index 2105198daf3b..9443e8de8806 100644
--- a/fs/bcachefs/bcachefs_format.h
+++ b/fs/bcachefs/bcachefs_format.h
@@ -675,6 +675,7 @@ struct bch_btree_ptr_v2 {
} __packed __aligned(8);
LE16_BITMASK(BTREE_PTR_RANGE_UPDATED, struct bch_btree_ptr_v2, flags, 0, 1);
+LE16_BITMASK(BTREE_PTR_NONCE, struct bch_btree_ptr_v2, flags, 1, 16);
struct bch_extent {
struct bch_val v;
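
As an aside, not part of the patch: the new BTREE_PTR_NONCE field packs a per-write counter into bits 1..15 of the btree_ptr_v2 flags word, alongside BTREE_PTR_RANGE_UPDATED in bit 0; the write path later in this series bumps it on each node write and marks the node for rewrite once it would overflow. A minimal, self-contained sketch of that accessor pattern, with plain shifts standing in for the kernel's LE16_BITMASK() macro and NONCE_MAX standing in for BTREE_PTR_NONCE_MAX (which is not defined in this hunk):

#include <stdint.h>
#include <stdio.h>

#define NONCE_SHIFT	1
#define NONCE_BITS	15
#define NONCE_MASK	(((1u << NONCE_BITS) - 1) << NONCE_SHIFT)
#define NONCE_MAX	((1u << NONCE_BITS) - 1)

/* bits 1..15 of a 16-bit flags word hold the nonce counter (bit 0 is
 * RANGE_UPDATED in the real struct); these mimic LE16_BITMASK accessors
 * but ignore endianness for brevity */
static unsigned get_nonce(uint16_t flags)
{
	return (flags & NONCE_MASK) >> NONCE_SHIFT;
}

static uint16_t set_nonce(uint16_t flags, unsigned nonce)
{
	return (uint16_t)((flags & ~NONCE_MASK) | (nonce << NONCE_SHIFT));
}

int main(void)
{
	uint16_t flags = 0;
	unsigned nonce = get_nonce(flags);

	/* each node write bumps the counter; once it hits the max the node
	 * has to be rewritten rather than reusing a nonce */
	if (nonce == NONCE_MAX)
		printf("node needs rewrite\n");
	else
		flags = set_nonce(flags, nonce + 1);

	printf("nonce is now %u\n", get_nonce(flags));
	return 0;
}
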
diff --git a/fs/bcachefs/btree_cache.h b/fs/bcachefs/btree_cache.h
index 4e1af5882052..4629f90a1c56 100644
--- a/fs/bcachefs/btree_cache.h
+++ b/fs/bcachefs/btree_cache.h
@@ -51,7 +51,7 @@ static inline u64 btree_ptr_hash_val(const struct bkey_i *k)
*/
return *((u64 *) &bkey_i_to_btree_ptr_v2_c(k)->v.seq);
default:
- return 0;
+ BUG();
}
}
diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c
index 33115744d8f3..3363e2394466 100644
--- a/fs/bcachefs/btree_io.c
+++ b/fs/bcachefs/btree_io.c
@@ -1,6 +1,7 @@
// SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h"
+#include "alloc_foreground.h"
#include "bkey_methods.h"
#include "bkey_sort.h"
#include "btree_cache.h"
@@ -524,7 +525,9 @@ static void btree_err_msg(struct printbuf *out, struct bch_fs *c,
prt_printf(out, "at btree ");
bch2_btree_pos_to_text(out, c, b);
- prt_printf(out, "\n node offset %u", b->written);
+ prt_printf(out, "\n node offset %u/%zu", b->written,
+ btree_ptr_sectors_written(bkey_i_to_s_c(&b->key)) ?:
+ btree_sectors(c));
if (i)
prt_printf(out, " bset u64s %u", le16_to_cpu(i->u64s));
prt_str(out, ": ");
@@ -939,10 +942,12 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
bool updated_range = b->key.k.type == KEY_TYPE_btree_ptr_v2 &&
BTREE_PTR_RANGE_UPDATED(&bkey_i_to_btree_ptr_v2(&b->key)->v);
unsigned u64s;
- unsigned ptr_written = btree_ptr_sectors_written(&b->key);
+ unsigned ptr_written = btree_ptr_sectors_written(bkey_i_to_s_c(&b->key));
struct printbuf buf = PRINTBUF;
int ret = 0, retry_read = 0, write = READ;
+ BUG_ON(!ptr_written);
+
b->version_ondisk = U16_MAX;
/* We might get called multiple times on read retry: */
b->written = 0;
@@ -1017,7 +1022,8 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
bset_bad_csum,
"invalid checksum");
- ret = bset_encrypt(c, i, b->written << 9);
+ unsigned nonce_offset = BTREE_PTR_NONCE(&bkey_i_to_btree_ptr_v2(&b->key)->v) * btree_bytes(c);
+ ret = bset_encrypt(c, i, nonce_offset);
if (bch2_fs_fatal_err_on(ret, c,
"error decrypting btree node: %i", ret))
goto fsck_err;
@@ -1805,17 +1811,20 @@ static void btree_node_write_work(struct work_struct *work)
}
} else {
+ BUG_ON(wbio->key.k.type != KEY_TYPE_btree_ptr_v2);
+
ret = bch2_trans_do(c, NULL, NULL, 0,
bch2_btree_node_update_key_get_iter(trans, b, &wbio->key,
BCH_WATERMARK_reclaim|
BCH_TRANS_COMMIT_journal_reclaim|
BCH_TRANS_COMMIT_no_enospc|
BCH_TRANS_COMMIT_no_check_rw,
- !wbio->wbio.failed.nr));
+ !wbio->wbio.failed.nr && !c->opts.delalloc_btree_nodes));
if (ret)
goto err;
}
out:
+ bch2_open_buckets_put(c, &wbio->obs);
bio_put(&wbio->wbio.bio);
btree_node_write_done(c, b);
return;
@@ -1867,21 +1876,22 @@ static void btree_node_write_endio(struct bio *bio)
static int validate_bset_for_write(struct bch_fs *c, struct btree *b,
struct bset *i, unsigned sectors)
{
- struct printbuf buf = PRINTBUF;
- bool saw_error;
int ret;
- ret = bch2_bkey_invalid(c, bkey_i_to_s_c(&b->key),
- BKEY_TYPE_btree, WRITE, &buf);
-
- if (ret)
- bch2_fs_inconsistent(c, "invalid btree node key before write: %s", buf.buf);
- printbuf_exit(&buf);
- if (ret)
- return ret;
+ if (!c->opts.delalloc_btree_nodes) {
+ struct printbuf buf = PRINTBUF;
+ ret = bch2_bkey_invalid(c, bkey_i_to_s_c(&b->key), BKEY_TYPE_btree, WRITE, &buf);
+ if (ret)
+ bch2_fs_inconsistent(c, "invalid btree node key before write: %s", buf.buf);
+ printbuf_exit(&buf);
+ if (ret)
+ return ret;
+ }
+ bool saw_error;
+ unsigned offset = !c->opts.delalloc_btree_nodes ? b->written : 0;
ret = validate_bset_keys(c, b, i, WRITE, false, &saw_error) ?:
- validate_bset(c, NULL, b, i, b->written, sectors, WRITE, false, &saw_error);
+ validate_bset(c, NULL, b, i, offset, sectors, WRITE, false, &saw_error);
if (ret) {
bch2_inconsistent_error(c);
dump_stack();
@@ -1896,6 +1906,38 @@ static void btree_write_submit(struct work_struct *work)
struct bch_fs *c = wbio->wbio.c;
struct btree *b = wbio->b;
+ if (c->opts.delalloc_btree_nodes) {
+ struct closure cl;
+ int ret;
+
+ closure_init_stack(&cl);
+
+ BUG_ON(wbio->key.k.type != KEY_TYPE_btree_ptr_v2);
+ set_bkey_val_bytes(&wbio->key.k, sizeof(struct bch_btree_ptr_v2));
+
+ do {
+ ret = bch2_trans_do(c, NULL, NULL, 0,
+ bch2_alloc_sectors_trans(trans,
+ c->opts.metadata_target ?:
+ c->opts.foreground_target,
+ false,
+ writepoint_ptr(&c->btree_write_point),
+ c->opts.metadata_replicas,
+ c->opts.metadata_replicas_required,
+ BCH_WATERMARK_reclaim, 0,
+ bio_sectors(&wbio->wbio.bio),
+ &cl, &wbio->obs, &wbio->key));
+ closure_sync(&cl);
+ } while (bch2_err_matches(ret, BCH_ERR_operation_blocked));
+
+ if (unlikely(ret)) {
+ bch2_fs_fatal_error(c, "error allocating btree node: %s",
+ bch2_err_str(ret));
+ /* how do we unwind this? */
+ BUG();
+ }
+ }
+
struct btree_update *as = (void *) (b->will_make_reachable & ~1UL);
if (as) {
spin_lock(&as->journal_entries_lock);
@@ -1999,12 +2041,19 @@ do_write:
BUG_ON(bset_written(b, btree_bset_last(b)));
BUG_ON(le64_to_cpu(b->data->magic) != bset_magic(c));
BUG_ON(memcmp(&b->data->format, &b->format, sizeof(b->format)));
+ BUG_ON(BTREE_PTR_NONCE(&bkey_i_to_btree_ptr_v2(&b->key)->v) == BTREE_PTR_NONCE_MAX);
+
+ unsigned nonce_counter = BTREE_PTR_NONCE(&bkey_i_to_btree_ptr_v2(&b->key)->v);
+ BUG_ON(nonce_counter == BTREE_PTR_NONCE_MAX);
+ nonce_counter++;
bch2_sort_whiteouts(c, b);
sort_iter_stack_init(&sort_iter, b);
- bytes = !b->written
+ bool btree_node_header = !b->written || c->opts.delalloc_btree_nodes;
+
+ bytes = btree_node_header
? sizeof(struct btree_node)
: sizeof(struct btree_node_entry);
@@ -2013,7 +2062,7 @@ do_write:
for_each_bset(b, t) {
i = bset(b, t);
- if (bset_written(b, i))
+ if (bset_written(b, i) && !c->opts.delalloc_btree_nodes)
continue;
bytes += le16_to_cpu(i->u64s) * sizeof(u64);
@@ -2033,7 +2082,7 @@ do_write:
data = btree_bounce_alloc(c, bytes, &used_mempool);
- if (!b->written) {
+ if (btree_node_header) {
bn = data;
*bn = *b->data;
i = &bn->keys;
@@ -2069,17 +2118,21 @@ do_write:
if (!b->written &&
b->key.k.type == KEY_TYPE_btree_ptr_v2)
- BUG_ON(btree_ptr_sectors_written(&b->key) != sectors_to_write);
+ BUG_ON(btree_ptr_sectors_written(bkey_i_to_s_c(&b->key)) != sectors_to_write);
memset(data + bytes_to_write, 0,
(sectors_to_write << 9) - bytes_to_write);
- BUG_ON(b->written + sectors_to_write > btree_sectors(c));
+ unsigned new_written = sectors_to_write +
+ (c->opts.delalloc_btree_nodes ? 0 : b->written);
+
+ BUG_ON(new_written > btree_sectors(c));
BUG_ON(BSET_BIG_ENDIAN(i) != CPU_BIG_ENDIAN);
BUG_ON(i->seq != b->data->keys.seq);
i->version = cpu_to_le16(c->sb.version);
- SET_BSET_OFFSET(i, b->written);
+ if (!c->opts.delalloc_btree_nodes)
+ SET_BSET_OFFSET(i, b->written);
SET_BSET_CSUM_TYPE(i, bch2_meta_checksum_type(c));
if (bch2_csum_type_is_encryption(BSET_CSUM_TYPE(i)))
@@ -2094,7 +2147,11 @@ do_write:
validate_bset_for_write(c, b, i, sectors_to_write))
goto err;
- ret = bset_encrypt(c, i, b->written << 9);
+ unsigned nonce_offset = !c->opts.delalloc_btree_nodes
+ ? b->written << 9
+ : btree_bytes(c) * nonce_counter;
+
+ ret = bset_encrypt(c, i, nonce_offset);
if (bch2_fs_fatal_err_on(ret, c,
"error encrypting btree node: %i\n", ret))
goto err;
@@ -2143,9 +2200,10 @@ do_write:
struct btree_write_bio, wbio.bio);
wbio_init(&wbio->wbio.bio);
wbio->b = b;
+ wbio->obs.nr = 0;
wbio->data = data;
wbio->data_bytes = bytes;
- wbio->sector_offset = b->written;
+ wbio->sector_offset = !c->opts.delalloc_btree_nodes ? b->written : 0;
wbio->wbio.c = c;
wbio->wbio.used_mempool = used_mempool;
wbio->wbio.first_btree_write = !b->written;
@@ -2155,7 +2213,13 @@ do_write:
bkey_copy(&wbio->key, &b->key);
- b->written += sectors_to_write;
+ if (!c->opts.delalloc_btree_nodes && bch2_csum_type_is_encryption(BSET_CSUM_TYPE(i))) {
+ SET_BTREE_PTR_NONCE(&bkey_i_to_btree_ptr_v2(&wbio->key)->v, nonce_counter);
+ if (nonce_counter == BTREE_PTR_NONCE_MAX)
+ set_btree_node_need_rewrite(b);
+ }
+
+ b->written = new_written;
if (wbio->key.k.type == KEY_TYPE_btree_ptr_v2)
bkey_i_to_btree_ptr_v2(&wbio->key)->v.sectors_written =
@@ -2169,7 +2233,7 @@ do_write:
return;
err:
set_btree_node_noevict(b);
- b->written += sectors_to_write;
+ b->written = new_written;
nowrite:
btree_bounce_free(c, bytes, used_mempool, data);
__btree_node_write_done(c, b);
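
For illustration only, not part of the patch: with delalloc_btree_nodes enabled, btree_write_submit() above defers sector allocation to submit time and keeps retrying bch2_alloc_sectors_trans() for as long as it returns BCH_ERR_operation_blocked, waiting on a closure between attempts; any other allocation failure is (for now) fatal. A rough standalone sketch of that retry shape, where alloc_sectors() and wait_for_space() are stand-ins for the real allocator call and closure_sync():

#include <errno.h>
#include <stdbool.h>

struct write_ctx {
	bool have_space;	/* stand-in for the allocator finding free sectors */
};

static int alloc_sectors(struct write_ctx *w)
{
	return w->have_space ? 0 : -EAGAIN;	/* -EAGAIN ~ operation_blocked */
}

static void wait_for_space(struct write_ctx *w)
{
	w->have_space = true;			/* stand-in for closure_sync() */
}

static int submit_node_write(struct write_ctx *w)
{
	int ret;

	do {
		ret = alloc_sectors(w);
		if (ret == -EAGAIN)
			wait_for_space(w);
	} while (ret == -EAGAIN);

	return ret;	/* any other error is treated as fatal in the WIP code */
}

int main(void)
{
	struct write_ctx w = { .have_space = false };

	return submit_node_write(&w);
}
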
diff --git a/fs/bcachefs/btree_io.h b/fs/bcachefs/btree_io.h
index 2fd6926a71c7..0aec54b18ce9 100644
--- a/fs/bcachefs/btree_io.h
+++ b/fs/bcachefs/btree_io.h
@@ -27,10 +27,10 @@ static inline void clear_btree_node_dirty_acct(struct bch_fs *c, struct btree *b
atomic_dec(&c->btree_cache.dirty);
}
-static inline unsigned btree_ptr_sectors_written(struct bkey_i *k)
+static inline unsigned btree_ptr_sectors_written(struct bkey_s_c k)
{
- return k->k.type == KEY_TYPE_btree_ptr_v2
- ? le16_to_cpu(bkey_i_to_btree_ptr_v2(k)->v.sectors_written)
+ return k.k->type == KEY_TYPE_btree_ptr_v2
+ ? le16_to_cpu(bkey_s_c_to_btree_ptr_v2(k).v->sectors_written)
: 0;
}
@@ -53,6 +53,7 @@ struct btree_write_bio {
void *data;
unsigned data_bytes;
unsigned sector_offset;
+ struct open_buckets obs;
struct bch_write_bio wbio;
};
diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c
index de796dfb3df8..d375747d19c2 100644
--- a/fs/bcachefs/btree_update_interior.c
+++ b/fs/bcachefs/btree_update_interior.c
@@ -258,6 +258,11 @@ static struct btree *__bch2_btree_node_alloc(struct btree_trans *trans,
: 0;
int ret;
+ bkey_btree_ptr_v2_init(&tmp.k);
+
+ if (c->opts.delalloc_btree_nodes)
+ goto mem_alloc;
+
mutex_lock(&c->btree_reserve_cache_lock);
if (c->btree_reserve_cache_nr > nr_reserve) {
struct btree_alloc *a =
@@ -270,7 +275,6 @@ static struct btree *__bch2_btree_node_alloc(struct btree_trans *trans,
}
mutex_unlock(&c->btree_reserve_cache_lock);
- bkey_btree_ptr_v2_init(&tmp.k);
ret = bch2_alloc_sectors_trans(trans,
c->opts.metadata_target ?:
c->opts.foreground_target,
@@ -1236,11 +1240,11 @@ static void bch2_insert_fixup_btree_ptr(struct btree_trans *trans,
unsigned long old, new, v;
BUG_ON(insert->k.type == KEY_TYPE_btree_ptr_v2 &&
- !btree_ptr_sectors_written(insert));
+ !btree_ptr_sectors_written(bkey_i_to_s_c(insert)));
if (unlikely(!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags)))
bch2_journal_key_overwritten(c, b->c.btree_id, b->c.level, insert->k.p);
-
+#if 0
if (bch2_bkey_invalid(c, bkey_i_to_s_c(insert),
btree_node_type(b), WRITE, &buf) ?:
bch2_bkey_in_btree_node(c, b, bkey_i_to_s_c(insert), &buf)) {
@@ -1255,7 +1259,7 @@ static void bch2_insert_fixup_btree_ptr(struct btree_trans *trans,
bch2_fs_inconsistent(c, "%s", buf.buf);
dump_stack();
}
-
+#endif
while ((k = bch2_btree_node_iter_peek_all(node_iter, b)) &&
bkey_iter_pos_cmp(b, k, &insert->k.p) < 0)
bch2_btree_node_iter_advance(node_iter, b);
@@ -2116,20 +2120,25 @@ static int __bch2_btree_node_update_key(struct btree_trans *trans,
bool skip_triggers)
{
struct bch_fs *c = trans->c;
- struct btree_iter iter2 = { NULL };
+ struct btree_iter iter2 = {};
struct btree *parent;
+ struct disk_reservation disk_res = {};
int ret;
if (!skip_triggers) {
- ret = bch2_trans_mark_old(trans, b->c.btree_id, b->c.level + 1,
- bkey_i_to_s_c(&b->key), 0);
- if (ret)
- return ret;
+ int ptrs_delta = bch2_bkey_nr_ptrs(bkey_i_to_s_c(new_key)) -
+ bch2_bkey_nr_ptrs(bkey_i_to_s_c(&b->key));
- ret = bch2_trans_mark_new(trans, b->c.btree_id, b->c.level + 1,
- new_key, 0);
+ bch2_disk_reservation_add(c, &disk_res,
+ max(ptrs_delta, 0) * btree_sectors(c),
+ BCH_DISK_RESERVATION_NOFAIL);
+
+ ret = bch2_trans_mark_old(trans, b->c.btree_id, b->c.level + 1,
+ bkey_i_to_s_c(&b->key), 0) ?:
+ bch2_trans_mark_new(trans, b->c.btree_id, b->c.level + 1,
+ new_key, 0);
if (ret)
- return ret;
+ goto err;
}
if (new_hash) {
@@ -2174,7 +2183,7 @@ static int __bch2_btree_node_update_key(struct btree_trans *trans,
new_key, new_key->k.u64s);
}
- ret = bch2_trans_commit(trans, NULL, NULL, commit_flags);
+ ret = bch2_trans_commit(trans, &disk_res, NULL, commit_flags);
if (ret)
goto err;
@@ -2196,6 +2205,7 @@ static int __bch2_btree_node_update_key(struct btree_trans *trans,
bch2_btree_node_unlock_write(trans, btree_iter_path(trans, iter), b);
out:
bch2_trans_iter_exit(trans, &iter2);
+ bch2_disk_reservation_put(c, &disk_res);
return ret;
err:
if (new_hash) {
@@ -2216,6 +2226,8 @@ int bch2_btree_node_update_key(struct btree_trans *trans, struct btree_iter *ite
struct closure cl;
int ret = 0;
+ BUG_ON(new_key->k.type != KEY_TYPE_btree_ptr_v2);
+
ret = bch2_btree_path_upgrade(trans, path, b->c.level + 1);
if (ret)
return ret;
@@ -2262,6 +2274,8 @@ int bch2_btree_node_update_key_get_iter(struct btree_trans *trans,
struct btree_iter iter;
int ret;
+ BUG_ON(new_key->k.type != KEY_TYPE_btree_ptr_v2);
+
bch2_trans_node_iter_init(trans, &iter, b->c.btree_id, b->key.k.p,
BTREE_MAX_DEPTH, b->c.level,
BTREE_ITER_INTENT);
@@ -2278,9 +2292,11 @@ int bch2_btree_node_update_key_get_iter(struct btree_trans *trans,
BUG_ON(!btree_node_hashed(b));
- struct bch_extent_ptr *ptr;
- bch2_bkey_drop_ptrs(bkey_i_to_s(new_key), ptr,
- !bch2_bkey_has_device(bkey_i_to_s(&b->key), ptr->dev));
+ if (!trans->c->opts.delalloc_btree_nodes) {
+ struct bch_extent_ptr *ptr;
+ bch2_bkey_drop_ptrs(bkey_i_to_s(new_key), ptr,
+ !bch2_bkey_has_device(bkey_i_to_s(&b->key), ptr->dev));
+ }
ret = bch2_btree_node_update_key(trans, &iter, b, new_key,
commit_flags, skip_triggers);
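
Again purely illustrative: the reworked __bch2_btree_node_update_key() sizes its disk reservation from the pointer-count delta between the new and old key, reserving one btree node's worth of sectors per added replica and nothing when replicas are dropped (max(ptrs_delta, 0) * btree_sectors(c), taken with BCH_DISK_RESERVATION_NOFAIL). A small sketch of that arithmetic with made-up helper names:

#include <stdio.h>

/* one full btree node of sectors per added replica, zero if shrinking */
static unsigned update_key_reservation(unsigned old_nr_ptrs,
				       unsigned new_nr_ptrs,
				       unsigned btree_node_sectors)
{
	int delta = (int) new_nr_ptrs - (int) old_nr_ptrs;

	return delta > 0 ? (unsigned) delta * btree_node_sectors : 0;
}

int main(void)
{
	/* going from 1 pointer to 3 on a 512-sector node reserves 1024 sectors */
	printf("%u\n", update_key_reservation(1, 3, 512));
	return 0;
}
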
diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c
index 82ec056f4cdb..648f1daffb3b 100644
--- a/fs/bcachefs/extents.c
+++ b/fs/bcachefs/extents.c
@@ -1117,7 +1117,7 @@ int bch2_bkey_ptrs_invalid(struct bch_fs *c, struct bkey_s_c k,
int ret = 0;
if (bkey_is_btree_ptr(k.k))
- size_ondisk = btree_sectors(c);
+ size_ondisk = btree_ptr_sectors_written(k) ?: btree_sectors(c);
bkey_extent_entry_for_each(ptrs, entry) {
bkey_fsck_err_on(__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX, c, err,
@@ -1206,7 +1206,7 @@ int bch2_bkey_ptrs_invalid(struct bch_fs *c, struct bkey_s_c k,
}
}
- bkey_fsck_err_on(!nr_ptrs, c, err,
+ bkey_fsck_err_on(k.k->type != KEY_TYPE_btree_ptr_v2 && !nr_ptrs, c, err,
extent_ptrs_no_ptrs,
"no ptrs");
bkey_fsck_err_on(nr_ptrs > BCH_BKEY_PTRS_MAX, c, err,
diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h
index 42cad83efb48..67d39eab413f 100644
--- a/fs/bcachefs/opts.h
+++ b/fs/bcachefs/opts.h
@@ -419,6 +419,11 @@ enum fsck_err_opts {
OPT_UINT(0, S64_MAX), \
BCH2_NO_SB_OPT, false, \
NULL, "Pointer to a struct log_output") \
+ x(delalloc_btree_nodes, u8, \
+ 0, \
+ OPT_BOOL(), \
+ BCH2_NO_SB_OPT, true, \
+ NULL, NULL) \
x(project, u8, \
OPT_INODE, \
OPT_BOOL(), \