author     Kent Overstreet <kent.overstreet@gmail.com>   2022-01-01 21:22:24 -0500
committer  Kent Overstreet <kent.overstreet@gmail.com>   2022-01-01 21:22:41 -0500
commit     c50379128bb027d43c76ac27d7cade7260db7f66 (patch)
tree       dfd393d7bfa247bb9125fecb23872872d87bd11a
parent     8a632ea60db6c2a78c5f4f412c12b605f8505511 (diff)
Update bcachefs sources to aa439f3b94 bcachefs: btree_gc no longer uses main in-memory bucket array
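
The headline change: mark and sweep GC now keeps a private shadow bucket array, ca->buckets[1], allocated in bch2_gc_alloc_start and reconciled against the alloc btree in bch2_alloc_write_key, rather than mutating the main in-memory array under mark_lock. A minimal sketch of the accessor pattern this relies on; the types below are cut-down stand-ins for illustration, not the real bcachefs definitions:

#include <stdbool.h>
#include <stddef.h>

/* Stand-in types for illustration; the real definitions differ. */
struct bucket {
	unsigned char	gen;
	bool		gen_valid;
};

struct bucket_array {
	size_t		first_bucket;
	size_t		nbuckets;
	struct bucket	b[];
};

struct bch_dev {
	/* [0] is the main array, [1] is GC's private shadow copy */
	struct bucket_array *buckets[2];
};

/* Mirrors the __bucket(ca, b, gc) and gc_bucket() calls seen in the diff: */
static inline struct bucket *__bucket(struct bch_dev *ca, size_t b, bool gc)
{
	return &ca->buckets[gc]->b[b];
}

static inline struct bucket *gc_bucket(struct bch_dev *ca, size_t b)
{
	return __bucket(ca, b, true);
}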
-rw-r--r--  .bcachefs_revision                     2
-rw-r--r--  cmd_migrate.c                          2
-rw-r--r--  libbcachefs/alloc_background.c       185
-rw-r--r--  libbcachefs/alloc_background.h        42
-rw-r--r--  libbcachefs/bcachefs.h                 4
-rw-r--r--  libbcachefs/bcachefs_format.h          1
-rw-r--r--  libbcachefs/btree_gc.c               490
-rw-r--r--  libbcachefs/btree_iter.c             213
-rw-r--r--  libbcachefs/btree_key_cache.h          3
-rw-r--r--  libbcachefs/btree_types.h              9
-rw-r--r--  libbcachefs/btree_update.h            17
-rw-r--r--  libbcachefs/btree_update_interior.c    9
-rw-r--r--  libbcachefs/btree_update_leaf.c        9
-rw-r--r--  libbcachefs/buckets.c                 16
-rw-r--r--  libbcachefs/ec.c                      60
-rw-r--r--  libbcachefs/journal_reclaim.c          5
-rw-r--r--  libbcachefs/journal_types.h            1
-rw-r--r--  libbcachefs/opts.h                     4
-rw-r--r--  libbcachefs/recovery.c               330
-rw-r--r--  libbcachefs/recovery.h                10
-rw-r--r--  libbcachefs/super-io.c                21
21 files changed, 664 insertions, 769 deletions
diff --git a/.bcachefs_revision b/.bcachefs_revision
index e086bdcf..193d8bd7 100644
--- a/.bcachefs_revision
+++ b/.bcachefs_revision
@@ -1 +1 @@
-42284b8b2bb980c80140b640de7cb12bc1e4541c
+aa439f3b94eb3141f9b6d71f780300e7fef44af9
diff --git a/cmd_migrate.c b/cmd_migrate.c
index bde7288b..fc863f89 100644
--- a/cmd_migrate.c
+++ b/cmd_migrate.c
@@ -605,8 +605,6 @@ static void copy_fs(struct bch_fs *c, int src_fd, const char *src_path,
darray_free(s.extents);
genradix_free(&s.hardlinks);
-
- bch2_alloc_write_all(c, false);
}
static void find_superblock_space(ranges extents,
diff --git a/libbcachefs/alloc_background.c b/libbcachefs/alloc_background.c
index eb2e6642..df340ebb 100644
--- a/libbcachefs/alloc_background.c
+++ b/libbcachefs/alloc_background.c
@@ -38,15 +38,6 @@ static const unsigned BCH_ALLOC_V1_FIELD_BYTES[] = {
#undef x
};
-struct bkey_alloc_buf {
- struct bkey_i k;
- struct bch_alloc_v3 v;
-
-#define x(_name, _bits) + _bits / 8
- u8 _pad[0 + BCH_ALLOC_FIELDS_V2()];
-#undef x
-} __attribute__((packed, aligned(8)));
-
/* Persistent alloc info: */
static inline u64 alloc_field_v1_get(const struct bch_alloc *a,
@@ -253,24 +244,25 @@ struct bkey_alloc_unpacked bch2_alloc_unpack(struct bkey_s_c k)
return ret;
}
-static void bch2_alloc_pack(struct bch_fs *c,
- struct bkey_alloc_buf *dst,
- const struct bkey_alloc_unpacked src)
+struct bkey_alloc_buf *bch2_alloc_pack(struct btree_trans *trans,
+ const struct bkey_alloc_unpacked src)
{
- bch2_alloc_pack_v3(dst, src);
+ struct bkey_alloc_buf *dst;
+
+ dst = bch2_trans_kmalloc(trans, sizeof(struct bkey_alloc_buf));
+ if (!IS_ERR(dst))
+ bch2_alloc_pack_v3(dst, src);
+
+ return dst;
}
int bch2_alloc_write(struct btree_trans *trans, struct btree_iter *iter,
struct bkey_alloc_unpacked *u, unsigned trigger_flags)
{
- struct bkey_alloc_buf *a;
+ struct bkey_alloc_buf *a = bch2_alloc_pack(trans, *u);
- a = bch2_trans_kmalloc(trans, sizeof(struct bkey_alloc_buf));
- if (IS_ERR(a))
- return PTR_ERR(a);
-
- bch2_alloc_pack(trans->c, a, *u);
- return bch2_trans_update(trans, iter, &a->k, trigger_flags);
+ return PTR_ERR_OR_ZERO(a) ?:
+ bch2_trans_update(trans, iter, &a->k, trigger_flags);
}
static unsigned bch_alloc_v1_val_u64s(const struct bch_alloc *a)
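
A note on the PTR_ERR_OR_ZERO(a) ?: bch2_trans_update(...) form in the hunk above: kernel code leans on GCC's two-operand conditional, which yields the left operand when it is nonzero and only then evaluates the right one, so the first error short-circuits the chain. A standalone demonstration with hypothetical step functions (compiles with GCC or Clang):

#include <stdio.h>

static int step_ok(void)   { return 0; }	/* success */
static int step_fail(void) { return -22; }	/* i.e. -EINVAL */

int main(void)
{
	/* step_fail()'s nonzero result stops the chain; the trailing
	 * step_ok() is never evaluated: */
	int ret = step_ok() ?: step_fail() ?: step_ok();

	printf("ret = %d\n", ret);	/* prints ret = -22 */
	return 0;
}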
@@ -340,119 +332,54 @@ void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c,
#undef x
}
-static int bch2_alloc_read_fn(struct btree_trans *trans, struct bkey_s_c k)
+int bch2_alloc_read(struct bch_fs *c, bool gc, bool metadata_only)
{
- struct bch_fs *c = trans->c;
+ struct btree_trans trans;
+ struct btree_iter iter;
+ struct bkey_s_c k;
struct bch_dev *ca;
struct bucket *g;
struct bkey_alloc_unpacked u;
-
- if (!bkey_is_alloc(k.k))
- return 0;
-
- ca = bch_dev_bkey_exists(c, k.k->p.inode);
- g = bucket(ca, k.k->p.offset);
- u = bch2_alloc_unpack(k);
-
- *bucket_gen(ca, k.k->p.offset) = u.gen;
- g->_mark.gen = u.gen;
- g->_mark.data_type = u.data_type;
- g->_mark.dirty_sectors = u.dirty_sectors;
- g->_mark.cached_sectors = u.cached_sectors;
- g->_mark.stripe = u.stripe != 0;
- g->stripe = u.stripe;
- g->stripe_redundancy = u.stripe_redundancy;
- g->io_time[READ] = u.read_time;
- g->io_time[WRITE] = u.write_time;
- g->oldest_gen = u.oldest_gen;
- g->gen_valid = 1;
-
- return 0;
-}
-
-int bch2_alloc_read(struct bch_fs *c)
-{
- struct btree_trans trans;
int ret;
bch2_trans_init(&trans, c, 0, 0);
- down_read(&c->gc_lock);
- ret = bch2_btree_and_journal_walk(&trans, BTREE_ID_alloc, bch2_alloc_read_fn);
- up_read(&c->gc_lock);
- bch2_trans_exit(&trans);
- if (ret) {
- bch_err(c, "error reading alloc info: %i", ret);
- return ret;
- }
- return 0;
-}
+ for_each_btree_key(&trans, iter, BTREE_ID_alloc, POS_MIN,
+ BTREE_ITER_PREFETCH, k, ret) {
+ ca = bch_dev_bkey_exists(c, k.k->p.inode);
+ g = __bucket(ca, k.k->p.offset, gc);
+ u = bch2_alloc_unpack(k);
+
+ if (!gc)
+ *bucket_gen(ca, k.k->p.offset) = u.gen;
+
+ g->_mark.gen = u.gen;
+ g->io_time[READ] = u.read_time;
+ g->io_time[WRITE] = u.write_time;
+ g->oldest_gen = !gc ? u.oldest_gen : u.gen;
+ g->gen_valid = 1;
+
+ if (!gc ||
+ (metadata_only &&
+ (u.data_type == BCH_DATA_user ||
+ u.data_type == BCH_DATA_cached ||
+ u.data_type == BCH_DATA_parity))) {
+ g->_mark.data_type = u.data_type;
+ g->_mark.dirty_sectors = u.dirty_sectors;
+ g->_mark.cached_sectors = u.cached_sectors;
+ g->_mark.stripe = u.stripe != 0;
+ g->stripe = u.stripe;
+ g->stripe_redundancy = u.stripe_redundancy;
+ }
-static int bch2_alloc_write_key(struct btree_trans *trans,
- struct btree_iter *iter,
- unsigned flags)
-{
- struct bch_fs *c = trans->c;
- struct bkey_s_c k;
- struct bkey_alloc_unpacked old_u, new_u;
- int ret;
-retry:
- bch2_trans_begin(trans);
+ }
+ bch2_trans_iter_exit(&trans, &iter);
- ret = bch2_btree_key_cache_flush(trans,
- BTREE_ID_alloc, iter->pos);
- if (ret)
- goto err;
+ bch2_trans_exit(&trans);
- k = bch2_btree_iter_peek_slot(iter);
- ret = bkey_err(k);
if (ret)
- goto err;
-
- old_u = bch2_alloc_unpack(k);
- new_u = alloc_mem_to_key(c, iter);
-
- if (!bkey_alloc_unpacked_cmp(old_u, new_u))
- return 0;
-
- ret = bch2_alloc_write(trans, iter, &new_u,
- BTREE_TRIGGER_NORUN) ?:
- bch2_trans_commit(trans, NULL, NULL,
- BTREE_INSERT_NOFAIL|flags);
-err:
- if (ret == -EINTR)
- goto retry;
- return ret;
-}
-
-int bch2_alloc_write_all(struct bch_fs *c, unsigned flags)
-{
- struct btree_trans trans;
- struct btree_iter iter;
- struct bch_dev *ca;
- unsigned i;
- int ret = 0;
-
- bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
- bch2_trans_iter_init(&trans, &iter, BTREE_ID_alloc, POS_MIN,
- BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
-
- for_each_member_device(ca, c, i) {
- bch2_btree_iter_set_pos(&iter,
- POS(ca->dev_idx, ca->mi.first_bucket));
+ bch_err(c, "error reading alloc info: %i", ret);
- while (iter.pos.offset < ca->mi.nbuckets) {
- ret = bch2_alloc_write_key(&trans, &iter, flags);
- if (ret) {
- percpu_ref_put(&ca->ref);
- goto err;
- }
- bch2_btree_iter_advance(&iter);
- }
- }
-err:
- bch2_trans_iter_exit(&trans, &iter);
- bch2_trans_exit(&trans);
return ret;
}
@@ -463,19 +390,20 @@ int bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev,
{
struct bch_fs *c = trans->c;
struct btree_iter iter;
+ struct bkey_s_c k;
struct bkey_alloc_unpacked u;
u64 *time, now;
int ret = 0;
bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, POS(dev, bucket_nr),
BTREE_ITER_CACHED|
- BTREE_ITER_CACHED_NOFILL|
BTREE_ITER_INTENT);
- ret = bch2_btree_iter_traverse(&iter);
+ k = bch2_btree_iter_peek_slot(&iter);
+ ret = bkey_err(k);
if (ret)
goto out;
- u = alloc_mem_to_key(c, &iter);
+ u = bch2_alloc_unpack(k);
time = rw == READ ? &u.read_time : &u.write_time;
now = atomic64_read(&c->io_clock[rw].now);
@@ -664,20 +592,20 @@ static int bucket_invalidate_btree(struct btree_trans *trans,
{
struct bch_fs *c = trans->c;
struct btree_iter iter;
+ struct bkey_s_c k;
int ret;
bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc,
POS(ca->dev_idx, b),
BTREE_ITER_CACHED|
- BTREE_ITER_CACHED_NOFILL|
BTREE_ITER_INTENT);
- ret = bch2_btree_iter_traverse(&iter);
+ k = bch2_btree_iter_peek_slot(&iter);
+ ret = bkey_err(k);
if (ret)
goto err;
- *u = alloc_mem_to_key(c, &iter);
-
+ *u = bch2_alloc_unpack(k);
u->gen++;
u->data_type = 0;
u->dirty_sectors = 0;
@@ -859,8 +787,7 @@ static void discard_one_bucket(struct bch_fs *c, struct bch_dev *ca, u64 b)
static bool allocator_thread_running(struct bch_dev *ca)
{
unsigned state = ca->mi.state == BCH_MEMBER_STATE_rw &&
- test_bit(BCH_FS_ALLOCATOR_RUNNING, &ca->fs->flags) &&
- test_bit(BCH_FS_ALLOC_REPLAY_DONE, &ca->fs->flags)
+ test_bit(BCH_FS_ALLOCATOR_RUNNING, &ca->fs->flags)
? ALLOCATOR_running
: ALLOCATOR_stopped;
alloc_thread_set_state(ca, state);
diff --git a/libbcachefs/alloc_background.h b/libbcachefs/alloc_background.h
index 86b64177..98c7866e 100644
--- a/libbcachefs/alloc_background.h
+++ b/libbcachefs/alloc_background.h
@@ -38,40 +38,23 @@ static inline bool bkey_alloc_unpacked_cmp(struct bkey_alloc_unpacked l,
;
}
+struct bkey_alloc_buf {
+ struct bkey_i k;
+ struct bch_alloc_v3 v;
+
+#define x(_name, _bits) + _bits / 8
+ u8 _pad[0 + BCH_ALLOC_FIELDS_V2()];
+#undef x
+} __attribute__((packed, aligned(8)));
+
struct bkey_alloc_unpacked bch2_alloc_unpack(struct bkey_s_c);
+struct bkey_alloc_buf *bch2_alloc_pack(struct btree_trans *,
+ const struct bkey_alloc_unpacked);
int bch2_alloc_write(struct btree_trans *, struct btree_iter *,
struct bkey_alloc_unpacked *, unsigned);
int bch2_bucket_io_time_reset(struct btree_trans *, unsigned, size_t, int);
-static inline struct bkey_alloc_unpacked
-alloc_mem_to_key(struct bch_fs *c, struct btree_iter *iter)
-{
- struct bch_dev *ca;
- struct bucket *g;
- struct bkey_alloc_unpacked ret;
-
- percpu_down_read(&c->mark_lock);
- ca = bch_dev_bkey_exists(c, iter->pos.inode);
- g = bucket(ca, iter->pos.offset);
- ret = (struct bkey_alloc_unpacked) {
- .dev = iter->pos.inode,
- .bucket = iter->pos.offset,
- .gen = g->mark.gen,
- .oldest_gen = g->oldest_gen,
- .data_type = g->mark.data_type,
- .dirty_sectors = g->mark.dirty_sectors,
- .cached_sectors = g->mark.cached_sectors,
- .read_time = g->io_time[READ],
- .write_time = g->io_time[WRITE],
- .stripe = g->stripe,
- .stripe_redundancy = g->stripe_redundancy,
- };
- percpu_up_read(&c->mark_lock);
-
- return ret;
-}
-
#define ALLOC_SCAN_BATCH(ca) max_t(size_t, 1, (ca)->mi.nbuckets >> 9)
const char *bch2_alloc_v1_invalid(const struct bch_fs *, struct bkey_s_c);
@@ -101,7 +84,7 @@ static inline bool bkey_is_alloc(const struct bkey *k)
k->type == KEY_TYPE_alloc_v3;
}
-int bch2_alloc_read(struct bch_fs *);
+int bch2_alloc_read(struct bch_fs *, bool, bool);
static inline void bch2_wake_allocator(struct bch_dev *ca)
{
@@ -139,7 +122,6 @@ void bch2_dev_allocator_quiesce(struct bch_fs *, struct bch_dev *);
void bch2_dev_allocator_stop(struct bch_dev *);
int bch2_dev_allocator_start(struct bch_dev *);
-int bch2_alloc_write_all(struct bch_fs *, unsigned);
void bch2_fs_allocator_background_init(struct bch_fs *);
#endif /* _BCACHEFS_ALLOC_BACKGROUND_H */
diff --git a/libbcachefs/bcachefs.h b/libbcachefs/bcachefs.h
index 696c7c93..ddd700c3 100644
--- a/libbcachefs/bcachefs.h
+++ b/libbcachefs/bcachefs.h
@@ -510,8 +510,6 @@ enum {
BCH_FS_INITIAL_GC_DONE,
BCH_FS_INITIAL_GC_UNFIXED,
BCH_FS_TOPOLOGY_REPAIR_DONE,
- BCH_FS_ALLOC_REPLAY_DONE,
- BCH_FS_BTREE_INTERIOR_REPLAY_DONE,
BCH_FS_FSCK_DONE,
BCH_FS_STARTED,
BCH_FS_RW,
@@ -531,7 +529,6 @@ enum {
/* misc: */
BCH_FS_NEED_ANOTHER_GC,
BCH_FS_DELETED_NODES,
- BCH_FS_NEED_ALLOC_WRITE,
BCH_FS_REBUILD_REPLICAS,
BCH_FS_HOLD_BTREE_WRITES,
};
@@ -860,7 +857,6 @@ struct bch_fs {
u64 reflink_hint;
reflink_gc_table reflink_gc_table;
size_t reflink_gc_nr;
- size_t reflink_gc_idx;
/* VFS IO PATH - fs-io.c */
struct bio_set writepage_bioset;
diff --git a/libbcachefs/bcachefs_format.h b/libbcachefs/bcachefs_format.h
index 8ec718cd..5153f0e4 100644
--- a/libbcachefs/bcachefs_format.h
+++ b/libbcachefs/bcachefs_format.h
@@ -1427,6 +1427,7 @@ LE64_BITMASK(BCH_SB_INODES_USE_KEY_CACHE,struct bch_sb, flags[3], 29, 30);
LE64_BITMASK(BCH_SB_JOURNAL_FLUSH_DELAY,struct bch_sb, flags[3], 30, 62);
LE64_BITMASK(BCH_SB_JOURNAL_FLUSH_DISABLED,struct bch_sb, flags[3], 62, 63);
LE64_BITMASK(BCH_SB_JOURNAL_RECLAIM_DELAY,struct bch_sb, flags[4], 0, 32);
+LE64_BITMASK(BCH_SB_JOURNAL_TRANSACTION_NAMES,struct bch_sb, flags[4], 32, 33);
/*
* Features:
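
For context on the hunk above: LE64_BITMASK(BCH_SB_JOURNAL_TRANSACTION_NAMES, ...) carves a single bit, bits 32..33 of flags[4], out of the superblock and generates get/set accessors for it; the opts.h hunk further down then names it as the backing store for the journal_transaction_names mount option. A rough sketch of the bit-range packing such a macro performs, ignoring the little-endian conversion the real macro also handles, and valid for field widths below 64 bits:

#include <stdint.h>

/* Read bits [lo, hi) of a 64-bit flags word: */
static inline uint64_t bits_get(uint64_t flags, unsigned lo, unsigned hi)
{
	return (flags >> lo) & (((uint64_t) 1 << (hi - lo)) - 1);
}

/* Write x into bits [lo, hi), preserving everything else: */
static inline uint64_t bits_set(uint64_t flags, unsigned lo, unsigned hi,
				uint64_t x)
{
	uint64_t mask = ((((uint64_t) 1 << (hi - lo)) - 1)) << lo;

	return (flags & ~mask) | ((x << lo) & mask);
}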
diff --git a/libbcachefs/btree_gc.c b/libbcachefs/btree_gc.c
index 9e3213b9..101cef7e 100644
--- a/libbcachefs/btree_gc.c
+++ b/libbcachefs/btree_gc.c
@@ -9,6 +9,7 @@
#include "alloc_foreground.h"
#include "bkey_methods.h"
#include "bkey_buf.h"
+#include "btree_key_cache.h"
#include "btree_locking.h"
#include "btree_update_interior.h"
#include "btree_io.h"
@@ -505,7 +506,6 @@ static int bch2_check_fix_ptrs(struct bch_fs *c, enum btree_id btree_id,
bkey_for_each_ptr_decode(k->k, ptrs, p, entry) {
struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev);
struct bucket *g = PTR_GC_BUCKET(ca, &p.ptr);
- struct bucket *g2 = PTR_BUCKET(ca, &p.ptr);
enum bch_data_type data_type = bch2_bkey_ptr_data_type(*k, &entry->ptr);
if (fsck_err_on(!g->gen_valid, c,
@@ -516,9 +516,8 @@ static int bch2_check_fix_ptrs(struct bch_fs *c, enum btree_id btree_id,
p.ptr.gen,
(bch2_bkey_val_to_text(&PBUF(buf), c, *k), buf))) {
if (!p.ptr.cached) {
- g2->_mark.gen = g->_mark.gen = p.ptr.gen;
- g2->gen_valid = g->gen_valid = true;
- set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags);
+ g->_mark.gen = p.ptr.gen;
+ g->gen_valid = true;
} else {
do_update = true;
}
@@ -532,9 +531,8 @@ static int bch2_check_fix_ptrs(struct bch_fs *c, enum btree_id btree_id,
bch2_data_types[ptr_data_type(k->k, &p.ptr)],
p.ptr.gen, g->mark.gen,
(bch2_bkey_val_to_text(&PBUF(buf), c, *k), buf))) {
- g2->_mark.data_type = g->_mark.data_type = data_type;
- g2->gen_valid = g->gen_valid = true;
- set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags);
+ g->_mark.data_type = data_type;
+ g->gen_valid = true;
}
if (fsck_err_on(gen_cmp(p.ptr.gen, g->mark.gen) > 0, c,
@@ -545,13 +543,12 @@ static int bch2_check_fix_ptrs(struct bch_fs *c, enum btree_id btree_id,
p.ptr.gen, g->mark.gen,
(bch2_bkey_val_to_text(&PBUF(buf), c, *k), buf))) {
if (!p.ptr.cached) {
- g2->_mark.gen = g->_mark.gen = p.ptr.gen;
- g2->gen_valid = g->gen_valid = true;
- g2->_mark.data_type = 0;
- g2->_mark.dirty_sectors = 0;
- g2->_mark.cached_sectors = 0;
+ g->_mark.gen = p.ptr.gen;
+ g->gen_valid = true;
+ g->_mark.data_type = 0;
+ g->_mark.dirty_sectors = 0;
+ g->_mark.cached_sectors = 0;
set_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags);
- set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags);
} else {
do_update = true;
}
@@ -588,9 +585,8 @@ static int bch2_check_fix_ptrs(struct bch_fs *c, enum btree_id btree_id,
bch2_data_types[data_type],
(bch2_bkey_val_to_text(&PBUF(buf), c, *k), buf))) {
if (data_type == BCH_DATA_btree) {
- g2->_mark.data_type = g->_mark.data_type = data_type;
- g2->gen_valid = g->gen_valid = true;
- set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags);
+ g->_mark.data_type = data_type;
+ g->gen_valid = true;
} else {
do_update = true;
}
@@ -691,10 +687,16 @@ found:
}
ret = bch2_journal_key_insert_take(c, btree_id, level, new);
+
if (ret)
kfree(new);
- else
+ else {
+ bch2_bkey_val_to_text(&PBUF(buf), c, *k);
+ bch_info(c, "updated %s", buf);
+ bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(new));
+ bch_info(c, "new key %s", buf);
*k = bkey_i_to_s_c(new);
+ }
}
fsck_err:
return ret;
@@ -1145,13 +1147,14 @@ static int bch2_gc_done(struct bch_fs *c,
unsigned i, dev;
int ret = 0;
+ percpu_down_write(&c->mark_lock);
+
#define copy_field(_f, _msg, ...) \
if (dst->_f != src->_f) { \
if (verify) \
fsck_err(c, _msg ": got %llu, should be %llu" \
, ##__VA_ARGS__, dst->_f, src->_f); \
dst->_f = src->_f; \
- set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags); \
}
#define copy_stripe_field(_f, _msg, ...) \
if (dst->_f != src->_f) { \
@@ -1161,18 +1164,6 @@ static int bch2_gc_done(struct bch_fs *c,
iter.pos, ##__VA_ARGS__, \
dst->_f, src->_f); \
dst->_f = src->_f; \
- set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags); \
- }
-#define copy_bucket_field(_f) \
- if (dst->b[b]._f != src->b[b]._f) { \
- if (verify) \
- fsck_err(c, "bucket %u:%zu gen %u data type %s has wrong " #_f \
- ": got %u, should be %u", dev, b, \
- dst->b[b].mark.gen, \
- bch2_data_types[dst->b[b].mark.data_type],\
- dst->b[b]._f, src->b[b]._f); \
- dst->b[b]._f = src->b[b]._f; \
- set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags); \
}
#define copy_dev_field(_f, _msg, ...) \
copy_field(_f, "dev %u has wrong " _msg, dev, ##__VA_ARGS__)
@@ -1183,36 +1174,18 @@ static int bch2_gc_done(struct bch_fs *c,
bch2_fs_usage_acc_to_base(c, i);
for_each_member_device(ca, c, dev) {
- struct bucket_array *dst = __bucket_array(ca, 0);
- struct bucket_array *src = __bucket_array(ca, 1);
- size_t b;
-
- for (b = 0; b < src->nbuckets; b++) {
- copy_bucket_field(_mark.gen);
- copy_bucket_field(_mark.data_type);
- copy_bucket_field(_mark.stripe);
- copy_bucket_field(_mark.dirty_sectors);
- copy_bucket_field(_mark.cached_sectors);
- copy_bucket_field(stripe_redundancy);
- copy_bucket_field(stripe);
-
- dst->b[b].oldest_gen = src->b[b].oldest_gen;
- }
-
- {
- struct bch_dev_usage *dst = ca->usage_base;
- struct bch_dev_usage *src = (void *)
- bch2_acc_percpu_u64s((void *) ca->usage_gc,
- dev_usage_u64s());
-
- copy_dev_field(buckets_ec, "buckets_ec");
- copy_dev_field(buckets_unavailable, "buckets_unavailable");
-
- for (i = 0; i < BCH_DATA_NR; i++) {
- copy_dev_field(d[i].buckets, "%s buckets", bch2_data_types[i]);
- copy_dev_field(d[i].sectors, "%s sectors", bch2_data_types[i]);
- copy_dev_field(d[i].fragmented, "%s fragmented", bch2_data_types[i]);
- }
+ struct bch_dev_usage *dst = ca->usage_base;
+ struct bch_dev_usage *src = (void *)
+ bch2_acc_percpu_u64s((void *) ca->usage_gc,
+ dev_usage_u64s());
+
+ copy_dev_field(buckets_ec, "buckets_ec");
+ copy_dev_field(buckets_unavailable, "buckets_unavailable");
+
+ for (i = 0; i < BCH_DATA_NR; i++) {
+ copy_dev_field(d[i].buckets, "%s buckets", bch2_data_types[i]);
+ copy_dev_field(d[i].sectors, "%s sectors", bch2_data_types[i]);
+ copy_dev_field(d[i].fragmented, "%s fragmented", bch2_data_types[i]);
}
};
@@ -1254,7 +1227,6 @@ static int bch2_gc_done(struct bch_fs *c,
#undef copy_fs_field
#undef copy_dev_field
-#undef copy_bucket_field
#undef copy_stripe_field
#undef copy_field
fsck_err:
@@ -1262,6 +1234,8 @@ fsck_err:
percpu_ref_put(&ca->ref);
if (ret)
bch_err(c, "%s: ret %i", __func__, ret);
+
+ percpu_up_write(&c->mark_lock);
return ret;
}
@@ -1284,15 +1258,6 @@ static int bch2_gc_start(struct bch_fs *c,
BUG_ON(ca->buckets[1]);
BUG_ON(ca->usage_gc);
- ca->buckets[1] = kvpmalloc(sizeof(struct bucket_array) +
- ca->mi.nbuckets * sizeof(struct bucket),
- GFP_KERNEL|__GFP_ZERO);
- if (!ca->buckets[1]) {
- percpu_ref_put(&ca->ref);
- bch_err(c, "error allocating ca->buckets[gc]");
- return -ENOMEM;
- }
-
ca->usage_gc = alloc_percpu(struct bch_dev_usage);
if (!ca->usage_gc) {
bch_err(c, "error allocating ca->usage_gc");
@@ -1301,94 +1266,165 @@ static int bch2_gc_start(struct bch_fs *c,
}
}
- percpu_down_write(&c->mark_lock);
+ return 0;
+}
+
+static int bch2_alloc_write_key(struct btree_trans *trans,
+ struct btree_iter *iter,
+ bool initial, bool metadata_only)
+{
+ struct bch_fs *c = trans->c;
+ struct bch_dev *ca = bch_dev_bkey_exists(c, iter->pos.inode);
+ struct bucket *g;
+ struct bkey_s_c k;
+ struct bkey_alloc_unpacked old_u, new_u, gc_u;
+ struct bkey_alloc_buf *a;
+ int ret;
/*
- * indicate to stripe code that we need to allocate for the gc stripes
- * radix tree, too
+ * For this to be correct at runtime, we'll need to figure out a way for
+ * it to actually lock the key in the btree key cache:
*/
- gc_pos_set(c, gc_phase(GC_PHASE_START));
- for_each_member_device(ca, c, i) {
- struct bucket_array *dst = __bucket_array(ca, 1);
- struct bucket_array *src = __bucket_array(ca, 0);
- size_t b;
-
- dst->first_bucket = src->first_bucket;
- dst->nbuckets = src->nbuckets;
+ if (!initial) {
+ ret = bch2_btree_key_cache_flush(trans,
+ BTREE_ID_alloc, iter->pos);
+ if (ret)
+ return ret;
+ }
- for (b = 0; b < src->nbuckets; b++) {
- struct bucket *d = &dst->b[b];
- struct bucket *s = &src->b[b];
+ k = bch2_btree_iter_peek_slot(iter);
+ ret = bkey_err(k);
+ if (ret)
+ return ret;
- d->_mark.gen = dst->b[b].oldest_gen = s->mark.gen;
- d->gen_valid = s->gen_valid;
+ old_u = new_u = bch2_alloc_unpack(k);
- if (metadata_only &&
- (s->mark.data_type == BCH_DATA_user ||
- s->mark.data_type == BCH_DATA_cached))
- d->_mark = s->mark;
- }
+ percpu_down_read(&c->mark_lock);
+ g = gc_bucket(ca, iter->pos.offset);
+ gc_u = (struct bkey_alloc_unpacked) {
+ .dev = iter->pos.inode,
+ .bucket = iter->pos.offset,
+ .gen = g->mark.gen,
+ .oldest_gen = g->oldest_gen,
+ .data_type = g->mark.data_type,
+ .dirty_sectors = g->mark.dirty_sectors,
+ .cached_sectors = g->mark.cached_sectors,
+ .read_time = g->io_time[READ],
+ .write_time = g->io_time[WRITE],
+ .stripe = g->stripe,
+ .stripe_redundancy = g->stripe_redundancy,
};
+ percpu_up_read(&c->mark_lock);
- percpu_up_write(&c->mark_lock);
+ if (metadata_only &&
+ gc_u.data_type != BCH_DATA_sb &&
+ gc_u.data_type != BCH_DATA_journal &&
+ gc_u.data_type != BCH_DATA_btree)
+ return 0;
- return 0;
-}
+ if (!bkey_alloc_unpacked_cmp(old_u, gc_u) ||
+ gen_after(old_u.gen, gc_u.gen))
+ return 0;
-static int bch2_gc_reflink_done_initial_fn(struct btree_trans *trans,
- struct bkey_s_c k)
-{
- struct bch_fs *c = trans->c;
- struct reflink_gc *r;
- const __le64 *refcount = bkey_refcount_c(k);
- char buf[200];
- int ret = 0;
+#define copy_bucket_field(_f) \
+ if (fsck_err_on(new_u._f != gc_u._f, c, \
+ "bucket %llu:%llu gen %u data type %s has wrong " #_f \
+ ": got %u, should be %u", \
+ iter->pos.inode, iter->pos.offset, \
+ new_u.gen, \
+ bch2_data_types[new_u.data_type], \
+ new_u._f, gc_u._f)) \
+ new_u._f = gc_u._f; \
+
+ copy_bucket_field(gen);
+ copy_bucket_field(data_type);
+ copy_bucket_field(stripe);
+ copy_bucket_field(dirty_sectors);
+ copy_bucket_field(cached_sectors);
+ copy_bucket_field(stripe_redundancy);
+ copy_bucket_field(stripe);
+#undef copy_bucket_field
+
+ new_u.oldest_gen = gc_u.oldest_gen;
- if (!refcount)
+ if (!bkey_alloc_unpacked_cmp(old_u, new_u))
return 0;
- r = genradix_ptr(&c->reflink_gc_table, c->reflink_gc_idx++);
- if (!r)
- return -ENOMEM;
+ a = bch2_alloc_pack(trans, new_u);
+ if (IS_ERR(a))
+ return PTR_ERR(a);
- if (!r ||
- r->offset != k.k->p.offset ||
- r->size != k.k->size) {
- bch_err(c, "unexpected inconsistency walking reflink table at gc finish");
- return -EINVAL;
- }
+ ret = initial
+ ? bch2_journal_key_insert(c, BTREE_ID_alloc, 0, &a->k)
+ : bch2_trans_update(trans, iter, &a->k, BTREE_TRIGGER_NORUN);
+fsck_err:
+ return ret;
+}
- if (fsck_err_on(r->refcount != le64_to_cpu(*refcount), c,
- "reflink key has wrong refcount:\n"
- " %s\n"
- " should be %u",
- (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf),
- r->refcount)) {
- struct bkey_i *new;
+static int bch2_gc_alloc_done(struct bch_fs *c, bool initial, bool metadata_only)
+{
+ struct btree_trans trans;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ struct bch_dev *ca;
+ unsigned i;
+ int ret = 0;
- new = kmalloc(bkey_bytes(k.k), GFP_KERNEL);
- if (!new) {
- ret = -ENOMEM;
- goto fsck_err;
- }
+ bch2_trans_init(&trans, c, 0, 0);
- bkey_reassemble(new, k);
+ for_each_member_device(ca, c, i) {
+ for_each_btree_key(&trans, iter, BTREE_ID_alloc,
+ POS(ca->dev_idx, ca->mi.first_bucket),
+ BTREE_ITER_SLOTS|
+ BTREE_ITER_PREFETCH, k, ret) {
+ if (bkey_cmp(iter.pos, POS(ca->dev_idx, ca->mi.nbuckets)) >= 0)
+ break;
- if (!r->refcount) {
- new->k.type = KEY_TYPE_deleted;
- new->k.size = 0;
- } else {
- *bkey_refcount(new) = cpu_to_le64(r->refcount);
+ ret = __bch2_trans_do(&trans, NULL, NULL,
+ BTREE_INSERT_LAZY_RW,
+ bch2_alloc_write_key(&trans, &iter,
+ initial, metadata_only));
+ if (ret)
+ break;
}
+ bch2_trans_iter_exit(&trans, &iter);
- ret = bch2_journal_key_insert(c, BTREE_ID_reflink, 0, new);
- kfree(new);
+ if (ret) {
+ bch_err(c, "error writing alloc info: %i", ret);
+ percpu_ref_put(&ca->ref);
+ break;
+ }
}
-fsck_err:
+
+ bch2_trans_exit(&trans);
return ret;
}
+static int bch2_gc_alloc_start(struct bch_fs *c, bool initial, bool metadata_only)
+{
+ struct bch_dev *ca;
+ unsigned i;
+
+ for_each_member_device(ca, c, i) {
+ struct bucket_array *buckets = kvpmalloc(sizeof(struct bucket_array) +
+ ca->mi.nbuckets * sizeof(struct bucket),
+ GFP_KERNEL|__GFP_ZERO);
+ if (!buckets) {
+ percpu_ref_put(&ca->ref);
+ percpu_up_write(&c->mark_lock);
+ bch_err(c, "error allocating ca->buckets[gc]");
+ return -ENOMEM;
+ }
+
+ buckets->first_bucket = ca->mi.first_bucket;
+ buckets->nbuckets = ca->mi.nbuckets;
+ rcu_assign_pointer(ca->buckets[1], buckets);
+ };
+
+ return bch2_alloc_read(c, true, metadata_only);
+}
+
static int bch2_gc_reflink_done(struct bch_fs *c, bool initial,
bool metadata_only)
{
@@ -1405,14 +1441,6 @@ static int bch2_gc_reflink_done(struct bch_fs *c, bool initial,
bch2_trans_init(&trans, c, 0, 0);
- if (initial) {
- c->reflink_gc_idx = 0;
-
- ret = bch2_btree_and_journal_walk(&trans, BTREE_ID_reflink,
- bch2_gc_reflink_done_initial_fn);
- goto out;
- }
-
for_each_btree_key(&trans, iter, BTREE_ID_reflink, POS_MIN,
BTREE_ITER_PREFETCH, k, ret) {
const __le64 *refcount = bkey_refcount_c(k);
@@ -1420,7 +1448,7 @@ static int bch2_gc_reflink_done(struct bch_fs *c, bool initial,
if (!refcount)
continue;
- r = genradix_ptr(&c->reflink_gc_table, idx);
+ r = genradix_ptr(&c->reflink_gc_table, idx++);
if (!r ||
r->offset != k.k->p.offset ||
r->size != k.k->size) {
@@ -1450,7 +1478,9 @@ static int bch2_gc_reflink_done(struct bch_fs *c, bool initial,
else
*bkey_refcount(new) = cpu_to_le64(r->refcount);
- ret = __bch2_trans_do(&trans, NULL, NULL, 0,
+ ret = initial
+ ? bch2_journal_key_insert(c, BTREE_ID_stripes, 0, new)
+ : __bch2_trans_do(&trans, NULL, NULL, 0,
__bch2_btree_insert(&trans, BTREE_ID_reflink, new));
kfree(new);
@@ -1460,57 +1490,47 @@ static int bch2_gc_reflink_done(struct bch_fs *c, bool initial,
}
fsck_err:
bch2_trans_iter_exit(&trans, &iter);
-out:
c->reflink_gc_nr = 0;
bch2_trans_exit(&trans);
return ret;
}
-static int bch2_gc_stripes_done_initial_fn(struct btree_trans *trans,
- struct bkey_s_c k)
+static int bch2_gc_reflink_start(struct bch_fs *c, bool initial,
+ bool metadata_only)
{
- struct bch_fs *c = trans->c;
- struct gc_stripe *m;
- const struct bch_stripe *s;
- char buf[200];
- unsigned i;
+ struct btree_trans trans;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ struct reflink_gc *r;
int ret = 0;
- if (k.k->type != KEY_TYPE_stripe)
+ if (metadata_only)
return 0;
- s = bkey_s_c_to_stripe(k).v;
+ bch2_trans_init(&trans, c, 0, 0);
+ c->reflink_gc_nr = 0;
- m = genradix_ptr(&c->gc_stripes, k.k->p.offset);
+ for_each_btree_key(&trans, iter, BTREE_ID_reflink, POS_MIN,
+ BTREE_ITER_PREFETCH, k, ret) {
+ const __le64 *refcount = bkey_refcount_c(k);
- for (i = 0; i < s->nr_blocks; i++)
- if (stripe_blockcount_get(s, i) != (m ? m->block_sectors[i] : 0))
- goto inconsistent;
- return 0;
-inconsistent:
- if (fsck_err_on(true, c,
- "stripe has wrong block sector count %u:\n"
- " %s\n"
- " should be %u", i,
- (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf),
- m ? m->block_sectors[i] : 0)) {
- struct bkey_i_stripe *new;
+ if (!refcount)
+ continue;
- new = kmalloc(bkey_bytes(k.k), GFP_KERNEL);
- if (!new) {
+ r = genradix_ptr_alloc(&c->reflink_gc_table, c->reflink_gc_nr++,
+ GFP_KERNEL);
+ if (!r) {
ret = -ENOMEM;
- goto fsck_err;
+ break;
}
- bkey_reassemble(&new->k_i, k);
-
- for (i = 0; i < new->v.nr_blocks; i++)
- stripe_blockcount_set(&new->v, i, m ? m->block_sectors[i] : 0);
-
- ret = bch2_journal_key_insert(c, BTREE_ID_stripes, 0, &new->k_i);
- kfree(new);
+ r->offset = k.k->p.offset;
+ r->size = k.k->size;
+ r->refcount = 0;
}
-fsck_err:
+ bch2_trans_iter_exit(&trans, &iter);
+
+ bch2_trans_exit(&trans);
return ret;
}
@@ -1518,6 +1538,12 @@ static int bch2_gc_stripes_done(struct bch_fs *c, bool initial,
bool metadata_only)
{
struct btree_trans trans;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ struct gc_stripe *m;
+ const struct bch_stripe *s;
+ char buf[200];
+ unsigned i;
int ret = 0;
if (metadata_only)
@@ -1525,80 +1551,48 @@ static int bch2_gc_stripes_done(struct bch_fs *c, bool initial,
bch2_trans_init(&trans, c, 0, 0);
- if (initial) {
- ret = bch2_btree_and_journal_walk(&trans, BTREE_ID_stripes,
- bch2_gc_stripes_done_initial_fn);
- } else {
- BUG();
- }
-
- bch2_trans_exit(&trans);
- return ret;
-}
-
-static int bch2_gc_reflink_start_initial_fn(struct btree_trans *trans,
- struct bkey_s_c k)
-{
-
- struct bch_fs *c = trans->c;
- struct reflink_gc *r;
- const __le64 *refcount = bkey_refcount_c(k);
-
- if (!refcount)
- return 0;
-
- r = genradix_ptr_alloc(&c->reflink_gc_table, c->reflink_gc_nr++,
- GFP_KERNEL);
- if (!r)
- return -ENOMEM;
-
- r->offset = k.k->p.offset;
- r->size = k.k->size;
- r->refcount = 0;
- return 0;
-}
-
-static int bch2_gc_reflink_start(struct bch_fs *c, bool initial,
- bool metadata_only)
-{
- struct btree_trans trans;
- struct btree_iter iter;
- struct bkey_s_c k;
- struct reflink_gc *r;
- int ret = 0;
+ for_each_btree_key(&trans, iter, BTREE_ID_stripes, POS_MIN,
+ BTREE_ITER_PREFETCH, k, ret) {
+ if (k.k->type != KEY_TYPE_stripe)
+ continue;
- if (metadata_only)
- return 0;
+ s = bkey_s_c_to_stripe(k).v;
+ m = genradix_ptr(&c->gc_stripes, k.k->p.offset);
- bch2_trans_init(&trans, c, 0, 0);
- c->reflink_gc_nr = 0;
+ for (i = 0; i < s->nr_blocks; i++)
+ if (stripe_blockcount_get(s, i) != (m ? m->block_sectors[i] : 0))
+ goto inconsistent;
+ continue;
+inconsistent:
+ if (fsck_err_on(true, c,
+ "stripe has wrong block sector count %u:\n"
+ " %s\n"
+ " should be %u", i,
+ (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf),
+ m ? m->block_sectors[i] : 0)) {
+ struct bkey_i_stripe *new;
- if (initial) {
- ret = bch2_btree_and_journal_walk(&trans, BTREE_ID_reflink,
- bch2_gc_reflink_start_initial_fn);
- goto out;
- }
+ new = kmalloc(bkey_bytes(k.k), GFP_KERNEL);
+ if (!new) {
+ ret = -ENOMEM;
+ break;
+ }
- for_each_btree_key(&trans, iter, BTREE_ID_reflink, POS_MIN,
- BTREE_ITER_PREFETCH, k, ret) {
- const __le64 *refcount = bkey_refcount_c(k);
+ bkey_reassemble(&new->k_i, k);
- if (!refcount)
- continue;
+ for (i = 0; i < new->v.nr_blocks; i++)
+ stripe_blockcount_set(&new->v, i, m ? m->block_sectors[i] : 0);
- r = genradix_ptr_alloc(&c->reflink_gc_table, c->reflink_gc_nr++,
- GFP_KERNEL);
- if (!r) {
- ret = -ENOMEM;
- break;
+ ret = initial
+ ? bch2_journal_key_insert(c, BTREE_ID_stripes, 0, &new->k_i)
+ : __bch2_trans_do(&trans, NULL, NULL, 0,
+ __bch2_btree_insert(&trans, BTREE_ID_reflink, &new->k_i));
+ kfree(new);
}
-
- r->offset = k.k->p.offset;
- r->size = k.k->size;
- r->refcount = 0;
}
+fsck_err:
bch2_trans_iter_exit(&trans, &iter);
-out:
+
bch2_trans_exit(&trans);
return ret;
}
@@ -1638,10 +1632,13 @@ int bch2_gc(struct bch_fs *c, bool initial, bool metadata_only)
!bch2_btree_interior_updates_nr_pending(c));
again:
ret = bch2_gc_start(c, metadata_only) ?:
+ bch2_gc_alloc_start(c, initial, metadata_only) ?:
bch2_gc_reflink_start(c, initial, metadata_only);
if (ret)
goto out;
+ gc_pos_set(c, gc_phase(GC_PHASE_START));
+
bch2_mark_superblocks(c);
if (BCH_SB_HAS_TOPOLOGY_ERRORS(c->disk_sb.sb) &&
@@ -1702,16 +1699,15 @@ out:
if (!ret) {
bch2_journal_block(&c->journal);
- percpu_down_write(&c->mark_lock);
- ret = bch2_gc_reflink_done(c, initial, metadata_only) ?:
- bch2_gc_stripes_done(c, initial, metadata_only) ?:
+ ret = bch2_gc_stripes_done(c, initial, metadata_only) ?:
+ bch2_gc_reflink_done(c, initial, metadata_only) ?:
+ bch2_gc_alloc_done(c, initial, metadata_only) ?:
bch2_gc_done(c, initial, metadata_only);
bch2_journal_unblock(&c->journal);
- } else {
- percpu_down_write(&c->mark_lock);
}
+ percpu_down_write(&c->mark_lock);
/* Indicates that gc is no longer in progress: */
__gc_pos_set(c, gc_phase(GC_PHASE_NOT_RUNNING));
diff --git a/libbcachefs/btree_iter.c b/libbcachefs/btree_iter.c
index 9ebb81d7..e8e0adac 100644
--- a/libbcachefs/btree_iter.c
+++ b/libbcachefs/btree_iter.c
@@ -12,6 +12,7 @@
#include "error.h"
#include "extents.h"
#include "journal.h"
+#include "recovery.h"
#include "replicas.h"
#include "subvolume.h"
@@ -1077,6 +1078,7 @@ static inline bool btree_path_advance_to_pos(struct btree_path *path,
static void btree_path_verify_new_node(struct btree_trans *trans,
struct btree_path *path, struct btree *b)
{
+ struct bch_fs *c = trans->c;
struct btree_path_level *l;
unsigned plevel;
bool parent_locked;
@@ -1085,6 +1087,9 @@ static void btree_path_verify_new_node(struct btree_trans *trans,
if (!IS_ENABLED(CONFIG_BCACHEFS_DEBUG))
return;
+ if (!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags))
+ return;
+
plevel = b->c.level + 1;
if (!btree_path_node(path, plevel))
return;
@@ -1105,7 +1110,7 @@ static void btree_path_verify_new_node(struct btree_trans *trans,
char buf4[100];
struct bkey uk = bkey_unpack_key(b, k);
- bch2_dump_btree_node(trans->c, l->b);
+ bch2_dump_btree_node(c, l->b);
bch2_bpos_to_text(&PBUF(buf1), path->pos);
bch2_bkey_to_text(&PBUF(buf2), &uk);
bch2_bpos_to_text(&PBUF(buf3), b->data->min_key);
@@ -1296,6 +1301,41 @@ static int btree_path_prefetch(struct btree_trans *trans, struct btree_path *pat
return ret;
}
+static int btree_path_prefetch_j(struct btree_trans *trans, struct btree_path *path,
+ struct btree_and_journal_iter *jiter)
+{
+ struct bch_fs *c = trans->c;
+ struct bkey_s_c k;
+ struct bkey_buf tmp;
+ unsigned nr = test_bit(BCH_FS_STARTED, &c->flags)
+ ? (path->level > 1 ? 0 : 2)
+ : (path->level > 1 ? 1 : 16);
+ bool was_locked = btree_node_locked(path, path->level);
+ int ret = 0;
+
+ bch2_bkey_buf_init(&tmp);
+
+ while (nr && !ret) {
+ if (!bch2_btree_node_relock(trans, path, path->level))
+ break;
+
+ bch2_btree_and_journal_iter_advance(jiter);
+ k = bch2_btree_and_journal_iter_peek(jiter);
+ if (!k.k)
+ break;
+
+ bch2_bkey_buf_reassemble(&tmp, c, k);
+ ret = bch2_btree_node_prefetch(c, trans, path, tmp.k, path->btree_id,
+ path->level - 1);
+ }
+
+ if (!was_locked)
+ btree_node_unlock(path, path->level);
+
+ bch2_bkey_buf_exit(&tmp, c);
+ return ret;
+}
+
static noinline void btree_node_mem_ptr_set(struct btree_trans *trans,
struct btree_path *path,
unsigned plevel, struct btree *b)
@@ -1318,6 +1358,30 @@ static noinline void btree_node_mem_ptr_set(struct btree_trans *trans,
btree_node_unlock(path, plevel);
}
+static noinline int btree_node_iter_and_journal_peek(struct btree_trans *trans,
+ struct btree_path *path,
+ unsigned flags,
+ struct bkey_buf *out)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_path_level *l = path_l(path);
+ struct btree_and_journal_iter jiter;
+ struct bkey_s_c k;
+ int ret = 0;
+
+ __bch2_btree_and_journal_iter_init_node_iter(&jiter, c, l->b, l->iter, path->pos);
+
+ k = bch2_btree_and_journal_iter_peek(&jiter);
+
+ bch2_bkey_buf_reassemble(out, c, k);
+
+ if (flags & BTREE_ITER_PREFETCH)
+ ret = btree_path_prefetch_j(trans, path, &jiter);
+
+ bch2_btree_and_journal_iter_exit(&jiter);
+ return ret;
+}
+
static __always_inline int btree_path_down(struct btree_trans *trans,
struct btree_path *path,
unsigned flags,
@@ -1328,14 +1392,28 @@ static __always_inline int btree_path_down(struct btree_trans *trans,
struct btree *b;
unsigned level = path->level - 1;
enum six_lock_type lock_type = __btree_lock_want(path, level);
+ bool replay_done = test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags);
struct bkey_buf tmp;
int ret;
EBUG_ON(!btree_node_locked(path, path->level));
bch2_bkey_buf_init(&tmp);
- bch2_bkey_buf_unpack(&tmp, c, l->b,
- bch2_btree_node_iter_peek(&l->iter, l->b));
+
+ if (unlikely(!replay_done)) {
+ ret = btree_node_iter_and_journal_peek(trans, path, flags, &tmp);
+ if (ret)
+ goto err;
+ } else {
+ bch2_bkey_buf_unpack(&tmp, c, l->b,
+ bch2_btree_node_iter_peek(&l->iter, l->b));
+
+ if (flags & BTREE_ITER_PREFETCH) {
+ ret = btree_path_prefetch(trans, path);
+ if (ret)
+ goto err;
+ }
+ }
b = bch2_btree_node_get(trans, path, tmp.k, level, lock_type, trace_ip);
ret = PTR_ERR_OR_ZERO(b);
@@ -1345,13 +1423,10 @@ static __always_inline int btree_path_down(struct btree_trans *trans,
mark_btree_node_locked(path, level, lock_type);
btree_path_level_init(trans, path, b);
- if (tmp.k->k.type == KEY_TYPE_btree_ptr_v2 &&
+ if (likely(replay_done && tmp.k->k.type == KEY_TYPE_btree_ptr_v2) &&
unlikely(b != btree_node_mem_ptr(tmp.k)))
btree_node_mem_ptr_set(trans, path, level + 1, b);
- if (flags & BTREE_ITER_PREFETCH)
- ret = btree_path_prefetch(trans, path);
-
if (btree_node_read_locked(path, level + 1))
btree_node_unlock(path, level + 1);
path->level = level;
@@ -2107,6 +2182,59 @@ inline bool bch2_btree_iter_rewind(struct btree_iter *iter)
return ret;
}
+static inline struct bkey_i *btree_trans_peek_updates(struct btree_trans *trans,
+ enum btree_id btree_id,
+ struct bpos pos)
+{
+ struct btree_insert_entry *i;
+
+ trans_for_each_update(trans, i)
+ if ((cmp_int(btree_id, i->btree_id) ?:
+ bpos_cmp(pos, i->k->k.p)) <= 0) {
+ if (btree_id == i->btree_id)
+ return i->k;
+ break;
+ }
+
+ return NULL;
+}
+
+static noinline
+struct bkey_i *__btree_trans_peek_journal(struct btree_trans *trans,
+ struct btree_path *path)
+{
+ struct journal_keys *keys = &trans->c->journal_keys;
+ size_t idx = bch2_journal_key_search(keys, path->btree_id,
+ path->level, path->pos);
+
+ while (idx < keys->nr && keys->d[idx].overwritten)
+ idx++;
+
+ return (idx < keys->nr &&
+ keys->d[idx].btree_id == path->btree_id &&
+ keys->d[idx].level == path->level)
+ ? keys->d[idx].k
+ : NULL;
+}
+
+static noinline
+struct bkey_s_c btree_trans_peek_journal(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bkey_s_c k)
+{
+ struct bkey_i *next_journal =
+ __btree_trans_peek_journal(trans, iter->path);
+
+ if (next_journal &&
+ bpos_cmp(next_journal->k.p,
+ k.k ? k.k->p : iter->path->l[0].b->key.k.p) <= 0) {
+ iter->k = next_journal->k;
+ k = bkey_i_to_s_c(next_journal);
+ }
+
+ return k;
+}
+
/**
* bch2_btree_iter_peek: returns first key greater than or equal to iterator's
* current position
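
btree_trans_peek_journal above implements the new BTREE_ITER_WITH_JOURNAL overlay: iteration walks the btree and the sorted journal_keys array in parallel, and whenever the journal holds a non-overwritten key at or before the btree key's position, the journal version wins, since it is the newer one. The shape of that merge, reduced to plain integer positions purely for illustration:

#include <stddef.h>

/* Peek the next position from two sorted streams; the journal stream
 * shadows the btree stream on ties (illustrative only): */
static long overlay_peek(const long *btree_pos, size_t nb, size_t *bi,
			 const long *jrnl_pos, size_t nj, size_t *ji)
{
	long b = *bi < nb ? btree_pos[*bi] : -1;	/* -1: exhausted */
	long j = *ji < nj ? jrnl_pos[*ji] : -1;

	if (j >= 0 && (b < 0 || j <= b)) {
		if (j == b)
			(*bi)++;	/* btree key at same pos is shadowed */
		return jrnl_pos[(*ji)++];
	}
	return b >= 0 ? btree_pos[(*bi)++] : -1;
}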
@@ -2117,7 +2245,7 @@ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter)
struct bpos search_key = btree_iter_search_key(iter);
struct bkey_i *next_update;
struct bkey_s_c k;
- int ret, cmp;
+ int ret;
EBUG_ON(iter->path->cached || iter->path->level);
bch2_btree_iter_verify(iter);
@@ -2136,19 +2264,14 @@ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter)
goto out;
}
- next_update = iter->flags & BTREE_ITER_WITH_UPDATES
- ? btree_trans_peek_updates(trans, iter->btree_id, search_key)
- : NULL;
k = btree_path_level_peek_all(trans->c, &iter->path->l[0], &iter->k);
- /* * In the btree, deleted keys sort before non deleted: */
- if (k.k && bkey_deleted(k.k) &&
- (!next_update ||
- bpos_cmp(k.k->p, next_update->k.p) <= 0)) {
- search_key = k.k->p;
- continue;
- }
+ if (unlikely(iter->flags & BTREE_ITER_WITH_JOURNAL))
+ k = btree_trans_peek_journal(trans, iter, k);
+ next_update = iter->flags & BTREE_ITER_WITH_UPDATES
+ ? btree_trans_peek_updates(trans, iter->btree_id, search_key)
+ : NULL;
if (next_update &&
bpos_cmp(next_update->k.p,
k.k ? k.k->p : iter->path->l[0].b->key.k.p) <= 0) {
@@ -2156,6 +2279,20 @@ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter)
k = bkey_i_to_s_c(next_update);
}
+ if (k.k && bkey_deleted(k.k)) {
+ /*
+ * If we've got a whiteout, and it's after the search
+ * key, advance the search key to the whiteout instead
+ * of just after the whiteout - it might be a btree
+ * whiteout, with a real key at the same position, since
+ * in the btree deleted keys sort before non deleted.
+ */
+ search_key = bpos_cmp(search_key, k.k->p)
+ ? k.k->p
+ : bpos_successor(k.k->p);
+ continue;
+ }
+
if (likely(k.k)) {
/*
* We can never have a key in a leaf node at POS_MAX, so
@@ -2199,14 +2336,10 @@ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter)
if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS)
iter->pos.snapshot = iter->snapshot;
- cmp = bpos_cmp(k.k->p, iter->path->pos);
- if (cmp) {
- iter->path = bch2_btree_path_make_mut(trans, iter->path,
- iter->flags & BTREE_ITER_INTENT,
- btree_iter_ip_allocated(iter));
- iter->path->pos = k.k->p;
- btree_path_check_sort(trans, iter->path, cmp);
- }
+ iter->path = btree_path_set_pos(trans, iter->path, k.k->p,
+ iter->flags & BTREE_ITER_INTENT,
+ btree_iter_ip_allocated(iter));
+ BUG_ON(!iter->path->nodes_locked);
out:
iter->path->should_be_locked = true;
@@ -2247,6 +2380,10 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter)
EBUG_ON(iter->path->cached || iter->path->level);
EBUG_ON(iter->flags & BTREE_ITER_WITH_UPDATES);
+
+ if (iter->flags & BTREE_ITER_WITH_JOURNAL)
+ return bkey_s_c_err(-EIO);
+
bch2_btree_iter_verify(iter);
bch2_btree_iter_verify_entry_exit(iter);
@@ -2397,17 +2534,24 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
!(iter->flags & (BTREE_ITER_IS_EXTENTS|BTREE_ITER_FILTER_SNAPSHOTS))) {
struct bkey_i *next_update;
- next_update = iter->flags & BTREE_ITER_WITH_UPDATES
- ? btree_trans_peek_updates(trans, iter->btree_id, search_key)
- : NULL;
+ if ((iter->flags & BTREE_ITER_WITH_UPDATES) &&
+ (next_update = btree_trans_peek_updates(trans,
+ iter->btree_id, search_key)) &&
+ !bpos_cmp(next_update->k.p, iter->pos)) {
+ iter->k = next_update->k;
+ k = bkey_i_to_s_c(next_update);
+ goto out;
+ }
- if (next_update &&
+ if (unlikely(iter->flags & BTREE_ITER_WITH_JOURNAL) &&
+ (next_update = __btree_trans_peek_journal(trans, iter->path)) &&
!bpos_cmp(next_update->k.p, iter->pos)) {
iter->k = next_update->k;
k = bkey_i_to_s_c(next_update);
- } else {
- k = bch2_btree_path_peek_slot(iter->path, &iter->k);
+ goto out;
}
+
+ k = bch2_btree_path_peek_slot(iter->path, &iter->k);
} else {
struct bpos next;
@@ -2451,7 +2595,7 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
k = (struct bkey_s_c) { &iter->k, NULL };
}
}
-
+out:
iter->path->should_be_locked = true;
bch2_btree_iter_verify_entry_exit(iter);
@@ -2618,6 +2762,9 @@ static void __bch2_trans_iter_init(struct btree_trans *trans,
btree_type_has_snapshots(btree_id))
flags |= BTREE_ITER_FILTER_SNAPSHOTS;
+ if (!test_bit(JOURNAL_REPLAY_DONE, &trans->c->journal.flags))
+ flags |= BTREE_ITER_WITH_JOURNAL;
+
iter->trans = trans;
iter->path = NULL;
iter->btree_id = btree_id;
diff --git a/libbcachefs/btree_key_cache.h b/libbcachefs/btree_key_cache.h
index 0768ef3c..b3d241b1 100644
--- a/libbcachefs/btree_key_cache.h
+++ b/libbcachefs/btree_key_cache.h
@@ -16,8 +16,7 @@ static inline bool bch2_btree_key_cache_must_wait(struct bch_fs *c)
size_t nr_keys = atomic_long_read(&c->btree_key_cache.nr_keys);
size_t max_dirty = 4096 + (nr_keys * 3) / 4;
- return nr_dirty > max_dirty &&
- test_bit(JOURNAL_RECLAIM_STARTED, &c->journal.flags);
+ return nr_dirty > max_dirty;
}
int bch2_btree_key_cache_journal_flush(struct journal *,
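
With the JOURNAL_RECLAIM_STARTED check gone (see the journal_types.h hunk below), the throttle above is purely arithmetic: updates must wait once nr_dirty exceeds max_dirty = 4096 + 3/4 of the cached keys. For example, with nr_keys = 1,000,000 cached keys, writers start blocking on reclaim once more than 4096 + 750,000 = 754,096 of them are dirty.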
diff --git a/libbcachefs/btree_types.h b/libbcachefs/btree_types.h
index 08c49ae3..1ace7604 100644
--- a/libbcachefs/btree_types.h
+++ b/libbcachefs/btree_types.h
@@ -207,10 +207,11 @@ struct btree_node_iter {
#define BTREE_ITER_CACHED_NOFILL (1 << 8)
#define BTREE_ITER_CACHED_NOCREATE (1 << 9)
#define BTREE_ITER_WITH_UPDATES (1 << 10)
-#define __BTREE_ITER_ALL_SNAPSHOTS (1 << 11)
-#define BTREE_ITER_ALL_SNAPSHOTS (1 << 12)
-#define BTREE_ITER_FILTER_SNAPSHOTS (1 << 13)
-#define BTREE_ITER_NOPRESERVE (1 << 14)
+#define BTREE_ITER_WITH_JOURNAL (1 << 11)
+#define __BTREE_ITER_ALL_SNAPSHOTS (1 << 12)
+#define BTREE_ITER_ALL_SNAPSHOTS (1 << 13)
+#define BTREE_ITER_FILTER_SNAPSHOTS (1 << 14)
+#define BTREE_ITER_NOPRESERVE (1 << 15)
enum btree_path_uptodate {
BTREE_ITER_UPTODATE = 0,
diff --git a/libbcachefs/btree_update.h b/libbcachefs/btree_update.h
index 16ebf1a2..90ea018d 100644
--- a/libbcachefs/btree_update.h
+++ b/libbcachefs/btree_update.h
@@ -135,21 +135,4 @@ static inline int bch2_trans_commit(struct btree_trans *trans,
(_i) < (_trans)->updates + (_trans)->nr_updates; \
(_i)++)
-static inline struct bkey_i *btree_trans_peek_updates(struct btree_trans *trans,
- enum btree_id btree_id,
- struct bpos pos)
-{
- struct btree_insert_entry *i;
-
- trans_for_each_update(trans, i)
- if ((cmp_int(btree_id, i->btree_id) ?:
- bpos_cmp(pos, i->k->k.p)) <= 0) {
- if (btree_id == i->btree_id)
- return i->k;
- break;
- }
-
- return NULL;
-}
-
#endif /* _BCACHEFS_BTREE_UPDATE_H */
diff --git a/libbcachefs/btree_update_interior.c b/libbcachefs/btree_update_interior.c
index 6872e56b..e1a5e34e 100644
--- a/libbcachefs/btree_update_interior.c
+++ b/libbcachefs/btree_update_interior.c
@@ -16,6 +16,7 @@
#include "journal.h"
#include "journal_reclaim.h"
#include "keylist.h"
+#include "recovery.h"
#include "replicas.h"
#include "super-io.h"
@@ -44,7 +45,7 @@ static void btree_node_interior_verify(struct bch_fs *c, struct btree *b)
BUG_ON(!b->c.level);
- if (!test_bit(BCH_FS_BTREE_INTERIOR_REPLAY_DONE, &c->flags))
+ if (!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags))
return;
bch2_btree_node_iter_init_from_start(&iter, b);
@@ -1146,6 +1147,9 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as,
BUG_ON(insert->k.type == KEY_TYPE_btree_ptr_v2 &&
!btree_ptr_sectors_written(insert));
+ if (unlikely(!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags)))
+ bch2_journal_key_overwritten(c, b->c.btree_id, b->c.level, insert->k.p);
+
invalid = bch2_bkey_invalid(c, bkey_i_to_s_c(insert), btree_node_type(b)) ?:
bch2_bkey_in_btree_node(b, bkey_i_to_s_c(insert));
if (invalid) {
@@ -1847,9 +1851,6 @@ void bch2_btree_node_rewrite_async(struct bch_fs *c, struct btree *b)
{
struct async_btree_rewrite *a;
- if (!test_bit(BCH_FS_BTREE_INTERIOR_REPLAY_DONE, &c->flags))
- return;
-
if (!percpu_ref_tryget(&c->writes))
return;
diff --git a/libbcachefs/btree_update_leaf.c b/libbcachefs/btree_update_leaf.c
index 09dc585b..e2e878b8 100644
--- a/libbcachefs/btree_update_leaf.c
+++ b/libbcachefs/btree_update_leaf.c
@@ -206,9 +206,6 @@ static bool btree_insert_key_leaf(struct btree_trans *trans,
int old_live_u64s = b->nr.live_u64s;
int live_u64s_added, u64s_added;
- EBUG_ON(!insert->level &&
- !test_bit(BCH_FS_BTREE_INTERIOR_REPLAY_DONE, &c->flags));
-
if (unlikely(!bch2_btree_bset_insert_key(trans, insert->path, b,
&insert_l(insert)->iter, insert->k)))
return false;
@@ -476,13 +473,13 @@ bch2_trans_commit_write_locked(struct btree_trans *trans,
JOURNAL_RES_GET_NONBLOCK);
if (ret)
return ret;
+
+ if (unlikely(trans->journal_transaction_names))
+ journal_transaction_name(trans);
} else {
trans->journal_res.seq = c->journal.replay_journal_seq;
}
- if (unlikely(trans->journal_transaction_names))
- journal_transaction_name(trans);
-
if (unlikely(trans->extra_journal_entry_u64s)) {
memcpy_u64s_small(journal_res_entry(&c->journal, &trans->journal_res),
trans->extra_journal_entries,
diff --git a/libbcachefs/buckets.c b/libbcachefs/buckets.c
index 78d43997..c72fe777 100644
--- a/libbcachefs/buckets.c
+++ b/libbcachefs/buckets.c
@@ -1458,24 +1458,22 @@ static int bch2_trans_start_alloc_update(struct btree_trans *trans, struct btree
{
struct bch_fs *c = trans->c;
struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
- struct bpos pos = POS(ptr->dev, PTR_BUCKET_NR(ca, ptr));
- struct bkey_i *update = btree_trans_peek_updates(trans, BTREE_ID_alloc, pos);
+ struct bkey_s_c k;
int ret;
- bch2_trans_iter_init(trans, iter, BTREE_ID_alloc, pos,
+ bch2_trans_iter_init(trans, iter, BTREE_ID_alloc,
+ POS(ptr->dev, PTR_BUCKET_NR(ca, ptr)),
+ BTREE_ITER_WITH_UPDATES|
BTREE_ITER_CACHED|
- BTREE_ITER_CACHED_NOFILL|
BTREE_ITER_INTENT);
- ret = bch2_btree_iter_traverse(iter);
+ k = bch2_btree_iter_peek_slot(iter);
+ ret = bkey_err(k);
if (ret) {
bch2_trans_iter_exit(trans, iter);
return ret;
}
- *u = update && !bpos_cmp(update->k.p, pos)
- ? bch2_alloc_unpack(bkey_i_to_s_c(update))
- : alloc_mem_to_key(c, iter);
-
+ *u = bch2_alloc_unpack(k);
return 0;
}
diff --git a/libbcachefs/ec.c b/libbcachefs/ec.c
index 9a1751d4..9b45640e 100644
--- a/libbcachefs/ec.c
+++ b/libbcachefs/ec.c
@@ -1558,50 +1558,48 @@ void bch2_stripes_heap_start(struct bch_fs *c)
bch2_stripes_heap_insert(c, m, iter.pos);
}
-static int bch2_stripes_read_fn(struct btree_trans *trans, struct bkey_s_c k)
+int bch2_stripes_read(struct bch_fs *c)
{
+ struct btree_trans trans;
+ struct btree_iter iter;
+ struct bkey_s_c k;
const struct bch_stripe *s;
- struct bch_fs *c = trans->c;
struct stripe *m;
unsigned i;
- int ret = 0;
+ int ret;
- if (k.k->type != KEY_TYPE_stripe)
- return 0;
+ bch2_trans_init(&trans, c, 0, 0);
- ret = __ec_stripe_mem_alloc(c, k.k->p.offset, GFP_KERNEL);
- if (ret)
- return ret;
+ for_each_btree_key(&trans, iter, BTREE_ID_stripes, POS_MIN,
+ BTREE_ITER_PREFETCH, k, ret) {
+ if (k.k->type != KEY_TYPE_stripe)
+ continue;
- s = bkey_s_c_to_stripe(k).v;
+ ret = __ec_stripe_mem_alloc(c, k.k->p.offset, GFP_KERNEL);
+ if (ret)
+ break;
- m = genradix_ptr(&c->stripes, k.k->p.offset);
- m->alive = true;
- m->sectors = le16_to_cpu(s->sectors);
- m->algorithm = s->algorithm;
- m->nr_blocks = s->nr_blocks;
- m->nr_redundant = s->nr_redundant;
- m->blocks_nonempty = 0;
+ s = bkey_s_c_to_stripe(k).v;
- for (i = 0; i < s->nr_blocks; i++)
- m->blocks_nonempty += !!stripe_blockcount_get(s, i);
+ m = genradix_ptr(&c->stripes, k.k->p.offset);
+ m->alive = true;
+ m->sectors = le16_to_cpu(s->sectors);
+ m->algorithm = s->algorithm;
+ m->nr_blocks = s->nr_blocks;
+ m->nr_redundant = s->nr_redundant;
+ m->blocks_nonempty = 0;
- spin_lock(&c->ec_stripes_heap_lock);
- bch2_stripes_heap_update(c, m, k.k->p.offset);
- spin_unlock(&c->ec_stripes_heap_lock);
-
- return ret;
-}
+ for (i = 0; i < s->nr_blocks; i++)
+ m->blocks_nonempty += !!stripe_blockcount_get(s, i);
-int bch2_stripes_read(struct bch_fs *c)
-{
- struct btree_trans trans;
- int ret;
+ spin_lock(&c->ec_stripes_heap_lock);
+ bch2_stripes_heap_update(c, m, k.k->p.offset);
+ spin_unlock(&c->ec_stripes_heap_lock);
+ }
+ bch2_trans_iter_exit(&trans, &iter);
- bch2_trans_init(&trans, c, 0, 0);
- ret = bch2_btree_and_journal_walk(&trans, BTREE_ID_stripes,
- bch2_stripes_read_fn);
bch2_trans_exit(&trans);
+
if (ret)
bch_err(c, "error reading stripes: %i", ret);
diff --git a/libbcachefs/journal_reclaim.c b/libbcachefs/journal_reclaim.c
index ab9a6d96..52a3935c 100644
--- a/libbcachefs/journal_reclaim.c
+++ b/libbcachefs/journal_reclaim.c
@@ -489,9 +489,6 @@ static size_t journal_flush_pins(struct journal *j, u64 seq_to_flush,
u64 seq;
int err;
- if (!test_bit(JOURNAL_RECLAIM_STARTED, &j->flags))
- return 0;
-
lockdep_assert_held(&j->reclaim_lock);
while (1) {
@@ -692,8 +689,6 @@ static int bch2_journal_reclaim_thread(void *arg)
set_freezable();
- kthread_wait_freezable(test_bit(JOURNAL_RECLAIM_STARTED, &j->flags));
-
j->last_flushed = jiffies;
while (!ret && !kthread_should_stop()) {
diff --git a/libbcachefs/journal_types.h b/libbcachefs/journal_types.h
index 54cc69bd..d6d75121 100644
--- a/libbcachefs/journal_types.h
+++ b/libbcachefs/journal_types.h
@@ -148,7 +148,6 @@ enum journal_space_from {
enum {
JOURNAL_REPLAY_DONE,
JOURNAL_STARTED,
- JOURNAL_RECLAIM_STARTED,
JOURNAL_NEED_WRITE,
JOURNAL_MAY_GET_UNRESERVED,
JOURNAL_MAY_SKIP_FLUSH,
diff --git a/libbcachefs/opts.h b/libbcachefs/opts.h
index 52c0b56a..c6880654 100644
--- a/libbcachefs/opts.h
+++ b/libbcachefs/opts.h
@@ -330,9 +330,9 @@ enum opt_type {
NO_SB_OPT, false, \
NULL, "Read all journal entries, not just dirty ones")\
x(journal_transaction_names, u8, \
- OPT_FS|OPT_MOUNT|OPT_RUNTIME, \
+ OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
OPT_BOOL(), \
- NO_SB_OPT, false, \
+ BCH_SB_JOURNAL_TRANSACTION_NAMES, false, \
NULL, "Log transaction function names in journal") \
x(noexcl, u8, \
OPT_FS|OPT_MOUNT, \
diff --git a/libbcachefs/recovery.c b/libbcachefs/recovery.c
index 40e1e991..5da6b3b4 100644
--- a/libbcachefs/recovery.c
+++ b/libbcachefs/recovery.c
@@ -59,23 +59,21 @@ static void zero_out_btree_mem_ptr(struct journal_keys *keys)
static int __journal_key_cmp(enum btree_id l_btree_id,
unsigned l_level,
struct bpos l_pos,
- struct journal_key *r)
+ const struct journal_key *r)
{
return (cmp_int(l_btree_id, r->btree_id) ?:
cmp_int(l_level, r->level) ?:
bpos_cmp(l_pos, r->k->k.p));
}
-static int journal_key_cmp(struct journal_key *l, struct journal_key *r)
+static int journal_key_cmp(const struct journal_key *l, const struct journal_key *r)
{
- return (cmp_int(l->btree_id, r->btree_id) ?:
- cmp_int(l->level, r->level) ?:
- bpos_cmp(l->k->k.p, r->k->k.p));
+ return __journal_key_cmp(l->btree_id, l->level, l->k->k.p, r);
}
-static size_t journal_key_search(struct journal_keys *journal_keys,
- enum btree_id id, unsigned level,
- struct bpos pos)
+size_t bch2_journal_key_search(struct journal_keys *journal_keys,
+ enum btree_id id, unsigned level,
+ struct bpos pos)
{
size_t l = 0, r = journal_keys->nr, m;
@@ -116,11 +114,18 @@ int bch2_journal_key_insert_take(struct bch_fs *c, enum btree_id id,
.btree_id = id,
.level = level,
.k = k,
- .allocated = true
+ .allocated = true,
+ /*
+ * Ensure these keys are done last by journal replay, to unblock
+ * journal reclaim:
+ */
+ .journal_seq = U32_MAX,
};
struct journal_keys *keys = &c->journal_keys;
struct journal_iter *iter;
- unsigned idx = journal_key_search(keys, id, level, k->k.p);
+ size_t idx = bch2_journal_key_search(keys, id, level, k->k.p);
+
+ BUG_ON(test_bit(BCH_FS_RW, &c->flags));
if (idx < keys->nr &&
journal_key_cmp(&n, &keys->d[idx]) == 0) {
@@ -157,6 +162,11 @@ int bch2_journal_key_insert_take(struct bch_fs *c, enum btree_id id,
return 0;
}
+/*
+ * Can only be used from the recovery thread while we're still RO - can't be
+ * used once we've got RW, as journal_keys is at that point used by multiple
+ * threads:
+ */
int bch2_journal_key_insert(struct bch_fs *c, enum btree_id id,
unsigned level, struct bkey_i *k)
{
@@ -189,7 +199,7 @@ void bch2_journal_key_overwritten(struct bch_fs *c, enum btree_id btree,
unsigned level, struct bpos pos)
{
struct journal_keys *keys = &c->journal_keys;
- size_t idx = journal_key_search(keys, btree, level, pos);
+ size_t idx = bch2_journal_key_search(keys, btree, level, pos);
if (idx < keys->nr &&
keys->d[idx].btree_id == btree &&
@@ -200,15 +210,18 @@ void bch2_journal_key_overwritten(struct bch_fs *c, enum btree_id btree,
static struct bkey_i *bch2_journal_iter_peek(struct journal_iter *iter)
{
- struct journal_key *k = iter->idx - iter->keys->nr
- ? iter->keys->d + iter->idx : NULL;
+ struct journal_key *k = iter->keys->d + iter->idx;
- if (k &&
- k->btree_id == iter->btree_id &&
- k->level == iter->level)
- return k->k;
+ while (k < iter->keys->d + iter->keys->nr &&
+ k->btree_id == iter->btree_id &&
+ k->level == iter->level) {
+ if (!k->overwritten)
+ return k->k;
+
+ iter->idx++;
+ k = iter->keys->d + iter->idx;
+ }
- iter->idx = iter->keys->nr;
return NULL;
}
@@ -231,8 +244,7 @@ static void bch2_journal_iter_init(struct bch_fs *c,
iter->btree_id = id;
iter->level = level;
iter->keys = &c->journal_keys;
- iter->idx = journal_key_search(&c->journal_keys, id, level, pos);
- list_add(&iter->list, &c->journal_iters);
+ iter->idx = bch2_journal_key_search(&c->journal_keys, id, level, pos);
}
static struct bkey_s_c bch2_journal_iter_peek_btree(struct btree_and_journal_iter *iter)
@@ -318,106 +330,33 @@ void bch2_btree_and_journal_iter_exit(struct btree_and_journal_iter *iter)
bch2_journal_iter_exit(&iter->journal);
}
-void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *iter,
- struct bch_fs *c,
- struct btree *b)
+void __bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *iter,
+ struct bch_fs *c,
+ struct btree *b,
+ struct btree_node_iter node_iter,
+ struct bpos pos)
{
memset(iter, 0, sizeof(*iter));
iter->b = b;
- bch2_btree_node_iter_init_from_start(&iter->node_iter, iter->b);
- bch2_journal_iter_init(c, &iter->journal,
- b->c.btree_id, b->c.level, b->data->min_key);
+ iter->node_iter = node_iter;
+ bch2_journal_iter_init(c, &iter->journal, b->c.btree_id, b->c.level, pos);
+ INIT_LIST_HEAD(&iter->journal.list);
}
-/* Walk btree, overlaying keys from the journal: */
-
-static void btree_and_journal_iter_prefetch(struct bch_fs *c, struct btree *b,
- struct btree_and_journal_iter iter)
-{
- unsigned i = 0, nr = b->c.level > 1 ? 2 : 16;
- struct bkey_s_c k;
- struct bkey_buf tmp;
-
- BUG_ON(!b->c.level);
-
- bch2_bkey_buf_init(&tmp);
-
- while (i < nr &&
- (k = bch2_btree_and_journal_iter_peek(&iter)).k) {
- bch2_bkey_buf_reassemble(&tmp, c, k);
-
- bch2_btree_node_prefetch(c, NULL, NULL, tmp.k,
- b->c.btree_id, b->c.level - 1);
-
- bch2_btree_and_journal_iter_advance(&iter);
- i++;
- }
-
- bch2_bkey_buf_exit(&tmp, c);
-}
-
-static int bch2_btree_and_journal_walk_recurse(struct btree_trans *trans, struct btree *b,
- enum btree_id btree_id,
- btree_walk_key_fn key_fn)
-{
- struct bch_fs *c = trans->c;
- struct btree_and_journal_iter iter;
- struct bkey_s_c k;
- struct bkey_buf tmp;
- struct btree *child;
- int ret = 0;
-
- bch2_bkey_buf_init(&tmp);
- bch2_btree_and_journal_iter_init_node_iter(&iter, c, b);
-
- while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) {
- if (b->c.level) {
- bch2_bkey_buf_reassemble(&tmp, c, k);
-
- child = bch2_btree_node_get_noiter(c, tmp.k,
- b->c.btree_id, b->c.level - 1,
- false);
-
- ret = PTR_ERR_OR_ZERO(child);
- if (ret)
- break;
-
- btree_and_journal_iter_prefetch(c, b, iter);
-
- ret = bch2_btree_and_journal_walk_recurse(trans, child,
- btree_id, key_fn);
- six_unlock_read(&child->c.lock);
- } else {
- ret = key_fn(trans, k);
- }
-
- if (ret)
- break;
-
- bch2_btree_and_journal_iter_advance(&iter);
- }
-
- bch2_btree_and_journal_iter_exit(&iter);
- bch2_bkey_buf_exit(&tmp, c);
- return ret;
-}
-
-int bch2_btree_and_journal_walk(struct btree_trans *trans, enum btree_id btree_id,
- btree_walk_key_fn key_fn)
+/*
+ * This version is used by btree_gc before the filesystem has gone RW and
+ * multithreaded, so it uses the journal_iters list:
+ */
+void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *iter,
+ struct bch_fs *c,
+ struct btree *b)
{
- struct bch_fs *c = trans->c;
- struct btree *b = c->btree_roots[btree_id].b;
- int ret = 0;
+ struct btree_node_iter node_iter;
- if (btree_node_fake(b))
- return 0;
-
- six_lock_read(&b->c.lock, NULL, NULL);
- ret = bch2_btree_and_journal_walk_recurse(trans, b, btree_id, key_fn);
- six_unlock_read(&b->c.lock);
-
- return ret;
+ bch2_btree_node_iter_init_from_start(&node_iter, b);
+ __bch2_btree_and_journal_iter_init_node_iter(iter, c, b, node_iter, b->data->min_key);
+ list_add(&iter->journal.list, &c->journal_iters);
}
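
The split leaves two entry points: the inner __init variant takes a caller-supplied node iterator and start position and stays off any shared list, while the outer variant (btree_gc's, per the comment) registers itself on c->journal_iters. The INIT_LIST_HEAD() in the inner variant is presumably there so a common exit path can unconditionally unlink the iterator; a self-linked head makes that safe even when it was never added. A sketch of that idiom:

	#include <linux/list.h>

	struct my_iter { struct list_head list; };

	static void my_iter_init(struct my_iter *iter)
	{
		INIT_LIST_HEAD(&iter->list);	/* self-linked, on no list */
	}

	static void my_iter_exit(struct my_iter *iter)
	{
		list_del(&iter->list);	/* safe whether or not we were added */
	}
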
/* sort and dedup all keys in the journal: */
@@ -442,9 +381,7 @@ static int journal_sort_key_cmp(const void *_l, const void *_r)
const struct journal_key *l = _l;
const struct journal_key *r = _r;
- return cmp_int(l->btree_id, r->btree_id) ?:
- cmp_int(l->level, r->level) ?:
- bpos_cmp(l->k->k.p, r->k->k.p) ?:
+ return journal_key_cmp(l, r) ?:
cmp_int(l->journal_seq, r->journal_seq) ?:
cmp_int(l->journal_offset, r->journal_offset);
}
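
The rewritten comparator reuses journal_key_cmp() for the (btree_id, level, pos) prefix and only spells out the journal-order tiebreakers. It relies on the kernel's `?:` chaining idiom: each comparison returns negative/zero/positive and the first nonzero result wins, yielding a lexicographic sort. Distilled with hypothetical fields:

	/* Kernel-style three-way compare: */
	#define cmp_int(l, r)	((l > r) - (l < r))

	struct rec { int a, b, c; };

	static int rec_cmp(const struct rec *l, const struct rec *r)
	{
		return cmp_int(l->a, r->a) ?:	/* primary key */
		       cmp_int(l->b, r->b) ?:	/* first tiebreak */
		       cmp_int(l->c, r->c);	/* second tiebreak */
	}
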
@@ -537,8 +474,8 @@ static void replay_now_at(struct journal *j, u64 seq)
bch2_journal_pin_put(j, j->replay_journal_seq++);
}
-static int __bch2_journal_replay_key(struct btree_trans *trans,
- struct journal_key *k)
+static int bch2_journal_replay_key(struct btree_trans *trans,
+ struct journal_key *k)
{
struct btree_iter iter;
unsigned iter_flags =
@@ -546,34 +483,24 @@ static int __bch2_journal_replay_key(struct btree_trans *trans,
BTREE_ITER_NOT_EXTENTS;
int ret;
- /* Must be checked with btree locked: */
- if (k->overwritten)
- return 0;
-
if (!k->level && k->btree_id == BTREE_ID_alloc)
- iter_flags |= BTREE_ITER_CACHED|BTREE_ITER_CACHED_NOFILL;
+ iter_flags |= BTREE_ITER_CACHED;
bch2_trans_node_iter_init(trans, &iter, k->btree_id, k->k->k.p,
BTREE_MAX_DEPTH, k->level,
iter_flags);
- ret = bch2_btree_iter_traverse(&iter) ?:
- bch2_trans_update(trans, &iter, k->k, BTREE_TRIGGER_NORUN);
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-static int bch2_journal_replay_key(struct bch_fs *c, struct journal_key *k)
-{
- unsigned commit_flags =
- BTREE_INSERT_LAZY_RW|
- BTREE_INSERT_NOFAIL|
- BTREE_INSERT_JOURNAL_RESERVED;
+ ret = bch2_btree_iter_traverse(&iter);
+ if (ret)
+ goto out;
- if (!k->allocated)
- commit_flags |= BTREE_INSERT_JOURNAL_REPLAY;
+ /* Must be checked with btree locked: */
+ if (k->overwritten)
+ goto out;
- return bch2_trans_do(c, NULL, NULL, commit_flags,
- __bch2_journal_replay_key(&trans, k));
+ ret = bch2_trans_update(trans, &iter, k->k, BTREE_TRIGGER_NORUN);
+out:
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
}
static int journal_sort_seq_cmp(const void *_l, const void *_r)
@@ -581,10 +508,7 @@ static int journal_sort_seq_cmp(const void *_l, const void *_r)
const struct journal_key *l = *((const struct journal_key **)_l);
const struct journal_key *r = *((const struct journal_key **)_r);
- return cmp_int(r->level, l->level) ?:
- cmp_int(l->journal_seq, r->journal_seq) ?:
- cmp_int(l->btree_id, r->btree_id) ?:
- bpos_cmp(l->k->k.p, r->k->k.p);
+ return cmp_int(l->journal_seq, r->journal_seq);
}
static int bch2_journal_replay(struct bch_fs *c)
@@ -592,10 +516,7 @@ static int bch2_journal_replay(struct bch_fs *c)
struct journal_keys *keys = &c->journal_keys;
struct journal_key **keys_sorted, *k;
struct journal *j = &c->journal;
- struct bch_dev *ca;
- unsigned idx;
size_t i;
- u64 seq;
int ret;
keys_sorted = kmalloc_array(sizeof(*keys_sorted), keys->nr, GFP_KERNEL);
@@ -609,76 +530,30 @@ static int bch2_journal_replay(struct bch_fs *c)
sizeof(keys_sorted[0]),
journal_sort_seq_cmp, NULL);
- if (keys->nr)
+ if (keys->nr) {
+ bch_verbose(c, "starting journal replay, %zu keys", keys->nr);
replay_now_at(j, keys->journal_seq_base);
-
- seq = j->replay_journal_seq;
-
- /*
- * First replay updates to the alloc btree - these will only update the
- * btree key cache:
- */
- for (i = 0; i < keys->nr; i++) {
- k = keys_sorted[i];
-
- cond_resched();
-
- if (!k->level && k->btree_id == BTREE_ID_alloc) {
- j->replay_journal_seq = keys->journal_seq_base + k->journal_seq;
- ret = bch2_journal_replay_key(c, k);
- if (ret)
- goto err;
- }
- }
-
- /* Now we can start the allocator threads: */
- set_bit(BCH_FS_ALLOC_REPLAY_DONE, &c->flags);
- for_each_member_device(ca, c, idx)
- bch2_wake_allocator(ca);
-
- /*
- * Next replay updates to interior btree nodes:
- */
- for (i = 0; i < keys->nr; i++) {
- k = keys_sorted[i];
-
- cond_resched();
-
- if (k->level) {
- j->replay_journal_seq = keys->journal_seq_base + k->journal_seq;
- ret = bch2_journal_replay_key(c, k);
- if (ret)
- goto err;
- }
}
- /*
- * Now that the btree is in a consistent state, we can start journal
- * reclaim (which will be flushing entries from the btree key cache back
- * to the btree:
- */
- set_bit(BCH_FS_BTREE_INTERIOR_REPLAY_DONE, &c->flags);
- set_bit(JOURNAL_RECLAIM_STARTED, &j->flags);
- journal_reclaim_kick(j);
-
- j->replay_journal_seq = seq;
-
- /*
- * Now replay leaf node updates:
- */
for (i = 0; i < keys->nr; i++) {
k = keys_sorted[i];
cond_resched();
- if (k->level || k->btree_id == BTREE_ID_alloc)
- continue;
-
- replay_now_at(j, keys->journal_seq_base + k->journal_seq);
+ if (!k->allocated)
+ replay_now_at(j, keys->journal_seq_base + k->journal_seq);
- ret = bch2_journal_replay_key(c, k);
- if (ret)
+ ret = bch2_trans_do(c, NULL, NULL,
+ BTREE_INSERT_LAZY_RW|
+ BTREE_INSERT_NOFAIL|
+ BTREE_INSERT_JOURNAL_RESERVED|
+ (!k->allocated ? BTREE_INSERT_JOURNAL_REPLAY : 0),
+ bch2_journal_replay_key(&trans, k));
+ if (ret) {
+ bch_err(c, "journal replay: error %d while replaying key at btree %s level %u",
+ ret, bch2_btree_ids[k->btree_id], k->level);
goto err;
+ }
}
replay_now_at(j, j->replay_journal_seq_end);
@@ -686,14 +561,9 @@ static int bch2_journal_replay(struct bch_fs *c)
bch2_journal_set_replay_done(j);
bch2_journal_flush_all_pins(j);
- kfree(keys_sorted);
-
- return bch2_journal_error(j);
+ ret = bch2_journal_error(j);
err:
- bch_err(c, "journal replay: error %d while replaying key at btree %s level %u",
- ret, bch2_btree_ids[k->btree_id], k->level);
kfree(keys_sorted);
-
return ret;
}
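
The three staged passes above (alloc keys, then interior nodes, then leaves) collapse into a single journal_seq-ordered loop, with the per-key differences expressed as commit flags rather than separate loops. Restating the flag selection from this hunk (the comment on what JOURNAL_REPLAY implies is an inference, not stated in the diff):

	unsigned commit_flags =
		BTREE_INSERT_LAZY_RW|
		BTREE_INSERT_NOFAIL|
		BTREE_INSERT_JOURNAL_RESERVED|
		/* keys straight from the journal replay at their original
		 * seq and presumably skip being journalled a second time: */
		(!k->allocated ? BTREE_INSERT_JOURNAL_REPLAY : 0);
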
@@ -1217,7 +1087,11 @@ use_clean:
bch_verbose(c, "starting alloc read");
err = "error reading allocation information";
- ret = bch2_alloc_read(c);
+
+ down_read(&c->gc_lock);
+ ret = bch2_alloc_read(c, false, false);
+ up_read(&c->gc_lock);
+
if (ret)
goto err;
bch_verbose(c, "alloc read done");
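
bch2_alloc_read() has grown two boolean arguments here, and the call is now wrapped in gc_lock. Given the commit subject (btree_gc no longer uses the main in-memory bucket array), the parameters presumably select the gc copy of the alloc state and a metadata-only read; the names below are assumptions, not taken from this diff:

	/* Assumed signature; parameter names are guesses: */
	int bch2_alloc_read(struct bch_fs *c, bool gc, bool metadata_only);

	/* Recovery reads the main (non-gc) alloc info for all data,
	 * under gc_lock so it can't race with a concurrent gc pass. */
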
@@ -1231,6 +1105,13 @@ use_clean:
set_bit(BCH_FS_ALLOC_READ_DONE, &c->flags);
+ /*
+ * If we're not running fsck, this ensures bch2_fsck_err() calls are
+ * instead interpreted as bch2_inconsistent_err() calls:
+ */
+ if (!c->opts.fsck)
+ set_bit(BCH_FS_FSCK_DONE, &c->flags);
+
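
A sketch of what that reinterpretation presumably looks like at the reporting site; the flag and the two function names come from the comment above, but the structure and the repair helper are hypothetical:

	static int report_error(struct bch_fs *c)
	{
		/* Past this point, errors can no longer be offered for
		 * repair, so escalate to a hard inconsistency: */
		if (test_bit(BCH_FS_FSCK_DONE, &c->flags))
			return bch2_inconsistent_err(c);

		return ask_fsck_to_repair(c);	/* hypothetical helper */
	}
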
if (c->opts.fsck ||
!(c->sb.compat & (1ULL << BCH_COMPAT_alloc_info)) ||
!(c->sb.compat & (1ULL << BCH_COMPAT_alloc_metadata)) ||
@@ -1265,24 +1146,8 @@ use_clean:
ret = bch2_journal_replay(c);
if (ret)
goto err;
- bch_verbose(c, "journal replay done");
-
- if (test_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags) &&
- !c->opts.nochanges) {
- /*
- * note that even when filesystem was clean there might be work
- * to do here, if we ran gc (because of fsck) which recalculated
- * oldest_gen:
- */
- bch_verbose(c, "writing allocation info");
- err = "error writing out alloc info";
- ret = bch2_alloc_write_all(c, BTREE_INSERT_LAZY_RW);
- if (ret) {
- bch_err(c, "error writing alloc info");
- goto err;
- }
- bch_verbose(c, "alloc write done");
- }
+ if (c->opts.verbose || !c->sb.clean)
+ bch_info(c, "journal replay done");
if (c->sb.version < bcachefs_metadata_version_snapshot_2) {
bch2_fs_lazy_rw(c);
@@ -1430,14 +1295,11 @@ int bch2_fs_initialize(struct bch_fs *c)
set_bit(BCH_FS_ALLOC_READ_DONE, &c->flags);
set_bit(BCH_FS_INITIAL_GC_DONE, &c->flags);
+ set_bit(BCH_FS_FSCK_DONE, &c->flags);
for (i = 0; i < BTREE_ID_NR; i++)
bch2_btree_root_alloc(c, i);
- set_bit(BCH_FS_ALLOC_REPLAY_DONE, &c->flags);
- set_bit(BCH_FS_BTREE_INTERIOR_REPLAY_DONE, &c->flags);
- set_bit(JOURNAL_RECLAIM_STARTED, &c->journal.flags);
-
err = "unable to allocate journal buckets";
for_each_online_member(ca, c, i) {
ret = bch2_dev_journal_alloc(ca);
diff --git a/libbcachefs/recovery.h b/libbcachefs/recovery.h
index a7a9496a..21bdad9d 100644
--- a/libbcachefs/recovery.h
+++ b/libbcachefs/recovery.h
@@ -31,6 +31,9 @@ struct btree_and_journal_iter {
} last;
};
+size_t bch2_journal_key_search(struct journal_keys *, enum btree_id,
+ unsigned, struct bpos);
+
int bch2_journal_key_insert_take(struct bch_fs *, enum btree_id,
unsigned, struct bkey_i *);
int bch2_journal_key_insert(struct bch_fs *, enum btree_id,
@@ -45,14 +48,13 @@ struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *
struct bkey_s_c bch2_btree_and_journal_iter_next(struct btree_and_journal_iter *);
void bch2_btree_and_journal_iter_exit(struct btree_and_journal_iter *);
+void __bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *,
+ struct bch_fs *, struct btree *,
+ struct btree_node_iter, struct bpos);
void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *,
struct bch_fs *,
struct btree *);
-typedef int (*btree_walk_key_fn)(struct btree_trans *, struct bkey_s_c);
-
-int bch2_btree_and_journal_walk(struct btree_trans *, enum btree_id, btree_walk_key_fn);
-
void bch2_journal_keys_free(struct journal_keys *);
void bch2_journal_entries_free(struct list_head *);
diff --git a/libbcachefs/super-io.c b/libbcachefs/super-io.c
index bbed24b7..8e28a13a 100644
--- a/libbcachefs/super-io.c
+++ b/libbcachefs/super-io.c
@@ -752,11 +752,24 @@ int bch2_write_super(struct bch_fs *c)
closure_sync(cl);
for_each_online_member(ca, c, i) {
- if (!ca->sb_write_error &&
- ca->disk_sb.seq !=
- le64_to_cpu(ca->sb_read_scratch->seq)) {
+ if (ca->sb_write_error)
+ continue;
+
+ if (le64_to_cpu(ca->sb_read_scratch->seq) < ca->disk_sb.seq) {
+ bch2_fs_fatal_error(c,
+ "Superblock write was silently dropped! (seq %llu expected %llu)",
+ le64_to_cpu(ca->sb_read_scratch->seq),
+ ca->disk_sb.seq);
+ percpu_ref_put(&ca->io_ref);
+ ret = -EROFS;
+ goto out;
+ }
+
+ if (le64_to_cpu(ca->sb_read_scratch->seq) > ca->disk_sb.seq) {
bch2_fs_fatal_error(c,
- "Superblock modified by another process");
+ "Superblock modified by another process (seq %llu expected %llu)",
+ le64_to_cpu(ca->sb_read_scratch->seq),
+ ca->disk_sb.seq);
percpu_ref_put(&ca->io_ref);
ret = -EROFS;
goto out;
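
The old single check only caught a sequence mismatch; the new code distinguishes the two directions, since they mean different failures. Distilled into a standalone sketch (the helper name is mine):

	#include <errno.h>
	#include <stdint.h>

	/* We just wrote a superblock with sequence number `expected`,
	 * then read it back. Classify what the device gave us: */
	static int classify_sb_readback(uint64_t read_seq, uint64_t expected)
	{
		if (read_seq < expected)
			return -EROFS;	/* our write was silently dropped */
		if (read_seq > expected)
			return -EROFS;	/* another process wrote a newer sb */
		return 0;		/* round trip matched: write landed */
	}
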