summaryrefslogtreecommitdiff
path: root/fs/bcachefs/btree_io.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/bcachefs/btree_io.c')
-rw-r--r--fs/bcachefs/btree_io.c136
1 files changed, 82 insertions, 54 deletions
diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c
index 19fd951495ac..bd86dd7151a1 100644
--- a/fs/bcachefs/btree_io.c
+++ b/fs/bcachefs/btree_io.c
@@ -24,8 +24,15 @@
#include "super-io.h"
#include "trace.h"
+#include <linux/moduleparam.h>
#include <linux/sched/mm.h>
+#ifdef CONFIG_BCACHEFS_DEBUG
+static unsigned bch2_btree_read_corrupt_ratio;
+module_param_named(btree_read_corrupt_ratio, bch2_btree_read_corrupt_ratio, uint, 0644);
+MODULE_PARM_DESC(btree_read_corrupt_ratio, "");
+#endif
+
static void bch2_btree_node_header_to_text(struct printbuf *out, struct btree_node *bn)
{
bch2_btree_id_level_to_text(out, BTREE_NODE_ID(bn), BTREE_NODE_LEVEL(bn));
@@ -568,9 +575,9 @@ static int __btree_err(int ret,
bch2_mark_btree_validate_failure(failed, ca->dev_idx);
struct extent_ptr_decoded pick;
- have_retry = !bch2_bkey_pick_read_device(c,
+ have_retry = bch2_bkey_pick_read_device(c,
bkey_i_to_s_c(&b->key),
- failed, &pick, -1);
+ failed, &pick, -1) == 1;
}
if (!have_retry && ret == -BCH_ERR_btree_node_read_err_want_retry)
@@ -585,7 +592,7 @@ static int __btree_err(int ret,
!(test_bit(BCH_FS_in_fsck, &c->flags) &&
c->opts.fix_errors == FSCK_FIX_ask);
- struct printbuf out = PRINTBUF;
+ CLASS(printbuf, out)();
bch2_log_msg_start(c, &out);
if (!print_deferred)
@@ -612,14 +619,13 @@ static int __btree_err(int ret,
if (!have_retry)
ret = bch_err_throw(c, fsck_fix);
- goto out;
+ return ret;
case -BCH_ERR_btree_node_read_err_bad_node:
prt_str(&out, ", ");
- ret = __bch2_topology_error(c, &out);
break;
}
- goto out;
+ return ret;
}
if (rw == WRITE) {
@@ -641,17 +647,14 @@ static int __btree_err(int ret,
if (!have_retry)
ret = bch_err_throw(c, fsck_fix);
- goto out;
+ return ret;
case -BCH_ERR_btree_node_read_err_bad_node:
prt_str(&out, ", ");
- ret = __bch2_topology_error(c, &out);
break;
}
print:
bch2_print_str(c, KERN_ERR, out.buf);
-out:
fsck_err:
- printbuf_exit(&out);
return ret;
}
@@ -730,8 +733,8 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca,
struct printbuf *err_msg)
{
unsigned version = le16_to_cpu(i->version);
- struct printbuf buf1 = PRINTBUF;
- struct printbuf buf2 = PRINTBUF;
+ CLASS(printbuf, buf1)();
+ CLASS(printbuf, buf2)();
int ret = 0;
btree_err_on(!bch2_version_compatible(version),
@@ -750,10 +753,9 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca,
"bset version %u older than superblock version_min %u",
version, c->sb.version_min)) {
if (bch2_version_compatible(version)) {
- mutex_lock(&c->sb_lock);
+ guard(mutex)(&c->sb_lock);
c->disk_sb.sb->version_min = cpu_to_le16(version);
bch2_write_super(c);
- mutex_unlock(&c->sb_lock);
} else {
/* We have no idea what's going on: */
i->version = cpu_to_le16(c->sb.version);
@@ -767,10 +769,9 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca,
btree_node_bset_newer_than_sb,
"bset version %u newer than superblock version %u",
version, c->sb.version)) {
- mutex_lock(&c->sb_lock);
+ guard(mutex)(&c->sb_lock);
c->disk_sb.sb->version = cpu_to_le16(version);
bch2_write_super(c);
- mutex_unlock(&c->sb_lock);
}
btree_err_on(BSET_SEPARATE_WHITEOUTS(i),
@@ -870,8 +871,6 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca,
&bn->format);
}
fsck_err:
- printbuf_exit(&buf2);
- printbuf_exit(&buf1);
return ret;
}
@@ -941,7 +940,7 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b,
{
unsigned version = le16_to_cpu(i->version);
struct bkey_packed *k, *prev = NULL;
- struct printbuf buf = PRINTBUF;
+ CLASS(printbuf, buf)();
bool updated_range = b->key.k.type == KEY_TYPE_btree_ptr_v2 &&
BTREE_PTR_RANGE_UPDATED(&bkey_i_to_btree_ptr_v2(&b->key)->v);
int ret = 0;
@@ -1046,7 +1045,6 @@ got_good_key:
set_btree_node_need_rewrite_error(b);
}
fsck_err:
- printbuf_exit(&buf);
return ret;
}
@@ -1065,7 +1063,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
BTREE_PTR_RANGE_UPDATED(&bkey_i_to_btree_ptr_v2(&b->key)->v);
unsigned ptr_written = btree_ptr_sectors_written(bkey_i_to_s_c(&b->key));
u64 max_journal_seq = 0;
- struct printbuf buf = PRINTBUF;
+ CLASS(printbuf, buf)();
int ret = 0, write = READ;
u64 start_time = local_clock();
@@ -1297,9 +1295,6 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
btree_bounce_free(c, btree_buf_bytes(b), used_mempool, sorted);
- if (updated_range)
- bch2_btree_node_drop_keys_outside_node(b);
-
i = &b->data->keys;
for (k = i->start; k != vstruct_last(i);) {
struct bkey tmp;
@@ -1337,15 +1332,45 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
btree_node_reset_sib_u64s(b);
- scoped_guard(rcu)
- bkey_for_each_ptr(bch2_bkey_ptrs(bkey_i_to_s(&b->key)), ptr) {
- struct bch_dev *ca2 = bch2_dev_rcu(c, ptr->dev);
+ if (updated_range)
+ bch2_btree_node_drop_keys_outside_node(b);
- if (!ca2 || ca2->mi.state != BCH_MEMBER_STATE_rw) {
- set_btree_node_need_rewrite(b);
- set_btree_node_need_rewrite_degraded(b);
+ /*
+ * XXX:
+ *
+ * We deadlock if too many btree updates require node rewrites while
+ * we're still in journal replay.
+ *
+ * This is because btree node rewrites generate more updates for the
+ * interior updates (alloc, backpointers), and if those updates touch
+ * new nodes and generate more rewrites - well, you see the problem.
+ *
+ * The biggest cause is that we don't use the btree write buffer (for
+ * the backpointer updates - this needs some real thought on locking in
+ * order to fix.
+ *
+ * The problem with this workaround (not doing the rewrite for degraded
+ * nodes in journal replay) is that those degraded nodes persist, and we
+ * don't want that (this is a real bug when a btree node write completes
+ * with fewer replicas than we wanted and leaves a degraded node due to
+ * device _removal_, i.e. the device went away mid write).
+ *
+ * It's less of a bug here, but still a problem because we don't yet
+ * have a way of tracking degraded data - we another index (all
+ * extents/btree nodes, by replicas entry) in order to fix properly
+ * (re-replicate degraded data at the earliest possible time).
+ */
+ if (c->recovery.passes_complete & BIT_ULL(BCH_RECOVERY_PASS_journal_replay)) {
+ scoped_guard(rcu)
+ bkey_for_each_ptr(bch2_bkey_ptrs(bkey_i_to_s(&b->key)), ptr) {
+ struct bch_dev *ca2 = bch2_dev_rcu(c, ptr->dev);
+
+ if (!ca2 || ca2->mi.state != BCH_MEMBER_STATE_rw) {
+ set_btree_node_need_rewrite(b);
+ set_btree_node_need_rewrite_degraded(b);
+ }
}
- }
+ }
if (!ptr_written) {
set_btree_node_need_rewrite(b);
@@ -1353,7 +1378,6 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
}
fsck_err:
mempool_free(iter, &c->fill_iter);
- printbuf_exit(&buf);
bch2_time_stats_update(&c->times[BCH_TIME_btree_node_read_done], start_time);
return ret;
}
@@ -1369,7 +1393,7 @@ static void btree_node_read_work(struct work_struct *work)
struct bch_io_failures failed = { .nr = 0 };
int ret = 0;
- struct printbuf buf = PRINTBUF;
+ CLASS(printbuf, buf)();
bch2_log_msg_start(c, &buf);
prt_printf(&buf, "btree node read error at btree ");
@@ -1381,7 +1405,7 @@ static void btree_node_read_work(struct work_struct *work)
ret = bch2_bkey_pick_read_device(c,
bkey_i_to_s_c(&b->key),
&failed, &rb->pick, -1);
- if (ret) {
+ if (ret <= 0) {
set_btree_node_read_error(b);
break;
}
@@ -1412,6 +1436,11 @@ start:
continue;
}
+ memset(&bio->bi_iter, 0, sizeof(bio->bi_iter));
+ bio->bi_iter.bi_size = btree_buf_bytes(b);
+
+ bch2_maybe_corrupt_bio(bio, bch2_btree_read_corrupt_ratio);
+
ret = bch2_btree_node_read_done(c, ca, b, &failed, &buf);
if (ret == -BCH_ERR_btree_node_read_err_want_retry ||
ret == -BCH_ERR_btree_node_read_err_must_retry)
@@ -1456,7 +1485,6 @@ start:
bch2_time_stats_update(&c->times[BCH_TIME_btree_node_read],
rb->start_time);
bio_put(&rb->bio);
- printbuf_exit(&buf);
clear_btree_node_read_in_flight(b);
smp_mb__after_atomic();
wake_up_bit(&b->flags, BTREE_NODE_read_in_flight);
@@ -1538,7 +1566,7 @@ static CLOSURE_CALLBACK(btree_node_read_all_replicas_done)
closure_type(ra, struct btree_node_read_all, cl);
struct bch_fs *c = ra->c;
struct btree *b = ra->b;
- struct printbuf buf = PRINTBUF;
+ CLASS(printbuf, buf)();
bool dump_bset_maps = false;
int ret = 0, best = -1, write = READ;
unsigned i, written = 0, written2 = 0;
@@ -1647,11 +1675,10 @@ fsck_err:
if (ret) {
set_btree_node_read_error(b);
- struct printbuf buf = PRINTBUF;
+ CLASS(printbuf, buf)();
bch2_btree_lost_data(c, &buf, b->c.btree_id);
if (buf.pos)
bch_err(c, "%s", buf.buf);
- printbuf_exit(&buf);
} else if (*saw_error)
bch2_btree_node_rewrite_async(c, b);
@@ -1662,7 +1689,6 @@ fsck_err:
closure_debug_destroy(&ra->cl);
kfree(ra);
- printbuf_exit(&buf);
clear_btree_node_read_in_flight(b);
smp_mb__after_atomic();
@@ -1782,7 +1808,7 @@ void bch2_btree_node_read(struct btree_trans *trans, struct btree *b,
if (ret <= 0) {
bool ratelimit = true;
- struct printbuf buf = PRINTBUF;
+ CLASS(printbuf, buf)();
bch2_log_msg_start(c, &buf);
prt_str(&buf, "btree node read error: no device to read from\n at ");
@@ -1799,7 +1825,6 @@ void bch2_btree_node_read(struct btree_trans *trans, struct btree *b,
DEFAULT_RATELIMIT_BURST);
if (!ratelimit || __ratelimit(&rs))
bch2_print_str(c, KERN_ERR, buf.buf);
- printbuf_exit(&buf);
set_btree_node_read_error(b);
clear_btree_node_read_in_flight(b);
@@ -1881,9 +1906,8 @@ static int __bch2_btree_root_read(struct btree_trans *trans, enum btree_id id,
bch2_btree_node_read(trans, b, true);
if (btree_node_read_error(b)) {
- mutex_lock(&c->btree_cache.lock);
- bch2_btree_node_hash_remove(&c->btree_cache, b);
- mutex_unlock(&c->btree_cache.lock);
+ scoped_guard(mutex, &c->btree_cache.lock)
+ bch2_btree_node_hash_remove(&c->btree_cache, b);
ret = bch_err_throw(c, btree_node_read_error);
goto err;
@@ -1900,7 +1924,8 @@ err:
int bch2_btree_root_read(struct bch_fs *c, enum btree_id id,
const struct bkey_i *k, unsigned level)
{
- return bch2_trans_run(c, __bch2_btree_root_read(trans, id, k, level));
+ CLASS(btree_trans, trans)(c);
+ return __bch2_btree_root_read(trans, id, k, level);
}
struct btree_node_scrub {
@@ -1979,7 +2004,7 @@ static void btree_node_scrub_work(struct work_struct *work)
{
struct btree_node_scrub *scrub = container_of(work, struct btree_node_scrub, work);
struct bch_fs *c = scrub->c;
- struct printbuf err = PRINTBUF;
+ CLASS(printbuf, err)();
__bch2_btree_pos_to_text(&err, c, scrub->btree, scrub->level,
bkey_i_to_s_c(scrub->key.k));
@@ -1994,7 +2019,6 @@ static void btree_node_scrub_work(struct work_struct *work)
bch_err_fn_ratelimited(c, ret);
}
- printbuf_exit(&err);
bch2_bkey_buf_exit(&scrub->key, c);;
btree_bounce_free(c, c->opts.btree_node_size, scrub->used_mempool, scrub->buf);
enumerated_ref_put(&scrub->ca->io_ref[READ], BCH_DEV_READ_REF_btree_node_scrub);
@@ -2175,7 +2199,8 @@ static void btree_node_write_work(struct work_struct *work)
}
} else {
- ret = bch2_trans_do(c,
+ CLASS(btree_trans, trans)(c);
+ ret = lockrestart_do(trans,
bch2_btree_node_update_key_get_iter(trans, b, &wbio->key,
BCH_WATERMARK_interior_updates|
BCH_TRANS_COMMIT_journal_reclaim|
@@ -2194,11 +2219,10 @@ err:
set_btree_node_noevict(b);
if (!bch2_err_matches(ret, EROFS)) {
- struct printbuf buf = PRINTBUF;
+ CLASS(printbuf, buf)();
prt_printf(&buf, "writing btree node: %s\n ", bch2_err_str(ret));
bch2_btree_pos_to_text(&buf, c, b);
bch2_fs_fatal_error(c, "%s", buf.buf);
- printbuf_exit(&buf);
}
goto out;
}
@@ -2217,13 +2241,12 @@ static void btree_node_write_endio(struct bio *bio)
wbio->submit_time, !bio->bi_status);
if (ca && bio->bi_status) {
- struct printbuf buf = PRINTBUF;
- buf.atomic++;
+ CLASS(printbuf, buf)();
+ guard(printbuf_atomic)(&buf);
prt_printf(&buf, "btree write error: %s\n ",
bch2_blk_status_to_str(bio->bi_status));
bch2_btree_pos_to_text(&buf, c, b);
bch_err_dev_ratelimited(ca, "%s", buf.buf);
- printbuf_exit(&buf);
}
if (bio->bi_status) {
@@ -2517,9 +2540,14 @@ do_write:
}
count_event(c, btree_node_write);
+ /*
+ * blk-wbt.c throttles all writes except those that have both REQ_SYNC
+ * and REQ_IDLE set...
+ */
+
wbio = container_of(bio_alloc_bioset(NULL,
buf_pages(data, sectors_to_write << 9),
- REQ_OP_WRITE|REQ_META,
+ REQ_OP_WRITE|REQ_META|REQ_SYNC|REQ_IDLE,
GFP_NOFS,
&c->btree_bio),
struct btree_write_bio, wbio.bio);