summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorKent Overstreet <kent.overstreet@linux.dev>2025-07-01 13:36:51 -0400
committerKent Overstreet <kent.overstreet@linux.dev>2025-07-01 13:41:13 -0400
commitd321d5c84188b9b43d53d5b3ab822dbe4f754456 (patch)
tree23bd1c32c62fd4d7c9b4f5d820d0599dc801109e
parent63b14f899c4603c5220769b1bb6ec8ada36023a2 (diff)
bcachefs: Work around deadlock to btree node rewrites in journal replay
Don't mark btree nodes for rewrites, if they are or would be degraded, if journal replay hasn't finished, to avoid a deadlock. This is because btree node rewrites generate more updates for the interior updates (alloc, backpointers), and if those updates touch new nodes and generate more rewrites - we can only have so many interior btree updates in flight before we deadlock on open_buckets. The biggest cause is that we don't use the btree write buffer (for the backpointer updates - this needs some real thought on locking in order to fix. The problem with this workaround (not doing the rewrite for degraded nodes in journal replay) is that those degraded nodes persist, and we don't want that (this is a real bug when a btree node write completes with fewer replicas than we wanted and leaves a degraded node due to device _removal_, i.e. the device went away mid write). It's less of a bug here, but still a problem because we don't yet have a way of tracking degraded data - we another index (all extents/btree nodes, by replicas entry) in order to fix properly (re-replicate degraded data at the earliest possible time). Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
-rw-r--r--fs/bcachefs/btree_io.c43
1 files changed, 35 insertions, 8 deletions
diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c
index 19fd951495ac..84e302afc8fc 100644
--- a/fs/bcachefs/btree_io.c
+++ b/fs/bcachefs/btree_io.c
@@ -1337,15 +1337,42 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
btree_node_reset_sib_u64s(b);
- scoped_guard(rcu)
- bkey_for_each_ptr(bch2_bkey_ptrs(bkey_i_to_s(&b->key)), ptr) {
- struct bch_dev *ca2 = bch2_dev_rcu(c, ptr->dev);
-
- if (!ca2 || ca2->mi.state != BCH_MEMBER_STATE_rw) {
- set_btree_node_need_rewrite(b);
- set_btree_node_need_rewrite_degraded(b);
+ /*
+ * XXX:
+ *
+ * We deadlock if too many btree updates require node rewrites while
+ * we're still in journal replay.
+ *
+ * This is because btree node rewrites generate more updates for the
+ * interior updates (alloc, backpointers), and if those updates touch
+ * new nodes and generate more rewrites - well, you see the problem.
+ *
+ * The biggest cause is that we don't use the btree write buffer (for
+ * the backpointer updates - this needs some real thought on locking in
+ * order to fix.
+ *
+ * The problem with this workaround (not doing the rewrite for degraded
+ * nodes in journal replay) is that those degraded nodes persist, and we
+ * don't want that (this is a real bug when a btree node write completes
+ * with fewer replicas than we wanted and leaves a degraded node due to
+ * device _removal_, i.e. the device went away mid write).
+ *
+ * It's less of a bug here, but still a problem because we don't yet
+ * have a way of tracking degraded data - we another index (all
+ * extents/btree nodes, by replicas entry) in order to fix properly
+ * (re-replicate degraded data at the earliest possible time).
+ */
+ if (c->recovery.passes_complete & BIT_ULL(BCH_RECOVERY_PASS_journal_replay)) {
+ scoped_guard(rcu)
+ bkey_for_each_ptr(bch2_bkey_ptrs(bkey_i_to_s(&b->key)), ptr) {
+ struct bch_dev *ca2 = bch2_dev_rcu(c, ptr->dev);
+
+ if (!ca2 || ca2->mi.state != BCH_MEMBER_STATE_rw) {
+ set_btree_node_need_rewrite(b);
+ set_btree_node_need_rewrite_degraded(b);
+ }
}
- }
+ }
if (!ptr_written) {
set_btree_node_need_rewrite(b);