5 files changed, 60 insertions, 18 deletions
diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h
index 41c6d8865a74..fb3156ed7f0b 100644
--- a/fs/bcachefs/bcachefs.h
+++ b/fs/bcachefs/bcachefs.h
@@ -1288,7 +1288,7 @@ static inline int bch2_fs_casefold_enabled(struct bch_fs *c)
 {
 	if (!IS_ENABLED(CONFIG_UNICODE))
 		return bch_err_throw(c, no_casefolding_without_utf8);
-	if (!c->opts.casefold_disabled)
+	if (c->opts.casefold_disabled)
 		return bch_err_throw(c, casefolding_disabled);
 	return 0;
 }
diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c
index 19fd951495ac..84e302afc8fc 100644
--- a/fs/bcachefs/btree_io.c
+++ b/fs/bcachefs/btree_io.c
@@ -1337,15 +1337,42 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
 
 	btree_node_reset_sib_u64s(b);
 
-	scoped_guard(rcu)
-		bkey_for_each_ptr(bch2_bkey_ptrs(bkey_i_to_s(&b->key)), ptr) {
-			struct bch_dev *ca2 = bch2_dev_rcu(c, ptr->dev);
-
-			if (!ca2 || ca2->mi.state != BCH_MEMBER_STATE_rw) {
-				set_btree_node_need_rewrite(b);
-				set_btree_node_need_rewrite_degraded(b);
+	/*
+	 * XXX:
+	 *
+	 * We deadlock if too many btree updates require node rewrites while
+	 * we're still in journal replay.
+	 *
+	 * This is because btree node rewrites generate more updates for the
+	 * interior updates (alloc, backpointers), and if those updates touch
+	 * new nodes and generate more rewrites - well, you see the problem.
+	 *
+	 * The biggest cause is that we don't use the btree write buffer (for
+	 * the backpointer updates - this needs some real thought on locking in
+	 * order to fix.
+	 *
+	 * The problem with this workaround (not doing the rewrite for degraded
+	 * nodes in journal replay) is that those degraded nodes persist, and we
+	 * don't want that (this is a real bug when a btree node write completes
+	 * with fewer replicas than we wanted and leaves a degraded node due to
+	 * device _removal_, i.e. the device went away mid write).
+	 *
+	 * It's less of a bug here, but still a problem because we don't yet
+	 * have a way of tracking degraded data - we another index (all
+	 * extents/btree nodes, by replicas entry) in order to fix properly
+	 * (re-replicate degraded data at the earliest possible time).
+	 */
+	if (c->recovery.passes_complete & BIT_ULL(BCH_RECOVERY_PASS_journal_replay)) {
+		scoped_guard(rcu)
+			bkey_for_each_ptr(bch2_bkey_ptrs(bkey_i_to_s(&b->key)), ptr) {
+				struct bch_dev *ca2 = bch2_dev_rcu(c, ptr->dev);
+
+				if (!ca2 || ca2->mi.state != BCH_MEMBER_STATE_rw) {
+					set_btree_node_need_rewrite(b);
+					set_btree_node_need_rewrite_degraded(b);
+				}
 			}
-		}
+	}
 
 	if (!ptr_written) {
 		set_btree_node_need_rewrite(b);
diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c
index e848e210a9bf..3968f3be7f3b 100644
--- a/fs/bcachefs/data_update.c
+++ b/fs/bcachefs/data_update.c
@@ -783,6 +783,9 @@ static int can_write_extent(struct bch_fs *c, struct data_update *m)
 	darray_for_each(m->op.devs_have, i)
 		__clear_bit(*i, devs.d);
 
+	CLASS(printbuf, buf)();
+	buf.atomic++;
+
 	guard(rcu)();
 
 	unsigned nr_replicas = 0, i;
@@ -794,7 +797,11 @@ static int can_write_extent(struct bch_fs *c, struct data_update *m)
 		struct bch_dev_usage usage;
 		bch2_dev_usage_read_fast(ca, &usage);
 
-		if (!dev_buckets_free(ca, usage, m->op.watermark))
+		u64 nr_free = dev_buckets_free(ca, usage, m->op.watermark);
+
+		prt_printf(&buf, "%s=%llu ", ca->name, nr_free);
+
+		if (!nr_free)
 			continue;
 
 		nr_replicas += ca->mi.durability;
@@ -802,8 +809,10 @@ static int can_write_extent(struct bch_fs *c, struct data_update *m)
 			break;
 	}
 
-	if (!nr_replicas)
+	if (!nr_replicas) {
+		trace_data_update_done_no_rw_devs(c, buf.buf);
 		return bch_err_throw(c, data_update_done_no_rw_devs);
+	}
 	if (nr_replicas < m->op.nr_replicas)
 		return bch_err_throw(c, insufficient_devices);
 	return 0;
diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c
index 7340c1118579..6980cd5b0ca8 100644
--- a/fs/bcachefs/super.c
+++ b/fs/bcachefs/super.c
@@ -1024,13 +1024,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts *opts,
 			goto err;
 	}
 
-#if !IS_ENABLED(CONFIG_UNICODE)
-	if (c->sb.features & BIT_ULL(BCH_FEATURE_casefolding)) {
-		printk(KERN_ERR "Cannot mount a filesystem with casefolding on a kernel without CONFIG_UNICODE\n");
-		ret = -EINVAL;
-		goto err;
-	}
-#endif
+#if IS_ENABLED(CONFIG_UNICODE)
 	if (!bch2_fs_casefold_enabled(c)) {
 		/* Default encoding until we can potentially have more as an option. */
 		c->cf_encoding = utf8_load(BCH_FS_DEFAULT_UTF8_ENCODING);
@@ -1043,6 +1037,13 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts *opts,
 			goto err;
 		}
 	}
+#else
+	if (c->sb.features & BIT_ULL(BCH_FEATURE_casefolding)) {
+		printk(KERN_ERR "Cannot mount a filesystem with casefolding on a kernel without CONFIG_UNICODE\n");
+		ret = -EINVAL;
+		goto err;
+	}
+#endif
 
 	for (i = 0; i < c->sb.nr_devices; i++) {
 		if (!bch2_member_exists(c->disk_sb.sb, i))
diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h
index b5dae1145afa..9324ef32903d 100644
--- a/fs/bcachefs/trace.h
+++ b/fs/bcachefs/trace.h
@@ -1330,6 +1330,11 @@ DEFINE_EVENT(fs_str, data_update,
 	TP_ARGS(c, str)
 );
 
+DEFINE_EVENT(fs_str, data_update_done_no_rw_devs,
+	TP_PROTO(struct bch_fs *c, const char *str),
+	TP_ARGS(c, str)
+);
+
 DEFINE_EVENT(fs_str, io_move_pred,
 	TP_PROTO(struct bch_fs *c, const char *str),
 	TP_ARGS(c, str)