-rw-r--r--  fs/bcachefs/btree_io.c     43
-rw-r--r--  fs/bcachefs/data_update.c  13
-rw-r--r--  fs/bcachefs/fs.c           20
-rw-r--r--  fs/bcachefs/trace.h         5
4 files changed, 64 insertions(+), 17 deletions(-)
diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c
index 19fd951495ac..84e302afc8fc 100644
--- a/fs/bcachefs/btree_io.c
+++ b/fs/bcachefs/btree_io.c
@@ -1337,15 +1337,42 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
btree_node_reset_sib_u64s(b);
- scoped_guard(rcu)
- bkey_for_each_ptr(bch2_bkey_ptrs(bkey_i_to_s(&b->key)), ptr) {
- struct bch_dev *ca2 = bch2_dev_rcu(c, ptr->dev);
-
- if (!ca2 || ca2->mi.state != BCH_MEMBER_STATE_rw) {
- set_btree_node_need_rewrite(b);
- set_btree_node_need_rewrite_degraded(b);
+ /*
+ * XXX:
+ *
+ * We deadlock if too many btree updates require node rewrites while
+ * we're still in journal replay.
+ *
+ * This is because btree node rewrites generate more updates via the
+ * interior update path (alloc, backpointers), and if those updates touch
+ * new nodes and generate more rewrites - well, you see the problem.
+ *
+ * The biggest cause is that we don't use the btree write buffer (for
+ * the backpointer updates) - this needs some real thought on locking in
+ * order to fix.
+ *
+ * The problem with this workaround (not doing the rewrite for degraded
+ * nodes in journal replay) is that those degraded nodes persist, and we
+ * don't want that (this is a real bug when a btree node write completes
+ * with fewer replicas than we wanted and leaves a degraded node due to
+ * device _removal_, i.e. the device went away mid write).
+ *
+ * It's less of a bug here, but still a problem because we don't yet
+ * have a way of tracking degraded data - we need another index (all
+ * extents/btree nodes, by replicas entry) in order to fix this properly
+ * (re-replicate degraded data at the earliest possible time).
+ */
+ if (c->recovery.passes_complete & BIT_ULL(BCH_RECOVERY_PASS_journal_replay)) {
+ scoped_guard(rcu)
+ bkey_for_each_ptr(bch2_bkey_ptrs(bkey_i_to_s(&b->key)), ptr) {
+ struct bch_dev *ca2 = bch2_dev_rcu(c, ptr->dev);
+
+ if (!ca2 || ca2->mi.state != BCH_MEMBER_STATE_rw) {
+ set_btree_node_need_rewrite(b);
+ set_btree_node_need_rewrite_degraded(b);
+ }
}
- }
+ }
if (!ptr_written) {
set_btree_node_need_rewrite(b);
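
The comment above gates the degraded-node rewrite on BCH_RECOVERY_PASS_journal_replay having completed, recorded as a bit in c->recovery.passes_complete. As a rough, self-contained sketch of that gating pattern (fs_state, PASS_JOURNAL_REPLAY and may_schedule_rewrite below are illustrative stand-ins, not the bcachefs API):

/*
 * Minimal userspace sketch: an action that can cascade into further
 * updates (here, flagging a node for rewrite) is only taken once a given
 * recovery pass is recorded as complete in a per-fs bitmask.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define PASS_BIT(p)	(1ULL << (p))

enum recovery_pass {
	PASS_JOURNAL_REPLAY,
	PASS_CHECK_ALLOC,
	PASS_NR,
};

struct fs_state {
	uint64_t passes_complete;	/* one bit per completed pass */
};

static bool may_schedule_rewrite(const struct fs_state *fs)
{
	/* defer rewrites that would generate more updates during replay */
	return fs->passes_complete & PASS_BIT(PASS_JOURNAL_REPLAY);
}

int main(void)
{
	struct fs_state fs = { .passes_complete = 0 };

	printf("during replay: %d\n", may_schedule_rewrite(&fs));	/* 0 */

	fs.passes_complete |= PASS_BIT(PASS_JOURNAL_REPLAY);
	printf("after replay:  %d\n", may_schedule_rewrite(&fs));	/* 1 */

	return 0;
}

Once the pass bit is set, a rewrite can no longer feed back into journal replay, so flagging the node is safe; before that, the workaround simply skips the flagging, at the cost of letting degraded nodes persist, as the comment notes.
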
diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c
index e848e210a9bf..3968f3be7f3b 100644
--- a/fs/bcachefs/data_update.c
+++ b/fs/bcachefs/data_update.c
@@ -783,6 +783,9 @@ static int can_write_extent(struct bch_fs *c, struct data_update *m)
darray_for_each(m->op.devs_have, i)
__clear_bit(*i, devs.d);
+ CLASS(printbuf, buf)();
+ buf.atomic++;
+
guard(rcu)();
unsigned nr_replicas = 0, i;
@@ -794,7 +797,11 @@ static int can_write_extent(struct bch_fs *c, struct data_update *m)
struct bch_dev_usage usage;
bch2_dev_usage_read_fast(ca, &usage);
- if (!dev_buckets_free(ca, usage, m->op.watermark))
+ u64 nr_free = dev_buckets_free(ca, usage, m->op.watermark);
+
+ prt_printf(&buf, "%s=%llu ", ca->name, nr_free);
+
+ if (!nr_free)
continue;
nr_replicas += ca->mi.durability;
@@ -802,8 +809,10 @@ static int can_write_extent(struct bch_fs *c, struct data_update *m)
break;
}
- if (!nr_replicas)
+ if (!nr_replicas) {
+ trace_data_update_done_no_rw_devs(c, buf.buf);
return bch_err_throw(c, data_update_done_no_rw_devs);
+ }
if (nr_replicas < m->op.nr_replicas)
return bch_err_throw(c, insufficient_devices);
return 0;
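
The data_update.c hunk records each device's free-bucket count into a printbuf so that, when no writable device has space, the new trace_data_update_done_no_rw_devs event can report exactly what was seen. A standalone sketch of that check - skip devices with no free buckets, sum durability until the target replica count is reached, and keep a loggable record along the way (dev_info, can_write_replicas and the return values are made-up stand-ins, not bcachefs code):

#include <stdint.h>
#include <stdio.h>

struct dev_info {
	const char	*name;
	unsigned	durability;	/* replicas this device counts for */
	uint64_t	buckets_free;
};

static int can_write_replicas(const struct dev_info *devs, unsigned nr_devs,
			      unsigned want_replicas, char *log, size_t log_sz)
{
	unsigned nr_replicas = 0;
	size_t off = 0;

	for (unsigned i = 0; i < nr_devs; i++) {
		/* record what each device had free, for later tracing */
		if (off < log_sz)
			off += snprintf(log + off, log_sz - off, "%s=%llu ",
					devs[i].name,
					(unsigned long long) devs[i].buckets_free);

		if (!devs[i].buckets_free)
			continue;

		nr_replicas += devs[i].durability;
		if (nr_replicas >= want_replicas)
			break;
	}

	if (!nr_replicas)
		return -1;	/* no writable device has space; caller traces 'log' */
	if (nr_replicas < want_replicas)
		return -2;	/* some space, but not enough devices */
	return 0;
}

int main(void)
{
	struct dev_info devs[] = {
		{ "sda", 1, 0 },
		{ "sdb", 1, 128 },
	};
	char log[128] = "";

	int ret = can_write_replicas(devs, 2, 2, log, sizeof(log));
	printf("ret=%d devs: %s\n", ret, log);
	return 0;
}

Building the log unconditionally, as the hunk does with the printbuf, means the "no rw devs" failure path can emit the per-device state without a second pass over the devices.
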
diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c
index f9bc99eb2d02..3b0783f117ae 100644
--- a/fs/bcachefs/fs.c
+++ b/fs/bcachefs/fs.c
@@ -1692,11 +1692,15 @@ static int bch2_fileattr_set(struct mnt_idmap *idmap,
s.mask = map_defined(bch_flags_to_xflags);
s.flags |= map_flags_rev(bch_flags_to_xflags, fa->fsx_xflags);
- if (fa->fsx_xflags)
- return bch_err_throw(c, unsupported_fsx_flag);
+ if (fa->fsx_xflags) {
+ ret = bch_err_throw(c, unsupported_fsx_flag);
+ goto err;
+ }
- if (fa->fsx_projid >= U32_MAX)
- return bch_err_throw(c, projid_too_big);
+ if (fa->fsx_projid >= U32_MAX) {
+ ret = bch_err_throw(c, projid_too_big);
+ goto err;
+ }
/*
* inode fields accessible via the xattr interface are stored with a +1
@@ -1718,8 +1722,10 @@ static int bch2_fileattr_set(struct mnt_idmap *idmap,
fa->flags &= ~FS_CASEFOLD_FL;
s.flags |= map_flags_rev(bch_flags_to_uflags, fa->flags);
- if (fa->flags)
- return bch_err_throw(c, unsupported_fa_flag);
+ if (fa->flags) {
+ ret = bch_err_throw(c, unsupported_fa_flag);
+ goto err;
+ }
}
mutex_lock(&inode->ei_update_lock);
@@ -1730,7 +1736,7 @@ static int bch2_fileattr_set(struct mnt_idmap *idmap,
bch2_write_inode(c, inode, fssetxattr_inode_update_fn, &s,
ATTR_CTIME);
mutex_unlock(&inode->ei_update_lock);
-
+err:
return bch2_err_class(ret);
}
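
The fs.c hunk converts early returns into goto err so that every failure path still passes ret through bch2_err_class() at the single exit. A minimal illustration of that single-exit pattern (set_attrs and translate_err are hypothetical stand-ins):

#include <errno.h>
#include <stdio.h>

static int translate_err(int err)
{
	/* e.g. map a private error code onto a standard errno */
	return err ? -EOPNOTSUPP : 0;
}

static int set_attrs(unsigned xflags, unsigned long long projid)
{
	int ret = 0;

	if (xflags) {
		ret = 1;		/* unsupported flag left over */
		goto err;
	}

	if (projid >= 0xffffffffULL) {
		ret = 2;		/* project id out of range */
		goto err;
	}

	/* ... do the update under the relevant lock ... */
err:
	return translate_err(ret);	/* every path is translated here */
}

int main(void)
{
	printf("%d\n", set_attrs(0, 42));	/* 0 */
	printf("%d\n", set_attrs(1, 42));	/* -EOPNOTSUPP */
	return 0;
}

With the direct returns, the two early failure cases skipped the error-class translation; funneling them through the err label fixes that without duplicating the translation call.
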
diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h
index b5dae1145afa..9324ef32903d 100644
--- a/fs/bcachefs/trace.h
+++ b/fs/bcachefs/trace.h
@@ -1330,6 +1330,11 @@ DEFINE_EVENT(fs_str, data_update,
TP_ARGS(c, str)
);
+DEFINE_EVENT(fs_str, data_update_done_no_rw_devs,
+ TP_PROTO(struct bch_fs *c, const char *str),
+ TP_ARGS(c, str)
+);
+
DEFINE_EVENT(fs_str, io_move_pred,
TP_PROTO(struct bch_fs *c, const char *str),
TP_ARGS(c, str)