diff options
-rw-r--r-- | fs/bcachefs/btree_io.c | 43 | ||||
-rw-r--r-- | fs/bcachefs/data_update.c | 13 | ||||
-rw-r--r-- | fs/bcachefs/fs.c | 20 | ||||
-rw-r--r-- | fs/bcachefs/trace.h | 5 |
4 files changed, 64 insertions, 17 deletions
diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index 19fd951495ac..84e302afc8fc 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -1337,15 +1337,42 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, btree_node_reset_sib_u64s(b); - scoped_guard(rcu) - bkey_for_each_ptr(bch2_bkey_ptrs(bkey_i_to_s(&b->key)), ptr) { - struct bch_dev *ca2 = bch2_dev_rcu(c, ptr->dev); - - if (!ca2 || ca2->mi.state != BCH_MEMBER_STATE_rw) { - set_btree_node_need_rewrite(b); - set_btree_node_need_rewrite_degraded(b); + /* + * XXX: + * + * We deadlock if too many btree updates require node rewrites while + * we're still in journal replay. + * + * This is because btree node rewrites generate more updates for the + * interior updates (alloc, backpointers), and if those updates touch + * new nodes and generate more rewrites - well, you see the problem. + * + * The biggest cause is that we don't use the btree write buffer (for + * the backpointer updates) - this needs some real thought on locking in + * order to fix. + * + * The problem with this workaround (not doing the rewrite for degraded + * nodes in journal replay) is that those degraded nodes persist, and we + * don't want that (this is a real bug when a btree node write completes + * with fewer replicas than we wanted and leaves a degraded node due to + * device _removal_, i.e. the device went away mid write). + * + * It's less of a bug here, but still a problem because we don't yet + * have a way of tracking degraded data - we need another index (all + * extents/btree nodes, by replicas entry) in order to fix properly + * (re-replicate degraded data at the earliest possible time). 
+ */ + if (c->recovery.passes_complete & BIT_ULL(BCH_RECOVERY_PASS_journal_replay)) { + scoped_guard(rcu) + bkey_for_each_ptr(bch2_bkey_ptrs(bkey_i_to_s(&b->key)), ptr) { + struct bch_dev *ca2 = bch2_dev_rcu(c, ptr->dev); + + if (!ca2 || ca2->mi.state != BCH_MEMBER_STATE_rw) { + set_btree_node_need_rewrite(b); + set_btree_node_need_rewrite_degraded(b); + } } - } + } if (!ptr_written) { set_btree_node_need_rewrite(b); diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c index e848e210a9bf..3968f3be7f3b 100644 --- a/fs/bcachefs/data_update.c +++ b/fs/bcachefs/data_update.c @@ -783,6 +783,9 @@ static int can_write_extent(struct bch_fs *c, struct data_update *m) darray_for_each(m->op.devs_have, i) __clear_bit(*i, devs.d); + CLASS(printbuf, buf)(); + buf.atomic++; + guard(rcu)(); unsigned nr_replicas = 0, i; @@ -794,7 +797,11 @@ static int can_write_extent(struct bch_fs *c, struct data_update *m) struct bch_dev_usage usage; bch2_dev_usage_read_fast(ca, &usage); - if (!dev_buckets_free(ca, usage, m->op.watermark)) + u64 nr_free = dev_buckets_free(ca, usage, m->op.watermark); + + prt_printf(&buf, "%s=%llu ", ca->name, nr_free); + + if (!nr_free) continue; nr_replicas += ca->mi.durability; @@ -802,8 +809,10 @@ static int can_write_extent(struct bch_fs *c, struct data_update *m) break; } - if (!nr_replicas) + if (!nr_replicas) { + trace_data_update_done_no_rw_devs(c, buf.buf); return bch_err_throw(c, data_update_done_no_rw_devs); + } if (nr_replicas < m->op.nr_replicas) return bch_err_throw(c, insufficient_devices); return 0; diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index f9bc99eb2d02..3b0783f117ae 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -1692,11 +1692,15 @@ static int bch2_fileattr_set(struct mnt_idmap *idmap, s.mask = map_defined(bch_flags_to_xflags); s.flags |= map_flags_rev(bch_flags_to_xflags, fa->fsx_xflags); - if (fa->fsx_xflags) - return bch_err_throw(c, unsupported_fsx_flag); + if (fa->fsx_xflags) { + ret = bch_err_throw(c, 
unsupported_fsx_flag); + goto err; + } - if (fa->fsx_projid >= U32_MAX) - return bch_err_throw(c, projid_too_big); + if (fa->fsx_projid >= U32_MAX) { + ret = bch_err_throw(c, projid_too_big); + goto err; + } /* * inode fields accessible via the xattr interface are stored with a +1 @@ -1718,8 +1722,10 @@ static int bch2_fileattr_set(struct mnt_idmap *idmap, fa->flags &= ~FS_CASEFOLD_FL; s.flags |= map_flags_rev(bch_flags_to_uflags, fa->flags); - if (fa->flags) - return bch_err_throw(c, unsupported_fa_flag); + if (fa->flags) { + ret = bch_err_throw(c, unsupported_fa_flag); + goto err; + } } mutex_lock(&inode->ei_update_lock); @@ -1730,7 +1736,7 @@ static int bch2_fileattr_set(struct mnt_idmap *idmap, bch2_write_inode(c, inode, fssetxattr_inode_update_fn, &s, ATTR_CTIME); mutex_unlock(&inode->ei_update_lock); - +err: return bch2_err_class(ret); } diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h index b5dae1145afa..9324ef32903d 100644 --- a/fs/bcachefs/trace.h +++ b/fs/bcachefs/trace.h @@ -1330,6 +1330,11 @@ DEFINE_EVENT(fs_str, data_update, TP_ARGS(c, str) ); +DEFINE_EVENT(fs_str, data_update_done_no_rw_devs, + TP_PROTO(struct bch_fs *c, const char *str), + TP_ARGS(c, str) +); + DEFINE_EVENT(fs_str, io_move_pred, TP_PROTO(struct bch_fs *c, const char *str), TP_ARGS(c, str) |