summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--fs/bcachefs/btree_io.c7
-rw-r--r--fs/bcachefs/btree_trans_commit.c19
-rw-r--r--fs/bcachefs/btree_update_interior.c41
-rw-r--r--fs/bcachefs/dirent.c11
-rw-r--r--fs/bcachefs/ec.c3
-rw-r--r--fs/bcachefs/extents.c14
-rw-r--r--fs/bcachefs/extents.h2
-rw-r--r--fs/bcachefs/fs.c8
-rw-r--r--fs/bcachefs/journal.c2
-rw-r--r--fs/bcachefs/journal.h2
-rw-r--r--fs/bcachefs/journal_io.c6
-rw-r--r--fs/bcachefs/journal_reclaim.c29
-rw-r--r--fs/bcachefs/move.c96
-rw-r--r--fs/bcachefs/move.h12
-rw-r--r--fs/bcachefs/recovery.c59
-rw-r--r--fs/bcachefs/recovery_passes.c38
-rw-r--r--fs/bcachefs/recovery_passes.h3
-rw-r--r--fs/bcachefs/str_hash.c1
-rw-r--r--fs/bcachefs/super-io.c7
-rw-r--r--fs/bcachefs/super.c8
-rw-r--r--fs/bcachefs/sysfs.c8
-rw-r--r--fs/bcachefs/util.h7
-rw-r--r--include/linux/workqueue.h12
-rw-r--r--kernel/workqueue.c14
-rw-r--r--lib/closure.c12
-rw-r--r--mm/shrinker.c14
26 files changed, 274 insertions, 161 deletions
diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c
index c0fba9016d6a..bd86dd7151a1 100644
--- a/fs/bcachefs/btree_io.c
+++ b/fs/bcachefs/btree_io.c
@@ -2540,9 +2540,14 @@ do_write:
}
count_event(c, btree_node_write);
+ /*
+ * blk-wbt.c throttles all writes except those that have both REQ_SYNC
+ * and REQ_IDLE set...
+ */
+
wbio = container_of(bio_alloc_bioset(NULL,
buf_pages(data, sectors_to_write << 9),
- REQ_OP_WRITE|REQ_META,
+ REQ_OP_WRITE|REQ_META|REQ_SYNC|REQ_IDLE,
GFP_NOFS,
&c->btree_bio),
struct btree_write_bio, wbio.bio);
diff --git a/fs/bcachefs/btree_trans_commit.c b/fs/bcachefs/btree_trans_commit.c
index 1f9965ae610c..58590ccc26bd 100644
--- a/fs/bcachefs/btree_trans_commit.c
+++ b/fs/bcachefs/btree_trans_commit.c
@@ -772,12 +772,13 @@ bch2_trans_commit_write_locked(struct btree_trans *trans,
trans->journal_res.offset += trans->journal_entries.u64s;
trans->journal_res.u64s -= trans->journal_entries.u64s;
- memcpy_u64s_small(bch2_journal_add_entry(j, &trans->journal_res,
- BCH_JSET_ENTRY_write_buffer_keys,
- BTREE_ID_accounting, 0,
- trans->accounting.u64s)->_data,
- btree_trans_subbuf_base(trans, &trans->accounting),
- trans->accounting.u64s);
+ if (trans->accounting.u64s)
+ memcpy_u64s_small(bch2_journal_add_entry(j, &trans->journal_res,
+ BCH_JSET_ENTRY_write_buffer_keys,
+ BTREE_ID_accounting, 0,
+ trans->accounting.u64s)->_data,
+ btree_trans_subbuf_base(trans, &trans->accounting),
+ trans->accounting.u64s);
if (trans->journal_seq)
*trans->journal_seq = trans->journal_res.seq;
@@ -1065,11 +1066,15 @@ int __bch2_trans_commit(struct btree_trans *trans, enum bch_trans_commit_flags f
EBUG_ON(test_bit(BCH_FS_clean_shutdown, &c->flags));
- journal_u64s = jset_u64s(trans->accounting.u64s);
+ journal_u64s = 0;
+
trans->journal_transaction_names = READ_ONCE(c->opts.journal_transaction_names);
if (trans->journal_transaction_names)
journal_u64s += jset_u64s(JSET_ENTRY_LOG_U64s);
+ if (trans->accounting.u64s)
+ journal_u64s += jset_u64s(trans->accounting.u64s);
+
trans_for_each_update(trans, i) {
struct btree_path *path = trans->paths + i->path;
diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c
index 312ef203b27b..e4aa4fa749bc 100644
--- a/fs/bcachefs/btree_update_interior.c
+++ b/fs/bcachefs/btree_update_interior.c
@@ -14,6 +14,7 @@
#include "btree_locking.h"
#include "buckets.h"
#include "clock.h"
+#include "disk_groups.h"
#include "enumerated_ref.h"
#include "error.h"
#include "extents.h"
@@ -277,6 +278,36 @@ static void bch2_btree_node_free_never_used(struct btree_update *as,
bch2_trans_node_drop(trans, b);
}
+static bool can_use_btree_node(struct bch_fs *c,
+ struct disk_reservation *res,
+ unsigned target,
+ struct bkey_s_c k)
+{
+ if (!bch2_bkey_devs_rw(c, k))
+ return false;
+
+ if (target && !bch2_bkey_in_target(c, k, target))
+ return false;
+
+ unsigned durability = bch2_bkey_durability(c, k);
+
+ if (durability >= res->nr_replicas)
+ return true;
+
+ struct bch_devs_mask devs = target_rw_devs(c, BCH_DATA_btree, target);
+
+ guard(rcu)();
+
+ unsigned durability_available = 0, i;
+ for_each_set_bit(i, devs.d, BCH_SB_MEMBERS_MAX) {
+ struct bch_dev *ca = bch2_dev_rcu_noerror(c, i);
+ if (ca)
+ durability_available += ca->mi.durability;
+ }
+
+ return durability >= durability_available;
+}
+
static struct btree *__bch2_btree_node_alloc(struct btree_trans *trans,
struct disk_reservation *res,
struct closure *cl,
@@ -303,10 +334,14 @@ static struct btree *__bch2_btree_node_alloc(struct btree_trans *trans,
mutex_lock(&c->btree_reserve_cache_lock);
if (c->btree_reserve_cache_nr > nr_reserve) {
for (struct btree_alloc *a = c->btree_reserve_cache;
- a < c->btree_reserve_cache + c->btree_reserve_cache_nr;
- a++) {
- if (target && !bch2_bkey_in_target(c, bkey_i_to_s_c(&a->k), target))
+ a < c->btree_reserve_cache + c->btree_reserve_cache_nr;) {
+ /* check if it has sufficient durability */
+
+ if (!can_use_btree_node(c, res, target, bkey_i_to_s_c(&a->k))) {
+ bch2_open_buckets_put(c, &a->ob);
+ *a = c->btree_reserve_cache[--c->btree_reserve_cache_nr];
continue;
+ }
bkey_copy(&b->key, &a->k);
b->ob = a->ob;
diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c
index e27cf62d3a5e..dd60c47528da 100644
--- a/fs/bcachefs/dirent.c
+++ b/fs/bcachefs/dirent.c
@@ -262,6 +262,7 @@ int bch2_dirent_init_name(struct bch_fs *c,
memcpy(&dirent->v.d_cf_name_block.d_names[0], name->name, name->len);
char *cf_out = &dirent->v.d_cf_name_block.d_names[name->len];
+ void *val_end = bkey_val_end(bkey_i_to_s(&dirent->k_i));
if (cf_name) {
cf_len = cf_name->len;
@@ -269,16 +270,14 @@ int bch2_dirent_init_name(struct bch_fs *c,
memcpy(cf_out, cf_name->name, cf_name->len);
} else {
cf_len = utf8_casefold(hash_info->cf_encoding, name,
- cf_out,
- bkey_val_end(bkey_i_to_s(&dirent->k_i)) - (void *) cf_out);
+ cf_out, val_end - (void *) cf_out);
if (cf_len <= 0)
return cf_len;
}
- memset(&dirent->v.d_cf_name_block.d_names[name->len + cf_len], 0,
- bkey_val_bytes(&dirent->k) -
- offsetof(struct bch_dirent, d_cf_name_block.d_names) -
- name->len + cf_len);
+ void *name_end = &dirent->v.d_cf_name_block.d_names[name->len + cf_len];
+ BUG_ON(name_end > val_end);
+ memset(name_end, 0, val_end - name_end);
dirent->v.d_cf_name_block.d_name_len = cpu_to_le16(name->len);
dirent->v.d_cf_name_block.d_cf_name_len = cpu_to_le16(cf_len);
diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c
index 62dda821247e..bea14f02114f 100644
--- a/fs/bcachefs/ec.c
+++ b/fs/bcachefs/ec.c
@@ -2060,6 +2060,9 @@ allocated:
BUG_ON(trans->restarted);
return h;
err:
+ if (waiting &&
+ !bch2_err_matches(ret, BCH_ERR_operation_blocked))
+ closure_wake_up(&c->freelist_wait);
bch2_ec_stripe_head_put(c, h);
return ERR_PTR(ret);
}
diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c
index b36ecfc0ab9d..8152ef1cbbcd 100644
--- a/fs/bcachefs/extents.c
+++ b/fs/bcachefs/extents.c
@@ -1006,6 +1006,20 @@ const struct bch_extent_ptr *bch2_bkey_has_device_c(struct bkey_s_c k, unsigned
return NULL;
}
+bool bch2_bkey_devs_rw(struct bch_fs *c, struct bkey_s_c k)
+{
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+
+ guard(rcu)();
+ bkey_for_each_ptr(ptrs, ptr) {
+ CLASS(bch2_dev_tryget, ca)(c, ptr->dev);
+ if (!ca || ca->mi.state != BCH_MEMBER_STATE_rw)
+ return false;
+ }
+
+ return true;
+}
+
bool bch2_bkey_has_target(struct bch_fs *c, struct bkey_s_c k, unsigned target)
{
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h
index f212f91c278d..35ee03cd5065 100644
--- a/fs/bcachefs/extents.h
+++ b/fs/bcachefs/extents.h
@@ -614,6 +614,8 @@ static inline struct bch_extent_ptr *bch2_bkey_has_device(struct bkey_s k, unsig
return (void *) bch2_bkey_has_device_c(k.s_c, dev);
}
+bool bch2_bkey_devs_rw(struct bch_fs *, struct bkey_s_c);
+
bool bch2_bkey_has_target(struct bch_fs *, struct bkey_s_c, unsigned);
bool bch2_bkey_in_target(struct bch_fs *, struct bkey_s_c, unsigned);
diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c
index 2789b30add10..56b7126bc31d 100644
--- a/fs/bcachefs/fs.c
+++ b/fs/bcachefs/fs.c
@@ -1295,8 +1295,14 @@ static int bch2_fill_extent(struct bch_fs *c,
flags|
FIEMAP_EXTENT_DELALLOC|
FIEMAP_EXTENT_UNWRITTEN);
+ } else if (k.k->type == KEY_TYPE_error) {
+ return 0;
} else {
- BUG();
+ WARN_ONCE(1, "unhandled key type %s",
+ k.k->type < KEY_TYPE_MAX
+ ? bch2_bkey_types[k.k->type]
+ : "(unknown)");
+ return 0;
}
}
diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c
index 3ba1f9fd3402..de03e20f6e30 100644
--- a/fs/bcachefs/journal.c
+++ b/fs/bcachefs/journal.c
@@ -1063,6 +1063,8 @@ static struct journal_buf *__bch2_next_write_buffer_flush_journal_buf(struct jou
ret = journal_state_count(s, idx & JOURNAL_STATE_BUF_MASK) > open
? ERR_PTR(-EAGAIN)
: buf;
+ if (!ret)
+ smp_mb();
break;
}
}
diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h
index b46b9718d841..c05aa94237f8 100644
--- a/fs/bcachefs/journal.h
+++ b/fs/bcachefs/journal.h
@@ -267,7 +267,7 @@ static inline union journal_res_state journal_state_buf_put(struct journal *j, u
{
union journal_res_state s;
- s.v = atomic64_sub_return(((union journal_res_state) {
+ s.v = atomic64_sub_return_release(((union journal_res_state) {
.buf0_count = idx == 0,
.buf1_count = idx == 1,
.buf2_count = idx == 2,
diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c
index 94a065ee0b40..2835250a14c4 100644
--- a/fs/bcachefs/journal_io.c
+++ b/fs/bcachefs/journal_io.c
@@ -1867,7 +1867,11 @@ static CLOSURE_CALLBACK(journal_write_submit)
jbio->submit_time = local_clock();
- bio_reset(bio, ca->disk_sb.bdev, REQ_OP_WRITE|REQ_SYNC|REQ_META);
+ /*
+ * blk-wbt.c throttles all writes except those that have both
+ * REQ_SYNC and REQ_IDLE set...
+ */
+ bio_reset(bio, ca->disk_sb.bdev, REQ_OP_WRITE|REQ_SYNC|REQ_IDLE|REQ_META);
bio->bi_iter.bi_sector = ptr->offset;
bio->bi_end_io = journal_write_endio;
bio->bi_private = ca;
diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c
index be50455c7f13..f23e5ee9ad75 100644
--- a/fs/bcachefs/journal_reclaim.c
+++ b/fs/bcachefs/journal_reclaim.c
@@ -874,7 +874,34 @@ static int journal_flush_done(struct journal *j, u64 seq_to_flush,
--type)
if (journal_flush_pins_or_still_flushing(j, seq_to_flush, BIT(type))) {
*did_work = true;
- return ret;
+
+ /*
+ * Question from Dan Carpenter, on the early return:
+ *
+ * If journal_flush_pins_or_still_flushing() returns
+ * true, then the flush hasn't completed and we must
+ * return 0; we want the outer closure_wait_event() in
+ * journal_flush_pins() to continue.
+ *
+ * The early return is there because we don't want to
+ * call journal_entry_close() until we've finished
+ * flushing all outstanding journal pins - otherwise
+ * seq_to_flush can be U64_MAX, and we'll close a bunch
+ * of journal entries and write tiny ones completely
+ * unnecessarily.
+ *
+ * Having the early return be in the loop where we loop
+ * over types is important, because flushing one journal
+ * pin can cause new journal pins to be added (even of
+ * the same type, btree node writes may generate more
+ * btree node writes, when updating the parent pointer
+ * finds a full node and has to trigger a split/compact).
+ *
+ * This is part of our shutdown sequence, where order of
+ * flushing is important in order to make sure that it
+ * terminates...
+ */
+ return 0;
}
if (seq_to_flush > journal_cur_seq(j))
diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c
index 3f44bb54f91a..54dd6fec81db 100644
--- a/fs/bcachefs/move.c
+++ b/fs/bcachefs/move.c
@@ -795,50 +795,50 @@ out:
return ret;
}
-int __bch2_move_data(struct moving_context *ctxt,
- struct bbpos start,
- struct bbpos end,
- move_pred_fn pred, void *arg)
+static int bch2_move_data(struct bch_fs *c,
+ struct bbpos start,
+ struct bbpos end,
+ unsigned min_depth,
+ struct bch_ratelimit *rate,
+ struct bch_move_stats *stats,
+ struct write_point_specifier wp,
+ bool wait_on_copygc,
+ move_pred_fn pred, void *arg)
{
- struct bch_fs *c = ctxt->trans->c;
- enum btree_id id;
int ret = 0;
- for (id = start.btree;
+ struct moving_context ctxt;
+ bch2_moving_ctxt_init(&ctxt, c, rate, stats, wp, wait_on_copygc);
+
+ for (enum btree_id id = start.btree;
id <= min_t(unsigned, end.btree, btree_id_nr_alive(c) - 1);
id++) {
- ctxt->stats->pos = BBPOS(id, POS_MIN);
+ ctxt.stats->pos = BBPOS(id, POS_MIN);
- if (!btree_type_has_ptrs(id) ||
- !bch2_btree_id_root(c, id)->b)
+ if (!bch2_btree_id_root(c, id)->b)
continue;
- ret = bch2_move_data_btree(ctxt,
- id == start.btree ? start.pos : POS_MIN,
- id == end.btree ? end.pos : POS_MAX,
- pred, arg, id, 0);
+ unsigned min_depth_this_btree = min_depth;
+
+ if (!btree_type_has_ptrs(id))
+ min_depth_this_btree = max(min_depth_this_btree, 1);
+
+ for (unsigned level = min_depth_this_btree;
+ level < BTREE_MAX_DEPTH;
+ level++) {
+ ret = bch2_move_data_btree(&ctxt,
+ id == start.btree ? start.pos : POS_MIN,
+ id == end.btree ? end.pos : POS_MAX,
+ pred, arg, id, level);
+ if (ret)
+ break;
+ }
+
if (ret)
break;
}
- return ret;
-}
-
-int bch2_move_data(struct bch_fs *c,
- struct bbpos start,
- struct bbpos end,
- struct bch_ratelimit *rate,
- struct bch_move_stats *stats,
- struct write_point_specifier wp,
- bool wait_on_copygc,
- move_pred_fn pred, void *arg)
-{
- struct moving_context ctxt;
-
- bch2_moving_ctxt_init(&ctxt, c, rate, stats, wp, wait_on_copygc);
- int ret = __bch2_move_data(&ctxt, start, end, pred, arg);
bch2_moving_ctxt_exit(&ctxt);
-
return ret;
}
@@ -1206,14 +1206,6 @@ static bool migrate_pred(struct bch_fs *c, void *arg,
return data_opts->rewrite_ptrs != 0;
}
-static bool rereplicate_btree_pred(struct bch_fs *c, void *arg,
- struct btree *b,
- struct bch_io_opts *io_opts,
- struct data_update_opts *data_opts)
-{
- return rereplicate_pred(c, arg, b->c.btree_id, bkey_i_to_s_c(&b->key), io_opts, data_opts);
-}
-
/*
* Ancient versions of bcachefs produced packed formats which could represent
* keys that the in memory format cannot represent; this checks for those
@@ -1293,15 +1285,6 @@ static bool drop_extra_replicas_pred(struct bch_fs *c, void *arg,
return data_opts->kill_ptrs != 0;
}
-static bool drop_extra_replicas_btree_pred(struct bch_fs *c, void *arg,
- struct btree *b,
- struct bch_io_opts *io_opts,
- struct data_update_opts *data_opts)
-{
- return drop_extra_replicas_pred(c, arg, b->c.btree_id, bkey_i_to_s_c(&b->key),
- io_opts, data_opts);
-}
-
static bool scrub_pred(struct bch_fs *c, void *_arg,
enum btree_id btree, struct bkey_s_c k,
struct bch_io_opts *io_opts,
@@ -1359,14 +1342,11 @@ int bch2_data_job(struct bch_fs *c,
case BCH_DATA_OP_rereplicate:
stats->data_type = BCH_DATA_journal;
ret = bch2_journal_flush_device_pins(&c->journal, -1);
- ret = bch2_move_btree(c, start, end,
- rereplicate_btree_pred, c, stats) ?: ret;
- ret = bch2_move_data(c, start, end,
- NULL,
- stats,
+ ret = bch2_move_data(c, start, end, 0, NULL, stats,
writepoint_hashed((unsigned long) current),
true,
rereplicate_pred, c) ?: ret;
+ bch2_btree_interior_updates_flush(c);
ret = bch2_replicas_gc2(c) ?: ret;
break;
case BCH_DATA_OP_migrate:
@@ -1389,12 +1369,10 @@ int bch2_data_job(struct bch_fs *c,
ret = bch2_scan_old_btree_nodes(c, stats);
break;
case BCH_DATA_OP_drop_extra_replicas:
- ret = bch2_move_btree(c, start, end,
- drop_extra_replicas_btree_pred, c, stats) ?: ret;
- ret = bch2_move_data(c, start, end, NULL, stats,
- writepoint_hashed((unsigned long) current),
- true,
- drop_extra_replicas_pred, c) ?: ret;
+ ret = bch2_move_data(c, start, end, 0, NULL, stats,
+ writepoint_hashed((unsigned long) current),
+ true,
+ drop_extra_replicas_pred, c) ?: ret;
ret = bch2_replicas_gc2(c) ?: ret;
break;
default:
diff --git a/fs/bcachefs/move.h b/fs/bcachefs/move.h
index fe92ca6d418d..481026ff99ab 100644
--- a/fs/bcachefs/move.h
+++ b/fs/bcachefs/move.h
@@ -128,18 +128,6 @@ struct bch_io_opts *bch2_move_get_io_opts(struct btree_trans *,
int bch2_move_data_btree(struct moving_context *, struct bpos, struct bpos,
move_pred_fn, void *, enum btree_id, unsigned);
-int __bch2_move_data(struct moving_context *,
- struct bbpos,
- struct bbpos,
- move_pred_fn, void *);
-int bch2_move_data(struct bch_fs *,
- struct bbpos start,
- struct bbpos end,
- struct bch_ratelimit *,
- struct bch_move_stats *,
- struct write_point_specifier,
- bool,
- move_pred_fn, void *);
int bch2_move_data_phys(struct bch_fs *, unsigned, u64, u64, unsigned,
struct bch_ratelimit *, struct bch_move_stats *,
diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c
index a8eea4787a3e..58c159e5f10d 100644
--- a/fs/bcachefs/recovery.c
+++ b/fs/bcachefs/recovery.c
@@ -37,78 +37,79 @@ int bch2_btree_lost_data(struct bch_fs *c,
struct printbuf *msg,
enum btree_id btree)
{
- u64 b = BIT_ULL(btree);
int ret = 0;
guard(mutex)(&c->sb_lock);
+ bool write_sb = false;
struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext);
- if (!(c->sb.btrees_lost_data & b)) {
+ if (!(c->sb.btrees_lost_data & BIT_ULL(btree))) {
prt_printf(msg, "flagging btree ");
bch2_btree_id_to_text(msg, btree);
prt_printf(msg, " lost data\n");
- ext->btrees_lost_data |= cpu_to_le64(b);
+ write_sb |= !__test_and_set_bit_le64(btree, &ext->btrees_lost_data);
}
/* Once we have runtime self healing for topology errors we won't need this: */
- ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_topology, 0) ?: ret;
+ ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_topology, 0, &write_sb) ?: ret;
/* Btree node accounting will be off: */
- __set_bit_le64(BCH_FSCK_ERR_accounting_mismatch, ext->errors_silent);
- ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_allocations, 0) ?: ret;
+ write_sb |= !__test_and_set_bit_le64(BCH_FSCK_ERR_accounting_mismatch, ext->errors_silent);
+ ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_allocations, 0, &write_sb) ?: ret;
#ifdef CONFIG_BCACHEFS_DEBUG
/*
* These are much more minor, and don't need to be corrected right away,
* but in debug mode we want the next fsck run to be clean:
*/
- ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_lrus, 0) ?: ret;
- ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_backpointers_to_extents, 0) ?: ret;
+ ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_lrus, 0, &write_sb) ?: ret;
+ ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_backpointers_to_extents, 0, &write_sb) ?: ret;
#endif
switch (btree) {
case BTREE_ID_alloc:
- ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_alloc_info, 0) ?: ret;
-
- __set_bit_le64(BCH_FSCK_ERR_alloc_key_data_type_wrong, ext->errors_silent);
- __set_bit_le64(BCH_FSCK_ERR_alloc_key_gen_wrong, ext->errors_silent);
- __set_bit_le64(BCH_FSCK_ERR_alloc_key_dirty_sectors_wrong, ext->errors_silent);
- __set_bit_le64(BCH_FSCK_ERR_alloc_key_cached_sectors_wrong, ext->errors_silent);
- __set_bit_le64(BCH_FSCK_ERR_alloc_key_stripe_wrong, ext->errors_silent);
- __set_bit_le64(BCH_FSCK_ERR_alloc_key_stripe_redundancy_wrong, ext->errors_silent);
+ ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_alloc_info, 0, &write_sb) ?: ret;
+
+ write_sb |= !__test_and_set_bit_le64(BCH_FSCK_ERR_alloc_key_data_type_wrong, ext->errors_silent);
+ write_sb |= !__test_and_set_bit_le64(BCH_FSCK_ERR_alloc_key_gen_wrong, ext->errors_silent);
+ write_sb |= !__test_and_set_bit_le64(BCH_FSCK_ERR_alloc_key_dirty_sectors_wrong, ext->errors_silent);
+ write_sb |= !__test_and_set_bit_le64(BCH_FSCK_ERR_alloc_key_cached_sectors_wrong, ext->errors_silent);
+ write_sb |= !__test_and_set_bit_le64(BCH_FSCK_ERR_alloc_key_stripe_wrong, ext->errors_silent);
+ write_sb |= !__test_and_set_bit_le64(BCH_FSCK_ERR_alloc_key_stripe_redundancy_wrong, ext->errors_silent);
goto out;
case BTREE_ID_backpointers:
- ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_btree_backpointers, 0) ?: ret;
- ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_extents_to_backpointers, 0) ?: ret;
+ ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_btree_backpointers, 0, &write_sb) ?: ret;
+ ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_extents_to_backpointers, 0, &write_sb) ?: ret;
goto out;
case BTREE_ID_need_discard:
- ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_alloc_info, 0) ?: ret;
+ ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_alloc_info, 0, &write_sb) ?: ret;
goto out;
case BTREE_ID_freespace:
- ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_alloc_info, 0) ?: ret;
+ ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_alloc_info, 0, &write_sb) ?: ret;
goto out;
case BTREE_ID_bucket_gens:
- ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_alloc_info, 0) ?: ret;
+ ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_alloc_info, 0, &write_sb) ?: ret;
goto out;
case BTREE_ID_lru:
- ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_alloc_info, 0) ?: ret;
+ ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_alloc_info, 0, &write_sb) ?: ret;
goto out;
case BTREE_ID_accounting:
- ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_allocations, 0) ?: ret;
+ ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_allocations, 0, &write_sb) ?: ret;
goto out;
case BTREE_ID_snapshots:
- ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_reconstruct_snapshots, 0) ?: ret;
- ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_topology, 0) ?: ret;
- ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_scan_for_btree_nodes, 0) ?: ret;
+ ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_reconstruct_snapshots, 0, &write_sb) ?: ret;
+ ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_topology, 0, &write_sb) ?: ret;
+ ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_scan_for_btree_nodes, 0, &write_sb) ?: ret;
goto out;
default:
- ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_topology, 0) ?: ret;
- ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_scan_for_btree_nodes, 0) ?: ret;
+ ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_topology, 0, &write_sb) ?: ret;
+ ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_scan_for_btree_nodes, 0, &write_sb) ?: ret;
goto out;
}
out:
- bch2_write_super(c);
+ if (write_sb)
+ bch2_write_super(c);
return ret;
}
diff --git a/fs/bcachefs/recovery_passes.c b/fs/bcachefs/recovery_passes.c
index f9d1c4921392..b2cdd111fd0e 100644
--- a/fs/bcachefs/recovery_passes.c
+++ b/fs/bcachefs/recovery_passes.c
@@ -340,7 +340,8 @@ static bool recovery_pass_needs_set(struct bch_fs *c,
int __bch2_run_explicit_recovery_pass(struct bch_fs *c,
struct printbuf *out,
enum bch_recovery_pass pass,
- enum bch_run_recovery_pass_flags flags)
+ enum bch_run_recovery_pass_flags flags,
+ bool *write_sb)
{
struct bch_fs_recovery *r = &c->recovery;
int ret = 0;
@@ -362,7 +363,8 @@ int __bch2_run_explicit_recovery_pass(struct bch_fs *c,
if (!(flags & RUN_RECOVERY_PASS_nopersistent)) {
struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext);
- __set_bit_le64(bch2_recovery_pass_to_stable(pass), ext->recovery_passes_required);
+ *write_sb |= !__test_and_set_bit_le64(bch2_recovery_pass_to_stable(pass),
+ ext->recovery_passes_required);
}
if (pass < BCH_RECOVERY_PASS_set_may_go_rw &&
@@ -408,14 +410,19 @@ int bch2_run_explicit_recovery_pass(struct bch_fs *c,
enum bch_recovery_pass pass,
enum bch_run_recovery_pass_flags flags)
{
- int ret = 0;
+ /*
+ * With RUN_RECOVERY_PASS_ratelimit, recovery_pass_needs_set needs
+ * sb_lock
+ */
+ if (!(flags & RUN_RECOVERY_PASS_ratelimit) &&
+ !recovery_pass_needs_set(c, pass, &flags))
+ return 0;
- if (recovery_pass_needs_set(c, pass, &flags)) {
- guard(mutex)(&c->sb_lock);
- ret = __bch2_run_explicit_recovery_pass(c, out, pass, flags);
+ guard(mutex)(&c->sb_lock);
+ bool write_sb = false;
+ int ret = __bch2_run_explicit_recovery_pass(c, out, pass, flags, &write_sb);
+ if (write_sb)
bch2_write_super(c);
- }
-
return ret;
}
@@ -438,14 +445,13 @@ int bch2_require_recovery_pass(struct bch_fs *c,
return 0;
enum bch_run_recovery_pass_flags flags = 0;
- int ret = 0;
- if (recovery_pass_needs_set(c, pass, &flags)) {
- ret = __bch2_run_explicit_recovery_pass(c, out, pass, flags);
+ bool write_sb = false;
+ int ret = __bch2_run_explicit_recovery_pass(c, out, pass, flags, &write_sb) ?:
+ bch_err_throw(c, recovery_pass_will_run);
+ if (write_sb)
bch2_write_super(c);
- }
-
- return ret ?: bch_err_throw(c, recovery_pass_will_run);
+ return ret;
}
int bch2_run_print_explicit_recovery_pass(struct bch_fs *c, enum bch_recovery_pass pass)
@@ -459,8 +465,10 @@ int bch2_run_print_explicit_recovery_pass(struct bch_fs *c, enum bch_recovery_pa
bch2_log_msg_start(c, &buf);
guard(mutex)(&c->sb_lock);
+ bool write_sb = false;
int ret = __bch2_run_explicit_recovery_pass(c, &buf, pass,
- RUN_RECOVERY_PASS_nopersistent);
+ RUN_RECOVERY_PASS_nopersistent,
+ &write_sb);
bch2_print_str(c, KERN_NOTICE, buf.buf);
return ret;
diff --git a/fs/bcachefs/recovery_passes.h b/fs/bcachefs/recovery_passes.h
index 2117f0ce1922..4f2c2f811d5e 100644
--- a/fs/bcachefs/recovery_passes.h
+++ b/fs/bcachefs/recovery_passes.h
@@ -30,7 +30,8 @@ int bch2_run_print_explicit_recovery_pass(struct bch_fs *, enum bch_recovery_pas
int __bch2_run_explicit_recovery_pass(struct bch_fs *, struct printbuf *,
enum bch_recovery_pass,
- enum bch_run_recovery_pass_flags);
+ enum bch_run_recovery_pass_flags,
+ bool *);
int bch2_run_explicit_recovery_pass(struct bch_fs *, struct printbuf *,
enum bch_recovery_pass,
enum bch_run_recovery_pass_flags);
diff --git a/fs/bcachefs/str_hash.c b/fs/bcachefs/str_hash.c
index dfe4b6ae0733..3e08e55d2dc1 100644
--- a/fs/bcachefs/str_hash.c
+++ b/fs/bcachefs/str_hash.c
@@ -329,7 +329,6 @@ duplicate_entries:
out:
fsck_err:
bch2_trans_iter_exit(trans, dup_iter);
- printbuf_exit(&buf);
if (free_snapshots_seen)
darray_exit(&s->ids);
return ret;
diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c
index 820cb0f4fe57..40fa87ce1d09 100644
--- a/fs/bcachefs/super-io.c
+++ b/fs/bcachefs/super-io.c
@@ -991,7 +991,12 @@ static void write_one_super(struct bch_fs *c, struct bch_dev *ca, unsigned idx)
sb->csum = csum_vstruct(c, BCH_SB_CSUM_TYPE(sb),
null_nonce(), sb);
- bio_reset(bio, ca->disk_sb.bdev, REQ_OP_WRITE|REQ_SYNC|REQ_META);
+ /*
+ * blk-wbt.c throttles all writes except those that have both REQ_SYNC
+ * and REQ_IDLE set...
+ */
+
+ bio_reset(bio, ca->disk_sb.bdev, REQ_OP_WRITE|REQ_SYNC|REQ_IDLE|REQ_META);
bio->bi_iter.bi_sector = le64_to_cpu(sb->offset);
bio->bi_end_io = write_super_endio;
bio->bi_private = ca;
diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c
index 4e038f655f83..0fc0b2221036 100644
--- a/fs/bcachefs/super.c
+++ b/fs/bcachefs/super.c
@@ -514,6 +514,10 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early)
if (ret)
return ret;
+ ret = bch2_fs_mark_dirty(c);
+ if (ret)
+ return ret;
+
clear_bit(BCH_FS_clean_shutdown, &c->flags);
scoped_guard(rcu)
@@ -537,10 +541,6 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early)
bch2_journal_space_available(&c->journal);
}
- ret = bch2_fs_mark_dirty(c);
- if (ret)
- return ret;
-
/*
* Don't jump to our error path, and call bch2_fs_read_only(), unless we
* successfully marked the filesystem dirty
diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c
index 158f526e3dcc..bd3fa9c3372d 100644
--- a/fs/bcachefs/sysfs.c
+++ b/fs/bcachefs/sysfs.c
@@ -18,6 +18,7 @@
#include "btree_key_cache.h"
#include "btree_update.h"
#include "btree_update_interior.h"
+#include "btree_write_buffer.h"
#include "btree_gc.h"
#include "buckets.h"
#include "clock.h"
@@ -150,6 +151,7 @@ write_attribute(trigger_journal_flush);
write_attribute(trigger_journal_writes);
write_attribute(trigger_btree_cache_shrink);
write_attribute(trigger_btree_key_cache_shrink);
+write_attribute(trigger_btree_write_buffer_flush);
write_attribute(trigger_btree_updates);
write_attribute(trigger_freelist_wakeup);
write_attribute(trigger_recalc_capacity);
@@ -539,6 +541,11 @@ STORE(bch2_fs)
c->btree_key_cache.shrink->scan_objects(c->btree_key_cache.shrink, &sc);
}
+ if (attr == &sysfs_trigger_btree_write_buffer_flush)
+ bch2_trans_do(c,
+ (bch2_btree_write_buffer_flush_sync(trans),
+ bch2_trans_begin(trans)));
+
if (attr == &sysfs_trigger_gc)
bch2_gc_gens(c);
@@ -709,6 +716,7 @@ struct attribute *bch2_fs_internal_files[] = {
&sysfs_trigger_journal_writes,
&sysfs_trigger_btree_cache_shrink,
&sysfs_trigger_btree_key_cache_shrink,
+ &sysfs_trigger_btree_write_buffer_flush,
&sysfs_trigger_btree_updates,
&sysfs_trigger_freelist_wakeup,
&sysfs_trigger_recalc_capacity,
diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h
index 768528c2bae7..52ac8230be9f 100644
--- a/fs/bcachefs/util.h
+++ b/fs/bcachefs/util.h
@@ -733,6 +733,13 @@ static inline bool test_bit_le64(size_t bit, __le64 *addr)
return (addr[bit / 64] & cpu_to_le64(BIT_ULL(bit % 64))) != 0;
}
+static inline bool __test_and_set_bit_le64(size_t bit, __le64 *addr)
+{
+ bool ret = test_bit_le64(bit, addr);
+ __set_bit_le64(bit, addr);
+ return ret;
+}
+
static inline void memcpy_swab(void *_dst, void *_src, size_t len)
{
u8 *dst = _dst + len;
diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h
index 6e30f275da77..e907c9bb840c 100644
--- a/include/linux/workqueue.h
+++ b/include/linux/workqueue.h
@@ -6,6 +6,7 @@
#ifndef _LINUX_WORKQUEUE_H
#define _LINUX_WORKQUEUE_H
+#include <linux/alloc_tag.h>
#include <linux/timer.h>
#include <linux/linkage.h>
#include <linux/bitops.h>
@@ -505,7 +506,8 @@ void workqueue_softirq_dead(unsigned int cpu);
* Pointer to the allocated workqueue on success, %NULL on failure.
*/
__printf(1, 4) struct workqueue_struct *
-alloc_workqueue(const char *fmt, unsigned int flags, int max_active, ...);
+alloc_workqueue_noprof(const char *fmt, unsigned int flags, int max_active, ...);
+#define alloc_workqueue(...) alloc_hooks(alloc_workqueue_noprof(__VA_ARGS__))
#ifdef CONFIG_LOCKDEP
/**
@@ -544,8 +546,8 @@ alloc_workqueue_lockdep_map(const char *fmt, unsigned int flags, int max_active,
* Pointer to the allocated workqueue on success, %NULL on failure.
*/
#define alloc_ordered_workqueue_lockdep_map(fmt, flags, lockdep_map, args...) \
- alloc_workqueue_lockdep_map(fmt, WQ_UNBOUND | __WQ_ORDERED | (flags), \
- 1, lockdep_map, ##args)
+ alloc_hooks(alloc_workqueue_lockdep_map(fmt, WQ_UNBOUND | __WQ_ORDERED | (flags),\
+ 1, lockdep_map, ##args))
#endif
/**
@@ -577,7 +579,9 @@ alloc_workqueue_lockdep_map(const char *fmt, unsigned int flags, int max_active,
extern void destroy_workqueue(struct workqueue_struct *wq);
-struct workqueue_attrs *alloc_workqueue_attrs(void);
+struct workqueue_attrs *alloc_workqueue_attrs_noprof(void);
+#define alloc_workqueue_attrs(...) alloc_hooks(alloc_workqueue_attrs_noprof(__VA_ARGS__))
+
void free_workqueue_attrs(struct workqueue_attrs *attrs);
int apply_workqueue_attrs(struct workqueue_struct *wq,
const struct workqueue_attrs *attrs);
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 9f9148075828..992cb0467c21 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -4629,7 +4629,7 @@ void free_workqueue_attrs(struct workqueue_attrs *attrs)
*
* Return: The allocated new workqueue_attr on success. %NULL on failure.
*/
-struct workqueue_attrs *alloc_workqueue_attrs(void)
+struct workqueue_attrs *alloc_workqueue_attrs_noprof(void)
{
struct workqueue_attrs *attrs;
@@ -5682,12 +5682,12 @@ static struct workqueue_struct *__alloc_workqueue(const char *fmt,
else
wq_size = sizeof(*wq);
- wq = kzalloc(wq_size, GFP_KERNEL);
+ wq = kzalloc_noprof(wq_size, GFP_KERNEL);
if (!wq)
return NULL;
if (flags & WQ_UNBOUND) {
- wq->unbound_attrs = alloc_workqueue_attrs();
+ wq->unbound_attrs = alloc_workqueue_attrs_noprof();
if (!wq->unbound_attrs)
goto err_free_wq;
}
@@ -5777,9 +5777,9 @@ err_destroy:
}
__printf(1, 4)
-struct workqueue_struct *alloc_workqueue(const char *fmt,
- unsigned int flags,
- int max_active, ...)
+struct workqueue_struct *alloc_workqueue_noprof(const char *fmt,
+ unsigned int flags,
+ int max_active, ...)
{
struct workqueue_struct *wq;
va_list args;
@@ -5794,7 +5794,7 @@ struct workqueue_struct *alloc_workqueue(const char *fmt,
return wq;
}
-EXPORT_SYMBOL_GPL(alloc_workqueue);
+EXPORT_SYMBOL_GPL(alloc_workqueue_noprof);
#ifdef CONFIG_LOCKDEP
__printf(1, 5)
diff --git a/lib/closure.c b/lib/closure.c
index 2bfe7d2a0048..4fb78d18ee1b 100644
--- a/lib/closure.c
+++ b/lib/closure.c
@@ -13,23 +13,25 @@
#include <linux/seq_file.h>
#include <linux/sched/debug.h>
-static inline void closure_put_after_sub_checks(int flags)
+static inline void closure_put_after_sub_checks(struct closure *cl, int flags)
{
int r = flags & CLOSURE_REMAINING_MASK;
if (WARN(flags & CLOSURE_GUARD_MASK,
- "closure has guard bits set: %x (%u)",
+ "closure %ps has guard bits set: %x (%u)",
+ cl->fn,
flags & CLOSURE_GUARD_MASK, (unsigned) __fls(r)))
r &= ~CLOSURE_GUARD_MASK;
WARN(!r && (flags & ~CLOSURE_DESTRUCTOR),
- "closure ref hit 0 with incorrect flags set: %x (%u)",
+ "closure %ps ref hit 0 with incorrect flags set: %x (%u)",
+ cl->fn,
flags & ~CLOSURE_DESTRUCTOR, (unsigned) __fls(flags));
}
static inline void closure_put_after_sub(struct closure *cl, int flags)
{
- closure_put_after_sub_checks(flags);
+ closure_put_after_sub_checks(cl, flags);
if (!(flags & CLOSURE_REMAINING_MASK)) {
smp_acquire__after_ctrl_dep();
@@ -167,7 +169,7 @@ void __sched closure_return_sync(struct closure *cl)
unsigned flags = atomic_sub_return_release(1 + CLOSURE_RUNNING - CLOSURE_DESTRUCTOR,
&cl->remaining);
- closure_put_after_sub_checks(flags);
+ closure_put_after_sub_checks(cl, flags);
if (unlikely(flags & CLOSURE_REMAINING_MASK)) {
while (1) {
diff --git a/mm/shrinker.c b/mm/shrinker.c
index c56c1f824f79..c94eedf2cfd8 100644
--- a/mm/shrinker.c
+++ b/mm/shrinker.c
@@ -825,7 +825,12 @@ EXPORT_SYMBOL_GPL(shrinker_free);
void shrinker_to_text(struct seq_buf *out, struct shrinker *shrinker)
{
- struct shrink_control sc = { .gfp_mask = GFP_KERNEL, };
+ struct shrink_control sc = {
+ .gfp_mask = GFP_KERNEL,
+#ifdef CONFIG_MEMCG
+ .memcg = root_mem_cgroup,
+#endif
+ };
unsigned long nr_freed = atomic_long_read(&shrinker->objects_freed);
seq_buf_puts(out, shrinker->name);
@@ -867,7 +872,12 @@ void shrinkers_to_text(struct seq_buf *out)
}
list_for_each_entry(shrinker, &shrinker_list, list) {
- struct shrink_control sc = { .gfp_mask = GFP_KERNEL, };
+ struct shrink_control sc = {
+ .gfp_mask = GFP_KERNEL,
+#ifdef CONFIG_MEMCG
+ .memcg = root_mem_cgroup,
+#endif
+ };
unsigned long mem = shrinker->count_objects(shrinker, &sc);
if (!mem || mem == SHRINK_STOP || mem == SHRINK_EMPTY)