summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author    Kent Overstreet <kent.overstreet@linux.dev>  2023-02-03 20:31:47 -0500
committer Kent Overstreet <kent.overstreet@linux.dev>  2023-02-05 19:14:43 -0500
commit    32186926f6a3d23dd18e8b9c80e95593e7e6f535 (patch)
tree      f14c45e4aa02368c8cba4016f28f804f95667eac
parent    c80d5c05f73fd5229fd8a6b9799e9ba3df7def00 (diff)
fixup bcachefs: Btree write buffer
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
-rw-r--r--  fs/bcachefs/bcachefs_format.h        3
-rw-r--r--  fs/bcachefs/btree_iter.c            12
-rw-r--r--  fs/bcachefs/btree_update.h           3
-rw-r--r--  fs/bcachefs/btree_update_leaf.c     88
-rw-r--r--  fs/bcachefs/btree_write_buffer.c   129
-rw-r--r--  fs/bcachefs/btree_write_buffer.h     2
-rw-r--r--  fs/bcachefs/btree_write_buffer_types.h  1
-rw-r--r--  fs/bcachefs/opts.h                   2
-rw-r--r--  include/trace/events/bcachefs.h     45
9 files changed, 224 insertions, 61 deletions
diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h
index 60592f738ff0..a29260110a38 100644
--- a/fs/bcachefs/bcachefs_format.h
+++ b/fs/bcachefs/bcachefs_format.h
@@ -1412,7 +1412,8 @@ struct bch_sb_field_disk_groups {
x(trans_traverse_all, 71) \
x(transaction_commit, 72) \
x(write_super, 73) \
- x(trans_restart_would_deadlock_recursion_limit, 74)
+ x(trans_restart_would_deadlock_recursion_limit, 74) \
+ x(trans_restart_write_buffer_flush, 75)
enum bch_persistent_counters {
#define x(t, n, ...) BCH_COUNTER_##t,
diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c
index 6682acccc820..5a385ab62140 100644
--- a/fs/bcachefs/btree_iter.c
+++ b/fs/bcachefs/btree_iter.c
@@ -1357,6 +1357,7 @@ noinline __cold
void bch2_trans_updates_to_text(struct printbuf *buf, struct btree_trans *trans)
{
struct btree_insert_entry *i;
+ struct btree_write_buffered_key *wb;
prt_printf(buf, "transaction updates for %s journal seq %llu",
trans->fn, trans->journal_res.seq);
@@ -1381,6 +1382,17 @@ void bch2_trans_updates_to_text(struct printbuf *buf, struct btree_trans *trans)
prt_newline(buf);
}
+ trans_for_each_wb_update(trans, wb) {
+ prt_printf(buf, "update: btree=%s wb=1 %pS",
+ bch2_btree_ids[wb->btree],
+ (void *) i->ip_allocated);
+ prt_newline(buf);
+
+ prt_printf(buf, " new ");
+ bch2_bkey_val_to_text(buf, trans->c, bkey_i_to_s_c(&wb->k));
+ prt_newline(buf);
+ }
+
printbuf_indent_sub(buf, 2);
}
diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h
index c55458a0491d..23c959758a15 100644
--- a/fs/bcachefs/btree_update.h
+++ b/fs/bcachefs/btree_update.h
@@ -15,6 +15,9 @@ bool bch2_btree_bset_insert_key(struct btree_trans *, struct btree_path *,
struct bkey_i *);
void bch2_btree_add_journal_pin(struct bch_fs *, struct btree *, u64);
+inline void bch2_btree_insert_key_leaf(struct btree_trans *, struct btree_path *,
+ struct bkey_i *, u64);
+
enum btree_insert_flags {
/* First two bits for journal watermark: */
__BTREE_INSERT_NOFAIL = 2,
diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c
index 475a26a3736e..edc7faacd460 100644
--- a/fs/bcachefs/btree_update_leaf.c
+++ b/fs/bcachefs/btree_update_leaf.c
@@ -101,9 +101,6 @@ inline void bch2_btree_node_prep_for_write(struct btree_trans *trans,
{
struct bch_fs *c = trans->c;
- if (path->cached)
- return;
-
if (unlikely(btree_node_just_written(b)) &&
bch2_btree_post_write_cleanup(c, b))
bch2_trans_node_reinit_iter(trans, b);
@@ -253,25 +250,26 @@ inline void bch2_btree_add_journal_pin(struct bch_fs *c,
/**
* btree_insert_key - insert a key one key into a leaf node
*/
-static void btree_insert_key_leaf(struct btree_trans *trans,
- struct btree_insert_entry *insert)
+inline void bch2_btree_insert_key_leaf(struct btree_trans *trans,
+ struct btree_path *path,
+ struct bkey_i *insert,
+ u64 journal_seq)
{
struct bch_fs *c = trans->c;
- struct btree *b = insert_l(insert)->b;
+ struct btree *b = path_l(path)->b;
struct bset_tree *t = bset_tree_last(b);
struct bset *i = bset(b, t);
int old_u64s = bset_u64s(t);
int old_live_u64s = b->nr.live_u64s;
int live_u64s_added, u64s_added;
- if (unlikely(!bch2_btree_bset_insert_key(trans, insert->path, b,
- &insert_l(insert)->iter, insert->k)))
+ if (unlikely(!bch2_btree_bset_insert_key(trans, path, b,
+ &path_l(path)->iter, insert)))
return;
- i->journal_seq = cpu_to_le64(max(trans->journal_res.seq,
- le64_to_cpu(i->journal_seq)));
+ i->journal_seq = cpu_to_le64(max(journal_seq, le64_to_cpu(i->journal_seq)));
- bch2_btree_add_journal_pin(c, b, trans->journal_res.seq);
+ bch2_btree_add_journal_pin(c, b, journal_seq);
if (unlikely(!btree_node_dirty(b)))
set_btree_node_dirty_acct(c, b);
@@ -289,6 +287,12 @@ static void btree_insert_key_leaf(struct btree_trans *trans,
bch2_trans_node_reinit_iter(trans, b);
}
+static void btree_insert_key_leaf(struct btree_trans *trans,
+ struct btree_insert_entry *insert)
+{
+ bch2_btree_insert_key_leaf(trans, insert->path, insert->k, trans->journal_res.seq);
+}
+
/* Cached btree updates: */
/* Normal update interface: */
@@ -641,6 +645,10 @@ bch2_trans_commit_write_locked(struct btree_trans *trans,
marking = true;
}
+ if (trans->nr_wb_updates &&
+ trans->nr_wb_updates + c->btree_write_buffer.nr > c->btree_write_buffer.size)
+ return -BCH_ERR_btree_insert_need_flush_buffer;
+
/*
* Don't get journal reservation until after we know insert will
* succeed:
@@ -791,7 +799,8 @@ static inline int trans_lock_write(struct btree_trans *trans)
if (bch2_btree_node_lock_write(trans, i->path, &insert_l(i)->b->c))
return trans_lock_write_fail(trans, i);
- bch2_btree_node_prep_for_write(trans, i->path, insert_l(i)->b);
+ if (!i->cached)
+ bch2_btree_node_prep_for_write(trans, i->path, insert_l(i)->b);
}
return 0;
@@ -995,11 +1004,36 @@ int bch2_trans_commit_error(struct btree_trans *trans,
if (ret)
trace_and_count(c, trans_restart_journal_reclaim, trans, trace_ip);
break;
- case -BCH_ERR_btree_insert_need_flush_buffer:
- bch2_trans_reset_updates(trans);
- ret = __bch2_btree_write_buffer_flush(trans, trans->flags) ?:
- btree_trans_restart(trans, BCH_ERR_transaction_restart_write_buffer_flush);
+ case -BCH_ERR_btree_insert_need_flush_buffer: {
+ struct btree_write_buffer *wb = &c->btree_write_buffer;
+ bool need_flush;
+
+ ret = 0;
+
+ mutex_lock(&wb->lock);
+ need_flush = wb->nr > wb->size * 3 / 4;
+ mutex_unlock(&wb->lock);
+
+ if (need_flush) {
+ bch2_trans_reset_updates(trans);
+ bch2_trans_unlock(trans);
+
+ mutex_lock(&wb->flush_lock);
+
+ if (wb->nr > wb->size * 3 / 4)
+ ret = __bch2_btree_write_buffer_flush(trans,
+ trans->flags|BTREE_INSERT_NOCHECK_RW,
+ true);
+ else
+ mutex_unlock(&wb->flush_lock);
+
+ if (!ret) {
+ trace_and_count(c, trans_restart_write_buffer_flush, trans, _THIS_IP_);
+ ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_write_buffer_flush);
+ }
+ }
break;
+ }
default:
BUG_ON(ret >= 0);
break;
@@ -1088,6 +1122,20 @@ int __bch2_trans_commit(struct btree_trans *trans)
goto out_reset;
}
+ if (c->btree_write_buffer.nr > c->btree_write_buffer.size / 2 &&
+ mutex_trylock(&c->btree_write_buffer.flush_lock)) {
+ bch2_trans_begin(trans);
+ bch2_trans_unlock(trans);
+
+ ret = __bch2_btree_write_buffer_flush(trans,
+ trans->flags|BTREE_INSERT_NOCHECK_RW, true);
+ if (!ret) {
+ trace_and_count(c, trans_restart_write_buffer_flush, trans, _THIS_IP_);
+ ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_write_buffer_flush);
+ }
+ goto out;
+ }
+
EBUG_ON(test_bit(BCH_FS_CLEAN_SHUTDOWN, &c->flags));
memset(&trans->journal_preres, 0, sizeof(trans->journal_preres));
@@ -1128,12 +1176,8 @@ int __bch2_trans_commit(struct btree_trans *trans)
trans->journal_u64s += jset_u64s(i->old_k.u64s);
}
- trans_for_each_wb_update(trans, wb) {
- u64s = jset_u64s(wb->k.k.u64s);
-
- trans->journal_preres_u64s += u64s;
- trans->journal_u64s += u64s;
- }
+ trans_for_each_wb_update(trans, wb)
+ trans->journal_u64s += jset_u64s(wb->k.k.u64s);
if (trans->extra_journal_res) {
ret = bch2_disk_reservation_add(c, trans->disk_res,
diff --git a/fs/bcachefs/btree_write_buffer.c b/fs/bcachefs/btree_write_buffer.c
index c9208596628c..760269ce763f 100644
--- a/fs/bcachefs/btree_write_buffer.c
+++ b/fs/bcachefs/btree_write_buffer.c
@@ -1,7 +1,9 @@
// SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h"
+#include "btree_locking.h"
#include "btree_update.h"
+#include "btree_update_interior.h"
#include "btree_write_buffer.h"
#include "error.h"
#include "journal.h"
@@ -28,26 +30,66 @@ static int btree_write_buffered_journal_cmp(const void *_l, const void *_r)
return cmp_int(l->journal_seq, r->journal_seq);
}
-int __bch2_btree_write_buffer_flush(struct btree_trans *trans, unsigned commit_flags)
+static int bch2_btree_write_buffer_flush_one(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct btree_write_buffered_key *wb,
+ bool *write_locked,
+ size_t *fast)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_path *path = iter->path;
+ int ret;
+
+ ret = bch2_btree_iter_traverse(iter);
+ if (ret)
+ return ret;
+
+ if (!*write_locked) {
+ ret = bch2_btree_node_lock_write(trans, path, &path->l[0].b->c);
+ if (ret)
+ return ret;
+
+ bch2_btree_node_prep_for_write(trans, path, path->l[0].b);
+ *write_locked = true;
+ }
+
+ if (!bch2_btree_node_insert_fits(c, path->l[0].b, wb->k.k.u64s)) {
+ bch2_btree_node_unlock_write(trans, path, path->l[0].b);
+ *write_locked = false;
+ goto trans_commit;
+ }
+
+ bch2_btree_insert_key_leaf(trans, path, &wb->k, wb->journal_seq);
+ (*fast)++;
+ return 0;
+trans_commit:
+ return bch2_trans_update(trans, iter, &wb->k, 0) ?:
+ bch2_trans_commit(trans, NULL, NULL,
+ BTREE_INSERT_NOFAIL|
+ BTREE_INSERT_JOURNAL_RECLAIM);
+}
+
+int __bch2_btree_write_buffer_flush(struct btree_trans *trans, unsigned commit_flags,
+ bool locked)
{
struct bch_fs *c = trans->c;
struct journal *j = &c->journal;
struct btree_write_buffer *wb = &c->btree_write_buffer;
struct journal_entry_pin pin;
- struct journal_preres res = { 0 };
struct btree_write_buffered_key *i, *dst;
- size_t nr = 0;
+ struct btree_iter iter = { NULL };
+ size_t nr = 0, skipped = 0, fast = 0;
+ bool write_locked = false;
int ret = 0;
memset(&pin, 0, sizeof(pin));
- if (!mutex_trylock(&wb->flush_lock))
+ if (!locked && !mutex_trylock(&wb->flush_lock))
return 0;
mutex_lock(&wb->lock);
swap(wb->keys, wb->flushing);
swap(wb->nr, nr);
- swap(wb->res, res);
bch2_journal_pin_copy(j, &pin, &wb->journal_pin, NULL);
bch2_journal_pin_drop(j, &wb->journal_pin);
@@ -75,32 +117,58 @@ int __bch2_btree_write_buffer_flush(struct btree_trans *trans, unsigned commit_f
NULL);
for (i = wb->flushing;
- i < wb->flushing + nr;
+ i < wb->flushing + nr && !ret;
i++) {
if (i + 1 < wb->flushing + nr &&
i[0].btree == i[1].btree &&
bpos_eq(i[0].k.k.p, i[1].k.k.p)) {
- if (bkey_deleted(&i[1].k.k))
+ skipped++;
+ if (bkey_deleted(&i[1].k.k)) {
+ skipped++;
i++;
+ }
continue;
}
- ret = commit_do(trans, NULL, NULL,
- commit_flags|
- BTREE_INSERT_NOFAIL|
- BTREE_INSERT_JOURNAL_RECLAIM,
- __bch2_btree_insert(trans, i->btree, &i->k));
- if (ret == -BCH_ERR_journal_reclaim_would_deadlock)
- goto slowpath;
- if (bch2_fs_fatal_err_on(ret, c, "%s: insert error %s", __func__, bch2_err_str(ret)))
- break;
+ if (write_locked &&
+ (iter.path->btree_id != i->btree ||
+ bpos_gt(i->k.k.p, iter.path->l[0].b->key.k.p))) {
+ bch2_btree_node_unlock_write(trans, iter.path, iter.path->l[0].b);
+ write_locked = false;
+ }
+
+ if (!iter.path || iter.path->btree_id != i->btree) {
+ bch2_trans_iter_exit(trans, &iter);
+ bch2_trans_iter_init(trans, &iter, i->btree, i->k.k.p, BTREE_ITER_INTENT);
+ iter.path->preserve = false;
+ }
+
+ bch2_btree_iter_set_pos(&iter, i->k.k.p);
+
+ while (1) {
+ ret = bch2_btree_write_buffer_flush_one(trans, &iter, i, &write_locked, &fast);
+ if (!bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ break;
+ bch2_trans_begin(trans);
+ }
}
+ if (write_locked)
+ bch2_btree_node_unlock_write(trans, iter.path, iter.path->l[0].b);
+ bch2_trans_iter_exit(trans, &iter);
+
+ trace_write_buffer_flush(trans, nr, skipped, fast, wb->size);
+
+ if (ret == -BCH_ERR_journal_reclaim_would_deadlock)
+ goto slowpath;
+
+ bch2_fs_fatal_err_on(ret, c, "%s: insert error %s", __func__, bch2_err_str(ret));
out:
bch2_journal_pin_drop(j, &pin);
- bch2_journal_preres_put(j, &res);
mutex_unlock(&wb->flush_lock);
return ret;
slowpath:
+ trace_write_buffer_flush_slowpath(trans, i - wb->flushing, nr);
+
dst = wb->flushing;
for (;
i < wb->flushing + nr;
@@ -113,7 +181,6 @@ slowpath:
continue;
}
-
*dst = *i;
dst++;
}
@@ -153,7 +220,7 @@ slowpath:
int bch2_btree_write_buffer_flush(struct btree_trans *trans)
{
- return __bch2_btree_write_buffer_flush(trans, 0);
+ return __bch2_btree_write_buffer_flush(trans, 0, false);
}
static int bch2_btree_write_buffer_journal_flush(struct journal *j,
@@ -162,7 +229,8 @@ static int bch2_btree_write_buffer_journal_flush(struct journal *j,
struct bch_fs *c = container_of(j, struct bch_fs, journal);
return bch2_trans_run(c,
- __bch2_btree_write_buffer_flush(&trans, BTREE_INSERT_NOCHECK_RW));
+ __bch2_btree_write_buffer_flush(&trans, BTREE_INSERT_NOCHECK_RW,
+ false));
}
int bch2_btree_insert_keys_write_buffer(struct btree_trans *trans)
@@ -170,23 +238,20 @@ int bch2_btree_insert_keys_write_buffer(struct btree_trans *trans)
struct bch_fs *c = trans->c;
struct btree_write_buffer *wb = &c->btree_write_buffer;
struct btree_write_buffered_key *i;
- unsigned u64s = 0;
EBUG_ON(trans->flags & BTREE_INSERT_JOURNAL_REPLAY);
- mutex_lock(&wb->lock);
- if (wb->nr + trans->nr_wb_updates > wb->size) {
- mutex_unlock(&wb->lock);
- return -BCH_ERR_btree_insert_need_flush_buffer;
- }
-
trans_for_each_wb_update(trans, i) {
EBUG_ON(i->k.k.u64s > BTREE_WRITE_BUFERED_U64s_MAX);
i->journal_seq = trans->journal_res.seq;
i->journal_offset = trans->journal_res.offset;
+ }
- u64s += jset_u64s(i->k.k.u64s);
+ mutex_lock(&wb->lock);
+ if (wb->nr + trans->nr_wb_updates > wb->size) {
+ mutex_unlock(&wb->lock);
+ return -BCH_ERR_btree_insert_need_flush_buffer;
}
memcpy(wb->keys + wb->nr,
@@ -194,15 +259,9 @@ int bch2_btree_insert_keys_write_buffer(struct btree_trans *trans)
sizeof(trans->wb_updates[0]) * trans->nr_wb_updates);
wb->nr += trans->nr_wb_updates;
- if (likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))) {
- EBUG_ON(u64s > trans->journal_preres.u64s);
-
- trans->journal_preres.u64s -= u64s;
- wb->res.u64s += u64s;
-
+ if (likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY)))
bch2_journal_pin_add(&c->journal, trans->journal_res.seq, &wb->journal_pin,
bch2_btree_write_buffer_journal_flush);
- }
mutex_unlock(&wb->lock);
return 0;
diff --git a/fs/bcachefs/btree_write_buffer.h b/fs/bcachefs/btree_write_buffer.h
index ffac314d61a5..155dc5c9da1a 100644
--- a/fs/bcachefs/btree_write_buffer.h
+++ b/fs/bcachefs/btree_write_buffer.h
@@ -2,7 +2,7 @@
#ifndef _BCACHEFS_BTREE_WRITE_BUFFER_H
#define _BCACHEFS_BTREE_WRITE_BUFFER_H
-int __bch2_btree_write_buffer_flush(struct btree_trans *, unsigned);
+int __bch2_btree_write_buffer_flush(struct btree_trans *, unsigned, bool);
int bch2_btree_write_buffer_flush(struct btree_trans *);
int bch2_btree_insert_keys_write_buffer(struct btree_trans *);
diff --git a/fs/bcachefs/btree_write_buffer_types.h b/fs/bcachefs/btree_write_buffer_types.h
index a5bcb196e40f..86d6d1df5a82 100644
--- a/fs/bcachefs/btree_write_buffer_types.h
+++ b/fs/bcachefs/btree_write_buffer_types.h
@@ -18,7 +18,6 @@ struct btree_write_buffer {
struct mutex lock;
struct mutex flush_lock;
struct journal_entry_pin journal_pin;
- struct journal_preres res;
struct btree_write_buffered_key *keys;
struct btree_write_buffered_key *flushing;
diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h
index 1a89f2d4c360..613fe1fa2899 100644
--- a/fs/bcachefs/opts.h
+++ b/fs/bcachefs/opts.h
@@ -209,7 +209,7 @@ enum opt_type {
x(btree_write_buffer_size, u32, \
OPT_FS|OPT_MOUNT, \
OPT_UINT(16, (1U << 20) - 1), \
- BCH2_NO_SB_OPT, 1U << 10, \
+ BCH2_NO_SB_OPT, 1U << 16, \
NULL, "Number of btree write buffer entries") \
x(gc_reserve_percent, u8, \
OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
diff --git a/include/trace/events/bcachefs.h b/include/trace/events/bcachefs.h
index 87e238b5d4c9..8ec8b4cf7eec 100644
--- a/include/trace/events/bcachefs.h
+++ b/include/trace/events/bcachefs.h
@@ -1142,6 +1142,51 @@ TRACE_EVENT(trans_restart_key_cache_key_realloced,
__entry->new_u64s)
);
+DEFINE_EVENT(transaction_event, trans_restart_write_buffer_flush,
+ TP_PROTO(struct btree_trans *trans,
+ unsigned long caller_ip),
+ TP_ARGS(trans, caller_ip)
+);
+
+TRACE_EVENT(write_buffer_flush,
+ TP_PROTO(struct btree_trans *trans, size_t nr, size_t skipped, size_t fast, size_t size),
+ TP_ARGS(trans, nr, skipped, fast, size),
+
+ TP_STRUCT__entry(
+ __field(size_t, nr )
+ __field(size_t, skipped )
+ __field(size_t, fast )
+ __field(size_t, size )
+ ),
+
+ TP_fast_assign(
+ __entry->nr = nr;
+ __entry->skipped = skipped;
+ __entry->fast = fast;
+ __entry->size = size;
+ ),
+
+ TP_printk("%zu/%zu skipped %zu fast %zu",
+ __entry->nr, __entry->size, __entry->skipped, __entry->fast)
+);
+
+TRACE_EVENT(write_buffer_flush_slowpath,
+ TP_PROTO(struct btree_trans *trans, size_t nr, size_t size),
+ TP_ARGS(trans, nr, size),
+
+ TP_STRUCT__entry(
+ __field(size_t, nr )
+ __field(size_t, size )
+ ),
+
+ TP_fast_assign(
+ __entry->nr = nr;
+ __entry->size = size;
+ ),
+
+ TP_printk("%zu/%zu", __entry->nr, __entry->size)
+);
+
#endif /* _TRACE_BCACHE_H */
/* This part must be outside protection */