-rw-r--r--  drivers/md/bcache/bcache.h         3
-rw-r--r--  drivers/md/bcache/btree_cache.c   13
-rw-r--r--  drivers/md/bcache/btree_cache.h    2
-rw-r--r--  drivers/md/bcache/btree_gc.c      26
-rw-r--r--  drivers/md/bcache/btree_io.c      50
-rw-r--r--  drivers/md/bcache/btree_iter.c     4
-rw-r--r--  drivers/md/bcache/btree_types.h   12
-rw-r--r--  drivers/md/bcache/btree_update.c 359
-rw-r--r--  drivers/md/bcache/btree_update.h  63
-rw-r--r--  drivers/md/bcache/journal.c       25
-rw-r--r--  drivers/md/bcache/sysfs.c          6
11 files changed, 347 insertions, 216 deletions
diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index 18d1e13b30c1..e1b3eea39395 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -497,7 +497,6 @@ enum {
CACHE_SET_GC_STOPPING,
CACHE_SET_GC_FAILURE,
CACHE_SET_BDEV_MOUNTED,
- CACHE_SET_BTREE_WRITE_ERROR,
};
struct btree_debug {
@@ -562,7 +561,7 @@ struct cache_set {
/* BTREE CACHE */
struct bio_set btree_read_bio;
- struct btree *btree_roots[BTREE_ID_NR];
+ struct btree_root btree_roots[BTREE_ID_NR];
spinlock_t btree_root_lock;
bool btree_cache_table_init_done;
diff --git a/drivers/md/bcache/btree_cache.c b/drivers/md/bcache/btree_cache.c
index a005f4b30279..5a5674af3964 100644
--- a/drivers/md/bcache/btree_cache.c
+++ b/drivers/md/bcache/btree_cache.c
@@ -21,13 +21,13 @@ void bch_recalc_btree_reserve(struct cache_set *c)
{
unsigned i, reserve = 16;
- if (!c->btree_roots[0])
+ if (!c->btree_roots[0].b)
reserve += 8;
for (i = 0; i < BTREE_ID_NR; i++)
- if (c->btree_roots[i])
+ if (c->btree_roots[i].b)
reserve += min_t(unsigned, 1,
- c->btree_roots[i]->level) * 8;
+ c->btree_roots[i].b->level) * 8;
c->btree_cache_reserve = reserve;
}
@@ -165,6 +165,9 @@ static int mca_reap_notrace(struct cache_set *c, struct btree *b, bool flush)
if (!six_trylock_write(&b->lock))
goto out_unlock_intent;
+ if (btree_node_write_error(b))
+ goto out_unlock;
+
if (!list_empty(&b->write_blocked))
goto out_unlock;
@@ -353,8 +356,8 @@ void bch_btree_cache_free(struct cache_set *c)
#endif
for (i = 0; i < BTREE_ID_NR; i++)
- if (c->btree_roots[i])
- list_add(&c->btree_roots[i]->list, &c->btree_cache);
+ if (c->btree_roots[i].b)
+ list_add(&c->btree_roots[i].b->list, &c->btree_cache);
list_splice(&c->btree_cache_freeable,
&c->btree_cache);
diff --git a/drivers/md/bcache/btree_cache.h b/drivers/md/bcache/btree_cache.h
index 03a1cf5f30df..e774ddf30e02 100644
--- a/drivers/md/bcache/btree_cache.h
+++ b/drivers/md/bcache/btree_cache.h
@@ -46,6 +46,6 @@ static inline unsigned btree_blocks(struct cache_set *c)
return CACHE_BTREE_NODE_SIZE(&c->disk_sb) >> c->block_bits;
}
-#define btree_node_root(_b) ((_b)->c->btree_roots[(_b)->btree_id])
+#define btree_node_root(_b) ((_b)->c->btree_roots[(_b)->btree_id].b)
#endif /* _BCACHE_BTREE_CACHE_H */
diff --git a/drivers/md/bcache/btree_gc.c b/drivers/md/bcache/btree_gc.c
index 3257c6221ddb..975293880358 100644
--- a/drivers/md/bcache/btree_gc.c
+++ b/drivers/md/bcache/btree_gc.c
@@ -208,7 +208,7 @@ static int bch_gc_btree(struct cache_set *c, enum btree_id btree_id)
spin_lock(&c->btree_root_lock);
- b = c->btree_roots[btree_id];
+ b = c->btree_roots[btree_id].b;
__bch_btree_mark_key(c, BKEY_TYPE_BTREE, bkey_i_to_s_c(&b->key));
gc_pos_set(c, gc_pos_btree_root(b->btree_id));
@@ -398,7 +398,7 @@ void bch_gc(struct cache_set *c)
/* Walk btree: */
while (c->gc_pos.phase < (int) BTREE_ID_NR) {
- int ret = c->btree_roots[c->gc_pos.phase]
+ int ret = c->btree_roots[c->gc_pos.phase].b
? bch_gc_btree(c, (int) c->gc_pos.phase)
: 0;
@@ -454,7 +454,6 @@ static void bch_coalesce_nodes(struct btree *old_nodes[GC_MERGE_NODES],
struct keylist keylist;
struct bkey_format_state format_state;
struct bkey_format new_format;
- int ret;
memset(new_nodes, 0, sizeof(new_nodes));
bch_keylist_init(&keylist, NULL, 0);
@@ -592,7 +591,7 @@ static void bch_coalesce_nodes(struct btree *old_nodes[GC_MERGE_NODES],
struct bkey_i delete;
unsigned j;
- bch_pending_btree_node_free_init(c, as, old_nodes[i]);
+ bch_btree_node_free_start(c, as, old_nodes[i]);
for (j = 0; j < nr_new_nodes; j++)
if (!bkey_cmp(old_nodes[i]->key.k.p,
@@ -616,9 +615,7 @@ next:
bch_keylist_add_in_order(&keylist, &new_nodes[i]->key);
/* Insert the newly coalesced nodes */
- ret = bch_btree_insert_node(parent, iter, &keylist, res, as);
- if (ret)
- goto err;
+ bch_btree_insert_node(parent, iter, &keylist, res, as);
BUG_ON(!bch_keylist_empty(&keylist));
@@ -631,7 +628,7 @@ next:
/* Free the old nodes and update our sliding window */
for (i = 0; i < nr_old_nodes; i++) {
- bch_btree_node_free(iter, old_nodes[i]);
+ bch_btree_node_free_inmem(iter, old_nodes[i]);
six_unlock_intent(&old_nodes[i]->lock);
/*
@@ -651,13 +648,6 @@ next:
out:
bch_keylist_free(&keylist);
bch_btree_reserve_put(c, res);
- return;
-err:
- for (i = 0; i < nr_new_nodes; i++) {
- bch_btree_node_free_never_inserted(c, new_nodes[i]);
- six_unlock_intent(&new_nodes[i]->lock);
- }
- goto out;
}
static int bch_coalesce_btree(struct cache_set *c, enum btree_id btree_id)
@@ -751,7 +741,7 @@ static void bch_coalesce(struct cache_set *c)
trace_bcache_gc_coalesce_start(c);
for (id = 0; id < BTREE_ID_NR; id++) {
- int ret = c->btree_roots[id]
+ int ret = c->btree_roots[id].b
? bch_coalesce_btree(c, id)
: 0;
@@ -848,7 +838,7 @@ static void bch_initial_gc_btree(struct cache_set *c, enum btree_id id)
btree_node_range_checks_init(&r);
- if (!c->btree_roots[id])
+ if (!c->btree_roots[id].b)
return;
for_each_btree_node(&iter, c, id, POS_MIN, b) {
@@ -870,7 +860,7 @@ static void bch_initial_gc_btree(struct cache_set *c, enum btree_id id)
bch_btree_iter_unlock(&iter);
__bch_btree_mark_key(c, BKEY_TYPE_BTREE,
- bkey_i_to_s_c(&c->btree_roots[id]->key));
+ bkey_i_to_s_c(&c->btree_roots[id].b->key));
}
int bch_initial_gc(struct cache_set *c, struct list_head *journal)
diff --git a/drivers/md/bcache/btree_io.c b/drivers/md/bcache/btree_io.c
index f1e9c57e6e3e..4f3a7a2cdb76 100644
--- a/drivers/md/bcache/btree_io.c
+++ b/drivers/md/bcache/btree_io.c
@@ -499,9 +499,7 @@ static void btree_node_write_done(struct closure *cl)
if (btree_node_write_error(b))
bch_journal_halt(&c->journal);
- /* XXX: pin btree node in memory somehow */
- if (!btree_node_write_error(b))
- bch_btree_complete_write(c, b, w);
+ bch_btree_complete_write(c, b, w);
if (btree_node_dirty(b) && c->btree_flush_delay)
schedule_delayed_work(&b->work, c->btree_flush_delay * HZ);
@@ -516,10 +514,8 @@ static void btree_node_write_endio(struct bio *bio)
struct bch_write_bio *wbio = to_wbio(bio);
if (cache_fatal_io_err_on(bio->bi_error, wbio->bio.ca, "btree write") ||
- bch_meta_write_fault("btree")) {
+ bch_meta_write_fault("btree"))
set_btree_node_write_error(b);
- set_bit(CACHE_SET_BTREE_WRITE_ERROR, &b->c->flags);
- }
if (wbio->orig)
bio_endio(wbio->orig);
@@ -580,6 +576,29 @@ static void do_btree_node_write(struct closure *cl)
BUG_ON(b->written + blocks_to_write > btree_blocks(c));
+ /*
+ * We handle btree write errors by immediately halting the journal -
+ * after we've done that, we can't issue any subsequent btree writes
+ * because they might have pointers to new nodes that failed to write.
+ *
+ * Furthermore, there's no point in doing any more btree writes because
+ * with the journal stopped, we're never going to update the journal to
+ * reflect that those writes were done and the data flushed from the
+ * journal:
+ *
+ * Make sure to update b->written so bch_btree_init_next() doesn't
+ * break:
+ */
+ if (bch_journal_error(&b->c->journal)) {
+ struct btree_write *w = btree_prev_write(b);
+
+ set_btree_node_write_error(b);
+ b->written += blocks_to_write;
+ bch_btree_complete_write(c, b, w);
+
+ closure_return_with_destructor(cl, btree_node_write_unlock);
+ }
+
bio = bio_alloc_bioset(GFP_NOIO, btree_pages(c), &c->bio_write);
wbio = to_wbio(bio);
@@ -663,15 +682,22 @@ void __bch_btree_node_write(struct btree *b, struct closure *parent,
if (!test_and_clear_bit(BTREE_NODE_dirty, &b->flags))
return;
- BUG_ON(!list_empty(&b->write_blocked));
-
/*
* io_mutex ensures only a single IO in flight to a btree node at a
* time, and also protects use of the b->io closure.
* do_btree_node_write() will drop it asynchronously.
*/
down(&b->io_mutex);
+
+ BUG_ON(!list_empty(&b->write_blocked));
#if 0
+ /*
+ * This is an optimization for when journal flushing races with the
+ * btree node being written for some other reason, and the write the
+ * journal wanted to flush has already happened - in that case we'd
+ * prefer not to write a mostly empty bset. It seemed to be buggy,
+ * though:
+ */
if (idx_to_write != -1 &&
idx_to_write != btree_node_write_idx(b)) {
up(&b->io_mutex);
@@ -699,6 +725,8 @@ void bch_btree_node_write(struct btree *b, struct closure *parent,
static void bch_btree_node_write_dirty(struct btree *b, struct closure *parent)
{
six_lock_read(&b->lock);
+ BUG_ON(b->level);
+
__bch_btree_node_write(b, parent, -1);
six_unlock_read(&b->lock);
}
@@ -752,7 +780,11 @@ restart:
for (; i < tbl->size; i++)
rht_for_each_entry_rcu(b, pos, tbl, i, hash)
- if (btree_node_dirty(b)) {
+ /*
+ * XXX - locking for b->level, when called from
+ * bch_journal_move()
+ */
+ if (!b->level && btree_node_dirty(b)) {
rcu_read_unlock();
bch_btree_node_write_dirty(b, &cl);
dropped_lock = true;
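[Reviewer note: for reference, a standalone model of the journal-error early-out added to do_btree_node_write() above. This is a sketch only, not kernel code: journal_errored stands in for bch_journal_error(), and node_model keeps just the two fields the early-out touches.]

#include <stdbool.h>
#include <stdio.h>

/*
 * Model of the new early-out: once the journal has halted, no IO is
 * issued, but b->written is still advanced and the write is completed
 * so bch_btree_init_next()-style bookkeeping doesn't break.
 */
struct node_model {
	unsigned written;
	bool	 write_error;
};

static bool journal_errored;		/* stands in for bch_journal_error() */

static bool node_write(struct node_model *b, unsigned blocks_to_write)
{
	if (journal_errored) {
		b->write_error = true;		/* set_btree_node_write_error() */
		b->written += blocks_to_write;	/* keep b->written consistent */
		return false;			/* complete without touching disk */
	}
	/* ...normal path: allocate and submit the bio... */
	b->written += blocks_to_write;
	return true;
}

int main(void)
{
	struct node_model b = { 0, false };

	journal_errored = true;
	if (!node_write(&b, 4))
		printf("skipped IO: error=%d written=%u\n",
		       b.write_error, b.written);
	return 0;
}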
diff --git a/drivers/md/bcache/btree_iter.c b/drivers/md/bcache/btree_iter.c
index 6b9477f740e4..71e0ec18871e 100644
--- a/drivers/md/bcache/btree_iter.c
+++ b/drivers/md/bcache/btree_iter.c
@@ -357,12 +357,12 @@ static void btree_iter_lock_root(struct btree_iter *iter, struct bpos pos)
memset(iter->nodes, 0, sizeof(iter->nodes));
while (1) {
- struct btree *b = iter->c->btree_roots[iter->btree_id];
+ struct btree *b = iter->c->btree_roots[iter->btree_id].b;
iter->level = b->level;
if (btree_node_lock(b, iter, iter->level,
- (b != iter->c->btree_roots[iter->btree_id]))) {
+ (b != iter->c->btree_roots[iter->btree_id].b))) {
btree_iter_node_set(iter, b, pos);
break;
}
diff --git a/drivers/md/bcache/btree_types.h b/drivers/md/bcache/btree_types.h
index f3af91e3d23c..05b70dc53e37 100644
--- a/drivers/md/bcache/btree_types.h
+++ b/drivers/md/bcache/btree_types.h
@@ -14,6 +14,7 @@
struct cache_set;
struct open_bucket;
+struct async_split;
struct btree_write {
unsigned index;
@@ -22,6 +23,17 @@ struct btree_write {
struct closure_waitlist wait;
};
+struct btree_root {
+ struct btree *b;
+
+ struct async_split *as;
+
+ /* On disk root - see async splits: */
+ __BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX);
+ u8 level;
+ u8 alive;
+};
+
struct btree {
/* Hottest entries first */
struct rhash_head hash;
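[Reviewer note: the new struct btree_root decouples the live in-memory root (b) from a stable on-disk description (key/level/alive) that the journal write can embed without dereferencing the node. Below is a minimal user-space sketch of that split, assuming only what the patch shows: a pthread mutex stands in for btree_root_lock, and a small char array for the padded bkey.]

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct btree;				/* live in-memory node, opaque here */

struct btree_root_sketch {
	struct btree	*b;		/* in-memory root, may be mid-update */
	char		key[32];	/* stands in for the padded bkey */
	unsigned char	level;
	bool		alive;		/* journal only embeds it once true */
};

static pthread_mutex_t root_lock = PTHREAD_MUTEX_INITIALIZER;

/* Writer side: what bch_btree_set_root_ondisk() does under btree_root_lock */
static void set_root_ondisk(struct btree_root_sketch *r,
			    const char *key, unsigned char level)
{
	pthread_mutex_lock(&root_lock);
	snprintf(r->key, sizeof(r->key), "%s", key);
	r->level = level;
	r->alive = true;
	pthread_mutex_unlock(&root_lock);
}

/* Reader side: what journal_write_locked() does when embedding roots */
static void journal_embed_root(const struct btree_root_sketch *r)
{
	pthread_mutex_lock(&root_lock);
	if (r->alive)
		printf("journal: root key=%s level=%u\n",
		       r->key, (unsigned) r->level);
	pthread_mutex_unlock(&root_lock);
}

int main(void)
{
	struct btree_root_sketch r = { 0 };

	journal_embed_root(&r);		/* not alive yet: skipped */
	set_root_ondisk(&r, "root-v1", 1);
	journal_embed_root(&r);		/* now embedded */
	return 0;
}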
diff --git a/drivers/md/bcache/btree_update.c b/drivers/md/bcache/btree_update.c
index 2764a6470cdb..cef1f7bb8d28 100644
--- a/drivers/md/bcache/btree_update.c
+++ b/drivers/md/bcache/btree_update.c
@@ -17,6 +17,9 @@
#include <linux/random.h>
#include <trace/events/bcache.h>
+static void async_split_updated_root(struct async_split *,
+ struct btree *);
+
/* Calculate ideal packed bkey format for new btree nodes: */
void __bch_btree_calc_format(struct bkey_format_state *s, struct btree *b)
@@ -88,9 +91,12 @@ bool bch_btree_node_format_fits(struct btree *b, struct bkey_format *new_f)
/* Btree node freeing/allocation: */
-void bch_pending_btree_node_free_init(struct cache_set *c,
- struct async_split *as,
- struct btree *b)
+/*
+ * @b is going to be freed, allocate a pending_btree_node_free in @as:
+ */
+void bch_btree_node_free_start(struct cache_set *c,
+ struct async_split *as,
+ struct btree *b)
{
struct pending_btree_node_free *d;
@@ -105,9 +111,16 @@ void bch_pending_btree_node_free_init(struct cache_set *c,
mutex_unlock(&c->btree_node_pending_free_lock);
}
-static void bch_pending_btree_node_free_insert_done(struct cache_set *c,
- struct btree *b, enum btree_id id, struct bkey_s_c k,
- struct bucket_stats_cache_set *stats)
+/*
+ * We're doing the index update that makes @b unreachable, update stuff to
+ * reflect that:
+ *
+ * Must be called _before_ async_split_updated_root() or
+ * async_split_updated_btree():
+ */
+static void bch_btree_node_free_index(struct cache_set *c, struct btree *b,
+ enum btree_id id, struct bkey_s_c k,
+ struct bucket_stats_cache_set *stats)
{
struct pending_btree_node_free *d;
@@ -201,7 +214,7 @@ void bch_btree_node_free_never_inserted(struct cache_set *c, struct btree *b)
bch_open_bucket_put(c, ob);
}
-void bch_btree_node_free(struct btree_iter *iter, struct btree *b)
+void bch_btree_node_free_inmem(struct btree_iter *iter, struct btree *b)
{
bch_btree_iter_node_drop_linked(iter, b);
@@ -349,7 +362,7 @@ struct btree *btree_node_alloc_replacement(struct cache_set *c,
return __btree_node_alloc_replacement(c, b, new_f, reserve);
}
-static void __bch_btree_set_root(struct cache_set *c, struct btree *b,
+static void bch_btree_set_root_inmem(struct cache_set *c, struct btree *b,
struct bucket_stats_cache_set *stats)
{
/* Root nodes cannot be reaped */
@@ -369,6 +382,20 @@ static void __bch_btree_set_root(struct cache_set *c, struct btree *b,
bch_recalc_btree_reserve(c);
}
+static void bch_btree_set_root_ondisk(struct cache_set *c, struct btree *b)
+{
+ struct btree_root *r = &c->btree_roots[b->btree_id];
+
+ spin_lock(&c->btree_root_lock);
+
+ BUG_ON(b != r->b);
+ bkey_copy(&r->key, &b->key);
+ r->level = b->level;
+ r->alive = true;
+
+ spin_unlock(&c->btree_root_lock);
+}
+
/*
* Only for cache set bringup, when first reading the btree roots or allocating
* btree roots when initializing a new cache set:
@@ -380,7 +407,8 @@ void bch_btree_set_root_initial(struct cache_set *c, struct btree *b,
BUG_ON(btree_node_root(b));
- __bch_btree_set_root(c, b, &stats);
+ bch_btree_set_root_inmem(c, b, &stats);
+ bch_btree_set_root_ondisk(c, b);
if (btree_reserve)
bch_cache_set_stats_apply(c, &stats, &btree_reserve->disk_res);
@@ -398,14 +426,13 @@ void bch_btree_set_root_initial(struct cache_set *c, struct btree *b,
* is nothing new to be done. This just guarantees that there is a
* journal write.
*/
-static int bch_btree_set_root(struct btree_iter *iter, struct btree *b,
- struct journal_res *res,
- struct btree_reserve *btree_reserve)
+static void bch_btree_set_root(struct btree_iter *iter, struct btree *b,
+ struct async_split *as,
+ struct btree_reserve *btree_reserve)
{
struct bucket_stats_cache_set stats = { 0 };
struct cache_set *c = iter->c;
struct btree *old;
- u64 seq;
trace_bcache_btree_set_root(b);
BUG_ON(!b->written);
@@ -418,7 +445,13 @@ static int bch_btree_set_root(struct btree_iter *iter, struct btree *b,
*/
btree_node_lock_write(old, iter);
- __bch_btree_set_root(c, b, &stats);
+ bch_btree_set_root_inmem(c, b, &stats);
+
+ bch_btree_node_free_index(c, NULL, old->btree_id,
+ bkey_i_to_s_c(&old->key),
+ &stats);
+
+ async_split_updated_root(as, b);
/*
* Unlock old root after new root is visible:
@@ -429,21 +462,8 @@ static int bch_btree_set_root(struct btree_iter *iter, struct btree *b,
*/
btree_node_unlock_write(old, iter);
- bch_pending_btree_node_free_insert_done(c, NULL, old->btree_id,
- bkey_i_to_s_c(&old->key),
- &stats);
-
stats.sectors_meta -= c->sb.btree_node_size;
bch_cache_set_stats_apply(c, &stats, &btree_reserve->disk_res);
-
- /*
- * Ensure new btree root is persistent (reachable via the
- * journal) before returning and the caller unlocking it:
- */
- seq = c->journal.seq;
- bch_journal_res_put(&c->journal, res, NULL);
-
- return bch_journal_flush_seq(&c->journal, seq);
}
static struct btree *__btree_root_alloc(struct cache_set *c, unsigned level,
@@ -627,9 +647,8 @@ static bool bch_insert_fixup_btree_ptr(struct btree_iter *iter,
break;
if (!cmp && !bkey_deleted(k)) {
- bch_pending_btree_node_free_insert_done(c, b,
- iter->btree_id,
- u, &stats);
+ bch_btree_node_free_index(c, b, iter->btree_id,
+ u, &stats);
/*
* Look up pending delete, mark so that gc marks it on
* the pending delete list
@@ -876,25 +895,15 @@ struct async_split *__bch_async_split_alloc(struct btree *nodes[],
{
struct cache_set *c = iter->c;
struct async_split *as;
- struct journal_res res;
struct journal_entry_pin_list *pin_list = NULL;
unsigned i, pin_idx = UINT_MAX;
- memset(&res, 0, sizeof(res));
-
- /*
- * must get journal res before getting a journal pin (else we deadlock)
- */
- if (bch_journal_res_get(&c->journal, &res,
- jset_u64s(0), jset_u64s(0)))
- return NULL;
-
as = mempool_alloc(&c->btree_async_split_pool, GFP_NOIO);
closure_init(&as->cl, &c->cl);
as->c = c;
+ as->mode = ASYNC_SPLIT_NO_UPDATE;
as->b = NULL;
- as->as = NULL;
- as->res = res;
+ as->parent_as = NULL;
as->nr_pending = 0;
init_llist_head(&as->wait.list);
@@ -964,7 +973,7 @@ static void async_split_free(struct closure *cl)
mempool_free(as, &as->c->btree_async_split_pool);
}
-static void async_split_update_done(struct closure *cl)
+static void async_split_pointers_written(struct closure *cl)
{
struct async_split *as = container_of(cl, struct async_split, cl);
struct cache_set *c = as->c;
@@ -980,10 +989,11 @@ static void async_split_update_done(struct closure *cl)
closure_return_with_destructor(cl, async_split_free);
}
-static void async_split_writes_done(struct closure *cl)
+static void async_split_nodes_written(struct closure *cl)
{
struct async_split *as = container_of(cl, struct async_split, cl);
struct cache_set *c = as->c;
+ struct btree *b;
/*
* We did an update to a parent node where the pointers we added pointed
@@ -991,14 +1001,15 @@ static void async_split_writes_done(struct closure *cl)
* been written so we can write out the update to the interior node.
*/
- /* XXX: error handling */
retry:
mutex_lock(&c->async_split_lock);
+ switch (as->mode) {
+ case ASYNC_SPLIT_NO_UPDATE:
+ BUG();
+ case ASYNC_SPLIT_UPDATING_BTREE:
+ /* The usual case: */
+ b = READ_ONCE(as->b);
- if (as->b) {
- struct btree *b = as->b;
-
- /* Normal case */
if (!six_trylock_read(&b->lock)) {
mutex_unlock(&c->async_split_lock);
six_lock_read(&b->lock);
@@ -1006,6 +1017,7 @@ retry:
goto retry;
}
+ BUG_ON(!btree_node_dirty(b));
closure_wait(&btree_current_write(b)->wait, cl);
list_del(&as->list);
@@ -1013,29 +1025,130 @@ retry:
if (list_empty(&b->write_blocked))
__bch_btree_node_write(b, NULL, -1);
six_unlock_read(&b->lock);
- } else if (as->as) {
+ break;
+
+ case ASYNC_SPLIT_UPDATING_AS:
/*
* The btree node we originally updated has been freed and is
* being rewritten - so we don't need to write anything here, we just
* need to signal to that async_split that it's ok to make the
* new replacement node visible:
*/
- closure_put(&as->as->cl);
+ closure_put(&as->parent_as->cl);
/*
* and then we have to wait on that async_split to finish:
*/
- closure_wait(&as->as->wait, cl);
- } else {
+ closure_wait(&as->parent_as->wait, cl);
+ break;
+
+ case ASYNC_SPLIT_UPDATING_ROOT:
+ /* b is the new btree root: */
+ b = READ_ONCE(as->b);
+
+ if (!six_trylock_read(&b->lock)) {
+ mutex_unlock(&c->async_split_lock);
+ six_lock_read(&b->lock);
+ six_unlock_read(&b->lock);
+ goto retry;
+ }
+
+ BUG_ON(c->btree_roots[b->btree_id].as != as);
+ c->btree_roots[b->btree_id].as = NULL;
+
+ bch_btree_set_root_ondisk(c, b);
+
/*
- * Instead of updating an interior node we set a new btree root
- * (synchronously) - nothing to do here:
+ * We don't have to wait on anything here (before
+ * async_split_pointers_written frees the old nodes ondisk) -
+ * we've ensured that the very next journal write will have the
+ * pointer to the new root, and before the allocator can reuse
+ * the old nodes it'll have to do a journal commit:
*/
+ six_unlock_read(&b->lock);
}
-
mutex_unlock(&c->async_split_lock);
- continue_at(cl, async_split_update_done, system_wq);
+ continue_at(cl, async_split_pointers_written, system_wq);
+}
+
+/*
+ * We're updating @b with pointers to nodes that haven't finished writing yet:
+ * block @b from being written until @as completes
+ */
+static void async_split_updated_btree(struct async_split *as,
+ struct btree *b)
+{
+ mutex_lock(&as->c->async_split_lock);
+
+ BUG_ON(as->mode != ASYNC_SPLIT_NO_UPDATE);
+ BUG_ON(!btree_node_dirty(b));
+
+ as->mode = ASYNC_SPLIT_UPDATING_BTREE;
+ as->b = b;
+ list_add(&as->list, &b->write_blocked);
+
+ mutex_unlock(&as->c->async_split_lock);
+
+ continue_at(&as->cl, async_split_nodes_written, system_wq);
+}
+
+static void async_split_updated_root(struct async_split *as,
+ struct btree *b)
+{
+ struct btree_root *r = &as->c->btree_roots[b->btree_id];
+
+ /*
+ * XXX: if there's an outstanding async_split updating the root, we
+ * have to do the dance with the old one
+ */
+
+ mutex_lock(&as->c->async_split_lock);
+
+ if (r->as) {
+ BUG_ON(r->as->mode != ASYNC_SPLIT_UPDATING_ROOT);
+
+ r->as->b = NULL;
+ r->as->mode = ASYNC_SPLIT_UPDATING_AS;
+ r->as->parent_as = as;
+ closure_get(&as->cl);
+ }
+
+ BUG_ON(as->mode != ASYNC_SPLIT_NO_UPDATE);
+ as->mode = ASYNC_SPLIT_UPDATING_ROOT;
+ as->b = b;
+ r->as = as;
+
+ mutex_unlock(&as->c->async_split_lock);
+
+ continue_at(&as->cl, async_split_nodes_written, system_wq);
+}
+
+/*
+ * @b is being split/rewritten: it may have pointers to not-yet-written btree
+ * nodes and thus outstanding async_splits - redirect @b's async_splits to point
+ * to this async_split:
+ */
+static void async_split_will_free_node(struct async_split *as,
+ struct btree *b)
+{
+ mutex_lock(&as->c->async_split_lock);
+
+ while (!list_empty(&b->write_blocked)) {
+ struct async_split *p =
+ list_first_entry(&b->write_blocked,
+ struct async_split, list);
+
+ BUG_ON(p->mode != ASYNC_SPLIT_UPDATING_BTREE);
+
+ p->mode = ASYNC_SPLIT_UPDATING_AS;
+ list_del(&p->list);
+ p->b = NULL;
+ p->parent_as = as;
+ closure_get(&as->cl);
+ }
+
+ mutex_unlock(&as->c->async_split_lock);
}
static void btree_node_interior_verify(struct btree *b)
@@ -1092,6 +1205,7 @@ bch_btree_insert_keys_interior(struct btree *b,
struct btree_node_iter *node_iter = &iter->node_iters[b->level];
const struct bkey_format *f = &b->keys.format;
struct bkey_packed *k;
+ bool inserted = false;
BUG_ON(!btree_node_intent_locked(iter, btree_node_root(b)->level));
BUG_ON(!b->level);
@@ -1107,14 +1221,6 @@ bch_btree_insert_keys_interior(struct btree *b,
return BTREE_INSERT_NEED_SPLIT;
}
- /* not using the journal reservation, drop it now before blocking: */
- bch_journal_res_put(&iter->c->journal, &as->res, NULL);
-
- mutex_lock(&iter->c->async_split_lock);
- as->b = b;
- list_add(&as->list, &b->write_blocked);
- mutex_unlock(&iter->c->async_split_lock);
-
while (!bch_keylist_empty(insert_keys)) {
struct bkey_i *insert = bch_keylist_front(insert_keys);
@@ -1129,9 +1235,13 @@ bch_btree_insert_keys_interior(struct btree *b,
bch_insert_fixup_btree_ptr(iter, b, insert,
node_iter, &res->disk_res);
+ inserted = true;
bch_keylist_dequeue(insert_keys);
}
+ BUG_ON(!inserted);
+ async_split_updated_btree(as, b);
+
btree_node_unlock_write(b, iter);
/*
@@ -1143,8 +1253,6 @@ bch_btree_insert_keys_interior(struct btree *b,
(bkey_cmp_left_packed(f, k, iter->pos) >= 0))
;
- continue_at_noreturn(&as->cl, async_split_writes_done, system_wq);
-
btree_node_interior_verify(b);
return BTREE_INSERT_OK;
@@ -1354,10 +1462,10 @@ static void btree_split_insert_keys(struct btree_iter *iter, struct btree *b,
btree_node_interior_verify(b);
}
-static int btree_split(struct btree *b, struct btree_iter *iter,
- struct keylist *insert_keys,
- struct btree_reserve *reserve,
- struct async_split *as)
+static void btree_split(struct btree *b, struct btree_iter *iter,
+ struct keylist *insert_keys,
+ struct btree_reserve *reserve,
+ struct async_split *as)
{
struct cache_set *c = iter->c;
struct btree *parent = iter->nodes[b->level + 1];
@@ -1365,28 +1473,11 @@ static int btree_split(struct btree *b, struct btree_iter *iter,
uint64_t start_time = local_clock();
unsigned u64s_to_insert = b->level
? bch_keylist_nkeys(insert_keys) : 0;
- int ret;
BUG_ON(!parent && (b != btree_node_root(b)));
BUG_ON(!btree_node_intent_locked(iter, btree_node_root(b)->level));
- mutex_lock(&c->async_split_lock);
- while (!list_empty(&b->write_blocked)) {
- struct async_split *p =
- list_first_entry(&b->write_blocked,
- struct async_split, list);
-
- /*
- * If there were async splits blocking this node from being
- * written, we can't make the new replacement nodes visible
- * until those async splits finish - so point them at us:
- */
- list_del(&p->list);
- p->b = NULL;
- p->as = as;
- closure_get(&as->cl);
- }
- mutex_unlock(&c->async_split_lock);
+ async_split_will_free_node(as, b);
n1 = btree_node_alloc_replacement(c, b, reserve);
@@ -1473,38 +1564,19 @@ static int btree_split(struct btree *b, struct btree_iter *iter,
bch_btree_node_write(n1, &as->cl, NULL);
- bch_pending_btree_node_free_init(c, as, b);
+ bch_btree_node_free_start(c, as, b);
/* New nodes all written, now make them visible: */
if (parent) {
/* Split a non root node */
- ret = bch_btree_insert_node(parent, iter, &as->parent_keys,
- reserve, as);
- if (ret)
- goto err;
+ bch_btree_insert_node(parent, iter, &as->parent_keys,
+ reserve, as);
+ } else if (n3) {
+ bch_btree_set_root(iter, n3, as, reserve);
} else {
- /* Wait on journal flush and btree node writes: */
- closure_sync(&as->cl);
-
- /* Check for journal error after waiting on the journal flush: */
- if (bch_journal_error(&c->journal) ||
- test_bit(CACHE_SET_BTREE_WRITE_ERROR, &c->flags))
- goto err;
-
- if (n3) {
- ret = bch_btree_set_root(iter, n3, &as->res, reserve);
- if (ret)
- goto err;
- } else {
- /* Root filled up but didn't need to be split */
- ret = bch_btree_set_root(iter, n1, &as->res, reserve);
- if (ret)
- goto err;
- }
-
- continue_at_noreturn(&as->cl, async_split_writes_done,
- system_wq);
+ /* Root filled up but didn't need to be split */
+ bch_btree_set_root(iter, n1, as, reserve);
}
btree_open_bucket_put(c, n1);
@@ -1522,7 +1594,7 @@ static int btree_split(struct btree *b, struct btree_iter *iter,
* We have to free the node first because the bch_iter_node_replace()
* calls will drop _our_ iterator's reference - and intent lock - to @b.
*/
- bch_btree_node_free(iter, b);
+ bch_btree_node_free_inmem(iter, b);
/* Successful split, update the iterator to point to the new nodes: */
@@ -1533,20 +1605,6 @@ static int btree_split(struct btree *b, struct btree_iter *iter,
bch_btree_iter_node_replace(iter, n1);
bch_time_stats_update(&c->btree_split_time, start_time);
- return 0;
-err:
- /* IO error: */
- if (n3) {
- bch_btree_node_free_never_inserted(c, n3);
- six_unlock_intent(&n3->lock);
- }
- if (n2) {
- bch_btree_node_free_never_inserted(c, n2);
- six_unlock_intent(&n2->lock);
- }
- bch_btree_node_free_never_inserted(c, n1);
- six_unlock_intent(&n1->lock);
- return -EIO;
}
/**
@@ -1562,11 +1620,11 @@ err:
* If a split occurred, this function will return early. This can only happen
* for leaf nodes -- inserts into interior nodes have to be atomic.
*/
-int bch_btree_insert_node(struct btree *b,
- struct btree_iter *iter,
- struct keylist *insert_keys,
- struct btree_reserve *reserve,
- struct async_split *as)
+void bch_btree_insert_node(struct btree *b,
+ struct btree_iter *iter,
+ struct keylist *insert_keys,
+ struct btree_reserve *reserve,
+ struct async_split *as)
{
BUG_ON(!b->level);
BUG_ON(!reserve || !as);
@@ -1574,9 +1632,10 @@ int bch_btree_insert_node(struct btree *b,
switch (bch_btree_insert_keys_interior(b, iter, insert_keys,
as, reserve)) {
case BTREE_INSERT_OK:
- return 0;
+ break;
case BTREE_INSERT_NEED_SPLIT:
- return btree_split(b, iter, insert_keys, reserve, as);
+ btree_split(b, iter, insert_keys, reserve, as);
+ break;
default:
BUG();
}
@@ -1619,7 +1678,8 @@ static int bch_btree_split_leaf(struct btree_iter *iter, unsigned flags)
goto out_put_reserve;
}
- ret = btree_split(b, iter, NULL, reserve, as);
+ btree_split(b, iter, NULL, reserve, as);
+ ret = 0;
out_put_reserve:
bch_btree_reserve_put(c, reserve);
@@ -2043,7 +2103,6 @@ int bch_btree_node_rewrite(struct btree *b, struct btree_iter *iter, bool wait)
struct btree *n, *parent = iter->nodes[b->level + 1];
struct btree_reserve *reserve;
struct async_split *as;
- int ret;
iter->locks_want = BTREE_MAX_DEPTH;
if (!bch_btree_iter_upgrade(iter))
@@ -2068,32 +2127,20 @@ int bch_btree_node_rewrite(struct btree *b, struct btree_iter *iter, bool wait)
trace_bcache_btree_gc_rewrite_node(b);
bch_btree_node_write(n, &as->cl, NULL);
- bch_pending_btree_node_free_init(c, as, b);
+
+ bch_btree_node_free_start(c, as, b);
if (parent) {
- ret = bch_btree_insert_node(parent, iter,
+ bch_btree_insert_node(parent, iter,
&keylist_single(&n->key),
reserve, as);
- BUG_ON(ret);
} else {
- closure_sync(&as->cl);
-
- if (bch_journal_error(&c->journal) ||
- btree_node_write_error(n)) {
- bch_btree_node_free_never_inserted(c, n);
- six_unlock_intent(&n->lock);
- return -EIO;
- }
-
- bch_btree_set_root(iter, n, &as->res, reserve);
-
- continue_at_noreturn(&as->cl, async_split_writes_done,
- system_wq);
+ bch_btree_set_root(iter, n, as, reserve);
}
btree_open_bucket_put(iter->c, n);
- bch_btree_node_free(iter, b);
+ bch_btree_node_free_inmem(iter, b);
BUG_ON(!bch_btree_iter_node_replace(iter, n));
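[Reviewer note: the write_blocked list used throughout btree_update.c is what pins a dirty interior node: its write stays deferred until every async_split blocking it completes (the list_empty(&b->write_blocked) checks above). A standalone model of that gating follows, with a plain counter standing in for the list.]

#include <stdbool.h>
#include <stdio.h>

struct blocked_node {
	int	blockers;	/* stands in for the write_blocked list */
	bool	dirty;
};

/* async_split_updated_btree(): list_add(&as->list, &b->write_blocked) */
static void split_blocks_write(struct blocked_node *b)	{ b->blockers++; }

/* async_split_nodes_written(): list_del(&as->list) */
static void split_completes(struct blocked_node *b)	{ b->blockers--; }

static bool try_write(struct blocked_node *b)
{
	if (!b->dirty || b->blockers)
		return false;	/* writing now could put pointers to
				 * unwritten children on disk */
	b->dirty = false;	/* ...issue the write here... */
	return true;
}

int main(void)
{
	struct blocked_node b = { 0, true };

	split_blocks_write(&b);
	printf("write issued: %d\n", try_write(&b));	/* 0: still blocked */
	split_completes(&b);
	printf("write issued: %d\n", try_write(&b));	/* 1 */
	return 0;
}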
diff --git a/drivers/md/bcache/btree_update.h b/drivers/md/bcache/btree_update.h
index ff2b42b32c5f..c8de69e8c47c 100644
--- a/drivers/md/bcache/btree_update.h
+++ b/drivers/md/bcache/btree_update.h
@@ -25,6 +25,11 @@ bool bch_btree_node_format_fits(struct btree *, struct bkey_format *);
/* Btree node freeing/allocation: */
+/*
+ * Tracks a btree node that has been (or is about to be) freed in memory, but
+ * has _not_ yet been freed on disk (because the write that makes the new
+ * node(s) visible and frees the old hasn't completed yet)
+ */
struct pending_btree_node_free {
struct list_head list;
bool index_update_done;
@@ -32,22 +37,58 @@ struct pending_btree_node_free {
__BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX);
};
+/*
+ * Tracks an in progress split/rewrite of a btree node and the update to the
+ * parent node:
+ *
+ * When we split/rewrite a node, we do all the updates in memory without
+ * waiting for any writes to complete - we allocate the new node(s) and update
+ * the parent node, possibly recursively up to the root.
+ *
+ * The end result is that we have one or more new nodes being written -
+ * possibly several, if there were multiple splits - and then a write (updating
+ * an interior node) which will make all these new nodes visible.
+ *
+ * Additionally, as we split/rewrite nodes we free the old nodes - but the old
+ * nodes can't be freed (their space on disk can't be reclaimed) until the
+ * update to the interior node that makes the new node visible completes -
+ * until then, the old nodes are still reachable on disk.
+ *
+ */
struct async_split {
struct closure cl;
struct cache_set *c;
+ enum {
+ ASYNC_SPLIT_NO_UPDATE,
+ ASYNC_SPLIT_UPDATING_BTREE,
+ ASYNC_SPLIT_UPDATING_ROOT,
+ ASYNC_SPLIT_UPDATING_AS,
+ } mode;
+
+ /*
+ * ASYNC_SPLIT_UPDATING_BTREE:
+ * @b - node we're blocking from being written
+ * @list - corresponds to @b->write_blocked
+ */
struct btree *b;
- struct async_split *as;
struct list_head list;
- /* for setting a new btree root: */
- struct journal_res res;
+ /*
+ * ASYNC_SPLIT_UPDATING_AS: btree node we updated was freed, so now
+ * we're blocking another async_split
+ * @parent_as - async_split that's waiting on our nodes to finish
+ * writing, before it can make new nodes visible on disk
+ * @wait - list of child async_splits that are waiting on this
+ * async_split to make all the new nodes visible before they can free
+ * their old btree nodes
+ */
+ struct async_split *parent_as;
+ struct closure_waitlist wait;
struct journal_entry_pin journal;
- struct closure_waitlist wait;
-
struct pending_btree_node_free pending[BTREE_MAX_DEPTH + GC_MERGE_NODES];
unsigned nr_pending;
@@ -61,11 +102,11 @@ struct async_split {
u64 inline_keys[BKEY_BTREE_PTR_U64s_MAX * 3];
};
-void bch_pending_btree_node_free_init(struct cache_set *, struct async_split *,
- struct btree *);
+void bch_btree_node_free_start(struct cache_set *, struct async_split *,
+ struct btree *);
+void bch_btree_node_free_inmem(struct btree_iter *, struct btree *);
void bch_btree_node_free_never_inserted(struct cache_set *, struct btree *);
-void bch_btree_node_free(struct btree_iter *, struct btree *);
void btree_open_bucket_put(struct cache_set *c, struct btree *);
@@ -153,9 +194,9 @@ static inline size_t bch_btree_keys_u64s_remaining(struct cache_set *c,
#endif
}
-int bch_btree_insert_node(struct btree *, struct btree_iter *,
- struct keylist *, struct btree_reserve *,
- struct async_split *as);
+void bch_btree_insert_node(struct btree *, struct btree_iter *,
+ struct keylist *, struct btree_reserve *,
+ struct async_split *as);
/* Normal update interface: */
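[Reviewer note: taken together, the mode enum above is a small state machine. Here is a user-space model of the legal transitions, matching the BUG_ON()s in btree_update.c; assert() stands in for BUG_ON(), and one helper models both redirect sites (async_split_will_free_node() and the root dance in async_split_updated_root()).]

#include <assert.h>
#include <stdio.h>

/* Standalone model of the async_split mode transitions in this patch */
enum mode {
	ASYNC_SPLIT_NO_UPDATE,		/* allocated, index update not done */
	ASYNC_SPLIT_UPDATING_BTREE,	/* blocking an interior node's write */
	ASYNC_SPLIT_UPDATING_ROOT,	/* will set the on-disk root */
	ASYNC_SPLIT_UPDATING_AS,	/* redirected at another async_split */
};

struct as_sketch {
	enum mode		mode;
	struct as_sketch	*parent_as;
};

/*
 * async_split_updated_btree() / async_split_updated_root(): the only
 * transitions out of NO_UPDATE
 */
static void updated(struct as_sketch *as, enum mode m)
{
	assert(as->mode == ASYNC_SPLIT_NO_UPDATE);
	assert(m == ASYNC_SPLIT_UPDATING_BTREE ||
	       m == ASYNC_SPLIT_UPDATING_ROOT);
	as->mode = m;
}

/*
 * async_split_will_free_node() (from UPDATING_BTREE) and the root dance
 * in async_split_updated_root() (from UPDATING_ROOT): both end up
 * chained to a parent async_split
 */
static void redirect(struct as_sketch *p, struct as_sketch *as)
{
	assert(p->mode == ASYNC_SPLIT_UPDATING_BTREE ||
	       p->mode == ASYNC_SPLIT_UPDATING_ROOT);
	p->mode = ASYNC_SPLIT_UPDATING_AS;
	p->parent_as = as;
}

int main(void)
{
	struct as_sketch child = { ASYNC_SPLIT_NO_UPDATE, NULL };
	struct as_sketch as    = { ASYNC_SPLIT_NO_UPDATE, NULL };

	updated(&child, ASYNC_SPLIT_UPDATING_BTREE);
	redirect(&child, &as);	/* its node is being split out from under it */
	updated(&as, ASYNC_SPLIT_UPDATING_ROOT);
	printf("child waits on parent_as %p\n", (void *) child.parent_as);
	return 0;
}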
diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c
index 5a133c3f2a2f..5994c08c48ae 100644
--- a/drivers/md/bcache/journal.c
+++ b/drivers/md/bcache/journal.c
@@ -1485,7 +1485,6 @@ static void journal_write_locked(struct closure *cl)
struct journal *j = container_of(cl, struct journal, io);
struct cache_set *c = container_of(j, struct cache_set, journal);
struct cache *ca;
- struct btree *b;
struct journal_write *w = journal_cur_write(j);
struct bkey_s_extent e = bkey_i_to_s_extent(&j->key);
struct bch_extent_ptr *ptr;
@@ -1504,14 +1503,6 @@ static void journal_write_locked(struct closure *cl)
clear_bit(JOURNAL_DIRTY, &j->flags);
cancel_delayed_work(&j->write_work);
- spin_lock(&c->btree_root_lock);
-
- for (i = 0; i < BTREE_ID_NR; i++)
- if ((b = c->btree_roots[i]))
- bch_journal_add_btree_root(j, i, &b->key, b->level);
-
- spin_unlock(&c->btree_root_lock);
-
bch_journal_add_prios(j);
/* So last_seq is up to date */
@@ -1525,6 +1516,18 @@ static void journal_write_locked(struct closure *cl)
w->data->version = cpu_to_le32(BCACHE_JSET_VERSION);
w->data->last_seq = cpu_to_le64(last_seq(j));
+ /* _after_ last_seq(): */
+ spin_lock(&c->btree_root_lock);
+
+ for (i = 0; i < BTREE_ID_NR; i++) {
+ struct btree_root *r = &c->btree_roots[i];
+
+ if (r->alive)
+ bch_journal_add_btree_root(j, i, &r->key, r->level);
+ }
+
+ spin_unlock(&c->btree_root_lock);
+
SET_JSET_BIG_ENDIAN(w->data, CPU_BIG_ENDIAN);
SET_JSET_CSUM_TYPE(w->data, c->opts.metadata_checksum);
@@ -2170,6 +2173,10 @@ int bch_journal_move(struct cache *ca)
* journal entries written to ca become stale and are no
* longer needed.
*/
+
+ /*
+ * XXX: switch to normal journal reclaim machinery
+ */
bch_btree_flush(c);
/*
diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c
index 0a1daed0ff74..6229b633f9b1 100644
--- a/drivers/md/bcache/sysfs.c
+++ b/drivers/md/bcache/sysfs.c
@@ -524,9 +524,9 @@ static unsigned bch_root_usage(struct cache_set *c)
do {
six_unlock_read(&b->lock);
lock_root:
- b = c->btree_roots[BTREE_ID_EXTENTS];
+ b = c->btree_roots[BTREE_ID_EXTENTS].b;
six_lock_read(&b->lock);
- } while (b != c->btree_roots[BTREE_ID_EXTENTS]);
+ } while (b != c->btree_roots[BTREE_ID_EXTENTS].b);
for_each_btree_node_key(&b->keys, k, &iter)
bytes += bkey_bytes(k);
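[Reviewer note: the lock_root loop in this hunk is the usual take-the-lock-then-recheck pattern for a root that a concurrent split may replace. A standalone model, with a pthread rwlock standing in for the six lock's read side:]

#include <pthread.h>
#include <stdio.h>

struct node { pthread_rwlock_t lock; };

static struct node n1 = { PTHREAD_RWLOCK_INITIALIZER };
static struct node *root = &n1;	/* may be swapped by a concurrent split */

/*
 * Take a read lock on what we believe is the root, then confirm it
 * still is; if the root moved while we slept on the lock, drop it and
 * retry (mirrors bch_root_usage()'s do/while above).
 */
static struct node *lock_root_read(void)
{
	struct node *b;

	for (;;) {
		b = root;			/* snapshot */
		pthread_rwlock_rdlock(&b->lock);
		if (b == root)
			return b;		/* still the root: done */
		pthread_rwlock_unlock(&b->lock);
	}
}

int main(void)
{
	struct node *b = lock_root_read();

	printf("locked root %p\n", (void *) b);
	pthread_rwlock_unlock(&b->lock);
	return 0;
}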
@@ -721,7 +721,7 @@ SHOW(bch_cache_set)
if (attr == &sysfs_alloc_debug)
return show_cache_set_alloc_debug(c, buf);
- sysfs_print(tree_depth, c->btree_roots[BTREE_ID_EXTENTS]->level);
+ sysfs_print(tree_depth, c->btree_roots[BTREE_ID_EXTENTS].b->level);
sysfs_print(root_usage_percent, bch_root_usage(c));
if (attr == &sysfs_compression_stats)