-rw-r--r--   drivers/md/bcache/bcache.h       |   3
-rw-r--r--   drivers/md/bcache/btree_cache.c  |  13
-rw-r--r--   drivers/md/bcache/btree_cache.h  |   2
-rw-r--r--   drivers/md/bcache/btree_gc.c     |  26
-rw-r--r--   drivers/md/bcache/btree_io.c     |  50
-rw-r--r--   drivers/md/bcache/btree_iter.c   |   4
-rw-r--r--   drivers/md/bcache/btree_types.h  |  12
-rw-r--r--   drivers/md/bcache/btree_update.c | 359
-rw-r--r--   drivers/md/bcache/btree_update.h |  63
-rw-r--r--   drivers/md/bcache/journal.c      |  25
-rw-r--r--   drivers/md/bcache/sysfs.c        |   6
11 files changed, 347 insertions, 216 deletions
diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index 18d1e13b30c1..e1b3eea39395 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -497,7 +497,6 @@ enum { CACHE_SET_GC_STOPPING, CACHE_SET_GC_FAILURE, CACHE_SET_BDEV_MOUNTED, - CACHE_SET_BTREE_WRITE_ERROR, }; struct btree_debug { @@ -562,7 +561,7 @@ struct cache_set { /* BTREE CACHE */ struct bio_set btree_read_bio; - struct btree *btree_roots[BTREE_ID_NR]; + struct btree_root btree_roots[BTREE_ID_NR]; spinlock_t btree_root_lock; bool btree_cache_table_init_done; diff --git a/drivers/md/bcache/btree_cache.c b/drivers/md/bcache/btree_cache.c index a005f4b30279..5a5674af3964 100644 --- a/drivers/md/bcache/btree_cache.c +++ b/drivers/md/bcache/btree_cache.c @@ -21,13 +21,13 @@ void bch_recalc_btree_reserve(struct cache_set *c) { unsigned i, reserve = 16; - if (!c->btree_roots[0]) + if (!c->btree_roots[0].b) reserve += 8; for (i = 0; i < BTREE_ID_NR; i++) - if (c->btree_roots[i]) + if (c->btree_roots[i].b) reserve += min_t(unsigned, 1, - c->btree_roots[i]->level) * 8; + c->btree_roots[i].b->level) * 8; c->btree_cache_reserve = reserve; } @@ -165,6 +165,9 @@ static int mca_reap_notrace(struct cache_set *c, struct btree *b, bool flush) if (!six_trylock_write(&b->lock)) goto out_unlock_intent; + if (btree_node_write_error(b)) + goto out_unlock; + if (!list_empty(&b->write_blocked)) goto out_unlock; @@ -353,8 +356,8 @@ void bch_btree_cache_free(struct cache_set *c) #endif for (i = 0; i < BTREE_ID_NR; i++) - if (c->btree_roots[i]) - list_add(&c->btree_roots[i]->list, &c->btree_cache); + if (c->btree_roots[i].b) + list_add(&c->btree_roots[i].b->list, &c->btree_cache); list_splice(&c->btree_cache_freeable, &c->btree_cache); diff --git a/drivers/md/bcache/btree_cache.h b/drivers/md/bcache/btree_cache.h index 03a1cf5f30df..e774ddf30e02 100644 --- a/drivers/md/bcache/btree_cache.h +++ b/drivers/md/bcache/btree_cache.h @@ -46,6 +46,6 @@ static inline unsigned btree_blocks(struct cache_set *c) return CACHE_BTREE_NODE_SIZE(&c->disk_sb) >> c->block_bits; } -#define btree_node_root(_b) ((_b)->c->btree_roots[(_b)->btree_id]) +#define btree_node_root(_b) ((_b)->c->btree_roots[(_b)->btree_id].b) #endif /* _BCACHE_BTREE_CACHE_H */ diff --git a/drivers/md/bcache/btree_gc.c b/drivers/md/bcache/btree_gc.c index 3257c6221ddb..975293880358 100644 --- a/drivers/md/bcache/btree_gc.c +++ b/drivers/md/bcache/btree_gc.c @@ -208,7 +208,7 @@ static int bch_gc_btree(struct cache_set *c, enum btree_id btree_id) spin_lock(&c->btree_root_lock); - b = c->btree_roots[btree_id]; + b = c->btree_roots[btree_id].b; __bch_btree_mark_key(c, BKEY_TYPE_BTREE, bkey_i_to_s_c(&b->key)); gc_pos_set(c, gc_pos_btree_root(b->btree_id)); @@ -398,7 +398,7 @@ void bch_gc(struct cache_set *c) /* Walk btree: */ while (c->gc_pos.phase < (int) BTREE_ID_NR) { - int ret = c->btree_roots[c->gc_pos.phase] + int ret = c->btree_roots[c->gc_pos.phase].b ? 
bch_gc_btree(c, (int) c->gc_pos.phase) : 0; @@ -454,7 +454,6 @@ static void bch_coalesce_nodes(struct btree *old_nodes[GC_MERGE_NODES], struct keylist keylist; struct bkey_format_state format_state; struct bkey_format new_format; - int ret; memset(new_nodes, 0, sizeof(new_nodes)); bch_keylist_init(&keylist, NULL, 0); @@ -592,7 +591,7 @@ static void bch_coalesce_nodes(struct btree *old_nodes[GC_MERGE_NODES], struct bkey_i delete; unsigned j; - bch_pending_btree_node_free_init(c, as, old_nodes[i]); + bch_btree_node_free_start(c, as, old_nodes[i]); for (j = 0; j < nr_new_nodes; j++) if (!bkey_cmp(old_nodes[i]->key.k.p, @@ -616,9 +615,7 @@ next: bch_keylist_add_in_order(&keylist, &new_nodes[i]->key); /* Insert the newly coalesced nodes */ - ret = bch_btree_insert_node(parent, iter, &keylist, res, as); - if (ret) - goto err; + bch_btree_insert_node(parent, iter, &keylist, res, as); BUG_ON(!bch_keylist_empty(&keylist)); @@ -631,7 +628,7 @@ next: /* Free the old nodes and update our sliding window */ for (i = 0; i < nr_old_nodes; i++) { - bch_btree_node_free(iter, old_nodes[i]); + bch_btree_node_free_inmem(iter, old_nodes[i]); six_unlock_intent(&old_nodes[i]->lock); /* @@ -651,13 +648,6 @@ next: out: bch_keylist_free(&keylist); bch_btree_reserve_put(c, res); - return; -err: - for (i = 0; i < nr_new_nodes; i++) { - bch_btree_node_free_never_inserted(c, new_nodes[i]); - six_unlock_intent(&new_nodes[i]->lock); - } - goto out; } static int bch_coalesce_btree(struct cache_set *c, enum btree_id btree_id) @@ -751,7 +741,7 @@ static void bch_coalesce(struct cache_set *c) trace_bcache_gc_coalesce_start(c); for (id = 0; id < BTREE_ID_NR; id++) { - int ret = c->btree_roots[id] + int ret = c->btree_roots[id].b ? bch_coalesce_btree(c, id) : 0; @@ -848,7 +838,7 @@ static void bch_initial_gc_btree(struct cache_set *c, enum btree_id id) btree_node_range_checks_init(&r); - if (!c->btree_roots[id]) + if (!c->btree_roots[id].b) return; for_each_btree_node(&iter, c, id, POS_MIN, b) { @@ -870,7 +860,7 @@ static void bch_initial_gc_btree(struct cache_set *c, enum btree_id id) bch_btree_iter_unlock(&iter); __bch_btree_mark_key(c, BKEY_TYPE_BTREE, - bkey_i_to_s_c(&c->btree_roots[id]->key)); + bkey_i_to_s_c(&c->btree_roots[id].b->key)); } int bch_initial_gc(struct cache_set *c, struct list_head *journal) diff --git a/drivers/md/bcache/btree_io.c b/drivers/md/bcache/btree_io.c index f1e9c57e6e3e..4f3a7a2cdb76 100644 --- a/drivers/md/bcache/btree_io.c +++ b/drivers/md/bcache/btree_io.c @@ -499,9 +499,7 @@ static void btree_node_write_done(struct closure *cl) if (btree_node_write_error(b)) bch_journal_halt(&c->journal); - /* XXX: pin btree node in memory somehow */ - if (!btree_node_write_error(b)) - bch_btree_complete_write(c, b, w); + bch_btree_complete_write(c, b, w); if (btree_node_dirty(b) && c->btree_flush_delay) schedule_delayed_work(&b->work, c->btree_flush_delay * HZ); @@ -516,10 +514,8 @@ static void btree_node_write_endio(struct bio *bio) struct bch_write_bio *wbio = to_wbio(bio); if (cache_fatal_io_err_on(bio->bi_error, wbio->bio.ca, "btree write") || - bch_meta_write_fault("btree")) { + bch_meta_write_fault("btree")) set_btree_node_write_error(b); - set_bit(CACHE_SET_BTREE_WRITE_ERROR, &b->c->flags); - } if (wbio->orig) bio_endio(wbio->orig); @@ -580,6 +576,29 @@ static void do_btree_node_write(struct closure *cl) BUG_ON(b->written + blocks_to_write > btree_blocks(c)); + /* + * We handle btree write errors by immediately halting the journal - + * after we've done that, we can't issue any subsequent btree writes 
+ * because they might have pointers to new nodes that failed to write. + * + * Furthermore, there's no point in doing any more btree writes because + * with the journal stopped, we're never going to update the journal to + * reflect that those writes were done and the data flushed from the + * journal: + * + * Make sure to update b->written so bch_btree_init_next() doesn't + * break: + */ + if (bch_journal_error(&b->c->journal)) { + struct btree_write *w = btree_prev_write(b); + + set_btree_node_write_error(b); + b->written += blocks_to_write; + bch_btree_complete_write(c, b, w); + + closure_return_with_destructor(cl, btree_node_write_unlock); + } + bio = bio_alloc_bioset(GFP_NOIO, btree_pages(c), &c->bio_write); wbio = to_wbio(bio); @@ -663,15 +682,22 @@ void __bch_btree_node_write(struct btree *b, struct closure *parent, if (!test_and_clear_bit(BTREE_NODE_dirty, &b->flags)) return; - BUG_ON(!list_empty(&b->write_blocked)); - /* * io_mutex ensures only a single IO in flight to a btree node at a * time, and also protects use of the b->io closure. * do_btree_node_write() will drop it asynchronously. */ down(&b->io_mutex); + + BUG_ON(!list_empty(&b->write_blocked)); #if 0 + /* + * This is an optimization for when journal flushing races with the + * btree node being written for some other reason, and the write the + * journal wanted to flush has already happened - in that case we'd + * prefer not to write a mostly empty bset. It seemed to be buggy, + * though: + */ if (idx_to_write != -1 && idx_to_write != btree_node_write_idx(b)) { up(&b->io_mutex); @@ -699,6 +725,8 @@ void bch_btree_node_write(struct btree *b, struct closure *parent, static void bch_btree_node_write_dirty(struct btree *b, struct closure *parent) { six_lock_read(&b->lock); + BUG_ON(b->level); + __bch_btree_node_write(b, parent, -1); six_unlock_read(&b->lock); } @@ -752,7 +780,11 @@ restart: for (; i < tbl->size; i++) rht_for_each_entry_rcu(b, pos, tbl, i, hash) - if (btree_node_dirty(b)) { + /* + * XXX - locking for b->level, when called from + * bch_journal_move() + */ + if (!b->level && btree_node_dirty(b)) { rcu_read_unlock(); bch_btree_node_write_dirty(b, &cl); dropped_lock = true; diff --git a/drivers/md/bcache/btree_iter.c b/drivers/md/bcache/btree_iter.c index 6b9477f740e4..71e0ec18871e 100644 --- a/drivers/md/bcache/btree_iter.c +++ b/drivers/md/bcache/btree_iter.c @@ -357,12 +357,12 @@ static void btree_iter_lock_root(struct btree_iter *iter, struct bpos pos) memset(iter->nodes, 0, sizeof(iter->nodes)); while (1) { - struct btree *b = iter->c->btree_roots[iter->btree_id]; + struct btree *b = iter->c->btree_roots[iter->btree_id].b; iter->level = b->level; if (btree_node_lock(b, iter, iter->level, - (b != iter->c->btree_roots[iter->btree_id]))) { + (b != iter->c->btree_roots[iter->btree_id].b))) { btree_iter_node_set(iter, b, pos); break; } diff --git a/drivers/md/bcache/btree_types.h b/drivers/md/bcache/btree_types.h index f3af91e3d23c..05b70dc53e37 100644 --- a/drivers/md/bcache/btree_types.h +++ b/drivers/md/bcache/btree_types.h @@ -14,6 +14,7 @@ struct cache_set; struct open_bucket; +struct async_split; struct btree_write { unsigned index; @@ -22,6 +23,17 @@ struct btree_write { struct closure_waitlist wait; }; +struct btree_root { + struct btree *b; + + struct async_split *as; + + /* On disk root - see async splits: */ + __BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX); + u8 level; + u8 alive; +}; + struct btree { /* Hottest entries first */ struct rhash_head hash; diff --git a/drivers/md/bcache/btree_update.c 
b/drivers/md/bcache/btree_update.c index 2764a6470cdb..cef1f7bb8d28 100644 --- a/drivers/md/bcache/btree_update.c +++ b/drivers/md/bcache/btree_update.c @@ -17,6 +17,9 @@ #include <linux/random.h> #include <trace/events/bcache.h> +static void async_split_updated_root(struct async_split *, + struct btree *); + /* Calculate ideal packed bkey format for new btree nodes: */ void __bch_btree_calc_format(struct bkey_format_state *s, struct btree *b) @@ -88,9 +91,12 @@ bool bch_btree_node_format_fits(struct btree *b, struct bkey_format *new_f) /* Btree node freeing/allocation: */ -void bch_pending_btree_node_free_init(struct cache_set *c, - struct async_split *as, - struct btree *b) +/* + * @b is going to be freed, allocate a pending_btree_node_free in @as: + */ +void bch_btree_node_free_start(struct cache_set *c, + struct async_split *as, + struct btree *b) { struct pending_btree_node_free *d; @@ -105,9 +111,16 @@ void bch_pending_btree_node_free_init(struct cache_set *c, mutex_unlock(&c->btree_node_pending_free_lock); } -static void bch_pending_btree_node_free_insert_done(struct cache_set *c, - struct btree *b, enum btree_id id, struct bkey_s_c k, - struct bucket_stats_cache_set *stats) +/* + * We're doing the index update that makes @b unreachable, update stuff to + * reflect that: + * + * Must be called _before_ async_split_updated_root() or + * async_split_updated_btree: + */ +static void bch_btree_node_free_index(struct cache_set *c, struct btree *b, + enum btree_id id, struct bkey_s_c k, + struct bucket_stats_cache_set *stats) { struct pending_btree_node_free *d; @@ -201,7 +214,7 @@ void bch_btree_node_free_never_inserted(struct cache_set *c, struct btree *b) bch_open_bucket_put(c, ob); } -void bch_btree_node_free(struct btree_iter *iter, struct btree *b) +void bch_btree_node_free_inmem(struct btree_iter *iter, struct btree *b) { bch_btree_iter_node_drop_linked(iter, b); @@ -349,7 +362,7 @@ struct btree *btree_node_alloc_replacement(struct cache_set *c, return __btree_node_alloc_replacement(c, b, new_f, reserve); } -static void __bch_btree_set_root(struct cache_set *c, struct btree *b, +static void bch_btree_set_root_inmem(struct cache_set *c, struct btree *b, struct bucket_stats_cache_set *stats) { /* Root nodes cannot be reaped */ @@ -369,6 +382,20 @@ static void __bch_btree_set_root(struct cache_set *c, struct btree *b, bch_recalc_btree_reserve(c); } +static void bch_btree_set_root_ondisk(struct cache_set *c, struct btree *b) +{ + struct btree_root *r = &c->btree_roots[b->btree_id]; + + spin_lock(&c->btree_root_lock); + + BUG_ON(b != r->b); + bkey_copy(&r->key, &b->key); + r->level = b->level; + r->alive = true; + + spin_unlock(&c->btree_root_lock); +} + /* * Only for cache set bringup, when first reading the btree roots or allocating * btree roots when initializing a new cache set: @@ -380,7 +407,8 @@ void bch_btree_set_root_initial(struct cache_set *c, struct btree *b, BUG_ON(btree_node_root(b)); - __bch_btree_set_root(c, b, &stats); + bch_btree_set_root_inmem(c, b, &stats); + bch_btree_set_root_ondisk(c, b); if (btree_reserve) bch_cache_set_stats_apply(c, &stats, &btree_reserve->disk_res); @@ -398,14 +426,13 @@ void bch_btree_set_root_initial(struct cache_set *c, struct btree *b, * is nothing new to be done. This just guarantees that there is a * journal write. 
*/ -static int bch_btree_set_root(struct btree_iter *iter, struct btree *b, - struct journal_res *res, - struct btree_reserve *btree_reserve) +static void bch_btree_set_root(struct btree_iter *iter, struct btree *b, + struct async_split *as, + struct btree_reserve *btree_reserve) { struct bucket_stats_cache_set stats = { 0 }; struct cache_set *c = iter->c; struct btree *old; - u64 seq; trace_bcache_btree_set_root(b); BUG_ON(!b->written); @@ -418,7 +445,13 @@ static int bch_btree_set_root(struct btree_iter *iter, struct btree *b, */ btree_node_lock_write(old, iter); - __bch_btree_set_root(c, b, &stats); + bch_btree_set_root_inmem(c, b, &stats); + + bch_btree_node_free_index(c, NULL, old->btree_id, + bkey_i_to_s_c(&old->key), + &stats); + + async_split_updated_root(as, b); /* * Unlock old root after new root is visible: @@ -429,21 +462,8 @@ static int bch_btree_set_root(struct btree_iter *iter, struct btree *b, */ btree_node_unlock_write(old, iter); - bch_pending_btree_node_free_insert_done(c, NULL, old->btree_id, - bkey_i_to_s_c(&old->key), - &stats); - stats.sectors_meta -= c->sb.btree_node_size; bch_cache_set_stats_apply(c, &stats, &btree_reserve->disk_res); - - /* - * Ensure new btree root is persistent (reachable via the - * journal) before returning and the caller unlocking it: - */ - seq = c->journal.seq; - bch_journal_res_put(&c->journal, res, NULL); - - return bch_journal_flush_seq(&c->journal, seq); } static struct btree *__btree_root_alloc(struct cache_set *c, unsigned level, @@ -627,9 +647,8 @@ static bool bch_insert_fixup_btree_ptr(struct btree_iter *iter, break; if (!cmp && !bkey_deleted(k)) { - bch_pending_btree_node_free_insert_done(c, b, - iter->btree_id, - u, &stats); + bch_btree_node_free_index(c, b, iter->btree_id, + u, &stats); /* * Look up pending delete, mark so that gc marks it on * the pending delete list @@ -876,25 +895,15 @@ struct async_split *__bch_async_split_alloc(struct btree *nodes[], { struct cache_set *c = iter->c; struct async_split *as; - struct journal_res res; struct journal_entry_pin_list *pin_list = NULL; unsigned i, pin_idx = UINT_MAX; - memset(&res, 0, sizeof(res)); - - /* - * must get journal res before getting a journal pin (else we deadlock) - */ - if (bch_journal_res_get(&c->journal, &res, - jset_u64s(0), jset_u64s(0))) - return NULL; - as = mempool_alloc(&c->btree_async_split_pool, GFP_NOIO); closure_init(&as->cl, &c->cl); as->c = c; + as->mode = ASYNC_SPLIT_NO_UPDATE; as->b = NULL; - as->as = NULL; - as->res = res; + as->parent_as = NULL; as->nr_pending = 0; init_llist_head(&as->wait.list); @@ -964,7 +973,7 @@ static void async_split_free(struct closure *cl) mempool_free(as, &as->c->btree_async_split_pool); } -static void async_split_update_done(struct closure *cl) +static void async_split_pointers_written(struct closure *cl) { struct async_split *as = container_of(cl, struct async_split, cl); struct cache_set *c = as->c; @@ -980,10 +989,11 @@ static void async_split_update_done(struct closure *cl) closure_return_with_destructor(cl, async_split_free); } -static void async_split_writes_done(struct closure *cl) +static void async_split_nodes_written(struct closure *cl) { struct async_split *as = container_of(cl, struct async_split, cl); struct cache_set *c = as->c; + struct btree *b; /* * We did an update to a parent node where the pointers we added pointed @@ -991,14 +1001,15 @@ static void async_split_writes_done(struct closure *cl) * been written so we can write out the update to the interior node. 
*/ - /* XXX: error handling */ retry: mutex_lock(&c->async_split_lock); + switch (as->mode) { + case ASYNC_SPLIT_NO_UPDATE: + BUG(); + case ASYNC_SPLIT_UPDATING_BTREE: + /* The usual case: */ + b = READ_ONCE(as->b); - if (as->b) { - struct btree *b = as->b; - - /* Normal case */ if (!six_trylock_read(&b->lock)) { mutex_unlock(&c->async_split_lock); six_lock_read(&b->lock); @@ -1006,6 +1017,7 @@ retry: goto retry; } + BUG_ON(!btree_node_dirty(b)); closure_wait(&btree_current_write(b)->wait, cl); list_del(&as->list); @@ -1013,29 +1025,130 @@ retry: if (list_empty(&b->write_blocked)) __bch_btree_node_write(b, NULL, -1); six_unlock_read(&b->lock); - } else if (as->as) { + break; + + case ASYNC_SPLIT_UPDATING_AS: /* * The btree node we originally updated has been freed and is * being rewritten - so we need to write anything here, we just * need to signal to that async_split that it's ok to make the * new replacement node visible: */ - closure_put(&as->as->cl); + closure_put(&as->parent_as->cl); /* * and then we have to wait on that async_split to finish: */ - closure_wait(&as->as->wait, cl); - } else { + closure_wait(&as->parent_as->wait, cl); + break; + + case ASYNC_SPLIT_UPDATING_ROOT: + /* b is the new btree root: */ + b = READ_ONCE(as->b); + + if (!six_trylock_read(&b->lock)) { + mutex_unlock(&c->async_split_lock); + six_lock_read(&b->lock); + six_unlock_read(&b->lock); + goto retry; + } + + BUG_ON(c->btree_roots[b->btree_id].as != as); + c->btree_roots[b->btree_id].as = NULL; + + bch_btree_set_root_ondisk(c, b); + /* - * Instead of updating an interior node we set a new btree root - * (synchronously) - nothing to do here: + * We don't have to wait anything anything here (before + * async_split_pointers_written frees the old nodes ondisk) - + * we've ensured that the very next journal write will have the + * pointer to the new root, and before the allocator can reuse + * the old nodes it'll have to do a journal commit: */ + six_unlock_read(&b->lock); } - mutex_unlock(&c->async_split_lock); - continue_at(cl, async_split_update_done, system_wq); + continue_at(cl, async_split_pointers_written, system_wq); +} + +/* + * We're updating @b with pointers to nodes that haven't finished writing yet: + * block @b from being written until @as completes + */ +static void async_split_updated_btree(struct async_split *as, + struct btree *b) +{ + mutex_lock(&as->c->async_split_lock); + + BUG_ON(as->mode != ASYNC_SPLIT_NO_UPDATE); + BUG_ON(!btree_node_dirty(b)); + + as->mode = ASYNC_SPLIT_UPDATING_BTREE; + as->b = b; + list_add(&as->list, &b->write_blocked); + + mutex_unlock(&as->c->async_split_lock); + + continue_at(&as->cl, async_split_nodes_written, system_wq); +} + +static void async_split_updated_root(struct async_split *as, + struct btree *b) +{ + struct btree_root *r = &as->c->btree_roots[b->btree_id]; + + /* + * XXX: if there's an outstanding async_split updating the root, we + * have to do the dance with the old one + */ + + mutex_lock(&as->c->async_split_lock); + + if (r->as) { + BUG_ON(r->as->mode != ASYNC_SPLIT_UPDATING_ROOT); + + r->as->b = NULL; + r->as->mode = ASYNC_SPLIT_UPDATING_AS; + r->as->parent_as = as; + closure_get(&as->cl); + } + + BUG_ON(as->mode != ASYNC_SPLIT_NO_UPDATE); + as->mode = ASYNC_SPLIT_UPDATING_ROOT; + as->b = b; + r->as = as; + + mutex_unlock(&as->c->async_split_lock); + + continue_at(&as->cl, async_split_nodes_written, system_wq); +} + +/* + * @b is being split/rewritten: it may have pointers to not-yet-written btree + * nodes and thus outstanding async_splits - 
redirect @b's async_splits to point + * to this async_split: + */ +static void async_split_will_free_node(struct async_split *as, + struct btree *b) +{ + mutex_lock(&as->c->async_split_lock); + + while (!list_empty(&b->write_blocked)) { + struct async_split *p = + list_first_entry(&b->write_blocked, + struct async_split, list); + + BUG_ON(p->mode != ASYNC_SPLIT_UPDATING_BTREE); + + p->mode = ASYNC_SPLIT_UPDATING_AS; + list_del(&p->list); + p->b = NULL; + p->parent_as = as; + closure_get(&as->cl); + } + + mutex_unlock(&as->c->async_split_lock); } static void btree_node_interior_verify(struct btree *b) @@ -1092,6 +1205,7 @@ bch_btree_insert_keys_interior(struct btree *b, struct btree_node_iter *node_iter = &iter->node_iters[b->level]; const struct bkey_format *f = &b->keys.format; struct bkey_packed *k; + bool inserted = false; BUG_ON(!btree_node_intent_locked(iter, btree_node_root(b)->level)); BUG_ON(!b->level); @@ -1107,14 +1221,6 @@ bch_btree_insert_keys_interior(struct btree *b, return BTREE_INSERT_NEED_SPLIT; } - /* not using the journal reservation, drop it now before blocking: */ - bch_journal_res_put(&iter->c->journal, &as->res, NULL); - - mutex_lock(&iter->c->async_split_lock); - as->b = b; - list_add(&as->list, &b->write_blocked); - mutex_unlock(&iter->c->async_split_lock); - while (!bch_keylist_empty(insert_keys)) { struct bkey_i *insert = bch_keylist_front(insert_keys); @@ -1129,9 +1235,13 @@ bch_btree_insert_keys_interior(struct btree *b, bch_insert_fixup_btree_ptr(iter, b, insert, node_iter, &res->disk_res); + inserted = true; bch_keylist_dequeue(insert_keys); } + BUG_ON(!inserted); + async_split_updated_btree(as, b); + btree_node_unlock_write(b, iter); /* @@ -1143,8 +1253,6 @@ bch_btree_insert_keys_interior(struct btree *b, (bkey_cmp_left_packed(f, k, iter->pos) >= 0)) ; - continue_at_noreturn(&as->cl, async_split_writes_done, system_wq); - btree_node_interior_verify(b); return BTREE_INSERT_OK; @@ -1354,10 +1462,10 @@ static void btree_split_insert_keys(struct btree_iter *iter, struct btree *b, btree_node_interior_verify(b); } -static int btree_split(struct btree *b, struct btree_iter *iter, - struct keylist *insert_keys, - struct btree_reserve *reserve, - struct async_split *as) +static void btree_split(struct btree *b, struct btree_iter *iter, + struct keylist *insert_keys, + struct btree_reserve *reserve, + struct async_split *as) { struct cache_set *c = iter->c; struct btree *parent = iter->nodes[b->level + 1]; @@ -1365,28 +1473,11 @@ static int btree_split(struct btree *b, struct btree_iter *iter, uint64_t start_time = local_clock(); unsigned u64s_to_insert = b->level ? 
bch_keylist_nkeys(insert_keys) : 0; - int ret; BUG_ON(!parent && (b != btree_node_root(b))); BUG_ON(!btree_node_intent_locked(iter, btree_node_root(b)->level)); - mutex_lock(&c->async_split_lock); - while (!list_empty(&b->write_blocked)) { - struct async_split *p = - list_first_entry(&b->write_blocked, - struct async_split, list); - - /* - * If there were async splits blocking this node from being - * written, we can't make the new replacement nodes visible - * until those async splits finish - so point them at us: - */ - list_del(&p->list); - p->b = NULL; - p->as = as; - closure_get(&as->cl); - } - mutex_unlock(&c->async_split_lock); + async_split_will_free_node(as, b); n1 = btree_node_alloc_replacement(c, b, reserve); @@ -1473,38 +1564,19 @@ static int btree_split(struct btree *b, struct btree_iter *iter, bch_btree_node_write(n1, &as->cl, NULL); - bch_pending_btree_node_free_init(c, as, b); + bch_btree_node_free_start(c, as, b); /* New nodes all written, now make them visible: */ if (parent) { /* Split a non root node */ - ret = bch_btree_insert_node(parent, iter, &as->parent_keys, - reserve, as); - if (ret) - goto err; + bch_btree_insert_node(parent, iter, &as->parent_keys, + reserve, as); + } else if (n3) { + bch_btree_set_root(iter, n3, as, reserve); } else { - /* Wait on journal flush and btree node writes: */ - closure_sync(&as->cl); - - /* Check for journal error after waiting on the journal flush: */ - if (bch_journal_error(&c->journal) || - test_bit(CACHE_SET_BTREE_WRITE_ERROR, &c->flags)) - goto err; - - if (n3) { - ret = bch_btree_set_root(iter, n3, &as->res, reserve); - if (ret) - goto err; - } else { - /* Root filled up but didn't need to be split */ - ret = bch_btree_set_root(iter, n1, &as->res, reserve); - if (ret) - goto err; - } - - continue_at_noreturn(&as->cl, async_split_writes_done, - system_wq); + /* Root filled up but didn't need to be split */ + bch_btree_set_root(iter, n1, as, reserve); } btree_open_bucket_put(c, n1); @@ -1522,7 +1594,7 @@ static int btree_split(struct btree *b, struct btree_iter *iter, * We have to free the node first because the bch_iter_node_replace() * calls will drop _our_ iterator's reference - and intent lock - to @b. */ - bch_btree_node_free(iter, b); + bch_btree_node_free_inmem(iter, b); /* Successful split, update the iterator to point to the new nodes: */ @@ -1533,20 +1605,6 @@ static int btree_split(struct btree *b, struct btree_iter *iter, bch_btree_iter_node_replace(iter, n1); bch_time_stats_update(&c->btree_split_time, start_time); - return 0; -err: - /* IO error: */ - if (n3) { - bch_btree_node_free_never_inserted(c, n3); - six_unlock_intent(&n3->lock); - } - if (n2) { - bch_btree_node_free_never_inserted(c, n2); - six_unlock_intent(&n2->lock); - } - bch_btree_node_free_never_inserted(c, n1); - six_unlock_intent(&n1->lock); - return -EIO; } /** @@ -1562,11 +1620,11 @@ err: * If a split occurred, this function will return early. This can only happen * for leaf nodes -- inserts into interior nodes have to be atomic. 
*/ -int bch_btree_insert_node(struct btree *b, - struct btree_iter *iter, - struct keylist *insert_keys, - struct btree_reserve *reserve, - struct async_split *as) +void bch_btree_insert_node(struct btree *b, + struct btree_iter *iter, + struct keylist *insert_keys, + struct btree_reserve *reserve, + struct async_split *as) { BUG_ON(!b->level); BUG_ON(!reserve || !as); @@ -1574,9 +1632,10 @@ int bch_btree_insert_node(struct btree *b, switch (bch_btree_insert_keys_interior(b, iter, insert_keys, as, reserve)) { case BTREE_INSERT_OK: - return 0; + break; case BTREE_INSERT_NEED_SPLIT: - return btree_split(b, iter, insert_keys, reserve, as); + btree_split(b, iter, insert_keys, reserve, as); + break; default: BUG(); } @@ -1619,7 +1678,8 @@ static int bch_btree_split_leaf(struct btree_iter *iter, unsigned flags) goto out_put_reserve; } - ret = btree_split(b, iter, NULL, reserve, as); + btree_split(b, iter, NULL, reserve, as); + ret = 0; out_put_reserve: bch_btree_reserve_put(c, reserve); @@ -2043,7 +2103,6 @@ int bch_btree_node_rewrite(struct btree *b, struct btree_iter *iter, bool wait) struct btree *n, *parent = iter->nodes[b->level + 1]; struct btree_reserve *reserve; struct async_split *as; - int ret; iter->locks_want = BTREE_MAX_DEPTH; if (!bch_btree_iter_upgrade(iter)) @@ -2068,32 +2127,20 @@ int bch_btree_node_rewrite(struct btree *b, struct btree_iter *iter, bool wait) trace_bcache_btree_gc_rewrite_node(b); bch_btree_node_write(n, &as->cl, NULL); - bch_pending_btree_node_free_init(c, as, b); + + bch_btree_node_free_start(c, as, b); if (parent) { - ret = bch_btree_insert_node(parent, iter, + bch_btree_insert_node(parent, iter, &keylist_single(&n->key), reserve, as); - BUG_ON(ret); } else { - closure_sync(&as->cl); - - if (bch_journal_error(&c->journal) || - btree_node_write_error(n)) { - bch_btree_node_free_never_inserted(c, n); - six_unlock_intent(&n->lock); - return -EIO; - } - - bch_btree_set_root(iter, n, &as->res, reserve); - - continue_at_noreturn(&as->cl, async_split_writes_done, - system_wq); + bch_btree_set_root(iter, n, as, reserve); } btree_open_bucket_put(iter->c, n); - bch_btree_node_free(iter, b); + bch_btree_node_free_inmem(iter, b); BUG_ON(!bch_btree_iter_node_replace(iter, n)); diff --git a/drivers/md/bcache/btree_update.h b/drivers/md/bcache/btree_update.h index ff2b42b32c5f..c8de69e8c47c 100644 --- a/drivers/md/bcache/btree_update.h +++ b/drivers/md/bcache/btree_update.h @@ -25,6 +25,11 @@ bool bch_btree_node_format_fits(struct btree *, struct bkey_format *); /* Btree node freeing/allocation: */ +/* + * Tracks a btree node that has been (or is about to be) freed in memory, but + * has _not_ yet been freed on disk (because the write that makes the new + * node(s) visible and frees the old hasn't completed yet) + */ struct pending_btree_node_free { struct list_head list; bool index_update_done; @@ -32,22 +37,58 @@ struct pending_btree_node_free { __BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX); }; +/* + * Tracks an in progress split/rewrite of a btree node and the update to the + * parent node: + * + * When we split/rewrite a node, we do all the updates in memory without + * waiting for any writes to complete - we allocate the new node(s) and update + * the parent node, possibly recursively up to the root. + * + * The end result is that we have one or more new nodes being written - + * possibly several, if there were multiple splits - and then a write (updating + * an interior node) which will make all these new nodes visible. 
+ * + * Additionally, as we split/rewrite nodes we free the old nodes - but the old + * nodes can't be freed (their space on disk can't be reclaimed) until the + * update to the interior node that makes the new node visible completes - + * until then, the old nodes are still reachable on disk. + * + */ struct async_split { struct closure cl; struct cache_set *c; + enum { + ASYNC_SPLIT_NO_UPDATE, + ASYNC_SPLIT_UPDATING_BTREE, + ASYNC_SPLIT_UPDATING_ROOT, + ASYNC_SPLIT_UPDATING_AS, + } mode; + + /* + * ASYNC_SPLIT_UPDATING_BTREE: + * @b - node we're blocking from being written + * @list - corresponds to @b->write_blocked + */ struct btree *b; - struct async_split *as; struct list_head list; - /* for setting a new btree root: */ - struct journal_res res; + /* + * ASYNC_SPLIT_UPDATING_AS: btree node we updated was freed, so now + * we're now blocking another async_split + * @parent_as - async_split that's waiting on our nodes to finish + * writing, before it can make new nodes visible on disk + * @wait - list of child async_splits that are waiting on this + * async_split to make all the new nodes visible before they can free + * their old btree nodes + */ + struct async_split *parent_as; + struct closure_waitlist wait; struct journal_entry_pin journal; - struct closure_waitlist wait; - struct pending_btree_node_free pending[BTREE_MAX_DEPTH + GC_MERGE_NODES]; unsigned nr_pending; @@ -61,11 +102,11 @@ struct async_split { u64 inline_keys[BKEY_BTREE_PTR_U64s_MAX * 3]; }; -void bch_pending_btree_node_free_init(struct cache_set *, struct async_split *, - struct btree *); +void bch_btree_node_free_start(struct cache_set *, struct async_split *, + struct btree *); +void bch_btree_node_free_inmem(struct btree_iter *, struct btree *); void bch_btree_node_free_never_inserted(struct cache_set *, struct btree *); -void bch_btree_node_free(struct btree_iter *, struct btree *); void btree_open_bucket_put(struct cache_set *c, struct btree *); @@ -153,9 +194,9 @@ static inline size_t bch_btree_keys_u64s_remaining(struct cache_set *c, #endif } -int bch_btree_insert_node(struct btree *, struct btree_iter *, - struct keylist *, struct btree_reserve *, - struct async_split *as); +void bch_btree_insert_node(struct btree *, struct btree_iter *, + struct keylist *, struct btree_reserve *, + struct async_split *as); /* Normal update interface: */ diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c index 5a133c3f2a2f..5994c08c48ae 100644 --- a/drivers/md/bcache/journal.c +++ b/drivers/md/bcache/journal.c @@ -1485,7 +1485,6 @@ static void journal_write_locked(struct closure *cl) struct journal *j = container_of(cl, struct journal, io); struct cache_set *c = container_of(j, struct cache_set, journal); struct cache *ca; - struct btree *b; struct journal_write *w = journal_cur_write(j); struct bkey_s_extent e = bkey_i_to_s_extent(&j->key); struct bch_extent_ptr *ptr; @@ -1504,14 +1503,6 @@ static void journal_write_locked(struct closure *cl) clear_bit(JOURNAL_DIRTY, &j->flags); cancel_delayed_work(&j->write_work); - spin_lock(&c->btree_root_lock); - - for (i = 0; i < BTREE_ID_NR; i++) - if ((b = c->btree_roots[i])) - bch_journal_add_btree_root(j, i, &b->key, b->level); - - spin_unlock(&c->btree_root_lock); - bch_journal_add_prios(j); /* So last_seq is up to date */ @@ -1525,6 +1516,18 @@ static void journal_write_locked(struct closure *cl) w->data->version = cpu_to_le32(BCACHE_JSET_VERSION); w->data->last_seq = cpu_to_le64(last_seq(j)); + /* _after_ last_seq(): */ + spin_lock(&c->btree_root_lock); + + 
for (i = 0; i < BTREE_ID_NR; i++) { + struct btree_root *r = &c->btree_roots[i]; + + if (r->alive) + bch_journal_add_btree_root(j, i, &r->key, r->level); + } + + spin_unlock(&c->btree_root_lock); + SET_JSET_BIG_ENDIAN(w->data, CPU_BIG_ENDIAN); SET_JSET_CSUM_TYPE(w->data, c->opts.metadata_checksum); @@ -2170,6 +2173,10 @@ int bch_journal_move(struct cache *ca) * journal entries written to ca become stale and are no * longer needed. */ + + /* + * XXX: switch to normal journal reclaim machinery + */ bch_btree_flush(c); /* diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c index 0a1daed0ff74..6229b633f9b1 100644 --- a/drivers/md/bcache/sysfs.c +++ b/drivers/md/bcache/sysfs.c @@ -524,9 +524,9 @@ static unsigned bch_root_usage(struct cache_set *c) do { six_unlock_read(&b->lock); lock_root: - b = c->btree_roots[BTREE_ID_EXTENTS]; + b = c->btree_roots[BTREE_ID_EXTENTS].b; six_lock_read(&b->lock); - } while (b != c->btree_roots[BTREE_ID_EXTENTS]); + } while (b != c->btree_roots[BTREE_ID_EXTENTS].b); for_each_btree_node_key(&b->keys, k, &iter) bytes += bkey_bytes(k); @@ -721,7 +721,7 @@ SHOW(bch_cache_set) if (attr == &sysfs_alloc_debug) return show_cache_set_alloc_debug(c, buf); - sysfs_print(tree_depth, c->btree_roots[BTREE_ID_EXTENTS]->level); + sysfs_print(tree_depth, c->btree_roots[BTREE_ID_EXTENTS].b->level); sysfs_print(root_usage_percent, bch_root_usage(c)); if (attr == &sysfs_compression_stats) |
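
The new async_split machinery is easiest to read as a small state machine: an update starts in ASYNC_SPLIT_NO_UPDATE, moves to UPDATING_BTREE (blocking the interior node it dirtied from being written) or UPDATING_ROOT (a new root that only becomes the on-disk root once its child nodes have been written), and is demoted to UPDATING_AS when the node it updated is itself freed by a later split or rewrite. The stand-alone sketch below models just those transitions; it is an illustration, not the kernel code - the closures, six locks, journal pins and write_blocked lists are deliberately left out, and struct model_split is an invented stand-in for struct async_split.

/*
 * User-space model of the async_split mode transitions added by this patch.
 * Illustration only: the real code tracks these splits on the blocked node's
 * write_blocked list and resolves them with closures.
 */
#include <assert.h>
#include <stdio.h>

enum async_split_mode {
	ASYNC_SPLIT_NO_UPDATE,		/* just allocated, no index update yet */
	ASYNC_SPLIT_UPDATING_BTREE,	/* blocks a dirty interior node's write */
	ASYNC_SPLIT_UPDATING_ROOT,	/* installed a new btree root */
	ASYNC_SPLIT_UPDATING_AS,	/* updated node was freed; wait on parent_as */
};

struct model_split {
	enum async_split_mode	mode;
	struct model_split	*parent_as;
};

/* async_split_updated_btree(): pointers to unwritten nodes went into a node */
static void updated_btree(struct model_split *as)
{
	assert(as->mode == ASYNC_SPLIT_NO_UPDATE);
	as->mode = ASYNC_SPLIT_UPDATING_BTREE;
}

/* async_split_updated_root(): the update produced a brand new root */
static void updated_root(struct model_split *as)
{
	assert(as->mode == ASYNC_SPLIT_NO_UPDATE);
	as->mode = ASYNC_SPLIT_UPDATING_ROOT;
}

/*
 * async_split_will_free_node(): the node that @blocked was waiting to see
 * written is going away, so @blocked waits on @as instead.
 */
static void will_free_node(struct model_split *as, struct model_split *blocked)
{
	assert(blocked->mode == ASYNC_SPLIT_UPDATING_BTREE);
	blocked->mode = ASYNC_SPLIT_UPDATING_AS;
	blocked->parent_as = as;
}

int main(void)
{
	struct model_split first  = { ASYNC_SPLIT_NO_UPDATE };
	struct model_split second = { ASYNC_SPLIT_NO_UPDATE };

	/* A leaf split updates its parent, blocking the parent's write: */
	updated_btree(&first);

	/* The parent is then itself split before `first` completes: */
	will_free_node(&second, &first);
	updated_root(&second);	/* the parent was the root in this example */

	printf("first waits on second: %s\n",
	       first.parent_as == &second ? "yes" : "no");
	return 0;
}

Running the sketch just prints the resulting dependency ("first waits on second: yes"); the point it illustrates is the invariant from the patch's comments - a node whose keys point at unwritten children is never written out until the async_split covering those children completes.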
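
The journal.c change relies on the same split of btree_root state: journal_write_locked() no longer dereferences the in-memory root nodes, it copies whatever bch_btree_set_root_ondisk() last published under btree_root_lock, skips ids whose roots are not alive yet, and per the new comment does so only after last_seq() has been computed. The sketch below is a minimal user-space model of that hand-off, assuming a reduced key (a plain u64) and a pthread mutex in place of the spinlock; the names loosely mirror the patch but none of this is the actual implementation.

/*
 * Sketch of the in-memory vs. on-disk btree_root split introduced above.
 * Pure illustration: the bkey is reduced to a u64 and the spinlock to a
 * pthread mutex.
 */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

#define BTREE_ID_NR 3

struct model_root {
	void			*b;	/* in-memory node, used by iterators */
	unsigned long long	key;	/* on-disk pointer, used by the journal */
	unsigned char		level;
	bool			alive;	/* set once the root is published */
};

static struct model_root roots[BTREE_ID_NR];
static pthread_mutex_t root_lock = PTHREAD_MUTEX_INITIALIZER;

/* bch_btree_set_root_ondisk(): publish the new root to the journal side */
static void set_root_ondisk(unsigned id, unsigned long long key,
			    unsigned char level)
{
	pthread_mutex_lock(&root_lock);
	roots[id].key	= key;
	roots[id].level	= level;
	roots[id].alive	= true;
	pthread_mutex_unlock(&root_lock);
}

/* journal_write_locked(): emit only roots that have been published */
static void journal_write(void)
{
	pthread_mutex_lock(&root_lock);
	for (unsigned i = 0; i < BTREE_ID_NR; i++)
		if (roots[i].alive)
			printf("jset: btree %u root %llx level %u\n",
			       i, roots[i].key, roots[i].level);
	pthread_mutex_unlock(&root_lock);
}

int main(void)
{
	set_root_ondisk(0, 0x1234, 1);
	journal_write();	/* emits only btree 0; the others are not alive */
	return 0;
}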