author | Slava Pestov <sviatoslavpestov@gmail.com> | 2014-06-21 17:24:27 -0700
committer | Kent Overstreet <kent.overstreet@gmail.com> | 2017-01-18 20:19:49 -0900
commit | 891475b75962c79b68a18ab0f214cc376a70bc63 (patch)
tree | 3bc7ef1157755d408bfd54617631c9fb22f2c37b
parent | 8b5c7a3253b25771bbb86fc3dd62bfc19aa3d271 (diff)
bcache: comments
Signed-off-by: Kent Overstreet <kent.overstreet@gmail.com>
-rw-r--r-- | drivers/md/bcache/alloc.c | 65
-rw-r--r-- | drivers/md/bcache/btree.c | 32
2 files changed, 69 insertions, 28 deletions
diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c
index d1012bb5dc78..70e11a4132fd 100644
--- a/drivers/md/bcache/alloc.c
+++ b/drivers/md/bcache/alloc.c
@@ -32,14 +32,7 @@
  * If we've got discards enabled, that happens when a bucket moves from the
  * free_inc list to the free list.
  *
- * There is another freelist, because sometimes we have buckets that we know
- * have nothing pointing into them - these we can reuse without waiting for
- * priorities to be rewritten. These come from freed btree nodes and buckets
- * that garbage collection discovered no longer had valid keys pointing into
- * them (because they were overwritten). That's the unused list - buckets on the
- * unused list move to the free list, optionally being discarded in the process.
- *
- * It's also important to ensure that gens don't wrap around - with respect to
+ * It's important to ensure that gens don't wrap around - with respect to
  * either the oldest gen in the btree or the gen on disk. This is quite
  * difficult to do in practice, but we explicitly guard against it anyways - if
  * a bucket is in danger of wrapping around we simply skip invalidating it that
@@ -51,7 +44,7 @@
  * bch_bucket_alloc_set() allocates one or more buckets from different caches
  * out of a cache set.
  *
- * free_some_buckets() drives all the processes described above. It's called
+ * invalidate_buckets() drives all the processes described above. It's called
  * from bch_bucket_alloc() and a few other places that need to make sure free
  * buckets are ready.
  *
@@ -70,6 +63,12 @@
 #include <linux/random.h>
 #include <trace/events/bcache.h>
 
+/**
+ * alloc_failed - kick off external processes to free up buckets
+ *
+ * We couldn't find enough buckets in invalidate_buckets(). Ask
+ * btree GC to run, hoping it will find more clean buckets.
+ */
 static void alloc_failed(struct cache *ca)
 {
 	struct cache_set *c = ca->set;
@@ -397,6 +396,14 @@ static int bch_allocator_push(struct cache *ca, long bucket)
 	return false;
 }
 
+/**
+ * bch_allocator_thread - move buckets from free_inc to reserves
+ *
+ * The free_inc FIFO is populated by invalidate_buckets(), and
+ * the reserves are depleted by bucket allocation. When we run out
+ * of free_inc, try to invalidate some buckets and write out
+ * prios and gens.
+ */
 static int bch_allocator_thread(void *arg)
 {
 	struct cache *ca = arg;
@@ -406,8 +413,8 @@ static int bch_allocator_thread(void *arg)
 
 	while (1) {
 		/*
-		 * First, we pull buckets off of the free_inc lists, possibly
-		 * issue discards to them, then we add the bucket to the
+		 * First, we pull buckets off of the free_inc list, possibly
+		 * issue discards to them, then we add the bucket to a
 		 * free list:
 		 */
 		while (!fifo_empty(&ca->free_inc)) {
@@ -427,6 +434,10 @@
 				mutex_lock(&c->bucket_lock);
 			}
 
+			/*
+			 * Wait for someone to allocate a bucket if all
+			 * reserves are full:
+			 */
 			allocator_wait(c, ca->fifo_wait,
 				       bch_allocator_push(ca, bucket));
 			fifo_pop(&ca->free_inc, bucket);
@@ -434,36 +445,36 @@
 			closure_wake_up(&c->bucket_wait);
 		}
 
-		/*
-		 * We've run out of free buckets, we need to find some buckets
-		 * we can invalidate. First, invalidate them in memory and add
-		 * them to the free_inc list:
-		 */
+		/* We've run out of free buckets! */
retry_invalidate:
+		/* Wait for in-progress GC to finish if there is one */
 		allocator_wait(c, c->gc_wait, c->gc_mark_valid);
 
+		/*
+		 * Find some buckets that we can invalidate, either they're
+		 * completely unused, or only contain clean data that's been
+		 * written back to the backing device or another cache tier
+		 */
 		invalidate_buckets(ca);
 
 		if (CACHE_SYNC(&ca->set->sb)) {
-			/*
-			 * This could deadlock if an allocation with a btree
-			 * node locked ever blocked - having the btree node
-			 * locked would block garbage collection, but here we're
-			 * waiting on garbage collection before we invalidate
-			 * and free anything.
-			 *
-			 * But this should be safe since the btree code always
-			 * uses btree_check_reserve() before allocating now, and
-			 * if it fails it blocks without btree nodes locked.
-			 */
 			trace_bcache_alloc_batch(ca, fifo_used(&ca->free_inc),
 						 ca->free_inc.size);
 
+			/*
+			 * If we didn't invalidate enough buckets to fill up
+			 * free_inc, try to invalidate some more. This will
+			 * limit the amount of metadata writes we issue below
+			 */
 			if (!fifo_full(&ca->free_inc))
 				goto retry_invalidate;
 
+			/*
+			 * free_inc is full of newly-invalidated buckets, must
+			 * write out prios and gens before they can be re-used
+			 */
 			bch_prio_write(ca);
 		}
 	}
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index 018fb2dce62d..9dd44c4caf33 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -1468,6 +1468,8 @@ static int btree_gc_coalesce(struct btree *b, struct btree_op *op,
 	return -EINTR;
 
 out_nocoalesce:
+	/* We may have written out some new nodes which are garbage now,
+	 * wait for writes to finish */
 	closure_sync(&cl);
 	bch_keylist_free(&keylist);
 
@@ -1479,6 +1481,9 @@ out_nocoalesce:
 	return 0;
 }
 
+/**
+ * btree_gc_rewrite_node - merge node bsets together and update parent
+ */
 static int btree_gc_rewrite_node(struct btree *b, struct btree_op *op,
 				 struct btree *replace)
 {
@@ -1513,6 +1518,9 @@ static int btree_gc_rewrite_node(struct btree *b, struct btree_op *op,
 	return -EINTR;
 }
 
+/**
+ * btree_gc_count_keys - count keys in a btree node to see if we must coalesce
+ */
 static unsigned btree_gc_count_keys(struct btree *b)
 {
 	struct bkey *k;
@@ -1525,6 +1533,13 @@ static unsigned btree_gc_count_keys(struct btree *b)
 	return ret;
 }
 
+/**
+ * btree_gc_recurse - tracing garbage collection on a node and children
+ *
+ * This may bail out early for a variety of reasons. This is allowed
+ * because concurrent writes will conservatively mark buckets dirty,
+ * ensuring they won't be touched until the next GC pass.
+ */
 static int btree_gc_recurse(struct btree *b, struct btree_op *op,
 			    struct gc_stat *gc)
 {
@@ -1532,6 +1547,8 @@ static int btree_gc_recurse(struct btree *b, struct btree_op *op,
 	bool should_rewrite;
 	struct bkey *k;
 	struct btree_iter iter;
+
+	/* Sliding window of GC_MERGE_NODES adjacent btree nodes */
 	struct gc_merge_info r[GC_MERGE_NODES];
 	struct gc_merge_info *i, *last = r + ARRAY_SIZE(r) - 1;
 	struct bkey tmp = NEXT_KEY(&b->c->gc_cur_key);
@@ -1547,12 +1564,14 @@ static int btree_gc_recurse(struct btree *b, struct btree_op *op,
 		r->b = bch_btree_node_get(b->c, op, k, b->level - 1,
 					  true, b);
 		if (IS_ERR(r->b)) {
+			/* XXX: handle IO error better */
 			ret = PTR_ERR(r->b);
 			break;
 		}
 
 		r->keys = btree_gc_count_keys(r->b);
 
+		/* See if we should coalesce */
 		ret = btree_gc_coalesce(b, op, gc, r);
 		if (ret)
 			break;
@@ -1704,6 +1723,14 @@ static void bch_btree_gc_finish(struct cache_set *c)
 	mutex_unlock(&c->bucket_lock);
 }
 
+/**
+ * bch_btree_gc - find reclaimable buckets and clean up the btree
+ *
+ * This will find buckets that are completely unreachable, as well as those
+ * only containing clean data that can be safely discarded. Also, nodes that
+ * contain too many bsets are merged up and re-written, and adjacent nodes
+ * with low occupancy are coalesced together.
+ */
 static void bch_btree_gc(struct cache_set *c)
 {
 	struct gc_stat stats;
@@ -2415,7 +2442,10 @@ int bch_btree_insert_node_sync(struct btree *b, struct btree_op *op,
 /**
  * bch_btree_insert_check_key - insert dummy key into btree
  *
- * Used to address a race with cache promotion after cache miss.
+ * We insert a random key on a cache miss, then compare exchange on it
+ * once the cache promotion or backing device read completes. This
+ * ensures that if this key is written to after the read, the read will
+ * lose and not overwrite the key with stale data.
  *
  * Return values:
  * -EAGAIN: @cl was put on a waitlist waiting for btree node allocation
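The new comments describe a pipeline: invalidate_buckets() refills the free_inc FIFO, the allocator thread drains free_inc into the reserves (issuing discards on the way), prios and gens are written out, and allocation pops buckets from the reserves. Below is a minimal userspace sketch of that flow; the fifo type and the invalidate/discard/prio-write helpers are stand-ins, not the bcache API.

```c
/*
 * Illustrative, simplified model of the allocator flow documented in this
 * patch. All names here are stand-ins for the real bcache structures.
 */
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

#define FIFO_SIZE 8

struct fifo {
	long data[FIFO_SIZE];
	size_t front, back;		/* back - front == element count */
};

static bool fifo_empty(const struct fifo *f) { return f->back == f->front; }
static bool fifo_full(const struct fifo *f)  { return f->back - f->front == FIFO_SIZE; }
static void fifo_push(struct fifo *f, long v) { f->data[f->back++ % FIFO_SIZE] = v; }
static long fifo_pop(struct fifo *f)          { return f->data[f->front++ % FIFO_SIZE]; }

static long next_bucket;	/* pretend bucket numbers */

/* Stand-in for invalidate_buckets(): pretend we found some clean buckets */
static void invalidate_buckets(struct fifo *free_inc)
{
	while (!fifo_full(free_inc))
		fifo_push(free_inc, next_bucket++);
}

/* Stand-ins for issuing a discard and for bch_prio_write() */
static void discard_bucket(long b)	{ printf("discard bucket %ld\n", b); }
static void prio_write(void)		{ printf("write out prios/gens\n"); }

/*
 * One pass of the allocator loop: move invalidated buckets into the
 * reserve, then refill free_inc and persist prios/gens so the newly
 * invalidated buckets may later be handed out.
 */
static void allocator_pass(struct fifo *free_inc, struct fifo *reserve)
{
	while (!fifo_empty(free_inc) && !fifo_full(reserve)) {
		long bucket = fifo_pop(free_inc);

		discard_bucket(bucket);
		fifo_push(reserve, bucket);
	}

	invalidate_buckets(free_inc);
	prio_write();
}

int main(void)
{
	struct fifo free_inc = { 0 }, reserve = { 0 };

	invalidate_buckets(&free_inc);
	allocator_pass(&free_inc, &reserve);

	while (!fifo_empty(&reserve))
		printf("allocated bucket %ld\n", fifo_pop(&reserve));
	return 0;
}
```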
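The header comment also stresses that gens must not wrap around with respect to the oldest gen still referenced in the btree or on disk. A small sketch of that kind of guard, assuming 8-bit gens and an arbitrary drift limit (both illustrative, not taken from this tree):

```c
/*
 * Minimal sketch of a gen wraparound guard: gens are small (8-bit)
 * counters, so before bumping a bucket's gen we check how far it has
 * drifted ahead of the oldest gen still referenced. The names and the
 * threshold are illustrative.
 */
#include <stdbool.h>
#include <stdint.h>

#define MAX_GEN_DRIFT	96	/* arbitrary safety margin below 128 */

/* How far ahead of 'oldest' is 'gen', accounting for 8-bit wraparound? */
static uint8_t gen_after(uint8_t gen, uint8_t oldest)
{
	return (uint8_t)(gen - oldest);
}

/*
 * Refuse to invalidate (and bump the gen of) a bucket that has drifted
 * too far ahead; it gets picked up again once GC and prio writes have
 * advanced the oldest referenced gen.
 */
static bool can_invalidate(uint8_t bucket_gen, uint8_t oldest_gen)
{
	return gen_after(bucket_gen, oldest_gen) < MAX_GEN_DRIFT;
}
```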
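The updated bch_btree_insert_check_key comment describes inserting a random placeholder key on a cache miss and compare-exchanging it once the read completes, so a racing write makes the stale read lose. A sketch of that pattern reduced to a single atomic slot, with hypothetical names:

```c
/*
 * Sketch of the cache-miss check-key idea: install a random placeholder
 * before starting the read, and only promote the read data if the
 * placeholder is still there. Any write that lands in between replaces
 * the placeholder, so the stale read loses. Purely illustrative.
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdlib.h>

struct cache_slot {
	_Atomic uint64_t key;	/* 0 means "empty" */
};

/* Called on a cache miss, before issuing the backing-device read */
static uint64_t insert_check_key(struct cache_slot *s)
{
	uint64_t check = ((uint64_t)rand() << 32) | (uint64_t)rand() | 1;

	atomic_store(&s->key, check);
	return check;
}

/* A racing write simply overwrites the slot */
static void write_key(struct cache_slot *s, uint64_t key)
{
	atomic_store(&s->key, key);
}

/*
 * Called when the read completes: install the read result only if our
 * check key survived. If a write got there first, back off and let the
 * write win.
 */
static bool promote_read(struct cache_slot *s, uint64_t check, uint64_t real_key)
{
	return atomic_compare_exchange_strong(&s->key, &check, real_key);
}
```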