author    Slava Pestov <sviatoslavpestov@gmail.com>  2014-06-21 17:24:27 -0700
committer Kent Overstreet <kent.overstreet@gmail.com>  2016-10-07 09:00:19 -0800
commit    6cbf03d43a3711873d64de8ef631b0c599045cc6
tree      af6485e45e112f2051d8b39cc686b1a4ace86a37
parent    5e0c54762b17d0a59750a4e79c0393c7873871f4
bcache: comments
Signed-off-by: Kent Overstreet <kent.overstreet@gmail.com>
-rw-r--r--  drivers/md/bcache/alloc.c | 65
-rw-r--r--  drivers/md/bcache/btree.c | 32
2 files changed, 69 insertions, 28 deletions
diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c
index d1012bb5dc78..70e11a4132fd 100644
--- a/drivers/md/bcache/alloc.c
+++ b/drivers/md/bcache/alloc.c
@@ -32,14 +32,7 @@
* If we've got discards enabled, that happens when a bucket moves from the
* free_inc list to the free list.
*
- * There is another freelist, because sometimes we have buckets that we know
- * have nothing pointing into them - these we can reuse without waiting for
- * priorities to be rewritten. These come from freed btree nodes and buckets
- * that garbage collection discovered no longer had valid keys pointing into
- * them (because they were overwritten). That's the unused list - buckets on the
- * unused list move to the free list, optionally being discarded in the process.
- *
- * It's also important to ensure that gens don't wrap around - with respect to
+ * It's important to ensure that gens don't wrap around - with respect to
* either the oldest gen in the btree or the gen on disk. This is quite
* difficult to do in practice, but we explicitly guard against it anyways - if
* a bucket is in danger of wrapping around we simply skip invalidating it that
@@ -51,7 +44,7 @@
* bch_bucket_alloc_set() allocates one or more buckets from different caches
* out of a cache set.
*
- * free_some_buckets() drives all the processes described above. It's called
+ * invalidate_buckets() drives all the processes described above. It's called
* from bch_bucket_alloc() and a few other places that need to make sure free
* buckets are ready.
*
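[Editor's note: to make the wraparound guard described above concrete, here is a minimal standalone C sketch. All names (gen_may_wrap, MAX_GEN_DELTA, the bucket fields) are hypothetical stand-ins, not the driver's API. Generations are small integers that wrap, so the distance between a bucket's current gen and the oldest gen still referenced has to be computed with modular arithmetic, and invalidation is skipped when that distance grows too large.]

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Generations are 8-bit and wrap, so the "distance" between the bucket's
 * current gen and the oldest gen still referenced is computed modulo 256.
 * MAX_GEN_DELTA is an arbitrary threshold chosen for this sketch. */
#define MAX_GEN_DELTA 96

struct bucket {
	uint8_t gen;        /* bumped every time the bucket is invalidated */
	uint8_t oldest_gen; /* oldest gen the btree or disk might still reference */
};

/* True if invalidating (and thus incrementing gen) risks letting a stale
 * pointer look valid again after the 8-bit counter wraps. */
static bool gen_may_wrap(const struct bucket *b)
{
	return (uint8_t)(b->gen - b->oldest_gen) >= MAX_GEN_DELTA;
}

int main(void)
{
	struct bucket b = { .gen = 250, .oldest_gen = 120 };

	if (gen_may_wrap(&b))
		printf("skip invalidating until GC/prio writes raise oldest_gen\n");
	else
		printf("safe to invalidate, gen delta %u\n",
		       (unsigned)(uint8_t)(b.gen - b.oldest_gen));
	return 0;
}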
@@ -70,6 +63,12 @@
#include <linux/random.h>
#include <trace/events/bcache.h>
+/**
+ * alloc_failed - kick off external processes to free up buckets
+ *
+ * We couldn't find enough buckets in invalidate_buckets(). Ask
+ * btree GC to run, hoping it will find more clean buckets.
+ */
static void alloc_failed(struct cache *ca)
{
struct cache_set *c = ca->set;
@@ -397,6 +396,14 @@ static int bch_allocator_push(struct cache *ca, long bucket)
return false;
}
+/**
+ * bch_allocator_thread - move buckets from free_inc to reserves
+ *
+ * The free_inc FIFO is populated by invalidate_buckets(), and
+ * the reserves are depleted by bucket allocation. When we run out
+ * of free_inc, try to invalidate some buckets and write out
+ * prios and gens.
+ */
static int bch_allocator_thread(void *arg)
{
struct cache *ca = arg;
@@ -406,8 +413,8 @@ static int bch_allocator_thread(void *arg)
while (1) {
/*
- * First, we pull buckets off of the free_inc lists, possibly
- * issue discards to them, then we add the bucket to the
+ * First, we pull buckets off of the free_inc list, possibly
+ * issue discards to them, then we add the bucket to a
* free list:
*/
while (!fifo_empty(&ca->free_inc)) {
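[Editor's note: the comment above describes a simple drain loop: take a bucket off free_inc, optionally discard it, then hand it to a free list. Below is a rough userspace sketch of that shape, with a toy ring-buffer FIFO standing in for the driver's fifo primitives; every name here is hypothetical.]

#include <stdbool.h>
#include <stdio.h>

#define FIFO_SIZE 8

/* Toy FIFO standing in for the driver's fifo_* primitives */
struct toy_fifo {
	long data[FIFO_SIZE];
	unsigned head, tail;   /* free-running counters; head == tail means empty */
};

static bool toy_fifo_empty(struct toy_fifo *f) { return f->head == f->tail; }
static bool toy_fifo_full(struct toy_fifo *f)  { return f->head - f->tail == FIFO_SIZE; }

static bool toy_fifo_push(struct toy_fifo *f, long v)
{
	if (toy_fifo_full(f))
		return false;
	f->data[f->head++ % FIFO_SIZE] = v;
	return true;
}

static long toy_fifo_pop(struct toy_fifo *f)
{
	return f->data[f->tail++ % FIFO_SIZE];
}

static void issue_discard(long bucket)
{
	printf("discard bucket %ld\n", bucket);   /* stand-in for a real discard */
}

/* Drain free_inc: discard each bucket, then move it to the free list. */
static void drain_free_inc(struct toy_fifo *free_inc, struct toy_fifo *free_list)
{
	while (!toy_fifo_empty(free_inc)) {
		long bucket = toy_fifo_pop(free_inc);

		issue_discard(bucket);
		if (!toy_fifo_push(free_list, bucket)) {
			/* the real allocator sleeps here until a reserve has
			 * room and keeps the bucket; the sketch just stops */
			break;
		}
	}
}

int main(void)
{
	struct toy_fifo free_inc = { 0 }, free_list = { 0 };

	for (long b = 1; b <= 5; b++)
		toy_fifo_push(&free_inc, b);
	drain_free_inc(&free_inc, &free_list);
	return 0;
}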
@@ -427,6 +434,10 @@ static int bch_allocator_thread(void *arg)
mutex_lock(&c->bucket_lock);
}
+ /*
+ * Wait for someone to allocate a bucket if all
+ * reserves are full:
+ */
allocator_wait(c, ca->fifo_wait,
bch_allocator_push(ca, bucket));
fifo_pop(&ca->free_inc, bucket);
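[Editor's note: the allocator_wait() above blocks the allocator until a consumer frees up room in a reserve. As a rough illustration of the same pattern (not the driver's closure/waitlist machinery), plain pthreads would express it as a condition-variable wait that re-checks its predicate after every wakeup. Build with cc -pthread; all names are made up.]

#include <pthread.h>
#include <stdbool.h>

/* Hypothetical reserve with a fixed capacity */
struct reserve {
	pthread_mutex_t lock;
	pthread_cond_t  has_room;   /* signalled when a bucket is taken out */
	unsigned used, size;
	long buckets[64];
};

/* Allocator side: block until the reserve has room, then push. */
static void reserve_push_blocking(struct reserve *r, long bucket)
{
	pthread_mutex_lock(&r->lock);
	while (r->used == r->size)                 /* re-check after every wakeup */
		pthread_cond_wait(&r->has_room, &r->lock);
	r->buckets[r->used++] = bucket;
	pthread_mutex_unlock(&r->lock);
}

/* Allocation path: taking a bucket makes room and wakes the allocator,
 * much like the closure_wake_up() in the hunk above. */
static bool reserve_pop(struct reserve *r, long *bucket)
{
	bool ok = false;

	pthread_mutex_lock(&r->lock);
	if (r->used > 0) {
		*bucket = r->buckets[--r->used];
		pthread_cond_signal(&r->has_room);
		ok = true;
	}
	pthread_mutex_unlock(&r->lock);
	return ok;
}

int main(void)
{
	struct reserve r = {
		.lock = PTHREAD_MUTEX_INITIALIZER,
		.has_room = PTHREAD_COND_INITIALIZER,
		.used = 0,
		.size = 2,
	};
	long b;

	reserve_push_blocking(&r, 10);
	reserve_push_blocking(&r, 11);
	if (reserve_pop(&r, &b))
		reserve_push_blocking(&r, 12);   /* fits now that one was taken */
	return 0;
}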
@@ -434,36 +445,36 @@ static int bch_allocator_thread(void *arg)
closure_wake_up(&c->bucket_wait);
}
- /*
- * We've run out of free buckets, we need to find some buckets
- * we can invalidate. First, invalidate them in memory and add
- * them to the free_inc list:
- */
+ /* We've run out of free buckets! */
retry_invalidate:
+ /* Wait for in-progress GC to finish if there is one */
allocator_wait(c, c->gc_wait, c->gc_mark_valid);
+ /*
* Find some buckets that we can invalidate: either they're
* completely unused, or they only contain clean data that's been
+ * written back to the backing device or another cache tier
+ */
invalidate_buckets(ca);
if (CACHE_SYNC(&ca->set->sb)) {
- /*
- * This could deadlock if an allocation with a btree
- * node locked ever blocked - having the btree node
- * locked would block garbage collection, but here we're
- * waiting on garbage collection before we invalidate
- * and free anything.
- *
- * But this should be safe since the btree code always
- * uses btree_check_reserve() before allocating now, and
- * if it fails it blocks without btree nodes locked.
- */
trace_bcache_alloc_batch(ca,
fifo_used(&ca->free_inc),
ca->free_inc.size);
+ /*
+ * If we didn't invalidate enough buckets to fill up
+ * free_inc, try to invalidate some more. This will
* limit the number of metadata writes we issue below
+ */
if (!fifo_full(&ca->free_inc))
goto retry_invalidate;
+ /*
* free_inc is full of newly-invalidated buckets; we must
* write out prios and gens before they can be re-used
+ */
bch_prio_write(ca);
}
}
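[Editor's note: the retry_invalidate loop above amortizes bch_prio_write(): it keeps invalidating until free_inc is full, so one metadata write covers a whole batch of buckets rather than a handful at a time. A small hedged sketch of that batching shape, using toy functions rather than the driver's:]

#include <stdbool.h>
#include <stdio.h>

#define BATCH_SIZE 16

/* Stand-in for invalidate_buckets(): pretends to find a few reclaimable
 * buckets per call and returns how many it added to the batch. */
static unsigned invalidate_some(unsigned *batched)
{
	unsigned found = 3;                 /* toy value */

	if (*batched + found > BATCH_SIZE)
		found = BATCH_SIZE - *batched;
	*batched += found;
	return found;
}

/* Stand-in for bch_prio_write(): the expensive step we want to amortize. */
static void write_prios_and_gens(unsigned batched)
{
	printf("writing prios/gens for %u buckets in one go\n", batched);
}

static void refill_free_inc(bool sync)
{
	unsigned batched = 0;

	/* Keep invalidating until the batch (free_inc) is full, mirroring
	 * the "goto retry_invalidate" above. */
	while (batched < BATCH_SIZE) {
		if (invalidate_some(&batched) == 0)
			break;               /* nothing left to invalidate */
	}

	/* In sync mode the new gens must hit disk before the buckets are reused. */
	if (sync)
		write_prios_and_gens(batched);
}

int main(void)
{
	refill_free_inc(true);
	return 0;
}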
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index dd2fdce21f03..45903db933f8 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -1472,6 +1472,8 @@ static int btree_gc_coalesce(struct btree *b, struct btree_op *op,
return -EINTR;
out_nocoalesce:
+ /* We may have written out some new nodes which are now garbage;
+ * wait for writes to finish */
closure_sync(&cl);
bch_keylist_free(&keylist);
@@ -1483,6 +1485,9 @@ out_nocoalesce:
return 0;
}
+/**
+ * btree_gc_rewrite_node - merge a node's bsets together and update the parent
+ */
static int btree_gc_rewrite_node(struct btree *b, struct btree_op *op,
struct btree *replace)
{
@@ -1517,6 +1522,9 @@ static int btree_gc_rewrite_node(struct btree *b, struct btree_op *op,
return -EINTR;
}
+/**
+ * btree_gc_count_keys - count keys in a btree node to see if we must coalesce
+ */
static unsigned btree_gc_count_keys(struct btree *b)
{
struct bkey *k;
@@ -1529,6 +1537,13 @@ static unsigned btree_gc_count_keys(struct btree *b)
return ret;
}
+/**
+ * btree_gc_recurse - run garbage collection on a node and its children
+ *
+ * This may bail out early for a variety of reasons. This is allowed
+ * because concurrent writes will conservatively mark buckets dirty,
+ * ensuring they won't be touched until the next GC pass.
+ */
static int btree_gc_recurse(struct btree *b, struct btree_op *op,
struct gc_stat *gc)
{
@@ -1536,6 +1551,8 @@ static int btree_gc_recurse(struct btree *b, struct btree_op *op,
bool should_rewrite;
struct bkey *k;
struct btree_iter iter;
+
+ /* Sliding window of GC_MERGE_NODES adjacent btree nodes */
struct gc_merge_info r[GC_MERGE_NODES];
struct gc_merge_info *i, *last = r + ARRAY_SIZE(r) - 1;
struct bkey tmp = NEXT_KEY(&b->c->gc_cur_key);
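[Editor's note: the r[GC_MERGE_NODES] array above is a sliding window: as GC walks a node's children it keeps the last few siblings and their key counts around, and when the keys of adjacent nodes would fit in fewer nodes it coalesces them. Here is a standalone sketch of that decision, using made-up constants (WINDOW, NODE_CAPACITY) instead of the real bkey accounting.]

#include <stdio.h>

#define WINDOW 4            /* stands in for GC_MERGE_NODES */
#define NODE_CAPACITY 100   /* toy "keys per node" limit */

/* Sliding window of the most recently visited sibling nodes' key counts;
 * in this sketch index 0 is the newest entry. */
struct gc_window {
	unsigned keys[WINDOW];
};

/* How many nodes could the windowed siblings be squeezed into?
 * Coalescing is worth it only if that is fewer than WINDOW. */
static unsigned nodes_needed(const struct gc_window *w)
{
	unsigned total = 0;

	for (unsigned i = 0; i < WINDOW; i++)
		total += w->keys[i];
	return (total + NODE_CAPACITY - 1) / NODE_CAPACITY;
}

/* Slide the window: drop the oldest entry and make room for a new child. */
static void window_push(struct gc_window *w, unsigned keys)
{
	for (unsigned i = WINDOW - 1; i > 0; i--)
		w->keys[i] = w->keys[i - 1];
	w->keys[0] = keys;
}

int main(void)
{
	struct gc_window w = { .keys = { 0 } };
	unsigned children[] = { 20, 35, 15, 25, 90, 10 };

	for (unsigned i = 0; i < sizeof(children) / sizeof(children[0]); i++) {
		window_push(&w, children[i]);
		if (i >= WINDOW - 1 && nodes_needed(&w) < WINDOW)
			printf("child %u: last %u nodes could coalesce into %u\n",
			       i, WINDOW, nodes_needed(&w));
	}
	return 0;
}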
@@ -1551,12 +1568,14 @@ static int btree_gc_recurse(struct btree *b, struct btree_op *op,
r->b = bch_btree_node_get(b->c, op, k, b->level - 1,
true, b);
if (IS_ERR(r->b)) {
+ /* XXX: handle IO error better */
ret = PTR_ERR(r->b);
break;
}
r->keys = btree_gc_count_keys(r->b);
+ /* See if we should coalesce */
ret = btree_gc_coalesce(b, op, gc, r);
if (ret)
break;
@@ -1708,6 +1727,14 @@ static void bch_btree_gc_finish(struct cache_set *c)
mutex_unlock(&c->bucket_lock);
}
+/**
+ * bch_btree_gc - find reclaimable buckets and clean up the btree
+ *
+ * This will find buckets that are completely unreachable, as well as those
+ * only containing clean data that can be safely discarded. Also, nodes that
+ * contain too many bsets are merged up and re-written, and adjacent nodes
+ * with low occupancy are coalesced together.
+ */
static void bch_btree_gc(struct cache_set *c)
{
struct gc_stat stats;
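[Editor's note: at the bucket level this pass is essentially mark-and-sweep: walk the btree, mark every bucket a live key points into, and treat anything unmarked (or marked only by clean, already written-back data) as reclaimable. A toy C sketch of that idea follows, with a flat array of "keys" standing in for the btree; all names are hypothetical.]

#include <stdbool.h>
#include <stdio.h>

#define NBUCKETS 8

/* Toy "key": just records which bucket its data lives in */
struct toy_key {
	unsigned bucket;
	bool dirty;         /* dirty data must be kept; clean data is discardable */
};

int main(void)
{
	struct toy_key keys[] = {
		{ .bucket = 1, .dirty = true  },
		{ .bucket = 3, .dirty = false },
		{ .bucket = 3, .dirty = true  },
		{ .bucket = 6, .dirty = false },
	};
	bool referenced[NBUCKETS] = { false };
	bool has_dirty[NBUCKETS] = { false };

	/* Mark phase: walk all keys (the real code walks the btree) */
	for (unsigned i = 0; i < sizeof(keys) / sizeof(keys[0]); i++) {
		referenced[keys[i].bucket] = true;
		if (keys[i].dirty)
			has_dirty[keys[i].bucket] = true;
	}

	/* Sweep phase: unreferenced buckets, and buckets holding only clean
	 * (already written back) data, can be invalidated and reused. */
	for (unsigned b = 0; b < NBUCKETS; b++) {
		if (!referenced[b])
			printf("bucket %u: unreachable, reclaim\n", b);
		else if (!has_dirty[b])
			printf("bucket %u: only clean data, reclaimable\n", b);
		else
			printf("bucket %u: has dirty data, keep\n", b);
	}
	return 0;
}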
@@ -2419,7 +2446,10 @@ int bch_btree_insert_node_sync(struct btree *b, struct btree_op *op,
/**
* bch_btree_insert_check_key - insert dummy key into btree
*
- * Used to address a race with cache promotion after cache miss.
+ * We insert a random key on a cache miss, then compare-exchange on it
+ * once the cache promotion or backing device read completes. This
+ * ensures that if this key is written to after the read, the read will
+ * lose and not overwrite the key with stale data.
*
* Return values:
* -EAGAIN: @cl was put on a waitlist waiting for btree node allocation
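[Editor's note: a hedged illustration of the race-avoidance scheme this comment describes, phrased as a plain C11 atomic compare-exchange rather than a btree key: the miss path plants a unique dummy value, and the completed read may install its possibly-stale data only if nobody replaced the dummy in the meantime. All names below are invented for the sketch.]

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

#define EMPTY 0u
#define DUMMY 0xdeadbeefu   /* the "random key" planted on a miss */

/* Toy one-entry cache: the value itself plays the role of the btree key. */
static _Atomic uint32_t slot = EMPTY;

/* Cache miss: plant the dummy value before starting the slow read. */
static void begin_miss(void)
{
	atomic_store(&slot, DUMMY);
}

/* A write that completes while the read is still in flight simply
 * overwrites the dummy; it doesn't need to know about the race. */
static void racing_write(uint32_t value)
{
	atomic_store(&slot, value);
}

/* Read completion: swap the read data in only if the dummy is still there.
 * If a write won the race, the compare-exchange fails and the stale read
 * result is thrown away instead of clobbering the newer data. */
static int finish_miss(uint32_t read_value)
{
	uint32_t expected = DUMMY;

	return atomic_compare_exchange_strong(&slot, &expected, read_value);
}

int main(void)
{
	begin_miss();
	racing_write(1234);              /* user write lands during the read */

	if (!finish_miss(999))
		printf("read lost the race; slot keeps %u\n",
		       (unsigned)atomic_load(&slot));
	return 0;
}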