author    Slava Pestov <sviatoslavpestov@gmail.com>  2014-06-21 17:24:27 -0700
committer Kent Overstreet <kent.overstreet@gmail.com>  2016-10-07 09:00:19 -0800
commit    6cbf03d43a3711873d64de8ef631b0c599045cc6
tree      af6485e45e112f2051d8b39cc686b1a4ace86a37
parent    5e0c54762b17d0a59750a4e79c0393c7873871f4
bcache: comments
Signed-off-by: Kent Overstreet <kent.overstreet@gmail.com>
-rw-r--r--  drivers/md/bcache/alloc.c | 65
-rw-r--r--  drivers/md/bcache/btree.c | 32
2 files changed, 69 insertions, 28 deletions
diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c
index d1012bb5dc78..70e11a4132fd 100644
--- a/drivers/md/bcache/alloc.c
+++ b/drivers/md/bcache/alloc.c
@@ -32,14 +32,7 @@
* If we've got discards enabled, that happens when a bucket moves from the
* free_inc list to the free list.
*
- * There is another freelist, because sometimes we have buckets that we know
- * have nothing pointing into them - these we can reuse without waiting for
- * priorities to be rewritten. These come from freed btree nodes and buckets
- * that garbage collection discovered no longer had valid keys pointing into
- * them (because they were overwritten). That's the unused list - buckets on the
- * unused list move to the free list, optionally being discarded in the process.
- *
- * It's also important to ensure that gens don't wrap around - with respect to
+ * It's important to ensure that gens don't wrap around - with respect to
* either the oldest gen in the btree or the gen on disk. This is quite
* difficult to do in practice, but we explicitly guard against it anyways - if
* a bucket is in danger of wrapping around we simply skip invalidating it that
@@ -51,7 +44,7 @@
* bch_bucket_alloc_set() allocates one or more buckets from different caches
* out of a cache set.
*
- * free_some_buckets() drives all the processes described above. It's called
+ * invalidate_buckets() drives all the processes described above. It's called
* from bch_bucket_alloc() and a few other places that need to make sure free
* buckets are ready.
*
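[Editor's note: to make the wraparound guard described above concrete, here is a minimal standalone C sketch. All names (gen_may_wrap, MAX_GEN_DELTA, the bucket fields) are hypothetical stand-ins, not the driver's API. Generations are small integers that wrap, so the distance between a bucket's current gen and the oldest gen still referenced has to be computed with modular arithmetic, and invalidation is skipped when that distance grows too large.]

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Generations are 8-bit and wrap, so the "distance" between the bucket's
 * current gen and the oldest gen still referenced is computed modulo 256.
 * MAX_GEN_DELTA is an arbitrary threshold chosen for this sketch. */
#define MAX_GEN_DELTA 96

struct bucket {
	uint8_t gen;        /* bumped every time the bucket is invalidated */
	uint8_t oldest_gen; /* oldest gen the btree or disk might still reference */
};

/* True if invalidating (and thus incrementing gen) risks letting a stale
 * pointer look valid again after the 8-bit counter wraps. */
static bool gen_may_wrap(const struct bucket *b)
{
	return (uint8_t)(b->gen - b->oldest_gen) >= MAX_GEN_DELTA;
}

int main(void)
{
	struct bucket b = { .gen = 250, .oldest_gen = 120 };

	if (gen_may_wrap(&b))
		printf("skip invalidating until GC/prio writes raise oldest_gen\n");
	else
		printf("safe to invalidate, gen delta %u\n",
		       (unsigned)(uint8_t)(b.gen - b.oldest_gen));
	return 0;
}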
@@ -70,6 +63,12 @@
#include <linux/random.h>
#include <trace/events/bcache.h>
+/**
+ * alloc_failed - kick off external processes to free up buckets
+ *
+ * We couldn't find enough buckets in invalidate_buckets(). Ask
+ * btree GC to run, hoping it will find more clean buckets.
+ */
static void alloc_failed(struct cache *ca)
{
struct cache_set *c = ca->set;
@@ -397,6 +396,14 @@ static int bch_allocator_push(struct cache *ca, long bucket)
return false;
}
+/**
+ * bch_allocator_thread - move buckets from free_inc to reserves
+ *
+ * The free_inc FIFO is populated by invalidate_buckets(), and
+ * the reserves are depleted by bucket allocation. When we run out
+ * of free_inc, try to invalidate some buckets and write out
+ * prios and gens.
+ */
static int bch_allocator_thread(void *arg)
{
struct cache *ca = arg;
@@ -406,8 +413,8 @@ static int bch_allocator_thread(void *arg)
while (1) {
/*
- * First, we pull buckets off of the free_inc lists, possibly
- * issue discards to them, then we add the bucket to the
+ * First, we pull buckets off of the free_inc list, possibly
+ * issue discards to them, then we add the bucket to a
* free list:
*/
while (!fifo_empty(&ca->free_inc)) {
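[Editor's note: the comment above describes a simple drain loop: take a bucket off free_inc, optionally discard it, then hand it to a free list. Below is a rough userspace sketch of that shape, with a toy ring-buffer FIFO standing in for the driver's fifo primitives; every name here is hypothetical.]

#include <stdbool.h>
#include <stdio.h>

#define FIFO_SIZE 8

/* Toy FIFO standing in for the driver's fifo_* primitives */
struct toy_fifo {
	long data[FIFO_SIZE];
	unsigned head, tail;   /* free-running counters; head == tail means empty */
};

static bool toy_fifo_empty(struct toy_fifo *f) { return f->head == f->tail; }
static bool toy_fifo_full(struct toy_fifo *f)  { return f->head - f->tail == FIFO_SIZE; }

static bool toy_fifo_push(struct toy_fifo *f, long v)
{
	if (toy_fifo_full(f))
		return false;
	f->data[f->head++ % FIFO_SIZE] = v;
	return true;
}

static long toy_fifo_pop(struct toy_fifo *f)
{
	return f->data[f->tail++ % FIFO_SIZE];
}

static void issue_discard(long bucket)
{
	printf("discard bucket %ld\n", bucket);   /* stand-in for a real discard */
}

/* Drain free_inc: discard each bucket, then move it to the free list. */
static void drain_free_inc(struct toy_fifo *free_inc, struct toy_fifo *free_list)
{
	while (!toy_fifo_empty(free_inc)) {
		long bucket = toy_fifo_pop(free_inc);

		issue_discard(bucket);
		if (!toy_fifo_push(free_list, bucket)) {
			/* the real allocator sleeps here until a reserve has
			 * room and keeps the bucket; the sketch just stops */
			break;
		}
	}
}

int main(void)
{
	struct toy_fifo free_inc = { 0 }, free_list = { 0 };

	for (long b = 1; b <= 5; b++)
		toy_fifo_push(&free_inc, b);
	drain_free_inc(&free_inc, &free_list);
	return 0;
}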
@@ -427,6 +434,10 @@ static int bch_allocator_thread(void *arg)
mutex_lock(&c->bucket_lock);
}
+ /*
+ * Wait for someone to allocate a bucket if all
+ * reserves are full:
+ */
allocator_wait(c, ca->fifo_wait,
bch_allocator_push(ca, bucket));
fifo_pop(&ca->free_inc, bucket);
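[Editor's note: the allocator_wait() above blocks the allocator until a consumer frees up room in a reserve. As a rough illustration of the same pattern (not the driver's closure/waitlist machinery), plain pthreads would express it as a condition-variable wait that re-checks its predicate after every wakeup. Build with cc -pthread; all names are made up.]

#include <pthread.h>
#include <stdbool.h>

/* Hypothetical reserve with a fixed capacity */
struct reserve {
	pthread_mutex_t lock;
	pthread_cond_t  has_room;   /* signalled when a bucket is taken out */
	unsigned used, size;
	long buckets[64];
};

/* Allocator side: block until the reserve has room, then push. */
static void reserve_push_blocking(struct reserve *r, long bucket)
{
	pthread_mutex_lock(&r->lock);
	while (r->used == r->size)                 /* re-check after every wakeup */
		pthread_cond_wait(&r->has_room, &r->lock);
	r->buckets[r->used++] = bucket;
	pthread_mutex_unlock(&r->lock);
}

/* Allocation path: taking a bucket makes room and wakes the allocator,
 * much like the closure_wake_up() in the hunk above. */
static bool reserve_pop(struct reserve *r, long *bucket)
{
	bool ok = false;

	pthread_mutex_lock(&r->lock);
	if (r->used > 0) {
		*bucket = r->buckets[--r->used];
		pthread_cond_signal(&r->has_room);
		ok = true;
	}
	pthread_mutex_unlock(&r->lock);
	return ok;
}

int main(void)
{
	struct reserve r = {
		.lock = PTHREAD_MUTEX_INITIALIZER,
		.has_room = PTHREAD_COND_INITIALIZER,
		.used = 0,
		.size = 2,
	};
	long b;

	reserve_push_blocking(&r, 10);
	reserve_push_blocking(&r, 11);
	if (reserve_pop(&r, &b))
		reserve_push_blocking(&r, 12);   /* fits now that one was taken */
	return 0;
}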
@@ -434,36 +445,36 @@ static int bch_allocator_thread(void *arg)
closure_wake_up(&c->bucket_wait);
}
- /*
- * We've run out of free buckets, we need to find some buckets
- * we can invalidate. First, invalidate them in memory and add
- * them to the free_inc list:
- */
+ /* We've run out of free buckets! */
retry_invalidate:
+ /* Wait for in-progress GC to finish if there is one */
allocator_wait(c, c->gc_wait, c->gc_mark_valid);
+ /*
* Find some buckets that we can invalidate: either they're
* completely unused, or they only contain clean data that's been
+ * written back to the backing device or another cache tier
+ */
invalidate_buckets(ca);
if (CACHE_SYNC(&ca->set->sb)) {
- /*
- * This could deadlock if an allocation with a btree
- * node locked ever blocked - having the btree node
- * locked would block garbage collection, but here we're
- * waiting on garbage collection before we invalidate
- * and free anything.
- *
- * But this should be safe since the btree code always
- * uses btree_check_reserve() before allocating now, and
- * if it fails it blocks without btree nodes locked.
- */
trace_bcache_alloc_batch(ca,
fifo_used(&ca->free_inc),
ca->free_inc.size);
+ /*
+ * If we didn't invalidate enough buckets to fill up
+ * free_inc, try to invalidate some more. This will
* limit the number of metadata writes we issue below
+ */
if (!fifo_full(&ca->free_inc))
goto retry_invalidate;
+ /*
* free_inc is full of newly-invalidated buckets; we must
* write out prios and gens before they can be re-used
+ */
bch_prio_write(ca);
}
}
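[Editor's note: the retry_invalidate loop above amortizes bch_prio_write(): it keeps invalidating until free_inc is full, so one metadata write covers a whole batch of buckets rather than a handful at a time. A small hedged sketch of that batching shape, using toy functions rather than the driver's:]

#include <stdbool.h>
#include <stdio.h>

#define BATCH_SIZE 16

/* Stand-in for invalidate_buckets(): pretends to find a few reclaimable
 * buckets per call and returns how many it added to the batch. */
static unsigned invalidate_some(unsigned *batched)
{
	unsigned found = 3;                 /* toy value */

	if (*batched + found > BATCH_SIZE)
		found = BATCH_SIZE - *batched;
	*batched += found;
	return found;
}

/* Stand-in for bch_prio_write(): the expensive step we want to amortize. */
static void write_prios_and_gens(unsigned batched)
{
	printf("writing prios/gens for %u buckets in one go\n", batched);
}

static void refill_free_inc(bool sync)
{
	unsigned batched = 0;

	/* Keep invalidating until the batch (free_inc) is full, mirroring
	 * the "goto retry_invalidate" above. */
	while (batched < BATCH_SIZE) {
		if (invalidate_some(&batched) == 0)
			break;               /* nothing left to invalidate */
	}

	/* In sync mode the new gens must hit disk before the buckets are reused. */
	if (sync)
		write_prios_and_gens(batched);
}

int main(void)
{
	refill_free_inc(true);
	return 0;
}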
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index dd2fdce21f03..45903db933f8 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -1472,6 +1472,8 @@ static int btree_gc_coalesce(struct btree *b, struct btree_op *op,
return -EINTR;
out_nocoalesce:
+ /* We may have written out some new nodes which are now garbage;
+ * wait for writes to finish */
closure_sync(&cl);
bch_keylist_free(&keylist);
@@ -1483,6 +1485,9 @@ out_nocoalesce:
return 0;
}
+/**
+ * btree_gc_rewrite_node - merge a node's bsets together and update the parent
+ */
static int btree_gc_rewrite_node(struct btree *b, struct btree_op *op,
struct btree *replace)
{
@@ -1517,6 +1522,9 @@ static int btree_gc_rewrite_node(struct btree *b, struct btree_op *op,
return -EINTR;
}
+/**
+ * btree_gc_count_keys - count keys in a btree node to see if we must coalesce
+ */
static unsigned btree_gc_count_keys(struct btree *b)
{
struct bkey *k;
@@ -1529,6 +1537,13 @@ static unsigned btree_gc_count_keys(struct btree *b)
return ret;
}
+/**
+ * btree_gc_recurse - run garbage collection on a node and its children
+ *
+ * This may bail out early for a variety of reasons. This is allowed
+ * because concurrent writes will conservatively mark buckets dirty,
+ * ensuring they won't be touched until the next GC pass.
+ */
static int btree_gc_recurse(struct btree *b, struct btree_op *op,
struct gc_stat *gc)
{
@@ -1536,6 +1551,8 @@ static int btree_gc_recurse(struct btree *b, struct btree_op *op,
bool should_rewrite;
struct bkey *k;
struct btree_iter iter;
+
+ /* Sliding window of GC_MERGE_NODES adjacent btree nodes */
struct gc_merge_info r[GC_MERGE_NODES];
struct gc_merge_info *i, *last = r + ARRAY_SIZE(r) - 1;
struct bkey tmp = NEXT_KEY(&b->c->gc_cur_key);
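[Editor's note: the r[GC_MERGE_NODES] array above is a sliding window: as GC walks a node's children it keeps the last few siblings and their key counts around, and when the keys of adjacent nodes would fit in fewer nodes it coalesces them. Here is a standalone sketch of that decision, using made-up constants (WINDOW, NODE_CAPACITY) instead of the real bkey accounting.]

#include <stdio.h>

#define WINDOW 4            /* stands in for GC_MERGE_NODES */
#define NODE_CAPACITY 100   /* toy "keys per node" limit */

/* Sliding window of the most recently visited sibling nodes' key counts;
 * in this sketch index 0 is the newest entry. */
struct gc_window {
	unsigned keys[WINDOW];
};

/* How many nodes could the windowed siblings be squeezed into?
 * Coalescing is worth it only if that is fewer than WINDOW. */
static unsigned nodes_needed(const struct gc_window *w)
{
	unsigned total = 0;

	for (unsigned i = 0; i < WINDOW; i++)
		total += w->keys[i];
	return (total + NODE_CAPACITY - 1) / NODE_CAPACITY;
}

/* Slide the window: drop the oldest entry and make room for a new child. */
static void window_push(struct gc_window *w, unsigned keys)
{
	for (unsigned i = WINDOW - 1; i > 0; i--)
		w->keys[i] = w->keys[i - 1];
	w->keys[0] = keys;
}

int main(void)
{
	struct gc_window w = { .keys = { 0 } };
	unsigned children[] = { 20, 35, 15, 25, 90, 10 };

	for (unsigned i = 0; i < sizeof(children) / sizeof(children[0]); i++) {
		window_push(&w, children[i]);
		if (i >= WINDOW - 1 && nodes_needed(&w) < WINDOW)
			printf("child %u: last %u nodes could coalesce into %u\n",
			       i, WINDOW, nodes_needed(&w));
	}
	return 0;
}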
@@ -1551,12 +1568,14 @@ static int btree_gc_recurse(struct btree *b, struct btree_op *op,
r->b = bch_btree_node_get(b->c, op, k, b->level - 1,
true, b);
if (IS_ERR(r->b)) {
+ /* XXX: handle IO error better */
ret = PTR_ERR(r->b);
break;
}
r->keys = btree_gc_count_keys(r->b);
+ /* See if we should coalesce */
ret = btree_gc_coalesce(b, op, gc, r);
if (ret)
break;
@@ -1708,6 +1727,14 @@ static void bch_btree_gc_finish(struct cache_set *c)
mutex_unlock(&c->bucket_lock);
}
+/**
+ * bch_btree_gc - find reclaimable buckets and clean up the btree
+ *
+ * This will find buckets that are completely unreachable, as well as those
+ * only containing clean data that can be safely discarded. Also, nodes that
+ * contain too many bsets are merged up and re-written, and adjacent nodes
+ * with low occupancy are coalesced together.
+ */
static void bch_btree_gc(struct cache_set *c)
{
struct gc_stat stats;
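[Editor's note: at the bucket level this pass is essentially mark-and-sweep: walk the btree, mark every bucket a live key points into, and treat anything unmarked (or marked only by clean, already written-back data) as reclaimable. A toy C sketch of that idea follows, with a flat array of "keys" standing in for the btree; all names are hypothetical.]

#include <stdbool.h>
#include <stdio.h>

#define NBUCKETS 8

/* Toy "key": just records which bucket its data lives in */
struct toy_key {
	unsigned bucket;
	bool dirty;         /* dirty data must be kept; clean data is discardable */
};

int main(void)
{
	struct toy_key keys[] = {
		{ .bucket = 1, .dirty = true  },
		{ .bucket = 3, .dirty = false },
		{ .bucket = 3, .dirty = true  },
		{ .bucket = 6, .dirty = false },
	};
	bool referenced[NBUCKETS] = { false };
	bool has_dirty[NBUCKETS] = { false };

	/* Mark phase: walk all keys (the real code walks the btree) */
	for (unsigned i = 0; i < sizeof(keys) / sizeof(keys[0]); i++) {
		referenced[keys[i].bucket] = true;
		if (keys[i].dirty)
			has_dirty[keys[i].bucket] = true;
	}

	/* Sweep phase: unreferenced buckets, and buckets holding only clean
	 * (already written back) data, can be invalidated and reused. */
	for (unsigned b = 0; b < NBUCKETS; b++) {
		if (!referenced[b])
			printf("bucket %u: unreachable, reclaim\n", b);
		else if (!has_dirty[b])
			printf("bucket %u: only clean data, reclaimable\n", b);
		else
			printf("bucket %u: has dirty data, keep\n", b);
	}
	return 0;
}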
@@ -2419,7 +2446,10 @@ int bch_btree_insert_node_sync(struct btree *b, struct btree_op *op,
/**
* bch_btree_insert_check_key - insert dummy key into btree
*
- * Used to address a race with cache promotion after cache miss.
+ * We insert a random key on a cache miss, then compare-exchange on it
+ * once the cache promotion or backing device read completes. This
+ * ensures that if this key is written to after the read, the read will
+ * lose and not overwrite the key with stale data.
*
* Return values:
* -EAGAIN: @cl was put on a waitlist waiting for btree node allocation
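[Editor's note: a hedged illustration of the race-avoidance scheme this comment describes, phrased as a plain C11 atomic compare-exchange rather than a btree key: the miss path plants a unique dummy value, and the completed read may install its possibly-stale data only if nobody replaced the dummy in the meantime. All names below are invented for the sketch.]

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

#define EMPTY 0u
#define DUMMY 0xdeadbeefu   /* the "random key" planted on a miss */

/* Toy one-entry cache: the value itself plays the role of the btree key. */
static _Atomic uint32_t slot = EMPTY;

/* Cache miss: plant the dummy value before starting the slow read. */
static void begin_miss(void)
{
	atomic_store(&slot, DUMMY);
}

/* A write that completes while the read is still in flight simply
 * overwrites the dummy; it doesn't need to know about the race. */
static void racing_write(uint32_t value)
{
	atomic_store(&slot, value);
}

/* Read completion: swap the read data in only if the dummy is still there.
 * If a write won the race, the compare-exchange fails and the stale read
 * result is thrown away instead of clobbering the newer data. */
static int finish_miss(uint32_t read_value)
{
	uint32_t expected = DUMMY;

	return atomic_compare_exchange_strong(&slot, &expected, read_value);
}

int main(void)
{
	begin_miss();
	racing_write(1234);              /* user write lands during the read */

	if (!finish_miss(999))
		printf("read lost the race; slot keeps %u\n",
		       (unsigned)atomic_load(&slot));
	return 0;
}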