author    Kent Overstreet <kent.overstreet@gmail.com>    2016-10-06 04:56:55 -0800
committer Kent Overstreet <kent.overstreet@gmail.com>    2016-10-07 12:37:21 -0800
commit    e77dbfe9b212cb4301ff37d9edafc2791b6f4970 (patch)
tree      92522842ba127aeddf37f05544cac49134e27bca
parent    d00cbd9b839349ecfae58fd0d4845999467ca08a (diff)
bcache: copygc fixes
Copygc now passes really nasty torture tests without deadlocking
-rw-r--r--   drivers/md/bcache/alloc.c      23
-rw-r--r--   drivers/md/bcache/move.c        2
-rw-r--r--   drivers/md/bcache/movinggc.c   97
-rw-r--r--   drivers/md/bcache/movinggc.h   23
-rw-r--r--   drivers/md/bcache/super.c      11
5 files changed, 103 insertions, 53 deletions
diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c
index 4e1b3a925c34..3b6ad78ecb9e 100644
--- a/drivers/md/bcache/alloc.c
+++ b/drivers/md/bcache/alloc.c
@@ -733,11 +733,17 @@ static void invalidate_buckets(struct cache *ca)
static bool __bch_allocator_push(struct cache *ca, long bucket)
{
- unsigned i;
+ if (fifo_push(&ca->free[RESERVE_PRIO], bucket))
+ goto success;
+
+ if (fifo_push(&ca->free[RESERVE_MOVINGGC], bucket))
+ goto success;
+
+ if (fifo_push(&ca->free[RESERVE_BTREE], bucket))
+ goto success;
- for (i = 0; i < RESERVE_NR; i++)
- if (fifo_push(&ca->free[i], bucket))
- goto success;
+ if (fifo_push(&ca->free[RESERVE_NONE], bucket))
+ goto success;
return false;
success:
@@ -1030,6 +1036,7 @@ static enum bucket_alloc_ret bch_bucket_alloc_group(struct cache_set *c,
ret = ALLOC_SUCCESS;
err:
+ EBUG_ON(ret != ALLOC_SUCCESS && reserve == RESERVE_MOVINGGC);
spin_unlock(&devs->lock);
rcu_read_unlock();
return ret;
@@ -1528,11 +1535,17 @@ static void bch_recalc_capacity(struct cache_set *c)
* allocations for foreground writes must wait -
* not -ENOSPC calculations.
*/
- for (j = 0; j < RESERVE_NR; j++)
+ for (j = 0; j < RESERVE_NONE; j++)
reserve += ca->free[j].size;
reserve += ca->free_inc.size;
+ reserve += ARRAY_SIZE(c->write_points);
+
+ if (ca->mi.tier)
+ reserve += 1; /* tiering write point */
+ reserve += 1; /* btree write point */
+
reserved_sectors += reserve << ca->bucket_bits;
capacity += (ca->mi.nbuckets -
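
Note: the alloc.c change above replaces the loop over all reserves with an explicit fill order, so the internal reserves (prio, copygc, btree) are topped up before the general-purpose freelist. The standalone sketch below is not bcache code; the fifo type, the reserve sizes and the allocator_push() helper are stand-ins used only to illustrate the same priority order:

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

enum reserve { RESERVE_PRIO, RESERVE_MOVINGGC, RESERVE_BTREE, RESERVE_NONE, RESERVE_NR };

/* toy fifo, not the kernel's fifo_push() */
struct fifo { long data[8]; size_t used, size; };

static bool fifo_push(struct fifo *f, long bucket)
{
	if (f->used == f->size)
		return false;
	f->data[f->used++] = bucket;
	return true;
}

/* mirrors the new __bch_allocator_push() behaviour: fixed priority order */
static bool allocator_push(struct fifo free[RESERVE_NR], long bucket)
{
	return fifo_push(&free[RESERVE_PRIO], bucket) ||
	       fifo_push(&free[RESERVE_MOVINGGC], bucket) ||
	       fifo_push(&free[RESERVE_BTREE], bucket) ||
	       fifo_push(&free[RESERVE_NONE], bucket);
}

int main(void)
{
	struct fifo free[RESERVE_NR] = {
		[RESERVE_PRIO]     = { .size = 1 },
		[RESERVE_MOVINGGC] = { .size = 2 },
		[RESERVE_BTREE]    = { .size = 1 },
		[RESERVE_NONE]     = { .size = 4 },
	};

	/* internal reserves fill first; overflow lands in RESERVE_NONE */
	for (long b = 0; b < 6; b++)
		allocator_push(free, b);

	for (int i = 0; i < RESERVE_NR; i++)
		printf("reserve %d: %zu/%zu buckets\n", i, free[i].used, free[i].size);
	return 0;
}
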
diff --git a/drivers/md/bcache/move.c b/drivers/md/bcache/move.c
index 9c49c6e64f1a..37091d458cb4 100644
--- a/drivers/md/bcache/move.c
+++ b/drivers/md/bcache/move.c
@@ -108,7 +108,7 @@ static int bch_migrate_index_update(struct bch_write_op *op)
break;
} else {
nomatch:
- bch_btree_iter_set_pos(&iter, k.k->p);
+ bch_btree_iter_advance_pos(&iter);
}
while (bkey_cmp(iter.pos, bch_keylist_front(keys)->k.p) >= 0) {
diff --git a/drivers/md/bcache/movinggc.c b/drivers/md/bcache/movinggc.c
index 76b92c4f1713..1e6661b00ab9 100644
--- a/drivers/md/bcache/movinggc.c
+++ b/drivers/md/bcache/movinggc.c
@@ -119,12 +119,22 @@ next:
bch_queue_run(&ca->moving_gc_queue, ctxt);
}
-static bool bch_moving_gc(struct cache *ca)
+static bool have_copygc_reserve(struct cache *ca)
+{
+ bool ret;
+
+ spin_lock(&ca->freelist_lock);
+ ret = fifo_used(&ca->free[RESERVE_MOVINGGC]) >=
+ COPYGC_BUCKETS_PER_ITER(ca);
+ spin_unlock(&ca->freelist_lock);
+
+ return ret;
+}
+
+static void bch_moving_gc(struct cache *ca)
{
struct cache_set *c = ca->set;
struct bucket *g;
- bool moved = false;
-
u64 sectors_to_move, sectors_gen, gen_current, sectors_total;
size_t buckets_to_move, buckets_unused = 0;
struct bucket_heap_entry e;
@@ -136,22 +146,21 @@ static bool bch_moving_gc(struct cache *ca)
bch_moving_context_init(&ctxt, &ca->moving_gc_pd.rate,
MOVING_PURPOSE_COPY_GC);
- /*
- * We won't fill up the moving GC reserve completely if the data
- * being copied is from different generations. In the worst case,
- * there will be NUM_GC_GENS buckets of internal fragmentation
- */
-
- spin_lock(&ca->freelist_lock);
- reserve_sectors = ca->mi.bucket_size *
- (fifo_used(&ca->free[RESERVE_MOVINGGC]) - NUM_GC_GENS);
- spin_unlock(&ca->freelist_lock);
+ if (!have_copygc_reserve(ca)) {
+ struct closure cl;
- if (reserve_sectors < (int) c->sb.block_size) {
- trace_bcache_moving_gc_reserve_empty(ca);
- return false;
+ closure_init_stack(&cl);
+ while (1) {
+ closure_wait(&c->freelist_wait, &cl);
+ if (have_copygc_reserve(ca))
+ break;
+ closure_sync(&cl);
+ }
+ closure_wake_up(&c->freelist_wait);
}
+ reserve_sectors = COPYGC_SECTORS_PER_ITER(ca);
+
trace_bcache_moving_gc_start(ca);
/*
@@ -161,7 +170,15 @@ static bool bch_moving_gc(struct cache *ca)
* buckets have been visited.
*/
+ /*
+ * We need bucket marks to be up to date, so gc can't be recalculating
+ * them, and we don't want the allocator invalidating a bucket after
+ * we've decided to evacuate it but before we set copygc_gen:
+ */
+ down_read(&c->gc_lock);
mutex_lock(&ca->heap_lock);
+ mutex_lock(&ca->set->bucket_lock);
+
ca->heap.used = 0;
for_each_bucket(g, ca) {
g->copygc_gen = 0;
@@ -187,31 +204,18 @@ static bool bch_moving_gc(struct cache *ca)
for (i = 0; i < ca->heap.used; i++)
sectors_to_move += ca->heap.data[i].val;
- /* XXX: calculate this threshold rigorously */
-
- if (ca->heap.used < ca->free_inc.size / 2 &&
- sectors_to_move < reserve_sectors) {
- mutex_unlock(&ca->heap_lock);
- trace_bcache_moving_gc_no_work(ca);
- return false;
- }
-
- while (sectors_to_move > reserve_sectors) {
+ while (sectors_to_move > COPYGC_SECTORS_PER_ITER(ca)) {
BUG_ON(!heap_pop(&ca->heap, e, bucket_min_cmp));
sectors_to_move -= e.val;
}
buckets_to_move = ca->heap.used;
- if (sectors_to_move)
- moved = true;
-
/*
* resort by write_prio to group into generations, attempts to
* keep hot and cold data in the same locality.
*/
- mutex_lock(&ca->set->bucket_lock);
for (i = 0; i < ca->heap.used; i++) {
struct bucket_heap_entry *e = &ca->heap.data[i];
@@ -231,16 +235,20 @@ static bool bch_moving_gc(struct cache *ca)
sectors_total >= sectors_gen * gen_current)
gen_current++;
}
- mutex_unlock(&ca->set->bucket_lock);
+ mutex_unlock(&ca->set->bucket_lock);
mutex_unlock(&ca->heap_lock);
+ up_read(&c->gc_lock);
read_moving(ca, &ctxt);
+ if (IS_ENABLED(CONFIG_BCACHE_DEBUG)) {
+ for_each_bucket(g, ca)
+ BUG_ON(g->copygc_gen && bucket_sectors_used(g));
+ }
+
trace_bcache_moving_gc_end(ca, ctxt.sectors_moved, ctxt.keys_moved,
buckets_to_move);
-
- return moved;
}
static int bch_moving_gc_thread(void *arg)
@@ -249,8 +257,7 @@ static int bch_moving_gc_thread(void *arg)
struct cache_set *c = ca->set;
struct io_clock *clock = &c->io_clock[WRITE];
unsigned long last;
- s64 next;
- bool moved;
+ u64 available, want, next;
set_freezable();
@@ -263,15 +270,17 @@ static int bch_moving_gc_thread(void *arg)
* don't start copygc until less than half the gc reserve is
* available:
*/
- next = (buckets_available_cache(ca) -
- div64_u64((ca->mi.nbuckets - ca->mi.first_bucket) *
- c->opts.gc_reserve_percent, 200)) *
- ca->mi.bucket_size;
-
- if (next <= 0)
- moved = bch_moving_gc(ca);
- else
- bch_kthread_io_clock_wait(clock, last + next);
+ available = buckets_available_cache(ca);
+ want = div64_u64((ca->mi.nbuckets - ca->mi.first_bucket) *
+ c->opts.gc_reserve_percent, 200);
+ if (available > want) {
+ next = last + (available - want) *
+ ca->mi.bucket_size;
+ bch_kthread_io_clock_wait(clock, next);
+ continue;
+ }
+
+ bch_moving_gc(ca);
}
return 0;
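
Note: the rewritten copygc thread above replaces the signed "next" calculation with explicit available/want values: copygc only runs once fewer than half the gc reserve's worth of buckets are still available, otherwise it sleeps on the write I/O clock until enough sectors have been written. A standalone sketch of that trigger arithmetic follows, with made-up device numbers and plain C division in place of the kernel's div64_u64() and io-clock wait:

#include <inttypes.h>
#include <stdio.h>

static uint64_t copygc_wait_amount(uint64_t nbuckets, uint64_t first_bucket,
				   uint64_t available, uint64_t bucket_size,
				   unsigned gc_reserve_percent)
{
	/* "want": half the gc reserve, in buckets (percent / 100, halved) */
	uint64_t want = (nbuckets - first_bucket) * gc_reserve_percent / 200;

	if (available <= want)
		return 0;	/* less than half the reserve left: run copygc now */

	/* otherwise, sleep until this many more sectors have been written */
	return (available - want) * bucket_size;
}

int main(void)
{
	/* example values, not taken from any real device */
	uint64_t wait = copygc_wait_amount(1ULL << 20, 64, 9000, 1024, 1);

	if (wait)
		printf("wait for %" PRIu64 " sectors of writes\n", wait);
	else
		printf("run copygc now\n");
	return 0;
}
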
diff --git a/drivers/md/bcache/movinggc.h b/drivers/md/bcache/movinggc.h
index 5d09e0fa3ae1..45496a31a95f 100644
--- a/drivers/md/bcache/movinggc.h
+++ b/drivers/md/bcache/movinggc.h
@@ -1,6 +1,29 @@
#ifndef _BCACHE_MOVINGGC_H
#define _BCACHE_MOVINGGC_H
+/*
+ * We can't use the entire copygc reserve in one iteration of copygc: we may
+ * need the buckets we're freeing up to go back into the copygc reserve to make
+ * forward progress, but if the copygc reserve is full they'll be available for
+ * any allocation - and it's possible that in a given iteration, we free up most
+ * of the buckets we're going to free before we allocate most of the buckets
+ * we're going to allocate.
+ *
+ * If we only use half of the reserve per iteration, then in steady state we'll
+ * always have room in the reserve for the buckets we're going to need in the
+ * next iteration:
+ */
+#define COPYGC_BUCKETS_PER_ITER(ca) \
+ ((ca)->free[RESERVE_MOVINGGC].size / 2)
+
+/*
+ * Max sectors to move per iteration: Have to take into account internal
+ * fragmentation from the multiple write points for each generation:
+ */
+#define COPYGC_SECTORS_PER_ITER(ca) \
+ ((ca)->mi.bucket_size * \
+ (COPYGC_BUCKETS_PER_ITER(ca) - (NUM_GC_GENS - 1)))
+
int bch_moving_init_cache(struct cache *);
void bch_moving_gc_stop(struct cache *);
int bch_moving_gc_thread_start(struct cache *);
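
Note: the two macros added above budget each copygc iteration: half of the RESERVE_MOVINGGC fifo, minus one bucket of slack per extra generation write point, converted to sectors. A quick standalone sketch of that arithmetic, with assumed values for NUM_GC_GENS, the reserve size and the bucket size (the real values live in the bcache headers and member info):

#include <stdio.h>

#define NUM_GC_GENS	8		/* assumed for illustration */

#define COPYGC_BUCKETS_PER_ITER(reserve_size)	((reserve_size) / 2)

#define COPYGC_SECTORS_PER_ITER(reserve_size, bucket_size)		\
	((bucket_size) *						\
	 (COPYGC_BUCKETS_PER_ITER(reserve_size) - (NUM_GC_GENS - 1)))

int main(void)
{
	unsigned reserve_size = 128;	/* buckets in RESERVE_MOVINGGC (example) */
	unsigned bucket_size  = 1024;	/* sectors per bucket (example) */

	printf("buckets per iteration: %u\n",
	       COPYGC_BUCKETS_PER_ITER(reserve_size));
	printf("sectors per iteration: %u\n",
	       COPYGC_SECTORS_PER_ITER(reserve_size, bucket_size));
	return 0;
}
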
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index 309a720c5a80..fd302e7d6daa 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -1933,11 +1933,16 @@ static const char *cache_alloc(struct bcache_superblock *sb,
ca->bucket_bits = ilog2(ca->mi.bucket_size);
/* XXX: tune these */
- movinggc_reserve = max_t(size_t, NUM_GC_GENS * 2,
+ movinggc_reserve = max_t(size_t, NUM_GC_GENS * 4,
ca->mi.nbuckets >> 7);
reserve_none = max_t(size_t, 4, ca->mi.nbuckets >> 9);
- free_inc_reserve = reserve_none << 1;
- heap_size = max_t(size_t, free_inc_reserve, movinggc_reserve);
+ /*
+ * free_inc must be smaller than the copygc reserve: if it was bigger,
+ * one copygc iteration might not make enough buckets available to fill
+ * up free_inc and allow the allocator to make forward progress
+ */
+ free_inc_reserve = movinggc_reserve / 2;
+ heap_size = movinggc_reserve * 8;
if (!init_fifo(&ca->free[RESERVE_PRIO], prio_buckets(ca), GFP_KERNEL) ||
!init_fifo(&ca->free[RESERVE_BTREE], BTREE_NODE_RESERVE, GFP_KERNEL) ||
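
Note: the super.c hunk (shown truncated above) resizes the reserves so that free_inc is half the copygc reserve, which is what guarantees a single copygc pass can refill it. A standalone sketch of the sizing arithmetic for an example device, using a plain MAX() in place of the kernel's max_t() and an assumed NUM_GC_GENS:

#include <stdio.h>

#define NUM_GC_GENS	8		/* assumed for illustration */
#define MAX(a, b)	((a) > (b) ? (a) : (b))

int main(void)
{
	unsigned long nbuckets		= 1UL << 20;	/* example device size */

	unsigned long movinggc_reserve	= MAX((unsigned long)NUM_GC_GENS * 4,
					      nbuckets >> 7);
	unsigned long reserve_none	= MAX(4UL, nbuckets >> 9);
	unsigned long free_inc_reserve	= movinggc_reserve / 2;
	unsigned long heap_size		= movinggc_reserve * 8;

	printf("movinggc_reserve=%lu reserve_none=%lu free_inc=%lu heap=%lu\n",
	       movinggc_reserve, reserve_none, free_inc_reserve, heap_size);
	return 0;
}
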