author    Kent Overstreet <kent.overstreet@gmail.com>    2016-10-06 04:56:55 -0800
committer Kent Overstreet <kent.overstreet@gmail.com>    2016-10-07 12:37:21 -0800
commit    e77dbfe9b212cb4301ff37d9edafc2791b6f4970 (patch)
tree      92522842ba127aeddf37f05544cac49134e27bca
parent    d00cbd9b839349ecfae58fd0d4845999467ca08a (diff)
bcache: copygc fixes
Copygc now passes really nasty torture tests without deadlocking
-rw-r--r--   drivers/md/bcache/alloc.c      23
-rw-r--r--   drivers/md/bcache/move.c        2
-rw-r--r--   drivers/md/bcache/movinggc.c   97
-rw-r--r--   drivers/md/bcache/movinggc.h   23
-rw-r--r--   drivers/md/bcache/super.c      11
5 files changed, 103 insertions, 53 deletions
diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c
index 4e1b3a925c34..3b6ad78ecb9e 100644
--- a/drivers/md/bcache/alloc.c
+++ b/drivers/md/bcache/alloc.c
@@ -733,11 +733,17 @@ static void invalidate_buckets(struct cache *ca)
static bool __bch_allocator_push(struct cache *ca, long bucket)
{
- unsigned i;
+ if (fifo_push(&ca->free[RESERVE_PRIO], bucket))
+ goto success;
+
+ if (fifo_push(&ca->free[RESERVE_MOVINGGC], bucket))
+ goto success;
+
+ if (fifo_push(&ca->free[RESERVE_BTREE], bucket))
+ goto success;
- for (i = 0; i < RESERVE_NR; i++)
- if (fifo_push(&ca->free[i], bucket))
- goto success;
+ if (fifo_push(&ca->free[RESERVE_NONE], bucket))
+ goto success;
return false;
success:
@@ -1030,6 +1036,7 @@ static enum bucket_alloc_ret bch_bucket_alloc_group(struct cache_set *c,
ret = ALLOC_SUCCESS;
err:
+ EBUG_ON(ret != ALLOC_SUCCESS && reserve == RESERVE_MOVINGGC);
spin_unlock(&devs->lock);
rcu_read_unlock();
return ret;
@@ -1528,11 +1535,17 @@ static void bch_recalc_capacity(struct cache_set *c)
* allocations for foreground writes must wait -
* not -ENOSPC calculations.
*/
- for (j = 0; j < RESERVE_NR; j++)
+ for (j = 0; j < RESERVE_NONE; j++)
reserve += ca->free[j].size;
reserve += ca->free_inc.size;
+ reserve += ARRAY_SIZE(c->write_points);
+
+ if (ca->mi.tier)
+ reserve += 1; /* tiering write point */
+ reserve += 1; /* btree write point */
+
reserved_sectors += reserve << ca->bucket_bits;
capacity += (ca->mi.nbuckets -
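
Note: the alloc.c change above replaces the loop over all reserves with an explicit fill order, so the internal reserves (prio, copygc, btree) are topped up before the general-purpose freelist. The standalone sketch below is not bcache code; the fifo type, the reserve sizes and the allocator_push() helper are stand-ins used only to illustrate the same priority order:

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

enum reserve { RESERVE_PRIO, RESERVE_MOVINGGC, RESERVE_BTREE, RESERVE_NONE, RESERVE_NR };

/* toy fifo, not the kernel's fifo_push() */
struct fifo { long data[8]; size_t used, size; };

static bool fifo_push(struct fifo *f, long bucket)
{
	if (f->used == f->size)
		return false;
	f->data[f->used++] = bucket;
	return true;
}

/* mirrors the new __bch_allocator_push() behaviour: fixed priority order */
static bool allocator_push(struct fifo free[RESERVE_NR], long bucket)
{
	return fifo_push(&free[RESERVE_PRIO], bucket) ||
	       fifo_push(&free[RESERVE_MOVINGGC], bucket) ||
	       fifo_push(&free[RESERVE_BTREE], bucket) ||
	       fifo_push(&free[RESERVE_NONE], bucket);
}

int main(void)
{
	struct fifo free[RESERVE_NR] = {
		[RESERVE_PRIO]     = { .size = 1 },
		[RESERVE_MOVINGGC] = { .size = 2 },
		[RESERVE_BTREE]    = { .size = 1 },
		[RESERVE_NONE]     = { .size = 4 },
	};

	/* internal reserves fill first; overflow lands in RESERVE_NONE */
	for (long b = 0; b < 6; b++)
		allocator_push(free, b);

	for (int i = 0; i < RESERVE_NR; i++)
		printf("reserve %d: %zu/%zu buckets\n", i, free[i].used, free[i].size);
	return 0;
}
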
diff --git a/drivers/md/bcache/move.c b/drivers/md/bcache/move.c
index 9c49c6e64f1a..37091d458cb4 100644
--- a/drivers/md/bcache/move.c
+++ b/drivers/md/bcache/move.c
@@ -108,7 +108,7 @@ static int bch_migrate_index_update(struct bch_write_op *op)
break;
} else {
nomatch:
- bch_btree_iter_set_pos(&iter, k.k->p);
+ bch_btree_iter_advance_pos(&iter);
}
while (bkey_cmp(iter.pos, bch_keylist_front(keys)->k.p) >= 0) {
diff --git a/drivers/md/bcache/movinggc.c b/drivers/md/bcache/movinggc.c
index 76b92c4f1713..1e6661b00ab9 100644
--- a/drivers/md/bcache/movinggc.c
+++ b/drivers/md/bcache/movinggc.c
@@ -119,12 +119,22 @@ next:
bch_queue_run(&ca->moving_gc_queue, ctxt);
}
-static bool bch_moving_gc(struct cache *ca)
+static bool have_copygc_reserve(struct cache *ca)
+{
+ bool ret;
+
+ spin_lock(&ca->freelist_lock);
+ ret = fifo_used(&ca->free[RESERVE_MOVINGGC]) >=
+ COPYGC_BUCKETS_PER_ITER(ca);
+ spin_unlock(&ca->freelist_lock);
+
+ return ret;
+}
+
+static void bch_moving_gc(struct cache *ca)
{
struct cache_set *c = ca->set;
struct bucket *g;
- bool moved = false;
-
u64 sectors_to_move, sectors_gen, gen_current, sectors_total;
size_t buckets_to_move, buckets_unused = 0;
struct bucket_heap_entry e;
@@ -136,22 +146,21 @@ static bool bch_moving_gc(struct cache *ca)
bch_moving_context_init(&ctxt, &ca->moving_gc_pd.rate,
MOVING_PURPOSE_COPY_GC);
- /*
- * We won't fill up the moving GC reserve completely if the data
- * being copied is from different generations. In the worst case,
- * there will be NUM_GC_GENS buckets of internal fragmentation
- */
-
- spin_lock(&ca->freelist_lock);
- reserve_sectors = ca->mi.bucket_size *
- (fifo_used(&ca->free[RESERVE_MOVINGGC]) - NUM_GC_GENS);
- spin_unlock(&ca->freelist_lock);
+ if (!have_copygc_reserve(ca)) {
+ struct closure cl;
- if (reserve_sectors < (int) c->sb.block_size) {
- trace_bcache_moving_gc_reserve_empty(ca);
- return false;
+ closure_init_stack(&cl);
+ while (1) {
+ closure_wait(&c->freelist_wait, &cl);
+ if (have_copygc_reserve(ca))
+ break;
+ closure_sync(&cl);
+ }
+ closure_wake_up(&c->freelist_wait);
}
+ reserve_sectors = COPYGC_SECTORS_PER_ITER(ca);
+
trace_bcache_moving_gc_start(ca);
/*
@@ -161,7 +170,15 @@ static bool bch_moving_gc(struct cache *ca)
* buckets have been visited.
*/
+ /*
+ * We need bucket marks to be up to date, so gc can't be recalculating
+ * them, and we don't want the allocator invalidating a bucket after
+ * we've decided to evacuate it but before we set copygc_gen:
+ */
+ down_read(&c->gc_lock);
mutex_lock(&ca->heap_lock);
+ mutex_lock(&ca->set->bucket_lock);
+
ca->heap.used = 0;
for_each_bucket(g, ca) {
g->copygc_gen = 0;
@@ -187,31 +204,18 @@ static bool bch_moving_gc(struct cache *ca)
for (i = 0; i < ca->heap.used; i++)
sectors_to_move += ca->heap.data[i].val;
- /* XXX: calculate this threshold rigorously */
-
- if (ca->heap.used < ca->free_inc.size / 2 &&
- sectors_to_move < reserve_sectors) {
- mutex_unlock(&ca->heap_lock);
- trace_bcache_moving_gc_no_work(ca);
- return false;
- }
-
- while (sectors_to_move > reserve_sectors) {
+ while (sectors_to_move > COPYGC_SECTORS_PER_ITER(ca)) {
BUG_ON(!heap_pop(&ca->heap, e, bucket_min_cmp));
sectors_to_move -= e.val;
}
buckets_to_move = ca->heap.used;
- if (sectors_to_move)
- moved = true;
-
/*
* resort by write_prio to group into generations, attempts to
* keep hot and cold data in the same locality.
*/
- mutex_lock(&ca->set->bucket_lock);
for (i = 0; i < ca->heap.used; i++) {
struct bucket_heap_entry *e = &ca->heap.data[i];
@@ -231,16 +235,20 @@ static bool bch_moving_gc(struct cache *ca)
sectors_total >= sectors_gen * gen_current)
gen_current++;
}
- mutex_unlock(&ca->set->bucket_lock);
+ mutex_unlock(&ca->set->bucket_lock);
mutex_unlock(&ca->heap_lock);
+ up_read(&c->gc_lock);
read_moving(ca, &ctxt);
+ if (IS_ENABLED(CONFIG_BCACHE_DEBUG)) {
+ for_each_bucket(g, ca)
+ BUG_ON(g->copygc_gen && bucket_sectors_used(g));
+ }
+
trace_bcache_moving_gc_end(ca, ctxt.sectors_moved, ctxt.keys_moved,
buckets_to_move);
-
- return moved;
}
static int bch_moving_gc_thread(void *arg)
@@ -249,8 +257,7 @@ static int bch_moving_gc_thread(void *arg)
struct cache_set *c = ca->set;
struct io_clock *clock = &c->io_clock[WRITE];
unsigned long last;
- s64 next;
- bool moved;
+ u64 available, want, next;
set_freezable();
@@ -263,15 +270,17 @@ static int bch_moving_gc_thread(void *arg)
* don't start copygc until less than half the gc reserve is
* available:
*/
- next = (buckets_available_cache(ca) -
- div64_u64((ca->mi.nbuckets - ca->mi.first_bucket) *
- c->opts.gc_reserve_percent, 200)) *
- ca->mi.bucket_size;
-
- if (next <= 0)
- moved = bch_moving_gc(ca);
- else
- bch_kthread_io_clock_wait(clock, last + next);
+ available = buckets_available_cache(ca);
+ want = div64_u64((ca->mi.nbuckets - ca->mi.first_bucket) *
+ c->opts.gc_reserve_percent, 200);
+ if (available > want) {
+ next = last + (available - want) *
+ ca->mi.bucket_size;
+ bch_kthread_io_clock_wait(clock, next);
+ continue;
+ }
+
+ bch_moving_gc(ca);
}
return 0;
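
Note: the rewritten copygc thread above replaces the signed "next" calculation with explicit available/want values: copygc only runs once fewer than half the gc reserve's worth of buckets are still available, otherwise it sleeps on the write I/O clock until enough sectors have been written. A standalone sketch of that trigger arithmetic follows, with made-up device numbers and plain C division in place of the kernel's div64_u64() and io-clock wait:

#include <inttypes.h>
#include <stdio.h>

static uint64_t copygc_wait_amount(uint64_t nbuckets, uint64_t first_bucket,
				   uint64_t available, uint64_t bucket_size,
				   unsigned gc_reserve_percent)
{
	/* "want": half the gc reserve, in buckets (percent / 100, halved) */
	uint64_t want = (nbuckets - first_bucket) * gc_reserve_percent / 200;

	if (available <= want)
		return 0;	/* less than half the reserve left: run copygc now */

	/* otherwise, sleep until this many more sectors have been written */
	return (available - want) * bucket_size;
}

int main(void)
{
	/* example values, not taken from any real device */
	uint64_t wait = copygc_wait_amount(1ULL << 20, 64, 9000, 1024, 1);

	if (wait)
		printf("wait for %" PRIu64 " sectors of writes\n", wait);
	else
		printf("run copygc now\n");
	return 0;
}
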
diff --git a/drivers/md/bcache/movinggc.h b/drivers/md/bcache/movinggc.h
index 5d09e0fa3ae1..45496a31a95f 100644
--- a/drivers/md/bcache/movinggc.h
+++ b/drivers/md/bcache/movinggc.h
@@ -1,6 +1,29 @@
#ifndef _BCACHE_MOVINGGC_H
#define _BCACHE_MOVINGGC_H
+/*
+ * We can't use the entire copygc reserve in one iteration of copygc: we may
+ * need the buckets we're freeing up to go back into the copygc reserve to make
+ * forward progress, but if the copygc reserve is full they'll be available for
+ * any allocation - and it's possible that in a given iteration, we free up most
+ * of the buckets we're going to free before we allocate most of the buckets
+ * we're going to allocate.
+ *
+ * If we only use half of the reserve per iteration, then in steady state we'll
+ * always have room in the reserve for the buckets we're going to need in the
+ * next iteration:
+ */
+#define COPYGC_BUCKETS_PER_ITER(ca) \
+ ((ca)->free[RESERVE_MOVINGGC].size / 2)
+
+/*
+ * Max sectors to move per iteration: Have to take into account internal
+ * fragmentation from the multiple write points for each generation:
+ */
+#define COPYGC_SECTORS_PER_ITER(ca) \
+ ((ca)->mi.bucket_size * \
+ (COPYGC_BUCKETS_PER_ITER(ca) - (NUM_GC_GENS - 1)))
+
int bch_moving_init_cache(struct cache *);
void bch_moving_gc_stop(struct cache *);
int bch_moving_gc_thread_start(struct cache *);
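
Note: the two macros added above budget each copygc iteration: half of the RESERVE_MOVINGGC fifo, minus one bucket of slack per extra generation write point, converted to sectors. A quick standalone sketch of that arithmetic, with assumed values for NUM_GC_GENS, the reserve size and the bucket size (the real values live in the bcache headers and member info):

#include <stdio.h>

#define NUM_GC_GENS	8		/* assumed for illustration */

#define COPYGC_BUCKETS_PER_ITER(reserve_size)	((reserve_size) / 2)

#define COPYGC_SECTORS_PER_ITER(reserve_size, bucket_size)		\
	((bucket_size) *						\
	 (COPYGC_BUCKETS_PER_ITER(reserve_size) - (NUM_GC_GENS - 1)))

int main(void)
{
	unsigned reserve_size = 128;	/* buckets in RESERVE_MOVINGGC (example) */
	unsigned bucket_size  = 1024;	/* sectors per bucket (example) */

	printf("buckets per iteration: %u\n",
	       COPYGC_BUCKETS_PER_ITER(reserve_size));
	printf("sectors per iteration: %u\n",
	       COPYGC_SECTORS_PER_ITER(reserve_size, bucket_size));
	return 0;
}
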
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index 309a720c5a80..fd302e7d6daa 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -1933,11 +1933,16 @@ static const char *cache_alloc(struct bcache_superblock *sb,
ca->bucket_bits = ilog2(ca->mi.bucket_size);
/* XXX: tune these */
- movinggc_reserve = max_t(size_t, NUM_GC_GENS * 2,
+ movinggc_reserve = max_t(size_t, NUM_GC_GENS * 4,
ca->mi.nbuckets >> 7);
reserve_none = max_t(size_t, 4, ca->mi.nbuckets >> 9);
- free_inc_reserve = reserve_none << 1;
- heap_size = max_t(size_t, free_inc_reserve, movinggc_reserve);
+ /*
+ * free_inc must be smaller than the copygc reserve: if it was bigger,
+ * one copygc iteration might not make enough buckets available to fill
+ * up free_inc and allow the allocator to make forward progress
+ */
+ free_inc_reserve = movinggc_reserve / 2;
+ heap_size = movinggc_reserve * 8;
if (!init_fifo(&ca->free[RESERVE_PRIO], prio_buckets(ca), GFP_KERNEL) ||
!init_fifo(&ca->free[RESERVE_BTREE], BTREE_NODE_RESERVE, GFP_KERNEL) ||
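
Note: the super.c hunk (shown truncated above) resizes the reserves so that free_inc is half the copygc reserve, which is what guarantees a single copygc pass can refill it. A standalone sketch of the sizing arithmetic for an example device, using a plain MAX() in place of the kernel's max_t() and an assumed NUM_GC_GENS:

#include <stdio.h>

#define NUM_GC_GENS	8		/* assumed for illustration */
#define MAX(a, b)	((a) > (b) ? (a) : (b))

int main(void)
{
	unsigned long nbuckets		= 1UL << 20;	/* example device size */

	unsigned long movinggc_reserve	= MAX((unsigned long)NUM_GC_GENS * 4,
					      nbuckets >> 7);
	unsigned long reserve_none	= MAX(4UL, nbuckets >> 9);
	unsigned long free_inc_reserve	= movinggc_reserve / 2;
	unsigned long heap_size		= movinggc_reserve * 8;

	printf("movinggc_reserve=%lu reserve_none=%lu free_inc=%lu heap=%lu\n",
	       movinggc_reserve, reserve_none, free_inc_reserve, heap_size);
	return 0;
}
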