author    | Kent Overstreet <kent.overstreet@gmail.com> | 2016-10-06 04:56:55 -0800
committer | Kent Overstreet <kent.overstreet@gmail.com> | 2016-10-07 12:37:21 -0800
commit    | e77dbfe9b212cb4301ff37d9edafc2791b6f4970
tree      | 92522842ba127aeddf37f05544cac49134e27bca
parent    | d00cbd9b839349ecfae58fd0d4845999467ca08a
bcache: copygc fixes
Copygc now passes really nasty torture tests without deadlocking
-rw-r--r-- | drivers/md/bcache/alloc.c    | 23
-rw-r--r-- | drivers/md/bcache/move.c     |  2
-rw-r--r-- | drivers/md/bcache/movinggc.c | 97
-rw-r--r-- | drivers/md/bcache/movinggc.h | 23
-rw-r--r-- | drivers/md/bcache/super.c    | 11
5 files changed, 103 insertions, 53 deletions
diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c
index 4e1b3a925c34..3b6ad78ecb9e 100644
--- a/drivers/md/bcache/alloc.c
+++ b/drivers/md/bcache/alloc.c
@@ -733,11 +733,17 @@ static void invalidate_buckets(struct cache *ca)
 
 static bool __bch_allocator_push(struct cache *ca, long bucket)
 {
-	unsigned i;
+	if (fifo_push(&ca->free[RESERVE_PRIO], bucket))
+		goto success;
+
+	if (fifo_push(&ca->free[RESERVE_MOVINGGC], bucket))
+		goto success;
+
+	if (fifo_push(&ca->free[RESERVE_BTREE], bucket))
+		goto success;
 
-	for (i = 0; i < RESERVE_NR; i++)
-		if (fifo_push(&ca->free[i], bucket))
-			goto success;
+	if (fifo_push(&ca->free[RESERVE_NONE], bucket))
+		goto success;
 
 	return false;
 success:
@@ -1030,6 +1036,7 @@ static enum bucket_alloc_ret bch_bucket_alloc_group(struct cache_set *c,
 
 	ret = ALLOC_SUCCESS;
 err:
+	EBUG_ON(ret != ALLOC_SUCCESS && reserve == RESERVE_MOVINGGC);
 	spin_unlock(&devs->lock);
 	rcu_read_unlock();
 	return ret;
@@ -1528,11 +1535,17 @@ static void bch_recalc_capacity(struct cache_set *c)
 		 * allocations for foreground writes must wait -
 		 * not -ENOSPC calculations.
 		 */
-		for (j = 0; j < RESERVE_NR; j++)
+		for (j = 0; j < RESERVE_NONE; j++)
 			reserve += ca->free[j].size;
 
 		reserve += ca->free_inc.size;
 
+		reserve += ARRAY_SIZE(c->write_points);
+
+		if (ca->mi.tier)
+			reserve += 1;	/* tiering write point */
+		reserve += 1;		/* btree write point */
+
 		reserved_sectors += reserve << ca->bucket_bits;
 
 		capacity += (ca->mi.nbuckets -
diff --git a/drivers/md/bcache/move.c b/drivers/md/bcache/move.c
index 9c49c6e64f1a..37091d458cb4 100644
--- a/drivers/md/bcache/move.c
+++ b/drivers/md/bcache/move.c
@@ -108,7 +108,7 @@ static int bch_migrate_index_update(struct bch_write_op *op)
 			break;
 		} else {
 nomatch:
-			bch_btree_iter_set_pos(&iter, k.k->p);
+			bch_btree_iter_advance_pos(&iter);
 		}
 
 		while (bkey_cmp(iter.pos, bch_keylist_front(keys)->k.p) >= 0) {
diff --git a/drivers/md/bcache/movinggc.c b/drivers/md/bcache/movinggc.c
index 76b92c4f1713..1e6661b00ab9 100644
--- a/drivers/md/bcache/movinggc.c
+++ b/drivers/md/bcache/movinggc.c
@@ -119,12 +119,22 @@ next:
 	bch_queue_run(&ca->moving_gc_queue, ctxt);
 }
 
-static bool bch_moving_gc(struct cache *ca)
+static bool have_copygc_reserve(struct cache *ca)
+{
+	bool ret;
+
+	spin_lock(&ca->freelist_lock);
+	ret = fifo_used(&ca->free[RESERVE_MOVINGGC]) >=
+		COPYGC_BUCKETS_PER_ITER(ca);
+	spin_unlock(&ca->freelist_lock);
+
+	return ret;
+}
+
+static void bch_moving_gc(struct cache *ca)
 {
 	struct cache_set *c = ca->set;
 	struct bucket *g;
-	bool moved = false;
-
 	u64 sectors_to_move, sectors_gen, gen_current, sectors_total;
 	size_t buckets_to_move, buckets_unused = 0;
 	struct bucket_heap_entry e;
@@ -136,22 +146,21 @@ static bool bch_moving_gc(struct cache *ca)
 	bch_moving_context_init(&ctxt, &ca->moving_gc_pd.rate,
 				MOVING_PURPOSE_COPY_GC);
 
-	/*
-	 * We won't fill up the moving GC reserve completely if the data
-	 * being copied is from different generations. In the worst case,
-	 * there will be NUM_GC_GENS buckets of internal fragmentation
-	 */
-
-	spin_lock(&ca->freelist_lock);
-	reserve_sectors = ca->mi.bucket_size *
-		(fifo_used(&ca->free[RESERVE_MOVINGGC]) - NUM_GC_GENS);
-	spin_unlock(&ca->freelist_lock);
+	if (!have_copygc_reserve(ca)) {
+		struct closure cl;
 
-	if (reserve_sectors < (int) c->sb.block_size) {
-		trace_bcache_moving_gc_reserve_empty(ca);
-		return false;
+		closure_init_stack(&cl);
+		while (1) {
+			closure_wait(&c->freelist_wait, &cl);
+			if (have_copygc_reserve(ca))
+				break;
+			closure_sync(&cl);
+		}
+		closure_wake_up(&c->freelist_wait);
 	}
 
+	reserve_sectors = COPYGC_SECTORS_PER_ITER(ca);
+
 	trace_bcache_moving_gc_start(ca);
 
 	/*
@@ -161,7 +170,15 @@ static bool bch_moving_gc(struct cache *ca)
 	 * buckets have been visited.
 	 */
 
+	/*
+	 * We need bucket marks to be up to date, so gc can't be recalculating
+	 * them, and we don't want the allocator invalidating a bucket after
+	 * we've decided to evacuate it but before we set copygc_gen:
+	 */
+	down_read(&c->gc_lock);
 	mutex_lock(&ca->heap_lock);
+	mutex_lock(&ca->set->bucket_lock);
+
 	ca->heap.used = 0;
 	for_each_bucket(g, ca) {
 		g->copygc_gen = 0;
@@ -187,31 +204,18 @@ static bool bch_moving_gc(struct cache *ca)
 	for (i = 0; i < ca->heap.used; i++)
 		sectors_to_move += ca->heap.data[i].val;
 
-	/* XXX: calculate this threshold rigorously */
-
-	if (ca->heap.used < ca->free_inc.size / 2 &&
-	    sectors_to_move < reserve_sectors) {
-		mutex_unlock(&ca->heap_lock);
-		trace_bcache_moving_gc_no_work(ca);
-		return false;
-	}
-
-	while (sectors_to_move > reserve_sectors) {
+	while (sectors_to_move > COPYGC_SECTORS_PER_ITER(ca)) {
 		BUG_ON(!heap_pop(&ca->heap, e, bucket_min_cmp));
 		sectors_to_move -= e.val;
 	}
 
 	buckets_to_move = ca->heap.used;
 
-	if (sectors_to_move)
-		moved = true;
-
 	/*
 	 * resort by write_prio to group into generations, attempts to
 	 * keep hot and cold data in the same locality.
 	 */
-	mutex_lock(&ca->set->bucket_lock);
 	for (i = 0; i < ca->heap.used; i++) {
 		struct bucket_heap_entry *e = &ca->heap.data[i];
@@ -231,16 +235,20 @@ static bool bch_moving_gc(struct cache *ca)
 		    sectors_total >= sectors_gen * gen_current)
 			gen_current++;
 	}
-	mutex_unlock(&ca->set->bucket_lock);
 
+	mutex_unlock(&ca->set->bucket_lock);
 	mutex_unlock(&ca->heap_lock);
+	up_read(&c->gc_lock);
 
 	read_moving(ca, &ctxt);
 
+	if (IS_ENABLED(CONFIG_BCACHE_DEBUG)) {
+		for_each_bucket(g, ca)
+			BUG_ON(g->copygc_gen && bucket_sectors_used(g));
+	}
+
 	trace_bcache_moving_gc_end(ca, ctxt.sectors_moved, ctxt.keys_moved,
 				   buckets_to_move);
-
-	return moved;
 }
 
 static int bch_moving_gc_thread(void *arg)
@@ -249,8 +257,7 @@ static int bch_moving_gc_thread(void *arg)
 	struct cache_set *c = ca->set;
 	struct io_clock *clock = &c->io_clock[WRITE];
 	unsigned long last;
-	s64 next;
-	bool moved;
+	u64 available, want, next;
 
 	set_freezable();
@@ -263,15 +270,17 @@ static int bch_moving_gc_thread(void *arg)
 		 * don't start copygc until less than half the gc reserve is
 		 * available:
 		 */
-		next = (buckets_available_cache(ca) -
-			div64_u64((ca->mi.nbuckets - ca->mi.first_bucket) *
-				  c->opts.gc_reserve_percent, 200)) *
-			ca->mi.bucket_size;
-
-		if (next <= 0)
-			moved = bch_moving_gc(ca);
-		else
-			bch_kthread_io_clock_wait(clock, last + next);
+		available = buckets_available_cache(ca);
+		want = div64_u64((ca->mi.nbuckets - ca->mi.first_bucket) *
+				 c->opts.gc_reserve_percent, 200);
+		if (available > want) {
+			next = last + (available - want) *
+				ca->mi.bucket_size;
+			bch_kthread_io_clock_wait(clock, next);
+			continue;
+		}
+
+		bch_moving_gc(ca);
 	}
 
 	return 0;
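A note on the new thread loop above: copygc is now throttled purely by the IO clock. The thread computes how many buckets it wants kept free (half of the GC reserve, hence the division by 200, matching the "less than half the gc reserve" comment), and if more than that is currently available it sleeps until writes have consumed the surplus. Below is a minimal standalone sketch of that arithmetic; the struct, helper name, and example numbers are stand-ins for illustration, and only nbuckets, first_bucket, bucket_size, and gc_reserve_percent correspond to fields used in the patch.

```c
#include <stdint.h>
#include <stdio.h>

/* Hypothetical stand-ins for the cache fields used in the patch. */
struct dev {
	uint64_t nbuckets;		/* total buckets on the device */
	uint64_t first_bucket;		/* buckets below this hold metadata */
	uint64_t bucket_size;		/* bucket size in sectors */
	uint64_t gc_reserve_percent;	/* e.g. 10 => reserve 10% of buckets */
};

/*
 * Mirrors the decision in bch_moving_gc_thread(): returns 0 if copygc should
 * run now, otherwise the number of sectors of writes to wait for before
 * checking again.
 */
static uint64_t copygc_wait_sectors(const struct dev *ca, uint64_t available)
{
	/* "half the gc reserve": percent / 100 / 2 == percent / 200 */
	uint64_t want = (ca->nbuckets - ca->first_bucket) *
			ca->gc_reserve_percent / 200;

	return available > want ? (available - want) * ca->bucket_size : 0;
}

int main(void)
{
	/* Example numbers only. */
	struct dev ca = {
		.nbuckets		= 1 << 20,	/* ~1M buckets */
		.first_bucket		= 1 << 10,
		.bucket_size		= 1024,		/* 512 KiB buckets */
		.gc_reserve_percent	= 10,
	};

	/* want is ~52,377 buckets; with 100,000 available, copygc waits. */
	printf("wait %llu sectors\n",
	       (unsigned long long)copygc_wait_sectors(&ca, 100000));

	/* With only 40,000 buckets available, copygc runs immediately. */
	printf("wait %llu sectors\n",
	       (unsigned long long)copygc_wait_sectors(&ca, 40000));
	return 0;
}
```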
diff --git a/drivers/md/bcache/movinggc.h b/drivers/md/bcache/movinggc.h
index 5d09e0fa3ae1..45496a31a95f 100644
--- a/drivers/md/bcache/movinggc.h
+++ b/drivers/md/bcache/movinggc.h
@@ -1,6 +1,29 @@
 #ifndef _BCACHE_MOVINGGC_H
 #define _BCACHE_MOVINGGC_H
 
+/*
+ * We can't use the entire copygc reserve in one iteration of copygc: we may
+ * need the buckets we're freeing up to go back into the copygc reserve to make
+ * forward progress, but if the copygc reserve is full they'll be available for
+ * any allocation - and it's possible that in a given iteration, we free up most
+ * of the buckets we're going to free before we allocate most of the buckets
+ * we're going to allocate.
+ *
+ * If we only use half of the reserve per iteration, then in steady state we'll
+ * always have room in the reserve for the buckets we're going to need in the
+ * next iteration:
+ */
+#define COPYGC_BUCKETS_PER_ITER(ca)					\
+	((ca)->free[RESERVE_MOVINGGC].size / 2)
+
+/*
+ * Max sectors to move per iteration: Have to take into account internal
+ * fragmentation from the multiple write points for each generation:
+ */
+#define COPYGC_SECTORS_PER_ITER(ca)					\
+	((ca)->mi.bucket_size *						\
+	 (COPYGC_BUCKETS_PER_ITER(ca) - (NUM_GC_GENS - 1)))
+
 int bch_moving_init_cache(struct cache *);
 void bch_moving_gc_stop(struct cache *);
 int bch_moving_gc_thread_start(struct cache *);
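To make the reserve arithmetic in these macros concrete, here is a small worked example. The reserve size, bucket size, and NUM_GC_GENS value are made up for illustration; only the shape of the formulas comes from the header above.

```c
#include <stdio.h>

/* Example values only: a 16-bucket copygc reserve, 1024-sector (512 KiB)
 * buckets, and NUM_GC_GENS assumed to be 4 purely for illustration. */
#define RESERVE_MOVINGGC_SIZE	16
#define BUCKET_SIZE_SECTORS	1024
#define NUM_GC_GENS		4

/* Same shape as the macros in the patch, with the struct accesses inlined. */
#define COPYGC_BUCKETS_PER_ITER	(RESERVE_MOVINGGC_SIZE / 2)
#define COPYGC_SECTORS_PER_ITER						\
	(BUCKET_SIZE_SECTORS * (COPYGC_BUCKETS_PER_ITER - (NUM_GC_GENS - 1)))

int main(void)
{
	/* 16 / 2 = 8: at most half the reserve is consumed per iteration. */
	printf("buckets per iter: %d\n", COPYGC_BUCKETS_PER_ITER);

	/* Up to NUM_GC_GENS - 1 = 3 of those buckets can be lost to partially
	 * filled per-generation write points, leaving 5 buckets (5120 sectors)
	 * of guaranteed movable data - which is why the reserve has to be
	 * sized well above NUM_GC_GENS to get useful work per iteration. */
	printf("sectors per iter: %d\n", COPYGC_SECTORS_PER_ITER);
	return 0;
}
```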
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index 309a720c5a80..fd302e7d6daa 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -1933,11 +1933,16 @@ static const char *cache_alloc(struct bcache_superblock *sb,
 	ca->bucket_bits = ilog2(ca->mi.bucket_size);
 
 	/* XXX: tune these */
-	movinggc_reserve = max_t(size_t, NUM_GC_GENS * 2,
+	movinggc_reserve = max_t(size_t, NUM_GC_GENS * 4,
				 ca->mi.nbuckets >> 7);
 	reserve_none = max_t(size_t, 4, ca->mi.nbuckets >> 9);
-	free_inc_reserve = reserve_none << 1;
-	heap_size = max_t(size_t, free_inc_reserve, movinggc_reserve);
+	/*
+	 * free_inc must be smaller than the copygc reserve: if it was bigger,
+	 * one copygc iteration might not make enough buckets available to fill
+	 * up free_inc and allow the allocator to make forward progress
+	 */
+	free_inc_reserve = movinggc_reserve / 2;
+	heap_size = movinggc_reserve * 8;
 
 	if (!init_fifo(&ca->free[RESERVE_PRIO], prio_buckets(ca), GFP_KERNEL) ||
 	    !init_fifo(&ca->free[RESERVE_BTREE], BTREE_NODE_RESERVE, GFP_KERNEL) ||
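Finally, a worked example of the new reserve sizing in cache_alloc(), using a hypothetical 1M-bucket device and an illustrative NUM_GC_GENS value; the formulas mirror the super.c hunk above. The relationship the comment describes is visible in the numbers: free_inc holds half as many buckets as the copygc reserve, so one copygc iteration, which may consume up to half the reserve, can make enough buckets available to refill free_inc and keep the allocator moving.

```c
#include <stdint.h>
#include <stdio.h>

#define NUM_GC_GENS	4	/* illustrative value, not from the patch */

static uint64_t max_u64(uint64_t a, uint64_t b) { return a > b ? a : b; }

int main(void)
{
	/* Example device: 1M buckets. */
	uint64_t nbuckets	  = 1 << 20;

	/* Same formulas as the new cache_alloc() code above. */
	uint64_t movinggc_reserve = max_u64(NUM_GC_GENS * 4, nbuckets >> 7);
	uint64_t reserve_none	  = max_u64(4, nbuckets >> 9);
	uint64_t free_inc_reserve = movinggc_reserve / 2;
	uint64_t heap_size	  = movinggc_reserve * 8;

	/* Prints: movinggc_reserve = 8192, reserve_none = 2048,
	 * free_inc_reserve = 4096, heap_size = 65536 */
	printf("movinggc_reserve %llu, reserve_none %llu, "
	       "free_inc_reserve %llu, heap_size %llu\n",
	       (unsigned long long)movinggc_reserve,
	       (unsigned long long)reserve_none,
	       (unsigned long long)free_inc_reserve,
	       (unsigned long long)heap_size);
	return 0;
}
```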