-rw-r--r--  drivers/md/bcache/alloc.c    | 298
-rw-r--r--  drivers/md/bcache/alloc.h    |   7
-rw-r--r--  drivers/md/bcache/bcache.h   |  55
-rw-r--r--  drivers/md/bcache/btree.c    |   9
-rw-r--r--  drivers/md/bcache/io.c       |  15
-rw-r--r--  drivers/md/bcache/movinggc.c |  21
-rw-r--r--  drivers/md/bcache/request.c  | 102
-rw-r--r--  drivers/md/bcache/request.h  |  46
-rw-r--r--  drivers/md/bcache/super.c    |   3
-rw-r--r--  drivers/md/bcache/tier.c     |  15
10 files changed, 222 insertions, 349 deletions
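
The heart of the patch is in the alloc.c diff below: the per-request write_point index and the per-tier data_buckets[] LRU are replaced by explicit struct write_point objects, and an empty write point is refilled via a lockless cmpxchg() plus a per-open-bucket spinlock (lock_and_refill_writepoint()). The following is a minimal userspace sketch of that refill-and-carve pattern, not the kernel code itself: C11 atomics and a pthread mutex stand in for cmpxchg() and the open bucket spinlock, and alloc_bucket(), the 1024-sector bucket size, and all other names are invented for illustration.

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct open_bucket {
	pthread_mutex_t lock;
	unsigned sectors_free;
};

struct write_point {
	_Atomic(struct open_bucket *) b;	/* current bucket, NULL when empty */
};

/* Invented stand-in for bch_open_bucket_alloc(): hand out a fresh bucket. */
static struct open_bucket *alloc_bucket(void)
{
	struct open_bucket *b = malloc(sizeof(*b));

	pthread_mutex_init(&b->lock, NULL);
	b->sectors_free = 1024;			/* pretend bucket size, in sectors */
	return b;
}

/*
 * Model of lock_and_refill_writepoint(): return the write point's bucket with
 * its lock held, allocating one if the write point is empty.  If we lose the
 * race to install a fresh bucket, discard ours and retry with the winner's.
 */
static struct open_bucket *lock_and_refill(struct write_point *wp)
{
	struct open_bucket *b;

	while (1) {
		b = atomic_load(&wp->b);
		if (b) {
			pthread_mutex_lock(&b->lock);
			if (atomic_load(&wp->b) == b)
				return b;		/* still current */
			pthread_mutex_unlock(&b->lock);	/* raced, retry */
		} else {
			struct open_bucket *expected = NULL;

			b = alloc_bucket();
			pthread_mutex_lock(&b->lock);
			if (atomic_compare_exchange_strong(&wp->b, &expected, b))
				return b;
			pthread_mutex_unlock(&b->lock);	/* lost the install race */
			pthread_mutex_destroy(&b->lock);
			free(b);
		}
	}
}

/*
 * Model of the bch_alloc_sectors() accounting: carve up to @want sectors out
 * of the current bucket, clearing the write point when the bucket runs dry so
 * the next writer refills it.  (The real code also pins the bucket, copies its
 * pointers into the key, and updates per-device sector counts; the model just
 * forgets the bucket.)
 */
static unsigned alloc_sectors(struct write_point *wp, unsigned want)
{
	struct open_bucket *b = lock_and_refill(wp);
	unsigned got = want < b->sectors_free ? want : b->sectors_free;

	b->sectors_free -= got;
	if (!b->sectors_free)
		atomic_store(&wp->b, NULL);
	pthread_mutex_unlock(&b->lock);
	return got;
}

int main(void)
{
	struct write_point wp = { .b = NULL };

	printf("got %u sectors\n", alloc_sectors(&wp, 200));
	printf("got %u sectors\n", alloc_sectors(&wp, 2000));
	return 0;
}

The second call gets only the 824 sectors left in the bucket and clears the write point, mirroring how the patch hands partially filled open buckets from one writer to the next instead of keeping an LRU of them per tier.
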
diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c index 6209e42c5633..cc6a09c1205c 100644 --- a/drivers/md/bcache/alloc.c +++ b/drivers/md/bcache/alloc.c @@ -746,12 +746,14 @@ int bch_bucket_alloc_set(struct cache_set *c, enum alloc_reserve reserve, long caches_used[BITS_TO_LONGS(MAX_CACHES_PER_SET)]; int i, ret; - mutex_lock(&c->bucket_lock); + BUG_ON(tier_idx > ARRAY_SIZE(c->cache_by_alloc)); BUG_ON(!n || n > BKEY_EXTENT_PTRS_MAX); bkey_init(k); memset(caches_used, 0, sizeof(caches_used)); + mutex_lock(&c->bucket_lock); + /* sort by free space/prio of oldest data in caches */ for (i = 0; i < n; i++) { @@ -850,8 +852,7 @@ static struct open_bucket *bch_open_bucket_get(struct cache_set *c, } static struct open_bucket *bch_open_bucket_alloc(struct cache_set *c, - enum alloc_reserve reserve, - int n, unsigned tier, + struct write_point *wp, struct closure *cl) { int ret; @@ -861,105 +862,74 @@ static struct open_bucket *bch_open_bucket_alloc(struct cache_set *c, if (IS_ERR_OR_NULL(b)) return b; - ret = bch_bucket_alloc_set(c, reserve, &b->key, n, tier, cl); - if (ret) { - BUG_ON(ret > 0); - bch_open_bucket_put(c, b); - b = ERR_PTR(ret); + if (wp->ca) { + long bucket; + + mutex_lock(&c->bucket_lock); + + bucket = bch_bucket_alloc(wp->ca, RESERVE_MOVINGGC, cl); + if (bucket < 0) { + ret = bucket; + mutex_unlock(&c->bucket_lock); + goto err; + } + + b->key.val[0] = PTR(wp->ca->bucket_gens[bucket], + bucket_to_sector(wp->ca->set, bucket), + wp->ca->sb.nr_this_dev); + bch_set_extent_ptrs(&b->key, 1); + + mutex_unlock(&c->bucket_lock); + } else if (wp->tier) { + ret = bch_bucket_alloc_set(c, RESERVE_NONE, &b->key, 1, + wp->tier - c->cache_by_alloc, cl); + if (ret) + goto err; + } else { + ret = bch_bucket_alloc_set(c, RESERVE_NONE, &b->key, + CACHE_SET_DATA_REPLICAS_WANT(&c->sb), + 0, cl); + if (ret) + goto err; } return b; +err: + bch_open_bucket_put(c, b); + return ERR_PTR(ret); } /* Sector allocator */ -/* - * We keep multiple buckets open for writes, and try to segregate different - * write streams for better cache utilization: first we look for a bucket where - * the last write to it was sequential with the current write, and failing that - * we look for a bucket that was last used by the same task. - * - * The ideas is if you've got multiple tasks pulling data into the cache at the - * same time, you'll get better cache utilization if you try to segregate their - * data and preserve locality. - * - * For example, say you've starting Firefox at the same time you're copying a - * bunch of files. Firefox will likely end up being fairly hot and stay in the - * cache awhile, but the data you copied might not be; if you wrote all that - * data to the same buckets it'd get invalidated at the same time. - * - * Both of those tasks will be doing fairly random IO so we can't rely on - * detecting sequential IO to segregate their data, but going off of the task - * should be a sane heuristic. 
- */ -static struct open_bucket *pick_data_bucket(struct cache_set *c, - const struct bkey *search, - unsigned write_point, - unsigned tier_idx, - struct closure *cl) - __releases(c->open_buckets_lock) - __acquires(c->open_buckets_lock) +static struct open_bucket *lock_and_refill_writepoint(struct cache_set *c, + struct write_point *wp, + struct closure *cl) { - struct cache_tier *tier = &c->cache_by_alloc[tier_idx]; struct open_bucket *b; - int i, wp = -1; -retry: - for (i = 0; - i < ARRAY_SIZE(tier->data_buckets) && - (b = tier->data_buckets[i]); i++) { - /* Tiering thread already writes keys in order, maximize - * write bandwidth instead */ - if (tier_idx == 0 && !bkey_cmp(&b->key, &START_KEY(search))) - goto found; - else if (b->last_write_point == write_point) - wp = i; - } - - i = wp; - if (i >= 0) - goto found; - - i = ARRAY_SIZE(tier->data_buckets) - 1; - if (tier->data_buckets[i]) - goto found; - spin_unlock(&c->open_buckets_lock); - b = bch_open_bucket_alloc(c, RESERVE_NONE, - CACHE_SET_DATA_REPLICAS_WANT(&c->sb), - tier_idx, cl); - spin_lock(&c->open_buckets_lock); + while (1) { + b = ACCESS_ONCE(wp->b); + if (b) { + spin_lock(&b->lock); + if (wp->b == b) + return b; - if (IS_ERR_OR_NULL(b)) - return b; + spin_unlock(&b->lock); + } else { + b = bch_open_bucket_alloc(c, wp, cl); + if (IS_ERR_OR_NULL(b)) + return b; - if (tier->data_buckets[i]) { - /* we raced - and we must unlock to call bch_bucket_free()... */ - spin_unlock(&c->open_buckets_lock); - bch_bucket_free_never_used(c, &b->key); - spin_lock(&c->open_buckets_lock); + spin_lock(&b->lock); + if (!race_fault() && + cmpxchg(&wp->b, NULL, b) == NULL) + return b; + spin_unlock(&b->lock); - __bch_open_bucket_put(c, b); - goto retry; - } else { - tier->data_buckets[i] = b; + bch_bucket_free_never_used(c, &b->key); + bch_open_bucket_put(c, b); + } } -found: - b = tier->data_buckets[i]; - - /* - * Move b to the end of the lru, and keep track of what - * this bucket was last used for: - */ - memmove(&tier->data_buckets[1], - &tier->data_buckets[0], - sizeof(struct open_bucket *) * i); - - tier->data_buckets[0] = b; - - b->last_write_point = write_point; - bkey_copy_key(&b->key, search); - - return b; } static void verify_not_stale(struct cache_set *c, struct bkey *k) @@ -993,34 +963,28 @@ static void verify_not_stale(struct cache_set *c, struct bkey *k) * @tier_idx - which tier this write is destined towards * @cl - closure to wait for a bucket */ -struct open_bucket *bch_alloc_sectors(struct cache_set *c, struct bkey *k, - unsigned write_point, unsigned tier_idx, - unsigned long *ptrs_to_write, +struct open_bucket *bch_alloc_sectors(struct cache_set *c, + struct write_point *wp, + struct bkey *k, struct closure *cl) { - struct cache_tier *tier = &c->cache_by_alloc[tier_idx]; struct open_bucket *b; unsigned i, sectors; - spin_lock(&c->open_buckets_lock); - - b = pick_data_bucket(c, k, write_point, tier_idx, cl); + b = lock_and_refill_writepoint(c, wp, cl); if (IS_ERR_OR_NULL(b)) - goto out; + return b; - BUG_ON(b != tier->data_buckets[0]); + BUG_ON(!b->sectors_free); verify_not_stale(c, &b->key); /* Set up the pointer to the space we're allocating: */ + memcpy(&k->val[bch_extent_ptrs(k)], + &b->key.val[0], + bch_extent_ptrs(&b->key) * sizeof(u64)); - for (i = 0; i < bch_extent_ptrs(&b->key); i++) { - unsigned ptrs = bch_extent_ptrs(k); - - k->val[ptrs] = b->key.val[i]; - __set_bit(ptrs, ptrs_to_write); - bch_set_extent_ptrs(k, ptrs + 1); - } + bch_set_extent_ptrs(k, bch_extent_ptrs(k) + bch_extent_ptrs(&b->key)); sectors = 
min_t(unsigned, KEY_SIZE(k), b->sectors_free); @@ -1030,6 +994,10 @@ struct open_bucket *bch_alloc_sectors(struct cache_set *c, struct bkey *k, /* update open bucket for next time: */ b->sectors_free -= sectors; + if (b->sectors_free) + atomic_inc(&b->pin); + else + BUG_ON(xchg(&wp->b, NULL) != b); rcu_read_lock(); for (i = 0; i < bch_extent_ptrs(&b->key); i++) { @@ -1044,127 +1012,11 @@ struct open_bucket *bch_alloc_sectors(struct cache_set *c, struct bkey *k, } rcu_read_unlock(); - /* - * k takes refcounts on the buckets it points to until it's inserted - * into the btree, but if we're done with this bucket we just transfer - * get_data_bucket()'s refcount. - */ - - if (b->sectors_free) { - atomic_inc(&b->pin); - } else { - memmove(&tier->data_buckets[0], - &tier->data_buckets[1], - sizeof(struct open_bucket *) * - (ARRAY_SIZE(tier->data_buckets) - 1)); - tier->data_buckets[ARRAY_SIZE(tier->data_buckets) - 1] = NULL; - } -out: - spin_unlock(&c->open_buckets_lock); + spin_unlock(&b->lock); return b; } -struct open_bucket *bch_gc_alloc_sectors(struct cache_set *c, struct bkey *k, - unsigned long *ptrs_to_write, - struct closure *cl) -{ - unsigned i, gen, sectors = KEY_SIZE(k); - struct cache *ca; - struct open_bucket *b; - long bucket; - - mutex_lock(&c->bucket_lock); -retry: - /* Check if we raced with a foreground write */ - - rcu_read_lock(); - for (i = 0; i < bch_extent_ptrs(k); i++) - if ((ca = PTR_CACHE(c, k, i)) && - (gen = PTR_BUCKET(c, ca, k, i)->copygc_gen)) { - gen--; - percpu_ref_get(&ca->ref); - rcu_read_unlock(); - goto found; - } - rcu_read_unlock(); - mutex_unlock(&c->bucket_lock); - return ERR_PTR(-ESRCH); -found: - b = ca->gc_buckets[gen]; - if (!b) { - mutex_unlock(&c->bucket_lock); - - b = bch_open_bucket_get(c, NULL); - if (WARN_ONCE(IS_ERR(b), - "bcache: movinggc bucket allocation failed with %ld", - PTR_ERR(b))) { - b = ERR_PTR(-ENOSPC); - goto out_put; - } - - mutex_lock(&c->bucket_lock); - - bucket = bch_bucket_alloc(ca, RESERVE_MOVINGGC, NULL); - if (WARN_ONCE(bucket < 0, - "bcache: movinggc bucket allocation failed with %ld", - bucket)) { - mutex_unlock(&c->bucket_lock); - bch_open_bucket_put(c, b); - b = ERR_PTR(-ENOSPC); - goto out_put; - } - - b->key.val[0] = PTR(ca->bucket_gens[bucket], - bucket_to_sector(ca->set, bucket), - ca->sb.nr_this_dev); - bch_set_extent_ptrs(&b->key, 1); - - /* we dropped bucket_lock, might've raced */ - if (ca->gc_buckets[gen] || race_fault()) { - /* we raced */ - mutex_unlock(&c->bucket_lock); - bch_bucket_free_never_used(c, &b->key); - bch_open_bucket_put(c, b); - mutex_lock(&c->bucket_lock); - } else { - ca->gc_buckets[gen] = b; - } - - /* - * GC_GEN() might also have been reset... 
don't strictly need to - * recheck though - */ - percpu_ref_put(&ca->ref); - goto retry; - } - - verify_not_stale(c, &b->key); - - k->val[i] = b->key.val[0]; - __set_bit(i, ptrs_to_write); - - sectors = min_t(unsigned, sectors, b->sectors_free); - - SET_KEY_OFFSET(k, KEY_START(k) + sectors); - SET_KEY_SIZE(k, sectors); - - /* update open bucket for next time: */ - - b->sectors_free -= sectors; - if (b->sectors_free) { - SET_PTR_OFFSET(&b->key, 0, PTR_OFFSET(&b->key, 0) + sectors); - atomic_inc(&b->pin); - } else - ca->gc_buckets[gen] = NULL; - - atomic_long_add(sectors, &ca->sectors_written); - mutex_unlock(&c->bucket_lock); -out_put: - percpu_ref_put(&ca->ref); - return b; -} - void bch_mark_open_buckets(struct cache_set *c) { struct cache *ca; @@ -1214,9 +1066,13 @@ void bch_open_buckets_init(struct cache_set *c) spin_lock_init(&c->open_buckets_lock); for (i = 0; i < ARRAY_SIZE(c->open_buckets); i++) { + spin_lock_init(&c->open_buckets[i].lock); c->open_buckets_nr_free++; list_add(&c->open_buckets[i].list, &c->open_buckets_free); } + + for (i = 0; i < ARRAY_SIZE(c->cache_by_alloc); i++) + c->cache_by_alloc[i].wp.tier = &c->cache_by_alloc[i]; } int bch_cache_allocator_start(struct cache *ca) diff --git a/drivers/md/bcache/alloc.h b/drivers/md/bcache/alloc.h index adbe0177937a..c4dff7d53e5c 100644 --- a/drivers/md/bcache/alloc.h +++ b/drivers/md/bcache/alloc.h @@ -48,11 +48,8 @@ int bch_bucket_alloc_set(struct cache_set *, enum alloc_reserve, struct bkey *, void bch_open_bucket_put(struct cache_set *, struct open_bucket *); -struct open_bucket *bch_alloc_sectors(struct cache_set *, struct bkey *, - unsigned, unsigned, - unsigned long *, struct closure *); -struct open_bucket *bch_gc_alloc_sectors(struct cache_set *, struct bkey *, - unsigned long *, struct closure *); +struct open_bucket *bch_alloc_sectors(struct cache_set *, struct write_point *, + struct bkey *, struct closure *); void bch_mark_open_buckets(struct cache_set *); diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index 03e316d51b76..4c741260dd6e 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -463,7 +463,20 @@ struct cached_dev { * and one for moving GC */ enum alloc_reserve { RESERVE_PRIO = BTREE_ID_NR, + /* + * free_inc.size buckets are set aside for moving GC btree node + * allocations. This means that if moving GC runs out of new buckets for + * btree nodes, it will have put back at least free_inc.size buckets + * back on free_inc, preventing a deadlock. + * + * XXX: figure out a less stupid way of achieving this + */ RESERVE_MOVINGGC_BTREE, + /* + * Tiering needs a btree node reserve because of how + * btree_check_reserve() works -- if the cache tier is full, we don't + * want tiering to block forever. 
+ */ RESERVE_TIERING_BTREE, RESERVE_METADATA_LAST = RESERVE_TIERING_BTREE, RESERVE_MOVINGGC, @@ -481,14 +494,39 @@ enum alloc_reserve { /* Enough for 16 cache devices, 2 tiers and some left over for pipelining */ #define OPEN_BUCKETS_COUNT 256 +#define WRITE_POINT_COUNT 16 + struct open_bucket { struct list_head list; + spinlock_t lock; atomic_t pin; - unsigned last_write_point; unsigned sectors_free; BKEY_PADDED(key); }; +struct write_point { + struct open_bucket *b; + + /* + * If not NULL, refill from that device (this write point is a member of + * that struct cache) + * + * If NULL, do a normal replicated bucket allocation + */ + struct cache *ca; + + /* + * If not NULL, tier specific writepoint used by tiering/promotion - + * always allocates a single replica + */ + struct cache_tier *tier; + + /* + * Otherwise do a normal replicated bucket allocation that could come + * from any tier (foreground write) + */ +}; + struct bucket_stats { u64 buckets_dirty; u64 buckets_cached; @@ -583,7 +621,7 @@ struct cache { * Protected by bucket_lock. */ #define NUM_GC_GENS 7 - struct open_bucket *gc_buckets[NUM_GC_GENS]; + struct write_point gc_buckets[NUM_GC_GENS]; struct journal_device journal; @@ -623,12 +661,15 @@ struct gc_stat { #define CACHE_SET_STOPPING 1 #define CACHE_SET_RUNNING 2 -#define TIER_OPEN_BUCKETS_COUNT 16 - struct cache_tier { unsigned nr_devices; struct cache *devices[MAX_CACHES_PER_SET]; - struct open_bucket *data_buckets[TIER_OPEN_BUCKETS_COUNT]; + + /* + * writepoint specific to this tier, for cache promote/background + * tiering + */ + struct write_point wp; }; struct prio_clock { @@ -735,6 +776,8 @@ struct cache_set { spinlock_t open_buckets_lock; struct open_bucket open_buckets[OPEN_BUCKETS_COUNT]; + struct write_point write_points[WRITE_POINT_COUNT]; + /* GARBAGE COLLECTION */ struct task_struct *gc_thread; @@ -1087,7 +1130,7 @@ void bch_bbio_prep(struct bbio *, struct cache *); void bch_submit_bbio(struct bbio *, struct cache *, struct bkey *, unsigned, bool); void bch_submit_bbio_replicas(struct bio *, struct cache_set *, - struct bkey *, unsigned long *, bool); + struct bkey *, unsigned, bool); void bch_bbio_reset(struct bbio *bio); __printf(2, 3) diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index e4a4a24f2a3b..99a61145cc67 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -456,7 +456,6 @@ static void do_btree_node_write(struct btree *b) { struct closure *cl = &b->io; struct bset *i = btree_bset_last(b); - unsigned long ptrs_to_write[BITS_TO_LONGS(MAX_CACHES_PER_SET)]; BKEY_PADDED(key) k; int n; @@ -478,8 +477,6 @@ static void do_btree_node_write(struct btree *b) bio_set_op_attrs(b->bio, REQ_OP_WRITE, REQ_META|WRITE_SYNC|REQ_FUA); bch_bio_map(b->bio, i); - memset(ptrs_to_write, 0xFF, sizeof(ptrs_to_write)); - /* * If we're appending to a leaf node, we don't technically need FUA - * this write just needs to be persisted before the next journal write, @@ -509,8 +506,7 @@ static void do_btree_node_write(struct btree *b) memcpy(page_address(bv->bv_page), base + j * PAGE_SIZE, PAGE_SIZE); - bch_submit_bbio_replicas(b->bio, b->c, &k.key, - ptrs_to_write, true); + bch_submit_bbio_replicas(b->bio, b->c, &k.key, 0, true); continue_at(cl, btree_node_write_done, NULL); } else { trace_bcache_btree_bounce_write_fail(b); @@ -518,8 +514,7 @@ static void do_btree_node_write(struct btree *b) b->bio->bi_vcnt = 0; bch_bio_map(b->bio, i); - bch_submit_bbio_replicas(b->bio, b->c, &k.key, - ptrs_to_write, true); + 
bch_submit_bbio_replicas(b->bio, b->c, &k.key, 0, true); closure_sync(cl); continue_at_nobarrier(cl, __btree_node_write_done, NULL); diff --git a/drivers/md/bcache/io.c b/drivers/md/bcache/io.c index 697cb295b8e8..d565ad1f496d 100644 --- a/drivers/md/bcache/io.c +++ b/drivers/md/bcache/io.c @@ -94,17 +94,14 @@ void bch_submit_bbio(struct bbio *b, struct cache *ca, } void bch_submit_bbio_replicas(struct bio *bio, struct cache_set *c, - struct bkey *k, unsigned long *ptrs_to_write, - bool punt) + struct bkey *k, unsigned ptrs_from, bool punt) { struct cache *ca; - unsigned ptr, next, nr_ptrs = bch_extent_ptrs(k); - - for (ptr = find_first_bit(ptrs_to_write, nr_ptrs); - ptr != nr_ptrs; - ptr = next) { - next = find_next_bit(ptrs_to_write, nr_ptrs, ptr + 1); + unsigned ptr; + for (ptr = ptrs_from; + ptr < bch_extent_ptrs(k); + ptr++) { rcu_read_lock(); ca = PTR_CACHE(c, k, ptr); if (ca) @@ -116,7 +113,7 @@ void bch_submit_bbio_replicas(struct bio *bio, struct cache_set *c, break; } - if (next != nr_ptrs) { + if (ptr + 1 < bch_extent_ptrs(k)) { struct bio *n = bio_clone_fast(bio, GFP_NOIO, ca->replica_set); n->bi_end_io = bio->bi_end_io; diff --git a/drivers/md/bcache/movinggc.c b/drivers/md/bcache/movinggc.c index 998646c57452..91956f0e6e6b 100644 --- a/drivers/md/bcache/movinggc.c +++ b/drivers/md/bcache/movinggc.c @@ -71,6 +71,8 @@ static void read_moving(struct cache *ca, struct moving_io_stats *stats) struct keybuf_key *w; struct moving_io *io; struct closure cl; + struct write_point *wp; + unsigned ptr, gen; closure_init_stack(&cl); bch_ratelimit_reset(&ca->moving_gc_pd.rate); @@ -85,6 +87,19 @@ static void read_moving(struct cache *ca, struct moving_io_stats *stats) if (!w) break; + for (ptr = 0; ptr < bch_extent_ptrs(&w->key); ptr++) + if ((ca->sb.nr_this_dev == PTR_DEV(&w->key, ptr)) && + (gen = PTR_BUCKET(c, ca, &w->key, + ptr)->copygc_gen)) { + gen--; + BUG_ON(gen > ARRAY_SIZE(ca->gc_buckets)); + wp = &ca->gc_buckets[gen]; + goto found; + } + + bch_keybuf_put(&ca->moving_gc_keys, w); + continue; +found: io = kzalloc(sizeof(struct moving_io) + sizeof(struct bio_vec) * DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS), GFP_KERNEL); @@ -98,11 +113,13 @@ static void read_moving(struct cache *ca, struct moving_io_stats *stats) io->keybuf = &ca->moving_gc_keys; io->stats = stats; - bch_data_insert_op_init(&io->op, c, &io->bio.bio, 0, + bch_data_insert_op_init(&io->op, c, &io->bio.bio, wp, false, false, false, &io->w->key, &io->w->key); io->op.io_wq = ca->moving_gc_write; - io->op.moving_gc = true; + io->op.btree_alloc_reserve = RESERVE_MOVINGGC_BTREE; + + bch_extent_drop_ptr(&io->op.insert_key, ptr); trace_bcache_gc_copy(&w->key); diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index 08f82223c0ef..4f62ff07afa1 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -103,31 +103,6 @@ static void bio_csum(struct bio *bio, struct bkey *k) /* Insert data into cache */ -static enum alloc_reserve bch_btree_reserve(struct data_insert_op *op) -{ - if (op->moving_gc) { - /* - * free_inc.size buckets are set aside for moving GC - * btree node allocations. This means that if moving GC - * runs out of new buckets for btree nodes, it will have - * put back at least free_inc.size buckets back on - * free_inc, preventing a deadlock. 
- * - * XXX: figure out a less stupid way of achieving this - */ - return RESERVE_MOVINGGC_BTREE; - } else if (op->tiering) { - /* - * Tiering needs a btree node reserve because of how - * btree_check_reserve() works -- if the cache tier is - * full, we don't want tiering to block forever. - */ - return RESERVE_TIERING_BTREE; - } - - return BTREE_ID_EXTENTS; -} - static int btree_insert_fn(struct btree_op *b_op, struct btree *b) { struct data_insert_op *op = container_of(b_op, @@ -189,10 +164,8 @@ static void bch_data_insert_keys(struct closure *cl) { struct data_insert_op *op = container_of(cl, struct data_insert_op, cl); enum btree_id id = BTREE_ID_EXTENTS; - enum alloc_reserve reserve; - reserve = bch_btree_reserve(op); - __bch_btree_op_init(&op->op, id, reserve, 0); + __bch_btree_op_init(&op->op, id, op->btree_alloc_reserve, 0); closure_call(&op->op.cl, __bch_data_insert_keys, NULL, cl); continue_at(cl, bch_data_insert_keys_done, op->c->wq); @@ -288,14 +261,16 @@ static void bch_data_insert_endio(struct bio *bio) static void bch_data_insert_start(struct closure *cl) { struct data_insert_op *op = container_of(cl, struct data_insert_op, cl); - unsigned long ptrs_to_write[BITS_TO_LONGS(MAX_CACHES_PER_SET)]; struct bio *bio = op->bio, *n; - unsigned open_bucket_nr = 0; + unsigned open_bucket_nr = 0, ptrs_from; struct open_bucket *b; if (op->discard) return bch_data_invalidate(cl); + bch_extent_drop_stale(op->c, &op->insert_key); + ptrs_from = bch_extent_ptrs(&op->insert_key); + /* * Journal writes are marked REQ_PREFLUSH; if the original write was a * flush, it'll wait on the journal write. @@ -321,15 +296,9 @@ static void bch_data_insert_start(struct closure *cl) k = op->insert_keys.top; bkey_copy(k, &op->insert_key); - bch_extent_drop_stale(op->c, k); - memset(ptrs_to_write, 0, sizeof(ptrs_to_write)); - - b = op->moving_gc - ? bch_gc_alloc_sectors(op->c, k, ptrs_to_write, cl) - : bch_alloc_sectors(op->c, k, op->write_point, op->tier, - ptrs_to_write, - op->wait ? cl : NULL); + b = bch_alloc_sectors(op->c, op->wp, k, op->wait ? cl : NULL); BUG_ON(!b); + if (PTR_ERR(b) == -EAGAIN) { /* If we already have some keys, must insert them first * before allocating another open bucket. 
We only hit @@ -357,7 +326,7 @@ static void bch_data_insert_start(struct closure *cl) trace_bcache_cache_insert(k); bio_set_op_attrs(n, REQ_OP_WRITE, 0); - bch_submit_bbio_replicas(n, op->c, k, ptrs_to_write, false); + bch_submit_bbio_replicas(n, op->c, k, ptrs_from, false); bch_extent_normalize(op->c, k); bch_keylist_push(&op->insert_keys); @@ -458,7 +427,7 @@ void bch_data_insert(struct closure *cl) &start, &end); } - if (op->moving_gc) + if (op->wp->ca) bch_mark_gc_write(c, bio_sectors(op->bio)); else if (!op->discard) bch_mark_foreground_write(c, bio_sectors(op->bio)); @@ -479,6 +448,44 @@ void bch_data_insert(struct closure *cl) continue_at_nobarrier(cl, bch_data_insert_start, NULL); } +void bch_data_insert_op_init(struct data_insert_op *op, + struct cache_set *c, + struct bio *bio, + struct write_point *wp, + bool wait, bool discard, bool flush, + struct bkey *insert_key, + struct bkey *replace_key) +{ + if (!wp) { + unsigned wp_idx = hash_long((unsigned long) current, + ilog2(ARRAY_SIZE(c->write_points))); + + BUG_ON(wp_idx > ARRAY_SIZE(c->write_points)); + wp = &c->write_points[wp_idx]; + } + + op->c = c; + op->io_wq = NULL; + op->bio = bio; + op->error = 0; + op->flags = 0; + op->wait = wait; + op->discard = discard; + op->flush = flush; + op->wp = wp; + op->btree_alloc_reserve = BTREE_ID_EXTENTS; + + memset(op->open_buckets, 0, sizeof(op->open_buckets)); + bch_keylist_init(&op->insert_keys); + bkey_copy(&op->insert_key, insert_key); + + if (replace_key) { + op->replace = true; + bkey_copy(&op->replace_key, replace_key); + } +} +EXPORT_SYMBOL(bch_data_insert_op_init); + /* Cache promotion on read */ struct cache_promote_op { @@ -597,12 +604,9 @@ static void __cache_promote(struct cache_set *c, struct bbio *orig_bio, op->orig_bio = &orig_bio->bio; op->stale = 0; - bch_data_insert_op_init(&op->iop, c, - bio, - hash_long((unsigned long) current, 16), - false, - false, - false, + bch_data_insert_op_init(&op->iop, c, bio, + &c->cache_by_alloc[0].wp, + false, false, false, replace_key, replace_key); @@ -1403,8 +1407,7 @@ static void cached_dev_write(struct cached_dev *dc, struct search *s) closure_bio_submit(bio, cl); } - bch_data_insert_op_init(&s->iop, dc->disk.c, insert_bio, - hash_long((unsigned long) current, 16), + bch_data_insert_op_init(&s->iop, dc->disk.c, insert_bio, NULL, !KEY_CACHED(&insert_key), bypass, bio->bi_opf & (REQ_PREFLUSH|REQ_FUA), &insert_key, NULL); @@ -1554,8 +1557,7 @@ static void __flash_dev_make_request(struct request_queue *q, struct bio *bio) s = search_alloc(bio, d); bio = &s->bio.bio; - bch_data_insert_op_init(&s->iop, d->c, bio, - hash_long((unsigned long) current, 16), + bch_data_insert_op_init(&s->iop, d->c, bio, NULL, true, bio_op(bio) == REQ_OP_DISCARD, bio->bi_opf & (REQ_PREFLUSH|REQ_FUA), diff --git a/drivers/md/bcache/request.h b/drivers/md/bcache/request.h index a4467456fe72..386f452f6951 100644 --- a/drivers/md/bcache/request.h +++ b/drivers/md/bcache/request.h @@ -17,11 +17,10 @@ struct data_insert_op { /* Used internally, do not touch */ struct btree_op op; - uint16_t write_point; short error; union { - uint16_t flags; + u8 flags; struct { /* Wait for data bucket allocation or just @@ -33,20 +32,17 @@ struct data_insert_op { unsigned flush:1; /* Perform a compare-exchange with replace_key? */ unsigned replace:1; - /* Tier to write to */ - unsigned tier:2; - /* Use moving GC reserves for buckets, btree nodes and - * open buckets? */ - unsigned moving_gc:1; - /* Use tiering reserves for btree nodes? 
*/ - unsigned tiering:1; - /* Set on completion */ + + /* Set on completion, if cmpxchg index update failed */ unsigned replace_collision:1; /* Internal */ unsigned insert_data_done:1; }; }; + u8 btree_alloc_reserve; + + struct write_point *wp; struct open_bucket *open_buckets[2]; struct keylist insert_keys; @@ -54,33 +50,9 @@ struct data_insert_op { BKEY_PADDED(replace_key); }; -static inline void bch_data_insert_op_init(struct data_insert_op *op, - struct cache_set *c, - struct bio *bio, - unsigned write_point, - bool wait, bool discard, bool flush, - struct bkey *insert_key, - struct bkey *replace_key) -{ - op->c = c; - op->io_wq = NULL; - op->bio = bio; - op->write_point = write_point; - op->error = 0; - op->flags = 0; - op->wait = wait; - op->discard = discard; - op->flush = flush; - - memset(op->open_buckets, 0, sizeof(op->open_buckets)); - bch_keylist_init(&op->insert_keys); - bkey_copy(&op->insert_key, insert_key); - - if (replace_key) { - op->replace = true; - bkey_copy(&op->replace_key, replace_key); - } -} +void bch_data_insert_op_init(struct data_insert_op *, struct cache_set *, + struct bio *, struct write_point *, bool, + bool, bool, struct bkey *, struct bkey *); unsigned bch_get_congested(struct cache_set *); int bch_read(struct cache_set *, struct bio *, u64); diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 9023d9feb1fc..58441478d8bd 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -2192,6 +2192,9 @@ static int cache_init(struct cache *ca) total_reserve += ca->free[i].size; pr_debug("%zu buckets reserved", total_reserve); + for (i = 0; i < ARRAY_SIZE(ca->gc_buckets); i++) + ca->gc_buckets[i].ca = ca; + mutex_init(&ca->heap_lock); init_waitqueue_head(&ca->fifo_wait); bch_moving_init_cache(ca); diff --git a/drivers/md/bcache/tier.c b/drivers/md/bcache/tier.c index 44663744ddf6..49b213d20a1f 100644 --- a/drivers/md/bcache/tier.c +++ b/drivers/md/bcache/tier.c @@ -72,15 +72,12 @@ static void read_tiering(struct cache_set *c) struct moving_io *io; struct closure cl; struct moving_io_stats stats; - unsigned write_point; trace_bcache_tiering_start(c); closure_init_stack(&cl); memset(&stats, 0, sizeof(stats)); - write_point = 0; - /* XXX: if we error, background writeback could stall indefinitely */ c->tiering_keys.last_scanned = ZERO_KEY; @@ -106,11 +103,11 @@ static void read_tiering(struct cache_set *c) io->stats = &stats; bch_data_insert_op_init(&io->op, c, &io->bio.bio, - write_point, true, false, false, + &c->cache_by_alloc[1].wp, + true, false, false, &io->w->key, &io->w->key); io->op.io_wq = c->tiering_write; - io->op.tiering = 1; - io->op.tier = 1; + io->op.btree_alloc_reserve = RESERVE_TIERING_BTREE; trace_bcache_tiering_copy(&w->key); @@ -118,12 +115,6 @@ static void read_tiering(struct cache_set *c) KEY_SIZE(&w->key) << 9); closure_call(&io->cl, bch_data_move, NULL, &cl); - - /* Try to stripe writes across cache devices by sending them - * to different open buckets */ - write_point++; - if (write_point == c->sb.nr_in_set) - write_point = 0; } closure_sync(&cl); |