-rw-r--r--  drivers/md/bcache/alloc.c      298
-rw-r--r--  drivers/md/bcache/alloc.h        7
-rw-r--r--  drivers/md/bcache/bcache.h      55
-rw-r--r--  drivers/md/bcache/btree.c        9
-rw-r--r--  drivers/md/bcache/io.c          15
-rw-r--r--  drivers/md/bcache/movinggc.c    21
-rw-r--r--  drivers/md/bcache/request.c    102
-rw-r--r--  drivers/md/bcache/request.h     46
-rw-r--r--  drivers/md/bcache/super.c        3
-rw-r--r--  drivers/md/bcache/tier.c        15
10 files changed, 222 insertions, 349 deletions
diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c
index 6209e42c5633..cc6a09c1205c 100644
--- a/drivers/md/bcache/alloc.c
+++ b/drivers/md/bcache/alloc.c
@@ -746,12 +746,14 @@ int bch_bucket_alloc_set(struct cache_set *c, enum alloc_reserve reserve,
long caches_used[BITS_TO_LONGS(MAX_CACHES_PER_SET)];
int i, ret;
- mutex_lock(&c->bucket_lock);
+ BUG_ON(tier_idx >= ARRAY_SIZE(c->cache_by_alloc));
BUG_ON(!n || n > BKEY_EXTENT_PTRS_MAX);
bkey_init(k);
memset(caches_used, 0, sizeof(caches_used));
+ mutex_lock(&c->bucket_lock);
+
/* sort by free space/prio of oldest data in caches */
for (i = 0; i < n; i++) {
@@ -850,8 +852,7 @@ static struct open_bucket *bch_open_bucket_get(struct cache_set *c,
}
static struct open_bucket *bch_open_bucket_alloc(struct cache_set *c,
- enum alloc_reserve reserve,
- int n, unsigned tier,
+ struct write_point *wp,
struct closure *cl)
{
int ret;
@@ -861,105 +862,74 @@ static struct open_bucket *bch_open_bucket_alloc(struct cache_set *c,
if (IS_ERR_OR_NULL(b))
return b;
- ret = bch_bucket_alloc_set(c, reserve, &b->key, n, tier, cl);
- if (ret) {
- BUG_ON(ret > 0);
- bch_open_bucket_put(c, b);
- b = ERR_PTR(ret);
+ if (wp->ca) {
+ long bucket;
+
+ mutex_lock(&c->bucket_lock);
+
+ bucket = bch_bucket_alloc(wp->ca, RESERVE_MOVINGGC, cl);
+ if (bucket < 0) {
+ ret = bucket;
+ mutex_unlock(&c->bucket_lock);
+ goto err;
+ }
+
+ b->key.val[0] = PTR(wp->ca->bucket_gens[bucket],
+ bucket_to_sector(wp->ca->set, bucket),
+ wp->ca->sb.nr_this_dev);
+ bch_set_extent_ptrs(&b->key, 1);
+
+ mutex_unlock(&c->bucket_lock);
+ } else if (wp->tier) {
+ ret = bch_bucket_alloc_set(c, RESERVE_NONE, &b->key, 1,
+ wp->tier - c->cache_by_alloc, cl);
+ if (ret)
+ goto err;
+ } else {
+ ret = bch_bucket_alloc_set(c, RESERVE_NONE, &b->key,
+ CACHE_SET_DATA_REPLICAS_WANT(&c->sb),
+ 0, cl);
+ if (ret)
+ goto err;
}
return b;
+err:
+ bch_open_bucket_put(c, b);
+ return ERR_PTR(ret);
}
/* Sector allocator */
-/*
- * We keep multiple buckets open for writes, and try to segregate different
- * write streams for better cache utilization: first we look for a bucket where
- * the last write to it was sequential with the current write, and failing that
- * we look for a bucket that was last used by the same task.
- *
- * The ideas is if you've got multiple tasks pulling data into the cache at the
- * same time, you'll get better cache utilization if you try to segregate their
- * data and preserve locality.
- *
- * For example, say you've starting Firefox at the same time you're copying a
- * bunch of files. Firefox will likely end up being fairly hot and stay in the
- * cache awhile, but the data you copied might not be; if you wrote all that
- * data to the same buckets it'd get invalidated at the same time.
- *
- * Both of those tasks will be doing fairly random IO so we can't rely on
- * detecting sequential IO to segregate their data, but going off of the task
- * should be a sane heuristic.
- */
-static struct open_bucket *pick_data_bucket(struct cache_set *c,
- const struct bkey *search,
- unsigned write_point,
- unsigned tier_idx,
- struct closure *cl)
- __releases(c->open_buckets_lock)
- __acquires(c->open_buckets_lock)
+static struct open_bucket *lock_and_refill_writepoint(struct cache_set *c,
+ struct write_point *wp,
+ struct closure *cl)
{
- struct cache_tier *tier = &c->cache_by_alloc[tier_idx];
struct open_bucket *b;
- int i, wp = -1;
-retry:
- for (i = 0;
- i < ARRAY_SIZE(tier->data_buckets) &&
- (b = tier->data_buckets[i]); i++) {
- /* Tiering thread already writes keys in order, maximize
- * write bandwidth instead */
- if (tier_idx == 0 && !bkey_cmp(&b->key, &START_KEY(search)))
- goto found;
- else if (b->last_write_point == write_point)
- wp = i;
- }
-
- i = wp;
- if (i >= 0)
- goto found;
-
- i = ARRAY_SIZE(tier->data_buckets) - 1;
- if (tier->data_buckets[i])
- goto found;
- spin_unlock(&c->open_buckets_lock);
- b = bch_open_bucket_alloc(c, RESERVE_NONE,
- CACHE_SET_DATA_REPLICAS_WANT(&c->sb),
- tier_idx, cl);
- spin_lock(&c->open_buckets_lock);
+ while (1) {
+ b = ACCESS_ONCE(wp->b);
+ if (b) {
+ spin_lock(&b->lock);
+ if (wp->b == b)
+ return b;
- if (IS_ERR_OR_NULL(b))
- return b;
+ spin_unlock(&b->lock);
+ } else {
+ b = bch_open_bucket_alloc(c, wp, cl);
+ if (IS_ERR_OR_NULL(b))
+ return b;
- if (tier->data_buckets[i]) {
- /* we raced - and we must unlock to call bch_bucket_free()... */
- spin_unlock(&c->open_buckets_lock);
- bch_bucket_free_never_used(c, &b->key);
- spin_lock(&c->open_buckets_lock);
+ spin_lock(&b->lock);
+ if (!race_fault() &&
+ cmpxchg(&wp->b, NULL, b) == NULL)
+ return b;
+ spin_unlock(&b->lock);
- __bch_open_bucket_put(c, b);
- goto retry;
- } else {
- tier->data_buckets[i] = b;
+ bch_bucket_free_never_used(c, &b->key);
+ bch_open_bucket_put(c, b);
+ }
}
-found:
- b = tier->data_buckets[i];
-
- /*
- * Move b to the end of the lru, and keep track of what
- * this bucket was last used for:
- */
- memmove(&tier->data_buckets[1],
- &tier->data_buckets[0],
- sizeof(struct open_bucket *) * i);
-
- tier->data_buckets[0] = b;
-
- b->last_write_point = write_point;
- bkey_copy_key(&b->key, search);
-
- return b;
}
static void verify_not_stale(struct cache_set *c, struct bkey *k)
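/*
 * For reference, a minimal sketch of how the new scheme preserves the
 * per-task segregation that pick_data_bucket() used to provide: instead of
 * scanning a per-tier LRU of open buckets for a matching last_write_point,
 * the submitting task is hashed onto a small fixed array of write points.
 * This mirrors bch_data_insert_op_init() later in this patch; the helper
 * name below is illustrative only, not part of the patch.
 */
static struct write_point *pick_foreground_write_point(struct cache_set *c)
{
	unsigned wp_idx = hash_long((unsigned long) current,
				    ilog2(ARRAY_SIZE(c->write_points)));

	return &c->write_points[wp_idx];
}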
@@ -993,34 +963,28 @@ static void verify_not_stale(struct cache_set *c, struct bkey *k)
* @tier_idx - which tier this write is destined towards
* @cl - closure to wait for a bucket
*/
-struct open_bucket *bch_alloc_sectors(struct cache_set *c, struct bkey *k,
- unsigned write_point, unsigned tier_idx,
- unsigned long *ptrs_to_write,
+struct open_bucket *bch_alloc_sectors(struct cache_set *c,
+ struct write_point *wp,
+ struct bkey *k,
struct closure *cl)
{
- struct cache_tier *tier = &c->cache_by_alloc[tier_idx];
struct open_bucket *b;
unsigned i, sectors;
- spin_lock(&c->open_buckets_lock);
-
- b = pick_data_bucket(c, k, write_point, tier_idx, cl);
+ b = lock_and_refill_writepoint(c, wp, cl);
if (IS_ERR_OR_NULL(b))
- goto out;
+ return b;
- BUG_ON(b != tier->data_buckets[0]);
+ BUG_ON(!b->sectors_free);
verify_not_stale(c, &b->key);
/* Set up the pointer to the space we're allocating: */
+ memcpy(&k->val[bch_extent_ptrs(k)],
+ &b->key.val[0],
+ bch_extent_ptrs(&b->key) * sizeof(u64));
- for (i = 0; i < bch_extent_ptrs(&b->key); i++) {
- unsigned ptrs = bch_extent_ptrs(k);
-
- k->val[ptrs] = b->key.val[i];
- __set_bit(ptrs, ptrs_to_write);
- bch_set_extent_ptrs(k, ptrs + 1);
- }
+ bch_set_extent_ptrs(k, bch_extent_ptrs(k) + bch_extent_ptrs(&b->key));
sectors = min_t(unsigned, KEY_SIZE(k), b->sectors_free);
@@ -1030,6 +994,10 @@ struct open_bucket *bch_alloc_sectors(struct cache_set *c, struct bkey *k,
/* update open bucket for next time: */
b->sectors_free -= sectors;
+ if (b->sectors_free)
+ atomic_inc(&b->pin);
+ else
+ BUG_ON(xchg(&wp->b, NULL) != b);
rcu_read_lock();
for (i = 0; i < bch_extent_ptrs(&b->key); i++) {
@@ -1044,127 +1012,11 @@ struct open_bucket *bch_alloc_sectors(struct cache_set *c, struct bkey *k,
}
rcu_read_unlock();
- /*
- * k takes refcounts on the buckets it points to until it's inserted
- * into the btree, but if we're done with this bucket we just transfer
- * get_data_bucket()'s refcount.
- */
-
- if (b->sectors_free) {
- atomic_inc(&b->pin);
- } else {
- memmove(&tier->data_buckets[0],
- &tier->data_buckets[1],
- sizeof(struct open_bucket *) *
- (ARRAY_SIZE(tier->data_buckets) - 1));
- tier->data_buckets[ARRAY_SIZE(tier->data_buckets) - 1] = NULL;
- }
-out:
- spin_unlock(&c->open_buckets_lock);
+ spin_unlock(&b->lock);
return b;
}
-struct open_bucket *bch_gc_alloc_sectors(struct cache_set *c, struct bkey *k,
- unsigned long *ptrs_to_write,
- struct closure *cl)
-{
- unsigned i, gen, sectors = KEY_SIZE(k);
- struct cache *ca;
- struct open_bucket *b;
- long bucket;
-
- mutex_lock(&c->bucket_lock);
-retry:
- /* Check if we raced with a foreground write */
-
- rcu_read_lock();
- for (i = 0; i < bch_extent_ptrs(k); i++)
- if ((ca = PTR_CACHE(c, k, i)) &&
- (gen = PTR_BUCKET(c, ca, k, i)->copygc_gen)) {
- gen--;
- percpu_ref_get(&ca->ref);
- rcu_read_unlock();
- goto found;
- }
- rcu_read_unlock();
- mutex_unlock(&c->bucket_lock);
- return ERR_PTR(-ESRCH);
-found:
- b = ca->gc_buckets[gen];
- if (!b) {
- mutex_unlock(&c->bucket_lock);
-
- b = bch_open_bucket_get(c, NULL);
- if (WARN_ONCE(IS_ERR(b),
- "bcache: movinggc bucket allocation failed with %ld",
- PTR_ERR(b))) {
- b = ERR_PTR(-ENOSPC);
- goto out_put;
- }
-
- mutex_lock(&c->bucket_lock);
-
- bucket = bch_bucket_alloc(ca, RESERVE_MOVINGGC, NULL);
- if (WARN_ONCE(bucket < 0,
- "bcache: movinggc bucket allocation failed with %ld",
- bucket)) {
- mutex_unlock(&c->bucket_lock);
- bch_open_bucket_put(c, b);
- b = ERR_PTR(-ENOSPC);
- goto out_put;
- }
-
- b->key.val[0] = PTR(ca->bucket_gens[bucket],
- bucket_to_sector(ca->set, bucket),
- ca->sb.nr_this_dev);
- bch_set_extent_ptrs(&b->key, 1);
-
- /* we dropped bucket_lock, might've raced */
- if (ca->gc_buckets[gen] || race_fault()) {
- /* we raced */
- mutex_unlock(&c->bucket_lock);
- bch_bucket_free_never_used(c, &b->key);
- bch_open_bucket_put(c, b);
- mutex_lock(&c->bucket_lock);
- } else {
- ca->gc_buckets[gen] = b;
- }
-
- /*
- * GC_GEN() might also have been reset... don't strictly need to
- * recheck though
- */
- percpu_ref_put(&ca->ref);
- goto retry;
- }
-
- verify_not_stale(c, &b->key);
-
- k->val[i] = b->key.val[0];
- __set_bit(i, ptrs_to_write);
-
- sectors = min_t(unsigned, sectors, b->sectors_free);
-
- SET_KEY_OFFSET(k, KEY_START(k) + sectors);
- SET_KEY_SIZE(k, sectors);
-
- /* update open bucket for next time: */
-
- b->sectors_free -= sectors;
- if (b->sectors_free) {
- SET_PTR_OFFSET(&b->key, 0, PTR_OFFSET(&b->key, 0) + sectors);
- atomic_inc(&b->pin);
- } else
- ca->gc_buckets[gen] = NULL;
-
- atomic_long_add(sectors, &ca->sectors_written);
- mutex_unlock(&c->bucket_lock);
-out_put:
- percpu_ref_put(&ca->ref);
- return b;
-}
-
void bch_mark_open_buckets(struct cache_set *c)
{
struct cache *ca;
@@ -1214,9 +1066,13 @@ void bch_open_buckets_init(struct cache_set *c)
spin_lock_init(&c->open_buckets_lock);
for (i = 0; i < ARRAY_SIZE(c->open_buckets); i++) {
+ spin_lock_init(&c->open_buckets[i].lock);
c->open_buckets_nr_free++;
list_add(&c->open_buckets[i].list, &c->open_buckets_free);
}
+
+ for (i = 0; i < ARRAY_SIZE(c->cache_by_alloc); i++)
+ c->cache_by_alloc[i].wp.tier = &c->cache_by_alloc[i];
}
int bch_cache_allocator_start(struct cache *ca)
diff --git a/drivers/md/bcache/alloc.h b/drivers/md/bcache/alloc.h
index adbe0177937a..c4dff7d53e5c 100644
--- a/drivers/md/bcache/alloc.h
+++ b/drivers/md/bcache/alloc.h
@@ -48,11 +48,8 @@ int bch_bucket_alloc_set(struct cache_set *, enum alloc_reserve, struct bkey *,
void bch_open_bucket_put(struct cache_set *, struct open_bucket *);
-struct open_bucket *bch_alloc_sectors(struct cache_set *, struct bkey *,
- unsigned, unsigned,
- unsigned long *, struct closure *);
-struct open_bucket *bch_gc_alloc_sectors(struct cache_set *, struct bkey *,
- unsigned long *, struct closure *);
+struct open_bucket *bch_alloc_sectors(struct cache_set *, struct write_point *,
+ struct bkey *, struct closure *);
void bch_mark_open_buckets(struct cache_set *);
diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index 03e316d51b76..4c741260dd6e 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -463,7 +463,20 @@ struct cached_dev {
* and one for moving GC */
enum alloc_reserve {
RESERVE_PRIO = BTREE_ID_NR,
+ /*
+ * free_inc.size buckets are set aside for moving GC btree node
+ * allocations. This means that if moving GC runs out of new buckets for
+ * btree nodes, it will have put back at least free_inc.size buckets
+ * back on free_inc, preventing a deadlock.
+ *
+ * XXX: figure out a less stupid way of achieving this
+ */
RESERVE_MOVINGGC_BTREE,
+ /*
+ * Tiering needs a btree node reserve because of how
+ * btree_check_reserve() works -- if the cache tier is full, we don't
+ * want tiering to block forever.
+ */
RESERVE_TIERING_BTREE,
RESERVE_METADATA_LAST = RESERVE_TIERING_BTREE,
RESERVE_MOVINGGC,
@@ -481,14 +494,39 @@ enum alloc_reserve {
/* Enough for 16 cache devices, 2 tiers and some left over for pipelining */
#define OPEN_BUCKETS_COUNT 256
+#define WRITE_POINT_COUNT 16
+
struct open_bucket {
struct list_head list;
+ spinlock_t lock;
atomic_t pin;
- unsigned last_write_point;
unsigned sectors_free;
BKEY_PADDED(key);
};
+struct write_point {
+ struct open_bucket *b;
+
+ /*
+ * If not NULL, refill from that device (this write point is a member of
+ * that struct cache)
+ *
+ * If NULL, do a normal replicated bucket allocation
+ */
+ struct cache *ca;
+
+ /*
+ * If not NULL, tier specific writepoint used by tiering/promotion -
+ * always allocates a single replica
+ */
+ struct cache_tier *tier;
+
+ /*
+ * Otherwise do a normal replicated bucket allocation that could come
+ * from any tier (foreground write)
+ */
+};
+
struct bucket_stats {
u64 buckets_dirty;
u64 buckets_cached;
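/*
 * A minimal sketch (illustrative only) of the dispatch implied by the three
 * write_point cases above; the authoritative version is the rewritten
 * bch_open_bucket_alloc() in alloc.c, which additionally takes
 * c->bucket_lock around the single-device allocation.
 */
static int write_point_alloc_sketch(struct cache_set *c,
				    struct write_point *wp,
				    struct bkey *k, struct closure *cl)
{
	if (wp->ca) {
		/* moving GC: one bucket from this device's GC reserve */
		long b = bch_bucket_alloc(wp->ca, RESERVE_MOVINGGC, cl);

		if (b < 0)
			return b;

		k->val[0] = PTR(wp->ca->bucket_gens[b],
				bucket_to_sector(wp->ca->set, b),
				wp->ca->sb.nr_this_dev);
		bch_set_extent_ptrs(k, 1);
		return 0;
	}

	if (wp->tier)
		/* tiering/promotion: a single replica from that tier */
		return bch_bucket_alloc_set(c, RESERVE_NONE, k, 1,
					    wp->tier - c->cache_by_alloc, cl);

	/* foreground write: normal replicated allocation, any tier */
	return bch_bucket_alloc_set(c, RESERVE_NONE, k,
				    CACHE_SET_DATA_REPLICAS_WANT(&c->sb),
				    0, cl);
}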
@@ -583,7 +621,7 @@ struct cache {
* Protected by bucket_lock.
*/
#define NUM_GC_GENS 7
- struct open_bucket *gc_buckets[NUM_GC_GENS];
+ struct write_point gc_buckets[NUM_GC_GENS];
struct journal_device journal;
@@ -623,12 +661,15 @@ struct gc_stat {
#define CACHE_SET_STOPPING 1
#define CACHE_SET_RUNNING 2
-#define TIER_OPEN_BUCKETS_COUNT 16
-
struct cache_tier {
unsigned nr_devices;
struct cache *devices[MAX_CACHES_PER_SET];
- struct open_bucket *data_buckets[TIER_OPEN_BUCKETS_COUNT];
+
+ /*
+ * writepoint specific to this tier, for cache promote/background
+ * tiering
+ */
+ struct write_point wp;
};
struct prio_clock {
@@ -735,6 +776,8 @@ struct cache_set {
spinlock_t open_buckets_lock;
struct open_bucket open_buckets[OPEN_BUCKETS_COUNT];
+ struct write_point write_points[WRITE_POINT_COUNT];
+
/* GARBAGE COLLECTION */
struct task_struct *gc_thread;
@@ -1087,7 +1130,7 @@ void bch_bbio_prep(struct bbio *, struct cache *);
void bch_submit_bbio(struct bbio *, struct cache *, struct bkey *,
unsigned, bool);
void bch_submit_bbio_replicas(struct bio *, struct cache_set *,
- struct bkey *, unsigned long *, bool);
+ struct bkey *, unsigned, bool);
void bch_bbio_reset(struct bbio *bio);
__printf(2, 3)
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index e4a4a24f2a3b..99a61145cc67 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -456,7 +456,6 @@ static void do_btree_node_write(struct btree *b)
{
struct closure *cl = &b->io;
struct bset *i = btree_bset_last(b);
- unsigned long ptrs_to_write[BITS_TO_LONGS(MAX_CACHES_PER_SET)];
BKEY_PADDED(key) k;
int n;
@@ -478,8 +477,6 @@ static void do_btree_node_write(struct btree *b)
bio_set_op_attrs(b->bio, REQ_OP_WRITE, REQ_META|WRITE_SYNC|REQ_FUA);
bch_bio_map(b->bio, i);
- memset(ptrs_to_write, 0xFF, sizeof(ptrs_to_write));
-
/*
* If we're appending to a leaf node, we don't technically need FUA -
* this write just needs to be persisted before the next journal write,
@@ -509,8 +506,7 @@ static void do_btree_node_write(struct btree *b)
memcpy(page_address(bv->bv_page),
base + j * PAGE_SIZE, PAGE_SIZE);
- bch_submit_bbio_replicas(b->bio, b->c, &k.key,
- ptrs_to_write, true);
+ bch_submit_bbio_replicas(b->bio, b->c, &k.key, 0, true);
continue_at(cl, btree_node_write_done, NULL);
} else {
trace_bcache_btree_bounce_write_fail(b);
@@ -518,8 +514,7 @@ static void do_btree_node_write(struct btree *b)
b->bio->bi_vcnt = 0;
bch_bio_map(b->bio, i);
- bch_submit_bbio_replicas(b->bio, b->c, &k.key,
- ptrs_to_write, true);
+ bch_submit_bbio_replicas(b->bio, b->c, &k.key, 0, true);
closure_sync(cl);
continue_at_nobarrier(cl, __btree_node_write_done, NULL);
diff --git a/drivers/md/bcache/io.c b/drivers/md/bcache/io.c
index 697cb295b8e8..d565ad1f496d 100644
--- a/drivers/md/bcache/io.c
+++ b/drivers/md/bcache/io.c
@@ -94,17 +94,14 @@ void bch_submit_bbio(struct bbio *b, struct cache *ca,
}
void bch_submit_bbio_replicas(struct bio *bio, struct cache_set *c,
- struct bkey *k, unsigned long *ptrs_to_write,
- bool punt)
+ struct bkey *k, unsigned ptrs_from, bool punt)
{
struct cache *ca;
- unsigned ptr, next, nr_ptrs = bch_extent_ptrs(k);
-
- for (ptr = find_first_bit(ptrs_to_write, nr_ptrs);
- ptr != nr_ptrs;
- ptr = next) {
- next = find_next_bit(ptrs_to_write, nr_ptrs, ptr + 1);
+ unsigned ptr;
+ for (ptr = ptrs_from;
+ ptr < bch_extent_ptrs(k);
+ ptr++) {
rcu_read_lock();
ca = PTR_CACHE(c, k, ptr);
if (ca)
@@ -116,7 +113,7 @@ void bch_submit_bbio_replicas(struct bio *bio, struct cache_set *c,
break;
}
- if (next != nr_ptrs) {
+ if (ptr + 1 < bch_extent_ptrs(k)) {
struct bio *n = bio_clone_fast(bio, GFP_NOIO,
ca->replica_set);
n->bi_end_io = bio->bi_end_io;
diff --git a/drivers/md/bcache/movinggc.c b/drivers/md/bcache/movinggc.c
index 998646c57452..91956f0e6e6b 100644
--- a/drivers/md/bcache/movinggc.c
+++ b/drivers/md/bcache/movinggc.c
@@ -71,6 +71,8 @@ static void read_moving(struct cache *ca, struct moving_io_stats *stats)
struct keybuf_key *w;
struct moving_io *io;
struct closure cl;
+ struct write_point *wp;
+ unsigned ptr, gen;
closure_init_stack(&cl);
bch_ratelimit_reset(&ca->moving_gc_pd.rate);
@@ -85,6 +87,19 @@ static void read_moving(struct cache *ca, struct moving_io_stats *stats)
if (!w)
break;
+ for (ptr = 0; ptr < bch_extent_ptrs(&w->key); ptr++)
+ if ((ca->sb.nr_this_dev == PTR_DEV(&w->key, ptr)) &&
+ (gen = PTR_BUCKET(c, ca, &w->key,
+ ptr)->copygc_gen)) {
+ gen--;
+ BUG_ON(gen >= ARRAY_SIZE(ca->gc_buckets));
+ wp = &ca->gc_buckets[gen];
+ goto found;
+ }
+
+ bch_keybuf_put(&ca->moving_gc_keys, w);
+ continue;
+found:
io = kzalloc(sizeof(struct moving_io) + sizeof(struct bio_vec)
* DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS),
GFP_KERNEL);
@@ -98,11 +113,13 @@ static void read_moving(struct cache *ca, struct moving_io_stats *stats)
io->keybuf = &ca->moving_gc_keys;
io->stats = stats;
- bch_data_insert_op_init(&io->op, c, &io->bio.bio, 0,
+ bch_data_insert_op_init(&io->op, c, &io->bio.bio, wp,
false, false, false,
&io->w->key, &io->w->key);
io->op.io_wq = ca->moving_gc_write;
- io->op.moving_gc = true;
+ io->op.btree_alloc_reserve = RESERVE_MOVINGGC_BTREE;
+
+ bch_extent_drop_ptr(&io->op.insert_key, ptr);
trace_bcache_gc_copy(&w->key);
diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
index 08f82223c0ef..4f62ff07afa1 100644
--- a/drivers/md/bcache/request.c
+++ b/drivers/md/bcache/request.c
@@ -103,31 +103,6 @@ static void bio_csum(struct bio *bio, struct bkey *k)
/* Insert data into cache */
-static enum alloc_reserve bch_btree_reserve(struct data_insert_op *op)
-{
- if (op->moving_gc) {
- /*
- * free_inc.size buckets are set aside for moving GC
- * btree node allocations. This means that if moving GC
- * runs out of new buckets for btree nodes, it will have
- * put back at least free_inc.size buckets back on
- * free_inc, preventing a deadlock.
- *
- * XXX: figure out a less stupid way of achieving this
- */
- return RESERVE_MOVINGGC_BTREE;
- } else if (op->tiering) {
- /*
- * Tiering needs a btree node reserve because of how
- * btree_check_reserve() works -- if the cache tier is
- * full, we don't want tiering to block forever.
- */
- return RESERVE_TIERING_BTREE;
- }
-
- return BTREE_ID_EXTENTS;
-}
-
static int btree_insert_fn(struct btree_op *b_op, struct btree *b)
{
struct data_insert_op *op = container_of(b_op,
@@ -189,10 +164,8 @@ static void bch_data_insert_keys(struct closure *cl)
{
struct data_insert_op *op = container_of(cl, struct data_insert_op, cl);
enum btree_id id = BTREE_ID_EXTENTS;
- enum alloc_reserve reserve;
- reserve = bch_btree_reserve(op);
- __bch_btree_op_init(&op->op, id, reserve, 0);
+ __bch_btree_op_init(&op->op, id, op->btree_alloc_reserve, 0);
closure_call(&op->op.cl, __bch_data_insert_keys, NULL, cl);
continue_at(cl, bch_data_insert_keys_done, op->c->wq);
@@ -288,14 +261,16 @@ static void bch_data_insert_endio(struct bio *bio)
static void bch_data_insert_start(struct closure *cl)
{
struct data_insert_op *op = container_of(cl, struct data_insert_op, cl);
- unsigned long ptrs_to_write[BITS_TO_LONGS(MAX_CACHES_PER_SET)];
struct bio *bio = op->bio, *n;
- unsigned open_bucket_nr = 0;
+ unsigned open_bucket_nr = 0, ptrs_from;
struct open_bucket *b;
if (op->discard)
return bch_data_invalidate(cl);
+ bch_extent_drop_stale(op->c, &op->insert_key);
+ ptrs_from = bch_extent_ptrs(&op->insert_key);
+
/*
* Journal writes are marked REQ_PREFLUSH; if the original write was a
* flush, it'll wait on the journal write.
@@ -321,15 +296,9 @@ static void bch_data_insert_start(struct closure *cl)
k = op->insert_keys.top;
bkey_copy(k, &op->insert_key);
- bch_extent_drop_stale(op->c, k);
- memset(ptrs_to_write, 0, sizeof(ptrs_to_write));
-
- b = op->moving_gc
- ? bch_gc_alloc_sectors(op->c, k, ptrs_to_write, cl)
- : bch_alloc_sectors(op->c, k, op->write_point, op->tier,
- ptrs_to_write,
- op->wait ? cl : NULL);
+ b = bch_alloc_sectors(op->c, op->wp, k, op->wait ? cl : NULL);
BUG_ON(!b);
+
if (PTR_ERR(b) == -EAGAIN) {
/* If we already have some keys, must insert them first
* before allocating another open bucket. We only hit
@@ -357,7 +326,7 @@ static void bch_data_insert_start(struct closure *cl)
trace_bcache_cache_insert(k);
bio_set_op_attrs(n, REQ_OP_WRITE, 0);
- bch_submit_bbio_replicas(n, op->c, k, ptrs_to_write, false);
+ bch_submit_bbio_replicas(n, op->c, k, ptrs_from, false);
bch_extent_normalize(op->c, k);
bch_keylist_push(&op->insert_keys);
@@ -458,7 +427,7 @@ void bch_data_insert(struct closure *cl)
&start, &end);
}
- if (op->moving_gc)
+ if (op->wp->ca)
bch_mark_gc_write(c, bio_sectors(op->bio));
else if (!op->discard)
bch_mark_foreground_write(c, bio_sectors(op->bio));
@@ -479,6 +448,44 @@ void bch_data_insert(struct closure *cl)
continue_at_nobarrier(cl, bch_data_insert_start, NULL);
}
+void bch_data_insert_op_init(struct data_insert_op *op,
+ struct cache_set *c,
+ struct bio *bio,
+ struct write_point *wp,
+ bool wait, bool discard, bool flush,
+ struct bkey *insert_key,
+ struct bkey *replace_key)
+{
+ if (!wp) {
+ unsigned wp_idx = hash_long((unsigned long) current,
+ ilog2(ARRAY_SIZE(c->write_points)));
+
+ BUG_ON(wp_idx >= ARRAY_SIZE(c->write_points));
+ wp = &c->write_points[wp_idx];
+ }
+
+ op->c = c;
+ op->io_wq = NULL;
+ op->bio = bio;
+ op->error = 0;
+ op->flags = 0;
+ op->wait = wait;
+ op->discard = discard;
+ op->flush = flush;
+ op->wp = wp;
+ op->btree_alloc_reserve = BTREE_ID_EXTENTS;
+
+ memset(op->open_buckets, 0, sizeof(op->open_buckets));
+ bch_keylist_init(&op->insert_keys);
+ bkey_copy(&op->insert_key, insert_key);
+
+ if (replace_key) {
+ op->replace = true;
+ bkey_copy(&op->replace_key, replace_key);
+ }
+}
+EXPORT_SYMBOL(bch_data_insert_op_init);
+
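/*
 * Call sites elsewhere in this patch, collected for comparison: pass NULL to
 * hash the submitting task onto one of the shared foreground write points,
 * or pass a specific write point to keep a background stream segregated.
 */

	/* foreground write (cached_dev_write) -- per-task write point: */
	bch_data_insert_op_init(&s->iop, dc->disk.c, insert_bio, NULL,
				!KEY_CACHED(&insert_key), bypass,
				bio->bi_opf & (REQ_PREFLUSH|REQ_FUA),
				&insert_key, NULL);

	/* moving GC (read_moving) -- per-device GC write point: */
	bch_data_insert_op_init(&io->op, c, &io->bio.bio, wp,
				false, false, false,
				&io->w->key, &io->w->key);

	/* tiering (read_tiering) -- per-tier write point: */
	bch_data_insert_op_init(&io->op, c, &io->bio.bio,
				&c->cache_by_alloc[1].wp,
				true, false, false,
				&io->w->key, &io->w->key);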
/* Cache promotion on read */
struct cache_promote_op {
@@ -597,12 +604,9 @@ static void __cache_promote(struct cache_set *c, struct bbio *orig_bio,
op->orig_bio = &orig_bio->bio;
op->stale = 0;
- bch_data_insert_op_init(&op->iop, c,
- bio,
- hash_long((unsigned long) current, 16),
- false,
- false,
- false,
+ bch_data_insert_op_init(&op->iop, c, bio,
+ &c->cache_by_alloc[0].wp,
+ false, false, false,
replace_key,
replace_key);
@@ -1403,8 +1407,7 @@ static void cached_dev_write(struct cached_dev *dc, struct search *s)
closure_bio_submit(bio, cl);
}
- bch_data_insert_op_init(&s->iop, dc->disk.c, insert_bio,
- hash_long((unsigned long) current, 16),
+ bch_data_insert_op_init(&s->iop, dc->disk.c, insert_bio, NULL,
!KEY_CACHED(&insert_key), bypass,
bio->bi_opf & (REQ_PREFLUSH|REQ_FUA),
&insert_key, NULL);
@@ -1554,8 +1557,7 @@ static void __flash_dev_make_request(struct request_queue *q, struct bio *bio)
s = search_alloc(bio, d);
bio = &s->bio.bio;
- bch_data_insert_op_init(&s->iop, d->c, bio,
- hash_long((unsigned long) current, 16),
+ bch_data_insert_op_init(&s->iop, d->c, bio, NULL,
true,
bio_op(bio) == REQ_OP_DISCARD,
bio->bi_opf & (REQ_PREFLUSH|REQ_FUA),
diff --git a/drivers/md/bcache/request.h b/drivers/md/bcache/request.h
index a4467456fe72..386f452f6951 100644
--- a/drivers/md/bcache/request.h
+++ b/drivers/md/bcache/request.h
@@ -17,11 +17,10 @@ struct data_insert_op {
/* Used internally, do not touch */
struct btree_op op;
- uint16_t write_point;
short error;
union {
- uint16_t flags;
+ u8 flags;
struct {
/* Wait for data bucket allocation or just
@@ -33,20 +32,17 @@ struct data_insert_op {
unsigned flush:1;
/* Perform a compare-exchange with replace_key? */
unsigned replace:1;
- /* Tier to write to */
- unsigned tier:2;
- /* Use moving GC reserves for buckets, btree nodes and
- * open buckets? */
- unsigned moving_gc:1;
- /* Use tiering reserves for btree nodes? */
- unsigned tiering:1;
- /* Set on completion */
+
+ /* Set on completion, if cmpxchg index update failed */
unsigned replace_collision:1;
/* Internal */
unsigned insert_data_done:1;
};
};
+ u8 btree_alloc_reserve;
+
+ struct write_point *wp;
struct open_bucket *open_buckets[2];
struct keylist insert_keys;
@@ -54,33 +50,9 @@ struct data_insert_op {
BKEY_PADDED(replace_key);
};
-static inline void bch_data_insert_op_init(struct data_insert_op *op,
- struct cache_set *c,
- struct bio *bio,
- unsigned write_point,
- bool wait, bool discard, bool flush,
- struct bkey *insert_key,
- struct bkey *replace_key)
-{
- op->c = c;
- op->io_wq = NULL;
- op->bio = bio;
- op->write_point = write_point;
- op->error = 0;
- op->flags = 0;
- op->wait = wait;
- op->discard = discard;
- op->flush = flush;
-
- memset(op->open_buckets, 0, sizeof(op->open_buckets));
- bch_keylist_init(&op->insert_keys);
- bkey_copy(&op->insert_key, insert_key);
-
- if (replace_key) {
- op->replace = true;
- bkey_copy(&op->replace_key, replace_key);
- }
-}
+void bch_data_insert_op_init(struct data_insert_op *, struct cache_set *,
+ struct bio *, struct write_point *, bool,
+ bool, bool, struct bkey *, struct bkey *);
unsigned bch_get_congested(struct cache_set *);
int bch_read(struct cache_set *, struct bio *, u64);
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index 9023d9feb1fc..58441478d8bd 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -2192,6 +2192,9 @@ static int cache_init(struct cache *ca)
total_reserve += ca->free[i].size;
pr_debug("%zu buckets reserved", total_reserve);
+ for (i = 0; i < ARRAY_SIZE(ca->gc_buckets); i++)
+ ca->gc_buckets[i].ca = ca;
+
mutex_init(&ca->heap_lock);
init_waitqueue_head(&ca->fifo_wait);
bch_moving_init_cache(ca);
diff --git a/drivers/md/bcache/tier.c b/drivers/md/bcache/tier.c
index 44663744ddf6..49b213d20a1f 100644
--- a/drivers/md/bcache/tier.c
+++ b/drivers/md/bcache/tier.c
@@ -72,15 +72,12 @@ static void read_tiering(struct cache_set *c)
struct moving_io *io;
struct closure cl;
struct moving_io_stats stats;
- unsigned write_point;
trace_bcache_tiering_start(c);
closure_init_stack(&cl);
memset(&stats, 0, sizeof(stats));
- write_point = 0;
-
/* XXX: if we error, background writeback could stall indefinitely */
c->tiering_keys.last_scanned = ZERO_KEY;
@@ -106,11 +103,11 @@ static void read_tiering(struct cache_set *c)
io->stats = &stats;
bch_data_insert_op_init(&io->op, c, &io->bio.bio,
- write_point, true, false, false,
+ &c->cache_by_alloc[1].wp,
+ true, false, false,
&io->w->key, &io->w->key);
io->op.io_wq = c->tiering_write;
- io->op.tiering = 1;
- io->op.tier = 1;
+ io->op.btree_alloc_reserve = RESERVE_TIERING_BTREE;
trace_bcache_tiering_copy(&w->key);
@@ -118,12 +115,6 @@ static void read_tiering(struct cache_set *c)
KEY_SIZE(&w->key) << 9);
closure_call(&io->cl, bch_data_move, NULL, &cl);
-
- /* Try to stripe writes across cache devices by sending them
- * to different open buckets */
- write_point++;
- if (write_point == c->sb.nr_in_set)
- write_point = 0;
}
closure_sync(&cl);