-rw-r--r--  drivers/md/bcache/alloc.c    | 298
-rw-r--r--  drivers/md/bcache/alloc.h    |   7
-rw-r--r--  drivers/md/bcache/bcache.h   |  55
-rw-r--r--  drivers/md/bcache/btree.c    |   9
-rw-r--r--  drivers/md/bcache/io.c       |  15
-rw-r--r--  drivers/md/bcache/movinggc.c |  21
-rw-r--r--  drivers/md/bcache/request.c  | 102
-rw-r--r--  drivers/md/bcache/request.h  |  46
-rw-r--r--  drivers/md/bcache/super.c    |   3
-rw-r--r--  drivers/md/bcache/tier.c     |  15
10 files changed, 222 insertions, 349 deletions
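
The heart of the patch is in the alloc.c diff below: the per-request write_point index and the per-tier data_buckets[] LRU are replaced by explicit struct write_point objects, and an empty write point is refilled via a lockless cmpxchg() plus a per-open-bucket spinlock (lock_and_refill_writepoint()). The following is a minimal userspace sketch of that refill-and-carve pattern, not the kernel code itself: C11 atomics and a pthread mutex stand in for cmpxchg() and the open bucket spinlock, and alloc_bucket(), the 1024-sector bucket size, and all other names are invented for illustration.

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct open_bucket {
	pthread_mutex_t lock;
	unsigned sectors_free;
};

struct write_point {
	_Atomic(struct open_bucket *) b;	/* current bucket, NULL when empty */
};

/* Invented stand-in for bch_open_bucket_alloc(): hand out a fresh bucket. */
static struct open_bucket *alloc_bucket(void)
{
	struct open_bucket *b = malloc(sizeof(*b));

	pthread_mutex_init(&b->lock, NULL);
	b->sectors_free = 1024;			/* pretend bucket size, in sectors */
	return b;
}

/*
 * Model of lock_and_refill_writepoint(): return the write point's bucket with
 * its lock held, allocating one if the write point is empty.  If we lose the
 * race to install a fresh bucket, discard ours and retry with the winner's.
 */
static struct open_bucket *lock_and_refill(struct write_point *wp)
{
	struct open_bucket *b;

	while (1) {
		b = atomic_load(&wp->b);
		if (b) {
			pthread_mutex_lock(&b->lock);
			if (atomic_load(&wp->b) == b)
				return b;		/* still current */
			pthread_mutex_unlock(&b->lock);	/* raced, retry */
		} else {
			struct open_bucket *expected = NULL;

			b = alloc_bucket();
			pthread_mutex_lock(&b->lock);
			if (atomic_compare_exchange_strong(&wp->b, &expected, b))
				return b;
			pthread_mutex_unlock(&b->lock);	/* lost the install race */
			pthread_mutex_destroy(&b->lock);
			free(b);
		}
	}
}

/*
 * Model of the bch_alloc_sectors() accounting: carve up to @want sectors out
 * of the current bucket, clearing the write point when the bucket runs dry so
 * the next writer refills it.  (The real code also pins the bucket, copies its
 * pointers into the key, and updates per-device sector counts; the model just
 * forgets the bucket.)
 */
static unsigned alloc_sectors(struct write_point *wp, unsigned want)
{
	struct open_bucket *b = lock_and_refill(wp);
	unsigned got = want < b->sectors_free ? want : b->sectors_free;

	b->sectors_free -= got;
	if (!b->sectors_free)
		atomic_store(&wp->b, NULL);
	pthread_mutex_unlock(&b->lock);
	return got;
}

int main(void)
{
	struct write_point wp = { .b = NULL };

	printf("got %u sectors\n", alloc_sectors(&wp, 200));
	printf("got %u sectors\n", alloc_sectors(&wp, 2000));
	return 0;
}

The second call gets only the 824 sectors left in the bucket and clears the write point, mirroring how the patch hands partially filled open buckets from one writer to the next instead of keeping an LRU of them per tier.
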
diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c index 6209e42c5633..cc6a09c1205c 100644 --- a/drivers/md/bcache/alloc.c +++ b/drivers/md/bcache/alloc.c @@ -746,12 +746,14 @@ int bch_bucket_alloc_set(struct cache_set *c, enum alloc_reserve reserve, long caches_used[BITS_TO_LONGS(MAX_CACHES_PER_SET)]; int i, ret; - mutex_lock(&c->bucket_lock); + BUG_ON(tier_idx > ARRAY_SIZE(c->cache_by_alloc)); BUG_ON(!n || n > BKEY_EXTENT_PTRS_MAX); bkey_init(k); memset(caches_used, 0, sizeof(caches_used)); + mutex_lock(&c->bucket_lock); + /* sort by free space/prio of oldest data in caches */ for (i = 0; i < n; i++) { @@ -850,8 +852,7 @@ static struct open_bucket *bch_open_bucket_get(struct cache_set *c, } static struct open_bucket *bch_open_bucket_alloc(struct cache_set *c, - enum alloc_reserve reserve, - int n, unsigned tier, + struct write_point *wp, struct closure *cl) { int ret; @@ -861,105 +862,74 @@ static struct open_bucket *bch_open_bucket_alloc(struct cache_set *c, if (IS_ERR_OR_NULL(b)) return b; - ret = bch_bucket_alloc_set(c, reserve, &b->key, n, tier, cl); - if (ret) { - BUG_ON(ret > 0); - bch_open_bucket_put(c, b); - b = ERR_PTR(ret); + if (wp->ca) { + long bucket; + + mutex_lock(&c->bucket_lock); + + bucket = bch_bucket_alloc(wp->ca, RESERVE_MOVINGGC, cl); + if (bucket < 0) { + ret = bucket; + mutex_unlock(&c->bucket_lock); + goto err; + } + + b->key.val[0] = PTR(wp->ca->bucket_gens[bucket], + bucket_to_sector(wp->ca->set, bucket), + wp->ca->sb.nr_this_dev); + bch_set_extent_ptrs(&b->key, 1); + + mutex_unlock(&c->bucket_lock); + } else if (wp->tier) { + ret = bch_bucket_alloc_set(c, RESERVE_NONE, &b->key, 1, + wp->tier - c->cache_by_alloc, cl); + if (ret) + goto err; + } else { + ret = bch_bucket_alloc_set(c, RESERVE_NONE, &b->key, + CACHE_SET_DATA_REPLICAS_WANT(&c->sb), + 0, cl); + if (ret) + goto err; } return b; +err: + bch_open_bucket_put(c, b); + return ERR_PTR(ret); } /* Sector allocator */ -/* - * We keep multiple buckets open for writes, and try to segregate different - * write streams for better cache utilization: first we look for a bucket where - * the last write to it was sequential with the current write, and failing that - * we look for a bucket that was last used by the same task. - * - * The ideas is if you've got multiple tasks pulling data into the cache at the - * same time, you'll get better cache utilization if you try to segregate their - * data and preserve locality. - * - * For example, say you've starting Firefox at the same time you're copying a - * bunch of files. Firefox will likely end up being fairly hot and stay in the - * cache awhile, but the data you copied might not be; if you wrote all that - * data to the same buckets it'd get invalidated at the same time. - * - * Both of those tasks will be doing fairly random IO so we can't rely on - * detecting sequential IO to segregate their data, but going off of the task - * should be a sane heuristic. 
- */ -static struct open_bucket *pick_data_bucket(struct cache_set *c, - const struct bkey *search, - unsigned write_point, - unsigned tier_idx, - struct closure *cl) - __releases(c->open_buckets_lock) - __acquires(c->open_buckets_lock) +static struct open_bucket *lock_and_refill_writepoint(struct cache_set *c, + struct write_point *wp, + struct closure *cl) { - struct cache_tier *tier = &c->cache_by_alloc[tier_idx]; struct open_bucket *b; - int i, wp = -1; -retry: - for (i = 0; - i < ARRAY_SIZE(tier->data_buckets) && - (b = tier->data_buckets[i]); i++) { - /* Tiering thread already writes keys in order, maximize - * write bandwidth instead */ - if (tier_idx == 0 && !bkey_cmp(&b->key, &START_KEY(search))) - goto found; - else if (b->last_write_point == write_point) - wp = i; - } - - i = wp; - if (i >= 0) - goto found; - - i = ARRAY_SIZE(tier->data_buckets) - 1; - if (tier->data_buckets[i]) - goto found; - spin_unlock(&c->open_buckets_lock); - b = bch_open_bucket_alloc(c, RESERVE_NONE, - CACHE_SET_DATA_REPLICAS_WANT(&c->sb), - tier_idx, cl); - spin_lock(&c->open_buckets_lock); + while (1) { + b = ACCESS_ONCE(wp->b); + if (b) { + spin_lock(&b->lock); + if (wp->b == b) + return b; - if (IS_ERR_OR_NULL(b)) - return b; + spin_unlock(&b->lock); + } else { + b = bch_open_bucket_alloc(c, wp, cl); + if (IS_ERR_OR_NULL(b)) + return b; - if (tier->data_buckets[i]) { - /* we raced - and we must unlock to call bch_bucket_free()... */ - spin_unlock(&c->open_buckets_lock); - bch_bucket_free_never_used(c, &b->key); - spin_lock(&c->open_buckets_lock); + spin_lock(&b->lock); + if (!race_fault() && + cmpxchg(&wp->b, NULL, b) == NULL) + return b; + spin_unlock(&b->lock); - __bch_open_bucket_put(c, b); - goto retry; - } else { - tier->data_buckets[i] = b; + bch_bucket_free_never_used(c, &b->key); + bch_open_bucket_put(c, b); + } } -found: - b = tier->data_buckets[i]; - - /* - * Move b to the end of the lru, and keep track of what - * this bucket was last used for: - */ - memmove(&tier->data_buckets[1], - &tier->data_buckets[0], - sizeof(struct open_bucket *) * i); - - tier->data_buckets[0] = b; - - b->last_write_point = write_point; - bkey_copy_key(&b->key, search); - - return b; } static void verify_not_stale(struct cache_set *c, struct bkey *k) @@ -993,34 +963,28 @@ static void verify_not_stale(struct cache_set *c, struct bkey *k) * @tier_idx - which tier this write is destined towards * @cl - closure to wait for a bucket */ -struct open_bucket *bch_alloc_sectors(struct cache_set *c, struct bkey *k, - unsigned write_point, unsigned tier_idx, - unsigned long *ptrs_to_write, +struct open_bucket *bch_alloc_sectors(struct cache_set *c, + struct write_point *wp, + struct bkey *k, struct closure *cl) { - struct cache_tier *tier = &c->cache_by_alloc[tier_idx]; struct open_bucket *b; unsigned i, sectors; - spin_lock(&c->open_buckets_lock); - - b = pick_data_bucket(c, k, write_point, tier_idx, cl); + b = lock_and_refill_writepoint(c, wp, cl); if (IS_ERR_OR_NULL(b)) - goto out; + return b; - BUG_ON(b != tier->data_buckets[0]); + BUG_ON(!b->sectors_free); verify_not_stale(c, &b->key); /* Set up the pointer to the space we're allocating: */ + memcpy(&k->val[bch_extent_ptrs(k)], + &b->key.val[0], + bch_extent_ptrs(&b->key) * sizeof(u64)); - for (i = 0; i < bch_extent_ptrs(&b->key); i++) { - unsigned ptrs = bch_extent_ptrs(k); - - k->val[ptrs] = b->key.val[i]; - __set_bit(ptrs, ptrs_to_write); - bch_set_extent_ptrs(k, ptrs + 1); - } + bch_set_extent_ptrs(k, bch_extent_ptrs(k) + bch_extent_ptrs(&b->key)); sectors = 
min_t(unsigned, KEY_SIZE(k), b->sectors_free); @@ -1030,6 +994,10 @@ struct open_bucket *bch_alloc_sectors(struct cache_set *c, struct bkey *k, /* update open bucket for next time: */ b->sectors_free -= sectors; + if (b->sectors_free) + atomic_inc(&b->pin); + else + BUG_ON(xchg(&wp->b, NULL) != b); rcu_read_lock(); for (i = 0; i < bch_extent_ptrs(&b->key); i++) { @@ -1044,127 +1012,11 @@ struct open_bucket *bch_alloc_sectors(struct cache_set *c, struct bkey *k, } rcu_read_unlock(); - /* - * k takes refcounts on the buckets it points to until it's inserted - * into the btree, but if we're done with this bucket we just transfer - * get_data_bucket()'s refcount. - */ - - if (b->sectors_free) { - atomic_inc(&b->pin); - } else { - memmove(&tier->data_buckets[0], - &tier->data_buckets[1], - sizeof(struct open_bucket *) * - (ARRAY_SIZE(tier->data_buckets) - 1)); - tier->data_buckets[ARRAY_SIZE(tier->data_buckets) - 1] = NULL; - } -out: - spin_unlock(&c->open_buckets_lock); + spin_unlock(&b->lock); return b; } -struct open_bucket *bch_gc_alloc_sectors(struct cache_set *c, struct bkey *k, - unsigned long *ptrs_to_write, - struct closure *cl) -{ - unsigned i, gen, sectors = KEY_SIZE(k); - struct cache *ca; - struct open_bucket *b; - long bucket; - - mutex_lock(&c->bucket_lock); -retry: - /* Check if we raced with a foreground write */ - - rcu_read_lock(); - for (i = 0; i < bch_extent_ptrs(k); i++) - if ((ca = PTR_CACHE(c, k, i)) && - (gen = PTR_BUCKET(c, ca, k, i)->copygc_gen)) { - gen--; - percpu_ref_get(&ca->ref); - rcu_read_unlock(); - goto found; - } - rcu_read_unlock(); - mutex_unlock(&c->bucket_lock); - return ERR_PTR(-ESRCH); -found: - b = ca->gc_buckets[gen]; - if (!b) { - mutex_unlock(&c->bucket_lock); - - b = bch_open_bucket_get(c, NULL); - if (WARN_ONCE(IS_ERR(b), - "bcache: movinggc bucket allocation failed with %ld", - PTR_ERR(b))) { - b = ERR_PTR(-ENOSPC); - goto out_put; - } - - mutex_lock(&c->bucket_lock); - - bucket = bch_bucket_alloc(ca, RESERVE_MOVINGGC, NULL); - if (WARN_ONCE(bucket < 0, - "bcache: movinggc bucket allocation failed with %ld", - bucket)) { - mutex_unlock(&c->bucket_lock); - bch_open_bucket_put(c, b); - b = ERR_PTR(-ENOSPC); - goto out_put; - } - - b->key.val[0] = PTR(ca->bucket_gens[bucket], - bucket_to_sector(ca->set, bucket), - ca->sb.nr_this_dev); - bch_set_extent_ptrs(&b->key, 1); - - /* we dropped bucket_lock, might've raced */ - if (ca->gc_buckets[gen] || race_fault()) { - /* we raced */ - mutex_unlock(&c->bucket_lock); - bch_bucket_free_never_used(c, &b->key); - bch_open_bucket_put(c, b); - mutex_lock(&c->bucket_lock); - } else { - ca->gc_buckets[gen] = b; - } - - /* - * GC_GEN() might also have been reset... 
don't strictly need to - * recheck though - */ - percpu_ref_put(&ca->ref); - goto retry; - } - - verify_not_stale(c, &b->key); - - k->val[i] = b->key.val[0]; - __set_bit(i, ptrs_to_write); - - sectors = min_t(unsigned, sectors, b->sectors_free); - - SET_KEY_OFFSET(k, KEY_START(k) + sectors); - SET_KEY_SIZE(k, sectors); - - /* update open bucket for next time: */ - - b->sectors_free -= sectors; - if (b->sectors_free) { - SET_PTR_OFFSET(&b->key, 0, PTR_OFFSET(&b->key, 0) + sectors); - atomic_inc(&b->pin); - } else - ca->gc_buckets[gen] = NULL; - - atomic_long_add(sectors, &ca->sectors_written); - mutex_unlock(&c->bucket_lock); -out_put: - percpu_ref_put(&ca->ref); - return b; -} - void bch_mark_open_buckets(struct cache_set *c) { struct cache *ca; @@ -1214,9 +1066,13 @@ void bch_open_buckets_init(struct cache_set *c) spin_lock_init(&c->open_buckets_lock); for (i = 0; i < ARRAY_SIZE(c->open_buckets); i++) { + spin_lock_init(&c->open_buckets[i].lock); c->open_buckets_nr_free++; list_add(&c->open_buckets[i].list, &c->open_buckets_free); } + + for (i = 0; i < ARRAY_SIZE(c->cache_by_alloc); i++) + c->cache_by_alloc[i].wp.tier = &c->cache_by_alloc[i]; } int bch_cache_allocator_start(struct cache *ca) diff --git a/drivers/md/bcache/alloc.h b/drivers/md/bcache/alloc.h index adbe0177937a..c4dff7d53e5c 100644 --- a/drivers/md/bcache/alloc.h +++ b/drivers/md/bcache/alloc.h @@ -48,11 +48,8 @@ int bch_bucket_alloc_set(struct cache_set *, enum alloc_reserve, struct bkey *, void bch_open_bucket_put(struct cache_set *, struct open_bucket *); -struct open_bucket *bch_alloc_sectors(struct cache_set *, struct bkey *, - unsigned, unsigned, - unsigned long *, struct closure *); -struct open_bucket *bch_gc_alloc_sectors(struct cache_set *, struct bkey *, - unsigned long *, struct closure *); +struct open_bucket *bch_alloc_sectors(struct cache_set *, struct write_point *, + struct bkey *, struct closure *); void bch_mark_open_buckets(struct cache_set *); diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index 03e316d51b76..4c741260dd6e 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -463,7 +463,20 @@ struct cached_dev { * and one for moving GC */ enum alloc_reserve { RESERVE_PRIO = BTREE_ID_NR, + /* + * free_inc.size buckets are set aside for moving GC btree node + * allocations. This means that if moving GC runs out of new buckets for + * btree nodes, it will have put back at least free_inc.size buckets + * back on free_inc, preventing a deadlock. + * + * XXX: figure out a less stupid way of achieving this + */ RESERVE_MOVINGGC_BTREE, + /* + * Tiering needs a btree node reserve because of how + * btree_check_reserve() works -- if the cache tier is full, we don't + * want tiering to block forever. 
+ */ RESERVE_TIERING_BTREE, RESERVE_METADATA_LAST = RESERVE_TIERING_BTREE, RESERVE_MOVINGGC, @@ -481,14 +494,39 @@ enum alloc_reserve { /* Enough for 16 cache devices, 2 tiers and some left over for pipelining */ #define OPEN_BUCKETS_COUNT 256 +#define WRITE_POINT_COUNT 16 + struct open_bucket { struct list_head list; + spinlock_t lock; atomic_t pin; - unsigned last_write_point; unsigned sectors_free; BKEY_PADDED(key); }; +struct write_point { + struct open_bucket *b; + + /* + * If not NULL, refill from that device (this write point is a member of + * that struct cache) + * + * If NULL, do a normal replicated bucket allocation + */ + struct cache *ca; + + /* + * If not NULL, tier specific writepoint used by tiering/promotion - + * always allocates a single replica + */ + struct cache_tier *tier; + + /* + * Otherwise do a normal replicated bucket allocation that could come + * from any tier (foreground write) + */ +}; + struct bucket_stats { u64 buckets_dirty; u64 buckets_cached; @@ -583,7 +621,7 @@ struct cache { * Protected by bucket_lock. */ #define NUM_GC_GENS 7 - struct open_bucket *gc_buckets[NUM_GC_GENS]; + struct write_point gc_buckets[NUM_GC_GENS]; struct journal_device journal; @@ -623,12 +661,15 @@ struct gc_stat { #define CACHE_SET_STOPPING 1 #define CACHE_SET_RUNNING 2 -#define TIER_OPEN_BUCKETS_COUNT 16 - struct cache_tier { unsigned nr_devices; struct cache *devices[MAX_CACHES_PER_SET]; - struct open_bucket *data_buckets[TIER_OPEN_BUCKETS_COUNT]; + + /* + * writepoint specific to this tier, for cache promote/background + * tiering + */ + struct write_point wp; }; struct prio_clock { @@ -735,6 +776,8 @@ struct cache_set { spinlock_t open_buckets_lock; struct open_bucket open_buckets[OPEN_BUCKETS_COUNT]; + struct write_point write_points[WRITE_POINT_COUNT]; + /* GARBAGE COLLECTION */ struct task_struct *gc_thread; @@ -1087,7 +1130,7 @@ void bch_bbio_prep(struct bbio *, struct cache *); void bch_submit_bbio(struct bbio *, struct cache *, struct bkey *, unsigned, bool); void bch_submit_bbio_replicas(struct bio *, struct cache_set *, - struct bkey *, unsigned long *, bool); + struct bkey *, unsigned, bool); void bch_bbio_reset(struct bbio *bio); __printf(2, 3) diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index e4a4a24f2a3b..99a61145cc67 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -456,7 +456,6 @@ static void do_btree_node_write(struct btree *b) { struct closure *cl = &b->io; struct bset *i = btree_bset_last(b); - unsigned long ptrs_to_write[BITS_TO_LONGS(MAX_CACHES_PER_SET)]; BKEY_PADDED(key) k; int n; @@ -478,8 +477,6 @@ static void do_btree_node_write(struct btree *b) bio_set_op_attrs(b->bio, REQ_OP_WRITE, REQ_META|WRITE_SYNC|REQ_FUA); bch_bio_map(b->bio, i); - memset(ptrs_to_write, 0xFF, sizeof(ptrs_to_write)); - /* * If we're appending to a leaf node, we don't technically need FUA - * this write just needs to be persisted before the next journal write, @@ -509,8 +506,7 @@ static void do_btree_node_write(struct btree *b) memcpy(page_address(bv->bv_page), base + j * PAGE_SIZE, PAGE_SIZE); - bch_submit_bbio_replicas(b->bio, b->c, &k.key, - ptrs_to_write, true); + bch_submit_bbio_replicas(b->bio, b->c, &k.key, 0, true); continue_at(cl, btree_node_write_done, NULL); } else { trace_bcache_btree_bounce_write_fail(b); @@ -518,8 +514,7 @@ static void do_btree_node_write(struct btree *b) b->bio->bi_vcnt = 0; bch_bio_map(b->bio, i); - bch_submit_bbio_replicas(b->bio, b->c, &k.key, - ptrs_to_write, true); + 
bch_submit_bbio_replicas(b->bio, b->c, &k.key, 0, true); closure_sync(cl); continue_at_nobarrier(cl, __btree_node_write_done, NULL); diff --git a/drivers/md/bcache/io.c b/drivers/md/bcache/io.c index 697cb295b8e8..d565ad1f496d 100644 --- a/drivers/md/bcache/io.c +++ b/drivers/md/bcache/io.c @@ -94,17 +94,14 @@ void bch_submit_bbio(struct bbio *b, struct cache *ca, } void bch_submit_bbio_replicas(struct bio *bio, struct cache_set *c, - struct bkey *k, unsigned long *ptrs_to_write, - bool punt) + struct bkey *k, unsigned ptrs_from, bool punt) { struct cache *ca; - unsigned ptr, next, nr_ptrs = bch_extent_ptrs(k); - - for (ptr = find_first_bit(ptrs_to_write, nr_ptrs); - ptr != nr_ptrs; - ptr = next) { - next = find_next_bit(ptrs_to_write, nr_ptrs, ptr + 1); + unsigned ptr; + for (ptr = ptrs_from; + ptr < bch_extent_ptrs(k); + ptr++) { rcu_read_lock(); ca = PTR_CACHE(c, k, ptr); if (ca) @@ -116,7 +113,7 @@ void bch_submit_bbio_replicas(struct bio *bio, struct cache_set *c, break; } - if (next != nr_ptrs) { + if (ptr + 1 < bch_extent_ptrs(k)) { struct bio *n = bio_clone_fast(bio, GFP_NOIO, ca->replica_set); n->bi_end_io = bio->bi_end_io; diff --git a/drivers/md/bcache/movinggc.c b/drivers/md/bcache/movinggc.c index 998646c57452..91956f0e6e6b 100644 --- a/drivers/md/bcache/movinggc.c +++ b/drivers/md/bcache/movinggc.c @@ -71,6 +71,8 @@ static void read_moving(struct cache *ca, struct moving_io_stats *stats) struct keybuf_key *w; struct moving_io *io; struct closure cl; + struct write_point *wp; + unsigned ptr, gen; closure_init_stack(&cl); bch_ratelimit_reset(&ca->moving_gc_pd.rate); @@ -85,6 +87,19 @@ static void read_moving(struct cache *ca, struct moving_io_stats *stats) if (!w) break; + for (ptr = 0; ptr < bch_extent_ptrs(&w->key); ptr++) + if ((ca->sb.nr_this_dev == PTR_DEV(&w->key, ptr)) && + (gen = PTR_BUCKET(c, ca, &w->key, + ptr)->copygc_gen)) { + gen--; + BUG_ON(gen > ARRAY_SIZE(ca->gc_buckets)); + wp = &ca->gc_buckets[gen]; + goto found; + } + + bch_keybuf_put(&ca->moving_gc_keys, w); + continue; +found: io = kzalloc(sizeof(struct moving_io) + sizeof(struct bio_vec) * DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS), GFP_KERNEL); @@ -98,11 +113,13 @@ static void read_moving(struct cache *ca, struct moving_io_stats *stats) io->keybuf = &ca->moving_gc_keys; io->stats = stats; - bch_data_insert_op_init(&io->op, c, &io->bio.bio, 0, + bch_data_insert_op_init(&io->op, c, &io->bio.bio, wp, false, false, false, &io->w->key, &io->w->key); io->op.io_wq = ca->moving_gc_write; - io->op.moving_gc = true; + io->op.btree_alloc_reserve = RESERVE_MOVINGGC_BTREE; + + bch_extent_drop_ptr(&io->op.insert_key, ptr); trace_bcache_gc_copy(&w->key); diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index 08f82223c0ef..4f62ff07afa1 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -103,31 +103,6 @@ static void bio_csum(struct bio *bio, struct bkey *k) /* Insert data into cache */ -static enum alloc_reserve bch_btree_reserve(struct data_insert_op *op) -{ - if (op->moving_gc) { - /* - * free_inc.size buckets are set aside for moving GC - * btree node allocations. This means that if moving GC - * runs out of new buckets for btree nodes, it will have - * put back at least free_inc.size buckets back on - * free_inc, preventing a deadlock. 
- * - * XXX: figure out a less stupid way of achieving this - */ - return RESERVE_MOVINGGC_BTREE; - } else if (op->tiering) { - /* - * Tiering needs a btree node reserve because of how - * btree_check_reserve() works -- if the cache tier is - * full, we don't want tiering to block forever. - */ - return RESERVE_TIERING_BTREE; - } - - return BTREE_ID_EXTENTS; -} - static int btree_insert_fn(struct btree_op *b_op, struct btree *b) { struct data_insert_op *op = container_of(b_op, @@ -189,10 +164,8 @@ static void bch_data_insert_keys(struct closure *cl) { struct data_insert_op *op = container_of(cl, struct data_insert_op, cl); enum btree_id id = BTREE_ID_EXTENTS; - enum alloc_reserve reserve; - reserve = bch_btree_reserve(op); - __bch_btree_op_init(&op->op, id, reserve, 0); + __bch_btree_op_init(&op->op, id, op->btree_alloc_reserve, 0); closure_call(&op->op.cl, __bch_data_insert_keys, NULL, cl); continue_at(cl, bch_data_insert_keys_done, op->c->wq); @@ -288,14 +261,16 @@ static void bch_data_insert_endio(struct bio *bio) static void bch_data_insert_start(struct closure *cl) { struct data_insert_op *op = container_of(cl, struct data_insert_op, cl); - unsigned long ptrs_to_write[BITS_TO_LONGS(MAX_CACHES_PER_SET)]; struct bio *bio = op->bio, *n; - unsigned open_bucket_nr = 0; + unsigned open_bucket_nr = 0, ptrs_from; struct open_bucket *b; if (op->discard) return bch_data_invalidate(cl); + bch_extent_drop_stale(op->c, &op->insert_key); + ptrs_from = bch_extent_ptrs(&op->insert_key); + /* * Journal writes are marked REQ_PREFLUSH; if the original write was a * flush, it'll wait on the journal write. @@ -321,15 +296,9 @@ static void bch_data_insert_start(struct closure *cl) k = op->insert_keys.top; bkey_copy(k, &op->insert_key); - bch_extent_drop_stale(op->c, k); - memset(ptrs_to_write, 0, sizeof(ptrs_to_write)); - - b = op->moving_gc - ? bch_gc_alloc_sectors(op->c, k, ptrs_to_write, cl) - : bch_alloc_sectors(op->c, k, op->write_point, op->tier, - ptrs_to_write, - op->wait ? cl : NULL); + b = bch_alloc_sectors(op->c, op->wp, k, op->wait ? cl : NULL); BUG_ON(!b); + if (PTR_ERR(b) == -EAGAIN) { /* If we already have some keys, must insert them first * before allocating another open bucket. 
We only hit @@ -357,7 +326,7 @@ static void bch_data_insert_start(struct closure *cl) trace_bcache_cache_insert(k); bio_set_op_attrs(n, REQ_OP_WRITE, 0); - bch_submit_bbio_replicas(n, op->c, k, ptrs_to_write, false); + bch_submit_bbio_replicas(n, op->c, k, ptrs_from, false); bch_extent_normalize(op->c, k); bch_keylist_push(&op->insert_keys); @@ -458,7 +427,7 @@ void bch_data_insert(struct closure *cl) &start, &end); } - if (op->moving_gc) + if (op->wp->ca) bch_mark_gc_write(c, bio_sectors(op->bio)); else if (!op->discard) bch_mark_foreground_write(c, bio_sectors(op->bio)); @@ -479,6 +448,44 @@ void bch_data_insert(struct closure *cl) continue_at_nobarrier(cl, bch_data_insert_start, NULL); } +void bch_data_insert_op_init(struct data_insert_op *op, + struct cache_set *c, + struct bio *bio, + struct write_point *wp, + bool wait, bool discard, bool flush, + struct bkey *insert_key, + struct bkey *replace_key) +{ + if (!wp) { + unsigned wp_idx = hash_long((unsigned long) current, + ilog2(ARRAY_SIZE(c->write_points))); + + BUG_ON(wp_idx > ARRAY_SIZE(c->write_points)); + wp = &c->write_points[wp_idx]; + } + + op->c = c; + op->io_wq = NULL; + op->bio = bio; + op->error = 0; + op->flags = 0; + op->wait = wait; + op->discard = discard; + op->flush = flush; + op->wp = wp; + op->btree_alloc_reserve = BTREE_ID_EXTENTS; + + memset(op->open_buckets, 0, sizeof(op->open_buckets)); + bch_keylist_init(&op->insert_keys); + bkey_copy(&op->insert_key, insert_key); + + if (replace_key) { + op->replace = true; + bkey_copy(&op->replace_key, replace_key); + } +} +EXPORT_SYMBOL(bch_data_insert_op_init); + /* Cache promotion on read */ struct cache_promote_op { @@ -597,12 +604,9 @@ static void __cache_promote(struct cache_set *c, struct bbio *orig_bio, op->orig_bio = &orig_bio->bio; op->stale = 0; - bch_data_insert_op_init(&op->iop, c, - bio, - hash_long((unsigned long) current, 16), - false, - false, - false, + bch_data_insert_op_init(&op->iop, c, bio, + &c->cache_by_alloc[0].wp, + false, false, false, replace_key, replace_key); @@ -1403,8 +1407,7 @@ static void cached_dev_write(struct cached_dev *dc, struct search *s) closure_bio_submit(bio, cl); } - bch_data_insert_op_init(&s->iop, dc->disk.c, insert_bio, - hash_long((unsigned long) current, 16), + bch_data_insert_op_init(&s->iop, dc->disk.c, insert_bio, NULL, !KEY_CACHED(&insert_key), bypass, bio->bi_opf & (REQ_PREFLUSH|REQ_FUA), &insert_key, NULL); @@ -1554,8 +1557,7 @@ static void __flash_dev_make_request(struct request_queue *q, struct bio *bio) s = search_alloc(bio, d); bio = &s->bio.bio; - bch_data_insert_op_init(&s->iop, d->c, bio, - hash_long((unsigned long) current, 16), + bch_data_insert_op_init(&s->iop, d->c, bio, NULL, true, bio_op(bio) == REQ_OP_DISCARD, bio->bi_opf & (REQ_PREFLUSH|REQ_FUA), diff --git a/drivers/md/bcache/request.h b/drivers/md/bcache/request.h index a4467456fe72..386f452f6951 100644 --- a/drivers/md/bcache/request.h +++ b/drivers/md/bcache/request.h @@ -17,11 +17,10 @@ struct data_insert_op { /* Used internally, do not touch */ struct btree_op op; - uint16_t write_point; short error; union { - uint16_t flags; + u8 flags; struct { /* Wait for data bucket allocation or just @@ -33,20 +32,17 @@ struct data_insert_op { unsigned flush:1; /* Perform a compare-exchange with replace_key? */ unsigned replace:1; - /* Tier to write to */ - unsigned tier:2; - /* Use moving GC reserves for buckets, btree nodes and - * open buckets? */ - unsigned moving_gc:1; - /* Use tiering reserves for btree nodes? 
*/ - unsigned tiering:1; - /* Set on completion */ + + /* Set on completion, if cmpxchg index update failed */ unsigned replace_collision:1; /* Internal */ unsigned insert_data_done:1; }; }; + u8 btree_alloc_reserve; + + struct write_point *wp; struct open_bucket *open_buckets[2]; struct keylist insert_keys; @@ -54,33 +50,9 @@ struct data_insert_op { BKEY_PADDED(replace_key); }; -static inline void bch_data_insert_op_init(struct data_insert_op *op, - struct cache_set *c, - struct bio *bio, - unsigned write_point, - bool wait, bool discard, bool flush, - struct bkey *insert_key, - struct bkey *replace_key) -{ - op->c = c; - op->io_wq = NULL; - op->bio = bio; - op->write_point = write_point; - op->error = 0; - op->flags = 0; - op->wait = wait; - op->discard = discard; - op->flush = flush; - - memset(op->open_buckets, 0, sizeof(op->open_buckets)); - bch_keylist_init(&op->insert_keys); - bkey_copy(&op->insert_key, insert_key); - - if (replace_key) { - op->replace = true; - bkey_copy(&op->replace_key, replace_key); - } -} +void bch_data_insert_op_init(struct data_insert_op *, struct cache_set *, + struct bio *, struct write_point *, bool, + bool, bool, struct bkey *, struct bkey *); unsigned bch_get_congested(struct cache_set *); int bch_read(struct cache_set *, struct bio *, u64); diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 9023d9feb1fc..58441478d8bd 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -2192,6 +2192,9 @@ static int cache_init(struct cache *ca) total_reserve += ca->free[i].size; pr_debug("%zu buckets reserved", total_reserve); + for (i = 0; i < ARRAY_SIZE(ca->gc_buckets); i++) + ca->gc_buckets[i].ca = ca; + mutex_init(&ca->heap_lock); init_waitqueue_head(&ca->fifo_wait); bch_moving_init_cache(ca); diff --git a/drivers/md/bcache/tier.c b/drivers/md/bcache/tier.c index 44663744ddf6..49b213d20a1f 100644 --- a/drivers/md/bcache/tier.c +++ b/drivers/md/bcache/tier.c @@ -72,15 +72,12 @@ static void read_tiering(struct cache_set *c) struct moving_io *io; struct closure cl; struct moving_io_stats stats; - unsigned write_point; trace_bcache_tiering_start(c); closure_init_stack(&cl); memset(&stats, 0, sizeof(stats)); - write_point = 0; - /* XXX: if we error, background writeback could stall indefinitely */ c->tiering_keys.last_scanned = ZERO_KEY; @@ -106,11 +103,11 @@ static void read_tiering(struct cache_set *c) io->stats = &stats; bch_data_insert_op_init(&io->op, c, &io->bio.bio, - write_point, true, false, false, + &c->cache_by_alloc[1].wp, + true, false, false, &io->w->key, &io->w->key); io->op.io_wq = c->tiering_write; - io->op.tiering = 1; - io->op.tier = 1; + io->op.btree_alloc_reserve = RESERVE_TIERING_BTREE; trace_bcache_tiering_copy(&w->key); @@ -118,12 +115,6 @@ static void read_tiering(struct cache_set *c) KEY_SIZE(&w->key) << 9); closure_call(&io->cl, bch_data_move, NULL, &cl); - - /* Try to stripe writes across cache devices by sending them - * to different open buckets */ - write_point++; - if (write_point == c->sb.nr_in_set) - write_point = 0; } closure_sync(&cl); |