-rw-r--r--  drivers/md/bcache/bcache.h       |  17
-rw-r--r--  drivers/md/bcache/btree.c        |   1
-rw-r--r--  drivers/md/bcache/debug.c        |   1
-rw-r--r--  drivers/md/bcache/io.c           | 748
-rw-r--r--  drivers/md/bcache/io.h           |  73
-rw-r--r--  drivers/md/bcache/journal.c      |   1
-rw-r--r--  drivers/md/bcache/move.c         |   1
-rw-r--r--  drivers/md/bcache/movinggc.c     |   1
-rw-r--r--  drivers/md/bcache/request.c      | 779
-rw-r--r--  drivers/md/bcache/request.h      |  50
-rw-r--r--  drivers/md/bcache/stats.h        |  89
-rw-r--r--  drivers/md/bcache/stats_types.h  |  56
-rw-r--r--  drivers/md/bcache/super.c        |   1
-rw-r--r--  drivers/md/bcache/tier.c         |   1
-rw-r--r--  drivers/md/bcache/writeback.c    |   1
15 files changed, 924 insertions, 896 deletions
diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index 81c5544e84ee..8b27e2471d0c 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -238,7 +238,7 @@ struct bucket {
u8 copygc_gen;
};
-#include "stats.h"
+#include "stats_types.h"
#include "inode.h"
struct search;
struct btree;
@@ -1172,21 +1172,6 @@ static inline void bch_check_mark_super(struct cache_set *c,
/* Forward declarations */
-void bch_count_io_errors(struct cache *, int, const char *);
-void bch_bbio_count_io_errors(struct bbio *, int, const char *);
-void bch_bbio_endio(struct bbio *, int, const char *);
-void bch_bbio_free(struct bio *, struct cache_set *);
-struct bio *bch_bbio_alloc(struct cache_set *);
-
-void bch_generic_make_request(struct bio *, struct cache_set *);
-void bch_bio_submit_work(struct work_struct *);
-void bch_bbio_prep(struct bbio *, struct cache *);
-void bch_submit_bbio(struct bbio *, struct cache *, struct bkey *,
- unsigned, bool);
-void bch_submit_bbio_replicas(struct bio *, struct cache_set *,
- struct bkey *, unsigned, bool);
-void bch_bbio_reset(struct bbio *bio);
-
__printf(2, 3)
bool bch_cache_set_error(struct cache_set *, const char *, ...);
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index cacd6c4b7ac4..1e01a5e77b26 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -26,6 +26,7 @@
#include "buckets.h"
#include "debug.h"
#include "extents.h"
+#include "io.h"
#include "journal.h"
#include "movinggc.h"
#include "writeback.h"
diff --git a/drivers/md/bcache/debug.c b/drivers/md/bcache/debug.c
index c2b81bebe3df..eb964573095f 100644
--- a/drivers/md/bcache/debug.c
+++ b/drivers/md/bcache/debug.c
@@ -9,6 +9,7 @@
#include "btree.h"
#include "debug.h"
#include "extents.h"
+#include "io.h"
#include "keybuf.h"
#include <linux/console.h>
diff --git a/drivers/md/bcache/io.c b/drivers/md/bcache/io.c
index d565ad1f496d..c4f6d1ec984c 100644
--- a/drivers/md/bcache/io.c
+++ b/drivers/md/bcache/io.c
@@ -6,13 +6,19 @@
*/
#include "bcache.h"
+#include "alloc.h"
#include "bset.h"
-#include "debug.h"
#include "btree.h"
+#include "debug.h"
#include "extents.h"
+#include "io.h"
+#include "keybuf.h"
+#include "stats.h"
#include <linux/blkdev.h>
+#include <trace/events/bcache.h>
+
void bch_generic_make_request(struct bio *bio, struct cache_set *c)
{
if (current->bio_list) {
@@ -234,3 +240,743 @@ void bch_bbio_endio(struct bbio *bio, int error, const char *m)
percpu_ref_put(&ca->ref);
closure_put(cl);
}
+
+/* */
+
+static void bch_data_insert_start(struct closure *);
+
+static void bio_csum(struct bio *bio, struct bkey *k)
+{
+ struct bio_vec bv;
+ struct bvec_iter iter;
+ u64 crc = 0xffffffffffffffffULL;
+
+ bio_for_each_segment(bv, bio, iter) {
+ void *d = kmap(bv.bv_page) + bv.bv_offset;
+
+ crc = bch_checksum_update(KEY_CSUM(k), crc, d, bv.bv_len);
+ kunmap(bv.bv_page);
+ }
+
+ k->val[bch_extent_ptrs(k)] = crc;
+}
+
+/* Writes */
+
+static int btree_insert_fn(struct btree_op *b_op, struct btree *b)
+{
+ struct data_insert_op *op = container_of(b_op,
+ struct data_insert_op, op);
+ struct bkey *replace_key = op->replace ? &op->replace_key : NULL;
+
+ int ret = bch_btree_insert_node(b, &op->op, &op->insert_keys,
+ replace_key,
+ op->flush ? &op->cl : NULL);
+ return bch_keylist_empty(&op->insert_keys) ? MAP_DONE : ret;
+}
+
+static void bch_data_insert_keys_done(struct closure *cl)
+{
+ struct data_insert_op *op = container_of(cl, struct data_insert_op, cl);
+ unsigned i;
+
+ if (op->op.insert_collision)
+ op->replace_collision = true;
+
+ for (i = 0; i < ARRAY_SIZE(op->open_buckets); i++)
+ if (op->open_buckets[i]) {
+ bch_open_bucket_put(op->c, op->open_buckets[i]);
+ op->open_buckets[i] = NULL;
+ }
+
+ if (!op->insert_data_done)
+ continue_at(cl, bch_data_insert_start, op->io_wq);
+
+ bch_keylist_free(&op->insert_keys);
+ closure_return(cl);
+}
+
+static void __bch_data_insert_keys(struct closure *cl)
+{
+ struct data_insert_op *op = container_of(cl, struct data_insert_op,
+ op.cl);
+ struct keylist *keys = &op->insert_keys;
+ int ret = 0;
+
+ while (!ret && !bch_keylist_empty(keys)) {
+ op->op.locks_want = 0;
+ ret = bch_btree_map_nodes(&op->op, op->c,
+ &START_KEY(keys->keys),
+ btree_insert_fn,
+ MAP_ASYNC);
+ }
+
+ if (ret == -EAGAIN)
+ continue_at(cl, __bch_data_insert_keys, op->c->wq);
+
+ closure_return(cl);
+}
+
+/**
+ * bch_data_insert_keys - insert extent btree keys for a write
+ */
+static void bch_data_insert_keys(struct closure *cl)
+{
+ struct data_insert_op *op = container_of(cl, struct data_insert_op, cl);
+ enum btree_id id = BTREE_ID_EXTENTS;
+
+ __bch_btree_op_init(&op->op, id, op->btree_alloc_reserve, 0);
+
+ closure_call(&op->op.cl, __bch_data_insert_keys, NULL, cl);
+ continue_at(cl, bch_data_insert_keys_done, op->c->wq);
+}
+
+/**
+ * bch_data_invalidate - discard range of keys
+ *
+ * Used to implement discard, and to handle the case where a writethrough
+ * write hits a write error on the cache device.
+ */
+static void bch_data_invalidate(struct closure *cl)
+{
+ struct data_insert_op *op = container_of(cl, struct data_insert_op, cl);
+ struct keylist *keys = &op->insert_keys;
+ struct bio *bio = op->bio;
+
+ pr_debug("invalidating %i sectors from %llu",
+ bio_sectors(bio), (u64) bio->bi_iter.bi_sector);
+
+ while (bio_sectors(bio)) {
+ unsigned sectors = min(bio_sectors(bio),
+ 1U << (KEY_SIZE_BITS - 1));
+
+ if (bch_keylist_realloc(keys, BKEY_U64s))
+ goto out;
+
+ bio->bi_iter.bi_sector += sectors;
+ bio->bi_iter.bi_size -= sectors << 9;
+
+ *keys->top = KEY(KEY_INODE(&op->insert_key),
+ bio->bi_iter.bi_sector, sectors);
+ SET_KEY_DELETED(keys->top, true);
+
+ bch_keylist_push(keys);
+ }
+
+ op->insert_data_done = true;
+ bio_put(bio);
+out:
+ continue_at(cl, bch_data_insert_keys, op->c->wq);
+}
+
+static void bch_data_insert_error(struct closure *cl)
+{
+ struct data_insert_op *op = container_of(cl, struct data_insert_op, cl);
+
+ /*
+ * Our data write just errored, which means we've got a bunch of keys to
+ * insert that point to data that wasn't successfully written.
+ *
+ * We don't have to insert those keys but we still have to invalidate
+ * that region of the cache - so, if we just strip off all the pointers
+ * from the keys we'll accomplish just that.
+ */
+
+ struct bkey *src = op->insert_keys.keys, *dst = op->insert_keys.keys;
+
+ while (src != op->insert_keys.top) {
+ struct bkey *n = bkey_next(src);
+
+ bch_set_extent_ptrs(src, 0);
+ memmove(dst, src, bkey_bytes(src));
+
+ dst = bkey_next(dst);
+ src = n;
+ }
+
+ op->insert_keys.top = dst;
+
+ bch_data_insert_keys(cl);
+}
+
+static void bch_data_insert_endio(struct bio *bio)
+{
+ struct closure *cl = bio->bi_private;
+ struct data_insert_op *op = container_of(cl, struct data_insert_op, cl);
+
+ if (bio->bi_error) {
+ /* TODO: We could try to recover from this. */
+ if (!KEY_CACHED(&op->insert_key))
+ op->error = bio->bi_error;
+ else if (!op->replace)
+ set_closure_fn(cl, bch_data_insert_error,
+ op->c->wq);
+ else
+ set_closure_fn(cl, NULL, NULL);
+ }
+
+ bch_bbio_endio(to_bbio(bio), bio->bi_error, "writing data to cache");
+}
+
+static void bch_data_insert_start(struct closure *cl)
+{
+ struct data_insert_op *op = container_of(cl, struct data_insert_op, cl);
+ struct bio *bio = op->bio, *n;
+ unsigned open_bucket_nr = 0, ptrs_from;
+ struct open_bucket *b;
+
+ if (op->discard)
+ return bch_data_invalidate(cl);
+
+ bch_extent_drop_stale(op->c, &op->insert_key);
+ ptrs_from = bch_extent_ptrs(&op->insert_key);
+
+ /*
+ * Journal writes are marked REQ_PREFLUSH; if the original write was a
+ * flush, it'll wait on the journal write.
+ */
+ bio->bi_opf &= ~(REQ_PREFLUSH|REQ_FUA);
+
+ do {
+ struct bkey *k;
+ struct bio_set *split = op->c->bio_split;
+
+ BUG_ON(bio_sectors(bio) != KEY_SIZE(&op->insert_key));
+
+ if (open_bucket_nr == ARRAY_SIZE(op->open_buckets))
+ continue_at(cl, bch_data_insert_keys,
+ op->c->wq);
+
+ /* for the device pointers and 1 for the chksum */
+ if (bch_keylist_realloc(&op->insert_keys,
+ BKEY_EXTENT_MAX_U64s +
+ (KEY_CSUM(&op->insert_key) ? 1 : 0)))
+ continue_at(cl, bch_data_insert_keys, op->c->wq);
+
+ k = op->insert_keys.top;
+ bkey_copy(k, &op->insert_key);
+
+ b = bch_alloc_sectors(op->c, op->wp, k, op->wait ? cl : NULL);
+ BUG_ON(!b);
+
+ if (PTR_ERR(b) == -EAGAIN) {
+ /* If we already have some keys, must insert them first
+ * before allocating another open bucket. We only hit
+ * this case if open_bucket_nr > 1. */
+ if (bch_keylist_empty(&op->insert_keys))
+ continue_at(cl, bch_data_insert_start,
+ op->io_wq);
+ else
+ continue_at(cl, bch_data_insert_keys,
+ op->c->wq);
+ } else if (IS_ERR(b))
+ goto err;
+
+ op->open_buckets[open_bucket_nr++] = b;
+
+ bch_cut_front(k, &op->insert_key);
+
+ n = bio_next_split(bio, KEY_SIZE(k), GFP_NOIO, split);
+ n->bi_end_io = bch_data_insert_endio;
+ n->bi_private = cl;
+
+ if (KEY_CSUM(k))
+ bio_csum(n, k);
+
+ trace_bcache_cache_insert(k);
+
+ bio_set_op_attrs(n, REQ_OP_WRITE, 0);
+ bch_submit_bbio_replicas(n, op->c, k, ptrs_from, false);
+
+ bch_extent_normalize(op->c, k);
+ bch_check_mark_super(op->c, k, false);
+
+ bch_keylist_push(&op->insert_keys);
+ } while (n != bio);
+
+ op->insert_data_done = true;
+ continue_at(cl, bch_data_insert_keys, op->c->wq);
+err:
+ if (KEY_CACHED(&op->insert_key)) {
+ /*
+ * If we were writing cached data, not doing the write is fine
+ * so long as we discard whatever would have been overwritten -
+ * then it's equivalent to doing the write and immediately
+ * reclaiming it.
+ */
+
+ op->discard = true;
+ return bch_data_invalidate(cl);
+ }
+
+ op->error = -ENOSPC;
+ op->insert_data_done = true;
+ bio_put(bio);
+
+ /*
+ * No reason not to insert keys for whatever data was successfully
+ * written (especially for a cmpxchg operation that's moving data
+ * around)
+ */
+ if (!bch_keylist_empty(&op->insert_keys))
+ continue_at(cl, bch_data_insert_keys, op->c->wq);
+ else
+ closure_return(cl);
+}
+
+/**
+ * bch_data_insert - handle a write to a cache device or flash only volume
+ *
+ * This is the starting point for any data to end up in a cache device; it could
+ * be from a normal write, or a writeback write, or a write to a flash only
+ * volume - it's also used by the moving garbage collector to compact data in
+ * mostly empty buckets.
+ *
+ * It first writes the data to the cache, creating a list of keys to be inserted
+ * (if the data won't fit in a single open bucket, there will be multiple keys);
+ * after the data is written it calls bch_journal, and after the keys have been
+ * added to the next journal write they're inserted into the btree.
+ *
+ * It inserts the data in op->bio; bi_sector is used for the key offset, and
+ * op->inode is used for the key inode.
+ *
+ * If op->discard is true, instead of inserting the data it invalidates the
+ * region of the cache represented by op->bio and op->inode.
+ */
+void bch_data_insert(struct closure *cl)
+{
+ struct data_insert_op *op = container_of(cl, struct data_insert_op, cl);
+ struct cache_set *c = op->c;
+ u64 inode = KEY_INODE(&op->insert_key);
+
+ trace_bcache_write(c, inode, op->bio, !KEY_CACHED(&op->insert_key),
+ op->discard);
+
+ if (!bio_sectors(op->bio)) {
+ WARN_ONCE(1, "bch_data_insert() called with empty bio");
+ closure_return(cl);
+ }
+
+ /*
+ * This ought to be initialized in bch_data_insert_op_init(), but struct
+ * cache_set isn't exported
+ */
+ if (!op->io_wq)
+ op->io_wq = op->c->wq;
+
+ if (!op->discard)
+ bch_increment_clock(c, bio_sectors(op->bio), WRITE);
+
+ if (!op->replace) {
+ /* XXX: discards may be for more sectors than max key size */
+
+ struct bkey start = KEY(inode, op->bio->bi_iter.bi_sector, 0);
+ struct bkey end = KEY(inode, bio_end_sector(op->bio), 0);
+
+ unsigned i;
+ struct cache *ca;
+
+ for_each_cache(ca, c, i)
+ bch_keybuf_check_overlapping(&ca->moving_gc_keys,
+ &start, &end);
+
+ bch_keybuf_check_overlapping(&c->tiering_keys,
+ &start, &end);
+ }
+
+ if (op->wp->ca)
+ bch_mark_gc_write(c, bio_sectors(op->bio));
+ else if (!op->discard)
+ bch_mark_foreground_write(c, bio_sectors(op->bio));
+ else
+ bch_mark_discard(c, bio_sectors(op->bio));
+
+ if (atomic64_sub_return(bio_sectors(op->bio),
+ &c->sectors_until_gc) < 0) {
+ set_gc_sectors(c);
+ wake_up_process(c->gc_thread);
+ }
+
+ SET_KEY_OFFSET(&op->insert_key, bio_end_sector(op->bio));
+ SET_KEY_SIZE(&op->insert_key, bio_sectors(op->bio));
+
+ bch_keylist_init(&op->insert_keys);
+ bio_get(op->bio);
+ continue_at_nobarrier(cl, bch_data_insert_start, NULL);
+}
+
+void bch_data_insert_op_init(struct data_insert_op *op,
+ struct cache_set *c,
+ struct bio *bio,
+ struct write_point *wp,
+ bool wait, bool discard, bool flush,
+ struct bkey *insert_key,
+ struct bkey *replace_key)
+{
+ if (!wp) {
+ unsigned wp_idx = hash_long((unsigned long) current,
+ ilog2(ARRAY_SIZE(c->write_points)));
+
+ BUG_ON(wp_idx > ARRAY_SIZE(c->write_points));
+ wp = &c->write_points[wp_idx];
+ }
+
+ op->c = c;
+ op->io_wq = NULL;
+ op->bio = bio;
+ op->error = 0;
+ op->flags = 0;
+ op->wait = wait;
+ op->discard = discard;
+ op->flush = flush;
+ op->wp = wp;
+ op->btree_alloc_reserve = BTREE_ID_EXTENTS;
+
+ memset(op->open_buckets, 0, sizeof(op->open_buckets));
+ bch_keylist_init(&op->insert_keys);
+ bkey_copy(&op->insert_key, insert_key);
+
+ if (replace_key) {
+ op->replace = true;
+ bkey_copy(&op->replace_key, replace_key);
+ }
+}
+
+/* Cache promotion on read */
+
+struct cache_promote_op {
+ struct closure cl;
+ struct bio *orig_bio;
+ struct data_insert_op iop;
+ bool stale; /* was the ptr stale after the read? */
+ struct bbio bio; /* must be last */
+};
+
+static void cache_promote_done(struct closure *cl)
+{
+ struct cache_promote_op *op = container_of(cl,
+ struct cache_promote_op, cl);
+ struct cache_set *c = op->iop.c;
+
+ if (op->iop.replace_collision) {
+ trace_bcache_promote_collision(&op->iop.replace_key);
+ atomic_inc(&c->accounting.collector.cache_miss_collisions);
+ }
+
+ bio_free_pages(op->iop.bio);
+ kfree(op);
+}
+
+static void cache_promote_write(struct closure *cl)
+{
+ struct cache_promote_op *op = container_of(cl,
+ struct cache_promote_op, cl);
+ struct bio *bio = op->iop.bio;
+
+ bio_reset(bio);
+ bio->bi_iter.bi_sector = KEY_START(&op->iop.insert_key);
+ bio->bi_iter.bi_size = KEY_SIZE(&op->iop.insert_key) << 9;
+ /* needed to reinit bi_vcnt so pages can be freed later */
+ bch_bio_map(bio, NULL);
+
+ bio_copy_data(op->orig_bio, bio);
+ op->orig_bio->bi_error = op->iop.error;
+ bio_endio(op->orig_bio);
+
+ if (!op->stale &&
+ !op->iop.error &&
+ !test_bit(CACHE_SET_STOPPING, &op->iop.c->flags))
+ closure_call(&op->iop.cl, bch_data_insert, NULL, cl);
+
+ closure_return_with_destructor(cl, cache_promote_done);
+}
+
+static void cache_promote_endio(struct bio *bio)
+{
+ struct bbio *b = to_bbio(bio);
+ struct cache_promote_op *op = container_of(b,
+ struct cache_promote_op, bio);
+
+ /*
+ * If the bucket was reused while our bio was in flight, we might have
+ * read the wrong data. Set s->error but not error so it doesn't get
+ * counted against the cache device, but we'll still reread the data
+ * from the backing device.
+ */
+
+ if (bio->bi_error)
+ op->iop.error = bio->bi_error;
+ else if (b->ca && ptr_stale(b->ca->set, b->ca, &b->key, 0))
+ op->stale = 1;
+
+ bch_bbio_endio(b, bio->bi_error, "reading from cache");
+}
+
+/**
+ * __cache_promote -- insert result of read bio into cache
+ *
+ * Used for backing devices and flash-only volumes.
+ *
+ * @orig_bio must actually be a bbio with a valid key.
+ */
+void __cache_promote(struct cache_set *c, struct bbio *orig_bio,
+ struct bkey *replace_key)
+{
+ struct cache_promote_op *op;
+ struct bio *bio;
+ unsigned pages = DIV_ROUND_UP(orig_bio->bio.bi_iter.bi_size, PAGE_SIZE);
+
+ /* XXX: readahead? */
+
+ op = kmalloc(sizeof(*op) + sizeof(struct bio_vec) * pages, GFP_NOIO);
+ if (!op)
+ goto out_submit;
+
+ /* clone the bbio */
+ memcpy(&op->bio, orig_bio, offsetof(struct bbio, bio));
+
+ bio = &op->bio.bio;
+ bio_init(bio);
+ bio_get(bio);
+ bio->bi_bdev = orig_bio->bio.bi_bdev;
+ bio->bi_iter.bi_sector = orig_bio->bio.bi_iter.bi_sector;
+ bio->bi_iter.bi_size = orig_bio->bio.bi_iter.bi_size;
+ bio->bi_end_io = cache_promote_endio;
+ bio->bi_private = &op->cl;
+ bio->bi_io_vec = bio->bi_inline_vecs;
+ bch_bio_map(bio, NULL);
+
+ if (bio_alloc_pages(bio, __GFP_NOWARN|GFP_NOIO))
+ goto out_free;
+
+ orig_bio->ca = NULL;
+
+ closure_init(&op->cl, &c->cl);
+ op->orig_bio = &orig_bio->bio;
+ op->stale = 0;
+
+ bch_data_insert_op_init(&op->iop, c, bio,
+ &c->tier_write_points[0],
+ false, false, false,
+ replace_key,
+ replace_key);
+
+ bch_cut_front(&START_KEY(&orig_bio->key), &op->iop.insert_key);
+ bch_cut_back(&orig_bio->key, &op->iop.insert_key);
+
+ trace_bcache_promote(&orig_bio->bio);
+
+ op->bio.submit_time_us = local_clock_us();
+ closure_bio_submit(bio, &op->cl);
+
+ continue_at(&op->cl, cache_promote_write, c->wq);
+out_free:
+ kfree(op);
+out_submit:
+ generic_make_request(&orig_bio->bio);
+}
+
+/**
+ * cache_promote - promote data stored in higher tiers
+ *
+ * Used for flash only volumes.
+ *
+ * @bio must actually be a bbio with valid key.
+ */
+bool cache_promote(struct cache_set *c, struct bbio *bio,
+ struct bkey *k, unsigned ptr)
+{
+ if (!CACHE_TIER(&c->members[PTR_DEV(k, ptr)])) {
+ generic_make_request(&bio->bio);
+ return 0;
+ }
+
+ __cache_promote(c, bio, k);
+ return 1;
+}
+
+/* Read */
+
+struct bch_read_op {
+ struct btree_op op;
+ struct cache_set *c;
+ struct bio *bio;
+ u64 inode;
+};
+
+static void bch_read_requeue(struct cache_set *c, struct bio *bio)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&c->read_race_lock, flags);
+ bio_list_add(&c->read_race_list, bio);
+ spin_unlock_irqrestore(&c->read_race_lock, flags);
+ queue_work(c->wq, &c->read_race_work);
+}
+
+static void bch_read_endio(struct bio *bio)
+{
+ struct bbio *b = to_bbio(bio);
+ struct cache *ca = b->ca;
+ struct bio *orig = bio->bi_private;
+
+ bch_bbio_count_io_errors(b, bio->bi_error, "reading from cache");
+
+ if (!bio->bi_error && ca &&
+ (race_fault() || ptr_stale(ca->set, ca, &b->key, 0))) {
+ /* Read bucket invalidate race */
+ atomic_long_inc(&ca->set->cache_read_races);
+ bch_read_requeue(ca->set, bio);
+ } else {
+ if (bio->bi_error)
+ orig->bi_error = bio->bi_error;
+
+ bio_endio(orig);
+ bio_put(bio);
+ }
+
+ if (ca)
+ percpu_ref_put(&ca->ref);
+}
+
+static inline void __bio_inc_remaining(struct bio *bio)
+{
+ bio->bi_flags |= (1 << BIO_CHAIN);
+ smp_mb__before_atomic();
+ atomic_inc(&bio->__bi_remaining);
+}
+
+/* XXX: this looks a lot like cache_lookup_fn() */
+static int bch_read_fn(struct btree_op *b_op, struct btree *b, struct bkey *k)
+{
+ struct bch_read_op *op = container_of(b_op,
+ struct bch_read_op, op);
+ struct bio *n, *bio = op->bio;
+ struct bbio *bbio;
+ int sectors, ret;
+ unsigned ptr;
+ struct cache *ca;
+
+ BUG_ON(bkey_cmp(&START_KEY(k),
+ &KEY(op->inode, bio->bi_iter.bi_sector, 0)) > 0);
+
+ BUG_ON(bkey_cmp(k, &KEY(op->inode, bio->bi_iter.bi_sector, 0)) <= 0);
+
+ sectors = KEY_OFFSET(k) - bio->bi_iter.bi_sector;
+
+ ca = bch_extent_pick_ptr(b->c, k, &ptr);
+ if (!ca) {
+ if (!KEY_CACHED(k) && bch_extent_ptrs(k)) {
+ bio_io_error(bio);
+ return MAP_DONE;
+ } else {
+ unsigned bytes = min_t(unsigned, sectors,
+ bio_sectors(bio)) << 9;
+
+ swap(bio->bi_iter.bi_size, bytes);
+ zero_fill_bio(bio);
+ swap(bio->bi_iter.bi_size, bytes);
+
+ bio_advance(bio, bytes);
+
+ return bio->bi_iter.bi_size ? MAP_CONTINUE : MAP_DONE;
+ }
+ }
+
+ PTR_BUCKET(b->c, ca, k, ptr)->read_prio = b->c->prio_clock[READ].hand;
+
+ if (sectors >= bio_sectors(bio)) {
+ n = bio_clone_fast(bio, GFP_NOIO, b->c->bio_split);
+ ret = MAP_DONE;
+ } else {
+ n = bio_split(bio, sectors, GFP_NOIO, b->c->bio_split);
+ ret = MAP_CONTINUE;
+ }
+
+ n->bi_private = bio;
+ n->bi_end_io = bch_read_endio;
+ __bio_inc_remaining(bio);
+
+ bbio = to_bbio(n);
+ bch_bkey_copy_single_ptr(&bbio->key, k, ptr);
+
+ /* Trim the key to match what we're actually reading */
+ bch_cut_front(&KEY(op->inode, n->bi_iter.bi_sector, 0), &bbio->key);
+ bch_cut_back(&KEY(op->inode, bio_end_sector(n), 0), &bbio->key);
+
+ bch_bbio_prep(bbio, ca);
+
+ cache_promote(b->c, bbio, k, ptr);
+
+ return ret;
+}
+
+int bch_read(struct cache_set *c, struct bio *bio, u64 inode)
+{
+ struct bch_read_op op;
+ int ret;
+
+ bch_increment_clock(c, bio_sectors(bio), READ);
+
+ bch_btree_op_init(&op.op, BTREE_ID_EXTENTS, -1);
+ op.c = c;
+ op.bio = bio;
+ op.inode = inode;
+
+ ret = bch_btree_map_keys(&op.op, c,
+ &KEY(inode, bio->bi_iter.bi_sector, 0),
+ bch_read_fn, MAP_HOLES);
+ return ret < 0 ? ret : 0;
+}
+EXPORT_SYMBOL(bch_read);
+
+/**
+ * bch_read_retry - re-submit a bio originally from bch_read()
+ */
+static void bch_read_retry(struct bbio *bbio)
+{
+ struct bio *bio = &bbio->bio;
+ struct bio *parent;
+ u64 inode;
+
+ trace_bcache_read_retry(bio);
+
+ /*
+ * This used to be a leaf bio from bch_read_fn(), but
+ * since we don't know what happened to the btree in
+ * the meantime, we have to re-submit it via the
+ * top-level bch_read() entry point. Before doing that,
+ * we have to reset the bio, preserving the biovec.
+ *
+ * The inode, offset and size come from the bbio's key,
+ * which was set by bch_read_fn().
+ */
+ inode = KEY_INODE(&bbio->key);
+ parent = bio->bi_private;
+
+ bch_bbio_reset(bbio);
+ bio_chain(bio, parent);
+
+ bch_read(bbio->ca->set, bio, inode);
+ bio_endio(parent); /* for bio_chain() in bch_read_fn() */
+ bio_endio(bio);
+}
+
+void bch_read_race_work(struct work_struct *work)
+{
+ struct cache_set *c = container_of(work, struct cache_set,
+ read_race_work);
+ unsigned long flags;
+ struct bio *bio;
+
+ while (1) {
+ spin_lock_irqsave(&c->read_race_lock, flags);
+ bio = bio_list_pop(&c->read_race_list);
+ spin_unlock_irqrestore(&c->read_race_lock, flags);
+
+ if (!bio)
+ break;
+
+ bch_read_retry(to_bbio(bio));
+ }
+}
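
The kernel-doc above bch_data_insert() describes the write path this patch moves into io.c. As a purely illustrative sketch (not part of the patch) of how a caller is expected to drive it, mirroring the pattern __cache_promote() uses in this file: example_insert, c, bio, insert_key and parent are placeholder names.

static void example_insert(struct data_insert_op *op, struct cache_set *c,
			   struct bio *bio, struct bkey *insert_key,
			   struct closure *parent)
{
	/*
	 * NULL write_point: bch_data_insert_op_init() picks one by hashing
	 * the current task into c->write_points.
	 */
	bch_data_insert_op_init(op, c, bio, NULL,
				true,	/* wait for data bucket allocation */
				false,	/* not a discard */
				false,	/* don't wait for the journal commit */
				insert_key,
				NULL);	/* no compare-exchange replace_key */

	/*
	 * bch_data_insert() runs asynchronously against op->cl; `parent'
	 * is held until the keys have been inserted into the btree.
	 */
	closure_call(&op->cl, bch_data_insert, NULL, parent);
}
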
diff --git a/drivers/md/bcache/io.h b/drivers/md/bcache/io.h
new file mode 100644
index 000000000000..2086bd6840ff
--- /dev/null
+++ b/drivers/md/bcache/io.h
@@ -0,0 +1,73 @@
+#ifndef _BCACHE_IO_H
+#define _BCACHE_IO_H
+
+struct data_insert_op {
+ struct closure cl;
+ struct cache_set *c;
+ struct workqueue_struct *io_wq;
+ struct bio *bio;
+
+ /* Used internally, do not touch */
+ struct btree_op op;
+
+ short error;
+
+ union {
+ u8 flags;
+
+ struct {
+ /* Wait for data bucket allocation or just
+ * fail when out of space? */
+ unsigned wait:1;
+ /* Discard key range? */
+ unsigned discard:1;
+ /* Wait for journal commit? */
+ unsigned flush:1;
+ /* Perform a compare-exchange with replace_key? */
+ unsigned replace:1;
+
+ /* Set on completion, if cmpxchg index update failed */
+ unsigned replace_collision:1;
+ /* Internal */
+ unsigned insert_data_done:1;
+ };
+ };
+
+ u8 btree_alloc_reserve;
+
+ struct write_point *wp;
+ struct open_bucket *open_buckets[2];
+
+ struct keylist insert_keys;
+ BKEY_PADDED(insert_key);
+ BKEY_PADDED(replace_key);
+};
+
+void bch_data_insert_op_init(struct data_insert_op *, struct cache_set *,
+ struct bio *, struct write_point *, bool,
+ bool, bool, struct bkey *, struct bkey *);
+void bch_data_insert(struct closure *cl);
+
+int bch_read(struct cache_set *, struct bio *, u64);
+
+void bch_count_io_errors(struct cache *, int, const char *);
+void bch_bbio_count_io_errors(struct bbio *, int, const char *);
+void bch_bbio_endio(struct bbio *, int, const char *);
+void bch_bbio_free(struct bio *, struct cache_set *);
+struct bio *bch_bbio_alloc(struct cache_set *);
+
+void bch_generic_make_request(struct bio *, struct cache_set *);
+void bch_bio_submit_work(struct work_struct *);
+void bch_bbio_prep(struct bbio *, struct cache *);
+void bch_submit_bbio(struct bbio *, struct cache *, struct bkey *,
+ unsigned, bool);
+void bch_submit_bbio_replicas(struct bio *, struct cache_set *,
+ struct bkey *, unsigned, bool);
+void bch_bbio_reset(struct bbio *bio);
+
+void __cache_promote(struct cache_set *, struct bbio *, struct bkey *);
+bool cache_promote(struct cache_set *, struct bbio *, struct bkey *, unsigned);
+
+void bch_read_race_work(struct work_struct *work);
+
+#endif /* _BCACHE_IO_H */
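
For the read side, io.h now exports bch_read() as the top-level entry point (it is also what bch_read_retry() re-submits through). A hedged sketch of a caller, e.g. a flash-only volume's request handler, follows; example_read, c, bio and inode are placeholder names.

static int example_read(struct cache_set *c, struct bio *bio, u64 inode)
{
	/*
	 * Walks the extent btree from (inode, bio->bi_iter.bi_sector),
	 * splitting the bio per extent; completed splits end the original
	 * bio via bch_read_endio(), and bucket-reuse races are requeued
	 * through bch_read_race_work().  Returns 0 unless the btree walk
	 * itself fails.
	 */
	return bch_read(c, bio, inode);
}
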
diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c
index ffc1d2151285..770b72755641 100644
--- a/drivers/md/bcache/journal.c
+++ b/drivers/md/bcache/journal.c
@@ -8,6 +8,7 @@
#include "btree.h"
#include "debug.h"
#include "extents.h"
+#include "io.h"
#include "journal.h"
#include <trace/events/bcache.h>
diff --git a/drivers/md/bcache/move.c b/drivers/md/bcache/move.c
index a1548ffd14a2..da407f9011e0 100644
--- a/drivers/md/bcache/move.c
+++ b/drivers/md/bcache/move.c
@@ -2,6 +2,7 @@
#include "bcache.h"
#include "btree.h"
#include "extents.h"
+#include "io.h"
#include "keybuf.h"
#include "move.h"
diff --git a/drivers/md/bcache/movinggc.c b/drivers/md/bcache/movinggc.c
index 83dcebabc54a..6c7445275aaa 100644
--- a/drivers/md/bcache/movinggc.c
+++ b/drivers/md/bcache/movinggc.c
@@ -8,6 +8,7 @@
#include "btree.h"
#include "buckets.h"
#include "extents.h"
+#include "io.h"
#include "keybuf.h"
#include "move.h"
#include "movinggc.h"
diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
index 66b927d78589..d0256d85859c 100644
--- a/drivers/md/bcache/request.c
+++ b/drivers/md/bcache/request.c
@@ -28,6 +28,7 @@
#include "btree.h"
#include "debug.h"
#include "extents.h"
+#include "io.h"
#include "journal.h"
#include "keybuf.h"
#include "request.h"
@@ -46,596 +47,6 @@
struct kmem_cache *bch_search_cache;
-static inline void mark_cache_stats(struct cache_stat_collector *stats,
- bool hit, bool bypass)
-{
- atomic_inc(&stats->cache_hit_array[!bypass][!hit]);
-}
-
-static inline void bch_mark_cache_accounting(struct cache_set *c,
- struct cached_dev *dc,
- bool hit, bool bypass)
-{
- mark_cache_stats(&dc->accounting.collector, hit, bypass);
- mark_cache_stats(&c->accounting.collector, hit, bypass);
-}
-
-static inline void bch_mark_sectors_bypassed(struct cache_set *c,
- struct cached_dev *dc,
- unsigned sectors)
-{
- atomic_add(sectors, &dc->accounting.collector.sectors_bypassed);
- atomic_add(sectors, &c->accounting.collector.sectors_bypassed);
-}
-
-static inline void bch_mark_gc_write(struct cache_set *c, int sectors)
-{
- atomic_add(sectors, &c->accounting.collector.gc_write_sectors);
-}
-
-static inline void bch_mark_foreground_write(struct cache_set *c, int sectors)
-{
- atomic_add(sectors, &c->accounting.collector.foreground_write_sectors);
-}
-
-static inline void bch_mark_discard(struct cache_set *c, int sectors)
-{
- atomic_add(sectors, &c->accounting.collector.discard_sectors);
-}
-
-static void bch_data_insert_start(struct closure *);
-
-static void bio_csum(struct bio *bio, struct bkey *k)
-{
- struct bio_vec bv;
- struct bvec_iter iter;
- u64 crc = 0xffffffffffffffffULL;
-
- bio_for_each_segment(bv, bio, iter) {
- void *d = kmap(bv.bv_page) + bv.bv_offset;
-
- crc = bch_checksum_update(KEY_CSUM(k), crc, d, bv.bv_len);
- kunmap(bv.bv_page);
- }
-
- k->val[bch_extent_ptrs(k)] = crc;
-}
-
-/* Insert data into cache */
-
-static int btree_insert_fn(struct btree_op *b_op, struct btree *b)
-{
- struct data_insert_op *op = container_of(b_op,
- struct data_insert_op, op);
- struct bkey *replace_key = op->replace ? &op->replace_key : NULL;
-
- int ret = bch_btree_insert_node(b, &op->op, &op->insert_keys,
- replace_key,
- op->flush ? &op->cl : NULL);
- return bch_keylist_empty(&op->insert_keys) ? MAP_DONE : ret;
-}
-
-static void bch_data_insert_keys_done(struct closure *cl)
-{
- struct data_insert_op *op = container_of(cl, struct data_insert_op, cl);
- unsigned i;
-
- if (op->op.insert_collision)
- op->replace_collision = true;
-
- for (i = 0; i < ARRAY_SIZE(op->open_buckets); i++)
- if (op->open_buckets[i]) {
- bch_open_bucket_put(op->c, op->open_buckets[i]);
- op->open_buckets[i] = NULL;
- }
-
- if (!op->insert_data_done)
- continue_at(cl, bch_data_insert_start, op->io_wq);
-
- bch_keylist_free(&op->insert_keys);
- closure_return(cl);
-}
-
-static void __bch_data_insert_keys(struct closure *cl)
-{
- struct data_insert_op *op = container_of(cl, struct data_insert_op,
- op.cl);
- struct keylist *keys = &op->insert_keys;
- int ret = 0;
-
- while (!ret && !bch_keylist_empty(keys)) {
- op->op.locks_want = 0;
- ret = bch_btree_map_nodes(&op->op, op->c,
- &START_KEY(keys->keys),
- btree_insert_fn,
- MAP_ASYNC);
- }
-
- if (ret == -EAGAIN)
- continue_at(cl, __bch_data_insert_keys, op->c->wq);
-
- closure_return(cl);
-}
-
-/**
- * bch_data_insert_keys - insert extent btree keys for a write
- */
-static void bch_data_insert_keys(struct closure *cl)
-{
- struct data_insert_op *op = container_of(cl, struct data_insert_op, cl);
- enum btree_id id = BTREE_ID_EXTENTS;
-
- __bch_btree_op_init(&op->op, id, op->btree_alloc_reserve, 0);
-
- closure_call(&op->op.cl, __bch_data_insert_keys, NULL, cl);
- continue_at(cl, bch_data_insert_keys_done, op->c->wq);
-}
-
-/**
- * bch_data_invalidate - discard range of keys
- *
- * Used to implement discard, and to handle when writethrough write hits
- * a write error on the cache device.
- */
-static void bch_data_invalidate(struct closure *cl)
-{
- struct data_insert_op *op = container_of(cl, struct data_insert_op, cl);
- struct keylist *keys = &op->insert_keys;
- struct bio *bio = op->bio;
-
- pr_debug("invalidating %i sectors from %llu",
- bio_sectors(bio), (uint64_t) bio->bi_iter.bi_sector);
-
- while (bio_sectors(bio)) {
- unsigned sectors = min(bio_sectors(bio),
- 1U << (KEY_SIZE_BITS - 1));
-
- if (bch_keylist_realloc(keys, BKEY_U64s))
- goto out;
-
- bio->bi_iter.bi_sector += sectors;
- bio->bi_iter.bi_size -= sectors << 9;
-
- *keys->top = KEY(KEY_INODE(&op->insert_key),
- bio->bi_iter.bi_sector, sectors);
- SET_KEY_DELETED(keys->top, true);
-
- bch_keylist_push(keys);
- }
-
- op->insert_data_done = true;
- bio_put(bio);
-out:
- continue_at(cl, bch_data_insert_keys, op->c->wq);
-}
-
-static void bch_data_insert_error(struct closure *cl)
-{
- struct data_insert_op *op = container_of(cl, struct data_insert_op, cl);
-
- /*
- * Our data write just errored, which means we've got a bunch of keys to
- * insert that point to data that wasn't successfully written.
- *
- * We don't have to insert those keys but we still have to invalidate
- * that region of the cache - so, if we just strip off all the pointers
- * from the keys we'll accomplish just that.
- */
-
- struct bkey *src = op->insert_keys.keys, *dst = op->insert_keys.keys;
-
- while (src != op->insert_keys.top) {
- struct bkey *n = bkey_next(src);
-
- bch_set_extent_ptrs(src, 0);
- memmove(dst, src, bkey_bytes(src));
-
- dst = bkey_next(dst);
- src = n;
- }
-
- op->insert_keys.top = dst;
-
- bch_data_insert_keys(cl);
-}
-
-static void bch_data_insert_endio(struct bio *bio)
-{
- struct closure *cl = bio->bi_private;
- struct data_insert_op *op = container_of(cl, struct data_insert_op, cl);
-
- if (bio->bi_error) {
- /* TODO: We could try to recover from this. */
- if (!KEY_CACHED(&op->insert_key))
- op->error = bio->bi_error;
- else if (!op->replace)
- set_closure_fn(cl, bch_data_insert_error,
- op->c->wq);
- else
- set_closure_fn(cl, NULL, NULL);
- }
-
- bch_bbio_endio(to_bbio(bio), bio->bi_error, "writing data to cache");
-}
-
-static void bch_data_insert_start(struct closure *cl)
-{
- struct data_insert_op *op = container_of(cl, struct data_insert_op, cl);
- struct bio *bio = op->bio, *n;
- unsigned open_bucket_nr = 0, ptrs_from;
- struct open_bucket *b;
-
- if (op->discard)
- return bch_data_invalidate(cl);
-
- bch_extent_drop_stale(op->c, &op->insert_key);
- ptrs_from = bch_extent_ptrs(&op->insert_key);
-
- /*
- * Journal writes are marked REQ_PREFLUSH; if the original write was a
- * flush, it'll wait on the journal write.
- */
- bio->bi_opf &= ~(REQ_PREFLUSH|REQ_FUA);
-
- do {
- struct bkey *k;
- struct bio_set *split = op->c->bio_split;
-
- BUG_ON(bio_sectors(bio) != KEY_SIZE(&op->insert_key));
-
- if (open_bucket_nr == ARRAY_SIZE(op->open_buckets))
- continue_at(cl, bch_data_insert_keys,
- op->c->wq);
-
- /* for the device pointers and 1 for the chksum */
- if (bch_keylist_realloc(&op->insert_keys,
- BKEY_EXTENT_MAX_U64s +
- (KEY_CSUM(&op->insert_key) ? 1 : 0)))
- continue_at(cl, bch_data_insert_keys, op->c->wq);
-
- k = op->insert_keys.top;
- bkey_copy(k, &op->insert_key);
-
- b = bch_alloc_sectors(op->c, op->wp, k, op->wait ? cl : NULL);
- BUG_ON(!b);
-
- if (PTR_ERR(b) == -EAGAIN) {
- /* If we already have some keys, must insert them first
- * before allocating another open bucket. We only hit
- * this case if open_bucket_nr > 1. */
- if (bch_keylist_empty(&op->insert_keys))
- continue_at(cl, bch_data_insert_start,
- op->io_wq);
- else
- continue_at(cl, bch_data_insert_keys,
- op->c->wq);
- } else if (IS_ERR(b))
- goto err;
-
- op->open_buckets[open_bucket_nr++] = b;
-
- bch_cut_front(k, &op->insert_key);
-
- n = bio_next_split(bio, KEY_SIZE(k), GFP_NOIO, split);
- n->bi_end_io = bch_data_insert_endio;
- n->bi_private = cl;
-
- if (KEY_CSUM(k))
- bio_csum(n, k);
-
- trace_bcache_cache_insert(k);
-
- bio_set_op_attrs(n, REQ_OP_WRITE, 0);
- bch_submit_bbio_replicas(n, op->c, k, ptrs_from, false);
-
- bch_extent_normalize(op->c, k);
- bch_check_mark_super(op->c, k, false);
-
- bch_keylist_push(&op->insert_keys);
- } while (n != bio);
-
- op->insert_data_done = true;
- continue_at(cl, bch_data_insert_keys, op->c->wq);
-err:
- if (KEY_CACHED(&op->insert_key)) {
- /*
- * If we were writing cached data, not doing the write is fine
- * so long as we discard whatever would have been overwritten -
- * then it's equivalent to doing the write and immediately
- * reclaiming it.
- */
-
- op->discard = true;
- return bch_data_invalidate(cl);
- }
-
- op->error = -ENOSPC;
- op->insert_data_done = true;
- bio_put(bio);
-
- /*
- * No reason not to insert keys for whatever data was successfully
- * written (especially for a cmpxchg operation that's moving data
- * around)
- */
- if (!bch_keylist_empty(&op->insert_keys))
- continue_at(cl, bch_data_insert_keys, op->c->wq);
- else
- closure_return(cl);
-}
-
-/**
- * bch_data_insert - handle a write to a cache device or flash only volume
- *
- * This is the starting point for any data to end up in a cache device; it could
- * be from a normal write, or a writeback write, or a write to a flash only
- * volume - it's also used by the moving garbage collector to compact data in
- * mostly empty buckets.
- *
- * It first writes the data to the cache, creating a list of keys to be inserted
- * (if the data won't fit in a single open bucket, there will be multiple keys);
- * after the data is written it calls bch_journal, and after the keys have been
- * added to the next journal write they're inserted into the btree.
- *
- * It inserts the data in op->bio; bi_sector is used for the key offset, and
- * op->inode is used for the key inode.
- *
- * If op->discard is true, instead of inserting the data it invalidates the
- * region of the cache represented by op->bio and op->inode.
- */
-void bch_data_insert(struct closure *cl)
-{
- struct data_insert_op *op = container_of(cl, struct data_insert_op, cl);
- struct cache_set *c = op->c;
- u64 inode = KEY_INODE(&op->insert_key);
-
- trace_bcache_write(c, inode, op->bio, !KEY_CACHED(&op->insert_key),
- op->discard);
-
- if (!bio_sectors(op->bio)) {
- WARN_ONCE(1, "bch_data_insert() called with empty bio");
- closure_return(cl);
- }
-
- /*
- * This ought to be initialized in bch_data_insert_op_init(), but struct
- * cache_set isn't exported
- */
- if (!op->io_wq)
- op->io_wq = op->c->wq;
-
- if (!op->discard)
- bch_increment_clock(c, bio_sectors(op->bio), WRITE);
-
- if (!op->replace) {
- /* XXX: discards may be for more sectors than max key size */
-
- struct bkey start = KEY(inode, op->bio->bi_iter.bi_sector, 0);
- struct bkey end = KEY(inode, bio_end_sector(op->bio), 0);
-
- unsigned i;
- struct cache *ca;
-
- for_each_cache(ca, c, i)
- bch_keybuf_check_overlapping(&ca->moving_gc_keys,
- &start, &end);
-
- bch_keybuf_check_overlapping(&c->tiering_keys,
- &start, &end);
- }
-
- if (op->wp->ca)
- bch_mark_gc_write(c, bio_sectors(op->bio));
- else if (!op->discard)
- bch_mark_foreground_write(c, bio_sectors(op->bio));
- else
- bch_mark_discard(c, bio_sectors(op->bio));
-
- if (atomic64_sub_return(bio_sectors(op->bio),
- &c->sectors_until_gc) < 0) {
- set_gc_sectors(c);
- wake_up_process(c->gc_thread);
- }
-
- SET_KEY_OFFSET(&op->insert_key, bio_end_sector(op->bio));
- SET_KEY_SIZE(&op->insert_key, bio_sectors(op->bio));
-
- bch_keylist_init(&op->insert_keys);
- bio_get(op->bio);
- continue_at_nobarrier(cl, bch_data_insert_start, NULL);
-}
-
-void bch_data_insert_op_init(struct data_insert_op *op,
- struct cache_set *c,
- struct bio *bio,
- struct write_point *wp,
- bool wait, bool discard, bool flush,
- struct bkey *insert_key,
- struct bkey *replace_key)
-{
- if (!wp) {
- unsigned wp_idx = hash_long((unsigned long) current,
- ilog2(ARRAY_SIZE(c->write_points)));
-
- BUG_ON(wp_idx > ARRAY_SIZE(c->write_points));
- wp = &c->write_points[wp_idx];
- }
-
- op->c = c;
- op->io_wq = NULL;
- op->bio = bio;
- op->error = 0;
- op->flags = 0;
- op->wait = wait;
- op->discard = discard;
- op->flush = flush;
- op->wp = wp;
- op->btree_alloc_reserve = BTREE_ID_EXTENTS;
-
- memset(op->open_buckets, 0, sizeof(op->open_buckets));
- bch_keylist_init(&op->insert_keys);
- bkey_copy(&op->insert_key, insert_key);
-
- if (replace_key) {
- op->replace = true;
- bkey_copy(&op->replace_key, replace_key);
- }
-}
-EXPORT_SYMBOL(bch_data_insert_op_init);
-
-/* Cache promotion on read */
-
-struct cache_promote_op {
- struct closure cl;
- struct bio *orig_bio;
- struct data_insert_op iop;
- bool stale; /* was the ptr stale after the read? */
- struct bbio bio; /* must be last */
-};
-
-static void cache_promote_done(struct closure *cl)
-{
- struct cache_promote_op *op = container_of(cl,
- struct cache_promote_op, cl);
- struct cache_set *c = op->iop.c;
-
- if (op->iop.replace_collision) {
- trace_bcache_promote_collision(&op->iop.replace_key);
- atomic_inc(&c->accounting.collector.cache_miss_collisions);
- }
-
- bio_free_pages(op->iop.bio);
- kfree(op);
-}
-
-static void cache_promote_write(struct closure *cl)
-{
- struct cache_promote_op *op = container_of(cl,
- struct cache_promote_op, cl);
- struct bio *bio = op->iop.bio;
-
- bio_reset(bio);
- bio->bi_iter.bi_sector = KEY_START(&op->iop.insert_key);
- bio->bi_iter.bi_size = KEY_SIZE(&op->iop.insert_key) << 9;
- /* needed to reinit bi_vcnt so pages can be freed later */
- bch_bio_map(bio, NULL);
-
- bio_copy_data(op->orig_bio, bio);
- op->orig_bio->bi_error = op->iop.error;
- bio_endio(op->orig_bio);
-
- if (!op->stale &&
- !op->iop.error &&
- !test_bit(CACHE_SET_STOPPING, &op->iop.c->flags))
- closure_call(&op->iop.cl, bch_data_insert, NULL, cl);
-
- closure_return_with_destructor(cl, cache_promote_done);
-}
-
-static void cache_promote_endio(struct bio *bio)
-{
- struct bbio *b = to_bbio(bio);
- struct cache_promote_op *op = container_of(b,
- struct cache_promote_op, bio);
-
- /*
- * If the bucket was reused while our bio was in flight, we might have
- * read the wrong data. Set s->error but not error so it doesn't get
- * counted against the cache device, but we'll still reread the data
- * from the backing device.
- */
-
- if (bio->bi_error)
- op->iop.error = bio->bi_error;
- else if (b->ca && ptr_stale(b->ca->set, b->ca, &b->key, 0))
- op->stale = 1;
-
- bch_bbio_endio(b, bio->bi_error, "reading from cache");
-}
-
-/**
- * __cache_promote -- insert result of read bio into cache
- *
- * Used for backing devices and flash-only volumes.
- *
- * @orig_bio must actually be a bbio with a valid key.
- */
-static void __cache_promote(struct cache_set *c, struct bbio *orig_bio,
- struct bkey *replace_key)
-{
- struct cache_promote_op *op;
- struct bio *bio;
- unsigned pages = DIV_ROUND_UP(orig_bio->bio.bi_iter.bi_size, PAGE_SIZE);
-
- /* XXX: readahead? */
-
- op = kmalloc(sizeof(*op) + sizeof(struct bio_vec) * pages, GFP_NOIO);
- if (!op)
- goto out_submit;
-
- /* clone the bbio */
- memcpy(&op->bio, orig_bio, offsetof(struct bbio, bio));
-
- bio = &op->bio.bio;
- bio_init(bio);
- bio_get(bio);
- bio->bi_bdev = orig_bio->bio.bi_bdev;
- bio->bi_iter.bi_sector = orig_bio->bio.bi_iter.bi_sector;
- bio->bi_iter.bi_size = orig_bio->bio.bi_iter.bi_size;
- bio->bi_end_io = cache_promote_endio;
- bio->bi_private = &op->cl;
- bio->bi_io_vec = bio->bi_inline_vecs;
- bch_bio_map(bio, NULL);
-
- if (bio_alloc_pages(bio, __GFP_NOWARN|GFP_NOIO))
- goto out_free;
-
- orig_bio->ca = NULL;
-
- closure_init(&op->cl, &c->cl);
- op->orig_bio = &orig_bio->bio;
- op->stale = 0;
-
- bch_data_insert_op_init(&op->iop, c, bio,
- &c->tier_write_points[0],
- false, false, false,
- replace_key,
- replace_key);
-
- bch_cut_front(&START_KEY(&orig_bio->key), &op->iop.insert_key);
- bch_cut_back(&orig_bio->key, &op->iop.insert_key);
-
- trace_bcache_promote(&orig_bio->bio);
-
- op->bio.submit_time_us = local_clock_us();
- closure_bio_submit(bio, &op->cl);
-
- continue_at(&op->cl, cache_promote_write, c->wq);
-out_free:
- kfree(op);
-out_submit:
- generic_make_request(&orig_bio->bio);
-}
-
-/**
- * cache_promote - promote data stored in higher tiers
- *
- * Used for flash only volumes.
- *
- * @bio must actually be a bbio with valid key.
- */
-static bool cache_promote(struct cache_set *c, struct bbio *bio,
- struct bkey *k, unsigned ptr)
-{
- if (!CACHE_TIER(&c->members[PTR_DEV(k, ptr)])) {
- generic_make_request(&bio->bio);
- return 0;
- }
-
- __cache_promote(c, bio, k);
- return 1;
-}
-
/* Congested? */
unsigned bch_get_congested(struct cache_set *c)
@@ -764,194 +175,6 @@ skip:
return true;
}
-/* Cache lookup */
-
-/* XXX: consolidate these somehow */
-
-struct bch_read_op {
- struct btree_op op;
- struct cache_set *c;
- struct bio *bio;
- u64 inode;
-};
-
-static void bch_read_requeue(struct cache_set *c, struct bio *bio)
-{
- unsigned long flags;
-
- spin_lock_irqsave(&c->read_race_lock, flags);
- bio_list_add(&c->read_race_list, bio);
- spin_unlock_irqrestore(&c->read_race_lock, flags);
- queue_work(c->wq, &c->read_race_work);
-}
-
-static void bch_read_endio(struct bio *bio)
-{
- struct bbio *b = to_bbio(bio);
- struct cache *ca = b->ca;
- struct bio *orig = bio->bi_private;
-
- bch_bbio_count_io_errors(b, bio->bi_error, "reading from cache");
-
- if (!bio->bi_error && ca &&
- (race_fault() || ptr_stale(ca->set, ca, &b->key, 0))) {
- /* Read bucket invalidate race */
- atomic_long_inc(&ca->set->cache_read_races);
- bch_read_requeue(ca->set, bio);
- } else {
- if (bio->bi_error)
- orig->bi_error = bio->bi_error;
-
- bio_endio(orig);
- bio_put(bio);
- }
-
- if (ca)
- percpu_ref_put(&ca->ref);
-}
-
-static inline void __bio_inc_remaining(struct bio *bio)
-{
- bio->bi_flags |= (1 << BIO_CHAIN);
- smp_mb__before_atomic();
- atomic_inc(&bio->__bi_remaining);
-}
-
-/* XXX: this looks a lot like cache_lookup_fn() */
-static int bch_read_fn(struct btree_op *b_op, struct btree *b, struct bkey *k)
-{
- struct bch_read_op *op = container_of(b_op,
- struct bch_read_op, op);
- struct bio *n, *bio = op->bio;
- struct bbio *bbio;
- int sectors, ret;
- unsigned ptr;
- struct cache *ca;
-
- BUG_ON(bkey_cmp(&START_KEY(k),
- &KEY(op->inode, bio->bi_iter.bi_sector, 0)) > 0);
-
- BUG_ON(bkey_cmp(k, &KEY(op->inode, bio->bi_iter.bi_sector, 0)) <= 0);
-
- sectors = KEY_OFFSET(k) - bio->bi_iter.bi_sector;
-
- ca = bch_extent_pick_ptr(b->c, k, &ptr);
- if (!ca) {
- if (!KEY_CACHED(k) && bch_extent_ptrs(k)) {
- bio_io_error(bio);
- return MAP_DONE;
- } else {
- unsigned bytes = min_t(unsigned, sectors,
- bio_sectors(bio)) << 9;
-
- swap(bio->bi_iter.bi_size, bytes);
- zero_fill_bio(bio);
- swap(bio->bi_iter.bi_size, bytes);
-
- bio_advance(bio, bytes);
-
- return bio->bi_iter.bi_size ? MAP_CONTINUE : MAP_DONE;
- }
- }
-
- PTR_BUCKET(b->c, ca, k, ptr)->read_prio = b->c->prio_clock[READ].hand;
-
- if (sectors >= bio_sectors(bio)) {
- n = bio_clone_fast(bio, GFP_NOIO, b->c->bio_split);
- ret = MAP_DONE;
- } else {
- n = bio_split(bio, sectors, GFP_NOIO, b->c->bio_split);
- ret = MAP_CONTINUE;
- }
-
- n->bi_private = bio;
- n->bi_end_io = bch_read_endio;
- __bio_inc_remaining(bio);
-
- bbio = to_bbio(n);
- bch_bkey_copy_single_ptr(&bbio->key, k, ptr);
-
- /* Trim the key to match what we're actually reading */
- bch_cut_front(&KEY(op->inode, n->bi_iter.bi_sector, 0), &bbio->key);
- bch_cut_back(&KEY(op->inode, bio_end_sector(n), 0), &bbio->key);
-
- bch_bbio_prep(bbio, ca);
-
- cache_promote(b->c, bbio, k, ptr);
-
- return ret;
-}
-
-int bch_read(struct cache_set *c, struct bio *bio, u64 inode)
-{
- struct bch_read_op op;
- int ret;
-
- bch_increment_clock(c, bio_sectors(bio), READ);
-
- bch_btree_op_init(&op.op, BTREE_ID_EXTENTS, -1);
- op.c = c;
- op.bio = bio;
- op.inode = inode;
-
- ret = bch_btree_map_keys(&op.op, c,
- &KEY(inode, bio->bi_iter.bi_sector, 0),
- bch_read_fn, MAP_HOLES);
- return ret < 0 ? ret : 0;
-}
-EXPORT_SYMBOL(bch_read);
-
-/**
- * bch_read_retry - re-submit a bio originally from bch_read()
- */
-static void bch_read_retry(struct bbio *bbio)
-{
- struct bio *bio = &bbio->bio;
- struct bio *parent;
- u64 inode;
-
- trace_bcache_read_retry(bio);
-
- /*
- * This used to be a leaf bio from bch_read_fn(), but
- * since we don't know what happened to the btree in
- * the meantime, we have to re-submit it via the
- * top-level bch_read() entry point. Before doing that,
- * we have to reset the bio, preserving the biovec.
- *
- * The inode, offset and size come from the bbio's key,
- * which was set by bch_read_fn().
- */
- inode = KEY_INODE(&bbio->key);
- parent = bio->bi_private;
-
- bch_bbio_reset(bbio);
- bio_chain(bio, parent);
-
- bch_read(bbio->ca->set, bio, inode);
- bio_endio(parent); /* for bio_chain() in bch_read_fn() */
- bio_endio(bio);
-}
-
-void bch_read_race_work(struct work_struct *work)
-{
- struct cache_set *c = container_of(work, struct cache_set,
- read_race_work);
- unsigned long flags;
- struct bio *bio;
-
- while (1) {
- spin_lock_irqsave(&c->read_race_lock, flags);
- bio = bio_list_pop(&c->read_race_list);
- spin_unlock_irqrestore(&c->read_race_lock, flags);
-
- if (!bio)
- break;
-
- bch_read_retry(to_bbio(bio));
- }
-}
-
/* struct search based code */
struct search {
diff --git a/drivers/md/bcache/request.h b/drivers/md/bcache/request.h
index 386f452f6951..edec16a917e2 100644
--- a/drivers/md/bcache/request.h
+++ b/drivers/md/bcache/request.h
@@ -8,61 +8,11 @@ struct cached_dev;
struct bcache_device;
struct kmem_cache;
-struct data_insert_op {
- struct closure cl;
- struct cache_set *c;
- struct workqueue_struct *io_wq;
- struct bio *bio;
-
- /* Used internally, do not touch */
- struct btree_op op;
-
- short error;
-
- union {
- u8 flags;
-
- struct {
- /* Wait for data bucket allocation or just
- * fail when out of space? */
- unsigned wait:1;
- /* Discard key range? */
- unsigned discard:1;
- /* Wait for journal commit? */
- unsigned flush:1;
- /* Perform a compare-exchange with replace_key? */
- unsigned replace:1;
-
- /* Set on completion, if cmpxchg index update failed */
- unsigned replace_collision:1;
- /* Internal */
- unsigned insert_data_done:1;
- };
- };
-
- u8 btree_alloc_reserve;
-
- struct write_point *wp;
- struct open_bucket *open_buckets[2];
-
- struct keylist insert_keys;
- BKEY_PADDED(insert_key);
- BKEY_PADDED(replace_key);
-};
-
-void bch_data_insert_op_init(struct data_insert_op *, struct cache_set *,
- struct bio *, struct write_point *, bool,
- bool, bool, struct bkey *, struct bkey *);
-
unsigned bch_get_congested(struct cache_set *);
-int bch_read(struct cache_set *, struct bio *, u64);
-void bch_data_insert(struct closure *cl);
void bch_cached_dev_request_init(struct cached_dev *dc);
void bch_flash_dev_request_init(struct bcache_device *d);
-void bch_read_race_work(struct work_struct *work);
-
extern struct kmem_cache *bch_search_cache;
#endif /* _BCACHE_REQUEST_H_ */
diff --git a/drivers/md/bcache/stats.h b/drivers/md/bcache/stats.h
index 0a0da6a460ee..39877f9aa132 100644
--- a/drivers/md/bcache/stats.h
+++ b/drivers/md/bcache/stats.h
@@ -1,57 +1,7 @@
#ifndef _BCACHE_STATS_H_
#define _BCACHE_STATS_H_
-struct cache_stat_collector {
- union {
- struct {
- atomic_t cache_hits;
- atomic_t cache_misses;
- atomic_t cache_bypass_hits;
- atomic_t cache_bypass_misses;
- };
-
- /* cache_hit_array[!bypass][!hit]: */
- atomic_t cache_hit_array[2][2];
- };
-
-
- atomic_t cache_readaheads;
- atomic_t cache_miss_collisions;
- atomic_t sectors_bypassed;
- atomic_t foreground_write_sectors;
- atomic_t gc_write_sectors;
- atomic_t discard_sectors;
-};
-
-struct cache_stats {
- struct kobject kobj;
-
- unsigned long cache_hits;
- unsigned long cache_misses;
- unsigned long cache_bypass_hits;
- unsigned long cache_bypass_misses;
- unsigned long cache_readaheads;
- unsigned long cache_miss_collisions;
- unsigned long sectors_bypassed;
- unsigned long foreground_write_sectors;
- unsigned long gc_write_sectors;
- unsigned long discard_sectors;
-
- unsigned rescale;
-};
-
-struct cache_accounting {
- struct closure cl;
- struct timer_list timer;
- atomic_t closing;
-
- struct cache_stat_collector collector;
-
- struct cache_stats total;
- struct cache_stats five_minute;
- struct cache_stats hour;
- struct cache_stats day;
-};
+#include "stats_types.h"
struct cache_set;
struct cached_dev;
@@ -62,4 +12,41 @@ int bch_cache_accounting_add_kobjs(struct cache_accounting *, struct kobject *);
void bch_cache_accounting_clear(struct cache_accounting *);
void bch_cache_accounting_destroy(struct cache_accounting *);
+static inline void mark_cache_stats(struct cache_stat_collector *stats,
+ bool hit, bool bypass)
+{
+ atomic_inc(&stats->cache_hit_array[!bypass][!hit]);
+}
+
+static inline void bch_mark_cache_accounting(struct cache_set *c,
+ struct cached_dev *dc,
+ bool hit, bool bypass)
+{
+ mark_cache_stats(&dc->accounting.collector, hit, bypass);
+ mark_cache_stats(&c->accounting.collector, hit, bypass);
+}
+
+static inline void bch_mark_sectors_bypassed(struct cache_set *c,
+ struct cached_dev *dc,
+ unsigned sectors)
+{
+ atomic_add(sectors, &dc->accounting.collector.sectors_bypassed);
+ atomic_add(sectors, &c->accounting.collector.sectors_bypassed);
+}
+
+static inline void bch_mark_gc_write(struct cache_set *c, int sectors)
+{
+ atomic_add(sectors, &c->accounting.collector.gc_write_sectors);
+}
+
+static inline void bch_mark_foreground_write(struct cache_set *c, int sectors)
+{
+ atomic_add(sectors, &c->accounting.collector.foreground_write_sectors);
+}
+
+static inline void bch_mark_discard(struct cache_set *c, int sectors)
+{
+ atomic_add(sectors, &c->accounting.collector.discard_sectors);
+}
+
#endif /* _BCACHE_STATS_H_ */
diff --git a/drivers/md/bcache/stats_types.h b/drivers/md/bcache/stats_types.h
new file mode 100644
index 000000000000..28e4c69e8e6d
--- /dev/null
+++ b/drivers/md/bcache/stats_types.h
@@ -0,0 +1,56 @@
+#ifndef _BCACHE_STATS_TYPES_H_
+#define _BCACHE_STATS_TYPES_H_
+
+struct cache_stat_collector {
+ union {
+ struct {
+ atomic_t cache_hits;
+ atomic_t cache_misses;
+ atomic_t cache_bypass_hits;
+ atomic_t cache_bypass_misses;
+ };
+
+ /* cache_hit_array[!bypass][!hit]: */
+ atomic_t cache_hit_array[2][2];
+ };
+
+
+ atomic_t cache_readaheads;
+ atomic_t cache_miss_collisions;
+ atomic_t sectors_bypassed;
+ atomic_t foreground_write_sectors;
+ atomic_t gc_write_sectors;
+ atomic_t discard_sectors;
+};
+
+struct cache_stats {
+ struct kobject kobj;
+
+ unsigned long cache_hits;
+ unsigned long cache_misses;
+ unsigned long cache_bypass_hits;
+ unsigned long cache_bypass_misses;
+ unsigned long cache_readaheads;
+ unsigned long cache_miss_collisions;
+ unsigned long sectors_bypassed;
+ unsigned long foreground_write_sectors;
+ unsigned long gc_write_sectors;
+ unsigned long discard_sectors;
+
+ unsigned rescale;
+};
+
+struct cache_accounting {
+ struct closure cl;
+ struct timer_list timer;
+ atomic_t closing;
+
+ struct cache_stat_collector collector;
+
+ struct cache_stats total;
+ struct cache_stats five_minute;
+ struct cache_stats hour;
+ struct cache_stats day;
+};
+
+#endif /* _BCACHE_STATS_TYPES_H_ */
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index c787db192728..6ad3b54f5bfa 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -10,6 +10,7 @@
#include "alloc.h"
#include "btree.h"
#include "debug.h"
+#include "io.h"
#include "journal.h"
#include "movinggc.h"
#include "request.h"
diff --git a/drivers/md/bcache/tier.c b/drivers/md/bcache/tier.c
index 162931d95ee8..c1724bc844d9 100644
--- a/drivers/md/bcache/tier.c
+++ b/drivers/md/bcache/tier.c
@@ -3,6 +3,7 @@
#include "btree.h"
#include "buckets.h"
#include "extents.h"
+#include "io.h"
#include "keybuf.h"
#include "move.h"
diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c
index 62a1c2f136ae..0843f378cd37 100644
--- a/drivers/md/bcache/writeback.c
+++ b/drivers/md/bcache/writeback.c
@@ -10,6 +10,7 @@
#include "btree.h"
#include "debug.h"
#include "extents.h"
+#include "io.h"
#include "keybuf.h"
#include "writeback.h"