-rw-r--r--  drivers/md/bcache/bcache.h       |  17
-rw-r--r--  drivers/md/bcache/btree.c        |   1
-rw-r--r--  drivers/md/bcache/debug.c        |   1
-rw-r--r--  drivers/md/bcache/io.c           | 748
-rw-r--r--  drivers/md/bcache/io.h           |  73
-rw-r--r--  drivers/md/bcache/journal.c      |   1
-rw-r--r--  drivers/md/bcache/move.c         |   1
-rw-r--r--  drivers/md/bcache/movinggc.c     |   1
-rw-r--r--  drivers/md/bcache/request.c      | 779
-rw-r--r--  drivers/md/bcache/request.h      |  50
-rw-r--r--  drivers/md/bcache/stats.h        |  89
-rw-r--r--  drivers/md/bcache/stats_types.h  |  56
-rw-r--r--  drivers/md/bcache/super.c        |   1
-rw-r--r--  drivers/md/bcache/tier.c         |   1
-rw-r--r--  drivers/md/bcache/writeback.c    |   1
15 files changed, 924 insertions, 896 deletions
diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index 81c5544e84ee..8b27e2471d0c 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -238,7 +238,7 @@ struct bucket {
u8 copygc_gen;
};
-#include "stats.h"
+#include "stats_types.h"
#include "inode.h"
struct search;
struct btree;
@@ -1172,21 +1172,6 @@ static inline void bch_check_mark_super(struct cache_set *c,
/* Forward declarations */
-void bch_count_io_errors(struct cache *, int, const char *);
-void bch_bbio_count_io_errors(struct bbio *, int, const char *);
-void bch_bbio_endio(struct bbio *, int, const char *);
-void bch_bbio_free(struct bio *, struct cache_set *);
-struct bio *bch_bbio_alloc(struct cache_set *);
-
-void bch_generic_make_request(struct bio *, struct cache_set *);
-void bch_bio_submit_work(struct work_struct *);
-void bch_bbio_prep(struct bbio *, struct cache *);
-void bch_submit_bbio(struct bbio *, struct cache *, struct bkey *,
- unsigned, bool);
-void bch_submit_bbio_replicas(struct bio *, struct cache_set *,
- struct bkey *, unsigned, bool);
-void bch_bbio_reset(struct bbio *bio);
-
__printf(2, 3)
bool bch_cache_set_error(struct cache_set *, const char *, ...);
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index cacd6c4b7ac4..1e01a5e77b26 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -26,6 +26,7 @@
#include "buckets.h"
#include "debug.h"
#include "extents.h"
+#include "io.h"
#include "journal.h"
#include "movinggc.h"
#include "writeback.h"
diff --git a/drivers/md/bcache/debug.c b/drivers/md/bcache/debug.c
index c2b81bebe3df..eb964573095f 100644
--- a/drivers/md/bcache/debug.c
+++ b/drivers/md/bcache/debug.c
@@ -9,6 +9,7 @@
#include "btree.h"
#include "debug.h"
#include "extents.h"
+#include "io.h"
#include "keybuf.h"
#include <linux/console.h>
diff --git a/drivers/md/bcache/io.c b/drivers/md/bcache/io.c
index d565ad1f496d..c4f6d1ec984c 100644
--- a/drivers/md/bcache/io.c
+++ b/drivers/md/bcache/io.c
@@ -6,13 +6,19 @@
*/
#include "bcache.h"
+#include "alloc.h"
#include "bset.h"
-#include "debug.h"
#include "btree.h"
+#include "debug.h"
#include "extents.h"
+#include "io.h"
+#include "keybuf.h"
+#include "stats.h"
#include <linux/blkdev.h>
+#include <trace/events/bcache.h>
+
void bch_generic_make_request(struct bio *bio, struct cache_set *c)
{
if (current->bio_list) {
@@ -234,3 +240,743 @@ void bch_bbio_endio(struct bbio *bio, int error, const char *m)
percpu_ref_put(&ca->ref);
closure_put(cl);
}
+
+/* */
+
+static void bch_data_insert_start(struct closure *);
+
+static void bio_csum(struct bio *bio, struct bkey *k)
+{
+ struct bio_vec bv;
+ struct bvec_iter iter;
+ u64 crc = 0xffffffffffffffffULL;
+
+ bio_for_each_segment(bv, bio, iter) {
+ void *d = kmap(bv.bv_page) + bv.bv_offset;
+
+ crc = bch_checksum_update(KEY_CSUM(k), crc, d, bv.bv_len);
+ kunmap(bv.bv_page);
+ }
+
+ k->val[bch_extent_ptrs(k)] = crc;
+}
+
+/* Writes */
+
+static int btree_insert_fn(struct btree_op *b_op, struct btree *b)
+{
+ struct data_insert_op *op = container_of(b_op,
+ struct data_insert_op, op);
+ struct bkey *replace_key = op->replace ? &op->replace_key : NULL;
+
+ int ret = bch_btree_insert_node(b, &op->op, &op->insert_keys,
+ replace_key,
+ op->flush ? &op->cl : NULL);
+ return bch_keylist_empty(&op->insert_keys) ? MAP_DONE : ret;
+}
+
+static void bch_data_insert_keys_done(struct closure *cl)
+{
+ struct data_insert_op *op = container_of(cl, struct data_insert_op, cl);
+ unsigned i;
+
+ if (op->op.insert_collision)
+ op->replace_collision = true;
+
+ for (i = 0; i < ARRAY_SIZE(op->open_buckets); i++)
+ if (op->open_buckets[i]) {
+ bch_open_bucket_put(op->c, op->open_buckets[i]);
+ op->open_buckets[i] = NULL;
+ }
+
+ if (!op->insert_data_done)
+ continue_at(cl, bch_data_insert_start, op->io_wq);
+
+ bch_keylist_free(&op->insert_keys);
+ closure_return(cl);
+}
+
+static void __bch_data_insert_keys(struct closure *cl)
+{
+ struct data_insert_op *op = container_of(cl, struct data_insert_op,
+ op.cl);
+ struct keylist *keys = &op->insert_keys;
+ int ret = 0;
+
+ while (!ret && !bch_keylist_empty(keys)) {
+ op->op.locks_want = 0;
+ ret = bch_btree_map_nodes(&op->op, op->c,
+ &START_KEY(keys->keys),
+ btree_insert_fn,
+ MAP_ASYNC);
+ }
+
+ if (ret == -EAGAIN)
+ continue_at(cl, __bch_data_insert_keys, op->c->wq);
+
+ closure_return(cl);
+}
+
+/**
+ * bch_data_insert_keys - insert extent btree keys for a write
+ */
+static void bch_data_insert_keys(struct closure *cl)
+{
+ struct data_insert_op *op = container_of(cl, struct data_insert_op, cl);
+ enum btree_id id = BTREE_ID_EXTENTS;
+
+ __bch_btree_op_init(&op->op, id, op->btree_alloc_reserve, 0);
+
+ closure_call(&op->op.cl, __bch_data_insert_keys, NULL, cl);
+ continue_at(cl, bch_data_insert_keys_done, op->c->wq);
+}
+
+/**
+ * bch_data_invalidate - discard range of keys
+ *
+ * Used to implement discard, and to handle the case where a writethrough
+ * write hits a write error on the cache device.
+ */
+static void bch_data_invalidate(struct closure *cl)
+{
+ struct data_insert_op *op = container_of(cl, struct data_insert_op, cl);
+ struct keylist *keys = &op->insert_keys;
+ struct bio *bio = op->bio;
+
+ pr_debug("invalidating %i sectors from %llu",
+ bio_sectors(bio), (u64) bio->bi_iter.bi_sector);
+
+ while (bio_sectors(bio)) {
+ unsigned sectors = min(bio_sectors(bio),
+ 1U << (KEY_SIZE_BITS - 1));
+
+ if (bch_keylist_realloc(keys, BKEY_U64s))
+ goto out;
+
+ bio->bi_iter.bi_sector += sectors;
+ bio->bi_iter.bi_size -= sectors << 9;
+
+ *keys->top = KEY(KEY_INODE(&op->insert_key),
+ bio->bi_iter.bi_sector, sectors);
+ SET_KEY_DELETED(keys->top, true);
+
+ bch_keylist_push(keys);
+ }
+
+ op->insert_data_done = true;
+ bio_put(bio);
+out:
+ continue_at(cl, bch_data_insert_keys, op->c->wq);
+}
+
+static void bch_data_insert_error(struct closure *cl)
+{
+ struct data_insert_op *op = container_of(cl, struct data_insert_op, cl);
+
+ /*
+ * Our data write just errored, which means we've got a bunch of keys to
+ * insert that point to data that wasn't successfully written.
+ *
+ * We don't have to insert those keys but we still have to invalidate
+ * that region of the cache - so, if we just strip off all the pointers
+ * from the keys we'll accomplish just that.
+ */
+
+ struct bkey *src = op->insert_keys.keys, *dst = op->insert_keys.keys;
+
+ while (src != op->insert_keys.top) {
+ struct bkey *n = bkey_next(src);
+
+ bch_set_extent_ptrs(src, 0);
+ memmove(dst, src, bkey_bytes(src));
+
+ dst = bkey_next(dst);
+ src = n;
+ }
+
+ op->insert_keys.top = dst;
+
+ bch_data_insert_keys(cl);
+}
+
+static void bch_data_insert_endio(struct bio *bio)
+{
+ struct closure *cl = bio->bi_private;
+ struct data_insert_op *op = container_of(cl, struct data_insert_op, cl);
+
+ if (bio->bi_error) {
+ /* TODO: We could try to recover from this. */
+ if (!KEY_CACHED(&op->insert_key))
+ op->error = bio->bi_error;
+ else if (!op->replace)
+ set_closure_fn(cl, bch_data_insert_error,
+ op->c->wq);
+ else
+ set_closure_fn(cl, NULL, NULL);
+ }
+
+ bch_bbio_endio(to_bbio(bio), bio->bi_error, "writing data to cache");
+}
+
+static void bch_data_insert_start(struct closure *cl)
+{
+ struct data_insert_op *op = container_of(cl, struct data_insert_op, cl);
+ struct bio *bio = op->bio, *n;
+ unsigned open_bucket_nr = 0, ptrs_from;
+ struct open_bucket *b;
+
+ if (op->discard)
+ return bch_data_invalidate(cl);
+
+ bch_extent_drop_stale(op->c, &op->insert_key);
+ ptrs_from = bch_extent_ptrs(&op->insert_key);
+
+ /*
+ * Journal writes are marked REQ_PREFLUSH; if the original write was a
+ * flush, it'll wait on the journal write.
+ */
+ bio->bi_opf &= ~(REQ_PREFLUSH|REQ_FUA);
+
+ do {
+ struct bkey *k;
+ struct bio_set *split = op->c->bio_split;
+
+ BUG_ON(bio_sectors(bio) != KEY_SIZE(&op->insert_key));
+
+ if (open_bucket_nr == ARRAY_SIZE(op->open_buckets))
+ continue_at(cl, bch_data_insert_keys,
+ op->c->wq);
+
+ /* for the device pointers and 1 for the chksum */
+ if (bch_keylist_realloc(&op->insert_keys,
+ BKEY_EXTENT_MAX_U64s +
+ (KEY_CSUM(&op->insert_key) ? 1 : 0)))
+ continue_at(cl, bch_data_insert_keys, op->c->wq);
+
+ k = op->insert_keys.top;
+ bkey_copy(k, &op->insert_key);
+
+ b = bch_alloc_sectors(op->c, op->wp, k, op->wait ? cl : NULL);
+ BUG_ON(!b);
+
+ if (PTR_ERR(b) == -EAGAIN) {
+ /* If we already have some keys, must insert them first
+ * before allocating another open bucket. We only hit
+ * this case if open_bucket_nr > 1. */
+ if (bch_keylist_empty(&op->insert_keys))
+ continue_at(cl, bch_data_insert_start,
+ op->io_wq);
+ else
+ continue_at(cl, bch_data_insert_keys,
+ op->c->wq);
+ } else if (IS_ERR(b))
+ goto err;
+
+ op->open_buckets[open_bucket_nr++] = b;
+
+ bch_cut_front(k, &op->insert_key);
+
+ n = bio_next_split(bio, KEY_SIZE(k), GFP_NOIO, split);
+ n->bi_end_io = bch_data_insert_endio;
+ n->bi_private = cl;
+
+ if (KEY_CSUM(k))
+ bio_csum(n, k);
+
+ trace_bcache_cache_insert(k);
+
+ bio_set_op_attrs(n, REQ_OP_WRITE, 0);
+ bch_submit_bbio_replicas(n, op->c, k, ptrs_from, false);
+
+ bch_extent_normalize(op->c, k);
+ bch_check_mark_super(op->c, k, false);
+
+ bch_keylist_push(&op->insert_keys);
+ } while (n != bio);
+
+ op->insert_data_done = true;
+ continue_at(cl, bch_data_insert_keys, op->c->wq);
+err:
+ if (KEY_CACHED(&op->insert_key)) {
+ /*
+ * If we were writing cached data, not doing the write is fine
+ * so long as we discard whatever would have been overwritten -
+ * then it's equivalent to doing the write and immediately
+ * reclaiming it.
+ */
+
+ op->discard = true;
+ return bch_data_invalidate(cl);
+ }
+
+ op->error = -ENOSPC;
+ op->insert_data_done = true;
+ bio_put(bio);
+
+ /*
+ * No reason not to insert keys for whatever data was successfully
+ * written (especially for a cmpxchg operation that's moving data
+ * around)
+ */
+ if (!bch_keylist_empty(&op->insert_keys))
+ continue_at(cl, bch_data_insert_keys, op->c->wq);
+ else
+ closure_return(cl);
+}
+
+/**
+ * bch_data_insert - handle a write to a cache device or flash only volume
+ *
+ * This is the starting point for any data to end up in a cache device; it could
+ * be from a normal write, or a writeback write, or a write to a flash only
+ * volume - it's also used by the moving garbage collector to compact data in
+ * mostly empty buckets.
+ *
+ * It first writes the data to the cache, creating a list of keys to be inserted
+ * (if the data won't fit in a single open bucket, there will be multiple keys);
+ * after the data is written it calls bch_journal, and after the keys have been
+ * added to the next journal write they're inserted into the btree.
+ *
+ * It inserts the data in op->bio; bi_sector is used for the key offset, and
+ * op->inode is used for the key inode.
+ *
+ * If op->discard is true, instead of inserting the data it invalidates the
+ * region of the cache represented by op->bio and op->inode.
+ */
+void bch_data_insert(struct closure *cl)
+{
+ struct data_insert_op *op = container_of(cl, struct data_insert_op, cl);
+ struct cache_set *c = op->c;
+ u64 inode = KEY_INODE(&op->insert_key);
+
+ trace_bcache_write(c, inode, op->bio, !KEY_CACHED(&op->insert_key),
+ op->discard);
+
+ if (!bio_sectors(op->bio)) {
+ WARN_ONCE(1, "bch_data_insert() called with empty bio");
+ closure_return(cl);
+ }
+
+ /*
+ * This ought to be initialized in bch_data_insert_op_init(), but struct
+ * cache_set isn't exported
+ */
+ if (!op->io_wq)
+ op->io_wq = op->c->wq;
+
+ if (!op->discard)
+ bch_increment_clock(c, bio_sectors(op->bio), WRITE);
+
+ if (!op->replace) {
+ /* XXX: discards may be for more sectors than max key size */
+
+ struct bkey start = KEY(inode, op->bio->bi_iter.bi_sector, 0);
+ struct bkey end = KEY(inode, bio_end_sector(op->bio), 0);
+
+ unsigned i;
+ struct cache *ca;
+
+ for_each_cache(ca, c, i)
+ bch_keybuf_check_overlapping(&ca->moving_gc_keys,
+ &start, &end);
+
+ bch_keybuf_check_overlapping(&c->tiering_keys,
+ &start, &end);
+ }
+
+ if (op->wp->ca)
+ bch_mark_gc_write(c, bio_sectors(op->bio));
+ else if (!op->discard)
+ bch_mark_foreground_write(c, bio_sectors(op->bio));
+ else
+ bch_mark_discard(c, bio_sectors(op->bio));
+
+ if (atomic64_sub_return(bio_sectors(op->bio),
+ &c->sectors_until_gc) < 0) {
+ set_gc_sectors(c);
+ wake_up_process(c->gc_thread);
+ }
+
+ SET_KEY_OFFSET(&op->insert_key, bio_end_sector(op->bio));
+ SET_KEY_SIZE(&op->insert_key, bio_sectors(op->bio));
+
+ bch_keylist_init(&op->insert_keys);
+ bio_get(op->bio);
+ continue_at_nobarrier(cl, bch_data_insert_start, NULL);
+}
+
+void bch_data_insert_op_init(struct data_insert_op *op,
+ struct cache_set *c,
+ struct bio *bio,
+ struct write_point *wp,
+ bool wait, bool discard, bool flush,
+ struct bkey *insert_key,
+ struct bkey *replace_key)
+{
+ if (!wp) {
+ unsigned wp_idx = hash_long((unsigned long) current,
+ ilog2(ARRAY_SIZE(c->write_points)));
+
+ BUG_ON(wp_idx > ARRAY_SIZE(c->write_points));
+ wp = &c->write_points[wp_idx];
+ }
+
+ op->c = c;
+ op->io_wq = NULL;
+ op->bio = bio;
+ op->error = 0;
+ op->flags = 0;
+ op->wait = wait;
+ op->discard = discard;
+ op->flush = flush;
+ op->wp = wp;
+ op->btree_alloc_reserve = BTREE_ID_EXTENTS;
+
+ memset(op->open_buckets, 0, sizeof(op->open_buckets));
+ bch_keylist_init(&op->insert_keys);
+ bkey_copy(&op->insert_key, insert_key);
+
+ if (replace_key) {
+ op->replace = true;
+ bkey_copy(&op->replace_key, replace_key);
+ }
+}
+
+/* Cache promotion on read */
+
+struct cache_promote_op {
+ struct closure cl;
+ struct bio *orig_bio;
+ struct data_insert_op iop;
+ bool stale; /* was the ptr stale after the read? */
+ struct bbio bio; /* must be last */
+};
+
+static void cache_promote_done(struct closure *cl)
+{
+ struct cache_promote_op *op = container_of(cl,
+ struct cache_promote_op, cl);
+ struct cache_set *c = op->iop.c;
+
+ if (op->iop.replace_collision) {
+ trace_bcache_promote_collision(&op->iop.replace_key);
+ atomic_inc(&c->accounting.collector.cache_miss_collisions);
+ }
+
+ bio_free_pages(op->iop.bio);
+ kfree(op);
+}
+
+static void cache_promote_write(struct closure *cl)
+{
+ struct cache_promote_op *op = container_of(cl,
+ struct cache_promote_op, cl);
+ struct bio *bio = op->iop.bio;
+
+ bio_reset(bio);
+ bio->bi_iter.bi_sector = KEY_START(&op->iop.insert_key);
+ bio->bi_iter.bi_size = KEY_SIZE(&op->iop.insert_key) << 9;
+ /* needed to reinit bi_vcnt so pages can be freed later */
+ bch_bio_map(bio, NULL);
+
+ bio_copy_data(op->orig_bio, bio);
+ op->orig_bio->bi_error = op->iop.error;
+ bio_endio(op->orig_bio);
+
+ if (!op->stale &&
+ !op->iop.error &&
+ !test_bit(CACHE_SET_STOPPING, &op->iop.c->flags))
+ closure_call(&op->iop.cl, bch_data_insert, NULL, cl);
+
+ closure_return_with_destructor(cl, cache_promote_done);
+}
+
+static void cache_promote_endio(struct bio *bio)
+{
+ struct bbio *b = to_bbio(bio);
+ struct cache_promote_op *op = container_of(b,
+ struct cache_promote_op, bio);
+
+ /*
+ * If the bucket was reused while our bio was in flight, we might have
+ * read the wrong data. Set s->error but not error so it doesn't get
+ * counted against the cache device, but we'll still reread the data
+ * from the backing device.
+ */
+
+ if (bio->bi_error)
+ op->iop.error = bio->bi_error;
+ else if (b->ca && ptr_stale(b->ca->set, b->ca, &b->key, 0))
+ op->stale = 1;
+
+ bch_bbio_endio(b, bio->bi_error, "reading from cache");
+}
+
+/**
+ * __cache_promote -- insert result of read bio into cache
+ *
+ * Used for backing devices and flash-only volumes.
+ *
+ * @orig_bio must actually be a bbio with a valid key.
+ */
+void __cache_promote(struct cache_set *c, struct bbio *orig_bio,
+ struct bkey *replace_key)
+{
+ struct cache_promote_op *op;
+ struct bio *bio;
+ unsigned pages = DIV_ROUND_UP(orig_bio->bio.bi_iter.bi_size, PAGE_SIZE);
+
+ /* XXX: readahead? */
+
+ op = kmalloc(sizeof(*op) + sizeof(struct bio_vec) * pages, GFP_NOIO);
+ if (!op)
+ goto out_submit;
+
+ /* clone the bbio */
+ memcpy(&op->bio, orig_bio, offsetof(struct bbio, bio));
+
+ bio = &op->bio.bio;
+ bio_init(bio);
+ bio_get(bio);
+ bio->bi_bdev = orig_bio->bio.bi_bdev;
+ bio->bi_iter.bi_sector = orig_bio->bio.bi_iter.bi_sector;
+ bio->bi_iter.bi_size = orig_bio->bio.bi_iter.bi_size;
+ bio->bi_end_io = cache_promote_endio;
+ bio->bi_private = &op->cl;
+ bio->bi_io_vec = bio->bi_inline_vecs;
+ bch_bio_map(bio, NULL);
+
+ if (bio_alloc_pages(bio, __GFP_NOWARN|GFP_NOIO))
+ goto out_free;
+
+ orig_bio->ca = NULL;
+
+ closure_init(&op->cl, &c->cl);
+ op->orig_bio = &orig_bio->bio;
+ op->stale = 0;
+
+ bch_data_insert_op_init(&op->iop, c, bio,
+ &c->tier_write_points[0],
+ false, false, false,
+ replace_key,
+ replace_key);
+
+ bch_cut_front(&START_KEY(&orig_bio->key), &op->iop.insert_key);
+ bch_cut_back(&orig_bio->key, &op->iop.insert_key);
+
+ trace_bcache_promote(&orig_bio->bio);
+
+ op->bio.submit_time_us = local_clock_us();
+ closure_bio_submit(bio, &op->cl);
+
+ continue_at(&op->cl, cache_promote_write, c->wq);
+out_free:
+ kfree(op);
+out_submit:
+ generic_make_request(&orig_bio->bio);
+}
+
+/**
+ * cache_promote - promote data stored in higher tiers
+ *
+ * Used for flash only volumes.
+ *
+ * @bio must actually be a bbio with valid key.
+ */
+bool cache_promote(struct cache_set *c, struct bbio *bio,
+ struct bkey *k, unsigned ptr)
+{
+ if (!CACHE_TIER(&c->members[PTR_DEV(k, ptr)])) {
+ generic_make_request(&bio->bio);
+ return 0;
+ }
+
+ __cache_promote(c, bio, k);
+ return 1;
+}
+
+/* Read */
+
+struct bch_read_op {
+ struct btree_op op;
+ struct cache_set *c;
+ struct bio *bio;
+ u64 inode;
+};
+
+static void bch_read_requeue(struct cache_set *c, struct bio *bio)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&c->read_race_lock, flags);
+ bio_list_add(&c->read_race_list, bio);
+ spin_unlock_irqrestore(&c->read_race_lock, flags);
+ queue_work(c->wq, &c->read_race_work);
+}
+
+static void bch_read_endio(struct bio *bio)
+{
+ struct bbio *b = to_bbio(bio);
+ struct cache *ca = b->ca;
+ struct bio *orig = bio->bi_private;
+
+ bch_bbio_count_io_errors(b, bio->bi_error, "reading from cache");
+
+ if (!bio->bi_error && ca &&
+ (race_fault() || ptr_stale(ca->set, ca, &b->key, 0))) {
+ /* Read bucket invalidate race */
+ atomic_long_inc(&ca->set->cache_read_races);
+ bch_read_requeue(ca->set, bio);
+ } else {
+ if (bio->bi_error)
+ orig->bi_error = bio->bi_error;
+
+ bio_endio(orig);
+ bio_put(bio);
+ }
+
+ if (ca)
+ percpu_ref_put(&ca->ref);
+}
+
+static inline void __bio_inc_remaining(struct bio *bio)
+{
+ bio->bi_flags |= (1 << BIO_CHAIN);
+ smp_mb__before_atomic();
+ atomic_inc(&bio->__bi_remaining);
+}
+
+/* XXX: this looks a lot like cache_lookup_fn() */
+static int bch_read_fn(struct btree_op *b_op, struct btree *b, struct bkey *k)
+{
+ struct bch_read_op *op = container_of(b_op,
+ struct bch_read_op, op);
+ struct bio *n, *bio = op->bio;
+ struct bbio *bbio;
+ int sectors, ret;
+ unsigned ptr;
+ struct cache *ca;
+
+ BUG_ON(bkey_cmp(&START_KEY(k),
+ &KEY(op->inode, bio->bi_iter.bi_sector, 0)) > 0);
+
+ BUG_ON(bkey_cmp(k, &KEY(op->inode, bio->bi_iter.bi_sector, 0)) <= 0);
+
+ sectors = KEY_OFFSET(k) - bio->bi_iter.bi_sector;
+
+ ca = bch_extent_pick_ptr(b->c, k, &ptr);
+ if (!ca) {
+ if (!KEY_CACHED(k) && bch_extent_ptrs(k)) {
+ bio_io_error(bio);
+ return MAP_DONE;
+ } else {
+ unsigned bytes = min_t(unsigned, sectors,
+ bio_sectors(bio)) << 9;
+
+ swap(bio->bi_iter.bi_size, bytes);
+ zero_fill_bio(bio);
+ swap(bio->bi_iter.bi_size, bytes);
+
+ bio_advance(bio, bytes);
+
+ return bio->bi_iter.bi_size ? MAP_CONTINUE : MAP_DONE;
+ }
+ }
+
+ PTR_BUCKET(b->c, ca, k, ptr)->read_prio = b->c->prio_clock[READ].hand;
+
+ if (sectors >= bio_sectors(bio)) {
+ n = bio_clone_fast(bio, GFP_NOIO, b->c->bio_split);
+ ret = MAP_DONE;
+ } else {
+ n = bio_split(bio, sectors, GFP_NOIO, b->c->bio_split);
+ ret = MAP_CONTINUE;
+ }
+
+ n->bi_private = bio;
+ n->bi_end_io = bch_read_endio;
+ __bio_inc_remaining(bio);
+
+ bbio = to_bbio(n);
+ bch_bkey_copy_single_ptr(&bbio->key, k, ptr);
+
+ /* Trim the key to match what we're actually reading */
+ bch_cut_front(&KEY(op->inode, n->bi_iter.bi_sector, 0), &bbio->key);
+ bch_cut_back(&KEY(op->inode, bio_end_sector(n), 0), &bbio->key);
+
+ bch_bbio_prep(bbio, ca);
+
+ cache_promote(b->c, bbio, k, ptr);
+
+ return ret;
+}
+
+int bch_read(struct cache_set *c, struct bio *bio, u64 inode)
+{
+ struct bch_read_op op;
+ int ret;
+
+ bch_increment_clock(c, bio_sectors(bio), READ);
+
+ bch_btree_op_init(&op.op, BTREE_ID_EXTENTS, -1);
+ op.c = c;
+ op.bio = bio;
+ op.inode = inode;
+
+ ret = bch_btree_map_keys(&op.op, c,
+ &KEY(inode, bio->bi_iter.bi_sector, 0),
+ bch_read_fn, MAP_HOLES);
+ return ret < 0 ? ret : 0;
+}
+EXPORT_SYMBOL(bch_read);
+
+/**
+ * bch_read_retry - re-submit a bio originally from bch_read()
+ */
+static void bch_read_retry(struct bbio *bbio)
+{
+ struct bio *bio = &bbio->bio;
+ struct bio *parent;
+ u64 inode;
+
+ trace_bcache_read_retry(bio);
+
+ /*
+ * This used to be a leaf bio from bch_read_fn(), but
+ * since we don't know what happened to the btree in
+ * the meantime, we have to re-submit it via the
+ * top-level bch_read() entry point. Before doing that,
+ * we have to reset the bio, preserving the biovec.
+ *
+ * The inode, offset and size come from the bbio's key,
+ * which was set by bch_read_fn().
+ */
+ inode = KEY_INODE(&bbio->key);
+ parent = bio->bi_private;
+
+ bch_bbio_reset(bbio);
+ bio_chain(bio, parent);
+
+ bch_read(bbio->ca->set, bio, inode);
+ bio_endio(parent); /* for bio_chain() in bch_read_fn() */
+ bio_endio(bio);
+}
+
+void bch_read_race_work(struct work_struct *work)
+{
+ struct cache_set *c = container_of(work, struct cache_set,
+ read_race_work);
+ unsigned long flags;
+ struct bio *bio;
+
+ while (1) {
+ spin_lock_irqsave(&c->read_race_lock, flags);
+ bio = bio_list_pop(&c->read_race_list);
+ spin_unlock_irqrestore(&c->read_race_lock, flags);
+
+ if (!bio)
+ break;
+
+ bch_read_retry(to_bbio(bio));
+ }
+}
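
The kernel-doc above bch_data_insert() describes the write path this patch moves into io.c. As a purely illustrative sketch (not part of the patch) of how a caller is expected to drive it, mirroring the pattern __cache_promote() uses in this file: example_insert, c, bio, insert_key and parent are placeholder names.

static void example_insert(struct data_insert_op *op, struct cache_set *c,
			   struct bio *bio, struct bkey *insert_key,
			   struct closure *parent)
{
	/*
	 * NULL write_point: bch_data_insert_op_init() picks one by hashing
	 * the current task into c->write_points.
	 */
	bch_data_insert_op_init(op, c, bio, NULL,
				true,	/* wait for data bucket allocation */
				false,	/* not a discard */
				false,	/* don't wait for the journal commit */
				insert_key,
				NULL);	/* no compare-exchange replace_key */

	/*
	 * bch_data_insert() runs asynchronously against op->cl; `parent'
	 * is held until the keys have been inserted into the btree.
	 */
	closure_call(&op->cl, bch_data_insert, NULL, parent);
}
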
diff --git a/drivers/md/bcache/io.h b/drivers/md/bcache/io.h
new file mode 100644
index 000000000000..2086bd6840ff
--- /dev/null
+++ b/drivers/md/bcache/io.h
@@ -0,0 +1,73 @@
+#ifndef _BCACHE_IO_H
+#define _BCACHE_IO_H
+
+struct data_insert_op {
+ struct closure cl;
+ struct cache_set *c;
+ struct workqueue_struct *io_wq;
+ struct bio *bio;
+
+ /* Used internally, do not touch */
+ struct btree_op op;
+
+ short error;
+
+ union {
+ u8 flags;
+
+ struct {
+ /* Wait for data bucket allocation or just
+ * fail when out of space? */
+ unsigned wait:1;
+ /* Discard key range? */
+ unsigned discard:1;
+ /* Wait for journal commit? */
+ unsigned flush:1;
+ /* Perform a compare-exchange with replace_key? */
+ unsigned replace:1;
+
+ /* Set on completion, if cmpxchg index update failed */
+ unsigned replace_collision:1;
+ /* Internal */
+ unsigned insert_data_done:1;
+ };
+ };
+
+ u8 btree_alloc_reserve;
+
+ struct write_point *wp;
+ struct open_bucket *open_buckets[2];
+
+ struct keylist insert_keys;
+ BKEY_PADDED(insert_key);
+ BKEY_PADDED(replace_key);
+};
+
+void bch_data_insert_op_init(struct data_insert_op *, struct cache_set *,
+ struct bio *, struct write_point *, bool,
+ bool, bool, struct bkey *, struct bkey *);
+void bch_data_insert(struct closure *cl);
+
+int bch_read(struct cache_set *, struct bio *, u64);
+
+void bch_count_io_errors(struct cache *, int, const char *);
+void bch_bbio_count_io_errors(struct bbio *, int, const char *);
+void bch_bbio_endio(struct bbio *, int, const char *);
+void bch_bbio_free(struct bio *, struct cache_set *);
+struct bio *bch_bbio_alloc(struct cache_set *);
+
+void bch_generic_make_request(struct bio *, struct cache_set *);
+void bch_bio_submit_work(struct work_struct *);
+void bch_bbio_prep(struct bbio *, struct cache *);
+void bch_submit_bbio(struct bbio *, struct cache *, struct bkey *,
+ unsigned, bool);
+void bch_submit_bbio_replicas(struct bio *, struct cache_set *,
+ struct bkey *, unsigned, bool);
+void bch_bbio_reset(struct bbio *bio);
+
+void __cache_promote(struct cache_set *, struct bbio *, struct bkey *);
+bool cache_promote(struct cache_set *, struct bbio *, struct bkey *, unsigned);
+
+void bch_read_race_work(struct work_struct *work);
+
+#endif /* _BCACHE_IO_H */
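
For the read side, io.h now exports bch_read() as the top-level entry point (it is also what bch_read_retry() re-submits through). A hedged sketch of a caller, e.g. a flash-only volume's request handler, follows; example_read, c, bio and inode are placeholder names.

static int example_read(struct cache_set *c, struct bio *bio, u64 inode)
{
	/*
	 * Walks the extent btree from (inode, bio->bi_iter.bi_sector),
	 * splitting the bio per extent; completed splits end the original
	 * bio via bch_read_endio(), and bucket-reuse races are requeued
	 * through bch_read_race_work().  Returns 0 unless the btree walk
	 * itself fails.
	 */
	return bch_read(c, bio, inode);
}
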
diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c
index ffc1d2151285..770b72755641 100644
--- a/drivers/md/bcache/journal.c
+++ b/drivers/md/bcache/journal.c
@@ -8,6 +8,7 @@
#include "btree.h"
#include "debug.h"
#include "extents.h"
+#include "io.h"
#include "journal.h"
#include <trace/events/bcache.h>
diff --git a/drivers/md/bcache/move.c b/drivers/md/bcache/move.c
index a1548ffd14a2..da407f9011e0 100644
--- a/drivers/md/bcache/move.c
+++ b/drivers/md/bcache/move.c
@@ -2,6 +2,7 @@
#include "bcache.h"
#include "btree.h"
#include "extents.h"
+#include "io.h"
#include "keybuf.h"
#include "move.h"
diff --git a/drivers/md/bcache/movinggc.c b/drivers/md/bcache/movinggc.c
index 83dcebabc54a..6c7445275aaa 100644
--- a/drivers/md/bcache/movinggc.c
+++ b/drivers/md/bcache/movinggc.c
@@ -8,6 +8,7 @@
#include "btree.h"
#include "buckets.h"
#include "extents.h"
+#include "io.h"
#include "keybuf.h"
#include "move.h"
#include "movinggc.h"
diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
index 66b927d78589..d0256d85859c 100644
--- a/drivers/md/bcache/request.c
+++ b/drivers/md/bcache/request.c
@@ -28,6 +28,7 @@
#include "btree.h"
#include "debug.h"
#include "extents.h"
+#include "io.h"
#include "journal.h"
#include "keybuf.h"
#include "request.h"
@@ -46,596 +47,6 @@
struct kmem_cache *bch_search_cache;
-static inline void mark_cache_stats(struct cache_stat_collector *stats,
- bool hit, bool bypass)
-{
- atomic_inc(&stats->cache_hit_array[!bypass][!hit]);
-}
-
-static inline void bch_mark_cache_accounting(struct cache_set *c,
- struct cached_dev *dc,
- bool hit, bool bypass)
-{
- mark_cache_stats(&dc->accounting.collector, hit, bypass);
- mark_cache_stats(&c->accounting.collector, hit, bypass);
-}
-
-static inline void bch_mark_sectors_bypassed(struct cache_set *c,
- struct cached_dev *dc,
- unsigned sectors)
-{
- atomic_add(sectors, &dc->accounting.collector.sectors_bypassed);
- atomic_add(sectors, &c->accounting.collector.sectors_bypassed);
-}
-
-static inline void bch_mark_gc_write(struct cache_set *c, int sectors)
-{
- atomic_add(sectors, &c->accounting.collector.gc_write_sectors);
-}
-
-static inline void bch_mark_foreground_write(struct cache_set *c, int sectors)
-{
- atomic_add(sectors, &c->accounting.collector.foreground_write_sectors);
-}
-
-static inline void bch_mark_discard(struct cache_set *c, int sectors)
-{
- atomic_add(sectors, &c->accounting.collector.discard_sectors);
-}
-
-static void bch_data_insert_start(struct closure *);
-
-static void bio_csum(struct bio *bio, struct bkey *k)
-{
- struct bio_vec bv;
- struct bvec_iter iter;
- u64 crc = 0xffffffffffffffffULL;
-
- bio_for_each_segment(bv, bio, iter) {
- void *d = kmap(bv.bv_page) + bv.bv_offset;
-
- crc = bch_checksum_update(KEY_CSUM(k), crc, d, bv.bv_len);
- kunmap(bv.bv_page);
- }
-
- k->val[bch_extent_ptrs(k)] = crc;
-}
-
-/* Insert data into cache */
-
-static int btree_insert_fn(struct btree_op *b_op, struct btree *b)
-{
- struct data_insert_op *op = container_of(b_op,
- struct data_insert_op, op);
- struct bkey *replace_key = op->replace ? &op->replace_key : NULL;
-
- int ret = bch_btree_insert_node(b, &op->op, &op->insert_keys,
- replace_key,
- op->flush ? &op->cl : NULL);
- return bch_keylist_empty(&op->insert_keys) ? MAP_DONE : ret;
-}
-
-static void bch_data_insert_keys_done(struct closure *cl)
-{
- struct data_insert_op *op = container_of(cl, struct data_insert_op, cl);
- unsigned i;
-
- if (op->op.insert_collision)
- op->replace_collision = true;
-
- for (i = 0; i < ARRAY_SIZE(op->open_buckets); i++)
- if (op->open_buckets[i]) {
- bch_open_bucket_put(op->c, op->open_buckets[i]);
- op->open_buckets[i] = NULL;
- }
-
- if (!op->insert_data_done)
- continue_at(cl, bch_data_insert_start, op->io_wq);
-
- bch_keylist_free(&op->insert_keys);
- closure_return(cl);
-}
-
-static void __bch_data_insert_keys(struct closure *cl)
-{
- struct data_insert_op *op = container_of(cl, struct data_insert_op,
- op.cl);
- struct keylist *keys = &op->insert_keys;
- int ret = 0;
-
- while (!ret && !bch_keylist_empty(keys)) {
- op->op.locks_want = 0;
- ret = bch_btree_map_nodes(&op->op, op->c,
- &START_KEY(keys->keys),
- btree_insert_fn,
- MAP_ASYNC);
- }
-
- if (ret == -EAGAIN)
- continue_at(cl, __bch_data_insert_keys, op->c->wq);
-
- closure_return(cl);
-}
-
-/**
- * bch_data_insert_keys - insert extent btree keys for a write
- */
-static void bch_data_insert_keys(struct closure *cl)
-{
- struct data_insert_op *op = container_of(cl, struct data_insert_op, cl);
- enum btree_id id = BTREE_ID_EXTENTS;
-
- __bch_btree_op_init(&op->op, id, op->btree_alloc_reserve, 0);
-
- closure_call(&op->op.cl, __bch_data_insert_keys, NULL, cl);
- continue_at(cl, bch_data_insert_keys_done, op->c->wq);
-}
-
-/**
- * bch_data_invalidate - discard range of keys
- *
- * Used to implement discard, and to handle when writethrough write hits
- * a write error on the cache device.
- */
-static void bch_data_invalidate(struct closure *cl)
-{
- struct data_insert_op *op = container_of(cl, struct data_insert_op, cl);
- struct keylist *keys = &op->insert_keys;
- struct bio *bio = op->bio;
-
- pr_debug("invalidating %i sectors from %llu",
- bio_sectors(bio), (uint64_t) bio->bi_iter.bi_sector);
-
- while (bio_sectors(bio)) {
- unsigned sectors = min(bio_sectors(bio),
- 1U << (KEY_SIZE_BITS - 1));
-
- if (bch_keylist_realloc(keys, BKEY_U64s))
- goto out;
-
- bio->bi_iter.bi_sector += sectors;
- bio->bi_iter.bi_size -= sectors << 9;
-
- *keys->top = KEY(KEY_INODE(&op->insert_key),
- bio->bi_iter.bi_sector, sectors);
- SET_KEY_DELETED(keys->top, true);
-
- bch_keylist_push(keys);
- }
-
- op->insert_data_done = true;
- bio_put(bio);
-out:
- continue_at(cl, bch_data_insert_keys, op->c->wq);
-}
-
-static void bch_data_insert_error(struct closure *cl)
-{
- struct data_insert_op *op = container_of(cl, struct data_insert_op, cl);
-
- /*
- * Our data write just errored, which means we've got a bunch of keys to
- * insert that point to data that wasn't successfully written.
- *
- * We don't have to insert those keys but we still have to invalidate
- * that region of the cache - so, if we just strip off all the pointers
- * from the keys we'll accomplish just that.
- */
-
- struct bkey *src = op->insert_keys.keys, *dst = op->insert_keys.keys;
-
- while (src != op->insert_keys.top) {
- struct bkey *n = bkey_next(src);
-
- bch_set_extent_ptrs(src, 0);
- memmove(dst, src, bkey_bytes(src));
-
- dst = bkey_next(dst);
- src = n;
- }
-
- op->insert_keys.top = dst;
-
- bch_data_insert_keys(cl);
-}
-
-static void bch_data_insert_endio(struct bio *bio)
-{
- struct closure *cl = bio->bi_private;
- struct data_insert_op *op = container_of(cl, struct data_insert_op, cl);
-
- if (bio->bi_error) {
- /* TODO: We could try to recover from this. */
- if (!KEY_CACHED(&op->insert_key))
- op->error = bio->bi_error;
- else if (!op->replace)
- set_closure_fn(cl, bch_data_insert_error,
- op->c->wq);
- else
- set_closure_fn(cl, NULL, NULL);
- }
-
- bch_bbio_endio(to_bbio(bio), bio->bi_error, "writing data to cache");
-}
-
-static void bch_data_insert_start(struct closure *cl)
-{
- struct data_insert_op *op = container_of(cl, struct data_insert_op, cl);
- struct bio *bio = op->bio, *n;
- unsigned open_bucket_nr = 0, ptrs_from;
- struct open_bucket *b;
-
- if (op->discard)
- return bch_data_invalidate(cl);
-
- bch_extent_drop_stale(op->c, &op->insert_key);
- ptrs_from = bch_extent_ptrs(&op->insert_key);
-
- /*
- * Journal writes are marked REQ_PREFLUSH; if the original write was a
- * flush, it'll wait on the journal write.
- */
- bio->bi_opf &= ~(REQ_PREFLUSH|REQ_FUA);
-
- do {
- struct bkey *k;
- struct bio_set *split = op->c->bio_split;
-
- BUG_ON(bio_sectors(bio) != KEY_SIZE(&op->insert_key));
-
- if (open_bucket_nr == ARRAY_SIZE(op->open_buckets))
- continue_at(cl, bch_data_insert_keys,
- op->c->wq);
-
- /* for the device pointers and 1 for the chksum */
- if (bch_keylist_realloc(&op->insert_keys,
- BKEY_EXTENT_MAX_U64s +
- (KEY_CSUM(&op->insert_key) ? 1 : 0)))
- continue_at(cl, bch_data_insert_keys, op->c->wq);
-
- k = op->insert_keys.top;
- bkey_copy(k, &op->insert_key);
-
- b = bch_alloc_sectors(op->c, op->wp, k, op->wait ? cl : NULL);
- BUG_ON(!b);
-
- if (PTR_ERR(b) == -EAGAIN) {
- /* If we already have some keys, must insert them first
- * before allocating another open bucket. We only hit
- * this case if open_bucket_nr > 1. */
- if (bch_keylist_empty(&op->insert_keys))
- continue_at(cl, bch_data_insert_start,
- op->io_wq);
- else
- continue_at(cl, bch_data_insert_keys,
- op->c->wq);
- } else if (IS_ERR(b))
- goto err;
-
- op->open_buckets[open_bucket_nr++] = b;
-
- bch_cut_front(k, &op->insert_key);
-
- n = bio_next_split(bio, KEY_SIZE(k), GFP_NOIO, split);
- n->bi_end_io = bch_data_insert_endio;
- n->bi_private = cl;
-
- if (KEY_CSUM(k))
- bio_csum(n, k);
-
- trace_bcache_cache_insert(k);
-
- bio_set_op_attrs(n, REQ_OP_WRITE, 0);
- bch_submit_bbio_replicas(n, op->c, k, ptrs_from, false);
-
- bch_extent_normalize(op->c, k);
- bch_check_mark_super(op->c, k, false);
-
- bch_keylist_push(&op->insert_keys);
- } while (n != bio);
-
- op->insert_data_done = true;
- continue_at(cl, bch_data_insert_keys, op->c->wq);
-err:
- if (KEY_CACHED(&op->insert_key)) {
- /*
- * If we were writing cached data, not doing the write is fine
- * so long as we discard whatever would have been overwritten -
- * then it's equivalent to doing the write and immediately
- * reclaiming it.
- */
-
- op->discard = true;
- return bch_data_invalidate(cl);
- }
-
- op->error = -ENOSPC;
- op->insert_data_done = true;
- bio_put(bio);
-
- /*
- * No reason not to insert keys for whatever data was successfully
- * written (especially for a cmpxchg operation that's moving data
- * around)
- */
- if (!bch_keylist_empty(&op->insert_keys))
- continue_at(cl, bch_data_insert_keys, op->c->wq);
- else
- closure_return(cl);
-}
-
-/**
- * bch_data_insert - handle a write to a cache device or flash only volume
- *
- * This is the starting point for any data to end up in a cache device; it could
- * be from a normal write, or a writeback write, or a write to a flash only
- * volume - it's also used by the moving garbage collector to compact data in
- * mostly empty buckets.
- *
- * It first writes the data to the cache, creating a list of keys to be inserted
- * (if the data won't fit in a single open bucket, there will be multiple keys);
- * after the data is written it calls bch_journal, and after the keys have been
- * added to the next journal write they're inserted into the btree.
- *
- * It inserts the data in op->bio; bi_sector is used for the key offset, and
- * op->inode is used for the key inode.
- *
- * If op->discard is true, instead of inserting the data it invalidates the
- * region of the cache represented by op->bio and op->inode.
- */
-void bch_data_insert(struct closure *cl)
-{
- struct data_insert_op *op = container_of(cl, struct data_insert_op, cl);
- struct cache_set *c = op->c;
- u64 inode = KEY_INODE(&op->insert_key);
-
- trace_bcache_write(c, inode, op->bio, !KEY_CACHED(&op->insert_key),
- op->discard);
-
- if (!bio_sectors(op->bio)) {
- WARN_ONCE(1, "bch_data_insert() called with empty bio");
- closure_return(cl);
- }
-
- /*
- * This ought to be initialized in bch_data_insert_op_init(), but struct
- * cache_set isn't exported
- */
- if (!op->io_wq)
- op->io_wq = op->c->wq;
-
- if (!op->discard)
- bch_increment_clock(c, bio_sectors(op->bio), WRITE);
-
- if (!op->replace) {
- /* XXX: discards may be for more sectors than max key size */
-
- struct bkey start = KEY(inode, op->bio->bi_iter.bi_sector, 0);
- struct bkey end = KEY(inode, bio_end_sector(op->bio), 0);
-
- unsigned i;
- struct cache *ca;
-
- for_each_cache(ca, c, i)
- bch_keybuf_check_overlapping(&ca->moving_gc_keys,
- &start, &end);
-
- bch_keybuf_check_overlapping(&c->tiering_keys,
- &start, &end);
- }
-
- if (op->wp->ca)
- bch_mark_gc_write(c, bio_sectors(op->bio));
- else if (!op->discard)
- bch_mark_foreground_write(c, bio_sectors(op->bio));
- else
- bch_mark_discard(c, bio_sectors(op->bio));
-
- if (atomic64_sub_return(bio_sectors(op->bio),
- &c->sectors_until_gc) < 0) {
- set_gc_sectors(c);
- wake_up_process(c->gc_thread);
- }
-
- SET_KEY_OFFSET(&op->insert_key, bio_end_sector(op->bio));
- SET_KEY_SIZE(&op->insert_key, bio_sectors(op->bio));
-
- bch_keylist_init(&op->insert_keys);
- bio_get(op->bio);
- continue_at_nobarrier(cl, bch_data_insert_start, NULL);
-}
-
-void bch_data_insert_op_init(struct data_insert_op *op,
- struct cache_set *c,
- struct bio *bio,
- struct write_point *wp,
- bool wait, bool discard, bool flush,
- struct bkey *insert_key,
- struct bkey *replace_key)
-{
- if (!wp) {
- unsigned wp_idx = hash_long((unsigned long) current,
- ilog2(ARRAY_SIZE(c->write_points)));
-
- BUG_ON(wp_idx > ARRAY_SIZE(c->write_points));
- wp = &c->write_points[wp_idx];
- }
-
- op->c = c;
- op->io_wq = NULL;
- op->bio = bio;
- op->error = 0;
- op->flags = 0;
- op->wait = wait;
- op->discard = discard;
- op->flush = flush;
- op->wp = wp;
- op->btree_alloc_reserve = BTREE_ID_EXTENTS;
-
- memset(op->open_buckets, 0, sizeof(op->open_buckets));
- bch_keylist_init(&op->insert_keys);
- bkey_copy(&op->insert_key, insert_key);
-
- if (replace_key) {
- op->replace = true;
- bkey_copy(&op->replace_key, replace_key);
- }
-}
-EXPORT_SYMBOL(bch_data_insert_op_init);
-
-/* Cache promotion on read */
-
-struct cache_promote_op {
- struct closure cl;
- struct bio *orig_bio;
- struct data_insert_op iop;
- bool stale; /* was the ptr stale after the read? */
- struct bbio bio; /* must be last */
-};
-
-static void cache_promote_done(struct closure *cl)
-{
- struct cache_promote_op *op = container_of(cl,
- struct cache_promote_op, cl);
- struct cache_set *c = op->iop.c;
-
- if (op->iop.replace_collision) {
- trace_bcache_promote_collision(&op->iop.replace_key);
- atomic_inc(&c->accounting.collector.cache_miss_collisions);
- }
-
- bio_free_pages(op->iop.bio);
- kfree(op);
-}
-
-static void cache_promote_write(struct closure *cl)
-{
- struct cache_promote_op *op = container_of(cl,
- struct cache_promote_op, cl);
- struct bio *bio = op->iop.bio;
-
- bio_reset(bio);
- bio->bi_iter.bi_sector = KEY_START(&op->iop.insert_key);
- bio->bi_iter.bi_size = KEY_SIZE(&op->iop.insert_key) << 9;
- /* needed to reinit bi_vcnt so pages can be freed later */
- bch_bio_map(bio, NULL);
-
- bio_copy_data(op->orig_bio, bio);
- op->orig_bio->bi_error = op->iop.error;
- bio_endio(op->orig_bio);
-
- if (!op->stale &&
- !op->iop.error &&
- !test_bit(CACHE_SET_STOPPING, &op->iop.c->flags))
- closure_call(&op->iop.cl, bch_data_insert, NULL, cl);
-
- closure_return_with_destructor(cl, cache_promote_done);
-}
-
-static void cache_promote_endio(struct bio *bio)
-{
- struct bbio *b = to_bbio(bio);
- struct cache_promote_op *op = container_of(b,
- struct cache_promote_op, bio);
-
- /*
- * If the bucket was reused while our bio was in flight, we might have
- * read the wrong data. Set s->error but not error so it doesn't get
- * counted against the cache device, but we'll still reread the data
- * from the backing device.
- */
-
- if (bio->bi_error)
- op->iop.error = bio->bi_error;
- else if (b->ca && ptr_stale(b->ca->set, b->ca, &b->key, 0))
- op->stale = 1;
-
- bch_bbio_endio(b, bio->bi_error, "reading from cache");
-}
-
-/**
- * __cache_promote -- insert result of read bio into cache
- *
- * Used for backing devices and flash-only volumes.
- *
- * @orig_bio must actually be a bbio with a valid key.
- */
-static void __cache_promote(struct cache_set *c, struct bbio *orig_bio,
- struct bkey *replace_key)
-{
- struct cache_promote_op *op;
- struct bio *bio;
- unsigned pages = DIV_ROUND_UP(orig_bio->bio.bi_iter.bi_size, PAGE_SIZE);
-
- /* XXX: readahead? */
-
- op = kmalloc(sizeof(*op) + sizeof(struct bio_vec) * pages, GFP_NOIO);
- if (!op)
- goto out_submit;
-
- /* clone the bbio */
- memcpy(&op->bio, orig_bio, offsetof(struct bbio, bio));
-
- bio = &op->bio.bio;
- bio_init(bio);
- bio_get(bio);
- bio->bi_bdev = orig_bio->bio.bi_bdev;
- bio->bi_iter.bi_sector = orig_bio->bio.bi_iter.bi_sector;
- bio->bi_iter.bi_size = orig_bio->bio.bi_iter.bi_size;
- bio->bi_end_io = cache_promote_endio;
- bio->bi_private = &op->cl;
- bio->bi_io_vec = bio->bi_inline_vecs;
- bch_bio_map(bio, NULL);
-
- if (bio_alloc_pages(bio, __GFP_NOWARN|GFP_NOIO))
- goto out_free;
-
- orig_bio->ca = NULL;
-
- closure_init(&op->cl, &c->cl);
- op->orig_bio = &orig_bio->bio;
- op->stale = 0;
-
- bch_data_insert_op_init(&op->iop, c, bio,
- &c->tier_write_points[0],
- false, false, false,
- replace_key,
- replace_key);
-
- bch_cut_front(&START_KEY(&orig_bio->key), &op->iop.insert_key);
- bch_cut_back(&orig_bio->key, &op->iop.insert_key);
-
- trace_bcache_promote(&orig_bio->bio);
-
- op->bio.submit_time_us = local_clock_us();
- closure_bio_submit(bio, &op->cl);
-
- continue_at(&op->cl, cache_promote_write, c->wq);
-out_free:
- kfree(op);
-out_submit:
- generic_make_request(&orig_bio->bio);
-}
-
-/**
- * cache_promote - promote data stored in higher tiers
- *
- * Used for flash only volumes.
- *
- * @bio must actually be a bbio with valid key.
- */
-static bool cache_promote(struct cache_set *c, struct bbio *bio,
- struct bkey *k, unsigned ptr)
-{
- if (!CACHE_TIER(&c->members[PTR_DEV(k, ptr)])) {
- generic_make_request(&bio->bio);
- return 0;
- }
-
- __cache_promote(c, bio, k);
- return 1;
-}
-
/* Congested? */
unsigned bch_get_congested(struct cache_set *c)
@@ -764,194 +175,6 @@ skip:
return true;
}
-/* Cache lookup */
-
-/* XXX: consolidate these somehow */
-
-struct bch_read_op {
- struct btree_op op;
- struct cache_set *c;
- struct bio *bio;
- u64 inode;
-};
-
-static void bch_read_requeue(struct cache_set *c, struct bio *bio)
-{
- unsigned long flags;
-
- spin_lock_irqsave(&c->read_race_lock, flags);
- bio_list_add(&c->read_race_list, bio);
- spin_unlock_irqrestore(&c->read_race_lock, flags);
- queue_work(c->wq, &c->read_race_work);
-}
-
-static void bch_read_endio(struct bio *bio)
-{
- struct bbio *b = to_bbio(bio);
- struct cache *ca = b->ca;
- struct bio *orig = bio->bi_private;
-
- bch_bbio_count_io_errors(b, bio->bi_error, "reading from cache");
-
- if (!bio->bi_error && ca &&
- (race_fault() || ptr_stale(ca->set, ca, &b->key, 0))) {
- /* Read bucket invalidate race */
- atomic_long_inc(&ca->set->cache_read_races);
- bch_read_requeue(ca->set, bio);
- } else {
- if (bio->bi_error)
- orig->bi_error = bio->bi_error;
-
- bio_endio(orig);
- bio_put(bio);
- }
-
- if (ca)
- percpu_ref_put(&ca->ref);
-}
-
-static inline void __bio_inc_remaining(struct bio *bio)
-{
- bio->bi_flags |= (1 << BIO_CHAIN);
- smp_mb__before_atomic();
- atomic_inc(&bio->__bi_remaining);
-}
-
-/* XXX: this looks a lot like cache_lookup_fn() */
-static int bch_read_fn(struct btree_op *b_op, struct btree *b, struct bkey *k)
-{
- struct bch_read_op *op = container_of(b_op,
- struct bch_read_op, op);
- struct bio *n, *bio = op->bio;
- struct bbio *bbio;
- int sectors, ret;
- unsigned ptr;
- struct cache *ca;
-
- BUG_ON(bkey_cmp(&START_KEY(k),
- &KEY(op->inode, bio->bi_iter.bi_sector, 0)) > 0);
-
- BUG_ON(bkey_cmp(k, &KEY(op->inode, bio->bi_iter.bi_sector, 0)) <= 0);
-
- sectors = KEY_OFFSET(k) - bio->bi_iter.bi_sector;
-
- ca = bch_extent_pick_ptr(b->c, k, &ptr);
- if (!ca) {
- if (!KEY_CACHED(k) && bch_extent_ptrs(k)) {
- bio_io_error(bio);
- return MAP_DONE;
- } else {
- unsigned bytes = min_t(unsigned, sectors,
- bio_sectors(bio)) << 9;
-
- swap(bio->bi_iter.bi_size, bytes);
- zero_fill_bio(bio);
- swap(bio->bi_iter.bi_size, bytes);
-
- bio_advance(bio, bytes);
-
- return bio->bi_iter.bi_size ? MAP_CONTINUE : MAP_DONE;
- }
- }
-
- PTR_BUCKET(b->c, ca, k, ptr)->read_prio = b->c->prio_clock[READ].hand;
-
- if (sectors >= bio_sectors(bio)) {
- n = bio_clone_fast(bio, GFP_NOIO, b->c->bio_split);
- ret = MAP_DONE;
- } else {
- n = bio_split(bio, sectors, GFP_NOIO, b->c->bio_split);
- ret = MAP_CONTINUE;
- }
-
- n->bi_private = bio;
- n->bi_end_io = bch_read_endio;
- __bio_inc_remaining(bio);
-
- bbio = to_bbio(n);
- bch_bkey_copy_single_ptr(&bbio->key, k, ptr);
-
- /* Trim the key to match what we're actually reading */
- bch_cut_front(&KEY(op->inode, n->bi_iter.bi_sector, 0), &bbio->key);
- bch_cut_back(&KEY(op->inode, bio_end_sector(n), 0), &bbio->key);
-
- bch_bbio_prep(bbio, ca);
-
- cache_promote(b->c, bbio, k, ptr);
-
- return ret;
-}
-
-int bch_read(struct cache_set *c, struct bio *bio, u64 inode)
-{
- struct bch_read_op op;
- int ret;
-
- bch_increment_clock(c, bio_sectors(bio), READ);
-
- bch_btree_op_init(&op.op, BTREE_ID_EXTENTS, -1);
- op.c = c;
- op.bio = bio;
- op.inode = inode;
-
- ret = bch_btree_map_keys(&op.op, c,
- &KEY(inode, bio->bi_iter.bi_sector, 0),
- bch_read_fn, MAP_HOLES);
- return ret < 0 ? ret : 0;
-}
-EXPORT_SYMBOL(bch_read);
-
-/**
- * bch_read_retry - re-submit a bio originally from bch_read()
- */
-static void bch_read_retry(struct bbio *bbio)
-{
- struct bio *bio = &bbio->bio;
- struct bio *parent;
- u64 inode;
-
- trace_bcache_read_retry(bio);
-
- /*
- * This used to be a leaf bio from bch_read_fn(), but
- * since we don't know what happened to the btree in
- * the meantime, we have to re-submit it via the
- * top-level bch_read() entry point. Before doing that,
- * we have to reset the bio, preserving the biovec.
- *
- * The inode, offset and size come from the bbio's key,
- * which was set by bch_read_fn().
- */
- inode = KEY_INODE(&bbio->key);
- parent = bio->bi_private;
-
- bch_bbio_reset(bbio);
- bio_chain(bio, parent);
-
- bch_read(bbio->ca->set, bio, inode);
- bio_endio(parent); /* for bio_chain() in bch_read_fn() */
- bio_endio(bio);
-}
-
-void bch_read_race_work(struct work_struct *work)
-{
- struct cache_set *c = container_of(work, struct cache_set,
- read_race_work);
- unsigned long flags;
- struct bio *bio;
-
- while (1) {
- spin_lock_irqsave(&c->read_race_lock, flags);
- bio = bio_list_pop(&c->read_race_list);
- spin_unlock_irqrestore(&c->read_race_lock, flags);
-
- if (!bio)
- break;
-
- bch_read_retry(to_bbio(bio));
- }
-}
-
/* struct search based code */
struct search {
diff --git a/drivers/md/bcache/request.h b/drivers/md/bcache/request.h
index 386f452f6951..edec16a917e2 100644
--- a/drivers/md/bcache/request.h
+++ b/drivers/md/bcache/request.h
@@ -8,61 +8,11 @@ struct cached_dev;
struct bcache_device;
struct kmem_cache;
-struct data_insert_op {
- struct closure cl;
- struct cache_set *c;
- struct workqueue_struct *io_wq;
- struct bio *bio;
-
- /* Used internally, do not touch */
- struct btree_op op;
-
- short error;
-
- union {
- u8 flags;
-
- struct {
- /* Wait for data bucket allocation or just
- * fail when out of space? */
- unsigned wait:1;
- /* Discard key range? */
- unsigned discard:1;
- /* Wait for journal commit? */
- unsigned flush:1;
- /* Perform a compare-exchange with replace_key? */
- unsigned replace:1;
-
- /* Set on completion, if cmpxchg index update failed */
- unsigned replace_collision:1;
- /* Internal */
- unsigned insert_data_done:1;
- };
- };
-
- u8 btree_alloc_reserve;
-
- struct write_point *wp;
- struct open_bucket *open_buckets[2];
-
- struct keylist insert_keys;
- BKEY_PADDED(insert_key);
- BKEY_PADDED(replace_key);
-};
-
-void bch_data_insert_op_init(struct data_insert_op *, struct cache_set *,
- struct bio *, struct write_point *, bool,
- bool, bool, struct bkey *, struct bkey *);
-
unsigned bch_get_congested(struct cache_set *);
-int bch_read(struct cache_set *, struct bio *, u64);
-void bch_data_insert(struct closure *cl);
void bch_cached_dev_request_init(struct cached_dev *dc);
void bch_flash_dev_request_init(struct bcache_device *d);
-void bch_read_race_work(struct work_struct *work);
-
extern struct kmem_cache *bch_search_cache;
#endif /* _BCACHE_REQUEST_H_ */
diff --git a/drivers/md/bcache/stats.h b/drivers/md/bcache/stats.h
index 0a0da6a460ee..39877f9aa132 100644
--- a/drivers/md/bcache/stats.h
+++ b/drivers/md/bcache/stats.h
@@ -1,57 +1,7 @@
#ifndef _BCACHE_STATS_H_
#define _BCACHE_STATS_H_
-struct cache_stat_collector {
- union {
- struct {
- atomic_t cache_hits;
- atomic_t cache_misses;
- atomic_t cache_bypass_hits;
- atomic_t cache_bypass_misses;
- };
-
- /* cache_hit_array[!bypass][!hit]: */
- atomic_t cache_hit_array[2][2];
- };
-
-
- atomic_t cache_readaheads;
- atomic_t cache_miss_collisions;
- atomic_t sectors_bypassed;
- atomic_t foreground_write_sectors;
- atomic_t gc_write_sectors;
- atomic_t discard_sectors;
-};
-
-struct cache_stats {
- struct kobject kobj;
-
- unsigned long cache_hits;
- unsigned long cache_misses;
- unsigned long cache_bypass_hits;
- unsigned long cache_bypass_misses;
- unsigned long cache_readaheads;
- unsigned long cache_miss_collisions;
- unsigned long sectors_bypassed;
- unsigned long foreground_write_sectors;
- unsigned long gc_write_sectors;
- unsigned long discard_sectors;
-
- unsigned rescale;
-};
-
-struct cache_accounting {
- struct closure cl;
- struct timer_list timer;
- atomic_t closing;
-
- struct cache_stat_collector collector;
-
- struct cache_stats total;
- struct cache_stats five_minute;
- struct cache_stats hour;
- struct cache_stats day;
-};
+#include "stats_types.h"
struct cache_set;
struct cached_dev;
@@ -62,4 +12,41 @@ int bch_cache_accounting_add_kobjs(struct cache_accounting *, struct kobject *);
void bch_cache_accounting_clear(struct cache_accounting *);
void bch_cache_accounting_destroy(struct cache_accounting *);
+static inline void mark_cache_stats(struct cache_stat_collector *stats,
+ bool hit, bool bypass)
+{
+ atomic_inc(&stats->cache_hit_array[!bypass][!hit]);
+}
+
+static inline void bch_mark_cache_accounting(struct cache_set *c,
+ struct cached_dev *dc,
+ bool hit, bool bypass)
+{
+ mark_cache_stats(&dc->accounting.collector, hit, bypass);
+ mark_cache_stats(&c->accounting.collector, hit, bypass);
+}
+
+static inline void bch_mark_sectors_bypassed(struct cache_set *c,
+ struct cached_dev *dc,
+ unsigned sectors)
+{
+ atomic_add(sectors, &dc->accounting.collector.sectors_bypassed);
+ atomic_add(sectors, &c->accounting.collector.sectors_bypassed);
+}
+
+static inline void bch_mark_gc_write(struct cache_set *c, int sectors)
+{
+ atomic_add(sectors, &c->accounting.collector.gc_write_sectors);
+}
+
+static inline void bch_mark_foreground_write(struct cache_set *c, int sectors)
+{
+ atomic_add(sectors, &c->accounting.collector.foreground_write_sectors);
+}
+
+static inline void bch_mark_discard(struct cache_set *c, int sectors)
+{
+ atomic_add(sectors, &c->accounting.collector.discard_sectors);
+}
+
#endif /* _BCACHE_STATS_H_ */
diff --git a/drivers/md/bcache/stats_types.h b/drivers/md/bcache/stats_types.h
new file mode 100644
index 000000000000..28e4c69e8e6d
--- /dev/null
+++ b/drivers/md/bcache/stats_types.h
@@ -0,0 +1,56 @@
+#ifndef _BCACHE_STATS_TYPES_H_
+#define _BCACHE_STATS_TYPES_H_
+
+struct cache_stat_collector {
+ union {
+ struct {
+ atomic_t cache_hits;
+ atomic_t cache_misses;
+ atomic_t cache_bypass_hits;
+ atomic_t cache_bypass_misses;
+ };
+
+ /* cache_hit_array[!bypass][!hit]: */
+ atomic_t cache_hit_array[2][2];
+ };
+
+
+ atomic_t cache_readaheads;
+ atomic_t cache_miss_collisions;
+ atomic_t sectors_bypassed;
+ atomic_t foreground_write_sectors;
+ atomic_t gc_write_sectors;
+ atomic_t discard_sectors;
+};
+
+struct cache_stats {
+ struct kobject kobj;
+
+ unsigned long cache_hits;
+ unsigned long cache_misses;
+ unsigned long cache_bypass_hits;
+ unsigned long cache_bypass_misses;
+ unsigned long cache_readaheads;
+ unsigned long cache_miss_collisions;
+ unsigned long sectors_bypassed;
+ unsigned long foreground_write_sectors;
+ unsigned long gc_write_sectors;
+ unsigned long discard_sectors;
+
+ unsigned rescale;
+};
+
+struct cache_accounting {
+ struct closure cl;
+ struct timer_list timer;
+ atomic_t closing;
+
+ struct cache_stat_collector collector;
+
+ struct cache_stats total;
+ struct cache_stats five_minute;
+ struct cache_stats hour;
+ struct cache_stats day;
+};
+
+#endif /* _BCACHE_STATS_TYPES_H_ */
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index c787db192728..6ad3b54f5bfa 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -10,6 +10,7 @@
#include "alloc.h"
#include "btree.h"
#include "debug.h"
+#include "io.h"
#include "journal.h"
#include "movinggc.h"
#include "request.h"
diff --git a/drivers/md/bcache/tier.c b/drivers/md/bcache/tier.c
index 162931d95ee8..c1724bc844d9 100644
--- a/drivers/md/bcache/tier.c
+++ b/drivers/md/bcache/tier.c
@@ -3,6 +3,7 @@
#include "btree.h"
#include "buckets.h"
#include "extents.h"
+#include "io.h"
#include "keybuf.h"
#include "move.h"
diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c
index 62a1c2f136ae..0843f378cd37 100644
--- a/drivers/md/bcache/writeback.c
+++ b/drivers/md/bcache/writeback.c
@@ -10,6 +10,7 @@
#include "btree.h"
#include "debug.h"
#include "extents.h"
+#include "io.h"
#include "keybuf.h"
#include "writeback.h"