author	Kent Overstreet <kent.overstreet@gmail.com>	2015-08-21 00:36:55 -0800
committer	Kent Overstreet <kent.overstreet@gmail.com>	2017-01-18 20:23:36 -0900
commit	e3c0969fad9374ce00c79bd66ddd50ccc98016ac (patch)
tree	d10573effb9f5c946524f34d7a3a0063598a8722
parent	5090b2ba94d70fbdcc1d97dd208b7fc25abdc2b8 (diff)
bcache: move top level read/write code to io.c
Signed-off-by: Kent Overstreet <kent.overstreet@gmail.com>
-rw-r--r--	drivers/md/bcache/bcache.h	 17
-rw-r--r--	drivers/md/bcache/btree.c	  1
-rw-r--r--	drivers/md/bcache/debug.c	  1
-rw-r--r--	drivers/md/bcache/io.c		748
-rw-r--r--	drivers/md/bcache/io.h		 73
-rw-r--r--	drivers/md/bcache/journal.c	  1
-rw-r--r--	drivers/md/bcache/move.c	  1
-rw-r--r--	drivers/md/bcache/movinggc.c	  1
-rw-r--r--	drivers/md/bcache/request.c	779
-rw-r--r--	drivers/md/bcache/request.h	 50
-rw-r--r--	drivers/md/bcache/stats.h	 89
-rw-r--r--	drivers/md/bcache/stats_types.h	 56
-rw-r--r--	drivers/md/bcache/super.c	  1
-rw-r--r--	drivers/md/bcache/tier.c	  1
-rw-r--r--	drivers/md/bcache/writeback.c	  1
15 files changed, 924 insertions, 896 deletions
diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index 81c5544e84ee..8b27e2471d0c 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -238,7 +238,7 @@ struct bucket {
 	u8 copygc_gen;
 };
 
-#include "stats.h"
+#include "stats_types.h"
 #include "inode.h"
 struct search;
 struct btree;
@@ -1172,21 +1172,6 @@ static inline void bch_check_mark_super(struct cache_set *c,
 
 /* Forward declarations */
 
-void bch_count_io_errors(struct cache *, int, const char *);
-void bch_bbio_count_io_errors(struct bbio *, int, const char *);
-void bch_bbio_endio(struct bbio *, int, const char *);
-void bch_bbio_free(struct bio *, struct cache_set *);
-struct bio *bch_bbio_alloc(struct cache_set *);
-
-void bch_generic_make_request(struct bio *, struct cache_set *);
-void bch_bio_submit_work(struct work_struct *);
-void bch_bbio_prep(struct bbio *, struct cache *);
-void bch_submit_bbio(struct bbio *, struct cache *, struct bkey *,
-		     unsigned, bool);
-void bch_submit_bbio_replicas(struct bio *, struct cache_set *,
-			      struct bkey *, unsigned, bool);
-void bch_bbio_reset(struct bbio *bio);
-
 __printf(2, 3)
 bool bch_cache_set_error(struct cache_set *, const char *, ...);
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index cacd6c4b7ac4..1e01a5e77b26 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -26,6 +26,7 @@
 #include "buckets.h"
 #include "debug.h"
 #include "extents.h"
+#include "io.h"
 #include "journal.h"
 #include "movinggc.h"
 #include "writeback.h"
diff --git a/drivers/md/bcache/debug.c b/drivers/md/bcache/debug.c
index c2b81bebe3df..eb964573095f 100644
--- a/drivers/md/bcache/debug.c
+++ b/drivers/md/bcache/debug.c
@@ -9,6 +9,7 @@
 #include "btree.h"
 #include "debug.h"
 #include "extents.h"
+#include "io.h"
 #include "keybuf.h"
 
 #include <linux/console.h>
diff --git a/drivers/md/bcache/io.c b/drivers/md/bcache/io.c
index d565ad1f496d..c4f6d1ec984c 100644
--- a/drivers/md/bcache/io.c
+++ b/drivers/md/bcache/io.c
@@ -6,13 +6,19 @@
  */
 
 #include "bcache.h"
+#include "alloc.h"
 #include "bset.h"
-#include "debug.h"
 #include "btree.h"
+#include "debug.h"
 #include "extents.h"
+#include "io.h"
+#include "keybuf.h"
+#include "stats.h"
 
 #include <linux/blkdev.h>
 
+#include <trace/events/bcache.h>
+
 void bch_generic_make_request(struct bio *bio, struct cache_set *c)
 {
 	if (current->bio_list) {
@@ -234,3 +240,743 @@ void bch_bbio_endio(struct bbio *bio, int error, const char *m)
 	percpu_ref_put(&ca->ref);
 	closure_put(cl);
 }
+
+/* */
+
+static void bch_data_insert_start(struct closure *);
+
+static void bio_csum(struct bio *bio, struct bkey *k)
+{
+	struct bio_vec bv;
+	struct bvec_iter iter;
+	u64 crc = 0xffffffffffffffffULL;
+
+	bio_for_each_segment(bv, bio, iter) {
+		void *d = kmap(bv.bv_page) + bv.bv_offset;
+
+		crc = bch_checksum_update(KEY_CSUM(k), crc, d, bv.bv_len);
+		kunmap(bv.bv_page);
+	}
+
+	k->val[bch_extent_ptrs(k)] = crc;
+}
+
+/* Writes */
+
+static int btree_insert_fn(struct btree_op *b_op, struct btree *b)
+{
+	struct data_insert_op *op = container_of(b_op,
+					struct data_insert_op, op);
+	struct bkey *replace_key = op->replace ? &op->replace_key : NULL;
+
+	int ret = bch_btree_insert_node(b, &op->op, &op->insert_keys,
+					replace_key,
+					op->flush ? &op->cl : NULL);
+	return bch_keylist_empty(&op->insert_keys) ? MAP_DONE : ret;
+}
+
+static void bch_data_insert_keys_done(struct closure *cl)
+{
+	struct data_insert_op *op = container_of(cl, struct data_insert_op, cl);
+	unsigned i;
+
+	if (op->op.insert_collision)
+		op->replace_collision = true;
+
+	for (i = 0; i < ARRAY_SIZE(op->open_buckets); i++)
+		if (op->open_buckets[i]) {
+			bch_open_bucket_put(op->c, op->open_buckets[i]);
+			op->open_buckets[i] = NULL;
+		}
+
+	if (!op->insert_data_done)
+		continue_at(cl, bch_data_insert_start, op->io_wq);
+
+	bch_keylist_free(&op->insert_keys);
+	closure_return(cl);
+}
+
+static void __bch_data_insert_keys(struct closure *cl)
+{
+	struct data_insert_op *op = container_of(cl, struct data_insert_op,
+						 op.cl);
+	struct keylist *keys = &op->insert_keys;
+	int ret = 0;
+
+	while (!ret && !bch_keylist_empty(keys)) {
+		op->op.locks_want = 0;
+		ret = bch_btree_map_nodes(&op->op, op->c,
+					  &START_KEY(keys->keys),
+					  btree_insert_fn,
+					  MAP_ASYNC);
+	}
+
+	if (ret == -EAGAIN)
+		continue_at(cl, __bch_data_insert_keys, op->c->wq);
+
+	closure_return(cl);
+}
+
+/**
+ * bch_data_insert_keys - insert extent btree keys for a write
+ */
+static void bch_data_insert_keys(struct closure *cl)
+{
+	struct data_insert_op *op = container_of(cl, struct data_insert_op, cl);
+	enum btree_id id = BTREE_ID_EXTENTS;
+
+	__bch_btree_op_init(&op->op, id, op->btree_alloc_reserve, 0);
+
+	closure_call(&op->op.cl, __bch_data_insert_keys, NULL, cl);
+	continue_at(cl, bch_data_insert_keys_done, op->c->wq);
+}
+
+/**
+ * bch_data_invalidate - discard range of keys
+ *
+ * Used to implement discard, and to handle when writethrough write hits
+ * a write error on the cache device.
+ */
+static void bch_data_invalidate(struct closure *cl)
+{
+	struct data_insert_op *op = container_of(cl, struct data_insert_op, cl);
+	struct keylist *keys = &op->insert_keys;
+	struct bio *bio = op->bio;
+
+	pr_debug("invalidating %i sectors from %llu",
+		 bio_sectors(bio), (u64) bio->bi_iter.bi_sector);
+
+	while (bio_sectors(bio)) {
+		unsigned sectors = min(bio_sectors(bio),
+				       1U << (KEY_SIZE_BITS - 1));
+
+		if (bch_keylist_realloc(keys, BKEY_U64s))
+			goto out;
+
+		bio->bi_iter.bi_sector += sectors;
+		bio->bi_iter.bi_size -= sectors << 9;
+
+		*keys->top = KEY(KEY_INODE(&op->insert_key),
+				 bio->bi_iter.bi_sector, sectors);
+		SET_KEY_DELETED(keys->top, true);
+
+		bch_keylist_push(keys);
+	}
+
+	op->insert_data_done = true;
+	bio_put(bio);
+out:
+	continue_at(cl, bch_data_insert_keys, op->c->wq);
+}
+
+static void bch_data_insert_error(struct closure *cl)
+{
+	struct data_insert_op *op = container_of(cl, struct data_insert_op, cl);
+
+	/*
+	 * Our data write just errored, which means we've got a bunch of keys to
+	 * insert that point to data that wasn't successfully written.
+	 *
+	 * We don't have to insert those keys but we still have to invalidate
+	 * that region of the cache - so, if we just strip off all the pointers
+	 * from the keys we'll accomplish just that.
+	 */
+
+	struct bkey *src = op->insert_keys.keys, *dst = op->insert_keys.keys;
+
+	while (src != op->insert_keys.top) {
+		struct bkey *n = bkey_next(src);
+
+		bch_set_extent_ptrs(src, 0);
+		memmove(dst, src, bkey_bytes(src));
+
+		dst = bkey_next(dst);
+		src = n;
+	}
+
+	op->insert_keys.top = dst;
+
+	bch_data_insert_keys(cl);
+}
+
+static void bch_data_insert_endio(struct bio *bio)
+{
+	struct closure *cl = bio->bi_private;
+	struct data_insert_op *op = container_of(cl, struct data_insert_op, cl);
+
+	if (bio->bi_error) {
+		/* TODO: We could try to recover from this. */
+		if (!KEY_CACHED(&op->insert_key))
+			op->error = bio->bi_error;
+		else if (!op->replace)
+			set_closure_fn(cl, bch_data_insert_error,
+				       op->c->wq);
+		else
+			set_closure_fn(cl, NULL, NULL);
+	}
+
+	bch_bbio_endio(to_bbio(bio), bio->bi_error, "writing data to cache");
+}
+
+static void bch_data_insert_start(struct closure *cl)
+{
+	struct data_insert_op *op = container_of(cl, struct data_insert_op, cl);
+	struct bio *bio = op->bio, *n;
+	unsigned open_bucket_nr = 0, ptrs_from;
+	struct open_bucket *b;
+
+	if (op->discard)
+		return bch_data_invalidate(cl);
+
+	bch_extent_drop_stale(op->c, &op->insert_key);
+	ptrs_from = bch_extent_ptrs(&op->insert_key);
+
+	/*
+	 * Journal writes are marked REQ_PREFLUSH; if the original write was a
+	 * flush, it'll wait on the journal write.
+	 */
+	bio->bi_opf &= ~(REQ_PREFLUSH|REQ_FUA);
+
+	do {
+		struct bkey *k;
+		struct bio_set *split = op->c->bio_split;
+
+		BUG_ON(bio_sectors(bio) != KEY_SIZE(&op->insert_key));
+
+		if (open_bucket_nr == ARRAY_SIZE(op->open_buckets))
+			continue_at(cl, bch_data_insert_keys,
+				    op->c->wq);
+
+		/* for the device pointers and 1 for the chksum */
+		if (bch_keylist_realloc(&op->insert_keys,
+					BKEY_EXTENT_MAX_U64s +
+					(KEY_CSUM(&op->insert_key) ? 1 : 0)))
+			continue_at(cl, bch_data_insert_keys, op->c->wq);
+
+		k = op->insert_keys.top;
+		bkey_copy(k, &op->insert_key);
+
+		b = bch_alloc_sectors(op->c, op->wp, k, op->wait ? cl : NULL);
+		BUG_ON(!b);
+
+		if (PTR_ERR(b) == -EAGAIN) {
+			/* If we already have some keys, must insert them first
+			 * before allocating another open bucket. We only hit
+			 * this case if open_bucket_nr > 1. */
+			if (bch_keylist_empty(&op->insert_keys))
+				continue_at(cl, bch_data_insert_start,
+					    op->io_wq);
+			else
+				continue_at(cl, bch_data_insert_keys,
+					    op->c->wq);
+		} else if (IS_ERR(b))
+			goto err;
+
+		op->open_buckets[open_bucket_nr++] = b;
+
+		bch_cut_front(k, &op->insert_key);
+
+		n = bio_next_split(bio, KEY_SIZE(k), GFP_NOIO, split);
+		n->bi_end_io = bch_data_insert_endio;
+		n->bi_private = cl;
+
+		if (KEY_CSUM(k))
+			bio_csum(n, k);
+
+		trace_bcache_cache_insert(k);
+
+		bio_set_op_attrs(n, REQ_OP_WRITE, 0);
+		bch_submit_bbio_replicas(n, op->c, k, ptrs_from, false);
+
+		bch_extent_normalize(op->c, k);
+		bch_check_mark_super(op->c, k, false);
+
+		bch_keylist_push(&op->insert_keys);
+	} while (n != bio);
+
+	op->insert_data_done = true;
+	continue_at(cl, bch_data_insert_keys, op->c->wq);
+err:
+	if (KEY_CACHED(&op->insert_key)) {
+		/*
+		 * If we were writing cached data, not doing the write is fine
+		 * so long as we discard whatever would have been overwritten -
+		 * then it's equivalent to doing the write and immediately
+		 * reclaiming it.
+		 */
+
+		op->discard = true;
+		return bch_data_invalidate(cl);
+	}
+
+	op->error = -ENOSPC;
+	op->insert_data_done = true;
+	bio_put(bio);
+
+	/*
+	 * No reason not to insert keys for whatever data was successfully
+	 * written (especially for a cmpxchg operation that's moving data
+	 * around)
+	 */
+	if (!bch_keylist_empty(&op->insert_keys))
+		continue_at(cl, bch_data_insert_keys, op->c->wq);
+	else
+		closure_return(cl);
+}
+
+/**
+ * bch_data_insert - handle a write to a cache device or flash only volume
+ *
+ * This is the starting point for any data to end up in a cache device; it could
+ * be from a normal write, or a writeback write, or a write to a flash only
+ * volume - it's also used by the moving garbage collector to compact data in
+ * mostly empty buckets.
+ *
+ * It first writes the data to the cache, creating a list of keys to be inserted
+ * (if the data won't fit in a single open bucket, there will be multiple keys);
+ * after the data is written it calls bch_journal, and after the keys have been
+ * added to the next journal write they're inserted into the btree.
+ *
+ * It inserts the data in op->bio; bi_sector is used for the key offset, and
+ * op->inode is used for the key inode.
+ *
+ * If op->discard is true, instead of inserting the data it invalidates the
+ * region of the cache represented by op->bio and op->inode.
+ */
+void bch_data_insert(struct closure *cl)
+{
+	struct data_insert_op *op = container_of(cl, struct data_insert_op, cl);
+	struct cache_set *c = op->c;
+	u64 inode = KEY_INODE(&op->insert_key);
+
+	trace_bcache_write(c, inode, op->bio, !KEY_CACHED(&op->insert_key),
+			   op->discard);
+
+	if (!bio_sectors(op->bio)) {
+		WARN_ONCE(1, "bch_data_insert() called with empty bio");
+		closure_return(cl);
+	}
+
+	/*
+	 * This ought to be initialized in bch_data_insert_op_init(), but struct
+	 * cache_set isn't exported
+	 */
+	if (!op->io_wq)
+		op->io_wq = op->c->wq;
+
+	if (!op->discard)
+		bch_increment_clock(c, bio_sectors(op->bio), WRITE);
+
+	if (!op->replace) {
+		/* XXX: discards may be for more sectors than max key size */
+
+		struct bkey start = KEY(inode, op->bio->bi_iter.bi_sector, 0);
+		struct bkey end = KEY(inode, bio_end_sector(op->bio), 0);
+
+		unsigned i;
+		struct cache *ca;
+
+		for_each_cache(ca, c, i)
+			bch_keybuf_check_overlapping(&ca->moving_gc_keys,
+						     &start, &end);
+
+		bch_keybuf_check_overlapping(&c->tiering_keys,
+					     &start, &end);
+	}
+
+	if (op->wp->ca)
+		bch_mark_gc_write(c, bio_sectors(op->bio));
+	else if (!op->discard)
+		bch_mark_foreground_write(c, bio_sectors(op->bio));
+	else
+		bch_mark_discard(c, bio_sectors(op->bio));
+
+	if (atomic64_sub_return(bio_sectors(op->bio),
+				&c->sectors_until_gc) < 0) {
+		set_gc_sectors(c);
+		wake_up_process(c->gc_thread);
+	}
+
+	SET_KEY_OFFSET(&op->insert_key, bio_end_sector(op->bio));
+	SET_KEY_SIZE(&op->insert_key, bio_sectors(op->bio));
+
+	bch_keylist_init(&op->insert_keys);
+	bio_get(op->bio);
+	continue_at_nobarrier(cl, bch_data_insert_start, NULL);
+}
+
+void bch_data_insert_op_init(struct data_insert_op *op,
+			     struct cache_set *c,
+			     struct bio *bio,
+			     struct write_point *wp,
+			     bool wait, bool discard, bool flush,
+			     struct bkey *insert_key,
+			     struct bkey *replace_key)
+{
+	if (!wp) {
+		unsigned wp_idx = hash_long((unsigned long) current,
+					    ilog2(ARRAY_SIZE(c->write_points)));
+
+		BUG_ON(wp_idx > ARRAY_SIZE(c->write_points));
+		wp = &c->write_points[wp_idx];
+	}
+
+	op->c = c;
+	op->io_wq = NULL;
+	op->bio = bio;
+	op->error = 0;
+	op->flags = 0;
+	op->wait = wait;
+	op->discard = discard;
+	op->flush = flush;
+	op->wp = wp;
+	op->btree_alloc_reserve = BTREE_ID_EXTENTS;
+
+	memset(op->open_buckets, 0, sizeof(op->open_buckets));
+	bch_keylist_init(&op->insert_keys);
+	bkey_copy(&op->insert_key, insert_key);
+
+	if (replace_key) {
+		op->replace = true;
+		bkey_copy(&op->replace_key, replace_key);
+	}
+}
+
+/* Cache promotion on read */
+
+struct cache_promote_op {
+	struct closure cl;
+	struct bio *orig_bio;
+	struct data_insert_op iop;
+	bool stale; /* was the ptr stale after the read? */
+	struct bbio bio; /* must be last */
+};
+
+static void cache_promote_done(struct closure *cl)
+{
+	struct cache_promote_op *op = container_of(cl,
+					struct cache_promote_op, cl);
+	struct cache_set *c = op->iop.c;
+
+	if (op->iop.replace_collision) {
+		trace_bcache_promote_collision(&op->iop.replace_key);
+		atomic_inc(&c->accounting.collector.cache_miss_collisions);
+	}
+
+	bio_free_pages(op->iop.bio);
+	kfree(op);
+}
+
+static void cache_promote_write(struct closure *cl)
+{
+	struct cache_promote_op *op = container_of(cl,
+					struct cache_promote_op, cl);
+	struct bio *bio = op->iop.bio;
+
+	bio_reset(bio);
+	bio->bi_iter.bi_sector = KEY_START(&op->iop.insert_key);
+	bio->bi_iter.bi_size = KEY_SIZE(&op->iop.insert_key) << 9;
+	/* needed to reinit bi_vcnt so pages can be freed later */
+	bch_bio_map(bio, NULL);
+
+	bio_copy_data(op->orig_bio, bio);
+	op->orig_bio->bi_error = op->iop.error;
+	bio_endio(op->orig_bio);
+
+	if (!op->stale &&
+	    !op->iop.error &&
+	    !test_bit(CACHE_SET_STOPPING, &op->iop.c->flags))
+		closure_call(&op->iop.cl, bch_data_insert, NULL, cl);
+
+	closure_return_with_destructor(cl, cache_promote_done);
+}
+
+static void cache_promote_endio(struct bio *bio)
+{
+	struct bbio *b = to_bbio(bio);
+	struct cache_promote_op *op = container_of(b,
+					struct cache_promote_op, bio);
+
+	/*
+	 * If the bucket was reused while our bio was in flight, we might have
+	 * read the wrong data. Set s->error but not error so it doesn't get
+	 * counted against the cache device, but we'll still reread the data
+	 * from the backing device.
+	 */
+
+	if (bio->bi_error)
+		op->iop.error = bio->bi_error;
+	else if (b->ca && ptr_stale(b->ca->set, b->ca, &b->key, 0))
+		op->stale = 1;
+
+	bch_bbio_endio(b, bio->bi_error, "reading from cache");
+}
+
+/**
+ * __cache_promote -- insert result of read bio into cache
+ *
+ * Used for backing devices and flash-only volumes.
+ *
+ * @orig_bio must actually be a bbio with a valid key.
+ */
+void __cache_promote(struct cache_set *c, struct bbio *orig_bio,
+		     struct bkey *replace_key)
+{
+	struct cache_promote_op *op;
+	struct bio *bio;
+	unsigned pages = DIV_ROUND_UP(orig_bio->bio.bi_iter.bi_size, PAGE_SIZE);
+
+	/* XXX: readahead? */
+
+	op = kmalloc(sizeof(*op) + sizeof(struct bio_vec) * pages, GFP_NOIO);
+	if (!op)
+		goto out_submit;
+
+	/* clone the bbio */
+	memcpy(&op->bio, orig_bio, offsetof(struct bbio, bio));
+
+	bio = &op->bio.bio;
+	bio_init(bio);
+	bio_get(bio);
+	bio->bi_bdev = orig_bio->bio.bi_bdev;
+	bio->bi_iter.bi_sector = orig_bio->bio.bi_iter.bi_sector;
+	bio->bi_iter.bi_size = orig_bio->bio.bi_iter.bi_size;
+	bio->bi_end_io = cache_promote_endio;
+	bio->bi_private = &op->cl;
+	bio->bi_io_vec = bio->bi_inline_vecs;
+	bch_bio_map(bio, NULL);
+
+	if (bio_alloc_pages(bio, __GFP_NOWARN|GFP_NOIO))
+		goto out_free;
+
+	orig_bio->ca = NULL;
+
+	closure_init(&op->cl, &c->cl);
+	op->orig_bio = &orig_bio->bio;
+	op->stale = 0;
+
+	bch_data_insert_op_init(&op->iop, c, bio,
+				&c->tier_write_points[0],
+				false, false, false,
+				replace_key,
+				replace_key);
+
+	bch_cut_front(&START_KEY(&orig_bio->key), &op->iop.insert_key);
+	bch_cut_back(&orig_bio->key, &op->iop.insert_key);
+
+	trace_bcache_promote(&orig_bio->bio);
+
+	op->bio.submit_time_us = local_clock_us();
+	closure_bio_submit(bio, &op->cl);
+
+	continue_at(&op->cl, cache_promote_write, c->wq);
+out_free:
+	kfree(op);
+out_submit:
+	generic_make_request(&orig_bio->bio);
+}
+
+/**
+ * cache_promote - promote data stored in higher tiers
+ *
+ * Used for flash only volumes.
+ *
+ * @bio must actually be a bbio with valid key.
+ */
+bool cache_promote(struct cache_set *c, struct bbio *bio,
+		   struct bkey *k, unsigned ptr)
+{
+	if (!CACHE_TIER(&c->members[PTR_DEV(k, ptr)])) {
+		generic_make_request(&bio->bio);
+		return 0;
+	}
+
+	__cache_promote(c, bio, k);
+	return 1;
+}
+
+/* Read */
+
+struct bch_read_op {
+	struct btree_op op;
+	struct cache_set *c;
+	struct bio *bio;
+	u64 inode;
+};
+
+static void bch_read_requeue(struct cache_set *c, struct bio *bio)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&c->read_race_lock, flags);
+	bio_list_add(&c->read_race_list, bio);
+	spin_unlock_irqrestore(&c->read_race_lock, flags);
+	queue_work(c->wq, &c->read_race_work);
+}
+
+static void bch_read_endio(struct bio *bio)
+{
+	struct bbio *b = to_bbio(bio);
+	struct cache *ca = b->ca;
+	struct bio *orig = bio->bi_private;
+
+	bch_bbio_count_io_errors(b, bio->bi_error, "reading from cache");
+
+	if (!bio->bi_error && ca &&
+	    (race_fault() || ptr_stale(ca->set, ca, &b->key, 0))) {
+		/* Read bucket invalidate race */
+		atomic_long_inc(&ca->set->cache_read_races);
+		bch_read_requeue(ca->set, bio);
+	} else {
+		if (bio->bi_error)
+			orig->bi_error = bio->bi_error;
+
+		bio_endio(orig);
+		bio_put(bio);
+	}
+
+	if (ca)
+		percpu_ref_put(&ca->ref);
+}
+
+static inline void __bio_inc_remaining(struct bio *bio)
+{
+	bio->bi_flags |= (1 << BIO_CHAIN);
+	smp_mb__before_atomic();
+	atomic_inc(&bio->__bi_remaining);
+}
+
+/* XXX: this looks a lot like cache_lookup_fn() */
+static int bch_read_fn(struct btree_op *b_op, struct btree *b, struct bkey *k)
+{
+	struct bch_read_op *op = container_of(b_op,
+					struct bch_read_op, op);
+	struct bio *n, *bio = op->bio;
+	struct bbio *bbio;
+	int sectors, ret;
+	unsigned ptr;
+	struct cache *ca;
+
+	BUG_ON(bkey_cmp(&START_KEY(k),
+			&KEY(op->inode, bio->bi_iter.bi_sector, 0)) > 0);
+
+	BUG_ON(bkey_cmp(k, &KEY(op->inode, bio->bi_iter.bi_sector, 0)) <= 0);
+
+	sectors = KEY_OFFSET(k) - bio->bi_iter.bi_sector;
+
+	ca = bch_extent_pick_ptr(b->c, k, &ptr);
+	if (!ca) {
+		if (!KEY_CACHED(k) && bch_extent_ptrs(k)) {
+			bio_io_error(bio);
+			return MAP_DONE;
+		} else {
+			unsigned bytes = min_t(unsigned, sectors,
+					       bio_sectors(bio)) << 9;
+
+			swap(bio->bi_iter.bi_size, bytes);
+			zero_fill_bio(bio);
+			swap(bio->bi_iter.bi_size, bytes);
+
+			bio_advance(bio, bytes);
+
+			return bio->bi_iter.bi_size ? MAP_CONTINUE : MAP_DONE;
+		}
+	}
+
+	PTR_BUCKET(b->c, ca, k, ptr)->read_prio = b->c->prio_clock[READ].hand;
+
+	if (sectors >= bio_sectors(bio)) {
+		n = bio_clone_fast(bio, GFP_NOIO, b->c->bio_split);
+		ret = MAP_DONE;
+	} else {
+		n = bio_split(bio, sectors, GFP_NOIO, b->c->bio_split);
+		ret = MAP_CONTINUE;
+	}
+
+	n->bi_private = bio;
+	n->bi_end_io = bch_read_endio;
+	__bio_inc_remaining(bio);
+
+	bbio = to_bbio(n);
+	bch_bkey_copy_single_ptr(&bbio->key, k, ptr);
+
+	/* Trim the key to match what we're actually reading */
+	bch_cut_front(&KEY(op->inode, n->bi_iter.bi_sector, 0), &bbio->key);
+	bch_cut_back(&KEY(op->inode, bio_end_sector(n), 0), &bbio->key);
+
+	bch_bbio_prep(bbio, ca);
+
+	cache_promote(b->c, bbio, k, ptr);
+
+	return ret;
+}
+
+int bch_read(struct cache_set *c, struct bio *bio, u64 inode)
+{
+	struct bch_read_op op;
+	int ret;
+
+	bch_increment_clock(c, bio_sectors(bio), READ);
+
+	bch_btree_op_init(&op.op, BTREE_ID_EXTENTS, -1);
+	op.c = c;
+	op.bio = bio;
+	op.inode = inode;
+
+	ret = bch_btree_map_keys(&op.op, c,
+				 &KEY(inode, bio->bi_iter.bi_sector, 0),
+				 bch_read_fn, MAP_HOLES);
+	return ret < 0 ? ret : 0;
+}
+EXPORT_SYMBOL(bch_read);
+
+/**
+ * bch_read_retry - re-submit a bio originally from bch_read()
+ */
+static void bch_read_retry(struct bbio *bbio)
+{
+	struct bio *bio = &bbio->bio;
+	struct bio *parent;
+	u64 inode;
+
+	trace_bcache_read_retry(bio);
+
+	/*
+	 * This used to be a leaf bio from bch_read_fn(), but
+	 * since we don't know what happened to the btree in
+	 * the meantime, we have to re-submit it via the
+	 * top-level bch_read() entry point. Before doing that,
+	 * we have to reset the bio, preserving the biovec.
+	 *
+	 * The inode, offset and size come from the bbio's key,
+	 * which was set by bch_read_fn().
+	 */
+	inode = KEY_INODE(&bbio->key);
+	parent = bio->bi_private;
+
+	bch_bbio_reset(bbio);
+	bio_chain(bio, parent);
+
+	bch_read(bbio->ca->set, bio, inode);
+	bio_endio(parent); /* for bio_chain() in bch_read_fn() */
+	bio_endio(bio);
+}
+
+void bch_read_race_work(struct work_struct *work)
+{
+	struct cache_set *c = container_of(work, struct cache_set,
+					   read_race_work);
+	unsigned long flags;
+	struct bio *bio;
+
+	while (1) {
+		spin_lock_irqsave(&c->read_race_lock, flags);
+		bio = bio_list_pop(&c->read_race_list);
+		spin_unlock_irqrestore(&c->read_race_lock, flags);
+
+		if (!bio)
+			break;
+
+		bch_read_retry(to_bbio(bio));
+	}
+}
diff --git a/drivers/md/bcache/io.h b/drivers/md/bcache/io.h
new file mode 100644
index 000000000000..2086bd6840ff
--- /dev/null
+++ b/drivers/md/bcache/io.h
@@ -0,0 +1,73 @@
+#ifndef _BCACHE_IO_H
+#define _BCACHE_IO_H
+
+struct data_insert_op {
+	struct closure cl;
+	struct cache_set *c;
+	struct workqueue_struct *io_wq;
+	struct bio *bio;
+
+	/* Used internally, do not touch */
+	struct btree_op op;
+
+	short error;
+
+	union {
+		u8 flags;
+
+		struct {
+			/* Wait for data bucket allocation or just
+			 * fail when out of space? */
+			unsigned wait:1;
+			/* Discard key range? */
+			unsigned discard:1;
+			/* Wait for journal commit? */
+			unsigned flush:1;
+			/* Perform a compare-exchange with replace_key? */
+			unsigned replace:1;
+
+			/* Set on completion, if cmpxchg index update failed */
+			unsigned replace_collision:1;
+			/* Internal */
+			unsigned insert_data_done:1;
+		};
+	};
+
+	u8 btree_alloc_reserve;
+
+	struct write_point *wp;
+	struct open_bucket *open_buckets[2];
+
+	struct keylist insert_keys;
+	BKEY_PADDED(insert_key);
+	BKEY_PADDED(replace_key);
+};
+
+void bch_data_insert_op_init(struct data_insert_op *, struct cache_set *,
+			     struct bio *, struct write_point *, bool,
+			     bool, bool, struct bkey *, struct bkey *);
+void bch_data_insert(struct closure *cl);
+
+int bch_read(struct cache_set *, struct bio *, u64);
+
+void bch_count_io_errors(struct cache *, int, const char *);
+void bch_bbio_count_io_errors(struct bbio *, int, const char *);
+void bch_bbio_endio(struct bbio *, int, const char *);
+void bch_bbio_free(struct bio *, struct cache_set *);
+struct bio *bch_bbio_alloc(struct cache_set *);
+
+void bch_generic_make_request(struct bio *, struct cache_set *);
+void bch_bio_submit_work(struct work_struct *);
+void bch_bbio_prep(struct bbio *, struct cache *);
+void bch_submit_bbio(struct bbio *, struct cache *, struct bkey *,
+		     unsigned, bool);
+void bch_submit_bbio_replicas(struct bio *, struct cache_set *,
+			      struct bkey *, unsigned, bool);
+void bch_bbio_reset(struct bbio *bio);
+
+void __cache_promote(struct cache_set *, struct bbio *, struct bkey *);
+bool cache_promote(struct cache_set *, struct bbio *, struct bkey *, unsigned);
+
+void bch_read_race_work(struct work_struct *work);
+
+#endif /* _BCACHE_IO_H */
diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c
index ffc1d2151285..770b72755641 100644
--- a/drivers/md/bcache/journal.c
+++ b/drivers/md/bcache/journal.c
@@ -8,6 +8,7 @@
 #include "btree.h"
 #include "debug.h"
 #include "extents.h"
+#include "io.h"
 #include "journal.h"
 
 #include <trace/events/bcache.h>
diff --git a/drivers/md/bcache/move.c b/drivers/md/bcache/move.c
index a1548ffd14a2..da407f9011e0 100644
--- a/drivers/md/bcache/move.c
+++ b/drivers/md/bcache/move.c
@@ -2,6 +2,7 @@
 #include "bcache.h"
 #include "btree.h"
 #include "extents.h"
+#include "io.h"
 #include "keybuf.h"
 #include "move.h"
 
diff --git a/drivers/md/bcache/movinggc.c b/drivers/md/bcache/movinggc.c
index 83dcebabc54a..6c7445275aaa 100644
--- a/drivers/md/bcache/movinggc.c
+++ b/drivers/md/bcache/movinggc.c
@@ -8,6 +8,7 @@
 #include "btree.h"
 #include "buckets.h"
 #include "extents.h"
+#include "io.h"
 #include "keybuf.h"
 #include "move.h"
 #include "movinggc.h"
diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
index 66b927d78589..d0256d85859c 100644
--- a/drivers/md/bcache/request.c
+++ b/drivers/md/bcache/request.c
@@ -28,6 +28,7 @@
 #include "btree.h"
 #include "debug.h"
 #include "extents.h"
+#include "io.h"
 #include "journal.h"
 #include "keybuf.h"
 #include "request.h"
@@ -46,596 +47,6 @@
 
 struct kmem_cache *bch_search_cache;
 
-static inline void mark_cache_stats(struct cache_stat_collector *stats,
-				    bool hit, bool bypass)
-{
-	atomic_inc(&stats->cache_hit_array[!bypass][!hit]);
-}
-
-static inline void bch_mark_cache_accounting(struct cache_set *c,
-					     struct cached_dev *dc,
-					     bool hit, bool bypass)
-{
-	mark_cache_stats(&dc->accounting.collector, hit, bypass);
-	mark_cache_stats(&c->accounting.collector, hit, bypass);
-}
-
-static inline void bch_mark_sectors_bypassed(struct cache_set *c,
-					     struct cached_dev *dc,
-					     unsigned sectors)
-{
-	atomic_add(sectors, &dc->accounting.collector.sectors_bypassed);
-	atomic_add(sectors, &c->accounting.collector.sectors_bypassed);
-}
-
-static inline void bch_mark_gc_write(struct cache_set *c, int sectors)
-{
-	atomic_add(sectors, &c->accounting.collector.gc_write_sectors);
-}
-
-static inline void bch_mark_foreground_write(struct cache_set *c, int sectors)
-{
-	atomic_add(sectors, &c->accounting.collector.foreground_write_sectors);
-}
-
-static inline void bch_mark_discard(struct cache_set *c, int sectors)
-{
-	atomic_add(sectors, &c->accounting.collector.discard_sectors);
-}
-
-static void bch_data_insert_start(struct closure *);
-
-static void bio_csum(struct bio *bio, struct bkey *k)
-{
-	struct bio_vec bv;
-	struct bvec_iter iter;
-	u64 crc = 0xffffffffffffffffULL;
-
-	bio_for_each_segment(bv, bio, iter) {
-		void *d = kmap(bv.bv_page) + bv.bv_offset;
-
-		crc = bch_checksum_update(KEY_CSUM(k), crc, d, bv.bv_len);
-		kunmap(bv.bv_page);
-	}
-
-	k->val[bch_extent_ptrs(k)] = crc;
-}
-
-/* Insert data into cache */
-
-static int btree_insert_fn(struct btree_op *b_op, struct btree *b)
-{
-	struct data_insert_op *op = container_of(b_op,
-					struct data_insert_op, op);
-	struct bkey *replace_key = op->replace ? &op->replace_key : NULL;
-
-	int ret = bch_btree_insert_node(b, &op->op, &op->insert_keys,
-					replace_key,
-					op->flush ? &op->cl : NULL);
-	return bch_keylist_empty(&op->insert_keys) ? MAP_DONE : ret;
-}
-
-static void bch_data_insert_keys_done(struct closure *cl)
-{
-	struct data_insert_op *op = container_of(cl, struct data_insert_op, cl);
-	unsigned i;
-
-	if (op->op.insert_collision)
-		op->replace_collision = true;
-
-	for (i = 0; i < ARRAY_SIZE(op->open_buckets); i++)
-		if (op->open_buckets[i]) {
-			bch_open_bucket_put(op->c, op->open_buckets[i]);
-			op->open_buckets[i] = NULL;
-		}
-
-	if (!op->insert_data_done)
-		continue_at(cl, bch_data_insert_start, op->io_wq);
-
-	bch_keylist_free(&op->insert_keys);
-	closure_return(cl);
-}
-
-static void __bch_data_insert_keys(struct closure *cl)
-{
-	struct data_insert_op *op = container_of(cl, struct data_insert_op,
-						 op.cl);
-	struct keylist *keys = &op->insert_keys;
-	int ret = 0;
-
-	while (!ret && !bch_keylist_empty(keys)) {
-		op->op.locks_want = 0;
-		ret = bch_btree_map_nodes(&op->op, op->c,
-					  &START_KEY(keys->keys),
-					  btree_insert_fn,
-					  MAP_ASYNC);
-	}
-
-	if (ret == -EAGAIN)
-		continue_at(cl, __bch_data_insert_keys, op->c->wq);
-
-	closure_return(cl);
-}
-
-/**
- * bch_data_insert_keys - insert extent btree keys for a write
- */
-static void bch_data_insert_keys(struct closure *cl)
-{
-	struct data_insert_op *op = container_of(cl, struct data_insert_op, cl);
-	enum btree_id id = BTREE_ID_EXTENTS;
-
-	__bch_btree_op_init(&op->op, id, op->btree_alloc_reserve, 0);
-
-	closure_call(&op->op.cl, __bch_data_insert_keys, NULL, cl);
-	continue_at(cl, bch_data_insert_keys_done, op->c->wq);
-}
-
-/**
- * bch_data_invalidate - discard range of keys
- *
- * Used to implement discard, and to handle when writethrough write hits
- * a write error on the cache device.
- */
-static void bch_data_invalidate(struct closure *cl)
-{
-	struct data_insert_op *op = container_of(cl, struct data_insert_op, cl);
-	struct keylist *keys = &op->insert_keys;
-	struct bio *bio = op->bio;
-
-	pr_debug("invalidating %i sectors from %llu",
-		 bio_sectors(bio), (uint64_t) bio->bi_iter.bi_sector);
-
-	while (bio_sectors(bio)) {
-		unsigned sectors = min(bio_sectors(bio),
-				       1U << (KEY_SIZE_BITS - 1));
-
-		if (bch_keylist_realloc(keys, BKEY_U64s))
-			goto out;
-
-		bio->bi_iter.bi_sector += sectors;
-		bio->bi_iter.bi_size -= sectors << 9;
-
-		*keys->top = KEY(KEY_INODE(&op->insert_key),
-				 bio->bi_iter.bi_sector, sectors);
-		SET_KEY_DELETED(keys->top, true);
-
-		bch_keylist_push(keys);
-	}
-
-	op->insert_data_done = true;
-	bio_put(bio);
-out:
-	continue_at(cl, bch_data_insert_keys, op->c->wq);
-}
-
-static void bch_data_insert_error(struct closure *cl)
-{
-	struct data_insert_op *op = container_of(cl, struct data_insert_op, cl);
-
-	/*
-	 * Our data write just errored, which means we've got a bunch of keys to
-	 * insert that point to data that wasn't successfully written.
-	 *
-	 * We don't have to insert those keys but we still have to invalidate
-	 * that region of the cache - so, if we just strip off all the pointers
-	 * from the keys we'll accomplish just that.
-	 */
-
-	struct bkey *src = op->insert_keys.keys, *dst = op->insert_keys.keys;
-
-	while (src != op->insert_keys.top) {
-		struct bkey *n = bkey_next(src);
-
-		bch_set_extent_ptrs(src, 0);
-		memmove(dst, src, bkey_bytes(src));
-
-		dst = bkey_next(dst);
-		src = n;
-	}
-
-	op->insert_keys.top = dst;
-
-	bch_data_insert_keys(cl);
-}
-
-static void bch_data_insert_endio(struct bio *bio)
-{
-	struct closure *cl = bio->bi_private;
-	struct data_insert_op *op = container_of(cl, struct data_insert_op, cl);
-
-	if (bio->bi_error) {
-		/* TODO: We could try to recover from this. */
-		if (!KEY_CACHED(&op->insert_key))
-			op->error = bio->bi_error;
-		else if (!op->replace)
-			set_closure_fn(cl, bch_data_insert_error,
-				       op->c->wq);
-		else
-			set_closure_fn(cl, NULL, NULL);
-	}
-
-	bch_bbio_endio(to_bbio(bio), bio->bi_error, "writing data to cache");
-}
-
-static void bch_data_insert_start(struct closure *cl)
-{
-	struct data_insert_op *op = container_of(cl, struct data_insert_op, cl);
-	struct bio *bio = op->bio, *n;
-	unsigned open_bucket_nr = 0, ptrs_from;
-	struct open_bucket *b;
-
-	if (op->discard)
-		return bch_data_invalidate(cl);
-
-	bch_extent_drop_stale(op->c, &op->insert_key);
-	ptrs_from = bch_extent_ptrs(&op->insert_key);
-
-	/*
-	 * Journal writes are marked REQ_PREFLUSH; if the original write was a
-	 * flush, it'll wait on the journal write.
-	 */
-	bio->bi_opf &= ~(REQ_PREFLUSH|REQ_FUA);
-
-	do {
-		struct bkey *k;
-		struct bio_set *split = op->c->bio_split;
-
-		BUG_ON(bio_sectors(bio) != KEY_SIZE(&op->insert_key));
-
-		if (open_bucket_nr == ARRAY_SIZE(op->open_buckets))
-			continue_at(cl, bch_data_insert_keys,
-				    op->c->wq);
-
-		/* for the device pointers and 1 for the chksum */
-		if (bch_keylist_realloc(&op->insert_keys,
-					BKEY_EXTENT_MAX_U64s +
-					(KEY_CSUM(&op->insert_key) ? 1 : 0)))
-			continue_at(cl, bch_data_insert_keys, op->c->wq);
-
-		k = op->insert_keys.top;
-		bkey_copy(k, &op->insert_key);
-
-		b = bch_alloc_sectors(op->c, op->wp, k, op->wait ? cl : NULL);
-		BUG_ON(!b);
-
-		if (PTR_ERR(b) == -EAGAIN) {
-			/* If we already have some keys, must insert them first
-			 * before allocating another open bucket. We only hit
-			 * this case if open_bucket_nr > 1. */
-			if (bch_keylist_empty(&op->insert_keys))
-				continue_at(cl, bch_data_insert_start,
-					    op->io_wq);
-			else
-				continue_at(cl, bch_data_insert_keys,
-					    op->c->wq);
-		} else if (IS_ERR(b))
-			goto err;
-
-		op->open_buckets[open_bucket_nr++] = b;
-
-		bch_cut_front(k, &op->insert_key);
-
-		n = bio_next_split(bio, KEY_SIZE(k), GFP_NOIO, split);
-		n->bi_end_io = bch_data_insert_endio;
-		n->bi_private = cl;
-
-		if (KEY_CSUM(k))
-			bio_csum(n, k);
-
-		trace_bcache_cache_insert(k);
-
-		bio_set_op_attrs(n, REQ_OP_WRITE, 0);
-		bch_submit_bbio_replicas(n, op->c, k, ptrs_from, false);
-
-		bch_extent_normalize(op->c, k);
-		bch_check_mark_super(op->c, k, false);
-
-		bch_keylist_push(&op->insert_keys);
-	} while (n != bio);
-
-	op->insert_data_done = true;
-	continue_at(cl, bch_data_insert_keys, op->c->wq);
-err:
-	if (KEY_CACHED(&op->insert_key)) {
-		/*
-		 * If we were writing cached data, not doing the write is fine
-		 * so long as we discard whatever would have been overwritten -
-		 * then it's equivalent to doing the write and immediately
-		 * reclaiming it.
-		 */
-
-		op->discard = true;
-		return bch_data_invalidate(cl);
-	}
-
-	op->error = -ENOSPC;
-	op->insert_data_done = true;
-	bio_put(bio);
-
-	/*
-	 * No reason not to insert keys for whatever data was successfully
-	 * written (especially for a cmpxchg operation that's moving data
-	 * around)
-	 */
-	if (!bch_keylist_empty(&op->insert_keys))
-		continue_at(cl, bch_data_insert_keys, op->c->wq);
-	else
-		closure_return(cl);
-}
-
-/**
- * bch_data_insert - handle a write to a cache device or flash only volume
- *
- * This is the starting point for any data to end up in a cache device; it could
- * be from a normal write, or a writeback write, or a write to a flash only
- * volume - it's also used by the moving garbage collector to compact data in
- * mostly empty buckets.
- *
- * It first writes the data to the cache, creating a list of keys to be inserted
- * (if the data won't fit in a single open bucket, there will be multiple keys);
- * after the data is written it calls bch_journal, and after the keys have been
- * added to the next journal write they're inserted into the btree.
- *
- * It inserts the data in op->bio; bi_sector is used for the key offset, and
- * op->inode is used for the key inode.
- *
- * If op->discard is true, instead of inserting the data it invalidates the
- * region of the cache represented by op->bio and op->inode.
- */
-void bch_data_insert(struct closure *cl)
-{
-	struct data_insert_op *op = container_of(cl, struct data_insert_op, cl);
-	struct cache_set *c = op->c;
-	u64 inode = KEY_INODE(&op->insert_key);
-
-	trace_bcache_write(c, inode, op->bio, !KEY_CACHED(&op->insert_key),
-			   op->discard);
-
-	if (!bio_sectors(op->bio)) {
-		WARN_ONCE(1, "bch_data_insert() called with empty bio");
-		closure_return(cl);
-	}
-
-	/*
-	 * This ought to be initialized in bch_data_insert_op_init(), but struct
-	 * cache_set isn't exported
-	 */
-	if (!op->io_wq)
-		op->io_wq = op->c->wq;
-
-	if (!op->discard)
-		bch_increment_clock(c, bio_sectors(op->bio), WRITE);
-
-	if (!op->replace) {
-		/* XXX: discards may be for more sectors than max key size */
-
-		struct bkey start = KEY(inode, op->bio->bi_iter.bi_sector, 0);
-		struct bkey end = KEY(inode, bio_end_sector(op->bio), 0);
-
-		unsigned i;
-		struct cache *ca;
-
-		for_each_cache(ca, c, i)
-			bch_keybuf_check_overlapping(&ca->moving_gc_keys,
-						     &start, &end);
-
-		bch_keybuf_check_overlapping(&c->tiering_keys,
-					     &start, &end);
-	}
-
-	if (op->wp->ca)
-		bch_mark_gc_write(c, bio_sectors(op->bio));
-	else if (!op->discard)
-		bch_mark_foreground_write(c, bio_sectors(op->bio));
-	else
-		bch_mark_discard(c, bio_sectors(op->bio));
-
-	if (atomic64_sub_return(bio_sectors(op->bio),
-				&c->sectors_until_gc) < 0) {
-		set_gc_sectors(c);
-		wake_up_process(c->gc_thread);
-	}
-
-	SET_KEY_OFFSET(&op->insert_key, bio_end_sector(op->bio));
-	SET_KEY_SIZE(&op->insert_key, bio_sectors(op->bio));
-
-	bch_keylist_init(&op->insert_keys);
-	bio_get(op->bio);
-	continue_at_nobarrier(cl, bch_data_insert_start, NULL);
-}
-
-void bch_data_insert_op_init(struct data_insert_op *op,
-			     struct cache_set *c,
-			     struct bio *bio,
-			     struct write_point *wp,
-			     bool wait, bool discard, bool flush,
-			     struct bkey *insert_key,
-			     struct bkey *replace_key)
-{
-	if (!wp) {
-		unsigned wp_idx = hash_long((unsigned long) current,
-					    ilog2(ARRAY_SIZE(c->write_points)));
-
-		BUG_ON(wp_idx > ARRAY_SIZE(c->write_points));
-		wp = &c->write_points[wp_idx];
-	}
-
-	op->c = c;
-	op->io_wq = NULL;
-	op->bio = bio;
-	op->error = 0;
-	op->flags = 0;
-	op->wait = wait;
-	op->discard = discard;
-	op->flush = flush;
-	op->wp = wp;
-	op->btree_alloc_reserve = BTREE_ID_EXTENTS;
-
-	memset(op->open_buckets, 0, sizeof(op->open_buckets));
-	bch_keylist_init(&op->insert_keys);
-	bkey_copy(&op->insert_key, insert_key);
-
-	if (replace_key) {
-		op->replace = true;
-		bkey_copy(&op->replace_key, replace_key);
-	}
-}
-EXPORT_SYMBOL(bch_data_insert_op_init);
-
-/* Cache promotion on read */
-
-struct cache_promote_op {
-	struct closure cl;
-	struct bio *orig_bio;
-	struct data_insert_op iop;
-	bool stale; /* was the ptr stale after the read? */
-	struct bbio bio; /* must be last */
-};
-
-static void cache_promote_done(struct closure *cl)
-{
-	struct cache_promote_op *op = container_of(cl,
-					struct cache_promote_op, cl);
-	struct cache_set *c = op->iop.c;
-
-	if (op->iop.replace_collision) {
-		trace_bcache_promote_collision(&op->iop.replace_key);
-		atomic_inc(&c->accounting.collector.cache_miss_collisions);
-	}
-
-	bio_free_pages(op->iop.bio);
-	kfree(op);
-}
-
-static void cache_promote_write(struct closure *cl)
-{
-	struct cache_promote_op *op = container_of(cl,
-					struct cache_promote_op, cl);
-	struct bio *bio = op->iop.bio;
-
-	bio_reset(bio);
-	bio->bi_iter.bi_sector = KEY_START(&op->iop.insert_key);
-	bio->bi_iter.bi_size = KEY_SIZE(&op->iop.insert_key) << 9;
-	/* needed to reinit bi_vcnt so pages can be freed later */
-	bch_bio_map(bio, NULL);
-
-	bio_copy_data(op->orig_bio, bio);
-	op->orig_bio->bi_error = op->iop.error;
-	bio_endio(op->orig_bio);
-
-	if (!op->stale &&
-	    !op->iop.error &&
-	    !test_bit(CACHE_SET_STOPPING, &op->iop.c->flags))
-		closure_call(&op->iop.cl, bch_data_insert, NULL, cl);
-
-	closure_return_with_destructor(cl, cache_promote_done);
-}
-
-static void cache_promote_endio(struct bio *bio)
-{
-	struct bbio *b = to_bbio(bio);
-	struct cache_promote_op *op = container_of(b,
-					struct cache_promote_op, bio);
-
-	/*
-	 * If the bucket was reused while our bio was in flight, we might have
-	 * read the wrong data. Set s->error but not error so it doesn't get
-	 * counted against the cache device, but we'll still reread the data
-	 * from the backing device.
-	 */
-
-	if (bio->bi_error)
-		op->iop.error = bio->bi_error;
-	else if (b->ca && ptr_stale(b->ca->set, b->ca, &b->key, 0))
-		op->stale = 1;
-
-	bch_bbio_endio(b, bio->bi_error, "reading from cache");
-}
-
-/**
- * __cache_promote -- insert result of read bio into cache
- *
- * Used for backing devices and flash-only volumes.
- *
- * @orig_bio must actually be a bbio with a valid key.
- */
-static void __cache_promote(struct cache_set *c, struct bbio *orig_bio,
-			    struct bkey *replace_key)
-{
-	struct cache_promote_op *op;
-	struct bio *bio;
-	unsigned pages = DIV_ROUND_UP(orig_bio->bio.bi_iter.bi_size, PAGE_SIZE);
-
-	/* XXX: readahead? */
-
-	op = kmalloc(sizeof(*op) + sizeof(struct bio_vec) * pages, GFP_NOIO);
-	if (!op)
-		goto out_submit;
-
-	/* clone the bbio */
-	memcpy(&op->bio, orig_bio, offsetof(struct bbio, bio));
-
-	bio = &op->bio.bio;
-	bio_init(bio);
-	bio_get(bio);
-	bio->bi_bdev = orig_bio->bio.bi_bdev;
-	bio->bi_iter.bi_sector = orig_bio->bio.bi_iter.bi_sector;
-	bio->bi_iter.bi_size = orig_bio->bio.bi_iter.bi_size;
-	bio->bi_end_io = cache_promote_endio;
-	bio->bi_private = &op->cl;
-	bio->bi_io_vec = bio->bi_inline_vecs;
-	bch_bio_map(bio, NULL);
-
-	if (bio_alloc_pages(bio, __GFP_NOWARN|GFP_NOIO))
-		goto out_free;
-
-	orig_bio->ca = NULL;
-
-	closure_init(&op->cl, &c->cl);
-	op->orig_bio = &orig_bio->bio;
-	op->stale = 0;
-
-	bch_data_insert_op_init(&op->iop, c, bio,
-				&c->tier_write_points[0],
-				false, false, false,
-				replace_key,
-				replace_key);
-
-	bch_cut_front(&START_KEY(&orig_bio->key), &op->iop.insert_key);
-	bch_cut_back(&orig_bio->key, &op->iop.insert_key);
-
-	trace_bcache_promote(&orig_bio->bio);
-
-	op->bio.submit_time_us = local_clock_us();
-	closure_bio_submit(bio, &op->cl);
-
-	continue_at(&op->cl, cache_promote_write, c->wq);
-out_free:
-	kfree(op);
-out_submit:
-	generic_make_request(&orig_bio->bio);
-}
-
-/**
- * cache_promote - promote data stored in higher tiers
- *
- * Used for flash only volumes.
- *
- * @bio must actually be a bbio with valid key.
- */
-static bool cache_promote(struct cache_set *c, struct bbio *bio,
-			  struct bkey *k, unsigned ptr)
-{
-	if (!CACHE_TIER(&c->members[PTR_DEV(k, ptr)])) {
-		generic_make_request(&bio->bio);
-		return 0;
-	}
-
-	__cache_promote(c, bio, k);
-	return 1;
-}
-
 /* Congested? */
 
 unsigned bch_get_congested(struct cache_set *c)
@@ -764,194 +175,6 @@ skip:
 	return true;
 }
 
-/* Cache lookup */
-
-/* XXX: consolidate these somehow */
-
-struct bch_read_op {
-	struct btree_op op;
-	struct cache_set *c;
-	struct bio *bio;
-	u64 inode;
-};
-
-static void bch_read_requeue(struct cache_set *c, struct bio *bio)
-{
-	unsigned long flags;
-
-	spin_lock_irqsave(&c->read_race_lock, flags);
-	bio_list_add(&c->read_race_list, bio);
-	spin_unlock_irqrestore(&c->read_race_lock, flags);
-	queue_work(c->wq, &c->read_race_work);
-}
-
-static void bch_read_endio(struct bio *bio)
-{
-	struct bbio *b = to_bbio(bio);
-	struct cache *ca = b->ca;
-	struct bio *orig = bio->bi_private;
-
-	bch_bbio_count_io_errors(b, bio->bi_error, "reading from cache");
-
-	if (!bio->bi_error && ca &&
-	    (race_fault() || ptr_stale(ca->set, ca, &b->key, 0))) {
-		/* Read bucket invalidate race */
-		atomic_long_inc(&ca->set->cache_read_races);
-		bch_read_requeue(ca->set, bio);
-	} else {
-		if (bio->bi_error)
-			orig->bi_error = bio->bi_error;
-
-		bio_endio(orig);
-		bio_put(bio);
-	}
-
-	if (ca)
-		percpu_ref_put(&ca->ref);
-}
-
-static inline void __bio_inc_remaining(struct bio *bio)
-{
-	bio->bi_flags |= (1 << BIO_CHAIN);
-	smp_mb__before_atomic();
-	atomic_inc(&bio->__bi_remaining);
-}
-
-/* XXX: this looks a lot like cache_lookup_fn() */
-static int bch_read_fn(struct btree_op *b_op, struct btree *b, struct bkey *k)
-{
-	struct bch_read_op *op = container_of(b_op,
-					struct bch_read_op, op);
-	struct bio *n, *bio = op->bio;
-	struct bbio *bbio;
-	int sectors, ret;
-	unsigned ptr;
-	struct cache *ca;
-
-	BUG_ON(bkey_cmp(&START_KEY(k),
-			&KEY(op->inode, bio->bi_iter.bi_sector, 0)) > 0);
-
-	BUG_ON(bkey_cmp(k, &KEY(op->inode, bio->bi_iter.bi_sector, 0)) <= 0);
-
-	sectors = KEY_OFFSET(k) - bio->bi_iter.bi_sector;
-
-	ca = bch_extent_pick_ptr(b->c, k, &ptr);
-	if (!ca) {
-		if (!KEY_CACHED(k) && bch_extent_ptrs(k)) {
-			bio_io_error(bio);
-			return MAP_DONE;
-		} else {
-			unsigned bytes = min_t(unsigned, sectors,
-					       bio_sectors(bio)) << 9;
-
-			swap(bio->bi_iter.bi_size, bytes);
-			zero_fill_bio(bio);
-			swap(bio->bi_iter.bi_size, bytes);
-
-			bio_advance(bio, bytes);
-
-			return bio->bi_iter.bi_size ? MAP_CONTINUE : MAP_DONE;
-		}
-	}
-
-	PTR_BUCKET(b->c, ca, k, ptr)->read_prio = b->c->prio_clock[READ].hand;
-
-	if (sectors >= bio_sectors(bio)) {
-		n = bio_clone_fast(bio, GFP_NOIO, b->c->bio_split);
-		ret = MAP_DONE;
-	} else {
-		n = bio_split(bio, sectors, GFP_NOIO, b->c->bio_split);
-		ret = MAP_CONTINUE;
-	}
-
-	n->bi_private = bio;
-	n->bi_end_io = bch_read_endio;
-	__bio_inc_remaining(bio);
-
-	bbio = to_bbio(n);
-	bch_bkey_copy_single_ptr(&bbio->key, k, ptr);
-
-	/* Trim the key to match what we're actually reading */
-	bch_cut_front(&KEY(op->inode, n->bi_iter.bi_sector, 0), &bbio->key);
-	bch_cut_back(&KEY(op->inode, bio_end_sector(n), 0), &bbio->key);
-
-	bch_bbio_prep(bbio, ca);
-
-	cache_promote(b->c, bbio, k, ptr);
-
-	return ret;
-}
-
-int bch_read(struct cache_set *c, struct bio *bio, u64 inode)
-{
-	struct bch_read_op op;
-	int ret;
-
-	bch_increment_clock(c, bio_sectors(bio), READ);
-
-	bch_btree_op_init(&op.op, BTREE_ID_EXTENTS, -1);
-	op.c = c;
-	op.bio = bio;
-	op.inode = inode;
-
-	ret = bch_btree_map_keys(&op.op, c,
-				 &KEY(inode, bio->bi_iter.bi_sector, 0),
-				 bch_read_fn, MAP_HOLES);
-	return ret < 0 ? ret : 0;
-}
-EXPORT_SYMBOL(bch_read);
-
-/**
- * bch_read_retry - re-submit a bio originally from bch_read()
- */
-static void bch_read_retry(struct bbio *bbio)
-{
-	struct bio *bio = &bbio->bio;
-	struct bio *parent;
-	u64 inode;
-
-	trace_bcache_read_retry(bio);
-
-	/*
-	 * This used to be a leaf bio from bch_read_fn(), but
-	 * since we don't know what happened to the btree in
-	 * the meantime, we have to re-submit it via the
-	 * top-level bch_read() entry point. Before doing that,
-	 * we have to reset the bio, preserving the biovec.
-	 *
-	 * The inode, offset and size come from the bbio's key,
-	 * which was set by bch_read_fn().
-	 */
-	inode = KEY_INODE(&bbio->key);
-	parent = bio->bi_private;
-
-	bch_bbio_reset(bbio);
-	bio_chain(bio, parent);
-
-	bch_read(bbio->ca->set, bio, inode);
-	bio_endio(parent); /* for bio_chain() in bch_read_fn() */
-	bio_endio(bio);
-}
-
-void bch_read_race_work(struct work_struct *work)
-{
-	struct cache_set *c = container_of(work, struct cache_set,
-					   read_race_work);
-	unsigned long flags;
-	struct bio *bio;
-
-	while (1) {
-		spin_lock_irqsave(&c->read_race_lock, flags);
-		bio = bio_list_pop(&c->read_race_list);
-		spin_unlock_irqrestore(&c->read_race_lock, flags);
-
-		if (!bio)
-			break;
-
-		bch_read_retry(to_bbio(bio));
-	}
-}
-
 /* struct search based code */
 
 struct search {
diff --git a/drivers/md/bcache/request.h b/drivers/md/bcache/request.h
index 386f452f6951..edec16a917e2 100644
--- a/drivers/md/bcache/request.h
+++ b/drivers/md/bcache/request.h
@@ -8,61 +8,11 @@ struct cached_dev;
 struct bcache_device;
 struct kmem_cache;
 
-struct data_insert_op {
-	struct closure cl;
-	struct cache_set *c;
-	struct workqueue_struct *io_wq;
-	struct bio *bio;
-
-	/* Used internally, do not touch */
-	struct btree_op op;
-
-	short error;
-
-	union {
-		u8 flags;
-
-		struct {
-			/* Wait for data bucket allocation or just
-			 * fail when out of space? */
-			unsigned wait:1;
-			/* Discard key range? */
-			unsigned discard:1;
-			/* Wait for journal commit? */
-			unsigned flush:1;
-			/* Perform a compare-exchange with replace_key? */
-			unsigned replace:1;
-
-			/* Set on completion, if cmpxchg index update failed */
-			unsigned replace_collision:1;
-			/* Internal */
-			unsigned insert_data_done:1;
-		};
-	};
-
-	u8 btree_alloc_reserve;
-
-	struct write_point *wp;
-	struct open_bucket *open_buckets[2];
-
-	struct keylist insert_keys;
-	BKEY_PADDED(insert_key);
-	BKEY_PADDED(replace_key);
-};
-
-void bch_data_insert_op_init(struct data_insert_op *, struct cache_set *,
-			     struct bio *, struct write_point *, bool,
-			     bool, bool, struct bkey *, struct bkey *);
-
 unsigned bch_get_congested(struct cache_set *);
-int bch_read(struct cache_set *, struct bio *, u64);
-void bch_data_insert(struct closure *cl);
 
 void bch_cached_dev_request_init(struct cached_dev *dc);
 void bch_flash_dev_request_init(struct bcache_device *d);
 
-void bch_read_race_work(struct work_struct *work);
-
 extern struct kmem_cache *bch_search_cache;
 
 #endif /* _BCACHE_REQUEST_H_ */
diff --git a/drivers/md/bcache/stats.h b/drivers/md/bcache/stats.h
index 0a0da6a460ee..39877f9aa132 100644
--- a/drivers/md/bcache/stats.h
+++ b/drivers/md/bcache/stats.h
@@ -1,57 +1,7 @@
 #ifndef _BCACHE_STATS_H_
 #define _BCACHE_STATS_H_
 
-struct cache_stat_collector {
-	union {
-		struct {
-			atomic_t cache_hits;
-			atomic_t cache_misses;
-			atomic_t cache_bypass_hits;
-			atomic_t cache_bypass_misses;
-		};
-
-		/* cache_hit_array[!bypass][!hit]: */
-		atomic_t cache_hit_array[2][2];
-	};
-
-
-	atomic_t cache_readaheads;
-	atomic_t cache_miss_collisions;
-	atomic_t sectors_bypassed;
-	atomic_t foreground_write_sectors;
-	atomic_t gc_write_sectors;
-	atomic_t discard_sectors;
-};
-
-struct cache_stats {
-	struct kobject kobj;
-
-	unsigned long cache_hits;
-	unsigned long cache_misses;
-	unsigned long cache_bypass_hits;
-	unsigned long cache_bypass_misses;
-	unsigned long cache_readaheads;
-	unsigned long cache_miss_collisions;
-	unsigned long sectors_bypassed;
-	unsigned long foreground_write_sectors;
-	unsigned long gc_write_sectors;
-	unsigned long discard_sectors;
-
-	unsigned rescale;
-};
-
-struct cache_accounting {
-	struct closure cl;
-	struct timer_list timer;
-	atomic_t closing;
-
-	struct cache_stat_collector collector;
-
-	struct cache_stats total;
-	struct cache_stats five_minute;
-	struct cache_stats hour;
-	struct cache_stats day;
-};
+#include "stats_types.h"
 
 struct cache_set;
 struct cached_dev;
@@ -62,4 +12,41 @@ int bch_cache_accounting_add_kobjs(struct cache_accounting *,
 				   struct kobject *);
 void bch_cache_accounting_clear(struct cache_accounting *);
 void bch_cache_accounting_destroy(struct cache_accounting *);
 
+static inline void mark_cache_stats(struct cache_stat_collector *stats,
+				    bool hit, bool bypass)
+{
+	atomic_inc(&stats->cache_hit_array[!bypass][!hit]);
+}
+
+static inline void bch_mark_cache_accounting(struct cache_set *c,
+					     struct cached_dev *dc,
+					     bool hit, bool bypass)
+{
+	mark_cache_stats(&dc->accounting.collector, hit, bypass);
+	mark_cache_stats(&c->accounting.collector, hit, bypass);
+}
+
+static inline void bch_mark_sectors_bypassed(struct cache_set *c,
+					     struct cached_dev *dc,
+					     unsigned sectors)
+{
+	atomic_add(sectors, &dc->accounting.collector.sectors_bypassed);
+	atomic_add(sectors, &c->accounting.collector.sectors_bypassed);
+}
+
+static inline void bch_mark_gc_write(struct cache_set *c, int sectors)
+{
+	atomic_add(sectors, &c->accounting.collector.gc_write_sectors);
+}
+
+static inline void bch_mark_foreground_write(struct cache_set *c, int sectors)
+{
+	atomic_add(sectors, &c->accounting.collector.foreground_write_sectors);
+}
+
+static inline void bch_mark_discard(struct cache_set *c, int sectors)
+{
+	atomic_add(sectors, &c->accounting.collector.discard_sectors);
+}
+
 #endif /* _BCACHE_STATS_H_ */
diff --git a/drivers/md/bcache/stats_types.h b/drivers/md/bcache/stats_types.h
new file mode 100644
index 000000000000..28e4c69e8e6d
--- /dev/null
+++ b/drivers/md/bcache/stats_types.h
@@ -0,0 +1,56 @@
+#ifndef _BCACHE_STATS_TYPES_H_
+#define _BCACHE_STATS_TYPES_H_
+
+struct cache_stat_collector {
+	union {
+		struct {
+			atomic_t cache_hits;
+			atomic_t cache_misses;
+			atomic_t cache_bypass_hits;
+			atomic_t cache_bypass_misses;
+		};
+
+		/* cache_hit_array[!bypass][!hit]: */
+		atomic_t cache_hit_array[2][2];
+	};
+
+
+	atomic_t cache_readaheads;
+	atomic_t cache_miss_collisions;
+	atomic_t sectors_bypassed;
+	atomic_t foreground_write_sectors;
+	atomic_t gc_write_sectors;
+	atomic_t discard_sectors;
+};
+
+struct cache_stats {
+	struct kobject kobj;
+
+	unsigned long cache_hits;
+	unsigned long cache_misses;
+	unsigned long cache_bypass_hits;
+	unsigned long cache_bypass_misses;
+	unsigned long cache_readaheads;
+	unsigned long cache_miss_collisions;
+	unsigned long sectors_bypassed;
+	unsigned long foreground_write_sectors;
+	unsigned long gc_write_sectors;
+	unsigned long discard_sectors;
+
+	unsigned rescale;
+};
+
+struct cache_accounting {
+	struct closure cl;
+	struct timer_list timer;
+	atomic_t closing;
+
+	struct cache_stat_collector collector;
+
+	struct cache_stats total;
+	struct cache_stats five_minute;
+	struct cache_stats hour;
+	struct cache_stats day;
+};
+
+#endif /* _BCACHE_STATS_TYPES_H_ */
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index c787db192728..6ad3b54f5bfa 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -10,6 +10,7 @@
 #include "alloc.h"
 #include "btree.h"
 #include "debug.h"
+#include "io.h"
 #include "journal.h"
 #include "movinggc.h"
 #include "request.h"
diff --git a/drivers/md/bcache/tier.c b/drivers/md/bcache/tier.c
index 162931d95ee8..c1724bc844d9 100644
--- a/drivers/md/bcache/tier.c
+++ b/drivers/md/bcache/tier.c
@@ -3,6 +3,7 @@
 #include "btree.h"
 #include "buckets.h"
 #include "extents.h"
+#include "io.h"
 #include "keybuf.h"
 #include "move.h"
 
diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c
index 62a1c2f136ae..0843f378cd37 100644
--- a/drivers/md/bcache/writeback.c
+++ b/drivers/md/bcache/writeback.c
@@ -10,6 +10,7 @@
 #include "btree.h"
 #include "debug.h"
 #include "extents.h"
+#include "io.h"
 #include "keybuf.h"
 #include "writeback.h"