author		Kent Overstreet <kent.overstreet@gmail.com>	2017-01-08 00:13:18 -0900
committer	Kent Overstreet <kent.overstreet@gmail.com>	2017-01-20 09:07:08 -0900
commit		b33fc8298f7e13226b9895abc57c9bfce5e3fa2d (patch)
tree		a3d2a5a909b6372f7777c1c5c18cef5f81d123a9 /libbcache/writeback.c
parent		7f4191a202ea4558ca2d5eb8a47daea33c9999c7 (diff)
bcache in userspace; userspace fsck
Diffstat (limited to 'libbcache/writeback.c')
-rw-r--r--	libbcache/writeback.c	657
1 file changed, 657 insertions, 0 deletions
diff --git a/libbcache/writeback.c b/libbcache/writeback.c
new file mode 100644
index 00000000..600bfbfb
--- /dev/null
+++ b/libbcache/writeback.c
@@ -0,0 +1,657 @@
+/*
+ * background writeback - scan btree for dirty data and write it to the backing
+ * device
+ *
+ * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
+ * Copyright 2012 Google, Inc.
+ */
+
+#include "bcache.h"
+#include "btree_update.h"
+#include "clock.h"
+#include "debug.h"
+#include "error.h"
+#include "extents.h"
+#include "io.h"
+#include "keybuf.h"
+#include "keylist.h"
+#include "writeback.h"
+
+#include <linux/delay.h>
+#include <linux/freezer.h>
+#include <linux/kthread.h>
+#include <trace/events/bcache.h>
+
+/* Rate limiting */
+
+static void __update_writeback_rate(struct cached_dev *dc)
+{
+        struct cache_set *c = dc->disk.c;
+        u64 cache_dirty_target =
+                div_u64(c->capacity * dc->writeback_percent, 100);
+        s64 target = div64_u64(cache_dirty_target *
+                               bdev_sectors(dc->disk_sb.bdev),
+                               c->cached_dev_sectors);
+        s64 dirty = bcache_dev_sectors_dirty(&dc->disk);
+
+        bch_pd_controller_update(&dc->writeback_pd, target << 9,
+                                 dirty << 9, -1);
+}
+
+static void update_writeback_rate(struct work_struct *work)
+{
+        struct cached_dev *dc = container_of(to_delayed_work(work),
+                                             struct cached_dev,
+                                             writeback_pd_update);
+
+        down_read(&dc->writeback_lock);
+
+        if (atomic_read(&dc->has_dirty) &&
+            dc->writeback_percent &&
+            !test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags))
+                __update_writeback_rate(dc);
+        else
+                dc->writeback_pd.rate.rate = UINT_MAX;
+
+        up_read(&dc->writeback_lock);
+
+        schedule_delayed_work(&dc->writeback_pd_update,
+                              dc->writeback_pd_update_seconds * HZ);
+}
+
+struct dirty_io {
+        struct closure          cl;
+        struct bch_replace_info replace;
+        struct cached_dev       *dc;
+        struct cache            *ca;
+        struct keybuf_key       *w;
+        struct bch_extent_ptr   ptr;
+        int                     error;
+        bool                    from_mempool;
+        /* Must be last */
+        struct bio              bio;
+};
+
+#define DIRTY_IO_MEMPOOL_BVECS          64
+#define DIRTY_IO_MEMPOOL_SECTORS        (DIRTY_IO_MEMPOOL_BVECS * PAGE_SECTORS)
+
+static void dirty_init(struct dirty_io *io)
+{
+        struct bio *bio = &io->bio;
+
+        bio_init(bio);
+        if (!io->dc->writeback_percent)
+                bio_set_prio(bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0));
+
+        bio->bi_iter.bi_size = io->replace.key.k.size << 9;
+        bio->bi_max_vecs =
+                DIV_ROUND_UP(io->replace.key.k.size, PAGE_SECTORS);
+        bio->bi_io_vec = bio->bi_inline_vecs;
+        bch_bio_map(bio, NULL);
+}
+
+static void dirty_io_destructor(struct closure *cl)
+{
+        struct dirty_io *io = container_of(cl, struct dirty_io, cl);
+
+        if (io->from_mempool)
+                mempool_free(io, &io->dc->writeback_io_pool);
+        else
+                kfree(io);
+}
+
+static void write_dirty_finish(struct closure *cl)
+{
+        struct dirty_io *io = container_of(cl, struct dirty_io, cl);
+        struct cached_dev *dc = io->dc;
+        struct bio_vec *bv;
+        int i;
+
+        bio_for_each_segment_all(bv, &io->bio, i)
+                mempool_free(bv->bv_page, &dc->writeback_page_pool);
+
+        if (!io->error) {
+                BKEY_PADDED(k) tmp;
+                int ret;
+
+                bkey_copy(&tmp.k, &io->replace.key);
+                io->replace.hook.fn = bch_extent_cmpxchg;
+                bkey_extent_set_cached(&tmp.k.k, true);
+
+                ret = bch_btree_insert(dc->disk.c, BTREE_ID_EXTENTS, &tmp.k,
+                                       NULL, &io->replace.hook, NULL, 0);
+                if (io->replace.successes == 0)
+                        trace_bcache_writeback_collision(&io->replace.key.k);
+
+                atomic_long_inc(ret
+                                ? &dc->disk.c->writeback_keys_failed
+                                : &dc->disk.c->writeback_keys_done);
+        }
+
+        bch_keybuf_put(&dc->writeback_keys, io->w);
+
+        closure_return_with_destructor(cl, dirty_io_destructor);
+}
+
+static void dirty_endio(struct bio *bio)
+{
+        struct dirty_io *io = container_of(bio, struct dirty_io, bio);
+
+        if (bio->bi_error) {
+                trace_bcache_writeback_error(&io->replace.key.k,
+                                             op_is_write(bio_op(&io->bio)),
+                                             bio->bi_error);
+                io->error = bio->bi_error;
+        }
+
+        closure_put(&io->cl);
+}
+
+static void write_dirty(struct closure *cl)
+{
+        struct dirty_io *io = container_of(cl, struct dirty_io, cl);
+
+        if (!io->error) {
+                dirty_init(io);
+                bio_set_op_attrs(&io->bio, REQ_OP_WRITE, 0);
+                io->bio.bi_iter.bi_sector =
+                        bkey_start_offset(&io->replace.key.k);
+                io->bio.bi_bdev = io->dc->disk_sb.bdev;
+                io->bio.bi_end_io = dirty_endio;
+
+                closure_bio_submit(&io->bio, cl);
+        }
+
+        continue_at(cl, write_dirty_finish, io->dc->disk.c->wq);
+}
+
+static void read_dirty_endio(struct bio *bio)
+{
+        struct dirty_io *io = container_of(bio, struct dirty_io, bio);
+
+        cache_nonfatal_io_err_on(bio->bi_error, io->ca, "writeback read");
+
+        bch_account_io_completion(io->ca);
+
+        if (ptr_stale(io->ca, &io->ptr))
+                bio->bi_error = -EINTR;
+
+        dirty_endio(bio);
+}
+
+static void read_dirty_submit(struct closure *cl)
+{
+        struct dirty_io *io = container_of(cl, struct dirty_io, cl);
+
+        closure_bio_submit(&io->bio, cl);
+
+        continue_at(cl, write_dirty, system_freezable_wq);
+}
+
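+/*
+ * The writeback pipeline is a chain of closures: read_dirty() below walks
+ * the writeback keybuf and, for each dirty extent, allocates a dirty_io
+ * and reads the dirty data from the cache; read_dirty_submit() continues
+ * to write_dirty(), which writes the same data out to the backing device;
+ * write_dirty_finish() then clears the dirty state by reinserting the key
+ * as a cached (non-dirty) extent, using the bch_extent_cmpxchg hook so
+ * the insert fails if the extent was overwritten while writeback was in
+ * flight.
+ */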
+static u64 read_dirty(struct cached_dev *dc)
+{
+        struct keybuf_key *w;
+        struct dirty_io *io;
+        struct closure cl;
+        unsigned i;
+        struct bio_vec *bv;
+        u64 sectors_written = 0;
+        BKEY_PADDED(k) tmp;
+
+        closure_init_stack(&cl);
+
+        while (!bch_ratelimit_wait_freezable_stoppable(&dc->writeback_pd.rate)) {
+                w = bch_keybuf_next(&dc->writeback_keys);
+                if (!w)
+                        break;
+
+                sectors_written += w->key.k.size;
+                bkey_copy(&tmp.k, &w->key);
+
+                while (tmp.k.k.size) {
+                        struct extent_pick_ptr pick;
+
+                        bch_extent_pick_ptr(dc->disk.c,
+                                            bkey_i_to_s_c(&tmp.k),
+                                            &pick);
+                        if (IS_ERR_OR_NULL(pick.ca))
+                                break;
+
+                        io = kzalloc(sizeof(*io) + sizeof(struct bio_vec) *
+                                     DIV_ROUND_UP(tmp.k.k.size,
+                                                  PAGE_SECTORS),
+                                     GFP_KERNEL);
+                        if (!io) {
+                                trace_bcache_writeback_alloc_fail(pick.ca->set,
+                                                                  tmp.k.k.size);
+                                io = mempool_alloc(&dc->writeback_io_pool,
+                                                   GFP_KERNEL);
+                                memset(io, 0, sizeof(*io) +
+                                       sizeof(struct bio_vec) *
+                                       DIRTY_IO_MEMPOOL_BVECS);
+                                io->from_mempool = true;
+
+                                bkey_copy(&io->replace.key, &tmp.k);
+
+                                if (DIRTY_IO_MEMPOOL_SECTORS <
+                                    io->replace.key.k.size)
+                                        bch_key_resize(&io->replace.key.k,
+                                                       DIRTY_IO_MEMPOOL_SECTORS);
+                        } else {
+                                bkey_copy(&io->replace.key, &tmp.k);
+                        }
+
+                        io->dc  = dc;
+                        io->ca  = pick.ca;
+                        io->w   = w;
+                        io->ptr = pick.ptr;
+                        atomic_inc(&w->ref);
+
+                        dirty_init(io);
+                        bio_set_op_attrs(&io->bio, REQ_OP_READ, 0);
+                        io->bio.bi_iter.bi_sector = pick.ptr.offset;
+                        io->bio.bi_bdev = pick.ca->disk_sb.bdev;
+                        io->bio.bi_end_io = read_dirty_endio;
+
+                        bio_for_each_segment_all(bv, &io->bio, i) {
+                                bv->bv_page =
+                                        mempool_alloc(&dc->writeback_page_pool,
+                                                      i ? GFP_NOWAIT
+                                                      : GFP_KERNEL);
+                                if (!bv->bv_page) {
+                                        BUG_ON(!i);
+                                        io->bio.bi_vcnt = i;
+
+                                        io->bio.bi_iter.bi_size =
+                                                io->bio.bi_vcnt * PAGE_SIZE;
+
+                                        bch_key_resize(&io->replace.key.k,
+                                                       bio_sectors(&io->bio));
+                                        break;
+                                }
+                        }
+
+                        bch_cut_front(io->replace.key.k.p, &tmp.k);
+                        trace_bcache_writeback(&io->replace.key.k);
+
+                        bch_ratelimit_increment(&dc->writeback_pd.rate,
+                                                io->replace.key.k.size << 9);
+
+                        closure_call(&io->cl, read_dirty_submit, NULL, &cl);
+                }
+
+                bch_keybuf_put(&dc->writeback_keys, w);
+        }
+
+        /*
+         * Wait for outstanding writeback IOs to finish (and keybuf slots to be
+         * freed) before refilling again
+         */
+        closure_sync(&cl);
+
+        return sectors_written;
+}
+
+/* Scan for dirty data */
+
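+/*
+ * Dirty data is tracked per "stripe" of the backing device:
+ * d->stripe_sectors_dirty counts the dirty sectors within each
+ * stripe_size sized chunk, and d->full_dirty_stripes is a bitmap of
+ * stripes that are entirely dirty, which refill_full_stripes() can scan
+ * cheaply with find_next_bit(). nr_sectors may be negative, so the same
+ * helper accounts for sectors going clean as well as dirty.
+ */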
+static void __bcache_dev_sectors_dirty_add(struct bcache_device *d,
+                                           u64 offset, int nr_sectors)
+{
+        unsigned stripe_offset, stripe, sectors_dirty;
+
+        if (!d)
+                return;
+
+        if (!d->stripe_sectors_dirty)
+                return;
+
+        stripe = offset_to_stripe(d, offset);
+        stripe_offset = offset & (d->stripe_size - 1);
+
+        while (nr_sectors) {
+                int s = min_t(unsigned, abs(nr_sectors),
+                              d->stripe_size - stripe_offset);
+
+                if (nr_sectors < 0)
+                        s = -s;
+
+                if (stripe >= d->nr_stripes)
+                        return;
+
+                sectors_dirty = atomic_add_return(s,
+                                        d->stripe_sectors_dirty + stripe);
+                if (sectors_dirty == d->stripe_size)
+                        set_bit(stripe, d->full_dirty_stripes);
+                else
+                        clear_bit(stripe, d->full_dirty_stripes);
+
+                nr_sectors -= s;
+                stripe_offset = 0;
+                stripe++;
+        }
+}
+
+void bcache_dev_sectors_dirty_add(struct cache_set *c, unsigned inode,
+                                  u64 offset, int nr_sectors)
+{
+        struct bcache_device *d;
+
+        rcu_read_lock();
+        d = bch_dev_find(c, inode);
+        if (d)
+                __bcache_dev_sectors_dirty_add(d, offset, nr_sectors);
+        rcu_read_unlock();
+}
+
+static bool dirty_pred(struct keybuf *buf, struct bkey_s_c k)
+{
+        struct cached_dev *dc = container_of(buf, struct cached_dev,
+                                             writeback_keys);
+
+        BUG_ON(k.k->p.inode != bcache_dev_inum(&dc->disk));
+
+        return bkey_extent_is_data(k.k) &&
+                !bkey_extent_is_cached(k.k);
+}
+
+static void refill_full_stripes(struct cached_dev *dc)
+{
+        struct keybuf *buf = &dc->writeback_keys;
+        unsigned inode = bcache_dev_inum(&dc->disk);
+        unsigned start_stripe, stripe, next_stripe;
+        bool wrapped = false;
+
+        stripe = offset_to_stripe(&dc->disk, buf->last_scanned.offset);
+
+        if (stripe >= dc->disk.nr_stripes)
+                stripe = 0;
+
+        start_stripe = stripe;
+
+        while (1) {
+                stripe = find_next_bit(dc->disk.full_dirty_stripes,
+                                       dc->disk.nr_stripes, stripe);
+
+                if (stripe == dc->disk.nr_stripes)
+                        goto next;
+
+                next_stripe = find_next_zero_bit(dc->disk.full_dirty_stripes,
+                                                 dc->disk.nr_stripes, stripe);
+
+                buf->last_scanned = POS(inode,
+                                        stripe * dc->disk.stripe_size);
+
+                bch_refill_keybuf(dc->disk.c, buf,
+                                  POS(inode,
+                                      next_stripe * dc->disk.stripe_size),
+                                  dirty_pred);
+
+                if (array_freelist_empty(&buf->freelist))
+                        return;
+
+                stripe = next_stripe;
+next:
+                if (wrapped && stripe > start_stripe)
+                        return;
+
+                if (stripe == dc->disk.nr_stripes) {
+                        stripe = 0;
+                        wrapped = true;
+                }
+        }
+}
+
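+/*
+ * A single writeback pass: repeatedly refill the keybuf with this
+ * device's dirty extents and write them back, until the scan position
+ * has covered the whole keyspace. On devices where partial stripe
+ * writes are expensive (e.g. backed by RAID 5/6), completely dirty
+ * stripes are refilled first, via the full_dirty_stripes bitmap.
+ */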
+static u64 bch_writeback(struct cached_dev *dc)
+{
+        struct keybuf *buf = &dc->writeback_keys;
+        unsigned inode = bcache_dev_inum(&dc->disk);
+        struct bpos start = POS(inode, 0);
+        struct bpos end = POS(inode, KEY_OFFSET_MAX);
+        struct bpos start_pos;
+        u64 sectors_written = 0;
+
+        buf->last_scanned = POS(inode, 0);
+
+        while (bkey_cmp(buf->last_scanned, end) < 0 &&
+               !kthread_should_stop()) {
+                down_write(&dc->writeback_lock);
+
+                if (!atomic_read(&dc->has_dirty)) {
+                        up_write(&dc->writeback_lock);
+                        set_current_state(TASK_INTERRUPTIBLE);
+
+                        if (kthread_should_stop())
+                                return sectors_written;
+
+                        schedule();
+                        try_to_freeze();
+                        return sectors_written;
+                }
+
+                if (bkey_cmp(buf->last_scanned, end) >= 0)
+                        buf->last_scanned = POS(inode, 0);
+
+                if (dc->partial_stripes_expensive) {
+                        refill_full_stripes(dc);
+                        if (array_freelist_empty(&buf->freelist))
+                                goto refill_done;
+                }
+
+                start_pos = buf->last_scanned;
+                bch_refill_keybuf(dc->disk.c, buf, end, dirty_pred);
+
+                if (bkey_cmp(buf->last_scanned, end) >= 0) {
+                        /*
+                         * If we get to the end start scanning again from the
+                         * beginning, and only scan up to where we initially
+                         * started scanning from:
+                         */
+                        buf->last_scanned = start;
+                        bch_refill_keybuf(dc->disk.c, buf, start_pos,
+                                          dirty_pred);
+                }
+
+                if (RB_EMPTY_ROOT(&dc->writeback_keys.keys)) {
+                        atomic_set(&dc->has_dirty, 0);
+                        cached_dev_put(dc);
+                        SET_BDEV_STATE(dc->disk_sb.sb, BDEV_STATE_CLEAN);
+                        bch_write_bdev_super(dc, NULL);
+                }
+
+refill_done:
+                up_write(&dc->writeback_lock);
+
+                bch_ratelimit_reset(&dc->writeback_pd.rate);
+                sectors_written += read_dirty(dc);
+        }
+
+        return sectors_written;
+}
+
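+/*
+ * The writeback thread paces itself against the cache set's write-side
+ * I/O clock: if a pass wrote back less than 1/16th of the cache's
+ * capacity, it sleeps until the clock has advanced another 1/32nd of
+ * capacity past where the pass started, rather than rescanning the
+ * btree immediately.
+ */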
+static int bch_writeback_thread(void *arg)
+{
+        struct cached_dev *dc = arg;
+        struct cache_set *c = dc->disk.c;
+        struct io_clock *clock = &c->io_clock[WRITE];
+        unsigned long last;
+        u64 sectors_written;
+
+        set_freezable();
+
+        while (!kthread_should_stop()) {
+                if (kthread_wait_freezable(dc->writeback_running ||
+                                           test_bit(BCACHE_DEV_DETACHING,
+                                                    &dc->disk.flags)))
+                        break;
+
+                last = atomic_long_read(&clock->now);
+
+                sectors_written = bch_writeback(dc);
+
+                if (sectors_written < c->capacity >> 4)
+                        bch_kthread_io_clock_wait(clock,
+                                                  last + (c->capacity >> 5));
+        }
+
+        return 0;
+}
+
+/**
+ * bch_writeback_recalc_oldest_gens - update oldest_gen pointers from writeback keys
+ *
+ * This prevents us from wrapping around gens for a bucket only referenced from
+ * writeback keybufs. We don't actually care that the data in those buckets is
+ * marked live, only that we don't wrap the gens.
+ */
+void bch_writeback_recalc_oldest_gens(struct cache_set *c)
+{
+        struct radix_tree_iter iter;
+        void **slot;
+
+        rcu_read_lock();
+
+        radix_tree_for_each_slot(slot, &c->devices, &iter, 0) {
+                struct bcache_device *d;
+                struct cached_dev *dc;
+
+                d = radix_tree_deref_slot(slot);
+
+                if (!CACHED_DEV(&d->inode.v))
+                        continue;
+                dc = container_of(d, struct cached_dev, disk);
+
+                bch_keybuf_recalc_oldest_gens(c, &dc->writeback_keys);
+        }
+
+        rcu_read_unlock();
+}
+
+/* Init */
+
+void bch_sectors_dirty_init(struct cached_dev *dc, struct cache_set *c)
+{
+        struct bcache_device *d = &dc->disk;
+        struct btree_iter iter;
+        struct bkey_s_c k;
+
+        /*
+         * We have to do this before the disk is added to the radix tree or we
+         * race with moving GC
+         */
+        for_each_btree_key(&iter, c, BTREE_ID_EXTENTS,
+                           POS(bcache_dev_inum(d), 0), k) {
+                if (k.k->p.inode > bcache_dev_inum(d))
+                        break;
+
+                if (bkey_extent_is_data(k.k) &&
+                    !bkey_extent_is_cached(k.k))
+                        __bcache_dev_sectors_dirty_add(d,
+                                                       bkey_start_offset(k.k),
+                                                       k.k->size);
+
+                bch_btree_iter_cond_resched(&iter);
+        }
+        bch_btree_iter_unlock(&iter);
+
+        dc->writeback_pd.last_actual = bcache_dev_sectors_dirty(d);
+}
+
+void bch_cached_dev_writeback_stop(struct cached_dev *dc)
+{
+        cancel_delayed_work_sync(&dc->writeback_pd_update);
+        if (!IS_ERR_OR_NULL(dc->writeback_thread)) {
+                kthread_stop(dc->writeback_thread);
+                dc->writeback_thread = NULL;
+        }
+}
+
+void bch_cached_dev_writeback_free(struct cached_dev *dc)
+{
+        struct bcache_device *d = &dc->disk;
+
+        mempool_exit(&dc->writeback_page_pool);
+        mempool_exit(&dc->writeback_io_pool);
+        kvfree(d->full_dirty_stripes);
+        kvfree(d->stripe_sectors_dirty);
+}
+
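+/*
+ * The per-stripe arrays are sized from the backing device's capacity:
+ * allocations smaller than 64 pages (PAGE_SIZE << 6) come from
+ * kzalloc(), larger ones from vzalloc(). Note the non-debug default
+ * stripe size of 1 << 31 sectors (1 TiB), which gives backing devices
+ * up to that size a single stripe.
+ */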
+int bch_cached_dev_writeback_init(struct cached_dev *dc)
+{
+        struct bcache_device *d = &dc->disk;
+        sector_t sectors;
+        size_t n;
+
+        sectors = get_capacity(dc->disk.disk);
+
+        if (!d->stripe_size) {
+#ifdef CONFIG_BCACHE_DEBUG
+                d->stripe_size = 1 << 0;
+#else
+                d->stripe_size = 1 << 31;
+#endif
+        }
+
+        pr_debug("stripe size: %u sectors", d->stripe_size);
+        d->nr_stripes = DIV_ROUND_UP_ULL(sectors, d->stripe_size);
+
+        if (!d->nr_stripes ||
+            d->nr_stripes > INT_MAX ||
+            d->nr_stripes > SIZE_MAX / sizeof(atomic_t)) {
+                pr_err("nr_stripes too large or invalid: %u (start sector beyond end of disk?)",
+                       (unsigned)d->nr_stripes);
+                return -ENOMEM;
+        }
+
+        n = d->nr_stripes * sizeof(atomic_t);
+        d->stripe_sectors_dirty = n < PAGE_SIZE << 6
+                ? kzalloc(n, GFP_KERNEL)
+                : vzalloc(n);
+        if (!d->stripe_sectors_dirty) {
+                pr_err("cannot allocate stripe_sectors_dirty");
+                return -ENOMEM;
+        }
+
+        n = BITS_TO_LONGS(d->nr_stripes) * sizeof(unsigned long);
+        d->full_dirty_stripes = n < PAGE_SIZE << 6
+                ? kzalloc(n, GFP_KERNEL)
+                : vzalloc(n);
+        if (!d->full_dirty_stripes) {
+                pr_err("cannot allocate full_dirty_stripes");
+                return -ENOMEM;
+        }
+
+        if (mempool_init_kmalloc_pool(&dc->writeback_io_pool, 4,
+                                      sizeof(struct dirty_io) +
+                                      sizeof(struct bio_vec) *
+                                      DIRTY_IO_MEMPOOL_BVECS) ||
+            mempool_init_page_pool(&dc->writeback_page_pool,
+                                   (64 << 10) / PAGE_SIZE, 0))
+                return -ENOMEM;
+
+        init_rwsem(&dc->writeback_lock);
+        bch_keybuf_init(&dc->writeback_keys);
+
+        dc->writeback_metadata          = true;
+        dc->writeback_running           = true;
+        dc->writeback_percent           = 10;
+        dc->writeback_pd_update_seconds = 5;
+
+        bch_pd_controller_init(&dc->writeback_pd);
+        INIT_DELAYED_WORK(&dc->writeback_pd_update, update_writeback_rate);
+
+        return 0;
+}
+
+int bch_cached_dev_writeback_start(struct cached_dev *dc)
+{
+        dc->writeback_thread = kthread_create(bch_writeback_thread, dc,
+                                              "bcache_writeback");
+        if (IS_ERR(dc->writeback_thread))
+                return PTR_ERR(dc->writeback_thread);
+
+        schedule_delayed_work(&dc->writeback_pd_update,
+                              dc->writeback_pd_update_seconds * HZ);
+
+        bch_writeback_queue(dc);
+
+        return 0;
+}