author | Kent Overstreet <kent.overstreet@gmail.com> | 2016-10-10 05:30:06 -0800
committer | Kent Overstreet <kent.overstreet@gmail.com> | 2016-10-10 05:30:06 -0800
commit | f355b7a7f201be3edae8926a5c1baf5f2cfb6d33
tree | b4100eeb14aebd61086b3da35d1ac23d027368fe
parent | c5c87c72f3d73acf7883697acb6e25f6b25e24cf
bcache: rip out a bunch more data move code
switch to a much, much simpler mechanism for preserving io ordering/locality
-rw-r--r-- | drivers/md/bcache/bcache.h | 3
-rw-r--r-- | drivers/md/bcache/migrate.c | 55
-rw-r--r-- | drivers/md/bcache/move.c | 435
-rw-r--r-- | drivers/md/bcache/move.h | 165
-rw-r--r-- | drivers/md/bcache/move_types.h | 57
-rw-r--r-- | drivers/md/bcache/movinggc.c | 76
-rw-r--r-- | drivers/md/bcache/movinggc.h | 3
-rw-r--r-- | drivers/md/bcache/super.c | 14
-rw-r--r-- | drivers/md/bcache/sysfs.c | 15
-rw-r--r-- | drivers/md/bcache/tier.c | 73
-rw-r--r-- | drivers/md/bcache/tier.h | 2
-rw-r--r-- | include/trace/events/bcache.h | 38
12 files changed, 217 insertions(+), 719 deletions(-)
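
The patch below rips out the per-device `moving_queue` machinery (workqueue, rb-tree read sorting, separate read/write/total counters) and replaces it with a `moving_context` that each mover sets up on the stack. As a reading aid, here is a condensed sketch of the caller pattern the new interface expects, pieced together from the migrate.c, movinggc.c and tier.c hunks. It is not compilable on its own; `rate`, `write_point`, `ptr` and the `interesting()` predicate are stand-ins for the per-caller values (ratelimit or NULL, destination write point, pointer being moved, and moving_pred()/tiering_pred()/etc.), and the iterator-advance step is unchanged by this patch and not shown.

```c
/* Condensed caller pattern; placeholders as noted above. */
struct moving_context ctxt;
struct btree_iter iter;
struct bkey_s_c k;
int ret;

bch_move_ctxt_init(&ctxt, rate, SECTORS_IN_FLIGHT_PER_DEVICE);
bch_btree_iter_init(&iter, c, BTREE_ID_EXTENTS, POS_MIN);

/* bch_move_ctxt_wait() throttles on sectors_in_flight (and the ratelimit) */
while (!bch_move_ctxt_wait(&ctxt) &&
       (k = bch_btree_iter_peek(&iter)).k) {
	if (!interesting(k))
		goto next;

	ret = bch_data_move(c, &ctxt, write_point, k, ptr);
	if (ret == -ENOMEM) {
		bch_btree_iter_unlock(&iter);
		/* allocation failure: wait for some in-flight IO to finish */
		bch_move_ctxt_wait_for_io(&ctxt);
		continue;
	}
next:
	/* advance the iterator as before (unchanged, not shown) */
	;
}

bch_btree_iter_unlock(&iter);
/* issues any remaining writes and waits for all IO to complete */
bch_move_ctxt_exit(&ctxt);
```

Throttling now lives entirely in the context: `bch_move_ctxt_wait()` blocks once `sectors_in_flight` reaches `max_sectors_in_flight` (SECTORS_IN_FLIGHT_PER_DEVICE, 2048 sectors), and `bch_move_ctxt_exit()` flushes the pending writes and waits for everything to drain.
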
diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index 36bf8841c3d4..7256c0ebc3b8 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -429,13 +429,10 @@ struct cache { /* Moving GC: */ struct task_struct *moving_gc_read; - struct moving_queue moving_gc_queue; struct bch_pd_controller moving_gc_pd; /* Tiering: */ - struct moving_queue tiering_queue; struct write_point tiering_write_point; - unsigned tiering_stripe_size; struct write_point copygc_write_point; diff --git a/drivers/md/bcache/migrate.c b/drivers/md/bcache/migrate.c index eb47d6620e1a..363d2b02282a 100644 --- a/drivers/md/bcache/migrate.c +++ b/drivers/md/bcache/migrate.c @@ -16,11 +16,10 @@ static int issue_migration_move(struct cache *ca, struct moving_context *ctxt, struct bkey_s_c k) { - struct moving_queue *q = &ca->moving_gc_queue; struct cache_set *c = ca->set; - struct moving_io *io; struct disk_reservation res; const struct bch_extent_ptr *ptr; + int ret; if (bch_disk_reservation_get(c, &res, k.k->size, 0)) return -ENOSPC; @@ -31,20 +30,15 @@ static int issue_migration_move(struct cache *ca, BUG(); found: - io = moving_io_alloc(c, q, &c->migration_write_point, k, ptr); - if (!io) { - bch_disk_reservation_put(c, &res); - return -ENOMEM; - } + /* XXX: we need to be doing something with the disk reservation */ - bch_data_move(q, ctxt, io); - return 0; + ret = bch_data_move(c, ctxt, &c->migration_write_point, k, ptr); + if (ret) + bch_disk_reservation_put(c, &res); + return ret; } #define MAX_DATA_OFF_ITER 10 -#define MIGRATE_NR 64 -#define MIGRATE_READ_NR 32 -#define MIGRATE_WRITE_NR 32 /* * This moves only the data off, leaving the meta-data (if any) in place. @@ -64,24 +58,13 @@ int bch_move_data_off_device(struct cache *ca) { struct moving_context ctxt; struct cache_set *c = ca->set; - struct moving_queue *queue = &ca->moving_gc_queue; unsigned pass = 0; u64 seen_key_count; int ret = 0; BUG_ON(ca->mi.state == CACHE_ACTIVE); - /* - * This reuses the moving gc queue as it is no longer in use - * by moving gc, which must have been stopped to call this. 
- */ - - BUG_ON(ca->moving_gc_read != NULL); - - queue_io_resize(queue, MIGRATE_NR, MIGRATE_READ_NR, MIGRATE_WRITE_NR); - - BUG_ON(queue->wq == NULL); - bch_moving_context_init(&ctxt, NULL, MOVING_PURPOSE_MIGRATION); + bch_move_ctxt_init(&ctxt, NULL, SECTORS_IN_FLIGHT_PER_DEVICE); ctxt.avoid = ca; /* @@ -111,37 +94,27 @@ int bch_move_data_off_device(struct cache *ca) bch_btree_iter_init(&iter, c, BTREE_ID_EXTENTS, POS_MIN); - while ((k = bch_btree_iter_peek(&iter)).k) { + while (!bch_move_ctxt_wait(&ctxt) && + (k = bch_btree_iter_peek(&iter)).k) { if (!bkey_extent_is_data(k.k) || !bch_extent_has_device(bkey_s_c_to_extent(k), ca->sb.nr_this_dev)) goto next; - if (bch_queue_full(queue)) { - bch_btree_iter_unlock(&iter); - - if (queue->rotational) - bch_queue_run(queue, &ctxt); - else - wait_event(queue->wait, - !bch_queue_full(queue)); - continue; - } - ret = issue_migration_move(ca, &ctxt, k); if (ret == -ENOMEM) { bch_btree_iter_unlock(&iter); /* - * memory allocation failure, wait for IOs to - * finish + * memory allocation failure, wait for some IO + * to finish */ - bch_queue_run(queue, &ctxt); + bch_move_ctxt_wait_for_io(&ctxt); continue; } if (ret == -ENOSPC) { bch_btree_iter_unlock(&iter); - bch_queue_run(queue, &ctxt); + bch_move_ctxt_exit(&ctxt); return -ENOSPC; } BUG_ON(ret); @@ -153,7 +126,7 @@ next: } ret = bch_btree_iter_unlock(&iter); - bch_queue_run(queue, &ctxt); + bch_move_ctxt_exit(&ctxt); if (ret) return ret; diff --git a/drivers/md/bcache/move.c b/drivers/md/bcache/move.c index 52a82740304f..a534b0a62801 100644 --- a/drivers/md/bcache/move.c +++ b/drivers/md/bcache/move.c @@ -151,38 +151,10 @@ void bch_migrate_write_init(struct cache_set *c, m->op.index_update_fn = bch_migrate_index_update; } -static void moving_error(struct moving_context *ctxt, unsigned flag) -{ - atomic_inc(&ctxt->error_count); - atomic_or(flag, &ctxt->error_flags); -} - -void bch_moving_context_init(struct moving_context *ctxt, - struct bch_ratelimit *rate, - enum moving_purpose purpose) -{ - memset(ctxt, 0, sizeof(*ctxt)); - ctxt->rate = rate; - ctxt->purpose = purpose; - closure_init_stack(&ctxt->cl); -} - -static bool bch_queue_reads_pending(struct moving_queue *q) -{ - return atomic_read(&q->read_count) || !RB_EMPTY_ROOT(&q->tree); -} - -static void bch_queue_write(struct moving_queue *q) -{ - BUG_ON(q->wq == NULL); - queue_work(q->wq, &q->work); -} - static void migrate_bio_init(struct moving_io *io, struct bio *bio, unsigned sectors) { bio_init(bio); - bio_get(bio); bio_set_prio(bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)); bio->bi_iter.bi_size = sectors << 9; @@ -192,91 +164,31 @@ static void migrate_bio_init(struct moving_io *io, struct bio *bio, bch_bio_map(bio, NULL); } -struct moving_io *moving_io_alloc(struct cache_set *c, - struct moving_queue *q, - struct write_point *wp, - struct bkey_s_c k, - const struct bch_extent_ptr *move_ptr) -{ - struct moving_io *io; - - io = kzalloc(sizeof(struct moving_io) + sizeof(struct bio_vec) - * DIV_ROUND_UP(k.k->size, PAGE_SECTORS), - GFP_KERNEL); - if (!io) - return NULL; - - migrate_bio_init(io, &io->rbio.bio, k.k->size); - - if (bio_alloc_pages(&io->rbio.bio, GFP_KERNEL)) { - kfree(io); - return NULL; - } - - migrate_bio_init(io, &io->write.wbio.bio.bio, k.k->size); - io->write.wbio.bio.bio.bi_iter.bi_sector = bkey_start_offset(k.k); - - bch_migrate_write_init(c, &io->write, wp, k, move_ptr, 0); - - if (move_ptr) - io->sort_key = move_ptr->offset; - - return io; -} - -void moving_io_free(struct moving_io *io) -{ - 
bch_bio_free_pages(&io->write.wbio.bio.bio); - kfree(io); -} - static void moving_io_destructor(struct closure *cl) { struct moving_io *io = container_of(cl, struct moving_io, cl); - struct moving_queue *q = io->q; - unsigned long flags; - bool kick_writes = true; + struct moving_context *ctxt = io->ctxt; //if (io->replace.failures) // trace_bcache_copy_collision(q, &io->key.k); - spin_lock_irqsave(&q->lock, flags); - - if (io->read_issued) { - BUG_ON(!atomic_read(&q->read_count)); - atomic_dec(&q->read_count); - } - - if (io->write_issued) { - BUG_ON(!atomic_read(&q->write_count)); - atomic_dec(&q->write_count); - trace_bcache_move_write_done(q, &io->write.key.k); - } - - BUG_ON(!atomic_read(&q->count)); - atomic_dec(&q->count); - wake_up(&q->wait); - - list_del_init(&io->list); + atomic_sub(io->write.key.k.size, &ctxt->sectors_in_flight); + wake_up(&ctxt->wait); - if (q->rotational && bch_queue_reads_pending(q)) - kick_writes = false; - - if (list_empty(&q->pending)) - kick_writes = false; - - spin_unlock_irqrestore(&q->lock, flags); - - moving_io_free(io); + bch_bio_free_pages(&io->write.wbio.bio.bio); + kfree(io); +} - if (kick_writes) - bch_queue_write(q); +static void moving_error(struct moving_context *ctxt, unsigned flag) +{ + atomic_inc(&ctxt->error_count); + atomic_or(flag, &ctxt->error_flags); } static void moving_io_after_write(struct closure *cl) { struct moving_io *io = container_of(cl, struct moving_io, cl); - struct moving_context *ctxt = io->context; + struct moving_context *ctxt = io->ctxt; if (io->write.op.error) moving_error(ctxt, MOVING_FLAG_WRITE); @@ -288,174 +200,40 @@ static void write_moving(struct moving_io *io) { struct bch_write_op *op = &io->write.op; - if (op->error) + if (op->error) { closure_return_with_destructor(&io->cl, moving_io_destructor); - else { + } else { closure_call(&op->cl, bch_write, NULL, &io->cl); closure_return_with_destructor(&io->cl, moving_io_after_write); } } -static void bch_queue_write_work(struct work_struct *work) +static inline struct moving_io *next_pending_write(struct moving_context *ctxt) { - struct moving_queue *q = container_of(work, struct moving_queue, work); - struct moving_io *io; - - spin_lock_irq(&q->lock); - - if (q->rotational && bch_queue_reads_pending(q)) { - /* All reads should have finished before writes start */ - spin_unlock_irq(&q->lock); - return; - } - - while (atomic_read(&q->write_count) < q->max_write_count) { - io = list_first_entry_or_null(&q->pending, - struct moving_io, list); - /* - * We only issue the writes in insertion order to preserve - * any linearity in the original key list/tree, so if we - * find an io whose read hasn't completed, we don't - * scan beyond it. Eventually that read will complete, - * at which point we may issue multiple writes (for it - * and any following entries whose reads had already - * completed and we had not examined here). - */ - if (!io || !io->read_completed) - break; + struct moving_io *io = + list_first_entry_or_null(&ctxt->reads, struct moving_io, list); - BUG_ON(io->write_issued); - atomic_inc(&q->write_count); - io->write_issued = 1; - list_del(&io->list); - list_add_tail(&io->list, &q->write_pending); - trace_bcache_move_write(q, &io->write.key.k); - spin_unlock_irq(&q->lock); - write_moving(io); - spin_lock_irq(&q->lock); - } - - spin_unlock_irq(&q->lock); -} - -/* - * IMPORTANT: The caller of queue_init must have zero-filled it when it - * allocates it. 
- */ - -int bch_queue_init(struct moving_queue *q, - struct cache_set *c, - unsigned max_count, - unsigned max_read_count, - unsigned max_write_count, - bool rotational, - const char *name) -{ - INIT_WORK(&q->work, bch_queue_write_work); - - q->max_count = max_count; - q->max_read_count = max_read_count; - q->max_write_count = max_write_count; - q->rotational = rotational; - - spin_lock_init(&q->lock); - INIT_LIST_HEAD(&q->pending); - INIT_LIST_HEAD(&q->write_pending); - q->tree = RB_ROOT; - init_waitqueue_head(&q->wait); - - q->wq = alloc_workqueue(name, - WQ_UNBOUND|WQ_FREEZABLE|WQ_MEM_RECLAIM, 1); - if (!q->wq) - return -ENOMEM; - - return 0; -} - -void queue_io_resize(struct moving_queue *q, - unsigned max_io, - unsigned max_read, - unsigned max_write) -{ - unsigned long flags; - - spin_lock_irqsave(&q->lock, flags); - q->max_count = max_io; - q->max_read_count = max_read; - q->max_write_count = max_write; - spin_unlock_irqrestore(&q->lock, flags); -} - -void bch_queue_destroy(struct moving_queue *q) -{ - if (q->wq) - destroy_workqueue(q->wq); - q->wq = NULL; -} - -static void pending_recalc_oldest_gens(struct cache_set *c, struct list_head *l) -{ - struct moving_io *io; - - list_for_each_entry(io, l, list) { - /* - * This only marks the (replacement) key and not the - * insertion key in the bch_write_op, as the insertion - * key should be a subset of the replacement key except - * for any new pointers added by the write, and those - * don't need to be marked because they are pointing - * to open buckets until the write completes - */ - bch_btree_key_recalc_oldest_gen(c, - bkey_i_to_s_c(&io->write.key)); - } -} - -void bch_queue_recalc_oldest_gens(struct cache_set *c, struct moving_queue *q) -{ - unsigned long flags; - - /* 2nd, mark the keys in the I/Os */ - spin_lock_irqsave(&q->lock, flags); - - pending_recalc_oldest_gens(c, &q->pending); - pending_recalc_oldest_gens(c, &q->write_pending); - - spin_unlock_irqrestore(&q->lock, flags); + return io && io->read_completed ? 
io : NULL; } static void read_moving_endio(struct bio *bio) { struct closure *cl = bio->bi_private; struct moving_io *io = container_of(cl, struct moving_io, cl); - struct moving_queue *q = io->q; + struct moving_context *ctxt = io->ctxt; - unsigned long flags; - - trace_bcache_move_read_done(q, &io->write.key.k); + trace_bcache_move_read_done(&io->write.key.k); if (bio->bi_error) { io->write.op.error = bio->bi_error; - moving_error(io->context, MOVING_FLAG_READ); + moving_error(io->ctxt, MOVING_FLAG_READ); } - bio_put(bio); - - BUG_ON(!io->read_issued); - BUG_ON(io->read_completed); - - spin_lock_irqsave(&q->lock, flags); - io->read_issued = 0; - io->read_completed = 1; - - BUG_ON(!atomic_read(&q->read_count)); - atomic_dec(&q->read_count); - spin_unlock_irqrestore(&q->lock, flags); + io->read_completed = true; + if (next_pending_write(ctxt)) + wake_up(&ctxt->wait); - wake_up(&q->wait); - - if (!q->rotational) - bch_queue_write(q); + closure_put(&ctxt->cl); } static void __bch_data_move(struct closure *cl) @@ -465,7 +243,7 @@ static void __bch_data_move(struct closure *cl) struct extent_pick_ptr pick; bch_extent_pick_ptr_avoiding(c, bkey_i_to_s_c(&io->write.key), - io->context->avoid, &pick); + io->ctxt->avoid, &pick); if (IS_ERR_OR_NULL(pick.ca)) closure_return_with_destructor(cl, moving_io_destructor); @@ -473,107 +251,120 @@ static void __bch_data_move(struct closure *cl) io->rbio.bio.bi_iter.bi_sector = bkey_start_offset(&io->write.key.k); io->rbio.bio.bi_end_io = read_moving_endio; + /* + * dropped by read_moving_endio() - guards against use after free of + * ctxt when doing wakeup + */ + closure_get(&io->ctxt->cl); + bch_read_extent(c, &io->rbio, bkey_i_to_s_c(&io->write.key), &pick, BCH_READ_IS_LAST); } -static int moving_io_cmp(struct moving_io *io1, struct moving_io *io2) +int bch_data_move(struct cache_set *c, + struct moving_context *ctxt, + struct write_point *wp, + struct bkey_s_c k, + const struct bch_extent_ptr *move_ptr) { - if (io1->sort_key < io2->sort_key) - return -1; - else if (io1->sort_key > io2->sort_key) - return 1; - else { - /* We don't want duplicate keys. 
Eventually, we will have - * support for GC with duplicate pointers -- for now, - * just sort them randomly instead */ - if (io1 < io2) - return -1; - else if (io1 > io2) - return 1; - BUG(); + struct moving_io *io; + + io = kzalloc(sizeof(struct moving_io) + sizeof(struct bio_vec) * + DIV_ROUND_UP(k.k->size, PAGE_SECTORS), + GFP_KERNEL); + if (!io) + return -ENOMEM; + + io->ctxt = ctxt; + + migrate_bio_init(io, &io->rbio.bio, k.k->size); + + if (bio_alloc_pages(&io->rbio.bio, GFP_KERNEL)) { + kfree(io); + return -ENOMEM; } -} -void bch_data_move(struct moving_queue *q, - struct moving_context *ctxt, - struct moving_io *io) -{ - unsigned size = io->write.key.k.size; + migrate_bio_init(io, &io->write.wbio.bio.bio, k.k->size); + bio_get(&io->write.wbio.bio.bio); + io->write.wbio.bio.bio.bi_iter.bi_sector = bkey_start_offset(k.k); + + bch_migrate_write_init(c, &io->write, wp, k, move_ptr, 0); + + trace_bcache_move_read(&io->write.key.k); ctxt->keys_moved++; - ctxt->sectors_moved += size; + ctxt->sectors_moved += k.k->size; if (ctxt->rate) - bch_ratelimit_increment(ctxt->rate, size); - - BUG_ON(q->wq == NULL); - io->q = q; - io->context = ctxt; - - spin_lock_irq(&q->lock); - atomic_inc(&q->count); - list_add_tail(&io->list, &q->pending); - trace_bcache_move_read(q, &io->write.key.k); - - if (q->rotational) - BUG_ON(RB_INSERT(&q->tree, io, node, moving_io_cmp)); - else { - BUG_ON(io->read_issued); - io->read_issued = 1; - atomic_inc(&q->read_count); - } + bch_ratelimit_increment(ctxt->rate, k.k->size); - spin_unlock_irq(&q->lock); + atomic_add(k.k->size, &ctxt->sectors_in_flight); + list_add_tail(&io->list, &ctxt->reads); - if (!q->rotational) - closure_call(&io->cl, __bch_data_move, NULL, &ctxt->cl); + closure_call(&io->cl, __bch_data_move, NULL, &ctxt->cl); + return 0; } -/* Rotational device queues */ - -static bool bch_queue_read(struct moving_queue *q, - struct moving_context *ctxt) +static void do_pending_writes(struct moving_context *ctxt) { - struct rb_node *node; struct moving_io *io; - BUG_ON(!q->rotational); - - spin_lock_irq(&q->lock); - node = rb_first(&q->tree); - if (!node) { - spin_unlock_irq(&q->lock); - return false; + while ((io = next_pending_write(ctxt))) { + list_del(&io->list); + trace_bcache_move_write(&io->write.key.k); + write_moving(io); } +} - io = rb_entry(node, struct moving_io, node); - rb_erase(node, &q->tree); - wake_up(&q->wait); +#define move_ctxt_wait_event(_ctxt, _cond) \ +do { \ + do_pending_writes(_ctxt); \ + \ + if (_cond) \ + break; \ + __wait_event((_ctxt)->wait, \ + next_pending_write(_ctxt) || (_cond)); \ +} while (1) + +int bch_move_ctxt_wait(struct moving_context *ctxt) +{ + move_ctxt_wait_event(ctxt, + atomic_read(&ctxt->sectors_in_flight) < + ctxt->max_sectors_in_flight); - io->read_issued = 1; - atomic_inc(&q->read_count); - spin_unlock_irq(&q->lock); + return ctxt->rate + ? 
bch_ratelimit_wait_freezable_stoppable(ctxt->rate, &ctxt->cl) + : 0; +} - closure_call(&io->cl, __bch_data_move, NULL, &ctxt->cl); - return true; +void bch_move_ctxt_wait_for_io(struct moving_context *ctxt) +{ + unsigned sectors_pending = atomic_read(&ctxt->sectors_in_flight); + + move_ctxt_wait_event(ctxt, + !atomic_read(&ctxt->sectors_in_flight) || + atomic_read(&ctxt->sectors_in_flight) != sectors_pending); } -void bch_queue_run(struct moving_queue *q, struct moving_context *ctxt) +void bch_move_ctxt_exit(struct moving_context *ctxt) { - if (!q->rotational) - goto sync; + move_ctxt_wait_event(ctxt, !atomic_read(&ctxt->sectors_in_flight)); + closure_sync(&ctxt->cl); - while (!bch_moving_context_wait(ctxt)) { - wait_event(q->wait, - atomic_read(&q->read_count) < q->max_read_count); + EBUG_ON(!list_empty(&ctxt->reads)); + EBUG_ON(atomic_read(&ctxt->sectors_in_flight)); +} - if (!bch_queue_read(q, ctxt)) - break; - } +void bch_move_ctxt_init(struct moving_context *ctxt, + struct bch_ratelimit *rate, + unsigned max_sectors_in_flight) +{ + memset(ctxt, 0, sizeof(*ctxt)); + closure_init_stack(&ctxt->cl); - wait_event(q->wait, !bch_queue_reads_pending(q)); - bch_queue_write(q); -sync: - closure_sync(&ctxt->cl); + ctxt->rate = rate; + ctxt->max_sectors_in_flight = max_sectors_in_flight; + + INIT_LIST_HEAD(&ctxt->reads); + init_waitqueue_head(&ctxt->wait); } diff --git a/drivers/md/bcache/move.h b/drivers/md/bcache/move.h index 75f507535887..787023e47649 100644 --- a/drivers/md/bcache/move.h +++ b/drivers/md/bcache/move.h @@ -5,13 +5,6 @@ #include "io_types.h" #include "move_types.h" -enum moving_purpose { - MOVING_PURPOSE_UNKNOWN, /* Un-init */ - MOVING_PURPOSE_MIGRATION, - MOVING_PURPOSE_TIERING, - MOVING_PURPOSE_COPY_GC, -}; - enum moving_flag_bitnos { MOVING_FLAG_BITNO_READ = 0, MOVING_FLAG_BITNO_WRITE, @@ -20,6 +13,24 @@ enum moving_flag_bitnos { #define MOVING_FLAG_READ (1U << MOVING_FLAG_BITNO_READ) #define MOVING_FLAG_WRITE (1U << MOVING_FLAG_BITNO_WRITE) +struct migrate_write { + BKEY_PADDED(key); + bool promote; + bool move; + struct bch_extent_ptr move_ptr; + struct bch_write_op op; + struct bch_write_bio wbio; +}; + +void bch_migrate_write_init(struct cache_set *, + struct migrate_write *, + struct write_point *, + struct bkey_s_c, + const struct bch_extent_ptr *, + unsigned); + +#define SECTORS_IN_FLIGHT_PER_DEVICE 2048 + struct moving_context { /* Closure for waiting on all reads and writes to complete */ struct closure cl; @@ -28,7 +39,6 @@ struct moving_context { atomic_t error_count; atomic_t error_flags; - /* Key and sector moves issued, updated from submission context */ u64 keys_moved; u64 sectors_moved; @@ -39,140 +49,39 @@ struct moving_context { /* Try to avoid reading the following device */ struct cache *avoid; - /* Debugging... 
*/ - enum moving_purpose purpose; -}; - -void bch_moving_context_init(struct moving_context *, struct bch_ratelimit *, - enum moving_purpose); - -static inline int bch_moving_context_wait(struct moving_context *ctxt) -{ - if (ctxt->rate == NULL) - return 0; + struct list_head reads; - return bch_ratelimit_wait_freezable_stoppable(ctxt->rate, &ctxt->cl); -} + /* Configuration */ + unsigned max_sectors_in_flight; + atomic_t sectors_in_flight; -struct migrate_write { - BKEY_PADDED(key); - bool promote; - bool move; - struct bch_extent_ptr move_ptr; - struct bch_write_op op; - struct bch_write_bio wbio; + wait_queue_head_t wait; }; -void bch_migrate_write_init(struct cache_set *, - struct migrate_write *, - struct write_point *, - struct bkey_s_c, - const struct bch_extent_ptr *, - unsigned); - struct moving_io { struct list_head list; struct rb_node node; struct closure cl; - struct moving_queue *q; - struct moving_context *context; + struct moving_context *ctxt; struct migrate_write write; - /* Sort key for moving_queue->tree */ - u64 sort_key; - /* Protected by q->lock */ - - /* - * 1) !read_issued && !read_completed - * - Closure is not running yet, starts when read_issued is set - * - IO is in q->tree (if q->rotational) and q->pending - * 2) !write_issued && !write_completed: - * - IO is in q->pending - * 3) write_issued: - * - IO is in q->write_pending - * 4) write_completed: - * - Closure is about to return and the IO is about to be freed - * - * If read_issued, we hold a reference on q->read_count - * If write_issued, we hold a reference on q->write_count - * Until IO is freed, we hold a reference on q->count - */ - unsigned read_issued:1; - unsigned read_completed:1; - unsigned write_issued:1; + bool read_completed; struct bch_read_bio rbio; /* Must be last since it is variable size */ struct bio_vec bi_inline_vecs[0]; }; -void moving_io_free(struct moving_io *); -struct moving_io *moving_io_alloc(struct cache_set *, - struct moving_queue *, - struct write_point *, - struct bkey_s_c, - const struct bch_extent_ptr *); - -typedef struct moving_io *(moving_queue_fn)(struct moving_queue *, - struct moving_context *); - -int bch_queue_init(struct moving_queue *, - struct cache_set *, - unsigned max_ios, - unsigned max_reads, - unsigned max_writes, - bool rotational, - const char *); - -/* - * bch_queue_full() - return if more reads can be queued with bch_data_move(). - * - * In rotational mode, always returns false if no reads are in flight (see - * how max_count is initialized in bch_queue_init()). 
- */ -static inline bool bch_queue_full(struct moving_queue *q) -{ - EBUG_ON(atomic_read(&q->count) > q->max_count); - EBUG_ON(atomic_read(&q->read_count) > q->max_read_count); - - return atomic_read(&q->count) == q->max_count || - atomic_read(&q->read_count) == q->max_read_count; -} - -void bch_data_move(struct moving_queue *, - struct moving_context *, - struct moving_io *); -void queue_io_resize(struct moving_queue *, - unsigned, - unsigned, - unsigned); -void bch_queue_destroy(struct moving_queue *); - -void bch_queue_recalc_oldest_gens(struct cache_set *, struct moving_queue *); - -void bch_queue_run(struct moving_queue *, struct moving_context *); - -#define sysfs_queue_attribute(name) \ - rw_attribute(name##_max_count); \ - rw_attribute(name##_max_read_count); \ - rw_attribute(name##_max_write_count); - -#define sysfs_queue_files(name) \ - &sysfs_##name##_max_count, \ - &sysfs_##name##_max_read_count, \ - &sysfs_##name##_max_write_count - -#define sysfs_queue_show(name, var) \ -do { \ - sysfs_hprint(name##_max_count, (var)->max_count); \ - sysfs_print(name##_max_read_count, (var)->max_read_count); \ - sysfs_print(name##_max_write_count, (var)->max_write_count);\ -} while (0) - -#define sysfs_queue_store(name, var) \ -do { \ - sysfs_strtoul(name##_max_count, (var)->max_count); \ - sysfs_strtoul(name##_max_read_count, (var)->max_read_count); \ - sysfs_strtoul(name##_max_write_count, (var)->max_write_count); \ -} while (0) +int bch_data_move(struct cache_set *, + struct moving_context *, + struct write_point *, + struct bkey_s_c, + const struct bch_extent_ptr *); + +int bch_move_ctxt_wait(struct moving_context *); +void bch_move_ctxt_wait_for_io(struct moving_context *); + +void bch_move_ctxt_exit(struct moving_context *); +void bch_move_ctxt_init(struct moving_context *, struct bch_ratelimit *, + unsigned); #endif /* _BCACHE_MOVE_H */ diff --git a/drivers/md/bcache/move_types.h b/drivers/md/bcache/move_types.h index 294a26d0c7af..0e2275e2d3f5 100644 --- a/drivers/md/bcache/move_types.h +++ b/drivers/md/bcache/move_types.h @@ -1,61 +1,4 @@ #ifndef _BCACHE_MOVE_TYPES_H #define _BCACHE_MOVE_TYPES_H -/* - * We rely on moving_queue being kzalloc'd so that the initial value of - * the flags is 0. - */ - -struct moving_queue { - struct work_struct work; - struct workqueue_struct *wq; - - /* Configuration */ - unsigned max_count; - unsigned max_read_count; - unsigned max_write_count; - - /* - * If true, reads are coming from rotational media. All reads - * are queued up on @tree and sorted by physical location prior - * to being submitted. - */ - bool rotational; - - /* Protects everything below */ - spinlock_t lock; - - /* - * Tree of struct moving_io, sorted by moving_io->sort_key. - * Contains reads which have not yet been issued; when a read is - * issued, it is removed from the tree. - * - * Only used if @rotational is set. - */ - struct rb_root tree; - - /* - * List of struct moving_io, sorted by logical offset. - * Contains writes which have not yet been issued; when a write is - * issued, it is removed from the list. - * - * Writes are issued in logical offset order, and only when all - * prior writes have been issued. - */ - struct list_head pending; - - /* - * List of struct moving_io, sorted by logical offset. - * - * Contains writes which are in-flight. 
- */ - struct list_head write_pending; - - atomic_t count; - atomic_t read_count; - atomic_t write_count; - - wait_queue_head_t wait; -}; - #endif /* _BCACHE_MOVE_TYPES_H */ diff --git a/drivers/md/bcache/movinggc.c b/drivers/md/bcache/movinggc.c index ceb85f12ba73..d93adbd2e609 100644 --- a/drivers/md/bcache/movinggc.c +++ b/drivers/md/bcache/movinggc.c @@ -45,10 +45,9 @@ static int issue_moving_gc_move(struct cache *ca, struct moving_context *ctxt, struct bkey_s_c k) { - struct moving_queue *q = &ca->moving_gc_queue; struct cache_set *c = ca->set; const struct bch_extent_ptr *ptr; - struct moving_io *io; + int ret; extent_for_each_ptr(bkey_s_c_to_extent(k), ptr) if ((ca->sb.nr_this_dev == ptr->dev) && @@ -58,48 +57,36 @@ static int issue_moving_gc_move(struct cache *ca, /* We raced - bucket's been reused */ return 0; found: - io = moving_io_alloc(c, q, &ca->copygc_write_point, k, ptr); - if (!io) { + ret = bch_data_move(c, ctxt, &ca->copygc_write_point, k, ptr); + if (!ret) + trace_bcache_gc_copy(k.k); + else trace_bcache_moving_gc_alloc_fail(c, k.k->size); - return -ENOMEM; - } - - trace_bcache_gc_copy(k.k); - - bch_data_move(q, ctxt, io); - return 0; + return ret; } -static void read_moving(struct cache *ca, struct moving_context *ctxt) +static void read_moving(struct cache *ca, size_t buckets_to_move) { struct cache_set *c = ca->set; + struct moving_context ctxt; struct btree_iter iter; struct bkey_s_c k; bch_ratelimit_reset(&ca->moving_gc_pd.rate); + bch_move_ctxt_init(&ctxt, &ca->moving_gc_pd.rate, + SECTORS_IN_FLIGHT_PER_DEVICE); bch_btree_iter_init(&iter, c, BTREE_ID_EXTENTS, POS_MIN); - while (!bch_moving_context_wait(ctxt) && + while (!bch_move_ctxt_wait(&ctxt) && (k = bch_btree_iter_peek(&iter)).k) { if (!moving_pred(ca, k)) goto next; - if (bch_queue_full(&ca->moving_gc_queue)) { - bch_btree_iter_unlock(&iter); - - if (ca->moving_gc_queue.rotational) - bch_queue_run(&ca->moving_gc_queue, ctxt); - else - wait_event(ca->moving_gc_queue.wait, - !bch_queue_full(&ca->moving_gc_queue)); - continue; - } - - if (issue_moving_gc_move(ca, ctxt, k)) { + if (issue_moving_gc_move(ca, &ctxt, k)) { bch_btree_iter_unlock(&iter); - /* memory allocation failure, wait for IOs to finish */ - bch_queue_run(&ca->moving_gc_queue, ctxt); + /* memory allocation failure, wait for some IO to finish */ + bch_move_ctxt_wait_for_io(&ctxt); continue; } next: @@ -112,7 +99,9 @@ next: } bch_btree_iter_unlock(&iter); - bch_queue_run(&ca->moving_gc_queue, ctxt); + bch_move_ctxt_exit(&ctxt); + trace_bcache_moving_gc_end(ca, ctxt.sectors_moved, ctxt.keys_moved, + buckets_to_move); } static bool have_copygc_reserve(struct cache *ca) @@ -137,11 +126,6 @@ static void bch_moving_gc(struct cache *ca) unsigned sectors_used, i; int reserve_sectors; - struct moving_context ctxt; - - bch_moving_context_init(&ctxt, &ca->moving_gc_pd.rate, - MOVING_PURPOSE_COPY_GC); - if (!have_copygc_reserve(ca)) { struct closure cl; @@ -214,15 +198,12 @@ static void bch_moving_gc(struct cache *ca) mutex_unlock(&ca->heap_lock); up_read(&c->gc_lock); - read_moving(ca, &ctxt); + read_moving(ca, buckets_to_move); if (IS_ENABLED(CONFIG_BCACHE_DEBUG)) { for_each_bucket(g, ca) BUG_ON(g->copygc_gen && bucket_sectors_used(g)); } - - trace_bcache_moving_gc_end(ca, ctxt.sectors_moved, ctxt.keys_moved, - buckets_to_move); } static int bch_moving_gc_thread(void *arg) @@ -260,24 +241,10 @@ static int bch_moving_gc_thread(void *arg) return 0; } -#define MOVING_GC_NR 64 -#define MOVING_GC_READ_NR 32 -#define MOVING_GC_WRITE_NR 32 - -int 
bch_moving_init_cache(struct cache *ca) +void bch_moving_init_cache(struct cache *ca) { - bool rotational = !blk_queue_nonrot(bdev_get_queue(ca->disk_sb.bdev)); - bch_pd_controller_init(&ca->moving_gc_pd); ca->moving_gc_pd.d_term = 0; - - return bch_queue_init(&ca->moving_gc_queue, - ca->set, - MOVING_GC_NR, - MOVING_GC_READ_NR, - MOVING_GC_WRITE_NR, - rotational, - "bch_copygc_write"); } int bch_moving_gc_thread_start(struct cache *ca) @@ -309,8 +276,3 @@ void bch_moving_gc_stop(struct cache *ca) kthread_stop(ca->moving_gc_read); ca->moving_gc_read = NULL; } - -void bch_moving_gc_destroy(struct cache *ca) -{ - bch_queue_destroy(&ca->moving_gc_queue); -} diff --git a/drivers/md/bcache/movinggc.h b/drivers/md/bcache/movinggc.h index 6ee8db55f44a..5f15308593d4 100644 --- a/drivers/md/bcache/movinggc.h +++ b/drivers/md/bcache/movinggc.h @@ -23,9 +23,8 @@ #define COPYGC_SECTORS_PER_ITER(ca) \ ((ca)->mi.bucket_size * COPYGC_BUCKETS_PER_ITER(ca)) -int bch_moving_init_cache(struct cache *); +void bch_moving_init_cache(struct cache *); void bch_moving_gc_stop(struct cache *); int bch_moving_gc_thread_start(struct cache *); -void bch_moving_gc_destroy(struct cache *); #endif diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index bf8dfed64939..ee383193726d 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -1654,15 +1654,6 @@ static void bch_cache_free_work(struct work_struct *work) struct cache_set *c = ca->set; unsigned i; - /* - * These test internally and skip if never initialized, - * hence we don't need to test here. However, we do need - * to unregister them before we drop our reference to - * @c. - */ - bch_moving_gc_destroy(ca); - bch_tiering_write_destroy(ca); - cancel_work_sync(&ca->io_error_work); if (c && c->kobj.state_in_sysfs) { @@ -1930,6 +1921,7 @@ static const char *cache_alloc(struct bcache_superblock *sb, spin_lock_init(&ca->freelist_lock); spin_lock_init(&ca->prio_buckets_lock); mutex_init(&ca->heap_lock); + bch_moving_init_cache(ca); ca->disk_sb = *sb; ca->disk_sb.bdev->bd_holder = ca; @@ -1976,9 +1968,7 @@ static const char *cache_alloc(struct bcache_superblock *sb, !(ca->bio_prio = bio_kmalloc(GFP_NOIO, bucket_pages(ca))) || bioset_init(&ca->replica_set, 4, offsetof(struct bch_write_bio, bio.bio)) || - !(ca->sectors_written = alloc_percpu(*ca->sectors_written)) || - bch_moving_init_cache(ca) || - bch_tiering_init_cache(ca)) + !(ca->sectors_written = alloc_percpu(*ca->sectors_written))) goto err; ca->prio_last_buckets = ca->prio_buckets + prio_buckets(ca); diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c index 54ea801c1690..8e7e97979fed 100644 --- a/drivers/md/bcache/sysfs.c +++ b/drivers/md/bcache/sysfs.c @@ -138,15 +138,11 @@ rw_attribute(cache_replacement_policy); rw_attribute(foreground_write_ratelimit_enabled); rw_attribute(copy_gc_enabled); -sysfs_queue_attribute(copy_gc); sysfs_pd_controller_attribute(copy_gc); rw_attribute(tiering_enabled); rw_attribute(tiering_percent); sysfs_pd_controller_attribute(tiering); -sysfs_queue_attribute(tiering); -rw_attribute(tiering_stripe_size); - sysfs_pd_controller_attribute(foreground_write); rw_attribute(btree_flush_delay); @@ -1233,10 +1229,6 @@ SHOW(bch_cache) sysfs_print(has_metadata, ca->mi.has_metadata); sysfs_pd_controller_show(copy_gc, &ca->moving_gc_pd); - sysfs_queue_show(copy_gc, &ca->moving_gc_queue); - - sysfs_queue_show(tiering, &ca->tiering_queue); - sysfs_print(tiering_stripe_size, ca->tiering_stripe_size); if (attr == &sysfs_cache_replacement_policy) return 
bch_snprint_string_list(buf, PAGE_SIZE, @@ -1273,10 +1265,6 @@ STORE(__bch_cache) struct cache_member *mi = &c->disk_mi[ca->sb.nr_this_dev]; sysfs_pd_controller_store(copy_gc, &ca->moving_gc_pd); - sysfs_queue_store(copy_gc, &ca->moving_gc_queue); - - sysfs_queue_store(tiering, &ca->tiering_queue); - sysfs_strtoul(tiering_stripe_size, ca->tiering_stripe_size); if (attr == &sysfs_discard) { bool v = strtoul_or_return(buf); @@ -1399,9 +1387,6 @@ static struct attribute *bch_cache_files[] = { &sysfs_alloc_debug, sysfs_pd_controller_files(copy_gc), - sysfs_queue_files(copy_gc), - sysfs_queue_files(tiering), - &sysfs_tiering_stripe_size, NULL }; KTYPE(bch_cache); diff --git a/drivers/md/bcache/tier.c b/drivers/md/bcache/tier.c index b69456d9f95c..baecaf4c3aa7 100644 --- a/drivers/md/bcache/tier.c +++ b/drivers/md/bcache/tier.c @@ -86,21 +86,17 @@ static int issue_tiering_move(struct cache_set *c, struct moving_context *ctxt, struct bkey_s_c k) { - struct moving_io *io; + int ret; - io = moving_io_alloc(c, - &s->ca->tiering_queue, - &s->ca->tiering_write_point, - k, NULL); - if (!io) { + ret = bch_data_move(c, ctxt, &s->ca->tiering_write_point, k, NULL); + if (!ret) { + trace_bcache_tiering_copy(k.k); + s->sectors += k.k->size; + } else { trace_bcache_tiering_alloc_fail(c, k.k->size); - return -ENOMEM; } - trace_bcache_tiering_copy(k.k); - bch_data_move(&s->ca->tiering_queue, ctxt, io); - s->sectors += k.k->size; - return 0; + return ret; } /** @@ -113,8 +109,12 @@ static s64 read_tiering(struct cache_set *c, struct cache_group *tier) struct tiering_state s; struct btree_iter iter; struct bkey_s_c k; + unsigned nr_devices = READ_ONCE(tier->nr_devices); int ret; + if (!nr_devices) + return 0; + trace_bcache_tiering_start(c); memset(&s, 0, sizeof(s)); @@ -122,11 +122,11 @@ static s64 read_tiering(struct cache_set *c, struct cache_group *tier) s.tier_idx = tier - c->cache_tiers; s.stripe_size = 2048; /* 1 mb for now */ - bch_moving_context_init(&ctxt, &c->tiering_pd.rate, - MOVING_PURPOSE_TIERING); + bch_move_ctxt_init(&ctxt, &c->tiering_pd.rate, + nr_devices * SECTORS_IN_FLIGHT_PER_DEVICE); bch_btree_iter_init(&iter, c, BTREE_ID_EXTENTS, POS_MIN); - while (!bch_moving_context_wait(&ctxt) && + while (!bch_move_ctxt_wait(&ctxt) && (k = bch_btree_iter_peek(&iter)).k) { if (!tiering_pred(c, &s, k)) goto next; @@ -135,29 +135,12 @@ static s64 read_tiering(struct cache_set *c, struct cache_group *tier) if (!s.ca) break; - if (bch_queue_full(&s.ca->tiering_queue)) { - bch_btree_iter_unlock(&iter); - - if (s.ca->tiering_queue.rotational) - bch_queue_run(&s.ca->tiering_queue, &ctxt); - else - wait_event(s.ca->tiering_queue.wait, - !bch_queue_full(&s.ca->tiering_queue)); - continue; - } - ret = issue_tiering_move(c, &s, &ctxt, k); if (ret) { bch_btree_iter_unlock(&iter); - /* memory allocation failure, wait for IOs to finish */ - - /* - * XXX: this only waits for IOs issued to this - * particular device, but there may not be any outstanding - * to this device - */ - bch_queue_run(&s.ca->tiering_queue, &ctxt); + /* memory allocation failure, wait for some IO to finish */ + bch_move_ctxt_wait_for_io(&ctxt); continue; } next: @@ -171,7 +154,7 @@ next: bch_btree_iter_unlock(&iter); tier_put_device(&s); - closure_sync(&ctxt.cl); + bch_move_ctxt_exit(&ctxt); trace_bcache_tiering_end(c, ctxt.sectors_moved, ctxt.keys_moved); return ctxt.sectors_moved; @@ -231,28 +214,11 @@ static int bch_tiering_thread(void *arg) return 0; } -#define TIERING_NR 64 -#define TIERING_READ_NR 8 -#define TIERING_WRITE_NR 32 - void 
bch_tiering_init_cache_set(struct cache_set *c) { bch_pd_controller_init(&c->tiering_pd); } -int bch_tiering_init_cache(struct cache *ca) -{ - ca->tiering_stripe_size = ca->mi.bucket_size * 2; - - return bch_queue_init(&ca->tiering_queue, - ca->set, - TIERING_NR, - TIERING_READ_NR, - TIERING_WRITE_NR, - false, - "bch_tier_write"); -} - int bch_tiering_read_start(struct cache_set *c) { struct task_struct *t; @@ -267,11 +233,6 @@ int bch_tiering_read_start(struct cache_set *c) return 0; } -void bch_tiering_write_destroy(struct cache *ca) -{ - bch_queue_destroy(&ca->tiering_queue); -} - void bch_tiering_read_stop(struct cache_set *c) { if (!IS_ERR_OR_NULL(c->tiering_read)) { diff --git a/drivers/md/bcache/tier.h b/drivers/md/bcache/tier.h index 94923c2e3d7f..89c2bffde957 100644 --- a/drivers/md/bcache/tier.h +++ b/drivers/md/bcache/tier.h @@ -2,9 +2,7 @@ #define _BCACHE_TIER_H void bch_tiering_init_cache_set(struct cache_set *); -int bch_tiering_init_cache(struct cache *); int bch_tiering_read_start(struct cache_set *); -void bch_tiering_write_destroy(struct cache *); void bch_tiering_read_stop(struct cache_set *); #endif diff --git a/include/trace/events/bcache.h b/include/trace/events/bcache.h index e0476d5bfb99..95eb0ef0236a 100644 --- a/include/trace/events/bcache.h +++ b/include/trace/events/bcache.h @@ -1030,58 +1030,48 @@ TRACE_EVENT(bcache_keyscan, /* Moving IO */ DECLARE_EVENT_CLASS(moving_io, - TP_PROTO(struct moving_queue *q, struct bkey *k), - TP_ARGS(q, k), + TP_PROTO(struct bkey *k), + TP_ARGS(k), TP_STRUCT__entry( - __field(void *, q ) __field(__u32, inode ) __field(__u64, offset ) __field(__u32, sectors ) - __field(unsigned, count ) - __field(unsigned, read_count ) - __field(unsigned, write_count ) ), TP_fast_assign( - __entry->q = q; __entry->inode = k->p.inode; __entry->offset = k->p.offset; __entry->sectors = k->size; - __entry->count = atomic_read(&q->count); - __entry->read_count = atomic_read(&q->read_count); - __entry->write_count = atomic_read(&q->write_count); ), - TP_printk("%p %u:%llu sectors %u queue %u reads %u writes %u", - __entry->q, __entry->inode, __entry->offset, - __entry->sectors, __entry->count, - __entry->read_count, __entry->write_count) + TP_printk("%u:%llu sectors %u", + __entry->inode, __entry->offset, __entry->sectors) ); DEFINE_EVENT(moving_io, bcache_move_read, - TP_PROTO(struct moving_queue *q, struct bkey *k), - TP_ARGS(q, k) + TP_PROTO(struct bkey *k), + TP_ARGS(k) ); DEFINE_EVENT(moving_io, bcache_move_read_done, - TP_PROTO(struct moving_queue *q, struct bkey *k), - TP_ARGS(q, k) + TP_PROTO(struct bkey *k), + TP_ARGS(k) ); DEFINE_EVENT(moving_io, bcache_move_write, - TP_PROTO(struct moving_queue *q, struct bkey *k), - TP_ARGS(q, k) + TP_PROTO(struct bkey *k), + TP_ARGS(k) ); DEFINE_EVENT(moving_io, bcache_move_write_done, - TP_PROTO(struct moving_queue *q, struct bkey *k), - TP_ARGS(q, k) + TP_PROTO(struct bkey *k), + TP_ARGS(k) ); DEFINE_EVENT(moving_io, bcache_copy_collision, - TP_PROTO(struct moving_queue *q, struct bkey *k), - TP_ARGS(q, k) + TP_PROTO(struct bkey *k), + TP_ARGS(k) ); /* Copy GC */ |
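
For reference, the ordering/locality guarantee mentioned in the commit message reduces, in the new move.c above, to a FIFO plus one in-flight counter. The fragments below are condensed from those hunks (closure, tracepoint and bio plumbing elided) and are not standalone code.

```c
/* Condensed from move.c above; not standalone. */

/* bch_data_move(): account the IO and queue it in submission order */
atomic_add(k.k->size, &ctxt->sectors_in_flight);
list_add_tail(&io->list, &ctxt->reads);		/* FIFO preserves key order */

/* read_moving_endio(): mark the read done; a write may now be issuable */
io->read_completed = true;
if (next_pending_write(ctxt))
	wake_up(&ctxt->wait);

/* do_pending_writes(): only ever issue the write at the head of the FIFO */
while ((io = next_pending_write(ctxt))) {
	list_del(&io->list);
	write_moving(io);
}

/* moving_io_destructor(): release the in-flight budget on completion */
atomic_sub(io->write.key.k.size, &ctxt->sectors_in_flight);
wake_up(&ctxt->wait);
```

Because `ctxt->reads` is only drained from the head, writes are issued in the same order their reads were submitted, which is what the old rb-tree and pending/write_pending lists were providing at much greater cost.
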