author    | Kent Overstreet <kent.overstreet@gmail.com> | 2016-10-04 06:16:50 -0800
committer | Kent Overstreet <kent.overstreet@gmail.com> | 2017-01-18 21:40:33 -0900
commit    | 40ff276e09f62f09fb892cc183766b88376783bb (patch)
tree      | 2e0d68f8223bcc0387aba1a92d6357740a0c0827
parent    | b9aadd59175a6475eb2e8b5e21bd38bb9454b099 (diff)
bcache: scan keylists get to die in a fire
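This patch removes the scan_keylist machinery: moving GC, tiering, and device data migration now walk the extents btree directly with a btree iterator, the moving_queue counters become atomics paired with a waitqueue so submitters block until the queue drains (replacing bch_moving_wait()/bch_moving_notify()), and btree GC no longer has to mark keys parked on scan keylists. Below is a minimal sketch of the resulting loop shape, condensed from the bch_move_data_off_device() hunk in this diff; pred() and issue_move() are hypothetical stand-ins for the per-caller helpers (migrate_data_pred()/issue_migration_move(), moving_pred(), tiering_pred()), and the in-tree declarations are assumed.

```c
/*
 * Sketch of the iteration pattern this patch switches to: instead of
 * refilling and draining a scan_keylist, walk the extents btree directly
 * and throttle on the moving_queue's waitqueue.  Simplified from
 * bch_move_data_off_device(); issue_move() and pred() are stand-ins for
 * the per-caller helpers.
 */
static int move_matching_extents(struct cache_set *c, struct moving_queue *q,
				 struct moving_context *ctxt,
				 bool (*pred)(struct bkey_s_c))
{
	struct btree_iter iter;
	struct bkey_s_c k;
	int ret = 0;

	bch_btree_iter_init(&iter, c, BTREE_ID_EXTENTS, POS_MIN);

	while ((k = bch_btree_iter_peek(&iter)).k) {
		if (!pred(k))
			goto next;

		if (bch_queue_full(q)) {
			/* Drop btree locks before blocking on the queue */
			bch_btree_iter_unlock(&iter);

			if (q->rotational)
				/* issue the sorted reads we've queued so far */
				bch_queue_run(q, ctxt);
			else
				wait_event(q->wait, !bch_queue_full(q));
			continue;
		}

		/* hypothetical stand-in for issue_migration_move() etc. */
		ret = issue_move(q, ctxt, k);
		if (ret)
			break;
next:
		bch_btree_iter_advance_pos(&iter);
		bch_btree_iter_cond_resched(&iter);
	}

	bch_btree_iter_unlock(&iter);
	bch_queue_run(q, ctxt);
	return ret;
}
```

The error handling in the real hunks is more involved (retrying on -ENOMEM after draining the queue, multiple passes over the keyspace), but the structure above is the common shape shared by migration, moving GC, and tiering after this change.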
-rw-r--r-- | drivers/md/bcache/alloc_types.h   |   2
-rw-r--r-- | drivers/md/bcache/bcache.h        |   4
-rw-r--r-- | drivers/md/bcache/btree_gc.c      |  18
-rw-r--r-- | drivers/md/bcache/keylist.c       | 195
-rw-r--r-- | drivers/md/bcache/keylist.h       |  45
-rw-r--r-- | drivers/md/bcache/keylist_types.h |  45
-rw-r--r-- | drivers/md/bcache/migrate.c       | 250
-rw-r--r-- | drivers/md/bcache/move.c          | 176
-rw-r--r-- | drivers/md/bcache/move.h          |  40
-rw-r--r-- | drivers/md/bcache/move_types.h    |   9
-rw-r--r-- | drivers/md/bcache/movinggc.c      | 104
-rw-r--r-- | drivers/md/bcache/super.c         |   8
-rw-r--r-- | drivers/md/bcache/tier.c          | 392
-rw-r--r-- | include/trace/events/bcache.h     |   6
14 files changed, 328 insertions, 966 deletions
diff --git a/drivers/md/bcache/alloc_types.h b/drivers/md/bcache/alloc_types.h index 0bfafbf8c91f..aa3c81268817 100644 --- a/drivers/md/bcache/alloc_types.h +++ b/drivers/md/bcache/alloc_types.h @@ -55,7 +55,7 @@ struct cache_group { unsigned cur_device; struct { u64 weight; - struct cache __rcu *dev; + struct cache *dev; } d[MAX_CACHES_PER_SET]; }; diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index 1507fe4d5047..cbb2dca3257a 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -686,10 +686,6 @@ struct cache_set { struct task_struct *gc_thread; atomic_t kick_gc; - /* This is a list of scan_keylists for btree GC to scan */ - struct list_head gc_scan_keylists; - struct mutex gc_scan_keylist_lock; - /* * Tracks GC's progress - everything in the range [ZERO_KEY..gc_cur_pos] * has been marked by GC. diff --git a/drivers/md/bcache/btree_gc.c b/drivers/md/bcache/btree_gc.c index 4db9e54b3751..8bd6070cfee0 100644 --- a/drivers/md/bcache/btree_gc.c +++ b/drivers/md/bcache/btree_gc.c @@ -308,23 +308,6 @@ static void bch_mark_pending_btree_node_frees(struct cache_set *c) mutex_unlock(&c->btree_interior_update_lock); } -static void bch_mark_scan_keylists(struct cache_set *c) -{ - struct scan_keylist *kl; - - mutex_lock(&c->gc_scan_keylist_lock); - - /* What the goddamn fuck? */ - list_for_each_entry(kl, &c->gc_scan_keylists, mark_list) { - if (kl->owner == NULL) - bch_keylist_recalc_oldest_gens(c, kl); - else - bch_queue_recalc_oldest_gens(c, kl->owner); - } - - mutex_unlock(&c->gc_scan_keylist_lock); -} - /** * bch_gc - recompute bucket marks and oldest_gen, rewrite btree nodes */ @@ -426,7 +409,6 @@ void bch_gc(struct cache_set *c) bch_mark_metadata(c); bch_mark_pending_btree_node_frees(c); bch_writeback_recalc_oldest_gens(c); - bch_mark_scan_keylists(c); for_each_cache(ca, c, i) atomic_long_set(&ca->saturated_count, 0); diff --git a/drivers/md/bcache/keylist.c b/drivers/md/bcache/keylist.c index 4f9d1865abdc..b7af04e5adfa 100644 --- a/drivers/md/bcache/keylist.c +++ b/drivers/md/bcache/keylist.c @@ -118,198 +118,3 @@ void bch_keylist_add_in_order(struct keylist *l, struct bkey_i *insert) BUG_ON(l->top_p > l->end_keys_p); bkey_copy(where, insert); } - -/* Scan keylists simple utilities */ - -void bch_scan_keylist_init(struct scan_keylist *kl, - struct cache_set *c, - unsigned max_size) - -{ - kl->c = c; - kl->owner = NULL; - - mutex_init(&kl->lock); - kl->max_size = max_size; - bch_keylist_init(&kl->list, NULL, 0); - - /* - * Order of initialization is tricky, and this makes sure that - * we have a valid cache set in case the order of - * initialization chages and breaks things. - */ - BUG_ON(c == NULL); - mutex_lock(&c->gc_scan_keylist_lock); - list_add_tail(&kl->mark_list, &c->gc_scan_keylists); - mutex_unlock(&c->gc_scan_keylist_lock); -} - -void bch_scan_keylist_destroy(struct scan_keylist *kl) -{ - if (kl->c) { - mutex_lock(&kl->c->gc_scan_keylist_lock); - list_del(&kl->mark_list); - mutex_unlock(&kl->c->gc_scan_keylist_lock); - } - - mutex_lock(&kl->lock); - bch_keylist_free(&kl->list); - mutex_unlock(&kl->lock); -} - -void bch_scan_keylist_reset(struct scan_keylist *kl) -{ - mutex_lock(&kl->lock); - kl->list.bot_p = kl->list.top_p = kl->list.start_keys_p; - mutex_unlock(&kl->lock); -} - -/* - * This should only be called from sysfs, and holding a lock that prevents - * re-entrancy. 
- */ -void bch_scan_keylist_resize(struct scan_keylist *kl, - unsigned max_size) -{ - mutex_lock(&kl->lock); - kl->max_size = max_size; /* May be smaller than current size */ - mutex_unlock(&kl->lock); -} - -/** - * bch_keylist_recalc_oldest_gens - update oldest_gen pointers from keylist keys - * - * This prevents us from wrapping around gens for a bucket only referenced from - * the tiering or moving GC keylists. We don't actually care that the data in - * those buckets is marked live, only that we don't wrap the gens. - * - * Note: This interlocks with insertions, but not all dequeues interlock. - * The particular case in which dequeues don't interlock is when a - * scan list used by the copy offload ioctls is used as a plain - * keylist for btree insertion. - * The btree insertion code doesn't go through - * bch_scan_keylist_dequeue below, and instead uses plain - * bch_keylist_dequeue. The other pointers (top, start, end) are - * unchanged in this case. - * A little care with the bottomp pointer suffices in this case. - * Of course, we may end up marking stuff that we don't need to mark, - * but was recently valid and we have likely just inserted in the tree - * anyway. - */ -void bch_keylist_recalc_oldest_gens(struct cache_set *c, - struct scan_keylist *kl) -{ - struct bkey_i *k; - - mutex_lock(&kl->lock); - - for_each_keylist_key(&kl->list, k) - bch_btree_key_recalc_oldest_gen(c, bkey_i_to_s_c(k)); - - mutex_unlock(&kl->lock); -} - -int bch_scan_keylist_add(struct scan_keylist *kl, struct bkey_s_c k) -{ - int ret; - - mutex_lock(&kl->lock); - ret = bch_keylist_realloc_max(&kl->list, - k.k->u64s, - kl->max_size); - - if (!ret) { - bkey_reassemble(kl->list.top, k); - bch_keylist_enqueue(&kl->list); - atomic64_add(k.k->size, &kl->sectors); - } - mutex_unlock(&kl->lock); - - return ret; -} - -/* Actual scanning functionality of scan_keylists */ - -static void bch_refill_scan_keylist(struct cache_set *c, - struct scan_keylist *kl, - struct bpos *last_scanned, - struct bpos end, - scan_keylist_pred_fn *pred) -{ - struct bpos start = *last_scanned; - struct btree_iter iter; - struct bkey_s_c k; - unsigned nr_found = 0; - - for_each_btree_key(&iter, c, BTREE_ID_EXTENTS, *last_scanned, k) { - if (bkey_cmp(k.k->p, end) >= 0) { - *last_scanned = k.k->p; - goto done; - } - - if (pred(kl, k)) { - BKEY_PADDED(k) tmp; - - bkey_reassemble(&tmp.k, k); - bch_cut_front(*last_scanned, &tmp.k); - - if (bch_scan_keylist_add(kl, bkey_i_to_s_c(&tmp.k))) - goto done; - - nr_found++; - } - - *last_scanned = k.k->p; - bch_btree_iter_cond_resched(&iter); - } - - /* If we end up here, it means: - * - the map_fn didn't fill up the keybuf - * - the map_fn didn't see the end key - * - there were no more keys to map over - * Therefore, we are at the end of the key space */ - *last_scanned = POS_MAX; -done: - bch_btree_iter_unlock(&iter); - - trace_bcache_keyscan(nr_found, - start.inode, start.offset, - last_scanned->inode, - last_scanned->offset); -} - -struct bkey_i *bch_scan_keylist_next(struct scan_keylist *kl) -{ - if (bch_keylist_empty(&kl->list)) - return NULL; - - return bch_keylist_front(&kl->list); -} - -struct bkey_i *bch_scan_keylist_next_rescan(struct cache_set *c, - struct scan_keylist *kl, - struct bpos *last_scanned, - struct bpos end, - scan_keylist_pred_fn *pred) -{ - if (bch_keylist_empty(&kl->list)) { - if (bkey_cmp(*last_scanned, end) >= 0) - return NULL; - - bch_refill_scan_keylist(c, kl, last_scanned, end, pred); - } - - return bch_scan_keylist_next(kl); -} - -void 
bch_scan_keylist_dequeue(struct scan_keylist *kl) -{ - u64 sectors; - - mutex_lock(&kl->lock); - sectors = kl->list.bot->k.size; - bch_keylist_dequeue(&kl->list); - mutex_unlock(&kl->lock); - - BUG_ON(atomic64_sub_return(sectors, &kl->sectors) < 0); -} diff --git a/drivers/md/bcache/keylist.h b/drivers/md/bcache/keylist.h index 028552757527..990ba72ee60c 100644 --- a/drivers/md/bcache/keylist.h +++ b/drivers/md/bcache/keylist.h @@ -116,49 +116,4 @@ void bch_keylist_add_in_order(struct keylist *, struct bkey_i *); int bch_keylist_realloc(struct keylist *, unsigned); int bch_keylist_realloc_max(struct keylist *, unsigned, unsigned); -void bch_scan_keylist_init(struct scan_keylist *kl, - struct cache_set *c, - unsigned max_size); - -void bch_scan_keylist_reset(struct scan_keylist *kl); - -/* The keylist is dynamically adjusted. This just clamps the maxima */ - -static inline unsigned bch_scan_keylist_size(struct scan_keylist *kl) -{ - return kl->max_size; -} - -static inline u64 bch_scan_keylist_sectors(struct scan_keylist *kl) -{ - return atomic64_read(&kl->sectors); -} - -void bch_scan_keylist_resize(struct scan_keylist *kl, - unsigned max_size); - -void bch_scan_keylist_destroy(struct scan_keylist *kl); - -/* - * IMPORTANT: The caller of bch_scan_keylist_next or - * bch_scan_keylist_next_rescan needs to copy any - * non-null return value before calling either again! - * These functions return a pointer into the internal structure. - * Furthermore, they need to call bch_scan_keylist_advance after - * copying the structure. - */ - -struct bkey_i *bch_scan_keylist_next(struct scan_keylist *); - -struct bkey_i *bch_scan_keylist_next_rescan(struct cache_set *c, - struct scan_keylist *kl, - struct bpos *last_scanned, - struct bpos end, - scan_keylist_pred_fn *pred); - -int bch_scan_keylist_add(struct scan_keylist *, struct bkey_s_c); -void bch_scan_keylist_dequeue(struct scan_keylist *); - -void bch_keylist_recalc_oldest_gens(struct cache_set *, struct scan_keylist *); - #endif /* _BCACHE_KEYLIST_H */ diff --git a/drivers/md/bcache/keylist_types.h b/drivers/md/bcache/keylist_types.h index 569cdc2480e2..156fbe0745fd 100644 --- a/drivers/md/bcache/keylist_types.h +++ b/drivers/md/bcache/keylist_types.h @@ -48,49 +48,4 @@ struct keylist { bool has_buf; }; -/* - * scan_keylists are conceptually similar to keybufs, but they don't - * have an internal RB tree. - * keybufs should be used when read or write operations need to - * examine keys in flight, as for writeback. - * But for moving operations (moving gc, tiering, moving data off - * devices), read and writes don't need to look at all, so we don't - * need the RB tree and use scan_keylists instead. - * - * Note that unlike keybufs, they don't contain a semaphore to limit - * bios. That must be done externally, if necessary. - */ - -#define DFLT_SCAN_KEYLIST_MAX_SIZE 512 - -struct scan_keylist { - struct list_head mark_list; /* For GC marking */ - - struct cache_set *c; /* For destroying */ - - /* - * Only one thread is allowed to mutate the keylist. Other threads can - * read it. The mutex has to be taken by the mutator thread when - * mutating the keylist, and by other threads when reading, but not by - * the mutator thread when reading. - */ - struct mutex lock; - /* - * Maximum size, in u64s. The keylist will not grow beyond this size. - */ - unsigned max_size; - /* - * Number of sectors in keys currently on the keylist. - */ - atomic64_t sectors; - /* - * The underlying keylist. 
- */ - struct keylist list; - - struct moving_queue *owner; -}; - -typedef bool (scan_keylist_pred_fn)(struct scan_keylist *, struct bkey_s_c); - #endif /* _BCACHE_KEYLIST_TYPES_H */ diff --git a/drivers/md/bcache/migrate.c b/drivers/md/bcache/migrate.c index 1169f8fc28a1..46d05e32b2f6 100644 --- a/drivers/md/bcache/migrate.c +++ b/drivers/md/bcache/migrate.c @@ -12,20 +12,9 @@ #include "migrate.h" #include "move.h" -static bool migrate_data_pred(struct scan_keylist *kl, struct bkey_s_c k) -{ - struct cache *ca = container_of(kl, struct cache, - moving_gc_queue.keys); - - return bkey_extent_is_data(k.k) && - bch_extent_has_device(bkey_s_c_to_extent(k), - ca->sb.nr_this_dev); -} - static int issue_migration_move(struct cache *ca, struct moving_context *ctxt, - struct bkey_s_c k, - u64 *seen_key_count) + struct bkey_s_c k) { struct moving_queue *q = &ca->moving_gc_queue; struct cache_set *c = ca->set; @@ -49,24 +38,10 @@ found: } bch_data_move(q, ctxt, io); - (*seen_key_count)++; - - /* - * IMPORTANT: We must call bch_data_move before we dequeue so - * that the key can always be found in either the pending list - * in the moving queue or in the scan keylist list in the - * moving queue. - * If we reorder, there is a window where a key is not found - * by btree gc marking. - */ - bch_scan_keylist_dequeue(&q->keys); return 0; } -#define MIGRATION_DEBUG 0 - #define MAX_DATA_OFF_ITER 10 -#define PASS_LOW_LIMIT (MIGRATION_DEBUG ? 0 : 2) #define MIGRATE_NR 64 #define MIGRATE_READ_NR 32 #define MIGRATE_WRITE_NR 32 @@ -87,15 +62,14 @@ found: int bch_move_data_off_device(struct cache *ca) { - int ret; - struct bkey_i *k; - unsigned pass; - u64 seen_key_count; - unsigned last_error_count; - unsigned last_error_flags; - struct moving_context context; + struct moving_context ctxt; struct cache_set *c = ca->set; struct moving_queue *queue = &ca->moving_gc_queue; + unsigned pass = 0; + u64 seen_key_count; + int ret = 0; + + BUG_ON(ca->mi.state == CACHE_ACTIVE); /* * This reuses the moving gc queue as it is no longer in use @@ -116,8 +90,8 @@ int bch_move_data_off_device(struct cache *ca) queue_io_resize(queue, MIGRATE_NR, MIGRATE_READ_NR, MIGRATE_WRITE_NR); BUG_ON(queue->wq == NULL); - bch_moving_context_init(&context, NULL, MOVING_PURPOSE_MIGRATION); - context.avoid = ca; + bch_moving_context_init(&ctxt, NULL, MOVING_PURPOSE_MIGRATION); + ctxt.avoid = ca; /* * In theory, only one pass should be necessary as we've @@ -136,142 +110,128 @@ int bch_move_data_off_device(struct cache *ca) * but that can be viewed as a verification pass. 
*/ - seen_key_count = 1; - last_error_count = 0; - last_error_flags = 0; - - for (pass = 0; - (seen_key_count != 0 && (pass < MAX_DATA_OFF_ITER)); - pass++) { - bool again; + do { + struct btree_iter iter; + struct bkey_s_c k; seen_key_count = 0; - atomic_set(&context.error_count, 0); - atomic_set(&context.error_flags, 0); - context.last_scanned = POS_MIN; + atomic_set(&ctxt.error_count, 0); + atomic_set(&ctxt.error_flags, 0); -again: - again = false; + bch_btree_iter_init(&iter, c, BTREE_ID_EXTENTS, POS_MIN); + + while ((k = bch_btree_iter_peek(&iter)).k) { + if (!bkey_extent_is_data(k.k) || + !bch_extent_has_device(bkey_s_c_to_extent(k), + ca->sb.nr_this_dev)) + goto next; - while (1) { if (bch_queue_full(queue)) { - if (queue->rotational) { - again = true; - break; - } else { - bch_moving_wait(&context); - continue; - } + bch_btree_iter_unlock(&iter); + + if (queue->rotational) + bch_queue_run(queue, &ctxt); + else + wait_event(queue->wait, + !bch_queue_full(queue)); + continue; } - k = bch_scan_keylist_next_rescan(c, - &queue->keys, - &context.last_scanned, - POS_MAX, - migrate_data_pred); - if (k == NULL) - break; + ret = issue_migration_move(ca, &ctxt, k); + if (ret == -ENOMEM) { + bch_btree_iter_unlock(&iter); - if (issue_migration_move(ca, &context, bkey_i_to_s_c(k), - &seen_key_count)) { /* - * Memory allocation failed; we will wait for - * all queued moves to finish and continue - * scanning starting from the same key + * memory allocation failure, wait for IOs to + * finish */ - again = true; - break; + bch_queue_run(queue, &ctxt); + continue; } - } + if (ret == -ENOSPC) { + bch_btree_iter_unlock(&iter); + bch_queue_run(queue, &ctxt); + return -ENOSPC; + } + BUG_ON(ret); - bch_queue_run(queue, &context); - if (again) - goto again; + seen_key_count++; +next: + bch_btree_iter_advance_pos(&iter); + bch_btree_iter_cond_resched(&iter); - if ((pass >= PASS_LOW_LIMIT) - && (seen_key_count != (MIGRATION_DEBUG ? ~0ULL : 0))) { - pr_notice("found %llu keys on pass %u.", - seen_key_count, pass); } + ret = bch_btree_iter_unlock(&iter); + bch_queue_run(queue, &ctxt); - last_error_count = atomic_read(&context.error_count); - last_error_flags = atomic_read(&context.error_flags); - - if (last_error_count != 0) { - pr_notice("pass %u: error count = %u, error flags = 0x%x", - pass, last_error_count, last_error_flags); - } - } + if (ret) + return ret; + } while (seen_key_count && pass++ < MAX_DATA_OFF_ITER); - if (seen_key_count != 0 || last_error_count != 0) { + if (seen_key_count) { pr_err("Unable to migrate all data in %d iterations.", MAX_DATA_OFF_ITER); - ret = -EDEADLK; - } else if (MIGRATION_DEBUG) - pr_notice("Migrated all data in %d iterations", pass); + return -1; + } - bch_queue_run(queue, &context); - return ret; + return 0; } /* * This walks the btree, and for any node on the relevant device it moves the * node elsewhere. 
*/ -static int bch_move_btree_off(struct cache *ca, - enum btree_id id, - const char *name) +static int bch_move_btree_off(struct cache *ca, enum btree_id id) { + struct cache_set *c = ca->set; + struct btree_iter iter; struct closure cl; - unsigned pass; + struct btree *b; + int ret; - closure_init_stack(&cl); + BUG_ON(ca->mi.state == CACHE_ACTIVE); - pr_debug("Moving %s btree off device %u", - name, ca->sb.nr_this_dev); + closure_init_stack(&cl); - for (pass = 0; (pass < MAX_DATA_OFF_ITER); pass++) { - struct btree_iter iter; - struct btree *b; - unsigned moved = 0, seen = 0; - int ret; - - for_each_btree_node(&iter, ca->set, id, POS_MIN, 0, b) { - struct bkey_s_c_extent e = - bkey_i_to_s_c_extent(&b->key); - seen++; + for_each_btree_node(&iter, c, id, POS_MIN, 0, b) { + struct bkey_s_c_extent e = bkey_i_to_s_c_extent(&b->key); retry: - if (!bch_extent_has_device(e, ca->sb.nr_this_dev)) - continue; - - if (bch_btree_node_rewrite(&iter, b, &cl)) { - /* - * Drop locks to upgrade locks or wait on - * reserve: after retaking, recheck in case we - * raced. - */ - bch_btree_iter_unlock(&iter); - closure_sync(&cl); - b = bch_btree_iter_peek_node(&iter); - goto retry; - } - - moved++; - iter.locks_want = 0; + if (!bch_extent_has_device(e, ca->sb.nr_this_dev)) + continue; + + ret = bch_btree_node_rewrite(&iter, b, &cl); + if (ret == -EINTR || ret == -ENOSPC) { + /* + * Drop locks to upgrade locks or wait on + * reserve: after retaking, recheck in case we + * raced. + */ + bch_btree_iter_unlock(&iter); + closure_sync(&cl); + b = bch_btree_iter_peek_node(&iter); + goto retry; } - ret = bch_btree_iter_unlock(&iter); - if (ret) - return ret; /* btree IO error */ + if (ret) { + bch_btree_iter_unlock(&iter); + return ret; + } + + iter.locks_want = 0; + } + ret = bch_btree_iter_unlock(&iter); + if (ret) + return ret; /* btree IO error */ - if (!moved) - return 0; + if (IS_ENABLED(CONFIG_BCACHE_DEBUG)) { + for_each_btree_node(&iter, c, id, POS_MIN, 0, b) { + struct bkey_s_c_extent e = bkey_i_to_s_c_extent(&b->key); - pr_debug("%s pass %u: seen %u, moved %u.", - name, pass, seen, moved); + BUG_ON(bch_extent_has_device(e, ca->sb.nr_this_dev)); + } + bch_btree_iter_unlock(&iter); } - /* Failed: */ - return -1; + return 0; } /* @@ -321,25 +281,25 @@ retry: int bch_move_meta_data_off_device(struct cache *ca) { unsigned i; - int ret = 0; /* Success */ + int ret; /* 1st, Move the btree nodes off the device */ - for (i = 0; i < BTREE_ID_NR; i++) - if (bch_move_btree_off(ca, i, bch_btree_id_names[i]) != 0) - return 1; + for (i = 0; i < BTREE_ID_NR; i++) { + ret = bch_move_btree_off(ca, i); + if (ret) + return ret; + } /* There are no prios/gens to move -- they are already in the device. */ /* 2nd. 
Move the journal off the device */ - if (bch_journal_move(ca) != 0) { - pr_err("Unable to move the journal off in %pU.", - ca->set->disk_sb.user_uuid.b); - ret = 1; /* Failure */ - } + ret = bch_journal_move(ca); + if (ret) + return ret; - return ret; + return 0; } /* diff --git a/drivers/md/bcache/move.c b/drivers/md/bcache/move.c index 9c4abbe9576e..f9c2d9e46e84 100644 --- a/drivers/md/bcache/move.c +++ b/drivers/md/bcache/move.c @@ -162,50 +162,14 @@ void bch_moving_context_init(struct moving_context *ctxt, enum moving_purpose purpose) { memset(ctxt, 0, sizeof(*ctxt)); - ctxt->task = current; ctxt->rate = rate; ctxt->purpose = purpose; closure_init_stack(&ctxt->cl); } -/* - * bch_moving_wait() -- wait for a bch_moving_notify() call - * - * To deal with lost wakeups, we make this return immediately if notify - * was already called. - */ -void bch_moving_wait(struct moving_context *ctxt) -{ - while (1) { - set_current_state(TASK_INTERRUPTIBLE); - if (atomic_xchg(&ctxt->pending, 0)) - break; - schedule(); - } - __set_current_state(TASK_RUNNING); -} - -static void bch_moving_notify(struct moving_context *ctxt) -{ - atomic_set(&ctxt->pending, 1); - wake_up_process(ctxt->task); -} - -static bool __bch_queue_reads_pending(struct moving_queue *q) -{ - return (q->read_count > 0 || !RB_EMPTY_ROOT(&q->tree)); -} - static bool bch_queue_reads_pending(struct moving_queue *q) { - unsigned long flags; - bool pending; - - spin_lock_irqsave(&q->lock, flags); - pending = __bch_queue_reads_pending(q); - spin_unlock_irqrestore(&q->lock, flags); - - return pending; + return atomic_read(&q->read_count) || !RB_EMPTY_ROOT(&q->tree); } static void bch_queue_write(struct moving_queue *q) @@ -276,7 +240,6 @@ static void moving_io_destructor(struct closure *cl) { struct moving_io *io = container_of(cl, struct moving_io, cl); struct moving_queue *q = io->q; - struct moving_context *ctxt = io->context; unsigned long flags; bool kick_writes = true; @@ -285,28 +248,29 @@ static void moving_io_destructor(struct closure *cl) spin_lock_irqsave(&q->lock, flags); - BUG_ON(!q->count); - q->count--; - if (io->read_issued) { - BUG_ON(!q->read_count); - q->read_count--; + BUG_ON(!atomic_read(&q->read_count)); + atomic_dec(&q->read_count); } if (io->write_issued) { - BUG_ON(!q->write_count); - q->write_count--; + BUG_ON(!atomic_read(&q->write_count)); + atomic_dec(&q->write_count); trace_bcache_move_write_done(q, &io->write.key.k); } + BUG_ON(!atomic_read(&q->count)); + atomic_dec(&q->count); + wake_up(&q->wait); + list_del_init(&io->list); - if ((q->count == 0) && (q->stop_waitcl != NULL)) { + if (!atomic_read(&q->count) && q->stop_waitcl) { closure_put(q->stop_waitcl); q->stop_waitcl = NULL; } - if (q->rotational && __bch_queue_reads_pending(q)) + if (q->rotational && bch_queue_reads_pending(q)) kick_writes = false; if (list_empty(&q->pending)) @@ -318,8 +282,6 @@ static void moving_io_destructor(struct closure *cl) if (kick_writes) bch_queue_write(q); - - bch_moving_notify(ctxt); } static void moving_io_after_write(struct closure *cl) @@ -336,13 +298,12 @@ static void moving_io_after_write(struct closure *cl) static void write_moving(struct moving_io *io) { bool stopped; - unsigned long flags; struct bch_write_op *op = &io->write.op; - spin_lock_irqsave(&io->q->lock, flags); - BUG_ON(io->q->count == 0); + spin_lock_irq(&io->q->lock); + BUG_ON(!atomic_read(&io->q->count)); stopped = io->q->stopped; - spin_unlock_irqrestore(&io->q->lock, flags); + spin_unlock_irq(&io->q->lock); /* * If the queue has been stopped, prevent the 
write from occurring. @@ -362,17 +323,17 @@ static void bch_queue_write_work(struct work_struct *work) { struct moving_queue *q = container_of(work, struct moving_queue, work); struct moving_io *io; - unsigned long flags; - spin_lock_irqsave(&q->lock, flags); + spin_lock_irq(&q->lock); - if (q->rotational && __bch_queue_reads_pending(q)) { + if (q->rotational && bch_queue_reads_pending(q)) { /* All reads should have finished before writes start */ - spin_unlock_irqrestore(&q->lock, flags); + spin_unlock_irq(&q->lock); return; } - while (!q->stopped && q->write_count < q->max_write_count) { + while (!q->stopped && + atomic_read(&q->write_count) < q->max_write_count) { io = list_first_entry_or_null(&q->pending, struct moving_io, list); /* @@ -388,17 +349,17 @@ static void bch_queue_write_work(struct work_struct *work) break; BUG_ON(io->write_issued); - q->write_count++; + atomic_inc(&q->write_count); io->write_issued = 1; list_del(&io->list); list_add_tail(&io->list, &q->write_pending); trace_bcache_move_write(q, &io->write.key.k); - spin_unlock_irqrestore(&q->lock, flags); + spin_unlock_irq(&q->lock); write_moving(io); - spin_lock_irqsave(&q->lock, flags); + spin_lock_irq(&q->lock); } - spin_unlock_irqrestore(&q->lock, flags); + spin_unlock_irq(&q->lock); } /* @@ -408,7 +369,6 @@ static void bch_queue_write_work(struct work_struct *work) int bch_queue_init(struct moving_queue *q, struct cache_set *c, - unsigned max_size, unsigned max_count, unsigned max_read_count, unsigned max_write_count, @@ -417,7 +377,6 @@ int bch_queue_init(struct moving_queue *q, { INIT_WORK(&q->work, bch_queue_write_work); - q->keys.owner = q; q->max_count = max_count; q->max_read_count = max_read_count; q->max_write_count = max_write_count; @@ -427,6 +386,7 @@ int bch_queue_init(struct moving_queue *q, INIT_LIST_HEAD(&q->pending); INIT_LIST_HEAD(&q->write_pending); q->tree = RB_ROOT; + init_waitqueue_head(&q->wait); q->wq = alloc_workqueue(name, WQ_UNBOUND|WQ_FREEZABLE|WQ_MEM_RECLAIM, 1); @@ -443,8 +403,6 @@ void bch_queue_start(struct moving_queue *q) spin_lock_irqsave(&q->lock, flags); q->stopped = false; spin_unlock_irqrestore(&q->lock, flags); - - bch_scan_keylist_reset(&q->keys); } void queue_io_resize(struct moving_queue *q, @@ -466,8 +424,6 @@ void bch_queue_destroy(struct moving_queue *q) if (q->wq) destroy_workqueue(q->wq); q->wq = NULL; - - bch_scan_keylist_destroy(&q->keys); } static void bch_queue_cancel_writes(struct moving_queue *q) @@ -489,8 +445,11 @@ static void bch_queue_cancel_writes(struct moving_queue *q) list_del_init(&io->list); read_issued = io->read_issued; read_completed = io->read_completed; - if (!read_issued && !read_completed && q->rotational) + if (!read_issued && !read_completed && q->rotational) { rb_erase(&io->node, &q->tree); + wake_up(&q->wait); + } + spin_unlock_irqrestore(&q->lock, flags); if (read_completed) closure_return_with_destructor_noreturn(&io->cl, @@ -505,22 +464,21 @@ static void bch_queue_cancel_writes(struct moving_queue *q) void bch_queue_stop(struct moving_queue *q) { - unsigned long flags; struct closure waitcl; closure_init_stack(&waitcl); - spin_lock_irqsave(&q->lock, flags); + spin_lock_irq(&q->lock); if (q->stopped) BUG_ON(q->stop_waitcl != NULL); else { q->stopped = true; - if (q->count != 0) { + if (atomic_read(&q->count)) { q->stop_waitcl = &waitcl; closure_get(&waitcl); } } - spin_unlock_irqrestore(&q->lock, flags); + spin_unlock_irq(&q->lock); bch_queue_cancel_writes(q); @@ -549,9 +507,6 @@ void bch_queue_recalc_oldest_gens(struct cache_set *c, struct 
moving_queue *q) { unsigned long flags; - /* 1st, mark the keylist keys */ - bch_keylist_recalc_oldest_gens(c, &q->keys); - /* 2nd, mark the keys in the I/Os */ spin_lock_irqsave(&q->lock, flags); @@ -566,7 +521,6 @@ static void read_moving_endio(struct bio *bio) struct closure *cl = bio->bi_private; struct moving_io *io = container_of(cl, struct moving_io, cl); struct moving_queue *q = io->q; - struct moving_context *ctxt = io->context; bool stopped; unsigned long flags; @@ -586,8 +540,11 @@ static void read_moving_endio(struct bio *bio) BUG_ON(io->read_completed); io->read_issued = 0; io->read_completed = 1; - BUG_ON(!q->read_count); - q->read_count--; + + BUG_ON(!atomic_read(&q->read_count)); + atomic_dec(&q->read_count); + wake_up(&q->wait); + stopped = q->stopped; if (stopped) list_del_init(&io->list); @@ -598,8 +555,6 @@ static void read_moving_endio(struct bio *bio) moving_io_destructor); else if (!q->rotational) bch_queue_write(q); - - bch_moving_notify(ctxt); } static void __bch_data_move(struct closure *cl) @@ -628,27 +583,6 @@ static void __bch_data_move(struct closure *cl) &pick, BCH_READ_IS_LAST); } -/* - * bch_queue_full() - return if more reads can be queued with bch_data_move(). - * - * In rotational mode, always returns false if no reads are in flight (see - * how max_count is initialized in bch_queue_init()). - */ -bool bch_queue_full(struct moving_queue *q) -{ - unsigned long flags; - bool full; - - spin_lock_irqsave(&q->lock, flags); - BUG_ON(q->count > q->max_count); - BUG_ON(q->read_count > q->max_read_count); - full = (q->count == q->max_count || - q->read_count == q->max_read_count); - spin_unlock_irqrestore(&q->lock, flags); - - return full; -} - static int moving_io_cmp(struct moving_io *io1, struct moving_io *io2) { if (io1->sort_key < io2->sort_key) @@ -671,20 +605,19 @@ void bch_data_move(struct moving_queue *q, struct moving_context *ctxt, struct moving_io *io) { - unsigned long flags; bool stopped = false; BUG_ON(q->wq == NULL); io->q = q; io->context = ctxt; - spin_lock_irqsave(&q->lock, flags); + spin_lock_irq(&q->lock); if (q->stopped) { stopped = true; goto out; } - q->count++; + atomic_inc(&q->count); list_add_tail(&io->list, &q->pending); trace_bcache_move_read(q, &io->write.key.k); @@ -693,11 +626,11 @@ void bch_data_move(struct moving_queue *q, else { BUG_ON(io->read_issued); io->read_issued = 1; - q->read_count++; + atomic_inc(&q->read_count); } out: - spin_unlock_irqrestore(&q->lock, flags); + spin_unlock_irq(&q->lock); if (stopped) moving_io_free(io); @@ -710,26 +643,27 @@ out: static bool bch_queue_read(struct moving_queue *q, struct moving_context *ctxt) { - unsigned long flags; struct rb_node *node; struct moving_io *io; bool stopped; BUG_ON(!q->rotational); - spin_lock_irqsave(&q->lock, flags); + spin_lock_irq(&q->lock); node = rb_first(&q->tree); if (!node) { - spin_unlock_irqrestore(&q->lock, flags); + spin_unlock_irq(&q->lock); return false; } io = rb_entry(node, struct moving_io, node); rb_erase(node, &q->tree); + wake_up(&q->wait); + io->read_issued = 1; - q->read_count++; + atomic_inc(&q->read_count); stopped = q->stopped; - spin_unlock_irqrestore(&q->lock, flags); + spin_unlock_irq(&q->lock); if (stopped) { moving_io_destructor(&io->cl); @@ -742,31 +676,19 @@ static bool bch_queue_read(struct moving_queue *q, void bch_queue_run(struct moving_queue *q, struct moving_context *ctxt) { - unsigned long flags; - bool full; - if (!q->rotational) goto sync; while (!bch_moving_context_wait(ctxt)) { - spin_lock_irqsave(&q->lock, flags); - full = 
(q->read_count == q->max_read_count); - spin_unlock_irqrestore(&q->lock, flags); - - if (full) { - bch_moving_wait(ctxt); - continue; - } + wait_event(q->wait, + atomic_read(&q->read_count) < q->max_read_count); if (!bch_queue_read(q, ctxt)) break; } - while (bch_queue_reads_pending(q)) - bch_moving_wait(ctxt); - + wait_event(q->wait, !bch_queue_reads_pending(q)); bch_queue_write(q); - sync: closure_sync(&ctxt->cl); } diff --git a/drivers/md/bcache/move.h b/drivers/md/bcache/move.h index 0f8f787a5168..8f30f078f3f5 100644 --- a/drivers/md/bcache/move.h +++ b/drivers/md/bcache/move.h @@ -3,6 +3,7 @@ #include "buckets.h" #include "io_types.h" +#include "move_types.h" enum moving_purpose { MOVING_PURPOSE_UNKNOWN, /* Un-init */ @@ -27,17 +28,11 @@ struct moving_context { atomic_t error_count; atomic_t error_flags; - /* If != 0, @task is waiting for a read or write to complete */ - atomic_t pending; - struct task_struct *task; /* Key and sector moves issued, updated from submission context */ u64 keys_moved; u64 sectors_moved; - /* Last key scanned */ - struct bpos last_scanned; - /* Rate-limiter counting submitted reads */ struct bch_ratelimit *rate; @@ -59,8 +54,6 @@ static inline int bch_moving_context_wait(struct moving_context *ctxt) return bch_ratelimit_wait_freezable_stoppable(ctxt->rate, &ctxt->cl); } -void bch_moving_wait(struct moving_context *); - struct migrate_write { BKEY_PADDED(key); bool promote; @@ -124,14 +117,28 @@ typedef struct moving_io *(moving_queue_fn)(struct moving_queue *, int bch_queue_init(struct moving_queue *, struct cache_set *, - unsigned max_keys, unsigned max_ios, unsigned max_reads, unsigned max_writes, bool rotational, const char *); void bch_queue_start(struct moving_queue *); -bool bch_queue_full(struct moving_queue *); + +/* + * bch_queue_full() - return if more reads can be queued with bch_data_move(). + * + * In rotational mode, always returns false if no reads are in flight (see + * how max_count is initialized in bch_queue_init()). 
+ */ +static inline bool bch_queue_full(struct moving_queue *q) +{ + EBUG_ON(atomic_read(&q->count) > q->max_count); + EBUG_ON(atomic_read(&q->read_count) > q->max_read_count); + + return atomic_read(&q->count) == q->max_count || + atomic_read(&q->read_count) == q->max_read_count; +} + void bch_data_move(struct moving_queue *, struct moving_context *, struct moving_io *); @@ -149,21 +156,18 @@ void bch_queue_run(struct moving_queue *, struct moving_context *); #define sysfs_queue_attribute(name) \ rw_attribute(name##_max_count); \ rw_attribute(name##_max_read_count); \ - rw_attribute(name##_max_write_count); \ - rw_attribute(name##_max_keys) + rw_attribute(name##_max_write_count); #define sysfs_queue_files(name) \ &sysfs_##name##_max_count, \ &sysfs_##name##_max_read_count, \ - &sysfs_##name##_max_write_count, \ - &sysfs_##name##_max_keys + &sysfs_##name##_max_write_count #define sysfs_queue_show(name, var) \ do { \ sysfs_hprint(name##_max_count, (var)->max_count); \ sysfs_print(name##_max_read_count, (var)->max_read_count); \ sysfs_print(name##_max_write_count, (var)->max_write_count);\ - sysfs_print(name##_max_keys, bch_scan_keylist_size(&(var)->keys));\ } while (0) #define sysfs_queue_store(name, var) \ @@ -171,12 +175,6 @@ do { \ sysfs_strtoul(name##_max_count, (var)->max_count); \ sysfs_strtoul(name##_max_read_count, (var)->max_read_count); \ sysfs_strtoul(name##_max_write_count, (var)->max_write_count); \ - if (attr == &sysfs_##name##_max_keys) { \ - int v = strtoi_h_or_return(buf); \ - \ - v = clamp(v, 2, KEYLIST_MAX); \ - bch_scan_keylist_resize(&(var)->keys, v); \ - } \ } while (0) #endif /* _BCACHE_MOVE_H */ diff --git a/drivers/md/bcache/move_types.h b/drivers/md/bcache/move_types.h index d5e1a4a968fa..25f6d2fab592 100644 --- a/drivers/md/bcache/move_types.h +++ b/drivers/md/bcache/move_types.h @@ -8,7 +8,6 @@ struct moving_queue { struct work_struct work; - struct scan_keylist keys; struct workqueue_struct *wq; /* Configuration */ @@ -57,9 +56,11 @@ struct moving_queue { */ struct list_head write_pending; - unsigned count; - unsigned read_count; - unsigned write_count; + atomic_t count; + atomic_t read_count; + atomic_t write_count; + + wait_queue_head_t wait; }; #endif /* _BCACHE_MOVE_TYPES_H */ diff --git a/drivers/md/bcache/movinggc.c b/drivers/md/bcache/movinggc.c index a18340031368..76b92c4f1713 100644 --- a/drivers/md/bcache/movinggc.c +++ b/drivers/md/bcache/movinggc.c @@ -5,6 +5,7 @@ */ #include "bcache.h" +#include "btree_iter.h" #include "buckets.h" #include "clock.h" #include "extents.h" @@ -16,13 +17,12 @@ #include <trace/events/bcache.h> #include <linux/freezer.h> #include <linux/kthread.h> +#include <linux/wait.h> /* Moving GC - IO loop */ -static bool moving_pred(struct scan_keylist *kl, struct bkey_s_c k) +static bool moving_pred(struct cache *ca, struct bkey_s_c k) { - struct cache *ca = container_of(kl, struct cache, - moving_gc_queue.keys); struct cache_set *c = ca->set; const struct bch_extent_ptr *ptr; bool ret = false; @@ -41,11 +41,11 @@ static bool moving_pred(struct scan_keylist *kl, struct bkey_s_c k) return ret; } -static int issue_moving_gc_move(struct moving_queue *q, +static int issue_moving_gc_move(struct cache *ca, struct moving_context *ctxt, struct bkey_s_c k) { - struct cache *ca = container_of(q, struct cache, moving_gc_queue); + struct moving_queue *q = &ca->moving_gc_queue; struct cache_set *c = ca->set; const struct bch_extent_ptr *ptr; struct moving_io *io; @@ -57,7 +57,7 @@ static int issue_moving_gc_move(struct moving_queue *q, goto 
found; /* We raced - bucket's been reused */ - goto out; + return 0; found: gen--; BUG_ON(gen > ARRAY_SIZE(ca->gc_buckets)); @@ -70,65 +70,53 @@ found: trace_bcache_gc_copy(k.k); - /* - * IMPORTANT: We must call bch_data_move before we dequeue so - * that the key can always be found in either the pending list - * in the moving queue or in the scan keylist list in the - * moving queue. - * If we reorder, there is a window where a key is not found - * by btree gc marking. - */ bch_data_move(q, ctxt, io); -out: - bch_scan_keylist_dequeue(&q->keys); return 0; } static void read_moving(struct cache *ca, struct moving_context *ctxt) { - struct bkey_i *k; - bool again; + struct cache_set *c = ca->set; + struct btree_iter iter; + struct bkey_s_c k; bch_ratelimit_reset(&ca->moving_gc_pd.rate); + bch_btree_iter_init(&iter, c, BTREE_ID_EXTENTS, POS_MIN); + + while (!bch_moving_context_wait(ctxt) && + (k = bch_btree_iter_peek(&iter)).k) { + if (!moving_pred(ca, k)) + goto next; + + if (bch_queue_full(&ca->moving_gc_queue)) { + bch_btree_iter_unlock(&iter); + + if (ca->moving_gc_queue.rotational) + bch_queue_run(&ca->moving_gc_queue, ctxt); + else + wait_event(ca->moving_gc_queue.wait, + !bch_queue_full(&ca->moving_gc_queue)); + continue; + } + + if (issue_moving_gc_move(ca, ctxt, k)) { + bch_btree_iter_unlock(&iter); - do { - again = false; - - while (!bch_moving_context_wait(ctxt)) { - if (bch_queue_full(&ca->moving_gc_queue)) { - if (ca->moving_gc_queue.rotational) { - again = true; - break; - } else { - bch_moving_wait(ctxt); - continue; - } - } - - k = bch_scan_keylist_next_rescan( - ca->set, - &ca->moving_gc_queue.keys, - &ctxt->last_scanned, - POS_MAX, - moving_pred); - - if (k == NULL) - break; - - if (issue_moving_gc_move(&ca->moving_gc_queue, - ctxt, bkey_i_to_s_c(k))) { - /* - * Memory allocation failed; we will wait for - * all queued moves to finish and continue - * scanning starting from the same key - */ - again = true; - break; - } + /* memory allocation failure, wait for IOs to finish */ + bch_queue_run(&ca->moving_gc_queue, ctxt); + continue; } +next: + bch_btree_iter_advance_pos(&iter); + //bch_btree_iter_cond_resched(&iter); + + /* unlock before calling moving_context_wait() */ + bch_btree_iter_unlock(&iter); + cond_resched(); + } - bch_queue_run(&ca->moving_gc_queue, ctxt); - } while (!kthread_should_stop() && again); + bch_btree_iter_unlock(&iter); + bch_queue_run(&ca->moving_gc_queue, ctxt); } static bool bch_moving_gc(struct cache *ca) @@ -289,7 +277,6 @@ static int bch_moving_gc_thread(void *arg) return 0; } -#define MOVING_GC_KEYS_MAX_SIZE DFLT_SCAN_KEYLIST_MAX_SIZE #define MOVING_GC_NR 64 #define MOVING_GC_READ_NR 32 #define MOVING_GC_WRITE_NR 32 @@ -303,7 +290,6 @@ int bch_moving_init_cache(struct cache *ca) return bch_queue_init(&ca->moving_gc_queue, ca->set, - MOVING_GC_KEYS_MAX_SIZE, MOVING_GC_NR, MOVING_GC_READ_NR, MOVING_GC_WRITE_NR, @@ -343,12 +329,6 @@ void bch_moving_gc_stop(struct cache *ca) if (ca->moving_gc_read) kthread_stop(ca->moving_gc_read); ca->moving_gc_read = NULL; - - /* - * Make sure that it is empty so that gc marking doesn't keep - * marking stale entries from when last used. 
- */ - bch_scan_keylist_reset(&ca->moving_gc_queue.keys); } void bch_moving_gc_destroy(struct cache *ca) diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 6275f708e555..309a720c5a80 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -1074,8 +1074,6 @@ static struct cache_set *bch_cache_set_alloc(struct cache_sb *sb, init_rwsem(&c->gc_lock); mutex_init(&c->trigger_gc_lock); - mutex_init(&c->gc_scan_keylist_lock); - INIT_LIST_HEAD(&c->gc_scan_keylists); #define BCH_TIME_STAT(name, frequency_units, duration_units) \ spin_lock_init(&c->name##_time.lock); @@ -1981,12 +1979,6 @@ static const char *cache_alloc(struct bcache_superblock *sb, ca->tiering_write_point.reserve = RESERVE_NONE; ca->tiering_write_point.group = &ca->self; - /* XXX: scan keylists will die */ - bch_scan_keylist_init(&ca->moving_gc_queue.keys, c, - DFLT_SCAN_KEYLIST_MAX_SIZE); - bch_scan_keylist_init(&ca->tiering_queue.keys, c, - DFLT_SCAN_KEYLIST_MAX_SIZE); - kobject_get(&c->kobj); ca->set = c; diff --git a/drivers/md/bcache/tier.c b/drivers/md/bcache/tier.c index 02ceb586239d..069e68a38a4a 100644 --- a/drivers/md/bcache/tier.c +++ b/drivers/md/bcache/tier.c @@ -15,15 +15,19 @@ #include <linux/kthread.h> #include <trace/events/bcache.h> -/** - * tiering_pred - check if tiering should copy an extent to tier 1 - */ -static bool tiering_pred(struct scan_keylist *kl, struct bkey_s_c k) -{ - struct cache *ca = container_of(kl, struct cache, - tiering_queue.keys); - struct cache_set *c = ca->set; +struct tiering_state { + struct cache_group *tier; + unsigned tier_idx; + unsigned sectors; + unsigned stripe_size; + unsigned dev_idx; + struct cache *ca; +}; +static bool tiering_pred(struct cache_set *c, + struct tiering_state *s, + struct bkey_s_c k) +{ if (bkey_extent_is_data(k.k)) { struct bkey_s_c_extent e = bkey_s_c_to_extent(k); const struct bch_extent_ptr *ptr; @@ -38,7 +42,7 @@ static bool tiering_pred(struct scan_keylist *kl, struct bkey_s_c k) mi = cache_member_info_get(c); extent_for_each_ptr(e, ptr) if (ptr->dev < mi->nr_in_set && - mi->m[ptr->dev].tier) + mi->m[ptr->dev].tier >= s->tier_idx) replicas++; cache_member_info_put(); @@ -48,196 +52,54 @@ static bool tiering_pred(struct scan_keylist *kl, struct bkey_s_c k) return false; } -struct tiering_refill { - struct bpos start; - struct cache *ca; - int cache_iter; - u64 sectors; -}; - -static void refill_done(struct tiering_refill *refill) +static void tier_put_device(struct tiering_state *s) { - if (refill->ca) { - percpu_ref_put(&refill->ca->ref); - refill->ca = NULL; - } + if (s->ca) + percpu_ref_put(&s->ca->ref); + s->ca = NULL; } /** * refill_next - move on to refilling the next cache's tiering keylist */ -static void refill_next(struct cache_set *c, struct tiering_refill *refill) -{ - struct cache_group *tier; - - refill_done(refill); - - rcu_read_lock(); - tier = &c->cache_tiers[1]; - if (tier->nr_devices == 0) - goto out; - - while (1) { - while (refill->cache_iter < tier->nr_devices) { - refill->ca = rcu_dereference( - tier->d[refill->cache_iter].dev); - if (refill->ca != NULL) { - percpu_ref_get(&refill->ca->ref); - goto out; - } - refill->cache_iter++; - } - - /* Reached the end, wrap around */ - refill->cache_iter = 0; - } - -out: - rcu_read_unlock(); -} - -/* - * refill_init - Start refilling a random cache device -- this ensures we - * distribute data sanely even if each tiering pass discovers only a few - * keys to tier - */ -static void refill_init(struct cache_set *c, struct tiering_refill *refill) -{ - struct 
cache_group *tier; - - memset(refill, 0, sizeof(*refill)); - refill->start = POS_MIN; - - rcu_read_lock(); - tier = &c->cache_tiers[1]; - if (tier->nr_devices != 0) - refill->cache_iter = bch_rand_range(tier->nr_devices); - rcu_read_unlock(); - - refill_next(c, refill); -} - -/** - * tiering_keylist_full - we accumulate tiering_stripe_size sectors in a cache - * device's tiering keylist before we move on to the next cache device - */ -static bool tiering_keylist_full(struct tiering_refill *refill) +static void tier_next_device(struct cache_set *c, struct tiering_state *s) { - return (refill->sectors >= refill->ca->tiering_stripe_size); -} - -/** - * tiering_keylist_empty - to prevent a keylist from growing to more than twice - * the tiering stripe size, we stop refill when a keylist has more than a single - * stripe of sectors - */ -static bool tiering_keylist_empty(struct cache *ca) -{ - return (bch_scan_keylist_sectors(&ca->tiering_queue.keys) - <= ca->tiering_stripe_size); -} - -/** - * tiering_refill - to keep all queues busy as much as possible, we add - * up to a single stripe of sectors to each cache device's queue, iterating - * over all cache devices twice, so each one has two stripe's of writes - * queued up, before we have to wait for move operations to complete. - */ -static void tiering_refill(struct cache_set *c, struct tiering_refill *refill) -{ - struct scan_keylist *keys; - struct btree_iter iter; - struct bkey_s_c k; - - if (bkey_cmp(refill->start, POS_MAX) >= 0) - return; - - if (refill->ca == NULL) - return; - - if (!tiering_keylist_empty(refill->ca)) - return; - - trace_bcache_tiering_refill_start(c); - - for_each_btree_key(&iter, c, BTREE_ID_EXTENTS, refill->start, k) { - BKEY_PADDED(k) tmp; - - keys = &refill->ca->tiering_queue.keys; - - if (!tiering_pred(keys, k)) { - refill->start = k.k->p; - goto next; - } - - bkey_reassemble(&tmp.k, k); - bch_cut_front(refill->start, &tmp.k); - - /* Growing the keylist might fail */ - if (bch_scan_keylist_add(keys, bkey_i_to_s_c(&tmp.k))) - goto done; - - /* TODO: split key if refill->sectors is now > stripe_size */ - refill->sectors += tmp.k.k.size; - refill->start = tmp.k.k.p; - - /* Check if we've added enough keys to this keylist */ - if (tiering_keylist_full(refill)) { - /* Move on to refill the next cache device's keylist */ - refill->sectors = 0; - refill->cache_iter++; - refill_next(c, refill); - - /* All cache devices got removed somehow */ - if (refill->ca == NULL) - goto done; - - /* - * If the next cache's keylist is not sufficiently - * empty, wait for it to drain before refilling - * anything. We prioritize even distribution of data - * over maximizing write bandwidth. 
- */ - if (!tiering_keylist_empty(refill->ca)) - goto done; + if (!s->ca || s->sectors > s->stripe_size) { + tier_put_device(s); + s->sectors = 0; + s->dev_idx++; + + spin_lock(&s->tier->lock); + if (s->dev_idx >= s->tier->nr_devices) + s->dev_idx = 0; + + if (s->tier->nr_devices) { + s->ca = s->tier->d[s->dev_idx].dev; + percpu_ref_get(&s->ca->ref); } -next: - bch_btree_iter_cond_resched(&iter); + spin_unlock(&s->tier->lock); } - /* Reached the end of the keyspace */ - refill->start = POS_MAX; -done: - bch_btree_iter_unlock(&iter); - - trace_bcache_tiering_refill_end(c); } -static int issue_tiering_move(struct moving_queue *q, +static int issue_tiering_move(struct cache_set *c, + struct tiering_state *s, struct moving_context *ctxt, struct bkey_s_c k) { - struct cache *ca = container_of(q, struct cache, tiering_queue); - struct cache_set *c = ca->set; struct moving_io *io; - io = moving_io_alloc(c, q, &ca->tiering_write_point, k, NULL); + io = moving_io_alloc(c, + &s->ca->tiering_queue, + &s->ca->tiering_write_point, + k, NULL); if (!io) { trace_bcache_tiering_alloc_fail(c, k.k->size); return -ENOMEM; } trace_bcache_tiering_copy(k.k); - - /* - * IMPORTANT: We must call bch_data_move before we dequeue so - * that the key can always be found in either the pending list - * in the moving queue or in the scan keylist list in the - * moving queue. - * If we reorder, there is a window where a key is not found - * by btree gc marking. - */ - bch_data_move(q, ctxt, io); - bch_scan_keylist_dequeue(&q->keys); + bch_data_move(&s->ca->tiering_queue, ctxt, io); + s->sectors += k.k->size; return 0; } @@ -245,124 +107,80 @@ static int issue_tiering_move(struct moving_queue *q, * tiering_next_cache - issue a move to write an extent to the next cache * device in round robin order */ -static int tiering_next_cache(struct cache_set *c, - int *cache_iter, - struct moving_context *ctxt, - struct tiering_refill *refill) -{ - struct cache_group *tier; - int start = *cache_iter; - struct cache *ca; - - /* If true at the end of the loop, all keylists were empty, so we - * have reached the end of the keyspace */ - bool done = true; - /* If true at the end of the loop, all queues were full, so we must - * wait for some ops to finish */ - bool full = true; - - do { - rcu_read_lock(); - tier = &c->cache_tiers[1]; - if (tier->nr_devices == 0) { - rcu_read_unlock(); - return 0; - } - - if (*cache_iter >= tier->nr_devices) { - rcu_read_unlock(); - *cache_iter = 0; - continue; - } - - ca = rcu_dereference(tier->d[*cache_iter].dev); - if (ca == NULL || - ca->mi.state != CACHE_ACTIVE || - ca->tiering_queue.stopped) { - rcu_read_unlock(); - (*cache_iter)++; - continue; - } - - percpu_ref_get(&ca->ref); - rcu_read_unlock(); - (*cache_iter)++; - - tiering_refill(c, refill); - - if (bch_queue_full(&ca->tiering_queue)) { - done = false; - } else { - struct bkey_i *k = - bch_scan_keylist_next(&ca->tiering_queue.keys); - if (k) { - issue_tiering_move(&ca->tiering_queue, ctxt, - bkey_i_to_s_c(k)); - done = false; - full = false; - } - } - - percpu_ref_put(&ca->ref); - } while (*cache_iter != start); - - if (done) { - /* - * All devices have an empty keylist now, just wait for - * pending moves to finish and we're done. - */ - return 0; - } else if (full) { - /* - * No device with keys still remaining on its keylist has a - * queue that is not full. In this case, we have to wait for - * at least one read to complete before trying again. - * Otherwise, we could issue a read for this device. 
- */ - return -EAGAIN; - } else { - /* Try again immediately */ - return -EIOCBQUEUED; - } -} - -static u64 read_tiering(struct cache_set *c) +static s64 read_tiering(struct cache_set *c, struct cache_group *tier) { struct moving_context ctxt; - struct tiering_refill refill; - int cache_iter = 0; + struct tiering_state s; + struct btree_iter iter; + struct bkey_s_c k; int ret; trace_bcache_tiering_start(c); - refill_init(c, &refill); + memset(&s, 0, sizeof(s)); + s.tier = tier; + s.tier_idx = tier - c->cache_tiers; + s.stripe_size = 2048; /* 1 mb for now */ bch_moving_context_init(&ctxt, &c->tiering_pd.rate, MOVING_PURPOSE_TIERING); + bch_btree_iter_init(&iter, c, BTREE_ID_EXTENTS, POS_MIN); - while (!bch_moving_context_wait(&ctxt)) { - cond_resched(); + while (!bch_moving_context_wait(&ctxt) && + (k = bch_btree_iter_peek(&iter)).k) { + if (!tiering_pred(c, &s, k)) + goto next; - ret = tiering_next_cache(c, &cache_iter, &ctxt, &refill); - if (ret == -EAGAIN) - bch_moving_wait(&ctxt); - else if (!ret) + tier_next_device(c, &s); + if (!s.ca) break; + + if (bch_queue_full(&s.ca->tiering_queue)) { + bch_btree_iter_unlock(&iter); + + if (s.ca->tiering_queue.rotational) + bch_queue_run(&s.ca->tiering_queue, &ctxt); + else + wait_event(s.ca->tiering_queue.wait, + !bch_queue_full(&s.ca->tiering_queue)); + continue; + } + + ret = issue_tiering_move(c, &s, &ctxt, k); + if (ret) { + bch_btree_iter_unlock(&iter); + + /* memory allocation failure, wait for IOs to finish */ + + /* + * XXX: this only waits for IOs issued to this + * particular device, but there may not be any outstanding + * to this device + */ + bch_queue_run(&s.ca->tiering_queue, &ctxt); + continue; + } +next: + bch_btree_iter_advance_pos(&iter); + //bch_btree_iter_cond_resched(&iter); + + /* unlock before calling moving_context_wait() */ + bch_btree_iter_unlock(&iter); + cond_resched(); } + bch_btree_iter_unlock(&iter); + tier_put_device(&s); closure_sync(&ctxt.cl); - refill_done(&refill); - trace_bcache_tiering_end(c, ctxt.sectors_moved, ctxt.keys_moved); - //pr_info("pred y %llu pred n %llu", c->tiering_pred_y, c->tiering_pred_n); - return ctxt.sectors_moved; } static int bch_tiering_thread(void *arg) { struct cache_set *c = arg; + struct cache_group *tier = &c->cache_tiers[1]; struct io_clock *clock = &c->io_clock[WRITE]; struct cache *ca; u64 tier_capacity, available_sectors; @@ -373,20 +191,26 @@ static int bch_tiering_thread(void *arg) while (!kthread_should_stop()) { if (kthread_wait_freezable(c->tiering_enabled && - c->cache_tiers[1].nr_devices)) + tier->nr_devices)) break; while (1) { + struct cache_group *faster_tier; + last = atomic_long_read(&clock->now); tier_capacity = available_sectors = 0; rcu_read_lock(); - group_for_each_cache_rcu(ca, &c->cache_tiers[0], i) { - tier_capacity += - (ca->mi.nbuckets - - ca->mi.first_bucket) << ca->bucket_bits; - available_sectors += - buckets_available_cache(ca) << ca->bucket_bits; + for (faster_tier = c->cache_tiers; + faster_tier != tier; + faster_tier++) { + group_for_each_cache_rcu(ca, faster_tier, i) { + tier_capacity += + (ca->mi.nbuckets - + ca->mi.first_bucket) << ca->bucket_bits; + available_sectors += + buckets_available_cache(ca) << ca->bucket_bits; + } } rcu_read_unlock(); @@ -401,13 +225,12 @@ static int bch_tiering_thread(void *arg) return 0; } - read_tiering(c); + read_tiering(c, tier); } return 0; } -#define TIERING_KEYS_MAX_SIZE DFLT_SCAN_KEYLIST_MAX_SIZE #define TIERING_NR 64 #define TIERING_READ_NR 8 #define TIERING_WRITE_NR 32 @@ -423,7 +246,6 @@ int 
bch_tiering_init_cache(struct cache *ca) return bch_queue_init(&ca->tiering_queue, ca->set, - TIERING_KEYS_MAX_SIZE, TIERING_NR, TIERING_READ_NR, TIERING_WRITE_NR, @@ -458,12 +280,6 @@ void bch_tiering_write_destroy(struct cache *ca) void bch_tiering_write_stop(struct cache *ca) { bch_queue_stop(&ca->tiering_queue); - - /* - * Make sure that it is empty so that gc marking doesn't keep - * marking stale entries from when last used. - */ - bch_scan_keylist_reset(&ca->tiering_queue.keys); } void bch_tiering_read_stop(struct cache_set *c) diff --git a/include/trace/events/bcache.h b/include/trace/events/bcache.h index f4153a5c962d..73b80c8648e4 100644 --- a/include/trace/events/bcache.h +++ b/include/trace/events/bcache.h @@ -1080,9 +1080,9 @@ DECLARE_EVENT_CLASS(moving_io, __entry->inode = k->p.inode; __entry->offset = k->p.offset; __entry->sectors = k->size; - __entry->count = q->count; - __entry->read_count = q->read_count; - __entry->write_count = q->write_count; + __entry->count = atomic_read(&q->count); + __entry->read_count = atomic_read(&q->read_count); + __entry->write_count = atomic_read(&q->write_count); ), TP_printk("%p %u:%llu sectors %u queue %u reads %u writes %u", |