author     Kent Overstreet <kent.overstreet@gmail.com>   2016-10-04 06:16:50 -0800
committer  Kent Overstreet <kent.overstreet@gmail.com>   2017-01-18 21:40:33 -0900
commit     40ff276e09f62f09fb892cc183766b88376783bb (patch)
tree       2e0d68f8223bcc0387aba1a92d6357740a0c0827
parent     b9aadd59175a6475eb2e8b5e21bd38bb9454b099 (diff)
bcache: scan keylists get to die in a fire
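
This patch removes the scan_keylist machinery (keylist.c / keylist_types.h) and the
per-queue key staging built on it. Moving GC, tiering and device migration now walk
the extents btree directly with a btree iterator, and the moving_queue counters become
atomics paired with a waitqueue so submitters can throttle without taking q->lock.
The new loops in migrate.c, movinggc.c and tier.c all share roughly the shape below --
a condensed sketch, not code from the patch; pred() and issue_move() are placeholders
for the per-caller predicate and move-issuing helpers, everything else is taken from
the new code:

	bch_btree_iter_init(&iter, c, BTREE_ID_EXTENTS, POS_MIN);

	while ((k = bch_btree_iter_peek(&iter)).k) {
		if (!pred(ca, k))
			goto next;

		if (bch_queue_full(q)) {
			/* drop btree locks before draining or sleeping */
			bch_btree_iter_unlock(&iter);
			if (q->rotational)
				bch_queue_run(q, &ctxt);
			else
				wait_event(q->wait, !bch_queue_full(q));
			continue;
		}

		if (issue_move(ca, &ctxt, k)) {
			/* -ENOMEM: wait for in-flight moves, retry the same key */
			bch_btree_iter_unlock(&iter);
			bch_queue_run(q, &ctxt);
			continue;
		}
next:
		bch_btree_iter_advance_pos(&iter);
		bch_btree_iter_cond_resched(&iter);
	}
	bch_btree_iter_unlock(&iter);
	bch_queue_run(q, &ctxt);
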
-rw-r--r--  drivers/md/bcache/alloc_types.h   |    2
-rw-r--r--  drivers/md/bcache/bcache.h        |    4
-rw-r--r--  drivers/md/bcache/btree_gc.c      |   18
-rw-r--r--  drivers/md/bcache/keylist.c       |  195
-rw-r--r--  drivers/md/bcache/keylist.h       |   45
-rw-r--r--  drivers/md/bcache/keylist_types.h |   45
-rw-r--r--  drivers/md/bcache/migrate.c       |  250
-rw-r--r--  drivers/md/bcache/move.c          |  176
-rw-r--r--  drivers/md/bcache/move.h          |   40
-rw-r--r--  drivers/md/bcache/move_types.h    |    9
-rw-r--r--  drivers/md/bcache/movinggc.c      |  104
-rw-r--r--  drivers/md/bcache/super.c         |    8
-rw-r--r--  drivers/md/bcache/tier.c          |  392
-rw-r--r--  include/trace/events/bcache.h     |    6
14 files changed, 328 insertions(+), 966 deletions(-)
diff --git a/drivers/md/bcache/alloc_types.h b/drivers/md/bcache/alloc_types.h
index 0bfafbf8c91f..aa3c81268817 100644
--- a/drivers/md/bcache/alloc_types.h
+++ b/drivers/md/bcache/alloc_types.h
@@ -55,7 +55,7 @@ struct cache_group {
unsigned cur_device;
struct {
u64 weight;
- struct cache __rcu *dev;
+ struct cache *dev;
} d[MAX_CACHES_PER_SET];
};
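
The __rcu annotation is dropped from cache_group->d[].dev; this lines up with the new
tiering code later in the patch, where tier_next_device() picks the next device under
the group's lock rather than via rcu_dereference() (excerpt from the new tier.c code):

	spin_lock(&s->tier->lock);
	if (s->dev_idx >= s->tier->nr_devices)
		s->dev_idx = 0;

	if (s->tier->nr_devices) {
		s->ca = s->tier->d[s->dev_idx].dev;
		percpu_ref_get(&s->ca->ref);
	}
	spin_unlock(&s->tier->lock);
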
diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index 1507fe4d5047..cbb2dca3257a 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -686,10 +686,6 @@ struct cache_set {
struct task_struct *gc_thread;
atomic_t kick_gc;
- /* This is a list of scan_keylists for btree GC to scan */
- struct list_head gc_scan_keylists;
- struct mutex gc_scan_keylist_lock;
-
/*
* Tracks GC's progress - everything in the range [ZERO_KEY..gc_cur_pos]
* has been marked by GC.
diff --git a/drivers/md/bcache/btree_gc.c b/drivers/md/bcache/btree_gc.c
index 4db9e54b3751..8bd6070cfee0 100644
--- a/drivers/md/bcache/btree_gc.c
+++ b/drivers/md/bcache/btree_gc.c
@@ -308,23 +308,6 @@ static void bch_mark_pending_btree_node_frees(struct cache_set *c)
mutex_unlock(&c->btree_interior_update_lock);
}
-static void bch_mark_scan_keylists(struct cache_set *c)
-{
- struct scan_keylist *kl;
-
- mutex_lock(&c->gc_scan_keylist_lock);
-
- /* What the goddamn fuck? */
- list_for_each_entry(kl, &c->gc_scan_keylists, mark_list) {
- if (kl->owner == NULL)
- bch_keylist_recalc_oldest_gens(c, kl);
- else
- bch_queue_recalc_oldest_gens(c, kl->owner);
- }
-
- mutex_unlock(&c->gc_scan_keylist_lock);
-}
-
/**
* bch_gc - recompute bucket marks and oldest_gen, rewrite btree nodes
*/
@@ -426,7 +409,6 @@ void bch_gc(struct cache_set *c)
bch_mark_metadata(c);
bch_mark_pending_btree_node_frees(c);
bch_writeback_recalc_oldest_gens(c);
- bch_mark_scan_keylists(c);
for_each_cache(ca, c, i)
atomic_long_set(&ca->saturated_count, 0);
diff --git a/drivers/md/bcache/keylist.c b/drivers/md/bcache/keylist.c
index 4f9d1865abdc..b7af04e5adfa 100644
--- a/drivers/md/bcache/keylist.c
+++ b/drivers/md/bcache/keylist.c
@@ -118,198 +118,3 @@ void bch_keylist_add_in_order(struct keylist *l, struct bkey_i *insert)
BUG_ON(l->top_p > l->end_keys_p);
bkey_copy(where, insert);
}
-
-/* Scan keylists simple utilities */
-
-void bch_scan_keylist_init(struct scan_keylist *kl,
- struct cache_set *c,
- unsigned max_size)
-
-{
- kl->c = c;
- kl->owner = NULL;
-
- mutex_init(&kl->lock);
- kl->max_size = max_size;
- bch_keylist_init(&kl->list, NULL, 0);
-
- /*
- * Order of initialization is tricky, and this makes sure that
- * we have a valid cache set in case the order of
- * initialization chages and breaks things.
- */
- BUG_ON(c == NULL);
- mutex_lock(&c->gc_scan_keylist_lock);
- list_add_tail(&kl->mark_list, &c->gc_scan_keylists);
- mutex_unlock(&c->gc_scan_keylist_lock);
-}
-
-void bch_scan_keylist_destroy(struct scan_keylist *kl)
-{
- if (kl->c) {
- mutex_lock(&kl->c->gc_scan_keylist_lock);
- list_del(&kl->mark_list);
- mutex_unlock(&kl->c->gc_scan_keylist_lock);
- }
-
- mutex_lock(&kl->lock);
- bch_keylist_free(&kl->list);
- mutex_unlock(&kl->lock);
-}
-
-void bch_scan_keylist_reset(struct scan_keylist *kl)
-{
- mutex_lock(&kl->lock);
- kl->list.bot_p = kl->list.top_p = kl->list.start_keys_p;
- mutex_unlock(&kl->lock);
-}
-
-/*
- * This should only be called from sysfs, and holding a lock that prevents
- * re-entrancy.
- */
-void bch_scan_keylist_resize(struct scan_keylist *kl,
- unsigned max_size)
-{
- mutex_lock(&kl->lock);
- kl->max_size = max_size; /* May be smaller than current size */
- mutex_unlock(&kl->lock);
-}
-
-/**
- * bch_keylist_recalc_oldest_gens - update oldest_gen pointers from keylist keys
- *
- * This prevents us from wrapping around gens for a bucket only referenced from
- * the tiering or moving GC keylists. We don't actually care that the data in
- * those buckets is marked live, only that we don't wrap the gens.
- *
- * Note: This interlocks with insertions, but not all dequeues interlock.
- * The particular case in which dequeues don't interlock is when a
- * scan list used by the copy offload ioctls is used as a plain
- * keylist for btree insertion.
- * The btree insertion code doesn't go through
- * bch_scan_keylist_dequeue below, and instead uses plain
- * bch_keylist_dequeue. The other pointers (top, start, end) are
- * unchanged in this case.
- * A little care with the bottomp pointer suffices in this case.
- * Of course, we may end up marking stuff that we don't need to mark,
- * but was recently valid and we have likely just inserted in the tree
- * anyway.
- */
-void bch_keylist_recalc_oldest_gens(struct cache_set *c,
- struct scan_keylist *kl)
-{
- struct bkey_i *k;
-
- mutex_lock(&kl->lock);
-
- for_each_keylist_key(&kl->list, k)
- bch_btree_key_recalc_oldest_gen(c, bkey_i_to_s_c(k));
-
- mutex_unlock(&kl->lock);
-}
-
-int bch_scan_keylist_add(struct scan_keylist *kl, struct bkey_s_c k)
-{
- int ret;
-
- mutex_lock(&kl->lock);
- ret = bch_keylist_realloc_max(&kl->list,
- k.k->u64s,
- kl->max_size);
-
- if (!ret) {
- bkey_reassemble(kl->list.top, k);
- bch_keylist_enqueue(&kl->list);
- atomic64_add(k.k->size, &kl->sectors);
- }
- mutex_unlock(&kl->lock);
-
- return ret;
-}
-
-/* Actual scanning functionality of scan_keylists */
-
-static void bch_refill_scan_keylist(struct cache_set *c,
- struct scan_keylist *kl,
- struct bpos *last_scanned,
- struct bpos end,
- scan_keylist_pred_fn *pred)
-{
- struct bpos start = *last_scanned;
- struct btree_iter iter;
- struct bkey_s_c k;
- unsigned nr_found = 0;
-
- for_each_btree_key(&iter, c, BTREE_ID_EXTENTS, *last_scanned, k) {
- if (bkey_cmp(k.k->p, end) >= 0) {
- *last_scanned = k.k->p;
- goto done;
- }
-
- if (pred(kl, k)) {
- BKEY_PADDED(k) tmp;
-
- bkey_reassemble(&tmp.k, k);
- bch_cut_front(*last_scanned, &tmp.k);
-
- if (bch_scan_keylist_add(kl, bkey_i_to_s_c(&tmp.k)))
- goto done;
-
- nr_found++;
- }
-
- *last_scanned = k.k->p;
- bch_btree_iter_cond_resched(&iter);
- }
-
- /* If we end up here, it means:
- * - the map_fn didn't fill up the keybuf
- * - the map_fn didn't see the end key
- * - there were no more keys to map over
- * Therefore, we are at the end of the key space */
- *last_scanned = POS_MAX;
-done:
- bch_btree_iter_unlock(&iter);
-
- trace_bcache_keyscan(nr_found,
- start.inode, start.offset,
- last_scanned->inode,
- last_scanned->offset);
-}
-
-struct bkey_i *bch_scan_keylist_next(struct scan_keylist *kl)
-{
- if (bch_keylist_empty(&kl->list))
- return NULL;
-
- return bch_keylist_front(&kl->list);
-}
-
-struct bkey_i *bch_scan_keylist_next_rescan(struct cache_set *c,
- struct scan_keylist *kl,
- struct bpos *last_scanned,
- struct bpos end,
- scan_keylist_pred_fn *pred)
-{
- if (bch_keylist_empty(&kl->list)) {
- if (bkey_cmp(*last_scanned, end) >= 0)
- return NULL;
-
- bch_refill_scan_keylist(c, kl, last_scanned, end, pred);
- }
-
- return bch_scan_keylist_next(kl);
-}
-
-void bch_scan_keylist_dequeue(struct scan_keylist *kl)
-{
- u64 sectors;
-
- mutex_lock(&kl->lock);
- sectors = kl->list.bot->k.size;
- bch_keylist_dequeue(&kl->list);
- mutex_unlock(&kl->lock);
-
- BUG_ON(atomic64_sub_return(sectors, &kl->sectors) < 0);
-}
diff --git a/drivers/md/bcache/keylist.h b/drivers/md/bcache/keylist.h
index 028552757527..990ba72ee60c 100644
--- a/drivers/md/bcache/keylist.h
+++ b/drivers/md/bcache/keylist.h
@@ -116,49 +116,4 @@ void bch_keylist_add_in_order(struct keylist *, struct bkey_i *);
int bch_keylist_realloc(struct keylist *, unsigned);
int bch_keylist_realloc_max(struct keylist *, unsigned, unsigned);
-void bch_scan_keylist_init(struct scan_keylist *kl,
- struct cache_set *c,
- unsigned max_size);
-
-void bch_scan_keylist_reset(struct scan_keylist *kl);
-
-/* The keylist is dynamically adjusted. This just clamps the maxima */
-
-static inline unsigned bch_scan_keylist_size(struct scan_keylist *kl)
-{
- return kl->max_size;
-}
-
-static inline u64 bch_scan_keylist_sectors(struct scan_keylist *kl)
-{
- return atomic64_read(&kl->sectors);
-}
-
-void bch_scan_keylist_resize(struct scan_keylist *kl,
- unsigned max_size);
-
-void bch_scan_keylist_destroy(struct scan_keylist *kl);
-
-/*
- * IMPORTANT: The caller of bch_scan_keylist_next or
- * bch_scan_keylist_next_rescan needs to copy any
- * non-null return value before calling either again!
- * These functions return a pointer into the internal structure.
- * Furthermore, they need to call bch_scan_keylist_advance after
- * copying the structure.
- */
-
-struct bkey_i *bch_scan_keylist_next(struct scan_keylist *);
-
-struct bkey_i *bch_scan_keylist_next_rescan(struct cache_set *c,
- struct scan_keylist *kl,
- struct bpos *last_scanned,
- struct bpos end,
- scan_keylist_pred_fn *pred);
-
-int bch_scan_keylist_add(struct scan_keylist *, struct bkey_s_c);
-void bch_scan_keylist_dequeue(struct scan_keylist *);
-
-void bch_keylist_recalc_oldest_gens(struct cache_set *, struct scan_keylist *);
-
#endif /* _BCACHE_KEYLIST_H */
diff --git a/drivers/md/bcache/keylist_types.h b/drivers/md/bcache/keylist_types.h
index 569cdc2480e2..156fbe0745fd 100644
--- a/drivers/md/bcache/keylist_types.h
+++ b/drivers/md/bcache/keylist_types.h
@@ -48,49 +48,4 @@ struct keylist {
bool has_buf;
};
-/*
- * scan_keylists are conceptually similar to keybufs, but they don't
- * have an internal RB tree.
- * keybufs should be used when read or write operations need to
- * examine keys in flight, as for writeback.
- * But for moving operations (moving gc, tiering, moving data off
- * devices), read and writes don't need to look at all, so we don't
- * need the RB tree and use scan_keylists instead.
- *
- * Note that unlike keybufs, they don't contain a semaphore to limit
- * bios. That must be done externally, if necessary.
- */
-
-#define DFLT_SCAN_KEYLIST_MAX_SIZE 512
-
-struct scan_keylist {
- struct list_head mark_list; /* For GC marking */
-
- struct cache_set *c; /* For destroying */
-
- /*
- * Only one thread is allowed to mutate the keylist. Other threads can
- * read it. The mutex has to be taken by the mutator thread when
- * mutating the keylist, and by other threads when reading, but not by
- * the mutator thread when reading.
- */
- struct mutex lock;
- /*
- * Maximum size, in u64s. The keylist will not grow beyond this size.
- */
- unsigned max_size;
- /*
- * Number of sectors in keys currently on the keylist.
- */
- atomic64_t sectors;
- /*
- * The underlying keylist.
- */
- struct keylist list;
-
- struct moving_queue *owner;
-};
-
-typedef bool (scan_keylist_pred_fn)(struct scan_keylist *, struct bkey_s_c);
-
#endif /* _BCACHE_KEYLIST_TYPES_H */
diff --git a/drivers/md/bcache/migrate.c b/drivers/md/bcache/migrate.c
index 1169f8fc28a1..46d05e32b2f6 100644
--- a/drivers/md/bcache/migrate.c
+++ b/drivers/md/bcache/migrate.c
@@ -12,20 +12,9 @@
#include "migrate.h"
#include "move.h"
-static bool migrate_data_pred(struct scan_keylist *kl, struct bkey_s_c k)
-{
- struct cache *ca = container_of(kl, struct cache,
- moving_gc_queue.keys);
-
- return bkey_extent_is_data(k.k) &&
- bch_extent_has_device(bkey_s_c_to_extent(k),
- ca->sb.nr_this_dev);
-}
-
static int issue_migration_move(struct cache *ca,
struct moving_context *ctxt,
- struct bkey_s_c k,
- u64 *seen_key_count)
+ struct bkey_s_c k)
{
struct moving_queue *q = &ca->moving_gc_queue;
struct cache_set *c = ca->set;
@@ -49,24 +38,10 @@ found:
}
bch_data_move(q, ctxt, io);
- (*seen_key_count)++;
-
- /*
- * IMPORTANT: We must call bch_data_move before we dequeue so
- * that the key can always be found in either the pending list
- * in the moving queue or in the scan keylist list in the
- * moving queue.
- * If we reorder, there is a window where a key is not found
- * by btree gc marking.
- */
- bch_scan_keylist_dequeue(&q->keys);
return 0;
}
-#define MIGRATION_DEBUG 0
-
#define MAX_DATA_OFF_ITER 10
-#define PASS_LOW_LIMIT (MIGRATION_DEBUG ? 0 : 2)
#define MIGRATE_NR 64
#define MIGRATE_READ_NR 32
#define MIGRATE_WRITE_NR 32
@@ -87,15 +62,14 @@ found:
int bch_move_data_off_device(struct cache *ca)
{
- int ret;
- struct bkey_i *k;
- unsigned pass;
- u64 seen_key_count;
- unsigned last_error_count;
- unsigned last_error_flags;
- struct moving_context context;
+ struct moving_context ctxt;
struct cache_set *c = ca->set;
struct moving_queue *queue = &ca->moving_gc_queue;
+ unsigned pass = 0;
+ u64 seen_key_count;
+ int ret = 0;
+
+ BUG_ON(ca->mi.state == CACHE_ACTIVE);
/*
* This reuses the moving gc queue as it is no longer in use
@@ -116,8 +90,8 @@ int bch_move_data_off_device(struct cache *ca)
queue_io_resize(queue, MIGRATE_NR, MIGRATE_READ_NR, MIGRATE_WRITE_NR);
BUG_ON(queue->wq == NULL);
- bch_moving_context_init(&context, NULL, MOVING_PURPOSE_MIGRATION);
- context.avoid = ca;
+ bch_moving_context_init(&ctxt, NULL, MOVING_PURPOSE_MIGRATION);
+ ctxt.avoid = ca;
/*
* In theory, only one pass should be necessary as we've
@@ -136,142 +110,128 @@ int bch_move_data_off_device(struct cache *ca)
* but that can be viewed as a verification pass.
*/
- seen_key_count = 1;
- last_error_count = 0;
- last_error_flags = 0;
-
- for (pass = 0;
- (seen_key_count != 0 && (pass < MAX_DATA_OFF_ITER));
- pass++) {
- bool again;
+ do {
+ struct btree_iter iter;
+ struct bkey_s_c k;
seen_key_count = 0;
- atomic_set(&context.error_count, 0);
- atomic_set(&context.error_flags, 0);
- context.last_scanned = POS_MIN;
+ atomic_set(&ctxt.error_count, 0);
+ atomic_set(&ctxt.error_flags, 0);
-again:
- again = false;
+ bch_btree_iter_init(&iter, c, BTREE_ID_EXTENTS, POS_MIN);
+
+ while ((k = bch_btree_iter_peek(&iter)).k) {
+ if (!bkey_extent_is_data(k.k) ||
+ !bch_extent_has_device(bkey_s_c_to_extent(k),
+ ca->sb.nr_this_dev))
+ goto next;
- while (1) {
if (bch_queue_full(queue)) {
- if (queue->rotational) {
- again = true;
- break;
- } else {
- bch_moving_wait(&context);
- continue;
- }
+ bch_btree_iter_unlock(&iter);
+
+ if (queue->rotational)
+ bch_queue_run(queue, &ctxt);
+ else
+ wait_event(queue->wait,
+ !bch_queue_full(queue));
+ continue;
}
- k = bch_scan_keylist_next_rescan(c,
- &queue->keys,
- &context.last_scanned,
- POS_MAX,
- migrate_data_pred);
- if (k == NULL)
- break;
+ ret = issue_migration_move(ca, &ctxt, k);
+ if (ret == -ENOMEM) {
+ bch_btree_iter_unlock(&iter);
- if (issue_migration_move(ca, &context, bkey_i_to_s_c(k),
- &seen_key_count)) {
/*
- * Memory allocation failed; we will wait for
- * all queued moves to finish and continue
- * scanning starting from the same key
+ * memory allocation failure, wait for IOs to
+ * finish
*/
- again = true;
- break;
+ bch_queue_run(queue, &ctxt);
+ continue;
}
- }
+ if (ret == -ENOSPC) {
+ bch_btree_iter_unlock(&iter);
+ bch_queue_run(queue, &ctxt);
+ return -ENOSPC;
+ }
+ BUG_ON(ret);
- bch_queue_run(queue, &context);
- if (again)
- goto again;
+ seen_key_count++;
+next:
+ bch_btree_iter_advance_pos(&iter);
+ bch_btree_iter_cond_resched(&iter);
- if ((pass >= PASS_LOW_LIMIT)
- && (seen_key_count != (MIGRATION_DEBUG ? ~0ULL : 0))) {
- pr_notice("found %llu keys on pass %u.",
- seen_key_count, pass);
}
+ ret = bch_btree_iter_unlock(&iter);
+ bch_queue_run(queue, &ctxt);
- last_error_count = atomic_read(&context.error_count);
- last_error_flags = atomic_read(&context.error_flags);
-
- if (last_error_count != 0) {
- pr_notice("pass %u: error count = %u, error flags = 0x%x",
- pass, last_error_count, last_error_flags);
- }
- }
+ if (ret)
+ return ret;
+ } while (seen_key_count && pass++ < MAX_DATA_OFF_ITER);
- if (seen_key_count != 0 || last_error_count != 0) {
+ if (seen_key_count) {
pr_err("Unable to migrate all data in %d iterations.",
MAX_DATA_OFF_ITER);
- ret = -EDEADLK;
- } else if (MIGRATION_DEBUG)
- pr_notice("Migrated all data in %d iterations", pass);
+ return -1;
+ }
- bch_queue_run(queue, &context);
- return ret;
+ return 0;
}
/*
* This walks the btree, and for any node on the relevant device it moves the
* node elsewhere.
*/
-static int bch_move_btree_off(struct cache *ca,
- enum btree_id id,
- const char *name)
+static int bch_move_btree_off(struct cache *ca, enum btree_id id)
{
+ struct cache_set *c = ca->set;
+ struct btree_iter iter;
struct closure cl;
- unsigned pass;
+ struct btree *b;
+ int ret;
- closure_init_stack(&cl);
+ BUG_ON(ca->mi.state == CACHE_ACTIVE);
- pr_debug("Moving %s btree off device %u",
- name, ca->sb.nr_this_dev);
+ closure_init_stack(&cl);
- for (pass = 0; (pass < MAX_DATA_OFF_ITER); pass++) {
- struct btree_iter iter;
- struct btree *b;
- unsigned moved = 0, seen = 0;
- int ret;
-
- for_each_btree_node(&iter, ca->set, id, POS_MIN, 0, b) {
- struct bkey_s_c_extent e =
- bkey_i_to_s_c_extent(&b->key);
- seen++;
+ for_each_btree_node(&iter, c, id, POS_MIN, 0, b) {
+ struct bkey_s_c_extent e = bkey_i_to_s_c_extent(&b->key);
retry:
- if (!bch_extent_has_device(e, ca->sb.nr_this_dev))
- continue;
-
- if (bch_btree_node_rewrite(&iter, b, &cl)) {
- /*
- * Drop locks to upgrade locks or wait on
- * reserve: after retaking, recheck in case we
- * raced.
- */
- bch_btree_iter_unlock(&iter);
- closure_sync(&cl);
- b = bch_btree_iter_peek_node(&iter);
- goto retry;
- }
-
- moved++;
- iter.locks_want = 0;
+ if (!bch_extent_has_device(e, ca->sb.nr_this_dev))
+ continue;
+
+ ret = bch_btree_node_rewrite(&iter, b, &cl);
+ if (ret == -EINTR || ret == -ENOSPC) {
+ /*
+ * Drop locks to upgrade locks or wait on
+ * reserve: after retaking, recheck in case we
+ * raced.
+ */
+ bch_btree_iter_unlock(&iter);
+ closure_sync(&cl);
+ b = bch_btree_iter_peek_node(&iter);
+ goto retry;
}
- ret = bch_btree_iter_unlock(&iter);
- if (ret)
- return ret; /* btree IO error */
+ if (ret) {
+ bch_btree_iter_unlock(&iter);
+ return ret;
+ }
+
+ iter.locks_want = 0;
+ }
+ ret = bch_btree_iter_unlock(&iter);
+ if (ret)
+ return ret; /* btree IO error */
- if (!moved)
- return 0;
+ if (IS_ENABLED(CONFIG_BCACHE_DEBUG)) {
+ for_each_btree_node(&iter, c, id, POS_MIN, 0, b) {
+ struct bkey_s_c_extent e = bkey_i_to_s_c_extent(&b->key);
- pr_debug("%s pass %u: seen %u, moved %u.",
- name, pass, seen, moved);
+ BUG_ON(bch_extent_has_device(e, ca->sb.nr_this_dev));
+ }
+ bch_btree_iter_unlock(&iter);
}
- /* Failed: */
- return -1;
+ return 0;
}
/*
@@ -321,25 +281,25 @@ retry:
int bch_move_meta_data_off_device(struct cache *ca)
{
unsigned i;
- int ret = 0; /* Success */
+ int ret;
/* 1st, Move the btree nodes off the device */
- for (i = 0; i < BTREE_ID_NR; i++)
- if (bch_move_btree_off(ca, i, bch_btree_id_names[i]) != 0)
- return 1;
+ for (i = 0; i < BTREE_ID_NR; i++) {
+ ret = bch_move_btree_off(ca, i);
+ if (ret)
+ return ret;
+ }
/* There are no prios/gens to move -- they are already in the device. */
/* 2nd. Move the journal off the device */
- if (bch_journal_move(ca) != 0) {
- pr_err("Unable to move the journal off in %pU.",
- ca->set->disk_sb.user_uuid.b);
- ret = 1; /* Failure */
- }
+ ret = bch_journal_move(ca);
+ if (ret)
+ return ret;
- return ret;
+ return 0;
}
/*
diff --git a/drivers/md/bcache/move.c b/drivers/md/bcache/move.c
index 9c4abbe9576e..f9c2d9e46e84 100644
--- a/drivers/md/bcache/move.c
+++ b/drivers/md/bcache/move.c
@@ -162,50 +162,14 @@ void bch_moving_context_init(struct moving_context *ctxt,
enum moving_purpose purpose)
{
memset(ctxt, 0, sizeof(*ctxt));
- ctxt->task = current;
ctxt->rate = rate;
ctxt->purpose = purpose;
closure_init_stack(&ctxt->cl);
}
-/*
- * bch_moving_wait() -- wait for a bch_moving_notify() call
- *
- * To deal with lost wakeups, we make this return immediately if notify
- * was already called.
- */
-void bch_moving_wait(struct moving_context *ctxt)
-{
- while (1) {
- set_current_state(TASK_INTERRUPTIBLE);
- if (atomic_xchg(&ctxt->pending, 0))
- break;
- schedule();
- }
- __set_current_state(TASK_RUNNING);
-}
-
-static void bch_moving_notify(struct moving_context *ctxt)
-{
- atomic_set(&ctxt->pending, 1);
- wake_up_process(ctxt->task);
-}
-
-static bool __bch_queue_reads_pending(struct moving_queue *q)
-{
- return (q->read_count > 0 || !RB_EMPTY_ROOT(&q->tree));
-}
-
static bool bch_queue_reads_pending(struct moving_queue *q)
{
- unsigned long flags;
- bool pending;
-
- spin_lock_irqsave(&q->lock, flags);
- pending = __bch_queue_reads_pending(q);
- spin_unlock_irqrestore(&q->lock, flags);
-
- return pending;
+ return atomic_read(&q->read_count) || !RB_EMPTY_ROOT(&q->tree);
}
static void bch_queue_write(struct moving_queue *q)
@@ -276,7 +240,6 @@ static void moving_io_destructor(struct closure *cl)
{
struct moving_io *io = container_of(cl, struct moving_io, cl);
struct moving_queue *q = io->q;
- struct moving_context *ctxt = io->context;
unsigned long flags;
bool kick_writes = true;
@@ -285,28 +248,29 @@ static void moving_io_destructor(struct closure *cl)
spin_lock_irqsave(&q->lock, flags);
- BUG_ON(!q->count);
- q->count--;
-
if (io->read_issued) {
- BUG_ON(!q->read_count);
- q->read_count--;
+ BUG_ON(!atomic_read(&q->read_count));
+ atomic_dec(&q->read_count);
}
if (io->write_issued) {
- BUG_ON(!q->write_count);
- q->write_count--;
+ BUG_ON(!atomic_read(&q->write_count));
+ atomic_dec(&q->write_count);
trace_bcache_move_write_done(q, &io->write.key.k);
}
+ BUG_ON(!atomic_read(&q->count));
+ atomic_dec(&q->count);
+ wake_up(&q->wait);
+
list_del_init(&io->list);
- if ((q->count == 0) && (q->stop_waitcl != NULL)) {
+ if (!atomic_read(&q->count) && q->stop_waitcl) {
closure_put(q->stop_waitcl);
q->stop_waitcl = NULL;
}
- if (q->rotational && __bch_queue_reads_pending(q))
+ if (q->rotational && bch_queue_reads_pending(q))
kick_writes = false;
if (list_empty(&q->pending))
@@ -318,8 +282,6 @@ static void moving_io_destructor(struct closure *cl)
if (kick_writes)
bch_queue_write(q);
-
- bch_moving_notify(ctxt);
}
static void moving_io_after_write(struct closure *cl)
@@ -336,13 +298,12 @@ static void moving_io_after_write(struct closure *cl)
static void write_moving(struct moving_io *io)
{
bool stopped;
- unsigned long flags;
struct bch_write_op *op = &io->write.op;
- spin_lock_irqsave(&io->q->lock, flags);
- BUG_ON(io->q->count == 0);
+ spin_lock_irq(&io->q->lock);
+ BUG_ON(!atomic_read(&io->q->count));
stopped = io->q->stopped;
- spin_unlock_irqrestore(&io->q->lock, flags);
+ spin_unlock_irq(&io->q->lock);
/*
* If the queue has been stopped, prevent the write from occurring.
@@ -362,17 +323,17 @@ static void bch_queue_write_work(struct work_struct *work)
{
struct moving_queue *q = container_of(work, struct moving_queue, work);
struct moving_io *io;
- unsigned long flags;
- spin_lock_irqsave(&q->lock, flags);
+ spin_lock_irq(&q->lock);
- if (q->rotational && __bch_queue_reads_pending(q)) {
+ if (q->rotational && bch_queue_reads_pending(q)) {
/* All reads should have finished before writes start */
- spin_unlock_irqrestore(&q->lock, flags);
+ spin_unlock_irq(&q->lock);
return;
}
- while (!q->stopped && q->write_count < q->max_write_count) {
+ while (!q->stopped &&
+ atomic_read(&q->write_count) < q->max_write_count) {
io = list_first_entry_or_null(&q->pending,
struct moving_io, list);
/*
@@ -388,17 +349,17 @@ static void bch_queue_write_work(struct work_struct *work)
break;
BUG_ON(io->write_issued);
- q->write_count++;
+ atomic_inc(&q->write_count);
io->write_issued = 1;
list_del(&io->list);
list_add_tail(&io->list, &q->write_pending);
trace_bcache_move_write(q, &io->write.key.k);
- spin_unlock_irqrestore(&q->lock, flags);
+ spin_unlock_irq(&q->lock);
write_moving(io);
- spin_lock_irqsave(&q->lock, flags);
+ spin_lock_irq(&q->lock);
}
- spin_unlock_irqrestore(&q->lock, flags);
+ spin_unlock_irq(&q->lock);
}
/*
@@ -408,7 +369,6 @@ static void bch_queue_write_work(struct work_struct *work)
int bch_queue_init(struct moving_queue *q,
struct cache_set *c,
- unsigned max_size,
unsigned max_count,
unsigned max_read_count,
unsigned max_write_count,
@@ -417,7 +377,6 @@ int bch_queue_init(struct moving_queue *q,
{
INIT_WORK(&q->work, bch_queue_write_work);
- q->keys.owner = q;
q->max_count = max_count;
q->max_read_count = max_read_count;
q->max_write_count = max_write_count;
@@ -427,6 +386,7 @@ int bch_queue_init(struct moving_queue *q,
INIT_LIST_HEAD(&q->pending);
INIT_LIST_HEAD(&q->write_pending);
q->tree = RB_ROOT;
+ init_waitqueue_head(&q->wait);
q->wq = alloc_workqueue(name,
WQ_UNBOUND|WQ_FREEZABLE|WQ_MEM_RECLAIM, 1);
@@ -443,8 +403,6 @@ void bch_queue_start(struct moving_queue *q)
spin_lock_irqsave(&q->lock, flags);
q->stopped = false;
spin_unlock_irqrestore(&q->lock, flags);
-
- bch_scan_keylist_reset(&q->keys);
}
void queue_io_resize(struct moving_queue *q,
@@ -466,8 +424,6 @@ void bch_queue_destroy(struct moving_queue *q)
if (q->wq)
destroy_workqueue(q->wq);
q->wq = NULL;
-
- bch_scan_keylist_destroy(&q->keys);
}
static void bch_queue_cancel_writes(struct moving_queue *q)
@@ -489,8 +445,11 @@ static void bch_queue_cancel_writes(struct moving_queue *q)
list_del_init(&io->list);
read_issued = io->read_issued;
read_completed = io->read_completed;
- if (!read_issued && !read_completed && q->rotational)
+ if (!read_issued && !read_completed && q->rotational) {
rb_erase(&io->node, &q->tree);
+ wake_up(&q->wait);
+ }
+
spin_unlock_irqrestore(&q->lock, flags);
if (read_completed)
closure_return_with_destructor_noreturn(&io->cl,
@@ -505,22 +464,21 @@ static void bch_queue_cancel_writes(struct moving_queue *q)
void bch_queue_stop(struct moving_queue *q)
{
- unsigned long flags;
struct closure waitcl;
closure_init_stack(&waitcl);
- spin_lock_irqsave(&q->lock, flags);
+ spin_lock_irq(&q->lock);
if (q->stopped)
BUG_ON(q->stop_waitcl != NULL);
else {
q->stopped = true;
- if (q->count != 0) {
+ if (atomic_read(&q->count)) {
q->stop_waitcl = &waitcl;
closure_get(&waitcl);
}
}
- spin_unlock_irqrestore(&q->lock, flags);
+ spin_unlock_irq(&q->lock);
bch_queue_cancel_writes(q);
@@ -549,9 +507,6 @@ void bch_queue_recalc_oldest_gens(struct cache_set *c, struct moving_queue *q)
{
unsigned long flags;
- /* 1st, mark the keylist keys */
- bch_keylist_recalc_oldest_gens(c, &q->keys);
-
/* 2nd, mark the keys in the I/Os */
spin_lock_irqsave(&q->lock, flags);
@@ -566,7 +521,6 @@ static void read_moving_endio(struct bio *bio)
struct closure *cl = bio->bi_private;
struct moving_io *io = container_of(cl, struct moving_io, cl);
struct moving_queue *q = io->q;
- struct moving_context *ctxt = io->context;
bool stopped;
unsigned long flags;
@@ -586,8 +540,11 @@ static void read_moving_endio(struct bio *bio)
BUG_ON(io->read_completed);
io->read_issued = 0;
io->read_completed = 1;
- BUG_ON(!q->read_count);
- q->read_count--;
+
+ BUG_ON(!atomic_read(&q->read_count));
+ atomic_dec(&q->read_count);
+ wake_up(&q->wait);
+
stopped = q->stopped;
if (stopped)
list_del_init(&io->list);
@@ -598,8 +555,6 @@ static void read_moving_endio(struct bio *bio)
moving_io_destructor);
else if (!q->rotational)
bch_queue_write(q);
-
- bch_moving_notify(ctxt);
}
static void __bch_data_move(struct closure *cl)
@@ -628,27 +583,6 @@ static void __bch_data_move(struct closure *cl)
&pick, BCH_READ_IS_LAST);
}
-/*
- * bch_queue_full() - return if more reads can be queued with bch_data_move().
- *
- * In rotational mode, always returns false if no reads are in flight (see
- * how max_count is initialized in bch_queue_init()).
- */
-bool bch_queue_full(struct moving_queue *q)
-{
- unsigned long flags;
- bool full;
-
- spin_lock_irqsave(&q->lock, flags);
- BUG_ON(q->count > q->max_count);
- BUG_ON(q->read_count > q->max_read_count);
- full = (q->count == q->max_count ||
- q->read_count == q->max_read_count);
- spin_unlock_irqrestore(&q->lock, flags);
-
- return full;
-}
-
static int moving_io_cmp(struct moving_io *io1, struct moving_io *io2)
{
if (io1->sort_key < io2->sort_key)
@@ -671,20 +605,19 @@ void bch_data_move(struct moving_queue *q,
struct moving_context *ctxt,
struct moving_io *io)
{
- unsigned long flags;
bool stopped = false;
BUG_ON(q->wq == NULL);
io->q = q;
io->context = ctxt;
- spin_lock_irqsave(&q->lock, flags);
+ spin_lock_irq(&q->lock);
if (q->stopped) {
stopped = true;
goto out;
}
- q->count++;
+ atomic_inc(&q->count);
list_add_tail(&io->list, &q->pending);
trace_bcache_move_read(q, &io->write.key.k);
@@ -693,11 +626,11 @@ void bch_data_move(struct moving_queue *q,
else {
BUG_ON(io->read_issued);
io->read_issued = 1;
- q->read_count++;
+ atomic_inc(&q->read_count);
}
out:
- spin_unlock_irqrestore(&q->lock, flags);
+ spin_unlock_irq(&q->lock);
if (stopped)
moving_io_free(io);
@@ -710,26 +643,27 @@ out:
static bool bch_queue_read(struct moving_queue *q,
struct moving_context *ctxt)
{
- unsigned long flags;
struct rb_node *node;
struct moving_io *io;
bool stopped;
BUG_ON(!q->rotational);
- spin_lock_irqsave(&q->lock, flags);
+ spin_lock_irq(&q->lock);
node = rb_first(&q->tree);
if (!node) {
- spin_unlock_irqrestore(&q->lock, flags);
+ spin_unlock_irq(&q->lock);
return false;
}
io = rb_entry(node, struct moving_io, node);
rb_erase(node, &q->tree);
+ wake_up(&q->wait);
+
io->read_issued = 1;
- q->read_count++;
+ atomic_inc(&q->read_count);
stopped = q->stopped;
- spin_unlock_irqrestore(&q->lock, flags);
+ spin_unlock_irq(&q->lock);
if (stopped) {
moving_io_destructor(&io->cl);
@@ -742,31 +676,19 @@ static bool bch_queue_read(struct moving_queue *q,
void bch_queue_run(struct moving_queue *q, struct moving_context *ctxt)
{
- unsigned long flags;
- bool full;
-
if (!q->rotational)
goto sync;
while (!bch_moving_context_wait(ctxt)) {
- spin_lock_irqsave(&q->lock, flags);
- full = (q->read_count == q->max_read_count);
- spin_unlock_irqrestore(&q->lock, flags);
-
- if (full) {
- bch_moving_wait(ctxt);
- continue;
- }
+ wait_event(q->wait,
+ atomic_read(&q->read_count) < q->max_read_count);
if (!bch_queue_read(q, ctxt))
break;
}
- while (bch_queue_reads_pending(q))
- bch_moving_wait(ctxt);
-
+ wait_event(q->wait, !bch_queue_reads_pending(q));
bch_queue_write(q);
-
sync:
closure_sync(&ctxt->cl);
}
diff --git a/drivers/md/bcache/move.h b/drivers/md/bcache/move.h
index 0f8f787a5168..8f30f078f3f5 100644
--- a/drivers/md/bcache/move.h
+++ b/drivers/md/bcache/move.h
@@ -3,6 +3,7 @@
#include "buckets.h"
#include "io_types.h"
+#include "move_types.h"
enum moving_purpose {
MOVING_PURPOSE_UNKNOWN, /* Un-init */
@@ -27,17 +28,11 @@ struct moving_context {
atomic_t error_count;
atomic_t error_flags;
- /* If != 0, @task is waiting for a read or write to complete */
- atomic_t pending;
- struct task_struct *task;
/* Key and sector moves issued, updated from submission context */
u64 keys_moved;
u64 sectors_moved;
- /* Last key scanned */
- struct bpos last_scanned;
-
/* Rate-limiter counting submitted reads */
struct bch_ratelimit *rate;
@@ -59,8 +54,6 @@ static inline int bch_moving_context_wait(struct moving_context *ctxt)
return bch_ratelimit_wait_freezable_stoppable(ctxt->rate, &ctxt->cl);
}
-void bch_moving_wait(struct moving_context *);
-
struct migrate_write {
BKEY_PADDED(key);
bool promote;
@@ -124,14 +117,28 @@ typedef struct moving_io *(moving_queue_fn)(struct moving_queue *,
int bch_queue_init(struct moving_queue *,
struct cache_set *,
- unsigned max_keys,
unsigned max_ios,
unsigned max_reads,
unsigned max_writes,
bool rotational,
const char *);
void bch_queue_start(struct moving_queue *);
-bool bch_queue_full(struct moving_queue *);
+
+/*
+ * bch_queue_full() - return if more reads can be queued with bch_data_move().
+ *
+ * In rotational mode, always returns false if no reads are in flight (see
+ * how max_count is initialized in bch_queue_init()).
+ */
+static inline bool bch_queue_full(struct moving_queue *q)
+{
+ EBUG_ON(atomic_read(&q->count) > q->max_count);
+ EBUG_ON(atomic_read(&q->read_count) > q->max_read_count);
+
+ return atomic_read(&q->count) == q->max_count ||
+ atomic_read(&q->read_count) == q->max_read_count;
+}
+
void bch_data_move(struct moving_queue *,
struct moving_context *,
struct moving_io *);
@@ -149,21 +156,18 @@ void bch_queue_run(struct moving_queue *, struct moving_context *);
#define sysfs_queue_attribute(name) \
rw_attribute(name##_max_count); \
rw_attribute(name##_max_read_count); \
- rw_attribute(name##_max_write_count); \
- rw_attribute(name##_max_keys)
+ rw_attribute(name##_max_write_count);
#define sysfs_queue_files(name) \
&sysfs_##name##_max_count, \
&sysfs_##name##_max_read_count, \
- &sysfs_##name##_max_write_count, \
- &sysfs_##name##_max_keys
+ &sysfs_##name##_max_write_count
#define sysfs_queue_show(name, var) \
do { \
sysfs_hprint(name##_max_count, (var)->max_count); \
sysfs_print(name##_max_read_count, (var)->max_read_count); \
sysfs_print(name##_max_write_count, (var)->max_write_count);\
- sysfs_print(name##_max_keys, bch_scan_keylist_size(&(var)->keys));\
} while (0)
#define sysfs_queue_store(name, var) \
@@ -171,12 +175,6 @@ do { \
sysfs_strtoul(name##_max_count, (var)->max_count); \
sysfs_strtoul(name##_max_read_count, (var)->max_read_count); \
sysfs_strtoul(name##_max_write_count, (var)->max_write_count); \
- if (attr == &sysfs_##name##_max_keys) { \
- int v = strtoi_h_or_return(buf); \
- \
- v = clamp(v, 2, KEYLIST_MAX); \
- bch_scan_keylist_resize(&(var)->keys, v); \
- } \
} while (0)
#endif /* _BCACHE_MOVE_H */
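
bch_queue_full() becomes a lockless inline here: with count/read_count as atomics the
check no longer needs q->lock, and the sanity checks use EBUG_ON() so they are
debug-only. Roughly, the submission and completion paths pair up on the queue's
waitqueue like this (condensed from the new migrate.c and move.c code in this patch):

	/* submitter, e.g. bch_move_data_off_device() */
	if (bch_queue_full(queue)) {
		bch_btree_iter_unlock(&iter);
		wait_event(queue->wait, !bch_queue_full(queue));
	}

	/* completion, e.g. read_moving_endio() / moving_io_destructor() */
	atomic_dec(&q->read_count);
	wake_up(&q->wait);
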
diff --git a/drivers/md/bcache/move_types.h b/drivers/md/bcache/move_types.h
index d5e1a4a968fa..25f6d2fab592 100644
--- a/drivers/md/bcache/move_types.h
+++ b/drivers/md/bcache/move_types.h
@@ -8,7 +8,6 @@
struct moving_queue {
struct work_struct work;
- struct scan_keylist keys;
struct workqueue_struct *wq;
/* Configuration */
@@ -57,9 +56,11 @@ struct moving_queue {
*/
struct list_head write_pending;
- unsigned count;
- unsigned read_count;
- unsigned write_count;
+ atomic_t count;
+ atomic_t read_count;
+ atomic_t write_count;
+
+ wait_queue_head_t wait;
};
#endif /* _BCACHE_MOVE_TYPES_H */
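
The in-flight accounting that used to be plain integers under q->lock is now atomic,
with q->wait signalled whenever a counter drops. Condensed lifecycle of a moving_io
against these fields, as implemented by the move.c changes above:

	/*
	 * bch_data_move()          atomic_inc(&q->count), plus atomic_inc(&q->read_count)
	 *                          when the read is issued immediately (non-rotational)
	 * bch_queue_read()         atomic_inc(&q->read_count) (rotational path)
	 * read_moving_endio()      atomic_dec(&q->read_count); wake_up(&q->wait)
	 * bch_queue_write_work()   atomic_inc(&q->write_count)
	 * moving_io_destructor()   atomic_dec() of whichever read/write counts are still
	 *                          issued, atomic_dec(&q->count), wake_up(&q->wait)
	 */
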
diff --git a/drivers/md/bcache/movinggc.c b/drivers/md/bcache/movinggc.c
index a18340031368..76b92c4f1713 100644
--- a/drivers/md/bcache/movinggc.c
+++ b/drivers/md/bcache/movinggc.c
@@ -5,6 +5,7 @@
*/
#include "bcache.h"
+#include "btree_iter.h"
#include "buckets.h"
#include "clock.h"
#include "extents.h"
@@ -16,13 +17,12 @@
#include <trace/events/bcache.h>
#include <linux/freezer.h>
#include <linux/kthread.h>
+#include <linux/wait.h>
/* Moving GC - IO loop */
-static bool moving_pred(struct scan_keylist *kl, struct bkey_s_c k)
+static bool moving_pred(struct cache *ca, struct bkey_s_c k)
{
- struct cache *ca = container_of(kl, struct cache,
- moving_gc_queue.keys);
struct cache_set *c = ca->set;
const struct bch_extent_ptr *ptr;
bool ret = false;
@@ -41,11 +41,11 @@ static bool moving_pred(struct scan_keylist *kl, struct bkey_s_c k)
return ret;
}
-static int issue_moving_gc_move(struct moving_queue *q,
+static int issue_moving_gc_move(struct cache *ca,
struct moving_context *ctxt,
struct bkey_s_c k)
{
- struct cache *ca = container_of(q, struct cache, moving_gc_queue);
+ struct moving_queue *q = &ca->moving_gc_queue;
struct cache_set *c = ca->set;
const struct bch_extent_ptr *ptr;
struct moving_io *io;
@@ -57,7 +57,7 @@ static int issue_moving_gc_move(struct moving_queue *q,
goto found;
/* We raced - bucket's been reused */
- goto out;
+ return 0;
found:
gen--;
BUG_ON(gen > ARRAY_SIZE(ca->gc_buckets));
@@ -70,65 +70,53 @@ found:
trace_bcache_gc_copy(k.k);
- /*
- * IMPORTANT: We must call bch_data_move before we dequeue so
- * that the key can always be found in either the pending list
- * in the moving queue or in the scan keylist list in the
- * moving queue.
- * If we reorder, there is a window where a key is not found
- * by btree gc marking.
- */
bch_data_move(q, ctxt, io);
-out:
- bch_scan_keylist_dequeue(&q->keys);
return 0;
}
static void read_moving(struct cache *ca, struct moving_context *ctxt)
{
- struct bkey_i *k;
- bool again;
+ struct cache_set *c = ca->set;
+ struct btree_iter iter;
+ struct bkey_s_c k;
bch_ratelimit_reset(&ca->moving_gc_pd.rate);
+ bch_btree_iter_init(&iter, c, BTREE_ID_EXTENTS, POS_MIN);
+
+ while (!bch_moving_context_wait(ctxt) &&
+ (k = bch_btree_iter_peek(&iter)).k) {
+ if (!moving_pred(ca, k))
+ goto next;
+
+ if (bch_queue_full(&ca->moving_gc_queue)) {
+ bch_btree_iter_unlock(&iter);
+
+ if (ca->moving_gc_queue.rotational)
+ bch_queue_run(&ca->moving_gc_queue, ctxt);
+ else
+ wait_event(ca->moving_gc_queue.wait,
+ !bch_queue_full(&ca->moving_gc_queue));
+ continue;
+ }
+
+ if (issue_moving_gc_move(ca, ctxt, k)) {
+ bch_btree_iter_unlock(&iter);
- do {
- again = false;
-
- while (!bch_moving_context_wait(ctxt)) {
- if (bch_queue_full(&ca->moving_gc_queue)) {
- if (ca->moving_gc_queue.rotational) {
- again = true;
- break;
- } else {
- bch_moving_wait(ctxt);
- continue;
- }
- }
-
- k = bch_scan_keylist_next_rescan(
- ca->set,
- &ca->moving_gc_queue.keys,
- &ctxt->last_scanned,
- POS_MAX,
- moving_pred);
-
- if (k == NULL)
- break;
-
- if (issue_moving_gc_move(&ca->moving_gc_queue,
- ctxt, bkey_i_to_s_c(k))) {
- /*
- * Memory allocation failed; we will wait for
- * all queued moves to finish and continue
- * scanning starting from the same key
- */
- again = true;
- break;
- }
+ /* memory allocation failure, wait for IOs to finish */
+ bch_queue_run(&ca->moving_gc_queue, ctxt);
+ continue;
}
+next:
+ bch_btree_iter_advance_pos(&iter);
+ //bch_btree_iter_cond_resched(&iter);
+
+ /* unlock before calling moving_context_wait() */
+ bch_btree_iter_unlock(&iter);
+ cond_resched();
+ }
- bch_queue_run(&ca->moving_gc_queue, ctxt);
- } while (!kthread_should_stop() && again);
+ bch_btree_iter_unlock(&iter);
+ bch_queue_run(&ca->moving_gc_queue, ctxt);
}
static bool bch_moving_gc(struct cache *ca)
@@ -289,7 +277,6 @@ static int bch_moving_gc_thread(void *arg)
return 0;
}
-#define MOVING_GC_KEYS_MAX_SIZE DFLT_SCAN_KEYLIST_MAX_SIZE
#define MOVING_GC_NR 64
#define MOVING_GC_READ_NR 32
#define MOVING_GC_WRITE_NR 32
@@ -303,7 +290,6 @@ int bch_moving_init_cache(struct cache *ca)
return bch_queue_init(&ca->moving_gc_queue,
ca->set,
- MOVING_GC_KEYS_MAX_SIZE,
MOVING_GC_NR,
MOVING_GC_READ_NR,
MOVING_GC_WRITE_NR,
@@ -343,12 +329,6 @@ void bch_moving_gc_stop(struct cache *ca)
if (ca->moving_gc_read)
kthread_stop(ca->moving_gc_read);
ca->moving_gc_read = NULL;
-
- /*
- * Make sure that it is empty so that gc marking doesn't keep
- * marking stale entries from when last used.
- */
- bch_scan_keylist_reset(&ca->moving_gc_queue.keys);
}
void bch_moving_gc_destroy(struct cache *ca)
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index 6275f708e555..309a720c5a80 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -1074,8 +1074,6 @@ static struct cache_set *bch_cache_set_alloc(struct cache_sb *sb,
init_rwsem(&c->gc_lock);
mutex_init(&c->trigger_gc_lock);
- mutex_init(&c->gc_scan_keylist_lock);
- INIT_LIST_HEAD(&c->gc_scan_keylists);
#define BCH_TIME_STAT(name, frequency_units, duration_units) \
spin_lock_init(&c->name##_time.lock);
@@ -1981,12 +1979,6 @@ static const char *cache_alloc(struct bcache_superblock *sb,
ca->tiering_write_point.reserve = RESERVE_NONE;
ca->tiering_write_point.group = &ca->self;
- /* XXX: scan keylists will die */
- bch_scan_keylist_init(&ca->moving_gc_queue.keys, c,
- DFLT_SCAN_KEYLIST_MAX_SIZE);
- bch_scan_keylist_init(&ca->tiering_queue.keys, c,
- DFLT_SCAN_KEYLIST_MAX_SIZE);
-
kobject_get(&c->kobj);
ca->set = c;
diff --git a/drivers/md/bcache/tier.c b/drivers/md/bcache/tier.c
index 02ceb586239d..069e68a38a4a 100644
--- a/drivers/md/bcache/tier.c
+++ b/drivers/md/bcache/tier.c
@@ -15,15 +15,19 @@
#include <linux/kthread.h>
#include <trace/events/bcache.h>
-/**
- * tiering_pred - check if tiering should copy an extent to tier 1
- */
-static bool tiering_pred(struct scan_keylist *kl, struct bkey_s_c k)
-{
- struct cache *ca = container_of(kl, struct cache,
- tiering_queue.keys);
- struct cache_set *c = ca->set;
+struct tiering_state {
+ struct cache_group *tier;
+ unsigned tier_idx;
+ unsigned sectors;
+ unsigned stripe_size;
+ unsigned dev_idx;
+ struct cache *ca;
+};
+static bool tiering_pred(struct cache_set *c,
+ struct tiering_state *s,
+ struct bkey_s_c k)
+{
if (bkey_extent_is_data(k.k)) {
struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
const struct bch_extent_ptr *ptr;
@@ -38,7 +42,7 @@ static bool tiering_pred(struct scan_keylist *kl, struct bkey_s_c k)
mi = cache_member_info_get(c);
extent_for_each_ptr(e, ptr)
if (ptr->dev < mi->nr_in_set &&
- mi->m[ptr->dev].tier)
+ mi->m[ptr->dev].tier >= s->tier_idx)
replicas++;
cache_member_info_put();
@@ -48,196 +52,54 @@ static bool tiering_pred(struct scan_keylist *kl, struct bkey_s_c k)
return false;
}
-struct tiering_refill {
- struct bpos start;
- struct cache *ca;
- int cache_iter;
- u64 sectors;
-};
-
-static void refill_done(struct tiering_refill *refill)
+static void tier_put_device(struct tiering_state *s)
{
- if (refill->ca) {
- percpu_ref_put(&refill->ca->ref);
- refill->ca = NULL;
- }
+ if (s->ca)
+ percpu_ref_put(&s->ca->ref);
+ s->ca = NULL;
}
/**
* refill_next - move on to refilling the next cache's tiering keylist
*/
-static void refill_next(struct cache_set *c, struct tiering_refill *refill)
-{
- struct cache_group *tier;
-
- refill_done(refill);
-
- rcu_read_lock();
- tier = &c->cache_tiers[1];
- if (tier->nr_devices == 0)
- goto out;
-
- while (1) {
- while (refill->cache_iter < tier->nr_devices) {
- refill->ca = rcu_dereference(
- tier->d[refill->cache_iter].dev);
- if (refill->ca != NULL) {
- percpu_ref_get(&refill->ca->ref);
- goto out;
- }
- refill->cache_iter++;
- }
-
- /* Reached the end, wrap around */
- refill->cache_iter = 0;
- }
-
-out:
- rcu_read_unlock();
-}
-
-/*
- * refill_init - Start refilling a random cache device -- this ensures we
- * distribute data sanely even if each tiering pass discovers only a few
- * keys to tier
- */
-static void refill_init(struct cache_set *c, struct tiering_refill *refill)
-{
- struct cache_group *tier;
-
- memset(refill, 0, sizeof(*refill));
- refill->start = POS_MIN;
-
- rcu_read_lock();
- tier = &c->cache_tiers[1];
- if (tier->nr_devices != 0)
- refill->cache_iter = bch_rand_range(tier->nr_devices);
- rcu_read_unlock();
-
- refill_next(c, refill);
-}
-
-/**
- * tiering_keylist_full - we accumulate tiering_stripe_size sectors in a cache
- * device's tiering keylist before we move on to the next cache device
- */
-static bool tiering_keylist_full(struct tiering_refill *refill)
+static void tier_next_device(struct cache_set *c, struct tiering_state *s)
{
- return (refill->sectors >= refill->ca->tiering_stripe_size);
-}
-
-/**
- * tiering_keylist_empty - to prevent a keylist from growing to more than twice
- * the tiering stripe size, we stop refill when a keylist has more than a single
- * stripe of sectors
- */
-static bool tiering_keylist_empty(struct cache *ca)
-{
- return (bch_scan_keylist_sectors(&ca->tiering_queue.keys)
- <= ca->tiering_stripe_size);
-}
-
-/**
- * tiering_refill - to keep all queues busy as much as possible, we add
- * up to a single stripe of sectors to each cache device's queue, iterating
- * over all cache devices twice, so each one has two stripe's of writes
- * queued up, before we have to wait for move operations to complete.
- */
-static void tiering_refill(struct cache_set *c, struct tiering_refill *refill)
-{
- struct scan_keylist *keys;
- struct btree_iter iter;
- struct bkey_s_c k;
-
- if (bkey_cmp(refill->start, POS_MAX) >= 0)
- return;
-
- if (refill->ca == NULL)
- return;
-
- if (!tiering_keylist_empty(refill->ca))
- return;
-
- trace_bcache_tiering_refill_start(c);
-
- for_each_btree_key(&iter, c, BTREE_ID_EXTENTS, refill->start, k) {
- BKEY_PADDED(k) tmp;
-
- keys = &refill->ca->tiering_queue.keys;
-
- if (!tiering_pred(keys, k)) {
- refill->start = k.k->p;
- goto next;
- }
-
- bkey_reassemble(&tmp.k, k);
- bch_cut_front(refill->start, &tmp.k);
-
- /* Growing the keylist might fail */
- if (bch_scan_keylist_add(keys, bkey_i_to_s_c(&tmp.k)))
- goto done;
-
- /* TODO: split key if refill->sectors is now > stripe_size */
- refill->sectors += tmp.k.k.size;
- refill->start = tmp.k.k.p;
-
- /* Check if we've added enough keys to this keylist */
- if (tiering_keylist_full(refill)) {
- /* Move on to refill the next cache device's keylist */
- refill->sectors = 0;
- refill->cache_iter++;
- refill_next(c, refill);
-
- /* All cache devices got removed somehow */
- if (refill->ca == NULL)
- goto done;
-
- /*
- * If the next cache's keylist is not sufficiently
- * empty, wait for it to drain before refilling
- * anything. We prioritize even distribution of data
- * over maximizing write bandwidth.
- */
- if (!tiering_keylist_empty(refill->ca))
- goto done;
+ if (!s->ca || s->sectors > s->stripe_size) {
+ tier_put_device(s);
+ s->sectors = 0;
+ s->dev_idx++;
+
+ spin_lock(&s->tier->lock);
+ if (s->dev_idx >= s->tier->nr_devices)
+ s->dev_idx = 0;
+
+ if (s->tier->nr_devices) {
+ s->ca = s->tier->d[s->dev_idx].dev;
+ percpu_ref_get(&s->ca->ref);
}
-next:
- bch_btree_iter_cond_resched(&iter);
+ spin_unlock(&s->tier->lock);
}
- /* Reached the end of the keyspace */
- refill->start = POS_MAX;
-done:
- bch_btree_iter_unlock(&iter);
-
- trace_bcache_tiering_refill_end(c);
}
-static int issue_tiering_move(struct moving_queue *q,
+static int issue_tiering_move(struct cache_set *c,
+ struct tiering_state *s,
struct moving_context *ctxt,
struct bkey_s_c k)
{
- struct cache *ca = container_of(q, struct cache, tiering_queue);
- struct cache_set *c = ca->set;
struct moving_io *io;
- io = moving_io_alloc(c, q, &ca->tiering_write_point, k, NULL);
+ io = moving_io_alloc(c,
+ &s->ca->tiering_queue,
+ &s->ca->tiering_write_point,
+ k, NULL);
if (!io) {
trace_bcache_tiering_alloc_fail(c, k.k->size);
return -ENOMEM;
}
trace_bcache_tiering_copy(k.k);
-
- /*
- * IMPORTANT: We must call bch_data_move before we dequeue so
- * that the key can always be found in either the pending list
- * in the moving queue or in the scan keylist list in the
- * moving queue.
- * If we reorder, there is a window where a key is not found
- * by btree gc marking.
- */
- bch_data_move(q, ctxt, io);
- bch_scan_keylist_dequeue(&q->keys);
+ bch_data_move(&s->ca->tiering_queue, ctxt, io);
+ s->sectors += k.k->size;
return 0;
}
@@ -245,124 +107,80 @@ static int issue_tiering_move(struct moving_queue *q,
* tiering_next_cache - issue a move to write an extent to the next cache
* device in round robin order
*/
-static int tiering_next_cache(struct cache_set *c,
- int *cache_iter,
- struct moving_context *ctxt,
- struct tiering_refill *refill)
-{
- struct cache_group *tier;
- int start = *cache_iter;
- struct cache *ca;
-
- /* If true at the end of the loop, all keylists were empty, so we
- * have reached the end of the keyspace */
- bool done = true;
- /* If true at the end of the loop, all queues were full, so we must
- * wait for some ops to finish */
- bool full = true;
-
- do {
- rcu_read_lock();
- tier = &c->cache_tiers[1];
- if (tier->nr_devices == 0) {
- rcu_read_unlock();
- return 0;
- }
-
- if (*cache_iter >= tier->nr_devices) {
- rcu_read_unlock();
- *cache_iter = 0;
- continue;
- }
-
- ca = rcu_dereference(tier->d[*cache_iter].dev);
- if (ca == NULL ||
- ca->mi.state != CACHE_ACTIVE ||
- ca->tiering_queue.stopped) {
- rcu_read_unlock();
- (*cache_iter)++;
- continue;
- }
-
- percpu_ref_get(&ca->ref);
- rcu_read_unlock();
- (*cache_iter)++;
-
- tiering_refill(c, refill);
-
- if (bch_queue_full(&ca->tiering_queue)) {
- done = false;
- } else {
- struct bkey_i *k =
- bch_scan_keylist_next(&ca->tiering_queue.keys);
- if (k) {
- issue_tiering_move(&ca->tiering_queue, ctxt,
- bkey_i_to_s_c(k));
- done = false;
- full = false;
- }
- }
-
- percpu_ref_put(&ca->ref);
- } while (*cache_iter != start);
-
- if (done) {
- /*
- * All devices have an empty keylist now, just wait for
- * pending moves to finish and we're done.
- */
- return 0;
- } else if (full) {
- /*
- * No device with keys still remaining on its keylist has a
- * queue that is not full. In this case, we have to wait for
- * at least one read to complete before trying again.
- * Otherwise, we could issue a read for this device.
- */
- return -EAGAIN;
- } else {
- /* Try again immediately */
- return -EIOCBQUEUED;
- }
-}
-
-static u64 read_tiering(struct cache_set *c)
+static s64 read_tiering(struct cache_set *c, struct cache_group *tier)
{
struct moving_context ctxt;
- struct tiering_refill refill;
- int cache_iter = 0;
+ struct tiering_state s;
+ struct btree_iter iter;
+ struct bkey_s_c k;
int ret;
trace_bcache_tiering_start(c);
- refill_init(c, &refill);
+ memset(&s, 0, sizeof(s));
+ s.tier = tier;
+ s.tier_idx = tier - c->cache_tiers;
+ s.stripe_size = 2048; /* 1 mb for now */
bch_moving_context_init(&ctxt, &c->tiering_pd.rate,
MOVING_PURPOSE_TIERING);
+ bch_btree_iter_init(&iter, c, BTREE_ID_EXTENTS, POS_MIN);
- while (!bch_moving_context_wait(&ctxt)) {
- cond_resched();
+ while (!bch_moving_context_wait(&ctxt) &&
+ (k = bch_btree_iter_peek(&iter)).k) {
+ if (!tiering_pred(c, &s, k))
+ goto next;
- ret = tiering_next_cache(c, &cache_iter, &ctxt, &refill);
- if (ret == -EAGAIN)
- bch_moving_wait(&ctxt);
- else if (!ret)
+ tier_next_device(c, &s);
+ if (!s.ca)
break;
+
+ if (bch_queue_full(&s.ca->tiering_queue)) {
+ bch_btree_iter_unlock(&iter);
+
+ if (s.ca->tiering_queue.rotational)
+ bch_queue_run(&s.ca->tiering_queue, &ctxt);
+ else
+ wait_event(s.ca->tiering_queue.wait,
+ !bch_queue_full(&s.ca->tiering_queue));
+ continue;
+ }
+
+ ret = issue_tiering_move(c, &s, &ctxt, k);
+ if (ret) {
+ bch_btree_iter_unlock(&iter);
+
+ /* memory allocation failure, wait for IOs to finish */
+
+ /*
+ * XXX: this only waits for IOs issued to this
+ * particular device, but there may not be any outstanding
+ * to this device
+ */
+ bch_queue_run(&s.ca->tiering_queue, &ctxt);
+ continue;
+ }
+next:
+ bch_btree_iter_advance_pos(&iter);
+ //bch_btree_iter_cond_resched(&iter);
+
+ /* unlock before calling moving_context_wait() */
+ bch_btree_iter_unlock(&iter);
+ cond_resched();
}
+ bch_btree_iter_unlock(&iter);
+ tier_put_device(&s);
closure_sync(&ctxt.cl);
- refill_done(&refill);
-
trace_bcache_tiering_end(c, ctxt.sectors_moved, ctxt.keys_moved);
- //pr_info("pred y %llu pred n %llu", c->tiering_pred_y, c->tiering_pred_n);
-
return ctxt.sectors_moved;
}
static int bch_tiering_thread(void *arg)
{
struct cache_set *c = arg;
+ struct cache_group *tier = &c->cache_tiers[1];
struct io_clock *clock = &c->io_clock[WRITE];
struct cache *ca;
u64 tier_capacity, available_sectors;
@@ -373,20 +191,26 @@ static int bch_tiering_thread(void *arg)
while (!kthread_should_stop()) {
if (kthread_wait_freezable(c->tiering_enabled &&
- c->cache_tiers[1].nr_devices))
+ tier->nr_devices))
break;
while (1) {
+ struct cache_group *faster_tier;
+
last = atomic_long_read(&clock->now);
tier_capacity = available_sectors = 0;
rcu_read_lock();
- group_for_each_cache_rcu(ca, &c->cache_tiers[0], i) {
- tier_capacity +=
- (ca->mi.nbuckets -
- ca->mi.first_bucket) << ca->bucket_bits;
- available_sectors +=
- buckets_available_cache(ca) << ca->bucket_bits;
+ for (faster_tier = c->cache_tiers;
+ faster_tier != tier;
+ faster_tier++) {
+ group_for_each_cache_rcu(ca, faster_tier, i) {
+ tier_capacity +=
+ (ca->mi.nbuckets -
+ ca->mi.first_bucket) << ca->bucket_bits;
+ available_sectors +=
+ buckets_available_cache(ca) << ca->bucket_bits;
+ }
}
rcu_read_unlock();
@@ -401,13 +225,12 @@ static int bch_tiering_thread(void *arg)
return 0;
}
- read_tiering(c);
+ read_tiering(c, tier);
}
return 0;
}
-#define TIERING_KEYS_MAX_SIZE DFLT_SCAN_KEYLIST_MAX_SIZE
#define TIERING_NR 64
#define TIERING_READ_NR 8
#define TIERING_WRITE_NR 32
@@ -423,7 +246,6 @@ int bch_tiering_init_cache(struct cache *ca)
return bch_queue_init(&ca->tiering_queue,
ca->set,
- TIERING_KEYS_MAX_SIZE,
TIERING_NR,
TIERING_READ_NR,
TIERING_WRITE_NR,
@@ -458,12 +280,6 @@ void bch_tiering_write_destroy(struct cache *ca)
void bch_tiering_write_stop(struct cache *ca)
{
bch_queue_stop(&ca->tiering_queue);
-
- /*
- * Make sure that it is empty so that gc marking doesn't keep
- * marking stale entries from when last used.
- */
- bch_scan_keylist_reset(&ca->tiering_queue.keys);
}
void bch_tiering_read_stop(struct cache_set *c)
diff --git a/include/trace/events/bcache.h b/include/trace/events/bcache.h
index f4153a5c962d..73b80c8648e4 100644
--- a/include/trace/events/bcache.h
+++ b/include/trace/events/bcache.h
@@ -1080,9 +1080,9 @@ DECLARE_EVENT_CLASS(moving_io,
__entry->inode = k->p.inode;
__entry->offset = k->p.offset;
__entry->sectors = k->size;
- __entry->count = q->count;
- __entry->read_count = q->read_count;
- __entry->write_count = q->write_count;
+ __entry->count = atomic_read(&q->count);
+ __entry->read_count = atomic_read(&q->read_count);
+ __entry->write_count = atomic_read(&q->write_count);
),
TP_printk("%p %u:%llu sectors %u queue %u reads %u writes %u",