author    | Kent Overstreet <kent.overstreet@gmail.com> | 2016-10-04 06:16:50 -0800
committer | Kent Overstreet <kent.overstreet@gmail.com> | 2017-01-18 21:40:33 -0900
commit    | 40ff276e09f62f09fb892cc183766b88376783bb (patch)
tree      | 2e0d68f8223bcc0387aba1a92d6357740a0c0827
parent    | b9aadd59175a6475eb2e8b5e21bd38bb9454b099 (diff)
bcache: scan keylists get to die in a fire
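This patch removes the scan_keylist machinery: moving GC, tiering, and device data migration now walk the extents btree directly with a btree iterator, the moving_queue counters become atomics paired with a waitqueue so submitters block until the queue drains (replacing bch_moving_wait()/bch_moving_notify()), and btree GC no longer has to mark keys parked on scan keylists. Below is a minimal sketch of the resulting loop shape, condensed from the bch_move_data_off_device() hunk in this diff; pred() and issue_move() are hypothetical stand-ins for the per-caller helpers (migrate_data_pred()/issue_migration_move(), moving_pred(), tiering_pred()), and the in-tree declarations are assumed.

```c
/*
 * Sketch of the iteration pattern this patch switches to: instead of
 * refilling and draining a scan_keylist, walk the extents btree directly
 * and throttle on the moving_queue's waitqueue.  Simplified from
 * bch_move_data_off_device(); issue_move() and pred() are stand-ins for
 * the per-caller helpers.
 */
static int move_matching_extents(struct cache_set *c, struct moving_queue *q,
				 struct moving_context *ctxt,
				 bool (*pred)(struct bkey_s_c))
{
	struct btree_iter iter;
	struct bkey_s_c k;
	int ret = 0;

	bch_btree_iter_init(&iter, c, BTREE_ID_EXTENTS, POS_MIN);

	while ((k = bch_btree_iter_peek(&iter)).k) {
		if (!pred(k))
			goto next;

		if (bch_queue_full(q)) {
			/* Drop btree locks before blocking on the queue */
			bch_btree_iter_unlock(&iter);

			if (q->rotational)
				/* issue the sorted reads we've queued so far */
				bch_queue_run(q, ctxt);
			else
				wait_event(q->wait, !bch_queue_full(q));
			continue;
		}

		/* hypothetical stand-in for issue_migration_move() etc. */
		ret = issue_move(q, ctxt, k);
		if (ret)
			break;
next:
		bch_btree_iter_advance_pos(&iter);
		bch_btree_iter_cond_resched(&iter);
	}

	bch_btree_iter_unlock(&iter);
	bch_queue_run(q, ctxt);
	return ret;
}
```

The error handling in the real hunks is more involved (retrying on -ENOMEM after draining the queue, multiple passes over the keyspace), but the structure above is the common shape shared by migration, moving GC, and tiering after this change.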
-rw-r--r-- | drivers/md/bcache/alloc_types.h   |   2
-rw-r--r-- | drivers/md/bcache/bcache.h        |   4
-rw-r--r-- | drivers/md/bcache/btree_gc.c      |  18
-rw-r--r-- | drivers/md/bcache/keylist.c       | 195
-rw-r--r-- | drivers/md/bcache/keylist.h       |  45
-rw-r--r-- | drivers/md/bcache/keylist_types.h |  45
-rw-r--r-- | drivers/md/bcache/migrate.c       | 250
-rw-r--r-- | drivers/md/bcache/move.c          | 176
-rw-r--r-- | drivers/md/bcache/move.h          |  40
-rw-r--r-- | drivers/md/bcache/move_types.h    |   9
-rw-r--r-- | drivers/md/bcache/movinggc.c      | 104
-rw-r--r-- | drivers/md/bcache/super.c         |   8
-rw-r--r-- | drivers/md/bcache/tier.c          | 392
-rw-r--r-- | include/trace/events/bcache.h     |   6
14 files changed, 328 insertions, 966 deletions
diff --git a/drivers/md/bcache/alloc_types.h b/drivers/md/bcache/alloc_types.h index 0bfafbf8c91f..aa3c81268817 100644 --- a/drivers/md/bcache/alloc_types.h +++ b/drivers/md/bcache/alloc_types.h @@ -55,7 +55,7 @@ struct cache_group { unsigned cur_device; struct { u64 weight; - struct cache __rcu *dev; + struct cache *dev; } d[MAX_CACHES_PER_SET]; }; diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index 1507fe4d5047..cbb2dca3257a 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -686,10 +686,6 @@ struct cache_set { struct task_struct *gc_thread; atomic_t kick_gc; - /* This is a list of scan_keylists for btree GC to scan */ - struct list_head gc_scan_keylists; - struct mutex gc_scan_keylist_lock; - /* * Tracks GC's progress - everything in the range [ZERO_KEY..gc_cur_pos] * has been marked by GC. diff --git a/drivers/md/bcache/btree_gc.c b/drivers/md/bcache/btree_gc.c index 4db9e54b3751..8bd6070cfee0 100644 --- a/drivers/md/bcache/btree_gc.c +++ b/drivers/md/bcache/btree_gc.c @@ -308,23 +308,6 @@ static void bch_mark_pending_btree_node_frees(struct cache_set *c) mutex_unlock(&c->btree_interior_update_lock); } -static void bch_mark_scan_keylists(struct cache_set *c) -{ - struct scan_keylist *kl; - - mutex_lock(&c->gc_scan_keylist_lock); - - /* What the goddamn fuck? */ - list_for_each_entry(kl, &c->gc_scan_keylists, mark_list) { - if (kl->owner == NULL) - bch_keylist_recalc_oldest_gens(c, kl); - else - bch_queue_recalc_oldest_gens(c, kl->owner); - } - - mutex_unlock(&c->gc_scan_keylist_lock); -} - /** * bch_gc - recompute bucket marks and oldest_gen, rewrite btree nodes */ @@ -426,7 +409,6 @@ void bch_gc(struct cache_set *c) bch_mark_metadata(c); bch_mark_pending_btree_node_frees(c); bch_writeback_recalc_oldest_gens(c); - bch_mark_scan_keylists(c); for_each_cache(ca, c, i) atomic_long_set(&ca->saturated_count, 0); diff --git a/drivers/md/bcache/keylist.c b/drivers/md/bcache/keylist.c index 4f9d1865abdc..b7af04e5adfa 100644 --- a/drivers/md/bcache/keylist.c +++ b/drivers/md/bcache/keylist.c @@ -118,198 +118,3 @@ void bch_keylist_add_in_order(struct keylist *l, struct bkey_i *insert) BUG_ON(l->top_p > l->end_keys_p); bkey_copy(where, insert); } - -/* Scan keylists simple utilities */ - -void bch_scan_keylist_init(struct scan_keylist *kl, - struct cache_set *c, - unsigned max_size) - -{ - kl->c = c; - kl->owner = NULL; - - mutex_init(&kl->lock); - kl->max_size = max_size; - bch_keylist_init(&kl->list, NULL, 0); - - /* - * Order of initialization is tricky, and this makes sure that - * we have a valid cache set in case the order of - * initialization chages and breaks things. - */ - BUG_ON(c == NULL); - mutex_lock(&c->gc_scan_keylist_lock); - list_add_tail(&kl->mark_list, &c->gc_scan_keylists); - mutex_unlock(&c->gc_scan_keylist_lock); -} - -void bch_scan_keylist_destroy(struct scan_keylist *kl) -{ - if (kl->c) { - mutex_lock(&kl->c->gc_scan_keylist_lock); - list_del(&kl->mark_list); - mutex_unlock(&kl->c->gc_scan_keylist_lock); - } - - mutex_lock(&kl->lock); - bch_keylist_free(&kl->list); - mutex_unlock(&kl->lock); -} - -void bch_scan_keylist_reset(struct scan_keylist *kl) -{ - mutex_lock(&kl->lock); - kl->list.bot_p = kl->list.top_p = kl->list.start_keys_p; - mutex_unlock(&kl->lock); -} - -/* - * This should only be called from sysfs, and holding a lock that prevents - * re-entrancy. 
- */ -void bch_scan_keylist_resize(struct scan_keylist *kl, - unsigned max_size) -{ - mutex_lock(&kl->lock); - kl->max_size = max_size; /* May be smaller than current size */ - mutex_unlock(&kl->lock); -} - -/** - * bch_keylist_recalc_oldest_gens - update oldest_gen pointers from keylist keys - * - * This prevents us from wrapping around gens for a bucket only referenced from - * the tiering or moving GC keylists. We don't actually care that the data in - * those buckets is marked live, only that we don't wrap the gens. - * - * Note: This interlocks with insertions, but not all dequeues interlock. - * The particular case in which dequeues don't interlock is when a - * scan list used by the copy offload ioctls is used as a plain - * keylist for btree insertion. - * The btree insertion code doesn't go through - * bch_scan_keylist_dequeue below, and instead uses plain - * bch_keylist_dequeue. The other pointers (top, start, end) are - * unchanged in this case. - * A little care with the bottomp pointer suffices in this case. - * Of course, we may end up marking stuff that we don't need to mark, - * but was recently valid and we have likely just inserted in the tree - * anyway. - */ -void bch_keylist_recalc_oldest_gens(struct cache_set *c, - struct scan_keylist *kl) -{ - struct bkey_i *k; - - mutex_lock(&kl->lock); - - for_each_keylist_key(&kl->list, k) - bch_btree_key_recalc_oldest_gen(c, bkey_i_to_s_c(k)); - - mutex_unlock(&kl->lock); -} - -int bch_scan_keylist_add(struct scan_keylist *kl, struct bkey_s_c k) -{ - int ret; - - mutex_lock(&kl->lock); - ret = bch_keylist_realloc_max(&kl->list, - k.k->u64s, - kl->max_size); - - if (!ret) { - bkey_reassemble(kl->list.top, k); - bch_keylist_enqueue(&kl->list); - atomic64_add(k.k->size, &kl->sectors); - } - mutex_unlock(&kl->lock); - - return ret; -} - -/* Actual scanning functionality of scan_keylists */ - -static void bch_refill_scan_keylist(struct cache_set *c, - struct scan_keylist *kl, - struct bpos *last_scanned, - struct bpos end, - scan_keylist_pred_fn *pred) -{ - struct bpos start = *last_scanned; - struct btree_iter iter; - struct bkey_s_c k; - unsigned nr_found = 0; - - for_each_btree_key(&iter, c, BTREE_ID_EXTENTS, *last_scanned, k) { - if (bkey_cmp(k.k->p, end) >= 0) { - *last_scanned = k.k->p; - goto done; - } - - if (pred(kl, k)) { - BKEY_PADDED(k) tmp; - - bkey_reassemble(&tmp.k, k); - bch_cut_front(*last_scanned, &tmp.k); - - if (bch_scan_keylist_add(kl, bkey_i_to_s_c(&tmp.k))) - goto done; - - nr_found++; - } - - *last_scanned = k.k->p; - bch_btree_iter_cond_resched(&iter); - } - - /* If we end up here, it means: - * - the map_fn didn't fill up the keybuf - * - the map_fn didn't see the end key - * - there were no more keys to map over - * Therefore, we are at the end of the key space */ - *last_scanned = POS_MAX; -done: - bch_btree_iter_unlock(&iter); - - trace_bcache_keyscan(nr_found, - start.inode, start.offset, - last_scanned->inode, - last_scanned->offset); -} - -struct bkey_i *bch_scan_keylist_next(struct scan_keylist *kl) -{ - if (bch_keylist_empty(&kl->list)) - return NULL; - - return bch_keylist_front(&kl->list); -} - -struct bkey_i *bch_scan_keylist_next_rescan(struct cache_set *c, - struct scan_keylist *kl, - struct bpos *last_scanned, - struct bpos end, - scan_keylist_pred_fn *pred) -{ - if (bch_keylist_empty(&kl->list)) { - if (bkey_cmp(*last_scanned, end) >= 0) - return NULL; - - bch_refill_scan_keylist(c, kl, last_scanned, end, pred); - } - - return bch_scan_keylist_next(kl); -} - -void 
bch_scan_keylist_dequeue(struct scan_keylist *kl) -{ - u64 sectors; - - mutex_lock(&kl->lock); - sectors = kl->list.bot->k.size; - bch_keylist_dequeue(&kl->list); - mutex_unlock(&kl->lock); - - BUG_ON(atomic64_sub_return(sectors, &kl->sectors) < 0); -} diff --git a/drivers/md/bcache/keylist.h b/drivers/md/bcache/keylist.h index 028552757527..990ba72ee60c 100644 --- a/drivers/md/bcache/keylist.h +++ b/drivers/md/bcache/keylist.h @@ -116,49 +116,4 @@ void bch_keylist_add_in_order(struct keylist *, struct bkey_i *); int bch_keylist_realloc(struct keylist *, unsigned); int bch_keylist_realloc_max(struct keylist *, unsigned, unsigned); -void bch_scan_keylist_init(struct scan_keylist *kl, - struct cache_set *c, - unsigned max_size); - -void bch_scan_keylist_reset(struct scan_keylist *kl); - -/* The keylist is dynamically adjusted. This just clamps the maxima */ - -static inline unsigned bch_scan_keylist_size(struct scan_keylist *kl) -{ - return kl->max_size; -} - -static inline u64 bch_scan_keylist_sectors(struct scan_keylist *kl) -{ - return atomic64_read(&kl->sectors); -} - -void bch_scan_keylist_resize(struct scan_keylist *kl, - unsigned max_size); - -void bch_scan_keylist_destroy(struct scan_keylist *kl); - -/* - * IMPORTANT: The caller of bch_scan_keylist_next or - * bch_scan_keylist_next_rescan needs to copy any - * non-null return value before calling either again! - * These functions return a pointer into the internal structure. - * Furthermore, they need to call bch_scan_keylist_advance after - * copying the structure. - */ - -struct bkey_i *bch_scan_keylist_next(struct scan_keylist *); - -struct bkey_i *bch_scan_keylist_next_rescan(struct cache_set *c, - struct scan_keylist *kl, - struct bpos *last_scanned, - struct bpos end, - scan_keylist_pred_fn *pred); - -int bch_scan_keylist_add(struct scan_keylist *, struct bkey_s_c); -void bch_scan_keylist_dequeue(struct scan_keylist *); - -void bch_keylist_recalc_oldest_gens(struct cache_set *, struct scan_keylist *); - #endif /* _BCACHE_KEYLIST_H */ diff --git a/drivers/md/bcache/keylist_types.h b/drivers/md/bcache/keylist_types.h index 569cdc2480e2..156fbe0745fd 100644 --- a/drivers/md/bcache/keylist_types.h +++ b/drivers/md/bcache/keylist_types.h @@ -48,49 +48,4 @@ struct keylist { bool has_buf; }; -/* - * scan_keylists are conceptually similar to keybufs, but they don't - * have an internal RB tree. - * keybufs should be used when read or write operations need to - * examine keys in flight, as for writeback. - * But for moving operations (moving gc, tiering, moving data off - * devices), read and writes don't need to look at all, so we don't - * need the RB tree and use scan_keylists instead. - * - * Note that unlike keybufs, they don't contain a semaphore to limit - * bios. That must be done externally, if necessary. - */ - -#define DFLT_SCAN_KEYLIST_MAX_SIZE 512 - -struct scan_keylist { - struct list_head mark_list; /* For GC marking */ - - struct cache_set *c; /* For destroying */ - - /* - * Only one thread is allowed to mutate the keylist. Other threads can - * read it. The mutex has to be taken by the mutator thread when - * mutating the keylist, and by other threads when reading, but not by - * the mutator thread when reading. - */ - struct mutex lock; - /* - * Maximum size, in u64s. The keylist will not grow beyond this size. - */ - unsigned max_size; - /* - * Number of sectors in keys currently on the keylist. - */ - atomic64_t sectors; - /* - * The underlying keylist. 
- */ - struct keylist list; - - struct moving_queue *owner; -}; - -typedef bool (scan_keylist_pred_fn)(struct scan_keylist *, struct bkey_s_c); - #endif /* _BCACHE_KEYLIST_TYPES_H */ diff --git a/drivers/md/bcache/migrate.c b/drivers/md/bcache/migrate.c index 1169f8fc28a1..46d05e32b2f6 100644 --- a/drivers/md/bcache/migrate.c +++ b/drivers/md/bcache/migrate.c @@ -12,20 +12,9 @@ #include "migrate.h" #include "move.h" -static bool migrate_data_pred(struct scan_keylist *kl, struct bkey_s_c k) -{ - struct cache *ca = container_of(kl, struct cache, - moving_gc_queue.keys); - - return bkey_extent_is_data(k.k) && - bch_extent_has_device(bkey_s_c_to_extent(k), - ca->sb.nr_this_dev); -} - static int issue_migration_move(struct cache *ca, struct moving_context *ctxt, - struct bkey_s_c k, - u64 *seen_key_count) + struct bkey_s_c k) { struct moving_queue *q = &ca->moving_gc_queue; struct cache_set *c = ca->set; @@ -49,24 +38,10 @@ found: } bch_data_move(q, ctxt, io); - (*seen_key_count)++; - - /* - * IMPORTANT: We must call bch_data_move before we dequeue so - * that the key can always be found in either the pending list - * in the moving queue or in the scan keylist list in the - * moving queue. - * If we reorder, there is a window where a key is not found - * by btree gc marking. - */ - bch_scan_keylist_dequeue(&q->keys); return 0; } -#define MIGRATION_DEBUG 0 - #define MAX_DATA_OFF_ITER 10 -#define PASS_LOW_LIMIT (MIGRATION_DEBUG ? 0 : 2) #define MIGRATE_NR 64 #define MIGRATE_READ_NR 32 #define MIGRATE_WRITE_NR 32 @@ -87,15 +62,14 @@ found: int bch_move_data_off_device(struct cache *ca) { - int ret; - struct bkey_i *k; - unsigned pass; - u64 seen_key_count; - unsigned last_error_count; - unsigned last_error_flags; - struct moving_context context; + struct moving_context ctxt; struct cache_set *c = ca->set; struct moving_queue *queue = &ca->moving_gc_queue; + unsigned pass = 0; + u64 seen_key_count; + int ret = 0; + + BUG_ON(ca->mi.state == CACHE_ACTIVE); /* * This reuses the moving gc queue as it is no longer in use @@ -116,8 +90,8 @@ int bch_move_data_off_device(struct cache *ca) queue_io_resize(queue, MIGRATE_NR, MIGRATE_READ_NR, MIGRATE_WRITE_NR); BUG_ON(queue->wq == NULL); - bch_moving_context_init(&context, NULL, MOVING_PURPOSE_MIGRATION); - context.avoid = ca; + bch_moving_context_init(&ctxt, NULL, MOVING_PURPOSE_MIGRATION); + ctxt.avoid = ca; /* * In theory, only one pass should be necessary as we've @@ -136,142 +110,128 @@ int bch_move_data_off_device(struct cache *ca) * but that can be viewed as a verification pass. 
*/ - seen_key_count = 1; - last_error_count = 0; - last_error_flags = 0; - - for (pass = 0; - (seen_key_count != 0 && (pass < MAX_DATA_OFF_ITER)); - pass++) { - bool again; + do { + struct btree_iter iter; + struct bkey_s_c k; seen_key_count = 0; - atomic_set(&context.error_count, 0); - atomic_set(&context.error_flags, 0); - context.last_scanned = POS_MIN; + atomic_set(&ctxt.error_count, 0); + atomic_set(&ctxt.error_flags, 0); -again: - again = false; + bch_btree_iter_init(&iter, c, BTREE_ID_EXTENTS, POS_MIN); + + while ((k = bch_btree_iter_peek(&iter)).k) { + if (!bkey_extent_is_data(k.k) || + !bch_extent_has_device(bkey_s_c_to_extent(k), + ca->sb.nr_this_dev)) + goto next; - while (1) { if (bch_queue_full(queue)) { - if (queue->rotational) { - again = true; - break; - } else { - bch_moving_wait(&context); - continue; - } + bch_btree_iter_unlock(&iter); + + if (queue->rotational) + bch_queue_run(queue, &ctxt); + else + wait_event(queue->wait, + !bch_queue_full(queue)); + continue; } - k = bch_scan_keylist_next_rescan(c, - &queue->keys, - &context.last_scanned, - POS_MAX, - migrate_data_pred); - if (k == NULL) - break; + ret = issue_migration_move(ca, &ctxt, k); + if (ret == -ENOMEM) { + bch_btree_iter_unlock(&iter); - if (issue_migration_move(ca, &context, bkey_i_to_s_c(k), - &seen_key_count)) { /* - * Memory allocation failed; we will wait for - * all queued moves to finish and continue - * scanning starting from the same key + * memory allocation failure, wait for IOs to + * finish */ - again = true; - break; + bch_queue_run(queue, &ctxt); + continue; } - } + if (ret == -ENOSPC) { + bch_btree_iter_unlock(&iter); + bch_queue_run(queue, &ctxt); + return -ENOSPC; + } + BUG_ON(ret); - bch_queue_run(queue, &context); - if (again) - goto again; + seen_key_count++; +next: + bch_btree_iter_advance_pos(&iter); + bch_btree_iter_cond_resched(&iter); - if ((pass >= PASS_LOW_LIMIT) - && (seen_key_count != (MIGRATION_DEBUG ? ~0ULL : 0))) { - pr_notice("found %llu keys on pass %u.", - seen_key_count, pass); } + ret = bch_btree_iter_unlock(&iter); + bch_queue_run(queue, &ctxt); - last_error_count = atomic_read(&context.error_count); - last_error_flags = atomic_read(&context.error_flags); - - if (last_error_count != 0) { - pr_notice("pass %u: error count = %u, error flags = 0x%x", - pass, last_error_count, last_error_flags); - } - } + if (ret) + return ret; + } while (seen_key_count && pass++ < MAX_DATA_OFF_ITER); - if (seen_key_count != 0 || last_error_count != 0) { + if (seen_key_count) { pr_err("Unable to migrate all data in %d iterations.", MAX_DATA_OFF_ITER); - ret = -EDEADLK; - } else if (MIGRATION_DEBUG) - pr_notice("Migrated all data in %d iterations", pass); + return -1; + } - bch_queue_run(queue, &context); - return ret; + return 0; } /* * This walks the btree, and for any node on the relevant device it moves the * node elsewhere. 
*/ -static int bch_move_btree_off(struct cache *ca, - enum btree_id id, - const char *name) +static int bch_move_btree_off(struct cache *ca, enum btree_id id) { + struct cache_set *c = ca->set; + struct btree_iter iter; struct closure cl; - unsigned pass; + struct btree *b; + int ret; - closure_init_stack(&cl); + BUG_ON(ca->mi.state == CACHE_ACTIVE); - pr_debug("Moving %s btree off device %u", - name, ca->sb.nr_this_dev); + closure_init_stack(&cl); - for (pass = 0; (pass < MAX_DATA_OFF_ITER); pass++) { - struct btree_iter iter; - struct btree *b; - unsigned moved = 0, seen = 0; - int ret; - - for_each_btree_node(&iter, ca->set, id, POS_MIN, 0, b) { - struct bkey_s_c_extent e = - bkey_i_to_s_c_extent(&b->key); - seen++; + for_each_btree_node(&iter, c, id, POS_MIN, 0, b) { + struct bkey_s_c_extent e = bkey_i_to_s_c_extent(&b->key); retry: - if (!bch_extent_has_device(e, ca->sb.nr_this_dev)) - continue; - - if (bch_btree_node_rewrite(&iter, b, &cl)) { - /* - * Drop locks to upgrade locks or wait on - * reserve: after retaking, recheck in case we - * raced. - */ - bch_btree_iter_unlock(&iter); - closure_sync(&cl); - b = bch_btree_iter_peek_node(&iter); - goto retry; - } - - moved++; - iter.locks_want = 0; + if (!bch_extent_has_device(e, ca->sb.nr_this_dev)) + continue; + + ret = bch_btree_node_rewrite(&iter, b, &cl); + if (ret == -EINTR || ret == -ENOSPC) { + /* + * Drop locks to upgrade locks or wait on + * reserve: after retaking, recheck in case we + * raced. + */ + bch_btree_iter_unlock(&iter); + closure_sync(&cl); + b = bch_btree_iter_peek_node(&iter); + goto retry; } - ret = bch_btree_iter_unlock(&iter); - if (ret) - return ret; /* btree IO error */ + if (ret) { + bch_btree_iter_unlock(&iter); + return ret; + } + + iter.locks_want = 0; + } + ret = bch_btree_iter_unlock(&iter); + if (ret) + return ret; /* btree IO error */ - if (!moved) - return 0; + if (IS_ENABLED(CONFIG_BCACHE_DEBUG)) { + for_each_btree_node(&iter, c, id, POS_MIN, 0, b) { + struct bkey_s_c_extent e = bkey_i_to_s_c_extent(&b->key); - pr_debug("%s pass %u: seen %u, moved %u.", - name, pass, seen, moved); + BUG_ON(bch_extent_has_device(e, ca->sb.nr_this_dev)); + } + bch_btree_iter_unlock(&iter); } - /* Failed: */ - return -1; + return 0; } /* @@ -321,25 +281,25 @@ retry: int bch_move_meta_data_off_device(struct cache *ca) { unsigned i; - int ret = 0; /* Success */ + int ret; /* 1st, Move the btree nodes off the device */ - for (i = 0; i < BTREE_ID_NR; i++) - if (bch_move_btree_off(ca, i, bch_btree_id_names[i]) != 0) - return 1; + for (i = 0; i < BTREE_ID_NR; i++) { + ret = bch_move_btree_off(ca, i); + if (ret) + return ret; + } /* There are no prios/gens to move -- they are already in the device. */ /* 2nd. 
Move the journal off the device */ - if (bch_journal_move(ca) != 0) { - pr_err("Unable to move the journal off in %pU.", - ca->set->disk_sb.user_uuid.b); - ret = 1; /* Failure */ - } + ret = bch_journal_move(ca); + if (ret) + return ret; - return ret; + return 0; } /* diff --git a/drivers/md/bcache/move.c b/drivers/md/bcache/move.c index 9c4abbe9576e..f9c2d9e46e84 100644 --- a/drivers/md/bcache/move.c +++ b/drivers/md/bcache/move.c @@ -162,50 +162,14 @@ void bch_moving_context_init(struct moving_context *ctxt, enum moving_purpose purpose) { memset(ctxt, 0, sizeof(*ctxt)); - ctxt->task = current; ctxt->rate = rate; ctxt->purpose = purpose; closure_init_stack(&ctxt->cl); } -/* - * bch_moving_wait() -- wait for a bch_moving_notify() call - * - * To deal with lost wakeups, we make this return immediately if notify - * was already called. - */ -void bch_moving_wait(struct moving_context *ctxt) -{ - while (1) { - set_current_state(TASK_INTERRUPTIBLE); - if (atomic_xchg(&ctxt->pending, 0)) - break; - schedule(); - } - __set_current_state(TASK_RUNNING); -} - -static void bch_moving_notify(struct moving_context *ctxt) -{ - atomic_set(&ctxt->pending, 1); - wake_up_process(ctxt->task); -} - -static bool __bch_queue_reads_pending(struct moving_queue *q) -{ - return (q->read_count > 0 || !RB_EMPTY_ROOT(&q->tree)); -} - static bool bch_queue_reads_pending(struct moving_queue *q) { - unsigned long flags; - bool pending; - - spin_lock_irqsave(&q->lock, flags); - pending = __bch_queue_reads_pending(q); - spin_unlock_irqrestore(&q->lock, flags); - - return pending; + return atomic_read(&q->read_count) || !RB_EMPTY_ROOT(&q->tree); } static void bch_queue_write(struct moving_queue *q) @@ -276,7 +240,6 @@ static void moving_io_destructor(struct closure *cl) { struct moving_io *io = container_of(cl, struct moving_io, cl); struct moving_queue *q = io->q; - struct moving_context *ctxt = io->context; unsigned long flags; bool kick_writes = true; @@ -285,28 +248,29 @@ static void moving_io_destructor(struct closure *cl) spin_lock_irqsave(&q->lock, flags); - BUG_ON(!q->count); - q->count--; - if (io->read_issued) { - BUG_ON(!q->read_count); - q->read_count--; + BUG_ON(!atomic_read(&q->read_count)); + atomic_dec(&q->read_count); } if (io->write_issued) { - BUG_ON(!q->write_count); - q->write_count--; + BUG_ON(!atomic_read(&q->write_count)); + atomic_dec(&q->write_count); trace_bcache_move_write_done(q, &io->write.key.k); } + BUG_ON(!atomic_read(&q->count)); + atomic_dec(&q->count); + wake_up(&q->wait); + list_del_init(&io->list); - if ((q->count == 0) && (q->stop_waitcl != NULL)) { + if (!atomic_read(&q->count) && q->stop_waitcl) { closure_put(q->stop_waitcl); q->stop_waitcl = NULL; } - if (q->rotational && __bch_queue_reads_pending(q)) + if (q->rotational && bch_queue_reads_pending(q)) kick_writes = false; if (list_empty(&q->pending)) @@ -318,8 +282,6 @@ static void moving_io_destructor(struct closure *cl) if (kick_writes) bch_queue_write(q); - - bch_moving_notify(ctxt); } static void moving_io_after_write(struct closure *cl) @@ -336,13 +298,12 @@ static void moving_io_after_write(struct closure *cl) static void write_moving(struct moving_io *io) { bool stopped; - unsigned long flags; struct bch_write_op *op = &io->write.op; - spin_lock_irqsave(&io->q->lock, flags); - BUG_ON(io->q->count == 0); + spin_lock_irq(&io->q->lock); + BUG_ON(!atomic_read(&io->q->count)); stopped = io->q->stopped; - spin_unlock_irqrestore(&io->q->lock, flags); + spin_unlock_irq(&io->q->lock); /* * If the queue has been stopped, prevent the 
write from occurring. @@ -362,17 +323,17 @@ static void bch_queue_write_work(struct work_struct *work) { struct moving_queue *q = container_of(work, struct moving_queue, work); struct moving_io *io; - unsigned long flags; - spin_lock_irqsave(&q->lock, flags); + spin_lock_irq(&q->lock); - if (q->rotational && __bch_queue_reads_pending(q)) { + if (q->rotational && bch_queue_reads_pending(q)) { /* All reads should have finished before writes start */ - spin_unlock_irqrestore(&q->lock, flags); + spin_unlock_irq(&q->lock); return; } - while (!q->stopped && q->write_count < q->max_write_count) { + while (!q->stopped && + atomic_read(&q->write_count) < q->max_write_count) { io = list_first_entry_or_null(&q->pending, struct moving_io, list); /* @@ -388,17 +349,17 @@ static void bch_queue_write_work(struct work_struct *work) break; BUG_ON(io->write_issued); - q->write_count++; + atomic_inc(&q->write_count); io->write_issued = 1; list_del(&io->list); list_add_tail(&io->list, &q->write_pending); trace_bcache_move_write(q, &io->write.key.k); - spin_unlock_irqrestore(&q->lock, flags); + spin_unlock_irq(&q->lock); write_moving(io); - spin_lock_irqsave(&q->lock, flags); + spin_lock_irq(&q->lock); } - spin_unlock_irqrestore(&q->lock, flags); + spin_unlock_irq(&q->lock); } /* @@ -408,7 +369,6 @@ static void bch_queue_write_work(struct work_struct *work) int bch_queue_init(struct moving_queue *q, struct cache_set *c, - unsigned max_size, unsigned max_count, unsigned max_read_count, unsigned max_write_count, @@ -417,7 +377,6 @@ int bch_queue_init(struct moving_queue *q, { INIT_WORK(&q->work, bch_queue_write_work); - q->keys.owner = q; q->max_count = max_count; q->max_read_count = max_read_count; q->max_write_count = max_write_count; @@ -427,6 +386,7 @@ int bch_queue_init(struct moving_queue *q, INIT_LIST_HEAD(&q->pending); INIT_LIST_HEAD(&q->write_pending); q->tree = RB_ROOT; + init_waitqueue_head(&q->wait); q->wq = alloc_workqueue(name, WQ_UNBOUND|WQ_FREEZABLE|WQ_MEM_RECLAIM, 1); @@ -443,8 +403,6 @@ void bch_queue_start(struct moving_queue *q) spin_lock_irqsave(&q->lock, flags); q->stopped = false; spin_unlock_irqrestore(&q->lock, flags); - - bch_scan_keylist_reset(&q->keys); } void queue_io_resize(struct moving_queue *q, @@ -466,8 +424,6 @@ void bch_queue_destroy(struct moving_queue *q) if (q->wq) destroy_workqueue(q->wq); q->wq = NULL; - - bch_scan_keylist_destroy(&q->keys); } static void bch_queue_cancel_writes(struct moving_queue *q) @@ -489,8 +445,11 @@ static void bch_queue_cancel_writes(struct moving_queue *q) list_del_init(&io->list); read_issued = io->read_issued; read_completed = io->read_completed; - if (!read_issued && !read_completed && q->rotational) + if (!read_issued && !read_completed && q->rotational) { rb_erase(&io->node, &q->tree); + wake_up(&q->wait); + } + spin_unlock_irqrestore(&q->lock, flags); if (read_completed) closure_return_with_destructor_noreturn(&io->cl, @@ -505,22 +464,21 @@ static void bch_queue_cancel_writes(struct moving_queue *q) void bch_queue_stop(struct moving_queue *q) { - unsigned long flags; struct closure waitcl; closure_init_stack(&waitcl); - spin_lock_irqsave(&q->lock, flags); + spin_lock_irq(&q->lock); if (q->stopped) BUG_ON(q->stop_waitcl != NULL); else { q->stopped = true; - if (q->count != 0) { + if (atomic_read(&q->count)) { q->stop_waitcl = &waitcl; closure_get(&waitcl); } } - spin_unlock_irqrestore(&q->lock, flags); + spin_unlock_irq(&q->lock); bch_queue_cancel_writes(q); @@ -549,9 +507,6 @@ void bch_queue_recalc_oldest_gens(struct cache_set *c, struct 
moving_queue *q) { unsigned long flags; - /* 1st, mark the keylist keys */ - bch_keylist_recalc_oldest_gens(c, &q->keys); - /* 2nd, mark the keys in the I/Os */ spin_lock_irqsave(&q->lock, flags); @@ -566,7 +521,6 @@ static void read_moving_endio(struct bio *bio) struct closure *cl = bio->bi_private; struct moving_io *io = container_of(cl, struct moving_io, cl); struct moving_queue *q = io->q; - struct moving_context *ctxt = io->context; bool stopped; unsigned long flags; @@ -586,8 +540,11 @@ static void read_moving_endio(struct bio *bio) BUG_ON(io->read_completed); io->read_issued = 0; io->read_completed = 1; - BUG_ON(!q->read_count); - q->read_count--; + + BUG_ON(!atomic_read(&q->read_count)); + atomic_dec(&q->read_count); + wake_up(&q->wait); + stopped = q->stopped; if (stopped) list_del_init(&io->list); @@ -598,8 +555,6 @@ static void read_moving_endio(struct bio *bio) moving_io_destructor); else if (!q->rotational) bch_queue_write(q); - - bch_moving_notify(ctxt); } static void __bch_data_move(struct closure *cl) @@ -628,27 +583,6 @@ static void __bch_data_move(struct closure *cl) &pick, BCH_READ_IS_LAST); } -/* - * bch_queue_full() - return if more reads can be queued with bch_data_move(). - * - * In rotational mode, always returns false if no reads are in flight (see - * how max_count is initialized in bch_queue_init()). - */ -bool bch_queue_full(struct moving_queue *q) -{ - unsigned long flags; - bool full; - - spin_lock_irqsave(&q->lock, flags); - BUG_ON(q->count > q->max_count); - BUG_ON(q->read_count > q->max_read_count); - full = (q->count == q->max_count || - q->read_count == q->max_read_count); - spin_unlock_irqrestore(&q->lock, flags); - - return full; -} - static int moving_io_cmp(struct moving_io *io1, struct moving_io *io2) { if (io1->sort_key < io2->sort_key) @@ -671,20 +605,19 @@ void bch_data_move(struct moving_queue *q, struct moving_context *ctxt, struct moving_io *io) { - unsigned long flags; bool stopped = false; BUG_ON(q->wq == NULL); io->q = q; io->context = ctxt; - spin_lock_irqsave(&q->lock, flags); + spin_lock_irq(&q->lock); if (q->stopped) { stopped = true; goto out; } - q->count++; + atomic_inc(&q->count); list_add_tail(&io->list, &q->pending); trace_bcache_move_read(q, &io->write.key.k); @@ -693,11 +626,11 @@ void bch_data_move(struct moving_queue *q, else { BUG_ON(io->read_issued); io->read_issued = 1; - q->read_count++; + atomic_inc(&q->read_count); } out: - spin_unlock_irqrestore(&q->lock, flags); + spin_unlock_irq(&q->lock); if (stopped) moving_io_free(io); @@ -710,26 +643,27 @@ out: static bool bch_queue_read(struct moving_queue *q, struct moving_context *ctxt) { - unsigned long flags; struct rb_node *node; struct moving_io *io; bool stopped; BUG_ON(!q->rotational); - spin_lock_irqsave(&q->lock, flags); + spin_lock_irq(&q->lock); node = rb_first(&q->tree); if (!node) { - spin_unlock_irqrestore(&q->lock, flags); + spin_unlock_irq(&q->lock); return false; } io = rb_entry(node, struct moving_io, node); rb_erase(node, &q->tree); + wake_up(&q->wait); + io->read_issued = 1; - q->read_count++; + atomic_inc(&q->read_count); stopped = q->stopped; - spin_unlock_irqrestore(&q->lock, flags); + spin_unlock_irq(&q->lock); if (stopped) { moving_io_destructor(&io->cl); @@ -742,31 +676,19 @@ static bool bch_queue_read(struct moving_queue *q, void bch_queue_run(struct moving_queue *q, struct moving_context *ctxt) { - unsigned long flags; - bool full; - if (!q->rotational) goto sync; while (!bch_moving_context_wait(ctxt)) { - spin_lock_irqsave(&q->lock, flags); - full = 
(q->read_count == q->max_read_count); - spin_unlock_irqrestore(&q->lock, flags); - - if (full) { - bch_moving_wait(ctxt); - continue; - } + wait_event(q->wait, + atomic_read(&q->read_count) < q->max_read_count); if (!bch_queue_read(q, ctxt)) break; } - while (bch_queue_reads_pending(q)) - bch_moving_wait(ctxt); - + wait_event(q->wait, !bch_queue_reads_pending(q)); bch_queue_write(q); - sync: closure_sync(&ctxt->cl); } diff --git a/drivers/md/bcache/move.h b/drivers/md/bcache/move.h index 0f8f787a5168..8f30f078f3f5 100644 --- a/drivers/md/bcache/move.h +++ b/drivers/md/bcache/move.h @@ -3,6 +3,7 @@ #include "buckets.h" #include "io_types.h" +#include "move_types.h" enum moving_purpose { MOVING_PURPOSE_UNKNOWN, /* Un-init */ @@ -27,17 +28,11 @@ struct moving_context { atomic_t error_count; atomic_t error_flags; - /* If != 0, @task is waiting for a read or write to complete */ - atomic_t pending; - struct task_struct *task; /* Key and sector moves issued, updated from submission context */ u64 keys_moved; u64 sectors_moved; - /* Last key scanned */ - struct bpos last_scanned; - /* Rate-limiter counting submitted reads */ struct bch_ratelimit *rate; @@ -59,8 +54,6 @@ static inline int bch_moving_context_wait(struct moving_context *ctxt) return bch_ratelimit_wait_freezable_stoppable(ctxt->rate, &ctxt->cl); } -void bch_moving_wait(struct moving_context *); - struct migrate_write { BKEY_PADDED(key); bool promote; @@ -124,14 +117,28 @@ typedef struct moving_io *(moving_queue_fn)(struct moving_queue *, int bch_queue_init(struct moving_queue *, struct cache_set *, - unsigned max_keys, unsigned max_ios, unsigned max_reads, unsigned max_writes, bool rotational, const char *); void bch_queue_start(struct moving_queue *); -bool bch_queue_full(struct moving_queue *); + +/* + * bch_queue_full() - return if more reads can be queued with bch_data_move(). + * + * In rotational mode, always returns false if no reads are in flight (see + * how max_count is initialized in bch_queue_init()). 
+ */ +static inline bool bch_queue_full(struct moving_queue *q) +{ + EBUG_ON(atomic_read(&q->count) > q->max_count); + EBUG_ON(atomic_read(&q->read_count) > q->max_read_count); + + return atomic_read(&q->count) == q->max_count || + atomic_read(&q->read_count) == q->max_read_count; +} + void bch_data_move(struct moving_queue *, struct moving_context *, struct moving_io *); @@ -149,21 +156,18 @@ void bch_queue_run(struct moving_queue *, struct moving_context *); #define sysfs_queue_attribute(name) \ rw_attribute(name##_max_count); \ rw_attribute(name##_max_read_count); \ - rw_attribute(name##_max_write_count); \ - rw_attribute(name##_max_keys) + rw_attribute(name##_max_write_count); #define sysfs_queue_files(name) \ &sysfs_##name##_max_count, \ &sysfs_##name##_max_read_count, \ - &sysfs_##name##_max_write_count, \ - &sysfs_##name##_max_keys + &sysfs_##name##_max_write_count #define sysfs_queue_show(name, var) \ do { \ sysfs_hprint(name##_max_count, (var)->max_count); \ sysfs_print(name##_max_read_count, (var)->max_read_count); \ sysfs_print(name##_max_write_count, (var)->max_write_count);\ - sysfs_print(name##_max_keys, bch_scan_keylist_size(&(var)->keys));\ } while (0) #define sysfs_queue_store(name, var) \ @@ -171,12 +175,6 @@ do { \ sysfs_strtoul(name##_max_count, (var)->max_count); \ sysfs_strtoul(name##_max_read_count, (var)->max_read_count); \ sysfs_strtoul(name##_max_write_count, (var)->max_write_count); \ - if (attr == &sysfs_##name##_max_keys) { \ - int v = strtoi_h_or_return(buf); \ - \ - v = clamp(v, 2, KEYLIST_MAX); \ - bch_scan_keylist_resize(&(var)->keys, v); \ - } \ } while (0) #endif /* _BCACHE_MOVE_H */ diff --git a/drivers/md/bcache/move_types.h b/drivers/md/bcache/move_types.h index d5e1a4a968fa..25f6d2fab592 100644 --- a/drivers/md/bcache/move_types.h +++ b/drivers/md/bcache/move_types.h @@ -8,7 +8,6 @@ struct moving_queue { struct work_struct work; - struct scan_keylist keys; struct workqueue_struct *wq; /* Configuration */ @@ -57,9 +56,11 @@ struct moving_queue { */ struct list_head write_pending; - unsigned count; - unsigned read_count; - unsigned write_count; + atomic_t count; + atomic_t read_count; + atomic_t write_count; + + wait_queue_head_t wait; }; #endif /* _BCACHE_MOVE_TYPES_H */ diff --git a/drivers/md/bcache/movinggc.c b/drivers/md/bcache/movinggc.c index a18340031368..76b92c4f1713 100644 --- a/drivers/md/bcache/movinggc.c +++ b/drivers/md/bcache/movinggc.c @@ -5,6 +5,7 @@ */ #include "bcache.h" +#include "btree_iter.h" #include "buckets.h" #include "clock.h" #include "extents.h" @@ -16,13 +17,12 @@ #include <trace/events/bcache.h> #include <linux/freezer.h> #include <linux/kthread.h> +#include <linux/wait.h> /* Moving GC - IO loop */ -static bool moving_pred(struct scan_keylist *kl, struct bkey_s_c k) +static bool moving_pred(struct cache *ca, struct bkey_s_c k) { - struct cache *ca = container_of(kl, struct cache, - moving_gc_queue.keys); struct cache_set *c = ca->set; const struct bch_extent_ptr *ptr; bool ret = false; @@ -41,11 +41,11 @@ static bool moving_pred(struct scan_keylist *kl, struct bkey_s_c k) return ret; } -static int issue_moving_gc_move(struct moving_queue *q, +static int issue_moving_gc_move(struct cache *ca, struct moving_context *ctxt, struct bkey_s_c k) { - struct cache *ca = container_of(q, struct cache, moving_gc_queue); + struct moving_queue *q = &ca->moving_gc_queue; struct cache_set *c = ca->set; const struct bch_extent_ptr *ptr; struct moving_io *io; @@ -57,7 +57,7 @@ static int issue_moving_gc_move(struct moving_queue *q, goto 
found; /* We raced - bucket's been reused */ - goto out; + return 0; found: gen--; BUG_ON(gen > ARRAY_SIZE(ca->gc_buckets)); @@ -70,65 +70,53 @@ found: trace_bcache_gc_copy(k.k); - /* - * IMPORTANT: We must call bch_data_move before we dequeue so - * that the key can always be found in either the pending list - * in the moving queue or in the scan keylist list in the - * moving queue. - * If we reorder, there is a window where a key is not found - * by btree gc marking. - */ bch_data_move(q, ctxt, io); -out: - bch_scan_keylist_dequeue(&q->keys); return 0; } static void read_moving(struct cache *ca, struct moving_context *ctxt) { - struct bkey_i *k; - bool again; + struct cache_set *c = ca->set; + struct btree_iter iter; + struct bkey_s_c k; bch_ratelimit_reset(&ca->moving_gc_pd.rate); + bch_btree_iter_init(&iter, c, BTREE_ID_EXTENTS, POS_MIN); + + while (!bch_moving_context_wait(ctxt) && + (k = bch_btree_iter_peek(&iter)).k) { + if (!moving_pred(ca, k)) + goto next; + + if (bch_queue_full(&ca->moving_gc_queue)) { + bch_btree_iter_unlock(&iter); + + if (ca->moving_gc_queue.rotational) + bch_queue_run(&ca->moving_gc_queue, ctxt); + else + wait_event(ca->moving_gc_queue.wait, + !bch_queue_full(&ca->moving_gc_queue)); + continue; + } + + if (issue_moving_gc_move(ca, ctxt, k)) { + bch_btree_iter_unlock(&iter); - do { - again = false; - - while (!bch_moving_context_wait(ctxt)) { - if (bch_queue_full(&ca->moving_gc_queue)) { - if (ca->moving_gc_queue.rotational) { - again = true; - break; - } else { - bch_moving_wait(ctxt); - continue; - } - } - - k = bch_scan_keylist_next_rescan( - ca->set, - &ca->moving_gc_queue.keys, - &ctxt->last_scanned, - POS_MAX, - moving_pred); - - if (k == NULL) - break; - - if (issue_moving_gc_move(&ca->moving_gc_queue, - ctxt, bkey_i_to_s_c(k))) { - /* - * Memory allocation failed; we will wait for - * all queued moves to finish and continue - * scanning starting from the same key - */ - again = true; - break; - } + /* memory allocation failure, wait for IOs to finish */ + bch_queue_run(&ca->moving_gc_queue, ctxt); + continue; } +next: + bch_btree_iter_advance_pos(&iter); + //bch_btree_iter_cond_resched(&iter); + + /* unlock before calling moving_context_wait() */ + bch_btree_iter_unlock(&iter); + cond_resched(); + } - bch_queue_run(&ca->moving_gc_queue, ctxt); - } while (!kthread_should_stop() && again); + bch_btree_iter_unlock(&iter); + bch_queue_run(&ca->moving_gc_queue, ctxt); } static bool bch_moving_gc(struct cache *ca) @@ -289,7 +277,6 @@ static int bch_moving_gc_thread(void *arg) return 0; } -#define MOVING_GC_KEYS_MAX_SIZE DFLT_SCAN_KEYLIST_MAX_SIZE #define MOVING_GC_NR 64 #define MOVING_GC_READ_NR 32 #define MOVING_GC_WRITE_NR 32 @@ -303,7 +290,6 @@ int bch_moving_init_cache(struct cache *ca) return bch_queue_init(&ca->moving_gc_queue, ca->set, - MOVING_GC_KEYS_MAX_SIZE, MOVING_GC_NR, MOVING_GC_READ_NR, MOVING_GC_WRITE_NR, @@ -343,12 +329,6 @@ void bch_moving_gc_stop(struct cache *ca) if (ca->moving_gc_read) kthread_stop(ca->moving_gc_read); ca->moving_gc_read = NULL; - - /* - * Make sure that it is empty so that gc marking doesn't keep - * marking stale entries from when last used. 
- */ - bch_scan_keylist_reset(&ca->moving_gc_queue.keys); } void bch_moving_gc_destroy(struct cache *ca) diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 6275f708e555..309a720c5a80 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -1074,8 +1074,6 @@ static struct cache_set *bch_cache_set_alloc(struct cache_sb *sb, init_rwsem(&c->gc_lock); mutex_init(&c->trigger_gc_lock); - mutex_init(&c->gc_scan_keylist_lock); - INIT_LIST_HEAD(&c->gc_scan_keylists); #define BCH_TIME_STAT(name, frequency_units, duration_units) \ spin_lock_init(&c->name##_time.lock); @@ -1981,12 +1979,6 @@ static const char *cache_alloc(struct bcache_superblock *sb, ca->tiering_write_point.reserve = RESERVE_NONE; ca->tiering_write_point.group = &ca->self; - /* XXX: scan keylists will die */ - bch_scan_keylist_init(&ca->moving_gc_queue.keys, c, - DFLT_SCAN_KEYLIST_MAX_SIZE); - bch_scan_keylist_init(&ca->tiering_queue.keys, c, - DFLT_SCAN_KEYLIST_MAX_SIZE); - kobject_get(&c->kobj); ca->set = c; diff --git a/drivers/md/bcache/tier.c b/drivers/md/bcache/tier.c index 02ceb586239d..069e68a38a4a 100644 --- a/drivers/md/bcache/tier.c +++ b/drivers/md/bcache/tier.c @@ -15,15 +15,19 @@ #include <linux/kthread.h> #include <trace/events/bcache.h> -/** - * tiering_pred - check if tiering should copy an extent to tier 1 - */ -static bool tiering_pred(struct scan_keylist *kl, struct bkey_s_c k) -{ - struct cache *ca = container_of(kl, struct cache, - tiering_queue.keys); - struct cache_set *c = ca->set; +struct tiering_state { + struct cache_group *tier; + unsigned tier_idx; + unsigned sectors; + unsigned stripe_size; + unsigned dev_idx; + struct cache *ca; +}; +static bool tiering_pred(struct cache_set *c, + struct tiering_state *s, + struct bkey_s_c k) +{ if (bkey_extent_is_data(k.k)) { struct bkey_s_c_extent e = bkey_s_c_to_extent(k); const struct bch_extent_ptr *ptr; @@ -38,7 +42,7 @@ static bool tiering_pred(struct scan_keylist *kl, struct bkey_s_c k) mi = cache_member_info_get(c); extent_for_each_ptr(e, ptr) if (ptr->dev < mi->nr_in_set && - mi->m[ptr->dev].tier) + mi->m[ptr->dev].tier >= s->tier_idx) replicas++; cache_member_info_put(); @@ -48,196 +52,54 @@ static bool tiering_pred(struct scan_keylist *kl, struct bkey_s_c k) return false; } -struct tiering_refill { - struct bpos start; - struct cache *ca; - int cache_iter; - u64 sectors; -}; - -static void refill_done(struct tiering_refill *refill) +static void tier_put_device(struct tiering_state *s) { - if (refill->ca) { - percpu_ref_put(&refill->ca->ref); - refill->ca = NULL; - } + if (s->ca) + percpu_ref_put(&s->ca->ref); + s->ca = NULL; } /** * refill_next - move on to refilling the next cache's tiering keylist */ -static void refill_next(struct cache_set *c, struct tiering_refill *refill) -{ - struct cache_group *tier; - - refill_done(refill); - - rcu_read_lock(); - tier = &c->cache_tiers[1]; - if (tier->nr_devices == 0) - goto out; - - while (1) { - while (refill->cache_iter < tier->nr_devices) { - refill->ca = rcu_dereference( - tier->d[refill->cache_iter].dev); - if (refill->ca != NULL) { - percpu_ref_get(&refill->ca->ref); - goto out; - } - refill->cache_iter++; - } - - /* Reached the end, wrap around */ - refill->cache_iter = 0; - } - -out: - rcu_read_unlock(); -} - -/* - * refill_init - Start refilling a random cache device -- this ensures we - * distribute data sanely even if each tiering pass discovers only a few - * keys to tier - */ -static void refill_init(struct cache_set *c, struct tiering_refill *refill) -{ - struct 
cache_group *tier; - - memset(refill, 0, sizeof(*refill)); - refill->start = POS_MIN; - - rcu_read_lock(); - tier = &c->cache_tiers[1]; - if (tier->nr_devices != 0) - refill->cache_iter = bch_rand_range(tier->nr_devices); - rcu_read_unlock(); - - refill_next(c, refill); -} - -/** - * tiering_keylist_full - we accumulate tiering_stripe_size sectors in a cache - * device's tiering keylist before we move on to the next cache device - */ -static bool tiering_keylist_full(struct tiering_refill *refill) +static void tier_next_device(struct cache_set *c, struct tiering_state *s) { - return (refill->sectors >= refill->ca->tiering_stripe_size); -} - -/** - * tiering_keylist_empty - to prevent a keylist from growing to more than twice - * the tiering stripe size, we stop refill when a keylist has more than a single - * stripe of sectors - */ -static bool tiering_keylist_empty(struct cache *ca) -{ - return (bch_scan_keylist_sectors(&ca->tiering_queue.keys) - <= ca->tiering_stripe_size); -} - -/** - * tiering_refill - to keep all queues busy as much as possible, we add - * up to a single stripe of sectors to each cache device's queue, iterating - * over all cache devices twice, so each one has two stripe's of writes - * queued up, before we have to wait for move operations to complete. - */ -static void tiering_refill(struct cache_set *c, struct tiering_refill *refill) -{ - struct scan_keylist *keys; - struct btree_iter iter; - struct bkey_s_c k; - - if (bkey_cmp(refill->start, POS_MAX) >= 0) - return; - - if (refill->ca == NULL) - return; - - if (!tiering_keylist_empty(refill->ca)) - return; - - trace_bcache_tiering_refill_start(c); - - for_each_btree_key(&iter, c, BTREE_ID_EXTENTS, refill->start, k) { - BKEY_PADDED(k) tmp; - - keys = &refill->ca->tiering_queue.keys; - - if (!tiering_pred(keys, k)) { - refill->start = k.k->p; - goto next; - } - - bkey_reassemble(&tmp.k, k); - bch_cut_front(refill->start, &tmp.k); - - /* Growing the keylist might fail */ - if (bch_scan_keylist_add(keys, bkey_i_to_s_c(&tmp.k))) - goto done; - - /* TODO: split key if refill->sectors is now > stripe_size */ - refill->sectors += tmp.k.k.size; - refill->start = tmp.k.k.p; - - /* Check if we've added enough keys to this keylist */ - if (tiering_keylist_full(refill)) { - /* Move on to refill the next cache device's keylist */ - refill->sectors = 0; - refill->cache_iter++; - refill_next(c, refill); - - /* All cache devices got removed somehow */ - if (refill->ca == NULL) - goto done; - - /* - * If the next cache's keylist is not sufficiently - * empty, wait for it to drain before refilling - * anything. We prioritize even distribution of data - * over maximizing write bandwidth. 
- */ - if (!tiering_keylist_empty(refill->ca)) - goto done; + if (!s->ca || s->sectors > s->stripe_size) { + tier_put_device(s); + s->sectors = 0; + s->dev_idx++; + + spin_lock(&s->tier->lock); + if (s->dev_idx >= s->tier->nr_devices) + s->dev_idx = 0; + + if (s->tier->nr_devices) { + s->ca = s->tier->d[s->dev_idx].dev; + percpu_ref_get(&s->ca->ref); } -next: - bch_btree_iter_cond_resched(&iter); + spin_unlock(&s->tier->lock); } - /* Reached the end of the keyspace */ - refill->start = POS_MAX; -done: - bch_btree_iter_unlock(&iter); - - trace_bcache_tiering_refill_end(c); } -static int issue_tiering_move(struct moving_queue *q, +static int issue_tiering_move(struct cache_set *c, + struct tiering_state *s, struct moving_context *ctxt, struct bkey_s_c k) { - struct cache *ca = container_of(q, struct cache, tiering_queue); - struct cache_set *c = ca->set; struct moving_io *io; - io = moving_io_alloc(c, q, &ca->tiering_write_point, k, NULL); + io = moving_io_alloc(c, + &s->ca->tiering_queue, + &s->ca->tiering_write_point, + k, NULL); if (!io) { trace_bcache_tiering_alloc_fail(c, k.k->size); return -ENOMEM; } trace_bcache_tiering_copy(k.k); - - /* - * IMPORTANT: We must call bch_data_move before we dequeue so - * that the key can always be found in either the pending list - * in the moving queue or in the scan keylist list in the - * moving queue. - * If we reorder, there is a window where a key is not found - * by btree gc marking. - */ - bch_data_move(q, ctxt, io); - bch_scan_keylist_dequeue(&q->keys); + bch_data_move(&s->ca->tiering_queue, ctxt, io); + s->sectors += k.k->size; return 0; } @@ -245,124 +107,80 @@ static int issue_tiering_move(struct moving_queue *q, * tiering_next_cache - issue a move to write an extent to the next cache * device in round robin order */ -static int tiering_next_cache(struct cache_set *c, - int *cache_iter, - struct moving_context *ctxt, - struct tiering_refill *refill) -{ - struct cache_group *tier; - int start = *cache_iter; - struct cache *ca; - - /* If true at the end of the loop, all keylists were empty, so we - * have reached the end of the keyspace */ - bool done = true; - /* If true at the end of the loop, all queues were full, so we must - * wait for some ops to finish */ - bool full = true; - - do { - rcu_read_lock(); - tier = &c->cache_tiers[1]; - if (tier->nr_devices == 0) { - rcu_read_unlock(); - return 0; - } - - if (*cache_iter >= tier->nr_devices) { - rcu_read_unlock(); - *cache_iter = 0; - continue; - } - - ca = rcu_dereference(tier->d[*cache_iter].dev); - if (ca == NULL || - ca->mi.state != CACHE_ACTIVE || - ca->tiering_queue.stopped) { - rcu_read_unlock(); - (*cache_iter)++; - continue; - } - - percpu_ref_get(&ca->ref); - rcu_read_unlock(); - (*cache_iter)++; - - tiering_refill(c, refill); - - if (bch_queue_full(&ca->tiering_queue)) { - done = false; - } else { - struct bkey_i *k = - bch_scan_keylist_next(&ca->tiering_queue.keys); - if (k) { - issue_tiering_move(&ca->tiering_queue, ctxt, - bkey_i_to_s_c(k)); - done = false; - full = false; - } - } - - percpu_ref_put(&ca->ref); - } while (*cache_iter != start); - - if (done) { - /* - * All devices have an empty keylist now, just wait for - * pending moves to finish and we're done. - */ - return 0; - } else if (full) { - /* - * No device with keys still remaining on its keylist has a - * queue that is not full. In this case, we have to wait for - * at least one read to complete before trying again. - * Otherwise, we could issue a read for this device. 
- */ - return -EAGAIN; - } else { - /* Try again immediately */ - return -EIOCBQUEUED; - } -} - -static u64 read_tiering(struct cache_set *c) +static s64 read_tiering(struct cache_set *c, struct cache_group *tier) { struct moving_context ctxt; - struct tiering_refill refill; - int cache_iter = 0; + struct tiering_state s; + struct btree_iter iter; + struct bkey_s_c k; int ret; trace_bcache_tiering_start(c); - refill_init(c, &refill); + memset(&s, 0, sizeof(s)); + s.tier = tier; + s.tier_idx = tier - c->cache_tiers; + s.stripe_size = 2048; /* 1 mb for now */ bch_moving_context_init(&ctxt, &c->tiering_pd.rate, MOVING_PURPOSE_TIERING); + bch_btree_iter_init(&iter, c, BTREE_ID_EXTENTS, POS_MIN); - while (!bch_moving_context_wait(&ctxt)) { - cond_resched(); + while (!bch_moving_context_wait(&ctxt) && + (k = bch_btree_iter_peek(&iter)).k) { + if (!tiering_pred(c, &s, k)) + goto next; - ret = tiering_next_cache(c, &cache_iter, &ctxt, &refill); - if (ret == -EAGAIN) - bch_moving_wait(&ctxt); - else if (!ret) + tier_next_device(c, &s); + if (!s.ca) break; + + if (bch_queue_full(&s.ca->tiering_queue)) { + bch_btree_iter_unlock(&iter); + + if (s.ca->tiering_queue.rotational) + bch_queue_run(&s.ca->tiering_queue, &ctxt); + else + wait_event(s.ca->tiering_queue.wait, + !bch_queue_full(&s.ca->tiering_queue)); + continue; + } + + ret = issue_tiering_move(c, &s, &ctxt, k); + if (ret) { + bch_btree_iter_unlock(&iter); + + /* memory allocation failure, wait for IOs to finish */ + + /* + * XXX: this only waits for IOs issued to this + * particular device, but there may not be any outstanding + * to this device + */ + bch_queue_run(&s.ca->tiering_queue, &ctxt); + continue; + } +next: + bch_btree_iter_advance_pos(&iter); + //bch_btree_iter_cond_resched(&iter); + + /* unlock before calling moving_context_wait() */ + bch_btree_iter_unlock(&iter); + cond_resched(); } + bch_btree_iter_unlock(&iter); + tier_put_device(&s); closure_sync(&ctxt.cl); - refill_done(&refill); - trace_bcache_tiering_end(c, ctxt.sectors_moved, ctxt.keys_moved); - //pr_info("pred y %llu pred n %llu", c->tiering_pred_y, c->tiering_pred_n); - return ctxt.sectors_moved; } static int bch_tiering_thread(void *arg) { struct cache_set *c = arg; + struct cache_group *tier = &c->cache_tiers[1]; struct io_clock *clock = &c->io_clock[WRITE]; struct cache *ca; u64 tier_capacity, available_sectors; @@ -373,20 +191,26 @@ static int bch_tiering_thread(void *arg) while (!kthread_should_stop()) { if (kthread_wait_freezable(c->tiering_enabled && - c->cache_tiers[1].nr_devices)) + tier->nr_devices)) break; while (1) { + struct cache_group *faster_tier; + last = atomic_long_read(&clock->now); tier_capacity = available_sectors = 0; rcu_read_lock(); - group_for_each_cache_rcu(ca, &c->cache_tiers[0], i) { - tier_capacity += - (ca->mi.nbuckets - - ca->mi.first_bucket) << ca->bucket_bits; - available_sectors += - buckets_available_cache(ca) << ca->bucket_bits; + for (faster_tier = c->cache_tiers; + faster_tier != tier; + faster_tier++) { + group_for_each_cache_rcu(ca, faster_tier, i) { + tier_capacity += + (ca->mi.nbuckets - + ca->mi.first_bucket) << ca->bucket_bits; + available_sectors += + buckets_available_cache(ca) << ca->bucket_bits; + } } rcu_read_unlock(); @@ -401,13 +225,12 @@ static int bch_tiering_thread(void *arg) return 0; } - read_tiering(c); + read_tiering(c, tier); } return 0; } -#define TIERING_KEYS_MAX_SIZE DFLT_SCAN_KEYLIST_MAX_SIZE #define TIERING_NR 64 #define TIERING_READ_NR 8 #define TIERING_WRITE_NR 32 @@ -423,7 +246,6 @@ int 
bch_tiering_init_cache(struct cache *ca) return bch_queue_init(&ca->tiering_queue, ca->set, - TIERING_KEYS_MAX_SIZE, TIERING_NR, TIERING_READ_NR, TIERING_WRITE_NR, @@ -458,12 +280,6 @@ void bch_tiering_write_destroy(struct cache *ca) void bch_tiering_write_stop(struct cache *ca) { bch_queue_stop(&ca->tiering_queue); - - /* - * Make sure that it is empty so that gc marking doesn't keep - * marking stale entries from when last used. - */ - bch_scan_keylist_reset(&ca->tiering_queue.keys); } void bch_tiering_read_stop(struct cache_set *c) diff --git a/include/trace/events/bcache.h b/include/trace/events/bcache.h index f4153a5c962d..73b80c8648e4 100644 --- a/include/trace/events/bcache.h +++ b/include/trace/events/bcache.h @@ -1080,9 +1080,9 @@ DECLARE_EVENT_CLASS(moving_io, __entry->inode = k->p.inode; __entry->offset = k->p.offset; __entry->sectors = k->size; - __entry->count = q->count; - __entry->read_count = q->read_count; - __entry->write_count = q->write_count; + __entry->count = atomic_read(&q->count); + __entry->read_count = atomic_read(&q->read_count); + __entry->write_count = atomic_read(&q->write_count); ), TP_printk("%p %u:%llu sectors %u queue %u reads %u writes %u", |