-rw-r--r-- | fs/bcachefs/Makefile | 6
-rw-r--r-- | fs/bcachefs/alloc.c | 13
-rw-r--r-- | fs/bcachefs/alloc_types.h | 2
-rw-r--r-- | fs/bcachefs/bcache.h | 51
-rw-r--r-- | fs/bcachefs/btree_gc.c | 18
-rw-r--r-- | fs/bcachefs/buckets.h | 8
-rw-r--r-- | fs/bcachefs/buckets_types.h | 3
-rw-r--r-- | fs/bcachefs/extents.c | 8
-rw-r--r-- | fs/bcachefs/keylist.c | 190
-rw-r--r-- | fs/bcachefs/keylist.h | 44
-rw-r--r-- | fs/bcachefs/keylist_types.h | 45
-rw-r--r-- | fs/bcachefs/migrate.c | 166
-rw-r--r-- | fs/bcachefs/move.c | 581
-rw-r--r-- | fs/bcachefs/move.h | 149
-rw-r--r-- | fs/bcachefs/move_types.h | 65
-rw-r--r-- | fs/bcachefs/movinggc.c | 367
-rw-r--r-- | fs/bcachefs/movinggc.h | 9
-rw-r--r-- | fs/bcachefs/rebalance.c | 467
-rw-r--r-- | fs/bcachefs/rebalance.h | 7
-rw-r--r-- | fs/bcachefs/super.c | 80
-rw-r--r-- | fs/bcachefs/sysfs.c | 34
-rw-r--r-- | fs/bcachefs/tier.c | 466
-rw-r--r-- | fs/bcachefs/tier.h | 12
-rw-r--r-- | include/trace/events/bcache.h | 2
24 files changed, 633 insertions, 2160 deletions
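Note on the shape of the change: the moving_queue machinery (per-queue read/write counts, rb-tree ordering, stop/start state) is deleted, and the new move.c throttles in-flight work with nothing more than a counting semaphore sized to an 8 MiB page budget (see move_context_init(), bch_data_move() and bch_moving_io_destructor() in the diff below). The following is a minimal userspace sketch of that throttling pattern, not the kernel code itself: the POSIX semaphore, SKETCH_PAGE_SIZE, and the issue_move()/complete_move() helpers are stand-ins introduced here for illustration only.

/*
 * Sketch (userspace, illustrative): the page-budget throttle that replaces
 * the old moving_queue counters.  The patch sizes a semaphore to
 * (8 MiB / PAGE_SIZE) in move_context_init(), downs it once per page in
 * bch_data_move() before issuing the read, and ups it once per page when
 * the io is freed in bch_moving_io_destructor().
 */
#include <semaphore.h>
#include <stdio.h>

#define SKETCH_PAGE_SIZE	4096UL
#define BUDGET_PAGES		((8UL << 20) / SKETCH_PAGE_SIZE)

static sem_t nr_pages_limit;

/* analogue of bch_data_move(): take one unit per page before issuing */
static void issue_move(unsigned nr_pages)
{
	for (unsigned i = 0; i < nr_pages; i++)
		sem_wait(&nr_pages_limit);
	printf("issued move of %u pages\n", nr_pages);
}

/* analogue of bch_moving_io_destructor(): return the pages on completion */
static void complete_move(unsigned nr_pages)
{
	for (unsigned i = 0; i < nr_pages; i++)
		sem_post(&nr_pages_limit);
}

int main(void)
{
	sem_init(&nr_pages_limit, 0, BUDGET_PAGES);
	issue_move(16);
	complete_move(16);
	sem_destroy(&nr_pages_limit);
	return 0;
}

The effect is the same back-pressure the old max_count/max_read_count fields provided, but enforced per page rather than per io, and with no per-queue locking.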
diff --git a/fs/bcachefs/Makefile b/fs/bcachefs/Makefile index 390c167819c4..5a688e85732c 100644 --- a/fs/bcachefs/Makefile +++ b/fs/bcachefs/Makefile @@ -5,6 +5,6 @@ bcache-y := acl.o alloc.o bkey.o bkey_methods.o blockdev.o\ bset.o btree_cache.o btree_gc.o btree_io.o btree_iter.o btree_update.o\ buckets.o chardev.o checksum.o clock.o closure.o debug.o dirent.o\ error.o extents.o fs.o fs-gc.o fs-io.o inode.o io.o journal.o keybuf.o\ - keylist.o migrate.o move.o movinggc.o notify.o opts.o request.o\ - siphash.o six.o stats.o super.o sysfs.o tier.o trace.o util.o\ - writeback.o xattr.o + keylist.o migrate.o move.o notify.o opts.o rebalance.o request.o\ + siphash.o six.o stats.o super.o sysfs.o trace.o util.o writeback.o\ + xattr.o diff --git a/fs/bcachefs/alloc.c b/fs/bcachefs/alloc.c index 1d1d302c84a1..3485019c535a 100644 --- a/fs/bcachefs/alloc.c +++ b/fs/bcachefs/alloc.c @@ -148,10 +148,10 @@ static void pd_controllers_update(struct work_struct *work) if (fragmented < 0) fragmented = 0; - +#if 0 bch_pd_controller_update(&ca->moving_gc_pd, free, fragmented, -1); - +#endif if (i == 0) tier0_can_free += fragmented; @@ -165,11 +165,12 @@ static void pd_controllers_update(struct work_struct *work) u64 target = div_u64(tier_size[0] * c->tiering_percent, 100); tier0_can_free = max_t(s64, 0, tier_dirty[0] - target); - +#if 0 bch_pd_controller_update(&c->tiering_pd, target, tier_dirty[0], -1); +#endif } /* @@ -579,7 +580,6 @@ static void __bch_invalidate_one_bucket(struct cache *ca, struct bucket *g) g->read_prio = ca->set->prio_clock[READ].hand; g->write_prio = ca->set->prio_clock[WRITE].hand; - g->copygc_gen = 0; verify_not_on_freelist(ca, g - ca->buckets); } @@ -1643,11 +1643,10 @@ void bch_cache_allocator_stop(struct cache *ca) for (i = 0; i < ARRAY_SIZE(c->write_points); i++) bch_stop_write_point(ca, &c->write_points[i]); - for (i = 0; i < ARRAY_SIZE(ca->gc_buckets); i++) - bch_stop_write_point(ca, &ca->gc_buckets[i]); + for (i = 0; i < ARRAY_SIZE(c->rebalance); i++) + bch_stop_write_point(ca, &c->rebalance[i].wp); bch_stop_write_point(ca, &c->promote_write_point); - bch_stop_write_point(ca, &ca->tiering_write_point); bch_stop_write_point(ca, &c->migration_write_point); bch_stop_write_point(ca, &c->btree_write_point); diff --git a/fs/bcachefs/alloc_types.h b/fs/bcachefs/alloc_types.h index 065b9c02f185..1372fc26ccb1 100644 --- a/fs/bcachefs/alloc_types.h +++ b/fs/bcachefs/alloc_types.h @@ -38,7 +38,7 @@ enum alloc_reserve { RESERVE_PRIO, RESERVE_BTREE, RESERVE_METADATA_LAST = RESERVE_BTREE, - RESERVE_MOVINGGC, + RESERVE_MOVINGGC, /* hrm */ RESERVE_NONE, RESERVE_NR, diff --git a/fs/bcachefs/bcache.h b/fs/bcachefs/bcache.h index 7f856f7fd1aa..a6bbd38f4316 100644 --- a/fs/bcachefs/bcache.h +++ b/fs/bcachefs/bcache.h @@ -278,7 +278,6 @@ #include "journal_types.h" #include "keylist_types.h" #include "keybuf_types.h" -#include "move_types.h" #include "stats_types.h" #include "super_types.h" @@ -356,8 +355,6 @@ struct cache { struct cache_set *set; - struct cache_group self; - /* * Cached version of this device's member info from superblock * Committed by write_super() @@ -433,25 +430,6 @@ struct cache { struct mutex heap_lock; DECLARE_HEAP(struct bucket_heap_entry, heap); - /* Moving GC: */ - struct task_struct *moving_gc_read; - - struct moving_queue moving_gc_queue; - struct bch_pd_controller moving_gc_pd; - - /* Tiering: */ - struct moving_queue tiering_queue; - struct write_point tiering_write_point; - unsigned tiering_stripe_size; - - /* - * open buckets used in moving garbage collection - * 
NOTE: GC_GEN == 0 signifies no moving gc, so accessing the - * gc_buckets array is always GC_GEN-1. - */ -#define NUM_GC_GENS 8 - struct write_point gc_buckets[NUM_GC_GENS]; - struct journal_device journal; struct work_struct io_error_work; @@ -504,6 +482,26 @@ struct btree_debug { struct dentry *btree_format; }; +struct rebalance_bucket_entry { + size_t bucket; + u8 dev; + u8 gen; + unsigned sectors; +}; + +struct rebalance_thread { + unsigned tier; + unsigned initialized; + struct task_struct *p; + struct bch_pd_controller pd; + struct write_point wp; + + struct workqueue_struct *wq; + + struct mutex heap_lock; + DECLARE_HEAP(struct rebalance_bucket_entry, heap); +}; + struct cache_set { struct closure cl; @@ -702,10 +700,6 @@ struct cache_set { struct task_struct *gc_thread; atomic_t kick_gc; - /* This is a list of scan_keylists for btree GC to scan */ - struct list_head gc_scan_keylists; - struct mutex gc_scan_keylist_lock; - /* * Tracks GC's progress - everything in the range [ZERO_KEY..gc_cur_pos] * has been marked by GC. @@ -751,9 +745,8 @@ struct cache_set { /* FILESYSTEM */ atomic_long_t nr_inodes; - /* TIERING */ - struct task_struct *tiering_read; - struct bch_pd_controller tiering_pd; + /* REBALANCE */ + struct rebalance_thread rebalance[CACHE_TIERS]; /* NOTIFICATIONS */ struct mutex uevent_lock; diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 965b4a58ba9a..65222eb0b0cb 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -301,23 +301,6 @@ static void bch_mark_pending_btree_node_frees(struct cache_set *c) mutex_unlock(&c->btree_interior_update_lock); } -static void bch_mark_scan_keylists(struct cache_set *c) -{ - struct scan_keylist *kl; - - mutex_lock(&c->gc_scan_keylist_lock); - - /* What the goddamn fuck? */ - list_for_each_entry(kl, &c->gc_scan_keylists, mark_list) { - if (kl->owner == NULL) - bch_keylist_recalc_oldest_gens(c, kl); - else - bch_queue_recalc_oldest_gens(c, kl->owner); - } - - mutex_unlock(&c->gc_scan_keylist_lock); -} - /** * bch_gc - recompute bucket marks and oldest_gen, rewrite btree nodes */ @@ -419,7 +402,6 @@ void bch_gc(struct cache_set *c) bch_mark_metadata(c); bch_mark_pending_btree_node_frees(c); bch_writeback_recalc_oldest_gens(c); - bch_mark_scan_keylists(c); for_each_cache(ca, c, i) atomic_long_set(&ca->saturated_count, 0); diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h index e878ac09a0f2..be225cb850c7 100644 --- a/fs/bcachefs/buckets.h +++ b/fs/bcachefs/buckets.h @@ -33,6 +33,14 @@ static inline struct cache *PTR_CACHE(const struct cache_set *c, return rcu_dereference(c->cache[ptr->dev]); } +static inline unsigned PTR_TIER(const struct cache_member_rcu *mi, + const struct bch_extent_ptr *ptr) +{ + return ptr->dev < mi->nr_in_set + ? 
mi->m[ptr->dev].tier + : UINT_MAX; +} + static inline size_t PTR_BUCKET_NR(const struct cache *ca, const struct bch_extent_ptr *ptr) { diff --git a/fs/bcachefs/buckets_types.h b/fs/bcachefs/buckets_types.h index a1914404531e..78cdcafcf155 100644 --- a/fs/bcachefs/buckets_types.h +++ b/fs/bcachefs/buckets_types.h @@ -42,9 +42,6 @@ struct bucket { struct bucket_mark mark; /* Most out of date gen in the btree */ u8 oldest_gen; - - /* generation copygc is going to move this bucket into */ - u8 copygc_gen; }; struct bucket_stats_cache { diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index d041f0cfbdc7..6890bcc77dae 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -1649,14 +1649,6 @@ static void bch_extent_to_text(struct cache_set *c, char *buf, #undef p } -static unsigned PTR_TIER(struct cache_member_rcu *mi, - const struct bch_extent_ptr *ptr) -{ - return ptr->dev < mi->nr_in_set - ? mi->m[ptr->dev].tier - : UINT_MAX; -} - void bch_extent_entry_append(struct bkey_i_extent *e, union bch_extent_entry *entry) { diff --git a/fs/bcachefs/keylist.c b/fs/bcachefs/keylist.c index 638596300575..644734b1d4f2 100644 --- a/fs/bcachefs/keylist.c +++ b/fs/bcachefs/keylist.c @@ -117,193 +117,3 @@ void bch_keylist_add_in_order(struct keylist *l, struct bkey_i *insert) BUG_ON(l->top_p > l->end_keys_p); bkey_copy(where, insert); } - -/* Scan keylists simple utilities */ - -void bch_scan_keylist_init(struct scan_keylist *kl, - struct cache_set *c, - unsigned max_size) - -{ - kl->c = c; - kl->owner = NULL; - - mutex_init(&kl->lock); - kl->max_size = max_size; - bch_keylist_init(&kl->list, NULL, 0); - - /* - * Order of initialization is tricky, and this makes sure that - * we have a valid cache set in case the order of - * initialization chages and breaks things. - */ - BUG_ON(c == NULL); - mutex_lock(&c->gc_scan_keylist_lock); - list_add_tail(&kl->mark_list, &c->gc_scan_keylists); - mutex_unlock(&c->gc_scan_keylist_lock); -} - -void bch_scan_keylist_destroy(struct scan_keylist *kl) -{ - if (kl->c) { - mutex_lock(&kl->c->gc_scan_keylist_lock); - list_del(&kl->mark_list); - mutex_unlock(&kl->c->gc_scan_keylist_lock); - } - - mutex_lock(&kl->lock); - bch_keylist_free(&kl->list); - mutex_unlock(&kl->lock); -} - -void bch_scan_keylist_reset(struct scan_keylist *kl) -{ - mutex_lock(&kl->lock); - kl->list.bot_p = kl->list.top_p = kl->list.start_keys_p; - mutex_unlock(&kl->lock); -} - -/* - * This should only be called from sysfs, and holding a lock that prevents - * re-entrancy. - */ -void bch_scan_keylist_resize(struct scan_keylist *kl, - unsigned max_size) -{ - mutex_lock(&kl->lock); - kl->max_size = max_size; /* May be smaller than current size */ - mutex_unlock(&kl->lock); -} - -/** - * bch_keylist_recalc_oldest_gens - update oldest_gen pointers from keylist keys - * - * This prevents us from wrapping around gens for a bucket only referenced from - * the tiering or moving GC keylists. We don't actually care that the data in - * those buckets is marked live, only that we don't wrap the gens. - * - * Note: This interlocks with insertions, but not all dequeues interlock. - * The particular case in which dequeues don't interlock is when a - * scan list used by the copy offload ioctls is used as a plain - * keylist for btree insertion. - * The btree insertion code doesn't go through - * bch_scan_keylist_dequeue below, and instead uses plain - * bch_keylist_dequeue. The other pointers (top, start, end) are - * unchanged in this case. 
- * A little care with the bottomp pointer suffices in this case. - * Of course, we may end up marking stuff that we don't need to mark, - * but was recently valid and we have likely just inserted in the tree - * anyway. - */ -void bch_keylist_recalc_oldest_gens(struct cache_set *c, - struct scan_keylist *kl) -{ - struct bkey_i *k; - - mutex_lock(&kl->lock); - - for_each_keylist_key(&kl->list, k) - bch_btree_key_recalc_oldest_gen(c, bkey_i_to_s_c(k)); - - mutex_unlock(&kl->lock); -} - -int bch_scan_keylist_add(struct scan_keylist *kl, struct bkey_s_c k) -{ - int ret; - - mutex_lock(&kl->lock); - ret = bch_keylist_realloc_max(&kl->list, - k.k->u64s, - kl->max_size); - - if (!ret) { - bkey_reassemble(kl->list.top, k); - bch_keylist_enqueue(&kl->list); - atomic64_add(k.k->size, &kl->sectors); - } - mutex_unlock(&kl->lock); - - return ret; -} - -/* Actual scanning functionality of scan_keylists */ - -static void bch_refill_scan_keylist(struct cache_set *c, - struct scan_keylist *kl, - struct bpos *last_scanned, - struct bpos end, - scan_keylist_pred_fn *pred) -{ - struct bpos start = *last_scanned; - struct btree_iter iter; - struct bkey_s_c k; - unsigned nr_found = 0; - - for_each_btree_key(&iter, c, BTREE_ID_EXTENTS, *last_scanned, k) { - if (bkey_cmp(k.k->p, end) >= 0) { - *last_scanned = k.k->p; - goto done; - } - - if (pred(kl, k)) { - if (bch_scan_keylist_add(kl, k)) - goto done; - - nr_found++; - } - - *last_scanned = k.k->p; - bch_btree_iter_cond_resched(&iter); - } - - /* If we end up here, it means: - * - the map_fn didn't fill up the keybuf - * - the map_fn didn't see the end key - * - there were no more keys to map over - * Therefore, we are at the end of the key space */ - *last_scanned = POS_MAX; -done: - bch_btree_iter_unlock(&iter); - - trace_bcache_keyscan(nr_found, - start.inode, start.offset, - last_scanned->inode, - last_scanned->offset); -} - -struct bkey_i *bch_scan_keylist_next(struct scan_keylist *kl) -{ - if (bch_keylist_empty(&kl->list)) - return NULL; - - return bch_keylist_front(&kl->list); -} - -struct bkey_i *bch_scan_keylist_next_rescan(struct cache_set *c, - struct scan_keylist *kl, - struct bpos *last_scanned, - struct bpos end, - scan_keylist_pred_fn *pred) -{ - if (bch_keylist_empty(&kl->list)) { - if (bkey_cmp(*last_scanned, end) >= 0) - return NULL; - - bch_refill_scan_keylist(c, kl, last_scanned, end, pred); - } - - return bch_scan_keylist_next(kl); -} - -void bch_scan_keylist_dequeue(struct scan_keylist *kl) -{ - u64 sectors; - - mutex_lock(&kl->lock); - sectors = kl->list.bot->k.size; - bch_keylist_dequeue(&kl->list); - mutex_unlock(&kl->lock); - - BUG_ON(atomic64_sub_return(sectors, &kl->sectors) < 0); -} diff --git a/fs/bcachefs/keylist.h b/fs/bcachefs/keylist.h index 028552757527..8fc92986f22f 100644 --- a/fs/bcachefs/keylist.h +++ b/fs/bcachefs/keylist.h @@ -116,49 +116,5 @@ void bch_keylist_add_in_order(struct keylist *, struct bkey_i *); int bch_keylist_realloc(struct keylist *, unsigned); int bch_keylist_realloc_max(struct keylist *, unsigned, unsigned); -void bch_scan_keylist_init(struct scan_keylist *kl, - struct cache_set *c, - unsigned max_size); - -void bch_scan_keylist_reset(struct scan_keylist *kl); - -/* The keylist is dynamically adjusted. 
This just clamps the maxima */ - -static inline unsigned bch_scan_keylist_size(struct scan_keylist *kl) -{ - return kl->max_size; -} - -static inline u64 bch_scan_keylist_sectors(struct scan_keylist *kl) -{ - return atomic64_read(&kl->sectors); -} - -void bch_scan_keylist_resize(struct scan_keylist *kl, - unsigned max_size); - -void bch_scan_keylist_destroy(struct scan_keylist *kl); - -/* - * IMPORTANT: The caller of bch_scan_keylist_next or - * bch_scan_keylist_next_rescan needs to copy any - * non-null return value before calling either again! - * These functions return a pointer into the internal structure. - * Furthermore, they need to call bch_scan_keylist_advance after - * copying the structure. - */ - -struct bkey_i *bch_scan_keylist_next(struct scan_keylist *); - -struct bkey_i *bch_scan_keylist_next_rescan(struct cache_set *c, - struct scan_keylist *kl, - struct bpos *last_scanned, - struct bpos end, - scan_keylist_pred_fn *pred); - -int bch_scan_keylist_add(struct scan_keylist *, struct bkey_s_c); -void bch_scan_keylist_dequeue(struct scan_keylist *); - -void bch_keylist_recalc_oldest_gens(struct cache_set *, struct scan_keylist *); #endif /* _BCACHE_KEYLIST_H */ diff --git a/fs/bcachefs/keylist_types.h b/fs/bcachefs/keylist_types.h index 569cdc2480e2..156fbe0745fd 100644 --- a/fs/bcachefs/keylist_types.h +++ b/fs/bcachefs/keylist_types.h @@ -48,49 +48,4 @@ struct keylist { bool has_buf; }; -/* - * scan_keylists are conceptually similar to keybufs, but they don't - * have an internal RB tree. - * keybufs should be used when read or write operations need to - * examine keys in flight, as for writeback. - * But for moving operations (moving gc, tiering, moving data off - * devices), read and writes don't need to look at all, so we don't - * need the RB tree and use scan_keylists instead. - * - * Note that unlike keybufs, they don't contain a semaphore to limit - * bios. That must be done externally, if necessary. - */ - -#define DFLT_SCAN_KEYLIST_MAX_SIZE 512 - -struct scan_keylist { - struct list_head mark_list; /* For GC marking */ - - struct cache_set *c; /* For destroying */ - - /* - * Only one thread is allowed to mutate the keylist. Other threads can - * read it. The mutex has to be taken by the mutator thread when - * mutating the keylist, and by other threads when reading, but not by - * the mutator thread when reading. - */ - struct mutex lock; - /* - * Maximum size, in u64s. The keylist will not grow beyond this size. - */ - unsigned max_size; - /* - * Number of sectors in keys currently on the keylist. - */ - atomic64_t sectors; - /* - * The underlying keylist. 
- */ - struct keylist list; - - struct moving_queue *owner; -}; - -typedef bool (scan_keylist_pred_fn)(struct scan_keylist *, struct bkey_s_c); - #endif /* _BCACHE_KEYLIST_TYPES_H */ diff --git a/fs/bcachefs/migrate.c b/fs/bcachefs/migrate.c index c33606865eb2..aa9e0dd80227 100644 --- a/fs/bcachefs/migrate.c +++ b/fs/bcachefs/migrate.c @@ -12,39 +12,30 @@ #include "migrate.h" #include "move.h" -static bool migrate_data_pred(struct scan_keylist *kl, struct bkey_s_c k) -{ - struct cache *ca = container_of(kl, struct cache, - moving_gc_queue.keys); - - return bkey_extent_is_data(k.k) && - bch_extent_has_device(bkey_s_c_to_extent(k), - ca->sb.nr_this_dev); -} - static void bch_extent_drop_dev_ptrs(struct bkey_s_extent e, unsigned dev) { struct bch_extent_ptr *ptr; + unsigned dropped = 0; extent_for_each_ptr_backwards(e, ptr) - if (ptr->dev == dev) + if (ptr->dev == dev) { bch_extent_drop_ptr(e, ptr); + dropped++; + } + + BUG_ON(dropped > 1); } -static int issue_migration_move(struct cache *ca, - struct moving_context *ctxt, - struct bkey_s_c k, - u64 *seen_key_count) +static int migrate_extent(struct cache_set *c, struct cache *ca, + struct bkey_s_c k, struct move_context *m) { - struct moving_queue *q = &ca->moving_gc_queue; - struct cache_set *c = ca->set; struct moving_io *io; struct disk_reservation res; if (bch_disk_reservation_get(c, &res, k.k->size, 0)) return -ENOSPC; - io = moving_io_alloc(k); + io = bch_moving_io_alloc(k); if (!io) { bch_disk_reservation_put(c, &res); return -ENOMEM; @@ -60,33 +51,14 @@ static int issue_migration_move(struct cache *ca, 0); io->op.nr_replicas = 1; - io->op.io_wq = q->wq; - bch_extent_drop_dev_ptrs(bkey_i_to_s_extent(&io->op.insert_key), ca->sb.nr_this_dev); - bch_data_move(q, ctxt, io); - (*seen_key_count)++; - - /* - * IMPORTANT: We must call bch_data_move before we dequeue so - * that the key can always be found in either the pending list - * in the moving queue or in the scan keylist list in the - * moving queue. - * If we reorder, there is a window where a key is not found - * by btree gc marking. - */ - bch_scan_keylist_dequeue(&q->keys); + bch_data_move(m, io); return 0; } -#define MIGRATION_DEBUG 0 - #define MAX_DATA_OFF_ITER 10 -#define PASS_LOW_LIMIT (MIGRATION_DEBUG ? 0 : 2) -#define MIGRATE_NR 64 -#define MIGRATE_READ_NR 32 -#define MIGRATE_WRITE_NR 32 /* * This moves only the data off, leaving the meta-data (if any) in place. @@ -104,37 +76,9 @@ static int issue_migration_move(struct cache *ca, int bch_move_data_off_device(struct cache *ca) { - int ret; - struct bkey_i *k; - unsigned pass; - u64 seen_key_count; - unsigned last_error_count; - unsigned last_error_flags; - struct moving_context context; struct cache_set *c = ca->set; - struct moving_queue *queue = &ca->moving_gc_queue; - - /* - * This reuses the moving gc queue as it is no longer in use - * by moving gc, which must have been stopped to call this. - */ - - BUG_ON(ca->moving_gc_read != NULL); - - /* - * This may actually need to start the work queue because the - * device may have always been read-only and never have had it - * started (moving gc usually starts it but not for RO - * devices). 
- */ - - bch_queue_start(queue); - - queue_io_resize(queue, MIGRATE_NR, MIGRATE_READ_NR, MIGRATE_WRITE_NR); - - BUG_ON(queue->wq == NULL); - bch_moving_context_init(&context, NULL, MOVING_PURPOSE_MIGRATION); - context.avoid = ca; + u64 seen_key_count = 1; + unsigned pass; /* * In theory, only one pass should be necessary as we've @@ -153,82 +97,44 @@ int bch_move_data_off_device(struct cache *ca) * but that can be viewed as a verification pass. */ - seen_key_count = 1; - last_error_count = 0; - last_error_flags = 0; - for (pass = 0; (seen_key_count != 0 && (pass < MAX_DATA_OFF_ITER)); pass++) { - bool again; - - seen_key_count = 0; - atomic_set(&context.error_count, 0); - atomic_set(&context.error_flags, 0); - context.last_scanned = POS_MIN; - -again: - again = false; - - while (1) { - if (bch_queue_full(queue)) { - if (queue->rotational) { - again = true; - break; - } else { - bch_moving_wait(&context); - continue; - } - } + struct btree_iter iter; + struct bkey_s_c k; + struct move_context m; - k = bch_scan_keylist_next_rescan(c, - &queue->keys, - &context.last_scanned, - POS_MAX, - migrate_data_pred); - if (k == NULL) - break; + move_context_init(&m); - if (issue_migration_move(ca, &context, bkey_i_to_s_c(k), - &seen_key_count)) { - /* - * Memory allocation failed; we will wait for - * all queued moves to finish and continue - * scanning starting from the same key - */ - again = true; - break; - } - } - - bch_queue_run(queue, &context); - if (again) - goto again; + for_each_btree_key(&iter, c, BTREE_ID_EXTENTS, POS_MIN, k) { + if (bkey_extent_is_data(k.k) && + bch_extent_has_device(bkey_s_c_to_extent(k), + ca->sb.nr_this_dev)) { + BKEY_PADDED(k) tmp; - if ((pass >= PASS_LOW_LIMIT) - && (seen_key_count != (MIGRATION_DEBUG ? ~0ULL : 0))) { - pr_notice("found %llu keys on pass %u.", - seen_key_count, pass); - } + bkey_reassemble(&tmp.k, k); + bch_btree_iter_unlock(&iter); - last_error_count = atomic_read(&context.error_count); - last_error_flags = atomic_read(&context.error_flags); + seen_key_count++; + migrate_extent(c, ca, + bkey_i_to_s_c(&tmp.k), + &m); + } - if (last_error_count != 0) { - pr_notice("pass %u: error count = %u, error flags = 0x%x", - pass, last_error_count, last_error_flags); + bch_btree_iter_cond_resched(&iter); } + bch_btree_iter_unlock(&iter); + + closure_sync(&m.cl); } - if (seen_key_count != 0 || last_error_count != 0) { + if (seen_key_count) { pr_err("Unable to migrate all data in %d iterations.", MAX_DATA_OFF_ITER); - ret = -EDEADLK; - } else if (MIGRATION_DEBUG) - pr_notice("Migrated all data in %d iterations", pass); + return -EDEADLK; + } - bch_queue_run(queue, &context); - return ret; + return 0; } /* diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index bbfcbdae2f37..8e8ae4acb74c 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -1,75 +1,28 @@ #include "bcache.h" -#include "btree_gc.h" -#include "buckets.h" +#include "extents.h" #include "io.h" #include "move.h" -#include "super.h" -#include "keylist.h" #include <trace/events/bcache.h> -static void moving_error(struct moving_context *ctxt, unsigned flag) +void bch_moving_io_free(struct moving_io *io) { - atomic_inc(&ctxt->error_count); - atomic_or(flag, &ctxt->error_flags); -} - -void bch_moving_context_init(struct moving_context *ctxt, - struct bch_ratelimit *rate, - enum moving_purpose purpose) -{ - memset(ctxt, 0, sizeof(*ctxt)); - ctxt->task = current; - ctxt->rate = rate; - ctxt->purpose = purpose; - closure_init_stack(&ctxt->cl); -} - -/* - * bch_moving_wait() -- wait for a 
bch_moving_notify() call - * - * To deal with lost wakeups, we make this return immediately if notify - * was already called. - */ -void bch_moving_wait(struct moving_context *ctxt) -{ - while (1) { - set_current_state(TASK_INTERRUPTIBLE); - if (atomic_xchg(&ctxt->pending, 0)) - break; - schedule(); - } - __set_current_state(TASK_RUNNING); -} - -static void bch_moving_notify(struct moving_context *ctxt) -{ - atomic_set(&ctxt->pending, 1); - wake_up_process(ctxt->task); -} - -static bool __bch_queue_reads_pending(struct moving_queue *q) -{ - return (q->read_count > 0 || !RB_EMPTY_ROOT(&q->tree)); + bch_bio_free_pages(&io->bio.bio.bio); + kfree(io); } -static bool bch_queue_reads_pending(struct moving_queue *q) +static void bch_moving_io_destructor(struct closure *cl) { - unsigned long flags; - bool pending; + struct moving_io *io = container_of(cl, struct moving_io, cl); + struct move_context *m = container_of(cl->parent, + struct move_context, cl); + unsigned nr_pages = DIV_ROUND_UP(io->key.k.size, PAGE_SECTORS); - spin_lock_irqsave(&q->lock, flags); - pending = __bch_queue_reads_pending(q); - spin_unlock_irqrestore(&q->lock, flags); + while (nr_pages--) + up(&m->nr_pages_limit); - return pending; -} - -static void bch_queue_write(struct moving_queue *q) -{ - BUG_ON(q->wq == NULL); - queue_work(q->wq, &q->work); + bch_moving_io_free(io); } static void moving_init(struct moving_io *io, struct bio *bio) @@ -86,531 +39,83 @@ static void moving_init(struct moving_io *io, struct bio *bio) bch_bio_map(bio, NULL); } -struct moving_io *moving_io_alloc(struct bkey_s_c k) -{ - struct moving_io *io; - - io = kzalloc(sizeof(struct moving_io) + sizeof(struct bio_vec) - * DIV_ROUND_UP(k.k->size, PAGE_SECTORS), - GFP_KERNEL); - if (!io) - return NULL; - - bkey_reassemble(&io->key, k); - - moving_init(io, &io->rbio.bio); - - if (bio_alloc_pages(&io->rbio.bio, GFP_KERNEL)) { - kfree(io); - return NULL; - } - - return io; -} - -void moving_io_free(struct moving_io *io) -{ - bch_bio_free_pages(&io->wbio.bio.bio); - kfree(io); -} - -static void moving_io_destructor(struct closure *cl) +static void write_moving(struct closure *cl) { struct moving_io *io = container_of(cl, struct moving_io, cl); - struct moving_queue *q = io->q; - struct moving_context *ctxt = io->context; - unsigned long flags; - bool kick_writes = true; - - if (io->replace.failures) - trace_bcache_copy_collision(q, &io->key.k); - - spin_lock_irqsave(&q->lock, flags); - - BUG_ON(!q->count); - q->count--; - - if (io->read_issued) { - BUG_ON(!q->read_count); - q->read_count--; - } - - if (io->write_issued) { - BUG_ON(!q->write_count); - q->write_count--; - trace_bcache_move_write_done(q, &io->key.k); - } - - list_del_init(&io->list); - - if ((q->count == 0) && (q->stop_waitcl != NULL)) { - closure_put(q->stop_waitcl); - q->stop_waitcl = NULL; - } - - if (q->rotational && __bch_queue_reads_pending(q)) - kick_writes = false; - - if (list_empty(&q->pending)) - kick_writes = false; - - spin_unlock_irqrestore(&q->lock, flags); - - moving_io_free(io); - - if (kick_writes) - bch_queue_write(q); - - bch_moving_notify(ctxt); -} - -static void moving_io_after_write(struct closure *cl) -{ - struct moving_io *io = container_of(cl, struct moving_io, cl); - struct moving_context *ctxt = io->context; if (io->op.error) - moving_error(ctxt, MOVING_FLAG_WRITE); - - moving_io_destructor(cl); -} - -static void write_moving(struct moving_io *io) -{ - bool stopped; - unsigned long flags; - struct bch_write_op *op = &io->op; - - spin_lock_irqsave(&io->q->lock, flags); 
- BUG_ON(io->q->count == 0); - stopped = io->q->stopped; - spin_unlock_irqrestore(&io->q->lock, flags); - - /* - * If the queue has been stopped, prevent the write from occurring. - * This stops all writes on a device going read-only as quickly - * as possible. - */ - - if (op->error || stopped) - closure_return_with_destructor(&io->cl, moving_io_destructor); - else { - moving_init(io, &io->wbio.bio.bio); - - op->bio->bio.bio.bi_iter.bi_sector = bkey_start_offset(&io->key.k); - - closure_call(&op->cl, bch_write, NULL, &io->cl); - closure_return_with_destructor(&io->cl, moving_io_after_write); - } -} - -static void bch_queue_write_work(struct work_struct *work) -{ - struct moving_queue *q = container_of(work, struct moving_queue, work); - struct moving_io *io; - unsigned long flags; - - spin_lock_irqsave(&q->lock, flags); - - if (q->rotational && __bch_queue_reads_pending(q)) { - /* All reads should have finished before writes start */ - spin_unlock_irqrestore(&q->lock, flags); - return; - } - - while (!q->stopped && q->write_count < q->max_write_count) { - io = list_first_entry_or_null(&q->pending, - struct moving_io, list); - /* - * We only issue the writes in insertion order to preserve - * any linearity in the original key list/tree, so if we - * find an io whose read hasn't completed, we don't - * scan beyond it. Eventually that read will complete, - * at which point we may issue multiple writes (for it - * and any following entries whose reads had already - * completed and we had not examined here). - */ - if (!io || !io->read_completed) - break; - - BUG_ON(io->write_issued); - q->write_count++; - io->write_issued = 1; - list_del(&io->list); - list_add_tail(&io->list, &q->write_pending); - trace_bcache_move_write(q, &io->key.k); - spin_unlock_irqrestore(&q->lock, flags); - write_moving(io); - spin_lock_irqsave(&q->lock, flags); - } - - spin_unlock_irqrestore(&q->lock, flags); -} - -/* - * IMPORTANT: The caller of queue_init must have zero-filled it when it - * allocates it. 
- */ - -int bch_queue_init(struct moving_queue *q, - struct cache_set *c, - unsigned max_size, - unsigned max_count, - unsigned max_read_count, - unsigned max_write_count, - bool rotational, - const char *name) -{ - INIT_WORK(&q->work, bch_queue_write_work); - - q->keys.owner = q; - q->max_count = max_count; - q->max_read_count = max_read_count; - q->max_write_count = max_write_count; - q->rotational = rotational; + closure_return_with_destructor(&io->cl, bch_moving_io_destructor); - spin_lock_init(&q->lock); - INIT_LIST_HEAD(&q->pending); - INIT_LIST_HEAD(&q->write_pending); - q->tree = RB_ROOT; + moving_init(io); - q->wq = alloc_workqueue(name, - WQ_UNBOUND|WQ_FREEZABLE|WQ_MEM_RECLAIM, 1); - if (!q->wq) - return -ENOMEM; + io->op.bio->bio.bio.bi_iter.bi_sector = bkey_start_offset(&io->key.k); - return 0; -} - -void bch_queue_start(struct moving_queue *q) -{ - unsigned long flags; - - spin_lock_irqsave(&q->lock, flags); - q->stopped = false; - spin_unlock_irqrestore(&q->lock, flags); - - bch_scan_keylist_reset(&q->keys); -} - -void queue_io_resize(struct moving_queue *q, - unsigned max_io, - unsigned max_read, - unsigned max_write) -{ - unsigned long flags; - - spin_lock_irqsave(&q->lock, flags); - q->max_count = max_io; - q->max_read_count = max_read; - q->max_write_count = max_write; - spin_unlock_irqrestore(&q->lock, flags); -} - -void bch_queue_destroy(struct moving_queue *q) -{ - if (q->wq) - destroy_workqueue(q->wq); - q->wq = NULL; - - bch_scan_keylist_destroy(&q->keys); -} - -static void bch_queue_cancel_writes(struct moving_queue *q) -{ - struct moving_io *io; - unsigned long flags; - bool read_issued, read_completed; - - spin_lock_irqsave(&q->lock, flags); - - while (1) { - io = list_first_entry_or_null(&q->pending, - struct moving_io, - list); - if (!io) - break; - - BUG_ON(io->write_issued); - list_del_init(&io->list); - read_issued = io->read_issued; - read_completed = io->read_completed; - if (!read_issued && !read_completed && q->rotational) - rb_erase(&io->node, &q->tree); - spin_unlock_irqrestore(&q->lock, flags); - if (read_completed) - closure_return_with_destructor_noreturn(&io->cl, - moving_io_destructor); - else if (!read_issued) - moving_io_destructor(&io->cl); - spin_lock_irqsave(&q->lock, flags); - } - - spin_unlock_irqrestore(&q->lock, flags); -} - -void bch_queue_stop(struct moving_queue *q) -{ - unsigned long flags; - struct closure waitcl; - - closure_init_stack(&waitcl); - - spin_lock_irqsave(&q->lock, flags); - if (q->stopped) - BUG_ON(q->stop_waitcl != NULL); - else { - q->stopped = true; - if (q->count != 0) { - q->stop_waitcl = &waitcl; - closure_get(&waitcl); - } - } - spin_unlock_irqrestore(&q->lock, flags); - - bch_queue_cancel_writes(q); - - closure_sync(&waitcl); -} - -static void pending_recalc_oldest_gens(struct cache_set *c, struct list_head *l) -{ - struct moving_io *io; - - list_for_each_entry(io, l, list) { - /* - * This only marks the (replacement) key and not the - * insertion key in the bch_write_op, as the insertion - * key should be a subset of the replacement key except - * for any new pointers added by the write, and those - * don't need to be marked because they are pointing - * to open buckets until the write completes - */ - bch_btree_key_recalc_oldest_gen(c, bkey_i_to_s_c(&io->key)); - } -} - -void bch_queue_recalc_oldest_gens(struct cache_set *c, struct moving_queue *q) -{ - unsigned long flags; - - /* 1st, mark the keylist keys */ - bch_keylist_recalc_oldest_gens(c, &q->keys); - - /* 2nd, mark the keys in the I/Os */ - 
spin_lock_irqsave(&q->lock, flags); - - pending_recalc_oldest_gens(c, &q->pending); - pending_recalc_oldest_gens(c, &q->write_pending); - - spin_unlock_irqrestore(&q->lock, flags); + closure_call(&io->op.cl, bch_write, NULL, &io->cl); + closure_return_with_destructor(&io->cl, bch_moving_io_destructor); } static void read_moving_endio(struct bio *bio) { struct closure *cl = bio->bi_private; struct moving_io *io = container_of(cl, struct moving_io, cl); - struct moving_queue *q = io->q; - struct moving_context *ctxt = io->context; - bool stopped; - unsigned long flags; - - if (bio->bi_error) { + if (bio->bi_error) io->op.error = bio->bi_error; - moving_error(io->context, MOVING_FLAG_READ); - } - - bio_put(bio); - - spin_lock_irqsave(&q->lock, flags); - - trace_bcache_move_read_done(q, &io->key.k); - - BUG_ON(!io->read_issued); - BUG_ON(io->read_completed); - io->read_issued = 0; - io->read_completed = 1; - BUG_ON(!q->read_count); - q->read_count--; - stopped = q->stopped; - if (stopped) - list_del_init(&io->list); - spin_unlock_irqrestore(&q->lock, flags); - if (stopped) - closure_return_with_destructor(&io->cl, - moving_io_destructor); - else if (!q->rotational) - bch_queue_write(q); - - bch_moving_notify(ctxt); + closure_put(cl); } static void __bch_data_move(struct closure *cl) { struct moving_io *io = container_of(cl, struct moving_io, cl); struct extent_pick_ptr pick; - u64 size = io->key.k.size; - bch_extent_pick_ptr_avoiding(io->op.c, bkey_i_to_s_c(&io->key), - io->context->avoid, &pick); + bch_extent_pick_ptr(io->op.c, + bkey_i_to_s_c(&io->key), + &pick); if (IS_ERR_OR_NULL(pick.ca)) - closure_return_with_destructor(cl, moving_io_destructor); - - io->context->keys_moved++; - io->context->sectors_moved += size; - if (io->context->rate) - bch_ratelimit_increment(io->context->rate, size); + closure_return_with_destructor(cl, bch_moving_io_destructor); io->rbio.bio.bi_rw = READ; io->rbio.bio.bi_iter.bi_sector = bkey_start_offset(&io->key.k); io->rbio.bio.bi_end_io = read_moving_endio; + closure_get(cl); bch_read_extent(io->op.c, &io->rbio, bkey_i_to_s_c(&io->key), &pick, BCH_READ_IS_LAST); -} - -/* - * bch_queue_full() - return if more reads can be queued with bch_data_move(). - * - * In rotational mode, always returns false if no reads are in flight (see - * how max_count is initialized in bch_queue_init()). - */ -bool bch_queue_full(struct moving_queue *q) -{ - unsigned long flags; - bool full; - spin_lock_irqsave(&q->lock, flags); - BUG_ON(q->count > q->max_count); - BUG_ON(q->read_count > q->max_read_count); - full = (q->count == q->max_count || - q->read_count == q->max_read_count); - spin_unlock_irqrestore(&q->lock, flags); - - return full; -} - -static int moving_io_cmp(struct moving_io *io1, struct moving_io *io2) -{ - if (io1->sort_key < io2->sort_key) - return -1; - else if (io1->sort_key > io2->sort_key) - return 1; - else { - /* We don't want duplicate keys. 
Eventually, we will have - * support for GC with duplicate pointers -- for now, - * just sort them randomly instead */ - if (io1 < io2) - return -1; - else if (io1 > io2) - return 1; - BUG(); - } + continue_at(cl, write_moving, io->op.io_wq); /* XXX different wq */ } -void bch_data_move(struct moving_queue *q, - struct moving_context *ctxt, - struct moving_io *io) +void bch_data_move(struct move_context *m, struct moving_io *io) { - unsigned long flags; - bool stopped = false; - - BUG_ON(q->wq == NULL); - io->q = q; - io->context = ctxt; - - spin_lock_irqsave(&q->lock, flags); - if (q->stopped) { - stopped = true; - goto out; - } - - q->count++; - list_add_tail(&io->list, &q->pending); - trace_bcache_move_read(q, &io->key.k); - - if (q->rotational) - BUG_ON(RB_INSERT(&q->tree, io, node, moving_io_cmp)); - else { - BUG_ON(io->read_issued); - io->read_issued = 1; - q->read_count++; - } + unsigned nr_pages = DIV_ROUND_UP(io->key.k.size, PAGE_SECTORS); -out: - spin_unlock_irqrestore(&q->lock, flags); + while (nr_pages--) + down(&m->nr_pages_limit); - if (stopped) - moving_io_free(io); - else if (!q->rotational) - closure_call(&io->cl, __bch_data_move, NULL, &ctxt->cl); + closure_call(&io->cl, __bch_data_move, NULL, &m->cl); } -/* Rotational device queues */ - -static bool bch_queue_read(struct moving_queue *q, - struct moving_context *ctxt) +struct moving_io *bch_moving_io_alloc(struct bkey_s_c k) { - unsigned long flags; - struct rb_node *node; struct moving_io *io; - bool stopped; - - BUG_ON(!q->rotational); - - spin_lock_irqsave(&q->lock, flags); - node = rb_first(&q->tree); - if (!node) { - spin_unlock_irqrestore(&q->lock, flags); - return false; - } - - io = rb_entry(node, struct moving_io, node); - rb_erase(node, &q->tree); - io->read_issued = 1; - q->read_count++; - stopped = q->stopped; - spin_unlock_irqrestore(&q->lock, flags); - if (stopped) { - moving_io_destructor(&io->cl); - return false; - } else { - closure_call(&io->cl, __bch_data_move, NULL, &ctxt->cl); - return true; - } -} - -void bch_queue_run(struct moving_queue *q, struct moving_context *ctxt) -{ - unsigned long flags; - bool full; - - if (!q->rotational) - goto sync; + io = kzalloc(sizeof(struct moving_io) + sizeof(struct bio_vec) + * DIV_ROUND_UP(k.k->size, PAGE_SECTORS), + GFP_KERNEL); + if (!io) + return NULL; - while (!bch_moving_context_wait(ctxt)) { - spin_lock_irqsave(&q->lock, flags); - full = (q->read_count == q->max_read_count); - spin_unlock_irqrestore(&q->lock, flags); + bkey_reassemble(&io->key, k); - if (full) { - bch_moving_wait(ctxt); - continue; - } + moving_init(io); - if (!bch_queue_read(q, ctxt)) - break; + if (bio_alloc_pages(&io->bio.bio.bio, GFP_KERNEL)) { + kfree(io); + return NULL; } - while (bch_queue_reads_pending(q)) - bch_moving_wait(ctxt); - - bch_queue_write(q); - -sync: - closure_sync(&ctxt->cl); + return io; } diff --git a/fs/bcachefs/move.h b/fs/bcachefs/move.h index 2f9998e66e7f..4c5433e36abf 100644 --- a/fs/bcachefs/move.h +++ b/fs/bcachefs/move.h @@ -3,95 +3,25 @@ #include "buckets.h" #include "io_types.h" +#include <linux/semaphore.h> -enum moving_purpose { - MOVING_PURPOSE_UNKNOWN, /* Un-init */ - MOVING_PURPOSE_MIGRATION, - MOVING_PURPOSE_TIERING, - MOVING_PURPOSE_COPY_GC, -}; - -enum moving_flag_bitnos { - MOVING_FLAG_BITNO_READ = 0, - MOVING_FLAG_BITNO_WRITE, -}; - -#define MOVING_FLAG_READ (1U << MOVING_FLAG_BITNO_READ) -#define MOVING_FLAG_WRITE (1U << MOVING_FLAG_BITNO_WRITE) - -struct moving_context { - /* Closure for waiting on all reads and writes to complete */ +struct 
move_context { struct closure cl; - - /* Number and types of errors reported */ - atomic_t error_count; - atomic_t error_flags; - - /* If != 0, @task is waiting for a read or write to complete */ - atomic_t pending; - struct task_struct *task; - - /* Key and sector moves issued, updated from submission context */ - u64 keys_moved; - u64 sectors_moved; - - /* Last key scanned */ - struct bpos last_scanned; - - /* Rate-limiter counting submitted reads */ - struct bch_ratelimit *rate; - - /* Try to avoid reading the following device */ - struct cache *avoid; - - /* Debugging... */ - enum moving_purpose purpose; + struct semaphore nr_pages_limit; }; -void bch_moving_context_init(struct moving_context *, struct bch_ratelimit *, - enum moving_purpose); - -static inline int bch_moving_context_wait(struct moving_context *ctxt) +static inline void move_context_init(struct move_context *m) { - if (ctxt->rate == NULL) - return 0; - - return bch_ratelimit_wait_freezable_stoppable(ctxt->rate, &ctxt->cl); + closure_init_stack(&m->cl); + sema_init(&m->nr_pages_limit, (8 << 20) / PAGE_SIZE); } -void bch_moving_wait(struct moving_context *); - struct moving_io { - struct list_head list; - struct rb_node node; struct closure cl; - struct moving_queue *q; + struct bch_write_op op; struct bch_replace_info replace; - struct moving_context *context; BKEY_PADDED(key); - /* Sort key for moving_queue->tree */ - u64 sort_key; - /* Protected by q->lock */ - - /* - * 1) !read_issued && !read_completed - * - Closure is not running yet, starts when read_issued is set - * - IO is in q->tree (if q->rotational) and q->pending - * 2) !write_issued && !write_completed: - * - IO is in q->pending - * 3) write_issued: - * - IO is in q->write_pending - * 4) write_completed: - * - Closure is about to return and the IO is about to be freed - * - * If read_issued, we hold a reference on q->read_count - * If write_issued, we hold a reference on q->write_count - * Until IO is freed, we hold a reference on q->count - */ - unsigned read_issued:1; - unsigned read_completed:1; - unsigned write_issued:1; struct bch_read_bio rbio; struct bch_write_bio wbio; @@ -99,67 +29,8 @@ struct moving_io { struct bio_vec bi_inline_vecs[0]; }; -struct moving_io *moving_io_alloc(struct bkey_s_c); -void moving_io_free(struct moving_io *); - -typedef struct moving_io *(moving_queue_fn)(struct moving_queue *, - struct moving_context *); - -int bch_queue_init(struct moving_queue *, - struct cache_set *, - unsigned max_keys, - unsigned max_ios, - unsigned max_reads, - unsigned max_writes, - bool rotational, - const char *); -void bch_queue_start(struct moving_queue *); -bool bch_queue_full(struct moving_queue *); -void bch_data_move(struct moving_queue *, - struct moving_context *, - struct moving_io *); -void queue_io_resize(struct moving_queue *, - unsigned, - unsigned, - unsigned); -void bch_queue_destroy(struct moving_queue *); -void bch_queue_stop(struct moving_queue *); - -void bch_queue_recalc_oldest_gens(struct cache_set *, struct moving_queue *); - -void bch_queue_run(struct moving_queue *, struct moving_context *); - -#define sysfs_queue_attribute(name) \ - rw_attribute(name##_max_count); \ - rw_attribute(name##_max_read_count); \ - rw_attribute(name##_max_write_count); \ - rw_attribute(name##_max_keys) - -#define sysfs_queue_files(name) \ - &sysfs_##name##_max_count, \ - &sysfs_##name##_max_read_count, \ - &sysfs_##name##_max_write_count, \ - &sysfs_##name##_max_keys - -#define sysfs_queue_show(name, var) \ -do { \ - 
sysfs_hprint(name##_max_count, (var)->max_count); \ - sysfs_print(name##_max_read_count, (var)->max_read_count); \ - sysfs_print(name##_max_write_count, (var)->max_write_count);\ - sysfs_print(name##_max_keys, bch_scan_keylist_size(&(var)->keys));\ -} while (0) - -#define sysfs_queue_store(name, var) \ -do { \ - sysfs_strtoul(name##_max_count, (var)->max_count); \ - sysfs_strtoul(name##_max_read_count, (var)->max_read_count); \ - sysfs_strtoul(name##_max_write_count, (var)->max_write_count); \ - if (attr == &sysfs_##name##_max_keys) { \ - int v = strtoi_h_or_return(buf); \ - \ - v = clamp(v, 2, KEYLIST_MAX); \ - bch_scan_keylist_resize(&(var)->keys, v); \ - } \ -} while (0) +void bch_moving_io_free(struct moving_io *); +struct moving_io *bch_moving_io_alloc(struct bkey_s_c); +void bch_data_move(struct move_context *, struct moving_io *); #endif /* _BCACHE_MOVE_H */ diff --git a/fs/bcachefs/move_types.h b/fs/bcachefs/move_types.h deleted file mode 100644 index d5e1a4a968fa..000000000000 --- a/fs/bcachefs/move_types.h +++ /dev/null @@ -1,65 +0,0 @@ -#ifndef _BCACHE_MOVE_TYPES_H -#define _BCACHE_MOVE_TYPES_H - -/* - * We rely on moving_queue being kzalloc'd so that the initial value of - * the flags is 0. - */ - -struct moving_queue { - struct work_struct work; - struct scan_keylist keys; - struct workqueue_struct *wq; - - /* Configuration */ - unsigned max_count; - unsigned max_read_count; - unsigned max_write_count; - - /* - * If true, reads are coming from rotational media. All reads - * are queued up on @tree and sorted by physical location prior - * to being submitted. - */ - bool rotational; - - /* This can be examined without locking */ - bool stopped; - - /* Protects everything below */ - spinlock_t lock; - - struct closure *stop_waitcl; - - /* - * Tree of struct moving_io, sorted by moving_io->sort_key. - * Contains reads which have not yet been issued; when a read is - * issued, it is removed from the tree. - * - * Only used if @rotational is set. - */ - struct rb_root tree; - - /* - * List of struct moving_io, sorted by logical offset. - * Contains writes which have not yet been issued; when a write is - * issued, it is removed from the list. - * - * Writes are issued in logical offset order, and only when all - * prior writes have been issued. - */ - struct list_head pending; - - /* - * List of struct moving_io, sorted by logical offset. - * - * Contains writes which are in-flight. - */ - struct list_head write_pending; - - unsigned count; - unsigned read_count; - unsigned write_count; -}; - -#endif /* _BCACHE_MOVE_TYPES_H */ diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c deleted file mode 100644 index 0c77ea6c808c..000000000000 --- a/fs/bcachefs/movinggc.c +++ /dev/null @@ -1,367 +0,0 @@ -/* - * Moving/copying garbage collector - * - * Copyright 2012 Google, Inc. 
- */ - -#include "bcache.h" -#include "buckets.h" -#include "clock.h" -#include "extents.h" -#include "io.h" -#include "keylist.h" -#include "move.h" -#include "movinggc.h" - -#include <trace/events/bcache.h> -#include <linux/freezer.h> -#include <linux/kthread.h> - -/* Moving GC - IO loop */ - -static bool moving_pred(struct scan_keylist *kl, struct bkey_s_c k) -{ - struct cache *ca = container_of(kl, struct cache, - moving_gc_queue.keys); - struct cache_set *c = ca->set; - const struct bch_extent_ptr *ptr; - bool ret = false; - - if (bkey_extent_is_data(k.k)) { - struct bkey_s_c_extent e = bkey_s_c_to_extent(k); - - rcu_read_lock(); - extent_for_each_ptr(e, ptr) - if (PTR_CACHE(c, ptr) == ca && - PTR_BUCKET(ca, ptr)->copygc_gen) - ret = true; - rcu_read_unlock(); - } - - return ret; -} - -static int issue_moving_gc_move(struct moving_queue *q, - struct moving_context *ctxt, - struct bkey_i *k) -{ - struct cache *ca = container_of(q, struct cache, moving_gc_queue); - struct cache_set *c = ca->set; - struct bkey_s_extent e; - struct bch_extent_ptr *ptr; - struct moving_io *io; - unsigned gen; - - io = moving_io_alloc(bkey_i_to_s_c(k)); - if (!io) { - trace_bcache_moving_gc_alloc_fail(c, k->k.size); - return -ENOMEM; - } - - bch_replace_init(&io->replace, bkey_i_to_s_c(k)); - - bch_write_op_init(&io->op, c, &io->wbio, - (struct disk_reservation) { 0 }, - NULL, bkey_i_to_s_c(k), - &io->replace.hook, NULL, - bkey_extent_is_cached(&k->k) - ? BCH_WRITE_CACHED : 0); - io->op.nr_replicas = 1; - - e = bkey_i_to_s_extent(&io->op.insert_key); - - extent_for_each_ptr(e, ptr) - if ((ca->sb.nr_this_dev == ptr->dev) && - (gen = PTR_BUCKET(ca, ptr)->copygc_gen)) { - gen--; - BUG_ON(gen > ARRAY_SIZE(ca->gc_buckets)); - io->op.wp = &ca->gc_buckets[gen]; - io->sort_key = ptr->offset; - bch_extent_drop_ptr(e, ptr); - goto found; - } - - /* We raced - bucket's been reused */ - moving_io_free(io); - goto out; -found: - trace_bcache_gc_copy(&k->k); - - /* - * IMPORTANT: We must call bch_data_move before we dequeue so - * that the key can always be found in either the pending list - * in the moving queue or in the scan keylist list in the - * moving queue. - * If we reorder, there is a window where a key is not found - * by btree gc marking. 
- */ - bch_data_move(q, ctxt, io); -out: - bch_scan_keylist_dequeue(&q->keys); - return 0; -} - -static void read_moving(struct cache *ca, struct moving_context *ctxt) -{ - struct bkey_i *k; - bool again; - - bch_ratelimit_reset(&ca->moving_gc_pd.rate); - - do { - again = false; - - while (!bch_moving_context_wait(ctxt)) { - if (bch_queue_full(&ca->moving_gc_queue)) { - if (ca->moving_gc_queue.rotational) { - again = true; - break; - } else { - bch_moving_wait(ctxt); - continue; - } - } - - k = bch_scan_keylist_next_rescan( - ca->set, - &ca->moving_gc_queue.keys, - &ctxt->last_scanned, - POS_MAX, - moving_pred); - - if (k == NULL) - break; - - if (issue_moving_gc_move(&ca->moving_gc_queue, - ctxt, k)) { - /* - * Memory allocation failed; we will wait for - * all queued moves to finish and continue - * scanning starting from the same key - */ - again = true; - break; - } - } - - bch_queue_run(&ca->moving_gc_queue, ctxt); - } while (!kthread_should_stop() && again); -} - -static void bch_moving_gc(struct cache *ca) -{ - struct cache_set *c = ca->set; - struct bucket *g; - - u64 sectors_to_move, sectors_gen, gen_current, sectors_total; - size_t buckets_to_move, buckets_unused = 0; - struct bucket_heap_entry e; - unsigned sectors_used, i; - int reserve_sectors; - - struct moving_context ctxt; - - bch_moving_context_init(&ctxt, &ca->moving_gc_pd.rate, - MOVING_PURPOSE_COPY_GC); - - /* - * We won't fill up the moving GC reserve completely if the data - * being copied is from different generations. In the worst case, - * there will be NUM_GC_GENS buckets of internal fragmentation - */ - - spin_lock(&ca->freelist_lock); - reserve_sectors = ca->mi.bucket_size * - (fifo_used(&ca->free[RESERVE_MOVINGGC]) - NUM_GC_GENS); - spin_unlock(&ca->freelist_lock); - - if (reserve_sectors < (int) c->sb.block_size) { - trace_bcache_moving_gc_reserve_empty(ca); - return; - } - - trace_bcache_moving_gc_start(ca); - - /* - * Find buckets with lowest sector counts, skipping completely - * empty buckets, by building a maxheap sorted by sector count, - * and repeatedly replacing the maximum element until all - * buckets have been visited. - */ - - mutex_lock(&ca->heap_lock); - ca->heap.used = 0; - for_each_bucket(g, ca) { - g->copygc_gen = 0; - - if (bucket_unused(g)) { - buckets_unused++; - continue; - } - - if (g->mark.owned_by_allocator || - g->mark.is_metadata) - continue; - - sectors_used = bucket_sectors_used(g); - - if (sectors_used >= ca->mi.bucket_size) - continue; - - bucket_heap_push(ca, g, sectors_used); - } - - sectors_to_move = 0; - for (i = 0; i < ca->heap.used; i++) - sectors_to_move += ca->heap.data[i].val; - - /* XXX: calculate this threshold rigorously */ - - if (ca->heap.used < ca->free_inc.size / 2 && - sectors_to_move < reserve_sectors) { - mutex_unlock(&ca->heap_lock); - trace_bcache_moving_gc_no_work(ca); - return; - } - - while (sectors_to_move > reserve_sectors) { - BUG_ON(!heap_pop(&ca->heap, e, bucket_min_cmp)); - sectors_to_move -= e.val; - } - - buckets_to_move = ca->heap.used; - - /* - * resort by write_prio to group into generations, attempts to - * keep hot and cold data in the same locality. 
- */ - - mutex_lock(&ca->set->bucket_lock); - for (i = 0; i < ca->heap.used; i++) { - struct bucket_heap_entry *e = &ca->heap.data[i]; - - e->val = (c->prio_clock[WRITE].hand - e->g->write_prio); - } - - heap_resort(&ca->heap, bucket_max_cmp); - - sectors_gen = sectors_to_move / NUM_GC_GENS; - gen_current = 1; - sectors_total = 0; - - while (heap_pop(&ca->heap, e, bucket_max_cmp)) { - sectors_total += bucket_sectors_used(e.g); - e.g->copygc_gen = gen_current; - if (gen_current < NUM_GC_GENS && - sectors_total >= sectors_gen * gen_current) - gen_current++; - } - mutex_unlock(&ca->set->bucket_lock); - - mutex_unlock(&ca->heap_lock); - - read_moving(ca, &ctxt); - - trace_bcache_moving_gc_end(ca, ctxt.sectors_moved, ctxt.keys_moved, - buckets_to_move); -} - -static int bch_moving_gc_thread(void *arg) -{ - struct cache *ca = arg; - struct cache_set *c = ca->set; - struct io_clock *clock = &c->io_clock[WRITE]; - unsigned long last; - s64 next; - - set_freezable(); - - while (!kthread_should_stop()) { - if (kthread_wait_freezable(c->copy_gc_enabled)) - break; - - last = atomic_long_read(&clock->now); - /* - * don't start copygc until less than half the gc reserve is - * available: - */ - next = (buckets_available_cache(ca) - - div64_u64((ca->mi.nbuckets - ca->mi.first_bucket) * - c->opts.gc_reserve_percent, 200)) * - ca->mi.bucket_size; - - if (next <= 0) - bch_moving_gc(ca); - else - bch_kthread_io_clock_wait(clock, last + next); - } - - return 0; -} - -#define MOVING_GC_KEYS_MAX_SIZE DFLT_SCAN_KEYLIST_MAX_SIZE -#define MOVING_GC_NR 64 -#define MOVING_GC_READ_NR 32 -#define MOVING_GC_WRITE_NR 32 - -int bch_moving_init_cache(struct cache *ca) -{ - bool rotational = !blk_queue_nonrot(bdev_get_queue(ca->disk_sb.bdev)); - - bch_pd_controller_init(&ca->moving_gc_pd); - ca->moving_gc_pd.d_term = 0; - - return bch_queue_init(&ca->moving_gc_queue, - ca->set, - MOVING_GC_KEYS_MAX_SIZE, - MOVING_GC_NR, - MOVING_GC_READ_NR, - MOVING_GC_WRITE_NR, - rotational, - "bch_copygc_write"); -} - -int bch_moving_gc_thread_start(struct cache *ca) -{ - struct task_struct *t; - - /* The moving gc read thread must be stopped */ - BUG_ON(ca->moving_gc_read != NULL); - - bch_queue_start(&ca->moving_gc_queue); - - if (cache_set_init_fault("moving_gc_start")) - return -ENOMEM; - - t = kthread_create(bch_moving_gc_thread, ca, "bch_copygc_read"); - if (IS_ERR(t)) - return PTR_ERR(t); - - ca->moving_gc_read = t; - wake_up_process(ca->moving_gc_read); - - return 0; -} - -void bch_moving_gc_stop(struct cache *ca) -{ - ca->moving_gc_pd.rate.rate = UINT_MAX; - bch_ratelimit_reset(&ca->moving_gc_pd.rate); - - bch_queue_stop(&ca->moving_gc_queue); - - if (ca->moving_gc_read) - kthread_stop(ca->moving_gc_read); - ca->moving_gc_read = NULL; - - /* - * Make sure that it is empty so that gc marking doesn't keep - * marking stale entries from when last used. 
- */ - bch_scan_keylist_reset(&ca->moving_gc_queue.keys); -} - -void bch_moving_gc_destroy(struct cache *ca) -{ - bch_queue_destroy(&ca->moving_gc_queue); -} diff --git a/fs/bcachefs/movinggc.h b/fs/bcachefs/movinggc.h deleted file mode 100644 index 5d09e0fa3ae1..000000000000 --- a/fs/bcachefs/movinggc.h +++ /dev/null @@ -1,9 +0,0 @@ -#ifndef _BCACHE_MOVINGGC_H -#define _BCACHE_MOVINGGC_H - -int bch_moving_init_cache(struct cache *); -void bch_moving_gc_stop(struct cache *); -int bch_moving_gc_thread_start(struct cache *); -void bch_moving_gc_destroy(struct cache *); - -#endif diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c new file mode 100644 index 000000000000..cedad5462da3 --- /dev/null +++ b/fs/bcachefs/rebalance.c @@ -0,0 +1,467 @@ +/* + * Copygc, tiering: + */ + +#include "bcache.h" +#include "btree_iter.h" +#include "buckets.h" +#include "clock.h" +#include "io.h" +#include "move.h" + +#include <trace/events/bcache.h> +#include <linux/freezer.h> +#include <linux/kthread.h> +#include <linux/bsearch.h> +#include <linux/sort.h> + +/* + * XXX preserve ordering when reads complete out of order + * + * do performance testing with disk write cache off + */ + +static inline bool rebalance_entry_sectors_cmp(struct rebalance_bucket_entry l, + struct rebalance_bucket_entry r) +{ + return l.sectors < r.sectors; +} + +static int rebalance_entry_bucket_cmp(const void *_l, const void *_r) +{ + const struct rebalance_bucket_entry *l = _l; + const struct rebalance_bucket_entry *r = _r; + + if (l->dev != r->dev) + return l->dev < r->dev ? -1 : 1; + if (l->bucket != r->bucket) + return l->bucket < r->bucket ? -1 : 1; + return 0; +} + +static inline void rebalance_heap_push(struct rebalance_thread *r, + size_t bucket, u8 dev, + u8 gen, unsigned sectors) +{ + struct rebalance_bucket_entry new = { + .bucket = bucket, + .dev = dev, + .gen = gen, + .sectors = sectors, + }; + + if (!heap_full(&r->heap)) + heap_add(&r->heap, new, rebalance_entry_sectors_cmp); + else if (rebalance_entry_sectors_cmp(new, heap_peek(&r->heap))) { + r->heap.data[0] = new; + heap_sift(&r->heap, 0, rebalance_entry_sectors_cmp); + } +} + +/* returns nr of extents that should be written to this tier: */ +static unsigned should_tier_extent(struct cache_set *c, + struct rebalance_thread *r, + struct cache_member_rcu *mi, + struct bkey_s_c_extent e) +{ + const struct bch_extent_ptr *ptr; + unsigned replicas = 0; + + /* Make sure we have room to add a new pointer: */ + if (bkey_val_u64s(e.k) + BKEY_EXTENT_PTR_U64s_MAX > + BKEY_EXTENT_VAL_U64s_MAX) + return false; + + extent_for_each_ptr(e, ptr) + if (PTR_TIER(mi, ptr) >= r->tier) + replicas++; + + return replicas < c->opts.data_replicas + ? 
c->opts.data_replicas - replicas + : 0; +} + +static bool should_copygc_ptr(struct cache_set *c, + struct rebalance_thread *r, + struct cache_member_rcu *mi, + const struct bch_extent_ptr *ptr) +{ + struct cache *ca; + bool ret = false; + + if (PTR_TIER(mi, ptr) == r->tier && + (ca = PTR_CACHE(c, ptr))) { + struct rebalance_bucket_entry *e, s = { + .dev = ptr->dev, + .bucket = PTR_BUCKET_NR(ca, ptr), + }; + + mutex_lock(&r->heap_lock); + + e = bsearch(&s, + r->heap.data, + r->heap.used, + sizeof(r->heap.data[0]), + rebalance_entry_bucket_cmp); + if (e && + e->gen == ptr->gen && + e->gen == PTR_BUCKET_GEN(ca, ptr)) + ret = true; + + mutex_unlock(&r->heap_lock); + } + + return ret; +} + +static bool rebalance_pred(struct cache_set *c, + struct rebalance_thread *r, + struct bkey_s_c k) +{ + bool need_tier = false, need_copygc = false; + + if (bkey_extent_is_data(k.k)) { + struct bkey_s_c_extent e = bkey_s_c_to_extent(k); + const struct bch_extent_ptr *ptr; + struct cache_member_rcu *mi = cache_member_info_get(c); + + if (should_tier_extent(c, r, mi, e)) + need_tier = true; + + extent_for_each_ptr(e, ptr) + if (should_copygc_ptr(c, r, mi, ptr)) + need_copygc = true; + + cache_member_info_put(); + } + + return need_tier || need_copygc; +} + +static int rebalance_extent(struct cache_set *c, + struct rebalance_thread *r, + struct bkey_s_c k, + struct move_context *m) +{ + struct bkey_s_extent e; + struct bch_extent_ptr *ptr; + struct moving_io *io; + unsigned nr_new_extents; + bool have_faster_extent = false; + struct cache_member_rcu *mi; + + io = bch_moving_io_alloc(k); + if (!io) { + //trace_bcache_moving_gc_alloc_fail(c, k.k->size); + return -ENOMEM; + } + + bch_replace_init(&io->replace, k); + + /* How the piss are reserves going to work? */ + + bch_write_op_init(&io->op, c, &io->bio, + (struct disk_reservation) { 0 }, + &r->wp, k, + &io->replace.hook, NULL, + bkey_extent_is_cached(k.k) + ? 
BCH_WRITE_CACHED : 0); + + io->op.io_wq = r->wq; + + e = bkey_i_to_s_extent(&io->op.insert_key); + + mi = cache_member_info_get(c); + + nr_new_extents = should_tier_extent(c, r, mi, e.c); + + extent_for_each_ptr_backwards(e, ptr) { + if (PTR_TIER(mi, ptr) < r->tier) { + if (have_faster_extent) + bch_extent_drop_ptr(e, ptr); + else + have_faster_extent = true; + } + + if (should_copygc_ptr(c, r, mi, ptr)) { + bch_extent_drop_ptr(e, ptr); + nr_new_extents++; + } + } + + cache_member_info_put(); + + if (!nr_new_extents) { + /* We raced - bucket's been reused */ + bch_moving_io_free(io); + return 0; + } + io->op.nr_replicas = nr_new_extents; + + bch_data_move(m, io); + return 0; +} + +static void rebalance_walk_extents(struct cache_set *c, + struct rebalance_thread *r) +{ + struct btree_iter iter; + struct bkey_s_c k; + struct move_context m; + + move_context_init(&m); + bch_ratelimit_reset(&r->pd.rate); + + for_each_btree_key(&iter, c, BTREE_ID_EXTENTS, POS_MIN, k) { + if (kthread_should_stop()) + break; + + if (rebalance_pred(c, r, k)) { + BKEY_PADDED(k) tmp; + + bkey_reassemble(&tmp.k, k); + bch_btree_iter_unlock(&iter); + + rebalance_extent(c, r, + bkey_i_to_s_c(&tmp.k), + &m); + } + + bch_btree_iter_cond_resched(&iter); + } + bch_btree_iter_unlock(&iter); + + closure_sync(&m.cl); +} + +static void bch_rebalance(struct cache_set *c, struct rebalance_thread *r) +{ + struct cache_group devs, *tier = &c->cache_tiers[r->tier]; + struct rebalance_bucket_entry e; + unsigned i, seq, sectors_used; + u64 sectors_to_move, reserve_sectors = 0; + size_t buckets_unused = 0; + + rcu_read_lock(); + + do { + seq = read_seqcount_begin(&tier->lock); + devs = *tier; + } while (read_seqcount_retry(&tier->lock, seq)); + + for (i = 0; i < devs.nr_devices; i++) + percpu_ref_get(&rcu_dereference(devs.devices[i])->ref); + + rcu_read_unlock(); + + mutex_lock(&r->heap_lock); + + r->heap.used = 0; + + for (i = 0; i < devs.nr_devices; i++) { + struct cache *ca = + rcu_dereference_protected(devs.devices[i], 1); + size_t bucket; + + spin_lock(&ca->freelist_lock); + reserve_sectors += ca->mi.bucket_size * + fifo_used(&ca->free[RESERVE_MOVINGGC]); + spin_unlock(&ca->freelist_lock); + + for (bucket = ca->mi.first_bucket; + bucket < ca->mi.nbuckets; + bucket++) { + struct bucket *g = ca->buckets + bucket; + + if (bucket_unused(g)) { + buckets_unused++; + continue; + } + + if (g->mark.owned_by_allocator || + g->mark.is_metadata) + continue; + + sectors_used = bucket_sectors_used(g); + + if (sectors_used >= ca->mi.bucket_size) + continue; + + rebalance_heap_push(r, bucket, ca->sb.nr_this_dev, + ca->bucket_gens[bucket], + sectors_used); + } + } + + /* + * Problems... + * XXX: wait on the allocator? perhaps the allocator just hasn't + * invalidated/discarded buckets we freed up from our last run? 
+ */
+ if (!reserve_sectors)
+ goto out_put;
+
+ sectors_to_move = 0;
+ for (i = 0; i < r->heap.used; i++)
+ sectors_to_move += r->heap.data[i].sectors;
+
+ /*
+ * If there's not enough work to do, bail out so we aren't scanning the
+ * btree unnecessarily:
+ *
+ * XXX: calculate this threshold rigorously
+ */
+#if 0
+ if (r->heap.used < ca->free_inc.size / 2 &&
+ sectors_to_move < reserve_sectors)
+ goto out_put;
+#endif
+
+ /* Pop buckets off until they fit into our reserve: */
+ while (sectors_to_move > reserve_sectors) {
+ BUG_ON(!heap_pop(&r->heap, e, rebalance_entry_sectors_cmp));
+ sectors_to_move -= e.sectors;
+ }
+
+ sort(r->heap.data,
+ r->heap.used,
+ sizeof(r->heap.data[0]),
+ rebalance_entry_bucket_cmp,
+ NULL);
+
+ mutex_unlock(&r->heap_lock);
+
+ for (i = 0; i < devs.nr_devices; i++)
+ percpu_ref_put(&rcu_dereference_protected(devs.devices[i],
+ 1)->ref);
+
+ rebalance_walk_extents(c, r);
+ return;
+
+out_put:
+ mutex_unlock(&r->heap_lock);
+ for (i = 0; i < devs.nr_devices; i++)
+ percpu_ref_put(&rcu_dereference(devs.devices[i])->ref);
+}
+
+static int bch_rebalance_thread(void *arg)
+{
+ struct rebalance_thread *r = arg;
+ struct cache_set *c = container_of(r, struct cache_set,
+ rebalance[r->tier]);
+ struct io_clock *clock = &c->io_clock[WRITE];
+ unsigned long last;
+ //bool moved;
+
+ while (!kthread_should_stop()) {
+ if (kthread_wait_freezable(c->copy_gc_enabled ||
+ c->tiering_enabled))
+ break;
+
+ last = atomic_long_read(&clock->now);
+
+ bch_rebalance(c, r);
+
+ /*
+ * This really should be library code, but it has to be
+ * kthread specific... ugh
+ */
+#if 0
+ if (!moved)
+ bch_kthread_io_clock_wait(clock,
+ last + ca->free_inc.size / 2);
+#endif
+ }
+
+ return 0;
+}
+
+static void bch_rebalance_exit_tier(struct rebalance_thread *r)
+{
+ if (r->p)
+ kthread_stop(r->p);
+ r->p = NULL;
+ if (r->wq)
+ destroy_workqueue(r->wq);
+ r->wq = NULL;
+ free_heap(&r->heap);
+}
+
+void bch_rebalance_exit(struct cache_set *c)
+{
+ unsigned i;
+
+ for (i = 0; i < ARRAY_SIZE(c->rebalance); i++)
+ bch_rebalance_exit_tier(&c->rebalance[i]);
+}
+
+/*
+ * Called whenever we add a device - initializes the per tier rebalance thread,
+ * or resizes the heap if necessary
+ */
+int bch_rebalance_init(struct cache_set *c, struct cache *ca)
+{
+ unsigned tier = ca->mi.tier;
+ struct rebalance_thread *r = &c->rebalance[tier];
+ struct task_struct *p;
+ u64 nbuckets = 0;
+ size_t heap_size;
+ unsigned i;
+ typeof(r->heap) old_heap;
+
+ lockdep_assert_held(&bch_register_lock);
+
+ if (!r->initialized) {
+ r->tier = tier;
+ mutex_init(&r->heap_lock);
+ r->wp.group = &c->cache_tiers[tier];
+ r->wp.reserve = RESERVE_MOVINGGC; /* XXX */
+ r->initialized = 1;
+ }
+
+ if (!r->wq)
+ r->wq = create_workqueue("bch_rebalance_io");
+ if (!r->wq)
+ return -ENOMEM;
+
+ if (!r->p) {
+ p = kthread_create(bch_rebalance_thread, r,
+ "bch_rebalance");
+ if (IS_ERR(p))
+ return PTR_ERR(p);
+
+ r->p = p;
+ }
+
+ /* ca hasn't been added to array of devices yet: */
+ nbuckets += ca->mi.nbuckets;
+
+ rcu_read_lock();
+ for_each_cache_rcu(ca, c, i)
+ if (ca->mi.tier == tier)
+ nbuckets += ca->mi.nbuckets;
+ rcu_read_unlock();
+
+ mutex_lock(&r->heap_lock);
+ old_heap = r->heap;
+
+ heap_size = max_t(size_t, nbuckets >> 7, old_heap.used);
+ BUG_ON(!heap_size);
+
+ if (!init_heap(&r->heap, heap_size, GFP_KERNEL)) {
+ mutex_unlock(&r->heap_lock);
+ return -ENOMEM;
+ }
+
+ if (old_heap.data) {
+ memcpy(r->heap.data,
+ old_heap.data,
+ sizeof(old_heap.data[0]) * old_heap.used);
+ r->heap.used = old_heap.used;
+ 
free_heap(&old_heap); + } + + mutex_unlock(&r->heap_lock); + + return 0; +} diff --git a/fs/bcachefs/rebalance.h b/fs/bcachefs/rebalance.h new file mode 100644 index 000000000000..3a15dff7bdff --- /dev/null +++ b/fs/bcachefs/rebalance.h @@ -0,0 +1,7 @@ +#ifndef _BCACHE_REBALANCE_H +#define _BCACHE_REBALANCE_H + +void bch_rebalance_exit(struct cache_set *); +int bch_rebalance_init(struct cache_set *, struct cache *ca); + +#endif /* _BCACHE_REBALANCE_H */ diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index ef8fb0dac003..beb0587be4ce 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -24,11 +24,10 @@ #include "keylist.h" #include "move.h" #include "migrate.h" -#include "movinggc.h" #include "notify.h" +#include "rebalance.h" #include "stats.h" #include "super.h" -#include "tier.h" #include "writeback.h" #include <linux/backing-dev.h> @@ -683,15 +682,6 @@ static void __bch_cache_set_read_only(struct cache_set *c) struct cache *ca; unsigned i; - c->tiering_pd.rate.rate = UINT_MAX; - bch_ratelimit_reset(&c->tiering_pd.rate); - bch_tiering_read_stop(c); - - for_each_cache(ca, c, i) { - bch_tiering_write_stop(ca); - bch_moving_gc_stop(ca); - } - bch_gc_thread_stop(c); bch_btree_flush(c); @@ -804,7 +794,6 @@ void bch_cache_set_read_only_sync(struct cache_set *c) static const char *__bch_cache_set_read_write(struct cache_set *c) { - struct cache *ca; const char *err; unsigned i; @@ -822,22 +811,9 @@ static const char *__bch_cache_set_read_write(struct cache_set *c) if (bch_gc_thread_start(c)) goto err; - for_each_cache(ca, c, i) { - if (ca->mi.state != CACHE_ACTIVE) - continue; - - err = "error starting moving GC thread"; - if (bch_moving_gc_thread_start(ca)) { - percpu_ref_put(&ca->ref); - goto err; - } - - bch_tiering_write_start(ca); - } - - err = "error starting tiering thread"; - if (bch_tiering_read_start(c)) - goto err; + for (i = 0; i < ARRAY_SIZE(c->rebalance); i++) + if (c->rebalance[i].p) + wake_up_process(c->rebalance[i].p); schedule_delayed_work(&c->pd_controllers_update, 5 * HZ); @@ -877,6 +853,7 @@ static void cache_set_free(struct cache_set *c) cancel_work_sync(&c->bio_submit_work); cancel_work_sync(&c->read_retry_work); + bch_rebalance_exit(c); bch_bset_sort_state_free(&c->sort); bch_btree_cache_free(c); bch_journal_free(&c->journal); @@ -1061,11 +1038,8 @@ static struct cache_set *bch_cache_set_alloc(struct cache_sb *sb, mutex_init(&c->btree_root_lock); INIT_WORK(&c->read_only_work, bch_cache_set_read_only_work); mutex_init(&c->mi_lock); - init_rwsem(&c->gc_lock); mutex_init(&c->trigger_gc_lock); - mutex_init(&c->gc_scan_keylist_lock); - INIT_LIST_HEAD(&c->gc_scan_keylists); #define BCH_TIME_STAT(name, frequency_units, duration_units) \ spin_lock_init(&c->name##_time.lock); @@ -1073,7 +1047,6 @@ static struct cache_set *bch_cache_set_alloc(struct cache_sb *sb, #undef BCH_TIME_STAT bch_open_buckets_init(c); - bch_tiering_init_cache_set(c); INIT_LIST_HEAD(&c->list); INIT_LIST_HEAD(&c->cached_devs); @@ -1507,8 +1480,7 @@ static void __bch_cache_read_only(struct cache *ca) { trace_bcache_cache_read_only(ca); - bch_tiering_write_stop(ca); - bch_moving_gc_stop(ca); + /* XXX do stuff with rebalance thread */ /* * This stops new data writes (e.g. to existing open data @@ -1564,19 +1536,12 @@ static const char *__bch_cache_read_write(struct cache *ca) trace_bcache_cache_read_write(ca); - bch_tiering_write_start(ca); - - trace_bcache_cache_read_write_done(ca); - - /* XXX wtf? 
*/ - return NULL; - - err = "error starting moving GC thread"; - if (!bch_moving_gc_thread_start(ca)) - err = NULL; + if (bch_cache_allocator_start(ca)) + return "error starting allocator thread"; - wake_up_process(ca->set->tiering_read); + /* XXX notify rebalance thread? */ + trace_bcache_cache_read_write_done(ca); bch_notify_cache_read_write(ca); return err; @@ -1633,8 +1598,6 @@ static void bch_cache_free_work(struct work_struct *work) * to unregister them before we drop our reference to * @c. */ - bch_moving_gc_destroy(ca); - bch_tiering_write_destroy(ca); cancel_work_sync(&ca->io_error_work); @@ -1890,9 +1853,6 @@ static const char *cache_alloc(struct bcache_superblock *sb, kobject_init(&ca->kobj, &bch_cache_ktype); - seqcount_init(&ca->self.lock); - ca->self.nr_devices = 1; - rcu_assign_pointer(ca->self.devices[0], ca); ca->sb.nr_this_dev = sb->sb->nr_this_dev; INIT_WORK(&ca->free_work, bch_cache_free_work); @@ -1919,8 +1879,7 @@ static const char *cache_alloc(struct bcache_superblock *sb, ca->bucket_bits = ilog2(ca->mi.bucket_size); /* XXX: tune these */ - movinggc_reserve = max_t(size_t, NUM_GC_GENS * 2, - ca->mi.nbuckets >> 7); + movinggc_reserve = ca->mi.nbuckets >> 7; reserve_none = max_t(size_t, 4, ca->mi.nbuckets >> 9); free_inc_reserve = reserve_none << 1; heap_size = max_t(size_t, free_inc_reserve, movinggc_reserve); @@ -1946,8 +1905,7 @@ static const char *cache_alloc(struct bcache_superblock *sb, bioset_init(&ca->replica_set, 4, offsetof(struct bch_write_bio, bio.bio)) || !(ca->sectors_written = alloc_percpu(*ca->sectors_written)) || - bch_moving_init_cache(ca) || - bch_tiering_init_cache(ca)) + bch_rebalance_init(c, ca)) goto err; ca->prio_last_buckets = ca->prio_buckets + prio_buckets(ca); @@ -1957,20 +1915,6 @@ static const char *cache_alloc(struct bcache_superblock *sb, total_reserve += ca->free[i].size; pr_debug("%zu buckets reserved", total_reserve); - for (i = 0; i < ARRAY_SIZE(ca->gc_buckets); i++) { - ca->gc_buckets[i].reserve = RESERVE_MOVINGGC; - ca->gc_buckets[i].group = &ca->self; - } - - ca->tiering_write_point.reserve = RESERVE_NONE; - ca->tiering_write_point.group = &ca->self; - - /* XXX: scan keylists will die */ - bch_scan_keylist_init(&ca->moving_gc_queue.keys, c, - DFLT_SCAN_KEYLIST_MAX_SIZE); - bch_scan_keylist_init(&ca->tiering_queue.keys, c, - DFLT_SCAN_KEYLIST_MAX_SIZE); - kobject_get(&c->kobj); ca->set = c; diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index a901b5d8368a..446552c460ec 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -138,14 +138,14 @@ rw_attribute(cache_replacement_policy); rw_attribute(foreground_write_ratelimit_enabled); rw_attribute(copy_gc_enabled); -sysfs_queue_attribute(copy_gc); -sysfs_pd_controller_attribute(copy_gc); +//sysfs_queue_attribute(copy_gc); +//sysfs_pd_controller_attribute(copy_gc); rw_attribute(tiering_enabled); rw_attribute(tiering_percent); sysfs_pd_controller_attribute(tiering); -sysfs_queue_attribute(tiering); -rw_attribute(tiering_stripe_size); +//sysfs_queue_attribute(tiering); +//rw_attribute(tiering_stripe_size); sysfs_pd_controller_attribute(foreground_write); @@ -701,7 +701,7 @@ SHOW(bch_cache_set) sysfs_printf(tiering_enabled, "%i", c->tiering_enabled); sysfs_print(tiering_percent, c->tiering_percent); - sysfs_pd_controller_show(tiering, &c->tiering_pd); + //sysfs_pd_controller_show(tiering, &c->tiering_pd); sysfs_print(btree_flush_delay, c->btree_flush_delay); @@ -781,23 +781,26 @@ STORE(__bch_cache_set) c->foreground_write_ratelimit_enabled); if (attr == 
&sysfs_copy_gc_enabled) { - struct cache *ca; - unsigned i; ssize_t ret = strtoul_safe(buf, c->copy_gc_enabled) ?: (ssize_t) size; +#if 0 + struct cache *ca; + unsigned i; for_each_cache(ca, c, i) if (ca->moving_gc_read) wake_up_process(ca->moving_gc_read); +#endif return ret; } if (attr == &sysfs_tiering_enabled) { ssize_t ret = strtoul_safe(buf, c->tiering_enabled) ?: (ssize_t) size; - +#if 0 if (c->tiering_read) wake_up_process(c->tiering_read); +#endif return ret; } @@ -807,7 +810,6 @@ STORE(__bch_cache_set) if (attr == &sysfs_journal_flush) { bch_journal_meta_async(&c->journal, NULL); - return size; } @@ -816,7 +818,7 @@ STORE(__bch_cache_set) sysfs_strtoul(foreground_target_percent, c->foreground_target_percent); sysfs_strtoul(tiering_percent, c->tiering_percent); - sysfs_pd_controller_store(tiering, &c->tiering_pd); + //sysfs_pd_controller_store(tiering, &c->tiering_pd); /* Debugging: */ @@ -1210,13 +1212,13 @@ SHOW(bch_cache) sysfs_print(free_buckets, buckets_free_cache(ca, RESERVE_NONE)); sysfs_print(has_data, ca->mi.has_data); sysfs_print(has_metadata, ca->mi.has_metadata); - +#if 0 sysfs_pd_controller_show(copy_gc, &ca->moving_gc_pd); sysfs_queue_show(copy_gc, &ca->moving_gc_queue); sysfs_queue_show(tiering, &ca->tiering_queue); sysfs_print(tiering_stripe_size, ca->tiering_stripe_size); - +#endif if (attr == &sysfs_cache_replacement_policy) return bch_snprint_string_list(buf, PAGE_SIZE, cache_replacement_policies, @@ -1250,13 +1252,13 @@ STORE(__bch_cache) struct cache *ca = container_of(kobj, struct cache, kobj); struct cache_set *c = ca->set; struct cache_member *mi = &c->disk_mi[ca->sb.nr_this_dev]; - +#if 0 sysfs_pd_controller_store(copy_gc, &ca->moving_gc_pd); sysfs_queue_store(copy_gc, &ca->moving_gc_queue); sysfs_queue_store(tiering, &ca->tiering_queue); sysfs_strtoul(tiering_stripe_size, ca->tiering_stripe_size); - +#endif if (attr == &sysfs_discard) { bool v = strtoul_or_return(buf); @@ -1377,10 +1379,6 @@ static struct attribute *bch_cache_files[] = { &sysfs_state_rw, &sysfs_alloc_debug, - sysfs_pd_controller_files(copy_gc), - sysfs_queue_files(copy_gc), - sysfs_queue_files(tiering), - &sysfs_tiering_stripe_size, NULL }; KTYPE(bch_cache); diff --git a/fs/bcachefs/tier.c b/fs/bcachefs/tier.c deleted file mode 100644 index caf6b3df2c9c..000000000000 --- a/fs/bcachefs/tier.c +++ /dev/null @@ -1,466 +0,0 @@ - -#include "bcache.h" -#include "btree_iter.h" -#include "buckets.h" -#include "clock.h" -#include "extents.h" -#include "io.h" -#include "keylist.h" -#include "move.h" -#include "tier.h" - -#include <linux/delay.h> -#include <linux/freezer.h> -#include <linux/kthread.h> -#include <trace/events/bcache.h> - -/** - * tiering_pred - check if tiering should copy an extent to tier 1 - */ -static bool tiering_pred(struct scan_keylist *kl, struct bkey_s_c k) -{ - struct cache *ca = container_of(kl, struct cache, - tiering_queue.keys); - struct cache_set *c = ca->set; - - if (bkey_extent_is_data(k.k)) { - struct bkey_s_c_extent e = bkey_s_c_to_extent(k); - const struct bch_extent_ptr *ptr; - struct cache_member_rcu *mi; - unsigned replicas = 0; - - /* Make sure we have room to add a new pointer: */ - if (bkey_val_u64s(e.k) + BKEY_EXTENT_PTR_U64s_MAX > - BKEY_EXTENT_VAL_U64s_MAX) - return false; - - mi = cache_member_info_get(c); - extent_for_each_ptr(e, ptr) - if (ptr->dev < mi->nr_in_set && - mi->m[ptr->dev].tier) - replicas++; - cache_member_info_put(); - - return replicas < c->opts.data_replicas; - } - - return false; -} - -struct tiering_refill { - struct bpos start; - 
struct cache *ca; - int cache_iter; - u64 sectors; -}; - -static void refill_done(struct tiering_refill *refill) -{ - if (refill->ca) { - percpu_ref_put(&refill->ca->ref); - refill->ca = NULL; - } -} - -/** - * refill_next - move on to refilling the next cache's tiering keylist - */ -static void refill_next(struct cache_set *c, struct tiering_refill *refill) -{ - struct cache_group *tier; - - refill_done(refill); - - rcu_read_lock(); - tier = &c->cache_tiers[1]; - if (tier->nr_devices == 0) - goto out; - - while (1) { - while (refill->cache_iter < tier->nr_devices) { - refill->ca = rcu_dereference( - tier->devices[refill->cache_iter]); - if (refill->ca != NULL) { - percpu_ref_get(&refill->ca->ref); - goto out; - } - refill->cache_iter++; - } - - /* Reached the end, wrap around */ - refill->cache_iter = 0; - } - -out: - rcu_read_unlock(); -} - -/* - * refill_init - Start refilling a random cache device -- this ensures we - * distribute data sanely even if each tiering pass discovers only a few - * keys to tier - */ -static void refill_init(struct cache_set *c, struct tiering_refill *refill) -{ - struct cache_group *tier; - - memset(refill, 0, sizeof(*refill)); - refill->start = POS_MIN; - - rcu_read_lock(); - tier = &c->cache_tiers[1]; - if (tier->nr_devices != 0) - refill->cache_iter = bch_rand_range(tier->nr_devices); - rcu_read_unlock(); - - refill_next(c, refill); -} - -/** - * tiering_keylist_full - we accumulate tiering_stripe_size sectors in a cache - * device's tiering keylist before we move on to the next cache device - */ -static bool tiering_keylist_full(struct tiering_refill *refill) -{ - return (refill->sectors >= refill->ca->tiering_stripe_size); -} - -/** - * tiering_keylist_empty - to prevent a keylist from growing to more than twice - * the tiering stripe size, we stop refill when a keylist has more than a single - * stripe of sectors - */ -static bool tiering_keylist_empty(struct cache *ca) -{ - return (bch_scan_keylist_sectors(&ca->tiering_queue.keys) - <= ca->tiering_stripe_size); -} - -/** - * tiering_refill - to keep all queues busy as much as possible, we add - * up to a single stripe of sectors to each cache device's queue, iterating - * over all cache devices twice, so each one has two stripe's of writes - * queued up, before we have to wait for move operations to complete. - */ -static void tiering_refill(struct cache_set *c, struct tiering_refill *refill) -{ - struct scan_keylist *keys; - struct btree_iter iter; - struct bkey_s_c k; - - if (bkey_cmp(refill->start, POS_MAX) >= 0) - return; - - if (refill->ca == NULL) - return; - - if (!tiering_keylist_empty(refill->ca)) - return; - - trace_bcache_tiering_refill_start(c); - - for_each_btree_key(&iter, c, BTREE_ID_EXTENTS, refill->start, k) { - keys = &refill->ca->tiering_queue.keys; - - if (!tiering_pred(keys, k)) { - refill->start = k.k->p; - goto next; - } - - /* Growing the keylist might fail */ - if (bch_scan_keylist_add(keys, k)) - goto done; - - /* TODO: split key if refill->sectors is now > stripe_size */ - refill->sectors += k.k->size; - refill->start = k.k->p; - - /* Check if we've added enough keys to this keylist */ - if (tiering_keylist_full(refill)) { - /* Move on to refill the next cache device's keylist */ - refill->sectors = 0; - refill->cache_iter++; - refill_next(c, refill); - - /* All cache devices got removed somehow */ - if (refill->ca == NULL) - goto done; - - /* - * If the next cache's keylist is not sufficiently - * empty, wait for it to drain before refilling - * anything. 
We prioritize even distribution of data - * over maximizing write bandwidth. - */ - if (!tiering_keylist_empty(refill->ca)) - goto done; - } -next: - bch_btree_iter_cond_resched(&iter); - } - /* Reached the end of the keyspace */ - refill->start = POS_MAX; -done: - bch_btree_iter_unlock(&iter); - - trace_bcache_tiering_refill_end(c); -} - -static int issue_tiering_move(struct moving_queue *q, - struct moving_context *ctxt, - struct bkey_s_c k) -{ - struct cache *ca = container_of(q, struct cache, tiering_queue); - struct cache_set *c = ca->set; - struct moving_io *io; - - io = moving_io_alloc(k); - if (!io) { - trace_bcache_tiering_alloc_fail(c, k.k->size); - return -ENOMEM; - } - - bch_replace_init(&io->replace, bkey_i_to_s_c(&io->key)); - - bch_write_op_init(&io->op, c, &io->wbio, - (struct disk_reservation) { 0 }, - &ca->tiering_write_point, - bkey_i_to_s_c(&io->key), - &io->replace.hook, NULL, 0); - io->op.io_wq = q->wq; - io->op.nr_replicas = 1; - - trace_bcache_tiering_copy(k.k); - - /* - * IMPORTANT: We must call bch_data_move before we dequeue so - * that the key can always be found in either the pending list - * in the moving queue or in the scan keylist list in the - * moving queue. - * If we reorder, there is a window where a key is not found - * by btree gc marking. - */ - bch_data_move(q, ctxt, io); - bch_scan_keylist_dequeue(&q->keys); - return 0; -} - -/** - * tiering_next_cache - issue a move to write an extent to the next cache - * device in round robin order - */ -static int tiering_next_cache(struct cache_set *c, - int *cache_iter, - struct moving_context *ctxt, - struct tiering_refill *refill) -{ - struct cache_group *tier; - int start = *cache_iter; - struct cache *ca; - - /* If true at the end of the loop, all keylists were empty, so we - * have reached the end of the keyspace */ - bool done = true; - /* If true at the end of the loop, all queues were full, so we must - * wait for some ops to finish */ - bool full = true; - - do { - rcu_read_lock(); - tier = &c->cache_tiers[1]; - if (tier->nr_devices == 0) { - rcu_read_unlock(); - return 0; - } - - if (*cache_iter >= tier->nr_devices) { - rcu_read_unlock(); - *cache_iter = 0; - continue; - } - - ca = rcu_dereference(tier->devices[*cache_iter]); - if (ca == NULL || - ca->mi.state != CACHE_ACTIVE || - ca->tiering_queue.stopped) { - rcu_read_unlock(); - (*cache_iter)++; - continue; - } - - percpu_ref_get(&ca->ref); - rcu_read_unlock(); - (*cache_iter)++; - - tiering_refill(c, refill); - - if (bch_queue_full(&ca->tiering_queue)) { - done = false; - } else { - struct bkey_i *k = - bch_scan_keylist_next(&ca->tiering_queue.keys); - if (k) { - issue_tiering_move(&ca->tiering_queue, ctxt, - bkey_i_to_s_c(k)); - done = false; - full = false; - } - } - - percpu_ref_put(&ca->ref); - } while (*cache_iter != start); - - if (done) { - /* - * All devices have an empty keylist now, just wait for - * pending moves to finish and we're done. - */ - return 0; - } else if (full) { - /* - * No device with keys still remaining on its keylist has a - * queue that is not full. In this case, we have to wait for - * at least one read to complete before trying again. - * Otherwise, we could issue a read for this device. 
- */ - return -EAGAIN; - } else { - /* Try again immediately */ - return -EIOCBQUEUED; - } -} - -static u64 read_tiering(struct cache_set *c) -{ - struct moving_context ctxt; - struct tiering_refill refill; - int cache_iter = 0; - int ret; - - trace_bcache_tiering_start(c); - - refill_init(c, &refill); - - bch_moving_context_init(&ctxt, &c->tiering_pd.rate, - MOVING_PURPOSE_TIERING); - - while (!bch_moving_context_wait(&ctxt)) { - cond_resched(); - - ret = tiering_next_cache(c, &cache_iter, &ctxt, &refill); - if (ret == -EAGAIN) - bch_moving_wait(&ctxt); - else if (!ret) - break; - } - - closure_sync(&ctxt.cl); - refill_done(&refill); - - trace_bcache_tiering_end(c, ctxt.sectors_moved, ctxt.keys_moved); - - return ctxt.sectors_moved; -} - -static int bch_tiering_thread(void *arg) -{ - struct cache_set *c = arg; - struct io_clock *clock = &c->io_clock[WRITE]; - struct cache *ca; - u64 sectors, tier_capacity; - unsigned long last; - unsigned i; - - set_freezable(); - - while (!kthread_should_stop()) { - if (kthread_wait_freezable(c->tiering_enabled && - c->cache_tiers[1].nr_devices)) - break; - - last = atomic_long_read(&clock->now); - - sectors = read_tiering(c); - - tier_capacity = 0; - rcu_read_lock(); - group_for_each_cache_rcu(ca, &c->cache_tiers[0], i) - tier_capacity += - (ca->mi.nbuckets - - ca->mi.first_bucket) << ca->bucket_bits; - rcu_read_unlock(); - - if (sectors < tier_capacity >> 4) - bch_kthread_io_clock_wait(clock, - last + (tier_capacity >> 5)); - } - - return 0; -} - -#define TIERING_KEYS_MAX_SIZE DFLT_SCAN_KEYLIST_MAX_SIZE -#define TIERING_NR 64 -#define TIERING_READ_NR 8 -#define TIERING_WRITE_NR 32 - -void bch_tiering_init_cache_set(struct cache_set *c) -{ - bch_pd_controller_init(&c->tiering_pd); -} - -int bch_tiering_init_cache(struct cache *ca) -{ - ca->tiering_stripe_size = ca->mi.bucket_size * 2; - - return bch_queue_init(&ca->tiering_queue, - ca->set, - TIERING_KEYS_MAX_SIZE, - TIERING_NR, - TIERING_READ_NR, - TIERING_WRITE_NR, - false, - "bch_tier_write"); -} - -void bch_tiering_write_start(struct cache *ca) -{ - bch_queue_start(&ca->tiering_queue); -} - -int bch_tiering_read_start(struct cache_set *c) -{ - struct task_struct *t; - - t = kthread_create(bch_tiering_thread, c, "bch_tier_read"); - if (IS_ERR(t)) - return PTR_ERR(t); - - c->tiering_read = t; - wake_up_process(c->tiering_read); - - return 0; -} - -void bch_tiering_write_destroy(struct cache *ca) -{ - bch_queue_destroy(&ca->tiering_queue); -} - -void bch_tiering_write_stop(struct cache *ca) -{ - bch_queue_stop(&ca->tiering_queue); - - /* - * Make sure that it is empty so that gc marking doesn't keep - * marking stale entries from when last used. 
- */
- bch_scan_keylist_reset(&ca->tiering_queue.keys);
-}
-
-void bch_tiering_read_stop(struct cache_set *c)
-{
- if (!IS_ERR_OR_NULL(c->tiering_read)) {
- kthread_stop(c->tiering_read);
- c->tiering_read = NULL;
- }
-}
diff --git a/fs/bcachefs/tier.h b/fs/bcachefs/tier.h
deleted file mode 100644
index 57b4acf86fb5..000000000000
--- a/fs/bcachefs/tier.h
+++ /dev/null
@@ -1,12 +0,0 @@
-#ifndef _BCACHE_TIER_H
-#define _BCACHE_TIER_H
-
-void bch_tiering_init_cache_set(struct cache_set *);
-int bch_tiering_init_cache(struct cache *);
-int bch_tiering_read_start(struct cache_set *);
-void bch_tiering_write_start(struct cache *);
-void bch_tiering_write_destroy(struct cache *);
-void bch_tiering_write_stop(struct cache *);
-void bch_tiering_read_stop(struct cache_set *);
-
-#endif
diff --git a/include/trace/events/bcache.h b/include/trace/events/bcache.h
index 00bd60de2ce4..78eac03455a8 100644
--- a/include/trace/events/bcache.h
+++ b/include/trace/events/bcache.h
@@ -1060,6 +1060,7 @@ TRACE_EVENT(bcache_keyscan,
 __entry->end_inode, __entry->end_offset)
 );
+#if 0
 /* Moving IO */
 DECLARE_EVENT_CLASS(moving_io,
@@ -1215,6 +1216,7 @@ DEFINE_EVENT(bkey, bcache_tiering_copy,
 TP_PROTO(const struct bkey *k),
 TP_ARGS(k)
 );
+#endif
 /* Background writeback */
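
The sketch below is not part of the commit; it is a minimal, stand-alone user-space illustration of the copygc bucket-selection heuristic that the new bch_rebalance() implements: collect the candidate buckets with the least live data, then drop the fullest survivors until the total fits in the RESERVE_MOVINGGC reserve. The names bucket_entry and pick_buckets and the sample numbers are invented for the example, and a plain qsort() stands in for the kernel's bounded heap; the kernel code additionally skips unused, full, metadata and allocator-owned buckets before they are considered.

/*
 * Illustrative sketch only, assuming invented names; models the bucket
 * selection done under r->heap_lock in bch_rebalance().
 */
#include <stdio.h>
#include <stdlib.h>

struct bucket_entry {
	size_t   bucket;	/* bucket index on the device */
	unsigned sectors;	/* live sectors still in the bucket */
};

/* ascending by live sectors: emptiest (cheapest to evacuate) first */
static int cmp_sectors(const void *l, const void *r)
{
	const struct bucket_entry *a = l, *b = r;

	return (a->sectors > b->sectors) - (a->sectors < b->sectors);
}

/*
 * Keep at most @heap_size of the emptiest candidates, then trim from the
 * fullest end until their live data fits in @reserve_sectors.  Returns the
 * number of buckets selected; the selection is left at the front of @b.
 */
static size_t pick_buckets(struct bucket_entry *b, size_t nr,
			   size_t heap_size, unsigned long reserve_sectors)
{
	unsigned long sectors_to_move = 0;
	size_t i, nr_picked;

	/* a full sort stands in for the kernel's bounded heap */
	qsort(b, nr, sizeof(*b), cmp_sectors);

	nr_picked = nr < heap_size ? nr : heap_size;
	for (i = 0; i < nr_picked; i++)
		sectors_to_move += b[i].sectors;

	/* drop the fullest survivors until the work fits in the reserve */
	while (nr_picked && sectors_to_move > reserve_sectors)
		sectors_to_move -= b[--nr_picked].sectors;

	return nr_picked;
}

int main(void)
{
	struct bucket_entry buckets[] = {
		{ .bucket = 10, .sectors =  12 },
		{ .bucket = 11, .sectors = 120 },
		{ .bucket = 12, .sectors =  48 },
		{ .bucket = 13, .sectors =   4 },
		{ .bucket = 14, .sectors =  96 },
	};
	size_t nr = sizeof(buckets) / sizeof(buckets[0]);
	size_t n = pick_buckets(buckets, nr, 4, 80);
	size_t i;

	for (i = 0; i < n; i++)
		printf("evacuate bucket %zu (%u live sectors)\n",
		       buckets[i].bucket, buckets[i].sectors);
	return 0;
}

With the sample numbers above and an 80-sector reserve, pick_buckets() selects buckets 13, 10 and 12 (4 + 12 + 48 = 64 live sectors) and leaves the nearly-full buckets alone, which is the same preference for cheap-to-evacuate buckets the kernel heap gives before bch_rebalance() hands the surviving buckets to rebalance_walk_extents().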