Diffstat (limited to 'libbcache/super.c')
-rw-r--r-- | libbcache/super.c | 882
1 file changed, 482 insertions(+), 400 deletions(-)
diff --git a/libbcache/super.c b/libbcache/super.c index d2863e62..bb4a7dc3 100644 --- a/libbcache/super.c +++ b/libbcache/super.c @@ -62,28 +62,77 @@ static const uuid_le invalid_uuid = { }; static struct kset *bcache_kset; -struct mutex bch_register_lock; -LIST_HEAD(bch_fs_list); +static LIST_HEAD(bch_fs_list); +static DEFINE_MUTEX(bch_fs_list_lock); static DECLARE_WAIT_QUEUE_HEAD(bch_read_only_wait); struct workqueue_struct *bcache_io_wq; struct crypto_shash *bch_sha256; -static void bch_dev_free(struct cache *); -static int bch_dev_online(struct cache *); +static void bch_dev_free(struct bch_dev *); +static int bch_dev_alloc(struct bch_fs *, unsigned); +static int bch_dev_sysfs_online(struct bch_dev *); +static void __bch_dev_read_only(struct bch_fs *, struct bch_dev *); -static int bch_congested_fn(void *data, int bdi_bits) +struct bch_fs *bch_bdev_to_fs(struct block_device *bdev) +{ + struct bch_fs *c; + struct bch_dev *ca; + unsigned i; + + mutex_lock(&bch_fs_list_lock); + rcu_read_lock(); + + list_for_each_entry(c, &bch_fs_list, list) + for_each_member_device_rcu(ca, c, i) + if (ca->disk_sb.bdev == bdev) { + closure_get(&c->cl); + goto found; + } + c = NULL; +found: + rcu_read_unlock(); + mutex_unlock(&bch_fs_list_lock); + + return c; +} + +static struct bch_fs *__bch_uuid_to_fs(uuid_le uuid) +{ + struct bch_fs *c; + + lockdep_assert_held(&bch_fs_list_lock); + + list_for_each_entry(c, &bch_fs_list, list) + if (!memcmp(&c->disk_sb->uuid, &uuid, sizeof(uuid_le))) + return c; + + return NULL; +} + +struct bch_fs *bch_uuid_to_fs(uuid_le uuid) +{ + struct bch_fs *c; + + mutex_lock(&bch_fs_list_lock); + c = __bch_uuid_to_fs(uuid); + if (c) + closure_get(&c->cl); + mutex_unlock(&bch_fs_list_lock); + + return c; +} + +int bch_congested(struct bch_fs *c, int bdi_bits) { struct backing_dev_info *bdi; - struct cache_set *c = data; - struct cache *ca; + struct bch_dev *ca; unsigned i; int ret = 0; - rcu_read_lock(); if (bdi_bits & (1 << WB_sync_congested)) { /* Reads - check all devices: */ - for_each_cache_rcu(ca, c, i) { + for_each_readable_member(ca, c, i) { bdi = blk_get_backing_dev_info(ca->disk_sb.bdev); if (bdi_congested(bdi, bdi_bits)) { @@ -94,9 +143,10 @@ static int bch_congested_fn(void *data, int bdi_bits) } else { /* Writes prefer fastest tier: */ struct bch_tier *tier = READ_ONCE(c->fastest_tier); - struct cache_group *grp = tier ? &tier->devs : &c->cache_all; + struct dev_group *grp = tier ? 
&tier->devs : &c->all_devs; - group_for_each_cache_rcu(ca, grp, i) { + rcu_read_lock(); + group_for_each_dev(ca, grp, i) { bdi = blk_get_backing_dev_info(ca->disk_sb.bdev); if (bdi_congested(bdi, bdi_bits)) { @@ -104,12 +154,19 @@ static int bch_congested_fn(void *data, int bdi_bits) break; } } + rcu_read_unlock(); } - rcu_read_unlock(); return ret; } +static int bch_congested_fn(void *data, int bdi_bits) +{ + struct bch_fs *c = data; + + return bch_congested(c, bdi_bits); +} + /* Filesystem RO/RW: */ /* @@ -127,21 +184,21 @@ static int bch_congested_fn(void *data, int bdi_bits) * - allocator depends on the journal (when it rewrites prios and gens) */ -static void __bch_fs_read_only(struct cache_set *c) +static void __bch_fs_read_only(struct bch_fs *c) { - struct cache *ca; + struct bch_dev *ca; unsigned i; bch_tiering_stop(c); - for_each_cache(ca, c, i) + for_each_member_device(ca, c, i) bch_moving_gc_stop(ca); bch_gc_thread_stop(c); bch_btree_flush(c); - for_each_cache(ca, c, i) + for_each_member_device(ca, c, i) bch_dev_allocator_stop(ca); bch_fs_journal_stop(&c->journal); @@ -149,13 +206,13 @@ static void __bch_fs_read_only(struct cache_set *c) static void bch_writes_disabled(struct percpu_ref *writes) { - struct cache_set *c = container_of(writes, struct cache_set, writes); + struct bch_fs *c = container_of(writes, struct bch_fs, writes); set_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags); wake_up(&bch_read_only_wait); } -void bch_fs_read_only(struct cache_set *c) +void bch_fs_read_only(struct bch_fs *c) { mutex_lock(&c->state_lock); if (c->state != BCH_FS_STARTING && @@ -222,18 +279,18 @@ out: static void bch_fs_read_only_work(struct work_struct *work) { - struct cache_set *c = - container_of(work, struct cache_set, read_only_work); + struct bch_fs *c = + container_of(work, struct bch_fs, read_only_work); bch_fs_read_only(c); } -static void bch_fs_read_only_async(struct cache_set *c) +static void bch_fs_read_only_async(struct bch_fs *c) { queue_work(system_long_wq, &c->read_only_work); } -bool bch_fs_emergency_read_only(struct cache_set *c) +bool bch_fs_emergency_read_only(struct bch_fs *c) { bool ret = !test_and_set_bit(BCH_FS_EMERGENCY_RO, &c->flags); @@ -244,9 +301,9 @@ bool bch_fs_emergency_read_only(struct cache_set *c) return ret; } -const char *bch_fs_read_write(struct cache_set *c) +const char *bch_fs_read_write(struct bch_fs *c) { - struct cache *ca; + struct bch_dev *ca; const char *err = NULL; unsigned i; @@ -256,10 +313,9 @@ const char *bch_fs_read_write(struct cache_set *c) goto out; err = "error starting allocator thread"; - for_each_cache(ca, c, i) - if (ca->mi.state == BCH_MEMBER_STATE_ACTIVE && - bch_dev_allocator_start(ca)) { - percpu_ref_put(&ca->ref); + for_each_rw_member(ca, c, i) + if (bch_dev_allocator_start(ca)) { + percpu_ref_put(&ca->io_ref); goto err; } @@ -268,10 +324,9 @@ const char *bch_fs_read_write(struct cache_set *c) goto err; err = "error starting moving GC thread"; - for_each_cache(ca, c, i) - if (ca->mi.state == BCH_MEMBER_STATE_ACTIVE && - bch_moving_gc_start(ca)) { - percpu_ref_put(&ca->ref); + for_each_rw_member(ca, c, i) + if (bch_moving_gc_start(ca)) { + percpu_ref_put(&ca->io_ref); goto err; } @@ -296,7 +351,7 @@ err: /* Filesystem startup/shutdown: */ -static void bch_fs_free(struct cache_set *c) +static void bch_fs_free(struct bch_fs *c) { bch_fs_encryption_exit(c); bch_fs_btree_exit(c); @@ -306,8 +361,8 @@ static void bch_fs_free(struct cache_set *c) bch_fs_compress_exit(c); bch_fs_blockdev_exit(c); bdi_destroy(&c->bdi); - 
lg_lock_free(&c->bucket_stats_lock); - free_percpu(c->bucket_stats_percpu); + lg_lock_free(&c->usage_lock); + free_percpu(c->usage_percpu); mempool_exit(&c->btree_bounce_pool); mempool_exit(&c->bio_bounce_pages); bioset_exit(&c->bio_write); @@ -324,13 +379,12 @@ static void bch_fs_free(struct cache_set *c) if (c->wq) destroy_workqueue(c->wq); - kfree_rcu(rcu_dereference_protected(c->members, 1), rcu); /* shutting down */ free_pages((unsigned long) c->disk_sb, c->disk_sb_order); kfree(c); module_put(THIS_MODULE); } -static void bch_fs_exit(struct cache_set *c) +static void bch_fs_exit(struct bch_fs *c) { unsigned i; @@ -341,29 +395,31 @@ static void bch_fs_exit(struct cache_set *c) cancel_work_sync(&c->read_retry_work); for (i = 0; i < c->sb.nr_devices; i++) - if (c->cache[i]) - bch_dev_free(c->cache[i]); + if (c->devs[i]) + bch_dev_free(c->devs[i]); closure_debug_destroy(&c->cl); kobject_put(&c->kobj); } -static void bch_fs_offline(struct cache_set *c) +static void bch_fs_offline(struct bch_fs *c) { - struct cache *ca; + struct bch_dev *ca; unsigned i; - mutex_lock(&bch_register_lock); + mutex_lock(&bch_fs_list_lock); list_del(&c->list); - mutex_unlock(&bch_register_lock); + mutex_unlock(&bch_fs_list_lock); + + for_each_member_device(ca, c, i) + if (ca->kobj.state_in_sysfs && + ca->disk_sb.bdev) + sysfs_remove_link(&part_to_dev(ca->disk_sb.bdev->bd_part)->kobj, + "bcache"); if (c->kobj.state_in_sysfs) kobject_del(&c->kobj); - for_each_cache(ca, c, i) - if (ca->kobj.state_in_sysfs) - kobject_del(&ca->kobj); - bch_fs_debug_exit(c); bch_fs_chardev_exit(c); @@ -382,18 +438,18 @@ static void bch_fs_offline(struct cache_set *c) */ void bch_fs_release(struct kobject *kobj) { - struct cache_set *c = container_of(kobj, struct cache_set, kobj); + struct bch_fs *c = container_of(kobj, struct bch_fs, kobj); bch_notify_fs_stopped(c); bch_fs_free(c); } /* - * All activity on the cache_set should have stopped now - close devices: + * All activity on the filesystem should have stopped now - close devices: */ static void __bch_fs_stop3(struct closure *cl) { - struct cache_set *c = container_of(cl, struct cache_set, cl); + struct bch_fs *c = container_of(cl, struct bch_fs, cl); bch_fs_exit(c); } @@ -404,7 +460,7 @@ static void __bch_fs_stop3(struct closure *cl) */ static void __bch_fs_stop2(struct closure *cl) { - struct cache_set *c = container_of(cl, struct cache_set, caching); + struct bch_fs *c = container_of(cl, struct bch_fs, caching); bch_fs_offline(c); @@ -418,14 +474,14 @@ static void __bch_fs_stop2(struct closure *cl) */ static void __bch_fs_stop1(struct closure *cl) { - struct cache_set *c = container_of(cl, struct cache_set, caching); + struct bch_fs *c = container_of(cl, struct bch_fs, caching); bch_blockdevs_stop(c); continue_at(cl, __bch_fs_stop2, system_wq); } -void bch_fs_stop_async(struct cache_set *c) +void bch_fs_stop_async(struct bch_fs *c) { mutex_lock(&c->state_lock); if (c->state != BCH_FS_STOPPING) { @@ -435,7 +491,7 @@ void bch_fs_stop_async(struct cache_set *c) mutex_unlock(&c->state_lock); } -void bch_fs_stop(struct cache_set *c) +void bch_fs_stop(struct bch_fs *c) { mutex_lock(&c->state_lock); BUG_ON(c->state == BCH_FS_STOPPING); @@ -453,53 +509,25 @@ void bch_fs_stop(struct cache_set *c) closure_sync(&c->cl); bch_fs_exit(c); - kobject_put(&c->kobj); } /* Stop, detaching from backing devices: */ -void bch_fs_detach(struct cache_set *c) +void bch_fs_detach(struct bch_fs *c) { if (!test_and_set_bit(BCH_FS_DETACHING, &c->flags)) bch_fs_stop_async(c); } -static unsigned 
bch_fs_nr_devices(struct cache_set *c) -{ - struct bch_sb_field_members *mi; - unsigned i, nr = 0; - - mutex_lock(&c->sb_lock); - mi = bch_sb_get_members(c->disk_sb); - - for (i = 0; i < c->disk_sb->nr_devices; i++) - if (!bch_is_zero(mi->members[i].uuid.b, sizeof(uuid_le))) - nr++; - - mutex_unlock(&c->sb_lock); - - return nr; -} - -static unsigned bch_fs_nr_online_devices(struct cache_set *c) -{ - unsigned i, nr = 0; - - for (i = 0; i < c->sb.nr_devices; i++) - if (c->cache[i]) - nr++; - - return nr; -} - #define alloc_bucket_pages(gfp, ca) \ ((void *) __get_free_pages(__GFP_ZERO|gfp, ilog2(bucket_pages(ca)))) -static struct cache_set *bch_fs_alloc(struct bch_sb *sb, struct bch_opts opts) +static struct bch_fs *bch_fs_alloc(struct bch_sb *sb, struct bch_opts opts) { - struct cache_set *c; - unsigned iter_size, journal_entry_bytes; + struct bch_sb_field_members *mi; + struct bch_fs *c; + unsigned i, iter_size, journal_entry_bytes; - c = kzalloc(sizeof(struct cache_set), GFP_KERNEL); + c = kzalloc(sizeof(struct bch_fs), GFP_KERNEL); if (!c) return NULL; @@ -572,7 +600,7 @@ static struct cache_set *bch_fs_alloc(struct bch_sb *sb, struct bch_opts opts) mutex_lock(&c->sb_lock); - if (bch_sb_to_cache_set(c, sb)) { + if (bch_sb_to_fs(c, sb)) { mutex_unlock(&c->sb_lock); goto err; } @@ -616,8 +644,8 @@ static struct cache_set *bch_fs_alloc(struct bch_sb *sb, struct bch_opts opts) c->sb.btree_node_size, BCH_ENCODED_EXTENT_MAX) / PAGE_SECTORS, 0) || - !(c->bucket_stats_percpu = alloc_percpu(struct bch_fs_usage)) || - lg_lock_init(&c->bucket_stats_lock) || + !(c->usage_percpu = alloc_percpu(struct bch_fs_usage)) || + lg_lock_init(&c->usage_lock) || mempool_init_page_pool(&c->btree_bounce_pool, 1, ilog2(btree_pages(c))) || bdi_setup_and_register(&c->bdi, "bcache") || @@ -635,6 +663,12 @@ static struct cache_set *bch_fs_alloc(struct bch_sb *sb, struct bch_opts opts) c->bdi.congested_fn = bch_congested_fn; c->bdi.congested_data = c; + mi = bch_sb_get_members(c->disk_sb); + for (i = 0; i < c->sb.nr_devices; i++) + if (!bch_is_zero(mi->members[i].uuid.b, sizeof(uuid_le)) && + bch_dev_alloc(c, i)) + goto err; + /* * Now that all allocations have succeeded, init various refcounty * things that let us shutdown: @@ -660,31 +694,19 @@ err: return NULL; } -static struct cache_set *bch_fs_lookup(uuid_le uuid) -{ - struct cache_set *c; - - lockdep_assert_held(&bch_register_lock); - - list_for_each_entry(c, &bch_fs_list, list) - if (!memcmp(&c->disk_sb->uuid, &uuid, sizeof(uuid_le))) - return c; - - return NULL; -} - -static const char *__bch_fs_online(struct cache_set *c) +static const char *__bch_fs_online(struct bch_fs *c) { - struct cache *ca; + struct bch_dev *ca; + const char *err = NULL; unsigned i; int ret; - lockdep_assert_held(&bch_register_lock); + lockdep_assert_held(&bch_fs_list_lock); if (!list_empty(&c->list)) return NULL; - if (bch_fs_lookup(c->sb.uuid)) + if (__bch_uuid_to_fs(c->sb.uuid)) return "filesystem UUID already open"; ret = bch_fs_chardev_init(c); @@ -700,44 +722,42 @@ static const char *__bch_fs_online(struct cache_set *c) bch_cache_accounting_add_kobjs(&c->accounting, &c->kobj)) return "error creating sysfs objects"; - for_each_cache(ca, c, i) - if (bch_dev_online(ca)) { - percpu_ref_put(&ca->ref); - return "error creating sysfs objects"; - } - mutex_lock(&c->state_lock); - if (bch_blockdev_volumes_start(c)) { - mutex_unlock(&c->state_lock); - return "can't bring up blockdev volumes"; - } + err = "error creating sysfs objects"; + __for_each_member_device(ca, c, i) + if 
(bch_dev_sysfs_online(ca)) + goto err; - bch_attach_backing_devs(c); + err = "can't bring up blockdev volumes"; + if (bch_blockdev_volumes_start(c)) + goto err; - mutex_unlock(&c->state_lock); + bch_attach_backing_devs(c); list_add(&c->list, &bch_fs_list); - - return 0; + err = NULL; +err: + mutex_unlock(&c->state_lock); + return err; } -static const char *bch_fs_online(struct cache_set *c) +static const char *bch_fs_online(struct bch_fs *c) { const char *err; - mutex_lock(&bch_register_lock); + mutex_lock(&bch_fs_list_lock); err = __bch_fs_online(c); - mutex_unlock(&bch_register_lock); + mutex_unlock(&bch_fs_list_lock); return err; } -static const char *__bch_fs_start(struct cache_set *c) +static const char *__bch_fs_start(struct bch_fs *c) { const char *err = "cannot allocate memory"; struct bch_sb_field_members *mi; - struct cache *ca; + struct bch_dev *ca; unsigned i, id; time64_t now; LIST_HEAD(journal); @@ -746,39 +766,30 @@ static const char *__bch_fs_start(struct cache_set *c) BUG_ON(c->state != BCH_FS_STARTING); - /* - * Make sure that each cache object's mi is up to date before - * we start testing it. - */ - for_each_cache(ca, c, i) - bch_sb_from_cache_set(c, ca); + mutex_lock(&c->sb_lock); + for_each_online_member(ca, c, i) + bch_sb_from_fs(c, ca); + mutex_unlock(&c->sb_lock); if (BCH_SB_INITIALIZED(c->disk_sb)) { ret = bch_journal_read(c, &journal); if (ret) goto err; - pr_debug("btree_journal_read() done"); - j = &list_entry(journal.prev, struct journal_replay, list)->j; + c->prio_clock[READ].hand = le16_to_cpu(j->read_clock); + c->prio_clock[WRITE].hand = le16_to_cpu(j->write_clock); + err = "error reading priorities"; - for_each_cache(ca, c, i) { + for_each_readable_member(ca, c, i) { ret = bch_prio_read(ca); if (ret) { - percpu_ref_put(&ca->ref); + percpu_ref_put(&ca->io_ref); goto err; } } - c->prio_clock[READ].hand = le16_to_cpu(j->read_clock); - c->prio_clock[WRITE].hand = le16_to_cpu(j->write_clock); - - for_each_cache(ca, c, i) { - bch_recalc_min_prio(ca, READ); - bch_recalc_min_prio(ca, WRITE); - } - for (id = 0; id < BTREE_ID_NR; id++) { unsigned level; struct bkey_i *k; @@ -816,10 +827,9 @@ static const char *__bch_fs_start(struct cache_set *c) bch_journal_start(c); err = "error starting allocator thread"; - for_each_cache(ca, c, i) - if (ca->mi.state == BCH_MEMBER_STATE_ACTIVE && - bch_dev_allocator_start(ca)) { - percpu_ref_put(&ca->ref); + for_each_rw_member(ca, c, i) + if (bch_dev_allocator_start(ca)) { + percpu_ref_put(&ca->io_ref); goto err; } @@ -853,18 +863,10 @@ static const char *__bch_fs_start(struct cache_set *c) bch_initial_gc(c, NULL); - err = "error starting allocator thread"; - for_each_cache(ca, c, i) - if (ca->mi.state == BCH_MEMBER_STATE_ACTIVE && - bch_dev_allocator_start(ca)) { - percpu_ref_put(&ca->ref); - goto err; - } - err = "unable to allocate journal buckets"; - for_each_cache(ca, c, i) + for_each_rw_member(ca, c, i) if (bch_dev_journal_alloc(ca)) { - percpu_ref_put(&ca->ref); + percpu_ref_put(&ca->io_ref); goto err; } @@ -875,6 +877,13 @@ static const char *__bch_fs_start(struct cache_set *c) bch_journal_start(c); bch_journal_set_replay_done(&c->journal); + err = "error starting allocator thread"; + for_each_rw_member(ca, c, i) + if (bch_dev_allocator_start(ca)) { + percpu_ref_put(&ca->io_ref); + goto err; + } + err = "cannot allocate new btree root"; for (id = 0; id < BTREE_ID_NR; id++) if (bch_btree_root_alloc(c, id, &cl)) { @@ -918,10 +927,8 @@ recovery_done: mi = bch_sb_get_members(c->disk_sb); now = ktime_get_seconds(); - 
rcu_read_lock(); - for_each_cache_rcu(ca, c, i) + for_each_member_device(ca, c, i) mi->members[ca->dev_idx].last_mount = cpu_to_le64(now); - rcu_read_unlock(); SET_BCH_SB_INITIALIZED(c->disk_sb, true); SET_BCH_SB_CLEAN(c->disk_sb, false); @@ -966,12 +973,12 @@ err: goto out; } -const char *bch_fs_start(struct cache_set *c) +const char *bch_fs_start(struct bch_fs *c) { return __bch_fs_start(c) ?: bch_fs_online(c); } -static const char *bch_dev_may_add(struct bch_sb *sb, struct cache_set *c) +static const char *bch_dev_may_add(struct bch_sb *sb, struct bch_fs *c) { struct bch_sb_field_members *sb_mi; @@ -984,33 +991,28 @@ static const char *bch_dev_may_add(struct bch_sb *sb, struct cache_set *c) if (le16_to_cpu(sb_mi->members[sb->dev_idx].bucket_size) < BCH_SB_BTREE_NODE_SIZE(c->disk_sb)) - return "new cache bucket_size is too small"; + return "new cache bucket size is too small"; return NULL; } -static const char *bch_dev_in_fs(struct bch_sb *sb, struct cache_set *c) +static const char *bch_dev_in_fs(struct bch_sb *fs, struct bch_sb *sb) { - struct bch_sb_field_members *mi = bch_sb_get_members(c->disk_sb); - struct bch_sb_field_members *dev_mi = bch_sb_get_members(sb); - uuid_le dev_uuid = dev_mi->members[sb->dev_idx].uuid; - const char *err; + struct bch_sb *newest = + le64_to_cpu(fs->seq) > le64_to_cpu(sb->seq) ? fs : sb; + struct bch_sb_field_members *mi = bch_sb_get_members(newest); - err = bch_dev_may_add(sb, c); - if (err) - return err; + if (uuid_le_cmp(fs->uuid, sb->uuid)) + return "device not a member of filesystem"; + + if (sb->dev_idx >= newest->nr_devices) + return "device has invalid dev_idx"; - if (bch_is_zero(&dev_uuid, sizeof(dev_uuid))) + if (bch_is_zero(mi->members[sb->dev_idx].uuid.b, sizeof(uuid_le))) return "device has been removed"; - /* - * When attaching an existing device, the cache set superblock must - * already contain member_info with a matching UUID - */ - if (sb->dev_idx >= c->disk_sb->nr_devices || - memcmp(&mi->members[sb->dev_idx].uuid, - &dev_uuid, sizeof(uuid_le))) - return "cache sb does not match set"; + if (fs->block_size != sb->block_size) + return "mismatched block size"; return NULL; } @@ -1019,37 +1021,34 @@ static const char *bch_dev_in_fs(struct bch_sb *sb, struct cache_set *c) void bch_dev_release(struct kobject *kobj) { - struct cache *ca = container_of(kobj, struct cache, kobj); + struct bch_dev *ca = container_of(kobj, struct bch_dev, kobj); kfree(ca); } -static void bch_dev_free(struct cache *ca) +static void bch_dev_free(struct bch_dev *ca) { - struct cache_set *c = ca->set; unsigned i; cancel_work_sync(&ca->io_error_work); - if (c && c->kobj.state_in_sysfs) { - char buf[12]; - - sprintf(buf, "cache%u", ca->dev_idx); - sysfs_remove_link(&c->kobj, buf); - } + if (ca->kobj.state_in_sysfs && + ca->disk_sb.bdev) + sysfs_remove_link(&part_to_dev(ca->disk_sb.bdev->bd_part)->kobj, + "bcache"); if (ca->kobj.state_in_sysfs) kobject_del(&ca->kobj); bch_free_super(&ca->disk_sb); bch_dev_journal_exit(ca); + free_percpu(ca->sectors_written); bioset_exit(&ca->replica_set); - free_percpu(ca->bucket_stats_percpu); + free_percpu(ca->usage_percpu); free_pages((unsigned long) ca->disk_buckets, ilog2(bucket_pages(ca))); kfree(ca->prio_buckets); kfree(ca->bio_prio); - kfree(ca->journal.bio); vfree(ca->buckets); vfree(ca->oldest_gens); free_heap(&ca->heap); @@ -1058,133 +1057,135 @@ static void bch_dev_free(struct cache *ca) for (i = 0; i < RESERVE_NR; i++) free_fifo(&ca->free[i]); + percpu_ref_exit(&ca->io_ref); percpu_ref_exit(&ca->ref); 
kobject_put(&ca->kobj); - - if (c) - kobject_put(&c->kobj); } -static void bch_dev_free_work(struct work_struct *work) +static void bch_dev_io_ref_release(struct percpu_ref *ref) { - struct cache *ca = container_of(work, struct cache, free_work); + struct bch_dev *ca = container_of(ref, struct bch_dev, io_ref); - bch_dev_free(ca); + complete(&ca->offline_complete); } -static void bch_dev_percpu_ref_release(struct percpu_ref *ref) +static void bch_dev_offline(struct bch_dev *ca) { - struct cache *ca = container_of(ref, struct cache, ref); + struct bch_fs *c = ca->fs; + + lockdep_assert_held(&c->state_lock); + + __bch_dev_read_only(ca->fs, ca); + + reinit_completion(&ca->offline_complete); + percpu_ref_kill(&ca->io_ref); + wait_for_completion(&ca->offline_complete); + + if (ca->kobj.state_in_sysfs) { + struct kobject *block = + &part_to_dev(ca->disk_sb.bdev->bd_part)->kobj; + + sysfs_remove_link(block, "bcache"); + sysfs_remove_link(&ca->kobj, "block"); + } - schedule_work(&ca->free_work); + bch_free_super(&ca->disk_sb); + bch_dev_journal_exit(ca); } -static void bch_dev_free_rcu(struct rcu_head *rcu) +static void bch_dev_ref_release(struct percpu_ref *ref) { - struct cache *ca = container_of(rcu, struct cache, free_rcu); + struct bch_dev *ca = container_of(ref, struct bch_dev, ref); - /* - * This decrements the ref count to ca, and once the ref count - * is 0 (outstanding bios to the ca also incremented it and - * decrement it on completion/error), bch_dev_percpu_ref_release - * is called, and that eventually results in bch_dev_free_work - * being called, which in turn results in bch_dev_release being - * called. - * - * In particular, these functions won't be called until there are no - * bios outstanding (the per-cpu ref counts are all 0), so it - * is safe to remove the actual sysfs device at that point, - * and that can indicate success to the user. 
- */ - - percpu_ref_kill(&ca->ref); + complete(&ca->stop_complete); } -static void bch_dev_stop(struct cache *ca) +static void bch_dev_stop(struct bch_dev *ca) { - struct cache_set *c = ca->set; + struct bch_fs *c = ca->fs; lockdep_assert_held(&c->state_lock); - BUG_ON(rcu_access_pointer(c->cache[ca->dev_idx]) != ca); - rcu_assign_pointer(c->cache[ca->dev_idx], NULL); + BUG_ON(rcu_access_pointer(c->devs[ca->dev_idx]) != ca); + rcu_assign_pointer(c->devs[ca->dev_idx], NULL); - call_rcu(&ca->free_rcu, bch_dev_free_rcu); + synchronize_rcu(); + + reinit_completion(&ca->stop_complete); + percpu_ref_kill(&ca->ref); + wait_for_completion(&ca->stop_complete); } -static int bch_dev_online(struct cache *ca) +static int bch_dev_sysfs_online(struct bch_dev *ca) { - char buf[12]; + struct bch_fs *c = ca->fs; + int ret; + + if (!c->kobj.state_in_sysfs) + return 0; + + if (!ca->kobj.state_in_sysfs) { + ret = kobject_add(&ca->kobj, &ca->fs->kobj, + "dev-%u", ca->dev_idx); + if (ret) + return ret; + } - sprintf(buf, "cache%u", ca->dev_idx); + if (ca->disk_sb.bdev) { + struct kobject *block = + &part_to_dev(ca->disk_sb.bdev->bd_part)->kobj; - if (kobject_add(&ca->kobj, - &part_to_dev(ca->disk_sb.bdev->bd_part)->kobj, - "bcache") || - sysfs_create_link(&ca->kobj, &ca->set->kobj, "set") || - sysfs_create_link(&ca->set->kobj, &ca->kobj, buf)) - return -1; + ret = sysfs_create_link(block, &ca->kobj, "bcache"); + if (ret) + return ret; + ret = sysfs_create_link(&ca->kobj, block, "block"); + if (ret) + return ret; + } return 0; } -static const char *bch_dev_alloc(struct bcache_superblock *sb, - struct cache_set *c, - struct cache **ret) +static int bch_dev_alloc(struct bch_fs *c, unsigned dev_idx) { struct bch_member *member; size_t reserve_none, movinggc_reserve, free_inc_reserve, total_reserve; size_t heap_size; unsigned i; - const char *err = "cannot allocate memory"; - struct cache *ca; - - if (c->sb.nr_devices == 1) - bdevname(sb->bdev, c->name); + struct bch_dev *ca; if (bch_fs_init_fault("dev_alloc")) - return err; + return -ENOMEM; ca = kzalloc(sizeof(*ca), GFP_KERNEL); if (!ca) - return err; - - if (percpu_ref_init(&ca->ref, bch_dev_percpu_ref_release, - 0, GFP_KERNEL)) { - kfree(ca); - return err; - } + return -ENOMEM; kobject_init(&ca->kobj, &bch_dev_ktype); + init_completion(&ca->stop_complete); + init_completion(&ca->offline_complete); spin_lock_init(&ca->self.lock); ca->self.nr = 1; rcu_assign_pointer(ca->self.d[0].dev, ca); - ca->dev_idx = sb->sb->dev_idx; + ca->dev_idx = dev_idx; - INIT_WORK(&ca->free_work, bch_dev_free_work); spin_lock_init(&ca->freelist_lock); spin_lock_init(&ca->prio_buckets_lock); mutex_init(&ca->heap_lock); bch_dev_moving_gc_init(ca); - ca->disk_sb = *sb; - if (sb->mode & FMODE_EXCL) - ca->disk_sb.bdev->bd_holder = ca; - memset(sb, 0, sizeof(*sb)); - INIT_WORK(&ca->io_error_work, bch_nonfatal_io_error_work); - err = "dynamic fault"; if (bch_fs_init_fault("dev_alloc")) goto err; - member = bch_sb_get_members(ca->disk_sb.sb)->members + - ca->disk_sb.sb->dev_idx; + member = bch_sb_get_members(c->disk_sb)->members + dev_idx; - ca->mi = cache_mi_to_cpu_mi(member); + ca->mi = bch_mi_to_cpu(member); ca->uuid = member->uuid; ca->bucket_bits = ilog2(ca->mi.bucket_size); + scnprintf(ca->name, sizeof(ca->name), "dev-%u", dev_idx); /* XXX: tune these */ movinggc_reserve = max_t(size_t, 16, ca->mi.nbuckets >> 7); @@ -1197,7 +1198,11 @@ static const char *bch_dev_alloc(struct bcache_superblock *sb, free_inc_reserve = movinggc_reserve / 2; heap_size = movinggc_reserve * 8; - if 
(!init_fifo(&ca->free[RESERVE_PRIO], prio_buckets(ca), GFP_KERNEL) || + if (percpu_ref_init(&ca->ref, bch_dev_ref_release, + 0, GFP_KERNEL) || + percpu_ref_init(&ca->io_ref, bch_dev_io_ref_release, + PERCPU_REF_INIT_DEAD, GFP_KERNEL) || + !init_fifo(&ca->free[RESERVE_PRIO], prio_buckets(ca), GFP_KERNEL) || !init_fifo(&ca->free[RESERVE_BTREE], BTREE_NODE_RESERVE, GFP_KERNEL) || !init_fifo(&ca->free[RESERVE_MOVINGGC], movinggc_reserve, GFP_KERNEL) || @@ -1208,15 +1213,14 @@ static const char *bch_dev_alloc(struct bcache_superblock *sb, ca->mi.nbuckets)) || !(ca->buckets = vzalloc(sizeof(struct bucket) * ca->mi.nbuckets)) || - !(ca->prio_buckets = kzalloc(sizeof(uint64_t) * prio_buckets(ca) * + !(ca->prio_buckets = kzalloc(sizeof(u64) * prio_buckets(ca) * 2, GFP_KERNEL)) || !(ca->disk_buckets = alloc_bucket_pages(GFP_KERNEL, ca)) || - !(ca->bucket_stats_percpu = alloc_percpu(struct bch_dev_usage)) || + !(ca->usage_percpu = alloc_percpu(struct bch_dev_usage)) || !(ca->bio_prio = bio_kmalloc(GFP_NOIO, bucket_pages(ca))) || bioset_init(&ca->replica_set, 4, offsetof(struct bch_write_bio, bio)) || - !(ca->sectors_written = alloc_percpu(*ca->sectors_written)) || - bch_dev_journal_init(ca)) + !(ca->sectors_written = alloc_percpu(*ca->sectors_written))) goto err; ca->prio_last_buckets = ca->prio_buckets + prio_buckets(ca); @@ -1224,94 +1228,130 @@ static const char *bch_dev_alloc(struct bcache_superblock *sb, total_reserve = ca->free_inc.size; for (i = 0; i < RESERVE_NR; i++) total_reserve += ca->free[i].size; - pr_debug("%zu buckets reserved", total_reserve); ca->copygc_write_point.group = &ca->self; ca->tiering_write_point.group = &ca->self; + ca->fs = c; + rcu_assign_pointer(c->devs[ca->dev_idx], ca); + + if (bch_dev_sysfs_online(ca)) + pr_warn("error creating sysfs objects"); + + return 0; +err: + bch_dev_free(ca); + return -ENOMEM; +} + +static int bch_dev_online(struct bch_fs *c, struct bcache_superblock *sb) +{ + struct bch_dev *ca; + int ret; + + lockdep_assert_held(&c->sb_lock); + + if (le64_to_cpu(sb->sb->seq) > + le64_to_cpu(c->disk_sb->seq)) + bch_sb_to_fs(c, sb->sb); + + BUG_ON(sb->sb->dev_idx >= c->sb.nr_devices || + !c->devs[sb->sb->dev_idx]); + + ca = c->devs[sb->sb->dev_idx]; + if (ca->disk_sb.bdev) { + bch_err(c, "already have device online in slot %u", + sb->sb->dev_idx); + return -EINVAL; + } + + ret = bch_dev_journal_init(ca, sb->sb); + if (ret) + return ret; + /* * Increase journal write timeout if flushes to this device are * expensive: */ - if (!blk_queue_nonrot(bdev_get_queue(ca->disk_sb.bdev)) && + if (!blk_queue_nonrot(bdev_get_queue(sb->bdev)) && journal_flushes_device(ca)) c->journal.write_delay_ms = max(c->journal.write_delay_ms, 1000U); - kobject_get(&c->kobj); - ca->set = c; - - kobject_get(&ca->kobj); - rcu_assign_pointer(c->cache[ca->dev_idx], ca); - - mutex_lock(&c->sb_lock); - - if (le64_to_cpu(ca->disk_sb.sb->seq) > le64_to_cpu(c->disk_sb->seq)) - bch_sb_to_cache_set(c, ca->disk_sb.sb); + /* Commit: */ + ca->disk_sb = *sb; + if (sb->mode & FMODE_EXCL) + ca->disk_sb.bdev->bd_holder = ca; + memset(sb, 0, sizeof(*sb)); - mutex_unlock(&c->sb_lock); + if (c->sb.nr_devices == 1) + bdevname(ca->disk_sb.bdev, c->name); + bdevname(ca->disk_sb.bdev, ca->name); - err = "error creating kobject"; - if (c->kobj.state_in_sysfs && - bch_dev_online(ca)) + if (bch_dev_sysfs_online(ca)) pr_warn("error creating sysfs objects"); - if (ret) - *ret = ca; - else - kobject_put(&ca->kobj); - return NULL; -err: - bch_dev_free(ca); - return err; + lg_local_lock(&c->usage_lock); + if 
(!gc_will_visit(c, gc_phase(GC_PHASE_SB_METADATA))) + bch_mark_dev_metadata(ca->fs, ca); + lg_local_unlock(&c->usage_lock); + + percpu_ref_reinit(&ca->io_ref); + return 0; } /* Device management: */ -static void __bch_dev_read_only(struct cache_set *c, struct cache *ca) +bool bch_fs_may_start(struct bch_fs *c, int flags) { - bch_moving_gc_stop(ca); - - /* - * This stops new data writes (e.g. to existing open data - * buckets) and then waits for all existing writes to - * complete. - */ - bch_dev_allocator_stop(ca); - - bch_dev_group_remove(&c->journal.devs, ca); -} + struct bch_sb_field_members *mi; + unsigned meta_missing = 0; + unsigned data_missing = 0; + bool degraded = false; + unsigned i; -static const char *__bch_dev_read_write(struct cache_set *c, struct cache *ca) -{ - lockdep_assert_held(&c->state_lock); + mutex_lock(&c->sb_lock); + mi = bch_sb_get_members(c->disk_sb); - if (ca->mi.state == BCH_MEMBER_STATE_ACTIVE) - return NULL; + for (i = 0; i < c->disk_sb->nr_devices; i++) + if (!c->devs[i] && + !bch_is_zero(mi->members[i].uuid.b, sizeof(uuid_le))) { + degraded = true; + if (BCH_MEMBER_HAS_METADATA(&mi->members[i])) + meta_missing++; + if (BCH_MEMBER_HAS_DATA(&mi->members[i])) + data_missing++; + } + mutex_unlock(&c->sb_lock); - trace_bcache_cache_read_write(ca); + if (degraded && + !(flags & BCH_FORCE_IF_DEGRADED)) + return false; - if (bch_dev_allocator_start(ca)) - return "error starting allocator thread"; + if (meta_missing && + !(flags & BCH_FORCE_IF_METADATA_DEGRADED)) + return false; - if (bch_moving_gc_start(ca)) - return "error starting moving GC thread"; + if (meta_missing >= BCH_SB_META_REPLICAS_HAVE(c->disk_sb) && + !(flags & BCH_FORCE_IF_METADATA_LOST)) + return false; - if (bch_tiering_start(c)) - return "error starting tiering thread"; + if (data_missing && !(flags & BCH_FORCE_IF_DATA_DEGRADED)) + return false; - bch_notify_dev_read_write(ca); - trace_bcache_cache_read_write_done(ca); + if (data_missing >= BCH_SB_DATA_REPLICAS_HAVE(c->disk_sb) && + !(flags & BCH_FORCE_IF_DATA_LOST)) + return false; - return NULL; + return true; } -bool bch_dev_state_allowed(struct cache_set *c, struct cache *ca, +bool bch_dev_state_allowed(struct bch_fs *c, struct bch_dev *ca, enum bch_member_state new_state, int flags) { lockdep_assert_held(&c->state_lock); - if (new_state == BCH_MEMBER_STATE_ACTIVE) + if (new_state == BCH_MEMBER_STATE_RW) return true; if (ca->mi.has_data && @@ -1335,11 +1375,47 @@ bool bch_dev_state_allowed(struct cache_set *c, struct cache *ca, return true; } -int __bch_dev_set_state(struct cache_set *c, struct cache *ca, +static void __bch_dev_read_only(struct bch_fs *c, struct bch_dev *ca) +{ + bch_moving_gc_stop(ca); + + /* + * This stops new data writes (e.g. to existing open data + * buckets) and then waits for all existing writes to + * complete. 
+ */ + bch_dev_allocator_stop(ca); + + bch_dev_group_remove(&c->journal.devs, ca); +} + +static const char *__bch_dev_read_write(struct bch_fs *c, struct bch_dev *ca) +{ + lockdep_assert_held(&c->state_lock); + + BUG_ON(ca->mi.state != BCH_MEMBER_STATE_RW); + + trace_bcache_cache_read_write(ca); + + if (bch_dev_allocator_start(ca)) + return "error starting allocator thread"; + + if (bch_moving_gc_start(ca)) + return "error starting moving GC thread"; + + if (bch_tiering_start(c)) + return "error starting tiering thread"; + + bch_notify_dev_read_write(ca); + trace_bcache_cache_read_write_done(ca); + + return NULL; +} + +int __bch_dev_set_state(struct bch_fs *c, struct bch_dev *ca, enum bch_member_state new_state, int flags) { struct bch_sb_field_members *mi; - char buf[BDEVNAME_SIZE]; if (ca->mi.state == new_state) return 0; @@ -1347,16 +1423,14 @@ int __bch_dev_set_state(struct cache_set *c, struct cache *ca, if (!bch_dev_state_allowed(c, ca, new_state, flags)) return -EINVAL; - if (new_state == BCH_MEMBER_STATE_ACTIVE) { + if (new_state == BCH_MEMBER_STATE_RW) { if (__bch_dev_read_write(c, ca)) return -ENOMEM; } else { __bch_dev_read_only(c, ca); } - bch_notice(c, "%s %s", - bdevname(ca->disk_sb.bdev, buf), - bch_dev_state[new_state]); + bch_notice(ca, "%s", bch_dev_state[new_state]); mutex_lock(&c->sb_lock); mi = bch_sb_get_members(c->disk_sb); @@ -1367,7 +1441,7 @@ int __bch_dev_set_state(struct cache_set *c, struct cache *ca, return 0; } -int bch_dev_set_state(struct cache_set *c, struct cache *ca, +int bch_dev_set_state(struct bch_fs *c, struct bch_dev *ca, enum bch_member_state new_state, int flags) { int ret; @@ -1380,7 +1454,7 @@ int bch_dev_set_state(struct cache_set *c, struct cache *ca, } #if 0 -int bch_dev_migrate_from(struct cache_set *c, struct cache *ca) +int bch_dev_migrate_from(struct bch_fs *c, struct bch_dev *ca) { /* First, go RO before we try to migrate data off: */ ret = bch_dev_set_state(c, ca, BCH_MEMBER_STATE_RO, flags); @@ -1414,23 +1488,20 @@ int bch_dev_migrate_from(struct cache_set *c, struct cache *ca) /* Device add/removal: */ -static int __bch_dev_remove(struct cache_set *c, struct cache *ca, int flags) +static int __bch_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags) { struct bch_sb_field_members *mi; - char name[BDEVNAME_SIZE]; unsigned dev_idx = ca->dev_idx; int ret; - bdevname(ca->disk_sb.bdev, name); - - if (ca->mi.state == BCH_MEMBER_STATE_ACTIVE) { - bch_err(ca->set, "Cannot remove RW device"); + if (ca->mi.state == BCH_MEMBER_STATE_RW) { + bch_err(ca, "Cannot remove RW device"); bch_notify_dev_remove_failed(ca); return -EINVAL; } if (!bch_dev_state_allowed(c, ca, BCH_MEMBER_STATE_FAILED, flags)) { - bch_err(ca->set, "Cannot remove %s without losing data", name); + bch_err(ca, "Cannot remove without losing data"); bch_notify_dev_remove_failed(ca); return -EINVAL; } @@ -1442,7 +1513,12 @@ static int __bch_dev_remove(struct cache_set *c, struct cache *ca, int flags) */ ret = bch_flag_data_bad(ca); if (ret) { - bch_err(c, "Remove of %s failed", name); + bch_err(ca, "Remove failed"); + return ret; + } + + if (ca->mi.has_data || ca->mi.has_metadata) { + bch_err(ca, "Can't remove, still has data"); return ret; } @@ -1458,13 +1534,9 @@ static int __bch_dev_remove(struct cache_set *c, struct cache *ca, int flags) bch_journal_meta(&c->journal); + bch_dev_offline(ca); bch_dev_stop(ca); - - /* - * RCU barrier between dropping between c->cache and dropping from - * member info: - */ - synchronize_rcu(); + bch_dev_free(ca); /* * Free this device's slot 
in the bch_member array - all pointers to @@ -1481,28 +1553,29 @@ static int __bch_dev_remove(struct cache_set *c, struct cache *ca, int flags) return 0; } -int bch_dev_remove(struct cache_set *c, struct cache *ca, int flags) +int bch_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags) { int ret; mutex_lock(&c->state_lock); + percpu_ref_put(&ca->ref); ret = __bch_dev_remove(c, ca, flags); mutex_unlock(&c->state_lock); return ret; } -int bch_dev_add(struct cache_set *c, const char *path) +int bch_dev_add(struct bch_fs *c, const char *path) { struct bcache_superblock sb; const char *err; - struct cache *ca; + struct bch_dev *ca = NULL; struct bch_sb_field_members *mi, *dev_mi; struct bch_member saved_mi; unsigned dev_idx, nr_devices, u64s; int ret = -EINVAL; - err = bch_read_super(&sb, c->opts, path); + err = bch_read_super(&sb, bch_opts_empty(), path); if (err) return -EINVAL; @@ -1525,14 +1598,9 @@ int bch_dev_add(struct cache_set *c, const char *path) saved_mi = dev_mi->members[sb.sb->dev_idx]; saved_mi.last_mount = cpu_to_le64(ktime_get_seconds()); - down_read(&c->gc_lock); - if (dynamic_fault("bcache:add:no_slot")) goto no_slot; - if (test_bit(BCH_FS_GC_FAILURE, &c->flags)) - goto no_slot; - mi = bch_sb_get_members(c->disk_sb); for (dev_idx = 0; dev_idx < BCH_SB_MEMBERS_MAX; dev_idx++) if (dev_idx >= c->sb.nr_devices || @@ -1540,15 +1608,11 @@ int bch_dev_add(struct cache_set *c, const char *path) sizeof(uuid_le))) goto have_slot; no_slot: - up_read(&c->gc_lock); - err = "no slots available in superblock"; ret = -ENOSPC; goto err_unlock; have_slot: - up_read(&c->gc_lock); - nr_devices = max_t(unsigned, dev_idx + 1, c->sb.nr_devices); u64s = (sizeof(struct bch_sb_field_members) + sizeof(struct bch_member) * nr_devices) / sizeof(u64); @@ -1565,46 +1629,47 @@ have_slot: memcpy(dev_mi, mi, u64s * sizeof(u64)); dev_mi->members[dev_idx] = saved_mi; + sb.sb->uuid = c->disk_sb->uuid; sb.sb->dev_idx = dev_idx; sb.sb->nr_devices = nr_devices; - if (bch_fs_mi_update(c, dev_mi->members, nr_devices)) { - err = "cannot allocate memory"; - ret = -ENOMEM; - goto err_unlock; - } - /* commit new member info */ memcpy(mi, dev_mi, u64s * sizeof(u64)); c->disk_sb->nr_devices = nr_devices; c->sb.nr_devices = nr_devices; - err = bch_dev_alloc(&sb, c, &ca); - if (err) + if (bch_dev_alloc(c, dev_idx)) { + err = "cannot allocate memory"; + ret = -ENOMEM; goto err_unlock; + } - bch_write_super(c); + if (bch_dev_online(c, &sb)) { + err = "bch_dev_online() error"; + ret = -ENOMEM; + goto err_unlock; + } - err = "journal alloc failed"; - if (bch_dev_journal_alloc(ca)) - goto err_put; + bch_write_super(c); + mutex_unlock(&c->sb_lock); - bch_notify_dev_added(ca); + ca = c->devs[dev_idx]; + if (ca->mi.state == BCH_MEMBER_STATE_RW) { + err = "journal alloc failed"; + if (bch_dev_journal_alloc(ca)) + goto err; - if (ca->mi.state == BCH_MEMBER_STATE_ACTIVE) { err = __bch_dev_read_write(c, ca); if (err) - goto err_put; + goto err; } - kobject_put(&ca->kobj); - mutex_unlock(&c->sb_lock); + bch_notify_dev_added(ca); mutex_unlock(&c->state_lock); return 0; -err_put: - bch_dev_stop(ca); err_unlock: mutex_unlock(&c->sb_lock); +err: mutex_unlock(&c->state_lock); bch_free_super(&sb); @@ -1615,12 +1680,12 @@ err_unlock: /* Filesystem open: */ const char *bch_fs_open(char * const *devices, unsigned nr_devices, - struct bch_opts opts, struct cache_set **ret) + struct bch_opts opts, struct bch_fs **ret) { const char *err; - struct cache_set *c = NULL; + struct bch_fs *c = NULL; struct bcache_superblock *sb; - unsigned i; + 
unsigned i, best_sb = 0; if (!nr_devices) return "need at least one device"; @@ -1647,19 +1712,33 @@ const char *bch_fs_open(char * const *devices, unsigned nr_devices, goto err; } - err = "cannot allocate memory"; - c = bch_fs_alloc(sb[0].sb, opts); - if (!c) - goto err; + for (i = 1; i < nr_devices; i++) + if (le64_to_cpu(sb[i].sb->seq) > + le64_to_cpu(sb[best_sb].sb->seq)) + best_sb = i; for (i = 0; i < nr_devices; i++) { - err = bch_dev_alloc(&sb[i], c, NULL); + err = bch_dev_in_fs(sb[best_sb].sb, sb[i].sb); if (err) goto err; } + err = "cannot allocate memory"; + c = bch_fs_alloc(sb[best_sb].sb, opts); + if (!c) + goto err; + + err = "bch_dev_online() error"; + mutex_lock(&c->sb_lock); + for (i = 0; i < nr_devices; i++) + if (bch_dev_online(c, &sb[i])) { + mutex_unlock(&c->sb_lock); + goto err; + } + mutex_unlock(&c->sb_lock); + err = "insufficient devices"; - if (bch_fs_nr_online_devices(c) != bch_fs_nr_devices(c)) + if (!bch_fs_may_start(c, 0)) goto err; if (!c->opts.nostart) { @@ -1697,19 +1776,19 @@ static const char *__bch_fs_open_incremental(struct bcache_superblock *sb, struct bch_opts opts) { const char *err; - struct cache_set *c; - bool allocated_cache_set = false; + struct bch_fs *c; + bool allocated_fs = false; err = bch_validate_cache_super(sb); if (err) return err; - mutex_lock(&bch_register_lock); - c = bch_fs_lookup(sb->sb->uuid); + mutex_lock(&bch_fs_list_lock); + c = __bch_uuid_to_fs(sb->sb->uuid); if (c) { closure_get(&c->cl); - err = bch_dev_in_fs(sb->sb, c); + err = bch_dev_in_fs(c->disk_sb, sb->sb); if (err) goto err; } else { @@ -1718,15 +1797,19 @@ static const char *__bch_fs_open_incremental(struct bcache_superblock *sb, if (!c) goto err; - allocated_cache_set = true; + allocated_fs = true; } - err = bch_dev_alloc(sb, c, NULL); - if (err) + err = "bch_dev_online() error"; + + mutex_lock(&c->sb_lock); + if (bch_dev_online(c, sb)) { + mutex_unlock(&c->sb_lock); goto err; + } + mutex_unlock(&c->sb_lock); - if (bch_fs_nr_online_devices(c) == bch_fs_nr_devices(c) && - !c->opts.nostart) { + if (!c->opts.nostart && bch_fs_may_start(c, 0)) { err = __bch_fs_start(c); if (err) goto err; @@ -1737,13 +1820,13 @@ static const char *__bch_fs_open_incremental(struct bcache_superblock *sb, goto err; closure_put(&c->cl); - mutex_unlock(&bch_register_lock); + mutex_unlock(&bch_fs_list_lock); return NULL; err: - mutex_unlock(&bch_register_lock); + mutex_unlock(&bch_fs_list_lock); - if (allocated_cache_set) + if (allocated_fs) bch_fs_stop(c); else if (c) closure_put(&c->cl); @@ -1762,9 +1845,9 @@ const char *bch_fs_open_incremental(const char *path) return err; if (__SB_IS_BDEV(le64_to_cpu(sb.sb->version))) { - mutex_lock(&bch_register_lock); + mutex_lock(&bch_fs_list_lock); err = bch_backing_dev_register(&sb); - mutex_unlock(&bch_register_lock); + mutex_unlock(&bch_fs_list_lock); } else { err = __bch_fs_open_incremental(&sb, opts); } @@ -1821,9 +1904,9 @@ static int bcache_reboot(struct notifier_block *n, unsigned long code, void *x) if (code == SYS_DOWN || code == SYS_HALT || code == SYS_POWER_OFF) { - struct cache_set *c; + struct bch_fs *c; - mutex_lock(&bch_register_lock); + mutex_lock(&bch_fs_list_lock); if (!list_empty(&bch_fs_list)) pr_info("Setting all devices read only:"); @@ -1834,7 +1917,7 @@ static int bcache_reboot(struct notifier_block *n, unsigned long code, void *x) list_for_each_entry(c, &bch_fs_list, list) bch_fs_read_only(c); - mutex_unlock(&bch_register_lock); + mutex_unlock(&bch_fs_list_lock); } return NOTIFY_DONE; @@ -1878,7 +1961,6 @@ static int __init 
bcache_init(void) NULL }; - mutex_init(&bch_register_lock); register_reboot_notifier(&reboot); closure_debug_init(); bkey_pack_test();