-rw-r--r--  fs/bcachefs/btree_gc.c    18
-rw-r--r--  fs/bcachefs/btree_gc.h     1
-rw-r--r--  fs/bcachefs/buckets.c      2
-rw-r--r--  fs/bcachefs/journal.c     66
-rw-r--r--  fs/bcachefs/super-io.c    45
-rw-r--r--  fs/bcachefs/super.c      303
-rw-r--r--  fs/bcachefs/sysfs.c       11
7 files changed, 283 insertions, 163 deletions
diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c
index bbd9b29fae86..c803cec5baa4 100644
--- a/fs/bcachefs/btree_gc.c
+++ b/fs/bcachefs/btree_gc.c
@@ -273,16 +273,11 @@ static void mark_metadata_sectors(struct cache *ca, u64 start, u64 end,
} while (b < end >> ca->bucket_bits);
}
-/*
- * Mark non btree metadata - prios, journal
- */
-static void bch_mark_dev_metadata(struct cache_set *c, struct cache *ca)
+void bch_dev_mark_superblocks(struct cache *ca)
{
struct bch_sb_layout *layout = &ca->disk_sb.sb->layout;
unsigned i;
- u64 b;
- /* Mark superblocks: */
for (i = 0; i < layout->nr_superblocks; i++) {
if (layout->sb_offset[i] == BCH_SB_SECTOR)
mark_metadata_sectors(ca, 0, BCH_SB_SECTOR,
@@ -294,6 +289,17 @@ static void bch_mark_dev_metadata(struct cache_set *c, struct cache *ca)
(1 << layout->sb_max_size_bits),
BUCKET_SB);
}
+}
+
+/*
+ * Mark non btree metadata - prios, journal
+ */
+static void bch_mark_dev_metadata(struct cache_set *c, struct cache *ca)
+{
+ unsigned i;
+ u64 b;
+
+ bch_dev_mark_superblocks(ca);
spin_lock(&c->journal.lock);
diff --git a/fs/bcachefs/btree_gc.h b/fs/bcachefs/btree_gc.h
index 0607187f6081..293af0c86e23 100644
--- a/fs/bcachefs/btree_gc.h
+++ b/fs/bcachefs/btree_gc.h
@@ -13,6 +13,7 @@ int bch_initial_gc(struct cache_set *, struct list_head *);
u8 bch_btree_key_recalc_oldest_gen(struct cache_set *, struct bkey_s_c);
u8 bch_btree_mark_key_initial(struct cache_set *, enum bkey_type,
struct bkey_s_c);
+void bch_dev_mark_superblocks(struct cache *);
/*
* For concurrent mark and sweep (with other index updates), we define a total
diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c
index 0dde6fb0c6eb..df37a8817b20 100644
--- a/fs/bcachefs/buckets.c
+++ b/fs/bcachefs/buckets.c
@@ -247,7 +247,7 @@ static bool bucket_became_unavailable(struct cache_set *c,
{
return is_available_bucket(old) &&
!is_available_bucket(new) &&
- c->gc_pos.phase == GC_PHASE_DONE;
+ c && c->gc_pos.phase == GC_PHASE_DONE;
}
static void bucket_stats_update(struct cache *ca,
diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c
index 2992789850d4..9cd31c437833 100644
--- a/fs/bcachefs/journal.c
+++ b/fs/bcachefs/journal.c
@@ -1527,8 +1527,13 @@ err:
return ret;
}
+#if 0
+/*
+ * Allocate more journal space at runtime - not currently making use of it,
+ * but the code works:
+ */
static int bch_set_nr_journal_buckets(struct cache_set *c, struct cache *ca,
- unsigned nr, bool write_super)
+ unsigned nr)
{
struct journal *j = &c->journal;
struct journal_device *ja = &ca->journal;
@@ -1615,8 +1620,7 @@ static int bch_set_nr_journal_buckets(struct cache_set *c, struct cache *ca,
BUG_ON(bch_validate_journal_layout(ca->disk_sb.sb, ca->mi));
- if (write_super)
- bch_write_super(c);
+ bch_write_super(c);
ret = 0;
err:
@@ -1628,9 +1632,15 @@ err:
return ret;
}
+#endif
int bch_dev_journal_alloc(struct cache *ca)
{
+ struct journal_device *ja = &ca->journal;
+ struct bch_sb_field_journal *journal_buckets;
+ unsigned i, nr;
+ u64 b, *p;
+
if (dynamic_fault("bcache:add:journal_alloc"))
return -ENOMEM;
@@ -1638,12 +1648,50 @@ int bch_dev_journal_alloc(struct cache *ca)
* clamp journal size to 1024 buckets or 512MB (in sectors), whichever
* is smaller:
*/
- return bch_set_nr_journal_buckets(ca->set, ca,
- clamp_t(unsigned, ca->mi.nbuckets >> 8,
- BCH_JOURNAL_BUCKETS_MIN,
- min(1 << 10,
- (1 << 20) / ca->mi.bucket_size)),
- false);
+ nr = clamp_t(unsigned, ca->mi.nbuckets >> 8,
+ BCH_JOURNAL_BUCKETS_MIN,
+ min(1 << 10,
+ (1 << 20) / ca->mi.bucket_size));
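+	/*
+	 * e.g. a device with 2^20 buckets of 1024 sectors each:
+	 * nbuckets >> 8 = 4096, capped at min(1 << 10,
+	 * (1 << 20) / 1024) = 1024 -> 1024 journal buckets (512MB)
+	 */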
+
+ p = krealloc(ja->bucket_seq, nr * sizeof(u64),
+ GFP_KERNEL|__GFP_ZERO);
+ if (!p)
+ return -ENOMEM;
+
+ ja->bucket_seq = p;
+
+ p = krealloc(ja->buckets, nr * sizeof(u64),
+ GFP_KERNEL|__GFP_ZERO);
+ if (!p)
+ return -ENOMEM;
+
+ ja->buckets = p;
+
+ journal_buckets = bch_sb_resize_journal(&ca->disk_sb,
+ nr + sizeof(*journal_buckets) / sizeof(u64));
+ if (!journal_buckets)
+ return -ENOMEM;
+
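+	/*
+	 * Walk forward from the first bucket, claiming the first nr
+	 * buckets that aren't already in use:
+	 */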
+ for (i = 0, b = ca->mi.first_bucket;
+ i < nr && b < ca->mi.nbuckets; b++) {
+ if (!is_available_bucket(ca->buckets[b].mark))
+ continue;
+
+ bch_mark_metadata_bucket(ca, &ca->buckets[b],
+ BUCKET_JOURNAL, true);
+ ja->buckets[i] = b;
+ journal_buckets->buckets[i] = cpu_to_le64(b);
+ i++;
+ }
+
+ if (i < nr)
+ return -ENOSPC;
+
+ BUG_ON(bch_validate_journal_layout(ca->disk_sb.sb, ca->mi));
+
+ ja->nr = nr;
+
+ return 0;
}
/* Journalling */
diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c
index 3a53b7ea2761..75f861b70381 100644
--- a/fs/bcachefs/super-io.c
+++ b/fs/bcachefs/super-io.c
@@ -276,6 +276,31 @@ err:
return err;
}
+static const char *bch_sb_validate_members(struct bch_sb *sb)
+{
+ struct bch_sb_field_members *mi;
+ unsigned i;
+
+ mi = bch_sb_get_members(sb);
+ if (!mi)
+ return "Invalid superblock: member info area missing";
+
+ if ((void *) (mi->members + sb->nr_devices) >
+ vstruct_end(&mi->field))
+ return "Invalid superblock: bad member info";
+
+ for (i = 0; i < sb->nr_devices; i++) {
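+		/* a zeroed uuid marks an unused/removed member slot: */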
+ if (bch_is_zero(mi->members[i].uuid.b, sizeof(uuid_le)))
+ continue;
+
+ if (le16_to_cpu(mi->members[i].bucket_size) <
+ BCH_SB_BTREE_NODE_SIZE(sb))
+ return "bucket size smaller than btree node size";
+ }
+
+ return NULL;
+}
+
const char *bch_validate_cache_super(struct bcache_superblock *disk_sb)
{
struct bch_sb *sb = disk_sb->sb;
@@ -378,15 +403,11 @@ const char *bch_validate_cache_super(struct bcache_superblock *disk_sb)
return "Invalid superblock: unknown optional field type";
}
- /* Validate member info: */
- sb_mi = bch_sb_get_members(sb);
- if (!sb_mi)
- return "Invalid superblock: member info area missing";
-
- if ((void *) (sb_mi->members + sb->nr_devices) >
- vstruct_end(&sb_mi->field))
- return "Invalid superblock: bad member info";
+ err = bch_sb_validate_members(sb);
+ if (err)
+ return err;
+ sb_mi = bch_sb_get_members(sb);
mi = cache_mi_to_cpu_mi(sb_mi->members + sb->dev_idx);
if (mi.nbuckets > LONG_MAX)
@@ -808,9 +829,6 @@ void bch_write_super(struct cache_set *c)
lockdep_assert_held(&c->sb_lock);
- if (c->opts.nochanges)
- return;
-
closure_init_stack(cl);
le64_add_cpu(&c->disk_sb->seq, 1);
@@ -818,6 +836,9 @@ void bch_write_super(struct cache_set *c)
for_each_cache(ca, c, i)
bch_sb_from_cache_set(c, ca);
+ if (c->opts.nochanges)
+ goto out;
+
do {
wrote = false;
for_each_cache(ca, c, i)
@@ -827,7 +848,7 @@ void bch_write_super(struct cache_set *c)
closure_sync(cl);
super_idx++;
} while (wrote);
-
+out:
/* Make new options visible after they're persistent: */
bch_fs_mi_update(c, members->members, c->sb.nr_devices);
bch_sb_update(c);
diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c
index 57b6a0a42491..9aa35bbe5a9c 100644
--- a/fs/bcachefs/super.c
+++ b/fs/bcachefs/super.c
@@ -463,34 +463,6 @@ void bch_fs_detach(struct cache_set *c)
bch_fs_stop_async(c);
}
-static unsigned bch_fs_nr_devices(struct cache_set *c)
-{
- struct bch_sb_field_members *mi;
- unsigned i, nr = 0;
-
- mutex_lock(&c->sb_lock);
- mi = bch_sb_get_members(c->disk_sb);
-
- for (i = 0; i < c->disk_sb->nr_devices; i++)
- if (!bch_is_zero(mi->members[i].uuid.b, sizeof(uuid_le)))
- nr++;
-
- mutex_unlock(&c->sb_lock);
-
- return nr;
-}
-
-static unsigned bch_fs_nr_online_devices(struct cache_set *c)
-{
- unsigned i, nr = 0;
-
- for (i = 0; i < c->sb.nr_devices; i++)
- if (c->cache[i])
- nr++;
-
- return nr;
-}
-
#define alloc_bucket_pages(gfp, ca) \
((void *) __get_free_pages(__GFP_ZERO|gfp, ilog2(bucket_pages(ca))))
@@ -746,12 +718,10 @@ static const char *__bch_fs_start(struct cache_set *c)
BUG_ON(c->state != BCH_FS_STARTING);
- /*
- * Make sure that each cache object's mi is up to date before
- * we start testing it.
- */
+ mutex_lock(&c->sb_lock);
for_each_cache(ca, c, i)
bch_sb_from_cache_set(c, ca);
+ mutex_unlock(&c->sb_lock);
if (BCH_SB_INITIALIZED(c->disk_sb)) {
ret = bch_journal_read(c, &journal);
@@ -853,14 +823,6 @@ static const char *__bch_fs_start(struct cache_set *c)
bch_initial_gc(c, NULL);
- err = "error starting allocator thread";
- for_each_cache(ca, c, i)
- if (ca->mi.state == BCH_MEMBER_STATE_ACTIVE &&
- bch_dev_allocator_start(ca)) {
- percpu_ref_put(&ca->ref);
- goto err;
- }
-
err = "unable to allocate journal buckets";
for_each_cache(ca, c, i)
if (bch_dev_journal_alloc(ca)) {
@@ -875,6 +837,14 @@ static const char *__bch_fs_start(struct cache_set *c)
bch_journal_start(c);
bch_journal_set_replay_done(&c->journal);
+ err = "error starting allocator thread";
+ for_each_cache(ca, c, i)
+ if (ca->mi.state == BCH_MEMBER_STATE_ACTIVE &&
+ bch_dev_allocator_start(ca)) {
+ percpu_ref_put(&ca->ref);
+ goto err;
+ }
+
err = "cannot allocate new btree root";
for (id = 0; id < BTREE_ID_NR; id++)
if (bch_btree_root_alloc(c, id, &cl)) {
@@ -984,33 +954,28 @@ static const char *bch_dev_may_add(struct bch_sb *sb, struct cache_set *c)
if (le16_to_cpu(sb_mi->members[sb->dev_idx].bucket_size) <
BCH_SB_BTREE_NODE_SIZE(c->disk_sb))
- return "new cache bucket_size is too small";
+ return "new cache bucket size is too small";
return NULL;
}
-static const char *bch_dev_in_fs(struct bch_sb *sb, struct cache_set *c)
+static const char *bch_dev_in_fs(struct bch_sb *fs, struct bch_sb *sb)
{
- struct bch_sb_field_members *mi = bch_sb_get_members(c->disk_sb);
- struct bch_sb_field_members *dev_mi = bch_sb_get_members(sb);
- uuid_le dev_uuid = dev_mi->members[sb->dev_idx].uuid;
- const char *err;
+ struct bch_sb *newest =
+ le64_to_cpu(fs->seq) > le64_to_cpu(sb->seq) ? fs : sb;
+ struct bch_sb_field_members *mi = bch_sb_get_members(newest);
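+
+	/*
+	 * The superblock with the higher sequence number has the most
+	 * recent member info, so validate against that one:
+	 */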
- err = bch_dev_may_add(sb, c);
- if (err)
- return err;
+ if (uuid_le_cmp(fs->uuid, sb->uuid))
+ return "device not a member of filesystem";
- if (bch_is_zero(&dev_uuid, sizeof(dev_uuid)))
+ if (sb->dev_idx >= newest->nr_devices)
+ return "device has invalid dev_idx";
+
+ if (bch_is_zero(mi->members[sb->dev_idx].uuid.b, sizeof(uuid_le)))
return "device has been removed";
- /*
- * When attaching an existing device, the cache set superblock must
- * already contain member_info with a matching UUID
- */
- if (sb->dev_idx >= c->disk_sb->nr_devices ||
- memcmp(&mi->members[sb->dev_idx].uuid,
- &dev_uuid, sizeof(uuid_le)))
- return "cache sb does not match set";
+ if (fs->block_size != sb->block_size)
+ return "mismatched block size";
return NULL;
}
@@ -1128,31 +1093,25 @@ static int bch_dev_online(struct cache *ca)
return 0;
}
-static const char *bch_dev_alloc(struct bcache_superblock *sb,
- struct cache_set *c,
- struct cache **ret)
+static struct cache *__bch_dev_alloc(struct bcache_superblock *sb)
{
struct bch_member *member;
size_t reserve_none, movinggc_reserve, free_inc_reserve, total_reserve;
size_t heap_size;
unsigned i;
- const char *err = "cannot allocate memory";
struct cache *ca;
- if (c->sb.nr_devices == 1)
- bdevname(sb->bdev, c->name);
-
if (bch_fs_init_fault("dev_alloc"))
- return err;
+ return NULL;
ca = kzalloc(sizeof(*ca), GFP_KERNEL);
if (!ca)
- return err;
+ return NULL;
if (percpu_ref_init(&ca->ref, bch_dev_percpu_ref_release,
0, GFP_KERNEL)) {
kfree(ca);
- return err;
+ return NULL;
}
kobject_init(&ca->kobj, &bch_dev_ktype);
@@ -1175,7 +1134,6 @@ static const char *bch_dev_alloc(struct bcache_superblock *sb,
INIT_WORK(&ca->io_error_work, bch_nonfatal_io_error_work);
- err = "dynamic fault";
if (bch_fs_init_fault("dev_alloc"))
goto err;
@@ -1229,6 +1187,20 @@ static const char *bch_dev_alloc(struct bcache_superblock *sb,
ca->copygc_write_point.group = &ca->self;
ca->tiering_write_point.group = &ca->self;
+ return ca;
+err:
+ bch_dev_free(ca);
+ return NULL;
+}
+
+static const char *__bch_dev_add(struct cache_set *c, struct cache *ca)
+{
+ if (c->cache[ca->dev_idx])
+ return "already have device online in this slot";
+
+ if (c->sb.nr_devices == 1)
+ bdevname(ca->disk_sb.bdev, c->name);
+
/*
* Increase journal write timeout if flushes to this device are
* expensive:
@@ -1244,66 +1216,87 @@ static const char *bch_dev_alloc(struct bcache_superblock *sb,
kobject_get(&ca->kobj);
rcu_assign_pointer(c->cache[ca->dev_idx], ca);
- mutex_lock(&c->sb_lock);
-
- if (le64_to_cpu(ca->disk_sb.sb->seq) > le64_to_cpu(c->disk_sb->seq))
- bch_sb_to_cache_set(c, ca->disk_sb.sb);
-
- mutex_unlock(&c->sb_lock);
-
- err = "error creating kobject";
if (c->kobj.state_in_sysfs &&
bch_dev_online(ca))
pr_warn("error creating sysfs objects");
+ return NULL;
+}
+
+static const char *bch_dev_alloc(struct bcache_superblock *sb,
+ struct cache_set *c,
+ struct cache **ret)
+{
+ struct cache *ca;
+ const char *err;
+
+ ca = __bch_dev_alloc(sb);
+ if (!ca)
+ return "cannot allocate memory";
+
+ err = __bch_dev_add(c, ca);
+ if (err) {
+ bch_dev_free(ca);
+ return err;
+ }
+
+ mutex_lock(&c->sb_lock);
+ if (le64_to_cpu(ca->disk_sb.sb->seq) >
+ le64_to_cpu(c->disk_sb->seq))
+ bch_sb_to_cache_set(c, ca->disk_sb.sb);
+ mutex_unlock(&c->sb_lock);
+
if (ret)
*ret = ca;
else
kobject_put(&ca->kobj);
return NULL;
-err:
- bch_dev_free(ca);
- return err;
}
/* Device management: */
-static void __bch_dev_read_only(struct cache_set *c, struct cache *ca)
+bool bch_fs_may_start(struct cache_set *c, int flags)
{
- bch_moving_gc_stop(ca);
-
- /*
- * This stops new data writes (e.g. to existing open data
- * buckets) and then waits for all existing writes to
- * complete.
- */
- bch_dev_allocator_stop(ca);
-
- bch_dev_group_remove(&c->journal.devs, ca);
-}
+ struct bch_sb_field_members *mi;
+ unsigned meta_missing = 0;
+ unsigned data_missing = 0;
+ bool degraded = false;
+ unsigned i;
-static const char *__bch_dev_read_write(struct cache_set *c, struct cache *ca)
-{
- lockdep_assert_held(&c->state_lock);
+ mutex_lock(&c->sb_lock);
+ mi = bch_sb_get_members(c->disk_sb);
- if (ca->mi.state == BCH_MEMBER_STATE_ACTIVE)
- return NULL;
+ for (i = 0; i < c->disk_sb->nr_devices; i++)
+ if (!c->cache[i] &&
+ !bch_is_zero(mi->members[i].uuid.b, sizeof(uuid_le))) {
+ degraded = true;
+ if (BCH_MEMBER_HAS_METADATA(&mi->members[i]))
+ meta_missing++;
+ if (BCH_MEMBER_HAS_DATA(&mi->members[i]))
+ data_missing++;
+ }
+ mutex_unlock(&c->sb_lock);
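+
+	/*
+	 * Any missing device means we're degraded; missing at least as
+	 * many devices as we have replicas means metadata or data may
+	 * have been lost outright, which needs the stronger _LOST flags:
+	 */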
- trace_bcache_cache_read_write(ca);
+ if (degraded &&
+ !(flags & BCH_FORCE_IF_DEGRADED))
+ return false;
- if (bch_dev_allocator_start(ca))
- return "error starting allocator thread";
+ if (meta_missing &&
+ !(flags & BCH_FORCE_IF_METADATA_DEGRADED))
+ return false;
- if (bch_moving_gc_start(ca))
- return "error starting moving GC thread";
+ if (meta_missing >= BCH_SB_META_REPLICAS_HAVE(c->disk_sb) &&
+ !(flags & BCH_FORCE_IF_METADATA_LOST))
+ return false;
- if (bch_tiering_start(c))
- return "error starting tiering thread";
+ if (data_missing && !(flags & BCH_FORCE_IF_DATA_DEGRADED))
+ return false;
- bch_notify_dev_read_write(ca);
- trace_bcache_cache_read_write_done(ca);
+ if (data_missing >= BCH_SB_DATA_REPLICAS_HAVE(c->disk_sb) &&
+ !(flags & BCH_FORCE_IF_DATA_LOST))
+ return false;
- return NULL;
+ return true;
}
bool bch_dev_state_allowed(struct cache_set *c, struct cache *ca,
@@ -1335,6 +1328,44 @@ bool bch_dev_state_allowed(struct cache_set *c, struct cache *ca,
return true;
}
+static void __bch_dev_read_only(struct cache_set *c, struct cache *ca)
+{
+ bch_moving_gc_stop(ca);
+
+ /*
+ * This stops new data writes (e.g. to existing open data
+ * buckets) and then waits for all existing writes to
+ * complete.
+ */
+ bch_dev_allocator_stop(ca);
+
+ bch_dev_group_remove(&c->journal.devs, ca);
+}
+
+static const char *__bch_dev_read_write(struct cache_set *c, struct cache *ca)
+{
+ lockdep_assert_held(&c->state_lock);
+
+ if (ca->mi.state == BCH_MEMBER_STATE_ACTIVE)
+ return NULL;
+
+ trace_bcache_cache_read_write(ca);
+
+ if (bch_dev_allocator_start(ca))
+ return "error starting allocator thread";
+
+ if (bch_moving_gc_start(ca))
+ return "error starting moving GC thread";
+
+ if (bch_tiering_start(c))
+ return "error starting tiering thread";
+
+ bch_notify_dev_read_write(ca);
+ trace_bcache_cache_read_write_done(ca);
+
+ return NULL;
+}
+
int __bch_dev_set_state(struct cache_set *c, struct cache *ca,
enum bch_member_state new_state, int flags)
{
@@ -1496,13 +1527,13 @@ int bch_dev_add(struct cache_set *c, const char *path)
{
struct bcache_superblock sb;
const char *err;
- struct cache *ca;
+ struct cache *ca = NULL;
struct bch_sb_field_members *mi, *dev_mi;
struct bch_member saved_mi;
unsigned dev_idx, nr_devices, u64s;
int ret = -EINVAL;
- err = bch_read_super(&sb, c->opts, path);
+ err = bch_read_super(&sb, bch_opts_empty(), path);
if (err)
return -EINVAL;
@@ -1525,6 +1556,10 @@ int bch_dev_add(struct cache_set *c, const char *path)
saved_mi = dev_mi->members[sb.sb->dev_idx];
saved_mi.last_mount = cpu_to_le64(ktime_get_seconds());
+ /*
+ * XXX: ditch the GC stuff, just don't remove a device until nothing is
+ * using its dev_idx anymore
+ */
down_read(&c->gc_lock);
if (dynamic_fault("bcache:add:no_slot"))
@@ -1565,6 +1600,7 @@ have_slot:
memcpy(dev_mi, mi, u64s * sizeof(u64));
dev_mi->members[dev_idx] = saved_mi;
+ sb.sb->uuid = c->disk_sb->uuid;
sb.sb->dev_idx = dev_idx;
sb.sb->nr_devices = nr_devices;
@@ -1579,33 +1615,42 @@ have_slot:
c->disk_sb->nr_devices = nr_devices;
c->sb.nr_devices = nr_devices;
- err = bch_dev_alloc(&sb, c, &ca);
- if (err)
+ ca = __bch_dev_alloc(&sb);
+ if (!ca) {
+ err = "cannot allocate memory";
+ ret = -ENOMEM;
goto err_unlock;
+ }
- bch_write_super(c);
+ bch_dev_mark_superblocks(ca);
err = "journal alloc failed";
if (bch_dev_journal_alloc(ca))
- goto err_put;
+ goto err_unlock;
- bch_notify_dev_added(ca);
+ err = __bch_dev_add(c, ca);
+ BUG_ON(err);
+
+ bch_write_super(c);
+ mutex_unlock(&c->sb_lock);
if (ca->mi.state == BCH_MEMBER_STATE_ACTIVE) {
err = __bch_dev_read_write(c, ca);
if (err)
- goto err_put;
+ goto err;
}
+ bch_notify_dev_added(ca);
+
kobject_put(&ca->kobj);
- mutex_unlock(&c->sb_lock);
mutex_unlock(&c->state_lock);
return 0;
-err_put:
- bch_dev_stop(ca);
err_unlock:
mutex_unlock(&c->sb_lock);
+err:
mutex_unlock(&c->state_lock);
+ if (ca)
+ bch_dev_stop(ca);
bch_free_super(&sb);
bch_err(c, "Unable to add device: %s", err);
@@ -1620,7 +1665,7 @@ const char *bch_fs_open(char * const *devices, unsigned nr_devices,
const char *err;
struct cache_set *c = NULL;
struct bcache_superblock *sb;
- unsigned i;
+ unsigned i, best_sb = 0;
if (!nr_devices)
return "need at least one device";
@@ -1647,8 +1692,19 @@ const char *bch_fs_open(char * const *devices, unsigned nr_devices,
goto err;
}
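+	/*
+	 * Use the superblock with the highest sequence number as the
+	 * authoritative copy, and validate all the others against it:
+	 */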
+ for (i = 1; i < nr_devices; i++)
+ if (le64_to_cpu(sb[i].sb->seq) >
+ le64_to_cpu(sb[best_sb].sb->seq))
+ best_sb = i;
+
+ for (i = 0; i < nr_devices; i++) {
+ err = bch_dev_in_fs(sb[best_sb].sb, sb[i].sb);
+ if (err)
+ goto err;
+ }
+
err = "cannot allocate memory";
- c = bch_fs_alloc(sb[0].sb, opts);
+ c = bch_fs_alloc(sb[best_sb].sb, opts);
if (!c)
goto err;
@@ -1659,7 +1715,7 @@ const char *bch_fs_open(char * const *devices, unsigned nr_devices,
}
err = "insufficient devices";
- if (bch_fs_nr_online_devices(c) != bch_fs_nr_devices(c))
+ if (!bch_fs_may_start(c, 0))
goto err;
if (!c->opts.nostart) {
@@ -1709,7 +1765,7 @@ static const char *__bch_fs_open_incremental(struct bcache_superblock *sb,
if (c) {
closure_get(&c->cl);
- err = bch_dev_in_fs(sb->sb, c);
+ err = bch_dev_in_fs(c->disk_sb, sb->sb);
if (err)
goto err;
} else {
@@ -1725,8 +1781,7 @@ static const char *__bch_fs_open_incremental(struct bcache_superblock *sb,
if (err)
goto err;
- if (bch_fs_nr_online_devices(c) == bch_fs_nr_devices(c) &&
- !c->opts.nostart) {
+ if (!c->opts.nostart && bch_fs_may_start(c, 0)) {
err = __bch_fs_start(c);
if (err)
goto err;
diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c
index 56df0089467a..5717bffe5c8c 100644
--- a/fs/bcachefs/sysfs.c
+++ b/fs/bcachefs/sysfs.c
@@ -37,7 +37,6 @@ write_attribute(trigger_btree_coalesce);
write_attribute(trigger_gc);
write_attribute(prune_cache);
write_attribute(blockdev_volume_create);
-write_attribute(add_device);
read_attribute(uuid);
read_attribute(minor);
@@ -839,15 +838,6 @@ STORE(bch_fs)
size = __bch_fs_store(kobj, attr, buf, size);
mutex_unlock(&c->state_lock);
- if (attr == &sysfs_add_device) {
- char *path = kstrdup(buf, GFP_KERNEL);
- int r = bch_dev_add(c, strim(path));
-
- kfree(path);
- if (r)
- return r;
- }
-
return size;
}
@@ -858,7 +848,6 @@ static struct attribute *bch_fs_files[] = {
&sysfs_journal_reclaim_delay_ms,
&sysfs_journal_entry_size_max,
&sysfs_blockdev_volume_create,
- &sysfs_add_device,
&sysfs_block_size,
&sysfs_block_size_bytes,