author    Kent Overstreet <kent.overstreet@gmail.com>  2017-03-07 21:01:45 -0900
committer Kent Overstreet <kent.overstreet@gmail.com>  2017-03-17 19:49:21 -0800
commit    af0c15d3b5b82454c19aa0ec6338001aa95f81b6
tree      86d97f2883ad2c3dc506cab43237d22a1aeb853d
parent    71b5e60dbca8d8260d4b2c425523a9a7945eb3d7
bcachefs: Make shutdown path less asynchronous
Don't punt to workqueue unnecessarily - this fixes some deadlocks that were exposed when building in userspace.

Also refactor a bunch of locking - start to break up/get rid of bch_register_lock.
 fs/bcachefs/bcache.h   |  31
 fs/bcachefs/blockdev.c |  12
 fs/bcachefs/btree_gc.c |  22
 fs/bcachefs/error.c    |   4
 fs/bcachefs/fs.c       |  14
 fs/bcachefs/io.c       |  11
 fs/bcachefs/journal.c  |  20
 fs/bcachefs/journal.h  |   1
 fs/bcachefs/movinggc.c |  25
 fs/bcachefs/movinggc.h |   4
 fs/bcachefs/super-io.c |  13
 fs/bcachefs/super.c    | 603
 fs/bcachefs/sysfs.c    |  21
 fs/bcachefs/tier.c     |   2
 14 files changed, 395 insertions(+), 388 deletions(-)
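
The heart of the refactor is visible in bcache.h and super.c below: a per-filesystem enum bch_fs_state plus a state_lock mutex replace the old BCH_FS_RUNNING/BCH_FS_STOPPING/BCH_FS_RO flag bits, and bch_fs_read_only()/bch_fs_read_write() become synchronous transitions that are only legal from certain states. The following is a minimal userspace sketch of that pattern, not the kernel code: a pthread mutex stands in for the kernel's struct mutex, the journal/allocator/thread work is reduced to comments, and main() is purely illustrative.

/*
 * Sketch of the state machine introduced by this patch. Names mirror the
 * diff; everything else (pthread mutex, stub bodies, main) is illustrative.
 */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

enum bch_fs_state {
	BCH_FS_STARTING = 0,
	BCH_FS_STOPPING,
	BCH_FS_RO,
	BCH_FS_RW,
};

struct cache_set {
	pthread_mutex_t		state_lock;
	enum bch_fs_state	state;
};

static bool bch_fs_running(struct cache_set *c)
{
	return c->state == BCH_FS_RO || c->state == BCH_FS_RW;
}

/* Go read-only: legal from STARTING or RW, otherwise a no-op. */
static void bch_fs_read_only(struct cache_set *c)
{
	pthread_mutex_lock(&c->state_lock);
	if (c->state != BCH_FS_STARTING &&
	    c->state != BCH_FS_RW)
		goto out;

	/* The real function kills c->writes, flushes the journal and stops
	 * the allocator threads before flipping the state. */
	c->state = BCH_FS_RO;
out:
	pthread_mutex_unlock(&c->state_lock);
}

/* Go read-write: legal from STARTING or RO, otherwise a no-op. */
static const char *bch_fs_read_write(struct cache_set *c)
{
	const char *err = NULL;

	pthread_mutex_lock(&c->state_lock);
	if (c->state != BCH_FS_STARTING &&
	    c->state != BCH_FS_RO)
		goto out;

	/* The real function starts the allocator, GC and tiering threads here
	 * and sets err on failure; this sketch only flips the state. */
	c->state = BCH_FS_RW;
out:
	pthread_mutex_unlock(&c->state_lock);
	return err;
}

int main(void)
{
	struct cache_set c = {
		.state_lock = PTHREAD_MUTEX_INITIALIZER,
		.state      = BCH_FS_STARTING,
	};
	const char *err;

	printf("running: %d\n", bch_fs_running(&c));	/* 0 */

	err = bch_fs_read_write(&c);
	if (err)
		printf("going rw failed: %s\n", err);
	printf("running: %d\n", bch_fs_running(&c));	/* 1 */

	bch_fs_read_only(&c);
	printf("read-only: %d\n", c.state == BCH_FS_RO);	/* 1 */
	return 0;
}

In the patch the same state_lock also serializes bch_fs_stop(), device attach/add/remove and the sysfs store path, which is what lets most uses of the global bch_register_lock go away.
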
diff --git a/fs/bcachefs/bcache.h b/fs/bcachefs/bcache.h
index 249258c1474b..ca4646d5c7d6 100644
--- a/fs/bcachefs/bcache.h
+++ b/fs/bcachefs/bcache.h
@@ -464,24 +464,10 @@ struct cache {
* BCH_FS_UNREGISTERING means we're not just shutting down, we're detaching
* all the backing devices first (their cached data gets invalidated, and they
* won't automatically reattach).
- *
- * BCH_FS_STOPPING always gets set first when we're closing down a cache set;
- * we'll continue to run normally for awhile with BCH_FS_STOPPING set (i.e.
- * flushing dirty data).
- *
- * BCH_FS_RUNNING means all cache devices have been registered and journal
- * replay is complete.
*/
enum {
- /* Startup: */
BCH_FS_INITIAL_GC_DONE,
- BCH_FS_RUNNING,
-
- /* Shutdown: */
BCH_FS_DETACHING,
- BCH_FS_STOPPING,
- BCH_FS_RO,
- BCH_FS_RO_COMPLETE,
BCH_FS_EMERGENCY_RO,
BCH_FS_WRITE_DISABLE_COMPLETE,
BCH_FS_GC_STOPPING,
@@ -506,6 +492,13 @@ struct bch_tier {
struct cache_group devs;
};
+enum bch_fs_state {
+ BCH_FS_STARTING = 0,
+ BCH_FS_STOPPING,
+ BCH_FS_RO,
+ BCH_FS_RW,
+};
+
struct cache_set {
struct closure cl;
@@ -514,7 +507,6 @@ struct cache_set {
struct kobject internal;
struct kobject opts_dir;
struct kobject time_stats;
- struct completion *stop_completion;
unsigned long flags;
int minor;
@@ -522,6 +514,10 @@ struct cache_set {
struct super_block *vfs_sb;
char name[40];
+ /* ro/rw, add/remove devices: */
+ struct mutex state_lock;
+ enum bch_fs_state state;
+
/* Counts outstanding writes, for clean transition to read-only */
struct percpu_ref writes;
struct work_struct read_only_work;
@@ -834,6 +830,11 @@ struct cache_set {
#undef BCH_TIME_STAT
};
+static inline bool bch_fs_running(struct cache_set *c)
+{
+ return c->state == BCH_FS_RO || c->state == BCH_FS_RW;
+}
+
static inline unsigned bucket_pages(const struct cache *ca)
{
return ca->mi.bucket_size / PAGE_SECTORS;
diff --git a/fs/bcachefs/blockdev.c b/fs/bcachefs/blockdev.c
index 82b07f594a65..ba2e9a8cd891 100644
--- a/fs/bcachefs/blockdev.c
+++ b/fs/bcachefs/blockdev.c
@@ -375,6 +375,8 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c)
bool found;
int ret;
+ lockdep_assert_held(&c->state_lock);
+
bdevname(dc->disk_sb.bdev, buf);
if (memcmp(&dc->disk_sb.sb->set_uuid,
@@ -387,11 +389,8 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c)
return -EINVAL;
}
- if (!test_bit(BCH_FS_RUNNING, &c->flags))
- return 0;
-
- if (test_bit(BCH_FS_STOPPING, &c->flags)) {
- pr_err("Can't attach %s: shutting down", buf);
+ if (!bch_fs_running(c)) {
+ pr_err("Can't attach %s: not running", buf);
return -EINVAL;
}
@@ -497,6 +496,7 @@ void bch_attach_backing_devs(struct cache_set *c)
struct cached_dev *dc, *t;
lockdep_assert_held(&bch_register_lock);
+ lockdep_assert_held(&c->state_lock);
list_for_each_entry_safe(dc, t, &uncached_devices, list)
bch_cached_dev_attach(dc, c);
@@ -742,7 +742,7 @@ int bch_blockdev_volumes_start(struct cache_set *c)
struct bkey_s_c_inode_blockdev inode;
int ret = 0;
- if (test_bit(BCH_FS_STOPPING, &c->flags))
+ if (!bch_fs_running(c))
return -EINVAL;
for_each_btree_key(&iter, c, BTREE_ID_INODES, POS_MIN, k) {
diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c
index 32475f6c13d5..f474e8db0c50 100644
--- a/fs/bcachefs/btree_gc.c
+++ b/fs/bcachefs/btree_gc.c
@@ -792,9 +792,6 @@ void bch_coalesce(struct cache_set *c)
u64 start_time;
enum btree_id id;
- if (btree_gc_coalesce_disabled(c))
- return;
-
if (test_bit(BCH_FS_GC_FAILURE, &c->flags))
return;
@@ -853,7 +850,8 @@ static int bch_gc_thread(void *arg)
last_kick = atomic_read(&c->kick_gc);
bch_gc(c);
- bch_coalesce(c);
+ if (!btree_gc_coalesce_disabled(c))
+ bch_coalesce(c);
debug_check_no_locks_held();
}
@@ -865,18 +863,24 @@ void bch_gc_thread_stop(struct cache_set *c)
{
set_bit(BCH_FS_GC_STOPPING, &c->flags);
- if (!IS_ERR_OR_NULL(c->gc_thread))
+ if (c->gc_thread)
kthread_stop(c->gc_thread);
+
+ c->gc_thread = NULL;
+ clear_bit(BCH_FS_GC_STOPPING, &c->flags);
}
int bch_gc_thread_start(struct cache_set *c)
{
- clear_bit(BCH_FS_GC_STOPPING, &c->flags);
+ struct task_struct *p;
+
+ BUG_ON(c->gc_thread);
- c->gc_thread = kthread_create(bch_gc_thread, c, "bcache_gc");
- if (IS_ERR(c->gc_thread))
- return PTR_ERR(c->gc_thread);
+ p = kthread_create(bch_gc_thread, c, "bcache_gc");
+ if (IS_ERR(p))
+ return PTR_ERR(p);
+ c->gc_thread = p;
wake_up_process(c->gc_thread);
return 0;
}
diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c
index daf53bf53ca1..f4109da6ebeb 100644
--- a/fs/bcachefs/error.c
+++ b/fs/bcachefs/error.c
@@ -120,7 +120,7 @@ void bch_nonfatal_io_error_work(struct work_struct *work)
} else {
bch_notify_dev_error(ca, true);
- mutex_lock(&bch_register_lock);
+ mutex_lock(&c->state_lock);
dev = bch_dev_may_remove(ca);
if (dev
? bch_dev_read_only(ca)
@@ -129,7 +129,7 @@ void bch_nonfatal_io_error_work(struct work_struct *work)
"too many IO errors on %s, setting %s RO",
bdevname(ca->disk_sb.bdev, buf),
dev ? "device" : "filesystem");
- mutex_unlock(&bch_register_lock);
+ mutex_unlock(&c->state_lock);
}
}
diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c
index a36f943c6c3a..ec70a3e39f75 100644
--- a/fs/bcachefs/fs.c
+++ b/fs/bcachefs/fs.c
@@ -1257,13 +1257,17 @@ static struct cache_set *bch_open_as_blockdevs(const char *_dev_name,
if (!c)
goto err_unlock;
- if (!test_bit(BCH_FS_RUNNING, &c->flags)) {
+ mutex_lock(&c->state_lock);
+
+ if (!bch_fs_running(c)) {
+ mutex_unlock(&c->state_lock);
err = "incomplete cache set";
c = NULL;
goto err_unlock;
}
closure_get(&c->cl);
+ mutex_unlock(&c->state_lock);
mutex_unlock(&bch_register_lock);
}
@@ -1291,8 +1295,6 @@ static int bch_remount(struct super_block *sb, int *flags, char *data)
if (ret)
return ret;
- mutex_lock(&bch_register_lock);
-
if (opts.read_only >= 0 &&
opts.read_only != c->opts.read_only) {
const char *err = NULL;
@@ -1305,8 +1307,7 @@ static int bch_remount(struct super_block *sb, int *flags, char *data)
err = bch_fs_read_write(c);
if (err) {
bch_err(c, "error going rw: %s", err);
- ret = -EINVAL;
- goto unlock;
+ return -EINVAL;
}
sb->s_flags &= ~MS_RDONLY;
@@ -1318,9 +1319,6 @@ static int bch_remount(struct super_block *sb, int *flags, char *data)
if (opts.errors >= 0)
c->opts.errors = opts.errors;
-unlock:
- mutex_unlock(&bch_register_lock);
-
return ret;
}
diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c
index b219f74bd36e..9b1e74018d48 100644
--- a/fs/bcachefs/io.c
+++ b/fs/bcachefs/io.c
@@ -717,9 +717,7 @@ void bch_wake_delayed_writes(unsigned long data)
spin_lock_irqsave(&c->foreground_write_pd_lock, flags);
while ((op = c->write_wait_head)) {
- if (!test_bit(BCH_FS_RO, &c->flags) &&
- !test_bit(BCH_FS_STOPPING, &c->flags) &&
- time_after(op->expires, jiffies)) {
+ if (time_after(op->expires, jiffies)) {
mod_timer(&c->foreground_write_wakeup, op->expires);
break;
}
@@ -1063,9 +1061,7 @@ static void __bch_read_endio(struct cache_set *c, struct bch_read_bio *rbio)
return;
}
- if (rbio->promote &&
- !test_bit(BCH_FS_RO, &c->flags) &&
- !test_bit(BCH_FS_STOPPING, &c->flags)) {
+ if (rbio->promote) {
struct cache_promote_op *promote = rbio->promote;
struct closure *cl = &promote->cl;
@@ -1141,6 +1137,9 @@ static bool should_promote(struct cache_set *c,
if (!(flags & BCH_READ_PROMOTE))
return false;
+ if (percpu_ref_is_dying(&c->writes))
+ return false;
+
return c->fastest_tier &&
c->fastest_tier < c->tiers + pick->ca->mi.tier;
}
diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c
index 204eb6b5691f..037361bf2c01 100644
--- a/fs/bcachefs/journal.c
+++ b/fs/bcachefs/journal.c
@@ -1826,7 +1826,7 @@ void bch_journal_flush_pins(struct journal *j)
while ((pin = journal_get_next_pin(j, U64_MAX)))
pin->flush(j, pin);
- wait_event(j->wait, !journal_has_pins(j));
+ wait_event(j->wait, !journal_has_pins(j) || bch_journal_error(j));
}
static bool should_discard_bucket(struct journal *j, struct journal_device *ja)
@@ -2679,6 +2679,24 @@ int bch_journal_move(struct cache *ca)
return ret;
}
+void bch_fs_journal_stop(struct journal *j)
+{
+ if (!test_bit(JOURNAL_STARTED, &j->flags))
+ return;
+
+ /*
+ * Empty out the journal by first flushing everything pinning existing
+ * journal entries, then force a brand new empty journal entry to be
+ * written:
+ */
+ bch_journal_flush_pins(j);
+ bch_journal_flush_async(j, NULL);
+ bch_journal_meta(j);
+
+ cancel_delayed_work_sync(&j->write_work);
+ cancel_delayed_work_sync(&j->reclaim_work);
+}
+
void bch_dev_journal_exit(struct cache *ca)
{
kfree(ca->journal.buckets);
diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h
index fa2f527a205a..d3a1db0c41eb 100644
--- a/fs/bcachefs/journal.h
+++ b/fs/bcachefs/journal.h
@@ -364,6 +364,7 @@ static inline unsigned bch_nr_journal_buckets(struct bch_sb_field_journal *j)
int bch_journal_move(struct cache *);
+void bch_fs_journal_stop(struct journal *);
void bch_dev_journal_exit(struct cache *);
int bch_dev_journal_init(struct cache *);
void bch_fs_journal_exit(struct journal *);
diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c
index f023ef1d48e4..923aa56269b6 100644
--- a/fs/bcachefs/movinggc.c
+++ b/fs/bcachefs/movinggc.c
@@ -258,18 +258,21 @@ static int bch_moving_gc_thread(void *arg)
return 0;
}
-void bch_moving_init_cache(struct cache *ca)
+void bch_moving_gc_stop(struct cache *ca)
{
- bch_pd_controller_init(&ca->moving_gc_pd);
- ca->moving_gc_pd.d_term = 0;
+ ca->moving_gc_pd.rate.rate = UINT_MAX;
+ bch_ratelimit_reset(&ca->moving_gc_pd.rate);
+
+ if (ca->moving_gc_read)
+ kthread_stop(ca->moving_gc_read);
+ ca->moving_gc_read = NULL;
}
-int bch_moving_gc_thread_start(struct cache *ca)
+int bch_moving_gc_start(struct cache *ca)
{
struct task_struct *t;
- /* The moving gc read thread must be stopped */
- BUG_ON(ca->moving_gc_read != NULL);
+ BUG_ON(ca->moving_gc_read);
if (ca->set->opts.nochanges)
return 0;
@@ -287,12 +290,8 @@ int bch_moving_gc_thread_start(struct cache *ca)
return 0;
}
-void bch_moving_gc_stop(struct cache *ca)
+void bch_dev_moving_gc_init(struct cache *ca)
{
- ca->moving_gc_pd.rate.rate = UINT_MAX;
- bch_ratelimit_reset(&ca->moving_gc_pd.rate);
-
- if (ca->moving_gc_read)
- kthread_stop(ca->moving_gc_read);
- ca->moving_gc_read = NULL;
+ bch_pd_controller_init(&ca->moving_gc_pd);
+ ca->moving_gc_pd.d_term = 0;
}
diff --git a/fs/bcachefs/movinggc.h b/fs/bcachefs/movinggc.h
index 5f15308593d4..e8ae95e5cfd1 100644
--- a/fs/bcachefs/movinggc.h
+++ b/fs/bcachefs/movinggc.h
@@ -23,8 +23,8 @@
#define COPYGC_SECTORS_PER_ITER(ca) \
((ca)->mi.bucket_size * COPYGC_BUCKETS_PER_ITER(ca))
-void bch_moving_init_cache(struct cache *);
void bch_moving_gc_stop(struct cache *);
-int bch_moving_gc_thread_start(struct cache *);
+int bch_moving_gc_start(struct cache *);
+void bch_dev_moving_gc_init(struct cache *);
#endif
diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c
index 3200cebc8983..f50a5ee8b104 100644
--- a/fs/bcachefs/super-io.c
+++ b/fs/bcachefs/super-io.c
@@ -424,9 +424,13 @@ static bool bch_is_open_cache(struct block_device *bdev)
static bool bch_is_open(struct block_device *bdev)
{
- lockdep_assert_held(&bch_register_lock);
+ bool ret;
- return bch_is_open_cache(bdev) || bch_is_open_backing_dev(bdev);
+ mutex_lock(&bch_register_lock);
+ ret = bch_is_open_cache(bdev) || bch_is_open_backing_dev(bdev);
+ mutex_unlock(&bch_register_lock);
+
+ return ret;
}
static const char *bch_blkdev_open(const char *path, fmode_t mode,
@@ -653,8 +657,6 @@ const char *bch_read_super(struct bcache_superblock *sb,
const char *err;
unsigned i;
- lockdep_assert_held(&bch_register_lock);
-
memset(sb, 0, sizeof(*sb));
sb->mode = FMODE_READ;
@@ -798,6 +800,9 @@ void bch_write_super(struct cache_set *c)
lockdep_assert_held(&c->sb_lock);
+ if (c->opts.nochanges)
+ return;
+
closure_init_stack(cl);
le64_add_cpu(&c->disk_sb->seq, 1);
diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c
index 7b9144d4dadb..d1695a63ca0f 100644
--- a/fs/bcachefs/super.c
+++ b/fs/bcachefs/super.c
@@ -69,7 +69,7 @@ static DECLARE_WAIT_QUEUE_HEAD(bch_read_only_wait);
struct workqueue_struct *bcache_io_wq;
struct crypto_shash *bch_sha256;
-static void bch_dev_stop(struct cache *);
+static void bch_dev_free(struct cache *);
static int bch_dev_online(struct cache *);
static int bch_congested_fn(void *data, int bdi_bits)
@@ -110,7 +110,7 @@ static int bch_congested_fn(void *data, int bdi_bits)
return ret;
}
-/* Cache set RO/RW: */
+/* Filesystem RO/RW: */
/*
* For startup/shutdown of RW stuff, the dependencies are:
@@ -144,22 +144,7 @@ static void __bch_fs_read_only(struct cache_set *c)
for_each_cache(ca, c, i)
bch_dev_allocator_stop(ca);
- /*
- * Write a journal entry after flushing the btree, so we don't end up
- * replaying everything we just flushed:
- */
- if (test_bit(JOURNAL_STARTED, &c->journal.flags)) {
- int ret;
-
- bch_journal_flush_pins(&c->journal);
- bch_journal_flush_async(&c->journal, NULL);
-
- ret = bch_journal_meta(&c->journal);
- BUG_ON(ret && !bch_journal_error(&c->journal));
- }
-
- cancel_delayed_work_sync(&c->journal.write_work);
- cancel_delayed_work_sync(&c->journal.reclaim_work);
+ bch_fs_journal_stop(&c->journal);
}
static void bch_writes_disabled(struct percpu_ref *writes)
@@ -170,12 +155,27 @@ static void bch_writes_disabled(struct percpu_ref *writes)
wake_up(&bch_read_only_wait);
}
-static void bch_fs_read_only_work(struct work_struct *work)
+void bch_fs_read_only(struct cache_set *c)
{
- struct cache_set *c =
- container_of(work, struct cache_set, read_only_work);
+ mutex_lock(&c->state_lock);
+ if (c->state != BCH_FS_STARTING &&
+ c->state != BCH_FS_RW)
+ goto out;
+
+ if (test_bit(BCH_FS_ERROR, &c->flags))
+ goto out;
+
+ trace_fs_read_only(c);
- percpu_ref_put(&c->writes);
+ /*
+ * Block new foreground-end write operations from starting - any new
+ * writes will return -EROFS:
+ *
+ * (This is really blocking new _allocations_, writes to previously
+ * allocated space can still happen until stopping the allocator in
+ * bch_dev_allocator_stop()).
+ */
+ percpu_ref_kill(&c->writes);
del_timer(&c->foreground_write_wakeup);
cancel_delayed_work(&c->pd_controllers_update);
@@ -183,66 +183,54 @@ static void bch_fs_read_only_work(struct work_struct *work)
c->foreground_write_pd.rate.rate = UINT_MAX;
bch_wake_delayed_writes((unsigned long) c);
- if (!test_bit(BCH_FS_EMERGENCY_RO, &c->flags)) {
- /*
- * If we're not doing an emergency shutdown, we want to wait on
- * outstanding writes to complete so they don't see spurious
- * errors due to shutting down the allocator:
- */
- wait_event(bch_read_only_wait,
- test_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags));
+ /*
+ * If we're not doing an emergency shutdown, we want to wait on
+ * outstanding writes to complete so they don't see spurious errors due
+ * to shutting down the allocator:
+ *
+ * If we are doing an emergency shutdown outstanding writes may
+ * hang until we shutdown the allocator so we don't want to wait
+ * on outstanding writes before shutting everything down - but
+ * we do need to wait on them before returning and signalling
+ * that going RO is complete:
+ */
+ wait_event(bch_read_only_wait,
+ test_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags) ||
+ test_bit(BCH_FS_EMERGENCY_RO, &c->flags));
- __bch_fs_read_only(c);
+ __bch_fs_read_only(c);
- if (!bch_journal_error(&c->journal) &&
- !test_bit(BCH_FS_ERROR, &c->flags)) {
- mutex_lock(&c->sb_lock);
- SET_BCH_SB_CLEAN(c->disk_sb, true);
- bch_write_super(c);
- mutex_unlock(&c->sb_lock);
- }
- } else {
- /*
- * If we are doing an emergency shutdown outstanding writes may
- * hang until we shutdown the allocator so we don't want to wait
- * on outstanding writes before shutting everything down - but
- * we do need to wait on them before returning and signalling
- * that going RO is complete:
- */
- __bch_fs_read_only(c);
+ wait_event(bch_read_only_wait,
+ test_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags));
- wait_event(bch_read_only_wait,
- test_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags));
+ clear_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags);
+
+ if (!bch_journal_error(&c->journal) &&
+ !test_bit(BCH_FS_ERROR, &c->flags)) {
+ mutex_lock(&c->sb_lock);
+ SET_BCH_SB_CLEAN(c->disk_sb, true);
+ bch_write_super(c);
+ mutex_unlock(&c->sb_lock);
}
+ c->state = BCH_FS_RO;
bch_notify_fs_read_only(c);
trace_fs_read_only_done(c);
-
- set_bit(BCH_FS_RO_COMPLETE, &c->flags);
- wake_up(&bch_read_only_wait);
+out:
+ mutex_unlock(&c->state_lock);
}
-bool bch_fs_read_only_async(struct cache_set *c)
+static void bch_fs_read_only_work(struct work_struct *work)
{
- if (test_and_set_bit(BCH_FS_RO, &c->flags))
- return false;
-
- trace_fs_read_only(c);
-
- percpu_ref_get(&c->writes);
+ struct cache_set *c =
+ container_of(work, struct cache_set, read_only_work);
- /*
- * Block new foreground-end write operations from starting - any new
- * writes will return -EROFS:
- *
- * (This is really blocking new _allocations_, writes to previously
- * allocated space can still happen until stopping the allocator in
- * bch_dev_allocator_stop()).
- */
- percpu_ref_kill(&c->writes);
+ bch_fs_read_only(c);
+}
- queue_work(system_freezable_wq, &c->read_only_work);
- return true;
+static void bch_fs_read_only_async(struct cache_set *c)
+{
+ queue_work(system_long_wq, &c->read_only_work);
}
bool bch_fs_emergency_read_only(struct cache_set *c)
@@ -256,25 +244,16 @@ bool bch_fs_emergency_read_only(struct cache_set *c)
return ret;
}
-void bch_fs_read_only(struct cache_set *c)
-{
- /* so we don't race with bch_fs_read_write() */
- lockdep_assert_held(&bch_register_lock);
-
- bch_fs_read_only_async(c);
-
- wait_event(bch_read_only_wait,
- test_bit(BCH_FS_RO_COMPLETE, &c->flags) &&
- test_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags));
-}
-
-static const char *__bch_fs_read_write(struct cache_set *c)
+const char *bch_fs_read_write(struct cache_set *c)
{
struct cache *ca;
- const char *err;
+ const char *err = NULL;
unsigned i;
- lockdep_assert_held(&bch_register_lock);
+ mutex_lock(&c->state_lock);
+ if (c->state != BCH_FS_STARTING &&
+ c->state != BCH_FS_RO)
+ goto out;
err = "error starting allocator thread";
for_each_cache(ca, c, i)
@@ -288,16 +267,13 @@ static const char *__bch_fs_read_write(struct cache_set *c)
if (bch_gc_thread_start(c))
goto err;
- for_each_cache(ca, c, i) {
- if (ca->mi.state != BCH_MEMBER_STATE_ACTIVE)
- continue;
-
- err = "error starting moving GC thread";
- if (bch_moving_gc_thread_start(ca)) {
+ err = "error starting moving GC thread";
+ for_each_cache(ca, c, i)
+ if (ca->mi.state == BCH_MEMBER_STATE_ACTIVE &&
+ bch_moving_gc_start(ca)) {
percpu_ref_put(&ca->ref);
goto err;
}
- }
err = "error starting tiering thread";
if (bch_tiering_start(c))
@@ -305,44 +281,23 @@ static const char *__bch_fs_read_write(struct cache_set *c)
schedule_delayed_work(&c->pd_controllers_update, 5 * HZ);
- return NULL;
+ if (c->state != BCH_FS_STARTING)
+ percpu_ref_reinit(&c->writes);
+
+ c->state = BCH_FS_RW;
+ err = NULL;
+out:
+ mutex_unlock(&c->state_lock);
+ return err;
err:
__bch_fs_read_only(c);
- return err;
-}
-
-const char *bch_fs_read_write(struct cache_set *c)
-{
- const char *err;
-
- lockdep_assert_held(&bch_register_lock);
-
- if (!test_bit(BCH_FS_RO_COMPLETE, &c->flags))
- return NULL;
-
- err = __bch_fs_read_write(c);
- if (err)
- return err;
-
- percpu_ref_reinit(&c->writes);
-
- clear_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags);
- clear_bit(BCH_FS_EMERGENCY_RO, &c->flags);
- clear_bit(BCH_FS_RO_COMPLETE, &c->flags);
- clear_bit(BCH_FS_RO, &c->flags);
- return NULL;
+ goto out;
}
-/* Cache set startup/shutdown: */
+/* Filesystem startup/shutdown: */
static void bch_fs_free(struct cache_set *c)
{
- del_timer_sync(&c->foreground_write_wakeup);
- cancel_delayed_work_sync(&c->pd_controllers_update);
- cancel_work_sync(&c->read_only_work);
- cancel_work_sync(&c->bio_submit_work);
- cancel_work_sync(&c->read_retry_work);
-
bch_fs_encryption_exit(c);
bch_fs_btree_exit(c);
bch_fs_journal_exit(&c->journal);
@@ -375,6 +330,52 @@ static void bch_fs_free(struct cache_set *c)
module_put(THIS_MODULE);
}
+static void bch_fs_exit(struct cache_set *c)
+{
+ unsigned i;
+
+ del_timer_sync(&c->foreground_write_wakeup);
+ cancel_delayed_work_sync(&c->pd_controllers_update);
+ cancel_work_sync(&c->read_only_work);
+ cancel_work_sync(&c->bio_submit_work);
+ cancel_work_sync(&c->read_retry_work);
+
+ for (i = 0; i < c->sb.nr_devices; i++)
+ if (c->cache[i])
+ bch_dev_free(c->cache[i]);
+
+ closure_debug_destroy(&c->cl);
+ kobject_put(&c->kobj);
+}
+
+static void bch_fs_offline(struct cache_set *c)
+{
+ struct cache *ca;
+ unsigned i;
+
+ mutex_lock(&bch_register_lock);
+ list_del(&c->list);
+ mutex_unlock(&bch_register_lock);
+
+ if (c->kobj.state_in_sysfs)
+ kobject_del(&c->kobj);
+
+ for_each_cache(ca, c, i)
+ if (ca->kobj.state_in_sysfs)
+ kobject_del(&ca->kobj);
+
+ bch_fs_debug_exit(c);
+ bch_fs_chardev_exit(c);
+
+ bch_cache_accounting_destroy(&c->accounting);
+
+ kobject_put(&c->time_stats);
+ kobject_put(&c->opts_dir);
+ kobject_put(&c->internal);
+
+ __bch_fs_read_only(c);
+}
+
/*
* should be __bch_fs_stop4 - block devices are closed, now we can finally
* free it
@@ -382,15 +383,9 @@ static void bch_fs_free(struct cache_set *c)
void bch_fs_release(struct kobject *kobj)
{
struct cache_set *c = container_of(kobj, struct cache_set, kobj);
- struct completion *stop_completion = c->stop_completion;
bch_notify_fs_stopped(c);
- bch_info(c, "stopped");
-
bch_fs_free(c);
-
- if (stop_completion)
- complete(stop_completion);
}
/*
@@ -399,18 +394,8 @@ void bch_fs_release(struct kobject *kobj)
static void __bch_fs_stop3(struct closure *cl)
{
struct cache_set *c = container_of(cl, struct cache_set, cl);
- struct cache *ca;
- unsigned i;
-
- mutex_lock(&bch_register_lock);
- for_each_cache(ca, c, i)
- bch_dev_stop(ca);
- list_del(&c->list);
- mutex_unlock(&bch_register_lock);
-
- closure_debug_destroy(&c->cl);
- kobject_put(&c->kobj);
+ bch_fs_exit(c);
}
/*
@@ -421,28 +406,14 @@ static void __bch_fs_stop2(struct closure *cl)
{
struct cache_set *c = container_of(cl, struct cache_set, caching);
- bch_fs_debug_exit(c);
- bch_fs_chardev_exit(c);
-
- if (c->kobj.state_in_sysfs)
- kobject_del(&c->kobj);
-
- bch_cache_accounting_destroy(&c->accounting);
-
- kobject_put(&c->time_stats);
- kobject_put(&c->opts_dir);
- kobject_put(&c->internal);
-
- mutex_lock(&bch_register_lock);
- bch_fs_read_only(c);
- mutex_unlock(&bch_register_lock);
+ bch_fs_offline(c);
closure_return(cl);
}
/*
- * First phase of the shutdown process that's kicked off by bch_fs_stop(); we
- * haven't waited for anything to stop yet, we're just punting to process
+ * First phase of the shutdown process that's kicked off by bch_fs_stop_async();
+ * we haven't waited for anything to stop yet, we're just punting to process
* context to shut down block devices:
*/
static void __bch_fs_stop1(struct closure *cl)
@@ -456,20 +427,33 @@ static void __bch_fs_stop1(struct closure *cl)
void bch_fs_stop_async(struct cache_set *c)
{
- if (!test_and_set_bit(BCH_FS_STOPPING, &c->flags))
+ mutex_lock(&c->state_lock);
+ if (c->state != BCH_FS_STOPPING) {
+ c->state = BCH_FS_STOPPING;
closure_queue(&c->caching);
+ }
+ mutex_unlock(&c->state_lock);
}
void bch_fs_stop(struct cache_set *c)
{
- DECLARE_COMPLETION_ONSTACK(complete);
+ mutex_lock(&c->state_lock);
+ BUG_ON(c->state == BCH_FS_STOPPING);
+ c->state = BCH_FS_STOPPING;
+ mutex_unlock(&c->state_lock);
+
+ bch_blockdevs_stop(c);
+
+ closure_sync(&c->caching);
+ closure_debug_destroy(&c->caching);
+
+ bch_fs_offline(c);
- c->stop_completion = &complete;
- bch_fs_stop_async(c);
closure_put(&c->cl);
+ closure_sync(&c->cl);
- /* Killable? */
- wait_for_completion(&complete);
+ bch_fs_exit(c);
+ kobject_put(&c->kobj);
}
/* Stop, detaching from backing devices: */
@@ -523,6 +507,7 @@ static struct cache_set *bch_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
c->minor = -1;
+ mutex_init(&c->state_lock);
mutex_init(&c->sb_lock);
INIT_RADIX_TREE(&c->devices, GFP_KERNEL);
mutex_init(&c->btree_cache_lock);
@@ -667,6 +652,7 @@ static struct cache_set *bch_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
closure_init(&c->caching, &c->cl);
set_closure_fn(&c->caching, __bch_fs_stop1, system_wq);
+ closure_get(&c->cl);
continue_at_noreturn(&c->cl, __bch_fs_stop3, system_wq);
return c;
err:
@@ -674,7 +660,20 @@ err:
return NULL;
}
-static int bch_fs_online(struct cache_set *c)
+static struct cache_set *bch_fs_lookup(uuid_le uuid)
+{
+ struct cache_set *c;
+
+ lockdep_assert_held(&bch_register_lock);
+
+ list_for_each_entry(c, &bch_fs_list, list)
+ if (!memcmp(&c->disk_sb->uuid, &uuid, sizeof(uuid_le)))
+ return c;
+
+ return NULL;
+}
+
+static const char *__bch_fs_online(struct cache_set *c)
{
struct cache *ca;
unsigned i;
@@ -683,13 +682,14 @@ static int bch_fs_online(struct cache_set *c)
lockdep_assert_held(&bch_register_lock);
if (!list_empty(&c->list))
- return 0;
+ return NULL;
- list_add(&c->list, &bch_fs_list);
+ if (bch_fs_lookup(c->sb.uuid))
+ return "filesystem UUID already open";
ret = bch_fs_chardev_init(c);
if (ret)
- return ret;
+ return "error creating character device";
bch_fs_debug_init(c);
@@ -698,18 +698,42 @@ static int bch_fs_online(struct cache_set *c)
kobject_add(&c->opts_dir, &c->kobj, "options") ||
kobject_add(&c->time_stats, &c->kobj, "time_stats") ||
bch_cache_accounting_add_kobjs(&c->accounting, &c->kobj))
- return -1;
+ return "error creating sysfs objects";
for_each_cache(ca, c, i)
if (bch_dev_online(ca)) {
percpu_ref_put(&ca->ref);
- return -1;
+ return "error creating sysfs objects";
}
+ mutex_lock(&c->state_lock);
+
+ if (bch_blockdev_volumes_start(c)) {
+ mutex_unlock(&c->state_lock);
+ return "can't bring up blockdev volumes";
+ }
+
+ bch_attach_backing_devs(c);
+
+ mutex_unlock(&c->state_lock);
+
+ list_add(&c->list, &bch_fs_list);
+
return 0;
}
-const char *bch_fs_start(struct cache_set *c)
+static const char *bch_fs_online(struct cache_set *c)
+{
+ const char *err;
+
+ mutex_lock(&bch_register_lock);
+ err = __bch_fs_online(c);
+ mutex_unlock(&bch_register_lock);
+
+ return err;
+}
+
+static const char *__bch_fs_start(struct cache_set *c)
{
const char *err = "cannot allocate memory";
struct bch_sb_field_members *mi;
@@ -720,11 +744,7 @@ const char *bch_fs_start(struct cache_set *c)
struct jset *j;
int ret = -EINVAL;
- lockdep_assert_held(&bch_register_lock);
- BUG_ON(test_bit(BCH_FS_RUNNING, &c->flags));
-
- /* We don't want bch_fatal_error() to free underneath us */
- closure_get(&c->caching);
+ BUG_ON(c->state != BCH_FS_STARTING);
/*
* Make sure that each cache object's mi is up to date before
@@ -882,10 +902,14 @@ const char *bch_fs_start(struct cache_set *c)
goto err;
}
recovery_done:
+ err = "dynamic fault";
+ if (bch_fs_init_fault("fs_start"))
+ goto err;
+
if (c->opts.read_only) {
bch_fs_read_only(c);
} else {
- err = __bch_fs_read_write(c);
+ err = bch_fs_read_write(c);
if (err)
goto err;
}
@@ -906,26 +930,9 @@ recovery_done:
bch_write_super(c);
mutex_unlock(&c->sb_lock);
- err = "dynamic fault";
- if (bch_fs_init_fault("fs_start"))
- goto err;
-
- err = "error creating kobject";
- if (bch_fs_online(c))
- goto err;
-
- err = "can't bring up blockdev volumes";
- if (bch_blockdev_volumes_start(c))
- goto err;
-
- set_bit(BCH_FS_RUNNING, &c->flags);
- bch_attach_backing_devs(c);
-
- bch_notify_fs_read_write(c);
err = NULL;
out:
bch_journal_entries_free(&journal);
- closure_put(&c->caching);
return err;
err:
switch (ret) {
@@ -959,6 +966,11 @@ err:
goto out;
}
+const char *bch_fs_start(struct cache_set *c)
+{
+ return __bch_fs_start(c) ?: bch_fs_online(c);
+}
+
static const char *bch_dev_may_add(struct bch_sb *sb, struct cache_set *c)
{
struct bch_sb_field_members *sb_mi;
@@ -1003,7 +1015,7 @@ static const char *bch_dev_in_fs(struct bch_sb *sb, struct cache_set *c)
return NULL;
}
-/* Cache device */
+/* Device startup/shutdown, ro/rw: */
bool bch_dev_read_only(struct cache *ca)
{
@@ -1013,7 +1025,7 @@ bool bch_dev_read_only(struct cache *ca)
bdevname(ca->disk_sb.bdev, buf);
- lockdep_assert_held(&bch_register_lock);
+ lockdep_assert_held(&c->state_lock);
if (ca->mi.state != BCH_MEMBER_STATE_ACTIVE)
return false;
@@ -1057,7 +1069,7 @@ bool bch_dev_read_only(struct cache *ca)
static const char *__bch_dev_read_write(struct cache_set *c, struct cache *ca)
{
- lockdep_assert_held(&bch_register_lock);
+ lockdep_assert_held(&c->state_lock);
if (ca->mi.state == BCH_MEMBER_STATE_ACTIVE)
return NULL;
@@ -1070,7 +1082,7 @@ static const char *__bch_dev_read_write(struct cache_set *c, struct cache *ca)
if (bch_dev_allocator_start(ca))
return "error starting allocator thread";
- if (bch_moving_gc_thread_start(ca))
+ if (bch_moving_gc_start(ca))
return "error starting moving GC thread";
if (bch_tiering_start(c))
@@ -1102,22 +1114,15 @@ const char *bch_dev_read_write(struct cache *ca)
return NULL;
}
-/*
- * bch_dev_stop has already returned, so we no longer hold the register
- * lock at the point this is called.
- */
-
void bch_dev_release(struct kobject *kobj)
{
struct cache *ca = container_of(kobj, struct cache, kobj);
- percpu_ref_exit(&ca->ref);
kfree(ca);
}
-static void bch_dev_free_work(struct work_struct *work)
+static void bch_dev_free(struct cache *ca)
{
- struct cache *ca = container_of(work, struct cache, free_work);
struct cache_set *c = ca->set;
unsigned i;
@@ -1134,14 +1139,6 @@ static void bch_dev_free_work(struct work_struct *work)
kobject_del(&ca->kobj);
bch_free_super(&ca->disk_sb);
-
- /*
- * bch_dev_stop can be called in the middle of initialization
- * of the struct cache object.
- * As such, not all the sub-structures may be initialized.
- * However, they were zeroed when the object was allocated.
- */
-
bch_dev_journal_exit(ca);
free_percpu(ca->sectors_written);
bioset_exit(&ca->replica_set);
@@ -1158,12 +1155,20 @@ static void bch_dev_free_work(struct work_struct *work)
for (i = 0; i < RESERVE_NR; i++)
free_fifo(&ca->free[i]);
+ percpu_ref_exit(&ca->ref);
kobject_put(&ca->kobj);
if (c)
kobject_put(&c->kobj);
}
+static void bch_dev_free_work(struct work_struct *work)
+{
+ struct cache *ca = container_of(work, struct cache, free_work);
+
+ bch_dev_free(ca);
+}
+
static void bch_dev_percpu_ref_release(struct percpu_ref *ref)
{
struct cache *ca = container_of(ref, struct cache, ref);
@@ -1196,12 +1201,10 @@ static void bch_dev_stop(struct cache *ca)
{
struct cache_set *c = ca->set;
- lockdep_assert_held(&bch_register_lock);
+ lockdep_assert_held(&c->state_lock);
- if (c) {
- BUG_ON(rcu_access_pointer(c->cache[ca->dev_idx]) != ca);
- rcu_assign_pointer(c->cache[ca->dev_idx], NULL);
- }
+ BUG_ON(rcu_access_pointer(c->cache[ca->dev_idx]) != ca);
+ rcu_assign_pointer(c->cache[ca->dev_idx], NULL);
call_rcu(&ca->free_rcu, bch_dev_free_rcu);
}
@@ -1284,7 +1287,8 @@ static void bch_dev_remove_work(struct work_struct *work)
*/
closure_get(&c->cl);
- mutex_lock(&bch_register_lock);
+ mutex_lock(&c->state_lock);
+
bch_dev_stop(ca);
/*
@@ -1293,8 +1297,6 @@ static void bch_dev_remove_work(struct work_struct *work)
*/
synchronize_rcu();
- lockdep_assert_held(&bch_register_lock);
-
/*
* Free this device's slot in the bch_member array - all pointers to
* this device must be gone:
@@ -1304,17 +1306,15 @@ static void bch_dev_remove_work(struct work_struct *work)
memset(&mi->members[dev_idx].uuid, 0, sizeof(mi->members[dev_idx].uuid));
bch_write_super(c);
- mutex_unlock(&c->sb_lock);
- mutex_unlock(&bch_register_lock);
+ mutex_unlock(&c->sb_lock);
+ mutex_unlock(&c->state_lock);
closure_put(&c->cl);
}
-bool bch_dev_remove(struct cache *ca, bool force)
+static bool __bch_dev_remove(struct cache_set *c, struct cache *ca, bool force)
{
- mutex_lock(&bch_register_lock);
-
if (test_bit(BCH_DEV_REMOVING, &ca->flags))
return false;
@@ -1329,23 +1329,32 @@ bool bch_dev_remove(struct cache *ca, bool force)
if (force)
set_bit(BCH_DEV_FORCE_REMOVE, &ca->flags);
+
set_bit(BCH_DEV_REMOVING, &ca->flags);
bch_notify_dev_removing(ca);
- mutex_unlock(&bch_register_lock);
-
/* Migrate the data and finish removal asynchronously: */
queue_work(system_long_wq, &ca->remove_work);
return true;
}
+bool bch_dev_remove(struct cache *ca, bool force)
+{
+ struct cache_set *c = ca->set;
+ bool ret;
+
+ mutex_lock(&c->state_lock);
+ ret = __bch_dev_remove(c, ca, force);
+ mutex_unlock(&c->state_lock);
+
+ return ret;
+}
+
static int bch_dev_online(struct cache *ca)
{
char buf[12];
- lockdep_assert_held(&bch_register_lock);
-
sprintf(buf, "cache%u", ca->dev_idx);
if (kobject_add(&ca->kobj,
@@ -1397,7 +1406,7 @@ static const char *bch_dev_alloc(struct bcache_superblock *sb,
spin_lock_init(&ca->freelist_lock);
spin_lock_init(&ca->prio_buckets_lock);
mutex_init(&ca->heap_lock);
- bch_moving_init_cache(ca);
+ bch_dev_moving_gc_init(ca);
ca->disk_sb = *sb;
if (sb->mode & FMODE_EXCL)
@@ -1485,7 +1494,7 @@ static const char *bch_dev_alloc(struct bcache_superblock *sb,
err = "error creating kobject";
if (c->kobj.state_in_sysfs &&
bch_dev_online(ca))
- goto err;
+ pr_warn("error creating sysfs objects");
if (ret)
*ret = ca;
@@ -1493,23 +1502,10 @@ static const char *bch_dev_alloc(struct bcache_superblock *sb,
kobject_put(&ca->kobj);
return NULL;
err:
- bch_dev_stop(ca);
+ bch_dev_free(ca);
return err;
}
-static struct cache_set *bch_fs_lookup(uuid_le uuid)
-{
- struct cache_set *c;
-
- lockdep_assert_held(&bch_register_lock);
-
- list_for_each_entry(c, &bch_fs_list, list)
- if (!memcmp(&c->disk_sb->uuid, &uuid, sizeof(uuid_le)))
- return c;
-
- return NULL;
-}
-
int bch_dev_add(struct cache_set *c, const char *path)
{
struct bcache_superblock sb;
@@ -1520,21 +1516,20 @@ int bch_dev_add(struct cache_set *c, const char *path)
unsigned dev_idx, nr_devices, u64s;
int ret = -EINVAL;
- mutex_lock(&bch_register_lock);
-
err = bch_read_super(&sb, c->opts, path);
if (err)
- goto err_unlock_register;
+ return -EINVAL;
err = bch_validate_cache_super(&sb);
if (err)
- goto err_unlock_register;
-
- mutex_lock(&c->sb_lock);
+ return -EINVAL;
err = bch_dev_may_add(sb.sb, c);
if (err)
- goto err_unlock;
+ return -EINVAL;
+
+ mutex_lock(&c->state_lock);
+ mutex_lock(&c->sb_lock);
/*
* Preserve the old cache member information (esp. tier)
@@ -1618,14 +1613,13 @@ have_slot:
kobject_put(&ca->kobj);
mutex_unlock(&c->sb_lock);
- mutex_unlock(&bch_register_lock);
+ mutex_unlock(&c->state_lock);
return 0;
err_put:
bch_dev_stop(ca);
err_unlock:
mutex_unlock(&c->sb_lock);
-err_unlock_register:
- mutex_unlock(&bch_register_lock);
+ mutex_unlock(&c->state_lock);
bch_free_super(&sb);
bch_err(c, "Unable to add device: %s", err);
@@ -1638,11 +1632,8 @@ const char *bch_fs_open(char * const *devices, unsigned nr_devices,
const char *err;
struct cache_set *c = NULL;
struct bcache_superblock *sb;
- uuid_le uuid;
unsigned i;
- memset(&uuid, 0, sizeof(uuid_le));
-
if (!nr_devices)
return "need at least one device";
@@ -1654,62 +1645,49 @@ const char *bch_fs_open(char * const *devices, unsigned nr_devices,
if (!sb)
goto err;
- /*
- * bch_read_super() needs to happen under register_lock, so that the
- * exclusive open is atomic with adding the new cache set to the list of
- * cache sets:
- */
- mutex_lock(&bch_register_lock);
-
for (i = 0; i < nr_devices; i++) {
err = bch_read_super(&sb[i], opts, devices[i]);
if (err)
- goto err_unlock;
+ goto err;
err = "attempting to register backing device";
if (__SB_IS_BDEV(le64_to_cpu(sb[i].sb->version)))
- goto err_unlock;
+ goto err;
err = bch_validate_cache_super(&sb[i]);
if (err)
- goto err_unlock;
+ goto err;
}
- err = "cache set already registered";
- if (bch_fs_lookup(sb->sb->uuid))
- goto err_unlock;
-
err = "cannot allocate memory";
c = bch_fs_alloc(sb[0].sb, opts);
if (!c)
- goto err_unlock;
+ goto err;
for (i = 0; i < nr_devices; i++) {
err = bch_dev_alloc(&sb[i], c, NULL);
if (err)
- goto err_unlock;
+ goto err;
}
err = "insufficient devices";
if (bch_fs_nr_online_devices(c) != bch_fs_nr_devices(c))
- goto err_unlock;
+ goto err;
if (!c->opts.nostart) {
- err = bch_fs_start(c);
+ err = __bch_fs_start(c);
if (err)
- goto err_unlock;
+ goto err;
}
- err = "error creating kobject";
- if (bch_fs_online(c))
- goto err_unlock;
+ err = bch_fs_online(c);
+ if (err)
+ goto err;
- if (ret) {
- closure_get(&c->cl);
+ if (ret)
*ret = c;
- }
-
- mutex_unlock(&bch_register_lock);
+ else
+ closure_put(&c->cl);
err = NULL;
out:
@@ -1718,11 +1696,10 @@ out:
if (err)
c = NULL;
return err;
-err_unlock:
- if (c)
- bch_fs_stop_async(c);
- mutex_unlock(&bch_register_lock);
err:
+ if (c)
+ bch_fs_stop(c);
+
for (i = 0; i < nr_devices; i++)
bch_free_super(&sb[i]);
goto out;
@@ -1731,7 +1708,6 @@ err:
static const char *__bch_fs_open_incremental(struct bcache_superblock *sb,
struct bch_opts opts)
{
- char name[BDEVNAME_SIZE];
const char *err;
struct cache_set *c;
bool allocated_cache_set = false;
@@ -1740,17 +1716,19 @@ static const char *__bch_fs_open_incremental(struct bcache_superblock *sb,
if (err)
return err;
- bdevname(sb->bdev, name);
-
+ mutex_lock(&bch_register_lock);
c = bch_fs_lookup(sb->sb->uuid);
if (c) {
+ closure_get(&c->cl);
+
err = bch_dev_in_fs(sb->sb, c);
if (err)
- return err;
+ goto err;
} else {
c = bch_fs_alloc(sb->sb, opts);
+ err = "cannot allocate memory";
if (!c)
- return "cannot allocate memory";
+ goto err;
allocated_cache_set = true;
}
@@ -1761,20 +1739,27 @@ static const char *__bch_fs_open_incremental(struct bcache_superblock *sb,
if (bch_fs_nr_online_devices(c) == bch_fs_nr_devices(c) &&
!c->opts.nostart) {
- err = bch_fs_start(c);
+ err = __bch_fs_start(c);
if (err)
goto err;
- } else {
- err = "error creating kobject";
- if (bch_fs_online(c))
- goto err;
}
- bch_info(c, "started");
+ err = __bch_fs_online(c);
+ if (err)
+ goto err;
+
+ closure_put(&c->cl);
+ mutex_unlock(&bch_register_lock);
+
return NULL;
err:
+ mutex_unlock(&bch_register_lock);
+
if (allocated_cache_set)
- bch_fs_stop_async(c);
+ bch_fs_stop(c);
+ else if (c)
+ closure_put(&c->cl);
+
return err;
}
@@ -1784,20 +1769,20 @@ const char *bch_fs_open_incremental(const char *path)
struct bch_opts opts = bch_opts_empty();
const char *err;
- mutex_lock(&bch_register_lock);
-
err = bch_read_super(&sb, opts, path);
if (err)
- goto err;
+ return err;
- if (__SB_IS_BDEV(le64_to_cpu(sb.sb->version)))
+ if (__SB_IS_BDEV(le64_to_cpu(sb.sb->version))) {
+ mutex_lock(&bch_register_lock);
err = bch_backing_dev_register(&sb);
- else
+ mutex_unlock(&bch_register_lock);
+ } else {
err = __bch_fs_open_incremental(&sb, opts);
+ }
bch_free_super(&sb);
-err:
- mutex_unlock(&bch_register_lock);
+
return err;
}
diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c
index d20c644fdeed..551a7afdd76a 100644
--- a/fs/bcachefs/sysfs.c
+++ b/fs/bcachefs/sysfs.c
@@ -697,7 +697,7 @@ SHOW(bch_fs)
BCH_DEBUG_PARAMS()
#undef BCH_DEBUG_PARAM
- if (!test_bit(BCH_FS_RUNNING, &c->flags))
+ if (!bch_fs_running(c))
return -EPERM;
if (attr == &sysfs_bset_tree_stats)
@@ -782,12 +782,6 @@ STORE(__bch_fs)
sysfs_pd_controller_store(foreground_write, &c->foreground_write_pd);
- if (attr == &sysfs_journal_flush) {
- bch_journal_meta_async(&c->journal, NULL);
-
- return size;
- }
-
sysfs_strtoul(pd_controllers_update_seconds,
c->pd_controllers_update_seconds);
sysfs_strtoul(foreground_target_percent, c->foreground_target_percent);
@@ -801,11 +795,14 @@ STORE(__bch_fs)
BCH_DEBUG_PARAMS()
#undef BCH_DEBUG_PARAM
- if (!test_bit(BCH_FS_RUNNING, &c->flags))
+ if (!bch_fs_running(c))
return -EPERM;
- if (test_bit(BCH_FS_STOPPING, &c->flags))
- return -EINTR;
+ if (attr == &sysfs_journal_flush) {
+ bch_journal_meta_async(&c->journal, NULL);
+
+ return size;
+ }
if (attr == &sysfs_blockdev_volume_create) {
u64 v = strtoi_h_or_return(buf);
@@ -838,9 +835,9 @@ STORE(bch_fs)
{
struct cache_set *c = container_of(kobj, struct cache_set, kobj);
- mutex_lock(&bch_register_lock);
+ mutex_lock(&c->state_lock);
size = __bch_fs_store(kobj, attr, buf, size);
- mutex_unlock(&bch_register_lock);
+ mutex_unlock(&c->state_lock);
if (attr == &sysfs_add_device) {
char *path = kstrdup(buf, GFP_KERNEL);
diff --git a/fs/bcachefs/tier.c b/fs/bcachefs/tier.c
index 84cd9e64c210..289d70967b46 100644
--- a/fs/bcachefs/tier.c
+++ b/fs/bcachefs/tier.c
@@ -219,7 +219,7 @@ static void __bch_tiering_stop(struct bch_tier *tier)
tier->pd.rate.rate = UINT_MAX;
bch_ratelimit_reset(&tier->pd.rate);
- if (!IS_ERR_OR_NULL(tier->migrate))
+ if (tier->migrate)
kthread_stop(tier->migrate);
tier->migrate = NULL;