| author | Kent Overstreet <kent.overstreet@gmail.com> | 2016-03-15 17:00:41 -0800 |
| --- | --- | --- |
| committer | Kent Overstreet <kent.overstreet@gmail.com> | 2016-10-07 12:35:49 -0800 |
| commit | 1b60766af54b16e92783289e97605aafcf1b4a74 (patch) | |
| tree | 22cbfa609f56e933c5d1514ebe37389a2548f8ef | |
| parent | 7e4ed6a922b0358d7950b3724dd0ac2d59e0f319 (diff) | |
bcache: bch_cache_set_emergency_read_only()
Going RO with bch_cache_set_read_only() on error, when the allocator has already
stopped, could deadlock: we'd be waiting on foreground writes to complete that
are stuck because the allocator has already stopped (due to the original error).
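
In outline, the fix adds bch_cache_set_emergency_read_only(), which sets a new CACHE_SET_EMERGENCY_RO flag, and replaces the read-only work's unconditional wait on outstanding writes with flag-based wait_event() calls, so the allocator can be shut down before waiting on writes that would otherwise never complete. A condensed sketch of the resulting path, abridged from the super.c hunks below (not a drop-in excerpt):

```c
/* Abridged from the super.c changes in this patch. */
bool bch_cache_set_emergency_read_only(struct cache_set *c)
{
	bool ret = !test_and_set_bit(CACHE_SET_EMERGENCY_RO, &c->flags);

	bch_cache_set_read_only(c);
	wake_up(&bch_read_only_wait);
	return ret;
}

static void bch_cache_set_read_only_work(struct work_struct *work)
{
	struct cache_set *c =
		container_of(work, struct cache_set, read_only_work);

	/*
	 * Normal shutdown: wait for outstanding writes to drain first.
	 * Emergency shutdown: those writes may be stuck behind the stopped
	 * allocator, so don't wait - proceed once CACHE_SET_EMERGENCY_RO
	 * is set...
	 */
	wait_event(bch_read_only_wait,
		   test_bit(CACHE_SET_EMERGENCY_RO, &c->flags) ||
		   test_bit(CACHE_SET_WRITE_DISABLE_COMPLETE, &c->flags));

	__bch_cache_set_read_only(c);	/* shuts down the allocator */

	/* ...and only then wait for the remaining writes: */
	wait_event(bch_read_only_wait,
		   test_bit(CACHE_SET_WRITE_DISABLE_COMPLETE, &c->flags));
}
```

Error paths that previously called bch_cache_set_read_only() directly (bch_inconsistent_error(), bch_fatal_error(), the emergency remount path in fs.c) are switched over to the emergency variant.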
-rw-r--r-- | drivers/md/bcache/alloc.c          |  26 |
-rw-r--r-- | drivers/md/bcache/alloc.h          |   2 |
-rw-r--r-- | drivers/md/bcache/bcache.h         |   9 |
-rw-r--r-- | drivers/md/bcache/blockdev_types.h |   3 |
-rw-r--r-- | drivers/md/bcache/error.c          |  24 |
-rw-r--r-- | drivers/md/bcache/fs.c             |   2 |
-rw-r--r-- | drivers/md/bcache/super.c          | 115 |
-rw-r--r-- | drivers/md/bcache/super.h          |  29 |
8 files changed, 129 insertions, 81 deletions
diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c
index deeed810de86..469ef659c2d3 100644
--- a/drivers/md/bcache/alloc.c
+++ b/drivers/md/bcache/alloc.c
@@ -1666,6 +1666,12 @@ void bch_cache_allocator_stop(struct cache *ca)
 	}
 	mutex_unlock(&c->btree_reserve_cache_lock);
 
+	/* Avoid deadlocks.. */
+
+	closure_wake_up(&c->buckets_available_wait);
+	closure_wake_up(&c->freelist_wait);
+	wake_up(&c->journal.wait);
+
 	/* Now wait for any in flight writes: */
 
 	while (1) {
@@ -1695,15 +1701,23 @@ void bch_cache_allocator_stop(struct cache *ca)
 /*
  * Startup the allocator thread for transition to RW mode:
  */
-const char *bch_cache_allocator_start(struct cache *ca)
+int bch_cache_allocator_start(struct cache *ca)
 {
 	struct cache_set *c = ca->set;
 	struct cache_group *tier = &c->cache_tiers[ca->mi.tier];
 	struct task_struct *k;
 
+	/*
+	 * allocator thread already started?
+	 * (run_cache_set() starts allocator separately from normal rw path, via
+	 * bch_cache_allocator_start_once())
+	 */
+	if (ca->alloc_thread)
+		return 0;
+
 	k = kthread_create(bch_allocator_thread, ca, "bcache_allocator");
 	if (IS_ERR(k))
-		return "error starting allocator thread";
+		return PTR_ERR(k);
 
 	get_task_struct(k);
 	ca->alloc_thread = k;
@@ -1719,8 +1733,7 @@ const char *bch_cache_allocator_start(struct cache *ca)
 	 * -EROFS due to prio_write() -> journal_meta() not finding any devices:
 	 */
 	wake_up_process(k);
-
-	return NULL;
+	return 0;
 }
 
 /*
@@ -1769,7 +1782,10 @@ const char *bch_cache_allocator_start_once(struct cache *ca)
 	if (!fifo_full(&ca->free[RESERVE_PRIO]))
 		return "couldn't find enough available buckets to write prios";
 
-	return bch_cache_allocator_start(ca);
+	if (bch_cache_allocator_start(ca))
+		return "error starting allocator thread";
+
+	return NULL;
 }
 
 void bch_open_buckets_init(struct cache_set *c)
diff --git a/drivers/md/bcache/alloc.h b/drivers/md/bcache/alloc.h
index 5fc36604a78f..7dbbb23ff67a 100644
--- a/drivers/md/bcache/alloc.h
+++ b/drivers/md/bcache/alloc.h
@@ -55,7 +55,7 @@ static inline void bch_wake_allocator(struct cache *ca)
 	     (_ptr)++)
 
 void bch_cache_allocator_stop(struct cache *);
-const char *bch_cache_allocator_start(struct cache *);
+int bch_cache_allocator_start(struct cache *);
 const char *bch_cache_allocator_start_once(struct cache *);
 
 void bch_open_buckets_init(struct cache_set *);
diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index d5336b3e3691..86bd9fc41a84 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -331,6 +331,12 @@ struct cache_member_rcu {
 	struct cache_member_cpu m[];
 };
 
+/* cache->flags: */
+enum {
+	CACHE_DEV_REMOVING,
+	CACHE_DEV_FORCE_REMOVE,
+};
+
 struct cache {
 	struct percpu_ref ref;
 	struct rcu_head free_rcu;
@@ -476,6 +482,8 @@ enum {
 	CACHE_SET_RUNNING,
 	CACHE_SET_RO,
 	CACHE_SET_RO_COMPLETE,
+	CACHE_SET_EMERGENCY_RO,
+	CACHE_SET_WRITE_DISABLE_COMPLETE,
 	CACHE_SET_GC_STOPPING,
 	CACHE_SET_GC_FAILURE,
 	CACHE_SET_BDEV_MOUNTED,
@@ -506,7 +514,6 @@ struct cache_set {
 
 	/* Counts outstanding writes, for clean transition to read-only */
 	struct percpu_ref writes;
-	struct completion write_disable_complete;
 	struct work_struct read_only_work;
 
 	struct cache __rcu *cache[MAX_CACHES_PER_SET];
diff --git a/drivers/md/bcache/blockdev_types.h b/drivers/md/bcache/blockdev_types.h
index 65ecc42c2f85..67dd4dec868b 100644
--- a/drivers/md/bcache/blockdev_types.h
+++ b/drivers/md/bcache/blockdev_types.h
@@ -120,7 +120,4 @@ struct cached_dev {
 	unsigned char writeback_percent;
 };
 
-#define CACHE_DEV_REMOVING 0
-#define CACHE_DEV_FORCE_REMOVE 1
-
 #endif /* _BCACHE_BLOCKDEV_TYPES_H */
diff --git a/drivers/md/bcache/error.c b/drivers/md/bcache/error.c
index 569348541132..2272cb76cf13 100644
--- a/drivers/md/bcache/error.c
+++ b/drivers/md/bcache/error.c
@@ -16,7 +16,7 @@ void bch_inconsistent_error(struct cache_set *c)
 			return;
 		}
 
-		if (bch_cache_set_read_only(c))
+		if (bch_cache_set_emergency_read_only(c))
 			__bch_cache_set_error(c, "emergency read only");
 		break;
 	case BCH_ON_ERROR_PANIC:
@@ -28,9 +28,8 @@ void bch_inconsistent_error(struct cache_set *c)
 
 void bch_fatal_error(struct cache_set *c)
 {
-	if (bch_cache_set_read_only(c))
-		printk(KERN_ERR "bcache: %pU emergency read only\n",
-		       c->disk_sb.user_uuid.b);
+	if (bch_cache_set_emergency_read_only(c))
+		__bch_cache_set_error(c, "emergency read only");
 }
 
 /* Nonfatal IO errors, IO error/latency accounting: */
@@ -110,20 +109,25 @@ void bch_account_io_completion_time(struct cache *ca,
 void bch_nonfatal_io_error_work(struct work_struct *work)
 {
 	struct cache *ca = container_of(work, struct cache, io_error_work);
+	struct cache_set *c = ca->set;
 	unsigned errors = atomic_read(&ca->io_errors);
 	char buf[BDEVNAME_SIZE];
+	bool dev;
 
-	if (errors < ca->set->error_limit) {
+	if (errors < c->error_limit) {
 		bch_notify_cache_error(ca, false);
 	} else {
 		bch_notify_cache_error(ca, true);
 
 		mutex_lock(&bch_register_lock);
-		if (ca->mi.state == CACHE_ACTIVE) {
-			printk(KERN_ERR "bcache: too many IO errors on %s, going RO\n",
-			       bdevname(ca->disk_sb.bdev, buf));
-			bch_cache_read_only(ca);
-		}
+		dev = bch_cache_may_remove(ca);
+		if (dev
+		    ? bch_cache_read_only(ca)
+		    : bch_cache_set_emergency_read_only(c))
+			__bch_cache_set_error(c,
+				"too many IO errors on %s, setting %s RO",
+				bdevname(ca->disk_sb.bdev, buf),
+				dev ? "device" : "filesystem");
 		mutex_unlock(&bch_register_lock);
 	}
 }
diff --git a/drivers/md/bcache/fs.c b/drivers/md/bcache/fs.c
index 9c0d1cbcaa9a..7a3b267597e0 100644
--- a/drivers/md/bcache/fs.c
+++ b/drivers/md/bcache/fs.c
@@ -882,7 +882,7 @@ setflags_out:
 		down_write(&sb->s_umount);
 		sb->s_flags |= MS_RDONLY;
-		bch_cache_set_read_only(c);
+		bch_cache_set_emergency_read_only(c);
 		up_write(&sb->s_umount);
 
 		return 0;
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index f013cc9cad74..4a903d402144 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -729,7 +729,8 @@ static void bch_writes_disabled(struct percpu_ref *writes)
 {
 	struct cache_set *c = container_of(writes, struct cache_set, writes);
 
-	complete(&c->write_disable_complete);
+	set_bit(CACHE_SET_WRITE_DISABLE_COMPLETE, &c->flags);
+	wake_up(&bch_read_only_wait);
 }
 
 static void bch_cache_set_read_only_work(struct work_struct *work)
@@ -737,7 +738,6 @@ static void bch_cache_set_read_only_work(struct work_struct *work)
 {
 	struct cache_set *c = container_of(work, struct cache_set, read_only_work);
 
-	init_completion(&c->write_disable_complete);
 	percpu_ref_put(&c->writes);
 
 	del_timer_sync(&c->foreground_write_wakeup);
@@ -746,11 +746,23 @@ static void bch_cache_set_read_only_work(struct work_struct *work)
 	c->foreground_write_pd.rate.rate = UINT_MAX;
 	bch_wake_delayed_writes((unsigned long) c);
 
-	/* Wait for outstanding writes to complete: */
-	wait_for_completion(&c->write_disable_complete);
+	/*
+	 * If we're not doing an emergency shutdown, we want to wait on
+	 * outstanding writes to complete so they don't see spurious errors due
+	 * to shutting down the allocator.
+	 *
+	 * If we are doing an emergency shutdown, outstanding writes may hang
+	 * until we shutdown the allocator, so we don't want to wait here:
+	 */
+	wait_event(bch_read_only_wait,
+		   test_bit(CACHE_SET_EMERGENCY_RO, &c->flags) ||
+		   test_bit(CACHE_SET_WRITE_DISABLE_COMPLETE, &c->flags));
 
 	__bch_cache_set_read_only(c);
 
+	wait_event(bch_read_only_wait,
+		   test_bit(CACHE_SET_WRITE_DISABLE_COMPLETE, &c->flags));
+
 	bch_notify_cache_set_read_only(c);
 	trace_bcache_cache_set_read_only_done(c);
 
@@ -781,16 +793,26 @@ bool bch_cache_set_read_only(struct cache_set *c)
 	return true;
 }
 
+bool bch_cache_set_emergency_read_only(struct cache_set *c)
+{
+	bool ret = !test_and_set_bit(CACHE_SET_EMERGENCY_RO, &c->flags);
+
+	bch_cache_set_read_only(c);
+
+	wake_up(&bch_read_only_wait);
+	return ret;
+}
+
 void bch_cache_set_read_only_sync(struct cache_set *c)
 {
+	/* so we don't race with bch_cache_set_read_write() */
+	lockdep_assert_held(&bch_register_lock);
+
 	bch_cache_set_read_only(c);
 
-	/*
-	 * XXX: can hang indefinitely if we race and someone else does a
-	 * cache_set_read_write
-	 */
 	wait_event(bch_read_only_wait,
-		   test_bit(CACHE_SET_RO_COMPLETE, &c->flags));
+		   test_bit(CACHE_SET_RO_COMPLETE, &c->flags) &&
+		   test_bit(CACHE_SET_WRITE_DISABLE_COMPLETE, &c->flags));
 }
 
 static const char *__bch_cache_set_read_write(struct cache_set *c)
@@ -799,6 +821,16 @@ static const char *__bch_cache_set_read_write(struct cache_set *c)
 	const char *err;
 	unsigned i;
 
+	lockdep_assert_held(&bch_register_lock);
+
+	err = "error starting allocator thread";
+	for_each_cache(ca, c, i)
+		if (ca->mi.state == CACHE_ACTIVE &&
+		    bch_cache_allocator_start(ca)) {
+			percpu_ref_put(&ca->ref);
+			goto err;
+		}
+
 	err = "error starting btree GC thread";
 	if (bch_gc_thread_start(c))
 		goto err;
@@ -832,35 +864,24 @@ err:
 
 const char *bch_cache_set_read_write(struct cache_set *c)
 {
-	struct cache *ca;
 	const char *err;
-	unsigned i;
 
 	lockdep_assert_held(&bch_register_lock);
 
 	if (!test_bit(CACHE_SET_RO_COMPLETE, &c->flags))
 		return NULL;
 
-	for_each_cache(ca, c, i)
-		if (ca->mi.state == CACHE_ACTIVE &&
-		    (err = bch_cache_allocator_start(ca))) {
-			percpu_ref_put(&ca->ref);
-			goto err;
-		}
-
 	err = __bch_cache_set_read_write(c);
 	if (err)
 		return err;
 
 	percpu_ref_reinit(&c->writes);
 
+	clear_bit(CACHE_SET_WRITE_DISABLE_COMPLETE, &c->flags);
+	clear_bit(CACHE_SET_EMERGENCY_RO, &c->flags);
 	clear_bit(CACHE_SET_RO_COMPLETE, &c->flags);
 	clear_bit(CACHE_SET_RO, &c->flags);
-
 	return NULL;
-err:
-	__bch_cache_set_read_only(c);
-	return err;
 }
 
 /* Cache set startup/shutdown: */
@@ -956,8 +977,6 @@ static void __cache_set_stop2(struct closure *cl)
 	if (!IS_ERR_OR_NULL(c->chardev))
 		device_unregister(c->chardev);
 
-	bch_cache_set_read_only_sync(c);
-
 	if (c->kobj.state_in_sysfs)
 		kobject_del(&c->kobj);
 
@@ -967,6 +986,10 @@ static void __cache_set_stop2(struct closure *cl)
 	kobject_put(&c->opts_dir);
 	kobject_put(&c->internal);
 
+	mutex_lock(&bch_register_lock);
+	bch_cache_set_read_only_sync(c);
+	mutex_unlock(&bch_register_lock);
+
 	closure_return(cl);
 }
 
@@ -1475,32 +1498,6 @@ static const char *can_attach_cache(struct cache_sb *sb, struct cache_set *c)
 
 /* Cache device */
 
-static bool cache_may_remove(struct cache *ca)
-{
-	struct cache_set *c = ca->set;
-	struct cache_group *tier = &c->cache_tiers[ca->mi.tier];
-
-	/*
-	 * Right now, we can't remove the last device from a tier,
-	 * - For tier 0, because all metadata lives in tier 0 and because
-	 *   there is no way to have foreground writes go directly to tier 1.
-	 * - For tier 1, because the code doesn't completely support an
-	 *   empty tier 1.
-	 */
-
-	/*
-	 * Turning a device read-only removes it from the cache group,
-	 * so there may only be one read-write device in a tier, and yet
-	 * the device we are removing is in the same tier, so we have
-	 * to check for identity.
-	 * Removing the last RW device from a tier requires turning the
-	 * whole cache set RO.
-	 */
-
-	return tier->nr_devices != 1 ||
-		rcu_access_pointer(tier->devices[0]) != ca;
-}
-
 static void __bch_cache_read_only(struct cache *ca)
 {
 	trace_bcache_cache_read_only(ca);
@@ -1523,7 +1520,7 @@ static void __bch_cache_read_only(struct cache *ca)
 	trace_bcache_cache_read_only_done(ca);
 }
 
-void bch_cache_read_only(struct cache *ca)
+bool bch_cache_read_only(struct cache *ca)
 {
 	struct cache_set *c = ca->set;
 	char buf[BDEVNAME_SIZE];
@@ -1533,9 +1530,9 @@ void bch_cache_read_only(struct cache *ca)
 	lockdep_assert_held(&bch_register_lock);
 
 	if (ca->mi.state != CACHE_ACTIVE)
-		return;
+		return false;
 
-	if (!cache_may_remove(ca)) {
+	if (!bch_cache_may_remove(ca)) {
 		printk(__bch_err_fmt(c, "required member %s going RO, forcing fs RO",
 				     buf));
 		bch_cache_set_read_only_sync(c);
@@ -1551,6 +1548,7 @@ void bch_cache_read_only(struct cache *ca)
 	SET_CACHE_STATE(&c->disk_mi[ca->sb.nr_this_dev], CACHE_RO);
 	bcache_write_super(c);
+	return true;
 }
 
 static const char *__bch_cache_read_write(struct cache *ca)
@@ -1562,16 +1560,15 @@ static const char *__bch_cache_read_write(struct cache *ca)
 
 	trace_bcache_cache_read_write(ca);
 
-	err = bch_cache_allocator_start(ca);
-	if (err)
-		return err;
+	if (bch_cache_allocator_start(ca))
+		return "error starting allocator thread";
 
-	err = "error starting tiering write workqueue";
 	if (bch_tiering_write_start(ca))
-		return err;
+		return "error starting tiering write workqueue";
 
 	trace_bcache_cache_read_write_done(ca);
 
+	/* XXX wtf? */
 	return NULL;
 
 	err = "error starting moving GC thread";
@@ -1812,7 +1809,7 @@ bool bch_cache_remove(struct cache *ca, bool force)
 	if (test_bit(CACHE_DEV_REMOVING, &ca->flags))
 		return false;
 
-	if (!cache_may_remove(ca)) {
+	if (!bch_cache_may_remove(ca)) {
 		pr_err("Can't remove last device in tier %u of %pU.",
 		       ca->mi.tier, ca->set->disk_sb.set_uuid.b);
 		bch_notify_cache_remove_failed(ca);
diff --git a/drivers/md/bcache/super.h b/drivers/md/bcache/super.h
index 9d537e1eb91c..801b17d7b008 100644
--- a/drivers/md/bcache/super.h
+++ b/drivers/md/bcache/super.h
@@ -145,6 +145,32 @@ static inline void bch_check_mark_super(struct cache_set *c,
 	bch_check_mark_super_slowpath(c, k, meta);
 }
 
+static inline bool bch_cache_may_remove(struct cache *ca)
+{
+	struct cache_set *c = ca->set;
+	struct cache_group *tier = &c->cache_tiers[ca->mi.tier];
+
+	/*
+	 * Right now, we can't remove the last device from a tier,
+	 * - For tier 0, because all metadata lives in tier 0 and because
+	 *   there is no way to have foreground writes go directly to tier 1.
+	 * - For tier 1, because the code doesn't completely support an
+	 *   empty tier 1.
+	 */
+
+	/*
+	 * Turning a device read-only removes it from the cache group,
+	 * so there may only be one read-write device in a tier, and yet
+	 * the device we are removing is in the same tier, so we have
+	 * to check for identity.
+	 * Removing the last RW device from a tier requires turning the
+	 * whole cache set RO.
+	 */
+
+	return tier->nr_devices != 1 ||
+		rcu_access_pointer(tier->devices[0]) != ca;
+}
+
 void free_super(struct bcache_superblock *);
 int bch_super_realloc(struct bcache_superblock *, unsigned);
 void bcache_write_super(struct cache_set *);
@@ -162,10 +188,11 @@ const char *bch_register_cache_set(char * const *, unsigned,
 				   struct cache_set **);
 
 bool bch_cache_set_read_only(struct cache_set *);
+bool bch_cache_set_emergency_read_only(struct cache_set *);
 void bch_cache_set_read_only_sync(struct cache_set *);
 const char *bch_cache_set_read_write(struct cache_set *);
 
-void bch_cache_read_only(struct cache *);
+bool bch_cache_read_only(struct cache *);
 const char *bch_cache_read_write(struct cache *);
 
 bool bch_cache_remove(struct cache *, bool force);
 int bch_cache_set_add_cache(struct cache_set *, const char *);