author	Kent Overstreet <kent.overstreet@gmail.com>	2016-03-15 17:00:41 -0800
committer	Kent Overstreet <kent.overstreet@gmail.com>	2016-10-07 12:35:49 -0800
commit	1b60766af54b16e92783289e97605aafcf1b4a74 (patch)
tree	22cbfa609f56e933c5d1514ebe37389a2548f8ef
parent	7e4ed6a922b0358d7950b3724dd0ac2d59e0f319 (diff)
bcache: bch_cache_set_emergency_read_only()
Going RO via bch_cache_set_read_only() on error, when the allocator has already stopped, could deadlock: we'd be waiting on foreground writes to complete that are stuck precisely because the allocator already stopped (due to the original error).
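The core of the fix is an ordering change in bch_cache_set_read_only_work(): on an emergency shutdown the worker no longer waits for outstanding writes before shutting down the allocator, because those writes may themselves be blocked on the allocator. Below is a minimal userspace model of that ordering, with pthreads standing in for the kernel waitqueues; the variable names merely mirror the flags and helpers introduced by this patch (CACHE_SET_EMERGENCY_RO, CACHE_SET_WRITE_DISABLE_COMPLETE, bch_read_only_wait), and this is an illustrative sketch, not kernel code.

/* deadlock_model.c - toy model of the emergency-read-only ordering.
 * Build: cc -pthread deadlock_model.c -o deadlock_model
 */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  read_only_wait = PTHREAD_COND_INITIALIZER;

static bool emergency_ro;             /* models CACHE_SET_EMERGENCY_RO */
static bool write_disable_complete;   /* models CACHE_SET_WRITE_DISABLE_COMPLETE */
static bool allocator_stopped;        /* set once the allocator has shut down */
static int  outstanding_writes = 1;   /* models the c->writes percpu_ref */

/* A foreground write stuck on the (already dead) allocator; one condvar
 * stands in for the allocator waitqueues woken in bch_cache_allocator_stop(). */
static void *stuck_write(void *arg)
{
	pthread_mutex_lock(&lock);
	while (!allocator_stopped)
		pthread_cond_wait(&read_only_wait, &lock);

	/* The write finishes (with an error) once the allocator shutdown wakes it. */
	if (--outstanding_writes == 0) {
		write_disable_complete = true;   /* models bch_writes_disabled() */
		pthread_cond_broadcast(&read_only_wait);
	}
	pthread_mutex_unlock(&lock);
	return NULL;
}

/* Models bch_cache_set_read_only_work() with the post-patch ordering. */
static void *read_only_work(void *arg)
{
	pthread_mutex_lock(&lock);
	/* Don't wait for outstanding writes if this is an emergency shutdown: */
	while (!emergency_ro && !write_disable_complete)
		pthread_cond_wait(&read_only_wait, &lock);

	/* __bch_cache_set_read_only(): stop the allocator, waking its waiters. */
	allocator_stopped = true;
	pthread_cond_broadcast(&read_only_wait);

	/* Now the stuck writes can drain, so this second wait terminates. */
	while (!write_disable_complete)
		pthread_cond_wait(&read_only_wait, &lock);
	pthread_mutex_unlock(&lock);

	printf("read-only transition complete\n");
	return NULL;
}

int main(void)
{
	pthread_t w, ro;

	pthread_create(&w, NULL, stuck_write, NULL);
	pthread_create(&ro, NULL, read_only_work, NULL);

	/* bch_cache_set_emergency_read_only(): set the flag, then wake the worker. */
	pthread_mutex_lock(&lock);
	emergency_ro = true;
	pthread_cond_broadcast(&read_only_wait);
	pthread_mutex_unlock(&lock);

	pthread_join(w, NULL);
	pthread_join(ro, NULL);
	return 0;
}

With the pre-patch ordering (wait for all writes to drain before stopping the allocator), this model hangs: the worker waits on a write that is itself waiting on the allocator, which is the deadlock the commit message describes.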
 drivers/md/bcache/alloc.c          |  26
 drivers/md/bcache/alloc.h          |   2
 drivers/md/bcache/bcache.h         |   9
 drivers/md/bcache/blockdev_types.h |   3
 drivers/md/bcache/error.c          |  24
 drivers/md/bcache/fs.c             |   2
 drivers/md/bcache/super.c          | 115
 drivers/md/bcache/super.h          |  29
 8 files changed, 129 insertions(+), 81 deletions(-)
diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c
index deeed810de86..469ef659c2d3 100644
--- a/drivers/md/bcache/alloc.c
+++ b/drivers/md/bcache/alloc.c
@@ -1666,6 +1666,12 @@ void bch_cache_allocator_stop(struct cache *ca)
}
mutex_unlock(&c->btree_reserve_cache_lock);
+ /* Avoid deadlocks.. */
+
+ closure_wake_up(&c->buckets_available_wait);
+ closure_wake_up(&c->freelist_wait);
+ wake_up(&c->journal.wait);
+
/* Now wait for any in flight writes: */
while (1) {
@@ -1695,15 +1701,23 @@ void bch_cache_allocator_stop(struct cache *ca)
/*
* Startup the allocator thread for transition to RW mode:
*/
-const char *bch_cache_allocator_start(struct cache *ca)
+int bch_cache_allocator_start(struct cache *ca)
{
struct cache_set *c = ca->set;
struct cache_group *tier = &c->cache_tiers[ca->mi.tier];
struct task_struct *k;
+ /*
+ * allocator thread already started?
+ * (run_cache_set() starts allocator separately from normal rw path, via
+ * bch_cache_allocator_start_once())
+ */
+ if (ca->alloc_thread)
+ return 0;
+
k = kthread_create(bch_allocator_thread, ca, "bcache_allocator");
if (IS_ERR(k))
- return "error starting allocator thread";
+ return PTR_ERR(k);
get_task_struct(k);
ca->alloc_thread = k;
@@ -1719,8 +1733,7 @@ const char *bch_cache_allocator_start(struct cache *ca)
* -EROFS due to prio_write() -> journal_meta() not finding any devices:
*/
wake_up_process(k);
-
- return NULL;
+ return 0;
}
/*
@@ -1769,7 +1782,10 @@ const char *bch_cache_allocator_start_once(struct cache *ca)
if (!fifo_full(&ca->free[RESERVE_PRIO]))
return "couldn't find enough available buckets to write prios";
- return bch_cache_allocator_start(ca);
+ if (bch_cache_allocator_start(ca))
+ return "error starting allocator thread";
+
+ return NULL;
}
void bch_open_buckets_init(struct cache_set *c)
diff --git a/drivers/md/bcache/alloc.h b/drivers/md/bcache/alloc.h
index 5fc36604a78f..7dbbb23ff67a 100644
--- a/drivers/md/bcache/alloc.h
+++ b/drivers/md/bcache/alloc.h
@@ -55,7 +55,7 @@ static inline void bch_wake_allocator(struct cache *ca)
(_ptr)++)
void bch_cache_allocator_stop(struct cache *);
-const char *bch_cache_allocator_start(struct cache *);
+int bch_cache_allocator_start(struct cache *);
const char *bch_cache_allocator_start_once(struct cache *);
void bch_open_buckets_init(struct cache_set *);
diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index d5336b3e3691..86bd9fc41a84 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -331,6 +331,12 @@ struct cache_member_rcu {
struct cache_member_cpu m[];
};
+/* cache->flags: */
+enum {
+ CACHE_DEV_REMOVING,
+ CACHE_DEV_FORCE_REMOVE,
+};
+
struct cache {
struct percpu_ref ref;
struct rcu_head free_rcu;
@@ -476,6 +482,8 @@ enum {
CACHE_SET_RUNNING,
CACHE_SET_RO,
CACHE_SET_RO_COMPLETE,
+ CACHE_SET_EMERGENCY_RO,
+ CACHE_SET_WRITE_DISABLE_COMPLETE,
CACHE_SET_GC_STOPPING,
CACHE_SET_GC_FAILURE,
CACHE_SET_BDEV_MOUNTED,
@@ -506,7 +514,6 @@ struct cache_set {
/* Counts outstanding writes, for clean transition to read-only */
struct percpu_ref writes;
- struct completion write_disable_complete;
struct work_struct read_only_work;
struct cache __rcu *cache[MAX_CACHES_PER_SET];
diff --git a/drivers/md/bcache/blockdev_types.h b/drivers/md/bcache/blockdev_types.h
index 65ecc42c2f85..67dd4dec868b 100644
--- a/drivers/md/bcache/blockdev_types.h
+++ b/drivers/md/bcache/blockdev_types.h
@@ -120,7 +120,4 @@ struct cached_dev {
unsigned char writeback_percent;
};
-#define CACHE_DEV_REMOVING 0
-#define CACHE_DEV_FORCE_REMOVE 1
-
#endif /* _BCACHE_BLOCKDEV_TYPES_H */
diff --git a/drivers/md/bcache/error.c b/drivers/md/bcache/error.c
index 569348541132..2272cb76cf13 100644
--- a/drivers/md/bcache/error.c
+++ b/drivers/md/bcache/error.c
@@ -16,7 +16,7 @@ void bch_inconsistent_error(struct cache_set *c)
return;
}
- if (bch_cache_set_read_only(c))
+ if (bch_cache_set_emergency_read_only(c))
__bch_cache_set_error(c, "emergency read only");
break;
case BCH_ON_ERROR_PANIC:
@@ -28,9 +28,8 @@ void bch_inconsistent_error(struct cache_set *c)
void bch_fatal_error(struct cache_set *c)
{
- if (bch_cache_set_read_only(c))
- printk(KERN_ERR "bcache: %pU emergency read only\n",
- c->disk_sb.user_uuid.b);
+ if (bch_cache_set_emergency_read_only(c))
+ __bch_cache_set_error(c, "emergency read only");
}
/* Nonfatal IO errors, IO error/latency accounting: */
@@ -110,20 +109,25 @@ void bch_account_io_completion_time(struct cache *ca,
void bch_nonfatal_io_error_work(struct work_struct *work)
{
struct cache *ca = container_of(work, struct cache, io_error_work);
+ struct cache_set *c = ca->set;
unsigned errors = atomic_read(&ca->io_errors);
char buf[BDEVNAME_SIZE];
+ bool dev;
- if (errors < ca->set->error_limit) {
+ if (errors < c->error_limit) {
bch_notify_cache_error(ca, false);
} else {
bch_notify_cache_error(ca, true);
mutex_lock(&bch_register_lock);
- if (ca->mi.state == CACHE_ACTIVE) {
- printk(KERN_ERR "bcache: too many IO errors on %s, going RO\n",
- bdevname(ca->disk_sb.bdev, buf));
- bch_cache_read_only(ca);
- }
+ dev = bch_cache_may_remove(ca);
+ if (dev
+ ? bch_cache_read_only(ca)
+ : bch_cache_set_emergency_read_only(c))
+ __bch_cache_set_error(c,
+ "too many IO errors on %s, setting %s RO",
+ bdevname(ca->disk_sb.bdev, buf),
+ dev ? "device" : "filesystem");
mutex_unlock(&bch_register_lock);
}
}
diff --git a/drivers/md/bcache/fs.c b/drivers/md/bcache/fs.c
index 9c0d1cbcaa9a..7a3b267597e0 100644
--- a/drivers/md/bcache/fs.c
+++ b/drivers/md/bcache/fs.c
@@ -882,7 +882,7 @@ setflags_out:
down_write(&sb->s_umount);
sb->s_flags |= MS_RDONLY;
- bch_cache_set_read_only(c);
+ bch_cache_set_emergency_read_only(c);
up_write(&sb->s_umount);
return 0;
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index f013cc9cad74..4a903d402144 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -729,7 +729,8 @@ static void bch_writes_disabled(struct percpu_ref *writes)
{
struct cache_set *c = container_of(writes, struct cache_set, writes);
- complete(&c->write_disable_complete);
+ set_bit(CACHE_SET_WRITE_DISABLE_COMPLETE, &c->flags);
+ wake_up(&bch_read_only_wait);
}
static void bch_cache_set_read_only_work(struct work_struct *work)
@@ -737,7 +738,6 @@ static void bch_cache_set_read_only_work(struct work_struct *work)
struct cache_set *c =
container_of(work, struct cache_set, read_only_work);
- init_completion(&c->write_disable_complete);
percpu_ref_put(&c->writes);
del_timer_sync(&c->foreground_write_wakeup);
@@ -746,11 +746,23 @@ static void bch_cache_set_read_only_work(struct work_struct *work)
c->foreground_write_pd.rate.rate = UINT_MAX;
bch_wake_delayed_writes((unsigned long) c);
- /* Wait for outstanding writes to complete: */
- wait_for_completion(&c->write_disable_complete);
+ /*
+ * If we're not doing an emergency shutdown, we want to wait on
+ * outstanding writes to complete so they don't see spurious errors due
+ * to shutting down the allocator.
+ *
+ * If we are doing an emergency shutdown, outstanding writes may hang
+ * until we shutdown the allocator, so we don't want to wait here:
+ */
+ wait_event(bch_read_only_wait,
+ test_bit(CACHE_SET_EMERGENCY_RO, &c->flags) ||
+ test_bit(CACHE_SET_WRITE_DISABLE_COMPLETE, &c->flags));
__bch_cache_set_read_only(c);
+ wait_event(bch_read_only_wait,
+ test_bit(CACHE_SET_WRITE_DISABLE_COMPLETE, &c->flags));
+
bch_notify_cache_set_read_only(c);
trace_bcache_cache_set_read_only_done(c);
@@ -781,16 +793,26 @@ bool bch_cache_set_read_only(struct cache_set *c)
return true;
}
+bool bch_cache_set_emergency_read_only(struct cache_set *c)
+{
+ bool ret = !test_and_set_bit(CACHE_SET_EMERGENCY_RO, &c->flags);
+
+ bch_cache_set_read_only(c);
+
+ wake_up(&bch_read_only_wait);
+ return ret;
+}
+
void bch_cache_set_read_only_sync(struct cache_set *c)
{
+ /* so we don't race with bch_cache_set_read_write() */
+ lockdep_assert_held(&bch_register_lock);
+
bch_cache_set_read_only(c);
- /*
- * XXX: can hang indefinitely if we race and someone else does a
- * cache_set_read_write
- */
wait_event(bch_read_only_wait,
- test_bit(CACHE_SET_RO_COMPLETE, &c->flags));
+ test_bit(CACHE_SET_RO_COMPLETE, &c->flags) &&
+ test_bit(CACHE_SET_WRITE_DISABLE_COMPLETE, &c->flags));
}
static const char *__bch_cache_set_read_write(struct cache_set *c)
@@ -799,6 +821,16 @@ static const char *__bch_cache_set_read_write(struct cache_set *c)
const char *err;
unsigned i;
+ lockdep_assert_held(&bch_register_lock);
+
+ err = "error starting allocator thread";
+ for_each_cache(ca, c, i)
+ if (ca->mi.state == CACHE_ACTIVE &&
+ bch_cache_allocator_start(ca)) {
+ percpu_ref_put(&ca->ref);
+ goto err;
+ }
+
err = "error starting btree GC thread";
if (bch_gc_thread_start(c))
goto err;
@@ -832,35 +864,24 @@ err:
const char *bch_cache_set_read_write(struct cache_set *c)
{
- struct cache *ca;
const char *err;
- unsigned i;
lockdep_assert_held(&bch_register_lock);
if (!test_bit(CACHE_SET_RO_COMPLETE, &c->flags))
return NULL;
- for_each_cache(ca, c, i)
- if (ca->mi.state == CACHE_ACTIVE &&
- (err = bch_cache_allocator_start(ca))) {
- percpu_ref_put(&ca->ref);
- goto err;
- }
-
err = __bch_cache_set_read_write(c);
if (err)
return err;
percpu_ref_reinit(&c->writes);
+ clear_bit(CACHE_SET_WRITE_DISABLE_COMPLETE, &c->flags);
+ clear_bit(CACHE_SET_EMERGENCY_RO, &c->flags);
clear_bit(CACHE_SET_RO_COMPLETE, &c->flags);
clear_bit(CACHE_SET_RO, &c->flags);
-
return NULL;
-err:
- __bch_cache_set_read_only(c);
- return err;
}
/* Cache set startup/shutdown: */
@@ -956,8 +977,6 @@ static void __cache_set_stop2(struct closure *cl)
if (!IS_ERR_OR_NULL(c->chardev))
device_unregister(c->chardev);
- bch_cache_set_read_only_sync(c);
-
if (c->kobj.state_in_sysfs)
kobject_del(&c->kobj);
@@ -967,6 +986,10 @@ static void __cache_set_stop2(struct closure *cl)
kobject_put(&c->opts_dir);
kobject_put(&c->internal);
+ mutex_lock(&bch_register_lock);
+ bch_cache_set_read_only_sync(c);
+ mutex_unlock(&bch_register_lock);
+
closure_return(cl);
}
@@ -1475,32 +1498,6 @@ static const char *can_attach_cache(struct cache_sb *sb, struct cache_set *c)
/* Cache device */
-static bool cache_may_remove(struct cache *ca)
-{
- struct cache_set *c = ca->set;
- struct cache_group *tier = &c->cache_tiers[ca->mi.tier];
-
- /*
- * Right now, we can't remove the last device from a tier,
- * - For tier 0, because all metadata lives in tier 0 and because
- * there is no way to have foreground writes go directly to tier 1.
- * - For tier 1, because the code doesn't completely support an
- * empty tier 1.
- */
-
- /*
- * Turning a device read-only removes it from the cache group,
- * so there may only be one read-write device in a tier, and yet
- * the device we are removing is in the same tier, so we have
- * to check for identity.
- * Removing the last RW device from a tier requires turning the
- * whole cache set RO.
- */
-
- return tier->nr_devices != 1 ||
- rcu_access_pointer(tier->devices[0]) != ca;
-}
-
static void __bch_cache_read_only(struct cache *ca)
{
trace_bcache_cache_read_only(ca);
@@ -1523,7 +1520,7 @@ static void __bch_cache_read_only(struct cache *ca)
trace_bcache_cache_read_only_done(ca);
}
-void bch_cache_read_only(struct cache *ca)
+bool bch_cache_read_only(struct cache *ca)
{
struct cache_set *c = ca->set;
char buf[BDEVNAME_SIZE];
@@ -1533,9 +1530,9 @@ void bch_cache_read_only(struct cache *ca)
lockdep_assert_held(&bch_register_lock);
if (ca->mi.state != CACHE_ACTIVE)
- return;
+ return false;
- if (!cache_may_remove(ca)) {
+ if (!bch_cache_may_remove(ca)) {
printk(__bch_err_fmt(c, "required member %s going RO, forcing fs RO",
buf));
bch_cache_set_read_only_sync(c);
@@ -1551,6 +1548,7 @@ void bch_cache_read_only(struct cache *ca)
SET_CACHE_STATE(&c->disk_mi[ca->sb.nr_this_dev], CACHE_RO);
bcache_write_super(c);
+ return true;
}
static const char *__bch_cache_read_write(struct cache *ca)
@@ -1562,16 +1560,15 @@ static const char *__bch_cache_read_write(struct cache *ca)
trace_bcache_cache_read_write(ca);
- err = bch_cache_allocator_start(ca);
- if (err)
- return err;
+ if (bch_cache_allocator_start(ca))
+ return "error starting allocator thread";
- err = "error starting tiering write workqueue";
if (bch_tiering_write_start(ca))
- return err;
+ return "error starting tiering write workqueue";
trace_bcache_cache_read_write_done(ca);
+ /* XXX wtf? */
return NULL;
err = "error starting moving GC thread";
@@ -1812,7 +1809,7 @@ bool bch_cache_remove(struct cache *ca, bool force)
if (test_bit(CACHE_DEV_REMOVING, &ca->flags))
return false;
- if (!cache_may_remove(ca)) {
+ if (!bch_cache_may_remove(ca)) {
pr_err("Can't remove last device in tier %u of %pU.",
ca->mi.tier, ca->set->disk_sb.set_uuid.b);
bch_notify_cache_remove_failed(ca);
diff --git a/drivers/md/bcache/super.h b/drivers/md/bcache/super.h
index 9d537e1eb91c..801b17d7b008 100644
--- a/drivers/md/bcache/super.h
+++ b/drivers/md/bcache/super.h
@@ -145,6 +145,32 @@ static inline void bch_check_mark_super(struct cache_set *c,
bch_check_mark_super_slowpath(c, k, meta);
}
+static inline bool bch_cache_may_remove(struct cache *ca)
+{
+ struct cache_set *c = ca->set;
+ struct cache_group *tier = &c->cache_tiers[ca->mi.tier];
+
+ /*
+ * Right now, we can't remove the last device from a tier,
+ * - For tier 0, because all metadata lives in tier 0 and because
+ * there is no way to have foreground writes go directly to tier 1.
+ * - For tier 1, because the code doesn't completely support an
+ * empty tier 1.
+ */
+
+ /*
+ * Turning a device read-only removes it from the cache group,
+ * so there may only be one read-write device in a tier, and yet
+ * the device we are removing is in the same tier, so we have
+ * to check for identity.
+ * Removing the last RW device from a tier requires turning the
+ * whole cache set RO.
+ */
+
+ return tier->nr_devices != 1 ||
+ rcu_access_pointer(tier->devices[0]) != ca;
+}
+
void free_super(struct bcache_superblock *);
int bch_super_realloc(struct bcache_superblock *, unsigned);
void bcache_write_super(struct cache_set *);
@@ -162,10 +188,11 @@ const char *bch_register_cache_set(char * const *, unsigned,
struct cache_set **);
bool bch_cache_set_read_only(struct cache_set *);
+bool bch_cache_set_emergency_read_only(struct cache_set *);
void bch_cache_set_read_only_sync(struct cache_set *);
const char *bch_cache_set_read_write(struct cache_set *);
-void bch_cache_read_only(struct cache *);
+bool bch_cache_read_only(struct cache *);
const char *bch_cache_read_write(struct cache *);
bool bch_cache_remove(struct cache *, bool force);
int bch_cache_set_add_cache(struct cache_set *, const char *);