| author | Kent Overstreet <kent.overstreet@gmail.com> | 2016-03-15 17:00:41 -0800 |
| --- | --- | --- |
| committer | Kent Overstreet <kent.overstreet@gmail.com> | 2016-10-07 12:35:49 -0800 |
| commit | 1b60766af54b16e92783289e97605aafcf1b4a74 (patch) | |
| tree | 22cbfa609f56e933c5d1514ebe37389a2548f8ef | |
| parent | 7e4ed6a922b0358d7950b3724dd0ac2d59e0f319 (diff) | |
bcache: bch_cache_set_emergency_read_only()
Going RO with bch_cache_set_read_only() on error, when the allocator has already
stopped, could deadlock: we'd be waiting on foreground writes to complete that
are stuck because the allocator has already stopped (due to the original error).
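
In outline, the fix adds bch_cache_set_emergency_read_only(), which sets a new CACHE_SET_EMERGENCY_RO flag, and replaces the read-only work's unconditional wait on outstanding writes with flag-based wait_event() calls, so the allocator can be shut down before waiting on writes that would otherwise never complete. A condensed sketch of the resulting path, abridged from the super.c hunks below (not a drop-in excerpt):

```c
/* Abridged from the super.c changes in this patch. */
bool bch_cache_set_emergency_read_only(struct cache_set *c)
{
	bool ret = !test_and_set_bit(CACHE_SET_EMERGENCY_RO, &c->flags);

	bch_cache_set_read_only(c);
	wake_up(&bch_read_only_wait);
	return ret;
}

static void bch_cache_set_read_only_work(struct work_struct *work)
{
	struct cache_set *c =
		container_of(work, struct cache_set, read_only_work);

	/*
	 * Normal shutdown: wait for outstanding writes to drain first.
	 * Emergency shutdown: those writes may be stuck behind the stopped
	 * allocator, so don't wait - proceed once CACHE_SET_EMERGENCY_RO
	 * is set...
	 */
	wait_event(bch_read_only_wait,
		   test_bit(CACHE_SET_EMERGENCY_RO, &c->flags) ||
		   test_bit(CACHE_SET_WRITE_DISABLE_COMPLETE, &c->flags));

	__bch_cache_set_read_only(c);	/* shuts down the allocator */

	/* ...and only then wait for the remaining writes: */
	wait_event(bch_read_only_wait,
		   test_bit(CACHE_SET_WRITE_DISABLE_COMPLETE, &c->flags));
}
```

Error paths that previously called bch_cache_set_read_only() directly (bch_inconsistent_error(), bch_fatal_error(), the emergency remount path in fs.c) are switched over to the emergency variant.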
-rw-r--r-- | drivers/md/bcache/alloc.c          |  26 |
-rw-r--r-- | drivers/md/bcache/alloc.h          |   2 |
-rw-r--r-- | drivers/md/bcache/bcache.h         |   9 |
-rw-r--r-- | drivers/md/bcache/blockdev_types.h |   3 |
-rw-r--r-- | drivers/md/bcache/error.c          |  24 |
-rw-r--r-- | drivers/md/bcache/fs.c             |   2 |
-rw-r--r-- | drivers/md/bcache/super.c          | 115 |
-rw-r--r-- | drivers/md/bcache/super.h          |  29 |
8 files changed, 129 insertions, 81 deletions
diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c
index deeed810de86..469ef659c2d3 100644
--- a/drivers/md/bcache/alloc.c
+++ b/drivers/md/bcache/alloc.c
@@ -1666,6 +1666,12 @@ void bch_cache_allocator_stop(struct cache *ca)
 	}
 	mutex_unlock(&c->btree_reserve_cache_lock);
 
+	/* Avoid deadlocks.. */
+
+	closure_wake_up(&c->buckets_available_wait);
+	closure_wake_up(&c->freelist_wait);
+	wake_up(&c->journal.wait);
+
 	/* Now wait for any in flight writes: */
 
 	while (1) {
@@ -1695,15 +1701,23 @@ void bch_cache_allocator_stop(struct cache *ca)
 /*
  * Startup the allocator thread for transition to RW mode:
  */
-const char *bch_cache_allocator_start(struct cache *ca)
+int bch_cache_allocator_start(struct cache *ca)
 {
 	struct cache_set *c = ca->set;
 	struct cache_group *tier = &c->cache_tiers[ca->mi.tier];
 	struct task_struct *k;
 
+	/*
+	 * allocator thread already started?
+	 * (run_cache_set() starts allocator separately from normal rw path, via
+	 * bch_cache_allocator_start_once())
+	 */
+	if (ca->alloc_thread)
+		return 0;
+
 	k = kthread_create(bch_allocator_thread, ca, "bcache_allocator");
 	if (IS_ERR(k))
-		return "error starting allocator thread";
+		return PTR_ERR(k);
 
 	get_task_struct(k);
 	ca->alloc_thread = k;
@@ -1719,8 +1733,7 @@ const char *bch_cache_allocator_start(struct cache *ca)
 	 * -EROFS due to prio_write() -> journal_meta() not finding any devices:
 	 */
 	wake_up_process(k);
-
-	return NULL;
+	return 0;
 }
 
 /*
@@ -1769,7 +1782,10 @@ const char *bch_cache_allocator_start_once(struct cache *ca)
 	if (!fifo_full(&ca->free[RESERVE_PRIO]))
 		return "couldn't find enough available buckets to write prios";
 
-	return bch_cache_allocator_start(ca);
+	if (bch_cache_allocator_start(ca))
+		return "error starting allocator thread";
+
+	return NULL;
 }
 
 void bch_open_buckets_init(struct cache_set *c)
diff --git a/drivers/md/bcache/alloc.h b/drivers/md/bcache/alloc.h
index 5fc36604a78f..7dbbb23ff67a 100644
--- a/drivers/md/bcache/alloc.h
+++ b/drivers/md/bcache/alloc.h
@@ -55,7 +55,7 @@ static inline void bch_wake_allocator(struct cache *ca)
 	     (_ptr)++)
 
 void bch_cache_allocator_stop(struct cache *);
-const char *bch_cache_allocator_start(struct cache *);
+int bch_cache_allocator_start(struct cache *);
 const char *bch_cache_allocator_start_once(struct cache *);
 
 void bch_open_buckets_init(struct cache_set *);
diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index d5336b3e3691..86bd9fc41a84 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -331,6 +331,12 @@ struct cache_member_rcu {
 	struct cache_member_cpu m[];
 };
 
+/* cache->flags: */
+enum {
+	CACHE_DEV_REMOVING,
+	CACHE_DEV_FORCE_REMOVE,
+};
+
 struct cache {
 	struct percpu_ref ref;
 	struct rcu_head free_rcu;
@@ -476,6 +482,8 @@ enum {
 	CACHE_SET_RUNNING,
 	CACHE_SET_RO,
 	CACHE_SET_RO_COMPLETE,
+	CACHE_SET_EMERGENCY_RO,
+	CACHE_SET_WRITE_DISABLE_COMPLETE,
 	CACHE_SET_GC_STOPPING,
 	CACHE_SET_GC_FAILURE,
 	CACHE_SET_BDEV_MOUNTED,
@@ -506,7 +514,6 @@ struct cache_set {
 
 	/* Counts outstanding writes, for clean transition to read-only */
 	struct percpu_ref writes;
-	struct completion write_disable_complete;
 	struct work_struct read_only_work;
 
 	struct cache __rcu *cache[MAX_CACHES_PER_SET];
diff --git a/drivers/md/bcache/blockdev_types.h b/drivers/md/bcache/blockdev_types.h
index 65ecc42c2f85..67dd4dec868b 100644
--- a/drivers/md/bcache/blockdev_types.h
+++ b/drivers/md/bcache/blockdev_types.h
@@ -120,7 +120,4 @@ struct cached_dev {
 	unsigned char writeback_percent;
 };
 
-#define CACHE_DEV_REMOVING 0
-#define CACHE_DEV_FORCE_REMOVE 1
-
 #endif /* _BCACHE_BLOCKDEV_TYPES_H */
diff --git a/drivers/md/bcache/error.c b/drivers/md/bcache/error.c
index 569348541132..2272cb76cf13 100644
--- a/drivers/md/bcache/error.c
+++ b/drivers/md/bcache/error.c
@@ -16,7 +16,7 @@ void bch_inconsistent_error(struct cache_set *c)
 			return;
 		}
 
-		if (bch_cache_set_read_only(c))
+		if (bch_cache_set_emergency_read_only(c))
 			__bch_cache_set_error(c, "emergency read only");
 		break;
 	case BCH_ON_ERROR_PANIC:
@@ -28,9 +28,8 @@ void bch_inconsistent_error(struct cache_set *c)
 
 void bch_fatal_error(struct cache_set *c)
 {
-	if (bch_cache_set_read_only(c))
-		printk(KERN_ERR "bcache: %pU emergency read only\n",
-		       c->disk_sb.user_uuid.b);
+	if (bch_cache_set_emergency_read_only(c))
+		__bch_cache_set_error(c, "emergency read only");
 }
 
 /* Nonfatal IO errors, IO error/latency accounting: */
@@ -110,20 +109,25 @@ void bch_account_io_completion_time(struct cache *ca,
 void bch_nonfatal_io_error_work(struct work_struct *work)
 {
 	struct cache *ca = container_of(work, struct cache, io_error_work);
+	struct cache_set *c = ca->set;
 	unsigned errors = atomic_read(&ca->io_errors);
 	char buf[BDEVNAME_SIZE];
+	bool dev;
 
-	if (errors < ca->set->error_limit) {
+	if (errors < c->error_limit) {
 		bch_notify_cache_error(ca, false);
 	} else {
 		bch_notify_cache_error(ca, true);
 
 		mutex_lock(&bch_register_lock);
-		if (ca->mi.state == CACHE_ACTIVE) {
-			printk(KERN_ERR "bcache: too many IO errors on %s, going RO\n",
-			       bdevname(ca->disk_sb.bdev, buf));
-			bch_cache_read_only(ca);
-		}
+		dev = bch_cache_may_remove(ca);
+		if (dev
+		    ? bch_cache_read_only(ca)
+		    : bch_cache_set_emergency_read_only(c))
+			__bch_cache_set_error(c,
+				"too many IO errors on %s, setting %s RO",
+				bdevname(ca->disk_sb.bdev, buf),
+				dev ? "device" : "filesystem");
 		mutex_unlock(&bch_register_lock);
 	}
 }
diff --git a/drivers/md/bcache/fs.c b/drivers/md/bcache/fs.c
index 9c0d1cbcaa9a..7a3b267597e0 100644
--- a/drivers/md/bcache/fs.c
+++ b/drivers/md/bcache/fs.c
@@ -882,7 +882,7 @@ setflags_out:
 		down_write(&sb->s_umount);
 		sb->s_flags |= MS_RDONLY;
-		bch_cache_set_read_only(c);
+		bch_cache_set_emergency_read_only(c);
 		up_write(&sb->s_umount);
 
 		return 0;
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index f013cc9cad74..4a903d402144 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -729,7 +729,8 @@ static void bch_writes_disabled(struct percpu_ref *writes)
 {
 	struct cache_set *c = container_of(writes, struct cache_set, writes);
 
-	complete(&c->write_disable_complete);
+	set_bit(CACHE_SET_WRITE_DISABLE_COMPLETE, &c->flags);
+	wake_up(&bch_read_only_wait);
 }
 
 static void bch_cache_set_read_only_work(struct work_struct *work)
@@ -737,7 +738,6 @@ static void bch_cache_set_read_only_work(struct work_struct *work)
 {
 	struct cache_set *c = container_of(work, struct cache_set, read_only_work);
 
-	init_completion(&c->write_disable_complete);
 	percpu_ref_put(&c->writes);
 
 	del_timer_sync(&c->foreground_write_wakeup);
@@ -746,11 +746,23 @@ static void bch_cache_set_read_only_work(struct work_struct *work)
 	c->foreground_write_pd.rate.rate = UINT_MAX;
 	bch_wake_delayed_writes((unsigned long) c);
 
-	/* Wait for outstanding writes to complete: */
-	wait_for_completion(&c->write_disable_complete);
+	/*
+	 * If we're not doing an emergency shutdown, we want to wait on
+	 * outstanding writes to complete so they don't see spurious errors due
+	 * to shutting down the allocator.
+	 *
+	 * If we are doing an emergency shutdown, outstanding writes may hang
+	 * until we shutdown the allocator, so we don't want to wait here:
+	 */
+	wait_event(bch_read_only_wait,
+		   test_bit(CACHE_SET_EMERGENCY_RO, &c->flags) ||
+		   test_bit(CACHE_SET_WRITE_DISABLE_COMPLETE, &c->flags));
 
 	__bch_cache_set_read_only(c);
 
+	wait_event(bch_read_only_wait,
+		   test_bit(CACHE_SET_WRITE_DISABLE_COMPLETE, &c->flags));
+
 	bch_notify_cache_set_read_only(c);
 	trace_bcache_cache_set_read_only_done(c);
 
@@ -781,16 +793,26 @@ bool bch_cache_set_read_only(struct cache_set *c)
 	return true;
 }
 
+bool bch_cache_set_emergency_read_only(struct cache_set *c)
+{
+	bool ret = !test_and_set_bit(CACHE_SET_EMERGENCY_RO, &c->flags);
+
+	bch_cache_set_read_only(c);
+
+	wake_up(&bch_read_only_wait);
+	return ret;
+}
+
 void bch_cache_set_read_only_sync(struct cache_set *c)
 {
+	/* so we don't race with bch_cache_set_read_write() */
+	lockdep_assert_held(&bch_register_lock);
+
 	bch_cache_set_read_only(c);
 
-	/*
-	 * XXX: can hang indefinitely if we race and someone else does a
-	 * cache_set_read_write
-	 */
 	wait_event(bch_read_only_wait,
-		   test_bit(CACHE_SET_RO_COMPLETE, &c->flags));
+		   test_bit(CACHE_SET_RO_COMPLETE, &c->flags) &&
+		   test_bit(CACHE_SET_WRITE_DISABLE_COMPLETE, &c->flags));
 }
 
 static const char *__bch_cache_set_read_write(struct cache_set *c)
@@ -799,6 +821,16 @@ static const char *__bch_cache_set_read_write(struct cache_set *c)
 	const char *err;
 	unsigned i;
 
+	lockdep_assert_held(&bch_register_lock);
+
+	err = "error starting allocator thread";
+	for_each_cache(ca, c, i)
+		if (ca->mi.state == CACHE_ACTIVE &&
+		    bch_cache_allocator_start(ca)) {
+			percpu_ref_put(&ca->ref);
+			goto err;
+		}
+
 	err = "error starting btree GC thread";
 	if (bch_gc_thread_start(c))
 		goto err;
@@ -832,35 +864,24 @@ err:
 
 const char *bch_cache_set_read_write(struct cache_set *c)
 {
-	struct cache *ca;
 	const char *err;
-	unsigned i;
 
 	lockdep_assert_held(&bch_register_lock);
 
 	if (!test_bit(CACHE_SET_RO_COMPLETE, &c->flags))
 		return NULL;
 
-	for_each_cache(ca, c, i)
-		if (ca->mi.state == CACHE_ACTIVE &&
-		    (err = bch_cache_allocator_start(ca))) {
-			percpu_ref_put(&ca->ref);
-			goto err;
-		}
-
 	err = __bch_cache_set_read_write(c);
 	if (err)
 		return err;
 
 	percpu_ref_reinit(&c->writes);
 
+	clear_bit(CACHE_SET_WRITE_DISABLE_COMPLETE, &c->flags);
+	clear_bit(CACHE_SET_EMERGENCY_RO, &c->flags);
 	clear_bit(CACHE_SET_RO_COMPLETE, &c->flags);
 	clear_bit(CACHE_SET_RO, &c->flags);
-
 	return NULL;
-err:
-	__bch_cache_set_read_only(c);
-	return err;
 }
 
 /* Cache set startup/shutdown: */
@@ -956,8 +977,6 @@ static void __cache_set_stop2(struct closure *cl)
 	if (!IS_ERR_OR_NULL(c->chardev))
 		device_unregister(c->chardev);
 
-	bch_cache_set_read_only_sync(c);
-
 	if (c->kobj.state_in_sysfs)
 		kobject_del(&c->kobj);
 
@@ -967,6 +986,10 @@ static void __cache_set_stop2(struct closure *cl)
 	kobject_put(&c->opts_dir);
 	kobject_put(&c->internal);
 
+	mutex_lock(&bch_register_lock);
+	bch_cache_set_read_only_sync(c);
+	mutex_unlock(&bch_register_lock);
+
 	closure_return(cl);
 }
 
@@ -1475,32 +1498,6 @@ static const char *can_attach_cache(struct cache_sb *sb, struct cache_set *c)
 
 /* Cache device */
 
-static bool cache_may_remove(struct cache *ca)
-{
-	struct cache_set *c = ca->set;
-	struct cache_group *tier = &c->cache_tiers[ca->mi.tier];
-
-	/*
-	 * Right now, we can't remove the last device from a tier,
-	 * - For tier 0, because all metadata lives in tier 0 and because
-	 *   there is no way to have foreground writes go directly to tier 1.
-	 * - For tier 1, because the code doesn't completely support an
-	 *   empty tier 1.
-	 */
-
-	/*
-	 * Turning a device read-only removes it from the cache group,
-	 * so there may only be one read-write device in a tier, and yet
-	 * the device we are removing is in the same tier, so we have
-	 * to check for identity.
-	 * Removing the last RW device from a tier requires turning the
-	 * whole cache set RO.
-	 */
-
-	return tier->nr_devices != 1 ||
-		rcu_access_pointer(tier->devices[0]) != ca;
-}
-
 static void __bch_cache_read_only(struct cache *ca)
 {
 	trace_bcache_cache_read_only(ca);
@@ -1523,7 +1520,7 @@ static void __bch_cache_read_only(struct cache *ca)
 	trace_bcache_cache_read_only_done(ca);
 }
 
-void bch_cache_read_only(struct cache *ca)
+bool bch_cache_read_only(struct cache *ca)
 {
 	struct cache_set *c = ca->set;
 	char buf[BDEVNAME_SIZE];
@@ -1533,9 +1530,9 @@ void bch_cache_read_only(struct cache *ca)
 	lockdep_assert_held(&bch_register_lock);
 
 	if (ca->mi.state != CACHE_ACTIVE)
-		return;
+		return false;
 
-	if (!cache_may_remove(ca)) {
+	if (!bch_cache_may_remove(ca)) {
 		printk(__bch_err_fmt(c, "required member %s going RO, forcing fs RO",
 				     buf));
 		bch_cache_set_read_only_sync(c);
@@ -1551,6 +1548,7 @@ void bch_cache_read_only(struct cache *ca)
 	SET_CACHE_STATE(&c->disk_mi[ca->sb.nr_this_dev], CACHE_RO);
 	bcache_write_super(c);
+	return true;
 }
 
 static const char *__bch_cache_read_write(struct cache *ca)
@@ -1562,16 +1560,15 @@ static const char *__bch_cache_read_write(struct cache *ca)
 
 	trace_bcache_cache_read_write(ca);
 
-	err = bch_cache_allocator_start(ca);
-	if (err)
-		return err;
+	if (bch_cache_allocator_start(ca))
+		return "error starting allocator thread";
 
-	err = "error starting tiering write workqueue";
 	if (bch_tiering_write_start(ca))
-		return err;
+		return "error starting tiering write workqueue";
 
 	trace_bcache_cache_read_write_done(ca);
 
+	/* XXX wtf? */
 	return NULL;
 
 	err = "error starting moving GC thread";
@@ -1812,7 +1809,7 @@ bool bch_cache_remove(struct cache *ca, bool force)
 	if (test_bit(CACHE_DEV_REMOVING, &ca->flags))
 		return false;
 
-	if (!cache_may_remove(ca)) {
+	if (!bch_cache_may_remove(ca)) {
 		pr_err("Can't remove last device in tier %u of %pU.",
 		       ca->mi.tier, ca->set->disk_sb.set_uuid.b);
 		bch_notify_cache_remove_failed(ca);
diff --git a/drivers/md/bcache/super.h b/drivers/md/bcache/super.h
index 9d537e1eb91c..801b17d7b008 100644
--- a/drivers/md/bcache/super.h
+++ b/drivers/md/bcache/super.h
@@ -145,6 +145,32 @@ static inline void bch_check_mark_super(struct cache_set *c,
 	bch_check_mark_super_slowpath(c, k, meta);
 }
 
+static inline bool bch_cache_may_remove(struct cache *ca)
+{
+	struct cache_set *c = ca->set;
+	struct cache_group *tier = &c->cache_tiers[ca->mi.tier];
+
+	/*
+	 * Right now, we can't remove the last device from a tier,
+	 * - For tier 0, because all metadata lives in tier 0 and because
+	 *   there is no way to have foreground writes go directly to tier 1.
+	 * - For tier 1, because the code doesn't completely support an
+	 *   empty tier 1.
+	 */
+
+	/*
+	 * Turning a device read-only removes it from the cache group,
+	 * so there may only be one read-write device in a tier, and yet
+	 * the device we are removing is in the same tier, so we have
+	 * to check for identity.
+	 * Removing the last RW device from a tier requires turning the
+	 * whole cache set RO.
+	 */
+
+	return tier->nr_devices != 1 ||
+		rcu_access_pointer(tier->devices[0]) != ca;
+}
+
 void free_super(struct bcache_superblock *);
 int bch_super_realloc(struct bcache_superblock *, unsigned);
 void bcache_write_super(struct cache_set *);
@@ -162,10 +188,11 @@ const char *bch_register_cache_set(char * const *, unsigned,
 				   struct cache_set **);
 
 bool bch_cache_set_read_only(struct cache_set *);
+bool bch_cache_set_emergency_read_only(struct cache_set *);
 void bch_cache_set_read_only_sync(struct cache_set *);
 const char *bch_cache_set_read_write(struct cache_set *);
 
-void bch_cache_read_only(struct cache *);
+bool bch_cache_read_only(struct cache *);
 const char *bch_cache_read_write(struct cache *);
 
 bool bch_cache_remove(struct cache *, bool force);
 int bch_cache_set_add_cache(struct cache_set *, const char *);