diff options
author | Kent Overstreet <kent.overstreet@gmail.com> | 2015-06-01 22:33:29 -0700 |
---|---|---|
committer | Kent Overstreet <kent.overstreet@gmail.com> | 2016-10-07 12:34:21 -0800 |
commit | 0c1e9d1bf2e322ecd928fa1e5519124d7cf2bb39 (patch) | |
tree | 395397bb2891823acbb42633fb2ecc687b13cf43 | |
parent | b5ecdc495b4053c64f7bca90125fb5df20d7e0f1 (diff) |
bcache: Cache RO/remove work
Major cleanups/fixes to the cache device RO/removal code
-rw-r--r-- | drivers/md/bcache/bcache.h | 1 |
-rw-r--r-- | drivers/md/bcache/io.c | 4 |
-rw-r--r-- | drivers/md/bcache/migrate.c | 7 |
-rw-r--r-- | drivers/md/bcache/super.c | 388 |
-rw-r--r-- | drivers/md/bcache/super.h | 2 |
-rw-r--r-- | drivers/md/bcache/sysfs.c | 30 |
6 files changed, 181 insertions, 251 deletions
diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index a160f5946c6e..3327dc490a5f 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -237,6 +237,7 @@ struct cache { struct percpu_ref ref; struct rcu_head free_rcu; struct work_struct free_work; + struct work_struct read_only_work; struct work_struct remove_work; unsigned long flags; diff --git a/drivers/md/bcache/io.c b/drivers/md/bcache/io.c index 75fd276550bc..3249eba03d7b 100644 --- a/drivers/md/bcache/io.c +++ b/drivers/md/bcache/io.c @@ -250,9 +250,9 @@ void bch_cache_io_error_work(struct work_struct *work) bch_notify_cache_error(ca, false); } else { bch_notify_cache_error(ca, true); - printk_ratelimited(KERN_ERR "%s: too many IO errors, removing", + printk_ratelimited(KERN_ERR "%s: too many IO errors, going RO", bdevname(ca->disk_sb.bdev, buf)); - bch_cache_remove(ca, true); + queue_work(system_long_wq, &ca->read_only_work); } } diff --git a/drivers/md/bcache/migrate.c b/drivers/md/bcache/migrate.c index 4ee369a4b7a9..09c0b8137d73 100644 --- a/drivers/md/bcache/migrate.c +++ b/drivers/md/bcache/migrate.c @@ -294,12 +294,6 @@ again: again = false; while (1) { - if (CACHE_STATE(&ca->mi) != CACHE_RO && - CACHE_STATE(&ca->mi) != CACHE_ACTIVE) { - ret = -EACCES; - goto out; - } - if (bch_queue_full(queue)) { if (queue->rotational) { again = true; @@ -356,7 +350,6 @@ again: } else if (MIGRATION_DEBUG) pr_notice("Migrated all data in %d iterations", pass); -out: bch_queue_run(queue, &context); return ret; } diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index f67aa72d18e0..a434a6f07726 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -1396,6 +1396,49 @@ static const char *can_attach_cache(struct cache_sb *sb, struct cache_set *c) /* Cache device */ +/* + * Update the cache set's member info and then the various superblocks from one + * device's member info: + */ +void bch_cache_member_info_update(struct cache *ca) +{ + struct 
cache_set *c = ca->set; + struct cache_member *mi; + + lockdep_assert_held(&bch_register_lock); + + mi = cache_member_info_get(c)->m; + mi[ca->sb.nr_this_dev] = ca->mi; + cache_member_info_put(); + + bcache_write_super(c); +} + +static bool cache_may_remove(struct cache *ca) +{ + struct cache_set *c = ca->set; + + /* + * Right now, we can't remove the last device from a tier, + * - For tier 0, because all metadata lives in tier 0 and because + * there is no way to have foreground writes go directly to tier 1. + * - For tier 1, because the code doesn't completely support an + * empty tier 1. + */ + + /* + * Turning a device read-only removes it from the cache group, + * so there may only be one read-write device in a tier, and yet + * the device we are removing is in the same tier, so we have + * to check for identity. + * Removing the last RW device from a tier requires turning the + * whole cache set RO. + */ + + return c->cache_tiers[CACHE_TIER(&ca->mi)].nr_devices != 1 || + c->cache_tiers[CACHE_TIER(&ca->mi)].devices[0] != ca; +} + static void __bch_cache_read_only(struct cache *ca) { trace_bcache_cache_read_only(ca); @@ -1418,84 +1461,54 @@ static void __bch_cache_read_only(struct cache *ca) trace_bcache_cache_read_only_done(ca); } -static bool bch_last_rw_tier0_device(struct cache *ca) +void bch_cache_read_only(struct cache *ca) { - unsigned i; - bool ret = true; - struct cache *ca2; - - rcu_read_lock(); + struct cache_set *c = ca->set; + char buf[BDEVNAME_SIZE]; - for_each_cache_rcu(ca2, ca->set, i) { - if ((CACHE_TIER(&ca2->mi) == 0) - && (CACHE_STATE(&ca2->mi) == CACHE_ACTIVE) - && (ca2 != ca)) { - ret = false; - } - } + bdevname(ca->disk_sb.bdev, buf); - rcu_read_unlock(); - return ret; -} + lockdep_assert_held(&bch_register_lock); -/* This does not write the super-block, should it? 
*/ + if (CACHE_STATE(&ca->mi) != CACHE_ACTIVE) + return; -void bch_cache_read_only(struct cache *ca) -{ - unsigned tier; - bool has_meta, meta_off; - char buf[BDEVNAME_SIZE]; - struct cache_member *mi; - struct cache_member_rcu *allmi; + if (!cache_may_remove(ca)) { + pr_warning("Required member %s for %pU going RO, cache set going RO", + buf, &c->sb.set_uuid); + bch_cache_set_read_only(c); + } /* * Stop data writes. */ __bch_cache_read_only(ca); - pr_notice("%s read only (data)", bdevname(ca->disk_sb.bdev, buf)); + pr_notice("%s read only", bdevname(ca->disk_sb.bdev, buf)); bch_notify_cache_read_only(ca); - /* - * Mark as RO. - */ - allmi = cache_member_info_get(ca->set); - mi = &allmi->m[ca->sb.nr_this_dev]; - tier = CACHE_TIER(mi); - has_meta = CACHE_HAS_METADATA(mi); - SET_CACHE_STATE(mi, CACHE_RO); - ca->mi = *mi; /* Update cache_member cache in struct cache */ - cache_member_info_put(); + SET_CACHE_STATE(&ca->mi, CACHE_RO); + bch_cache_member_info_update(ca); +} - meta_off = false; +static void bch_cache_read_only_work(struct work_struct *work) +{ + struct cache *ca = container_of(work, struct cache, read_only_work); - /* - * The only way to stop meta-data writes is to actually move - * the meta-data off! - */ - if (has_meta) { - if ((tier == 0) && (bch_last_rw_tier0_device(ca))) - pr_err("Tier 0 needs to allow meta-data writes in %pU.", - ca->set->sb.set_uuid.b); - else if (bch_move_meta_data_off_device(ca) != 0) - pr_err("Unable to stop writing meta-data in %pU.", - ca->set->sb.set_uuid.b); - else - meta_off = true; - } + /* Going RO because of an error: */ - if (has_meta && meta_off) - pr_notice("%s read only (meta-data)", - bdevname(ca->disk_sb.bdev, buf)); - return; + mutex_lock(&bch_register_lock); + bch_cache_read_only(ca); + mutex_unlock(&bch_register_lock); } -/* This does not write the super-block, should it? 
*/ - -const char *bch_cache_read_write(struct cache *ca) +static const char *__bch_cache_read_write(struct cache *ca) { const char *err; + BUG_ON(CACHE_STATE(&ca->mi) != CACHE_ACTIVE); + lockdep_assert_held(&bch_register_lock); + trace_bcache_cache_read_write(ca); err = bch_cache_allocator_start(ca); @@ -1521,6 +1534,28 @@ const char *bch_cache_read_write(struct cache *ca) return err; } +const char *bch_cache_read_write(struct cache *ca) +{ + const char *err; + + lockdep_assert_held(&bch_register_lock); + + if (CACHE_STATE(&ca->mi) == CACHE_ACTIVE) + return NULL; + + if (test_bit(CACHE_DEV_REMOVING, &ca->flags)) + return "removing"; + + err = __bch_cache_read_write(ca); + if (err) + return err; + + SET_CACHE_STATE(&ca->mi, CACHE_ACTIVE); + bch_cache_member_info_update(ca); + + return NULL; +} + /* * bch_cache_stop has already returned, so we no longer hold the register * lock at the point this is called. @@ -1635,218 +1670,116 @@ static void bch_cache_stop(struct cache *ca) static void bch_cache_remove_work(struct work_struct *work) { - unsigned tier; - bool has_data, has_meta, data_off, meta_off; struct cache *ca = container_of(work, struct cache, remove_work); struct cache_set *c = ca->set; - struct cache_member_rcu *allmi; struct cache_member *mi; - char buf[BDEVNAME_SIZE]; - bool force = (test_bit(CACHE_DEV_FORCE_REMOVE, &ca->flags)); + char name[BDEVNAME_SIZE]; + bool force = test_bit(CACHE_DEV_FORCE_REMOVE, &ca->flags); + unsigned dev = ca->sb.nr_this_dev; struct closure cl; closure_init_stack(&cl); - - bch_notify_cache_removing(ca); - - mutex_lock(&bch_register_lock); - allmi = cache_member_info_get(c); - mi = &allmi->m[ca->sb.nr_this_dev]; - - /* - * Right now, we can't remove the last device from a tier, - * - For tier 0, because all metadata lives in tier 0 and because - * there is no way to have foreground writes go directly to tier 1. - * - For tier 1, because the code doesn't completely support an - * empty tier 1. 
- */ - - tier = CACHE_TIER(mi); + bdevname(ca->disk_sb.bdev, name); /* - * Turning a device read-only removes it from the cache group, - * so there may only be one read-write device in a tier, and yet - * the device we are removing is in the same tier, so we have - * to check for identity. - * Removing the last RW device from a tier requires turning the - * whole cache set RO. + * Device should already be RO, now migrate data off: + * + * XXX: locking is sketchy, bch_cache_read_write() has to check + * CACHE_DEV_REMOVING bit */ - - if ((c->cache_tiers[tier].nr_devices == 1) - && (c->cache_tiers[tier].devices[0] == ca)) { - cache_member_info_put(); - mutex_unlock(&bch_register_lock); - clear_bit(CACHE_DEV_FORCE_REMOVE, &ca->flags); + if (!CACHE_HAS_DATA(&ca->mi)) { + /* Nothing to do: */ + } else if (!bch_move_data_off_device(ca)) { + SET_CACHE_HAS_DATA(&ca->mi, false); + bch_cache_member_info_update(ca); + } else if (force) { + bch_flag_data_bad(ca); + + SET_CACHE_HAS_DATA(&ca->mi, false); + bch_cache_member_info_update(ca); + } else { + pr_err("Remove of %s failed, unable to migrate data off", name); clear_bit(CACHE_DEV_REMOVING, &ca->flags); - pr_err("Can't remove last device in tier %u of %pU.", - tier, c->sb.set_uuid.b); - bch_notify_cache_remove_failed(ca); return; } - /* CACHE_ACTIVE means Read/Write. */ + /* Now metadata: */ - if (CACHE_STATE(mi) != CACHE_ACTIVE) { - has_data = CACHE_HAS_DATA(mi); - cache_member_info_put(); + if (!CACHE_HAS_METADATA(&ca->mi)) { + /* Nothing to do: */ + } else if (!bch_move_meta_data_off_device(ca)) { + SET_CACHE_HAS_METADATA(&ca->mi, false); + bch_cache_member_info_update(ca); } else { - cache_member_info_put(); - /* - * The following quiesces data writes but not meta-data writes. 
- */ - __bch_cache_read_only(ca); - - /* Update the state to read-only */ - - allmi = cache_member_info_get(c); - mi = &allmi->m[ca->sb.nr_this_dev]; - SET_CACHE_STATE(mi, CACHE_RO); - ca->mi = *mi; /* Update cache_member cache in struct cache */ - has_data = CACHE_HAS_DATA(mi); - cache_member_info_put(); - bcache_write_super(c); - } - - mutex_unlock(&bch_register_lock); - - /* - * The call to __bch_cache_read_only above has quiesced all data writes. - * Move the data off the device, if there is any. - */ - - data_off = (!has_data || (bch_move_data_off_device(ca) == 0)); - - if (has_data && !data_off && force) - /* Ignore the return value and proceed anyway */ - (void) bch_flag_data_bad(ca); - - allmi = cache_member_info_get(c); - mi = &allmi->m[ca->sb.nr_this_dev]; - if (has_data && (data_off || force)) { - /* We've just moved all the data off! */ - SET_CACHE_HAS_DATA(mi, false); - /* Update cache_member cache in struct cache */ - ca->mi = *mi; - } - has_meta = CACHE_HAS_METADATA(mi); - cache_member_info_put(); - - /* - * If there is no meta data, claim it has been moved off. - * Else, try to move it off -- this also quiesces meta-data writes. - */ - - meta_off = (!has_meta || (bch_move_meta_data_off_device(ca) == 0)); - - /* - * If we successfully moved meta-data off, mark as having none. - */ - - if (has_meta && meta_off) { - allmi = cache_member_info_get(c); - mi = &allmi->m[ca->sb.nr_this_dev]; - /* We've just moved all the meta-data off! */ - SET_CACHE_HAS_METADATA(mi, false); - /* Update cache_member cache in struct cache */ - ca->mi = *mi; - cache_member_info_put(); - } - - /* Now, complain as necessary */ - - /* - * Note: These error messages are messy because pr_err is a macro - * that concatenates its first must-be-string argument. - */ - - if (has_data && !data_off) - pr_err("%s in %pU%s", - (force - ? "Forcing device removal with live data" - : "Unable to move data off device"), - c->sb.set_uuid.b, - (force ? "!" 
: ".")); - - if (has_meta && !meta_off) - pr_err("%s in %pU%s", - (force - ? "Forcing device removal with live meta-data" - : "Unable to move meta-data off device"), - c->sb.set_uuid.b, - (force ? "!" : ".")); - - /* If there is (meta-) data left, and not forcing, abort */ - - if ((!data_off || !meta_off) && !force) { + pr_err("Remove of %s failed, unable to migrate metadata off", + name); clear_bit(CACHE_DEV_REMOVING, &ca->flags); - bch_notify_cache_remove_failed(ca); return; } - if (has_meta && meta_off) - pr_notice("%s read only (meta-data)", - bdevname(ca->disk_sb.bdev, buf)); - - /* Update the super block */ - - down(&c->sb_write_mutex); - - /* Mark it as failed in the super block */ - - if (meta_off) { - allmi = cache_member_info_get(c); - mi = &allmi->m[ca->sb.nr_this_dev]; - SET_CACHE_STATE(mi, CACHE_FAILED); - /* Update cache_member cache in struct cache */ - ca->mi = *mi; - cache_member_info_put(); - } + /* + * Ok, really doing the remove: + * Drop device's prio pointer before removing it from superblock: + */ + bch_notify_cache_removed(ca); spin_lock(&c->journal.lock); - c->journal.prio_buckets[ca->sb.nr_this_dev] = 0; + c->journal.prio_buckets[dev] = 0; spin_unlock(&c->journal.lock); - /* write new prio pointers */ bch_journal_meta(&c->journal, &cl); closure_sync(&cl); - __bcache_write_super(c); /* ups sb_write_mutex */ - /* - * Now mark the slot as 0 in memory so that the slot can be reused. - * It won't actually be reused until btree_gc makes sure that there - * are no pointers to the device at all. 
+ * Stop device before removing it from the cache set's list of devices - + * and get our own ref on cache set since ca is going away: */ + closure_get(&c->cl); - if (meta_off) { - allmi = cache_member_info_get(c); - mi = &allmi->m[ca->sb.nr_this_dev]; - memset(&mi->uuid, 0, sizeof(mi->uuid)); - /* No need to copy to struct cache as we are removing */ - cache_member_info_put(); - } + mutex_lock(&bch_register_lock); + bch_cache_stop(ca); /* - * This completes asynchronously, with bch_cache_stop scheduling - * the final teardown when there are no (read) bios outstanding. + * RCU barrier between dropping between c->cache and dropping from + * member info: */ + synchronize_rcu(); - mutex_lock(&bch_register_lock); - bch_cache_stop(ca); - mutex_unlock(&bch_register_lock); + mi = cache_member_info_get(c)->m; + memset(&mi[dev].uuid, 0, sizeof(mi[dev].uuid)); + cache_member_info_put(); - bch_notify_cache_removed(ca); + bcache_write_super(c); + mutex_unlock(&bch_register_lock); - return; + closure_put(&c->cl); } bool bch_cache_remove(struct cache *ca, bool force) { - if (test_and_set_bit(CACHE_DEV_REMOVING, &ca->flags)) + mutex_lock(&bch_register_lock); + + if (test_bit(CACHE_DEV_REMOVING, &ca->flags)) return false; + if (!cache_may_remove(ca)) { + pr_err("Can't remove last device in tier %llu of %pU.", + CACHE_TIER(&ca->mi), ca->set->sb.set_uuid.b); + bch_notify_cache_remove_failed(ca); + return false; + } + + /* First, go RO before we try to migrate data off: */ + bch_cache_read_only(ca); + if (force) set_bit(CACHE_DEV_FORCE_REMOVE, &ca->flags); + set_bit(CACHE_DEV_REMOVING, &ca->flags); + bch_notify_cache_removing(ca); + + mutex_unlock(&bch_register_lock); + + /* Migrate the data and finish removal asynchronously: */ queue_work(system_long_wq, &ca->remove_work); return true; @@ -1901,6 +1834,7 @@ static const char *cache_alloc(struct bcache_superblock *sb, ca->self.devices[0] = ca; INIT_WORK(&ca->free_work, bch_cache_free_work); + INIT_WORK(&ca->read_only_work, 
bch_cache_read_only_work); INIT_WORK(&ca->remove_work, bch_cache_remove_work); bio_init(&ca->journal.bio); ca->journal.bio.bi_max_vecs = 8; @@ -2183,7 +2117,7 @@ have_slot: bch_notify_cache_added(ca); - err = bch_cache_read_write(ca); + err = __bch_cache_read_write(ca); if (err) goto err_put; diff --git a/drivers/md/bcache/super.h b/drivers/md/bcache/super.h index b13a17eada97..1e4f9d6d73c0 100644 --- a/drivers/md/bcache/super.h +++ b/drivers/md/bcache/super.h @@ -150,6 +150,8 @@ void __write_super(struct cache_set *, struct bcache_superblock *, const char *validate_super(struct bcache_superblock *, struct cache_sb *); +void bch_cache_member_info_update(struct cache *); + void bch_cache_set_fail(struct cache_set *); void bch_cache_set_release(struct kobject *); diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c index d45ef57b583e..059e76268fd5 100644 --- a/drivers/md/bcache/sysfs.c +++ b/drivers/md/bcache/sysfs.c @@ -1197,39 +1197,39 @@ STORE(__bch_cache) if (attr == &sysfs_state_rw) { char name[BDEVNAME_SIZE]; - const char *err; + const char *err = NULL; ssize_t v = bch_read_string_list(buf, bch_cache_state); if (v < 0) return v; - if (v == CACHE_STATE(mi)) + if (v == CACHE_STATE(&ca->mi)) return size; switch (v) { case CACHE_ACTIVE: err = bch_cache_read_write(ca); - if (err) { - pr_err("can't set %s read-write: %s", - bdevname(ca->disk_sb.bdev, name), err); - - return -EINVAL; - } - break; case CACHE_RO: bch_cache_read_only(ca); break; case CACHE_FAILED: - bch_cache_read_only(ca); - break; case CACHE_SPARE: - bch_cache_read_only(ca); - break; + /* + * XXX: need to migrate data off and set correct state + */ + pr_err("can't set %s %s: not supported", + bdevname(ca->disk_sb.bdev, name), + bch_cache_state[v]); + return -EINVAL; } - SET_CACHE_STATE(mi, v); - bcache_write_super(c); + if (err) { + pr_err("can't set %s %s: %s", + bdevname(ca->disk_sb.bdev, name), + bch_cache_state[v], err); + return -EINVAL; + } } if (attr == &sysfs_unregister) { |