author     Kent Overstreet <kent.overstreet@gmail.com>    2015-06-01 19:32:22 -0700
committer  Kent Overstreet <kent.overstreet@gmail.com>    2016-10-07 12:34:21 -0800
commit     b5ecdc495b4053c64f7bca90125fb5df20d7e0f1
tree       fdfd8cb1ec591c09923af1393e84e66e0bc6c619
parent     5749e6138348d7c1546e28b4ac0ae9032c94e0c0
bcache: Cache set RO/RW path improvements
Now we should be able to go RW -> RO and back again, which we need for bcachefs remounting.
-rw-r--r--  drivers/md/bcache/alloc.c    |  43
-rw-r--r--  drivers/md/bcache/alloc.h    |   1
-rw-r--r--  drivers/md/bcache/blockdev.c |   5
-rw-r--r--  drivers/md/bcache/gc.c       |  10
-rw-r--r--  drivers/md/bcache/gc.h       |   1
-rw-r--r--  drivers/md/bcache/io.c       |   1
-rw-r--r--  drivers/md/bcache/super.c    | 211
-rw-r--r--  drivers/md/bcache/super.h    |   3
-rw-r--r--  include/uapi/linux/bcache.h  |   2
9 files changed, 170 insertions(+), 107 deletions(-)
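
[Note: the following is an illustrative sketch, not part of the commit. It shows how the two cache-set entry points this patch exports in super.h, bch_cache_set_read_only() and bch_cache_set_read_write(), might be driven from a remount path. The helper name, the -EIO return and the pr_err() reporting are assumptions; bch_register_lock is taken here because both entry points lockdep-assert it.]

/*
 * Minimal sketch (assumed caller, not in this patch): drive the RW -> RO -> RW
 * transition the commit message describes. Only bch_cache_set_read_only(),
 * bch_cache_set_read_write() and bch_register_lock come from the tree itself.
 */
static int bch_remount_sketch(struct cache_set *c, bool to_read_only)
{
	const char *err = NULL;

	mutex_lock(&bch_register_lock);		/* both entry points assert this lock */

	if (to_read_only)
		/* stops tiering, copygc, gc and the allocators, then flushes the journal */
		bch_cache_set_read_only(c);
	else
		/* restarts the allocators first, then gc, copygc and tiering */
		err = bch_cache_set_read_write(c);

	mutex_unlock(&bch_register_lock);

	if (err) {
		pr_err("bcache: can't go read-write: %s", err);
		return -EIO;		/* error code chosen for the sketch */
	}

	return 0;
}
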
diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c
index 7b51888c5968..442a3575dc0b 100644
--- a/drivers/md/bcache/alloc.c
+++ b/drivers/md/bcache/alloc.c
@@ -1540,6 +1540,30 @@ void bch_cache_allocator_stop(struct cache *ca)
}
/*
+ * Startup the allocator thread for transition to RW mode:
+ */
+const char *bch_cache_allocator_start(struct cache *ca)
+{
+ struct cache_set *c = ca->set;
+ struct cache_group *tier = &c->cache_tiers[CACHE_TIER(&ca->mi)];
+ struct task_struct *k;
+
+ k = kthread_create(bch_allocator_thread, ca, "bcache_allocator");
+ if (IS_ERR(k))
+ return "error starting allocator thread";
+
+ ca->alloc_thread = k;
+ wake_up_process(k);
+
+ bch_cache_group_add_cache(tier, ca);
+ bch_cache_group_add_cache(&c->cache_all, ca);
+
+ bch_recalc_capacity(c);
+
+ return NULL;
+}
+
+/*
* bch_cache_allocator_start - fill freelists directly with completely unused
* buckets
*
@@ -1562,11 +1586,8 @@ void bch_cache_allocator_stop(struct cache *ca)
* should always be some of when this function is called, since the last time
* we shut down there should have been unused buckets stranded on freelists.
*/
-const char *bch_cache_allocator_start(struct cache *ca)
+const char *bch_cache_allocator_start_once(struct cache *ca)
{
- struct cache_set *c = ca->set;
- struct cache_group *tier = &c->cache_tiers[CACHE_TIER(&ca->mi)];
- struct task_struct *k;
struct bucket *g;
spin_lock(&ca->freelist_lock);
@@ -1588,19 +1609,7 @@ const char *bch_cache_allocator_start(struct cache *ca)
if (!fifo_full(&ca->free[RESERVE_PRIO]))
return "couldn't find enough available buckets to write prios";
- k = kthread_create(bch_allocator_thread, ca, "bcache_allocator");
- if (IS_ERR(k))
- return "error starting allocator thread";
-
- ca->alloc_thread = k;
- wake_up_process(k);
-
- bch_cache_group_add_cache(tier, ca);
- bch_cache_group_add_cache(&c->cache_all, ca);
-
- bch_recalc_capacity(c);
-
- return NULL;
+ return bch_cache_allocator_start(ca);
}
void bch_open_buckets_init(struct cache_set *c)
diff --git a/drivers/md/bcache/alloc.h b/drivers/md/bcache/alloc.h
index c0118db8440e..106eaa52b0e0 100644
--- a/drivers/md/bcache/alloc.h
+++ b/drivers/md/bcache/alloc.h
@@ -54,6 +54,7 @@ static inline void bch_wake_allocator(struct cache *ca)
void bch_cache_allocator_stop(struct cache *);
const char *bch_cache_allocator_start(struct cache *);
+const char *bch_cache_allocator_start_once(struct cache *);
void bch_open_buckets_init(struct cache_set *);
#endif /* _BCACHE_ALLOC_H */
diff --git a/drivers/md/bcache/blockdev.c b/drivers/md/bcache/blockdev.c
index 34efb73c8d6b..4faecf99805a 100644
--- a/drivers/md/bcache/blockdev.c
+++ b/drivers/md/bcache/blockdev.c
@@ -483,10 +483,7 @@ static void cached_dev_free(struct closure *cl)
{
struct cached_dev *dc = container_of(cl, struct cached_dev, disk.cl);
- cancel_delayed_work_sync(&dc->writeback_pd_update);
- if (!IS_ERR_OR_NULL(dc->writeback_thread))
- kthread_stop(dc->writeback_thread);
-
+ bch_cached_dev_writeback_stop(dc);
bch_cached_dev_writeback_free(dc);
mutex_lock(&bch_register_lock);
diff --git a/drivers/md/bcache/gc.c b/drivers/md/bcache/gc.c
index 8dbfa6ee52c4..bd3cdda4120f 100644
--- a/drivers/md/bcache/gc.c
+++ b/drivers/md/bcache/gc.c
@@ -660,8 +660,18 @@ static int bch_gc_thread(void *arg)
return 0;
}
+void bch_gc_thread_stop(struct cache_set *c)
+{
+ set_bit(CACHE_SET_GC_STOPPING, &c->flags);
+
+ if (!IS_ERR_OR_NULL(c->gc_thread))
+ kthread_stop(c->gc_thread);
+}
+
int bch_gc_thread_start(struct cache_set *c)
{
+ clear_bit(CACHE_SET_GC_STOPPING, &c->flags);
+
c->gc_thread = kthread_create(bch_gc_thread, c, "bcache_gc");
if (IS_ERR(c->gc_thread))
return PTR_ERR(c->gc_thread);
diff --git a/drivers/md/bcache/gc.h b/drivers/md/bcache/gc.h
index 81e50be44af5..b3d60bd2bfc9 100644
--- a/drivers/md/bcache/gc.h
+++ b/drivers/md/bcache/gc.h
@@ -7,6 +7,7 @@ static inline void set_gc_sectors(struct cache_set *c)
}
void bch_gc(struct cache_set *);
+void bch_gc_thread_stop(struct cache_set *);
int bch_gc_thread_start(struct cache_set *);
int bch_initial_gc(struct cache_set *, struct list_head *);
u8 bch_btree_key_recalc_oldest_gen(struct cache_set *, struct bkey_s_c);
diff --git a/drivers/md/bcache/io.c b/drivers/md/bcache/io.c
index c0d17ad94623..75fd276550bc 100644
--- a/drivers/md/bcache/io.c
+++ b/drivers/md/bcache/io.c
@@ -1016,7 +1016,6 @@ void bch_wake_delayed_writes(unsigned long data)
}
spin_unlock_irqrestore(&c->foreground_write_pd_lock, flags);
-
}
/**
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index 05c62ca25f74..f67aa72d18e0 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -638,24 +638,66 @@ void bch_check_mark_super_slowpath(struct cache_set *c, const struct bkey_i *k,
__bcache_write_super(c);
}
-/* Cache set */
+/* Cache set RO/RW: */
+
+/*
+ * For startup/shutdown of RW stuff, the dependencies are:
+ *
+ * - foreground writes depend on copygc and tiering (to free up space)
+ *
+ * - copygc and tiering depend on mark and sweep gc (they actually probably
+ * don't because they either reserve ahead of time or don't block if
+ * allocations fail, but allocations can require mark and sweep gc to run
+ * because of generation number wraparound)
+ *
+ * - all of the above depends on the allocator threads
+ *
+ * - allocator depends on the journal (when it rewrites prios and gens)
+ */
static void __bch_cache_read_only(struct cache *ca);
-static void bch_cache_set_read_only(struct cache_set *c)
+static void __bch_cache_set_read_only(struct cache_set *c)
{
- struct cached_dev *dc;
- struct bcache_device *d;
- struct radix_tree_iter iter;
struct closure cl;
- void **slot;
-
struct cache *ca;
unsigned i;
- lockdep_assert_held(&bch_register_lock);
closure_init_stack(&cl);
+ c->tiering_pd.rate.rate = UINT_MAX;
+ bch_ratelimit_reset(&c->tiering_pd.rate);
+ bch_tiering_read_stop(c);
+
+ for_each_cache(ca, c, i) {
+ bch_tiering_write_stop(ca);
+ bch_moving_gc_stop(ca);
+ }
+
+ bch_gc_thread_stop(c);
+
+ bch_btree_flush(c);
+
+ for_each_cache(ca, c, i)
+ bch_cache_allocator_stop(ca);
+
+ bch_journal_flush(&c->journal, &cl);
+ closure_sync(&cl);
+
+ cancel_delayed_work_sync(&c->journal.write_work);
+}
+
+static void bch_writes_disabled(struct percpu_ref *writes)
+{
+ struct cache_set *c = container_of(writes, struct cache_set, writes);
+
+ complete(&c->write_disable_complete);
+}
+
+void bch_cache_set_read_only(struct cache_set *c)
+{
+ lockdep_assert_held(&bch_register_lock);
+
if (test_and_set_bit(CACHE_SET_RO, &c->flags))
return;
@@ -664,7 +706,12 @@ static void bch_cache_set_read_only(struct cache_set *c)
/*
* Block new foreground-end write operations from starting - any new
* writes will return -EROFS:
+ *
+ * (This is really blocking new _allocations_, writes to previously
+ * allocated space can still happen until stopping the allocator in
+ * bch_cache_allocator_stop()).
*/
+ init_completion(&c->write_disable_complete);
percpu_ref_kill(&c->writes);
bch_wake_delayed_writes((unsigned long) c);
@@ -674,39 +721,78 @@ static void bch_cache_set_read_only(struct cache_set *c)
/* Wait for outstanding writes to complete: */
wait_for_completion(&c->write_disable_complete);
- radix_tree_for_each_slot(slot, &c->devices, &iter, 0) {
- d = rcu_dereference_protected(*slot,
- lockdep_is_held(&bch_register_lock));
+ __bch_cache_set_read_only(c);
- if (!INODE_FLASH_ONLY(&d->inode.v)) {
- dc = container_of(d, struct cached_dev, disk);
- bch_cached_dev_writeback_stop(dc);
+ bch_notify_cache_set_read_only(c);
+ trace_bcache_cache_set_read_only_done(c);
+}
+
+static const char *__bch_cache_set_read_write(struct cache_set *c)
+{
+ struct cache *ca;
+ const char *err;
+ unsigned i;
+
+ err = "error starting btree GC thread";
+ if (bch_gc_thread_start(c))
+ goto err;
+
+ for_each_cache(ca, c, i) {
+ if (CACHE_STATE(&ca->mi) != CACHE_ACTIVE)
+ continue;
+
+ err = "error starting moving GC thread";
+ if (bch_moving_gc_thread_start(ca)) {
+ percpu_ref_put(&ca->ref);
+ goto err;
}
+
+ err = "error starting tiering write workqueue";
+ if (bch_tiering_write_start(ca))
+ return err;
}
- c->tiering_pd.rate.rate = UINT_MAX;
- bch_ratelimit_reset(&c->tiering_pd.rate);
- bch_tiering_read_stop(c);
+ err = "error starting tiering thread";
+ if (bch_tiering_read_start(c))
+ goto err;
+
+ schedule_delayed_work(&c->pd_controllers_update, 5 * HZ);
+
+ return NULL;
+err:
+ __bch_cache_set_read_only(c);
+ return err;
+}
- set_bit(CACHE_SET_GC_STOPPING, &c->flags);
+const char *bch_cache_set_read_write(struct cache_set *c)
+{
+ struct cache *ca;
+ const char *err;
+ unsigned i;
- if (!IS_ERR_OR_NULL(c->gc_thread))
- kthread_stop(c->gc_thread);
+ lockdep_assert_held(&bch_register_lock);
- /* Should skip this if we're unregistering because of an error */
- bch_btree_flush(c);
+ if (!test_bit(CACHE_SET_RO, &c->flags))
+ return NULL;
for_each_cache(ca, c, i)
- __bch_cache_read_only(ca);
-
- bch_journal_flush(&c->journal, &cl);
- closure_sync(&cl);
+ if (CACHE_STATE(&ca->mi) == CACHE_ACTIVE &&
+ (err = bch_cache_allocator_start(ca))) {
+ percpu_ref_put(&ca->ref);
+ goto err;
+ }
- cancel_delayed_work_sync(&c->journal.write_work);
+ err = __bch_cache_set_read_write(c);
+ if (err)
+ return err;
- bch_notify_cache_set_read_only(c);
+ percpu_ref_reinit(&c->writes);
+ clear_bit(CACHE_SET_RO, &c->flags);
- trace_bcache_cache_set_read_only_done(c);
+ return NULL;
+err:
+ __bch_cache_set_read_only(c);
+ return err;
}
static void bch_cache_set_read_only_work(struct work_struct *work)
@@ -719,6 +805,8 @@ static void bch_cache_set_read_only_work(struct work_struct *work)
mutex_unlock(&bch_register_lock);
}
+/* Cache set startup/shutdown: */
+
void bch_cache_set_fail(struct cache_set *c)
{
switch (CACHE_ERROR_ACTION(&c->sb)) {
@@ -884,13 +972,6 @@ static unsigned cache_set_nr_online_devices(struct cache_set *c)
return nr;
}
-static void bch_writes_disabled(struct percpu_ref *writes)
-{
- struct cache_set *c = container_of(writes, struct cache_set, writes);
-
- complete(&c->write_disable_complete);
-}
-
#define alloc_bucket_pages(gfp, ca) \
((void *) __get_free_pages(__GFP_ZERO|gfp, ilog2(bucket_pages(ca))))
@@ -904,9 +985,6 @@ static struct cache_set *bch_cache_set_alloc(struct cache_sb *sb)
if (!c)
return NULL;
- if (percpu_ref_init(&c->writes, bch_writes_disabled, 0, GFP_KERNEL))
- goto err_free;
-
__module_get(THIS_MODULE);
closure_init(&c->cl, NULL);
set_closure_fn(&c->cl, cache_set_free, system_wq);
@@ -935,7 +1013,6 @@ static struct cache_set *bch_cache_set_alloc(struct cache_sb *sb)
mutex_init(&c->btree_cache_lock);
mutex_init(&c->bucket_lock);
spin_lock_init(&c->btree_root_lock);
- init_completion(&c->write_disable_complete);
INIT_WORK(&c->read_only_work, bch_cache_set_read_only_work);
init_rwsem(&c->gc_lock);
@@ -994,6 +1071,7 @@ static struct cache_set *bch_cache_set_alloc(struct cache_sb *sb)
goto err;
if (!(c->wq = alloc_workqueue("bcache", WQ_MEM_RECLAIM, 0)) ||
+ percpu_ref_init(&c->writes, bch_writes_disabled, 0, GFP_KERNEL) ||
mempool_init_slab_pool(&c->search, 1, bch_search_cache) ||
mempool_init_kmalloc_pool(&c->btree_reserve_pool, 1,
BTREE_RESERVE_SIZE) ||
@@ -1031,10 +1109,6 @@ static struct cache_set *bch_cache_set_alloc(struct cache_sb *sb)
err:
bch_cache_set_stop(c);
return NULL;
-
-err_free:
- kfree(c);
- return NULL;
}
static int bch_cache_set_online(struct cache_set *c)
@@ -1069,8 +1143,6 @@ static int bch_cache_set_online(struct cache_set *c)
return 0;
}
-static const char *__bch_cache_read_write(struct cache *ca);
-
static const char *run_cache_set(struct cache_set *c)
{
const char *err = "cannot allocate memory";
@@ -1170,7 +1242,7 @@ static const char *run_cache_set(struct cache_set *c)
for_each_cache(ca, c, i)
if (CACHE_STATE(&ca->mi) == CACHE_ACTIVE &&
- (err = __bch_cache_read_write(ca))) {
+ (err = bch_cache_allocator_start_once(ca))) {
percpu_ref_put(&ca->ref);
goto err;
}
@@ -1204,7 +1276,7 @@ static const char *run_cache_set(struct cache_set *c)
for_each_cache(ca, c, i)
if (CACHE_STATE(&ca->mi) == CACHE_ACTIVE &&
- (err = __bch_cache_read_write(ca))) {
+ (err = bch_cache_allocator_start_once(ca))) {
percpu_ref_put(&ca->ref);
goto err;
}
@@ -1241,29 +1313,12 @@ static const char *run_cache_set(struct cache_set *c)
bch_prio_timer_start(c, READ);
bch_prio_timer_start(c, WRITE);
- err = "error starting btree GC thread";
- if (bch_gc_thread_start(c))
- goto err;
-
- for_each_cache(ca, c, i) {
- if (CACHE_STATE(&ca->mi) != CACHE_ACTIVE)
- continue;
-
- err = "error starting moving GC thread";
- if (bch_moving_gc_thread_start(ca)) {
- percpu_ref_put(&ca->ref);
- goto err;
- }
- }
+ closure_sync(&cl);
- err = "error starting tiering thread";
- if (bch_tiering_read_start(c))
+ err = __bch_cache_set_read_write(c);
+ if (err)
goto err;
- schedule_delayed_work(&c->pd_controllers_update, 5 * HZ);
-
- closure_sync(&cl);
-
now = get_seconds();
mi = cache_member_info_get(c);
for_each_cache_rcu(ca, c, i)
@@ -1345,8 +1400,8 @@ static void __bch_cache_read_only(struct cache *ca)
{
trace_bcache_cache_read_only(ca);
- bch_moving_gc_stop(ca);
bch_tiering_write_stop(ca);
+ bch_moving_gc_stop(ca);
/*
* This stops new data writes (e.g. to existing open data
@@ -1435,7 +1490,9 @@ void bch_cache_read_only(struct cache *ca)
return;
}
-static const char *__bch_cache_read_write(struct cache *ca)
+/* This does not write the super-block, should it? */
+
+const char *bch_cache_read_write(struct cache *ca)
{
const char *err;
@@ -1452,16 +1509,6 @@ static const char *__bch_cache_read_write(struct cache *ca)
trace_bcache_cache_read_write_done(ca);
return NULL;
-}
-
-/* This does not write the super-block, should it? */
-
-const char *bch_cache_read_write(struct cache *ca)
-{
- const char *err = __bch_cache_read_write(ca);
-
- if (err != NULL)
- return err;
err = "error starting moving GC thread";
if (!bch_moving_gc_thread_start(ca))
@@ -1491,7 +1538,6 @@ static void bch_cache_free_work(struct work_struct *work)
{
struct cache *ca = container_of(work, struct cache, free_work);
struct cache_set *c = ca->set;
- char buf[BDEVNAME_SIZE];
unsigned i;
/*
@@ -1537,9 +1583,6 @@ static void bch_cache_free_work(struct work_struct *work)
for (i = 0; i < RESERVE_NR; i++)
free_fifo(&ca->free[i]);
- if (ca->disk_sb.bdev)
- pr_notice("%s removed", bdevname(ca->disk_sb.bdev, buf));
-
free_super(&ca->disk_sb);
if (ca->kobj.state_in_sysfs)
diff --git a/drivers/md/bcache/super.h b/drivers/md/bcache/super.h
index 8bf531e86ee8..b13a17eada97 100644
--- a/drivers/md/bcache/super.h
+++ b/drivers/md/bcache/super.h
@@ -162,6 +162,9 @@ const char *bch_register_one(const char *path);
const char *bch_register_cache_set(char * const *, unsigned,
struct cache_set **);
+void bch_cache_set_read_only(struct cache_set *);
+const char *bch_cache_set_read_write(struct cache_set *);
+
void bch_cache_read_only(struct cache *);
const char *bch_cache_read_write(struct cache *);
bool bch_cache_remove(struct cache *, bool force);
diff --git a/include/uapi/linux/bcache.h b/include/uapi/linux/bcache.h
index a5ab2935c146..b1ac5b9aa794 100644
--- a/include/uapi/linux/bcache.h
+++ b/include/uapi/linux/bcache.h
@@ -105,7 +105,7 @@ struct bkey {
/* Size of combined key and value, in u64s */
__u8 u64s;
- /* Format of key (0 for format local to btree node */
+ /* Format of key (0 for format local to btree node) */
__u8 format;
/* Type of the value */