author    Kent Overstreet <kent.overstreet@gmail.com>  2015-03-23 17:33:15 -0700
committer Kent Overstreet <kent.overstreet@gmail.com>  2016-10-07 12:33:35 -0800
commit    f2ca090c3c6a27ac7d3c36d83ec38ec0b12edb11 (patch)
tree      79e57f298f2b17ee7d81a2ced5bcd392f73e7efb
parent    3d70ad1e73cf671ef0d23bedce76f7d1ef68f13e (diff)
bcache: Don't discard journal entries that are marked as open
The journalling code was carefully keeping track of which journal entries were
no longer needed and could be discarded... but it wasn't waiting until they
were marked as not needed _on disk_ before discarding them.

This wasn't an issue without discards, because we'd only overwrite journal
entries with another journal entry that had the new last_seq - but with
discards enabled there was effectively a race.

Fix this by keeping track in memory of what last_seq is on disk, and never
discarding past that.

That in turn creates a potential deadlock: we have to be able to write a
journal entry in order to free up space in the journal, so we can't ever let
the journal fill up completely - so fix that too.

Signed-off-by: Kent Overstreet <kent.overstreet@gmail.com>
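To make the on-disk gating concrete, here is a minimal standalone C sketch of
the invariant the patch enforces - not bcache's actual code; journal_sim,
bucket_seq[], write_done() and do_discards() are all invented names for
illustration. The rule: a journal bucket may be discarded only once a journal
entry whose last_seq supersedes it has actually completed on disk, i.e.
discards are bounded by last_seq_ondisk, not by the in-memory last_seq.

/*
 * Minimal sketch, assuming a ring of NR_BUCKETS journal buckets; all
 * names here are invented, not bcache's.
 */
#include <stdint.h>
#include <stdio.h>

#define NR_BUCKETS 8

struct journal_sim {
	uint64_t bucket_seq[NR_BUCKETS]; /* newest seq written to each bucket */
	unsigned discard_idx;            /* next bucket to discard */
	unsigned last_idx;               /* oldest bucket still pinned */
	uint64_t last_seq_ondisk;        /* last_seq of newest entry on disk */
};

/* Called from (simulated) journal write completion: */
static void write_done(struct journal_sim *j, uint64_t last_seq)
{
	j->last_seq_ondisk = last_seq;
}

/* Discard only buckets that are dead according to the *on disk* last_seq: */
static void do_discards(struct journal_sim *j)
{
	while (j->discard_idx != j->last_idx &&
	       j->bucket_seq[j->discard_idx] < j->last_seq_ondisk) {
		printf("discarding bucket %u (seq %llu < last_seq_ondisk %llu)\n",
		       j->discard_idx,
		       (unsigned long long) j->bucket_seq[j->discard_idx],
		       (unsigned long long) j->last_seq_ondisk);
		j->discard_idx = (j->discard_idx + 1) % NR_BUCKETS;
	}
}

int main(void)
{
	struct journal_sim j = {
		.bucket_seq  = { 10, 20, 30, 40 },
		.discard_idx = 0,
		.last_idx    = 3,
	};

	do_discards(&j);    /* nothing happens: last_seq_ondisk is still 0 */

	write_done(&j, 25); /* an entry carrying last_seq 25 hit the disk */
	do_discards(&j);    /* now the buckets holding seq 10 and 20 can go */
	return 0;
}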
-rw-r--r--  drivers/md/bcache/journal.c       | 47
-rw-r--r--  drivers/md/bcache/journal_types.h |  3
2 files changed, 32 insertions(+), 18 deletions(-)
diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c
index d20f2d2a143f..cce103a8c59f 100644
--- a/drivers/md/bcache/journal.c
+++ b/drivers/md/bcache/journal.c
@@ -472,6 +472,8 @@ const char *bch_journal_read(struct cache_set *c, struct list_head *list)
c->journal.pin.back = j->seq - j->last_seq + 1;
c->journal.seq = j->seq;
+ c->journal.last_seq_ondisk = j->last_seq;
+
BUG_ON(last_seq(&c->journal) != j->last_seq);
i = list_first_entry(list, struct journal_replay, list);
@@ -664,7 +666,7 @@ static void journal_discard_work(struct work_struct *work)
submit_bio(&ja->discard_bio);
}
-static bool do_journal_discard(struct cache *ca)
+static void do_journal_discard(struct cache *ca)
{
struct journal_device *ja = &ca->journal;
struct bio *bio = &ja->discard_bio;
@@ -672,12 +674,12 @@ static bool do_journal_discard(struct cache *ca)
if (!CACHE_DISCARD(&ca->mi) ||
!blk_queue_discard(bdev_get_queue(ca->disk_sb.bdev))) {
ja->discard_idx = ja->last_idx;
- return true;
+ return;
}
switch (atomic_read(&ja->discard_in_flight)) {
case DISCARD_IN_FLIGHT:
- return false;
+ break;
case DISCARD_DONE:
ja->discard_idx = (ja->discard_idx + 1) %
@@ -688,7 +690,7 @@ static bool do_journal_discard(struct cache *ca)
case DISCARD_READY:
if (ja->discard_idx == ja->last_idx)
- return true;
+ break;
atomic_set(&ja->discard_in_flight, DISCARD_IN_FLIGHT);
@@ -707,11 +709,9 @@ static bool do_journal_discard(struct cache *ca)
INIT_WORK(&ja->discard_work, journal_discard_work);
schedule_work(&ja->discard_work);
- return false;
-
+ break;
default:
BUG();
- return true;
}
}
@@ -833,11 +833,9 @@ static void journal_reclaim_work(struct work_struct *work)
*
* Called from IO submission context, does not block. Cleans up after btree
* write completions by advancing the journal pin and each cache's last_idx,
- * and kicking off discards.
- *
- * Returns true if background reclaim needs to run.
+ * kicking off discards and background reclaim as necessary.
*/
-static bool journal_reclaim_fast(struct cache_set *c)
+static void journal_reclaim_fast(struct cache_set *c)
{
struct cache *ca;
u64 last_seq;
@@ -858,7 +856,7 @@ static bool journal_reclaim_fast(struct cache_set *c)
if (fifo_free(&c->journal.pin) <= (c->journal.pin.size >> 6))
need_reclaim = true;
- last_seq = last_seq(&c->journal);
+ last_seq = c->journal.last_seq_ondisk;
rcu_read_lock();
@@ -886,7 +884,8 @@ static bool journal_reclaim_fast(struct cache_set *c)
rcu_read_unlock();
- return need_reclaim;
+ if (need_reclaim)
+ queue_work(system_long_wq, &c->journal.reclaim_work);
}
/**
@@ -909,8 +908,7 @@ static void journal_next_bucket(struct cache_set *c)
BUG_ON(c->journal.u64s_remaining);
BUG_ON(c->journal.cur->data->u64s);
- if (journal_reclaim_fast(c))
- queue_work(system_long_wq, &c->journal.reclaim_work);
+ journal_reclaim_fast(c);
rcu_read_lock();
@@ -930,7 +928,7 @@ static void journal_next_bucket(struct cache_set *c)
*/
group_for_each_cache_rcu(ca, &c->cache_tiers[0], iter) {
struct journal_device *ja = &ca->journal;
- unsigned next = (ja->cur_idx + 1) %
+ unsigned next, remaining, nr_buckets =
bch_nr_journal_buckets(&ca->sb);
if (bch_extent_ptrs(e) == CACHE_SET_META_REPLICAS_WANT(&c->sb))
@@ -944,8 +942,18 @@ static void journal_next_bucket(struct cache_set *c)
ca->sb.nr_this_dev))
continue;
- /* No journal buckets available for writing on this device */
- if (next == ja->discard_idx)
+ next = (ja->cur_idx + 1) % nr_buckets;
+ remaining = (ja->discard_idx + nr_buckets - next) % nr_buckets;
+
+ if (!remaining)
+ continue;
+
+ /*
+ * Don't use the last bucket unless writing the new last_seq
+ * will make another bucket available:
+ */
+ if (remaining == 1 &&
+ ja->seq[(next + 1) % nr_buckets] >= last_seq(&c->journal))
continue;
BUG_ON(bch_extent_ptrs(e) >= BKEY_EXTENT_PTRS_MAX);
@@ -1018,6 +1026,8 @@ static void journal_write_done(struct closure *cl)
? &j->w[1]
: &j->w[0];
+ j->last_seq_ondisk = w->data->last_seq;
+
__closure_wake_up(&w->wait);
atomic_set(&j->in_flight, 0);
@@ -1294,6 +1304,7 @@ static bool __journal_res_get(struct cache_set *c, struct journal_res *res,
/* Still no room, we have to wait */
spin_unlock(&c->journal.lock);
trace_bcache_journal_full(c);
+ queue_work(system_long_wq, &c->journal.reclaim_work);
return false;
}
}
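The new bucket-selection logic in journal_next_bucket() is easier to follow
outside the diff. Below is a hedged standalone sketch of just the ring
arithmetic - ja_sim and can_use_next_bucket() are invented names, not the
kernel's - showing why the last free bucket is only taken when writing the new
last_seq will make another bucket reclaimable; otherwise no journal write
could ever free space, and the journal would deadlock against itself.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct ja_sim {
	unsigned nr_buckets;
	unsigned cur_idx;     /* bucket currently being written */
	unsigned discard_idx; /* first bucket not yet discarded/free */
	uint64_t *seq;        /* newest journal seq in each bucket */
};

static bool can_use_next_bucket(struct ja_sim *ja, uint64_t new_last_seq)
{
	unsigned next = (ja->cur_idx + 1) % ja->nr_buckets;
	unsigned remaining = (ja->discard_idx + ja->nr_buckets - next) %
		ja->nr_buckets;

	if (!remaining)         /* no free buckets at all */
		return false;

	/*
	 * Last free bucket: only take it if writing new_last_seq will
	 * let the bucket after it be reclaimed:
	 */
	if (remaining == 1 &&
	    ja->seq[(next + 1) % ja->nr_buckets] >= new_last_seq)
		return false;

	return true;
}

int main(void)
{
	uint64_t seq[4] = { 10, 20, 30, 40 };
	struct ja_sim ja = {
		.nr_buckets = 4, .cur_idx = 2, .discard_idx = 0, .seq = seq,
	};

	/* next == 3, remaining == 1, the bucket after next holds seq 10: */
	printf("last_seq 5:  %d\n", can_use_next_bucket(&ja, 5));  /* 0 */
	printf("last_seq 15: %d\n", can_use_next_bucket(&ja, 15)); /* 1 */
	return 0;
}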
diff --git a/drivers/md/bcache/journal_types.h b/drivers/md/bcache/journal_types.h
index e7a425d4e1fe..971af1e043ca 100644
--- a/drivers/md/bcache/journal_types.h
+++ b/drivers/md/bcache/journal_types.h
@@ -44,6 +44,9 @@ struct journal {
/* Sequence number of most recent journal entry (last entry in @pin) */
u64 seq;
+ /* last_seq from the most recent journal entry written */
+ u64 last_seq_ondisk;
+
/*
* FIFO of journal entries whose btree updates have not yet been
* written out.
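Putting the pieces together: the new field sits between the in-memory sequence
numbers, written by journal_write_done() and consumed by the discard path. A
small illustrative assertion helper - not part of the patch, with names that
only loosely mirror the real fields - capturing the resulting ordering
invariant:

/*
 * The on-disk view can only lag the in-memory one:
 *
 *	last_seq_ondisk <= last_seq <= seq
 *
 * Discards are bounded by last_seq_ondisk; reclaim of in-memory pins
 * may go all the way up to last_seq.
 */
#include <assert.h>
#include <stdint.h>

struct journal_view {
	uint64_t seq;             /* newest entry, in memory */
	uint64_t last_seq;        /* oldest entry still needed, in memory */
	uint64_t last_seq_ondisk; /* last_seq of the newest entry ON DISK */
};

static void check_journal_invariants(const struct journal_view *j)
{
	assert(j->last_seq_ondisk <= j->last_seq);
	assert(j->last_seq <= j->seq);
}

int main(void)
{
	struct journal_view j = {
		.seq = 40, .last_seq = 25, .last_seq_ondisk = 20,
	};
	check_journal_invariants(&j);
	return 0;
}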