author    Kent Overstreet <kent.overstreet@linux.dev>	2023-12-20 00:08:11 -0500
committer Kent Overstreet <kent.overstreet@linux.dev>	2023-12-23 23:40:46 -0500
commit    339c51397e9f6b3c377e8559a81ca74e9bbfd493 (patch)
tree      dfa7a6d1b607be5102e0d792ca72e5d129ad3bcd
parent    5be20aa87df1bc2723f5f41e37c6c4aa4e0a6093 (diff)
bcachefs: Zone support for journal code
For zoned devices, we need to ensure we close buckets we're no longer writing to, since some (flash) devices have a limit on the number of active zones. Also, on startup, if we're going to continue appending to a partially-written bucket, we need to query the zone's write pointer.

This patch updates the journal code so that:

- On startup, we query the write pointer for the bucket cur_idx points to
- On startup, we ensure all journal buckets except the one cur_idx points to are closed
- In the journal write path, we factor out journal_close_buckets(), which now increments cur_idx when a bucket fills up so that we can start allocating from the next one - it also issues the appropriate zone command to close the previous bucket

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
-rw-r--r--	fs/bcachefs/journal_io.c	106
1 file changed, 72 insertions, 34 deletions
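Neither bch2_zone_report() nor the other zone helpers used below are defined in this file; presumably they are added elsewhere in the series. As a rough sketch only, bch2_zone_report() might wrap the kernel's blkdev_report_zones(), normalizing the write pointer to a zone-relative offset (an assumption, based on the caller comparing it directly against zone.capacity):

#include <linux/blkdev.h>

/*
 * Hypothetical sketch of bch2_zone_report(), not part of this patch:
 * report the zone containing @sector, with wp rewritten as a
 * zone-relative sector offset.
 */
static int bch2_zone_report_cb(struct blk_zone *zone, unsigned int idx,
			       void *data)
{
	memcpy(data, zone, sizeof(*zone));
	return 0;
}

int bch2_zone_report(struct block_device *bdev, sector_t sector,
		     struct blk_zone *zone)
{
	int ret;

	if (!bdev_is_zoned(bdev))
		return -EOPNOTSUPP;

	ret = blkdev_report_zones(bdev, sector, 1, bch2_zone_report_cb, zone);
	if (ret < 0)
		return ret;
	if (!ret)
		return -ENOENT;

	/* report wp relative to the start of the zone: */
	if (zone->type != BLK_ZONE_TYPE_CONVENTIONAL)
		zone->wp -= zone->start;
	return 0;
}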
diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c
index b0ce77655596..824b91e9bf3d 100644
--- a/fs/bcachefs/journal_io.c
+++ b/fs/bcachefs/journal_io.c
@@ -1110,6 +1110,38 @@ found:
BUG_ON(!wrote);
ja->sectors_free = cur_bucket_capacity - min(wrote, cur_bucket_capacity);
+ if (ca->zoned) {
+ struct blk_zone zone;
+ unsigned i;
+
+ if (!bch2_zone_report(ca->disk_sb.bdev,
+ bucket_to_sector(ca, ja->buckets[ja->cur_idx]),
+ &zone) &&
+ zone.type != BLK_ZONE_TYPE_CONVENTIONAL &&
+ blk_zone_writeable(zone)) {
+ if (bch2_fs_inconsistent_on(zone.capacity - zone.wp > ja->sectors_free, c,
+ "device claims %llu sectors written to current journal bucket but found %u",
+ zone.wp, ca->mi.bucket_size - ja->sectors_free)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ ja->sectors_free = zone.capacity - zone.wp;
+ }
+
+ /* Make sure all other nonempty journal buckets are closed: */
+ for (i = 0; i < ja->nr; i++) {
+ if (i == ja->cur_idx)
+ continue;
+
+ if (!bch2_zone_report(ca->disk_sb.bdev,
+ bucket_to_sector(ca, ja->buckets[i]),
+ &zone) &&
+ zone.cond != BLK_ZONE_COND_EMPTY)
+ bch2_bucket_finish(ca, ja->buckets[i]);
+ }
+ }
+
/*
* Set dirty_idx to indicate the entire journal is full and needs to be
* reclaimed - journal reclaim will immediately reclaim whatever isn't
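bch2_bucket_finish(), used above and again in journal_close_buckets() below, is likewise not shown in this patch. A minimal sketch, assuming it issues a zone-finish command via blkdev_zone_mgmt() (the five-argument form with a GFP mask, as in kernels of this era) and that journal buckets are zone-sized:

/*
 * Hypothetical sketch of bch2_bucket_finish(), not part of this patch:
 * transition the zone backing @bucket to the FINISHED state so it no
 * longer counts against the device's active-zone limit.  Assumes bucket
 * size == zone size.
 */
static int bch2_bucket_finish(struct bch_dev *ca, u64 bucket)
{
	struct block_device *bdev = ca->disk_sb.bdev;

	if (!bdev_is_zoned(bdev))
		return 0;

	return blkdev_zone_mgmt(bdev, REQ_OP_ZONE_FINISH,
				bucket_to_sector(ca, bucket),
				ca->mi.bucket_size, GFP_KERNEL);
}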
@@ -1374,6 +1406,34 @@ fsck_err:
/* journal write: */
+static void journal_close_buckets(struct journal *j, unsigned sectors)
+{
+ struct bch_fs *c = container_of(j, struct bch_fs, journal);
+
+ for_each_rw_member(c, ca) {
+ struct journal_device *ja = &ca->journal;
+
+ if (sectors > ja->sectors_free &&
+ sectors <= ca->mi.bucket_size &&
+ bch2_journal_dev_buckets_available(j, ja,
+ journal_space_discarded)) {
+ spin_unlock(&j->lock);
+ bch2_bucket_finish(ca, ja->buckets[ja->cur_idx]);
+ spin_lock(&j->lock);
+
+ ja->cur_idx = (ja->cur_idx + 1) % ja->nr;
+ ja->sectors_free = bucket_capacity(ca, ja->buckets[ja->cur_idx]);
+
+ /*
+ * ja->bucket_seq[ja->cur_idx] must always have
+ * something sensible:
+ */
+ ja->bucket_seq[ja->cur_idx] = journal_last_unwritten_seq(j);
+ }
+ }
+}
+
static void __journal_write_alloc(struct journal *j,
struct journal_buf *w,
struct dev_alloc_list *devs_sorted,
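bucket_capacity(), used when advancing cur_idx in journal_close_buckets() above and again in journal_write_alloc() below, is also assumed to come from elsewhere in the series. Because its callers hold j->lock, it can't issue a blocking zone report; the sketch below assumes the zone capacity was cached at device-open time in a hypothetical ca->zone_capacity field (ca->zoned is likewise assumed to be set there):

/*
 * Hypothetical sketch of bucket_capacity(), not part of this patch:
 * usable sectors in a journal bucket.  ca->zoned and ca->zone_capacity
 * are assumed fields filled in at device open; on ZNS devices the zone
 * capacity may be smaller than the zone (bucket) size.
 */
static unsigned bucket_capacity(struct bch_dev *ca, u64 bucket)
{
	if (ca->zoned && ca->zone_capacity)
		return min_t(unsigned, ca->zone_capacity, ca->mi.bucket_size);

	return ca->mi.bucket_size;
}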
@@ -1441,59 +1501,37 @@ static int journal_write_alloc(struct journal *j, struct journal_buf *w)
{
struct bch_fs *c = container_of(j, struct bch_fs, journal);
struct bch_devs_mask devs;
- struct journal_device *ja;
- struct bch_dev *ca;
struct dev_alloc_list devs_sorted;
unsigned sectors = vstruct_sectors(w->data, c->block_bits);
unsigned target = c->opts.metadata_target ?:
c->opts.foreground_target;
- unsigned i, replicas = 0, replicas_want =
+ unsigned replicas = 0, replicas_want =
READ_ONCE(c->opts.metadata_replicas);
+ bool did_close = false;
- rcu_read_lock();
retry:
+ rcu_read_lock();
devs = target_rw_devs(c, BCH_DATA_journal, target);
-
devs_sorted = bch2_dev_alloc_list(c, &j->wp.stripe, &devs);
__journal_write_alloc(j, w, &devs_sorted,
sectors, &replicas, replicas_want);
+ rcu_read_unlock();
- if (replicas >= replicas_want)
- goto done;
-
- for (i = 0; i < devs_sorted.nr; i++) {
- ca = rcu_dereference(c->devs[devs_sorted.devs[i]]);
- if (!ca)
- continue;
-
- ja = &ca->journal;
-
- if (sectors > ja->sectors_free &&
- sectors <= ca->mi.bucket_size &&
- bch2_journal_dev_buckets_available(j, ja,
- journal_space_discarded)) {
- ja->cur_idx = (ja->cur_idx + 1) % ja->nr;
- ja->sectors_free = bucket_capacity(ca, ja->buckets[ja->cur_idx]);
-
- /*
- * ja->bucket_seq[ja->cur_idx] must always have
- * something sensible:
- */
- ja->bucket_seq[ja->cur_idx] = le64_to_cpu(w->data->seq);
- }
+ if (replicas < replicas_want && !did_close) {
+ journal_close_buckets(j, sectors);
+ did_close = true;
+ goto retry;
}
- __journal_write_alloc(j, w, &devs_sorted,
- sectors, &replicas, replicas_want);
-
if (replicas < replicas_want && target) {
- /* Retry from all devices: */
+ /*
+ * Retry from all devices
+ * XXX: this should be configurable
+ */
target = 0;
goto retry;
}
-done:
- rcu_read_unlock();
BUG_ON(bkey_val_u64s(&w->key.k) > BCH_REPLICAS_MAX);