summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorKent Overstreet <kent.overstreet@linux.dev>2023-12-20 00:04:57 -0500
committerKent Overstreet <kent.overstreet@linux.dev>2023-12-23 23:40:46 -0500
commit1f383e88038c2038fd972e9da70190c602dd6840 (patch)
tree73a7fbeeb69b0c1e5d82e5e51c93ef5ba25e74c8
parentbe12f5651d3f5c1d1f256ccb2a8731adbbd0d1d8 (diff)
bcachefs: bucket_capacity()
On zoned devices, zone capacity is variable. This patch implements a new data structure (eytzinger search tree) for getting a bucket's capacity.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
-rw-r--r--fs/bcachefs/alloc_foreground.c6
-rw-r--r--fs/bcachefs/alloc_types.h1
-rw-r--r--fs/bcachefs/bcachefs.h15
-rw-r--r--fs/bcachefs/buckets.c9
-rw-r--r--fs/bcachefs/buckets.h8
-rw-r--r--fs/bcachefs/ec.c5
-rw-r--r--fs/bcachefs/extents.c14
-rw-r--r--fs/bcachefs/journal.c3
-rw-r--r--fs/bcachefs/journal_io.c53
-rw-r--r--fs/bcachefs/super.c17
-rw-r--r--fs/bcachefs/zone.c90
-rw-r--r--fs/bcachefs/zone.h40
12 files changed, 183 insertions, 78 deletions
diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c
index a1bfa9eaee42..64100f60af35 100644
--- a/fs/bcachefs/alloc_foreground.c
+++ b/fs/bcachefs/alloc_foreground.c
@@ -30,6 +30,7 @@
#include "movinggc.h"
#include "nocow_locking.h"
#include "trace.h"
+#include "zone.h"
#include <linux/math64.h>
#include <linux/rculist.h>
@@ -254,7 +255,8 @@ static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev *
spin_lock(&ob->lock);
ob->valid = true;
- ob->sectors_free = ca->mi.bucket_size;
+ ob->bucket_size = bucket_capacity(ca, bucket);
+ ob->sectors_free = ob->bucket_size;
ob->dev = ca->dev_idx;
ob->gen = a->gen;
ob->bucket = bucket;
@@ -1497,7 +1499,7 @@ struct bch_extent_ptr bch2_ob_ptr(struct bch_fs *c, struct open_bucket *ob)
.gen = ob->gen,
.dev = ob->dev,
.offset = bucket_to_sector(ca, ob->bucket) +
- ca->mi.bucket_size -
+ ob->bucket_size -
ob->sectors_free,
};
}
diff --git a/fs/bcachefs/alloc_types.h b/fs/bcachefs/alloc_types.h
index b91b7a461056..57c4ff50926c 100644
--- a/fs/bcachefs/alloc_types.h
+++ b/fs/bcachefs/alloc_types.h
@@ -61,6 +61,7 @@ struct open_bucket {
u8 dev;
u8 gen;
+ u32 bucket_size;
u32 sectors_free;
u64 bucket;
struct ec_stripe_new *ec;
diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h
index afa86dd7ff1a..f44346c90f76 100644
--- a/fs/bcachefs/bcachefs.h
+++ b/fs/bcachefs/bcachefs.h
@@ -530,6 +530,15 @@ struct io_count {
u64 sectors[2][BCH_DATA_NR];
};
+/*
+ * One run of consecutive buckets that all have the same usable capacity.
+ */
+struct bucket_capacity {
+	u32			start;		/* index of the first bucket in the run */
+	u32			sectors;	/* capacity of each bucket in the run */
+};
+
+/*
+ * Per-device table of bucket capacity runs, used by bucket_capacity() to
+ * look up the capacity of an individual bucket on a zoned device.
+ */
+struct bucket_capacities {
+	u32			nr;		/* number of entries of @d in use */
+	u32			size;		/* number of entries allocated for @d */
+
+	struct bucket_capacity	*d;
+};
+
struct bch_dev {
struct kobject kobj;
struct percpu_ref ref;
@@ -540,6 +549,8 @@ struct bch_dev {
struct bch_fs *fs;
u8 dev_idx;
+ __uuid_t uuid;
+ char name[BDEVNAME_SIZE];
/*
* Cached version of this device's member info from superblock
* Committed by bch2_write_super() -> bch_fs_mi_update()
@@ -547,8 +558,8 @@ struct bch_dev {
struct bch_member_cpu mi;
atomic64_t errors[BCH_MEMBER_ERROR_NR];
- __uuid_t uuid;
- char name[BDEVNAME_SIZE];
+ struct bucket_capacities buckets;
+ u64 capacity;
struct bch_sb_handle disk_sb;
struct bch_sb *sb_read_scratch;
diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c
index c0dac04253f7..73a96771316d 100644
--- a/fs/bcachefs/buckets.c
+++ b/fs/bcachefs/buckets.c
@@ -22,6 +22,7 @@
#include "replicas.h"
#include "subvolume.h"
#include "trace.h"
+#include "zone.h"
#include <linux/preempt.h>
@@ -66,9 +67,8 @@ void bch2_fs_usage_initialize(struct bch_fs *c)
for_each_member_device(c, ca) {
struct bch_dev_usage dev = bch2_dev_usage_read(ca);
- usage->hidden += (dev.d[BCH_DATA_sb].buckets +
- dev.d[BCH_DATA_journal].buckets) *
- ca->mi.bucket_size;
+ usage->hidden += dev.d[BCH_DATA_sb].sectors;
+ usage->hidden += dev.d[BCH_DATA_journal].sectors;
}
percpu_up_write(&c->mark_lock);
@@ -1743,7 +1743,8 @@ static int __bch2_trans_mark_dev_sb(struct btree_trans *trans,
for (i = 0; i < ca->journal.nr; i++) {
ret = bch2_trans_mark_metadata_bucket(trans, ca,
ca->journal.buckets[i],
- BCH_DATA_journal, ca->mi.bucket_size);
+ BCH_DATA_journal,
+ bucket_capacity(ca, ca->journal.buckets[i]));
if (ret)
return ret;
}
diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h
index 379101d7e585..0bd2e32d661b 100644
--- a/fs/bcachefs/buckets.h
+++ b/fs/bcachefs/buckets.h
@@ -22,14 +22,6 @@ static inline sector_t bucket_to_sector(const struct bch_dev *ca, size_t b)
return ((sector_t) b) * ca->mi.bucket_size;
}
-static inline sector_t bucket_remainder(const struct bch_dev *ca, sector_t s)
-{
- u32 remainder;
-
- div_u64_rem(s, ca->mi.bucket_size, &remainder);
- return remainder;
-}
-
static inline size_t sector_to_bucket_and_offset(const struct bch_dev *ca, sector_t s,
u32 *offset)
{
diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c
index e89185a28e08..c08038b59a1c 100644
--- a/fs/bcachefs/ec.c
+++ b/fs/bcachefs/ec.c
@@ -1216,16 +1216,13 @@ void bch2_ec_bucket_cancel(struct bch_fs *c, struct open_bucket *ob)
void *bch2_writepoint_ec_buf(struct bch_fs *c, struct write_point *wp)
{
struct open_bucket *ob = ec_open_bucket(c, &wp->ptrs);
- struct bch_dev *ca;
- unsigned offset;
if (!ob)
return NULL;
BUG_ON(!ob->ec->new_stripe.data[ob->ec_idx]);
- ca = bch_dev_bkey_exists(c, ob->dev);
- offset = ca->mi.bucket_size - ob->sectors_free;
+ unsigned offset = ob->bucket_size - ob->sectors_free;
return ob->ec->new_stripe.data[ob->ec_idx] + (offset << 9);
}
diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c
index 648f1daffb3b..699cc61e70b2 100644
--- a/fs/bcachefs/extents.c
+++ b/fs/bcachefs/extents.c
@@ -25,6 +25,7 @@
#include "super-io.h"
#include "trace.h"
#include "util.h"
+#include "zone.h"
static unsigned bch2_crc_field_size_max[] = {
[BCH_EXTENT_ENTRY_crc32] = CRC32_SIZE_MAX,
@@ -1063,8 +1064,6 @@ static int extent_ptr_invalid(struct bch_fs *c,
struct printbuf *err)
{
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
- u64 bucket;
- u32 bucket_offset;
struct bch_dev *ca;
int ret = 0;
@@ -1087,15 +1086,16 @@ static int extent_ptr_invalid(struct bch_fs *c,
ptr_to_duplicate_device,
"multiple pointers to same device (%u)", ptr->dev);
- bucket = sector_to_bucket_and_offset(ca, ptr->offset, &bucket_offset);
+ u32 bucket_offset;
+ u64 bucket = sector_to_bucket_and_offset(ca, ptr->offset, &bucket_offset);
+ bkey_fsck_err_on(bucket < ca->mi.first_bucket, c, err,
+ ptr_before_first_bucket,
+ "pointer before first bucket (%llu < %u)", bucket, ca->mi.first_bucket);
bkey_fsck_err_on(bucket >= ca->mi.nbuckets, c, err,
ptr_after_last_bucket,
"pointer past last bucket (%llu > %llu)", bucket, ca->mi.nbuckets);
- bkey_fsck_err_on(ptr->offset < bucket_to_sector(ca, ca->mi.first_bucket), c, err,
- ptr_before_first_bucket,
- "pointer before first bucket (%llu < %u)", bucket, ca->mi.first_bucket);
- bkey_fsck_err_on(bucket_offset + size_ondisk > ca->mi.bucket_size, c, err,
+ bkey_fsck_err_on(bucket_offset + size_ondisk > bucket_capacity(ca, bucket), c, err,
ptr_spans_multiple_buckets,
"pointer spans multiple buckets (%u + %u > %u)",
bucket_offset, size_ondisk, ca->mi.bucket_size);
diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c
index 8538ef34f62b..d66e0ed47209 100644
--- a/fs/bcachefs/journal.c
+++ b/fs/bcachefs/journal.c
@@ -19,6 +19,7 @@
#include "journal_sb.h"
#include "journal_seq_blacklist.h"
#include "trace.h"
+#include "zone.h"
static const char * const bch2_journal_errors[] = {
#define x(n) #n,
@@ -852,7 +853,7 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr,
ret = bch2_trans_run(c,
bch2_trans_mark_metadata_bucket(trans, ca,
ob[nr_got]->bucket, BCH_DATA_journal,
- ca->mi.bucket_size));
+ bucket_capacity(ca, ob[nr_got]->bucket)));
if (ret) {
bch2_open_bucket_put(c, ob[nr_got]);
bch_err_msg(c, ret, "marking new journal buckets");
diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c
index c5bc58247146..b0ce77655596 100644
--- a/fs/bcachefs/journal_io.c
+++ b/fs/bcachefs/journal_io.c
@@ -16,6 +16,7 @@
#include "replicas.h"
#include "sb-clean.h"
#include "trace.h"
+#include "zone.h"
static struct nonce journal_nonce(const struct jset *jset)
{
@@ -932,7 +933,7 @@ static int journal_read_bucket(struct bch_dev *ca,
struct jset *j = NULL;
unsigned sectors, sectors_read = 0;
u64 offset = bucket_to_sector(ca, ja->buckets[bucket]),
- end = offset + ca->mi.bucket_size;
+ end = offset + bucket_capacity(ca, ja->buckets[bucket]);
bool saw_bad = false, csum_good;
int ret = 0;
@@ -1061,7 +1062,8 @@ static CLOSURE_CALLBACK(bch2_journal_read_device)
struct journal_replay *r, **_r;
struct genradix_iter iter;
struct journal_read_buf buf = { NULL, 0 };
- unsigned i;
+ u64 cur_bucket;
+ unsigned i, wrote = 0, cur_bucket_capacity;
int ret = 0;
if (!ja->nr)
@@ -1079,7 +1081,8 @@ static CLOSURE_CALLBACK(bch2_journal_read_device)
goto err;
}
- ja->sectors_free = ca->mi.bucket_size;
+ cur_bucket = ja->buckets[ja->cur_idx];
+ cur_bucket_capacity = bucket_capacity(ca, cur_bucket);
mutex_lock(&jlist->lock);
genradix_for_each_reverse(&c->journal_entries, iter, _r) {
@@ -1089,12 +1092,14 @@ static CLOSURE_CALLBACK(bch2_journal_read_device)
continue;
for (i = 0; i < r->nr_ptrs; i++) {
- if (r->ptrs[i].dev == ca->dev_idx) {
- unsigned wrote = bucket_remainder(ca, r->ptrs[i].sector) +
- vstruct_sectors(&r->j, c->block_bits);
+ if (r->ptrs[i].dev == ca->dev_idx &&
+ sector_to_bucket(ca, r->ptrs[i].sector) == ja->buckets[ja->cur_idx]) {
+ wrote = max_t(u64, wrote, r->ptrs[i].sector -
+ bucket_to_sector(ca, cur_bucket) +
+ vstruct_sectors(&r->j, c->block_bits));
ja->cur_idx = r->ptrs[i].bucket;
- ja->sectors_free = ca->mi.bucket_size - wrote;
+ ja->sectors_free = cur_bucket_capacity - wrote;
goto found;
}
}
@@ -1102,24 +1107,8 @@ static CLOSURE_CALLBACK(bch2_journal_read_device)
found:
mutex_unlock(&jlist->lock);
- if (ja->bucket_seq[ja->cur_idx] &&
- ja->sectors_free == ca->mi.bucket_size) {
-#if 0
- /*
- * Debug code for ZNS support, where we (probably) want to be
- * correlated where we stopped in the journal to the zone write
- * points:
- */
- bch_err(c, "ja->sectors_free == ca->mi.bucket_size");
- bch_err(c, "cur_idx %u/%u", ja->cur_idx, ja->nr);
- for (i = 0; i < 3; i++) {
- unsigned idx = (ja->cur_idx + ja->nr - 1 + i) % ja->nr;
-
- bch_err(c, "bucket_seq[%u] = %llu", idx, ja->bucket_seq[idx]);
- }
-#endif
- ja->sectors_free = 0;
- }
+ BUG_ON(!wrote);
+ ja->sectors_free = cur_bucket_capacity - min(wrote, cur_bucket_capacity);
/*
* Set dirty_idx to indicate the entire journal is full and needs to be
@@ -1147,11 +1136,6 @@ void bch2_journal_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
unsigned i;
for (i = 0; i < j->nr_ptrs; i++) {
- struct bch_dev *ca = bch_dev_bkey_exists(c, j->ptrs[i].dev);
- u64 offset;
-
- div64_u64_rem(j->ptrs[i].sector, ca->mi.bucket_size, &offset);
-
if (i)
prt_printf(out, " ");
prt_printf(out, "%u:%u:%u (sector %llu)",
@@ -1401,6 +1385,7 @@ static void __journal_write_alloc(struct journal *j,
struct journal_device *ja;
struct bch_dev *ca;
unsigned i;
+ u64 b;
if (*replicas >= replicas_want)
return;
@@ -1424,12 +1409,12 @@ static void __journal_write_alloc(struct journal *j,
continue;
bch2_dev_stripe_increment(ca, &j->wp.stripe);
+ b = ja->buckets[ja->cur_idx];
bch2_bkey_append_ptr(&w->key,
(struct bch_extent_ptr) {
- .offset = bucket_to_sector(ca,
- ja->buckets[ja->cur_idx]) +
- ca->mi.bucket_size -
+ .offset = bucket_to_sector(ca, b) +
+ bucket_capacity(ca, b) -
ja->sectors_free,
.dev = ca->dev_idx,
});
@@ -1489,7 +1474,7 @@ retry:
bch2_journal_dev_buckets_available(j, ja,
journal_space_discarded)) {
ja->cur_idx = (ja->cur_idx + 1) % ja->nr;
- ja->sectors_free = ca->mi.bucket_size;
+ ja->sectors_free = bucket_capacity(ca, ja->buckets[ja->cur_idx]);
/*
* ja->bucket_seq[ja->cur_idx] must always have
diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c
index f27b93d0f56b..b600f6db3afc 100644
--- a/fs/bcachefs/super.c
+++ b/fs/bcachefs/super.c
@@ -1145,6 +1145,7 @@ static void bch2_dev_free(struct bch_dev *ca)
if (ca->kobj.state_in_sysfs)
kobject_del(&ca->kobj);
+ bch2_dev_zones_exit(ca);
bch2_free_super(&ca->disk_sb);
bch2_dev_journal_exit(ca);
@@ -1330,19 +1331,9 @@ static int __bch2_dev_attach_bdev(struct bch_dev *ca, struct bch_sb_handle *sb)
return -BCH_ERR_device_size_too_small;
}
- ca->zoned = bdev_nr_zones(sb->bdev) != 0;
- if (ca->zoned) {
- struct blk_zone zone;
-
- ret = bch2_zone_report(sb->bdev, 0, &zone);
- if (ret)
- return ret;
-
- if (zone.len != ca->mi.bucket_size) {
- bch_err(ca, "zone size doesn't match bucket size");
- return -EINVAL;
- }
- }
+ ret = bch2_dev_zones_init(ca, sb);
+ if (ret)
+ return ret;
ret = bch2_dev_journal_init(ca, sb->sb);
if (ret)
diff --git a/fs/bcachefs/zone.c b/fs/bcachefs/zone.c
index b6ad8c9daaea..6f48f58a6c50 100644
--- a/fs/bcachefs/zone.c
+++ b/fs/bcachefs/zone.c
@@ -2,11 +2,12 @@
#include "bcachefs.h"
#include "buckets.h"
+#include "eytzinger.h"
#include "zone.h"
#include <linux/blkdev.h>
-static int zone_report_cb(struct blk_zone *src, unsigned int idx, void *data)
+static int zone_report_cb(struct blk_zone *src, unsigned idx, void *data)
{
struct blk_zone *dst = data;
@@ -14,7 +15,7 @@ static int zone_report_cb(struct blk_zone *src, unsigned int idx, void *data)
return 0;
}
-int bch2_zone_report(struct block_device *bdev, sector_t sector, struct blk_zone *zone)
+static int bch2_zone_report(struct block_device *bdev, sector_t sector, struct blk_zone *zone)
{
int ret = blkdev_report_zones(bdev, sector, 1, zone_report_cb, zone);
@@ -53,3 +54,88 @@ void bch2_bucket_finish(struct bch_dev *ca, u64 b)
bucket_to_sector(ca, b),
ca->mi.bucket_size, GFP_KERNEL);
}
+
+/*
+ * Release the bucket capacity table of @ca.
+ *
+ * The table is reset to the empty state rather than just freed:
+ * bch2_dev_zones_init() grows it with krealloc_array() based on the stored
+ * pointer and size, so leaving stale values behind would turn a later
+ * re-initialisation (or a second exit) into a use-after-free/double free.
+ */
+void bch2_dev_zones_exit(struct bch_dev *ca)
+{
+	struct bucket_capacities *b = &ca->buckets;
+
+	kfree(b->d);
+	b->d	= NULL;
+	b->nr	= 0;
+	b->size	= 0;
+}
+
+/*
+ * blkdev_report_zones() callback: record the capacity of zone @idx in the
+ * struct bucket_capacities passed through @data.
+ *
+ * Consecutive zones with the same capacity are collapsed into a single entry,
+ * so a new entry is appended only when @src's capacity differs from the one
+ * most recently recorded.
+ *
+ * Returns 0 on success, -ENOMEM if the table could not be grown.
+ */
+static int zone_report_capacity(struct blk_zone *src, unsigned idx, void *data)
+{
+	struct bucket_capacities *b = data;
+
+	/* Same capacity as the current run: nothing new to record */
+	if (b->nr &&
+	    b->d[b->nr - 1].sectors == src->capacity)
+		return 0;
+
+	if (b->nr == b->size) {
+		/*
+		 * Grow geometrically, starting at 8 entries.  This has to be
+		 * max(): min() evaluates to 0 for an empty table (so the store
+		 * below would write past a zero-sized allocation) and would
+		 * never let the table grow beyond 8 entries.
+		 */
+		size_t new_size = max(b->size * 2, 8U);
+		struct bucket_capacity *d =
+			krealloc_array(b->d, new_size, sizeof(*d), GFP_KERNEL);
+		if (!d)
+			return -ENOMEM;
+
+		b->d = d;
+		b->size = new_size;
+	}
+
+	b->d[b->nr++] = (struct bucket_capacity) {
+		.start		= idx,
+		.sectors	= src->capacity,
+	};
+
+	return 0;
+}
+
+/*
+ * Set up zone information for @ca from the block device in @sb.
+ *
+ * Sets ca->zoned and ca->capacity.  For a non-zoned device the capacity is
+ * simply bucket_size * nbuckets and no table is built.  For a zoned device the
+ * zone length must equal the bucket size and the device must have at least
+ * nbuckets zones; the capacity of every zone is then collected into
+ * ca->buckets and the table is put into eytzinger order for bucket_capacity().
+ *
+ * Returns 0 on success or a negative error code.
+ */
+int bch2_dev_zones_init(struct bch_dev *ca, struct bch_sb_handle *sb)
+{
+	struct bucket_capacities *b = &ca->buckets;
+	struct blk_zone zone;
+	unsigned i;
+	int ret;
+
+	ca->zoned = bdev_nr_zones(sb->bdev) != 0;
+	if (!ca->zoned) {
+		ca->capacity = ca->mi.bucket_size * ca->mi.nbuckets;
+		return 0;
+	}
+
+	ret = bch2_zone_report(sb->bdev, 0, &zone);
+	if (ret)
+		return ret;
+
+	if (zone.len != ca->mi.bucket_size) {
+		bch_err(ca, "zone size doesn't match bucket size");
+		return -EINVAL;
+	}
+
+	if (bdev_nr_zones(sb->bdev) < ca->mi.nbuckets) {
+		bch_err(ca, "member info nbuckets (%llu) greater than number of zones (%u)",
+			ca->mi.nbuckets,
+			bdev_nr_zones(sb->bdev));
+		return -EINVAL;
+	}
+
+	b->nr = 0;
+	ret = blkdev_report_zones(sb->bdev, 0, ca->mi.nbuckets,
+				  zone_report_capacity, &ca->buckets);
+	/*
+	 * blkdev_report_zones() returns the number of zones reported on
+	 * success, so only a negative value is an error; pass the real error
+	 * (e.g. -ENOMEM from the callback) up to the caller.
+	 */
+	if (ret < 0) {
+		bch_err(ca, "error getting zone capacities");
+		return ret;
+	}
+
+	/*
+	 * Sum the capacity of every run: each run extends up to the start of
+	 * the next one, the last run up to nbuckets.  This must happen before
+	 * the sort below, while the entries are still in bucket order.
+	 */
+	ca->capacity = 0;
+	for (i = 0; i < b->nr; i++) {
+		u64 next = i + 1 < b->nr
+			? b->d[i + 1].start
+			: ca->mi.nbuckets;
+
+		ca->capacity += (next - b->d[i].start) * b->d[i].sectors;
+	}
+
+	BUG_ON(ca->capacity > ca->mi.bucket_size * ca->mi.nbuckets);
+
+	eytzinger0_sort(b->d, b->nr, sizeof(*b->d), bucket_capacity_cmp, NULL);
+
+	return 0;
+}
diff --git a/fs/bcachefs/zone.h b/fs/bcachefs/zone.h
index aa3653bdb59b..620efc4fcbdc 100644
--- a/fs/bcachefs/zone.h
+++ b/fs/bcachefs/zone.h
@@ -2,6 +2,8 @@
#ifndef _BCACHEFS_ZONE_H
#define _BCACHEFS_ZONE_H
+#include "eytzinger.h"
+
static inline bool blk_zone_writeable(struct blk_zone zone)
{
return (zone.cond == BLK_ZONE_COND_EMPTY ||
@@ -10,8 +12,44 @@ static inline bool blk_zone_writeable(struct blk_zone zone)
zone.cond == BLK_ZONE_COND_CLOSED);
}
-int bch2_zone_report(struct block_device *, sector_t, struct blk_zone *);
+/*
+ * Comparison callback for the bucket capacity table: orders two
+ * struct bucket_capacity entries by their starting bucket.  @size is part of
+ * the callback signature and is not used.
+ */
+static inline int bucket_capacity_cmp(const void *_l, const void *_r, size_t size)
+{
+	const struct bucket_capacity *lhs = _l, *rhs = _r;
+
+	return cmp_int(lhs->start, rhs->start);
+}
+
+/*
+ * Return the capacity of @bucket on device @ca.
+ *
+ * On a non-zoned device every bucket has capacity ca->mi.bucket_size.  On a
+ * zoned device the capacity comes from the entry of ca->buckets with the
+ * greatest start that is <= @bucket.
+ */
+static inline unsigned bucket_capacity(struct bch_dev *ca, size_t bucket)
+{
+	struct bucket_capacities *b = &ca->buckets;
+	struct bucket_capacity search = { .start = bucket };
+	ssize_t idx;
+
+	if (!ca->zoned)
+		return ca->mi.bucket_size;
+
+	idx = eytzinger0_find_le(b->d, b->nr,
+				 sizeof(b->d[0]),
+				 bucket_capacity_cmp, &search);
+
+	{
+		/*
+		 * Cross-check the eytzinger lookup against a brute-force scan
+		 * for the entry with the greatest start <= @bucket.
+		 * NOTE(review): this is linear in b->nr on every call -
+		 * consider restricting it to debug builds.
+		 */
+		ssize_t j = -1, k;
+
+		for (k = 0; k < b->nr; k++)
+			if (b->d[k].start <= bucket &&
+			    (j < 0 || b->d[k].start > b->d[j].start))
+				j = k;
+
+		BUG_ON(idx != j);
+	}
+
+	/*
+	 * A miss (empty table, or no entry at or below @bucket) leaves both
+	 * idx and the cross-check result at -1, so the check above passes;
+	 * catch it here instead of reading b->d[-1].
+	 */
+	BUG_ON(idx < 0);
+
+	return b->d[idx].sectors;
+}
+
void bch2_bucket_discard(struct bch_dev *, u64);
void bch2_bucket_finish(struct bch_dev *, u64);
+void bch2_dev_zones_exit(struct bch_dev *);
+int bch2_dev_zones_init(struct bch_dev *, struct bch_sb_handle *);
#endif /* _BCACHEFS_ZONE_H */