author     Kent Overstreet <kent.overstreet@gmail.com>    2020-07-27 14:24:31 -0400
committer  Kent Overstreet <kent.overstreet@gmail.com>    2020-07-27 14:29:49 -0400
commit     014287bbf6f0046d4092f20be2ff0c3385e5df3e
tree       f3613171cfd567b012d6d2252a85d22157c4d09a
parent     0511e1ea598d18d603d4478030c1e5893d5b2598

Merge with 6288f1b609 bcachefs: Convert various code to printbuf
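
The conversion being merged replaces the old sysfs/debugfs pattern of formatting into a caller-supplied char buffer and returning the byte count (bch2_btree_updates_print() and bch2_dirty_btree_nodes_print() in the hunks below) with helpers that append to a struct printbuf owned by the caller. A minimal, self-contained sketch of the pattern follows; the struct layout and pr_buf() here are illustrative stand-ins, not the exact bcachefs definitions.

#include <stdarg.h>
#include <stdio.h>

/* Illustrative stand-in for the printbuf type; field names are assumptions. */
struct printbuf {
	char	*pos;	/* next byte to write */
	char	*end;	/* one past the end of the caller's buffer */
};

static void pr_buf(struct printbuf *out, const char *fmt, ...)
{
	size_t space = out->end - out->pos;
	va_list args;
	int n;

	va_start(args, fmt);
	n = vsnprintf(out->pos, space, fmt, args);
	va_end(args);

	/* advance, but never past the end of the caller's buffer */
	out->pos += (size_t) n < space ? (size_t) n : space;
}

/* Old style:  ssize_t bch2_foo_print(struct bch_fs *c, char *buf);
 * New style:  the caller owns the buffer, the helper only appends. */
static void foo_to_text(struct printbuf *out, unsigned dirty, unsigned level)
{
	pr_buf(out, "d %u l %u\n", dirty, level);
}

int main(void)
{
	char buf[128];
	struct printbuf out = { .pos = buf, .end = buf + sizeof(buf) };

	foo_to_text(&out, 1, 0);
	printf("%s", buf);
	return 0;
}
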
 block/blk-core.c                     |   10
 fs/bcachefs/alloc_background.c       |   34
 fs/bcachefs/alloc_foreground.c       |  264
 fs/bcachefs/alloc_foreground.h       |   23
 fs/bcachefs/alloc_types.h            |    1
 fs/bcachefs/bcachefs.h               |   26
 fs/bcachefs/bcachefs_format.h        |   19
 fs/bcachefs/bset.c                   |   61
 fs/bcachefs/bset.h                   |   34
 fs/bcachefs/btree_cache.c            |   64
 fs/bcachefs/btree_cache.h            |    7
 fs/bcachefs/btree_gc.c               |   27
 fs/bcachefs/btree_io.c               |   91
 fs/bcachefs/btree_io.h               |    5
 fs/bcachefs/btree_types.h            |    6
 fs/bcachefs/btree_update_interior.c  |   46
 fs/bcachefs/btree_update_interior.h  |    4
 fs/bcachefs/btree_update_leaf.c      |   38
 fs/bcachefs/buckets.c                |  447
 fs/bcachefs/buckets.h                |   21
 fs/bcachefs/buckets_types.h          |    1
 fs/bcachefs/chardev.c                |    2
 fs/bcachefs/checksum.c               |   31
 fs/bcachefs/checksum.h               |    6
 fs/bcachefs/clock.c                  |    7
 fs/bcachefs/clock.h                  |    2
 fs/bcachefs/compress.c               |    8
 fs/bcachefs/ec.c                     |  561
 fs/bcachefs/ec.h                     |   18
 fs/bcachefs/ec_types.h               |    1
 fs/bcachefs/extents.c                |   16
 fs/bcachefs/fs-io.c                  |  303
 fs/bcachefs/fs-io.h                  |    4
 fs/bcachefs/fs.c                     |   13
 fs/bcachefs/io.c                     |   43
 fs/bcachefs/io.h                     |    2
 fs/bcachefs/io_types.h               |    1
 fs/bcachefs/journal.c                |   40
 fs/bcachefs/journal.h                |    6
 fs/bcachefs/journal_io.c             |   18
 fs/bcachefs/journal_reclaim.c        |    8
 fs/bcachefs/move.c                   |   24
 fs/bcachefs/movinggc.c               |  227
 fs/bcachefs/movinggc.h               |    6
 fs/bcachefs/opts.c                   |    9
 fs/bcachefs/opts.h                   |    5
 fs/bcachefs/rebalance.c              |   19
 fs/bcachefs/rebalance.h              |    2
 fs/bcachefs/recovery.c               |   16
 fs/bcachefs/replicas.c               |  101
 fs/bcachefs/replicas.h               |   10
 fs/bcachefs/super-io.c               |    7
 fs/bcachefs/super.c                  |   48
 fs/bcachefs/sysfs.c                  |  271
 fs/bcachefs/util.c                   |   25
 fs/bcachefs/util.h                   |   31
 include/linux/blkdev.h               |    1
 include/trace/events/bcachefs.h      |    6
 kernel/locking/six.c                 |    2
 59 files changed, 1546 insertions(+), 1583 deletions(-)
diff --git a/block/blk-core.c b/block/blk-core.c
index ea33d6abdcfc..f3e31bab4f9d 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -249,6 +249,16 @@ int blk_status_to_errno(blk_status_t status)
}
EXPORT_SYMBOL_GPL(blk_status_to_errno);
+const char *blk_status_to_str(blk_status_t status)
+{
+ int idx = (__force int)status;
+
+ if (WARN_ON_ONCE(idx >= ARRAY_SIZE(blk_errors)))
+ return "(invalid error)";
+ return blk_errors[idx].name;
+}
+EXPORT_SYMBOL_GPL(blk_status_to_str);
+
static void print_req_error(struct request *req, blk_status_t status)
{
int idx = (__force int)status;
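
blk_status_to_str() exposes the existing blk_errors[] name table so callers can log a readable status string; the bcachefs hunks further down use it through what appears to be a thin bch2_blk_status_to_str() wrapper, and the one-line include/linux/blkdev.h change in the diffstat presumably adds the declaration. A hedged sketch of the call-site pattern, with a hypothetical endio handler:

#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/printk.h>

/* Hypothetical completion handler: log a human-readable status instead of
 * a bare status number when the bio failed. */
static void example_read_endio(struct bio *bio)
{
	if (bio->bi_status)
		pr_err("example: read error: %s\n",
		       blk_status_to_str(bio->bi_status));

	bio_put(bio);
}
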
diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c
index cb720ee04b86..43b9f99194b9 100644
--- a/fs/bcachefs/alloc_background.c
+++ b/fs/bcachefs/alloc_background.c
@@ -41,29 +41,26 @@ static void pd_controllers_update(struct work_struct *work)
struct bch_fs,
pd_controllers_update);
struct bch_dev *ca;
+ s64 free = 0, fragmented = 0;
unsigned i;
for_each_member_device(ca, c, i) {
- struct bch_dev_usage stats = bch2_dev_usage_read(c, ca);
+ struct bch_dev_usage stats = bch2_dev_usage_read(ca);
- u64 free = bucket_to_sector(ca,
+ free += bucket_to_sector(ca,
__dev_buckets_free(ca, stats)) << 9;
/*
* Bytes of internal fragmentation, which can be
* reclaimed by copy GC
*/
- s64 fragmented = (bucket_to_sector(ca,
- stats.buckets[BCH_DATA_USER] +
- stats.buckets[BCH_DATA_CACHED]) -
- (stats.sectors[BCH_DATA_USER] +
- stats.sectors[BCH_DATA_CACHED])) << 9;
-
- fragmented = max(0LL, fragmented);
-
- bch2_pd_controller_update(&ca->copygc_pd,
- free, fragmented, -1);
+ fragmented += max_t(s64, 0, (bucket_to_sector(ca,
+ stats.buckets[BCH_DATA_user] +
+ stats.buckets[BCH_DATA_cached]) -
+ (stats.sectors[BCH_DATA_user] +
+ stats.sectors[BCH_DATA_cached])) << 9);
}
+ bch2_pd_controller_update(&c->copygc_pd, free, fragmented, -1);
schedule_delayed_work(&c->pd_controllers_update,
c->pd_controllers_update_seconds * HZ);
}
@@ -517,11 +514,13 @@ static int wait_buckets_available(struct bch_fs *c, struct bch_dev *ca)
if (gc_count != c->gc_count)
ca->inc_gen_really_needs_gc = 0;
- available = max_t(s64, 0, dev_buckets_available(c, ca) -
+ available = max_t(s64, 0, dev_buckets_available(ca) -
ca->inc_gen_really_needs_gc);
if (available > fifo_free(&ca->free_inc) ||
- (available && !fifo_full(&ca->free[RESERVE_BTREE])))
+ (available &&
+ (!fifo_full(&ca->free[RESERVE_BTREE]) ||
+ !fifo_full(&ca->free[RESERVE_MOVINGGC]))))
break;
up_read(&c->gc_lock);
@@ -1191,7 +1190,7 @@ stop:
void bch2_recalc_capacity(struct bch_fs *c)
{
struct bch_dev *ca;
- u64 capacity = 0, reserved_sectors = 0, gc_reserve;
+ u64 capacity = 0, reserved_sectors = 0, gc_reserve, copygc_threshold = 0;
unsigned bucket_size_max = 0;
unsigned long ra_pages = 0;
unsigned i, j;
@@ -1234,7 +1233,7 @@ void bch2_recalc_capacity(struct bch_fs *c)
dev_reserve *= ca->mi.bucket_size;
- ca->copygc_threshold = dev_reserve;
+ copygc_threshold += dev_reserve;
capacity += bucket_to_sector(ca, ca->mi.nbuckets -
ca->mi.first_bucket);
@@ -1253,6 +1252,7 @@ void bch2_recalc_capacity(struct bch_fs *c)
reserved_sectors = min(reserved_sectors, capacity);
+ c->copygc_threshold = copygc_threshold;
c->capacity = capacity - reserved_sectors;
c->bucket_size_max = bucket_size_max;
@@ -1312,7 +1312,7 @@ void bch2_dev_allocator_remove(struct bch_fs *c, struct bch_dev *ca)
for (i = 0; i < ARRAY_SIZE(c->write_points); i++)
bch2_writepoint_stop(c, ca, &c->write_points[i]);
- bch2_writepoint_stop(c, ca, &ca->copygc_write_point);
+ bch2_writepoint_stop(c, ca, &c->copygc_write_point);
bch2_writepoint_stop(c, ca, &c->rebalance_write_point);
bch2_writepoint_stop(c, ca, &c->btree_write_point);
diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c
index 979aba30bc9d..4a048828869b 100644
--- a/fs/bcachefs/alloc_foreground.c
+++ b/fs/bcachefs/alloc_foreground.c
@@ -70,12 +70,6 @@
#include <linux/rcupdate.h>
#include <trace/events/bcachefs.h>
-enum bucket_alloc_ret {
- ALLOC_SUCCESS,
- OPEN_BUCKETS_EMPTY,
- FREELIST_EMPTY, /* Allocator thread not keeping up */
-};
-
/*
* Open buckets represent a bucket that's currently being allocated from. They
* serve two purposes:
@@ -150,12 +144,13 @@ static struct open_bucket *bch2_open_bucket_alloc(struct bch_fs *c)
}
static void open_bucket_free_unused(struct bch_fs *c,
- struct open_bucket *ob,
- bool may_realloc)
+ struct write_point *wp,
+ struct open_bucket *ob)
{
struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev);
+ bool may_realloc = wp->type == BCH_DATA_user;
- BUG_ON(ca->open_buckets_partial_nr >=
+ BUG_ON(ca->open_buckets_partial_nr >
ARRAY_SIZE(ca->open_buckets_partial));
if (ca->open_buckets_partial_nr <
@@ -234,13 +229,22 @@ struct open_bucket *bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca,
spin_lock(&c->freelist_lock);
- if (may_alloc_partial &&
- ca->open_buckets_partial_nr) {
- ob = c->open_buckets +
- ca->open_buckets_partial[--ca->open_buckets_partial_nr];
- ob->on_partial_list = false;
- spin_unlock(&c->freelist_lock);
- return ob;
+ if (may_alloc_partial) {
+ int i;
+
+ for (i = ca->open_buckets_partial_nr - 1; i >= 0; --i) {
+ ob = c->open_buckets + ca->open_buckets_partial[i];
+
+ if (reserve <= ob->alloc_reserve) {
+ array_remove_item(ca->open_buckets_partial,
+ ca->open_buckets_partial_nr,
+ i);
+ ob->on_partial_list = false;
+ ob->alloc_reserve = reserve;
+ spin_unlock(&c->freelist_lock);
+ return ob;
+ }
+ }
}
if (unlikely(c->open_buckets_nr_free <= open_buckets_reserved(reserve))) {
@@ -297,6 +301,7 @@ out:
ob->valid = true;
ob->sectors_free = ca->mi.bucket_size;
+ ob->alloc_reserve = reserve;
ob->ptr = (struct bch_extent_ptr) {
.type = 1 << BCH_EXTENT_ENTRY_ptr,
.gen = buckets->b[bucket].mark.gen,
@@ -344,21 +349,20 @@ struct dev_alloc_list bch2_dev_alloc_list(struct bch_fs *c,
struct bch_devs_mask *devs)
{
struct dev_alloc_list ret = { .nr = 0 };
- struct bch_dev *ca;
unsigned i;
- for_each_member_device_rcu(ca, c, i, devs)
+ for_each_set_bit(i, devs->d, BCH_SB_MEMBERS_MAX)
ret.devs[ret.nr++] = i;
bubble_sort(ret.devs, ret.nr, dev_stripe_cmp);
return ret;
}
-void bch2_dev_stripe_increment(struct bch_fs *c, struct bch_dev *ca,
+void bch2_dev_stripe_increment(struct bch_dev *ca,
struct dev_stripe_state *stripe)
{
u64 *v = stripe->next_alloc + ca->dev_idx;
- u64 free_space = dev_buckets_free(c, ca);
+ u64 free_space = dev_buckets_free(ca);
u64 free_space_inv = free_space
? div64_u64(1ULL << 48, free_space)
: 1ULL << 48;
@@ -396,21 +400,22 @@ static void add_new_bucket(struct bch_fs *c,
ob_push(c, ptrs, ob);
}
-static int bch2_bucket_alloc_set(struct bch_fs *c,
- struct open_buckets *ptrs,
- struct dev_stripe_state *stripe,
- struct bch_devs_mask *devs_may_alloc,
- unsigned nr_replicas,
- unsigned *nr_effective,
- bool *have_cache,
- enum alloc_reserve reserve,
- unsigned flags,
- struct closure *cl)
+enum bucket_alloc_ret
+bch2_bucket_alloc_set(struct bch_fs *c,
+ struct open_buckets *ptrs,
+ struct dev_stripe_state *stripe,
+ struct bch_devs_mask *devs_may_alloc,
+ unsigned nr_replicas,
+ unsigned *nr_effective,
+ bool *have_cache,
+ enum alloc_reserve reserve,
+ unsigned flags,
+ struct closure *cl)
{
struct dev_alloc_list devs_sorted =
bch2_dev_alloc_list(c, stripe, devs_may_alloc);
struct bch_dev *ca;
- bool alloc_failure = false;
+ enum bucket_alloc_ret ret = INSUFFICIENT_DEVICES;
unsigned i;
BUG_ON(*nr_effective >= nr_replicas);
@@ -428,102 +433,28 @@ static int bch2_bucket_alloc_set(struct bch_fs *c,
ob = bch2_bucket_alloc(c, ca, reserve,
flags & BUCKET_MAY_ALLOC_PARTIAL, cl);
if (IS_ERR(ob)) {
- enum bucket_alloc_ret ret = -PTR_ERR(ob);
-
- WARN_ON(reserve == RESERVE_MOVINGGC &&
- ret != OPEN_BUCKETS_EMPTY);
+ ret = -PTR_ERR(ob);
if (cl)
- return -EAGAIN;
- if (ret == OPEN_BUCKETS_EMPTY)
- return -ENOSPC;
- alloc_failure = true;
+ return ret;
continue;
}
add_new_bucket(c, ptrs, devs_may_alloc,
nr_effective, have_cache, flags, ob);
- bch2_dev_stripe_increment(c, ca, stripe);
+ bch2_dev_stripe_increment(ca, stripe);
if (*nr_effective >= nr_replicas)
- return 0;
+ return ALLOC_SUCCESS;
}
- return alloc_failure ? -ENOSPC : -EROFS;
+ return ret;
}
/* Allocate from stripes: */
/*
- * XXX: use a higher watermark for allocating open buckets here:
- */
-static int ec_stripe_alloc(struct bch_fs *c, struct ec_stripe_head *h)
-{
- struct bch_devs_mask devs;
- struct open_bucket *ob;
- unsigned i, nr_have = 0, nr_data =
- min_t(unsigned, h->nr_active_devs,
- EC_STRIPE_MAX) - h->redundancy;
- bool have_cache = true;
- int ret = 0;
-
- BUG_ON(h->blocks.nr > nr_data);
- BUG_ON(h->parity.nr > h->redundancy);
-
- devs = h->devs;
-
- open_bucket_for_each(c, &h->parity, ob, i)
- __clear_bit(ob->ptr.dev, devs.d);
- open_bucket_for_each(c, &h->blocks, ob, i)
- __clear_bit(ob->ptr.dev, devs.d);
-
- percpu_down_read(&c->mark_lock);
- rcu_read_lock();
-
- if (h->parity.nr < h->redundancy) {
- nr_have = h->parity.nr;
-
- ret = bch2_bucket_alloc_set(c, &h->parity,
- &h->parity_stripe,
- &devs,
- h->redundancy,
- &nr_have,
- &have_cache,
- RESERVE_NONE,
- 0,
- NULL);
- if (ret)
- goto err;
- }
-
- if (h->blocks.nr < nr_data) {
- nr_have = h->blocks.nr;
-
- ret = bch2_bucket_alloc_set(c, &h->blocks,
- &h->block_stripe,
- &devs,
- nr_data,
- &nr_have,
- &have_cache,
- RESERVE_NONE,
- 0,
- NULL);
- if (ret)
- goto err;
- }
-
- rcu_read_unlock();
- percpu_up_read(&c->mark_lock);
-
- return bch2_ec_stripe_new_alloc(c, h);
-err:
- rcu_read_unlock();
- percpu_up_read(&c->mark_lock);
- return -1;
-}
-
-/*
* if we can't allocate a new stripe because there are already too many
* partially filled stripes, force allocating from an existing stripe even when
* it's to a device we don't want:
@@ -555,34 +486,30 @@ static void bucket_alloc_from_stripe(struct bch_fs *c,
if (ec_open_bucket(c, ptrs))
return;
- h = bch2_ec_stripe_head_get(c, target, erasure_code, nr_replicas - 1);
+ h = bch2_ec_stripe_head_get(c, target, 0, nr_replicas - 1);
if (!h)
return;
- if (!h->s && ec_stripe_alloc(c, h))
- goto out_put_head;
-
- rcu_read_lock();
devs_sorted = bch2_dev_alloc_list(c, &wp->stripe, devs_may_alloc);
- rcu_read_unlock();
for (i = 0; i < devs_sorted.nr; i++)
open_bucket_for_each(c, &h->s->blocks, ob, ec_idx)
if (ob->ptr.dev == devs_sorted.devs[i] &&
- !test_and_set_bit(ec_idx, h->s->blocks_allocated))
+ !test_and_set_bit(h->s->data_block_idx[ec_idx],
+ h->s->blocks_allocated))
goto got_bucket;
goto out_put_head;
got_bucket:
ca = bch_dev_bkey_exists(c, ob->ptr.dev);
- ob->ec_idx = ec_idx;
+ ob->ec_idx = h->s->data_block_idx[ec_idx];
ob->ec = h->s;
add_new_bucket(c, ptrs, devs_may_alloc,
nr_effective, have_cache, flags, ob);
atomic_inc(&h->s->pin);
out_put_head:
- bch2_ec_stripe_head_put(h);
+ bch2_ec_stripe_head_put(c, h);
}
/* Sector allocator */
@@ -607,7 +534,7 @@ static void get_buckets_from_writepoint(struct bch_fs *c,
if (*nr_effective < nr_replicas &&
test_bit(ob->ptr.dev, devs_may_alloc->d) &&
(ca->mi.durability ||
- (wp->type == BCH_DATA_USER && !*have_cache)) &&
+ (wp->type == BCH_DATA_user && !*have_cache)) &&
(ob->ec || !need_ec)) {
add_new_bucket(c, ptrs, devs_may_alloc,
nr_effective, have_cache,
@@ -619,24 +546,25 @@ static void get_buckets_from_writepoint(struct bch_fs *c,
wp->ptrs = ptrs_skip;
}
-static int open_bucket_add_buckets(struct bch_fs *c,
- struct open_buckets *ptrs,
- struct write_point *wp,
- struct bch_devs_list *devs_have,
- u16 target,
- unsigned erasure_code,
- unsigned nr_replicas,
- unsigned *nr_effective,
- bool *have_cache,
- enum alloc_reserve reserve,
- unsigned flags,
- struct closure *_cl)
+static enum bucket_alloc_ret
+open_bucket_add_buckets(struct bch_fs *c,
+ struct open_buckets *ptrs,
+ struct write_point *wp,
+ struct bch_devs_list *devs_have,
+ u16 target,
+ unsigned erasure_code,
+ unsigned nr_replicas,
+ unsigned *nr_effective,
+ bool *have_cache,
+ enum alloc_reserve reserve,
+ unsigned flags,
+ struct closure *_cl)
{
struct bch_devs_mask devs;
struct open_bucket *ob;
struct closure *cl = NULL;
+ enum bucket_alloc_ret ret;
unsigned i;
- int ret;
rcu_read_lock();
devs = target_rw_devs(c, wp->type, target);
@@ -650,18 +578,22 @@ static int open_bucket_add_buckets(struct bch_fs *c,
__clear_bit(ob->ptr.dev, devs.d);
if (erasure_code) {
- get_buckets_from_writepoint(c, ptrs, wp, &devs,
- nr_replicas, nr_effective,
- have_cache, flags, true);
- if (*nr_effective >= nr_replicas)
- return 0;
+ if (!ec_open_bucket(c, ptrs)) {
+ get_buckets_from_writepoint(c, ptrs, wp, &devs,
+ nr_replicas, nr_effective,
+ have_cache, flags, true);
+ if (*nr_effective >= nr_replicas)
+ return 0;
+ }
- bucket_alloc_from_stripe(c, ptrs, wp, &devs,
- target, erasure_code,
- nr_replicas, nr_effective,
- have_cache, flags);
- if (*nr_effective >= nr_replicas)
- return 0;
+ if (!ec_open_bucket(c, ptrs)) {
+ bucket_alloc_from_stripe(c, ptrs, wp, &devs,
+ target, erasure_code,
+ nr_replicas, nr_effective,
+ have_cache, flags);
+ if (*nr_effective >= nr_replicas)
+ return 0;
+ }
}
get_buckets_from_writepoint(c, ptrs, wp, &devs,
@@ -681,7 +613,7 @@ retry_blocking:
ret = bch2_bucket_alloc_set(c, ptrs, &wp->stripe, &devs,
nr_replicas, nr_effective, have_cache,
reserve, flags, cl);
- if (ret && ret != -EROFS && !cl && _cl) {
+ if (ret && ret != INSUFFICIENT_DEVICES && !cl && _cl) {
cl = _cl;
goto retry_blocking;
}
@@ -872,7 +804,8 @@ struct write_point *bch2_alloc_sectors_start(struct bch_fs *c,
unsigned nr_effective, write_points_nr;
unsigned ob_flags = 0;
bool have_cache;
- int ret, i;
+ enum bucket_alloc_ret ret;
+ int i;
if (!(flags & BCH_WRITE_ONLY_SPECIFIED_DEVS))
ob_flags |= BUCKET_ALLOC_USE_DURABILITY;
@@ -886,11 +819,11 @@ retry:
wp = writepoint_find(c, write_point.v);
- if (wp->type == BCH_DATA_USER)
+ if (wp->type == BCH_DATA_user)
ob_flags |= BUCKET_MAY_ALLOC_PARTIAL;
/* metadata may not allocate on cache devices: */
- if (wp->type != BCH_DATA_USER)
+ if (wp->type != BCH_DATA_user)
have_cache = true;
if (!target || (flags & BCH_WRITE_ONLY_SPECIFIED_DEVS)) {
@@ -920,7 +853,7 @@ alloc_done:
if (erasure_code && !ec_open_bucket(c, &ptrs))
pr_debug("failed to get ec bucket: ret %u", ret);
- if (ret == -EROFS &&
+ if (ret == INSUFFICIENT_DEVICES &&
nr_effective >= nr_replicas_required)
ret = 0;
@@ -929,7 +862,7 @@ alloc_done:
/* Free buckets we didn't use: */
open_bucket_for_each(c, &wp->ptrs, ob, i)
- open_bucket_free_unused(c, ob, wp->type == BCH_DATA_USER);
+ open_bucket_free_unused(c, wp, ob);
wp->ptrs = ptrs;
@@ -948,17 +881,24 @@ err:
if (ptrs.nr < ARRAY_SIZE(ptrs.v))
ob_push(c, &ptrs, ob);
else
- open_bucket_free_unused(c, ob,
- wp->type == BCH_DATA_USER);
+ open_bucket_free_unused(c, wp, ob);
wp->ptrs = ptrs;
mutex_unlock(&wp->lock);
- if (ret == -ENOSPC &&
+ if (ret == FREELIST_EMPTY &&
try_decrease_writepoints(c, write_points_nr))
goto retry;
- return ERR_PTR(ret);
+ switch (ret) {
+ case OPEN_BUCKETS_EMPTY:
+ case FREELIST_EMPTY:
+ return cl ? ERR_PTR(-EAGAIN) : ERR_PTR(-ENOSPC);
+ case INSUFFICIENT_DEVICES:
+ return ERR_PTR(-EROFS);
+ default:
+ BUG();
+ }
}
/*
@@ -980,7 +920,7 @@ void bch2_alloc_sectors_append_ptrs(struct bch_fs *c, struct write_point *wp,
struct bch_extent_ptr tmp = ob->ptr;
tmp.cached = !ca->mi.durability &&
- wp->type == BCH_DATA_USER;
+ wp->type == BCH_DATA_user;
tmp.offset += ca->mi.bucket_size - ob->sectors_free;
bch2_bkey_append_ptr(k, tmp);
@@ -1009,6 +949,13 @@ void bch2_alloc_sectors_done(struct bch_fs *c, struct write_point *wp)
bch2_open_buckets_put(c, &ptrs);
}
+static inline void writepoint_init(struct write_point *wp,
+ enum bch_data_type type)
+{
+ mutex_init(&wp->lock);
+ wp->type = type;
+}
+
void bch2_fs_allocator_foreground_init(struct bch_fs *c)
{
struct open_bucket *ob;
@@ -1029,12 +976,13 @@ void bch2_fs_allocator_foreground_init(struct bch_fs *c)
c->open_buckets_freelist = ob - c->open_buckets;
}
- writepoint_init(&c->btree_write_point, BCH_DATA_BTREE);
- writepoint_init(&c->rebalance_write_point, BCH_DATA_USER);
+ writepoint_init(&c->btree_write_point, BCH_DATA_btree);
+ writepoint_init(&c->rebalance_write_point, BCH_DATA_user);
+ writepoint_init(&c->copygc_write_point, BCH_DATA_user);
for (wp = c->write_points;
wp < c->write_points + c->write_points_nr; wp++) {
- writepoint_init(wp, BCH_DATA_USER);
+ writepoint_init(wp, BCH_DATA_user);
wp->last_used = sched_clock();
wp->write_point = (unsigned long) wp;
diff --git a/fs/bcachefs/alloc_foreground.h b/fs/bcachefs/alloc_foreground.h
index 687f973e4b3a..c658295cb8e0 100644
--- a/fs/bcachefs/alloc_foreground.h
+++ b/fs/bcachefs/alloc_foreground.h
@@ -12,6 +12,13 @@ struct bch_dev;
struct bch_fs;
struct bch_devs_List;
+enum bucket_alloc_ret {
+ ALLOC_SUCCESS,
+ OPEN_BUCKETS_EMPTY,
+ FREELIST_EMPTY, /* Allocator thread not keeping up */
+ INSUFFICIENT_DEVICES,
+};
+
struct dev_alloc_list {
unsigned nr;
u8 devs[BCH_SB_MEMBERS_MAX];
@@ -20,8 +27,7 @@ struct dev_alloc_list {
struct dev_alloc_list bch2_dev_alloc_list(struct bch_fs *,
struct dev_stripe_state *,
struct bch_devs_mask *);
-void bch2_dev_stripe_increment(struct bch_fs *, struct bch_dev *,
- struct dev_stripe_state *);
+void bch2_dev_stripe_increment(struct bch_dev *, struct dev_stripe_state *);
long bch2_bucket_alloc_new_fs(struct bch_dev *);
@@ -92,6 +98,12 @@ static inline void bch2_open_bucket_get(struct bch_fs *c,
}
}
+enum bucket_alloc_ret
+bch2_bucket_alloc_set(struct bch_fs *, struct open_buckets *,
+ struct dev_stripe_state *, struct bch_devs_mask *,
+ unsigned, unsigned *, bool *, enum alloc_reserve,
+ unsigned, struct closure *);
+
struct write_point *bch2_alloc_sectors_start(struct bch_fs *,
unsigned, unsigned,
struct write_point_specifier,
@@ -121,13 +133,6 @@ static inline struct write_point_specifier writepoint_ptr(struct write_point *wp
return (struct write_point_specifier) { .v = (unsigned long) wp };
}
-static inline void writepoint_init(struct write_point *wp,
- enum bch_data_type type)
-{
- mutex_init(&wp->lock);
- wp->type = type;
-}
-
void bch2_fs_allocator_foreground_init(struct bch_fs *);
#endif /* _BCACHEFS_ALLOC_FOREGROUND_H */
diff --git a/fs/bcachefs/alloc_types.h b/fs/bcachefs/alloc_types.h
index 4f1465077994..20705460bb0a 100644
--- a/fs/bcachefs/alloc_types.h
+++ b/fs/bcachefs/alloc_types.h
@@ -66,6 +66,7 @@ struct open_bucket {
u8 type;
unsigned valid:1;
unsigned on_partial_list:1;
+ int alloc_reserve:3;
unsigned sectors_free;
struct bch_extent_ptr ptr;
struct ec_stripe_new *ec;
diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h
index dbc714db6a3d..90303b6a3d99 100644
--- a/fs/bcachefs/bcachefs.h
+++ b/fs/bcachefs/bcachefs.h
@@ -451,13 +451,6 @@ struct bch_dev {
alloc_heap alloc_heap;
- /* Copying GC: */
- struct task_struct *copygc_thread;
- copygc_heap copygc_heap;
- struct bch_pd_controller copygc_pd;
- struct write_point copygc_write_point;
- u64 copygc_threshold;
-
atomic64_t rebalance_work;
struct journal_device journal;
@@ -741,7 +734,7 @@ struct bch_fs {
ZSTD_parameters zstd_params;
struct crypto_shash *sha256;
- struct crypto_skcipher *chacha20;
+ struct crypto_sync_skcipher *chacha20;
struct crypto_shash *poly1305;
atomic64_t key_version;
@@ -751,16 +744,27 @@ struct bch_fs {
/* REBALANCE */
struct bch_fs_rebalance rebalance;
+ /* COPYGC */
+ struct task_struct *copygc_thread;
+ copygc_heap copygc_heap;
+ struct bch_pd_controller copygc_pd;
+ struct write_point copygc_write_point;
+ u64 copygc_threshold;
+
/* STRIPES: */
GENRADIX(struct stripe) stripes[2];
- struct mutex ec_stripe_create_lock;
ec_stripes_heap ec_stripes_heap;
spinlock_t ec_stripes_heap_lock;
/* ERASURE CODING */
- struct list_head ec_new_stripe_list;
- struct mutex ec_new_stripe_lock;
+ struct list_head ec_stripe_head_list;
+ struct mutex ec_stripe_head_lock;
+
+ struct list_head ec_stripe_new_list;
+ struct mutex ec_stripe_new_lock;
+
+ struct work_struct ec_stripe_create_work;
u64 ec_stripe_hint;
struct bio_set ec_bioset;
diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h
index f808e63a713d..d5a2230e403c 100644
--- a/fs/bcachefs/bcachefs_format.h
+++ b/fs/bcachefs/bcachefs_format.h
@@ -1026,14 +1026,19 @@ LE64_BITMASK(BCH_KDF_SCRYPT_P, struct bch_sb_field_crypt, kdf_flags, 32, 48);
/* BCH_SB_FIELD_replicas: */
+#define BCH_DATA_TYPES() \
+ x(none, 0) \
+ x(sb, 1) \
+ x(journal, 2) \
+ x(btree, 3) \
+ x(user, 4) \
+ x(cached, 5)
+
enum bch_data_type {
- BCH_DATA_NONE = 0,
- BCH_DATA_SB = 1,
- BCH_DATA_JOURNAL = 2,
- BCH_DATA_BTREE = 3,
- BCH_DATA_USER = 4,
- BCH_DATA_CACHED = 5,
- BCH_DATA_NR = 6,
+#define x(t, n) BCH_DATA_##t,
+ BCH_DATA_TYPES()
+#undef x
+ BCH_DATA_NR
};
struct bch_replicas_entry_v0 {
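
Rewriting the enum as an x-macro list lets the same BCH_DATA_TYPES() definition generate both the enum and any parallel tables keyed by it. A sketch of what that enables; the string table below is an illustration, not necessarily part of this merge:

#define BCH_DATA_TYPES()	\
	x(none,		0)	\
	x(sb,		1)	\
	x(journal,	2)	\
	x(btree,	3)	\
	x(user,		4)	\
	x(cached,	5)

enum bch_data_type {
#define x(t, n) BCH_DATA_##t,
	BCH_DATA_TYPES()
#undef x
	BCH_DATA_NR
};

/* A parallel name table generated from the same list: */
static const char * const bch_data_type_strs[] = {
#define x(t, n) [BCH_DATA_##t] = #t,
	BCH_DATA_TYPES()
#undef x
};
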
diff --git a/fs/bcachefs/bset.c b/fs/bcachefs/bset.c
index 6fc91e6a35e8..f7c2841ed8a7 100644
--- a/fs/bcachefs/bset.c
+++ b/fs/bcachefs/bset.c
@@ -313,44 +313,6 @@ struct rw_aux_tree {
struct bpos k;
};
-/*
- * BSET_CACHELINE was originally intended to match the hardware cacheline size -
- * it used to be 64, but I realized the lookup code would touch slightly less
- * memory if it was 128.
- *
- * It definites the number of bytes (in struct bset) per struct bkey_float in
- * the auxiliar search tree - when we're done searching the bset_float tree we
- * have this many bytes left that we do a linear search over.
- *
- * Since (after level 5) every level of the bset_tree is on a new cacheline,
- * we're touching one fewer cacheline in the bset tree in exchange for one more
- * cacheline in the linear search - but the linear search might stop before it
- * gets to the second cacheline.
- */
-
-#define BSET_CACHELINE 128
-
-/* Space required for the btree node keys */
-static inline size_t btree_keys_bytes(struct btree *b)
-{
- return PAGE_SIZE << b->page_order;
-}
-
-static inline size_t btree_keys_cachelines(struct btree *b)
-{
- return btree_keys_bytes(b) / BSET_CACHELINE;
-}
-
-static inline size_t btree_aux_data_bytes(struct btree *b)
-{
- return btree_keys_cachelines(b) * 8;
-}
-
-static inline size_t btree_aux_data_u64s(struct btree *b)
-{
- return btree_aux_data_bytes(b) / sizeof(u64);
-}
-
static unsigned bset_aux_tree_buf_end(const struct bset_tree *t)
{
BUG_ON(t->aux_data_offset == U16_MAX);
@@ -426,29 +388,6 @@ static void bset_aux_tree_verify(struct btree *b)
#endif
}
-/* Memory allocation */
-
-void bch2_btree_keys_free(struct btree *b)
-{
- vfree(b->aux_data);
- b->aux_data = NULL;
-}
-
-#ifndef PAGE_KERNEL_EXEC
-# define PAGE_KERNEL_EXEC PAGE_KERNEL
-#endif
-
-int bch2_btree_keys_alloc(struct btree *b, unsigned page_order, gfp_t gfp)
-{
- b->page_order = page_order;
- b->aux_data = __vmalloc(btree_aux_data_bytes(b), gfp,
- PAGE_KERNEL_EXEC);
- if (!b->aux_data)
- return -ENOMEM;
-
- return 0;
-}
-
void bch2_btree_keys_init(struct btree *b, bool *expensive_debug_checks)
{
unsigned i;
diff --git a/fs/bcachefs/bset.h b/fs/bcachefs/bset.h
index 652ffed4adfb..5921cf689105 100644
--- a/fs/bcachefs/bset.h
+++ b/fs/bcachefs/bset.h
@@ -184,6 +184,38 @@ static inline enum bset_aux_tree_type bset_aux_tree_type(const struct bset_tree
}
}
+/*
+ * BSET_CACHELINE was originally intended to match the hardware cacheline size -
+ * it used to be 64, but I realized the lookup code would touch slightly less
+ * memory if it was 128.
+ *
+ * It definites the number of bytes (in struct bset) per struct bkey_float in
+ * the auxiliar search tree - when we're done searching the bset_float tree we
+ * have this many bytes left that we do a linear search over.
+ *
+ * Since (after level 5) every level of the bset_tree is on a new cacheline,
+ * we're touching one fewer cacheline in the bset tree in exchange for one more
+ * cacheline in the linear search - but the linear search might stop before it
+ * gets to the second cacheline.
+ */
+
+#define BSET_CACHELINE 128
+
+static inline size_t btree_keys_cachelines(struct btree *b)
+{
+ return (1U << b->byte_order) / BSET_CACHELINE;
+}
+
+static inline size_t btree_aux_data_bytes(struct btree *b)
+{
+ return btree_keys_cachelines(b) * 8;
+}
+
+static inline size_t btree_aux_data_u64s(struct btree *b)
+{
+ return btree_aux_data_bytes(b) / sizeof(u64);
+}
+
typedef void (*compiled_unpack_fn)(struct bkey *, const struct bkey_packed *);
static inline void
@@ -334,8 +366,6 @@ static inline struct bset *bset_next_set(struct btree *b,
return ((void *) i) + round_up(vstruct_bytes(i), block_bytes);
}
-void bch2_btree_keys_free(struct btree *);
-int bch2_btree_keys_alloc(struct btree *, unsigned, gfp_t);
void bch2_btree_keys_init(struct btree *, bool *);
void bch2_bset_init_first(struct btree *, struct bset *);
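
With page_order replaced by byte_order (set to ilog2(btree_bytes(c)) in the btree_cache.c hunk further down), the aux search tree sizing works in bytes rather than pages. A worked example under the assumption of a 256 KiB btree node:

#include <stdio.h>

#define BSET_CACHELINE	128

typedef unsigned long long u64;

int main(void)
{
	unsigned byte_order  = 18;			/* ilog2(256 KiB)   */
	size_t   node_bytes  = 1UL << byte_order;	/* 262144 bytes     */
	size_t   cachelines  = node_bytes / BSET_CACHELINE;	/* 2048     */
	size_t   aux_bytes   = cachelines * 8;		/* 16 KiB aux data  */
	size_t   aux_u64s    = aux_bytes / sizeof(u64);	/* 2048 u64s        */

	printf("%zu cachelines -> %zu aux bytes (%zu u64s)\n",
	       cachelines, aux_bytes, aux_u64s);
	return 0;
}
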
diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c
index d3addd3a8964..a0d570f3adf0 100644
--- a/fs/bcachefs/btree_cache.c
+++ b/fs/bcachefs/btree_cache.c
@@ -44,7 +44,8 @@ static void __btree_node_data_free(struct bch_fs *c, struct btree *b)
kvpfree(b->data, btree_bytes(c));
b->data = NULL;
- bch2_btree_keys_free(b);
+ vfree(b->aux_data);
+ b->aux_data = NULL;
}
static void btree_node_data_free(struct bch_fs *c, struct btree *b)
@@ -72,7 +73,11 @@ static const struct rhashtable_params bch_btree_cache_params = {
.obj_cmpfn = bch2_btree_cache_cmp_fn,
};
-static int __btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp)
+#ifndef PAGE_KERNEL_EXEC
+# define PAGE_KERNEL_EXEC PAGE_KERNEL
+#endif
+
+static int btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp)
{
BUG_ON(b->data || b->aux_data);
@@ -80,7 +85,9 @@ static int __btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp)
if (!b->data)
return -ENOMEM;
- if (bch2_btree_keys_alloc(b, btree_page_order(c), gfp)) {
+ b->aux_data = __vmalloc(btree_aux_data_bytes(b), gfp,
+ PAGE_KERNEL_EXEC);
+ if (!b->aux_data) {
kvpfree(b->data, btree_bytes(c));
b->data = NULL;
return -ENOMEM;
@@ -89,21 +96,9 @@ static int __btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp)
return 0;
}
-static void btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp)
-{
- struct btree_cache *bc = &c->btree_cache;
-
- if (!__btree_node_data_alloc(c, b, gfp)) {
- bc->used++;
- list_move(&b->list, &bc->freeable);
- } else {
- list_move(&b->list, &bc->freed);
- }
-}
-
-static struct btree *btree_node_mem_alloc(struct bch_fs *c, gfp_t gfp)
+static struct btree *__btree_node_mem_alloc(struct bch_fs *c)
{
- struct btree *b = kzalloc(sizeof(struct btree), gfp);
+ struct btree *b = kzalloc(sizeof(struct btree), GFP_KERNEL);
if (!b)
return NULL;
@@ -111,9 +106,25 @@ static struct btree *btree_node_mem_alloc(struct bch_fs *c, gfp_t gfp)
six_lock_init(&b->c.lock);
INIT_LIST_HEAD(&b->list);
INIT_LIST_HEAD(&b->write_blocked);
+ b->byte_order = ilog2(btree_bytes(c));
+ return b;
+}
- btree_node_data_alloc(c, b, gfp);
- return b->data ? b : NULL;
+static struct btree *btree_node_mem_alloc(struct bch_fs *c)
+{
+ struct btree_cache *bc = &c->btree_cache;
+ struct btree *b = __btree_node_mem_alloc(c);
+ if (!b)
+ return NULL;
+
+ if (btree_node_data_alloc(c, b, GFP_KERNEL)) {
+ kfree(b);
+ return NULL;
+ }
+
+ bc->used++;
+ list_add(&b->list, &bc->freeable);
+ return b;
}
/* Btree in memory cache - hash table */
@@ -124,6 +135,8 @@ void bch2_btree_node_hash_remove(struct btree_cache *bc, struct btree *b)
/* Cause future lookups for this node to fail: */
b->hash_val = 0;
+
+ six_lock_wakeup_all(&b->c.lock);
}
int __bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b)
@@ -402,7 +415,7 @@ int bch2_fs_btree_cache_init(struct bch_fs *c)
bch2_recalc_btree_reserve(c);
for (i = 0; i < bc->reserve; i++)
- if (!btree_node_mem_alloc(c, GFP_KERNEL)) {
+ if (!btree_node_mem_alloc(c)) {
ret = -ENOMEM;
goto out;
}
@@ -418,7 +431,7 @@ int bch2_fs_btree_cache_init(struct bch_fs *c)
goto out;
}
- c->verify_data = btree_node_mem_alloc(c, GFP_KERNEL);
+ c->verify_data = btree_node_mem_alloc(c);
if (!c->verify_data) {
ret = -ENOMEM;
goto out;
@@ -550,21 +563,16 @@ got_node:
mutex_unlock(&bc->lock);
if (!b) {
- b = kzalloc(sizeof(struct btree), GFP_KERNEL);
+ b = __btree_node_mem_alloc(c);
if (!b)
goto err;
- bkey_btree_ptr_init(&b->key);
- six_lock_init(&b->c.lock);
- INIT_LIST_HEAD(&b->list);
- INIT_LIST_HEAD(&b->write_blocked);
-
BUG_ON(!six_trylock_intent(&b->c.lock));
BUG_ON(!six_trylock_write(&b->c.lock));
}
if (!b->data) {
- if (__btree_node_data_alloc(c, b, __GFP_NOWARN|GFP_KERNEL))
+ if (btree_node_data_alloc(c, b, __GFP_NOWARN|GFP_KERNEL))
goto err;
mutex_lock(&bc->lock);
diff --git a/fs/bcachefs/btree_cache.h b/fs/bcachefs/btree_cache.h
index 2160012c734f..d0d3a85bb8be 100644
--- a/fs/bcachefs/btree_cache.h
+++ b/fs/bcachefs/btree_cache.h
@@ -79,14 +79,9 @@ static inline size_t btree_max_u64s(struct bch_fs *c)
return (btree_bytes(c) - sizeof(struct btree_node)) / sizeof(u64);
}
-static inline size_t btree_page_order(struct bch_fs *c)
-{
- return get_order(btree_bytes(c));
-}
-
static inline size_t btree_pages(struct bch_fs *c)
{
- return 1 << btree_page_order(c);
+ return btree_bytes(c) / PAGE_SIZE;
}
static inline unsigned btree_blocks(struct bch_fs *c)
diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c
index 8771ef1f07cc..4f581130270c 100644
--- a/fs/bcachefs/btree_gc.c
+++ b/fs/bcachefs/btree_gc.c
@@ -109,7 +109,7 @@ static int bch2_gc_mark_key(struct bch_fs *c, struct bkey_s_c k,
atomic64_set(&c->key_version, k.k->version.lo);
if (test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) ||
- fsck_err_on(!bch2_bkey_replicas_marked(c, k, false), c,
+ fsck_err_on(!bch2_bkey_replicas_marked(c, k), c,
"superblock not marked as containing replicas (type %u)",
k.k->type)) {
ret = bch2_mark_bkey_replicas(c, k);
@@ -433,16 +433,16 @@ void bch2_mark_dev_superblock(struct bch_fs *c, struct bch_dev *ca,
if (offset == BCH_SB_SECTOR)
mark_metadata_sectors(c, ca, 0, BCH_SB_SECTOR,
- BCH_DATA_SB, flags);
+ BCH_DATA_sb, flags);
mark_metadata_sectors(c, ca, offset,
offset + (1 << layout->sb_max_size_bits),
- BCH_DATA_SB, flags);
+ BCH_DATA_sb, flags);
}
for (i = 0; i < ca->journal.nr; i++) {
b = ca->journal.buckets[i];
- bch2_mark_metadata_bucket(c, ca, b, BCH_DATA_JOURNAL,
+ bch2_mark_metadata_bucket(c, ca, b, BCH_DATA_journal,
ca->mi.bucket_size,
gc_phase(GC_PHASE_SB), flags);
}
@@ -617,8 +617,11 @@ static int bch2_gc_done(struct bch_fs *c,
copy_stripe_field(block_sectors[i],
"block_sectors[%u]", i);
- if (dst->alive)
+ if (dst->alive) {
+ spin_lock(&c->ec_stripes_heap_lock);
bch2_stripes_heap_insert(c, dst, dst_iter.pos);
+ spin_unlock(&c->ec_stripes_heap_lock);
+ }
genradix_iter_advance(&dst_iter, &c->stripes[0]);
genradix_iter_advance(&src_iter, &c->stripes[1]);
@@ -673,8 +676,8 @@ static int bch2_gc_done(struct bch_fs *c,
char buf[80];
if (metadata_only &&
- (e->data_type == BCH_DATA_USER ||
- e->data_type == BCH_DATA_CACHED))
+ (e->data_type == BCH_DATA_user ||
+ e->data_type == BCH_DATA_cached))
continue;
bch2_replicas_entry_to_text(&PBUF(buf), e);
@@ -759,8 +762,8 @@ static int bch2_gc_start(struct bch_fs *c,
d->gen_valid = s->gen_valid;
if (metadata_only &&
- (s->mark.data_type == BCH_DATA_USER ||
- s->mark.data_type == BCH_DATA_CACHED)) {
+ (s->mark.data_type == BCH_DATA_user ||
+ s->mark.data_type == BCH_DATA_cached)) {
d->_mark = s->mark;
d->_mark.owned_by_allocator = 0;
}
@@ -949,8 +952,10 @@ int bch2_gc_gens(struct bch_fs *c)
for (i = 0; i < BTREE_ID_NR; i++)
if (btree_node_type_needs_gc(i)) {
ret = bch2_gc_btree_gens(c, i);
- if (ret)
+ if (ret) {
+ bch_err(c, "error recalculating oldest_gen: %i", ret);
goto err;
+ }
}
for_each_member_device(ca, c, i) {
@@ -961,6 +966,8 @@ int bch2_gc_gens(struct bch_fs *c)
g->oldest_gen = g->gc_gen;
up_read(&ca->bucket_lock);
}
+
+ c->gc_count++;
err:
up_read(&c->gc_lock);
return ret;
diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c
index c81783ed9400..887e40574c93 100644
--- a/fs/bcachefs/btree_io.c
+++ b/fs/bcachefs/btree_io.c
@@ -57,25 +57,25 @@ static void set_needs_whiteout(struct bset *i, int v)
k->needs_whiteout = v;
}
-static void btree_bounce_free(struct bch_fs *c, unsigned order,
+static void btree_bounce_free(struct bch_fs *c, size_t size,
bool used_mempool, void *p)
{
if (used_mempool)
mempool_free(p, &c->btree_bounce_pool);
else
- vpfree(p, PAGE_SIZE << order);
+ vpfree(p, size);
}
-static void *btree_bounce_alloc(struct bch_fs *c, unsigned order,
+static void *btree_bounce_alloc(struct bch_fs *c, size_t size,
bool *used_mempool)
{
unsigned flags = memalloc_nofs_save();
void *p;
- BUG_ON(order > btree_page_order(c));
+ BUG_ON(size > btree_bytes(c));
*used_mempool = false;
- p = (void *) __get_free_pages(__GFP_NOWARN|GFP_NOWAIT, order);
+ p = vpmalloc(size, __GFP_NOWARN|GFP_NOWAIT);
if (!p) {
*used_mempool = true;
p = mempool_alloc(&c->btree_bounce_pool, GFP_NOIO);
@@ -125,16 +125,14 @@ static void bch2_sort_whiteouts(struct bch_fs *c, struct btree *b)
{
struct bkey_packed *new_whiteouts, **ptrs, **ptrs_end, *k;
bool used_mempool = false;
- unsigned order;
+ size_t bytes = b->whiteout_u64s * sizeof(u64);
if (!b->whiteout_u64s)
return;
- order = get_order(b->whiteout_u64s * sizeof(u64));
+ new_whiteouts = btree_bounce_alloc(c, bytes, &used_mempool);
- new_whiteouts = btree_bounce_alloc(c, order, &used_mempool);
-
- ptrs = ptrs_end = ((void *) new_whiteouts + (PAGE_SIZE << order));
+ ptrs = ptrs_end = ((void *) new_whiteouts + bytes);
for (k = unwritten_whiteouts_start(c, b);
k != unwritten_whiteouts_end(c, b);
@@ -158,7 +156,7 @@ static void bch2_sort_whiteouts(struct bch_fs *c, struct btree *b)
memcpy_u64s(unwritten_whiteouts_start(c, b),
new_whiteouts, b->whiteout_u64s);
- btree_bounce_free(c, order, used_mempool, new_whiteouts);
+ btree_bounce_free(c, bytes, used_mempool, new_whiteouts);
}
static bool should_compact_bset(struct btree *b, struct bset_tree *t,
@@ -187,7 +185,7 @@ static bool bch2_compact_extent_whiteouts(struct bch_fs *c,
struct bkey_packed *whiteouts = NULL;
struct bkey_packed *u_start, *u_pos;
struct sort_iter sort_iter;
- unsigned order, whiteout_u64s = 0, u64s;
+ unsigned bytes, whiteout_u64s = 0, u64s;
bool used_mempool, compacting = false;
BUG_ON(!btree_node_is_extents(b));
@@ -204,9 +202,9 @@ static bool bch2_compact_extent_whiteouts(struct bch_fs *c,
sort_iter_init(&sort_iter, b);
whiteout_u64s += b->whiteout_u64s;
- order = get_order(whiteout_u64s * sizeof(u64));
+ bytes = whiteout_u64s * sizeof(u64);
- whiteouts = btree_bounce_alloc(c, order, &used_mempool);
+ whiteouts = btree_bounce_alloc(c, bytes, &used_mempool);
u_start = u_pos = whiteouts;
memcpy_u64s(u_pos, unwritten_whiteouts_start(c, b),
@@ -306,7 +304,7 @@ static bool bch2_compact_extent_whiteouts(struct bch_fs *c,
unwritten_whiteouts_end(c, b),
true);
- btree_bounce_free(c, order, used_mempool, whiteouts);
+ btree_bounce_free(c, bytes, used_mempool, whiteouts);
bch2_btree_build_aux_trees(b);
@@ -401,7 +399,7 @@ static void btree_node_sort(struct bch_fs *c, struct btree *b,
struct bset *start_bset = bset(b, &b->set[start_idx]);
bool used_mempool = false;
u64 start_time, seq = 0;
- unsigned i, u64s = 0, order, shift = end_idx - start_idx - 1;
+ unsigned i, u64s = 0, bytes, shift = end_idx - start_idx - 1;
bool sorting_entire_node = start_idx == 0 &&
end_idx == b->nsets;
@@ -416,11 +414,11 @@ static void btree_node_sort(struct bch_fs *c, struct btree *b,
btree_bkey_last(b, t));
}
- order = sorting_entire_node
- ? btree_page_order(c)
- : get_order(__vstruct_bytes(struct btree_node, u64s));
+ bytes = sorting_entire_node
+ ? btree_bytes(c)
+ : __vstruct_bytes(struct btree_node, u64s);
- out = btree_bounce_alloc(c, order, &used_mempool);
+ out = btree_bounce_alloc(c, bytes, &used_mempool);
start_time = local_clock();
@@ -435,7 +433,7 @@ static void btree_node_sort(struct bch_fs *c, struct btree *b,
out->keys.u64s = cpu_to_le16(u64s);
- BUG_ON(vstruct_end(&out->keys) > (void *) out + (PAGE_SIZE << order));
+ BUG_ON(vstruct_end(&out->keys) > (void *) out + bytes);
if (sorting_entire_node)
bch2_time_stats_update(&c->times[BCH_TIME_btree_node_sort],
@@ -449,7 +447,7 @@ static void btree_node_sort(struct bch_fs *c, struct btree *b,
if (sorting_entire_node) {
unsigned u64s = le16_to_cpu(out->keys.u64s);
- BUG_ON(order != btree_page_order(c));
+ BUG_ON(bytes != btree_bytes(c));
/*
* Our temporary buffer is the same size as the btree node's
@@ -484,7 +482,7 @@ static void btree_node_sort(struct bch_fs *c, struct btree *b,
set_btree_bset_end(b, &b->set[start_idx]);
bch2_bset_set_no_aux_tree(b, &b->set[start_idx]);
- btree_bounce_free(c, order, used_mempool, out);
+ btree_bounce_free(c, bytes, used_mempool, out);
bch2_verify_btree_nr_keys(b);
}
@@ -620,7 +618,7 @@ static void bset_encrypt(struct bch_fs *c, struct bset *i, unsigned offset)
bch2_encrypt(c, BSET_CSUM_TYPE(i), nonce, &bn->flags,
bytes);
- nonce = nonce_add(nonce, round_up(bytes, CHACHA20_BLOCK_SIZE));
+ nonce = nonce_add(nonce, round_up(bytes, CHACHA_BLOCK_SIZE));
}
bch2_encrypt(c, BSET_CSUM_TYPE(i), nonce, i->_data,
@@ -917,6 +915,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry
struct sort_iter *iter;
struct btree_node *sorted;
struct bkey_packed *k;
+ struct bch_extent_ptr *ptr;
struct bset *i;
bool used_mempool, blacklisted;
unsigned u64s;
@@ -971,8 +970,10 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry
bset_encrypt(c, i, b->written << 9);
if (btree_node_is_extents(b) &&
- !BTREE_NODE_NEW_EXTENT_OVERWRITE(b->data))
+ !BTREE_NODE_NEW_EXTENT_OVERWRITE(b->data)) {
set_btree_node_old_extent_overwrite(b);
+ set_btree_node_need_rewrite(b);
+ }
sectors = vstruct_sectors(b->data, c->block_bits);
} else {
@@ -1040,7 +1041,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry
BTREE_ERR_WANT_RETRY, c, b, NULL,
"found bset signature after last bset");
- sorted = btree_bounce_alloc(c, btree_page_order(c), &used_mempool);
+ sorted = btree_bounce_alloc(c, btree_bytes(c), &used_mempool);
sorted->keys.u64s = 0;
set_btree_bset(b, b->set, &b->data->keys);
@@ -1058,7 +1059,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry
BUG_ON(b->nr.live_u64s != u64s);
- btree_bounce_free(c, btree_page_order(c), used_mempool, sorted);
+ btree_bounce_free(c, btree_bytes(c), used_mempool, sorted);
i = &b->data->keys;
for (k = i->start; k != vstruct_last(i);) {
@@ -1098,6 +1099,13 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry
set_needs_whiteout(btree_bset_first(b), true);
btree_node_reset_sib_u64s(b);
+
+ bkey_for_each_ptr(bch2_bkey_ptrs(bkey_i_to_s(&b->key)), ptr) {
+ struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
+
+ if (ca->mi.state != BCH_MEMBER_STATE_RW)
+ set_btree_node_need_rewrite(b);
+ }
out:
mempool_free(iter, &c->fill_iter);
return retry_read;
@@ -1139,7 +1147,8 @@ static void btree_node_read_work(struct work_struct *work)
bio->bi_status = BLK_STS_REMOVED;
}
start:
- bch2_dev_io_err_on(bio->bi_status, ca, "btree read");
+ bch2_dev_io_err_on(bio->bi_status, ca, "btree read: %s",
+ bch2_blk_status_to_str(bio->bi_status));
if (rb->have_ioref)
percpu_ref_put(&ca->io_ref);
rb->have_ioref = false;
@@ -1220,7 +1229,7 @@ void bch2_btree_node_read(struct bch_fs *c, struct btree *b,
set_btree_node_read_in_flight(b);
if (rb->have_ioref) {
- this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_BTREE],
+ this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_btree],
bio_sectors(bio));
bio_set_dev(bio, ca->disk_sb.bdev);
@@ -1392,7 +1401,7 @@ static void btree_node_write_work(struct work_struct *work)
struct btree *b = wbio->wbio.bio.bi_private;
btree_bounce_free(c,
- wbio->wbio.order,
+ wbio->bytes,
wbio->wbio.used_mempool,
wbio->data);
@@ -1423,8 +1432,8 @@ static void btree_node_write_endio(struct bio *bio)
if (wbio->have_ioref)
bch2_latency_acct(ca, wbio->submit_time, WRITE);
- if (bio->bi_status == BLK_STS_REMOVED ||
- bch2_dev_io_err_on(bio->bi_status, ca, "btree write") ||
+ if (bch2_dev_io_err_on(bio->bi_status, ca, "btree write: %s",
+ bch2_blk_status_to_str(bio->bi_status)) ||
bch2_meta_write_fault("btree")) {
spin_lock_irqsave(&c->btree_write_error_lock, flags);
bch2_dev_list_add_dev(&orig->failed, wbio->dev);
@@ -1475,7 +1484,7 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b,
struct bch_extent_ptr *ptr;
struct sort_iter sort_iter;
struct nonce nonce;
- unsigned bytes_to_write, sectors_to_write, order, bytes, u64s;
+ unsigned bytes_to_write, sectors_to_write, bytes, u64s;
u64 seq = 0;
bool used_mempool;
unsigned long old, new;
@@ -1545,8 +1554,7 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b,
seq = max(seq, le64_to_cpu(i->journal_seq));
}
- order = get_order(bytes);
- data = btree_bounce_alloc(c, order, &used_mempool);
+ data = btree_bounce_alloc(c, bytes, &used_mempool);
if (!b->written) {
bn = data;
@@ -1658,7 +1666,7 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b,
struct btree_write_bio, wbio.bio);
wbio_init(&wbio->wbio.bio);
wbio->data = data;
- wbio->wbio.order = order;
+ wbio->bytes = bytes;
wbio->wbio.used_mempool = used_mempool;
wbio->wbio.bio.bi_opf = REQ_OP_WRITE|REQ_META;
wbio->wbio.bio.bi_end_io = btree_node_write_endio;
@@ -1689,13 +1697,13 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b,
b->written += sectors_to_write;
/* XXX: submitting IO with btree locks held: */
- bch2_submit_wbio_replicas(&wbio->wbio, c, BCH_DATA_BTREE, &k.key);
+ bch2_submit_wbio_replicas(&wbio->wbio, c, BCH_DATA_btree, &k.key);
return;
err:
set_btree_node_noevict(b);
b->written += sectors_to_write;
nowrite:
- btree_bounce_free(c, order, used_mempool, data);
+ btree_bounce_free(c, bytes, used_mempool, data);
btree_node_write_done(c, b);
}
@@ -1826,9 +1834,8 @@ void bch2_btree_verify_flushed(struct bch_fs *c)
rcu_read_unlock();
}
-ssize_t bch2_dirty_btree_nodes_print(struct bch_fs *c, char *buf)
+void bch2_dirty_btree_nodes_to_text(struct printbuf *out, struct bch_fs *c)
{
- struct printbuf out = _PBUF(buf, PAGE_SIZE);
struct bucket_table *tbl;
struct rhash_head *pos;
struct btree *b;
@@ -1841,7 +1848,7 @@ ssize_t bch2_dirty_btree_nodes_print(struct bch_fs *c, char *buf)
if (!(flags & (1 << BTREE_NODE_dirty)))
continue;
- pr_buf(&out, "%p d %u n %u l %u w %u b %u r %u:%lu\n",
+ pr_buf(out, "%p d %u n %u l %u w %u b %u r %u:%lu\n",
b,
(flags & (1 << BTREE_NODE_dirty)) != 0,
(flags & (1 << BTREE_NODE_need_write)) != 0,
@@ -1852,6 +1859,4 @@ ssize_t bch2_dirty_btree_nodes_print(struct bch_fs *c, char *buf)
b->will_make_reachable & 1);
}
rcu_read_unlock();
-
- return out.pos - buf;
}
diff --git a/fs/bcachefs/btree_io.h b/fs/bcachefs/btree_io.h
index f3d7ec749b61..66ebdd39f5b3 100644
--- a/fs/bcachefs/btree_io.h
+++ b/fs/bcachefs/btree_io.h
@@ -23,8 +23,9 @@ struct btree_read_bio {
};
struct btree_write_bio {
- void *data;
struct work_struct work;
+ void *data;
+ unsigned bytes;
struct bch_write_bio wbio;
};
@@ -139,7 +140,7 @@ do { \
void bch2_btree_flush_all_reads(struct bch_fs *);
void bch2_btree_flush_all_writes(struct bch_fs *);
void bch2_btree_verify_flushed(struct bch_fs *);
-ssize_t bch2_dirty_btree_nodes_print(struct bch_fs *, char *);
+void bch2_dirty_btree_nodes_to_text(struct printbuf *, struct bch_fs *);
static inline void compat_bformat(unsigned level, enum btree_id btree_id,
unsigned version, unsigned big_endian,
diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h
index 16c4d058358b..683b416ef427 100644
--- a/fs/bcachefs/btree_types.h
+++ b/fs/bcachefs/btree_types.h
@@ -94,7 +94,7 @@ struct btree {
struct btree_nr_keys nr;
u16 sib_u64s[2];
u16 whiteout_u64s;
- u8 page_order;
+ u8 byte_order;
u8 unpack_fn_len;
/*
@@ -409,6 +409,7 @@ enum btree_flags {
BTREE_NODE_dying,
BTREE_NODE_fake,
BTREE_NODE_old_extent_overwrite,
+ BTREE_NODE_need_rewrite,
};
BTREE_FLAG(read_in_flight);
@@ -423,6 +424,7 @@ BTREE_FLAG(just_written);
BTREE_FLAG(dying);
BTREE_FLAG(fake);
BTREE_FLAG(old_extent_overwrite);
+BTREE_FLAG(need_rewrite);
static inline struct btree_write *btree_current_write(struct btree *b)
{
@@ -593,7 +595,6 @@ static inline bool btree_iter_is_extents(struct btree_iter *iter)
enum btree_trigger_flags {
__BTREE_TRIGGER_NORUN, /* Don't run triggers at all */
- __BTREE_TRIGGER_NOOVERWRITES, /* Don't run triggers on overwrites */
__BTREE_TRIGGER_INSERT,
__BTREE_TRIGGER_OVERWRITE,
@@ -606,7 +607,6 @@ enum btree_trigger_flags {
};
#define BTREE_TRIGGER_NORUN (1U << __BTREE_TRIGGER_NORUN)
-#define BTREE_TRIGGER_NOOVERWRITES (1U << __BTREE_TRIGGER_NOOVERWRITES)
#define BTREE_TRIGGER_INSERT (1U << __BTREE_TRIGGER_INSERT)
#define BTREE_TRIGGER_OVERWRITE (1U << __BTREE_TRIGGER_OVERWRITE)
diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c
index a8cd6ffb6c7c..a2604b0ce2d8 100644
--- a/fs/bcachefs/btree_update_interior.c
+++ b/fs/bcachefs/btree_update_interior.c
@@ -26,7 +26,7 @@
/*
* Verify that child nodes correctly span parent node's range:
*/
-static void btree_node_interior_verify(struct btree *b)
+static void btree_node_interior_verify(struct bch_fs *c, struct btree *b)
{
#ifdef CONFIG_BCACHEFS_DEBUG
struct bpos next_node = b->data->min_key;
@@ -37,6 +37,9 @@ static void btree_node_interior_verify(struct btree *b)
BUG_ON(!b->c.level);
+ if (!test_bit(BCH_FS_BTREE_INTERIOR_REPLAY_DONE, &c->flags))
+ return;
+
bch2_btree_node_iter_init_from_start(&iter, b);
while (1) {
@@ -135,8 +138,6 @@ static void __btree_node_free(struct bch_fs *c, struct btree *b)
bch2_btree_node_hash_remove(&c->btree_cache, b);
- six_lock_wakeup_all(&b->c.lock);
-
mutex_lock(&c->btree_cache.lock);
list_move(&b->list, &c->btree_cache.freeable);
mutex_unlock(&c->btree_cache.lock);
@@ -290,8 +291,10 @@ static struct btree *bch2_btree_node_alloc(struct btree_update *as, unsigned lev
SET_BTREE_NODE_NEW_EXTENT_OVERWRITE(b->data, true);
if (btree_node_is_extents(b) &&
- !BTREE_NODE_NEW_EXTENT_OVERWRITE(b->data))
+ !BTREE_NODE_NEW_EXTENT_OVERWRITE(b->data)) {
set_btree_node_old_extent_overwrite(b);
+ set_btree_node_need_rewrite(b);
+ }
bch2_btree_build_aux_trees(b);
@@ -1118,8 +1121,8 @@ static struct btree *__btree_split_node(struct btree_update *as,
bch2_verify_btree_nr_keys(n2);
if (n1->c.level) {
- btree_node_interior_verify(n1);
- btree_node_interior_verify(n2);
+ btree_node_interior_verify(as->c, n1);
+ btree_node_interior_verify(as->c, n2);
}
return n2;
@@ -1178,7 +1181,7 @@ static void btree_split_insert_keys(struct btree_update *as, struct btree *b,
BUG_ON(b->nsets != 1 ||
b->nr.live_u64s != le16_to_cpu(btree_bset_first(b)->u64s));
- btree_node_interior_verify(b);
+ btree_node_interior_verify(as->c, b);
}
static void btree_split(struct btree_update *as, struct btree *b,
@@ -1376,7 +1379,7 @@ void bch2_btree_insert_node(struct btree_update *as, struct btree *b,
bch2_btree_node_unlock_write(b, iter);
- btree_node_interior_verify(b);
+ btree_node_interior_verify(c, b);
/*
* when called from the btree_split path the new nodes aren't added to
@@ -1864,7 +1867,7 @@ int bch2_btree_node_update_key(struct bch_fs *c, struct btree_iter *iter,
new_hash = bch2_btree_node_mem_alloc(c);
}
-
+retry:
as = bch2_btree_update_start(iter->trans, iter->btree_id,
parent ? btree_update_reserve_required(c, parent) : 0,
BTREE_INSERT_NOFAIL|
@@ -1877,16 +1880,17 @@ int bch2_btree_node_update_key(struct bch_fs *c, struct btree_iter *iter,
if (ret == -EAGAIN)
ret = -EINTR;
- if (ret != -EINTR)
- goto err;
+ if (ret == -EINTR) {
+ bch2_trans_unlock(iter->trans);
+ up_read(&c->gc_lock);
+ closure_sync(&cl);
+ down_read(&c->gc_lock);
- bch2_trans_unlock(iter->trans);
- up_read(&c->gc_lock);
- closure_sync(&cl);
- down_read(&c->gc_lock);
+ if (bch2_trans_relock(iter->trans))
+ goto retry;
+ }
- if (!bch2_trans_relock(iter->trans))
- goto err;
+ goto err;
}
ret = bch2_mark_bkey_replicas(c, bkey_i_to_s_c(new_key));
@@ -1943,6 +1947,7 @@ void bch2_btree_root_alloc(struct bch_fs *c, enum btree_id id)
bch2_btree_cache_cannibalize_unlock(c);
set_btree_node_fake(b);
+ set_btree_node_need_rewrite(b);
b->c.level = 0;
b->c.btree_id = id;
@@ -1969,22 +1974,19 @@ void bch2_btree_root_alloc(struct bch_fs *c, enum btree_id id)
six_unlock_intent(&b->c.lock);
}
-ssize_t bch2_btree_updates_print(struct bch_fs *c, char *buf)
+void bch2_btree_updates_to_text(struct printbuf *out, struct bch_fs *c)
{
- struct printbuf out = _PBUF(buf, PAGE_SIZE);
struct btree_update *as;
mutex_lock(&c->btree_interior_update_lock);
list_for_each_entry(as, &c->btree_interior_update_list, list)
- pr_buf(&out, "%p m %u w %u r %u j %llu\n",
+ pr_buf(out, "%p m %u w %u r %u j %llu\n",
as,
as->mode,
as->nodes_written,
atomic_read(&as->cl.remaining) & CLOSURE_REMAINING_MASK,
as->journal.seq);
mutex_unlock(&c->btree_interior_update_lock);
-
- return out.pos - buf;
}
size_t bch2_btree_interior_updates_nr_pending(struct bch_fs *c)
diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h
index 4a5b9dcfbdd0..7668225e72c6 100644
--- a/fs/bcachefs/btree_update_interior.h
+++ b/fs/bcachefs/btree_update_interior.h
@@ -311,13 +311,13 @@ static inline void push_whiteout(struct bch_fs *c, struct btree *b,
static inline bool bch2_btree_node_insert_fits(struct bch_fs *c,
struct btree *b, unsigned u64s)
{
- if (unlikely(btree_node_fake(b)))
+ if (unlikely(btree_node_need_rewrite(b)))
return false;
return u64s <= bch_btree_keys_u64s_remaining(c, b);
}
-ssize_t bch2_btree_updates_print(struct bch_fs *, char *);
+void bch2_btree_updates_to_text(struct printbuf *, struct bch_fs *);
size_t bch2_btree_interior_updates_nr_pending(struct bch_fs *);
diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c
index 6e9688d0bb77..cd699c257244 100644
--- a/fs/bcachefs/btree_update_leaf.c
+++ b/fs/bcachefs/btree_update_leaf.c
@@ -264,30 +264,12 @@ static inline int bch2_trans_journal_res_get(struct btree_trans *trans,
static enum btree_insert_ret
btree_key_can_insert(struct btree_trans *trans,
struct btree_iter *iter,
- struct bkey_i *insert,
- unsigned *u64s)
+ unsigned u64s)
{
struct bch_fs *c = trans->c;
struct btree *b = iter_l(iter)->b;
- static enum btree_insert_ret ret;
- if (unlikely(btree_node_fake(b)))
- return BTREE_INSERT_BTREE_NODE_FULL;
-
- /*
- * old bch2_extent_sort_fix_overlapping() algorithm won't work with new
- * style extent updates:
- */
- if (unlikely(btree_node_old_extent_overwrite(b)))
- return BTREE_INSERT_BTREE_NODE_FULL;
-
- ret = !btree_iter_is_extents(iter)
- ? BTREE_INSERT_OK
- : bch2_extent_can_insert(trans, iter, insert);
- if (ret)
- return ret;
-
- if (*u64s > bch_btree_keys_u64s_remaining(c, b))
+ if (!bch2_btree_node_insert_fits(c, b, u64s))
return BTREE_INSERT_BTREE_NODE_FULL;
return BTREE_INSERT_OK;
@@ -296,8 +278,7 @@ btree_key_can_insert(struct btree_trans *trans,
static enum btree_insert_ret
btree_key_can_insert_cached(struct btree_trans *trans,
struct btree_iter *iter,
- struct bkey_i *insert,
- unsigned *u64s)
+ unsigned u64s)
{
struct bkey_cached *ck = (void *) iter->l[0].b;
unsigned new_u64s;
@@ -305,10 +286,10 @@ btree_key_can_insert_cached(struct btree_trans *trans,
BUG_ON(iter->level);
- if (*u64s <= ck->u64s)
+ if (u64s <= ck->u64s)
return BTREE_INSERT_OK;
- new_u64s = roundup_pow_of_two(*u64s);
+ new_u64s = roundup_pow_of_two(u64s);
new_k = krealloc(ck->k, new_u64s * sizeof(u64), GFP_NOFS);
if (!new_k)
return -ENOMEM;
@@ -414,8 +395,8 @@ bch2_trans_commit_write_locked(struct btree_trans *trans,
u64s += i->k->k.u64s;
ret = btree_iter_type(i->iter) != BTREE_ITER_CACHED
- ? btree_key_can_insert(trans, i->iter, i->k, &u64s)
- : btree_key_can_insert_cached(trans, i->iter, i->k, &u64s);
+ ? btree_key_can_insert(trans, i->iter, u64s)
+ : btree_key_can_insert_cached(trans, i->iter, u64s);
if (ret) {
*stopped_at = i;
return ret;
@@ -733,6 +714,11 @@ static int extent_update_to_keys(struct btree_trans *trans,
struct bkey_i *insert)
{
struct btree_iter *iter;
+ int ret;
+
+ ret = bch2_extent_can_insert(trans, orig_iter, insert);
+ if (ret)
+ return ret;
if (bkey_deleted(&insert->k))
return 0;
diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c
index 0ec194b93c71..97a8af31ded1 100644
--- a/fs/bcachefs/buckets.c
+++ b/fs/bcachefs/buckets.c
@@ -133,13 +133,13 @@ void bch2_fs_usage_initialize(struct bch_fs *c)
cpu_replicas_entry(&c->replicas, i);
switch (e->data_type) {
- case BCH_DATA_BTREE:
+ case BCH_DATA_btree:
usage->btree += usage->replicas[i];
break;
- case BCH_DATA_USER:
+ case BCH_DATA_user:
usage->data += usage->replicas[i];
break;
- case BCH_DATA_CACHED:
+ case BCH_DATA_cached:
usage->cached += usage->replicas[i];
break;
}
@@ -179,7 +179,7 @@ out_pool:
return ret;
}
-struct bch_dev_usage bch2_dev_usage_read(struct bch_fs *c, struct bch_dev *ca)
+struct bch_dev_usage bch2_dev_usage_read(struct bch_dev *ca)
{
struct bch_dev_usage ret;
@@ -367,7 +367,7 @@ static inline int is_fragmented_bucket(struct bucket_mark m,
struct bch_dev *ca)
{
if (!m.owned_by_allocator &&
- m.data_type == BCH_DATA_USER &&
+ m.data_type == BCH_DATA_user &&
bucket_sectors_used(m))
return max_t(int, 0, (int) ca->mi.bucket_size -
bucket_sectors_used(m));
@@ -382,7 +382,7 @@ static inline int bucket_stripe_sectors(struct bucket_mark m)
static inline enum bch_data_type bucket_type(struct bucket_mark m)
{
return m.cached_sectors && !m.dirty_sectors
- ? BCH_DATA_CACHED
+ ? BCH_DATA_cached
: m.data_type;
}
@@ -435,7 +435,7 @@ static inline void account_bucket(struct bch_fs_usage *fs_usage,
enum bch_data_type type,
int nr, s64 size)
{
- if (type == BCH_DATA_SB || type == BCH_DATA_JOURNAL)
+ if (type == BCH_DATA_sb || type == BCH_DATA_journal)
fs_usage->hidden += size;
dev_usage->buckets[type] += nr;
@@ -472,7 +472,7 @@ static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca,
u->sectors[old.data_type] -= old.dirty_sectors;
u->sectors[new.data_type] += new.dirty_sectors;
- u->sectors[BCH_DATA_CACHED] +=
+ u->sectors[BCH_DATA_cached] +=
(int) new.cached_sectors - (int) old.cached_sectors;
u->sectors_fragmented +=
is_fragmented_bucket(new, ca) - is_fragmented_bucket(old, ca);
@@ -520,13 +520,13 @@ static inline int update_replicas(struct bch_fs *c,
return 0;
switch (r->data_type) {
- case BCH_DATA_BTREE:
+ case BCH_DATA_btree:
fs_usage->btree += sectors;
break;
- case BCH_DATA_USER:
+ case BCH_DATA_user:
fs_usage->data += sectors;
break;
- case BCH_DATA_CACHED:
+ case BCH_DATA_cached:
fs_usage->cached += sectors;
break;
}
@@ -713,7 +713,8 @@ void bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca,
preempt_enable();
}
-static int bch2_mark_alloc(struct bch_fs *c, struct bkey_s_c k,
+static int bch2_mark_alloc(struct bch_fs *c,
+ struct bkey_s_c old, struct bkey_s_c new,
struct bch_fs_usage *fs_usage,
u64 journal_seq, unsigned flags)
{
@@ -721,7 +722,11 @@ static int bch2_mark_alloc(struct bch_fs *c, struct bkey_s_c k,
struct bkey_alloc_unpacked u;
struct bch_dev *ca;
struct bucket *g;
- struct bucket_mark old, m;
+ struct bucket_mark old_m, m;
+
+ /* We don't do anything for deletions - do we?: */
+ if (new.k->type != KEY_TYPE_alloc)
+ return 0;
/*
* alloc btree is read in by bch2_alloc_read, not gc:
@@ -730,15 +735,15 @@ static int bch2_mark_alloc(struct bch_fs *c, struct bkey_s_c k,
!(flags & BTREE_TRIGGER_BUCKET_INVALIDATE))
return 0;
- ca = bch_dev_bkey_exists(c, k.k->p.inode);
+ ca = bch_dev_bkey_exists(c, new.k->p.inode);
- if (k.k->p.offset >= ca->mi.nbuckets)
+ if (new.k->p.offset >= ca->mi.nbuckets)
return 0;
- g = __bucket(ca, k.k->p.offset, gc);
- u = bch2_alloc_unpack(k);
+ g = __bucket(ca, new.k->p.offset, gc);
+ u = bch2_alloc_unpack(new);
- old = bucket_cmpxchg(g, m, ({
+ old_m = bucket_cmpxchg(g, m, ({
m.gen = u.gen;
m.data_type = u.data_type;
m.dirty_sectors = u.dirty_sectors;
@@ -751,7 +756,7 @@ static int bch2_mark_alloc(struct bch_fs *c, struct bkey_s_c k,
}));
if (!(flags & BTREE_TRIGGER_ALLOC_READ))
- bch2_dev_usage_update(c, ca, fs_usage, old, m, gc);
+ bch2_dev_usage_update(c, ca, fs_usage, old_m, m, gc);
g->io_time[READ] = u.read_time;
g->io_time[WRITE] = u.write_time;
@@ -764,11 +769,11 @@ static int bch2_mark_alloc(struct bch_fs *c, struct bkey_s_c k,
*/
if ((flags & BTREE_TRIGGER_BUCKET_INVALIDATE) &&
- old.cached_sectors) {
+ old_m.cached_sectors) {
update_cached_sectors(c, fs_usage, ca->dev_idx,
- -old.cached_sectors);
- trace_invalidate(ca, bucket_to_sector(ca, k.k->p.offset),
- old.cached_sectors);
+ -old_m.cached_sectors);
+ trace_invalidate(ca, bucket_to_sector(ca, new.k->p.offset),
+ old_m.cached_sectors);
}
return 0;
@@ -792,8 +797,8 @@ static int __bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca,
struct bucket_mark old, new;
bool overflow;
- BUG_ON(data_type != BCH_DATA_SB &&
- data_type != BCH_DATA_JOURNAL);
+ BUG_ON(data_type != BCH_DATA_sb &&
+ data_type != BCH_DATA_journal);
old = bucket_cmpxchg(g, new, ({
new.data_type = data_type;
@@ -824,8 +829,8 @@ void bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca,
unsigned sectors, struct gc_pos pos,
unsigned flags)
{
- BUG_ON(type != BCH_DATA_SB &&
- type != BCH_DATA_JOURNAL);
+ BUG_ON(type != BCH_DATA_sb &&
+ type != BCH_DATA_journal);
preempt_disable();
@@ -878,51 +883,46 @@ static s64 ptr_disk_sectors_delta(struct extent_ptr_decoded p,
}
static void bucket_set_stripe(struct bch_fs *c,
- const struct bch_stripe *v,
+ const struct bch_extent_ptr *ptr,
struct bch_fs_usage *fs_usage,
u64 journal_seq,
- unsigned flags)
+ unsigned flags,
+ bool enabled)
{
- bool enabled = !(flags & BTREE_TRIGGER_OVERWRITE);
bool gc = flags & BTREE_TRIGGER_GC;
- unsigned i;
+ struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
+ struct bucket *g = PTR_BUCKET(ca, ptr, gc);
+ struct bucket_mark new, old;
- for (i = 0; i < v->nr_blocks; i++) {
- const struct bch_extent_ptr *ptr = v->ptrs + i;
- struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
- struct bucket *g = PTR_BUCKET(ca, ptr, gc);
- struct bucket_mark new, old;
-
- old = bucket_cmpxchg(g, new, ({
- new.stripe = enabled;
- if (journal_seq) {
- new.journal_seq_valid = 1;
- new.journal_seq = journal_seq;
- }
- }));
+ old = bucket_cmpxchg(g, new, ({
+ new.stripe = enabled;
+ if (journal_seq) {
+ new.journal_seq_valid = 1;
+ new.journal_seq = journal_seq;
+ }
+ }));
- bch2_dev_usage_update(c, ca, fs_usage, old, new, gc);
+ bch2_dev_usage_update(c, ca, fs_usage, old, new, gc);
- /*
- * XXX write repair code for these, flag stripe as possibly bad
- */
- if (old.gen != ptr->gen)
- bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
- "stripe with stale pointer");
+ /*
+ * XXX write repair code for these, flag stripe as possibly bad
+ */
+ if (old.gen != ptr->gen)
+ bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
+ "stripe with stale pointer");
#if 0
- /*
- * We'd like to check for these, but these checks don't work
- * yet:
- */
- if (old.stripe && enabled)
- bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
- "multiple stripes using same bucket");
-
- if (!old.stripe && !enabled)
- bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
- "deleting stripe but bucket not marked as stripe bucket");
+ /*
+ * We'd like to check for these, but these checks don't work
+ * yet:
+ */
+ if (old.stripe && enabled)
+ bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
+ "multiple stripes using same bucket");
+
+ if (!old.stripe && !enabled)
+ bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
+ "deleting stripe but bucket not marked as stripe bucket");
#endif
- }
}
static int __mark_pointer(struct bch_fs *c, struct bkey_s_c k,
@@ -1064,8 +1064,7 @@ static int bch2_mark_stripe_ptr(struct bch_fs *c,
{
bool gc = flags & BTREE_TRIGGER_GC;
struct stripe *m;
- unsigned old, new;
- int blocks_nonempty_delta;
+ unsigned i, blocks_nonempty = 0;
m = genradix_ptr(&c->stripes[gc], p.idx);
@@ -1084,31 +1083,30 @@ static int bch2_mark_stripe_ptr(struct bch_fs *c,
*nr_parity = m->nr_redundant;
*r = m->r;
- old = m->block_sectors[p.block];
m->block_sectors[p.block] += sectors;
- new = m->block_sectors[p.block];
- blocks_nonempty_delta = (int) !!new - (int) !!old;
- if (blocks_nonempty_delta) {
- m->blocks_nonempty += blocks_nonempty_delta;
+ for (i = 0; i < m->nr_blocks; i++)
+ blocks_nonempty += m->block_sectors[i] != 0;
+ if (m->blocks_nonempty != blocks_nonempty) {
+ m->blocks_nonempty = blocks_nonempty;
if (!gc)
bch2_stripes_heap_update(c, m, p.idx);
}
- m->dirty = true;
-
spin_unlock(&c->ec_stripes_heap_lock);
return 0;
}
-static int bch2_mark_extent(struct bch_fs *c, struct bkey_s_c k,
+static int bch2_mark_extent(struct bch_fs *c,
+ struct bkey_s_c old, struct bkey_s_c new,
unsigned offset, s64 sectors,
enum bch_data_type data_type,
struct bch_fs_usage *fs_usage,
unsigned journal_seq, unsigned flags)
{
+ struct bkey_s_c k = flags & BTREE_TRIGGER_INSERT ? new : old;
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
const union bch_extent_entry *entry;
struct extent_ptr_decoded p;
@@ -1124,7 +1122,7 @@ static int bch2_mark_extent(struct bch_fs *c, struct bkey_s_c k,
BUG_ON(!sectors);
bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
- s64 disk_sectors = data_type == BCH_DATA_BTREE
+ s64 disk_sectors = data_type == BCH_DATA_btree
? sectors
: ptr_disk_sectors_delta(p, offset, sectors, flags);
@@ -1177,72 +1175,98 @@ static int bch2_mark_extent(struct bch_fs *c, struct bkey_s_c k,
return 0;
}
-static int bch2_mark_stripe(struct bch_fs *c, struct bkey_s_c k,
+static int bch2_mark_stripe(struct bch_fs *c,
+ struct bkey_s_c old, struct bkey_s_c new,
struct bch_fs_usage *fs_usage,
u64 journal_seq, unsigned flags)
{
bool gc = flags & BTREE_TRIGGER_GC;
- struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k);
- size_t idx = s.k->p.offset;
+ size_t idx = new.k->p.offset;
+ const struct bch_stripe *old_s = old.k->type == KEY_TYPE_stripe
+ ? bkey_s_c_to_stripe(old).v : NULL;
+ const struct bch_stripe *new_s = new.k->type == KEY_TYPE_stripe
+ ? bkey_s_c_to_stripe(new).v : NULL;
struct stripe *m = genradix_ptr(&c->stripes[gc], idx);
unsigned i;
- spin_lock(&c->ec_stripes_heap_lock);
-
- if (!m || ((flags & BTREE_TRIGGER_OVERWRITE) && !m->alive)) {
- spin_unlock(&c->ec_stripes_heap_lock);
+ if (!m || (old_s && !m->alive)) {
bch_err_ratelimited(c, "error marking nonexistent stripe %zu",
idx);
return -1;
}
- if (!(flags & BTREE_TRIGGER_OVERWRITE)) {
- m->sectors = le16_to_cpu(s.v->sectors);
- m->algorithm = s.v->algorithm;
- m->nr_blocks = s.v->nr_blocks;
- m->nr_redundant = s.v->nr_redundant;
+ if (!new_s) {
+ /* Deleting: */
+ for (i = 0; i < old_s->nr_blocks; i++)
+ bucket_set_stripe(c, old_s->ptrs + i, fs_usage,
+ journal_seq, flags, false);
- bch2_bkey_to_replicas(&m->r.e, k);
+ if (!gc && m->on_heap) {
+ spin_lock(&c->ec_stripes_heap_lock);
+ bch2_stripes_heap_del(c, m, idx);
+ spin_unlock(&c->ec_stripes_heap_lock);
+ }
- /*
- * XXX: account for stripes somehow here
- */
-#if 0
- update_replicas(c, fs_usage, &m->r.e, stripe_sectors);
-#endif
+ memset(m, 0, sizeof(*m));
+ } else {
+ BUG_ON(old_s && new_s->nr_blocks != old_s->nr_blocks);
+ BUG_ON(old_s && new_s->nr_redundant != old_s->nr_redundant);
+
+ for (i = 0; i < new_s->nr_blocks; i++) {
+ if (!old_s ||
+ memcmp(new_s->ptrs + i,
+ old_s->ptrs + i,
+ sizeof(struct bch_extent_ptr))) {
+
+ if (old_s)
+ bucket_set_stripe(c, old_s->ptrs + i, fs_usage,
+ journal_seq, flags, false);
+ bucket_set_stripe(c, new_s->ptrs + i, fs_usage,
+ journal_seq, flags, true);
+ }
+ }
+
+ m->alive = true;
+ m->sectors = le16_to_cpu(new_s->sectors);
+ m->algorithm = new_s->algorithm;
+ m->nr_blocks = new_s->nr_blocks;
+ m->nr_redundant = new_s->nr_redundant;
+
+ bch2_bkey_to_replicas(&m->r.e, new);
/* gc recalculates these fields: */
if (!(flags & BTREE_TRIGGER_GC)) {
- for (i = 0; i < s.v->nr_blocks; i++) {
+ m->blocks_nonempty = 0;
+
+ for (i = 0; i < new_s->nr_blocks; i++) {
m->block_sectors[i] =
- stripe_blockcount_get(s.v, i);
+ stripe_blockcount_get(new_s, i);
m->blocks_nonempty += !!m->block_sectors[i];
}
}
- if (!gc)
+ if (!gc) {
+ spin_lock(&c->ec_stripes_heap_lock);
bch2_stripes_heap_update(c, m, idx);
- m->alive = true;
- } else {
- if (!gc)
- bch2_stripes_heap_del(c, m, idx);
- memset(m, 0, sizeof(*m));
+ spin_unlock(&c->ec_stripes_heap_lock);
+ }
}
- spin_unlock(&c->ec_stripes_heap_lock);
-
- bucket_set_stripe(c, s.v, fs_usage, 0, flags);
return 0;
}
static int bch2_mark_key_locked(struct bch_fs *c,
- struct bkey_s_c k,
+ struct bkey_s_c old,
+ struct bkey_s_c new,
unsigned offset, s64 sectors,
struct bch_fs_usage *fs_usage,
u64 journal_seq, unsigned flags)
{
+ struct bkey_s_c k = flags & BTREE_TRIGGER_INSERT ? new : old;
int ret = 0;
+ BUG_ON(!(flags & (BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE)));
+
preempt_disable();
if (!fs_usage || (flags & BTREE_TRIGGER_GC))
@@ -1251,7 +1275,7 @@ static int bch2_mark_key_locked(struct bch_fs *c,
switch (k.k->type) {
case KEY_TYPE_alloc:
- ret = bch2_mark_alloc(c, k, fs_usage, journal_seq, flags);
+ ret = bch2_mark_alloc(c, old, new, fs_usage, journal_seq, flags);
break;
case KEY_TYPE_btree_ptr:
case KEY_TYPE_btree_ptr_v2:
@@ -1259,16 +1283,16 @@ static int bch2_mark_key_locked(struct bch_fs *c,
? c->opts.btree_node_size
: -c->opts.btree_node_size;
- ret = bch2_mark_extent(c, k, offset, sectors, BCH_DATA_BTREE,
- fs_usage, journal_seq, flags);
+ ret = bch2_mark_extent(c, old, new, offset, sectors,
+ BCH_DATA_btree, fs_usage, journal_seq, flags);
break;
case KEY_TYPE_extent:
case KEY_TYPE_reflink_v:
- ret = bch2_mark_extent(c, k, offset, sectors, BCH_DATA_USER,
- fs_usage, journal_seq, flags);
+ ret = bch2_mark_extent(c, old, new, offset, sectors,
+ BCH_DATA_user, fs_usage, journal_seq, flags);
break;
case KEY_TYPE_stripe:
- ret = bch2_mark_stripe(c, k, fs_usage, journal_seq, flags);
+ ret = bch2_mark_stripe(c, old, new, fs_usage, journal_seq, flags);
break;
case KEY_TYPE_inode:
if (!(flags & BTREE_TRIGGER_OVERWRITE))
@@ -1294,82 +1318,38 @@ static int bch2_mark_key_locked(struct bch_fs *c,
return ret;
}
-int bch2_mark_key(struct bch_fs *c, struct bkey_s_c k,
+int bch2_mark_key(struct bch_fs *c, struct bkey_s_c new,
unsigned offset, s64 sectors,
struct bch_fs_usage *fs_usage,
u64 journal_seq, unsigned flags)
{
+ struct bkey deleted;
+ struct bkey_s_c old = (struct bkey_s_c) { &deleted, NULL };
int ret;
+ bkey_init(&deleted);
+
percpu_down_read(&c->mark_lock);
- ret = bch2_mark_key_locked(c, k, offset, sectors,
- fs_usage, journal_seq, flags);
+ ret = bch2_mark_key_locked(c, old, new, offset, sectors,
+ fs_usage, journal_seq,
+ BTREE_TRIGGER_INSERT|flags);
percpu_up_read(&c->mark_lock);
return ret;
}
-inline int bch2_mark_overwrite(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bkey_s_c old,
- struct bkey_i *new,
- struct bch_fs_usage *fs_usage,
- unsigned flags,
- bool is_extents)
-{
- struct bch_fs *c = trans->c;
- unsigned offset = 0;
- s64 sectors = -((s64) old.k->size);
-
- flags |= BTREE_TRIGGER_OVERWRITE;
-
- if (is_extents
- ? bkey_cmp(new->k.p, bkey_start_pos(old.k)) <= 0
- : bkey_cmp(new->k.p, old.k->p))
- return 0;
-
- if (is_extents) {
- switch (bch2_extent_overlap(&new->k, old.k)) {
- case BCH_EXTENT_OVERLAP_ALL:
- offset = 0;
- sectors = -((s64) old.k->size);
- break;
- case BCH_EXTENT_OVERLAP_BACK:
- offset = bkey_start_offset(&new->k) -
- bkey_start_offset(old.k);
- sectors = bkey_start_offset(&new->k) -
- old.k->p.offset;
- break;
- case BCH_EXTENT_OVERLAP_FRONT:
- offset = 0;
- sectors = bkey_start_offset(old.k) -
- new->k.p.offset;
- break;
- case BCH_EXTENT_OVERLAP_MIDDLE:
- offset = bkey_start_offset(&new->k) -
- bkey_start_offset(old.k);
- sectors = -((s64) new->k.size);
- flags |= BTREE_TRIGGER_OVERWRITE_SPLIT;
- break;
- }
-
- BUG_ON(sectors >= 0);
- }
-
- return bch2_mark_key_locked(c, old, offset, sectors, fs_usage,
- trans->journal_res.seq, flags) ?: 1;
-}
-
int bch2_mark_update(struct btree_trans *trans,
struct btree_iter *iter,
- struct bkey_i *insert,
+ struct bkey_i *new,
struct bch_fs_usage *fs_usage,
unsigned flags)
{
struct bch_fs *c = trans->c;
struct btree *b = iter_l(iter)->b;
struct btree_node_iter node_iter = iter_l(iter)->iter;
- struct bkey_packed *_k;
+ struct bkey_packed *_old;
+ struct bkey_s_c old;
+ struct bkey unpacked;
int ret = 0;
if (unlikely(flags & BTREE_TRIGGER_NORUN))
@@ -1378,34 +1358,87 @@ int bch2_mark_update(struct btree_trans *trans,
if (!btree_node_type_needs_gc(iter->btree_id))
return 0;
- bch2_mark_key_locked(c, bkey_i_to_s_c(insert),
- 0, insert->k.size,
- fs_usage, trans->journal_res.seq,
- BTREE_TRIGGER_INSERT|flags);
+ bkey_init(&unpacked);
+ old = (struct bkey_s_c) { &unpacked, NULL };
- if (unlikely(flags & BTREE_TRIGGER_NOOVERWRITES))
- return 0;
+ if (!btree_node_type_is_extents(iter->btree_id)) {
+ if (btree_iter_type(iter) != BTREE_ITER_CACHED) {
+ _old = bch2_btree_node_iter_peek(&node_iter, b);
+ if (_old)
+ old = bkey_disassemble(b, _old, &unpacked);
+ } else {
+ struct bkey_cached *ck = (void *) iter->l[0].b;
- /*
- * For non extents, we only mark the new key, not the key being
- * overwritten - unless we're actually deleting:
- */
- if ((iter->btree_id == BTREE_ID_ALLOC ||
- iter->btree_id == BTREE_ID_EC) &&
- !bkey_deleted(&insert->k))
- return 0;
+ if (ck->valid)
+ old = bkey_i_to_s_c(ck->k);
+ }
- while ((_k = bch2_btree_node_iter_peek(&node_iter, b))) {
- struct bkey unpacked;
- struct bkey_s_c k = bkey_disassemble(b, _k, &unpacked);
+ if (old.k->type == new->k.type) {
+ bch2_mark_key_locked(c, old, bkey_i_to_s_c(new), 0, 0,
+ fs_usage, trans->journal_res.seq,
+ BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE|flags);
- ret = bch2_mark_overwrite(trans, iter, k, insert,
- fs_usage, flags,
- btree_node_type_is_extents(iter->btree_id));
- if (ret <= 0)
- break;
+ } else {
+ bch2_mark_key_locked(c, old, bkey_i_to_s_c(new), 0, 0,
+ fs_usage, trans->journal_res.seq,
+ BTREE_TRIGGER_INSERT|flags);
+ bch2_mark_key_locked(c, old, bkey_i_to_s_c(new), 0, 0,
+ fs_usage, trans->journal_res.seq,
+ BTREE_TRIGGER_OVERWRITE|flags);
+ }
+ } else {
+ BUG_ON(btree_iter_type(iter) == BTREE_ITER_CACHED);
+ bch2_mark_key_locked(c, old, bkey_i_to_s_c(new),
+ 0, new->k.size,
+ fs_usage, trans->journal_res.seq,
+ BTREE_TRIGGER_INSERT|flags);
- bch2_btree_node_iter_advance(&node_iter, b);
+ while ((_old = bch2_btree_node_iter_peek(&node_iter, b))) {
+ unsigned offset = 0;
+ s64 sectors;
+
+ old = bkey_disassemble(b, _old, &unpacked);
+ sectors = -((s64) old.k->size);
+
+ flags |= BTREE_TRIGGER_OVERWRITE;
+
+ if (bkey_cmp(new->k.p, bkey_start_pos(old.k)) <= 0)
+ return 0;
+
+ switch (bch2_extent_overlap(&new->k, old.k)) {
+ case BCH_EXTENT_OVERLAP_ALL:
+ offset = 0;
+ sectors = -((s64) old.k->size);
+ break;
+ case BCH_EXTENT_OVERLAP_BACK:
+ offset = bkey_start_offset(&new->k) -
+ bkey_start_offset(old.k);
+ sectors = bkey_start_offset(&new->k) -
+ old.k->p.offset;
+ break;
+ case BCH_EXTENT_OVERLAP_FRONT:
+ offset = 0;
+ sectors = bkey_start_offset(old.k) -
+ new->k.p.offset;
+ break;
+ case BCH_EXTENT_OVERLAP_MIDDLE:
+ offset = bkey_start_offset(&new->k) -
+ bkey_start_offset(old.k);
+ sectors = -((s64) new->k.size);
+ flags |= BTREE_TRIGGER_OVERWRITE_SPLIT;
+ break;
+ }
+
+ BUG_ON(sectors >= 0);
+
+ ret = bch2_mark_key_locked(c, old, bkey_i_to_s_c(new),
+ offset, sectors, fs_usage,
+ trans->journal_res.seq, flags) ?: 1;
+ if (ret <= 0)
+ break;
+
+ bch2_btree_node_iter_advance(&node_iter, b);
+ }
}
return ret;
@@ -1460,8 +1493,10 @@ void bch2_trans_fs_usage_apply(struct btree_trans *trans,
} else {
struct bkey_cached *ck = (void *) i->iter->l[0].b;
- bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(ck->k));
- pr_err("%s", buf);
+ if (ck->valid) {
+ bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(ck->k));
+ pr_err("%s", buf);
+ }
}
}
}
@@ -1632,7 +1667,7 @@ static int bch2_trans_mark_extent(struct btree_trans *trans,
BUG_ON(!sectors);
bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
- s64 disk_sectors = data_type == BCH_DATA_BTREE
+ s64 disk_sectors = data_type == BCH_DATA_btree
? sectors
: ptr_disk_sectors_delta(p, offset, sectors, flags);
@@ -1774,11 +1809,11 @@ int bch2_trans_mark_key(struct btree_trans *trans, struct bkey_s_c k,
: -c->opts.btree_node_size;
return bch2_trans_mark_extent(trans, k, offset, sectors,
- flags, BCH_DATA_BTREE);
+ flags, BCH_DATA_btree);
case KEY_TYPE_extent:
case KEY_TYPE_reflink_v:
return bch2_trans_mark_extent(trans, k, offset, sectors,
- flags, BCH_DATA_USER);
+ flags, BCH_DATA_user);
case KEY_TYPE_inode:
d = replicas_deltas_realloc(trans, 0);
@@ -1829,9 +1864,6 @@ int bch2_trans_mark_update(struct btree_trans *trans,
if (ret)
return ret;
- if (unlikely(flags & BTREE_TRIGGER_NOOVERWRITES))
- return 0;
-
if (btree_iter_type(iter) == BTREE_ITER_CACHED) {
struct bkey_cached *ck = (void *) iter->l[0].b;
@@ -1992,7 +2024,6 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
alloc_fifo free[RESERVE_NR];
alloc_fifo free_inc;
alloc_heap alloc_heap;
- copygc_heap copygc_heap;
size_t btree_reserve = DIV_ROUND_UP(BTREE_NODE_RESERVE,
ca->mi.bucket_size / c->opts.btree_node_size);
@@ -2001,15 +2032,13 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
size_t copygc_reserve = max_t(size_t, 2, nbuckets >> 7);
size_t free_inc_nr = max(max_t(size_t, 1, nbuckets >> 12),
btree_reserve * 2);
- bool resize = ca->buckets[0] != NULL,
- start_copygc = ca->copygc_thread != NULL;
+ bool resize = ca->buckets[0] != NULL;
int ret = -ENOMEM;
unsigned i;
memset(&free, 0, sizeof(free));
memset(&free_inc, 0, sizeof(free_inc));
memset(&alloc_heap, 0, sizeof(alloc_heap));
- memset(&copygc_heap, 0, sizeof(copygc_heap));
if (!(buckets = kvpmalloc(sizeof(struct bucket_array) +
nbuckets * sizeof(struct bucket),
@@ -2022,14 +2051,13 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
copygc_reserve, GFP_KERNEL) ||
!init_fifo(&free[RESERVE_NONE], reserve_none, GFP_KERNEL) ||
!init_fifo(&free_inc, free_inc_nr, GFP_KERNEL) ||
- !init_heap(&alloc_heap, ALLOC_SCAN_BATCH(ca) << 1, GFP_KERNEL) ||
- !init_heap(&copygc_heap, copygc_reserve, GFP_KERNEL))
+ !init_heap(&alloc_heap, ALLOC_SCAN_BATCH(ca) << 1, GFP_KERNEL))
goto err;
buckets->first_bucket = ca->mi.first_bucket;
buckets->nbuckets = nbuckets;
- bch2_copygc_stop(ca);
+ bch2_copygc_stop(c);
if (resize) {
down_write(&c->gc_lock);
@@ -2072,21 +2100,13 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
/* with gc lock held, alloc_heap can't be in use: */
swap(ca->alloc_heap, alloc_heap);
- /* and we shut down copygc: */
- swap(ca->copygc_heap, copygc_heap);
-
nbuckets = ca->mi.nbuckets;
if (resize)
up_write(&ca->bucket_lock);
- if (start_copygc &&
- bch2_copygc_start(c, ca))
- bch_err(ca, "error restarting copygc thread");
-
ret = 0;
err:
- free_heap(&copygc_heap);
free_heap(&alloc_heap);
free_fifo(&free_inc);
for (i = 0; i < RESERVE_NR; i++)
@@ -2103,7 +2123,6 @@ void bch2_dev_buckets_free(struct bch_dev *ca)
{
unsigned i;
- free_heap(&ca->copygc_heap);
free_heap(&ca->alloc_heap);
free_fifo(&ca->free_inc);
for (i = 0; i < RESERVE_NR; i++)
diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h
index 97265fe90e96..653f6761862e 100644
--- a/fs/bcachefs/buckets.h
+++ b/fs/bcachefs/buckets.h
@@ -99,9 +99,9 @@ static inline enum bch_data_type ptr_data_type(const struct bkey *k,
{
if (k->type == KEY_TYPE_btree_ptr ||
k->type == KEY_TYPE_btree_ptr_v2)
- return BCH_DATA_BTREE;
+ return BCH_DATA_btree;
- return ptr->cached ? BCH_DATA_CACHED : BCH_DATA_USER;
+ return ptr->cached ? BCH_DATA_cached : BCH_DATA_user;
}
static inline struct bucket_mark ptr_bucket_mark(struct bch_dev *ca,
@@ -182,7 +182,7 @@ static inline bool bucket_needs_journal_commit(struct bucket_mark m,
/* Device usage: */
-struct bch_dev_usage bch2_dev_usage_read(struct bch_fs *, struct bch_dev *);
+struct bch_dev_usage bch2_dev_usage_read(struct bch_dev *);
void bch2_dev_usage_from_buckets(struct bch_fs *);
@@ -202,9 +202,9 @@ static inline u64 __dev_buckets_available(struct bch_dev *ca,
/*
* Number of reclaimable buckets - only for use by the allocator thread:
*/
-static inline u64 dev_buckets_available(struct bch_fs *c, struct bch_dev *ca)
+static inline u64 dev_buckets_available(struct bch_dev *ca)
{
- return __dev_buckets_available(ca, bch2_dev_usage_read(c, ca));
+ return __dev_buckets_available(ca, bch2_dev_usage_read(ca));
}
static inline u64 __dev_buckets_free(struct bch_dev *ca,
@@ -215,9 +215,9 @@ static inline u64 __dev_buckets_free(struct bch_dev *ca,
fifo_used(&ca->free_inc);
}
-static inline u64 dev_buckets_free(struct bch_fs *c, struct bch_dev *ca)
+static inline u64 dev_buckets_free(struct bch_dev *ca)
{
- return __dev_buckets_free(ca, bch2_dev_usage_read(c, ca));
+ return __dev_buckets_free(ca, bch2_dev_usage_read(ca));
}
/* Filesystem usage: */
@@ -259,14 +259,11 @@ void bch2_mark_metadata_bucket(struct bch_fs *, struct bch_dev *,
size_t, enum bch_data_type, unsigned,
struct gc_pos, unsigned);
-int bch2_mark_key(struct bch_fs *, struct bkey_s_c, unsigned, s64,
- struct bch_fs_usage *, u64, unsigned);
+int bch2_mark_key(struct bch_fs *, struct bkey_s_c, unsigned,
+ s64, struct bch_fs_usage *, u64, unsigned);
int bch2_fs_usage_apply(struct bch_fs *, struct bch_fs_usage *,
struct disk_reservation *, unsigned);
-int bch2_mark_overwrite(struct btree_trans *, struct btree_iter *,
- struct bkey_s_c, struct bkey_i *,
- struct bch_fs_usage *, unsigned, bool);
int bch2_mark_update(struct btree_trans *, struct btree_iter *,
struct bkey_i *, struct bch_fs_usage *, unsigned);
diff --git a/fs/bcachefs/buckets_types.h b/fs/bcachefs/buckets_types.h
index 53f22726893d..4ebe80b05ffc 100644
--- a/fs/bcachefs/buckets_types.h
+++ b/fs/bcachefs/buckets_types.h
@@ -123,6 +123,7 @@ struct disk_reservation {
};
struct copygc_heap_entry {
+ u8 dev;
u8 gen;
u32 sectors;
u64 offset;
diff --git a/fs/bcachefs/chardev.c b/fs/bcachefs/chardev.c
index 3af521947502..0377f9018d27 100644
--- a/fs/bcachefs/chardev.c
+++ b/fs/bcachefs/chardev.c
@@ -468,7 +468,7 @@ static long bch2_ioctl_dev_usage(struct bch_fs *c,
if (IS_ERR(ca))
return PTR_ERR(ca);
- src = bch2_dev_usage_read(c, ca);
+ src = bch2_dev_usage_read(ca);
arg.state = ca->mi.state;
arg.bucket_size = ca->mi.bucket_size;
diff --git a/fs/bcachefs/checksum.c b/fs/bcachefs/checksum.c
index a01073e54a33..3d88719ba86c 100644
--- a/fs/bcachefs/checksum.c
+++ b/fs/bcachefs/checksum.c
@@ -10,7 +10,7 @@
#include <linux/random.h>
#include <linux/scatterlist.h>
#include <crypto/algapi.h>
-#include <crypto/chacha20.h>
+#include <crypto/chacha.h>
#include <crypto/hash.h>
#include <crypto/poly1305.h>
#include <crypto/skcipher.h>
@@ -68,21 +68,21 @@ static u64 bch2_checksum_update(unsigned type, u64 crc, const void *data, size_t
}
}
-static inline void do_encrypt_sg(struct crypto_skcipher *tfm,
+static inline void do_encrypt_sg(struct crypto_sync_skcipher *tfm,
struct nonce nonce,
struct scatterlist *sg, size_t len)
{
- SKCIPHER_REQUEST_ON_STACK(req, tfm);
+ SYNC_SKCIPHER_REQUEST_ON_STACK(req, tfm);
int ret;
- skcipher_request_set_tfm(req, tfm);
+ skcipher_request_set_sync_tfm(req, tfm);
skcipher_request_set_crypt(req, sg, sg, len, nonce.d);
ret = crypto_skcipher_encrypt(req);
BUG_ON(ret);
}
-static inline void do_encrypt(struct crypto_skcipher *tfm,
+static inline void do_encrypt(struct crypto_sync_skcipher *tfm,
struct nonce nonce,
void *buf, size_t len)
{
@@ -95,8 +95,8 @@ static inline void do_encrypt(struct crypto_skcipher *tfm,
int bch2_chacha_encrypt_key(struct bch_key *key, struct nonce nonce,
void *buf, size_t len)
{
- struct crypto_skcipher *chacha20 =
- crypto_alloc_skcipher("chacha20", 0, 0);
+ struct crypto_sync_skcipher *chacha20 =
+ crypto_alloc_sync_skcipher("chacha20", 0, 0);
int ret;
if (!chacha20) {
@@ -104,7 +104,8 @@ int bch2_chacha_encrypt_key(struct bch_key *key, struct nonce nonce,
return PTR_ERR(chacha20);
}
- ret = crypto_skcipher_setkey(chacha20, (void *) key, sizeof(*key));
+ ret = crypto_skcipher_setkey(&chacha20->base,
+ (void *) key, sizeof(*key));
if (ret) {
pr_err("crypto_skcipher_setkey() error: %i", ret);
goto err;
@@ -112,7 +113,7 @@ int bch2_chacha_encrypt_key(struct bch_key *key, struct nonce nonce,
do_encrypt(chacha20, nonce, buf, len);
err:
- crypto_free_skcipher(chacha20);
+ crypto_free_sync_skcipher(chacha20);
return ret;
}
@@ -199,7 +200,7 @@ static struct bch_csum __bch2_checksum_bio(struct bch_fs *c, unsigned type,
kunmap_atomic(p);
}
#else
- __bio_for_each_contig_segment(bv, bio, *iter, *iter)
+ __bio_for_each_bvec(bv, bio, *iter, *iter)
crc = bch2_checksum_update(type, crc,
page_address(bv.bv_page) + bv.bv_offset,
bv.bv_len);
@@ -224,7 +225,7 @@ static struct bch_csum __bch2_checksum_bio(struct bch_fs *c, unsigned type,
kunmap_atomic(p);
}
#else
- __bio_for_each_contig_segment(bv, bio, *iter, *iter)
+ __bio_for_each_bvec(bv, bio, *iter, *iter)
crypto_shash_update(desc,
page_address(bv.bv_page) + bv.bv_offset,
bv.bv_len);
@@ -463,7 +464,7 @@ err:
static int bch2_alloc_ciphers(struct bch_fs *c)
{
if (!c->chacha20)
- c->chacha20 = crypto_alloc_skcipher("chacha20", 0, 0);
+ c->chacha20 = crypto_alloc_sync_skcipher("chacha20", 0, 0);
if (IS_ERR(c->chacha20)) {
bch_err(c, "error requesting chacha20 module: %li",
PTR_ERR(c->chacha20));
@@ -546,7 +547,7 @@ int bch2_enable_encryption(struct bch_fs *c, bool keyed)
goto err;
}
- ret = crypto_skcipher_setkey(c->chacha20,
+ ret = crypto_skcipher_setkey(&c->chacha20->base,
(void *) &key.key, sizeof(key.key));
if (ret)
goto err;
@@ -574,7 +575,7 @@ void bch2_fs_encryption_exit(struct bch_fs *c)
if (!IS_ERR_OR_NULL(c->poly1305))
crypto_free_shash(c->poly1305);
if (!IS_ERR_OR_NULL(c->chacha20))
- crypto_free_skcipher(c->chacha20);
+ crypto_free_sync_skcipher(c->chacha20);
if (!IS_ERR_OR_NULL(c->sha256))
crypto_free_shash(c->sha256);
}
@@ -606,7 +607,7 @@ int bch2_fs_encryption_init(struct bch_fs *c)
if (ret)
goto out;
- ret = crypto_skcipher_setkey(c->chacha20,
+ ret = crypto_skcipher_setkey(&c->chacha20->base,
(void *) &key.key, sizeof(key.key));
if (ret)
goto out;
diff --git a/fs/bcachefs/checksum.h b/fs/bcachefs/checksum.h
index 833537cc8fd0..24dee8039d57 100644
--- a/fs/bcachefs/checksum.h
+++ b/fs/bcachefs/checksum.h
@@ -7,7 +7,7 @@
#include "super-io.h"
#include <linux/crc64.h>
-#include <crypto/chacha20.h>
+#include <crypto/chacha.h>
static inline bool bch2_checksum_mergeable(unsigned type)
{
@@ -138,9 +138,9 @@ static inline bool bch2_crc_cmp(struct bch_csum l, struct bch_csum r)
/* for skipping ahead and encrypting/decrypting at an offset: */
static inline struct nonce nonce_add(struct nonce nonce, unsigned offset)
{
- EBUG_ON(offset & (CHACHA20_BLOCK_SIZE - 1));
+ EBUG_ON(offset & (CHACHA_BLOCK_SIZE - 1));
- le32_add_cpu(&nonce.d[0], offset / CHACHA20_BLOCK_SIZE);
+ le32_add_cpu(&nonce.d[0], offset / CHACHA_BLOCK_SIZE);
return nonce;
}
diff --git a/fs/bcachefs/clock.c b/fs/bcachefs/clock.c
index a9f5d5696622..1d1590de55e8 100644
--- a/fs/bcachefs/clock.c
+++ b/fs/bcachefs/clock.c
@@ -152,9 +152,8 @@ void __bch2_increment_clock(struct io_clock *clock, unsigned sectors)
timer->fn(timer);
}
-ssize_t bch2_io_timers_show(struct io_clock *clock, char *buf)
+void bch2_io_timers_to_text(struct printbuf *out, struct io_clock *clock)
{
- struct printbuf out = _PBUF(buf, PAGE_SIZE);
unsigned long now;
unsigned i;
@@ -162,12 +161,10 @@ ssize_t bch2_io_timers_show(struct io_clock *clock, char *buf)
now = atomic_long_read(&clock->now);
for (i = 0; i < clock->timers.used; i++)
- pr_buf(&out, "%ps:\t%li\n",
+ pr_buf(out, "%ps:\t%li\n",
clock->timers.data[i]->fn,
clock->timers.data[i]->expire - now);
spin_unlock(&clock->timer_lock);
-
- return out.pos - buf;
}
void bch2_io_clock_exit(struct io_clock *clock)
diff --git a/fs/bcachefs/clock.h b/fs/bcachefs/clock.h
index da50afe206cc..70a0f7436c84 100644
--- a/fs/bcachefs/clock.h
+++ b/fs/bcachefs/clock.h
@@ -30,7 +30,7 @@ void bch2_io_clock_schedule_timeout(struct io_clock *, unsigned long);
__ret; \
})
-ssize_t bch2_io_timers_show(struct io_clock *, char *);
+void bch2_io_timers_to_text(struct printbuf *, struct io_clock *);
void bch2_io_clock_exit(struct io_clock *);
int bch2_io_clock_init(struct io_clock *);
diff --git a/fs/bcachefs/compress.c b/fs/bcachefs/compress.c
index ddff52de2e97..b50d2b0d5fd3 100644
--- a/fs/bcachefs/compress.c
+++ b/fs/bcachefs/compress.c
@@ -7,7 +7,6 @@
#include "super-io.h"
#include <linux/lz4.h>
-#include <linux/sched/mm.h>
#include <linux/zlib.h>
#include <linux/zstd.h>
@@ -46,7 +45,7 @@ static bool bio_phys_contig(struct bio *bio, struct bvec_iter start)
struct bvec_iter iter;
void *expected_start = NULL;
- __bio_for_each_segment(bv, bio, iter, start) {
+ __bio_for_each_bvec(bv, bio, iter, start) {
if (expected_start &&
expected_start != page_address(bv.bv_page) + bv.bv_offset)
return false;
@@ -64,7 +63,7 @@ static struct bbuf __bio_map_or_bounce(struct bch_fs *c, struct bio *bio,
struct bbuf ret;
struct bio_vec bv;
struct bvec_iter iter;
- unsigned nr_pages = 0, flags;
+ unsigned nr_pages = 0;
struct page *stack_pages[16];
struct page **pages = NULL;
void *data;
@@ -104,10 +103,7 @@ static struct bbuf __bio_map_or_bounce(struct bch_fs *c, struct bio *bio,
__bio_for_each_segment(bv, bio, iter, start)
pages[nr_pages++] = bv.bv_page;
- flags = memalloc_nofs_save();
data = vmap(pages, nr_pages, VM_MAP, PAGE_KERNEL);
- memalloc_nofs_restore(flags);
-
if (pages != stack_pages)
kfree(pages);
diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c
index 9442d6e4041c..425b0b806cee 100644
--- a/fs/bcachefs/ec.c
+++ b/fs/bcachefs/ec.c
@@ -200,40 +200,6 @@ static bool extent_has_stripe_ptr(struct bkey_s_c k, u64 idx)
return false;
}
-static void ec_stripe_key_init(struct bch_fs *c,
- struct bkey_i_stripe *s,
- struct open_buckets *blocks,
- struct open_buckets *parity,
- unsigned stripe_size)
-{
- struct open_bucket *ob;
- unsigned i, u64s;
-
- bkey_stripe_init(&s->k_i);
- s->v.sectors = cpu_to_le16(stripe_size);
- s->v.algorithm = 0;
- s->v.nr_blocks = parity->nr + blocks->nr;
- s->v.nr_redundant = parity->nr;
- s->v.csum_granularity_bits = ilog2(c->sb.encoded_extent_max);
- s->v.csum_type = BCH_CSUM_CRC32C;
- s->v.pad = 0;
-
- open_bucket_for_each(c, blocks, ob, i)
- s->v.ptrs[i] = ob->ptr;
-
- open_bucket_for_each(c, parity, ob, i)
- s->v.ptrs[blocks->nr + i] = ob->ptr;
-
- while ((u64s = stripe_val_u64s(&s->v)) > BKEY_VAL_U64s_MAX) {
- BUG_ON(1 << s->v.csum_granularity_bits >=
- le16_to_cpu(s->v.sectors) ||
- s->v.csum_granularity_bits == U8_MAX);
- s->v.csum_granularity_bits++;
- }
-
- set_bkey_val_u64s(&s->k, u64s);
-}
-
/* Checksumming: */
static void ec_generate_checksums(struct ec_stripe_buf *buf)
@@ -360,7 +326,9 @@ static void ec_block_endio(struct bio *bio)
struct bch_dev *ca = ec_bio->ca;
struct closure *cl = bio->bi_private;
- if (bch2_dev_io_err_on(bio->bi_status, ca, "erasure coding"))
+ if (bch2_dev_io_err_on(bio->bi_status, ca, "erasure coding %s: %s",
+ bio_data_dir(bio) ? "write" : "read",
+ bch2_blk_status_to_str(bio->bi_status)))
clear_bit(ec_bio->idx, ec_bio->buf->valid);
bio_put(&ec_bio->bio);
@@ -605,39 +573,16 @@ static void heap_verify_backpointer(struct bch_fs *c, size_t idx)
BUG_ON(h->data[m->heap_idx].idx != idx);
}
-void bch2_stripes_heap_update(struct bch_fs *c,
- struct stripe *m, size_t idx)
-{
- ec_stripes_heap *h = &c->ec_stripes_heap;
- size_t i;
-
- if (m->alive) {
- heap_verify_backpointer(c, idx);
-
- h->data[m->heap_idx].blocks_nonempty = m->blocks_nonempty;
-
- i = m->heap_idx;
- heap_sift_up(h, i, ec_stripes_heap_cmp,
- ec_stripes_heap_set_backpointer);
- heap_sift_down(h, i, ec_stripes_heap_cmp,
- ec_stripes_heap_set_backpointer);
-
- heap_verify_backpointer(c, idx);
- } else {
- bch2_stripes_heap_insert(c, m, idx);
- }
-
- if (stripe_idx_to_delete(c) >= 0 &&
- !percpu_ref_is_dying(&c->writes))
- schedule_work(&c->ec_stripe_delete_work);
-}
-
void bch2_stripes_heap_del(struct bch_fs *c,
struct stripe *m, size_t idx)
{
+ if (!m->on_heap)
+ return;
+
+ m->on_heap = false;
+
heap_verify_backpointer(c, idx);
- m->alive = false;
heap_del(&c->ec_stripes_heap, m->heap_idx,
ec_stripes_heap_cmp,
ec_stripes_heap_set_backpointer);
@@ -646,23 +591,54 @@ void bch2_stripes_heap_del(struct bch_fs *c,
void bch2_stripes_heap_insert(struct bch_fs *c,
struct stripe *m, size_t idx)
{
+ if (m->on_heap)
+ return;
+
BUG_ON(heap_full(&c->ec_stripes_heap));
+ m->on_heap = true;
+
heap_add(&c->ec_stripes_heap, ((struct ec_stripe_heap_entry) {
.idx = idx,
.blocks_nonempty = m->blocks_nonempty,
}),
ec_stripes_heap_cmp,
ec_stripes_heap_set_backpointer);
- m->alive = true;
heap_verify_backpointer(c, idx);
}
+void bch2_stripes_heap_update(struct bch_fs *c,
+ struct stripe *m, size_t idx)
+{
+ ec_stripes_heap *h = &c->ec_stripes_heap;
+ size_t i;
+
+ if (!m->on_heap)
+ return;
+
+ heap_verify_backpointer(c, idx);
+
+ h->data[m->heap_idx].blocks_nonempty = m->blocks_nonempty;
+
+ i = m->heap_idx;
+ heap_sift_up(h, i, ec_stripes_heap_cmp,
+ ec_stripes_heap_set_backpointer);
+ heap_sift_down(h, i, ec_stripes_heap_cmp,
+ ec_stripes_heap_set_backpointer);
+
+ heap_verify_backpointer(c, idx);
+
+ if (stripe_idx_to_delete(c) >= 0 &&
+ !percpu_ref_is_dying(&c->writes))
+ schedule_work(&c->ec_stripe_delete_work);
+}
+
/* stripe deletion */
static int ec_stripe_delete(struct bch_fs *c, size_t idx)
{
+ //pr_info("deleting stripe %zu", idx);
return bch2_btree_delete_range(c, BTREE_ID_EC,
POS(0, idx),
POS(0, idx + 1),
@@ -675,23 +651,20 @@ static void ec_stripe_delete_work(struct work_struct *work)
container_of(work, struct bch_fs, ec_stripe_delete_work);
ssize_t idx;
- down_read(&c->gc_lock);
- mutex_lock(&c->ec_stripe_create_lock);
-
while (1) {
spin_lock(&c->ec_stripes_heap_lock);
idx = stripe_idx_to_delete(c);
- spin_unlock(&c->ec_stripes_heap_lock);
-
- if (idx < 0)
+ if (idx < 0) {
+ spin_unlock(&c->ec_stripes_heap_lock);
break;
+ }
+
+ bch2_stripes_heap_del(c, genradix_ptr(&c->stripes[0], idx), idx);
+ spin_unlock(&c->ec_stripes_heap_lock);
if (ec_stripe_delete(c, idx))
break;
}
-
- mutex_unlock(&c->ec_stripe_create_lock);
- up_read(&c->gc_lock);
}
/* stripe creation: */
@@ -784,6 +757,8 @@ static int ec_stripe_update_ptrs(struct bch_fs *c,
bkey_on_stack_init(&sk);
bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
+ /* XXX this doesn't support the reflink btree */
+
iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS,
bkey_start_pos(pos),
BTREE_ITER_INTENT);
@@ -809,12 +784,9 @@ static int ec_stripe_update_ptrs(struct bch_fs *c,
bkey_on_stack_reassemble(&sk, c, k);
e = bkey_i_to_s_extent(sk.k);
- extent_for_each_ptr(e, ptr) {
- if (ptr->dev == dev)
- ec_ptr = ptr;
- else
- ptr->cached = true;
- }
+ bch2_bkey_drop_ptrs(e.s, ptr, ptr->dev != dev);
+ ec_ptr = (void *) bch2_bkey_has_device(e.s_c, dev);
+ BUG_ON(!ec_ptr);
extent_stripe_ptr_add(e, s, ec_ptr, idx);
@@ -844,6 +816,7 @@ static void ec_stripe_create(struct ec_stripe_new *s)
struct bch_fs *c = s->c;
struct open_bucket *ob;
struct bkey_i *k;
+ struct stripe *m;
struct bch_stripe *v = &s->stripe.key.v;
unsigned i, nr_data = v->nr_blocks - v->nr_redundant;
struct closure cl;
@@ -854,10 +827,13 @@ static void ec_stripe_create(struct ec_stripe_new *s)
closure_init_stack(&cl);
if (s->err) {
- bch_err(c, "error creating stripe: error writing data buckets");
+ if (s->err != -EROFS)
+ bch_err(c, "error creating stripe: error writing data buckets");
goto err;
}
+ BUG_ON(!s->allocated);
+
if (!percpu_ref_tryget(&c->writes))
goto err;
@@ -880,22 +856,33 @@ static void ec_stripe_create(struct ec_stripe_new *s)
goto err_put_writes;
}
- mutex_lock(&c->ec_stripe_create_lock);
-
- ret = ec_stripe_bkey_insert(c, &s->stripe.key);
+ ret = s->existing_stripe
+ ? bch2_btree_insert(c, BTREE_ID_EC, &s->stripe.key.k_i,
+ NULL, NULL, BTREE_INSERT_NOFAIL)
+ : ec_stripe_bkey_insert(c, &s->stripe.key);
if (ret) {
bch_err(c, "error creating stripe: error creating stripe key");
- goto err_unlock;
+ goto err_put_writes;
}
for_each_keylist_key(&s->keys, k) {
ret = ec_stripe_update_ptrs(c, &s->stripe, &k->k);
- if (ret)
+ if (ret) {
+ bch_err(c, "error creating stripe: error updating pointers");
break;
+ }
}
-err_unlock:
- mutex_unlock(&c->ec_stripe_create_lock);
+ spin_lock(&c->ec_stripes_heap_lock);
+ m = genradix_ptr(&c->stripes[0], s->stripe.key.k.p.offset);
+#if 0
+ pr_info("created a %s stripe %llu",
+ s->existing_stripe ? "existing" : "new",
+ s->stripe.key.k.p.offset);
+#endif
+ BUG_ON(m->on_heap);
+ bch2_stripes_heap_insert(c, m, s->stripe.key.k.p.offset);
+ spin_unlock(&c->ec_stripes_heap_lock);
err_put_writes:
percpu_ref_put(&c->writes);
err:
@@ -908,30 +895,52 @@ err:
bch2_keylist_free(&s->keys, s->inline_keys);
- mutex_lock(&s->h->lock);
- list_del(&s->list);
- mutex_unlock(&s->h->lock);
-
for (i = 0; i < s->stripe.key.v.nr_blocks; i++)
kvpfree(s->stripe.data[i], s->stripe.size << 9);
kfree(s);
}
-static struct ec_stripe_new *ec_stripe_set_pending(struct ec_stripe_head *h)
+static void ec_stripe_create_work(struct work_struct *work)
{
- struct ec_stripe_new *s = h->s;
+ struct bch_fs *c = container_of(work,
+ struct bch_fs, ec_stripe_create_work);
+ struct ec_stripe_new *s, *n;
+restart:
+ mutex_lock(&c->ec_stripe_new_lock);
+ list_for_each_entry_safe(s, n, &c->ec_stripe_new_list, list)
+ if (!atomic_read(&s->pin)) {
+ list_del(&s->list);
+ mutex_unlock(&c->ec_stripe_new_lock);
+ ec_stripe_create(s);
+ goto restart;
+ }
+ mutex_unlock(&c->ec_stripe_new_lock);
+}
- list_add(&s->list, &h->stripes);
- h->s = NULL;
+static void ec_stripe_new_put(struct bch_fs *c, struct ec_stripe_new *s)
+{
+ BUG_ON(atomic_read(&s->pin) <= 0);
- return s;
+ if (atomic_dec_and_test(&s->pin)) {
+ BUG_ON(!s->pending);
+ queue_work(system_long_wq, &c->ec_stripe_create_work);
+ }
}
-static void ec_stripe_new_put(struct ec_stripe_new *s)
+static void ec_stripe_set_pending(struct bch_fs *c, struct ec_stripe_head *h)
{
- BUG_ON(atomic_read(&s->pin) <= 0);
- if (atomic_dec_and_test(&s->pin))
- ec_stripe_create(s);
+ struct ec_stripe_new *s = h->s;
+
+ BUG_ON(!s->allocated && !s->err);
+
+ h->s = NULL;
+ s->pending = true;
+
+ mutex_lock(&c->ec_stripe_new_lock);
+ list_add(&s->list, &c->ec_stripe_new_list);
+ mutex_unlock(&c->ec_stripe_new_lock);
+
+ ec_stripe_new_put(c, s);
}
/* have a full bucket - hand it off to be erasure coded: */
@@ -942,7 +951,7 @@ void bch2_ec_bucket_written(struct bch_fs *c, struct open_bucket *ob)
if (ob->sectors_free)
s->err = -1;
- ec_stripe_new_put(s);
+ ec_stripe_new_put(c, s);
}
void bch2_ec_bucket_cancel(struct bch_fs *c, struct open_bucket *ob)
@@ -976,6 +985,8 @@ void bch2_ec_add_backpointer(struct bch_fs *c, struct write_point *wp,
if (!ob)
return;
+ //pr_info("adding backpointer at %llu:%llu", pos.inode, pos.offset);
+
ec = ob->ec;
mutex_lock(&ec->lock);
@@ -1034,14 +1045,43 @@ static unsigned pick_blocksize(struct bch_fs *c,
return best.size;
}
-int bch2_ec_stripe_new_alloc(struct bch_fs *c, struct ec_stripe_head *h)
+static bool may_create_new_stripe(struct bch_fs *c)
+{
+ return false;
+}
+
+static void ec_stripe_key_init(struct bch_fs *c,
+ struct bkey_i_stripe *s,
+ unsigned nr_data,
+ unsigned nr_parity,
+ unsigned stripe_size)
+{
+ unsigned u64s;
+
+ bkey_stripe_init(&s->k_i);
+ s->v.sectors = cpu_to_le16(stripe_size);
+ s->v.algorithm = 0;
+ s->v.nr_blocks = nr_data + nr_parity;
+ s->v.nr_redundant = nr_parity;
+ s->v.csum_granularity_bits = ilog2(c->sb.encoded_extent_max);
+ s->v.csum_type = BCH_CSUM_CRC32C;
+ s->v.pad = 0;
+
+ while ((u64s = stripe_val_u64s(&s->v)) > BKEY_VAL_U64s_MAX) {
+ BUG_ON(1 << s->v.csum_granularity_bits >=
+ le16_to_cpu(s->v.sectors) ||
+ s->v.csum_granularity_bits == U8_MAX);
+ s->v.csum_granularity_bits++;
+ }
+
+ set_bkey_val_u64s(&s->k, u64s);
+}
+
+static int ec_new_stripe_alloc(struct bch_fs *c, struct ec_stripe_head *h)
{
struct ec_stripe_new *s;
unsigned i;
- BUG_ON(h->parity.nr != h->redundancy);
- BUG_ON(!h->blocks.nr);
- BUG_ON(h->parity.nr + h->blocks.nr > EC_STRIPE_MAX);
lockdep_assert_held(&h->lock);
s = kzalloc(sizeof(*s), GFP_KERNEL);
@@ -1052,11 +1092,9 @@ int bch2_ec_stripe_new_alloc(struct bch_fs *c, struct ec_stripe_head *h)
atomic_set(&s->pin, 1);
s->c = c;
s->h = h;
- s->blocks = h->blocks;
- s->parity = h->parity;
-
- memset(&h->blocks, 0, sizeof(h->blocks));
- memset(&h->parity, 0, sizeof(h->parity));
+ s->nr_data = min_t(unsigned, h->nr_active_devs,
+ EC_STRIPE_MAX) - h->redundancy;
+ s->nr_parity = h->redundancy;
bch2_keylist_init(&s->keys, s->inline_keys);
@@ -1064,9 +1102,8 @@ int bch2_ec_stripe_new_alloc(struct bch_fs *c, struct ec_stripe_head *h)
s->stripe.size = h->blocksize;
memset(s->stripe.valid, 0xFF, sizeof(s->stripe.valid));
- ec_stripe_key_init(c, &s->stripe.key,
- &s->blocks, &s->parity,
- h->blocksize);
+ ec_stripe_key_init(c, &s->stripe.key, s->nr_data,
+ s->nr_parity, h->blocksize);
for (i = 0; i < s->stripe.key.v.nr_blocks; i++) {
s->stripe.data[i] = kvpmalloc(s->stripe.size << 9, GFP_KERNEL);
@@ -1098,14 +1135,13 @@ ec_new_stripe_head_alloc(struct bch_fs *c, unsigned target,
mutex_init(&h->lock);
mutex_lock(&h->lock);
- INIT_LIST_HEAD(&h->stripes);
h->target = target;
h->algo = algo;
h->redundancy = redundancy;
rcu_read_lock();
- h->devs = target_rw_devs(c, BCH_DATA_USER, target);
+ h->devs = target_rw_devs(c, BCH_DATA_user, target);
for_each_member_device_rcu(ca, c, i, &h->devs)
if (!ca->mi.durability)
@@ -1118,26 +1154,22 @@ ec_new_stripe_head_alloc(struct bch_fs *c, unsigned target,
h->nr_active_devs++;
rcu_read_unlock();
- list_add(&h->list, &c->ec_new_stripe_list);
+ list_add(&h->list, &c->ec_stripe_head_list);
return h;
}
-void bch2_ec_stripe_head_put(struct ec_stripe_head *h)
+void bch2_ec_stripe_head_put(struct bch_fs *c, struct ec_stripe_head *h)
{
- struct ec_stripe_new *s = NULL;
-
if (h->s &&
+ h->s->allocated &&
bitmap_weight(h->s->blocks_allocated,
h->s->blocks.nr) == h->s->blocks.nr)
- s = ec_stripe_set_pending(h);
+ ec_stripe_set_pending(c, h);
mutex_unlock(&h->lock);
-
- if (s)
- ec_stripe_new_put(s);
}
-struct ec_stripe_head *bch2_ec_stripe_head_get(struct bch_fs *c,
+struct ec_stripe_head *__bch2_ec_stripe_head_get(struct bch_fs *c,
unsigned target,
unsigned algo,
unsigned redundancy)
@@ -1147,8 +1179,8 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct bch_fs *c,
if (!redundancy)
return NULL;
- mutex_lock(&c->ec_new_stripe_lock);
- list_for_each_entry(h, &c->ec_new_stripe_list, list)
+ mutex_lock(&c->ec_stripe_head_lock);
+ list_for_each_entry(h, &c->ec_stripe_head_list, list)
if (h->target == target &&
h->algo == algo &&
h->redundancy == redundancy) {
@@ -1158,7 +1190,196 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct bch_fs *c,
h = ec_new_stripe_head_alloc(c, target, algo, redundancy);
found:
- mutex_unlock(&c->ec_new_stripe_lock);
+ mutex_unlock(&c->ec_stripe_head_lock);
+ return h;
+}
+
+/*
+ * XXX: use a higher watermark for allocating open buckets here:
+ */
+static int new_stripe_alloc_buckets(struct bch_fs *c, struct ec_stripe_head *h)
+{
+ struct bch_devs_mask devs;
+ struct open_bucket *ob;
+ unsigned i, nr_have, nr_data =
+ min_t(unsigned, h->nr_active_devs,
+ EC_STRIPE_MAX) - h->redundancy;
+ bool have_cache = true;
+ int ret = 0;
+
+ devs = h->devs;
+
+ for_each_set_bit(i, h->s->blocks_allocated, EC_STRIPE_MAX) {
+ __clear_bit(h->s->stripe.key.v.ptrs[i].dev, devs.d);
+ --nr_data;
+ }
+
+ BUG_ON(h->s->blocks.nr > nr_data);
+ BUG_ON(h->s->parity.nr > h->redundancy);
+
+ open_bucket_for_each(c, &h->s->parity, ob, i)
+ __clear_bit(ob->ptr.dev, devs.d);
+ open_bucket_for_each(c, &h->s->blocks, ob, i)
+ __clear_bit(ob->ptr.dev, devs.d);
+
+ percpu_down_read(&c->mark_lock);
+ rcu_read_lock();
+
+ if (h->s->parity.nr < h->redundancy) {
+ nr_have = h->s->parity.nr;
+
+ ret = bch2_bucket_alloc_set(c, &h->s->parity,
+ &h->parity_stripe,
+ &devs,
+ h->redundancy,
+ &nr_have,
+ &have_cache,
+ RESERVE_NONE,
+ 0,
+ NULL);
+ if (ret)
+ goto err;
+ }
+
+ if (h->s->blocks.nr < nr_data) {
+ nr_have = h->s->blocks.nr;
+
+ ret = bch2_bucket_alloc_set(c, &h->s->blocks,
+ &h->block_stripe,
+ &devs,
+ nr_data,
+ &nr_have,
+ &have_cache,
+ RESERVE_NONE,
+ 0,
+ NULL);
+ if (ret)
+ goto err;
+ }
+err:
+ rcu_read_unlock();
+ percpu_up_read(&c->mark_lock);
+ return ret;
+}
+
+/* XXX: doesn't obey target: */
+static s64 get_existing_stripe(struct bch_fs *c,
+ unsigned target,
+ unsigned algo,
+ unsigned redundancy)
+{
+ ec_stripes_heap *h = &c->ec_stripes_heap;
+ struct stripe *m;
+ size_t heap_idx;
+ u64 stripe_idx;
+
+ if (may_create_new_stripe(c))
+ return -1;
+
+ spin_lock(&c->ec_stripes_heap_lock);
+ for (heap_idx = 0; heap_idx < h->used; heap_idx++) {
+ if (!h->data[heap_idx].blocks_nonempty)
+ continue;
+
+ stripe_idx = h->data[heap_idx].idx;
+ m = genradix_ptr(&c->stripes[0], stripe_idx);
+
+ if (m->algorithm == algo &&
+ m->nr_redundant == redundancy &&
+ m->blocks_nonempty < m->nr_blocks - m->nr_redundant) {
+ bch2_stripes_heap_del(c, m, stripe_idx);
+ spin_unlock(&c->ec_stripes_heap_lock);
+ return stripe_idx;
+ }
+ }
+
+ spin_unlock(&c->ec_stripes_heap_lock);
+ return -1;
+}
+
+static int get_stripe_key(struct bch_fs *c, u64 idx, struct ec_stripe_buf *stripe)
+{
+ struct btree_trans trans;
+ struct btree_iter *iter;
+ struct bkey_s_c k;
+ int ret;
+
+ bch2_trans_init(&trans, c, 0, 0);
+ iter = bch2_trans_get_iter(&trans, BTREE_ID_EC, POS(0, idx), BTREE_ITER_SLOTS);
+ k = bch2_btree_iter_peek_slot(iter);
+ ret = bkey_err(k);
+ if (!ret)
+ bkey_reassemble(&stripe->key.k_i, k);
+ bch2_trans_exit(&trans);
+
+ return ret;
+}
+
+struct ec_stripe_head *bch2_ec_stripe_head_get(struct bch_fs *c,
+ unsigned target,
+ unsigned algo,
+ unsigned redundancy)
+{
+ struct closure cl;
+ struct ec_stripe_head *h;
+ struct open_bucket *ob;
+ unsigned i, data_idx = 0;
+ s64 idx;
+
+ closure_init_stack(&cl);
+
+ h = __bch2_ec_stripe_head_get(c, target, algo, redundancy);
+ if (!h)
+ return NULL;
+
+ if (!h->s && ec_new_stripe_alloc(c, h)) {
+ bch2_ec_stripe_head_put(c, h);
+ return NULL;
+ }
+
+ if (!h->s->allocated) {
+ if (!h->s->existing_stripe &&
+ (idx = get_existing_stripe(c, target, algo, redundancy)) >= 0) {
+ //pr_info("got existing stripe %llu", idx);
+
+ h->s->existing_stripe = true;
+ h->s->existing_stripe_idx = idx;
+ if (get_stripe_key(c, idx, &h->s->stripe)) {
+ /* btree error */
+ BUG();
+ }
+
+ for (i = 0; i < h->s->stripe.key.v.nr_blocks; i++)
+ if (stripe_blockcount_get(&h->s->stripe.key.v, i)) {
+ __set_bit(i, h->s->blocks_allocated);
+ ec_block_io(c, &h->s->stripe, READ, i, &cl);
+ }
+ }
+
+ if (new_stripe_alloc_buckets(c, h)) {
+ bch2_ec_stripe_head_put(c, h);
+ h = NULL;
+ goto out;
+ }
+
+ open_bucket_for_each(c, &h->s->blocks, ob, i) {
+ data_idx = find_next_zero_bit(h->s->blocks_allocated,
+ h->s->nr_data, data_idx);
+ BUG_ON(data_idx >= h->s->nr_data);
+
+ h->s->stripe.key.v.ptrs[data_idx] = ob->ptr;
+ h->s->data_block_idx[i] = data_idx;
+ data_idx++;
+ }
+
+ open_bucket_for_each(c, &h->s->parity, ob, i)
+ h->s->stripe.key.v.ptrs[h->s->nr_data + i] = ob->ptr;
+
+ //pr_info("new stripe, blocks_allocated %lx", h->s->blocks_allocated[0]);
+ h->s->allocated = true;
+ }
+out:
+ closure_sync(&cl);
return h;
}
@@ -1168,14 +1389,10 @@ void bch2_ec_stop_dev(struct bch_fs *c, struct bch_dev *ca)
struct open_bucket *ob;
unsigned i;
- mutex_lock(&c->ec_new_stripe_lock);
- list_for_each_entry(h, &c->ec_new_stripe_list, list) {
- struct ec_stripe_new *s = NULL;
+ mutex_lock(&c->ec_stripe_head_lock);
+ list_for_each_entry(h, &c->ec_stripe_head_list, list) {
mutex_lock(&h->lock);
- bch2_open_buckets_stop_dev(c, ca, &h->blocks);
- bch2_open_buckets_stop_dev(c, ca, &h->parity);
-
if (!h->s)
goto unlock;
@@ -1187,15 +1404,12 @@ void bch2_ec_stop_dev(struct bch_fs *c, struct bch_dev *ca)
goto found;
goto unlock;
found:
- h->s->err = -1;
- s = ec_stripe_set_pending(h);
+ h->s->err = -EROFS;
+ ec_stripe_set_pending(c, h);
unlock:
mutex_unlock(&h->lock);
-
- if (s)
- ec_stripe_new_put(s);
}
- mutex_unlock(&c->ec_new_stripe_lock);
+ mutex_unlock(&c->ec_stripe_head_lock);
}
static int __bch2_stripe_write_key(struct btree_trans *trans,
@@ -1278,11 +1492,21 @@ static int bch2_stripes_read_fn(struct bch_fs *c, enum btree_id id,
{
int ret = 0;
- if (k.k->type == KEY_TYPE_stripe)
+ if (k.k->type == KEY_TYPE_stripe) {
+ struct stripe *m;
+
ret = __ec_stripe_mem_alloc(c, k.k->p.offset, GFP_KERNEL) ?:
bch2_mark_key(c, k, 0, 0, NULL, 0,
BTREE_TRIGGER_ALLOC_READ|
BTREE_TRIGGER_NOATOMIC);
+ if (ret)
+ return ret;
+
+ spin_lock(&c->ec_stripes_heap_lock);
+ m = genradix_ptr(&c->stripes[0], k.k->p.offset);
+ bch2_stripes_heap_insert(c, m, k.k->p.offset);
+ spin_unlock(&c->ec_stripes_heap_lock);
+ }
return ret;
}
@@ -1333,25 +1557,73 @@ int bch2_ec_mem_alloc(struct bch_fs *c, bool gc)
return 0;
}
+void bch2_stripes_heap_to_text(struct printbuf *out, struct bch_fs *c)
+{
+ ec_stripes_heap *h = &c->ec_stripes_heap;
+ struct stripe *m;
+ size_t i;
+
+ spin_lock(&c->ec_stripes_heap_lock);
+ for (i = 0; i < min(h->used, 20UL); i++) {
+ m = genradix_ptr(&c->stripes[0], h->data[i].idx);
+
+ pr_buf(out, "%zu %u/%u+%u\n", h->data[i].idx,
+ h->data[i].blocks_nonempty,
+ m->nr_blocks - m->nr_redundant,
+ m->nr_redundant);
+ }
+ spin_unlock(&c->ec_stripes_heap_lock);
+}
+
+void bch2_new_stripes_to_text(struct printbuf *out, struct bch_fs *c)
+{
+ struct ec_stripe_head *h;
+ struct ec_stripe_new *s;
+
+ mutex_lock(&c->ec_stripe_head_lock);
+ list_for_each_entry(h, &c->ec_stripe_head_list, list) {
+ pr_buf(out, "target %u algo %u redundancy %u:\n",
+ h->target, h->algo, h->redundancy);
+
+ if (h->s)
+ pr_buf(out, "\tpending: blocks %u allocated %u\n",
+ h->s->blocks.nr,
+ bitmap_weight(h->s->blocks_allocated,
+ h->s->blocks.nr));
+ }
+ mutex_unlock(&c->ec_stripe_head_lock);
+
+ mutex_lock(&c->ec_stripe_new_lock);
+ list_for_each_entry(s, &c->ec_stripe_new_list, list) {
+ pr_buf(out, "\tin flight: blocks %u allocated %u pin %u\n",
+ s->blocks.nr,
+ bitmap_weight(s->blocks_allocated,
+ s->blocks.nr),
+ atomic_read(&s->pin));
+ }
+ mutex_unlock(&c->ec_stripe_new_lock);
+}
+
void bch2_fs_ec_exit(struct bch_fs *c)
{
struct ec_stripe_head *h;
while (1) {
- mutex_lock(&c->ec_new_stripe_lock);
- h = list_first_entry_or_null(&c->ec_new_stripe_list,
+ mutex_lock(&c->ec_stripe_head_lock);
+ h = list_first_entry_or_null(&c->ec_stripe_head_list,
struct ec_stripe_head, list);
if (h)
list_del(&h->list);
- mutex_unlock(&c->ec_new_stripe_lock);
+ mutex_unlock(&c->ec_stripe_head_lock);
if (!h)
break;
BUG_ON(h->s);
- BUG_ON(!list_empty(&h->stripes));
kfree(h);
}
+ BUG_ON(!list_empty(&c->ec_stripe_new_list));
+
free_heap(&c->ec_stripes_heap);
genradix_free(&c->stripes[0]);
bioset_exit(&c->ec_bioset);
@@ -1359,6 +1631,7 @@ void bch2_fs_ec_exit(struct bch_fs *c)
int bch2_fs_ec_init(struct bch_fs *c)
{
+ INIT_WORK(&c->ec_stripe_create_work, ec_stripe_create_work);
INIT_WORK(&c->ec_stripe_delete_work, ec_stripe_delete_work);
return bioset_init(&c->ec_bioset, 1, offsetof(struct ec_bio, bio),
diff --git a/fs/bcachefs/ec.h b/fs/bcachefs/ec.h
index 4dfaac034886..f8fc3d616cd7 100644
--- a/fs/bcachefs/ec.h
+++ b/fs/bcachefs/ec.h
@@ -93,9 +93,17 @@ struct ec_stripe_new {
int err;
+ u8 nr_data;
+ u8 nr_parity;
+ bool allocated;
+ bool pending;
+ bool existing_stripe;
+ u64 existing_stripe_idx;
+
unsigned long blocks_allocated[BITS_TO_LONGS(EC_STRIPE_MAX)];
struct open_buckets blocks;
+ u8 data_block_idx[EC_STRIPE_MAX];
struct open_buckets parity;
struct keylist keys;
@@ -108,8 +116,6 @@ struct ec_stripe_head {
struct list_head list;
struct mutex lock;
- struct list_head stripes;
-
unsigned target;
unsigned algo;
unsigned redundancy;
@@ -122,9 +128,6 @@ struct ec_stripe_head {
struct dev_stripe_state block_stripe;
struct dev_stripe_state parity_stripe;
- struct open_buckets blocks;
- struct open_buckets parity;
-
struct ec_stripe_new *s;
};
@@ -139,7 +142,7 @@ void bch2_ec_bucket_cancel(struct bch_fs *, struct open_bucket *);
int bch2_ec_stripe_new_alloc(struct bch_fs *, struct ec_stripe_head *);
-void bch2_ec_stripe_head_put(struct ec_stripe_head *);
+void bch2_ec_stripe_head_put(struct bch_fs *, struct ec_stripe_head *);
struct ec_stripe_head *bch2_ec_stripe_head_get(struct bch_fs *, unsigned,
unsigned, unsigned);
@@ -157,6 +160,9 @@ int bch2_stripes_write(struct bch_fs *, unsigned, bool *);
int bch2_ec_mem_alloc(struct bch_fs *, bool);
+void bch2_stripes_heap_to_text(struct printbuf *, struct bch_fs *);
+void bch2_new_stripes_to_text(struct printbuf *, struct bch_fs *);
+
void bch2_fs_ec_exit(struct bch_fs *);
int bch2_fs_ec_init(struct bch_fs *);
diff --git a/fs/bcachefs/ec_types.h b/fs/bcachefs/ec_types.h
index 5c3f77c8aac7..e4d633fca5bf 100644
--- a/fs/bcachefs/ec_types.h
+++ b/fs/bcachefs/ec_types.h
@@ -22,6 +22,7 @@ struct stripe {
unsigned alive:1;
unsigned dirty:1;
+ unsigned on_heap:1;
u8 blocks_nonempty;
u16 block_sectors[EC_STRIPE_MAX];
diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c
index 251d4af773a5..568f039edcff 100644
--- a/fs/bcachefs/extents.c
+++ b/fs/bcachefs/extents.c
@@ -179,11 +179,6 @@ void bch2_btree_ptr_debugcheck(struct bch_fs *c, struct bkey_s_c k)
if (!percpu_down_read_trylock(&c->mark_lock))
return;
- bch2_fs_inconsistent_on(!test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) &&
- !bch2_bkey_replicas_marked_locked(c, k, false), c,
- "btree key bad (replicas not marked in superblock):\n%s",
- (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf));
-
bkey_for_each_ptr(ptrs, ptr) {
ca = bch_dev_bkey_exists(c, ptr->dev);
@@ -194,7 +189,7 @@ void bch2_btree_ptr_debugcheck(struct bch_fs *c, struct bkey_s_c k)
goto err;
err = "inconsistent";
- if (mark.data_type != BCH_DATA_BTREE ||
+ if (mark.data_type != BCH_DATA_btree ||
mark.dirty_sectors < c->opts.btree_node_size)
goto err;
}
@@ -267,11 +262,6 @@ void bch2_extent_debugcheck(struct bch_fs *c, struct bkey_s_c k)
if (!percpu_down_read_trylock(&c->mark_lock))
return;
- bch2_fs_inconsistent_on(!test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) &&
- !bch2_bkey_replicas_marked_locked(c, e.s_c, false), c,
- "extent key bad (replicas not marked in superblock):\n%s",
- (bch2_bkey_val_to_text(&PBUF(buf), c, e.s_c), buf));
-
extent_for_each_ptr_decode(e, p, entry) {
struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev);
struct bucket_mark mark = ptr_bucket_mark(ca, &p.ptr);
@@ -289,7 +279,7 @@ void bch2_extent_debugcheck(struct bch_fs *c, struct bkey_s_c k)
"key too stale: %i", stale);
bch2_fs_inconsistent_on(!stale &&
- (mark.data_type != BCH_DATA_USER ||
+ (mark.data_type != BCH_DATA_user ||
mark_sectors < disk_sectors), c,
"extent pointer not marked: %s:\n"
"type %u sectors %u < %u",
@@ -724,7 +714,7 @@ static unsigned bch2_extent_ptr_durability(struct bch_fs *c,
if (WARN_ON(!s))
goto out;
- durability = max_t(unsigned, durability, s->nr_redundant);
+ durability += s->nr_redundant;
}
out:
return durability;
diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c
index 4411883ab7b8..951a436195ee 100644
--- a/fs/bcachefs/fs-io.c
+++ b/fs/bcachefs/fs-io.c
@@ -603,7 +603,7 @@ int bch2_migrate_page(struct address_space *mapping, struct page *newpage,
EBUG_ON(!PageLocked(page));
EBUG_ON(!PageLocked(newpage));
- ret = migrate_page_move_mapping(mapping, newpage, page, NULL, mode, 0);
+ ret = migrate_page_move_mapping(mapping, newpage, page, 0);
if (ret != MIGRATEPAGE_SUCCESS)
return ret;
@@ -628,10 +628,10 @@ int bch2_migrate_page(struct address_space *mapping, struct page *newpage,
static void bch2_readpages_end_io(struct bio *bio)
{
+ struct bvec_iter_all iter;
struct bio_vec *bv;
- unsigned i;
- bio_for_each_segment_all(bv, bio, i) {
+ bio_for_each_segment_all(bv, bio, iter) {
struct page *page = bv->bv_page;
if (!bio->bi_status) {
@@ -783,11 +783,8 @@ static void readpage_bio_extend(struct readpages_iter *iter,
if (!get_more)
break;
- rcu_read_lock();
- page = radix_tree_lookup(&iter->mapping->i_pages, page_offset);
- rcu_read_unlock();
-
- if (page && !radix_tree_exceptional_entry(page))
+ page = xa_load(&iter->mapping->i_pages, page_offset);
+ if (page && !xa_is_value(page))
break;
page = __page_cache_alloc(readahead_gfp_mask(iter->mapping));
@@ -1038,32 +1035,33 @@ static void bch2_writepage_io_done(struct closure *cl)
struct bch_writepage_io, cl);
struct bch_fs *c = io->op.c;
struct bio *bio = &io->op.wbio.bio;
+ struct bvec_iter_all iter;
struct bio_vec *bvec;
- unsigned i, j;
+ unsigned i;
if (io->op.error) {
- bio_for_each_segment_all(bvec, bio, i) {
+ bio_for_each_segment_all(bvec, bio, iter) {
struct bch_page_state *s;
SetPageError(bvec->bv_page);
- mapping_set_error(io->inode->v.i_mapping, -EIO);
+ mapping_set_error(bvec->bv_page->mapping, -EIO);
s = __bch2_page_state(bvec->bv_page);
spin_lock(&s->lock);
- for (j = 0; j < PAGE_SECTORS; j++)
- s->s[j].nr_replicas = 0;
+ for (i = 0; i < PAGE_SECTORS; i++)
+ s->s[i].nr_replicas = 0;
spin_unlock(&s->lock);
}
}
if (io->op.flags & BCH_WRITE_WROTE_DATA_INLINE) {
- bio_for_each_segment_all(bvec, bio, i) {
+ bio_for_each_segment_all(bvec, bio, iter) {
struct bch_page_state *s;
s = __bch2_page_state(bvec->bv_page);
spin_lock(&s->lock);
- for (j = 0; j < PAGE_SECTORS; j++)
- s->s[j].nr_replicas = 0;
+ for (i = 0; i < PAGE_SECTORS; i++)
+ s->s[i].nr_replicas = 0;
spin_unlock(&s->lock);
}
}
@@ -1087,7 +1085,7 @@ static void bch2_writepage_io_done(struct closure *cl)
*/
i_sectors_acct(c, io->inode, NULL, io->op.i_sectors_delta);
- bio_for_each_segment_all(bvec, bio, i) {
+ bio_for_each_segment_all(bvec, bio, iter) {
struct bch_page_state *s = __bch2_page_state(bvec->bv_page);
if (atomic_dec_and_test(&s->write_count))
@@ -1241,7 +1239,7 @@ do_io:
if (w->io &&
(w->io->op.res.nr_replicas != nr_replicas_this_write ||
- bio_full(&w->io->op.wbio.bio) ||
+ bio_full(&w->io->op.wbio.bio, PAGE_SIZE) ||
w->io->op.wbio.bio.bi_iter.bi_size + (sectors << 9) >=
(BIO_MAX_PAGES * PAGE_SIZE) ||
bio_end_sector(&w->io->op.wbio.bio) != sector))
@@ -1518,24 +1516,24 @@ retry_reservation:
if (!pg_copied)
break;
+ if (!PageUptodate(page) &&
+ pg_copied != PAGE_SIZE &&
+ pos + copied + pg_copied < inode->v.i_size) {
+ zero_user(page, 0, PAGE_SIZE);
+ break;
+ }
+
flush_dcache_page(page);
iov_iter_advance(iter, pg_copied);
copied += pg_copied;
+
+ if (pg_copied != pg_len)
+ break;
}
if (!copied)
goto out;
- if (copied < len &&
- ((offset + copied) & (PAGE_SIZE - 1))) {
- struct page *page = pages[(offset + copied) >> PAGE_SHIFT];
-
- if (!PageUptodate(page)) {
- zero_user(page, 0, PAGE_SIZE);
- copied -= (offset + copied) & (PAGE_SIZE - 1);
- }
- }
-
spin_lock(&inode->v.i_lock);
if (pos + copied > inode->v.i_size)
i_size_write(&inode->v, pos + copied);
@@ -1632,6 +1630,7 @@ again:
}
pos += ret;
written += ret;
+ ret = 0;
balance_dirty_pages_ratelimited(mapping);
} while (iov_iter_count(iter));
@@ -1809,8 +1808,9 @@ static long bch2_dio_write_loop(struct dio_write *dio)
struct bch_inode_info *inode = file_bch_inode(req->ki_filp);
struct bch_fs *c = inode->v.i_sb->s_fs_info;
struct bio *bio = &dio->op.wbio.bio;
+ struct bvec_iter_all iter;
struct bio_vec *bv;
- unsigned i, unaligned;
+ unsigned unaligned;
bool sync = dio->sync;
long ret;
@@ -1818,22 +1818,12 @@ static long bch2_dio_write_loop(struct dio_write *dio)
goto loop;
while (1) {
- size_t extra = dio->iter.count -
- min(BIO_MAX_PAGES * PAGE_SIZE, dio->iter.count);
-
if (kthread)
use_mm(dio->mm);
BUG_ON(current->faults_disabled_mapping);
current->faults_disabled_mapping = mapping;
- /*
- * Don't issue more than 2MB at once, the bcachefs io path in
- * io.c can't bounce more than that:
- */
-
- dio->iter.count -= extra;
ret = bio_iov_iter_get_pages(bio, &dio->iter);
- dio->iter.count += extra;
current->faults_disabled_mapping = NULL;
if (kthread)
@@ -1851,7 +1841,7 @@ static long bch2_dio_write_loop(struct dio_write *dio)
* bio_iov_iter_get_pages was only able to get <
* blocksize worth of pages:
*/
- bio_for_each_segment_all(bv, bio, i)
+ bio_for_each_segment_all(bv, bio, iter)
put_page(bv->bv_page);
ret = -EFAULT;
goto err;
@@ -1914,7 +1904,7 @@ loop:
i_size_write(&inode->v, req->ki_pos);
spin_unlock(&inode->v.i_lock);
- bio_for_each_segment_all(bv, bio, i)
+ bio_for_each_segment_all(bv, bio, iter)
put_page(bv->bv_page);
if (!dio->iter.count || dio->op.error)
break;
@@ -2825,235 +2815,6 @@ static void mark_range_unallocated(struct bch_inode_info *inode,
} while (index <= end_index);
}
-static int generic_access_check_limits(struct file *file, loff_t pos,
- loff_t *count)
-{
- struct inode *inode = file->f_mapping->host;
- loff_t max_size = inode->i_sb->s_maxbytes;
-
- if (!(file->f_flags & O_LARGEFILE))
- max_size = MAX_NON_LFS;
-
- if (unlikely(pos >= max_size))
- return -EFBIG;
- *count = min(*count, max_size - pos);
- return 0;
-}
-
-static int generic_write_check_limits(struct file *file, loff_t pos,
- loff_t *count)
-{
- loff_t limit = rlimit(RLIMIT_FSIZE);
-
- if (limit != RLIM_INFINITY) {
- if (pos >= limit) {
- send_sig(SIGXFSZ, current, 0);
- return -EFBIG;
- }
- *count = min(*count, limit - pos);
- }
-
- return generic_access_check_limits(file, pos, count);
-}
-
-static int generic_remap_checks(struct file *file_in, loff_t pos_in,
- struct file *file_out, loff_t pos_out,
- loff_t *req_count, unsigned int remap_flags)
-{
- struct inode *inode_in = file_in->f_mapping->host;
- struct inode *inode_out = file_out->f_mapping->host;
- uint64_t count = *req_count;
- uint64_t bcount;
- loff_t size_in, size_out;
- loff_t bs = inode_out->i_sb->s_blocksize;
- int ret;
-
- /* The start of both ranges must be aligned to an fs block. */
- if (!IS_ALIGNED(pos_in, bs) || !IS_ALIGNED(pos_out, bs))
- return -EINVAL;
-
- /* Ensure offsets don't wrap. */
- if (pos_in + count < pos_in || pos_out + count < pos_out)
- return -EINVAL;
-
- size_in = i_size_read(inode_in);
- size_out = i_size_read(inode_out);
-
- /* Dedupe requires both ranges to be within EOF. */
- if ((remap_flags & REMAP_FILE_DEDUP) &&
- (pos_in >= size_in || pos_in + count > size_in ||
- pos_out >= size_out || pos_out + count > size_out))
- return -EINVAL;
-
- /* Ensure the infile range is within the infile. */
- if (pos_in >= size_in)
- return -EINVAL;
- count = min(count, size_in - (uint64_t)pos_in);
-
- ret = generic_access_check_limits(file_in, pos_in, &count);
- if (ret)
- return ret;
-
- ret = generic_write_check_limits(file_out, pos_out, &count);
- if (ret)
- return ret;
-
- /*
- * If the user wanted us to link to the infile's EOF, round up to the
- * next block boundary for this check.
- *
- * Otherwise, make sure the count is also block-aligned, having
- * already confirmed the starting offsets' block alignment.
- */
- if (pos_in + count == size_in) {
- bcount = ALIGN(size_in, bs) - pos_in;
- } else {
- if (!IS_ALIGNED(count, bs))
- count = ALIGN_DOWN(count, bs);
- bcount = count;
- }
-
- /* Don't allow overlapped cloning within the same file. */
- if (inode_in == inode_out &&
- pos_out + bcount > pos_in &&
- pos_out < pos_in + bcount)
- return -EINVAL;
-
- /*
- * We shortened the request but the caller can't deal with that, so
- * bounce the request back to userspace.
- */
- if (*req_count != count && !(remap_flags & REMAP_FILE_CAN_SHORTEN))
- return -EINVAL;
-
- *req_count = count;
- return 0;
-}
-
-static int generic_remap_check_len(struct inode *inode_in,
- struct inode *inode_out,
- loff_t pos_out,
- loff_t *len,
- unsigned int remap_flags)
-{
- u64 blkmask = i_blocksize(inode_in) - 1;
- loff_t new_len = *len;
-
- if ((*len & blkmask) == 0)
- return 0;
-
- if ((remap_flags & REMAP_FILE_DEDUP) ||
- pos_out + *len < i_size_read(inode_out))
- new_len &= ~blkmask;
-
- if (new_len == *len)
- return 0;
-
- if (remap_flags & REMAP_FILE_CAN_SHORTEN) {
- *len = new_len;
- return 0;
- }
-
- return (remap_flags & REMAP_FILE_DEDUP) ? -EBADE : -EINVAL;
-}
-
-static int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in,
- struct file *file_out, loff_t pos_out,
- loff_t *len, unsigned int remap_flags)
-{
- struct inode *inode_in = file_inode(file_in);
- struct inode *inode_out = file_inode(file_out);
- bool same_inode = (inode_in == inode_out);
- int ret;
-
- /* Don't touch certain kinds of inodes */
- if (IS_IMMUTABLE(inode_out))
- return -EPERM;
-
- if (IS_SWAPFILE(inode_in) || IS_SWAPFILE(inode_out))
- return -ETXTBSY;
-
- /* Don't reflink dirs, pipes, sockets... */
- if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode))
- return -EISDIR;
- if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode))
- return -EINVAL;
-
- /* Zero length dedupe exits immediately; reflink goes to EOF. */
- if (*len == 0) {
- loff_t isize = i_size_read(inode_in);
-
- if ((remap_flags & REMAP_FILE_DEDUP) || pos_in == isize)
- return 0;
- if (pos_in > isize)
- return -EINVAL;
- *len = isize - pos_in;
- if (*len == 0)
- return 0;
- }
-
- /* Check that we don't violate system file offset limits. */
- ret = generic_remap_checks(file_in, pos_in, file_out, pos_out, len,
- remap_flags);
- if (ret)
- return ret;
-
- /* Wait for the completion of any pending IOs on both files */
- inode_dio_wait(inode_in);
- if (!same_inode)
- inode_dio_wait(inode_out);
-
- ret = filemap_write_and_wait_range(inode_in->i_mapping,
- pos_in, pos_in + *len - 1);
- if (ret)
- return ret;
-
- ret = filemap_write_and_wait_range(inode_out->i_mapping,
- pos_out, pos_out + *len - 1);
- if (ret)
- return ret;
-
- /*
- * Check that the extents are the same.
- */
- if (remap_flags & REMAP_FILE_DEDUP) {
- bool is_same = false;
-
- ret = vfs_dedupe_file_range_compare(inode_in, pos_in,
- inode_out, pos_out, *len, &is_same);
- if (ret)
- return ret;
- if (!is_same)
- return -EBADE;
- }
-
- ret = generic_remap_check_len(inode_in, inode_out, pos_out, len,
- remap_flags);
- if (ret)
- return ret;
-
- /* If can't alter the file contents, we're done. */
- if (!(remap_flags & REMAP_FILE_DEDUP)) {
- /* Update the timestamps, since we can alter file contents. */
- if (!(file_out->f_mode & FMODE_NOCMTIME)) {
- ret = file_update_time(file_out);
- if (ret)
- return ret;
- }
-
- /*
- * Clear the security bits if the process is not being run by
- * root. This keeps people from modifying setuid and setgid
- * binaries.
- */
- ret = file_remove_privs(file_out);
- if (ret)
- return ret;
- }
-
- return 0;
-}
-
loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src,
struct file *file_dst, loff_t pos_dst,
loff_t len, unsigned remap_flags)
@@ -3249,7 +3010,7 @@ static loff_t page_hole_offset(struct address_space *mapping, loff_t offset)
loff_t ret = -1;
page = find_lock_entry(mapping, index);
- if (!page || radix_tree_exception(page))
+ if (!page || xa_is_value(page))
return offset;
pg_offset = __page_hole_offset(page, offset & (PAGE_SIZE - 1));
diff --git a/fs/bcachefs/fs-io.h b/fs/bcachefs/fs-io.h
index 1b593ea707d5..7063556d289b 100644
--- a/fs/bcachefs/fs-io.h
+++ b/fs/bcachefs/fs-io.h
@@ -35,10 +35,6 @@ int bch2_fsync(struct file *, loff_t, loff_t, int);
int bch2_truncate(struct bch_inode_info *, struct iattr *);
long bch2_fallocate_dispatch(struct file *, int, loff_t, loff_t);
-#define REMAP_FILE_ADVISORY (0)
-#define REMAP_FILE_DEDUP (1 << 0)
-#define REMAP_FILE_CAN_SHORTEN (1 << 1)
-
loff_t bch2_remap_file_range(struct file *, loff_t, struct file *,
loff_t, loff_t, unsigned);
diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c
index ba73e5258e8d..a47923d67f7a 100644
--- a/fs/bcachefs/fs.c
+++ b/fs/bcachefs/fs.c
@@ -966,15 +966,6 @@ static int bch2_vfs_readdir(struct file *file, struct dir_context *ctx)
return bch2_readdir(c, inode->v.i_ino, ctx);
}
-static int bch2_clone_file_range(struct file *file_src, loff_t pos_src,
- struct file *file_dst, loff_t pos_dst,
- u64 len)
-{
- return bch2_remap_file_range(file_src, pos_src,
- file_dst, pos_dst,
- len, 0);
-}
-
static const struct file_operations bch_file_operations = {
.llseek = bch2_llseek,
.read_iter = bch2_read_iter,
@@ -992,7 +983,7 @@ static const struct file_operations bch_file_operations = {
#ifdef CONFIG_COMPAT
.compat_ioctl = bch2_compat_fs_ioctl,
#endif
- .clone_file_range = bch2_clone_file_range,
+ .remap_file_range = bch2_remap_file_range,
};
static const struct inode_operations bch_file_inode_operations = {
@@ -1523,7 +1514,7 @@ static struct dentry *bch2_mount(struct file_system_type *fs_type,
sb->s_bdi->congested_fn = bch2_congested;
sb->s_bdi->congested_data = c;
- sb->s_bdi->ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_SIZE;
+ sb->s_bdi->ra_pages = VM_READAHEAD_PAGES;
for_each_online_member(ca, c, i) {
struct block_device *bdev = ca->disk_sb.bdev;
diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c
index 1aca92cacdfc..19b79e60126a 100644
--- a/fs/bcachefs/io.c
+++ b/fs/bcachefs/io.c
@@ -31,9 +31,17 @@
#include <linux/blkdev.h>
#include <linux/random.h>
+#include <linux/sched/mm.h>
#include <trace/events/bcachefs.h>
+const char *bch2_blk_status_to_str(blk_status_t status)
+{
+ if (status == BLK_STS_REMOVED)
+ return "device removed";
+ return blk_status_to_str(status);
+}
+
static bool bch2_target_congested(struct bch_fs *c, u16 target)
{
const struct bch_devs_mask *devs;
@@ -124,10 +132,10 @@ void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw)
void bch2_bio_free_pages_pool(struct bch_fs *c, struct bio *bio)
{
+ struct bvec_iter_all iter;
struct bio_vec *bv;
- unsigned i;
- bio_for_each_segment_all(bv, bio, i)
+ bio_for_each_segment_all(bv, bio, iter)
if (bv->bv_page != ZERO_PAGE(0))
mempool_free(bv->bv_page, &c->bio_bounce_pages);
bio->bi_vcnt = 0;
@@ -611,7 +619,8 @@ static void bch2_write_endio(struct bio *bio)
struct bch_fs *c = wbio->c;
struct bch_dev *ca = bch_dev_bkey_exists(c, wbio->dev);
- if (bch2_dev_io_err_on(bio->bi_status, ca, "data write"))
+ if (bch2_dev_io_err_on(bio->bi_status, ca, "data write: %s",
+ bch2_blk_status_to_str(bio->bi_status)))
set_bit(wbio->dev, op->failed.d);
if (wbio->have_ioref) {
@@ -1053,7 +1062,10 @@ static void __bch2_write(struct closure *cl)
struct write_point *wp;
struct bio *bio;
bool skip_put = true;
+ unsigned nofs_flags;
int ret;
+
+ nofs_flags = memalloc_nofs_save();
again:
memset(&op->failed, 0, sizeof(op->failed));
@@ -1100,6 +1112,16 @@ again:
goto flush_io;
}
+ /*
+ * It's possible for the allocator to fail, put us on the
+ * freelist waitlist, and then succeed in one of various retry
+ * paths: if that happens, we need to disable the skip_put
+ * optimization because otherwise there won't necessarily be a
+ * barrier before we free the bch_write_op:
+ */
+ if (atomic_read(&cl->remaining) & CLOSURE_WAITING)
+ skip_put = false;
+
bch2_open_bucket_get(c, wp, &op->open_buckets);
ret = bch2_write_extent(op, wp, &bio);
bch2_alloc_sectors_done(c, wp);
@@ -1129,19 +1151,21 @@ again:
key_to_write = (void *) (op->insert_keys.keys_p +
key_to_write_offset);
- bch2_submit_wbio_replicas(to_wbio(bio), c, BCH_DATA_USER,
+ bch2_submit_wbio_replicas(to_wbio(bio), c, BCH_DATA_user,
key_to_write);
} while (ret);
if (!skip_put)
continue_at(cl, bch2_write_index, index_update_wq(op));
+out:
+ memalloc_nofs_restore(nofs_flags);
return;
err:
op->error = ret;
op->flags |= BCH_WRITE_DONE;
continue_at(cl, bch2_write_index, index_update_wq(op));
- return;
+ goto out;
flush_io:
/*
* If the write can't all be submitted at once, we generally want to
@@ -1152,7 +1176,7 @@ flush_io:
*/
if (current->flags & PF_WQ_WORKER) {
continue_at(cl, bch2_write_index, index_update_wq(op));
- return;
+ goto out;
}
closure_sync(cl);
@@ -1163,7 +1187,7 @@ flush_io:
if (op->error) {
op->flags |= BCH_WRITE_DONE;
continue_at_nobarrier(cl, bch2_write_done, NULL);
- return;
+ goto out;
}
}
@@ -1921,7 +1945,8 @@ static void bch2_read_endio(struct bio *bio)
if (!rbio->split)
rbio->bio.bi_end_io = rbio->end_io;
- if (bch2_dev_io_err_on(bio->bi_status, ca, "data read")) {
+ if (bch2_dev_io_err_on(bio->bi_status, ca, "data read; %s",
+ bch2_blk_status_to_str(bio->bi_status))) {
bch2_rbio_error(rbio, READ_RETRY_AVOID, bio->bi_status);
return;
}
@@ -2174,7 +2199,7 @@ get_bio:
goto out;
}
- this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_USER],
+ this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_user],
bio_sectors(&rbio->bio));
bio_set_dev(&rbio->bio, ca->disk_sb.bdev);
diff --git a/fs/bcachefs/io.h b/fs/bcachefs/io.h
index 0ad293bd6295..ded468d70f09 100644
--- a/fs/bcachefs/io.h
+++ b/fs/bcachefs/io.h
@@ -22,6 +22,8 @@ void bch2_submit_wbio_replicas(struct bch_write_bio *, struct bch_fs *,
#define BLK_STS_REMOVED ((__force blk_status_t)128)
+const char *bch2_blk_status_to_str(blk_status_t);
+
enum bch_write_flags {
BCH_WRITE_ALLOC_NOWAIT = (1 << 0),
BCH_WRITE_CACHED = (1 << 1),
diff --git a/fs/bcachefs/io_types.h b/fs/bcachefs/io_types.h
index 684e4c9a5d98..b23727d212b9 100644
--- a/fs/bcachefs/io_types.h
+++ b/fs/bcachefs/io_types.h
@@ -78,7 +78,6 @@ struct bch_write_bio {
u64 submit_time;
struct bch_devs_list failed;
- u8 order;
u8 dev;
unsigned split:1,
diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c
index b4f7b61ba9ac..210ad1b0c469 100644
--- a/fs/bcachefs/journal.c
+++ b/fs/bcachefs/journal.c
@@ -847,7 +847,7 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr,
if (pos <= ja->cur_idx)
ja->cur_idx = (ja->cur_idx + 1) % ja->nr;
- bch2_mark_metadata_bucket(c, ca, bucket, BCH_DATA_JOURNAL,
+ bch2_mark_metadata_bucket(c, ca, bucket, BCH_DATA_journal,
ca->mi.bucket_size,
gc_phase(GC_PHASE_SB),
0);
@@ -1135,9 +1135,8 @@ out:
/* debug: */
-ssize_t bch2_journal_print_debug(struct journal *j, char *buf)
+void bch2_journal_debug_to_text(struct printbuf *out, struct journal *j)
{
- struct printbuf out = _PBUF(buf, PAGE_SIZE);
struct bch_fs *c = container_of(j, struct bch_fs, journal);
union journal_res_state s;
struct bch_dev *ca;
@@ -1147,7 +1146,7 @@ ssize_t bch2_journal_print_debug(struct journal *j, char *buf)
spin_lock(&j->lock);
s = READ_ONCE(j->reservations);
- pr_buf(&out,
+ pr_buf(out,
"active journal entries:\t%llu\n"
"seq:\t\t\t%llu\n"
"last_seq:\t\t%llu\n"
@@ -1165,44 +1164,44 @@ ssize_t bch2_journal_print_debug(struct journal *j, char *buf)
switch (s.cur_entry_offset) {
case JOURNAL_ENTRY_ERROR_VAL:
- pr_buf(&out, "error\n");
+ pr_buf(out, "error\n");
break;
case JOURNAL_ENTRY_CLOSED_VAL:
- pr_buf(&out, "closed\n");
+ pr_buf(out, "closed\n");
break;
default:
- pr_buf(&out, "%u/%u\n",
+ pr_buf(out, "%u/%u\n",
s.cur_entry_offset,
j->cur_entry_u64s);
break;
}
- pr_buf(&out,
+ pr_buf(out,
"current entry refs:\t%u\n"
"prev entry unwritten:\t",
journal_state_count(s, s.idx));
if (s.prev_buf_unwritten)
- pr_buf(&out, "yes, ref %u sectors %u\n",
+ pr_buf(out, "yes, ref %u sectors %u\n",
journal_state_count(s, !s.idx),
journal_prev_buf(j)->sectors);
else
- pr_buf(&out, "no\n");
+ pr_buf(out, "no\n");
- pr_buf(&out,
+ pr_buf(out,
"need write:\t\t%i\n"
"replay done:\t\t%i\n",
test_bit(JOURNAL_NEED_WRITE, &j->flags),
test_bit(JOURNAL_REPLAY_DONE, &j->flags));
for_each_member_device_rcu(ca, c, iter,
- &c->rw_devs[BCH_DATA_JOURNAL]) {
+ &c->rw_devs[BCH_DATA_journal]) {
struct journal_device *ja = &ca->journal;
if (!ja->nr)
continue;
- pr_buf(&out,
+ pr_buf(out,
"dev %u:\n"
"\tnr\t\t%u\n"
"\tavailable\t%u:%u\n"
@@ -1221,34 +1220,29 @@ ssize_t bch2_journal_print_debug(struct journal *j, char *buf)
spin_unlock(&j->lock);
rcu_read_unlock();
-
- return out.pos - buf;
}
-ssize_t bch2_journal_print_pins(struct journal *j, char *buf)
+void bch2_journal_pins_to_text(struct printbuf *out, struct journal *j)
{
- struct printbuf out = _PBUF(buf, PAGE_SIZE);
struct journal_entry_pin_list *pin_list;
struct journal_entry_pin *pin;
u64 i;
spin_lock(&j->lock);
fifo_for_each_entry_ptr(pin_list, &j->pin, i) {
- pr_buf(&out, "%llu: count %u\n",
+ pr_buf(out, "%llu: count %u\n",
i, atomic_read(&pin_list->count));
list_for_each_entry(pin, &pin_list->list, list)
- pr_buf(&out, "\t%px %ps\n",
+ pr_buf(out, "\t%px %ps\n",
pin, pin->flush);
if (!list_empty(&pin_list->flushed))
- pr_buf(&out, "flushed:\n");
+ pr_buf(out, "flushed:\n");
list_for_each_entry(pin, &pin_list->flushed, list)
- pr_buf(&out, "\t%px %ps\n",
+ pr_buf(out, "\t%px %ps\n",
pin, pin->flush);
}
spin_unlock(&j->lock);
-
- return out.pos - buf;
}
diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h
index 9c286f58c854..56438840efd7 100644
--- a/fs/bcachefs/journal.h
+++ b/fs/bcachefs/journal.h
@@ -281,7 +281,7 @@ static inline void bch2_journal_res_put(struct journal *j,
if (!res->ref)
return;
- lock_release(&j->res_map, 0, _THIS_IP_);
+ lock_release(&j->res_map, _THIS_IP_);
while (res->u64s)
bch2_journal_add_entry(j, res,
@@ -499,8 +499,8 @@ static inline void bch2_journal_set_replay_done(struct journal *j)
void bch2_journal_unblock(struct journal *);
void bch2_journal_block(struct journal *);
-ssize_t bch2_journal_print_debug(struct journal *, char *);
-ssize_t bch2_journal_print_pins(struct journal *, char *);
+void bch2_journal_debug_to_text(struct printbuf *, struct journal *);
+void bch2_journal_pins_to_text(struct printbuf *, struct journal *);
int bch2_set_nr_journal_buckets(struct bch_fs *, struct bch_dev *,
unsigned nr);
diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c
index b7625285b3ad..89585833c846 100644
--- a/fs/bcachefs/journal_io.c
+++ b/fs/bcachefs/journal_io.c
@@ -6,6 +6,7 @@
#include "buckets.h"
#include "checksum.h"
#include "error.h"
+#include "io.h"
#include "journal.h"
#include "journal_io.h"
#include "journal_reclaim.h"
@@ -661,7 +662,7 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list)
for_each_member_device(ca, c, iter) {
if (!test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) &&
- !(bch2_dev_has_data(c, ca) & (1 << BCH_DATA_JOURNAL)))
+ !(bch2_dev_has_data(c, ca) & (1 << BCH_DATA_journal)))
continue;
if ((ca->mi.state == BCH_MEMBER_STATE_RW ||
@@ -695,11 +696,11 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list)
* the devices - this is wrong:
*/
- bch2_devlist_to_replicas(&replicas.e, BCH_DATA_JOURNAL, i->devs);
+ bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal, i->devs);
if (!degraded &&
(test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) ||
- fsck_err_on(!bch2_replicas_marked(c, &replicas.e, false), c,
+ fsck_err_on(!bch2_replicas_marked(c, &replicas.e), c,
"superblock not marked as containing replicas %s",
(bch2_replicas_entry_to_text(&PBUF(buf),
&replicas.e), buf)))) {
@@ -759,7 +760,7 @@ static void __journal_write_alloc(struct journal *j,
sectors > ja->sectors_free)
continue;
- bch2_dev_stripe_increment(c, ca, &j->wp.stripe);
+ bch2_dev_stripe_increment(ca, &j->wp.stripe);
bch2_bkey_append_ptr(&w->key,
(struct bch_extent_ptr) {
@@ -796,7 +797,7 @@ static int journal_write_alloc(struct journal *j, struct journal_buf *w,
rcu_read_lock();
devs_sorted = bch2_dev_alloc_list(c, &j->wp.stripe,
- &c->rw_devs[BCH_DATA_JOURNAL]);
+ &c->rw_devs[BCH_DATA_journal]);
__journal_write_alloc(j, w, &devs_sorted,
sectors, &replicas, replicas_want);
@@ -914,7 +915,7 @@ static void journal_write_done(struct closure *cl)
goto err;
}
- bch2_devlist_to_replicas(&replicas.e, BCH_DATA_JOURNAL, devs);
+ bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal, devs);
if (bch2_mark_replicas(c, &replicas.e))
goto err;
@@ -961,7 +962,8 @@ static void journal_write_endio(struct bio *bio)
struct bch_dev *ca = bio->bi_private;
struct journal *j = &ca->fs->journal;
- if (bch2_dev_io_err_on(bio->bi_status, ca, "journal write") ||
+ if (bch2_dev_io_err_on(bio->bi_status, ca, "journal write: %s",
+ bch2_blk_status_to_str(bio->bi_status)) ||
bch2_meta_write_fault("journal")) {
struct journal_buf *w = journal_prev_buf(j);
unsigned long flags;
@@ -1105,7 +1107,7 @@ retry_alloc:
continue;
}
- this_cpu_add(ca->io_done->sectors[WRITE][BCH_DATA_JOURNAL],
+ this_cpu_add(ca->io_done->sectors[WRITE][BCH_DATA_journal],
sectors);
bio = ca->journal.bio;
diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c
index 4811ab9f879e..57591983eebd 100644
--- a/fs/bcachefs/journal_reclaim.c
+++ b/fs/bcachefs/journal_reclaim.c
@@ -70,7 +70,7 @@ static struct journal_space {
rcu_read_lock();
for_each_member_device_rcu(ca, c, i,
- &c->rw_devs[BCH_DATA_JOURNAL]) {
+ &c->rw_devs[BCH_DATA_journal]) {
struct journal_device *ja = &ca->journal;
unsigned buckets_this_device, sectors_this_device;
@@ -139,7 +139,7 @@ void bch2_journal_space_available(struct journal *j)
rcu_read_lock();
for_each_member_device_rcu(ca, c, i,
- &c->rw_devs[BCH_DATA_JOURNAL]) {
+ &c->rw_devs[BCH_DATA_journal]) {
struct journal_device *ja = &ca->journal;
if (!ja->nr)
@@ -618,7 +618,7 @@ int bch2_journal_flush_device_pins(struct journal *j, int dev_idx)
return ret;
mutex_lock(&c->replicas_gc_lock);
- bch2_replicas_gc_start(c, 1 << BCH_DATA_JOURNAL);
+ bch2_replicas_gc_start(c, 1 << BCH_DATA_journal);
seq = 0;
@@ -627,7 +627,7 @@ int bch2_journal_flush_device_pins(struct journal *j, int dev_idx)
struct bch_replicas_padded replicas;
seq = max(seq, journal_last_seq(j));
- bch2_devlist_to_replicas(&replicas.e, BCH_DATA_JOURNAL,
+ bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal,
journal_seq_pin(j, seq)->devs);
seq++;
diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c
index dd2011e295cc..2f3be487ef65 100644
--- a/fs/bcachefs/move.c
+++ b/fs/bcachefs/move.c
@@ -247,11 +247,15 @@ int bch2_migrate_write_init(struct bch_fs *c, struct migrate_write *m,
m->op.target = data_opts.target,
m->op.write_point = wp;
- if (m->data_opts.btree_insert_flags & BTREE_INSERT_USE_RESERVE)
+ if (m->data_opts.btree_insert_flags & BTREE_INSERT_USE_RESERVE) {
m->op.alloc_reserve = RESERVE_MOVINGGC;
+ m->op.flags |= BCH_WRITE_ALLOC_NOWAIT;
+ } else {
+ /* XXX: this should probably be passed in */
+ m->op.flags |= BCH_WRITE_ONLY_SPECIFIED_DEVS;
+ }
- m->op.flags |= BCH_WRITE_ONLY_SPECIFIED_DEVS|
- BCH_WRITE_PAGES_STABLE|
+ m->op.flags |= BCH_WRITE_PAGES_STABLE|
BCH_WRITE_PAGES_OWNED|
BCH_WRITE_DATA_ENCODED|
BCH_WRITE_FROM_INTERNAL;
@@ -316,12 +320,12 @@ static void move_free(struct closure *cl)
{
struct moving_io *io = container_of(cl, struct moving_io, cl);
struct moving_context *ctxt = io->write.ctxt;
+ struct bvec_iter_all iter;
struct bio_vec *bv;
- unsigned i;
bch2_disk_reservation_put(io->write.op.c, &io->write.op.res);
- bio_for_each_segment_all(bv, &io->write.op.wbio.bio, i)
+ bio_for_each_segment_all(bv, &io->write.op.wbio.bio, iter)
if (bv->bv_page)
__free_page(bv->bv_page);
@@ -517,7 +521,7 @@ static int __bch2_move_data(struct bch_fs *c,
bkey_on_stack_init(&sk);
bch2_trans_init(&trans, c, 0, 0);
- stats->data_type = BCH_DATA_USER;
+ stats->data_type = BCH_DATA_user;
stats->btree_id = btree_id;
stats->pos = POS_MIN;
@@ -642,7 +646,7 @@ int bch2_move_data(struct bch_fs *c,
INIT_LIST_HEAD(&ctxt.reads);
init_waitqueue_head(&ctxt.wait);
- stats->data_type = BCH_DATA_USER;
+ stats->data_type = BCH_DATA_user;
ret = __bch2_move_data(c, &ctxt, rate, wp, start, end,
pred, arg, stats, BTREE_ID_EXTENTS) ?:
@@ -677,7 +681,7 @@ static int bch2_move_btree(struct bch_fs *c,
bch2_trans_init(&trans, c, 0, 0);
- stats->data_type = BCH_DATA_BTREE;
+ stats->data_type = BCH_DATA_btree;
for (id = 0; id < BTREE_ID_NR; id++) {
stats->btree_id = id;
@@ -773,7 +777,7 @@ int bch2_data_job(struct bch_fs *c,
switch (op.op) {
case BCH_DATA_OP_REREPLICATE:
- stats->data_type = BCH_DATA_JOURNAL;
+ stats->data_type = BCH_DATA_journal;
ret = bch2_journal_flush_device_pins(&c->journal, -1);
ret = bch2_move_btree(c, rereplicate_pred, c, stats) ?: ret;
@@ -794,7 +798,7 @@ int bch2_data_job(struct bch_fs *c,
if (op.migrate.dev >= c->sb.nr_devices)
return -EINVAL;
- stats->data_type = BCH_DATA_JOURNAL;
+ stats->data_type = BCH_DATA_journal;
ret = bch2_journal_flush_device_pins(&c->journal, op.migrate.dev);
ret = bch2_move_btree(c, migrate_pred, &op, stats) ?: ret;
diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c
index 0a87cd7405dd..55aa463f992f 100644
--- a/fs/bcachefs/movinggc.c
+++ b/fs/bcachefs/movinggc.c
@@ -12,6 +12,7 @@
#include "buckets.h"
#include "clock.h"
#include "disk_groups.h"
+#include "error.h"
#include "extents.h"
#include "eytzinger.h"
#include "io.h"
@@ -43,13 +44,6 @@
#define COPYGC_BUCKETS_PER_ITER(ca) \
((ca)->free[RESERVE_MOVINGGC].size / 2)
-/*
- * Max sectors to move per iteration: Have to take into account internal
- * fragmentation from the multiple write points for each generation:
- */
-#define COPYGC_SECTORS_PER_ITER(ca) \
- ((ca)->mi.bucket_size * COPYGC_BUCKETS_PER_ITER(ca))
-
static inline int sectors_used_cmp(copygc_heap *heap,
struct copygc_heap_entry l,
struct copygc_heap_entry r)
@@ -62,18 +56,22 @@ static int bucket_offset_cmp(const void *_l, const void *_r, size_t size)
const struct copygc_heap_entry *l = _l;
const struct copygc_heap_entry *r = _r;
- return cmp_int(l->offset, r->offset);
+ return cmp_int(l->dev, r->dev) ?:
+ cmp_int(l->offset, r->offset);
}
-static bool __copygc_pred(struct bch_dev *ca,
- struct bkey_s_c k)
+static int __copygc_pred(struct bch_fs *c, struct bkey_s_c k)
{
- copygc_heap *h = &ca->copygc_heap;
- const struct bch_extent_ptr *ptr =
- bch2_bkey_has_device(k, ca->dev_idx);
-
- if (ptr) {
- struct copygc_heap_entry search = { .offset = ptr->offset };
+ copygc_heap *h = &c->copygc_heap;
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+ const struct bch_extent_ptr *ptr;
+
+ bkey_for_each_ptr(ptrs, ptr) {
+ struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
+ struct copygc_heap_entry search = {
+ .dev = ptr->dev,
+ .offset = ptr->offset
+ };
ssize_t i = eytzinger0_find_le(h->data, h->used,
sizeof(h->data[0]),
@@ -89,12 +87,13 @@ static bool __copygc_pred(struct bch_dev *ca,
BUG_ON(i != j);
#endif
- return (i >= 0 &&
- ptr->offset < h->data[i].offset + ca->mi.bucket_size &&
- ptr->gen == h->data[i].gen);
+ if (i >= 0 &&
+ ptr->offset < h->data[i].offset + ca->mi.bucket_size &&
+ ptr->gen == h->data[i].gen)
+ return ptr->dev;
}
- return false;
+ return -1;
}
static enum data_cmd copygc_pred(struct bch_fs *c, void *arg,
@@ -102,14 +101,13 @@ static enum data_cmd copygc_pred(struct bch_fs *c, void *arg,
struct bch_io_opts *io_opts,
struct data_opts *data_opts)
{
- struct bch_dev *ca = arg;
-
- if (!__copygc_pred(ca, k))
+ int dev_idx = __copygc_pred(c, k);
+ if (dev_idx < 0)
return DATA_SKIP;
- data_opts->target = dev_to_target(ca->dev_idx);
+ data_opts->target = io_opts->background_target;
data_opts->btree_insert_flags = BTREE_INSERT_USE_RESERVE;
- data_opts->rewrite_dev = ca->dev_idx;
+ data_opts->rewrite_dev = dev_idx;
return DATA_REWRITE;
}
@@ -125,20 +123,21 @@ static bool have_copygc_reserve(struct bch_dev *ca)
return ret;
}
-static void bch2_copygc(struct bch_fs *c, struct bch_dev *ca)
+static int bch2_copygc(struct bch_fs *c)
{
- copygc_heap *h = &ca->copygc_heap;
+ copygc_heap *h = &c->copygc_heap;
struct copygc_heap_entry e, *i;
struct bucket_array *buckets;
struct bch_move_stats move_stats;
u64 sectors_to_move = 0, sectors_not_moved = 0;
+ u64 sectors_reserved = 0;
u64 buckets_to_move, buckets_not_moved = 0;
- size_t b;
+ struct bch_dev *ca;
+ unsigned dev_idx;
+ size_t b, heap_size = 0;
int ret;
memset(&move_stats, 0, sizeof(move_stats));
- closure_wait_event(&c->freelist_wait, have_copygc_reserve(ca));
-
/*
* Find buckets with lowest sector counts, skipping completely
* empty buckets, by building a maxheap sorted by sector count,
@@ -147,38 +146,57 @@ static void bch2_copygc(struct bch_fs *c, struct bch_dev *ca)
*/
h->used = 0;
- /*
- * We need bucket marks to be up to date - gc can't be recalculating
- * them:
- */
- down_read(&c->gc_lock);
- down_read(&ca->bucket_lock);
- buckets = bucket_array(ca);
-
- for (b = buckets->first_bucket; b < buckets->nbuckets; b++) {
- struct bucket_mark m = READ_ONCE(buckets->b[b].mark);
- struct copygc_heap_entry e;
-
- if (m.owned_by_allocator ||
- m.data_type != BCH_DATA_USER ||
- !bucket_sectors_used(m) ||
- bucket_sectors_used(m) >= ca->mi.bucket_size)
- continue;
+ for_each_rw_member(ca, c, dev_idx)
+ heap_size += ca->mi.nbuckets >> 7;
- e = (struct copygc_heap_entry) {
- .gen = m.gen,
- .sectors = bucket_sectors_used(m),
- .offset = bucket_to_sector(ca, b),
- };
- heap_add_or_replace(h, e, -sectors_used_cmp, NULL);
+ if (h->size < heap_size) {
+ free_heap(&c->copygc_heap);
+ if (!init_heap(&c->copygc_heap, heap_size, GFP_KERNEL)) {
+ bch_err(c, "error allocating copygc heap");
+ return 0;
+ }
+ }
+
+ for_each_rw_member(ca, c, dev_idx) {
+ closure_wait_event(&c->freelist_wait, have_copygc_reserve(ca));
+
+ spin_lock(&ca->fs->freelist_lock);
+ sectors_reserved += fifo_used(&ca->free[RESERVE_MOVINGGC]) * ca->mi.bucket_size;
+ spin_unlock(&ca->fs->freelist_lock);
+
+ down_read(&ca->bucket_lock);
+ buckets = bucket_array(ca);
+
+ for (b = buckets->first_bucket; b < buckets->nbuckets; b++) {
+ struct bucket_mark m = READ_ONCE(buckets->b[b].mark);
+ struct copygc_heap_entry e;
+
+ if (m.owned_by_allocator ||
+ m.data_type != BCH_DATA_user ||
+ !bucket_sectors_used(m) ||
+ bucket_sectors_used(m) >= ca->mi.bucket_size)
+ continue;
+
+ e = (struct copygc_heap_entry) {
+ .dev = dev_idx,
+ .gen = m.gen,
+ .sectors = bucket_sectors_used(m),
+ .offset = bucket_to_sector(ca, b),
+ };
+ heap_add_or_replace(h, e, -sectors_used_cmp, NULL);
+ }
+ up_read(&ca->bucket_lock);
+ }
+
+ if (!sectors_reserved) {
+ bch2_fs_fatal_error(c, "stuck, ran out of copygc reserve!");
+ return -1;
}
- up_read(&ca->bucket_lock);
- up_read(&c->gc_lock);
for (i = h->data; i < h->data + h->used; i++)
sectors_to_move += i->sectors;
- while (sectors_to_move > COPYGC_SECTORS_PER_ITER(ca)) {
+ while (sectors_to_move > sectors_reserved) {
BUG_ON(!heap_pop(h, e, -sectors_used_cmp, NULL));
sectors_to_move -= e.sectors;
}
@@ -186,30 +204,39 @@ static void bch2_copygc(struct bch_fs *c, struct bch_dev *ca)
buckets_to_move = h->used;
if (!buckets_to_move)
- return;
+ return 0;
eytzinger0_sort(h->data, h->used,
sizeof(h->data[0]),
bucket_offset_cmp, NULL);
- ret = bch2_move_data(c, &ca->copygc_pd.rate,
- writepoint_ptr(&ca->copygc_write_point),
+ ret = bch2_move_data(c, &c->copygc_pd.rate,
+ writepoint_ptr(&c->copygc_write_point),
POS_MIN, POS_MAX,
- copygc_pred, ca,
+ copygc_pred, NULL,
&move_stats);
- down_read(&ca->bucket_lock);
- buckets = bucket_array(ca);
- for (i = h->data; i < h->data + h->used; i++) {
- size_t b = sector_to_bucket(ca, i->offset);
- struct bucket_mark m = READ_ONCE(buckets->b[b].mark);
+ for_each_rw_member(ca, c, dev_idx) {
+ down_read(&ca->bucket_lock);
+ buckets = bucket_array(ca);
+ for (i = h->data; i < h->data + h->used; i++) {
+ struct bucket_mark m;
+ size_t b;
+
+ if (i->dev != dev_idx)
+ continue;
- if (i->gen == m.gen && bucket_sectors_used(m)) {
- sectors_not_moved += bucket_sectors_used(m);
- buckets_not_moved++;
+ b = sector_to_bucket(ca, i->offset);
+ m = READ_ONCE(buckets->b[b].mark);
+
+ if (i->gen == m.gen &&
+ bucket_sectors_used(m)) {
+ sectors_not_moved += bucket_sectors_used(m);
+ buckets_not_moved++;
+ }
}
+ up_read(&ca->bucket_lock);
}
- up_read(&ca->bucket_lock);
if (sectors_not_moved && !ret)
bch_warn_ratelimited(c,
@@ -220,9 +247,10 @@ static void bch2_copygc(struct bch_fs *c, struct bch_dev *ca)
atomic64_read(&move_stats.keys_raced),
atomic64_read(&move_stats.sectors_raced));
- trace_copygc(ca,
+ trace_copygc(c,
atomic64_read(&move_stats.sectors_moved), sectors_not_moved,
buckets_to_move, buckets_not_moved);
+ return 0;
}
/*
@@ -239,20 +267,27 @@ static void bch2_copygc(struct bch_fs *c, struct bch_dev *ca)
* often and continually reduce the amount of fragmented space as the device
* fills up. So, we increase the threshold by half the current free space.
*/
-unsigned long bch2_copygc_wait_amount(struct bch_dev *ca)
+unsigned long bch2_copygc_wait_amount(struct bch_fs *c)
{
- struct bch_fs *c = ca->fs;
- struct bch_dev_usage usage = bch2_dev_usage_read(c, ca);
- u64 fragmented_allowed = ca->copygc_threshold +
- ((__dev_buckets_available(ca, usage) * ca->mi.bucket_size) >> 1);
+ struct bch_dev *ca;
+ unsigned dev_idx;
+ u64 fragmented_allowed = c->copygc_threshold;
+ u64 fragmented = 0;
+
+ for_each_rw_member(ca, c, dev_idx) {
+ struct bch_dev_usage usage = bch2_dev_usage_read(ca);
- return max_t(s64, 0, fragmented_allowed - usage.sectors_fragmented);
+ fragmented_allowed += ((__dev_buckets_available(ca, usage) *
+ ca->mi.bucket_size) >> 1);
+ fragmented += usage.sectors_fragmented;
+ }
+
+ return max_t(s64, 0, fragmented_allowed - fragmented);
}
static int bch2_copygc_thread(void *arg)
{
- struct bch_dev *ca = arg;
- struct bch_fs *c = ca->fs;
+ struct bch_fs *c = arg;
struct io_clock *clock = &c->io_clock[WRITE];
unsigned long last, wait;
@@ -263,7 +298,7 @@ static int bch2_copygc_thread(void *arg)
break;
last = atomic_long_read(&clock->now);
- wait = bch2_copygc_wait_amount(ca);
+ wait = bch2_copygc_wait_amount(c);
if (wait > clock->max_slop) {
bch2_kthread_io_clock_wait(clock, last + wait,
@@ -271,29 +306,30 @@ static int bch2_copygc_thread(void *arg)
continue;
}
- bch2_copygc(c, ca);
+ if (bch2_copygc(c))
+ break;
}
return 0;
}
-void bch2_copygc_stop(struct bch_dev *ca)
+void bch2_copygc_stop(struct bch_fs *c)
{
- ca->copygc_pd.rate.rate = UINT_MAX;
- bch2_ratelimit_reset(&ca->copygc_pd.rate);
+ c->copygc_pd.rate.rate = UINT_MAX;
+ bch2_ratelimit_reset(&c->copygc_pd.rate);
- if (ca->copygc_thread) {
- kthread_stop(ca->copygc_thread);
- put_task_struct(ca->copygc_thread);
+ if (c->copygc_thread) {
+ kthread_stop(c->copygc_thread);
+ put_task_struct(c->copygc_thread);
}
- ca->copygc_thread = NULL;
+ c->copygc_thread = NULL;
}
-int bch2_copygc_start(struct bch_fs *c, struct bch_dev *ca)
+int bch2_copygc_start(struct bch_fs *c)
{
struct task_struct *t;
- if (ca->copygc_thread)
+ if (c->copygc_thread)
return 0;
if (c->opts.nochanges)
@@ -302,21 +338,20 @@ int bch2_copygc_start(struct bch_fs *c, struct bch_dev *ca)
if (bch2_fs_init_fault("copygc_start"))
return -ENOMEM;
- t = kthread_create(bch2_copygc_thread, ca,
- "bch_copygc[%s]", ca->name);
+ t = kthread_create(bch2_copygc_thread, c, "bch_copygc");
if (IS_ERR(t))
return PTR_ERR(t);
get_task_struct(t);
- ca->copygc_thread = t;
- wake_up_process(ca->copygc_thread);
+ c->copygc_thread = t;
+ wake_up_process(c->copygc_thread);
return 0;
}
-void bch2_dev_copygc_init(struct bch_dev *ca)
+void bch2_fs_copygc_init(struct bch_fs *c)
{
- bch2_pd_controller_init(&ca->copygc_pd);
- ca->copygc_pd.d_term = 0;
+ bch2_pd_controller_init(&c->copygc_pd);
+ c->copygc_pd.d_term = 0;
}
diff --git a/fs/bcachefs/movinggc.h b/fs/bcachefs/movinggc.h
index dcd479632cf1..922738247d03 100644
--- a/fs/bcachefs/movinggc.h
+++ b/fs/bcachefs/movinggc.h
@@ -2,8 +2,8 @@
#ifndef _BCACHEFS_MOVINGGC_H
#define _BCACHEFS_MOVINGGC_H
-void bch2_copygc_stop(struct bch_dev *);
-int bch2_copygc_start(struct bch_fs *, struct bch_dev *);
-void bch2_dev_copygc_init(struct bch_dev *);
+void bch2_copygc_stop(struct bch_fs *);
+int bch2_copygc_start(struct bch_fs *);
+void bch2_fs_copygc_init(struct bch_fs *);
#endif /* _BCACHEFS_MOVINGGC_H */
diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c
index 94d6c044a27d..afe25cd26c06 100644
--- a/fs/bcachefs/opts.c
+++ b/fs/bcachefs/opts.c
@@ -45,12 +45,9 @@ const char * const bch2_str_hash_types[] = {
};
const char * const bch2_data_types[] = {
- "none",
- "sb",
- "journal",
- "btree",
- "data",
- "cached",
+#define x(t, n) #t,
+ BCH_DATA_TYPES()
+#undef x
NULL
};
diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h
index 3b051e7a8f1d..d6a832a38b20 100644
--- a/fs/bcachefs/opts.h
+++ b/fs/bcachefs/opts.h
@@ -260,6 +260,11 @@ enum opt_type {
OPT_BOOL(), \
NO_SB_OPT, false, \
NULL, "Don't replay the journal") \
+ x(rebuild_replicas, u8, \
+ OPT_MOUNT, \
+ OPT_BOOL(), \
+ NO_SB_OPT, false, \
+ NULL, "Rebuild the superblock replicas section") \
x(keep_journal, u8, \
OPT_MOUNT, \
OPT_BOOL(), \
diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c
index e15a2b1dc5d0..56a1f761271f 100644
--- a/fs/bcachefs/rebalance.c
+++ b/fs/bcachefs/rebalance.c
@@ -249,45 +249,42 @@ static int bch2_rebalance_thread(void *arg)
return 0;
}
-ssize_t bch2_rebalance_work_show(struct bch_fs *c, char *buf)
+void bch2_rebalance_work_to_text(struct printbuf *out, struct bch_fs *c)
{
- struct printbuf out = _PBUF(buf, PAGE_SIZE);
struct bch_fs_rebalance *r = &c->rebalance;
struct rebalance_work w = rebalance_work(c);
char h1[21], h2[21];
bch2_hprint(&PBUF(h1), w.dev_most_full_work << 9);
bch2_hprint(&PBUF(h2), w.dev_most_full_capacity << 9);
- pr_buf(&out, "fullest_dev (%i):\t%s/%s\n",
+ pr_buf(out, "fullest_dev (%i):\t%s/%s\n",
w.dev_most_full_idx, h1, h2);
bch2_hprint(&PBUF(h1), w.total_work << 9);
bch2_hprint(&PBUF(h2), c->capacity << 9);
- pr_buf(&out, "total work:\t\t%s/%s\n", h1, h2);
+ pr_buf(out, "total work:\t\t%s/%s\n", h1, h2);
- pr_buf(&out, "rate:\t\t\t%u\n", r->pd.rate.rate);
+ pr_buf(out, "rate:\t\t\t%u\n", r->pd.rate.rate);
switch (r->state) {
case REBALANCE_WAITING:
- pr_buf(&out, "waiting\n");
+ pr_buf(out, "waiting\n");
break;
case REBALANCE_THROTTLED:
bch2_hprint(&PBUF(h1),
(r->throttled_until_iotime -
atomic_long_read(&c->io_clock[WRITE].now)) << 9);
- pr_buf(&out, "throttled for %lu sec or %s io\n",
+ pr_buf(out, "throttled for %lu sec or %s io\n",
(r->throttled_until_cputime - jiffies) / HZ,
h1);
break;
case REBALANCE_RUNNING:
- pr_buf(&out, "running\n");
- pr_buf(&out, "pos %llu:%llu\n",
+ pr_buf(out, "running\n");
+ pr_buf(out, "pos %llu:%llu\n",
r->move_stats.pos.inode,
r->move_stats.pos.offset);
break;
}
-
- return out.pos - buf;
}
void bch2_rebalance_stop(struct bch_fs *c)
diff --git a/fs/bcachefs/rebalance.h b/fs/bcachefs/rebalance.h
index 99e2a1fb6084..7ade0bb81cce 100644
--- a/fs/bcachefs/rebalance.h
+++ b/fs/bcachefs/rebalance.h
@@ -19,7 +19,7 @@ void bch2_rebalance_add_key(struct bch_fs *, struct bkey_s_c,
struct bch_io_opts *);
void bch2_rebalance_add_work(struct bch_fs *, u64);
-ssize_t bch2_rebalance_work_show(struct bch_fs *, char *);
+void bch2_rebalance_work_to_text(struct printbuf *, struct bch_fs *);
void bch2_rebalance_stop(struct bch_fs *);
int bch2_rebalance_start(struct bch_fs *);
diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c
index 41b864dcdc39..28972f30e198 100644
--- a/fs/bcachefs/recovery.c
+++ b/fs/bcachefs/recovery.c
@@ -442,11 +442,18 @@ retry:
* regular keys
*/
__bch2_btree_iter_set_pos(split_iter, split->k.p, false);
- bch2_trans_update(&trans, split_iter, split, !remark
- ? BTREE_TRIGGER_NORUN
- : BTREE_TRIGGER_NOOVERWRITES);
+ bch2_trans_update(&trans, split_iter, split,
+ BTREE_TRIGGER_NORUN);
bch2_btree_iter_set_pos(iter, split->k.p);
+
+ if (remark) {
+ ret = bch2_trans_mark_key(&trans, bkey_i_to_s_c(split),
+ 0, split->k.size,
+ BTREE_TRIGGER_INSERT);
+ if (ret)
+ goto err;
+ }
} while (bkey_cmp(iter->pos, k->k.p) < 0);
if (remark) {
@@ -967,7 +974,8 @@ int bch2_fs_recovery(struct bch_fs *c)
bch_info(c, "recovering from clean shutdown, journal seq %llu",
le64_to_cpu(clean->journal_seq));
- if (!c->replicas.entries) {
+ if (!c->replicas.entries ||
+ c->opts.rebuild_replicas) {
bch_info(c, "building replicas info");
set_bit(BCH_FS_REBUILD_REPLICAS, &c->flags);
}
diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c
index 67a7128fd9af..6b6506c68609 100644
--- a/fs/bcachefs/replicas.c
+++ b/fs/bcachefs/replicas.c
@@ -113,16 +113,16 @@ void bch2_bkey_to_replicas(struct bch_replicas_entry *e,
switch (k.k->type) {
case KEY_TYPE_btree_ptr:
case KEY_TYPE_btree_ptr_v2:
- e->data_type = BCH_DATA_BTREE;
+ e->data_type = BCH_DATA_btree;
extent_to_replicas(k, e);
break;
case KEY_TYPE_extent:
case KEY_TYPE_reflink_v:
- e->data_type = BCH_DATA_USER;
+ e->data_type = BCH_DATA_user;
extent_to_replicas(k, e);
break;
case KEY_TYPE_stripe:
- e->data_type = BCH_DATA_USER;
+ e->data_type = BCH_DATA_user;
stripe_to_replicas(k, e);
break;
}
@@ -137,7 +137,7 @@ void bch2_devlist_to_replicas(struct bch_replicas_entry *e,
unsigned i;
BUG_ON(!data_type ||
- data_type == BCH_DATA_SB ||
+ data_type == BCH_DATA_sb ||
data_type >= BCH_DATA_NR);
e->data_type = data_type;
@@ -213,29 +213,20 @@ static bool __replicas_has_entry(struct bch_replicas_cpu *r,
return __replicas_entry_idx(r, search) >= 0;
}
-static bool bch2_replicas_marked_locked(struct bch_fs *c,
- struct bch_replicas_entry *search,
- bool check_gc_replicas)
+bool bch2_replicas_marked(struct bch_fs *c,
+ struct bch_replicas_entry *search)
{
+ bool marked;
+
if (!search->nr_devs)
return true;
verify_replicas_entry(search);
- return __replicas_has_entry(&c->replicas, search) &&
- (!check_gc_replicas ||
- likely((!c->replicas_gc.entries)) ||
- __replicas_has_entry(&c->replicas_gc, search));
-}
-
-bool bch2_replicas_marked(struct bch_fs *c,
- struct bch_replicas_entry *search,
- bool check_gc_replicas)
-{
- bool marked;
-
percpu_down_read(&c->mark_lock);
- marked = bch2_replicas_marked_locked(c, search, check_gc_replicas);
+ marked = __replicas_has_entry(&c->replicas, search) &&
+ (likely((!c->replicas_gc.entries)) ||
+ __replicas_has_entry(&c->replicas_gc, search));
percpu_up_read(&c->mark_lock);
return marked;
@@ -423,66 +414,50 @@ err:
goto out;
}
-int bch2_mark_replicas(struct bch_fs *c,
- struct bch_replicas_entry *r)
+static int __bch2_mark_replicas(struct bch_fs *c,
+ struct bch_replicas_entry *r,
+ bool check)
{
- return likely(bch2_replicas_marked(c, r, true))
- ? 0
+ return likely(bch2_replicas_marked(c, r)) ? 0
+ : check ? -1
: bch2_mark_replicas_slowpath(c, r);
}
-bool bch2_bkey_replicas_marked_locked(struct bch_fs *c,
- struct bkey_s_c k,
- bool check_gc_replicas)
+int bch2_mark_replicas(struct bch_fs *c, struct bch_replicas_entry *r)
+{
+ return __bch2_mark_replicas(c, r, false);
+}
+
+static int __bch2_mark_bkey_replicas(struct bch_fs *c, struct bkey_s_c k,
+ bool check)
{
struct bch_replicas_padded search;
struct bch_devs_list cached = bch2_bkey_cached_devs(k);
unsigned i;
+ int ret;
for (i = 0; i < cached.nr; i++) {
bch2_replicas_entry_cached(&search.e, cached.devs[i]);
- if (!bch2_replicas_marked_locked(c, &search.e,
- check_gc_replicas))
- return false;
+ ret = __bch2_mark_replicas(c, &search.e, check);
+ if (ret)
+ return ret;
}
bch2_bkey_to_replicas(&search.e, k);
- return bch2_replicas_marked_locked(c, &search.e, check_gc_replicas);
+ return __bch2_mark_replicas(c, &search.e, check);
}
bool bch2_bkey_replicas_marked(struct bch_fs *c,
- struct bkey_s_c k,
- bool check_gc_replicas)
+ struct bkey_s_c k)
{
- bool marked;
-
- percpu_down_read(&c->mark_lock);
- marked = bch2_bkey_replicas_marked_locked(c, k, check_gc_replicas);
- percpu_up_read(&c->mark_lock);
-
- return marked;
+ return __bch2_mark_bkey_replicas(c, k, true) == 0;
}
int bch2_mark_bkey_replicas(struct bch_fs *c, struct bkey_s_c k)
{
- struct bch_replicas_padded search;
- struct bch_devs_list cached = bch2_bkey_cached_devs(k);
- unsigned i;
- int ret;
-
- for (i = 0; i < cached.nr; i++) {
- bch2_replicas_entry_cached(&search.e, cached.devs[i]);
-
- ret = bch2_mark_replicas(c, &search.e);
- if (ret)
- return ret;
- }
-
- bch2_bkey_to_replicas(&search.e, k);
-
- return bch2_mark_replicas(c, &search.e);
+ return __bch2_mark_bkey_replicas(c, k, false);
}
int bch2_replicas_gc_end(struct bch_fs *c, int ret)
@@ -611,7 +586,7 @@ retry:
struct bch_replicas_entry *e =
cpu_replicas_entry(&c->replicas, i);
- if (e->data_type == BCH_DATA_JOURNAL ||
+ if (e->data_type == BCH_DATA_journal ||
c->usage_base->replicas[i] ||
percpu_u64_get(&c->usage[0]->replicas[i]) ||
percpu_u64_get(&c->usage[1]->replicas[i]))
@@ -1037,13 +1012,13 @@ static bool have_enough_devs(struct replicas_status s,
bool bch2_have_enough_devs(struct replicas_status s, unsigned flags)
{
- return (have_enough_devs(s, BCH_DATA_JOURNAL,
+ return (have_enough_devs(s, BCH_DATA_journal,
flags & BCH_FORCE_IF_METADATA_DEGRADED,
flags & BCH_FORCE_IF_METADATA_LOST) &&
- have_enough_devs(s, BCH_DATA_BTREE,
+ have_enough_devs(s, BCH_DATA_btree,
flags & BCH_FORCE_IF_METADATA_DEGRADED,
flags & BCH_FORCE_IF_METADATA_LOST) &&
- have_enough_devs(s, BCH_DATA_USER,
+ have_enough_devs(s, BCH_DATA_user,
flags & BCH_FORCE_IF_DATA_DEGRADED,
flags & BCH_FORCE_IF_DATA_LOST));
}
@@ -1053,9 +1028,9 @@ int bch2_replicas_online(struct bch_fs *c, bool meta)
struct replicas_status s = bch2_replicas_status(c);
return (meta
- ? min(s.replicas[BCH_DATA_JOURNAL].redundancy,
- s.replicas[BCH_DATA_BTREE].redundancy)
- : s.replicas[BCH_DATA_USER].redundancy) + 1;
+ ? min(s.replicas[BCH_DATA_journal].redundancy,
+ s.replicas[BCH_DATA_btree].redundancy)
+ : s.replicas[BCH_DATA_user].redundancy) + 1;
}
unsigned bch2_dev_has_data(struct bch_fs *c, struct bch_dev *ca)
diff --git a/fs/bcachefs/replicas.h b/fs/bcachefs/replicas.h
index 8527d82841bb..8b95164fbb56 100644
--- a/fs/bcachefs/replicas.h
+++ b/fs/bcachefs/replicas.h
@@ -21,22 +21,18 @@ int bch2_replicas_entry_idx(struct bch_fs *,
void bch2_devlist_to_replicas(struct bch_replicas_entry *,
enum bch_data_type,
struct bch_devs_list);
-bool bch2_replicas_marked(struct bch_fs *,
- struct bch_replicas_entry *, bool);
+bool bch2_replicas_marked(struct bch_fs *, struct bch_replicas_entry *);
int bch2_mark_replicas(struct bch_fs *,
struct bch_replicas_entry *);
-bool bch2_bkey_replicas_marked_locked(struct bch_fs *,
- struct bkey_s_c, bool);
void bch2_bkey_to_replicas(struct bch_replicas_entry *, struct bkey_s_c);
-bool bch2_bkey_replicas_marked(struct bch_fs *,
- struct bkey_s_c, bool);
+bool bch2_bkey_replicas_marked(struct bch_fs *, struct bkey_s_c);
int bch2_mark_bkey_replicas(struct bch_fs *, struct bkey_s_c);
static inline void bch2_replicas_entry_cached(struct bch_replicas_entry *e,
unsigned dev)
{
- e->data_type = BCH_DATA_CACHED;
+ e->data_type = BCH_DATA_cached;
e->nr_devs = 1;
e->nr_required = 1;
e->devs[0] = dev;
diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c
index f2be64c869df..cee6cc938734 100644
--- a/fs/bcachefs/super-io.c
+++ b/fs/bcachefs/super-io.c
@@ -636,7 +636,8 @@ static void write_super_endio(struct bio *bio)
/* XXX: return errors directly */
- if (bch2_dev_io_err_on(bio->bi_status, ca, "superblock write"))
+ if (bch2_dev_io_err_on(bio->bi_status, ca, "superblock write: %s",
+ bch2_blk_status_to_str(bio->bi_status)))
ca->sb_write_error = 1;
closure_put(&ca->fs->sb_write);
@@ -656,7 +657,7 @@ static void read_back_super(struct bch_fs *c, struct bch_dev *ca)
bio_set_op_attrs(bio, REQ_OP_READ, REQ_SYNC|REQ_META);
bch2_bio_map(bio, ca->sb_read_scratch, PAGE_SIZE);
- this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_SB],
+ this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_sb],
bio_sectors(bio));
percpu_ref_get(&ca->io_ref);
@@ -684,7 +685,7 @@ static void write_one_super(struct bch_fs *c, struct bch_dev *ca, unsigned idx)
roundup((size_t) vstruct_bytes(sb),
bdev_logical_block_size(ca->disk_sb.bdev)));
- this_cpu_add(ca->io_done->sectors[WRITE][BCH_DATA_SB],
+ this_cpu_add(ca->io_done->sectors[WRITE][BCH_DATA_sb],
bio_sectors(bio));
percpu_ref_get(&ca->io_ref);
diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c
index 0cdf285e4ffd..1d9a6bfa8c13 100644
--- a/fs/bcachefs/super.c
+++ b/fs/bcachefs/super.c
@@ -172,7 +172,7 @@ int bch2_congested(void *data, int bdi_bits)
unsigned target = READ_ONCE(c->opts.foreground_target);
const struct bch_devs_mask *devs = target
? bch2_target_to_mask(c, target)
- : &c->rw_devs[BCH_DATA_USER];
+ : &c->rw_devs[BCH_DATA_user];
for_each_member_device_rcu(ca, c, i, devs) {
bdi = ca->disk_sb.bdev->bd_bdi;
@@ -213,10 +213,7 @@ static void __bch2_fs_read_only(struct bch_fs *c)
int ret;
bch2_rebalance_stop(c);
-
- for_each_member_device(ca, c, i)
- bch2_copygc_stop(ca);
-
+ bch2_copygc_stop(c);
bch2_gc_thread_stop(c);
/*
@@ -396,8 +393,6 @@ bool bch2_fs_emergency_read_only(struct bch_fs *c)
static int bch2_fs_read_write_late(struct bch_fs *c)
{
- struct bch_dev *ca;
- unsigned i;
int ret;
ret = bch2_gc_thread_start(c);
@@ -406,13 +401,10 @@ static int bch2_fs_read_write_late(struct bch_fs *c)
return ret;
}
- for_each_rw_member(ca, c, i) {
- ret = bch2_copygc_start(c, ca);
- if (ret) {
- bch_err(c, "error starting copygc threads");
- percpu_ref_put(&ca->io_ref);
- return ret;
- }
+ ret = bch2_copygc_start(c);
+ if (ret) {
+ bch_err(c, "error starting copygc thread");
+ return ret;
}
ret = bch2_rebalance_start(c);
@@ -535,6 +527,7 @@ static void bch2_fs_free(struct bch_fs *c)
kfree(c->replicas_gc.entries);
kfree(rcu_dereference_protected(c->disk_groups, 1));
kfree(c->journal_seq_blacklist_table);
+ free_heap(&c->copygc_heap);
if (c->journal_reclaim_wq)
destroy_workqueue(c->journal_reclaim_wq);
@@ -684,6 +677,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
for (i = 0; i < BCH_TIME_STAT_NR; i++)
bch2_time_stats_init(&c->times[i]);
+ bch2_fs_copygc_init(c);
bch2_fs_btree_key_cache_init_early(&c->btree_key_cache);
bch2_fs_allocator_background_init(c);
bch2_fs_allocator_foreground_init(c);
@@ -708,9 +702,12 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
INIT_LIST_HEAD(&c->fsck_errors);
mutex_init(&c->fsck_error_lock);
- INIT_LIST_HEAD(&c->ec_new_stripe_list);
- mutex_init(&c->ec_new_stripe_lock);
- mutex_init(&c->ec_stripe_create_lock);
+ INIT_LIST_HEAD(&c->ec_stripe_head_list);
+ mutex_init(&c->ec_stripe_head_lock);
+
+ INIT_LIST_HEAD(&c->ec_stripe_new_list);
+ mutex_init(&c->ec_stripe_new_lock);
+
spin_lock_init(&c->ec_stripes_heap_lock);
seqcount_init(&c->gc_pos_lock);
@@ -1108,10 +1105,6 @@ static struct bch_dev *__bch2_dev_alloc(struct bch_fs *c,
init_rwsem(&ca->bucket_lock);
- writepoint_init(&ca->copygc_write_point, BCH_DATA_USER);
-
- bch2_dev_copygc_init(ca);
-
INIT_WORK(&ca->io_error_work, bch2_io_error_work);
bch2_time_stats_init(&ca->io_latency[READ]);
@@ -1241,7 +1234,7 @@ static int bch2_dev_attach_bdev(struct bch_fs *c, struct bch_sb_handle *sb)
return ret;
if (test_bit(BCH_FS_ALLOC_READ_DONE, &c->flags) &&
- !percpu_u64_get(&ca->usage[0]->buckets[BCH_DATA_SB])) {
+ !percpu_u64_get(&ca->usage[0]->buckets[BCH_DATA_sb])) {
mutex_lock(&c->sb_lock);
bch2_mark_dev_superblock(ca->fs, ca, 0);
mutex_unlock(&c->sb_lock);
@@ -1352,7 +1345,11 @@ static bool bch2_fs_may_start(struct bch_fs *c)
static void __bch2_dev_read_only(struct bch_fs *c, struct bch_dev *ca)
{
- bch2_copygc_stop(ca);
+ /*
+	 * Device going read only means the copygc reserve gets smaller, so we
+ * don't want that happening while copygc is in progress:
+ */
+ bch2_copygc_stop(c);
/*
* The allocator thread itself allocates btree nodes, so stop it first:
@@ -1360,6 +1357,8 @@ static void __bch2_dev_read_only(struct bch_fs *c, struct bch_dev *ca)
bch2_dev_allocator_stop(ca);
bch2_dev_allocator_remove(c, ca);
bch2_dev_journal_stop(&c->journal, ca);
+
+ bch2_copygc_start(c);
}
static const char *__bch2_dev_read_write(struct bch_fs *c, struct bch_dev *ca)
@@ -1374,9 +1373,6 @@ static const char *__bch2_dev_read_write(struct bch_fs *c, struct bch_dev *ca)
if (bch2_dev_allocator_start(ca))
return "error starting allocator thread";
- if (bch2_copygc_start(c, ca))
- return "error starting copygc thread";
-
return NULL;
}
diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c
index c169d282a1f9..0cb29f43d99d 100644
--- a/fs/bcachefs/sysfs.c
+++ b/fs/bcachefs/sysfs.c
@@ -75,7 +75,6 @@ do { \
#define sysfs_hprint(file, val) \
do { \
if (attr == &sysfs_ ## file) { \
- struct printbuf out = _PBUF(buf, PAGE_SIZE); \
bch2_hprint(&out, val); \
pr_buf(&out, "\n"); \
return out.pos - buf; \
@@ -168,6 +167,7 @@ read_attribute(btree_updates);
read_attribute(dirty_btree_nodes);
read_attribute(btree_key_cache);
read_attribute(btree_transactions);
+read_attribute(stripes_heap);
read_attribute(internal_uuid);
@@ -238,24 +238,22 @@ static size_t bch2_btree_cache_size(struct bch_fs *c)
return ret;
}
-static ssize_t show_fs_alloc_debug(struct bch_fs *c, char *buf)
+static int fs_alloc_debug_to_text(struct printbuf *out, struct bch_fs *c)
{
- struct printbuf out = _PBUF(buf, PAGE_SIZE);
struct bch_fs_usage *fs_usage = bch2_fs_usage_read(c);
if (!fs_usage)
return -ENOMEM;
- bch2_fs_usage_to_text(&out, c, fs_usage);
+ bch2_fs_usage_to_text(out, c, fs_usage);
percpu_up_read(&c->mark_lock);
kfree(fs_usage);
-
- return out.pos - buf;
+ return 0;
}
-static ssize_t bch2_compression_stats(struct bch_fs *c, char *buf)
+static int bch2_compression_stats_to_text(struct printbuf *out, struct bch_fs *c)
{
struct btree_trans trans;
struct btree_iter *iter;
@@ -298,59 +296,26 @@ static ssize_t bch2_compression_stats(struct bch_fs *c, char *buf)
if (ret)
return ret;
- return scnprintf(buf, PAGE_SIZE,
- "uncompressed data:\n"
- " nr extents: %llu\n"
- " size (bytes): %llu\n"
- "compressed data:\n"
- " nr extents: %llu\n"
- " compressed size (bytes): %llu\n"
- " uncompressed size (bytes): %llu\n",
- nr_uncompressed_extents,
- uncompressed_sectors << 9,
- nr_compressed_extents,
- compressed_sectors_compressed << 9,
- compressed_sectors_uncompressed << 9);
-}
-
-static ssize_t bch2_new_stripes(struct bch_fs *c, char *buf)
-{
- char *out = buf, *end = buf + PAGE_SIZE;
- struct ec_stripe_head *h;
- struct ec_stripe_new *s;
-
- mutex_lock(&c->ec_new_stripe_lock);
- list_for_each_entry(h, &c->ec_new_stripe_list, list) {
- out += scnprintf(out, end - out,
- "target %u algo %u redundancy %u:\n",
- h->target, h->algo, h->redundancy);
-
- if (h->s)
- out += scnprintf(out, end - out,
- "\tpending: blocks %u allocated %u\n",
- h->s->blocks.nr,
- bitmap_weight(h->s->blocks_allocated,
- h->s->blocks.nr));
-
- mutex_lock(&h->lock);
- list_for_each_entry(s, &h->stripes, list)
- out += scnprintf(out, end - out,
- "\tin flight: blocks %u allocated %u pin %u\n",
- s->blocks.nr,
- bitmap_weight(s->blocks_allocated,
- s->blocks.nr),
- atomic_read(&s->pin));
- mutex_unlock(&h->lock);
-
- }
- mutex_unlock(&c->ec_new_stripe_lock);
-
- return out - buf;
+ pr_buf(out,
+ "uncompressed data:\n"
+ " nr extents: %llu\n"
+ " size (bytes): %llu\n"
+ "compressed data:\n"
+ " nr extents: %llu\n"
+ " compressed size (bytes): %llu\n"
+ " uncompressed size (bytes): %llu\n",
+ nr_uncompressed_extents,
+ uncompressed_sectors << 9,
+ nr_compressed_extents,
+ compressed_sectors_compressed << 9,
+ compressed_sectors_uncompressed << 9);
+ return 0;
}
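
The conversion above replaces scnprintf() into a raw page buffer with pr_buf() into a struct printbuf that tracks its own write position. A simplified userspace model of that pattern follows; the field layout and the PBUF()/pr_buf() bodies here are assumptions for illustration, and the real _PBUF()/pr_buf() live in the bcachefs headers.

/* Simplified model of the printbuf pattern; not the kernel implementation. */
#include <stdarg.h>
#include <stdio.h>

struct printbuf {
	char	*buf;	/* start of the output buffer */
	char	*pos;	/* next byte to write */
	char	*end;	/* one past the last usable byte */
};

#define PBUF(_buf, _len) ((struct printbuf) {		\
	.buf = (_buf),					\
	.pos = (_buf),					\
	.end = (_buf) + (_len),				\
})

static void pr_buf(struct printbuf *out, const char *fmt, ...)
{
	va_list args;

	va_start(args, fmt);
	/* vsnprintf never writes past end and always NUL terminates */
	out->pos += vsnprintf(out->pos, out->end - out->pos, fmt, args);
	va_end(args);

	if (out->pos > out->end)	/* clamp pos on truncation */
		out->pos = out->end;
}

int main(void)
{
	char page[4096];
	struct printbuf out = PBUF(page, sizeof(page));

	pr_buf(&out, "nr extents: %llu\n", 42ULL);
	pr_buf(&out, "size (bytes): %llu\n", 42ULL << 9);

	/* a sysfs show() hook would return the number of bytes emitted */
	printf("%s(%td bytes)\n", page, out.pos - page);
	return 0;
}
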
SHOW(bch2_fs)
{
struct bch_fs *c = container_of(kobj, struct bch_fs, kobj);
+ struct printbuf out = _PBUF(buf, PAGE_SIZE);
sysfs_print(minor, c->minor);
sysfs_printf(internal_uuid, "%pU", c->sb.uuid.b);
@@ -378,9 +343,12 @@ SHOW(bch2_fs)
sysfs_printf(rebalance_enabled, "%i", c->rebalance.enabled);
sysfs_pd_controller_show(rebalance, &c->rebalance.pd); /* XXX */
+ sysfs_pd_controller_show(copy_gc, &c->copygc_pd);
- if (attr == &sysfs_rebalance_work)
- return bch2_rebalance_work_show(c, buf);
+ if (attr == &sysfs_rebalance_work) {
+ bch2_rebalance_work_to_text(&out, c);
+ return out.pos - buf;
+ }
sysfs_print(promote_whole_extents, c->promote_whole_extents);
@@ -390,44 +358,61 @@ SHOW(bch2_fs)
/* Debugging: */
if (attr == &sysfs_alloc_debug)
- return show_fs_alloc_debug(c, buf);
+ return fs_alloc_debug_to_text(&out, c) ?: out.pos - buf;
- if (attr == &sysfs_journal_debug)
- return bch2_journal_print_debug(&c->journal, buf);
+ if (attr == &sysfs_journal_debug) {
+ bch2_journal_debug_to_text(&out, &c->journal);
+ return out.pos - buf;
+ }
- if (attr == &sysfs_journal_pins)
- return bch2_journal_print_pins(&c->journal, buf);
+ if (attr == &sysfs_journal_pins) {
+ bch2_journal_pins_to_text(&out, &c->journal);
+ return out.pos - buf;
+ }
- if (attr == &sysfs_btree_updates)
- return bch2_btree_updates_print(c, buf);
+ if (attr == &sysfs_btree_updates) {
+ bch2_btree_updates_to_text(&out, c);
+ return out.pos - buf;
+ }
- if (attr == &sysfs_dirty_btree_nodes)
- return bch2_dirty_btree_nodes_print(c, buf);
+ if (attr == &sysfs_dirty_btree_nodes) {
+ bch2_dirty_btree_nodes_to_text(&out, c);
+ return out.pos - buf;
+ }
if (attr == &sysfs_btree_key_cache) {
- struct printbuf out = _PBUF(buf, PAGE_SIZE);
-
bch2_btree_key_cache_to_text(&out, &c->btree_key_cache);
return out.pos - buf;
}
if (attr == &sysfs_btree_transactions) {
- struct printbuf out = _PBUF(buf, PAGE_SIZE);
-
bch2_btree_trans_to_text(&out, c);
return out.pos - buf;
}
- if (attr == &sysfs_compression_stats)
- return bch2_compression_stats(c, buf);
+ if (attr == &sysfs_stripes_heap) {
+ bch2_stripes_heap_to_text(&out, c);
+ return out.pos - buf;
+ }
+
+ if (attr == &sysfs_compression_stats) {
+ bch2_compression_stats_to_text(&out, c);
+ return out.pos - buf;
+ }
- if (attr == &sysfs_new_stripes)
- return bch2_new_stripes(c, buf);
+ if (attr == &sysfs_new_stripes) {
+ bch2_new_stripes_to_text(&out, c);
+ return out.pos - buf;
+ }
- if (attr == &sysfs_io_timers_read)
- return bch2_io_timers_show(&c->io_clock[READ], buf);
- if (attr == &sysfs_io_timers_write)
- return bch2_io_timers_show(&c->io_clock[WRITE], buf);
+ if (attr == &sysfs_io_timers_read) {
+ bch2_io_timers_to_text(&out, &c->io_clock[READ]);
+ return out.pos - buf;
+ }
+ if (attr == &sysfs_io_timers_write) {
+ bch2_io_timers_to_text(&out, &c->io_clock[WRITE]);
+ return out.pos - buf;
+ }
#define BCH_DEBUG_PARAM(name, description) sysfs_print(name, c->name);
BCH_DEBUG_PARAMS()
@@ -452,14 +437,11 @@ STORE(bch2_fs)
}
if (attr == &sysfs_copy_gc_enabled) {
- struct bch_dev *ca;
- unsigned i;
ssize_t ret = strtoul_safe(buf, c->copy_gc_enabled)
?: (ssize_t) size;
- for_each_member_device(ca, c, i)
- if (ca->copygc_thread)
- wake_up_process(ca->copygc_thread);
+ if (c->copygc_thread)
+ wake_up_process(c->copygc_thread);
return ret;
}
@@ -474,6 +456,7 @@ STORE(bch2_fs)
sysfs_strtoul(pd_controllers_update_seconds,
c->pd_controllers_update_seconds);
sysfs_pd_controller_store(rebalance, &c->rebalance.pd);
+ sysfs_pd_controller_store(copy_gc, &c->copygc_pd);
sysfs_strtoul(promote_whole_extents, c->promote_whole_extents);
@@ -583,6 +566,7 @@ struct attribute *bch2_fs_internal_files[] = {
&sysfs_dirty_btree_nodes,
&sysfs_btree_key_cache,
&sysfs_btree_transactions,
+ &sysfs_stripes_heap,
&sysfs_read_realloc_races,
&sysfs_extent_migrate_done,
@@ -598,6 +582,7 @@ struct attribute *bch2_fs_internal_files[] = {
&sysfs_rebalance_enabled,
&sysfs_rebalance_work,
sysfs_pd_controller_files(rebalance),
+ sysfs_pd_controller_files(copy_gc),
&sysfs_new_stripes,
@@ -696,11 +681,13 @@ int bch2_opts_create_sysfs_files(struct kobject *kobj)
SHOW(bch2_fs_time_stats)
{
struct bch_fs *c = container_of(kobj, struct bch_fs, time_stats);
+ struct printbuf out = _PBUF(buf, PAGE_SIZE);
-#define x(name) \
- if (attr == &sysfs_time_stat_##name) \
- return bch2_time_stats_print(&c->times[BCH_TIME_##name],\
- buf, PAGE_SIZE);
+#define x(name) \
+ if (attr == &sysfs_time_stat_##name) { \
+ bch2_time_stats_to_text(&out, &c->times[BCH_TIME_##name]);\
+ return out.pos - buf; \
+ }
BCH_TIME_STATS()
#undef x
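
The block above is the usual x-macro idiom: BCH_TIME_STATS() expands x() once per stat, so each sysfs attribute gets its own if-branch without hand-written boilerplate. A standalone sketch of the technique with made-up stat names; the real stat list lives in the bcachefs headers.

#include <stdio.h>
#include <string.h>

/* hypothetical stat list; each entry becomes one x(...) expansion */
#define FAKE_TIME_STATS()	\
	x(btree_node_read)	\
	x(journal_write)

static int show_time_stat(const char *attr)
{
	/* expands to one "if" per list entry */
#define x(name)							\
	if (!strcmp(attr, "time_stat_" #name)) {		\
		printf("stats for %s\n", #name);		\
		return 0;					\
	}
	FAKE_TIME_STATS()
#undef x
	return -1;
}

int main(void)
{
	return show_time_stat("time_stat_journal_write");
}
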
@@ -753,13 +740,13 @@ static int unsigned_cmp(const void *_l, const void *_r)
return cmp_int(*l, *r);
}
-static ssize_t show_quantiles(struct bch_fs *c, struct bch_dev *ca,
- char *buf, bucket_map_fn *fn, void *private)
+static int quantiles_to_text(struct printbuf *out,
+ struct bch_fs *c, struct bch_dev *ca,
+ bucket_map_fn *fn, void *private)
{
size_t i, n;
/* Compute 31 quantiles */
unsigned q[31], *p;
- ssize_t ret = 0;
down_read(&ca->bucket_lock);
n = ca->mi.nbuckets;
@@ -786,38 +773,33 @@ static ssize_t show_quantiles(struct bch_fs *c, struct bch_dev *ca,
vfree(p);
for (i = 0; i < ARRAY_SIZE(q); i++)
- ret += scnprintf(buf + ret, PAGE_SIZE - ret,
- "%u ", q[i]);
- buf[ret - 1] = '\n';
-
- return ret;
+ pr_buf(out, "%u ", q[i]);
+ pr_buf(out, "\n");
+ return 0;
}
-static ssize_t show_reserve_stats(struct bch_dev *ca, char *buf)
+static void reserve_stats_to_text(struct printbuf *out, struct bch_dev *ca)
{
- struct printbuf out = _PBUF(buf, PAGE_SIZE);
enum alloc_reserve i;
spin_lock(&ca->fs->freelist_lock);
- pr_buf(&out, "free_inc:\t%zu\t%zu\n",
+ pr_buf(out, "free_inc:\t%zu\t%zu\n",
fifo_used(&ca->free_inc),
ca->free_inc.size);
for (i = 0; i < RESERVE_NR; i++)
- pr_buf(&out, "free[%u]:\t%zu\t%zu\n", i,
+ pr_buf(out, "free[%u]:\t%zu\t%zu\n", i,
fifo_used(&ca->free[i]),
ca->free[i].size);
spin_unlock(&ca->fs->freelist_lock);
-
- return out.pos - buf;
}
-static ssize_t show_dev_alloc_debug(struct bch_dev *ca, char *buf)
+static void dev_alloc_debug_to_text(struct printbuf *out, struct bch_dev *ca)
{
struct bch_fs *c = ca->fs;
- struct bch_dev_usage stats = bch2_dev_usage_read(c, ca);
+ struct bch_dev_usage stats = bch2_dev_usage_read(ca);
unsigned i, nr[BCH_DATA_NR];
memset(nr, 0, sizeof(nr));
@@ -825,7 +807,7 @@ static ssize_t show_dev_alloc_debug(struct bch_dev *ca, char *buf)
for (i = 0; i < ARRAY_SIZE(c->open_buckets); i++)
nr[c->open_buckets[i].type]++;
- return scnprintf(buf, PAGE_SIZE,
+ pr_buf(out,
"free_inc: %zu/%zu\n"
"free[RESERVE_BTREE]: %zu/%zu\n"
"free[RESERVE_MOVINGGC]: %zu/%zu\n"
@@ -861,27 +843,27 @@ static ssize_t show_dev_alloc_debug(struct bch_dev *ca, char *buf)
fifo_used(&ca->free[RESERVE_NONE]), ca->free[RESERVE_NONE].size,
ca->mi.nbuckets - ca->mi.first_bucket,
stats.buckets_alloc,
- stats.buckets[BCH_DATA_SB],
- stats.buckets[BCH_DATA_JOURNAL],
- stats.buckets[BCH_DATA_BTREE],
- stats.buckets[BCH_DATA_USER],
- stats.buckets[BCH_DATA_CACHED],
+ stats.buckets[BCH_DATA_sb],
+ stats.buckets[BCH_DATA_journal],
+ stats.buckets[BCH_DATA_btree],
+ stats.buckets[BCH_DATA_user],
+ stats.buckets[BCH_DATA_cached],
stats.buckets_ec,
- ca->mi.nbuckets - ca->mi.first_bucket - stats.buckets_unavailable,
- stats.sectors[BCH_DATA_SB],
- stats.sectors[BCH_DATA_JOURNAL],
- stats.sectors[BCH_DATA_BTREE],
- stats.sectors[BCH_DATA_USER],
- stats.sectors[BCH_DATA_CACHED],
+ __dev_buckets_available(ca, stats),
+ stats.sectors[BCH_DATA_sb],
+ stats.sectors[BCH_DATA_journal],
+ stats.sectors[BCH_DATA_btree],
+ stats.sectors[BCH_DATA_user],
+ stats.sectors[BCH_DATA_cached],
stats.sectors_ec,
stats.sectors_fragmented,
- ca->copygc_threshold,
+ c->copygc_threshold,
c->freelist_wait.list.first ? "waiting" : "empty",
c->open_buckets_nr_free, OPEN_BUCKETS_COUNT,
BTREE_NODE_OPEN_BUCKET_RESERVE,
c->open_buckets_wait.list.first ? "waiting" : "empty",
- nr[BCH_DATA_BTREE],
- nr[BCH_DATA_USER],
+ nr[BCH_DATA_btree],
+ nr[BCH_DATA_user],
c->btree_reserve_cache_nr);
}
@@ -891,21 +873,18 @@ static const char * const bch2_rw[] = {
NULL
};
-static ssize_t show_dev_iodone(struct bch_dev *ca, char *buf)
+static void dev_iodone_to_text(struct printbuf *out, struct bch_dev *ca)
{
- struct printbuf out = _PBUF(buf, PAGE_SIZE);
int rw, i;
for (rw = 0; rw < 2; rw++) {
- pr_buf(&out, "%s:\n", bch2_rw[rw]);
+ pr_buf(out, "%s:\n", bch2_rw[rw]);
for (i = 1; i < BCH_DATA_NR; i++)
- pr_buf(&out, "%-12s:%12llu\n",
+ pr_buf(out, "%-12s:%12llu\n",
bch2_data_types[i],
percpu_u64_get(&ca->io_done->sectors[rw][i]) << 9);
}
-
- return out.pos - buf;
}
SHOW(bch2_dev)
@@ -942,8 +921,6 @@ SHOW(bch2_dev)
return out.pos - buf;
}
- sysfs_pd_controller_show(copy_gc, &ca->copygc_pd);
-
if (attr == &sysfs_cache_replacement_policy) {
bch2_string_opt_to_text(&out,
bch2_cache_replacement_policies,
@@ -959,34 +936,44 @@ SHOW(bch2_dev)
return out.pos - buf;
}
- if (attr == &sysfs_iodone)
- return show_dev_iodone(ca, buf);
+ if (attr == &sysfs_iodone) {
+ dev_iodone_to_text(&out, ca);
+ return out.pos - buf;
+ }
sysfs_print(io_latency_read, atomic64_read(&ca->cur_latency[READ]));
sysfs_print(io_latency_write, atomic64_read(&ca->cur_latency[WRITE]));
- if (attr == &sysfs_io_latency_stats_read)
- return bch2_time_stats_print(&ca->io_latency[READ], buf, PAGE_SIZE);
- if (attr == &sysfs_io_latency_stats_write)
- return bch2_time_stats_print(&ca->io_latency[WRITE], buf, PAGE_SIZE);
+ if (attr == &sysfs_io_latency_stats_read) {
+ bch2_time_stats_to_text(&out, &ca->io_latency[READ]);
+ return out.pos - buf;
+ }
+ if (attr == &sysfs_io_latency_stats_write) {
+ bch2_time_stats_to_text(&out, &ca->io_latency[WRITE]);
+ return out.pos - buf;
+ }
sysfs_printf(congested, "%u%%",
clamp(atomic_read(&ca->congested), 0, CONGESTED_MAX)
* 100 / CONGESTED_MAX);
if (attr == &sysfs_bucket_quantiles_last_read)
- return show_quantiles(c, ca, buf, bucket_last_io_fn, (void *) 0);
+ return quantiles_to_text(&out, c, ca, bucket_last_io_fn, (void *) 0) ?: out.pos - buf;
if (attr == &sysfs_bucket_quantiles_last_write)
- return show_quantiles(c, ca, buf, bucket_last_io_fn, (void *) 1);
+ return quantiles_to_text(&out, c, ca, bucket_last_io_fn, (void *) 1) ?: out.pos - buf;
if (attr == &sysfs_bucket_quantiles_fragmentation)
- return show_quantiles(c, ca, buf, bucket_sectors_used_fn, NULL);
+ return quantiles_to_text(&out, c, ca, bucket_sectors_used_fn, NULL) ?: out.pos - buf;
if (attr == &sysfs_bucket_quantiles_oldest_gen)
- return show_quantiles(c, ca, buf, bucket_oldest_gen_fn, NULL);
+ return quantiles_to_text(&out, c, ca, bucket_oldest_gen_fn, NULL) ?: out.pos - buf;
- if (attr == &sysfs_reserve_stats)
- return show_reserve_stats(ca, buf);
- if (attr == &sysfs_alloc_debug)
- return show_dev_alloc_debug(ca, buf);
+ if (attr == &sysfs_reserve_stats) {
+ reserve_stats_to_text(&out, ca);
+ return out.pos - buf;
+ }
+ if (attr == &sysfs_alloc_debug) {
+ dev_alloc_debug_to_text(&out, ca);
+ return out.pos - buf;
+ }
return 0;
}
@@ -997,8 +984,6 @@ STORE(bch2_dev)
struct bch_fs *c = ca->fs;
struct bch_member *mi;
- sysfs_pd_controller_store(copy_gc, &ca->copygc_pd);
-
if (attr == &sysfs_discard) {
bool v = strtoul_or_return(buf);
@@ -1083,8 +1068,6 @@ struct attribute *bch2_dev_files[] = {
/* debug: */
&sysfs_alloc_debug,
&sysfs_wake_allocator,
-
- sysfs_pd_controller_files(copy_gc),
NULL
};
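
Several SHOW() branches above return "helper(&out, ...) ?: out.pos - buf", the GNU a ?: b form that propagates a negative errno from the *_to_text() helper and otherwise returns the number of bytes written. A small sketch of that idiom with hypothetical names:

#include <errno.h>
#include <stdio.h>

/* stand-in for a *_to_text() helper: fills buf or fails with -errno */
static int fill_buf(char *buf, long *pos, int fail)
{
	if (fail)
		return -ENOMEM;
	*pos += sprintf(buf + *pos, "hello\n");
	return 0;
}

static long show(char *buf, int fail)
{
	long pos = 0;

	/* negative errno if fill_buf() failed, byte count otherwise */
	return fill_buf(buf, &pos, fail) ?: pos;
}

int main(void)
{
	char buf[64];

	printf("%ld\n", show(buf, 0));	/* 6: bytes written */
	printf("%ld\n", show(buf, 1));	/* negative: -ENOMEM propagated */
	return 0;
}
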
diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c
index e69d03d1109f..fd4044a6a08f 100644
--- a/fs/bcachefs/util.c
+++ b/fs/bcachefs/util.c
@@ -318,43 +318,40 @@ static void pr_time_units(struct printbuf *out, u64 ns)
pr_buf(out, "%llu %s", div_u64(ns, u->nsecs), u->name);
}
-size_t bch2_time_stats_print(struct time_stats *stats, char *buf, size_t len)
+void bch2_time_stats_to_text(struct printbuf *out, struct time_stats *stats)
{
- struct printbuf out = _PBUF(buf, len);
const struct time_unit *u;
u64 freq = READ_ONCE(stats->average_frequency);
u64 q, last_q = 0;
int i;
- pr_buf(&out, "count:\t\t%llu\n",
+ pr_buf(out, "count:\t\t%llu\n",
stats->count);
- pr_buf(&out, "rate:\t\t%llu/sec\n",
+ pr_buf(out, "rate:\t\t%llu/sec\n",
freq ? div64_u64(NSEC_PER_SEC, freq) : 0);
- pr_buf(&out, "frequency:\t");
- pr_time_units(&out, freq);
+ pr_buf(out, "frequency:\t");
+ pr_time_units(out, freq);
- pr_buf(&out, "\navg duration:\t");
- pr_time_units(&out, stats->average_duration);
+ pr_buf(out, "\navg duration:\t");
+ pr_time_units(out, stats->average_duration);
- pr_buf(&out, "\nmax duration:\t");
- pr_time_units(&out, stats->max_duration);
+ pr_buf(out, "\nmax duration:\t");
+ pr_time_units(out, stats->max_duration);
i = eytzinger0_first(NR_QUANTILES);
u = pick_time_units(stats->quantiles.entries[i].m);
- pr_buf(&out, "\nquantiles (%s):\t", u->name);
+ pr_buf(out, "\nquantiles (%s):\t", u->name);
eytzinger0_for_each(i, NR_QUANTILES) {
bool is_last = eytzinger0_next(i, NR_QUANTILES) == -1;
q = max(stats->quantiles.entries[i].m, last_q);
- pr_buf(&out, "%llu%s",
+ pr_buf(out, "%llu%s",
div_u64(q, u->nsecs),
is_last ? "\n" : " ");
last_q = q;
}
-
- return out.pos - buf;
}
void bch2_time_stats_exit(struct time_stats *stats)
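
bch2_time_stats_to_text() above prints each duration via pick_time_units()/pr_time_units(), scaling by the chosen unit's nsecs. Below is a userspace approximation of that pair; the unit table and selection rule are assumptions for illustration, not the bcachefs implementation.

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

struct time_unit {
	const char	*name;
	uint64_t	nsecs;
};

static const struct time_unit time_units[] = {
	{ "ns",	1		},
	{ "us",	1000ULL		},
	{ "ms",	1000000ULL	},
	{ "s",	1000000000ULL	},
};

static const struct time_unit *pick_time_units(uint64_t ns)
{
	const struct time_unit *u = time_units;

	/* step up while the next larger unit still yields a non-zero value */
	while (u + 1 < time_units + sizeof(time_units) / sizeof(*time_units) &&
	       ns >= (u + 1)->nsecs)
		u++;
	return u;
}

static void pr_time_units(uint64_t ns)
{
	const struct time_unit *u = pick_time_units(ns);

	printf("%" PRIu64 " %s\n", ns / u->nsecs, u->name);
}

int main(void)
{
	pr_time_units(12345);		/* prints "12 us" */
	pr_time_units(7200000000ULL);	/* prints "7 s" */
	return 0;
}
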
diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h
index 2b19a0038045..4dcd28456e00 100644
--- a/fs/bcachefs/util.h
+++ b/fs/bcachefs/util.h
@@ -398,7 +398,7 @@ static inline void bch2_time_stats_update(struct time_stats *stats, u64 start)
__bch2_time_stats_update(stats, start, local_clock());
}
-size_t bch2_time_stats_print(struct time_stats *, char *, size_t);
+void bch2_time_stats_to_text(struct printbuf *, struct time_stats *);
void bch2_time_stats_exit(struct time_stats *);
void bch2_time_stats_init(struct time_stats *);
@@ -664,35 +664,6 @@ static inline void memset_u64s_tail(void *s, int c, unsigned bytes)
memset(s + bytes, c, rem);
}
-static inline struct bio_vec next_contig_bvec(struct bio *bio,
- struct bvec_iter *iter)
-{
- struct bio_vec bv = bio_iter_iovec(bio, *iter);
-
- bio_advance_iter(bio, iter, bv.bv_len);
-#ifndef CONFIG_HIGHMEM
- while (iter->bi_size) {
- struct bio_vec next = bio_iter_iovec(bio, *iter);
-
- if (page_address(bv.bv_page) + bv.bv_offset + bv.bv_len !=
- page_address(next.bv_page) + next.bv_offset)
- break;
-
- bv.bv_len += next.bv_len;
- bio_advance_iter(bio, iter, next.bv_len);
- }
-#endif
- return bv;
-}
-
-#define __bio_for_each_contig_segment(bv, bio, iter, start) \
- for (iter = (start); \
- (iter).bi_size && \
- ((bv = next_contig_bvec((bio), &(iter))), 1);)
-
-#define bio_for_each_contig_segment(bv, bio, iter) \
- __bio_for_each_contig_segment(bv, bio, iter, (bio)->bi_iter)
-
void sort_cmp_size(void *base, size_t num, size_t size,
int (*cmp_func)(const void *, const void *, size_t),
void (*swap_func)(void *, void *, size_t));
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 745b2d0dcf78..09887c0f9a03 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1013,6 +1013,7 @@ extern void blk_execute_rq_nowait(struct request_queue *, struct gendisk *,
int blk_status_to_errno(blk_status_t status);
blk_status_t errno_to_blk_status(int errno);
+const char *blk_status_to_str(blk_status_t status);
bool blk_poll(struct request_queue *q, blk_qc_t cookie);
diff --git a/include/trace/events/bcachefs.h b/include/trace/events/bcachefs.h
index bafbccafae30..9b4e8295ed75 100644
--- a/include/trace/events/bcachefs.h
+++ b/include/trace/events/bcachefs.h
@@ -470,10 +470,10 @@ TRACE_EVENT(move_data,
);
TRACE_EVENT(copygc,
- TP_PROTO(struct bch_dev *ca,
+ TP_PROTO(struct bch_fs *c,
u64 sectors_moved, u64 sectors_not_moved,
u64 buckets_moved, u64 buckets_not_moved),
- TP_ARGS(ca,
+ TP_ARGS(c,
sectors_moved, sectors_not_moved,
buckets_moved, buckets_not_moved),
@@ -486,7 +486,7 @@ TRACE_EVENT(copygc,
),
TP_fast_assign(
- memcpy(__entry->uuid, ca->uuid.b, 16);
+ memcpy(__entry->uuid, c->sb.user_uuid.b, 16);
__entry->sectors_moved = sectors_moved;
__entry->sectors_not_moved = sectors_not_moved;
__entry->buckets_moved = buckets_moved;
diff --git a/kernel/locking/six.c b/kernel/locking/six.c
index 3acee748e052..49d46ed2e18e 100644
--- a/kernel/locking/six.c
+++ b/kernel/locking/six.c
@@ -15,7 +15,7 @@
#endif
#define six_acquire(l, t) lock_acquire(l, 0, t, 0, 0, NULL, _RET_IP_)
-#define six_release(l) lock_release(l, 0, _RET_IP_)
+#define six_release(l) lock_release(l, _RET_IP_)
struct six_lock_vals {
/* Value we add to the lock in order to take the lock: */