author     Kent Overstreet <kent.overstreet@gmail.com>  2021-04-04 22:34:05 -0400
committer  Kent Overstreet <kent.overstreet@gmail.com>  2021-04-04 22:34:54 -0400
commit     2625a7f0726213283750cd65043363f679fda9e8 (patch)
tree       69f1d080efbbceb74b329d7285f581a438df8f05
parent     bd6be71b2d23f021dccdba0103826de7632c8e63 (diff)
Merge with 8eb434efa5 bcachefs: Use x-macros for compat feature bits
Signed-off-by: Kent Overstreet <kent.overstreet@gmail.com>
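
The commit title refers to the x-macro pattern this merge applies to the superblock compat bits (see the BCH_SB_COMPAT() hunk in bcachefs_format.h below): a single x() list of (name, bit) pairs is written once and expanded wherever a matching enum or table is needed. A minimal standalone sketch of that pattern, assuming nothing beyond the list itself; the string table at the end is illustrative and not part of this commit:

#define BCH_SB_COMPAT()					\
	x(alloc_info,				0)	\
	x(alloc_metadata,			1)	\
	x(extents_above_btree_updates_done,	2)	\
	x(bformat_overflow_done,		3)

/* First expansion: generate the enum, as the patch below does */
enum bch_sb_compat {
#define x(f, n)	BCH_COMPAT_##f,
	BCH_SB_COMPAT()
#undef x
	BCH_COMPAT_NR,
};

/* Second (hypothetical) expansion: a matching name table for printing */
static const char * const bch_sb_compat_strs[] = {
#define x(f, n)	#f,
	BCH_SB_COMPAT()
#undef x
	NULL
};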
-rw-r--r--  fs/bcachefs/alloc_background.c | 621
-rw-r--r--  fs/bcachefs/alloc_background.h | 49
-rw-r--r--  fs/bcachefs/alloc_foreground.c | 100
-rw-r--r--  fs/bcachefs/alloc_types.h | 37
-rw-r--r--  fs/bcachefs/bcachefs.h | 29
-rw-r--r--  fs/bcachefs/bcachefs_format.h | 126
-rw-r--r--  fs/bcachefs/bcachefs_ioctl.h | 20
-rw-r--r--  fs/bcachefs/bkey.c | 7
-rw-r--r--  fs/bcachefs/bkey.h | 6
-rw-r--r--  fs/bcachefs/bkey_buf.h | 60
-rw-r--r--  fs/bcachefs/bkey_on_stack.h | 43
-rw-r--r--  fs/bcachefs/bkey_sort.c | 18
-rw-r--r--  fs/bcachefs/btree_cache.c | 84
-rw-r--r--  fs/bcachefs/btree_cache.h | 4
-rw-r--r--  fs/bcachefs/btree_gc.c | 591
-rw-r--r--  fs/bcachefs/btree_gc.h | 3
-rw-r--r--  fs/bcachefs/btree_io.c | 148
-rw-r--r--  fs/bcachefs/btree_io.h | 6
-rw-r--r--  fs/bcachefs/btree_iter.c | 421
-rw-r--r--  fs/bcachefs/btree_iter.h | 1
-rw-r--r--  fs/bcachefs/btree_key_cache.c | 6
-rw-r--r--  fs/bcachefs/btree_key_cache.h | 3
-rw-r--r--  fs/bcachefs/btree_types.h | 5
-rw-r--r--  fs/bcachefs/btree_update.h | 2
-rw-r--r--  fs/bcachefs/btree_update_interior.c | 37
-rw-r--r--  fs/bcachefs/btree_update_leaf.c | 112
-rw-r--r--  fs/bcachefs/buckets.c | 675
-rw-r--r--  fs/bcachefs/buckets.h | 42
-rw-r--r--  fs/bcachefs/buckets_types.h | 18
-rw-r--r--  fs/bcachefs/chardev.c | 6
-rw-r--r--  fs/bcachefs/checksum.c | 31
-rw-r--r--  fs/bcachefs/checksum.h | 6
-rw-r--r--  fs/bcachefs/clock.c | 8
-rw-r--r--  fs/bcachefs/clock_types.h | 2
-rw-r--r--  fs/bcachefs/compress.c | 15
-rw-r--r--  fs/bcachefs/debug.c | 2
-rw-r--r--  fs/bcachefs/ec.c | 834
-rw-r--r--  fs/bcachefs/ec.h | 72
-rw-r--r--  fs/bcachefs/ec_types.h | 9
-rw-r--r--  fs/bcachefs/extent_update.c | 45
-rw-r--r--  fs/bcachefs/extents.c | 62
-rw-r--r--  fs/bcachefs/extents.h | 5
-rw-r--r--  fs/bcachefs/fs-io.c | 442
-rw-r--r--  fs/bcachefs/fs-io.h | 7
-rw-r--r--  fs/bcachefs/fs.c | 43
-rw-r--r--  fs/bcachefs/fsck.c | 42
-rw-r--r--  fs/bcachefs/inode.c | 21
-rw-r--r--  fs/bcachefs/inode.h | 2
-rw-r--r--  fs/bcachefs/io.c | 105
-rw-r--r--  fs/bcachefs/io.h | 8
-rw-r--r--  fs/bcachefs/journal.c | 49
-rw-r--r--  fs/bcachefs/journal.h | 7
-rw-r--r--  fs/bcachefs/journal_io.c | 399
-rw-r--r--  fs/bcachefs/journal_io.h | 4
-rw-r--r--  fs/bcachefs/journal_reclaim.c | 19
-rw-r--r--  fs/bcachefs/journal_reclaim.h | 7
-rw-r--r--  fs/bcachefs/journal_types.h | 3
-rw-r--r--  fs/bcachefs/migrate.c | 20
-rw-r--r--  fs/bcachefs/move.c | 241
-rw-r--r--  fs/bcachefs/move.h | 8
-rw-r--r--  fs/bcachefs/movinggc.c | 32
-rw-r--r--  fs/bcachefs/opts.c | 7
-rw-r--r--  fs/bcachefs/opts.h | 11
-rw-r--r--  fs/bcachefs/rebalance.c | 20
-rw-r--r--  fs/bcachefs/rebalance_types.h | 2
-rw-r--r--  fs/bcachefs/recovery.c | 505
-rw-r--r--  fs/bcachefs/recovery.h | 17
-rw-r--r--  fs/bcachefs/reflink.c | 21
-rw-r--r--  fs/bcachefs/replicas.c | 142
-rw-r--r--  fs/bcachefs/replicas.h | 15
-rw-r--r--  fs/bcachefs/super-io.c | 104
-rw-r--r--  fs/bcachefs/super-io.h | 5
-rw-r--r--  fs/bcachefs/super.c | 138
-rw-r--r--  fs/bcachefs/super.h | 1
-rw-r--r--  fs/bcachefs/super_types.h | 2
-rw-r--r--  fs/bcachefs/sysfs.c | 108
-rw-r--r--  fs/bcachefs/util.h | 36
-rw-r--r--  kernel/locking/six.c | 2
78 files changed, 3804 insertions, 3162 deletions
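
The largest functional change in the diff below is the new KEY_TYPE_alloc_v2 packing in alloc_background.c: each field from BCH_ALLOC_FIELDS_V2() is varint-encoded in order, a zero field still costs one byte, and the stored value is truncated after the last nonzero field. A minimal sketch of that truncation strategy, using a toy LEB128-style encoder as a stand-in for the kernel's bch2_varint_encode() (the struct and field subset here are illustrative only):

#include <stddef.h>
#include <stdint.h>

/* Toy LEB128-style varint (a stand-in, not bcachefs's on-disk varint
 * format); returns the number of bytes written. */
static size_t varint_encode(uint8_t *out, uint64_t v)
{
	size_t n = 0;

	do {
		out[n++] = (v & 0x7f) | (v > 0x7f ? 0x80 : 0);
		v >>= 7;
	} while (v);

	return n;
}

/* Illustrative subset of the v2 alloc fields */
struct alloc_fields {
	uint64_t read_time, write_time, dirty_sectors, cached_sectors;
};

static size_t pack_alloc_v2(uint8_t *out, const struct alloc_fields *src)
{
	const uint64_t fields[] = {
		src->read_time, src->write_time,
		src->dirty_sectors, src->cached_sectors,
	};
	uint8_t *p = out, *last_nonzero = out;

	for (size_t i = 0; i < sizeof(fields) / sizeof(fields[0]); i++) {
		if (fields[i]) {
			p += varint_encode(p, fields[i]);
			last_nonzero = p;	/* end of last nonzero field */
		} else {
			*p++ = 0;		/* a zero field takes one byte... */
		}
	}

	/* ...but trailing zero fields are dropped from the stored size,
	 * mirroring how bch2_alloc_pack_v2() rewinds to last_nonzero_field. */
	return last_nonzero - out;
}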
diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c
index 09a7f8c8583a..8e9cbc0625f7 100644
--- a/fs/bcachefs/alloc_background.c
+++ b/fs/bcachefs/alloc_background.c
@@ -14,6 +14,7 @@
#include "ec.h"
#include "error.h"
#include "recovery.h"
+#include "varint.h"
#include <linux/kthread.h>
#include <linux/math64.h>
@@ -24,15 +25,12 @@
#include <linux/sort.h>
#include <trace/events/bcachefs.h>
-static const char * const bch2_alloc_field_names[] = {
-#define x(name, bytes) #name,
- BCH_ALLOC_FIELDS()
+static const unsigned BCH_ALLOC_V1_FIELD_BYTES[] = {
+#define x(name, bits) [BCH_ALLOC_FIELD_V1_##name] = bits / 8,
+ BCH_ALLOC_FIELDS_V1()
#undef x
- NULL
};
-static void bch2_recalc_oldest_io(struct bch_fs *, struct bch_dev *, int);
-
/* Ratelimiting/PD controllers */
static void pd_controllers_update(struct work_struct *work)
@@ -54,10 +52,10 @@ static void pd_controllers_update(struct work_struct *work)
* reclaimed by copy GC
*/
fragmented += max_t(s64, 0, (bucket_to_sector(ca,
- stats.buckets[BCH_DATA_user] +
- stats.buckets[BCH_DATA_cached]) -
- (stats.sectors[BCH_DATA_user] +
- stats.sectors[BCH_DATA_cached])) << 9);
+ stats.d[BCH_DATA_user].buckets +
+ stats.d[BCH_DATA_cached].buckets) -
+ (stats.d[BCH_DATA_user].sectors +
+ stats.d[BCH_DATA_cached].sectors)) << 9);
}
bch2_pd_controller_update(&c->copygc_pd, free, fragmented, -1);
@@ -67,10 +65,10 @@ static void pd_controllers_update(struct work_struct *work)
/* Persistent alloc info: */
-static inline u64 get_alloc_field(const struct bch_alloc *a,
- const void **p, unsigned field)
+static inline u64 alloc_field_v1_get(const struct bch_alloc *a,
+ const void **p, unsigned field)
{
- unsigned bytes = BCH_ALLOC_FIELD_BYTES[field];
+ unsigned bytes = BCH_ALLOC_V1_FIELD_BYTES[field];
u64 v;
if (!(a->fields & (1 << field)))
@@ -97,10 +95,10 @@ static inline u64 get_alloc_field(const struct bch_alloc *a,
return v;
}
-static inline void put_alloc_field(struct bkey_i_alloc *a, void **p,
- unsigned field, u64 v)
+static inline void alloc_field_v1_put(struct bkey_i_alloc *a, void **p,
+ unsigned field, u64 v)
{
- unsigned bytes = BCH_ALLOC_FIELD_BYTES[field];
+ unsigned bytes = BCH_ALLOC_V1_FIELD_BYTES[field];
if (!v)
return;
@@ -127,55 +125,149 @@ static inline void put_alloc_field(struct bkey_i_alloc *a, void **p,
*p += bytes;
}
-struct bkey_alloc_unpacked bch2_alloc_unpack(struct bkey_s_c k)
+static void bch2_alloc_unpack_v1(struct bkey_alloc_unpacked *out,
+ struct bkey_s_c k)
{
- struct bkey_alloc_unpacked ret = { .gen = 0 };
+ const struct bch_alloc *in = bkey_s_c_to_alloc(k).v;
+ const void *d = in->data;
+ unsigned idx = 0;
- if (k.k->type == KEY_TYPE_alloc) {
- const struct bch_alloc *a = bkey_s_c_to_alloc(k).v;
- const void *d = a->data;
- unsigned idx = 0;
+ out->gen = in->gen;
- ret.gen = a->gen;
+#define x(_name, _bits) out->_name = alloc_field_v1_get(in, &d, idx++);
+ BCH_ALLOC_FIELDS_V1()
+#undef x
+}
+
+static void bch2_alloc_pack_v1(struct bkey_alloc_buf *dst,
+ const struct bkey_alloc_unpacked src)
+{
+ struct bkey_i_alloc *a = bkey_alloc_init(&dst->k);
+ void *d = a->v.data;
+ unsigned bytes, idx = 0;
-#define x(_name, _bits) ret._name = get_alloc_field(a, &d, idx++);
- BCH_ALLOC_FIELDS()
+ a->k.p = POS(src.dev, src.bucket);
+ a->v.fields = 0;
+ a->v.gen = src.gen;
+
+#define x(_name, _bits) alloc_field_v1_put(a, &d, idx++, src._name);
+ BCH_ALLOC_FIELDS_V1()
#undef x
- }
- return ret;
+ bytes = (void *) d - (void *) &a->v;
+ set_bkey_val_bytes(&a->k, bytes);
+ memset_u64s_tail(&a->v, 0, bytes);
}
-void bch2_alloc_pack(struct bkey_i_alloc *dst,
- const struct bkey_alloc_unpacked src)
+static int bch2_alloc_unpack_v2(struct bkey_alloc_unpacked *out,
+ struct bkey_s_c k)
{
- unsigned idx = 0;
- void *d = dst->v.data;
+ struct bkey_s_c_alloc_v2 a = bkey_s_c_to_alloc_v2(k);
+ const u8 *in = a.v->data;
+ const u8 *end = bkey_val_end(a);
+ unsigned fieldnr = 0;
+ int ret;
+ u64 v;
+
+ out->gen = a.v->gen;
+ out->oldest_gen = a.v->oldest_gen;
+ out->data_type = a.v->data_type;
+
+#define x(_name, _bits) \
+ if (fieldnr < a.v->nr_fields) { \
+ ret = bch2_varint_decode(in, end, &v); \
+ if (ret < 0) \
+ return ret; \
+ in += ret; \
+ } else { \
+ v = 0; \
+ } \
+ out->_name = v; \
+ if (v != out->_name) \
+ return -1; \
+ fieldnr++;
+
+ BCH_ALLOC_FIELDS_V2()
+#undef x
+ return 0;
+}
+
+static void bch2_alloc_pack_v2(struct bkey_alloc_buf *dst,
+ const struct bkey_alloc_unpacked src)
+{
+ struct bkey_i_alloc_v2 *a = bkey_alloc_v2_init(&dst->k);
+ unsigned nr_fields = 0, last_nonzero_fieldnr = 0;
+ u8 *out = a->v.data;
+ u8 *end = (void *) &dst[1];
+ u8 *last_nonzero_field = out;
unsigned bytes;
- dst->v.fields = 0;
- dst->v.gen = src.gen;
+ a->k.p = POS(src.dev, src.bucket);
+ a->v.gen = src.gen;
+ a->v.oldest_gen = src.oldest_gen;
+ a->v.data_type = src.data_type;
+
+#define x(_name, _bits) \
+ nr_fields++; \
+ \
+ if (src._name) { \
+ out += bch2_varint_encode(out, src._name); \
+ \
+ last_nonzero_field = out; \
+ last_nonzero_fieldnr = nr_fields; \
+ } else { \
+ *out++ = 0; \
+ }
-#define x(_name, _bits) put_alloc_field(dst, &d, idx++, src._name);
- BCH_ALLOC_FIELDS()
+ BCH_ALLOC_FIELDS_V2()
#undef x
+ BUG_ON(out > end);
+
+ out = last_nonzero_field;
+ a->v.nr_fields = last_nonzero_fieldnr;
+
+ bytes = (u8 *) out - (u8 *) &a->v;
+ set_bkey_val_bytes(&a->k, bytes);
+ memset_u64s_tail(&a->v, 0, bytes);
+}
+
+struct bkey_alloc_unpacked bch2_alloc_unpack(struct bkey_s_c k)
+{
+ struct bkey_alloc_unpacked ret = {
+ .dev = k.k->p.inode,
+ .bucket = k.k->p.offset,
+ .gen = 0,
+ };
+
+ if (k.k->type == KEY_TYPE_alloc_v2)
+ bch2_alloc_unpack_v2(&ret, k);
+ else if (k.k->type == KEY_TYPE_alloc)
+ bch2_alloc_unpack_v1(&ret, k);
- bytes = (void *) d - (void *) &dst->v;
- set_bkey_val_bytes(&dst->k, bytes);
- memset_u64s_tail(&dst->v, 0, bytes);
+ return ret;
+}
+
+void bch2_alloc_pack(struct bch_fs *c,
+ struct bkey_alloc_buf *dst,
+ const struct bkey_alloc_unpacked src)
+{
+ if (c->sb.features & (1ULL << BCH_FEATURE_alloc_v2))
+ bch2_alloc_pack_v2(dst, src);
+ else
+ bch2_alloc_pack_v1(dst, src);
}
static unsigned bch_alloc_val_u64s(const struct bch_alloc *a)
{
unsigned i, bytes = offsetof(struct bch_alloc, data);
- for (i = 0; i < ARRAY_SIZE(BCH_ALLOC_FIELD_BYTES); i++)
+ for (i = 0; i < ARRAY_SIZE(BCH_ALLOC_V1_FIELD_BYTES); i++)
if (a->fields & (1 << i))
- bytes += BCH_ALLOC_FIELD_BYTES[i];
+ bytes += BCH_ALLOC_V1_FIELD_BYTES[i];
return DIV_ROUND_UP(bytes, sizeof(u64));
}
-const char *bch2_alloc_invalid(const struct bch_fs *c, struct bkey_s_c k)
+const char *bch2_alloc_v1_invalid(const struct bch_fs *c, struct bkey_s_c k)
{
struct bkey_s_c_alloc a = bkey_s_c_to_alloc(k);
@@ -190,20 +282,30 @@ const char *bch2_alloc_invalid(const struct bch_fs *c, struct bkey_s_c k)
return NULL;
}
-void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c,
- struct bkey_s_c k)
+const char *bch2_alloc_v2_invalid(const struct bch_fs *c, struct bkey_s_c k)
{
- struct bkey_s_c_alloc a = bkey_s_c_to_alloc(k);
- const void *d = a.v->data;
- unsigned i;
+ struct bkey_alloc_unpacked u;
+
+ if (k.k->p.inode >= c->sb.nr_devices ||
+ !c->devs[k.k->p.inode])
+ return "invalid device";
- pr_buf(out, "gen %u", a.v->gen);
+ if (bch2_alloc_unpack_v2(&u, k))
+ return "unpack error";
- for (i = 0; i < BCH_ALLOC_FIELD_NR; i++)
- if (a.v->fields & (1 << i))
- pr_buf(out, " %s %llu",
- bch2_alloc_field_names[i],
- get_alloc_field(a.v, &d, i));
+ return NULL;
+}
+
+void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c,
+ struct bkey_s_c k)
+{
+ struct bkey_alloc_unpacked u = bch2_alloc_unpack(k);
+
+ pr_buf(out, "gen %u oldest_gen %u data_type %u",
+ u.gen, u.oldest_gen, u.data_type);
+#define x(_name, ...) pr_buf(out, #_name " %llu ", (u64) u._name);
+ BCH_ALLOC_FIELDS_V2()
+#undef x
}
static int bch2_alloc_read_fn(struct bch_fs *c, enum btree_id id,
@@ -213,11 +315,13 @@ static int bch2_alloc_read_fn(struct bch_fs *c, enum btree_id id,
struct bucket *g;
struct bkey_alloc_unpacked u;
- if (level || k.k->type != KEY_TYPE_alloc)
+ if (level ||
+ (k.k->type != KEY_TYPE_alloc &&
+ k.k->type != KEY_TYPE_alloc_v2))
return 0;
ca = bch_dev_bkey_exists(c, k.k->p.inode);
- g = __bucket(ca, k.k->p.offset, 0);
+ g = bucket(ca, k.k->p.offset);
u = bch2_alloc_unpack(k);
g->_mark.gen = u.gen;
@@ -234,9 +338,7 @@ static int bch2_alloc_read_fn(struct bch_fs *c, enum btree_id id,
int bch2_alloc_read(struct bch_fs *c, struct journal_keys *journal_keys)
{
- struct bch_dev *ca;
- unsigned i;
- int ret = 0;
+ int ret;
down_read(&c->gc_lock);
ret = bch2_btree_and_journal_walk(c, journal_keys, BTREE_ID_ALLOC,
@@ -248,26 +350,6 @@ int bch2_alloc_read(struct bch_fs *c, struct journal_keys *journal_keys)
return ret;
}
- percpu_down_write(&c->mark_lock);
- bch2_dev_usage_from_buckets(c);
- percpu_up_write(&c->mark_lock);
-
- mutex_lock(&c->bucket_clock[READ].lock);
- for_each_member_device(ca, c, i) {
- down_read(&ca->bucket_lock);
- bch2_recalc_oldest_io(c, ca, READ);
- up_read(&ca->bucket_lock);
- }
- mutex_unlock(&c->bucket_clock[READ].lock);
-
- mutex_lock(&c->bucket_clock[WRITE].lock);
- for_each_member_device(ca, c, i) {
- down_read(&ca->bucket_lock);
- bch2_recalc_oldest_io(c, ca, WRITE);
- up_read(&ca->bucket_lock);
- }
- mutex_unlock(&c->bucket_clock[WRITE].lock);
-
return 0;
}
@@ -278,12 +360,10 @@ static int bch2_alloc_write_key(struct btree_trans *trans,
struct bch_fs *c = trans->c;
struct bkey_s_c k;
struct bch_dev *ca;
- struct bucket_array *ba;
struct bucket *g;
struct bucket_mark m;
struct bkey_alloc_unpacked old_u, new_u;
- __BKEY_PADDED(k, 8) alloc_key; /* hack: */
- struct bkey_i_alloc *a;
+ struct bkey_alloc_buf a;
int ret;
retry:
bch2_trans_begin(trans);
@@ -302,193 +382,60 @@ retry:
percpu_down_read(&c->mark_lock);
ca = bch_dev_bkey_exists(c, iter->pos.inode);
- ba = bucket_array(ca);
-
- g = &ba->b[iter->pos.offset];
+ g = bucket(ca, iter->pos.offset);
m = READ_ONCE(g->mark);
- new_u = alloc_mem_to_key(g, m);
+ new_u = alloc_mem_to_key(iter, g, m);
percpu_up_read(&c->mark_lock);
if (!bkey_alloc_unpacked_cmp(old_u, new_u))
return 0;
- a = bkey_alloc_init(&alloc_key.k);
- a->k.p = iter->pos;
- bch2_alloc_pack(a, new_u);
-
- bch2_trans_update(trans, iter, &a->k_i,
+ bch2_alloc_pack(c, &a, new_u);
+ bch2_trans_update(trans, iter, &a.k,
BTREE_TRIGGER_NORUN);
ret = bch2_trans_commit(trans, NULL, NULL,
- BTREE_INSERT_NOFAIL|
- BTREE_INSERT_USE_RESERVE|
- flags);
+ BTREE_INSERT_NOFAIL|flags);
err:
if (ret == -EINTR)
goto retry;
return ret;
}
-int bch2_dev_alloc_write(struct bch_fs *c, struct bch_dev *ca, unsigned flags)
+int bch2_alloc_write(struct bch_fs *c, unsigned flags)
{
struct btree_trans trans;
struct btree_iter *iter;
- u64 first_bucket, nbuckets;
+ struct bch_dev *ca;
+ unsigned i;
int ret = 0;
- percpu_down_read(&c->mark_lock);
- first_bucket = bucket_array(ca)->first_bucket;
- nbuckets = bucket_array(ca)->nbuckets;
- percpu_up_read(&c->mark_lock);
-
- BUG_ON(BKEY_ALLOC_VAL_U64s_MAX > 8);
-
bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
- iter = bch2_trans_get_iter(&trans, BTREE_ID_ALLOC,
- POS(ca->dev_idx, first_bucket),
+ iter = bch2_trans_get_iter(&trans, BTREE_ID_ALLOC, POS_MIN,
BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
- while (iter->pos.offset < nbuckets) {
- bch2_trans_cond_resched(&trans);
-
- ret = bch2_alloc_write_key(&trans, iter, flags);
- if (ret)
- break;
- bch2_btree_iter_next_slot(iter);
- }
-
- bch2_trans_exit(&trans);
-
- return ret;
-}
+ for_each_member_device(ca, c, i) {
+ bch2_btree_iter_set_pos(iter,
+ POS(ca->dev_idx, ca->mi.first_bucket));
-int bch2_alloc_write(struct bch_fs *c, unsigned flags)
-{
- struct bch_dev *ca;
- unsigned i;
- int ret = 0;
+ while (iter->pos.offset < ca->mi.nbuckets) {
+ bch2_trans_cond_resched(&trans);
- for_each_rw_member(ca, c, i) {
- bch2_dev_alloc_write(c, ca, flags);
- if (ret) {
- percpu_ref_put(&ca->io_ref);
- break;
+ ret = bch2_alloc_write_key(&trans, iter, flags);
+ if (ret) {
+ percpu_ref_put(&ca->io_ref);
+ goto err;
+ }
+ bch2_btree_iter_next_slot(iter);
}
}
-
+err:
+ bch2_trans_exit(&trans);
return ret;
}
/* Bucket IO clocks: */
-static void bch2_recalc_oldest_io(struct bch_fs *c, struct bch_dev *ca, int rw)
-{
- struct bucket_clock *clock = &c->bucket_clock[rw];
- struct bucket_array *buckets = bucket_array(ca);
- struct bucket *g;
- u16 max_last_io = 0;
- unsigned i;
-
- lockdep_assert_held(&c->bucket_clock[rw].lock);
-
- /* Recalculate max_last_io for this device: */
- for_each_bucket(g, buckets)
- max_last_io = max(max_last_io, bucket_last_io(c, g, rw));
-
- ca->max_last_bucket_io[rw] = max_last_io;
-
- /* Recalculate global max_last_io: */
- max_last_io = 0;
-
- for_each_member_device(ca, c, i)
- max_last_io = max(max_last_io, ca->max_last_bucket_io[rw]);
-
- clock->max_last_io = max_last_io;
-}
-
-static void bch2_rescale_bucket_io_times(struct bch_fs *c, int rw)
-{
- struct bucket_clock *clock = &c->bucket_clock[rw];
- struct bucket_array *buckets;
- struct bch_dev *ca;
- struct bucket *g;
- unsigned i;
-
- trace_rescale_prios(c);
-
- for_each_member_device(ca, c, i) {
- down_read(&ca->bucket_lock);
- buckets = bucket_array(ca);
-
- for_each_bucket(g, buckets)
- g->io_time[rw] = clock->hand -
- bucket_last_io(c, g, rw) / 2;
-
- bch2_recalc_oldest_io(c, ca, rw);
-
- up_read(&ca->bucket_lock);
- }
-}
-
-static inline u64 bucket_clock_freq(u64 capacity)
-{
- return max(capacity >> 10, 2028ULL);
-}
-
-static void bch2_inc_clock_hand(struct io_timer *timer)
-{
- struct bucket_clock *clock = container_of(timer,
- struct bucket_clock, rescale);
- struct bch_fs *c = container_of(clock,
- struct bch_fs, bucket_clock[clock->rw]);
- struct bch_dev *ca;
- u64 capacity;
- unsigned i;
-
- mutex_lock(&clock->lock);
-
- /* if clock cannot be advanced more, rescale prio */
- if (clock->max_last_io >= U16_MAX - 2)
- bch2_rescale_bucket_io_times(c, clock->rw);
-
- BUG_ON(clock->max_last_io >= U16_MAX - 2);
-
- for_each_member_device(ca, c, i)
- ca->max_last_bucket_io[clock->rw]++;
- clock->max_last_io++;
- clock->hand++;
-
- mutex_unlock(&clock->lock);
-
- capacity = READ_ONCE(c->capacity);
-
- if (!capacity)
- return;
-
- /*
- * we only increment when 0.1% of the filesystem capacity has been read
- * or written too, this determines if it's time
- *
- * XXX: we shouldn't really be going off of the capacity of devices in
- * RW mode (that will be 0 when we're RO, yet we can still service
- * reads)
- */
- timer->expire += bucket_clock_freq(capacity);
-
- bch2_io_timer_add(&c->io_clock[clock->rw], timer);
-}
-
-static void bch2_bucket_clock_init(struct bch_fs *c, int rw)
-{
- struct bucket_clock *clock = &c->bucket_clock[rw];
-
- clock->hand = 1;
- clock->rw = rw;
- clock->rescale.fn = bch2_inc_clock_hand;
- clock->rescale.expire = bucket_clock_freq(c->capacity);
- mutex_init(&clock->lock);
-}
-
int bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev,
size_t bucket_nr, int rw)
{
@@ -496,37 +443,38 @@ int bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev,
struct bch_dev *ca = bch_dev_bkey_exists(c, dev);
struct btree_iter *iter;
struct bucket *g;
- struct bkey_i_alloc *a;
+ struct bkey_alloc_buf *a;
struct bkey_alloc_unpacked u;
- u16 *time;
+ u64 *time, now;
int ret = 0;
iter = bch2_trans_get_iter(trans, BTREE_ID_ALLOC, POS(dev, bucket_nr),
BTREE_ITER_CACHED|
BTREE_ITER_CACHED_NOFILL|
BTREE_ITER_INTENT);
- a = bch2_trans_kmalloc(trans, BKEY_ALLOC_U64s_MAX * 8);
+ ret = bch2_btree_iter_traverse(iter);
+ if (ret)
+ goto out;
+
+ a = bch2_trans_kmalloc(trans, sizeof(struct bkey_alloc_buf));
ret = PTR_ERR_OR_ZERO(a);
if (ret)
goto out;
percpu_down_read(&c->mark_lock);
g = bucket(ca, bucket_nr);
- u = alloc_mem_to_key(g, READ_ONCE(g->mark));
+ u = alloc_mem_to_key(iter, g, READ_ONCE(g->mark));
percpu_up_read(&c->mark_lock);
- bkey_alloc_init(&a->k_i);
- a->k.p = iter->pos;
-
time = rw == READ ? &u.read_time : &u.write_time;
- if (*time == c->bucket_clock[rw].hand)
+ now = atomic64_read(&c->io_clock[rw].now);
+ if (*time == now)
goto out;
- *time = c->bucket_clock[rw].hand;
+ *time = now;
- bch2_alloc_pack(a, u);
-
- ret = bch2_trans_update(trans, iter, &a->k_i, 0) ?:
+ bch2_alloc_pack(c, a, u);
+ ret = bch2_trans_update(trans, iter, &a->k, 0) ?:
bch2_trans_commit(trans, NULL, NULL, 0);
out:
bch2_trans_iter_put(trans, iter);
@@ -550,7 +498,8 @@ out:
static int wait_buckets_available(struct bch_fs *c, struct bch_dev *ca)
{
unsigned long gc_count = c->gc_count;
- u64 available;
+ s64 available;
+ unsigned i;
int ret = 0;
ca->allocator_state = ALLOCATOR_BLOCKED;
@@ -566,13 +515,19 @@ static int wait_buckets_available(struct bch_fs *c, struct bch_dev *ca)
if (gc_count != c->gc_count)
ca->inc_gen_really_needs_gc = 0;
- available = max_t(s64, 0, dev_buckets_available(ca) -
- ca->inc_gen_really_needs_gc);
+ available = dev_buckets_available(ca);
+ available -= ca->inc_gen_really_needs_gc;
+
+ spin_lock(&c->freelist_lock);
+ for (i = 0; i < RESERVE_NR; i++)
+ available -= fifo_used(&ca->free[i]);
+ spin_unlock(&c->freelist_lock);
+
+ available = max(available, 0LL);
if (available > fifo_free(&ca->free_inc) ||
(available &&
- (!fifo_full(&ca->free[RESERVE_BTREE]) ||
- !fifo_full(&ca->free[RESERVE_MOVINGGC]))))
+ !fifo_full(&ca->free[RESERVE_MOVINGGC])))
break;
up_read(&c->gc_lock);
@@ -588,20 +543,22 @@ static int wait_buckets_available(struct bch_fs *c, struct bch_dev *ca)
return ret;
}
-static bool bch2_can_invalidate_bucket(struct bch_dev *ca,
- size_t bucket,
- struct bucket_mark mark)
+static bool bch2_can_invalidate_bucket(struct bch_dev *ca, size_t b,
+ struct bucket_mark m)
{
u8 gc_gen;
- if (!is_available_bucket(mark))
+ if (!is_available_bucket(m))
+ return false;
+
+ if (m.owned_by_allocator)
return false;
if (ca->buckets_nouse &&
- test_bit(bucket, ca->buckets_nouse))
+ test_bit(b, ca->buckets_nouse))
return false;
- gc_gen = bucket_gc_gen(ca, bucket);
+ gc_gen = bucket_gc_gen(bucket(ca, b));
if (gc_gen >= BUCKET_GC_GEN_MAX / 2)
ca->inc_gen_needs_gc++;
@@ -615,43 +572,33 @@ static bool bch2_can_invalidate_bucket(struct bch_dev *ca,
/*
* Determines what order we're going to reuse buckets, smallest bucket_key()
* first.
- *
- *
- * - We take into account the read prio of the bucket, which gives us an
- * indication of how hot the data is -- we scale the prio so that the prio
- * farthest from the clock is worth 1/8th of the closest.
- *
- * - The number of sectors of cached data in the bucket, which gives us an
- * indication of the cost in cache misses this eviction will cause.
- *
- * - If hotness * sectors used compares equal, we pick the bucket with the
- * smallest bucket_gc_gen() - since incrementing the same bucket's generation
- * number repeatedly forces us to run mark and sweep gc to avoid generation
- * number wraparound.
*/
-static unsigned long bucket_sort_key(struct bch_fs *c, struct bch_dev *ca,
- size_t b, struct bucket_mark m)
+static unsigned bucket_sort_key(struct bucket *g, struct bucket_mark m,
+ u64 now, u64 last_seq_ondisk)
{
- unsigned last_io = bucket_last_io(c, bucket(ca, b), READ);
- unsigned max_last_io = ca->max_last_bucket_io[READ];
+ unsigned used = bucket_sectors_used(m);
- /*
- * Time since last read, scaled to [0, 8) where larger value indicates
- * more recently read data:
- */
- unsigned long hotness = (max_last_io - last_io) * 7 / max_last_io;
-
- /* How much we want to keep the data in this bucket: */
- unsigned long data_wantness =
- (hotness + 1) * bucket_sectors_used(m);
-
- unsigned long needs_journal_commit =
- bucket_needs_journal_commit(m, c->journal.last_seq_ondisk);
+ if (used) {
+ /*
+ * Prefer to keep buckets that have been read more recently, and
+ * buckets that have more data in them:
+ */
+ u64 last_read = max_t(s64, 0, now - g->io_time[READ]);
+ u32 last_read_scaled = max_t(u64, U32_MAX, div_u64(last_read, used));
- return (data_wantness << 9) |
- (needs_journal_commit << 8) |
- (bucket_gc_gen(ca, b) / 16);
+ return -last_read_scaled;
+ } else {
+ /*
+ * Prefer to use buckets with smaller gc_gen so that we don't
+ * have to walk the btree and recalculate oldest_gen - but shift
+ * off the low bits so that buckets will still have equal sort
+ * keys when there's only a small difference, so that we can
+ * keep sequential buckets together:
+ */
+ return (bucket_needs_journal_commit(m, last_seq_ondisk) << 4)|
+ (bucket_gc_gen(g) >> 4);
+ }
}
static inline int bucket_alloc_cmp(alloc_heap *h,
@@ -674,16 +621,15 @@ static void find_reclaimable_buckets_lru(struct bch_fs *c, struct bch_dev *ca)
{
struct bucket_array *buckets;
struct alloc_heap_entry e = { 0 };
+ u64 now, last_seq_ondisk;
size_t b, i, nr = 0;
- ca->alloc_heap.used = 0;
-
- mutex_lock(&c->bucket_clock[READ].lock);
down_read(&ca->bucket_lock);
buckets = bucket_array(ca);
-
- bch2_recalc_oldest_io(c, ca, READ);
+ ca->alloc_heap.used = 0;
+ now = atomic64_read(&c->io_clock[READ].now);
+ last_seq_ondisk = c->journal.last_seq_ondisk;
/*
* Find buckets with lowest read priority, by building a maxheap sorted
@@ -691,8 +637,9 @@ static void find_reclaimable_buckets_lru(struct bch_fs *c, struct bch_dev *ca)
* all buckets have been visited.
*/
for (b = ca->mi.first_bucket; b < ca->mi.nbuckets; b++) {
- struct bucket_mark m = READ_ONCE(buckets->b[b].mark);
- unsigned long key = bucket_sort_key(c, ca, b, m);
+ struct bucket *g = &buckets->b[b];
+ struct bucket_mark m = READ_ONCE(g->mark);
+ unsigned key = bucket_sort_key(g, m, now, last_seq_ondisk);
if (!bch2_can_invalidate_bucket(ca, b, m))
continue;
@@ -727,7 +674,6 @@ static void find_reclaimable_buckets_lru(struct bch_fs *c, struct bch_dev *ca)
}
up_read(&ca->bucket_lock);
- mutex_unlock(&c->bucket_clock[READ].lock);
}
static void find_reclaimable_buckets_fifo(struct bch_fs *c, struct bch_dev *ca)
@@ -872,14 +818,8 @@ static int bch2_invalidate_one_bucket2(struct btree_trans *trans,
struct btree_iter *iter,
u64 *journal_seq, unsigned flags)
{
-#if 0
- __BKEY_PADDED(k, BKEY_ALLOC_VAL_U64s_MAX) alloc_key;
-#else
- /* hack: */
- __BKEY_PADDED(k, 8) alloc_key;
-#endif
struct bch_fs *c = trans->c;
- struct bkey_i_alloc *a;
+ struct bkey_alloc_buf a;
struct bkey_alloc_unpacked u;
struct bucket *g;
struct bucket_mark m;
@@ -893,34 +833,33 @@ static int bch2_invalidate_one_bucket2(struct btree_trans *trans,
/* first, put on free_inc and mark as owned by allocator: */
percpu_down_read(&c->mark_lock);
- spin_lock(&c->freelist_lock);
-
- verify_not_on_freelist(c, ca, b);
-
- BUG_ON(!fifo_push(&ca->free_inc, b));
-
g = bucket(ca, b);
m = READ_ONCE(g->mark);
- invalidating_cached_data = m.cached_sectors != 0;
+ BUG_ON(m.dirty_sectors);
+
+ bch2_mark_alloc_bucket(c, ca, b, true, gc_pos_alloc(c, NULL), 0);
+
+ spin_lock(&c->freelist_lock);
+ verify_not_on_freelist(c, ca, b);
+ BUG_ON(!fifo_push(&ca->free_inc, b));
+ spin_unlock(&c->freelist_lock);
/*
* If we're not invalidating cached data, we only increment the bucket
* gen in memory here, the incremented gen will be updated in the btree
* by bch2_trans_mark_pointer():
*/
+ if (!m.cached_sectors &&
+ !bucket_needs_journal_commit(m, c->journal.last_seq_ondisk)) {
+ BUG_ON(m.data_type);
+ bucket_cmpxchg(g, m, m.gen++);
+ percpu_up_read(&c->mark_lock);
+ goto out;
+ }
- if (!invalidating_cached_data)
- bch2_invalidate_bucket(c, ca, b, &m);
- else
- bch2_mark_alloc_bucket(c, ca, b, true, gc_pos_alloc(c, NULL), 0);
-
- spin_unlock(&c->freelist_lock);
percpu_up_read(&c->mark_lock);
- if (!invalidating_cached_data)
- goto out;
-
/*
* If the read-only path is trying to shut down, we can't be generating
* new btree updates:
@@ -930,8 +869,6 @@ static int bch2_invalidate_one_bucket2(struct btree_trans *trans,
goto out;
}
- BUG_ON(BKEY_ALLOC_VAL_U64s_MAX > 8);
-
bch2_btree_iter_set_pos(iter, POS(ca->dev_idx, b));
retry:
ret = bch2_btree_iter_traverse(iter);
@@ -941,7 +878,7 @@ retry:
percpu_down_read(&c->mark_lock);
g = bucket(ca, iter->pos.offset);
m = READ_ONCE(g->mark);
- u = alloc_mem_to_key(g, m);
+ u = alloc_mem_to_key(iter, g, m);
percpu_up_read(&c->mark_lock);
@@ -951,14 +888,11 @@ retry:
u.data_type = 0;
u.dirty_sectors = 0;
u.cached_sectors = 0;
- u.read_time = c->bucket_clock[READ].hand;
- u.write_time = c->bucket_clock[WRITE].hand;
-
- a = bkey_alloc_init(&alloc_key.k);
- a->k.p = iter->pos;
- bch2_alloc_pack(a, u);
+ u.read_time = atomic64_read(&c->io_clock[READ].now);
+ u.write_time = atomic64_read(&c->io_clock[WRITE].now);
- bch2_trans_update(trans, iter, &a->k_i,
+ bch2_alloc_pack(c, &a, u);
+ bch2_trans_update(trans, iter, &a.k,
BTREE_TRIGGER_BUCKET_INVALIDATE);
/*
@@ -973,8 +907,7 @@ retry:
BTREE_INSERT_NOUNLOCK|
BTREE_INSERT_NOCHECK_RW|
BTREE_INSERT_NOFAIL|
- BTREE_INSERT_USE_RESERVE|
- BTREE_INSERT_USE_ALLOC_RESERVE|
+ BTREE_INSERT_JOURNAL_RESERVED|
flags);
if (ret == -EINTR)
goto retry;
@@ -1135,6 +1068,12 @@ static int discard_invalidated_buckets(struct bch_fs *c, struct bch_dev *ca)
return 0;
}
+static inline bool allocator_thread_running(struct bch_dev *ca)
+{
+ return ca->mi.state == BCH_MEMBER_STATE_RW &&
+ test_bit(BCH_FS_ALLOCATOR_RUNNING, &ca->fs->flags);
+}
+
/**
* bch_allocator_thread - move buckets from free_inc to reserves
*
@@ -1151,9 +1090,16 @@ static int bch2_allocator_thread(void *arg)
int ret;
set_freezable();
- ca->allocator_state = ALLOCATOR_RUNNING;
while (1) {
+ if (!allocator_thread_running(ca)) {
+ ca->allocator_state = ALLOCATOR_STOPPED;
+ if (kthread_wait_freezable(allocator_thread_running(ca)))
+ break;
+ }
+
+ ca->allocator_state = ALLOCATOR_RUNNING;
+
cond_resched();
if (kthread_should_stop())
break;
@@ -1454,8 +1400,11 @@ int bch2_dev_allocator_start(struct bch_dev *ca)
p = kthread_create(bch2_allocator_thread, ca,
"bch-alloc/%s", ca->name);
- if (IS_ERR(p))
+ if (IS_ERR(p)) {
+ bch_err(ca->fs, "error creating allocator thread: %li",
+ PTR_ERR(p));
return PTR_ERR(p);
+ }
get_task_struct(p);
rcu_assign_pointer(ca->alloc_thread, p);
@@ -1466,8 +1415,6 @@ int bch2_dev_allocator_start(struct bch_dev *ca)
void bch2_fs_allocator_background_init(struct bch_fs *c)
{
spin_lock_init(&c->freelist_lock);
- bch2_bucket_clock_init(c, READ);
- bch2_bucket_clock_init(c, WRITE);
c->pd_controllers_update_seconds = 5;
INIT_DELAYED_WORK(&c->pd_controllers_update, pd_controllers_update);
diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h
index d10ff56e4de1..6fededcd9f86 100644
--- a/fs/bcachefs/alloc_background.h
+++ b/fs/bcachefs/alloc_background.h
@@ -7,12 +7,33 @@
#include "debug.h"
struct bkey_alloc_unpacked {
+ u64 bucket;
+ u8 dev;
u8 gen;
+ u8 oldest_gen;
+ u8 data_type;
#define x(_name, _bits) u##_bits _name;
- BCH_ALLOC_FIELDS()
+ BCH_ALLOC_FIELDS_V2()
#undef x
};
+struct bkey_alloc_buf {
+ struct bkey_i k;
+
+ union {
+ struct {
+#define x(_name, _bits) + _bits / 8
+ u8 _pad[8 + BCH_ALLOC_FIELDS_V1()];
+#undef x
+ } _v1;
+ struct {
+#define x(_name, _bits) + 8 + _bits / 8
+ u8 _pad[8 + BCH_ALLOC_FIELDS_V2()];
+#undef x
+ } _v2;
+ };
+} __attribute__((packed, aligned(8)));
+
/* How out of date a pointer gen is allowed to be: */
#define BUCKET_GC_GEN_MAX 96U
@@ -20,23 +41,28 @@ struct bkey_alloc_unpacked {
static inline bool bkey_alloc_unpacked_cmp(struct bkey_alloc_unpacked l,
struct bkey_alloc_unpacked r)
{
- return l.gen != r.gen
-#define x(_name, _bits) || l._name != r._name
- BCH_ALLOC_FIELDS()
+ return l.gen != r.gen ||
+ l.oldest_gen != r.oldest_gen ||
+ l.data_type != r.data_type
+#define x(_name, ...) || l._name != r._name
+ BCH_ALLOC_FIELDS_V2()
#undef x
;
}
struct bkey_alloc_unpacked bch2_alloc_unpack(struct bkey_s_c);
-void bch2_alloc_pack(struct bkey_i_alloc *,
+void bch2_alloc_pack(struct bch_fs *, struct bkey_alloc_buf *,
const struct bkey_alloc_unpacked);
int bch2_bucket_io_time_reset(struct btree_trans *, unsigned, size_t, int);
static inline struct bkey_alloc_unpacked
-alloc_mem_to_key(struct bucket *g, struct bucket_mark m)
+alloc_mem_to_key(struct btree_iter *iter,
+ struct bucket *g, struct bucket_mark m)
{
return (struct bkey_alloc_unpacked) {
+ .dev = iter->pos.inode,
+ .bucket = iter->pos.offset,
.gen = m.gen,
.oldest_gen = g->oldest_gen,
.data_type = m.data_type,
@@ -49,11 +75,17 @@ alloc_mem_to_key(struct bucket *g, struct bucket_mark m)
#define ALLOC_SCAN_BATCH(ca) max_t(size_t, 1, (ca)->mi.nbuckets >> 9)
-const char *bch2_alloc_invalid(const struct bch_fs *, struct bkey_s_c);
+const char *bch2_alloc_v1_invalid(const struct bch_fs *, struct bkey_s_c);
+const char *bch2_alloc_v2_invalid(const struct bch_fs *, struct bkey_s_c);
void bch2_alloc_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
#define bch2_bkey_ops_alloc (struct bkey_ops) { \
- .key_invalid = bch2_alloc_invalid, \
+ .key_invalid = bch2_alloc_v1_invalid, \
+ .val_to_text = bch2_alloc_to_text, \
+}
+
+#define bch2_bkey_ops_alloc_v2 (struct bkey_ops) { \
+ .key_invalid = bch2_alloc_v2_invalid, \
.val_to_text = bch2_alloc_to_text, \
}
@@ -98,7 +130,6 @@ void bch2_dev_allocator_quiesce(struct bch_fs *, struct bch_dev *);
void bch2_dev_allocator_stop(struct bch_dev *);
int bch2_dev_allocator_start(struct bch_dev *);
-int bch2_dev_alloc_write(struct bch_fs *, struct bch_dev *, unsigned);
int bch2_alloc_write(struct bch_fs *, unsigned);
void bch2_fs_allocator_background_init(struct bch_fs *);
diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c
index 7a92e3d53254..8f0b94f591be 100644
--- a/fs/bcachefs/alloc_foreground.c
+++ b/fs/bcachefs/alloc_foreground.c
@@ -192,8 +192,9 @@ long bch2_bucket_alloc_new_fs(struct bch_dev *ca)
rcu_read_lock();
buckets = bucket_array(ca);
- for (b = ca->mi.first_bucket; b < ca->mi.nbuckets; b++)
- if (is_available_bucket(buckets->b[b].mark))
+ for (b = buckets->first_bucket; b < buckets->nbuckets; b++)
+ if (is_available_bucket(buckets->b[b].mark) &&
+ !buckets->b[b].mark.owned_by_allocator)
goto success;
b = -1;
success:
@@ -204,9 +205,10 @@ success:
static inline unsigned open_buckets_reserved(enum alloc_reserve reserve)
{
switch (reserve) {
- case RESERVE_ALLOC:
- return 0;
case RESERVE_BTREE:
+ case RESERVE_BTREE_MOVINGGC:
+ return 0;
+ case RESERVE_MOVINGGC:
return OPEN_BUCKETS_COUNT / 4;
default:
return OPEN_BUCKETS_COUNT / 2;
@@ -223,9 +225,8 @@ struct open_bucket *bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca,
bool may_alloc_partial,
struct closure *cl)
{
- struct bucket_array *buckets;
struct open_bucket *ob;
- long bucket = 0;
+ long b = 0;
spin_lock(&c->freelist_lock);
@@ -259,22 +260,13 @@ struct open_bucket *bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca,
return ERR_PTR(-OPEN_BUCKETS_EMPTY);
}
- if (likely(fifo_pop(&ca->free[RESERVE_NONE], bucket)))
+ if (likely(fifo_pop(&ca->free[RESERVE_NONE], b)))
goto out;
switch (reserve) {
- case RESERVE_ALLOC:
- if (fifo_pop(&ca->free[RESERVE_BTREE], bucket))
- goto out;
- break;
- case RESERVE_BTREE:
- if (fifo_used(&ca->free[RESERVE_BTREE]) * 2 >=
- ca->free[RESERVE_BTREE].size &&
- fifo_pop(&ca->free[RESERVE_BTREE], bucket))
- goto out;
- break;
+ case RESERVE_BTREE_MOVINGGC:
case RESERVE_MOVINGGC:
- if (fifo_pop(&ca->free[RESERVE_MOVINGGC], bucket))
+ if (fifo_pop(&ca->free[RESERVE_MOVINGGC], b))
goto out;
break;
default:
@@ -292,20 +284,19 @@ struct open_bucket *bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca,
trace_bucket_alloc_fail(ca, reserve);
return ERR_PTR(-FREELIST_EMPTY);
out:
- verify_not_on_freelist(c, ca, bucket);
+ verify_not_on_freelist(c, ca, b);
ob = bch2_open_bucket_alloc(c);
spin_lock(&ob->lock);
- buckets = bucket_array(ca);
ob->valid = true;
ob->sectors_free = ca->mi.bucket_size;
ob->alloc_reserve = reserve;
ob->ptr = (struct bch_extent_ptr) {
.type = 1 << BCH_EXTENT_ENTRY_ptr,
- .gen = buckets->b[bucket].mark.gen,
- .offset = bucket_to_sector(ca, bucket),
+ .gen = bucket(ca, b)->mark.gen,
+ .offset = bucket_to_sector(ca, b),
.dev = ca->dev_idx,
};
@@ -458,16 +449,18 @@ bch2_bucket_alloc_set(struct bch_fs *c,
* it's to a device we don't want:
*/
-static void bucket_alloc_from_stripe(struct bch_fs *c,
- struct open_buckets *ptrs,
- struct write_point *wp,
- struct bch_devs_mask *devs_may_alloc,
- u16 target,
- unsigned erasure_code,
- unsigned nr_replicas,
- unsigned *nr_effective,
- bool *have_cache,
- unsigned flags)
+static enum bucket_alloc_ret
+bucket_alloc_from_stripe(struct bch_fs *c,
+ struct open_buckets *ptrs,
+ struct write_point *wp,
+ struct bch_devs_mask *devs_may_alloc,
+ u16 target,
+ unsigned erasure_code,
+ unsigned nr_replicas,
+ unsigned *nr_effective,
+ bool *have_cache,
+ unsigned flags,
+ struct closure *cl)
{
struct dev_alloc_list devs_sorted;
struct ec_stripe_head *h;
@@ -476,31 +469,39 @@ static void bucket_alloc_from_stripe(struct bch_fs *c,
unsigned i, ec_idx;
if (!erasure_code)
- return;
+ return 0;
if (nr_replicas < 2)
- return;
+ return 0;
if (ec_open_bucket(c, ptrs))
- return;
+ return 0;
- h = bch2_ec_stripe_head_get(c, target, 0, nr_replicas - 1);
+ h = bch2_ec_stripe_head_get(c, target, 0, nr_replicas - 1,
+ wp == &c->copygc_write_point,
+ cl);
+ if (IS_ERR(h))
+ return -PTR_ERR(h);
if (!h)
- return;
+ return 0;
devs_sorted = bch2_dev_alloc_list(c, &wp->stripe, devs_may_alloc);
for (i = 0; i < devs_sorted.nr; i++)
- open_bucket_for_each(c, &h->s->blocks, ob, ec_idx)
+ for (ec_idx = 0; ec_idx < h->s->nr_data; ec_idx++) {
+ if (!h->s->blocks[ec_idx])
+ continue;
+
+ ob = c->open_buckets + h->s->blocks[ec_idx];
if (ob->ptr.dev == devs_sorted.devs[i] &&
- !test_and_set_bit(h->s->data_block_idx[ec_idx],
- h->s->blocks_allocated))
+ !test_and_set_bit(ec_idx, h->s->blocks_allocated))
goto got_bucket;
+ }
goto out_put_head;
got_bucket:
ca = bch_dev_bkey_exists(c, ob->ptr.dev);
- ob->ec_idx = h->s->data_block_idx[ec_idx];
+ ob->ec_idx = ec_idx;
ob->ec = h->s;
add_new_bucket(c, ptrs, devs_may_alloc,
@@ -508,6 +509,7 @@ got_bucket:
atomic_inc(&h->s->pin);
out_put_head:
bch2_ec_stripe_head_put(c, h);
+ return 0;
}
/* Sector allocator */
@@ -585,10 +587,13 @@ open_bucket_add_buckets(struct bch_fs *c,
}
if (!ec_open_bucket(c, ptrs)) {
- bucket_alloc_from_stripe(c, ptrs, wp, &devs,
+ ret = bucket_alloc_from_stripe(c, ptrs, wp, &devs,
target, erasure_code,
nr_replicas, nr_effective,
- have_cache, flags);
+ have_cache, flags, _cl);
+ if (ret == FREELIST_EMPTY ||
+ ret == OPEN_BUCKETS_EMPTY)
+ return ret;
if (*nr_effective >= nr_replicas)
return 0;
}
@@ -634,10 +639,13 @@ void bch2_open_buckets_stop_dev(struct bch_fs *c, struct bch_dev *ca,
if (!drop && ob->ec) {
mutex_lock(&ob->ec->lock);
- open_bucket_for_each(c, &ob->ec->blocks, ob2, j)
- drop |= ob2->ptr.dev == ca->dev_idx;
- open_bucket_for_each(c, &ob->ec->parity, ob2, j)
+ for (j = 0; j < ob->ec->new_stripe.key.v.nr_blocks; j++) {
+ if (!ob->ec->blocks[j])
+ continue;
+
+ ob2 = c->open_buckets + ob->ec->blocks[j];
drop |= ob2->ptr.dev == ca->dev_idx;
+ }
mutex_unlock(&ob->ec->lock);
}
diff --git a/fs/bcachefs/alloc_types.h b/fs/bcachefs/alloc_types.h
index 20705460bb0a..be164d6108bb 100644
--- a/fs/bcachefs/alloc_types.h
+++ b/fs/bcachefs/alloc_types.h
@@ -10,38 +10,12 @@
struct ec_bucket_buf;
-/* There's two of these clocks, one for reads and one for writes: */
-struct bucket_clock {
- /*
- * "now" in (read/write) IO time - incremented whenever we do X amount
- * of reads or writes.
- *
- * Goes with the bucket read/write prios: when we read or write to a
- * bucket we reset the bucket's prio to the current hand; thus hand -
- * prio = time since bucket was last read/written.
- *
- * The units are some amount (bytes/sectors) of data read/written, and
- * the units can change on the fly if we need to rescale to fit
- * everything in a u16 - your only guarantee is that the units are
- * consistent.
- */
- u16 hand;
- u16 max_last_io;
-
- int rw;
-
- struct io_timer rescale;
- struct mutex lock;
-};
-
-/* There is one reserve for each type of btree, one for prios and gens
- * and one for moving GC */
enum alloc_reserve {
- RESERVE_ALLOC = -1,
- RESERVE_BTREE = 0,
- RESERVE_MOVINGGC = 1,
- RESERVE_NONE = 2,
- RESERVE_NR = 3,
+ RESERVE_BTREE_MOVINGGC = -2,
+ RESERVE_BTREE = -1,
+ RESERVE_MOVINGGC = 0,
+ RESERVE_NONE = 1,
+ RESERVE_NR = 2,
};
typedef FIFO(long) alloc_fifo;
@@ -89,7 +63,6 @@ struct write_point {
u64 last_used;
unsigned long write_point;
enum bch_data_type type;
- bool is_ec;
/* calculated based on how many pointers we're actually going to use: */
unsigned sectors_free;
diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h
index d3d67cc71faa..8e363e2fa8c4 100644
--- a/fs/bcachefs/bcachefs.h
+++ b/fs/bcachefs/bcachefs.h
@@ -429,7 +429,9 @@ struct bch_dev {
unsigned long *buckets_nouse;
struct rw_semaphore bucket_lock;
- struct bch_dev_usage __percpu *usage[2];
+ struct bch_dev_usage *usage_base;
+ struct bch_dev_usage __percpu *usage[JOURNAL_BUF_NR];
+ struct bch_dev_usage __percpu *usage_gc;
/* Allocator: */
struct task_struct __rcu *alloc_thread;
@@ -451,9 +453,6 @@ struct bch_dev {
size_t fifo_last_bucket;
- /* last calculated minimum prio */
- u16 max_last_bucket_io[2];
-
size_t inc_gen_needs_gc;
size_t inc_gen_really_needs_gc;
@@ -473,6 +472,7 @@ struct bch_dev {
atomic64_t rebalance_work;
struct journal_device journal;
+ u64 prev_journal_sector;
struct work_struct io_error_work;
@@ -509,8 +509,9 @@ enum {
BCH_FS_ERRORS_FIXED,
/* misc: */
- BCH_FS_FIXED_GENS,
- BCH_FS_ALLOC_WRITTEN,
+ BCH_FS_NEED_ANOTHER_GC,
+ BCH_FS_DELETED_NODES,
+ BCH_FS_NEED_ALLOC_WRITE,
BCH_FS_REBUILD_REPLICAS,
BCH_FS_HOLD_BTREE_WRITES,
};
@@ -539,11 +540,13 @@ struct journal_keys {
struct journal_key {
enum btree_id btree_id:8;
unsigned level:8;
+ bool allocated;
struct bkey_i *k;
u32 journal_seq;
u32 journal_offset;
} *d;
size_t nr;
+ size_t size;
u64 journal_seq_base;
};
@@ -579,7 +582,10 @@ struct bch_fs {
struct bch_replicas_cpu replicas_gc;
struct mutex replicas_gc_lock;
+ struct journal_entry_res btree_root_journal_res;
struct journal_entry_res replicas_journal_res;
+ struct journal_entry_res clock_journal_res;
+ struct journal_entry_res dev_usage_journal_res;
struct bch_disk_groups_cpu __rcu *disk_groups;
@@ -688,14 +694,6 @@ struct bch_fs {
struct mutex usage_scratch_lock;
struct bch_fs_usage *usage_scratch;
- /*
- * When we invalidate buckets, we use both the priority and the amount
- * of good data to determine which buckets to reuse first - to weight
- * those together consistently we keep track of the smallest nonzero
- * priority of any bucket.
- */
- struct bucket_clock bucket_clock[2];
-
struct io_clock io_clock[2];
/* JOURNAL SEQ BLACKLIST */
@@ -759,7 +757,7 @@ struct bch_fs {
ZSTD_parameters zstd_params;
struct crypto_shash *sha256;
- struct crypto_skcipher *chacha20;
+ struct crypto_sync_skcipher *chacha20;
struct crypto_shash *poly1305;
atomic64_t key_version;
@@ -840,6 +838,7 @@ struct bch_fs {
struct journal journal;
struct list_head journal_entries;
struct journal_keys journal_keys;
+ struct list_head journal_iters;
u64 last_bucket_seq_cleanup;
diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h
index 02a76c3d3acb..73eeeb10472a 100644
--- a/fs/bcachefs/bcachefs_format.h
+++ b/fs/bcachefs/bcachefs_format.h
@@ -341,7 +341,8 @@ static inline void bkey_init(struct bkey *k)
x(reflink_v, 16) \
x(inline_data, 17) \
x(btree_ptr_v2, 18) \
- x(indirect_inline_data, 19)
+ x(indirect_inline_data, 19) \
+ x(alloc_v2, 20)
enum bch_bkey_type {
#define x(name, nr) KEY_TYPE_##name = nr,
@@ -551,9 +552,11 @@ struct bch_extent_stripe_ptr {
#if defined(__LITTLE_ENDIAN_BITFIELD)
__u64 type:5,
block:8,
- idx:51;
+ redundancy:4,
+ idx:47;
#elif defined (__BIG_ENDIAN_BITFIELD)
- __u64 idx:51,
+ __u64 idx:47,
+ redundancy:4,
block:8,
type:5;
#endif
@@ -603,13 +606,14 @@ struct bch_btree_ptr_v2 {
__u64 mem_ptr;
__le64 seq;
__le16 sectors_written;
- /* In case we ever decide to do variable size btree nodes: */
- __le16 sectors;
+ __le16 flags;
struct bpos min_key;
struct bch_extent_ptr start[0];
__u64 _data[0];
} __attribute__((packed, aligned(8)));
+LE16_BITMASK(BTREE_PTR_RANGE_UPDATED, struct bch_btree_ptr_v2, flags, 0, 1);
+
struct bch_extent {
struct bch_val v;
@@ -634,8 +638,6 @@ struct bch_reservation {
#define BKEY_EXTENT_VAL_U64s_MAX \
(1 + BKEY_EXTENT_PTR_U64s_MAX * (BCH_REPLICAS_MAX + 1))
-#define BKEY_PADDED(key) __BKEY_PADDED(key, BKEY_EXTENT_VAL_U64s_MAX)
-
/* * Maximum possible size of an entire extent, key + value: */
#define BKEY_EXTENT_U64s_MAX (BKEY_U64s + BKEY_EXTENT_VAL_U64s_MAX)
@@ -800,35 +802,40 @@ struct bch_alloc {
__u8 data[];
} __attribute__((packed, aligned(8)));
-#define BCH_ALLOC_FIELDS() \
+#define BCH_ALLOC_FIELDS_V1() \
x(read_time, 16) \
x(write_time, 16) \
x(data_type, 8) \
x(dirty_sectors, 16) \
x(cached_sectors, 16) \
- x(oldest_gen, 8)
+ x(oldest_gen, 8) \
+ x(stripe, 32) \
+ x(stripe_redundancy, 8)
+
+struct bch_alloc_v2 {
+ struct bch_val v;
+ __u8 nr_fields;
+ __u8 gen;
+ __u8 oldest_gen;
+ __u8 data_type;
+ __u8 data[];
+} __attribute__((packed, aligned(8)));
+
+#define BCH_ALLOC_FIELDS_V2() \
+ x(read_time, 64) \
+ x(write_time, 64) \
+ x(dirty_sectors, 16) \
+ x(cached_sectors, 16) \
+ x(stripe, 32) \
+ x(stripe_redundancy, 8)
enum {
-#define x(name, bytes) BCH_ALLOC_FIELD_##name,
- BCH_ALLOC_FIELDS()
+#define x(name, _bits) BCH_ALLOC_FIELD_V1_##name,
+ BCH_ALLOC_FIELDS_V1()
#undef x
BCH_ALLOC_FIELD_NR
};
-static const unsigned BCH_ALLOC_FIELD_BYTES[] = {
-#define x(name, bits) [BCH_ALLOC_FIELD_##name] = bits / 8,
- BCH_ALLOC_FIELDS()
-#undef x
-};
-
-#define x(name, bits) + (bits / 8)
-static const unsigned BKEY_ALLOC_VAL_U64s_MAX =
- DIV_ROUND_UP(offsetof(struct bch_alloc, data)
- BCH_ALLOC_FIELDS(), sizeof(u64));
-#undef x
-
-#define BKEY_ALLOC_U64s_MAX (BKEY_U64s + BKEY_ALLOC_VAL_U64s_MAX)
-
/* Quotas: */
enum quota_types {
@@ -1132,8 +1139,8 @@ struct bch_sb_field_clean {
struct bch_sb_field field;
__le32 flags;
- __le16 read_clock;
- __le16 write_clock;
+ __le16 _read_clock; /* no longer used */
+ __le16 _write_clock;
__le64 journal_seq;
union {
@@ -1306,6 +1313,7 @@ LE64_BITMASK(BCH_SB_BACKGROUND_COMPRESSION_TYPE,
LE64_BITMASK(BCH_SB_GC_RESERVE_BYTES, struct bch_sb, flags[2], 4, 64);
LE64_BITMASK(BCH_SB_ERASURE_CODE, struct bch_sb, flags[3], 0, 16);
+LE64_BITMASK(BCH_SB_METADATA_TARGET, struct bch_sb, flags[3], 16, 28);
/*
* Features:
@@ -1333,15 +1341,23 @@ LE64_BITMASK(BCH_SB_ERASURE_CODE, struct bch_sb, flags[3], 0, 16);
x(btree_updates_journalled, 13) \
x(reflink_inline_data, 14) \
x(new_varint, 15) \
- x(journal_no_flush, 16)
+ x(journal_no_flush, 16) \
+ x(alloc_v2, 17) \
+ x(extents_across_btree_nodes, 18)
+
+#define BCH_SB_FEATURES_ALWAYS \
+ ((1ULL << BCH_FEATURE_new_extent_overwrite)| \
+ (1ULL << BCH_FEATURE_extents_above_btree_updates)|\
+ (1ULL << BCH_FEATURE_btree_updates_journalled)|\
+ (1ULL << BCH_FEATURE_extents_across_btree_nodes))
#define BCH_SB_FEATURES_ALL \
- ((1ULL << BCH_FEATURE_new_siphash)| \
- (1ULL << BCH_FEATURE_new_extent_overwrite)| \
+ (BCH_SB_FEATURES_ALWAYS| \
+ (1ULL << BCH_FEATURE_new_siphash)| \
(1ULL << BCH_FEATURE_btree_ptr_v2)| \
- (1ULL << BCH_FEATURE_extents_above_btree_updates)|\
(1ULL << BCH_FEATURE_new_varint)| \
- (1ULL << BCH_FEATURE_journal_no_flush))
+ (1ULL << BCH_FEATURE_journal_no_flush)| \
+ (1ULL << BCH_FEATURE_alloc_v2))
enum bch_sb_feature {
#define x(f, n) BCH_FEATURE_##f,
@@ -1350,15 +1366,25 @@ enum bch_sb_feature {
BCH_FEATURE_NR,
};
+#define BCH_SB_COMPAT() \
+ x(alloc_info, 0) \
+ x(alloc_metadata, 1) \
+ x(extents_above_btree_updates_done, 2) \
+ x(bformat_overflow_done, 3)
+
enum bch_sb_compat {
- BCH_COMPAT_FEAT_ALLOC_INFO = 0,
- BCH_COMPAT_FEAT_ALLOC_METADATA = 1,
+#define x(f, n) BCH_COMPAT_##f,
+ BCH_SB_COMPAT()
+#undef x
+ BCH_COMPAT_NR,
};
/* options: */
#define BCH_REPLICAS_MAX 4U
+#define BCH_BKEY_PTRS_MAX 16U
+
enum bch_error_actions {
BCH_ON_ERROR_CONTINUE = 0,
BCH_ON_ERROR_RO = 1,
@@ -1492,7 +1518,9 @@ static inline __u64 __bset_magic(struct bch_sb *sb)
x(blacklist, 3) \
x(blacklist_v2, 4) \
x(usage, 5) \
- x(data_usage, 6)
+ x(data_usage, 6) \
+ x(clock, 7) \
+ x(dev_usage, 8)
enum {
#define x(f, nr) BCH_JSET_ENTRY_##f = nr,
@@ -1540,6 +1568,30 @@ struct jset_entry_data_usage {
struct bch_replicas_entry r;
} __attribute__((packed));
+struct jset_entry_clock {
+ struct jset_entry entry;
+ __u8 rw;
+ __u8 pad[7];
+ __le64 time;
+} __attribute__((packed));
+
+struct jset_entry_dev_usage_type {
+ __le64 buckets;
+ __le64 sectors;
+ __le64 fragmented;
+} __attribute__((packed));
+
+struct jset_entry_dev_usage {
+ struct jset_entry entry;
+ __le32 dev;
+ __u32 pad;
+
+ __le64 buckets_ec;
+ __le64 buckets_unavailable;
+
+ struct jset_entry_dev_usage_type d[];
+} __attribute__((packed));
+
/*
* On disk format for a journal entry:
* seq is monotonically increasing; every journal entry has its own unique
@@ -1562,8 +1614,8 @@ struct jset {
__u8 encrypted_start[0];
- __le16 read_clock;
- __le16 write_clock;
+ __le16 _read_clock; /* no longer used */
+ __le16 _write_clock;
/* Sequence number of oldest dirty journal entry */
__le64 last_seq;
diff --git a/fs/bcachefs/bcachefs_ioctl.h b/fs/bcachefs/bcachefs_ioctl.h
index 0e626b098d91..f679fc2151bc 100644
--- a/fs/bcachefs/bcachefs_ioctl.h
+++ b/fs/bcachefs/bcachefs_ioctl.h
@@ -14,6 +14,9 @@
#define BCH_FORCE_IF_DATA_DEGRADED (1 << 2)
#define BCH_FORCE_IF_METADATA_DEGRADED (1 << 3)
+#define BCH_FORCE_IF_LOST \
+ (BCH_FORCE_IF_DATA_LOST| \
+ BCH_FORCE_IF_METADATA_LOST)
#define BCH_FORCE_IF_DEGRADED \
(BCH_FORCE_IF_DATA_DEGRADED| \
BCH_FORCE_IF_METADATA_DEGRADED)
@@ -168,10 +171,11 @@ struct bch_ioctl_disk_set_state {
};
enum bch_data_ops {
- BCH_DATA_OP_SCRUB = 0,
- BCH_DATA_OP_REREPLICATE = 1,
- BCH_DATA_OP_MIGRATE = 2,
- BCH_DATA_OP_NR = 3,
+ BCH_DATA_OP_SCRUB = 0,
+ BCH_DATA_OP_REREPLICATE = 1,
+ BCH_DATA_OP_MIGRATE = 2,
+ BCH_DATA_OP_REWRITE_OLD_NODES = 3,
+ BCH_DATA_OP_NR = 4,
};
/*
@@ -184,11 +188,13 @@ enum bch_data_ops {
* job. The file descriptor is O_CLOEXEC.
*/
struct bch_ioctl_data {
- __u32 op;
+ __u16 op;
+ __u8 start_btree;
+ __u8 end_btree;
__u32 flags;
- struct bpos start;
- struct bpos end;
+ struct bpos start_pos;
+ struct bpos end_pos;
union {
struct {
diff --git a/fs/bcachefs/bkey.c b/fs/bcachefs/bkey.c
index c06d0a965be1..e1906f257ef2 100644
--- a/fs/bcachefs/bkey.c
+++ b/fs/bcachefs/bkey.c
@@ -551,7 +551,12 @@ void bch2_bkey_format_add_pos(struct bkey_format_state *s, struct bpos p)
static void set_format_field(struct bkey_format *f, enum bch_bkey_fields i,
unsigned bits, u64 offset)
{
- offset = bits == 64 ? 0 : min(offset, U64_MAX - ((1ULL << bits) - 1));
+ unsigned unpacked_bits = bch2_bkey_format_current.bits_per_field[i];
+ u64 unpacked_max = ~((~0ULL << 1) << (unpacked_bits - 1));
+
+ bits = min(bits, unpacked_bits);
+
+ offset = bits == unpacked_bits ? 0 : min(offset, unpacked_max - ((1ULL << bits) - 1));
f->bits_per_field[i] = bits;
f->field_offset[i] = cpu_to_le64(offset);
diff --git a/fs/bcachefs/bkey.h b/fs/bcachefs/bkey.h
index 2d2c640305e2..48821f6c09aa 100644
--- a/fs/bcachefs/bkey.h
+++ b/fs/bcachefs/bkey.h
@@ -170,6 +170,11 @@ static inline struct bpos bpos_min(struct bpos l, struct bpos r)
return bkey_cmp(l, r) < 0 ? l : r;
}
+static inline struct bpos bpos_max(struct bpos l, struct bpos r)
+{
+ return bkey_cmp(l, r) > 0 ? l : r;
+}
+
void bch2_bpos_swab(struct bpos *);
void bch2_bkey_swab_key(const struct bkey_format *, struct bkey_packed *);
@@ -525,6 +530,7 @@ BKEY_VAL_ACCESSORS(reflink_v);
BKEY_VAL_ACCESSORS(inline_data);
BKEY_VAL_ACCESSORS(btree_ptr_v2);
BKEY_VAL_ACCESSORS(indirect_inline_data);
+BKEY_VAL_ACCESSORS(alloc_v2);
/* byte order helpers */
diff --git a/fs/bcachefs/bkey_buf.h b/fs/bcachefs/bkey_buf.h
new file mode 100644
index 000000000000..0d7c67a959af
--- /dev/null
+++ b/fs/bcachefs/bkey_buf.h
@@ -0,0 +1,60 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_BKEY_BUF_H
+#define _BCACHEFS_BKEY_BUF_H
+
+#include "bcachefs.h"
+
+struct bkey_buf {
+ struct bkey_i *k;
+ u64 onstack[12];
+};
+
+static inline void bch2_bkey_buf_realloc(struct bkey_buf *s,
+ struct bch_fs *c, unsigned u64s)
+{
+ if (s->k == (void *) s->onstack &&
+ u64s > ARRAY_SIZE(s->onstack)) {
+ s->k = mempool_alloc(&c->large_bkey_pool, GFP_NOFS);
+ memcpy(s->k, s->onstack, sizeof(s->onstack));
+ }
+}
+
+static inline void bch2_bkey_buf_reassemble(struct bkey_buf *s,
+ struct bch_fs *c,
+ struct bkey_s_c k)
+{
+ bch2_bkey_buf_realloc(s, c, k.k->u64s);
+ bkey_reassemble(s->k, k);
+}
+
+static inline void bch2_bkey_buf_copy(struct bkey_buf *s,
+ struct bch_fs *c,
+ struct bkey_i *src)
+{
+ bch2_bkey_buf_realloc(s, c, src->k.u64s);
+ bkey_copy(s->k, src);
+}
+
+static inline void bch2_bkey_buf_unpack(struct bkey_buf *s,
+ struct bch_fs *c,
+ struct btree *b,
+ struct bkey_packed *src)
+{
+ bch2_bkey_buf_realloc(s, c, BKEY_U64s +
+ bkeyp_val_u64s(&b->format, src));
+ bch2_bkey_unpack(b, s->k, src);
+}
+
+static inline void bch2_bkey_buf_init(struct bkey_buf *s)
+{
+ s->k = (void *) s->onstack;
+}
+
+static inline void bch2_bkey_buf_exit(struct bkey_buf *s, struct bch_fs *c)
+{
+ if (s->k != (void *) s->onstack)
+ mempool_free(s->k, &c->large_bkey_pool);
+ s->k = NULL;
+}
+
+#endif /* _BCACHEFS_BKEY_BUF_H */
diff --git a/fs/bcachefs/bkey_on_stack.h b/fs/bcachefs/bkey_on_stack.h
deleted file mode 100644
index f607a0cb37ed..000000000000
--- a/fs/bcachefs/bkey_on_stack.h
+++ /dev/null
@@ -1,43 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_BKEY_ON_STACK_H
-#define _BCACHEFS_BKEY_ON_STACK_H
-
-#include "bcachefs.h"
-
-struct bkey_on_stack {
- struct bkey_i *k;
- u64 onstack[12];
-};
-
-static inline void bkey_on_stack_realloc(struct bkey_on_stack *s,
- struct bch_fs *c, unsigned u64s)
-{
- if (s->k == (void *) s->onstack &&
- u64s > ARRAY_SIZE(s->onstack)) {
- s->k = mempool_alloc(&c->large_bkey_pool, GFP_NOFS);
- memcpy(s->k, s->onstack, sizeof(s->onstack));
- }
-}
-
-static inline void bkey_on_stack_reassemble(struct bkey_on_stack *s,
- struct bch_fs *c,
- struct bkey_s_c k)
-{
- bkey_on_stack_realloc(s, c, k.k->u64s);
- bkey_reassemble(s->k, k);
-}
-
-static inline void bkey_on_stack_init(struct bkey_on_stack *s)
-{
- s->k = (void *) s->onstack;
-}
-
-static inline void bkey_on_stack_exit(struct bkey_on_stack *s,
- struct bch_fs *c)
-{
- if (s->k != (void *) s->onstack)
- mempool_free(s->k, &c->large_bkey_pool);
- s->k = NULL;
-}
-
-#endif /* _BCACHEFS_BKEY_ON_STACK_H */
diff --git a/fs/bcachefs/bkey_sort.c b/fs/bcachefs/bkey_sort.c
index 99e0a4011fae..2e1d9cd65f43 100644
--- a/fs/bcachefs/bkey_sort.c
+++ b/fs/bcachefs/bkey_sort.c
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h"
-#include "bkey_on_stack.h"
+#include "bkey_buf.h"
#include "bkey_sort.h"
#include "bset.h"
#include "extents.h"
@@ -187,11 +187,11 @@ bch2_sort_repack_merge(struct bch_fs *c,
bool filter_whiteouts)
{
struct bkey_packed *out = vstruct_last(dst), *k_packed;
- struct bkey_on_stack k;
+ struct bkey_buf k;
struct btree_nr_keys nr;
memset(&nr, 0, sizeof(nr));
- bkey_on_stack_init(&k);
+ bch2_bkey_buf_init(&k);
while ((k_packed = bch2_btree_node_iter_next_all(iter, src))) {
if (filter_whiteouts && bkey_whiteout(k_packed))
@@ -204,7 +204,7 @@ bch2_sort_repack_merge(struct bch_fs *c,
* node; we have to make a copy of the entire key before calling
* normalize
*/
- bkey_on_stack_realloc(&k, c, k_packed->u64s + BKEY_U64s);
+ bch2_bkey_buf_realloc(&k, c, k_packed->u64s + BKEY_U64s);
bch2_bkey_unpack(src, k.k, k_packed);
if (filter_whiteouts &&
@@ -215,7 +215,7 @@ bch2_sort_repack_merge(struct bch_fs *c,
}
dst->u64s = cpu_to_le16((u64 *) out - dst->_data);
- bkey_on_stack_exit(&k, c);
+ bch2_bkey_buf_exit(&k, c);
return nr;
}
@@ -315,11 +315,11 @@ bch2_extent_sort_fix_overlapping(struct bch_fs *c, struct bset *dst,
struct bkey l_unpacked, r_unpacked;
struct bkey_s l, r;
struct btree_nr_keys nr;
- struct bkey_on_stack split;
+ struct bkey_buf split;
unsigned i;
memset(&nr, 0, sizeof(nr));
- bkey_on_stack_init(&split);
+ bch2_bkey_buf_init(&split);
sort_iter_sort(iter, extent_sort_fix_overlapping_cmp);
for (i = 0; i < iter->used;) {
@@ -379,7 +379,7 @@ bch2_extent_sort_fix_overlapping(struct bch_fs *c, struct bset *dst,
/*
* r wins, but it overlaps in the middle of l - split l:
*/
- bkey_on_stack_reassemble(&split, c, l.s_c);
+ bch2_bkey_buf_reassemble(&split, c, l.s_c);
bch2_cut_back(bkey_start_pos(r.k), split.k);
bch2_cut_front_s(r.k->p, l);
@@ -398,7 +398,7 @@ bch2_extent_sort_fix_overlapping(struct bch_fs *c, struct bset *dst,
dst->u64s = cpu_to_le16((u64 *) out - dst->_data);
- bkey_on_stack_exit(&split, c);
+ bch2_bkey_buf_exit(&split, c);
return nr;
}
diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c
index 1dc1f7460f9a..19c219cb317b 100644
--- a/fs/bcachefs/btree_cache.c
+++ b/fs/bcachefs/btree_cache.c
@@ -1,11 +1,13 @@
// SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h"
+#include "bkey_buf.h"
#include "btree_cache.h"
#include "btree_io.h"
#include "btree_iter.h"
#include "btree_locking.h"
#include "debug.h"
+#include "error.h"
#include <linux/prefetch.h>
#include <linux/sched/mm.h>
@@ -81,8 +83,7 @@ static int btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp)
if (!b->data)
return -ENOMEM;
- b->aux_data = __vmalloc(btree_aux_data_bytes(b), gfp,
- PAGE_KERNEL_EXEC);
+ b->aux_data = vmalloc_exec(btree_aux_data_bytes(b), gfp);
if (!b->aux_data) {
kvpfree(b->data, btree_bytes(c));
b->data = NULL;
@@ -812,9 +813,12 @@ lock_node:
return ERR_PTR(-EIO);
}
- EBUG_ON(b->c.btree_id != iter->btree_id ||
- BTREE_NODE_LEVEL(b->data) != level ||
- bkey_cmp(b->data->max_key, k->k.p));
+ EBUG_ON(b->c.btree_id != iter->btree_id);
+ EBUG_ON(BTREE_NODE_LEVEL(b->data) != level);
+ EBUG_ON(bkey_cmp(b->data->max_key, k->k.p));
+ EBUG_ON(b->key.k.type == KEY_TYPE_btree_ptr_v2 &&
+ bkey_cmp(b->data->min_key,
+ bkey_i_to_btree_ptr_v2(&b->key)->v.min_key));
return b;
}
@@ -822,7 +826,8 @@ lock_node:
struct btree *bch2_btree_node_get_noiter(struct bch_fs *c,
const struct bkey_i *k,
enum btree_id btree_id,
- unsigned level)
+ unsigned level,
+ bool nofill)
{
struct btree_cache *bc = &c->btree_cache;
struct btree *b;
@@ -837,6 +842,9 @@ struct btree *bch2_btree_node_get_noiter(struct bch_fs *c,
retry:
b = btree_cache_find(bc, k);
if (unlikely(!b)) {
+ if (nofill)
+ goto out;
+
b = bch2_btree_node_fill(c, NULL, k, btree_id,
level, SIX_LOCK_read, true);
@@ -844,8 +852,12 @@ retry:
if (!b)
goto retry;
+ if (IS_ERR(b) &&
+ !bch2_btree_cache_cannibalize_lock(c, NULL))
+ goto retry;
+
if (IS_ERR(b))
- return b;
+ goto out;
} else {
lock_node:
ret = six_lock_read(&b->c.lock, lock_node_check_fn, (void *) k);
@@ -880,13 +892,18 @@ lock_node:
if (unlikely(btree_node_read_error(b))) {
six_unlock_read(&b->c.lock);
- return ERR_PTR(-EIO);
+ b = ERR_PTR(-EIO);
+ goto out;
}
- EBUG_ON(b->c.btree_id != btree_id ||
- BTREE_NODE_LEVEL(b->data) != level ||
- bkey_cmp(b->data->max_key, k->k.p));
-
+ EBUG_ON(b->c.btree_id != btree_id);
+ EBUG_ON(BTREE_NODE_LEVEL(b->data) != level);
+ EBUG_ON(bkey_cmp(b->data->max_key, k->k.p));
+ EBUG_ON(b->key.k.type == KEY_TYPE_btree_ptr_v2 &&
+ bkey_cmp(b->data->min_key,
+ bkey_i_to_btree_ptr_v2(&b->key)->v.min_key));
+out:
+ bch2_btree_cache_cannibalize_unlock(c);
return b;
}
@@ -899,10 +916,12 @@ struct btree *bch2_btree_node_get_sibling(struct bch_fs *c,
struct btree *parent;
struct btree_node_iter node_iter;
struct bkey_packed *k;
- BKEY_PADDED(k) tmp;
+ struct bkey_buf tmp;
struct btree *ret = NULL;
unsigned level = b->c.level;
+ bch2_bkey_buf_init(&tmp);
+
parent = btree_iter_node(iter, level + 1);
if (!parent)
return NULL;
@@ -936,9 +955,9 @@ struct btree *bch2_btree_node_get_sibling(struct bch_fs *c,
if (!k)
goto out;
- bch2_bkey_unpack(parent, &tmp.k, k);
+ bch2_bkey_buf_unpack(&tmp, c, parent, k);
- ret = bch2_btree_node_get(c, iter, &tmp.k, level,
+ ret = bch2_btree_node_get(c, iter, tmp.k, level,
SIX_LOCK_intent, _THIS_IP_);
if (PTR_ERR_OR_ZERO(ret) == -EINTR && !trans->nounlock) {
@@ -958,7 +977,7 @@ struct btree *bch2_btree_node_get_sibling(struct bch_fs *c,
if (sib == btree_prev_sib)
btree_node_unlock(iter, level);
- ret = bch2_btree_node_get(c, iter, &tmp.k, level,
+ ret = bch2_btree_node_get(c, iter, tmp.k, level,
SIX_LOCK_intent, _THIS_IP_);
/*
@@ -993,30 +1012,46 @@ out:
if (sib != btree_prev_sib)
swap(n1, n2);
- BUG_ON(bkey_cmp(bkey_successor(n1->key.k.p),
- n2->data->min_key));
+ if (bkey_cmp(bkey_successor(n1->key.k.p),
+ n2->data->min_key)) {
+ char buf1[200], buf2[200];
+
+ bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(&n1->key));
+ bch2_bkey_val_to_text(&PBUF(buf2), c, bkey_i_to_s_c(&n2->key));
+
+ bch2_fs_inconsistent(c, "btree topology error at btree %s level %u:\n"
+ "prev: %s\n"
+ "next: %s\n",
+ bch2_btree_ids[iter->btree_id], level,
+ buf1, buf2);
+
+ six_unlock_intent(&ret->c.lock);
+ ret = NULL;
+ }
}
bch2_btree_trans_verify_locks(trans);
+ bch2_bkey_buf_exit(&tmp, c);
+
return ret;
}
void bch2_btree_node_prefetch(struct bch_fs *c, struct btree_iter *iter,
- const struct bkey_i *k, unsigned level)
+ const struct bkey_i *k,
+ enum btree_id btree_id, unsigned level)
{
struct btree_cache *bc = &c->btree_cache;
struct btree *b;
- BUG_ON(!btree_node_locked(iter, level + 1));
+ BUG_ON(iter && !btree_node_locked(iter, level + 1));
BUG_ON(level >= BTREE_MAX_DEPTH);
b = btree_cache_find(bc, k);
if (b)
return;
- bch2_btree_node_fill(c, iter, k, iter->btree_id,
- level, SIX_LOCK_read, false);
+ bch2_btree_node_fill(c, iter, k, btree_id, level, SIX_LOCK_read, false);
}
void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c,
@@ -1068,6 +1103,7 @@ void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c,
void bch2_btree_cache_to_text(struct printbuf *out, struct bch_fs *c)
{
- pr_buf(out, "nr nodes:\t%u\n", c->btree_cache.used);
- pr_buf(out, "nr dirty:\t%u\n", atomic_read(&c->btree_cache.dirty));
+ pr_buf(out, "nr nodes:\t\t%u\n", c->btree_cache.used);
+ pr_buf(out, "nr dirty:\t\t%u\n", atomic_read(&c->btree_cache.dirty));
+ pr_buf(out, "cannibalize lock:\t%p\n", c->btree_cache.alloc_lock);
}
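
bch2_btree_node_get_noiter() now takes a nofill flag, used by the repair code further down: with nofill set, the lookup only consults the in-memory btree node cache and never issues a read from disk. A short sketch of that calling convention, assuming the in-tree headers; the wrapper function here is hypothetical:

#include <linux/err.h>
#include "btree_cache.h"

static void peek_cached_node_example(struct bch_fs *c, const struct bkey_i *k,
				     enum btree_id btree_id, unsigned level)
{
	/* nofill == true: return NULL on a cache miss instead of reading the node */
	struct btree *b = bch2_btree_node_get_noiter(c, k, btree_id, level, true);

	if (IS_ERR_OR_NULL(b))
		return;

	/* ... b->data may be inspected here; the node is held with a read lock ... */

	six_unlock_read(&b->c.lock);
}

Note also that the function now releases the btree cache cannibalize lock at its single exit point (the out label above), which covers the new retry-after-cannibalize path.
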
diff --git a/fs/bcachefs/btree_cache.h b/fs/bcachefs/btree_cache.h
index e766ef552ce7..5fffae92effb 100644
--- a/fs/bcachefs/btree_cache.h
+++ b/fs/bcachefs/btree_cache.h
@@ -26,13 +26,13 @@ struct btree *bch2_btree_node_get(struct bch_fs *, struct btree_iter *,
enum six_lock_type, unsigned long);
struct btree *bch2_btree_node_get_noiter(struct bch_fs *, const struct bkey_i *,
- enum btree_id, unsigned);
+ enum btree_id, unsigned, bool);
struct btree *bch2_btree_node_get_sibling(struct bch_fs *, struct btree_iter *,
struct btree *, enum btree_node_sibling);
void bch2_btree_node_prefetch(struct bch_fs *, struct btree_iter *,
- const struct bkey_i *, unsigned);
+ const struct bkey_i *, enum btree_id, unsigned);
void bch2_fs_btree_cache_exit(struct bch_fs *);
int bch2_fs_btree_cache_init(struct bch_fs *);
diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c
index 6268ea637d19..3a1518999c4b 100644
--- a/fs/bcachefs/btree_gc.c
+++ b/fs/bcachefs/btree_gc.c
@@ -8,7 +8,7 @@
#include "alloc_background.h"
#include "alloc_foreground.h"
#include "bkey_methods.h"
-#include "bkey_on_stack.h"
+#include "bkey_buf.h"
#include "btree_locking.h"
#include "btree_update_interior.h"
#include "btree_io.h"
@@ -50,39 +50,228 @@ static inline void gc_pos_set(struct bch_fs *c, struct gc_pos new_pos)
__gc_pos_set(c, new_pos);
}
+/*
+ * Missing: if an interior btree node is empty, we need to do something -
+ * perhaps just kill it
+ */
static int bch2_gc_check_topology(struct bch_fs *c,
- struct bkey_s_c k,
- struct bpos *expected_start,
- struct bpos expected_end,
+ struct btree *b,
+ struct bkey_buf *prev,
+ struct bkey_buf cur,
bool is_last)
{
+ struct bpos node_start = b->data->min_key;
+ struct bpos node_end = b->data->max_key;
+ struct bpos expected_start = bkey_deleted(&prev->k->k)
+ ? node_start
+ : bkey_successor(prev->k->k.p);
+ char buf1[200], buf2[200];
+ bool update_min = false;
+ bool update_max = false;
int ret = 0;
- if (k.k->type == KEY_TYPE_btree_ptr_v2) {
- struct bkey_s_c_btree_ptr_v2 bp = bkey_s_c_to_btree_ptr_v2(k);
+ if (cur.k->k.type == KEY_TYPE_btree_ptr_v2) {
+ struct bkey_i_btree_ptr_v2 *bp = bkey_i_to_btree_ptr_v2(cur.k);
+
+ if (bkey_deleted(&prev->k->k))
+ scnprintf(buf1, sizeof(buf1), "start of node: %llu:%llu",
+ node_start.inode,
+ node_start.offset);
+ else
+ bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(prev->k));
+
+ if (fsck_err_on(bkey_cmp(expected_start, bp->v.min_key), c,
+ "btree node with incorrect min_key at btree %s level %u:\n"
+ " prev %s\n"
+ " cur %s",
+ bch2_btree_ids[b->c.btree_id], b->c.level,
+ buf1,
+ (bch2_bkey_val_to_text(&PBUF(buf2), c, bkey_i_to_s_c(cur.k)), buf2)))
+ update_min = true;
+ }
+
+ if (fsck_err_on(is_last &&
+ bkey_cmp(cur.k->k.p, node_end), c,
+ "btree node with incorrect max_key at btree %s level %u:\n"
+ " %s\n"
+ " expected %s",
+ bch2_btree_ids[b->c.btree_id], b->c.level,
+ (bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(cur.k)), buf1),
+ (bch2_bpos_to_text(&PBUF(buf2), node_end), buf2)))
+ update_max = true;
+
+ bch2_bkey_buf_copy(prev, c, cur.k);
+
+ if (update_min || update_max) {
+ struct bkey_i *new;
+ struct bkey_i_btree_ptr_v2 *bp = NULL;
+ struct btree *n;
+
+ if (update_max) {
+ ret = bch2_journal_key_delete(c, b->c.btree_id,
+ b->c.level, cur.k->k.p);
+ if (ret)
+ return ret;
+ }
+
+ new = kmalloc(bkey_bytes(&cur.k->k), GFP_KERNEL);
+ if (!new) {
+ bch_err(c, "%s: error allocating new key", __func__);
+ return -ENOMEM;
+ }
+
+ bkey_copy(new, cur.k);
+
+ if (new->k.type == KEY_TYPE_btree_ptr_v2)
+ bp = bkey_i_to_btree_ptr_v2(new);
+
+ if (update_min)
+ bp->v.min_key = expected_start;
+ if (update_max)
+ new->k.p = node_end;
+ if (bp)
+ SET_BTREE_PTR_RANGE_UPDATED(&bp->v, true);
+
+ ret = bch2_journal_key_insert(c, b->c.btree_id, b->c.level, new);
+ if (ret) {
+ kfree(new);
+ return ret;
+ }
- if (fsck_err_on(bkey_cmp(*expected_start, bp.v->min_key), c,
- "btree node with incorrect min_key: got %llu:%llu, should be %llu:%llu",
- bp.v->min_key.inode,
- bp.v->min_key.offset,
- expected_start->inode,
- expected_start->offset)) {
- BUG();
+ n = bch2_btree_node_get_noiter(c, cur.k, b->c.btree_id,
+ b->c.level - 1, true);
+ if (n) {
+ mutex_lock(&c->btree_cache.lock);
+ bch2_btree_node_hash_remove(&c->btree_cache, n);
+
+ bkey_copy(&n->key, new);
+ if (update_min)
+ n->data->min_key = expected_start;
+ if (update_max)
+ n->data->max_key = node_end;
+
+ ret = __bch2_btree_node_hash_insert(&c->btree_cache, n);
+ BUG_ON(ret);
+ mutex_unlock(&c->btree_cache.lock);
+ six_unlock_read(&n->c.lock);
}
}
+fsck_err:
+ return ret;
+}
- *expected_start = bkey_cmp(k.k->p, POS_MAX)
- ? bkey_successor(k.k->p)
- : k.k->p;
+static int bch2_check_fix_ptrs(struct bch_fs *c, enum btree_id btree_id,
+ unsigned level, bool is_root,
+ struct bkey_s_c *k)
+{
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(*k);
+ const union bch_extent_entry *entry;
+ struct extent_ptr_decoded p = { 0 };
+ bool do_update = false;
+ int ret = 0;
- if (fsck_err_on(is_last &&
- bkey_cmp(k.k->p, expected_end), c,
- "btree node with incorrect max_key: got %llu:%llu, should be %llu:%llu",
- k.k->p.inode,
- k.k->p.offset,
- expected_end.inode,
- expected_end.offset)) {
- BUG();
+ bkey_for_each_ptr_decode(k->k, ptrs, p, entry) {
+ struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev);
+ struct bucket *g = PTR_BUCKET(ca, &p.ptr, true);
+ struct bucket *g2 = PTR_BUCKET(ca, &p.ptr, false);
+
+ if (fsck_err_on(!g->gen_valid, c,
+ "bucket %u:%zu data type %s ptr gen %u missing in alloc btree",
+ p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
+ bch2_data_types[ptr_data_type(k->k, &p.ptr)],
+ p.ptr.gen)) {
+ if (p.ptr.cached) {
+ g2->_mark.gen = g->_mark.gen = p.ptr.gen;
+ g2->gen_valid = g->gen_valid = true;
+ set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags);
+ } else {
+ do_update = true;
+ }
+ }
+
+ if (fsck_err_on(gen_cmp(p.ptr.gen, g->mark.gen) > 0, c,
+ "bucket %u:%zu data type %s ptr gen in the future: %u > %u",
+ p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
+ bch2_data_types[ptr_data_type(k->k, &p.ptr)],
+ p.ptr.gen, g->mark.gen)) {
+ if (p.ptr.cached) {
+ g2->_mark.gen = g->_mark.gen = p.ptr.gen;
+ g2->gen_valid = g->gen_valid = true;
+ g2->_mark.data_type = 0;
+ g2->_mark.dirty_sectors = 0;
+ g2->_mark.cached_sectors = 0;
+ set_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags);
+ set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags);
+ } else {
+ do_update = true;
+ }
+ }
+
+ if (fsck_err_on(!p.ptr.cached &&
+ gen_cmp(p.ptr.gen, g->mark.gen) < 0, c,
+ "bucket %u:%zu data type %s stale dirty ptr: %u < %u",
+ p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
+ bch2_data_types[ptr_data_type(k->k, &p.ptr)],
+ p.ptr.gen, g->mark.gen))
+ do_update = true;
+
+ if (p.has_ec) {
+ struct stripe *m = genradix_ptr(&c->stripes[true], p.ec.idx);
+
+ if (fsck_err_on(!m || !m->alive, c,
+ "pointer to nonexistent stripe %llu",
+ (u64) p.ec.idx))
+ do_update = true;
+ }
+ }
+
+ if (do_update) {
+ struct bkey_ptrs ptrs;
+ union bch_extent_entry *entry;
+ struct bch_extent_ptr *ptr;
+ struct bkey_i *new;
+
+ if (is_root) {
+ bch_err(c, "cannot update btree roots yet");
+ return -EINVAL;
+ }
+
+ new = kmalloc(bkey_bytes(k->k), GFP_KERNEL);
+ if (!new) {
+ bch_err(c, "%s: error allocating new key", __func__);
+ return -ENOMEM;
+ }
+
+ bkey_reassemble(new, *k);
+
+ bch2_bkey_drop_ptrs(bkey_i_to_s(new), ptr, ({
+ struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
+ struct bucket *g = PTR_BUCKET(ca, ptr, true);
+
+ (ptr->cached &&
+ (!g->gen_valid || gen_cmp(ptr->gen, g->mark.gen) > 0)) ||
+ (!ptr->cached &&
+ gen_cmp(ptr->gen, g->mark.gen) < 0);
+ }));
+again:
+ ptrs = bch2_bkey_ptrs(bkey_i_to_s(new));
+ bkey_extent_entry_for_each(ptrs, entry) {
+ if (extent_entry_type(entry) == BCH_EXTENT_ENTRY_stripe_ptr) {
+ struct stripe *m = genradix_ptr(&c->stripes[true],
+ entry->stripe_ptr.idx);
+
+ if (!m || !m->alive) {
+ bch2_bkey_extent_entry_drop(new, entry);
+ goto again;
+ }
+ }
+ }
+
+ ret = bch2_journal_key_insert(c, btree_id, level, new);
+ if (ret)
+ kfree(new);
+ else
+ *k = bkey_i_to_s_c(new);
}
fsck_err:
return ret;
@@ -90,7 +279,9 @@ fsck_err:
/* marking of btree keys/nodes: */
-static int bch2_gc_mark_key(struct bch_fs *c, struct bkey_s_c k,
+static int bch2_gc_mark_key(struct bch_fs *c, enum btree_id btree_id,
+ unsigned level, bool is_root,
+ struct bkey_s_c k,
u8 *max_stale, bool initial)
{
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
@@ -104,7 +295,6 @@ static int bch2_gc_mark_key(struct bch_fs *c, struct bkey_s_c k,
BUG_ON(bch2_journal_seq_verify &&
k.k->version.lo > journal_cur_seq(&c->journal));
- /* XXX change to fsck check */
if (fsck_err_on(k.k->version.lo > atomic64_read(&c->key_version), c,
"key version number higher than recorded: %llu > %llu",
k.k->version.lo,
@@ -116,37 +306,13 @@ static int bch2_gc_mark_key(struct bch_fs *c, struct bkey_s_c k,
"superblock not marked as containing replicas (type %u)",
k.k->type)) {
ret = bch2_mark_bkey_replicas(c, k);
- if (ret)
- return ret;
- }
-
- bkey_for_each_ptr(ptrs, ptr) {
- struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
- struct bucket *g = PTR_BUCKET(ca, ptr, true);
- struct bucket *g2 = PTR_BUCKET(ca, ptr, false);
-
- if (mustfix_fsck_err_on(!g->gen_valid, c,
- "bucket %u:%zu data type %s ptr gen %u missing in alloc btree",
- ptr->dev, PTR_BUCKET_NR(ca, ptr),
- bch2_data_types[ptr_data_type(k.k, ptr)],
- ptr->gen)) {
- g2->_mark.gen = g->_mark.gen = ptr->gen;
- g2->gen_valid = g->gen_valid = true;
- }
-
- if (mustfix_fsck_err_on(gen_cmp(ptr->gen, g->mark.gen) > 0, c,
- "bucket %u:%zu data type %s ptr gen in the future: %u > %u",
- ptr->dev, PTR_BUCKET_NR(ca, ptr),
- bch2_data_types[ptr_data_type(k.k, ptr)],
- ptr->gen, g->mark.gen)) {
- g2->_mark.gen = g->_mark.gen = ptr->gen;
- g2->gen_valid = g->gen_valid = true;
- g2->_mark.data_type = 0;
- g2->_mark.dirty_sectors = 0;
- g2->_mark.cached_sectors = 0;
- set_bit(BCH_FS_FIXED_GENS, &c->flags);
+ if (ret) {
+ bch_err(c, "error marking bkey replicas: %i", ret);
+ goto err;
}
}
+
+ ret = bch2_check_fix_ptrs(c, btree_id, level, is_root, &k);
}
bkey_for_each_ptr(ptrs, ptr) {
@@ -161,16 +327,19 @@ static int bch2_gc_mark_key(struct bch_fs *c, struct bkey_s_c k,
bch2_mark_key(c, k, 0, k.k->size, NULL, 0, flags);
fsck_err:
+err:
+ if (ret)
+ bch_err(c, "%s: ret %i", __func__, ret);
return ret;
}
static int btree_gc_mark_node(struct bch_fs *c, struct btree *b, u8 *max_stale,
bool initial)
{
- struct bpos next_node_start = b->data->min_key;
struct btree_node_iter iter;
struct bkey unpacked;
struct bkey_s_c k;
+ struct bkey_buf prev, cur;
int ret = 0;
*max_stale = 0;
@@ -179,37 +348,42 @@ static int btree_gc_mark_node(struct bch_fs *c, struct btree *b, u8 *max_stale,
return 0;
bch2_btree_node_iter_init_from_start(&iter, b);
+ bch2_bkey_buf_init(&prev);
+ bch2_bkey_buf_init(&cur);
+ bkey_init(&prev.k->k);
while ((k = bch2_btree_node_iter_peek_unpack(&iter, b, &unpacked)).k) {
bch2_bkey_debugcheck(c, b, k);
- ret = bch2_gc_mark_key(c, k, max_stale, initial);
+ ret = bch2_gc_mark_key(c, b->c.btree_id, b->c.level, false,
+ k, max_stale, initial);
if (ret)
break;
bch2_btree_node_iter_advance(&iter, b);
if (b->c.level) {
- ret = bch2_gc_check_topology(c, k,
- &next_node_start,
- b->data->max_key,
+ bch2_bkey_buf_reassemble(&cur, c, k);
+
+ ret = bch2_gc_check_topology(c, b, &prev, cur,
bch2_btree_node_iter_end(&iter));
if (ret)
break;
}
}
+ bch2_bkey_buf_exit(&cur, c);
+ bch2_bkey_buf_exit(&prev, c);
return ret;
}
static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id,
- bool initial, bool metadata_only)
+ bool initial)
{
struct btree_trans trans;
struct btree_iter *iter;
struct btree *b;
- unsigned depth = metadata_only ? 1
- : bch2_expensive_debug_checks ? 0
+ unsigned depth = bch2_expensive_debug_checks ? 0
: !btree_node_type_needs_gc(btree_id) ? 1
: 0;
u8 max_stale = 0;
@@ -233,7 +407,6 @@ static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id,
if (max_stale > 64)
bch2_btree_node_rewrite(c, iter,
b->data->keys.seq,
- BTREE_INSERT_USE_RESERVE|
BTREE_INSERT_NOWAIT|
BTREE_INSERT_GC_LOCK_HELD);
else if (!bch2_btree_gc_rewrite_disabled &&
@@ -253,7 +426,8 @@ static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id,
mutex_lock(&c->btree_root_lock);
b = c->btree_roots[btree_id].b;
if (!btree_node_fake(b))
- ret = bch2_gc_mark_key(c, bkey_i_to_s_c(&b->key),
+ ret = bch2_gc_mark_key(c, b->c.btree_id, b->c.level, true,
+ bkey_i_to_s_c(&b->key),
&max_stale, initial);
gc_pos_set(c, gc_pos_btree_root(b->c.btree_id));
mutex_unlock(&c->btree_root_lock);
@@ -262,16 +436,18 @@ static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id,
}
static int bch2_gc_btree_init_recurse(struct bch_fs *c, struct btree *b,
- struct journal_keys *journal_keys,
unsigned target_depth)
{
struct btree_and_journal_iter iter;
struct bkey_s_c k;
- struct bpos next_node_start = b->data->min_key;
+ struct bkey_buf cur, prev;
u8 max_stale = 0;
int ret = 0;
- bch2_btree_and_journal_iter_init_node_iter(&iter, journal_keys, b);
+ bch2_btree_and_journal_iter_init_node_iter(&iter, c, b);
+ bch2_bkey_buf_init(&prev);
+ bch2_bkey_buf_init(&cur);
+ bkey_init(&prev.k->k);
while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) {
bch2_bkey_debugcheck(c, b, k);
@@ -279,57 +455,82 @@ static int bch2_gc_btree_init_recurse(struct bch_fs *c, struct btree *b,
BUG_ON(bkey_cmp(k.k->p, b->data->min_key) < 0);
BUG_ON(bkey_cmp(k.k->p, b->data->max_key) > 0);
- ret = bch2_gc_mark_key(c, k, &max_stale, true);
- if (ret)
+ ret = bch2_gc_mark_key(c, b->c.btree_id, b->c.level, false,
+ k, &max_stale, true);
+ if (ret) {
+ bch_err(c, "%s: error %i from bch2_gc_mark_key", __func__, ret);
break;
+ }
if (b->c.level) {
- struct btree *child;
- BKEY_PADDED(k) tmp;
-
- bkey_reassemble(&tmp.k, k);
- k = bkey_i_to_s_c(&tmp.k);
+ bch2_bkey_buf_reassemble(&cur, c, k);
+ k = bkey_i_to_s_c(cur.k);
bch2_btree_and_journal_iter_advance(&iter);
- ret = bch2_gc_check_topology(c, k,
- &next_node_start,
- b->data->max_key,
+ ret = bch2_gc_check_topology(c, b,
+ &prev, cur,
!bch2_btree_and_journal_iter_peek(&iter).k);
if (ret)
break;
+ } else {
+ bch2_btree_and_journal_iter_advance(&iter);
+ }
+ }
- if (b->c.level > target_depth) {
- child = bch2_btree_node_get_noiter(c, &tmp.k,
- b->c.btree_id, b->c.level - 1);
- ret = PTR_ERR_OR_ZERO(child);
- if (ret)
- break;
+ if (b->c.level > target_depth) {
+ bch2_btree_and_journal_iter_exit(&iter);
+ bch2_btree_and_journal_iter_init_node_iter(&iter, c, b);
+
+ while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) {
+ struct btree *child;
- ret = bch2_gc_btree_init_recurse(c, child,
- journal_keys, target_depth);
- six_unlock_read(&child->c.lock);
+ bch2_bkey_buf_reassemble(&cur, c, k);
+ bch2_btree_and_journal_iter_advance(&iter);
+ child = bch2_btree_node_get_noiter(c, cur.k,
+ b->c.btree_id, b->c.level - 1,
+ false);
+ ret = PTR_ERR_OR_ZERO(child);
+
+ if (fsck_err_on(ret == -EIO, c,
+ "unreadable btree node")) {
+ ret = bch2_journal_key_delete(c, b->c.btree_id,
+ b->c.level, cur.k->k.p);
if (ret)
- break;
+ return ret;
+
+ set_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags);
+ continue;
}
- } else {
- bch2_btree_and_journal_iter_advance(&iter);
+
+ if (ret) {
+ bch_err(c, "%s: error %i getting btree node",
+ __func__, ret);
+ break;
+ }
+
+ ret = bch2_gc_btree_init_recurse(c, child,
+ target_depth);
+ six_unlock_read(&child->c.lock);
+
+ if (ret)
+ break;
}
}
-
+fsck_err:
+ bch2_bkey_buf_exit(&cur, c);
+ bch2_bkey_buf_exit(&prev, c);
+ bch2_btree_and_journal_iter_exit(&iter);
return ret;
}
static int bch2_gc_btree_init(struct bch_fs *c,
- struct journal_keys *journal_keys,
- enum btree_id btree_id,
- bool metadata_only)
+ enum btree_id btree_id)
{
struct btree *b;
- unsigned target_depth = metadata_only ? 1
- : bch2_expensive_debug_checks ? 0
- : !btree_node_type_needs_gc(btree_id) ? 1
+ unsigned target_depth = bch2_expensive_debug_checks ? 0
+ : !btree_node_type_needs_gc(btree_id) ? 1
: 0;
u8 max_stale = 0;
int ret = 0;
@@ -355,15 +556,17 @@ static int bch2_gc_btree_init(struct bch_fs *c,
}
if (b->c.level >= target_depth)
- ret = bch2_gc_btree_init_recurse(c, b,
- journal_keys, target_depth);
+ ret = bch2_gc_btree_init_recurse(c, b, target_depth);
if (!ret)
- ret = bch2_gc_mark_key(c, bkey_i_to_s_c(&b->key),
+ ret = bch2_gc_mark_key(c, b->c.btree_id, b->c.level, true,
+ bkey_i_to_s_c(&b->key),
&max_stale, true);
fsck_err:
six_unlock_read(&b->c.lock);
+ if (ret)
+ bch_err(c, "%s: ret %i", __func__, ret);
return ret;
}
@@ -373,8 +576,7 @@ static inline int btree_id_gc_phase_cmp(enum btree_id l, enum btree_id r)
(int) btree_id_to_gc_phase(r);
}
-static int bch2_gc_btrees(struct bch_fs *c, struct journal_keys *journal_keys,
- bool initial, bool metadata_only)
+static int bch2_gc_btrees(struct bch_fs *c, bool initial)
{
enum btree_id ids[BTREE_ID_NR];
unsigned i;
@@ -386,11 +588,12 @@ static int bch2_gc_btrees(struct bch_fs *c, struct journal_keys *journal_keys,
for (i = 0; i < BTREE_ID_NR; i++) {
enum btree_id id = ids[i];
int ret = initial
- ? bch2_gc_btree_init(c, journal_keys,
- id, metadata_only)
- : bch2_gc_btree(c, id, initial, metadata_only);
- if (ret)
+ ? bch2_gc_btree_init(c, id)
+ : bch2_gc_btree(c, id, initial);
+ if (ret) {
+ bch_err(c, "%s: ret %i", __func__, ret);
return ret;
+ }
}
return 0;
@@ -546,8 +749,8 @@ static void bch2_gc_free(struct bch_fs *c)
ca->mi.nbuckets * sizeof(struct bucket));
ca->buckets[1] = NULL;
- free_percpu(ca->usage[1]);
- ca->usage[1] = NULL;
+ free_percpu(ca->usage_gc);
+ ca->usage_gc = NULL;
}
free_percpu(c->usage_gc);
@@ -555,13 +758,12 @@ static void bch2_gc_free(struct bch_fs *c)
}
static int bch2_gc_done(struct bch_fs *c,
- bool initial, bool metadata_only)
+ bool initial)
{
struct bch_dev *ca;
- bool verify = !metadata_only &&
- (!initial ||
- (c->sb.compat & (1ULL << BCH_COMPAT_FEAT_ALLOC_INFO)));
- unsigned i;
+ bool verify = (!initial ||
+ (c->sb.compat & (1ULL << BCH_COMPAT_alloc_info)));
+ unsigned i, dev;
int ret = 0;
#define copy_field(_f, _msg, ...) \
@@ -570,18 +772,17 @@ static int bch2_gc_done(struct bch_fs *c,
fsck_err(c, _msg ": got %llu, should be %llu" \
, ##__VA_ARGS__, dst->_f, src->_f); \
dst->_f = src->_f; \
- ret = 1; \
+ set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags); \
}
#define copy_stripe_field(_f, _msg, ...) \
if (dst->_f != src->_f) { \
if (verify) \
fsck_err(c, "stripe %zu has wrong "_msg \
": got %u, should be %u", \
- dst_iter.pos, ##__VA_ARGS__, \
+ iter.pos, ##__VA_ARGS__, \
dst->_f, src->_f); \
dst->_f = src->_f; \
- dst->dirty = true; \
- ret = 1; \
+ set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags); \
}
#define copy_bucket_field(_f) \
if (dst->b[b].mark._f != src->b[b].mark._f) { \
@@ -592,48 +793,46 @@ static int bch2_gc_done(struct bch_fs *c,
bch2_data_types[dst->b[b].mark.data_type],\
dst->b[b].mark._f, src->b[b].mark._f); \
dst->b[b]._mark._f = src->b[b].mark._f; \
- ret = 1; \
+ set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags); \
}
#define copy_dev_field(_f, _msg, ...) \
copy_field(_f, "dev %u has wrong " _msg, i, ##__VA_ARGS__)
#define copy_fs_field(_f, _msg, ...) \
copy_field(_f, "fs has wrong " _msg, ##__VA_ARGS__)
- if (!metadata_only) {
- struct genradix_iter dst_iter = genradix_iter_init(&c->stripes[0], 0);
- struct genradix_iter src_iter = genradix_iter_init(&c->stripes[1], 0);
+ {
+ struct genradix_iter iter = genradix_iter_init(&c->stripes[1], 0);
struct stripe *dst, *src;
- c->ec_stripes_heap.used = 0;
-
- while ((dst = genradix_iter_peek(&dst_iter, &c->stripes[0])) &&
- (src = genradix_iter_peek(&src_iter, &c->stripes[1]))) {
- BUG_ON(src_iter.pos != dst_iter.pos);
-
- copy_stripe_field(alive, "alive");
- copy_stripe_field(sectors, "sectors");
- copy_stripe_field(algorithm, "algorithm");
- copy_stripe_field(nr_blocks, "nr_blocks");
- copy_stripe_field(nr_redundant, "nr_redundant");
- copy_stripe_field(blocks_nonempty,
- "blocks_nonempty");
+ while ((src = genradix_iter_peek(&iter, &c->stripes[1]))) {
+ dst = genradix_ptr_alloc(&c->stripes[0], iter.pos, GFP_KERNEL);
+
+ if (dst->alive != src->alive ||
+ dst->sectors != src->sectors ||
+ dst->algorithm != src->algorithm ||
+ dst->nr_blocks != src->nr_blocks ||
+ dst->nr_redundant != src->nr_redundant) {
+ bch_err(c, "unexpected stripe inconsistency at bch2_gc_done, confused");
+ ret = -EINVAL;
+ goto fsck_err;
+ }
for (i = 0; i < ARRAY_SIZE(dst->block_sectors); i++)
copy_stripe_field(block_sectors[i],
"block_sectors[%u]", i);
- if (dst->alive) {
- spin_lock(&c->ec_stripes_heap_lock);
- bch2_stripes_heap_insert(c, dst, dst_iter.pos);
- spin_unlock(&c->ec_stripes_heap_lock);
- }
+ dst->blocks_nonempty = 0;
+ for (i = 0; i < dst->nr_blocks; i++)
+ dst->blocks_nonempty += dst->block_sectors[i] != 0;
- genradix_iter_advance(&dst_iter, &c->stripes[0]);
- genradix_iter_advance(&src_iter, &c->stripes[1]);
+ genradix_iter_advance(&iter, &c->stripes[1]);
}
}
- for_each_member_device(ca, c, i) {
+ for (i = 0; i < ARRAY_SIZE(c->usage); i++)
+ bch2_fs_usage_acc_to_base(c, i);
+
+ for_each_member_device(ca, c, dev) {
struct bucket_array *dst = __bucket_array(ca, 0);
struct bucket_array *src = __bucket_array(ca, 1);
size_t b;
@@ -648,12 +847,23 @@ static int bch2_gc_done(struct bch_fs *c,
dst->b[b].oldest_gen = src->b[b].oldest_gen;
}
- };
- for (i = 0; i < ARRAY_SIZE(c->usage); i++)
- bch2_fs_usage_acc_to_base(c, i);
+ {
+ struct bch_dev_usage *dst = ca->usage_base;
+ struct bch_dev_usage *src = (void *)
+ bch2_acc_percpu_u64s((void *) ca->usage_gc,
+ dev_usage_u64s());
- bch2_dev_usage_from_buckets(c);
+ copy_dev_field(buckets_ec, "buckets_ec");
+ copy_dev_field(buckets_unavailable, "buckets_unavailable");
+
+ for (i = 0; i < BCH_DATA_NR; i++) {
+ copy_dev_field(d[i].buckets, "%s buckets", bch2_data_types[i]);
+ copy_dev_field(d[i].sectors, "%s sectors", bch2_data_types[i]);
+ copy_dev_field(d[i].fragmented, "%s fragmented", bch2_data_types[i]);
+ }
+ }
+ };
{
unsigned nr = fs_usage_u64s(c);
@@ -663,28 +873,20 @@ static int bch2_gc_done(struct bch_fs *c,
copy_fs_field(hidden, "hidden");
copy_fs_field(btree, "btree");
+ copy_fs_field(data, "data");
+ copy_fs_field(cached, "cached");
+ copy_fs_field(reserved, "reserved");
+ copy_fs_field(nr_inodes,"nr_inodes");
- if (!metadata_only) {
- copy_fs_field(data, "data");
- copy_fs_field(cached, "cached");
- copy_fs_field(reserved, "reserved");
- copy_fs_field(nr_inodes,"nr_inodes");
-
- for (i = 0; i < BCH_REPLICAS_MAX; i++)
- copy_fs_field(persistent_reserved[i],
- "persistent_reserved[%i]", i);
- }
+ for (i = 0; i < BCH_REPLICAS_MAX; i++)
+ copy_fs_field(persistent_reserved[i],
+ "persistent_reserved[%i]", i);
for (i = 0; i < c->replicas.nr; i++) {
struct bch_replicas_entry *e =
cpu_replicas_entry(&c->replicas, i);
char buf[80];
- if (metadata_only &&
- (e->data_type == BCH_DATA_user ||
- e->data_type == BCH_DATA_cached))
- continue;
-
bch2_replicas_entry_to_text(&PBUF(buf), e);
copy_fs_field(replicas[i], "%s", buf);
@@ -697,11 +899,12 @@ static int bch2_gc_done(struct bch_fs *c,
#undef copy_stripe_field
#undef copy_field
fsck_err:
+ if (ret)
+ bch_err(c, "%s: ret %i", __func__, ret);
return ret;
}
-static int bch2_gc_start(struct bch_fs *c,
- bool metadata_only)
+static int bch2_gc_start(struct bch_fs *c)
{
struct bch_dev *ca;
unsigned i;
@@ -718,7 +921,7 @@ static int bch2_gc_start(struct bch_fs *c,
for_each_member_device(ca, c, i) {
BUG_ON(ca->buckets[1]);
- BUG_ON(ca->usage[1]);
+ BUG_ON(ca->usage_gc);
ca->buckets[1] = kvpmalloc(sizeof(struct bucket_array) +
ca->mi.nbuckets * sizeof(struct bucket),
@@ -729,9 +932,9 @@ static int bch2_gc_start(struct bch_fs *c,
return -ENOMEM;
}
- ca->usage[1] = alloc_percpu(struct bch_dev_usage);
- if (!ca->usage[1]) {
- bch_err(c, "error allocating ca->usage[gc]");
+ ca->usage_gc = alloc_percpu(struct bch_dev_usage);
+ if (!ca->usage_gc) {
+ bch_err(c, "error allocating ca->usage_gc");
percpu_ref_put(&ca->ref);
return -ENOMEM;
}
@@ -765,13 +968,6 @@ static int bch2_gc_start(struct bch_fs *c,
d->_mark.gen = dst->b[b].oldest_gen = s->mark.gen;
d->gen_valid = s->gen_valid;
-
- if (metadata_only &&
- (s->mark.data_type == BCH_DATA_user ||
- s->mark.data_type == BCH_DATA_cached)) {
- d->_mark = s->mark;
- d->_mark.owned_by_allocator = 0;
- }
}
};
@@ -798,8 +994,7 @@ static int bch2_gc_start(struct bch_fs *c,
* move around - if references move backwards in the ordering GC
* uses, GC could skip past them
*/
-int bch2_gc(struct bch_fs *c, struct journal_keys *journal_keys,
- bool initial, bool metadata_only)
+int bch2_gc(struct bch_fs *c, bool initial)
{
struct bch_dev *ca;
u64 start_time = local_clock();
@@ -815,13 +1010,13 @@ int bch2_gc(struct bch_fs *c, struct journal_keys *journal_keys,
closure_wait_event(&c->btree_interior_update_wait,
!bch2_btree_interior_updates_nr_pending(c));
again:
- ret = bch2_gc_start(c, metadata_only);
+ ret = bch2_gc_start(c);
if (ret)
goto out;
bch2_mark_superblocks(c);
- ret = bch2_gc_btrees(c, journal_keys, initial, metadata_only);
+ ret = bch2_gc_btrees(c, initial);
if (ret)
goto out;
@@ -831,16 +1026,15 @@ again:
bch2_mark_allocator_buckets(c);
c->gc_count++;
-out:
- if (!ret &&
- (test_bit(BCH_FS_FIXED_GENS, &c->flags) ||
- (!iter && bch2_test_restart_gc))) {
+
+ if (test_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags) ||
+ (!iter && bch2_test_restart_gc)) {
/*
* XXX: make sure gens we fixed got saved
*/
if (iter++ <= 2) {
- bch_info(c, "Fixed gens, restarting mark and sweep:");
- clear_bit(BCH_FS_FIXED_GENS, &c->flags);
+ bch_info(c, "Second GC pass needed, restarting:");
+ clear_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags);
__gc_pos_set(c, gc_phase(GC_PHASE_NOT_RUNNING));
percpu_down_write(&c->mark_lock);
@@ -855,12 +1049,12 @@ out:
bch_info(c, "Unable to fix bucket gens, looping");
ret = -EINVAL;
}
-
+out:
if (!ret) {
bch2_journal_block(&c->journal);
percpu_down_write(&c->mark_lock);
- ret = bch2_gc_done(c, initial, metadata_only);
+ ret = bch2_gc_done(c, initial);
bch2_journal_unblock(&c->journal);
} else {
@@ -930,10 +1124,10 @@ static int bch2_gc_btree_gens(struct bch_fs *c, enum btree_id btree_id)
struct btree_trans trans;
struct btree_iter *iter;
struct bkey_s_c k;
- struct bkey_on_stack sk;
+ struct bkey_buf sk;
int ret = 0;
- bkey_on_stack_init(&sk);
+ bch2_bkey_buf_init(&sk);
bch2_trans_init(&trans, c, 0, 0);
iter = bch2_trans_get_iter(&trans, btree_id, POS_MIN,
@@ -942,7 +1136,7 @@ static int bch2_gc_btree_gens(struct bch_fs *c, enum btree_id btree_id)
while ((k = bch2_btree_iter_peek(iter)).k &&
!(ret = bkey_err(k))) {
if (gc_btree_gens_key(c, k)) {
- bkey_on_stack_reassemble(&sk, c, k);
+ bch2_bkey_buf_reassemble(&sk, c, k);
bch2_extent_normalize(c, bkey_i_to_s(sk.k));
bch2_btree_iter_set_pos(iter, bkey_start_pos(&sk.k->k));
@@ -962,7 +1156,7 @@ static int bch2_gc_btree_gens(struct bch_fs *c, enum btree_id btree_id)
}
bch2_trans_exit(&trans);
- bkey_on_stack_exit(&sk, c);
+ bch2_bkey_buf_exit(&sk, c);
return ret;
}
@@ -1074,7 +1268,7 @@ static void bch2_coalesce_nodes(struct bch_fs *c, struct btree_iter *iter,
}
if (bch2_keylist_realloc(&keylist, NULL, 0,
- (BKEY_U64s + BKEY_EXTENT_U64s_MAX) * nr_old_nodes)) {
+ BKEY_BTREE_PTR_U64s_MAX * nr_old_nodes)) {
trace_btree_gc_coalesce_fail(c,
BTREE_GC_COALESCE_FAIL_KEYLIST_REALLOC);
return;
@@ -1354,7 +1548,7 @@ static int bch2_gc_thread(void *arg)
{
struct bch_fs *c = arg;
struct io_clock *clock = &c->io_clock[WRITE];
- unsigned long last = atomic_long_read(&clock->now);
+ unsigned long last = atomic64_read(&clock->now);
unsigned last_kick = atomic_read(&c->kick_gc);
int ret;
@@ -1375,7 +1569,7 @@ static int bch2_gc_thread(void *arg)
if (c->btree_gc_periodic) {
unsigned long next = last + c->capacity / 16;
- if (atomic_long_read(&clock->now) >= next)
+ if (atomic64_read(&clock->now) >= next)
break;
bch2_io_clock_schedule_timeout(clock, next);
@@ -1387,14 +1581,14 @@ static int bch2_gc_thread(void *arg)
}
__set_current_state(TASK_RUNNING);
- last = atomic_long_read(&clock->now);
+ last = atomic64_read(&clock->now);
last_kick = atomic_read(&c->kick_gc);
/*
* Full gc is currently incompatible with btree key cache:
*/
#if 0
- ret = bch2_gc(c, NULL, false, false);
+ ret = bch2_gc(c, false, false);
#else
ret = bch2_gc_gens(c);
#endif
@@ -1424,11 +1618,14 @@ int bch2_gc_thread_start(struct bch_fs *c)
{
struct task_struct *p;
- BUG_ON(c->gc_thread);
+ if (c->gc_thread)
+ return 0;
p = kthread_create(bch2_gc_thread, c, "bch-gc/%s", c->name);
- if (IS_ERR(p))
+ if (IS_ERR(p)) {
+ bch_err(c, "error creating gc thread: %li", PTR_ERR(p));
return PTR_ERR(p);
+ }
get_task_struct(p);
c->gc_thread = p;
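
Both new repair paths in btree_gc.c (the topology check and bch2_check_fix_ptrs()) fix a bad key the same way: a corrected copy is heap-allocated and handed to the journal-keys overlay, so the fix takes effect on subsequent traversals rather than being rewritten in place. A condensed sketch of that pattern, assuming the in-tree headers; the wrapper below is illustrative and elides whichever field is actually being corrected:

static int replace_key_example(struct bch_fs *c, enum btree_id btree_id,
			       unsigned level, struct bkey_s_c k)
{
	struct bkey_i *new;
	int ret;

	new = kmalloc(bkey_bytes(k.k), GFP_KERNEL);
	if (!new)
		return -ENOMEM;

	bkey_reassemble(new, k);	/* copy key and value, then adjust whatever was wrong */

	ret = bch2_journal_key_insert(c, btree_id, level, new);
	if (ret)
		kfree(new);		/* on success the overlay owns the allocation */
	return ret;
}

The ownership is asymmetric on purpose: the allocation is freed only on failure, exactly as in the two call sites above.
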
diff --git a/fs/bcachefs/btree_gc.h b/fs/bcachefs/btree_gc.h
index 3694a3df62a8..fa604efc70cc 100644
--- a/fs/bcachefs/btree_gc.h
+++ b/fs/bcachefs/btree_gc.h
@@ -6,8 +6,7 @@
void bch2_coalesce(struct bch_fs *);
-struct journal_keys;
-int bch2_gc(struct bch_fs *, struct journal_keys *, bool, bool);
+int bch2_gc(struct bch_fs *, bool);
int bch2_gc_gens(struct bch_fs *);
void bch2_gc_thread_stop(struct bch_fs *);
int bch2_gc_thread_start(struct bch_fs *);
diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c
index 87f97ccb3f1f..a84a473101dc 100644
--- a/fs/bcachefs/btree_io.c
+++ b/fs/bcachefs/btree_io.c
@@ -608,11 +608,16 @@ static void btree_pos_to_text(struct printbuf *out, struct bch_fs *c,
}
static void btree_err_msg(struct printbuf *out, struct bch_fs *c,
+ struct bch_dev *ca,
struct btree *b, struct bset *i,
unsigned offset, int write)
{
- pr_buf(out, "error validating btree node %sat btree ",
- write ? "before write " : "");
+ pr_buf(out, "error validating btree node ");
+ if (write)
+ pr_buf(out, "before write ");
+ if (ca)
+ pr_buf(out, "on %s ", ca->name);
+ pr_buf(out, "at btree ");
btree_pos_to_text(out, c, b);
pr_buf(out, "\n node offset %u", b->written);
@@ -631,30 +636,30 @@ enum btree_validate_ret {
BTREE_RETRY_READ = 64,
};
-#define btree_err(type, c, b, i, msg, ...) \
+#define btree_err(type, c, ca, b, i, msg, ...) \
({ \
__label__ out; \
char _buf[300]; \
- char *buf = _buf; \
+ char *buf2 = _buf; \
struct printbuf out = PBUF(_buf); \
\
- buf = kmalloc(4096, GFP_ATOMIC); \
- if (buf) \
- out = _PBUF(buf, 4986); \
+ buf2 = kmalloc(4096, GFP_ATOMIC); \
+ if (buf2) \
+ out = _PBUF(buf2, 4986); \
\
- btree_err_msg(&out, c, b, i, b->written, write); \
+ btree_err_msg(&out, c, ca, b, i, b->written, write); \
pr_buf(&out, ": " msg, ##__VA_ARGS__); \
\
if (type == BTREE_ERR_FIXABLE && \
write == READ && \
!test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags)) { \
- mustfix_fsck_err(c, "%s", buf); \
+ mustfix_fsck_err(c, "%s", buf2); \
goto out; \
} \
\
switch (write) { \
case READ: \
- bch_err(c, "%s", buf); \
+ bch_err(c, "%s", buf2); \
\
switch (type) { \
case BTREE_ERR_FIXABLE: \
@@ -675,7 +680,7 @@ enum btree_validate_ret {
} \
break; \
case WRITE: \
- bch_err(c, "corrupt metadata before write: %s", buf); \
+ bch_err(c, "corrupt metadata before write: %s", buf2); \
\
if (bch2_fs_inconsistent(c)) { \
ret = BCH_FSCK_ERRORS_NOT_FIXED; \
@@ -684,16 +689,16 @@ enum btree_validate_ret {
break; \
} \
out: \
- if (buf != _buf) \
- kfree(buf); \
+ if (buf2 != _buf) \
+ kfree(buf2); \
true; \
})
#define btree_err_on(cond, ...) ((cond) ? btree_err(__VA_ARGS__) : false)
-static int validate_bset(struct bch_fs *c, struct btree *b,
- struct bset *i, unsigned sectors,
- int write, bool have_retry)
+static int validate_bset(struct bch_fs *c, struct bch_dev *ca,
+ struct btree *b, struct bset *i,
+ unsigned sectors, int write, bool have_retry)
{
unsigned version = le16_to_cpu(i->version);
const char *err;
@@ -702,18 +707,18 @@ static int validate_bset(struct bch_fs *c, struct btree *b,
btree_err_on((version != BCH_BSET_VERSION_OLD &&
version < bcachefs_metadata_version_min) ||
version >= bcachefs_metadata_version_max,
- BTREE_ERR_FATAL, c, b, i,
+ BTREE_ERR_FATAL, c, ca, b, i,
"unsupported bset version");
if (btree_err_on(b->written + sectors > c->opts.btree_node_size,
- BTREE_ERR_FIXABLE, c, b, i,
+ BTREE_ERR_FIXABLE, c, ca, b, i,
"bset past end of btree node")) {
i->u64s = 0;
return 0;
}
btree_err_on(b->written && !i->u64s,
- BTREE_ERR_FIXABLE, c, b, i,
+ BTREE_ERR_FIXABLE, c, ca, b, i,
"empty bset");
if (!b->written) {
@@ -727,16 +732,16 @@ static int validate_bset(struct bch_fs *c, struct btree *b,
/* XXX endianness */
btree_err_on(bp->seq != bn->keys.seq,
- BTREE_ERR_MUST_RETRY, c, b, NULL,
+ BTREE_ERR_MUST_RETRY, c, ca, b, NULL,
"incorrect sequence number (wrong btree node)");
}
btree_err_on(BTREE_NODE_ID(bn) != b->c.btree_id,
- BTREE_ERR_MUST_RETRY, c, b, i,
+ BTREE_ERR_MUST_RETRY, c, ca, b, i,
"incorrect btree id");
btree_err_on(BTREE_NODE_LEVEL(bn) != b->c.level,
- BTREE_ERR_MUST_RETRY, c, b, i,
+ BTREE_ERR_MUST_RETRY, c, ca, b, i,
"incorrect level");
if (BSET_BIG_ENDIAN(i) != CPU_BIG_ENDIAN) {
@@ -753,8 +758,13 @@ static int validate_bset(struct bch_fs *c, struct btree *b,
struct bch_btree_ptr_v2 *bp =
&bkey_i_to_btree_ptr_v2(&b->key)->v;
+ if (BTREE_PTR_RANGE_UPDATED(bp)) {
+ b->data->min_key = bp->min_key;
+ b->data->max_key = b->key.k.p;
+ }
+
btree_err_on(bkey_cmp(b->data->min_key, bp->min_key),
- BTREE_ERR_MUST_RETRY, c, b, NULL,
+ BTREE_ERR_MUST_RETRY, c, ca, b, NULL,
"incorrect min_key: got %llu:%llu should be %llu:%llu",
b->data->min_key.inode,
b->data->min_key.offset,
@@ -763,7 +773,7 @@ static int validate_bset(struct bch_fs *c, struct btree *b,
}
btree_err_on(bkey_cmp(bn->max_key, b->key.k.p),
- BTREE_ERR_MUST_RETRY, c, b, i,
+ BTREE_ERR_MUST_RETRY, c, ca, b, i,
"incorrect max key %llu:%llu",
bn->max_key.inode,
bn->max_key.offset);
@@ -788,7 +798,7 @@ static int validate_bset(struct bch_fs *c, struct btree *b,
#endif
err = bch2_bkey_format_validate(&bn->format);
btree_err_on(err,
- BTREE_ERR_FATAL, c, b, i,
+ BTREE_ERR_FATAL, c, ca, b, i,
"invalid bkey format: %s", err);
compat_bformat(b->c.level, b->c.btree_id, version,
@@ -820,14 +830,14 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b,
const char *invalid;
if (btree_err_on(bkey_next(k) > vstruct_last(i),
- BTREE_ERR_FIXABLE, c, b, i,
+ BTREE_ERR_FIXABLE, c, NULL, b, i,
"key extends past end of bset")) {
i->u64s = cpu_to_le16((u64 *) k - i->_data);
break;
}
if (btree_err_on(k->format > KEY_FORMAT_CURRENT,
- BTREE_ERR_FIXABLE, c, b, i,
+ BTREE_ERR_FIXABLE, c, NULL, b, i,
"invalid bkey format %u", k->format)) {
i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s);
memmove_u64s_down(k, bkey_next(k),
@@ -850,8 +860,8 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b,
char buf[160];
bch2_bkey_val_to_text(&PBUF(buf), c, u.s_c);
- btree_err(BTREE_ERR_FIXABLE, c, b, i,
- "invalid bkey:\n%s\n%s", invalid, buf);
+ btree_err(BTREE_ERR_FIXABLE, c, NULL, b, i,
+ "invalid bkey: %s\n%s", invalid, buf);
i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s);
memmove_u64s_down(k, bkey_next(k),
@@ -884,7 +894,7 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b,
bch2_bkey_to_text(&PBUF(buf2), u.k);
bch2_dump_bset(c, b, i, 0);
- btree_err(BTREE_ERR_FATAL, c, b, i,
+ btree_err(BTREE_ERR_FATAL, c, NULL, b, i,
"keys out of order: %s > %s",
buf1, buf2);
/* XXX: repair this */
@@ -897,7 +907,8 @@ fsck_err:
return ret;
}
-int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry)
+int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
+ struct btree *b, bool have_retry)
{
struct btree_node_entry *bne;
struct sort_iter *iter;
@@ -909,20 +920,22 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry
unsigned u64s;
int ret, retry_read = 0, write = READ;
+ b->version_ondisk = U16_MAX;
+
iter = mempool_alloc(&c->fill_iter, GFP_NOIO);
sort_iter_init(iter, b);
iter->size = (btree_blocks(c) + 1) * 2;
if (bch2_meta_read_fault("btree"))
- btree_err(BTREE_ERR_MUST_RETRY, c, b, NULL,
+ btree_err(BTREE_ERR_MUST_RETRY, c, ca, b, NULL,
"dynamic fault");
btree_err_on(le64_to_cpu(b->data->magic) != bset_magic(c),
- BTREE_ERR_MUST_RETRY, c, b, NULL,
+ BTREE_ERR_MUST_RETRY, c, ca, b, NULL,
"bad magic");
btree_err_on(!b->data->keys.seq,
- BTREE_ERR_MUST_RETRY, c, b, NULL,
+ BTREE_ERR_MUST_RETRY, c, ca, b, NULL,
"bad btree header");
if (b->key.k.type == KEY_TYPE_btree_ptr_v2) {
@@ -930,7 +943,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry
&bkey_i_to_btree_ptr_v2(&b->key)->v;
btree_err_on(b->data->keys.seq != bp->seq,
- BTREE_ERR_MUST_RETRY, c, b, NULL,
+ BTREE_ERR_MUST_RETRY, c, ca, b, NULL,
"got wrong btree node (seq %llx want %llx)",
b->data->keys.seq, bp->seq);
}
@@ -945,7 +958,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry
i = &b->data->keys;
btree_err_on(!bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i)),
- BTREE_ERR_WANT_RETRY, c, b, i,
+ BTREE_ERR_WANT_RETRY, c, ca, b, i,
"unknown checksum type %llu",
BSET_CSUM_TYPE(i));
@@ -953,7 +966,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry
csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, b->data);
btree_err_on(bch2_crc_cmp(csum, b->data->csum),
- BTREE_ERR_WANT_RETRY, c, b, i,
+ BTREE_ERR_WANT_RETRY, c, ca, b, i,
"invalid checksum");
bset_encrypt(c, i, b->written << 9);
@@ -973,7 +986,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry
break;
btree_err_on(!bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i)),
- BTREE_ERR_WANT_RETRY, c, b, i,
+ BTREE_ERR_WANT_RETRY, c, ca, b, i,
"unknown checksum type %llu",
BSET_CSUM_TYPE(i));
@@ -981,7 +994,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry
csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne);
btree_err_on(bch2_crc_cmp(csum, bne->csum),
- BTREE_ERR_WANT_RETRY, c, b, i,
+ BTREE_ERR_WANT_RETRY, c, ca, b, i,
"invalid checksum");
bset_encrypt(c, i, b->written << 9);
@@ -989,7 +1002,10 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry
sectors = vstruct_sectors(bne, c->block_bits);
}
- ret = validate_bset(c, b, i, sectors,
+ b->version_ondisk = min(b->version_ondisk,
+ le16_to_cpu(i->version));
+
+ ret = validate_bset(c, ca, b, i, sectors,
READ, have_retry);
if (ret)
goto fsck_err;
@@ -1011,7 +1027,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry
true);
btree_err_on(blacklisted && first,
- BTREE_ERR_FIXABLE, c, b, i,
+ BTREE_ERR_FIXABLE, c, ca, b, i,
"first btree node bset has blacklisted journal seq");
if (blacklisted && !first)
continue;
@@ -1028,7 +1044,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry
bset_byte_offset(b, bne) < btree_bytes(c);
bne = (void *) bne + block_bytes(c))
btree_err_on(bne->keys.seq == b->data->keys.seq,
- BTREE_ERR_WANT_RETRY, c, b, NULL,
+ BTREE_ERR_WANT_RETRY, c, ca, b, NULL,
"found bset signature after last bset");
sorted = btree_bounce_alloc(c, btree_bytes(c), &used_mempool);
@@ -1063,7 +1079,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry
char buf[160];
bch2_bkey_val_to_text(&PBUF(buf), c, u.s_c);
- btree_err(BTREE_ERR_FIXABLE, c, b, i,
+ btree_err(BTREE_ERR_FIXABLE, c, NULL, b, i,
"invalid bkey %s: %s", buf, invalid);
btree_keys_account_key_drop(&b->nr, 0, k);
@@ -1154,7 +1170,7 @@ start:
&failed, &rb->pick) > 0;
if (!bio->bi_status &&
- !bch2_btree_node_read_done(c, b, can_retry))
+ !bch2_btree_node_read_done(c, ca, b, can_retry))
break;
if (!can_retry) {
@@ -1320,12 +1336,13 @@ static void bch2_btree_node_write_error(struct bch_fs *c,
struct btree_write_bio *wbio)
{
struct btree *b = wbio->wbio.bio.bi_private;
- __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX) tmp;
+ struct bkey_buf k;
struct bch_extent_ptr *ptr;
struct btree_trans trans;
struct btree_iter *iter;
int ret;
+ bch2_bkey_buf_init(&k);
bch2_trans_init(&trans, c, 0, 0);
iter = bch2_trans_get_node_iter(&trans, b->c.btree_id, b->key.k.p,
@@ -1344,21 +1361,22 @@ retry:
BUG_ON(!btree_node_hashed(b));
- bkey_copy(&tmp.k, &b->key);
+ bch2_bkey_buf_copy(&k, c, &b->key);
- bch2_bkey_drop_ptrs(bkey_i_to_s(&tmp.k), ptr,
+ bch2_bkey_drop_ptrs(bkey_i_to_s(k.k), ptr,
bch2_dev_list_has_dev(wbio->wbio.failed, ptr->dev));
- if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(&tmp.k)))
+ if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(k.k)))
goto err;
- ret = bch2_btree_node_update_key(c, iter, b, &tmp.k);
+ ret = bch2_btree_node_update_key(c, iter, b, k.k);
if (ret == -EINTR)
goto retry;
if (ret)
goto err;
out:
bch2_trans_exit(&trans);
+ bch2_bkey_buf_exit(&k, c);
bio_put(&wbio->wbio.bio);
btree_node_write_done(c, b);
return;
@@ -1458,7 +1476,7 @@ static int validate_bset_for_write(struct bch_fs *c, struct btree *b,
if (bch2_bkey_invalid(c, bkey_i_to_s_c(&b->key), BKEY_TYPE_BTREE))
return -1;
- ret = validate_bset(c, b, i, sectors, WRITE, false) ?:
+ ret = validate_bset(c, NULL, b, i, sectors, WRITE, false) ?:
validate_bset_keys(c, b, i, &whiteout_u64s, WRITE, false);
if (ret) {
bch2_inconsistent_error(c);
@@ -1476,7 +1494,7 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b,
struct bset *i;
struct btree_node *bn = NULL;
struct btree_node_entry *bne = NULL;
- BKEY_PADDED(key) k;
+ struct bkey_buf k;
struct bch_extent_ptr *ptr;
struct sort_iter sort_iter;
struct nonce nonce;
@@ -1487,6 +1505,8 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b,
bool validate_before_checksum = false;
void *data;
+ bch2_bkey_buf_init(&k);
+
if (test_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags))
return;
@@ -1620,7 +1640,7 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b,
validate_before_checksum = true;
/* validate_bset will be modifying: */
- if (le16_to_cpu(i->version) < bcachefs_metadata_version_max)
+ if (le16_to_cpu(i->version) <= bcachefs_metadata_version_inode_btree_change)
validate_before_checksum = true;
/* if we're going to be encrypting, check metadata validity first: */
@@ -1695,15 +1715,16 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b,
* just make all btree node writes FUA to keep things sane.
*/
- bkey_copy(&k.key, &b->key);
+ bch2_bkey_buf_copy(&k, c, &b->key);
- bkey_for_each_ptr(bch2_bkey_ptrs(bkey_i_to_s(&k.key)), ptr)
+ bkey_for_each_ptr(bch2_bkey_ptrs(bkey_i_to_s(k.k)), ptr)
ptr->offset += b->written;
b->written += sectors_to_write;
/* XXX: submitting IO with btree locks held: */
- bch2_submit_wbio_replicas(&wbio->wbio, c, BCH_DATA_btree, &k.key);
+ bch2_submit_wbio_replicas(&wbio->wbio, c, BCH_DATA_btree, k.k);
+ bch2_bkey_buf_exit(&k, c);
return;
err:
set_btree_node_noevict(b);
@@ -1823,23 +1844,6 @@ void bch2_btree_flush_all_writes(struct bch_fs *c)
__bch2_btree_flush_all(c, BTREE_NODE_write_in_flight);
}
-void bch2_btree_verify_flushed(struct bch_fs *c)
-{
- struct bucket_table *tbl;
- struct rhash_head *pos;
- struct btree *b;
- unsigned i;
-
- rcu_read_lock();
- for_each_cached_btree(b, c, tbl, i, pos) {
- unsigned long flags = READ_ONCE(b->flags);
-
- BUG_ON((flags & (1 << BTREE_NODE_dirty)) ||
- (flags & (1 << BTREE_NODE_write_in_flight)));
- }
- rcu_read_unlock();
-}
-
void bch2_dirty_btree_nodes_to_text(struct printbuf *out, struct bch_fs *c)
{
struct bucket_table *tbl;
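
The rewritten write-error path above shows the other common bkey_buf pattern: copy an existing key into a buffer, drop the pointers that are no longer usable, then check whether anything is left. A stripped-down sketch, assuming the in-tree headers; dropping by device number here stands in for the failed-devices check in the real code:

static bool drop_dev_ptrs_example(struct bch_fs *c, struct btree *b, unsigned dev)
{
	struct bkey_buf k;
	struct bch_extent_ptr *ptr;
	bool have_ptrs;

	bch2_bkey_buf_init(&k);
	bch2_bkey_buf_copy(&k, c, &b->key);	/* heap copy sized to the node key */

	/* drop every pointer that lands on the given device */
	bch2_bkey_drop_ptrs(bkey_i_to_s(k.k), ptr, ptr->dev == dev);

	have_ptrs = bch2_bkey_nr_ptrs(bkey_i_to_s_c(k.k)) != 0;

	bch2_bkey_buf_exit(&k, c);
	return have_ptrs;
}
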
diff --git a/fs/bcachefs/btree_io.h b/fs/bcachefs/btree_io.h
index 6de92263af6c..89685bd57fc0 100644
--- a/fs/bcachefs/btree_io.h
+++ b/fs/bcachefs/btree_io.h
@@ -121,7 +121,7 @@ static inline void bset_encrypt(struct bch_fs *c, struct bset *i, unsigned offse
bch2_encrypt(c, BSET_CSUM_TYPE(i), nonce, &bn->flags,
bytes);
- nonce = nonce_add(nonce, round_up(bytes, CHACHA20_BLOCK_SIZE));
+ nonce = nonce_add(nonce, round_up(bytes, CHACHA_BLOCK_SIZE));
}
bch2_encrypt(c, BSET_CSUM_TYPE(i), nonce, i->_data,
@@ -134,7 +134,8 @@ void bch2_btree_build_aux_trees(struct btree *);
void bch2_btree_init_next(struct bch_fs *, struct btree *,
struct btree_iter *);
-int bch2_btree_node_read_done(struct bch_fs *, struct btree *, bool);
+int bch2_btree_node_read_done(struct bch_fs *, struct bch_dev *,
+ struct btree *, bool);
void bch2_btree_node_read(struct bch_fs *, struct btree *, bool);
int bch2_btree_root_read(struct bch_fs *, enum btree_id,
const struct bkey_i *, unsigned);
@@ -185,7 +186,6 @@ do { \
void bch2_btree_flush_all_reads(struct bch_fs *);
void bch2_btree_flush_all_writes(struct bch_fs *);
-void bch2_btree_verify_flushed(struct bch_fs *);
void bch2_dirty_btree_nodes_to_text(struct printbuf *, struct bch_fs *);
static inline void compat_bformat(unsigned level, enum btree_id btree_id,
diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c
index 21253be5aab6..c41fe4e0bc00 100644
--- a/fs/bcachefs/btree_iter.c
+++ b/fs/bcachefs/btree_iter.c
@@ -2,6 +2,7 @@
#include "bcachefs.h"
#include "bkey_methods.h"
+#include "bkey_buf.h"
#include "btree_cache.h"
#include "btree_iter.h"
#include "btree_key_cache.h"
@@ -33,13 +34,13 @@ static inline struct bpos btree_iter_search_key(struct btree_iter *iter)
static inline bool btree_iter_pos_before_node(struct btree_iter *iter,
struct btree *b)
{
- return bkey_cmp(btree_iter_search_key(iter), b->data->min_key) < 0;
+ return bkey_cmp(iter->real_pos, b->data->min_key) < 0;
}
static inline bool btree_iter_pos_after_node(struct btree_iter *iter,
struct btree *b)
{
- return bkey_cmp(b->key.k.p, btree_iter_search_key(iter)) < 0;
+ return bkey_cmp(b->key.k.p, iter->real_pos) < 0;
}
static inline bool btree_iter_pos_in_node(struct btree_iter *iter,
@@ -490,7 +491,6 @@ static void bch2_btree_iter_verify_cached(struct btree_iter *iter)
static void bch2_btree_iter_verify_level(struct btree_iter *iter,
unsigned level)
{
- struct bpos pos = btree_iter_search_key(iter);
struct btree_iter_level *l = &iter->l[level];
struct btree_node_iter tmp = l->iter;
bool locked = btree_node_locked(iter, level);
@@ -515,12 +515,7 @@ static void bch2_btree_iter_verify_level(struct btree_iter *iter,
if (!bch2_btree_node_relock(iter, level))
return;
- /*
- * Ideally this invariant would always be true, and hopefully in the
- * future it will be, but for now set_pos_same_leaf() breaks it:
- */
- BUG_ON(iter->uptodate < BTREE_ITER_NEED_TRAVERSE &&
- !btree_iter_pos_in_node(iter, l->b));
+ BUG_ON(!btree_iter_pos_in_node(iter, l->b));
/*
* node iterators don't use leaf node iterator:
@@ -543,12 +538,12 @@ static void bch2_btree_iter_verify_level(struct btree_iter *iter,
: bch2_btree_node_iter_prev_all(&tmp, l->b);
k = bch2_btree_node_iter_peek_all(&l->iter, l->b);
- if (p && bkey_iter_pos_cmp(l->b, p, &pos) >= 0) {
+ if (p && bkey_iter_pos_cmp(l->b, p, &iter->real_pos) >= 0) {
msg = "before";
goto err;
}
- if (k && bkey_iter_pos_cmp(l->b, k, &pos) < 0) {
+ if (k && bkey_iter_pos_cmp(l->b, k, &iter->real_pos) < 0) {
msg = "after";
goto err;
}
@@ -571,12 +566,11 @@ err:
}
panic("iterator should be %s key at level %u:\n"
- "iter pos %s %llu:%llu\n"
+ "iter pos %llu:%llu\n"
"prev key %s\n"
"cur key %s\n",
msg, level,
- iter->flags & BTREE_ITER_IS_EXTENTS ? ">" : "=>",
- iter->pos.inode, iter->pos.offset,
+ iter->real_pos.inode, iter->real_pos.offset,
buf1, buf2);
}
@@ -584,12 +578,24 @@ static void bch2_btree_iter_verify(struct btree_iter *iter)
{
unsigned i;
- bch2_btree_trans_verify_locks(iter->trans);
+ EBUG_ON(iter->btree_id >= BTREE_ID_NR);
+
+ bch2_btree_iter_verify_locks(iter);
for (i = 0; i < BTREE_MAX_DEPTH; i++)
bch2_btree_iter_verify_level(iter, i);
}
+static void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter)
+{
+ enum btree_iter_type type = btree_iter_type(iter);
+
+ BUG_ON((type == BTREE_ITER_KEYS ||
+ type == BTREE_ITER_CACHED) &&
+ (bkey_cmp(iter->pos, bkey_start_pos(&iter->k)) < 0 ||
+ bkey_cmp(iter->pos, iter->k.p) > 0));
+}
+
void bch2_btree_trans_verify_iters(struct btree_trans *trans, struct btree *b)
{
struct btree_iter *iter;
@@ -605,6 +611,7 @@ void bch2_btree_trans_verify_iters(struct btree_trans *trans, struct btree *b)
static inline void bch2_btree_iter_verify_level(struct btree_iter *iter, unsigned l) {}
static inline void bch2_btree_iter_verify(struct btree_iter *iter) {}
+static inline void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter) {}
#endif
@@ -630,12 +637,11 @@ static void __bch2_btree_iter_fix_key_modified(struct btree_iter *iter,
struct bkey_packed *where)
{
struct btree_iter_level *l = &iter->l[b->c.level];
- struct bpos pos = btree_iter_search_key(iter);
if (where != bch2_btree_node_iter_peek_all(&l->iter, l->b))
return;
- if (bkey_iter_pos_cmp(l->b, where, &pos) < 0)
+ if (bkey_iter_pos_cmp(l->b, where, &iter->real_pos) < 0)
bch2_btree_node_iter_advance(&l->iter, l->b);
btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK);
@@ -670,7 +676,6 @@ static void __bch2_btree_node_iter_fix(struct btree_iter *iter,
bool iter_current_key_modified =
orig_iter_pos >= offset &&
orig_iter_pos <= offset + clobber_u64s;
- struct bpos iter_pos = btree_iter_search_key(iter);
btree_node_iter_for_each(node_iter, set)
if (set->end == old_end)
@@ -678,7 +683,7 @@ static void __bch2_btree_node_iter_fix(struct btree_iter *iter,
/* didn't find the bset in the iterator - might have to readd it: */
if (new_u64s &&
- bkey_iter_pos_cmp(b, where, &iter_pos) >= 0) {
+ bkey_iter_pos_cmp(b, where, &iter->real_pos) >= 0) {
bch2_btree_node_iter_push(node_iter, b, where, end);
goto fixup_done;
} else {
@@ -693,7 +698,7 @@ found:
return;
if (new_u64s &&
- bkey_iter_pos_cmp(b, where, &iter_pos) >= 0) {
+ bkey_iter_pos_cmp(b, where, &iter->real_pos) >= 0) {
set->k = offset;
} else if (set->k < offset + clobber_u64s) {
set->k = offset + new_u64s;
@@ -829,12 +834,11 @@ static inline bool btree_iter_advance_to_pos(struct btree_iter *iter,
struct btree_iter_level *l,
int max_advance)
{
- struct bpos pos = btree_iter_search_key(iter);
struct bkey_packed *k;
int nr_advanced = 0;
while ((k = bch2_btree_node_iter_peek_all(&l->iter, l->b)) &&
- bkey_iter_pos_cmp(l->b, k, &pos) < 0) {
+ bkey_iter_pos_cmp(l->b, k, &iter->real_pos) < 0) {
if (max_advance > 0 && nr_advanced >= max_advance)
return false;
@@ -897,10 +901,16 @@ static void btree_iter_verify_new_node(struct btree_iter *iter, struct btree *b)
static inline void __btree_iter_init(struct btree_iter *iter,
unsigned level)
{
- struct bpos pos = btree_iter_search_key(iter);
struct btree_iter_level *l = &iter->l[level];
- bch2_btree_node_iter_init(&l->iter, l->b, &pos);
+ bch2_btree_node_iter_init(&l->iter, l->b, &iter->real_pos);
+
+ /*
+ * Iterators to interior nodes should always be pointed at the first non
+ * whiteout:
+ */
+ if (level)
+ bch2_btree_node_iter_peek(&l->iter, l->b);
btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK);
}
@@ -1041,27 +1051,32 @@ static void btree_iter_prefetch(struct btree_iter *iter)
struct btree_iter_level *l = &iter->l[iter->level];
struct btree_node_iter node_iter = l->iter;
struct bkey_packed *k;
- BKEY_PADDED(k) tmp;
+ struct bkey_buf tmp;
unsigned nr = test_bit(BCH_FS_STARTED, &c->flags)
? (iter->level > 1 ? 0 : 2)
: (iter->level > 1 ? 1 : 16);
bool was_locked = btree_node_locked(iter, iter->level);
+ bch2_bkey_buf_init(&tmp);
+
while (nr) {
if (!bch2_btree_node_relock(iter, iter->level))
- return;
+ break;
bch2_btree_node_iter_advance(&node_iter, l->b);
k = bch2_btree_node_iter_peek(&node_iter, l->b);
if (!k)
break;
- bch2_bkey_unpack(l->b, &tmp.k, k);
- bch2_btree_node_prefetch(c, iter, &tmp.k, iter->level - 1);
+ bch2_bkey_buf_unpack(&tmp, c, l->b, k);
+ bch2_btree_node_prefetch(c, iter, tmp.k, iter->btree_id,
+ iter->level - 1);
}
if (!was_locked)
btree_node_unlock(iter, iter->level);
+
+ bch2_bkey_buf_exit(&tmp, c);
}
static noinline void btree_node_mem_ptr_set(struct btree_iter *iter,
@@ -1093,30 +1108,34 @@ static __always_inline int btree_iter_down(struct btree_iter *iter,
struct btree *b;
unsigned level = iter->level - 1;
enum six_lock_type lock_type = __btree_lock_want(iter, level);
- BKEY_PADDED(k) tmp;
+ struct bkey_buf tmp;
+ int ret;
EBUG_ON(!btree_node_locked(iter, iter->level));
- bch2_bkey_unpack(l->b, &tmp.k,
+ bch2_bkey_buf_init(&tmp);
+ bch2_bkey_buf_unpack(&tmp, c, l->b,
bch2_btree_node_iter_peek(&l->iter, l->b));
- b = bch2_btree_node_get(c, iter, &tmp.k, level, lock_type, trace_ip);
- if (unlikely(IS_ERR(b)))
- return PTR_ERR(b);
+ b = bch2_btree_node_get(c, iter, tmp.k, level, lock_type, trace_ip);
+ ret = PTR_ERR_OR_ZERO(b);
+ if (unlikely(ret))
+ goto err;
mark_btree_node_locked(iter, level, lock_type);
btree_iter_node_set(iter, b);
- if (tmp.k.k.type == KEY_TYPE_btree_ptr_v2 &&
- unlikely(b != btree_node_mem_ptr(&tmp.k)))
+ if (tmp.k->k.type == KEY_TYPE_btree_ptr_v2 &&
+ unlikely(b != btree_node_mem_ptr(tmp.k)))
btree_node_mem_ptr_set(iter, level + 1, b);
if (iter->flags & BTREE_ITER_PREFETCH)
btree_iter_prefetch(iter);
iter->level = level;
-
- return 0;
+err:
+ bch2_bkey_buf_exit(&tmp, c);
+ return ret;
}
static void btree_iter_up(struct btree_iter *iter)
@@ -1330,21 +1349,6 @@ int __must_check __bch2_btree_iter_traverse(struct btree_iter *iter)
return ret;
}
-static inline void bch2_btree_iter_checks(struct btree_iter *iter)
-{
- enum btree_iter_type type = btree_iter_type(iter);
-
- EBUG_ON(iter->btree_id >= BTREE_ID_NR);
-
- BUG_ON((type == BTREE_ITER_KEYS ||
- type == BTREE_ITER_CACHED) &&
- (bkey_cmp(iter->pos, bkey_start_pos(&iter->k)) < 0 ||
- bkey_cmp(iter->pos, iter->k.p) > 0));
-
- bch2_btree_iter_verify_locks(iter);
- bch2_btree_iter_verify_level(iter, iter->level);
-}
-
/* Iterate across nodes (leaf and interior nodes) */
struct btree *bch2_btree_iter_peek_node(struct btree_iter *iter)
@@ -1353,7 +1357,7 @@ struct btree *bch2_btree_iter_peek_node(struct btree_iter *iter)
int ret;
EBUG_ON(btree_iter_type(iter) != BTREE_ITER_NODES);
- bch2_btree_iter_checks(iter);
+ bch2_btree_iter_verify(iter);
if (iter->uptodate == BTREE_ITER_UPTODATE)
return iter->l[iter->level].b;
@@ -1368,7 +1372,7 @@ struct btree *bch2_btree_iter_peek_node(struct btree_iter *iter)
BUG_ON(bkey_cmp(b->key.k.p, iter->pos) < 0);
- iter->pos = b->key.k.p;
+ iter->pos = iter->real_pos = b->key.k.p;
iter->uptodate = BTREE_ITER_UPTODATE;
bch2_btree_iter_verify(iter);
@@ -1382,7 +1386,7 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter)
int ret;
EBUG_ON(btree_iter_type(iter) != BTREE_ITER_NODES);
- bch2_btree_iter_checks(iter);
+ bch2_btree_iter_verify(iter);
/* already got to end? */
if (!btree_iter_node(iter, iter->level))
@@ -1419,7 +1423,7 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter)
if (btree_node_read_locked(iter, iter->level))
btree_node_unlock(iter, iter->level);
- iter->pos = bkey_successor(iter->pos);
+ iter->pos = iter->real_pos = bkey_successor(iter->pos);
iter->level = iter->min_depth;
btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE);
@@ -1430,7 +1434,7 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter)
b = iter->l[iter->level].b;
}
- iter->pos = b->key.k.p;
+ iter->pos = iter->real_pos = b->key.k.p;
iter->uptodate = BTREE_ITER_UPTODATE;
bch2_btree_iter_verify(iter);
@@ -1440,36 +1444,6 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter)
/* Iterate across keys (in leaf nodes only) */
-void bch2_btree_iter_set_pos_same_leaf(struct btree_iter *iter, struct bpos new_pos)
-{
- struct btree_iter_level *l = &iter->l[0];
-
- EBUG_ON(iter->level != 0);
- EBUG_ON(bkey_cmp(new_pos, iter->pos) < 0);
- EBUG_ON(!btree_node_locked(iter, 0));
- EBUG_ON(bkey_cmp(new_pos, l->b->key.k.p) > 0);
-
- bkey_init(&iter->k);
- iter->k.p = iter->pos = new_pos;
- btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK);
-
- btree_iter_advance_to_pos(iter, l, -1);
-
- /*
- * XXX:
- * keeping a node locked that's outside (even just outside) iter->pos
- * breaks __bch2_btree_node_lock(). This seems to only affect
- * bch2_btree_node_get_sibling so for now it's fixed there, but we
- * should try to get rid of this corner case.
- *
- * (this behaviour is currently needed for BTREE_INSERT_NOUNLOCK)
- */
-
- if (bch2_btree_node_iter_end(&l->iter) &&
- btree_iter_pos_after_node(iter, l->b))
- btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE);
-}
-
static void btree_iter_pos_changed(struct btree_iter *iter, int cmp)
{
unsigned l = iter->level;
@@ -1508,67 +1482,85 @@ out:
btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK);
}
-void __bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos,
- bool strictly_greater)
+static void btree_iter_set_search_pos(struct btree_iter *iter, struct bpos new_pos)
{
- struct bpos old = btree_iter_search_key(iter);
- int cmp;
+ int cmp = bkey_cmp(new_pos, iter->real_pos);
- iter->flags &= ~BTREE_ITER_IS_EXTENTS;
- iter->flags |= strictly_greater ? BTREE_ITER_IS_EXTENTS : 0;
+ iter->real_pos = new_pos;
+ btree_iter_pos_changed(iter, cmp);
+
+ bch2_btree_iter_verify(iter);
+}
+
+void __bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos,
+ bool strictly_greater)
+{
bkey_init(&iter->k);
iter->k.p = iter->pos = new_pos;
- cmp = bkey_cmp(btree_iter_search_key(iter), old);
+ iter->flags &= ~BTREE_ITER_IS_EXTENTS;
+ iter->flags |= strictly_greater ? BTREE_ITER_IS_EXTENTS : 0;
- btree_iter_pos_changed(iter, cmp);
+ btree_iter_set_search_pos(iter, btree_iter_search_key(iter));
}
void bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos)
{
- int cmp = bkey_cmp(new_pos, iter->pos);
+ __bch2_btree_iter_set_pos(iter, new_pos,
+ (iter->flags & BTREE_ITER_IS_EXTENTS) != 0);
+}
- bkey_init(&iter->k);
- iter->k.p = iter->pos = new_pos;
+static inline bool bch2_btree_iter_advance_pos(struct btree_iter *iter)
+{
+ struct bpos pos = iter->k.p;
+ bool ret = bkey_cmp(pos, POS_MAX) != 0;
- btree_iter_pos_changed(iter, cmp);
+ if (ret && !(iter->flags & BTREE_ITER_IS_EXTENTS))
+ pos = bkey_successor(pos);
+ bch2_btree_iter_set_pos(iter, pos);
+ return ret;
}
-static inline bool btree_iter_set_pos_to_next_leaf(struct btree_iter *iter)
+static inline bool bch2_btree_iter_rewind_pos(struct btree_iter *iter)
{
- struct btree_iter_level *l = &iter->l[0];
- bool ret;
-
- bkey_init(&iter->k);
- iter->k.p = iter->pos = l->b->key.k.p;
+ struct bpos pos = bkey_start_pos(&iter->k);
+ bool ret = bkey_cmp(pos, POS_MIN) != 0;
- ret = bkey_cmp(iter->pos, POS_MAX) != 0;
if (ret && !(iter->flags & BTREE_ITER_IS_EXTENTS))
- iter->k.p = iter->pos = bkey_successor(iter->pos);
-
- btree_iter_pos_changed(iter, 1);
+ pos = bkey_predecessor(pos);
+ bch2_btree_iter_set_pos(iter, pos);
return ret;
}
-static inline bool btree_iter_set_pos_to_prev_leaf(struct btree_iter *iter)
+static inline bool btree_iter_set_pos_to_next_leaf(struct btree_iter *iter)
{
- struct btree_iter_level *l = &iter->l[0];
- bool ret;
+ struct bpos next_pos = iter->l[0].b->key.k.p;
+ bool ret = bkey_cmp(next_pos, POS_MAX) != 0;
- bkey_init(&iter->k);
- iter->k.p = iter->pos = l->b->data->min_key;
- iter->uptodate = BTREE_ITER_NEED_TRAVERSE;
+ /*
+ * Typically, we don't want to modify iter->pos here, since that
+ * indicates where we searched from - unless we got to the end of the
+ * btree, in which case we want iter->pos to reflect that:
+ */
+ if (ret)
+ btree_iter_set_search_pos(iter, bkey_successor(next_pos));
+ else
+ bch2_btree_iter_set_pos(iter, POS_MAX);
- ret = bkey_cmp(iter->pos, POS_MIN) != 0;
- if (ret) {
- iter->k.p = iter->pos = bkey_predecessor(iter->pos);
+ return ret;
+}
- if (iter->flags & BTREE_ITER_IS_EXTENTS)
- iter->k.p = iter->pos = bkey_predecessor(iter->pos);
- }
+static inline bool btree_iter_set_pos_to_prev_leaf(struct btree_iter *iter)
+{
+ struct bpos next_pos = iter->l[0].b->data->min_key;
+ bool ret = bkey_cmp(next_pos, POS_MIN) != 0;
+
+ if (ret)
+ btree_iter_set_search_pos(iter, bkey_predecessor(next_pos));
+ else
+ bch2_btree_iter_set_pos(iter, POS_MIN);
- btree_iter_pos_changed(iter, -1);
return ret;
}
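The new advance/rewind helpers above both continue strictly past the key that was last returned; a short illustration (a sketch based on the code above, assuming the strictly-greater search semantics that __bch2_btree_iter_set_pos() selects for extent iterators):

	/*
	 * Non-extents iterator: after returning the key at offset 5,
	 * bch2_btree_iter_advance_pos() searches from bkey_successor(5) = 6.
	 *
	 * Extents iterator: after returning an extent ending at offset 20
	 * (iter->k.p.offset == 20), the position is left at 20 and the
	 * strictly-greater search finds the first extent ending after 20.
	 */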
@@ -1611,7 +1603,10 @@ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter)
int ret;
EBUG_ON(btree_iter_type(iter) != BTREE_ITER_KEYS);
- bch2_btree_iter_checks(iter);
+ bch2_btree_iter_verify(iter);
+ bch2_btree_iter_verify_entry_exit(iter);
+
+ btree_iter_set_search_pos(iter, btree_iter_search_key(iter));
if (iter->uptodate == BTREE_ITER_UPTODATE &&
!bkey_deleted(&iter->k))
@@ -1634,13 +1629,15 @@ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter)
* iter->pos should always be equal to the key we just
* returned - except extents can straddle iter->pos:
*/
- if (!(iter->flags & BTREE_ITER_IS_EXTENTS) ||
- bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0)
+ if (bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0)
iter->pos = bkey_start_pos(k.k);
+ iter->real_pos = k.k->p;
+
iter->uptodate = BTREE_ITER_UPTODATE;
- bch2_btree_iter_verify_level(iter, 0);
+ bch2_btree_iter_verify_entry_exit(iter);
+ bch2_btree_iter_verify(iter);
return k;
}
@@ -1650,14 +1647,9 @@ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter)
*/
struct bkey_s_c bch2_btree_iter_next(struct btree_iter *iter)
{
- if (unlikely(!bkey_cmp(iter->k.p, POS_MAX)))
+ if (!bch2_btree_iter_advance_pos(iter))
return bkey_s_c_null;
- bch2_btree_iter_set_pos(iter,
- (iter->flags & BTREE_ITER_IS_EXTENTS)
- ? iter->k.p
- : bkey_successor(iter->k.p));
-
return bch2_btree_iter_peek(iter);
}
@@ -1699,7 +1691,7 @@ struct bkey_s_c bch2_btree_iter_peek_with_updates(struct btree_iter *iter)
int ret;
EBUG_ON(btree_iter_type(iter) != BTREE_ITER_KEYS);
- bch2_btree_iter_checks(iter);
+ bch2_btree_iter_verify(iter);
while (1) {
ret = bch2_btree_iter_traverse(iter);
@@ -1709,10 +1701,7 @@ struct bkey_s_c bch2_btree_iter_peek_with_updates(struct btree_iter *iter)
k = __bch2_btree_iter_peek_with_updates(iter);
if (k.k && bkey_deleted(k.k)) {
- bch2_btree_iter_set_pos(iter,
- (iter->flags & BTREE_ITER_IS_EXTENTS)
- ? iter->k.p
- : bkey_successor(iter->k.p));
+ bch2_btree_iter_advance_pos(iter);
continue;
}
@@ -1724,11 +1713,10 @@ struct bkey_s_c bch2_btree_iter_peek_with_updates(struct btree_iter *iter)
}
/*
- * iter->pos should always be equal to the key we just
- * returned - except extents can straddle iter->pos:
+ * iter->pos should be monotonically increasing, and always be equal to
+ * the key we just returned - except extents can straddle iter->pos:
*/
- if (!(iter->flags & BTREE_ITER_IS_EXTENTS) ||
- bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0)
+ if (bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0)
iter->pos = bkey_start_pos(k.k);
iter->uptodate = BTREE_ITER_UPTODATE;
@@ -1737,14 +1725,9 @@ struct bkey_s_c bch2_btree_iter_peek_with_updates(struct btree_iter *iter)
struct bkey_s_c bch2_btree_iter_next_with_updates(struct btree_iter *iter)
{
- if (unlikely(!bkey_cmp(iter->k.p, POS_MAX)))
+ if (!bch2_btree_iter_advance_pos(iter))
return bkey_s_c_null;
- bch2_btree_iter_set_pos(iter,
- (iter->flags & BTREE_ITER_IS_EXTENTS)
- ? iter->k.p
- : bkey_successor(iter->k.p));
-
return bch2_btree_iter_peek_with_updates(iter);
}
@@ -1760,7 +1743,10 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter)
int ret;
EBUG_ON(btree_iter_type(iter) != BTREE_ITER_KEYS);
- bch2_btree_iter_checks(iter);
+ bch2_btree_iter_verify(iter);
+ bch2_btree_iter_verify_entry_exit(iter);
+
+ btree_iter_set_search_pos(iter, iter->pos);
if (iter->uptodate == BTREE_ITER_UPTODATE &&
!bkey_deleted(&iter->k))
@@ -1768,24 +1754,49 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter)
while (1) {
ret = bch2_btree_iter_traverse(iter);
- if (unlikely(ret))
- return bkey_s_c_err(ret);
+ if (unlikely(ret)) {
+ k = bkey_s_c_err(ret);
+ goto no_key;
+ }
k = __btree_iter_peek(iter, l);
- if (!k.k || bkey_cmp(bkey_start_pos(k.k), pos) > 0)
+ if (!k.k ||
+ ((iter->flags & BTREE_ITER_IS_EXTENTS)
+ ? bkey_cmp(bkey_start_pos(k.k), pos) >= 0
+ : bkey_cmp(bkey_start_pos(k.k), pos) > 0))
k = __btree_iter_prev(iter, l);
if (likely(k.k))
break;
- if (!btree_iter_set_pos_to_prev_leaf(iter))
- return bkey_s_c_null;
+ if (!btree_iter_set_pos_to_prev_leaf(iter)) {
+ k = bkey_s_c_null;
+ goto no_key;
+ }
}
EBUG_ON(bkey_cmp(bkey_start_pos(k.k), pos) > 0);
- iter->pos = bkey_start_pos(k.k);
- iter->uptodate = BTREE_ITER_UPTODATE;
+
+ /* Extents can straddle iter->pos: */
+ if (bkey_cmp(k.k->p, pos) < 0)
+ iter->pos = k.k->p;
+ iter->real_pos = k.k->p;
+ iter->uptodate = BTREE_ITER_UPTODATE;
+out:
+ bch2_btree_iter_verify_entry_exit(iter);
+ bch2_btree_iter_verify(iter);
return k;
+no_key:
+ /*
+ * __btree_iter_peek() may have set iter->k to a key we didn't want, and
+ * then we errored going to the previous leaf - make sure it's
+ * consistent with iter->pos:
+ */
+ BUG_ON(bkey_cmp(pos, iter->pos) &&
+ bkey_cmp(iter->pos, POS_MIN));
+ bkey_init(&iter->k);
+ iter->k.p = iter->pos;
+ goto out;
}
/**
@@ -1794,27 +1805,17 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter)
*/
struct bkey_s_c bch2_btree_iter_prev(struct btree_iter *iter)
{
- struct bpos pos = bkey_start_pos(&iter->k);
-
- EBUG_ON(btree_iter_type(iter) != BTREE_ITER_KEYS);
- bch2_btree_iter_checks(iter);
-
- if (unlikely(!bkey_cmp(pos, POS_MIN)))
+ if (!bch2_btree_iter_rewind_pos(iter))
return bkey_s_c_null;
- bch2_btree_iter_set_pos(iter, bkey_predecessor(pos));
-
return bch2_btree_iter_peek_prev(iter);
}
static inline struct bkey_s_c
__bch2_btree_iter_peek_slot_extents(struct btree_iter *iter)
{
- struct btree_iter_level *l = &iter->l[0];
- struct btree_node_iter node_iter;
struct bkey_s_c k;
- struct bkey n;
- int ret;
+ struct bpos pos, next_start;
/* keys & holes can't span inode numbers: */
if (iter->pos.offset == KEY_OFFSET_MAX) {
@@ -1822,53 +1823,36 @@ __bch2_btree_iter_peek_slot_extents(struct btree_iter *iter)
return bkey_s_c_null;
bch2_btree_iter_set_pos(iter, bkey_successor(iter->pos));
-
- ret = bch2_btree_iter_traverse(iter);
- if (unlikely(ret))
- return bkey_s_c_err(ret);
}
- /*
- * iterator is now at the correct position for inserting at iter->pos,
- * but we need to keep iterating until we find the first non whiteout so
- * we know how big a hole we have, if any:
- */
-
- node_iter = l->iter;
- k = __btree_iter_unpack(iter, l, &iter->k,
- bch2_btree_node_iter_peek(&node_iter, l->b));
+ pos = iter->pos;
+ k = bch2_btree_iter_peek(iter);
+ iter->pos = pos;
- if (k.k && bkey_cmp(bkey_start_pos(k.k), iter->pos) <= 0) {
- /*
- * We're not setting iter->uptodate because the node iterator
- * doesn't necessarily point at the key we're returning:
- */
-
- EBUG_ON(bkey_cmp(k.k->p, iter->pos) <= 0);
- bch2_btree_iter_verify_level(iter, 0);
+ if (bkey_err(k))
return k;
- }
- /* hole */
+ if (k.k && bkey_cmp(bkey_start_pos(k.k), iter->pos) <= 0)
+ return k;
- if (!k.k)
- k.k = &l->b->key.k;
+ next_start = k.k ? bkey_start_pos(k.k) : POS_MAX;
- bkey_init(&n);
- n.p = iter->pos;
- bch2_key_resize(&n,
+ bkey_init(&iter->k);
+ iter->k.p = iter->pos;
+ bch2_key_resize(&iter->k,
min_t(u64, KEY_SIZE_MAX,
- (k.k->p.inode == n.p.inode
- ? bkey_start_offset(k.k)
+ (next_start.inode == iter->pos.inode
+ ? next_start.offset
: KEY_OFFSET_MAX) -
- n.p.offset));
+ iter->pos.offset));
- EBUG_ON(!n.size);
+ EBUG_ON(!iter->k.size);
- iter->k = n;
iter->uptodate = BTREE_ITER_UPTODATE;
- bch2_btree_iter_verify_level(iter, 0);
+ bch2_btree_iter_verify_entry_exit(iter);
+ bch2_btree_iter_verify(iter);
+
return (struct bkey_s_c) { &iter->k, NULL };
}
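A worked example of the hole synthesis in the rewritten __bch2_btree_iter_peek_slot_extents() (a sketch; bch2_key_resize() is assumed to keep the key's start position fixed while moving its end):

	/*
	 * iter->pos = (inode 1, offset 100); bch2_btree_iter_peek() finds the
	 * next extent starting at offset 150 in the same inode:
	 *
	 *   next_start = (1, 150)
	 *   iter->k    = initialized at (1, 100), resized by 150 - 100 = 50
	 *   result     = a valueless "hole" key covering [100, 150)
	 *
	 * If the next key is in another inode (or absent), the hole runs to
	 * the end of the inode, capped at KEY_SIZE_MAX.
	 */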
@@ -1879,18 +1863,21 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
int ret;
EBUG_ON(btree_iter_type(iter) != BTREE_ITER_KEYS);
- bch2_btree_iter_checks(iter);
+ bch2_btree_iter_verify(iter);
+ bch2_btree_iter_verify_entry_exit(iter);
+
+ btree_iter_set_search_pos(iter, btree_iter_search_key(iter));
if (iter->uptodate == BTREE_ITER_UPTODATE)
return btree_iter_peek_uptodate(iter);
+ if (iter->flags & BTREE_ITER_IS_EXTENTS)
+ return __bch2_btree_iter_peek_slot_extents(iter);
+
ret = bch2_btree_iter_traverse(iter);
if (unlikely(ret))
return bkey_s_c_err(ret);
- if (iter->flags & BTREE_ITER_IS_EXTENTS)
- return __bch2_btree_iter_peek_slot_extents(iter);
-
k = __btree_iter_peek_all(iter, l, &iter->k);
EBUG_ON(k.k && bkey_deleted(k.k) && bkey_cmp(k.k->p, iter->pos) == 0);
@@ -1903,20 +1890,16 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
}
iter->uptodate = BTREE_ITER_UPTODATE;
- bch2_btree_iter_verify_level(iter, 0);
+ bch2_btree_iter_verify_entry_exit(iter);
+ bch2_btree_iter_verify(iter);
return k;
}
struct bkey_s_c bch2_btree_iter_next_slot(struct btree_iter *iter)
{
- if (unlikely(!bkey_cmp(iter->k.p, POS_MAX)))
+ if (!bch2_btree_iter_advance_pos(iter))
return bkey_s_c_null;
- bch2_btree_iter_set_pos(iter,
- (iter->flags & BTREE_ITER_IS_EXTENTS)
- ? iter->k.p
- : bkey_successor(iter->k.p));
-
return bch2_btree_iter_peek_slot(iter);
}
@@ -1926,7 +1909,7 @@ struct bkey_s_c bch2_btree_iter_peek_cached(struct btree_iter *iter)
int ret;
EBUG_ON(btree_iter_type(iter) != BTREE_ITER_CACHED);
- bch2_btree_iter_checks(iter);
+ bch2_btree_iter_verify(iter);
ret = bch2_btree_iter_traverse(iter);
if (unlikely(ret))
@@ -1957,6 +1940,7 @@ static inline void bch2_btree_iter_init(struct btree_trans *trans,
bkey_init(&iter->k);
iter->k.p = pos;
iter->flags = flags;
+ iter->real_pos = btree_iter_search_key(iter);
iter->uptodate = BTREE_ITER_NEED_TRAVERSE;
iter->btree_id = btree_id;
iter->level = 0;
@@ -2096,7 +2080,7 @@ static struct btree_iter *__btree_trans_get_iter(struct btree_trans *trans,
if (best &&
bkey_cmp(bpos_diff(best->pos, pos),
- bpos_diff(iter->pos, pos)) < 0)
+ bpos_diff(iter->real_pos, pos)) < 0)
continue;
best = iter;
@@ -2117,9 +2101,12 @@ static struct btree_iter *__btree_trans_get_iter(struct btree_trans *trans,
iter->flags &= ~BTREE_ITER_USER_FLAGS;
iter->flags |= flags & BTREE_ITER_USER_FLAGS;
- if (iter->flags & BTREE_ITER_INTENT)
- bch2_btree_iter_upgrade(iter, 1);
- else
+ if (iter->flags & BTREE_ITER_INTENT) {
+ if (!iter->locks_want) {
+ __bch2_btree_iter_unlock(iter);
+ iter->locks_want = 1;
+ }
+ } else
bch2_btree_iter_downgrade(iter);
BUG_ON(iter->btree_id != btree_id);
diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h
index 9a7f8d0197ec..12c519ae2a60 100644
--- a/fs/bcachefs/btree_iter.h
+++ b/fs/bcachefs/btree_iter.h
@@ -174,7 +174,6 @@ struct bkey_s_c bch2_btree_iter_next_slot(struct btree_iter *);
struct bkey_s_c bch2_btree_iter_peek_cached(struct btree_iter *);
-void bch2_btree_iter_set_pos_same_leaf(struct btree_iter *, struct bpos);
void __bch2_btree_iter_set_pos(struct btree_iter *, struct bpos, bool);
void bch2_btree_iter_set_pos(struct btree_iter *, struct bpos);
diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c
index 244c5dbcd3e9..4357aefdb668 100644
--- a/fs/bcachefs/btree_key_cache.c
+++ b/fs/bcachefs/btree_key_cache.c
@@ -349,8 +349,6 @@ retry:
BTREE_INSERT_NOUNLOCK|
BTREE_INSERT_NOCHECK_RW|
BTREE_INSERT_NOFAIL|
- BTREE_INSERT_USE_RESERVE|
- BTREE_INSERT_USE_ALLOC_RESERVE|
BTREE_INSERT_JOURNAL_RESERVED|
BTREE_INSERT_JOURNAL_RECLAIM);
err:
@@ -580,6 +578,8 @@ void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc)
list_splice(&bc->dirty, &bc->clean);
list_for_each_entry_safe(ck, n, &bc->clean, list) {
+ cond_resched();
+
bch2_journal_pin_drop(&c->journal, &ck->journal);
bch2_journal_preres_put(&c->journal, &ck->res);
@@ -593,6 +593,8 @@ void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc)
BUG_ON(bc->nr_keys);
list_for_each_entry_safe(ck, n, &bc->freed, list) {
+ cond_resched();
+
list_del(&ck->list);
kmem_cache_free(bch2_key_cache, ck);
}
diff --git a/fs/bcachefs/btree_key_cache.h b/fs/bcachefs/btree_key_cache.h
index dad3e344dcf9..2f8b5521718a 100644
--- a/fs/bcachefs/btree_key_cache.h
+++ b/fs/bcachefs/btree_key_cache.h
@@ -16,7 +16,8 @@ static inline bool bch2_btree_key_cache_must_wait(struct bch_fs *c)
size_t nr_keys = READ_ONCE(c->btree_key_cache.nr_keys);
size_t max_dirty = 4096 + (nr_keys * 3) / 4;
- return nr_dirty > max_dirty;
+ return nr_dirty > max_dirty &&
+ test_bit(JOURNAL_RECLAIM_STARTED, &c->journal.flags);
}
struct bkey_cached *
diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h
index dc7de27112c6..03894e923037 100644
--- a/fs/bcachefs/btree_types.h
+++ b/fs/bcachefs/btree_types.h
@@ -57,7 +57,7 @@ struct btree_write {
struct btree_alloc {
struct open_buckets ob;
- BKEY_PADDED(k);
+ __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX);
};
struct btree_bkey_cached_common {
@@ -76,6 +76,7 @@ struct btree {
u16 written;
u8 nsets;
u8 nr_key_bits;
+ u16 version_ondisk;
struct bkey_format format;
@@ -247,6 +248,8 @@ enum btree_iter_uptodate {
struct btree_iter {
struct btree_trans *trans;
struct bpos pos;
+ /* what we're searching for/what the iterator actually points to: */
+ struct bpos real_pos;
struct bpos pos_after_commit;
u16 flags;
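A brief illustration of how the new real_pos field can diverge from iter->pos on the extents path (a sketch derived from the bch2_btree_iter_peek() changes earlier in this patch):

	/*
	 * An extents iterator at pos (1, 12) peeks and finds the extent
	 * [10, 20) (bkey_start_pos() == 10, k.k->p.offset == 20):
	 *
	 *   iter->pos      stays (1, 12) - the extent straddles it, since
	 *                  bkey_start_pos(k.k) is not > iter->pos;
	 *   iter->real_pos becomes (1, 20) - k.k->p, where the underlying
	 *                  node iterators are actually positioned.
	 */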
diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h
index adb07043cbb3..a25138080169 100644
--- a/fs/bcachefs/btree_update.h
+++ b/fs/bcachefs/btree_update.h
@@ -20,7 +20,6 @@ enum btree_insert_flags {
__BTREE_INSERT_NOCHECK_RW,
__BTREE_INSERT_LAZY_RW,
__BTREE_INSERT_USE_RESERVE,
- __BTREE_INSERT_USE_ALLOC_RESERVE,
__BTREE_INSERT_JOURNAL_REPLAY,
__BTREE_INSERT_JOURNAL_RESERVED,
__BTREE_INSERT_JOURNAL_RECLAIM,
@@ -43,7 +42,6 @@ enum btree_insert_flags {
/* for copygc, or when merging btree nodes */
#define BTREE_INSERT_USE_RESERVE (1 << __BTREE_INSERT_USE_RESERVE)
-#define BTREE_INSERT_USE_ALLOC_RESERVE (1 << __BTREE_INSERT_USE_ALLOC_RESERVE)
/* Insert is for journal replay - don't get journal reservations: */
#define BTREE_INSERT_JOURNAL_REPLAY (1 << __BTREE_INSERT_JOURNAL_REPLAY)
diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c
index 8f96756ba648..275dcabbbdd6 100644
--- a/fs/bcachefs/btree_update_interior.c
+++ b/fs/bcachefs/btree_update_interior.c
@@ -195,21 +195,18 @@ static struct btree *__bch2_btree_node_alloc(struct bch_fs *c,
{
struct write_point *wp;
struct btree *b;
- BKEY_PADDED(k) tmp;
+ __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX) tmp;
struct open_buckets ob = { .nr = 0 };
struct bch_devs_list devs_have = (struct bch_devs_list) { 0 };
unsigned nr_reserve;
enum alloc_reserve alloc_reserve;
- if (flags & BTREE_INSERT_USE_ALLOC_RESERVE) {
+ if (flags & BTREE_INSERT_USE_RESERVE) {
nr_reserve = 0;
- alloc_reserve = RESERVE_ALLOC;
- } else if (flags & BTREE_INSERT_USE_RESERVE) {
- nr_reserve = BTREE_NODE_RESERVE / 2;
- alloc_reserve = RESERVE_BTREE;
+ alloc_reserve = RESERVE_BTREE_MOVINGGC;
} else {
nr_reserve = BTREE_NODE_RESERVE;
- alloc_reserve = RESERVE_NONE;
+ alloc_reserve = RESERVE_BTREE;
}
mutex_lock(&c->btree_reserve_cache_lock);
@@ -225,7 +222,10 @@ static struct btree *__bch2_btree_node_alloc(struct bch_fs *c,
mutex_unlock(&c->btree_reserve_cache_lock);
retry:
- wp = bch2_alloc_sectors_start(c, c->opts.foreground_target, 0,
+ wp = bch2_alloc_sectors_start(c,
+ c->opts.metadata_target ?:
+ c->opts.foreground_target,
+ 0,
writepoint_ptr(&c->btree_write_point),
&devs_have,
res->nr_replicas,
@@ -286,6 +286,7 @@ static struct btree *bch2_btree_node_alloc(struct btree_update *as, unsigned lev
bch2_bset_init_first(b, &b->data->keys);
b->c.level = level;
b->c.btree_id = as->btree_id;
+ b->version_ondisk = c->sb.version;
memset(&b->nr, 0, sizeof(b->nr));
b->data->magic = cpu_to_le64(bset_magic(c));
@@ -300,7 +301,6 @@ static struct btree *bch2_btree_node_alloc(struct btree_update *as, unsigned lev
bp->v.mem_ptr = 0;
bp->v.seq = b->data->keys.seq;
bp->v.sectors_written = 0;
- bp->v.sectors = cpu_to_le16(c->opts.btree_node_size);
}
if (c->sb.features & (1ULL << BCH_FEATURE_new_extent_overwrite))
@@ -577,8 +577,6 @@ static void btree_update_nodes_written(struct btree_update *as)
bch2_trans_init(&trans, c, 0, 512);
ret = __bch2_trans_do(&trans, &as->disk_res, &journal_seq,
BTREE_INSERT_NOFAIL|
- BTREE_INSERT_USE_RESERVE|
- BTREE_INSERT_USE_ALLOC_RESERVE|
BTREE_INSERT_NOCHECK_RW|
BTREE_INSERT_JOURNAL_RECLAIM|
BTREE_INSERT_JOURNAL_RESERVED,
@@ -1232,6 +1230,9 @@ static void btree_split_insert_keys(struct btree_update *as, struct btree *b,
src = n;
}
+ /* Also clear out the unwritten whiteouts area: */
+ b->whiteout_u64s = 0;
+
i->u64s = cpu_to_le16((u64 *) dst - i->_data);
set_btree_bset_end(b, b->set);
@@ -1457,15 +1458,6 @@ int bch2_btree_split_leaf(struct bch_fs *c, struct btree_iter *iter,
struct btree_update *as;
struct closure cl;
int ret = 0;
- struct btree_insert_entry *i;
-
- /*
- * We already have a disk reservation and open buckets pinned; this
- * allocation must not block:
- */
- trans_for_each_update(trans, i)
- if (btree_node_type_needs_gc(i->iter->btree_id))
- flags |= BTREE_INSERT_USE_RESERVE;
closure_init_stack(&cl);
@@ -1926,10 +1918,7 @@ int bch2_btree_node_update_key(struct bch_fs *c, struct btree_iter *iter,
retry:
as = bch2_btree_update_start(iter->trans, iter->btree_id,
parent ? btree_update_reserve_required(c, parent) : 0,
- BTREE_INSERT_NOFAIL|
- BTREE_INSERT_USE_RESERVE|
- BTREE_INSERT_USE_ALLOC_RESERVE,
- &cl);
+ BTREE_INSERT_NOFAIL, &cl);
if (IS_ERR(as)) {
ret = PTR_ERR(as);
diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c
index e7816afe4a08..53ea91b32fd5 100644
--- a/fs/bcachefs/btree_update_leaf.c
+++ b/fs/bcachefs/btree_update_leaf.c
@@ -62,9 +62,6 @@ bool bch2_btree_bset_insert_key(struct btree_iter *iter,
EBUG_ON(btree_node_just_written(b));
EBUG_ON(bset_written(b, btree_bset_last(b)));
EBUG_ON(bkey_deleted(&insert->k) && bkey_val_u64s(&insert->k));
- EBUG_ON(bkey_cmp(b->data->min_key, POS_MIN) &&
- bkey_cmp(bkey_start_pos(&insert->k),
- bkey_predecessor(b->data->min_key)) < 0);
EBUG_ON(bkey_cmp(insert->k.p, b->data->min_key) < 0);
EBUG_ON(bkey_cmp(insert->k.p, b->data->max_key) > 0);
EBUG_ON(insert->k.u64s >
@@ -219,7 +216,7 @@ static inline void btree_insert_entry_checks(struct btree_trans *trans,
{
struct bch_fs *c = trans->c;
- BUG_ON(bkey_cmp(insert->k.p, iter->pos));
+ BUG_ON(bkey_cmp(insert->k.p, iter->real_pos));
BUG_ON(bch2_debug_check_bkeys &&
bch2_bkey_invalid(c, bkey_i_to_s_c(insert),
__btree_node_type(iter->level, iter->btree_id)));
@@ -287,7 +284,8 @@ btree_key_can_insert_cached(struct btree_trans *trans,
BUG_ON(iter->level);
if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags) &&
- bch2_btree_key_cache_must_wait(trans->c))
+ bch2_btree_key_cache_must_wait(trans->c) &&
+ !(trans->flags & BTREE_INSERT_JOURNAL_RECLAIM))
return BTREE_INSERT_NEED_JOURNAL_RECLAIM;
if (u64s <= ck->u64s)
@@ -508,6 +506,10 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans,
/*
* Can't be holding any read locks when we go to take write locks:
+ * another thread could be holding an intent lock on the same node we
+ * have a read lock on, and it'll block trying to take a write lock
+ * (because we hold a read lock) and it could be blocking us by holding
+ * its own read lock (while we're trying to take write locks).
*
* note - this must be done after bch2_trans_journal_preres_get_cold()
* or anything else that might call bch2_trans_relock(), since that
@@ -515,9 +517,15 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans,
*/
trans_for_each_iter(trans, iter) {
if (iter->nodes_locked != iter->nodes_intent_locked) {
- EBUG_ON(iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT);
- EBUG_ON(trans->iters_live & (1ULL << iter->idx));
- bch2_btree_iter_unlock_noinline(iter);
+ if ((iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT) ||
+ (trans->iters_live & (1ULL << iter->idx))) {
+ if (!bch2_btree_iter_upgrade(iter, 1)) {
+ trace_trans_restart_upgrade(trans->ip);
+ return -EINTR;
+ }
+ } else {
+ bch2_btree_iter_unlock_noinline(iter);
+ }
}
}
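The comment added above describes a two-thread lock inversion; spelled out as a sketch (an illustrative scenario, not text from the patch):

	/*
	 * thread A: holds a read lock on node X, wants a write lock on node Y
	 * thread B: holds a read lock on node Y and an intent lock on X,
	 *           and is waiting for a write lock on X
	 *
	 * B's write lock on X waits on A's read lock; A's write lock on Y
	 * waits on B's read lock. Hence the commit path upgrades or drops
	 * every read-only-locked iterator before taking write locks.
	 */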
@@ -695,26 +703,31 @@ static inline int btree_iter_pos_cmp(const struct btree_iter *l,
bkey_cmp(l->pos, r->pos);
}
-static void bch2_trans_update2(struct btree_trans *trans,
+static int bch2_trans_update2(struct btree_trans *trans,
struct btree_iter *iter,
struct bkey_i *insert)
{
struct btree_insert_entry *i, n = (struct btree_insert_entry) {
.iter = iter, .k = insert
};
+ int ret;
btree_insert_entry_checks(trans, n.iter, n.k);
- BUG_ON(iter->uptodate > BTREE_ITER_NEED_PEEK);
-
EBUG_ON(trans->nr_updates2 >= BTREE_ITER_MAX);
+ ret = bch2_btree_iter_traverse(iter);
+ if (unlikely(ret))
+ return ret;
+
+ BUG_ON(iter->uptodate > BTREE_ITER_NEED_PEEK);
+
iter->flags |= BTREE_ITER_KEEP_UNTIL_COMMIT;
trans_for_each_update2(trans, i) {
if (btree_iter_pos_cmp(n.iter, i->iter) == 0) {
*i = n;
- return;
+ return 0;
}
if (btree_iter_pos_cmp(n.iter, i->iter) <= 0)
@@ -723,6 +736,7 @@ static void bch2_trans_update2(struct btree_trans *trans,
array_insert_item(trans->updates2, trans->nr_updates2,
i - trans->updates2, n);
+ return 0;
}
static int extent_update_to_keys(struct btree_trans *trans,
@@ -743,9 +757,9 @@ static int extent_update_to_keys(struct btree_trans *trans,
iter->flags |= BTREE_ITER_INTENT;
__bch2_btree_iter_set_pos(iter, insert->k.p, false);
- bch2_trans_update2(trans, iter, insert);
+ ret = bch2_trans_update2(trans, iter, insert);
bch2_trans_iter_put(trans, iter);
- return 0;
+ return ret;
}
static int extent_handle_overwrites(struct btree_trans *trans,
@@ -775,8 +789,10 @@ static int extent_handle_overwrites(struct btree_trans *trans,
bch2_cut_back(start, update);
__bch2_btree_iter_set_pos(update_iter, update->k.p, false);
- bch2_trans_update2(trans, update_iter, update);
+ ret = bch2_trans_update2(trans, update_iter, update);
bch2_trans_iter_put(trans, update_iter);
+ if (ret)
+ goto err;
}
if (bkey_cmp(k.k->p, end) > 0) {
@@ -790,8 +806,10 @@ static int extent_handle_overwrites(struct btree_trans *trans,
bch2_cut_front(end, update);
__bch2_btree_iter_set_pos(update_iter, update->k.p, false);
- bch2_trans_update2(trans, update_iter, update);
+ ret = bch2_trans_update2(trans, update_iter, update);
bch2_trans_iter_put(trans, update_iter);
+ if (ret)
+ goto err;
} else {
update_iter = bch2_trans_copy_iter(trans, iter);
@@ -805,8 +823,10 @@ static int extent_handle_overwrites(struct btree_trans *trans,
update->k.size = 0;
__bch2_btree_iter_set_pos(update_iter, update->k.p, false);
- bch2_trans_update2(trans, update_iter, update);
+ ret = bch2_trans_update2(trans, update_iter, update);
bch2_trans_iter_put(trans, update_iter);
+ if (ret)
+ goto err;
}
k = bch2_btree_iter_next_with_updates(iter);
@@ -826,7 +846,7 @@ int __bch2_trans_commit(struct btree_trans *trans)
int ret = 0;
if (!trans->nr_updates)
- goto out_noupdates;
+ goto out_reset;
if (trans->flags & BTREE_INSERT_GC_LOCK_HELD)
lockdep_assert_held(&trans->c->gc_lock);
@@ -840,7 +860,7 @@ int __bch2_trans_commit(struct btree_trans *trans)
unlikely(!percpu_ref_tryget(&trans->c->writes))) {
ret = bch2_trans_commit_get_rw_cold(trans);
if (ret)
- return ret;
+ goto out_reset;
}
#ifdef CONFIG_BCACHEFS_DEBUG
@@ -859,8 +879,8 @@ int __bch2_trans_commit(struct btree_trans *trans)
trans_trigger_run = false;
trans_for_each_update(trans, i) {
- if (unlikely(i->iter->uptodate > BTREE_ITER_NEED_PEEK &&
- (ret = bch2_btree_iter_traverse(i->iter)))) {
+ ret = bch2_btree_iter_traverse(i->iter);
+ if (unlikely(ret)) {
trace_trans_restart_traverse(trans->ip);
goto out;
}
@@ -869,8 +889,8 @@ int __bch2_trans_commit(struct btree_trans *trans)
* We're not using bch2_btree_iter_upgrade here because
* we know trans->nounlock can't be set:
*/
- if (unlikely(i->iter->locks_want < 1 &&
- !__bch2_btree_iter_upgrade(i->iter, 1))) {
+ if (unlikely(!btree_node_intent_locked(i->iter, i->iter->level) &&
+ !__bch2_btree_iter_upgrade(i->iter, i->iter->level + 1))) {
trace_trans_restart_upgrade(trans->ip);
ret = -EINTR;
goto out;
@@ -911,17 +931,22 @@ int __bch2_trans_commit(struct btree_trans *trans)
trans_for_each_update(trans, i) {
if (i->iter->flags & BTREE_ITER_IS_EXTENTS) {
ret = extent_update_to_keys(trans, i->iter, i->k);
- if (ret)
- goto out;
} else {
- bch2_trans_update2(trans, i->iter, i->k);
+ ret = bch2_trans_update2(trans, i->iter, i->k);
}
+ if (ret)
+ goto out;
}
trans_for_each_update2(trans, i) {
- BUG_ON(i->iter->uptodate > BTREE_ITER_NEED_PEEK);
BUG_ON(i->iter->locks_want < 1);
+ ret = bch2_btree_iter_traverse(i->iter);
+ if (unlikely(ret)) {
+ trace_trans_restart_traverse(trans->ip);
+ goto out;
+ }
+
u64s = jset_u64s(i->k->k.u64s);
if (btree_iter_type(i->iter) == BTREE_ITER_CACHED &&
likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY)))
@@ -941,18 +966,14 @@ retry:
trans_for_each_iter(trans, iter)
if ((trans->iters_live & (1ULL << iter->idx)) &&
- (iter->flags & BTREE_ITER_SET_POS_AFTER_COMMIT)) {
- if (trans->flags & BTREE_INSERT_NOUNLOCK)
- bch2_btree_iter_set_pos_same_leaf(iter, iter->pos_after_commit);
- else
- bch2_btree_iter_set_pos(iter, iter->pos_after_commit);
- }
+ (iter->flags & BTREE_ITER_SET_POS_AFTER_COMMIT))
+ bch2_btree_iter_set_pos(iter, iter->pos_after_commit);
out:
bch2_journal_preres_put(&trans->c->journal, &trans->journal_preres);
if (likely(!(trans->flags & BTREE_INSERT_NOCHECK_RW)))
percpu_ref_put(&trans->c->writes);
-out_noupdates:
+out_reset:
bch2_trans_reset(trans, !ret ? TRANS_RESET_NOTRAVERSE : 0);
return ret;
@@ -971,10 +992,22 @@ int bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter,
.trigger_flags = flags, .iter = iter, .k = k
};
- EBUG_ON(bkey_cmp(iter->pos,
- (iter->flags & BTREE_ITER_IS_EXTENTS)
- ? bkey_start_pos(&k->k)
- : k->k.p));
+#ifdef CONFIG_BCACHEFS_DEBUG
+ BUG_ON(bkey_cmp(iter->pos,
+ (iter->flags & BTREE_ITER_IS_EXTENTS)
+ ? bkey_start_pos(&k->k)
+ : k->k.p));
+
+ trans_for_each_update(trans, i) {
+ BUG_ON(bkey_cmp(i->iter->pos,
+ (i->iter->flags & BTREE_ITER_IS_EXTENTS)
+ ? bkey_start_pos(&i->k->k)
+ : i->k->k.p));
+
+ BUG_ON(i != trans->updates &&
+ btree_iter_pos_cmp(i[-1].iter, i[0].iter) >= 0);
+ }
+#endif
iter->flags |= BTREE_ITER_KEEP_UNTIL_COMMIT;
@@ -1074,8 +1107,7 @@ int bch2_btree_delete_at(struct btree_trans *trans,
bch2_trans_update(trans, iter, &k, 0);
return bch2_trans_commit(trans, NULL, NULL,
- BTREE_INSERT_NOFAIL|
- BTREE_INSERT_USE_RESERVE|flags);
+ BTREE_INSERT_NOFAIL|flags);
}
int bch2_btree_delete_range_trans(struct btree_trans *trans, enum btree_id id,
diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c
index 1934b845ea15..ba7a472a1bb7 100644
--- a/fs/bcachefs/buckets.c
+++ b/fs/bcachefs/buckets.c
@@ -137,6 +137,7 @@ void bch2_bucket_seq_cleanup(struct bch_fs *c)
void bch2_fs_usage_initialize(struct bch_fs *c)
{
struct bch_fs_usage *usage;
+ struct bch_dev *ca;
unsigned i;
percpu_down_write(&c->mark_lock);
@@ -155,6 +156,14 @@ void bch2_fs_usage_initialize(struct bch_fs *c)
fs_usage_data_type_to_base(usage, e->data_type, usage->replicas[i]);
}
+ for_each_member_device(ca, c, i) {
+ struct bch_dev_usage dev = bch2_dev_usage_read(ca);
+
+ usage->hidden += (dev.d[BCH_DATA_sb].buckets +
+ dev.d[BCH_DATA_journal].buckets) *
+ ca->mi.bucket_size;
+ }
+
percpu_up_write(&c->mark_lock);
}
@@ -189,14 +198,27 @@ out_pool:
return ret;
}
+static inline struct bch_dev_usage *dev_usage_ptr(struct bch_dev *ca,
+ unsigned journal_seq,
+ bool gc)
+{
+ return this_cpu_ptr(gc
+ ? ca->usage_gc
+ : ca->usage[journal_seq & JOURNAL_BUF_MASK]);
+}
+
struct bch_dev_usage bch2_dev_usage_read(struct bch_dev *ca)
{
+ struct bch_fs *c = ca->fs;
struct bch_dev_usage ret;
+ unsigned seq, i, u64s = dev_usage_u64s();
- memset(&ret, 0, sizeof(ret));
- acc_u64s_percpu((u64 *) &ret,
- (u64 __percpu *) ca->usage[0],
- sizeof(ret) / sizeof(u64));
+ do {
+ seq = read_seqcount_begin(&c->usage_lock);
+ memcpy(&ret, ca->usage_base, u64s * sizeof(u64));
+ for (i = 0; i < ARRAY_SIZE(ca->usage); i++)
+ acc_u64s_percpu((u64 *) &ret, (u64 __percpu *) ca->usage[i], u64s);
+ } while (read_seqcount_retry(&c->usage_lock, seq));
return ret;
}
@@ -261,7 +283,8 @@ retry:
void bch2_fs_usage_acc_to_base(struct bch_fs *c, unsigned idx)
{
- unsigned u64s = fs_usage_u64s(c);
+ struct bch_dev *ca;
+ unsigned i, u64s = fs_usage_u64s(c);
BUG_ON(idx >= ARRAY_SIZE(c->usage));
@@ -272,6 +295,16 @@ void bch2_fs_usage_acc_to_base(struct bch_fs *c, unsigned idx)
(u64 __percpu *) c->usage[idx], u64s);
percpu_memset(c->usage[idx], 0, u64s * sizeof(u64));
+ rcu_read_lock();
+ for_each_member_device_rcu(ca, c, i, NULL) {
+ u64s = dev_usage_u64s();
+
+ acc_u64s_percpu((u64 *) ca->usage_base,
+ (u64 __percpu *) ca->usage[idx], u64s);
+ percpu_memset(ca->usage[idx], 0, u64s * sizeof(u64));
+ }
+ rcu_read_unlock();
+
write_seqcount_end(&c->usage_lock);
preempt_enable();
}
@@ -376,15 +409,12 @@ static inline int is_unavailable_bucket(struct bucket_mark m)
return !is_available_bucket(m);
}
-static inline int is_fragmented_bucket(struct bucket_mark m,
- struct bch_dev *ca)
+static inline int bucket_sectors_fragmented(struct bch_dev *ca,
+ struct bucket_mark m)
{
- if (!m.owned_by_allocator &&
- m.data_type == BCH_DATA_user &&
- bucket_sectors_used(m))
- return max_t(int, 0, (int) ca->mi.bucket_size -
- bucket_sectors_used(m));
- return 0;
+ return bucket_sectors_used(m)
+ ? max(0, (int) ca->mi.bucket_size - (int) bucket_sectors_used(m))
+ : 0;
}
static inline int is_stripe_data_bucket(struct bucket_mark m)
@@ -392,11 +422,6 @@ static inline int is_stripe_data_bucket(struct bucket_mark m)
return m.stripe && m.data_type != BCH_DATA_parity;
}
-static inline int bucket_stripe_sectors(struct bucket_mark m)
-{
- return is_stripe_data_bucket(m) ? m.dirty_sectors : 0;
-}
-
static inline enum bch_data_type bucket_type(struct bucket_mark m)
{
return m.cached_sectors && !m.dirty_sectors
@@ -456,20 +481,20 @@ static inline void account_bucket(struct bch_fs_usage *fs_usage,
if (type == BCH_DATA_sb || type == BCH_DATA_journal)
fs_usage->hidden += size;
- dev_usage->buckets[type] += nr;
+ dev_usage->d[type].buckets += nr;
}
static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca,
struct bch_fs_usage *fs_usage,
struct bucket_mark old, struct bucket_mark new,
- bool gc)
+ u64 journal_seq, bool gc)
{
struct bch_dev_usage *u;
percpu_rwsem_assert_held(&c->mark_lock);
preempt_disable();
- u = this_cpu_ptr(ca->usage[gc]);
+ u = dev_usage_ptr(ca, journal_seq, gc);
if (bucket_type(old))
account_bucket(fs_usage, u, bucket_type(old),
@@ -481,50 +506,24 @@ static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca,
u->buckets_alloc +=
(int) new.owned_by_allocator - (int) old.owned_by_allocator;
+ u->buckets_ec += (int) new.stripe - (int) old.stripe;
u->buckets_unavailable +=
is_unavailable_bucket(new) - is_unavailable_bucket(old);
- u->buckets_ec += (int) new.stripe - (int) old.stripe;
- u->sectors_ec += bucket_stripe_sectors(new) -
- bucket_stripe_sectors(old);
-
- u->sectors[old.data_type] -= old.dirty_sectors;
- u->sectors[new.data_type] += new.dirty_sectors;
- u->sectors[BCH_DATA_cached] +=
+ u->d[old.data_type].sectors -= old.dirty_sectors;
+ u->d[new.data_type].sectors += new.dirty_sectors;
+ u->d[BCH_DATA_cached].sectors +=
(int) new.cached_sectors - (int) old.cached_sectors;
- u->sectors_fragmented +=
- is_fragmented_bucket(new, ca) - is_fragmented_bucket(old, ca);
+
+ u->d[old.data_type].fragmented -= bucket_sectors_fragmented(ca, old);
+ u->d[new.data_type].fragmented += bucket_sectors_fragmented(ca, new);
+
preempt_enable();
if (!is_available_bucket(old) && is_available_bucket(new))
bch2_wake_allocator(ca);
}
-__flatten
-void bch2_dev_usage_from_buckets(struct bch_fs *c)
-{
- struct bch_dev *ca;
- struct bucket_mark old = { .v.counter = 0 };
- struct bucket_array *buckets;
- struct bucket *g;
- unsigned i;
- int cpu;
-
- c->usage_base->hidden = 0;
-
- for_each_member_device(ca, c, i) {
- for_each_possible_cpu(cpu)
- memset(per_cpu_ptr(ca->usage[0], cpu), 0,
- sizeof(*ca->usage[0]));
-
- buckets = bucket_array(ca);
-
- for_each_bucket(g, buckets)
- bch2_dev_usage_update(c, ca, c->usage_base,
- old, g->mark, false);
- }
-}
-
static inline int update_replicas(struct bch_fs *c,
struct bch_fs_usage *fs_usage,
struct bch_replicas_entry *r,
@@ -650,46 +649,6 @@ unwind:
ret; \
})
-static int __bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca,
- size_t b, struct bucket_mark *ret,
- bool gc)
-{
- struct bch_fs_usage *fs_usage = fs_usage_ptr(c, 0, gc);
- struct bucket *g = __bucket(ca, b, gc);
- struct bucket_mark old, new;
-
- old = bucket_cmpxchg(g, new, ({
- BUG_ON(!is_available_bucket(new));
-
- new.owned_by_allocator = true;
- new.data_type = 0;
- new.cached_sectors = 0;
- new.dirty_sectors = 0;
- new.gen++;
- }));
-
- bch2_dev_usage_update(c, ca, fs_usage, old, new, gc);
-
- if (old.cached_sectors)
- update_cached_sectors(c, fs_usage, ca->dev_idx,
- -((s64) old.cached_sectors));
-
- if (!gc)
- *ret = old;
- return 0;
-}
-
-void bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca,
- size_t b, struct bucket_mark *old)
-{
- do_mark_fn(__bch2_invalidate_bucket, c, gc_phase(GC_PHASE_START), 0,
- ca, b, old);
-
- if (!old->owned_by_allocator && old->cached_sectors)
- trace_invalidate(ca, bucket_to_sector(ca, b),
- old->cached_sectors);
-}
-
static int __bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca,
size_t b, bool owned_by_allocator,
bool gc)
@@ -702,7 +661,12 @@ static int __bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca,
new.owned_by_allocator = owned_by_allocator;
}));
- bch2_dev_usage_update(c, ca, fs_usage, old, new, gc);
+ /*
+ * XXX: this is wrong, this means we'll be doing updates to the percpu
+ * buckets_alloc counter that don't have an open journal buffer and
+ * we'll race with the machinery that accumulates that to ca->usage_base
+ */
+ bch2_dev_usage_update(c, ca, fs_usage, old, new, 0, gc);
BUG_ON(!gc &&
!owned_by_allocator && !old.owned_by_allocator);
@@ -734,7 +698,8 @@ static int bch2_mark_alloc(struct bch_fs *c,
struct bucket_mark old_m, m;
/* We don't do anything for deletions - do we?: */
- if (new.k->type != KEY_TYPE_alloc)
+ if (new.k->type != KEY_TYPE_alloc &&
+ new.k->type != KEY_TYPE_alloc_v2)
return 0;
/*
@@ -757,6 +722,7 @@ static int bch2_mark_alloc(struct bch_fs *c,
m.data_type = u.data_type;
m.dirty_sectors = u.dirty_sectors;
m.cached_sectors = u.cached_sectors;
+ m.stripe = u.stripe != 0;
if (journal_seq) {
m.journal_seq_valid = 1;
@@ -764,12 +730,14 @@ static int bch2_mark_alloc(struct bch_fs *c,
}
}));
- bch2_dev_usage_update(c, ca, fs_usage, old_m, m, gc);
+ bch2_dev_usage_update(c, ca, fs_usage, old_m, m, journal_seq, gc);
g->io_time[READ] = u.read_time;
g->io_time[WRITE] = u.write_time;
g->oldest_gen = u.oldest_gen;
g->gen_valid = 1;
+ g->stripe = u.stripe;
+ g->stripe_redundancy = u.stripe_redundancy;
/*
* need to know if we're getting called from the invalidate path or
@@ -827,7 +795,7 @@ static int __bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca,
if (c)
bch2_dev_usage_update(c, ca, fs_usage_ptr(c, 0, gc),
- old, new, gc);
+ old, new, 0, gc);
return 0;
}
@@ -964,11 +932,10 @@ static int check_bucket_ref(struct bch_fs *c, struct bkey_s_c k,
return 0;
}
-static int bucket_set_stripe(struct bch_fs *c, struct bkey_s_c k,
+static int mark_stripe_bucket(struct bch_fs *c, struct bkey_s_c k,
unsigned ptr_idx,
struct bch_fs_usage *fs_usage,
- u64 journal_seq, unsigned flags,
- bool enabled)
+ u64 journal_seq, unsigned flags)
{
const struct bch_stripe *s = bkey_s_c_to_stripe(k).v;
unsigned nr_data = s->nr_blocks - s->nr_redundant;
@@ -981,8 +948,13 @@ static int bucket_set_stripe(struct bch_fs *c, struct bkey_s_c k,
char buf[200];
int ret;
- if (enabled)
- g->ec_redundancy = s->nr_redundant;
+ if (g->stripe && g->stripe != k.k->p.offset) {
+ bch2_fs_inconsistent(c,
+ "bucket %u:%zu gen %u: multiple stripes using same bucket\n%s",
+ ptr->dev, PTR_BUCKET_NR(ca, ptr), new.gen,
+ (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf));
+ return -EINVAL;
+ }
old = bucket_cmpxchg(g, new, ({
ret = check_bucket_ref(c, k, ptr, 0, 0, new.gen, new.data_type,
@@ -990,23 +962,9 @@ static int bucket_set_stripe(struct bch_fs *c, struct bkey_s_c k,
if (ret)
return ret;
- if (new.stripe && enabled)
- bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
- "bucket %u:%zu gen %u: multiple stripes using same bucket\n%s",
- ptr->dev, PTR_BUCKET_NR(ca, ptr), new.gen,
- (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf));
-
- if (!new.stripe && !enabled)
- bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
- "bucket %u:%zu gen %u: deleting stripe but not marked\n%s",
- ptr->dev, PTR_BUCKET_NR(ca, ptr), new.gen,
- (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf));
-
- new.stripe = enabled;
-
- if ((flags & BTREE_TRIGGER_GC) && parity) {
- new.data_type = enabled ? BCH_DATA_parity : 0;
- new.dirty_sectors = enabled ? le16_to_cpu(s->sectors): 0;
+ if (parity) {
+ new.data_type = BCH_DATA_parity;
+ new.dirty_sectors = le16_to_cpu(s->sectors);
}
if (journal_seq) {
@@ -1015,10 +973,10 @@ static int bucket_set_stripe(struct bch_fs *c, struct bkey_s_c k,
}
}));
- if (!enabled)
- g->ec_redundancy = 0;
+ g->stripe = k.k->p.offset;
+ g->stripe_redundancy = s->nr_redundant;
- bch2_dev_usage_update(c, ca, fs_usage, old, new, gc);
+ bch2_dev_usage_update(c, ca, fs_usage, old, new, journal_seq, gc);
return 0;
}
@@ -1085,7 +1043,7 @@ static int bch2_mark_pointer(struct bch_fs *c, struct bkey_s_c k,
old.v.counter,
new.v.counter)) != old.v.counter);
- bch2_dev_usage_update(c, ca, fs_usage, old, new, gc);
+ bch2_dev_usage_update(c, ca, fs_usage, old, new, journal_seq, gc);
BUG_ON(!gc && bucket_became_unavailable(old, new));
@@ -1212,6 +1170,8 @@ static int bch2_mark_stripe(struct bch_fs *c,
unsigned i;
int ret;
+ BUG_ON(gc && old_s);
+
if (!m || (old_s && !m->alive)) {
bch_err_ratelimited(c, "error marking nonexistent stripe %zu",
idx);
@@ -1219,48 +1179,12 @@ static int bch2_mark_stripe(struct bch_fs *c,
}
if (!new_s) {
- /* Deleting: */
- for (i = 0; i < old_s->nr_blocks; i++) {
- ret = bucket_set_stripe(c, old, i, fs_usage,
- journal_seq, flags, false);
- if (ret)
- return ret;
- }
-
- if (!gc && m->on_heap) {
- spin_lock(&c->ec_stripes_heap_lock);
- bch2_stripes_heap_del(c, m, idx);
- spin_unlock(&c->ec_stripes_heap_lock);
- }
-
- if (gc)
- update_replicas(c, fs_usage, &m->r.e,
- -((s64) m->sectors * m->nr_redundant));
+ spin_lock(&c->ec_stripes_heap_lock);
+ bch2_stripes_heap_del(c, m, idx);
+ spin_unlock(&c->ec_stripes_heap_lock);
memset(m, 0, sizeof(*m));
} else {
- BUG_ON(old_s && new_s->nr_blocks != old_s->nr_blocks);
- BUG_ON(old_s && new_s->nr_redundant != old_s->nr_redundant);
-
- for (i = 0; i < new_s->nr_blocks; i++) {
- if (!old_s ||
- memcmp(new_s->ptrs + i,
- old_s->ptrs + i,
- sizeof(struct bch_extent_ptr))) {
-
- if (old_s) {
- bucket_set_stripe(c, old, i, fs_usage,
- journal_seq, flags, false);
- if (ret)
- return ret;
- }
- ret = bucket_set_stripe(c, new, i, fs_usage,
- journal_seq, flags, true);
- if (ret)
- return ret;
- }
- }
-
m->alive = true;
m->sectors = le16_to_cpu(new_s->sectors);
m->algorithm = new_s->algorithm;
@@ -1274,16 +1198,8 @@ static int bch2_mark_stripe(struct bch_fs *c,
m->blocks_nonempty += !!m->block_sectors[i];
}
- if (gc && old_s)
- update_replicas(c, fs_usage, &m->r.e,
- -((s64) m->sectors * m->nr_redundant));
-
bch2_bkey_to_replicas(&m->r.e, new);
- if (gc)
- update_replicas(c, fs_usage, &m->r.e,
- ((s64) m->sectors * m->nr_redundant));
-
if (!gc) {
spin_lock(&c->ec_stripes_heap_lock);
bch2_stripes_heap_update(c, m, idx);
@@ -1291,6 +1207,25 @@ static int bch2_mark_stripe(struct bch_fs *c,
}
}
+ if (gc) {
+ /*
+ * gc recalculates these fields from stripe ptr
+ * references:
+ */
+ memset(m->block_sectors, 0, sizeof(m->block_sectors));
+ m->blocks_nonempty = 0;
+
+ for (i = 0; i < new_s->nr_blocks; i++) {
+ ret = mark_stripe_bucket(c, new, i, fs_usage,
+ journal_seq, flags);
+ if (ret)
+ return ret;
+ }
+
+ update_replicas(c, fs_usage, &m->r.e,
+ ((s64) m->sectors * m->nr_redundant));
+ }
+
return 0;
}
@@ -1314,6 +1249,7 @@ static int bch2_mark_key_locked(struct bch_fs *c,
switch (k.k->type) {
case KEY_TYPE_alloc:
+ case KEY_TYPE_alloc_v2:
ret = bch2_mark_alloc(c, old, new, fs_usage, journal_seq, flags);
break;
case KEY_TYPE_btree_ptr:
@@ -1382,9 +1318,6 @@ int bch2_mark_update(struct btree_trans *trans,
unsigned flags)
{
struct bch_fs *c = trans->c;
- struct btree *b = iter_l(iter)->b;
- struct btree_node_iter node_iter = iter_l(iter)->iter;
- struct bkey_packed *_old;
struct bkey_s_c old;
struct bkey unpacked;
int ret = 0;
@@ -1424,23 +1357,24 @@ int bch2_mark_update(struct btree_trans *trans,
BTREE_TRIGGER_OVERWRITE|flags);
}
} else {
+ struct btree_iter *copy;
+
BUG_ON(btree_iter_type(iter) == BTREE_ITER_CACHED);
bch2_mark_key_locked(c, old, bkey_i_to_s_c(new),
0, new->k.size,
fs_usage, trans->journal_res.seq,
BTREE_TRIGGER_INSERT|flags);
- while ((_old = bch2_btree_node_iter_peek(&node_iter, b))) {
- unsigned offset = 0;
- s64 sectors;
+ copy = bch2_trans_copy_iter(trans, iter);
- old = bkey_disassemble(b, _old, &unpacked);
- sectors = -((s64) old.k->size);
+ for_each_btree_key_continue(copy, 0, old, ret) {
+ unsigned offset = 0;
+ s64 sectors = -((s64) old.k->size);
flags |= BTREE_TRIGGER_OVERWRITE;
if (bkey_cmp(new->k.p, bkey_start_pos(old.k)) <= 0)
- return 0;
+ break;
switch (bch2_extent_overlap(&new->k, old.k)) {
case BCH_EXTENT_OVERLAP_ALL:
@@ -1473,9 +1407,8 @@ int bch2_mark_update(struct btree_trans *trans,
trans->journal_res.seq, flags) ?: 1;
if (ret <= 0)
break;
-
- bch2_btree_node_iter_advance(&node_iter, b);
}
+ bch2_trans_iter_put(trans, copy);
}
return ret;
@@ -1506,27 +1439,20 @@ void bch2_trans_fs_usage_apply(struct btree_trans *trans,
pr_err("overlapping with");
if (btree_iter_type(i->iter) != BTREE_ITER_CACHED) {
- struct btree *b = iter_l(i->iter)->b;
- struct btree_node_iter node_iter = iter_l(i->iter)->iter;
- struct bkey_packed *_k;
-
- while ((_k = bch2_btree_node_iter_peek(&node_iter, b))) {
- struct bkey unpacked;
- struct bkey_s_c k;
+ struct btree_iter *copy = bch2_trans_copy_iter(trans, i->iter);
+ struct bkey_s_c k;
+ int ret;
- pr_info("_k %px format %u", _k, _k->format);
- k = bkey_disassemble(b, _k, &unpacked);
-
- if (btree_node_is_extents(b)
+ for_each_btree_key_continue(copy, 0, k, ret) {
+ if (btree_node_type_is_extents(i->iter->btree_id)
? bkey_cmp(i->k->k.p, bkey_start_pos(k.k)) <= 0
: bkey_cmp(i->k->k.p, k.k->p))
break;
bch2_bkey_val_to_text(&PBUF(buf), c, k);
pr_err("%s", buf);
-
- bch2_btree_node_iter_advance(&node_iter, b);
}
+ bch2_trans_iter_put(trans, copy);
} else {
struct bkey_cached *ck = (void *) i->iter->l[0].b;
@@ -1582,9 +1508,10 @@ static int trans_get_key(struct btree_trans *trans,
return ret;
}
-static int bch2_trans_start_alloc_update(struct btree_trans *trans, struct btree_iter **_iter,
- const struct bch_extent_ptr *ptr,
- struct bkey_alloc_unpacked *u)
+static struct bkey_alloc_buf *
+bch2_trans_start_alloc_update(struct btree_trans *trans, struct btree_iter **_iter,
+ const struct bch_extent_ptr *ptr,
+ struct bkey_alloc_unpacked *u)
{
struct bch_fs *c = trans->c;
struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
@@ -1592,8 +1519,13 @@ static int bch2_trans_start_alloc_update(struct btree_trans *trans, struct btree
struct bucket *g;
struct btree_iter *iter;
struct bkey_s_c k;
+ struct bkey_alloc_buf *a;
int ret;
+ a = bch2_trans_kmalloc(trans, sizeof(struct bkey_alloc_buf));
+ if (IS_ERR(a))
+ return a;
+
iter = trans_get_update(trans, BTREE_ID_ALLOC, pos, &k);
if (iter) {
*u = bch2_alloc_unpack(k);
@@ -1605,17 +1537,17 @@ static int bch2_trans_start_alloc_update(struct btree_trans *trans, struct btree
ret = bch2_btree_iter_traverse(iter);
if (ret) {
bch2_trans_iter_put(trans, iter);
- return ret;
+ return ERR_PTR(ret);
}
percpu_down_read(&c->mark_lock);
g = bucket(ca, pos.offset);
- *u = alloc_mem_to_key(g, READ_ONCE(g->mark));
+ *u = alloc_mem_to_key(iter, g, READ_ONCE(g->mark));
percpu_up_read(&c->mark_lock);
}
*_iter = iter;
- return 0;
+ return a;
}
static int bch2_trans_mark_pointer(struct btree_trans *trans,
@@ -1625,34 +1557,27 @@ static int bch2_trans_mark_pointer(struct btree_trans *trans,
struct bch_fs *c = trans->c;
struct btree_iter *iter;
struct bkey_alloc_unpacked u;
- struct bkey_i_alloc *a;
+ struct bkey_alloc_buf *a;
int ret;
- ret = bch2_trans_start_alloc_update(trans, &iter, &p.ptr, &u);
- if (ret)
- return ret;
+ a = bch2_trans_start_alloc_update(trans, &iter, &p.ptr, &u);
+ if (IS_ERR(a))
+ return PTR_ERR(a);
ret = __mark_pointer(c, k, &p.ptr, sectors, data_type, u.gen, &u.data_type,
&u.dirty_sectors, &u.cached_sectors);
if (ret)
goto out;
- a = bch2_trans_kmalloc(trans, BKEY_ALLOC_U64s_MAX * 8);
- ret = PTR_ERR_OR_ZERO(a);
- if (ret)
- goto out;
-
- bkey_alloc_init(&a->k_i);
- a->k.p = iter->pos;
- bch2_alloc_pack(a, u);
- bch2_trans_update(trans, iter, &a->k_i, 0);
+ bch2_alloc_pack(c, a, u);
+ bch2_trans_update(trans, iter, &a->k, 0);
out:
bch2_trans_iter_put(trans, iter);
return ret;
}
static int bch2_trans_mark_stripe_ptr(struct btree_trans *trans,
- struct bch_extent_stripe_ptr p,
+ struct extent_ptr_decoded p,
s64 sectors, enum bch_data_type data_type)
{
struct bch_fs *c = trans->c;
@@ -1662,14 +1587,22 @@ static int bch2_trans_mark_stripe_ptr(struct btree_trans *trans,
struct bch_replicas_padded r;
int ret = 0;
- ret = trans_get_key(trans, BTREE_ID_EC, POS(0, p.idx), &iter, &k);
+ ret = trans_get_key(trans, BTREE_ID_EC, POS(0, p.ec.idx), &iter, &k);
if (ret < 0)
return ret;
if (k.k->type != KEY_TYPE_stripe) {
bch2_fs_inconsistent(c,
"pointer to nonexistent stripe %llu",
- (u64) p.idx);
+ (u64) p.ec.idx);
+ ret = -EIO;
+ goto out;
+ }
+
+ if (!bch2_ptr_matches_stripe(bkey_s_c_to_stripe(k).v, p)) {
+ bch2_fs_inconsistent(c,
+ "stripe pointer doesn't match stripe %llu",
+ (u64) p.ec.idx);
ret = -EIO;
goto out;
}
@@ -1680,8 +1613,8 @@ static int bch2_trans_mark_stripe_ptr(struct btree_trans *trans,
goto out;
bkey_reassemble(&s->k_i, k);
- stripe_blockcount_set(&s->v, p.block,
- stripe_blockcount_get(&s->v, p.block) +
+ stripe_blockcount_set(&s->v, p.ec.block,
+ stripe_blockcount_get(&s->v, p.ec.block) +
sectors);
bch2_trans_update(trans, iter, &s->k_i, 0);
@@ -1732,7 +1665,7 @@ static int bch2_trans_mark_extent(struct btree_trans *trans,
dirty_sectors += disk_sectors;
r.e.devs[r.e.nr_devs++] = p.ptr.dev;
} else {
- ret = bch2_trans_mark_stripe_ptr(trans, p.ec,
+ ret = bch2_trans_mark_stripe_ptr(trans, p,
disk_sectors, data_type);
if (ret)
return ret;
@@ -1748,34 +1681,51 @@ static int bch2_trans_mark_extent(struct btree_trans *trans,
}
static int bch2_trans_mark_stripe_alloc_ref(struct btree_trans *trans,
- const struct bch_extent_ptr *ptr,
- s64 sectors, bool parity)
+ struct bkey_s_c_stripe s,
+ unsigned idx, bool deleting)
{
- struct bkey_i_alloc *a;
+ struct bch_fs *c = trans->c;
+ const struct bch_extent_ptr *ptr = &s.v->ptrs[idx];
+ struct bkey_alloc_buf *a;
struct btree_iter *iter;
struct bkey_alloc_unpacked u;
- int ret;
+ bool parity = idx >= s.v->nr_blocks - s.v->nr_redundant;
+ int ret = 0;
- ret = bch2_trans_start_alloc_update(trans, &iter, ptr, &u);
- if (ret)
- return ret;
+ a = bch2_trans_start_alloc_update(trans, &iter, ptr, &u);
+ if (IS_ERR(a))
+ return PTR_ERR(a);
if (parity) {
+ s64 sectors = le16_to_cpu(s.v->sectors);
+
+ if (deleting)
+ sectors = -sectors;
+
u.dirty_sectors += sectors;
u.data_type = u.dirty_sectors
? BCH_DATA_parity
: 0;
}
- a = bch2_trans_kmalloc(trans, BKEY_ALLOC_U64s_MAX * 8);
- ret = PTR_ERR_OR_ZERO(a);
- if (ret)
- goto err;
+ if (!deleting) {
+ if (bch2_fs_inconsistent_on(u.stripe && u.stripe != s.k->p.offset, c,
+ "bucket %llu:%llu gen %u: multiple stripes using same bucket (%u, %llu)",
+ iter->pos.inode, iter->pos.offset, u.gen,
+ u.stripe, s.k->p.offset)) {
+ ret = -EIO;
+ goto err;
+ }
- bkey_alloc_init(&a->k_i);
- a->k.p = iter->pos;
- bch2_alloc_pack(a, u);
- bch2_trans_update(trans, iter, &a->k_i, 0);
+ u.stripe = s.k->p.offset;
+ u.stripe_redundancy = s.v->nr_redundant;
+ } else {
+ u.stripe = 0;
+ u.stripe_redundancy = 0;
+ }
+
+ bch2_alloc_pack(c, a, u);
+ bch2_trans_update(trans, iter, &a->k, 0);
err:
bch2_trans_iter_put(trans, iter);
return ret;
@@ -1785,51 +1735,50 @@ static int bch2_trans_mark_stripe(struct btree_trans *trans,
struct bkey_s_c old, struct bkey_s_c new,
unsigned flags)
{
- const struct bch_stripe *old_s = old.k->type == KEY_TYPE_stripe
- ? bkey_s_c_to_stripe(old).v : NULL;
- const struct bch_stripe *new_s = new.k->type == KEY_TYPE_stripe
- ? bkey_s_c_to_stripe(new).v : NULL;
+ struct bkey_s_c_stripe old_s = { NULL };
+ struct bkey_s_c_stripe new_s = { NULL };
struct bch_replicas_padded r;
unsigned i;
int ret = 0;
+ if (old.k->type == KEY_TYPE_stripe)
+ old_s = bkey_s_c_to_stripe(old);
+ if (new.k->type == KEY_TYPE_stripe)
+ new_s = bkey_s_c_to_stripe(new);
+
/*
* If the pointers aren't changing, we don't need to do anything:
*/
- if (new_s && old_s &&
- !memcmp(old_s->ptrs, new_s->ptrs,
- new_s->nr_blocks * sizeof(struct bch_extent_ptr)))
+ if (new_s.k && old_s.k &&
+ new_s.v->nr_blocks == old_s.v->nr_blocks &&
+ new_s.v->nr_redundant == old_s.v->nr_redundant &&
+ !memcmp(old_s.v->ptrs, new_s.v->ptrs,
+ new_s.v->nr_blocks * sizeof(struct bch_extent_ptr)))
return 0;
- if (new_s) {
- unsigned nr_data = new_s->nr_blocks - new_s->nr_redundant;
- s64 sectors = le16_to_cpu(new_s->sectors);
+ if (new_s.k) {
+ s64 sectors = le16_to_cpu(new_s.v->sectors);
bch2_bkey_to_replicas(&r.e, new);
- update_replicas_list(trans, &r.e, sectors * new_s->nr_redundant);
+ update_replicas_list(trans, &r.e, sectors * new_s.v->nr_redundant);
- for (i = 0; i < new_s->nr_blocks; i++) {
- bool parity = i >= nr_data;
-
- ret = bch2_trans_mark_stripe_alloc_ref(trans,
- &new_s->ptrs[i], sectors, parity);
+ for (i = 0; i < new_s.v->nr_blocks; i++) {
+ ret = bch2_trans_mark_stripe_alloc_ref(trans, new_s,
+ i, false);
if (ret)
return ret;
}
}
- if (old_s) {
- unsigned nr_data = old_s->nr_blocks - old_s->nr_redundant;
- s64 sectors = -((s64) le16_to_cpu(old_s->sectors));
+ if (old_s.k) {
+ s64 sectors = -((s64) le16_to_cpu(old_s.v->sectors));
bch2_bkey_to_replicas(&r.e, old);
- update_replicas_list(trans, &r.e, sectors * old_s->nr_redundant);
-
- for (i = 0; i < old_s->nr_blocks; i++) {
- bool parity = i >= nr_data;
+ update_replicas_list(trans, &r.e, sectors * old_s.v->nr_redundant);
- ret = bch2_trans_mark_stripe_alloc_ref(trans,
- &old_s->ptrs[i], sectors, parity);
+ for (i = 0; i < old_s.v->nr_blocks; i++) {
+ ret = bch2_trans_mark_stripe_alloc_ref(trans, old_s,
+ i, true);
if (ret)
return ret;
}
@@ -1898,8 +1847,6 @@ static int __bch2_trans_mark_reflink_p(struct btree_trans *trans,
}
bch2_btree_iter_set_pos(iter, bkey_start_pos(k.k));
- BUG_ON(iter->uptodate > BTREE_ITER_NEED_PEEK);
-
bch2_trans_update(trans, iter, n, 0);
out:
ret = sectors;
@@ -2025,15 +1972,13 @@ int bch2_trans_mark_update(struct btree_trans *trans,
BTREE_TRIGGER_OVERWRITE|flags);
}
} else {
- struct btree *b = iter_l(iter)->b;
- struct btree_node_iter node_iter = iter_l(iter)->iter;
- struct bkey_packed *_old;
- struct bkey unpacked;
+ struct btree_iter *copy;
+ struct bkey _old;
EBUG_ON(btree_iter_type(iter) == BTREE_ITER_CACHED);
- bkey_init(&unpacked);
- old = (struct bkey_s_c) { &unpacked, NULL };
+ bkey_init(&_old);
+ old = (struct bkey_s_c) { &_old, NULL };
ret = bch2_trans_mark_key(trans, old, bkey_i_to_s_c(new),
0, new->k.size,
@@ -2041,18 +1986,16 @@ int bch2_trans_mark_update(struct btree_trans *trans,
if (ret)
return ret;
- while ((_old = bch2_btree_node_iter_peek(&node_iter, b))) {
- unsigned flags = BTREE_TRIGGER_OVERWRITE;
- unsigned offset = 0;
- s64 sectors;
+ copy = bch2_trans_copy_iter(trans, iter);
- old = bkey_disassemble(b, _old, &unpacked);
- sectors = -((s64) old.k->size);
+ for_each_btree_key_continue(copy, 0, old, ret) {
+ unsigned offset = 0;
+ s64 sectors = -((s64) old.k->size);
flags |= BTREE_TRIGGER_OVERWRITE;
if (bkey_cmp(new->k.p, bkey_start_pos(old.k)) <= 0)
- return 0;
+ break;
switch (bch2_extent_overlap(&new->k, old.k)) {
case BCH_EXTENT_OVERLAP_ALL:
@@ -2083,15 +2026,169 @@ int bch2_trans_mark_update(struct btree_trans *trans,
ret = bch2_trans_mark_key(trans, old, bkey_i_to_s_c(new),
offset, sectors, flags);
if (ret)
- return ret;
-
- bch2_btree_node_iter_advance(&node_iter, b);
+ break;
}
+ bch2_trans_iter_put(trans, copy);
}
return ret;
}
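+/*
+ * Transactionally mark one bucket as holding metadata of the given type:
+ * read the bucket's alloc key, check that the existing data type and sector
+ * count are sane, then write back the updated alloc key.
+ */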
+static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans,
+ struct bch_dev *ca, size_t b,
+ enum bch_data_type type,
+ unsigned sectors)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_iter *iter;
+ struct bkey_alloc_unpacked u;
+ struct bkey_alloc_buf *a;
+ struct bch_extent_ptr ptr = {
+ .dev = ca->dev_idx,
+ .offset = bucket_to_sector(ca, b),
+ };
+ int ret = 0;
+
+ a = bch2_trans_start_alloc_update(trans, &iter, &ptr, &u);
+ if (IS_ERR(a))
+ return PTR_ERR(a);
+
+ if (u.data_type && u.data_type != type) {
+ bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
+ "bucket %llu:%llu gen %u different types of data in same bucket: %s, %s\n"
+ "while marking %s",
+ iter->pos.inode, iter->pos.offset, u.gen,
+ bch2_data_types[u.data_type],
+ bch2_data_types[type],
+ bch2_data_types[type]);
+ ret = -EIO;
+ goto out;
+ }
+
+ if ((unsigned) (u.dirty_sectors + sectors) > ca->mi.bucket_size) {
+ bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
+ "bucket %llu:%llu gen %u data type %s sector count overflow: %u + %u > %u\n"
+ "while marking %s",
+ iter->pos.inode, iter->pos.offset, u.gen,
+ bch2_data_types[u.data_type ?: type],
+ u.dirty_sectors, sectors, ca->mi.bucket_size,
+ bch2_data_types[type]);
+ ret = -EIO;
+ goto out;
+ }
+
+ if (u.data_type == type &&
+ u.dirty_sectors == sectors)
+ goto out;
+
+ u.data_type = type;
+ u.dirty_sectors = sectors;
+
+ bch2_alloc_pack(c, a, u);
+ bch2_trans_update(trans, iter, &a->k, 0);
+out:
+ bch2_trans_iter_put(trans, iter);
+ return ret;
+}
+
+int bch2_trans_mark_metadata_bucket(struct btree_trans *trans,
+ struct disk_reservation *res,
+ struct bch_dev *ca, size_t b,
+ enum bch_data_type type,
+ unsigned sectors)
+{
+	return __bch2_trans_do(trans, res, NULL, 0,
+			__bch2_trans_mark_metadata_bucket(trans, ca, b,
+							  type, sectors));
+}
+
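+/*
+ * Walk a range of sectors, accumulating how many land in each bucket; when we
+ * cross into a new bucket, flush the accumulated count for the previous one
+ * via bch2_trans_mark_metadata_bucket():
+ */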
+static int bch2_trans_mark_metadata_sectors(struct btree_trans *trans,
+ struct disk_reservation *res,
+ struct bch_dev *ca,
+ u64 start, u64 end,
+ enum bch_data_type type,
+ u64 *bucket, unsigned *bucket_sectors)
+{
+	int ret = 0;
+
+ do {
+ u64 b = sector_to_bucket(ca, start);
+ unsigned sectors =
+ min_t(u64, bucket_to_sector(ca, b + 1), end) - start;
+
+ if (b != *bucket) {
+ if (*bucket_sectors) {
+ ret = bch2_trans_mark_metadata_bucket(trans, res, ca,
+ *bucket, type, *bucket_sectors);
+ if (ret)
+ return ret;
+ }
+
+ *bucket = b;
+ *bucket_sectors = 0;
+ }
+
+ *bucket_sectors += sectors;
+ start += sectors;
+ } while (!ret && start < end);
+
+ return 0;
+}
+
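+/*
+ * Mark all the metadata covered by the superblock layout: each superblock
+ * copy (plus sectors 0..BCH_SB_SECTOR when the default location is in the
+ * layout) and every journal bucket:
+ */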
+static int __bch2_trans_mark_dev_sb(struct btree_trans *trans,
+ struct disk_reservation *res,
+ struct bch_dev *ca)
+{
+ struct bch_sb_layout *layout = &ca->disk_sb.sb->layout;
+ u64 bucket = 0;
+ unsigned i, bucket_sectors = 0;
+ int ret;
+
+ for (i = 0; i < layout->nr_superblocks; i++) {
+ u64 offset = le64_to_cpu(layout->sb_offset[i]);
+
+ if (offset == BCH_SB_SECTOR) {
+ ret = bch2_trans_mark_metadata_sectors(trans, res, ca,
+ 0, BCH_SB_SECTOR,
+ BCH_DATA_sb, &bucket, &bucket_sectors);
+ if (ret)
+ return ret;
+ }
+
+ ret = bch2_trans_mark_metadata_sectors(trans, res, ca, offset,
+ offset + (1 << layout->sb_max_size_bits),
+ BCH_DATA_sb, &bucket, &bucket_sectors);
+ if (ret)
+ return ret;
+ }
+
+ if (bucket_sectors) {
+ ret = bch2_trans_mark_metadata_bucket(trans, res, ca,
+ bucket, BCH_DATA_sb, bucket_sectors);
+ if (ret)
+ return ret;
+ }
+
+ for (i = 0; i < ca->journal.nr; i++) {
+ ret = bch2_trans_mark_metadata_bucket(trans, res, ca,
+ ca->journal.buckets[i],
+ BCH_DATA_journal, ca->mi.bucket_size);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
+int bch2_trans_mark_dev_sb(struct bch_fs *c,
+ struct disk_reservation *res,
+ struct bch_dev *ca)
+{
+ return bch2_trans_do(c, res, NULL, 0,
+ __bch2_trans_mark_dev_sb(&trans, res, ca));
+}
+
/* Disk reservations: */
void __bch2_disk_reservation_put(struct bch_fs *c, struct disk_reservation *res)
@@ -2107,7 +2204,7 @@ void __bch2_disk_reservation_put(struct bch_fs *c, struct disk_reservation *res)
#define SECTORS_CACHE 1024
int bch2_disk_reservation_add(struct bch_fs *c, struct disk_reservation *res,
- unsigned sectors, int flags)
+ u64 sectors, int flags)
{
struct bch_fs_pcpu *pcpu;
u64 old, v, get;
@@ -2192,7 +2289,7 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
ca->mi.bucket_size / c->opts.btree_node_size);
/* XXX: these should be tunable */
size_t reserve_none = max_t(size_t, 1, nbuckets >> 9);
- size_t copygc_reserve = max_t(size_t, 2, nbuckets >> 7);
+ size_t copygc_reserve = max_t(size_t, 2, nbuckets >> 6);
size_t free_inc_nr = max(max_t(size_t, 1, nbuckets >> 12),
btree_reserve * 2);
bool resize = ca->buckets[0] != NULL;
@@ -2209,7 +2306,6 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
!(buckets_nouse = kvpmalloc(BITS_TO_LONGS(nbuckets) *
sizeof(unsigned long),
GFP_KERNEL|__GFP_ZERO)) ||
- !init_fifo(&free[RESERVE_BTREE], btree_reserve, GFP_KERNEL) ||
!init_fifo(&free[RESERVE_MOVINGGC],
copygc_reserve, GFP_KERNEL) ||
!init_fifo(&free[RESERVE_NONE], reserve_none, GFP_KERNEL) ||
@@ -2296,13 +2392,24 @@ void bch2_dev_buckets_free(struct bch_dev *ca)
sizeof(struct bucket_array) +
ca->mi.nbuckets * sizeof(struct bucket));
- free_percpu(ca->usage[0]);
+ for (i = 0; i < ARRAY_SIZE(ca->usage); i++)
+ free_percpu(ca->usage[i]);
+ kfree(ca->usage_base);
}
int bch2_dev_buckets_alloc(struct bch_fs *c, struct bch_dev *ca)
{
- if (!(ca->usage[0] = alloc_percpu(struct bch_dev_usage)))
+ unsigned i;
+
+ ca->usage_base = kzalloc(sizeof(struct bch_dev_usage), GFP_KERNEL);
+ if (!ca->usage_base)
return -ENOMEM;
+ for (i = 0; i < ARRAY_SIZE(ca->usage); i++) {
+ ca->usage[i] = alloc_percpu(struct bch_dev_usage);
+ if (!ca->usage[i])
+ return -ENOMEM;
+ }
+
	return bch2_dev_buckets_resize(c, ca, ca->mi.nbuckets);
}
diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h
index 3a5ed1fcaf78..6d15c455e7cc 100644
--- a/fs/bcachefs/buckets.h
+++ b/fs/bcachefs/buckets.h
@@ -58,20 +58,13 @@ static inline struct bucket *bucket(struct bch_dev *ca, size_t b)
return __bucket(ca, b, false);
}
-static inline u16 bucket_last_io(struct bch_fs *c, struct bucket *g, int rw)
-{
- return c->bucket_clock[rw].hand - g->io_time[rw];
-}
-
/*
* bucket_gc_gen() returns the difference between the bucket's current gen and
* the oldest gen of any pointer into that bucket in the btree.
*/
-static inline u8 bucket_gc_gen(struct bch_dev *ca, size_t b)
+static inline u8 bucket_gc_gen(struct bucket *g)
{
- struct bucket *g = bucket(ca, b);
-
return g->mark.gen - g->oldest_gen;
}
@@ -153,18 +146,9 @@ static inline unsigned bucket_sectors_used(struct bucket_mark mark)
return mark.dirty_sectors + mark.cached_sectors;
}
-static inline bool bucket_unused(struct bucket_mark mark)
-{
- return !mark.owned_by_allocator &&
- !mark.data_type &&
- !bucket_sectors_used(mark);
-}
-
static inline bool is_available_bucket(struct bucket_mark mark)
{
- return (!mark.owned_by_allocator &&
- !mark.dirty_sectors &&
- !mark.stripe);
+ return !mark.dirty_sectors && !mark.stripe;
}
static inline bool bucket_needs_journal_commit(struct bucket_mark m,
@@ -178,8 +162,6 @@ static inline bool bucket_needs_journal_commit(struct bucket_mark m,
struct bch_dev_usage bch2_dev_usage_read(struct bch_dev *);
-void bch2_dev_usage_from_buckets(struct bch_fs *);
-
static inline u64 __dev_buckets_available(struct bch_dev *ca,
struct bch_dev_usage stats)
{
@@ -223,6 +205,11 @@ static inline unsigned fs_usage_u64s(struct bch_fs *c)
READ_ONCE(c->replicas.nr);
}
+static inline unsigned dev_usage_u64s(void)
+{
+ return sizeof(struct bch_dev_usage) / sizeof(u64);
+}
+
void bch2_fs_usage_scratch_put(struct bch_fs *, struct bch_fs_usage *);
struct bch_fs_usage *bch2_fs_usage_scratch_get(struct bch_fs *);
@@ -245,8 +232,6 @@ bch2_fs_usage_read_short(struct bch_fs *);
void bch2_bucket_seq_cleanup(struct bch_fs *);
void bch2_fs_usage_initialize(struct bch_fs *);
-void bch2_invalidate_bucket(struct bch_fs *, struct bch_dev *,
- size_t, struct bucket_mark *);
void bch2_mark_alloc_bucket(struct bch_fs *, struct bch_dev *,
size_t, bool, struct gc_pos, unsigned);
void bch2_mark_metadata_bucket(struct bch_fs *, struct bch_dev *,
@@ -270,6 +255,12 @@ int bch2_trans_mark_update(struct btree_trans *, struct btree_iter *iter,
struct bkey_i *insert, unsigned);
void bch2_trans_fs_usage_apply(struct btree_trans *, struct bch_fs_usage *);
+int bch2_trans_mark_metadata_bucket(struct btree_trans *,
+ struct disk_reservation *, struct bch_dev *,
+ size_t, enum bch_data_type, unsigned);
+int bch2_trans_mark_dev_sb(struct bch_fs *, struct disk_reservation *,
+ struct bch_dev *);
+
/* disk reservations: */
void __bch2_disk_reservation_put(struct bch_fs *, struct disk_reservation *);
@@ -284,8 +275,8 @@ static inline void bch2_disk_reservation_put(struct bch_fs *c,
#define BCH_DISK_RESERVATION_NOFAIL (1 << 0)
int bch2_disk_reservation_add(struct bch_fs *,
- struct disk_reservation *,
- unsigned, int);
+ struct disk_reservation *,
+ u64, int);
static inline struct disk_reservation
bch2_disk_reservation_init(struct bch_fs *c, unsigned nr_replicas)
@@ -302,8 +293,7 @@ bch2_disk_reservation_init(struct bch_fs *c, unsigned nr_replicas)
static inline int bch2_disk_reservation_get(struct bch_fs *c,
struct disk_reservation *res,
- unsigned sectors,
- unsigned nr_replicas,
+ u64 sectors, unsigned nr_replicas,
int flags)
{
*res = bch2_disk_reservation_init(c, nr_replicas);
diff --git a/fs/bcachefs/buckets_types.h b/fs/bcachefs/buckets_types.h
index d6057d22b18e..404c89a7a264 100644
--- a/fs/bcachefs/buckets_types.h
+++ b/fs/bcachefs/buckets_types.h
@@ -37,11 +37,12 @@ struct bucket {
const struct bucket_mark mark;
};
- u16 io_time[2];
+ u64 io_time[2];
u8 oldest_gen;
u8 gc_gen;
unsigned gen_valid:1;
- u8 ec_redundancy;
+ u8 stripe_redundancy;
+ u32 stripe;
};
struct bucket_array {
@@ -52,16 +53,15 @@ struct bucket_array {
};
struct bch_dev_usage {
- u64 buckets[BCH_DATA_NR];
u64 buckets_alloc;
+ u64 buckets_ec;
u64 buckets_unavailable;
- /* _compressed_ sectors: */
- u64 sectors[BCH_DATA_NR];
- u64 sectors_fragmented;
-
- u64 buckets_ec;
- u64 sectors_ec;
+ struct {
+ u64 buckets;
+ u64 sectors; /* _compressed_ sectors: */
+ u64 fragmented;
+ } d[BCH_DATA_NR];
};
struct bch_fs_usage {
diff --git a/fs/bcachefs/chardev.c b/fs/bcachefs/chardev.c
index e7c8969aaad1..49842ec88390 100644
--- a/fs/bcachefs/chardev.c
+++ b/fs/bcachefs/chardev.c
@@ -477,11 +477,11 @@ static long bch2_ioctl_dev_usage(struct bch_fs *c,
arg.nr_buckets = ca->mi.nbuckets - ca->mi.first_bucket;
arg.available_buckets = arg.nr_buckets - src.buckets_unavailable;
arg.ec_buckets = src.buckets_ec;
- arg.ec_sectors = src.sectors_ec;
+ arg.ec_sectors = 0;
for (i = 0; i < BCH_DATA_NR; i++) {
- arg.buckets[i] = src.buckets[i];
- arg.sectors[i] = src.sectors[i];
+ arg.buckets[i] = src.d[i].buckets;
+ arg.sectors[i] = src.d[i].sectors;
}
percpu_ref_put(&ca->ref);
diff --git a/fs/bcachefs/checksum.c b/fs/bcachefs/checksum.c
index a01073e54a33..3d88719ba86c 100644
--- a/fs/bcachefs/checksum.c
+++ b/fs/bcachefs/checksum.c
@@ -10,7 +10,7 @@
#include <linux/random.h>
#include <linux/scatterlist.h>
#include <crypto/algapi.h>
-#include <crypto/chacha20.h>
+#include <crypto/chacha.h>
#include <crypto/hash.h>
#include <crypto/poly1305.h>
#include <crypto/skcipher.h>
@@ -68,21 +68,21 @@ static u64 bch2_checksum_update(unsigned type, u64 crc, const void *data, size_t
}
}
-static inline void do_encrypt_sg(struct crypto_skcipher *tfm,
+static inline void do_encrypt_sg(struct crypto_sync_skcipher *tfm,
struct nonce nonce,
struct scatterlist *sg, size_t len)
{
- SKCIPHER_REQUEST_ON_STACK(req, tfm);
+ SYNC_SKCIPHER_REQUEST_ON_STACK(req, tfm);
int ret;
- skcipher_request_set_tfm(req, tfm);
+ skcipher_request_set_sync_tfm(req, tfm);
skcipher_request_set_crypt(req, sg, sg, len, nonce.d);
ret = crypto_skcipher_encrypt(req);
BUG_ON(ret);
}
-static inline void do_encrypt(struct crypto_skcipher *tfm,
+static inline void do_encrypt(struct crypto_sync_skcipher *tfm,
struct nonce nonce,
void *buf, size_t len)
{
@@ -95,8 +95,8 @@ static inline void do_encrypt(struct crypto_skcipher *tfm,
int bch2_chacha_encrypt_key(struct bch_key *key, struct nonce nonce,
void *buf, size_t len)
{
- struct crypto_skcipher *chacha20 =
- crypto_alloc_skcipher("chacha20", 0, 0);
+ struct crypto_sync_skcipher *chacha20 =
+ crypto_alloc_sync_skcipher("chacha20", 0, 0);
int ret;
if (!chacha20) {
@@ -104,7 +104,8 @@ int bch2_chacha_encrypt_key(struct bch_key *key, struct nonce nonce,
return PTR_ERR(chacha20);
}
- ret = crypto_skcipher_setkey(chacha20, (void *) key, sizeof(*key));
+ ret = crypto_skcipher_setkey(&chacha20->base,
+ (void *) key, sizeof(*key));
if (ret) {
pr_err("crypto_skcipher_setkey() error: %i", ret);
goto err;
@@ -112,7 +113,7 @@ int bch2_chacha_encrypt_key(struct bch_key *key, struct nonce nonce,
do_encrypt(chacha20, nonce, buf, len);
err:
- crypto_free_skcipher(chacha20);
+ crypto_free_sync_skcipher(chacha20);
return ret;
}
@@ -199,7 +200,7 @@ static struct bch_csum __bch2_checksum_bio(struct bch_fs *c, unsigned type,
kunmap_atomic(p);
}
#else
- __bio_for_each_contig_segment(bv, bio, *iter, *iter)
+ __bio_for_each_bvec(bv, bio, *iter, *iter)
crc = bch2_checksum_update(type, crc,
page_address(bv.bv_page) + bv.bv_offset,
bv.bv_len);
@@ -224,7 +225,7 @@ static struct bch_csum __bch2_checksum_bio(struct bch_fs *c, unsigned type,
kunmap_atomic(p);
}
#else
- __bio_for_each_contig_segment(bv, bio, *iter, *iter)
+ __bio_for_each_bvec(bv, bio, *iter, *iter)
crypto_shash_update(desc,
page_address(bv.bv_page) + bv.bv_offset,
bv.bv_len);
@@ -463,7 +464,7 @@ err:
static int bch2_alloc_ciphers(struct bch_fs *c)
{
if (!c->chacha20)
- c->chacha20 = crypto_alloc_skcipher("chacha20", 0, 0);
+ c->chacha20 = crypto_alloc_sync_skcipher("chacha20", 0, 0);
if (IS_ERR(c->chacha20)) {
bch_err(c, "error requesting chacha20 module: %li",
PTR_ERR(c->chacha20));
@@ -546,7 +547,7 @@ int bch2_enable_encryption(struct bch_fs *c, bool keyed)
goto err;
}
- ret = crypto_skcipher_setkey(c->chacha20,
+ ret = crypto_skcipher_setkey(&c->chacha20->base,
(void *) &key.key, sizeof(key.key));
if (ret)
goto err;
@@ -574,7 +575,7 @@ void bch2_fs_encryption_exit(struct bch_fs *c)
if (!IS_ERR_OR_NULL(c->poly1305))
crypto_free_shash(c->poly1305);
if (!IS_ERR_OR_NULL(c->chacha20))
- crypto_free_skcipher(c->chacha20);
+ crypto_free_sync_skcipher(c->chacha20);
if (!IS_ERR_OR_NULL(c->sha256))
crypto_free_shash(c->sha256);
}
@@ -606,7 +607,7 @@ int bch2_fs_encryption_init(struct bch_fs *c)
if (ret)
goto out;
- ret = crypto_skcipher_setkey(c->chacha20,
+ ret = crypto_skcipher_setkey(&c->chacha20->base,
(void *) &key.key, sizeof(key.key));
if (ret)
goto out;
diff --git a/fs/bcachefs/checksum.h b/fs/bcachefs/checksum.h
index 833537cc8fd0..24dee8039d57 100644
--- a/fs/bcachefs/checksum.h
+++ b/fs/bcachefs/checksum.h
@@ -7,7 +7,7 @@
#include "super-io.h"
#include <linux/crc64.h>
-#include <crypto/chacha20.h>
+#include <crypto/chacha.h>
static inline bool bch2_checksum_mergeable(unsigned type)
{
@@ -138,9 +138,9 @@ static inline bool bch2_crc_cmp(struct bch_csum l, struct bch_csum r)
/* for skipping ahead and encrypting/decrypting at an offset: */
static inline struct nonce nonce_add(struct nonce nonce, unsigned offset)
{
- EBUG_ON(offset & (CHACHA20_BLOCK_SIZE - 1));
+ EBUG_ON(offset & (CHACHA_BLOCK_SIZE - 1));
- le32_add_cpu(&nonce.d[0], offset / CHACHA20_BLOCK_SIZE);
+ le32_add_cpu(&nonce.d[0], offset / CHACHA_BLOCK_SIZE);
return nonce;
}
diff --git a/fs/bcachefs/clock.c b/fs/bcachefs/clock.c
index 1d1590de55e8..4324cfe7eed0 100644
--- a/fs/bcachefs/clock.c
+++ b/fs/bcachefs/clock.c
@@ -19,7 +19,7 @@ void bch2_io_timer_add(struct io_clock *clock, struct io_timer *timer)
spin_lock(&clock->timer_lock);
- if (time_after_eq((unsigned long) atomic_long_read(&clock->now),
+ if (time_after_eq((unsigned long) atomic64_read(&clock->now),
timer->expire)) {
spin_unlock(&clock->timer_lock);
timer->fn(timer);
@@ -146,7 +146,7 @@ static struct io_timer *get_expired_timer(struct io_clock *clock,
void __bch2_increment_clock(struct io_clock *clock, unsigned sectors)
{
struct io_timer *timer;
- unsigned long now = atomic_long_add_return(sectors, &clock->now);
+ unsigned long now = atomic64_add_return(sectors, &clock->now);
while ((timer = get_expired_timer(clock, now)))
timer->fn(timer);
@@ -158,7 +158,7 @@ void bch2_io_timers_to_text(struct printbuf *out, struct io_clock *clock)
unsigned i;
spin_lock(&clock->timer_lock);
- now = atomic_long_read(&clock->now);
+ now = atomic64_read(&clock->now);
for (i = 0; i < clock->timers.used; i++)
pr_buf(out, "%ps:\t%li\n",
@@ -175,7 +175,7 @@ void bch2_io_clock_exit(struct io_clock *clock)
int bch2_io_clock_init(struct io_clock *clock)
{
- atomic_long_set(&clock->now, 0);
+ atomic64_set(&clock->now, 0);
spin_lock_init(&clock->timer_lock);
clock->max_slop = IO_CLOCK_PCPU_SECTORS * num_possible_cpus();
diff --git a/fs/bcachefs/clock_types.h b/fs/bcachefs/clock_types.h
index 92c740a47565..5fae0012d808 100644
--- a/fs/bcachefs/clock_types.h
+++ b/fs/bcachefs/clock_types.h
@@ -26,7 +26,7 @@ struct io_timer {
typedef HEAP(struct io_timer *) io_timer_heap;
struct io_clock {
- atomic_long_t now;
+ atomic64_t now;
u16 __percpu *pcpu_buf;
unsigned max_slop;
diff --git a/fs/bcachefs/compress.c b/fs/bcachefs/compress.c
index 0d68a277cfd7..f63651d291e5 100644
--- a/fs/bcachefs/compress.c
+++ b/fs/bcachefs/compress.c
@@ -45,7 +45,7 @@ static bool bio_phys_contig(struct bio *bio, struct bvec_iter start)
struct bvec_iter iter;
void *expected_start = NULL;
- __bio_for_each_segment(bv, bio, iter, start) {
+ __bio_for_each_bvec(bv, bio, iter, start) {
if (expected_start &&
expected_start != page_address(bv.bv_page) + bv.bv_offset)
return false;
@@ -336,8 +336,19 @@ static int attempt_compress(struct bch_fs *c,
ZSTD_CCtx *ctx = ZSTD_initCCtx(workspace,
ZSTD_CCtxWorkspaceBound(c->zstd_params.cParams));
+ /*
+ * ZSTD requires that when we decompress we pass in the exact
+ * compressed size - rounding it up to the nearest sector
+ * doesn't work, so we use the first 4 bytes of the buffer for
+ * that.
+ *
+ * Additionally, the ZSTD code seems to have a bug where it will
+ * write just past the end of the buffer - so subtract a fudge
+ * factor (7 bytes) from the dst buffer size to account for
+ * that.
+ */
size_t len = ZSTD_compressCCtx(ctx,
- dst + 4, dst_len - 4,
+ dst + 4, dst_len - 4 - 7,
src, src_len,
c->zstd_params);
if (ZSTD_isError(len))
diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c
index bbe3fefa2651..06dbca32e189 100644
--- a/fs/bcachefs/debug.c
+++ b/fs/bcachefs/debug.c
@@ -79,7 +79,7 @@ void __bch2_btree_verify(struct bch_fs *c, struct btree *b)
memcpy(n_ondisk, n_sorted, btree_bytes(c));
- if (bch2_btree_node_read_done(c, v, false))
+ if (bch2_btree_node_read_done(c, ca, v, false))
goto out;
n_sorted = c->verify_data->data;
diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c
index eb03adc2d533..ec871b5eb992 100644
--- a/fs/bcachefs/ec.c
+++ b/fs/bcachefs/ec.c
@@ -4,7 +4,7 @@
#include "bcachefs.h"
#include "alloc_foreground.h"
-#include "bkey_on_stack.h"
+#include "bkey_buf.h"
#include "bset.h"
#include "btree_gc.h"
#include "btree_update.h"
@@ -105,6 +105,9 @@ const char *bch2_stripe_invalid(const struct bch_fs *c, struct bkey_s_c k)
{
const struct bch_stripe *s = bkey_s_c_to_stripe(k).v;
+ if (!bkey_cmp(k.k->p, POS_MIN))
+ return "stripe at pos 0";
+
if (k.k->p.inode)
return "invalid stripe key";
@@ -138,44 +141,18 @@ void bch2_stripe_to_text(struct printbuf *out, struct bch_fs *c,
stripe_blockcount_get(s, i));
}
-static int ptr_matches_stripe(struct bch_fs *c,
- struct bch_stripe *v,
- const struct bch_extent_ptr *ptr)
+/* returns blocknr in stripe that we matched: */
+static int bkey_matches_stripe(struct bch_stripe *s,
+ struct bkey_s_c k)
{
- unsigned i;
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+ const struct bch_extent_ptr *ptr;
+ unsigned i, nr_data = s->nr_blocks - s->nr_redundant;
- for (i = 0; i < v->nr_blocks - v->nr_redundant; i++) {
- const struct bch_extent_ptr *ptr2 = v->ptrs + i;
-
- if (ptr->dev == ptr2->dev &&
- ptr->gen == ptr2->gen &&
- ptr->offset >= ptr2->offset &&
- ptr->offset < ptr2->offset + le16_to_cpu(v->sectors))
- return i;
- }
-
- return -1;
-}
-
-static int extent_matches_stripe(struct bch_fs *c,
- struct bch_stripe *v,
- struct bkey_s_c k)
-{
-
- switch (k.k->type) {
- case KEY_TYPE_extent: {
- struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
- const struct bch_extent_ptr *ptr;
- int idx;
-
- extent_for_each_ptr(e, ptr) {
- idx = ptr_matches_stripe(c, v, ptr);
- if (idx >= 0)
- return idx;
- }
- break;
- }
- }
+ bkey_for_each_ptr(ptrs, ptr)
+ for (i = 0; i < nr_data; i++)
+ if (__bch2_ptr_matches_stripe(s, ptr, i))
+ return i;
return -1;
}
@@ -200,46 +177,95 @@ static bool extent_has_stripe_ptr(struct bkey_s_c k, u64 idx)
return false;
}
+/* Stripe bufs: */
+
+static void ec_stripe_buf_exit(struct ec_stripe_buf *buf)
+{
+ unsigned i;
+
+ for (i = 0; i < buf->key.v.nr_blocks; i++) {
+ kvpfree(buf->data[i], buf->size << 9);
+ buf->data[i] = NULL;
+ }
+}
+
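+/*
+ * Allocate per-block buffers covering [offset, offset + size) of the stripe,
+ * rounded out to the stripe's checksum granularity:
+ */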
+static int ec_stripe_buf_init(struct ec_stripe_buf *buf,
+ unsigned offset, unsigned size)
+{
+ struct bch_stripe *v = &buf->key.v;
+ unsigned csum_granularity = 1U << v->csum_granularity_bits;
+ unsigned end = offset + size;
+ unsigned i;
+
+ BUG_ON(end > le16_to_cpu(v->sectors));
+
+ offset = round_down(offset, csum_granularity);
+ end = min_t(unsigned, le16_to_cpu(v->sectors),
+ round_up(end, csum_granularity));
+
+ buf->offset = offset;
+ buf->size = end - offset;
+
+ memset(buf->valid, 0xFF, sizeof(buf->valid));
+
+ for (i = 0; i < buf->key.v.nr_blocks; i++) {
+ buf->data[i] = kvpmalloc(buf->size << 9, GFP_KERNEL);
+ if (!buf->data[i])
+ goto err;
+ }
+
+ return 0;
+err:
+ ec_stripe_buf_exit(buf);
+ return -ENOMEM;
+}
+
/* Checksumming: */
-static void ec_generate_checksums(struct ec_stripe_buf *buf)
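+/* Checksum one checksum-granularity chunk of a single block in the buffer: */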
+static struct bch_csum ec_block_checksum(struct ec_stripe_buf *buf,
+ unsigned block, unsigned offset)
{
struct bch_stripe *v = &buf->key.v;
unsigned csum_granularity = 1 << v->csum_granularity_bits;
- unsigned csums_per_device = stripe_csums_per_device(v);
- unsigned csum_bytes = bch_crc_bytes[v->csum_type];
- unsigned i, j;
+ unsigned end = buf->offset + buf->size;
+ unsigned len = min(csum_granularity, end - offset);
+
+ BUG_ON(offset >= end);
+ BUG_ON(offset < buf->offset);
+ BUG_ON(offset & (csum_granularity - 1));
+ BUG_ON(offset + len != le16_to_cpu(v->sectors) &&
+ (len & (csum_granularity - 1)));
+
+ return bch2_checksum(NULL, v->csum_type,
+ null_nonce(),
+ buf->data[block] + ((offset - buf->offset) << 9),
+ len << 9);
+}
- if (!csum_bytes)
+static void ec_generate_checksums(struct ec_stripe_buf *buf)
+{
+ struct bch_stripe *v = &buf->key.v;
+ unsigned i, j, csums_per_device = stripe_csums_per_device(v);
+
+ if (!v->csum_type)
return;
BUG_ON(buf->offset);
BUG_ON(buf->size != le16_to_cpu(v->sectors));
- for (i = 0; i < v->nr_blocks; i++) {
- for (j = 0; j < csums_per_device; j++) {
- unsigned offset = j << v->csum_granularity_bits;
- unsigned len = min(csum_granularity, buf->size - offset);
-
- struct bch_csum csum =
- bch2_checksum(NULL, v->csum_type,
- null_nonce(),
- buf->data[i] + (offset << 9),
- len << 9);
-
- memcpy(stripe_csum(v, i, j), &csum, csum_bytes);
- }
- }
+ for (i = 0; i < v->nr_blocks; i++)
+ for (j = 0; j < csums_per_device; j++)
+ stripe_csum_set(v, i, j,
+ ec_block_checksum(buf, i, j << v->csum_granularity_bits));
}
static void ec_validate_checksums(struct bch_fs *c, struct ec_stripe_buf *buf)
{
struct bch_stripe *v = &buf->key.v;
unsigned csum_granularity = 1 << v->csum_granularity_bits;
- unsigned csum_bytes = bch_crc_bytes[v->csum_type];
unsigned i;
- if (!csum_bytes)
+ if (!v->csum_type)
return;
for (i = 0; i < v->nr_blocks; i++) {
@@ -252,21 +278,18 @@ static void ec_validate_checksums(struct bch_fs *c, struct ec_stripe_buf *buf)
while (offset < end) {
unsigned j = offset >> v->csum_granularity_bits;
unsigned len = min(csum_granularity, end - offset);
- struct bch_csum csum;
+ struct bch_csum want = stripe_csum_get(v, i, j);
+ struct bch_csum got = ec_block_checksum(buf, i, offset);
- BUG_ON(offset & (csum_granularity - 1));
- BUG_ON(offset + len != le16_to_cpu(v->sectors) &&
- ((offset + len) & (csum_granularity - 1)));
+ if (bch2_crc_cmp(want, got)) {
+ char buf2[200];
- csum = bch2_checksum(NULL, v->csum_type,
- null_nonce(),
- buf->data[i] + ((offset - buf->offset) << 9),
- len << 9);
+ bch2_bkey_val_to_text(&PBUF(buf2), c, bkey_i_to_s_c(&buf->key.k_i));
- if (memcmp(stripe_csum(v, i, j), &csum, csum_bytes)) {
bch_err_ratelimited(c,
- "checksum error while doing reconstruct read (%u:%u)",
- i, j);
+ "stripe checksum error for %ps at %u:%u: csum type %u, expected %llx got %llx\n%s",
+ (void *) _RET_IP_, i, j, v->csum_type,
+ want.lo, got.lo, buf2);
clear_bit(i, buf->valid);
break;
}
@@ -287,20 +310,16 @@ static void ec_generate_ec(struct ec_stripe_buf *buf)
raid_gen(nr_data, v->nr_redundant, bytes, buf->data);
}
-static unsigned __ec_nr_failed(struct ec_stripe_buf *buf, unsigned nr)
-{
- return nr - bitmap_weight(buf->valid, nr);
-}
-
static unsigned ec_nr_failed(struct ec_stripe_buf *buf)
{
- return __ec_nr_failed(buf, buf->key.v.nr_blocks);
+ return buf->key.v.nr_blocks -
+ bitmap_weight(buf->valid, buf->key.v.nr_blocks);
}
static int ec_do_recov(struct bch_fs *c, struct ec_stripe_buf *buf)
{
struct bch_stripe *v = &buf->key.v;
- unsigned i, failed[EC_STRIPE_MAX], nr_failed = 0;
+ unsigned i, failed[BCH_BKEY_PTRS_MAX], nr_failed = 0;
unsigned nr_data = v->nr_blocks - v->nr_redundant;
unsigned bytes = buf->size << 9;
@@ -323,6 +342,8 @@ static int ec_do_recov(struct bch_fs *c, struct ec_stripe_buf *buf)
static void ec_block_endio(struct bio *bio)
{
struct ec_bio *ec_bio = container_of(bio, struct ec_bio, bio);
+ struct bch_stripe *v = &ec_bio->buf->key.v;
+ struct bch_extent_ptr *ptr = &v->ptrs[ec_bio->idx];
struct bch_dev *ca = ec_bio->ca;
struct closure *cl = bio->bi_private;
@@ -331,6 +352,13 @@ static void ec_block_endio(struct bio *bio)
bch2_blk_status_to_str(bio->bi_status)))
clear_bit(ec_bio->idx, ec_bio->buf->valid);
+ if (ptr_stale(ca, ptr)) {
+ bch_err_ratelimited(ca->fs,
+ "error %s stripe: stale pointer after io",
+ bio_data_dir(bio) == READ ? "reading from" : "writing to");
+ clear_bit(ec_bio->idx, ec_bio->buf->valid);
+ }
+
bio_put(&ec_bio->bio);
percpu_ref_put(&ca->io_ref);
closure_put(cl);
@@ -347,6 +375,14 @@ static void ec_block_io(struct bch_fs *c, struct ec_stripe_buf *buf,
? BCH_DATA_user
: BCH_DATA_parity;
+ if (ptr_stale(ca, ptr)) {
+ bch_err_ratelimited(c,
+ "error %s stripe: stale pointer",
+ rw == READ ? "reading from" : "writing to");
+ clear_bit(idx, buf->valid);
+ return;
+ }
+
if (!bch2_dev_get_ioref(ca, rw)) {
clear_bit(idx, buf->valid);
return;
@@ -389,87 +425,77 @@ static void ec_block_io(struct bch_fs *c, struct ec_stripe_buf *buf,
percpu_ref_put(&ca->io_ref);
}
-/* recovery read path: */
-int bch2_ec_read_extent(struct bch_fs *c, struct bch_read_bio *rbio)
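+/* Read the stripe key at index @idx into @stripe: */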
+static int get_stripe_key(struct bch_fs *c, u64 idx, struct ec_stripe_buf *stripe)
{
struct btree_trans trans;
struct btree_iter *iter;
+ struct bkey_s_c k;
+ int ret;
+
+ bch2_trans_init(&trans, c, 0, 0);
+ iter = bch2_trans_get_iter(&trans, BTREE_ID_EC, POS(0, idx), BTREE_ITER_SLOTS);
+ k = bch2_btree_iter_peek_slot(iter);
+ ret = bkey_err(k);
+ if (ret)
+ goto err;
+ if (k.k->type != KEY_TYPE_stripe) {
+ ret = -ENOENT;
+ goto err;
+ }
+ bkey_reassemble(&stripe->key.k_i, k);
+err:
+ bch2_trans_exit(&trans);
+ return ret;
+}
+
+/* recovery read path: */
+int bch2_ec_read_extent(struct bch_fs *c, struct bch_read_bio *rbio)
+{
struct ec_stripe_buf *buf;
struct closure cl;
- struct bkey_s_c k;
struct bch_stripe *v;
- unsigned stripe_idx;
- unsigned offset, end;
- unsigned i, nr_data, csum_granularity;
- int ret = 0, idx;
+ unsigned i, offset;
+ int ret = 0;
closure_init_stack(&cl);
BUG_ON(!rbio->pick.has_ec);
- stripe_idx = rbio->pick.ec.idx;
-
buf = kzalloc(sizeof(*buf), GFP_NOIO);
if (!buf)
return -ENOMEM;
- bch2_trans_init(&trans, c, 0, 0);
-
- iter = bch2_trans_get_iter(&trans, BTREE_ID_EC,
- POS(0, stripe_idx),
- BTREE_ITER_SLOTS);
- k = bch2_btree_iter_peek_slot(iter);
- if (bkey_err(k) || k.k->type != KEY_TYPE_stripe) {
+ ret = get_stripe_key(c, rbio->pick.ec.idx, buf);
+ if (ret) {
bch_err_ratelimited(c,
- "error doing reconstruct read: stripe not found");
+ "error doing reconstruct read: error %i looking up stripe", ret);
kfree(buf);
- return bch2_trans_exit(&trans) ?: -EIO;
+ return -EIO;
}
- bkey_reassemble(&buf->key.k_i, k);
- bch2_trans_exit(&trans);
-
v = &buf->key.v;
- nr_data = v->nr_blocks - v->nr_redundant;
-
- idx = ptr_matches_stripe(c, v, &rbio->pick.ptr);
- BUG_ON(idx < 0);
-
- csum_granularity = 1U << v->csum_granularity_bits;
-
- offset = rbio->bio.bi_iter.bi_sector - v->ptrs[idx].offset;
- end = offset + bio_sectors(&rbio->bio);
-
- BUG_ON(end > le16_to_cpu(v->sectors));
-
- buf->offset = round_down(offset, csum_granularity);
- buf->size = min_t(unsigned, le16_to_cpu(v->sectors),
- round_up(end, csum_granularity)) - buf->offset;
-
- for (i = 0; i < v->nr_blocks; i++) {
- buf->data[i] = kmalloc(buf->size << 9, GFP_NOIO);
- if (!buf->data[i]) {
- ret = -ENOMEM;
- goto err;
- }
+ if (!bch2_ptr_matches_stripe(v, rbio->pick)) {
+ bch_err_ratelimited(c,
+ "error doing reconstruct read: pointer doesn't match stripe");
+ ret = -EIO;
+ goto err;
}
- memset(buf->valid, 0xFF, sizeof(buf->valid));
-
- for (i = 0; i < v->nr_blocks; i++) {
- struct bch_extent_ptr *ptr = v->ptrs + i;
- struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
+ offset = rbio->bio.bi_iter.bi_sector - v->ptrs[rbio->pick.ec.block].offset;
+ if (offset + bio_sectors(&rbio->bio) > le16_to_cpu(v->sectors)) {
+ bch_err_ratelimited(c,
+ "error doing reconstruct read: read is bigger than stripe");
+ ret = -EIO;
+ goto err;
+ }
- if (ptr_stale(ca, ptr)) {
- bch_err_ratelimited(c,
- "error doing reconstruct read: stale pointer");
- clear_bit(i, buf->valid);
- continue;
- }
+ ret = ec_stripe_buf_init(buf, offset, bio_sectors(&rbio->bio));
+ if (ret)
+ goto err;
+ for (i = 0; i < v->nr_blocks; i++)
ec_block_io(c, buf, REQ_OP_READ, i, &cl);
- }
closure_sync(&cl);
@@ -487,10 +513,9 @@ int bch2_ec_read_extent(struct bch_fs *c, struct bch_read_bio *rbio)
goto err;
memcpy_to_bio(&rbio->bio, rbio->bio.bi_iter,
- buf->data[idx] + ((offset - buf->offset) << 9));
+ buf->data[rbio->pick.ec.block] + ((offset - buf->offset) << 9));
err:
- for (i = 0; i < v->nr_blocks; i++)
- kfree(buf->data[i]);
+ ec_stripe_buf_exit(buf);
kfree(buf);
return ret;
}
@@ -643,7 +668,6 @@ void bch2_stripes_heap_update(struct bch_fs *c,
static int ec_stripe_delete(struct bch_fs *c, size_t idx)
{
- //pr_info("deleting stripe %zu", idx);
return bch2_btree_delete_range(c, BTREE_ID_EC,
POS(0, idx),
POS(0, idx + 1),
@@ -675,13 +699,14 @@ static void ec_stripe_delete_work(struct work_struct *work)
/* stripe creation: */
static int ec_stripe_bkey_insert(struct bch_fs *c,
- struct ec_stripe_new *s,
- struct bkey_i_stripe *stripe)
+ struct bkey_i_stripe *stripe,
+ struct disk_reservation *res)
{
struct btree_trans trans;
struct btree_iter *iter;
struct bkey_s_c k;
- struct bpos start_pos = POS(0, c->ec_stripe_hint);
+ struct bpos min_pos = POS(0, 1);
+ struct bpos start_pos = bpos_max(min_pos, POS(0, c->ec_stripe_hint));
int ret;
bch2_trans_init(&trans, c, 0, 0);
@@ -692,7 +717,7 @@ retry:
BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) {
if (bkey_cmp(k.k->p, POS(0, U32_MAX)) > 0) {
if (start_pos.offset) {
- start_pos = POS_MIN;
+ start_pos = min_pos;
bch2_btree_iter_set_pos(iter, start_pos);
continue;
}
@@ -717,7 +742,7 @@ found_slot:
bch2_trans_update(&trans, iter, &stripe->k_i, 0);
- ret = bch2_trans_commit(&trans, &s->res, NULL,
+ ret = bch2_trans_commit(&trans, res, NULL,
BTREE_INSERT_NOFAIL);
err:
bch2_trans_iter_put(&trans, iter);
@@ -731,6 +756,46 @@ err:
return ret;
}
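+/*
+ * Update an existing stripe key, carrying over the per-block sector counts
+ * from the version currently in the btree:
+ */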
+static int ec_stripe_bkey_update(struct btree_trans *trans,
+ struct bkey_i_stripe *new)
+{
+ struct btree_iter *iter;
+ struct bkey_s_c k;
+ const struct bch_stripe *existing;
+ unsigned i;
+ int ret;
+
+ iter = bch2_trans_get_iter(trans, BTREE_ID_EC,
+ new->k.p, BTREE_ITER_INTENT);
+ k = bch2_btree_iter_peek_slot(iter);
+ ret = bkey_err(k);
+ if (ret)
+ goto err;
+
+ if (!k.k || k.k->type != KEY_TYPE_stripe) {
+ bch_err(trans->c, "error updating stripe: not found");
+ ret = -ENOENT;
+ goto err;
+ }
+
+ existing = bkey_s_c_to_stripe(k).v;
+
+ if (existing->nr_blocks != new->v.nr_blocks) {
+ bch_err(trans->c, "error updating stripe: nr_blocks does not match");
+ ret = -EINVAL;
+ goto err;
+ }
+
+ for (i = 0; i < new->v.nr_blocks; i++)
+ stripe_blockcount_set(&new->v, i,
+ stripe_blockcount_get(existing, i));
+
+ bch2_trans_update(trans, iter, &new->k_i, 0);
+err:
+ bch2_trans_iter_put(trans, iter);
+ return ret;
+}
+
static void extent_stripe_ptr_add(struct bkey_s_extent e,
struct ec_stripe_buf *s,
struct bch_extent_ptr *ptr,
@@ -745,6 +810,7 @@ static void extent_stripe_ptr_add(struct bkey_s_extent e,
*dst = (struct bch_extent_stripe_ptr) {
.type = 1 << BCH_EXTENT_ENTRY_stripe_ptr,
.block = block,
+ .redundancy = s->key.v.nr_redundant,
.idx = s->key.k.p.offset,
};
}
@@ -757,10 +823,10 @@ static int ec_stripe_update_ptrs(struct bch_fs *c,
struct btree_iter *iter;
struct bkey_s_c k;
struct bkey_s_extent e;
- struct bkey_on_stack sk;
- int ret = 0, dev, idx;
+ struct bkey_buf sk;
+ int ret = 0, dev, block;
- bkey_on_stack_init(&sk);
+ bch2_bkey_buf_init(&sk);
bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
/* XXX this doesn't support the reflink btree */
@@ -779,29 +845,28 @@ static int ec_stripe_update_ptrs(struct bch_fs *c,
continue;
}
- idx = extent_matches_stripe(c, &s->key.v, k);
- if (idx < 0) {
+ block = bkey_matches_stripe(&s->key.v, k);
+ if (block < 0) {
bch2_btree_iter_next(iter);
continue;
}
- dev = s->key.v.ptrs[idx].dev;
+ dev = s->key.v.ptrs[block].dev;
- bkey_on_stack_reassemble(&sk, c, k);
+ bch2_bkey_buf_reassemble(&sk, c, k);
e = bkey_i_to_s_extent(sk.k);
bch2_bkey_drop_ptrs(e.s, ptr, ptr->dev != dev);
ec_ptr = (void *) bch2_bkey_has_device(e.s_c, dev);
BUG_ON(!ec_ptr);
- extent_stripe_ptr_add(e, s, ec_ptr, idx);
+ extent_stripe_ptr_add(e, s, ec_ptr, block);
bch2_btree_iter_set_pos(iter, bkey_start_pos(&sk.k->k));
bch2_trans_update(&trans, iter, sk.k, 0);
ret = bch2_trans_commit(&trans, NULL, NULL,
- BTREE_INSERT_NOFAIL|
- BTREE_INSERT_USE_RESERVE);
+ BTREE_INSERT_NOFAIL);
if (ret == -EINTR)
ret = 0;
if (ret)
@@ -809,7 +874,7 @@ static int ec_stripe_update_ptrs(struct bch_fs *c,
}
bch2_trans_exit(&trans);
- bkey_on_stack_exit(&sk, c);
+ bch2_bkey_buf_exit(&sk, c);
return ret;
}
@@ -823,14 +888,13 @@ static void ec_stripe_create(struct ec_stripe_new *s)
struct open_bucket *ob;
struct bkey_i *k;
struct stripe *m;
- struct bch_stripe *v = &s->stripe.key.v;
+ struct bch_stripe *v = &s->new_stripe.key.v;
unsigned i, nr_data = v->nr_blocks - v->nr_redundant;
- struct closure cl;
int ret;
BUG_ON(s->h->s == s);
- closure_init_stack(&cl);
+ closure_sync(&s->iodone);
if (s->err) {
if (s->err != -EROFS)
@@ -838,41 +902,52 @@ static void ec_stripe_create(struct ec_stripe_new *s)
goto err;
}
+ if (s->have_existing_stripe) {
+ ec_validate_checksums(c, &s->existing_stripe);
+
+ if (ec_do_recov(c, &s->existing_stripe)) {
+ bch_err(c, "error creating stripe: error reading existing stripe");
+ goto err;
+ }
+
+ for (i = 0; i < nr_data; i++)
+ if (stripe_blockcount_get(&s->existing_stripe.key.v, i))
+ swap(s->new_stripe.data[i],
+ s->existing_stripe.data[i]);
+
+ ec_stripe_buf_exit(&s->existing_stripe);
+ }
+
BUG_ON(!s->allocated);
if (!percpu_ref_tryget(&c->writes))
goto err;
- BUG_ON(bitmap_weight(s->blocks_allocated,
- s->blocks.nr) != s->blocks.nr);
-
- ec_generate_ec(&s->stripe);
+ ec_generate_ec(&s->new_stripe);
- ec_generate_checksums(&s->stripe);
+ ec_generate_checksums(&s->new_stripe);
/* write p/q: */
for (i = nr_data; i < v->nr_blocks; i++)
- ec_block_io(c, &s->stripe, REQ_OP_WRITE, i, &cl);
-
- closure_sync(&cl);
+ ec_block_io(c, &s->new_stripe, REQ_OP_WRITE, i, &s->iodone);
+ closure_sync(&s->iodone);
- for (i = nr_data; i < v->nr_blocks; i++)
- if (!test_bit(i, s->stripe.valid)) {
- bch_err(c, "error creating stripe: error writing redundancy buckets");
- goto err_put_writes;
- }
+ if (ec_nr_failed(&s->new_stripe)) {
+ bch_err(c, "error creating stripe: error writing redundancy buckets");
+ goto err_put_writes;
+ }
- ret = s->existing_stripe
- ? bch2_btree_insert(c, BTREE_ID_EC, &s->stripe.key.k_i,
- &s->res, NULL, BTREE_INSERT_NOFAIL)
- : ec_stripe_bkey_insert(c, s, &s->stripe.key);
+ ret = s->have_existing_stripe
+ ? bch2_trans_do(c, &s->res, NULL, BTREE_INSERT_NOFAIL,
+ ec_stripe_bkey_update(&trans, &s->new_stripe.key))
+ : ec_stripe_bkey_insert(c, &s->new_stripe.key, &s->res);
if (ret) {
bch_err(c, "error creating stripe: error creating stripe key");
goto err_put_writes;
}
for_each_keylist_key(&s->keys, k) {
- ret = ec_stripe_update_ptrs(c, &s->stripe, &k->k);
+ ret = ec_stripe_update_ptrs(c, &s->new_stripe, &k->k);
if (ret) {
bch_err(c, "error creating stripe: error %i updating pointers", ret);
break;
@@ -880,31 +955,33 @@ static void ec_stripe_create(struct ec_stripe_new *s)
}
spin_lock(&c->ec_stripes_heap_lock);
- m = genradix_ptr(&c->stripes[0], s->stripe.key.k.p.offset);
-#if 0
- pr_info("created a %s stripe %llu",
- s->existing_stripe ? "existing" : "new",
- s->stripe.key.k.p.offset);
-#endif
+ m = genradix_ptr(&c->stripes[0], s->new_stripe.key.k.p.offset);
+
BUG_ON(m->on_heap);
- bch2_stripes_heap_insert(c, m, s->stripe.key.k.p.offset);
+ bch2_stripes_heap_insert(c, m, s->new_stripe.key.k.p.offset);
spin_unlock(&c->ec_stripes_heap_lock);
err_put_writes:
percpu_ref_put(&c->writes);
err:
bch2_disk_reservation_put(c, &s->res);
- open_bucket_for_each(c, &s->blocks, ob, i) {
- ob->ec = NULL;
- __bch2_open_bucket_put(c, ob);
- }
-
- bch2_open_buckets_put(c, &s->parity);
+ for (i = 0; i < v->nr_blocks; i++)
+ if (s->blocks[i]) {
+ ob = c->open_buckets + s->blocks[i];
+
+ if (i < nr_data) {
+ ob->ec = NULL;
+ __bch2_open_bucket_put(c, ob);
+ } else {
+ bch2_open_bucket_put(c, ob);
+ }
+ }
bch2_keylist_free(&s->keys, s->inline_keys);
- for (i = 0; i < s->stripe.key.v.nr_blocks; i++)
- kvpfree(s->stripe.data[i], s->stripe.size << 9);
+ ec_stripe_buf_exit(&s->existing_stripe);
+ ec_stripe_buf_exit(&s->new_stripe);
+ closure_debug_destroy(&s->iodone);
kfree(s);
}
@@ -981,7 +1058,7 @@ void *bch2_writepoint_ec_buf(struct bch_fs *c, struct write_point *wp)
ca = bch_dev_bkey_exists(c, ob->ptr.dev);
offset = ca->mi.bucket_size - ob->sectors_free;
- return ob->ec->stripe.data[ob->ec_idx] + (offset << 9);
+ return ob->ec->new_stripe.data[ob->ec_idx] + (offset << 9);
}
void bch2_ec_add_backpointer(struct bch_fs *c, struct write_point *wp,
@@ -993,8 +1070,6 @@ void bch2_ec_add_backpointer(struct bch_fs *c, struct write_point *wp,
if (!ob)
return;
- //pr_info("adding backpointer at %llu:%llu", pos.inode, pos.offset);
-
ec = ob->ec;
mutex_lock(&ec->lock);
@@ -1088,7 +1163,6 @@ static void ec_stripe_key_init(struct bch_fs *c,
static int ec_new_stripe_alloc(struct bch_fs *c, struct ec_stripe_head *h)
{
struct ec_stripe_new *s;
- unsigned i;
lockdep_assert_held(&h->lock);
@@ -1097,41 +1171,27 @@ static int ec_new_stripe_alloc(struct bch_fs *c, struct ec_stripe_head *h)
return -ENOMEM;
mutex_init(&s->lock);
+ closure_init(&s->iodone, NULL);
atomic_set(&s->pin, 1);
s->c = c;
s->h = h;
s->nr_data = min_t(unsigned, h->nr_active_devs,
- EC_STRIPE_MAX) - h->redundancy;
+ BCH_BKEY_PTRS_MAX) - h->redundancy;
s->nr_parity = h->redundancy;
bch2_keylist_init(&s->keys, s->inline_keys);
- s->stripe.offset = 0;
- s->stripe.size = h->blocksize;
- memset(s->stripe.valid, 0xFF, sizeof(s->stripe.valid));
-
- ec_stripe_key_init(c, &s->stripe.key, s->nr_data,
+ ec_stripe_key_init(c, &s->new_stripe.key, s->nr_data,
s->nr_parity, h->blocksize);
- for (i = 0; i < s->stripe.key.v.nr_blocks; i++) {
- s->stripe.data[i] = kvpmalloc(s->stripe.size << 9, GFP_KERNEL);
- if (!s->stripe.data[i])
- goto err;
- }
-
h->s = s;
-
return 0;
-err:
- for (i = 0; i < s->stripe.key.v.nr_blocks; i++)
- kvpfree(s->stripe.data[i], s->stripe.size << 9);
- kfree(s);
- return -ENOMEM;
}
static struct ec_stripe_head *
ec_new_stripe_head_alloc(struct bch_fs *c, unsigned target,
- unsigned algo, unsigned redundancy)
+ unsigned algo, unsigned redundancy,
+ bool copygc)
{
struct ec_stripe_head *h;
struct bch_dev *ca;
@@ -1147,6 +1207,7 @@ ec_new_stripe_head_alloc(struct bch_fs *c, unsigned target,
h->target = target;
h->algo = algo;
h->redundancy = redundancy;
+ h->copygc = copygc;
rcu_read_lock();
h->devs = target_rw_devs(c, BCH_DATA_user, target);
@@ -1171,16 +1232,17 @@ void bch2_ec_stripe_head_put(struct bch_fs *c, struct ec_stripe_head *h)
if (h->s &&
h->s->allocated &&
bitmap_weight(h->s->blocks_allocated,
- h->s->blocks.nr) == h->s->blocks.nr)
+ h->s->nr_data) == h->s->nr_data)
ec_stripe_set_pending(c, h);
mutex_unlock(&h->lock);
}
struct ec_stripe_head *__bch2_ec_stripe_head_get(struct bch_fs *c,
- unsigned target,
- unsigned algo,
- unsigned redundancy)
+ unsigned target,
+ unsigned algo,
+ unsigned redundancy,
+ bool copygc)
{
struct ec_stripe_head *h;
@@ -1191,76 +1253,98 @@ struct ec_stripe_head *__bch2_ec_stripe_head_get(struct bch_fs *c,
list_for_each_entry(h, &c->ec_stripe_head_list, list)
if (h->target == target &&
h->algo == algo &&
- h->redundancy == redundancy) {
+ h->redundancy == redundancy &&
+ h->copygc == copygc) {
mutex_lock(&h->lock);
goto found;
}
- h = ec_new_stripe_head_alloc(c, target, algo, redundancy);
+ h = ec_new_stripe_head_alloc(c, target, algo, redundancy, copygc);
found:
mutex_unlock(&c->ec_stripe_head_lock);
return h;
}
-/*
- * XXX: use a higher watermark for allocating open buckets here:
- */
-static int new_stripe_alloc_buckets(struct bch_fs *c, struct ec_stripe_head *h)
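+/*
+ * Allocate the data and parity buckets we still need for this stripe,
+ * skipping devices that already back one of the blocks we have:
+ */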
+static enum bucket_alloc_ret
+new_stripe_alloc_buckets(struct bch_fs *c, struct ec_stripe_head *h,
+ struct closure *cl)
{
- struct bch_devs_mask devs;
+ struct bch_devs_mask devs = h->devs;
struct open_bucket *ob;
- unsigned i, nr_have, nr_data =
- min_t(unsigned, h->nr_active_devs,
- EC_STRIPE_MAX) - h->redundancy;
+ struct open_buckets buckets;
+ unsigned i, j, nr_have_parity = 0, nr_have_data = 0;
bool have_cache = true;
- int ret = 0;
-
- devs = h->devs;
-
- for_each_set_bit(i, h->s->blocks_allocated, EC_STRIPE_MAX) {
- __clear_bit(h->s->stripe.key.v.ptrs[i].dev, devs.d);
- --nr_data;
+ enum bucket_alloc_ret ret = ALLOC_SUCCESS;
+
+ for (i = 0; i < h->s->new_stripe.key.v.nr_blocks; i++) {
+ if (test_bit(i, h->s->blocks_gotten)) {
+ __clear_bit(h->s->new_stripe.key.v.ptrs[i].dev, devs.d);
+ if (i < h->s->nr_data)
+ nr_have_data++;
+ else
+ nr_have_parity++;
+ }
}
- BUG_ON(h->s->blocks.nr > nr_data);
- BUG_ON(h->s->parity.nr > h->redundancy);
-
- open_bucket_for_each(c, &h->s->parity, ob, i)
- __clear_bit(ob->ptr.dev, devs.d);
- open_bucket_for_each(c, &h->s->blocks, ob, i)
- __clear_bit(ob->ptr.dev, devs.d);
+ BUG_ON(nr_have_data > h->s->nr_data);
+ BUG_ON(nr_have_parity > h->s->nr_parity);
percpu_down_read(&c->mark_lock);
rcu_read_lock();
- if (h->s->parity.nr < h->redundancy) {
- nr_have = h->s->parity.nr;
-
- ret = bch2_bucket_alloc_set(c, &h->s->parity,
+ buckets.nr = 0;
+ if (nr_have_parity < h->s->nr_parity) {
+ ret = bch2_bucket_alloc_set(c, &buckets,
&h->parity_stripe,
&devs,
- h->redundancy,
- &nr_have,
+ h->s->nr_parity,
+ &nr_have_parity,
&have_cache,
- RESERVE_NONE,
+ h->copygc
+ ? RESERVE_MOVINGGC
+ : RESERVE_NONE,
0,
- NULL);
+ cl);
+
+ open_bucket_for_each(c, &buckets, ob, i) {
+ j = find_next_zero_bit(h->s->blocks_gotten,
+ h->s->nr_data + h->s->nr_parity,
+ h->s->nr_data);
+ BUG_ON(j >= h->s->nr_data + h->s->nr_parity);
+
+ h->s->blocks[j] = buckets.v[i];
+ h->s->new_stripe.key.v.ptrs[j] = ob->ptr;
+ __set_bit(j, h->s->blocks_gotten);
+ }
+
if (ret)
goto err;
}
- if (h->s->blocks.nr < nr_data) {
- nr_have = h->s->blocks.nr;
-
- ret = bch2_bucket_alloc_set(c, &h->s->blocks,
+ buckets.nr = 0;
+ if (nr_have_data < h->s->nr_data) {
+ ret = bch2_bucket_alloc_set(c, &buckets,
&h->block_stripe,
&devs,
- nr_data,
- &nr_have,
+ h->s->nr_data,
+ &nr_have_data,
&have_cache,
- RESERVE_NONE,
+ h->copygc
+ ? RESERVE_MOVINGGC
+ : RESERVE_NONE,
0,
- NULL);
+ cl);
+
+ open_bucket_for_each(c, &buckets, ob, i) {
+ j = find_next_zero_bit(h->s->blocks_gotten,
+ h->s->nr_data, 0);
+ BUG_ON(j >= h->s->nr_data);
+
+ h->s->blocks[j] = buckets.v[i];
+ h->s->new_stripe.key.v.ptrs[j] = ob->ptr;
+ __set_bit(j, h->s->blocks_gotten);
+ }
+
if (ret)
goto err;
}
@@ -1272,53 +1356,101 @@ err:
/* XXX: doesn't obey target: */
static s64 get_existing_stripe(struct bch_fs *c,
- unsigned target,
- unsigned algo,
- unsigned redundancy)
+ struct ec_stripe_head *head)
{
ec_stripes_heap *h = &c->ec_stripes_heap;
struct stripe *m;
size_t heap_idx;
u64 stripe_idx;
+ s64 ret = -1;
if (may_create_new_stripe(c))
return -1;
spin_lock(&c->ec_stripes_heap_lock);
for (heap_idx = 0; heap_idx < h->used; heap_idx++) {
+ /* No blocks worth reusing, stripe will just be deleted: */
if (!h->data[heap_idx].blocks_nonempty)
continue;
stripe_idx = h->data[heap_idx].idx;
m = genradix_ptr(&c->stripes[0], stripe_idx);
- if (m->algorithm == algo &&
- m->nr_redundant == redundancy &&
+ if (m->algorithm == head->algo &&
+ m->nr_redundant == head->redundancy &&
+ m->sectors == head->blocksize &&
m->blocks_nonempty < m->nr_blocks - m->nr_redundant) {
bch2_stripes_heap_del(c, m, stripe_idx);
- spin_unlock(&c->ec_stripes_heap_lock);
- return stripe_idx;
+ ret = stripe_idx;
+ break;
}
}
-
spin_unlock(&c->ec_stripes_heap_lock);
- return -1;
+ return ret;
}
-static int get_stripe_key(struct bch_fs *c, u64 idx, struct ec_stripe_buf *stripe)
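+/*
+ * Reuse an existing stripe that still has empty blocks: read it in, mark the
+ * blocks that still hold data as allocated, and start the new stripe from its
+ * key:
+ */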
+static int __bch2_ec_stripe_head_reuse(struct bch_fs *c,
+ struct ec_stripe_head *h)
{
- struct btree_trans trans;
- struct btree_iter *iter;
- struct bkey_s_c k;
+ unsigned i;
+ s64 idx;
int ret;
- bch2_trans_init(&trans, c, 0, 0);
- iter = bch2_trans_get_iter(&trans, BTREE_ID_EC, POS(0, idx), BTREE_ITER_SLOTS);
- k = bch2_btree_iter_peek_slot(iter);
- ret = bkey_err(k);
- if (!ret)
- bkey_reassemble(&stripe->key.k_i, k);
- bch2_trans_exit(&trans);
+ idx = get_existing_stripe(c, h);
+ if (idx < 0) {
+ bch_err(c, "failed to find an existing stripe");
+ return -ENOSPC;
+ }
+
+ h->s->have_existing_stripe = true;
+ ret = get_stripe_key(c, idx, &h->s->existing_stripe);
+ if (ret) {
+ bch2_fs_fatal_error(c, "error reading stripe key: %i", ret);
+ return ret;
+ }
+
+ if (ec_stripe_buf_init(&h->s->existing_stripe, 0, h->blocksize)) {
+		/*
+		 * This is a problem: we've already deleted this
+		 * stripe from the stripes heap
+		 */
+ BUG();
+ }
+
+ BUG_ON(h->s->existing_stripe.size != h->blocksize);
+ BUG_ON(h->s->existing_stripe.size != h->s->existing_stripe.key.v.sectors);
+
+ for (i = 0; i < h->s->existing_stripe.key.v.nr_blocks; i++) {
+ if (stripe_blockcount_get(&h->s->existing_stripe.key.v, i)) {
+ __set_bit(i, h->s->blocks_gotten);
+ __set_bit(i, h->s->blocks_allocated);
+ }
+
+ ec_block_io(c, &h->s->existing_stripe, READ, i, &h->s->iodone);
+ }
+
+ bkey_copy(&h->s->new_stripe.key.k_i,
+ &h->s->existing_stripe.key.k_i);
+
+ return 0;
+}
+
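+/* Reserve space for the parity blocks of a brand new stripe: */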
+static int __bch2_ec_stripe_head_reserve(struct bch_fs *c,
+ struct ec_stripe_head *h)
+{
+ int ret;
+
+ ret = bch2_disk_reservation_get(c, &h->s->res,
+ h->blocksize,
+ h->s->nr_parity, 0);
+
+ if (ret) {
+ /*
+ * This means we need to wait for copygc to
+ * empty out buckets from existing stripes:
+ */
+ bch_err(c, "failed to reserve stripe");
+ }
return ret;
}
@@ -1326,86 +1458,58 @@ static int get_stripe_key(struct bch_fs *c, u64 idx, struct ec_stripe_buf *strip
struct ec_stripe_head *bch2_ec_stripe_head_get(struct bch_fs *c,
unsigned target,
unsigned algo,
- unsigned redundancy)
+ unsigned redundancy,
+ bool copygc,
+ struct closure *cl)
{
- struct closure cl;
struct ec_stripe_head *h;
- struct open_bucket *ob;
- unsigned i, data_idx = 0;
- s64 idx;
int ret;
+ bool needs_stripe_new;
- closure_init_stack(&cl);
-
- h = __bch2_ec_stripe_head_get(c, target, algo, redundancy);
- if (!h)
+ h = __bch2_ec_stripe_head_get(c, target, algo, redundancy, copygc);
+ if (!h) {
+ bch_err(c, "no stripe head");
return NULL;
+ }
- if (!h->s) {
+ needs_stripe_new = !h->s;
+ if (needs_stripe_new) {
if (ec_new_stripe_alloc(c, h)) {
- bch2_ec_stripe_head_put(c, h);
- return NULL;
+ ret = -ENOMEM;
+ bch_err(c, "failed to allocate new stripe");
+ goto err;
}
- idx = get_existing_stripe(c, target, algo, redundancy);
- if (idx >= 0) {
- h->s->existing_stripe = true;
- h->s->existing_stripe_idx = idx;
- if (get_stripe_key(c, idx, &h->s->stripe)) {
- /* btree error */
- BUG();
- }
-
- for (i = 0; i < h->s->stripe.key.v.nr_blocks; i++)
- if (stripe_blockcount_get(&h->s->stripe.key.v, i)) {
- __set_bit(i, h->s->blocks_allocated);
- ec_block_io(c, &h->s->stripe, READ, i, &cl);
- }
- }
+ if (ec_stripe_buf_init(&h->s->new_stripe, 0, h->blocksize))
+ BUG();
}
- if (!h->s->allocated) {
- if (!h->s->existing_stripe &&
- !h->s->res.sectors) {
- ret = bch2_disk_reservation_get(c, &h->s->res,
- h->blocksize,
- h->s->nr_parity, 0);
- if (ret) {
- /* What should we do here? */
- bch_err(c, "unable to create new stripe: %i", ret);
- bch2_ec_stripe_head_put(c, h);
- h = NULL;
- goto out;
-
- }
-
- }
-
- if (new_stripe_alloc_buckets(c, h)) {
- bch2_ec_stripe_head_put(c, h);
- h = NULL;
- goto out;
- }
-
- open_bucket_for_each(c, &h->s->blocks, ob, i) {
- data_idx = find_next_zero_bit(h->s->blocks_allocated,
- h->s->nr_data, data_idx);
- BUG_ON(data_idx >= h->s->nr_data);
-
- h->s->stripe.key.v.ptrs[data_idx] = ob->ptr;
- h->s->data_block_idx[i] = data_idx;
- data_idx++;
- }
+	/*
+	 * Try to reserve a new stripe before reusing an
+	 * existing stripe: this prevents unnecessary
+	 * read amplification during write-oriented workloads.
+	 */
+ ret = 0;
+ if (!h->s->allocated && !h->s->res.sectors && !h->s->have_existing_stripe)
+ ret = __bch2_ec_stripe_head_reserve(c, h);
+ if (ret && needs_stripe_new)
+ ret = __bch2_ec_stripe_head_reuse(c, h);
+ if (ret)
+ goto err;
- open_bucket_for_each(c, &h->s->parity, ob, i)
- h->s->stripe.key.v.ptrs[h->s->nr_data + i] = ob->ptr;
+ if (!h->s->allocated) {
+ ret = new_stripe_alloc_buckets(c, h, cl);
+ if (ret)
+ goto err;
- //pr_info("new stripe, blocks_allocated %lx", h->s->blocks_allocated[0]);
h->s->allocated = true;
}
-out:
- closure_sync(&cl);
+
return h;
+
+err:
+ bch2_ec_stripe_head_put(c, h);
+ return ERR_PTR(-ret);
}
void bch2_ec_stop_dev(struct bch_fs *c, struct bch_dev *ca)
@@ -1421,12 +1525,14 @@ void bch2_ec_stop_dev(struct bch_fs *c, struct bch_dev *ca)
if (!h->s)
goto unlock;
- open_bucket_for_each(c, &h->s->blocks, ob, i)
- if (ob->ptr.dev == ca->dev_idx)
- goto found;
- open_bucket_for_each(c, &h->s->parity, ob, i)
+ for (i = 0; i < h->s->new_stripe.key.v.nr_blocks; i++) {
+ if (!h->s->blocks[i])
+ continue;
+
+ ob = c->open_buckets + h->s->blocks[i];
if (ob->ptr.dev == ca->dev_idx)
goto found;
+ }
goto unlock;
found:
h->s->err = -EROFS;
@@ -1437,13 +1543,23 @@ unlock:
mutex_unlock(&c->ec_stripe_head_lock);
}
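+/* Insert every live in-memory stripe into the stripes heap: */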
+void bch2_stripes_heap_start(struct bch_fs *c)
+{
+ struct genradix_iter iter;
+ struct stripe *m;
+
+ genradix_for_each(&c->stripes[0], iter, m)
+ if (m->alive)
+ bch2_stripes_heap_insert(c, m, iter.pos);
+}
+
static int __bch2_stripe_write_key(struct btree_trans *trans,
struct btree_iter *iter,
struct stripe *m,
size_t idx,
struct bkey_i_stripe *new_key)
{
- struct bch_fs *c = trans->c;
+ const struct bch_stripe *v;
struct bkey_s_c k;
unsigned i;
int ret;
@@ -1458,16 +1574,17 @@ static int __bch2_stripe_write_key(struct btree_trans *trans,
if (k.k->type != KEY_TYPE_stripe)
return -EIO;
+ v = bkey_s_c_to_stripe(k).v;
+ for (i = 0; i < v->nr_blocks; i++)
+ if (m->block_sectors[i] != stripe_blockcount_get(v, i))
+ goto write;
+ return 0;
+write:
bkey_reassemble(&new_key->k_i, k);
- spin_lock(&c->ec_stripes_heap_lock);
-
for (i = 0; i < new_key->v.nr_blocks; i++)
stripe_blockcount_set(&new_key->v, i,
m->block_sectors[i]);
- m->dirty = false;
-
- spin_unlock(&c->ec_stripes_heap_lock);
bch2_trans_update(trans, iter, &new_key->k_i, 0);
return 0;
@@ -1491,7 +1608,7 @@ int bch2_stripes_write(struct bch_fs *c, unsigned flags)
BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
genradix_for_each(&c->stripes[0], giter, m) {
- if (!m->dirty)
+ if (!m->alive)
continue;
ret = __bch2_trans_do(&trans, NULL, NULL,
@@ -1516,18 +1633,11 @@ static int bch2_stripes_read_fn(struct bch_fs *c, enum btree_id id,
int ret = 0;
if (k.k->type == KEY_TYPE_stripe) {
- struct stripe *m;
-
ret = __ec_stripe_mem_alloc(c, k.k->p.offset, GFP_KERNEL) ?:
bch2_mark_key(c, k, 0, 0, NULL, 0,
BTREE_TRIGGER_NOATOMIC);
if (ret)
return ret;
-
- spin_lock(&c->ec_stripes_heap_lock);
- m = genradix_ptr(&c->stripes[0], k.k->p.offset);
- bch2_stripes_heap_insert(c, m, k.k->p.offset);
- spin_unlock(&c->ec_stripes_heap_lock);
}
return ret;
@@ -1608,19 +1718,17 @@ void bch2_new_stripes_to_text(struct printbuf *out, struct bch_fs *c)
h->target, h->algo, h->redundancy);
if (h->s)
- pr_buf(out, "\tpending: blocks %u allocated %u\n",
- h->s->blocks.nr,
+ pr_buf(out, "\tpending: blocks %u+%u allocated %u\n",
+ h->s->nr_data, h->s->nr_parity,
bitmap_weight(h->s->blocks_allocated,
- h->s->blocks.nr));
+ h->s->nr_data));
}
mutex_unlock(&c->ec_stripe_head_lock);
mutex_lock(&c->ec_stripe_new_lock);
list_for_each_entry(s, &c->ec_stripe_new_list, list) {
- pr_buf(out, "\tin flight: blocks %u allocated %u pin %u\n",
- s->blocks.nr,
- bitmap_weight(s->blocks_allocated,
- s->blocks.nr),
+ pr_buf(out, "\tin flight: blocks %u+%u pin %u\n",
+ s->nr_data, s->nr_parity,
atomic_read(&s->pin));
}
mutex_unlock(&c->ec_stripe_new_lock);
diff --git a/fs/bcachefs/ec.h b/fs/bcachefs/ec.h
index 15f751fc2a35..765baa9d9264 100644
--- a/fs/bcachefs/ec.h
+++ b/fs/bcachefs/ec.h
@@ -60,9 +60,51 @@ static inline unsigned stripe_val_u64s(const struct bch_stripe *s)
}
static inline void *stripe_csum(struct bch_stripe *s,
- unsigned dev, unsigned csum_idx)
+ unsigned block, unsigned csum_idx)
{
- return (void *) s + stripe_csum_offset(s, dev, csum_idx);
+ EBUG_ON(block >= s->nr_blocks);
+ EBUG_ON(csum_idx >= stripe_csums_per_device(s));
+
+ return (void *) s + stripe_csum_offset(s, block, csum_idx);
+}
+
+static inline struct bch_csum stripe_csum_get(struct bch_stripe *s,
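+/* Read a block checksum out of its packed location in the stripe key: */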
+ unsigned block, unsigned csum_idx)
+{
+ struct bch_csum csum = { 0 };
+
+ memcpy(&csum, stripe_csum(s, block, csum_idx), bch_crc_bytes[s->csum_type]);
+ return csum;
+}
+
+static inline void stripe_csum_set(struct bch_stripe *s,
+ unsigned block, unsigned csum_idx,
+ struct bch_csum csum)
+{
+ memcpy(stripe_csum(s, block, csum_idx), &csum, bch_crc_bytes[s->csum_type]);
+}
+
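+/* Does @ptr point into data block @block of stripe @s? */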
+static inline bool __bch2_ptr_matches_stripe(const struct bch_stripe *s,
+ const struct bch_extent_ptr *ptr,
+ unsigned block)
+{
+ unsigned nr_data = s->nr_blocks - s->nr_redundant;
+
+ if (block >= nr_data)
+ return false;
+
+ return ptr->dev == s->ptrs[block].dev &&
+ ptr->gen == s->ptrs[block].gen &&
+ ptr->offset >= s->ptrs[block].offset &&
+ ptr->offset < s->ptrs[block].offset + le16_to_cpu(s->sectors);
+}
+
+static inline bool bch2_ptr_matches_stripe(const struct bch_stripe *s,
+ struct extent_ptr_decoded p)
+{
+ BUG_ON(!p.has_ec);
+
+ return __bch2_ptr_matches_stripe(s, &p.ptr, p.ec.block);
}
struct bch_read_bio;
@@ -71,9 +113,9 @@ struct ec_stripe_buf {
/* might not be buffering the entire stripe: */
unsigned offset;
unsigned size;
- unsigned long valid[BITS_TO_LONGS(EC_STRIPE_MAX)];
+ unsigned long valid[BITS_TO_LONGS(BCH_BKEY_PTRS_MAX)];
- void *data[EC_STRIPE_MAX];
+ void *data[BCH_BKEY_PTRS_MAX];
union {
struct bkey_i_stripe key;
@@ -88,6 +130,7 @@ struct ec_stripe_new {
struct ec_stripe_head *h;
struct mutex lock;
struct list_head list;
+ struct closure iodone;
/* counts in flight writes, stripe is created when pin == 0 */
atomic_t pin;
@@ -98,20 +141,18 @@ struct ec_stripe_new {
u8 nr_parity;
bool allocated;
bool pending;
- bool existing_stripe;
- u64 existing_stripe_idx;
+ bool have_existing_stripe;
- unsigned long blocks_allocated[BITS_TO_LONGS(EC_STRIPE_MAX)];
-
- struct open_buckets blocks;
- u8 data_block_idx[EC_STRIPE_MAX];
- struct open_buckets parity;
+ unsigned long blocks_gotten[BITS_TO_LONGS(BCH_BKEY_PTRS_MAX)];
+ unsigned long blocks_allocated[BITS_TO_LONGS(BCH_BKEY_PTRS_MAX)];
+ open_bucket_idx_t blocks[BCH_BKEY_PTRS_MAX];
struct disk_reservation res;
struct keylist keys;
u64 inline_keys[BKEY_U64s * 8];
- struct ec_stripe_buf stripe;
+ struct ec_stripe_buf new_stripe;
+ struct ec_stripe_buf existing_stripe;
};
struct ec_stripe_head {
@@ -121,6 +162,7 @@ struct ec_stripe_head {
unsigned target;
unsigned algo;
unsigned redundancy;
+ bool copygc;
struct bch_devs_mask devs;
unsigned nr_active_devs;
@@ -145,8 +187,8 @@ void bch2_ec_bucket_cancel(struct bch_fs *, struct open_bucket *);
int bch2_ec_stripe_new_alloc(struct bch_fs *, struct ec_stripe_head *);
void bch2_ec_stripe_head_put(struct bch_fs *, struct ec_stripe_head *);
-struct ec_stripe_head *bch2_ec_stripe_head_get(struct bch_fs *, unsigned,
- unsigned, unsigned);
+struct ec_stripe_head *bch2_ec_stripe_head_get(struct bch_fs *,
+ unsigned, unsigned, unsigned, bool, struct closure *);
void bch2_stripes_heap_update(struct bch_fs *, struct stripe *, size_t);
void bch2_stripes_heap_del(struct bch_fs *, struct stripe *, size_t);
@@ -156,6 +198,8 @@ void bch2_ec_stop_dev(struct bch_fs *, struct bch_dev *);
void bch2_ec_flush_new_stripes(struct bch_fs *);
+void bch2_stripes_heap_start(struct bch_fs *);
+
struct journal_keys;
int bch2_stripes_read(struct bch_fs *, struct journal_keys *);
int bch2_stripes_write(struct bch_fs *, unsigned);
diff --git a/fs/bcachefs/ec_types.h b/fs/bcachefs/ec_types.h
index e4d633fca5bf..847770166223 100644
--- a/fs/bcachefs/ec_types.h
+++ b/fs/bcachefs/ec_types.h
@@ -4,11 +4,9 @@
#include <linux/llist.h>
-#define EC_STRIPE_MAX 16
-
struct bch_replicas_padded {
struct bch_replicas_entry e;
- u8 pad[EC_STRIPE_MAX];
+ u8 pad[BCH_BKEY_PTRS_MAX];
};
struct stripe {
@@ -20,11 +18,10 @@ struct stripe {
u8 nr_blocks;
u8 nr_redundant;
- unsigned alive:1;
- unsigned dirty:1;
+ unsigned alive:1; /* does a corresponding key exist in stripes btree? */
unsigned on_heap:1;
u8 blocks_nonempty;
- u16 block_sectors[EC_STRIPE_MAX];
+ u16 block_sectors[BCH_BKEY_PTRS_MAX];
struct bch_replicas_padded r;
};
diff --git a/fs/bcachefs/extent_update.c b/fs/bcachefs/extent_update.c
index fd011df3cb99..16d2bca8a662 100644
--- a/fs/bcachefs/extent_update.c
+++ b/fs/bcachefs/extent_update.c
@@ -1,6 +1,5 @@
// SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h"
-#include "bkey_on_stack.h"
#include "btree_update.h"
#include "btree_update_interior.h"
#include "buckets.h"
@@ -100,24 +99,12 @@ int bch2_extent_atomic_end(struct btree_iter *iter,
struct bpos *end)
{
struct btree_trans *trans = iter->trans;
- struct btree *b;
- struct btree_node_iter node_iter;
- struct bkey_packed *_k;
- unsigned nr_iters = 0;
+ struct btree_iter *copy;
+ struct bkey_s_c k;
+ unsigned nr_iters = 0;
int ret;
- ret = bch2_btree_iter_traverse(iter);
- if (ret)
- return ret;
-
- b = iter->l[0].b;
- node_iter = iter->l[0].iter;
-
- BUG_ON(bkey_cmp(b->data->min_key, POS_MIN) &&
- bkey_cmp(bkey_start_pos(&insert->k),
- bkey_predecessor(b->data->min_key)) < 0);
-
- *end = bpos_min(insert->k.p, b->key.k.p);
+ *end = insert->k.p;
/* extent_update_to_keys(): */
nr_iters += 1;
@@ -127,9 +114,9 @@ int bch2_extent_atomic_end(struct btree_iter *iter,
if (ret < 0)
return ret;
- while ((_k = bch2_btree_node_iter_peek(&node_iter, b))) {
- struct bkey unpacked;
- struct bkey_s_c k = bkey_disassemble(b, _k, &unpacked);
+ copy = bch2_trans_copy_iter(trans, iter);
+
+ for_each_btree_key_continue(copy, 0, k, ret) {
unsigned offset = 0;
if (bkey_cmp(bkey_start_pos(k.k), *end) >= 0)
@@ -156,10 +143,9 @@ int bch2_extent_atomic_end(struct btree_iter *iter,
&nr_iters, EXTENT_ITERS_MAX);
if (ret)
break;
-
- bch2_btree_node_iter_advance(&node_iter, b);
}
+ bch2_trans_iter_put(trans, copy);
return ret < 0 ? ret : 0;
}
@@ -193,18 +179,13 @@ bch2_extent_can_insert(struct btree_trans *trans,
struct btree_iter *iter,
struct bkey_i *insert)
{
- struct btree_iter_level *l = &iter->l[0];
- struct btree_node_iter node_iter = l->iter;
- struct bkey_packed *_k;
struct bkey_s_c k;
- struct bkey unpacked;
- int sectors;
-
- _k = bch2_btree_node_iter_peek(&node_iter, l->b);
- if (!_k)
- return BTREE_INSERT_OK;
+ int ret, sectors;
- k = bkey_disassemble(l->b, _k, &unpacked);
+ k = bch2_btree_iter_peek_slot(iter);
+ ret = bkey_err(k);
+ if (ret)
+ return ret;
/* Check if we're splitting a compressed extent: */
diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c
index 7fae6a4ba26f..595dd0add509 100644
--- a/fs/bcachefs/extents.c
+++ b/fs/bcachefs/extents.c
@@ -215,9 +215,8 @@ void bch2_btree_ptr_v2_to_text(struct printbuf *out, struct bch_fs *c,
{
struct bkey_s_c_btree_ptr_v2 bp = bkey_s_c_to_btree_ptr_v2(k);
- pr_buf(out, "seq %llx sectors %u written %u min_key ",
+ pr_buf(out, "seq %llx written %u min_key ",
le64_to_cpu(bp.v->seq),
- le16_to_cpu(bp.v->sectors),
le16_to_cpu(bp.v->sectors_written));
bch2_bpos_to_text(out, bp.v->min_key);
@@ -665,7 +664,7 @@ bool bch2_bkey_is_incompressible(struct bkey_s_c k)
}
bool bch2_check_range_allocated(struct bch_fs *c, struct bpos pos, u64 size,
- unsigned nr_replicas)
+ unsigned nr_replicas, bool compressed)
{
struct btree_trans trans;
struct btree_iter *iter;
@@ -683,7 +682,8 @@ bool bch2_check_range_allocated(struct bch_fs *c, struct bpos pos, u64 size,
if (bkey_cmp(bkey_start_pos(k.k), end) >= 0)
break;
- if (nr_replicas > bch2_bkey_nr_ptrs_fully_allocated(k)) {
+ if (nr_replicas > bch2_bkey_replicas(c, k) ||
+ (!compressed && bch2_bkey_sectors_compressed(k))) {
ret = false;
break;
}
@@ -693,6 +693,27 @@ bool bch2_check_range_allocated(struct bch_fs *c, struct bpos pos, u64 size,
return ret;
}
+unsigned bch2_bkey_replicas(struct bch_fs *c, struct bkey_s_c k)
+{
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+ const union bch_extent_entry *entry;
+ struct extent_ptr_decoded p = { 0 };
+ unsigned replicas = 0;
+
+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
+ if (p.ptr.cached)
+ continue;
+
+ if (p.has_ec)
+ replicas += p.ec.redundancy;
+
+ replicas++;
+
+ }
+
+ return replicas;
+}
+
static unsigned bch2_extent_ptr_durability(struct bch_fs *c,
struct extent_ptr_decoded p)
{
@@ -707,16 +728,9 @@ static unsigned bch2_extent_ptr_durability(struct bch_fs *c,
if (ca->mi.state != BCH_MEMBER_STATE_FAILED)
durability = max_t(unsigned, durability, ca->mi.durability);
- if (p.has_ec) {
- struct stripe *s =
- genradix_ptr(&c->stripes[0], p.ec.idx);
-
- if (WARN_ON(!s))
- goto out;
+ if (p.has_ec)
+ durability += p.ec.redundancy;
- durability += s->nr_redundant;
- }
-out:
return durability;
}
@@ -764,6 +778,15 @@ void bch2_bkey_mark_replicas_cached(struct bch_fs *c, struct bkey_s k,
}
}
+void bch2_bkey_extent_entry_drop(struct bkey_i *k, union bch_extent_entry *entry)
+{
+ union bch_extent_entry *end = bkey_val_end(bkey_i_to_s(k));
+ union bch_extent_entry *next = extent_entry_next(entry);
+
+ memmove_u64s(entry, next, (u64 *) end - (u64 *) next);
+ k->k.u64s -= extent_entry_u64s(entry);
+}
+
void bch2_bkey_append_ptr(struct bkey_i *k,
struct bch_extent_ptr ptr)
{
@@ -1046,16 +1069,17 @@ static const char *extent_ptr_invalid(const struct bch_fs *c,
const char *bch2_bkey_ptrs_invalid(const struct bch_fs *c, struct bkey_s_c k)
{
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+ struct bch_devs_list devs;
const union bch_extent_entry *entry;
struct bch_extent_crc_unpacked crc;
unsigned size_ondisk = k.k->size;
const char *reason;
unsigned nonce = UINT_MAX;
+ unsigned i;
- if (k.k->type == KEY_TYPE_btree_ptr)
+ if (k.k->type == KEY_TYPE_btree_ptr ||
+ k.k->type == KEY_TYPE_btree_ptr_v2)
size_ondisk = c->opts.btree_node_size;
- if (k.k->type == KEY_TYPE_btree_ptr_v2)
- size_ondisk = le16_to_cpu(bkey_s_c_to_btree_ptr_v2(k).v->sectors);
bkey_extent_entry_for_each(ptrs, entry) {
if (__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX)
@@ -1101,6 +1125,12 @@ const char *bch2_bkey_ptrs_invalid(const struct bch_fs *c, struct bkey_s_c k)
}
}
+ devs = bch2_bkey_devs(k);
+ bubble_sort(devs.devs, devs.nr, u8_cmp);
+ for (i = 0; i + 1 < devs.nr; i++)
+ if (devs.devs[i] == devs.devs[i + 1])
+ return "multiple ptrs to same device";
+
return NULL;
}
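
A worked example of the counting rule in bch2_bkey_replicas() above, for a hypothetical extent with one cached pointer, one ordinary pointer, and one erasure-coded pointer whose stripe has redundancy 2:

static unsigned example_replica_count(void)
{
	unsigned cached = 0;		/* cached pointers are skipped */
	unsigned plain  = 1;		/* one ordinary copy */
	unsigned ec     = 1 + 2;	/* the EC block plus its redundancy */

	return cached + plain + ec;	/* == 4 */
}
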
diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h
index 74c7bb8f9104..3988315fc404 100644
--- a/fs/bcachefs/extents.h
+++ b/fs/bcachefs/extents.h
@@ -538,12 +538,15 @@ unsigned bch2_bkey_nr_ptrs_allocated(struct bkey_s_c);
unsigned bch2_bkey_nr_ptrs_fully_allocated(struct bkey_s_c);
bool bch2_bkey_is_incompressible(struct bkey_s_c);
unsigned bch2_bkey_sectors_compressed(struct bkey_s_c);
-bool bch2_check_range_allocated(struct bch_fs *, struct bpos, u64, unsigned);
+bool bch2_check_range_allocated(struct bch_fs *, struct bpos, u64, unsigned, bool);
+
+unsigned bch2_bkey_replicas(struct bch_fs *, struct bkey_s_c);
unsigned bch2_bkey_durability(struct bch_fs *, struct bkey_s_c);
void bch2_bkey_mark_replicas_cached(struct bch_fs *, struct bkey_s,
unsigned, unsigned);
+void bch2_bkey_extent_entry_drop(struct bkey_i *, union bch_extent_entry *);
void bch2_bkey_append_ptr(struct bkey_i *, struct bch_extent_ptr);
void bch2_extent_ptr_decoded_append(struct bkey_i *,
struct extent_ptr_decoded *);
diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c
index 93ad6dbe8c16..56cfb0d60c03 100644
--- a/fs/bcachefs/fs-io.c
+++ b/fs/bcachefs/fs-io.c
@@ -3,7 +3,7 @@
#include "bcachefs.h"
#include "alloc_foreground.h"
-#include "bkey_on_stack.h"
+#include "bkey_buf.h"
#include "btree_update.h"
#include "buckets.h"
#include "clock.h"
@@ -84,6 +84,7 @@ struct dio_read {
struct closure cl;
struct kiocb *req;
long ret;
+ bool should_dirty;
struct bch_read_bio rbio;
};
@@ -281,28 +282,13 @@ static inline struct bch_page_state *bch2_page_state(struct page *page)
/* for newly allocated pages: */
static void __bch2_page_state_release(struct page *page)
{
- struct bch_page_state *s = __bch2_page_state(page);
-
- if (!s)
- return;
-
- ClearPagePrivate(page);
- set_page_private(page, 0);
- put_page(page);
- kfree(s);
+ kfree(detach_page_private(page));
}
static void bch2_page_state_release(struct page *page)
{
- struct bch_page_state *s = bch2_page_state(page);
-
- if (!s)
- return;
-
- ClearPagePrivate(page);
- set_page_private(page, 0);
- put_page(page);
- kfree(s);
+ EBUG_ON(!PageLocked(page));
+ __bch2_page_state_release(page);
}
/* for newly allocated pages: */
@@ -316,13 +302,7 @@ static struct bch_page_state *__bch2_page_state_create(struct page *page,
return NULL;
spin_lock_init(&s->lock);
- /*
- * migrate_page_move_mapping() assumes that pages with private data
- * have their count elevated by 1.
- */
- get_page(page);
- set_page_private(page, (unsigned long) s);
- SetPagePrivate(page);
+ attach_page_private(page, s);
return s;
}
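
For reference, the generic helpers used above are roughly equivalent to the open-coded sequences this patch removes; a sketch, not the kernel's exact implementation:

static inline void attach_page_private_sketch(struct page *page, void *data)
{
	get_page(page);		/* pages with private data hold an extra ref */
	set_page_private(page, (unsigned long) data);
	SetPagePrivate(page);
}

static inline void *detach_page_private_sketch(struct page *page)
{
	void *data;

	if (!PagePrivate(page))
		return NULL;

	data = (void *) page_private(page);
	ClearPagePrivate(page);
	set_page_private(page, 0);
	put_page(page);
	return data;
}
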
@@ -645,18 +625,12 @@ int bch2_migrate_page(struct address_space *mapping, struct page *newpage,
EBUG_ON(!PageLocked(page));
EBUG_ON(!PageLocked(newpage));
- ret = migrate_page_move_mapping(mapping, newpage, page, NULL, mode, 0);
+ ret = migrate_page_move_mapping(mapping, newpage, page, 0);
if (ret != MIGRATEPAGE_SUCCESS)
return ret;
- if (PagePrivate(page)) {
- ClearPagePrivate(page);
- get_page(newpage);
- set_page_private(newpage, page_private(page));
- set_page_private(page, 0);
- put_page(page);
- SetPagePrivate(newpage);
- }
+ if (PagePrivate(page))
+ attach_page_private(newpage, detach_page_private(page));
if (mode != MIGRATE_SYNC_NO_COPY)
migrate_page_copy(newpage, page);
@@ -670,10 +644,10 @@ int bch2_migrate_page(struct address_space *mapping, struct page *newpage,
static void bch2_readpages_end_io(struct bio *bio)
{
+ struct bvec_iter_all iter;
struct bio_vec *bv;
- unsigned i;
- bio_for_each_segment_all(bv, bio, i) {
+ bio_for_each_segment_all(bv, bio, iter) {
struct page *page = bv->bv_page;
if (!bio->bi_status) {
@@ -692,31 +666,29 @@ struct readpages_iter {
struct address_space *mapping;
struct page **pages;
unsigned nr_pages;
- unsigned nr_added;
unsigned idx;
pgoff_t offset;
};
static int readpages_iter_init(struct readpages_iter *iter,
- struct address_space *mapping,
- struct list_head *pages, unsigned nr_pages)
+ struct readahead_control *ractl)
{
+ unsigned i, nr_pages = readahead_count(ractl);
+
memset(iter, 0, sizeof(*iter));
- iter->mapping = mapping;
- iter->offset = list_last_entry(pages, struct page, lru)->index;
+ iter->mapping = ractl->mapping;
+ iter->offset = readahead_index(ractl);
+ iter->nr_pages = nr_pages;
iter->pages = kmalloc_array(nr_pages, sizeof(struct page *), GFP_NOFS);
if (!iter->pages)
return -ENOMEM;
- while (!list_empty(pages)) {
- struct page *page = list_last_entry(pages, struct page, lru);
-
- __bch2_page_state_create(page, __GFP_NOFAIL);
-
- iter->pages[iter->nr_pages++] = page;
- list_del(&page->lru);
+ nr_pages = __readahead_batch(ractl, iter->pages, nr_pages);
+ for (i = 0; i < nr_pages; i++) {
+ __bch2_page_state_create(iter->pages[i], __GFP_NOFAIL);
+ put_page(iter->pages[i]);
}
return 0;
@@ -724,41 +696,9 @@ static int readpages_iter_init(struct readpages_iter *iter,
static inline struct page *readpage_iter_next(struct readpages_iter *iter)
{
- struct page *page;
- unsigned i;
- int ret;
-
- BUG_ON(iter->idx > iter->nr_added);
- BUG_ON(iter->nr_added > iter->nr_pages);
-
- if (iter->idx < iter->nr_added)
- goto out;
-
- while (1) {
- if (iter->idx == iter->nr_pages)
- return NULL;
-
- ret = add_to_page_cache_lru_vec(iter->mapping,
- iter->pages + iter->nr_added,
- iter->nr_pages - iter->nr_added,
- iter->offset + iter->nr_added,
- GFP_NOFS);
- if (ret > 0)
- break;
-
- page = iter->pages[iter->nr_added];
- iter->idx++;
- iter->nr_added++;
-
- __bch2_page_state_release(page);
- put_page(page);
- }
-
- iter->nr_added += ret;
+ if (iter->idx >= iter->nr_pages)
+ return NULL;
- for (i = iter->idx; i < iter->nr_added; i++)
- put_page(iter->pages[i]);
-out:
EBUG_ON(iter->pages[iter->idx]->index != iter->offset + iter->idx);
return iter->pages[iter->idx];
@@ -819,11 +759,8 @@ static void readpage_bio_extend(struct readpages_iter *iter,
if (!get_more)
break;
- rcu_read_lock();
- page = radix_tree_lookup(&iter->mapping->i_pages, page_offset);
- rcu_read_unlock();
-
- if (page && !radix_tree_exceptional_entry(page))
+ page = xa_load(&iter->mapping->i_pages, page_offset);
+ if (page && !xa_is_value(page))
break;
page = __page_cache_alloc(readahead_gfp_mask(iter->mapping));
@@ -855,7 +792,7 @@ static void bchfs_read(struct btree_trans *trans, struct btree_iter *iter,
struct readpages_iter *readpages_iter)
{
struct bch_fs *c = trans->c;
- struct bkey_on_stack sk;
+ struct bkey_buf sk;
int flags = BCH_READ_RETRY_IF_STALE|
BCH_READ_MAY_PROMOTE;
int ret = 0;
@@ -863,7 +800,7 @@ static void bchfs_read(struct btree_trans *trans, struct btree_iter *iter,
rbio->c = c;
rbio->start_time = local_clock();
- bkey_on_stack_init(&sk);
+ bch2_bkey_buf_init(&sk);
retry:
while (1) {
struct bkey_s_c k;
@@ -881,7 +818,7 @@ retry:
bkey_start_offset(k.k);
sectors = k.k->size - offset_into_extent;
- bkey_on_stack_reassemble(&sk, c, k);
+ bch2_bkey_buf_reassemble(&sk, c, k);
ret = bch2_read_indirect_extent(trans,
&offset_into_extent, &sk);
@@ -926,13 +863,12 @@ retry:
bio_endio(&rbio->bio);
}
- bkey_on_stack_exit(&sk, c);
+ bch2_bkey_buf_exit(&sk, c);
}
-int bch2_readpages(struct file *file, struct address_space *mapping,
- struct list_head *pages, unsigned nr_pages)
+void bch2_readahead(struct readahead_control *ractl)
{
- struct bch_inode_info *inode = to_bch_ei(mapping->host);
+ struct bch_inode_info *inode = to_bch_ei(ractl->mapping->host);
struct bch_fs *c = inode->v.i_sb->s_fs_info;
struct bch_io_opts opts = io_opts(c, &inode->ei_inode);
struct btree_trans trans;
@@ -941,7 +877,7 @@ int bch2_readpages(struct file *file, struct address_space *mapping,
struct readpages_iter readpages_iter;
int ret;
- ret = readpages_iter_init(&readpages_iter, mapping, pages, nr_pages);
+ ret = readpages_iter_init(&readpages_iter, ractl);
BUG_ON(ret);
bch2_trans_init(&trans, c, 0, 0);
@@ -976,8 +912,6 @@ int bch2_readpages(struct file *file, struct address_space *mapping,
bch2_trans_exit(&trans);
kfree(readpages_iter.pages);
-
- return 0;
}
static void __bchfs_readpage(struct bch_fs *c, struct bch_read_bio *rbio,
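
The ->readpages() to ->readahead() conversion above follows the shape below; example_readahead() is a hypothetical minimal implementation showing the readahead_control accessors used by readpages_iter_init():

static void example_readahead(struct readahead_control *ractl)
{
	struct page *pages[16];
	unsigned nr = readahead_count(ractl);	/* pages requested */
	pgoff_t index = readahead_index(ractl);	/* index of the first page */

	/* pages handed out here are already in the page cache and locked: */
	nr = __readahead_batch(ractl, pages, min_t(unsigned, nr, ARRAY_SIZE(pages)));

	/* ... issue reads covering pages[0..nr), starting at @index ... */
	(void) index;
}
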
@@ -1077,34 +1011,35 @@ static void bch2_writepage_io_done(struct closure *cl)
struct bch_writepage_io, cl);
struct bch_fs *c = io->op.c;
struct bio *bio = &io->op.wbio.bio;
+ struct bvec_iter_all iter;
struct bio_vec *bvec;
- unsigned i, j;
+ unsigned i;
if (io->op.error) {
set_bit(EI_INODE_ERROR, &io->inode->ei_flags);
- bio_for_each_segment_all(bvec, bio, i) {
+ bio_for_each_segment_all(bvec, bio, iter) {
struct bch_page_state *s;
SetPageError(bvec->bv_page);
- mapping_set_error(io->inode->v.i_mapping, -EIO);
+ mapping_set_error(bvec->bv_page->mapping, -EIO);
s = __bch2_page_state(bvec->bv_page);
spin_lock(&s->lock);
- for (j = 0; j < PAGE_SECTORS; j++)
- s->s[j].nr_replicas = 0;
+ for (i = 0; i < PAGE_SECTORS; i++)
+ s->s[i].nr_replicas = 0;
spin_unlock(&s->lock);
}
}
if (io->op.flags & BCH_WRITE_WROTE_DATA_INLINE) {
- bio_for_each_segment_all(bvec, bio, i) {
+ bio_for_each_segment_all(bvec, bio, iter) {
struct bch_page_state *s;
s = __bch2_page_state(bvec->bv_page);
spin_lock(&s->lock);
- for (j = 0; j < PAGE_SECTORS; j++)
- s->s[j].nr_replicas = 0;
+ for (i = 0; i < PAGE_SECTORS; i++)
+ s->s[i].nr_replicas = 0;
spin_unlock(&s->lock);
}
}
@@ -1128,7 +1063,7 @@ static void bch2_writepage_io_done(struct closure *cl)
*/
i_sectors_acct(c, io->inode, NULL, io->op.i_sectors_delta);
- bio_for_each_segment_all(bvec, bio, i) {
+ bio_for_each_segment_all(bvec, bio, iter) {
struct bch_page_state *s = __bch2_page_state(bvec->bv_page);
if (atomic_dec_and_test(&s->write_count))
@@ -1282,7 +1217,7 @@ do_io:
if (w->io &&
(w->io->op.res.nr_replicas != nr_replicas_this_write ||
- bio_full(&w->io->op.wbio.bio) ||
+ bio_full(&w->io->op.wbio.bio, PAGE_SIZE) ||
w->io->op.wbio.bio.bi_iter.bi_size + (sectors << 9) >=
(BIO_MAX_PAGES * PAGE_SIZE) ||
bio_end_sector(&w->io->op.wbio.bio) != sector))
@@ -1685,12 +1620,22 @@ again:
/* O_DIRECT reads */
+static void bio_check_or_release(struct bio *bio, bool check_dirty)
+{
+ if (check_dirty) {
+ bio_check_pages_dirty(bio);
+ } else {
+ bio_release_pages(bio, false);
+ bio_put(bio);
+ }
+}
+
static void bch2_dio_read_complete(struct closure *cl)
{
struct dio_read *dio = container_of(cl, struct dio_read, cl);
dio->req->ki_complete(dio->req, dio->ret, 0);
- bio_check_pages_dirty(&dio->rbio.bio); /* transfers ownership */
+ bio_check_or_release(&dio->rbio.bio, dio->should_dirty);
}
static void bch2_direct_IO_read_endio(struct bio *bio)
@@ -1705,8 +1650,11 @@ static void bch2_direct_IO_read_endio(struct bio *bio)
static void bch2_direct_IO_read_split_endio(struct bio *bio)
{
+ struct dio_read *dio = bio->bi_private;
+ bool should_dirty = dio->should_dirty;
+
bch2_direct_IO_read_endio(bio);
- bio_check_pages_dirty(bio); /* transfers ownership */
+ bio_check_or_release(bio, should_dirty);
}
static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter)
@@ -1760,6 +1708,12 @@ static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter)
dio->req = req;
dio->ret = ret;
+ /*
+ * This is one of the sketchier things I've encountered: we have to skip
+ * the dirtying of requests that are internal to the kernel (i.e. from
+ * loopback), because we'll deadlock on page_lock.
+ */
+ dio->should_dirty = iter_is_iovec(iter);
goto start;
while (iter->count) {
@@ -1781,7 +1735,9 @@ start:
}
offset += bio->bi_iter.bi_size;
- bio_set_pages_dirty(bio);
+
+ if (dio->should_dirty)
+ bio_set_pages_dirty(bio);
if (iter->count)
closure_get(&dio->cl);
@@ -1795,7 +1751,7 @@ start:
closure_sync(&dio->cl);
closure_debug_destroy(&dio->cl);
ret = dio->ret;
- bio_check_pages_dirty(&dio->rbio.bio); /* transfers ownership */
+ bio_check_or_release(&dio->rbio.bio, dio->should_dirty);
return ret;
} else {
return -EIOCBQUEUED;
@@ -1851,8 +1807,9 @@ static long bch2_dio_write_loop(struct dio_write *dio)
struct bch_inode_info *inode = file_bch_inode(req->ki_filp);
struct bch_fs *c = inode->v.i_sb->s_fs_info;
struct bio *bio = &dio->op.wbio.bio;
+ struct bvec_iter_all iter;
struct bio_vec *bv;
- unsigned i, unaligned, iter_count;
+ unsigned unaligned, iter_count;
bool sync = dio->sync, dropped_locks;
long ret;
@@ -1863,7 +1820,7 @@ static long bch2_dio_write_loop(struct dio_write *dio)
iter_count = dio->iter.count;
if (kthread)
- use_mm(dio->mm);
+ kthread_use_mm(dio->mm);
BUG_ON(current->faults_disabled_mapping);
current->faults_disabled_mapping = mapping;
@@ -1873,7 +1830,7 @@ static long bch2_dio_write_loop(struct dio_write *dio)
current->faults_disabled_mapping = NULL;
if (kthread)
- unuse_mm(dio->mm);
+ kthread_unuse_mm(dio->mm);
/*
* If the fault handler returned an error but also signalled
@@ -1906,7 +1863,7 @@ static long bch2_dio_write_loop(struct dio_write *dio)
* bio_iov_iter_get_pages was only able to get <
* blocksize worth of pages:
*/
- bio_for_each_segment_all(bv, bio, i)
+ bio_for_each_segment_all(bv, bio, iter)
put_page(bv->bv_page);
ret = -EFAULT;
goto err;
@@ -1928,7 +1885,9 @@ static long bch2_dio_write_loop(struct dio_write *dio)
dio->op.opts.data_replicas, 0);
if (unlikely(ret) &&
!bch2_check_range_allocated(c, dio->op.pos,
- bio_sectors(bio), dio->op.opts.data_replicas))
+ bio_sectors(bio),
+ dio->op.opts.data_replicas,
+ dio->op.opts.compression != 0))
goto err;
task_io_account_write(bio->bi_iter.bi_size);
@@ -1969,7 +1928,7 @@ loop:
i_size_write(&inode->v, req->ki_pos);
spin_unlock(&inode->v.i_lock);
- bio_for_each_segment_all(bv, bio, i)
+ bio_for_each_segment_all(bv, bio, iter)
put_page(bv->bv_page);
if (dio->op.error) {
@@ -2479,9 +2438,9 @@ static long bchfs_fcollapse_finsert(struct bch_inode_info *inode,
{
struct bch_fs *c = inode->v.i_sb->s_fs_info;
struct address_space *mapping = inode->v.i_mapping;
- struct bkey_on_stack copy;
+ struct bkey_buf copy;
struct btree_trans trans;
- struct btree_iter *src, *dst;
+ struct btree_iter *src, *dst, *del;
loff_t shift, new_size;
u64 src_start;
int ret;
@@ -2489,7 +2448,7 @@ static long bchfs_fcollapse_finsert(struct bch_inode_info *inode,
if ((offset | len) & (block_bytes(c) - 1))
return -EINVAL;
- bkey_on_stack_init(&copy);
+ bch2_bkey_buf_init(&copy);
bch2_trans_init(&trans, c, BTREE_ITER_MAX, 256);
/*
@@ -2551,6 +2510,7 @@ static long bchfs_fcollapse_finsert(struct bch_inode_info *inode,
POS(inode->v.i_ino, src_start >> 9),
BTREE_ITER_INTENT);
dst = bch2_trans_copy_iter(&trans, src);
+ del = bch2_trans_copy_iter(&trans, src);
while (1) {
struct disk_reservation disk_res =
@@ -2571,13 +2531,11 @@ static long bchfs_fcollapse_finsert(struct bch_inode_info *inode,
if (!k.k || k.k->p.inode != inode->v.i_ino)
break;
- BUG_ON(bkey_cmp(src->pos, bkey_start_pos(k.k)));
-
if (insert &&
bkey_cmp(k.k->p, POS(inode->v.i_ino, offset >> 9)) <= 0)
break;
reassemble:
- bkey_on_stack_reassemble(&copy, c, k);
+ bch2_bkey_buf_reassemble(&copy, c, k);
if (insert &&
bkey_cmp(bkey_start_pos(k.k), move_pos) < 0)
@@ -2604,6 +2562,7 @@ reassemble:
delete.k.p = copy.k->k.p;
delete.k.size = copy.k->k.size;
delete.k.p.offset -= shift >> 9;
+ bch2_btree_iter_set_pos(del, bkey_start_pos(&delete.k));
next_pos = insert ? bkey_start_pos(&delete.k) : delete.k.p;
@@ -2624,9 +2583,7 @@ reassemble:
BUG_ON(ret);
}
- bch2_btree_iter_set_pos(src, bkey_start_pos(&delete.k));
-
- ret = bch2_trans_update(&trans, src, &delete, trigger_flags) ?:
+ ret = bch2_trans_update(&trans, del, &delete, trigger_flags) ?:
bch2_trans_update(&trans, dst, copy.k, trigger_flags) ?:
bch2_trans_commit(&trans, &disk_res,
&inode->ei_journal_seq,
@@ -2654,7 +2611,7 @@ bkey_err:
}
err:
bch2_trans_exit(&trans);
- bkey_on_stack_exit(&copy, c);
+ bch2_bkey_buf_exit(&copy, c);
bch2_pagecache_block_put(&inode->ei_pagecache_lock);
inode_unlock(&inode->v);
return ret;
@@ -2890,235 +2847,6 @@ static void mark_range_unallocated(struct bch_inode_info *inode,
} while (index <= end_index);
}
-static int generic_access_check_limits(struct file *file, loff_t pos,
- loff_t *count)
-{
- struct inode *inode = file->f_mapping->host;
- loff_t max_size = inode->i_sb->s_maxbytes;
-
- if (!(file->f_flags & O_LARGEFILE))
- max_size = MAX_NON_LFS;
-
- if (unlikely(pos >= max_size))
- return -EFBIG;
- *count = min(*count, max_size - pos);
- return 0;
-}
-
-static int generic_write_check_limits(struct file *file, loff_t pos,
- loff_t *count)
-{
- loff_t limit = rlimit(RLIMIT_FSIZE);
-
- if (limit != RLIM_INFINITY) {
- if (pos >= limit) {
- send_sig(SIGXFSZ, current, 0);
- return -EFBIG;
- }
- *count = min(*count, limit - pos);
- }
-
- return generic_access_check_limits(file, pos, count);
-}
-
-static int generic_remap_checks(struct file *file_in, loff_t pos_in,
- struct file *file_out, loff_t pos_out,
- loff_t *req_count, unsigned int remap_flags)
-{
- struct inode *inode_in = file_in->f_mapping->host;
- struct inode *inode_out = file_out->f_mapping->host;
- uint64_t count = *req_count;
- uint64_t bcount;
- loff_t size_in, size_out;
- loff_t bs = inode_out->i_sb->s_blocksize;
- int ret;
-
- /* The start of both ranges must be aligned to an fs block. */
- if (!IS_ALIGNED(pos_in, bs) || !IS_ALIGNED(pos_out, bs))
- return -EINVAL;
-
- /* Ensure offsets don't wrap. */
- if (pos_in + count < pos_in || pos_out + count < pos_out)
- return -EINVAL;
-
- size_in = i_size_read(inode_in);
- size_out = i_size_read(inode_out);
-
- /* Dedupe requires both ranges to be within EOF. */
- if ((remap_flags & REMAP_FILE_DEDUP) &&
- (pos_in >= size_in || pos_in + count > size_in ||
- pos_out >= size_out || pos_out + count > size_out))
- return -EINVAL;
-
- /* Ensure the infile range is within the infile. */
- if (pos_in >= size_in)
- return -EINVAL;
- count = min(count, size_in - (uint64_t)pos_in);
-
- ret = generic_access_check_limits(file_in, pos_in, &count);
- if (ret)
- return ret;
-
- ret = generic_write_check_limits(file_out, pos_out, &count);
- if (ret)
- return ret;
-
- /*
- * If the user wanted us to link to the infile's EOF, round up to the
- * next block boundary for this check.
- *
- * Otherwise, make sure the count is also block-aligned, having
- * already confirmed the starting offsets' block alignment.
- */
- if (pos_in + count == size_in) {
- bcount = ALIGN(size_in, bs) - pos_in;
- } else {
- if (!IS_ALIGNED(count, bs))
- count = ALIGN_DOWN(count, bs);
- bcount = count;
- }
-
- /* Don't allow overlapped cloning within the same file. */
- if (inode_in == inode_out &&
- pos_out + bcount > pos_in &&
- pos_out < pos_in + bcount)
- return -EINVAL;
-
- /*
- * We shortened the request but the caller can't deal with that, so
- * bounce the request back to userspace.
- */
- if (*req_count != count && !(remap_flags & REMAP_FILE_CAN_SHORTEN))
- return -EINVAL;
-
- *req_count = count;
- return 0;
-}
-
-static int generic_remap_check_len(struct inode *inode_in,
- struct inode *inode_out,
- loff_t pos_out,
- loff_t *len,
- unsigned int remap_flags)
-{
- u64 blkmask = i_blocksize(inode_in) - 1;
- loff_t new_len = *len;
-
- if ((*len & blkmask) == 0)
- return 0;
-
- if ((remap_flags & REMAP_FILE_DEDUP) ||
- pos_out + *len < i_size_read(inode_out))
- new_len &= ~blkmask;
-
- if (new_len == *len)
- return 0;
-
- if (remap_flags & REMAP_FILE_CAN_SHORTEN) {
- *len = new_len;
- return 0;
- }
-
- return (remap_flags & REMAP_FILE_DEDUP) ? -EBADE : -EINVAL;
-}
-
-static int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in,
- struct file *file_out, loff_t pos_out,
- loff_t *len, unsigned int remap_flags)
-{
- struct inode *inode_in = file_inode(file_in);
- struct inode *inode_out = file_inode(file_out);
- bool same_inode = (inode_in == inode_out);
- int ret;
-
- /* Don't touch certain kinds of inodes */
- if (IS_IMMUTABLE(inode_out))
- return -EPERM;
-
- if (IS_SWAPFILE(inode_in) || IS_SWAPFILE(inode_out))
- return -ETXTBSY;
-
- /* Don't reflink dirs, pipes, sockets... */
- if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode))
- return -EISDIR;
- if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode))
- return -EINVAL;
-
- /* Zero length dedupe exits immediately; reflink goes to EOF. */
- if (*len == 0) {
- loff_t isize = i_size_read(inode_in);
-
- if ((remap_flags & REMAP_FILE_DEDUP) || pos_in == isize)
- return 0;
- if (pos_in > isize)
- return -EINVAL;
- *len = isize - pos_in;
- if (*len == 0)
- return 0;
- }
-
- /* Check that we don't violate system file offset limits. */
- ret = generic_remap_checks(file_in, pos_in, file_out, pos_out, len,
- remap_flags);
- if (ret)
- return ret;
-
- /* Wait for the completion of any pending IOs on both files */
- inode_dio_wait(inode_in);
- if (!same_inode)
- inode_dio_wait(inode_out);
-
- ret = filemap_write_and_wait_range(inode_in->i_mapping,
- pos_in, pos_in + *len - 1);
- if (ret)
- return ret;
-
- ret = filemap_write_and_wait_range(inode_out->i_mapping,
- pos_out, pos_out + *len - 1);
- if (ret)
- return ret;
-
- /*
- * Check that the extents are the same.
- */
- if (remap_flags & REMAP_FILE_DEDUP) {
- bool is_same = false;
-
- ret = vfs_dedupe_file_range_compare(inode_in, pos_in,
- inode_out, pos_out, *len, &is_same);
- if (ret)
- return ret;
- if (!is_same)
- return -EBADE;
- }
-
- ret = generic_remap_check_len(inode_in, inode_out, pos_out, len,
- remap_flags);
- if (ret)
- return ret;
-
- /* If can't alter the file contents, we're done. */
- if (!(remap_flags & REMAP_FILE_DEDUP)) {
- /* Update the timestamps, since we can alter file contents. */
- if (!(file_out->f_mode & FMODE_NOCMTIME)) {
- ret = file_update_time(file_out);
- if (ret)
- return ret;
- }
-
- /*
- * Clear the security bits if the process is not being run by
- * root. This keeps people from modifying setuid and setgid
- * binaries.
- */
- ret = file_remove_privs(file_out);
- if (ret)
- return ret;
- }
-
- return 0;
-}
-
loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src,
struct file *file_dst, loff_t pos_dst,
loff_t len, unsigned remap_flags)
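
The static generic_remap_*() copies removed above presumably give way to the kernel's exported generic_remap_file_range_prep(); a hypothetical fragment of the caller, where aligned_len is a local copy of len:

	loff_t aligned_len = len;
	int ret;

	ret = generic_remap_file_range_prep(file_src, pos_src,
					    file_dst, pos_dst,
					    &aligned_len, remap_flags);
	if (ret < 0 || aligned_len == 0)
		return ret;
	/* ... proceed with the possibly-shortened aligned_len ... */
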
diff --git a/fs/bcachefs/fs-io.h b/fs/bcachefs/fs-io.h
index 1b593ea707d5..2537a3d25ede 100644
--- a/fs/bcachefs/fs-io.h
+++ b/fs/bcachefs/fs-io.h
@@ -19,8 +19,7 @@ int bch2_writepage(struct page *, struct writeback_control *);
int bch2_readpage(struct file *, struct page *);
int bch2_writepages(struct address_space *, struct writeback_control *);
-int bch2_readpages(struct file *, struct address_space *,
- struct list_head *, unsigned);
+void bch2_readahead(struct readahead_control *);
int bch2_write_begin(struct file *, struct address_space *, loff_t,
unsigned, unsigned, struct page **, void **);
@@ -35,10 +34,6 @@ int bch2_fsync(struct file *, loff_t, loff_t, int);
int bch2_truncate(struct bch_inode_info *, struct iattr *);
long bch2_fallocate_dispatch(struct file *, int, loff_t, loff_t);
-#define REMAP_FILE_ADVISORY (0)
-#define REMAP_FILE_DEDUP (1 << 0)
-#define REMAP_FILE_CAN_SHORTEN (1 << 1)
-
loff_t bch2_remap_file_range(struct file *, loff_t, struct file *,
loff_t, loff_t, unsigned);
diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c
index cc0a4b0f0e5b..a2654c862b7b 100644
--- a/fs/bcachefs/fs.c
+++ b/fs/bcachefs/fs.c
@@ -3,7 +3,7 @@
#include "bcachefs.h"
#include "acl.h"
-#include "bkey_on_stack.h"
+#include "bkey_buf.h"
#include "btree_update.h"
#include "buckets.h"
#include "chardev.h"
@@ -886,17 +886,21 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info,
struct btree_trans trans;
struct btree_iter *iter;
struct bkey_s_c k;
- struct bkey_on_stack cur, prev;
+ struct bkey_buf cur, prev;
struct bpos end = POS(ei->v.i_ino, (start + len) >> 9);
unsigned offset_into_extent, sectors;
bool have_extent = false;
int ret = 0;
+ ret = fiemap_prep(&ei->v, info, start, &len, FIEMAP_FLAG_SYNC);
+ if (ret)
+ return ret;
+
if (start + len < start)
return -EINVAL;
- bkey_on_stack_init(&cur);
- bkey_on_stack_init(&prev);
+ bch2_bkey_buf_init(&cur);
+ bch2_bkey_buf_init(&prev);
bch2_trans_init(&trans, c, 0, 0);
iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS,
@@ -915,7 +919,7 @@ retry:
bkey_start_offset(k.k);
sectors = k.k->size - offset_into_extent;
- bkey_on_stack_reassemble(&cur, c, k);
+ bch2_bkey_buf_reassemble(&cur, c, k);
ret = bch2_read_indirect_extent(&trans,
&offset_into_extent, &cur);
@@ -923,7 +927,7 @@ retry:
break;
k = bkey_i_to_s_c(cur.k);
- bkey_on_stack_realloc(&prev, c, k.k->u64s);
+ bch2_bkey_buf_realloc(&prev, c, k.k->u64s);
sectors = min(sectors, k.k->size - offset_into_extent);
@@ -957,8 +961,8 @@ retry:
FIEMAP_EXTENT_LAST);
ret = bch2_trans_exit(&trans) ?: ret;
- bkey_on_stack_exit(&cur, c);
- bkey_on_stack_exit(&prev, c);
+ bch2_bkey_buf_exit(&cur, c);
+ bch2_bkey_buf_exit(&prev, c);
return ret < 0 ? ret : 0;
}
@@ -995,15 +999,6 @@ static int bch2_vfs_readdir(struct file *file, struct dir_context *ctx)
return bch2_readdir(c, inode->v.i_ino, ctx);
}
-static int bch2_clone_file_range(struct file *file_src, loff_t pos_src,
- struct file *file_dst, loff_t pos_dst,
- u64 len)
-{
- return bch2_remap_file_range(file_src, pos_src,
- file_dst, pos_dst,
- len, 0);
-}
-
static const struct file_operations bch_file_operations = {
.llseek = bch2_llseek,
.read_iter = bch2_read_iter,
@@ -1012,16 +1007,16 @@ static const struct file_operations bch_file_operations = {
.open = generic_file_open,
.fsync = bch2_fsync,
.splice_read = generic_file_splice_read,
- /*
- * Broken, on v5.3:
+#if 0
+ /* Busted: */
.splice_write = iter_file_splice_write,
- */
+#endif
.fallocate = bch2_fallocate_dispatch,
.unlocked_ioctl = bch2_fs_file_ioctl,
#ifdef CONFIG_COMPAT
.compat_ioctl = bch2_compat_fs_ioctl,
#endif
- .clone_file_range = bch2_clone_file_range,
+ .remap_file_range = bch2_remap_file_range,
};
static const struct inode_operations bch_file_inode_operations = {
@@ -1091,7 +1086,7 @@ static const struct address_space_operations bch_address_space_operations = {
.writepage = bch2_writepage,
.readpage = bch2_readpage,
.writepages = bch2_writepages,
- .readpages = bch2_readpages,
+ .readahead = bch2_readahead,
.set_page_dirty = __set_page_dirty_nobuffers,
.write_begin = bch2_write_begin,
.write_end = bch2_write_end,
@@ -1577,9 +1572,7 @@ got_sb:
if (ret)
goto err_put_super;
- sb->s_bdi->congested_fn = bch2_congested;
- sb->s_bdi->congested_data = c;
- sb->s_bdi->ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_SIZE;
+ sb->s_bdi->ra_pages = VM_READAHEAD_PAGES;
for_each_online_member(ca, c, i) {
struct block_device *bdev = ca->disk_sb.bdev;
diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c
index 39f872de0c18..66c9dad2ef3e 100644
--- a/fs/bcachefs/fsck.c
+++ b/fs/bcachefs/fsck.c
@@ -1,7 +1,7 @@
// SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h"
-#include "bkey_on_stack.h"
+#include "bkey_buf.h"
#include "btree_update.h"
#include "dirent.h"
#include "error.h"
@@ -58,7 +58,7 @@ static int __remove_dirent(struct btree_trans *trans,
buf[name.len] = '\0';
name.name = buf;
- ret = bch2_inode_find_by_inum_trans(trans, dir_inum, &dir_inode);
+ ret = __bch2_inode_find_by_inum_trans(trans, dir_inum, &dir_inode, 0);
if (ret && ret != -EINTR)
bch_err(c, "remove_dirent: err %i looking up directory inode", ret);
if (ret)
@@ -126,8 +126,8 @@ static int walk_inode(struct btree_trans *trans,
struct inode_walker *w, u64 inum)
{
if (inum != w->cur_inum) {
- int ret = bch2_inode_find_by_inum_trans(trans, inum,
- &w->inode);
+ int ret = __bch2_inode_find_by_inum_trans(trans, inum,
+ &w->inode, 0);
if (ret && ret != -ENOENT)
return ret;
@@ -193,7 +193,7 @@ static int hash_redo_key(const struct bch_hash_desc desc,
bch2_trans_update(trans, k_iter, &delete, 0);
return bch2_hash_set(trans, desc, &h->info, k_iter->pos.inode,
- tmp, BCH_HASH_SET_MUST_CREATE);
+ tmp, 0);
}
static int fsck_hash_delete_at(struct btree_trans *trans,
@@ -464,11 +464,11 @@ static int check_extents(struct bch_fs *c)
struct btree_trans trans;
struct btree_iter *iter;
struct bkey_s_c k;
- struct bkey_on_stack prev;
+ struct bkey_buf prev;
u64 i_sectors;
int ret = 0;
- bkey_on_stack_init(&prev);
+ bch2_bkey_buf_init(&prev);
prev.k->k = KEY(0, 0, 0);
bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
@@ -500,7 +500,7 @@ retry:
goto err;
}
}
- bkey_on_stack_reassemble(&prev, c, k);
+ bch2_bkey_buf_reassemble(&prev, c, k);
ret = walk_inode(&trans, &w, k.k->p.inode);
if (ret)
@@ -569,7 +569,7 @@ err:
fsck_err:
if (ret == -EINTR)
goto retry;
- bkey_on_stack_exit(&prev, c);
+ bch2_bkey_buf_exit(&prev, c);
return bch2_trans_exit(&trans) ?: ret;
}
@@ -673,7 +673,7 @@ retry:
continue;
}
- ret = bch2_inode_find_by_inum_trans(&trans, d_inum, &target);
+ ret = __bch2_inode_find_by_inum_trans(&trans, d_inum, &target, 0);
if (ret && ret != -ENOENT)
break;
@@ -787,7 +787,9 @@ static int check_root(struct bch_fs *c, struct bch_inode_unpacked *root_inode)
bch_verbose(c, "checking root directory");
- ret = bch2_inode_find_by_inum(c, BCACHEFS_ROOT_INO, root_inode);
+ ret = bch2_trans_do(c, NULL, NULL, 0,
+ __bch2_inode_find_by_inum_trans(&trans, BCACHEFS_ROOT_INO,
+ root_inode, 0));
if (ret && ret != -ENOENT)
return ret;
@@ -834,7 +836,8 @@ static int check_lostfound(struct bch_fs *c,
goto create_lostfound;
}
- ret = bch2_inode_find_by_inum(c, inum, lostfound_inode);
+ ret = bch2_trans_do(c, NULL, NULL, 0,
+ __bch2_inode_find_by_inum_trans(&trans, inum, lostfound_inode, 0));
if (ret && ret != -ENOENT)
return ret;
@@ -1072,6 +1075,11 @@ static void inc_link(struct bch_fs *c, nlink_table *links,
if (inum < range_start || inum >= *range_end)
return;
+ if (inum - range_start >= SIZE_MAX / sizeof(struct nlink)) {
+ *range_end = inum;
+ return;
+ }
+
link = genradix_ptr_alloc(links, inum - range_start, GFP_KERNEL);
if (!link) {
bch_verbose(c, "allocation failed during fsck - will need another pass");
@@ -1346,23 +1354,25 @@ static int bch2_gc_walk_inodes(struct bch_fs *c,
nlinks_iter = genradix_iter_init(links, 0);
while ((k = bch2_btree_iter_peek(iter)).k &&
- !(ret2 = bkey_err(k))) {
+ !(ret2 = bkey_err(k)) &&
+ iter->pos.offset < range_end) {
peek_nlinks: link = genradix_iter_peek(&nlinks_iter, links);
if (!link && (!k.k || iter->pos.offset >= range_end))
break;
nlinks_pos = range_start + nlinks_iter.pos;
- if (iter->pos.offset > nlinks_pos) {
+
+ if (link && nlinks_pos < iter->pos.offset) {
/* Should have been caught by dirents pass: */
- need_fsck_err_on(link && link->count, c,
+ need_fsck_err_on(link->count, c,
"missing inode %llu (nlink %u)",
nlinks_pos, link->count);
genradix_iter_advance(&nlinks_iter, links);
goto peek_nlinks;
}
- if (iter->pos.offset < nlinks_pos || !link)
+ if (!link || nlinks_pos > iter->pos.offset)
link = &zero_links;
if (k.k && k.k->type == KEY_TYPE_inode) {
diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c
index bf1c7319669c..81feb47fe8f9 100644
--- a/fs/bcachefs/inode.c
+++ b/fs/bcachefs/inode.c
@@ -479,7 +479,7 @@ int bch2_inode_create(struct btree_trans *trans,
u64 min, max, start, *hint;
int ret;
- unsigned cpu = raw_smp_processor_id();
+ u64 cpu = raw_smp_processor_id();
unsigned bits = (c->opts.inodes_32bit
? 31 : 63) - c->inode_shard_bits;
@@ -628,16 +628,19 @@ err:
return ret;
}
-int bch2_inode_find_by_inum_trans(struct btree_trans *trans, u64 inode_nr,
- struct bch_inode_unpacked *inode)
+int __bch2_inode_find_by_inum_trans(struct btree_trans *trans, u64 inode_nr,
+ struct bch_inode_unpacked *inode,
+ unsigned flags)
{
struct btree_iter *iter;
struct bkey_s_c k;
int ret;
iter = bch2_trans_get_iter(trans, BTREE_ID_INODES,
- POS(0, inode_nr), BTREE_ITER_CACHED);
- k = bch2_btree_iter_peek_cached(iter);
+ POS(0, inode_nr), flags);
+ k = (flags & BTREE_ITER_TYPE) == BTREE_ITER_CACHED
+ ? bch2_btree_iter_peek_cached(iter)
+ : bch2_btree_iter_peek_slot(iter);
ret = bkey_err(k);
if (ret)
goto err;
@@ -650,6 +653,14 @@ err:
return ret;
}
+int bch2_inode_find_by_inum_trans(struct btree_trans *trans, u64 inode_nr,
+ struct bch_inode_unpacked *inode)
+{
+ return __bch2_inode_find_by_inum_trans(trans, inode_nr,
+ inode, BTREE_ITER_CACHED);
+
+}
+
int bch2_inode_find_by_inum(struct bch_fs *c, u64 inode_nr,
struct bch_inode_unpacked *inode)
{
diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h
index dbdfcf63d079..1caf036ae928 100644
--- a/fs/bcachefs/inode.h
+++ b/fs/bcachefs/inode.h
@@ -73,6 +73,8 @@ int bch2_inode_create(struct btree_trans *, struct bch_inode_unpacked *);
int bch2_inode_rm(struct bch_fs *, u64, bool);
+int __bch2_inode_find_by_inum_trans(struct btree_trans *, u64,
+ struct bch_inode_unpacked *, unsigned);
int bch2_inode_find_by_inum_trans(struct btree_trans *, u64,
struct bch_inode_unpacked *);
int bch2_inode_find_by_inum(struct bch_fs *, u64, struct bch_inode_unpacked *);
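
A sketch of the two lookup flavours now available (trans, inum and ret are assumed to be in scope):

	struct bch_inode_unpacked u;

	/* normal path: go through the btree key cache */
	ret = __bch2_inode_find_by_inum_trans(trans, inum, &u, BTREE_ITER_CACHED);

	/* fsck path: read the inodes btree directly, bypassing the key cache */
	ret = __bch2_inode_find_by_inum_trans(trans, inum, &u, 0);
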
diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c
index d31a1d449c03..5f74583f5c61 100644
--- a/fs/bcachefs/io.c
+++ b/fs/bcachefs/io.c
@@ -9,7 +9,7 @@
#include "bcachefs.h"
#include "alloc_background.h"
#include "alloc_foreground.h"
-#include "bkey_on_stack.h"
+#include "bkey_buf.h"
#include "bset.h"
#include "btree_update.h"
#include "buckets.h"
@@ -135,10 +135,10 @@ void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw)
void bch2_bio_free_pages_pool(struct bch_fs *c, struct bio *bio)
{
+ struct bvec_iter_all iter;
struct bio_vec *bv;
- unsigned i;
- bio_for_each_segment_all(bv, bio, i)
+ bio_for_each_segment_all(bv, bio, iter)
if (bv->bv_page != ZERO_PAGE(0))
mempool_free(bv->bv_page, &c->bio_bounce_pages);
bio->bi_vcnt = 0;
@@ -183,18 +183,23 @@ void bch2_bio_alloc_pages_pool(struct bch_fs *c, struct bio *bio,
/* Extent update path: */
-static int sum_sector_overwrites(struct btree_trans *trans,
- struct btree_iter *extent_iter,
- struct bkey_i *new,
- bool *maybe_extending,
- s64 *i_sectors_delta,
- s64 *disk_sectors_delta)
+int bch2_sum_sector_overwrites(struct btree_trans *trans,
+ struct btree_iter *extent_iter,
+ struct bkey_i *new,
+ bool *maybe_extending,
+ bool *should_check_enospc,
+ s64 *i_sectors_delta,
+ s64 *disk_sectors_delta)
{
+ struct bch_fs *c = trans->c;
struct btree_iter *iter;
struct bkey_s_c old;
+ unsigned new_replicas = bch2_bkey_replicas(c, bkey_i_to_s_c(new));
+ bool new_compressed = bch2_bkey_sectors_compressed(bkey_i_to_s_c(new));
int ret = 0;
*maybe_extending = true;
+ *should_check_enospc = false;
*i_sectors_delta = 0;
*disk_sectors_delta = 0;
@@ -213,6 +218,11 @@ static int sum_sector_overwrites(struct btree_trans *trans,
(int) (bch2_bkey_nr_ptrs_allocated(bkey_i_to_s_c(new)) -
bch2_bkey_nr_ptrs_fully_allocated(old));
+ if (!*should_check_enospc &&
+ (new_replicas > bch2_bkey_replicas(c, old) ||
+ (!new_compressed && bch2_bkey_sectors_compressed(old))))
+ *should_check_enospc = true;
+
if (bkey_cmp(old.k->p, new->k.p) >= 0) {
/*
* Check if there's already data above where we're
@@ -250,7 +260,7 @@ int bch2_extent_update(struct btree_trans *trans,
{
/* this must live until after bch2_trans_commit(): */
struct bkey_inode_buf inode_p;
- bool extending = false;
+ bool extending = false, should_check_enospc;
s64 i_sectors_delta = 0, disk_sectors_delta = 0;
int ret;
@@ -258,8 +268,9 @@ int bch2_extent_update(struct btree_trans *trans,
if (ret)
return ret;
- ret = sum_sector_overwrites(trans, iter, k,
+ ret = bch2_sum_sector_overwrites(trans, iter, k,
&extending,
+ &should_check_enospc,
&i_sectors_delta,
&disk_sectors_delta);
if (ret)
@@ -269,7 +280,8 @@ int bch2_extent_update(struct btree_trans *trans,
disk_sectors_delta > (s64) disk_res->sectors) {
ret = bch2_disk_reservation_add(trans->c, disk_res,
disk_sectors_delta - disk_res->sectors,
- 0);
+ !should_check_enospc
+ ? BCH_DISK_RESERVATION_NOFAIL : 0);
if (ret)
return ret;
}
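
The rule bch2_sum_sector_overwrites() computes above, written out as a standalone per-key predicate (hypothetical helper): the reservation taken here may only use BCH_DISK_RESERVATION_NOFAIL when this is false for every key being overwritten.

static bool overwrite_must_check_enospc(struct bch_fs *c, struct bkey_s_c old,
					unsigned new_replicas,
					bool new_compressed)
{
	/* on-disk usage can grow if we add replicas or overwrite compressed data: */
	return new_replicas > bch2_bkey_replicas(c, old) ||
	       (!new_compressed && bch2_bkey_sectors_compressed(old));
}
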
@@ -320,8 +332,7 @@ int bch2_extent_update(struct btree_trans *trans,
ret = bch2_trans_commit(trans, disk_res, journal_seq,
BTREE_INSERT_NOCHECK_RW|
- BTREE_INSERT_NOFAIL|
- BTREE_INSERT_USE_RESERVE);
+ BTREE_INSERT_NOFAIL);
if (ret)
return ret;
@@ -404,14 +415,14 @@ int bch2_fpunch(struct bch_fs *c, u64 inum, u64 start, u64 end,
int bch2_write_index_default(struct bch_write_op *op)
{
struct bch_fs *c = op->c;
- struct bkey_on_stack sk;
+ struct bkey_buf sk;
struct keylist *keys = &op->insert_keys;
struct bkey_i *k = bch2_keylist_front(keys);
struct btree_trans trans;
struct btree_iter *iter;
int ret;
- bkey_on_stack_init(&sk);
+ bch2_bkey_buf_init(&sk);
bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024);
iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS,
@@ -423,7 +434,7 @@ int bch2_write_index_default(struct bch_write_op *op)
k = bch2_keylist_front(keys);
- bkey_on_stack_realloc(&sk, c, k->k.u64s);
+ bch2_bkey_buf_realloc(&sk, c, k->k.u64s);
bkey_copy(sk.k, k);
bch2_cut_front(iter->pos, sk.k);
@@ -440,7 +451,7 @@ int bch2_write_index_default(struct bch_write_op *op)
} while (!bch2_keylist_empty(keys));
bch2_trans_exit(&trans);
- bkey_on_stack_exit(&sk, c);
+ bch2_bkey_buf_exit(&sk, c);
return ret;
}
@@ -488,9 +499,6 @@ void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c,
n->submit_time = local_clock();
n->bio.bi_iter.bi_sector = ptr->offset;
- if (!journal_flushes_device(ca))
- n->bio.bi_opf |= REQ_FUA;
-
if (likely(n->have_ioref)) {
this_cpu_add(ca->io_done->sectors[WRITE][type],
bio_sectors(&n->bio));
@@ -1617,14 +1625,14 @@ static void bch2_read_retry_nodecode(struct bch_fs *c, struct bch_read_bio *rbio
{
struct btree_trans trans;
struct btree_iter *iter;
- struct bkey_on_stack sk;
+ struct bkey_buf sk;
struct bkey_s_c k;
int ret;
flags &= ~BCH_READ_LAST_FRAGMENT;
flags |= BCH_READ_MUST_CLONE;
- bkey_on_stack_init(&sk);
+ bch2_bkey_buf_init(&sk);
bch2_trans_init(&trans, c, 0, 0);
iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS,
@@ -1636,7 +1644,7 @@ retry:
if (bkey_err(k))
goto err;
- bkey_on_stack_reassemble(&sk, c, k);
+ bch2_bkey_buf_reassemble(&sk, c, k);
k = bkey_i_to_s_c(sk.k);
bch2_trans_unlock(&trans);
@@ -1657,7 +1665,7 @@ retry:
out:
bch2_rbio_done(rbio);
bch2_trans_exit(&trans);
- bkey_on_stack_exit(&sk, c);
+ bch2_bkey_buf_exit(&sk, c);
return;
err:
rbio->bio.bi_status = BLK_STS_IOERR;
@@ -1670,14 +1678,14 @@ static void bch2_read_retry(struct bch_fs *c, struct bch_read_bio *rbio,
{
struct btree_trans trans;
struct btree_iter *iter;
- struct bkey_on_stack sk;
+ struct bkey_buf sk;
struct bkey_s_c k;
int ret;
flags &= ~BCH_READ_LAST_FRAGMENT;
flags |= BCH_READ_MUST_CLONE;
- bkey_on_stack_init(&sk);
+ bch2_bkey_buf_init(&sk);
bch2_trans_init(&trans, c, 0, 0);
retry:
bch2_trans_begin(&trans);
@@ -1687,7 +1695,7 @@ retry:
BTREE_ITER_SLOTS, k, ret) {
unsigned bytes, sectors, offset_into_extent;
- bkey_on_stack_reassemble(&sk, c, k);
+ bch2_bkey_buf_reassemble(&sk, c, k);
offset_into_extent = iter->pos.offset -
bkey_start_offset(k.k);
@@ -1736,7 +1744,7 @@ err:
rbio->bio.bi_status = BLK_STS_IOERR;
out:
bch2_trans_exit(&trans);
- bkey_on_stack_exit(&sk, c);
+ bch2_bkey_buf_exit(&sk, c);
bch2_rbio_done(rbio);
}
@@ -1807,17 +1815,6 @@ static int __bch2_rbio_narrow_crcs(struct btree_trans *trans,
if ((ret = bkey_err(k)))
goto out;
- /*
- * going to be temporarily appending another checksum entry:
- */
- new = bch2_trans_kmalloc(trans, bkey_bytes(k.k) +
- BKEY_EXTENT_U64s_MAX * 8);
- if ((ret = PTR_ERR_OR_ZERO(new)))
- goto out;
-
- bkey_reassemble(new, k);
- k = bkey_i_to_s_c(new);
-
if (bversion_cmp(k.k->version, rbio->version) ||
!bch2_bkey_matches_ptr(c, k, rbio->pick.ptr, data_offset))
goto out;
@@ -1836,6 +1833,16 @@ static int __bch2_rbio_narrow_crcs(struct btree_trans *trans,
goto out;
}
+ /*
+ * going to be temporarily appending another checksum entry:
+ */
+ new = bch2_trans_kmalloc(trans, bkey_bytes(k.k) +
+ sizeof(struct bch_extent_crc128));
+ if ((ret = PTR_ERR_OR_ZERO(new)))
+ goto out;
+
+ bkey_reassemble(new, k);
+
if (!bch2_bkey_narrow_crcs(new, new_crc))
goto out;
@@ -2002,7 +2009,7 @@ static void bch2_read_endio(struct bio *bio)
int __bch2_read_indirect_extent(struct btree_trans *trans,
unsigned *offset_into_extent,
- struct bkey_on_stack *orig_k)
+ struct bkey_buf *orig_k)
{
struct btree_iter *iter;
struct bkey_s_c k;
@@ -2029,7 +2036,7 @@ int __bch2_read_indirect_extent(struct btree_trans *trans,
}
*offset_into_extent = iter->pos.offset - bkey_start_offset(k.k);
- bkey_on_stack_reassemble(orig_k, trans->c, k);
+ bch2_bkey_buf_reassemble(orig_k, trans->c, k);
err:
bch2_trans_iter_put(trans, iter);
return ret;
@@ -2208,7 +2215,11 @@ get_bio:
bch2_increment_clock(c, bio_sectors(&rbio->bio), READ);
- if (pick.ptr.cached)
+ /*
+ * If it's being moved internally, we don't want to flag it as a cache
+ * hit:
+ */
+ if (pick.ptr.cached && !(flags & BCH_READ_NODECODE))
bch2_bucket_io_time_reset(trans, pick.ptr.dev,
PTR_BUCKET_NR(ca, &pick.ptr), READ);
@@ -2290,7 +2301,7 @@ void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, u64 inode)
{
struct btree_trans trans;
struct btree_iter *iter;
- struct bkey_on_stack sk;
+ struct bkey_buf sk;
struct bkey_s_c k;
unsigned flags = BCH_READ_RETRY_IF_STALE|
BCH_READ_MAY_PROMOTE|
@@ -2304,7 +2315,7 @@ void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, u64 inode)
rbio->c = c;
rbio->start_time = local_clock();
- bkey_on_stack_init(&sk);
+ bch2_bkey_buf_init(&sk);
bch2_trans_init(&trans, c, 0, 0);
retry:
bch2_trans_begin(&trans);
@@ -2327,7 +2338,7 @@ retry:
bkey_start_offset(k.k);
sectors = k.k->size - offset_into_extent;
- bkey_on_stack_reassemble(&sk, c, k);
+ bch2_bkey_buf_reassemble(&sk, c, k);
ret = bch2_read_indirect_extent(&trans,
&offset_into_extent, &sk);
@@ -2364,7 +2375,7 @@ retry:
}
out:
bch2_trans_exit(&trans);
- bkey_on_stack_exit(&sk, c);
+ bch2_bkey_buf_exit(&sk, c);
return;
err:
if (ret == -EINTR)
diff --git a/fs/bcachefs/io.h b/fs/bcachefs/io.h
index e6aac594f3e6..04f6baa1daf7 100644
--- a/fs/bcachefs/io.h
+++ b/fs/bcachefs/io.h
@@ -3,7 +3,7 @@
#define _BCACHEFS_IO_H
#include "checksum.h"
-#include "bkey_on_stack.h"
+#include "bkey_buf.h"
#include "io_types.h"
#define to_wbio(_bio) \
@@ -60,6 +60,8 @@ static inline struct workqueue_struct *index_update_wq(struct bch_write_op *op)
: op->c->wq;
}
+int bch2_sum_sector_overwrites(struct btree_trans *, struct btree_iter *,
+ struct bkey_i *, bool *, bool *, s64 *, s64 *);
int bch2_extent_update(struct btree_trans *, struct btree_iter *,
struct bkey_i *, struct disk_reservation *,
u64 *, u64, s64 *);
@@ -112,11 +114,11 @@ struct cache_promote_op;
struct extent_ptr_decoded;
int __bch2_read_indirect_extent(struct btree_trans *, unsigned *,
- struct bkey_on_stack *);
+ struct bkey_buf *);
static inline int bch2_read_indirect_extent(struct btree_trans *trans,
unsigned *offset_into_extent,
- struct bkey_on_stack *k)
+ struct bkey_buf *k)
{
return k->k->k.type == KEY_TYPE_reflink_p
? __bch2_read_indirect_extent(trans, offset_into_extent, k)
diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c
index d54424829378..395021b5ac8e 100644
--- a/fs/bcachefs/journal.c
+++ b/fs/bcachefs/journal.c
@@ -9,6 +9,7 @@
#include "alloc_foreground.h"
#include "bkey_methods.h"
#include "btree_gc.h"
+#include "btree_update.h"
#include "buckets.h"
#include "journal.h"
#include "journal_io.h"
@@ -82,6 +83,7 @@ static void bch2_journal_buf_init(struct journal *j)
bkey_extent_init(&buf->key);
buf->noflush = false;
buf->must_flush = false;
+ buf->separate_flush = false;
memset(buf->has_inode, 0, sizeof(buf->has_inode));
@@ -118,6 +120,9 @@ void __bch2_journal_buf_put(struct journal *j)
/*
* Returns true if journal entry is now closed:
+ *
+ * We don't close a journal_buf until the next journal_buf is finished writing,
+ * and can be opened again - this also initializes the next journal_buf:
*/
static bool __journal_entry_close(struct journal *j)
{
@@ -155,6 +160,7 @@ static bool __journal_entry_close(struct journal *j)
} while ((v = atomic64_cmpxchg(&j->reservations.counter,
old.v, new.v)) != old.v);
+ /* Close out old buffer: */
buf->data->u64s = cpu_to_le32(old.cur_entry_offset);
sectors = vstruct_blocks_plus(buf->data, c->block_bits,
@@ -185,6 +191,7 @@ static bool __journal_entry_close(struct journal *j)
__bch2_journal_pin_put(j, le64_to_cpu(buf->data->seq));
+ /* Initialize new buffer: */
journal_pin_new_entry(j, 1);
bch2_journal_buf_init(j);
@@ -568,6 +575,8 @@ int bch2_journal_flush_seq_async(struct journal *j, u64 seq,
spin_lock(&j->lock);
+ BUG_ON(seq > journal_cur_seq(j));
+
/* Recheck under lock: */
if (j->err_seq && seq >= j->err_seq) {
ret = -EIO;
@@ -633,9 +642,10 @@ int bch2_journal_flush_seq(struct journal *j, u64 seq)
u64 start_time = local_clock();
int ret, ret2;
- ret = wait_event_killable(j->wait, (ret2 = bch2_journal_flush_seq_async(j, seq, NULL)));
+ ret = wait_event_interruptible(j->wait, (ret2 = bch2_journal_flush_seq_async(j, seq, NULL)));
- bch2_time_stats_update(j->flush_seq_time, start_time);
+ if (!ret)
+ bch2_time_stats_update(j->flush_seq_time, start_time);
return ret ?: ret2 < 0 ? ret2 : 0;
}
@@ -777,7 +787,7 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr,
}
} else {
rcu_read_lock();
- ob = bch2_bucket_alloc(c, ca, RESERVE_ALLOC,
+ ob = bch2_bucket_alloc(c, ca, RESERVE_NONE,
false, cl);
rcu_read_unlock();
if (IS_ERR(ob)) {
@@ -818,18 +828,28 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr,
if (pos <= ja->cur_idx)
ja->cur_idx = (ja->cur_idx + 1) % ja->nr;
- bch2_mark_metadata_bucket(c, ca, bucket, BCH_DATA_journal,
- ca->mi.bucket_size,
- gc_phase(GC_PHASE_SB),
- 0);
+ if (!c || new_fs)
+ bch2_mark_metadata_bucket(c, ca, bucket, BCH_DATA_journal,
+ ca->mi.bucket_size,
+ gc_phase(GC_PHASE_SB),
+ 0);
if (c) {
spin_unlock(&c->journal.lock);
percpu_up_read(&c->mark_lock);
}
+ if (c && !new_fs)
+ ret = bch2_trans_do(c, NULL, NULL, BTREE_INSERT_NOFAIL,
+ bch2_trans_mark_metadata_bucket(&trans, NULL, ca,
+ bucket, BCH_DATA_journal,
+ ca->mi.bucket_size));
+
if (!new_fs)
bch2_open_bucket_put(c, ob);
+
+ if (ret)
+ goto err;
}
err:
bch2_sb_resize_journal(&ca->disk_sb,
@@ -948,6 +968,7 @@ void bch2_fs_journal_stop(struct journal *j)
journal_quiesce(j);
BUG_ON(!bch2_journal_error(j) &&
+ test_bit(JOURNAL_REPLAY_DONE, &j->flags) &&
(journal_entry_is_open(j) ||
j->last_empty_seq + 1 != journal_cur_seq(j)));
@@ -993,13 +1014,19 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq,
}
list_for_each_entry(i, journal_entries, list) {
+ unsigned ptr;
+
seq = le64_to_cpu(i->j.seq);
BUG_ON(seq >= cur_seq);
if (seq < last_seq)
continue;
- journal_seq_pin(j, seq)->devs = i->devs;
+ p = journal_seq_pin(j, seq);
+
+ p->devs.nr = 0;
+ for (ptr = 0; ptr < i->nr_ptrs; ptr++)
+ bch2_dev_list_add_dev(&p->devs, i->ptrs[ptr].dev);
}
spin_lock(&j->lock);
@@ -1093,10 +1120,6 @@ int bch2_fs_journal_init(struct journal *j)
j->write_delay_ms = 1000;
j->reclaim_delay_ms = 100;
- /* Btree roots: */
- j->entry_u64s_reserved +=
- BTREE_ID_NR * (JSET_KEYS_U64s + BKEY_EXTENT_U64s_MAX);
-
atomic64_set(&j->reservations.counter,
((union journal_res_state)
{ .cur_entry_offset = JOURNAL_ENTRY_CLOSED_VAL }).v);
@@ -1138,6 +1161,7 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j)
"seq:\t\t\t%llu\n"
"last_seq:\t\t%llu\n"
"last_seq_ondisk:\t%llu\n"
+ "flushed_seq_ondisk:\t%llu\n"
"prereserved:\t\t%u/%u\n"
"nr flush writes:\t%llu\n"
"nr noflush writes:\t%llu\n"
@@ -1150,6 +1174,7 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j)
journal_cur_seq(j),
journal_last_seq(j),
j->last_seq_ondisk,
+ j->flushed_seq_ondisk,
j->prereserved.reserved,
j->prereserved.remaining,
j->nr_flush_writes,
diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h
index 34ef66994b73..bda8cb97d321 100644
--- a/fs/bcachefs/journal.h
+++ b/fs/bcachefs/journal.h
@@ -288,7 +288,7 @@ static inline void bch2_journal_res_put(struct journal *j,
if (!res->ref)
return;
- lock_release(&j->res_map, 0, _THIS_IP_);
+ lock_release(&j->res_map, _THIS_IP_);
while (res->u64s)
bch2_journal_add_entry(j, res,
@@ -494,11 +494,6 @@ static inline int bch2_journal_error(struct journal *j)
struct bch_dev;
-static inline bool journal_flushes_device(struct bch_dev *ca)
-{
- return true;
-}
-
static inline void bch2_journal_set_replay_done(struct journal *j)
{
BUG_ON(!test_bit(JOURNAL_STARTED, &j->flags));
diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c
index 0e6fbe2f6a75..0d361f5c39b5 100644
--- a/fs/bcachefs/journal_io.c
+++ b/fs/bcachefs/journal_io.c
@@ -5,6 +5,7 @@
#include "btree_update_interior.h"
#include "buckets.h"
#include "checksum.h"
+#include "disk_groups.h"
#include "error.h"
#include "io.h"
#include "journal.h"
@@ -46,15 +47,16 @@ struct journal_list {
* be replayed:
*/
static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca,
+ struct bch_extent_ptr entry_ptr,
struct journal_list *jlist, struct jset *j,
bool bad)
{
- struct journal_replay *i, *pos;
- struct bch_devs_list devs = { .nr = 0 };
+ struct journal_replay *i, *pos, *dup = NULL;
+ struct bch_extent_ptr *ptr;
struct list_head *where;
size_t bytes = vstruct_bytes(j);
u64 last_seq = 0;
- int ret;
+ int ret = JOURNAL_ENTRY_ADD_OK;
list_for_each_entry_reverse(i, jlist->head, list) {
if (!JSET_NO_FLUSH(&i->j)) {
@@ -88,28 +90,31 @@ static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca,
where = jlist->head;
add:
- i = where->next != jlist->head
+ dup = where->next != jlist->head
? container_of(where->next, struct journal_replay, list)
: NULL;
+ if (dup && le64_to_cpu(j->seq) != le64_to_cpu(dup->j.seq))
+ dup = NULL;
+
/*
* Duplicate journal entries? If so we want the one that didn't have a
* checksum error:
*/
- if (i && le64_to_cpu(j->seq) == le64_to_cpu(i->j.seq)) {
- if (i->bad) {
- devs = i->devs;
- __journal_replay_free(i);
+ if (dup) {
+ if (dup->bad) {
+ /* we'll replace @dup: */
} else if (bad) {
+ i = dup;
goto found;
} else {
- fsck_err_on(bytes != vstruct_bytes(&i->j) ||
- memcmp(j, &i->j, bytes), c,
+ fsck_err_on(bytes != vstruct_bytes(&dup->j) ||
+ memcmp(j, &dup->j, bytes), c,
"found duplicate but non identical journal entries (seq %llu)",
le64_to_cpu(j->seq));
+ i = dup;
goto found;
}
-
}
i = kvpmalloc(offsetof(struct journal_replay, j) + bytes, GFP_KERNEL);
@@ -118,17 +123,34 @@ add:
goto out;
}
- list_add(&i->list, where);
- i->devs = devs;
- i->bad = bad;
- i->ignore = false;
+ i->nr_ptrs = 0;
+ i->bad = bad;
+ i->ignore = false;
memcpy(&i->j, j, bytes);
+
+ if (dup) {
+ i->nr_ptrs = dup->nr_ptrs;
+ memcpy(i->ptrs, dup->ptrs, sizeof(dup->ptrs));
+ __journal_replay_free(dup);
+ }
+
+ list_add(&i->list, where);
found:
- if (!bch2_dev_list_has_dev(i->devs, ca->dev_idx))
- bch2_dev_list_add_dev(&i->devs, ca->dev_idx);
- else
- fsck_err_on(1, c, "duplicate journal entries on same device");
- ret = JOURNAL_ENTRY_ADD_OK;
+ for (ptr = i->ptrs; ptr < i->ptrs + i->nr_ptrs; ptr++) {
+ if (ptr->dev == ca->dev_idx) {
+ bch_err(c, "duplicate journal entry %llu on same device",
+ le64_to_cpu(i->j.seq));
+ goto out;
+ }
+ }
+
+ if (i->nr_ptrs >= ARRAY_SIZE(i->ptrs)) {
+ bch_err(c, "found too many copies of journal entry %llu",
+ le64_to_cpu(i->j.seq));
+ goto out;
+ }
+
+ i->ptrs[i->nr_ptrs++] = entry_ptr;
out:
fsck_err:
return ret;
@@ -405,6 +427,69 @@ fsck_err:
return ret;
}
+static int journal_entry_validate_clock(struct bch_fs *c,
+ struct jset *jset,
+ struct jset_entry *entry,
+ int write)
+{
+ struct jset_entry_clock *clock =
+ container_of(entry, struct jset_entry_clock, entry);
+ unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64);
+ int ret = 0;
+
+ if (journal_entry_err_on(bytes != sizeof(*clock),
+ c, "invalid journal entry clock: bad size")) {
+ journal_entry_null_range(entry, vstruct_next(entry));
+ return ret;
+ }
+
+ if (journal_entry_err_on(clock->rw > 1,
+ c, "invalid journal entry clock: bad rw")) {
+ journal_entry_null_range(entry, vstruct_next(entry));
+ return ret;
+ }
+
+fsck_err:
+ return ret;
+}
+
+static int journal_entry_validate_dev_usage(struct bch_fs *c,
+ struct jset *jset,
+ struct jset_entry *entry,
+ int write)
+{
+ struct jset_entry_dev_usage *u =
+ container_of(entry, struct jset_entry_dev_usage, entry);
+ unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64);
+ unsigned expected = sizeof(*u) + sizeof(u->d[0]) * 7; /* Current value of BCH_DATA_NR */
+ unsigned dev;
+ int ret = 0;
+
+ if (journal_entry_err_on(bytes < expected,
+ c, "invalid journal entry dev usage: bad size (%u < %u)",
+ bytes, expected)) {
+ journal_entry_null_range(entry, vstruct_next(entry));
+ return ret;
+ }
+
+ dev = le32_to_cpu(u->dev);
+
+ if (journal_entry_err_on(!bch2_dev_exists2(c, dev),
+ c, "invalid journal entry dev usage: bad dev")) {
+ journal_entry_null_range(entry, vstruct_next(entry));
+ return ret;
+ }
+
+ if (journal_entry_err_on(u->pad,
+ c, "invalid journal entry dev usage: bad pad")) {
+ journal_entry_null_range(entry, vstruct_next(entry));
+ return ret;
+ }
+
+fsck_err:
+ return ret;
+}
+
struct jset_entry_ops {
int (*validate)(struct bch_fs *, struct jset *,
struct jset_entry *, int);
@@ -470,7 +555,8 @@ static int jset_validate(struct bch_fs *c,
version < bcachefs_metadata_version_min) ||
version >= bcachefs_metadata_version_max, c,
"%s sector %llu seq %llu: unknown journal entry version %u",
- ca->name, sector, le64_to_cpu(jset->seq),
+ ca ? ca->name : c->name,
+ sector, le64_to_cpu(jset->seq),
version)) {
/* don't try to continue: */
return EINVAL;
@@ -482,32 +568,42 @@ static int jset_validate(struct bch_fs *c,
if (journal_entry_err_on(bytes > bucket_sectors_left << 9, c,
"%s sector %llu seq %llu: journal entry too big (%zu bytes)",
- ca->name, sector, le64_to_cpu(jset->seq), bytes)) {
+ ca ? ca->name : c->name,
+ sector, le64_to_cpu(jset->seq), bytes)) {
ret = JOURNAL_ENTRY_BAD;
le32_add_cpu(&jset->u64s,
-((bytes - (bucket_sectors_left << 9)) / 8));
}
- if (fsck_err_on(!bch2_checksum_type_valid(c, JSET_CSUM_TYPE(jset)), c,
+ if (journal_entry_err_on(!bch2_checksum_type_valid(c, JSET_CSUM_TYPE(jset)), c,
"%s sector %llu seq %llu: journal entry with unknown csum type %llu",
- ca->name, sector, le64_to_cpu(jset->seq),
+ ca ? ca->name : c->name,
+ sector, le64_to_cpu(jset->seq),
JSET_CSUM_TYPE(jset))) {
ret = JOURNAL_ENTRY_BAD;
- goto bad_csum_type;
+ goto csum_done;
}
+ if (write)
+ goto csum_done;
+
csum = csum_vstruct(c, JSET_CSUM_TYPE(jset), journal_nonce(jset), jset);
if (journal_entry_err_on(bch2_crc_cmp(csum, jset->csum), c,
"%s sector %llu seq %llu: journal checksum bad",
- ca->name, sector, le64_to_cpu(jset->seq)))
+ ca ? ca->name : c->name,
+ sector, le64_to_cpu(jset->seq)))
ret = JOURNAL_ENTRY_BAD;
bch2_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset),
jset->encrypted_start,
vstruct_end(jset) - (void *) jset->encrypted_start);
-bad_csum_type:
- if (journal_entry_err_on(le64_to_cpu(jset->last_seq) > le64_to_cpu(jset->seq), c,
- "invalid journal entry: last_seq > seq")) {
+csum_done:
+ /* last_seq is ignored when JSET_NO_FLUSH is true */
+ if (journal_entry_err_on(!JSET_NO_FLUSH(jset) &&
+ le64_to_cpu(jset->last_seq) > le64_to_cpu(jset->seq), c,
+ "invalid journal entry: last_seq > seq (%llu > %llu)",
+ le64_to_cpu(jset->last_seq),
+ le64_to_cpu(jset->seq))) {
jset->last_seq = jset->seq;
return JOURNAL_ENTRY_BAD;
}
@@ -515,6 +611,14 @@ fsck_err:
return ret;
}
+static int jset_validate_for_write(struct bch_fs *c, struct jset *jset)
+{
+ unsigned sectors = vstruct_sectors(jset, c->block_bits);
+
+ return jset_validate(c, NULL, jset, 0, sectors, sectors, WRITE) ?:
+ jset_validate_entries(c, jset, WRITE);
+}
+
struct journal_read_buf {
void *data;
size_t size;
@@ -577,8 +681,15 @@ reread:
if (bch2_dev_io_err_on(ret, ca,
"journal read error: sector %llu",
offset) ||
- bch2_meta_read_fault("journal"))
- return -EIO;
+ bch2_meta_read_fault("journal")) {
+ /*
+ * We don't error out of the recovery process
+ * here, since the relevant journal entry may be
+ * found on a different device, and missing or
+ * no journal entries will be handled later
+ */
+ return 0;
+ }
j = buf->data;
}
@@ -628,7 +739,10 @@ reread:
ja->bucket_seq[bucket] = le64_to_cpu(j->seq);
mutex_lock(&jlist->lock);
- ret = journal_entry_add(c, ca, jlist, j, ret != 0);
+ ret = journal_entry_add(c, ca, (struct bch_extent_ptr) {
+ .dev = ca->dev_idx,
+ .offset = offset,
+ }, jlist, j, ret != 0);
mutex_unlock(&jlist->lock);
switch (ret) {
@@ -716,6 +830,25 @@ err:
goto out;
}
+static void bch2_journal_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
+ struct journal_replay *j)
+{
+ unsigned i;
+
+ for (i = 0; i < j->nr_ptrs; i++) {
+ struct bch_dev *ca = c->devs[j->ptrs[i].dev];
+ u64 offset;
+
+ div64_u64_rem(j->ptrs[i].offset, ca->mi.bucket_size, &offset);
+
+ if (i)
+ pr_buf(out, " ");
+ pr_buf(out, "%u:%llu (offset %llu)",
+ j->ptrs[i].dev,
+ (u64) j->ptrs[i].offset, offset);
+ }
+}
+
int bch2_journal_read(struct bch_fs *c, struct list_head *list,
u64 *blacklist_seq, u64 *start_seq)
{
@@ -813,6 +946,7 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list,
while (seq < le64_to_cpu(i->j.seq)) {
u64 missing_start, missing_end;
+ char buf1[200], buf2[200];
while (seq < le64_to_cpu(i->j.seq) &&
bch2_journal_seq_is_blacklisted(c, seq, false))
@@ -827,10 +961,23 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list,
!bch2_journal_seq_is_blacklisted(c, seq, false))
seq++;
+ if (i->list.prev != list) {
+ struct printbuf out = PBUF(buf1);
+ struct journal_replay *p = list_prev_entry(i, list);
+
+ bch2_journal_ptrs_to_text(&out, c, p);
+ pr_buf(&out, " size %llu", vstruct_sectors(&p->j, c->block_bits));
+ } else
+ sprintf(buf1, "(none)");
+ bch2_journal_ptrs_to_text(&PBUF(buf2), c, i);
+
missing_end = seq - 1;
- fsck_err(c, "journal entries %llu-%llu missing! (replaying %llu-%llu)",
+ fsck_err(c, "journal entries %llu-%llu missing! (replaying %llu-%llu)\n"
+ " prev at %s\n"
+ " next at %s",
missing_start, missing_end,
- last_seq, *blacklist_seq - 1);
+ last_seq, *blacklist_seq - 1,
+ buf1, buf2);
}
seq++;
@@ -839,7 +986,11 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list,
list_for_each_entry(i, list, list) {
struct jset_entry *entry;
struct bkey_i *k, *_n;
- struct bch_replicas_padded replicas;
+ struct bch_replicas_padded replicas = {
+ .e.data_type = BCH_DATA_journal,
+ .e.nr_required = 1,
+ };
+ unsigned ptr;
char buf[80];
if (i->ignore)
@@ -849,13 +1000,16 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list,
if (ret)
goto fsck_err;
+ for (ptr = 0; ptr < i->nr_ptrs; ptr++)
+ replicas.e.devs[replicas.e.nr_devs++] = i->ptrs[ptr].dev;
+
+ bch2_replicas_entry_sort(&replicas.e);
+
/*
* If we're mounting in degraded mode - if we didn't read all
* the devices - this is wrong:
*/
- bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal, i->devs);
-
if (!degraded &&
(test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) ||
fsck_err_on(!bch2_replicas_marked(c, &replicas.e), c,
@@ -946,16 +1100,20 @@ static int journal_write_alloc(struct journal *j, struct journal_buf *w,
unsigned sectors)
{
struct bch_fs *c = container_of(j, struct bch_fs, journal);
+ struct bch_devs_mask devs;
struct journal_device *ja;
struct bch_dev *ca;
struct dev_alloc_list devs_sorted;
+ unsigned target = c->opts.metadata_target ?:
+ c->opts.foreground_target;
unsigned i, replicas = 0, replicas_want =
READ_ONCE(c->opts.metadata_replicas);
rcu_read_lock();
+retry:
+ devs = target_rw_devs(c, BCH_DATA_journal, target);
- devs_sorted = bch2_dev_alloc_list(c, &j->wp.stripe,
- &c->rw_devs[BCH_DATA_journal]);
+ devs_sorted = bch2_dev_alloc_list(c, &j->wp.stripe, &devs);
__journal_write_alloc(j, w, &devs_sorted,
sectors, &replicas, replicas_want);
@@ -987,9 +1145,17 @@ static int journal_write_alloc(struct journal *j, struct journal_buf *w,
__journal_write_alloc(j, w, &devs_sorted,
sectors, &replicas, replicas_want);
+
+ if (replicas < replicas_want && target) {
+ /* Retry from all devices: */
+ target = 0;
+ goto retry;
+ }
done:
rcu_read_unlock();
+ BUG_ON(bkey_val_u64s(&w->key.k) > BCH_REPLICAS_MAX);
+
return replicas >= c->opts.metadata_replicas_required ? 0 : -EROFS;
}
@@ -1050,9 +1216,13 @@ static void journal_buf_realloc(struct journal *j, struct journal_buf *buf)
return;
memcpy(new_buf, buf->data, buf->buf_size);
- kvpfree(buf->data, buf->buf_size);
- buf->data = new_buf;
- buf->buf_size = new_size;
+
+ spin_lock(&j->lock);
+ swap(buf->data, new_buf);
+ swap(buf->buf_size, new_size);
+ spin_unlock(&j->lock);
+
+ kvpfree(new_buf, new_size);
}
static inline struct journal_buf *journal_last_unwritten_buf(struct journal *j)
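The journal_buf_realloc() change above replaces free-then-assign with a swap under j->lock and a free after the lock is dropped, so anyone reading buf->data and buf->buf_size under the lock always sees a matching pair, and the potentially slow kvpfree() never runs inside the critical section. A minimal standalone sketch of that pattern, using pthread and libc stand-ins rather than the bcachefs types, and assuming a single resizing thread:

#include <pthread.h>
#include <stdlib.h>
#include <string.h>

struct buf {
	pthread_mutex_t	lock;
	void		*data;
	size_t		size;
};

static void buf_realloc(struct buf *b, size_t new_size)
{
	void *new_data = malloc(new_size);
	void *old_data;

	if (!new_data)
		return;			/* keep the old buffer on failure */

	/* only the resizing thread writes ->data/->size, so this read is safe */
	memcpy(new_data, b->data, b->size < new_size ? b->size : new_size);

	pthread_mutex_lock(&b->lock);
	old_data = b->data;		/* swap under the lock... */
	b->data  = new_data;
	b->size  = new_size;
	pthread_mutex_unlock(&b->lock);

	free(old_data);			/* ...free outside it */
}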
@@ -1069,9 +1239,7 @@ static void journal_write_done(struct closure *cl)
bch2_bkey_devs(bkey_i_to_s_c(&w->key));
struct bch_replicas_padded replicas;
union journal_res_state old, new;
- u64 seq = le64_to_cpu(w->data->seq);
- u64 last_seq = le64_to_cpu(w->data->last_seq);
- u64 v;
+ u64 v, seq, last_seq;
int err = 0;
bch2_time_stats_update(j->write_time, j->write_start_time);
@@ -1089,6 +1257,9 @@ static void journal_write_done(struct closure *cl)
bch2_fatal_error(c);
spin_lock(&j->lock);
+ seq = le64_to_cpu(w->data->seq);
+ last_seq = le64_to_cpu(w->data->last_seq);
+
if (seq >= j->pin.front)
journal_seq_pin(j, seq)->devs = devs;
@@ -1096,7 +1267,7 @@ static void journal_write_done(struct closure *cl)
if (err && (!j->err_seq || seq < j->err_seq))
j->err_seq = seq;
- if (!w->noflush) {
+ if (!JSET_NO_FLUSH(w->data)) {
j->flushed_seq_ondisk = seq;
j->last_seq_ondisk = last_seq;
}
@@ -1156,6 +1327,56 @@ static void journal_write_endio(struct bio *bio)
percpu_ref_put(&ca->io_ref);
}
+static void do_journal_write(struct closure *cl)
+{
+ struct journal *j = container_of(cl, struct journal, io);
+ struct bch_fs *c = container_of(j, struct bch_fs, journal);
+ struct bch_dev *ca;
+ struct journal_buf *w = journal_last_unwritten_buf(j);
+ struct bch_extent_ptr *ptr;
+ struct bio *bio;
+ unsigned sectors = vstruct_sectors(w->data, c->block_bits);
+
+ extent_for_each_ptr(bkey_i_to_s_extent(&w->key), ptr) {
+ ca = bch_dev_bkey_exists(c, ptr->dev);
+ if (!percpu_ref_tryget(&ca->io_ref)) {
+ /* XXX: fix this */
+ bch_err(c, "missing device for journal write\n");
+ continue;
+ }
+
+ this_cpu_add(ca->io_done->sectors[WRITE][BCH_DATA_journal],
+ sectors);
+
+ bio = ca->journal.bio;
+ bio_reset(bio);
+ bio_set_dev(bio, ca->disk_sb.bdev);
+ bio->bi_iter.bi_sector = ptr->offset;
+ bio->bi_end_io = journal_write_endio;
+ bio->bi_private = ca;
+ bio->bi_opf = REQ_OP_WRITE|REQ_SYNC|REQ_META;
+
+ BUG_ON(bio->bi_iter.bi_sector == ca->prev_journal_sector);
+ ca->prev_journal_sector = bio->bi_iter.bi_sector;
+
+ if (!JSET_NO_FLUSH(w->data))
+ bio->bi_opf |= REQ_FUA;
+ if (!JSET_NO_FLUSH(w->data) && !w->separate_flush)
+ bio->bi_opf |= REQ_PREFLUSH;
+
+ bch2_bio_map(bio, w->data, sectors << 9);
+
+ trace_journal_write(bio);
+ closure_bio_submit(bio, cl);
+
+ ca->journal.bucket_seq[ca->journal.cur_idx] =
+ le64_to_cpu(w->data->seq);
+ }
+
+ continue_at(cl, journal_write_done, system_highpri_wq);
+ return;
+}
+
void bch2_journal_write(struct closure *cl)
{
struct journal *j = container_of(cl, struct journal, io);
@@ -1165,9 +1386,9 @@ void bch2_journal_write(struct closure *cl)
struct jset_entry *start, *end;
struct jset *jset;
struct bio *bio;
- struct bch_extent_ptr *ptr;
+ char *journal_debug_buf = NULL;
bool validate_before_checksum = false;
- unsigned i, sectors, bytes, u64s;
+ unsigned i, sectors, bytes, u64s, nr_rw_members = 0;
int ret;
BUG_ON(BCH_SB_CLEAN(c->disk_sb.sb));
@@ -1184,7 +1405,7 @@ void bch2_journal_write(struct closure *cl)
test_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags)) {
w->noflush = true;
SET_JSET_NO_FLUSH(jset, true);
- jset->last_seq = cpu_to_le64(j->last_seq_ondisk);
+ jset->last_seq = 0;
j->nr_noflush_writes++;
} else {
@@ -1209,8 +1430,8 @@ void bch2_journal_write(struct closure *cl)
end = bch2_btree_roots_to_journal_entries(c, jset->start, end);
- end = bch2_journal_super_entries_add_common(c, end,
- le64_to_cpu(jset->seq));
+ bch2_journal_super_entries_add_common(c, &end,
+ le64_to_cpu(jset->seq));
u64s = (u64 *) end - (u64 *) start;
BUG_ON(u64s > j->entry_u64s_reserved);
@@ -1219,10 +1440,7 @@ void bch2_journal_write(struct closure *cl)
journal_write_compact(jset);
- jset->read_clock = cpu_to_le16(c->bucket_clock[READ].hand);
- jset->write_clock = cpu_to_le16(c->bucket_clock[WRITE].hand);
jset->magic = cpu_to_le64(jset_magic(c));
-
jset->version = c->sb.version < bcachefs_metadata_version_new_versioning
? cpu_to_le32(BCH_JSET_VERSION_OLD)
: cpu_to_le32(c->sb.version);
@@ -1236,11 +1454,11 @@ void bch2_journal_write(struct closure *cl)
if (bch2_csum_type_is_encryption(JSET_CSUM_TYPE(jset)))
validate_before_checksum = true;
- if (le32_to_cpu(jset->version) < bcachefs_metadata_version_max)
+ if (le32_to_cpu(jset->version) <= bcachefs_metadata_version_inode_btree_change)
validate_before_checksum = true;
if (validate_before_checksum &&
- jset_validate_entries(c, jset, WRITE))
+ jset_validate_for_write(c, jset))
goto err;
bch2_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset),
@@ -1251,7 +1469,7 @@ void bch2_journal_write(struct closure *cl)
journal_nonce(jset), jset);
if (!validate_before_checksum &&
- jset_validate_entries(c, jset, WRITE))
+ jset_validate_for_write(c, jset))
goto err;
sectors = vstruct_sectors(jset, c->block_bits);
@@ -1270,6 +1488,12 @@ retry_alloc:
goto retry_alloc;
}
+ if (ret) {
+ journal_debug_buf = kmalloc(4096, GFP_ATOMIC);
+ if (journal_debug_buf)
+ __bch2_journal_debug_to_text(&_PBUF(journal_debug_buf, 4096), j);
+ }
+
/*
* write is allocated, no longer need to account for it in
* bch2_journal_space_available():
@@ -1284,7 +1508,9 @@ retry_alloc:
spin_unlock(&j->lock);
if (ret) {
- bch_err(c, "Unable to allocate journal write");
+ bch_err(c, "Unable to allocate journal write:\n%s",
+ journal_debug_buf);
+ kfree(journal_debug_buf);
bch2_fatal_error(c);
continue_at(cl, journal_write_done, system_highpri_wq);
return;
@@ -1297,49 +1523,30 @@ retry_alloc:
if (c->opts.nochanges)
goto no_io;
- extent_for_each_ptr(bkey_i_to_s_extent(&w->key), ptr) {
- ca = bch_dev_bkey_exists(c, ptr->dev);
- if (!percpu_ref_tryget(&ca->io_ref)) {
- /* XXX: fix this */
- bch_err(c, "missing device for journal write\n");
- continue;
- }
+ for_each_rw_member(ca, c, i)
+ nr_rw_members++;
- this_cpu_add(ca->io_done->sectors[WRITE][BCH_DATA_journal],
- sectors);
+ if (nr_rw_members > 1)
+ w->separate_flush = true;
- bio = ca->journal.bio;
- bio_reset(bio);
- bio_set_dev(bio, ca->disk_sb.bdev);
- bio->bi_iter.bi_sector = ptr->offset;
- bio->bi_end_io = journal_write_endio;
- bio->bi_private = ca;
- bio->bi_opf = REQ_OP_WRITE|REQ_SYNC|REQ_META;
- if (!JSET_NO_FLUSH(jset))
- bio->bi_opf |= REQ_PREFLUSH|REQ_FUA;
- bch2_bio_map(bio, jset, sectors << 9);
+ if (!JSET_NO_FLUSH(jset) && w->separate_flush) {
+ for_each_rw_member(ca, c, i) {
+ percpu_ref_get(&ca->io_ref);
- trace_journal_write(bio);
- closure_bio_submit(bio, cl);
-
- ca->journal.bucket_seq[ca->journal.cur_idx] = le64_to_cpu(jset->seq);
+ bio = ca->journal.bio;
+ bio_reset(bio);
+ bio_set_dev(bio, ca->disk_sb.bdev);
+ bio->bi_opf = REQ_OP_FLUSH;
+ bio->bi_end_io = journal_write_endio;
+ bio->bi_private = ca;
+ closure_bio_submit(bio, cl);
+ }
}
- if (!JSET_NO_FLUSH(jset)) {
- for_each_rw_member(ca, c, i)
- if (journal_flushes_device(ca) &&
- !bch2_bkey_has_device(bkey_i_to_s_c(&w->key), i)) {
- percpu_ref_get(&ca->io_ref);
-
- bio = ca->journal.bio;
- bio_reset(bio);
- bio_set_dev(bio, ca->disk_sb.bdev);
- bio->bi_opf = REQ_OP_FLUSH;
- bio->bi_end_io = journal_write_endio;
- bio->bi_private = ca;
- closure_bio_submit(bio, cl);
- }
- }
+ bch2_bucket_seq_cleanup(c);
+
+ continue_at(cl, do_journal_write, system_highpri_wq);
+ return;
no_io:
bch2_bucket_seq_cleanup(c);
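For reference, the pointer bookkeeping that journal_entry_add() now does per replay entry reduces to: keep at most one pointer per device in a small fixed array, reject a second pointer from the same device, and refuse to grow past the array. A self-contained sketch with illustrative names (replay_entry and MAX_PTRS are not bcachefs identifiers):

#include <stdbool.h>

#define MAX_PTRS 4			/* stands in for BCH_REPLICAS_MAX */

struct entry_ptr {
	unsigned		dev;
	unsigned long long	offset;
};

struct replay_entry {
	struct entry_ptr	ptrs[MAX_PTRS];
	unsigned		nr_ptrs;
};

/* returns false on a duplicate device or a full array, true otherwise */
static bool replay_entry_add_ptr(struct replay_entry *e, struct entry_ptr p)
{
	unsigned i;

	for (i = 0; i < e->nr_ptrs; i++)
		if (e->ptrs[i].dev == p.dev)
			return false;	/* already have this entry from this device */

	if (e->nr_ptrs >= MAX_PTRS)
		return false;		/* too many copies of this entry */

	e->ptrs[e->nr_ptrs++] = p;
	return true;
}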
diff --git a/fs/bcachefs/journal_io.h b/fs/bcachefs/journal_io.h
index 6b4c80968f52..a4931ab93a68 100644
--- a/fs/bcachefs/journal_io.h
+++ b/fs/bcachefs/journal_io.h
@@ -8,7 +8,9 @@
*/
struct journal_replay {
struct list_head list;
- struct bch_devs_list devs;
+ struct bch_extent_ptr ptrs[BCH_REPLICAS_MAX];
+ unsigned nr_ptrs;
+
/* checksum error, but we may want to try using it anyways: */
bool bad;
bool ignore;
diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c
index 61bcd77190a7..bbf8e5ad8aa0 100644
--- a/fs/bcachefs/journal_reclaim.c
+++ b/fs/bcachefs/journal_reclaim.c
@@ -11,7 +11,6 @@
#include <linux/kthread.h>
#include <linux/sched/mm.h>
-#include <linux/sched/task.h>
#include <trace/events/bcachefs.h>
/* Free space calculations: */
@@ -385,12 +384,22 @@ void bch2_journal_pin_set(struct journal *j, u64 seq,
struct journal_entry_pin_list *pin_list;
spin_lock(&j->lock);
+
+ if (seq < journal_last_seq(j)) {
+ /*
+ * bch2_journal_pin_copy() raced with bch2_journal_pin_drop() on
+ * the src pin - with the pin dropped, the entry to pin might no
+ * longer to exist, but that means there's no longer anything to
+ * longer exist, but that means there's no longer anything to
+ */
+ spin_unlock(&j->lock);
+ return;
+ }
+
pin_list = journal_seq_pin(j, seq);
__journal_pin_drop(j, pin);
- BUG_ON(!atomic_read(&pin_list->count) && seq == journal_last_seq(j));
-
atomic_inc(&pin_list->count);
pin->seq = seq;
pin->flush = flush_fn;
@@ -682,8 +691,10 @@ int bch2_journal_reclaim_start(struct journal *j)
p = kthread_create(bch2_journal_reclaim_thread, j,
"bch-reclaim/%s", c->name);
- if (IS_ERR(p))
+ if (IS_ERR(p)) {
+ bch_err(c, "error creating journal reclaim thread: %li", PTR_ERR(p));
return PTR_ERR(p);
+ }
get_task_struct(p);
j->reclaim_thread = p;
diff --git a/fs/bcachefs/journal_reclaim.h b/fs/bcachefs/journal_reclaim.h
index f02caa3d49ea..adf1f5c981cd 100644
--- a/fs/bcachefs/journal_reclaim.h
+++ b/fs/bcachefs/journal_reclaim.h
@@ -53,8 +53,11 @@ static inline void bch2_journal_pin_copy(struct journal *j,
struct journal_entry_pin *src,
journal_pin_flush_fn flush_fn)
{
- if (journal_pin_active(src))
- bch2_journal_pin_add(j, src->seq, dst, flush_fn);
+ /* Guard against racing with journal_pin_drop(src): */
+ u64 seq = READ_ONCE(src->seq);
+
+ if (seq)
+ bch2_journal_pin_add(j, seq, dst, flush_fn);
}
static inline void bch2_journal_pin_update(struct journal *j, u64 seq,
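bch2_journal_pin_copy() above now takes a single READ_ONCE() snapshot of src->seq instead of calling journal_pin_active() and then reading the field again, so the value tested and the value handed to bch2_journal_pin_add() are the same even if a racing pin drop zeroes it in between; the new seq < journal_last_seq() bail-out in bch2_journal_pin_set() covers the case where that snapshot is already stale. A compact illustration of the snapshot idiom, with placeholder types and a simplified READ_ONCE():

#include <stdint.h>

#define READ_ONCE(x)	(*(volatile __typeof__(x) *)&(x))	/* simplified */

struct pin {
	uint64_t seq;			/* 0 means "not pinned" */
};

static void pin_add(uint64_t seq)
{
	(void) seq;			/* would pin @seq under the journal lock */
}

static void pin_copy(struct pin *src)
{
	uint64_t seq = READ_ONCE(src->seq);	/* read the field exactly once */

	if (seq)			/* test and use the same snapshot */
		pin_add(seq);
}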
diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h
index 67ee47eb17a7..d17a1ff82a18 100644
--- a/fs/bcachefs/journal_types.h
+++ b/fs/bcachefs/journal_types.h
@@ -20,7 +20,7 @@
struct journal_buf {
struct jset *data;
- BKEY_PADDED(key);
+ __BKEY_PADDED(key, BCH_REPLICAS_MAX);
struct closure_waitlist wait;
@@ -31,6 +31,7 @@ struct journal_buf {
unsigned u64s_reserved;
bool noflush; /* write has already been kicked off, and was noflush */
bool must_flush; /* something wants a flush */
+ bool separate_flush;
/* bloom filter: */
unsigned long has_inode[1024 / sizeof(unsigned long)];
};
diff --git a/fs/bcachefs/migrate.c b/fs/bcachefs/migrate.c
index 96c8690adc5b..6241ff0c129f 100644
--- a/fs/bcachefs/migrate.c
+++ b/fs/bcachefs/migrate.c
@@ -4,7 +4,7 @@
*/
#include "bcachefs.h"
-#include "bkey_on_stack.h"
+#include "bkey_buf.h"
#include "btree_update.h"
#include "btree_update_interior.h"
#include "buckets.h"
@@ -41,10 +41,10 @@ static int __bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags
struct btree_trans trans;
struct btree_iter *iter;
struct bkey_s_c k;
- struct bkey_on_stack sk;
+ struct bkey_buf sk;
int ret = 0;
- bkey_on_stack_init(&sk);
+ bch2_bkey_buf_init(&sk);
bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
iter = bch2_trans_get_iter(&trans, btree_id, POS_MIN,
@@ -57,7 +57,7 @@ static int __bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags
continue;
}
- bkey_on_stack_reassemble(&sk, c, k);
+ bch2_bkey_buf_reassemble(&sk, c, k);
ret = drop_dev_ptrs(c, bkey_i_to_s(sk.k),
dev_idx, flags, false);
@@ -90,7 +90,7 @@ static int __bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags
}
ret = bch2_trans_exit(&trans) ?: ret;
- bkey_on_stack_exit(&sk, c);
+ bch2_bkey_buf_exit(&sk, c);
BUG_ON(ret == -EINTR);
@@ -109,6 +109,7 @@ static int bch2_dev_metadata_drop(struct bch_fs *c, unsigned dev_idx, int flags)
struct btree_iter *iter;
struct closure cl;
struct btree *b;
+ struct bkey_buf k;
unsigned id;
int ret;
@@ -116,28 +117,28 @@ static int bch2_dev_metadata_drop(struct bch_fs *c, unsigned dev_idx, int flags)
if (flags & BCH_FORCE_IF_METADATA_LOST)
return -EINVAL;
+ bch2_bkey_buf_init(&k);
bch2_trans_init(&trans, c, 0, 0);
closure_init_stack(&cl);
for (id = 0; id < BTREE_ID_NR; id++) {
for_each_btree_node(&trans, iter, id, POS_MIN,
BTREE_ITER_PREFETCH, b) {
- __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX) tmp;
retry:
if (!bch2_bkey_has_device(bkey_i_to_s_c(&b->key),
dev_idx))
continue;
- bkey_copy(&tmp.k, &b->key);
+ bch2_bkey_buf_copy(&k, c, &b->key);
- ret = drop_dev_ptrs(c, bkey_i_to_s(&tmp.k),
+ ret = drop_dev_ptrs(c, bkey_i_to_s(k.k),
dev_idx, flags, true);
if (ret) {
bch_err(c, "Cannot drop device without losing data");
goto err;
}
- ret = bch2_btree_node_update_key(c, iter, b, &tmp.k);
+ ret = bch2_btree_node_update_key(c, iter, b, k.k);
if (ret == -EINTR) {
b = bch2_btree_iter_peek_node(iter);
goto retry;
@@ -157,6 +158,7 @@ retry:
ret = 0;
err:
ret = bch2_trans_exit(&trans) ?: ret;
+ bch2_bkey_buf_exit(&k, c);
BUG_ON(ret == -EINTR);
diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c
index fbeaa3b67326..75b7046d6042 100644
--- a/fs/bcachefs/move.c
+++ b/fs/bcachefs/move.c
@@ -2,7 +2,7 @@
#include "bcachefs.h"
#include "alloc_foreground.h"
-#include "bkey_on_stack.h"
+#include "bkey_buf.h"
#include "btree_gc.h"
#include "btree_update.h"
#include "btree_update_interior.h"
@@ -61,8 +61,13 @@ static int bch2_migrate_index_update(struct bch_write_op *op)
struct migrate_write *m =
container_of(op, struct migrate_write, op);
struct keylist *keys = &op->insert_keys;
+ struct bkey_buf _new, _insert;
int ret = 0;
+ bch2_bkey_buf_init(&_new);
+ bch2_bkey_buf_init(&_insert);
+ bch2_bkey_buf_realloc(&_insert, c, U8_MAX);
+
bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
iter = bch2_trans_get_iter(&trans, m->btree_id,
@@ -73,21 +78,18 @@ static int bch2_migrate_index_update(struct bch_write_op *op)
struct bkey_s_c k;
struct bkey_i *insert;
struct bkey_i_extent *new;
- BKEY_PADDED(k) _new, _insert;
const union bch_extent_entry *entry;
struct extent_ptr_decoded p;
bool did_work = false;
- int nr;
+ bool extending = false, should_check_enospc;
+ s64 i_sectors_delta = 0, disk_sectors_delta = 0;
bch2_trans_reset(&trans, 0);
k = bch2_btree_iter_peek_slot(iter);
ret = bkey_err(k);
- if (ret) {
- if (ret == -EINTR)
- continue;
- break;
- }
+ if (ret)
+ goto err;
new = bkey_i_to_extent(bch2_keylist_front(keys));
@@ -95,11 +97,11 @@ static int bch2_migrate_index_update(struct bch_write_op *op)
!bch2_bkey_matches_ptr(c, k, m->ptr, m->offset))
goto nomatch;
- bkey_reassemble(&_insert.k, k);
- insert = &_insert.k;
+ bkey_reassemble(_insert.k, k);
+ insert = _insert.k;
- bkey_copy(&_new.k, bch2_keylist_front(keys));
- new = bkey_i_to_extent(&_new.k);
+ bch2_bkey_buf_copy(&_new, c, bch2_keylist_front(keys));
+ new = bkey_i_to_extent(_new.k);
bch2_cut_front(iter->pos, &new->k_i);
bch2_cut_front(iter->pos, insert);
@@ -144,23 +146,21 @@ static int bch2_migrate_index_update(struct bch_write_op *op)
op->opts.background_target,
op->opts.data_replicas);
- /*
- * If we're not fully overwriting @k, and it's compressed, we
- * need a reservation for all the pointers in @insert
- */
- nr = bch2_bkey_nr_ptrs_allocated(bkey_i_to_s_c(insert)) -
- m->nr_ptrs_reserved;
+ ret = bch2_sum_sector_overwrites(&trans, iter, insert,
+ &extending,
+ &should_check_enospc,
+ &i_sectors_delta,
+ &disk_sectors_delta);
+ if (ret)
+ goto err;
- if (insert->k.size < k.k->size &&
- bch2_bkey_sectors_compressed(k) &&
- nr > 0) {
+ if (disk_sectors_delta > (s64) op->res.sectors) {
ret = bch2_disk_reservation_add(c, &op->res,
- keylist_sectors(keys) * nr, 0);
+ disk_sectors_delta - op->res.sectors,
+ !should_check_enospc
+ ? BCH_DISK_RESERVATION_NOFAIL : 0);
if (ret)
goto out;
-
- m->nr_ptrs_reserved += nr;
- goto next;
}
bch2_trans_update(&trans, iter, insert, 0);
@@ -168,8 +168,8 @@ static int bch2_migrate_index_update(struct bch_write_op *op)
ret = bch2_trans_commit(&trans, &op->res,
op_journal_seq(op),
BTREE_INSERT_NOFAIL|
- BTREE_INSERT_USE_RESERVE|
m->data_opts.btree_insert_flags);
+err:
if (!ret)
atomic_long_inc(&c->extent_migrate_done);
if (ret == -EINTR)
@@ -197,6 +197,8 @@ nomatch:
}
out:
bch2_trans_exit(&trans);
+ bch2_bkey_buf_exit(&_insert, c);
+ bch2_bkey_buf_exit(&_new, c);
BUG_ON(ret == -EINTR);
return ret;
}
@@ -326,12 +328,12 @@ static void move_free(struct closure *cl)
{
struct moving_io *io = container_of(cl, struct moving_io, cl);
struct moving_context *ctxt = io->write.ctxt;
+ struct bvec_iter_all iter;
struct bio_vec *bv;
- unsigned i;
bch2_disk_reservation_put(io->write.op.c, &io->write.op.res);
- bio_for_each_segment_all(bv, &io->write.op.wbio.bio, i)
+ bio_for_each_segment_all(bv, &io->write.op.wbio.bio, iter)
if (bv->bv_page)
__free_page(bv->bv_page);
@@ -516,7 +518,7 @@ static int __bch2_move_data(struct bch_fs *c,
{
bool kthread = (current->flags & PF_KTHREAD) != 0;
struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts);
- struct bkey_on_stack sk;
+ struct bkey_buf sk;
struct btree_trans trans;
struct btree_iter *iter;
struct bkey_s_c k;
@@ -525,12 +527,12 @@ static int __bch2_move_data(struct bch_fs *c,
u64 delay, cur_inum = U64_MAX;
int ret = 0, ret2;
- bkey_on_stack_init(&sk);
+ bch2_bkey_buf_init(&sk);
bch2_trans_init(&trans, c, 0, 0);
stats->data_type = BCH_DATA_user;
stats->btree_id = btree_id;
- stats->pos = POS_MIN;
+ stats->pos = start;
iter = bch2_trans_get_iter(&trans, btree_id, start,
BTREE_ITER_PREFETCH);
@@ -605,13 +607,19 @@ peek:
}
/* unlock before doing IO: */
- bkey_on_stack_reassemble(&sk, c, k);
+ bch2_bkey_buf_reassemble(&sk, c, k);
k = bkey_i_to_s_c(sk.k);
bch2_trans_unlock(&trans);
ret2 = bch2_move_extent(&trans, ctxt, wp, io_opts, btree_id, k,
data_cmd, data_opts);
if (ret2) {
+ if (ret2 == -EINTR) {
+ bch2_trans_reset(&trans, 0);
+ bch2_trans_cond_resched(&trans);
+ continue;
+ }
+
if (ret2 == -ENOMEM) {
/* memory allocation failure, wait for some IO to finish */
bch2_move_ctxt_wait_for_io(ctxt);
@@ -633,20 +641,21 @@ next_nondata:
}
out:
ret = bch2_trans_exit(&trans) ?: ret;
- bkey_on_stack_exit(&sk, c);
+ bch2_bkey_buf_exit(&sk, c);
return ret;
}
int bch2_move_data(struct bch_fs *c,
+ enum btree_id start_btree_id, struct bpos start_pos,
+ enum btree_id end_btree_id, struct bpos end_pos,
struct bch_ratelimit *rate,
struct write_point_specifier wp,
- struct bpos start,
- struct bpos end,
move_pred_fn pred, void *arg,
struct bch_move_stats *stats)
{
struct moving_context ctxt = { .stats = stats };
+ enum btree_id id;
int ret;
closure_init_stack(&ctxt.cl);
@@ -655,10 +664,23 @@ int bch2_move_data(struct bch_fs *c,
stats->data_type = BCH_DATA_user;
- ret = __bch2_move_data(c, &ctxt, rate, wp, start, end,
- pred, arg, stats, BTREE_ID_EXTENTS) ?:
- __bch2_move_data(c, &ctxt, rate, wp, start, end,
- pred, arg, stats, BTREE_ID_REFLINK);
+ for (id = start_btree_id;
+ id <= min_t(unsigned, end_btree_id, BTREE_ID_NR - 1);
+ id++) {
+ stats->btree_id = id;
+
+ if (id != BTREE_ID_EXTENTS &&
+ id != BTREE_ID_REFLINK)
+ continue;
+
+ ret = __bch2_move_data(c, &ctxt, rate, wp,
+ id == start_btree_id ? start_pos : POS_MIN,
+ id == end_btree_id ? end_pos : POS_MAX,
+ pred, arg, stats, id);
+ if (ret)
+ break;
+ }
+
move_ctxt_wait_event(&ctxt, list_empty(&ctxt.reads));
closure_sync(&ctxt.cl);
@@ -672,16 +694,22 @@ int bch2_move_data(struct bch_fs *c,
return ret;
}
+typedef enum data_cmd (*move_btree_pred)(struct bch_fs *, void *,
+ struct btree *, struct bch_io_opts *,
+ struct data_opts *);
+
static int bch2_move_btree(struct bch_fs *c,
- move_pred_fn pred,
- void *arg,
+ enum btree_id start_btree_id, struct bpos start_pos,
+ enum btree_id end_btree_id, struct bpos end_pos,
+ move_btree_pred pred, void *arg,
struct bch_move_stats *stats)
{
+ bool kthread = (current->flags & PF_KTHREAD) != 0;
struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts);
struct btree_trans trans;
struct btree_iter *iter;
struct btree *b;
- unsigned id;
+ enum btree_id id;
struct data_opts data_opts;
enum data_cmd cmd;
int ret = 0;
@@ -690,16 +718,24 @@ static int bch2_move_btree(struct bch_fs *c,
stats->data_type = BCH_DATA_btree;
- for (id = 0; id < BTREE_ID_NR; id++) {
+ for (id = start_btree_id;
+ id <= min_t(unsigned, end_btree_id, BTREE_ID_NR - 1);
+ id++) {
stats->btree_id = id;
- for_each_btree_node(&trans, iter, id, POS_MIN,
+ for_each_btree_node(&trans, iter, id,
+ id == start_btree_id ? start_pos : POS_MIN,
BTREE_ITER_PREFETCH, b) {
+ if (kthread && kthread_should_stop())
+ goto out;
+
+ if ((cmp_int(id, end_btree_id) ?:
+ bkey_cmp(b->key.k.p, end_pos)) > 0)
+ break;
+
stats->pos = iter->pos;
- switch ((cmd = pred(c, arg,
- bkey_i_to_s_c(&b->key),
- &io_opts, &data_opts))) {
+ switch ((cmd = pred(c, arg, b, &io_opts, &data_opts))) {
case DATA_SKIP:
goto next;
case DATA_SCRUB:
@@ -719,9 +755,12 @@ next:
ret = bch2_trans_iter_free(&trans, iter) ?: ret;
}
-
+out:
bch2_trans_exit(&trans);
+ if (ret)
+ bch_err(c, "error %i in bch2_move_btree", ret);
+
return ret;
}
@@ -778,6 +817,83 @@ static enum data_cmd migrate_pred(struct bch_fs *c, void *arg,
return DATA_REWRITE;
}
+static enum data_cmd rereplicate_btree_pred(struct bch_fs *c, void *arg,
+ struct btree *b,
+ struct bch_io_opts *io_opts,
+ struct data_opts *data_opts)
+{
+ return rereplicate_pred(c, arg, bkey_i_to_s_c(&b->key), io_opts, data_opts);
+}
+
+static enum data_cmd migrate_btree_pred(struct bch_fs *c, void *arg,
+ struct btree *b,
+ struct bch_io_opts *io_opts,
+ struct data_opts *data_opts)
+{
+ return migrate_pred(c, arg, bkey_i_to_s_c(&b->key), io_opts, data_opts);
+}
+
+static bool bformat_needs_redo(struct bkey_format *f)
+{
+ unsigned i;
+
+ for (i = 0; i < f->nr_fields; i++) {
+ unsigned unpacked_bits = bch2_bkey_format_current.bits_per_field[i];
+ u64 unpacked_mask = ~((~0ULL << 1) << (unpacked_bits - 1));
+ u64 field_offset = le64_to_cpu(f->field_offset[i]);
+
+ if (f->bits_per_field[i] > unpacked_bits)
+ return true;
+
+ if ((f->bits_per_field[i] == unpacked_bits) && field_offset)
+ return true;
+
+ if (((field_offset + ((1ULL << f->bits_per_field[i]) - 1)) &
+ unpacked_mask) <
+ field_offset)
+ return true;
+ }
+
+ return false;
+}
+
+static enum data_cmd rewrite_old_nodes_pred(struct bch_fs *c, void *arg,
+ struct btree *b,
+ struct bch_io_opts *io_opts,
+ struct data_opts *data_opts)
+{
+ if (b->version_ondisk != c->sb.version ||
+ btree_node_need_rewrite(b) ||
+ bformat_needs_redo(&b->format)) {
+ data_opts->target = 0;
+ data_opts->nr_replicas = 1;
+ data_opts->btree_insert_flags = 0;
+ return DATA_REWRITE;
+ }
+
+ return DATA_SKIP;
+}
+
+int bch2_scan_old_btree_nodes(struct bch_fs *c, struct bch_move_stats *stats)
+{
+ int ret;
+
+ ret = bch2_move_btree(c,
+ 0, POS_MIN,
+ BTREE_ID_NR, POS_MAX,
+ rewrite_old_nodes_pred, c, stats);
+ if (!ret) {
+ mutex_lock(&c->sb_lock);
+ c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_extents_above_btree_updates_done;
+ c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_bformat_overflow_done;
+ c->disk_sb.sb->version_min = c->disk_sb.sb->version;
+ bch2_write_super(c);
+ mutex_unlock(&c->sb_lock);
+ }
+
+ return ret;
+}
+
int bch2_data_job(struct bch_fs *c,
struct bch_move_stats *stats,
struct bch_ioctl_data op)
@@ -789,17 +905,20 @@ int bch2_data_job(struct bch_fs *c,
stats->data_type = BCH_DATA_journal;
ret = bch2_journal_flush_device_pins(&c->journal, -1);
- ret = bch2_move_btree(c, rereplicate_pred, c, stats) ?: ret;
+ ret = bch2_move_btree(c,
+ op.start_btree, op.start_pos,
+ op.end_btree, op.end_pos,
+ rereplicate_btree_pred, c, stats) ?: ret;
closure_wait_event(&c->btree_interior_update_wait,
!bch2_btree_interior_updates_nr_pending(c));
ret = bch2_replicas_gc2(c) ?: ret;
- ret = bch2_move_data(c, NULL,
- writepoint_hashed((unsigned long) current),
- op.start,
- op.end,
+ ret = bch2_move_data(c,
+ op.start_btree, op.start_pos,
+ op.end_btree, op.end_pos,
+ NULL, writepoint_hashed((unsigned long) current),
rereplicate_pred, c, stats) ?: ret;
ret = bch2_replicas_gc2(c) ?: ret;
break;
@@ -810,16 +929,22 @@ int bch2_data_job(struct bch_fs *c,
stats->data_type = BCH_DATA_journal;
ret = bch2_journal_flush_device_pins(&c->journal, op.migrate.dev);
- ret = bch2_move_btree(c, migrate_pred, &op, stats) ?: ret;
+ ret = bch2_move_btree(c,
+ op.start_btree, op.start_pos,
+ op.end_btree, op.end_pos,
+ migrate_btree_pred, &op, stats) ?: ret;
ret = bch2_replicas_gc2(c) ?: ret;
- ret = bch2_move_data(c, NULL,
- writepoint_hashed((unsigned long) current),
- op.start,
- op.end,
+ ret = bch2_move_data(c,
+ op.start_btree, op.start_pos,
+ op.end_btree, op.end_pos,
+ NULL, writepoint_hashed((unsigned long) current),
migrate_pred, &op, stats) ?: ret;
ret = bch2_replicas_gc2(c) ?: ret;
break;
+ case BCH_DATA_OP_REWRITE_OLD_NODES:
+ ret = bch2_scan_old_btree_nodes(c, stats);
+ break;
default:
ret = -EINVAL;
}
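bformat_needs_redo() above decides whether a node written with an old key format could produce values that overflow when unpacked; the subtle case is the last check, where field_offset plus the largest packable value wraps around within the unpacked field's width. A standalone sketch of that test for a single field, assuming 1 <= unpacked_bits <= 64 (names are illustrative; the real code loops over f->nr_fields and takes the unpacked widths from bch2_bkey_format_current):

#include <stdbool.h>
#include <stdint.h>

static bool field_can_overflow(unsigned packed_bits, unsigned unpacked_bits,
			       uint64_t field_offset)
{
	/* low @unpacked_bits bits, written so unpacked_bits == 64 avoids UB */
	uint64_t unpacked_mask = ~((~0ULL << 1) << (unpacked_bits - 1));
	uint64_t max_packed;

	if (packed_bits > unpacked_bits)
		return true;
	if (packed_bits == unpacked_bits)
		return field_offset != 0;

	/* packed_bits < unpacked_bits <= 64 here, so the shift is safe */
	max_packed = (1ULL << packed_bits) - 1;

	/* does offset + largest packed value wrap within the unpacked width? */
	return ((field_offset + max_packed) & unpacked_mask) < field_offset;
}

A format that trips any of these checks makes rewrite_old_nodes_pred() return DATA_REWRITE, so bch2_scan_old_btree_nodes() rewrites the node before setting the compat bits.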
diff --git a/fs/bcachefs/move.h b/fs/bcachefs/move.h
index b04bc669226d..5076153689d1 100644
--- a/fs/bcachefs/move.h
+++ b/fs/bcachefs/move.h
@@ -52,9 +52,13 @@ typedef enum data_cmd (*move_pred_fn)(struct bch_fs *, void *,
struct bkey_s_c,
struct bch_io_opts *, struct data_opts *);
-int bch2_move_data(struct bch_fs *, struct bch_ratelimit *,
+int bch2_scan_old_btree_nodes(struct bch_fs *, struct bch_move_stats *);
+
+int bch2_move_data(struct bch_fs *,
+ enum btree_id, struct bpos,
+ enum btree_id, struct bpos,
+ struct bch_ratelimit *,
struct write_point_specifier,
- struct bpos, struct bpos,
move_pred_fn, void *,
struct bch_move_stats *);
diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c
index 2c5daed58aca..03668e481f7a 100644
--- a/fs/bcachefs/movinggc.c
+++ b/fs/bcachefs/movinggc.c
@@ -92,11 +92,8 @@ static enum data_cmd copygc_pred(struct bch_fs *c, void *arg,
data_opts->btree_insert_flags = BTREE_INSERT_USE_RESERVE;
data_opts->rewrite_dev = p.ptr.dev;
- if (p.has_ec) {
- struct stripe *m = genradix_ptr(&c->stripes[0], p.ec.idx);
-
- data_opts->nr_replicas += m->nr_redundant;
- }
+ if (p.has_ec)
+ data_opts->nr_replicas += p.ec.redundancy;
return DATA_REWRITE;
}
@@ -179,12 +176,12 @@ static int bch2_copygc(struct bch_fs *c)
bucket_sectors_used(m) >= ca->mi.bucket_size)
continue;
- WARN_ON(m.stripe && !g->ec_redundancy);
+ WARN_ON(m.stripe && !g->stripe_redundancy);
e = (struct copygc_heap_entry) {
.dev = dev_idx,
.gen = m.gen,
- .replicas = 1 + g->ec_redundancy,
+ .replicas = 1 + g->stripe_redundancy,
.fragmentation = bucket_sectors_used(m) * (1U << 15)
/ ca->mi.bucket_size,
.sectors = bucket_sectors_used(m),
@@ -200,6 +197,11 @@ static int bch2_copygc(struct bch_fs *c)
return -1;
}
+ /*
+ * Our btree node allocations also come out of RESERVE_MOVINGGC:
+ */
+ sectors_to_move = (sectors_to_move * 3) / 4;
+
for (i = h->data; i < h->data + h->used; i++)
sectors_to_move += i->sectors * i->replicas;
@@ -217,9 +219,11 @@ static int bch2_copygc(struct bch_fs *c)
sizeof(h->data[0]),
bucket_offset_cmp, NULL);
- ret = bch2_move_data(c, &c->copygc_pd.rate,
+ ret = bch2_move_data(c,
+ 0, POS_MIN,
+ BTREE_ID_NR, POS_MAX,
+ &c->copygc_pd.rate,
writepoint_ptr(&c->copygc_write_point),
- POS_MIN, POS_MAX,
copygc_pred, NULL,
&move_stats);
@@ -286,7 +290,7 @@ unsigned long bch2_copygc_wait_amount(struct bch_fs *c)
fragmented_allowed += ((__dev_buckets_available(ca, usage) *
ca->mi.bucket_size) >> 1);
- fragmented += usage.sectors_fragmented;
+ fragmented += usage.d[BCH_DATA_user].fragmented;
}
return max_t(s64, 0, fragmented_allowed - fragmented);
@@ -296,7 +300,7 @@ static int bch2_copygc_thread(void *arg)
{
struct bch_fs *c = arg;
struct io_clock *clock = &c->io_clock[WRITE];
- unsigned long last, wait;
+ u64 last, wait;
set_freezable();
@@ -304,7 +308,7 @@ static int bch2_copygc_thread(void *arg)
if (kthread_wait_freezable(c->copy_gc_enabled))
break;
- last = atomic_long_read(&clock->now);
+ last = atomic64_read(&clock->now);
wait = bch2_copygc_wait_amount(c);
if (wait > clock->max_slop) {
@@ -346,8 +350,10 @@ int bch2_copygc_start(struct bch_fs *c)
return -ENOMEM;
t = kthread_create(bch2_copygc_thread, c, "bch-copygc/%s", c->name);
- if (IS_ERR(t))
+ if (IS_ERR(t)) {
+ bch_err(c, "error creating copygc thread: %li", PTR_ERR(t));
return PTR_ERR(t);
+ }
get_task_struct(t);
diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c
index 97a36ac0beea..d53b6dccd161 100644
--- a/fs/bcachefs/opts.c
+++ b/fs/bcachefs/opts.c
@@ -23,6 +23,13 @@ const char * const bch2_sb_features[] = {
NULL
};
+const char * const bch2_sb_compat[] = {
+#define x(f, n) #f,
+ BCH_SB_COMPAT()
+#undef x
+ NULL
+};
+
const char * const bch2_csum_opts[] = {
"none",
"crc32c",
diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h
index 710a7ee67039..7ce2b3adb8d7 100644
--- a/fs/bcachefs/opts.h
+++ b/fs/bcachefs/opts.h
@@ -10,6 +10,7 @@
extern const char * const bch2_error_actions[];
extern const char * const bch2_sb_features[];
+extern const char * const bch2_sb_compat[];
extern const char * const bch2_csum_opts[];
extern const char * const bch2_compression_opts[];
extern const char * const bch2_str_hash_types[];
@@ -136,6 +137,11 @@ enum opt_type {
OPT_STR(bch2_str_hash_types), \
BCH_SB_STR_HASH_TYPE, BCH_STR_HASH_OPT_SIPHASH, \
NULL, "Hash function for directory entries and xattrs")\
+ x(metadata_target, u16, \
+ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE, \
+ OPT_FN(bch2_opt_target), \
+ BCH_SB_METADATA_TARGET, 0, \
+ "(target)", "Device or disk group for metadata writes") \
x(foreground_target, u16, \
OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE, \
OPT_FN(bch2_opt_target), \
@@ -217,6 +223,11 @@ enum opt_type {
OPT_BOOL(), \
NO_SB_OPT, false, \
NULL, "Allow mounting in degraded mode") \
+ x(very_degraded, u8, \
+ OPT_MOUNT, \
+ OPT_BOOL(), \
+ NO_SB_OPT, false, \
+ NULL, "Allow mounting when data will be missing") \
x(discard, u8, \
OPT_MOUNT|OPT_DEVICE, \
OPT_BOOL(), \
diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c
index c3373c48fa81..aa9bbdbfa65e 100644
--- a/fs/bcachefs/rebalance.c
+++ b/fs/bcachefs/rebalance.c
@@ -169,12 +169,12 @@ static int bch2_rebalance_thread(void *arg)
unsigned long start, prev_start;
unsigned long prev_run_time, prev_run_cputime;
unsigned long cputime, prev_cputime;
- unsigned long io_start;
+ u64 io_start;
long throttle;
set_freezable();
- io_start = atomic_long_read(&clock->now);
+ io_start = atomic64_read(&clock->now);
p = rebalance_work(c);
prev_start = jiffies;
prev_cputime = curr_cputime();
@@ -210,7 +210,7 @@ static int bch2_rebalance_thread(void *arg)
(20 - w.dev_most_full_percent),
50);
- if (atomic_long_read(&clock->now) + clock->max_slop <
+ if (atomic64_read(&clock->now) + clock->max_slop <
r->throttled_until_iotime) {
r->throttled_until_cputime = start + throttle;
r->state = REBALANCE_THROTTLED;
@@ -229,7 +229,7 @@ static int bch2_rebalance_thread(void *arg)
max(p.dev_most_full_percent, 1U) /
max(w.dev_most_full_percent, 1U));
- io_start = atomic_long_read(&clock->now);
+ io_start = atomic64_read(&clock->now);
p = w;
prev_start = start;
prev_cputime = cputime;
@@ -239,10 +239,11 @@ static int bch2_rebalance_thread(void *arg)
rebalance_work_reset(c);
bch2_move_data(c,
+ 0, POS_MIN,
+ BTREE_ID_NR, POS_MAX,
/* ratelimiting disabled for now */
NULL, /* &r->pd.rate, */
writepoint_ptr(&c->rebalance_write_point),
- POS_MIN, POS_MAX,
rebalance_pred, NULL,
&r->move_stats);
}
@@ -274,7 +275,7 @@ void bch2_rebalance_work_to_text(struct printbuf *out, struct bch_fs *c)
case REBALANCE_THROTTLED:
bch2_hprint(&PBUF(h1),
(r->throttled_until_iotime -
- atomic_long_read(&c->io_clock[WRITE].now)) << 9);
+ atomic64_read(&c->io_clock[WRITE].now)) << 9);
pr_buf(out, "throttled for %lu sec or %s io\n",
(r->throttled_until_cputime - jiffies) / HZ,
h1);
@@ -311,12 +312,17 @@ int bch2_rebalance_start(struct bch_fs *c)
{
struct task_struct *p;
+ if (c->rebalance.thread)
+ return 0;
+
if (c->opts.nochanges)
return 0;
p = kthread_create(bch2_rebalance_thread, c, "bch-rebalance/%s", c->name);
- if (IS_ERR(p))
+ if (IS_ERR(p)) {
+ bch_err(c, "error creating rebalance thread: %li", PTR_ERR(p));
return PTR_ERR(p);
+ }
get_task_struct(p);
rcu_assign_pointer(c->rebalance.thread, p);
diff --git a/fs/bcachefs/rebalance_types.h b/fs/bcachefs/rebalance_types.h
index 192c6be20ced..2f62a643c39f 100644
--- a/fs/bcachefs/rebalance_types.h
+++ b/fs/bcachefs/rebalance_types.h
@@ -17,7 +17,7 @@ struct bch_fs_rebalance {
atomic64_t work_unknown_dev;
enum rebalance_state state;
- unsigned long throttled_until_iotime;
+ u64 throttled_until_iotime;
unsigned long throttled_until_cputime;
struct bch_move_stats move_stats;
diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c
index 1883a1faf380..c42919277c72 100644
--- a/fs/bcachefs/recovery.c
+++ b/fs/bcachefs/recovery.c
@@ -1,6 +1,7 @@
// SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h"
+#include "bkey_buf.h"
#include "alloc_background.h"
#include "btree_gc.h"
#include "btree_update.h"
@@ -15,6 +16,7 @@
#include "journal_io.h"
#include "journal_reclaim.h"
#include "journal_seq_blacklist.h"
+#include "move.h"
#include "quota.h"
#include "recovery.h"
#include "replicas.h"
@@ -39,78 +41,174 @@ static void drop_alloc_keys(struct journal_keys *keys)
/* iterate over keys read from the journal: */
-static struct journal_key *journal_key_search(struct journal_keys *journal_keys,
- enum btree_id id, unsigned level,
- struct bpos pos)
+static int __journal_key_cmp(enum btree_id l_btree_id,
+ unsigned l_level,
+ struct bpos l_pos,
+ struct journal_key *r)
+{
+ return (cmp_int(l_btree_id, r->btree_id) ?:
+ cmp_int(l_level, r->level) ?:
+ bkey_cmp(l_pos, r->k->k.p));
+}
+
+static int journal_key_cmp(struct journal_key *l, struct journal_key *r)
+{
+ return (cmp_int(l->btree_id, r->btree_id) ?:
+ cmp_int(l->level, r->level) ?:
+ bkey_cmp(l->k->k.p, r->k->k.p));
+}
+
+static size_t journal_key_search(struct journal_keys *journal_keys,
+ enum btree_id id, unsigned level,
+ struct bpos pos)
{
size_t l = 0, r = journal_keys->nr, m;
while (l < r) {
m = l + ((r - l) >> 1);
- if ((cmp_int(id, journal_keys->d[m].btree_id) ?:
- cmp_int(level, journal_keys->d[m].level) ?:
- bkey_cmp(pos, journal_keys->d[m].k->k.p)) > 0)
+ if (__journal_key_cmp(id, level, pos, &journal_keys->d[m]) > 0)
l = m + 1;
else
r = m;
}
BUG_ON(l < journal_keys->nr &&
- (cmp_int(id, journal_keys->d[l].btree_id) ?:
- cmp_int(level, journal_keys->d[l].level) ?:
- bkey_cmp(pos, journal_keys->d[l].k->k.p)) > 0);
+ __journal_key_cmp(id, level, pos, &journal_keys->d[l]) > 0);
BUG_ON(l &&
- (cmp_int(id, journal_keys->d[l - 1].btree_id) ?:
- cmp_int(level, journal_keys->d[l - 1].level) ?:
- bkey_cmp(pos, journal_keys->d[l - 1].k->k.p)) <= 0);
+ __journal_key_cmp(id, level, pos, &journal_keys->d[l - 1]) <= 0);
+
+ return l;
+}
+
+static void journal_iter_fix(struct bch_fs *c, struct journal_iter *iter, unsigned idx)
+{
+ struct bkey_i *n = iter->keys->d[idx].k;
+ struct btree_and_journal_iter *biter =
+ container_of(iter, struct btree_and_journal_iter, journal);
+
+ if (iter->idx > idx ||
+ (iter->idx == idx &&
+ biter->last &&
+ bkey_cmp(n->k.p, biter->unpacked.p) <= 0))
+ iter->idx++;
+}
+
+int bch2_journal_key_insert(struct bch_fs *c, enum btree_id id,
+ unsigned level, struct bkey_i *k)
+{
+ struct journal_key n = {
+ .btree_id = id,
+ .level = level,
+ .k = k,
+ .allocated = true
+ };
+ struct journal_keys *keys = &c->journal_keys;
+ struct journal_iter *iter;
+ unsigned idx = journal_key_search(keys, id, level, k->k.p);
+
+ if (idx < keys->nr &&
+ journal_key_cmp(&n, &keys->d[idx]) == 0) {
+ if (keys->d[idx].allocated)
+ kfree(keys->d[idx].k);
+ keys->d[idx] = n;
+ return 0;
+ }
+
+ if (keys->nr == keys->size) {
+ struct journal_keys new_keys = {
+ .nr = keys->nr,
+ .size = keys->size * 2,
+ .journal_seq_base = keys->journal_seq_base,
+ };
+
+ new_keys.d = kvmalloc(sizeof(new_keys.d[0]) * new_keys.size, GFP_KERNEL);
+ if (!new_keys.d) {
+ bch_err(c, "%s: error allocating new key array (size %zu)",
+ __func__, new_keys.size);
+ return -ENOMEM;
+ }
- return l < journal_keys->nr ? journal_keys->d + l : NULL;
+ memcpy(new_keys.d, keys->d, sizeof(keys->d[0]) * keys->nr);
+ kvfree(keys->d);
+ *keys = new_keys;
+ }
+
+ array_insert_item(keys->d, keys->nr, idx, n);
+
+ list_for_each_entry(iter, &c->journal_iters, list)
+ journal_iter_fix(c, iter, idx);
+
+ return 0;
+}
+
+int bch2_journal_key_delete(struct bch_fs *c, enum btree_id id,
+ unsigned level, struct bpos pos)
+{
+ struct bkey_i *whiteout =
+ kmalloc(sizeof(struct bkey), GFP_KERNEL);
+ int ret;
+
+ if (!whiteout) {
+ bch_err(c, "%s: error allocating new key", __func__);
+ return -ENOMEM;
+ }
+
+ bkey_init(&whiteout->k);
+ whiteout->k.p = pos;
+
+ ret = bch2_journal_key_insert(c, id, level, whiteout);
+ if (ret)
+ kfree(whiteout);
+ return ret;
}
static struct bkey_i *bch2_journal_iter_peek(struct journal_iter *iter)
{
- if (iter->k &&
- iter->k < iter->keys->d + iter->keys->nr &&
- iter->k->btree_id == iter->btree_id &&
- iter->k->level == iter->level)
- return iter->k->k;
+ struct journal_key *k = iter->idx - iter->keys->nr
+ ? iter->keys->d + iter->idx : NULL;
+
+ if (k &&
+ k->btree_id == iter->btree_id &&
+ k->level == iter->level)
+ return k->k;
- iter->k = NULL;
+ iter->idx = iter->keys->nr;
return NULL;
}
static void bch2_journal_iter_advance(struct journal_iter *iter)
{
- if (iter->k)
- iter->k++;
+ if (iter->idx < iter->keys->nr)
+ iter->idx++;
+}
+
+static void bch2_journal_iter_exit(struct journal_iter *iter)
+{
+ list_del(&iter->list);
}
-static void bch2_journal_iter_init(struct journal_iter *iter,
- struct journal_keys *journal_keys,
+static void bch2_journal_iter_init(struct bch_fs *c,
+ struct journal_iter *iter,
enum btree_id id, unsigned level,
struct bpos pos)
{
iter->btree_id = id;
iter->level = level;
- iter->keys = journal_keys;
- iter->k = journal_key_search(journal_keys, id, level, pos);
+ iter->keys = &c->journal_keys;
+ iter->idx = journal_key_search(&c->journal_keys, id, level, pos);
+ list_add(&iter->list, &c->journal_iters);
}
static struct bkey_s_c bch2_journal_iter_peek_btree(struct btree_and_journal_iter *iter)
{
- return iter->btree
- ? bch2_btree_iter_peek(iter->btree)
- : bch2_btree_node_iter_peek_unpack(&iter->node_iter,
- iter->b, &iter->unpacked);
+ return bch2_btree_node_iter_peek_unpack(&iter->node_iter,
+ iter->b, &iter->unpacked);
}
static void bch2_journal_iter_advance_btree(struct btree_and_journal_iter *iter)
{
- if (iter->btree)
- bch2_btree_iter_next(iter->btree);
- else
- bch2_btree_node_iter_advance(&iter->node_iter, iter->b);
+ bch2_btree_node_iter_advance(&iter->node_iter, iter->b);
}
void bch2_btree_and_journal_iter_advance(struct btree_and_journal_iter *iter)
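journal_key_search() in the recovery.c hunk above is a standard lower_bound: it returns the first index whose (btree_id, level, pos) compares >= the search key, or keys->nr if there is none, which is also the slot bch2_journal_key_insert() hands to array_insert_item(). The same loop over a plain sorted u64 array, for reference (illustrative helper, not a bcachefs function):

#include <stddef.h>
#include <stdint.h>

/* first index i with d[i] >= key, or nr if every element is smaller */
static size_t lower_bound_u64(const uint64_t *d, size_t nr, uint64_t key)
{
	size_t l = 0, r = nr, m;

	while (l < r) {
		m = l + ((r - l) >> 1);
		if (d[m] < key)		/* __journal_key_cmp(...) > 0 above */
			l = m + 1;
		else
			r = m;
	}
	return l;
}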
@@ -159,7 +257,7 @@ struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *
if (iter->b &&
bkey_cmp(ret.k->p, iter->b->data->max_key) > 0) {
- iter->journal.k = NULL;
+ iter->journal.idx = iter->journal.keys->nr;
iter->last = none;
return bkey_s_c_null;
}
@@ -180,31 +278,50 @@ struct bkey_s_c bch2_btree_and_journal_iter_next(struct btree_and_journal_iter *
return bch2_btree_and_journal_iter_peek(iter);
}
-void bch2_btree_and_journal_iter_init(struct btree_and_journal_iter *iter,
- struct btree_trans *trans,
- struct journal_keys *journal_keys,
- enum btree_id id, struct bpos pos)
+void bch2_btree_and_journal_iter_exit(struct btree_and_journal_iter *iter)
{
- memset(iter, 0, sizeof(*iter));
-
- iter->btree = bch2_trans_get_iter(trans, id, pos, BTREE_ITER_PREFETCH);
- bch2_journal_iter_init(&iter->journal, journal_keys, id, 0, pos);
+ bch2_journal_iter_exit(&iter->journal);
}
void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *iter,
- struct journal_keys *journal_keys,
+ struct bch_fs *c,
struct btree *b)
{
memset(iter, 0, sizeof(*iter));
iter->b = b;
bch2_btree_node_iter_init_from_start(&iter->node_iter, iter->b);
- bch2_journal_iter_init(&iter->journal, journal_keys,
+ bch2_journal_iter_init(c, &iter->journal,
b->c.btree_id, b->c.level, b->data->min_key);
}
/* Walk btree, overlaying keys from the journal: */
+static void btree_and_journal_iter_prefetch(struct bch_fs *c, struct btree *b,
+ struct btree_and_journal_iter iter)
+{
+ unsigned i = 0, nr = b->c.level > 1 ? 2 : 16;
+ struct bkey_s_c k;
+ struct bkey_buf tmp;
+
+ BUG_ON(!b->c.level);
+
+ bch2_bkey_buf_init(&tmp);
+
+ while (i < nr &&
+ (k = bch2_btree_and_journal_iter_peek(&iter)).k) {
+ bch2_bkey_buf_reassemble(&tmp, c, k);
+
+ bch2_btree_node_prefetch(c, NULL, tmp.k,
+ b->c.btree_id, b->c.level - 1);
+
+ bch2_btree_and_journal_iter_advance(&iter);
+ i++;
+ }
+
+ bch2_bkey_buf_exit(&tmp, c);
+}
+
static int bch2_btree_and_journal_walk_recurse(struct bch_fs *c, struct btree *b,
struct journal_keys *journal_keys,
enum btree_id btree_id,
@@ -213,9 +330,12 @@ static int bch2_btree_and_journal_walk_recurse(struct bch_fs *c, struct btree *b
{
struct btree_and_journal_iter iter;
struct bkey_s_c k;
+ struct bkey_buf tmp;
+ struct btree *child;
int ret = 0;
- bch2_btree_and_journal_iter_init_node_iter(&iter, journal_keys, b);
+ bch2_bkey_buf_init(&tmp);
+ bch2_btree_and_journal_iter_init_node_iter(&iter, c, b);
while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) {
ret = key_fn(c, btree_id, b->c.level, k);
@@ -223,34 +343,34 @@ static int bch2_btree_and_journal_walk_recurse(struct bch_fs *c, struct btree *b
break;
if (b->c.level) {
- struct btree *child;
- BKEY_PADDED(k) tmp;
-
- bkey_reassemble(&tmp.k, k);
- k = bkey_i_to_s_c(&tmp.k);
+ bch2_bkey_buf_reassemble(&tmp, c, k);
bch2_btree_and_journal_iter_advance(&iter);
- if (b->c.level > 0) {
- child = bch2_btree_node_get_noiter(c, &tmp.k,
- b->c.btree_id, b->c.level - 1);
- ret = PTR_ERR_OR_ZERO(child);
- if (ret)
- break;
+ child = bch2_btree_node_get_noiter(c, tmp.k,
+ b->c.btree_id, b->c.level - 1,
+ false);
+
+ ret = PTR_ERR_OR_ZERO(child);
+ if (ret)
+ break;
- ret = (node_fn ? node_fn(c, b) : 0) ?:
- bch2_btree_and_journal_walk_recurse(c, child,
- journal_keys, btree_id, node_fn, key_fn);
- six_unlock_read(&child->c.lock);
+ btree_and_journal_iter_prefetch(c, b, iter);
- if (ret)
- break;
- }
+ ret = (node_fn ? node_fn(c, b) : 0) ?:
+ bch2_btree_and_journal_walk_recurse(c, child,
+ journal_keys, btree_id, node_fn, key_fn);
+ six_unlock_read(&child->c.lock);
+
+ if (ret)
+ break;
} else {
bch2_btree_and_journal_iter_advance(&iter);
}
}
+ bch2_btree_and_journal_iter_exit(&iter);
+ bch2_bkey_buf_exit(&tmp, c);
return ret;
}
@@ -306,6 +426,12 @@ static int journal_sort_key_cmp(const void *_l, const void *_r)
void bch2_journal_keys_free(struct journal_keys *keys)
{
+ struct journal_key *i;
+
+ for (i = keys->d; i < keys->d + keys->nr; i++)
+ if (i->allocated)
+ kfree(i->k);
+
kvfree(keys->d);
keys->d = NULL;
keys->nr = 0;
@@ -334,7 +460,9 @@ static struct journal_keys journal_keys_sort(struct list_head *journal_entries)
nr_keys++;
}
- keys.d = kvmalloc(sizeof(keys.d[0]) * nr_keys, GFP_KERNEL);
+ keys.size = roundup_pow_of_two(nr_keys);
+
+ keys.d = kvmalloc(sizeof(keys.d[0]) * keys.size, GFP_KERNEL);
if (!keys.d)
goto err;
@@ -384,115 +512,6 @@ static void replay_now_at(struct journal *j, u64 seq)
bch2_journal_pin_put(j, j->replay_journal_seq++);
}
-static int bch2_extent_replay_key(struct bch_fs *c, enum btree_id btree_id,
- struct bkey_i *k)
-{
- struct btree_trans trans;
- struct btree_iter *iter, *split_iter;
- /*
- * We might cause compressed extents to be split, so we need to pass in
- * a disk_reservation:
- */
- struct disk_reservation disk_res =
- bch2_disk_reservation_init(c, 0);
- struct bkey_i *split;
- struct bpos atomic_end;
- /*
- * Some extents aren't equivalent - w.r.t. what the triggers do
- * - if they're split:
- */
- bool remark_if_split = bch2_bkey_sectors_compressed(bkey_i_to_s_c(k)) ||
- k->k.type == KEY_TYPE_reflink_p;
- bool remark = false;
- int ret;
-
- bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
-retry:
- bch2_trans_begin(&trans);
-
- iter = bch2_trans_get_iter(&trans, btree_id,
- bkey_start_pos(&k->k),
- BTREE_ITER_INTENT);
-
- do {
- ret = bch2_btree_iter_traverse(iter);
- if (ret)
- goto err;
-
- atomic_end = bpos_min(k->k.p, iter->l[0].b->key.k.p);
-
- split = bch2_trans_kmalloc(&trans, bkey_bytes(&k->k));
- ret = PTR_ERR_OR_ZERO(split);
- if (ret)
- goto err;
-
- if (!remark &&
- remark_if_split &&
- bkey_cmp(atomic_end, k->k.p) < 0) {
- ret = bch2_disk_reservation_add(c, &disk_res,
- k->k.size *
- bch2_bkey_nr_ptrs_allocated(bkey_i_to_s_c(k)),
- BCH_DISK_RESERVATION_NOFAIL);
- BUG_ON(ret);
-
- remark = true;
- }
-
- bkey_copy(split, k);
- bch2_cut_front(iter->pos, split);
- bch2_cut_back(atomic_end, split);
-
- split_iter = bch2_trans_copy_iter(&trans, iter);
-
- /*
- * It's important that we don't go through the
- * extent_handle_overwrites() and extent_update_to_keys() path
- * here: journal replay is supposed to treat extents like
- * regular keys
- */
- __bch2_btree_iter_set_pos(split_iter, split->k.p, false);
- bch2_trans_update(&trans, split_iter, split,
- BTREE_TRIGGER_NORUN);
- bch2_trans_iter_put(&trans, split_iter);
-
- bch2_btree_iter_set_pos(iter, split->k.p);
-
- if (remark) {
- ret = bch2_trans_mark_key(&trans,
- bkey_s_c_null,
- bkey_i_to_s_c(split),
- 0, split->k.size,
- BTREE_TRIGGER_INSERT);
- if (ret)
- goto err;
- }
- } while (bkey_cmp(iter->pos, k->k.p) < 0);
-
- if (remark) {
- ret = bch2_trans_mark_key(&trans,
- bkey_i_to_s_c(k),
- bkey_s_c_null,
- 0, -((s64) k->k.size),
- BTREE_TRIGGER_OVERWRITE);
- if (ret)
- goto err;
- }
-
- ret = bch2_trans_commit(&trans, &disk_res, NULL,
- BTREE_INSERT_NOFAIL|
- BTREE_INSERT_LAZY_RW|
- BTREE_INSERT_JOURNAL_REPLAY);
-err:
- bch2_trans_iter_put(&trans, iter);
-
- if (ret == -EINTR)
- goto retry;
-
- bch2_disk_reservation_put(c, &disk_res);
-
- return bch2_trans_exit(&trans) ?: ret;
-}
-
static int __bch2_journal_replay_key(struct btree_trans *trans,
enum btree_id id, unsigned level,
struct bkey_i *k)
@@ -518,14 +537,16 @@ static int __bch2_journal_replay_key(struct btree_trans *trans,
return ret;
}
-static int bch2_journal_replay_key(struct bch_fs *c, enum btree_id id,
- unsigned level, struct bkey_i *k)
+static int bch2_journal_replay_key(struct bch_fs *c, struct journal_key *k)
{
- return bch2_trans_do(c, NULL, NULL,
- BTREE_INSERT_NOFAIL|
- BTREE_INSERT_LAZY_RW|
- BTREE_INSERT_JOURNAL_REPLAY,
- __bch2_journal_replay_key(&trans, id, level, k));
+ unsigned commit_flags = BTREE_INSERT_NOFAIL|
+ BTREE_INSERT_LAZY_RW;
+
+ if (!k->allocated)
+ commit_flags |= BTREE_INSERT_JOURNAL_REPLAY;
+
+ return bch2_trans_do(c, NULL, NULL, commit_flags,
+ __bch2_journal_replay_key(&trans, k->btree_id, k->level, k->k));
}
static int __bch2_alloc_replay_key(struct btree_trans *trans, struct bkey_i *k)
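
bch2_journal_replay_key() now derives its commit flags from whether the key was read from the journal or allocated during recovery: only keys that already exist in the journal get the replay flag that skips re-journalling. A small standalone sketch of that flag selection, with illustrative flag names rather than the real BTREE_INSERT_* constants:

#include <assert.h>
#include <stdbool.h>

enum {
	SKETCH_INSERT_NOFAIL		= 1 << 0,
	SKETCH_INSERT_LAZY_RW		= 1 << 1,
	SKETCH_INSERT_JOURNAL_REPLAY	= 1 << 2,	/* don't journal again */
};

/*
 * Keys read from the journal are already journalled; keys allocated by
 * recovery/fsck are not, so they must go through the normal commit path.
 */
static unsigned replay_commit_flags(bool key_allocated_by_recovery)
{
	unsigned flags = SKETCH_INSERT_NOFAIL|SKETCH_INSERT_LAZY_RW;

	if (!key_allocated_by_recovery)
		flags |= SKETCH_INSERT_JOURNAL_REPLAY;

	return flags;
}

int main(void)
{
	assert(replay_commit_flags(false) & SKETCH_INSERT_JOURNAL_REPLAY);
	assert(!(replay_commit_flags(true) & SKETCH_INSERT_JOURNAL_REPLAY));
	return 0;
}
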
@@ -601,7 +622,7 @@ static int bch2_journal_replay(struct bch_fs *c,
if (i->level) {
j->replay_journal_seq = keys.journal_seq_base + i->journal_seq;
- ret = bch2_journal_replay_key(c, i->btree_id, i->level, i->k);
+ ret = bch2_journal_replay_key(c, i);
if (ret)
goto err;
}
@@ -629,9 +650,7 @@ static int bch2_journal_replay(struct bch_fs *c,
replay_now_at(j, keys.journal_seq_base + i->journal_seq);
- ret = i->k->k.size
- ? bch2_extent_replay_key(c, i->btree_id, i->k)
- : bch2_journal_replay_key(c, i->btree_id, i->level, i->k);
+ ret = bch2_journal_replay_key(c, i);
if (ret)
goto err;
}
@@ -643,7 +662,8 @@ static int bch2_journal_replay(struct bch_fs *c,
bch2_journal_flush_all_pins(j);
return bch2_journal_error(j);
err:
- bch_err(c, "journal replay: error %d while replaying key", ret);
+ bch_err(c, "journal replay: error %d while replaying key at btree %s level %u",
+ ret, bch2_btree_ids[i->btree_id], i->level);
return ret;
}
@@ -700,10 +720,31 @@ static int journal_replay_entry_early(struct bch_fs *c,
case BCH_JSET_ENTRY_data_usage: {
struct jset_entry_data_usage *u =
container_of(entry, struct jset_entry_data_usage, entry);
+
ret = bch2_replicas_set_usage(c, &u->r,
le64_to_cpu(u->v));
break;
}
+ case BCH_JSET_ENTRY_dev_usage: {
+ struct jset_entry_dev_usage *u =
+ container_of(entry, struct jset_entry_dev_usage, entry);
+ struct bch_dev *ca = bch_dev_bkey_exists(c, u->dev);
+ unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64);
+ unsigned nr_types = (bytes - sizeof(struct jset_entry_dev_usage)) /
+ sizeof(struct jset_entry_dev_usage_type);
+ unsigned i;
+
+ ca->usage_base->buckets_ec = le64_to_cpu(u->buckets_ec);
+ ca->usage_base->buckets_unavailable = le64_to_cpu(u->buckets_unavailable);
+
+ for (i = 0; i < nr_types; i++) {
+ ca->usage_base->d[i].buckets = le64_to_cpu(u->d[i].buckets);
+ ca->usage_base->d[i].sectors = le64_to_cpu(u->d[i].sectors);
+ ca->usage_base->d[i].fragmented = le64_to_cpu(u->d[i].fragmented);
+ }
+
+ break;
+ }
case BCH_JSET_ENTRY_blacklist: {
struct jset_entry_blacklist *bl_entry =
container_of(entry, struct jset_entry_blacklist, entry);
@@ -722,6 +763,12 @@ static int journal_replay_entry_early(struct bch_fs *c,
le64_to_cpu(bl_entry->end) + 1);
break;
}
+ case BCH_JSET_ENTRY_clock: {
+ struct jset_entry_clock *clock =
+ container_of(entry, struct jset_entry_clock, entry);
+
+ atomic64_set(&c->io_clock[clock->rw].now, clock->time);
+ }
}
return ret;
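
The new BCH_JSET_ENTRY_dev_usage case recovers the number of per-data-type counters from the entry's length rather than from an explicit count. The same arithmetic as a compile-and-run sketch, assuming jset_u64s() adds one u64 for the shared entry header and using stand-in struct layouts in place of the on-disk ones:

#include <stdint.h>
#include <stdio.h>

struct sketch_usage_type {
	uint64_t buckets, sectors, fragmented;
};

struct sketch_dev_usage {
	uint64_t hdr[3];	/* stand-in: shared entry header + fixed fields */
	struct sketch_usage_type d[];
};

/* Assumed: total u64s of an entry = payload u64s + 1 for the shared header */
static unsigned sketch_jset_u64s(unsigned u64s)
{
	return u64s + 1;
}

static unsigned dev_usage_nr_types(unsigned entry_u64s)
{
	unsigned bytes = sketch_jset_u64s(entry_u64s) * sizeof(uint64_t);

	return (bytes - sizeof(struct sketch_dev_usage)) /
		sizeof(struct sketch_usage_type);
}

int main(void)
{
	/* An entry carrying three per-type counters, for this sketch layout: */
	unsigned entry_u64s = (sizeof(struct sketch_dev_usage) +
			       3 * sizeof(struct sketch_usage_type)) /
			      sizeof(uint64_t) - 1;

	printf("nr_types = %u\n", dev_usage_nr_types(entry_u64s));	/* 3 */
	return 0;
}
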
@@ -736,9 +783,6 @@ static int journal_replay_early(struct bch_fs *c,
int ret;
if (clean) {
- c->bucket_clock[READ].hand = le16_to_cpu(clean->read_clock);
- c->bucket_clock[WRITE].hand = le16_to_cpu(clean->write_clock);
-
for (entry = clean->start;
entry != vstruct_end(&clean->field);
entry = vstruct_next(entry)) {
@@ -751,9 +795,6 @@ static int journal_replay_early(struct bch_fs *c,
if (i->ignore)
continue;
- c->bucket_clock[READ].hand = le16_to_cpu(i->j.read_clock);
- c->bucket_clock[WRITE].hand = le16_to_cpu(i->j.write_clock);
-
vstruct_for_each(&i->j, entry) {
ret = journal_replay_entry_early(c, entry);
if (ret)
@@ -817,13 +858,6 @@ static int verify_superblock_clean(struct bch_fs *c,
return 0;
}
- mustfix_fsck_err_on(j->read_clock != clean->read_clock, c,
- "superblock read clock %u doesn't match journal %u after clean shutdown",
- clean->read_clock, j->read_clock);
- mustfix_fsck_err_on(j->write_clock != clean->write_clock, c,
- "superblock write clock %u doesn't match journal %u after clean shutdown",
- clean->write_clock, j->write_clock);
-
for (i = 0; i < BTREE_ID_NR; i++) {
char buf1[200], buf2[200];
struct bkey_i *k1, *k2;
@@ -899,7 +933,7 @@ static int read_btree_roots(struct bch_fs *c)
if (i == BTREE_ID_ALLOC &&
c->opts.reconstruct_alloc) {
- c->sb.compat &= ~(1ULL << BCH_COMPAT_FEAT_ALLOC_INFO);
+ c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info);
continue;
}
@@ -909,7 +943,7 @@ static int read_btree_roots(struct bch_fs *c)
"invalid btree root %s",
bch2_btree_ids[i]);
if (i == BTREE_ID_ALLOC)
- c->sb.compat &= ~(1ULL << BCH_COMPAT_FEAT_ALLOC_INFO);
+ c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info);
}
ret = bch2_btree_root_read(c, i, &r->key, r->level);
@@ -919,7 +953,7 @@ static int read_btree_roots(struct bch_fs *c)
"error reading btree root %s",
bch2_btree_ids[i]);
if (i == BTREE_ID_ALLOC)
- c->sb.compat &= ~(1ULL << BCH_COMPAT_FEAT_ALLOC_INFO);
+ c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info);
}
}
@@ -936,7 +970,7 @@ int bch2_fs_recovery(struct bch_fs *c)
struct bch_sb_field_clean *clean = NULL;
struct jset *last_journal_entry = NULL;
u64 blacklist_seq, journal_seq;
- bool write_sb = false, need_write_alloc = false;
+ bool write_sb = false;
int ret;
if (c->sb.clean)
@@ -949,6 +983,13 @@ int bch2_fs_recovery(struct bch_fs *c)
bch_info(c, "recovering from clean shutdown, journal seq %llu",
le64_to_cpu(clean->journal_seq));
+ if (!(c->sb.features & (1ULL << BCH_FEATURE_alloc_v2))) {
+ bch_info(c, "alloc_v2 feature bit not set, fsck required");
+ c->opts.fsck = true;
+ c->opts.fix_errors = FSCK_OPT_YES;
+ c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_alloc_v2;
+ }
+
if (!c->replicas.entries ||
c->opts.rebuild_replicas) {
bch_info(c, "building replicas info");
@@ -979,7 +1020,7 @@ int bch2_fs_recovery(struct bch_fs *c)
last_journal_entry &&
!journal_entry_empty(last_journal_entry), c,
"filesystem marked clean but journal not empty")) {
- c->sb.compat &= ~(1ULL << BCH_COMPAT_FEAT_ALLOC_INFO);
+ c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info);
SET_BCH_SB_CLEAN(c->disk_sb.sb, false);
c->sb.clean = false;
}
@@ -1020,7 +1061,7 @@ use_clean:
}
if (c->opts.reconstruct_alloc) {
- c->sb.compat &= ~(1ULL << BCH_COMPAT_FEAT_ALLOC_INFO);
+ c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info);
drop_alloc_keys(&c->journal_keys);
}
@@ -1072,36 +1113,20 @@ use_clean:
set_bit(BCH_FS_ALLOC_READ_DONE, &c->flags);
- if ((c->sb.compat & (1ULL << BCH_COMPAT_FEAT_ALLOC_INFO)) &&
- !(c->sb.compat & (1ULL << BCH_COMPAT_FEAT_ALLOC_METADATA))) {
- /*
- * interior btree node updates aren't consistent with the
- * journal; after an unclean shutdown we have to walk all
- * pointers to metadata:
- */
- bch_info(c, "starting metadata mark and sweep");
- err = "error in mark and sweep";
- ret = bch2_gc(c, &c->journal_keys, true, true);
- if (ret < 0)
- goto err;
- if (ret)
- need_write_alloc = true;
- bch_verbose(c, "mark and sweep done");
- }
-
if (c->opts.fsck ||
- !(c->sb.compat & (1ULL << BCH_COMPAT_FEAT_ALLOC_INFO)) ||
+ !(c->sb.compat & (1ULL << BCH_COMPAT_alloc_info)) ||
+ !(c->sb.compat & (1ULL << BCH_COMPAT_alloc_metadata)) ||
test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags)) {
bch_info(c, "starting mark and sweep");
err = "error in mark and sweep";
- ret = bch2_gc(c, &c->journal_keys, true, false);
- if (ret < 0)
- goto err;
+ ret = bch2_gc(c, true);
if (ret)
- need_write_alloc = true;
+ goto err;
bch_verbose(c, "mark and sweep done");
}
+ bch2_stripes_heap_start(c);
+
clear_bit(BCH_FS_REBUILD_REPLICAS, &c->flags);
set_bit(BCH_FS_INITIAL_GC_DONE, &c->flags);
@@ -1122,7 +1147,8 @@ use_clean:
goto err;
bch_verbose(c, "journal replay done");
- if (need_write_alloc && !c->opts.nochanges) {
+ if (test_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags) &&
+ !c->opts.nochanges) {
/*
* note that even when filesystem was clean there might be work
* to do here, if we ran gc (because of fsck) which recalculated
@@ -1137,8 +1163,6 @@ use_clean:
goto err;
}
bch_verbose(c, "alloc write done");
-
- set_bit(BCH_FS_ALLOC_WRITTEN, &c->flags);
}
if (!c->sb.clean) {
@@ -1177,6 +1201,21 @@ use_clean:
bch_verbose(c, "quotas done");
}
+ if (!(c->sb.compat & (1ULL << BCH_COMPAT_extents_above_btree_updates_done)) ||
+ !(c->sb.compat & (1ULL << BCH_COMPAT_bformat_overflow_done))) {
+ struct bch_move_stats stats = { 0 };
+
+ bch_info(c, "scanning for old btree nodes");
+ ret = bch2_fs_read_write(c);
+ if (ret)
+ goto err;
+
+ ret = bch2_scan_old_btree_nodes(c, &stats);
+ if (ret)
+ goto err;
+ bch_info(c, "scanning for old btree nodes done");
+ }
+
mutex_lock(&c->sb_lock);
if (c->opts.version_upgrade) {
if (c->sb.version < bcachefs_metadata_version_new_versioning)
@@ -1188,7 +1227,7 @@ use_clean:
}
if (!test_bit(BCH_FS_ERROR, &c->flags)) {
- c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_FEAT_ALLOC_INFO;
+ c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_alloc_info;
write_sb = true;
}
@@ -1248,6 +1287,8 @@ int bch2_fs_initialize(struct bch_fs *c)
le16_to_cpu(bcachefs_metadata_version_current);
c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_atomic_nlink;
c->disk_sb.sb->features[0] |= BCH_SB_FEATURES_ALL;
+ c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_extents_above_btree_updates_done;
+ c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_bformat_overflow_done;
bch2_write_super(c);
mutex_unlock(&c->sb_lock);
@@ -1312,8 +1353,10 @@ int bch2_fs_initialize(struct bch_fs *c)
&lostfound,
0, 0, S_IFDIR|0700, 0,
NULL, NULL));
- if (ret)
+ if (ret) {
+ bch_err(c, "error creating lost+found");
goto err;
+ }
if (enabled_qtypes(c)) {
ret = bch2_fs_quota_read(c);
diff --git a/fs/bcachefs/recovery.h b/fs/bcachefs/recovery.h
index a66827c9addf..fa91851b9ed7 100644
--- a/fs/bcachefs/recovery.h
+++ b/fs/bcachefs/recovery.h
@@ -6,10 +6,11 @@
for (i = (keys).d; i < (keys).d + (keys).nr; (i)++)
struct journal_iter {
+ struct list_head list;
enum btree_id btree_id;
unsigned level;
+ size_t idx;
struct journal_keys *keys;
- struct journal_key *k;
};
/*
@@ -17,8 +18,6 @@ struct journal_iter {
*/
struct btree_and_journal_iter {
- struct btree_iter *btree;
-
struct btree *b;
struct btree_node_iter node_iter;
struct bkey unpacked;
@@ -32,16 +31,18 @@ struct btree_and_journal_iter {
} last;
};
+int bch2_journal_key_insert(struct bch_fs *, enum btree_id,
+ unsigned, struct bkey_i *);
+int bch2_journal_key_delete(struct bch_fs *, enum btree_id,
+ unsigned, struct bpos);
+
void bch2_btree_and_journal_iter_advance(struct btree_and_journal_iter *);
struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *);
struct bkey_s_c bch2_btree_and_journal_iter_next(struct btree_and_journal_iter *);
-void bch2_btree_and_journal_iter_init(struct btree_and_journal_iter *,
- struct btree_trans *,
- struct journal_keys *,
- enum btree_id, struct bpos);
+void bch2_btree_and_journal_iter_exit(struct btree_and_journal_iter *);
void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *,
- struct journal_keys *,
+ struct bch_fs *,
struct btree *);
typedef int (*btree_walk_node_fn)(struct bch_fs *c, struct btree *b);
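
struct journal_iter now records a size_t idx into the keys array instead of a struct journal_key pointer, and live iterators are kept on a list; a plausible reading is that bch2_journal_key_insert()/bch2_journal_key_delete() can now grow or shift the array, and an index (fixed up by the insert path) survives that where a raw pointer would dangle. A simplified userspace model of that idea, not the kernel structures:

#include <stdlib.h>
#include <string.h>

struct sketch_keys {
	int	*d;
	size_t	 nr, size;
};

struct sketch_iter {
	struct sketch_keys	*keys;
	size_t			 idx;	/* a position, not a pointer */
};

/* Insert a key at position pos, growing the array when it is full: */
static int sketch_key_insert(struct sketch_keys *keys,
			     struct sketch_iter *iter,
			     size_t pos, int v)
{
	if (keys->nr == keys->size) {
		size_t new_size = keys->size ? keys->size * 2 : 8;
		int *d = realloc(keys->d, new_size * sizeof(*d));

		if (!d)
			return -1;
		keys->d	   = d;	/* any pointer into the old array now dangles */
		keys->size = new_size;
	}

	memmove(&keys->d[pos + 1], &keys->d[pos],
		(keys->nr - pos) * sizeof(keys->d[0]));
	keys->d[pos] = v;
	keys->nr++;

	/* Fix up a live iterator at or past the insertion point: */
	if (iter && iter->idx >= pos)
		iter->idx++;
	return 0;
}

int main(void)
{
	struct sketch_keys keys = { NULL, 0, 0 };
	struct sketch_iter iter = { &keys, 0 };

	sketch_key_insert(&keys, NULL, 0, 10);
	iter.idx = 0;				/* iterator at key "10" */
	sketch_key_insert(&keys, &iter, 0, 5);	/* array shifts; idx fixed up */
	/* keys.d[iter.idx] is still 10, even after the shift/realloc */
	free(keys.d);
	return 0;
}
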
diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c
index 8abcbfb3bd64..930547de3309 100644
--- a/fs/bcachefs/reflink.c
+++ b/fs/bcachefs/reflink.c
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h"
-#include "bkey_on_stack.h"
+#include "bkey_buf.h"
#include "btree_update.h"
#include "extents.h"
#include "inode.h"
@@ -198,8 +198,7 @@ s64 bch2_remap_range(struct bch_fs *c,
struct btree_trans trans;
struct btree_iter *dst_iter, *src_iter;
struct bkey_s_c src_k;
- BKEY_PADDED(k) new_dst;
- struct bkey_on_stack new_src;
+ struct bkey_buf new_dst, new_src;
struct bpos dst_end = dst_start, src_end = src_start;
struct bpos dst_want, src_want;
u64 src_done, dst_done;
@@ -216,7 +215,8 @@ s64 bch2_remap_range(struct bch_fs *c,
dst_end.offset += remap_sectors;
src_end.offset += remap_sectors;
- bkey_on_stack_init(&new_src);
+ bch2_bkey_buf_init(&new_dst);
+ bch2_bkey_buf_init(&new_src);
bch2_trans_init(&trans, c, BTREE_ITER_MAX, 4096);
src_iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, src_start,
@@ -257,7 +257,7 @@ s64 bch2_remap_range(struct bch_fs *c,
break;
if (src_k.k->type != KEY_TYPE_reflink_p) {
- bkey_on_stack_reassemble(&new_src, c, src_k);
+ bch2_bkey_buf_reassemble(&new_src, c, src_k);
src_k = bkey_i_to_s_c(new_src.k);
bch2_cut_front(src_iter->pos, new_src.k);
@@ -275,7 +275,7 @@ s64 bch2_remap_range(struct bch_fs *c,
struct bkey_s_c_reflink_p src_p =
bkey_s_c_to_reflink_p(src_k);
struct bkey_i_reflink_p *dst_p =
- bkey_reflink_p_init(&new_dst.k);
+ bkey_reflink_p_init(new_dst.k);
u64 offset = le64_to_cpu(src_p.v->idx) +
(src_iter->pos.offset -
@@ -286,12 +286,12 @@ s64 bch2_remap_range(struct bch_fs *c,
BUG();
}
- new_dst.k.k.p = dst_iter->pos;
- bch2_key_resize(&new_dst.k.k,
+ new_dst.k->k.p = dst_iter->pos;
+ bch2_key_resize(&new_dst.k->k,
min(src_k.k->p.offset - src_iter->pos.offset,
dst_end.offset - dst_iter->pos.offset));
- ret = bch2_extent_update(&trans, dst_iter, &new_dst.k,
+ ret = bch2_extent_update(&trans, dst_iter, new_dst.k,
NULL, journal_seq,
new_i_size, i_sectors_delta);
if (ret)
@@ -333,7 +333,8 @@ err:
} while (ret2 == -EINTR);
ret = bch2_trans_exit(&trans) ?: ret;
- bkey_on_stack_exit(&new_src, c);
+ bch2_bkey_buf_exit(&new_src, c);
+ bch2_bkey_buf_exit(&new_dst, c);
percpu_ref_put(&c->writes);
diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c
index 00a197b65e0b..be73b458e4f6 100644
--- a/fs/bcachefs/replicas.c
+++ b/fs/bcachefs/replicas.c
@@ -11,11 +11,6 @@ static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *,
/* Replicas tracking - in memory: */
-static inline int u8_cmp(u8 l, u8 r)
-{
- return cmp_int(l, r);
-}
-
static void verify_replicas_entry(struct bch_replicas_entry *e)
{
#ifdef CONFIG_BCACHEFS_DEBUG
@@ -31,7 +26,7 @@ static void verify_replicas_entry(struct bch_replicas_entry *e)
#endif
}
-static void replicas_entry_sort(struct bch_replicas_entry *e)
+void bch2_replicas_entry_sort(struct bch_replicas_entry *e)
{
bubble_sort(e->devs, e->nr_devs, u8_cmp);
}
@@ -127,7 +122,7 @@ void bch2_bkey_to_replicas(struct bch_replicas_entry *e,
break;
}
- replicas_entry_sort(e);
+ bch2_replicas_entry_sort(e);
}
void bch2_devlist_to_replicas(struct bch_replicas_entry *e,
@@ -147,7 +142,7 @@ void bch2_devlist_to_replicas(struct bch_replicas_entry *e,
for (i = 0; i < devs.nr; i++)
e->devs[e->nr_devs++] = devs.devs[i];
- replicas_entry_sort(e);
+ bch2_replicas_entry_sort(e);
}
static struct bch_replicas_cpu
@@ -164,7 +159,7 @@ cpu_replicas_add_entry(struct bch_replicas_cpu *old,
BUG_ON(!new_entry->data_type);
verify_replicas_entry(new_entry);
- new.entries = kcalloc(new.nr, new.entry_size, GFP_NOIO);
+ new.entries = kcalloc(new.nr, new.entry_size, GFP_KERNEL);
if (!new.entries)
return new;
@@ -202,7 +197,7 @@ static inline int __replicas_entry_idx(struct bch_replicas_cpu *r,
int bch2_replicas_entry_idx(struct bch_fs *c,
struct bch_replicas_entry *search)
{
- replicas_entry_sort(search);
+ bch2_replicas_entry_sort(search);
return __replicas_entry_idx(&c->replicas, search);
}
@@ -287,13 +282,13 @@ static int replicas_table_update(struct bch_fs *c,
for (i = 0; i < ARRAY_SIZE(new_usage); i++)
if (!(new_usage[i] = __alloc_percpu_gfp(bytes,
- sizeof(u64), GFP_NOIO)))
+ sizeof(u64), GFP_KERNEL)))
goto err;
- if (!(new_base = kzalloc(bytes, GFP_NOIO)) ||
- !(new_scratch = kmalloc(bytes, GFP_NOIO)) ||
+ if (!(new_base = kzalloc(bytes, GFP_KERNEL)) ||
+ !(new_scratch = kmalloc(bytes, GFP_KERNEL)) ||
(c->usage_gc &&
- !(new_gc = __alloc_percpu_gfp(bytes, sizeof(u64), GFP_NOIO))))
+ !(new_gc = __alloc_percpu_gfp(bytes, sizeof(u64), GFP_KERNEL))))
goto err;
for (i = 0; i < ARRAY_SIZE(new_usage); i++)
@@ -553,7 +548,7 @@ int bch2_replicas_gc_start(struct bch_fs *c, unsigned typemask)
c->replicas_gc.entries = kcalloc(c->replicas_gc.nr,
c->replicas_gc.entry_size,
- GFP_NOIO);
+ GFP_KERNEL);
if (!c->replicas_gc.entries) {
mutex_unlock(&c->sb_lock);
bch_err(c, "error allocating c->replicas_gc");
@@ -603,7 +598,11 @@ retry:
cpu_replicas_entry(&c->replicas, i);
if (e->data_type == BCH_DATA_journal ||
- bch2_fs_usage_read_one(c, &c->usage_base->replicas[i]))
+ c->usage_base->replicas[i] ||
+ percpu_u64_get(&c->usage[0]->replicas[i]) ||
+ percpu_u64_get(&c->usage[1]->replicas[i]) ||
+ percpu_u64_get(&c->usage[2]->replicas[i]) ||
+ percpu_u64_get(&c->usage[3]->replicas[i]))
memcpy(cpu_replicas_entry(&new, new.nr++),
e, new.entry_size);
}
@@ -672,7 +671,7 @@ __bch2_sb_replicas_to_cpu_replicas(struct bch_sb_field_replicas *sb_r,
nr++;
}
- cpu_r->entries = kcalloc(nr, entry_size, GFP_NOIO);
+ cpu_r->entries = kcalloc(nr, entry_size, GFP_KERNEL);
if (!cpu_r->entries)
return -ENOMEM;
@@ -682,7 +681,7 @@ __bch2_sb_replicas_to_cpu_replicas(struct bch_sb_field_replicas *sb_r,
for_each_replicas_entry(sb_r, e) {
dst = cpu_replicas_entry(cpu_r, idx++);
memcpy(dst, e, replicas_entry_bytes(e));
- replicas_entry_sort(dst);
+ bch2_replicas_entry_sort(dst);
}
return 0;
@@ -704,7 +703,7 @@ __bch2_sb_replicas_v0_to_cpu_replicas(struct bch_sb_field_replicas_v0 *sb_r,
entry_size += sizeof(struct bch_replicas_entry) -
sizeof(struct bch_replicas_entry_v0);
- cpu_r->entries = kcalloc(nr, entry_size, GFP_NOIO);
+ cpu_r->entries = kcalloc(nr, entry_size, GFP_KERNEL);
if (!cpu_r->entries)
return -ENOMEM;
@@ -719,7 +718,7 @@ __bch2_sb_replicas_v0_to_cpu_replicas(struct bch_sb_field_replicas_v0 *sb_r,
dst->nr_devs = e->nr_devs;
dst->nr_required = 1;
memcpy(dst->devs, e->devs, e->nr_devs);
- replicas_entry_sort(dst);
+ bch2_replicas_entry_sort(dst);
}
return 0;
@@ -959,94 +958,48 @@ const struct bch_sb_field_ops bch_sb_field_ops_replicas_v0 = {
/* Query replicas: */
-struct replicas_status __bch2_replicas_status(struct bch_fs *c,
- struct bch_devs_mask online_devs)
+bool bch2_have_enough_devs(struct bch_fs *c, struct bch_devs_mask devs,
+ unsigned flags, bool print)
{
- struct bch_sb_field_members *mi;
struct bch_replicas_entry *e;
- unsigned i, nr_online, nr_offline;
- struct replicas_status ret;
-
- memset(&ret, 0, sizeof(ret));
-
- for (i = 0; i < ARRAY_SIZE(ret.replicas); i++)
- ret.replicas[i].redundancy = INT_MAX;
-
- mi = bch2_sb_get_members(c->disk_sb.sb);
+ bool ret = true;
percpu_down_read(&c->mark_lock);
-
for_each_cpu_replicas_entry(&c->replicas, e) {
- if (e->data_type >= ARRAY_SIZE(ret.replicas))
- panic("e %p data_type %u\n", e, e->data_type);
+ unsigned i, nr_online = 0, dflags = 0;
+ bool metadata = e->data_type < BCH_DATA_user;
- nr_online = nr_offline = 0;
+ for (i = 0; i < e->nr_devs; i++)
+ nr_online += test_bit(e->devs[i], devs.d);
- for (i = 0; i < e->nr_devs; i++) {
- BUG_ON(!bch2_dev_exists(c->disk_sb.sb, mi,
- e->devs[i]));
+ if (nr_online < e->nr_required)
+ dflags |= metadata
+ ? BCH_FORCE_IF_METADATA_LOST
+ : BCH_FORCE_IF_DATA_LOST;
- if (test_bit(e->devs[i], online_devs.d))
- nr_online++;
- else
- nr_offline++;
- }
+ if (nr_online < e->nr_devs)
+ dflags |= metadata
+ ? BCH_FORCE_IF_METADATA_DEGRADED
+ : BCH_FORCE_IF_DATA_DEGRADED;
- ret.replicas[e->data_type].redundancy =
- min(ret.replicas[e->data_type].redundancy,
- (int) nr_online - (int) e->nr_required);
+ if (dflags & ~flags) {
+ if (print) {
+ char buf[100];
- ret.replicas[e->data_type].nr_offline =
- max(ret.replicas[e->data_type].nr_offline,
- nr_offline);
- }
+ bch2_replicas_entry_to_text(&PBUF(buf), e);
+ bch_err(c, "insufficient devices online (%u) for replicas entry %s",
+ nr_online, buf);
+ }
+ ret = false;
+ break;
+ }
+ }
percpu_up_read(&c->mark_lock);
- for (i = 0; i < ARRAY_SIZE(ret.replicas); i++)
- if (ret.replicas[i].redundancy == INT_MAX)
- ret.replicas[i].redundancy = 0;
-
return ret;
}
-struct replicas_status bch2_replicas_status(struct bch_fs *c)
-{
- return __bch2_replicas_status(c, bch2_online_devs(c));
-}
-
-static bool have_enough_devs(struct replicas_status s,
- enum bch_data_type type,
- bool force_if_degraded,
- bool force_if_lost)
-{
- return (!s.replicas[type].nr_offline || force_if_degraded) &&
- (s.replicas[type].redundancy >= 0 || force_if_lost);
-}
-
-bool bch2_have_enough_devs(struct replicas_status s, unsigned flags)
-{
- return (have_enough_devs(s, BCH_DATA_journal,
- flags & BCH_FORCE_IF_METADATA_DEGRADED,
- flags & BCH_FORCE_IF_METADATA_LOST) &&
- have_enough_devs(s, BCH_DATA_btree,
- flags & BCH_FORCE_IF_METADATA_DEGRADED,
- flags & BCH_FORCE_IF_METADATA_LOST) &&
- have_enough_devs(s, BCH_DATA_user,
- flags & BCH_FORCE_IF_DATA_DEGRADED,
- flags & BCH_FORCE_IF_DATA_LOST));
-}
-
-int bch2_replicas_online(struct bch_fs *c, bool meta)
-{
- struct replicas_status s = bch2_replicas_status(c);
-
- return (meta
- ? min(s.replicas[BCH_DATA_journal].redundancy,
- s.replicas[BCH_DATA_btree].redundancy)
- : s.replicas[BCH_DATA_user].redundancy) + 1;
-}
-
unsigned bch2_dev_has_data(struct bch_fs *c, struct bch_dev *ca)
{
struct bch_replicas_entry *e;
@@ -1066,8 +1019,9 @@ unsigned bch2_dev_has_data(struct bch_fs *c, struct bch_dev *ca)
int bch2_fs_replicas_init(struct bch_fs *c)
{
- c->journal.entry_u64s_reserved +=
- reserve_journal_replicas(c, &c->replicas);
+ bch2_journal_entry_res_resize(&c->journal,
+ &c->replicas_journal_res,
+ reserve_journal_replicas(c, &c->replicas));
return replicas_table_update(c, &c->replicas);
}
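
The rewritten bch2_have_enough_devs() computes, for each replicas entry, which degraded/lost "force" flags mounting with the given device mask would require, and refuses unless the caller passed all of them. A userspace sketch that mirrors that check, with simplified entries and illustrative flag names in place of the real BCH_FORCE_IF_* constants:

#include <stdbool.h>
#include <stdio.h>

enum {
	FORCE_IF_DATA_DEGRADED		= 1 << 0,
	FORCE_IF_DATA_LOST		= 1 << 1,
	FORCE_IF_METADATA_DEGRADED	= 1 << 2,
	FORCE_IF_METADATA_LOST		= 1 << 3,
};

struct sketch_replicas_entry {
	bool		metadata;	/* journal/btree rather than user data */
	unsigned	nr_devs;	/* devices the data is replicated across */
	unsigned	nr_required;	/* copies needed to read it at all */
	unsigned	nr_online;	/* of nr_devs, how many are present */
};

static bool have_enough_devs(const struct sketch_replicas_entry *e,
			     unsigned nr_entries, unsigned flags)
{
	unsigned i;

	for (i = 0; i < nr_entries; i++) {
		unsigned dflags = 0;

		if (e[i].nr_online < e[i].nr_required)
			dflags |= e[i].metadata
				? FORCE_IF_METADATA_LOST
				: FORCE_IF_DATA_LOST;

		if (e[i].nr_online < e[i].nr_devs)
			dflags |= e[i].metadata
				? FORCE_IF_METADATA_DEGRADED
				: FORCE_IF_DATA_DEGRADED;

		/* Would need a force flag the caller didn't pass: refuse */
		if (dflags & ~flags)
			return false;
	}
	return true;
}

int main(void)
{
	struct sketch_replicas_entry e[] = {
		{ .metadata = false, .nr_devs = 2, .nr_required = 1, .nr_online = 1 },
	};

	printf("%d %d\n",
	       have_enough_devs(e, 1, 0),			/* 0 */
	       have_enough_devs(e, 1, FORCE_IF_DATA_DEGRADED));	/* 1 */
	return 0;
}
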
diff --git a/fs/bcachefs/replicas.h b/fs/bcachefs/replicas.h
index 8b95164fbb56..9c8fd3d98247 100644
--- a/fs/bcachefs/replicas.h
+++ b/fs/bcachefs/replicas.h
@@ -5,6 +5,7 @@
#include "eytzinger.h"
#include "replicas_types.h"
+void bch2_replicas_entry_sort(struct bch_replicas_entry *);
void bch2_replicas_entry_to_text(struct printbuf *,
struct bch_replicas_entry *);
void bch2_cpu_replicas_to_text(struct printbuf *, struct bch_replicas_cpu *);
@@ -38,19 +39,9 @@ static inline void bch2_replicas_entry_cached(struct bch_replicas_entry *e,
e->devs[0] = dev;
}
-struct replicas_status {
- struct {
- int redundancy;
- unsigned nr_offline;
- } replicas[BCH_DATA_NR];
-};
+bool bch2_have_enough_devs(struct bch_fs *, struct bch_devs_mask,
+ unsigned, bool);
-struct replicas_status __bch2_replicas_status(struct bch_fs *,
- struct bch_devs_mask);
-struct replicas_status bch2_replicas_status(struct bch_fs *);
-bool bch2_have_enough_devs(struct replicas_status, unsigned);
-
-int bch2_replicas_online(struct bch_fs *, bool);
unsigned bch2_dev_has_data(struct bch_fs *, struct bch_dev *);
int bch2_replicas_gc_end(struct bch_fs *, int);
diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c
index 78835bd2d6bc..86f1feff3aaa 100644
--- a/fs/bcachefs/super-io.c
+++ b/fs/bcachefs/super-io.c
@@ -276,19 +276,19 @@ const char *bch2_sb_validate(struct bch_sb_handle *disk_sb)
return "Bad number of member devices";
if (!BCH_SB_META_REPLICAS_WANT(sb) ||
- BCH_SB_META_REPLICAS_WANT(sb) >= BCH_REPLICAS_MAX)
+ BCH_SB_META_REPLICAS_WANT(sb) > BCH_REPLICAS_MAX)
return "Invalid number of metadata replicas";
if (!BCH_SB_META_REPLICAS_REQ(sb) ||
- BCH_SB_META_REPLICAS_REQ(sb) >= BCH_REPLICAS_MAX)
+ BCH_SB_META_REPLICAS_REQ(sb) > BCH_REPLICAS_MAX)
return "Invalid number of metadata replicas";
if (!BCH_SB_DATA_REPLICAS_WANT(sb) ||
- BCH_SB_DATA_REPLICAS_WANT(sb) >= BCH_REPLICAS_MAX)
+ BCH_SB_DATA_REPLICAS_WANT(sb) > BCH_REPLICAS_MAX)
return "Invalid number of data replicas";
if (!BCH_SB_DATA_REPLICAS_REQ(sb) ||
- BCH_SB_DATA_REPLICAS_REQ(sb) >= BCH_REPLICAS_MAX)
+ BCH_SB_DATA_REPLICAS_REQ(sb) > BCH_REPLICAS_MAX)
return "Invalid number of data replicas";
if (BCH_SB_META_CSUM_TYPE(sb) >= BCH_CSUM_OPT_NR)
@@ -375,7 +375,6 @@ static void bch2_sb_update(struct bch_fs *c)
ca->mi = bch2_mi_to_cpu(mi->members + i);
}
-/* doesn't copy member info */
static void __copy_super(struct bch_sb_handle *dst_handle, struct bch_sb *src)
{
struct bch_sb_field *src_f, *dst_f;
@@ -767,15 +766,13 @@ int bch2_write_super(struct bch_fs *c)
nr_wrote = dev_mask_nr(&sb_written);
can_mount_with_written =
- bch2_have_enough_devs(__bch2_replicas_status(c, sb_written),
- BCH_FORCE_IF_DEGRADED);
+ bch2_have_enough_devs(c, sb_written, BCH_FORCE_IF_DEGRADED, false);
for (i = 0; i < ARRAY_SIZE(sb_written.d); i++)
sb_written.d[i] = ~sb_written.d[i];
can_mount_without_written =
- bch2_have_enough_devs(__bch2_replicas_status(c, sb_written),
- BCH_FORCE_IF_DEGRADED);
+ bch2_have_enough_devs(c, sb_written, BCH_FORCE_IF_DEGRADED, false);
/*
* If we would be able to mount _without_ the devices we successfully
@@ -786,6 +783,7 @@ int bch2_write_super(struct bch_fs *c)
* mount with the devices we did successfully write to:
*/
if (bch2_fs_fatal_err_on(!nr_wrote ||
+ !can_mount_with_written ||
(can_mount_without_written &&
!can_mount_with_written), c,
"Unable to write superblock to sufficient devices"))
@@ -954,40 +952,35 @@ int bch2_fs_mark_dirty(struct bch_fs *c)
mutex_lock(&c->sb_lock);
SET_BCH_SB_CLEAN(c->disk_sb.sb, false);
- c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_new_extent_overwrite;
- c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_extents_above_btree_updates;
- c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_btree_updates_journalled;
+ c->disk_sb.sb->features[0] |= BCH_SB_FEATURES_ALWAYS;
ret = bch2_write_super(c);
mutex_unlock(&c->sb_lock);
return ret;
}
-static void
-entry_init_u64s(struct jset_entry *entry, unsigned u64s)
+static struct jset_entry *jset_entry_init(struct jset_entry **end, size_t size)
{
- memset(entry, 0, u64s * sizeof(u64));
+ struct jset_entry *entry = *end;
+ unsigned u64s = DIV_ROUND_UP(size, sizeof(u64));
+ memset(entry, 0, u64s * sizeof(u64));
/*
* The u64s field counts from the start of data, ignoring the shared
* fields.
*/
entry->u64s = u64s - 1;
-}
-static void
-entry_init_size(struct jset_entry *entry, size_t size)
-{
- unsigned u64s = DIV_ROUND_UP(size, sizeof(u64));
- entry_init_u64s(entry, u64s);
+ *end = vstruct_next(*end);
+ return entry;
}
-struct jset_entry *
-bch2_journal_super_entries_add_common(struct bch_fs *c,
- struct jset_entry *entry,
- u64 journal_seq)
+void bch2_journal_super_entries_add_common(struct bch_fs *c,
+ struct jset_entry **end,
+ u64 journal_seq)
{
- unsigned i;
+ struct bch_dev *ca;
+ unsigned i, dev;
percpu_down_write(&c->mark_lock);
@@ -1000,58 +993,77 @@ bch2_journal_super_entries_add_common(struct bch_fs *c,
{
struct jset_entry_usage *u =
- container_of(entry, struct jset_entry_usage, entry);
+ container_of(jset_entry_init(end, sizeof(*u)),
+ struct jset_entry_usage, entry);
- entry_init_size(entry, sizeof(*u));
u->entry.type = BCH_JSET_ENTRY_usage;
u->entry.btree_id = FS_USAGE_INODES;
u->v = cpu_to_le64(c->usage_base->nr_inodes);
-
- entry = vstruct_next(entry);
}
{
struct jset_entry_usage *u =
- container_of(entry, struct jset_entry_usage, entry);
+ container_of(jset_entry_init(end, sizeof(*u)),
+ struct jset_entry_usage, entry);
- entry_init_size(entry, sizeof(*u));
u->entry.type = BCH_JSET_ENTRY_usage;
u->entry.btree_id = FS_USAGE_KEY_VERSION;
u->v = cpu_to_le64(atomic64_read(&c->key_version));
-
- entry = vstruct_next(entry);
}
for (i = 0; i < BCH_REPLICAS_MAX; i++) {
struct jset_entry_usage *u =
- container_of(entry, struct jset_entry_usage, entry);
+ container_of(jset_entry_init(end, sizeof(*u)),
+ struct jset_entry_usage, entry);
- entry_init_size(entry, sizeof(*u));
u->entry.type = BCH_JSET_ENTRY_usage;
u->entry.btree_id = FS_USAGE_RESERVED;
u->entry.level = i;
u->v = cpu_to_le64(c->usage_base->persistent_reserved[i]);
-
- entry = vstruct_next(entry);
}
for (i = 0; i < c->replicas.nr; i++) {
struct bch_replicas_entry *e =
cpu_replicas_entry(&c->replicas, i);
struct jset_entry_data_usage *u =
- container_of(entry, struct jset_entry_data_usage, entry);
+ container_of(jset_entry_init(end, sizeof(*u) + e->nr_devs),
+ struct jset_entry_data_usage, entry);
- entry_init_size(entry, sizeof(*u) + e->nr_devs);
u->entry.type = BCH_JSET_ENTRY_data_usage;
u->v = cpu_to_le64(c->usage_base->replicas[i]);
memcpy(&u->r, e, replicas_entry_bytes(e));
+ }
- entry = vstruct_next(entry);
+ for_each_member_device(ca, c, dev) {
+ unsigned b = sizeof(struct jset_entry_dev_usage) +
+ sizeof(struct jset_entry_dev_usage_type) * BCH_DATA_NR;
+ struct jset_entry_dev_usage *u =
+ container_of(jset_entry_init(end, b),
+ struct jset_entry_dev_usage, entry);
+
+ u->entry.type = BCH_JSET_ENTRY_dev_usage;
+ u->dev = cpu_to_le32(dev);
+ u->buckets_ec = cpu_to_le64(ca->usage_base->buckets_ec);
+ u->buckets_unavailable = cpu_to_le64(ca->usage_base->buckets_unavailable);
+
+ for (i = 0; i < BCH_DATA_NR; i++) {
+ u->d[i].buckets = cpu_to_le64(ca->usage_base->d[i].buckets);
+ u->d[i].sectors = cpu_to_le64(ca->usage_base->d[i].sectors);
+ u->d[i].fragmented = cpu_to_le64(ca->usage_base->d[i].fragmented);
+ }
}
percpu_up_write(&c->mark_lock);
- return entry;
+ for (i = 0; i < 2; i++) {
+ struct jset_entry_clock *clock =
+ container_of(jset_entry_init(end, sizeof(*clock)),
+ struct jset_entry_clock, entry);
+
+ clock->entry.type = BCH_JSET_ENTRY_clock;
+ clock->rw = i;
+ clock->time = atomic64_read(&c->io_clock[i].now);
+ }
}
void bch2_fs_mark_clean(struct bch_fs *c)
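
jset_entry_init() above rounds each entry up to whole u64s, stores u64s - 1 in the entry (the shared header u64 is not counted, per the comment), and advances the write cursor past it. A standalone sketch of that sizing, assuming the cursor advances by the full header-plus-payload size and using a stand-in entry layout:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define SKETCH_DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))

struct sketch_entry {
	uint16_t u64s;		/* payload size in u64s, after this header */
	uint8_t  type;
	uint8_t  pad[5];
	uint64_t data[];
};

static struct sketch_entry *sketch_entry_init(struct sketch_entry **end,
					      size_t size)
{
	struct sketch_entry *entry = *end;
	unsigned u64s = SKETCH_DIV_ROUND_UP(size, sizeof(uint64_t));

	memset(entry, 0, u64s * sizeof(uint64_t));
	entry->u64s = u64s - 1;		/* the shared header u64 isn't counted */

	/* Advance the cursor past header plus payload: */
	*end = (struct sketch_entry *) ((uint64_t *) entry + u64s);
	return entry;
}

int main(void)
{
	uint64_t buf[32];
	struct sketch_entry *end = (struct sketch_entry *) buf;
	struct sketch_entry *e = sketch_entry_init(&end, 20);

	/* 20 bytes round up to 3 u64s: u64s field is 2, cursor moves 24 bytes */
	printf("u64s=%u advanced=%zu\n", (unsigned) e->u64s,
	       (size_t) ((char *) end - (char *) buf));
	return 0;
}
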
@@ -1066,8 +1078,8 @@ void bch2_fs_mark_clean(struct bch_fs *c)
SET_BCH_SB_CLEAN(c->disk_sb.sb, true);
- c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_FEAT_ALLOC_INFO;
- c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_FEAT_ALLOC_METADATA;
+ c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_alloc_info;
+ c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_alloc_metadata;
c->disk_sb.sb->features[0] &= ~(1ULL << BCH_FEATURE_extents_above_btree_updates);
c->disk_sb.sb->features[0] &= ~(1ULL << BCH_FEATURE_btree_updates_journalled);
@@ -1080,15 +1092,13 @@ void bch2_fs_mark_clean(struct bch_fs *c)
}
sb_clean->flags = 0;
- sb_clean->read_clock = cpu_to_le16(c->bucket_clock[READ].hand);
- sb_clean->write_clock = cpu_to_le16(c->bucket_clock[WRITE].hand);
sb_clean->journal_seq = cpu_to_le64(journal_cur_seq(&c->journal) - 1);
/* Trying to catch outstanding bug: */
BUG_ON(le64_to_cpu(sb_clean->journal_seq) > S64_MAX);
entry = sb_clean->start;
- entry = bch2_journal_super_entries_add_common(c, entry, 0);
+ bch2_journal_super_entries_add_common(c, &entry, 0);
entry = bch2_btree_roots_to_journal_entries(c, entry, entry);
BUG_ON((void *) entry > vstruct_end(&sb_clean->field));
diff --git a/fs/bcachefs/super-io.h b/fs/bcachefs/super-io.h
index 7a068158efca..1a35124f5f47 100644
--- a/fs/bcachefs/super-io.h
+++ b/fs/bcachefs/super-io.h
@@ -122,9 +122,8 @@ static inline struct bch_member_cpu bch2_mi_to_cpu(struct bch_member *mi)
/* BCH_SB_FIELD_clean: */
-struct jset_entry *
-bch2_journal_super_entries_add_common(struct bch_fs *,
- struct jset_entry *, u64);
+void bch2_journal_super_entries_add_common(struct bch_fs *,
+ struct jset_entry **, u64);
void bch2_sb_clean_renumber(struct bch_sb_field_clean *, int);
diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c
index 2d5a9b1c6a9d..b94bbca42446 100644
--- a/fs/bcachefs/super.c
+++ b/fs/bcachefs/super.c
@@ -148,42 +148,21 @@ struct bch_fs *bch2_uuid_to_fs(uuid_le uuid)
return c;
}
-int bch2_congested(void *data, int bdi_bits)
+static void bch2_dev_usage_journal_reserve(struct bch_fs *c)
{
- struct bch_fs *c = data;
- struct backing_dev_info *bdi;
struct bch_dev *ca;
- unsigned i;
- int ret = 0;
+ unsigned i, nr = 0, u64s =
+ ((sizeof(struct jset_entry_dev_usage) +
+ sizeof(struct jset_entry_dev_usage_type) * BCH_DATA_NR)) /
+ sizeof(u64);
rcu_read_lock();
- if (bdi_bits & (1 << WB_sync_congested)) {
- /* Reads - check all devices: */
- for_each_readable_member(ca, c, i) {
- bdi = ca->disk_sb.bdev->bd_bdi;
-
- if (bdi_congested(bdi, bdi_bits)) {
- ret = 1;
- break;
- }
- }
- } else {
- const struct bch_devs_mask *devs =
- bch2_target_to_mask(c, c->opts.foreground_target) ?:
- &c->rw_devs[BCH_DATA_user];
-
- for_each_member_device_rcu(ca, c, i, devs) {
- bdi = ca->disk_sb.bdev->bd_bdi;
-
- if (bdi_congested(bdi, bdi_bits)) {
- ret = 1;
- break;
- }
- }
- }
+ for_each_member_device_rcu(ca, c, i, NULL)
+ nr++;
rcu_read_unlock();
- return ret;
+ bch2_journal_entry_res_resize(&c->journal,
+ &c->dev_usage_journal_res, u64s * nr);
}
/* Filesystem RO/RW: */
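
bch2_dev_usage_journal_reserve() sizes the journal reservation as (u64s per dev_usage entry) times the number of member devices counted under RCU. The same arithmetic as a runnable sketch, with illustrative struct sizes and data-type count standing in for the real on-disk layouts:

#include <stdint.h>
#include <stdio.h>

struct sketch_dev_usage_hdr  { uint64_t v[3]; };	/* fixed part of entry */
struct sketch_dev_usage_type { uint64_t buckets, sectors, fragmented; };

#define SKETCH_DATA_NR	6	/* illustrative number of data types */

static unsigned dev_usage_reserve_u64s(unsigned nr_devices)
{
	/* Every member device contributes one dev_usage entry per journal write: */
	unsigned u64s = (sizeof(struct sketch_dev_usage_hdr) +
			 sizeof(struct sketch_dev_usage_type) * SKETCH_DATA_NR) /
			sizeof(uint64_t);

	return u64s * nr_devices;
}

int main(void)
{
	printf("reserve %u u64s for 4 devices\n", dev_usage_reserve_u64s(4));
	return 0;
}
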
@@ -212,9 +191,6 @@ static void __bch2_fs_read_only(struct bch_fs *c)
bch2_copygc_stop(c);
bch2_gc_thread_stop(c);
- bch2_io_timer_del(&c->io_clock[READ], &c->bucket_clock[READ].rescale);
- bch2_io_timer_del(&c->io_clock[WRITE], &c->bucket_clock[WRITE].rescale);
-
/*
* Flush journal before stopping allocators, because flushing journal
* blacklist entries involves allocating new btree nodes:
@@ -273,10 +249,7 @@ nowrote_alloc:
* the journal kicks off btree writes via reclaim - wait for in flight
* writes after stopping journal:
*/
- if (test_bit(BCH_FS_EMERGENCY_RO, &c->flags))
- bch2_btree_flush_all_writes(c);
- else
- bch2_btree_verify_flushed(c);
+ bch2_btree_flush_all_writes(c);
/*
* After stopping journal:
@@ -440,9 +413,6 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early)
bch2_dev_allocator_add(c, ca);
bch2_recalc_capacity(c);
- bch2_io_timer_add(&c->io_clock[READ], &c->bucket_clock[READ].rescale);
- bch2_io_timer_add(&c->io_clock[WRITE], &c->bucket_clock[WRITE].rescale);
-
for_each_rw_member(ca, c, i) {
ret = bch2_dev_allocator_start(ca);
if (ret) {
@@ -454,6 +424,9 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early)
set_bit(BCH_FS_ALLOCATOR_RUNNING, &c->flags);
+ for_each_rw_member(ca, c, i)
+ bch2_wake_allocator(ca);
+
ret = bch2_journal_reclaim_start(&c->journal);
if (ret) {
bch_err(c, "error starting journal reclaim: %i", ret);
@@ -725,6 +698,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
bch2_blacklist_entries_gc);
INIT_LIST_HEAD(&c->journal_entries);
+ INIT_LIST_HEAD(&c->journal_iters);
INIT_LIST_HEAD(&c->fsck_errors);
mutex_init(&c->fsck_error_lock);
@@ -825,6 +799,14 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
bch2_dev_alloc(c, i))
goto err;
+ bch2_journal_entry_res_resize(&c->journal,
+ &c->btree_root_journal_res,
+ BTREE_ID_NR * (JSET_KEYS_U64s + BKEY_BTREE_PTR_U64s_MAX));
+ bch2_dev_usage_journal_reserve(c);
+ bch2_journal_entry_res_resize(&c->journal,
+ &c->clock_journal_res,
+ (sizeof(struct jset_entry_clock) / sizeof(u64)) * 2);
+
mutex_lock(&bch_fs_list_lock);
err = bch2_fs_online(c);
mutex_unlock(&bch_fs_list_lock);
@@ -1022,6 +1004,8 @@ static void bch2_dev_release(struct kobject *kobj)
static void bch2_dev_free(struct bch_dev *ca)
{
+ bch2_dev_allocator_stop(ca);
+
cancel_work_sync(&ca->io_error_work);
if (ca->kobj.state_in_sysfs &&
@@ -1190,6 +1174,14 @@ static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx)
if (!ca)
goto err;
+ ca->fs = c;
+
+ if (ca->mi.state == BCH_MEMBER_STATE_RW &&
+ bch2_dev_allocator_start(ca)) {
+ bch2_dev_free(ca);
+ goto err;
+ }
+
bch2_dev_attach(c, ca, dev_idx);
out:
pr_verbose_init(c->opts, "ret %i", ret);
@@ -1260,13 +1252,6 @@ static int bch2_dev_attach_bdev(struct bch_fs *c, struct bch_sb_handle *sb)
if (ret)
return ret;
- if (test_bit(BCH_FS_ALLOC_READ_DONE, &c->flags) &&
- !percpu_u64_get(&ca->usage[0]->buckets[BCH_DATA_sb])) {
- mutex_lock(&c->sb_lock);
- bch2_mark_dev_superblock(ca->fs, ca, 0);
- mutex_unlock(&c->sb_lock);
- }
-
bch2_dev_sysfs_online(c, ca);
if (c->sb.nr_devices == 1)
@@ -1292,7 +1277,6 @@ bool bch2_dev_state_allowed(struct bch_fs *c, struct bch_dev *ca,
enum bch_member_state new_state, int flags)
{
struct bch_devs_mask new_online_devs;
- struct replicas_status s;
struct bch_dev *ca2;
int i, nr_rw = 0, required;
@@ -1328,9 +1312,7 @@ bool bch2_dev_state_allowed(struct bch_fs *c, struct bch_dev *ca,
new_online_devs = bch2_online_devs(c);
__clear_bit(ca->dev_idx, new_online_devs.d);
- s = __bch2_replicas_status(c, new_online_devs);
-
- return bch2_have_enough_devs(s, flags);
+ return bch2_have_enough_devs(c, new_online_devs, flags, false);
default:
BUG();
}
@@ -1338,14 +1320,18 @@ bool bch2_dev_state_allowed(struct bch_fs *c, struct bch_dev *ca,
static bool bch2_fs_may_start(struct bch_fs *c)
{
- struct replicas_status s;
struct bch_sb_field_members *mi;
struct bch_dev *ca;
- unsigned i, flags = c->opts.degraded
- ? BCH_FORCE_IF_DEGRADED
- : 0;
+ unsigned i, flags = 0;
+
+ if (c->opts.very_degraded)
+ flags |= BCH_FORCE_IF_DEGRADED|BCH_FORCE_IF_LOST;
- if (!c->opts.degraded) {
+ if (c->opts.degraded)
+ flags |= BCH_FORCE_IF_DEGRADED;
+
+ if (!c->opts.degraded &&
+ !c->opts.very_degraded) {
mutex_lock(&c->sb_lock);
mi = bch2_sb_get_members(c->disk_sb.sb);
@@ -1365,9 +1351,7 @@ static bool bch2_fs_may_start(struct bch_fs *c)
mutex_unlock(&c->sb_lock);
}
- s = bch2_replicas_status(c);
-
- return bch2_have_enough_devs(s, flags);
+ return bch2_have_enough_devs(c, bch2_online_devs(c), flags, true);
}
static void __bch2_dev_read_only(struct bch_fs *c, struct bch_dev *ca)
@@ -1568,6 +1552,8 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags)
mutex_unlock(&c->sb_lock);
up_write(&c->state_lock);
+
+ bch2_dev_usage_journal_reserve(c);
return 0;
err:
if (ca->mi.state == BCH_MEMBER_STATE_RW &&
@@ -1577,19 +1563,6 @@ err:
return ret;
}
-static void dev_usage_clear(struct bch_dev *ca)
-{
- struct bucket_array *buckets;
-
- percpu_memset(ca->usage[0], 0, sizeof(*ca->usage[0]));
-
- down_read(&ca->bucket_lock);
- buckets = bucket_array(ca);
-
- memset(buckets->b, 0, sizeof(buckets->b[0]) * buckets->nbuckets);
- up_read(&ca->bucket_lock);
-}
-
/* Add new device to running filesystem: */
int bch2_dev_add(struct bch_fs *c, const char *path)
{
@@ -1640,15 +1613,13 @@ int bch2_dev_add(struct bch_fs *c, const char *path)
* allocate the journal, reset all the marks, then remark after we
* attach...
*/
- bch2_mark_dev_superblock(ca->fs, ca, 0);
+ bch2_mark_dev_superblock(NULL, ca, 0);
err = "journal alloc failed";
ret = bch2_dev_journal_alloc(ca);
if (ret)
goto err;
- dev_usage_clear(ca);
-
down_write(&c->state_lock);
mutex_lock(&c->sb_lock);
@@ -1699,15 +1670,15 @@ have_slot:
ca->disk_sb.sb->dev_idx = dev_idx;
bch2_dev_attach(c, ca, dev_idx);
- bch2_mark_dev_superblock(c, ca, 0);
-
bch2_write_super(c);
mutex_unlock(&c->sb_lock);
- err = "alloc write failed";
- ret = bch2_dev_alloc_write(c, ca, 0);
+ bch2_dev_usage_journal_reserve(c);
+
+ err = "error marking superblock";
+ ret = bch2_trans_mark_dev_sb(c, NULL, ca);
if (ret)
- goto err;
+ goto err_late;
if (ca->mi.state == BCH_MEMBER_STATE_RW) {
err = __bch2_dev_read_write(c, ca);
@@ -1728,6 +1699,7 @@ err:
bch_err(c, "Unable to add device: %s", err);
return ret;
err_late:
+ up_write(&c->state_lock);
bch_err(c, "Error going rw after adding device: %s", err);
return -EINVAL;
}
@@ -1763,6 +1735,12 @@ int bch2_dev_online(struct bch_fs *c, const char *path)
}
ca = bch_dev_locked(c, dev_idx);
+
+ if (bch2_trans_mark_dev_sb(c, NULL, ca)) {
+ err = "bch2_trans_mark_dev_sb() error";
+ goto err;
+ }
+
if (ca->mi.state == BCH_MEMBER_STATE_RW) {
err = __bch2_dev_read_write(c, ca);
if (err)
diff --git a/fs/bcachefs/super.h b/fs/bcachefs/super.h
index 048ffec622af..02c81f3555c3 100644
--- a/fs/bcachefs/super.h
+++ b/fs/bcachefs/super.h
@@ -199,7 +199,6 @@ static inline struct bch_devs_mask bch2_online_devs(struct bch_fs *c)
struct bch_fs *bch2_bdev_to_fs(struct block_device *);
struct bch_fs *bch2_uuid_to_fs(uuid_le);
-int bch2_congested(void *, int);
bool bch2_dev_state_allowed(struct bch_fs *, struct bch_dev *,
enum bch_member_state, int);
diff --git a/fs/bcachefs/super_types.h b/fs/bcachefs/super_types.h
index 20406ebd6f5b..069973a38f12 100644
--- a/fs/bcachefs/super_types.h
+++ b/fs/bcachefs/super_types.h
@@ -20,7 +20,7 @@ struct bch_devs_mask {
struct bch_devs_list {
u8 nr;
- u8 devs[BCH_REPLICAS_MAX + 1];
+ u8 devs[BCH_BKEY_PTRS_MAX];
};
struct bch_member_cpu {
diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c
index cc13fc258115..bc4c3a77ea62 100644
--- a/fs/bcachefs/sysfs.c
+++ b/fs/bcachefs/sysfs.c
@@ -199,9 +199,6 @@ read_attribute(new_stripes);
rw_attribute(pd_controllers_update_seconds);
-read_attribute(meta_replicas_have);
-read_attribute(data_replicas_have);
-
read_attribute(io_timers_read);
read_attribute(io_timers_write);
@@ -347,9 +344,6 @@ SHOW(bch2_fs)
sysfs_print(promote_whole_extents, c->promote_whole_extents);
- sysfs_printf(meta_replicas_have, "%i", bch2_replicas_online(c, true));
- sysfs_printf(data_replicas_have, "%i", bch2_replicas_online(c, false));
-
/* Debugging: */
if (attr == &sysfs_alloc_debug)
@@ -475,7 +469,7 @@ STORE(bch2_fs)
*/
#if 0
down_read(&c->state_lock);
- bch2_gc(c, NULL, false, false);
+ bch2_gc(c, false, false);
up_read(&c->state_lock);
#else
bch2_gc_gens(c);
@@ -520,9 +514,6 @@ struct attribute *bch2_fs_files[] = {
&sysfs_btree_node_size,
&sysfs_btree_cache_size,
- &sysfs_meta_replicas_have,
- &sysfs_data_replicas_have,
-
&sysfs_journal_write_delay_ms,
&sysfs_journal_reclaim_delay_ms,
@@ -705,7 +696,7 @@ static unsigned bucket_last_io_fn(struct bch_fs *c, struct bch_dev *ca,
{
int rw = (private ? 1 : 0);
- return bucket_last_io(c, bucket(ca, b), rw);
+ return atomic64_read(&c->io_clock[rw].now) - bucket(ca, b)->io_time[rw];
}
static unsigned bucket_sectors_used_fn(struct bch_fs *c, struct bch_dev *ca,
@@ -718,7 +709,7 @@ static unsigned bucket_sectors_used_fn(struct bch_fs *c, struct bch_dev *ca,
static unsigned bucket_oldest_gen_fn(struct bch_fs *c, struct bch_dev *ca,
size_t b, void *private)
{
- return bucket_gc_gen(ca, b);
+ return bucket_gc_gen(bucket(ca, b));
}
static int unsigned_cmp(const void *_l, const void *_r)
@@ -797,63 +788,42 @@ static void dev_alloc_debug_to_text(struct printbuf *out, struct bch_dev *ca)
nr[c->open_buckets[i].type]++;
pr_buf(out,
- "free_inc: %zu/%zu\n"
- "free[RESERVE_BTREE]: %zu/%zu\n"
- "free[RESERVE_MOVINGGC]: %zu/%zu\n"
- "free[RESERVE_NONE]: %zu/%zu\n"
- "buckets:\n"
- " capacity: %llu\n"
- " alloc: %llu\n"
- " sb: %llu\n"
- " journal: %llu\n"
- " meta: %llu\n"
- " user: %llu\n"
- " cached: %llu\n"
- " erasure coded: %llu\n"
- " available: %lli\n"
- "sectors:\n"
- " sb: %llu\n"
- " journal: %llu\n"
- " meta: %llu\n"
- " user: %llu\n"
- " cached: %llu\n"
- " erasure coded: %llu\n"
- " fragmented: %llu\n"
- " copygc threshold: %llu\n"
- "freelist_wait: %s\n"
- "open buckets: %u/%u (reserved %u)\n"
- "open_buckets_wait: %s\n"
- "open_buckets_btree: %u\n"
- "open_buckets_user: %u\n"
- "btree reserve cache: %u\n",
- fifo_used(&ca->free_inc), ca->free_inc.size,
- fifo_used(&ca->free[RESERVE_BTREE]), ca->free[RESERVE_BTREE].size,
- fifo_used(&ca->free[RESERVE_MOVINGGC]), ca->free[RESERVE_MOVINGGC].size,
- fifo_used(&ca->free[RESERVE_NONE]), ca->free[RESERVE_NONE].size,
- ca->mi.nbuckets - ca->mi.first_bucket,
- stats.buckets_alloc,
- stats.buckets[BCH_DATA_sb],
- stats.buckets[BCH_DATA_journal],
- stats.buckets[BCH_DATA_btree],
- stats.buckets[BCH_DATA_user],
- stats.buckets[BCH_DATA_cached],
- stats.buckets_ec,
- __dev_buckets_available(ca, stats),
- stats.sectors[BCH_DATA_sb],
- stats.sectors[BCH_DATA_journal],
- stats.sectors[BCH_DATA_btree],
- stats.sectors[BCH_DATA_user],
- stats.sectors[BCH_DATA_cached],
- stats.sectors_ec,
- stats.sectors_fragmented,
- c->copygc_threshold,
- c->freelist_wait.list.first ? "waiting" : "empty",
- c->open_buckets_nr_free, OPEN_BUCKETS_COUNT,
- BTREE_NODE_OPEN_BUCKET_RESERVE,
- c->open_buckets_wait.list.first ? "waiting" : "empty",
- nr[BCH_DATA_btree],
- nr[BCH_DATA_user],
- c->btree_reserve_cache_nr);
+ "\t\t buckets\t sectors fragmented\n"
+ "capacity%16llu\n",
+ ca->mi.nbuckets - ca->mi.first_bucket);
+
+ for (i = 1; i < BCH_DATA_NR; i++)
+ pr_buf(out, "%-8s%16llu%16llu%16llu\n",
+ bch2_data_types[i], stats.d[i].buckets,
+ stats.d[i].sectors, stats.d[i].fragmented);
+
+ pr_buf(out,
+ "ec\t%16llu\n"
+ "available%15llu\n"
+ "alloc\t%16llu\n"
+ "\n"
+ "free_inc\t\t%zu/%zu\n"
+ "free[RESERVE_MOVINGGC]\t%zu/%zu\n"
+ "free[RESERVE_NONE]\t%zu/%zu\n"
+ "freelist_wait\t\t%s\n"
+ "open buckets\t\t%u/%u (reserved %u)\n"
+ "open_buckets_wait\t%s\n"
+ "open_buckets_btree\t%u\n"
+ "open_buckets_user\t%u\n"
+ "btree reserve cache\t%u\n",
+ stats.buckets_ec,
+ __dev_buckets_available(ca, stats),
+ stats.buckets_alloc,
+ fifo_used(&ca->free_inc), ca->free_inc.size,
+ fifo_used(&ca->free[RESERVE_MOVINGGC]), ca->free[RESERVE_MOVINGGC].size,
+ fifo_used(&ca->free[RESERVE_NONE]), ca->free[RESERVE_NONE].size,
+ c->freelist_wait.list.first ? "waiting" : "empty",
+ c->open_buckets_nr_free, OPEN_BUCKETS_COUNT,
+ BTREE_NODE_OPEN_BUCKET_RESERVE,
+ c->open_buckets_wait.list.first ? "waiting" : "empty",
+ nr[BCH_DATA_btree],
+ nr[BCH_DATA_user],
+ c->btree_reserve_cache_nr);
}
static const char * const bch2_rw[] = {
diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h
index e8a7df61ff5c..c69b05deec41 100644
--- a/fs/bcachefs/util.h
+++ b/fs/bcachefs/util.h
@@ -88,7 +88,7 @@ static inline void *vpmalloc(size_t size, gfp_t gfp_mask)
{
return (void *) __get_free_pages(gfp_mask|__GFP_NOWARN,
get_order(size)) ?:
- __vmalloc(size, gfp_mask, PAGE_KERNEL);
+ __vmalloc(size, gfp_mask);
}
static inline void kvpfree(void *p, size_t size)
@@ -653,35 +653,6 @@ static inline void memset_u64s_tail(void *s, int c, unsigned bytes)
memset(s + bytes, c, rem);
}
-static inline struct bio_vec next_contig_bvec(struct bio *bio,
- struct bvec_iter *iter)
-{
- struct bio_vec bv = bio_iter_iovec(bio, *iter);
-
- bio_advance_iter(bio, iter, bv.bv_len);
-#ifndef CONFIG_HIGHMEM
- while (iter->bi_size) {
- struct bio_vec next = bio_iter_iovec(bio, *iter);
-
- if (page_address(bv.bv_page) + bv.bv_offset + bv.bv_len !=
- page_address(next.bv_page) + next.bv_offset)
- break;
-
- bv.bv_len += next.bv_len;
- bio_advance_iter(bio, iter, next.bv_len);
- }
-#endif
- return bv;
-}
-
-#define __bio_for_each_contig_segment(bv, bio, iter, start) \
- for (iter = (start); \
- (iter).bi_size && \
- ((bv = next_contig_bvec((bio), &(iter))), 1);)
-
-#define bio_for_each_contig_segment(bv, bio, iter) \
- __bio_for_each_contig_segment(bv, bio, iter, (bio)->bi_iter)
-
void sort_cmp_size(void *base, size_t num, size_t size,
int (*cmp_func)(const void *, const void *, size_t),
void (*swap_func)(void *, void *, size_t));
@@ -776,4 +747,9 @@ u64 *bch2_acc_percpu_u64s(u64 __percpu *, unsigned);
#define cmp_int(l, r) ((l > r) - (l < r))
+static inline int u8_cmp(u8 l, u8 r)
+{
+ return cmp_int(l, r);
+}
+
#endif /* _BCACHEFS_UTIL_H */
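
u8_cmp() is hoisted into util.h next to cmp_int(), which it wraps; bubble_sort() in bch2_replicas_entry_sort() keeps using it to order device indices. The (l > r) - (l < r) form yields -1/0/1 without the overflow pitfalls of returning l - r. A userspace demo of the same comparator pattern, here plugged into qsort():

#include <stdio.h>
#include <stdlib.h>

#define sketch_cmp_int(l, r)	(((l) > (r)) - ((l) < (r)))

static int u8_cmp_qsort(const void *_l, const void *_r)
{
	unsigned char l = *(const unsigned char *) _l;
	unsigned char r = *(const unsigned char *) _r;

	return sketch_cmp_int(l, r);
}

int main(void)
{
	unsigned char devs[] = { 3, 0, 2, 2, 1 };
	size_t i;

	qsort(devs, sizeof(devs), sizeof(devs[0]), u8_cmp_qsort);

	for (i = 0; i < sizeof(devs); i++)
		printf("%u ", (unsigned) devs[i]);
	printf("\n");
	return 0;
}
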
diff --git a/kernel/locking/six.c b/kernel/locking/six.c
index 3acee748e052..49d46ed2e18e 100644
--- a/kernel/locking/six.c
+++ b/kernel/locking/six.c
@@ -15,7 +15,7 @@
#endif
#define six_acquire(l, t) lock_acquire(l, 0, t, 0, 0, NULL, _RET_IP_)
-#define six_release(l) lock_release(l, 0, _RET_IP_)
+#define six_release(l) lock_release(l, _RET_IP_)
struct six_lock_vals {
/* Value we add to the lock in order to take the lock: */