author     Kent Overstreet <kent.overstreet@gmail.com>  2020-12-10 18:10:06 -0500
committer  Kent Overstreet <kent.overstreet@gmail.com>  2020-12-10 18:28:28 -0500
commit     73f27dc83d8bd75a25b5d60f8806b0c36d3d8ed2
tree       a33e6a2edc8f7fc4134b29f7446686b47c986cf1
parent     d61d8760d1142bdaf250e656892f341a1fb18dcd
Merge with 55e26c434e bcachefs: Always check if we need disk res in extent update path
-rw-r--r--  fs/bcachefs/Kconfig                   |    1
-rw-r--r--  fs/bcachefs/alloc_background.c        |    5
-rw-r--r--  fs/bcachefs/bcachefs.h                |   18
-rw-r--r--  fs/bcachefs/bcachefs_format.h         |    7
-rw-r--r--  fs/bcachefs/bcachefs_ioctl.h          |   14
-rw-r--r--  fs/bcachefs/bkey_methods.c            |   18
-rw-r--r--  fs/bcachefs/bset.c                    |   94
-rw-r--r--  fs/bcachefs/btree_cache.c             |   17
-rw-r--r--  fs/bcachefs/btree_cache.h             |    1
-rw-r--r--  fs/bcachefs/btree_gc.c                |    7
-rw-r--r--  fs/bcachefs/btree_io.c                |   58
-rw-r--r--  fs/bcachefs/btree_io.h                |   19
-rw-r--r--  fs/bcachefs/btree_iter.c              |  190
-rw-r--r--  fs/bcachefs/btree_iter.h              |   36
-rw-r--r--  fs/bcachefs/btree_key_cache.c         |  251
-rw-r--r--  fs/bcachefs/btree_key_cache.h         |   21
-rw-r--r--  fs/bcachefs/btree_types.h             |   28
-rw-r--r--  fs/bcachefs/btree_update.h            |    4
-rw-r--r--  fs/bcachefs/btree_update_interior.c   |  130
-rw-r--r--  fs/bcachefs/btree_update_interior.h   |    4
-rw-r--r--  fs/bcachefs/btree_update_leaf.c       |  119
-rw-r--r--  fs/bcachefs/buckets.c                 |  303
-rw-r--r--  fs/bcachefs/buckets.h                 |    2
-rw-r--r--  fs/bcachefs/chardev.c                 |   26
-rw-r--r--  fs/bcachefs/checksum.c                |   31
-rw-r--r--  fs/bcachefs/checksum.h                |    6
-rw-r--r--  fs/bcachefs/compress.c                |    2
-rw-r--r--  fs/bcachefs/ec.c                      |   32
-rw-r--r--  fs/bcachefs/error.h                   |   29
-rw-r--r--  fs/bcachefs/fs-io.c                   |  488
-rw-r--r--  fs/bcachefs/fs-io.h                   |    7
-rw-r--r--  fs/bcachefs/fs.c                      |   32
-rw-r--r--  fs/bcachefs/fs.h                      |    8
-rw-r--r--  fs/bcachefs/fsck.c                    |   16
-rw-r--r--  fs/bcachefs/inode.c                   |   49
-rw-r--r--  fs/bcachefs/inode.h                   |    2
-rw-r--r--  fs/bcachefs/io.c                      |  123
-rw-r--r--  fs/bcachefs/journal.c                 |  519
-rw-r--r--  fs/bcachefs/journal.h                 |   61
-rw-r--r--  fs/bcachefs/journal_io.c              |  366
-rw-r--r--  fs/bcachefs/journal_io.h              |    3
-rw-r--r--  fs/bcachefs/journal_reclaim.c         |  418
-rw-r--r--  fs/bcachefs/journal_reclaim.h         |   50
-rw-r--r--  fs/bcachefs/journal_seq_blacklist.c   |    5
-rw-r--r--  fs/bcachefs/journal_types.h           |   62
-rw-r--r--  fs/bcachefs/move.c                    |    4
-rw-r--r--  fs/bcachefs/movinggc.c                |    4
-rw-r--r--  fs/bcachefs/rebalance.c               |    2
-rw-r--r--  fs/bcachefs/recovery.c                |  188
-rw-r--r--  fs/bcachefs/replicas.c                |   50
-rw-r--r--  fs/bcachefs/str_hash.h                |    7
-rw-r--r--  fs/bcachefs/super-io.c                |   11
-rw-r--r--  fs/bcachefs/super.c                   |   62
-rw-r--r--  fs/bcachefs/super.h                   |    1
-rw-r--r--  fs/bcachefs/sysfs.c                   |   16
-rw-r--r--  fs/bcachefs/tests.c                   |  256
-rw-r--r--  fs/bcachefs/tests.h                   |    2
-rw-r--r--  fs/bcachefs/util.h                    |   31
-rw-r--r--  include/trace/events/bcachefs.h       |   79
59 files changed, 2458 insertions, 1937 deletions
diff --git a/fs/bcachefs/Kconfig b/fs/bcachefs/Kconfig
index 5594af719b2a..57c5d58c2d87 100644
--- a/fs/bcachefs/Kconfig
+++ b/fs/bcachefs/Kconfig
@@ -20,6 +20,7 @@ config BCACHEFS_FS
select SIXLOCKS
select RAID6_PQ
select XOR_BLOCKS
+ select SRCU
help
The bcachefs filesystem - a modern, copy on write filesystem, with
support for multiple devices, compression, checksumming, etc.
diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c
index 97508de9f721..09a7f8c8583a 100644
--- a/fs/bcachefs/alloc_background.c
+++ b/fs/bcachefs/alloc_background.c
@@ -505,9 +505,6 @@ int bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev,
BTREE_ITER_CACHED|
BTREE_ITER_CACHED_NOFILL|
BTREE_ITER_INTENT);
- if (IS_ERR(iter))
- return PTR_ERR(iter);
-
a = bch2_trans_kmalloc(trans, BKEY_ALLOC_U64s_MAX * 8);
ret = PTR_ERR_OR_ZERO(a);
if (ret)
@@ -1456,7 +1453,7 @@ int bch2_dev_allocator_start(struct bch_dev *ca)
return 0;
p = kthread_create(bch2_allocator_thread, ca,
- "bch_alloc[%s]", ca->name);
+ "bch-alloc/%s", ca->name);
if (IS_ERR(p))
return PTR_ERR(p);
diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h
index 9749cde23cf3..eb5b40804773 100644
--- a/fs/bcachefs/bcachefs.h
+++ b/fs/bcachefs/bcachefs.h
@@ -193,6 +193,7 @@
#include <linux/semaphore.h>
#include <linux/seqlock.h>
#include <linux/shrinker.h>
+#include <linux/srcu.h>
#include <linux/types.h>
#include <linux/workqueue.h>
#include <linux/zstd.h>
@@ -213,9 +214,11 @@
dynamic_fault("bcachefs:meta:write:" name)
#ifdef __KERNEL__
-#define bch2_fmt(_c, fmt) "bcachefs (%s): " fmt "\n", ((_c)->name)
+#define bch2_fmt(_c, fmt) "bcachefs (%s): " fmt "\n", ((_c)->name)
+#define bch2_fmt_inum(_c, _inum, fmt) "bcachefs (%s inum %llu): " fmt "\n", ((_c)->name), (_inum)
#else
-#define bch2_fmt(_c, fmt) fmt "\n"
+#define bch2_fmt(_c, fmt) fmt "\n"
+#define bch2_fmt_inum(_c, _inum, fmt) "inum %llu: " fmt "\n", (_inum)
#endif
#define bch_info(c, fmt, ...) \
@@ -228,8 +231,11 @@
printk_ratelimited(KERN_WARNING bch2_fmt(c, fmt), ##__VA_ARGS__)
#define bch_err(c, fmt, ...) \
printk(KERN_ERR bch2_fmt(c, fmt), ##__VA_ARGS__)
+
#define bch_err_ratelimited(c, fmt, ...) \
printk_ratelimited(KERN_ERR bch2_fmt(c, fmt), ##__VA_ARGS__)
+#define bch_err_inum_ratelimited(c, _inum, fmt, ...) \
+ printk_ratelimited(KERN_ERR bch2_fmt_inum(c, _inum, fmt), ##__VA_ARGS__)
#define bch_verbose(c, fmt, ...) \
do { \
@@ -642,12 +648,13 @@ struct bch_fs {
mempool_t btree_iters_pool;
struct btree_iter_buf __percpu *btree_iters_bufs;
+ struct srcu_struct btree_trans_barrier;
+
struct btree_key_cache btree_key_cache;
struct workqueue_struct *wq;
/* copygc needs its own workqueue for index updates.. */
struct workqueue_struct *copygc_wq;
- struct workqueue_struct *journal_reclaim_wq;
/* ALLOCATION */
struct delayed_work pd_controllers_update;
@@ -666,6 +673,7 @@ struct bch_fs {
unsigned bucket_size_max;
atomic64_t sectors_available;
+ struct mutex sectors_available_lock;
struct bch_fs_pcpu __percpu *pcpu;
@@ -673,7 +681,7 @@ struct bch_fs {
seqcount_t usage_lock;
struct bch_fs_usage *usage_base;
- struct bch_fs_usage __percpu *usage[2];
+ struct bch_fs_usage __percpu *usage[JOURNAL_BUF_NR];
struct bch_fs_usage __percpu *usage_gc;
/* single element mempool: */
@@ -751,7 +759,7 @@ struct bch_fs {
ZSTD_parameters zstd_params;
struct crypto_shash *sha256;
- struct crypto_skcipher *chacha20;
+ struct crypto_sync_skcipher *chacha20;
struct crypto_shash *poly1305;
atomic64_t key_version;
diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h
index 94b5418587e3..02a76c3d3acb 100644
--- a/fs/bcachefs/bcachefs_format.h
+++ b/fs/bcachefs/bcachefs_format.h
@@ -1332,14 +1332,16 @@ LE64_BITMASK(BCH_SB_ERASURE_CODE, struct bch_sb, flags[3], 0, 16);
x(extents_above_btree_updates, 12) \
x(btree_updates_journalled, 13) \
x(reflink_inline_data, 14) \
- x(new_varint, 15)
+ x(new_varint, 15) \
+ x(journal_no_flush, 16)
#define BCH_SB_FEATURES_ALL \
((1ULL << BCH_FEATURE_new_siphash)| \
(1ULL << BCH_FEATURE_new_extent_overwrite)| \
(1ULL << BCH_FEATURE_btree_ptr_v2)| \
(1ULL << BCH_FEATURE_extents_above_btree_updates)|\
- (1ULL << BCH_FEATURE_new_varint))\
+ (1ULL << BCH_FEATURE_new_varint)| \
+ (1ULL << BCH_FEATURE_journal_no_flush))
enum bch_sb_feature {
#define x(f, n) BCH_FEATURE_##f,
@@ -1575,6 +1577,7 @@ struct jset {
LE32_BITMASK(JSET_CSUM_TYPE, struct jset, flags, 0, 4);
LE32_BITMASK(JSET_BIG_ENDIAN, struct jset, flags, 4, 5);
+LE32_BITMASK(JSET_NO_FLUSH, struct jset, flags, 5, 6);
#define BCH_JOURNAL_BUCKETS_MIN 8
diff --git a/fs/bcachefs/bcachefs_ioctl.h b/fs/bcachefs/bcachefs_ioctl.h
index d71157a3e073..0e626b098d91 100644
--- a/fs/bcachefs/bcachefs_ioctl.h
+++ b/fs/bcachefs/bcachefs_ioctl.h
@@ -73,6 +73,7 @@ struct bch_ioctl_incremental {
#define BCH_IOCTL_READ_SUPER _IOW(0xbc, 12, struct bch_ioctl_read_super)
#define BCH_IOCTL_DISK_GET_IDX _IOW(0xbc, 13, struct bch_ioctl_disk_get_idx)
#define BCH_IOCTL_DISK_RESIZE _IOW(0xbc, 14, struct bch_ioctl_disk_resize)
+#define BCH_IOCTL_DISK_RESIZE_JOURNAL _IOW(0xbc,15, struct bch_ioctl_disk_resize_journal)
/* ioctl below act on a particular file, not the filesystem as a whole: */
@@ -329,4 +330,17 @@ struct bch_ioctl_disk_resize {
__u64 nbuckets;
};
+/*
+ * BCH_IOCTL_DISK_RESIZE_JOURNAL: resize journal on a device
+ *
+ * @dev - member to resize
+ * @nbuckets - new number of buckets
+ */
+struct bch_ioctl_disk_resize_journal {
+ __u32 flags;
+ __u32 pad;
+ __u64 dev;
+ __u64 nbuckets;
+};
+
#endif /* _BCACHEFS_IOCTL_H */
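A minimal userspace sketch of driving the new resize-journal ioctl (not part of this patch; the mount-point path, device index, and bucket count are illustrative assumptions, only the request number and struct layout come from the header above):

#include <stdio.h>
#include <stdint.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/ioctl.h>

/* Mirrors the struct added above; normally you would include bcachefs_ioctl.h. */
struct bch_ioctl_disk_resize_journal {
	uint32_t flags;
	uint32_t pad;
	uint64_t dev;
	uint64_t nbuckets;
};

#define BCH_IOCTL_DISK_RESIZE_JOURNAL \
	_IOW(0xbc, 15, struct bch_ioctl_disk_resize_journal)

int main(void)
{
	/* Assumed: a file descriptor on the mounted filesystem. */
	int fd = open("/mnt/bcachefs", O_RDONLY);
	struct bch_ioctl_disk_resize_journal arg = {
		.dev		= 0,	/* member device index (assumed) */
		.nbuckets	= 1024,	/* new journal size, in buckets */
	};

	if (fd < 0 || ioctl(fd, BCH_IOCTL_DISK_RESIZE_JOURNAL, &arg) < 0)
		perror("BCH_IOCTL_DISK_RESIZE_JOURNAL");

	if (fd >= 0)
		close(fd);
	return 0;
}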
diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c
index 99b7fce2bfd3..f5779795a4b2 100644
--- a/fs/bcachefs/bkey_methods.c
+++ b/fs/bcachefs/bkey_methods.c
@@ -181,8 +181,12 @@ void bch2_bpos_to_text(struct printbuf *out, struct bpos pos)
void bch2_bkey_to_text(struct printbuf *out, const struct bkey *k)
{
if (k) {
- pr_buf(out, "u64s %u type %s ", k->u64s,
- bch2_bkey_types[k->type]);
+ pr_buf(out, "u64s %u type ", k->u64s);
+
+ if (k->type < KEY_TYPE_MAX)
+ pr_buf(out, "%s ", bch2_bkey_types[k->type]);
+ else
+ pr_buf(out, "%u ", k->type);
bch2_bpos_to_text(out, k->p);
@@ -196,10 +200,14 @@ void bch2_bkey_to_text(struct printbuf *out, const struct bkey *k)
void bch2_val_to_text(struct printbuf *out, struct bch_fs *c,
struct bkey_s_c k)
{
- const struct bkey_ops *ops = &bch2_bkey_ops[k.k->type];
+ if (k.k->type < KEY_TYPE_MAX) {
+ const struct bkey_ops *ops = &bch2_bkey_ops[k.k->type];
- if (likely(ops->val_to_text))
- ops->val_to_text(out, c, k);
+ if (likely(ops->val_to_text))
+ ops->val_to_text(out, c, k);
+ } else {
+ pr_buf(out, "(invalid type %u)", k.k->type);
+ }
}
void bch2_bkey_val_to_text(struct printbuf *out, struct bch_fs *c,
diff --git a/fs/bcachefs/bset.c b/fs/bcachefs/bset.c
index 26716657453f..1c7318c6e46f 100644
--- a/fs/bcachefs/bset.c
+++ b/fs/bcachefs/bset.c
@@ -604,53 +604,23 @@ static inline unsigned bkey_mantissa(const struct bkey_packed *k,
return (u16) v;
}
-static void make_bfloat(struct btree *b, struct bset_tree *t,
- unsigned j,
- struct bkey_packed *min_key,
- struct bkey_packed *max_key)
+__always_inline
+static inline void __make_bfloat(struct btree *b, struct bset_tree *t,
+ unsigned j,
+ struct bkey_packed *min_key,
+ struct bkey_packed *max_key)
{
struct bkey_float *f = bkey_float(b, t, j);
struct bkey_packed *m = tree_to_bkey(b, t, j);
- struct bkey_packed *l, *r;
+ struct bkey_packed *l = is_power_of_2(j)
+ ? min_key
+ : tree_to_prev_bkey(b, t, j >> ffs(j));
+ struct bkey_packed *r = is_power_of_2(j + 1)
+ ? max_key
+ : tree_to_bkey(b, t, j >> (ffz(j) + 1));
unsigned mantissa;
int shift, exponent, high_bit;
- if (is_power_of_2(j)) {
- l = min_key;
-
- if (!l->u64s) {
- if (!bkey_pack_pos(l, b->data->min_key, b)) {
- struct bkey_i tmp;
-
- bkey_init(&tmp.k);
- tmp.k.p = b->data->min_key;
- bkey_copy(l, &tmp);
- }
- }
- } else {
- l = tree_to_prev_bkey(b, t, j >> ffs(j));
-
- EBUG_ON(m < l);
- }
-
- if (is_power_of_2(j + 1)) {
- r = max_key;
-
- if (!r->u64s) {
- if (!bkey_pack_pos(r, t->max_key, b)) {
- struct bkey_i tmp;
-
- bkey_init(&tmp.k);
- tmp.k.p = t->max_key;
- bkey_copy(r, &tmp);
- }
- }
- } else {
- r = tree_to_bkey(b, t, j >> (ffz(j) + 1));
-
- EBUG_ON(m > r);
- }
-
/*
* for failed bfloats, the lookup code falls back to comparing against
* the original key.
@@ -707,6 +677,30 @@ static void make_bfloat(struct btree *b, struct bset_tree *t,
f->mantissa = mantissa;
}
+static void make_bfloat(struct btree *b, struct bset_tree *t,
+ unsigned j,
+ struct bkey_packed *min_key,
+ struct bkey_packed *max_key)
+{
+ struct bkey_i *k;
+
+ if (is_power_of_2(j) &&
+ !min_key->u64s) {
+ k = (void *) min_key;
+ bkey_init(&k->k);
+ k->k.p = b->data->min_key;
+ }
+
+ if (is_power_of_2(j + 1) &&
+ !max_key->u64s) {
+ k = (void *) max_key;
+ bkey_init(&k->k);
+ k->k.p = t->max_key;
+ }
+
+ __make_bfloat(b, t, j, min_key, max_key);
+}
+
/* bytes remaining - only valid for last bset: */
static unsigned __bset_tree_capacity(const struct btree *b, const struct bset_tree *t)
{
@@ -726,7 +720,7 @@ static unsigned bset_rw_tree_capacity(const struct btree *b, const struct bset_t
return __bset_tree_capacity(b, t) / sizeof(struct rw_aux_tree);
}
-static void __build_rw_aux_tree(struct btree *b, struct bset_tree *t)
+static noinline void __build_rw_aux_tree(struct btree *b, struct bset_tree *t)
{
struct bkey_packed *k;
@@ -745,15 +739,12 @@ static void __build_rw_aux_tree(struct btree *b, struct bset_tree *t)
}
}
-static void __build_ro_aux_tree(struct btree *b, struct bset_tree *t)
+static noinline void __build_ro_aux_tree(struct btree *b, struct bset_tree *t)
{
struct bkey_packed *prev = NULL, *k = btree_bkey_first(b, t);
- struct bkey_packed min_key, max_key;
+ struct bkey_i min_key, max_key;
unsigned j, cacheline = 1;
- /* signal to make_bfloat() that they're uninitialized: */
- min_key.u64s = max_key.u64s = 0;
-
t->size = min(bkey_to_cacheline(b, t, btree_bkey_last(b, t)),
bset_ro_tree_capacity(b, t));
retry:
@@ -789,9 +780,16 @@ retry:
t->max_key = bkey_unpack_pos(b, prev);
+ bkey_init(&min_key.k);
+ min_key.k.p = b->data->min_key;
+ bkey_init(&max_key.k);
+ max_key.k.p = t->max_key;
+
/* Then we build the tree */
eytzinger1_for_each(j, t->size)
- make_bfloat(b, t, j, &min_key, &max_key);
+ __make_bfloat(b, t, j,
+ bkey_to_packed(&min_key),
+ bkey_to_packed(&max_key));
}
static void bset_alloc_tree(struct btree *b, struct bset_tree *t)
diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c
index 50ea92feae0f..09774f56f11c 100644
--- a/fs/bcachefs/btree_cache.c
+++ b/fs/bcachefs/btree_cache.c
@@ -81,8 +81,7 @@ static int btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp)
if (!b->data)
return -ENOMEM;
- b->aux_data = __vmalloc(btree_aux_data_bytes(b), gfp,
- PAGE_KERNEL_EXEC);
+ b->aux_data = vmalloc_exec(btree_aux_data_bytes(b), gfp);
if (!b->aux_data) {
kvpfree(b->data, btree_bytes(c));
b->data = NULL;
@@ -329,9 +328,9 @@ restart:
clear_btree_node_accessed(b);
}
- memalloc_nofs_restore(flags);
mutex_unlock(&bc->lock);
out:
+ memalloc_nofs_restore(flags);
return (unsigned long) freed * btree_pages(c);
}
@@ -382,11 +381,13 @@ void bch2_fs_btree_cache_exit(struct bch_fs *c)
if (btree_node_dirty(b))
bch2_btree_complete_write(c, b, btree_current_write(b));
- clear_btree_node_dirty(b);
+ clear_btree_node_dirty(c, b);
btree_node_data_free(c, b);
}
+ BUG_ON(atomic_read(&c->btree_cache.dirty));
+
while (!list_empty(&bc->freed)) {
b = list_first_entry(&bc->freed, struct btree, list);
list_del(&b->list);
@@ -446,7 +447,7 @@ int bch2_fs_btree_cache_init(struct bch_fs *c)
bc->shrink.scan_objects = bch2_btree_cache_scan;
bc->shrink.seeks = 4;
bc->shrink.batch = btree_pages(c) * 2;
- register_shrinker(&bc->shrink);
+ ret = register_shrinker(&bc->shrink);
out:
pr_verbose_init(c->opts, "ret %i", ret);
return ret;
@@ -1063,3 +1064,9 @@ void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c,
stats.floats,
stats.failed);
}
+
+void bch2_btree_cache_to_text(struct printbuf *out, struct bch_fs *c)
+{
+ pr_buf(out, "nr nodes:\t%u\n", c->btree_cache.used);
+ pr_buf(out, "nr dirty:\t%u\n", atomic_read(&c->btree_cache.dirty));
+}
diff --git a/fs/bcachefs/btree_cache.h b/fs/bcachefs/btree_cache.h
index 8a19e60e9258..e766ef552ce7 100644
--- a/fs/bcachefs/btree_cache.h
+++ b/fs/bcachefs/btree_cache.h
@@ -100,5 +100,6 @@ static inline unsigned btree_blocks(struct bch_fs *c)
void bch2_btree_node_to_text(struct printbuf *, struct bch_fs *,
struct btree *);
+void bch2_btree_cache_to_text(struct printbuf *, struct bch_fs *);
#endif /* _BCACHEFS_BTREE_CACHE_H */
diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c
index ba4acc112ed3..6268ea637d19 100644
--- a/fs/bcachefs/btree_gc.c
+++ b/fs/bcachefs/btree_gc.c
@@ -603,7 +603,6 @@ static int bch2_gc_done(struct bch_fs *c,
struct genradix_iter dst_iter = genradix_iter_init(&c->stripes[0], 0);
struct genradix_iter src_iter = genradix_iter_init(&c->stripes[1], 0);
struct stripe *dst, *src;
- unsigned i;
c->ec_stripes_heap.used = 0;
@@ -651,8 +650,8 @@ static int bch2_gc_done(struct bch_fs *c,
}
};
- bch2_fs_usage_acc_to_base(c, 0);
- bch2_fs_usage_acc_to_base(c, 1);
+ for (i = 0; i < ARRAY_SIZE(c->usage); i++)
+ bch2_fs_usage_acc_to_base(c, i);
bch2_dev_usage_from_buckets(c);
@@ -1427,7 +1426,7 @@ int bch2_gc_thread_start(struct bch_fs *c)
BUG_ON(c->gc_thread);
- p = kthread_create(bch2_gc_thread, c, "bch_gc");
+ p = kthread_create(bch2_gc_thread, c, "bch-gc/%s", c->name);
if (IS_ERR(p))
return PTR_ERR(p);
diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c
index 10a00085cdd6..87f97ccb3f1f 100644
--- a/fs/bcachefs/btree_io.c
+++ b/fs/bcachefs/btree_io.c
@@ -597,18 +597,25 @@ void bch2_btree_init_next(struct bch_fs *c, struct btree *b,
bch2_btree_iter_reinit_node(iter, b);
}
+static void btree_pos_to_text(struct printbuf *out, struct bch_fs *c,
+ struct btree *b)
+{
+ pr_buf(out, "%s level %u/%u\n ",
+ bch2_btree_ids[b->c.btree_id],
+ b->c.level,
+ c->btree_roots[b->c.btree_id].level);
+ bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(&b->key));
+}
+
static void btree_err_msg(struct printbuf *out, struct bch_fs *c,
struct btree *b, struct bset *i,
unsigned offset, int write)
{
- pr_buf(out, "error validating btree node %sat btree %u level %u/%u\n"
- "pos ",
- write ? "before write " : "",
- b->c.btree_id, b->c.level,
- c->btree_roots[b->c.btree_id].level);
- bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(&b->key));
+ pr_buf(out, "error validating btree node %sat btree ",
+ write ? "before write " : "");
+ btree_pos_to_text(out, c, b);
- pr_buf(out, " node offset %u", b->written);
+ pr_buf(out, "\n node offset %u", b->written);
if (i)
pr_buf(out, " bset u64s %u", le16_to_cpu(i->u64s));
}
@@ -628,21 +635,26 @@ enum btree_validate_ret {
({ \
__label__ out; \
char _buf[300]; \
+ char *buf = _buf; \
struct printbuf out = PBUF(_buf); \
\
+ buf = kmalloc(4096, GFP_ATOMIC); \
+ if (buf) \
+ out = _PBUF(buf, 4986); \
+ \
btree_err_msg(&out, c, b, i, b->written, write); \
pr_buf(&out, ": " msg, ##__VA_ARGS__); \
\
if (type == BTREE_ERR_FIXABLE && \
write == READ && \
!test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags)) { \
- mustfix_fsck_err(c, "%s", _buf); \
+ mustfix_fsck_err(c, "%s", buf); \
goto out; \
} \
\
switch (write) { \
case READ: \
- bch_err(c, "%s", _buf); \
+ bch_err(c, "%s", buf); \
\
switch (type) { \
case BTREE_ERR_FIXABLE: \
@@ -663,7 +675,7 @@ enum btree_validate_ret {
} \
break; \
case WRITE: \
- bch_err(c, "corrupt metadata before write: %s", _buf); \
+ bch_err(c, "corrupt metadata before write: %s", buf); \
\
if (bch2_fs_inconsistent(c)) { \
ret = BCH_FSCK_ERRORS_NOT_FIXED; \
@@ -672,6 +684,8 @@ enum btree_validate_ret {
break; \
} \
out: \
+ if (buf != _buf) \
+ kfree(buf); \
true; \
})
@@ -1104,6 +1118,8 @@ static void btree_node_read_work(struct work_struct *work)
struct btree *b = rb->bio.bi_private;
struct bio *bio = &rb->bio;
struct bch_io_failures failed = { .nr = 0 };
+ char buf[200];
+ struct printbuf out;
bool can_retry;
goto start;
@@ -1123,8 +1139,10 @@ static void btree_node_read_work(struct work_struct *work)
bio->bi_status = BLK_STS_REMOVED;
}
start:
- bch2_dev_io_err_on(bio->bi_status, ca, "btree read: %s",
- bch2_blk_status_to_str(bio->bi_status));
+ out = PBUF(buf);
+ btree_pos_to_text(&out, c, b);
+ bch2_dev_io_err_on(bio->bi_status, ca, "btree read error %s for %s",
+ bch2_blk_status_to_str(bio->bi_status), buf);
if (rb->have_ioref)
percpu_ref_put(&ca->io_ref);
rb->have_ioref = false;
@@ -1408,7 +1426,7 @@ static void btree_node_write_endio(struct bio *bio)
if (wbio->have_ioref)
bch2_latency_acct(ca, wbio->submit_time, WRITE);
- if (bch2_dev_io_err_on(bio->bi_status, ca, "btree write: %s",
+ if (bch2_dev_io_err_on(bio->bi_status, ca, "btree write error: %s",
bch2_blk_status_to_str(bio->bi_status)) ||
bch2_meta_write_fault("btree")) {
spin_lock_irqsave(&c->btree_write_error_lock, flags);
@@ -1442,8 +1460,10 @@ static int validate_bset_for_write(struct bch_fs *c, struct btree *b,
ret = validate_bset(c, b, i, sectors, WRITE, false) ?:
validate_bset_keys(c, b, i, &whiteout_u64s, WRITE, false);
- if (ret)
+ if (ret) {
bch2_inconsistent_error(c);
+ dump_stack();
+ }
return ret;
}
@@ -1486,6 +1506,9 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b,
if (!btree_node_may_write(b))
return;
+ if (old & (1 << BTREE_NODE_never_write))
+ return;
+
if (old & (1 << BTREE_NODE_write_in_flight)) {
btree_node_wait_on_io(b);
continue;
@@ -1498,6 +1521,8 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b,
new ^= (1 << BTREE_NODE_write_idx);
} while (cmpxchg_acquire(&b->flags, old, new) != old);
+ atomic_dec(&c->btree_cache.dirty);
+
BUG_ON(btree_node_fake(b));
BUG_ON((b->will_make_reachable != 0) != !b->written);
@@ -1530,6 +1555,11 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b,
seq = max(seq, le64_to_cpu(i->journal_seq));
}
+ BUG_ON(b->written && !seq);
+
+ /* bch2_varint_decode may read up to 7 bytes past the end of the buffer: */
+ bytes += 8;
+
data = btree_bounce_alloc(c, bytes, &used_mempool);
if (!b->written) {
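The btree_err() change above switches to a larger heap buffer when one can be allocated atomically, keeps the small on-stack buffer as a fallback, and frees the pointer only when it is the heap copy. A standalone userspace sketch of the same pattern (hypothetical function, not from the patch):

#include <stdio.h>
#include <stdlib.h>

static void log_error(const char *msg)
{
	char _buf[300];
	char *buf = _buf;
	size_t size = sizeof(_buf);

	char *heap = malloc(4096);	/* kmalloc(4096, GFP_ATOMIC) in the patch */
	if (heap) {
		buf = heap;
		size = 4096;
	}

	snprintf(buf, size, "error validating btree node: %s", msg);
	fprintf(stderr, "%s\n", buf);

	if (buf != _buf)		/* free only the heap copy */
		free(buf);
}

int main(void)
{
	log_error("example message");
	return 0;
}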
diff --git a/fs/bcachefs/btree_io.h b/fs/bcachefs/btree_io.h
index b859a067c78b..1a4b11e99cc4 100644
--- a/fs/bcachefs/btree_io.h
+++ b/fs/bcachefs/btree_io.h
@@ -14,6 +14,23 @@ struct btree_write;
struct btree;
struct btree_iter;
+static inline bool btree_node_dirty(struct btree *b)
+{
+ return test_bit(BTREE_NODE_dirty, &b->flags);
+}
+
+static inline void set_btree_node_dirty(struct bch_fs *c, struct btree *b)
+{
+ if (!test_and_set_bit(BTREE_NODE_dirty, &b->flags))
+ atomic_inc(&c->btree_cache.dirty);
+}
+
+static inline void clear_btree_node_dirty(struct bch_fs *c, struct btree *b)
+{
+ if (test_and_clear_bit(BTREE_NODE_dirty, &b->flags))
+ atomic_dec(&c->btree_cache.dirty);
+}
+
struct btree_read_bio {
struct bch_fs *c;
u64 start_time;
@@ -104,7 +121,7 @@ static inline void bset_encrypt(struct bch_fs *c, struct bset *i, unsigned offse
bch2_encrypt(c, BSET_CSUM_TYPE(i), nonce, &bn->flags,
bytes);
- nonce = nonce_add(nonce, round_up(bytes, CHACHA20_BLOCK_SIZE));
+ nonce = nonce_add(nonce, round_up(bytes, CHACHA_BLOCK_SIZE));
}
bch2_encrypt(c, BSET_CSUM_TYPE(i), nonce, i->_data,
diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c
index 58f1a3dd97d3..21253be5aab6 100644
--- a/fs/bcachefs/btree_iter.c
+++ b/fs/bcachefs/btree_iter.c
@@ -346,7 +346,7 @@ void bch2_btree_trans_verify_locks(struct btree_trans *trans)
{
struct btree_iter *iter;
- trans_for_each_iter_all(trans, iter)
+ trans_for_each_iter(trans, iter)
bch2_btree_iter_verify_locks(iter);
}
#else
@@ -875,9 +875,19 @@ static void btree_iter_verify_new_node(struct btree_iter *iter, struct btree *b)
char buf[100];
struct bkey uk = bkey_unpack_key(b, k);
+ bch2_dump_btree_node(iter->trans->c, l->b);
bch2_bkey_to_text(&PBUF(buf), &uk);
- panic("parent iter doesn't point to new node:\n%s\n%llu:%llu\n",
- buf, b->key.k.p.inode, b->key.k.p.offset);
+ panic("parent iter doesn't point to new node:\n"
+ "iter pos %s %llu:%llu\n"
+ "iter key %s\n"
+ "new node %llu:%llu-%llu:%llu\n",
+ bch2_btree_ids[iter->btree_id],
+ iter->pos.inode,
+ iter->pos.offset,
+ buf,
+ b->data->min_key.inode,
+ b->data->min_key.offset,
+ b->key.k.p.inode, b->key.k.p.offset);
}
if (!parent_locked)
@@ -2002,110 +2012,46 @@ int bch2_trans_iter_free(struct btree_trans *trans,
return bch2_trans_iter_put(trans, iter);
}
-#if 0
-static int bch2_trans_realloc_iters(struct btree_trans *trans,
- unsigned new_size)
+noinline __cold
+static void btree_trans_iter_alloc_fail(struct btree_trans *trans)
{
- void *p, *new_iters, *new_updates, *new_updates2;
- size_t iters_bytes;
- size_t updates_bytes;
-
- new_size = roundup_pow_of_two(new_size);
-
- BUG_ON(new_size > BTREE_ITER_MAX);
- if (new_size <= trans->size)
- return 0;
-
- BUG_ON(trans->used_mempool);
-
- bch2_trans_unlock(trans);
-
- iters_bytes = sizeof(struct btree_iter) * new_size;
- updates_bytes = sizeof(struct btree_insert_entry) * new_size;
-
- p = kmalloc(iters_bytes +
- updates_bytes +
- updates_bytes, GFP_NOFS);
- if (p)
- goto success;
-
- p = mempool_alloc(&trans->c->btree_iters_pool, GFP_NOFS);
- new_size = BTREE_ITER_MAX;
+ struct btree_iter *iter;
+ struct btree_insert_entry *i;
- trans->used_mempool = true;
-success:
- new_iters = p; p += iters_bytes;
- new_updates = p; p += updates_bytes;
- new_updates2 = p; p += updates_bytes;
-
- memcpy(new_iters, trans->iters,
- sizeof(struct btree_iter) * trans->nr_iters);
- memcpy(new_updates, trans->updates,
- sizeof(struct btree_insert_entry) * trans->nr_updates);
- memcpy(new_updates2, trans->updates2,
- sizeof(struct btree_insert_entry) * trans->nr_updates2);
-
- if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG))
- memset(trans->iters, POISON_FREE,
- sizeof(struct btree_iter) * trans->nr_iters +
- sizeof(struct btree_insert_entry) * trans->nr_iters);
-
- kfree(trans->iters);
-
- trans->iters = new_iters;
- trans->updates = new_updates;
- trans->updates2 = new_updates2;
- trans->size = new_size;
-
- if (trans->iters_live) {
- trace_trans_restart_iters_realloced(trans->ip, trans->size);
- return -EINTR;
+ trans_for_each_iter(trans, iter)
+ printk(KERN_ERR "iter: btree %s pos %llu:%llu%s%s%s %ps\n",
+ bch2_btree_ids[iter->btree_id],
+ iter->pos.inode,
+ iter->pos.offset,
+ (trans->iters_live & (1ULL << iter->idx)) ? " live" : "",
+ (trans->iters_touched & (1ULL << iter->idx)) ? " touched" : "",
+ iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT ? " keep" : "",
+ (void *) iter->ip_allocated);
+
+ trans_for_each_update(trans, i) {
+ char buf[300];
+
+ bch2_bkey_val_to_text(&PBUF(buf), trans->c, bkey_i_to_s_c(i->k));
+ printk(KERN_ERR "update: btree %s %s\n",
+ bch2_btree_ids[i->iter->btree_id], buf);
}
-
- return 0;
+ panic("trans iter oveflow\n");
}
-#endif
static struct btree_iter *btree_trans_iter_alloc(struct btree_trans *trans)
{
- unsigned idx = __ffs64(~trans->iters_linked);
-
- if (idx < trans->nr_iters)
- goto got_slot;
+ unsigned idx;
- if (trans->nr_iters == trans->size) {
- struct btree_iter *iter;
+ if (unlikely(trans->iters_linked ==
+ ~((~0ULL << 1) << (BTREE_ITER_MAX - 1))))
+ btree_trans_iter_alloc_fail(trans);
- BUG_ON(trans->size < BTREE_ITER_MAX);
+ idx = __ffs64(~trans->iters_linked);
- trans_for_each_iter(trans, iter) {
- pr_err("iter: btree %s pos %llu:%llu%s%s%s %ps",
- bch2_btree_ids[iter->btree_id],
- iter->pos.inode,
- iter->pos.offset,
- (trans->iters_live & (1ULL << iter->idx)) ? " live" : "",
- (trans->iters_touched & (1ULL << iter->idx)) ? " touched" : "",
- iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT ? " keep" : "",
- (void *) iter->ip_allocated);
- }
-
- panic("trans iter oveflow\n");
-#if 0
- ret = bch2_trans_realloc_iters(trans, trans->size * 2);
- if (ret)
- return ERR_PTR(ret);
-#endif
- }
-
- idx = trans->nr_iters++;
- BUG_ON(trans->nr_iters > trans->size);
-
- trans->iters[idx].idx = idx;
-got_slot:
- BUG_ON(trans->iters_linked & (1ULL << idx));
- trans->iters_linked |= 1ULL << idx;
- trans->iters[idx].flags = 0;
+ trans->iters_linked |= 1ULL << idx;
+ trans->iters[idx].idx = idx;
+ trans->iters[idx].flags = 0;
return &trans->iters[idx];
}
@@ -2141,8 +2087,6 @@ static struct btree_iter *__btree_trans_get_iter(struct btree_trans *trans,
{
struct btree_iter *iter, *best = NULL;
- BUG_ON(trans->nr_iters > BTREE_ITER_MAX);
-
trans_for_each_iter(trans, iter) {
if (btree_iter_type(iter) != (flags & BTREE_ITER_TYPE))
continue;
@@ -2160,16 +2104,10 @@ static struct btree_iter *__btree_trans_get_iter(struct btree_trans *trans,
if (!best) {
iter = btree_trans_iter_alloc(trans);
- if (IS_ERR(iter))
- return iter;
-
bch2_btree_iter_init(trans, iter, btree_id, pos, flags);
} else if ((trans->iters_live & (1ULL << best->idx)) ||
(best->flags & BTREE_ITER_KEEP_UNTIL_COMMIT)) {
iter = btree_trans_iter_alloc(trans);
- if (IS_ERR(iter))
- return iter;
-
btree_iter_copy(iter, best);
} else {
iter = best;
@@ -2203,9 +2141,8 @@ struct btree_iter *__bch2_trans_get_iter(struct btree_trans *trans,
struct btree_iter *iter =
__btree_trans_get_iter(trans, btree_id, pos, flags);
- if (!IS_ERR(iter))
- __bch2_btree_iter_set_pos(iter, pos,
- btree_node_type_is_extents(btree_id));
+ __bch2_btree_iter_set_pos(iter, pos,
+ btree_node_type_is_extents(btree_id));
return iter;
}
@@ -2221,7 +2158,6 @@ struct btree_iter *bch2_trans_get_node_iter(struct btree_trans *trans,
flags|BTREE_ITER_NODES);
unsigned i;
- BUG_ON(IS_ERR(iter));
BUG_ON(bkey_cmp(iter->pos, pos));
iter->locks_want = locks_want;
@@ -2241,9 +2177,6 @@ struct btree_iter *__bch2_trans_copy_iter(struct btree_trans *trans,
struct btree_iter *iter;
iter = btree_trans_iter_alloc(trans);
- if (IS_ERR(iter))
- return iter;
-
btree_iter_copy(iter, src);
trans->iters_live |= 1ULL << iter->idx;
@@ -2318,7 +2251,6 @@ void bch2_trans_reset(struct btree_trans *trans, unsigned flags)
trans->iters_touched &= trans->iters_live;
- trans->need_reset = 0;
trans->nr_updates = 0;
trans->nr_updates2 = 0;
trans->mem_top = 0;
@@ -2339,20 +2271,21 @@ void bch2_trans_reset(struct btree_trans *trans, unsigned flags)
static void bch2_trans_alloc_iters(struct btree_trans *trans, struct bch_fs *c)
{
- unsigned new_size = BTREE_ITER_MAX;
- size_t iters_bytes = sizeof(struct btree_iter) * new_size;
- size_t updates_bytes = sizeof(struct btree_insert_entry) * new_size;
- void *p;
+ size_t iters_bytes = sizeof(struct btree_iter) * BTREE_ITER_MAX;
+ size_t updates_bytes = sizeof(struct btree_insert_entry) * BTREE_ITER_MAX;
+ void *p = NULL;
BUG_ON(trans->used_mempool);
- p = this_cpu_xchg(c->btree_iters_bufs->iter, NULL) ?:
- mempool_alloc(&trans->c->btree_iters_pool, GFP_NOFS);
+#ifdef __KERNEL__
+ p = this_cpu_xchg(c->btree_iters_bufs->iter, NULL);
+#endif
+ if (!p)
+ p = mempool_alloc(&trans->c->btree_iters_pool, GFP_NOFS);
trans->iters = p; p += iters_bytes;
trans->updates = p; p += updates_bytes;
trans->updates2 = p; p += updates_bytes;
- trans->size = new_size;
}
void bch2_trans_init(struct btree_trans *trans, struct bch_fs *c,
@@ -2369,8 +2302,12 @@ void bch2_trans_init(struct btree_trans *trans, struct bch_fs *c,
*/
bch2_trans_alloc_iters(trans, c);
- if (expected_mem_bytes)
- bch2_trans_preload_mem(trans, expected_mem_bytes);
+ if (expected_mem_bytes) {
+ trans->mem_bytes = roundup_pow_of_two(expected_mem_bytes);
+ trans->mem = kmalloc(trans->mem_bytes, GFP_KERNEL|__GFP_NOFAIL);
+ }
+
+ trans->srcu_idx = srcu_read_lock(&c->btree_trans_barrier);
#ifdef CONFIG_BCACHEFS_DEBUG
trans->pid = current->pid;
@@ -2392,12 +2329,19 @@ int bch2_trans_exit(struct btree_trans *trans)
mutex_unlock(&trans->c->btree_trans_lock);
#endif
+ srcu_read_unlock(&c->btree_trans_barrier, trans->srcu_idx);
+
bch2_journal_preres_put(&trans->c->journal, &trans->journal_preres);
kfree(trans->fs_usage_deltas);
kfree(trans->mem);
+#ifdef __KERNEL__
+ /*
+ * Userspace doesn't have a real percpu implementation:
+ */
trans->iters = this_cpu_xchg(c->btree_iters_bufs->iter, trans->iters);
+#endif
if (trans->iters)
mempool_free(trans->iters, &trans->c->btree_iters_pool);
@@ -2474,6 +2418,7 @@ void bch2_btree_trans_to_text(struct printbuf *out, struct bch_fs *c)
void bch2_fs_btree_iter_exit(struct bch_fs *c)
{
mempool_exit(&c->btree_iters_pool);
+ cleanup_srcu_struct(&c->btree_trans_barrier);
}
int bch2_fs_btree_iter_init(struct bch_fs *c)
@@ -2483,7 +2428,8 @@ int bch2_fs_btree_iter_init(struct bch_fs *c)
INIT_LIST_HEAD(&c->btree_trans_list);
mutex_init(&c->btree_trans_lock);
- return mempool_init_kmalloc_pool(&c->btree_iters_pool, 1,
+ return init_srcu_struct(&c->btree_trans_barrier) ?:
+ mempool_init_kmalloc_pool(&c->btree_iters_pool, 1,
sizeof(struct btree_iter) * nr +
sizeof(struct btree_insert_entry) * nr +
sizeof(struct btree_insert_entry) * nr);
diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h
index f7a73619c85b..9a7f8d0197ec 100644
--- a/fs/bcachefs/btree_iter.h
+++ b/fs/bcachefs/btree_iter.h
@@ -48,21 +48,22 @@ static inline int btree_iter_err(const struct btree_iter *iter)
/* Iterate over iters within a transaction: */
-#define trans_for_each_iter_all(_trans, _iter) \
- for (_iter = (_trans)->iters; \
- _iter < (_trans)->iters + (_trans)->nr_iters; \
- _iter++)
-
static inline struct btree_iter *
__trans_next_iter(struct btree_trans *trans, unsigned idx)
{
- EBUG_ON(idx < trans->nr_iters && trans->iters[idx].idx != idx);
+ u64 l;
+
+ if (idx == BTREE_ITER_MAX)
+ return NULL;
- for (; idx < trans->nr_iters; idx++)
- if (trans->iters_linked & (1ULL << idx))
- return &trans->iters[idx];
+ l = trans->iters_linked >> idx;
+ if (!l)
+ return NULL;
- return NULL;
+ idx += __ffs64(l);
+ EBUG_ON(idx >= BTREE_ITER_MAX);
+ EBUG_ON(trans->iters[idx].idx != idx);
+ return &trans->iters[idx];
}
#define trans_for_each_iter(_trans, _iter) \
@@ -240,10 +241,9 @@ static inline int bkey_err(struct bkey_s_c k)
#define for_each_btree_key(_trans, _iter, _btree_id, \
_start, _flags, _k, _ret) \
- for ((_ret) = PTR_ERR_OR_ZERO((_iter) = \
- bch2_trans_get_iter((_trans), (_btree_id), \
- (_start), (_flags))) ?: \
- PTR_ERR_OR_ZERO(((_k) = \
+ for ((_iter) = bch2_trans_get_iter((_trans), (_btree_id), \
+ (_start), (_flags)), \
+ (_ret) = PTR_ERR_OR_ZERO(((_k) = \
__bch2_btree_iter_peek(_iter, _flags)).k); \
!_ret && (_k).k; \
(_ret) = PTR_ERR_OR_ZERO(((_k) = \
@@ -270,9 +270,7 @@ bch2_trans_get_iter(struct btree_trans *trans, enum btree_id btree_id,
{
struct btree_iter *iter =
__bch2_trans_get_iter(trans, btree_id, pos, flags);
-
- if (!IS_ERR(iter))
- iter->ip_allocated = _THIS_IP_;
+ iter->ip_allocated = _THIS_IP_;
return iter;
}
@@ -284,10 +282,8 @@ bch2_trans_copy_iter(struct btree_trans *trans, struct btree_iter *src)
struct btree_iter *iter =
__bch2_trans_copy_iter(trans, src);
- if (!IS_ERR(iter))
- iter->ip_allocated = _THIS_IP_;
+ iter->ip_allocated = _THIS_IP_;
return iter;
-
}
struct btree_iter *bch2_trans_get_node_iter(struct btree_trans *,
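The rewritten __trans_next_iter() above finds the next allocated iterator by shifting the iters_linked bitmask and taking __ffs64() of the remainder, instead of scanning slot-by-slot up to nr_iters. A standalone sketch of that find-next-set-bit walk (hypothetical names, with __builtin_ctzll() standing in for __ffs64()):

#include <stdint.h>
#include <stdio.h>

#define SLOTS_MAX 64

/* Hypothetical stand-in for trans->iters_linked. */
static uint64_t slots_linked;

static int next_linked_slot(unsigned idx)
{
	uint64_t l;

	if (idx == SLOTS_MAX)
		return -1;

	l = slots_linked >> idx;
	if (!l)
		return -1;		/* no more allocated slots */

	return idx + __builtin_ctzll(l);
}

int main(void)
{
	slots_linked = (1ULL << 3) | (1ULL << 17) | (1ULL << 40);

	for (int idx = next_linked_slot(0);
	     idx >= 0;
	     idx = next_linked_slot(idx + 1))
		printf("slot %d is linked\n", idx);	/* prints 3, 17, 40 */

	return 0;
}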
diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c
index 0ee4f78ce67a..244c5dbcd3e9 100644
--- a/fs/bcachefs/btree_key_cache.c
+++ b/fs/bcachefs/btree_key_cache.c
@@ -9,8 +9,11 @@
#include "journal.h"
#include "journal_reclaim.h"
+#include <linux/sched/mm.h>
#include <trace/events/bcachefs.h>
+static struct kmem_cache *bch2_key_cache;
+
static int bch2_btree_key_cache_cmp_fn(struct rhashtable_compare_arg *arg,
const void *obj)
{
@@ -66,12 +69,22 @@ static void bkey_cached_evict(struct btree_key_cache *c,
BUG_ON(rhashtable_remove_fast(&c->table, &ck->hash,
bch2_btree_key_cache_params));
memset(&ck->key, ~0, sizeof(ck->key));
+
+ c->nr_keys--;
}
-static void bkey_cached_free(struct btree_key_cache *c,
+static void bkey_cached_free(struct btree_key_cache *bc,
struct bkey_cached *ck)
{
- list_move(&ck->list, &c->freed);
+ struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache);
+
+ BUG_ON(test_bit(BKEY_CACHED_DIRTY, &ck->flags));
+
+ ck->btree_trans_barrier_seq =
+ start_poll_synchronize_srcu(&c->btree_trans_barrier);
+
+ list_move_tail(&ck->list, &bc->freed);
+ bc->nr_freed++;
kfree(ck->k);
ck->k = NULL;
@@ -86,9 +99,20 @@ bkey_cached_alloc(struct btree_key_cache *c)
{
struct bkey_cached *ck;
- list_for_each_entry(ck, &c->freed, list)
- if (bkey_cached_lock_for_evict(ck))
+ list_for_each_entry_reverse(ck, &c->freed, list)
+ if (bkey_cached_lock_for_evict(ck)) {
+ c->nr_freed--;
return ck;
+ }
+
+ ck = kmem_cache_alloc(bch2_key_cache, GFP_NOFS|__GFP_ZERO);
+ if (likely(ck)) {
+ INIT_LIST_HEAD(&ck->list);
+ six_lock_init(&ck->c.lock);
+ BUG_ON(!six_trylock_intent(&ck->c.lock));
+ BUG_ON(!six_trylock_write(&ck->c.lock));
+ return ck;
+ }
list_for_each_entry(ck, &c->clean, list)
if (bkey_cached_lock_for_evict(ck)) {
@@ -96,16 +120,7 @@ bkey_cached_alloc(struct btree_key_cache *c)
return ck;
}
- ck = kzalloc(sizeof(*ck), GFP_NOFS);
- if (!ck)
- return NULL;
-
- INIT_LIST_HEAD(&ck->list);
- six_lock_init(&ck->c.lock);
- BUG_ON(!six_trylock_intent(&ck->c.lock));
- BUG_ON(!six_trylock_write(&ck->c.lock));
-
- return ck;
+ return NULL;
}
static struct bkey_cached *
@@ -124,8 +139,7 @@ btree_key_cache_create(struct btree_key_cache *c,
ck->key.btree_id = btree_id;
ck->key.pos = pos;
ck->valid = false;
-
- BUG_ON(ck->flags);
+ ck->flags = 1U << BKEY_CACHED_ACCESSED;
if (rhashtable_lookup_insert_fast(&c->table,
&ck->hash,
@@ -135,6 +149,8 @@ btree_key_cache_create(struct btree_key_cache *c,
return NULL;
}
+ c->nr_keys++;
+
list_move(&ck->list, &c->clean);
six_unlock_write(&ck->c.lock);
@@ -153,9 +169,6 @@ static int btree_key_cache_fill(struct btree_trans *trans,
iter = bch2_trans_get_iter(trans, ck->key.btree_id,
ck->key.pos, BTREE_ITER_SLOTS);
- if (IS_ERR(iter))
- return PTR_ERR(iter);
-
k = bch2_btree_iter_peek_slot(iter);
ret = bkey_err(k);
if (ret) {
@@ -280,6 +293,9 @@ fill:
goto err;
}
+ if (!test_bit(BKEY_CACHED_ACCESSED, &ck->flags))
+ set_bit(BKEY_CACHED_ACCESSED, &ck->flags);
+
iter->uptodate = BTREE_ITER_NEED_PEEK;
bch2_btree_iter_downgrade(iter);
return ret;
@@ -300,24 +316,17 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans,
struct bch_fs *c = trans->c;
struct journal *j = &c->journal;
struct btree_iter *c_iter = NULL, *b_iter = NULL;
- struct bkey_cached *ck;
+ struct bkey_cached *ck = NULL;
int ret;
b_iter = bch2_trans_get_iter(trans, key.btree_id, key.pos,
BTREE_ITER_SLOTS|
BTREE_ITER_INTENT);
- ret = PTR_ERR_OR_ZERO(b_iter);
- if (ret)
- goto out;
-
c_iter = bch2_trans_get_iter(trans, key.btree_id, key.pos,
BTREE_ITER_CACHED|
BTREE_ITER_CACHED_NOFILL|
BTREE_ITER_CACHED_NOCREATE|
BTREE_ITER_INTENT);
- ret = PTR_ERR_OR_ZERO(c_iter);
- if (ret)
- goto out;
retry:
ret = bch2_btree_iter_traverse(c_iter);
if (ret)
@@ -348,17 +357,22 @@ err:
if (ret == -EINTR)
goto retry;
- BUG_ON(ret && !bch2_journal_error(j));
-
- if (ret)
+ if (ret) {
+ bch2_fs_fatal_err_on(!bch2_journal_error(j), c,
+ "error flushing key cache: %i", ret);
goto out;
+ }
bch2_journal_pin_drop(j, &ck->journal);
bch2_journal_preres_put(j, &ck->res);
- clear_bit(BKEY_CACHED_DIRTY, &ck->flags);
if (!evict) {
mutex_lock(&c->btree_key_cache.lock);
+ if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
+ clear_bit(BKEY_CACHED_DIRTY, &ck->flags);
+ c->btree_key_cache.nr_dirty--;
+ }
+
list_move_tail(&ck->list, &c->btree_key_cache.clean);
mutex_unlock(&c->btree_key_cache.lock);
} else {
@@ -371,6 +385,11 @@ evict:
six_lock_write(&ck->c.lock, NULL, NULL);
mutex_lock(&c->btree_key_cache.lock);
+ if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
+ clear_bit(BKEY_CACHED_DIRTY, &ck->flags);
+ c->btree_key_cache.nr_dirty--;
+ }
+
bkey_cached_evict(&c->btree_key_cache, ck);
bkey_cached_free(&c->btree_key_cache, ck);
mutex_unlock(&c->btree_key_cache.lock);
@@ -391,19 +410,23 @@ static void btree_key_cache_journal_flush(struct journal *j,
struct bkey_cached_key key;
struct btree_trans trans;
+ int srcu_idx = srcu_read_lock(&c->btree_trans_barrier);
+
six_lock_read(&ck->c.lock, NULL, NULL);
key = ck->key;
if (ck->journal.seq != seq ||
!test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
six_unlock_read(&ck->c.lock);
- return;
+ goto unlock;
}
six_unlock_read(&ck->c.lock);
bch2_trans_init(&trans, c, 0, 0);
btree_key_cache_flush_pos(&trans, key, seq, false);
bch2_trans_exit(&trans);
+unlock:
+ srcu_read_unlock(&c->btree_trans_barrier, srcu_idx);
}
/*
@@ -428,6 +451,7 @@ bool bch2_btree_insert_key_cached(struct btree_trans *trans,
{
struct bch_fs *c = trans->c;
struct bkey_cached *ck = (void *) iter->l[0].b;
+ bool kick_reclaim = false;
BUG_ON(insert->u64s > ck->u64s);
@@ -448,14 +472,22 @@ bool bch2_btree_insert_key_cached(struct btree_trans *trans,
if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
mutex_lock(&c->btree_key_cache.lock);
- list_del_init(&ck->list);
+ list_move(&ck->list, &c->btree_key_cache.dirty);
set_bit(BKEY_CACHED_DIRTY, &ck->flags);
+ c->btree_key_cache.nr_dirty++;
+
+ if (bch2_nr_btree_keys_need_flush(c))
+ kick_reclaim = true;
+
mutex_unlock(&c->btree_key_cache.lock);
}
bch2_journal_pin_update(&c->journal, trans->journal_res.seq,
&ck->journal, btree_key_cache_journal_flush);
+
+ if (kick_reclaim)
+ journal_reclaim_kick(&c->journal);
return true;
}
@@ -467,20 +499,107 @@ void bch2_btree_key_cache_verify_clean(struct btree_trans *trans,
}
#endif
-void bch2_fs_btree_key_cache_exit(struct btree_key_cache *c)
+static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink,
+ struct shrink_control *sc)
+{
+ struct bch_fs *c = container_of(shrink, struct bch_fs,
+ btree_key_cache.shrink);
+ struct btree_key_cache *bc = &c->btree_key_cache;
+ struct bkey_cached *ck, *t;
+ size_t scanned = 0, freed = 0, nr = sc->nr_to_scan;
+ unsigned flags;
+
+ /* Return -1 if we can't do anything right now */
+ if (sc->gfp_mask & __GFP_FS)
+ mutex_lock(&bc->lock);
+ else if (!mutex_trylock(&bc->lock))
+ return -1;
+
+ flags = memalloc_nofs_save();
+
+ /*
+ * Newest freed entries are at the end of the list - once we hit one
+ * that's too new to be freed, we can bail out:
+ */
+ list_for_each_entry_safe(ck, t, &bc->freed, list) {
+ if (!poll_state_synchronize_srcu(&c->btree_trans_barrier,
+ ck->btree_trans_barrier_seq))
+ break;
+
+ list_del(&ck->list);
+ kmem_cache_free(bch2_key_cache, ck);
+ bc->nr_freed--;
+ scanned++;
+ freed++;
+ }
+
+ if (scanned >= nr)
+ goto out;
+
+ list_for_each_entry_safe(ck, t, &bc->clean, list) {
+ if (test_bit(BKEY_CACHED_ACCESSED, &ck->flags))
+ clear_bit(BKEY_CACHED_ACCESSED, &ck->flags);
+ else if (bkey_cached_lock_for_evict(ck)) {
+ bkey_cached_evict(bc, ck);
+ bkey_cached_free(bc, ck);
+ }
+
+ scanned++;
+ if (scanned >= nr) {
+ if (&t->list != &bc->clean)
+ list_move_tail(&bc->clean, &t->list);
+ goto out;
+ }
+ }
+out:
+ memalloc_nofs_restore(flags);
+ mutex_unlock(&bc->lock);
+
+ return freed;
+}
+
+static unsigned long bch2_btree_key_cache_count(struct shrinker *shrink,
+ struct shrink_control *sc)
+{
+ struct bch_fs *c = container_of(shrink, struct bch_fs,
+ btree_key_cache.shrink);
+ struct btree_key_cache *bc = &c->btree_key_cache;
+
+ return bc->nr_keys;
+}
+
+void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc)
{
+ struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache);
struct bkey_cached *ck, *n;
- mutex_lock(&c->lock);
- list_for_each_entry_safe(ck, n, &c->clean, list) {
+ if (bc->shrink.list.next)
+ unregister_shrinker(&bc->shrink);
+
+ mutex_lock(&bc->lock);
+ list_splice(&bc->dirty, &bc->clean);
+
+ list_for_each_entry_safe(ck, n, &bc->clean, list) {
+ bch2_journal_pin_drop(&c->journal, &ck->journal);
+ bch2_journal_preres_put(&c->journal, &ck->res);
+
kfree(ck->k);
- kfree(ck);
+ list_del(&ck->list);
+ kmem_cache_free(bch2_key_cache, ck);
+ bc->nr_keys--;
}
- list_for_each_entry_safe(ck, n, &c->freed, list)
- kfree(ck);
- mutex_unlock(&c->lock);
- rhashtable_destroy(&c->table);
+ BUG_ON(bc->nr_dirty && !bch2_journal_error(&c->journal));
+ BUG_ON(bc->nr_keys);
+
+ list_for_each_entry_safe(ck, n, &bc->freed, list) {
+ list_del(&ck->list);
+ kmem_cache_free(bch2_key_cache, ck);
+ }
+ mutex_unlock(&bc->lock);
+
+ if (bc->table_init_done)
+ rhashtable_destroy(&bc->table);
}
void bch2_fs_btree_key_cache_init_early(struct btree_key_cache *c)
@@ -488,33 +607,47 @@ void bch2_fs_btree_key_cache_init_early(struct btree_key_cache *c)
mutex_init(&c->lock);
INIT_LIST_HEAD(&c->freed);
INIT_LIST_HEAD(&c->clean);
+ INIT_LIST_HEAD(&c->dirty);
}
int bch2_fs_btree_key_cache_init(struct btree_key_cache *c)
{
- return rhashtable_init(&c->table, &bch2_btree_key_cache_params);
+ int ret;
+
+ c->shrink.seeks = 1;
+ c->shrink.count_objects = bch2_btree_key_cache_count;
+ c->shrink.scan_objects = bch2_btree_key_cache_scan;
+
+ ret = register_shrinker(&c->shrink);
+ if (ret)
+ return ret;
+
+ ret = rhashtable_init(&c->table, &bch2_btree_key_cache_params);
+ if (ret)
+ return ret;
+
+ c->table_init_done = true;
+ return 0;
}
void bch2_btree_key_cache_to_text(struct printbuf *out, struct btree_key_cache *c)
{
- struct bucket_table *tbl;
- struct bkey_cached *ck;
- struct rhash_head *pos;
- size_t i;
+ pr_buf(out, "nr_freed:\t%zu\n", c->nr_freed);
+ pr_buf(out, "nr_keys:\t%zu\n", c->nr_keys);
+ pr_buf(out, "nr_dirty:\t%zu\n", c->nr_dirty);
+}
- mutex_lock(&c->lock);
- tbl = rht_dereference_rcu(c->table.tbl, &c->table);
+void bch2_btree_key_cache_exit(void)
+{
+ if (bch2_key_cache)
+ kmem_cache_destroy(bch2_key_cache);
+}
- for (i = 0; i < tbl->size; i++) {
- rht_for_each_entry_rcu(ck, pos, tbl, i, hash) {
- pr_buf(out, "%s:",
- bch2_btree_ids[ck->key.btree_id]);
- bch2_bpos_to_text(out, ck->key.pos);
+int __init bch2_btree_key_cache_init(void)
+{
+ bch2_key_cache = KMEM_CACHE(bkey_cached, 0);
+ if (!bch2_key_cache)
+ return -ENOMEM;
- if (test_bit(BKEY_CACHED_DIRTY, &ck->flags))
- pr_buf(out, " journal seq %llu", ck->journal.seq);
- pr_buf(out, "\n");
- }
- }
- mutex_unlock(&c->lock);
+ return 0;
}
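The freed-list handling above defers the actual kmem_cache_free(): bkey_cached_free() stamps each entry with start_poll_synchronize_srcu(), and the shrinker reclaims only entries whose grace period has completed according to poll_state_synchronize_srcu(). A condensed sketch of that protocol (hypothetical helpers, not from the patch; the fields and SRCU calls are the ones introduced above):

/* Stamp: "freeing is legal once this SRCU grace period completes". */
static void example_free_entry(struct bch_fs *c, struct bkey_cached *ck)
{
	ck->btree_trans_barrier_seq =
		start_poll_synchronize_srcu(&c->btree_trans_barrier);
	list_move_tail(&ck->list, &c->btree_key_cache.freed);
}

static void example_reclaim_freed(struct bch_fs *c)
{
	struct bkey_cached *ck, *t;

	list_for_each_entry_safe(ck, t, &c->btree_key_cache.freed, list) {
		/*
		 * Oldest entries are at the head; stop at the first one
		 * whose grace period has not elapsed yet.
		 */
		if (!poll_state_synchronize_srcu(&c->btree_trans_barrier,
						 ck->btree_trans_barrier_seq))
			break;

		list_del(&ck->list);
		kmem_cache_free(bch2_key_cache, ck);
	}
}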
diff --git a/fs/bcachefs/btree_key_cache.h b/fs/bcachefs/btree_key_cache.h
index d448264abcc8..dad3e344dcf9 100644
--- a/fs/bcachefs/btree_key_cache.h
+++ b/fs/bcachefs/btree_key_cache.h
@@ -1,6 +1,24 @@
#ifndef _BCACHEFS_BTREE_KEY_CACHE_H
#define _BCACHEFS_BTREE_KEY_CACHE_H
+static inline size_t bch2_nr_btree_keys_need_flush(struct bch_fs *c)
+{
+ size_t nr_dirty = READ_ONCE(c->btree_key_cache.nr_dirty);
+ size_t nr_keys = READ_ONCE(c->btree_key_cache.nr_keys);
+ size_t max_dirty = 1024 + nr_keys / 2;
+
+ return max_t(ssize_t, 0, nr_dirty - max_dirty);
+}
+
+static inline bool bch2_btree_key_cache_must_wait(struct bch_fs *c)
+{
+ size_t nr_dirty = READ_ONCE(c->btree_key_cache.nr_dirty);
+ size_t nr_keys = READ_ONCE(c->btree_key_cache.nr_keys);
+ size_t max_dirty = 4096 + (nr_keys * 3) / 4;
+
+ return nr_dirty > max_dirty;
+}
+
struct bkey_cached *
bch2_btree_key_cache_find(struct bch_fs *, enum btree_id, struct bpos);
@@ -25,4 +43,7 @@ int bch2_fs_btree_key_cache_init(struct btree_key_cache *);
void bch2_btree_key_cache_to_text(struct printbuf *, struct btree_key_cache *);
+void bch2_btree_key_cache_exit(void);
+int __init bch2_btree_key_cache_init(void);
+
#endif /* _BCACHEFS_BTREE_KEY_CACHE_H */
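For a concrete sense of the two thresholds above: with nr_keys = 100,000 cached keys, bch2_nr_btree_keys_need_flush() starts reporting work once nr_dirty exceeds 1024 + 50,000 = 51,024, and bch2_btree_key_cache_must_wait() fires once nr_dirty exceeds 4096 + 75,000 = 79,096. A tiny sketch (not from the patch) evaluating both bounds:

#include <stdio.h>

int main(void)
{
	size_t nr_keys = 100000, nr_dirty = 60000;

	/* Same formulas as bch2_nr_btree_keys_need_flush() /
	 * bch2_btree_key_cache_must_wait() above. */
	size_t flush_threshold = 1024 + nr_keys / 2;		/* 51024 */
	size_t wait_threshold  = 4096 + (nr_keys * 3) / 4;	/* 79096 */

	printf("keys needing flush: %zu\n",
	       nr_dirty > flush_threshold ? nr_dirty - flush_threshold : 0);
	printf("must wait on reclaim: %s\n",
	       nr_dirty > wait_threshold ? "yes" : "no");
	return 0;
}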
diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h
index 93721fbc7794..dc7de27112c6 100644
--- a/fs/bcachefs/btree_types.h
+++ b/fs/bcachefs/btree_types.h
@@ -158,6 +158,7 @@ struct btree_cache {
/* Number of elements in live + freeable lists */
unsigned used;
unsigned reserve;
+ atomic_t dirty;
struct shrinker shrink;
/*
@@ -292,8 +293,15 @@ static inline struct btree_iter_level *iter_l(struct btree_iter *iter)
struct btree_key_cache {
struct mutex lock;
struct rhashtable table;
+ bool table_init_done;
struct list_head freed;
struct list_head clean;
+ struct list_head dirty;
+ struct shrinker shrink;
+
+ size_t nr_freed;
+ size_t nr_keys;
+ size_t nr_dirty;
};
struct bkey_cached_key {
@@ -301,7 +309,8 @@ struct bkey_cached_key {
struct bpos pos;
} __attribute__((packed, aligned(4)));
-#define BKEY_CACHED_DIRTY 0
+#define BKEY_CACHED_ACCESSED 0
+#define BKEY_CACHED_DIRTY 1
struct bkey_cached {
struct btree_bkey_cached_common c;
@@ -309,6 +318,7 @@ struct bkey_cached {
unsigned long flags;
u8 u64s;
bool valid;
+ u32 btree_trans_barrier_seq;
struct bkey_cached_key key;
struct rhash_head hash;
@@ -345,21 +355,19 @@ struct btree_trans {
pid_t pid;
#endif
unsigned long ip;
+ int srcu_idx;
- u64 iters_linked;
- u64 iters_live;
- u64 iters_touched;
-
- u8 nr_iters;
u8 nr_updates;
u8 nr_updates2;
- u8 size;
unsigned used_mempool:1;
unsigned error:1;
unsigned nounlock:1;
- unsigned need_reset:1;
unsigned in_traverse_all:1;
+ u64 iters_linked;
+ u64 iters_live;
+ u64 iters_touched;
+
unsigned mem_top;
unsigned mem_bytes;
void *mem;
@@ -407,11 +415,11 @@ enum btree_flags {
BTREE_NODE_fake,
BTREE_NODE_old_extent_overwrite,
BTREE_NODE_need_rewrite,
+ BTREE_NODE_never_write,
};
BTREE_FLAG(read_in_flight);
BTREE_FLAG(read_error);
-BTREE_FLAG(dirty);
BTREE_FLAG(need_write);
BTREE_FLAG(noevict);
BTREE_FLAG(write_idx);
@@ -422,6 +430,7 @@ BTREE_FLAG(dying);
BTREE_FLAG(fake);
BTREE_FLAG(old_extent_overwrite);
BTREE_FLAG(need_rewrite);
+BTREE_FLAG(never_write);
static inline struct btree_write *btree_current_write(struct btree *b)
{
@@ -640,6 +649,7 @@ enum btree_insert_ret {
BTREE_INSERT_ENOSPC,
BTREE_INSERT_NEED_MARK_REPLICAS,
BTREE_INSERT_NEED_JOURNAL_RES,
+ BTREE_INSERT_NEED_JOURNAL_RECLAIM,
};
enum btree_gc_coalesce_fail_reason {
diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h
index e0b1bde37484..adb07043cbb3 100644
--- a/fs/bcachefs/btree_update.h
+++ b/fs/bcachefs/btree_update.h
@@ -67,8 +67,8 @@ int __bch2_btree_insert(struct btree_trans *, enum btree_id, struct bkey_i *);
int bch2_btree_insert(struct bch_fs *, enum btree_id, struct bkey_i *,
struct disk_reservation *, u64 *, int flags);
-int bch2_btree_delete_at_range(struct btree_trans *, struct btree_iter *,
- struct bpos, u64 *);
+int bch2_btree_delete_range_trans(struct btree_trans *, enum btree_id,
+ struct bpos, struct bpos, u64 *);
int bch2_btree_delete_range(struct bch_fs *, enum btree_id,
struct bpos, struct bpos, u64 *);
diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c
index 4ddd1697ffde..8f96756ba648 100644
--- a/fs/bcachefs/btree_update_interior.c
+++ b/fs/bcachefs/btree_update_interior.c
@@ -11,6 +11,7 @@
#include "btree_iter.h"
#include "btree_locking.h"
#include "buckets.h"
+#include "error.h"
#include "extents.h"
#include "journal.h"
#include "journal_reclaim.h"
@@ -48,12 +49,27 @@ static void btree_node_interior_verify(struct bch_fs *c, struct btree *b)
break;
bp = bkey_s_c_to_btree_ptr_v2(k);
- BUG_ON(bkey_cmp(next_node, bp.v->min_key));
+ if (bkey_cmp(next_node, bp.v->min_key)) {
+ bch2_dump_btree_node(c, b);
+ panic("expected next min_key %llu:%llu got %llu:%llu\n",
+ next_node.inode,
+ next_node.offset,
+ bp.v->min_key.inode,
+ bp.v->min_key.offset);
+ }
bch2_btree_node_iter_advance(&iter, b);
if (bch2_btree_node_iter_end(&iter)) {
- BUG_ON(bkey_cmp(k.k->p, b->key.k.p));
+
+ if (bkey_cmp(k.k->p, b->key.k.p)) {
+ bch2_dump_btree_node(c, b);
+ panic("expected end %llu:%llu got %llu:%llu\n",
+ b->key.k.p.inode,
+ b->key.k.p.offset,
+ k.k->p.inode,
+ k.k->p.offset);
+ }
break;
}
@@ -149,7 +165,7 @@ void bch2_btree_node_free_never_inserted(struct bch_fs *c, struct btree *b)
b->ob.nr = 0;
- clear_btree_node_dirty(b);
+ clear_btree_node_dirty(c, b);
btree_node_lock_type(c, b, SIX_LOCK_write);
__btree_node_free(c, b);
@@ -264,7 +280,7 @@ static struct btree *bch2_btree_node_alloc(struct btree_update *as, unsigned lev
b = as->prealloc_nodes[--as->nr_prealloc_nodes];
set_btree_node_accessed(b);
- set_btree_node_dirty(b);
+ set_btree_node_dirty(c, b);
set_btree_node_need_write(b);
bch2_bset_init_first(b, &b->data->keys);
@@ -503,14 +519,18 @@ static int btree_update_nodes_written_trans(struct btree_trans *trans,
trans->journal_pin = &as->journal;
for_each_keylist_key(&as->new_keys, k) {
- ret = bch2_trans_mark_key(trans, bkey_i_to_s_c(k),
+ ret = bch2_trans_mark_key(trans,
+ bkey_s_c_null,
+ bkey_i_to_s_c(k),
0, 0, BTREE_TRIGGER_INSERT);
if (ret)
return ret;
}
for_each_keylist_key(&as->old_keys, k) {
- ret = bch2_trans_mark_key(trans, bkey_i_to_s_c(k),
+ ret = bch2_trans_mark_key(trans,
+ bkey_i_to_s_c(k),
+ bkey_s_c_null,
0, 0, BTREE_TRIGGER_OVERWRITE);
if (ret)
return ret;
@@ -523,11 +543,25 @@ static void btree_update_nodes_written(struct btree_update *as)
{
struct bch_fs *c = as->c;
struct btree *b = as->b;
+ struct btree_trans trans;
u64 journal_seq = 0;
unsigned i;
int ret;
/*
+ * If we're already in an error state, it might be because a btree node
+ * was never written, and we might be trying to free that same btree
+ * node here, but it won't have been marked as allocated and we'll see
+ * spurious disk usage inconsistencies in the transactional part below
+ * if we don't skip it:
+ */
+ ret = bch2_journal_error(&c->journal);
+ if (ret)
+ goto err;
+
+ BUG_ON(!journal_pin_active(&as->journal));
+
+ /*
* We did an update to a parent node where the pointers we added pointed
* to child nodes that weren't written yet: now, the child nodes have
* been written so we can write out the update to the interior node.
@@ -540,16 +574,20 @@ static void btree_update_nodes_written(struct btree_update *as)
* journal reclaim does btree updates when flushing bkey_cached entries,
* which may require allocations as well.
*/
- ret = bch2_trans_do(c, &as->disk_res, &journal_seq,
- BTREE_INSERT_NOFAIL|
- BTREE_INSERT_USE_RESERVE|
- BTREE_INSERT_USE_ALLOC_RESERVE|
- BTREE_INSERT_NOCHECK_RW|
- BTREE_INSERT_JOURNAL_RECLAIM|
- BTREE_INSERT_JOURNAL_RESERVED,
- btree_update_nodes_written_trans(&trans, as));
- BUG_ON(ret && !bch2_journal_error(&c->journal));
-
+ bch2_trans_init(&trans, c, 0, 512);
+ ret = __bch2_trans_do(&trans, &as->disk_res, &journal_seq,
+ BTREE_INSERT_NOFAIL|
+ BTREE_INSERT_USE_RESERVE|
+ BTREE_INSERT_USE_ALLOC_RESERVE|
+ BTREE_INSERT_NOCHECK_RW|
+ BTREE_INSERT_JOURNAL_RECLAIM|
+ BTREE_INSERT_JOURNAL_RESERVED,
+ btree_update_nodes_written_trans(&trans, as));
+ bch2_trans_exit(&trans);
+
+ bch2_fs_fatal_err_on(ret && !bch2_journal_error(&c->journal), c,
+ "error %i in btree_update_nodes_written()", ret);
+err:
if (b) {
/*
* @b is the node we did the final insert into:
@@ -569,17 +607,30 @@ static void btree_update_nodes_written(struct btree_update *as)
list_del(&as->write_blocked_list);
- if (!ret && as->b == b) {
+ /*
+ * Node might have been freed, recheck under
+ * btree_interior_update_lock:
+ */
+ if (as->b == b) {
struct bset *i = btree_bset_last(b);
BUG_ON(!b->c.level);
BUG_ON(!btree_node_dirty(b));
- i->journal_seq = cpu_to_le64(
- max(journal_seq,
- le64_to_cpu(i->journal_seq)));
-
- bch2_btree_add_journal_pin(c, b, journal_seq);
+ if (!ret) {
+ i->journal_seq = cpu_to_le64(
+ max(journal_seq,
+ le64_to_cpu(i->journal_seq)));
+
+ bch2_btree_add_journal_pin(c, b, journal_seq);
+ } else {
+ /*
+ * If we didn't get a journal sequence number we
+ * can't write this btree node, because recovery
+ * won't know to ignore this write:
+ */
+ set_btree_node_never_write(b);
+ }
}
mutex_unlock(&c->btree_interior_update_lock);
@@ -680,17 +731,7 @@ static void btree_update_reparent(struct btree_update *as,
child->b = NULL;
child->mode = BTREE_INTERIOR_UPDATING_AS;
- /*
- * When we write a new btree root, we have to drop our journal pin
- * _before_ the new nodes are technically reachable; see
- * btree_update_nodes_written().
- *
- * This goes for journal pins that are recursively blocked on us - so,
- * just transfer the journal pin to the new interior update so
- * btree_update_nodes_written() can drop it.
- */
bch2_journal_pin_copy(&c->journal, &as->journal, &child->journal, NULL);
- bch2_journal_pin_drop(&c->journal, &child->journal);
}
static void btree_update_updated_root(struct btree_update *as, struct btree *b)
@@ -827,7 +868,7 @@ void bch2_btree_interior_update_will_free_node(struct btree_update *as,
closure_wake_up(&c->btree_interior_update_wait);
}
- clear_btree_node_dirty(b);
+ clear_btree_node_dirty(c, b);
clear_btree_node_need_write(b);
/*
@@ -937,6 +978,10 @@ bch2_btree_update_start(struct btree_trans *trans, enum btree_id id,
if (ret)
goto err;
+ bch2_journal_pin_add(&c->journal,
+ atomic64_read(&c->journal.seq),
+ &as->journal, NULL);
+
mutex_lock(&c->btree_interior_update_lock);
list_add_tail(&as->list, &c->btree_interior_update_list);
mutex_unlock(&c->btree_interior_update_lock);
@@ -1018,7 +1063,19 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as, struct btree *b
struct bkey_i *insert,
struct btree_node_iter *node_iter)
{
+ struct bch_fs *c = as->c;
struct bkey_packed *k;
+ const char *invalid;
+
+ invalid = bch2_bkey_invalid(c, bkey_i_to_s_c(insert), btree_node_type(b)) ?:
+ bch2_bkey_in_btree_node(b, bkey_i_to_s_c(insert));
+ if (invalid) {
+ char buf[160];
+
+ bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(insert));
+ bch2_fs_inconsistent(c, "inserting invalid bkey %s: %s", buf, invalid);
+ dump_stack();
+ }
BUG_ON(as->journal_u64s + jset_u64s(insert->k.u64s) >
ARRAY_SIZE(as->journal_entries));
@@ -1034,7 +1091,7 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as, struct btree *b
bch2_btree_node_iter_advance(node_iter, b);
bch2_btree_bset_insert_key(iter, b, node_iter, insert);
- set_btree_node_dirty(b);
+ set_btree_node_dirty(c, b);
set_btree_node_need_write(b);
}
@@ -1353,9 +1410,6 @@ void bch2_btree_insert_node(struct btree_update *as, struct btree *b,
BUG_ON(!as || as->b);
bch2_verify_keylist_sorted(keys);
- if (as->must_rewrite)
- goto split;
-
bch2_btree_node_lock_for_insert(c, b, iter);
if (!bch2_btree_node_insert_fits(c, b, bch2_keylist_u64s(keys))) {
@@ -1363,6 +1417,8 @@ void bch2_btree_insert_node(struct btree_update *as, struct btree *b,
goto split;
}
+ btree_node_interior_verify(c, b);
+
bch2_btree_insert_keys_interior(as, b, iter, keys);
live_u64s_added = (int) b->nr.live_u64s - old_live_u64s;
diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h
index 7668225e72c6..45d212730fd7 100644
--- a/fs/bcachefs/btree_update_interior.h
+++ b/fs/bcachefs/btree_update_interior.h
@@ -47,7 +47,6 @@ struct btree_update {
BTREE_INTERIOR_UPDATING_AS,
} mode;
- unsigned must_rewrite:1;
unsigned nodes_written:1;
enum btree_id btree_id;
@@ -237,6 +236,9 @@ static inline ssize_t __bch_btree_u64s_remaining(struct bch_fs *c,
b->whiteout_u64s;
ssize_t total = c->opts.btree_node_size << 6;
+ /* Always leave one extra u64 for bch2_varint_decode: */
+ used++;
+
return total - used;
}
diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c
index e386f8ed3922..e7816afe4a08 100644
--- a/fs/bcachefs/btree_update_leaf.c
+++ b/fs/bcachefs/btree_update_leaf.c
@@ -191,7 +191,7 @@ static bool btree_insert_key_leaf(struct btree_trans *trans,
bch2_btree_add_journal_pin(c, b, trans->journal_res.seq);
if (unlikely(!btree_node_dirty(b)))
- set_btree_node_dirty(b);
+ set_btree_node_dirty(c, b);
live_u64s_added = (int) b->nr.live_u64s - old_live_u64s;
u64s_added = (int) bset_u64s(t) - old_u64s;
@@ -286,6 +286,10 @@ btree_key_can_insert_cached(struct btree_trans *trans,
BUG_ON(iter->level);
+ if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags) &&
+ bch2_btree_key_cache_must_wait(trans->c))
+ return BTREE_INSERT_NEED_JOURNAL_RECLAIM;
+
if (u64s <= ck->u64s)
return BTREE_INSERT_OK;
@@ -642,20 +646,24 @@ int bch2_trans_commit_error(struct btree_trans *trans,
trace_trans_restart_journal_res_get(trans->ip);
ret = -EINTR;
break;
- default:
- BUG_ON(ret >= 0);
- break;
- }
+ case BTREE_INSERT_NEED_JOURNAL_RECLAIM:
+ bch2_trans_unlock(trans);
- if (ret == -EINTR) {
- int ret2 = bch2_btree_iter_traverse_all(trans);
+ do {
+ mutex_lock(&c->journal.reclaim_lock);
+ ret = bch2_journal_reclaim(&c->journal);
+ mutex_unlock(&c->journal.reclaim_lock);
+ } while (!ret && bch2_btree_key_cache_must_wait(c));
- if (ret2) {
- trace_trans_restart_traverse(trans->ip);
- return ret2;
- }
+ if (!ret && bch2_trans_relock(trans))
+ return 0;
- trace_trans_restart_atomic(trans->ip);
+ trace_trans_restart_journal_reclaim(trans->ip);
+ ret = -EINTR;
+ break;
+ default:
+ BUG_ON(ret >= 0);
+ break;
}
return ret;
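
The new BTREE_INSERT_NEED_JOURNAL_RECLAIM case above is back-pressure: the commit path drops its locks, runs journal reclaim until the key cache can accept more dirty keys, then relocks and retries, or restarts the transaction with -EINTR if relocking fails. A toy standalone model of that control flow, with stand-in names rather than the in-tree API:

    #include <stdbool.h>
    #include <stdio.h>

    /* toy model of the back-pressure loop in bch2_trans_commit_error() */
    static int dirty_keys = 5, dirty_limit = 3;

    static bool key_cache_must_wait(void) { return dirty_keys > dirty_limit; }
    static int  journal_reclaim(void)     { if (dirty_keys) dirty_keys--; return 0; }

    static int handle_need_journal_reclaim(void)
    {
        int ret;

        /* the transaction would be unlocked here */
        do {
            ret = journal_reclaim();        /* flushes dirty cached keys */
        } while (!ret && key_cache_must_wait());

        if (!ret /* && the transaction relocked successfully */)
            return 0;                       /* retry the commit */

        return -1;                          /* stands in for -EINTR: restart */
    }

    int main(void)
    {
        printf("reclaim loop returned %d, dirty keys now %d\n",
               handle_need_journal_reclaim(), dirty_keys);
        return 0;
    }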
@@ -699,7 +707,7 @@ static void bch2_trans_update2(struct btree_trans *trans,
BUG_ON(iter->uptodate > BTREE_ITER_NEED_PEEK);
- EBUG_ON(trans->nr_updates2 >= trans->nr_iters);
+ EBUG_ON(trans->nr_updates2 >= BTREE_ITER_MAX);
iter->flags |= BTREE_ITER_KEEP_UNTIL_COMMIT;
@@ -732,8 +740,6 @@ static int extent_update_to_keys(struct btree_trans *trans,
return 0;
iter = bch2_trans_copy_iter(trans, orig_iter);
- if (IS_ERR(iter))
- return PTR_ERR(iter);
iter->flags |= BTREE_ITER_INTENT;
__bch2_btree_iter_set_pos(iter, insert->k.p, false);
@@ -752,10 +758,6 @@ static int extent_handle_overwrites(struct btree_trans *trans,
int ret = 0;
iter = bch2_trans_get_iter(trans, btree_id, start, BTREE_ITER_INTENT);
- ret = PTR_ERR_OR_ZERO(iter);
- if (ret)
- return ret;
-
k = bch2_btree_iter_peek_with_updates(iter);
while (k.k && !(ret = bkey_err(k))) {
@@ -764,8 +766,6 @@ static int extent_handle_overwrites(struct btree_trans *trans,
if (bkey_cmp(bkey_start_pos(k.k), start) < 0) {
update_iter = bch2_trans_copy_iter(trans, iter);
- if ((ret = PTR_ERR_OR_ZERO(update_iter)))
- goto err;
update = bch2_trans_kmalloc(trans, bkey_bytes(k.k));
if ((ret = PTR_ERR_OR_ZERO(update)))
@@ -781,8 +781,6 @@ static int extent_handle_overwrites(struct btree_trans *trans,
if (bkey_cmp(k.k->p, end) > 0) {
update_iter = bch2_trans_copy_iter(trans, iter);
- if ((ret = PTR_ERR_OR_ZERO(update_iter)))
- goto err;
update = bch2_trans_kmalloc(trans, bkey_bytes(k.k));
if ((ret = PTR_ERR_OR_ZERO(update)))
@@ -796,8 +794,6 @@ static int extent_handle_overwrites(struct btree_trans *trans,
bch2_trans_iter_put(trans, update_iter);
} else {
update_iter = bch2_trans_copy_iter(trans, iter);
- if ((ret = PTR_ERR_OR_ZERO(update_iter)))
- goto err;
update = bch2_trans_kmalloc(trans, sizeof(struct bkey));
if ((ret = PTR_ERR_OR_ZERO(update)))
@@ -829,8 +825,6 @@ int __bch2_trans_commit(struct btree_trans *trans)
unsigned u64s;
int ret = 0;
- BUG_ON(trans->need_reset);
-
if (!trans->nr_updates)
goto out_noupdates;
@@ -1023,10 +1017,6 @@ int bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter,
*/
if (trans->iters_live & (1ULL << i->iter->idx)) {
i->iter = bch2_trans_copy_iter(trans, i->iter);
- if (IS_ERR(i->iter)) {
- trans->need_reset = true;
- return PTR_ERR(i->iter);
- }
i->iter->flags |= BTREE_ITER_KEEP_UNTIL_COMMIT;
bch2_trans_iter_put(trans, i->iter);
@@ -1036,7 +1026,7 @@ int bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter,
bch2_btree_iter_set_pos(i->iter, n.k->k.p);
}
- EBUG_ON(trans->nr_updates >= trans->nr_iters);
+ EBUG_ON(trans->nr_updates >= BTREE_ITER_MAX);
array_insert_item(trans->updates, trans->nr_updates,
i - trans->updates, n);
@@ -1051,8 +1041,6 @@ int __bch2_btree_insert(struct btree_trans *trans,
iter = bch2_trans_get_iter(trans, id, bkey_start_pos(&k->k),
BTREE_ITER_INTENT);
- if (IS_ERR(iter))
- return PTR_ERR(iter);
ret = bch2_btree_iter_traverse(iter) ?:
bch2_trans_update(trans, iter, k, 0);
@@ -1076,13 +1064,29 @@ int bch2_btree_insert(struct bch_fs *c, enum btree_id id,
__bch2_btree_insert(&trans, id, k));
}
-int bch2_btree_delete_at_range(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bpos end,
- u64 *journal_seq)
+int bch2_btree_delete_at(struct btree_trans *trans,
+ struct btree_iter *iter, unsigned flags)
{
+ struct bkey_i k;
+
+ bkey_init(&k.k);
+ k.k.p = iter->pos;
+
+ bch2_trans_update(trans, iter, &k, 0);
+ return bch2_trans_commit(trans, NULL, NULL,
+ BTREE_INSERT_NOFAIL|
+ BTREE_INSERT_USE_RESERVE|flags);
+}
+
+int bch2_btree_delete_range_trans(struct btree_trans *trans, enum btree_id id,
+ struct bpos start, struct bpos end,
+ u64 *journal_seq)
+{
+ struct btree_iter *iter;
struct bkey_s_c k;
int ret = 0;
+
+ iter = bch2_trans_get_iter(trans, id, start, BTREE_ITER_INTENT);
retry:
while ((k = bch2_btree_iter_peek(iter)).k &&
!(ret = bkey_err(k)) &&
@@ -1094,6 +1098,10 @@ retry:
bkey_init(&delete.k);
/*
+ * This could probably be more efficient for extents:
+ */
+
+ /*
* For extents, iter.pos won't necessarily be the same as
* bkey_start_pos(k.k) (for non extents they always will be the
* same). It's important that we delete starting from iter.pos
@@ -1132,22 +1140,8 @@ retry:
goto retry;
}
+ bch2_trans_iter_put(trans, iter);
return ret;
-
-}
-
-int bch2_btree_delete_at(struct btree_trans *trans,
- struct btree_iter *iter, unsigned flags)
-{
- struct bkey_i k;
-
- bkey_init(&k.k);
- k.k.p = iter->pos;
-
- bch2_trans_update(trans, iter, &k, 0);
- return bch2_trans_commit(trans, NULL, NULL,
- BTREE_INSERT_NOFAIL|
- BTREE_INSERT_USE_RESERVE|flags);
}
/*
@@ -1159,21 +1153,6 @@ int bch2_btree_delete_range(struct bch_fs *c, enum btree_id id,
struct bpos start, struct bpos end,
u64 *journal_seq)
{
- struct btree_trans trans;
- struct btree_iter *iter;
- int ret = 0;
-
- /*
- * XXX: whether we need mem/more iters depends on whether this btree id
- * has triggers
- */
- bch2_trans_init(&trans, c, BTREE_ITER_MAX, 512);
-
- iter = bch2_trans_get_iter(&trans, id, start, BTREE_ITER_INTENT);
-
- ret = bch2_btree_delete_at_range(&trans, iter, end, journal_seq);
- ret = bch2_trans_exit(&trans) ?: ret;
-
- BUG_ON(ret == -EINTR);
- return ret;
+ return bch2_trans_do(c, NULL, journal_seq, 0,
+ bch2_btree_delete_range_trans(&trans, id, start, end, journal_seq));
}
diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c
index 82f1cc4ca693..1934b845ea15 100644
--- a/fs/bcachefs/buckets.c
+++ b/fs/bcachefs/buckets.c
@@ -142,8 +142,8 @@ void bch2_fs_usage_initialize(struct bch_fs *c)
percpu_down_write(&c->mark_lock);
usage = c->usage_base;
- bch2_fs_usage_acc_to_base(c, 0);
- bch2_fs_usage_acc_to_base(c, 1);
+ for (i = 0; i < ARRAY_SIZE(c->usage); i++)
+ bch2_fs_usage_acc_to_base(c, i);
for (i = 0; i < BCH_REPLICAS_MAX; i++)
usage->reserved += usage->persistent_reserved[i];
@@ -207,13 +207,13 @@ static inline struct bch_fs_usage *fs_usage_ptr(struct bch_fs *c,
{
return this_cpu_ptr(gc
? c->usage_gc
- : c->usage[journal_seq & 1]);
+ : c->usage[journal_seq & JOURNAL_BUF_MASK]);
}
u64 bch2_fs_usage_read_one(struct bch_fs *c, u64 *v)
{
ssize_t offset = v - (u64 *) c->usage_base;
- unsigned seq;
+ unsigned i, seq;
u64 ret;
BUG_ON(offset < 0 || offset >= fs_usage_u64s(c));
@@ -221,9 +221,10 @@ u64 bch2_fs_usage_read_one(struct bch_fs *c, u64 *v)
do {
seq = read_seqcount_begin(&c->usage_lock);
- ret = *v +
- percpu_u64_get((u64 __percpu *) c->usage[0] + offset) +
- percpu_u64_get((u64 __percpu *) c->usage[1] + offset);
+ ret = *v;
+
+ for (i = 0; i < ARRAY_SIZE(c->usage); i++)
+ ret += percpu_u64_get((u64 __percpu *) c->usage[i] + offset);
} while (read_seqcount_retry(&c->usage_lock, seq));
return ret;
@@ -232,7 +233,7 @@ u64 bch2_fs_usage_read_one(struct bch_fs *c, u64 *v)
struct bch_fs_usage *bch2_fs_usage_read(struct bch_fs *c)
{
struct bch_fs_usage *ret;
- unsigned seq, v, u64s = fs_usage_u64s(c);
+ unsigned seq, i, v, u64s = fs_usage_u64s(c);
retry:
ret = kmalloc(u64s * sizeof(u64), GFP_NOFS);
if (unlikely(!ret))
@@ -251,8 +252,8 @@ retry:
do {
seq = read_seqcount_begin(&c->usage_lock);
memcpy(ret, c->usage_base, u64s * sizeof(u64));
- acc_u64s_percpu((u64 *) ret, (u64 __percpu *) c->usage[0], u64s);
- acc_u64s_percpu((u64 *) ret, (u64 __percpu *) c->usage[1], u64s);
+ for (i = 0; i < ARRAY_SIZE(c->usage); i++)
+ acc_u64s_percpu((u64 *) ret, (u64 __percpu *) c->usage[i], u64s);
} while (read_seqcount_retry(&c->usage_lock, seq));
return ret;
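
With the journal able to keep more than two write buffers in flight, a usage read now sums the base counters plus one delta per journal buffer instead of hard-coding slots 0 and 1. A toy model of the read side is below; JOURNAL_BUF_NR and the flat arrays are assumptions for illustration, and the real code reads percpu counters inside a seqcount retry loop:

    #include <stdint.h>
    #include <stdio.h>

    #define JOURNAL_BUF_NR 4    /* assumption: whatever ARRAY_SIZE(c->usage) is */

    static uint64_t usage_base;
    static uint64_t usage_delta[JOURNAL_BUF_NR];

    /* a read is the base plus the delta accumulated for every journal buffer */
    static uint64_t usage_read(void)
    {
        uint64_t ret = usage_base;

        for (unsigned i = 0; i < JOURNAL_BUF_NR; i++)
            ret += usage_delta[i];
        /* the real code wraps this loop in read_seqcount_begin()/retry and
         * pulls each delta out of percpu counters with percpu_u64_get() */
        return ret;
    }

    int main(void)
    {
        usage_base     = 100;
        usage_delta[0] = 1;
        usage_delta[2] = 3;
        printf("usage = %llu\n", (unsigned long long)usage_read());
        return 0;
    }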
@@ -262,7 +263,7 @@ void bch2_fs_usage_acc_to_base(struct bch_fs *c, unsigned idx)
{
unsigned u64s = fs_usage_u64s(c);
- BUG_ON(idx >= 2);
+ BUG_ON(idx >= ARRAY_SIZE(c->usage));
preempt_disable();
write_seqcount_begin(&c->usage_lock);
@@ -323,7 +324,7 @@ static u64 reserve_factor(u64 r)
static u64 avail_factor(u64 r)
{
- return (r << RESERVE_FACTOR) / ((1 << RESERVE_FACTOR) + 1);
+ return div_u64(r << RESERVE_FACTOR, (1 << RESERVE_FACTOR) + 1);
}
u64 bch2_fs_sectors_used(struct bch_fs *c, struct bch_fs_usage *fs_usage)
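
The switch to div_u64() is there because in-kernel 64-bit division on 32-bit targets cannot use the plain / operator (that would need a libgcc helper); div_u64() performs the 64-by-32 divide explicitly. A userspace model of the same arithmetic, where the operator is fine, with the RESERVE_FACTOR value taken as an assumption:

    #include <stdint.h>
    #include <stdio.h>

    #define RESERVE_FACTOR 6    /* assumption, not necessarily the in-tree value */

    /* userspace model of avail_factor(): scale r down by N/(N+1) */
    static uint64_t avail_factor(uint64_t r)
    {
        return (r << RESERVE_FACTOR) / ((1 << RESERVE_FACTOR) + 1);
    }

    int main(void)
    {
        printf("%llu\n", (unsigned long long)avail_factor(65000));  /* 64000 */
        return 0;
    }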
@@ -1333,10 +1334,8 @@ static int bch2_mark_key_locked(struct bch_fs *c,
ret = bch2_mark_stripe(c, old, new, fs_usage, journal_seq, flags);
break;
case KEY_TYPE_inode:
- if (!(flags & BTREE_TRIGGER_OVERWRITE))
- fs_usage->nr_inodes++;
- else
- fs_usage->nr_inodes--;
+ fs_usage->nr_inodes += new.k->type == KEY_TYPE_inode;
+ fs_usage->nr_inodes -= old.k->type == KEY_TYPE_inode;
break;
case KEY_TYPE_reservation: {
unsigned replicas = bkey_s_c_to_reservation(k).v->nr_replicas;
@@ -1400,10 +1399,10 @@ int bch2_mark_update(struct btree_trans *trans,
old = (struct bkey_s_c) { &unpacked, NULL };
if (!btree_node_type_is_extents(iter->btree_id)) {
+ /* iterators should be uptodate, shouldn't get errors here: */
if (btree_iter_type(iter) != BTREE_ITER_CACHED) {
- _old = bch2_btree_node_iter_peek(&node_iter, b);
- if (_old)
- old = bkey_disassemble(b, _old, &unpacked);
+ old = bch2_btree_iter_peek_slot(iter);
+ BUG_ON(bkey_err(old));
} else {
struct bkey_cached *ck = (void *) iter->l[0].b;
@@ -1576,9 +1575,6 @@ static int trans_get_key(struct btree_trans *trans,
*iter = bch2_trans_get_iter(trans, btree_id, pos,
flags|BTREE_ITER_INTENT);
- if (IS_ERR(*iter))
- return PTR_ERR(*iter);
-
*k = __bch2_btree_iter_peek(*iter, flags);
ret = bkey_err(*k);
if (ret)
@@ -1606,9 +1602,6 @@ static int bch2_trans_start_alloc_update(struct btree_trans *trans, struct btree
BTREE_ITER_CACHED|
BTREE_ITER_CACHED_NOFILL|
BTREE_ITER_INTENT);
- if (IS_ERR(iter))
- return PTR_ERR(iter);
-
ret = bch2_btree_iter_traverse(iter);
if (ret) {
bch2_trans_iter_put(trans, iter);
@@ -1754,59 +1747,92 @@ static int bch2_trans_mark_extent(struct btree_trans *trans,
return 0;
}
+static int bch2_trans_mark_stripe_alloc_ref(struct btree_trans *trans,
+ const struct bch_extent_ptr *ptr,
+ s64 sectors, bool parity)
+{
+ struct bkey_i_alloc *a;
+ struct btree_iter *iter;
+ struct bkey_alloc_unpacked u;
+ int ret;
+
+ ret = bch2_trans_start_alloc_update(trans, &iter, ptr, &u);
+ if (ret)
+ return ret;
+
+ if (parity) {
+ u.dirty_sectors += sectors;
+ u.data_type = u.dirty_sectors
+ ? BCH_DATA_parity
+ : 0;
+ }
+
+ a = bch2_trans_kmalloc(trans, BKEY_ALLOC_U64s_MAX * 8);
+ ret = PTR_ERR_OR_ZERO(a);
+ if (ret)
+ goto err;
+
+ bkey_alloc_init(&a->k_i);
+ a->k.p = iter->pos;
+ bch2_alloc_pack(a, u);
+ bch2_trans_update(trans, iter, &a->k_i, 0);
+err:
+ bch2_trans_iter_put(trans, iter);
+ return ret;
+}
+
static int bch2_trans_mark_stripe(struct btree_trans *trans,
- struct bkey_s_c k,
+ struct bkey_s_c old, struct bkey_s_c new,
unsigned flags)
{
- const struct bch_stripe *s = bkey_s_c_to_stripe(k).v;
- unsigned nr_data = s->nr_blocks - s->nr_redundant;
+ const struct bch_stripe *old_s = old.k->type == KEY_TYPE_stripe
+ ? bkey_s_c_to_stripe(old).v : NULL;
+ const struct bch_stripe *new_s = new.k->type == KEY_TYPE_stripe
+ ? bkey_s_c_to_stripe(new).v : NULL;
struct bch_replicas_padded r;
- struct bkey_alloc_unpacked u;
- struct bkey_i_alloc *a;
- struct btree_iter *iter;
- bool deleting = flags & BTREE_TRIGGER_OVERWRITE;
- s64 sectors = le16_to_cpu(s->sectors);
unsigned i;
int ret = 0;
- if (deleting)
- sectors = -sectors;
-
- bch2_bkey_to_replicas(&r.e, k);
- update_replicas_list(trans, &r.e, sectors * s->nr_redundant);
-
/*
- * The allocator code doesn't necessarily update bucket gens in the
- * btree when incrementing them, right before handing out new buckets -
- * we just need to persist those updates here along with the new stripe:
+ * If the pointers aren't changing, we don't need to do anything:
*/
+ if (new_s && old_s &&
+ !memcmp(old_s->ptrs, new_s->ptrs,
+ new_s->nr_blocks * sizeof(struct bch_extent_ptr)))
+ return 0;
- for (i = 0; i < s->nr_blocks && !ret; i++) {
- bool parity = i >= nr_data;
+ if (new_s) {
+ unsigned nr_data = new_s->nr_blocks - new_s->nr_redundant;
+ s64 sectors = le16_to_cpu(new_s->sectors);
- ret = bch2_trans_start_alloc_update(trans, &iter,
- &s->ptrs[i], &u);
- if (ret)
- break;
+ bch2_bkey_to_replicas(&r.e, new);
+ update_replicas_list(trans, &r.e, sectors * new_s->nr_redundant);
+
+ for (i = 0; i < new_s->nr_blocks; i++) {
+ bool parity = i >= nr_data;
- if (parity) {
- u.dirty_sectors += sectors;
- u.data_type = u.dirty_sectors
- ? BCH_DATA_parity
- : 0;
+ ret = bch2_trans_mark_stripe_alloc_ref(trans,
+ &new_s->ptrs[i], sectors, parity);
+ if (ret)
+ return ret;
}
+ }
- a = bch2_trans_kmalloc(trans, BKEY_ALLOC_U64s_MAX * 8);
- ret = PTR_ERR_OR_ZERO(a);
- if (ret)
- goto put_iter;
-
- bkey_alloc_init(&a->k_i);
- a->k.p = iter->pos;
- bch2_alloc_pack(a, u);
- bch2_trans_update(trans, iter, &a->k_i, 0);
-put_iter:
- bch2_trans_iter_put(trans, iter);
+ if (old_s) {
+ unsigned nr_data = old_s->nr_blocks - old_s->nr_redundant;
+ s64 sectors = -((s64) le16_to_cpu(old_s->sectors));
+
+ bch2_bkey_to_replicas(&r.e, old);
+ update_replicas_list(trans, &r.e, sectors * old_s->nr_redundant);
+
+ for (i = 0; i < old_s->nr_blocks; i++) {
+ bool parity = i >= nr_data;
+
+ ret = bch2_trans_mark_stripe_alloc_ref(trans,
+ &old_s->ptrs[i], sectors, parity);
+ if (ret)
+ return ret;
+ }
}
return ret;
@@ -1905,11 +1931,16 @@ static int bch2_trans_mark_reflink_p(struct btree_trans *trans,
return ret;
}
-int bch2_trans_mark_key(struct btree_trans *trans, struct bkey_s_c k,
+int bch2_trans_mark_key(struct btree_trans *trans,
+ struct bkey_s_c old,
+ struct bkey_s_c new,
unsigned offset, s64 sectors, unsigned flags)
{
- struct replicas_delta_list *d;
struct bch_fs *c = trans->c;
+ struct bkey_s_c k = flags & BTREE_TRIGGER_INSERT ? new : old;
+ struct replicas_delta_list *d;
+
+ BUG_ON(!(flags & (BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE)));
switch (k.k->type) {
case KEY_TYPE_btree_ptr:
@@ -1925,15 +1956,18 @@ int bch2_trans_mark_key(struct btree_trans *trans, struct bkey_s_c k,
return bch2_trans_mark_extent(trans, k, offset, sectors,
flags, BCH_DATA_user);
case KEY_TYPE_stripe:
- return bch2_trans_mark_stripe(trans, k, flags);
- case KEY_TYPE_inode:
- d = replicas_deltas_realloc(trans, 0);
+ return bch2_trans_mark_stripe(trans, old, new, flags);
+ case KEY_TYPE_inode: {
+ int nr = (new.k->type == KEY_TYPE_inode) -
+ (old.k->type == KEY_TYPE_inode);
+
+ if (nr) {
+ d = replicas_deltas_realloc(trans, 0);
+ d->nr_inodes += nr;
+ }
- if (!(flags & BTREE_TRIGGER_OVERWRITE))
- d->nr_inodes++;
- else
- d->nr_inodes--;
return 0;
+ }
case KEY_TYPE_reservation: {
unsigned replicas = bkey_s_c_to_reservation(k).v->nr_replicas;
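
Passing both the old and the new key to the trigger lets the inode accounting collapse to a difference of two booleans, which covers creates, deletes and overwrites uniformly. A standalone sketch of just that arithmetic; the enum is a stand-in, not the real key-type list:

    #include <stdio.h>

    enum key_type { KEY_TYPE_deleted, KEY_TYPE_inode, KEY_TYPE_other };

    /* with old and new keys in hand, the inode-count delta is one expression */
    static int nr_inodes_delta(enum key_type old_type, enum key_type new_type)
    {
        return (new_type == KEY_TYPE_inode) - (old_type == KEY_TYPE_inode);
    }

    int main(void)
    {
        printf("create:    %+d\n", nr_inodes_delta(KEY_TYPE_deleted, KEY_TYPE_inode));
        printf("delete:    %+d\n", nr_inodes_delta(KEY_TYPE_inode, KEY_TYPE_deleted));
        printf("overwrite: %+d\n", nr_inodes_delta(KEY_TYPE_inode, KEY_TYPE_inode));
        return 0;
    }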
@@ -1957,12 +1991,10 @@ int bch2_trans_mark_key(struct btree_trans *trans, struct bkey_s_c k,
int bch2_trans_mark_update(struct btree_trans *trans,
struct btree_iter *iter,
- struct bkey_i *insert,
+ struct bkey_i *new,
unsigned flags)
{
- struct btree *b = iter_l(iter)->b;
- struct btree_node_iter node_iter = iter_l(iter)->iter;
- struct bkey_packed *_k;
+ struct bkey_s_c old;
int ret;
if (unlikely(flags & BTREE_TRIGGER_NORUN))
@@ -1971,79 +2003,97 @@ int bch2_trans_mark_update(struct btree_trans *trans,
if (!btree_node_type_needs_gc(iter->btree_id))
return 0;
- ret = bch2_trans_mark_key(trans, bkey_i_to_s_c(insert),
- 0, insert->k.size, BTREE_TRIGGER_INSERT);
- if (ret)
- return ret;
-
- if (btree_iter_type(iter) == BTREE_ITER_CACHED) {
- struct bkey_cached *ck = (void *) iter->l[0].b;
+ if (!btree_node_type_is_extents(iter->btree_id)) {
+ /* iterators should be uptodate, shouldn't get errors here: */
+ if (btree_iter_type(iter) != BTREE_ITER_CACHED) {
+ old = bch2_btree_iter_peek_slot(iter);
+ BUG_ON(bkey_err(old));
+ } else {
+ struct bkey_cached *ck = (void *) iter->l[0].b;
- return bch2_trans_mark_key(trans, bkey_i_to_s_c(ck->k),
- 0, 0, BTREE_TRIGGER_OVERWRITE);
- }
+ BUG_ON(!ck->valid);
+ old = bkey_i_to_s_c(ck->k);
+ }
- while ((_k = bch2_btree_node_iter_peek(&node_iter, b))) {
+ if (old.k->type == new->k.type) {
+ ret = bch2_trans_mark_key(trans, old, bkey_i_to_s_c(new), 0, 0,
+ BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE|flags);
+ } else {
+ ret = bch2_trans_mark_key(trans, old, bkey_i_to_s_c(new), 0, 0,
+ BTREE_TRIGGER_INSERT|flags) ?:
+ bch2_trans_mark_key(trans, old, bkey_i_to_s_c(new), 0, 0,
+ BTREE_TRIGGER_OVERWRITE|flags);
+ }
+ } else {
+ struct btree *b = iter_l(iter)->b;
+ struct btree_node_iter node_iter = iter_l(iter)->iter;
+ struct bkey_packed *_old;
struct bkey unpacked;
- struct bkey_s_c k;
- unsigned offset = 0;
- s64 sectors = 0;
- unsigned flags = BTREE_TRIGGER_OVERWRITE;
- k = bkey_disassemble(b, _k, &unpacked);
+ EBUG_ON(btree_iter_type(iter) == BTREE_ITER_CACHED);
- if (btree_node_is_extents(b)
- ? bkey_cmp(insert->k.p, bkey_start_pos(k.k)) <= 0
- : bkey_cmp(insert->k.p, k.k->p))
- break;
+ bkey_init(&unpacked);
+ old = (struct bkey_s_c) { &unpacked, NULL };
+
+ ret = bch2_trans_mark_key(trans, old, bkey_i_to_s_c(new),
+ 0, new->k.size,
+ BTREE_TRIGGER_INSERT);
+ if (ret)
+ return ret;
+
+ while ((_old = bch2_btree_node_iter_peek(&node_iter, b))) {
+ unsigned flags = BTREE_TRIGGER_OVERWRITE;
+ unsigned offset = 0;
+ s64 sectors;
+
+ old = bkey_disassemble(b, _old, &unpacked);
+ sectors = -((s64) old.k->size);
+
+ flags |= BTREE_TRIGGER_OVERWRITE;
+
+ if (bkey_cmp(new->k.p, bkey_start_pos(old.k)) <= 0)
+ return 0;
- if (btree_node_is_extents(b)) {
- switch (bch2_extent_overlap(&insert->k, k.k)) {
+ switch (bch2_extent_overlap(&new->k, old.k)) {
case BCH_EXTENT_OVERLAP_ALL:
offset = 0;
- sectors = -((s64) k.k->size);
+ sectors = -((s64) old.k->size);
break;
case BCH_EXTENT_OVERLAP_BACK:
- offset = bkey_start_offset(&insert->k) -
- bkey_start_offset(k.k);
- sectors = bkey_start_offset(&insert->k) -
- k.k->p.offset;
+ offset = bkey_start_offset(&new->k) -
+ bkey_start_offset(old.k);
+ sectors = bkey_start_offset(&new->k) -
+ old.k->p.offset;
break;
case BCH_EXTENT_OVERLAP_FRONT:
offset = 0;
- sectors = bkey_start_offset(k.k) -
- insert->k.p.offset;
+ sectors = bkey_start_offset(old.k) -
+ new->k.p.offset;
break;
case BCH_EXTENT_OVERLAP_MIDDLE:
- offset = bkey_start_offset(&insert->k) -
- bkey_start_offset(k.k);
- sectors = -((s64) insert->k.size);
+ offset = bkey_start_offset(&new->k) -
+ bkey_start_offset(old.k);
+ sectors = -((s64) new->k.size);
flags |= BTREE_TRIGGER_OVERWRITE_SPLIT;
break;
}
BUG_ON(sectors >= 0);
- }
- ret = bch2_trans_mark_key(trans, k, offset, sectors, flags);
- if (ret)
- return ret;
+ ret = bch2_trans_mark_key(trans, old, bkey_i_to_s_c(new),
+ offset, sectors, flags);
+ if (ret)
+ return ret;
- bch2_btree_node_iter_advance(&node_iter, b);
+ bch2_btree_node_iter_advance(&node_iter, b);
+ }
}
- return 0;
+ return ret;
}
/* Disk reservations: */
-static u64 bch2_recalc_sectors_available(struct bch_fs *c)
-{
- percpu_u64_set(&c->pcpu->sectors_available, 0);
-
- return avail_factor(__bch2_fs_usage_read_short(c).free);
-}
-
void __bch2_disk_reservation_put(struct bch_fs *c, struct disk_reservation *res)
{
percpu_down_read(&c->mark_lock);
@@ -2078,7 +2128,6 @@ int bch2_disk_reservation_add(struct bch_fs *c, struct disk_reservation *res,
if (get < sectors) {
preempt_enable();
- percpu_up_read(&c->mark_lock);
goto recalculate;
}
} while ((v = atomic64_cmpxchg(&c->sectors_available,
@@ -2096,9 +2145,10 @@ out:
return 0;
recalculate:
- percpu_down_write(&c->mark_lock);
+ mutex_lock(&c->sectors_available_lock);
- sectors_available = bch2_recalc_sectors_available(c);
+ percpu_u64_set(&c->pcpu->sectors_available, 0);
+ sectors_available = avail_factor(__bch2_fs_usage_read_short(c).free);
if (sectors <= sectors_available ||
(flags & BCH_DISK_RESERVATION_NOFAIL)) {
@@ -2112,7 +2162,8 @@ recalculate:
ret = -ENOSPC;
}
- percpu_up_write(&c->mark_lock);
+ mutex_unlock(&c->sectors_available_lock);
+ percpu_up_read(&c->mark_lock);
return ret;
}
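
The reservation slow path now recalculates under a dedicated sectors_available_lock while holding mark_lock only for read, instead of taking mark_lock for write. A simplified standalone model of the overall shape, a lockless compare-and-swap fast path with a mutex-protected recalculation fallback; the percpu caching layer of the real code is omitted:

    #include <stdatomic.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <pthread.h>

    static _Atomic uint64_t sectors_available = 1000;
    static pthread_mutex_t  recalc_lock = PTHREAD_MUTEX_INITIALIZER;

    static int reserve(uint64_t sectors)
    {
        uint64_t v = atomic_load(&sectors_available);

        /* fast path: carve sectors out of the shared counter with CAS */
        while (v >= sectors) {
            if (atomic_compare_exchange_weak(&sectors_available, &v, v - sectors))
                return 0;
        }

        /* slow path: recompute what is really free, then decide */
        pthread_mutex_lock(&recalc_lock);
        uint64_t avail = atomic_load(&sectors_available);   /* stand-in for a real recalculation */
        int ret = avail >= sectors ? 0 : -1;                 /* -1 stands in for -ENOSPC */
        if (!ret)
            atomic_fetch_sub(&sectors_available, sectors);
        pthread_mutex_unlock(&recalc_lock);
        return ret;
    }

    int main(void)
    {
        printf("first:  %d\n", reserve(600));
        printf("second: %d\n", reserve(600));   /* falls to the slow path and fails */
        return 0;
    }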
diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h
index a3873becbb70..3a5ed1fcaf78 100644
--- a/fs/bcachefs/buckets.h
+++ b/fs/bcachefs/buckets.h
@@ -264,7 +264,7 @@ int bch2_mark_update(struct btree_trans *, struct btree_iter *,
int bch2_replicas_delta_list_apply(struct bch_fs *,
struct bch_fs_usage *,
struct replicas_delta_list *);
-int bch2_trans_mark_key(struct btree_trans *, struct bkey_s_c,
+int bch2_trans_mark_key(struct btree_trans *, struct bkey_s_c, struct bkey_s_c,
unsigned, s64, unsigned);
int bch2_trans_mark_update(struct btree_trans *, struct btree_iter *iter,
struct bkey_i *insert, unsigned);
diff --git a/fs/bcachefs/chardev.c b/fs/bcachefs/chardev.c
index 0377f9018d27..e7c8969aaad1 100644
--- a/fs/bcachefs/chardev.c
+++ b/fs/bcachefs/chardev.c
@@ -5,6 +5,7 @@
#include "bcachefs_ioctl.h"
#include "buckets.h"
#include "chardev.h"
+#include "journal.h"
#include "move.h"
#include "replicas.h"
#include "super.h"
@@ -340,7 +341,8 @@ static long bch2_ioctl_data(struct bch_fs *c,
ctx->c = c;
ctx->arg = arg;
- ctx->thread = kthread_create(bch2_data_thread, ctx, "[bcachefs]");
+ ctx->thread = kthread_create(bch2_data_thread, ctx,
+ "bch-data/%s", c->name);
if (IS_ERR(ctx->thread)) {
ret = PTR_ERR(ctx->thread);
goto err;
@@ -563,6 +565,26 @@ static long bch2_ioctl_disk_resize(struct bch_fs *c,
return ret;
}
+static long bch2_ioctl_disk_resize_journal(struct bch_fs *c,
+ struct bch_ioctl_disk_resize_journal arg)
+{
+ struct bch_dev *ca;
+ int ret;
+
+ if ((arg.flags & ~BCH_BY_INDEX) ||
+ arg.pad)
+ return -EINVAL;
+
+ ca = bch2_device_lookup(c, arg.dev, arg.flags);
+ if (IS_ERR(ca))
+ return PTR_ERR(ca);
+
+ ret = bch2_set_nr_journal_buckets(c, ca, arg.nbuckets);
+
+ percpu_ref_put(&ca->ref);
+ return ret;
+}
+
#define BCH_IOCTL(_name, _argtype) \
do { \
_argtype i; \
@@ -619,6 +641,8 @@ long bch2_fs_ioctl(struct bch_fs *c, unsigned cmd, void __user *arg)
BCH_IOCTL(data, struct bch_ioctl_data);
case BCH_IOCTL_DISK_RESIZE:
BCH_IOCTL(disk_resize, struct bch_ioctl_disk_resize);
+ case BCH_IOCTL_DISK_RESIZE_JOURNAL:
+ BCH_IOCTL(disk_resize_journal, struct bch_ioctl_disk_resize_journal);
default:
return -ENOTTY;
diff --git a/fs/bcachefs/checksum.c b/fs/bcachefs/checksum.c
index a01073e54a33..3d88719ba86c 100644
--- a/fs/bcachefs/checksum.c
+++ b/fs/bcachefs/checksum.c
@@ -10,7 +10,7 @@
#include <linux/random.h>
#include <linux/scatterlist.h>
#include <crypto/algapi.h>
-#include <crypto/chacha20.h>
+#include <crypto/chacha.h>
#include <crypto/hash.h>
#include <crypto/poly1305.h>
#include <crypto/skcipher.h>
@@ -68,21 +68,21 @@ static u64 bch2_checksum_update(unsigned type, u64 crc, const void *data, size_t
}
}
-static inline void do_encrypt_sg(struct crypto_skcipher *tfm,
+static inline void do_encrypt_sg(struct crypto_sync_skcipher *tfm,
struct nonce nonce,
struct scatterlist *sg, size_t len)
{
- SKCIPHER_REQUEST_ON_STACK(req, tfm);
+ SYNC_SKCIPHER_REQUEST_ON_STACK(req, tfm);
int ret;
- skcipher_request_set_tfm(req, tfm);
+ skcipher_request_set_sync_tfm(req, tfm);
skcipher_request_set_crypt(req, sg, sg, len, nonce.d);
ret = crypto_skcipher_encrypt(req);
BUG_ON(ret);
}
-static inline void do_encrypt(struct crypto_skcipher *tfm,
+static inline void do_encrypt(struct crypto_sync_skcipher *tfm,
struct nonce nonce,
void *buf, size_t len)
{
@@ -95,8 +95,8 @@ static inline void do_encrypt(struct crypto_skcipher *tfm,
int bch2_chacha_encrypt_key(struct bch_key *key, struct nonce nonce,
void *buf, size_t len)
{
- struct crypto_skcipher *chacha20 =
- crypto_alloc_skcipher("chacha20", 0, 0);
+ struct crypto_sync_skcipher *chacha20 =
+ crypto_alloc_sync_skcipher("chacha20", 0, 0);
int ret;
if (!chacha20) {
@@ -104,7 +104,8 @@ int bch2_chacha_encrypt_key(struct bch_key *key, struct nonce nonce,
return PTR_ERR(chacha20);
}
- ret = crypto_skcipher_setkey(chacha20, (void *) key, sizeof(*key));
+ ret = crypto_skcipher_setkey(&chacha20->base,
+ (void *) key, sizeof(*key));
if (ret) {
pr_err("crypto_skcipher_setkey() error: %i", ret);
goto err;
@@ -112,7 +113,7 @@ int bch2_chacha_encrypt_key(struct bch_key *key, struct nonce nonce,
do_encrypt(chacha20, nonce, buf, len);
err:
- crypto_free_skcipher(chacha20);
+ crypto_free_sync_skcipher(chacha20);
return ret;
}
@@ -199,7 +200,7 @@ static struct bch_csum __bch2_checksum_bio(struct bch_fs *c, unsigned type,
kunmap_atomic(p);
}
#else
- __bio_for_each_contig_segment(bv, bio, *iter, *iter)
+ __bio_for_each_bvec(bv, bio, *iter, *iter)
crc = bch2_checksum_update(type, crc,
page_address(bv.bv_page) + bv.bv_offset,
bv.bv_len);
@@ -224,7 +225,7 @@ static struct bch_csum __bch2_checksum_bio(struct bch_fs *c, unsigned type,
kunmap_atomic(p);
}
#else
- __bio_for_each_contig_segment(bv, bio, *iter, *iter)
+ __bio_for_each_bvec(bv, bio, *iter, *iter)
crypto_shash_update(desc,
page_address(bv.bv_page) + bv.bv_offset,
bv.bv_len);
@@ -463,7 +464,7 @@ err:
static int bch2_alloc_ciphers(struct bch_fs *c)
{
if (!c->chacha20)
- c->chacha20 = crypto_alloc_skcipher("chacha20", 0, 0);
+ c->chacha20 = crypto_alloc_sync_skcipher("chacha20", 0, 0);
if (IS_ERR(c->chacha20)) {
bch_err(c, "error requesting chacha20 module: %li",
PTR_ERR(c->chacha20));
@@ -546,7 +547,7 @@ int bch2_enable_encryption(struct bch_fs *c, bool keyed)
goto err;
}
- ret = crypto_skcipher_setkey(c->chacha20,
+ ret = crypto_skcipher_setkey(&c->chacha20->base,
(void *) &key.key, sizeof(key.key));
if (ret)
goto err;
@@ -574,7 +575,7 @@ void bch2_fs_encryption_exit(struct bch_fs *c)
if (!IS_ERR_OR_NULL(c->poly1305))
crypto_free_shash(c->poly1305);
if (!IS_ERR_OR_NULL(c->chacha20))
- crypto_free_skcipher(c->chacha20);
+ crypto_free_sync_skcipher(c->chacha20);
if (!IS_ERR_OR_NULL(c->sha256))
crypto_free_shash(c->sha256);
}
@@ -606,7 +607,7 @@ int bch2_fs_encryption_init(struct bch_fs *c)
if (ret)
goto out;
- ret = crypto_skcipher_setkey(c->chacha20,
+ ret = crypto_skcipher_setkey(&c->chacha20->base,
(void *) &key.key, sizeof(key.key));
if (ret)
goto out;
diff --git a/fs/bcachefs/checksum.h b/fs/bcachefs/checksum.h
index 833537cc8fd0..24dee8039d57 100644
--- a/fs/bcachefs/checksum.h
+++ b/fs/bcachefs/checksum.h
@@ -7,7 +7,7 @@
#include "super-io.h"
#include <linux/crc64.h>
-#include <crypto/chacha20.h>
+#include <crypto/chacha.h>
static inline bool bch2_checksum_mergeable(unsigned type)
{
@@ -138,9 +138,9 @@ static inline bool bch2_crc_cmp(struct bch_csum l, struct bch_csum r)
/* for skipping ahead and encrypting/decrypting at an offset: */
static inline struct nonce nonce_add(struct nonce nonce, unsigned offset)
{
- EBUG_ON(offset & (CHACHA20_BLOCK_SIZE - 1));
+ EBUG_ON(offset & (CHACHA_BLOCK_SIZE - 1));
- le32_add_cpu(&nonce.d[0], offset / CHACHA20_BLOCK_SIZE);
+ le32_add_cpu(&nonce.d[0], offset / CHACHA_BLOCK_SIZE);
return nonce;
}
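
nonce_add() only changes spelling here (CHACHA20_BLOCK_SIZE becomes CHACHA_BLOCK_SIZE), but what it does is advance the nonce by the number of ChaCha blocks covered by the offset, so encryption or decryption can start partway into an extent. A toy standalone version; the struct layout is a stand-in and the real field is little-endian:

    #include <stdint.h>
    #include <stdio.h>

    #define CHACHA_BLOCK_SIZE 64    /* ChaCha operates on 64-byte blocks */

    struct nonce { uint32_t d[4]; };    /* toy layout */

    /* bump the first nonce word by offset / 64 blocks; the offset must be
     * block aligned (the real code checks this with EBUG_ON) */
    static struct nonce nonce_add(struct nonce nonce, unsigned offset)
    {
        nonce.d[0] += offset / CHACHA_BLOCK_SIZE;
        return nonce;
    }

    int main(void)
    {
        struct nonce n = { { 0, 1, 2, 3 } };

        n = nonce_add(n, 8 * CHACHA_BLOCK_SIZE);
        printf("first nonce word now %u\n", n.d[0]);
        return 0;
    }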
diff --git a/fs/bcachefs/compress.c b/fs/bcachefs/compress.c
index 0d68a277cfd7..aebf46bb1d21 100644
--- a/fs/bcachefs/compress.c
+++ b/fs/bcachefs/compress.c
@@ -45,7 +45,7 @@ static bool bio_phys_contig(struct bio *bio, struct bvec_iter start)
struct bvec_iter iter;
void *expected_start = NULL;
- __bio_for_each_segment(bv, bio, iter, start) {
+ __bio_for_each_bvec(bv, bio, iter, start) {
if (expected_start &&
expected_start != page_address(bv.bv_page) + bv.bv_offset)
return false;
diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c
index d7ba0e7fc3b3..eb03adc2d533 100644
--- a/fs/bcachefs/ec.c
+++ b/fs/bcachefs/ec.c
@@ -264,7 +264,7 @@ static void ec_validate_checksums(struct bch_fs *c, struct ec_stripe_buf *buf)
len << 9);
if (memcmp(stripe_csum(v, i, j), &csum, csum_bytes)) {
- __bcache_io_error(c,
+ bch_err_ratelimited(c,
"checksum error while doing reconstruct read (%u:%u)",
i, j);
clear_bit(i, buf->valid);
@@ -305,7 +305,7 @@ static int ec_do_recov(struct bch_fs *c, struct ec_stripe_buf *buf)
unsigned bytes = buf->size << 9;
if (ec_nr_failed(buf) > v->nr_redundant) {
- __bcache_io_error(c,
+ bch_err_ratelimited(c,
"error doing reconstruct read: unable to read enough blocks");
return -1;
}
@@ -326,7 +326,7 @@ static void ec_block_endio(struct bio *bio)
struct bch_dev *ca = ec_bio->ca;
struct closure *cl = bio->bi_private;
- if (bch2_dev_io_err_on(bio->bi_status, ca, "erasure coding %s: %s",
+ if (bch2_dev_io_err_on(bio->bi_status, ca, "erasure coding %s error: %s",
bio_data_dir(bio) ? "write" : "read",
bch2_blk_status_to_str(bio->bi_status)))
clear_bit(ec_bio->idx, ec_bio->buf->valid);
@@ -420,7 +420,7 @@ int bch2_ec_read_extent(struct bch_fs *c, struct bch_read_bio *rbio)
BTREE_ITER_SLOTS);
k = bch2_btree_iter_peek_slot(iter);
if (bkey_err(k) || k.k->type != KEY_TYPE_stripe) {
- __bcache_io_error(c,
+ bch_err_ratelimited(c,
"error doing reconstruct read: stripe not found");
kfree(buf);
return bch2_trans_exit(&trans) ?: -EIO;
@@ -462,7 +462,7 @@ int bch2_ec_read_extent(struct bch_fs *c, struct bch_read_bio *rbio)
struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
if (ptr_stale(ca, ptr)) {
- __bcache_io_error(c,
+ bch_err_ratelimited(c,
"error doing reconstruct read: stale pointer");
clear_bit(i, buf->valid);
continue;
@@ -474,7 +474,7 @@ int bch2_ec_read_extent(struct bch_fs *c, struct bch_read_bio *rbio)
closure_sync(&cl);
if (ec_nr_failed(buf) > v->nr_redundant) {
- __bcache_io_error(c,
+ bch_err_ratelimited(c,
"error doing reconstruct read: unable to read enough blocks");
ret = -EIO;
goto err;
@@ -874,7 +874,7 @@ static void ec_stripe_create(struct ec_stripe_new *s)
for_each_keylist_key(&s->keys, k) {
ret = ec_stripe_update_ptrs(c, &s->stripe, &k->k);
if (ret) {
- bch_err(c, "error creating stripe: error updating pointers");
+ bch_err(c, "error creating stripe: error %i updating pointers", ret);
break;
}
}
@@ -1341,16 +1341,14 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct bch_fs *c,
if (!h)
return NULL;
- if (!h->s && ec_new_stripe_alloc(c, h)) {
- bch2_ec_stripe_head_put(c, h);
- return NULL;
- }
-
- if (!h->s->allocated) {
- if (!h->s->existing_stripe &&
- (idx = get_existing_stripe(c, target, algo, redundancy)) >= 0) {
- //pr_info("got existing stripe %llu", idx);
+ if (!h->s) {
+ if (ec_new_stripe_alloc(c, h)) {
+ bch2_ec_stripe_head_put(c, h);
+ return NULL;
+ }
+ idx = get_existing_stripe(c, target, algo, redundancy);
+ if (idx >= 0) {
h->s->existing_stripe = true;
h->s->existing_stripe_idx = idx;
if (get_stripe_key(c, idx, &h->s->stripe)) {
@@ -1364,7 +1362,9 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct bch_fs *c,
ec_block_io(c, &h->s->stripe, READ, i, &cl);
}
}
+ }
+ if (!h->s->allocated) {
if (!h->s->existing_stripe &&
!h->s->res.sectors) {
ret = bch2_disk_reservation_get(c, &h->s->res,
diff --git a/fs/bcachefs/error.h b/fs/bcachefs/error.h
index 94b53312fbbd..0e49fd728e44 100644
--- a/fs/bcachefs/error.h
+++ b/fs/bcachefs/error.h
@@ -181,12 +181,18 @@ void bch2_io_error(struct bch_dev *);
/* Logs message and handles the error: */
#define bch2_dev_io_error(ca, fmt, ...) \
do { \
- printk_ratelimited(KERN_ERR bch2_fmt((ca)->fs, \
- "IO error on %s for " fmt), \
+ printk_ratelimited(KERN_ERR "bcachefs (%s): " fmt, \
(ca)->name, ##__VA_ARGS__); \
bch2_io_error(ca); \
} while (0)
+#define bch2_dev_inum_io_error(ca, _inum, _offset, fmt, ...) \
+do { \
+ printk_ratelimited(KERN_ERR "bcachefs (%s inum %llu offset %llu): " fmt,\
+ (ca)->name, (_inum), (_offset), ##__VA_ARGS__); \
+ bch2_io_error(ca); \
+} while (0)
+
#define bch2_dev_io_err_on(cond, ca, ...) \
({ \
bool _ret = (cond); \
@@ -196,16 +202,13 @@ do { \
_ret; \
})
-/* kill? */
-
-#define __bcache_io_error(c, fmt, ...) \
- printk_ratelimited(KERN_ERR bch2_fmt(c, \
- "IO error: " fmt), ##__VA_ARGS__)
-
-#define bcache_io_error(c, bio, fmt, ...) \
-do { \
- __bcache_io_error(c, fmt, ##__VA_ARGS__); \
- (bio)->bi_status = BLK_STS_IOERR; \
-} while (0)
+#define bch2_dev_inum_io_err_on(cond, ca, _inum, _offset, ...) \
+({ \
+ bool _ret = (cond); \
+ \
+ if (_ret) \
+ bch2_dev_inum_io_error(ca, _inum, _offset, __VA_ARGS__);\
+ _ret; \
+})
#endif /* _BCACHEFS_ERROR_H */
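
bch2_dev_inum_io_err_on() follows the same shape as bch2_dev_io_err_on(): a GNU C statement expression that logs, now with inode and offset context, and hands the condition back so callers can test it inline. A standalone userspace sketch of the pattern, with fprintf standing in for the ratelimited printk and GCC/Clang extensions assumed:

    #include <stdbool.h>
    #include <stdio.h>

    /* toy version: evaluate the condition once, log with context when it is
     * true, and return the condition so it can be used directly in an if () */
    #define log_io_err_on(cond, inum, offset, fmt, ...)                        \
    ({                                                                         \
        bool _ret = (cond);                                                    \
        if (_ret)                                                              \
            fprintf(stderr, "io error (inum %llu offset %llu): " fmt "\n",     \
                    (unsigned long long)(inum),                                \
                    (unsigned long long)(offset), ##__VA_ARGS__);              \
        _ret;                                                                  \
    })

    int main(void)
    {
        int status = -5;    /* pretend block-layer error */

        if (log_io_err_on(status != 0, 42ULL, 4096ULL, "read error %d", status))
            return 1;
        return 0;
    }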
diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c
index 7262e320ce25..dc16a7731e38 100644
--- a/fs/bcachefs/fs-io.c
+++ b/fs/bcachefs/fs-io.c
@@ -35,6 +35,22 @@
#include <trace/events/bcachefs.h>
#include <trace/events/writeback.h>
+static inline struct address_space *faults_disabled_mapping(void)
+{
+ return (void *) (((unsigned long) current->faults_disabled_mapping) & ~1UL);
+}
+
+static inline void set_fdm_dropped_locks(void)
+{
+ current->faults_disabled_mapping =
+ (void *) (((unsigned long) current->faults_disabled_mapping)|1);
+}
+
+static inline bool fdm_dropped_locks(void)
+{
+ return ((unsigned long) current->faults_disabled_mapping) & 1;
+}
+
struct quota_res {
u64 sectors;
};
@@ -71,6 +87,24 @@ struct dio_read {
struct bch_read_bio rbio;
};
+/* stub version */
+static int add_to_page_cache_lru_vec(struct address_space *mapping,
+ struct page **pages,
+ unsigned nr_pages,
+ pgoff_t offset, gfp_t gfp_mask)
+{
+ int i, err = 0;
+
+ for (i = 0; i < nr_pages; i++) {
+ err = add_to_page_cache_lru(pages[i], mapping,
+ offset + i, gfp_mask);
+ if (err)
+ break;
+ }
+
+ return i ?: err;
+}
+
/* pagecache_block must be held */
static int write_invalidate_inode_pages_range(struct address_space *mapping,
loff_t start, loff_t end)
@@ -265,28 +299,13 @@ static inline struct bch_page_state *bch2_page_state(struct page *page)
/* for newly allocated pages: */
static void __bch2_page_state_release(struct page *page)
{
- struct bch_page_state *s = __bch2_page_state(page);
-
- if (!s)
- return;
-
- ClearPagePrivate(page);
- set_page_private(page, 0);
- put_page(page);
- kfree(s);
+ kfree(detach_page_private(page));
}
static void bch2_page_state_release(struct page *page)
{
- struct bch_page_state *s = bch2_page_state(page);
-
- if (!s)
- return;
-
- ClearPagePrivate(page);
- set_page_private(page, 0);
- put_page(page);
- kfree(s);
+ EBUG_ON(!PageLocked(page));
+ __bch2_page_state_release(page);
}
/* for newly allocated pages: */
@@ -300,13 +319,7 @@ static struct bch_page_state *__bch2_page_state_create(struct page *page,
return NULL;
spin_lock_init(&s->lock);
- /*
- * migrate_page_move_mapping() assumes that pages with private data
- * have their count elevated by 1.
- */
- get_page(page);
- set_page_private(page, (unsigned long) s);
- SetPagePrivate(page);
+ attach_page_private(page, s);
return s;
}
@@ -514,10 +527,35 @@ static void bch2_set_page_dirty(struct bch_fs *c,
vm_fault_t bch2_page_fault(struct vm_fault *vmf)
{
struct file *file = vmf->vma->vm_file;
+ struct address_space *mapping = file->f_mapping;
+ struct address_space *fdm = faults_disabled_mapping();
struct bch_inode_info *inode = file_bch_inode(file);
int ret;
+ if (fdm == mapping)
+ return VM_FAULT_SIGBUS;
+
+ /* Lock ordering: */
+ if (fdm > mapping) {
+ struct bch_inode_info *fdm_host = to_bch_ei(fdm->host);
+
+ if (bch2_pagecache_add_tryget(&inode->ei_pagecache_lock))
+ goto got_lock;
+
+ bch2_pagecache_block_put(&fdm_host->ei_pagecache_lock);
+
+ bch2_pagecache_add_get(&inode->ei_pagecache_lock);
+ bch2_pagecache_add_put(&inode->ei_pagecache_lock);
+
+ bch2_pagecache_block_get(&fdm_host->ei_pagecache_lock);
+
+ /* Signal that lock has been dropped: */
+ set_fdm_dropped_locks();
+ return VM_FAULT_SIGBUS;
+ }
+
bch2_pagecache_add_get(&inode->ei_pagecache_lock);
+got_lock:
ret = filemap_fault(vmf);
bch2_pagecache_add_put(&inode->ei_pagecache_lock);
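
faults_disabled_mapping does double duty: the pointer identifies the mapping the dio write path has locked against faults, and its low bit, always clear in a real pointer, records that the fault handler had to drop and retake those locks. A standalone sketch of the tagging trick; the names mirror the helpers added earlier in this file, but this is a userspace illustration, not kernel code:

    #include <stdint.h>
    #include <stdio.h>

    static void *faults_disabled_mapping;

    /* mask off the tag bit to recover the real pointer */
    static void *fdm_mapping(void)
    {
        return (void *)((uintptr_t)faults_disabled_mapping & ~(uintptr_t)1);
    }

    /* set the low bit to record that the locks were dropped */
    static void set_fdm_dropped_locks(void)
    {
        faults_disabled_mapping =
            (void *)((uintptr_t)faults_disabled_mapping | 1);
    }

    static int fdm_dropped_locks(void)
    {
        return (uintptr_t)faults_disabled_mapping & 1;
    }

    int main(void)
    {
        int mapping;    /* stand-in for a struct address_space */

        faults_disabled_mapping = &mapping;
        set_fdm_dropped_locks();

        printf("mapping matches: %d, dropped: %d\n",
               fdm_mapping() == (void *)&mapping, fdm_dropped_locks());
        return 0;
    }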
@@ -604,18 +642,12 @@ int bch2_migrate_page(struct address_space *mapping, struct page *newpage,
EBUG_ON(!PageLocked(page));
EBUG_ON(!PageLocked(newpage));
- ret = migrate_page_move_mapping(mapping, newpage, page, NULL, mode, 0);
+ ret = migrate_page_move_mapping(mapping, newpage, page, 0);
if (ret != MIGRATEPAGE_SUCCESS)
return ret;
- if (PagePrivate(page)) {
- ClearPagePrivate(page);
- get_page(newpage);
- set_page_private(newpage, page_private(page));
- set_page_private(page, 0);
- put_page(page);
- SetPagePrivate(newpage);
- }
+ if (PagePrivate(page))
+ attach_page_private(newpage, detach_page_private(page));
if (mode != MIGRATE_SYNC_NO_COPY)
migrate_page_copy(newpage, page);
@@ -629,10 +661,10 @@ int bch2_migrate_page(struct address_space *mapping, struct page *newpage,
static void bch2_readpages_end_io(struct bio *bio)
{
+ struct bvec_iter_all iter;
struct bio_vec *bv;
- unsigned i;
- bio_for_each_segment_all(bv, bio, i) {
+ bio_for_each_segment_all(bv, bio, iter) {
struct page *page = bv->bv_page;
if (!bio->bi_status) {
@@ -651,31 +683,29 @@ struct readpages_iter {
struct address_space *mapping;
struct page **pages;
unsigned nr_pages;
- unsigned nr_added;
unsigned idx;
pgoff_t offset;
};
static int readpages_iter_init(struct readpages_iter *iter,
- struct address_space *mapping,
- struct list_head *pages, unsigned nr_pages)
+ struct readahead_control *ractl)
{
+ unsigned i, nr_pages = readahead_count(ractl);
+
memset(iter, 0, sizeof(*iter));
- iter->mapping = mapping;
- iter->offset = list_last_entry(pages, struct page, lru)->index;
+ iter->mapping = ractl->mapping;
+ iter->offset = readahead_index(ractl);
+ iter->nr_pages = nr_pages;
iter->pages = kmalloc_array(nr_pages, sizeof(struct page *), GFP_NOFS);
if (!iter->pages)
return -ENOMEM;
- while (!list_empty(pages)) {
- struct page *page = list_last_entry(pages, struct page, lru);
-
- __bch2_page_state_create(page, __GFP_NOFAIL);
-
- iter->pages[iter->nr_pages++] = page;
- list_del(&page->lru);
+ nr_pages = __readahead_batch(ractl, iter->pages, nr_pages);
+ for (i = 0; i < nr_pages; i++) {
+ __bch2_page_state_create(iter->pages[i], __GFP_NOFAIL);
+ put_page(iter->pages[i]);
}
return 0;
@@ -683,41 +713,9 @@ static int readpages_iter_init(struct readpages_iter *iter,
static inline struct page *readpage_iter_next(struct readpages_iter *iter)
{
- struct page *page;
- unsigned i;
- int ret;
-
- BUG_ON(iter->idx > iter->nr_added);
- BUG_ON(iter->nr_added > iter->nr_pages);
-
- if (iter->idx < iter->nr_added)
- goto out;
-
- while (1) {
- if (iter->idx == iter->nr_pages)
- return NULL;
-
- ret = add_to_page_cache_lru_vec(iter->mapping,
- iter->pages + iter->nr_added,
- iter->nr_pages - iter->nr_added,
- iter->offset + iter->nr_added,
- GFP_NOFS);
- if (ret > 0)
- break;
-
- page = iter->pages[iter->nr_added];
- iter->idx++;
- iter->nr_added++;
-
- __bch2_page_state_release(page);
- put_page(page);
- }
-
- iter->nr_added += ret;
+ if (iter->idx >= iter->nr_pages)
+ return NULL;
- for (i = iter->idx; i < iter->nr_added; i++)
- put_page(iter->pages[i]);
-out:
EBUG_ON(iter->pages[iter->idx]->index != iter->offset + iter->idx);
return iter->pages[iter->idx];
@@ -778,11 +776,8 @@ static void readpage_bio_extend(struct readpages_iter *iter,
if (!get_more)
break;
- rcu_read_lock();
- page = radix_tree_lookup(&iter->mapping->i_pages, page_offset);
- rcu_read_unlock();
-
- if (page && !radix_tree_exceptional_entry(page))
+ page = xa_load(&iter->mapping->i_pages, page_offset);
+ if (page && !xa_is_value(page))
break;
page = __page_cache_alloc(readahead_gfp_mask(iter->mapping));
@@ -879,17 +874,18 @@ retry:
goto retry;
if (ret) {
- bcache_io_error(c, &rbio->bio, "btree IO error %i", ret);
+ bch_err_inum_ratelimited(c, inum,
+ "read error %i from btree lookup", ret);
+ rbio->bio.bi_status = BLK_STS_IOERR;
bio_endio(&rbio->bio);
}
bkey_on_stack_exit(&sk, c);
}
-int bch2_readpages(struct file *file, struct address_space *mapping,
- struct list_head *pages, unsigned nr_pages)
+void bch2_readahead(struct readahead_control *ractl)
{
- struct bch_inode_info *inode = to_bch_ei(mapping->host);
+ struct bch_inode_info *inode = to_bch_ei(ractl->mapping->host);
struct bch_fs *c = inode->v.i_sb->s_fs_info;
struct bch_io_opts opts = io_opts(c, &inode->ei_inode);
struct btree_trans trans;
@@ -898,7 +894,7 @@ int bch2_readpages(struct file *file, struct address_space *mapping,
struct readpages_iter readpages_iter;
int ret;
- ret = readpages_iter_init(&readpages_iter, mapping, pages, nr_pages);
+ ret = readpages_iter_init(&readpages_iter, ractl);
BUG_ON(ret);
bch2_trans_init(&trans, c, 0, 0);
@@ -933,8 +929,6 @@ int bch2_readpages(struct file *file, struct address_space *mapping,
bch2_trans_exit(&trans);
kfree(readpages_iter.pages);
-
- return 0;
}
static void __bchfs_readpage(struct bch_fs *c, struct bch_read_bio *rbio,
@@ -1034,32 +1028,35 @@ static void bch2_writepage_io_done(struct closure *cl)
struct bch_writepage_io, cl);
struct bch_fs *c = io->op.c;
struct bio *bio = &io->op.wbio.bio;
+ struct bvec_iter_all iter;
struct bio_vec *bvec;
- unsigned i, j;
+ unsigned i;
if (io->op.error) {
- bio_for_each_segment_all(bvec, bio, i) {
+ set_bit(EI_INODE_ERROR, &io->inode->ei_flags);
+
+ bio_for_each_segment_all(bvec, bio, iter) {
struct bch_page_state *s;
SetPageError(bvec->bv_page);
- mapping_set_error(io->inode->v.i_mapping, -EIO);
+ mapping_set_error(bvec->bv_page->mapping, -EIO);
s = __bch2_page_state(bvec->bv_page);
spin_lock(&s->lock);
- for (j = 0; j < PAGE_SECTORS; j++)
- s->s[j].nr_replicas = 0;
+ for (i = 0; i < PAGE_SECTORS; i++)
+ s->s[i].nr_replicas = 0;
spin_unlock(&s->lock);
}
}
if (io->op.flags & BCH_WRITE_WROTE_DATA_INLINE) {
- bio_for_each_segment_all(bvec, bio, i) {
+ bio_for_each_segment_all(bvec, bio, iter) {
struct bch_page_state *s;
s = __bch2_page_state(bvec->bv_page);
spin_lock(&s->lock);
- for (j = 0; j < PAGE_SECTORS; j++)
- s->s[j].nr_replicas = 0;
+ for (i = 0; i < PAGE_SECTORS; i++)
+ s->s[i].nr_replicas = 0;
spin_unlock(&s->lock);
}
}
@@ -1083,7 +1080,7 @@ static void bch2_writepage_io_done(struct closure *cl)
*/
i_sectors_acct(c, io->inode, NULL, io->op.i_sectors_delta);
- bio_for_each_segment_all(bvec, bio, i) {
+ bio_for_each_segment_all(bvec, bio, iter) {
struct bch_page_state *s = __bch2_page_state(bvec->bv_page);
if (atomic_dec_and_test(&s->write_count))
@@ -1237,7 +1234,7 @@ do_io:
if (w->io &&
(w->io->op.res.nr_replicas != nr_replicas_this_write ||
- bio_full(&w->io->op.wbio.bio) ||
+ bio_full(&w->io->op.wbio.bio, PAGE_SIZE) ||
w->io->op.wbio.bio.bi_iter.bi_size + (sectors << 9) >=
(BIO_MAX_PAGES * PAGE_SIZE) ||
bio_end_sector(&w->io->op.wbio.bio) != sector))
@@ -1806,29 +1803,53 @@ static long bch2_dio_write_loop(struct dio_write *dio)
struct bch_inode_info *inode = file_bch_inode(req->ki_filp);
struct bch_fs *c = inode->v.i_sb->s_fs_info;
struct bio *bio = &dio->op.wbio.bio;
+ struct bvec_iter_all iter;
struct bio_vec *bv;
- unsigned i, unaligned;
- bool sync = dio->sync;
+ unsigned unaligned, iter_count;
+ bool sync = dio->sync, dropped_locks;
long ret;
if (dio->loop)
goto loop;
while (1) {
+ iter_count = dio->iter.count;
+
if (kthread)
- use_mm(dio->mm);
+ kthread_use_mm(dio->mm);
BUG_ON(current->faults_disabled_mapping);
current->faults_disabled_mapping = mapping;
ret = bio_iov_iter_get_pages(bio, &dio->iter);
+ dropped_locks = fdm_dropped_locks();
+
current->faults_disabled_mapping = NULL;
if (kthread)
- unuse_mm(dio->mm);
+ kthread_unuse_mm(dio->mm);
+
+ /*
+ * If the fault handler returned an error but also signalled
+ * that it dropped & retook ei_pagecache_lock, we just need to
+ * re-shoot down the page cache and retry:
+ */
+ if (dropped_locks && ret)
+ ret = 0;
if (unlikely(ret < 0))
goto err;
+ if (unlikely(dropped_locks)) {
+ ret = write_invalidate_inode_pages_range(mapping,
+ req->ki_pos,
+ req->ki_pos + iter_count - 1);
+ if (unlikely(ret))
+ goto err;
+
+ if (!bio->bi_iter.bi_size)
+ continue;
+ }
+
unaligned = bio->bi_iter.bi_size & (block_bytes(c) - 1);
bio->bi_iter.bi_size -= unaligned;
iov_iter_revert(&dio->iter, unaligned);
@@ -1838,7 +1859,7 @@ static long bch2_dio_write_loop(struct dio_write *dio)
* bio_iov_iter_get_pages was only able to get <
* blocksize worth of pages:
*/
- bio_for_each_segment_all(bv, bio, i)
+ bio_for_each_segment_all(bv, bio, iter)
put_page(bv->bv_page);
ret = -EFAULT;
goto err;
@@ -1901,9 +1922,15 @@ loop:
i_size_write(&inode->v, req->ki_pos);
spin_unlock(&inode->v.i_lock);
- bio_for_each_segment_all(bv, bio, i)
+ bio_for_each_segment_all(bv, bio, iter)
put_page(bv->bv_page);
- if (!dio->iter.count || dio->op.error)
+
+ if (dio->op.error) {
+ set_bit(EI_INODE_ERROR, &inode->ei_flags);
+ break;
+ }
+
+ if (!dio->iter.count)
break;
bio_reset(bio);
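
Putting the pieces together: when the fault handler signals dropped locks, the dio write loop treats the failed page grab as non-fatal, shoots down the written range of the page cache again, and retries. A toy standalone model of just that control flow, with stubs rather than the real bio/iov machinery:

    #include <stdbool.h>
    #include <stdio.h>

    static int attempts;

    /* pretend a page fault raced with us on the first attempt */
    static int get_pages(bool *dropped_locks)
    {
        attempts++;
        *dropped_locks = (attempts == 1);
        return *dropped_locks ? -11 : 0;    /* -11 stands in for a fault error */
    }

    static int invalidate_range(void) { return 0; }

    int main(void)
    {
        while (1) {
            bool dropped_locks;
            int ret = get_pages(&dropped_locks);

            if (dropped_locks && ret)
                ret = 0;        /* the error was only due to the dropped lock */
            if (ret < 0)
                return 1;

            if (dropped_locks) {
                if (invalidate_range())
                    return 1;
                continue;       /* re-shoot down the page cache and retry */
            }
            break;              /* got the pages, carry on with the write */
        }
        printf("succeeded after %d attempts\n", attempts);
        return 0;
    }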
@@ -2291,7 +2318,8 @@ int bch2_truncate(struct bch_inode_info *inode, struct iattr *iattr)
if (ret)
goto err;
- BUG_ON(inode->v.i_size < inode_u.bi_size);
+ WARN_ON(!test_bit(EI_INODE_ERROR, &inode->ei_flags) &&
+ inode->v.i_size < inode_u.bi_size);
if (iattr->ia_size > inode->v.i_size) {
ret = bch2_extend(inode, &inode_u, iattr);
@@ -2475,10 +2503,7 @@ static long bchfs_fcollapse_finsert(struct bch_inode_info *inode,
src = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS,
POS(inode->v.i_ino, src_start >> 9),
BTREE_ITER_INTENT);
- BUG_ON(IS_ERR_OR_NULL(src));
-
dst = bch2_trans_copy_iter(&trans, src);
- BUG_ON(IS_ERR_OR_NULL(dst));
while (1) {
struct disk_reservation disk_res =
@@ -2818,235 +2843,6 @@ static void mark_range_unallocated(struct bch_inode_info *inode,
} while (index <= end_index);
}
-static int generic_access_check_limits(struct file *file, loff_t pos,
- loff_t *count)
-{
- struct inode *inode = file->f_mapping->host;
- loff_t max_size = inode->i_sb->s_maxbytes;
-
- if (!(file->f_flags & O_LARGEFILE))
- max_size = MAX_NON_LFS;
-
- if (unlikely(pos >= max_size))
- return -EFBIG;
- *count = min(*count, max_size - pos);
- return 0;
-}
-
-static int generic_write_check_limits(struct file *file, loff_t pos,
- loff_t *count)
-{
- loff_t limit = rlimit(RLIMIT_FSIZE);
-
- if (limit != RLIM_INFINITY) {
- if (pos >= limit) {
- send_sig(SIGXFSZ, current, 0);
- return -EFBIG;
- }
- *count = min(*count, limit - pos);
- }
-
- return generic_access_check_limits(file, pos, count);
-}
-
-static int generic_remap_checks(struct file *file_in, loff_t pos_in,
- struct file *file_out, loff_t pos_out,
- loff_t *req_count, unsigned int remap_flags)
-{
- struct inode *inode_in = file_in->f_mapping->host;
- struct inode *inode_out = file_out->f_mapping->host;
- uint64_t count = *req_count;
- uint64_t bcount;
- loff_t size_in, size_out;
- loff_t bs = inode_out->i_sb->s_blocksize;
- int ret;
-
- /* The start of both ranges must be aligned to an fs block. */
- if (!IS_ALIGNED(pos_in, bs) || !IS_ALIGNED(pos_out, bs))
- return -EINVAL;
-
- /* Ensure offsets don't wrap. */
- if (pos_in + count < pos_in || pos_out + count < pos_out)
- return -EINVAL;
-
- size_in = i_size_read(inode_in);
- size_out = i_size_read(inode_out);
-
- /* Dedupe requires both ranges to be within EOF. */
- if ((remap_flags & REMAP_FILE_DEDUP) &&
- (pos_in >= size_in || pos_in + count > size_in ||
- pos_out >= size_out || pos_out + count > size_out))
- return -EINVAL;
-
- /* Ensure the infile range is within the infile. */
- if (pos_in >= size_in)
- return -EINVAL;
- count = min(count, size_in - (uint64_t)pos_in);
-
- ret = generic_access_check_limits(file_in, pos_in, &count);
- if (ret)
- return ret;
-
- ret = generic_write_check_limits(file_out, pos_out, &count);
- if (ret)
- return ret;
-
- /*
- * If the user wanted us to link to the infile's EOF, round up to the
- * next block boundary for this check.
- *
- * Otherwise, make sure the count is also block-aligned, having
- * already confirmed the starting offsets' block alignment.
- */
- if (pos_in + count == size_in) {
- bcount = ALIGN(size_in, bs) - pos_in;
- } else {
- if (!IS_ALIGNED(count, bs))
- count = ALIGN_DOWN(count, bs);
- bcount = count;
- }
-
- /* Don't allow overlapped cloning within the same file. */
- if (inode_in == inode_out &&
- pos_out + bcount > pos_in &&
- pos_out < pos_in + bcount)
- return -EINVAL;
-
- /*
- * We shortened the request but the caller can't deal with that, so
- * bounce the request back to userspace.
- */
- if (*req_count != count && !(remap_flags & REMAP_FILE_CAN_SHORTEN))
- return -EINVAL;
-
- *req_count = count;
- return 0;
-}
-
-static int generic_remap_check_len(struct inode *inode_in,
- struct inode *inode_out,
- loff_t pos_out,
- loff_t *len,
- unsigned int remap_flags)
-{
- u64 blkmask = i_blocksize(inode_in) - 1;
- loff_t new_len = *len;
-
- if ((*len & blkmask) == 0)
- return 0;
-
- if ((remap_flags & REMAP_FILE_DEDUP) ||
- pos_out + *len < i_size_read(inode_out))
- new_len &= ~blkmask;
-
- if (new_len == *len)
- return 0;
-
- if (remap_flags & REMAP_FILE_CAN_SHORTEN) {
- *len = new_len;
- return 0;
- }
-
- return (remap_flags & REMAP_FILE_DEDUP) ? -EBADE : -EINVAL;
-}
-
-static int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in,
- struct file *file_out, loff_t pos_out,
- loff_t *len, unsigned int remap_flags)
-{
- struct inode *inode_in = file_inode(file_in);
- struct inode *inode_out = file_inode(file_out);
- bool same_inode = (inode_in == inode_out);
- int ret;
-
- /* Don't touch certain kinds of inodes */
- if (IS_IMMUTABLE(inode_out))
- return -EPERM;
-
- if (IS_SWAPFILE(inode_in) || IS_SWAPFILE(inode_out))
- return -ETXTBSY;
-
- /* Don't reflink dirs, pipes, sockets... */
- if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode))
- return -EISDIR;
- if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode))
- return -EINVAL;
-
- /* Zero length dedupe exits immediately; reflink goes to EOF. */
- if (*len == 0) {
- loff_t isize = i_size_read(inode_in);
-
- if ((remap_flags & REMAP_FILE_DEDUP) || pos_in == isize)
- return 0;
- if (pos_in > isize)
- return -EINVAL;
- *len = isize - pos_in;
- if (*len == 0)
- return 0;
- }
-
- /* Check that we don't violate system file offset limits. */
- ret = generic_remap_checks(file_in, pos_in, file_out, pos_out, len,
- remap_flags);
- if (ret)
- return ret;
-
- /* Wait for the completion of any pending IOs on both files */
- inode_dio_wait(inode_in);
- if (!same_inode)
- inode_dio_wait(inode_out);
-
- ret = filemap_write_and_wait_range(inode_in->i_mapping,
- pos_in, pos_in + *len - 1);
- if (ret)
- return ret;
-
- ret = filemap_write_and_wait_range(inode_out->i_mapping,
- pos_out, pos_out + *len - 1);
- if (ret)
- return ret;
-
- /*
- * Check that the extents are the same.
- */
- if (remap_flags & REMAP_FILE_DEDUP) {
- bool is_same = false;
-
- ret = vfs_dedupe_file_range_compare(inode_in, pos_in,
- inode_out, pos_out, *len, &is_same);
- if (ret)
- return ret;
- if (!is_same)
- return -EBADE;
- }
-
- ret = generic_remap_check_len(inode_in, inode_out, pos_out, len,
- remap_flags);
- if (ret)
- return ret;
-
- /* If can't alter the file contents, we're done. */
- if (!(remap_flags & REMAP_FILE_DEDUP)) {
- /* Update the timestamps, since we can alter file contents. */
- if (!(file_out->f_mode & FMODE_NOCMTIME)) {
- ret = file_update_time(file_out);
- if (ret)
- return ret;
- }
-
- /*
- * Clear the security bits if the process is not being run by
- * root. This keeps people from modifying setuid and setgid
- * binaries.
- */
- ret = file_remove_privs(file_out);
- if (ret)
- return ret;
- }
-
- return 0;
-}
-
loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src,
struct file *file_dst, loff_t pos_dst,
loff_t len, unsigned remap_flags)
@@ -3241,8 +3037,8 @@ static loff_t page_hole_offset(struct address_space *mapping, loff_t offset)
int pg_offset;
loff_t ret = -1;
- page = find_lock_entry(mapping, index);
- if (!page || radix_tree_exception(page))
+ page = find_lock_page(mapping, index);
+ if (!page)
return offset;
pg_offset = __page_hole_offset(page, offset & (PAGE_SIZE - 1));
diff --git a/fs/bcachefs/fs-io.h b/fs/bcachefs/fs-io.h
index 1b593ea707d5..2537a3d25ede 100644
--- a/fs/bcachefs/fs-io.h
+++ b/fs/bcachefs/fs-io.h
@@ -19,8 +19,7 @@ int bch2_writepage(struct page *, struct writeback_control *);
int bch2_readpage(struct file *, struct page *);
int bch2_writepages(struct address_space *, struct writeback_control *);
-int bch2_readpages(struct file *, struct address_space *,
- struct list_head *, unsigned);
+void bch2_readahead(struct readahead_control *);
int bch2_write_begin(struct file *, struct address_space *, loff_t,
unsigned, unsigned, struct page **, void **);
@@ -35,10 +34,6 @@ int bch2_fsync(struct file *, loff_t, loff_t, int);
int bch2_truncate(struct bch_inode_info *, struct iattr *);
long bch2_fallocate_dispatch(struct file *, int, loff_t, loff_t);
-#define REMAP_FILE_ADVISORY (0)
-#define REMAP_FILE_DEDUP (1 << 0)
-#define REMAP_FILE_CAN_SHORTEN (1 << 1)
-
loff_t bch2_remap_file_range(struct file *, loff_t, struct file *,
loff_t, loff_t, unsigned);
diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c
index c04d90b17622..e3edca4d265b 100644
--- a/fs/bcachefs/fs.c
+++ b/fs/bcachefs/fs.c
@@ -91,6 +91,11 @@ void bch2_pagecache_add_put(struct pagecache_lock *lock)
__pagecache_lock_put(lock, 1);
}
+bool bch2_pagecache_add_tryget(struct pagecache_lock *lock)
+{
+ return __pagecache_lock_tryget(lock, 1);
+}
+
void bch2_pagecache_add_get(struct pagecache_lock *lock)
{
__pagecache_lock_get(lock, 1);
@@ -271,7 +276,8 @@ __bch2_create(struct bch_inode_info *dir, struct dentry *dentry,
if (!tmpfile)
mutex_lock(&dir->ei_update_lock);
- bch2_trans_init(&trans, c, 8, 1024);
+ bch2_trans_init(&trans, c, 8,
+ 2048 + (!tmpfile ? dentry->d_name.len : 0));
retry:
bch2_trans_begin(&trans);
@@ -886,6 +892,10 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info,
bool have_extent = false;
int ret = 0;
+ ret = fiemap_prep(&ei->v, info, start, &len, FIEMAP_FLAG_SYNC);
+ if (ret)
+ return ret;
+
if (start + len < start)
return -EINVAL;
@@ -989,15 +999,6 @@ static int bch2_vfs_readdir(struct file *file, struct dir_context *ctx)
return bch2_readdir(c, inode->v.i_ino, ctx);
}
-static int bch2_clone_file_range(struct file *file_src, loff_t pos_src,
- struct file *file_dst, loff_t pos_dst,
- u64 len)
-{
- return bch2_remap_file_range(file_src, pos_src,
- file_dst, pos_dst,
- len, 0);
-}
-
static const struct file_operations bch_file_operations = {
.llseek = bch2_llseek,
.read_iter = bch2_read_iter,
@@ -1015,7 +1016,7 @@ static const struct file_operations bch_file_operations = {
#ifdef CONFIG_COMPAT
.compat_ioctl = bch2_compat_fs_ioctl,
#endif
- .clone_file_range = bch2_clone_file_range,
+ .remap_file_range = bch2_remap_file_range,
};
static const struct inode_operations bch_file_inode_operations = {
@@ -1085,7 +1086,7 @@ static const struct address_space_operations bch_address_space_operations = {
.writepage = bch2_writepage,
.readpage = bch2_readpage,
.writepages = bch2_writepages,
- .readpages = bch2_readpages,
+ .readahead = bch2_readahead,
.set_page_dirty = __set_page_dirty_nobuffers,
.write_begin = bch2_write_begin,
.write_end = bch2_write_end,
@@ -1150,6 +1151,7 @@ static void bch2_vfs_inode_init(struct bch_fs *c,
inode->v.i_generation = bi->bi_generation;
inode->v.i_size = bi->bi_size;
+ inode->ei_flags = 0;
inode->ei_journal_seq = 0;
inode->ei_quota_reserved = 0;
inode->ei_str_hash = bch2_hash_info_init(c, bi);
@@ -1251,7 +1253,7 @@ static void bch2_evict_inode(struct inode *vinode)
KEY_TYPE_QUOTA_WARN);
bch2_quota_acct(c, inode->ei_qid, Q_INO, -1,
KEY_TYPE_QUOTA_WARN);
- bch2_inode_rm(c, inode->v.i_ino);
+ bch2_inode_rm(c, inode->v.i_ino, true);
}
}
@@ -1570,9 +1572,7 @@ got_sb:
if (ret)
goto err_put_super;
- sb->s_bdi->congested_fn = bch2_congested;
- sb->s_bdi->congested_data = c;
- sb->s_bdi->ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_SIZE;
+ sb->s_bdi->ra_pages = VM_READAHEAD_PAGES;
for_each_online_member(ca, c, i) {
struct block_device *bdev = ca->disk_sb.bdev;
diff --git a/fs/bcachefs/fs.h b/fs/bcachefs/fs.h
index eda903a45325..3df85ffb450c 100644
--- a/fs/bcachefs/fs.h
+++ b/fs/bcachefs/fs.h
@@ -26,12 +26,14 @@ static inline void pagecache_lock_init(struct pagecache_lock *lock)
}
void bch2_pagecache_add_put(struct pagecache_lock *);
+bool bch2_pagecache_add_tryget(struct pagecache_lock *);
void bch2_pagecache_add_get(struct pagecache_lock *);
void bch2_pagecache_block_put(struct pagecache_lock *);
void bch2_pagecache_block_get(struct pagecache_lock *);
struct bch_inode_info {
struct inode v;
+ unsigned long ei_flags;
struct mutex ei_update_lock;
u64 ei_journal_seq;
@@ -49,6 +51,12 @@ struct bch_inode_info {
struct bch_inode_unpacked ei_inode;
};
+/*
+ * Set if we've gotten a btree error for this inode, and thus the vfs inode and
+ * btree inode may be inconsistent:
+ */
+#define EI_INODE_ERROR 0
+
#define to_bch_ei(_inode) \
container_of_or_null(_inode, struct bch_inode_info, v)
diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c
index 0c5035270846..39f872de0c18 100644
--- a/fs/bcachefs/fsck.c
+++ b/fs/bcachefs/fsck.c
@@ -230,7 +230,6 @@ static int hash_check_duplicates(struct btree_trans *trans,
return 0;
iter = bch2_trans_copy_iter(trans, h->chain);
- BUG_ON(IS_ERR(iter));
for_each_btree_key_continue(iter, 0, k2, ret) {
if (bkey_cmp(k2.k->p, k.k->p) >= 0)
@@ -265,10 +264,8 @@ static void hash_set_chain_start(struct btree_trans *trans,
hash_stop_chain(trans, h);
if (!hole) {
- if (!h->chain) {
+ if (!h->chain)
h->chain = bch2_trans_copy_iter(trans, k_iter);
- BUG_ON(IS_ERR(h->chain));
- }
h->chain_end = k.k->p.offset;
}
@@ -440,9 +437,6 @@ static int bch2_fix_overlapping_extent(struct btree_trans *trans,
bch2_cut_front(cut_at, u);
u_iter = bch2_trans_copy_iter(trans, iter);
- ret = PTR_ERR_OR_ZERO(u_iter);
- if (ret)
- return ret;
/*
* We don't want to go through the
@@ -485,7 +479,11 @@ static int check_extents(struct bch_fs *c)
BTREE_ITER_INTENT);
retry:
for_each_btree_key_continue(iter, 0, k, ret) {
- if (bkey_cmp(prev.k->k.p, bkey_start_pos(k.k)) > 0) {
+ /*
+ * due to retry errors we might see the same extent twice:
+ */
+ if (bkey_cmp(prev.k->k.p, k.k->p) &&
+ bkey_cmp(prev.k->k.p, bkey_start_pos(k.k)) > 0) {
char buf1[200];
char buf2[200];
@@ -1254,7 +1252,7 @@ static int check_inode(struct btree_trans *trans,
bch2_fs_lazy_rw(c);
- ret = bch2_inode_rm(c, u.bi_inum);
+ ret = bch2_inode_rm(c, u.bi_inum, false);
if (ret)
bch_err(c, "error in fsck: error %i while deleting inode", ret);
return ret;
diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c
index 42371de7f72a..bf1c7319669c 100644
--- a/fs/bcachefs/inode.c
+++ b/fs/bcachefs/inode.c
@@ -302,9 +302,6 @@ struct btree_iter *bch2_inode_peek(struct btree_trans *trans,
iter = bch2_trans_get_iter(trans, BTREE_ID_INODES, POS(0, inum),
BTREE_ITER_CACHED|flags);
- if (IS_ERR(iter))
- return iter;
-
k = bch2_btree_iter_peek_cached(iter);
ret = bkey_err(k);
if (ret)
@@ -537,10 +534,12 @@ found_slot:
inode_u->bi_inum = k.k->p.offset;
inode_u->bi_generation = bkey_generation(k);
- return bch2_inode_write(trans, iter, inode_u);
+ ret = bch2_inode_write(trans, iter, inode_u);
+ bch2_trans_iter_put(trans, iter);
+ return ret;
}
-int bch2_inode_rm(struct bch_fs *c, u64 inode_nr)
+int bch2_inode_rm(struct bch_fs *c, u64 inode_nr, bool cached)
{
struct btree_trans trans;
struct btree_iter *iter;
@@ -551,6 +550,8 @@ int bch2_inode_rm(struct bch_fs *c, u64 inode_nr)
u64 bi_generation;
int ret;
+ bch2_trans_init(&trans, c, 0, 0);
+
/*
* If this was a directory, there shouldn't be any real dirents left -
* but there could be whiteouts (from hash collisions) that we should
@@ -559,37 +560,34 @@ int bch2_inode_rm(struct bch_fs *c, u64 inode_nr)
 * XXX: the dirent code could ideally delete whiteouts when they're no
* longer needed
*/
- ret = bch2_btree_delete_range(c, BTREE_ID_EXTENTS,
- start, end, NULL) ?:
- bch2_btree_delete_range(c, BTREE_ID_XATTRS,
- start, end, NULL) ?:
- bch2_btree_delete_range(c, BTREE_ID_DIRENTS,
- start, end, NULL);
+ ret = bch2_btree_delete_range_trans(&trans, BTREE_ID_EXTENTS,
+ start, end, NULL) ?:
+ bch2_btree_delete_range_trans(&trans, BTREE_ID_XATTRS,
+ start, end, NULL) ?:
+ bch2_btree_delete_range_trans(&trans, BTREE_ID_DIRENTS,
+ start, end, NULL);
if (ret)
- return ret;
-
- bch2_trans_init(&trans, c, 0, 0);
+ goto err;
retry:
bch2_trans_begin(&trans);
bi_generation = 0;
- ret = bch2_btree_key_cache_flush(&trans, BTREE_ID_INODES, POS(0, inode_nr));
- if (ret) {
- if (ret != -EINTR)
- bch_err(c, "error flushing btree key cache: %i", ret);
- goto err;
+ if (cached) {
+ iter = bch2_trans_get_iter(&trans, BTREE_ID_INODES, POS(0, inode_nr),
+ BTREE_ITER_CACHED|BTREE_ITER_INTENT);
+ k = bch2_btree_iter_peek_cached(iter);
+ } else {
+ iter = bch2_trans_get_iter(&trans, BTREE_ID_INODES, POS(0, inode_nr),
+ BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
+ k = bch2_btree_iter_peek_slot(iter);
}
- iter = bch2_trans_get_iter(&trans, BTREE_ID_INODES, POS(0, inode_nr),
- BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
- k = bch2_btree_iter_peek_slot(iter);
-
ret = bkey_err(k);
if (ret)
goto err;
- bch2_fs_inconsistent_on(k.k->type != KEY_TYPE_inode, c,
+ bch2_fs_inconsistent_on(k.k->type != KEY_TYPE_inode, trans.c,
"inode %llu not found when deleting",
inode_nr);
@@ -639,9 +637,6 @@ int bch2_inode_find_by_inum_trans(struct btree_trans *trans, u64 inode_nr,
iter = bch2_trans_get_iter(trans, BTREE_ID_INODES,
POS(0, inode_nr), BTREE_ITER_CACHED);
- if (IS_ERR(iter))
- return PTR_ERR(iter);
-
k = bch2_btree_iter_peek_cached(iter);
ret = bkey_err(k);
if (ret)
diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h
index ef7e885dce0c..dbdfcf63d079 100644
--- a/fs/bcachefs/inode.h
+++ b/fs/bcachefs/inode.h
@@ -71,7 +71,7 @@ void bch2_inode_init(struct bch_fs *, struct bch_inode_unpacked *,
int bch2_inode_create(struct btree_trans *, struct bch_inode_unpacked *);
-int bch2_inode_rm(struct bch_fs *, u64);
+int bch2_inode_rm(struct bch_fs *, u64, bool);
int bch2_inode_find_by_inum_trans(struct btree_trans *, u64,
struct bch_inode_unpacked *);
diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c
index 5251e1983c72..15b58a33c8ff 100644
--- a/fs/bcachefs/io.c
+++ b/fs/bcachefs/io.c
@@ -135,10 +135,10 @@ void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw)
void bch2_bio_free_pages_pool(struct bch_fs *c, struct bio *bio)
{
+ struct bvec_iter_all iter;
struct bio_vec *bv;
- unsigned i;
- bio_for_each_segment_all(bv, bio, i)
+ bio_for_each_segment_all(bv, bio, iter)
if (bv->bv_page != ZERO_PAGE(0))
mempool_free(bv->bv_page, &c->bio_bounce_pages);
bio->bi_vcnt = 0;
@@ -186,36 +186,33 @@ void bch2_bio_alloc_pages_pool(struct bch_fs *c, struct bio *bio,
static int sum_sector_overwrites(struct btree_trans *trans,
struct btree_iter *extent_iter,
struct bkey_i *new,
- bool may_allocate,
bool *maybe_extending,
- s64 *delta)
+ s64 *i_sectors_delta,
+ s64 *disk_sectors_delta)
{
struct btree_iter *iter;
struct bkey_s_c old;
int ret = 0;
- *maybe_extending = true;
- *delta = 0;
+ *maybe_extending = true;
+ *i_sectors_delta = 0;
+ *disk_sectors_delta = 0;
iter = bch2_trans_copy_iter(trans, extent_iter);
- if (IS_ERR(iter))
- return PTR_ERR(iter);
for_each_btree_key_continue(iter, BTREE_ITER_SLOTS, old, ret) {
- if (!may_allocate &&
- bch2_bkey_nr_ptrs_fully_allocated(old) <
- bch2_bkey_nr_ptrs_allocated(bkey_i_to_s_c(new))) {
- ret = -ENOSPC;
- break;
- }
+ s64 sectors = min(new->k.p.offset, old.k->p.offset) -
+ max(bkey_start_offset(&new->k),
+ bkey_start_offset(old.k));
- *delta += (min(new->k.p.offset,
- old.k->p.offset) -
- max(bkey_start_offset(&new->k),
- bkey_start_offset(old.k))) *
+ *i_sectors_delta += sectors *
(bkey_extent_is_allocation(&new->k) -
bkey_extent_is_allocation(old.k));
+ *disk_sectors_delta += sectors *
+ (int) (bch2_bkey_nr_ptrs_allocated(bkey_i_to_s_c(new)) -
+ bch2_bkey_nr_ptrs_fully_allocated(old));
+
if (bkey_cmp(old.k->p, new->k.p) >= 0) {
/*
* Check if there's already data above where we're
@@ -249,12 +246,12 @@ int bch2_extent_update(struct btree_trans *trans,
struct disk_reservation *disk_res,
u64 *journal_seq,
u64 new_i_size,
- s64 *i_sectors_delta)
+ s64 *i_sectors_delta_total)
{
/* this must live until after bch2_trans_commit(): */
struct bkey_inode_buf inode_p;
bool extending = false;
- s64 delta = 0;
+ s64 i_sectors_delta = 0, disk_sectors_delta = 0;
int ret;
ret = bch2_extent_trim_atomic(k, iter);
@@ -262,16 +259,30 @@ int bch2_extent_update(struct btree_trans *trans,
return ret;
ret = sum_sector_overwrites(trans, iter, k,
- disk_res && disk_res->sectors != 0,
- &extending, &delta);
+ &extending,
+ &i_sectors_delta,
+ &disk_sectors_delta);
if (ret)
return ret;
+ if (disk_res &&
+ disk_sectors_delta > (s64) disk_res->sectors) {
+ pr_info("disk_sectors_delta %lli disk_res %llu",
+ disk_sectors_delta,
+ disk_res->sectors);
+
+ ret = bch2_disk_reservation_add(trans->c, disk_res,
+ disk_sectors_delta - disk_res->sectors,
+ 0);
+ if (ret)
+ return ret;
+ }
+
new_i_size = extending
? min(k->k.p.offset << 9, new_i_size)
: 0;
- if (delta || new_i_size) {
+ if (i_sectors_delta || new_i_size) {
struct btree_iter *inode_iter;
struct bch_inode_unpacked inode_u;
@@ -298,9 +309,9 @@ int bch2_extent_update(struct btree_trans *trans,
else
new_i_size = 0;
- inode_u.bi_sectors += delta;
+ inode_u.bi_sectors += i_sectors_delta;
- if (delta || new_i_size) {
+ if (i_sectors_delta || new_i_size) {
bch2_inode_pack(trans->c, &inode_p, &inode_u);
bch2_trans_update(trans, inode_iter,
&inode_p.inode.k_i, 0);
@@ -315,10 +326,12 @@ int bch2_extent_update(struct btree_trans *trans,
BTREE_INSERT_NOCHECK_RW|
BTREE_INSERT_NOFAIL|
BTREE_INSERT_USE_RESERVE);
- if (!ret && i_sectors_delta)
- *i_sectors_delta += delta;
+ if (ret)
+ return ret;
- return ret;
+ if (i_sectors_delta_total)
+ *i_sectors_delta_total += i_sectors_delta;
+ return 0;
}
int bch2_fpunch_at(struct btree_trans *trans, struct btree_iter *iter,
@@ -578,7 +591,8 @@ static void __bch2_write_index(struct bch_write_op *op)
op->written += sectors_start - keylist_sectors(keys);
if (ret) {
- __bcache_io_error(c, "btree IO error %i", ret);
+ bch_err_inum_ratelimited(c, op->pos.inode,
+ "write error %i from btree update", ret);
op->error = ret;
}
}
@@ -623,7 +637,10 @@ static void bch2_write_endio(struct bio *bio)
struct bch_fs *c = wbio->c;
struct bch_dev *ca = bch_dev_bkey_exists(c, wbio->dev);
- if (bch2_dev_io_err_on(bio->bi_status, ca, "data write: %s",
+ if (bch2_dev_inum_io_err_on(bio->bi_status, ca,
+ op->pos.inode,
+ op->pos.offset - bio_sectors(bio), /* XXX definitely wrong */
+ "data write error: %s",
bch2_blk_status_to_str(bio->bi_status)))
set_bit(wbio->dev, op->failed.d);
@@ -1281,15 +1298,14 @@ void bch2_write(struct closure *cl)
wbio_init(bio)->put_bio = false;
if (bio_sectors(bio) & (c->opts.block_size - 1)) {
- __bcache_io_error(c, "misaligned write");
+ bch_err_inum_ratelimited(c, op->pos.inode,
+ "misaligned write");
op->error = -EIO;
goto err;
}
if (c->opts.nochanges ||
!percpu_ref_tryget(&c->writes)) {
- if (!(op->flags & BCH_WRITE_FROM_INTERNAL))
- __bcache_io_error(c, "read only");
op->error = -EROFS;
goto err;
}
@@ -1718,7 +1734,8 @@ retry:
* reading a btree node
*/
BUG_ON(!ret);
- __bcache_io_error(c, "btree IO error: %i", ret);
+ bch_err_inum_ratelimited(c, inode,
+ "read error %i from btree lookup", ret);
err:
rbio->bio.bi_status = BLK_STS_IOERR;
out:
@@ -1790,9 +1807,6 @@ static int __bch2_rbio_narrow_crcs(struct btree_trans *trans,
iter = bch2_trans_get_iter(trans, BTREE_ID_EXTENTS, rbio->pos,
BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
- if ((ret = PTR_ERR_OR_ZERO(iter)))
- goto out;
-
k = bch2_btree_iter_peek_slot(iter);
if ((ret = bkey_err(k)))
goto out;
@@ -1925,17 +1939,15 @@ csum_err:
return;
}
- bch2_dev_io_error(ca,
- "data checksum error, inode %llu offset %llu: expected %0llx:%0llx got %0llx:%0llx (type %u)",
- rbio->pos.inode, (u64) rbio->bvec_iter.bi_sector,
+ bch2_dev_inum_io_error(ca, rbio->pos.inode, (u64) rbio->bvec_iter.bi_sector,
+ "data checksum error: expected %0llx:%0llx got %0llx:%0llx (type %u)",
rbio->pick.crc.csum.hi, rbio->pick.crc.csum.lo,
csum.hi, csum.lo, crc.csum_type);
bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR);
return;
decompression_err:
- __bcache_io_error(c, "decompression error, inode %llu offset %llu",
- rbio->pos.inode,
- (u64) rbio->bvec_iter.bi_sector);
+ bch_err_inum_ratelimited(c, rbio->pos.inode,
+ "decompression error");
bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR);
return;
}
@@ -1957,7 +1969,14 @@ static void bch2_read_endio(struct bio *bio)
if (!rbio->split)
rbio->bio.bi_end_io = rbio->end_io;
- if (bch2_dev_io_err_on(bio->bi_status, ca, "data read; %s",
+ /*
+ * XXX: rbio->pos is not what we want here when reading from indirect
+ * extents
+ */
+ if (bch2_dev_inum_io_err_on(bio->bi_status, ca,
+ rbio->pos.inode,
+ rbio->pos.offset,
+ "data read error: %s",
bch2_blk_status_to_str(bio->bi_status))) {
bch2_rbio_error(rbio, READ_RETRY_AVOID, bio->bi_status);
return;
@@ -2000,10 +2019,6 @@ int __bch2_read_indirect_extent(struct btree_trans *trans,
iter = bch2_trans_get_iter(trans, BTREE_ID_REFLINK,
POS(0, reflink_offset),
BTREE_ITER_SLOTS);
- ret = PTR_ERR_OR_ZERO(iter);
- if (ret)
- return ret;
-
k = bch2_btree_iter_peek_slot(iter);
ret = bkey_err(k);
if (ret)
@@ -2011,7 +2026,7 @@ int __bch2_read_indirect_extent(struct btree_trans *trans,
if (k.k->type != KEY_TYPE_reflink_v &&
k.k->type != KEY_TYPE_indirect_inline_data) {
- __bcache_io_error(trans->c,
+ bch_err_inum_ratelimited(trans->c, orig_k->k->k.p.inode,
"pointer to nonexistent indirect extent");
ret = -EIO;
goto err;
@@ -2057,7 +2072,8 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig,
goto hole;
if (pick_ret < 0) {
- __bcache_io_error(c, "no device to read from");
+ bch_err_inum_ratelimited(c, k.k->p.inode,
+ "no device to read from");
goto err;
}
@@ -2207,7 +2223,8 @@ get_bio:
if (!rbio->pick.idx) {
if (!rbio->have_ioref) {
- __bcache_io_error(c, "no device to read from");
+ bch_err_inum_ratelimited(c, k.k->p.inode,
+ "no device to read from");
bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR);
goto out;
}
@@ -2357,7 +2374,9 @@ err:
if (ret == -EINTR)
goto retry;
- bcache_io_error(c, &rbio->bio, "btree IO error: %i", ret);
+ bch_err_inum_ratelimited(c, inode,
+ "read error %i from btree lookup", ret);
+ rbio->bio.bi_status = BLK_STS_IOERR;
bch2_rbio_done(rbio);
goto out;
}
diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c
index c2cafd3892a4..d54424829378 100644
--- a/fs/bcachefs/journal.c
+++ b/fs/bcachefs/journal.c
@@ -18,7 +18,19 @@
#include <trace/events/bcachefs.h>
-static inline struct journal_buf *journal_seq_to_buf(struct journal *, u64);
+static u64 last_unwritten_seq(struct journal *j)
+{
+ union journal_res_state s = READ_ONCE(j->reservations);
+
+ lockdep_assert_held(&j->lock);
+
+ return journal_cur_seq(j) - ((s.idx - s.unwritten_idx) & JOURNAL_BUF_MASK);
+}
+
+static inline bool journal_seq_unwritten(struct journal *j, u64 seq)
+{
+ return seq >= last_unwritten_seq(j);
+}
static bool __journal_entry_is_open(union journal_res_state state)
{
@@ -30,6 +42,22 @@ static bool journal_entry_is_open(struct journal *j)
return __journal_entry_is_open(j->reservations);
}
+static inline struct journal_buf *
+journal_seq_to_buf(struct journal *j, u64 seq)
+{
+ struct journal_buf *buf = NULL;
+
+ EBUG_ON(seq > journal_cur_seq(j));
+ EBUG_ON(seq == journal_cur_seq(j) &&
+ j->reservations.cur_entry_offset == JOURNAL_ENTRY_CLOSED_VAL);
+
+ if (journal_seq_unwritten(j, seq)) {
+ buf = j->buf + (seq & JOURNAL_BUF_MASK);
+ EBUG_ON(le64_to_cpu(buf->data->seq) != seq);
+ }
+ return buf;
+}
+
static void journal_pin_new_entry(struct journal *j, int count)
{
struct journal_entry_pin_list *p;
@@ -51,6 +79,10 @@ static void bch2_journal_buf_init(struct journal *j)
{
struct journal_buf *buf = journal_cur_buf(j);
+ bkey_extent_init(&buf->key);
+ buf->noflush = false;
+ buf->must_flush = false;
+
memset(buf->has_inode, 0, sizeof(buf->has_inode));
memset(buf->data, 0, sizeof(*buf->data));
@@ -72,21 +104,15 @@ void bch2_journal_halt(struct journal *j)
} while ((v = atomic64_cmpxchg(&j->reservations.counter,
old.v, new.v)) != old.v);
+ j->err_seq = journal_cur_seq(j);
journal_wake(j);
closure_wake_up(&journal_cur_buf(j)->wait);
}
/* journal entry close/open: */
-void __bch2_journal_buf_put(struct journal *j, bool need_write_just_set)
+void __bch2_journal_buf_put(struct journal *j)
{
- if (!need_write_just_set &&
- test_bit(JOURNAL_NEED_WRITE, &j->flags))
- bch2_time_stats_update(j->delay_time,
- j->need_write_time);
-
- clear_bit(JOURNAL_NEED_WRITE, &j->flags);
-
closure_call(&j->io, bch2_journal_write, system_highpri_wq, NULL);
}
@@ -99,7 +125,6 @@ static bool __journal_entry_close(struct journal *j)
struct journal_buf *buf = journal_cur_buf(j);
union journal_res_state old, new;
u64 v = atomic64_read(&j->reservations.counter);
- bool set_need_write = false;
unsigned sectors;
lockdep_assert_held(&j->lock);
@@ -118,15 +143,13 @@ static bool __journal_entry_close(struct journal *j)
if (!test_bit(JOURNAL_NEED_WRITE, &j->flags)) {
set_bit(JOURNAL_NEED_WRITE, &j->flags);
j->need_write_time = local_clock();
- set_need_write = true;
}
- if (new.prev_buf_unwritten)
- return false;
-
new.cur_entry_offset = JOURNAL_ENTRY_CLOSED_VAL;
new.idx++;
- new.prev_buf_unwritten = 1;
+
+ if (new.idx == new.unwritten_idx)
+ return false;
BUG_ON(journal_state_count(new, new.idx));
} while ((v = atomic64_cmpxchg(&j->reservations.counter,
@@ -139,8 +162,6 @@ static bool __journal_entry_close(struct journal *j)
BUG_ON(sectors > buf->sectors);
buf->sectors = sectors;
- bkey_extent_init(&buf->key);
-
/*
* We have to set last_seq here, _before_ opening a new journal entry:
*
@@ -162,29 +183,44 @@ static bool __journal_entry_close(struct journal *j)
*/
buf->data->last_seq = cpu_to_le64(journal_last_seq(j));
- if (journal_entry_empty(buf->data))
- clear_bit(JOURNAL_NOT_EMPTY, &j->flags);
- else
- set_bit(JOURNAL_NOT_EMPTY, &j->flags);
+ __bch2_journal_pin_put(j, le64_to_cpu(buf->data->seq));
journal_pin_new_entry(j, 1);
bch2_journal_buf_init(j);
cancel_delayed_work(&j->write_work);
+ clear_bit(JOURNAL_NEED_WRITE, &j->flags);
bch2_journal_space_available(j);
- bch2_journal_buf_put(j, old.idx, set_need_write);
+ bch2_journal_buf_put(j, old.idx);
return true;
}
+static bool journal_entry_want_write(struct journal *j)
+{
+ union journal_res_state s = READ_ONCE(j->reservations);
+ bool ret = false;
+
+ /*
+ * Don't close it yet if we already have a write in flight, but do set
+ * NEED_WRITE:
+ */
+ if (s.idx != s.unwritten_idx)
+ set_bit(JOURNAL_NEED_WRITE, &j->flags);
+ else
+ ret = __journal_entry_close(j);
+
+ return ret;
+}
+
static bool journal_entry_close(struct journal *j)
{
bool ret;
spin_lock(&j->lock);
- ret = __journal_entry_close(j);
+ ret = journal_entry_want_write(j);
spin_unlock(&j->lock);
return ret;
@@ -202,16 +238,19 @@ static bool journal_entry_close(struct journal *j)
*/
static int journal_entry_open(struct journal *j)
{
+ struct bch_fs *c = container_of(j, struct bch_fs, journal);
struct journal_buf *buf = journal_cur_buf(j);
union journal_res_state old, new;
int u64s;
u64 v;
+ BUG_ON(BCH_SB_CLEAN(c->disk_sb.sb));
+
lockdep_assert_held(&j->lock);
BUG_ON(journal_entry_is_open(j));
if (j->blocked)
- return -EAGAIN;
+ return cur_entry_blocked;
if (j->cur_entry_error)
return j->cur_entry_error;
@@ -227,7 +266,7 @@ static int journal_entry_open(struct journal *j)
u64s = clamp_t(int, u64s, 0, JOURNAL_ENTRY_CLOSED_VAL - 1);
if (u64s <= le32_to_cpu(buf->data->u64s))
- return -ENOSPC;
+ return cur_entry_journal_full;
/*
* Must be set before marking the journal entry as open:
@@ -239,7 +278,7 @@ static int journal_entry_open(struct journal *j)
old.v = new.v = v;
if (old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL)
- return -EROFS;
+ return cur_entry_insufficient_devices;
/* Handle any already added entries */
new.cur_entry_offset = le32_to_cpu(buf->data->u64s);
@@ -263,8 +302,8 @@ static int journal_entry_open(struct journal *j)
static bool journal_quiesced(struct journal *j)
{
- union journal_res_state state = READ_ONCE(j->reservations);
- bool ret = !state.prev_buf_unwritten && !__journal_entry_is_open(state);
+ union journal_res_state s = READ_ONCE(j->reservations);
+ bool ret = s.idx == s.unwritten_idx && !__journal_entry_is_open(s);
if (!ret)
journal_entry_close(j);
@@ -291,17 +330,29 @@ static void journal_write_work(struct work_struct *work)
u64 bch2_inode_journal_seq(struct journal *j, u64 inode)
{
size_t h = hash_64(inode, ilog2(sizeof(j->buf[0].has_inode) * 8));
- u64 seq = 0;
+ union journal_res_state s;
+ unsigned i;
+ u64 seq;
- if (!test_bit(h, j->buf[0].has_inode) &&
- !test_bit(h, j->buf[1].has_inode))
- return 0;
spin_lock(&j->lock);
- if (test_bit(h, journal_cur_buf(j)->has_inode))
- seq = journal_cur_seq(j);
- else if (test_bit(h, journal_prev_buf(j)->has_inode))
- seq = journal_cur_seq(j) - 1;
+ seq = journal_cur_seq(j);
+ s = READ_ONCE(j->reservations);
+ i = s.idx;
+
+ while (1) {
+ if (test_bit(h, j->buf[i].has_inode))
+ goto out;
+
+ if (i == s.unwritten_idx)
+ break;
+
+ i = (i - 1) & JOURNAL_BUF_MASK;
+ seq--;
+ }
+
+ seq = 0;
+out:
spin_unlock(&j->lock);
return seq;
@@ -352,7 +403,7 @@ retry:
* Don't want to close current journal entry, just need to
* invoke reclaim:
*/
- ret = -ENOSPC;
+ ret = cur_entry_journal_full;
goto unlock;
}
@@ -375,14 +426,16 @@ retry:
* there's still a previous one in flight:
*/
trace_journal_entry_full(c);
- ret = -EAGAIN;
+ ret = cur_entry_blocked;
} else {
ret = journal_entry_open(j);
}
unlock:
- if ((ret == -EAGAIN || ret == -ENOSPC) &&
- !j->res_get_blocked_start)
+ if ((ret && ret != cur_entry_insufficient_devices) &&
+ !j->res_get_blocked_start) {
j->res_get_blocked_start = local_clock() ?: 1;
+ trace_journal_full(c);
+ }
can_discard = j->can_discard;
spin_unlock(&j->lock);
@@ -390,32 +443,25 @@ unlock:
if (!ret)
goto retry;
- if (ret == -ENOSPC) {
- WARN_ONCE(!can_discard && (flags & JOURNAL_RES_GET_RESERVED),
- "JOURNAL_RES_GET_RESERVED set but journal full");
-
- /*
- * Journal is full - can't rely on reclaim from work item due to
- * freezing:
- */
- trace_journal_full(c);
-
- if (!(flags & JOURNAL_RES_GET_NONBLOCK)) {
- if (can_discard) {
- bch2_journal_do_discards(j);
- goto retry;
- }
-
- if (mutex_trylock(&j->reclaim_lock)) {
- bch2_journal_reclaim(j);
- mutex_unlock(&j->reclaim_lock);
- }
+ /*
+ * Journal is full - can't rely on reclaim from work item due to
+ * freezing:
+ */
+ if ((ret == cur_entry_journal_full ||
+ ret == cur_entry_journal_pin_full) &&
+ !(flags & JOURNAL_RES_GET_NONBLOCK)) {
+ if (can_discard) {
+ bch2_journal_do_discards(j);
+ goto retry;
}
- ret = -EAGAIN;
+ if (mutex_trylock(&j->reclaim_lock)) {
+ bch2_journal_reclaim(j);
+ mutex_unlock(&j->reclaim_lock);
+ }
}
- return ret;
+ return ret == cur_entry_insufficient_devices ? -EROFS : -EAGAIN;
}
/*
@@ -448,8 +494,10 @@ static bool journal_preres_available(struct journal *j,
{
bool ret = bch2_journal_preres_get_fast(j, res, new_u64s, flags);
- if (!ret)
- bch2_journal_reclaim_work(&j->reclaim_work.work);
+ if (!ret && mutex_trylock(&j->reclaim_lock)) {
+ bch2_journal_reclaim(j);
+ mutex_unlock(&j->reclaim_lock);
+ }
return ret;
}
@@ -503,168 +551,80 @@ out:
/* journal flushing: */
-u64 bch2_journal_last_unwritten_seq(struct journal *j)
-{
- u64 seq;
-
- spin_lock(&j->lock);
- seq = journal_cur_seq(j);
- if (j->reservations.prev_buf_unwritten)
- seq--;
- spin_unlock(&j->lock);
-
- return seq;
-}
-
/**
- * bch2_journal_open_seq_async - try to open a new journal entry if @seq isn't
- * open yet, or wait if we cannot
+ * bch2_journal_flush_seq_async - wait for a journal entry to be written
*
- * used by the btree interior update machinery, when it needs to write a new
- * btree root - every journal entry contains the roots of all the btrees, so it
- * doesn't need to bother with getting a journal reservation
+ * like bch2_journal_wait_on_seq, except that it triggers a write immediately if
+ * necessary
*/
-int bch2_journal_open_seq_async(struct journal *j, u64 seq, struct closure *cl)
+int bch2_journal_flush_seq_async(struct journal *j, u64 seq,
+ struct closure *parent)
{
- struct bch_fs *c = container_of(j, struct bch_fs, journal);
- int ret;
-
- spin_lock(&j->lock);
-
- /*
- * Can't try to open more than one sequence number ahead:
- */
- BUG_ON(journal_cur_seq(j) < seq && !journal_entry_is_open(j));
+ struct journal_buf *buf;
+ int ret = 0;
- if (journal_cur_seq(j) > seq ||
- journal_entry_is_open(j)) {
- spin_unlock(&j->lock);
- return 0;
- }
+ if (seq <= j->flushed_seq_ondisk)
+ return 1;
- if (journal_cur_seq(j) < seq &&
- !__journal_entry_close(j)) {
- /* haven't finished writing out the previous one: */
- trace_journal_entry_full(c);
- ret = -EAGAIN;
- } else {
- BUG_ON(journal_cur_seq(j) != seq);
+ spin_lock(&j->lock);
- ret = journal_entry_open(j);
+ /* Recheck under lock: */
+ if (j->err_seq && seq >= j->err_seq) {
+ ret = -EIO;
+ goto out;
}
- if ((ret == -EAGAIN || ret == -ENOSPC) &&
- !j->res_get_blocked_start)
- j->res_get_blocked_start = local_clock() ?: 1;
-
- if (ret == -EAGAIN || ret == -ENOSPC)
- closure_wait(&j->async_wait, cl);
-
- spin_unlock(&j->lock);
-
- if (ret == -ENOSPC) {
- trace_journal_full(c);
- bch2_journal_reclaim_work(&j->reclaim_work.work);
- ret = -EAGAIN;
+ if (seq <= j->flushed_seq_ondisk) {
+ ret = 1;
+ goto out;
}
- return ret;
-}
-
-static int journal_seq_error(struct journal *j, u64 seq)
-{
- union journal_res_state state = READ_ONCE(j->reservations);
-
- if (seq == journal_cur_seq(j))
- return bch2_journal_error(j);
-
- if (seq + 1 == journal_cur_seq(j) &&
- !state.prev_buf_unwritten &&
- seq > j->seq_ondisk)
- return -EIO;
-
- return 0;
-}
+ /* if seq was written, but not flushed - flush a newer one instead */
+ seq = max(seq, last_unwritten_seq(j));
-static inline struct journal_buf *
-journal_seq_to_buf(struct journal *j, u64 seq)
-{
- /* seq should be for a journal entry that has been opened: */
- BUG_ON(seq > journal_cur_seq(j));
- BUG_ON(seq == journal_cur_seq(j) &&
- j->reservations.cur_entry_offset == JOURNAL_ENTRY_CLOSED_VAL);
+recheck_need_open:
+ if (seq == journal_cur_seq(j) && !journal_entry_is_open(j)) {
+ struct journal_res res = { 0 };
- if (seq == journal_cur_seq(j))
- return journal_cur_buf(j);
- if (seq + 1 == journal_cur_seq(j) &&
- j->reservations.prev_buf_unwritten)
- return journal_prev_buf(j);
- return NULL;
-}
+ spin_unlock(&j->lock);
-/**
- * bch2_journal_wait_on_seq - wait for a journal entry to be written
- *
- * does _not_ cause @seq to be written immediately - if there is no other
- * activity to cause the relevant journal entry to be filled up or flushed it
- * can wait for an arbitrary amount of time (up to @j->write_delay_ms, which is
- * configurable).
- */
-void bch2_journal_wait_on_seq(struct journal *j, u64 seq,
- struct closure *parent)
-{
- struct journal_buf *buf;
+ ret = bch2_journal_res_get(j, &res, jset_u64s(0), 0);
+ if (ret)
+ return ret;
- spin_lock(&j->lock);
+ seq = res.seq;
+ buf = j->buf + (seq & JOURNAL_BUF_MASK);
+ buf->must_flush = true;
+ set_bit(JOURNAL_NEED_WRITE, &j->flags);
- if ((buf = journal_seq_to_buf(j, seq))) {
- if (!closure_wait(&buf->wait, parent))
+ if (parent && !closure_wait(&buf->wait, parent))
BUG();
- if (seq == journal_cur_seq(j)) {
- smp_mb();
- if (bch2_journal_error(j))
- closure_wake_up(&buf->wait);
- }
- }
-
- spin_unlock(&j->lock);
-}
-
-/**
- * bch2_journal_flush_seq_async - wait for a journal entry to be written
- *
- * like bch2_journal_wait_on_seq, except that it triggers a write immediately if
- * necessary
- */
-void bch2_journal_flush_seq_async(struct journal *j, u64 seq,
- struct closure *parent)
-{
- struct journal_buf *buf;
-
- spin_lock(&j->lock);
+ bch2_journal_res_put(j, &res);
- if (parent &&
- (buf = journal_seq_to_buf(j, seq)))
- if (!closure_wait(&buf->wait, parent))
- BUG();
-
- if (seq == journal_cur_seq(j))
- __journal_entry_close(j);
- spin_unlock(&j->lock);
-}
+ spin_lock(&j->lock);
+ goto want_write;
+ }
-static int journal_seq_flushed(struct journal *j, u64 seq)
-{
- int ret;
+ /*
+ * if write was kicked off without a flush, flush the next sequence
+ * number instead
+ */
+ buf = journal_seq_to_buf(j, seq);
+ if (buf->noflush) {
+ seq++;
+ goto recheck_need_open;
+ }
- spin_lock(&j->lock);
- ret = seq <= j->seq_ondisk ? 1 : journal_seq_error(j, seq);
+ buf->must_flush = true;
+ if (parent && !closure_wait(&buf->wait, parent))
+ BUG();
+want_write:
if (seq == journal_cur_seq(j))
- __journal_entry_close(j);
+ journal_entry_want_write(j);
+out:
spin_unlock(&j->lock);
-
return ret;
}
@@ -673,28 +633,13 @@ int bch2_journal_flush_seq(struct journal *j, u64 seq)
u64 start_time = local_clock();
int ret, ret2;
- ret = wait_event_killable(j->wait, (ret2 = journal_seq_flushed(j, seq)));
+ ret = wait_event_killable(j->wait, (ret2 = bch2_journal_flush_seq_async(j, seq, NULL)));
bch2_time_stats_update(j->flush_seq_time, start_time);
return ret ?: ret2 < 0 ? ret2 : 0;
}
-/**
- * bch2_journal_meta_async - force a journal entry to be written
- */
-void bch2_journal_meta_async(struct journal *j, struct closure *parent)
-{
- struct journal_res res;
-
- memset(&res, 0, sizeof(res));
-
- bch2_journal_res_get(j, &res, jset_u64s(0), 0);
- bch2_journal_res_put(j, &res);
-
- bch2_journal_flush_seq_async(j, res.seq, parent);
-}
-
int bch2_journal_meta(struct journal *j)
{
struct journal_res res;
@@ -790,16 +735,19 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr,
if (nr <= ja->nr)
return 0;
- ret = -ENOMEM;
new_buckets = kzalloc(nr * sizeof(u64), GFP_KERNEL);
new_bucket_seq = kzalloc(nr * sizeof(u64), GFP_KERNEL);
- if (!new_buckets || !new_bucket_seq)
+ if (!new_buckets || !new_bucket_seq) {
+ ret = -ENOMEM;
goto err;
+ }
journal_buckets = bch2_sb_resize_journal(&ca->disk_sb,
- nr + sizeof(*journal_buckets) / sizeof(u64));
- if (!journal_buckets)
+ nr + sizeof(*journal_buckets) / sizeof(u64));
+ if (!journal_buckets) {
+ ret = -ENOSPC;
goto err;
+ }
/*
* We may be called from the device add path, before the new device has
@@ -828,8 +776,10 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr,
goto err;
}
} else {
+ rcu_read_lock();
ob = bch2_bucket_alloc(c, ca, RESERVE_ALLOC,
false, cl);
+ rcu_read_unlock();
if (IS_ERR(ob)) {
ret = cl ? -EAGAIN : -ENOSPC;
goto err;
@@ -843,6 +793,12 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr,
spin_lock(&c->journal.lock);
}
+ /*
+ * XXX
+ * For resize at runtime, we should be writing the new
+ * superblock before inserting into the journal array
+ */
+
pos = ja->nr ? (ja->cur_idx + 1) % ja->nr : 0;
__array_insert_item(ja->buckets, ja->nr, pos);
__array_insert_item(ja->bucket_seq, ja->nr, pos);
@@ -875,9 +831,9 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr,
if (!new_fs)
bch2_open_bucket_put(c, ob);
}
-
- ret = 0;
err:
+ bch2_sb_resize_journal(&ca->disk_sb,
+ ja->nr + sizeof(*journal_buckets) / sizeof(u64));
kfree(new_bucket_seq);
kfree(new_buckets);
@@ -955,15 +911,18 @@ int bch2_dev_journal_alloc(struct bch_dev *ca)
static bool bch2_journal_writing_to_device(struct journal *j, unsigned dev_idx)
{
union journal_res_state state;
- struct journal_buf *w;
- bool ret;
+ bool ret = false;
+ unsigned i;
spin_lock(&j->lock);
state = READ_ONCE(j->reservations);
- w = j->buf + !state.idx;
+ i = state.idx;
- ret = state.prev_buf_unwritten &&
- bch2_bkey_has_device(bkey_i_to_s_c(&w->key), dev_idx);
+ while (i != state.unwritten_idx) {
+ i = (i - 1) & JOURNAL_BUF_MASK;
+ if (bch2_bkey_has_device(bkey_i_to_s_c(&j->buf[i].key), dev_idx))
+ ret = true;
+ }
spin_unlock(&j->lock);
return ret;
@@ -989,10 +948,11 @@ void bch2_fs_journal_stop(struct journal *j)
journal_quiesce(j);
BUG_ON(!bch2_journal_error(j) &&
- test_bit(JOURNAL_NOT_EMPTY, &j->flags));
+ (journal_entry_is_open(j) ||
+ j->last_empty_seq + 1 != journal_cur_seq(j)));
cancel_delayed_work_sync(&j->write_work);
- cancel_delayed_work_sync(&j->reclaim_work);
+ bch2_journal_reclaim_stop(j);
}
int bch2_fs_journal_start(struct journal *j, u64 cur_seq,
@@ -1045,8 +1005,12 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq,
spin_lock(&j->lock);
set_bit(JOURNAL_STARTED, &j->flags);
+ j->last_flush_write = jiffies;
journal_pin_new_entry(j, 1);
+
+ j->reservations.idx = j->reservations.unwritten_idx = journal_cur_seq(j);
+
bch2_journal_buf_init(j);
c->last_bucket_seq_cleanup = journal_cur_seq(j);
@@ -1100,8 +1064,10 @@ int bch2_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb)
void bch2_fs_journal_exit(struct journal *j)
{
- kvpfree(j->buf[1].data, j->buf[1].buf_size);
- kvpfree(j->buf[0].data, j->buf[0].buf_size);
+ unsigned i;
+
+ for (i = 0; i < ARRAY_SIZE(j->buf); i++)
+ kvpfree(j->buf[i].data, j->buf[i].buf_size);
free_fifo(&j->pin);
}
@@ -1109,6 +1075,7 @@ int bch2_fs_journal_init(struct journal *j)
{
struct bch_fs *c = container_of(j, struct bch_fs, journal);
static struct lock_class_key res_key;
+ unsigned i;
int ret = 0;
pr_verbose_init(c->opts, "");
@@ -1117,15 +1084,12 @@ int bch2_fs_journal_init(struct journal *j)
spin_lock_init(&j->err_lock);
init_waitqueue_head(&j->wait);
INIT_DELAYED_WORK(&j->write_work, journal_write_work);
- INIT_DELAYED_WORK(&j->reclaim_work, bch2_journal_reclaim_work);
init_waitqueue_head(&j->pin_flush_wait);
mutex_init(&j->reclaim_lock);
mutex_init(&j->discard_lock);
lockdep_init_map(&j->res_map, "journal res", &res_key, 0);
- j->buf[0].buf_size = JOURNAL_ENTRY_SIZE_MIN;
- j->buf[1].buf_size = JOURNAL_ENTRY_SIZE_MIN;
j->write_delay_ms = 1000;
j->reclaim_delay_ms = 100;
@@ -1137,13 +1101,20 @@ int bch2_fs_journal_init(struct journal *j)
((union journal_res_state)
{ .cur_entry_offset = JOURNAL_ENTRY_CLOSED_VAL }).v);
- if (!(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL)) ||
- !(j->buf[0].data = kvpmalloc(j->buf[0].buf_size, GFP_KERNEL)) ||
- !(j->buf[1].data = kvpmalloc(j->buf[1].buf_size, GFP_KERNEL))) {
+ if (!(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL))) {
ret = -ENOMEM;
goto out;
}
+ for (i = 0; i < ARRAY_SIZE(j->buf); i++) {
+ j->buf[i].buf_size = JOURNAL_ENTRY_SIZE_MIN;
+ j->buf[i].data = kvpmalloc(j->buf[i].buf_size, GFP_KERNEL);
+ if (!j->buf[i].data) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ }
+
j->pin.front = j->pin.back = 1;
out:
pr_verbose_init(c->opts, "ret %i", ret);
@@ -1152,15 +1123,14 @@ out:
/* debug: */
-void bch2_journal_debug_to_text(struct printbuf *out, struct journal *j)
+void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j)
{
struct bch_fs *c = container_of(j, struct bch_fs, journal);
union journal_res_state s;
struct bch_dev *ca;
- unsigned iter;
+ unsigned i;
rcu_read_lock();
- spin_lock(&j->lock);
s = READ_ONCE(j->reservations);
pr_buf(out,
@@ -1169,7 +1139,12 @@ void bch2_journal_debug_to_text(struct printbuf *out, struct journal *j)
"last_seq:\t\t%llu\n"
"last_seq_ondisk:\t%llu\n"
"prereserved:\t\t%u/%u\n"
+ "nr flush writes:\t%llu\n"
+ "nr noflush writes:\t%llu\n"
+ "nr direct reclaim:\t%llu\n"
+ "nr background reclaim:\t%llu\n"
"current entry sectors:\t%u\n"
+ "current entry error:\t%u\n"
"current entry:\t\t",
fifo_used(&j->pin),
journal_cur_seq(j),
@@ -1177,7 +1152,12 @@ void bch2_journal_debug_to_text(struct printbuf *out, struct journal *j)
j->last_seq_ondisk,
j->prereserved.reserved,
j->prereserved.remaining,
- j->cur_entry_sectors);
+ j->nr_flush_writes,
+ j->nr_noflush_writes,
+ j->nr_direct_reclaim,
+ j->nr_background_reclaim,
+ j->cur_entry_sectors,
+ j->cur_entry_error);
switch (s.cur_entry_offset) {
case JOURNAL_ENTRY_ERROR_VAL:
@@ -1194,16 +1174,16 @@ void bch2_journal_debug_to_text(struct printbuf *out, struct journal *j)
}
pr_buf(out,
- "current entry refs:\t%u\n"
- "prev entry unwritten:\t",
- journal_state_count(s, s.idx));
-
- if (s.prev_buf_unwritten)
- pr_buf(out, "yes, ref %u sectors %u\n",
- journal_state_count(s, !s.idx),
- journal_prev_buf(j)->sectors);
- else
- pr_buf(out, "no\n");
+ "current entry:\t\tidx %u refcount %u\n",
+ s.idx, journal_state_count(s, s.idx));
+
+ i = s.idx;
+ while (i != s.unwritten_idx) {
+ i = (i - 1) & JOURNAL_BUF_MASK;
+
+ pr_buf(out, "unwritten entry:\tidx %u refcount %u sectors %u\n",
+ i, journal_state_count(s, i), j->buf[i].sectors);
+ }
pr_buf(out,
"need write:\t\t%i\n"
@@ -1211,7 +1191,21 @@ void bch2_journal_debug_to_text(struct printbuf *out, struct journal *j)
test_bit(JOURNAL_NEED_WRITE, &j->flags),
test_bit(JOURNAL_REPLAY_DONE, &j->flags));
- for_each_member_device_rcu(ca, c, iter,
+ pr_buf(out, "space:\n");
+ pr_buf(out, "\tdiscarded\t%u:%u\n",
+ j->space[journal_space_discarded].next_entry,
+ j->space[journal_space_discarded].total);
+ pr_buf(out, "\tclean ondisk\t%u:%u\n",
+ j->space[journal_space_clean_ondisk].next_entry,
+ j->space[journal_space_clean_ondisk].total);
+ pr_buf(out, "\tclean\t\t%u:%u\n",
+ j->space[journal_space_clean].next_entry,
+ j->space[journal_space_clean].total);
+ pr_buf(out, "\ttotal\t\t%u:%u\n",
+ j->space[journal_space_total].next_entry,
+ j->space[journal_space_total].total);
+
+ for_each_member_device_rcu(ca, c, i,
&c->rw_devs[BCH_DATA_journal]) {
struct journal_device *ja = &ca->journal;
@@ -1221,12 +1215,13 @@ void bch2_journal_debug_to_text(struct printbuf *out, struct journal *j)
pr_buf(out,
"dev %u:\n"
"\tnr\t\t%u\n"
+ "\tbucket size\t%u\n"
"\tavailable\t%u:%u\n"
- "\tdiscard_idx\t\t%u\n"
- "\tdirty_idx_ondisk\t%u (seq %llu)\n"
- "\tdirty_idx\t\t%u (seq %llu)\n"
+ "\tdiscard_idx\t%u\n"
+ "\tdirty_ondisk\t%u (seq %llu)\n"
+ "\tdirty_idx\t%u (seq %llu)\n"
"\tcur_idx\t\t%u (seq %llu)\n",
- iter, ja->nr,
+ i, ja->nr, ca->mi.bucket_size,
bch2_journal_dev_buckets_available(j, ja, journal_space_discarded),
ja->sectors_free,
ja->discard_idx,
@@ -1235,10 +1230,16 @@ void bch2_journal_debug_to_text(struct printbuf *out, struct journal *j)
ja->cur_idx, ja->bucket_seq[ja->cur_idx]);
}
- spin_unlock(&j->lock);
rcu_read_unlock();
}
+void bch2_journal_debug_to_text(struct printbuf *out, struct journal *j)
+{
+ spin_lock(&j->lock);
+ __bch2_journal_debug_to_text(out, j);
+ spin_unlock(&j->lock);
+}
+
void bch2_journal_pins_to_text(struct printbuf *out, struct journal *j)
{
struct journal_entry_pin_list *pin_list;
diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h
index 0cbe9df384f9..1db1f190a168 100644
--- a/fs/bcachefs/journal.h
+++ b/fs/bcachefs/journal.h
@@ -127,11 +127,6 @@ static inline struct journal_buf *journal_cur_buf(struct journal *j)
return j->buf + j->reservations.idx;
}
-static inline struct journal_buf *journal_prev_buf(struct journal *j)
-{
- return j->buf + !j->reservations.idx;
-}
-
/* Sequence number of oldest dirty journal entry */
static inline u64 journal_last_seq(struct journal *j)
@@ -141,7 +136,7 @@ static inline u64 journal_last_seq(struct journal *j)
static inline u64 journal_cur_seq(struct journal *j)
{
- BUG_ON(j->pin.back - 1 != atomic64_read(&j->seq));
+ EBUG_ON(j->pin.back - 1 != atomic64_read(&j->seq));
return j->pin.back - 1;
}
@@ -151,13 +146,21 @@ void bch2_journal_set_has_inum(struct journal *, u64, u64);
static inline int journal_state_count(union journal_res_state s, int idx)
{
- return idx == 0 ? s.buf0_count : s.buf1_count;
+ switch (idx) {
+ case 0: return s.buf0_count;
+ case 1: return s.buf1_count;
+ case 2: return s.buf2_count;
+ case 3: return s.buf3_count;
+ }
+ BUG();
}
static inline void journal_state_inc(union journal_res_state *s)
{
s->buf0_count += s->idx == 0;
s->buf1_count += s->idx == 1;
+ s->buf2_count += s->idx == 2;
+ s->buf3_count += s->idx == 3;
}
static inline void bch2_journal_set_has_inode(struct journal *j,
@@ -255,21 +258,24 @@ static inline bool journal_entry_empty(struct jset *j)
return true;
}
-void __bch2_journal_buf_put(struct journal *, bool);
+void __bch2_journal_buf_put(struct journal *);
-static inline void bch2_journal_buf_put(struct journal *j, unsigned idx,
- bool need_write_just_set)
+static inline void bch2_journal_buf_put(struct journal *j, unsigned idx)
{
union journal_res_state s;
s.v = atomic64_sub_return(((union journal_res_state) {
.buf0_count = idx == 0,
.buf1_count = idx == 1,
+ .buf2_count = idx == 2,
+ .buf3_count = idx == 3,
}).v, &j->reservations.counter);
- if (!journal_state_count(s, idx)) {
- EBUG_ON(s.idx == idx || !s.prev_buf_unwritten);
- __bch2_journal_buf_put(j, need_write_just_set);
- }
+
+ EBUG_ON(((s.idx - idx) & 3) >
+ ((s.idx - s.unwritten_idx) & 3));
+
+ if (!journal_state_count(s, idx) && idx == s.unwritten_idx)
+ __bch2_journal_buf_put(j);
}
/*
@@ -282,14 +288,14 @@ static inline void bch2_journal_res_put(struct journal *j,
if (!res->ref)
return;
- lock_release(&j->res_map, 0, _THIS_IP_);
+ lock_release(&j->res_map, _THIS_IP_);
while (res->u64s)
bch2_journal_add_entry(j, res,
BCH_JSET_ENTRY_btree_keys,
0, 0, NULL, 0);
- bch2_journal_buf_put(j, res->idx, false);
+ bch2_journal_buf_put(j, res->idx);
res->ref = 0;
}
@@ -325,11 +331,18 @@ static inline int journal_res_get_fast(struct journal *j,
!test_bit(JOURNAL_MAY_GET_UNRESERVED, &j->flags))
return 0;
- if (flags & JOURNAL_RES_GET_CHECK)
- return 1;
-
new.cur_entry_offset += res->u64s;
journal_state_inc(&new);
+
+ /*
+ * If the refcount would overflow, we have to wait:
+ * XXX - tracepoint this:
+ */
+ if (!journal_state_count(new, new.idx))
+ return 0;
+
+ if (flags & JOURNAL_RES_GET_CHECK)
+ return 1;
} while ((v = atomic64_cmpxchg(&j->reservations.counter,
old.v, new.v)) != old.v);
@@ -371,7 +384,7 @@ out:
static inline bool journal_check_may_get_unreserved(struct journal *j)
{
union journal_preres_state s = READ_ONCE(j->prereserved);
- bool ret = s.reserved <= s.remaining &&
+ bool ret = s.reserved < s.remaining &&
fifo_free(&j->pin) > 8;
lockdep_assert_held(&j->lock);
@@ -464,13 +477,8 @@ void bch2_journal_entry_res_resize(struct journal *,
struct journal_entry_res *,
unsigned);
-u64 bch2_journal_last_unwritten_seq(struct journal *);
-int bch2_journal_open_seq_async(struct journal *, u64, struct closure *);
-
-void bch2_journal_wait_on_seq(struct journal *, u64, struct closure *);
-void bch2_journal_flush_seq_async(struct journal *, u64, struct closure *);
+int bch2_journal_flush_seq_async(struct journal *, u64, struct closure *);
void bch2_journal_flush_async(struct journal *, struct closure *);
-void bch2_journal_meta_async(struct journal *, struct closure *);
int bch2_journal_flush_seq(struct journal *, u64);
int bch2_journal_flush(struct journal *);
@@ -500,6 +508,7 @@ static inline void bch2_journal_set_replay_done(struct journal *j)
void bch2_journal_unblock(struct journal *);
void bch2_journal_block(struct journal *);
+void __bch2_journal_debug_to_text(struct printbuf *, struct journal *);
void bch2_journal_debug_to_text(struct printbuf *, struct journal *);
void bch2_journal_pins_to_text(struct printbuf *, struct journal *);
diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c
index bd0e6b371701..0e6fbe2f6a75 100644
--- a/fs/bcachefs/journal_io.c
+++ b/fs/bcachefs/journal_io.c
@@ -10,10 +10,27 @@
#include "journal.h"
#include "journal_io.h"
#include "journal_reclaim.h"
+#include "journal_seq_blacklist.h"
#include "replicas.h"
#include <trace/events/bcachefs.h>
+static void __journal_replay_free(struct journal_replay *i)
+{
+ list_del(&i->list);
+ kvpfree(i, offsetof(struct journal_replay, j) +
+ vstruct_bytes(&i->j));
+
+}
+
+static void journal_replay_free(struct bch_fs *c, struct journal_replay *i)
+{
+ i->ignore = true;
+
+ if (!c->opts.read_entire_journal)
+ __journal_replay_free(i);
+}
+
struct journal_list {
struct closure cl;
struct mutex lock;
@@ -36,28 +53,29 @@ static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca,
struct bch_devs_list devs = { .nr = 0 };
struct list_head *where;
size_t bytes = vstruct_bytes(j);
- __le64 last_seq;
+ u64 last_seq = 0;
int ret;
- last_seq = !list_empty(jlist->head)
- ? list_last_entry(jlist->head, struct journal_replay,
- list)->j.last_seq
- : 0;
-
- if (!c->opts.read_entire_journal) {
- /* Is this entry older than the range we need? */
- if (le64_to_cpu(j->seq) < le64_to_cpu(last_seq)) {
- ret = JOURNAL_ENTRY_ADD_OUT_OF_RANGE;
- goto out;
+ list_for_each_entry_reverse(i, jlist->head, list) {
+ if (!JSET_NO_FLUSH(&i->j)) {
+ last_seq = le64_to_cpu(i->j.last_seq);
+ break;
}
+ }
+
+ /* Is this entry older than the range we need? */
+ if (!c->opts.read_entire_journal &&
+ le64_to_cpu(j->seq) < last_seq) {
+ ret = JOURNAL_ENTRY_ADD_OUT_OF_RANGE;
+ goto out;
+ }
- /* Drop entries we don't need anymore */
+ /* Drop entries we don't need anymore */
+ if (!JSET_NO_FLUSH(j)) {
list_for_each_entry_safe(i, pos, jlist->head, list) {
if (le64_to_cpu(i->j.seq) >= le64_to_cpu(j->last_seq))
break;
- list_del(&i->list);
- kvpfree(i, offsetof(struct journal_replay, j) +
- vstruct_bytes(&i->j));
+ journal_replay_free(c, i);
}
}
@@ -81,9 +99,7 @@ add:
if (i && le64_to_cpu(j->seq) == le64_to_cpu(i->j.seq)) {
if (i->bad) {
devs = i->devs;
- list_del(&i->list);
- kvpfree(i, offsetof(struct journal_replay, j) +
- vstruct_bytes(&i->j));
+ __journal_replay_free(i);
} else if (bad) {
goto found;
} else {
@@ -105,6 +121,7 @@ add:
list_add(&i->list, where);
i->devs = devs;
i->bad = bad;
+ i->ignore = false;
memcpy(&i->j, j, bytes);
found:
if (!bch2_dev_list_has_dev(i->devs, ca->dev_idx))
@@ -161,6 +178,8 @@ static void journal_entry_null_range(void *start, void *end)
#define journal_entry_err_on(cond, c, msg, ...) \
((cond) ? journal_entry_err(c, msg, ##__VA_ARGS__) : false)
+#define FSCK_DELETED_KEY 5
+
static int journal_validate_key(struct bch_fs *c, struct jset *jset,
struct jset_entry *entry,
unsigned level, enum btree_id btree_id,
@@ -173,28 +192,42 @@ static int journal_validate_key(struct bch_fs *c, struct jset *jset,
int ret = 0;
if (journal_entry_err_on(!k->k.u64s, c,
- "invalid %s in journal: k->u64s 0", type)) {
+ "invalid %s in jset %llu offset %zi/%u entry offset %zi/%u: k->u64s 0",
+ type, le64_to_cpu(jset->seq),
+ (u64 *) entry - jset->_data,
+ le32_to_cpu(jset->u64s),
+ (u64 *) k - entry->_data,
+ le16_to_cpu(entry->u64s))) {
entry->u64s = cpu_to_le16((u64 *) k - entry->_data);
journal_entry_null_range(vstruct_next(entry), next);
- return 0;
+ return FSCK_DELETED_KEY;
}
if (journal_entry_err_on((void *) bkey_next(k) >
(void *) vstruct_next(entry), c,
- "invalid %s in journal: extends past end of journal entry",
- type)) {
+ "invalid %s in jset %llu offset %zi/%u entry offset %zi/%u: extends past end of journal entry",
+ type, le64_to_cpu(jset->seq),
+ (u64 *) entry - jset->_data,
+ le32_to_cpu(jset->u64s),
+ (u64 *) k - entry->_data,
+ le16_to_cpu(entry->u64s))) {
entry->u64s = cpu_to_le16((u64 *) k - entry->_data);
journal_entry_null_range(vstruct_next(entry), next);
- return 0;
+ return FSCK_DELETED_KEY;
}
if (journal_entry_err_on(k->k.format != KEY_FORMAT_CURRENT, c,
- "invalid %s in journal: bad format %u",
- type, k->k.format)) {
- le16_add_cpu(&entry->u64s, -k->k.u64s);
+ "invalid %s in jset %llu offset %zi/%u entry offset %zi/%u: bad format %u",
+ type, le64_to_cpu(jset->seq),
+ (u64 *) entry - jset->_data,
+ le32_to_cpu(jset->u64s),
+ (u64 *) k - entry->_data,
+ le16_to_cpu(entry->u64s),
+ k->k.format)) {
+ le16_add_cpu(&entry->u64s, -((u16) k->k.u64s));
memmove(k, bkey_next(k), next - (void *) bkey_next(k));
journal_entry_null_range(vstruct_next(entry), next);
- return 0;
+ return FSCK_DELETED_KEY;
}
if (!write)
@@ -208,13 +241,18 @@ static int journal_validate_key(struct bch_fs *c, struct jset *jset,
char buf[160];
bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(k));
- mustfix_fsck_err(c, "invalid %s in journal: %s\n%s",
- type, invalid, buf);
-
- le16_add_cpu(&entry->u64s, -k->k.u64s);
+ mustfix_fsck_err(c, "invalid %s in jset %llu offset %zi/%u entry offset %zi/%u: %s\n%s",
+ type, le64_to_cpu(jset->seq),
+ (u64 *) entry - jset->_data,
+ le32_to_cpu(jset->u64s),
+ (u64 *) k - entry->_data,
+ le16_to_cpu(entry->u64s),
+ invalid, buf);
+
+ le16_add_cpu(&entry->u64s, -((u16) k->k.u64s));
memmove(k, bkey_next(k), next - (void *) bkey_next(k));
journal_entry_null_range(vstruct_next(entry), next);
- return 0;
+ return FSCK_DELETED_KEY;
}
if (write)
@@ -230,15 +268,17 @@ static int journal_entry_validate_btree_keys(struct bch_fs *c,
struct jset_entry *entry,
int write)
{
- struct bkey_i *k;
+ struct bkey_i *k = entry->start;
- vstruct_for_each(entry, k) {
+ while (k != vstruct_last(entry)) {
int ret = journal_validate_key(c, jset, entry,
entry->level,
entry->btree_id,
k, "key", write);
- if (ret)
- return ret;
+ if (ret == FSCK_DELETED_KEY)
+ continue;
+
+ k = bkey_next(k);
}
return 0;
@@ -432,46 +472,45 @@ static int jset_validate(struct bch_fs *c,
"%s sector %llu seq %llu: unknown journal entry version %u",
ca->name, sector, le64_to_cpu(jset->seq),
version)) {
- /* XXX: note we might have missing journal entries */
- return JOURNAL_ENTRY_BAD;
+ /* don't try to continue: */
+ return EINVAL;
}
+ if (bytes > (sectors_read << 9) &&
+ sectors_read < bucket_sectors_left)
+ return JOURNAL_ENTRY_REREAD;
+
if (journal_entry_err_on(bytes > bucket_sectors_left << 9, c,
"%s sector %llu seq %llu: journal entry too big (%zu bytes)",
ca->name, sector, le64_to_cpu(jset->seq), bytes)) {
- /* XXX: note we might have missing journal entries */
- return JOURNAL_ENTRY_BAD;
+ ret = JOURNAL_ENTRY_BAD;
+ le32_add_cpu(&jset->u64s,
+ -((bytes - (bucket_sectors_left << 9)) / 8));
}
- if (bytes > sectors_read << 9)
- return JOURNAL_ENTRY_REREAD;
-
if (fsck_err_on(!bch2_checksum_type_valid(c, JSET_CSUM_TYPE(jset)), c,
"%s sector %llu seq %llu: journal entry with unknown csum type %llu",
ca->name, sector, le64_to_cpu(jset->seq),
- JSET_CSUM_TYPE(jset)))
- return JOURNAL_ENTRY_BAD;
+ JSET_CSUM_TYPE(jset))) {
+ ret = JOURNAL_ENTRY_BAD;
+ goto bad_csum_type;
+ }
csum = csum_vstruct(c, JSET_CSUM_TYPE(jset), journal_nonce(jset), jset);
if (journal_entry_err_on(bch2_crc_cmp(csum, jset->csum), c,
"%s sector %llu seq %llu: journal checksum bad",
- ca->name, sector, le64_to_cpu(jset->seq))) {
- /* XXX: retry IO, when we start retrying checksum errors */
- /* XXX: note we might have missing journal entries */
- return JOURNAL_ENTRY_BAD;
- }
+ ca->name, sector, le64_to_cpu(jset->seq)))
+ ret = JOURNAL_ENTRY_BAD;
bch2_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset),
jset->encrypted_start,
vstruct_end(jset) - (void *) jset->encrypted_start);
-
+bad_csum_type:
if (journal_entry_err_on(le64_to_cpu(jset->last_seq) > le64_to_cpu(jset->seq), c,
"invalid journal entry: last_seq > seq")) {
jset->last_seq = jset->seq;
return JOURNAL_ENTRY_BAD;
}
-
- return 0;
fsck_err:
return ret;
}
@@ -536,7 +575,7 @@ reread:
bio_put(bio);
if (bch2_dev_io_err_on(ret, ca,
- "journal read from sector %llu",
+ "journal read error: sector %llu",
offset) ||
bch2_meta_read_fault("journal"))
return -EIO;
@@ -677,14 +716,16 @@ err:
goto out;
}
-int bch2_journal_read(struct bch_fs *c, struct list_head *list)
+int bch2_journal_read(struct bch_fs *c, struct list_head *list,
+ u64 *blacklist_seq, u64 *start_seq)
{
struct journal_list jlist;
- struct journal_replay *i;
+ struct journal_replay *i, *t;
struct bch_dev *ca;
unsigned iter;
size_t keys = 0, entries = 0;
bool degraded = false;
+ u64 seq, last_seq = 0;
int ret = 0;
closure_init_stack(&jlist.cl);
@@ -713,12 +754,97 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list)
if (jlist.ret)
return jlist.ret;
+ if (list_empty(list)) {
+ bch_info(c, "journal read done, but no entries found");
+ return 0;
+ }
+
+ i = list_last_entry(list, struct journal_replay, list);
+ *start_seq = le64_to_cpu(i->j.seq) + 1;
+
+ /*
+ * Find most recent flush entry, and ignore newer non-flush entries -
+ * those entries will be blacklisted:
+ */
+ list_for_each_entry_safe_reverse(i, t, list, list) {
+ if (i->ignore)
+ continue;
+
+ if (!JSET_NO_FLUSH(&i->j)) {
+ last_seq = le64_to_cpu(i->j.last_seq);
+ *blacklist_seq = le64_to_cpu(i->j.seq) + 1;
+ break;
+ }
+
+ journal_replay_free(c, i);
+ }
+
+ if (!last_seq) {
+ fsck_err(c, "journal read done, but no entries found after dropping non-flushes");
+ return -1;
+ }
+
+ /* Drop blacklisted entries and entries older than last_seq: */
+ list_for_each_entry_safe(i, t, list, list) {
+ if (i->ignore)
+ continue;
+
+ seq = le64_to_cpu(i->j.seq);
+ if (seq < last_seq) {
+ journal_replay_free(c, i);
+ continue;
+ }
+
+ if (bch2_journal_seq_is_blacklisted(c, seq, true)) {
+ fsck_err_on(!JSET_NO_FLUSH(&i->j), c,
+ "found blacklisted journal entry %llu", seq);
+
+ journal_replay_free(c, i);
+ }
+ }
+
+ /* Check for missing entries: */
+ seq = last_seq;
+ list_for_each_entry(i, list, list) {
+ if (i->ignore)
+ continue;
+
+ BUG_ON(seq > le64_to_cpu(i->j.seq));
+
+ while (seq < le64_to_cpu(i->j.seq)) {
+ u64 missing_start, missing_end;
+
+ while (seq < le64_to_cpu(i->j.seq) &&
+ bch2_journal_seq_is_blacklisted(c, seq, false))
+ seq++;
+
+ if (seq == le64_to_cpu(i->j.seq))
+ break;
+
+ missing_start = seq;
+
+ while (seq < le64_to_cpu(i->j.seq) &&
+ !bch2_journal_seq_is_blacklisted(c, seq, false))
+ seq++;
+
+ missing_end = seq - 1;
+ fsck_err(c, "journal entries %llu-%llu missing! (replaying %llu-%llu)",
+ missing_start, missing_end,
+ last_seq, *blacklist_seq - 1);
+ }
+
+ seq++;
+ }
+
list_for_each_entry(i, list, list) {
struct jset_entry *entry;
struct bkey_i *k, *_n;
struct bch_replicas_padded replicas;
char buf[80];
+ if (i->ignore)
+ continue;
+
ret = jset_validate_entries(c, &i->j, READ);
if (ret)
goto fsck_err;
@@ -746,12 +872,12 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list)
entries++;
}
- if (!list_empty(list)) {
- i = list_last_entry(list, struct journal_replay, list);
+ bch_info(c, "journal read done, %zu keys in %zu entries, seq %llu",
+ keys, entries, *start_seq);
- bch_info(c, "journal read done, %zu keys in %zu entries, seq %llu",
- keys, entries, le64_to_cpu(i->j.seq));
- }
+ if (*start_seq != *blacklist_seq)
+ bch_info(c, "dropped unflushed entries %llu-%llu",
+ *blacklist_seq, *start_seq - 1);
fsck_err:
return ret;
}
@@ -929,36 +1055,51 @@ static void journal_buf_realloc(struct journal *j, struct journal_buf *buf)
buf->buf_size = new_size;
}
+static inline struct journal_buf *journal_last_unwritten_buf(struct journal *j)
+{
+ return j->buf + j->reservations.unwritten_idx;
+}
+
static void journal_write_done(struct closure *cl)
{
struct journal *j = container_of(cl, struct journal, io);
struct bch_fs *c = container_of(j, struct bch_fs, journal);
- struct journal_buf *w = journal_prev_buf(j);
+ struct journal_buf *w = journal_last_unwritten_buf(j);
struct bch_devs_list devs =
bch2_bkey_devs(bkey_i_to_s_c(&w->key));
struct bch_replicas_padded replicas;
+ union journal_res_state old, new;
u64 seq = le64_to_cpu(w->data->seq);
u64 last_seq = le64_to_cpu(w->data->last_seq);
+ u64 v;
+ int err = 0;
bch2_time_stats_update(j->write_time, j->write_start_time);
if (!devs.nr) {
bch_err(c, "unable to write journal to sufficient devices");
- goto err;
+ err = -EIO;
+ } else {
+ bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal, devs);
+ if (bch2_mark_replicas(c, &replicas.e))
+ err = -EIO;
}
- bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal, devs);
-
- if (bch2_mark_replicas(c, &replicas.e))
- goto err;
+ if (err)
+ bch2_fatal_error(c);
spin_lock(&j->lock);
if (seq >= j->pin.front)
journal_seq_pin(j, seq)->devs = devs;
j->seq_ondisk = seq;
- j->last_seq_ondisk = last_seq;
- bch2_journal_space_available(j);
+ if (err && (!j->err_seq || seq < j->err_seq))
+ j->err_seq = seq;
+
+ if (!w->noflush) {
+ j->flushed_seq_ondisk = seq;
+ j->last_seq_ondisk = last_seq;
+ }
/*
* Updating last_seq_ondisk may let bch2_journal_reclaim_work() discard
@@ -967,14 +1108,21 @@ static void journal_write_done(struct closure *cl)
* Must come before signaling write completion, for
* bch2_fs_journal_stop():
*/
- mod_delayed_work(c->journal_reclaim_wq, &j->reclaim_work, 0);
-out:
+ journal_reclaim_kick(&c->journal);
+
/* also must come before signalling write completion: */
closure_debug_destroy(cl);
- BUG_ON(!j->reservations.prev_buf_unwritten);
- atomic64_sub(((union journal_res_state) { .prev_buf_unwritten = 1 }).v,
- &j->reservations.counter);
+ v = atomic64_read(&j->reservations.counter);
+ do {
+ old.v = new.v = v;
+ BUG_ON(new.idx == new.unwritten_idx);
+
+ new.unwritten_idx++;
+ } while ((v = atomic64_cmpxchg(&j->reservations.counter,
+ old.v, new.v)) != old.v);
+
+ bch2_journal_space_available(j);
closure_wake_up(&w->wait);
journal_wake(j);
@@ -982,11 +1130,10 @@ out:
if (test_bit(JOURNAL_NEED_WRITE, &j->flags))
mod_delayed_work(system_freezable_wq, &j->write_work, 0);
spin_unlock(&j->lock);
- return;
-err:
- bch2_fatal_error(c);
- spin_lock(&j->lock);
- goto out;
+
+ if (new.unwritten_idx != new.idx &&
+ !journal_state_count(new, new.unwritten_idx))
+ closure_call(&j->io, bch2_journal_write, system_highpri_wq, NULL);
}
static void journal_write_endio(struct bio *bio)
@@ -994,10 +1141,10 @@ static void journal_write_endio(struct bio *bio)
struct bch_dev *ca = bio->bi_private;
struct journal *j = &ca->fs->journal;
- if (bch2_dev_io_err_on(bio->bi_status, ca, "journal write: %s",
+ if (bch2_dev_io_err_on(bio->bi_status, ca, "journal write error: %s",
bch2_blk_status_to_str(bio->bi_status)) ||
bch2_meta_write_fault("journal")) {
- struct journal_buf *w = journal_prev_buf(j);
+ struct journal_buf *w = journal_last_unwritten_buf(j);
unsigned long flags;
spin_lock_irqsave(&j->err_lock, flags);
@@ -1014,7 +1161,7 @@ void bch2_journal_write(struct closure *cl)
struct journal *j = container_of(cl, struct journal, io);
struct bch_fs *c = container_of(j, struct bch_fs, journal);
struct bch_dev *ca;
- struct journal_buf *w = journal_prev_buf(j);
+ struct journal_buf *w = journal_last_unwritten_buf(j);
struct jset_entry *start, *end;
struct jset *jset;
struct bio *bio;
@@ -1023,13 +1170,29 @@ void bch2_journal_write(struct closure *cl)
unsigned i, sectors, bytes, u64s;
int ret;
- bch2_journal_pin_put(j, le64_to_cpu(w->data->seq));
+ BUG_ON(BCH_SB_CLEAN(c->disk_sb.sb));
journal_buf_realloc(j, w);
jset = w->data;
j->write_start_time = local_clock();
+ spin_lock(&j->lock);
+ if (c->sb.features & (1ULL << BCH_FEATURE_journal_no_flush) &&
+ !w->must_flush &&
+ (jiffies - j->last_flush_write) < msecs_to_jiffies(j->write_delay_ms) &&
+ test_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags)) {
+ w->noflush = true;
+ SET_JSET_NO_FLUSH(jset, true);
+ jset->last_seq = cpu_to_le64(j->last_seq_ondisk);
+
+ j->nr_noflush_writes++;
+ } else {
+ j->last_flush_write = jiffies;
+ j->nr_flush_writes++;
+ }
+ spin_unlock(&j->lock);
+
/*
* New btree roots are set by journalling them; when the journal entry
* gets written we have to propagate them to c->btree_roots
@@ -1067,6 +1230,9 @@ void bch2_journal_write(struct closure *cl)
SET_JSET_BIG_ENDIAN(jset, CPU_BIG_ENDIAN);
SET_JSET_CSUM_TYPE(jset, bch2_meta_checksum_type(c));
+ if (journal_entry_empty(jset))
+ j->last_empty_seq = le64_to_cpu(jset->seq);
+
if (bch2_csum_type_is_encryption(JSET_CSUM_TYPE(jset)))
validate_before_checksum = true;
@@ -1148,8 +1314,9 @@ retry_alloc:
bio->bi_iter.bi_sector = ptr->offset;
bio->bi_end_io = journal_write_endio;
bio->bi_private = ca;
- bio_set_op_attrs(bio, REQ_OP_WRITE,
- REQ_SYNC|REQ_META|REQ_PREFLUSH|REQ_FUA);
+ bio->bi_opf = REQ_OP_WRITE|REQ_SYNC|REQ_META;
+ if (!JSET_NO_FLUSH(jset))
+ bio->bi_opf |= REQ_PREFLUSH|REQ_FUA;
bch2_bio_map(bio, jset, sectors << 9);
trace_journal_write(bio);
@@ -1158,20 +1325,21 @@ retry_alloc:
ca->journal.bucket_seq[ca->journal.cur_idx] = le64_to_cpu(jset->seq);
}
- for_each_rw_member(ca, c, i)
- if (journal_flushes_device(ca) &&
- !bch2_bkey_has_device(bkey_i_to_s_c(&w->key), i)) {
- percpu_ref_get(&ca->io_ref);
-
- bio = ca->journal.bio;
- bio_reset(bio);
- bio_set_dev(bio, ca->disk_sb.bdev);
- bio->bi_opf = REQ_OP_FLUSH;
- bio->bi_end_io = journal_write_endio;
- bio->bi_private = ca;
- closure_bio_submit(bio, cl);
- }
-
+ if (!JSET_NO_FLUSH(jset)) {
+ for_each_rw_member(ca, c, i)
+ if (journal_flushes_device(ca) &&
+ !bch2_bkey_has_device(bkey_i_to_s_c(&w->key), i)) {
+ percpu_ref_get(&ca->io_ref);
+
+ bio = ca->journal.bio;
+ bio_reset(bio);
+ bio_set_dev(bio, ca->disk_sb.bdev);
+ bio->bi_opf = REQ_OP_FLUSH;
+ bio->bi_end_io = journal_write_endio;
+ bio->bi_private = ca;
+ closure_bio_submit(bio, cl);
+ }
+ }
no_io:
bch2_bucket_seq_cleanup(c);
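The hunks above make flushing conditional: only flush writes carry REQ_PREFLUSH|REQ_FUA and fan out flush bios to the remaining devices, and whether a write may be demoted to no-flush depends on four conditions checked under j->lock. A self-contained sketch of that decision, with plain booleans and millisecond timestamps standing in for the feature bit, journal flags and jiffies (names here are illustrative):

#include <stdbool.h>
#include <stdint.h>

struct noflush_ctx {
	bool		feature_no_flush;	/* BCH_FEATURE_journal_no_flush enabled */
	bool		must_flush;		/* someone explicitly wants a flush write */
	bool		may_skip_flush;		/* JOURNAL_MAY_SKIP_FLUSH currently set */
	uint64_t	now_ms;
	uint64_t	last_flush_write_ms;
	uint64_t	write_delay_ms;
};

static bool journal_write_may_skip_flush(const struct noflush_ctx *c)
{
	return c->feature_no_flush &&
	       !c->must_flush &&
	       c->may_skip_flush &&
	       (c->now_ms - c->last_flush_write_ms) < c->write_delay_ms;
}

When this returns true the entry is marked JSET_NO_FLUSH and keeps the previous last_seq, so a no-flush write never advances what the journal considers durably flushed.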
diff --git a/fs/bcachefs/journal_io.h b/fs/bcachefs/journal_io.h
index 6958ee0f8cf2..6b4c80968f52 100644
--- a/fs/bcachefs/journal_io.h
+++ b/fs/bcachefs/journal_io.h
@@ -11,6 +11,7 @@ struct journal_replay {
struct bch_devs_list devs;
/* checksum error, but we may want to try using it anyways: */
bool bad;
+ bool ignore;
/* must be last: */
struct jset j;
};
@@ -37,7 +38,7 @@ static inline struct jset_entry *__jset_entry_type_next(struct jset *jset,
for_each_jset_entry_type(entry, jset, BCH_JSET_ENTRY_btree_keys) \
vstruct_for_each_safe(entry, k, _n)
-int bch2_journal_read(struct bch_fs *, struct list_head *);
+int bch2_journal_read(struct bch_fs *, struct list_head *, u64 *, u64 *);
void bch2_journal_write(struct closure *);
diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c
index 18e45296e7de..b77d4e7f42d6 100644
--- a/fs/bcachefs/journal_reclaim.c
+++ b/fs/bcachefs/journal_reclaim.c
@@ -1,12 +1,18 @@
// SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h"
+#include "btree_key_cache.h"
+#include "error.h"
#include "journal.h"
#include "journal_io.h"
#include "journal_reclaim.h"
#include "replicas.h"
#include "super.h"
+#include <linux/kthread.h>
+#include <linux/sched/mm.h>
+#include <trace/events/bcachefs.h>
+
/* Free space calculations: */
static unsigned journal_space_from(struct journal_device *ja,
@@ -53,82 +59,108 @@ static void journal_set_remaining(struct journal *j, unsigned u64s_remaining)
old.v, new.v)) != old.v);
}
-static struct journal_space {
- unsigned next_entry;
- unsigned remaining;
-} __journal_space_available(struct journal *j, unsigned nr_devs_want,
- enum journal_space_from from)
+static inline unsigned get_unwritten_sectors(struct journal *j, unsigned *idx)
{
- struct bch_fs *c = container_of(j, struct bch_fs, journal);
- struct bch_dev *ca;
- unsigned sectors_next_entry = UINT_MAX;
- unsigned sectors_total = UINT_MAX;
- unsigned i, nr_devs = 0;
- unsigned unwritten_sectors = j->reservations.prev_buf_unwritten
- ? journal_prev_buf(j)->sectors
- : 0;
+ unsigned sectors = 0;
- rcu_read_lock();
- for_each_member_device_rcu(ca, c, i,
- &c->rw_devs[BCH_DATA_journal]) {
- struct journal_device *ja = &ca->journal;
- unsigned buckets_this_device, sectors_this_device;
+ while (!sectors && *idx != j->reservations.idx) {
+ sectors = j->buf[*idx].sectors;
- if (!ja->nr)
- continue;
+ *idx = (*idx + 1) & JOURNAL_BUF_MASK;
+ }
- buckets_this_device = bch2_journal_dev_buckets_available(j, ja, from);
- sectors_this_device = ja->sectors_free;
+ return sectors;
+}
- /*
- * We that we don't allocate the space for a journal entry
- * until we write it out - thus, account for it here:
- */
- if (unwritten_sectors >= sectors_this_device) {
- if (!buckets_this_device)
- continue;
+static struct journal_space
+journal_dev_space_available(struct journal *j, struct bch_dev *ca,
+ enum journal_space_from from)
+{
+ struct journal_device *ja = &ca->journal;
+ unsigned sectors, buckets, unwritten, idx = j->reservations.unwritten_idx;
- buckets_this_device--;
- sectors_this_device = ca->mi.bucket_size;
- }
+ if (from == journal_space_total)
+ return (struct journal_space) {
+ .next_entry = ca->mi.bucket_size,
+ .total = ca->mi.bucket_size * ja->nr,
+ };
- sectors_this_device -= unwritten_sectors;
+ buckets = bch2_journal_dev_buckets_available(j, ja, from);
+ sectors = ja->sectors_free;
- if (sectors_this_device < ca->mi.bucket_size &&
- buckets_this_device) {
- buckets_this_device--;
- sectors_this_device = ca->mi.bucket_size;
+ /*
+ * Note that we don't allocate the space for a journal entry
+ * until we write it out - thus, account for it here:
+ */
+ while ((unwritten = get_unwritten_sectors(j, &idx))) {
+ if (unwritten >= sectors) {
+ if (!buckets) {
+ sectors = 0;
+ break;
+ }
+
+ buckets--;
+ sectors = ca->mi.bucket_size;
}
- if (!sectors_this_device)
+ sectors -= unwritten;
+ }
+
+ if (sectors < ca->mi.bucket_size && buckets) {
+ buckets--;
+ sectors = ca->mi.bucket_size;
+ }
+
+ return (struct journal_space) {
+ .next_entry = sectors,
+ .total = sectors + buckets * ca->mi.bucket_size,
+ };
+}
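journal_dev_space_available() above has to pretend the still-unwritten journal buffers already occupy space, since buckets are only consumed when a buffer is actually written out. The following standalone model reproduces that accounting; types, names and the main() numbers are illustrative, and it assumes an unwritten buffer never exceeds one bucket:

#include <stdio.h>

struct space { unsigned next_entry, total; };

static struct space dev_space(unsigned bucket_size, unsigned buckets,
			      unsigned sectors_free,
			      const unsigned *unwritten, unsigned nr_unwritten)
{
	unsigned sectors = sectors_free, i;

	for (i = 0; i < nr_unwritten; i++) {
		unsigned u = unwritten[i];

		if (!u)
			continue;

		if (u >= sectors) {
			if (!buckets) {
				sectors = 0;
				break;
			}
			/* the unwritten buffer will finish off the open bucket */
			buckets--;
			sectors = bucket_size;
		}
		sectors -= u;
	}

	/* refill from a fresh bucket if only a small remainder is left */
	if (sectors < bucket_size && buckets) {
		buckets--;
		sectors = bucket_size;
	}

	return (struct space) {
		.next_entry	= sectors,
		.total		= sectors + buckets * bucket_size,
	};
}

int main(void)
{
	/* 512-sector buckets, 3 free buckets, 200 sectors left in the open
	 * bucket, two unwritten buffers of 300 and 100 sectors: */
	unsigned unwritten[] = { 300, 100 };
	struct space s = dev_space(512, 3, 200, unwritten, 2);

	printf("next_entry=%u total=%u\n", s.next_entry, s.total); /* 512, 1024 */
	return 0;
}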
+
+static struct journal_space __journal_space_available(struct journal *j, unsigned nr_devs_want,
+ enum journal_space_from from)
+{
+ struct bch_fs *c = container_of(j, struct bch_fs, journal);
+ struct bch_dev *ca;
+ unsigned i, pos, nr_devs = 0;
+ struct journal_space space, dev_space[BCH_SB_MEMBERS_MAX];
+
+ BUG_ON(nr_devs_want > ARRAY_SIZE(dev_space));
+
+ rcu_read_lock();
+ for_each_member_device_rcu(ca, c, i,
+ &c->rw_devs[BCH_DATA_journal]) {
+ if (!ca->journal.nr)
continue;
- sectors_next_entry = min(sectors_next_entry,
- sectors_this_device);
+ space = journal_dev_space_available(j, ca, from);
+ if (!space.next_entry)
+ continue;
- sectors_total = min(sectors_total,
- buckets_this_device * ca->mi.bucket_size +
- sectors_this_device);
+ for (pos = 0; pos < nr_devs; pos++)
+ if (space.total > dev_space[pos].total)
+ break;
- nr_devs++;
+ array_insert_item(dev_space, nr_devs, pos, space);
}
rcu_read_unlock();
if (nr_devs < nr_devs_want)
return (struct journal_space) { 0, 0 };
- return (struct journal_space) {
- .next_entry = sectors_next_entry,
- .remaining = max_t(int, 0, sectors_total - sectors_next_entry),
- };
+ /*
+ * We sorted largest to smallest, and we want the smallest out of the
+ * @nr_devs_want largest devices:
+ */
+ return dev_space[nr_devs_want - 1];
}
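With per-device figures in hand, __journal_space_available() keeps them sorted largest-to-smallest and returns the nr_devs_want'th largest, since a write replicated across that many devices is limited by the smallest of them. A sketch of the selection with array_insert_item() modelled by a memmove (the fixed-size array and names are illustrative; devices with no room for a next entry are assumed to be filtered out beforehand, as in the code above):

#include <string.h>

#define MAX_DEVS 8	/* sketch only; the kernel sizes this to BCH_SB_MEMBERS_MAX */

static unsigned pick_total(const unsigned *totals, unsigned nr_devs,
			   unsigned nr_devs_want)
{
	unsigned sorted[MAX_DEVS], i, pos, nr = 0;

	for (i = 0; i < nr_devs && i < MAX_DEVS; i++) {
		/* insert into a descending-sorted array */
		for (pos = 0; pos < nr; pos++)
			if (totals[i] > sorted[pos])
				break;

		memmove(&sorted[pos + 1], &sorted[pos],
			(nr - pos) * sizeof(sorted[0]));
		sorted[pos] = totals[i];
		nr++;
	}

	return nr < nr_devs_want ? 0 : sorted[nr_devs_want - 1];
}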
void bch2_journal_space_available(struct journal *j)
{
struct bch_fs *c = container_of(j, struct bch_fs, journal);
struct bch_dev *ca;
- struct journal_space discarded, clean_ondisk, clean;
- unsigned overhead, u64s_remaining = 0;
+ unsigned clean, clean_ondisk, total;
+ s64 u64s_remaining = 0;
unsigned max_entry_size = min(j->buf[0].buf_size >> 9,
j->buf[1].buf_size >> 9);
unsigned i, nr_online = 0, nr_devs_want;
@@ -164,31 +196,53 @@ void bch2_journal_space_available(struct journal *j)
j->can_discard = can_discard;
if (nr_online < c->opts.metadata_replicas_required) {
- ret = -EROFS;
- goto out;
- }
-
- if (!fifo_free(&j->pin)) {
- ret = -ENOSPC;
+ ret = cur_entry_insufficient_devices;
goto out;
}
nr_devs_want = min_t(unsigned, nr_online, c->opts.metadata_replicas);
- discarded = __journal_space_available(j, nr_devs_want, journal_space_discarded);
- clean_ondisk = __journal_space_available(j, nr_devs_want, journal_space_clean_ondisk);
- clean = __journal_space_available(j, nr_devs_want, journal_space_clean);
+ for (i = 0; i < journal_space_nr; i++)
+ j->space[i] = __journal_space_available(j, nr_devs_want, i);
- if (!discarded.next_entry)
- ret = -ENOSPC;
+ clean_ondisk = j->space[journal_space_clean_ondisk].total;
+ clean = j->space[journal_space_clean].total;
+ total = j->space[journal_space_total].total;
- overhead = DIV_ROUND_UP(clean.remaining, max_entry_size) *
- journal_entry_overhead(j);
- u64s_remaining = clean.remaining << 6;
- u64s_remaining = max_t(int, 0, u64s_remaining - overhead);
- u64s_remaining /= 4;
+ if (!clean_ondisk &&
+ j->reservations.idx ==
+ j->reservations.unwritten_idx) {
+ char *buf = kmalloc(4096, GFP_ATOMIC);
+
+ bch_err(c, "journal stuck");
+ if (buf) {
+ __bch2_journal_debug_to_text(&_PBUF(buf, 4096), j);
+ pr_err("\n%s", buf);
+ kfree(buf);
+ }
+
+ bch2_fatal_error(c);
+ ret = cur_entry_journal_stuck;
+ } else if (!j->space[journal_space_discarded].next_entry)
+ ret = cur_entry_journal_full;
+ else if (!fifo_free(&j->pin))
+ ret = cur_entry_journal_pin_full;
+
+ if ((j->space[journal_space_clean_ondisk].next_entry <
+ j->space[journal_space_clean_ondisk].total) &&
+ (clean - clean_ondisk <= total / 8) &&
+ (clean_ondisk * 2 > clean))
+ set_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags);
+ else
+ clear_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags);
+
+ u64s_remaining = (u64) clean << 6;
+ u64s_remaining -= (u64) total << 3;
+ u64s_remaining = max(0LL, u64s_remaining);
+ u64s_remaining /= 2;
+ u64s_remaining = min_t(u64, u64s_remaining, U32_MAX);
out:
- j->cur_entry_sectors = !ret ? discarded.next_entry : 0;
+ j->cur_entry_sectors = !ret ? j->space[journal_space_discarded].next_entry : 0;
j->cur_entry_error = ret;
journal_set_remaining(j, u64s_remaining);
journal_check_may_get_unreserved(j);
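Two policies fall out of the totals computed above: journal writes may skip flushes only while the clean-but-not-yet-flushed portion stays small relative to the whole journal and at least half the clean space is already flushed, and the pre-reservable u64s budget is derived from the clean space minus a fixed fraction of the total. The same arithmetic in isolation, with illustrative names (ondisk_has_room stands for "clean-on-disk space extends beyond the next entry", and the kernel additionally clamps the result to U32_MAX):

#include <stdbool.h>

static bool may_skip_flush(unsigned clean, unsigned clean_ondisk,
			   unsigned total, bool ondisk_has_room)
{
	return ondisk_has_room &&
	       clean - clean_ondisk <= total / 8 &&
	       clean_ondisk * 2 > clean;
}

static unsigned long long remaining_u64s(unsigned clean, unsigned total)
{
	/* 64 u64s per clean 512-byte sector, minus an eighth of the whole
	 * journal held back as overhead, then halved: */
	long long r = ((long long) clean << 6) - ((long long) total << 3);

	return r > 0 ? r / 2 : 0;
}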
@@ -263,6 +317,7 @@ static void bch2_journal_reclaim_fast(struct journal *j)
while (!fifo_empty(&j->pin) &&
!atomic_read(&fifo_peek_front(&j->pin).count)) {
BUG_ON(!list_empty(&fifo_peek_front(&j->pin).list));
+ BUG_ON(!list_empty(&fifo_peek_front(&j->pin).flushed));
BUG_ON(!fifo_pop(&j->pin, temp));
popped = true;
}
@@ -271,6 +326,14 @@ static void bch2_journal_reclaim_fast(struct journal *j)
bch2_journal_space_available(j);
}
+void __bch2_journal_pin_put(struct journal *j, u64 seq)
+{
+ struct journal_entry_pin_list *pin_list = journal_seq_pin(j, seq);
+
+ if (atomic_dec_and_test(&pin_list->count))
+ bch2_journal_reclaim_fast(j);
+}
+
void bch2_journal_pin_put(struct journal *j, u64 seq)
{
struct journal_entry_pin_list *pin_list = journal_seq_pin(j, seq);
@@ -314,11 +377,14 @@ void bch2_journal_pin_drop(struct journal *j,
spin_unlock(&j->lock);
}
-static void bch2_journal_pin_add_locked(struct journal *j, u64 seq,
- struct journal_entry_pin *pin,
- journal_pin_flush_fn flush_fn)
+void bch2_journal_pin_set(struct journal *j, u64 seq,
+ struct journal_entry_pin *pin,
+ journal_pin_flush_fn flush_fn)
{
- struct journal_entry_pin_list *pin_list = journal_seq_pin(j, seq);
+ struct journal_entry_pin_list *pin_list;
+
+ spin_lock(&j->lock);
+ pin_list = journal_seq_pin(j, seq);
__journal_pin_drop(j, pin);
@@ -329,45 +395,6 @@ static void bch2_journal_pin_add_locked(struct journal *j, u64 seq,
pin->flush = flush_fn;
list_add(&pin->list, flush_fn ? &pin_list->list : &pin_list->flushed);
-}
-
-void __bch2_journal_pin_add(struct journal *j, u64 seq,
- struct journal_entry_pin *pin,
- journal_pin_flush_fn flush_fn)
-{
- spin_lock(&j->lock);
- bch2_journal_pin_add_locked(j, seq, pin, flush_fn);
- spin_unlock(&j->lock);
-
- /*
- * If the journal is currently full, we might want to call flush_fn
- * immediately:
- */
- journal_wake(j);
-}
-
-void bch2_journal_pin_update(struct journal *j, u64 seq,
- struct journal_entry_pin *pin,
- journal_pin_flush_fn flush_fn)
-{
- if (journal_pin_active(pin) && pin->seq < seq)
- return;
-
- spin_lock(&j->lock);
-
- if (pin->seq != seq) {
- bch2_journal_pin_add_locked(j, seq, pin, flush_fn);
- } else {
- struct journal_entry_pin_list *pin_list =
- journal_seq_pin(j, seq);
-
- /*
- * If the pin is already pinning the right sequence number, it
- * still might've already been flushed:
- */
- list_move(&pin->list, &pin_list->list);
- }
-
spin_unlock(&j->lock);
/*
@@ -377,20 +404,6 @@ void bch2_journal_pin_update(struct journal *j, u64 seq,
journal_wake(j);
}
-void bch2_journal_pin_copy(struct journal *j,
- struct journal_entry_pin *dst,
- struct journal_entry_pin *src,
- journal_pin_flush_fn flush_fn)
-{
- spin_lock(&j->lock);
-
- if (journal_pin_active(src) &&
- (!journal_pin_active(dst) || src->seq < dst->seq))
- bch2_journal_pin_add_locked(j, src->seq, dst, flush_fn);
-
- spin_unlock(&j->lock);
-}
-
/**
* bch2_journal_pin_flush: ensure journal pin callback is no longer running
*/
@@ -431,7 +444,6 @@ journal_get_next_pin(struct journal *j, u64 max_seq, u64 *seq)
list_move(&ret->list, &pin_list->flushed);
BUG_ON(j->flush_in_progress);
j->flush_in_progress = ret;
- j->last_flushed = jiffies;
}
spin_unlock(&j->lock);
@@ -440,17 +452,24 @@ journal_get_next_pin(struct journal *j, u64 max_seq, u64 *seq)
}
/* returns true if we did work */
-static bool journal_flush_pins(struct journal *j, u64 seq_to_flush,
- unsigned min_nr)
+static u64 journal_flush_pins(struct journal *j, u64 seq_to_flush,
+ unsigned min_nr)
{
struct journal_entry_pin *pin;
- bool ret = false;
- u64 seq;
+ u64 seq, ret = 0;
lockdep_assert_held(&j->reclaim_lock);
- while ((pin = journal_get_next_pin(j, min_nr
- ? U64_MAX : seq_to_flush, &seq))) {
+ while (1) {
+ cond_resched();
+
+ j->last_flushed = jiffies;
+
+ pin = journal_get_next_pin(j, min_nr
+ ? U64_MAX : seq_to_flush, &seq);
+ if (!pin)
+ break;
+
if (min_nr)
min_nr--;
@@ -459,7 +478,7 @@ static bool journal_flush_pins(struct journal *j, u64 seq_to_flush,
BUG_ON(j->flush_in_progress != pin);
j->flush_in_progress = NULL;
wake_up(&j->pin_flush_wait);
- ret = true;
+ ret++;
}
return ret;
@@ -523,15 +542,33 @@ static u64 journal_seq_to_flush(struct journal *j)
* 512 journal entries or 25% of all journal buckets, then
* journal_next_bucket() should not stall.
*/
-void bch2_journal_reclaim(struct journal *j)
+static int __bch2_journal_reclaim(struct journal *j, bool direct)
{
struct bch_fs *c = container_of(j, struct bch_fs, journal);
- unsigned min_nr = 0;
- u64 seq_to_flush = 0;
+ bool kthread = (current->flags & PF_KTHREAD) != 0;
+ u64 seq_to_flush, nr_flushed = 0;
+ size_t min_nr;
+ unsigned flags;
+ int ret = 0;
+ /*
+ * We can't invoke memory reclaim while holding the reclaim_lock -
+ * journal reclaim is required to make progress for memory reclaim
+ * (cleaning the caches), so we can't get stuck in memory reclaim while
+ * we're holding the reclaim lock:
+ */
lockdep_assert_held(&j->reclaim_lock);
+ flags = memalloc_noreclaim_save();
do {
+ if (kthread && kthread_should_stop())
+ break;
+
+ if (bch2_journal_error(j)) {
+ ret = -EIO;
+ break;
+ }
+
bch2_journal_do_discards(j);
seq_to_flush = journal_seq_to_flush(j);
@@ -547,21 +584,110 @@ void bch2_journal_reclaim(struct journal *j)
if (j->prereserved.reserved * 2 > j->prereserved.remaining)
min_nr = 1;
- } while (journal_flush_pins(j, seq_to_flush, min_nr));
- if (!bch2_journal_error(j))
- queue_delayed_work(c->journal_reclaim_wq, &j->reclaim_work,
- msecs_to_jiffies(j->reclaim_delay_ms));
+ if (atomic_read(&c->btree_cache.dirty) * 4 >
+ c->btree_cache.used * 3)
+ min_nr = 1;
+
+ if (fifo_free(&j->pin) <= 32)
+ min_nr = 1;
+
+ min_nr = max(min_nr, bch2_nr_btree_keys_need_flush(c));
+
+ trace_journal_reclaim_start(c,
+ min_nr,
+ j->prereserved.reserved,
+ j->prereserved.remaining,
+ atomic_read(&c->btree_cache.dirty),
+ c->btree_cache.used,
+ c->btree_key_cache.nr_dirty,
+ c->btree_key_cache.nr_keys);
+
+ nr_flushed = journal_flush_pins(j, seq_to_flush, min_nr);
+
+ if (direct)
+ j->nr_direct_reclaim += nr_flushed;
+ else
+ j->nr_background_reclaim += nr_flushed;
+ trace_journal_reclaim_finish(c, nr_flushed);
+ } while (min_nr && nr_flushed);
+
+ memalloc_noreclaim_restore(flags);
+
+ return ret;
}
-void bch2_journal_reclaim_work(struct work_struct *work)
+int bch2_journal_reclaim(struct journal *j)
{
- struct journal *j = container_of(to_delayed_work(work),
- struct journal, reclaim_work);
+ return __bch2_journal_reclaim(j, true);
+}
- mutex_lock(&j->reclaim_lock);
- bch2_journal_reclaim(j);
- mutex_unlock(&j->reclaim_lock);
+static int bch2_journal_reclaim_thread(void *arg)
+{
+ struct journal *j = arg;
+ unsigned long next;
+ int ret = 0;
+
+ set_freezable();
+
+ kthread_wait_freezable(test_bit(JOURNAL_RECLAIM_STARTED, &j->flags));
+
+ while (!ret && !kthread_should_stop()) {
+ j->reclaim_kicked = false;
+
+ mutex_lock(&j->reclaim_lock);
+ ret = __bch2_journal_reclaim(j, false);
+ mutex_unlock(&j->reclaim_lock);
+
+ next = j->last_flushed + msecs_to_jiffies(j->reclaim_delay_ms);
+
+ while (1) {
+ set_current_state(TASK_INTERRUPTIBLE);
+ if (kthread_should_stop())
+ break;
+ if (j->reclaim_kicked)
+ break;
+ if (time_after_eq(jiffies, next))
+ break;
+ schedule_timeout(next - jiffies);
+ try_to_freeze();
+
+ }
+ __set_current_state(TASK_RUNNING);
+ }
+
+ return 0;
+}
+
+void bch2_journal_reclaim_stop(struct journal *j)
+{
+ struct task_struct *p = j->reclaim_thread;
+
+ j->reclaim_thread = NULL;
+
+ if (p) {
+ kthread_stop(p);
+ put_task_struct(p);
+ }
+}
+
+int bch2_journal_reclaim_start(struct journal *j)
+{
+ struct bch_fs *c = container_of(j, struct bch_fs, journal);
+ struct task_struct *p;
+
+ if (j->reclaim_thread)
+ return 0;
+
+ p = kthread_create(bch2_journal_reclaim_thread, j,
+ "bch-reclaim/%s", c->name);
+ if (IS_ERR(p))
+ return PTR_ERR(p);
+
+ get_task_struct(p);
+ j->reclaim_thread = p;
+ wake_up_process(p);
+ return 0;
}
static int journal_flush_done(struct journal *j, u64 seq_to_flush,
@@ -575,7 +701,7 @@ static int journal_flush_done(struct journal *j, u64 seq_to_flush,
mutex_lock(&j->reclaim_lock);
- *did_work = journal_flush_pins(j, seq_to_flush, 0);
+ *did_work = journal_flush_pins(j, seq_to_flush, 0) != 0;
spin_lock(&j->lock);
/*
diff --git a/fs/bcachefs/journal_reclaim.h b/fs/bcachefs/journal_reclaim.h
index 8128907a7623..f02caa3d49ea 100644
--- a/fs/bcachefs/journal_reclaim.h
+++ b/fs/bcachefs/journal_reclaim.h
@@ -4,11 +4,16 @@
#define JOURNAL_PIN (32 * 1024)
-enum journal_space_from {
- journal_space_discarded,
- journal_space_clean_ondisk,
- journal_space_clean,
-};
+static inline void journal_reclaim_kick(struct journal *j)
+{
+ struct task_struct *p = READ_ONCE(j->reclaim_thread);
+
+ if (p && !j->reclaim_kicked) {
+ j->reclaim_kicked = true;
+ if (p)
+ wake_up_process(p);
+ }
+}
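journal_reclaim_kick() pairs with the wait loop in bch2_journal_reclaim_thread() above: the kicker sets reclaim_kicked and wakes the thread, and the thread clears the flag, runs reclaim, then sleeps until the next kick, timeout or stop request. A userspace model of just the kick/sleep handshake, with a mutex and condition variable standing in for wake_up_process()/schedule() (timeout handling and freezing are omitted; all names are illustrative):

#include <pthread.h>
#include <stdbool.h>

struct reclaim_waiter {
	pthread_mutex_t	lock;
	pthread_cond_t	cond;
	bool		kicked;
};

static void reclaim_kick(struct reclaim_waiter *w)
{
	pthread_mutex_lock(&w->lock);
	if (!w->kicked) {
		w->kicked = true;
		pthread_cond_signal(&w->cond);
	}
	pthread_mutex_unlock(&w->lock);
}

static void reclaim_wait_for_kick(struct reclaim_waiter *w)
{
	pthread_mutex_lock(&w->lock);
	while (!w->kicked)
		pthread_cond_wait(&w->cond, &w->lock);
	w->kicked = false;		/* consume the kick, then go reclaim */
	pthread_mutex_unlock(&w->lock);
}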
unsigned bch2_journal_dev_buckets_available(struct journal *,
struct journal_device *,
@@ -28,34 +33,45 @@ journal_seq_pin(struct journal *j, u64 seq)
return &j->pin.data[seq & j->pin.mask];
}
+void __bch2_journal_pin_put(struct journal *, u64);
void bch2_journal_pin_put(struct journal *, u64);
void bch2_journal_pin_drop(struct journal *, struct journal_entry_pin *);
-void __bch2_journal_pin_add(struct journal *, u64, struct journal_entry_pin *,
- journal_pin_flush_fn);
+void bch2_journal_pin_set(struct journal *, u64, struct journal_entry_pin *,
+ journal_pin_flush_fn);
static inline void bch2_journal_pin_add(struct journal *j, u64 seq,
struct journal_entry_pin *pin,
journal_pin_flush_fn flush_fn)
{
if (unlikely(!journal_pin_active(pin) || pin->seq > seq))
- __bch2_journal_pin_add(j, seq, pin, flush_fn);
+ bch2_journal_pin_set(j, seq, pin, flush_fn);
}
-void bch2_journal_pin_update(struct journal *, u64,
- struct journal_entry_pin *,
- journal_pin_flush_fn);
+static inline void bch2_journal_pin_copy(struct journal *j,
+ struct journal_entry_pin *dst,
+ struct journal_entry_pin *src,
+ journal_pin_flush_fn flush_fn)
+{
+ if (journal_pin_active(src))
+ bch2_journal_pin_add(j, src->seq, dst, flush_fn);
+}
-void bch2_journal_pin_copy(struct journal *,
- struct journal_entry_pin *,
- struct journal_entry_pin *,
- journal_pin_flush_fn);
+static inline void bch2_journal_pin_update(struct journal *j, u64 seq,
+ struct journal_entry_pin *pin,
+ journal_pin_flush_fn flush_fn)
+{
+ if (unlikely(!journal_pin_active(pin) || pin->seq < seq))
+ bch2_journal_pin_set(j, seq, pin, flush_fn);
+}
void bch2_journal_pin_flush(struct journal *, struct journal_entry_pin *);
void bch2_journal_do_discards(struct journal *);
-void bch2_journal_reclaim(struct journal *);
-void bch2_journal_reclaim_work(struct work_struct *);
+int bch2_journal_reclaim(struct journal *);
+
+void bch2_journal_reclaim_stop(struct journal *);
+int bch2_journal_reclaim_start(struct journal *);
bool bch2_journal_flush_pins(struct journal *, u64);
diff --git a/fs/bcachefs/journal_seq_blacklist.c b/fs/bcachefs/journal_seq_blacklist.c
index d0f1bbf8f6a7..e1b63f3879f4 100644
--- a/fs/bcachefs/journal_seq_blacklist.c
+++ b/fs/bcachefs/journal_seq_blacklist.c
@@ -118,7 +118,7 @@ out_write_sb:
out:
mutex_unlock(&c->sb_lock);
- return ret;
+ return ret ?: bch2_blacklist_table_initialize(c);
}
static int journal_seq_blacklist_table_cmp(const void *_l,
@@ -164,8 +164,6 @@ int bch2_blacklist_table_initialize(struct bch_fs *c)
struct journal_seq_blacklist_table *t;
unsigned i, nr = blacklist_nr_entries(bl);
- BUG_ON(c->journal_seq_blacklist_table);
-
if (!bl)
return 0;
@@ -187,6 +185,7 @@ int bch2_blacklist_table_initialize(struct bch_fs *c)
journal_seq_blacklist_table_cmp,
NULL);
+ kfree(c->journal_seq_blacklist_table);
c->journal_seq_blacklist_table = t;
return 0;
}
diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h
index 154b51b891d3..67ee47eb17a7 100644
--- a/fs/bcachefs/journal_types.h
+++ b/fs/bcachefs/journal_types.h
@@ -9,11 +9,13 @@
#include "super_types.h"
#include "fifo.h"
-struct journal_res;
+#define JOURNAL_BUF_BITS 2
+#define JOURNAL_BUF_NR (1U << JOURNAL_BUF_BITS)
+#define JOURNAL_BUF_MASK (JOURNAL_BUF_NR - 1)
/*
- * We put two of these in struct journal; we used them for writes to the
- * journal that are being staged or in flight.
+ * We put JOURNAL_BUF_NR of these in struct journal; we use them for writes to
+ * the journal that are being staged or in flight.
*/
struct journal_buf {
struct jset *data;
@@ -27,6 +29,8 @@ struct journal_buf {
unsigned disk_sectors; /* maximum size entry could have been, if
buf_size was bigger */
unsigned u64s_reserved;
+ bool noflush; /* write has already been kicked off, and was noflush */
+ bool must_flush; /* something wants a flush */
/* bloom filter: */
unsigned long has_inode[1024 / sizeof(unsigned long)];
};
@@ -81,10 +85,12 @@ union journal_res_state {
struct {
u64 cur_entry_offset:20,
- idx:1,
- prev_buf_unwritten:1,
- buf0_count:21,
- buf1_count:21;
+ idx:2,
+ unwritten_idx:2,
+ buf0_count:10,
+ buf1_count:10,
+ buf2_count:10,
+ buf3_count:10;
};
};
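With four buffers the per-buffer open counts shrink to 10 bits each, and a counter has to be selected by index rather than by name. A sketch of what an accessor over the packed word above has to do, written against the union and JOURNAL_BUF_MASK just defined (this is not necessarily the kernel's exact journal_state_count() implementation):

static unsigned journal_state_count_sketch(union journal_res_state s,
					   unsigned idx)
{
	switch (idx & JOURNAL_BUF_MASK) {
	case 0:  return s.buf0_count;
	case 1:  return s.buf1_count;
	case 2:  return s.buf2_count;
	default: return s.buf3_count;
	}
}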
@@ -116,6 +122,20 @@ union journal_preres_state {
#define JOURNAL_ENTRY_CLOSED_VAL (JOURNAL_ENTRY_OFFSET_MAX - 1)
#define JOURNAL_ENTRY_ERROR_VAL (JOURNAL_ENTRY_OFFSET_MAX)
+struct journal_space {
+ /* Units of 512-byte sectors: */
+ unsigned next_entry; /* How big the next journal entry can be */
+ unsigned total;
+};
+
+enum journal_space_from {
+ journal_space_discarded,
+ journal_space_clean_ondisk,
+ journal_space_clean,
+ journal_space_total,
+ journal_space_nr,
+};
+
/*
* JOURNAL_NEED_WRITE - current (pending) journal entry should be written ASAP,
* either because something's waiting on the write to complete or because it's
@@ -127,8 +147,8 @@ enum {
JOURNAL_STARTED,
JOURNAL_RECLAIM_STARTED,
JOURNAL_NEED_WRITE,
- JOURNAL_NOT_EMPTY,
JOURNAL_MAY_GET_UNRESERVED,
+ JOURNAL_MAY_SKIP_FLUSH,
};
/* Embedded in struct bch_fs */
@@ -147,7 +167,14 @@ struct journal {
* 0, or -ENOSPC if waiting on journal reclaim, or -EROFS if
* insufficient devices:
*/
- int cur_entry_error;
+ enum {
+ cur_entry_ok,
+ cur_entry_blocked,
+ cur_entry_journal_full,
+ cur_entry_journal_pin_full,
+ cur_entry_journal_stuck,
+ cur_entry_insufficient_devices,
+ } cur_entry_error;
union journal_preres_state prereserved;
@@ -160,7 +187,7 @@ struct journal {
* Two journal entries -- one is currently open for new entries, the
* other is possibly being written out.
*/
- struct journal_buf buf[2];
+ struct journal_buf buf[JOURNAL_BUF_NR];
spinlock_t lock;
@@ -180,7 +207,10 @@ struct journal {
/* seq, last_seq from the most recent journal entry successfully written */
u64 seq_ondisk;
+ u64 flushed_seq_ondisk;
u64 last_seq_ondisk;
+ u64 err_seq;
+ u64 last_empty_seq;
/*
* FIFO of journal entries whose btree updates have not yet been
@@ -203,14 +233,20 @@ struct journal {
struct journal_entry_pin_list *data;
} pin;
+ struct journal_space space[journal_space_nr];
+
u64 replay_journal_seq;
u64 replay_journal_seq_end;
struct write_point wp;
spinlock_t err_lock;
- struct delayed_work reclaim_work;
struct mutex reclaim_lock;
+ struct task_struct *reclaim_thread;
+ bool reclaim_kicked;
+ u64 nr_direct_reclaim;
+ u64 nr_background_reclaim;
+
unsigned long last_flushed;
struct journal_entry_pin *flush_in_progress;
wait_queue_head_t pin_flush_wait;
@@ -221,11 +257,15 @@ struct journal {
unsigned write_delay_ms;
unsigned reclaim_delay_ms;
+ unsigned long last_flush_write;
u64 res_get_blocked_start;
u64 need_write_time;
u64 write_start_time;
+ u64 nr_flush_writes;
+ u64 nr_noflush_writes;
+
struct time_stats *write_time;
struct time_stats *delay_time;
struct time_stats *blocked_time;
diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c
index fbeaa3b67326..6633d21f604a 100644
--- a/fs/bcachefs/move.c
+++ b/fs/bcachefs/move.c
@@ -326,12 +326,12 @@ static void move_free(struct closure *cl)
{
struct moving_io *io = container_of(cl, struct moving_io, cl);
struct moving_context *ctxt = io->write.ctxt;
+ struct bvec_iter_all iter;
struct bio_vec *bv;
- unsigned i;
bch2_disk_reservation_put(io->write.op.c, &io->write.op.res);
- bio_for_each_segment_all(bv, &io->write.op.wbio.bio, i)
+ bio_for_each_segment_all(bv, &io->write.op.wbio.bio, iter)
if (bv->bv_page)
__free_page(bv->bv_page);
diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c
index ddfda1ef8a79..2c5daed58aca 100644
--- a/fs/bcachefs/movinggc.c
+++ b/fs/bcachefs/movinggc.c
@@ -61,7 +61,7 @@ static enum data_cmd copygc_pred(struct bch_fs *c, void *arg,
copygc_heap *h = &c->copygc_heap;
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
const union bch_extent_entry *entry;
- struct extent_ptr_decoded p;
+ struct extent_ptr_decoded p = { 0 };
bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev);
@@ -345,7 +345,7 @@ int bch2_copygc_start(struct bch_fs *c)
if (bch2_fs_init_fault("copygc_start"))
return -ENOMEM;
- t = kthread_create(bch2_copygc_thread, c, "bch_copygc");
+ t = kthread_create(bch2_copygc_thread, c, "bch-copygc/%s", c->name);
if (IS_ERR(t))
return PTR_ERR(t);
diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c
index 44d2651be970..c3373c48fa81 100644
--- a/fs/bcachefs/rebalance.c
+++ b/fs/bcachefs/rebalance.c
@@ -314,7 +314,7 @@ int bch2_rebalance_start(struct bch_fs *c)
if (c->opts.nochanges)
return 0;
- p = kthread_create(bch2_rebalance_thread, c, "bch_rebalance");
+ p = kthread_create(bch2_rebalance_thread, c, "bch-rebalance/%s", c->name);
if (IS_ERR(p))
return PTR_ERR(p);
diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c
index 1745cfac6b26..1883a1faf380 100644
--- a/fs/bcachefs/recovery.c
+++ b/fs/bcachefs/recovery.c
@@ -187,7 +187,7 @@ void bch2_btree_and_journal_iter_init(struct btree_and_journal_iter *iter,
{
memset(iter, 0, sizeof(*iter));
- iter->btree = bch2_trans_get_iter(trans, id, pos, 0);
+ iter->btree = bch2_trans_get_iter(trans, id, pos, BTREE_ITER_PREFETCH);
bch2_journal_iter_init(&iter->journal, journal_keys, id, 0, pos);
}
@@ -313,7 +313,7 @@ void bch2_journal_keys_free(struct journal_keys *keys)
static struct journal_keys journal_keys_sort(struct list_head *journal_entries)
{
- struct journal_replay *p;
+ struct journal_replay *i;
struct jset_entry *entry;
struct bkey_i *k, *_n;
struct journal_keys keys = { NULL };
@@ -323,35 +323,35 @@ static struct journal_keys journal_keys_sort(struct list_head *journal_entries)
if (list_empty(journal_entries))
return keys;
- keys.journal_seq_base =
- le64_to_cpu(list_last_entry(journal_entries,
- struct journal_replay, list)->j.last_seq);
-
- list_for_each_entry(p, journal_entries, list) {
- if (le64_to_cpu(p->j.seq) < keys.journal_seq_base)
+ list_for_each_entry(i, journal_entries, list) {
+ if (i->ignore)
continue;
- for_each_jset_key(k, _n, entry, &p->j)
+ if (!keys.journal_seq_base)
+ keys.journal_seq_base = le64_to_cpu(i->j.seq);
+
+ for_each_jset_key(k, _n, entry, &i->j)
nr_keys++;
}
-
keys.d = kvmalloc(sizeof(keys.d[0]) * nr_keys, GFP_KERNEL);
if (!keys.d)
goto err;
- list_for_each_entry(p, journal_entries, list) {
- if (le64_to_cpu(p->j.seq) < keys.journal_seq_base)
+ list_for_each_entry(i, journal_entries, list) {
+ if (i->ignore)
continue;
- for_each_jset_key(k, _n, entry, &p->j)
+ BUG_ON(le64_to_cpu(i->j.seq) - keys.journal_seq_base > U32_MAX);
+
+ for_each_jset_key(k, _n, entry, &i->j)
keys.d[keys.nr++] = (struct journal_key) {
.btree_id = entry->btree_id,
.level = entry->level,
.k = k,
- .journal_seq = le64_to_cpu(p->j.seq) -
+ .journal_seq = le64_to_cpu(i->j.seq) -
keys.journal_seq_base,
- .journal_offset = k->_data - p->j._data,
+ .journal_offset = k->_data - i->j._data,
};
}
@@ -443,9 +443,6 @@ retry:
bch2_cut_back(atomic_end, split);
split_iter = bch2_trans_copy_iter(&trans, iter);
- ret = PTR_ERR_OR_ZERO(split_iter);
- if (ret)
- goto err;
/*
* It's important that we don't go through the
@@ -456,11 +453,14 @@ retry:
__bch2_btree_iter_set_pos(split_iter, split->k.p, false);
bch2_trans_update(&trans, split_iter, split,
BTREE_TRIGGER_NORUN);
+ bch2_trans_iter_put(&trans, split_iter);
bch2_btree_iter_set_pos(iter, split->k.p);
if (remark) {
- ret = bch2_trans_mark_key(&trans, bkey_i_to_s_c(split),
+ ret = bch2_trans_mark_key(&trans,
+ bkey_s_c_null,
+ bkey_i_to_s_c(split),
0, split->k.size,
BTREE_TRIGGER_INSERT);
if (ret)
@@ -469,7 +469,9 @@ retry:
} while (bkey_cmp(iter->pos, k->k.p) < 0);
if (remark) {
- ret = bch2_trans_mark_key(&trans, bkey_i_to_s_c(k),
+ ret = bch2_trans_mark_key(&trans,
+ bkey_i_to_s_c(k),
+ bkey_s_c_null,
0, -((s64) k->k.size),
BTREE_TRIGGER_OVERWRITE);
if (ret)
@@ -481,6 +483,8 @@ retry:
BTREE_INSERT_LAZY_RW|
BTREE_INSERT_JOURNAL_REPLAY);
err:
+ bch2_trans_iter_put(&trans, iter);
+
if (ret == -EINTR)
goto retry;
@@ -499,8 +503,6 @@ static int __bch2_journal_replay_key(struct btree_trans *trans,
iter = bch2_trans_get_node_iter(trans, id, k->k.p,
BTREE_MAX_DEPTH, level,
BTREE_ITER_INTENT);
- if (IS_ERR(iter))
- return PTR_ERR(iter);
/*
* iter->flags & BTREE_ITER_IS_EXTENTS triggers the update path to run
@@ -535,8 +537,7 @@ static int __bch2_alloc_replay_key(struct btree_trans *trans, struct bkey_i *k)
BTREE_ITER_CACHED|
BTREE_ITER_CACHED_NOFILL|
BTREE_ITER_INTENT);
- ret = PTR_ERR_OR_ZERO(iter) ?:
- bch2_trans_update(trans, iter, k, BTREE_TRIGGER_NORUN);
+ ret = bch2_trans_update(trans, iter, k, BTREE_TRIGGER_NORUN);
bch2_trans_iter_put(trans, iter);
return ret;
}
@@ -613,6 +614,7 @@ static int bch2_journal_replay(struct bch_fs *c,
*/
set_bit(BCH_FS_BTREE_INTERIOR_REPLAY_DONE, &c->flags);
set_bit(JOURNAL_RECLAIM_STARTED, &j->flags);
+ journal_reclaim_kick(j);
j->replay_journal_seq = seq;
@@ -645,46 +647,6 @@ err:
return ret;
}
-static bool journal_empty(struct list_head *journal)
-{
- return list_empty(journal) ||
- journal_entry_empty(&list_last_entry(journal,
- struct journal_replay, list)->j);
-}
-
-static int
-verify_journal_entries_not_blacklisted_or_missing(struct bch_fs *c,
- struct list_head *journal)
-{
- struct journal_replay *i =
- list_last_entry(journal, struct journal_replay, list);
- u64 start_seq = le64_to_cpu(i->j.last_seq);
- u64 end_seq = le64_to_cpu(i->j.seq);
- u64 seq = start_seq;
- int ret = 0;
-
- list_for_each_entry(i, journal, list) {
- if (le64_to_cpu(i->j.seq) < start_seq)
- continue;
-
- fsck_err_on(seq != le64_to_cpu(i->j.seq), c,
- "journal entries %llu-%llu missing! (replaying %llu-%llu)",
- seq, le64_to_cpu(i->j.seq) - 1,
- start_seq, end_seq);
-
- seq = le64_to_cpu(i->j.seq);
-
- fsck_err_on(bch2_journal_seq_is_blacklisted(c, seq, false), c,
- "found blacklisted journal entry %llu", seq);
-
- do {
- seq++;
- } while (bch2_journal_seq_is_blacklisted(c, seq, false));
- }
-fsck_err:
- return ret;
-}
-
/* journal replay early: */
static int journal_replay_entry_early(struct bch_fs *c,
@@ -769,6 +731,7 @@ static int journal_replay_early(struct bch_fs *c,
struct bch_sb_field_clean *clean,
struct list_head *journal)
{
+ struct journal_replay *i;
struct jset_entry *entry;
int ret;
@@ -784,18 +747,19 @@ static int journal_replay_early(struct bch_fs *c,
return ret;
}
} else {
- struct journal_replay *i =
- list_last_entry(journal, struct journal_replay, list);
+ list_for_each_entry(i, journal, list) {
+ if (i->ignore)
+ continue;
- c->bucket_clock[READ].hand = le16_to_cpu(i->j.read_clock);
- c->bucket_clock[WRITE].hand = le16_to_cpu(i->j.write_clock);
+ c->bucket_clock[READ].hand = le16_to_cpu(i->j.read_clock);
+ c->bucket_clock[WRITE].hand = le16_to_cpu(i->j.write_clock);
- list_for_each_entry(i, journal, list)
vstruct_for_each(&i->j, entry) {
ret = journal_replay_entry_early(c, entry);
if (ret)
return ret;
}
+ }
}
bch2_fs_usage_initialize(c);
@@ -844,9 +808,6 @@ static int verify_superblock_clean(struct bch_fs *c,
struct bch_sb_field_clean *clean = *cleanp;
int ret = 0;
- if (!c->sb.clean || !j)
- return 0;
-
if (mustfix_fsck_err_on(j->seq != clean->journal_seq, c,
"superblock journal seq (%llu) doesn't match journal (%llu) after clean shutdown",
le64_to_cpu(clean->journal_seq),
@@ -973,7 +934,8 @@ int bch2_fs_recovery(struct bch_fs *c)
{
const char *err = "cannot allocate memory";
struct bch_sb_field_clean *clean = NULL;
- u64 journal_seq;
+ struct jset *last_journal_entry = NULL;
+ u64 blacklist_seq, journal_seq;
bool write_sb = false, need_write_alloc = false;
int ret;
@@ -993,24 +955,38 @@ int bch2_fs_recovery(struct bch_fs *c)
set_bit(BCH_FS_REBUILD_REPLICAS, &c->flags);
}
+ ret = bch2_blacklist_table_initialize(c);
+ if (ret) {
+ bch_err(c, "error initializing blacklist table");
+ goto err;
+ }
+
if (!c->sb.clean || c->opts.fsck || c->opts.keep_journal) {
- struct jset *j;
+ struct journal_replay *i;
- ret = bch2_journal_read(c, &c->journal_entries);
+ ret = bch2_journal_read(c, &c->journal_entries,
+ &blacklist_seq, &journal_seq);
if (ret)
goto err;
- if (mustfix_fsck_err_on(c->sb.clean && !journal_empty(&c->journal_entries), c,
+ list_for_each_entry_reverse(i, &c->journal_entries, list)
+ if (!i->ignore) {
+ last_journal_entry = &i->j;
+ break;
+ }
+
+ if (mustfix_fsck_err_on(c->sb.clean &&
+ last_journal_entry &&
+ !journal_entry_empty(last_journal_entry), c,
"filesystem marked clean but journal not empty")) {
c->sb.compat &= ~(1ULL << BCH_COMPAT_FEAT_ALLOC_INFO);
SET_BCH_SB_CLEAN(c->disk_sb.sb, false);
c->sb.clean = false;
}
- if (!c->sb.clean && list_empty(&c->journal_entries)) {
- bch_err(c, "no journal entries found");
- ret = BCH_FSCK_REPAIR_IMPOSSIBLE;
- goto err;
+ if (!last_journal_entry) {
+ fsck_err_on(!c->sb.clean, c, "no journal entries found");
+ goto use_clean;
}
c->journal_keys = journal_keys_sort(&c->journal_entries);
@@ -1019,16 +995,21 @@ int bch2_fs_recovery(struct bch_fs *c)
goto err;
}
- j = &list_last_entry(&c->journal_entries,
- struct journal_replay, list)->j;
-
- ret = verify_superblock_clean(c, &clean, j);
- if (ret)
+ if (c->sb.clean && last_journal_entry) {
+ ret = verify_superblock_clean(c, &clean,
+ last_journal_entry);
+ if (ret)
+ goto err;
+ }
+ } else {
+use_clean:
+ if (!clean) {
+ bch_err(c, "no superblock clean section found");
+ ret = BCH_FSCK_REPAIR_IMPOSSIBLE;
goto err;
- journal_seq = le64_to_cpu(j->seq) + 1;
- } else {
- journal_seq = le64_to_cpu(clean->journal_seq) + 1;
+ }
+ blacklist_seq = journal_seq = le64_to_cpu(clean->journal_seq) + 1;
}
if (!c->sb.clean &&
@@ -1047,30 +1028,23 @@ int bch2_fs_recovery(struct bch_fs *c)
if (ret)
goto err;
- if (!c->sb.clean) {
+ /*
+ * After an unclean shutdown, skip the next few journal sequence
+ * numbers as they may have been referenced by btree writes that
+ * happened before their corresponding journal writes - those btree
+ * writes need to be ignored, by skipping and blacklisting the next few
+ * journal sequence numbers:
+ */
+ if (!c->sb.clean)
+ journal_seq += 8;
+
+ if (blacklist_seq != journal_seq) {
ret = bch2_journal_seq_blacklist_add(c,
- journal_seq,
- journal_seq + 4);
+ blacklist_seq, journal_seq);
if (ret) {
bch_err(c, "error creating new journal seq blacklist entry");
goto err;
}
-
- journal_seq += 4;
-
- /*
- * The superblock needs to be written before we do any btree
- * node writes: it will be in the read_write() path
- */
- }
-
- ret = bch2_blacklist_table_initialize(c);
-
- if (!list_empty(&c->journal_entries)) {
- ret = verify_journal_entries_not_blacklisted_or_missing(c,
- &c->journal_entries);
- if (ret)
- goto err;
}
ret = bch2_fs_journal_start(&c->journal, journal_seq,
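A small worked example of the blacklisting above, assuming bch2_journal_read() reported blacklist_seq == journal_seq (say both 1001, one past the newest entry read): after an unclean shutdown journal_seq is bumped by 8, so replay resumes at 1009 and the eight sequence numbers starting at blacklist_seq are blacklisted, which makes btree node writes that referenced those never-durable entries get ignored. The arithmetic as a standalone sketch (numbers are made up):

#include <stdio.h>

int main(void)
{
	unsigned long long blacklist_seq = 1001, journal_seq = 1001;
	int sb_clean = 0;		/* unclean shutdown */

	if (!sb_clean)
		journal_seq += 8;	/* skip seqs possibly referenced by btree writes */

	if (blacklist_seq != journal_seq)
		printf("blacklist seqs %llu..%llu, resume journal at %llu\n",
		       blacklist_seq, journal_seq - 1, journal_seq);
	return 0;
}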
diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c
index 91518c0d6794..00a197b65e0b 100644
--- a/fs/bcachefs/replicas.c
+++ b/fs/bcachefs/replicas.c
@@ -275,53 +275,55 @@ static void __replicas_table_update_pcpu(struct bch_fs_usage __percpu *dst_p,
static int replicas_table_update(struct bch_fs *c,
struct bch_replicas_cpu *new_r)
{
- struct bch_fs_usage __percpu *new_usage[2] = { NULL, NULL };
+ struct bch_fs_usage __percpu *new_usage[JOURNAL_BUF_NR];
struct bch_fs_usage *new_scratch = NULL;
struct bch_fs_usage __percpu *new_gc = NULL;
struct bch_fs_usage *new_base = NULL;
- unsigned bytes = sizeof(struct bch_fs_usage) +
+ unsigned i, bytes = sizeof(struct bch_fs_usage) +
sizeof(u64) * new_r->nr;
- int ret = -ENOMEM;
+ int ret = 0;
+
+ memset(new_usage, 0, sizeof(new_usage));
+
+ for (i = 0; i < ARRAY_SIZE(new_usage); i++)
+ if (!(new_usage[i] = __alloc_percpu_gfp(bytes,
+ sizeof(u64), GFP_NOIO)))
+ goto err;
if (!(new_base = kzalloc(bytes, GFP_NOIO)) ||
- !(new_usage[0] = __alloc_percpu_gfp(bytes, sizeof(u64),
- GFP_NOIO)) ||
- !(new_usage[1] = __alloc_percpu_gfp(bytes, sizeof(u64),
- GFP_NOIO)) ||
!(new_scratch = kmalloc(bytes, GFP_NOIO)) ||
(c->usage_gc &&
- !(new_gc = __alloc_percpu_gfp(bytes, sizeof(u64), GFP_NOIO)))) {
- bch_err(c, "error updating replicas table: memory allocation failure");
+ !(new_gc = __alloc_percpu_gfp(bytes, sizeof(u64), GFP_NOIO))))
goto err;
- }
+ for (i = 0; i < ARRAY_SIZE(new_usage); i++)
+ if (c->usage[i])
+ __replicas_table_update_pcpu(new_usage[i], new_r,
+ c->usage[i], &c->replicas);
if (c->usage_base)
__replicas_table_update(new_base, new_r,
c->usage_base, &c->replicas);
- if (c->usage[0])
- __replicas_table_update_pcpu(new_usage[0], new_r,
- c->usage[0], &c->replicas);
- if (c->usage[1])
- __replicas_table_update_pcpu(new_usage[1], new_r,
- c->usage[1], &c->replicas);
if (c->usage_gc)
__replicas_table_update_pcpu(new_gc, new_r,
c->usage_gc, &c->replicas);
+ for (i = 0; i < ARRAY_SIZE(new_usage); i++)
+ swap(c->usage[i], new_usage[i]);
swap(c->usage_base, new_base);
- swap(c->usage[0], new_usage[0]);
- swap(c->usage[1], new_usage[1]);
swap(c->usage_scratch, new_scratch);
swap(c->usage_gc, new_gc);
swap(c->replicas, *new_r);
- ret = 0;
-err:
+out:
free_percpu(new_gc);
kfree(new_scratch);
free_percpu(new_usage[1]);
free_percpu(new_usage[0]);
kfree(new_base);
return ret;
+err:
+ bch_err(c, "error updating replicas table: memory allocation failure");
+ ret = -ENOMEM;
+ goto out;
}
static unsigned reserve_journal_replicas(struct bch_fs *c,
@@ -496,9 +498,7 @@ int bch2_replicas_gc_end(struct bch_fs *c, int ret)
struct bch_replicas_cpu n;
if (!__replicas_has_entry(&c->replicas_gc, e) &&
- (c->usage_base->replicas[i] ||
- percpu_u64_get(&c->usage[0]->replicas[i]) ||
- percpu_u64_get(&c->usage[1]->replicas[i]))) {
+ bch2_fs_usage_read_one(c, &c->usage_base->replicas[i])) {
n = cpu_replicas_add_entry(&c->replicas_gc, e);
if (!n.entries) {
ret = -ENOSPC;
@@ -603,9 +603,7 @@ retry:
cpu_replicas_entry(&c->replicas, i);
if (e->data_type == BCH_DATA_journal ||
- c->usage_base->replicas[i] ||
- percpu_u64_get(&c->usage[0]->replicas[i]) ||
- percpu_u64_get(&c->usage[1]->replicas[i]))
+ bch2_fs_usage_read_one(c, &c->usage_base->replicas[i]))
memcpy(cpu_replicas_entry(&new, new.nr++),
e, new.entry_size);
}
diff --git a/fs/bcachefs/str_hash.h b/fs/bcachefs/str_hash.h
index dea9b7252b88..1ecf72c9487c 100644
--- a/fs/bcachefs/str_hash.h
+++ b/fs/bcachefs/str_hash.h
@@ -205,8 +205,6 @@ int bch2_hash_needs_whiteout(struct btree_trans *trans,
int ret;
iter = bch2_trans_copy_iter(trans, start);
- if (IS_ERR(iter))
- return PTR_ERR(iter);
bch2_btree_iter_next_slot(iter);
@@ -253,11 +251,8 @@ int bch2_hash_set(struct btree_trans *trans,
}
if (!slot &&
- !(flags & BCH_HASH_SET_MUST_REPLACE)) {
+ !(flags & BCH_HASH_SET_MUST_REPLACE))
slot = bch2_trans_copy_iter(trans, iter);
- if (IS_ERR(slot))
- return PTR_ERR(slot);
- }
if (k.k->type != KEY_TYPE_whiteout)
goto not_found;
diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c
index cee6cc938734..78835bd2d6bc 100644
--- a/fs/bcachefs/super-io.c
+++ b/fs/bcachefs/super-io.c
@@ -614,9 +614,6 @@ got_super:
bdev_logical_block_size(sb->bdev))
goto err;
- if (sb->mode & FMODE_WRITE)
- bdev_get_queue(sb->bdev)->backing_dev_info->capabilities
- |= BDI_CAP_STABLE_WRITES;
ret = 0;
sb->have_layout = true;
out:
@@ -636,7 +633,7 @@ static void write_super_endio(struct bio *bio)
/* XXX: return errors directly */
- if (bch2_dev_io_err_on(bio->bi_status, ca, "superblock write: %s",
+ if (bch2_dev_io_err_on(bio->bi_status, ca, "superblock write error: %s",
bch2_blk_status_to_str(bio->bi_status)))
ca->sb_write_error = 1;
@@ -995,10 +992,10 @@ bch2_journal_super_entries_add_common(struct bch_fs *c,
percpu_down_write(&c->mark_lock);
if (!journal_seq) {
- bch2_fs_usage_acc_to_base(c, 0);
- bch2_fs_usage_acc_to_base(c, 1);
+ for (i = 0; i < ARRAY_SIZE(c->usage); i++)
+ bch2_fs_usage_acc_to_base(c, i);
} else {
- bch2_fs_usage_acc_to_base(c, journal_seq & 1);
+ bch2_fs_usage_acc_to_base(c, journal_seq & JOURNAL_BUF_MASK);
}
{
diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c
index f606de540895..651fbc5d52b1 100644
--- a/fs/bcachefs/super.c
+++ b/fs/bcachefs/super.c
@@ -49,7 +49,6 @@
#include <linux/device.h>
#include <linux/genhd.h>
#include <linux/idr.h>
-#include <linux/kthread.h>
#include <linux/module.h>
#include <linux/percpu.h>
#include <linux/random.h>
@@ -149,44 +148,6 @@ struct bch_fs *bch2_uuid_to_fs(uuid_le uuid)
return c;
}
-int bch2_congested(void *data, int bdi_bits)
-{
- struct bch_fs *c = data;
- struct backing_dev_info *bdi;
- struct bch_dev *ca;
- unsigned i;
- int ret = 0;
-
- rcu_read_lock();
- if (bdi_bits & (1 << WB_sync_congested)) {
- /* Reads - check all devices: */
- for_each_readable_member(ca, c, i) {
- bdi = ca->disk_sb.bdev->bd_bdi;
-
- if (bdi_congested(bdi, bdi_bits)) {
- ret = 1;
- break;
- }
- }
- } else {
- const struct bch_devs_mask *devs =
- bch2_target_to_mask(c, c->opts.foreground_target) ?:
- &c->rw_devs[BCH_DATA_user];
-
- for_each_member_device_rcu(ca, c, i, devs) {
- bdi = ca->disk_sb.bdev->bd_bdi;
-
- if (bdi_congested(bdi, bdi_bits)) {
- ret = 1;
- break;
- }
- }
- }
- rcu_read_unlock();
-
- return ret;
-}
-
/* Filesystem RO/RW: */
/*
@@ -297,7 +258,7 @@ static void bch2_writes_disabled(struct percpu_ref *writes)
void bch2_fs_read_only(struct bch_fs *c)
{
if (!test_bit(BCH_FS_RW, &c->flags)) {
- cancel_delayed_work_sync(&c->journal.reclaim_work);
+ BUG_ON(c->journal.reclaim_thread);
return;
}
@@ -455,6 +416,12 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early)
set_bit(BCH_FS_ALLOCATOR_RUNNING, &c->flags);
+ ret = bch2_journal_reclaim_start(&c->journal);
+ if (ret) {
+ bch_err(c, "error starting journal reclaim: %i", ret);
+ return ret;
+ }
+
if (!early) {
ret = bch2_fs_read_write_late(c);
if (ret)
@@ -463,9 +430,6 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early)
percpu_ref_reinit(&c->writes);
set_bit(BCH_FS_RW, &c->flags);
-
- queue_delayed_work(c->journal_reclaim_wq,
- &c->journal.reclaim_work, 0);
return 0;
err:
__bch2_fs_read_only(c);
@@ -511,8 +475,8 @@ static void __bch2_fs_free(struct bch_fs *c)
bch2_journal_entries_free(&c->journal_entries);
percpu_free_rwsem(&c->mark_lock);
kfree(c->usage_scratch);
- free_percpu(c->usage[1]);
- free_percpu(c->usage[0]);
+ for (i = 0; i < ARRAY_SIZE(c->usage); i++)
+ free_percpu(c->usage[i]);
kfree(c->usage_base);
if (c->btree_iters_bufs)
@@ -533,8 +497,6 @@ static void __bch2_fs_free(struct bch_fs *c)
kfree(c->unused_inode_hints);
free_heap(&c->copygc_heap);
- if (c->journal_reclaim_wq)
- destroy_workqueue(c->journal_reclaim_wq);
if (c->copygc_wq)
destroy_workqueue(c->copygc_wq);
if (c->wq)
@@ -754,6 +716,8 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
bch2_fs_btree_cache_init_early(&c->btree_cache);
+ mutex_init(&c->sectors_available_lock);
+
if (percpu_init_rwsem(&c->mark_lock))
goto err;
@@ -788,8 +752,6 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 1)) ||
!(c->copygc_wq = alloc_workqueue("bcachefs_copygc",
WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 1)) ||
- !(c->journal_reclaim_wq = alloc_workqueue("bcachefs_journal_reclaim",
- WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_HIGHPRI, 1)) ||
percpu_ref_init(&c->writes, bch2_writes_disabled,
PERCPU_REF_INIT_DEAD, GFP_KERNEL) ||
mempool_init_kmalloc_pool(&c->fill_iter, 1, iter_size) ||
@@ -2056,6 +2018,7 @@ static void bcachefs_exit(void)
bch2_debug_exit();
bch2_vfs_exit();
bch2_chardev_exit();
+ bch2_btree_key_cache_exit();
if (bcachefs_kset)
kset_unregister(bcachefs_kset);
}
@@ -2065,6 +2028,7 @@ static int __init bcachefs_init(void)
bch2_bkey_pack_test();
if (!(bcachefs_kset = kset_create_and_add("bcachefs", NULL, fs_kobj)) ||
+ bch2_btree_key_cache_init() ||
bch2_chardev_init() ||
bch2_vfs_init() ||
bch2_debug_init())
diff --git a/fs/bcachefs/super.h b/fs/bcachefs/super.h
index 048ffec622af..02c81f3555c3 100644
--- a/fs/bcachefs/super.h
+++ b/fs/bcachefs/super.h
@@ -199,7 +199,6 @@ static inline struct bch_devs_mask bch2_online_devs(struct bch_fs *c)
struct bch_fs *bch2_bdev_to_fs(struct block_device *);
struct bch_fs *bch2_uuid_to_fs(uuid_le);
-int bch2_congested(void *, int);
bool bch2_dev_state_allowed(struct bch_fs *, struct bch_dev *,
enum bch_member_state, int);
diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c
index d7ad293aff4d..cc13fc258115 100644
--- a/fs/bcachefs/sysfs.c
+++ b/fs/bcachefs/sysfs.c
@@ -165,6 +165,7 @@ read_attribute(journal_debug);
read_attribute(journal_pins);
read_attribute(btree_updates);
read_attribute(dirty_btree_nodes);
+read_attribute(btree_cache);
read_attribute(btree_key_cache);
read_attribute(btree_transactions);
read_attribute(stripes_heap);
@@ -374,6 +375,11 @@ SHOW(bch2_fs)
return out.pos - buf;
}
+ if (attr == &sysfs_btree_cache) {
+ bch2_btree_cache_to_text(&out, c);
+ return out.pos - buf;
+ }
+
if (attr == &sysfs_btree_key_cache) {
bch2_btree_key_cache_to_text(&out, &c->btree_key_cache);
return out.pos - buf;
@@ -458,7 +464,7 @@ STORE(bch2_fs)
/* Debugging: */
if (attr == &sysfs_trigger_journal_flush)
- bch2_journal_meta_async(&c->journal, NULL);
+ bch2_journal_meta(&c->journal);
if (attr == &sysfs_trigger_btree_coalesce)
bch2_coalesce(c);
@@ -497,10 +503,11 @@ STORE(bch2_fs)
if (threads_str &&
!(ret = kstrtouint(threads_str, 10, &threads)) &&
!(ret = bch2_strtoull_h(nr_str, &nr)))
- bch2_btree_perf_test(c, test, nr, threads);
- else
- size = ret;
+ ret = bch2_btree_perf_test(c, test, nr, threads);
kfree(tmp);
+
+ if (ret)
+ size = ret;
}
#endif
return size;
@@ -550,6 +557,7 @@ struct attribute *bch2_fs_internal_files[] = {
&sysfs_journal_pins,
&sysfs_btree_updates,
&sysfs_dirty_btree_nodes,
+ &sysfs_btree_cache,
&sysfs_btree_key_cache,
&sysfs_btree_transactions,
&sysfs_stripes_heap,
diff --git a/fs/bcachefs/tests.c b/fs/bcachefs/tests.c
index 4dcace650416..f1d09e3ada09 100644
--- a/fs/bcachefs/tests.c
+++ b/fs/bcachefs/tests.c
@@ -26,7 +26,7 @@ static void delete_test_keys(struct bch_fs *c)
/* unit tests */
-static void test_delete(struct bch_fs *c, u64 nr)
+static int test_delete(struct bch_fs *c, u64 nr)
{
struct btree_trans trans;
struct btree_iter *iter;
@@ -41,24 +41,37 @@ static void test_delete(struct bch_fs *c, u64 nr)
BTREE_ITER_INTENT);
ret = bch2_btree_iter_traverse(iter);
- BUG_ON(ret);
+ if (ret) {
+ bch_err(c, "lookup error in test_delete: %i", ret);
+ goto err;
+ }
ret = __bch2_trans_do(&trans, NULL, NULL, 0,
bch2_trans_update(&trans, iter, &k.k_i, 0));
- BUG_ON(ret);
+ if (ret) {
+ bch_err(c, "update error in test_delete: %i", ret);
+ goto err;
+ }
pr_info("deleting once");
ret = bch2_btree_delete_at(&trans, iter, 0);
- BUG_ON(ret);
+ if (ret) {
+ bch_err(c, "delete error (first) in test_delete: %i", ret);
+ goto err;
+ }
pr_info("deleting twice");
ret = bch2_btree_delete_at(&trans, iter, 0);
- BUG_ON(ret);
-
+ if (ret) {
+ bch_err(c, "delete error (second) in test_delete: %i", ret);
+ goto err;
+ }
+err:
bch2_trans_exit(&trans);
+ return ret;
}
-static void test_delete_written(struct bch_fs *c, u64 nr)
+static int test_delete_written(struct bch_fs *c, u64 nr)
{
struct btree_trans trans;
struct btree_iter *iter;
@@ -73,27 +86,37 @@ static void test_delete_written(struct bch_fs *c, u64 nr)
BTREE_ITER_INTENT);
ret = bch2_btree_iter_traverse(iter);
- BUG_ON(ret);
+ if (ret) {
+ bch_err(c, "lookup error in test_delete_written: %i", ret);
+ goto err;
+ }
ret = __bch2_trans_do(&trans, NULL, NULL, 0,
bch2_trans_update(&trans, iter, &k.k_i, 0));
- BUG_ON(ret);
+ if (ret) {
+ bch_err(c, "update error in test_delete_written: %i", ret);
+ goto err;
+ }
bch2_journal_flush_all_pins(&c->journal);
ret = bch2_btree_delete_at(&trans, iter, 0);
- BUG_ON(ret);
-
+ if (ret) {
+ bch_err(c, "delete error in test_delete_written: %i", ret);
+ goto err;
+ }
+err:
bch2_trans_exit(&trans);
+ return ret;
}
-static void test_iterate(struct bch_fs *c, u64 nr)
+static int test_iterate(struct bch_fs *c, u64 nr)
{
struct btree_trans trans;
struct btree_iter *iter;
struct bkey_s_c k;
u64 i;
- int ret;
+ int ret = 0;
bch2_trans_init(&trans, c, 0, 0);
@@ -109,7 +132,10 @@ static void test_iterate(struct bch_fs *c, u64 nr)
ret = bch2_btree_insert(c, BTREE_ID_XATTRS, &k.k_i,
NULL, NULL, 0);
- BUG_ON(ret);
+ if (ret) {
+ bch_err(c, "insert error in test_iterate: %i", ret);
+ goto err;
+ }
}
pr_info("iterating forwards");
@@ -132,17 +158,18 @@ static void test_iterate(struct bch_fs *c, u64 nr)
BUG_ON(k.k->p.offset != --i);
BUG_ON(i);
-
+err:
bch2_trans_exit(&trans);
+ return ret;
}
-static void test_iterate_extents(struct bch_fs *c, u64 nr)
+static int test_iterate_extents(struct bch_fs *c, u64 nr)
{
struct btree_trans trans;
struct btree_iter *iter;
struct bkey_s_c k;
u64 i;
- int ret;
+ int ret = 0;
bch2_trans_init(&trans, c, 0, 0);
@@ -159,7 +186,10 @@ static void test_iterate_extents(struct bch_fs *c, u64 nr)
ret = bch2_btree_insert(c, BTREE_ID_EXTENTS, &k.k_i,
NULL, NULL, 0);
- BUG_ON(ret);
+ if (ret) {
+ bch_err(c, "insert error in test_iterate_extents: %i", ret);
+ goto err;
+ }
}
pr_info("iterating forwards");
@@ -182,17 +212,18 @@ static void test_iterate_extents(struct bch_fs *c, u64 nr)
}
BUG_ON(i);
-
+err:
bch2_trans_exit(&trans);
+ return ret;
}
-static void test_iterate_slots(struct bch_fs *c, u64 nr)
+static int test_iterate_slots(struct bch_fs *c, u64 nr)
{
struct btree_trans trans;
struct btree_iter *iter;
struct bkey_s_c k;
u64 i;
- int ret;
+ int ret = 0;
bch2_trans_init(&trans, c, 0, 0);
@@ -208,7 +239,10 @@ static void test_iterate_slots(struct bch_fs *c, u64 nr)
ret = bch2_btree_insert(c, BTREE_ID_XATTRS, &k.k_i,
NULL, NULL, 0);
- BUG_ON(ret);
+ if (ret) {
+ bch_err(c, "insert error in test_iterate_slots: %i", ret);
+ goto err;
+ }
}
pr_info("iterating forwards");
@@ -240,17 +274,18 @@ static void test_iterate_slots(struct bch_fs *c, u64 nr)
if (i == nr * 2)
break;
}
-
+err:
bch2_trans_exit(&trans);
+ return ret;
}
-static void test_iterate_slots_extents(struct bch_fs *c, u64 nr)
+static int test_iterate_slots_extents(struct bch_fs *c, u64 nr)
{
struct btree_trans trans;
struct btree_iter *iter;
struct bkey_s_c k;
u64 i;
- int ret;
+ int ret = 0;
bch2_trans_init(&trans, c, 0, 0);
@@ -267,7 +302,10 @@ static void test_iterate_slots_extents(struct bch_fs *c, u64 nr)
ret = bch2_btree_insert(c, BTREE_ID_EXTENTS, &k.k_i,
NULL, NULL, 0);
- BUG_ON(ret);
+ if (ret) {
+ bch_err(c, "insert error in test_iterate_slots_extents: %i", ret);
+ goto err;
+ }
}
pr_info("iterating forwards");
@@ -299,15 +337,16 @@ static void test_iterate_slots_extents(struct bch_fs *c, u64 nr)
if (i == nr)
break;
}
-
+err:
bch2_trans_exit(&trans);
+ return ret;
}
/*
* XXX: we really want to make sure we've got a btree with depth > 0 for these
* tests
*/
-static void test_peek_end(struct bch_fs *c, u64 nr)
+static int test_peek_end(struct bch_fs *c, u64 nr)
{
struct btree_trans trans;
struct btree_iter *iter;
@@ -324,9 +363,10 @@ static void test_peek_end(struct bch_fs *c, u64 nr)
BUG_ON(k.k);
bch2_trans_exit(&trans);
+ return 0;
}
-static void test_peek_end_extents(struct bch_fs *c, u64 nr)
+static int test_peek_end_extents(struct bch_fs *c, u64 nr)
{
struct btree_trans trans;
struct btree_iter *iter;
@@ -343,14 +383,15 @@ static void test_peek_end_extents(struct bch_fs *c, u64 nr)
BUG_ON(k.k);
bch2_trans_exit(&trans);
+ return 0;
}
/* extent unit tests */
u64 test_version;
-static void insert_test_extent(struct bch_fs *c,
- u64 start, u64 end)
+static int insert_test_extent(struct bch_fs *c,
+ u64 start, u64 end)
{
struct bkey_i_cookie k;
int ret;
@@ -364,42 +405,47 @@ static void insert_test_extent(struct bch_fs *c,
ret = bch2_btree_insert(c, BTREE_ID_EXTENTS, &k.k_i,
NULL, NULL, 0);
- BUG_ON(ret);
+ if (ret)
+ bch_err(c, "insert error in insert_test_extent: %i", ret);
+ return ret;
}
-static void __test_extent_overwrite(struct bch_fs *c,
+static int __test_extent_overwrite(struct bch_fs *c,
u64 e1_start, u64 e1_end,
u64 e2_start, u64 e2_end)
{
- insert_test_extent(c, e1_start, e1_end);
- insert_test_extent(c, e2_start, e2_end);
+ int ret;
+
+ ret = insert_test_extent(c, e1_start, e1_end) ?:
+ insert_test_extent(c, e2_start, e2_end);
delete_test_keys(c);
+ return ret;
}
-static void test_extent_overwrite_front(struct bch_fs *c, u64 nr)
+static int test_extent_overwrite_front(struct bch_fs *c, u64 nr)
{
- __test_extent_overwrite(c, 0, 64, 0, 32);
- __test_extent_overwrite(c, 8, 64, 0, 32);
+ return __test_extent_overwrite(c, 0, 64, 0, 32) ?:
+ __test_extent_overwrite(c, 8, 64, 0, 32);
}
-static void test_extent_overwrite_back(struct bch_fs *c, u64 nr)
+static int test_extent_overwrite_back(struct bch_fs *c, u64 nr)
{
- __test_extent_overwrite(c, 0, 64, 32, 64);
- __test_extent_overwrite(c, 0, 64, 32, 72);
+ return __test_extent_overwrite(c, 0, 64, 32, 64) ?:
+ __test_extent_overwrite(c, 0, 64, 32, 72);
}
-static void test_extent_overwrite_middle(struct bch_fs *c, u64 nr)
+static int test_extent_overwrite_middle(struct bch_fs *c, u64 nr)
{
- __test_extent_overwrite(c, 0, 64, 32, 40);
+ return __test_extent_overwrite(c, 0, 64, 32, 40);
}
-static void test_extent_overwrite_all(struct bch_fs *c, u64 nr)
+static int test_extent_overwrite_all(struct bch_fs *c, u64 nr)
{
- __test_extent_overwrite(c, 32, 64, 0, 64);
- __test_extent_overwrite(c, 32, 64, 0, 128);
- __test_extent_overwrite(c, 32, 64, 32, 64);
- __test_extent_overwrite(c, 32, 64, 32, 128);
+ return __test_extent_overwrite(c, 32, 64, 0, 64) ?:
+ __test_extent_overwrite(c, 32, 64, 0, 128) ?:
+ __test_extent_overwrite(c, 32, 64, 32, 64) ?:
+ __test_extent_overwrite(c, 32, 64, 32, 128);
}
/* perf tests */
@@ -415,11 +461,11 @@ static u64 test_rand(void)
return v;
}
-static void rand_insert(struct bch_fs *c, u64 nr)
+static int rand_insert(struct bch_fs *c, u64 nr)
{
struct btree_trans trans;
struct bkey_i_cookie k;
- int ret;
+ int ret = 0;
u64 i;
bch2_trans_init(&trans, c, 0, 0);
@@ -430,48 +476,63 @@ static void rand_insert(struct bch_fs *c, u64 nr)
ret = __bch2_trans_do(&trans, NULL, NULL, 0,
__bch2_btree_insert(&trans, BTREE_ID_XATTRS, &k.k_i));
-
- BUG_ON(ret);
+ if (ret) {
+ bch_err(c, "error in rand_insert: %i", ret);
+ break;
+ }
}
bch2_trans_exit(&trans);
+ return ret;
}
-static void rand_lookup(struct bch_fs *c, u64 nr)
+static int rand_lookup(struct bch_fs *c, u64 nr)
{
struct btree_trans trans;
struct btree_iter *iter;
struct bkey_s_c k;
+ int ret = 0;
u64 i;
bch2_trans_init(&trans, c, 0, 0);
+ iter = bch2_trans_get_iter(&trans, BTREE_ID_XATTRS, POS_MIN, 0);
for (i = 0; i < nr; i++) {
- iter = bch2_trans_get_iter(&trans, BTREE_ID_XATTRS,
- POS(0, test_rand()), 0);
+ bch2_btree_iter_set_pos(iter, POS(0, test_rand()));
k = bch2_btree_iter_peek(iter);
- bch2_trans_iter_free(&trans, iter);
+ ret = bkey_err(k);
+ if (ret) {
+ bch_err(c, "error in rand_lookup: %i", ret);
+ break;
+ }
}
+ bch2_trans_iter_free(&trans, iter);
bch2_trans_exit(&trans);
+ return ret;
}
-static void rand_mixed(struct bch_fs *c, u64 nr)
+static int rand_mixed(struct bch_fs *c, u64 nr)
{
struct btree_trans trans;
struct btree_iter *iter;
struct bkey_s_c k;
- int ret;
+ int ret = 0;
u64 i;
bch2_trans_init(&trans, c, 0, 0);
+ iter = bch2_trans_get_iter(&trans, BTREE_ID_XATTRS, POS_MIN, 0);
for (i = 0; i < nr; i++) {
- iter = bch2_trans_get_iter(&trans, BTREE_ID_XATTRS,
- POS(0, test_rand()), 0);
+ bch2_btree_iter_set_pos(iter, POS(0, test_rand()));
k = bch2_btree_iter_peek(iter);
+ ret = bkey_err(k);
+ if (ret) {
+ bch_err(c, "lookup error in rand_mixed: %i", ret);
+ break;
+ }
if (!(i & 3) && k.k) {
struct bkey_i_cookie k;
@@ -481,14 +542,16 @@ static void rand_mixed(struct bch_fs *c, u64 nr)
ret = __bch2_trans_do(&trans, NULL, NULL, 0,
bch2_trans_update(&trans, iter, &k.k_i, 0));
-
- BUG_ON(ret);
+ if (ret) {
+ bch_err(c, "update error in rand_mixed: %i", ret);
+ break;
+ }
}
-
- bch2_trans_iter_free(&trans, iter);
}
+ bch2_trans_iter_free(&trans, iter);
bch2_trans_exit(&trans);
+ return ret;
}
static int __do_delete(struct btree_trans *trans, struct bpos pos)
@@ -500,15 +563,14 @@ static int __do_delete(struct btree_trans *trans, struct bpos pos)
iter = bch2_trans_get_iter(trans, BTREE_ID_XATTRS, pos,
BTREE_ITER_INTENT);
- ret = PTR_ERR_OR_ZERO(iter);
- if (ret)
- goto err;
-
k = bch2_btree_iter_peek(iter);
ret = bkey_err(k);
if (ret)
goto err;
+ if (!k.k)
+ goto err;
+
bkey_init(&delete.k);
delete.k.p = k.k->p;
@@ -518,10 +580,10 @@ err:
return ret;
}
-static void rand_delete(struct bch_fs *c, u64 nr)
+static int rand_delete(struct bch_fs *c, u64 nr)
{
struct btree_trans trans;
- int ret;
+ int ret = 0;
u64 i;
bch2_trans_init(&trans, c, 0, 0);
@@ -531,19 +593,23 @@ static void rand_delete(struct bch_fs *c, u64 nr)
ret = __bch2_trans_do(&trans, NULL, NULL, 0,
__do_delete(&trans, pos));
- BUG_ON(ret);
+ if (ret) {
+ bch_err(c, "error in rand_delete: %i", ret);
+ break;
+ }
}
bch2_trans_exit(&trans);
+ return ret;
}
-static void seq_insert(struct bch_fs *c, u64 nr)
+static int seq_insert(struct bch_fs *c, u64 nr)
{
struct btree_trans trans;
struct btree_iter *iter;
struct bkey_s_c k;
struct bkey_i_cookie insert;
- int ret;
+ int ret = 0;
u64 i = 0;
bkey_cookie_init(&insert.k_i);
@@ -556,35 +622,39 @@ static void seq_insert(struct bch_fs *c, u64 nr)
ret = __bch2_trans_do(&trans, NULL, NULL, 0,
bch2_trans_update(&trans, iter, &insert.k_i, 0));
-
- BUG_ON(ret);
+ if (ret) {
+ bch_err(c, "error in seq_insert: %i", ret);
+ break;
+ }
if (++i == nr)
break;
}
bch2_trans_exit(&trans);
+ return ret;
}
-static void seq_lookup(struct bch_fs *c, u64 nr)
+static int seq_lookup(struct bch_fs *c, u64 nr)
{
struct btree_trans trans;
struct btree_iter *iter;
struct bkey_s_c k;
- int ret;
+ int ret = 0;
bch2_trans_init(&trans, c, 0, 0);
for_each_btree_key(&trans, iter, BTREE_ID_XATTRS, POS_MIN, 0, k, ret)
;
bch2_trans_exit(&trans);
+ return ret;
}
-static void seq_overwrite(struct bch_fs *c, u64 nr)
+static int seq_overwrite(struct bch_fs *c, u64 nr)
{
struct btree_trans trans;
struct btree_iter *iter;
struct bkey_s_c k;
- int ret;
+ int ret = 0;
bch2_trans_init(&trans, c, 0, 0);
@@ -596,23 +666,28 @@ static void seq_overwrite(struct bch_fs *c, u64 nr)
ret = __bch2_trans_do(&trans, NULL, NULL, 0,
bch2_trans_update(&trans, iter, &u.k_i, 0));
-
- BUG_ON(ret);
+ if (ret) {
+ bch_err(c, "error in seq_overwrite: %i", ret);
+ break;
+ }
}
bch2_trans_exit(&trans);
+ return ret;
}
-static void seq_delete(struct bch_fs *c, u64 nr)
+static int seq_delete(struct bch_fs *c, u64 nr)
{
int ret;
ret = bch2_btree_delete_range(c, BTREE_ID_XATTRS,
POS(0, 0), POS(0, U64_MAX),
NULL);
- BUG_ON(ret);
+ if (ret)
+ bch_err(c, "error in seq_delete: %i", ret);
+ return ret;
}
-typedef void (*perf_test_fn)(struct bch_fs *, u64);
+typedef int (*perf_test_fn)(struct bch_fs *, u64);
struct test_job {
struct bch_fs *c;
@@ -628,11 +703,13 @@ struct test_job {
u64 start;
u64 finish;
+ int ret;
};
static int btree_perf_test_thread(void *data)
{
struct test_job *j = data;
+ int ret;
if (atomic_dec_and_test(&j->ready)) {
wake_up(&j->ready_wait);
@@ -641,7 +718,9 @@ static int btree_perf_test_thread(void *data)
wait_event(j->ready_wait, !atomic_read(&j->ready));
}
- j->fn(j->c, j->nr / j->nr_threads);
+ ret = j->fn(j->c, j->nr / j->nr_threads);
+ if (ret)
+ j->ret = ret;
if (atomic_dec_and_test(&j->done)) {
j->finish = sched_clock();
@@ -651,8 +730,8 @@ static int btree_perf_test_thread(void *data)
return 0;
}
-void bch2_btree_perf_test(struct bch_fs *c, const char *testname,
- u64 nr, unsigned nr_threads)
+int bch2_btree_perf_test(struct bch_fs *c, const char *testname,
+ u64 nr, unsigned nr_threads)
{
struct test_job j = { .c = c, .nr = nr, .nr_threads = nr_threads };
char name_buf[20], nr_buf[20], per_sec_buf[20];
@@ -695,7 +774,7 @@ void bch2_btree_perf_test(struct bch_fs *c, const char *testname,
if (!j.fn) {
pr_err("unknown test %s", testname);
- return;
+ return -EINVAL;
}
//pr_info("running test %s:", testname);
@@ -720,6 +799,7 @@ void bch2_btree_perf_test(struct bch_fs *c, const char *testname,
time / NSEC_PER_SEC,
time * nr_threads / nr,
per_sec_buf);
+ return j.ret;
}
#endif /* CONFIG_BCACHEFS_TESTS */
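
The tests.c hunks above all apply the same two conversions: every unit and perf test now returns int instead of void, with BUG_ON(ret) replaced by bch_err() logging plus an early return so a failure propagates back to bch2_btree_perf_test(), and the lookup-style tests take a single iterator outside the loop and reposition it with bch2_btree_iter_set_pos() rather than allocating and freeing an iterator per key. A minimal sketch of both patterns, using a hypothetical helper name (the individual calls are the ones appearing in the diff):

static int lookup_many(struct bch_fs *c, u64 nr)
{
	struct btree_trans trans;
	struct btree_iter *iter;
	struct bkey_s_c k;
	int ret = 0;
	u64 i;

	bch2_trans_init(&trans, c, 0, 0);

	/* one iterator, repositioned each pass instead of get/free per key */
	iter = bch2_trans_get_iter(&trans, BTREE_ID_XATTRS, POS_MIN, 0);

	for (i = 0; i < nr; i++) {
		bch2_btree_iter_set_pos(iter, POS(0, i));

		k = bch2_btree_iter_peek(iter);
		ret = bkey_err(k);
		if (ret) {
			/* log and propagate instead of BUG_ON() */
			bch_err(c, "error in lookup_many: %i", ret);
			break;
		}
	}

	bch2_trans_iter_free(&trans, iter);
	bch2_trans_exit(&trans);
	return ret;
}

Multi-step tests chain their int-returning helpers with the gcc a ?: b extension, so the first nonzero error short-circuits the rest, as in test_extent_overwrite_front() above.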
diff --git a/fs/bcachefs/tests.h b/fs/bcachefs/tests.h
index 551d0764225e..c73b18aea7e0 100644
--- a/fs/bcachefs/tests.h
+++ b/fs/bcachefs/tests.h
@@ -6,7 +6,7 @@ struct bch_fs;
#ifdef CONFIG_BCACHEFS_TESTS
-void bch2_btree_perf_test(struct bch_fs *, const char *, u64, unsigned);
+int bch2_btree_perf_test(struct bch_fs *, const char *, u64, unsigned);
#else
diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h
index e8a7df61ff5c..6e5335440b4b 100644
--- a/fs/bcachefs/util.h
+++ b/fs/bcachefs/util.h
@@ -88,7 +88,7 @@ static inline void *vpmalloc(size_t size, gfp_t gfp_mask)
{
return (void *) __get_free_pages(gfp_mask|__GFP_NOWARN,
get_order(size)) ?:
- __vmalloc(size, gfp_mask, PAGE_KERNEL);
+ __vmalloc(size, gfp_mask);
}
static inline void kvpfree(void *p, size_t size)
@@ -653,35 +653,6 @@ static inline void memset_u64s_tail(void *s, int c, unsigned bytes)
memset(s + bytes, c, rem);
}
-static inline struct bio_vec next_contig_bvec(struct bio *bio,
- struct bvec_iter *iter)
-{
- struct bio_vec bv = bio_iter_iovec(bio, *iter);
-
- bio_advance_iter(bio, iter, bv.bv_len);
-#ifndef CONFIG_HIGHMEM
- while (iter->bi_size) {
- struct bio_vec next = bio_iter_iovec(bio, *iter);
-
- if (page_address(bv.bv_page) + bv.bv_offset + bv.bv_len !=
- page_address(next.bv_page) + next.bv_offset)
- break;
-
- bv.bv_len += next.bv_len;
- bio_advance_iter(bio, iter, next.bv_len);
- }
-#endif
- return bv;
-}
-
-#define __bio_for_each_contig_segment(bv, bio, iter, start) \
- for (iter = (start); \
- (iter).bi_size && \
- ((bv = next_contig_bvec((bio), &(iter))), 1);)
-
-#define bio_for_each_contig_segment(bv, bio, iter) \
- __bio_for_each_contig_segment(bv, bio, iter, (bio)->bi_iter)
-
void sort_cmp_size(void *base, size_t num, size_t size,
int (*cmp_func)(const void *, const void *, size_t),
void (*swap_func)(void *, void *, size_t));
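
The util.h hunk adjusts vpmalloc() for the __vmalloc() signature that no longer takes a page-protection argument, and drops the unused contiguous-bvec iteration helpers. As a hedged illustration (the caller below is not from this patch), the vpmalloc()/kvpfree() pair is used like kmalloc()/kfree() except that the original allocation size is passed back to the free side:

/* Illustrative caller, not from this patch: vpmalloc() tries
 * __get_free_pages() first and falls back to __vmalloc() for larger
 * allocations, so the original size is handed back to kvpfree().
 */
static int with_scratch_buffer(size_t nbytes)
{
	void *buf = vpmalloc(nbytes, GFP_KERNEL);

	if (!buf)
		return -ENOMEM;

	memset(buf, 0, nbytes);		/* stand-in for real work */

	kvpfree(buf, nbytes);
	return 0;
}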
diff --git a/include/trace/events/bcachefs.h b/include/trace/events/bcachefs.h
index ba2c55559796..d4cb7a298cc2 100644
--- a/include/trace/events/bcachefs.h
+++ b/include/trace/events/bcachefs.h
@@ -121,6 +121,65 @@ DEFINE_EVENT(bio, journal_write,
TP_ARGS(bio)
);
+TRACE_EVENT(journal_reclaim_start,
+ TP_PROTO(struct bch_fs *c, u64 min_nr,
+ u64 prereserved, u64 prereserved_total,
+ u64 btree_cache_dirty, u64 btree_cache_total,
+ u64 btree_key_cache_dirty, u64 btree_key_cache_total),
+ TP_ARGS(c, min_nr, prereserved, prereserved_total,
+ btree_cache_dirty, btree_cache_total,
+ btree_key_cache_dirty, btree_key_cache_total),
+
+ TP_STRUCT__entry(
+ __array(char, uuid, 16 )
+ __field(u64, min_nr )
+ __field(u64, prereserved )
+ __field(u64, prereserved_total )
+ __field(u64, btree_cache_dirty )
+ __field(u64, btree_cache_total )
+ __field(u64, btree_key_cache_dirty )
+ __field(u64, btree_key_cache_total )
+ ),
+
+ TP_fast_assign(
+ memcpy(__entry->uuid, c->sb.user_uuid.b, 16);
+ __entry->min_nr = min_nr;
+ __entry->prereserved = prereserved;
+ __entry->prereserved_total = prereserved_total;
+ __entry->btree_cache_dirty = btree_cache_dirty;
+ __entry->btree_cache_total = btree_cache_total;
+ __entry->btree_key_cache_dirty = btree_key_cache_dirty;
+ __entry->btree_key_cache_total = btree_key_cache_total;
+ ),
+
+ TP_printk("%pU min %llu prereserved %llu/%llu btree cache %llu/%llu key cache %llu/%llu",
+ __entry->uuid,
+ __entry->min_nr,
+ __entry->prereserved,
+ __entry->prereserved_total,
+ __entry->btree_cache_dirty,
+ __entry->btree_cache_total,
+ __entry->btree_key_cache_dirty,
+ __entry->btree_key_cache_total)
+);
+
+TRACE_EVENT(journal_reclaim_finish,
+ TP_PROTO(struct bch_fs *c, u64 nr_flushed),
+ TP_ARGS(c, nr_flushed),
+
+ TP_STRUCT__entry(
+ __array(char, uuid, 16 )
+ __field(u64, nr_flushed )
+ ),
+
+ TP_fast_assign(
+ memcpy(__entry->uuid, c->sb.user_uuid.b, 16);
+ __entry->nr_flushed = nr_flushed;
+ ),
+
+ TP_printk("%pU flushed %llu", __entry->uuid, __entry->nr_flushed)
+);
+
/* bset.c: */
DEFINE_EVENT(bpos, bkey_pack_pos_fail,
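
The two TRACE_EVENTs added above follow the usual kernel tracepoint pattern: TRACE_EVENT(journal_reclaim_start, ...) and TRACE_EVENT(journal_reclaim_finish, ...) generate trace_journal_reclaim_start()/trace_journal_reclaim_finish() hooks for the reclaim path to call. A hedged sketch of a call site (the real callers live in journal_reclaim.c and are not part of this hunk):

/* Hypothetical call site for the tracepoints declared above; the
 * surrounding function is illustrative only.  TRACE_EVENT(name, ...)
 * generates a trace_name() stub that is a no-op unless the event is
 * enabled.
 */
static void journal_reclaim_one_pass(struct bch_fs *c, u64 min_nr)
{
	u64 nr_flushed = 0;

	trace_journal_reclaim_start(c, min_nr, 0, 0, 0, 0, 0, 0);

	/* ... flush dirty keys, counting them in nr_flushed ... */

	trace_journal_reclaim_finish(c, nr_flushed);
}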
@@ -513,7 +572,7 @@ TRACE_EVENT(transaction_restart_ip,
__entry->ip = ip;
),
- TP_printk("%pF %pF", (void *) __entry->caller, (void *) __entry->ip)
+ TP_printk("%ps %pS", (void *) __entry->caller, (void *) __entry->ip)
);
DECLARE_EVENT_CLASS(transaction_restart,
@@ -528,7 +587,7 @@ DECLARE_EVENT_CLASS(transaction_restart,
__entry->ip = ip;
),
- TP_printk("%pf", (void *) __entry->ip)
+ TP_printk("%ps", (void *) __entry->ip)
);
DEFINE_EVENT(transaction_restart, trans_restart_btree_node_reused,
@@ -568,7 +627,7 @@ TRACE_EVENT(trans_restart_would_deadlock,
__entry->want_iter_type = want_iter_type;
),
- TP_printk("%pF %pF because %u have %u:%u want %u:%u",
+ TP_printk("%ps %pS because %u have %u:%u want %u:%u",
(void *) __entry->trans_ip,
(void *) __entry->caller_ip,
__entry->reason,
@@ -592,7 +651,7 @@ TRACE_EVENT(trans_restart_iters_realloced,
__entry->nr = nr;
),
- TP_printk("%pf nr %u", (void *) __entry->ip, __entry->nr)
+ TP_printk("%ps nr %u", (void *) __entry->ip, __entry->nr)
);
TRACE_EVENT(trans_restart_mem_realloced,
@@ -609,7 +668,7 @@ TRACE_EVENT(trans_restart_mem_realloced,
__entry->bytes = bytes;
),
- TP_printk("%pf bytes %lu", (void *) __entry->ip, __entry->bytes)
+ TP_printk("%ps bytes %lu", (void *) __entry->ip, __entry->bytes)
);
DEFINE_EVENT(transaction_restart, trans_restart_journal_res_get,
@@ -622,6 +681,11 @@ DEFINE_EVENT(transaction_restart, trans_restart_journal_preres_get,
TP_ARGS(ip)
);
+DEFINE_EVENT(transaction_restart, trans_restart_journal_reclaim,
+ TP_PROTO(unsigned long ip),
+ TP_ARGS(ip)
+);
+
DEFINE_EVENT(transaction_restart, trans_restart_mark_replicas,
TP_PROTO(unsigned long ip),
TP_ARGS(ip)
@@ -657,11 +721,6 @@ DEFINE_EVENT(transaction_restart, trans_restart_traverse,
TP_ARGS(ip)
);
-DEFINE_EVENT(transaction_restart, trans_restart_atomic,
- TP_PROTO(unsigned long ip),
- TP_ARGS(ip)
-);
-
DECLARE_EVENT_CLASS(node_lock_fail,
TP_PROTO(unsigned level, u32 iter_seq, unsigned node, u32 node_seq),
TP_ARGS(level, iter_seq, node, node_seq),