author     Kent Overstreet <kent.overstreet@gmail.com>  2020-11-07 11:36:03 -0500
committer  Kent Overstreet <kent.overstreet@gmail.com>  2020-11-07 13:11:03 -0500
commit     76fb8739030612bd51ef2c9573d2bf849e9910d4
tree       1e8d34c2e30e39265e5d0e627b51fd03a1436cbf
parent     7d679fb274b1a05bf06894c681e7ab6393f4742f
Merge with 1d669389f7: bcachefs: use a radix tree for inum bitmap in fsck
-rw-r--r--  fs/bcachefs/Kconfig | 6
-rw-r--r--  fs/bcachefs/Makefile | 1
-rw-r--r--  fs/bcachefs/alloc_background.c | 155
-rw-r--r--  fs/bcachefs/alloc_background.h | 14
-rw-r--r--  fs/bcachefs/alloc_foreground.c | 2
-rw-r--r--  fs/bcachefs/bcachefs.h | 27
-rw-r--r--  fs/bcachefs/bcachefs_format.h | 30
-rw-r--r--  fs/bcachefs/bkey.c | 8
-rw-r--r--  fs/bcachefs/bkey.h | 48
-rw-r--r--  fs/bcachefs/bkey_methods.c | 8
-rw-r--r--  fs/bcachefs/bkey_sort.c | 10
-rw-r--r--  fs/bcachefs/bset.c | 32
-rw-r--r--  fs/bcachefs/bset.h | 27
-rw-r--r--  fs/bcachefs/btree_cache.c | 32
-rw-r--r--  fs/bcachefs/btree_cache.h | 2
-rw-r--r--  fs/bcachefs/btree_gc.c | 85
-rw-r--r--  fs/bcachefs/btree_io.c | 18
-rw-r--r--  fs/bcachefs/btree_io.h | 2
-rw-r--r--  fs/bcachefs/btree_iter.c | 199
-rw-r--r--  fs/bcachefs/btree_iter.h | 7
-rw-r--r--  fs/bcachefs/btree_key_cache.c | 13
-rw-r--r--  fs/bcachefs/btree_key_cache.h | 3
-rw-r--r--  fs/bcachefs/btree_locking.h | 8
-rw-r--r--  fs/bcachefs/btree_types.h | 16
-rw-r--r--  fs/bcachefs/btree_update_interior.c | 2
-rw-r--r--  fs/bcachefs/btree_update_leaf.c | 26
-rw-r--r--  fs/bcachefs/buckets.c | 518
-rw-r--r--  fs/bcachefs/buckets.h | 6
-rw-r--r--  fs/bcachefs/buckets_types.h | 2
-rw-r--r--  fs/bcachefs/checksum.c | 31
-rw-r--r--  fs/bcachefs/checksum.h | 6
-rw-r--r--  fs/bcachefs/compress.c | 4
-rw-r--r--  fs/bcachefs/debug.c | 2
-rw-r--r--  fs/bcachefs/debug.h | 33
-rw-r--r--  fs/bcachefs/ec.c | 38
-rw-r--r--  fs/bcachefs/ec.h | 4
-rw-r--r--  fs/bcachefs/extents.c | 20
-rw-r--r--  fs/bcachefs/extents.h | 30
-rw-r--r--  fs/bcachefs/fs-common.c | 4
-rw-r--r--  fs/bcachefs/fs-io.c | 404
-rw-r--r--  fs/bcachefs/fs-io.h | 7
-rw-r--r--  fs/bcachefs/fs.c | 289
-rw-r--r--  fs/bcachefs/fsck.c | 45
-rw-r--r--  fs/bcachefs/inode.c | 311
-rw-r--r--  fs/bcachefs/inode.h | 21
-rw-r--r--  fs/bcachefs/io.c | 43
-rw-r--r--  fs/bcachefs/io.h | 6
-rw-r--r--  fs/bcachefs/journal.c | 23
-rw-r--r--  fs/bcachefs/journal.h | 3
-rw-r--r--  fs/bcachefs/journal_reclaim.c | 82
-rw-r--r--  fs/bcachefs/move.c | 45
-rw-r--r--  fs/bcachefs/move.h | 3
-rw-r--r--  fs/bcachefs/movinggc.c | 61
-rw-r--r--  fs/bcachefs/opts.c | 7
-rw-r--r--  fs/bcachefs/opts.h | 4
-rw-r--r--  fs/bcachefs/rebalance.c | 1
-rw-r--r--  fs/bcachefs/recovery.c | 58
-rw-r--r--  fs/bcachefs/reflink.c | 74
-rw-r--r--  fs/bcachefs/reflink.h | 11
-rw-r--r--  fs/bcachefs/replicas.c | 20
-rw-r--r--  fs/bcachefs/super.c | 145
-rw-r--r--  fs/bcachefs/super.h | 3
-rw-r--r--  fs/bcachefs/sysfs.c | 19
-rw-r--r--  fs/bcachefs/util.c | 2
-rw-r--r--  fs/bcachefs/util.h | 42
-rw-r--r--  fs/bcachefs/varint.c | 42
-rw-r--r--  fs/bcachefs/varint.h | 8
-rw-r--r--  include/trace/events/bcachefs.h | 43
68 files changed, 1774 insertions(+), 1527 deletions(-)
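
The merge subject refers to fsck keeping its seen-inode-numbers bitmap in a radix tree rather than one flat allocation. A minimal sketch of that idea follows, assuming the kernel's generic radix tree API; the helper names here are hypothetical and the actual fsck.c change (not shown in this page) may differ in detail:

#include <linux/generic-radix-tree.h>
#include <linux/bitops.h>
#include <linux/gfp.h>

/* One bit per inode number, stored in radix-tree-backed words so the
 * structure stays sparse over huge, mostly-unused inum ranges: */
typedef GENRADIX(unsigned long) inum_bitmap;

static int inum_bitmap_set(inum_bitmap *b, u64 inum)
{
	unsigned long *w = genradix_ptr_alloc(b, inum / BITS_PER_LONG, GFP_KERNEL);

	if (!w)
		return -ENOMEM;
	*w |= 1UL << (inum % BITS_PER_LONG);
	return 0;
}

static bool inum_bitmap_test(inum_bitmap *b, u64 inum)
{
	unsigned long *w = genradix_ptr(b, inum / BITS_PER_LONG);

	return w && (*w & (1UL << (inum % BITS_PER_LONG)));
}

Compared with one bitmap sized for the largest inode number, only the pages that actually contain set bits get allocated, which is what makes this suitable for fsck on filesystems with sparse inode numbers.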
diff --git a/fs/bcachefs/Kconfig b/fs/bcachefs/Kconfig
index 10abddae6a80..5594af719b2a 100644
--- a/fs/bcachefs/Kconfig
+++ b/fs/bcachefs/Kconfig
@@ -20,7 +20,7 @@ config BCACHEFS_FS
select SIXLOCKS
select RAID6_PQ
select XOR_BLOCKS
- ---help---
+ help
The bcachefs filesystem - a modern, copy on write filesystem, with
support for multiple devices, compression, checksumming, etc.
@@ -37,7 +37,7 @@ config BCACHEFS_POSIX_ACL
config BCACHEFS_DEBUG
bool "bcachefs debugging"
depends on BCACHEFS_FS
- ---help---
+ help
Enables many extra debugging checks and assertions.
The resulting code will be significantly slower than normal; you
@@ -46,5 +46,5 @@ config BCACHEFS_DEBUG
config BCACHEFS_TESTS
bool "bcachefs unit and performance tests"
depends on BCACHEFS_FS
- ---help---
+ help
Include some unit and performance tests for the core btree code
diff --git a/fs/bcachefs/Makefile b/fs/bcachefs/Makefile
index d85ced62c0dd..2fbf978424ed 100644
--- a/fs/bcachefs/Makefile
+++ b/fs/bcachefs/Makefile
@@ -56,4 +56,5 @@ bcachefs-y := \
tests.o \
trace.o \
util.o \
+ varint.o \
xattr.o
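
varint.o is new to the build here; later in this commit a new_varint feature bit is added and the inode timestamp fields grow to 96 bits, which is what the variable-length integer encoding serves. For orientation only, a generic LEB128-style codec is sketched below; bcachefs's actual encoder in varint.c uses its own prefix-length format, so treat this as the general idea rather than the on-disk layout:

#include <linux/types.h>

/* Illustrative 7-bits-per-byte varint, high bit = "more bytes follow": */
static unsigned varint_encode(u8 *out, u64 v)
{
	u8 *p = out;

	do {
		u8 byte = v & 0x7f;

		v >>= 7;
		if (v)
			byte |= 0x80;
		*p++ = byte;
	} while (v);

	return p - out;
}

static unsigned varint_decode(const u8 *in, u64 *v)
{
	const u8 *p = in;
	unsigned shift = 0;

	*v = 0;
	do {
		*v |= (u64) (*p & 0x7f) << shift;
		shift += 7;
	} while (*p++ & 0x80);

	return p - in;
}

Small values, the common case for inode fields, take one or two bytes while large values still fit, which is the space saving the new inode encoding is after.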
diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c
index 9aa0b42b26b6..97508de9f721 100644
--- a/fs/bcachefs/alloc_background.c
+++ b/fs/bcachefs/alloc_background.c
@@ -209,10 +209,25 @@ void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c,
static int bch2_alloc_read_fn(struct bch_fs *c, enum btree_id id,
unsigned level, struct bkey_s_c k)
{
- if (!level)
- bch2_mark_key(c, k, 0, 0, NULL, 0,
- BTREE_TRIGGER_ALLOC_READ|
- BTREE_TRIGGER_NOATOMIC);
+ struct bch_dev *ca;
+ struct bucket *g;
+ struct bkey_alloc_unpacked u;
+
+ if (level || k.k->type != KEY_TYPE_alloc)
+ return 0;
+
+ ca = bch_dev_bkey_exists(c, k.k->p.inode);
+ g = __bucket(ca, k.k->p.offset, 0);
+ u = bch2_alloc_unpack(k);
+
+ g->_mark.gen = u.gen;
+ g->_mark.data_type = u.data_type;
+ g->_mark.dirty_sectors = u.dirty_sectors;
+ g->_mark.cached_sectors = u.cached_sectors;
+ g->io_time[READ] = u.read_time;
+ g->io_time[WRITE] = u.write_time;
+ g->oldest_gen = u.oldest_gen;
+ g->gen_valid = 1;
return 0;
}
@@ -223,8 +238,11 @@ int bch2_alloc_read(struct bch_fs *c, struct journal_keys *journal_keys)
unsigned i;
int ret = 0;
+ down_read(&c->gc_lock);
ret = bch2_btree_and_journal_walk(c, journal_keys, BTREE_ID_ALLOC,
NULL, bch2_alloc_read_fn);
+ up_read(&c->gc_lock);
+
if (ret) {
bch_err(c, "error reading alloc info: %i", ret);
return ret;
@@ -253,12 +271,6 @@ int bch2_alloc_read(struct bch_fs *c, struct journal_keys *journal_keys)
return 0;
}
-enum alloc_write_ret {
- ALLOC_WROTE,
- ALLOC_NOWROTE,
- ALLOC_END,
-};
-
static int bch2_alloc_write_key(struct btree_trans *trans,
struct btree_iter *iter,
unsigned flags)
@@ -288,26 +300,17 @@ retry:
old_u = bch2_alloc_unpack(k);
- if (iter->pos.inode >= c->sb.nr_devices ||
- !c->devs[iter->pos.inode])
- return ALLOC_END;
-
percpu_down_read(&c->mark_lock);
ca = bch_dev_bkey_exists(c, iter->pos.inode);
ba = bucket_array(ca);
- if (iter->pos.offset >= ba->nbuckets) {
- percpu_up_read(&c->mark_lock);
- return ALLOC_END;
- }
-
g = &ba->b[iter->pos.offset];
m = READ_ONCE(g->mark);
new_u = alloc_mem_to_key(g, m);
percpu_up_read(&c->mark_lock);
if (!bkey_alloc_unpacked_cmp(old_u, new_u))
- return ALLOC_NOWROTE;
+ return 0;
a = bkey_alloc_init(&alloc_key.k);
a->k.p = iter->pos;
@@ -325,50 +328,55 @@ err:
return ret;
}
-int bch2_alloc_write(struct bch_fs *c, unsigned flags, bool *wrote)
+int bch2_dev_alloc_write(struct bch_fs *c, struct bch_dev *ca, unsigned flags)
{
struct btree_trans trans;
struct btree_iter *iter;
- struct bch_dev *ca;
- unsigned i;
+ u64 first_bucket, nbuckets;
int ret = 0;
+ percpu_down_read(&c->mark_lock);
+ first_bucket = bucket_array(ca)->first_bucket;
+ nbuckets = bucket_array(ca)->nbuckets;
+ percpu_up_read(&c->mark_lock);
+
BUG_ON(BKEY_ALLOC_VAL_U64s_MAX > 8);
bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
- iter = bch2_trans_get_iter(&trans, BTREE_ID_ALLOC, POS_MIN,
+ iter = bch2_trans_get_iter(&trans, BTREE_ID_ALLOC,
+ POS(ca->dev_idx, first_bucket),
BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
- for_each_rw_member(ca, c, i) {
- unsigned first_bucket;
+ while (iter->pos.offset < nbuckets) {
+ bch2_trans_cond_resched(&trans);
- percpu_down_read(&c->mark_lock);
- first_bucket = bucket_array(ca)->first_bucket;
- percpu_up_read(&c->mark_lock);
+ ret = bch2_alloc_write_key(&trans, iter, flags);
+ if (ret)
+ break;
+ bch2_btree_iter_next_slot(iter);
+ }
- bch2_btree_iter_set_pos(iter, POS(i, first_bucket));
+ bch2_trans_exit(&trans);
- while (1) {
- bch2_trans_cond_resched(&trans);
+ return ret;
+}
- ret = bch2_alloc_write_key(&trans, iter, flags);
- if (ret < 0 || ret == ALLOC_END)
- break;
- if (ret == ALLOC_WROTE)
- *wrote = true;
- bch2_btree_iter_next_slot(iter);
- }
+int bch2_alloc_write(struct bch_fs *c, unsigned flags)
+{
+ struct bch_dev *ca;
+ unsigned i;
+ int ret = 0;
- if (ret < 0) {
+ for_each_rw_member(ca, c, i) {
+ bch2_dev_alloc_write(c, ca, flags);
+ if (ret) {
percpu_ref_put(&ca->io_ref);
break;
}
}
- bch2_trans_exit(&trans);
-
- return ret < 0 ? ret : 0;
+ return ret;
}
/* Bucket IO clocks: */
@@ -481,6 +489,53 @@ static void bch2_bucket_clock_init(struct bch_fs *c, int rw)
mutex_init(&clock->lock);
}
+int bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev,
+ size_t bucket_nr, int rw)
+{
+ struct bch_fs *c = trans->c;
+ struct bch_dev *ca = bch_dev_bkey_exists(c, dev);
+ struct btree_iter *iter;
+ struct bucket *g;
+ struct bkey_i_alloc *a;
+ struct bkey_alloc_unpacked u;
+ u16 *time;
+ int ret = 0;
+
+ iter = bch2_trans_get_iter(trans, BTREE_ID_ALLOC, POS(dev, bucket_nr),
+ BTREE_ITER_CACHED|
+ BTREE_ITER_CACHED_NOFILL|
+ BTREE_ITER_INTENT);
+ if (IS_ERR(iter))
+ return PTR_ERR(iter);
+
+ a = bch2_trans_kmalloc(trans, BKEY_ALLOC_U64s_MAX * 8);
+ ret = PTR_ERR_OR_ZERO(a);
+ if (ret)
+ goto out;
+
+ percpu_down_read(&c->mark_lock);
+ g = bucket(ca, bucket_nr);
+ u = alloc_mem_to_key(g, READ_ONCE(g->mark));
+ percpu_up_read(&c->mark_lock);
+
+ bkey_alloc_init(&a->k_i);
+ a->k.p = iter->pos;
+
+ time = rw == READ ? &u.read_time : &u.write_time;
+ if (*time == c->bucket_clock[rw].hand)
+ goto out;
+
+ *time = c->bucket_clock[rw].hand;
+
+ bch2_alloc_pack(a, u);
+
+ ret = bch2_trans_update(trans, iter, &a->k_i, 0) ?:
+ bch2_trans_commit(trans, NULL, NULL, 0);
+out:
+ bch2_trans_iter_put(trans, iter);
+ return ret;
+}
+
/* Background allocator thread: */
/*
@@ -489,8 +544,6 @@ static void bch2_bucket_clock_init(struct bch_fs *c, int rw)
* commands to the newly free buckets, then puts them on the various freelists.
*/
-#define BUCKET_GC_GEN_MAX 96U
-
/**
* wait_buckets_available - wait on reclaimable buckets
*
@@ -1259,18 +1312,6 @@ void bch2_recalc_capacity(struct bch_fs *c)
c->bucket_size_max = bucket_size_max;
- if (c->capacity) {
- bch2_io_timer_add(&c->io_clock[READ],
- &c->bucket_clock[READ].rescale);
- bch2_io_timer_add(&c->io_clock[WRITE],
- &c->bucket_clock[WRITE].rescale);
- } else {
- bch2_io_timer_del(&c->io_clock[READ],
- &c->bucket_clock[READ].rescale);
- bch2_io_timer_del(&c->io_clock[WRITE],
- &c->bucket_clock[WRITE].rescale);
- }
-
/* Wake up case someone was waiting for buckets */
closure_wake_up(&c->freelist_wait);
}
diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h
index f6b9f27f0713..d10ff56e4de1 100644
--- a/fs/bcachefs/alloc_background.h
+++ b/fs/bcachefs/alloc_background.h
@@ -13,6 +13,9 @@ struct bkey_alloc_unpacked {
#undef x
};
+/* How out of date a pointer gen is allowed to be: */
+#define BUCKET_GC_GEN_MAX 96U
+
/* returns true if not equal */
static inline bool bkey_alloc_unpacked_cmp(struct bkey_alloc_unpacked l,
struct bkey_alloc_unpacked r)
@@ -28,6 +31,8 @@ struct bkey_alloc_unpacked bch2_alloc_unpack(struct bkey_s_c);
void bch2_alloc_pack(struct bkey_i_alloc *,
const struct bkey_alloc_unpacked);
+int bch2_bucket_io_time_reset(struct btree_trans *, unsigned, size_t, int);
+
static inline struct bkey_alloc_unpacked
alloc_mem_to_key(struct bucket *g, struct bucket_mark m)
{
@@ -61,15 +66,17 @@ static inline void bch2_wake_allocator(struct bch_dev *ca)
rcu_read_lock();
p = rcu_dereference(ca->alloc_thread);
- if (p)
+ if (p) {
wake_up_process(p);
+ ca->allocator_state = ALLOCATOR_RUNNING;
+ }
rcu_read_unlock();
}
static inline void verify_not_on_freelist(struct bch_fs *c, struct bch_dev *ca,
size_t bucket)
{
- if (expensive_debug_checks(c)) {
+ if (bch2_expensive_debug_checks) {
size_t iter;
long i;
unsigned j;
@@ -91,7 +98,8 @@ void bch2_dev_allocator_quiesce(struct bch_fs *, struct bch_dev *);
void bch2_dev_allocator_stop(struct bch_dev *);
int bch2_dev_allocator_start(struct bch_dev *);
-int bch2_alloc_write(struct bch_fs *, unsigned, bool *);
+int bch2_dev_alloc_write(struct bch_fs *, struct bch_dev *, unsigned);
+int bch2_alloc_write(struct bch_fs *, unsigned);
void bch2_fs_allocator_background_init(struct bch_fs *);
#endif /* _BCACHEFS_ALLOC_BACKGROUND_H */
diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c
index 4a048828869b..7a92e3d53254 100644
--- a/fs/bcachefs/alloc_foreground.c
+++ b/fs/bcachefs/alloc_foreground.c
@@ -309,8 +309,6 @@ out:
.dev = ca->dev_idx,
};
- bucket_io_clock_reset(c, ca, bucket, READ);
- bucket_io_clock_reset(c, ca, bucket, WRITE);
spin_unlock(&ob->lock);
if (c->blocked_allocate_open_bucket) {
diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h
index 68e150fb8510..35311dbb189c 100644
--- a/fs/bcachefs/bcachefs.h
+++ b/fs/bcachefs/bcachefs.h
@@ -265,6 +265,8 @@ do { \
BCH_DEBUG_PARAM(debug_check_bkeys, \
"Run bkey_debugcheck (primarily checking GC/allocation "\
"information) when iterating over keys") \
+ BCH_DEBUG_PARAM(debug_check_btree_accounting, \
+ "Verify btree accounting for keys within a node") \
BCH_DEBUG_PARAM(verify_btree_ondisk, \
"Reread btree nodes at various points to verify the " \
"mergesort in the read path against modifications " \
@@ -295,6 +297,16 @@ do { \
#define BCH_DEBUG_PARAMS() BCH_DEBUG_PARAMS_ALWAYS()
#endif
+#define BCH_DEBUG_PARAM(name, description) extern bool bch2_##name;
+BCH_DEBUG_PARAMS()
+#undef BCH_DEBUG_PARAM
+
+#ifndef CONFIG_BCACHEFS_DEBUG
+#define BCH_DEBUG_PARAM(name, description) static const bool bch2_##name;
+BCH_DEBUG_PARAMS_DEBUG()
+#undef BCH_DEBUG_PARAM
+#endif
+
#define BCH_TIME_STATS() \
x(btree_node_mem_alloc) \
x(btree_node_split) \
@@ -491,7 +503,6 @@ enum {
BCH_FS_ERRORS_FIXED,
/* misc: */
- BCH_FS_BDEV_MOUNTED,
BCH_FS_FIXED_GENS,
BCH_FS_ALLOC_WRITTEN,
BCH_FS_REBUILD_REPLICAS,
@@ -530,6 +541,10 @@ struct journal_keys {
u64 journal_seq_base;
};
+struct btree_iter_buf {
+ struct btree_iter *iter;
+};
+
struct bch_fs {
struct closure cl;
@@ -625,6 +640,7 @@ struct bch_fs {
struct mutex btree_trans_lock;
struct list_head btree_trans_list;
mempool_t btree_iters_pool;
+ struct btree_iter_buf __percpu *btree_iters_bufs;
struct btree_key_cache btree_key_cache;
@@ -735,7 +751,7 @@ struct bch_fs {
ZSTD_parameters zstd_params;
struct crypto_shash *sha256;
- struct crypto_skcipher *chacha20;
+ struct crypto_sync_skcipher *chacha20;
struct crypto_shash *poly1305;
atomic64_t key_version;
@@ -802,7 +818,8 @@ struct bch_fs {
struct mutex verify_lock;
#endif
- u64 unused_inode_hint;
+ u64 *unused_inode_hints;
+ unsigned inode_shard_bits;
/*
* A btree node on disk could have too many bsets for an iterator to fit
@@ -827,10 +844,6 @@ struct bch_fs {
unsigned copy_gc_enabled:1;
bool promote_whole_extents;
-#define BCH_DEBUG_PARAM(name, description) bool name;
- BCH_DEBUG_PARAMS_ALL()
-#undef BCH_DEBUG_PARAM
-
struct time_stats times[BCH_TIME_STAT_NR];
};
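
The bcachefs.h hunk above turns the per-filesystem debug knobs (previously bool fields inside struct bch_fs) into global bch2_* booleans. Roughly what the X-macro declarations expand to for one debug-only parameter, using bch2_expensive_debug_checks as the example (where the variable is defined and how it is exposed at runtime is an assumption, not visible in this hunk):

#ifdef CONFIG_BCACHEFS_DEBUG
/* Declared for every param; defined once elsewhere in the module so it can
 * be flipped at runtime (assumed to be a module parameter): */
extern bool bch2_expensive_debug_checks;
#else
/* Debug-only params collapse to a constant false, so branches such as
 * if (bch2_expensive_debug_checks) are compiled away entirely: */
static const bool bch2_expensive_debug_checks;
#endif

That is why the later hunks in bset.c, btree_cache.c, btree_gc.c and btree_iter.c can drop the btree->expensive_debug_checks pointer and the per-fs accessors and test the globals directly.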
diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h
index d5a2230e403c..94b5418587e3 100644
--- a/fs/bcachefs/bcachefs_format.h
+++ b/fs/bcachefs/bcachefs_format.h
@@ -340,7 +340,8 @@ static inline void bkey_init(struct bkey *k)
x(reflink_p, 15) \
x(reflink_v, 16) \
x(inline_data, 17) \
- x(btree_ptr_v2, 18)
+ x(btree_ptr_v2, 18) \
+ x(indirect_inline_data, 19)
enum bch_bkey_type {
#define x(name, nr) KEY_TYPE_##name = nr,
@@ -668,10 +669,10 @@ struct bch_inode_generation {
} __attribute__((packed, aligned(8)));
#define BCH_INODE_FIELDS() \
- x(bi_atime, 64) \
- x(bi_ctime, 64) \
- x(bi_mtime, 64) \
- x(bi_otime, 64) \
+ x(bi_atime, 96) \
+ x(bi_ctime, 96) \
+ x(bi_mtime, 96) \
+ x(bi_otime, 96) \
x(bi_size, 64) \
x(bi_sectors, 64) \
x(bi_uid, 32) \
@@ -738,7 +739,8 @@ enum {
#define BCH_INODE_UNLINKED (1 << __BCH_INODE_UNLINKED)
LE32_BITMASK(INODE_STR_HASH, struct bch_inode, bi_flags, 20, 24);
-LE32_BITMASK(INODE_NR_FIELDS, struct bch_inode, bi_flags, 24, 32);
+LE32_BITMASK(INODE_NR_FIELDS, struct bch_inode, bi_flags, 24, 31);
+LE32_BITMASK(INODE_NEW_VARINT, struct bch_inode, bi_flags, 31, 32);
/* Dirents */
@@ -886,6 +888,12 @@ struct bch_reflink_v {
__u64 _data[0];
};
+struct bch_indirect_inline_data {
+ struct bch_val v;
+ __le64 refcount;
+ u8 data[0];
+};
+
/* Inline data */
struct bch_inline_data {
@@ -1032,7 +1040,8 @@ LE64_BITMASK(BCH_KDF_SCRYPT_P, struct bch_sb_field_crypt, kdf_flags, 32, 48);
x(journal, 2) \
x(btree, 3) \
x(user, 4) \
- x(cached, 5)
+ x(cached, 5) \
+ x(parity, 6)
enum bch_data_type {
#define x(t, n) BCH_DATA_##t,
@@ -1321,13 +1330,16 @@ LE64_BITMASK(BCH_SB_ERASURE_CODE, struct bch_sb, flags[3], 0, 16);
x(incompressible, 10) \
x(btree_ptr_v2, 11) \
x(extents_above_btree_updates, 12) \
- x(btree_updates_journalled, 13)
+ x(btree_updates_journalled, 13) \
+ x(reflink_inline_data, 14) \
+ x(new_varint, 15)
#define BCH_SB_FEATURES_ALL \
((1ULL << BCH_FEATURE_new_siphash)| \
(1ULL << BCH_FEATURE_new_extent_overwrite)| \
(1ULL << BCH_FEATURE_btree_ptr_v2)| \
- (1ULL << BCH_FEATURE_extents_above_btree_updates))
+ (1ULL << BCH_FEATURE_extents_above_btree_updates)|\
+ (1ULL << BCH_FEATURE_new_varint))\
enum bch_sb_feature {
#define x(f, n) BCH_FEATURE_##f,
diff --git a/fs/bcachefs/bkey.c b/fs/bcachefs/bkey.c
index 4d0c9129cd4a..c06d0a965be1 100644
--- a/fs/bcachefs/bkey.c
+++ b/fs/bcachefs/bkey.c
@@ -411,7 +411,7 @@ static bool bkey_packed_successor(struct bkey_packed *out,
if ((*p & mask) != mask) {
*p += 1ULL << offset;
- EBUG_ON(bkey_cmp_packed(b, out, &k) <= 0);
+ EBUG_ON(bch2_bkey_cmp_packed(b, out, &k) <= 0);
return true;
}
@@ -1054,9 +1054,9 @@ int __bch2_bkey_cmp_left_packed_format_checked(const struct btree *b,
}
__pure __flatten
-int __bch2_bkey_cmp_packed(const struct bkey_packed *l,
- const struct bkey_packed *r,
- const struct btree *b)
+int bch2_bkey_cmp_packed(const struct btree *b,
+ const struct bkey_packed *l,
+ const struct bkey_packed *r)
{
struct bkey unpacked;
diff --git a/fs/bcachefs/bkey.h b/fs/bcachefs/bkey.h
index cbcfbd26bc58..2d2c640305e2 100644
--- a/fs/bcachefs/bkey.h
+++ b/fs/bcachefs/bkey.h
@@ -67,13 +67,6 @@ static inline void set_bkey_val_bytes(struct bkey *k, unsigned bytes)
#define bkey_whiteout(_k) \
((_k)->type == KEY_TYPE_deleted || (_k)->type == KEY_TYPE_discard)
-#define bkey_packed_typecheck(_k) \
-({ \
- BUILD_BUG_ON(!type_is(_k, struct bkey *) && \
- !type_is(_k, struct bkey_packed *)); \
- type_is(_k, struct bkey_packed *); \
-})
-
enum bkey_lr_packed {
BKEY_PACKED_BOTH,
BKEY_PACKED_RIGHT,
@@ -81,9 +74,6 @@ enum bkey_lr_packed {
BKEY_PACKED_NONE,
};
-#define bkey_lr_packed_typecheck(_l, _r) \
- (!bkey_packed_typecheck(_l) + ((!bkey_packed_typecheck(_r)) << 1))
-
#define bkey_lr_packed(_l, _r) \
((_l)->format + ((_r)->format << 1))
@@ -132,9 +122,9 @@ int __bch2_bkey_cmp_left_packed_format_checked(const struct btree *,
const struct bpos *);
__pure
-int __bch2_bkey_cmp_packed(const struct bkey_packed *,
- const struct bkey_packed *,
- const struct btree *);
+int bch2_bkey_cmp_packed(const struct btree *,
+ const struct bkey_packed *,
+ const struct bkey_packed *);
__pure
int __bch2_bkey_cmp_left_packed(const struct btree *,
@@ -160,37 +150,6 @@ static inline int bkey_cmp_left_packed_byval(const struct btree *b,
return bkey_cmp_left_packed(b, l, &r);
}
-/*
- * If @_l or @_r are struct bkey * (not bkey_packed *), uses type information to
- * skip dispatching on k->format:
- */
-#define bkey_cmp_packed(_b, _l, _r) \
-({ \
- int _cmp; \
- \
- switch (bkey_lr_packed_typecheck(_l, _r)) { \
- case BKEY_PACKED_NONE: \
- _cmp = bkey_cmp(((struct bkey *) (_l))->p, \
- ((struct bkey *) (_r))->p); \
- break; \
- case BKEY_PACKED_LEFT: \
- _cmp = bkey_cmp_left_packed((_b), \
- (struct bkey_packed *) (_l), \
- &((struct bkey *) (_r))->p); \
- break; \
- case BKEY_PACKED_RIGHT: \
- _cmp = -bkey_cmp_left_packed((_b), \
- (struct bkey_packed *) (_r), \
- &((struct bkey *) (_l))->p); \
- break; \
- case BKEY_PACKED_BOTH: \
- _cmp = __bch2_bkey_cmp_packed((void *) (_l), \
- (void *) (_r), (_b)); \
- break; \
- } \
- _cmp; \
-})
-
#if 1
static __always_inline int bkey_cmp(struct bpos l, struct bpos r)
{
@@ -565,6 +524,7 @@ BKEY_VAL_ACCESSORS(reflink_p);
BKEY_VAL_ACCESSORS(reflink_v);
BKEY_VAL_ACCESSORS(inline_data);
BKEY_VAL_ACCESSORS(btree_ptr_v2);
+BKEY_VAL_ACCESSORS(indirect_inline_data);
/* byte order helpers */
diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c
index 36e0c5152b47..99b7fce2bfd3 100644
--- a/fs/bcachefs/bkey_methods.c
+++ b/fs/bcachefs/bkey_methods.c
@@ -72,7 +72,11 @@ static const char *key_type_inline_data_invalid(const struct bch_fs *c,
static void key_type_inline_data_to_text(struct printbuf *out, struct bch_fs *c,
struct bkey_s_c k)
{
- pr_buf(out, "(%zu bytes)", bkey_val_bytes(k.k));
+ struct bkey_s_c_inline_data d = bkey_s_c_to_inline_data(k);
+ unsigned datalen = bkey_inline_data_bytes(k.k);
+
+ pr_buf(out, "datalen %u: %*phN",
+ datalen, min(datalen, 32U), d.v->data);
}
#define bch2_bkey_ops_inline_data (struct bkey_ops) { \
@@ -232,7 +236,7 @@ enum merge_result bch2_bkey_merge(struct bch_fs *c,
const struct bkey_ops *ops = &bch2_bkey_ops[l.k->type];
enum merge_result ret;
- if (key_merging_disabled(c) ||
+ if (bch2_key_merging_disabled ||
!ops->key_merge ||
l.k->type != r.k->type ||
bversion_cmp(l.k->version, r.k->version) ||
diff --git a/fs/bcachefs/bkey_sort.c b/fs/bcachefs/bkey_sort.c
index 839e78d1dc35..99e0a4011fae 100644
--- a/fs/bcachefs/bkey_sort.c
+++ b/fs/bcachefs/bkey_sort.c
@@ -86,7 +86,7 @@ static inline int key_sort_fix_overlapping_cmp(struct btree *b,
struct bkey_packed *l,
struct bkey_packed *r)
{
- return bkey_cmp_packed(b, l, r) ?:
+ return bch2_bkey_cmp_packed(b, l, r) ?:
cmp_int((unsigned long) l, (unsigned long) r);
}
@@ -98,7 +98,7 @@ static inline bool should_drop_next_key(struct sort_iter *iter)
* and should be dropped.
*/
return iter->used >= 2 &&
- !bkey_cmp_packed(iter->b,
+ !bch2_bkey_cmp_packed(iter->b,
iter->data[0].k,
iter->data[1].k);
}
@@ -223,7 +223,7 @@ static inline int sort_keys_cmp(struct btree *b,
struct bkey_packed *l,
struct bkey_packed *r)
{
- return bkey_cmp_packed(b, l, r) ?:
+ return bch2_bkey_cmp_packed(b, l, r) ?:
(int) bkey_deleted(r) - (int) bkey_deleted(l) ?:
(int) l->needs_whiteout - (int) r->needs_whiteout;
}
@@ -245,7 +245,7 @@ unsigned bch2_sort_keys(struct bkey_packed *dst,
continue;
while ((next = sort_iter_peek(iter)) &&
- !bkey_cmp_packed(iter->b, in, next)) {
+ !bch2_bkey_cmp_packed(iter->b, in, next)) {
BUG_ON(in->needs_whiteout &&
next->needs_whiteout);
needs_whiteout |= in->needs_whiteout;
@@ -406,7 +406,7 @@ static inline int sort_extents_cmp(struct btree *b,
struct bkey_packed *l,
struct bkey_packed *r)
{
- return bkey_cmp_packed(b, l, r) ?:
+ return bch2_bkey_cmp_packed(b, l, r) ?:
(int) bkey_deleted(l) - (int) bkey_deleted(r);
}
diff --git a/fs/bcachefs/bset.c b/fs/bcachefs/bset.c
index f7c2841ed8a7..26716657453f 100644
--- a/fs/bcachefs/bset.c
+++ b/fs/bcachefs/bset.c
@@ -369,10 +369,10 @@ static struct bkey_float *bkey_float(const struct btree *b,
return ro_aux_tree_base(b, t)->f + idx;
}
-static void bset_aux_tree_verify(struct btree *b)
+static void bset_aux_tree_verify(const struct btree *b)
{
#ifdef CONFIG_BCACHEFS_DEBUG
- struct bset_tree *t;
+ const struct bset_tree *t;
for_each_bset(b, t) {
if (t->aux_data_offset == U16_MAX)
@@ -388,15 +388,13 @@ static void bset_aux_tree_verify(struct btree *b)
#endif
}
-void bch2_btree_keys_init(struct btree *b, bool *expensive_debug_checks)
+void bch2_btree_keys_init(struct btree *b)
{
unsigned i;
b->nsets = 0;
memset(&b->nr, 0, sizeof(b->nr));
-#ifdef CONFIG_BCACHEFS_DEBUG
- b->expensive_debug_checks = expensive_debug_checks;
-#endif
+
for (i = 0; i < MAX_BSETS; i++)
b->set[i].data_offset = U16_MAX;
@@ -522,7 +520,7 @@ static void bch2_bset_verify_rw_aux_tree(struct btree *b,
struct bkey_packed *k = btree_bkey_first(b, t);
unsigned j = 0;
- if (!btree_keys_expensive_checks(b))
+ if (!bch2_expensive_debug_checks)
return;
BUG_ON(bset_has_ro_aux_tree(t));
@@ -710,20 +708,20 @@ static void make_bfloat(struct btree *b, struct bset_tree *t,
}
/* bytes remaining - only valid for last bset: */
-static unsigned __bset_tree_capacity(struct btree *b, struct bset_tree *t)
+static unsigned __bset_tree_capacity(const struct btree *b, const struct bset_tree *t)
{
bset_aux_tree_verify(b);
return btree_aux_data_bytes(b) - t->aux_data_offset * sizeof(u64);
}
-static unsigned bset_ro_tree_capacity(struct btree *b, struct bset_tree *t)
+static unsigned bset_ro_tree_capacity(const struct btree *b, const struct bset_tree *t)
{
return __bset_tree_capacity(b, t) /
(sizeof(struct bkey_float) + sizeof(u8));
}
-static unsigned bset_rw_tree_capacity(struct btree *b, struct bset_tree *t)
+static unsigned bset_rw_tree_capacity(const struct btree *b, const struct bset_tree *t)
{
return __bset_tree_capacity(b, t) / sizeof(struct rw_aux_tree);
}
@@ -922,7 +920,7 @@ struct bkey_packed *bch2_bkey_prev_filter(struct btree *b,
k = p;
}
- if (btree_keys_expensive_checks(b)) {
+ if (bch2_expensive_debug_checks) {
BUG_ON(ret >= orig_k);
for (i = ret
@@ -1227,8 +1225,8 @@ static inline bool bkey_mantissa_bits_dropped(const struct btree *b,
__flatten
static struct bkey_packed *bset_search_tree(const struct btree *b,
- struct bset_tree *t,
- struct bpos *search,
+ const struct bset_tree *t,
+ const struct bpos *search,
const struct bkey_packed *packed_search)
{
struct ro_aux_tree *base = ro_aux_tree_base(b, t);
@@ -1345,7 +1343,7 @@ struct bkey_packed *bch2_bset_search_linear(struct btree *b,
bkey_iter_pos_cmp(b, m, search) < 0)
m = bkey_next_skip_noops(m, btree_bkey_last(b, t));
- if (btree_keys_expensive_checks(b)) {
+ if (bch2_expensive_debug_checks) {
struct bkey_packed *prev = bch2_bkey_prev_all(b, t, m);
BUG_ON(prev &&
@@ -1601,7 +1599,7 @@ static inline void __bch2_btree_node_iter_advance(struct btree_node_iter *iter,
void bch2_btree_node_iter_advance(struct btree_node_iter *iter,
struct btree *b)
{
- if (btree_keys_expensive_checks(b)) {
+ if (bch2_expensive_debug_checks) {
bch2_btree_node_iter_verify(iter, b);
bch2_btree_node_iter_next_check(iter, b);
}
@@ -1620,7 +1618,7 @@ struct bkey_packed *bch2_btree_node_iter_prev_all(struct btree_node_iter *iter,
struct bset_tree *t;
unsigned end = 0;
- if (btree_keys_expensive_checks(b))
+ if (bch2_expensive_debug_checks)
bch2_btree_node_iter_verify(iter, b);
for_each_bset(b, t) {
@@ -1656,7 +1654,7 @@ found:
iter->data[0].k = __btree_node_key_to_offset(b, prev);
iter->data[0].end = end;
- if (btree_keys_expensive_checks(b))
+ if (bch2_expensive_debug_checks)
bch2_btree_node_iter_verify(iter, b);
return prev;
}
diff --git a/fs/bcachefs/bset.h b/fs/bcachefs/bset.h
index 5921cf689105..469294cc716c 100644
--- a/fs/bcachefs/bset.h
+++ b/fs/bcachefs/bset.h
@@ -5,7 +5,7 @@
#include <linux/kernel.h>
#include <linux/types.h>
-#include "bcachefs_format.h"
+#include "bcachefs.h"
#include "bkey.h"
#include "bkey_methods.h"
#include "btree_types.h"
@@ -147,17 +147,6 @@
* first key in that range of bytes again.
*/
-extern bool bch2_expensive_debug_checks;
-
-static inline bool btree_keys_expensive_checks(const struct btree *b)
-{
-#ifdef CONFIG_BCACHEFS_DEBUG
- return bch2_expensive_debug_checks || *b->expensive_debug_checks;
-#else
- return false;
-#endif
-}
-
enum bset_aux_tree_type {
BSET_NO_AUX_TREE,
BSET_RO_AUX_TREE,
@@ -201,17 +190,17 @@ static inline enum bset_aux_tree_type bset_aux_tree_type(const struct bset_tree
#define BSET_CACHELINE 128
-static inline size_t btree_keys_cachelines(struct btree *b)
+static inline size_t btree_keys_cachelines(const struct btree *b)
{
return (1U << b->byte_order) / BSET_CACHELINE;
}
-static inline size_t btree_aux_data_bytes(struct btree *b)
+static inline size_t btree_aux_data_bytes(const struct btree *b)
{
return btree_keys_cachelines(b) * 8;
}
-static inline size_t btree_aux_data_u64s(struct btree *b)
+static inline size_t btree_aux_data_u64s(const struct btree *b)
{
return btree_aux_data_bytes(b) / sizeof(u64);
}
@@ -228,7 +217,7 @@ __bkey_unpack_key_format_checked(const struct btree *b,
compiled_unpack_fn unpack_fn = b->aux_data;
unpack_fn(dst, src);
- if (btree_keys_expensive_checks(b)) {
+ if (bch2_expensive_debug_checks) {
struct bkey dst2 = __bch2_bkey_unpack_key(&b->format, src);
BUG_ON(memcmp(dst, &dst2, sizeof(*dst)));
@@ -366,7 +355,7 @@ static inline struct bset *bset_next_set(struct btree *b,
return ((void *) i) + round_up(vstruct_bytes(i), block_bytes);
}
-void bch2_btree_keys_init(struct btree *, bool *);
+void bch2_btree_keys_init(struct btree *);
void bch2_bset_init_first(struct btree *, struct bset *);
void bch2_bset_init_next(struct bch_fs *, struct btree *,
@@ -477,7 +466,7 @@ static inline int bkey_iter_cmp(const struct btree *b,
const struct bkey_packed *l,
const struct bkey_packed *r)
{
- return bkey_cmp_packed(b, l, r)
+ return bch2_bkey_cmp_packed(b, l, r)
?: (int) bkey_deleted(r) - (int) bkey_deleted(l)
?: cmp_int(l, r);
}
@@ -654,7 +643,7 @@ static inline void bch2_verify_insert_pos(struct btree *b,
static inline void bch2_verify_btree_nr_keys(struct btree *b)
{
- if (btree_keys_expensive_checks(b))
+ if (bch2_debug_check_btree_accounting)
__bch2_verify_btree_nr_keys(b);
}
diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c
index 29a2065ad414..325a16615a06 100644
--- a/fs/bcachefs/btree_cache.c
+++ b/fs/bcachefs/btree_cache.c
@@ -81,8 +81,7 @@ static int btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp)
if (!b->data)
return -ENOMEM;
- b->aux_data = __vmalloc(btree_aux_data_bytes(b), gfp,
- PAGE_KERNEL_EXEC);
+ b->aux_data = vmalloc_exec(btree_aux_data_bytes(b), gfp);
if (!b->aux_data) {
kvpfree(b->data, btree_bytes(c));
b->data = NULL;
@@ -212,7 +211,7 @@ static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush)
* - unless btree verify mode is enabled, since it runs out of
* the post write cleanup:
*/
- if (verify_btree_ondisk(c))
+ if (bch2_verify_btree_ondisk)
bch2_btree_node_write(c, b, SIX_LOCK_intent);
else
__bch2_btree_node_write(c, b, SIX_LOCK_read);
@@ -253,9 +252,9 @@ static unsigned long bch2_btree_cache_scan(struct shrinker *shrink,
unsigned long can_free;
unsigned long touched = 0;
unsigned long freed = 0;
- unsigned i;
+ unsigned i, flags;
- if (btree_shrinker_disabled(c))
+ if (bch2_btree_shrinker_disabled)
return SHRINK_STOP;
/* Return -1 if we can't do anything right now */
@@ -264,6 +263,8 @@ static unsigned long bch2_btree_cache_scan(struct shrinker *shrink,
else if (!mutex_trylock(&bc->lock))
return -1;
+ flags = memalloc_nofs_save();
+
/*
* It's _really_ critical that we don't free too many btree nodes - we
* have to always leave ourselves a reserve. The reserve is how we
@@ -327,6 +328,7 @@ restart:
clear_btree_node_accessed(b);
}
+ memalloc_nofs_restore(flags);
mutex_unlock(&bc->lock);
out:
return (unsigned long) freed * btree_pages(c);
@@ -339,7 +341,7 @@ static unsigned long bch2_btree_cache_count(struct shrinker *shrink,
btree_cache.shrink);
struct btree_cache *bc = &c->btree_cache;
- if (btree_shrinker_disabled(c))
+ if (bch2_btree_shrinker_disabled)
return 0;
return btree_cache_can_free(bc) * btree_pages(c);
@@ -349,11 +351,13 @@ void bch2_fs_btree_cache_exit(struct bch_fs *c)
{
struct btree_cache *bc = &c->btree_cache;
struct btree *b;
- unsigned i;
+ unsigned i, flags;
if (bc->shrink.list.next)
unregister_shrinker(&bc->shrink);
+ /* vfree() can allocate memory: */
+ flags = memalloc_nofs_save();
mutex_lock(&bc->lock);
#ifdef CONFIG_BCACHEFS_DEBUG
@@ -389,6 +393,7 @@ void bch2_fs_btree_cache_exit(struct bch_fs *c)
}
mutex_unlock(&bc->lock);
+ memalloc_nofs_restore(flags);
if (bc->table_init_done)
rhashtable_destroy(&bc->table);
@@ -585,7 +590,7 @@ out:
b->sib_u64s[0] = 0;
b->sib_u64s[1] = 0;
b->whiteout_u64s = 0;
- bch2_btree_keys_init(b, &c->expensive_debug_checks);
+ bch2_btree_keys_init(b);
bch2_time_stats_update(&c->times[BCH_TIME_btree_node_mem_alloc],
start_time);
@@ -700,7 +705,8 @@ static int lock_node_check_fn(struct six_lock *lock, void *p)
*/
struct btree *bch2_btree_node_get(struct bch_fs *c, struct btree_iter *iter,
const struct bkey_i *k, unsigned level,
- enum six_lock_type lock_type)
+ enum six_lock_type lock_type,
+ unsigned long trace_ip)
{
struct btree_cache *bc = &c->btree_cache;
struct btree *b;
@@ -762,7 +768,7 @@ lock_node:
btree_node_unlock(iter, level + 1);
if (!btree_node_lock(b, k->k.p, level, iter, lock_type,
- lock_node_check_fn, (void *) k)) {
+ lock_node_check_fn, (void *) k, trace_ip)) {
if (b->hash_val != btree_ptr_hash_val(k))
goto retry;
return ERR_PTR(-EINTR);
@@ -930,7 +936,7 @@ struct btree *bch2_btree_node_get_sibling(struct bch_fs *c,
bch2_bkey_unpack(parent, &tmp.k, k);
ret = bch2_btree_node_get(c, iter, &tmp.k, level,
- SIX_LOCK_intent);
+ SIX_LOCK_intent, _THIS_IP_);
if (PTR_ERR_OR_ZERO(ret) == -EINTR && !trans->nounlock) {
struct btree_iter *linked;
@@ -943,14 +949,14 @@ struct btree *bch2_btree_node_get_sibling(struct bch_fs *c,
* holding other locks that would cause us to deadlock:
*/
trans_for_each_iter(trans, linked)
- if (btree_iter_cmp(iter, linked) < 0)
+ if (btree_iter_lock_cmp(iter, linked) < 0)
__bch2_btree_iter_unlock(linked);
if (sib == btree_prev_sib)
btree_node_unlock(iter, level);
ret = bch2_btree_node_get(c, iter, &tmp.k, level,
- SIX_LOCK_intent);
+ SIX_LOCK_intent, _THIS_IP_);
/*
* before btree_iter_relock() calls btree_iter_verify_locks():
diff --git a/fs/bcachefs/btree_cache.h b/fs/bcachefs/btree_cache.h
index d0d3a85bb8be..8a19e60e9258 100644
--- a/fs/bcachefs/btree_cache.h
+++ b/fs/bcachefs/btree_cache.h
@@ -23,7 +23,7 @@ struct btree *bch2_btree_node_mem_alloc(struct bch_fs *);
struct btree *bch2_btree_node_get(struct bch_fs *, struct btree_iter *,
const struct bkey_i *, unsigned,
- enum six_lock_type);
+ enum six_lock_type, unsigned long);
struct btree *bch2_btree_node_get_noiter(struct bch_fs *, const struct bkey_i *,
enum btree_id, unsigned);
diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c
index 4f581130270c..ba4acc112ed3 100644
--- a/fs/bcachefs/btree_gc.c
+++ b/fs/bcachefs/btree_gc.c
@@ -8,6 +8,7 @@
#include "alloc_background.h"
#include "alloc_foreground.h"
#include "bkey_methods.h"
+#include "bkey_on_stack.h"
#include "btree_locking.h"
#include "btree_update_interior.h"
#include "btree_io.h"
@@ -36,9 +37,11 @@
static inline void __gc_pos_set(struct bch_fs *c, struct gc_pos new_pos)
{
+ preempt_disable();
write_seqcount_begin(&c->gc_pos_lock);
c->gc_pos = new_pos;
write_seqcount_end(&c->gc_pos_lock);
+ preempt_enable();
}
static inline void gc_pos_set(struct bch_fs *c, struct gc_pos new_pos)
@@ -98,7 +101,7 @@ static int bch2_gc_mark_key(struct bch_fs *c, struct bkey_s_c k,
int ret = 0;
if (initial) {
- BUG_ON(journal_seq_verify(c) &&
+ BUG_ON(bch2_journal_seq_verify &&
k.k->version.lo > journal_cur_seq(&c->journal));
/* XXX change to fsck check */
@@ -206,7 +209,7 @@ static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id,
struct btree_iter *iter;
struct btree *b;
unsigned depth = metadata_only ? 1
- : expensive_debug_checks(c) ? 0
+ : bch2_expensive_debug_checks ? 0
: !btree_node_type_needs_gc(btree_id) ? 1
: 0;
u8 max_stale = 0;
@@ -233,8 +236,8 @@ static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id,
BTREE_INSERT_USE_RESERVE|
BTREE_INSERT_NOWAIT|
BTREE_INSERT_GC_LOCK_HELD);
- else if (!btree_gc_rewrite_disabled(c) &&
- (btree_gc_always_rewrite(c) || max_stale > 16))
+ else if (!bch2_btree_gc_rewrite_disabled &&
+ (bch2_btree_gc_always_rewrite || max_stale > 16))
bch2_btree_node_rewrite(c, iter,
b->data->keys.seq,
BTREE_INSERT_NOWAIT|
@@ -325,7 +328,7 @@ static int bch2_gc_btree_init(struct bch_fs *c,
{
struct btree *b;
unsigned target_depth = metadata_only ? 1
- : expensive_debug_checks(c) ? 0
+ : bch2_expensive_debug_checks ? 0
: !btree_node_type_needs_gc(btree_id) ? 1
: 0;
u8 max_stale = 0;
@@ -567,6 +570,7 @@ static int bch2_gc_done(struct bch_fs *c,
fsck_err(c, _msg ": got %llu, should be %llu" \
, ##__VA_ARGS__, dst->_f, src->_f); \
dst->_f = src->_f; \
+ ret = 1; \
}
#define copy_stripe_field(_f, _msg, ...) \
if (dst->_f != src->_f) { \
@@ -577,6 +581,7 @@ static int bch2_gc_done(struct bch_fs *c,
dst->_f, src->_f); \
dst->_f = src->_f; \
dst->dirty = true; \
+ ret = 1; \
}
#define copy_bucket_field(_f) \
if (dst->b[b].mark._f != src->b[b].mark._f) { \
@@ -587,6 +592,7 @@ static int bch2_gc_done(struct bch_fs *c,
bch2_data_types[dst->b[b].mark.data_type],\
dst->b[b].mark._f, src->b[b].mark._f); \
dst->b[b]._mark._f = src->b[b].mark._f; \
+ ret = 1; \
}
#define copy_dev_field(_f, _msg, ...) \
copy_field(_f, "dev %u has wrong " _msg, i, ##__VA_ARGS__)
@@ -829,7 +835,7 @@ again:
out:
if (!ret &&
(test_bit(BCH_FS_FIXED_GENS, &c->flags) ||
- (!iter && test_restart_gc(c)))) {
+ (!iter && bch2_test_restart_gc))) {
/*
* XXX: make sure gens we fixed got saved
*/
@@ -888,40 +894,77 @@ out:
return ret;
}
+static bool gc_btree_gens_key(struct bch_fs *c, struct bkey_s_c k)
+{
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+ const struct bch_extent_ptr *ptr;
+
+ percpu_down_read(&c->mark_lock);
+ bkey_for_each_ptr(ptrs, ptr) {
+ struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
+ struct bucket *g = PTR_BUCKET(ca, ptr, false);
+
+ if (gen_after(g->mark.gen, ptr->gen) > 16) {
+ percpu_up_read(&c->mark_lock);
+ return true;
+ }
+ }
+
+ bkey_for_each_ptr(ptrs, ptr) {
+ struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
+ struct bucket *g = PTR_BUCKET(ca, ptr, false);
+
+ if (gen_after(g->gc_gen, ptr->gen))
+ g->gc_gen = ptr->gen;
+ }
+ percpu_up_read(&c->mark_lock);
+
+ return false;
+}
+
/*
* For recalculating oldest gen, we only need to walk keys in leaf nodes; btree
* node pointers currently never have cached pointers that can become stale:
*/
-static int bch2_gc_btree_gens(struct bch_fs *c, enum btree_id id)
+static int bch2_gc_btree_gens(struct bch_fs *c, enum btree_id btree_id)
{
struct btree_trans trans;
struct btree_iter *iter;
struct bkey_s_c k;
- int ret;
+ struct bkey_on_stack sk;
+ int ret = 0;
+ bkey_on_stack_init(&sk);
bch2_trans_init(&trans, c, 0, 0);
- for_each_btree_key(&trans, iter, id, POS_MIN, BTREE_ITER_PREFETCH, k, ret) {
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
- const struct bch_extent_ptr *ptr;
+ iter = bch2_trans_get_iter(&trans, btree_id, POS_MIN,
+ BTREE_ITER_PREFETCH);
- percpu_down_read(&c->mark_lock);
- bkey_for_each_ptr(ptrs, ptr) {
- struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
- struct bucket *g = PTR_BUCKET(ca, ptr, false);
+ while ((k = bch2_btree_iter_peek(iter)).k &&
+ !(ret = bkey_err(k))) {
+ if (gc_btree_gens_key(c, k)) {
+ bkey_on_stack_reassemble(&sk, c, k);
+ bch2_extent_normalize(c, bkey_i_to_s(sk.k));
- if (gen_after(g->gc_gen, ptr->gen))
- g->gc_gen = ptr->gen;
+ bch2_btree_iter_set_pos(iter, bkey_start_pos(&sk.k->k));
- if (gen_after(g->mark.gen, ptr->gen) > 32) {
- /* rewrite btree node */
+ bch2_trans_update(&trans, iter, sk.k, 0);
+ ret = bch2_trans_commit(&trans, NULL, NULL,
+ BTREE_INSERT_NOFAIL);
+ if (ret == -EINTR)
+ continue;
+ if (ret) {
+ break;
}
}
- percpu_up_read(&c->mark_lock);
+
+ bch2_btree_iter_next(iter);
}
bch2_trans_exit(&trans);
+ bkey_on_stack_exit(&sk, c);
+
return ret;
}
@@ -1356,7 +1399,7 @@ static int bch2_gc_thread(void *arg)
#else
ret = bch2_gc_gens(c);
#endif
- if (ret)
+ if (ret < 0)
bch_err(c, "btree gc failed: %i", ret);
debug_check_no_locks_held();
diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c
index 2f5097218f9c..10a00085cdd6 100644
--- a/fs/bcachefs/btree_io.c
+++ b/fs/bcachefs/btree_io.c
@@ -42,7 +42,7 @@ static void verify_no_dups(struct btree *b,
BUG_ON(extents
? bkey_cmp(l.p, bkey_start_pos(&r)) > 0
: bkey_cmp(l.p, bkey_start_pos(&r)) >= 0);
- //BUG_ON(bkey_cmp_packed(&b->format, p, k) >= 0);
+ //BUG_ON(bch2_bkey_cmp_packed(&b->format, p, k) >= 0);
}
#endif
}
@@ -102,14 +102,14 @@ static void sort_bkey_ptrs(const struct btree *bt,
break;
for (b = a; c = 2 * b + 1, (d = c + 1) < n;)
- b = bkey_cmp_packed(bt,
+ b = bch2_bkey_cmp_packed(bt,
ptrs[c],
ptrs[d]) >= 0 ? c : d;
if (d == n)
b = c;
while (b != a &&
- bkey_cmp_packed(bt,
+ bch2_bkey_cmp_packed(bt,
ptrs[a],
ptrs[b]) >= 0)
b = (b - 1) / 2;
@@ -750,7 +750,9 @@ static int validate_bset(struct bch_fs *c, struct btree *b,
btree_err_on(bkey_cmp(bn->max_key, b->key.k.p),
BTREE_ERR_MUST_RETRY, c, b, i,
- "incorrect max key");
+ "incorrect max key %llu:%llu",
+ bn->max_key.inode,
+ bn->max_key.offset);
if (write)
compat_btree_node(b->c.level, b->c.btree_id, version,
@@ -930,7 +932,8 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry
btree_err_on(!bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i)),
BTREE_ERR_WANT_RETRY, c, b, i,
- "unknown checksum type");
+ "unknown checksum type %llu",
+ BSET_CSUM_TYPE(i));
nonce = btree_nonce(i, b->written << 9);
csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, b->data);
@@ -957,7 +960,8 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry
btree_err_on(!bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i)),
BTREE_ERR_WANT_RETRY, c, b, i,
- "unknown checksum type");
+ "unknown checksum type %llu",
+ BSET_CSUM_TYPE(i));
nonce = btree_nonce(i, b->written << 9);
csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne);
@@ -1040,7 +1044,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry
const char *invalid = bch2_bkey_val_invalid(c, u.s_c);
if (invalid ||
- (inject_invalid_keys(c) &&
+ (bch2_inject_invalid_keys &&
!bversion_cmp(u.k->version, MAX_VERSION))) {
char buf[160];
diff --git a/fs/bcachefs/btree_io.h b/fs/bcachefs/btree_io.h
index b859a067c78b..626d0f071b70 100644
--- a/fs/bcachefs/btree_io.h
+++ b/fs/bcachefs/btree_io.h
@@ -104,7 +104,7 @@ static inline void bset_encrypt(struct bch_fs *c, struct bset *i, unsigned offse
bch2_encrypt(c, BSET_CSUM_TYPE(i), nonce, &bn->flags,
bytes);
- nonce = nonce_add(nonce, round_up(bytes, CHACHA20_BLOCK_SIZE));
+ nonce = nonce_add(nonce, round_up(bytes, CHACHA_BLOCK_SIZE));
}
bch2_encrypt(c, BSET_CSUM_TYPE(i), nonce, i->_data,
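
The CHACHA_BLOCK_SIZE rename here goes with the crypto_skcipher -> crypto_sync_skcipher change in struct bch_fs: both track the kernel crypto API used for metadata encryption. A hypothetical standalone helper showing the sync-skcipher calling convention this code now relies on (this is not bcachefs's actual bch2_encrypt(), which lives in checksum.c):

#include <crypto/skcipher.h>
#include <linux/scatterlist.h>

static int chacha20_crypt_buf(struct crypto_sync_skcipher *tfm,
			      void *buf, unsigned len, u8 *nonce)
{
	SYNC_SKCIPHER_REQUEST_ON_STACK(req, tfm);
	struct scatterlist sg;
	int ret;

	sg_init_one(&sg, buf, len);
	skcipher_request_set_sync_tfm(req, tfm);
	skcipher_request_set_callback(req, 0, NULL, NULL);
	skcipher_request_set_crypt(req, &sg, &sg, len, nonce);

	ret = crypto_skcipher_encrypt(req);	/* ChaCha20 is a stream cipher: encrypt == decrypt */
	skcipher_request_zero(req);
	return ret;
}

The transform itself would be allocated with crypto_alloc_sync_skcipher("chacha20", 0, 0), matching the member type now stored in bch_fs.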
diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c
index 6fab76c3220c..58f1a3dd97d3 100644
--- a/fs/bcachefs/btree_iter.c
+++ b/fs/bcachefs/btree_iter.c
@@ -197,13 +197,13 @@ static struct bpos btree_node_pos(struct btree_bkey_cached_common *_b,
bool __bch2_btree_node_lock(struct btree *b, struct bpos pos,
unsigned level, struct btree_iter *iter,
enum six_lock_type type,
- six_lock_should_sleep_fn should_sleep_fn,
- void *p)
+ six_lock_should_sleep_fn should_sleep_fn, void *p,
+ unsigned long ip)
{
struct btree_trans *trans = iter->trans;
- struct btree_iter *linked;
+ struct btree_iter *linked, *deadlock_iter = NULL;
u64 start_time = local_clock();
- bool ret = true;
+ unsigned reason = 9;
/* Check if it's safe to block: */
trans_for_each_iter(trans, linked) {
@@ -228,11 +228,34 @@ bool __bch2_btree_node_lock(struct btree *b, struct bpos pos,
linked->locks_want = max_t(unsigned,
linked->locks_want,
__fls(linked->nodes_locked) + 1);
- if (!btree_iter_get_locks(linked, true, false))
- ret = false;
+ if (!btree_iter_get_locks(linked, true, false)) {
+ deadlock_iter = linked;
+ reason = 1;
+ }
} else {
- ret = false;
+ deadlock_iter = linked;
+ reason = 2;
+ }
+ }
+
+ if (linked->btree_id != iter->btree_id) {
+ if (linked->btree_id > iter->btree_id) {
+ deadlock_iter = linked;
+ reason = 3;
+ }
+ continue;
+ }
+
+ /*
+ * Within the same btree, cached iterators come before non
+ * cached iterators:
+ */
+ if (btree_iter_is_cached(linked) != btree_iter_is_cached(iter)) {
+ if (btree_iter_is_cached(iter)) {
+ deadlock_iter = linked;
+ reason = 4;
}
+ continue;
}
/*
@@ -240,30 +263,29 @@ bool __bch2_btree_node_lock(struct btree *b, struct bpos pos,
* another iterator has possible descendants locked of the node
* we're about to lock, it must have the ancestors locked too:
*/
- if (linked->btree_id == iter->btree_id &&
- level > __fls(linked->nodes_locked)) {
+ if (level > __fls(linked->nodes_locked)) {
if (!(trans->nounlock)) {
linked->locks_want =
max(level + 1, max_t(unsigned,
linked->locks_want,
iter->locks_want));
- if (!btree_iter_get_locks(linked, true, false))
- ret = false;
+ if (!btree_iter_get_locks(linked, true, false)) {
+ deadlock_iter = linked;
+ reason = 5;
+ }
} else {
- ret = false;
+ deadlock_iter = linked;
+ reason = 6;
}
}
/* Must lock btree nodes in key order: */
- if ((cmp_int(iter->btree_id, linked->btree_id) ?:
- -cmp_int(btree_iter_type(iter), btree_iter_type(linked))) < 0)
- ret = false;
-
- if (iter->btree_id == linked->btree_id &&
- btree_node_locked(linked, level) &&
+ if (btree_node_locked(linked, level) &&
bkey_cmp(pos, btree_node_pos((void *) linked->l[level].b,
- btree_iter_type(linked))) <= 0)
- ret = false;
+ btree_iter_type(linked))) <= 0) {
+ deadlock_iter = linked;
+ reason = 7;
+ }
/*
* Recheck if this is a node we already have locked - since one
@@ -277,8 +299,13 @@ bool __bch2_btree_node_lock(struct btree *b, struct bpos pos,
}
}
- if (unlikely(!ret)) {
- trace_trans_restart_would_deadlock(iter->trans->ip);
+ if (unlikely(deadlock_iter)) {
+ trace_trans_restart_would_deadlock(iter->trans->ip, ip,
+ reason,
+ deadlock_iter->btree_id,
+ btree_iter_type(deadlock_iter),
+ iter->btree_id,
+ btree_iter_type(iter));
return false;
}
@@ -471,7 +498,7 @@ static void bch2_btree_iter_verify_level(struct btree_iter *iter,
char buf1[100], buf2[100];
const char *msg;
- if (!debug_check_iterators(iter->trans->c))
+ if (!bch2_debug_check_iterators)
return;
if (btree_iter_type(iter) == BTREE_ITER_CACHED) {
@@ -567,7 +594,7 @@ void bch2_btree_trans_verify_iters(struct btree_trans *trans, struct btree *b)
{
struct btree_iter *iter;
- if (!debug_check_iterators(trans->c))
+ if (!bch2_debug_check_iterators)
return;
trans_for_each_iter_with_node(trans, b, iter)
@@ -739,7 +766,7 @@ void bch2_btree_node_iter_fix(struct btree_iter *iter,
__bch2_btree_node_iter_fix(iter, b, node_iter, t,
where, clobber_u64s, new_u64s);
- if (debug_check_iterators(iter->trans->c))
+ if (bch2_debug_check_iterators)
bch2_btree_node_iter_verify(node_iter, b);
}
@@ -769,7 +796,7 @@ static inline struct bkey_s_c __btree_iter_unpack(struct btree_iter *iter,
ret = bkey_disassemble(l->b, k, u);
- if (debug_check_bkeys(iter->trans->c))
+ if (bch2_debug_check_bkeys)
bch2_bkey_debugcheck(iter->trans->c, l->b, ret);
return ret;
@@ -945,7 +972,8 @@ static int lock_root_check_fn(struct six_lock *lock, void *p)
}
static inline int btree_iter_lock_root(struct btree_iter *iter,
- unsigned depth_want)
+ unsigned depth_want,
+ unsigned long trace_ip)
{
struct bch_fs *c = iter->trans->c;
struct btree *b, **rootp = &c->btree_roots[iter->btree_id].b;
@@ -974,7 +1002,8 @@ static inline int btree_iter_lock_root(struct btree_iter *iter,
lock_type = __btree_lock_want(iter, iter->level);
if (unlikely(!btree_node_lock(b, POS_MAX, iter->level,
iter, lock_type,
- lock_root_check_fn, rootp)))
+ lock_root_check_fn, rootp,
+ trace_ip)))
return -EINTR;
if (likely(b == READ_ONCE(*rootp) &&
@@ -1046,7 +1075,8 @@ static noinline void btree_node_mem_ptr_set(struct btree_iter *iter,
btree_node_unlock(iter, plevel);
}
-static __always_inline int btree_iter_down(struct btree_iter *iter)
+static __always_inline int btree_iter_down(struct btree_iter *iter,
+ unsigned long trace_ip)
{
struct bch_fs *c = iter->trans->c;
struct btree_iter_level *l = &iter->l[iter->level];
@@ -1060,7 +1090,7 @@ static __always_inline int btree_iter_down(struct btree_iter *iter)
bch2_bkey_unpack(l->b, &tmp.k,
bch2_btree_node_iter_peek(&l->iter, l->b));
- b = bch2_btree_node_get(c, iter, &tmp.k, level, lock_type);
+ b = bch2_btree_node_get(c, iter, &tmp.k, level, lock_type, trace_ip);
if (unlikely(IS_ERR(b)))
return PTR_ERR(b);
@@ -1084,7 +1114,7 @@ static void btree_iter_up(struct btree_iter *iter)
btree_node_unlock(iter, iter->level++);
}
-static int btree_iter_traverse_one(struct btree_iter *);
+static int btree_iter_traverse_one(struct btree_iter *, unsigned long);
static int __btree_iter_traverse_all(struct btree_trans *trans, int ret)
{
@@ -1104,11 +1134,12 @@ retry_all:
sorted[nr_sorted++] = iter->idx;
#define btree_iter_cmp_by_idx(_l, _r) \
- btree_iter_cmp(&trans->iters[_l], &trans->iters[_r])
+ btree_iter_lock_cmp(&trans->iters[_l], &trans->iters[_r])
bubble_sort(sorted, nr_sorted, btree_iter_cmp_by_idx);
#undef btree_iter_cmp_by_idx
bch2_trans_unlock(trans);
+ cond_resched();
if (unlikely(ret == -ENOMEM)) {
struct closure cl;
@@ -1139,7 +1170,7 @@ retry_all:
if (!(trans->iters_linked & (1ULL << idx)))
continue;
- ret = btree_iter_traverse_one(&trans->iters[idx]);
+ ret = btree_iter_traverse_one(&trans->iters[idx], _THIS_IP_);
if (ret)
goto retry_all;
}
@@ -1202,7 +1233,8 @@ static inline unsigned btree_iter_up_until_good_node(struct btree_iter *iter,
* On error, caller (peek_node()/peek_key()) must return NULL; the error is
* stashed in the iterator and returned from bch2_trans_exit().
*/
-static int btree_iter_traverse_one(struct btree_iter *iter)
+static int btree_iter_traverse_one(struct btree_iter *iter,
+ unsigned long trace_ip)
{
unsigned depth_want = iter->level;
@@ -1249,8 +1281,8 @@ static int btree_iter_traverse_one(struct btree_iter *iter)
*/
while (iter->level > depth_want) {
int ret = btree_iter_node(iter, iter->level)
- ? btree_iter_down(iter)
- : btree_iter_lock_root(iter, depth_want);
+ ? btree_iter_down(iter, trace_ip)
+ : btree_iter_lock_root(iter, depth_want, trace_ip);
if (unlikely(ret)) {
if (ret == 1)
return 0;
@@ -1281,7 +1313,7 @@ int __must_check __bch2_btree_iter_traverse(struct btree_iter *iter)
int ret;
ret = bch2_trans_cond_resched(trans) ?:
- btree_iter_traverse_one(iter);
+ btree_iter_traverse_one(iter, _RET_IP_);
if (unlikely(ret))
ret = __btree_iter_traverse_all(trans, ret);
@@ -1545,13 +1577,13 @@ static inline struct bkey_s_c btree_iter_peek_uptodate(struct btree_iter *iter)
ret.v = bkeyp_val(&l->b->format, _k);
- if (debug_check_iterators(iter->trans->c)) {
+ if (bch2_debug_check_iterators) {
struct bkey k = bkey_unpack_key(l->b, _k);
BUG_ON(memcmp(&k, &iter->k, sizeof(k)));
}
- if (debug_check_bkeys(iter->trans->c))
+ if (bch2_debug_check_bkeys)
bch2_bkey_debugcheck(iter->trans->c, l->b, ret);
}
@@ -1970,6 +2002,7 @@ int bch2_trans_iter_free(struct btree_trans *trans,
return bch2_trans_iter_put(trans, iter);
}
+#if 0
static int bch2_trans_realloc_iters(struct btree_trans *trans,
unsigned new_size)
{
@@ -2018,8 +2051,7 @@ success:
sizeof(struct btree_iter) * trans->nr_iters +
sizeof(struct btree_insert_entry) * trans->nr_iters);
- if (trans->iters != trans->iters_onstack)
- kfree(trans->iters);
+ kfree(trans->iters);
trans->iters = new_iters;
trans->updates = new_updates;
@@ -2033,6 +2065,7 @@ success:
return 0;
}
+#endif
static struct btree_iter *btree_trans_iter_alloc(struct btree_trans *trans)
{
@@ -2042,28 +2075,27 @@ static struct btree_iter *btree_trans_iter_alloc(struct btree_trans *trans)
goto got_slot;
if (trans->nr_iters == trans->size) {
- int ret;
-
- if (trans->nr_iters >= BTREE_ITER_MAX) {
- struct btree_iter *iter;
-
- trans_for_each_iter(trans, iter) {
- pr_err("iter: btree %s pos %llu:%llu%s%s%s %ps",
- bch2_btree_ids[iter->btree_id],
- iter->pos.inode,
- iter->pos.offset,
- (trans->iters_live & (1ULL << iter->idx)) ? " live" : "",
- (trans->iters_touched & (1ULL << iter->idx)) ? " touched" : "",
- iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT ? " keep" : "",
- (void *) iter->ip_allocated);
- }
+ struct btree_iter *iter;
- panic("trans iter oveflow\n");
+ BUG_ON(trans->size < BTREE_ITER_MAX);
+
+ trans_for_each_iter(trans, iter) {
+ pr_err("iter: btree %s pos %llu:%llu%s%s%s %ps",
+ bch2_btree_ids[iter->btree_id],
+ iter->pos.inode,
+ iter->pos.offset,
+ (trans->iters_live & (1ULL << iter->idx)) ? " live" : "",
+ (trans->iters_touched & (1ULL << iter->idx)) ? " touched" : "",
+ iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT ? " keep" : "",
+ (void *) iter->ip_allocated);
}
+ panic("trans iter oveflow\n");
+#if 0
ret = bch2_trans_realloc_iters(trans, trans->size * 2);
if (ret)
return ERR_PTR(ret);
+#endif
}
idx = trans->nr_iters++;
@@ -2305,28 +2337,37 @@ void bch2_trans_reset(struct btree_trans *trans, unsigned flags)
bch2_btree_iter_traverse_all(trans);
}
+static void bch2_trans_alloc_iters(struct btree_trans *trans, struct bch_fs *c)
+{
+ unsigned new_size = BTREE_ITER_MAX;
+ size_t iters_bytes = sizeof(struct btree_iter) * new_size;
+ size_t updates_bytes = sizeof(struct btree_insert_entry) * new_size;
+ void *p;
+
+ BUG_ON(trans->used_mempool);
+
+ p = this_cpu_xchg(c->btree_iters_bufs->iter, NULL) ?:
+ mempool_alloc(&trans->c->btree_iters_pool, GFP_NOFS);
+
+ trans->iters = p; p += iters_bytes;
+ trans->updates = p; p += updates_bytes;
+ trans->updates2 = p; p += updates_bytes;
+ trans->size = new_size;
+}
+
void bch2_trans_init(struct btree_trans *trans, struct bch_fs *c,
unsigned expected_nr_iters,
size_t expected_mem_bytes)
{
- memset(trans, 0, offsetof(struct btree_trans, iters_onstack));
+ memset(trans, 0, sizeof(*trans));
+ trans->c = c;
+ trans->ip = _RET_IP_;
/*
* reallocating iterators currently completely breaks
- * bch2_trans_iter_put():
+ * bch2_trans_iter_put(), we always allocate the max:
*/
- expected_nr_iters = BTREE_ITER_MAX;
-
- trans->c = c;
- trans->ip = _RET_IP_;
- trans->size = ARRAY_SIZE(trans->iters_onstack);
- trans->iters = trans->iters_onstack;
- trans->updates = trans->updates_onstack;
- trans->updates2 = trans->updates2_onstack;
- trans->fs_usage_deltas = NULL;
-
- if (expected_nr_iters > trans->size)
- bch2_trans_realloc_iters(trans, expected_nr_iters);
+ bch2_trans_alloc_iters(trans, c);
if (expected_mem_bytes)
bch2_trans_preload_mem(trans, expected_mem_bytes);
@@ -2341,6 +2382,8 @@ void bch2_trans_init(struct btree_trans *trans, struct bch_fs *c,
int bch2_trans_exit(struct btree_trans *trans)
{
+ struct bch_fs *c = trans->c;
+
bch2_trans_unlock(trans);
#ifdef CONFIG_BCACHEFS_DEBUG
@@ -2353,19 +2396,21 @@ int bch2_trans_exit(struct btree_trans *trans)
kfree(trans->fs_usage_deltas);
kfree(trans->mem);
- if (trans->used_mempool)
+
+ trans->iters = this_cpu_xchg(c->btree_iters_bufs->iter, trans->iters);
+ if (trans->iters)
mempool_free(trans->iters, &trans->c->btree_iters_pool);
- else if (trans->iters != trans->iters_onstack)
- kfree(trans->iters);
+
trans->mem = (void *) 0x1;
trans->iters = (void *) 0x1;
return trans->error ? -EIO : 0;
}
-static void bch2_btree_iter_node_to_text(struct printbuf *out,
- struct btree_bkey_cached_common *_b,
- enum btree_iter_type type)
+static void __maybe_unused
+bch2_btree_iter_node_to_text(struct printbuf *out,
+ struct btree_bkey_cached_common *_b,
+ enum btree_iter_type type)
{
pr_buf(out, " %px l=%u %s:",
_b, _b->level, bch2_btree_ids[_b->btree_id]);
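
The new btree_iters_bufs member together with the bch2_trans_alloc_iters()/bch2_trans_exit() hunks above implements a single-slot per-CPU cache in front of the btree_iters_pool mempool: allocation grabs the CPU-local buffer if one is parked there, and freeing parks the buffer back while releasing whatever it displaced. A stripped-down sketch of the same pattern, with hypothetical names:

#include <linux/percpu.h>
#include <linux/mempool.h>
#include <linux/gfp.h>

struct obj_buf { void *p; };

static void *obj_alloc(struct obj_buf __percpu *cache, mempool_t *pool)
{
	/* Take the CPU-local cached object if present, else hit the mempool: */
	return this_cpu_xchg(cache->p, NULL) ?:
		mempool_alloc(pool, GFP_NOFS);
}

static void obj_free(struct obj_buf __percpu *cache, mempool_t *pool, void *obj)
{
	/* Park this object in the per-CPU slot; free whatever was there: */
	obj = this_cpu_xchg(cache->p, obj);
	if (obj)
		mempool_free(obj, pool);
}

This avoids mempool and allocator traffic for the common case of one btree_trans being set up and torn down repeatedly on the same CPU, at the cost of keeping at most one iterator buffer cached per CPU.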
diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h
index bd9ec3ec9a92..f7a73619c85b 100644
--- a/fs/bcachefs/btree_iter.h
+++ b/fs/bcachefs/btree_iter.h
@@ -177,11 +177,12 @@ void bch2_btree_iter_set_pos_same_leaf(struct btree_iter *, struct bpos);
void __bch2_btree_iter_set_pos(struct btree_iter *, struct bpos, bool);
void bch2_btree_iter_set_pos(struct btree_iter *, struct bpos);
-static inline int btree_iter_cmp(const struct btree_iter *l,
- const struct btree_iter *r)
+/* Sort order for locking btree iterators: */
+static inline int btree_iter_lock_cmp(const struct btree_iter *l,
+ const struct btree_iter *r)
{
return cmp_int(l->btree_id, r->btree_id) ?:
- -cmp_int(btree_iter_type(l), btree_iter_type(r)) ?:
+ -cmp_int(btree_iter_is_cached(l), btree_iter_is_cached(r)) ?:
bkey_cmp(l->pos, r->pos);
}
diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c
index 61662750dfc0..0ee4f78ce67a 100644
--- a/fs/bcachefs/btree_key_cache.c
+++ b/fs/bcachefs/btree_key_cache.c
@@ -29,8 +29,8 @@ static const struct rhashtable_params bch2_btree_key_cache_params = {
};
__flatten
-static inline struct bkey_cached *
-btree_key_cache_find(struct bch_fs *c, enum btree_id btree_id, struct bpos pos)
+inline struct bkey_cached *
+bch2_btree_key_cache_find(struct bch_fs *c, enum btree_id btree_id, struct bpos pos)
{
struct bkey_cached_key key = {
.btree_id = btree_id,
@@ -204,6 +204,7 @@ static int bkey_cached_check_fn(struct six_lock *lock, void *p)
!bkey_cmp(ck->key.pos, iter->pos) ? 0 : -1;
}
+__flatten
int bch2_btree_iter_traverse_cached(struct btree_iter *iter)
{
struct btree_trans *trans = iter->trans;
@@ -218,7 +219,7 @@ int bch2_btree_iter_traverse_cached(struct btree_iter *iter)
goto fill;
}
retry:
- ck = btree_key_cache_find(c, iter->btree_id, iter->pos);
+ ck = bch2_btree_key_cache_find(c, iter->btree_id, iter->pos);
if (!ck) {
if (iter->flags & BTREE_ITER_CACHED_NOCREATE) {
iter->l[0].b = NULL;
@@ -242,7 +243,7 @@ retry:
enum six_lock_type lock_want = __btree_lock_want(iter, 0);
if (!btree_node_lock((void *) ck, iter->pos, 0, iter, lock_want,
- bkey_cached_check_fn, iter)) {
+ bkey_cached_check_fn, iter, _THIS_IP_)) {
if (ck->key.btree_id != iter->btree_id ||
bkey_cmp(ck->key.pos, iter->pos)) {
goto retry;
@@ -415,7 +416,7 @@ int bch2_btree_key_cache_flush(struct btree_trans *trans,
struct bkey_cached_key key = { id, pos };
/* Fastpath - assume it won't be found: */
- if (!btree_key_cache_find(c, id, pos))
+ if (!bch2_btree_key_cache_find(c, id, pos))
return 0;
return btree_key_cache_flush_pos(trans, key, 0, true);
@@ -462,7 +463,7 @@ bool bch2_btree_insert_key_cached(struct btree_trans *trans,
void bch2_btree_key_cache_verify_clean(struct btree_trans *trans,
enum btree_id id, struct bpos pos)
{
- BUG_ON(btree_key_cache_find(trans->c, id, pos));
+ BUG_ON(bch2_btree_key_cache_find(trans->c, id, pos));
}
#endif
diff --git a/fs/bcachefs/btree_key_cache.h b/fs/bcachefs/btree_key_cache.h
index b1756c6c622c..d448264abcc8 100644
--- a/fs/bcachefs/btree_key_cache.h
+++ b/fs/bcachefs/btree_key_cache.h
@@ -1,6 +1,9 @@
#ifndef _BCACHEFS_BTREE_KEY_CACHE_H
#define _BCACHEFS_BTREE_KEY_CACHE_H
+struct bkey_cached *
+bch2_btree_key_cache_find(struct bch_fs *, enum btree_id, struct bpos);
+
int bch2_btree_iter_traverse_cached(struct btree_iter *);
bool bch2_btree_insert_key_cached(struct btree_trans *,
diff --git a/fs/bcachefs/btree_locking.h b/fs/bcachefs/btree_locking.h
index 81fbf3e18647..38323e32731f 100644
--- a/fs/bcachefs/btree_locking.h
+++ b/fs/bcachefs/btree_locking.h
@@ -176,13 +176,15 @@ static inline bool btree_node_lock_increment(struct btree_trans *trans,
bool __bch2_btree_node_lock(struct btree *, struct bpos, unsigned,
struct btree_iter *, enum six_lock_type,
- six_lock_should_sleep_fn, void *);
+ six_lock_should_sleep_fn, void *,
+ unsigned long);
static inline bool btree_node_lock(struct btree *b,
struct bpos pos, unsigned level,
struct btree_iter *iter,
enum six_lock_type type,
- six_lock_should_sleep_fn should_sleep_fn, void *p)
+ six_lock_should_sleep_fn should_sleep_fn, void *p,
+ unsigned long ip)
{
struct btree_trans *trans = iter->trans;
bool ret;
@@ -200,7 +202,7 @@ static inline bool btree_node_lock(struct btree *b,
ret = likely(six_trylock_type(&b->c.lock, type)) ||
btree_node_lock_increment(trans, b, level, type) ||
__bch2_btree_node_lock(b, pos, level, iter, type,
- should_sleep_fn, p);
+ should_sleep_fn, p, ip);
#ifdef CONFIG_BCACHEFS_DEBUG
trans->locking = NULL;
diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h
index 683b416ef427..93721fbc7794 100644
--- a/fs/bcachefs/btree_types.h
+++ b/fs/bcachefs/btree_types.h
@@ -130,10 +130,6 @@ struct btree {
struct btree_write writes[2];
-#ifdef CONFIG_BCACHEFS_DEBUG
- bool *expensive_debug_checks;
-#endif
-
/* Key/pointer for this btree node */
__BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX);
};
@@ -283,6 +279,11 @@ btree_iter_type(const struct btree_iter *iter)
return iter->flags & BTREE_ITER_TYPE;
}
+static inline bool btree_iter_is_cached(const struct btree_iter *iter)
+{
+ return btree_iter_type(iter) == BTREE_ITER_CACHED;
+}
+
static inline struct btree_iter_level *iter_l(struct btree_iter *iter)
{
return iter->l + iter->level;
@@ -380,10 +381,6 @@ struct btree_trans {
unsigned journal_u64s;
unsigned journal_preres_u64s;
struct replicas_delta_list *fs_usage_deltas;
-
- struct btree_iter iters_onstack[2];
- struct btree_insert_entry updates_onstack[2];
- struct btree_insert_entry updates2_onstack[2];
};
#define BTREE_FLAG(flag) \
@@ -591,6 +588,7 @@ static inline bool btree_iter_is_extents(struct btree_iter *iter)
#define BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS \
((1U << BKEY_TYPE_EXTENTS)| \
(1U << BKEY_TYPE_INODES)| \
+ (1U << BKEY_TYPE_EC)| \
(1U << BKEY_TYPE_REFLINK))
enum btree_trigger_flags {
@@ -602,7 +600,6 @@ enum btree_trigger_flags {
__BTREE_TRIGGER_GC,
__BTREE_TRIGGER_BUCKET_INVALIDATE,
- __BTREE_TRIGGER_ALLOC_READ,
__BTREE_TRIGGER_NOATOMIC,
};
@@ -614,7 +611,6 @@ enum btree_trigger_flags {
#define BTREE_TRIGGER_GC (1U << __BTREE_TRIGGER_GC)
#define BTREE_TRIGGER_BUCKET_INVALIDATE (1U << __BTREE_TRIGGER_BUCKET_INVALIDATE)
-#define BTREE_TRIGGER_ALLOC_READ (1U << __BTREE_TRIGGER_ALLOC_READ)
#define BTREE_TRIGGER_NOATOMIC (1U << __BTREE_TRIGGER_NOATOMIC)
static inline bool btree_node_type_needs_gc(enum btree_node_type type)
diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c
index a2604b0ce2d8..4ddd1697ffde 100644
--- a/fs/bcachefs/btree_update_interior.c
+++ b/fs/bcachefs/btree_update_interior.c
@@ -1313,7 +1313,7 @@ bch2_btree_insert_keys_interior(struct btree_update *as, struct btree *b,
* the node the iterator points to:
*/
while ((k = bch2_btree_node_iter_prev_all(&node_iter, b)) &&
- (bkey_cmp_packed(b, k, &insert->k) >= 0))
+ (bkey_cmp_left_packed(b, k, &insert->k.p) >= 0))
;
for_each_keylist_key(keys, insert)
diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c
index cd699c257244..e386f8ed3922 100644
--- a/fs/bcachefs/btree_update_leaf.c
+++ b/fs/bcachefs/btree_update_leaf.c
@@ -72,7 +72,7 @@ bool bch2_btree_bset_insert_key(struct btree_iter *iter,
EBUG_ON(iter->flags & BTREE_ITER_IS_EXTENTS);
k = bch2_btree_node_iter_peek_all(node_iter, b);
- if (k && bkey_cmp_packed(b, k, &insert->k))
+ if (k && bkey_cmp_left_packed(b, k, &insert->k.p))
k = NULL;
/* @k is the key being overwritten/deleted, if any: */
@@ -220,7 +220,7 @@ static inline void btree_insert_entry_checks(struct btree_trans *trans,
struct bch_fs *c = trans->c;
BUG_ON(bkey_cmp(insert->k.p, iter->pos));
- BUG_ON(debug_check_bkeys(c) &&
+ BUG_ON(bch2_debug_check_bkeys &&
bch2_bkey_invalid(c, bkey_i_to_s_c(insert),
__btree_node_type(iter->level, iter->btree_id)));
}
@@ -337,8 +337,9 @@ static inline bool iter_has_trans_triggers(struct btree_iter *iter)
static inline bool iter_has_nontrans_triggers(struct btree_iter *iter)
{
- return (BTREE_NODE_TYPE_HAS_TRIGGERS &
- ~BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS) &
+ return (((BTREE_NODE_TYPE_HAS_TRIGGERS &
+ ~BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS)) |
+ (1U << BTREE_ID_EC)) &
(1U << iter->btree_id);
}
@@ -439,10 +440,10 @@ bch2_trans_commit_write_locked(struct btree_trans *trans,
*/
if (!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY)) {
- if (journal_seq_verify(c))
+ if (bch2_journal_seq_verify)
trans_for_each_update2(trans, i)
i->k->k.version.lo = trans->journal_res.seq;
- else if (inject_invalid_keys(c))
+ else if (bch2_inject_invalid_keys)
trans_for_each_update2(trans, i)
i->k->k.version = MAX_VERSION;
}
@@ -679,6 +680,13 @@ bch2_trans_commit_get_rw_cold(struct btree_trans *trans)
return 0;
}
+static inline int btree_iter_pos_cmp(const struct btree_iter *l,
+ const struct btree_iter *r)
+{
+ return cmp_int(l->btree_id, r->btree_id) ?:
+ bkey_cmp(l->pos, r->pos);
+}
+
static void bch2_trans_update2(struct btree_trans *trans,
struct btree_iter *iter,
struct bkey_i *insert)
@@ -696,12 +704,12 @@ static void bch2_trans_update2(struct btree_trans *trans,
iter->flags |= BTREE_ITER_KEEP_UNTIL_COMMIT;
trans_for_each_update2(trans, i) {
- if (btree_iter_cmp(n.iter, i->iter) == 0) {
+ if (btree_iter_pos_cmp(n.iter, i->iter) == 0) {
*i = n;
return;
}
- if (btree_iter_cmp(n.iter, i->iter) <= 0)
+ if (btree_iter_pos_cmp(n.iter, i->iter) <= 0)
break;
}
@@ -985,7 +993,7 @@ int bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter,
* Pending updates are kept sorted: first, find position of new update:
*/
trans_for_each_update(trans, i)
- if (btree_iter_cmp(iter, i->iter) <= 0)
+ if (btree_iter_pos_cmp(iter, i->iter) <= 0)
break;
/*
diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c
index 97a8af31ded1..82f1cc4ca693 100644
--- a/fs/bcachefs/buckets.c
+++ b/fs/bcachefs/buckets.c
@@ -77,6 +77,26 @@
#include <linux/preempt.h>
#include <trace/events/bcachefs.h>
+static inline void fs_usage_data_type_to_base(struct bch_fs_usage *fs_usage,
+ enum bch_data_type data_type,
+ s64 sectors)
+{
+ switch (data_type) {
+ case BCH_DATA_btree:
+ fs_usage->btree += sectors;
+ break;
+ case BCH_DATA_user:
+ case BCH_DATA_parity:
+ fs_usage->data += sectors;
+ break;
+ case BCH_DATA_cached:
+ fs_usage->cached += sectors;
+ break;
+ default:
+ break;
+ }
+}
+
/*
* Clear journal_seq_valid for buckets for which it's not needed, to prevent
* wraparound:
@@ -132,17 +152,7 @@ void bch2_fs_usage_initialize(struct bch_fs *c)
struct bch_replicas_entry *e =
cpu_replicas_entry(&c->replicas, i);
- switch (e->data_type) {
- case BCH_DATA_btree:
- usage->btree += usage->replicas[i];
- break;
- case BCH_DATA_user:
- usage->data += usage->replicas[i];
- break;
- case BCH_DATA_cached:
- usage->cached += usage->replicas[i];
- break;
- }
+ fs_usage_data_type_to_base(usage, e->data_type, usage->replicas[i]);
}
percpu_up_write(&c->mark_lock);
@@ -254,6 +264,7 @@ void bch2_fs_usage_acc_to_base(struct bch_fs *c, unsigned idx)
BUG_ON(idx >= 2);
+ preempt_disable();
write_seqcount_begin(&c->usage_lock);
acc_u64s_percpu((u64 *) c->usage_base,
@@ -261,6 +272,7 @@ void bch2_fs_usage_acc_to_base(struct bch_fs *c, unsigned idx)
percpu_memset(c->usage[idx], 0, u64s * sizeof(u64));
write_seqcount_end(&c->usage_lock);
+ preempt_enable();
}
void bch2_fs_usage_to_text(struct printbuf *out,
@@ -374,9 +386,14 @@ static inline int is_fragmented_bucket(struct bucket_mark m,
return 0;
}
+static inline int is_stripe_data_bucket(struct bucket_mark m)
+{
+ return m.stripe && m.data_type != BCH_DATA_parity;
+}
+
static inline int bucket_stripe_sectors(struct bucket_mark m)
{
- return m.stripe ? m.dirty_sectors : 0;
+ return is_stripe_data_bucket(m) ? m.dirty_sectors : 0;
}
static inline enum bch_data_type bucket_type(struct bucket_mark m)
@@ -410,8 +427,8 @@ int bch2_fs_usage_apply(struct bch_fs *c,
*/
should_not_have_added = added - (s64) (disk_res ? disk_res->sectors : 0);
if (WARN_ONCE(should_not_have_added > 0,
- "disk usage increased by %lli without a reservation",
- should_not_have_added)) {
+ "disk usage increased by %lli more than reservation of %llu",
+ added, disk_res ? disk_res->sectors : 0)) {
atomic64_sub(should_not_have_added, &c->sectors_available);
added -= should_not_have_added;
ret = -1;
@@ -482,6 +499,7 @@ static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca,
bch2_wake_allocator(ca);
}
+__flatten
void bch2_dev_usage_from_buckets(struct bch_fs *c)
{
struct bch_dev *ca;
@@ -519,17 +537,7 @@ static inline int update_replicas(struct bch_fs *c,
if (!fs_usage)
return 0;
- switch (r->data_type) {
- case BCH_DATA_btree:
- fs_usage->btree += sectors;
- break;
- case BCH_DATA_user:
- fs_usage->data += sectors;
- break;
- case BCH_DATA_cached:
- fs_usage->cached += sectors;
- break;
- }
+ fs_usage_data_type_to_base(fs_usage, r->data_type, sectors);
fs_usage->replicas[idx] += sectors;
return 0;
}
@@ -755,8 +763,7 @@ static int bch2_mark_alloc(struct bch_fs *c,
}
}));
- if (!(flags & BTREE_TRIGGER_ALLOC_READ))
- bch2_dev_usage_update(c, ca, fs_usage, old_m, m, gc);
+ bch2_dev_usage_update(c, ca, fs_usage, old_m, m, gc);
g->io_time[READ] = u.read_time;
g->io_time[WRITE] = u.write_time;
@@ -882,124 +889,155 @@ static s64 ptr_disk_sectors_delta(struct extent_ptr_decoded p,
p.crc.uncompressed_size);
}
-static void bucket_set_stripe(struct bch_fs *c,
- const struct bch_extent_ptr *ptr,
- struct bch_fs_usage *fs_usage,
- u64 journal_seq,
- unsigned flags,
- bool enabled)
-{
- bool gc = flags & BTREE_TRIGGER_GC;
- struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
- struct bucket *g = PTR_BUCKET(ca, ptr, gc);
- struct bucket_mark new, old;
-
- old = bucket_cmpxchg(g, new, ({
- new.stripe = enabled;
- if (journal_seq) {
- new.journal_seq_valid = 1;
- new.journal_seq = journal_seq;
- }
- }));
-
- bch2_dev_usage_update(c, ca, fs_usage, old, new, gc);
-
- /*
- * XXX write repair code for these, flag stripe as possibly bad
- */
- if (old.gen != ptr->gen)
- bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
- "stripe with stale pointer");
-#if 0
- /*
- * We'd like to check for these, but these checks don't work
- * yet:
- */
- if (old.stripe && enabled)
- bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
- "multiple stripes using same bucket");
-
- if (!old.stripe && !enabled)
- bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
- "deleting stripe but bucket not marked as stripe bucket");
-#endif
-}
-
-static int __mark_pointer(struct bch_fs *c, struct bkey_s_c k,
- struct extent_ptr_decoded p,
- s64 sectors, enum bch_data_type ptr_data_type,
- u8 bucket_gen, u8 *bucket_data_type,
- u16 *dirty_sectors, u16 *cached_sectors)
+static int check_bucket_ref(struct bch_fs *c, struct bkey_s_c k,
+ const struct bch_extent_ptr *ptr,
+ s64 sectors, enum bch_data_type ptr_data_type,
+ u8 bucket_gen, u8 bucket_data_type,
+ u16 dirty_sectors, u16 cached_sectors)
{
- u16 *dst_sectors = !p.ptr.cached
+ size_t bucket_nr = PTR_BUCKET_NR(bch_dev_bkey_exists(c, ptr->dev), ptr);
+ u16 bucket_sectors = !ptr->cached
? dirty_sectors
: cached_sectors;
- u16 orig_sectors = *dst_sectors;
char buf[200];
- if (gen_after(p.ptr.gen, bucket_gen)) {
+ if (gen_after(ptr->gen, bucket_gen)) {
bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
"bucket %u:%zu gen %u data type %s: ptr gen %u newer than bucket gen\n"
"while marking %s",
- p.ptr.dev, PTR_BUCKET_NR(bch_dev_bkey_exists(c, p.ptr.dev), &p.ptr),
- bucket_gen,
- bch2_data_types[*bucket_data_type ?: ptr_data_type],
- p.ptr.gen,
+ ptr->dev, bucket_nr, bucket_gen,
+ bch2_data_types[bucket_data_type ?: ptr_data_type],
+ ptr->gen,
(bch2_bkey_val_to_text(&PBUF(buf), c, k), buf));
return -EIO;
}
- if (gen_cmp(bucket_gen, p.ptr.gen) >= 96U) {
+ if (gen_cmp(bucket_gen, ptr->gen) > BUCKET_GC_GEN_MAX) {
bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
"bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n"
"while marking %s",
- p.ptr.dev, PTR_BUCKET_NR(bch_dev_bkey_exists(c, p.ptr.dev), &p.ptr),
- bucket_gen,
- bch2_data_types[*bucket_data_type ?: ptr_data_type],
- p.ptr.gen,
+ ptr->dev, bucket_nr, bucket_gen,
+ bch2_data_types[bucket_data_type ?: ptr_data_type],
+ ptr->gen,
(bch2_bkey_val_to_text(&PBUF(buf), c, k), buf));
return -EIO;
}
- if (bucket_gen != p.ptr.gen && !p.ptr.cached) {
+ if (bucket_gen != ptr->gen && !ptr->cached) {
bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
"bucket %u:%zu gen %u data type %s: stale dirty ptr (gen %u)\n"
"while marking %s",
- p.ptr.dev, PTR_BUCKET_NR(bch_dev_bkey_exists(c, p.ptr.dev), &p.ptr),
- bucket_gen,
- bch2_data_types[*bucket_data_type ?: ptr_data_type],
- p.ptr.gen,
+ ptr->dev, bucket_nr, bucket_gen,
+ bch2_data_types[bucket_data_type ?: ptr_data_type],
+ ptr->gen,
(bch2_bkey_val_to_text(&PBUF(buf), c, k), buf));
return -EIO;
}
- if (bucket_gen != p.ptr.gen)
+ if (bucket_gen != ptr->gen)
return 1;
- if (*bucket_data_type && *bucket_data_type != ptr_data_type) {
+ if (bucket_data_type && ptr_data_type &&
+ bucket_data_type != ptr_data_type) {
bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
"bucket %u:%zu gen %u different types of data in same bucket: %s, %s\n"
"while marking %s",
- p.ptr.dev, PTR_BUCKET_NR(bch_dev_bkey_exists(c, p.ptr.dev), &p.ptr),
- bucket_gen,
- bch2_data_types[*bucket_data_type],
+ ptr->dev, bucket_nr, bucket_gen,
+ bch2_data_types[bucket_data_type],
bch2_data_types[ptr_data_type],
(bch2_bkey_val_to_text(&PBUF(buf), c, k), buf));
return -EIO;
}
- if (checked_add(*dst_sectors, sectors)) {
+ if ((unsigned) (bucket_sectors + sectors) > U16_MAX) {
bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
"bucket %u:%zu gen %u data type %s sector count overflow: %u + %lli > U16_MAX\n"
"while marking %s",
- p.ptr.dev, PTR_BUCKET_NR(bch_dev_bkey_exists(c, p.ptr.dev), &p.ptr),
- bucket_gen,
- bch2_data_types[*bucket_data_type ?: ptr_data_type],
- orig_sectors, sectors,
+ ptr->dev, bucket_nr, bucket_gen,
+ bch2_data_types[bucket_data_type ?: ptr_data_type],
+ bucket_sectors, sectors,
(bch2_bkey_val_to_text(&PBUF(buf), c, k), buf));
return -EIO;
}
+ return 0;
+}
+
+static int bucket_set_stripe(struct bch_fs *c, struct bkey_s_c k,
+ unsigned ptr_idx,
+ struct bch_fs_usage *fs_usage,
+ u64 journal_seq, unsigned flags,
+ bool enabled)
+{
+ const struct bch_stripe *s = bkey_s_c_to_stripe(k).v;
+ unsigned nr_data = s->nr_blocks - s->nr_redundant;
+ bool parity = ptr_idx >= nr_data;
+ const struct bch_extent_ptr *ptr = s->ptrs + ptr_idx;
+ bool gc = flags & BTREE_TRIGGER_GC;
+ struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
+ struct bucket *g = PTR_BUCKET(ca, ptr, gc);
+ struct bucket_mark new, old;
+ char buf[200];
+ int ret;
+
+ if (enabled)
+ g->ec_redundancy = s->nr_redundant;
+
+ old = bucket_cmpxchg(g, new, ({
+ ret = check_bucket_ref(c, k, ptr, 0, 0, new.gen, new.data_type,
+ new.dirty_sectors, new.cached_sectors);
+ if (ret)
+ return ret;
+
+ if (new.stripe && enabled)
+ bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
+ "bucket %u:%zu gen %u: multiple stripes using same bucket\n%s",
+ ptr->dev, PTR_BUCKET_NR(ca, ptr), new.gen,
+ (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf));
+
+ if (!new.stripe && !enabled)
+ bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
+ "bucket %u:%zu gen %u: deleting stripe but not marked\n%s",
+ ptr->dev, PTR_BUCKET_NR(ca, ptr), new.gen,
+ (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf));
+
+ new.stripe = enabled;
+
+ if ((flags & BTREE_TRIGGER_GC) && parity) {
+ new.data_type = enabled ? BCH_DATA_parity : 0;
+ new.dirty_sectors = enabled ? le16_to_cpu(s->sectors): 0;
+ }
+
+ if (journal_seq) {
+ new.journal_seq_valid = 1;
+ new.journal_seq = journal_seq;
+ }
+ }));
+
+ if (!enabled)
+ g->ec_redundancy = 0;
+
+ bch2_dev_usage_update(c, ca, fs_usage, old, new, gc);
+ return 0;
+}
+
+static int __mark_pointer(struct bch_fs *c, struct bkey_s_c k,
+ const struct bch_extent_ptr *ptr,
+ s64 sectors, enum bch_data_type ptr_data_type,
+ u8 bucket_gen, u8 *bucket_data_type,
+ u16 *dirty_sectors, u16 *cached_sectors)
+{
+ u16 *dst_sectors = !ptr->cached
+ ? dirty_sectors
+ : cached_sectors;
+ int ret = check_bucket_ref(c, k, ptr, sectors, ptr_data_type,
+ bucket_gen, *bucket_data_type,
+ *dirty_sectors, *cached_sectors);
+
+ if (ret)
+ return ret;
+
+ *dst_sectors += sectors;
*bucket_data_type = *dirty_sectors || *cached_sectors
? ptr_data_type : 0;
return 0;
@@ -1024,7 +1062,7 @@ static int bch2_mark_pointer(struct bch_fs *c, struct bkey_s_c k,
new.v.counter = old.v.counter = v;
bucket_data_type = new.data_type;
- ret = __mark_pointer(c, k, p, sectors, data_type, new.gen,
+ ret = __mark_pointer(c, k, &p.ptr, sectors, data_type, new.gen,
&bucket_data_type,
&new.dirty_sectors,
&new.cached_sectors);
@@ -1057,12 +1095,10 @@ static int bch2_mark_stripe_ptr(struct bch_fs *c,
struct bch_extent_stripe_ptr p,
enum bch_data_type data_type,
struct bch_fs_usage *fs_usage,
- s64 sectors, unsigned flags,
- struct bch_replicas_padded *r,
- unsigned *nr_data,
- unsigned *nr_parity)
+ s64 sectors, unsigned flags)
{
bool gc = flags & BTREE_TRIGGER_GC;
+ struct bch_replicas_padded r;
struct stripe *m;
unsigned i, blocks_nonempty = 0;
@@ -1077,14 +1113,10 @@ static int bch2_mark_stripe_ptr(struct bch_fs *c,
return -EIO;
}
- BUG_ON(m->r.e.data_type != data_type);
-
- *nr_data = m->nr_blocks - m->nr_redundant;
- *nr_parity = m->nr_redundant;
- *r = m->r;
-
m->block_sectors[p.block] += sectors;
+ r = m->r;
+
for (i = 0; i < m->nr_blocks; i++)
blocks_nonempty += m->block_sectors[i] != 0;
@@ -1096,6 +1128,9 @@ static int bch2_mark_stripe_ptr(struct bch_fs *c,
spin_unlock(&c->ec_stripes_heap_lock);
+ r.e.data_type = data_type;
+ update_replicas(c, fs_usage, &r.e, sectors);
+
return 0;
}
@@ -1141,25 +1176,11 @@ static int bch2_mark_extent(struct bch_fs *c,
dirty_sectors += disk_sectors;
r.e.devs[r.e.nr_devs++] = p.ptr.dev;
} else {
- struct bch_replicas_padded ec_r;
- unsigned nr_data, nr_parity;
- s64 parity_sectors;
-
ret = bch2_mark_stripe_ptr(c, p.ec, data_type,
- fs_usage, disk_sectors, flags,
- &ec_r, &nr_data, &nr_parity);
+ fs_usage, disk_sectors, flags);
if (ret)
return ret;
- parity_sectors =
- __ptr_disk_sectors_delta(p.crc.live_size,
- offset, sectors, flags,
- p.crc.compressed_size * nr_parity,
- p.crc.uncompressed_size * nr_data);
-
- update_replicas(c, fs_usage, &ec_r.e,
- disk_sectors + parity_sectors);
-
/*
* There may be other dirty pointers in this extent, but
* if so they're not required for mounting if we have an
@@ -1188,6 +1209,7 @@ static int bch2_mark_stripe(struct bch_fs *c,
? bkey_s_c_to_stripe(new).v : NULL;
struct stripe *m = genradix_ptr(&c->stripes[gc], idx);
unsigned i;
+ int ret;
if (!m || (old_s && !m->alive)) {
bch_err_ratelimited(c, "error marking nonexistent stripe %zu",
@@ -1197,9 +1219,12 @@ static int bch2_mark_stripe(struct bch_fs *c,
if (!new_s) {
/* Deleting: */
- for (i = 0; i < old_s->nr_blocks; i++)
- bucket_set_stripe(c, old_s->ptrs + i, fs_usage,
- journal_seq, flags, false);
+ for (i = 0; i < old_s->nr_blocks; i++) {
+ ret = bucket_set_stripe(c, old, i, fs_usage,
+ journal_seq, flags, false);
+ if (ret)
+ return ret;
+ }
if (!gc && m->on_heap) {
spin_lock(&c->ec_stripes_heap_lock);
@@ -1207,6 +1232,10 @@ static int bch2_mark_stripe(struct bch_fs *c,
spin_unlock(&c->ec_stripes_heap_lock);
}
+ if (gc)
+ update_replicas(c, fs_usage, &m->r.e,
+ -((s64) m->sectors * m->nr_redundant));
+
memset(m, 0, sizeof(*m));
} else {
BUG_ON(old_s && new_s->nr_blocks != old_s->nr_blocks);
@@ -1218,11 +1247,16 @@ static int bch2_mark_stripe(struct bch_fs *c,
old_s->ptrs + i,
sizeof(struct bch_extent_ptr))) {
- if (old_s)
- bucket_set_stripe(c, old_s->ptrs + i, fs_usage,
+ if (old_s) {
+ bucket_set_stripe(c, old, i, fs_usage,
journal_seq, flags, false);
- bucket_set_stripe(c, new_s->ptrs + i, fs_usage,
- journal_seq, flags, true);
+ if (ret)
+ return ret;
+ }
+ ret = bucket_set_stripe(c, new, i, fs_usage,
+ journal_seq, flags, true);
+ if (ret)
+ return ret;
}
}
@@ -1231,19 +1265,23 @@ static int bch2_mark_stripe(struct bch_fs *c,
m->algorithm = new_s->algorithm;
m->nr_blocks = new_s->nr_blocks;
m->nr_redundant = new_s->nr_redundant;
+ m->blocks_nonempty = 0;
- bch2_bkey_to_replicas(&m->r.e, new);
+ for (i = 0; i < new_s->nr_blocks; i++) {
+ m->block_sectors[i] =
+ stripe_blockcount_get(new_s, i);
+ m->blocks_nonempty += !!m->block_sectors[i];
+ }
- /* gc recalculates these fields: */
- if (!(flags & BTREE_TRIGGER_GC)) {
- m->blocks_nonempty = 0;
+ if (gc && old_s)
+ update_replicas(c, fs_usage, &m->r.e,
+ -((s64) m->sectors * m->nr_redundant));
- for (i = 0; i < new_s->nr_blocks; i++) {
- m->block_sectors[i] =
- stripe_blockcount_get(new_s, i);
- m->blocks_nonempty += !!m->block_sectors[i];
- }
- }
+ bch2_bkey_to_replicas(&m->r.e, new);
+
+ if (gc)
+ update_replicas(c, fs_usage, &m->r.e,
+ ((s64) m->sectors * m->nr_redundant));
if (!gc) {
spin_lock(&c->ec_stripes_heap_lock);
@@ -1548,23 +1586,21 @@ static int trans_get_key(struct btree_trans *trans,
return ret;
}
-static int bch2_trans_mark_pointer(struct btree_trans *trans,
- struct bkey_s_c k, struct extent_ptr_decoded p,
- s64 sectors, enum bch_data_type data_type)
+static int bch2_trans_start_alloc_update(struct btree_trans *trans, struct btree_iter **_iter,
+ const struct bch_extent_ptr *ptr,
+ struct bkey_alloc_unpacked *u)
{
struct bch_fs *c = trans->c;
- struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev);
- struct bpos pos = POS(p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr));
- struct btree_iter *iter;
- struct bkey_s_c k_a;
- struct bkey_alloc_unpacked u;
- struct bkey_i_alloc *a;
+ struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
+ struct bpos pos = POS(ptr->dev, PTR_BUCKET_NR(ca, ptr));
struct bucket *g;
+ struct btree_iter *iter;
+ struct bkey_s_c k;
int ret;
- iter = trans_get_update(trans, BTREE_ID_ALLOC, pos, &k_a);
+ iter = trans_get_update(trans, BTREE_ID_ALLOC, pos, &k);
if (iter) {
- u = bch2_alloc_unpack(k_a);
+ *u = bch2_alloc_unpack(k);
} else {
iter = bch2_trans_get_iter(trans, BTREE_ID_ALLOC, pos,
BTREE_ITER_CACHED|
@@ -1574,16 +1610,36 @@ static int bch2_trans_mark_pointer(struct btree_trans *trans,
return PTR_ERR(iter);
ret = bch2_btree_iter_traverse(iter);
- if (ret)
- goto out;
+ if (ret) {
+ bch2_trans_iter_put(trans, iter);
+ return ret;
+ }
percpu_down_read(&c->mark_lock);
g = bucket(ca, pos.offset);
- u = alloc_mem_to_key(g, READ_ONCE(g->mark));
+ *u = alloc_mem_to_key(g, READ_ONCE(g->mark));
percpu_up_read(&c->mark_lock);
}
- ret = __mark_pointer(c, k, p, sectors, data_type, u.gen, &u.data_type,
+ *_iter = iter;
+ return 0;
+}
+
+static int bch2_trans_mark_pointer(struct btree_trans *trans,
+ struct bkey_s_c k, struct extent_ptr_decoded p,
+ s64 sectors, enum bch_data_type data_type)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_iter *iter;
+ struct bkey_alloc_unpacked u;
+ struct bkey_i_alloc *a;
+ int ret;
+
+ ret = bch2_trans_start_alloc_update(trans, &iter, &p.ptr, &u);
+ if (ret)
+ return ret;
+
+ ret = __mark_pointer(c, k, &p.ptr, sectors, data_type, u.gen, &u.data_type,
&u.dirty_sectors, &u.cached_sectors);
if (ret)
goto out;
@@ -1594,7 +1650,7 @@ static int bch2_trans_mark_pointer(struct btree_trans *trans,
goto out;
bkey_alloc_init(&a->k_i);
- a->k.p = pos;
+ a->k.p = iter->pos;
bch2_alloc_pack(a, u);
bch2_trans_update(trans, iter, &a->k_i, 0);
out:
@@ -1604,15 +1660,13 @@ out:
static int bch2_trans_mark_stripe_ptr(struct btree_trans *trans,
struct bch_extent_stripe_ptr p,
- s64 sectors, enum bch_data_type data_type,
- struct bch_replicas_padded *r,
- unsigned *nr_data,
- unsigned *nr_parity)
+ s64 sectors, enum bch_data_type data_type)
{
struct bch_fs *c = trans->c;
struct btree_iter *iter;
struct bkey_s_c k;
struct bkey_i_stripe *s;
+ struct bch_replicas_padded r;
int ret = 0;
ret = trans_get_key(trans, BTREE_ID_EC, POS(0, p.idx), &iter, &k);
@@ -1633,15 +1687,14 @@ static int bch2_trans_mark_stripe_ptr(struct btree_trans *trans,
goto out;
bkey_reassemble(&s->k_i, k);
-
stripe_blockcount_set(&s->v, p.block,
stripe_blockcount_get(&s->v, p.block) +
sectors);
-
- *nr_data = s->v.nr_blocks - s->v.nr_redundant;
- *nr_parity = s->v.nr_redundant;
- bch2_bkey_to_replicas(&r->e, bkey_i_to_s_c(&s->k_i));
bch2_trans_update(trans, iter, &s->k_i, 0);
+
+ bch2_bkey_to_replicas(&r.e, bkey_i_to_s_c(&s->k_i));
+ r.e.data_type = data_type;
+ update_replicas_list(trans, &r.e, sectors);
out:
bch2_trans_iter_put(trans, iter);
return ret;
@@ -1686,25 +1739,11 @@ static int bch2_trans_mark_extent(struct btree_trans *trans,
dirty_sectors += disk_sectors;
r.e.devs[r.e.nr_devs++] = p.ptr.dev;
} else {
- struct bch_replicas_padded ec_r;
- unsigned nr_data, nr_parity;
- s64 parity_sectors;
-
ret = bch2_trans_mark_stripe_ptr(trans, p.ec,
- disk_sectors, data_type,
- &ec_r, &nr_data, &nr_parity);
+ disk_sectors, data_type);
if (ret)
return ret;
- parity_sectors =
- __ptr_disk_sectors_delta(p.crc.live_size,
- offset, sectors, flags,
- p.crc.compressed_size * nr_parity,
- p.crc.uncompressed_size * nr_data);
-
- update_replicas_list(trans, &ec_r.e,
- disk_sectors + parity_sectors);
-
r.e.nr_required = 0;
}
}
@@ -1715,6 +1754,76 @@ static int bch2_trans_mark_extent(struct btree_trans *trans,
return 0;
}
+static int bch2_trans_mark_stripe(struct btree_trans *trans,
+ struct bkey_s_c k,
+ unsigned flags)
+{
+ const struct bch_stripe *s = bkey_s_c_to_stripe(k).v;
+ unsigned nr_data = s->nr_blocks - s->nr_redundant;
+ struct bch_replicas_padded r;
+ struct bkey_alloc_unpacked u;
+ struct bkey_i_alloc *a;
+ struct btree_iter *iter;
+ bool deleting = flags & BTREE_TRIGGER_OVERWRITE;
+ s64 sectors = le16_to_cpu(s->sectors);
+ unsigned i;
+ int ret = 0;
+
+ if (deleting)
+ sectors = -sectors;
+
+ bch2_bkey_to_replicas(&r.e, k);
+ update_replicas_list(trans, &r.e, sectors * s->nr_redundant);
+
+ /*
+ * The allocator code doesn't necessarily update bucket gens in the
+ * btree when incrementing them, right before handing out new buckets -
+ * we just need to persist those updates here along with the new stripe:
+ */
+
+ for (i = 0; i < s->nr_blocks && !ret; i++) {
+ bool parity = i >= nr_data;
+
+ ret = bch2_trans_start_alloc_update(trans, &iter,
+ &s->ptrs[i], &u);
+ if (ret)
+ break;
+
+ if (parity) {
+ u.dirty_sectors += sectors;
+ u.data_type = u.dirty_sectors
+ ? BCH_DATA_parity
+ : 0;
+ }
+
+ a = bch2_trans_kmalloc(trans, BKEY_ALLOC_U64s_MAX * 8);
+ ret = PTR_ERR_OR_ZERO(a);
+ if (ret)
+ goto put_iter;
+
+ bkey_alloc_init(&a->k_i);
+ a->k.p = iter->pos;
+ bch2_alloc_pack(a, u);
+ bch2_trans_update(trans, iter, &a->k_i, 0);
+put_iter:
+ bch2_trans_iter_put(trans, iter);
+ }
+
+ return ret;
+}
+
+static __le64 *bkey_refcount(struct bkey_i *k)
+{
+ switch (k->k.type) {
+ case KEY_TYPE_reflink_v:
+ return &bkey_i_to_reflink_v(k)->v.refcount;
+ case KEY_TYPE_indirect_inline_data:
+ return &bkey_i_to_indirect_inline_data(k)->v.refcount;
+ default:
+ return NULL;
+ }
+}
+
static int __bch2_trans_mark_reflink_p(struct btree_trans *trans,
struct bkey_s_c_reflink_p p,
u64 idx, unsigned sectors,
@@ -1723,7 +1832,8 @@ static int __bch2_trans_mark_reflink_p(struct btree_trans *trans,
struct bch_fs *c = trans->c;
struct btree_iter *iter;
struct bkey_s_c k;
- struct bkey_i_reflink_v *r_v;
+ struct bkey_i *n;
+ __le64 *refcount;
s64 ret;
ret = trans_get_key(trans, BTREE_ID_REFLINK,
@@ -1731,14 +1841,6 @@ static int __bch2_trans_mark_reflink_p(struct btree_trans *trans,
if (ret < 0)
return ret;
- if (k.k->type != KEY_TYPE_reflink_v) {
- bch2_fs_inconsistent(c,
- "%llu:%llu len %u points to nonexistent indirect extent %llu",
- p.k->p.inode, p.k->p.offset, p.k->size, idx);
- ret = -EIO;
- goto err;
- }
-
if ((flags & BTREE_TRIGGER_OVERWRITE) &&
(bkey_start_offset(k.k) < idx ||
k.k->p.offset > idx + sectors))
@@ -1746,25 +1848,33 @@ static int __bch2_trans_mark_reflink_p(struct btree_trans *trans,
sectors = k.k->p.offset - idx;
- r_v = bch2_trans_kmalloc(trans, bkey_bytes(k.k));
- ret = PTR_ERR_OR_ZERO(r_v);
+ n = bch2_trans_kmalloc(trans, bkey_bytes(k.k));
+ ret = PTR_ERR_OR_ZERO(n);
if (ret)
goto err;
- bkey_reassemble(&r_v->k_i, k);
+ bkey_reassemble(n, k);
- le64_add_cpu(&r_v->v.refcount,
- !(flags & BTREE_TRIGGER_OVERWRITE) ? 1 : -1);
+ refcount = bkey_refcount(n);
+ if (!refcount) {
+ bch2_fs_inconsistent(c,
+ "%llu:%llu len %u points to nonexistent indirect extent %llu",
+ p.k->p.inode, p.k->p.offset, p.k->size, idx);
+ ret = -EIO;
+ goto err;
+ }
+
+ le64_add_cpu(refcount, !(flags & BTREE_TRIGGER_OVERWRITE) ? 1 : -1);
- if (!r_v->v.refcount) {
- r_v->k.type = KEY_TYPE_deleted;
- set_bkey_val_u64s(&r_v->k, 0);
+ if (!*refcount) {
+ n->k.type = KEY_TYPE_deleted;
+ set_bkey_val_u64s(&n->k, 0);
}
bch2_btree_iter_set_pos(iter, bkey_start_pos(k.k));
BUG_ON(iter->uptodate > BTREE_ITER_NEED_PEEK);
- bch2_trans_update(trans, iter, &r_v->k_i, 0);
+ bch2_trans_update(trans, iter, n, 0);
out:
ret = sectors;
err:
@@ -1814,6 +1924,8 @@ int bch2_trans_mark_key(struct btree_trans *trans, struct bkey_s_c k,
case KEY_TYPE_reflink_v:
return bch2_trans_mark_extent(trans, k, offset, sectors,
flags, BCH_DATA_user);
+ case KEY_TYPE_stripe:
+ return bch2_trans_mark_stripe(trans, k, flags);
case KEY_TYPE_inode:
d = replicas_deltas_realloc(trans, 0);
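
Several of the buckets.c checks consolidated into check_bucket_ref() above compare 8-bit bucket and pointer generations that are allowed to wrap. The small userspace sketch below shows that wrapping comparison; gen_cmp() here is written from scratch and BUCKET_GC_GEN_MAX is assumed to be the 96 visible in the removed ">= 96U" check, so treat both as illustrative rather than the tree's exact definitions.

#include <stdint.h>
#include <stdio.h>

/* Assumed limit, matching the literal 96 in the code the hunk replaces. */
#define BUCKET_GC_GEN_MAX	96

/*
 * Wrapping 8-bit generation comparison: casting the unsigned difference to a
 * signed byte keeps the result correct across wraparound as long as the real
 * distance between the two generations stays below 128.
 */
static int gen_cmp(uint8_t a, uint8_t b)
{
	return (int8_t) (a - b);
}

int main(void)
{
	uint8_t bucket_gen = 3, ptr_gen = 250;	/* bucket gen has wrapped past the pointer's */

	printf("cmp = %d\n", gen_cmp(bucket_gen, ptr_gen));	/* 9, not -247 */
	printf("stale = %d\n", gen_cmp(bucket_gen, ptr_gen) > BUCKET_GC_GEN_MAX);	/* 0 */
	return 0;
}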
diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h
index 653f6761862e..a3873becbb70 100644
--- a/fs/bcachefs/buckets.h
+++ b/fs/bcachefs/buckets.h
@@ -58,12 +58,6 @@ static inline struct bucket *bucket(struct bch_dev *ca, size_t b)
return __bucket(ca, b, false);
}
-static inline void bucket_io_clock_reset(struct bch_fs *c, struct bch_dev *ca,
- size_t b, int rw)
-{
- bucket(ca, b)->io_time[rw] = c->bucket_clock[rw].hand;
-}
-
static inline u16 bucket_last_io(struct bch_fs *c, struct bucket *g, int rw)
{
return c->bucket_clock[rw].hand - g->io_time[rw];
diff --git a/fs/bcachefs/buckets_types.h b/fs/bcachefs/buckets_types.h
index d5215b14d7d9..d6057d22b18e 100644
--- a/fs/bcachefs/buckets_types.h
+++ b/fs/bcachefs/buckets_types.h
@@ -41,6 +41,7 @@ struct bucket {
u8 oldest_gen;
u8 gc_gen;
unsigned gen_valid:1;
+ u8 ec_redundancy;
};
struct bucket_array {
@@ -125,6 +126,7 @@ struct disk_reservation {
struct copygc_heap_entry {
u8 dev;
u8 gen;
+ u8 replicas;
u16 fragmentation;
u32 sectors;
u64 offset;
diff --git a/fs/bcachefs/checksum.c b/fs/bcachefs/checksum.c
index a01073e54a33..3d88719ba86c 100644
--- a/fs/bcachefs/checksum.c
+++ b/fs/bcachefs/checksum.c
@@ -10,7 +10,7 @@
#include <linux/random.h>
#include <linux/scatterlist.h>
#include <crypto/algapi.h>
-#include <crypto/chacha20.h>
+#include <crypto/chacha.h>
#include <crypto/hash.h>
#include <crypto/poly1305.h>
#include <crypto/skcipher.h>
@@ -68,21 +68,21 @@ static u64 bch2_checksum_update(unsigned type, u64 crc, const void *data, size_t
}
}
-static inline void do_encrypt_sg(struct crypto_skcipher *tfm,
+static inline void do_encrypt_sg(struct crypto_sync_skcipher *tfm,
struct nonce nonce,
struct scatterlist *sg, size_t len)
{
- SKCIPHER_REQUEST_ON_STACK(req, tfm);
+ SYNC_SKCIPHER_REQUEST_ON_STACK(req, tfm);
int ret;
- skcipher_request_set_tfm(req, tfm);
+ skcipher_request_set_sync_tfm(req, tfm);
skcipher_request_set_crypt(req, sg, sg, len, nonce.d);
ret = crypto_skcipher_encrypt(req);
BUG_ON(ret);
}
-static inline void do_encrypt(struct crypto_skcipher *tfm,
+static inline void do_encrypt(struct crypto_sync_skcipher *tfm,
struct nonce nonce,
void *buf, size_t len)
{
@@ -95,8 +95,8 @@ static inline void do_encrypt(struct crypto_skcipher *tfm,
int bch2_chacha_encrypt_key(struct bch_key *key, struct nonce nonce,
void *buf, size_t len)
{
- struct crypto_skcipher *chacha20 =
- crypto_alloc_skcipher("chacha20", 0, 0);
+ struct crypto_sync_skcipher *chacha20 =
+ crypto_alloc_sync_skcipher("chacha20", 0, 0);
int ret;
if (!chacha20) {
@@ -104,7 +104,8 @@ int bch2_chacha_encrypt_key(struct bch_key *key, struct nonce nonce,
return PTR_ERR(chacha20);
}
- ret = crypto_skcipher_setkey(chacha20, (void *) key, sizeof(*key));
+ ret = crypto_skcipher_setkey(&chacha20->base,
+ (void *) key, sizeof(*key));
if (ret) {
pr_err("crypto_skcipher_setkey() error: %i", ret);
goto err;
@@ -112,7 +113,7 @@ int bch2_chacha_encrypt_key(struct bch_key *key, struct nonce nonce,
do_encrypt(chacha20, nonce, buf, len);
err:
- crypto_free_skcipher(chacha20);
+ crypto_free_sync_skcipher(chacha20);
return ret;
}
@@ -199,7 +200,7 @@ static struct bch_csum __bch2_checksum_bio(struct bch_fs *c, unsigned type,
kunmap_atomic(p);
}
#else
- __bio_for_each_contig_segment(bv, bio, *iter, *iter)
+ __bio_for_each_bvec(bv, bio, *iter, *iter)
crc = bch2_checksum_update(type, crc,
page_address(bv.bv_page) + bv.bv_offset,
bv.bv_len);
@@ -224,7 +225,7 @@ static struct bch_csum __bch2_checksum_bio(struct bch_fs *c, unsigned type,
kunmap_atomic(p);
}
#else
- __bio_for_each_contig_segment(bv, bio, *iter, *iter)
+ __bio_for_each_bvec(bv, bio, *iter, *iter)
crypto_shash_update(desc,
page_address(bv.bv_page) + bv.bv_offset,
bv.bv_len);
@@ -463,7 +464,7 @@ err:
static int bch2_alloc_ciphers(struct bch_fs *c)
{
if (!c->chacha20)
- c->chacha20 = crypto_alloc_skcipher("chacha20", 0, 0);
+ c->chacha20 = crypto_alloc_sync_skcipher("chacha20", 0, 0);
if (IS_ERR(c->chacha20)) {
bch_err(c, "error requesting chacha20 module: %li",
PTR_ERR(c->chacha20));
@@ -546,7 +547,7 @@ int bch2_enable_encryption(struct bch_fs *c, bool keyed)
goto err;
}
- ret = crypto_skcipher_setkey(c->chacha20,
+ ret = crypto_skcipher_setkey(&c->chacha20->base,
(void *) &key.key, sizeof(key.key));
if (ret)
goto err;
@@ -574,7 +575,7 @@ void bch2_fs_encryption_exit(struct bch_fs *c)
if (!IS_ERR_OR_NULL(c->poly1305))
crypto_free_shash(c->poly1305);
if (!IS_ERR_OR_NULL(c->chacha20))
- crypto_free_skcipher(c->chacha20);
+ crypto_free_sync_skcipher(c->chacha20);
if (!IS_ERR_OR_NULL(c->sha256))
crypto_free_shash(c->sha256);
}
@@ -606,7 +607,7 @@ int bch2_fs_encryption_init(struct bch_fs *c)
if (ret)
goto out;
- ret = crypto_skcipher_setkey(c->chacha20,
+ ret = crypto_skcipher_setkey(&c->chacha20->base,
(void *) &key.key, sizeof(key.key));
if (ret)
goto out;
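
The checksum.c hunks above move from the plain skcipher API to the synchronous-only crypto_sync_skcipher wrapper: the request lives on the stack and the key is set through the embedded base tfm. The sketch below is a self-contained usage example, not code from the commit; example_chacha20_encrypt() and its parameters are invented, while the crypto_* and skcipher_* calls are the real kernel interfaces.

#include <crypto/skcipher.h>
#include <linux/scatterlist.h>
#include <linux/types.h>
#include <linux/err.h>

/* Invented helper: one-shot in-place ChaCha20 encryption via the sync API. */
static int example_chacha20_encrypt(const u8 *key, unsigned int keylen,
				    u8 *iv, void *buf, unsigned int len)
{
	struct crypto_sync_skcipher *tfm;
	struct scatterlist sg;
	int ret;

	tfm = crypto_alloc_sync_skcipher("chacha20", 0, 0);
	if (IS_ERR(tfm))
		return PTR_ERR(tfm);

	/* The key is set on the embedded base tfm, as in the hunks above: */
	ret = crypto_skcipher_setkey(&tfm->base, key, keylen);
	if (ret)
		goto out;

	sg_init_one(&sg, buf, len);

	{
		/* Request allocated on the stack, zero-initialized by the macro: */
		SYNC_SKCIPHER_REQUEST_ON_STACK(req, tfm);

		skcipher_request_set_sync_tfm(req, tfm);
		skcipher_request_set_crypt(req, &sg, &sg, len, iv);
		ret = crypto_skcipher_encrypt(req);
	}
out:
	crypto_free_sync_skcipher(tfm);
	return ret;
}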
diff --git a/fs/bcachefs/checksum.h b/fs/bcachefs/checksum.h
index 833537cc8fd0..24dee8039d57 100644
--- a/fs/bcachefs/checksum.h
+++ b/fs/bcachefs/checksum.h
@@ -7,7 +7,7 @@
#include "super-io.h"
#include <linux/crc64.h>
-#include <crypto/chacha20.h>
+#include <crypto/chacha.h>
static inline bool bch2_checksum_mergeable(unsigned type)
{
@@ -138,9 +138,9 @@ static inline bool bch2_crc_cmp(struct bch_csum l, struct bch_csum r)
/* for skipping ahead and encrypting/decrypting at an offset: */
static inline struct nonce nonce_add(struct nonce nonce, unsigned offset)
{
- EBUG_ON(offset & (CHACHA20_BLOCK_SIZE - 1));
+ EBUG_ON(offset & (CHACHA_BLOCK_SIZE - 1));
- le32_add_cpu(&nonce.d[0], offset / CHACHA20_BLOCK_SIZE);
+ le32_add_cpu(&nonce.d[0], offset / CHACHA_BLOCK_SIZE);
return nonce;
}
diff --git a/fs/bcachefs/compress.c b/fs/bcachefs/compress.c
index 47838fd2db06..aebf46bb1d21 100644
--- a/fs/bcachefs/compress.c
+++ b/fs/bcachefs/compress.c
@@ -45,7 +45,7 @@ static bool bio_phys_contig(struct bio *bio, struct bvec_iter start)
struct bvec_iter iter;
void *expected_start = NULL;
- __bio_for_each_segment(bv, bio, iter, start) {
+ __bio_for_each_bvec(bv, bio, iter, start) {
if (expected_start &&
expected_start != page_address(bv.bv_page) + bv.bv_offset)
return false;
@@ -70,7 +70,7 @@ static struct bbuf __bio_map_or_bounce(struct bch_fs *c, struct bio *bio,
BUG_ON(bvec_iter_sectors(start) > c->sb.encoded_extent_max);
- if (!IS_ENABLED(CONFIG_HIGHMEM) &&
+ if (!PageHighMem(bio_iter_page(bio, start)) &&
bio_phys_contig(bio, start))
return (struct bbuf) {
.b = page_address(bio_iter_page(bio, start)) +
diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c
index aa10591a3b1a..bbe3fefa2651 100644
--- a/fs/bcachefs/debug.c
+++ b/fs/bcachefs/debug.c
@@ -54,7 +54,7 @@ void __bch2_btree_verify(struct bch_fs *c, struct btree *b)
v->written = 0;
v->c.level = b->c.level;
v->c.btree_id = b->c.btree_id;
- bch2_btree_keys_init(v, &c->expensive_debug_checks);
+ bch2_btree_keys_init(v);
if (bch2_bkey_pick_read_device(c, bkey_i_to_s_c(&b->key),
NULL, &pick) <= 0)
diff --git a/fs/bcachefs/debug.h b/fs/bcachefs/debug.h
index 56c2d1ab5f63..7ac1615e9447 100644
--- a/fs/bcachefs/debug.h
+++ b/fs/bcachefs/debug.h
@@ -8,44 +8,15 @@ struct bio;
struct btree;
struct bch_fs;
-#define BCH_DEBUG_PARAM(name, description) extern bool bch2_##name;
-BCH_DEBUG_PARAMS()
-#undef BCH_DEBUG_PARAM
-
-#define BCH_DEBUG_PARAM(name, description) \
- static inline bool name(struct bch_fs *c) \
- { return bch2_##name || c->name; }
-BCH_DEBUG_PARAMS_ALWAYS()
-#undef BCH_DEBUG_PARAM
-
#ifdef CONFIG_BCACHEFS_DEBUG
-
-#define BCH_DEBUG_PARAM(name, description) \
- static inline bool name(struct bch_fs *c) \
- { return bch2_##name || c->name; }
-BCH_DEBUG_PARAMS_DEBUG()
-#undef BCH_DEBUG_PARAM
-
void __bch2_btree_verify(struct bch_fs *, struct btree *);
-
-#define bypass_torture_test(d) ((d)->bypass_torture_test)
-
-#else /* DEBUG */
-
-#define BCH_DEBUG_PARAM(name, description) \
- static inline bool name(struct bch_fs *c) { return false; }
-BCH_DEBUG_PARAMS_DEBUG()
-#undef BCH_DEBUG_PARAM
-
+#else
static inline void __bch2_btree_verify(struct bch_fs *c, struct btree *b) {}
-
-#define bypass_torture_test(d) 0
-
#endif
static inline void bch2_btree_verify(struct bch_fs *c, struct btree *b)
{
- if (verify_btree_ondisk(c))
+ if (bch2_verify_btree_ondisk)
__bch2_btree_verify(c, b);
}
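
The debug.h hunks above drop the BCH_DEBUG_PARAM wrappers that consulted a per-filesystem copy of each debug knob, leaving callers to read the global bch2_* booleans directly. For readers unfamiliar with the construct being removed, it is the usual X-macro pattern: one list macro expanded repeatedly with different per-entry definitions. A hedged, self-contained sketch with an abbreviated parameter list (the real list lives elsewhere in the tree):

#include <stdbool.h>

/* Illustrative list of knobs; the names are borrowed from this diff. */
#define DEBUG_PARAMS()							\
	DEBUG_PARAM(verify_btree_ondisk,   "verify btree nodes on read")	\
	DEBUG_PARAM(expensive_debug_checks, "extra runtime assertions")

/* Expansion 1 - declarations, as a header would emit them: */
#define DEBUG_PARAM(name, description) extern bool bch2_##name;
DEBUG_PARAMS()
#undef DEBUG_PARAM

/* Expansion 2 - definitions, emitted once in a single .c file: */
#define DEBUG_PARAM(name, description) bool bch2_##name;
DEBUG_PARAMS()
#undef DEBUG_PARAM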
diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c
index 5514f65378ad..d7ba0e7fc3b3 100644
--- a/fs/bcachefs/ec.c
+++ b/fs/bcachefs/ec.c
@@ -343,12 +343,17 @@ static void ec_block_io(struct bch_fs *c, struct ec_stripe_buf *buf,
unsigned offset = 0, bytes = buf->size << 9;
struct bch_extent_ptr *ptr = &v->ptrs[idx];
struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
+ enum bch_data_type data_type = idx < buf->key.v.nr_blocks - buf->key.v.nr_redundant
+ ? BCH_DATA_user
+ : BCH_DATA_parity;
if (!bch2_dev_get_ioref(ca, rw)) {
clear_bit(idx, buf->valid);
return;
}
+ this_cpu_add(ca->io_done->sectors[rw][data_type], buf->size);
+
while (offset < bytes) {
unsigned nr_iovecs = min_t(size_t, BIO_MAX_PAGES,
DIV_ROUND_UP(bytes, PAGE_SIZE));
@@ -670,6 +675,7 @@ static void ec_stripe_delete_work(struct work_struct *work)
/* stripe creation: */
static int ec_stripe_bkey_insert(struct bch_fs *c,
+ struct ec_stripe_new *s,
struct bkey_i_stripe *stripe)
{
struct btree_trans trans;
@@ -711,7 +717,7 @@ found_slot:
bch2_trans_update(&trans, iter, &stripe->k_i, 0);
- ret = bch2_trans_commit(&trans, NULL, NULL,
+ ret = bch2_trans_commit(&trans, &s->res, NULL,
BTREE_INSERT_NOFAIL);
err:
bch2_trans_iter_put(&trans, iter);
@@ -858,8 +864,8 @@ static void ec_stripe_create(struct ec_stripe_new *s)
ret = s->existing_stripe
? bch2_btree_insert(c, BTREE_ID_EC, &s->stripe.key.k_i,
- NULL, NULL, BTREE_INSERT_NOFAIL)
- : ec_stripe_bkey_insert(c, &s->stripe.key);
+ &s->res, NULL, BTREE_INSERT_NOFAIL)
+ : ec_stripe_bkey_insert(c, s, &s->stripe.key);
if (ret) {
bch_err(c, "error creating stripe: error creating stripe key");
goto err_put_writes;
@@ -886,6 +892,8 @@ static void ec_stripe_create(struct ec_stripe_new *s)
err_put_writes:
percpu_ref_put(&c->writes);
err:
+ bch2_disk_reservation_put(c, &s->res);
+
open_bucket_for_each(c, &s->blocks, ob, i) {
ob->ec = NULL;
__bch2_open_bucket_put(c, ob);
@@ -1325,6 +1333,7 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct bch_fs *c,
struct open_bucket *ob;
unsigned i, data_idx = 0;
s64 idx;
+ int ret;
closure_init_stack(&cl);
@@ -1356,6 +1365,22 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct bch_fs *c,
}
}
+ if (!h->s->existing_stripe &&
+ !h->s->res.sectors) {
+ ret = bch2_disk_reservation_get(c, &h->s->res,
+ h->blocksize,
+ h->s->nr_parity, 0);
+ if (ret) {
+ /* What should we do here? */
+ bch_err(c, "unable to create new stripe: %i", ret);
+ bch2_ec_stripe_head_put(c, h);
+ h = NULL;
+ goto out;
+
+ }
+
+ }
+
if (new_stripe_alloc_buckets(c, h)) {
bch2_ec_stripe_head_put(c, h);
h = NULL;
@@ -1448,7 +1473,7 @@ static int __bch2_stripe_write_key(struct btree_trans *trans,
return 0;
}
-int bch2_stripes_write(struct bch_fs *c, unsigned flags, bool *wrote)
+int bch2_stripes_write(struct bch_fs *c, unsigned flags)
{
struct btree_trans trans;
struct btree_iter *iter;
@@ -1476,8 +1501,6 @@ int bch2_stripes_write(struct bch_fs *c, unsigned flags, bool *wrote)
if (ret)
break;
-
- *wrote = true;
}
bch2_trans_exit(&trans);
@@ -1497,7 +1520,6 @@ static int bch2_stripes_read_fn(struct bch_fs *c, enum btree_id id,
ret = __ec_stripe_mem_alloc(c, k.k->p.offset, GFP_KERNEL) ?:
bch2_mark_key(c, k, 0, 0, NULL, 0,
- BTREE_TRIGGER_ALLOC_READ|
BTREE_TRIGGER_NOATOMIC);
if (ret)
return ret;
@@ -1564,7 +1586,7 @@ void bch2_stripes_heap_to_text(struct printbuf *out, struct bch_fs *c)
size_t i;
spin_lock(&c->ec_stripes_heap_lock);
- for (i = 0; i < min(h->used, 20UL); i++) {
+ for (i = 0; i < min_t(size_t, h->used, 20); i++) {
m = genradix_ptr(&c->stripes[0], h->data[i].idx);
pr_buf(out, "%zu %u/%u+%u\n", h->data[i].idx,
diff --git a/fs/bcachefs/ec.h b/fs/bcachefs/ec.h
index f8fc3d616cd7..15f751fc2a35 100644
--- a/fs/bcachefs/ec.h
+++ b/fs/bcachefs/ec.h
@@ -3,6 +3,7 @@
#define _BCACHEFS_EC_H
#include "ec_types.h"
+#include "buckets_types.h"
#include "keylist_types.h"
const char *bch2_stripe_invalid(const struct bch_fs *, struct bkey_s_c);
@@ -105,6 +106,7 @@ struct ec_stripe_new {
struct open_buckets blocks;
u8 data_block_idx[EC_STRIPE_MAX];
struct open_buckets parity;
+ struct disk_reservation res;
struct keylist keys;
u64 inline_keys[BKEY_U64s * 8];
@@ -156,7 +158,7 @@ void bch2_ec_flush_new_stripes(struct bch_fs *);
struct journal_keys;
int bch2_stripes_read(struct bch_fs *, struct journal_keys *);
-int bch2_stripes_write(struct bch_fs *, unsigned, bool *);
+int bch2_stripes_write(struct bch_fs *, unsigned);
int bch2_ec_mem_alloc(struct bch_fs *, bool);
diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c
index 568f039edcff..7fae6a4ba26f 100644
--- a/fs/bcachefs/extents.c
+++ b/fs/bcachefs/extents.c
@@ -89,7 +89,7 @@ static inline bool ptr_better(struct bch_fs *c,
return bch2_rand_range(l1 + l2) > l1;
}
- if (force_reconstruct_read(c))
+ if (bch2_force_reconstruct_read)
return p1.idx > p2.idx;
return p1.idx < p2.idx;
@@ -137,7 +137,7 @@ int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k,
!bch2_dev_is_readable(ca))
p.idx++;
- if (force_reconstruct_read(c) &&
+ if (bch2_force_reconstruct_read &&
!p.idx && p.has_ec)
p.idx++;
@@ -1200,14 +1200,14 @@ int bch2_cut_front_s(struct bpos where, struct bkey_s k)
le64_add_cpu(&p.v->idx, sub);
break;
}
- case KEY_TYPE_inline_data: {
- struct bkey_s_inline_data d = bkey_s_to_inline_data(k);
+ case KEY_TYPE_inline_data:
+ case KEY_TYPE_indirect_inline_data: {
+ void *p = bkey_inline_data_p(k);
+ unsigned bytes = bkey_inline_data_bytes(k.k);
- sub = min_t(u64, sub << 9, bkey_val_bytes(d.k));
+ sub = min_t(u64, sub << 9, bytes);
- memmove(d.v->data,
- d.v->data + sub,
- bkey_val_bytes(d.k) - sub);
+ memmove(p, p + sub, bytes - sub);
new_val_u64s -= sub >> 3;
break;
@@ -1245,7 +1245,9 @@ int bch2_cut_back_s(struct bpos where, struct bkey_s k)
switch (k.k->type) {
case KEY_TYPE_inline_data:
- new_val_u64s = min(new_val_u64s, k.k->size << 6);
+ case KEY_TYPE_indirect_inline_data:
+ new_val_u64s = (bkey_inline_data_offset(k.k) +
+ min(bkey_inline_data_bytes(k.k), k.k->size << 9)) >> 3;
break;
}
diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h
index 29b15365d19c..74c7bb8f9104 100644
--- a/fs/bcachefs/extents.h
+++ b/fs/bcachefs/extents.h
@@ -445,10 +445,35 @@ static inline bool bkey_extent_is_direct_data(const struct bkey *k)
}
}
+static inline bool bkey_extent_is_inline_data(const struct bkey *k)
+{
+ return k->type == KEY_TYPE_inline_data ||
+ k->type == KEY_TYPE_indirect_inline_data;
+}
+
+static inline unsigned bkey_inline_data_offset(const struct bkey *k)
+{
+ switch (k->type) {
+ case KEY_TYPE_inline_data:
+ return sizeof(struct bch_inline_data);
+ case KEY_TYPE_indirect_inline_data:
+ return sizeof(struct bch_indirect_inline_data);
+ default:
+ BUG();
+ }
+}
+
+static inline unsigned bkey_inline_data_bytes(const struct bkey *k)
+{
+ return bkey_val_bytes(k) - bkey_inline_data_offset(k);
+}
+
+#define bkey_inline_data_p(_k) (((void *) (_k).v) + bkey_inline_data_offset((_k).k))
+
static inline bool bkey_extent_is_data(const struct bkey *k)
{
- return bkey_extent_is_direct_data(k) ||
- k->type == KEY_TYPE_inline_data ||
+ return bkey_extent_is_direct_data(k) ||
+ bkey_extent_is_inline_data(k) ||
k->type == KEY_TYPE_reflink_p;
}
@@ -463,6 +488,7 @@ static inline bool bkey_extent_is_allocation(const struct bkey *k)
case KEY_TYPE_reflink_p:
case KEY_TYPE_reflink_v:
case KEY_TYPE_inline_data:
+ case KEY_TYPE_indirect_inline_data:
return true;
default:
return false;
diff --git a/fs/bcachefs/fs-common.c b/fs/bcachefs/fs-common.c
index 878419d40992..503ce1920f39 100644
--- a/fs/bcachefs/fs-common.c
+++ b/fs/bcachefs/fs-common.c
@@ -34,9 +34,7 @@ int bch2_create_trans(struct btree_trans *trans, u64 dir_inum,
if (!name)
new_inode->bi_flags |= BCH_INODE_UNLINKED;
- ret = bch2_inode_create(trans, new_inode,
- BLOCKDEV_INODE_MAX, 0,
- &c->unused_inode_hint);
+ ret = bch2_inode_create(trans, new_inode);
if (ret)
goto err;
diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c
index 2d08263f3a42..1eb69ed38b10 100644
--- a/fs/bcachefs/fs-io.c
+++ b/fs/bcachefs/fs-io.c
@@ -26,6 +26,7 @@
#include <linux/migrate.h>
#include <linux/mmu_context.h>
#include <linux/pagevec.h>
+#include <linux/rmap.h>
#include <linux/sched/signal.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/uio.h>
@@ -264,28 +265,13 @@ static inline struct bch_page_state *bch2_page_state(struct page *page)
/* for newly allocated pages: */
static void __bch2_page_state_release(struct page *page)
{
- struct bch_page_state *s = __bch2_page_state(page);
-
- if (!s)
- return;
-
- ClearPagePrivate(page);
- set_page_private(page, 0);
- put_page(page);
- kfree(s);
+ kfree(detach_page_private(page));
}
static void bch2_page_state_release(struct page *page)
{
- struct bch_page_state *s = bch2_page_state(page);
-
- if (!s)
- return;
-
- ClearPagePrivate(page);
- set_page_private(page, 0);
- put_page(page);
- kfree(s);
+ EBUG_ON(!PageLocked(page));
+ __bch2_page_state_release(page);
}
/* for newly allocated pages: */
@@ -299,13 +285,7 @@ static struct bch_page_state *__bch2_page_state_create(struct page *page,
return NULL;
spin_lock_init(&s->lock);
- /*
- * migrate_page_move_mapping() assumes that pages with private data
- * have their count elevated by 1.
- */
- get_page(page);
- set_page_private(page, (unsigned long) s);
- SetPagePrivate(page);
+ attach_page_private(page, s);
return s;
}
@@ -603,18 +583,12 @@ int bch2_migrate_page(struct address_space *mapping, struct page *newpage,
EBUG_ON(!PageLocked(page));
EBUG_ON(!PageLocked(newpage));
- ret = migrate_page_move_mapping(mapping, newpage, page, NULL, mode, 0);
+ ret = migrate_page_move_mapping(mapping, newpage, page, 0);
if (ret != MIGRATEPAGE_SUCCESS)
return ret;
- if (PagePrivate(page)) {
- ClearPagePrivate(page);
- get_page(newpage);
- set_page_private(newpage, page_private(page));
- set_page_private(page, 0);
- put_page(page);
- SetPagePrivate(newpage);
- }
+ if (PagePrivate(page))
+ attach_page_private(newpage, detach_page_private(page));
if (mode != MIGRATE_SYNC_NO_COPY)
migrate_page_copy(newpage, page);
@@ -628,10 +602,10 @@ int bch2_migrate_page(struct address_space *mapping, struct page *newpage,
static void bch2_readpages_end_io(struct bio *bio)
{
+ struct bvec_iter_all iter;
struct bio_vec *bv;
- unsigned i;
- bio_for_each_segment_all(bv, bio, i) {
+ bio_for_each_segment_all(bv, bio, iter) {
struct page *page = bv->bv_page;
if (!bio->bi_status) {
@@ -646,41 +620,33 @@ static void bch2_readpages_end_io(struct bio *bio)
bio_put(bio);
}
-static inline void page_state_init_for_read(struct page *page)
-{
- SetPagePrivate(page);
- page->private = 0;
-}
-
struct readpages_iter {
struct address_space *mapping;
struct page **pages;
unsigned nr_pages;
- unsigned nr_added;
unsigned idx;
pgoff_t offset;
};
static int readpages_iter_init(struct readpages_iter *iter,
- struct address_space *mapping,
- struct list_head *pages, unsigned nr_pages)
+ struct readahead_control *ractl)
{
+ unsigned i, nr_pages = readahead_count(ractl);
+
memset(iter, 0, sizeof(*iter));
- iter->mapping = mapping;
- iter->offset = list_last_entry(pages, struct page, lru)->index;
+ iter->mapping = ractl->mapping;
+ iter->offset = readahead_index(ractl);
+ iter->nr_pages = nr_pages;
iter->pages = kmalloc_array(nr_pages, sizeof(struct page *), GFP_NOFS);
if (!iter->pages)
return -ENOMEM;
- while (!list_empty(pages)) {
- struct page *page = list_last_entry(pages, struct page, lru);
-
- __bch2_page_state_create(page, __GFP_NOFAIL);
-
- iter->pages[iter->nr_pages++] = page;
- list_del(&page->lru);
+ __readahead_batch(ractl, iter->pages, nr_pages);
+ for (i = 0; i < nr_pages; i++) {
+ __bch2_page_state_create(iter->pages[i], __GFP_NOFAIL);
+ put_page(iter->pages[i]);
}
return 0;
@@ -688,41 +654,9 @@ static int readpages_iter_init(struct readpages_iter *iter,
static inline struct page *readpage_iter_next(struct readpages_iter *iter)
{
- struct page *page;
- unsigned i;
- int ret;
-
- BUG_ON(iter->idx > iter->nr_added);
- BUG_ON(iter->nr_added > iter->nr_pages);
-
- if (iter->idx < iter->nr_added)
- goto out;
-
- while (1) {
- if (iter->idx == iter->nr_pages)
- return NULL;
-
- ret = add_to_page_cache_lru_vec(iter->mapping,
- iter->pages + iter->nr_added,
- iter->nr_pages - iter->nr_added,
- iter->offset + iter->nr_added,
- GFP_NOFS);
- if (ret > 0)
- break;
-
- page = iter->pages[iter->nr_added];
- iter->idx++;
- iter->nr_added++;
-
- __bch2_page_state_release(page);
- put_page(page);
- }
-
- iter->nr_added += ret;
+ if (iter->idx >= iter->nr_pages)
+ return NULL;
- for (i = iter->idx; i < iter->nr_added; i++)
- put_page(iter->pages[i]);
-out:
EBUG_ON(iter->pages[iter->idx]->index != iter->offset + iter->idx);
return iter->pages[iter->idx];
@@ -783,11 +717,8 @@ static void readpage_bio_extend(struct readpages_iter *iter,
if (!get_more)
break;
- rcu_read_lock();
- page = radix_tree_lookup(&iter->mapping->i_pages, page_offset);
- rcu_read_unlock();
-
- if (page && !radix_tree_exceptional_entry(page))
+ page = xa_load(&iter->mapping->i_pages, page_offset);
+ if (page && !xa_is_value(page))
break;
page = __page_cache_alloc(readahead_gfp_mask(iter->mapping));
@@ -841,18 +772,19 @@ retry:
if (ret)
break;
- bkey_on_stack_reassemble(&sk, c, k);
- k = bkey_i_to_s_c(sk.k);
-
offset_into_extent = iter->pos.offset -
bkey_start_offset(k.k);
sectors = k.k->size - offset_into_extent;
+ bkey_on_stack_reassemble(&sk, c, k);
+
ret = bch2_read_indirect_extent(trans,
&offset_into_extent, &sk);
if (ret)
break;
+ k = bkey_i_to_s_c(sk.k);
+
sectors = min(sectors, k.k->size - offset_into_extent);
bch2_trans_unlock(trans);
@@ -870,7 +802,7 @@ retry:
if (bkey_extent_is_allocation(k.k))
bch2_add_page_sectors(&rbio->bio, k);
- bch2_read_extent(c, rbio, k, offset_into_extent, flags);
+ bch2_read_extent(trans, rbio, k, offset_into_extent, flags);
if (flags & BCH_READ_LAST_FRAGMENT)
break;
@@ -890,10 +822,9 @@ retry:
bkey_on_stack_exit(&sk, c);
}
-int bch2_readpages(struct file *file, struct address_space *mapping,
- struct list_head *pages, unsigned nr_pages)
+void bch2_readahead(struct readahead_control *ractl)
{
- struct bch_inode_info *inode = to_bch_ei(mapping->host);
+ struct bch_inode_info *inode = to_bch_ei(ractl->mapping->host);
struct bch_fs *c = inode->v.i_sb->s_fs_info;
struct bch_io_opts opts = io_opts(c, &inode->ei_inode);
struct btree_trans trans;
@@ -902,7 +833,7 @@ int bch2_readpages(struct file *file, struct address_space *mapping,
struct readpages_iter readpages_iter;
int ret;
- ret = readpages_iter_init(&readpages_iter, mapping, pages, nr_pages);
+ ret = readpages_iter_init(&readpages_iter, ractl);
BUG_ON(ret);
bch2_trans_init(&trans, c, 0, 0);
@@ -937,8 +868,6 @@ int bch2_readpages(struct file *file, struct address_space *mapping,
bch2_trans_exit(&trans);
kfree(readpages_iter.pages);
-
- return 0;
}
static void __bchfs_readpage(struct bch_fs *c, struct bch_read_bio *rbio,
@@ -1038,32 +967,33 @@ static void bch2_writepage_io_done(struct closure *cl)
struct bch_writepage_io, cl);
struct bch_fs *c = io->op.c;
struct bio *bio = &io->op.wbio.bio;
+ struct bvec_iter_all iter;
struct bio_vec *bvec;
- unsigned i, j;
+ unsigned i;
if (io->op.error) {
- bio_for_each_segment_all(bvec, bio, i) {
+ bio_for_each_segment_all(bvec, bio, iter) {
struct bch_page_state *s;
SetPageError(bvec->bv_page);
- mapping_set_error(io->inode->v.i_mapping, -EIO);
+ mapping_set_error(bvec->bv_page->mapping, -EIO);
s = __bch2_page_state(bvec->bv_page);
spin_lock(&s->lock);
- for (j = 0; j < PAGE_SECTORS; j++)
- s->s[j].nr_replicas = 0;
+ for (i = 0; i < PAGE_SECTORS; i++)
+ s->s[i].nr_replicas = 0;
spin_unlock(&s->lock);
}
}
if (io->op.flags & BCH_WRITE_WROTE_DATA_INLINE) {
- bio_for_each_segment_all(bvec, bio, i) {
+ bio_for_each_segment_all(bvec, bio, iter) {
struct bch_page_state *s;
s = __bch2_page_state(bvec->bv_page);
spin_lock(&s->lock);
- for (j = 0; j < PAGE_SECTORS; j++)
- s->s[j].nr_replicas = 0;
+ for (i = 0; i < PAGE_SECTORS; i++)
+ s->s[i].nr_replicas = 0;
spin_unlock(&s->lock);
}
}
@@ -1087,7 +1017,7 @@ static void bch2_writepage_io_done(struct closure *cl)
*/
i_sectors_acct(c, io->inode, NULL, io->op.i_sectors_delta);
- bio_for_each_segment_all(bvec, bio, i) {
+ bio_for_each_segment_all(bvec, bio, iter) {
struct bch_page_state *s = __bch2_page_state(bvec->bv_page);
if (atomic_dec_and_test(&s->write_count))
@@ -1241,7 +1171,7 @@ do_io:
if (w->io &&
(w->io->op.res.nr_replicas != nr_replicas_this_write ||
- bio_full(&w->io->op.wbio.bio) ||
+ bio_full(&w->io->op.wbio.bio, PAGE_SIZE) ||
w->io->op.wbio.bio.bi_iter.bi_size + (sectors << 9) >=
(BIO_MAX_PAGES * PAGE_SIZE) ||
bio_end_sector(&w->io->op.wbio.bio) != sector))
@@ -1810,8 +1740,9 @@ static long bch2_dio_write_loop(struct dio_write *dio)
struct bch_inode_info *inode = file_bch_inode(req->ki_filp);
struct bch_fs *c = inode->v.i_sb->s_fs_info;
struct bio *bio = &dio->op.wbio.bio;
+ struct bvec_iter_all iter;
struct bio_vec *bv;
- unsigned i, unaligned;
+ unsigned unaligned;
bool sync = dio->sync;
long ret;
@@ -1820,7 +1751,7 @@ static long bch2_dio_write_loop(struct dio_write *dio)
while (1) {
if (kthread)
- use_mm(dio->mm);
+ kthread_use_mm(dio->mm);
BUG_ON(current->faults_disabled_mapping);
current->faults_disabled_mapping = mapping;
@@ -1828,7 +1759,7 @@ static long bch2_dio_write_loop(struct dio_write *dio)
current->faults_disabled_mapping = NULL;
if (kthread)
- unuse_mm(dio->mm);
+ kthread_unuse_mm(dio->mm);
if (unlikely(ret < 0))
goto err;
@@ -1842,7 +1773,7 @@ static long bch2_dio_write_loop(struct dio_write *dio)
* bio_iov_iter_get_pages was only able to get <
* blocksize worth of pages:
*/
- bio_for_each_segment_all(bv, bio, i)
+ bio_for_each_segment_all(bv, bio, iter)
put_page(bv->bv_page);
ret = -EFAULT;
goto err;
@@ -1905,7 +1836,7 @@ loop:
i_size_write(&inode->v, req->ki_pos);
spin_unlock(&inode->v.i_lock);
- bio_for_each_segment_all(bv, bio, i)
+ bio_for_each_segment_all(bv, bio, iter)
put_page(bv->bv_page);
if (!dio->iter.count || dio->op.error)
break;
@@ -2191,6 +2122,12 @@ static int __bch2_truncate_page(struct bch_inode_info *inode,
ret = bch2_get_page_disk_reservation(c, inode, page, false);
BUG_ON(ret);
+ /*
+ * This removes any writeable userspace mappings; we need to force
+ * .page_mkwrite to be called again before any mmapped writes, to
+ * redirty the full page:
+ */
+ page_mkclean(page);
__set_page_dirty_nobuffers(page);
unlock:
unlock_page(page);
@@ -2816,235 +2753,6 @@ static void mark_range_unallocated(struct bch_inode_info *inode,
} while (index <= end_index);
}
-static int generic_access_check_limits(struct file *file, loff_t pos,
- loff_t *count)
-{
- struct inode *inode = file->f_mapping->host;
- loff_t max_size = inode->i_sb->s_maxbytes;
-
- if (!(file->f_flags & O_LARGEFILE))
- max_size = MAX_NON_LFS;
-
- if (unlikely(pos >= max_size))
- return -EFBIG;
- *count = min(*count, max_size - pos);
- return 0;
-}
-
-static int generic_write_check_limits(struct file *file, loff_t pos,
- loff_t *count)
-{
- loff_t limit = rlimit(RLIMIT_FSIZE);
-
- if (limit != RLIM_INFINITY) {
- if (pos >= limit) {
- send_sig(SIGXFSZ, current, 0);
- return -EFBIG;
- }
- *count = min(*count, limit - pos);
- }
-
- return generic_access_check_limits(file, pos, count);
-}
-
-static int generic_remap_checks(struct file *file_in, loff_t pos_in,
- struct file *file_out, loff_t pos_out,
- loff_t *req_count, unsigned int remap_flags)
-{
- struct inode *inode_in = file_in->f_mapping->host;
- struct inode *inode_out = file_out->f_mapping->host;
- uint64_t count = *req_count;
- uint64_t bcount;
- loff_t size_in, size_out;
- loff_t bs = inode_out->i_sb->s_blocksize;
- int ret;
-
- /* The start of both ranges must be aligned to an fs block. */
- if (!IS_ALIGNED(pos_in, bs) || !IS_ALIGNED(pos_out, bs))
- return -EINVAL;
-
- /* Ensure offsets don't wrap. */
- if (pos_in + count < pos_in || pos_out + count < pos_out)
- return -EINVAL;
-
- size_in = i_size_read(inode_in);
- size_out = i_size_read(inode_out);
-
- /* Dedupe requires both ranges to be within EOF. */
- if ((remap_flags & REMAP_FILE_DEDUP) &&
- (pos_in >= size_in || pos_in + count > size_in ||
- pos_out >= size_out || pos_out + count > size_out))
- return -EINVAL;
-
- /* Ensure the infile range is within the infile. */
- if (pos_in >= size_in)
- return -EINVAL;
- count = min(count, size_in - (uint64_t)pos_in);
-
- ret = generic_access_check_limits(file_in, pos_in, &count);
- if (ret)
- return ret;
-
- ret = generic_write_check_limits(file_out, pos_out, &count);
- if (ret)
- return ret;
-
- /*
- * If the user wanted us to link to the infile's EOF, round up to the
- * next block boundary for this check.
- *
- * Otherwise, make sure the count is also block-aligned, having
- * already confirmed the starting offsets' block alignment.
- */
- if (pos_in + count == size_in) {
- bcount = ALIGN(size_in, bs) - pos_in;
- } else {
- if (!IS_ALIGNED(count, bs))
- count = ALIGN_DOWN(count, bs);
- bcount = count;
- }
-
- /* Don't allow overlapped cloning within the same file. */
- if (inode_in == inode_out &&
- pos_out + bcount > pos_in &&
- pos_out < pos_in + bcount)
- return -EINVAL;
-
- /*
- * We shortened the request but the caller can't deal with that, so
- * bounce the request back to userspace.
- */
- if (*req_count != count && !(remap_flags & REMAP_FILE_CAN_SHORTEN))
- return -EINVAL;
-
- *req_count = count;
- return 0;
-}
-
-static int generic_remap_check_len(struct inode *inode_in,
- struct inode *inode_out,
- loff_t pos_out,
- loff_t *len,
- unsigned int remap_flags)
-{
- u64 blkmask = i_blocksize(inode_in) - 1;
- loff_t new_len = *len;
-
- if ((*len & blkmask) == 0)
- return 0;
-
- if ((remap_flags & REMAP_FILE_DEDUP) ||
- pos_out + *len < i_size_read(inode_out))
- new_len &= ~blkmask;
-
- if (new_len == *len)
- return 0;
-
- if (remap_flags & REMAP_FILE_CAN_SHORTEN) {
- *len = new_len;
- return 0;
- }
-
- return (remap_flags & REMAP_FILE_DEDUP) ? -EBADE : -EINVAL;
-}
-
-static int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in,
- struct file *file_out, loff_t pos_out,
- loff_t *len, unsigned int remap_flags)
-{
- struct inode *inode_in = file_inode(file_in);
- struct inode *inode_out = file_inode(file_out);
- bool same_inode = (inode_in == inode_out);
- int ret;
-
- /* Don't touch certain kinds of inodes */
- if (IS_IMMUTABLE(inode_out))
- return -EPERM;
-
- if (IS_SWAPFILE(inode_in) || IS_SWAPFILE(inode_out))
- return -ETXTBSY;
-
- /* Don't reflink dirs, pipes, sockets... */
- if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode))
- return -EISDIR;
- if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode))
- return -EINVAL;
-
- /* Zero length dedupe exits immediately; reflink goes to EOF. */
- if (*len == 0) {
- loff_t isize = i_size_read(inode_in);
-
- if ((remap_flags & REMAP_FILE_DEDUP) || pos_in == isize)
- return 0;
- if (pos_in > isize)
- return -EINVAL;
- *len = isize - pos_in;
- if (*len == 0)
- return 0;
- }
-
- /* Check that we don't violate system file offset limits. */
- ret = generic_remap_checks(file_in, pos_in, file_out, pos_out, len,
- remap_flags);
- if (ret)
- return ret;
-
- /* Wait for the completion of any pending IOs on both files */
- inode_dio_wait(inode_in);
- if (!same_inode)
- inode_dio_wait(inode_out);
-
- ret = filemap_write_and_wait_range(inode_in->i_mapping,
- pos_in, pos_in + *len - 1);
- if (ret)
- return ret;
-
- ret = filemap_write_and_wait_range(inode_out->i_mapping,
- pos_out, pos_out + *len - 1);
- if (ret)
- return ret;
-
- /*
- * Check that the extents are the same.
- */
- if (remap_flags & REMAP_FILE_DEDUP) {
- bool is_same = false;
-
- ret = vfs_dedupe_file_range_compare(inode_in, pos_in,
- inode_out, pos_out, *len, &is_same);
- if (ret)
- return ret;
- if (!is_same)
- return -EBADE;
- }
-
- ret = generic_remap_check_len(inode_in, inode_out, pos_out, len,
- remap_flags);
- if (ret)
- return ret;
-
- /* If can't alter the file contents, we're done. */
- if (!(remap_flags & REMAP_FILE_DEDUP)) {
- /* Update the timestamps, since we can alter file contents. */
- if (!(file_out->f_mode & FMODE_NOCMTIME)) {
- ret = file_update_time(file_out);
- if (ret)
- return ret;
- }
-
- /*
- * Clear the security bits if the process is not being run by
- * root. This keeps people from modifying setuid and setgid
- * binaries.
- */
- ret = file_remove_privs(file_out);
- if (ret)
- return ret;
- }
-
- return 0;
-}
-
loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src,
struct file *file_dst, loff_t pos_dst,
loff_t len, unsigned remap_flags)
@@ -3240,7 +2948,7 @@ static loff_t page_hole_offset(struct address_space *mapping, loff_t offset)
loff_t ret = -1;
page = find_lock_entry(mapping, index);
- if (!page || radix_tree_exception(page))
+ if (!page || xa_is_value(page))
return offset;
pg_offset = __page_hole_offset(page, offset & (PAGE_SIZE - 1));
diff --git a/fs/bcachefs/fs-io.h b/fs/bcachefs/fs-io.h
index 1b593ea707d5..2537a3d25ede 100644
--- a/fs/bcachefs/fs-io.h
+++ b/fs/bcachefs/fs-io.h
@@ -19,8 +19,7 @@ int bch2_writepage(struct page *, struct writeback_control *);
int bch2_readpage(struct file *, struct page *);
int bch2_writepages(struct address_space *, struct writeback_control *);
-int bch2_readpages(struct file *, struct address_space *,
- struct list_head *, unsigned);
+void bch2_readahead(struct readahead_control *);
int bch2_write_begin(struct file *, struct address_space *, loff_t,
unsigned, unsigned, struct page **, void **);
@@ -35,10 +34,6 @@ int bch2_fsync(struct file *, loff_t, loff_t, int);
int bch2_truncate(struct bch_inode_info *, struct iattr *);
long bch2_fallocate_dispatch(struct file *, int, loff_t, loff_t);
-#define REMAP_FILE_ADVISORY (0)
-#define REMAP_FILE_DEDUP (1 << 0)
-#define REMAP_FILE_CAN_SHORTEN (1 << 1)
-
loff_t bch2_remap_file_range(struct file *, loff_t, struct file *,
loff_t, loff_t, unsigned);
diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c
index 071c3a3de98d..3ac57ba29e9f 100644
--- a/fs/bcachefs/fs.c
+++ b/fs/bcachefs/fs.c
@@ -38,9 +38,15 @@ static void bch2_vfs_inode_init(struct bch_fs *,
struct bch_inode_info *,
struct bch_inode_unpacked *);
-static void journal_seq_copy(struct bch_inode_info *dst,
+static void journal_seq_copy(struct bch_fs *c,
+ struct bch_inode_info *dst,
u64 journal_seq)
{
+ /*
+ * atomic64_cmpxchg has a fallback for archs that don't support it,
+ * cmpxchg does not:
+ */
+ atomic64_t *dst_seq = (void *) &dst->ei_journal_seq;
u64 old, v = READ_ONCE(dst->ei_journal_seq);
do {
@@ -48,7 +54,9 @@ static void journal_seq_copy(struct bch_inode_info *dst,
if (old >= journal_seq)
break;
- } while ((v = cmpxchg(&dst->ei_journal_seq, old, journal_seq)) != old);
+ } while ((v = atomic64_cmpxchg(dst_seq, old, journal_seq)) != old);
+
+ bch2_journal_set_has_inum(&c->journal, dst->v.i_ino, journal_seq);
}
static void __pagecache_lock_put(struct pagecache_lock *lock, long i)
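Illustrative aside, not part of the patch: journal_seq_copy() above uses a lockless "only ever move forward" update - retry the compare-and-swap until either the newer sequence number was published or another thread already published one at least as new. A minimal userspace sketch of the same pattern using C11 atomics (names are made up):

#include <stdatomic.h>
#include <stdint.h>

/* advance *dst to journal_seq, but never move it backwards */
static void seq_copy_max(_Atomic uint64_t *dst, uint64_t journal_seq)
{
	uint64_t old = atomic_load(dst);

	do {
		if (old >= journal_seq)
			return;		/* already at least as new */
	} while (!atomic_compare_exchange_weak(dst, &old, journal_seq));
}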
@@ -222,6 +230,13 @@ struct inode *bch2_vfs_inode_get(struct bch_fs *c, u64 inum)
return &inode->v;
}
+static int inum_test(struct inode *inode, void *p)
+{
+ unsigned long *ino = p;
+
+ return *ino == inode->i_ino;
+}
+
static struct bch_inode_info *
__bch2_create(struct bch_inode_info *dir, struct dentry *dentry,
umode_t mode, dev_t rdev, bool tmpfile)
@@ -285,12 +300,12 @@ err_before_quota:
if (!tmpfile) {
bch2_inode_update_after_write(c, dir, &dir_u,
ATTR_MTIME|ATTR_CTIME);
- journal_seq_copy(dir, journal_seq);
+ journal_seq_copy(c, dir, journal_seq);
mutex_unlock(&dir->ei_update_lock);
}
bch2_vfs_inode_init(c, inode, &inode_u);
- journal_seq_copy(inode, journal_seq);
+ journal_seq_copy(c, inode, journal_seq);
set_cached_acl(&inode->v, ACL_TYPE_ACCESS, acl);
set_cached_acl(&inode->v, ACL_TYPE_DEFAULT, default_acl);
@@ -301,13 +316,17 @@ err_before_quota:
* thread pulling the inode in and modifying it:
*/
- old = to_bch_ei(insert_inode_locked2(&inode->v));
- if (unlikely(old)) {
+ inode->v.i_state |= I_CREATING;
+ old = to_bch_ei(inode_insert5(&inode->v, inode->v.i_ino,
+ inum_test, NULL, &inode->v.i_ino));
+ BUG_ON(!old);
+
+ if (unlikely(old != inode)) {
/*
* We raced, another process pulled the new inode into cache
* before us:
*/
- journal_seq_copy(old, journal_seq);
+ journal_seq_copy(c, old, journal_seq);
make_bad_inode(&inode->v);
iput(&inode->v);
@@ -401,7 +420,7 @@ static int __bch2_link(struct bch_fs *c,
if (likely(!ret)) {
BUG_ON(inode_u.bi_inum != inode->v.i_ino);
- journal_seq_copy(inode, dir->ei_journal_seq);
+ journal_seq_copy(c, inode, dir->ei_journal_seq);
bch2_inode_update_after_write(c, dir, &dir_u,
ATTR_MTIME|ATTR_CTIME);
bch2_inode_update_after_write(c, inode, &inode_u, ATTR_CTIME);
@@ -458,7 +477,7 @@ static int bch2_unlink(struct inode *vdir, struct dentry *dentry)
if (likely(!ret)) {
BUG_ON(inode_u.bi_inum != inode->v.i_ino);
- journal_seq_copy(inode, dir->ei_journal_seq);
+ journal_seq_copy(c, inode, dir->ei_journal_seq);
bch2_inode_update_after_write(c, dir, &dir_u,
ATTR_MTIME|ATTR_CTIME);
bch2_inode_update_after_write(c, inode, &inode_u,
@@ -493,7 +512,7 @@ static int bch2_symlink(struct inode *vdir, struct dentry *dentry,
if (unlikely(ret))
goto err;
- journal_seq_copy(dir, inode->ei_journal_seq);
+ journal_seq_copy(c, dir, inode->ei_journal_seq);
ret = __bch2_link(c, inode, dir, dentry);
if (unlikely(ret))
@@ -591,22 +610,22 @@ retry:
bch2_inode_update_after_write(c, src_dir, &src_dir_u,
ATTR_MTIME|ATTR_CTIME);
- journal_seq_copy(src_dir, journal_seq);
+ journal_seq_copy(c, src_dir, journal_seq);
if (src_dir != dst_dir) {
bch2_inode_update_after_write(c, dst_dir, &dst_dir_u,
ATTR_MTIME|ATTR_CTIME);
- journal_seq_copy(dst_dir, journal_seq);
+ journal_seq_copy(c, dst_dir, journal_seq);
}
bch2_inode_update_after_write(c, src_inode, &src_inode_u,
ATTR_CTIME);
- journal_seq_copy(src_inode, journal_seq);
+ journal_seq_copy(c, src_inode, journal_seq);
if (dst_inode) {
bch2_inode_update_after_write(c, dst_inode, &dst_inode_u,
ATTR_CTIME);
- journal_seq_copy(dst_inode, journal_seq);
+ journal_seq_copy(c, dst_inode, journal_seq);
}
err:
bch2_trans_exit(&trans);
@@ -804,7 +823,7 @@ static int bch2_fill_extent(struct bch_fs *c,
struct fiemap_extent_info *info,
struct bkey_s_c k, unsigned flags)
{
- if (bkey_extent_is_data(k.k)) {
+ if (bkey_extent_is_direct_data(k.k)) {
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
const union bch_extent_entry *entry;
struct extent_ptr_decoded p;
@@ -835,6 +854,12 @@ static int bch2_fill_extent(struct bch_fs *c,
}
return 0;
+ } else if (bkey_extent_is_inline_data(k.k)) {
+ return fiemap_fill_next_extent(info,
+ bkey_start_offset(k.k) << 9,
+ 0, k.k->size << 9,
+ flags|
+ FIEMAP_EXTENT_DATA_INLINE);
} else if (k.k->type == KEY_TYPE_reservation) {
return fiemap_fill_next_extent(info,
bkey_start_offset(k.k) << 9,
@@ -861,6 +886,10 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info,
bool have_extent = false;
int ret = 0;
+ ret = fiemap_prep(&ei->v, info, start, &len, FIEMAP_FLAG_SYNC);
+ if (ret)
+ return ret;
+
if (start + len < start)
return -EINVAL;
@@ -880,27 +909,26 @@ retry:
continue;
}
- bkey_on_stack_realloc(&cur, c, k.k->u64s);
- bkey_on_stack_realloc(&prev, c, k.k->u64s);
- bkey_reassemble(cur.k, k);
- k = bkey_i_to_s_c(cur.k);
-
offset_into_extent = iter->pos.offset -
bkey_start_offset(k.k);
sectors = k.k->size - offset_into_extent;
+ bkey_on_stack_reassemble(&cur, c, k);
+
ret = bch2_read_indirect_extent(&trans,
&offset_into_extent, &cur);
if (ret)
break;
+ k = bkey_i_to_s_c(cur.k);
+ bkey_on_stack_realloc(&prev, c, k.k->u64s);
+
sectors = min(sectors, k.k->size - offset_into_extent);
- if (offset_into_extent)
- bch2_cut_front(POS(k.k->p.inode,
- bkey_start_offset(k.k) +
- offset_into_extent),
- cur.k);
+ bch2_cut_front(POS(k.k->p.inode,
+ bkey_start_offset(k.k) +
+ offset_into_extent),
+ cur.k);
bch2_key_resize(&cur.k->k, sectors);
cur.k->k.p = iter->pos;
cur.k->k.p.offset += cur.k->k.size;
@@ -915,10 +943,8 @@ retry:
bkey_copy(prev.k, cur.k);
have_extent = true;
- if (k.k->type == KEY_TYPE_reflink_v)
- bch2_btree_iter_set_pos(iter, k.k->p);
- else
- bch2_btree_iter_next(iter);
+ bch2_btree_iter_set_pos(iter,
+ POS(iter->pos.inode, iter->pos.offset + sectors));
}
if (ret == -EINTR)
@@ -967,15 +993,6 @@ static int bch2_vfs_readdir(struct file *file, struct dir_context *ctx)
return bch2_readdir(c, inode->v.i_ino, ctx);
}
-static int bch2_clone_file_range(struct file *file_src, loff_t pos_src,
- struct file *file_dst, loff_t pos_dst,
- u64 len)
-{
- return bch2_remap_file_range(file_src, pos_src,
- file_dst, pos_dst,
- len, 0);
-}
-
static const struct file_operations bch_file_operations = {
.llseek = bch2_llseek,
.read_iter = bch2_read_iter,
@@ -993,7 +1010,7 @@ static const struct file_operations bch_file_operations = {
#ifdef CONFIG_COMPAT
.compat_ioctl = bch2_compat_fs_ioctl,
#endif
- .clone_file_range = bch2_clone_file_range,
+ .remap_file_range = bch2_remap_file_range,
};
static const struct inode_operations bch_file_inode_operations = {
@@ -1063,7 +1080,7 @@ static const struct address_space_operations bch_address_space_operations = {
.writepage = bch2_writepage,
.readpage = bch2_readpage,
.writepages = bch2_writepages,
- .readpages = bch2_readpages,
+ .readahead = bch2_readahead,
.set_page_dirty = __set_page_dirty_nobuffers,
.write_begin = bch2_write_begin,
.write_end = bch2_write_end,
@@ -1239,6 +1256,11 @@ static int bch2_statfs(struct dentry *dentry, struct kstatfs *buf)
struct bch_fs *c = sb->s_fs_info;
struct bch_fs_usage_short usage = bch2_fs_usage_read_short(c);
unsigned shift = sb->s_blocksize_bits - 9;
+ /*
+ * this assumes inodes take up 64 bytes, which is a decent average
+ * number:
+ */
+ u64 avail_inodes = ((usage.capacity - usage.used) << 3);
u64 fsid;
buf->f_type = BCACHEFS_STATFS_MAGIC;
@@ -1246,8 +1268,9 @@ static int bch2_statfs(struct dentry *dentry, struct kstatfs *buf)
buf->f_blocks = usage.capacity >> shift;
buf->f_bfree = (usage.capacity - usage.used) >> shift;
buf->f_bavail = buf->f_bfree;
- buf->f_files = 0;
- buf->f_ffree = 0;
+
+ buf->f_files = usage.nr_inodes + avail_inodes;
+ buf->f_ffree = avail_inodes;
fsid = le64_to_cpup((void *) c->sb.user_uuid.b) ^
le64_to_cpup((void *) c->sb.user_uuid.b + sizeof(u64));
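Illustrative aside, not part of the patch: the f_files/f_ffree values above are estimates, since bcachefs inodes are allocated dynamically. capacity and used are in 512-byte sectors, and the comment assumes roughly 64 bytes per inode, so each free sector holds 512 / 64 = 8 inodes - hence the shift by 3. A minimal sketch of that arithmetic (the helper name is made up):

#include <stdint.h>

/* free 512-byte sectors, times 8 inodes (~64 bytes each) per sector */
static uint64_t estimate_avail_inodes(uint64_t capacity, uint64_t used)
{
	return (capacity - used) << 3;
}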
@@ -1283,91 +1306,36 @@ static struct bch_fs *bch2_path_to_fs(const char *dev)
c = bch2_bdev_to_fs(bdev);
bdput(bdev);
- return c ?: ERR_PTR(-ENOENT);
-}
-
-static struct bch_fs *__bch2_open_as_blockdevs(const char *dev_name, char * const *devs,
- unsigned nr_devs, struct bch_opts opts)
-{
- struct bch_fs *c, *c1, *c2;
- size_t i;
-
- if (!nr_devs)
- return ERR_PTR(-EINVAL);
-
- c = bch2_fs_open(devs, nr_devs, opts);
-
- if (IS_ERR(c) && PTR_ERR(c) == -EBUSY) {
- /*
- * Already open?
- * Look up each block device, make sure they all belong to a
- * filesystem and they all belong to the _same_ filesystem
- */
-
- c1 = bch2_path_to_fs(devs[0]);
- if (IS_ERR(c1))
- return c;
-
- for (i = 1; i < nr_devs; i++) {
- c2 = bch2_path_to_fs(devs[i]);
- if (!IS_ERR(c2))
- closure_put(&c2->cl);
-
- if (c1 != c2) {
- closure_put(&c1->cl);
- return c;
- }
- }
-
- c = c1;
- }
-
- if (IS_ERR(c))
- return c;
-
- down_write(&c->state_lock);
-
- if (!test_bit(BCH_FS_STARTED, &c->flags)) {
- up_write(&c->state_lock);
+ if (c)
closure_put(&c->cl);
- pr_err("err mounting %s: incomplete filesystem", dev_name);
- return ERR_PTR(-EINVAL);
- }
-
- up_write(&c->state_lock);
-
- set_bit(BCH_FS_BDEV_MOUNTED, &c->flags);
- return c;
+ return c ?: ERR_PTR(-ENOENT);
}
-static struct bch_fs *bch2_open_as_blockdevs(const char *_dev_name,
- struct bch_opts opts)
+static char **split_devs(const char *_dev_name, unsigned *nr)
{
char *dev_name = NULL, **devs = NULL, *s;
- struct bch_fs *c = ERR_PTR(-ENOMEM);
size_t i, nr_devs = 0;
dev_name = kstrdup(_dev_name, GFP_KERNEL);
if (!dev_name)
- goto err;
+ return NULL;
for (s = dev_name; s; s = strchr(s + 1, ':'))
nr_devs++;
- devs = kcalloc(nr_devs, sizeof(const char *), GFP_KERNEL);
- if (!devs)
- goto err;
+ devs = kcalloc(nr_devs + 1, sizeof(const char *), GFP_KERNEL);
+ if (!devs) {
+ kfree(dev_name);
+ return NULL;
+ }
for (i = 0, s = dev_name;
s;
(s = strchr(s, ':')) && (*s++ = '\0'))
devs[i++] = s;
- c = __bch2_open_as_blockdevs(_dev_name, devs, nr_devs, opts);
-err:
- kfree(devs);
- kfree(dev_name);
- return c;
+ *nr = nr_devs;
+ return devs;
}
static int bch2_remount(struct super_block *sb, int *flags, char *data)
@@ -1378,7 +1346,7 @@ static int bch2_remount(struct super_block *sb, int *flags, char *data)
opt_set(opts, read_only, (*flags & SB_RDONLY) != 0);
- ret = bch2_parse_mount_opts(&opts, data);
+ ret = bch2_parse_mount_opts(c, &opts, data);
if (ret)
return ret;
@@ -1454,6 +1422,13 @@ static int bch2_show_options(struct seq_file *seq, struct dentry *root)
return 0;
}
+static void bch2_put_super(struct super_block *sb)
+{
+ struct bch_fs *c = sb->s_fs_info;
+
+ __bch2_fs_stop(c);
+}
+
static const struct super_operations bch_super_operations = {
.alloc_inode = bch2_alloc_inode,
.destroy_inode = bch2_destroy_inode,
@@ -1464,24 +1439,39 @@ static const struct super_operations bch_super_operations = {
.show_devname = bch2_show_devname,
.show_options = bch2_show_options,
.remount_fs = bch2_remount,
-#if 0
.put_super = bch2_put_super,
+#if 0
.freeze_fs = bch2_freeze,
.unfreeze_fs = bch2_unfreeze,
#endif
};
-static int bch2_test_super(struct super_block *s, void *data)
-{
- return s->s_fs_info == data;
-}
-
static int bch2_set_super(struct super_block *s, void *data)
{
s->s_fs_info = data;
return 0;
}
+static int bch2_noset_super(struct super_block *s, void *data)
+{
+ return -EBUSY;
+}
+
+static int bch2_test_super(struct super_block *s, void *data)
+{
+ struct bch_fs *c = s->s_fs_info;
+ struct bch_fs **devs = data;
+ unsigned i;
+
+ if (!c)
+ return false;
+
+ for (i = 0; devs[i]; i++)
+ if (c != devs[i])
+ return false;
+ return true;
+}
+
static struct dentry *bch2_mount(struct file_system_type *fs_type,
int flags, const char *dev_name, void *data)
{
@@ -1490,30 +1480,65 @@ static struct dentry *bch2_mount(struct file_system_type *fs_type,
struct super_block *sb;
struct inode *vinode;
struct bch_opts opts = bch2_opts_empty();
- unsigned i;
+ char **devs;
+ struct bch_fs **devs_to_fs = NULL;
+ unsigned i, nr_devs;
int ret;
opt_set(opts, read_only, (flags & SB_RDONLY) != 0);
- ret = bch2_parse_mount_opts(&opts, data);
+ ret = bch2_parse_mount_opts(NULL, &opts, data);
if (ret)
return ERR_PTR(ret);
- c = bch2_open_as_blockdevs(dev_name, opts);
- if (IS_ERR(c))
- return ERR_CAST(c);
+ devs = split_devs(dev_name, &nr_devs);
+ if (!devs)
+ return ERR_PTR(-ENOMEM);
- sb = sget(fs_type, bch2_test_super, bch2_set_super, flags|SB_NOSEC, c);
- if (IS_ERR(sb)) {
- closure_put(&c->cl);
- return ERR_CAST(sb);
+ devs_to_fs = kcalloc(nr_devs + 1, sizeof(void *), GFP_KERNEL);
+ if (!devs_to_fs) {
+ sb = ERR_PTR(-ENOMEM);
+ goto got_sb;
}
- BUG_ON(sb->s_fs_info != c);
+ for (i = 0; i < nr_devs; i++)
+ devs_to_fs[i] = bch2_path_to_fs(devs[i]);
- if (sb->s_root) {
- closure_put(&c->cl);
+ sb = sget(fs_type, bch2_test_super, bch2_noset_super,
+ flags|SB_NOSEC, devs_to_fs);
+ if (!IS_ERR(sb))
+ goto got_sb;
+ c = bch2_fs_open(devs, nr_devs, opts);
+ if (IS_ERR(c)) {
+ sb = ERR_CAST(c);
+ goto got_sb;
+ }
+
+ /* Some options can't be parsed until after the fs is started: */
+ ret = bch2_parse_mount_opts(c, &opts, data);
+ if (ret) {
+ bch2_fs_stop(c);
+ sb = ERR_PTR(ret);
+ goto got_sb;
+ }
+
+ bch2_opts_apply(&c->opts, opts);
+
+ sb = sget(fs_type, NULL, bch2_set_super, flags|SB_NOSEC, c);
+ if (IS_ERR(sb))
+ bch2_fs_stop(c);
+got_sb:
+ kfree(devs_to_fs);
+ kfree(devs[0]);
+ kfree(devs);
+
+ if (IS_ERR(sb))
+ return ERR_CAST(sb);
+
+ c = sb->s_fs_info;
+
+ if (sb->s_root) {
if ((flags ^ sb->s_flags) & SB_RDONLY) {
ret = -EBUSY;
goto err_put_super;
@@ -1540,9 +1565,7 @@ static struct dentry *bch2_mount(struct file_system_type *fs_type,
if (ret)
goto err_put_super;
- sb->s_bdi->congested_fn = bch2_congested;
- sb->s_bdi->congested_data = c;
- sb->s_bdi->ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_SIZE;
+ sb->s_bdi->ra_pages = VM_READAHEAD_PAGES;
for_each_online_member(ca, c, i) {
struct block_device *bdev = ca->disk_sb.bdev;
@@ -1588,11 +1611,7 @@ static void bch2_kill_sb(struct super_block *sb)
struct bch_fs *c = sb->s_fs_info;
generic_shutdown_super(sb);
-
- if (test_bit(BCH_FS_BDEV_MOUNTED, &c->flags))
- bch2_fs_stop(c);
- else
- closure_put(&c->cl);
+ bch2_fs_free(c);
}
static struct file_system_type bcache_fs_type = {
diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c
index 5a6df3d1973a..0c5035270846 100644
--- a/fs/bcachefs/fsck.c
+++ b/fs/bcachefs/fsck.c
@@ -537,7 +537,7 @@ retry:
bch2_trans_unlock(&trans);
- bch2_inode_pack(&p, &w.inode);
+ bch2_inode_pack(c, &p, &w.inode);
ret = bch2_btree_insert(c, BTREE_ID_INODES,
&p.inode.k_i, NULL, NULL,
@@ -808,7 +808,7 @@ create_root:
0, NULL);
root_inode->bi_inum = BCACHEFS_ROOT_INO;
- bch2_inode_pack(&packed, root_inode);
+ bch2_inode_pack(c, &packed, root_inode);
return bch2_btree_insert(c, BTREE_ID_INODES, &packed.inode.k_i,
NULL, NULL,
@@ -866,36 +866,22 @@ create_lostfound:
return ret;
}
-struct inode_bitmap {
- unsigned long *bits;
- size_t size;
-};
+typedef GENRADIX(unsigned long) inode_bitmap;
-static inline bool inode_bitmap_test(struct inode_bitmap *b, size_t nr)
+static inline bool inode_bitmap_test(inode_bitmap *b, size_t nr)
{
- return nr < b->size ? test_bit(nr, b->bits) : false;
+ unsigned long *w = genradix_ptr(b, nr / BITS_PER_LONG);
+ return w ? test_bit(nr & (BITS_PER_LONG - 1), w) : false;
}
-static inline int inode_bitmap_set(struct inode_bitmap *b, size_t nr)
+static inline int inode_bitmap_set(inode_bitmap *b, size_t nr)
{
- if (nr >= b->size) {
- size_t new_size = max_t(size_t, max_t(size_t,
- PAGE_SIZE * 8,
- b->size * 2),
- nr + 1);
- void *n;
-
- new_size = roundup_pow_of_two(new_size);
- n = krealloc(b->bits, new_size / 8, GFP_KERNEL|__GFP_ZERO);
- if (!n) {
- return -ENOMEM;
- }
+ unsigned long *w = genradix_ptr_alloc(b, nr / BITS_PER_LONG, GFP_KERNEL);
- b->bits = n;
- b->size = new_size;
- }
+ if (!w)
+ return -ENOMEM;
- __set_bit(nr, b->bits);
+ *w |= 1UL << (nr & (BITS_PER_LONG - 1));
return 0;
}
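Illustrative aside, not part of the patch: the new inode_bitmap helpers above treat a genradix of unsigned longs as a sparse bitmap, so a bit number is split into a word index (the radix slot) and a bit offset inside that word. A small standalone sketch of that addressing, with made-up names:

#include <limits.h>
#include <stdbool.h>
#include <stddef.h>

#define BITS_PER_WORD	(sizeof(unsigned long) * CHAR_BIT)

/* which word holds bit nr, and where inside that word it lives */
static size_t bit_to_word(size_t nr)	{ return nr / BITS_PER_WORD; }
static unsigned bit_in_word(size_t nr)	{ return nr & (BITS_PER_WORD - 1); }

static bool word_test_bit(unsigned long w, size_t nr)
{
	return (w >> bit_in_word(nr)) & 1;
}

static void word_set_bit(unsigned long *w, size_t nr)
{
	*w |= 1UL << bit_in_word(nr);
}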
@@ -934,7 +920,7 @@ noinline_for_stack
static int check_directory_structure(struct bch_fs *c,
struct bch_inode_unpacked *lostfound_inode)
{
- struct inode_bitmap dirs_done = { NULL, 0 };
+ inode_bitmap dirs_done;
struct pathbuf path = { 0, 0, NULL };
struct pathbuf_entry *e;
struct btree_trans trans;
@@ -951,6 +937,7 @@ static int check_directory_structure(struct bch_fs *c,
/* DFS: */
restart_dfs:
+ genradix_init(&dirs_done);
had_unreachable = false;
ret = inode_bitmap_set(&dirs_done, BCACHEFS_ROOT_INO);
@@ -1057,7 +1044,7 @@ retry:
if (had_unreachable) {
bch_info(c, "reattached unreachable directories, restarting pass to check for loops");
- kfree(dirs_done.bits);
+ genradix_free(&dirs_done);
kfree(path.entries);
memset(&dirs_done, 0, sizeof(dirs_done));
memset(&path, 0, sizeof(path));
@@ -1066,7 +1053,7 @@ retry:
err:
fsck_err:
ret = bch2_trans_exit(&trans) ?: ret;
- kfree(dirs_done.bits);
+ genradix_free(&dirs_done);
kfree(path.entries);
return ret;
}
@@ -1326,7 +1313,7 @@ static int check_inode(struct btree_trans *trans,
if (do_update) {
struct bkey_inode_buf p;
- bch2_inode_pack(&p, &u);
+ bch2_inode_pack(c, &p, &u);
ret = __bch2_trans_do(trans, NULL, NULL,
BTREE_INSERT_NOFAIL|
diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c
index 7d20f082ad45..42371de7f72a 100644
--- a/fs/bcachefs/inode.c
+++ b/fs/bcachefs/inode.c
@@ -1,12 +1,14 @@
// SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h"
+#include "btree_key_cache.h"
#include "bkey_methods.h"
#include "btree_update.h"
#include "error.h"
#include "extents.h"
#include "inode.h"
#include "str_hash.h"
+#include "varint.h"
#include <linux/random.h>
@@ -88,22 +90,17 @@ static int inode_decode_field(const u8 *in, const u8 *end,
return bytes;
}
-void bch2_inode_pack(struct bkey_inode_buf *packed,
- const struct bch_inode_unpacked *inode)
+static noinline void bch2_inode_pack_v1(struct bkey_inode_buf *packed,
+ const struct bch_inode_unpacked *inode)
{
- u8 *out = packed->inode.v.fields;
+ struct bkey_i_inode *k = &packed->inode;
+ u8 *out = k->v.fields;
u8 *end = (void *) &packed[1];
u8 *last_nonzero_field = out;
unsigned nr_fields = 0, last_nonzero_fieldnr = 0;
unsigned bytes;
- bkey_inode_init(&packed->inode.k_i);
- packed->inode.k.p.offset = inode->bi_inum;
- packed->inode.v.bi_hash_seed = inode->bi_hash_seed;
- packed->inode.v.bi_flags = cpu_to_le32(inode->bi_flags);
- packed->inode.v.bi_mode = cpu_to_le16(inode->bi_mode);
-
-#define x(_name, _bits) \
+#define x(_name, _bits) \
out += inode_encode_field(out, end, 0, inode->_name); \
nr_fields++; \
\
@@ -122,7 +119,69 @@ void bch2_inode_pack(struct bkey_inode_buf *packed,
set_bkey_val_bytes(&packed->inode.k, bytes);
memset_u64s_tail(&packed->inode.v, 0, bytes);
- SET_INODE_NR_FIELDS(&packed->inode.v, nr_fields);
+ SET_INODE_NR_FIELDS(&k->v, nr_fields);
+}
+
+static void bch2_inode_pack_v2(struct bkey_inode_buf *packed,
+ const struct bch_inode_unpacked *inode)
+{
+ struct bkey_i_inode *k = &packed->inode;
+ u8 *out = k->v.fields;
+ u8 *end = (void *) &packed[1];
+ u8 *last_nonzero_field = out;
+ unsigned nr_fields = 0, last_nonzero_fieldnr = 0;
+ unsigned bytes;
+ int ret;
+
+#define x(_name, _bits) \
+ nr_fields++; \
+ \
+ if (inode->_name) { \
+ ret = bch2_varint_encode(out, inode->_name); \
+ out += ret; \
+ \
+ if (_bits > 64) \
+ *out++ = 0; \
+ \
+ last_nonzero_field = out; \
+ last_nonzero_fieldnr = nr_fields; \
+ } else { \
+ *out++ = 0; \
+ \
+ if (_bits > 64) \
+ *out++ = 0; \
+ }
+
+ BCH_INODE_FIELDS()
+#undef x
+ BUG_ON(out > end);
+
+ out = last_nonzero_field;
+ nr_fields = last_nonzero_fieldnr;
+
+ bytes = out - (u8 *) &packed->inode.v;
+ set_bkey_val_bytes(&packed->inode.k, bytes);
+ memset_u64s_tail(&packed->inode.v, 0, bytes);
+
+ SET_INODE_NR_FIELDS(&k->v, nr_fields);
+}
+
+void bch2_inode_pack(struct bch_fs *c,
+ struct bkey_inode_buf *packed,
+ const struct bch_inode_unpacked *inode)
+{
+ bkey_inode_init(&packed->inode.k_i);
+ packed->inode.k.p.offset = inode->bi_inum;
+ packed->inode.v.bi_hash_seed = inode->bi_hash_seed;
+ packed->inode.v.bi_flags = cpu_to_le32(inode->bi_flags);
+ packed->inode.v.bi_mode = cpu_to_le16(inode->bi_mode);
+
+ if (c->sb.features & (1ULL << BCH_FEATURE_new_varint)) {
+ SET_INODE_NEW_VARINT(&packed->inode.v, true);
+ bch2_inode_pack_v2(packed, inode);
+ } else {
+ bch2_inode_pack_v1(packed, inode);
+ }
if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) {
struct bch_inode_unpacked unpacked;
@@ -134,26 +193,23 @@ void bch2_inode_pack(struct bkey_inode_buf *packed,
BUG_ON(unpacked.bi_hash_seed != inode->bi_hash_seed);
BUG_ON(unpacked.bi_mode != inode->bi_mode);
-#define x(_name, _bits) BUG_ON(unpacked._name != inode->_name);
+#define x(_name, _bits) if (unpacked._name != inode->_name) \
+ panic("unpacked %llu should be %llu", \
+ (u64) unpacked._name, (u64) inode->_name);
BCH_INODE_FIELDS()
#undef x
}
}
-int bch2_inode_unpack(struct bkey_s_c_inode inode,
- struct bch_inode_unpacked *unpacked)
+static noinline int bch2_inode_unpack_v1(struct bkey_s_c_inode inode,
+ struct bch_inode_unpacked *unpacked)
{
const u8 *in = inode.v->fields;
- const u8 *end = (void *) inode.v + bkey_val_bytes(inode.k);
+ const u8 *end = bkey_val_end(inode);
u64 field[2];
unsigned fieldnr = 0, field_bits;
int ret;
- unpacked->bi_inum = inode.k->p.offset;
- unpacked->bi_hash_seed = inode.v->bi_hash_seed;
- unpacked->bi_flags = le32_to_cpu(inode.v->bi_flags);
- unpacked->bi_mode = le16_to_cpu(inode.v->bi_mode);
-
#define x(_name, _bits) \
if (fieldnr++ == INODE_NR_FIELDS(inode.v)) { \
memset(&unpacked->_name, 0, \
@@ -176,6 +232,62 @@ int bch2_inode_unpack(struct bkey_s_c_inode inode,
#undef x
/* XXX: signal if there were more fields than expected? */
+ return 0;
+}
+
+static int bch2_inode_unpack_v2(struct bkey_s_c_inode inode,
+ struct bch_inode_unpacked *unpacked)
+{
+ const u8 *in = inode.v->fields;
+ const u8 *end = bkey_val_end(inode);
+ unsigned fieldnr = 0;
+ int ret;
+ u64 v[2];
+
+#define x(_name, _bits) \
+ if (fieldnr < INODE_NR_FIELDS(inode.v)) { \
+ ret = bch2_varint_decode(in, end, &v[0]); \
+ if (ret < 0) \
+ return ret; \
+ in += ret; \
+ \
+ if (_bits > 64) { \
+ ret = bch2_varint_decode(in, end, &v[1]); \
+ if (ret < 0) \
+ return ret; \
+ in += ret; \
+ } else { \
+ v[1] = 0; \
+ } \
+ } else { \
+ v[0] = v[1] = 0; \
+ } \
+ \
+ unpacked->_name = v[0]; \
+ if (v[1] || v[0] != unpacked->_name) \
+ return -1; \
+ fieldnr++;
+
+ BCH_INODE_FIELDS()
+#undef x
+
+ /* XXX: signal if there were more fields than expected? */
+ return 0;
+}
+
+int bch2_inode_unpack(struct bkey_s_c_inode inode,
+ struct bch_inode_unpacked *unpacked)
+{
+ unpacked->bi_inum = inode.k->p.offset;
+ unpacked->bi_hash_seed = inode.v->bi_hash_seed;
+ unpacked->bi_flags = le32_to_cpu(inode.v->bi_flags);
+ unpacked->bi_mode = le16_to_cpu(inode.v->bi_mode);
+
+ if (INODE_NEW_VARINT(inode.v)) {
+ return bch2_inode_unpack_v2(inode, unpacked);
+ } else {
+ return bch2_inode_unpack_v1(inode, unpacked);
+ }
return 0;
}
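Illustrative aside, not part of the patch: v2 inode packing stores each field as a variable-length integer so zero and small fields stay compact, gated on the BCH_FEATURE_new_varint feature bit. The sketch below is a generic LEB128-style varint shown only to illustrate the idea - the actual on-disk encoding is whatever bch2_varint_encode()/bch2_varint_decode() in varint.c implement, which may differ:

#include <stdint.h>
#include <stddef.h>

/* encode v as 7 bits per byte, high bit = "more bytes follow" */
static size_t varint_encode(uint8_t *out, uint64_t v)
{
	size_t n = 0;

	do {
		uint8_t byte = v & 0x7f;

		v >>= 7;
		if (v)
			byte |= 0x80;
		out[n++] = byte;
	} while (v);

	return n;
}

/* returns bytes consumed, or -1 on malformed/truncated input */
static int varint_decode(const uint8_t *in, const uint8_t *end, uint64_t *v)
{
	const uint8_t *p = in;
	uint64_t res = 0;
	unsigned shift = 0;

	while (p < end && shift < 64) {
		uint8_t byte = *p++;

		res |= (uint64_t)(byte & 0x7f) << shift;
		if (!(byte & 0x80)) {
			*v = res;
			return p - in;
		}
		shift += 7;
	}
	return -1;
}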
@@ -189,11 +301,11 @@ struct btree_iter *bch2_inode_peek(struct btree_trans *trans,
int ret;
iter = bch2_trans_get_iter(trans, BTREE_ID_INODES, POS(0, inum),
- BTREE_ITER_SLOTS|flags);
+ BTREE_ITER_CACHED|flags);
if (IS_ERR(iter))
return iter;
- k = bch2_btree_iter_peek_slot(iter);
+ k = bch2_btree_iter_peek_cached(iter);
ret = bkey_err(k);
if (ret)
goto err;
@@ -222,7 +334,7 @@ int bch2_inode_write(struct btree_trans *trans,
if (IS_ERR(inode_p))
return PTR_ERR(inode_p);
- bch2_inode_pack(inode_p, inode);
+ bch2_inode_pack(trans->c, inode_p, inode);
bch2_trans_update(trans, iter, &inode_p->inode.k_i, 0);
return 0;
}
@@ -271,6 +383,8 @@ void bch2_inode_to_text(struct printbuf *out, struct bch_fs *c,
return;
}
+ pr_buf(out, "mode: %o ", unpacked.bi_mode);
+
#define x(_name, _bits) \
pr_buf(out, #_name ": %llu ", (u64) unpacked._name);
BCH_INODE_FIELDS()
@@ -359,20 +473,24 @@ static inline u32 bkey_generation(struct bkey_s_c k)
}
int bch2_inode_create(struct btree_trans *trans,
- struct bch_inode_unpacked *inode_u,
- u64 min, u64 max, u64 *hint)
+ struct bch_inode_unpacked *inode_u)
{
+ struct bch_fs *c = trans->c;
struct bkey_inode_buf *inode_p;
struct btree_iter *iter = NULL;
struct bkey_s_c k;
- u64 start;
+ u64 min, max, start, *hint;
int ret;
- if (!max)
- max = ULLONG_MAX;
+ unsigned cpu = raw_smp_processor_id();
+ unsigned bits = (c->opts.inodes_32bit
+ ? 31 : 63) - c->inode_shard_bits;
- if (trans->c->opts.inodes_32bit)
- max = min_t(u64, max, U32_MAX);
+ min = (cpu << bits);
+ max = (cpu << bits) | ~(ULLONG_MAX << bits);
+
+ min = max_t(u64, min, BLOCKDEV_INODE_MAX);
+ hint = c->unused_inode_hints + cpu;
start = READ_ONCE(*hint);
@@ -388,7 +506,17 @@ again:
if (bkey_cmp(iter->pos, POS(0, max)) > 0)
break;
- if (k.k->type != KEY_TYPE_inode)
+ /*
+ * There's a potential cache coherency issue with the btree key
+ * cache code here - we're iterating over the btree, skipping
+ * that cache. We should never see an empty slot that isn't
+ * actually empty due to a pending update in the key cache
+ * because the update that creates the inode isn't done with a
+ * cached iterator, but - better safe than sorry, check the
+ * cache before using a slot:
+ */
+ if (k.k->type != KEY_TYPE_inode &&
+ !bch2_btree_key_cache_find(c, BTREE_ID_INODES, iter->pos))
goto found_slot;
}
@@ -409,10 +537,7 @@ found_slot:
inode_u->bi_inum = k.k->p.offset;
inode_u->bi_generation = bkey_generation(k);
- bch2_inode_pack(inode_p, inode_u);
- bch2_trans_update(trans, iter, &inode_p->inode.k_i, 0);
- bch2_trans_iter_put(trans, iter);
- return 0;
+ return bch2_inode_write(trans, iter, inode_u);
}
int bch2_inode_rm(struct bch_fs *c, u64 inode_nr)
@@ -422,6 +547,8 @@ int bch2_inode_rm(struct bch_fs *c, u64 inode_nr)
struct bkey_i_inode_generation delete;
struct bpos start = POS(inode_nr, 0);
struct bpos end = POS(inode_nr + 1, 0);
+ struct bkey_s_c k;
+ u64 bi_generation;
int ret;
/*
@@ -442,51 +569,62 @@ int bch2_inode_rm(struct bch_fs *c, u64 inode_nr)
return ret;
bch2_trans_init(&trans, c, 0, 0);
+retry:
+ bch2_trans_begin(&trans);
+
+ bi_generation = 0;
+
+ ret = bch2_btree_key_cache_flush(&trans, BTREE_ID_INODES, POS(0, inode_nr));
+ if (ret) {
+ if (ret != -EINTR)
+ bch_err(c, "error flushing btree key cache: %i", ret);
+ goto err;
+ }
iter = bch2_trans_get_iter(&trans, BTREE_ID_INODES, POS(0, inode_nr),
BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
- do {
- struct bkey_s_c k = bch2_btree_iter_peek_slot(iter);
- u32 bi_generation = 0;
+ k = bch2_btree_iter_peek_slot(iter);
- ret = bkey_err(k);
- if (ret)
- break;
+ ret = bkey_err(k);
+ if (ret)
+ goto err;
- bch2_fs_inconsistent_on(k.k->type != KEY_TYPE_inode, c,
- "inode %llu not found when deleting",
- inode_nr);
+ bch2_fs_inconsistent_on(k.k->type != KEY_TYPE_inode, c,
+ "inode %llu not found when deleting",
+ inode_nr);
- switch (k.k->type) {
- case KEY_TYPE_inode: {
- struct bch_inode_unpacked inode_u;
+ switch (k.k->type) {
+ case KEY_TYPE_inode: {
+ struct bch_inode_unpacked inode_u;
- if (!bch2_inode_unpack(bkey_s_c_to_inode(k), &inode_u))
- bi_generation = inode_u.bi_generation + 1;
- break;
- }
- case KEY_TYPE_inode_generation: {
- struct bkey_s_c_inode_generation g =
- bkey_s_c_to_inode_generation(k);
- bi_generation = le32_to_cpu(g.v->bi_generation);
- break;
- }
- }
+ if (!bch2_inode_unpack(bkey_s_c_to_inode(k), &inode_u))
+ bi_generation = inode_u.bi_generation + 1;
+ break;
+ }
+ case KEY_TYPE_inode_generation: {
+ struct bkey_s_c_inode_generation g =
+ bkey_s_c_to_inode_generation(k);
+ bi_generation = le32_to_cpu(g.v->bi_generation);
+ break;
+ }
+ }
- if (!bi_generation) {
- bkey_init(&delete.k);
- delete.k.p.offset = inode_nr;
- } else {
- bkey_inode_generation_init(&delete.k_i);
- delete.k.p.offset = inode_nr;
- delete.v.bi_generation = cpu_to_le32(bi_generation);
- }
+ if (!bi_generation) {
+ bkey_init(&delete.k);
+ delete.k.p.offset = inode_nr;
+ } else {
+ bkey_inode_generation_init(&delete.k_i);
+ delete.k.p.offset = inode_nr;
+ delete.v.bi_generation = cpu_to_le32(bi_generation);
+ }
- bch2_trans_update(&trans, iter, &delete.k_i, 0);
+ bch2_trans_update(&trans, iter, &delete.k_i, 0);
- ret = bch2_trans_commit(&trans, NULL, NULL,
- BTREE_INSERT_NOFAIL);
- } while (ret == -EINTR);
+ ret = bch2_trans_commit(&trans, NULL, NULL,
+ BTREE_INSERT_NOFAIL);
+err:
+ if (ret == -EINTR)
+ goto retry;
bch2_trans_exit(&trans);
return ret;
@@ -500,11 +638,11 @@ int bch2_inode_find_by_inum_trans(struct btree_trans *trans, u64 inode_nr,
int ret;
iter = bch2_trans_get_iter(trans, BTREE_ID_INODES,
- POS(0, inode_nr), BTREE_ITER_SLOTS);
+ POS(0, inode_nr), BTREE_ITER_CACHED);
if (IS_ERR(iter))
return PTR_ERR(iter);
- k = bch2_btree_iter_peek_slot(iter);
+ k = bch2_btree_iter_peek_cached(iter);
ret = bkey_err(k);
if (ret)
goto err;
@@ -523,32 +661,3 @@ int bch2_inode_find_by_inum(struct bch_fs *c, u64 inode_nr,
return bch2_trans_do(c, NULL, NULL, 0,
bch2_inode_find_by_inum_trans(&trans, inode_nr, inode));
}
-
-#ifdef CONFIG_BCACHEFS_DEBUG
-void bch2_inode_pack_test(void)
-{
- struct bch_inode_unpacked *u, test_inodes[] = {
- {
- .bi_atime = U64_MAX,
- .bi_ctime = U64_MAX,
- .bi_mtime = U64_MAX,
- .bi_otime = U64_MAX,
- .bi_size = U64_MAX,
- .bi_sectors = U64_MAX,
- .bi_uid = U32_MAX,
- .bi_gid = U32_MAX,
- .bi_nlink = U32_MAX,
- .bi_generation = U32_MAX,
- .bi_dev = U32_MAX,
- },
- };
-
- for (u = test_inodes;
- u < test_inodes + ARRAY_SIZE(test_inodes);
- u++) {
- struct bkey_inode_buf p;
-
- bch2_inode_pack(&p, u);
- }
-}
-#endif
diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h
index bb759a46dc41..ef7e885dce0c 100644
--- a/fs/bcachefs/inode.h
+++ b/fs/bcachefs/inode.h
@@ -24,6 +24,14 @@ void bch2_inode_generation_to_text(struct printbuf *, struct bch_fs *,
.val_to_text = bch2_inode_generation_to_text, \
}
+#if 0
+typedef struct {
+ u64 lo;
+ u32 hi;
+} __packed __aligned(4) u96;
+#endif
+typedef u64 u96;
+
struct bch_inode_unpacked {
u64 bi_inum;
__le64 bi_hash_seed;
@@ -43,7 +51,8 @@ struct bkey_inode_buf {
#undef x
} __attribute__((packed, aligned(8)));
-void bch2_inode_pack(struct bkey_inode_buf *, const struct bch_inode_unpacked *);
+void bch2_inode_pack(struct bch_fs *, struct bkey_inode_buf *,
+ const struct bch_inode_unpacked *);
int bch2_inode_unpack(struct bkey_s_c_inode, struct bch_inode_unpacked *);
struct btree_iter *bch2_inode_peek(struct btree_trans *,
@@ -60,9 +69,7 @@ void bch2_inode_init(struct bch_fs *, struct bch_inode_unpacked *,
uid_t, gid_t, umode_t, dev_t,
struct bch_inode_unpacked *);
-int bch2_inode_create(struct btree_trans *,
- struct bch_inode_unpacked *,
- u64, u64, u64 *);
+int bch2_inode_create(struct btree_trans *, struct bch_inode_unpacked *);
int bch2_inode_rm(struct bch_fs *, u64);
@@ -168,10 +175,4 @@ static inline void bch2_inode_nlink_set(struct bch_inode_unpacked *bi,
}
}
-#ifdef CONFIG_BCACHEFS_DEBUG
-void bch2_inode_pack_test(void);
-#else
-static inline void bch2_inode_pack_test(void) {}
-#endif
-
#endif /* _BCACHEFS_INODE_H */
diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c
index 409c59c219df..21087d1193dc 100644
--- a/fs/bcachefs/io.c
+++ b/fs/bcachefs/io.c
@@ -7,6 +7,7 @@
*/
#include "bcachefs.h"
+#include "alloc_background.h"
#include "alloc_foreground.h"
#include "bkey_on_stack.h"
#include "bset.h"
@@ -134,10 +135,10 @@ void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw)
void bch2_bio_free_pages_pool(struct bch_fs *c, struct bio *bio)
{
+ struct bvec_iter_all iter;
struct bio_vec *bv;
- unsigned i;
- bio_for_each_segment_all(bv, bio, i)
+ bio_for_each_segment_all(bv, bio, iter)
if (bv->bv_page != ZERO_PAGE(0))
mempool_free(bv->bv_page, &c->bio_bounce_pages);
bio->bi_vcnt = 0;
@@ -170,7 +171,7 @@ void bch2_bio_alloc_pages_pool(struct bch_fs *c, struct bio *bio,
while (size) {
struct page *page = __bio_alloc_page_pool(c, &using_mempool);
- unsigned len = min(PAGE_SIZE, size);
+ unsigned len = min_t(size_t, PAGE_SIZE, size);
BUG_ON(!bio_add_page(bio, page, len, 0));
size -= len;
@@ -300,7 +301,7 @@ int bch2_extent_update(struct btree_trans *trans,
inode_u.bi_sectors += delta;
if (delta || new_i_size) {
- bch2_inode_pack(&inode_p, &inode_u);
+ bch2_inode_pack(trans->c, &inode_p, &inode_u);
bch2_trans_update(trans, inode_iter,
&inode_p.inode.k_i, 0);
}
@@ -1474,7 +1475,8 @@ static struct promote_op *__promote_alloc(struct bch_fs *c,
opts,
DATA_PROMOTE,
(struct data_opts) {
- .target = opts.promote_target
+ .target = opts.promote_target,
+ .nr_replicas = 1,
},
btree_id, k);
BUG_ON(ret);
@@ -1635,7 +1637,7 @@ retry:
goto out;
}
- ret = __bch2_read_extent(c, rbio, bvec_iter, k, 0, failed, flags);
+ ret = __bch2_read_extent(&trans, rbio, bvec_iter, k, 0, failed, flags);
if (ret == READ_RETRY)
goto retry;
if (ret)
@@ -1674,7 +1676,6 @@ retry:
unsigned bytes, sectors, offset_into_extent;
bkey_on_stack_reassemble(&sk, c, k);
- k = bkey_i_to_s_c(sk.k);
offset_into_extent = iter->pos.offset -
bkey_start_offset(k.k);
@@ -1685,6 +1686,8 @@ retry:
if (ret)
break;
+ k = bkey_i_to_s_c(sk.k);
+
sectors = min(sectors, k.k->size - offset_into_extent);
bch2_trans_unlock(&trans);
@@ -1692,7 +1695,7 @@ retry:
bytes = min(sectors, bvec_iter_sectors(bvec_iter)) << 9;
swap(bvec_iter.bi_size, bytes);
- ret = __bch2_read_extent(c, rbio, bvec_iter, k,
+ ret = __bch2_read_extent(&trans, rbio, bvec_iter, k,
offset_into_extent, failed, flags);
switch (ret) {
case READ_RETRY:
@@ -2006,7 +2009,8 @@ int __bch2_read_indirect_extent(struct btree_trans *trans,
if (ret)
goto err;
- if (k.k->type != KEY_TYPE_reflink_v) {
+ if (k.k->type != KEY_TYPE_reflink_v &&
+ k.k->type != KEY_TYPE_indirect_inline_data) {
__bcache_io_error(trans->c,
"pointer to nonexistent indirect extent");
ret = -EIO;
@@ -2020,11 +2024,12 @@ err:
return ret;
}
-int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig,
+int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig,
struct bvec_iter iter, struct bkey_s_c k,
unsigned offset_into_extent,
struct bch_io_failures *failed, unsigned flags)
{
+ struct bch_fs *c = trans->c;
struct extent_ptr_decoded pick;
struct bch_read_bio *rbio = NULL;
struct bch_dev *ca;
@@ -2033,13 +2038,12 @@ int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig,
struct bpos pos = bkey_start_pos(k.k);
int pick_ret;
- if (k.k->type == KEY_TYPE_inline_data) {
- struct bkey_s_c_inline_data d = bkey_s_c_to_inline_data(k);
+ if (bkey_extent_is_inline_data(k.k)) {
unsigned bytes = min_t(unsigned, iter.bi_size,
- bkey_val_bytes(d.k));
+ bkey_inline_data_bytes(k.k));
swap(iter.bi_size, bytes);
- memcpy_to_bio(&orig->bio, iter, d.v->data);
+ memcpy_to_bio(&orig->bio, iter, bkey_inline_data_p(k));
swap(iter.bi_size, bytes);
bio_advance_iter(&orig->bio, &iter, bytes);
zero_fill_bio_iter(&orig->bio, iter);
@@ -2192,9 +2196,9 @@ get_bio:
bch2_increment_clock(c, bio_sectors(&rbio->bio), READ);
- rcu_read_lock();
- bucket_io_clock_reset(c, ca, PTR_BUCKET_NR(ca, &pick.ptr), READ);
- rcu_read_unlock();
+ if (pick.ptr.cached)
+ bch2_bucket_io_time_reset(trans, pick.ptr.dev,
+ PTR_BUCKET_NR(ca, &pick.ptr), READ);
if (!(flags & (BCH_READ_IN_RETRY|BCH_READ_LAST_FRAGMENT))) {
bio_inc_remaining(&orig->bio);
@@ -2311,13 +2315,14 @@ retry:
sectors = k.k->size - offset_into_extent;
bkey_on_stack_reassemble(&sk, c, k);
- k = bkey_i_to_s_c(sk.k);
ret = bch2_read_indirect_extent(&trans,
&offset_into_extent, &sk);
if (ret)
goto err;
+ k = bkey_i_to_s_c(sk.k);
+
/*
* With indirect extents, the amount of data to read is the min
* of the original extent and the indirect extent:
@@ -2336,7 +2341,7 @@ retry:
if (rbio->bio.bi_iter.bi_size == bytes)
flags |= BCH_READ_LAST_FRAGMENT;
- bch2_read_extent(c, rbio, k, offset_into_extent, flags);
+ bch2_read_extent(&trans, rbio, k, offset_into_extent, flags);
if (flags & BCH_READ_LAST_FRAGMENT)
break;
diff --git a/fs/bcachefs/io.h b/fs/bcachefs/io.h
index ded468d70f09..e6aac594f3e6 100644
--- a/fs/bcachefs/io.h
+++ b/fs/bcachefs/io.h
@@ -136,17 +136,17 @@ enum bch_read_flags {
BCH_READ_IN_RETRY = 1 << 7,
};
-int __bch2_read_extent(struct bch_fs *, struct bch_read_bio *,
+int __bch2_read_extent(struct btree_trans *, struct bch_read_bio *,
struct bvec_iter, struct bkey_s_c, unsigned,
struct bch_io_failures *, unsigned);
-static inline void bch2_read_extent(struct bch_fs *c,
+static inline void bch2_read_extent(struct btree_trans *trans,
struct bch_read_bio *rbio,
struct bkey_s_c k,
unsigned offset_into_extent,
unsigned flags)
{
- __bch2_read_extent(c, rbio, rbio->bio.bi_iter, k,
+ __bch2_read_extent(trans, rbio, rbio->bio.bi_iter, k,
offset_into_extent, NULL, flags);
}
diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c
index 210ad1b0c469..c2cafd3892a4 100644
--- a/fs/bcachefs/journal.c
+++ b/fs/bcachefs/journal.c
@@ -18,6 +18,8 @@
#include <trace/events/bcachefs.h>
+static inline struct journal_buf *journal_seq_to_buf(struct journal *, u64);
+
static bool __journal_entry_is_open(union journal_res_state state)
{
return state.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL;
@@ -305,6 +307,19 @@ u64 bch2_inode_journal_seq(struct journal *j, u64 inode)
return seq;
}
+void bch2_journal_set_has_inum(struct journal *j, u64 inode, u64 seq)
+{
+ size_t h = hash_64(inode, ilog2(sizeof(j->buf[0].has_inode) * 8));
+ struct journal_buf *buf;
+
+ spin_lock(&j->lock);
+
+ if ((buf = journal_seq_to_buf(j, seq)))
+ set_bit(h, buf->has_inode);
+
+ spin_unlock(&j->lock);
+}
+
static int __journal_res_get(struct journal *j, struct journal_res *res,
unsigned flags)
{
@@ -965,9 +980,11 @@ void bch2_fs_journal_stop(struct journal *j)
wait_event(j->wait, journal_entry_close(j));
- /* do we need to write another journal entry? */
- if (test_bit(JOURNAL_NOT_EMPTY, &j->flags))
- bch2_journal_meta(j);
+ /*
+ * Always write a new journal entry, to make sure the clock hands are up
+ * to date (and match the superblock)
+ */
+ bch2_journal_meta(j);
journal_quiesce(j);
diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h
index 1dde0b5d963f..f60bc964ee1f 100644
--- a/fs/bcachefs/journal.h
+++ b/fs/bcachefs/journal.h
@@ -147,6 +147,7 @@ static inline u64 journal_cur_seq(struct journal *j)
}
u64 bch2_inode_journal_seq(struct journal *, u64);
+void bch2_journal_set_has_inum(struct journal *, u64, u64);
static inline int journal_state_count(union journal_res_state s, int idx)
{
@@ -281,7 +282,7 @@ static inline void bch2_journal_res_put(struct journal *j,
if (!res->ref)
return;
- lock_release(&j->res_map, 0, _THIS_IP_);
+ lock_release(&j->res_map, _THIS_IP_);
while (res->u64s)
bch2_journal_add_entry(j, res,
diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c
index 57591983eebd..18e45296e7de 100644
--- a/fs/bcachefs/journal_reclaim.c
+++ b/fs/bcachefs/journal_reclaim.c
@@ -465,34 +465,12 @@ static bool journal_flush_pins(struct journal *j, u64 seq_to_flush,
return ret;
}
-/**
- * bch2_journal_reclaim - free up journal buckets
- *
- * Background journal reclaim writes out btree nodes. It should be run
- * early enough so that we never completely run out of journal buckets.
- *
- * High watermarks for triggering background reclaim:
- * - FIFO has fewer than 512 entries left
- * - fewer than 25% journal buckets free
- *
- * Background reclaim runs until low watermarks are reached:
- * - FIFO has more than 1024 entries left
- * - more than 50% journal buckets free
- *
- * As long as a reclaim can complete in the time it takes to fill up
- * 512 journal entries or 25% of all journal buckets, then
- * journal_next_bucket() should not stall.
- */
-void bch2_journal_reclaim(struct journal *j)
+static u64 journal_seq_to_flush(struct journal *j)
{
struct bch_fs *c = container_of(j, struct bch_fs, journal);
struct bch_dev *ca;
- unsigned iter, min_nr = 0;
u64 seq_to_flush = 0;
-
- lockdep_assert_held(&j->reclaim_lock);
-
- bch2_journal_do_discards(j);
+ unsigned iter;
spin_lock(&j->lock);
@@ -524,20 +502,52 @@ void bch2_journal_reclaim(struct journal *j)
(j->pin.size >> 1));
spin_unlock(&j->lock);
- /*
- * If it's been longer than j->reclaim_delay_ms since we last flushed,
- * make sure to flush at least one journal pin:
- */
- if (time_after(jiffies, j->last_flushed +
- msecs_to_jiffies(j->reclaim_delay_ms)))
- min_nr = 1;
+ return seq_to_flush;
+}
- if (j->prereserved.reserved * 2 > j->prereserved.remaining) {
- seq_to_flush = max(seq_to_flush, journal_last_seq(j));
- min_nr = 1;
- }
+/**
+ * bch2_journal_reclaim - free up journal buckets
+ *
+ * Background journal reclaim writes out btree nodes. It should be run
+ * early enough so that we never completely run out of journal buckets.
+ *
+ * High watermarks for triggering background reclaim:
+ * - FIFO has fewer than 512 entries left
+ * - fewer than 25% journal buckets free
+ *
+ * Background reclaim runs until low watermarks are reached:
+ * - FIFO has more than 1024 entries left
+ * - more than 50% journal buckets free
+ *
+ * As long as a reclaim can complete in the time it takes to fill up
+ * 512 journal entries or 25% of all journal buckets, then
+ * journal_next_bucket() should not stall.
+ */
+void bch2_journal_reclaim(struct journal *j)
+{
+ struct bch_fs *c = container_of(j, struct bch_fs, journal);
+ unsigned min_nr = 0;
+ u64 seq_to_flush = 0;
+
+ lockdep_assert_held(&j->reclaim_lock);
+
+ do {
+ bch2_journal_do_discards(j);
+
+ seq_to_flush = journal_seq_to_flush(j);
+ min_nr = 0;
+
+ /*
+ * If it's been longer than j->reclaim_delay_ms since we last flushed,
+ * make sure to flush at least one journal pin:
+ */
+ if (time_after(jiffies, j->last_flushed +
+ msecs_to_jiffies(j->reclaim_delay_ms)))
+ min_nr = 1;
- journal_flush_pins(j, seq_to_flush, min_nr);
+ if (j->prereserved.reserved * 2 > j->prereserved.remaining)
+ min_nr = 1;
+ } while (journal_flush_pins(j, seq_to_flush, min_nr));
if (!bch2_journal_error(j))
queue_delayed_work(c->journal_reclaim_wq, &j->reclaim_work,
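Illustrative aside, not part of the patch: the watermarks described in the bch2_journal_reclaim() comment form a simple hysteresis - reclaim starts once the journal FIFO drops below the high watermark and keeps going until it is back above the low watermark. A toy sketch of that policy, with made-up names and only the FIFO criterion shown:

#include <stdbool.h>

struct reclaim_state {
	bool running;
};

static bool should_reclaim(struct reclaim_state *s, unsigned fifo_entries_left)
{
	if (fifo_entries_left < 512)		/* high watermark: start reclaiming */
		s->running = true;
	else if (fifo_entries_left > 1024)	/* low watermark: stop */
		s->running = false;

	return s->running;
}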
diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c
index 4a2c4debd3f0..6633d21f604a 100644
--- a/fs/bcachefs/move.c
+++ b/fs/bcachefs/move.c
@@ -95,10 +95,6 @@ static int bch2_migrate_index_update(struct bch_write_op *op)
!bch2_bkey_matches_ptr(c, k, m->ptr, m->offset))
goto nomatch;
- if (m->data_cmd == DATA_REWRITE &&
- !bch2_bkey_has_device(k, m->data_opts.rewrite_dev))
- goto nomatch;
-
bkey_reassemble(&_insert.k, k);
insert = &_insert.k;
@@ -110,9 +106,19 @@ static int bch2_migrate_index_update(struct bch_write_op *op)
bch2_cut_back(new->k.p, insert);
bch2_cut_back(insert->k.p, &new->k_i);
- if (m->data_cmd == DATA_REWRITE)
- bch2_bkey_drop_device(bkey_i_to_s(insert),
- m->data_opts.rewrite_dev);
+ if (m->data_cmd == DATA_REWRITE) {
+ struct bch_extent_ptr *new_ptr, *old_ptr = (void *)
+ bch2_bkey_has_device(bkey_i_to_s_c(insert),
+ m->data_opts.rewrite_dev);
+ if (!old_ptr)
+ goto nomatch;
+
+ if (old_ptr->cached)
+ extent_for_each_ptr(extent_i_to_s(new), new_ptr)
+ new_ptr->cached = true;
+
+ bch2_bkey_drop_ptr(bkey_i_to_s(insert), old_ptr);
+ }
extent_for_each_ptr_decode(extent_i_to_s(new), p, entry) {
if (bch2_bkey_has_device(bkey_i_to_s_c(insert), p.ptr.dev)) {
@@ -260,8 +266,8 @@ int bch2_migrate_write_init(struct bch_fs *c, struct migrate_write *m,
BCH_WRITE_DATA_ENCODED|
BCH_WRITE_FROM_INTERNAL;
- m->op.nr_replicas = 1;
- m->op.nr_replicas_required = 1;
+ m->op.nr_replicas = data_opts.nr_replicas;
+ m->op.nr_replicas_required = data_opts.nr_replicas;
m->op.index_update_fn = bch2_migrate_index_update;
switch (data_cmd) {
@@ -291,14 +297,14 @@ int bch2_migrate_write_init(struct bch_fs *c, struct migrate_write *m,
unsigned compressed_sectors = 0;
bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
- if (!p.ptr.cached &&
- crc_is_compressed(p.crc) &&
- bch2_dev_in_target(c, p.ptr.dev, data_opts.target))
+ if (p.ptr.dev == data_opts.rewrite_dev &&
+ !p.ptr.cached &&
+ crc_is_compressed(p.crc))
compressed_sectors += p.crc.compressed_size;
if (compressed_sectors) {
ret = bch2_disk_reservation_add(c, &m->op.res,
- compressed_sectors,
+ k.k->size * m->op.nr_replicas,
BCH_DISK_RESERVATION_NOFAIL);
if (ret)
return ret;
@@ -320,12 +326,12 @@ static void move_free(struct closure *cl)
{
struct moving_io *io = container_of(cl, struct moving_io, cl);
struct moving_context *ctxt = io->write.ctxt;
+ struct bvec_iter_all iter;
struct bio_vec *bv;
- unsigned i;
bch2_disk_reservation_put(io->write.op.c, &io->write.op.res);
- bio_for_each_segment_all(bv, &io->write.op.wbio.bio, i)
+ bio_for_each_segment_all(bv, &io->write.op.wbio.bio, iter)
if (bv->bv_page)
__free_page(bv->bv_page);
@@ -409,7 +415,7 @@ static void bch2_move_ctxt_wait_for_io(struct moving_context *ctxt)
atomic_read(&ctxt->write_sectors) != sectors_pending);
}
-static int bch2_move_extent(struct bch_fs *c,
+static int bch2_move_extent(struct btree_trans *trans,
struct moving_context *ctxt,
struct write_point_specifier wp,
struct bch_io_opts io_opts,
@@ -418,6 +424,7 @@ static int bch2_move_extent(struct bch_fs *c,
enum data_cmd data_cmd,
struct data_opts data_opts)
{
+ struct bch_fs *c = trans->c;
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
struct moving_io *io;
const union bch_extent_entry *entry;
@@ -484,7 +491,7 @@ static int bch2_move_extent(struct bch_fs *c,
* ctxt when doing wakeup
*/
closure_get(&ctxt->cl);
- bch2_read_extent(c, &io->rbio, k, 0,
+ bch2_read_extent(trans, &io->rbio, k, 0,
BCH_READ_NODECODE|
BCH_READ_LAST_FRAGMENT);
return 0;
@@ -602,7 +609,7 @@ peek:
k = bkey_i_to_s_c(sk.k);
bch2_trans_unlock(&trans);
- ret2 = bch2_move_extent(c, ctxt, wp, io_opts, btree_id, k,
+ ret2 = bch2_move_extent(&trans, ctxt, wp, io_opts, btree_id, k,
data_cmd, data_opts);
if (ret2) {
if (ret2 == -ENOMEM) {
@@ -749,6 +756,7 @@ static enum data_cmd rereplicate_pred(struct bch_fs *c, void *arg,
return DATA_SKIP;
data_opts->target = 0;
+ data_opts->nr_replicas = 1;
data_opts->btree_insert_flags = 0;
return DATA_ADD_REPLICAS;
}
@@ -764,6 +772,7 @@ static enum data_cmd migrate_pred(struct bch_fs *c, void *arg,
return DATA_SKIP;
data_opts->target = 0;
+ data_opts->nr_replicas = 1;
data_opts->btree_insert_flags = 0;
data_opts->rewrite_dev = op->migrate.dev;
return DATA_REWRITE;
diff --git a/fs/bcachefs/move.h b/fs/bcachefs/move.h
index 0acd1720d4f8..b04bc669226d 100644
--- a/fs/bcachefs/move.h
+++ b/fs/bcachefs/move.h
@@ -20,7 +20,8 @@ enum data_cmd {
struct data_opts {
u16 target;
- unsigned rewrite_dev;
+ u8 rewrite_dev;
+ u8 nr_replicas;
int btree_insert_flags;
};
diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c
index de0a7974ec9f..ddfda1ef8a79 100644
--- a/fs/bcachefs/movinggc.c
+++ b/fs/bcachefs/movinggc.c
@@ -53,17 +53,21 @@ static int bucket_offset_cmp(const void *_l, const void *_r, size_t size)
cmp_int(l->offset, r->offset);
}
-static int __copygc_pred(struct bch_fs *c, struct bkey_s_c k)
+static enum data_cmd copygc_pred(struct bch_fs *c, void *arg,
+ struct bkey_s_c k,
+ struct bch_io_opts *io_opts,
+ struct data_opts *data_opts)
{
copygc_heap *h = &c->copygc_heap;
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
- const struct bch_extent_ptr *ptr;
+ const union bch_extent_entry *entry;
+ struct extent_ptr_decoded p;
- bkey_for_each_ptr(ptrs, ptr) {
- struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
+ struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev);
struct copygc_heap_entry search = {
- .dev = ptr->dev,
- .offset = ptr->offset
+ .dev = p.ptr.dev,
+ .offset = p.ptr.offset,
};
ssize_t i = eytzinger0_find_le(h->data, h->used,
@@ -81,27 +85,24 @@ static int __copygc_pred(struct bch_fs *c, struct bkey_s_c k)
BUG_ON(i != j);
#endif
if (i >= 0 &&
- ptr->offset < h->data[i].offset + ca->mi.bucket_size &&
- ptr->gen == h->data[i].gen)
- return ptr->dev;
- }
+ p.ptr.offset < h->data[i].offset + ca->mi.bucket_size &&
+ p.ptr.gen == h->data[i].gen) {
+ data_opts->target = io_opts->background_target;
+ data_opts->nr_replicas = 1;
+ data_opts->btree_insert_flags = BTREE_INSERT_USE_RESERVE;
+ data_opts->rewrite_dev = p.ptr.dev;
- return -1;
-}
+ if (p.has_ec) {
+ struct stripe *m = genradix_ptr(&c->stripes[0], p.ec.idx);
-static enum data_cmd copygc_pred(struct bch_fs *c, void *arg,
- struct bkey_s_c k,
- struct bch_io_opts *io_opts,
- struct data_opts *data_opts)
-{
- int dev_idx = __copygc_pred(c, k);
- if (dev_idx < 0)
- return DATA_SKIP;
-
- data_opts->target = io_opts->background_target;
- data_opts->btree_insert_flags = BTREE_INSERT_USE_RESERVE;
- data_opts->rewrite_dev = dev_idx;
- return DATA_REWRITE;
+ data_opts->nr_replicas += m->nr_redundant;
+ }
+
+ return DATA_REWRITE;
+ }
+ }
+
+ return DATA_SKIP;
}
static bool have_copygc_reserve(struct bch_dev *ca)
@@ -168,7 +169,8 @@ static int bch2_copygc(struct bch_fs *c)
buckets = bucket_array(ca);
for (b = buckets->first_bucket; b < buckets->nbuckets; b++) {
- struct bucket_mark m = READ_ONCE(buckets->b[b].mark);
+ struct bucket *g = buckets->b + b;
+ struct bucket_mark m = READ_ONCE(g->mark);
struct copygc_heap_entry e;
if (m.owned_by_allocator ||
@@ -177,9 +179,12 @@ static int bch2_copygc(struct bch_fs *c)
bucket_sectors_used(m) >= ca->mi.bucket_size)
continue;
+ WARN_ON(m.stripe && !g->ec_redundancy);
+
e = (struct copygc_heap_entry) {
.dev = dev_idx,
.gen = m.gen,
+ .replicas = 1 + g->ec_redundancy,
.fragmentation = bucket_sectors_used(m) * (1U << 15)
/ ca->mi.bucket_size,
.sectors = bucket_sectors_used(m),
@@ -196,11 +201,11 @@ static int bch2_copygc(struct bch_fs *c)
}
for (i = h->data; i < h->data + h->used; i++)
- sectors_to_move += i->sectors;
+ sectors_to_move += i->sectors * i->replicas;
while (sectors_to_move > sectors_reserved) {
BUG_ON(!heap_pop(h, e, -fragmentation_cmp, NULL));
- sectors_to_move -= e.sectors;
+ sectors_to_move -= e.sectors * e.replicas;
}
buckets_to_move = h->used;
diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c
index afe25cd26c06..97a36ac0beea 100644
--- a/fs/bcachefs/opts.c
+++ b/fs/bcachefs/opts.c
@@ -247,7 +247,7 @@ int bch2_opt_parse(struct bch_fs *c, const struct bch_option *opt,
break;
case BCH_OPT_FN:
if (!c)
- return -EINVAL;
+ return 0;
return opt->parse(c, val, res);
}
@@ -325,7 +325,8 @@ int bch2_opts_check_may_set(struct bch_fs *c)
return 0;
}
-int bch2_parse_mount_opts(struct bch_opts *opts, char *options)
+int bch2_parse_mount_opts(struct bch_fs *c, struct bch_opts *opts,
+ char *options)
{
char *opt, *name, *val;
int ret, id;
@@ -340,7 +341,7 @@ int bch2_parse_mount_opts(struct bch_opts *opts, char *options)
if (id < 0)
goto bad_opt;
- ret = bch2_opt_parse(NULL, &bch2_opt_table[id], val, &v);
+ ret = bch2_opt_parse(c, &bch2_opt_table[id], val, &v);
if (ret < 0)
goto bad_val;
} else {
diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h
index 014c608ca0c6..710a7ee67039 100644
--- a/fs/bcachefs/opts.h
+++ b/fs/bcachefs/opts.h
@@ -185,7 +185,7 @@ enum opt_type {
x(inline_data, u8, \
OPT_MOUNT|OPT_RUNTIME, \
OPT_BOOL(), \
- NO_SB_OPT, false, \
+ NO_SB_OPT, true, \
NULL, "Enable inline data extents") \
x(acl, u8, \
OPT_FORMAT|OPT_MOUNT, \
@@ -418,7 +418,7 @@ void bch2_opt_to_text(struct printbuf *, struct bch_fs *,
int bch2_opt_check_may_set(struct bch_fs *, int, u64);
int bch2_opts_check_may_set(struct bch_fs *);
-int bch2_parse_mount_opts(struct bch_opts *, char *);
+int bch2_parse_mount_opts(struct bch_fs *, struct bch_opts *, char *);
/* inode opts: */
diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c
index 56a1f761271f..44d2651be970 100644
--- a/fs/bcachefs/rebalance.c
+++ b/fs/bcachefs/rebalance.c
@@ -73,6 +73,7 @@ static enum data_cmd rebalance_pred(struct bch_fs *c, void *arg,
{
if (__bch2_rebalance_pred(c, k, io_opts) >= 0) {
data_opts->target = io_opts->background_target;
+ data_opts->nr_replicas = 1;
data_opts->btree_insert_flags = 0;
return DATA_ADD_REPLICAS;
} else {
diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c
index 6e829bf0a31f..1745cfac6b26 100644
--- a/fs/bcachefs/recovery.c
+++ b/fs/bcachefs/recovery.c
@@ -25,6 +25,18 @@
#define QSTR(n) { { { .len = strlen(n) } }, .name = n }
+/* for -o reconstruct_alloc: */
+static void drop_alloc_keys(struct journal_keys *keys)
+{
+ size_t src, dst;
+
+ for (src = 0, dst = 0; src < keys->nr; src++)
+ if (keys->d[src].btree_id != BTREE_ID_ALLOC)
+ keys->d[dst++] = keys->d[src];
+
+ keys->nr = dst;
+}
+
/* iterate over keys read from the journal: */
static struct journal_key *journal_key_search(struct journal_keys *journal_keys,
@@ -845,9 +857,11 @@ static int verify_superblock_clean(struct bch_fs *c,
}
mustfix_fsck_err_on(j->read_clock != clean->read_clock, c,
- "superblock read clock doesn't match journal after clean shutdown");
+ "superblock read clock %u doesn't match journal %u after clean shutdown",
+ clean->read_clock, j->read_clock);
mustfix_fsck_err_on(j->write_clock != clean->write_clock, c,
- "superblock read clock doesn't match journal after clean shutdown");
+ "superblock write clock %u doesn't match journal %u after clean shutdown",
+ clean->write_clock, j->write_clock);
for (i = 0; i < BTREE_ID_NR; i++) {
char buf1[200], buf2[200];
@@ -928,7 +942,6 @@ static int read_btree_roots(struct bch_fs *c)
continue;
}
-
if (r->error) {
__fsck_err(c, i == BTREE_ID_ALLOC
? FSCK_CAN_IGNORE : 0,
@@ -961,7 +974,7 @@ int bch2_fs_recovery(struct bch_fs *c)
const char *err = "cannot allocate memory";
struct bch_sb_field_clean *clean = NULL;
u64 journal_seq;
- bool wrote = false, write_sb = false;
+ bool write_sb = false, need_write_alloc = false;
int ret;
if (c->sb.clean)
@@ -1025,6 +1038,11 @@ int bch2_fs_recovery(struct bch_fs *c)
goto err;
}
+ if (c->opts.reconstruct_alloc) {
+ c->sb.compat &= ~(1ULL << BCH_COMPAT_FEAT_ALLOC_INFO);
+ drop_alloc_keys(&c->journal_keys);
+ }
+
ret = journal_replay_early(c, clean, &c->journal_entries);
if (ret)
goto err;
@@ -1090,8 +1108,10 @@ int bch2_fs_recovery(struct bch_fs *c)
bch_info(c, "starting metadata mark and sweep");
err = "error in mark and sweep";
ret = bch2_gc(c, &c->journal_keys, true, true);
- if (ret)
+ if (ret < 0)
goto err;
+ if (ret)
+ need_write_alloc = true;
bch_verbose(c, "mark and sweep done");
}
@@ -1101,8 +1121,10 @@ int bch2_fs_recovery(struct bch_fs *c)
bch_info(c, "starting mark and sweep");
err = "error in mark and sweep";
ret = bch2_gc(c, &c->journal_keys, true, false);
- if (ret)
+ if (ret < 0)
goto err;
+ if (ret)
+ need_write_alloc = true;
bch_verbose(c, "mark and sweep done");
}
@@ -1126,7 +1148,7 @@ int bch2_fs_recovery(struct bch_fs *c)
goto err;
bch_verbose(c, "journal replay done");
- if (!c->opts.nochanges) {
+ if (need_write_alloc && !c->opts.nochanges) {
/*
* note that even when filesystem was clean there might be work
* to do here, if we ran gc (because of fsck) which recalculated
@@ -1134,8 +1156,8 @@ int bch2_fs_recovery(struct bch_fs *c)
*/
bch_verbose(c, "writing allocation info");
err = "error writing out alloc info";
- ret = bch2_stripes_write(c, BTREE_INSERT_LAZY_RW, &wrote) ?:
- bch2_alloc_write(c, BTREE_INSERT_LAZY_RW, &wrote);
+ ret = bch2_stripes_write(c, BTREE_INSERT_LAZY_RW) ?:
+ bch2_alloc_write(c, BTREE_INSERT_LAZY_RW);
if (ret) {
bch_err(c, "error writing alloc info");
goto err;
@@ -1281,15 +1303,29 @@ int bch2_fs_initialize(struct bch_fs *c)
bch2_fs_journal_start(&c->journal, 1, &journal);
bch2_journal_set_replay_done(&c->journal);
+ err = "error going read-write";
+ ret = bch2_fs_read_write_early(c);
+ if (ret)
+ goto err;
+
+ /*
+ * Write out the superblock and journal buckets, now that we can do
+ * btree updates
+ */
+ err = "error writing alloc info";
+ ret = bch2_alloc_write(c, 0);
+ if (ret)
+ goto err;
+
bch2_inode_init(c, &root_inode, 0, 0,
S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO, 0, NULL);
root_inode.bi_inum = BCACHEFS_ROOT_INO;
- bch2_inode_pack(&packed_inode, &root_inode);
+ bch2_inode_pack(c, &packed_inode, &root_inode);
err = "error creating root directory";
ret = bch2_btree_insert(c, BTREE_ID_INODES,
&packed_inode.inode.k_i,
- NULL, NULL, BTREE_INSERT_LAZY_RW);
+ NULL, NULL, 0);
if (ret)
goto err;
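
The drop_alloc_keys() helper added above filters the journal key array in place with the standard two-index compaction idiom; the same series also makes bch2_gc() return a positive value when alloc info needs rewriting, which is what feeds need_write_alloc. A self-contained sketch of the compaction idiom on a plain array (names here are illustrative, not kernel code):

#include <stdio.h>

/* Keep only the elements that do not match 'drop', compacting in place. */
static size_t drop_matching(int *a, size_t nr, int drop)
{
	size_t src, dst;

	for (src = 0, dst = 0; src < nr; src++)
		if (a[src] != drop)
			a[dst++] = a[src];

	return dst;	/* new element count */
}

int main(void)
{
	int keys[] = { 1, 7, 7, 3, 7, 5 };
	size_t nr = drop_matching(keys, 6, 7);
	size_t i;

	for (i = 0; i < nr; i++)
		printf("%d ", keys[i]);
	printf("\n");	/* prints: 1 3 5 */
	return 0;
}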
diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c
index 3c473f1380a6..8abcbfb3bd64 100644
--- a/fs/bcachefs/reflink.c
+++ b/fs/bcachefs/reflink.c
@@ -9,6 +9,18 @@
#include <linux/sched/signal.h>
+static inline unsigned bkey_type_to_indirect(const struct bkey *k)
+{
+ switch (k->type) {
+ case KEY_TYPE_extent:
+ return KEY_TYPE_reflink_v;
+ case KEY_TYPE_inline_data:
+ return KEY_TYPE_indirect_inline_data;
+ default:
+ return 0;
+ }
+}
+
/* reflink pointers */
const char *bch2_reflink_p_invalid(const struct bch_fs *c, struct bkey_s_c k)
@@ -71,17 +83,42 @@ void bch2_reflink_v_to_text(struct printbuf *out, struct bch_fs *c,
bch2_bkey_ptrs_to_text(out, c, k);
}
+/* indirect inline data */
+
+const char *bch2_indirect_inline_data_invalid(const struct bch_fs *c,
+ struct bkey_s_c k)
+{
+ if (bkey_val_bytes(k.k) < sizeof(struct bch_indirect_inline_data))
+ return "incorrect value size";
+ return NULL;
+}
+
+void bch2_indirect_inline_data_to_text(struct printbuf *out,
+ struct bch_fs *c, struct bkey_s_c k)
+{
+ struct bkey_s_c_indirect_inline_data d = bkey_s_c_to_indirect_inline_data(k);
+ unsigned datalen = bkey_inline_data_bytes(k.k);
+
+ pr_buf(out, "refcount %llu datalen %u: %*phN",
+ le64_to_cpu(d.v->refcount), datalen,
+ min(datalen, 32U), d.v->data);
+}
+
static int bch2_make_extent_indirect(struct btree_trans *trans,
struct btree_iter *extent_iter,
- struct bkey_i_extent *e)
+ struct bkey_i *orig)
{
struct bch_fs *c = trans->c;
struct btree_iter *reflink_iter;
struct bkey_s_c k;
- struct bkey_i_reflink_v *r_v;
+ struct bkey_i *r_v;
struct bkey_i_reflink_p *r_p;
+ __le64 *refcount;
int ret;
+ if (orig->k.type == KEY_TYPE_inline_data)
+ bch2_check_set_feature(c, BCH_FEATURE_reflink_inline_data);
+
for_each_btree_key(trans, reflink_iter, BTREE_ID_REFLINK,
POS(0, c->reflink_hint),
BTREE_ITER_INTENT|BTREE_ITER_SLOTS, k, ret) {
@@ -90,7 +127,7 @@ static int bch2_make_extent_indirect(struct btree_trans *trans,
continue;
}
- if (bkey_deleted(k.k) && e->k.size <= k.k->size)
+ if (bkey_deleted(k.k) && orig->k.size <= k.k->size)
break;
}
@@ -100,29 +137,31 @@ static int bch2_make_extent_indirect(struct btree_trans *trans,
/* rewind iter to start of hole, if necessary: */
bch2_btree_iter_set_pos(reflink_iter, bkey_start_pos(k.k));
- r_v = bch2_trans_kmalloc(trans, sizeof(*r_v) + bkey_val_bytes(&e->k));
+ r_v = bch2_trans_kmalloc(trans, sizeof(__le64) + bkey_val_bytes(&orig->k));
ret = PTR_ERR_OR_ZERO(r_v);
if (ret)
goto err;
- bkey_reflink_v_init(&r_v->k_i);
+ bkey_init(&r_v->k);
+ r_v->k.type = bkey_type_to_indirect(&orig->k);
r_v->k.p = reflink_iter->pos;
- bch2_key_resize(&r_v->k, e->k.size);
- r_v->k.version = e->k.version;
+ bch2_key_resize(&r_v->k, orig->k.size);
+ r_v->k.version = orig->k.version;
+
+ set_bkey_val_bytes(&r_v->k, sizeof(__le64) + bkey_val_bytes(&orig->k));
- set_bkey_val_u64s(&r_v->k, bkey_val_u64s(&r_v->k) +
- bkey_val_u64s(&e->k));
- r_v->v.refcount = 0;
- memcpy(r_v->v.start, e->v.start, bkey_val_bytes(&e->k));
+ refcount = (void *) &r_v->v;
+ *refcount = 0;
+ memcpy(refcount + 1, &orig->v, bkey_val_bytes(&orig->k));
- bch2_trans_update(trans, reflink_iter, &r_v->k_i, 0);
+ bch2_trans_update(trans, reflink_iter, r_v, 0);
r_p = bch2_trans_kmalloc(trans, sizeof(*r_p));
if (IS_ERR(r_p))
return PTR_ERR(r_p);
- e->k.type = KEY_TYPE_reflink_p;
- r_p = bkey_i_to_reflink_p(&e->k_i);
+ orig->k.type = KEY_TYPE_reflink_p;
+ r_p = bkey_i_to_reflink_p(orig);
set_bkey_val_bytes(&r_p->k, sizeof(r_p->v));
r_p->v.idx = cpu_to_le64(bkey_start_offset(&r_v->k));
@@ -144,8 +183,7 @@ static struct bkey_s_c get_next_src(struct btree_iter *iter, struct bpos end)
if (bkey_cmp(iter->pos, end) >= 0)
return bkey_s_c_null;
- if (k.k->type == KEY_TYPE_extent ||
- k.k->type == KEY_TYPE_reflink_p)
+ if (bkey_extent_is_data(k.k))
break;
}
@@ -218,7 +256,7 @@ s64 bch2_remap_range(struct bch_fs *c,
if (!bkey_cmp(dst_iter->pos, dst_end))
break;
- if (src_k.k->type == KEY_TYPE_extent) {
+ if (src_k.k->type != KEY_TYPE_reflink_p) {
bkey_on_stack_reassemble(&new_src, c, src_k);
src_k = bkey_i_to_s_c(new_src.k);
@@ -226,7 +264,7 @@ s64 bch2_remap_range(struct bch_fs *c,
bch2_cut_back(src_end, new_src.k);
ret = bch2_make_extent_indirect(&trans, src_iter,
- bkey_i_to_extent(new_src.k));
+ new_src.k);
if (ret)
goto btree_err;
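
bch2_make_extent_indirect() is now type-agnostic: rather than building a struct bkey_i_reflink_v, it picks the indirect key type with bkey_type_to_indirect() and lays the value out as a 64-bit refcount followed by a byte-for-byte copy of the original value, which is what lets KEY_TYPE_inline_data become KEY_TYPE_indirect_inline_data. A rough userspace illustration of that value layout (the struct and names are invented for the example):

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/*
 * Illustrative layout only: an indirect value is a 64-bit refcount
 * immediately followed by the original key's value bytes.
 */
struct indirect_val {
	uint64_t refcount;
	uint8_t  data[];	/* original extent or inline-data value */
};

int main(void)
{
	const char orig_val[] = "inline data payload";
	struct indirect_val *v = malloc(sizeof(*v) + sizeof(orig_val));

	if (!v)
		return 1;

	v->refcount = 0;				/* fresh indirection starts unreferenced */
	memcpy(v->data, orig_val, sizeof(orig_val));	/* original value follows the refcount */

	printf("indirect value is %zu bytes (8 + %zu)\n",
	       sizeof(*v) + sizeof(orig_val), sizeof(orig_val));
	free(v);
	return 0;
}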
diff --git a/fs/bcachefs/reflink.h b/fs/bcachefs/reflink.h
index 5445c1cf0797..9d5e7dc58f2b 100644
--- a/fs/bcachefs/reflink.h
+++ b/fs/bcachefs/reflink.h
@@ -18,13 +18,22 @@ const char *bch2_reflink_v_invalid(const struct bch_fs *, struct bkey_s_c);
void bch2_reflink_v_to_text(struct printbuf *, struct bch_fs *,
struct bkey_s_c);
-
#define bch2_bkey_ops_reflink_v (struct bkey_ops) { \
.key_invalid = bch2_reflink_v_invalid, \
.val_to_text = bch2_reflink_v_to_text, \
.swab = bch2_ptr_swab, \
}
+const char *bch2_indirect_inline_data_invalid(const struct bch_fs *,
+ struct bkey_s_c);
+void bch2_indirect_inline_data_to_text(struct printbuf *,
+ struct bch_fs *, struct bkey_s_c);
+
+#define bch2_bkey_ops_indirect_inline_data (struct bkey_ops) { \
+ .key_invalid = bch2_indirect_inline_data_invalid, \
+ .val_to_text = bch2_indirect_inline_data_to_text, \
+}
+
s64 bch2_remap_range(struct bch_fs *, struct bpos, struct bpos,
u64, u64 *, u64, s64 *);
diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c
index 6b6506c68609..91518c0d6794 100644
--- a/fs/bcachefs/replicas.c
+++ b/fs/bcachefs/replicas.c
@@ -122,7 +122,7 @@ void bch2_bkey_to_replicas(struct bch_replicas_entry *e,
extent_to_replicas(k, e);
break;
case KEY_TYPE_stripe:
- e->data_type = BCH_DATA_user;
+ e->data_type = BCH_DATA_parity;
stripe_to_replicas(k, e);
break;
}
@@ -446,7 +446,23 @@ static int __bch2_mark_bkey_replicas(struct bch_fs *c, struct bkey_s_c k,
bch2_bkey_to_replicas(&search.e, k);
- return __bch2_mark_replicas(c, &search.e, check);
+ ret = __bch2_mark_replicas(c, &search.e, check);
+ if (ret)
+ return ret;
+
+ if (search.e.data_type == BCH_DATA_parity) {
+ search.e.data_type = BCH_DATA_cached;
+ ret = __bch2_mark_replicas(c, &search.e, check);
+ if (ret)
+ return ret;
+
+ search.e.data_type = BCH_DATA_user;
+ ret = __bch2_mark_replicas(c, &search.e, check);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
}
bool bch2_bkey_replicas_marked(struct bch_fs *c,
diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c
index 30be083b09bf..8673e9744ce1 100644
--- a/fs/bcachefs/super.c
+++ b/fs/bcachefs/super.c
@@ -149,44 +149,6 @@ struct bch_fs *bch2_uuid_to_fs(uuid_le uuid)
return c;
}
-int bch2_congested(void *data, int bdi_bits)
-{
- struct bch_fs *c = data;
- struct backing_dev_info *bdi;
- struct bch_dev *ca;
- unsigned i;
- int ret = 0;
-
- rcu_read_lock();
- if (bdi_bits & (1 << WB_sync_congested)) {
- /* Reads - check all devices: */
- for_each_readable_member(ca, c, i) {
- bdi = ca->disk_sb.bdev->bd_bdi;
-
- if (bdi_congested(bdi, bdi_bits)) {
- ret = 1;
- break;
- }
- }
- } else {
- const struct bch_devs_mask *devs =
- bch2_target_to_mask(c, c->opts.foreground_target) ?:
- &c->rw_devs[BCH_DATA_user];
-
- for_each_member_device_rcu(ca, c, i, devs) {
- bdi = ca->disk_sb.bdev->bd_bdi;
-
- if (bdi_congested(bdi, bdi_bits)) {
- ret = 1;
- break;
- }
- }
- }
- rcu_read_unlock();
-
- return ret;
-}
-
/* Filesystem RO/RW: */
/*
@@ -207,14 +169,15 @@ int bch2_congested(void *data, int bdi_bits)
static void __bch2_fs_read_only(struct bch_fs *c)
{
struct bch_dev *ca;
- bool wrote = false;
unsigned i, clean_passes = 0;
- int ret;
bch2_rebalance_stop(c);
bch2_copygc_stop(c);
bch2_gc_thread_stop(c);
+ bch2_io_timer_del(&c->io_clock[READ], &c->bucket_clock[READ].rescale);
+ bch2_io_timer_del(&c->io_clock[WRITE], &c->bucket_clock[WRITE].rescale);
+
/*
* Flush journal before stopping allocators, because flushing journal
* blacklist entries involves allocating new btree nodes:
@@ -228,20 +191,6 @@ static void __bch2_fs_read_only(struct bch_fs *c)
if (!test_bit(BCH_FS_ALLOCATOR_RUNNING, &c->flags))
goto nowrote_alloc;
- bch_verbose(c, "writing alloc info");
- /*
- * This should normally just be writing the bucket read/write clocks:
- */
- ret = bch2_stripes_write(c, BTREE_INSERT_NOCHECK_RW, &wrote) ?:
- bch2_alloc_write(c, BTREE_INSERT_NOCHECK_RW, &wrote);
- bch_verbose(c, "writing alloc info complete");
-
- if (ret && !test_bit(BCH_FS_EMERGENCY_RO, &c->flags))
- bch2_fs_inconsistent(c, "error writing out alloc info %i", ret);
-
- if (ret)
- goto nowrote_alloc;
-
bch_verbose(c, "flushing journal and stopping allocators");
bch2_journal_flush_all_pins(&c->journal);
@@ -454,6 +403,9 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early)
bch2_dev_allocator_add(c, ca);
bch2_recalc_capacity(c);
+ bch2_io_timer_add(&c->io_clock[READ], &c->bucket_clock[READ].rescale);
+ bch2_io_timer_add(&c->io_clock[WRITE], &c->bucket_clock[WRITE].rescale);
+
for_each_rw_member(ca, c, i) {
ret = bch2_dev_allocator_start(ca);
if (ret) {
@@ -496,9 +448,10 @@ int bch2_fs_read_write_early(struct bch_fs *c)
/* Filesystem startup/shutdown: */
-static void bch2_fs_free(struct bch_fs *c)
+static void __bch2_fs_free(struct bch_fs *c)
{
unsigned i;
+ int cpu;
for (i = 0; i < BCH_TIME_STAT_NR; i++)
bch2_time_stats_exit(&c->times[i]);
@@ -523,6 +476,12 @@ static void bch2_fs_free(struct bch_fs *c)
free_percpu(c->usage[1]);
free_percpu(c->usage[0]);
kfree(c->usage_base);
+
+ if (c->btree_iters_bufs)
+ for_each_possible_cpu(cpu)
+ kfree(per_cpu_ptr(c->btree_iters_bufs, cpu)->iter);
+
+ free_percpu(c->btree_iters_bufs);
free_percpu(c->pcpu);
mempool_exit(&c->large_bkey_pool);
mempool_exit(&c->btree_bounce_pool);
@@ -533,6 +492,7 @@ static void bch2_fs_free(struct bch_fs *c)
kfree(c->replicas_gc.entries);
kfree(rcu_dereference_protected(c->disk_groups, 1));
kfree(c->journal_seq_blacklist_table);
+ kfree(c->unused_inode_hints);
free_heap(&c->copygc_heap);
if (c->journal_reclaim_wq)
@@ -552,10 +512,10 @@ static void bch2_fs_release(struct kobject *kobj)
{
struct bch_fs *c = container_of(kobj, struct bch_fs, kobj);
- bch2_fs_free(c);
+ __bch2_fs_free(c);
}
-void bch2_fs_stop(struct bch_fs *c)
+void __bch2_fs_stop(struct bch_fs *c)
{
struct bch_dev *ca;
unsigned i;
@@ -586,13 +546,6 @@ void bch2_fs_stop(struct bch_fs *c)
kobject_put(&c->opts_dir);
kobject_put(&c->internal);
- mutex_lock(&bch_fs_list_lock);
- list_del(&c->list);
- mutex_unlock(&bch_fs_list_lock);
-
- closure_sync(&c->cl);
- closure_debug_destroy(&c->cl);
-
/* btree prefetch might have kicked off reads in the background: */
bch2_btree_flush_all_reads(c);
@@ -605,6 +558,22 @@ void bch2_fs_stop(struct bch_fs *c)
for (i = 0; i < c->sb.nr_devices; i++)
if (c->devs[i])
+ bch2_free_super(&c->devs[i]->disk_sb);
+}
+
+void bch2_fs_free(struct bch_fs *c)
+{
+ unsigned i;
+
+ mutex_lock(&bch_fs_list_lock);
+ list_del(&c->list);
+ mutex_unlock(&bch_fs_list_lock);
+
+ closure_sync(&c->cl);
+ closure_debug_destroy(&c->cl);
+
+ for (i = 0; i < c->sb.nr_devices; i++)
+ if (c->devs[i])
bch2_dev_free(rcu_dereference_protected(c->devs[i], 1));
bch_verbose(c, "shutdown complete");
@@ -612,6 +581,12 @@ void bch2_fs_stop(struct bch_fs *c)
kobject_put(&c->kobj);
}
+void bch2_fs_stop(struct bch_fs *c)
+{
+ __bch2_fs_stop(c);
+ bch2_fs_free(c);
+}
+
static const char *bch2_fs_online(struct bch_fs *c)
{
struct bch_dev *ca;
@@ -669,6 +644,14 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
__module_get(THIS_MODULE);
+ closure_init(&c->cl, NULL);
+
+ c->kobj.kset = bcachefs_kset;
+ kobject_init(&c->kobj, &bch2_fs_ktype);
+ kobject_init(&c->internal, &bch2_fs_internal_ktype);
+ kobject_init(&c->opts_dir, &bch2_fs_opts_dir_ktype);
+ kobject_init(&c->time_stats, &bch2_fs_time_stats_ktype);
+
c->minor = -1;
c->disk_sb.fs_sb = true;
@@ -761,11 +744,13 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
(btree_blocks(c) + 1) * 2 *
sizeof(struct sort_iter_set);
+ c->inode_shard_bits = ilog2(roundup_pow_of_two(num_possible_cpus()));
+
if (!(c->wq = alloc_workqueue("bcachefs",
WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 1)) ||
- !(c->copygc_wq = alloc_workqueue("bcache_copygc",
+ !(c->copygc_wq = alloc_workqueue("bcachefs_copygc",
WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 1)) ||
- !(c->journal_reclaim_wq = alloc_workqueue("bcache_journal",
+ !(c->journal_reclaim_wq = alloc_workqueue("bcachefs_journal_reclaim",
WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_HIGHPRI, 1)) ||
percpu_ref_init(&c->writes, bch2_writes_disabled,
PERCPU_REF_INIT_DEAD, GFP_KERNEL) ||
@@ -775,9 +760,12 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
offsetof(struct btree_write_bio, wbio.bio)),
BIOSET_NEED_BVECS) ||
!(c->pcpu = alloc_percpu(struct bch_fs_pcpu)) ||
+ !(c->btree_iters_bufs = alloc_percpu(struct btree_iter_buf)) ||
mempool_init_kvpmalloc_pool(&c->btree_bounce_pool, 1,
btree_bytes(c)) ||
mempool_init_kmalloc_pool(&c->large_bkey_pool, 1, 2048) ||
+ !(c->unused_inode_hints = kcalloc(1U << c->inode_shard_bits,
+ sizeof(u64), GFP_KERNEL)) ||
bch2_io_clock_init(&c->io_clock[READ]) ||
bch2_io_clock_init(&c->io_clock[WRITE]) ||
bch2_fs_journal_init(&c->journal) ||
@@ -799,18 +787,6 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
bch2_dev_alloc(c, i))
goto err;
- /*
- * Now that all allocations have succeeded, init various refcounty
- * things that let us shutdown:
- */
- closure_init(&c->cl, NULL);
-
- c->kobj.kset = bcachefs_kset;
- kobject_init(&c->kobj, &bch2_fs_ktype);
- kobject_init(&c->internal, &bch2_fs_internal_ktype);
- kobject_init(&c->opts_dir, &bch2_fs_opts_dir_ktype);
- kobject_init(&c->time_stats, &bch2_fs_time_stats_ktype);
-
mutex_lock(&bch_fs_list_lock);
err = bch2_fs_online(c);
mutex_unlock(&bch_fs_list_lock);
@@ -906,6 +882,13 @@ int bch2_fs_start(struct bch_fs *c)
set_bit(BCH_FS_STARTED, &c->flags);
+ /*
+ * Allocator threads don't start filling copygc reserve until after we
+ * set BCH_FS_STARTED - wake them now:
+ */
+ for_each_online_member(ca, c, i)
+ bch2_wake_allocator(ca);
+
if (c->opts.read_only || c->opts.nochanges) {
bch2_fs_read_only(c);
} else {
@@ -1683,6 +1666,11 @@ have_slot:
bch2_write_super(c);
mutex_unlock(&c->sb_lock);
+ err = "alloc write failed";
+ ret = bch2_dev_alloc_write(c, ca, 0);
+ if (ret)
+ goto err;
+
if (ca->mi.state == BCH_MEMBER_STATE_RW) {
err = __bch2_dev_read_write(c, ca);
if (err)
@@ -2037,7 +2025,6 @@ static void bcachefs_exit(void)
static int __init bcachefs_init(void)
{
bch2_bkey_pack_test();
- bch2_inode_pack_test();
if (!(bcachefs_kset = kset_create_and_add("bcachefs", NULL, fs_kobj)) ||
bch2_chardev_init() ||
diff --git a/fs/bcachefs/super.h b/fs/bcachefs/super.h
index fffee96726ce..02c81f3555c3 100644
--- a/fs/bcachefs/super.h
+++ b/fs/bcachefs/super.h
@@ -199,7 +199,6 @@ static inline struct bch_devs_mask bch2_online_devs(struct bch_fs *c)
struct bch_fs *bch2_bdev_to_fs(struct block_device *);
struct bch_fs *bch2_uuid_to_fs(uuid_le);
-int bch2_congested(void *, int);
bool bch2_dev_state_allowed(struct bch_fs *, struct bch_dev *,
enum bch_member_state, int);
@@ -231,6 +230,8 @@ static inline void bch2_fs_lazy_rw(struct bch_fs *c)
bch2_fs_read_write_early(c);
}
+void __bch2_fs_stop(struct bch_fs *);
+void bch2_fs_free(struct bch_fs *);
void bch2_fs_stop(struct bch_fs *);
int bch2_fs_start(struct bch_fs *);
diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c
index 0cb29f43d99d..d7ad293aff4d 100644
--- a/fs/bcachefs/sysfs.c
+++ b/fs/bcachefs/sysfs.c
@@ -208,12 +208,6 @@ read_attribute(io_timers_write);
write_attribute(perf_test);
#endif /* CONFIG_BCACHEFS_TESTS */
-#define BCH_DEBUG_PARAM(name, description) \
- rw_attribute(name);
-
- BCH_DEBUG_PARAMS()
-#undef BCH_DEBUG_PARAM
-
#define x(_name) \
static struct attribute sysfs_time_stat_##_name = \
{ .name = #_name, .mode = S_IRUGO };
@@ -414,10 +408,6 @@ SHOW(bch2_fs)
return out.pos - buf;
}
-#define BCH_DEBUG_PARAM(name, description) sysfs_print(name, c->name);
- BCH_DEBUG_PARAMS()
-#undef BCH_DEBUG_PARAM
-
return 0;
}
@@ -462,10 +452,6 @@ STORE(bch2_fs)
/* Debugging: */
-#define BCH_DEBUG_PARAM(name, description) sysfs_strtoul(name, c->name);
- BCH_DEBUG_PARAMS()
-#undef BCH_DEBUG_PARAM
-
if (!test_bit(BCH_FS_STARTED, &c->flags))
return -EPERM;
@@ -590,11 +576,6 @@ struct attribute *bch2_fs_internal_files[] = {
&sysfs_io_timers_write,
&sysfs_internal_uuid,
-
-#define BCH_DEBUG_PARAM(name, description) &sysfs_##name,
- BCH_DEBUG_PARAMS()
-#undef BCH_DEBUG_PARAM
-
NULL
};
diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c
index fd4044a6a08f..2709163e02b5 100644
--- a/fs/bcachefs/util.c
+++ b/fs/bcachefs/util.c
@@ -520,7 +520,7 @@ int bch2_bio_alloc_pages(struct bio *bio, size_t size, gfp_t gfp_mask)
{
while (size) {
struct page *page = alloc_page(gfp_mask);
- unsigned len = min(PAGE_SIZE, size);
+ unsigned len = min_t(size_t, PAGE_SIZE, size);
if (!page)
return -ENOMEM;
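
The min() -> min_t() change above sidesteps the kernel's mixed-type comparison check: PAGE_SIZE is an unsigned long while the remaining size is a size_t, and min() warns when its operands have different types on configurations where those differ. A userspace sketch of the same idea, with min_t() re-declared locally for illustration:

#include <stdio.h>
#include <stddef.h>

/* Local stand-in for the kernel's min_t(): cast both operands to 'type'. */
#define min_t(type, a, b) ((type) (a) < (type) (b) ? (type) (a) : (type) (b))

int main(void)
{
	unsigned long page_size = 4096;	/* plays the role of PAGE_SIZE */
	size_t remaining = 1000;

	/* Comparing both sides as size_t avoids the mixed-type comparison: */
	size_t len = min_t(size_t, page_size, remaining);

	printf("len = %zu\n", len);	/* 1000 */
	return 0;
}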
diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h
index 119c86122023..6e5335440b4b 100644
--- a/fs/bcachefs/util.h
+++ b/fs/bcachefs/util.h
@@ -37,17 +37,6 @@ struct closure;
#define atomic64_sub_bug(i, v) BUG_ON(atomic64_sub_return(i, v) < 0)
#define atomic64_add_bug(i, v) BUG_ON(atomic64_add_return(i, v) < 0)
-#define memcpy(dst, src, len) \
-({ \
- void *_dst = (dst); \
- const void *_src = (src); \
- size_t _len = (len); \
- \
- BUG_ON(!((void *) (_dst) >= (void *) (_src) + (_len) || \
- (void *) (_dst) + (_len) <= (void *) (_src))); \
- memcpy(_dst, _src, _len); \
-})
-
#else /* DEBUG */
#define EBUG_ON(cond)
@@ -99,7 +88,7 @@ static inline void *vpmalloc(size_t size, gfp_t gfp_mask)
{
return (void *) __get_free_pages(gfp_mask|__GFP_NOWARN,
get_order(size)) ?:
- __vmalloc(size, gfp_mask, PAGE_KERNEL);
+ __vmalloc(size, gfp_mask);
}
static inline void kvpfree(void *p, size_t size)
@@ -664,35 +653,6 @@ static inline void memset_u64s_tail(void *s, int c, unsigned bytes)
memset(s + bytes, c, rem);
}
-static inline struct bio_vec next_contig_bvec(struct bio *bio,
- struct bvec_iter *iter)
-{
- struct bio_vec bv = bio_iter_iovec(bio, *iter);
-
- bio_advance_iter(bio, iter, bv.bv_len);
-#ifndef CONFIG_HIGHMEM
- while (iter->bi_size) {
- struct bio_vec next = bio_iter_iovec(bio, *iter);
-
- if (page_address(bv.bv_page) + bv.bv_offset + bv.bv_len !=
- page_address(next.bv_page) + next.bv_offset)
- break;
-
- bv.bv_len += next.bv_len;
- bio_advance_iter(bio, iter, next.bv_len);
- }
-#endif
- return bv;
-}
-
-#define __bio_for_each_contig_segment(bv, bio, iter, start) \
- for (iter = (start); \
- (iter).bi_size && \
- ((bv = next_contig_bvec((bio), &(iter))), 1);)
-
-#define bio_for_each_contig_segment(bv, bio, iter) \
- __bio_for_each_contig_segment(bv, bio, iter, (bio)->bi_iter)
-
void sort_cmp_size(void *base, size_t num, size_t size,
int (*cmp_func)(const void *, const void *, size_t),
void (*swap_func)(void *, void *, size_t));
diff --git a/fs/bcachefs/varint.c b/fs/bcachefs/varint.c
new file mode 100644
index 000000000000..a3d252c741c8
--- /dev/null
+++ b/fs/bcachefs/varint.c
@@ -0,0 +1,42 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/bitops.h>
+#include <asm/unaligned.h>
+
+#include "varint.h"
+
+int bch2_varint_encode(u8 *out, u64 v)
+{
+ unsigned bits = fls64(v|1);
+ unsigned bytes = DIV_ROUND_UP(bits, 7);
+
+ if (likely(bytes < 9)) {
+ v <<= bytes;
+ v |= ~(~0 << (bytes - 1));
+ } else {
+ *out++ = 255;
+ bytes = 9;
+ }
+
+ put_unaligned_le64(v, out);
+ return bytes;
+}
+
+int bch2_varint_decode(const u8 *in, const u8 *end, u64 *out)
+{
+ u64 v = get_unaligned_le64(in);
+ unsigned bytes = ffz(v & 255) + 1;
+
+ if (unlikely(in + bytes > end))
+ return -1;
+
+ if (likely(bytes < 9)) {
+ v >>= bytes;
+ v &= ~(~0ULL << (7 * bytes));
+ } else {
+ v = get_unaligned_le64(++in);
+ }
+
+ *out = v;
+ return bytes;
+}
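
The encoding above stores the length in the low bits of the first byte: a value that fits in N encoded bytes (N < 9) is shifted left by N with the low N-1 bits set, so the decoder recovers N by locating the first zero bit; anything larger gets a 0xff prefix followed by the raw little-endian u64. A self-contained userspace sketch of just the short-form case (the helper names are made up, not the kernel API):

#include <stdint.h>
#include <stdio.h>

/*
 * Simplified model for values that fit in 1-8 encoded bytes: the count of
 * trailing 1 bits in the first byte, plus one, is the total length, and the
 * value occupies the remaining bits. The 9-byte case (first byte 0xff, raw
 * 8-byte value follows) is omitted here.
 */
static unsigned demo_encode(uint8_t *out, uint64_t v)
{
	unsigned bits = 64 - __builtin_clzll(v | 1);
	unsigned bytes = (bits + 6) / 7;
	uint64_t enc = (v << bytes) | ((1ULL << (bytes - 1)) - 1);
	unsigned i;

	for (i = 0; i < bytes; i++)
		out[i] = enc >> (8 * i);	/* little endian */
	return bytes;
}

static unsigned demo_decode(const uint8_t *in, uint64_t *out)
{
	unsigned bytes = __builtin_ctz(~in[0] & 0xff) + 1;	/* first zero bit */
	uint64_t v = 0;
	unsigned i;

	for (i = 0; i < bytes; i++)
		v |= (uint64_t) in[i] << (8 * i);

	*out = (v >> bytes) & (~0ULL >> (64 - 7 * bytes));
	return bytes;
}

int main(void)
{
	uint8_t buf[8];
	uint64_t v;
	unsigned n = demo_encode(buf, 300);

	demo_decode(buf, &v);
	printf("300 encodes to %u byte(s), decodes back to %llu\n",
	       n, (unsigned long long) v);
	return 0;
}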
diff --git a/fs/bcachefs/varint.h b/fs/bcachefs/varint.h
new file mode 100644
index 000000000000..8daf813576b7
--- /dev/null
+++ b/fs/bcachefs/varint.h
@@ -0,0 +1,8 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_VARINT_H
+#define _BCACHEFS_VARINT_H
+
+int bch2_varint_encode(u8 *, u64);
+int bch2_varint_decode(const u8 *, const u8 *, u64 *);
+
+#endif /* _BCACHEFS_VARINT_H */
diff --git a/include/trace/events/bcachefs.h b/include/trace/events/bcachefs.h
index 9b4e8295ed75..ba2c55559796 100644
--- a/include/trace/events/bcachefs.h
+++ b/include/trace/events/bcachefs.h
@@ -536,9 +536,46 @@ DEFINE_EVENT(transaction_restart, trans_restart_btree_node_reused,
TP_ARGS(ip)
);
-DEFINE_EVENT(transaction_restart, trans_restart_would_deadlock,
- TP_PROTO(unsigned long ip),
- TP_ARGS(ip)
+TRACE_EVENT(trans_restart_would_deadlock,
+ TP_PROTO(unsigned long trans_ip,
+ unsigned long caller_ip,
+ unsigned reason,
+ enum btree_id have_btree_id,
+ unsigned have_iter_type,
+ enum btree_id want_btree_id,
+ unsigned want_iter_type),
+ TP_ARGS(trans_ip, caller_ip, reason,
+ have_btree_id, have_iter_type,
+ want_btree_id, want_iter_type),
+
+ TP_STRUCT__entry(
+ __field(unsigned long, trans_ip )
+ __field(unsigned long, caller_ip )
+ __field(u8, reason )
+ __field(u8, have_btree_id )
+ __field(u8, have_iter_type )
+ __field(u8, want_btree_id )
+ __field(u8, want_iter_type )
+ ),
+
+ TP_fast_assign(
+ __entry->trans_ip = trans_ip;
+ __entry->caller_ip = caller_ip;
+ __entry->reason = reason;
+ __entry->have_btree_id = have_btree_id;
+ __entry->have_iter_type = have_iter_type;
+ __entry->want_btree_id = want_btree_id;
+ __entry->want_iter_type = want_iter_type;
+ ),
+
+ TP_printk("%pF %pF because %u have %u:%u want %u:%u",
+ (void *) __entry->trans_ip,
+ (void *) __entry->caller_ip,
+ __entry->reason,
+ __entry->have_btree_id,
+ __entry->have_iter_type,
+ __entry->want_btree_id,
+ __entry->want_iter_type)
);
TRACE_EVENT(trans_restart_iters_realloced,