author      Kent Overstreet <kent.overstreet@gmail.com>    2020-01-20 15:34:50 -0500
committer   Kent Overstreet <kent.overstreet@gmail.com>    2020-05-06 17:14:18 -0400
commit      6f4bfbc65f9100a98dba1734dbff26129a4413a8
tree        2ff581f9abb56c912e7007577bb5813f3e5de1c4
parent      181bdfee8fea1f62a6544b6913d2291ecca5fcda

Merge with fe198a9f39 bcachefs: Improve tracepoints slightly in commit path
Signed-off-by: Kent Overstreet <kent.overstreet@gmail.com>
-rw-r--r--  fs/bcachefs/acl.c                     |    3
-rw-r--r--  fs/bcachefs/alloc_background.c        |   24
-rw-r--r--  fs/bcachefs/bcachefs.h                |    4
-rw-r--r--  fs/bcachefs/bcachefs_format.h         |  156
-rw-r--r--  fs/bcachefs/bcachefs_ioctl.h          |   70
-rw-r--r--  fs/bcachefs/bkey_methods.c            |    2
-rw-r--r--  fs/bcachefs/bkey_sort.c               |  482
-rw-r--r--  fs/bcachefs/bkey_sort.h               |   28
-rw-r--r--  fs/bcachefs/bset.c                    |   24
-rw-r--r--  fs/bcachefs/bset.h                    |   36
-rw-r--r--  fs/bcachefs/btree_cache.c             |    1
-rw-r--r--  fs/bcachefs/btree_cache.h             |    2
-rw-r--r--  fs/bcachefs/btree_gc.c                |   18
-rw-r--r--  fs/bcachefs/btree_io.c                |  264
-rw-r--r--  fs/bcachefs/btree_io.h                |   13
-rw-r--r--  fs/bcachefs/btree_iter.c              |  193
-rw-r--r--  fs/bcachefs/btree_iter.h              |    6
-rw-r--r--  fs/bcachefs/btree_types.h             |   66
-rw-r--r--  fs/bcachefs/btree_update.h            |   65
-rw-r--r--  fs/bcachefs/btree_update_interior.c   |   41
-rw-r--r--  fs/bcachefs/btree_update_interior.h   |   29
-rw-r--r--  fs/bcachefs/btree_update_leaf.c       |  482
-rw-r--r--  fs/bcachefs/buckets.c                 |  147
-rw-r--r--  fs/bcachefs/buckets.h                 |   17
-rw-r--r--  fs/bcachefs/chardev.c                 |  152
-rw-r--r--  fs/bcachefs/checksum.c                |   31
-rw-r--r--  fs/bcachefs/checksum.h                |   10
-rw-r--r--  fs/bcachefs/clock.c                   |   43
-rw-r--r--  fs/bcachefs/clock.h                   |    6
-rw-r--r--  fs/bcachefs/clock_types.h             |    1
-rw-r--r--  fs/bcachefs/compress.c                |   40
-rw-r--r--  fs/bcachefs/dirent.c                  |   22
-rw-r--r--  fs/bcachefs/dirent.h                  |    2
-rw-r--r--  fs/bcachefs/ec.c                      |   21
-rw-r--r--  fs/bcachefs/error.h                   |   20
-rw-r--r--  fs/bcachefs/extent_update.c           |  379
-rw-r--r--  fs/bcachefs/extent_update.h           |    7
-rw-r--r--  fs/bcachefs/extents.c                 |   93
-rw-r--r--  fs/bcachefs/extents.h                 |    1
-rw-r--r--  fs/bcachefs/fs-common.c               |   12
-rw-r--r--  fs/bcachefs/fs-common.h               |    1
-rw-r--r--  fs/bcachefs/fs-io.c                   |  331
-rw-r--r--  fs/bcachefs/fs-io.h                   |    4
-rw-r--r--  fs/bcachefs/fs.c                      |   31
-rw-r--r--  fs/bcachefs/fsck.c                    |  133
-rw-r--r--  fs/bcachefs/inode.c                   |    9
-rw-r--r--  fs/bcachefs/io.c                      |   50
-rw-r--r--  fs/bcachefs/journal_seq_blacklist.c   |    4
-rw-r--r--  fs/bcachefs/migrate.c                 |   61
-rw-r--r--  fs/bcachefs/move.c                    |    9
-rw-r--r--  fs/bcachefs/movinggc.c                |   52
-rw-r--r--  fs/bcachefs/opts.c                    |   20
-rw-r--r--  fs/bcachefs/opts.h                    |   17
-rw-r--r--  fs/bcachefs/quota.c                   |    2
-rw-r--r--  fs/bcachefs/rebalance.c               |    2
-rw-r--r--  fs/bcachefs/recovery.c                |   77
-rw-r--r--  fs/bcachefs/reflink.c                 |   12
-rw-r--r--  fs/bcachefs/replicas.c                |    6
-rw-r--r--  fs/bcachefs/replicas.h                |    3
-rw-r--r--  fs/bcachefs/str_hash.h                |    6
-rw-r--r--  fs/bcachefs/super-io.c                |    4
-rw-r--r--  fs/bcachefs/super.c                   |   49
-rw-r--r--  fs/bcachefs/super.h                   |    1
-rw-r--r--  fs/bcachefs/sysfs.c                   |   16
-rw-r--r--  fs/bcachefs/tests.c                   |   10
-rw-r--r--  fs/bcachefs/util.h                    |   29
-rw-r--r--  fs/bcachefs/xattr.c                   |    2
67 files changed, 1875 insertions, 2079 deletions
diff --git a/fs/bcachefs/acl.c b/fs/bcachefs/acl.c
index dcd0dfe87b51..76c98ddbf628 100644
--- a/fs/bcachefs/acl.c
+++ b/fs/bcachefs/acl.c
@@ -323,7 +323,6 @@ retry:
ret = bch2_inode_write(&trans, inode_iter, &inode_u) ?:
bch2_trans_commit(&trans, NULL,
&inode->ei_journal_seq,
- BTREE_INSERT_ATOMIC|
BTREE_INSERT_NOUNLOCK);
btree_err:
if (ret == -EINTR)
@@ -378,7 +377,7 @@ int bch2_acl_chmod(struct btree_trans *trans,
}
new->k.p = iter->pos;
- bch2_trans_update(trans, iter, &new->k_i);
+ bch2_trans_update(trans, iter, &new->k_i, 0);
*new_acl = acl;
acl = NULL;
err:
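Note: the two API changes in this acl.c hunk recur throughout the merge: BTREE_INSERT_ATOMIC is gone, since transaction commits are now always atomic and restartable, and bch2_trans_update() takes a flags argument. A minimal sketch of the resulting idiom, with "iter" and "new_key" as hypothetical stand-ins:

        bch2_trans_update(&trans, iter, &new_key->k_i, 0);

        ret = bch2_trans_commit(&trans, NULL, &inode->ei_journal_seq,
                                BTREE_INSERT_NOUNLOCK);
        if (ret == -EINTR)
                goto retry;     /* transaction restart: retry from the top */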
diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c
index e252a039dc2b..c57df50168e0 100644
--- a/fs/bcachefs/alloc_background.c
+++ b/fs/bcachefs/alloc_background.c
@@ -222,8 +222,8 @@ int bch2_alloc_read(struct bch_fs *c, struct journal_keys *journal_keys)
for_each_btree_key(&trans, iter, BTREE_ID_ALLOC, POS_MIN, 0, k, ret)
bch2_mark_key(c, k, 0, 0, NULL, 0,
- BCH_BUCKET_MARK_ALLOC_READ|
- BCH_BUCKET_MARK_NOATOMIC);
+ BTREE_TRIGGER_ALLOC_READ|
+ BTREE_TRIGGER_NOATOMIC);
ret = bch2_trans_exit(&trans) ?: ret;
if (ret) {
@@ -235,8 +235,8 @@ int bch2_alloc_read(struct bch_fs *c, struct journal_keys *journal_keys)
if (j->btree_id == BTREE_ID_ALLOC)
bch2_mark_key(c, bkey_i_to_s_c(j->k),
0, 0, NULL, 0,
- BCH_BUCKET_MARK_ALLOC_READ|
- BCH_BUCKET_MARK_NOATOMIC);
+ BTREE_TRIGGER_ALLOC_READ|
+ BTREE_TRIGGER_NOATOMIC);
percpu_down_write(&c->mark_lock);
bch2_dev_usage_from_buckets(c);
@@ -314,12 +314,10 @@ retry:
a->k.p = iter->pos;
bch2_alloc_pack(a, new_u);
- bch2_trans_update(trans, iter, &a->k_i);
+ bch2_trans_update(trans, iter, &a->k_i,
+ BTREE_TRIGGER_NORUN);
ret = bch2_trans_commit(trans, NULL, NULL,
- BTREE_INSERT_ATOMIC|
- BTREE_INSERT_NOFAIL|
- BTREE_INSERT_NOMARK|
- flags);
+ BTREE_INSERT_NOFAIL|flags);
err:
if (ret == -EINTR)
goto retry;
@@ -384,8 +382,7 @@ int bch2_alloc_replay_key(struct bch_fs *c, struct bkey_i *k)
ret = bch2_alloc_write_key(&trans, iter,
BTREE_INSERT_NOFAIL|
BTREE_INSERT_LAZY_RW|
- BTREE_INSERT_JOURNAL_REPLAY|
- BTREE_INSERT_NOMARK);
+ BTREE_INSERT_JOURNAL_REPLAY);
bch2_trans_exit(&trans);
return ret < 0 ? ret : 0;
}
@@ -902,7 +899,8 @@ retry:
a->k.p = iter->pos;
bch2_alloc_pack(a, u);
- bch2_trans_update(trans, iter, &a->k_i);
+ bch2_trans_update(trans, iter, &a->k_i,
+ BTREE_TRIGGER_BUCKET_INVALIDATE);
/*
* XXX:
@@ -913,13 +911,11 @@ retry:
*/
ret = bch2_trans_commit(trans, NULL,
invalidating_cached_data ? journal_seq : NULL,
- BTREE_INSERT_ATOMIC|
BTREE_INSERT_NOUNLOCK|
BTREE_INSERT_NOCHECK_RW|
BTREE_INSERT_NOFAIL|
BTREE_INSERT_USE_RESERVE|
BTREE_INSERT_USE_ALLOC_RESERVE|
- BTREE_INSERT_BUCKET_INVALIDATE|
flags);
if (ret == -EINTR)
goto retry;
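Note: alloc_background.c shows the companion change: the BCH_BUCKET_MARK_* flags are renamed to BTREE_TRIGGER_*, and commit-wide flags such as BTREE_INSERT_NOMARK and BTREE_INSERT_BUCKET_INVALIDATE become per-update trigger flags, so trigger behaviour is now chosen per key rather than per transaction. Both lines below appear in the hunks above:

        /* write path: run no triggers for this update */
        bch2_trans_update(trans, iter, &a->k_i, BTREE_TRIGGER_NORUN);

        /* invalidate path: run the bucket-invalidate trigger */
        bch2_trans_update(trans, iter, &a->k_i, BTREE_TRIGGER_BUCKET_INVALIDATE);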
diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h
index 7175e4b9e070..6ea26ae5c3b8 100644
--- a/fs/bcachefs/bcachefs.h
+++ b/fs/bcachefs/bcachefs.h
@@ -714,12 +714,12 @@ struct bch_fs {
struct rhashtable promote_table;
mempool_t compression_bounce[2];
- mempool_t compress_workspace[BCH_COMPRESSION_NR];
+ mempool_t compress_workspace[BCH_COMPRESSION_TYPE_NR];
mempool_t decompress_workspace;
ZSTD_parameters zstd_params;
struct crypto_shash *sha256;
- struct crypto_skcipher *chacha20;
+ struct crypto_sync_skcipher *chacha20;
struct crypto_shash *poly1305;
atomic64_t key_version;
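Note: the chacha20 member becomes a crypto_sync_skcipher, the kernel crypto type for transforms guaranteed to complete synchronously, which makes an on-stack request legal. A hedged sketch of the usual calling pattern for that type (standard kernel idiom, not code from this patch; error handling omitted):

        #include <crypto/skcipher.h>

        struct crypto_sync_skcipher *tfm =
                crypto_alloc_sync_skcipher("chacha20", 0, 0);

        SYNC_SKCIPHER_REQUEST_ON_STACK(req, tfm);       /* no heap allocation */
        skcipher_request_set_sync_tfm(req, tfm);
        skcipher_request_set_callback(req, 0, NULL, NULL);
        skcipher_request_set_crypt(req, sg, sg, len, nonce);
        ret = crypto_skcipher_encrypt(req);             /* completes synchronously */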
diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h
index 3d85012a15fd..f6141fde830b 100644
--- a/fs/bcachefs/bcachefs_format.h
+++ b/fs/bcachefs/bcachefs_format.h
@@ -432,47 +432,6 @@ struct bch_csum {
__le64 hi;
} __attribute__((packed, aligned(8)));
-enum bch_csum_type {
- BCH_CSUM_NONE = 0,
- BCH_CSUM_CRC32C_NONZERO = 1,
- BCH_CSUM_CRC64_NONZERO = 2,
- BCH_CSUM_CHACHA20_POLY1305_80 = 3,
- BCH_CSUM_CHACHA20_POLY1305_128 = 4,
- BCH_CSUM_CRC32C = 5,
- BCH_CSUM_CRC64 = 6,
- BCH_CSUM_NR = 7,
-};
-
-static const unsigned bch_crc_bytes[] = {
- [BCH_CSUM_NONE] = 0,
- [BCH_CSUM_CRC32C_NONZERO] = 4,
- [BCH_CSUM_CRC32C] = 4,
- [BCH_CSUM_CRC64_NONZERO] = 8,
- [BCH_CSUM_CRC64] = 8,
- [BCH_CSUM_CHACHA20_POLY1305_80] = 10,
- [BCH_CSUM_CHACHA20_POLY1305_128] = 16,
-};
-
-static inline _Bool bch2_csum_type_is_encryption(enum bch_csum_type type)
-{
- switch (type) {
- case BCH_CSUM_CHACHA20_POLY1305_80:
- case BCH_CSUM_CHACHA20_POLY1305_128:
- return true;
- default:
- return false;
- }
-}
-
-enum bch_compression_type {
- BCH_COMPRESSION_NONE = 0,
- BCH_COMPRESSION_LZ4_OLD = 1,
- BCH_COMPRESSION_GZIP = 2,
- BCH_COMPRESSION_LZ4 = 3,
- BCH_COMPRESSION_ZSTD = 4,
- BCH_COMPRESSION_NR = 5,
-};
-
#define BCH_EXTENT_ENTRY_TYPES() \
x(ptr, 0) \
x(crc32, 1) \
@@ -1080,6 +1039,9 @@ struct bch_replicas_entry {
__u8 devs[0];
} __attribute__((packed));
+#define replicas_entry_bytes(_i) \
+ (offsetof(typeof(*(_i)), devs) + (_i)->nr_devs)
+
struct bch_sb_field_replicas {
struct bch_sb_field field;
struct bch_replicas_entry entries[0];
@@ -1313,17 +1275,31 @@ LE64_BITMASK(BCH_SB_GC_RESERVE_BYTES, struct bch_sb, flags[2], 4, 64);
LE64_BITMASK(BCH_SB_ERASURE_CODE, struct bch_sb, flags[3], 0, 16);
-/* Features: */
-enum bch_sb_features {
- BCH_FEATURE_LZ4 = 0,
- BCH_FEATURE_GZIP = 1,
- BCH_FEATURE_ZSTD = 2,
- BCH_FEATURE_ATOMIC_NLINK = 3, /* should have gone under compat */
- BCH_FEATURE_EC = 4,
- BCH_FEATURE_JOURNAL_SEQ_BLACKLIST_V3 = 5,
- BCH_FEATURE_REFLINK = 6,
- BCH_FEATURE_NEW_SIPHASH = 7,
- BCH_FEATURE_INLINE_DATA = 8,
+/*
+ * Features:
+ *
+ * journal_seq_blacklist_v3: gates BCH_SB_FIELD_journal_seq_blacklist
+ * reflink: gates KEY_TYPE_reflink
+ * inline_data: gates KEY_TYPE_inline_data
+ * new_siphash: gates BCH_STR_HASH_SIPHASH
+ * new_extent_overwrite: gates BTREE_NODE_NEW_EXTENT_OVERWRITE
+ */
+#define BCH_SB_FEATURES() \
+ x(lz4, 0) \
+ x(gzip, 1) \
+ x(zstd, 2) \
+ x(atomic_nlink, 3) \
+ x(ec, 4) \
+ x(journal_seq_blacklist_v3, 5) \
+ x(reflink, 6) \
+ x(new_siphash, 7) \
+ x(inline_data, 8) \
+ x(new_extent_overwrite, 9)
+
+enum bch_sb_feature {
+#define x(f, n) BCH_FEATURE_##f,
+ BCH_SB_FEATURES()
+#undef x
BCH_FEATURE_NR,
};
@@ -1343,13 +1319,6 @@ enum bch_error_actions {
BCH_NR_ERROR_ACTIONS = 3,
};
-enum bch_csum_opts {
- BCH_CSUM_OPT_NONE = 0,
- BCH_CSUM_OPT_CRC32C = 1,
- BCH_CSUM_OPT_CRC64 = 2,
- BCH_CSUM_OPT_NR = 3,
-};
-
enum bch_str_hash_type {
BCH_STR_HASH_CRC32C = 0,
BCH_STR_HASH_CRC64 = 1,
@@ -1365,16 +1334,69 @@ enum bch_str_hash_opts {
BCH_STR_HASH_OPT_NR = 3,
};
+enum bch_csum_type {
+ BCH_CSUM_NONE = 0,
+ BCH_CSUM_CRC32C_NONZERO = 1,
+ BCH_CSUM_CRC64_NONZERO = 2,
+ BCH_CSUM_CHACHA20_POLY1305_80 = 3,
+ BCH_CSUM_CHACHA20_POLY1305_128 = 4,
+ BCH_CSUM_CRC32C = 5,
+ BCH_CSUM_CRC64 = 6,
+ BCH_CSUM_NR = 7,
+};
+
+static const unsigned bch_crc_bytes[] = {
+ [BCH_CSUM_NONE] = 0,
+ [BCH_CSUM_CRC32C_NONZERO] = 4,
+ [BCH_CSUM_CRC32C] = 4,
+ [BCH_CSUM_CRC64_NONZERO] = 8,
+ [BCH_CSUM_CRC64] = 8,
+ [BCH_CSUM_CHACHA20_POLY1305_80] = 10,
+ [BCH_CSUM_CHACHA20_POLY1305_128] = 16,
+};
+
+static inline _Bool bch2_csum_type_is_encryption(enum bch_csum_type type)
+{
+ switch (type) {
+ case BCH_CSUM_CHACHA20_POLY1305_80:
+ case BCH_CSUM_CHACHA20_POLY1305_128:
+ return true;
+ default:
+ return false;
+ }
+}
+
+enum bch_csum_opts {
+ BCH_CSUM_OPT_NONE = 0,
+ BCH_CSUM_OPT_CRC32C = 1,
+ BCH_CSUM_OPT_CRC64 = 2,
+ BCH_CSUM_OPT_NR = 3,
+};
+
#define BCH_COMPRESSION_TYPES() \
- x(NONE) \
- x(LZ4) \
- x(GZIP) \
- x(ZSTD)
+ x(none, 0) \
+ x(lz4_old, 1) \
+ x(gzip, 2) \
+ x(lz4, 3) \
+ x(zstd, 4)
-enum bch_compression_opts {
-#define x(t) BCH_COMPRESSION_OPT_##t,
+enum bch_compression_type {
+#define x(t, n) BCH_COMPRESSION_TYPE_##t,
BCH_COMPRESSION_TYPES()
#undef x
+ BCH_COMPRESSION_TYPE_NR
+};
+
+#define BCH_COMPRESSION_OPTS() \
+ x(none, 0) \
+ x(lz4, 1) \
+ x(gzip, 2) \
+ x(zstd, 3)
+
+enum bch_compression_opts {
+#define x(t, n) BCH_COMPRESSION_OPT_##t,
+ BCH_COMPRESSION_OPTS()
+#undef x
BCH_COMPRESSION_OPT_NR
};
@@ -1593,7 +1615,9 @@ struct btree_node {
LE64_BITMASK(BTREE_NODE_ID, struct btree_node, flags, 0, 4);
LE64_BITMASK(BTREE_NODE_LEVEL, struct btree_node, flags, 4, 8);
-/* 8-32 unused */
+LE64_BITMASK(BTREE_NODE_NEW_EXTENT_OVERWRITE,
+ struct btree_node, flags, 8, 9);
+/* 9-32 unused */
LE64_BITMASK(BTREE_NODE_SEQ, struct btree_node, flags, 32, 64);
struct btree_node_entry {
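Note: the feature bits and compression enums above are rebuilt as x-macro lists. Each x(name, value) entry can be expanded more than once, so the enum and anything generated from the same list cannot drift apart. The enum expansion is shown in the hunk; a second, hypothetical expansion of the same list into a name table would look like:

        static const char * const bch_feature_names[] = {
        #define x(f, n) [n] = #f,
                BCH_SB_FEATURES()
        #undef x
        };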
diff --git a/fs/bcachefs/bcachefs_ioctl.h b/fs/bcachefs/bcachefs_ioctl.h
index d668ede5491a..ba8c75706bf1 100644
--- a/fs/bcachefs/bcachefs_ioctl.h
+++ b/fs/bcachefs/bcachefs_ioctl.h
@@ -68,7 +68,8 @@ struct bch_ioctl_incremental {
#define BCH_IOCTL_DISK_OFFLINE _IOW(0xbc, 7, struct bch_ioctl_disk)
#define BCH_IOCTL_DISK_SET_STATE _IOW(0xbc, 8, struct bch_ioctl_disk_set_state)
#define BCH_IOCTL_DATA _IOW(0xbc, 10, struct bch_ioctl_data)
-#define BCH_IOCTL_USAGE _IOWR(0xbc, 11, struct bch_ioctl_usage)
+#define BCH_IOCTL_FS_USAGE _IOWR(0xbc, 11, struct bch_ioctl_fs_usage)
+#define BCH_IOCTL_DEV_USAGE _IOWR(0xbc, 11, struct bch_ioctl_dev_usage)
#define BCH_IOCTL_READ_SUPER _IOW(0xbc, 12, struct bch_ioctl_read_super)
#define BCH_IOCTL_DISK_GET_IDX _IOW(0xbc, 13, struct bch_ioctl_disk_get_idx)
#define BCH_IOCTL_DISK_RESIZE _IOW(0xbc, 14, struct bch_ioctl_disk_resize)
@@ -224,46 +225,59 @@ struct bch_ioctl_data_event {
};
} __attribute__((packed, aligned(8)));
-struct bch_ioctl_dev_usage {
- __u8 state;
- __u8 alive;
- __u8 pad[6];
- __u32 dev;
+struct bch_replicas_usage {
+ __u64 sectors;
+ struct bch_replicas_entry r;
+} __attribute__((packed));
- __u32 bucket_size;
- __u64 nr_buckets;
-
- __u64 buckets[BCH_DATA_NR];
- __u64 sectors[BCH_DATA_NR];
-};
+static inline struct bch_replicas_usage *
+replicas_usage_next(struct bch_replicas_usage *u)
+{
+ return (void *) u + replicas_entry_bytes(&u->r) + 8;
+}
+/*
+ * BCH_IOCTL_FS_USAGE: query filesystem disk space usage
+ *
+ * Returns disk space usage broken out by data type, number of replicas, and
+ * by component device
+ *
+ * @replica_entries_bytes - size, in bytes, allocated for replica usage entries
+ *
+ * On success, @replica_entries_bytes will be changed to indicate the number of
+ * bytes actually used.
+ *
+ * Returns -ERANGE if @replica_entries_bytes was too small
+ */
struct bch_ioctl_fs_usage {
__u64 capacity;
__u64 used;
__u64 online_reserved;
__u64 persistent_reserved[BCH_REPLICAS_MAX];
- __u64 sectors[BCH_DATA_NR][BCH_REPLICAS_MAX];
+
+ __u32 replica_entries_bytes;
+ __u32 pad;
+
+ struct bch_replicas_usage replicas[0];
};
/*
- * BCH_IOCTL_USAGE: query filesystem disk space usage
- *
- * Returns disk space usage broken out by data type, number of replicas, and
- * by component device
+ * BCH_IOCTL_DEV_USAGE: query device disk space usage
*
- * @nr_devices - number of devices userspace allocated space for in @devs
- *
- * On success, @fs and @devs will be filled out appropriately and devs[i].alive
- * will indicate if a device was present in that slot
- *
- * Returns -ERANGE if @nr_devices was too small
+ * Returns disk space usage broken out by data type - both by buckets and
+ * sectors.
*/
-struct bch_ioctl_usage {
- __u16 nr_devices;
- __u16 pad[3];
+struct bch_ioctl_dev_usage {
+ __u64 dev;
+ __u32 flags;
+ __u8 state;
+ __u8 pad[7];
+
+ __u32 bucket_size;
+ __u64 nr_buckets;
- struct bch_ioctl_fs_usage fs;
- struct bch_ioctl_dev_usage devs[0];
+ __u64 buckets[BCH_DATA_NR];
+ __u64 sectors[BCH_DATA_NR];
};
/*
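Note: the reworked BCH_IOCTL_FS_USAGE returns a variable-length array of bch_replicas_usage entries instead of the old fixed [data type][replicas] matrix. A hedged userspace sketch of the protocol documented in the comment above, with "fs_fd" a hypothetical open file descriptor:

        __u32 bytes = 4096;             /* initial guess */
        struct bch_ioctl_fs_usage *u = calloc(1, sizeof(*u) + bytes);

        u->replica_entries_bytes = bytes;
        if (ioctl(fs_fd, BCH_IOCTL_FS_USAGE, u) < 0 && errno == ERANGE) {
                /* buffer too small: grow and retry */
        }

        /* on success, replica_entries_bytes is the number of bytes used */
        struct bch_replicas_usage *r = u->replicas;
        void *end = (void *) u->replicas + u->replica_entries_bytes;

        while ((void *) r < end) {
                /* r->sectors of data, described by r->r (type, replicas, devs) */
                r = replicas_usage_next(r);
        }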
diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c
index ed448fad83c5..320e17d108d2 100644
--- a/fs/bcachefs/bkey_methods.c
+++ b/fs/bcachefs/bkey_methods.c
@@ -156,7 +156,7 @@ void bch2_bkey_debugcheck(struct bch_fs *c, struct btree *b, struct bkey_s_c k)
char buf[160];
bch2_bkey_val_to_text(&PBUF(buf), c, k);
- bch2_fs_bug(c, "invalid bkey %s: %s", buf, invalid);
+ bch2_fs_inconsistent(c, "invalid bkey %s: %s", buf, invalid);
return;
}
diff --git a/fs/bcachefs/bkey_sort.c b/fs/bcachefs/bkey_sort.c
index 2e205db5433d..7cbb57042af1 100644
--- a/fs/bcachefs/bkey_sort.c
+++ b/fs/bcachefs/bkey_sort.c
@@ -5,90 +5,15 @@
#include "bset.h"
#include "extents.h"
-/* too many iterators, need to clean this up */
-
-/* btree_node_iter_large: */
-
-#define btree_node_iter_cmp_heap(h, _l, _r) btree_node_iter_cmp(b, _l, _r)
+typedef int (*sort_cmp_fn)(struct btree *,
+ struct bkey_packed *,
+ struct bkey_packed *);
-static inline bool
-bch2_btree_node_iter_large_end(struct btree_node_iter_large *iter)
+static inline bool sort_iter_end(struct sort_iter *iter)
{
return !iter->used;
}
-static inline struct bkey_packed *
-bch2_btree_node_iter_large_peek_all(struct btree_node_iter_large *iter,
- struct btree *b)
-{
- return bch2_btree_node_iter_large_end(iter)
- ? NULL
- : __btree_node_offset_to_key(b, iter->data->k);
-}
-
-static void
-bch2_btree_node_iter_large_advance(struct btree_node_iter_large *iter,
- struct btree *b)
-{
- iter->data->k += __btree_node_offset_to_key(b, iter->data->k)->u64s;
-
- EBUG_ON(!iter->used);
- EBUG_ON(iter->data->k > iter->data->end);
-
- if (iter->data->k == iter->data->end)
- heap_del(iter, 0, btree_node_iter_cmp_heap, NULL);
- else
- heap_sift_down(iter, 0, btree_node_iter_cmp_heap, NULL);
-}
-
-static inline struct bkey_packed *
-bch2_btree_node_iter_large_next_all(struct btree_node_iter_large *iter,
- struct btree *b)
-{
- struct bkey_packed *ret = bch2_btree_node_iter_large_peek_all(iter, b);
-
- if (ret)
- bch2_btree_node_iter_large_advance(iter, b);
-
- return ret;
-}
-
-void bch2_btree_node_iter_large_push(struct btree_node_iter_large *iter,
- struct btree *b,
- const struct bkey_packed *k,
- const struct bkey_packed *end)
-{
- if (k != end) {
- struct btree_node_iter_set n =
- ((struct btree_node_iter_set) {
- __btree_node_key_to_offset(b, k),
- __btree_node_key_to_offset(b, end)
- });
-
- __heap_add(iter, n, btree_node_iter_cmp_heap, NULL);
- }
-}
-
-static void sort_key_next(struct btree_node_iter_large *iter,
- struct btree *b,
- struct btree_node_iter_set *i)
-{
- i->k += __btree_node_offset_to_key(b, i->k)->u64s;
-
- while (i->k != i->end &&
- !__btree_node_offset_to_key(b, i->k)->u64s)
- i->k++;
-
- if (i->k == i->end)
- *i = iter->data[--iter->used];
-}
-
-/* regular sort_iters */
-
-typedef int (*sort_cmp_fn)(struct btree *,
- struct bkey_packed *,
- struct bkey_packed *);
-
static inline void __sort_iter_sift(struct sort_iter *iter,
unsigned from,
sort_cmp_fn cmp)
@@ -118,19 +43,29 @@ static inline void sort_iter_sort(struct sort_iter *iter, sort_cmp_fn cmp)
static inline struct bkey_packed *sort_iter_peek(struct sort_iter *iter)
{
- return iter->used ? iter->data->k : NULL;
+ return !sort_iter_end(iter) ? iter->data->k : NULL;
}
-static inline void sort_iter_advance(struct sort_iter *iter, sort_cmp_fn cmp)
+static inline void __sort_iter_advance(struct sort_iter *iter,
+ unsigned idx, sort_cmp_fn cmp)
{
- iter->data->k = bkey_next_skip_noops(iter->data->k, iter->data->end);
+ struct sort_iter_set *i = iter->data + idx;
+
+ BUG_ON(idx >= iter->used);
+
+ i->k = bkey_next_skip_noops(i->k, i->end);
- BUG_ON(iter->data->k > iter->data->end);
+ BUG_ON(i->k > i->end);
- if (iter->data->k == iter->data->end)
- array_remove_item(iter->data, iter->used, 0);
+ if (i->k == i->end)
+ array_remove_item(iter->data, iter->used, idx);
else
- sort_iter_sift(iter, cmp);
+ __sort_iter_sift(iter, idx, cmp);
+}
+
+static inline void sort_iter_advance(struct sort_iter *iter, sort_cmp_fn cmp)
+{
+ __sort_iter_advance(iter, 0, cmp);
}
static inline struct bkey_packed *sort_iter_next(struct sort_iter *iter,
@@ -145,107 +80,56 @@ static inline struct bkey_packed *sort_iter_next(struct sort_iter *iter,
}
/*
- * Returns true if l > r - unless l == r, in which case returns true if l is
- * older than r.
- *
- * Necessary for btree_sort_fixup() - if there are multiple keys that compare
- * equal in different sets, we have to process them newest to oldest.
+ * If keys compare equal, compare by pointer order:
*/
-#define key_sort_cmp(h, l, r) \
-({ \
- bkey_cmp_packed(b, \
- __btree_node_offset_to_key(b, (l).k), \
- __btree_node_offset_to_key(b, (r).k)) \
- \
- ?: (l).k - (r).k; \
-})
-
-static inline bool should_drop_next_key(struct btree_node_iter_large *iter,
- struct btree *b)
+static inline int key_sort_fix_overlapping_cmp(struct btree *b,
+ struct bkey_packed *l,
+ struct bkey_packed *r)
{
- struct btree_node_iter_set *l = iter->data, *r = iter->data + 1;
- struct bkey_packed *k = __btree_node_offset_to_key(b, l->k);
-
- if (bkey_whiteout(k))
- return true;
-
- if (iter->used < 2)
- return false;
-
- if (iter->used > 2 &&
- key_sort_cmp(iter, r[0], r[1]) >= 0)
- r++;
+ return bkey_cmp_packed(b, l, r) ?:
+ cmp_int((unsigned long) l, (unsigned long) r);
+}
+static inline bool should_drop_next_key(struct sort_iter *iter)
+{
/*
* key_sort_cmp() ensures that when keys compare equal the older key
- * comes first; so if l->k compares equal to r->k then l->k is older and
- * should be dropped.
+ * comes first; so if l->k compares equal to r->k then l->k is older
+ * and should be dropped.
*/
- return !bkey_cmp_packed(b,
- __btree_node_offset_to_key(b, l->k),
- __btree_node_offset_to_key(b, r->k));
+ return iter->used >= 2 &&
+ !bkey_cmp_packed(iter->b,
+ iter->data[0].k,
+ iter->data[1].k);
}
-struct btree_nr_keys bch2_key_sort_fix_overlapping(struct bset *dst,
- struct btree *b,
- struct btree_node_iter_large *iter)
+struct btree_nr_keys
+bch2_key_sort_fix_overlapping(struct bch_fs *c, struct bset *dst,
+ struct sort_iter *iter)
{
struct bkey_packed *out = dst->start;
+ struct bkey_packed *k;
struct btree_nr_keys nr;
memset(&nr, 0, sizeof(nr));
- heap_resort(iter, key_sort_cmp, NULL);
-
- while (!bch2_btree_node_iter_large_end(iter)) {
- if (!should_drop_next_key(iter, b)) {
- struct bkey_packed *k =
- __btree_node_offset_to_key(b, iter->data->k);
+ sort_iter_sort(iter, key_sort_fix_overlapping_cmp);
+ while ((k = sort_iter_peek(iter))) {
+ if (!bkey_whiteout(k) &&
+ !should_drop_next_key(iter)) {
bkey_copy(out, k);
btree_keys_account_key_add(&nr, 0, out);
out = bkey_next(out);
}
- sort_key_next(iter, b, iter->data);
- heap_sift_down(iter, 0, key_sort_cmp, NULL);
+ sort_iter_advance(iter, key_sort_fix_overlapping_cmp);
}
dst->u64s = cpu_to_le16((u64 *) out - dst->_data);
return nr;
}
-/*
- * If keys compare equal, compare by pointer order:
- *
- * Necessary for sort_fix_overlapping() - if there are multiple keys that
- * compare equal in different sets, we have to process them newest to oldest.
- */
-#define extent_sort_cmp(h, l, r) \
-({ \
- struct bkey _ul = bkey_unpack_key(b, \
- __btree_node_offset_to_key(b, (l).k)); \
- struct bkey _ur = bkey_unpack_key(b, \
- __btree_node_offset_to_key(b, (r).k)); \
- \
- bkey_cmp(bkey_start_pos(&_ul), \
- bkey_start_pos(&_ur)) ?: (r).k - (l).k; \
-})
-
-static inline void extent_sort_sift(struct btree_node_iter_large *iter,
- struct btree *b, size_t i)
-{
- heap_sift_down(iter, i, extent_sort_cmp, NULL);
-}
-
-static inline void extent_sort_next(struct btree_node_iter_large *iter,
- struct btree *b,
- struct btree_node_iter_set *i)
-{
- sort_key_next(iter, b, i);
- heap_sift_down(iter, i - iter->data, extent_sort_cmp, NULL);
-}
-
static void extent_sort_advance_prev(struct bkey_format *f,
struct btree_nr_keys *nr,
struct bkey_packed *start,
@@ -286,104 +170,6 @@ static void extent_sort_append(struct bch_fs *c,
bkey_reassemble((void *) *prev, k.s_c);
}
-struct btree_nr_keys bch2_extent_sort_fix_overlapping(struct bch_fs *c,
- struct bset *dst,
- struct btree *b,
- struct btree_node_iter_large *iter)
-{
- struct bkey_format *f = &b->format;
- struct btree_node_iter_set *_l = iter->data, *_r;
- struct bkey_packed *prev = NULL, *lk, *rk;
- struct bkey l_unpacked, r_unpacked;
- struct bkey_s l, r;
- struct btree_nr_keys nr;
- struct bkey_on_stack split;
-
- memset(&nr, 0, sizeof(nr));
- bkey_on_stack_init(&split);
-
- heap_resort(iter, extent_sort_cmp, NULL);
-
- while (!bch2_btree_node_iter_large_end(iter)) {
- lk = __btree_node_offset_to_key(b, _l->k);
- l = __bkey_disassemble(b, lk, &l_unpacked);
-
- if (iter->used == 1) {
- extent_sort_append(c, f, &nr, dst->start, &prev, l);
- extent_sort_next(iter, b, _l);
- continue;
- }
-
- _r = iter->data + 1;
- if (iter->used > 2 &&
- extent_sort_cmp(iter, _r[0], _r[1]) >= 0)
- _r++;
-
- rk = __btree_node_offset_to_key(b, _r->k);
- r = __bkey_disassemble(b, rk, &r_unpacked);
-
- /* If current key and next key don't overlap, just append */
- if (bkey_cmp(l.k->p, bkey_start_pos(r.k)) <= 0) {
- extent_sort_append(c, f, &nr, dst->start, &prev, l);
- extent_sort_next(iter, b, _l);
- continue;
- }
-
- /* Skip 0 size keys */
- if (!r.k->size) {
- extent_sort_next(iter, b, _r);
- continue;
- }
-
- /*
- * overlap: keep the newer key and trim the older key so they
- * don't overlap. comparing pointers tells us which one is
- * newer, since the bsets are appended one after the other.
- */
-
- /* can't happen because of comparison func */
- BUG_ON(_l->k < _r->k &&
- !bkey_cmp(bkey_start_pos(l.k), bkey_start_pos(r.k)));
-
- if (_l->k > _r->k) {
- /* l wins, trim r */
- if (bkey_cmp(l.k->p, r.k->p) >= 0) {
- sort_key_next(iter, b, _r);
- } else {
- bch2_cut_front_s(l.k->p, r);
- extent_save(b, rk, r.k);
- }
-
- extent_sort_sift(iter, b, _r - iter->data);
- } else if (bkey_cmp(l.k->p, r.k->p) > 0) {
-
- /*
- * r wins, but it overlaps in the middle of l - split l:
- */
- bkey_on_stack_reassemble(&split, c, l.s_c);
- bch2_cut_back(bkey_start_pos(r.k), split.k);
-
- bch2_cut_front_s(r.k->p, l);
- extent_save(b, lk, l.k);
-
- extent_sort_sift(iter, b, 0);
-
- extent_sort_append(c, f, &nr, dst->start,
- &prev, bkey_i_to_s(split.k));
- } else {
- bch2_cut_back_s(bkey_start_pos(r.k), l);
- extent_save(b, lk, l.k);
- }
- }
-
- extent_sort_advance_prev(f, &nr, dst->start, &prev);
-
- dst->u64s = cpu_to_le16((u64 *) prev - dst->_data);
-
- bkey_on_stack_exit(&split, c);
- return nr;
-}
-
/* Sort + repack in a new format: */
struct btree_nr_keys
bch2_sort_repack(struct bset *dst, struct btree *src,
@@ -424,28 +210,38 @@ bch2_sort_repack_merge(struct bch_fs *c,
bool filter_whiteouts)
{
struct bkey_packed *prev = NULL, *k_packed;
- struct bkey_s k;
+ struct bkey_on_stack k;
struct btree_nr_keys nr;
- struct bkey unpacked;
memset(&nr, 0, sizeof(nr));
+ bkey_on_stack_init(&k);
while ((k_packed = bch2_btree_node_iter_next_all(iter, src))) {
if (filter_whiteouts && bkey_whiteout(k_packed))
continue;
- k = __bkey_disassemble(src, k_packed, &unpacked);
+ /*
+ * NOTE:
+ * bch2_bkey_normalize may modify the key we pass it (dropping
+ * stale pointers) and we don't have a write lock on the src
+ * node; we have to make a copy of the entire key before calling
+ * normalize
+ */
+ bkey_on_stack_realloc(&k, c, k_packed->u64s + BKEY_U64s);
+ bch2_bkey_unpack(src, k.k, k_packed);
if (filter_whiteouts &&
- bch2_bkey_normalize(c, k))
+ bch2_bkey_normalize(c, bkey_i_to_s(k.k)))
continue;
- extent_sort_append(c, out_f, &nr, vstruct_last(dst), &prev, k);
+ extent_sort_append(c, out_f, &nr, vstruct_last(dst),
+ &prev, bkey_i_to_s(k.k));
}
extent_sort_advance_prev(out_f, &nr, vstruct_last(dst), &prev);
dst->u64s = cpu_to_le16((u64 *) prev - dst->_data);
+ bkey_on_stack_exit(&k, c);
return nr;
}
@@ -454,7 +250,7 @@ static inline int sort_keys_cmp(struct btree *b,
struct bkey_packed *r)
{
return bkey_cmp_packed(b, l, r) ?:
- (int) bkey_whiteout(r) - (int) bkey_whiteout(l) ?:
+ (int) bkey_deleted(r) - (int) bkey_deleted(l) ?:
(int) l->needs_whiteout - (int) r->needs_whiteout;
}
@@ -468,23 +264,18 @@ unsigned bch2_sort_keys(struct bkey_packed *dst,
sort_iter_sort(iter, sort_keys_cmp);
while ((in = sort_iter_next(iter, sort_keys_cmp))) {
+ bool needs_whiteout = false;
+
if (bkey_whiteout(in) &&
(filter_whiteouts || !in->needs_whiteout))
continue;
- if (bkey_whiteout(in) &&
- (next = sort_iter_peek(iter)) &&
- !bkey_cmp_packed(iter->b, in, next)) {
+ while ((next = sort_iter_peek(iter)) &&
+ !bkey_cmp_packed(iter->b, in, next)) {
BUG_ON(in->needs_whiteout &&
next->needs_whiteout);
- /*
- * XXX racy, called with read lock from write path
- *
- * leads to spurious BUG_ON() in bkey_unpack_key() in
- * debug mode
- */
- next->needs_whiteout |= in->needs_whiteout;
- continue;
+ needs_whiteout |= in->needs_whiteout;
+ in = sort_iter_next(iter, sort_keys_cmp);
}
if (bkey_whiteout(in)) {
@@ -493,12 +284,129 @@ unsigned bch2_sort_keys(struct bkey_packed *dst,
} else {
bkey_copy(out, in);
}
+ out->needs_whiteout |= needs_whiteout;
out = bkey_next(out);
}
return (u64 *) out - (u64 *) dst;
}
+/* Compat code for btree_node_old_extent_overwrite: */
+
+/*
+ * If keys compare equal, compare by pointer order:
+ *
+ * Necessary for sort_fix_overlapping() - if there are multiple keys that
+ * compare equal in different sets, we have to process them newest to oldest.
+ */
+static inline int extent_sort_fix_overlapping_cmp(struct btree *b,
+ struct bkey_packed *l,
+ struct bkey_packed *r)
+{
+ struct bkey ul = bkey_unpack_key(b, l);
+ struct bkey ur = bkey_unpack_key(b, r);
+
+ return bkey_cmp(bkey_start_pos(&ul),
+ bkey_start_pos(&ur)) ?:
+ cmp_int((unsigned long) r, (unsigned long) l);
+}
+
+struct btree_nr_keys
+bch2_extent_sort_fix_overlapping(struct bch_fs *c, struct bset *dst,
+ struct sort_iter *iter)
+{
+ struct btree *b = iter->b;
+ struct bkey_format *f = &b->format;
+ struct sort_iter_set *_l = iter->data, *_r = iter->data + 1;
+ struct bkey_packed *prev = NULL;
+ struct bkey l_unpacked, r_unpacked;
+ struct bkey_s l, r;
+ struct btree_nr_keys nr;
+ struct bkey_on_stack split;
+
+ memset(&nr, 0, sizeof(nr));
+ bkey_on_stack_init(&split);
+
+ sort_iter_sort(iter, extent_sort_fix_overlapping_cmp);
+
+ while (!sort_iter_end(iter)) {
+ l = __bkey_disassemble(b, _l->k, &l_unpacked);
+
+ if (iter->used == 1) {
+ extent_sort_append(c, f, &nr, dst->start, &prev, l);
+ sort_iter_advance(iter,
+ extent_sort_fix_overlapping_cmp);
+ continue;
+ }
+
+ r = __bkey_disassemble(b, _r->k, &r_unpacked);
+
+ /* If current key and next key don't overlap, just append */
+ if (bkey_cmp(l.k->p, bkey_start_pos(r.k)) <= 0) {
+ extent_sort_append(c, f, &nr, dst->start, &prev, l);
+ sort_iter_advance(iter,
+ extent_sort_fix_overlapping_cmp);
+ continue;
+ }
+
+ /* Skip 0 size keys */
+ if (!r.k->size) {
+ __sort_iter_advance(iter, 1,
+ extent_sort_fix_overlapping_cmp);
+ continue;
+ }
+
+ /*
+ * overlap: keep the newer key and trim the older key so they
+ * don't overlap. comparing pointers tells us which one is
+ * newer, since the bsets are appended one after the other.
+ */
+
+ /* can't happen because of comparison func */
+ BUG_ON(_l->k < _r->k &&
+ !bkey_cmp(bkey_start_pos(l.k), bkey_start_pos(r.k)));
+
+ if (_l->k > _r->k) {
+ /* l wins, trim r */
+ if (bkey_cmp(l.k->p, r.k->p) >= 0) {
+ __sort_iter_advance(iter, 1,
+ extent_sort_fix_overlapping_cmp);
+ } else {
+ bch2_cut_front_s(l.k->p, r);
+ extent_save(b, _r->k, r.k);
+ __sort_iter_sift(iter, 1,
+ extent_sort_fix_overlapping_cmp);
+ }
+ } else if (bkey_cmp(l.k->p, r.k->p) > 0) {
+
+ /*
+ * r wins, but it overlaps in the middle of l - split l:
+ */
+ bkey_on_stack_reassemble(&split, c, l.s_c);
+ bch2_cut_back(bkey_start_pos(r.k), split.k);
+
+ bch2_cut_front_s(r.k->p, l);
+ extent_save(b, _l->k, l.k);
+
+ __sort_iter_sift(iter, 0,
+ extent_sort_fix_overlapping_cmp);
+
+ extent_sort_append(c, f, &nr, dst->start,
+ &prev, bkey_i_to_s(split.k));
+ } else {
+ bch2_cut_back_s(bkey_start_pos(r.k), l);
+ extent_save(b, _l->k, l.k);
+ }
+ }
+
+ extent_sort_advance_prev(f, &nr, dst->start, &prev);
+
+ dst->u64s = cpu_to_le16((u64 *) prev - dst->_data);
+
+ bkey_on_stack_exit(&split, c);
+ return nr;
+}
+
static inline int sort_extents_cmp(struct btree *b,
struct bkey_packed *l,
struct bkey_packed *r)
@@ -530,28 +438,6 @@ unsigned bch2_sort_extents(struct bkey_packed *dst,
return (u64 *) out - (u64 *) dst;
}
-static inline int sort_key_whiteouts_cmp(struct btree *b,
- struct bkey_packed *l,
- struct bkey_packed *r)
-{
- return bkey_cmp_packed(b, l, r);
-}
-
-unsigned bch2_sort_key_whiteouts(struct bkey_packed *dst,
- struct sort_iter *iter)
-{
- struct bkey_packed *in, *out = dst;
-
- sort_iter_sort(iter, sort_key_whiteouts_cmp);
-
- while ((in = sort_iter_next(iter, sort_key_whiteouts_cmp))) {
- bkey_copy(out, in);
- out = bkey_next(out);
- }
-
- return (u64 *) out - (u64 *) dst;
-}
-
static inline int sort_extent_whiteouts_cmp(struct btree *b,
struct bkey_packed *l,
struct bkey_packed *r)
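Note: the compat path bch2_extent_sort_fix_overlapping() (kept above for nodes without BTREE_NODE_NEW_EXTENT_OVERWRITE) resolves overlaps by trimming the older key, newness being pointer order since bsets are appended one after another. A worked example: with an older extent l = [0,100) and a newer r = [40,60), l.k->p (100) > r.k->p (60), so l is split: a copy of l cut back to [0,40) is emitted, l itself is cut front to [60,100), and the iterator is re-sifted to restore ordering by start position. Had r instead extended past the end of l, l would simply have been cut back to end where r starts.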
diff --git a/fs/bcachefs/bkey_sort.h b/fs/bcachefs/bkey_sort.h
index 397009181eae..458a051fdac5 100644
--- a/fs/bcachefs/bkey_sort.h
+++ b/fs/bcachefs/bkey_sort.h
@@ -2,20 +2,10 @@
#ifndef _BCACHEFS_BKEY_SORT_H
#define _BCACHEFS_BKEY_SORT_H
-struct btree_node_iter_large {
- u16 used;
-
- struct btree_node_iter_set data[MAX_BSETS];
-};
-
-void bch2_btree_node_iter_large_push(struct btree_node_iter_large *,
- struct btree *,
- const struct bkey_packed *,
- const struct bkey_packed *);
-
struct sort_iter {
- struct btree *b;
+ struct btree *b;
unsigned used;
+ unsigned size;
struct sort_iter_set {
struct bkey_packed *k, *end;
@@ -24,27 +14,27 @@ struct sort_iter {
static inline void sort_iter_init(struct sort_iter *iter, struct btree *b)
{
- memset(iter, 0, sizeof(*iter));
iter->b = b;
+ iter->used = 0;
+ iter->size = ARRAY_SIZE(iter->data);
}
static inline void sort_iter_add(struct sort_iter *iter,
struct bkey_packed *k,
struct bkey_packed *end)
{
- BUG_ON(iter->used >= ARRAY_SIZE(iter->data));
+ BUG_ON(iter->used >= iter->size);
if (k != end)
iter->data[iter->used++] = (struct sort_iter_set) { k, end };
}
struct btree_nr_keys
-bch2_key_sort_fix_overlapping(struct bset *, struct btree *,
- struct btree_node_iter_large *);
+bch2_key_sort_fix_overlapping(struct bch_fs *, struct bset *,
+ struct sort_iter *);
struct btree_nr_keys
bch2_extent_sort_fix_overlapping(struct bch_fs *, struct bset *,
- struct btree *,
- struct btree_node_iter_large *);
+ struct sort_iter *);
struct btree_nr_keys
bch2_sort_repack(struct bset *, struct btree *,
@@ -61,8 +51,6 @@ unsigned bch2_sort_keys(struct bkey_packed *,
unsigned bch2_sort_extents(struct bkey_packed *,
struct sort_iter *, bool);
-unsigned bch2_sort_key_whiteouts(struct bkey_packed *,
- struct sort_iter *);
unsigned bch2_sort_extent_whiteouts(struct bkey_packed *,
struct sort_iter *);
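Note: with btree_node_iter_large gone, every sort path goes through sort_iter. The read-path pattern, pieced together from the btree_io.c hunks below (the fill_iter mempool presumably allocates extra room past data[MAX_BSETS], since iter->size is overridden):

        struct sort_iter *iter = mempool_alloc(&c->fill_iter, GFP_NOIO);

        sort_iter_init(iter, b);
        iter->size = (btree_blocks(c) + 1) * 2;

        /* add one run per bset read from disk (split at whiteout_u64s) */
        sort_iter_add(iter, i->start, vstruct_last(i));

        b->nr = bch2_key_sort_fix_overlapping(c, &sorted->keys, iter);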
diff --git a/fs/bcachefs/bset.c b/fs/bcachefs/bset.c
index a0f0b0eadffb..cf8fa59fada1 100644
--- a/fs/bcachefs/bset.c
+++ b/fs/bcachefs/bset.c
@@ -253,10 +253,9 @@ void bch2_verify_insert_pos(struct btree *b, struct bkey_packed *where,
bch2_bkey_to_text(&PBUF(buf2), &k2);
panic("prev > insert:\n"
- "prev key %5u %s\n"
- "insert key %5u %s\n",
- __btree_node_key_to_offset(b, prev), buf1,
- __btree_node_key_to_offset(b, insert), buf2);
+ "prev key %s\n"
+ "insert key %s\n",
+ buf1, buf2);
}
#endif
#if 0
@@ -275,10 +274,9 @@ void bch2_verify_insert_pos(struct btree *b, struct bkey_packed *where,
bch2_bkey_to_text(&PBUF(buf2), &k2);
panic("insert > next:\n"
- "insert key %5u %s\n"
- "next key %5u %s\n",
- __btree_node_key_to_offset(b, insert), buf1,
- __btree_node_key_to_offset(b, next), buf2);
+ "insert key %s\n"
+ "next key %s\n",
+ buf1, buf2);
}
#endif
}
@@ -1399,21 +1397,21 @@ struct bkey_packed *bch2_bset_search_linear(struct btree *b,
{
if (lossy_packed_search)
while (m != btree_bkey_last(b, t) &&
- bkey_iter_cmp_p_or_unp(b, search, lossy_packed_search,
- m) > 0)
+ bkey_iter_cmp_p_or_unp(b, m,
+ lossy_packed_search, search) < 0)
m = bkey_next_skip_noops(m, btree_bkey_last(b, t));
if (!packed_search)
while (m != btree_bkey_last(b, t) &&
- bkey_iter_pos_cmp(b, search, m) > 0)
+ bkey_iter_pos_cmp(b, m, search) < 0)
m = bkey_next_skip_noops(m, btree_bkey_last(b, t));
if (btree_keys_expensive_checks(b)) {
struct bkey_packed *prev = bch2_bkey_prev_all(b, t, m);
BUG_ON(prev &&
- bkey_iter_cmp_p_or_unp(b, search, packed_search,
- prev) <= 0);
+ bkey_iter_cmp_p_or_unp(b, prev,
+ packed_search, search) >= 0);
}
return m;
diff --git a/fs/bcachefs/bset.h b/fs/bcachefs/bset.h
index 2653a74b3b14..7338ccbc8cbd 100644
--- a/fs/bcachefs/bset.h
+++ b/fs/bcachefs/bset.h
@@ -199,12 +199,6 @@ __bkey_unpack_key_format_checked(const struct btree *b,
if (btree_keys_expensive_checks(b)) {
struct bkey dst2 = __bch2_bkey_unpack_key(&b->format, src);
- /*
- * hack around a harmless race when compacting whiteouts
- * for a write:
- */
- dst2.needs_whiteout = dst->needs_whiteout;
-
BUG_ON(memcmp(dst, &dst2, sizeof(*dst)));
}
}
@@ -360,7 +354,7 @@ void bch2_bset_delete(struct btree *, struct bkey_packed *, unsigned);
static inline int bkey_cmp_p_or_unp(const struct btree *b,
const struct bkey_packed *l,
const struct bkey_packed *r_packed,
- struct bpos *r)
+ const struct bpos *r)
{
EBUG_ON(r_packed && !bkey_packed(r_packed));
@@ -449,7 +443,7 @@ static inline bool bch2_btree_node_iter_end(struct btree_node_iter *iter)
* XXX: only need to compare pointers for keys that are both within a
* btree_node_iterator - we need to break ties for prev() to work correctly
*/
-static inline int bkey_iter_cmp(struct btree *b,
+static inline int bkey_iter_cmp(const struct btree *b,
const struct bkey_packed *l,
const struct bkey_packed *r)
{
@@ -458,7 +452,7 @@ static inline int bkey_iter_cmp(struct btree *b,
?: cmp_int(l, r);
}
-static inline int btree_node_iter_cmp(struct btree *b,
+static inline int btree_node_iter_cmp(const struct btree *b,
struct btree_node_iter_set l,
struct btree_node_iter_set r)
{
@@ -467,22 +461,22 @@ static inline int btree_node_iter_cmp(struct btree *b,
__btree_node_offset_to_key(b, r.k));
}
-/* These assume l (the search key) is not a deleted key: */
-static inline int bkey_iter_pos_cmp(struct btree *b,
- struct bpos *l,
- const struct bkey_packed *r)
+/* These assume r (the search key) is not a deleted key: */
+static inline int bkey_iter_pos_cmp(const struct btree *b,
+ const struct bkey_packed *l,
+ const struct bpos *r)
{
- return -bkey_cmp_left_packed(b, r, l)
- ?: (int) bkey_deleted(r);
+ return bkey_cmp_left_packed(b, l, r)
+ ?: -((int) bkey_deleted(l));
}
-static inline int bkey_iter_cmp_p_or_unp(struct btree *b,
- struct bpos *l,
- const struct bkey_packed *l_packed,
- const struct bkey_packed *r)
+static inline int bkey_iter_cmp_p_or_unp(const struct btree *b,
+ const struct bkey_packed *l,
+ const struct bkey_packed *r_packed,
+ const struct bpos *r)
{
- return -bkey_cmp_p_or_unp(b, r, l_packed, l)
- ?: (int) bkey_deleted(r);
+ return bkey_cmp_p_or_unp(b, l, r_packed, r)
+ ?: -((int) bkey_deleted(l));
}
static inline struct bkey_packed *
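Note: the argument flip puts the tree key on the left and the search position on the right, matching bkey_iter_cmp() and attaching the deleted-key tie-break to the tree key. The linear-search loop from the bset.c hunk above now reads in the natural direction:

        while (m != btree_bkey_last(b, t) &&
               bkey_iter_pos_cmp(b, m, search) < 0)     /* m sorts before search */
                m = bkey_next_skip_noops(m, btree_bkey_last(b, t));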
diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c
index 5d3acba525c2..0c737f35f430 100644
--- a/fs/bcachefs/btree_cache.c
+++ b/fs/bcachefs/btree_cache.c
@@ -557,7 +557,6 @@ out:
b->sib_u64s[0] = 0;
b->sib_u64s[1] = 0;
b->whiteout_u64s = 0;
- b->uncompacted_whiteout_u64s = 0;
bch2_btree_keys_init(b, &c->expensive_debug_checks);
bch2_time_stats_update(&c->times[BCH_TIME_btree_node_mem_alloc],
diff --git a/fs/bcachefs/btree_cache.h b/fs/bcachefs/btree_cache.h
index c5873c58439c..83358d6a4df8 100644
--- a/fs/bcachefs/btree_cache.h
+++ b/fs/bcachefs/btree_cache.h
@@ -75,7 +75,7 @@ static inline unsigned btree_blocks(struct bch_fs *c)
return c->opts.btree_node_size >> c->block_bits;
}
-#define BTREE_SPLIT_THRESHOLD(c) (btree_blocks(c) * 3 / 4)
+#define BTREE_SPLIT_THRESHOLD(c) (btree_max_u64s(c) * 3 / 4)
#define BTREE_FOREGROUND_MERGE_THRESHOLD(c) (btree_max_u64s(c) * 1 / 3)
#define BTREE_FOREGROUND_MERGE_HYSTERESIS(c) \
diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c
index 8bbf60b07736..05879b66d6af 100644
--- a/fs/bcachefs/btree_gc.c
+++ b/fs/bcachefs/btree_gc.c
@@ -116,8 +116,8 @@ static int bch2_gc_mark_key(struct bch_fs *c, struct bkey_s_c k,
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
const struct bch_extent_ptr *ptr;
unsigned flags =
- BCH_BUCKET_MARK_GC|
- (initial ? BCH_BUCKET_MARK_NOATOMIC : 0);
+ BTREE_TRIGGER_GC|
+ (initial ? BTREE_TRIGGER_NOATOMIC : 0);
int ret = 0;
if (initial) {
@@ -294,8 +294,8 @@ static int mark_journal_key(struct bch_fs *c, enum btree_id id,
BTREE_ITER_SLOTS, k, ret) {
percpu_down_read(&c->mark_lock);
ret = bch2_mark_overwrite(&trans, iter, k, insert, NULL,
- BCH_BUCKET_MARK_GC|
- BCH_BUCKET_MARK_NOATOMIC);
+ BTREE_TRIGGER_GC|
+ BTREE_TRIGGER_NOATOMIC);
percpu_up_read(&c->mark_lock);
if (!ret)
@@ -407,7 +407,7 @@ static void bch2_mark_superblocks(struct bch_fs *c)
gc_pos_set(c, gc_phase(GC_PHASE_SB));
for_each_online_member(ca, c, i)
- bch2_mark_dev_superblock(c, ca, BCH_BUCKET_MARK_GC);
+ bch2_mark_dev_superblock(c, ca, BTREE_TRIGGER_GC);
mutex_unlock(&c->sb_lock);
}
@@ -424,7 +424,7 @@ static void bch2_mark_pending_btree_node_frees(struct bch_fs *c)
if (d->index_update_done)
bch2_mark_key(c, bkey_i_to_s_c(&d->key),
0, 0, NULL, 0,
- BCH_BUCKET_MARK_GC);
+ BTREE_TRIGGER_GC);
mutex_unlock(&c->btree_interior_update_lock);
}
@@ -445,7 +445,7 @@ static void bch2_mark_allocator_buckets(struct bch_fs *c)
fifo_for_each_entry(i, &ca->free_inc, iter)
bch2_mark_alloc_bucket(c, ca, i, true,
gc_pos_alloc(c, NULL),
- BCH_BUCKET_MARK_GC);
+ BTREE_TRIGGER_GC);
@@ -453,7 +453,7 @@ static void bch2_mark_allocator_buckets(struct bch_fs *c)
fifo_for_each_entry(i, &ca->free[j], iter)
bch2_mark_alloc_bucket(c, ca, i, true,
gc_pos_alloc(c, NULL),
- BCH_BUCKET_MARK_GC);
+ BTREE_TRIGGER_GC);
}
spin_unlock(&c->freelist_lock);
@@ -467,7 +467,7 @@ static void bch2_mark_allocator_buckets(struct bch_fs *c)
ca = bch_dev_bkey_exists(c, ob->ptr.dev);
bch2_mark_alloc_bucket(c, ca, PTR_BUCKET_NR(ca, &ob->ptr), true,
gc_pos_alloc(c, ob),
- BCH_BUCKET_MARK_GC);
+ BTREE_TRIGGER_GC);
}
spin_unlock(&ob->lock);
}
diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c
index fe6eb45ddfc2..5f1c3183fa85 100644
--- a/fs/bcachefs/btree_io.c
+++ b/fs/bcachefs/btree_io.c
@@ -23,7 +23,8 @@
static void verify_no_dups(struct btree *b,
struct bkey_packed *start,
- struct bkey_packed *end)
+ struct bkey_packed *end,
+ bool extents)
{
#ifdef CONFIG_BCACHEFS_DEBUG
struct bkey_packed *k, *p;
@@ -37,7 +38,7 @@ static void verify_no_dups(struct btree *b,
struct bkey l = bkey_unpack_key(b, p);
struct bkey r = bkey_unpack_key(b, k);
- BUG_ON(btree_node_is_extents(b)
+ BUG_ON(extents
? bkey_cmp(l.p, bkey_start_pos(&r)) > 0
: bkey_cmp(l.p, bkey_start_pos(&r)) >= 0);
//BUG_ON(bkey_cmp_packed(&b->format, p, k) >= 0);
@@ -80,27 +81,103 @@ static void *btree_bounce_alloc(struct bch_fs *c, unsigned order,
return mempool_alloc(&c->btree_bounce_pool, GFP_NOIO);
}
-static unsigned should_compact_bset(struct btree *b, struct bset_tree *t,
- bool compacting,
- enum compact_mode mode)
+static void sort_bkey_ptrs(const struct btree *bt,
+ struct bkey_packed **ptrs, unsigned nr)
{
- unsigned bset_u64s = le16_to_cpu(bset(b, t)->u64s);
- unsigned dead_u64s = bset_u64s - b->nr.bset_u64s[t - b->set];
+ unsigned n = nr, a = nr / 2, b, c, d;
- if (mode == COMPACT_LAZY) {
- if (should_compact_bset_lazy(b, t) ||
- (compacting && !bset_written(b, bset(b, t))))
- return dead_u64s;
- } else {
- if (bset_written(b, bset(b, t)))
- return dead_u64s;
+ if (!a)
+ return;
+
+ /* Heap sort: see lib/sort.c: */
+ while (1) {
+ if (a)
+ a--;
+ else if (--n)
+ swap(ptrs[0], ptrs[n]);
+ else
+ break;
+
+ for (b = a; c = 2 * b + 1, (d = c + 1) < n;)
+ b = bkey_cmp_packed(bt,
+ ptrs[c],
+ ptrs[d]) >= 0 ? c : d;
+ if (d == n)
+ b = c;
+
+ while (b != a &&
+ bkey_cmp_packed(bt,
+ ptrs[a],
+ ptrs[b]) >= 0)
+ b = (b - 1) / 2;
+ c = b;
+ while (b != a) {
+ b = (b - 1) / 2;
+ swap(ptrs[b], ptrs[c]);
+ }
}
+}
+
+static void bch2_sort_whiteouts(struct bch_fs *c, struct btree *b)
+{
+ struct bkey_packed *new_whiteouts, **ptrs, **ptrs_end, *k;
+ bool used_mempool = false;
+ unsigned order;
+
+ if (!b->whiteout_u64s)
+ return;
+
+ order = get_order(b->whiteout_u64s * sizeof(u64));
+
+ new_whiteouts = btree_bounce_alloc(c, order, &used_mempool);
+
+ ptrs = ptrs_end = ((void *) new_whiteouts + (PAGE_SIZE << order));
+
+ for (k = unwritten_whiteouts_start(c, b);
+ k != unwritten_whiteouts_end(c, b);
+ k = bkey_next(k))
+ *--ptrs = k;
- return 0;
+ sort_bkey_ptrs(b, ptrs, ptrs_end - ptrs);
+
+ k = new_whiteouts;
+
+ while (ptrs != ptrs_end) {
+ bkey_copy(k, *ptrs);
+ k = bkey_next(k);
+ ptrs++;
+ }
+
+ verify_no_dups(b, new_whiteouts,
+ (void *) ((u64 *) new_whiteouts + b->whiteout_u64s),
+ btree_node_old_extent_overwrite(b));
+
+ memcpy_u64s(unwritten_whiteouts_start(c, b),
+ new_whiteouts, b->whiteout_u64s);
+
+ btree_bounce_free(c, order, used_mempool, new_whiteouts);
}
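Note: sort_bkey_ptrs() above is a bottom-up heapsort in the style of lib/sort.c: it walks down the larger-child path to a leaf, then backs up to where the displaced element belongs, saving comparisons over the textbook form. For reference, the classic sift-down version it optimizes, sketched on plain ints (illustration only, not kernel code):

        static void sift_down(int *v, unsigned root, unsigned end)
        {
                unsigned child;

                while ((child = 2 * root + 1) < end) {
                        if (child + 1 < end && v[child] < v[child + 1])
                                child++;                /* pick the larger child */
                        if (v[root] >= v[child])
                                return;                 /* heap property holds */
                        swap(v[root], v[child]);
                        root = child;
                }
        }

        static void heap_sort(int *v, unsigned nr)
        {
                unsigned i;

                for (i = nr / 2; i-- > 0;)              /* heapify */
                        sift_down(v, i, nr);
                for (i = nr; i-- > 1;) {                /* repeatedly extract max */
                        swap(v[0], v[i]);
                        sift_down(v, 0, i);
                }
        }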
-bool __bch2_compact_whiteouts(struct bch_fs *c, struct btree *b,
- enum compact_mode mode)
+static bool should_compact_bset(struct btree *b, struct bset_tree *t,
+ bool compacting, enum compact_mode mode)
+{
+ if (!bset_dead_u64s(b, t))
+ return false;
+
+ switch (mode) {
+ case COMPACT_LAZY:
+ return should_compact_bset_lazy(b, t) ||
+ (compacting && !bset_written(b, bset(b, t)));
+ case COMPACT_ALL:
+ return true;
+ default:
+ BUG();
+ }
+}
+
+static bool bch2_compact_extent_whiteouts(struct bch_fs *c,
+ struct btree *b,
+ enum compact_mode mode)
{
const struct bkey_format *f = &b->format;
struct bset_tree *t;
@@ -110,13 +187,17 @@ bool __bch2_compact_whiteouts(struct bch_fs *c, struct btree *b,
unsigned order, whiteout_u64s = 0, u64s;
bool used_mempool, compacting = false;
+ BUG_ON(!btree_node_is_extents(b));
+
for_each_bset(b, t)
- whiteout_u64s += should_compact_bset(b, t,
- whiteout_u64s != 0, mode);
+ if (should_compact_bset(b, t, whiteout_u64s != 0, mode))
+ whiteout_u64s += bset_dead_u64s(b, t);
if (!whiteout_u64s)
return false;
+ bch2_sort_whiteouts(c, b);
+
sort_iter_init(&sort_iter, b);
whiteout_u64s += b->whiteout_u64s;
@@ -139,9 +220,12 @@ bool __bch2_compact_whiteouts(struct bch_fs *c, struct btree *b,
if (t != b->set && !bset_written(b, i)) {
src = container_of(i, struct btree_node_entry, keys);
dst = max(write_block(b),
- (void *) btree_bkey_last(b, t -1));
+ (void *) btree_bkey_last(b, t - 1));
}
+ if (src != dst)
+ compacting = true;
+
if (!should_compact_bset(b, t, compacting, mode)) {
if (src != dst) {
memmove(dst, src, sizeof(*src) +
@@ -169,18 +253,21 @@ bool __bch2_compact_whiteouts(struct bch_fs *c, struct btree *b,
for (k = start; k != end; k = n) {
n = bkey_next_skip_noops(k, end);
- if (bkey_deleted(k) && btree_node_is_extents(b))
+ if (bkey_deleted(k))
continue;
+ BUG_ON(bkey_whiteout(k) &&
+ k->needs_whiteout &&
+ bkey_written(b, k));
+
if (bkey_whiteout(k) && !k->needs_whiteout)
continue;
if (bkey_whiteout(k)) {
- unreserve_whiteout(b, k);
memcpy_u64s(u_pos, k, bkeyp_key_u64s(f, k));
set_bkeyp_val_u64s(f, u_pos, 0);
u_pos = bkey_next(u_pos);
- } else if (mode != COMPACT_WRITTEN_NO_WRITE_LOCK) {
+ } else {
bkey_copy(out, k);
out = bkey_next(out);
}
@@ -188,11 +275,9 @@ bool __bch2_compact_whiteouts(struct bch_fs *c, struct btree *b,
sort_iter_add(&sort_iter, u_start, u_pos);
- if (mode != COMPACT_WRITTEN_NO_WRITE_LOCK) {
- i->u64s = cpu_to_le16((u64 *) out - i->_data);
- set_btree_bset_end(b, t);
- bch2_bset_set_no_aux_tree(b, t);
- }
+ i->u64s = cpu_to_le16((u64 *) out - i->_data);
+ set_btree_bset_end(b, t);
+ bch2_bset_set_no_aux_tree(b, t);
}
b->whiteout_u64s = (u64 *) u_pos - (u64 *) whiteouts;
@@ -200,13 +285,10 @@ bool __bch2_compact_whiteouts(struct bch_fs *c, struct btree *b,
BUG_ON((void *) unwritten_whiteouts_start(c, b) <
(void *) btree_bkey_last(b, bset_tree_last(b)));
- u64s = (btree_node_is_extents(b)
- ? bch2_sort_extent_whiteouts
- : bch2_sort_key_whiteouts)(unwritten_whiteouts_start(c, b),
- &sort_iter);
+ u64s = bch2_sort_extent_whiteouts(unwritten_whiteouts_start(c, b),
+ &sort_iter);
BUG_ON(u64s > b->whiteout_u64s);
- BUG_ON(u64s != b->whiteout_u64s && !btree_node_is_extents(b));
BUG_ON(u_pos != whiteouts && !u64s);
if (u64s != b->whiteout_u64s) {
@@ -218,12 +300,12 @@ bool __bch2_compact_whiteouts(struct bch_fs *c, struct btree *b,
verify_no_dups(b,
unwritten_whiteouts_start(c, b),
- unwritten_whiteouts_end(c, b));
+ unwritten_whiteouts_end(c, b),
+ true);
btree_bounce_free(c, order, used_mempool, whiteouts);
- if (mode != COMPACT_WRITTEN_NO_WRITE_LOCK)
- bch2_btree_build_aux_trees(b);
+ bch2_btree_build_aux_trees(b);
bch_btree_keys_u64s_remaining(c, b);
bch2_verify_btree_nr_keys(b);
@@ -231,7 +313,7 @@ bool __bch2_compact_whiteouts(struct bch_fs *c, struct btree *b,
return true;
}
-static bool bch2_drop_whiteouts(struct btree *b)
+static bool bch2_drop_whiteouts(struct btree *b, enum compact_mode mode)
{
struct bset_tree *t;
bool ret = false;
@@ -239,21 +321,34 @@ static bool bch2_drop_whiteouts(struct btree *b)
for_each_bset(b, t) {
struct bset *i = bset(b, t);
struct bkey_packed *k, *n, *out, *start, *end;
+ struct btree_node_entry *src = NULL, *dst = NULL;
+
+ if (t != b->set && !bset_written(b, i)) {
+ src = container_of(i, struct btree_node_entry, keys);
+ dst = max(write_block(b),
+ (void *) btree_bkey_last(b, t - 1));
+ }
- if (!should_compact_bset(b, t, true, COMPACT_WRITTEN))
+ if (src != dst)
+ ret = true;
+
+ if (!should_compact_bset(b, t, ret, mode)) {
+ if (src != dst) {
+ memmove(dst, src, sizeof(*src) +
+ le16_to_cpu(src->keys.u64s) *
+ sizeof(u64));
+ i = &dst->keys;
+ set_btree_bset(b, t, i);
+ }
continue;
+ }
start = btree_bkey_first(b, t);
end = btree_bkey_last(b, t);
- if (!bset_written(b, i) &&
- t != b->set) {
- struct bset *dst =
- max_t(struct bset *, write_block(b),
- (void *) btree_bkey_last(b, t -1));
-
- memmove(dst, i, sizeof(struct bset));
- i = dst;
+ if (src != dst) {
+ memmove(dst, src, sizeof(*src));
+ i = &dst->keys;
set_btree_bset(b, t, i);
}
@@ -265,19 +360,32 @@ static bool bch2_drop_whiteouts(struct btree *b)
if (!bkey_whiteout(k)) {
bkey_copy(out, k);
out = bkey_next(out);
+ } else {
+ BUG_ON(k->needs_whiteout);
}
}
i->u64s = cpu_to_le16((u64 *) out - i->_data);
+ set_btree_bset_end(b, t);
bch2_bset_set_no_aux_tree(b, t);
ret = true;
}
bch2_verify_btree_nr_keys(b);
+ bch2_btree_build_aux_trees(b);
+
return ret;
}
+bool bch2_compact_whiteouts(struct bch_fs *c, struct btree *b,
+ enum compact_mode mode)
+{
+ return !btree_node_old_extent_overwrite(b)
+ ? bch2_drop_whiteouts(b, mode)
+ : bch2_compact_extent_whiteouts(c, b, mode);
+}
+
static void btree_node_sort(struct bch_fs *c, struct btree *b,
struct btree_iter *iter,
unsigned start_idx,
@@ -313,10 +421,10 @@ static void btree_node_sort(struct bch_fs *c, struct btree *b,
start_time = local_clock();
- if (btree_node_is_extents(b))
+ if (btree_node_old_extent_overwrite(b))
filter_whiteouts = bset_written(b, start_bset);
- u64s = (btree_node_is_extents(b)
+ u64s = (btree_node_old_extent_overwrite(b)
? bch2_sort_extents
: bch2_sort_keys)(out->keys.start,
&sort_iter,
@@ -509,7 +617,7 @@ static void bset_encrypt(struct bch_fs *c, struct bset *i, unsigned offset)
bch2_encrypt(c, BSET_CSUM_TYPE(i), nonce, &bn->flags,
bytes);
- nonce = nonce_add(nonce, round_up(bytes, CHACHA20_BLOCK_SIZE));
+ nonce = nonce_add(nonce, round_up(bytes, CHACHA_BLOCK_SIZE));
}
bch2_encrypt(c, BSET_CSUM_TYPE(i), nonce, i->_data,
@@ -602,7 +710,8 @@ static int validate_bset(struct bch_fs *c, struct btree *b,
bool have_retry)
{
struct bkey_packed *k, *prev = NULL;
- struct bpos prev_pos = POS_MIN;
+ struct bpos prev_pos = POS_MIN;
+ struct bpos prev_data = POS_MIN;
bool seen_non_whiteout = false;
unsigned version;
const char *err;
@@ -735,7 +844,8 @@ static int validate_bset(struct bch_fs *c, struct btree *b,
(bkey_cmp(prev_pos, bkey_start_pos(u.k)) > 0))) {
*whiteout_u64s = k->_data - i->_data;
seen_non_whiteout = true;
- } else if (bkey_cmp(prev_pos, bkey_start_pos(u.k)) > 0) {
+ } else if (bkey_cmp(prev_data, bkey_start_pos(u.k)) > 0 ||
+ bkey_cmp(prev_pos, u.k->p) > 0) {
btree_err(BTREE_ERR_FATAL, c, b, i,
"keys out of order: %llu:%llu > %llu:%llu",
prev_pos.inode,
@@ -745,7 +855,10 @@ static int validate_bset(struct bch_fs *c, struct btree *b,
/* XXX: repair this */
}
+ if (!bkey_deleted(u.k))
+ prev_data = u.k->p;
prev_pos = u.k->p;
+
prev = k;
k = bkey_next_skip_noops(k, vstruct_last(i));
}
@@ -758,7 +871,7 @@ fsck_err:
int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry)
{
struct btree_node_entry *bne;
- struct btree_node_iter_large *iter;
+ struct sort_iter *iter;
struct btree_node *sorted;
struct bkey_packed *k;
struct bset *i;
@@ -767,7 +880,8 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry
int ret, retry_read = 0, write = READ;
iter = mempool_alloc(&c->fill_iter, GFP_NOIO);
- iter->used = 0;
+ sort_iter_init(iter, b);
+ iter->size = (btree_blocks(c) + 1) * 2;
if (bch2_meta_read_fault("btree"))
btree_err(BTREE_ERR_MUST_RETRY, c, b, NULL,
@@ -803,6 +917,10 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry
bset_encrypt(c, i, b->written << 9);
+ if (btree_node_is_extents(b) &&
+ !BTREE_NODE_NEW_EXTENT_OVERWRITE(b->data))
+ set_btree_node_old_extent_overwrite(b);
+
sectors = vstruct_sectors(b->data, c->block_bits);
btree_node_set_format(b, b->data->format);
@@ -846,13 +964,12 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry
if (blacklisted && !first)
continue;
- bch2_btree_node_iter_large_push(iter, b,
- i->start,
- vstruct_idx(i, whiteout_u64s));
+ sort_iter_add(iter, i->start,
+ vstruct_idx(i, whiteout_u64s));
- bch2_btree_node_iter_large_push(iter, b,
- vstruct_idx(i, whiteout_u64s),
- vstruct_last(i));
+ sort_iter_add(iter,
+ vstruct_idx(i, whiteout_u64s),
+ vstruct_last(i));
}
for (bne = write_block(b);
@@ -867,9 +984,9 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry
set_btree_bset(b, b->set, &b->data->keys);
- b->nr = btree_node_is_extents(b)
- ? bch2_extent_sort_fix_overlapping(c, &sorted->keys, b, iter)
- : bch2_key_sort_fix_overlapping(&sorted->keys, b, iter);
+ b->nr = (btree_node_old_extent_overwrite(b)
+ ? bch2_extent_sort_fix_overlapping
+ : bch2_key_sort_fix_overlapping)(c, &sorted->keys, iter);
u64s = le16_to_cpu(sorted->keys.u64s);
*sorted = *b->data;
@@ -1343,21 +1460,7 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b,
BUG_ON(le64_to_cpu(b->data->magic) != bset_magic(c));
BUG_ON(memcmp(&b->data->format, &b->format, sizeof(b->format)));
- /*
- * We can't block on six_lock_write() here; another thread might be
- * trying to get a journal reservation with read locks held, and getting
- * a journal reservation might be blocked on flushing the journal and
- * doing btree writes:
- */
- if (lock_type_held == SIX_LOCK_intent &&
- six_trylock_write(&b->lock)) {
- __bch2_compact_whiteouts(c, b, COMPACT_WRITTEN);
- six_unlock_write(&b->lock);
- } else {
- __bch2_compact_whiteouts(c, b, COMPACT_WRITTEN_NO_WRITE_LOCK);
- }
-
- BUG_ON(b->uncompacted_whiteout_u64s);
+ bch2_sort_whiteouts(c, b);
sort_iter_init(&sort_iter, b);
@@ -1396,7 +1499,7 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b,
i->journal_seq = cpu_to_le64(seq);
i->u64s = 0;
- if (!btree_node_is_extents(b)) {
+ if (!btree_node_old_extent_overwrite(b)) {
sort_iter_add(&sort_iter,
unwritten_whiteouts_start(c, b),
unwritten_whiteouts_end(c, b));
@@ -1411,7 +1514,7 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b,
b->whiteout_u64s = 0;
- u64s = btree_node_is_extents(b)
+ u64s = btree_node_old_extent_overwrite(b)
? bch2_sort_extents(vstruct_last(i), &sort_iter, false)
: bch2_sort_keys(i->start, &sort_iter, false);
le16_add_cpu(&i->u64s, u64s);
@@ -1545,7 +1648,6 @@ bool bch2_btree_post_write_cleanup(struct bch_fs *c, struct btree *b)
return false;
BUG_ON(b->whiteout_u64s);
- BUG_ON(b->uncompacted_whiteout_u64s);
clear_btree_node_just_written(b);
@@ -1566,7 +1668,7 @@ bool bch2_btree_post_write_cleanup(struct bch_fs *c, struct btree *b)
btree_node_sort(c, b, NULL, 0, b->nsets, true);
invalidated_iter = true;
} else {
- invalidated_iter = bch2_drop_whiteouts(b);
+ invalidated_iter = bch2_drop_whiteouts(b, COMPACT_ALL);
}
for_each_bset(b, t)
diff --git a/fs/bcachefs/btree_io.h b/fs/bcachefs/btree_io.h
index 955a80cafae3..e90e89eee273 100644
--- a/fs/bcachefs/btree_io.h
+++ b/fs/bcachefs/btree_io.h
@@ -54,16 +54,17 @@ static inline bool btree_node_may_write(struct btree *b)
enum compact_mode {
COMPACT_LAZY,
- COMPACT_WRITTEN,
- COMPACT_WRITTEN_NO_WRITE_LOCK,
+ COMPACT_ALL,
};
-bool __bch2_compact_whiteouts(struct bch_fs *, struct btree *, enum compact_mode);
+bool bch2_compact_whiteouts(struct bch_fs *, struct btree *,
+ enum compact_mode);
-static inline unsigned should_compact_bset_lazy(struct btree *b, struct bset_tree *t)
+static inline bool should_compact_bset_lazy(struct btree *b,
+ struct bset_tree *t)
{
unsigned total_u64s = bset_u64s(t);
- unsigned dead_u64s = total_u64s - b->nr.bset_u64s[t - b->set];
+ unsigned dead_u64s = bset_dead_u64s(b, t);
return dead_u64s > 64 && dead_u64s * 3 > total_u64s;
}
@@ -74,7 +75,7 @@ static inline bool bch2_maybe_compact_whiteouts(struct bch_fs *c, struct btree *
for_each_bset(b, t)
if (should_compact_bset_lazy(b, t))
- return __bch2_compact_whiteouts(c, b, COMPACT_LAZY);
+ return bch2_compact_whiteouts(c, b, COMPACT_LAZY);
return false;
}
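Note: the rewritten should_compact_bset_lazy() compacts only when dead keys are both numerous in absolute terms and more than a third of the bset. Worked examples: total_u64s = 300 with dead_u64s = 120 triggers (120 > 64 and 120 * 3 = 360 > 300); dead_u64s = 60 does not (60 <= 64); and dead_u64s = 100 of total_u64s = 400 does not either (100 * 3 = 300 <= 400), so large bsets are not rewritten for modest savings.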
diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c
index a4180124d7d1..ea0555b806f0 100644
--- a/fs/bcachefs/btree_iter.c
+++ b/fs/bcachefs/btree_iter.c
@@ -11,10 +11,6 @@
#include <linux/prefetch.h>
#include <trace/events/bcachefs.h>
-static inline struct bkey_s_c __btree_iter_peek_all(struct btree_iter *,
- struct btree_iter_level *,
- struct bkey *);
-
#define BTREE_ITER_NO_NODE_GET_LOCKS ((struct btree *) 1)
#define BTREE_ITER_NO_NODE_DROP ((struct btree *) 2)
#define BTREE_ITER_NO_NODE_LOCK_ROOT ((struct btree *) 3)
@@ -29,37 +25,14 @@ static inline bool is_btree_node(struct btree_iter *iter, unsigned l)
(unsigned long) iter->l[l].b >= 128;
}
-/* Returns < 0 if @k is before iter pos, > 0 if @k is after */
-static inline int __btree_iter_pos_cmp(struct btree_iter *iter,
- const struct btree *b,
- const struct bkey_packed *k,
- bool interior_node)
+static inline struct bpos btree_iter_search_key(struct btree_iter *iter)
{
- int cmp = bkey_cmp_left_packed(b, k, &iter->pos);
-
- if (cmp)
- return cmp;
- if (bkey_deleted(k))
- return -1;
+ struct bpos pos = iter->pos;
- /*
- * Normally, for extents we want the first key strictly greater than
- * the iterator position - with the exception that for interior nodes,
- * we don't want to advance past the last key if the iterator position
- * is POS_MAX:
- */
- if (iter->flags & BTREE_ITER_IS_EXTENTS &&
- (!interior_node ||
- bkey_cmp_left_packed_byval(b, k, POS_MAX)))
- return -1;
- return 1;
-}
-
-static inline int btree_iter_pos_cmp(struct btree_iter *iter,
- const struct btree *b,
- const struct bkey_packed *k)
-{
- return __btree_iter_pos_cmp(iter, b, k, b->level != 0);
+ if ((iter->flags & BTREE_ITER_IS_EXTENTS) &&
+ bkey_cmp(pos, POS_MAX))
+ pos = bkey_successor(pos);
+ return pos;
}
/* Btree node locking: */
@@ -415,6 +388,7 @@ void bch2_trans_unlock(struct btree_trans *trans)
static void __bch2_btree_iter_verify(struct btree_iter *iter,
struct btree *b)
{
+ struct bpos pos = btree_iter_search_key(iter);
struct btree_iter_level *l = &iter->l[b->level];
struct btree_node_iter tmp = l->iter;
struct bkey_packed *k;
@@ -437,17 +411,17 @@ static void __bch2_btree_iter_verify(struct btree_iter *iter,
k = b->level || iter->flags & BTREE_ITER_IS_EXTENTS
? bch2_btree_node_iter_prev_filter(&tmp, b, KEY_TYPE_discard)
: bch2_btree_node_iter_prev_all(&tmp, b);
- if (k && btree_iter_pos_cmp(iter, b, k) > 0) {
+ if (k && bkey_iter_pos_cmp(b, k, &pos) >= 0) {
char buf[100];
struct bkey uk = bkey_unpack_key(b, k);
bch2_bkey_to_text(&PBUF(buf), &uk);
- panic("prev key should be before iter pos:\n%s\n%llu:%llu\n",
+ panic("iterator should be before prev key:\n%s\n%llu:%llu\n",
buf, iter->pos.inode, iter->pos.offset);
}
k = bch2_btree_node_iter_peek_all(&l->iter, b);
- if (k && btree_iter_pos_cmp(iter, b, k) < 0) {
+ if (k && bkey_iter_pos_cmp(b, k, &pos) < 0) {
char buf[100];
struct bkey uk = bkey_unpack_key(b, k);
@@ -457,11 +431,6 @@ static void __bch2_btree_iter_verify(struct btree_iter *iter,
"cur key %s\n",
iter->pos.inode, iter->pos.offset, buf);
}
-
- BUG_ON(iter->uptodate == BTREE_ITER_UPTODATE &&
- btree_iter_type(iter) == BTREE_ITER_KEYS &&
- !bkey_whiteout(&iter->k) &&
- bch2_btree_node_iter_end(&l->iter));
}
void bch2_btree_iter_verify(struct btree_iter *iter, struct btree *b)
@@ -500,15 +469,19 @@ static void btree_node_iter_set_set_pos(struct btree_node_iter *iter,
}
static void __bch2_btree_iter_fix_key_modified(struct btree_iter *iter,
- struct btree *b,
- struct bkey_packed *where)
+ struct btree *b,
+ struct bkey_packed *where)
{
- struct btree_node_iter *node_iter = &iter->l[0].iter;
+ struct btree_iter_level *l = &iter->l[b->level];
+ struct bpos pos = btree_iter_search_key(iter);
- if (where == bch2_btree_node_iter_peek_all(node_iter, b)) {
- bkey_disassemble(b, where, &iter->k);
- btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK);
- }
+ if (where != bch2_btree_node_iter_peek_all(&l->iter, l->b))
+ return;
+
+ if (bkey_iter_pos_cmp(l->b, where, &pos) < 0)
+ bch2_btree_node_iter_advance(&l->iter, l->b);
+
+ btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK);
}
void bch2_btree_iter_fix_key_modified(struct btree_iter *iter,
@@ -540,6 +513,7 @@ static void __bch2_btree_node_iter_fix(struct btree_iter *iter,
bool iter_current_key_modified =
orig_iter_pos >= offset &&
orig_iter_pos <= offset + clobber_u64s;
+ struct bpos iter_pos = btree_iter_search_key(iter);
btree_node_iter_for_each(node_iter, set)
if (set->end == old_end)
@@ -547,7 +521,7 @@ static void __bch2_btree_node_iter_fix(struct btree_iter *iter,
/* didn't find the bset in the iterator - might have to readd it: */
if (new_u64s &&
- btree_iter_pos_cmp(iter, b, where) > 0) {
+ bkey_iter_pos_cmp(b, where, &iter_pos) >= 0) {
bch2_btree_node_iter_push(node_iter, b, where, end);
goto fixup_done;
} else {
@@ -562,7 +536,7 @@ found:
return;
if (new_u64s &&
- btree_iter_pos_cmp(iter, b, where) > 0) {
+ bkey_iter_pos_cmp(b, where, &iter_pos) >= 0) {
set->k = offset;
} else if (set->k < offset + clobber_u64s) {
set->k = offset + new_u64s;
@@ -707,11 +681,12 @@ static inline bool btree_iter_advance_to_pos(struct btree_iter *iter,
struct btree_iter_level *l,
int max_advance)
{
+ struct bpos pos = btree_iter_search_key(iter);
struct bkey_packed *k;
int nr_advanced = 0;
while ((k = bch2_btree_node_iter_peek_all(&l->iter, l->b)) &&
- btree_iter_pos_cmp(iter, l->b, k) < 0) {
+ bkey_iter_pos_cmp(l->b, k, &pos) < 0) {
if (max_advance > 0 && nr_advanced >= max_advance)
return false;
@@ -770,13 +745,7 @@ static inline bool btree_iter_pos_before_node(struct btree_iter *iter,
static inline bool btree_iter_pos_after_node(struct btree_iter *iter,
struct btree *b)
{
- int cmp = bkey_cmp(b->key.k.p, iter->pos);
-
- if (!cmp &&
- (iter->flags & BTREE_ITER_IS_EXTENTS) &&
- bkey_cmp(b->key.k.p, POS_MAX))
- cmp = -1;
- return cmp < 0;
+ return bkey_cmp(b->key.k.p, btree_iter_search_key(iter)) < 0;
}
static inline bool btree_iter_pos_in_node(struct btree_iter *iter,
@@ -790,16 +759,10 @@ static inline bool btree_iter_pos_in_node(struct btree_iter *iter,
static inline void __btree_iter_init(struct btree_iter *iter,
unsigned level)
{
+ struct bpos pos = btree_iter_search_key(iter);
struct btree_iter_level *l = &iter->l[level];
- bch2_btree_node_iter_init(&l->iter, l->b, &iter->pos);
-
- if (iter->flags & BTREE_ITER_IS_EXTENTS)
- btree_iter_advance_to_pos(iter, l, -1);
-
- /* Skip to first non whiteout: */
- if (level)
- bch2_btree_node_iter_peek(&l->iter, l->b);
+ bch2_btree_node_iter_init(&l->iter, l->b, &pos);
btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK);
}
@@ -1032,10 +995,7 @@ retry_all:
for (i = 0; i < nr_sorted; i++) {
iter = &trans->iters[sorted[i]];
- do {
- ret = btree_iter_traverse_one(iter);
- } while (ret == -EINTR);
-
+ ret = btree_iter_traverse_one(iter);
if (ret)
goto retry_all;
}
@@ -1148,7 +1108,8 @@ static int btree_iter_traverse_one(struct btree_iter *iter)
iter->uptodate = BTREE_ITER_NEED_PEEK;
bch2_btree_trans_verify_locks(iter->trans);
- __bch2_btree_iter_verify(iter, iter->l[iter->level].b);
+ if (btree_iter_node(iter, iter->level))
+ __bch2_btree_iter_verify(iter, iter->l[iter->level].b);
return 0;
}
@@ -1378,12 +1339,6 @@ static inline struct bkey_s_c btree_iter_peek_uptodate(struct btree_iter *iter)
if (debug_check_iterators(iter->trans->c)) {
struct bkey k = bkey_unpack_key(l->b, _k);
- /*
- * this flag is internal to the btree code,
- * we don't care if it doesn't match - if it's now set
- * it just means the key has been written out to disk:
- */
- k.needs_whiteout = iter->k.needs_whiteout;
BUG_ON(memcmp(&k, &iter->k, sizeof(k)));
}
@@ -1571,9 +1526,7 @@ __bch2_btree_iter_peek_slot_extents(struct btree_iter *iter)
int ret;
recheck:
- while ((k = __btree_iter_peek_all(iter, l, &iter->k)).k &&
- bkey_cmp(k.k->p, iter->pos) <= 0)
- bch2_btree_node_iter_advance(&l->iter, l->b);
+ btree_iter_advance_to_pos(iter, l, -1);
/*
* iterator is now at the correct position for inserting at iter->pos,
@@ -1582,9 +1535,27 @@ recheck:
*/
node_iter = l->iter;
- if (k.k && bkey_whiteout(k.k))
- k = __btree_iter_unpack(iter, l, &iter->k,
- bch2_btree_node_iter_peek(&node_iter, l->b));
+ k = __btree_iter_unpack(iter, l, &iter->k,
+ bch2_btree_node_iter_peek(&node_iter, l->b));
+
+ if (k.k && bkey_cmp(bkey_start_pos(k.k), iter->pos) <= 0) {
+ /*
+ * If there wasn't actually a hole, want the iterator to be
+ * pointed at the key we found:
+ *
+ * XXX: actually, we shouldn't be changing the iterator here:
+ * the iterator needs to be correct for inserting at iter->pos,
+ * and there may be whiteouts between iter->pos and what this
+ * iterator points at:
+ */
+ l->iter = node_iter;
+
+ EBUG_ON(bkey_cmp(k.k->p, iter->pos) <= 0);
+ iter->uptodate = BTREE_ITER_UPTODATE;
+
+ __bch2_btree_iter_verify(iter, l->b);
+ return k;
+ }
/*
* If we got to the end of the node, check if we need to traverse to the
@@ -1599,24 +1570,6 @@ recheck:
goto recheck;
}
- if (k.k &&
- !bkey_whiteout(k.k) &&
- bkey_cmp(bkey_start_pos(k.k), iter->pos) <= 0) {
- /*
- * if we skipped forward to find the first non whiteout and
- * there _wasn't_ actually a hole, we want the iterator to be
- * pointed at the key we found:
- */
- l->iter = node_iter;
-
- EBUG_ON(bkey_cmp(k.k->p, iter->pos) < 0);
- EBUG_ON(bkey_deleted(k.k));
- iter->uptodate = BTREE_ITER_UPTODATE;
-
- __bch2_btree_iter_verify(iter, l->b);
- return k;
- }
-
/* hole */
/* holes can't span inode numbers: */
@@ -1797,10 +1750,9 @@ int bch2_trans_iter_free(struct btree_trans *trans,
static int bch2_trans_realloc_iters(struct btree_trans *trans,
unsigned new_size)
{
- void *new_iters, *new_updates, *new_sorted;
+ void *new_iters, *new_updates;
size_t iters_bytes;
size_t updates_bytes;
- size_t sorted_bytes;
new_size = roundup_pow_of_two(new_size);
@@ -1814,12 +1766,9 @@ static int bch2_trans_realloc_iters(struct btree_trans *trans,
bch2_trans_unlock(trans);
iters_bytes = sizeof(struct btree_iter) * new_size;
- updates_bytes = sizeof(struct btree_insert_entry) * (new_size + 4);
- sorted_bytes = sizeof(u8) * (new_size + 4);
+ updates_bytes = sizeof(struct btree_insert_entry) * new_size;
- new_iters = kmalloc(iters_bytes +
- updates_bytes +
- sorted_bytes, GFP_NOFS);
+ new_iters = kmalloc(iters_bytes + updates_bytes, GFP_NOFS);
if (new_iters)
goto success;
@@ -1829,7 +1778,6 @@ static int bch2_trans_realloc_iters(struct btree_trans *trans,
trans->used_mempool = true;
success:
new_updates = new_iters + iters_bytes;
- new_sorted = new_updates + updates_bytes;
memcpy(new_iters, trans->iters,
sizeof(struct btree_iter) * trans->nr_iters);
@@ -1846,7 +1794,6 @@ success:
trans->iters = new_iters;
trans->updates = new_updates;
- trans->updates_sorted = new_sorted;
trans->size = new_size;
if (trans->iters_live) {
@@ -1895,6 +1842,7 @@ static struct btree_iter *btree_trans_iter_alloc(struct btree_trans *trans)
got_slot:
BUG_ON(trans->iters_linked & (1ULL << idx));
trans->iters_linked |= 1ULL << idx;
+ trans->iters[idx].flags = 0;
return &trans->iters[idx];
}
@@ -1910,6 +1858,9 @@ static inline void btree_iter_copy(struct btree_iter *dst,
if (btree_node_locked(dst, i))
six_lock_increment(&dst->l[i].b->lock,
__btree_lock_want(dst, i));
+
+ dst->flags &= ~BTREE_ITER_KEEP_UNTIL_COMMIT;
+ dst->flags &= ~BTREE_ITER_SET_POS_AFTER_COMMIT;
}
static inline struct bpos bpos_diff(struct bpos l, struct bpos r)
@@ -1960,7 +1911,6 @@ static struct btree_iter *__btree_trans_get_iter(struct btree_trans *trans,
iter = best;
}
- iter->flags &= ~BTREE_ITER_KEEP_UNTIL_COMMIT;
iter->flags &= ~(BTREE_ITER_SLOTS|BTREE_ITER_INTENT|BTREE_ITER_PREFETCH);
iter->flags |= flags & (BTREE_ITER_SLOTS|BTREE_ITER_INTENT|BTREE_ITER_PREFETCH);
@@ -1972,6 +1922,7 @@ static struct btree_iter *__btree_trans_get_iter(struct btree_trans *trans,
BUG_ON(iter->btree_id != btree_id);
BUG_ON((iter->flags ^ flags) & BTREE_ITER_TYPE);
BUG_ON(iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT);
+ BUG_ON(iter->flags & BTREE_ITER_SET_POS_AFTER_COMMIT);
BUG_ON(trans->iters_live & (1ULL << iter->idx));
trans->iters_live |= 1ULL << iter->idx;
@@ -2034,7 +1985,6 @@ struct btree_iter *bch2_trans_copy_iter(struct btree_trans *trans,
* it's cheap to copy it again:
*/
trans->iters_touched &= ~(1ULL << iter->idx);
- iter->flags &= ~BTREE_ITER_KEEP_UNTIL_COMMIT;
return iter;
}
@@ -2094,7 +2044,8 @@ void bch2_trans_reset(struct btree_trans *trans, unsigned flags)
struct btree_iter *iter;
trans_for_each_iter(trans, iter)
- iter->flags &= ~BTREE_ITER_KEEP_UNTIL_COMMIT;
+ iter->flags &= ~(BTREE_ITER_KEEP_UNTIL_COMMIT|
+ BTREE_ITER_SET_POS_AFTER_COMMIT);
bch2_trans_unlink_iters(trans);
@@ -2103,12 +2054,21 @@ void bch2_trans_reset(struct btree_trans *trans, unsigned flags)
trans->iters_touched &= trans->iters_live;
+ trans->need_reset = 0;
trans->nr_updates = 0;
if (flags & TRANS_RESET_MEM)
trans->mem_top = 0;
- bch2_btree_iter_traverse_all(trans);
+ if (trans->fs_usage_deltas) {
+ trans->fs_usage_deltas->used = 0;
+ memset(&trans->fs_usage_deltas->memset_start, 0,
+ (void *) &trans->fs_usage_deltas->memset_end -
+ (void *) &trans->fs_usage_deltas->memset_start);
+ }
+
+ if (!(flags & TRANS_RESET_NOTRAVERSE))
+ bch2_btree_iter_traverse_all(trans);
}
void bch2_trans_init(struct btree_trans *trans, struct bch_fs *c,
@@ -2122,7 +2082,6 @@ void bch2_trans_init(struct btree_trans *trans, struct bch_fs *c,
trans->size = ARRAY_SIZE(trans->iters_onstack);
trans->iters = trans->iters_onstack;
trans->updates = trans->updates_onstack;
- trans->updates_sorted = trans->updates_sorted_onstack;
trans->fs_usage_deltas = NULL;
if (expected_nr_iters > trans->size)
@@ -2159,6 +2118,6 @@ int bch2_fs_btree_iter_init(struct bch_fs *c)
return mempool_init_kmalloc_pool(&c->btree_iters_pool, 1,
sizeof(struct btree_iter) * nr +
- sizeof(struct btree_insert_entry) * (nr + 4) +
- sizeof(u8) * (nr + 4));
+ sizeof(struct btree_insert_entry) * nr +
+ sizeof(u8) * nr);
}
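
btree_iter_search_key() above replaces the old per-key comparison: for extent iterators the search key becomes the successor of iter->pos, so a plain node-iterator search lands on the first key ending strictly after the iterator position. bkey_successor() (from bkey.h, reproduced here as a sketch) is just an increment with carry into the inode field:

static inline struct bpos bkey_successor(struct bpos p)
{
	struct bpos ret = p;

	/* increment offset, carrying into the inode number on overflow: */
	if (!++ret.offset)
		BUG_ON(!++ret.inode);

	return ret;
}
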
diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h
index 4c5032222319..962380925511 100644
--- a/fs/bcachefs/btree_iter.h
+++ b/fs/bcachefs/btree_iter.h
@@ -291,6 +291,7 @@ struct btree_iter *bch2_trans_get_node_iter(struct btree_trans *,
#define TRANS_RESET_ITERS (1 << 0)
#define TRANS_RESET_MEM (1 << 1)
+#define TRANS_RESET_NOTRAVERSE (1 << 2)
void bch2_trans_reset(struct btree_trans *, unsigned);
@@ -299,11 +300,6 @@ static inline void bch2_trans_begin(struct btree_trans *trans)
return bch2_trans_reset(trans, TRANS_RESET_ITERS|TRANS_RESET_MEM);
}
-static inline void bch2_trans_begin_updates(struct btree_trans *trans)
-{
- return bch2_trans_reset(trans, TRANS_RESET_MEM);
-}
-
void *bch2_trans_kmalloc(struct btree_trans *, size_t);
void bch2_trans_init(struct btree_trans *, struct bch_fs *, unsigned, size_t);
int bch2_trans_exit(struct btree_trans *);
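
With bch2_trans_begin_updates() gone, callers use the retry pattern that the __bch2_trans_do() macro in btree_update.h now encodes. Spelled out as a sketch - do_stuff() is a hypothetical transaction body returning 0 or -EINTR:

	struct btree_trans trans;
	int ret;

	bch2_trans_init(&trans, c, 0, 0);
	do {
		/* drop updates, reset memory, retraverse iterators: */
		bch2_trans_begin(&trans);

		ret = do_stuff(&trans) ?:
		      bch2_trans_commit(&trans, NULL, NULL, 0);
	} while (ret == -EINTR);
	bch2_trans_exit(&trans);
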
diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h
index efa68bb578ab..b7af88e05837 100644
--- a/fs/bcachefs/btree_types.h
+++ b/fs/bcachefs/btree_types.h
@@ -94,7 +94,6 @@ struct btree {
struct btree_nr_keys nr;
u16 sib_u64s[2];
u16 whiteout_u64s;
- u16 uncompacted_whiteout_u64s;
u8 page_order;
u8 unpack_fn_len;
@@ -185,9 +184,25 @@ enum btree_iter_type {
#define BTREE_ITER_TYPE ((1 << 2) - 1)
+/*
+ * Iterate over all possible positions, synthesizing deleted keys for holes:
+ */
#define BTREE_ITER_SLOTS (1 << 2)
+/*
+ * Indicates that intent locks should be taken on leaf nodes, because we expect
+ * to be doing updates:
+ */
#define BTREE_ITER_INTENT (1 << 3)
+/*
+ * Causes the btree iterator code to prefetch additional btree nodes from disk:
+ */
#define BTREE_ITER_PREFETCH (1 << 4)
+/*
+ * Indicates that this iterator should not be reused until transaction commit,
+ * either because a pending update references it or because the update depends
+ * on that particular key being locked (e.g. by the str_hash code, for hash
+ * table consistency)
+ */
#define BTREE_ITER_KEEP_UNTIL_COMMIT (1 << 5)
/*
* Used in bch2_btree_iter_traverse(), to indicate whether we're searching for
@@ -195,6 +210,7 @@ enum btree_iter_type {
*/
#define BTREE_ITER_IS_EXTENTS (1 << 6)
#define BTREE_ITER_ERROR (1 << 7)
+#define BTREE_ITER_SET_POS_AFTER_COMMIT (1 << 8)
enum btree_iter_uptodate {
BTREE_ITER_UPTODATE = 0,
@@ -211,12 +227,13 @@ enum btree_iter_uptodate {
* @nodes_intent_locked - bitmask indicating which locks are intent locks
*/
struct btree_iter {
- u8 idx;
-
struct btree_trans *trans;
struct bpos pos;
+ struct bpos pos_after_commit;
+
+ u16 flags;
+ u8 idx;
- u8 flags;
enum btree_iter_uptodate uptodate:4;
enum btree_id btree_id:4;
unsigned level:4,
@@ -243,6 +260,8 @@ static inline enum btree_iter_type btree_iter_type(struct btree_iter *iter)
}
struct btree_insert_entry {
+ unsigned trigger_flags;
+ unsigned trans_triggers_run:1;
struct bkey_i *k;
struct btree_iter *iter;
};
@@ -263,6 +282,7 @@ struct btree_trans {
unsigned used_mempool:1;
unsigned error:1;
unsigned nounlock:1;
+ unsigned need_reset:1;
unsigned mem_top;
unsigned mem_bytes;
@@ -270,7 +290,6 @@ struct btree_trans {
struct btree_iter *iters;
struct btree_insert_entry *updates;
- u8 *updates_sorted;
/* update path: */
struct journal_res journal_res;
@@ -279,11 +298,11 @@ struct btree_trans {
struct disk_reservation *disk_res;
unsigned flags;
unsigned journal_u64s;
+ unsigned journal_preres_u64s;
struct replicas_delta_list *fs_usage_deltas;
struct btree_iter iters_onstack[2];
- struct btree_insert_entry updates_onstack[6];
- u8 updates_sorted_onstack[6];
+ struct btree_insert_entry updates_onstack[2];
};
#define BTREE_FLAG(flag) \
@@ -308,6 +327,7 @@ enum btree_flags {
BTREE_NODE_just_written,
BTREE_NODE_dying,
BTREE_NODE_fake,
+ BTREE_NODE_old_extent_overwrite,
};
BTREE_FLAG(read_in_flight);
@@ -321,6 +341,7 @@ BTREE_FLAG(write_in_flight);
BTREE_FLAG(just_written);
BTREE_FLAG(dying);
BTREE_FLAG(fake);
+BTREE_FLAG(old_extent_overwrite);
static inline struct btree_write *btree_current_write(struct btree *b)
{
@@ -421,6 +442,11 @@ static inline unsigned bset_u64s(struct bset_tree *t)
sizeof(struct bset) / sizeof(u64);
}
+static inline unsigned bset_dead_u64s(struct btree *b, struct bset_tree *t)
+{
+ return bset_u64s(t) - b->nr.bset_u64s[t - b->set];
+}
+
static inline unsigned bset_byte_offset(struct btree *b, void *i)
{
return i - (void *) b->data;
@@ -474,6 +500,32 @@ static inline bool btree_node_is_extents(struct btree *b)
(1U << BKEY_TYPE_INODES)| \
(1U << BKEY_TYPE_REFLINK))
+enum btree_trigger_flags {
+ __BTREE_TRIGGER_NORUN, /* Don't run triggers at all */
+ __BTREE_TRIGGER_NOOVERWRITES, /* Don't run triggers on overwrites */
+
+ __BTREE_TRIGGER_INSERT,
+ __BTREE_TRIGGER_OVERWRITE,
+ __BTREE_TRIGGER_OVERWRITE_SPLIT,
+
+ __BTREE_TRIGGER_GC,
+ __BTREE_TRIGGER_BUCKET_INVALIDATE,
+ __BTREE_TRIGGER_ALLOC_READ,
+ __BTREE_TRIGGER_NOATOMIC,
+};
+
+#define BTREE_TRIGGER_NORUN (1U << __BTREE_TRIGGER_NORUN)
+#define BTREE_TRIGGER_NOOVERWRITES (1U << __BTREE_TRIGGER_NOOVERWRITES)
+
+#define BTREE_TRIGGER_INSERT (1U << __BTREE_TRIGGER_INSERT)
+#define BTREE_TRIGGER_OVERWRITE (1U << __BTREE_TRIGGER_OVERWRITE)
+#define BTREE_TRIGGER_OVERWRITE_SPLIT (1U << __BTREE_TRIGGER_OVERWRITE_SPLIT)
+
+#define BTREE_TRIGGER_GC (1U << __BTREE_TRIGGER_GC)
+#define BTREE_TRIGGER_BUCKET_INVALIDATE (1U << __BTREE_TRIGGER_BUCKET_INVALIDATE)
+#define BTREE_TRIGGER_ALLOC_READ (1U << __BTREE_TRIGGER_ALLOC_READ)
+#define BTREE_TRIGGER_NOATOMIC (1U << __BTREE_TRIGGER_NOATOMIC)
+
static inline bool btree_node_type_needs_gc(enum btree_node_type type)
{
return BTREE_NODE_TYPE_HAS_TRIGGERS & (1U << type);
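
BTREE_FLAG(old_extent_overwrite) expands via the BTREE_FLAG() macro into the accessors used throughout this patch; a sketch of the generated code, roughly:

static inline bool btree_node_old_extent_overwrite(struct btree *b)
{
	return test_bit(BTREE_NODE_old_extent_overwrite, &b->flags);
}

static inline void set_btree_node_old_extent_overwrite(struct btree *b)
{
	set_bit(BTREE_NODE_old_extent_overwrite, &b->flags);
}
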
diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h
index ad8cbf3fb778..2c34bae64281 100644
--- a/fs/bcachefs/btree_update.h
+++ b/fs/bcachefs/btree_update.h
@@ -15,8 +15,7 @@ bool bch2_btree_bset_insert_key(struct btree_iter *, struct btree *,
void bch2_btree_journal_key(struct btree_trans *, struct btree_iter *,
struct bkey_i *);
-enum {
- __BTREE_INSERT_ATOMIC,
+enum btree_insert_flags {
__BTREE_INSERT_NOUNLOCK,
__BTREE_INSERT_NOFAIL,
__BTREE_INSERT_NOCHECK_RW,
@@ -25,10 +24,6 @@ enum {
__BTREE_INSERT_USE_ALLOC_RESERVE,
__BTREE_INSERT_JOURNAL_REPLAY,
__BTREE_INSERT_JOURNAL_RESERVED,
- __BTREE_INSERT_NOMARK_OVERWRITES,
- __BTREE_INSERT_NOMARK,
- __BTREE_INSERT_NO_CLEAR_REPLICAS,
- __BTREE_INSERT_BUCKET_INVALIDATE,
__BTREE_INSERT_NOWAIT,
__BTREE_INSERT_GC_LOCK_HELD,
__BCH_HASH_SET_MUST_CREATE,
@@ -36,12 +31,6 @@ enum {
};
/*
- * Don't drop/retake locks before doing btree update, instead return -EINTR if
- * we had to drop locks for any reason
- */
-#define BTREE_INSERT_ATOMIC (1 << __BTREE_INSERT_ATOMIC)
-
-/*
* Don't drop locks _after_ successfully updating btree:
*/
#define BTREE_INSERT_NOUNLOCK (1 << __BTREE_INSERT_NOUNLOCK)
@@ -61,16 +50,6 @@ enum {
#define BTREE_INSERT_JOURNAL_RESERVED (1 << __BTREE_INSERT_JOURNAL_RESERVED)
-/* Don't mark overwrites, just new key: */
-#define BTREE_INSERT_NOMARK_OVERWRITES (1 << __BTREE_INSERT_NOMARK_OVERWRITES)
-
-/* Don't call mark new key at all: */
-#define BTREE_INSERT_NOMARK (1 << __BTREE_INSERT_NOMARK)
-
-#define BTREE_INSERT_NO_CLEAR_REPLICAS (1 << __BTREE_INSERT_NO_CLEAR_REPLICAS)
-
-#define BTREE_INSERT_BUCKET_INVALIDATE (1 << __BTREE_INSERT_BUCKET_INVALIDATE)
-
/* Don't block on allocation failure (for new btree nodes): */
#define BTREE_INSERT_NOWAIT (1 << __BTREE_INSERT_NOWAIT)
#define BTREE_INSERT_GC_LOCK_HELD (1 << __BTREE_INSERT_GC_LOCK_HELD)
@@ -93,6 +72,8 @@ int bch2_btree_node_rewrite(struct bch_fs *c, struct btree_iter *,
int bch2_btree_node_update_key(struct bch_fs *, struct btree_iter *,
struct btree *, struct bkey_i_btree_ptr *);
+int bch2_trans_update(struct btree_trans *, struct btree_iter *,
+ struct bkey_i *, enum btree_trigger_flags);
int __bch2_trans_commit(struct btree_trans *);
/**
@@ -101,8 +82,7 @@ int __bch2_trans_commit(struct btree_trans *);
* This is main entry point for btree updates.
*
* Return values:
- * -EINTR: locking changed, this function should be called again. Only returned
- * if passed BTREE_INSERT_ATOMIC.
+ * -EINTR: locking changed, this function should be called again.
* -EROFS: filesystem read only
* -EIO: journal or btree node IO error
*/
@@ -118,37 +98,34 @@ static inline int bch2_trans_commit(struct btree_trans *trans,
return __bch2_trans_commit(trans);
}
-static inline void bch2_trans_update(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bkey_i *k)
-{
- EBUG_ON(trans->nr_updates >= trans->nr_iters + 4);
-
- iter->flags |= BTREE_ITER_KEEP_UNTIL_COMMIT;
-
- trans->updates[trans->nr_updates++] = (struct btree_insert_entry) {
- .iter = iter, .k = k
- };
-}
-
-#define bch2_trans_do(_c, _journal_seq, _flags, _do) \
+#define __bch2_trans_do(_trans, _disk_res, _journal_seq, \
+ _flags, _reset_flags, _do) \
({ \
- struct btree_trans trans; \
int _ret; \
\
- bch2_trans_init(&trans, (_c), 0, 0); \
- \
do { \
- bch2_trans_begin(&trans); \
+ bch2_trans_reset(_trans, _reset_flags); \
\
- _ret = (_do) ?: bch2_trans_commit(&trans, NULL, \
+ _ret = (_do) ?: bch2_trans_commit(_trans, (_disk_res), \
(_journal_seq), (_flags)); \
} while (_ret == -EINTR); \
\
- bch2_trans_exit(&trans); \
_ret; \
})
+#define bch2_trans_do(_c, _disk_res, _journal_seq, _flags, _do) \
+({ \
+ struct btree_trans trans; \
+ int _ret, _ret2; \
+ \
+ bch2_trans_init(&trans, (_c), 0, 0); \
+ _ret = __bch2_trans_do(&trans, _disk_res, _journal_seq, _flags, \
+ TRANS_RESET_MEM|TRANS_RESET_ITERS, _do); \
+ _ret2 = bch2_trans_exit(&trans); \
+ \
+ _ret ?: _ret2; \
+})
+
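
bch2_trans_update() now takes per-update trigger flags in place of the transaction-wide BTREE_INSERT_NOMARK* flags removed above; a sketch of the two common cases, with iter and k assumed to be set up by the caller:

	/* run triggers normally: */
	bch2_trans_update(trans, iter, k, 0);

	/*
	 * queue the update without running triggers at all
	 * (the per-update replacement for BTREE_INSERT_NOMARK):
	 */
	bch2_trans_update(trans, iter, k, BTREE_TRIGGER_NORUN);
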
#define trans_for_each_update(_trans, _i) \
for ((_i) = (_trans)->updates; \
(_i) < (_trans)->updates + (_trans)->nr_updates; \
diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c
index f8a30cb34750..748e6356f3d6 100644
--- a/fs/bcachefs/btree_update_interior.c
+++ b/fs/bcachefs/btree_update_interior.c
@@ -193,8 +193,8 @@ found:
gc_pos_cmp(c->gc_pos, gc_phase(GC_PHASE_PENDING_DELETE)) < 0)
bch2_mark_key_locked(c, bkey_i_to_s_c(&d->key),
0, 0, NULL, 0,
- BCH_BUCKET_MARK_OVERWRITE|
- BCH_BUCKET_MARK_GC);
+ BTREE_TRIGGER_OVERWRITE|
+ BTREE_TRIGGER_GC);
}
static void __btree_node_free(struct bch_fs *c, struct btree *b)
@@ -265,13 +265,13 @@ static void bch2_btree_node_free_ondisk(struct bch_fs *c,
BUG_ON(!pending->index_update_done);
bch2_mark_key(c, bkey_i_to_s_c(&pending->key),
- 0, 0, NULL, 0, BCH_BUCKET_MARK_OVERWRITE);
+ 0, 0, NULL, 0, BTREE_TRIGGER_OVERWRITE);
if (gc_visited(c, gc_phase(GC_PHASE_PENDING_DELETE)))
bch2_mark_key(c, bkey_i_to_s_c(&pending->key),
0, 0, NULL, 0,
- BCH_BUCKET_MARK_OVERWRITE|
- BCH_BUCKET_MARK_GC);
+ BTREE_TRIGGER_OVERWRITE|
+ BTREE_TRIGGER_GC);
}
static struct btree *__bch2_btree_node_alloc(struct bch_fs *c,
@@ -374,6 +374,13 @@ static struct btree *bch2_btree_node_alloc(struct btree_update *as, unsigned lev
SET_BTREE_NODE_LEVEL(b->data, level);
b->data->ptr = bkey_i_to_btree_ptr(&b->key)->v.start[0];
+ if (c->sb.features & (1ULL << BCH_FEATURE_new_extent_overwrite))
+ SET_BTREE_NODE_NEW_EXTENT_OVERWRITE(b->data, true);
+
+ if (btree_node_is_extents(b) &&
+ !BTREE_NODE_NEW_EXTENT_OVERWRITE(b->data))
+ set_btree_node_old_extent_overwrite(b);
+
bch2_btree_build_aux_trees(b);
btree_node_will_make_reachable(as, b);
@@ -1077,12 +1084,12 @@ static void bch2_btree_set_root_inmem(struct btree_update *as, struct btree *b)
bch2_mark_key_locked(c, bkey_i_to_s_c(&b->key),
0, 0, fs_usage, 0,
- BCH_BUCKET_MARK_INSERT);
+ BTREE_TRIGGER_INSERT);
if (gc_visited(c, gc_pos_btree_root(b->btree_id)))
bch2_mark_key_locked(c, bkey_i_to_s_c(&b->key),
0, 0, NULL, 0,
- BCH_BUCKET_MARK_INSERT|
- BCH_BUCKET_MARK_GC);
+ BTREE_TRIGGER_INSERT|
+ BTREE_TRIGGER_GC);
if (old && !btree_node_fake(old))
bch2_btree_node_free_index(as, NULL,
@@ -1175,16 +1182,16 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as, struct btree *b
bch2_mark_key_locked(c, bkey_i_to_s_c(insert),
0, 0, fs_usage, 0,
- BCH_BUCKET_MARK_INSERT);
+ BTREE_TRIGGER_INSERT);
if (gc_visited(c, gc_pos_btree_node(b)))
bch2_mark_key_locked(c, bkey_i_to_s_c(insert),
0, 0, NULL, 0,
- BCH_BUCKET_MARK_INSERT|
- BCH_BUCKET_MARK_GC);
+ BTREE_TRIGGER_INSERT|
+ BTREE_TRIGGER_GC);
while ((k = bch2_btree_node_iter_peek_all(node_iter, b)) &&
- bkey_iter_pos_cmp(b, &insert->k.p, k) > 0)
+ bkey_iter_pos_cmp(b, k, &insert->k.p) < 0)
bch2_btree_node_iter_advance(node_iter, b);
/*
@@ -1378,7 +1385,7 @@ static void btree_split(struct btree_update *as, struct btree *b,
if (keys)
btree_split_insert_keys(as, n1, iter, keys);
- if (vstruct_blocks(n1->data, c->block_bits) > BTREE_SPLIT_THRESHOLD(c)) {
+ if (bset_u64s(&n1->set[0]) > BTREE_SPLIT_THRESHOLD(c)) {
trace_btree_split(c, b);
n2 = __btree_split_node(as, n1, iter);
@@ -1657,6 +1664,8 @@ void __bch2_foreground_maybe_merge(struct bch_fs *c,
size_t sib_u64s;
int ret = 0;
+ BUG_ON(!btree_node_locked(iter, level));
+
closure_init_stack(&cl);
retry:
BUG_ON(!btree_node_locked(iter, level));
@@ -2022,12 +2031,12 @@ static void __bch2_btree_node_update_key(struct bch_fs *c,
bch2_mark_key_locked(c, bkey_i_to_s_c(&new_key->k_i),
0, 0, fs_usage, 0,
- BCH_BUCKET_MARK_INSERT);
+ BTREE_TRIGGER_INSERT);
if (gc_visited(c, gc_pos_btree_root(b->btree_id)))
bch2_mark_key_locked(c, bkey_i_to_s_c(&new_key->k_i),
0, 0, NULL, 0,
- BCH_BUCKET_MARK_INSERT||
- BCH_BUCKET_MARK_GC);
+ BTREE_TRIGGER_INSERT|
+ BTREE_TRIGGER_GC);
bch2_btree_node_free_index(as, NULL,
bkey_i_to_s_c(&b->key),
diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h
index c5a0ab5d7bb8..2d8e0b7f3aaf 100644
--- a/fs/bcachefs/btree_update_interior.h
+++ b/fs/bcachefs/btree_update_interior.h
@@ -251,8 +251,7 @@ static inline ssize_t __bch_btree_u64s_remaining(struct bch_fs *c,
void *end)
{
ssize_t used = bset_byte_offset(b, end) / sizeof(u64) +
- b->whiteout_u64s +
- b->uncompacted_whiteout_u64s;
+ b->whiteout_u64s;
ssize_t total = c->opts.btree_node_size << 6;
return total - used;
@@ -302,23 +301,19 @@ static inline struct btree_node_entry *want_new_bset(struct bch_fs *c,
return NULL;
}
-static inline void unreserve_whiteout(struct btree *b, struct bkey_packed *k)
+static inline void push_whiteout(struct bch_fs *c, struct btree *b,
+ struct bkey_packed *k)
{
- if (bkey_written(b, k)) {
- EBUG_ON(b->uncompacted_whiteout_u64s <
- bkeyp_key_u64s(&b->format, k));
- b->uncompacted_whiteout_u64s -=
- bkeyp_key_u64s(&b->format, k);
- }
-}
+ unsigned u64s = bkeyp_key_u64s(&b->format, k);
+ struct bkey_packed *dst;
-static inline void reserve_whiteout(struct btree *b, struct bkey_packed *k)
-{
- if (bkey_written(b, k)) {
- BUG_ON(!k->needs_whiteout);
- b->uncompacted_whiteout_u64s +=
- bkeyp_key_u64s(&b->format, k);
- }
+ BUG_ON(u64s > bch_btree_keys_u64s_remaining(c, b));
+
+ b->whiteout_u64s += bkeyp_key_u64s(&b->format, k);
+ dst = unwritten_whiteouts_start(c, b);
+ memcpy_u64s(dst, k, u64s);
+ dst->u64s = u64s;
+ dst->type = KEY_TYPE_deleted;
}
/*
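
push_whiteout() replaces the reserve/unreserve accounting: whiteouts for keys that were already written to disk are now stashed, retyped as deleted keys, in a region at the end of the node buffer and sorted into the next bset written. The layout assumed (per unwritten_whiteouts_start()/end() elsewhere in this header) is roughly:

/*
 * Node buffer while the node is open for writes (sketch):
 *
 *   | written bsets | current bset | free space | unwritten whiteouts |
 *                                               ^                     ^
 *                         unwritten_whiteouts_start(c, b)      end of buffer
 *
 * b->whiteout_u64s grows the region downward; the next write sorts
 * these whiteouts into the bset being written out.
 */
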
diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c
index d37a95299240..afd2086edeff 100644
--- a/fs/bcachefs/btree_update_leaf.c
+++ b/fs/bcachefs/btree_update_leaf.c
@@ -21,18 +21,12 @@
#include <trace/events/bcachefs.h>
static inline bool same_leaf_as_prev(struct btree_trans *trans,
- unsigned idx)
+ struct btree_insert_entry *i)
{
- return idx &&
- trans->updates[trans->updates_sorted[idx]].iter->l[0].b ==
- trans->updates[trans->updates_sorted[idx - 1]].iter->l[0].b;
+ return i != trans->updates &&
+ i[0].iter->l[0].b == i[-1].iter->l[0].b;
}
-#define trans_for_each_update_sorted(_trans, _i, _iter) \
- for (_iter = 0; \
- _iter < _trans->nr_updates && \
- (_i = _trans->updates + _trans->updates_sorted[_iter], 1); \
- _iter++)
inline void bch2_btree_node_lock_for_insert(struct bch_fs *c, struct btree *b,
struct btree_iter *iter)
@@ -51,28 +45,6 @@ inline void bch2_btree_node_lock_for_insert(struct bch_fs *c, struct btree *b,
bch2_btree_init_next(c, b, iter);
}
-static inline void btree_trans_sort_updates(struct btree_trans *trans)
-{
- struct btree_insert_entry *l, *r;
- unsigned nr = 0, pos;
-
- trans_for_each_update(trans, l) {
- for (pos = 0; pos < nr; pos++) {
- r = trans->updates + trans->updates_sorted[pos];
-
- if (btree_iter_cmp(l->iter, r->iter) <= 0)
- break;
- }
-
- memmove(&trans->updates_sorted[pos + 1],
- &trans->updates_sorted[pos],
- (nr - pos) * sizeof(trans->updates_sorted[0]));
-
- trans->updates_sorted[pos] = l - trans->updates;
- nr++;
- }
-}
-
/* Inserting into a given leaf node (last stage of insert): */
/* Handle overwrites and do insert, for non extents: */
@@ -92,58 +64,63 @@ bool bch2_btree_bset_insert_key(struct btree_iter *iter,
bkey_cmp(insert->k.p, b->data->max_key) > 0);
k = bch2_btree_node_iter_peek_all(node_iter, b);
- if (k && !bkey_cmp_packed(b, k, &insert->k)) {
- BUG_ON(bkey_whiteout(k));
+ if (k && bkey_cmp_packed(b, k, &insert->k))
+ k = NULL;
- if (!bkey_written(b, k) &&
- bkey_val_u64s(&insert->k) == bkeyp_val_u64s(f, k) &&
- !bkey_whiteout(&insert->k)) {
- k->type = insert->k.type;
- memcpy_u64s(bkeyp_val(f, k), &insert->v,
- bkey_val_u64s(&insert->k));
- return true;
- }
+ /* @k is the key being overwritten/deleted, if any: */
- insert->k.needs_whiteout = k->needs_whiteout;
+ EBUG_ON(k && bkey_whiteout(k));
+
+ if (bkey_whiteout(&insert->k)) {
+ /* Deleting: */
+
+ /* Not found? Nothing to do: */
+ if (!k)
+ return false;
btree_account_key_drop(b, k);
+ k->type = KEY_TYPE_deleted;
+
+ if (k->needs_whiteout) {
+ push_whiteout(iter->trans->c, b, k);
+ k->needs_whiteout = false;
+ }
if (k >= btree_bset_last(b)->start) {
clobber_u64s = k->u64s;
- /*
- * If we're deleting, and the key we're deleting doesn't
- * need a whiteout (it wasn't overwriting a key that had
- * been written to disk) - just delete it:
- */
- if (bkey_whiteout(&insert->k) && !k->needs_whiteout) {
- bch2_bset_delete(b, k, clobber_u64s);
- bch2_btree_node_iter_fix(iter, b, node_iter,
- k, clobber_u64s, 0);
- return true;
- }
+ bch2_bset_delete(b, k, clobber_u64s);
+ bch2_btree_node_iter_fix(iter, b, node_iter, k,
+ clobber_u64s, 0);
+ } else {
+ bch2_btree_iter_fix_key_modified(iter, b, k);
+ }
- goto overwrite;
+ return true;
+ }
+
+ if (k) {
+ /* Overwriting: */
+ if (!bkey_written(b, k) &&
+ bkey_val_u64s(&insert->k) == bkeyp_val_u64s(f, k)) {
+ k->type = insert->k.type;
+ memcpy_u64s(bkeyp_val(f, k), &insert->v,
+ bkey_val_u64s(&insert->k));
+ return true;
}
+ btree_account_key_drop(b, k);
k->type = KEY_TYPE_deleted;
- bch2_btree_node_iter_fix(iter, b, node_iter, k,
- k->u64s, k->u64s);
- if (bkey_whiteout(&insert->k)) {
- reserve_whiteout(b, k);
- return true;
+ insert->k.needs_whiteout = k->needs_whiteout;
+ k->needs_whiteout = false;
+
+ if (k >= btree_bset_last(b)->start) {
+ clobber_u64s = k->u64s;
+ goto overwrite;
} else {
- k->needs_whiteout = false;
+ bch2_btree_iter_fix_key_modified(iter, b, k);
}
- } else {
- /*
- * Deleting, but the key to delete wasn't found - nothing to do:
- */
- if (bkey_whiteout(&insert->k))
- return false;
-
- insert->k.needs_whiteout = false;
}
k = bch2_btree_node_iter_bset_pos(node_iter, b, bset_tree_last(b));
@@ -234,38 +211,39 @@ void bch2_btree_journal_key(struct btree_trans *trans,
}
static void bch2_insert_fixup_key(struct btree_trans *trans,
- struct btree_insert_entry *insert)
+ struct btree_iter *iter,
+ struct bkey_i *insert)
{
- struct btree_iter *iter = insert->iter;
struct btree_iter_level *l = &iter->l[0];
EBUG_ON(iter->level);
- EBUG_ON(insert->k->k.u64s >
+ EBUG_ON(insert->k.u64s >
bch_btree_keys_u64s_remaining(trans->c, l->b));
- if (likely(bch2_btree_bset_insert_key(iter, l->b, &l->iter,
- insert->k)))
- bch2_btree_journal_key(trans, iter, insert->k);
+ if (likely(bch2_btree_bset_insert_key(iter, l->b, &l->iter, insert)))
+ bch2_btree_journal_key(trans, iter, insert);
}
/**
* btree_insert_key - insert a key one key into a leaf node
*/
static void btree_insert_key_leaf(struct btree_trans *trans,
- struct btree_insert_entry *insert)
+ struct btree_iter *iter,
+ struct bkey_i *insert)
{
struct bch_fs *c = trans->c;
- struct btree_iter *iter = insert->iter;
struct btree *b = iter->l[0].b;
struct bset_tree *t = bset_tree_last(b);
int old_u64s = bset_u64s(t);
int old_live_u64s = b->nr.live_u64s;
int live_u64s_added, u64s_added;
+ insert->k.needs_whiteout = false;
+
if (!btree_node_is_extents(b))
- bch2_insert_fixup_key(trans, insert);
+ bch2_insert_fixup_key(trans, iter, insert);
else
- bch2_insert_fixup_extent(trans, insert);
+ bch2_insert_fixup_extent(trans, iter, insert);
live_u64s_added = (int) b->nr.live_u64s - old_live_u64s;
u64s_added = (int) bset_u64s(t) - old_u64s;
@@ -279,26 +257,25 @@ static void btree_insert_key_leaf(struct btree_trans *trans,
bch2_maybe_compact_whiteouts(c, b))
bch2_btree_iter_reinit_node(iter, b);
- trace_btree_insert_key(c, b, insert->k);
+ trace_btree_insert_key(c, b, insert);
}
/* Normal update interface: */
static inline void btree_insert_entry_checks(struct btree_trans *trans,
- struct btree_insert_entry *i)
+ struct btree_iter *iter,
+ struct bkey_i *insert)
{
struct bch_fs *c = trans->c;
- BUG_ON(i->iter->level);
- BUG_ON(bkey_cmp(bkey_start_pos(&i->k->k), i->iter->pos));
- EBUG_ON((i->iter->flags & BTREE_ITER_IS_EXTENTS) &&
- bkey_cmp(i->k->k.p, i->iter->l[0].b->key.k.p) > 0);
- EBUG_ON((i->iter->flags & BTREE_ITER_IS_EXTENTS) &&
- !(trans->flags & BTREE_INSERT_ATOMIC));
+ BUG_ON(iter->level);
+ BUG_ON(bkey_cmp(bkey_start_pos(&insert->k), iter->pos));
+ EBUG_ON((iter->flags & BTREE_ITER_IS_EXTENTS) &&
+ bkey_cmp(insert->k.p, iter->l[0].b->key.k.p) > 0);
BUG_ON(debug_check_bkeys(c) &&
- !bkey_deleted(&i->k->k) &&
- bch2_bkey_invalid(c, bkey_i_to_s_c(i->k), i->iter->btree_id));
+ !bkey_deleted(&insert->k) &&
+ bch2_bkey_invalid(c, bkey_i_to_s_c(insert), iter->btree_id));
}
static noinline int
@@ -339,11 +316,12 @@ static inline int bch2_trans_journal_res_get(struct btree_trans *trans,
static enum btree_insert_ret
btree_key_can_insert(struct btree_trans *trans,
- struct btree_insert_entry *insert,
+ struct btree_iter *iter,
+ struct bkey_i *insert,
unsigned *u64s)
{
struct bch_fs *c = trans->c;
- struct btree *b = insert->iter->l[0].b;
+ struct btree *b = iter->l[0].b;
enum btree_insert_ret ret;
if (unlikely(btree_node_fake(b)))
@@ -351,7 +329,7 @@ btree_key_can_insert(struct btree_trans *trans,
ret = !btree_node_is_extents(b)
? BTREE_INSERT_OK
- : bch2_extent_can_insert(trans, insert, u64s);
+ : bch2_extent_can_insert(trans, iter, insert, u64s);
if (ret)
return ret;
@@ -362,21 +340,22 @@ btree_key_can_insert(struct btree_trans *trans,
}
static inline void do_btree_insert_one(struct btree_trans *trans,
- struct btree_insert_entry *insert)
+ struct btree_iter *iter,
+ struct bkey_i *insert)
{
- btree_insert_key_leaf(trans, insert);
+ btree_insert_key_leaf(trans, iter, insert);
}
-static inline bool update_has_trans_triggers(struct btree_insert_entry *i)
+static inline bool iter_has_trans_triggers(struct btree_iter *iter)
{
- return BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS & (1U << i->iter->btree_id);
+ return BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS & (1U << iter->btree_id);
}
-static inline bool update_has_nontrans_triggers(struct btree_insert_entry *i)
+static inline bool iter_has_nontrans_triggers(struct btree_iter *iter)
{
return (BTREE_NODE_TYPE_HAS_TRIGGERS &
~BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS) &
- (1U << i->iter->btree_id);
+ (1U << iter->btree_id);
}
static noinline void bch2_btree_iter_unlock_noinline(struct btree_iter *iter)
@@ -388,17 +367,11 @@ static noinline void bch2_trans_mark_gc(struct btree_trans *trans)
{
struct bch_fs *c = trans->c;
struct btree_insert_entry *i;
- unsigned mark_flags = trans->flags & BTREE_INSERT_BUCKET_INVALIDATE
- ? BCH_BUCKET_MARK_BUCKET_INVALIDATE
- : 0;
-
- if (unlikely(trans->flags & BTREE_INSERT_NOMARK))
- return;
trans_for_each_update(trans, i)
if (gc_visited(c, gc_pos_btree_node(i->iter->l[0].b)))
- bch2_mark_update(trans, i, NULL,
- mark_flags|BCH_BUCKET_MARK_GC);
+ bch2_mark_update(trans, i->iter, i->k, NULL,
+ i->trigger_flags|BTREE_TRIGGER_GC);
}
static inline int
@@ -408,10 +381,7 @@ bch2_trans_commit_write_locked(struct btree_trans *trans,
struct bch_fs *c = trans->c;
struct bch_fs_usage *fs_usage = NULL;
struct btree_insert_entry *i;
- unsigned mark_flags = trans->flags & BTREE_INSERT_BUCKET_INVALIDATE
- ? BCH_BUCKET_MARK_BUCKET_INVALIDATE
- : 0;
- unsigned iter, u64s = 0;
+ unsigned u64s = 0;
bool marking = false;
int ret;
@@ -428,13 +398,13 @@ bch2_trans_commit_write_locked(struct btree_trans *trans,
prefetch(&trans->c->journal.flags);
- trans_for_each_update_sorted(trans, i, iter) {
+ trans_for_each_update(trans, i) {
/* Multiple inserts might go to same leaf: */
- if (!same_leaf_as_prev(trans, iter))
+ if (!same_leaf_as_prev(trans, i))
u64s = 0;
u64s += i->k->k.u64s;
- ret = btree_key_can_insert(trans, i, &u64s);
+ ret = btree_key_can_insert(trans, i->iter, i->k, &u64s);
if (ret) {
*stopped_at = i;
return ret;
@@ -483,9 +453,9 @@ bch2_trans_commit_write_locked(struct btree_trans *trans,
}
trans_for_each_update(trans, i)
- if (likely(!(trans->flags & BTREE_INSERT_NOMARK)) &&
- update_has_nontrans_triggers(i))
- bch2_mark_update(trans, i, fs_usage, mark_flags);
+ if (iter_has_nontrans_triggers(i->iter))
+ bch2_mark_update(trans, i->iter, i->k,
+ fs_usage, i->trigger_flags);
if (marking)
bch2_trans_fs_usage_apply(trans, fs_usage);
@@ -494,7 +464,7 @@ bch2_trans_commit_write_locked(struct btree_trans *trans,
bch2_trans_mark_gc(trans);
trans_for_each_update(trans, i)
- do_btree_insert_one(trans, i);
+ do_btree_insert_one(trans, i->iter, i->k);
err:
if (marking) {
bch2_fs_usage_scratch_put(c, fs_usage);
@@ -512,44 +482,17 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans,
{
struct btree_insert_entry *i;
struct btree_iter *iter;
- unsigned idx, u64s, journal_preres_u64s = 0;
int ret;
- /*
- * note: running triggers will append more updates to the list of
- * updates as we're walking it:
- */
- trans_for_each_update(trans, i) {
- /* we know trans->nounlock won't be set here: */
- if (unlikely(!(i->iter->locks_want < 1
- ? __bch2_btree_iter_upgrade(i->iter, 1)
- : i->iter->uptodate <= BTREE_ITER_NEED_PEEK))) {
- trace_trans_restart_upgrade(trans->ip);
- return -EINTR;
- }
-
- if (likely(!(trans->flags & BTREE_INSERT_NOMARK)) &&
- update_has_trans_triggers(i)) {
- ret = bch2_trans_mark_update(trans, i->iter, i->k);
- if (unlikely(ret)) {
- if (ret == -EINTR)
- trace_trans_restart_mark(trans->ip);
- return ret;
- }
- }
-
- u64s = jset_u64s(i->k->k.u64s);
- if (0)
- journal_preres_u64s += u64s;
- trans->journal_u64s += u64s;
- }
+ trans_for_each_update(trans, i)
+ BUG_ON(!btree_node_intent_locked(i->iter, 0));
ret = bch2_journal_preres_get(&trans->c->journal,
- &trans->journal_preres, journal_preres_u64s,
+ &trans->journal_preres, trans->journal_preres_u64s,
JOURNAL_RES_GET_NONBLOCK);
if (unlikely(ret == -EAGAIN))
ret = bch2_trans_journal_preres_get_cold(trans,
- journal_preres_u64s);
+ trans->journal_preres_u64s);
if (unlikely(ret))
return ret;
@@ -570,24 +513,18 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans,
if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG))
trans_for_each_update(trans, i)
- btree_insert_entry_checks(trans, i);
+ btree_insert_entry_checks(trans, i->iter, i->k);
bch2_btree_trans_verify_locks(trans);
- /*
- * No more updates can be added - sort updates so we can take write
- * locks in the correct order:
- */
- btree_trans_sort_updates(trans);
-
- trans_for_each_update_sorted(trans, i, idx)
- if (!same_leaf_as_prev(trans, idx))
+ trans_for_each_update(trans, i)
+ if (!same_leaf_as_prev(trans, i))
bch2_btree_node_lock_for_insert(trans->c,
i->iter->l[0].b, i->iter);
ret = bch2_trans_commit_write_locked(trans, stopped_at);
- trans_for_each_update_sorted(trans, i, idx)
- if (!same_leaf_as_prev(trans, idx))
+ trans_for_each_update(trans, i)
+ if (!same_leaf_as_prev(trans, i))
bch2_btree_node_unlock_write_inlined(i->iter->l[0].b,
i->iter);
@@ -603,8 +540,8 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans,
if (trans->flags & BTREE_INSERT_NOUNLOCK)
trans->nounlock = true;
- trans_for_each_update_sorted(trans, i, idx)
- if (!same_leaf_as_prev(trans, idx))
+ trans_for_each_update(trans, i)
+ if (!same_leaf_as_prev(trans, i))
bch2_foreground_maybe_merge(trans->c, i->iter,
0, trans->flags);
@@ -636,8 +573,8 @@ int bch2_trans_commit_error(struct btree_trans *trans,
/*
* if the split succeeded without dropping locks the insert will
- * still be atomic (in the BTREE_INSERT_ATOMIC sense, what the
- * caller peeked() and is overwriting won't have changed)
+ * still be atomic (what the caller peeked() and is overwriting
+ * won't have changed)
*/
#if 0
/*
@@ -708,13 +645,6 @@ int bch2_trans_commit_error(struct btree_trans *trans,
return ret2;
}
- /*
- * BTREE_ITER_ATOMIC means we have to return -EINTR if we
- * dropped locks:
- */
- if (!(flags & BTREE_INSERT_ATOMIC))
- return 0;
-
trace_trans_restart_atomic(trans->ip);
}
@@ -744,79 +674,197 @@ int __bch2_trans_commit(struct btree_trans *trans)
{
struct btree_insert_entry *i = NULL;
struct btree_iter *iter;
- unsigned orig_nr_updates = trans->nr_updates;
- unsigned orig_mem_top = trans->mem_top;
+ bool trans_trigger_run;
+ unsigned u64s;
int ret = 0;
+ BUG_ON(trans->need_reset);
+
if (!trans->nr_updates)
goto out_noupdates;
- /* for the sake of sanity: */
- EBUG_ON(trans->nr_updates > 1 && !(trans->flags & BTREE_INSERT_ATOMIC));
-
if (trans->flags & BTREE_INSERT_GC_LOCK_HELD)
lockdep_assert_held(&trans->c->gc_lock);
memset(&trans->journal_preres, 0, sizeof(trans->journal_preres));
+ trans->journal_u64s = 0;
+ trans->journal_preres_u64s = 0;
+
if (!(trans->flags & BTREE_INSERT_NOCHECK_RW) &&
unlikely(!percpu_ref_tryget(&trans->c->writes))) {
ret = bch2_trans_commit_get_rw_cold(trans);
if (ret)
return ret;
}
+
+ /*
+ * Running triggers will append more updates to the list of updates as
+ * we're walking it:
+ */
+ do {
+ trans_trigger_run = false;
+
+ trans_for_each_update(trans, i) {
+ if (unlikely(i->iter->uptodate > BTREE_ITER_NEED_PEEK)) {
+ trace_trans_restart_traverse(trans->ip);
+ ret = -EINTR;
+ goto out;
+ }
+
+ /*
+ * We're not using bch2_btree_iter_upgrade here because
+ * we know trans->nounlock can't be set:
+ */
+ if (unlikely(i->iter->locks_want < 1 &&
+ !__bch2_btree_iter_upgrade(i->iter, 1))) {
+ trace_trans_restart_upgrade(trans->ip);
+ ret = -EINTR;
+ goto out;
+ }
+
+ if (iter_has_trans_triggers(i->iter) &&
+ !i->trans_triggers_run) {
+ i->trans_triggers_run = true;
+ trans_trigger_run = true;
+
+ ret = bch2_trans_mark_update(trans, i->iter, i->k,
+ i->trigger_flags);
+ if (unlikely(ret)) {
+ if (ret == -EINTR)
+ trace_trans_restart_mark(trans->ip);
+ goto out;
+ }
+ }
+ }
+ } while (trans_trigger_run);
+
+ trans_for_each_update(trans, i) {
+ u64s = jset_u64s(i->k->k.u64s);
+ if (0)
+ trans->journal_preres_u64s += u64s;
+ trans->journal_u64s += u64s;
+ }
retry:
memset(&trans->journal_res, 0, sizeof(trans->journal_res));
- trans->journal_u64s = 0;
ret = do_bch2_trans_commit(trans, &i);
- if (trans->fs_usage_deltas) {
- trans->fs_usage_deltas->used = 0;
- memset(&trans->fs_usage_deltas->memset_start, 0,
- (void *) &trans->fs_usage_deltas->memset_end -
- (void *) &trans->fs_usage_deltas->memset_start);
- }
-
/* make sure we didn't drop or screw up locks: */
bch2_btree_trans_verify_locks(trans);
if (ret)
goto err;
+
+ trans_for_each_iter(trans, iter)
+ if ((trans->iters_live & (1ULL << iter->idx)) &&
+ (iter->flags & BTREE_ITER_SET_POS_AFTER_COMMIT)) {
+ if (trans->flags & BTREE_INSERT_NOUNLOCK)
+ bch2_btree_iter_set_pos_same_leaf(iter, iter->pos_after_commit);
+ else
+ bch2_btree_iter_set_pos(iter, iter->pos_after_commit);
+ }
out:
bch2_journal_preres_put(&trans->c->journal, &trans->journal_preres);
if (likely(!(trans->flags & BTREE_INSERT_NOCHECK_RW)))
percpu_ref_put(&trans->c->writes);
out_noupdates:
- EBUG_ON(!(trans->flags & BTREE_INSERT_ATOMIC) && ret == -EINTR);
-
- trans_for_each_iter_all(trans, iter)
- iter->flags &= ~BTREE_ITER_KEEP_UNTIL_COMMIT;
-
- if (!ret) {
- bch2_trans_unlink_iters(trans);
- trans->iters_touched = 0;
- }
- trans->nr_updates = 0;
- trans->mem_top = 0;
+ bch2_trans_reset(trans, TRANS_RESET_MEM|TRANS_RESET_NOTRAVERSE);
return ret;
err:
ret = bch2_trans_commit_error(trans, i, ret);
-
- /* can't loop if it was passed in and we changed it: */
- if (unlikely(trans->flags & BTREE_INSERT_NO_CLEAR_REPLICAS) && !ret)
- ret = -EINTR;
if (ret)
goto out;
- /* free updates and memory used by triggers, they'll be reexecuted: */
- trans->nr_updates = orig_nr_updates;
- trans->mem_top = orig_mem_top;
goto retry;
}
+int bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter,
+ struct bkey_i *k, enum btree_trigger_flags flags)
+{
+ struct btree_insert_entry *i, n = (struct btree_insert_entry) {
+ .trigger_flags = flags, .iter = iter, .k = k
+ };
+
+ EBUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&k->k)));
+
+ iter->flags |= BTREE_ITER_KEEP_UNTIL_COMMIT;
+
+ if (iter->flags & BTREE_ITER_IS_EXTENTS) {
+ iter->pos_after_commit = k->k.p;
+ iter->flags |= BTREE_ITER_SET_POS_AFTER_COMMIT;
+ }
+
+ /*
+ * Pending updates are kept sorted: first, find position of new update:
+ */
+ trans_for_each_update(trans, i)
+ if (btree_iter_cmp(iter, i->iter) <= 0)
+ break;
+
+ /*
+ * Now delete/trim any updates the new update overwrites:
+ */
+ if (i > trans->updates &&
+ i[-1].iter->btree_id == iter->btree_id &&
+ bkey_cmp(iter->pos, i[-1].k->k.p) < 0)
+ bch2_cut_back(n.iter->pos, i[-1].k);
+
+ while (i < trans->updates + trans->nr_updates &&
+ iter->btree_id == i->iter->btree_id &&
+ bkey_cmp(n.k->k.p, i->k->k.p) >= 0)
+ array_remove_item(trans->updates, trans->nr_updates,
+ i - trans->updates);
+
+ if (i < trans->updates + trans->nr_updates &&
+ iter->btree_id == i->iter->btree_id &&
+ bkey_cmp(n.k->k.p, i->iter->pos) > 0) {
+ /*
+ * When we have an extent that overwrites the start of another
+ * update, trimming that extent means the other update's iterator
+ * position must change as well, since an extent iterator's
+ * position has to match the extent's start pos - but we don't
+ * want to move that iterator if other code is still using it,
+ * so we may need to clone it:
+ */
+ if (trans->iters_live & (1ULL << i->iter->idx)) {
+ i->iter = bch2_trans_copy_iter(trans, i->iter);
+ if (IS_ERR(i->iter)) {
+ trans->need_reset = true;
+ return PTR_ERR(i->iter);
+ }
+
+ i->iter->flags |= BTREE_ITER_KEEP_UNTIL_COMMIT;
+ bch2_trans_iter_put(trans, i->iter);
+ }
+
+ bch2_cut_front(n.k->k.p, i->k);
+ bch2_btree_iter_set_pos(i->iter, n.k->k.p);
+ }
+
+ EBUG_ON(trans->nr_updates >= trans->nr_iters);
+
+ array_insert_item(trans->updates, trans->nr_updates,
+ i - trans->updates, n);
+ return 0;
+}
+
+static int __bch2_btree_insert(struct btree_trans *trans,
+ enum btree_id id, struct bkey_i *k)
+{
+ struct btree_iter *iter;
+
+ iter = bch2_trans_get_iter(trans, id, bkey_start_pos(&k->k),
+ BTREE_ITER_INTENT);
+ if (IS_ERR(iter))
+ return PTR_ERR(iter);
+
+ bch2_trans_update(trans, iter, k, 0);
+ return 0;
+}
+
/**
* bch2_btree_insert - insert keys into the extent btree
* @c: pointer to struct bch_fs
@@ -825,29 +873,12 @@ err:
* @hook: insert callback
*/
int bch2_btree_insert(struct bch_fs *c, enum btree_id id,
- struct bkey_i *k,
- struct disk_reservation *disk_res,
- u64 *journal_seq, int flags)
+ struct bkey_i *k,
+ struct disk_reservation *disk_res,
+ u64 *journal_seq, int flags)
{
- struct btree_trans trans;
- struct btree_iter *iter;
- int ret;
-
- bch2_trans_init(&trans, c, 0, 0);
-retry:
- bch2_trans_begin(&trans);
-
- iter = bch2_trans_get_iter(&trans, id, bkey_start_pos(&k->k),
- BTREE_ITER_INTENT);
-
- bch2_trans_update(&trans, iter, k);
-
- ret = bch2_trans_commit(&trans, disk_res, journal_seq, flags);
- if (ret == -EINTR)
- goto retry;
- bch2_trans_exit(&trans);
-
- return ret;
+ return bch2_trans_do(c, disk_res, journal_seq, flags,
+ __bch2_btree_insert(&trans, id, k));
}
int bch2_btree_delete_at_range(struct btree_trans *trans,
@@ -863,6 +894,8 @@ retry:
bkey_cmp(iter->pos, end) < 0) {
struct bkey_i delete;
+ bch2_trans_reset(trans, TRANS_RESET_MEM);
+
bkey_init(&delete.k);
/*
@@ -890,9 +923,8 @@ retry:
break;
}
- bch2_trans_update(trans, iter, &delete);
+ bch2_trans_update(trans, iter, &delete, 0);
ret = bch2_trans_commit(trans, NULL, journal_seq,
- BTREE_INSERT_ATOMIC|
BTREE_INSERT_NOFAIL);
if (ret)
break;
@@ -917,7 +949,7 @@ int bch2_btree_delete_at(struct btree_trans *trans,
bkey_init(&k.k);
k.k.p = iter->pos;
- bch2_trans_update(trans, iter, &k);
+ bch2_trans_update(trans, iter, &k, 0);
return bch2_trans_commit(trans, NULL, NULL,
BTREE_INSERT_NOFAIL|
BTREE_INSERT_USE_RESERVE|flags);
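
The trim/remove logic in bch2_trans_update() keeps pending updates sorted and non-overlapping; a worked example with hypothetical extents in one inode, written as half-open [start,end) intervals:

/*
 * Pending updates (sorted):   [0,10)   [10,20)   [20,30)
 * New update:                      [5,25)
 *
 * - [0,10):  ends after the new update starts ->
 *            bch2_cut_back(5, ...) trims it to [0,5)
 * - [10,20): entirely covered (its end <= 25) -> removed from the list
 * - [20,30): starts before the new update ends ->
 *            bch2_cut_front(25, ...) trims it to [25,30), and its
 *            iterator is repositioned to 25 (cloned first if it's
 *            still live elsewhere)
 * - the new update is then inserted in sorted position.
 */
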
diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c
index 8d223aa2bee5..731b93255876 100644
--- a/fs/bcachefs/buckets.c
+++ b/fs/bcachefs/buckets.c
@@ -161,7 +161,7 @@ struct bch_fs_usage *bch2_fs_usage_scratch_get(struct bch_fs *c)
struct bch_fs_usage *ret;
unsigned bytes = fs_usage_u64s(c) * sizeof(u64);
- ret = kzalloc(bytes, GFP_NOWAIT);
+ ret = kzalloc(bytes, GFP_NOWAIT|__GFP_NOWARN);
if (ret)
return ret;
@@ -628,7 +628,7 @@ unwind:
percpu_rwsem_assert_held(&c->mark_lock); \
\
for (gc = 0; gc < 2 && !ret; gc++) \
- if (!gc == !(flags & BCH_BUCKET_MARK_GC) || \
+ if (!gc == !(flags & BTREE_TRIGGER_GC) || \
(gc && gc_visited(c, pos))) \
ret = fn(c, __VA_ARGS__, gc); \
ret; \
@@ -710,7 +710,7 @@ static int bch2_mark_alloc(struct bch_fs *c, struct bkey_s_c k,
struct bch_fs_usage *fs_usage,
u64 journal_seq, unsigned flags)
{
- bool gc = flags & BCH_BUCKET_MARK_GC;
+ bool gc = flags & BTREE_TRIGGER_GC;
struct bkey_alloc_unpacked u;
struct bch_dev *ca;
struct bucket *g;
@@ -719,8 +719,8 @@ static int bch2_mark_alloc(struct bch_fs *c, struct bkey_s_c k,
/*
* alloc btree is read in by bch2_alloc_read, not gc:
*/
- if ((flags & BCH_BUCKET_MARK_GC) &&
- !(flags & BCH_BUCKET_MARK_BUCKET_INVALIDATE))
+ if ((flags & BTREE_TRIGGER_GC) &&
+ !(flags & BTREE_TRIGGER_BUCKET_INVALIDATE))
return 0;
ca = bch_dev_bkey_exists(c, k.k->p.inode);
@@ -743,7 +743,7 @@ static int bch2_mark_alloc(struct bch_fs *c, struct bkey_s_c k,
}
}));
- if (!(flags & BCH_BUCKET_MARK_ALLOC_READ))
+ if (!(flags & BTREE_TRIGGER_ALLOC_READ))
bch2_dev_usage_update(c, ca, fs_usage, old, m, gc);
g->io_time[READ] = u.read_time;
@@ -756,7 +756,7 @@ static int bch2_mark_alloc(struct bch_fs *c, struct bkey_s_c k,
* not:
*/
- if ((flags & BCH_BUCKET_MARK_BUCKET_INVALIDATE) &&
+ if ((flags & BTREE_TRIGGER_BUCKET_INVALIDATE) &&
old.cached_sectors) {
update_cached_sectors(c, fs_usage, ca->dev_idx,
-old.cached_sectors);
@@ -842,13 +842,13 @@ static s64 __ptr_disk_sectors_delta(unsigned old_size,
{
BUG_ON(!n || !d);
- if (flags & BCH_BUCKET_MARK_OVERWRITE_SPLIT) {
+ if (flags & BTREE_TRIGGER_OVERWRITE_SPLIT) {
BUG_ON(offset + -delta > old_size);
return -disk_sectors_scaled(n, d, old_size) +
disk_sectors_scaled(n, d, offset) +
disk_sectors_scaled(n, d, old_size - offset + delta);
- } else if (flags & BCH_BUCKET_MARK_OVERWRITE) {
+ } else if (flags & BTREE_TRIGGER_OVERWRITE) {
BUG_ON(offset + -delta > old_size);
return -disk_sectors_scaled(n, d, old_size) +
@@ -874,8 +874,8 @@ static void bucket_set_stripe(struct bch_fs *c,
u64 journal_seq,
unsigned flags)
{
- bool enabled = !(flags & BCH_BUCKET_MARK_OVERWRITE);
- bool gc = flags & BCH_BUCKET_MARK_GC;
+ bool enabled = !(flags & BTREE_TRIGGER_OVERWRITE);
+ bool gc = flags & BTREE_TRIGGER_GC;
unsigned i;
for (i = 0; i < v->nr_blocks; i++) {
@@ -922,7 +922,7 @@ static bool bch2_mark_pointer(struct bch_fs *c,
struct bch_fs_usage *fs_usage,
u64 journal_seq, unsigned flags)
{
- bool gc = flags & BCH_BUCKET_MARK_GC;
+ bool gc = flags & BTREE_TRIGGER_GC;
struct bucket_mark old, new;
struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev);
struct bucket *g = PTR_BUCKET(ca, &p.ptr, gc);
@@ -970,7 +970,7 @@ static bool bch2_mark_pointer(struct bch_fs *c,
new.data_type = data_type;
}
- if (flags & BCH_BUCKET_MARK_NOATOMIC) {
+ if (flags & BTREE_TRIGGER_NOATOMIC) {
g->_mark = new;
break;
}
@@ -1008,7 +1008,7 @@ static int bch2_mark_stripe_ptr(struct bch_fs *c,
unsigned *nr_data,
unsigned *nr_parity)
{
- bool gc = flags & BCH_BUCKET_MARK_GC;
+ bool gc = flags & BTREE_TRIGGER_GC;
struct stripe *m;
unsigned old, new;
int blocks_nonempty_delta;
@@ -1121,7 +1121,7 @@ static int bch2_mark_stripe(struct bch_fs *c, struct bkey_s_c k,
struct bch_fs_usage *fs_usage,
u64 journal_seq, unsigned flags)
{
- bool gc = flags & BCH_BUCKET_MARK_GC;
+ bool gc = flags & BTREE_TRIGGER_GC;
struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k);
size_t idx = s.k->p.offset;
struct stripe *m = genradix_ptr(&c->stripes[gc], idx);
@@ -1129,14 +1129,14 @@ static int bch2_mark_stripe(struct bch_fs *c, struct bkey_s_c k,
spin_lock(&c->ec_stripes_heap_lock);
- if (!m || ((flags & BCH_BUCKET_MARK_OVERWRITE) && !m->alive)) {
+ if (!m || ((flags & BTREE_TRIGGER_OVERWRITE) && !m->alive)) {
spin_unlock(&c->ec_stripes_heap_lock);
bch_err_ratelimited(c, "error marking nonexistent stripe %zu",
idx);
return -1;
}
- if (!(flags & BCH_BUCKET_MARK_OVERWRITE)) {
+ if (!(flags & BTREE_TRIGGER_OVERWRITE)) {
m->sectors = le16_to_cpu(s.v->sectors);
m->algorithm = s.v->algorithm;
m->nr_blocks = s.v->nr_blocks;
@@ -1152,7 +1152,7 @@ static int bch2_mark_stripe(struct bch_fs *c, struct bkey_s_c k,
#endif
/* gc recalculates these fields: */
- if (!(flags & BCH_BUCKET_MARK_GC)) {
+ if (!(flags & BTREE_TRIGGER_GC)) {
for (i = 0; i < s.v->nr_blocks; i++) {
m->block_sectors[i] =
stripe_blockcount_get(s.v, i);
@@ -1185,16 +1185,16 @@ int bch2_mark_key_locked(struct bch_fs *c,
preempt_disable();
- if (!fs_usage || (flags & BCH_BUCKET_MARK_GC))
+ if (!fs_usage || (flags & BTREE_TRIGGER_GC))
fs_usage = fs_usage_ptr(c, journal_seq,
- flags & BCH_BUCKET_MARK_GC);
+ flags & BTREE_TRIGGER_GC);
switch (k.k->type) {
case KEY_TYPE_alloc:
ret = bch2_mark_alloc(c, k, fs_usage, journal_seq, flags);
break;
case KEY_TYPE_btree_ptr:
- sectors = !(flags & BCH_BUCKET_MARK_OVERWRITE)
+ sectors = !(flags & BTREE_TRIGGER_OVERWRITE)
? c->opts.btree_node_size
: -c->opts.btree_node_size;
@@ -1210,7 +1210,7 @@ int bch2_mark_key_locked(struct bch_fs *c,
ret = bch2_mark_stripe(c, k, fs_usage, journal_seq, flags);
break;
case KEY_TYPE_inode:
- if (!(flags & BCH_BUCKET_MARK_OVERWRITE))
+ if (!(flags & BTREE_TRIGGER_OVERWRITE))
fs_usage->nr_inodes++;
else
fs_usage->nr_inodes--;
@@ -1260,7 +1260,7 @@ inline int bch2_mark_overwrite(struct btree_trans *trans,
unsigned offset = 0;
s64 sectors = 0;
- flags |= BCH_BUCKET_MARK_OVERWRITE;
+ flags |= BTREE_TRIGGER_OVERWRITE;
if (btree_node_is_extents(b)
? bkey_cmp(new->k.p, bkey_start_pos(old.k)) <= 0
@@ -1288,7 +1288,7 @@ inline int bch2_mark_overwrite(struct btree_trans *trans,
offset = bkey_start_offset(&new->k) -
bkey_start_offset(old.k);
sectors = -((s64) new->k.size);
- flags |= BCH_BUCKET_MARK_OVERWRITE_SPLIT;
+ flags |= BTREE_TRIGGER_OVERWRITE_SPLIT;
break;
}
@@ -1300,26 +1300,29 @@ inline int bch2_mark_overwrite(struct btree_trans *trans,
}
int bch2_mark_update(struct btree_trans *trans,
- struct btree_insert_entry *insert,
+ struct btree_iter *iter,
+ struct bkey_i *insert,
struct bch_fs_usage *fs_usage,
unsigned flags)
{
struct bch_fs *c = trans->c;
- struct btree_iter *iter = insert->iter;
struct btree *b = iter->l[0].b;
struct btree_node_iter node_iter = iter->l[0].iter;
struct bkey_packed *_k;
int ret = 0;
+ if (unlikely(flags & BTREE_TRIGGER_NORUN))
+ return 0;
+
if (!btree_node_type_needs_gc(iter->btree_id))
return 0;
- bch2_mark_key_locked(c, bkey_i_to_s_c(insert->k),
- 0, insert->k->k.size,
+ bch2_mark_key_locked(c, bkey_i_to_s_c(insert),
+ 0, insert->k.size,
fs_usage, trans->journal_res.seq,
- BCH_BUCKET_MARK_INSERT|flags);
+ BTREE_TRIGGER_INSERT|flags);
- if (unlikely(trans->flags & BTREE_INSERT_NOMARK_OVERWRITES))
+ if (unlikely(flags & BTREE_TRIGGER_NOOVERWRITES))
return 0;
/*
@@ -1328,7 +1331,7 @@ int bch2_mark_update(struct btree_trans *trans,
*/
if ((iter->btree_id == BTREE_ID_ALLOC ||
iter->btree_id == BTREE_ID_EC) &&
- !bkey_deleted(&insert->k->k))
+ !bkey_deleted(&insert->k))
return 0;
while ((_k = bch2_btree_node_iter_peek_filter(&node_iter, b,
@@ -1336,7 +1339,7 @@ int bch2_mark_update(struct btree_trans *trans,
struct bkey unpacked;
struct bkey_s_c k = bkey_disassemble(b, _k, &unpacked);
- ret = bch2_mark_overwrite(trans, iter, k, insert->k,
+ ret = bch2_mark_overwrite(trans, iter, k, insert,
fs_usage, flags);
if (ret <= 0)
break;
@@ -1430,30 +1433,6 @@ static int trans_get_key(struct btree_trans *trans,
return ret;
}
-static void *trans_update_key(struct btree_trans *trans,
- struct btree_iter *iter,
- unsigned u64s)
-{
- struct btree_insert_entry *i;
- struct bkey_i *new_k;
-
- new_k = bch2_trans_kmalloc(trans, u64s * sizeof(u64));
- if (IS_ERR(new_k))
- return new_k;
-
- bkey_init(&new_k->k);
- new_k->k.p = iter->pos;
-
- trans_for_each_update(trans, i)
- if (i->iter == iter) {
- i->k = new_k;
- return new_k;
- }
-
- bch2_trans_update(trans, iter, new_k);
- return new_k;
-}
-
static int bch2_trans_mark_pointer(struct btree_trans *trans,
struct extent_ptr_decoded p,
s64 sectors, enum bch_data_type data_type)
@@ -1537,7 +1516,7 @@ static int bch2_trans_mark_pointer(struct btree_trans *trans,
u.data_type = u.dirty_sectors || u.cached_sectors
? data_type : 0;
- a = trans_update_key(trans, iter, BKEY_ALLOC_U64s_MAX);
+ a = bch2_trans_kmalloc(trans, BKEY_ALLOC_U64s_MAX * 8);
ret = PTR_ERR_OR_ZERO(a);
if (ret)
goto out;
@@ -1545,6 +1524,7 @@ static int bch2_trans_mark_pointer(struct btree_trans *trans,
bkey_alloc_init(&a->k_i);
a->k.p = iter->pos;
bch2_alloc_pack(a, u);
+ bch2_trans_update(trans, iter, &a->k_i, 0);
out:
bch2_trans_iter_put(trans, iter);
return ret;
@@ -1559,9 +1539,8 @@ static int bch2_trans_mark_stripe_ptr(struct btree_trans *trans,
{
struct bch_fs *c = trans->c;
struct btree_iter *iter;
- struct bkey_i *new_k;
struct bkey_s_c k;
- struct bkey_s_stripe s;
+ struct bkey_i_stripe *s;
int ret = 0;
ret = trans_get_key(trans, BTREE_ID_EC, POS(0, p.idx), &iter, &k);
@@ -1576,21 +1555,21 @@ static int bch2_trans_mark_stripe_ptr(struct btree_trans *trans,
goto out;
}
- new_k = trans_update_key(trans, iter, k.k->u64s);
- ret = PTR_ERR_OR_ZERO(new_k);
+ s = bch2_trans_kmalloc(trans, bkey_bytes(k.k));
+ ret = PTR_ERR_OR_ZERO(s);
if (ret)
goto out;
- bkey_reassemble(new_k, k);
- s = bkey_i_to_s_stripe(new_k);
+ bkey_reassemble(&s->k_i, k);
- stripe_blockcount_set(s.v, p.block,
- stripe_blockcount_get(s.v, p.block) +
+ stripe_blockcount_set(&s->v, p.block,
+ stripe_blockcount_get(&s->v, p.block) +
sectors);
- *nr_data = s.v->nr_blocks - s.v->nr_redundant;
- *nr_parity = s.v->nr_redundant;
- bch2_bkey_to_replicas(&r->e, s.s_c);
+ *nr_data = s->v.nr_blocks - s->v.nr_redundant;
+ *nr_parity = s->v.nr_redundant;
+ bch2_bkey_to_replicas(&r->e, bkey_i_to_s_c(&s->k_i));
+ bch2_trans_update(trans, iter, &s->k_i, 0);
out:
bch2_trans_iter_put(trans, iter);
return ret;
@@ -1671,7 +1650,6 @@ static int __bch2_trans_mark_reflink_p(struct btree_trans *trans,
{
struct bch_fs *c = trans->c;
struct btree_iter *iter;
- struct bkey_i *new_k;
struct bkey_s_c k;
struct bkey_i_reflink_v *r_v;
s64 ret;
@@ -1689,7 +1667,7 @@ static int __bch2_trans_mark_reflink_p(struct btree_trans *trans,
goto err;
}
- if ((flags & BCH_BUCKET_MARK_OVERWRITE) &&
+ if ((flags & BTREE_TRIGGER_OVERWRITE) &&
(bkey_start_offset(k.k) < idx ||
k.k->p.offset > idx + sectors))
goto out;
@@ -1697,21 +1675,22 @@ static int __bch2_trans_mark_reflink_p(struct btree_trans *trans,
bch2_btree_iter_set_pos(iter, bkey_start_pos(k.k));
BUG_ON(iter->uptodate > BTREE_ITER_NEED_PEEK);
- new_k = trans_update_key(trans, iter, k.k->u64s);
- ret = PTR_ERR_OR_ZERO(new_k);
+ r_v = bch2_trans_kmalloc(trans, bkey_bytes(k.k));
+ ret = PTR_ERR_OR_ZERO(r_v);
if (ret)
goto err;
- bkey_reassemble(new_k, k);
- r_v = bkey_i_to_reflink_v(new_k);
+ bkey_reassemble(&r_v->k_i, k);
le64_add_cpu(&r_v->v.refcount,
- !(flags & BCH_BUCKET_MARK_OVERWRITE) ? 1 : -1);
+ !(flags & BTREE_TRIGGER_OVERWRITE) ? 1 : -1);
if (!r_v->v.refcount) {
r_v->k.type = KEY_TYPE_deleted;
set_bkey_val_u64s(&r_v->k, 0);
}
+
+ bch2_trans_update(trans, iter, &r_v->k_i, 0);
out:
ret = k.k->p.offset - idx;
err:
@@ -1750,7 +1729,7 @@ int bch2_trans_mark_key(struct btree_trans *trans, struct bkey_s_c k,
switch (k.k->type) {
case KEY_TYPE_btree_ptr:
- sectors = !(flags & BCH_BUCKET_MARK_OVERWRITE)
+ sectors = !(flags & BTREE_TRIGGER_OVERWRITE)
? c->opts.btree_node_size
: -c->opts.btree_node_size;
@@ -1763,7 +1742,7 @@ int bch2_trans_mark_key(struct btree_trans *trans, struct bkey_s_c k,
case KEY_TYPE_inode:
d = replicas_deltas_realloc(trans, 0);
- if (!(flags & BCH_BUCKET_MARK_OVERWRITE))
+ if (!(flags & BTREE_TRIGGER_OVERWRITE))
d->nr_inodes++;
else
d->nr_inodes--;
@@ -1791,22 +1770,26 @@ int bch2_trans_mark_key(struct btree_trans *trans, struct bkey_s_c k,
int bch2_trans_mark_update(struct btree_trans *trans,
struct btree_iter *iter,
- struct bkey_i *insert)
+ struct bkey_i *insert,
+ unsigned flags)
{
struct btree *b = iter->l[0].b;
struct btree_node_iter node_iter = iter->l[0].iter;
struct bkey_packed *_k;
int ret;
+ if (unlikely(flags & BTREE_TRIGGER_NORUN))
+ return 0;
+
if (!btree_node_type_needs_gc(iter->btree_id))
return 0;
ret = bch2_trans_mark_key(trans, bkey_i_to_s_c(insert),
- 0, insert->k.size, BCH_BUCKET_MARK_INSERT);
+ 0, insert->k.size, BTREE_TRIGGER_INSERT);
if (ret)
return ret;
- if (unlikely(trans->flags & BTREE_INSERT_NOMARK_OVERWRITES))
+ if (unlikely(flags & BTREE_TRIGGER_NOOVERWRITES))
return 0;
while ((_k = bch2_btree_node_iter_peek_filter(&node_iter, b,
@@ -1815,7 +1798,7 @@ int bch2_trans_mark_update(struct btree_trans *trans,
struct bkey_s_c k;
unsigned offset = 0;
s64 sectors = 0;
- unsigned flags = BCH_BUCKET_MARK_OVERWRITE;
+ unsigned flags = BTREE_TRIGGER_OVERWRITE;
k = bkey_disassemble(b, _k, &unpacked);
@@ -1845,7 +1828,7 @@ int bch2_trans_mark_update(struct btree_trans *trans,
offset = bkey_start_offset(&insert->k) -
bkey_start_offset(k.k);
sectors = -((s64) insert->k.size);
- flags |= BCH_BUCKET_MARK_OVERWRITE_SPLIT;
+ flags |= BTREE_TRIGGER_OVERWRITE_SPLIT;
break;
}
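
The removal of trans_update_key() above establishes the pattern now used throughout the trigger code: allocate space with bch2_trans_kmalloc(), build the new version of the key, then register it explicitly with bch2_trans_update(). A minimal sketch of that sequence, condensed from the bch2_trans_mark_stripe_ptr() hunk above (the wrapper name update_stripe_counts() is hypothetical; the individual calls are the ones used in the real hunks):

static int update_stripe_counts(struct btree_trans *trans,
				struct btree_iter *iter,
				struct bkey_s_c k,
				unsigned block, s64 sectors)
{
	struct bkey_i_stripe *s;
	int ret;

	/* allocate room for the new version of the key: */
	s = bch2_trans_kmalloc(trans, bkey_bytes(k.k));
	ret = PTR_ERR_OR_ZERO(s);
	if (ret)
		return ret;

	/* copy the old key, then modify the copy: */
	bkey_reassemble(&s->k_i, k);
	stripe_blockcount_set(&s->v, block,
			      stripe_blockcount_get(&s->v, block) + sectors);

	/* queue the update; triggers run according to the flags argument: */
	bch2_trans_update(trans, iter, &s->k_i, 0);
	return 0;
}
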
diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h
index ad6f731b1cea..4717a1a6f568 100644
--- a/fs/bcachefs/buckets.h
+++ b/fs/bcachefs/buckets.h
@@ -258,14 +258,6 @@ void bch2_mark_metadata_bucket(struct bch_fs *, struct bch_dev *,
size_t, enum bch_data_type, unsigned,
struct gc_pos, unsigned);
-#define BCH_BUCKET_MARK_INSERT (1 << 0)
-#define BCH_BUCKET_MARK_OVERWRITE (1 << 1)
-#define BCH_BUCKET_MARK_OVERWRITE_SPLIT (1 << 2)
-#define BCH_BUCKET_MARK_BUCKET_INVALIDATE (1 << 3)
-#define BCH_BUCKET_MARK_GC (1 << 4)
-#define BCH_BUCKET_MARK_ALLOC_READ (1 << 5)
-#define BCH_BUCKET_MARK_NOATOMIC (1 << 6)
-
int bch2_mark_key_locked(struct bch_fs *, struct bkey_s_c, unsigned, s64,
struct bch_fs_usage *, u64, unsigned);
int bch2_mark_key(struct bch_fs *, struct bkey_s_c, unsigned, s64,
@@ -276,17 +268,16 @@ int bch2_fs_usage_apply(struct bch_fs *, struct bch_fs_usage *,
int bch2_mark_overwrite(struct btree_trans *, struct btree_iter *,
struct bkey_s_c, struct bkey_i *,
struct bch_fs_usage *, unsigned);
-int bch2_mark_update(struct btree_trans *, struct btree_insert_entry *,
- struct bch_fs_usage *, unsigned);
+int bch2_mark_update(struct btree_trans *, struct btree_iter *,
+ struct bkey_i *, struct bch_fs_usage *, unsigned);
int bch2_replicas_delta_list_apply(struct bch_fs *,
struct bch_fs_usage *,
struct replicas_delta_list *);
int bch2_trans_mark_key(struct btree_trans *, struct bkey_s_c,
unsigned, s64, unsigned);
-int bch2_trans_mark_update(struct btree_trans *,
- struct btree_iter *iter,
- struct bkey_i *insert);
+int bch2_trans_mark_update(struct btree_trans *, struct btree_iter *iter,
+ struct bkey_i *insert, unsigned);
void bch2_trans_fs_usage_apply(struct btree_trans *, struct bch_fs_usage *);
/* disk reservations: */
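
For reference, the BCH_BUCKET_MARK_* flags deleted above are renamed rather than dropped. A sketch of the replacement set, assuming a one-bit-per-flag layout; the exact bit positions are an assumption, and the real definitions live in btree_update.h elsewhere in this patch, not shown here:

/* Sketch only: bit positions assumed, names taken from the hunks above. */
#define BTREE_TRIGGER_NORUN		(1U << 0) /* don't run triggers at all */
#define BTREE_TRIGGER_NOOVERWRITES	(1U << 1) /* skip marking overwrites */
#define BTREE_TRIGGER_INSERT		(1U << 2)
#define BTREE_TRIGGER_OVERWRITE		(1U << 3)
#define BTREE_TRIGGER_OVERWRITE_SPLIT	(1U << 4)
#define BTREE_TRIGGER_GC		(1U << 5)
#define BTREE_TRIGGER_BUCKET_INVALIDATE	(1U << 6)
#define BTREE_TRIGGER_ALLOC_READ	(1U << 7)
#define BTREE_TRIGGER_NOATOMIC		(1U << 8)
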
diff --git a/fs/bcachefs/chardev.c b/fs/bcachefs/chardev.c
index 059eca01ccc4..5028d0dcc2d6 100644
--- a/fs/bcachefs/chardev.c
+++ b/fs/bcachefs/chardev.c
@@ -6,6 +6,7 @@
#include "buckets.h"
#include "chardev.h"
#include "move.h"
+#include "replicas.h"
#include "super.h"
#include "super-io.h"
@@ -371,89 +372,116 @@ err:
return ret;
}
-static long bch2_ioctl_usage(struct bch_fs *c,
- struct bch_ioctl_usage __user *user_arg)
+static long bch2_ioctl_fs_usage(struct bch_fs *c,
+ struct bch_ioctl_fs_usage __user *user_arg)
{
- struct bch_ioctl_usage arg;
- struct bch_dev *ca;
- unsigned i, j;
- int ret;
+ struct bch_ioctl_fs_usage *arg = NULL;
+ struct bch_replicas_usage *dst_e, *dst_end;
+ struct bch_fs_usage *src;
+ u32 replica_entries_bytes;
+ unsigned i;
+ int ret = 0;
if (!test_bit(BCH_FS_STARTED, &c->flags))
return -EINVAL;
- if (copy_from_user(&arg, user_arg, sizeof(arg)))
+ if (get_user(replica_entries_bytes, &user_arg->replica_entries_bytes))
return -EFAULT;
- for (i = 0; i < arg.nr_devices; i++) {
- struct bch_ioctl_dev_usage dst = { .alive = 0 };
+ arg = kzalloc(sizeof(*arg) + replica_entries_bytes, GFP_KERNEL);
+ if (!arg)
+ return -ENOMEM;
- ret = copy_to_user(&user_arg->devs[i], &dst, sizeof(dst));
- if (ret)
- return ret;
+ src = bch2_fs_usage_read(c);
+ if (!src) {
+ ret = -ENOMEM;
+ goto err;
}
- {
- struct bch_fs_usage *src;
- struct bch_ioctl_fs_usage dst = {
- .capacity = c->capacity,
- };
+ arg->capacity = c->capacity;
+ arg->used = bch2_fs_sectors_used(c, src);
+ arg->online_reserved = src->online_reserved;
- src = bch2_fs_usage_read(c);
- if (!src)
- return -ENOMEM;
+ for (i = 0; i < BCH_REPLICAS_MAX; i++)
+ arg->persistent_reserved[i] = src->persistent_reserved[i];
- dst.used = bch2_fs_sectors_used(c, src);
- dst.online_reserved = src->online_reserved;
+ dst_e = arg->replicas;
+ dst_end = (void *) arg->replicas + replica_entries_bytes;
- percpu_up_read(&c->mark_lock);
+ for (i = 0; i < c->replicas.nr; i++) {
+ struct bch_replicas_entry *src_e =
+ cpu_replicas_entry(&c->replicas, i);
- for (i = 0; i < BCH_REPLICAS_MAX; i++) {
- dst.persistent_reserved[i] =
- src->persistent_reserved[i];
-#if 0
- for (j = 0; j < BCH_DATA_NR; j++)
- dst.sectors[j][i] = src.replicas[i].data[j];
-#endif
+ if (replicas_usage_next(dst_e) > dst_end) {
+ ret = -ERANGE;
+ break;
}
- kfree(src);
+ dst_e->sectors = src->replicas[i];
+ dst_e->r = *src_e;
+
+ /* recheck after setting nr_devs: */
+ if (replicas_usage_next(dst_e) > dst_end) {
+ ret = -ERANGE;
+ break;
+ }
- ret = copy_to_user(&user_arg->fs, &dst, sizeof(dst));
- if (ret)
- return ret;
+ memcpy(dst_e->r.devs, src_e->devs, src_e->nr_devs);
+
+ dst_e = replicas_usage_next(dst_e);
}
- for_each_member_device(ca, c, i) {
- struct bch_dev_usage src = bch2_dev_usage_read(c, ca);
- struct bch_ioctl_dev_usage dst = {
- .alive = 1,
- .state = ca->mi.state,
- .bucket_size = ca->mi.bucket_size,
- .nr_buckets = ca->mi.nbuckets - ca->mi.first_bucket,
- };
-
- if (ca->dev_idx >= arg.nr_devices) {
- percpu_ref_put(&ca->ref);
- return -ERANGE;
- }
+ arg->replica_entries_bytes = (void *) dst_e - (void *) arg->replicas;
- if (percpu_ref_tryget(&ca->io_ref)) {
- dst.dev = huge_encode_dev(ca->disk_sb.bdev->bd_dev);
- percpu_ref_put(&ca->io_ref);
- }
+ percpu_up_read(&c->mark_lock);
+ kfree(src);
- for (j = 0; j < BCH_DATA_NR; j++) {
- dst.buckets[j] = src.buckets[j];
- dst.sectors[j] = src.sectors[j];
- }
+ if (!ret)
+ ret = copy_to_user(user_arg, arg,
+ sizeof(*arg) + arg->replica_entries_bytes);
+err:
+ kfree(arg);
+ return ret;
+}
+
+static long bch2_ioctl_dev_usage(struct bch_fs *c,
+ struct bch_ioctl_dev_usage __user *user_arg)
+{
+ struct bch_ioctl_dev_usage arg;
+ struct bch_dev_usage src;
+ struct bch_dev *ca;
+ unsigned i;
+
+ if (!test_bit(BCH_FS_STARTED, &c->flags))
+ return -EINVAL;
- ret = copy_to_user(&user_arg->devs[i], &dst, sizeof(dst));
- if (ret)
- return ret;
+ if (copy_from_user(&arg, user_arg, sizeof(arg)))
+ return -EFAULT;
+
+ if ((arg.flags & ~BCH_BY_INDEX) ||
+ arg.pad[0] ||
+ arg.pad[1] ||
+ arg.pad[2])
+ return -EINVAL;
+
+ ca = bch2_device_lookup(c, arg.dev, arg.flags);
+ if (IS_ERR(ca))
+ return PTR_ERR(ca);
+
+ src = bch2_dev_usage_read(c, ca);
+
+ arg.state = ca->mi.state;
+ arg.bucket_size = ca->mi.bucket_size;
+ arg.nr_buckets = ca->mi.nbuckets - ca->mi.first_bucket;
+
+ for (i = 0; i < BCH_DATA_NR; i++) {
+ arg.buckets[i] = src.buckets[i];
+ arg.sectors[i] = src.sectors[i];
}
- return 0;
+ percpu_ref_put(&ca->ref);
+
+ return copy_to_user(user_arg, &arg, sizeof(arg));
}
static long bch2_ioctl_read_super(struct bch_fs *c,
@@ -547,8 +575,10 @@ long bch2_fs_ioctl(struct bch_fs *c, unsigned cmd, void __user *arg)
switch (cmd) {
case BCH_IOCTL_QUERY_UUID:
return bch2_ioctl_query_uuid(c, arg);
- case BCH_IOCTL_USAGE:
- return bch2_ioctl_usage(c, arg);
+ case BCH_IOCTL_FS_USAGE:
+ return bch2_ioctl_fs_usage(c, arg);
+ case BCH_IOCTL_DEV_USAGE:
+ return bch2_ioctl_dev_usage(c, arg);
}
if (!capable(CAP_SYS_ADMIN))
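
From userspace, the BCH_IOCTL_USAGE split means a caller of the new BCH_IOCTL_FS_USAGE sizes the replicas buffer itself and grows it on -ERANGE, while per-device stats are fetched one device at a time via BCH_IOCTL_DEV_USAGE. A minimal sketch of a filesystem-usage caller, assuming the struct bch_ioctl_fs_usage layout from the (not shown) bcachefs_ioctl.h changes:

/* Userspace sketch; bcachefs_ioctl.h provides the struct and ioctl number. */
#include <errno.h>
#include <stdlib.h>
#include <sys/ioctl.h>

static struct bch_ioctl_fs_usage *fs_usage_read(int fs_fd)
{
	struct bch_ioctl_fs_usage *u;
	unsigned replica_bytes = 4096;

	for (;;) {
		u = calloc(1, sizeof(*u) + replica_bytes);
		if (!u)
			return NULL;

		/* tell the kernel how much room the flexible array has: */
		u->replica_entries_bytes = replica_bytes;

		if (!ioctl(fs_fd, BCH_IOCTL_FS_USAGE, u))
			return u;

		free(u);
		if (errno != ERANGE)
			return NULL;

		/* kernel had more replicas entries than we left room for: */
		replica_bytes *= 2;
	}
}
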
diff --git a/fs/bcachefs/checksum.c b/fs/bcachefs/checksum.c
index ad6993b7565a..a5c947e8adf3 100644
--- a/fs/bcachefs/checksum.c
+++ b/fs/bcachefs/checksum.c
@@ -10,7 +10,7 @@
#include <linux/random.h>
#include <linux/scatterlist.h>
#include <crypto/algapi.h>
-#include <crypto/chacha20.h>
+#include <crypto/chacha.h>
#include <crypto/hash.h>
#include <crypto/poly1305.h>
#include <keys/user-type.h>
@@ -67,21 +67,21 @@ static u64 bch2_checksum_update(unsigned type, u64 crc, const void *data, size_t
}
}
-static inline void do_encrypt_sg(struct crypto_skcipher *tfm,
+static inline void do_encrypt_sg(struct crypto_sync_skcipher *tfm,
struct nonce nonce,
struct scatterlist *sg, size_t len)
{
- SKCIPHER_REQUEST_ON_STACK(req, tfm);
+ SYNC_SKCIPHER_REQUEST_ON_STACK(req, tfm);
int ret;
- skcipher_request_set_tfm(req, tfm);
+ skcipher_request_set_sync_tfm(req, tfm);
skcipher_request_set_crypt(req, sg, sg, len, nonce.d);
ret = crypto_skcipher_encrypt(req);
BUG_ON(ret);
}
-static inline void do_encrypt(struct crypto_skcipher *tfm,
+static inline void do_encrypt(struct crypto_sync_skcipher *tfm,
struct nonce nonce,
void *buf, size_t len)
{
@@ -94,8 +94,8 @@ static inline void do_encrypt(struct crypto_skcipher *tfm,
int bch2_chacha_encrypt_key(struct bch_key *key, struct nonce nonce,
void *buf, size_t len)
{
- struct crypto_skcipher *chacha20 =
- crypto_alloc_skcipher("chacha20", 0, 0);
+ struct crypto_sync_skcipher *chacha20 =
+ crypto_alloc_sync_skcipher("chacha20", 0, 0);
int ret;
if (!chacha20) {
@@ -103,7 +103,8 @@ int bch2_chacha_encrypt_key(struct bch_key *key, struct nonce nonce,
return PTR_ERR(chacha20);
}
- ret = crypto_skcipher_setkey(chacha20, (void *) key, sizeof(*key));
+ ret = crypto_skcipher_setkey(&chacha20->base,
+ (void *) key, sizeof(*key));
if (ret) {
pr_err("crypto_skcipher_setkey() error: %i", ret);
goto err;
@@ -111,7 +112,7 @@ int bch2_chacha_encrypt_key(struct bch_key *key, struct nonce nonce,
do_encrypt(chacha20, nonce, buf, len);
err:
- crypto_free_skcipher(chacha20);
+ crypto_free_sync_skcipher(chacha20);
return ret;
}
@@ -198,7 +199,7 @@ static struct bch_csum __bch2_checksum_bio(struct bch_fs *c, unsigned type,
kunmap_atomic(p);
}
#else
- __bio_for_each_contig_segment(bv, bio, *iter, *iter)
+ __bio_for_each_bvec(bv, bio, *iter, *iter)
crc = bch2_checksum_update(type, crc,
page_address(bv.bv_page) + bv.bv_offset,
bv.bv_len);
@@ -223,7 +224,7 @@ static struct bch_csum __bch2_checksum_bio(struct bch_fs *c, unsigned type,
kunmap_atomic(p);
}
#else
- __bio_for_each_contig_segment(bv, bio, *iter, *iter)
+ __bio_for_each_bvec(bv, bio, *iter, *iter)
crypto_shash_update(desc,
page_address(bv.bv_page) + bv.bv_offset,
bv.bv_len);
@@ -461,7 +462,7 @@ err:
static int bch2_alloc_ciphers(struct bch_fs *c)
{
if (!c->chacha20)
- c->chacha20 = crypto_alloc_skcipher("chacha20", 0, 0);
+ c->chacha20 = crypto_alloc_sync_skcipher("chacha20", 0, 0);
if (IS_ERR(c->chacha20)) {
bch_err(c, "error requesting chacha20 module: %li",
PTR_ERR(c->chacha20));
@@ -544,7 +545,7 @@ int bch2_enable_encryption(struct bch_fs *c, bool keyed)
goto err;
}
- ret = crypto_skcipher_setkey(c->chacha20,
+ ret = crypto_skcipher_setkey(&c->chacha20->base,
(void *) &key.key, sizeof(key.key));
if (ret)
goto err;
@@ -572,7 +573,7 @@ void bch2_fs_encryption_exit(struct bch_fs *c)
if (!IS_ERR_OR_NULL(c->poly1305))
crypto_free_shash(c->poly1305);
if (!IS_ERR_OR_NULL(c->chacha20))
- crypto_free_skcipher(c->chacha20);
+ crypto_free_sync_skcipher(c->chacha20);
if (!IS_ERR_OR_NULL(c->sha256))
crypto_free_shash(c->sha256);
}
@@ -604,7 +605,7 @@ int bch2_fs_encryption_init(struct bch_fs *c)
if (ret)
goto out;
- ret = crypto_skcipher_setkey(c->chacha20,
+ ret = crypto_skcipher_setkey(&c->chacha20->base,
(void *) &key.key, sizeof(key.key));
if (ret)
goto out;
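
The checksum.c changes track the kernel's crypto API: chacha20 users whose requests never sleep move from crypto_skcipher to crypto_sync_skcipher, which makes the on-stack request legal. The pattern, condensed from the hunks above into one hedged sketch (key/nonce handling abbreviated; the file's existing includes are assumed):

static int chacha_crypt_buf(struct bch_key *key, struct nonce nonce,
			    void *buf, size_t len)
{
	struct crypto_sync_skcipher *tfm;
	struct scatterlist sg;
	int ret;

	tfm = crypto_alloc_sync_skcipher("chacha20", 0, 0);
	if (IS_ERR(tfm))
		return PTR_ERR(tfm);

	/* setkey still operates on the base skcipher inside the sync tfm: */
	ret = crypto_skcipher_setkey(&tfm->base, (void *) key, sizeof(*key));
	if (!ret) {
		SYNC_SKCIPHER_REQUEST_ON_STACK(req, tfm);

		sg_init_one(&sg, buf, len);
		skcipher_request_set_sync_tfm(req, tfm);
		skcipher_request_set_crypt(req, &sg, &sg, len, nonce.d);
		ret = crypto_skcipher_encrypt(req);
	}

	crypto_free_sync_skcipher(tfm);
	return ret;
}
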
diff --git a/fs/bcachefs/checksum.h b/fs/bcachefs/checksum.h
index 0b359aba2526..ca9e45906dc8 100644
--- a/fs/bcachefs/checksum.h
+++ b/fs/bcachefs/checksum.h
@@ -7,7 +7,7 @@
#include "super-io.h"
#include <linux/crc64.h>
-#include <crypto/chacha20.h>
+#include <crypto/chacha.h>
static inline bool bch2_checksum_mergeable(unsigned type)
{
@@ -108,8 +108,8 @@ static inline enum bch_csum_type bch2_meta_checksum_type(struct bch_fs *c)
}
static const unsigned bch2_compression_opt_to_type[] = {
-#define x(t) [BCH_COMPRESSION_OPT_##t] = BCH_COMPRESSION_##t,
- BCH_COMPRESSION_TYPES()
+#define x(t, n) [BCH_COMPRESSION_OPT_##t] = BCH_COMPRESSION_TYPE_##t,
+ BCH_COMPRESSION_OPTS()
#undef x
};
@@ -138,9 +138,9 @@ static inline bool bch2_crc_cmp(struct bch_csum l, struct bch_csum r)
/* for skipping ahead and encrypting/decrypting at an offset: */
static inline struct nonce nonce_add(struct nonce nonce, unsigned offset)
{
- EBUG_ON(offset & (CHACHA20_BLOCK_SIZE - 1));
+ EBUG_ON(offset & (CHACHA_BLOCK_SIZE - 1));
- le32_add_cpu(&nonce.d[0], offset / CHACHA20_BLOCK_SIZE);
+ le32_add_cpu(&nonce.d[0], offset / CHACHA_BLOCK_SIZE);
return nonce;
}
diff --git a/fs/bcachefs/clock.c b/fs/bcachefs/clock.c
index f18266330687..d9de0d1302e2 100644
--- a/fs/bcachefs/clock.c
+++ b/fs/bcachefs/clock.c
@@ -18,6 +18,14 @@ void bch2_io_timer_add(struct io_clock *clock, struct io_timer *timer)
size_t i;
spin_lock(&clock->timer_lock);
+
+ if (time_after_eq((unsigned long) atomic_long_read(&clock->now),
+ timer->expire)) {
+ spin_unlock(&clock->timer_lock);
+ timer->fn(timer);
+ return;
+ }
+
for (i = 0; i < clock->timers.used; i++)
if (clock->timers.data[i] == timer)
goto out;
@@ -135,26 +143,31 @@ static struct io_timer *get_expired_timer(struct io_clock *clock,
return ret;
}
-void __bch2_increment_clock(struct io_clock *clock)
+void __bch2_increment_clock(struct io_clock *clock, unsigned sectors)
{
struct io_timer *timer;
- unsigned long now;
- unsigned sectors;
+ unsigned long now = atomic_long_add_return(sectors, &clock->now);
- /* Buffer up one megabyte worth of IO in the percpu counter */
- preempt_disable();
+ while ((timer = get_expired_timer(clock, now)))
+ timer->fn(timer);
+}
- if (this_cpu_read(*clock->pcpu_buf) < IO_CLOCK_PCPU_SECTORS) {
- preempt_enable();
- return;
- }
+ssize_t bch2_io_timers_show(struct io_clock *clock, char *buf)
+{
+ struct printbuf out = _PBUF(buf, PAGE_SIZE);
+ unsigned long now;
+ unsigned i;
- sectors = this_cpu_xchg(*clock->pcpu_buf, 0);
- preempt_enable();
- now = atomic_long_add_return(sectors, &clock->now);
+ spin_lock(&clock->timer_lock);
+ now = atomic_long_read(&clock->now);
- while ((timer = get_expired_timer(clock, now)))
- timer->fn(timer);
+ for (i = 0; i < clock->timers.used; i++)
+ pr_buf(&out, "%pf:\t%li\n",
+ clock->timers.data[i]->fn,
+ clock->timers.data[i]->expire - now);
+ spin_unlock(&clock->timer_lock);
+
+ return out.pos - buf;
}
void bch2_io_clock_exit(struct io_clock *clock)
@@ -168,6 +181,8 @@ int bch2_io_clock_init(struct io_clock *clock)
atomic_long_set(&clock->now, 0);
spin_lock_init(&clock->timer_lock);
+ clock->max_slop = IO_CLOCK_PCPU_SECTORS * num_possible_cpus();
+
clock->pcpu_buf = alloc_percpu(*clock->pcpu_buf);
if (!clock->pcpu_buf)
return -ENOMEM;
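
Two behavioural changes land in clock.c: a timer whose expiry is already in the past now fires synchronously from bch2_io_timer_add(), and the per-cpu sector buffer is drained by the caller (see the clock.h hunk below) rather than inside __bch2_increment_clock(). A hedged sketch of a timer user under the new semantics; wake_fn() and struct my_wait are hypothetical, the io_timer API is the one shown above:

struct my_wait {
	struct io_timer		timer;
	struct task_struct	*task;
};

static void wake_fn(struct io_timer *timer)
{
	struct my_wait *w = container_of(timer, struct my_wait, timer);

	wake_up_process(w->task);
}

static void wait_for_io_clock(struct io_clock *clock, unsigned long until)
{
	struct my_wait w = {
		.timer.expire	= until,
		.timer.fn	= wake_fn,
		.task		= current,
	};

	set_current_state(TASK_UNINTERRUPTIBLE);
	/*
	 * If @until is already <= clock->now, the new early check in
	 * bch2_io_timer_add() runs wake_fn() immediately instead of
	 * queueing a timer that would never fire:
	 */
	bch2_io_timer_add(clock, &w.timer);
	schedule();
	__set_current_state(TASK_RUNNING);
	bch2_io_timer_del(clock, &w.timer);
}
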
diff --git a/fs/bcachefs/clock.h b/fs/bcachefs/clock.h
index bfbbca8a207b..da50afe206cc 100644
--- a/fs/bcachefs/clock.h
+++ b/fs/bcachefs/clock.h
@@ -7,7 +7,7 @@ void bch2_io_timer_del(struct io_clock *, struct io_timer *);
void bch2_kthread_io_clock_wait(struct io_clock *, unsigned long,
unsigned long);
-void __bch2_increment_clock(struct io_clock *);
+void __bch2_increment_clock(struct io_clock *, unsigned);
static inline void bch2_increment_clock(struct bch_fs *c, unsigned sectors,
int rw)
@@ -16,7 +16,7 @@ static inline void bch2_increment_clock(struct bch_fs *c, unsigned sectors,
if (unlikely(this_cpu_add_return(*clock->pcpu_buf, sectors) >=
IO_CLOCK_PCPU_SECTORS))
- __bch2_increment_clock(clock);
+ __bch2_increment_clock(clock, this_cpu_xchg(*clock->pcpu_buf, 0));
}
void bch2_io_clock_schedule_timeout(struct io_clock *, unsigned long);
@@ -30,6 +30,8 @@ void bch2_io_clock_schedule_timeout(struct io_clock *, unsigned long);
__ret; \
})
+ssize_t bch2_io_timers_show(struct io_clock *, char *);
+
void bch2_io_clock_exit(struct io_clock *);
int bch2_io_clock_init(struct io_clock *);
diff --git a/fs/bcachefs/clock_types.h b/fs/bcachefs/clock_types.h
index 2b5e499e12b4..92c740a47565 100644
--- a/fs/bcachefs/clock_types.h
+++ b/fs/bcachefs/clock_types.h
@@ -28,6 +28,7 @@ typedef HEAP(struct io_timer *) io_timer_heap;
struct io_clock {
atomic_long_t now;
u16 __percpu *pcpu_buf;
+ unsigned max_slop;
spinlock_t timer_lock;
io_timer_heap timers;
diff --git a/fs/bcachefs/compress.c b/fs/bcachefs/compress.c
index 24f565614cd9..bb557eda111b 100644
--- a/fs/bcachefs/compress.c
+++ b/fs/bcachefs/compress.c
@@ -66,7 +66,7 @@ static struct bbuf __bio_map_or_bounce(struct bch_fs *c, struct bio *bio,
BUG_ON(bvec_iter_sectors(start) > c->sb.encoded_extent_max);
#ifndef CONFIG_HIGHMEM
- __bio_for_each_contig_segment(bv, bio, iter, start) {
+ __bio_for_each_bvec(bv, bio, iter, start) {
if (bv.bv_len == start.bi_size)
return (struct bbuf) {
.b = page_address(bv.bv_page) + bv.bv_offset,
@@ -158,14 +158,14 @@ static int __bio_uncompress(struct bch_fs *c, struct bio *src,
src_data = bio_map_or_bounce(c, src, READ);
switch (crc.compression_type) {
- case BCH_COMPRESSION_LZ4_OLD:
- case BCH_COMPRESSION_LZ4:
+ case BCH_COMPRESSION_TYPE_lz4_old:
+ case BCH_COMPRESSION_TYPE_lz4:
ret = LZ4_decompress_safe_partial(src_data.b, dst_data,
src_len, dst_len, dst_len);
if (ret != dst_len)
goto err;
break;
- case BCH_COMPRESSION_GZIP: {
+ case BCH_COMPRESSION_TYPE_gzip: {
z_stream strm = {
.next_in = src_data.b,
.avail_in = src_len,
@@ -185,7 +185,7 @@ static int __bio_uncompress(struct bch_fs *c, struct bio *src,
goto err;
break;
}
- case BCH_COMPRESSION_ZSTD: {
+ case BCH_COMPRESSION_TYPE_zstd: {
ZSTD_DCtx *ctx;
size_t len;
@@ -290,10 +290,10 @@ static int attempt_compress(struct bch_fs *c,
void *workspace,
void *dst, size_t dst_len,
void *src, size_t src_len,
- unsigned compression_type)
+ enum bch_compression_type compression_type)
{
switch (compression_type) {
- case BCH_COMPRESSION_LZ4: {
+ case BCH_COMPRESSION_TYPE_lz4: {
int len = src_len;
int ret = LZ4_compress_destSize(
src, dst,
@@ -305,7 +305,7 @@ static int attempt_compress(struct bch_fs *c,
return ret;
}
- case BCH_COMPRESSION_GZIP: {
+ case BCH_COMPRESSION_TYPE_gzip: {
z_stream strm = {
.next_in = src,
.avail_in = src_len,
@@ -326,7 +326,7 @@ static int attempt_compress(struct bch_fs *c,
return strm.total_out;
}
- case BCH_COMPRESSION_ZSTD: {
+ case BCH_COMPRESSION_TYPE_zstd: {
ZSTD_CCtx *ctx = ZSTD_initCCtx(workspace,
ZSTD_CCtxWorkspaceBound(c->zstd_params.cParams));
@@ -348,14 +348,14 @@ static int attempt_compress(struct bch_fs *c,
static unsigned __bio_compress(struct bch_fs *c,
struct bio *dst, size_t *dst_len,
struct bio *src, size_t *src_len,
- unsigned compression_type)
+ enum bch_compression_type compression_type)
{
struct bbuf src_data = { NULL }, dst_data = { NULL };
void *workspace;
unsigned pad;
int ret = 0;
- BUG_ON(compression_type >= BCH_COMPRESSION_NR);
+ BUG_ON(compression_type >= BCH_COMPRESSION_TYPE_NR);
BUG_ON(!mempool_initialized(&c->compress_workspace[compression_type]));
/* If it's only one block, don't bother trying to compress: */
@@ -452,8 +452,8 @@ unsigned bch2_bio_compress(struct bch_fs *c,
/* Don't generate a bigger output than input: */
dst->bi_iter.bi_size = min(dst->bi_iter.bi_size, src->bi_iter.bi_size);
- if (compression_type == BCH_COMPRESSION_LZ4_OLD)
- compression_type = BCH_COMPRESSION_LZ4;
+ if (compression_type == BCH_COMPRESSION_TYPE_lz4_old)
+ compression_type = BCH_COMPRESSION_TYPE_lz4;
compression_type =
__bio_compress(c, dst, dst_len, src, src_len, compression_type);
@@ -465,15 +465,15 @@ unsigned bch2_bio_compress(struct bch_fs *c,
static int __bch2_fs_compress_init(struct bch_fs *, u64);
-#define BCH_FEATURE_NONE 0
+#define BCH_FEATURE_none 0
static const unsigned bch2_compression_opt_to_feature[] = {
-#define x(t) [BCH_COMPRESSION_OPT_##t] = BCH_FEATURE_##t,
- BCH_COMPRESSION_TYPES()
+#define x(t, n) [BCH_COMPRESSION_OPT_##t] = BCH_FEATURE_##t,
+ BCH_COMPRESSION_OPTS()
#undef x
};
-#undef BCH_FEATURE_NONE
+#undef BCH_FEATURE_none
static int __bch2_check_set_has_compressed_data(struct bch_fs *c, u64 f)
{
@@ -537,11 +537,11 @@ static int __bch2_fs_compress_init(struct bch_fs *c, u64 features)
size_t compress_workspace;
size_t decompress_workspace;
} compression_types[] = {
- { BCH_FEATURE_LZ4, BCH_COMPRESSION_LZ4, LZ4_MEM_COMPRESS, 0 },
- { BCH_FEATURE_GZIP, BCH_COMPRESSION_GZIP,
+ { BCH_FEATURE_lz4, BCH_COMPRESSION_TYPE_lz4, LZ4_MEM_COMPRESS, 0 },
+ { BCH_FEATURE_gzip, BCH_COMPRESSION_TYPE_gzip,
zlib_deflate_workspacesize(MAX_WBITS, DEF_MEM_LEVEL),
zlib_inflate_workspacesize(), },
- { BCH_FEATURE_ZSTD, BCH_COMPRESSION_ZSTD,
+ { BCH_FEATURE_zstd, BCH_COMPRESSION_TYPE_zstd,
ZSTD_CCtxWorkspaceBound(params.cParams),
ZSTD_DCtxWorkspaceBound() },
}, *i;
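
The rename from BCH_COMPRESSION_TYPES() to BCH_COMPRESSION_OPTS() goes with a switch to a two-argument x-macro, so one list can generate both enum values and their on-disk numbers. A sketch of the shape the `#define x(t, n)` expansions above assume; the names are taken from the hunks, the numbering is an assumption, and the real lists live in bcachefs_format.h, not in this excerpt:

/* Sketch only: entry numbering assumed. */
#define BCH_COMPRESSION_OPTS()		\
	x(none,		0)		\
	x(lz4,		1)		\
	x(gzip,		2)		\
	x(zstd,		3)

enum bch_compression_opts {
#define x(t, n)	BCH_COMPRESSION_OPT_##t = n,
	BCH_COMPRESSION_OPTS()
#undef x
};
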
diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c
index 38017699c04a..623b6c3eda95 100644
--- a/fs/bcachefs/dirent.c
+++ b/fs/bcachefs/dirent.c
@@ -246,7 +246,7 @@ int bch2_dirent_rename(struct btree_trans *trans,
*/
new_dst->k.p = src_iter->pos;
bch2_trans_update(trans, src_iter,
- &new_dst->k_i);
+ &new_dst->k_i, 0);
return 0;
} else {
/* If we're overwriting, we can't insert new_dst
@@ -268,8 +268,8 @@ int bch2_dirent_rename(struct btree_trans *trans,
}
}
- bch2_trans_update(trans, src_iter, &new_src->k_i);
- bch2_trans_update(trans, dst_iter, &new_dst->k_i);
+ bch2_trans_update(trans, src_iter, &new_src->k_i, 0);
+ bch2_trans_update(trans, dst_iter, &new_dst->k_i, 0);
return 0;
}
@@ -281,18 +281,6 @@ int bch2_dirent_delete_at(struct btree_trans *trans,
hash_info, iter);
}
-int bch2_dirent_delete(struct bch_fs *c, u64 dir_inum,
- const struct bch_hash_info *hash_info,
- const struct qstr *name,
- u64 *journal_seq)
-{
- return bch2_trans_do(c, journal_seq,
- BTREE_INSERT_ATOMIC|
- BTREE_INSERT_NOFAIL,
- bch2_hash_delete(&trans, bch2_dirent_hash_desc, hash_info,
- dir_inum, name));
-}
-
struct btree_iter *
__bch2_dirent_lookup_trans(struct btree_trans *trans, u64 dir_inum,
const struct bch_hash_info *hash_info,
@@ -343,7 +331,9 @@ int bch2_empty_dir_trans(struct btree_trans *trans, u64 dir_inum)
break;
}
}
- bch2_trans_iter_put(trans, iter);
+
+ if (!IS_ERR(iter))
+ bch2_trans_iter_put(trans, iter);
return ret;
}
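
With BTREE_INSERT_ATOMIC being dropped throughout this series (see the ec.c hunks below), the bch2_dirent_delete() wrapper loses its reason to exist and callers open-code the transaction. A sketch of what a caller now does, assuming bch2_trans_do() keeps the (c, journal_seq, flags, expr) shape used by the removed wrapper:

/* Sketch of the open-coded replacement for the removed wrapper: */
static int dirent_delete(struct bch_fs *c, u64 dir_inum,
			 const struct bch_hash_info *hash_info,
			 const struct qstr *name, u64 *journal_seq)
{
	return bch2_trans_do(c, journal_seq,
			BTREE_INSERT_NOFAIL,
			bch2_hash_delete(&trans, bch2_dirent_hash_desc,
					 hash_info, dir_inum, name));
}
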
diff --git a/fs/bcachefs/dirent.h b/fs/bcachefs/dirent.h
index e6184dc796d3..34769371dd13 100644
--- a/fs/bcachefs/dirent.h
+++ b/fs/bcachefs/dirent.h
@@ -36,8 +36,6 @@ int bch2_dirent_create(struct btree_trans *, u64,
int bch2_dirent_delete_at(struct btree_trans *,
const struct bch_hash_info *,
struct btree_iter *);
-int bch2_dirent_delete(struct bch_fs *, u64, const struct bch_hash_info *,
- const struct qstr *, u64 *);
enum bch_rename_mode {
BCH_RENAME,
diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c
index 5287b5ee7d4a..a49d0745c720 100644
--- a/fs/bcachefs/ec.c
+++ b/fs/bcachefs/ec.c
@@ -736,10 +736,9 @@ found_slot:
stripe->k.p = iter->pos;
- bch2_trans_update(&trans, iter, &stripe->k_i);
+ bch2_trans_update(&trans, iter, &stripe->k_i, 0);
ret = bch2_trans_commit(&trans, NULL, NULL,
- BTREE_INSERT_ATOMIC|
BTREE_INSERT_NOFAIL);
err:
if (ret == -EINTR)
@@ -819,10 +818,9 @@ static int ec_stripe_update_ptrs(struct bch_fs *c,
extent_stripe_ptr_add(e, s, ec_ptr, idx);
- bch2_trans_update(&trans, iter, sk.k);
+ bch2_trans_update(&trans, iter, sk.k, 0);
ret = bch2_trans_commit(&trans, NULL, NULL,
- BTREE_INSERT_ATOMIC|
BTREE_INSERT_NOFAIL|
BTREE_INSERT_USE_RESERVE);
if (ret == -EINTR)
@@ -1232,7 +1230,7 @@ static int __bch2_stripe_write_key(struct btree_trans *trans,
spin_unlock(&c->ec_stripes_heap_lock);
- bch2_trans_update(trans, iter, &new_key->k_i);
+ bch2_trans_update(trans, iter, &new_key->k_i, 0);
return bch2_trans_commit(trans, NULL, NULL,
BTREE_INSERT_NOFAIL|flags);
@@ -1259,8 +1257,13 @@ int bch2_stripes_write(struct bch_fs *c, unsigned flags, bool *wrote)
if (!m->dirty)
continue;
- ret = __bch2_stripe_write_key(&trans, iter, m, giter.pos,
- new_key, flags);
+ do {
+ bch2_trans_reset(&trans, TRANS_RESET_MEM);
+
+ ret = __bch2_stripe_write_key(&trans, iter, m,
+ giter.pos, new_key, flags);
+ } while (ret == -EINTR);
+
if (ret)
break;
@@ -1313,8 +1316,8 @@ int bch2_stripes_read(struct bch_fs *c, struct journal_keys *journal_keys)
bch2_mark_key(c, btree ? btree_k : journal_k,
0, 0, NULL, 0,
- BCH_BUCKET_MARK_ALLOC_READ|
- BCH_BUCKET_MARK_NOATOMIC);
+ BTREE_TRIGGER_ALLOC_READ|
+ BTREE_TRIGGER_NOATOMIC);
if (btree)
btree_k = bch2_btree_iter_next(btree_iter);
diff --git a/fs/bcachefs/error.h b/fs/bcachefs/error.h
index 7dcb0f6552fc..de319794ccd1 100644
--- a/fs/bcachefs/error.h
+++ b/fs/bcachefs/error.h
@@ -17,26 +17,6 @@ struct work_struct;
/* Error messages: */
/*
- * Very fatal logic/inconsistency errors: these indicate that we've majorly
- * screwed up at runtime, i.e. it's not likely that it was just caused by the
- * data on disk being inconsistent. These BUG():
- *
- * XXX: audit and convert to inconsistent() checks
- */
-
-#define bch2_fs_bug(c, ...) \
-do { \
- bch_err(c, __VA_ARGS__); \
- BUG(); \
-} while (0)
-
-#define bch2_fs_bug_on(cond, c, ...) \
-do { \
- if (cond) \
- bch2_fs_bug(c, __VA_ARGS__); \
-} while (0)
-
-/*
* Inconsistency errors: The on disk data is inconsistent. If these occur during
* initial recovery, they don't indicate a bug in the running code - we walk all
* the metadata before modifying anything. If they occur at runtime, they
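
The bch2_fs_bug()/bch2_fs_bug_on() macros are deleted for good reason: an on-disk inconsistency should never BUG() the whole machine. The extents.c hunks below convert their users to the surviving inconsistent-error macros, which log and mark the filesystem inconsistent instead. A sketch of the _on() form, assuming it mirrors the shape of the deleted macro (the real one also feeds into the filesystem's error-handling policy):

#define bch2_fs_inconsistent_on(cond, c, ...)			\
({								\
	int _ret = !!(cond);					\
								\
	if (_ret)						\
		bch2_fs_inconsistent(c, __VA_ARGS__);		\
	_ret;							\
})
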
diff --git a/fs/bcachefs/extent_update.c b/fs/bcachefs/extent_update.c
index 742b4d78cb3a..846d77dc2530 100644
--- a/fs/bcachefs/extent_update.c
+++ b/fs/bcachefs/extent_update.c
@@ -166,54 +166,72 @@ int bch2_extent_is_atomic(struct bkey_i *k, struct btree_iter *iter)
enum btree_insert_ret
bch2_extent_can_insert(struct btree_trans *trans,
- struct btree_insert_entry *insert,
+ struct btree_iter *iter,
+ struct bkey_i *insert,
unsigned *u64s)
{
- struct btree_iter_level *l = &insert->iter->l[0];
+ struct btree_iter_level *l = &iter->l[0];
struct btree_node_iter node_iter = l->iter;
- enum bch_extent_overlap overlap;
struct bkey_packed *_k;
struct bkey unpacked;
- struct bkey_s_c k;
int sectors;
- /*
- * We avoid creating whiteouts whenever possible when deleting, but
- * those optimizations mean we may potentially insert two whiteouts
- * instead of one (when we overlap with the front of one extent and the
- * back of another):
- */
- if (bkey_whiteout(&insert->k->k))
- *u64s += BKEY_U64s;
-
- _k = bch2_btree_node_iter_peek_filter(&node_iter, l->b,
- KEY_TYPE_discard);
- if (!_k)
- return BTREE_INSERT_OK;
-
- k = bkey_disassemble(l->b, _k, &unpacked);
-
- overlap = bch2_extent_overlap(&insert->k->k, k.k);
-
- /* account for having to split existing extent: */
- if (overlap == BCH_EXTENT_OVERLAP_MIDDLE)
- *u64s += _k->u64s;
-
- if (overlap == BCH_EXTENT_OVERLAP_MIDDLE &&
- (sectors = bch2_bkey_sectors_compressed(k))) {
- int flags = trans->flags & BTREE_INSERT_NOFAIL
- ? BCH_DISK_RESERVATION_NOFAIL : 0;
-
- switch (bch2_disk_reservation_add(trans->c,
- trans->disk_res,
- sectors, flags)) {
- case 0:
+ while ((_k = bch2_btree_node_iter_peek_filter(&node_iter, l->b,
+ KEY_TYPE_discard))) {
+ struct bkey_s_c k = bkey_disassemble(l->b, _k, &unpacked);
+ enum bch_extent_overlap overlap =
+ bch2_extent_overlap(&insert->k, k.k);
+
+ if (bkey_cmp(bkey_start_pos(k.k), insert->k.p) >= 0)
break;
- case -ENOSPC:
- return BTREE_INSERT_ENOSPC;
- default:
- BUG();
+
+ overlap = bch2_extent_overlap(&insert->k, k.k);
+
+ /*
+ * If we're overwriting an existing extent, we may need to emit
+ * a whiteout - unless we're inserting a new extent at the same
+ * position:
+ */
+ if (k.k->needs_whiteout &&
+ (!bkey_whiteout(&insert->k) ||
+ bkey_cmp(k.k->p, insert->k.p)))
+ *u64s += BKEY_U64s;
+
+ /*
+ * If we're partially overwriting an existing extent which has
+ * been written out to disk, we'll need to emit a new version of
+ * that extent:
+ */
+ if (bkey_written(l->b, _k) &&
+ overlap != BCH_EXTENT_OVERLAP_ALL)
+ *u64s += _k->u64s;
+
+ /* And we may be splitting an existing extent: */
+ if (overlap == BCH_EXTENT_OVERLAP_MIDDLE)
+ *u64s += _k->u64s;
+
+ if (overlap == BCH_EXTENT_OVERLAP_MIDDLE &&
+ (sectors = bch2_bkey_sectors_compressed(k))) {
+ int flags = trans->flags & BTREE_INSERT_NOFAIL
+ ? BCH_DISK_RESERVATION_NOFAIL : 0;
+
+ switch (bch2_disk_reservation_add(trans->c,
+ trans->disk_res,
+ sectors, flags)) {
+ case 0:
+ break;
+ case -ENOSPC:
+ return BTREE_INSERT_ENOSPC;
+ default:
+ BUG();
+ }
}
+
+ if (overlap == BCH_EXTENT_OVERLAP_FRONT ||
+ overlap == BCH_EXTENT_OVERLAP_MIDDLE)
+ break;
+
+ bch2_btree_node_iter_advance(&node_iter, l->b);
}
return BTREE_INSERT_OK;
@@ -284,6 +302,52 @@ static void extent_bset_insert(struct bch_fs *c, struct btree_iter *iter,
bch2_btree_node_iter_fix(iter, l->b, &l->iter, k, 0, k->u64s);
}
+static void pack_push_whiteout(struct bch_fs *c, struct btree *b,
+ struct bpos pos)
+{
+ struct bkey_packed k;
+
+ if (!bkey_pack_pos(&k, pos, b)) {
+ struct bkey_i tmp;
+
+ bkey_init(&tmp.k);
+ tmp.k.p = pos;
+ bkey_copy(&k, &tmp);
+ }
+
+ k.needs_whiteout = true;
+ push_whiteout(c, b, &k);
+}
+
+static void
+extent_drop(struct bch_fs *c, struct btree_iter *iter,
+ struct bkey_packed *_k, struct bkey_s k)
+{
+ struct btree_iter_level *l = &iter->l[0];
+
+ if (!bkey_whiteout(k.k))
+ btree_account_key_drop(l->b, _k);
+
+ k.k->size = 0;
+ k.k->type = KEY_TYPE_deleted;
+
+ if (!btree_node_old_extent_overwrite(l->b) &&
+ k.k->needs_whiteout) {
+ pack_push_whiteout(c, l->b, k.k->p);
+ k.k->needs_whiteout = false;
+ }
+
+ if (_k >= btree_bset_last(l->b)->start) {
+ unsigned u64s = _k->u64s;
+
+ bch2_bset_delete(l->b, _k, _k->u64s);
+ bch2_btree_node_iter_fix(iter, l->b, &l->iter, _k, u64s, 0);
+ } else {
+ extent_save(l->b, _k, k.k);
+ bch2_btree_iter_fix_key_modified(iter, l->b, _k);
+ }
+}
+
static void
extent_squash(struct bch_fs *c, struct btree_iter *iter,
struct bkey_i *insert,
@@ -291,95 +355,117 @@ extent_squash(struct bch_fs *c, struct btree_iter *iter,
enum bch_extent_overlap overlap)
{
struct btree_iter_level *l = &iter->l[0];
- int u64s_delta;
+ struct bkey_on_stack tmp, split;
+
+ bkey_on_stack_init(&tmp);
+ bkey_on_stack_init(&split);
+
+ if (!btree_node_old_extent_overwrite(l->b)) {
+ if (!bkey_whiteout(&insert->k) &&
+ !bkey_cmp(k.k->p, insert->k.p)) {
+ insert->k.needs_whiteout = k.k->needs_whiteout;
+ k.k->needs_whiteout = false;
+ }
+ } else {
+ insert->k.needs_whiteout |= k.k->needs_whiteout;
+ }
switch (overlap) {
case BCH_EXTENT_OVERLAP_FRONT:
- /* insert overlaps with start of k: */
- u64s_delta = bch2_cut_front_s(insert->k.p, k);
- btree_keys_account_val_delta(l->b, _k, u64s_delta);
+ if (bkey_written(l->b, _k)) {
+ bkey_on_stack_reassemble(&tmp, c, k.s_c);
+ bch2_cut_front(insert->k.p, tmp.k);
+
+ /*
+ * needs_whiteout was propagated to new version of @k,
+ * @tmp:
+ */
+ if (!btree_node_old_extent_overwrite(l->b))
+ k.k->needs_whiteout = false;
+
+ extent_drop(c, iter, _k, k);
+ extent_bset_insert(c, iter, tmp.k);
+ } else {
+ btree_keys_account_val_delta(l->b, _k,
+ bch2_cut_front_s(insert->k.p, k));
- EBUG_ON(bkey_deleted(k.k));
- extent_save(l->b, _k, k.k);
- bch2_btree_iter_fix_key_modified(iter, l->b, _k);
+ extent_save(l->b, _k, k.k);
+ /*
+ * No need to call bset_fix_invalidated_key, start of
+ * extent changed but extents are indexed by where they
+ * end
+ */
+ bch2_btree_iter_fix_key_modified(iter, l->b, _k);
+ }
break;
-
case BCH_EXTENT_OVERLAP_BACK:
- /* insert overlaps with end of k: */
- u64s_delta = bch2_cut_back_s(bkey_start_pos(&insert->k), k);
- btree_keys_account_val_delta(l->b, _k, u64s_delta);
+ if (bkey_written(l->b, _k)) {
+ bkey_on_stack_reassemble(&tmp, c, k.s_c);
+ bch2_cut_back(bkey_start_pos(&insert->k), tmp.k);
+
+ /*
+ * @tmp has different position than @k, needs_whiteout
+ * should not be propagated:
+ */
+ if (!btree_node_old_extent_overwrite(l->b))
+ tmp.k->k.needs_whiteout = false;
+
+ extent_drop(c, iter, _k, k);
+ extent_bset_insert(c, iter, tmp.k);
+ } else {
+ /*
+ * position of @k is changing, emit a whiteout if
+ * needs_whiteout is set:
+ */
+ if (!btree_node_old_extent_overwrite(l->b) &&
+ k.k->needs_whiteout) {
+ pack_push_whiteout(c, l->b, k.k->p);
+ k.k->needs_whiteout = false;
+ }
- EBUG_ON(bkey_deleted(k.k));
- extent_save(l->b, _k, k.k);
+ btree_keys_account_val_delta(l->b, _k,
+ bch2_cut_back_s(bkey_start_pos(&insert->k), k));
+ extent_save(l->b, _k, k.k);
- /*
- * As the auxiliary tree is indexed by the end of the
- * key and we've just changed the end, update the
- * auxiliary tree.
- */
- bch2_bset_fix_invalidated_key(l->b, _k);
- bch2_btree_node_iter_fix(iter, l->b, &l->iter,
- _k, _k->u64s, _k->u64s);
+ bch2_bset_fix_invalidated_key(l->b, _k);
+ bch2_btree_node_iter_fix(iter, l->b, &l->iter,
+ _k, _k->u64s, _k->u64s);
+ }
+ break;
+ case BCH_EXTENT_OVERLAP_ALL:
+ extent_drop(c, iter, _k, k);
break;
+ case BCH_EXTENT_OVERLAP_MIDDLE:
+ bkey_on_stack_reassemble(&split, c, k.s_c);
+ bch2_cut_back(bkey_start_pos(&insert->k), split.k);
- case BCH_EXTENT_OVERLAP_ALL: {
- /* The insert key completely covers k, invalidate k */
- if (!bkey_whiteout(k.k))
- btree_account_key_drop(l->b, _k);
+ if (!btree_node_old_extent_overwrite(l->b))
+ split.k->k.needs_whiteout = false;
- k.k->size = 0;
- k.k->type = KEY_TYPE_deleted;
+ /* this is identical to BCH_EXTENT_OVERLAP_FRONT: */
+ if (bkey_written(l->b, _k)) {
+ bkey_on_stack_reassemble(&tmp, c, k.s_c);
+ bch2_cut_front(insert->k.p, tmp.k);
- if (_k >= btree_bset_last(l->b)->start) {
- unsigned u64s = _k->u64s;
+ if (!btree_node_old_extent_overwrite(l->b))
+ k.k->needs_whiteout = false;
- bch2_bset_delete(l->b, _k, _k->u64s);
- bch2_btree_node_iter_fix(iter, l->b, &l->iter,
- _k, u64s, 0);
+ extent_drop(c, iter, _k, k);
+ extent_bset_insert(c, iter, tmp.k);
} else {
+ btree_keys_account_val_delta(l->b, _k,
+ bch2_cut_front_s(insert->k.p, k));
+
extent_save(l->b, _k, k.k);
bch2_btree_iter_fix_key_modified(iter, l->b, _k);
}
- break;
- }
- case BCH_EXTENT_OVERLAP_MIDDLE: {
- struct bkey_on_stack split;
-
- bkey_on_stack_init(&split);
- bkey_on_stack_reassemble(&split, c, k.s_c);
-
- /*
- * The insert key falls 'in the middle' of k
- * The insert key splits k in 3:
- * - start only in k, preserve
- * - middle common section, invalidate in k
- * - end only in k, preserve
- *
- * We update the old key to preserve the start,
- * insert will be the new common section,
- * we manually insert the end that we are preserving.
- *
- * modify k _before_ doing the insert (which will move
- * what k points to)
- */
- split.k->k.needs_whiteout |= bkey_written(l->b, _k);
-
- bch2_cut_back(bkey_start_pos(&insert->k), split.k);
- BUG_ON(bkey_deleted(&split.k->k));
-
- u64s_delta = bch2_cut_front_s(insert->k.p, k);
- btree_keys_account_val_delta(l->b, _k, u64s_delta);
-
- BUG_ON(bkey_deleted(k.k));
- extent_save(l->b, _k, k.k);
- bch2_btree_iter_fix_key_modified(iter, l->b, _k);
-
extent_bset_insert(c, iter, split.k);
- bkey_on_stack_exit(&split, c);
break;
}
- }
+
+ bkey_on_stack_exit(&split, c);
+ bkey_on_stack_exit(&tmp, c);
}
/**
@@ -422,17 +508,13 @@ extent_squash(struct bch_fs *c, struct btree_iter *iter,
* key insertion needs to continue/be retried.
*/
void bch2_insert_fixup_extent(struct btree_trans *trans,
- struct btree_insert_entry *insert_entry)
+ struct btree_iter *iter,
+ struct bkey_i *insert)
{
struct bch_fs *c = trans->c;
- struct btree_iter *iter = insert_entry->iter;
- struct bkey_i *insert = insert_entry->k;
struct btree_iter_level *l = &iter->l[0];
struct btree_node_iter node_iter = l->iter;
- bool deleting = bkey_whiteout(&insert->k);
- bool update_journal = !deleting;
- bool update_btree = !deleting;
- struct bkey_i whiteout = *insert;
+ bool do_update = !bkey_whiteout(&insert->k);
struct bkey_packed *_k;
struct bkey unpacked;
@@ -443,7 +525,6 @@ void bch2_insert_fixup_extent(struct btree_trans *trans,
while ((_k = bch2_btree_node_iter_peek_filter(&l->iter, l->b,
KEY_TYPE_discard))) {
struct bkey_s k = __bkey_disassemble(l->b, _k, &unpacked);
- struct bpos cur_end = bpos_min(insert->k.p, k.k->p);
enum bch_extent_overlap overlap =
bch2_extent_overlap(&insert->k, k.k);
@@ -451,52 +532,17 @@ void bch2_insert_fixup_extent(struct btree_trans *trans,
break;
if (!bkey_whiteout(k.k))
- update_journal = true;
+ do_update = true;
+
+ if (!do_update) {
+ struct bpos cur_end = bpos_min(insert->k.p, k.k->p);
- if (!update_journal) {
bch2_cut_front(cur_end, insert);
- bch2_cut_front(cur_end, &whiteout);
bch2_btree_iter_set_pos_same_leaf(iter, cur_end);
- goto next;
- }
-
- /*
- * When deleting, if possible just do it by switching the type
- * of the key we're deleting, instead of creating and inserting
- * a new whiteout:
- */
- if (deleting &&
- !update_btree &&
- !bkey_cmp(insert->k.p, k.k->p) &&
- !bkey_cmp(bkey_start_pos(&insert->k), bkey_start_pos(k.k))) {
- if (!bkey_whiteout(k.k)) {
- btree_account_key_drop(l->b, _k);
- _k->type = KEY_TYPE_discard;
- reserve_whiteout(l->b, _k);
- bch2_btree_iter_fix_key_modified(iter,
- l->b, _k);
- }
- break;
- }
-
- if (k.k->needs_whiteout || bkey_written(l->b, _k)) {
- insert->k.needs_whiteout = true;
- update_btree = true;
- }
-
- if (update_btree &&
- overlap == BCH_EXTENT_OVERLAP_ALL &&
- bkey_whiteout(k.k) &&
- k.k->needs_whiteout) {
- unreserve_whiteout(l->b, _k);
- _k->needs_whiteout = false;
+ } else {
+ extent_squash(c, iter, insert, _k, k, overlap);
}
- extent_squash(c, iter, insert, _k, k, overlap);
-
- if (!update_btree)
- bch2_cut_front(cur_end, insert);
-next:
node_iter = l->iter;
if (overlap == BCH_EXTENT_OVERLAP_FRONT ||
@@ -507,24 +553,15 @@ next:
l->iter = node_iter;
bch2_btree_iter_set_pos_same_leaf(iter, insert->k.p);
- if (update_btree) {
- if (deleting)
+ if (do_update) {
+ if (insert->k.type == KEY_TYPE_deleted)
insert->k.type = KEY_TYPE_discard;
- EBUG_ON(bkey_deleted(&insert->k) || !insert->k.size);
-
- extent_bset_insert(c, iter, insert);
- }
-
- if (update_journal) {
- struct bkey_i *k = !deleting ? insert : &whiteout;
-
- if (deleting)
- k->k.type = KEY_TYPE_discard;
-
- EBUG_ON(bkey_deleted(&k->k) || !k->k.size);
+ if (!bkey_whiteout(&insert->k) ||
+ btree_node_old_extent_overwrite(l->b))
+ extent_bset_insert(c, iter, insert);
- bch2_btree_journal_key(trans, iter, k);
+ bch2_btree_journal_key(trans, iter, insert);
}
bch2_cut_front(insert->k.p, insert);
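
Every branch in the extent_squash()/bch2_extent_can_insert() rework above keys off bch2_extent_overlap(). For reference, a sketch of that helper as it exists in extents.h (not part of this diff): the two endpoint comparisons are packed directly into the enum value, so ALL/BACK/FRONT/MIDDLE fall out of two bkey_cmp() calls.

enum bch_extent_overlap {
	BCH_EXTENT_OVERLAP_ALL		= 0,
	BCH_EXTENT_OVERLAP_BACK		= 1,
	BCH_EXTENT_OVERLAP_FRONT	= 2,
	BCH_EXTENT_OVERLAP_MIDDLE	= 3,
};

/* Returns how k overlaps with m -- sketch of the helper used above: */
static inline enum bch_extent_overlap bch2_extent_overlap(const struct bkey *k,
							   const struct bkey *m)
{
	int cmp1 = bkey_cmp(k->p, m->p) < 0;
	int cmp2 = bkey_cmp(bkey_start_pos(k), bkey_start_pos(m)) > 0;

	return (cmp1 << 1) + cmp2;
}
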
diff --git a/fs/bcachefs/extent_update.h b/fs/bcachefs/extent_update.h
index 89d18e4b6758..e9dc8091ba3f 100644
--- a/fs/bcachefs/extent_update.h
+++ b/fs/bcachefs/extent_update.h
@@ -10,9 +10,10 @@ int bch2_extent_trim_atomic(struct bkey_i *, struct btree_iter *);
int bch2_extent_is_atomic(struct bkey_i *, struct btree_iter *);
enum btree_insert_ret
-bch2_extent_can_insert(struct btree_trans *, struct btree_insert_entry *,
- unsigned *);
+bch2_extent_can_insert(struct btree_trans *, struct btree_iter *,
+ struct bkey_i *, unsigned *);
void bch2_insert_fixup_extent(struct btree_trans *,
- struct btree_insert_entry *);
+ struct btree_iter *,
+ struct bkey_i *);
#endif /* _BCACHEFS_EXTENT_UPDATE_H */
diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c
index 6bcc178604b0..c4b0b9e15a8f 100644
--- a/fs/bcachefs/extents.c
+++ b/fs/bcachefs/extents.c
@@ -172,14 +172,17 @@ void bch2_btree_ptr_debugcheck(struct bch_fs *c, struct bkey_s_c k)
struct bucket_mark mark;
struct bch_dev *ca;
- bch2_fs_bug_on(!test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) &&
- !bch2_bkey_replicas_marked(c, k, false), c,
- "btree key bad (replicas not marked in superblock):\n%s",
- (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf));
-
if (!test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags))
return;
+ if (!percpu_down_read_trylock(&c->mark_lock))
+ return;
+
+ bch2_fs_inconsistent_on(!test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) &&
+ !bch2_bkey_replicas_marked(c, k, false), c,
+ "btree key bad (replicas not marked in superblock):\n%s",
+ (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf));
+
bkey_for_each_ptr(ptrs, ptr) {
ca = bch_dev_bkey_exists(c, ptr->dev);
@@ -194,13 +197,15 @@ void bch2_btree_ptr_debugcheck(struct bch_fs *c, struct bkey_s_c k)
mark.dirty_sectors < c->opts.btree_node_size)
goto err;
}
-
+out:
+ percpu_up_read(&c->mark_lock);
return;
err:
- bch2_bkey_val_to_text(&PBUF(buf), c, k);
- bch2_fs_bug(c, "%s btree pointer %s: bucket %zi gen %i mark %08x",
- err, buf, PTR_BUCKET_NR(ca, ptr),
- mark.gen, (unsigned) mark.v.counter);
+ bch2_fs_inconsistent(c, "%s btree pointer %s: bucket %zi gen %i mark %08x",
+ err, (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf),
+ PTR_BUCKET_NR(ca, ptr),
+ mark.gen, (unsigned) mark.v.counter);
+ goto out;
}
void bch2_btree_ptr_to_text(struct printbuf *out, struct bch_fs *c,
@@ -223,29 +228,18 @@ void bch2_extent_debugcheck(struct bch_fs *c, struct bkey_s_c k)
struct extent_ptr_decoded p;
char buf[160];
- /*
- * XXX: we should be doing most/all of these checks at startup time,
- * where we check bch2_bkey_invalid() in btree_node_read_done()
- *
- * But note that we can't check for stale pointers or incorrect gc marks
- * until after journal replay is done (it might be an extent that's
- * going to get overwritten during replay)
- */
-
- if (percpu_down_read_trylock(&c->mark_lock)) {
- bch2_fs_bug_on(!test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) &&
- !bch2_bkey_replicas_marked_locked(c, e.s_c, false), c,
- "extent key bad (replicas not marked in superblock):\n%s",
- (bch2_bkey_val_to_text(&PBUF(buf), c, e.s_c), buf));
- percpu_up_read(&c->mark_lock);
- }
- /*
- * If journal replay hasn't finished, we might be seeing keys
- * that will be overwritten by the time journal replay is done:
- */
- if (!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags))
+ if (!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags) ||
+ !test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags))
return;
+ if (!percpu_down_read_trylock(&c->mark_lock))
+ return;
+
+ bch2_fs_inconsistent_on(!test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) &&
+ !bch2_bkey_replicas_marked_locked(c, e.s_c, false), c,
+ "extent key bad (replicas not marked in superblock):\n%s",
+ (bch2_bkey_val_to_text(&PBUF(buf), c, e.s_c), buf));
+
extent_for_each_ptr_decode(e, p, entry) {
struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev);
struct bucket_mark mark = ptr_bucket_mark(ca, &p.ptr);
@@ -255,21 +249,24 @@ void bch2_extent_debugcheck(struct bch_fs *c, struct bkey_s_c k)
? mark.cached_sectors
: mark.dirty_sectors;
- bch2_fs_bug_on(stale && !p.ptr.cached, c,
- "stale dirty pointer (ptr gen %u bucket %u",
- p.ptr.gen, mark.gen);
-
- bch2_fs_bug_on(stale > 96, c, "key too stale: %i", stale);
-
- bch2_fs_bug_on(!stale &&
- (mark.data_type != BCH_DATA_USER ||
- mark_sectors < disk_sectors), c,
- "extent pointer not marked: %s:\n"
- "type %u sectors %u < %u",
- (bch2_bkey_val_to_text(&PBUF(buf), c, e.s_c), buf),
- mark.data_type,
- mark_sectors, disk_sectors);
+ bch2_fs_inconsistent_on(stale && !p.ptr.cached, c,
+ "stale dirty pointer (ptr gen %u bucket %u",
+ p.ptr.gen, mark.gen);
+
+ bch2_fs_inconsistent_on(stale > 96, c,
+ "key too stale: %i", stale);
+
+ bch2_fs_inconsistent_on(!stale &&
+ (mark.data_type != BCH_DATA_USER ||
+ mark_sectors < disk_sectors), c,
+ "extent pointer not marked: %s:\n"
+ "type %u sectors %u < %u",
+ (bch2_bkey_val_to_text(&PBUF(buf), c, e.s_c), buf),
+ mark.data_type,
+ mark_sectors, disk_sectors);
}
+
+ percpu_up_read(&c->mark_lock);
}
void bch2_extent_to_text(struct printbuf *out, struct bch_fs *c,
@@ -614,7 +611,7 @@ unsigned bch2_bkey_nr_ptrs_fully_allocated(struct bkey_s_c k)
bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
ret += !p.ptr.cached &&
- p.crc.compression_type == BCH_COMPRESSION_NONE;
+ p.crc.compression_type == BCH_COMPRESSION_TYPE_none;
}
return ret;
@@ -629,7 +626,7 @@ unsigned bch2_bkey_sectors_compressed(struct bkey_s_c k)
bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
if (!p.ptr.cached &&
- p.crc.compression_type != BCH_COMPRESSION_NONE)
+ p.crc.compression_type != BCH_COMPRESSION_TYPE_none)
ret += p.crc.compressed_size;
return ret;
@@ -1054,7 +1051,7 @@ const char *bch2_bkey_ptrs_invalid(const struct bch_fs *c, struct bkey_s_c k)
if (!bch2_checksum_type_valid(c, crc.csum_type))
return "invalid checksum type";
- if (crc.compression_type >= BCH_COMPRESSION_NR)
+ if (crc.compression_type >= BCH_COMPRESSION_TYPE_NR)
return "invalid compression type";
if (bch2_csum_type_is_encryption(crc.csum_type)) {
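
Both debugcheck routines reworked above now follow the same shape: bail out until the relevant recovery phases are done, take mark_lock with a trylock (presumably because these can run in contexts where the lock is already held), report through the inconsistent machinery, and release via a common exit path. A condensed sketch of that shape, with the actual checks elided:

/* Condensed shape of the reworked debugcheck functions above: */
static void debugcheck_template(struct bch_fs *c, struct bkey_s_c k)
{
	if (!test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags))
		return;

	/* may be called with mark_lock already held -- trylock only: */
	if (!percpu_down_read_trylock(&c->mark_lock))
		return;

	/* ... bch2_fs_inconsistent_on() checks against bucket marks ... */

	percpu_up_read(&c->mark_lock);
}
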
diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h
index 1140d01a42ab..7c5a41e6d79d 100644
--- a/fs/bcachefs/extents.h
+++ b/fs/bcachefs/extents.h
@@ -8,7 +8,6 @@
struct bch_fs;
struct btree_trans;
-struct btree_insert_entry;
/* extent entries: */
diff --git a/fs/bcachefs/fs-common.c b/fs/bcachefs/fs-common.c
index a4497eeb1f1b..96f7bbe0a3ed 100644
--- a/fs/bcachefs/fs-common.c
+++ b/fs/bcachefs/fs-common.c
@@ -76,11 +76,10 @@ int bch2_create_trans(struct btree_trans *trans, u64 dir_inum,
}
int bch2_link_trans(struct btree_trans *trans, u64 dir_inum,
- u64 inum, struct bch_inode_unpacked *inode_u,
- const struct qstr *name)
+ u64 inum, struct bch_inode_unpacked *dir_u,
+ struct bch_inode_unpacked *inode_u, const struct qstr *name)
{
struct btree_iter *dir_iter, *inode_iter;
- struct bch_inode_unpacked dir_u;
struct bch_hash_info dir_hash;
u64 now = bch2_current_time(trans->c);
@@ -91,18 +90,19 @@ int bch2_link_trans(struct btree_trans *trans, u64 dir_inum,
inode_u->bi_ctime = now;
bch2_inode_nlink_inc(inode_u);
- dir_iter = bch2_inode_peek(trans, &dir_u, dir_inum, 0);
+ dir_iter = bch2_inode_peek(trans, dir_u, dir_inum, 0);
if (IS_ERR(dir_iter))
return PTR_ERR(dir_iter);
- /* XXX: shouldn't we be updating mtime/ctime on the directory? */
+ dir_u->bi_mtime = dir_u->bi_ctime = now;
- dir_hash = bch2_hash_info_init(trans->c, &dir_u);
+ dir_hash = bch2_hash_info_init(trans->c, dir_u);
bch2_trans_iter_put(trans, dir_iter);
return bch2_dirent_create(trans, dir_inum, &dir_hash,
mode_to_type(inode_u->bi_mode),
name, inum, BCH_HASH_SET_MUST_CREATE) ?:
+ bch2_inode_write(trans, dir_iter, dir_u) ?:
bch2_inode_write(trans, inode_iter, inode_u);
}
diff --git a/fs/bcachefs/fs-common.h b/fs/bcachefs/fs-common.h
index c1621485a526..2273b7961c9b 100644
--- a/fs/bcachefs/fs-common.h
+++ b/fs/bcachefs/fs-common.h
@@ -14,6 +14,7 @@ int bch2_create_trans(struct btree_trans *, u64,
int bch2_link_trans(struct btree_trans *, u64,
u64, struct bch_inode_unpacked *,
+ struct bch_inode_unpacked *,
const struct qstr *);
int bch2_unlink_trans(struct btree_trans *,
diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c
index 274a8b364702..f393ecb93b7e 100644
--- a/fs/bcachefs/fs-io.c
+++ b/fs/bcachefs/fs-io.c
@@ -602,7 +602,7 @@ int bch2_migrate_page(struct address_space *mapping, struct page *newpage,
EBUG_ON(!PageLocked(page));
EBUG_ON(!PageLocked(newpage));
- ret = migrate_page_move_mapping(mapping, newpage, page, NULL, mode, 0);
+ ret = migrate_page_move_mapping(mapping, newpage, page, 0);
if (ret != MIGRATEPAGE_SUCCESS)
return ret;
@@ -627,10 +627,10 @@ int bch2_migrate_page(struct address_space *mapping, struct page *newpage,
static void bch2_readpages_end_io(struct bio *bio)
{
+ struct bvec_iter_all iter;
struct bio_vec *bv;
- unsigned i;
- bio_for_each_segment_all(bv, bio, i) {
+ bio_for_each_segment_all(bv, bio, iter) {
struct page *page = bv->bv_page;
if (!bio->bi_status) {
@@ -782,11 +782,8 @@ static void readpage_bio_extend(struct readpages_iter *iter,
if (!get_more)
break;
- rcu_read_lock();
- page = radix_tree_lookup(&iter->mapping->i_pages, page_offset);
- rcu_read_unlock();
-
- if (page && !radix_tree_exceptional_entry(page))
+ page = xa_load(&iter->mapping->i_pages, page_offset);
+ if (page && !xa_is_value(page))
break;
page = __page_cache_alloc(readahead_gfp_mask(iter->mapping));
@@ -1037,32 +1034,33 @@ static void bch2_writepage_io_done(struct closure *cl)
struct bch_writepage_io, cl);
struct bch_fs *c = io->op.c;
struct bio *bio = &io->op.wbio.bio;
+ struct bvec_iter_all iter;
struct bio_vec *bvec;
- unsigned i, j;
+ unsigned i;
if (io->op.error) {
- bio_for_each_segment_all(bvec, bio, i) {
+ bio_for_each_segment_all(bvec, bio, iter) {
struct bch_page_state *s;
SetPageError(bvec->bv_page);
- mapping_set_error(io->inode->v.i_mapping, -EIO);
+ mapping_set_error(bvec->bv_page->mapping, -EIO);
s = __bch2_page_state(bvec->bv_page);
spin_lock(&s->lock);
- for (j = 0; j < PAGE_SECTORS; j++)
- s->s[j].nr_replicas = 0;
+ for (i = 0; i < PAGE_SECTORS; i++)
+ s->s[i].nr_replicas = 0;
spin_unlock(&s->lock);
}
}
if (io->op.flags & BCH_WRITE_WROTE_DATA_INLINE) {
- bio_for_each_segment_all(bvec, bio, i) {
+ bio_for_each_segment_all(bvec, bio, iter) {
struct bch_page_state *s;
s = __bch2_page_state(bvec->bv_page);
spin_lock(&s->lock);
- for (j = 0; j < PAGE_SECTORS; j++)
- s->s[j].nr_replicas = 0;
+ for (i = 0; i < PAGE_SECTORS; i++)
+ s->s[i].nr_replicas = 0;
spin_unlock(&s->lock);
}
}
@@ -1086,7 +1084,7 @@ static void bch2_writepage_io_done(struct closure *cl)
*/
i_sectors_acct(c, io->inode, NULL, io->op.i_sectors_delta);
- bio_for_each_segment_all(bvec, bio, i) {
+ bio_for_each_segment_all(bvec, bio, iter) {
struct bch_page_state *s = __bch2_page_state(bvec->bv_page);
if (atomic_dec_and_test(&s->write_count))
@@ -1240,7 +1238,7 @@ do_io:
if (w->io &&
(w->io->op.res.nr_replicas != nr_replicas_this_write ||
- bio_full(&w->io->op.wbio.bio) ||
+ bio_full(&w->io->op.wbio.bio, PAGE_SIZE) ||
w->io->op.wbio.bio.bi_iter.bi_size >= (256U << 20) ||
bio_end_sector(&w->io->op.wbio.bio) != sector))
bch2_writepage_do_io(w);
@@ -1805,8 +1803,9 @@ static long bch2_dio_write_loop(struct dio_write *dio)
struct address_space *mapping = req->ki_filp->f_mapping;
struct bch_inode_info *inode = file_bch_inode(req->ki_filp);
struct bio *bio = &dio->op.wbio.bio;
+ struct bvec_iter_all iter;
struct bio_vec *bv;
- unsigned i, unaligned;
+ unsigned unaligned;
u64 new_i_size;
bool sync;
long ret;
@@ -1838,7 +1837,7 @@ static long bch2_dio_write_loop(struct dio_write *dio)
* bio_iov_iter_get_pages was only able to get <
* blocksize worth of pages:
*/
- bio_for_each_segment_all(bv, bio, i)
+ bio_for_each_segment_all(bv, bio, iter)
put_page(bv->bv_page);
ret = -EFAULT;
goto err;
@@ -1886,7 +1885,7 @@ loop:
i_size_write(&inode->v, new_i_size);
spin_unlock(&inode->v.i_lock);
- bio_for_each_segment_all(bv, bio, i)
+ bio_for_each_segment_all(bv, bio, iter)
put_page(bv->bv_page);
if (!dio->iter.count || dio->op.error)
break;
@@ -2288,6 +2287,15 @@ int bch2_truncate(struct bch_inode_info *inode, struct iattr *iattr)
if (ret)
goto err;
+ /*
+ * check this before next assertion; on filesystem error our normal
+ * invariants are a bit broken (truncate has to truncate the page cache
+ * before the inode).
+ */
+ ret = bch2_journal_error(&c->journal);
+ if (ret)
+ goto err;
+
BUG_ON(inode->v.i_size < inode_u.bi_size);
if (iattr->ia_size > inode->v.i_size) {
@@ -2403,7 +2411,7 @@ static long bchfs_fcollapse_finsert(struct bch_inode_info *inode,
struct address_space *mapping = inode->v.i_mapping;
struct bkey_on_stack copy;
struct btree_trans trans;
- struct btree_iter *src, *dst, *del = NULL;
+ struct btree_iter *src, *dst;
loff_t shift, new_size;
u64 src_start;
int ret;
@@ -2485,9 +2493,7 @@ static long bchfs_fcollapse_finsert(struct bch_inode_info *inode,
struct bpos next_pos;
struct bpos move_pos = POS(inode->v.i_ino, offset >> 9);
struct bpos atomic_end;
- unsigned commit_flags = BTREE_INSERT_NOFAIL|
- BTREE_INSERT_ATOMIC|
- BTREE_INSERT_USE_RESERVE;
+ unsigned trigger_flags = 0;
k = insert
? bch2_btree_iter_peek_prev(src)
@@ -2535,38 +2541,12 @@ reassemble:
next_pos = insert ? bkey_start_pos(&delete.k) : delete.k.p;
- /*
- * If the new and old keys overlap (because we're moving an
- * extent that's bigger than the amount we're collapsing by),
- * we need to trim the delete key here so they don't overlap
- * because overlaps on insertions aren't handled before
- * triggers are run, so the overwrite will get double counted
- * by the triggers machinery:
- */
- if (insert &&
- bkey_cmp(bkey_start_pos(&copy.k->k), delete.k.p) < 0) {
- bch2_cut_back(bkey_start_pos(&copy.k->k), &delete);
- } else if (!insert &&
- bkey_cmp(copy.k->k.p,
- bkey_start_pos(&delete.k)) > 0) {
- bch2_cut_front(copy.k->k.p, &delete);
-
- del = bch2_trans_copy_iter(&trans, src);
- BUG_ON(IS_ERR_OR_NULL(del));
-
- bch2_btree_iter_set_pos(del,
- bkey_start_pos(&delete.k));
- }
-
- bch2_trans_update(&trans, dst, copy.k);
- bch2_trans_update(&trans, del ?: src, &delete);
-
if (copy.k->k.size == k.k->size) {
/*
* If we're moving the entire extent, we can skip
* running triggers:
*/
- commit_flags |= BTREE_INSERT_NOMARK;
+ trigger_flags |= BTREE_TRIGGER_NORUN;
} else {
/* We might end up splitting compressed extents: */
unsigned nr_ptrs =
@@ -2578,15 +2558,13 @@ reassemble:
BUG_ON(ret);
}
- ret = bch2_trans_commit(&trans, &disk_res,
- &inode->ei_journal_seq,
- commit_flags);
+ ret = bch2_trans_update(&trans, src, &delete, trigger_flags) ?:
+ bch2_trans_update(&trans, dst, copy.k, trigger_flags) ?:
+ bch2_trans_commit(&trans, &disk_res,
+ &inode->ei_journal_seq,
+ BTREE_INSERT_NOFAIL);
bch2_disk_reservation_put(c, &disk_res);
bkey_err:
- if (del)
- bch2_trans_iter_put(&trans, del);
- del = NULL;
-
if (!ret)
bch2_btree_iter_set_pos(src, next_pos);
@@ -2670,6 +2648,8 @@ static long bchfs_fallocate(struct bch_inode_info *inode, int mode,
struct bkey_i_reservation reservation;
struct bkey_s_c k;
+ bch2_trans_reset(&trans, TRANS_RESET_MEM);
+
k = bch2_btree_iter_peek_slot(iter);
if ((ret = bkey_err(k)))
goto bkey_err;
@@ -2716,8 +2696,6 @@ static long bchfs_fallocate(struct bch_inode_info *inode, int mode,
reservation.v.nr_replicas = disk_res.nr_replicas;
}
- bch2_trans_begin_updates(&trans);
-
ret = bch2_extent_update(&trans, iter, &reservation.k_i,
&disk_res, &inode->ei_journal_seq,
0, &i_sectors_delta);
@@ -2844,235 +2822,6 @@ static void mark_range_unallocated(struct bch_inode_info *inode,
} while (index <= end_index);
}
-static int generic_access_check_limits(struct file *file, loff_t pos,
- loff_t *count)
-{
- struct inode *inode = file->f_mapping->host;
- loff_t max_size = inode->i_sb->s_maxbytes;
-
- if (!(file->f_flags & O_LARGEFILE))
- max_size = MAX_NON_LFS;
-
- if (unlikely(pos >= max_size))
- return -EFBIG;
- *count = min(*count, max_size - pos);
- return 0;
-}
-
-static int generic_write_check_limits(struct file *file, loff_t pos,
- loff_t *count)
-{
- loff_t limit = rlimit(RLIMIT_FSIZE);
-
- if (limit != RLIM_INFINITY) {
- if (pos >= limit) {
- send_sig(SIGXFSZ, current, 0);
- return -EFBIG;
- }
- *count = min(*count, limit - pos);
- }
-
- return generic_access_check_limits(file, pos, count);
-}
-
-static int generic_remap_checks(struct file *file_in, loff_t pos_in,
- struct file *file_out, loff_t pos_out,
- loff_t *req_count, unsigned int remap_flags)
-{
- struct inode *inode_in = file_in->f_mapping->host;
- struct inode *inode_out = file_out->f_mapping->host;
- uint64_t count = *req_count;
- uint64_t bcount;
- loff_t size_in, size_out;
- loff_t bs = inode_out->i_sb->s_blocksize;
- int ret;
-
- /* The start of both ranges must be aligned to an fs block. */
- if (!IS_ALIGNED(pos_in, bs) || !IS_ALIGNED(pos_out, bs))
- return -EINVAL;
-
- /* Ensure offsets don't wrap. */
- if (pos_in + count < pos_in || pos_out + count < pos_out)
- return -EINVAL;
-
- size_in = i_size_read(inode_in);
- size_out = i_size_read(inode_out);
-
- /* Dedupe requires both ranges to be within EOF. */
- if ((remap_flags & REMAP_FILE_DEDUP) &&
- (pos_in >= size_in || pos_in + count > size_in ||
- pos_out >= size_out || pos_out + count > size_out))
- return -EINVAL;
-
- /* Ensure the infile range is within the infile. */
- if (pos_in >= size_in)
- return -EINVAL;
- count = min(count, size_in - (uint64_t)pos_in);
-
- ret = generic_access_check_limits(file_in, pos_in, &count);
- if (ret)
- return ret;
-
- ret = generic_write_check_limits(file_out, pos_out, &count);
- if (ret)
- return ret;
-
- /*
- * If the user wanted us to link to the infile's EOF, round up to the
- * next block boundary for this check.
- *
- * Otherwise, make sure the count is also block-aligned, having
- * already confirmed the starting offsets' block alignment.
- */
- if (pos_in + count == size_in) {
- bcount = ALIGN(size_in, bs) - pos_in;
- } else {
- if (!IS_ALIGNED(count, bs))
- count = ALIGN_DOWN(count, bs);
- bcount = count;
- }
-
- /* Don't allow overlapped cloning within the same file. */
- if (inode_in == inode_out &&
- pos_out + bcount > pos_in &&
- pos_out < pos_in + bcount)
- return -EINVAL;
-
- /*
- * We shortened the request but the caller can't deal with that, so
- * bounce the request back to userspace.
- */
- if (*req_count != count && !(remap_flags & REMAP_FILE_CAN_SHORTEN))
- return -EINVAL;
-
- *req_count = count;
- return 0;
-}
-
-static int generic_remap_check_len(struct inode *inode_in,
- struct inode *inode_out,
- loff_t pos_out,
- loff_t *len,
- unsigned int remap_flags)
-{
- u64 blkmask = i_blocksize(inode_in) - 1;
- loff_t new_len = *len;
-
- if ((*len & blkmask) == 0)
- return 0;
-
- if ((remap_flags & REMAP_FILE_DEDUP) ||
- pos_out + *len < i_size_read(inode_out))
- new_len &= ~blkmask;
-
- if (new_len == *len)
- return 0;
-
- if (remap_flags & REMAP_FILE_CAN_SHORTEN) {
- *len = new_len;
- return 0;
- }
-
- return (remap_flags & REMAP_FILE_DEDUP) ? -EBADE : -EINVAL;
-}
-
-static int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in,
- struct file *file_out, loff_t pos_out,
- loff_t *len, unsigned int remap_flags)
-{
- struct inode *inode_in = file_inode(file_in);
- struct inode *inode_out = file_inode(file_out);
- bool same_inode = (inode_in == inode_out);
- int ret;
-
- /* Don't touch certain kinds of inodes */
- if (IS_IMMUTABLE(inode_out))
- return -EPERM;
-
- if (IS_SWAPFILE(inode_in) || IS_SWAPFILE(inode_out))
- return -ETXTBSY;
-
- /* Don't reflink dirs, pipes, sockets... */
- if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode))
- return -EISDIR;
- if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode))
- return -EINVAL;
-
- /* Zero length dedupe exits immediately; reflink goes to EOF. */
- if (*len == 0) {
- loff_t isize = i_size_read(inode_in);
-
- if ((remap_flags & REMAP_FILE_DEDUP) || pos_in == isize)
- return 0;
- if (pos_in > isize)
- return -EINVAL;
- *len = isize - pos_in;
- if (*len == 0)
- return 0;
- }
-
- /* Check that we don't violate system file offset limits. */
- ret = generic_remap_checks(file_in, pos_in, file_out, pos_out, len,
- remap_flags);
- if (ret)
- return ret;
-
- /* Wait for the completion of any pending IOs on both files */
- inode_dio_wait(inode_in);
- if (!same_inode)
- inode_dio_wait(inode_out);
-
- ret = filemap_write_and_wait_range(inode_in->i_mapping,
- pos_in, pos_in + *len - 1);
- if (ret)
- return ret;
-
- ret = filemap_write_and_wait_range(inode_out->i_mapping,
- pos_out, pos_out + *len - 1);
- if (ret)
- return ret;
-
- /*
- * Check that the extents are the same.
- */
- if (remap_flags & REMAP_FILE_DEDUP) {
- bool is_same = false;
-
- ret = vfs_dedupe_file_range_compare(inode_in, pos_in,
- inode_out, pos_out, *len, &is_same);
- if (ret)
- return ret;
- if (!is_same)
- return -EBADE;
- }
-
- ret = generic_remap_check_len(inode_in, inode_out, pos_out, len,
- remap_flags);
- if (ret)
- return ret;
-
- /* If can't alter the file contents, we're done. */
- if (!(remap_flags & REMAP_FILE_DEDUP)) {
- /* Update the timestamps, since we can alter file contents. */
- if (!(file_out->f_mode & FMODE_NOCMTIME)) {
- ret = file_update_time(file_out);
- if (ret)
- return ret;
- }
-
- /*
- * Clear the security bits if the process is not being run by
- * root. This keeps people from modifying setuid and setgid
- * binaries.
- */
- ret = file_remove_privs(file_out);
- if (ret)
- return ret;
- }
-
- return 0;
-}
-
loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src,
struct file *file_dst, loff_t pos_dst,
loff_t len, unsigned remap_flags)
@@ -3265,7 +3014,7 @@ static loff_t page_hole_offset(struct address_space *mapping, loff_t offset)
loff_t ret = -1;
page = find_lock_entry(mapping, index);
- if (!page || radix_tree_exception(page))
+ if (!page || xa_is_value(page))
return offset;
pg_offset = __page_hole_offset(page, offset & (PAGE_SIZE - 1));
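
The fs-io.c hunks above, like the matching hunks in io.c and move.c further down, track a kernel-wide API change: bio_for_each_segment_all() now keeps its iteration state in a struct bvec_iter_all on the stack rather than an unsigned index. A minimal sketch of the updated idiom (the helper name is hypothetical, not part of this patch):

    #include <linux/bio.h>
    #include <linux/mm.h>

    /* Drop a reference on every page attached to a bio, new-style. */
    static void put_bio_pages(struct bio *bio)
    {
            struct bvec_iter_all iter;
            struct bio_vec *bv;

            bio_for_each_segment_all(bv, bio, iter)
                    put_page(bv->bv_page);
    }
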
diff --git a/fs/bcachefs/fs-io.h b/fs/bcachefs/fs-io.h
index 1b593ea707d5..7063556d289b 100644
--- a/fs/bcachefs/fs-io.h
+++ b/fs/bcachefs/fs-io.h
@@ -35,10 +35,6 @@ int bch2_fsync(struct file *, loff_t, loff_t, int);
int bch2_truncate(struct bch_inode_info *, struct iattr *);
long bch2_fallocate_dispatch(struct file *, int, loff_t, loff_t);
-#define REMAP_FILE_ADVISORY (0)
-#define REMAP_FILE_DEDUP (1 << 0)
-#define REMAP_FILE_CAN_SHORTEN (1 << 1)
-
loff_t bch2_remap_file_range(struct file *, loff_t, struct file *,
loff_t, loff_t, unsigned);
diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c
index e627044c3409..f14f880534fd 100644
--- a/fs/bcachefs/fs.c
+++ b/fs/bcachefs/fs.c
@@ -140,7 +140,6 @@ retry:
bch2_inode_write(&trans, iter, &inode_u) ?:
bch2_trans_commit(&trans, NULL,
&inode->ei_journal_seq,
- BTREE_INSERT_ATOMIC|
BTREE_INSERT_NOUNLOCK|
BTREE_INSERT_NOFAIL);
if (ret == -EINTR)
@@ -269,7 +268,6 @@ retry:
goto err_before_quota;
ret = bch2_trans_commit(&trans, NULL, &journal_seq,
- BTREE_INSERT_ATOMIC|
BTREE_INSERT_NOUNLOCK);
if (unlikely(ret)) {
bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, -1,
@@ -379,7 +377,7 @@ static int __bch2_link(struct bch_fs *c,
struct dentry *dentry)
{
struct btree_trans trans;
- struct bch_inode_unpacked inode_u;
+ struct bch_inode_unpacked dir_u, inode_u;
int ret;
mutex_lock(&inode->ei_update_lock);
@@ -389,16 +387,21 @@ static int __bch2_link(struct bch_fs *c,
bch2_trans_begin(&trans);
ret = bch2_link_trans(&trans,
dir->v.i_ino,
- inode->v.i_ino, &inode_u,
+ inode->v.i_ino, &dir_u, &inode_u,
&dentry->d_name) ?:
bch2_trans_commit(&trans, NULL,
&inode->ei_journal_seq,
- BTREE_INSERT_ATOMIC|
BTREE_INSERT_NOUNLOCK);
} while (ret == -EINTR);
- if (likely(!ret))
+ if (likely(!ret)) {
+ BUG_ON(inode_u.bi_inum != inode->v.i_ino);
+
+ journal_seq_copy(inode, dir->ei_journal_seq);
+ bch2_inode_update_after_write(c, dir, &dir_u,
+ ATTR_MTIME|ATTR_CTIME);
bch2_inode_update_after_write(c, inode, &inode_u, ATTR_CTIME);
+ }
bch2_trans_exit(&trans);
mutex_unlock(&inode->ei_update_lock);
@@ -444,7 +447,6 @@ static int bch2_unlink(struct inode *vdir, struct dentry *dentry)
&inode_u, &dentry->d_name) ?:
bch2_trans_commit(&trans, NULL,
&dir->ei_journal_seq,
- BTREE_INSERT_ATOMIC|
BTREE_INSERT_NOUNLOCK|
BTREE_INSERT_NOFAIL);
} while (ret == -EINTR);
@@ -573,7 +575,6 @@ retry:
mode) ?:
bch2_trans_commit(&trans, NULL,
&journal_seq,
- BTREE_INSERT_ATOMIC|
BTREE_INSERT_NOUNLOCK);
if (ret == -EINTR)
goto retry;
@@ -706,7 +707,6 @@ retry:
ret = bch2_inode_write(&trans, inode_iter, &inode_u) ?:
bch2_trans_commit(&trans, NULL,
&inode->ei_journal_seq,
- BTREE_INSERT_ATOMIC|
BTREE_INSERT_NOUNLOCK|
BTREE_INSERT_NOFAIL);
btree_err:
@@ -963,15 +963,6 @@ static int bch2_vfs_readdir(struct file *file, struct dir_context *ctx)
return bch2_readdir(c, inode->v.i_ino, ctx);
}
-static int bch2_clone_file_range(struct file *file_src, loff_t pos_src,
- struct file *file_dst, loff_t pos_dst,
- u64 len)
-{
- return bch2_remap_file_range(file_src, pos_src,
- file_dst, pos_dst,
- len, 0);
-}
-
static const struct file_operations bch_file_operations = {
.llseek = bch2_llseek,
.read_iter = bch2_read_iter,
@@ -989,7 +980,7 @@ static const struct file_operations bch_file_operations = {
#ifdef CONFIG_COMPAT
.compat_ioctl = bch2_compat_fs_ioctl,
#endif
- .clone_file_range = bch2_clone_file_range,
+ .remap_file_range = bch2_remap_file_range,
};
static const struct inode_operations bch_file_inode_operations = {
@@ -1520,7 +1511,7 @@ static struct dentry *bch2_mount(struct file_system_type *fs_type,
sb->s_bdi->congested_fn = bch2_congested;
sb->s_bdi->congested_data = c;
- sb->s_bdi->ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_SIZE;
+ sb->s_bdi->ra_pages = VM_READAHEAD_PAGES;
for_each_online_member(ca, c, i) {
struct block_device *bdev = ca->disk_sb.bdev;
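
Note what is no longer in these fs.c commits: BTREE_INSERT_ATOMIC. Transaction commits are now unconditionally atomic, so the flag is dropped everywhere and callers rely on the retry-on-EINTR loops they already had. A sketch of the resulting shape, where build_updates() is a hypothetical stand-in for helpers like bch2_link_trans() that queue bch2_trans_update() calls:

    /*
     * Commit loop once BTREE_INSERT_ATOMIC is gone: rebuild the updates
     * and retry the whole transaction whenever the commit returns -EINTR.
     */
    static int commit_with_retry(struct btree_trans *trans, u64 *journal_seq)
    {
            int ret;

            do {
                    bch2_trans_begin(trans);

                    ret =   build_updates(trans) ?:         /* hypothetical */
                            bch2_trans_commit(trans, NULL, journal_seq,
                                              BTREE_INSERT_NOUNLOCK);
            } while (ret == -EINTR);

            return ret;
    }
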
diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c
index 0f2308e53d65..9ef532d875e8 100644
--- a/fs/bcachefs/fsck.c
+++ b/fs/bcachefs/fsck.c
@@ -37,8 +37,8 @@ static s64 bch2_count_inode_sectors(struct btree_trans *trans, u64 inum)
return ret ?: sectors;
}
-static int remove_dirent(struct btree_trans *trans,
- struct bkey_s_c_dirent dirent)
+static int __remove_dirent(struct btree_trans *trans,
+ struct bkey_s_c_dirent dirent)
{
struct bch_fs *c = trans->c;
struct qstr name;
@@ -49,38 +49,47 @@ static int remove_dirent(struct btree_trans *trans,
char *buf;
name.len = bch2_dirent_name_bytes(dirent);
- buf = kmalloc(name.len + 1, GFP_KERNEL);
- if (!buf)
- return -ENOMEM;
+ buf = bch2_trans_kmalloc(trans, name.len + 1);
+ if (IS_ERR(buf))
+ return PTR_ERR(buf);
memcpy(buf, dirent.v->d_name, name.len);
buf[name.len] = '\0';
name.name = buf;
- /* Unlock so we don't deadlock, after copying name: */
- bch2_trans_unlock(trans);
-
- ret = bch2_inode_find_by_inum(c, dir_inum, &dir_inode);
- if (ret) {
+ ret = bch2_inode_find_by_inum_trans(trans, dir_inum, &dir_inode);
+ if (ret && ret != -EINTR)
bch_err(c, "remove_dirent: err %i looking up directory inode", ret);
- goto err;
- }
+ if (ret)
+ return ret;
dir_hash_info = bch2_hash_info_init(c, &dir_inode);
- ret = bch2_dirent_delete(c, dir_inum, &dir_hash_info, &name, NULL);
- if (ret)
+ ret = bch2_hash_delete(trans, bch2_dirent_hash_desc,
+ &dir_hash_info, dir_inum, &name);
+ if (ret && ret != -EINTR)
bch_err(c, "remove_dirent: err %i deleting dirent", ret);
-err:
- kfree(buf);
- return ret;
+ if (ret)
+ return ret;
+
+ return 0;
+}
+
+static int remove_dirent(struct btree_trans *trans,
+ struct bkey_s_c_dirent dirent)
+{
+ return __bch2_trans_do(trans, NULL, NULL,
+ BTREE_INSERT_NOFAIL|
+ BTREE_INSERT_LAZY_RW,
+ TRANS_RESET_MEM,
+ __remove_dirent(trans, dirent));
}
static int reattach_inode(struct bch_fs *c,
struct bch_inode_unpacked *lostfound_inode,
u64 inum)
{
- struct bch_inode_unpacked inode_u;
+ struct bch_inode_unpacked dir_u, inode_u;
char name_buf[20];
struct qstr name;
int ret;
@@ -88,11 +97,10 @@ static int reattach_inode(struct bch_fs *c,
snprintf(name_buf, sizeof(name_buf), "%llu", inum);
name = (struct qstr) QSTR(name_buf);
- ret = bch2_trans_do(c, NULL,
- BTREE_INSERT_ATOMIC|
+ ret = bch2_trans_do(c, NULL, NULL,
BTREE_INSERT_LAZY_RW,
bch2_link_trans(&trans, lostfound_inode->bi_inum,
- inum, &inode_u, &name));
+ inum, &dir_u, &inode_u, &name));
if (ret)
bch_err(c, "error %i reattaching inode %llu", ret, inum);
@@ -171,27 +179,26 @@ static int hash_redo_key(const struct bch_hash_desc desc,
struct btree_iter *k_iter, struct bkey_s_c k,
u64 hashed)
{
+ struct bkey_i delete;
struct bkey_i *tmp;
- int ret = 0;
- tmp = kmalloc(bkey_bytes(k.k), GFP_KERNEL);
- if (!tmp)
- return -ENOMEM;
+ bch2_trans_reset(trans, TRANS_RESET_MEM);
+
+ tmp = bch2_trans_kmalloc(trans, bkey_bytes(k.k));
+ if (IS_ERR(tmp))
+ return PTR_ERR(tmp);
bkey_reassemble(tmp, k);
- ret = bch2_btree_delete_at(trans, k_iter, 0);
- if (ret)
- goto err;
+ bkey_init(&delete.k);
+ delete.k.p = k_iter->pos;
+ bch2_trans_update(trans, k_iter, &delete, 0);
- bch2_hash_set(trans, desc, &h->info, k_iter->pos.inode,
- tmp, BCH_HASH_SET_MUST_CREATE);
- ret = bch2_trans_commit(trans, NULL, NULL,
- BTREE_INSERT_NOFAIL|
- BTREE_INSERT_LAZY_RW);
-err:
- kfree(tmp);
- return ret;
+ return bch2_hash_set(trans, desc, &h->info, k_iter->pos.inode,
+ tmp, BCH_HASH_SET_MUST_CREATE) ?:
+ bch2_trans_commit(trans, NULL, NULL,
+ BTREE_INSERT_NOFAIL|
+ BTREE_INSERT_LAZY_RW);
}
static int fsck_hash_delete_at(struct btree_trans *trans,
@@ -203,7 +210,6 @@ static int fsck_hash_delete_at(struct btree_trans *trans,
retry:
ret = bch2_hash_delete_at(trans, desc, info, iter) ?:
bch2_trans_commit(trans, NULL, NULL,
- BTREE_INSERT_ATOMIC|
BTREE_INSERT_NOFAIL|
BTREE_INSERT_LAZY_RW);
if (ret == -EINTR) {
@@ -313,9 +319,11 @@ static int hash_check_key(struct btree_trans *trans,
"hashed to %llu chain starts at %llu\n%s",
desc.btree_id, k.k->p.offset,
hashed, h->chain->pos.offset,
- (bch2_bkey_val_to_text(&PBUF(buf), c,
- k), buf))) {
- ret = hash_redo_key(desc, trans, h, k_iter, k, hashed);
+ (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf))) {
+ do {
+ ret = hash_redo_key(desc, trans, h, k_iter, k, hashed);
+ } while (ret == -EINTR);
+
if (ret) {
bch_err(c, "hash_redo_key err %i", ret);
return ret;
@@ -376,11 +384,11 @@ static int check_dirent_hash(struct btree_trans *trans, struct hash_check *h,
if (fsck_err(c, "dirent with junk at end, was %s (%zu) now %s (%u)",
buf, strlen(buf), d->v.d_name, len)) {
- bch2_trans_update(trans, iter, &d->k_i);
-
- ret = bch2_trans_commit(trans, NULL, NULL,
- BTREE_INSERT_NOFAIL|
- BTREE_INSERT_LAZY_RW);
+ ret = __bch2_trans_do(trans, NULL, NULL,
+ BTREE_INSERT_NOFAIL|
+ BTREE_INSERT_LAZY_RW,
+ TRANS_RESET_MEM,
+ (bch2_trans_update(trans, iter, &d->k_i, 0), 0));
if (ret)
goto err;
@@ -402,8 +410,11 @@ err_redo:
k->k->p.offset, hash, h->chain->pos.offset,
(bch2_bkey_val_to_text(&PBUF(buf), c,
*k), buf))) {
- ret = hash_redo_key(bch2_dirent_hash_desc, trans,
- h, iter, *k, hash);
+ do {
+ ret = hash_redo_key(bch2_dirent_hash_desc, trans,
+ h, iter, *k, hash);
+ } while (ret == -EINTR);
+
if (ret)
bch_err(c, "hash_redo_key err %i", ret);
else
@@ -646,11 +657,11 @@ retry:
bkey_reassemble(&n->k_i, d.s_c);
n->v.d_type = mode_to_type(target.bi_mode);
- bch2_trans_update(&trans, iter, &n->k_i);
-
- ret = bch2_trans_commit(&trans, NULL, NULL,
- BTREE_INSERT_NOFAIL|
- BTREE_INSERT_LAZY_RW);
+ ret = __bch2_trans_do(&trans, NULL, NULL,
+ BTREE_INSERT_NOFAIL|
+ BTREE_INSERT_LAZY_RW,
+ TRANS_RESET_MEM,
+ (bch2_trans_update(&trans, iter, &n->k_i, 0), 0));
kfree(n);
if (ret)
goto err;
@@ -790,8 +801,7 @@ fsck_err:
create_lostfound:
bch2_inode_init_early(c, lostfound_inode);
- ret = bch2_trans_do(c, NULL,
- BTREE_INSERT_ATOMIC|
+ ret = bch2_trans_do(c, NULL, NULL,
BTREE_INSERT_NOFAIL|
BTREE_INSERT_LAZY_RW,
bch2_create_trans(&trans,
@@ -1114,7 +1124,7 @@ static int check_inode_nlink(struct bch_fs *c,
if (!link->count &&
!(u->bi_flags & BCH_INODE_UNLINKED) &&
- (c->sb.features & (1 << BCH_FEATURE_ATOMIC_NLINK))) {
+ (c->sb.features & (1 << BCH_FEATURE_atomic_nlink))) {
if (fsck_err(c, "unreachable inode %llu not marked as unlinked (type %u)",
u->bi_inum, mode_to_type(u->bi_mode)) ==
FSCK_ERR_IGNORE)
@@ -1149,7 +1159,7 @@ static int check_inode_nlink(struct bch_fs *c,
}
if (i_nlink != real_i_nlink &&
- (c->sb.features & (1 << BCH_FEATURE_ATOMIC_NLINK))) {
+ (c->sb.features & (1 << BCH_FEATURE_atomic_nlink))) {
if (fsck_err(c, "inode %llu has wrong i_nlink "
"(type %u i_nlink %u, should be %u)",
u->bi_inum, mode_to_type(u->bi_mode),
@@ -1261,12 +1271,13 @@ static int check_inode(struct btree_trans *trans,
struct bkey_inode_buf p;
bch2_inode_pack(&p, &u);
- bch2_trans_update(trans, iter, &p.inode.k_i);
- ret = bch2_trans_commit(trans, NULL, NULL,
- BTREE_INSERT_NOFAIL|
- BTREE_INSERT_LAZY_RW);
- if (ret && ret != -EINTR)
+ ret = __bch2_trans_do(trans, NULL, NULL,
+ BTREE_INSERT_NOFAIL|
+ BTREE_INSERT_LAZY_RW,
+ TRANS_RESET_MEM,
+ (bch2_trans_update(trans, iter, &p.inode.k_i, 0), 0));
+ if (ret)
bch_err(c, "error in fsck: error %i "
"updating inode", ret);
}
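
The fsck.c conversions above lean on two small C idioms worth spelling out: __bch2_trans_do() takes an int-valued expression to re-run on each transaction restart, and "(bch2_trans_update(...), 0)" uses the comma operator to turn the update call into an expression that always evaluates to 0, so only the commit's return value is checked. A hypothetical, simplified stand-in for the macro:

    #define trans_do(_trans, _flags, _do)                                   \
    ({                                                                      \
            int _ret;                                                       \
                                                                            \
            do {                                                            \
                    bch2_trans_reset(_trans, TRANS_RESET_MEM);              \
                    _ret = (_do) ?:                                         \
                           bch2_trans_commit(_trans, NULL, NULL, _flags);   \
            } while (_ret == -EINTR);                                       \
                                                                            \
            _ret;                                                           \
    })

    /* usage, as in check_inode() above: */
    ret = trans_do(trans, BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW,
                   (bch2_trans_update(trans, iter, &p.inode.k_i, 0), 0));
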
diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c
index c0642ff46ba0..e811b98d0f03 100644
--- a/fs/bcachefs/inode.c
+++ b/fs/bcachefs/inode.c
@@ -223,7 +223,7 @@ int bch2_inode_write(struct btree_trans *trans,
return PTR_ERR(inode_p);
bch2_inode_pack(inode_p, inode);
- bch2_trans_update(trans, iter, &inode_p->inode.k_i);
+ bch2_trans_update(trans, iter, &inode_p->inode.k_i, 0);
return 0;
}
@@ -411,7 +411,7 @@ again:
inode_u->bi_generation = bkey_generation(k);
bch2_inode_pack(inode_p, inode_u);
- bch2_trans_update(trans, iter, &inode_p->inode.k_i);
+ bch2_trans_update(trans, iter, &inode_p->inode.k_i, 0);
return 0;
}
}
@@ -493,10 +493,9 @@ int bch2_inode_rm(struct bch_fs *c, u64 inode_nr)
delete.v.bi_generation = cpu_to_le32(bi_generation);
}
- bch2_trans_update(&trans, iter, &delete.k_i);
+ bch2_trans_update(&trans, iter, &delete.k_i, 0);
ret = bch2_trans_commit(&trans, NULL, NULL,
- BTREE_INSERT_ATOMIC|
BTREE_INSERT_NOFAIL);
} while (ret == -EINTR);
@@ -533,7 +532,7 @@ int bch2_inode_find_by_inum_trans(struct btree_trans *trans, u64 inode_nr,
int bch2_inode_find_by_inum(struct bch_fs *c, u64 inode_nr,
struct bch_inode_unpacked *inode)
{
- return bch2_trans_do(c, NULL, 0,
+ return bch2_trans_do(c, NULL, NULL, 0,
bch2_inode_find_by_inum_trans(&trans, inode_nr, inode));
}
diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c
index 1d6dd19458ff..4c7dd0994a28 100644
--- a/fs/bcachefs/io.c
+++ b/fs/bcachefs/io.c
@@ -124,10 +124,10 @@ void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw)
void bch2_bio_free_pages_pool(struct bch_fs *c, struct bio *bio)
{
+ struct bvec_iter_all iter;
struct bio_vec *bv;
- unsigned i;
- bio_for_each_segment_all(bv, bio, i)
+ bio_for_each_segment_all(bv, bio, iter)
if (bv->bv_page != ZERO_PAGE(0))
mempool_free(bv->bv_page, &c->bio_bounce_pages);
bio->bi_vcnt = 0;
@@ -292,18 +292,17 @@ int bch2_extent_update(struct btree_trans *trans,
if (delta || new_i_size) {
bch2_inode_pack(&inode_p, &inode_u);
bch2_trans_update(trans, inode_iter,
- &inode_p.inode.k_i);
+ &inode_p.inode.k_i, 0);
}
bch2_trans_iter_put(trans, inode_iter);
}
- bch2_trans_update(trans, iter, k);
+ bch2_trans_update(trans, iter, k, 0);
ret = bch2_trans_commit(trans, disk_res, journal_seq,
BTREE_INSERT_NOCHECK_RW|
BTREE_INSERT_NOFAIL|
- BTREE_INSERT_ATOMIC|
BTREE_INSERT_USE_RESERVE);
if (!ret && i_sectors_delta)
*i_sectors_delta += delta;
@@ -326,6 +325,8 @@ int bch2_fpunch_at(struct btree_trans *trans, struct btree_iter *iter,
bch2_disk_reservation_init(c, 0);
struct bkey_i delete;
+ bch2_trans_reset(trans, TRANS_RESET_MEM);
+
ret = bkey_err(k);
if (ret)
goto btree_err;
@@ -337,8 +338,6 @@ int bch2_fpunch_at(struct btree_trans *trans, struct btree_iter *iter,
bch2_key_resize(&delete.k, max_sectors);
bch2_cut_back(end, &delete);
- bch2_trans_begin_updates(trans);
-
ret = bch2_extent_update(trans, iter, &delete,
&disk_res, journal_seq,
0, i_sectors_delta);
@@ -400,14 +399,14 @@ int bch2_write_index_default(struct bch_write_op *op)
BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
do {
+ bch2_trans_reset(&trans, TRANS_RESET_MEM);
+
k = bch2_keylist_front(keys);
bkey_on_stack_realloc(&sk, c, k->k.u64s);
bkey_copy(sk.k, k);
bch2_cut_front(iter->pos, sk.k);
- bch2_trans_begin_updates(&trans);
-
ret = bch2_extent_update(&trans, iter, sk.k,
&op->res, op_journal_seq(op),
op->new_i_size, &op->i_sectors_delta);
@@ -501,12 +500,13 @@ static void bch2_write_done(struct closure *cl)
bch2_time_stats_update(&c->times[BCH_TIME_data_write], op->start_time);
- if (op->end_io)
+ if (op->end_io) {
+ EBUG_ON(cl->parent);
+ closure_debug_destroy(cl);
op->end_io(op);
- if (cl->parent)
+ } else {
closure_return(cl);
- else
- closure_debug_destroy(cl);
+ }
}
/**
@@ -1139,7 +1139,7 @@ static void bch2_write_data_inline(struct bch_write_op *op, unsigned data_len)
unsigned sectors;
int ret;
- bch2_check_set_feature(op->c, BCH_FEATURE_INLINE_DATA);
+ bch2_check_set_feature(op->c, BCH_FEATURE_inline_data);
ret = bch2_keylist_realloc(&op->insert_keys, op->inline_keys,
ARRAY_SIZE(op->inline_keys),
@@ -1233,12 +1233,14 @@ void bch2_write(struct closure *cl)
err:
if (!(op->flags & BCH_WRITE_NOPUT_RESERVATION))
bch2_disk_reservation_put(c, &op->res);
- if (op->end_io)
+
+ if (op->end_io) {
+ EBUG_ON(cl->parent);
+ closure_debug_destroy(cl);
op->end_io(op);
- if (cl->parent)
+ } else {
closure_return(cl);
- else
- closure_debug_destroy(cl);
+ }
}
/* Cache promotion on read */
@@ -1736,9 +1738,8 @@ retry:
if (!bch2_bkey_narrow_crcs(new.k, new_crc))
goto out;
- bch2_trans_update(&trans, iter, new.k);
+ bch2_trans_update(&trans, iter, new.k, 0);
ret = bch2_trans_commit(&trans, NULL, NULL,
- BTREE_INSERT_ATOMIC|
BTREE_INSERT_NOFAIL|
BTREE_INSERT_NOWAIT);
if (ret == -EINTR)
@@ -1785,7 +1786,7 @@ static void __bch2_read_endio(struct work_struct *work)
crc.offset += rbio->offset_into_extent;
crc.live_size = bvec_iter_sectors(rbio->bvec_iter);
- if (crc.compression_type != BCH_COMPRESSION_NONE) {
+ if (crc.compression_type != BCH_COMPRESSION_TYPE_none) {
bch2_encrypt_bio(c, crc.csum_type, nonce, src);
if (bch2_bio_uncompress(c, src, dst, dst_iter, crc))
goto decompression_err;
@@ -1978,7 +1979,7 @@ int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig,
goto hole;
iter.bi_size = pick.crc.compressed_size << 9;
- goto noclone;
+ goto get_bio;
}
if (!(flags & BCH_READ_LAST_FRAGMENT) ||
@@ -1993,7 +1994,7 @@ int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig,
EBUG_ON(offset_into_extent + bvec_iter_sectors(iter) > k.k->size);
- if (pick.crc.compression_type != BCH_COMPRESSION_NONE ||
+ if (pick.crc.compression_type != BCH_COMPRESSION_TYPE_none ||
(pick.crc.csum_type != BCH_CSUM_NONE &&
(bvec_iter_sectors(iter) != pick.crc.uncompressed_size ||
(bch2_csum_type_is_encryption(pick.crc.csum_type) &&
@@ -2025,7 +2026,7 @@ int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig,
pick.crc.live_size = bvec_iter_sectors(iter);
offset_into_extent = 0;
}
-
+get_bio:
if (rbio) {
/*
* promote already allocated bounce rbio:
@@ -2063,7 +2064,6 @@ int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig,
rbio->bio.bi_iter = iter;
rbio->split = true;
} else {
-noclone:
rbio = orig;
rbio->bio.bi_iter = iter;
EBUG_ON(bio_flagged(&rbio->bio, BIO_CHAIN));
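
The two completion-path hunks in io.c (bch2_write_done() and the bch2_write() error path) encode an ownership rule: when op->end_io is set, the closure has no parent and the callback may free or recycle the op, so closure debug state has to be torn down before end_io runs; the new EBUG_ON() asserts the no-parent half of that rule. A condensed sketch (function name hypothetical):

    static void write_op_done(struct closure *cl)
    {
            struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);

            if (op->end_io) {
                    /* callback owns the op: kill debug state before it's freed */
                    EBUG_ON(cl->parent);
                    closure_debug_destroy(cl);
                    op->end_io(op);
            } else {
                    /* closure came from a parent; hand completion back to it */
                    closure_return(cl);
            }
    }
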
diff --git a/fs/bcachefs/journal_seq_blacklist.c b/fs/bcachefs/journal_seq_blacklist.c
index 787d9f7638d0..a21de0088753 100644
--- a/fs/bcachefs/journal_seq_blacklist.c
+++ b/fs/bcachefs/journal_seq_blacklist.c
@@ -121,7 +121,7 @@ int bch2_journal_seq_blacklist_add(struct bch_fs *c, u64 start, u64 end)
bl->start[nr].end = cpu_to_le64(end);
out_write_sb:
c->disk_sb.sb->features[0] |=
- 1ULL << BCH_FEATURE_JOURNAL_SEQ_BLACKLIST_V3;
+ 1ULL << BCH_FEATURE_journal_seq_blacklist_v3;
ret = bch2_write_super(c);
out:
@@ -309,7 +309,7 @@ void bch2_blacklist_entries_gc(struct work_struct *work)
if (!new_nr)
c->disk_sb.sb->features[0] &=
- ~(1ULL << BCH_FEATURE_JOURNAL_SEQ_BLACKLIST_V3);
+ ~(1ULL << BCH_FEATURE_journal_seq_blacklist_v3);
bch2_write_super(c);
}
diff --git a/fs/bcachefs/migrate.c b/fs/bcachefs/migrate.c
index 4b59dcd04cce..1ef62a189e33 100644
--- a/fs/bcachefs/migrate.c
+++ b/fs/bcachefs/migrate.c
@@ -53,9 +53,6 @@ static int __bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags
while ((k = bch2_btree_iter_peek(iter)).k &&
!(ret = bkey_err(k))) {
if (!bch2_bkey_has_device(k, dev_idx)) {
- ret = bch2_mark_bkey_replicas(c, k);
- if (ret)
- break;
bch2_btree_iter_next(iter);
continue;
}
@@ -76,10 +73,9 @@ static int __bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags
bch2_btree_iter_set_pos(iter, bkey_start_pos(&sk.k->k));
- bch2_trans_update(&trans, iter, sk.k);
+ bch2_trans_update(&trans, iter, sk.k, 0);
ret = bch2_trans_commit(&trans, NULL, NULL,
- BTREE_INSERT_ATOMIC|
BTREE_INSERT_NOFAIL);
/*
@@ -130,34 +126,27 @@ static int bch2_dev_metadata_drop(struct bch_fs *c, unsigned dev_idx, int flags)
struct bkey_i_btree_ptr *new_key;
retry:
if (!bch2_bkey_has_device(bkey_i_to_s_c(&b->key),
- dev_idx)) {
- /*
- * we might have found a btree node key we
- * needed to update, and then tried to update it
- * but got -EINTR after upgrading the iter, but
- * then raced and the node is now gone:
- */
- bch2_btree_iter_downgrade(iter);
-
- ret = bch2_mark_bkey_replicas(c, bkey_i_to_s_c(&b->key));
- if (ret)
- goto err;
- } else {
- bkey_copy(&tmp.k, &b->key);
- new_key = bkey_i_to_btree_ptr(&tmp.k);
-
- ret = drop_dev_ptrs(c, bkey_i_to_s(&new_key->k_i),
- dev_idx, flags, true);
- if (ret)
- goto err;
-
- ret = bch2_btree_node_update_key(c, iter, b, new_key);
- if (ret == -EINTR) {
- b = bch2_btree_iter_peek_node(iter);
- goto retry;
- }
- if (ret)
- goto err;
+ dev_idx))
+ continue;
+
+ bkey_copy(&tmp.k, &b->key);
+ new_key = bkey_i_to_btree_ptr(&tmp.k);
+
+ ret = drop_dev_ptrs(c, bkey_i_to_s(&new_key->k_i),
+ dev_idx, flags, true);
+ if (ret) {
+ bch_err(c, "Cannot drop device without losing data");
+ goto err;
+ }
+
+ ret = bch2_btree_node_update_key(c, iter, b, new_key);
+ if (ret == -EINTR) {
+ b = bch2_btree_iter_peek_node(iter);
+ goto retry;
+ }
+ if (ret) {
+ bch_err(c, "Error updating btree node key: %i", ret);
+ goto err;
}
}
bch2_trans_iter_free(&trans, iter);
@@ -168,9 +157,10 @@ retry:
closure_wait_event(&c->btree_interior_update_wait,
!bch2_btree_interior_updates_nr_pending(c) ||
c->btree_roots_dirty);
+ if (c->btree_roots_dirty)
+ bch2_journal_meta(&c->journal);
if (!bch2_btree_interior_updates_nr_pending(c))
break;
- bch2_journal_meta(&c->journal);
}
ret = 0;
@@ -185,6 +175,5 @@ err:
int bch2_dev_data_drop(struct bch_fs *c, unsigned dev_idx, int flags)
{
return bch2_dev_usrdata_drop(c, dev_idx, flags) ?:
- bch2_dev_metadata_drop(c, dev_idx, flags) ?:
- bch2_replicas_gc2(c);
+ bch2_dev_metadata_drop(c, dev_idx, flags);
}
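
bch2_dev_data_drop() above now reads as a chain of GCC's binary conditional extension, used all through this patch: "a ?: b" evaluates a once and yields it if nonzero (an error code here), otherwise yields b. A plain-C equivalent for reference (function name hypothetical):

    static int dev_data_drop_expanded(struct bch_fs *c, unsigned dev_idx, int flags)
    {
            int ret = bch2_dev_usrdata_drop(c, dev_idx, flags);

            if (ret)
                    return ret;

            return bch2_dev_metadata_drop(c, dev_idx, flags);
    }
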
diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c
index 7a2646b9d622..257e00ae6fa7 100644
--- a/fs/bcachefs/move.c
+++ b/fs/bcachefs/move.c
@@ -150,11 +150,10 @@ static int bch2_migrate_index_update(struct bch_write_op *op)
goto next;
}
- bch2_trans_update(&trans, iter, insert);
+ bch2_trans_update(&trans, iter, insert, 0);
ret = bch2_trans_commit(&trans, &op->res,
op_journal_seq(op),
- BTREE_INSERT_ATOMIC|
BTREE_INSERT_NOFAIL|
BTREE_INSERT_USE_RESERVE|
m->data_opts.btree_insert_flags);
@@ -273,7 +272,7 @@ int bch2_migrate_write_init(struct bch_fs *c, struct migrate_write *m,
bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
if (!p.ptr.cached &&
- p.crc.compression_type != BCH_COMPRESSION_NONE &&
+ p.crc.compression_type != BCH_COMPRESSION_TYPE_none &&
bch2_dev_in_target(c, p.ptr.dev, data_opts.target))
compressed_sectors += p.crc.compressed_size;
@@ -301,12 +300,12 @@ static void move_free(struct closure *cl)
{
struct moving_io *io = container_of(cl, struct moving_io, cl);
struct moving_context *ctxt = io->write.ctxt;
+ struct bvec_iter_all iter;
struct bio_vec *bv;
- unsigned i;
bch2_disk_reservation_put(io->write.op.c, &io->write.op.res);
- bio_for_each_segment_all(bv, &io->write.op.wbio.bio, i)
+ bio_for_each_segment_all(bv, &io->write.op.wbio.bio, iter)
if (bv->bv_page)
__free_page(bv->bv_page);
diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c
index abdeef20fde9..e9cb2304576f 100644
--- a/fs/bcachefs/movinggc.c
+++ b/fs/bcachefs/movinggc.c
@@ -212,14 +212,36 @@ static void bch2_copygc(struct bch_fs *c, struct bch_dev *ca)
buckets_to_move, buckets_not_moved);
}
+/*
+ * Copygc runs when the amount of fragmented data is above some arbitrary
+ * threshold:
+ *
+ * The threshold at the limit - when the device is full - is the amount of space
+ * we reserved in bch2_recalc_capacity; we can't have more than that amount of
+ * disk space stranded due to fragmentation and still store everything we have
+ * promised to store.
+ *
+ * But we don't want to be running copygc unnecessarily when the device still
+ * has plenty of free space - rather, we want copygc to smoothly run every so
+ * often and continually reduce the amount of fragmented space as the device
+ * fills up. So, we increase the threshold by half the current free space.
+ */
+unsigned long bch2_copygc_wait_amount(struct bch_dev *ca)
+{
+ struct bch_fs *c = ca->fs;
+ struct bch_dev_usage usage = bch2_dev_usage_read(c, ca);
+ u64 fragmented_allowed = ca->copygc_threshold +
+ ((__dev_buckets_available(ca, usage) * ca->mi.bucket_size) >> 1);
+
+ return max_t(s64, 0, fragmented_allowed - usage.sectors_fragmented);
+}
+
static int bch2_copygc_thread(void *arg)
{
struct bch_dev *ca = arg;
struct bch_fs *c = ca->fs;
struct io_clock *clock = &c->io_clock[WRITE];
- struct bch_dev_usage usage;
- unsigned long last;
- u64 available, fragmented, reserve, next;
+ unsigned long last, wait;
set_freezable();
@@ -228,28 +250,10 @@ static int bch2_copygc_thread(void *arg)
break;
last = atomic_long_read(&clock->now);
+ wait = bch2_copygc_wait_amount(ca);
- reserve = ca->copygc_threshold;
-
- usage = bch2_dev_usage_read(c, ca);
-
- available = __dev_buckets_available(ca, usage) *
- ca->mi.bucket_size;
- if (available > reserve) {
- next = last + available - reserve;
- bch2_kthread_io_clock_wait(clock, next,
- MAX_SCHEDULE_TIMEOUT);
- continue;
- }
-
- /*
- * don't start copygc until there's more than half the copygc
- * reserve of fragmented space:
- */
- fragmented = usage.sectors_fragmented;
- if (fragmented < reserve) {
- next = last + reserve - fragmented;
- bch2_kthread_io_clock_wait(clock, next,
+ if (wait > clock->max_slop) {
+ bch2_kthread_io_clock_wait(clock, last + wait,
MAX_SCHEDULE_TIMEOUT);
continue;
}
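
To make the new copygc heuristic concrete, here is the bch2_copygc_wait_amount() arithmetic as a standalone userspace program with made-up numbers (all values in 512-byte sectors); the thread then sleeps on the IO clock until that much IO has happened, or runs immediately if the result is within clock->max_slop:

    #include <stdio.h>

    int main(void)
    {
            /* hypothetical device state, in 512-byte sectors */
            unsigned long long copygc_threshold   = 1ULL << 20; /* 512 MiB reserve */
            unsigned long long free_sectors       = 4ULL << 20; /* 2 GiB still free */
            unsigned long long sectors_fragmented = 5ULL << 19; /* 1.25 GiB fragmented */

            /* allowance = reserve + half the remaining free space */
            unsigned long long fragmented_allowed =
                    copygc_threshold + (free_sectors >> 1);

            long long wait = (long long) (fragmented_allowed - sectors_fragmented);
            if (wait < 0)
                    wait = 0;

            /* prints "wait = 524288": copygc sleeps for 256 MiB worth of IO */
            printf("wait = %lld\n", wait);
            return 0;
    }
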
diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c
index cbacd2f36799..94d6c044a27d 100644
--- a/fs/bcachefs/opts.c
+++ b/fs/bcachefs/opts.c
@@ -16,18 +16,24 @@ const char * const bch2_error_actions[] = {
NULL
};
-const char * const bch2_csum_types[] = {
+const char * const bch2_sb_features[] = {
+#define x(f, n) #f,
+ BCH_SB_FEATURES()
+#undef x
+ NULL
+};
+
+const char * const bch2_csum_opts[] = {
"none",
"crc32c",
"crc64",
NULL
};
-const char * const bch2_compression_types[] = {
- "none",
- "lz4",
- "gzip",
- "zstd",
+const char * const bch2_compression_opts[] = {
+#define x(t, n) #t,
+ BCH_COMPRESSION_OPTS()
+#undef x
NULL
};
@@ -300,7 +306,7 @@ int bch2_opt_check_may_set(struct bch_fs *c, int id, u64 v)
break;
case Opt_erasure_code:
if (v)
- bch2_check_set_feature(c, BCH_FEATURE_EC);
+ bch2_check_set_feature(c, BCH_FEATURE_ec);
break;
}
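
bch2_sb_features[] and bch2_compression_opts[] above are generated from list macros instead of being written out by hand, which is what the BCH_FEATURE_FOO to BCH_FEATURE_foo renames elsewhere in the patch are serving: the stringified enum names double as the user-visible option strings. A self-contained sketch of the x-macro pattern, with a hypothetical EXAMPLE_FEATURES() standing in for BCH_SB_FEATURES():

    #define EXAMPLE_FEATURES()              \
            x(atomic_nlink, 0)              \
            x(ec,           1)              \
            x(reflink,      2)

    /* one expansion builds the enum... */
    enum example_feature {
    #define x(f, n) EXAMPLE_FEATURE_##f = n,
            EXAMPLE_FEATURES()
    #undef x
    };

    /* ...a second builds the matching name table, so they can't drift */
    const char * const example_feature_names[] = {
    #define x(f, n) #f,
            EXAMPLE_FEATURES()
    #undef x
            NULL
    };
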
diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h
index 1f11f4152a6f..1c05effa71e6 100644
--- a/fs/bcachefs/opts.h
+++ b/fs/bcachefs/opts.h
@@ -9,8 +9,9 @@
#include "bcachefs_format.h"
extern const char * const bch2_error_actions[];
-extern const char * const bch2_csum_types[];
-extern const char * const bch2_compression_types[];
+extern const char * const bch2_sb_features[];
+extern const char * const bch2_csum_opts[];
+extern const char * const bch2_compression_opts[];
extern const char * const bch2_str_hash_types[];
extern const char * const bch2_data_types[];
extern const char * const bch2_cache_replacement_policies[];
@@ -112,23 +113,23 @@ enum opt_type {
"#", NULL) \
x(metadata_checksum, u8, \
OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
- OPT_STR(bch2_csum_types), \
+ OPT_STR(bch2_csum_opts), \
BCH_SB_META_CSUM_TYPE, BCH_CSUM_OPT_CRC32C, \
NULL, NULL) \
x(data_checksum, u8, \
OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE, \
- OPT_STR(bch2_csum_types), \
+ OPT_STR(bch2_csum_opts), \
BCH_SB_DATA_CSUM_TYPE, BCH_CSUM_OPT_CRC32C, \
NULL, NULL) \
x(compression, u8, \
OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE, \
- OPT_STR(bch2_compression_types), \
- BCH_SB_COMPRESSION_TYPE, BCH_COMPRESSION_OPT_NONE, \
+ OPT_STR(bch2_compression_opts), \
+ BCH_SB_COMPRESSION_TYPE, BCH_COMPRESSION_OPT_none, \
NULL, NULL) \
x(background_compression, u8, \
OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE, \
- OPT_STR(bch2_compression_types), \
- BCH_SB_BACKGROUND_COMPRESSION_TYPE,BCH_COMPRESSION_OPT_NONE, \
+ OPT_STR(bch2_compression_opts), \
+ BCH_SB_BACKGROUND_COMPRESSION_TYPE,BCH_COMPRESSION_OPT_none, \
NULL, NULL) \
x(str_hash, u8, \
OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
diff --git a/fs/bcachefs/quota.c b/fs/bcachefs/quota.c
index 0fa6f33c049b..e7787c5063ce 100644
--- a/fs/bcachefs/quota.c
+++ b/fs/bcachefs/quota.c
@@ -752,7 +752,7 @@ static int bch2_set_quota(struct super_block *sb, struct kqid qid,
if (qdq->d_fieldmask & QC_INO_HARD)
new_quota.v.c[Q_INO].hardlimit = cpu_to_le64(qdq->d_ino_hardlimit);
- bch2_trans_update(&trans, iter, &new_quota.k_i);
+ bch2_trans_update(&trans, iter, &new_quota.k_i, 0);
ret = bch2_trans_commit(&trans, NULL, NULL, 0);
diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c
index 84b3fb6eb101..612385e9d4e4 100644
--- a/fs/bcachefs/rebalance.c
+++ b/fs/bcachefs/rebalance.c
@@ -183,6 +183,8 @@ static int bch2_rebalance_thread(void *arg)
prev_cputime = curr_cputime();
while (!kthread_wait_freezable(r->enabled)) {
+ cond_resched();
+
start = jiffies;
cputime = curr_cputime();
diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c
index e6b51131cff2..8ecd4abc8eeb 100644
--- a/fs/bcachefs/recovery.c
+++ b/fs/bcachefs/recovery.c
@@ -300,31 +300,24 @@ retry:
bch2_cut_front(split_iter->pos, split);
bch2_cut_back(atomic_end, split);
- bch2_trans_update(&trans, split_iter, split);
+ bch2_trans_update(&trans, split_iter, split, !remark
+ ? BTREE_TRIGGER_NORUN
+ : BTREE_TRIGGER_NOOVERWRITES);
bch2_btree_iter_set_pos(iter, split->k.p);
} while (bkey_cmp(iter->pos, k->k.p) < 0);
if (remark) {
ret = bch2_trans_mark_key(&trans, bkey_i_to_s_c(k),
0, -((s64) k->k.size),
- BCH_BUCKET_MARK_OVERWRITE) ?:
- bch2_trans_commit(&trans, &disk_res, NULL,
- BTREE_INSERT_ATOMIC|
- BTREE_INSERT_NOFAIL|
- BTREE_INSERT_LAZY_RW|
- BTREE_INSERT_NOMARK_OVERWRITES|
- BTREE_INSERT_NO_CLEAR_REPLICAS);
- } else {
- ret = bch2_trans_commit(&trans, &disk_res, NULL,
- BTREE_INSERT_ATOMIC|
- BTREE_INSERT_NOFAIL|
- BTREE_INSERT_LAZY_RW|
- BTREE_INSERT_JOURNAL_REPLAY|
- BTREE_INSERT_NOMARK);
+ BTREE_TRIGGER_OVERWRITE);
+ if (ret)
+ goto err;
}
- if (ret)
- goto err;
+ ret = bch2_trans_commit(&trans, &disk_res, NULL,
+ BTREE_INSERT_NOFAIL|
+ BTREE_INSERT_LAZY_RW|
+ BTREE_INSERT_JOURNAL_REPLAY);
err:
if (ret == -EINTR)
goto retry;
@@ -334,6 +327,30 @@ err:
return bch2_trans_exit(&trans) ?: ret;
}
+static int __bch2_journal_replay_key(struct btree_trans *trans,
+ enum btree_id id, struct bkey_i *k)
+{
+ struct btree_iter *iter;
+
+ iter = bch2_trans_get_iter(trans, id, bkey_start_pos(&k->k),
+ BTREE_ITER_INTENT);
+ if (IS_ERR(iter))
+ return PTR_ERR(iter);
+
+ bch2_trans_update(trans, iter, k, BTREE_TRIGGER_NORUN);
+ return 0;
+}
+
+static int bch2_journal_replay_key(struct bch_fs *c, enum btree_id id,
+ struct bkey_i *k)
+{
+ return bch2_trans_do(c, NULL, NULL,
+ BTREE_INSERT_NOFAIL|
+ BTREE_INSERT_LAZY_RW|
+ BTREE_INSERT_JOURNAL_REPLAY,
+ __bch2_journal_replay_key(&trans, id, k));
+}
+
static int bch2_journal_replay(struct bch_fs *c,
struct journal_keys keys)
{
@@ -351,12 +368,7 @@ static int bch2_journal_replay(struct bch_fs *c,
else if (btree_node_type_is_extents(i->btree_id))
ret = bch2_extent_replay_key(c, i->btree_id, i->k);
else
- ret = bch2_btree_insert(c, i->btree_id, i->k,
- NULL, NULL,
- BTREE_INSERT_NOFAIL|
- BTREE_INSERT_LAZY_RW|
- BTREE_INSERT_JOURNAL_REPLAY|
- BTREE_INSERT_NOMARK);
+ ret = bch2_journal_replay_key(c, i->btree_id, i->k);
if (ret) {
bch_err(c, "journal replay: error %d while replaying key",
@@ -869,7 +881,7 @@ int bch2_fs_recovery(struct bch_fs *c)
}
if (!c->sb.clean) {
- if (!(c->sb.features & (1 << BCH_FEATURE_ATOMIC_NLINK))) {
+ if (!(c->sb.features & (1 << BCH_FEATURE_atomic_nlink))) {
bch_info(c, "checking inode link counts");
err = "error in recovery";
ret = bch2_fsck_inode_nlink(c);
@@ -910,6 +922,8 @@ int bch2_fs_recovery(struct bch_fs *c)
c->disk_sb.sb->version_min =
le16_to_cpu(bcachefs_metadata_version_min);
c->disk_sb.sb->version = le16_to_cpu(bcachefs_metadata_version_current);
+ c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_new_siphash;
+ c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_new_extent_overwrite;
write_sb = true;
}
@@ -920,7 +934,7 @@ int bch2_fs_recovery(struct bch_fs *c)
if (c->opts.fsck &&
!test_bit(BCH_FS_ERROR, &c->flags)) {
- c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_ATOMIC_NLINK;
+ c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_atomic_nlink;
SET_BCH_SB_HAS_ERRORS(c->disk_sb.sb, 0);
write_sb = true;
}
@@ -989,11 +1003,6 @@ int bch2_fs_initialize(struct bch_fs *c)
bch2_fs_journal_start(&c->journal, 1, &journal);
bch2_journal_set_replay_done(&c->journal);
- err = "error going read write";
- ret = __bch2_fs_read_write(c, true);
- if (ret)
- goto err;
-
bch2_inode_init(c, &root_inode, 0, 0,
S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO, 0, NULL);
root_inode.bi_inum = BCACHEFS_ROOT_INO;
@@ -1002,14 +1011,14 @@ int bch2_fs_initialize(struct bch_fs *c)
err = "error creating root directory";
ret = bch2_btree_insert(c, BTREE_ID_INODES,
&packed_inode.inode.k_i,
- NULL, NULL, 0);
+ NULL, NULL, BTREE_INSERT_LAZY_RW);
if (ret)
goto err;
bch2_inode_init_early(c, &lostfound_inode);
err = "error creating lost+found";
- ret = bch2_trans_do(c, NULL, BTREE_INSERT_ATOMIC,
+ ret = bch2_trans_do(c, NULL, NULL, 0,
bch2_create_trans(&trans, BCACHEFS_ROOT_INO,
&root_inode, &lostfound_inode,
&lostfound,
@@ -1032,7 +1041,9 @@ int bch2_fs_initialize(struct bch_fs *c)
mutex_lock(&c->sb_lock);
c->disk_sb.sb->version = c->disk_sb.sb->version_min =
le16_to_cpu(bcachefs_metadata_version_current);
- c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_ATOMIC_NLINK;
+ c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_atomic_nlink;
+ c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_new_siphash;
+ c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_new_extent_overwrite;
SET_BCH_SB_INITIALIZED(c->disk_sb.sb, true);
SET_BCH_SB_CLEAN(c->disk_sb.sb, false);
diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c
index 53bd0e0ea058..3b8c74ca3725 100644
--- a/fs/bcachefs/reflink.c
+++ b/fs/bcachefs/reflink.c
@@ -115,7 +115,7 @@ static int bch2_make_extent_indirect(struct btree_trans *trans,
r_v->v.refcount = 0;
memcpy(r_v->v.start, e->v.start, bkey_val_bytes(&e->k));
- bch2_trans_update(trans, reflink_iter, &r_v->k_i);
+ bch2_trans_update(trans, reflink_iter, &r_v->k_i, 0);
r_p = bch2_trans_kmalloc(trans, sizeof(*r_p));
if (IS_ERR(r_p))
@@ -126,7 +126,7 @@ static int bch2_make_extent_indirect(struct btree_trans *trans,
set_bkey_val_bytes(&r_p->k, sizeof(r_p->v));
r_p->v.idx = cpu_to_le64(bkey_start_offset(&r_v->k));
- bch2_trans_update(trans, extent_iter, &r_p->k_i);
+ bch2_trans_update(trans, extent_iter, &r_p->k_i, 0);
err:
if (!IS_ERR(reflink_iter)) {
c->reflink_hint = reflink_iter->pos.offset;
@@ -171,7 +171,7 @@ s64 bch2_remap_range(struct bch_fs *c,
if (!percpu_ref_tryget(&c->writes))
return -EROFS;
- bch2_check_set_feature(c, BCH_FEATURE_REFLINK);
+ bch2_check_set_feature(c, BCH_FEATURE_reflink);
dst_end.offset += remap_sectors;
src_end.offset += remap_sectors;
@@ -185,7 +185,8 @@ s64 bch2_remap_range(struct bch_fs *c,
BTREE_ITER_INTENT);
while (1) {
- bch2_trans_begin_updates(&trans);
+ bch2_trans_reset(&trans, TRANS_RESET_MEM);
+
trans.mem_top = 0;
if (fatal_signal_pending(current)) {
@@ -287,8 +288,7 @@ err:
inode_u.bi_size < new_i_size) {
inode_u.bi_size = new_i_size;
ret2 = bch2_inode_write(&trans, inode_iter, &inode_u) ?:
- bch2_trans_commit(&trans, NULL, journal_seq,
- BTREE_INSERT_ATOMIC);
+ bch2_trans_commit(&trans, NULL, journal_seq, 0);
}
} while (ret2 == -EINTR);
diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c
index cb5ebb87c701..366888b1b36d 100644
--- a/fs/bcachefs/replicas.c
+++ b/fs/bcachefs/replicas.c
@@ -84,10 +84,10 @@ static void extent_to_replicas(struct bkey_s_c k,
if (p.ptr.cached)
continue;
- if (p.has_ec)
+ if (!p.has_ec)
+ r->devs[r->nr_devs++] = p.ptr.dev;
+ else
r->nr_required = 0;
-
- r->devs[r->nr_devs++] = p.ptr.dev;
}
}
diff --git a/fs/bcachefs/replicas.h b/fs/bcachefs/replicas.h
index 0d6e19126021..8527d82841bb 100644
--- a/fs/bcachefs/replicas.h
+++ b/fs/bcachefs/replicas.h
@@ -72,9 +72,6 @@ int bch2_replicas_set_usage(struct bch_fs *,
/* iterate over superblock replicas - used by userspace tools: */
-#define replicas_entry_bytes(_i) \
- (offsetof(typeof(*(_i)), devs) + (_i)->nr_devs)
-
#define replicas_entry_next(_i) \
((typeof(_i)) ((void *) (_i) + replicas_entry_bytes(_i)))
diff --git a/fs/bcachefs/str_hash.h b/fs/bcachefs/str_hash.h
index 582e718b6bd1..f2779159a6b8 100644
--- a/fs/bcachefs/str_hash.h
+++ b/fs/bcachefs/str_hash.h
@@ -23,7 +23,7 @@ bch2_str_hash_opt_to_type(struct bch_fs *c, enum bch_str_hash_opts opt)
case BCH_STR_HASH_OPT_CRC64:
return BCH_STR_HASH_CRC64;
case BCH_STR_HASH_OPT_SIPHASH:
- return c->sb.features & (1ULL << BCH_FEATURE_NEW_SIPHASH)
+ return c->sb.features & (1ULL << BCH_FEATURE_new_siphash)
? BCH_STR_HASH_SIPHASH
: BCH_STR_HASH_SIPHASH_OLD;
default:
@@ -281,7 +281,7 @@ not_found:
swap(iter, slot);
insert->k.p = iter->pos;
- bch2_trans_update(trans, iter, insert);
+ bch2_trans_update(trans, iter, insert, 0);
}
goto out;
@@ -308,7 +308,7 @@ int bch2_hash_delete_at(struct btree_trans *trans,
delete->k.p = iter->pos;
delete->k.type = ret ? KEY_TYPE_whiteout : KEY_TYPE_deleted;
- bch2_trans_update(trans, iter, delete);
+ bch2_trans_update(trans, iter, delete, 0);
return 0;
}
diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c
index daaeaf0446a3..43927853210a 100644
--- a/fs/bcachefs/super-io.c
+++ b/fs/bcachefs/super-io.c
@@ -51,7 +51,9 @@ static struct bch_sb_field *__bch2_sb_field_resize(struct bch_sb_handle *sb,
BUG_ON(get_order(__vstruct_bytes(struct bch_sb, sb_u64s)) >
sb->page_order);
- if (!f) {
+ if (!f && !u64s) {
+ /* nothing to do: */
+ } else if (!f) {
f = vstruct_last(sb->sb);
memset(f, 0, sizeof(u64) * u64s);
f->u64s = cpu_to_le32(u64s);
diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c
index 17bdf985559c..38920fff4500 100644
--- a/fs/bcachefs/super.c
+++ b/fs/bcachefs/super.c
@@ -404,7 +404,7 @@ static int bch2_fs_read_write_late(struct bch_fs *c)
return 0;
}
-int __bch2_fs_read_write(struct bch_fs *c, bool early)
+static int __bch2_fs_read_write(struct bch_fs *c, bool early)
{
struct bch_dev *ca;
unsigned i;
@@ -735,9 +735,9 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
if (bch2_fs_init_fault("fs_alloc"))
goto err;
- iter_size = sizeof(struct btree_node_iter_large) +
+ iter_size = sizeof(struct sort_iter) +
(btree_blocks(c) + 1) * 2 *
- sizeof(struct btree_node_iter_set);
+ sizeof(struct sort_iter_set);
if (!(c->wq = alloc_workqueue("bcachefs",
WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 1)) ||
@@ -1416,7 +1416,11 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags)
mutex_lock(&c->state_lock);
- percpu_ref_put(&ca->ref); /* XXX */
+ /*
+ * We consume a reference to ca->ref, regardless of whether we succeed
+ * or fail:
+ */
+ percpu_ref_put(&ca->ref);
if (!bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_FAILED, flags)) {
bch_err(ca, "Cannot remove without losing data");
@@ -1425,11 +1429,6 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags)
__bch2_dev_read_only(c, ca);
- /*
- * XXX: verify that dev_idx is really not in use anymore, anywhere
- *
- * flag_data_bad() does not check btree pointers
- */
ret = bch2_dev_data_drop(c, ca->dev_idx, flags);
if (ret) {
bch_err(ca, "Remove failed: error %i dropping data", ret);
@@ -1442,17 +1441,6 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags)
goto err;
}
- data = bch2_dev_has_data(c, ca);
- if (data) {
- char data_has_str[100];
-
- bch2_flags_to_text(&PBUF(data_has_str),
- bch2_data_types, data);
- bch_err(ca, "Remove failed, still has data (%s)", data_has_str);
- ret = -EBUSY;
- goto err;
- }
-
ret = bch2_btree_delete_range(c, BTREE_ID_ALLOC,
POS(ca->dev_idx, 0),
POS(ca->dev_idx + 1, 0),
@@ -1467,12 +1455,33 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags)
* (overwritten) keys that point to the device we're removing:
*/
bch2_journal_flush_all_pins(&c->journal);
+ /*
+	 * hack to ensure bch2_replicas_gc2() clears out entries pointing to this device
+ */
+ bch2_journal_meta(&c->journal);
ret = bch2_journal_error(&c->journal);
if (ret) {
bch_err(ca, "Remove failed, journal error");
goto err;
}
+ ret = bch2_replicas_gc2(c);
+ if (ret) {
+ bch_err(ca, "Remove failed: error %i from replicas gc", ret);
+ goto err;
+ }
+
+ data = bch2_dev_has_data(c, ca);
+ if (data) {
+ char data_has_str[100];
+
+ bch2_flags_to_text(&PBUF(data_has_str),
+ bch2_data_types, data);
+ bch_err(ca, "Remove failed, still has data (%s)", data_has_str);
+ ret = -EBUSY;
+ goto err;
+ }
+
__bch2_dev_offline(c, ca);
mutex_lock(&c->sb_lock);
diff --git a/fs/bcachefs/super.h b/fs/bcachefs/super.h
index 41992e891391..4aa5dd7917cf 100644
--- a/fs/bcachefs/super.h
+++ b/fs/bcachefs/super.h
@@ -219,7 +219,6 @@ struct bch_dev *bch2_dev_lookup(struct bch_fs *, const char *);
bool bch2_fs_emergency_read_only(struct bch_fs *);
void bch2_fs_read_only(struct bch_fs *);
-int __bch2_fs_read_write(struct bch_fs *, bool);
int bch2_fs_read_write(struct bch_fs *);
int bch2_fs_read_write_early(struct bch_fs *);
diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c
index e7699afd99fc..602def1ee95a 100644
--- a/fs/bcachefs/sysfs.c
+++ b/fs/bcachefs/sysfs.c
@@ -18,6 +18,7 @@
#include "btree_update_interior.h"
#include "btree_gc.h"
#include "buckets.h"
+#include "clock.h"
#include "disk_groups.h"
#include "ec.h"
#include "inode.h"
@@ -198,6 +199,9 @@ rw_attribute(pd_controllers_update_seconds);
read_attribute(meta_replicas_have);
read_attribute(data_replicas_have);
+read_attribute(io_timers_read);
+read_attribute(io_timers_write);
+
#ifdef CONFIG_BCACHEFS_TESTS
write_attribute(perf_test);
#endif /* CONFIG_BCACHEFS_TESTS */
@@ -272,7 +276,7 @@ static ssize_t bch2_compression_stats(struct bch_fs *c, char *buf)
struct extent_ptr_decoded p;
extent_for_each_ptr_decode(e, p, entry) {
- if (p.crc.compression_type == BCH_COMPRESSION_NONE) {
+ if (p.crc.compression_type == BCH_COMPRESSION_TYPE_none) {
nr_uncompressed_extents++;
uncompressed_sectors += e.k->size;
} else {
@@ -404,6 +408,11 @@ SHOW(bch2_fs)
if (attr == &sysfs_new_stripes)
return bch2_new_stripes(c, buf);
+ if (attr == &sysfs_io_timers_read)
+ return bch2_io_timers_show(&c->io_clock[READ], buf);
+ if (attr == &sysfs_io_timers_write)
+ return bch2_io_timers_show(&c->io_clock[WRITE], buf);
+
#define BCH_DEBUG_PARAM(name, description) sysfs_print(name, c->name);
BCH_DEBUG_PARAMS()
#undef BCH_DEBUG_PARAM
@@ -581,6 +590,9 @@ struct attribute *bch2_fs_internal_files[] = {
&sysfs_new_stripes,
+ &sysfs_io_timers_read,
+ &sysfs_io_timers_write,
+
&sysfs_internal_uuid,
#define BCH_DEBUG_PARAM(name, description) &sysfs_##name,
@@ -904,8 +916,6 @@ SHOW(bch2_dev)
bch2_disk_path_to_text(&out, &c->disk_sb,
ca->mi.group - 1);
mutex_unlock(&c->sb_lock);
- } else {
- pr_buf(&out, "none");
}
pr_buf(&out, "\n");
diff --git a/fs/bcachefs/tests.c b/fs/bcachefs/tests.c
index 724f41e6590c..8f9b0cca17da 100644
--- a/fs/bcachefs/tests.c
+++ b/fs/bcachefs/tests.c
@@ -43,7 +43,7 @@ static void test_delete(struct bch_fs *c, u64 nr)
ret = bch2_btree_iter_traverse(iter);
BUG_ON(ret);
- bch2_trans_update(&trans, iter, &k.k_i);
+ bch2_trans_update(&trans, iter, &k.k_i, 0);
ret = bch2_trans_commit(&trans, NULL, NULL, 0);
BUG_ON(ret);
@@ -75,7 +75,7 @@ static void test_delete_written(struct bch_fs *c, u64 nr)
ret = bch2_btree_iter_traverse(iter);
BUG_ON(ret);
- bch2_trans_update(&trans, iter, &k.k_i);
+ bch2_trans_update(&trans, iter, &k.k_i, 0);
ret = bch2_trans_commit(&trans, NULL, NULL, 0);
BUG_ON(ret);
@@ -465,7 +465,7 @@ static void rand_mixed(struct bch_fs *c, u64 nr)
bkey_cookie_init(&k.k_i);
k.k.p = iter->pos;
- bch2_trans_update(&trans, iter, &k.k_i);
+ bch2_trans_update(&trans, iter, &k.k_i, 0);
ret = bch2_trans_commit(&trans, NULL, NULL, 0);
BUG_ON(ret);
}
@@ -509,7 +509,7 @@ static void seq_insert(struct bch_fs *c, u64 nr)
BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) {
insert.k.p = iter->pos;
- bch2_trans_update(&trans, iter, &insert.k_i);
+ bch2_trans_update(&trans, iter, &insert.k_i, 0);
ret = bch2_trans_commit(&trans, NULL, NULL, 0);
BUG_ON(ret);
@@ -548,7 +548,7 @@ static void seq_overwrite(struct bch_fs *c, u64 nr)
bkey_reassemble(&u.k_i, k);
- bch2_trans_update(&trans, iter, &u.k_i);
+ bch2_trans_update(&trans, iter, &u.k_i, 0);
ret = bch2_trans_commit(&trans, NULL, NULL, 0);
BUG_ON(ret);
}
diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h
index 2b19a0038045..0128daba5970 100644
--- a/fs/bcachefs/util.h
+++ b/fs/bcachefs/util.h
@@ -664,35 +664,6 @@ static inline void memset_u64s_tail(void *s, int c, unsigned bytes)
memset(s + bytes, c, rem);
}
-static inline struct bio_vec next_contig_bvec(struct bio *bio,
- struct bvec_iter *iter)
-{
- struct bio_vec bv = bio_iter_iovec(bio, *iter);
-
- bio_advance_iter(bio, iter, bv.bv_len);
-#ifndef CONFIG_HIGHMEM
- while (iter->bi_size) {
- struct bio_vec next = bio_iter_iovec(bio, *iter);
-
- if (page_address(bv.bv_page) + bv.bv_offset + bv.bv_len !=
- page_address(next.bv_page) + next.bv_offset)
- break;
-
- bv.bv_len += next.bv_len;
- bio_advance_iter(bio, iter, next.bv_len);
- }
-#endif
- return bv;
-}
-
-#define __bio_for_each_contig_segment(bv, bio, iter, start) \
- for (iter = (start); \
- (iter).bi_size && \
- ((bv = next_contig_bvec((bio), &(iter))), 1);)
-
-#define bio_for_each_contig_segment(bv, bio, iter) \
- __bio_for_each_contig_segment(bv, bio, iter, (bio)->bi_iter)
-
void sort_cmp_size(void *base, size_t num, size_t size,
int (*cmp_func)(const void *, const void *, size_t),
void (*swap_func)(void *, void *, size_t));
diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c
index 9b8f6f1f9a77..725a6f3ef8ce 100644
--- a/fs/bcachefs/xattr.c
+++ b/fs/bcachefs/xattr.c
@@ -327,7 +327,7 @@ static int bch2_xattr_set_handler(const struct xattr_handler *handler,
struct bch_inode_info *inode = to_bch_ei(vinode);
struct bch_fs *c = inode->v.i_sb->s_fs_info;
- return bch2_trans_do(c, &inode->ei_journal_seq, BTREE_INSERT_ATOMIC,
+ return bch2_trans_do(c, NULL, &inode->ei_journal_seq, 0,
bch2_xattr_set(&trans, inode->v.i_ino,
&inode->ei_str_hash,
name, value, size,