author     Kent Overstreet <kent.overstreet@gmail.com>   2019-11-29 12:43:48 -0500
committer  Kent Overstreet <kent.overstreet@gmail.com>   2020-05-06 17:14:17 -0400
commit     246252b293d2d2c9e8143303e508167b63a8d95b (patch)
tree       3f451c6dbafabe595690a37444d28e8dffadc093
parent     7d52af47abc1e3c924ba29a12149d7a527b9236d (diff)
Merge with 131853c881 bcachefs: Switch to macro for bkey_ops
-rw-r--r--  fs/bcachefs/Kconfig                       1
-rw-r--r--  fs/bcachefs/Makefile                      2
-rw-r--r--  fs/bcachefs/acl.c                        53
-rw-r--r--  fs/bcachefs/alloc_background.c           12
-rw-r--r--  fs/bcachefs/bcachefs.h                    7
-rw-r--r--  fs/bcachefs/bcachefs_format.h            30
-rw-r--r--  fs/bcachefs/bkey.c                       28
-rw-r--r--  fs/bcachefs/bkey.h                       15
-rw-r--r--  fs/bcachefs/bkey_methods.c               22
-rw-r--r--  fs/bcachefs/bkey_on_stack.h              43
-rw-r--r--  fs/bcachefs/bkey_sort.c                  32
-rw-r--r--  fs/bcachefs/bset.c                      402
-rw-r--r--  fs/bcachefs/bset.h                       21
-rw-r--r--  fs/bcachefs/btree_cache.c                14
-rw-r--r--  fs/bcachefs/btree_gc.c                   35
-rw-r--r--  fs/bcachefs/btree_io.c                   60
-rw-r--r--  fs/bcachefs/btree_io.h                    6
-rw-r--r--  fs/bcachefs/btree_iter.c                315
-rw-r--r--  fs/bcachefs/btree_iter.h                 58
-rw-r--r--  fs/bcachefs/btree_locking.h              20
-rw-r--r--  fs/bcachefs/btree_types.h                73
-rw-r--r--  fs/bcachefs/btree_update.h               76
-rw-r--r--  fs/bcachefs/btree_update_interior.c      50
-rw-r--r--  fs/bcachefs/btree_update_interior.h       6
-rw-r--r--  fs/bcachefs/btree_update_leaf.c         672
-rw-r--r--  fs/bcachefs/buckets.c                   259
-rw-r--r--  fs/bcachefs/buckets.h                    12
-rw-r--r--  fs/bcachefs/buckets_types.h               6
-rw-r--r--  fs/bcachefs/checksum.c                   32
-rw-r--r--  fs/bcachefs/checksum.h                    6
-rw-r--r--  fs/bcachefs/clock.c                       7
-rw-r--r--  fs/bcachefs/clock.h                      13
-rw-r--r--  fs/bcachefs/compress.c                    2
-rw-r--r--  fs/bcachefs/dirent.c                    122
-rw-r--r--  fs/bcachefs/dirent.h                     29
-rw-r--r--  fs/bcachefs/ec.c                         72
-rw-r--r--  fs/bcachefs/error.c                      13
-rw-r--r--  fs/bcachefs/error.h                       1
-rw-r--r--  fs/bcachefs/extent_update.c             531
-rw-r--r--  fs/bcachefs/extent_update.h              18
-rw-r--r--  fs/bcachefs/extents.c                  2089
-rw-r--r--  fs/bcachefs/extents.h                   289
-rw-r--r--  fs/bcachefs/extents_types.h               4
-rw-r--r--  fs/bcachefs/fs-common.c                 281
-rw-r--r--  fs/bcachefs/fs-common.h                  36
-rw-r--r--  fs/bcachefs/fs-io.c                    1452
-rw-r--r--  fs/bcachefs/fs-io.h                      18
-rw-r--r--  fs/bcachefs/fs-ioctl.c                   10
-rw-r--r--  fs/bcachefs/fs.c                        772
-rw-r--r--  fs/bcachefs/fs.h                         56
-rw-r--r--  fs/bcachefs/fsck.c                      109
-rw-r--r--  fs/bcachefs/inode.c                     118
-rw-r--r--  fs/bcachefs/inode.h                      73
-rw-r--r--  fs/bcachefs/io.c                        597
-rw-r--r--  fs/bcachefs/io.h                         15
-rw-r--r--  fs/bcachefs/io_types.h                    5
-rw-r--r--  fs/bcachefs/journal.c                     2
-rw-r--r--  fs/bcachefs/journal.h                     2
-rw-r--r--  fs/bcachefs/journal_io.c                  2
-rw-r--r--  fs/bcachefs/migrate.c                    16
-rw-r--r--  fs/bcachefs/move.c                       36
-rw-r--r--  fs/bcachefs/movinggc.c                    4
-rw-r--r--  fs/bcachefs/opts.h                       21
-rw-r--r--  fs/bcachefs/quota.c                       2
-rw-r--r--  fs/bcachefs/recovery.c                   52
-rw-r--r--  fs/bcachefs/reflink.c                   100
-rw-r--r--  fs/bcachefs/reflink.h                     6
-rw-r--r--  fs/bcachefs/replicas.c                    4
-rw-r--r--  fs/bcachefs/str_hash.h                   74
-rw-r--r--  fs/bcachefs/super-io.c                   34
-rw-r--r--  fs/bcachefs/super.c                       3
-rw-r--r--  fs/bcachefs/sysfs.c                       4
-rw-r--r--  fs/bcachefs/tests.c                      10
-rw-r--r--  fs/bcachefs/util.c                        2
-rw-r--r--  fs/bcachefs/util.h                       59
75 files changed, 4748 insertions, 4785 deletions
diff --git a/fs/bcachefs/Kconfig b/fs/bcachefs/Kconfig
index e695ab786f80..10abddae6a80 100644
--- a/fs/bcachefs/Kconfig
+++ b/fs/bcachefs/Kconfig
@@ -2,7 +2,6 @@
config BCACHEFS_FS
tristate "bcachefs filesystem support"
depends on BLOCK
- depends on (64BIT || LBDAF)
select EXPORTFS
select CLOSURES
select LIBCRC32C
diff --git a/fs/bcachefs/Makefile b/fs/bcachefs/Makefile
index 414ea2a74a5a..c7727d05cf49 100644
--- a/fs/bcachefs/Makefile
+++ b/fs/bcachefs/Makefile
@@ -26,7 +26,9 @@ bcachefs-y := \
ec.o \
error.o \
extents.o \
+ extent_update.o \
fs.o \
+ fs-common.o \
fs-ioctl.o \
fs-io.o \
fsck.o \
diff --git a/fs/bcachefs/acl.c b/fs/bcachefs/acl.c
index 59d4af1326ee..dcd0dfe87b51 100644
--- a/fs/bcachefs/acl.c
+++ b/fs/bcachefs/acl.c
@@ -280,49 +280,52 @@ int bch2_set_acl_trans(struct btree_trans *trans,
return ret == -ENOENT ? 0 : ret;
}
-static int inode_update_for_set_acl_fn(struct bch_inode_info *inode,
- struct bch_inode_unpacked *bi,
- void *p)
-{
- struct bch_fs *c = inode->v.i_sb->s_fs_info;
- umode_t mode = (unsigned long) p;
-
- bi->bi_ctime = bch2_current_time(c);
- bi->bi_mode = mode;
- return 0;
-}
-
-int bch2_set_acl(struct inode *vinode, struct posix_acl *acl, int type)
+int bch2_set_acl(struct inode *vinode, struct posix_acl *_acl, int type)
{
struct bch_inode_info *inode = to_bch_ei(vinode);
struct bch_fs *c = inode->v.i_sb->s_fs_info;
struct btree_trans trans;
+ struct btree_iter *inode_iter;
struct bch_inode_unpacked inode_u;
- umode_t mode = inode->v.i_mode;
+ struct posix_acl *acl;
+ umode_t mode;
int ret;
mutex_lock(&inode->ei_update_lock);
bch2_trans_init(&trans, c, 0, 0);
+retry:
+ bch2_trans_begin(&trans);
+ acl = _acl;
- if (type == ACL_TYPE_ACCESS && acl) {
+ inode_iter = bch2_inode_peek(&trans, &inode_u, inode->v.i_ino,
+ BTREE_ITER_INTENT);
+ ret = PTR_ERR_OR_ZERO(inode_iter);
+ if (ret)
+ goto btree_err;
+
+ mode = inode_u.bi_mode;
+
+ if (type == ACL_TYPE_ACCESS) {
ret = posix_acl_update_mode(&inode->v, &mode, &acl);
if (ret)
goto err;
}
-retry:
- bch2_trans_begin(&trans);
- ret = bch2_set_acl_trans(&trans,
- &inode->ei_inode,
- &inode->ei_str_hash,
- acl, type) ?:
- bch2_write_inode_trans(&trans, inode, &inode_u,
- inode_update_for_set_acl_fn,
- (void *)(unsigned long) mode) ?:
+ ret = bch2_set_acl_trans(&trans, &inode_u,
+ &inode->ei_str_hash,
+ acl, type);
+ if (ret)
+ goto btree_err;
+
+ inode_u.bi_ctime = bch2_current_time(c);
+ inode_u.bi_mode = mode;
+
+ ret = bch2_inode_write(&trans, inode_iter, &inode_u) ?:
bch2_trans_commit(&trans, NULL,
&inode->ei_journal_seq,
BTREE_INSERT_ATOMIC|
BTREE_INSERT_NOUNLOCK);
+btree_err:
if (ret == -EINTR)
goto retry;
if (unlikely(ret))
@@ -375,7 +378,7 @@ int bch2_acl_chmod(struct btree_trans *trans,
}
new->k.p = iter->pos;
- bch2_trans_update(trans, BTREE_INSERT_ENTRY(iter, &new->k_i));
+ bch2_trans_update(trans, iter, &new->k_i);
*new_acl = acl;
acl = NULL;
err:
diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c
index 9814179a6406..e252a039dc2b 100644
--- a/fs/bcachefs/alloc_background.c
+++ b/fs/bcachefs/alloc_background.c
@@ -152,6 +152,7 @@ void bch2_alloc_pack(struct bkey_i_alloc *dst,
{
unsigned idx = 0;
void *d = dst->v.data;
+ unsigned bytes;
dst->v.fields = 0;
dst->v.gen = src.gen;
@@ -160,7 +161,9 @@ void bch2_alloc_pack(struct bkey_i_alloc *dst,
BCH_ALLOC_FIELDS()
#undef x
- set_bkey_val_bytes(&dst->k, (void *) d - (void *) &dst->v);
+ bytes = (void *) d - (void *) &dst->v;
+ set_bkey_val_bytes(&dst->k, bytes);
+ memset_u64s_tail(&dst->v, 0, bytes);
}
static unsigned bch_alloc_val_u64s(const struct bch_alloc *a)
@@ -311,7 +314,7 @@ retry:
a->k.p = iter->pos;
bch2_alloc_pack(a, new_u);
- bch2_trans_update(trans, BTREE_INSERT_ENTRY(iter, &a->k_i));
+ bch2_trans_update(trans, iter, &a->k_i);
ret = bch2_trans_commit(trans, NULL, NULL,
BTREE_INSERT_ATOMIC|
BTREE_INSERT_NOFAIL|
@@ -899,7 +902,7 @@ retry:
a->k.p = iter->pos;
bch2_alloc_pack(a, u);
- bch2_trans_update(trans, BTREE_INSERT_ENTRY(iter, &a->k_i));
+ bch2_trans_update(trans, iter, &a->k_i);
/*
* XXX:
@@ -1438,6 +1441,9 @@ again:
cond_resched();
nodes_unwritten = false;
+ if (bch2_journal_error(&c->journal))
+ return true;
+
rcu_read_lock();
for_each_cached_btree(b, c, tbl, i, pos)
if (btree_node_need_write(b)) {
diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h
index 033b73821fdb..9b186872c129 100644
--- a/fs/bcachefs/bcachefs.h
+++ b/fs/bcachefs/bcachefs.h
@@ -299,7 +299,6 @@ do { \
x(btree_node_sort) \
x(btree_node_read) \
x(btree_gc) \
- x(btree_update) \
x(btree_lock_contended_read) \
x(btree_lock_contended_intent) \
x(btree_lock_contended_write) \
@@ -426,7 +425,6 @@ struct bch_dev {
*/
alloc_fifo free[RESERVE_NR];
alloc_fifo free_inc;
- spinlock_t freelist_lock;
u8 open_buckets_partial[OPEN_BUCKETS_COUNT];
unsigned open_buckets_partial_nr;
@@ -498,6 +496,7 @@ enum {
/* misc: */
BCH_FS_BDEV_MOUNTED,
BCH_FS_FIXED_GENS,
+ BCH_FS_ALLOC_WRITTEN,
BCH_FS_REBUILD_REPLICAS,
BCH_FS_HOLD_BTREE_WRITES,
};
@@ -720,11 +719,13 @@ struct bch_fs {
ZSTD_parameters zstd_params;
struct crypto_shash *sha256;
- struct crypto_skcipher *chacha20;
+ struct crypto_sync_skcipher *chacha20;
struct crypto_shash *poly1305;
atomic64_t key_version;
+ mempool_t large_bkey_pool;
+
/* REBALANCE */
struct bch_fs_rebalance rebalance;
diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h
index 4577d77a9f38..3d85012a15fd 100644
--- a/fs/bcachefs/bcachefs_format.h
+++ b/fs/bcachefs/bcachefs_format.h
@@ -338,7 +338,8 @@ static inline void bkey_init(struct bkey *k)
x(quota, 13) \
x(stripe, 14) \
x(reflink_p, 15) \
- x(reflink_v, 16)
+ x(reflink_v, 16) \
+ x(inline_data, 17)
enum bch_bkey_type {
#define x(name, nr) KEY_TYPE_##name = nr,
@@ -911,6 +912,13 @@ struct bch_reflink_v {
__u64 _data[0];
};
+/* Inline data */
+
+struct bch_inline_data {
+ struct bch_val v;
+ u8 data[0];
+};
+
/* Optional/variable size superblock sections: */
struct bch_sb_field {
@@ -1314,6 +1322,8 @@ enum bch_sb_features {
BCH_FEATURE_EC = 4,
BCH_FEATURE_JOURNAL_SEQ_BLACKLIST_V3 = 5,
BCH_FEATURE_REFLINK = 6,
+ BCH_FEATURE_NEW_SIPHASH = 7,
+ BCH_FEATURE_INLINE_DATA = 8,
BCH_FEATURE_NR,
};
@@ -1340,11 +1350,19 @@ enum bch_csum_opts {
BCH_CSUM_OPT_NR = 3,
};
-enum bch_str_hash_opts {
+enum bch_str_hash_type {
BCH_STR_HASH_CRC32C = 0,
BCH_STR_HASH_CRC64 = 1,
- BCH_STR_HASH_SIPHASH = 2,
- BCH_STR_HASH_NR = 3,
+ BCH_STR_HASH_SIPHASH_OLD = 2,
+ BCH_STR_HASH_SIPHASH = 3,
+ BCH_STR_HASH_NR = 4,
+};
+
+enum bch_str_hash_opts {
+ BCH_STR_HASH_OPT_CRC32C = 0,
+ BCH_STR_HASH_OPT_CRC64 = 1,
+ BCH_STR_HASH_OPT_SIPHASH = 2,
+ BCH_STR_HASH_OPT_NR = 3,
};
#define BCH_COMPRESSION_TYPES() \
@@ -1494,14 +1512,14 @@ LE32_BITMASK(JSET_BIG_ENDIAN, struct jset, flags, 4, 5);
/* Btree: */
-#define BCH_BTREE_IDS() \
+#define BCH_BTREE_IDS() \
x(EXTENTS, 0, "extents") \
x(INODES, 1, "inodes") \
x(DIRENTS, 2, "dirents") \
x(XATTRS, 3, "xattrs") \
x(ALLOC, 4, "alloc") \
x(QUOTAS, 5, "quotas") \
- x(EC, 6, "erasure_coding") \
+ x(EC, 6, "stripes") \
x(REFLINK, 7, "reflink")
enum btree_id {
diff --git a/fs/bcachefs/bkey.c b/fs/bcachefs/bkey.c
index 0f9dfe37b0af..4d0c9129cd4a 100644
--- a/fs/bcachefs/bkey.c
+++ b/fs/bcachefs/bkey.c
@@ -327,7 +327,7 @@ bool bch2_bkey_pack_key(struct bkey_packed *out, const struct bkey *in,
void bch2_bkey_unpack(const struct btree *b, struct bkey_i *dst,
const struct bkey_packed *src)
{
- dst->k = bkey_unpack_key(b, src);
+ __bkey_unpack_key(b, &dst->k, src);
memcpy_u64s(&dst->v,
bkeyp_val(&b->format, src),
@@ -1058,26 +1058,20 @@ int __bch2_bkey_cmp_packed(const struct bkey_packed *l,
const struct bkey_packed *r,
const struct btree *b)
{
- int packed = bkey_lr_packed(l, r);
+ struct bkey unpacked;
- if (likely(packed == BKEY_PACKED_BOTH))
+ if (likely(bkey_packed(l) && bkey_packed(r)))
return __bch2_bkey_cmp_packed_format_checked(l, r, b);
- switch (packed) {
- case BKEY_PACKED_NONE:
- return bkey_cmp(((struct bkey *) l)->p,
- ((struct bkey *) r)->p);
- case BKEY_PACKED_LEFT:
- return __bch2_bkey_cmp_left_packed_format_checked(b,
- (struct bkey_packed *) l,
- &((struct bkey *) r)->p);
- case BKEY_PACKED_RIGHT:
- return -__bch2_bkey_cmp_left_packed_format_checked(b,
- (struct bkey_packed *) r,
- &((struct bkey *) l)->p);
- default:
- unreachable();
+ if (bkey_packed(l)) {
+ __bkey_unpack_key_format_checked(b, &unpacked, l);
+ l = (void*) &unpacked;
+ } else if (bkey_packed(r)) {
+ __bkey_unpack_key_format_checked(b, &unpacked, r);
+ r = (void*) &unpacked;
}
+
+ return bkey_cmp(((struct bkey *) l)->p, ((struct bkey *) r)->p);
}
__pure __flatten
diff --git a/fs/bcachefs/bkey.h b/fs/bcachefs/bkey.h
index 5ef66aed338d..f2d5f3009b21 100644
--- a/fs/bcachefs/bkey.h
+++ b/fs/bcachefs/bkey.h
@@ -33,6 +33,16 @@ struct bkey_s {
#define bkey_next(_k) vstruct_next(_k)
+static inline struct bkey_packed *bkey_next_skip_noops(struct bkey_packed *k,
+ struct bkey_packed *end)
+{
+ k = bkey_next(k);
+
+ while (k != end && !k->u64s)
+ k = (void *) ((u64 *) k + 1);
+ return k;
+}
+
#define bkey_val_u64s(_k) ((_k)->u64s - BKEY_U64s)
static inline size_t bkey_val_bytes(const struct bkey *k)
@@ -87,8 +97,8 @@ do { \
(u64 *) (_dst) < (u64 *) (_src) + \
((struct bkey *) (_src))->u64s); \
\
- __memmove_u64s_down((_dst), (_src), \
- ((struct bkey *) (_src))->u64s); \
+ memcpy_u64s_small((_dst), (_src), \
+ ((struct bkey *) (_src))->u64s); \
} while (0)
struct btree;
@@ -554,6 +564,7 @@ BKEY_VAL_ACCESSORS(quota);
BKEY_VAL_ACCESSORS(stripe);
BKEY_VAL_ACCESSORS(reflink_p);
BKEY_VAL_ACCESSORS(reflink_v);
+BKEY_VAL_ACCESSORS(inline_data);
/* byte order helpers */
diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c
index f01405dd502b..ed448fad83c5 100644
--- a/fs/bcachefs/bkey_methods.c
+++ b/fs/bcachefs/bkey_methods.c
@@ -63,6 +63,23 @@ static const char *key_type_cookie_invalid(const struct bch_fs *c,
.key_invalid = empty_val_key_invalid, \
}
+static const char *key_type_inline_data_invalid(const struct bch_fs *c,
+ struct bkey_s_c k)
+{
+ return NULL;
+}
+
+static void key_type_inline_data_to_text(struct printbuf *out, struct bch_fs *c,
+ struct bkey_s_c k)
+{
+ pr_buf(out, "(%zu bytes)", bkey_val_bytes(k.k));
+}
+
+#define bch2_bkey_ops_inline_data (struct bkey_ops) { \
+ .key_invalid = key_type_inline_data_invalid, \
+ .val_to_text = key_type_inline_data_to_text, \
+}
+
static const struct bkey_ops bch2_bkey_ops[] = {
#define x(name, nr) [KEY_TYPE_##name] = bch2_bkey_ops_##name,
BCH_BKEY_TYPES()
@@ -83,9 +100,8 @@ const char *__bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k,
if (k.k->u64s < BKEY_U64s)
return "u64s too small";
- if ((btree_node_type_is_extents(type) ||
- type == BKEY_TYPE_BTREE) &&
- bkey_val_u64s(k.k) > BKEY_EXTENT_VAL_U64s_MAX)
+ if (type == BKEY_TYPE_BTREE &&
+ bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX)
return "value too big";
if (btree_node_type_is_extents(type)) {
diff --git a/fs/bcachefs/bkey_on_stack.h b/fs/bcachefs/bkey_on_stack.h
new file mode 100644
index 000000000000..f607a0cb37ed
--- /dev/null
+++ b/fs/bcachefs/bkey_on_stack.h
@@ -0,0 +1,43 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_BKEY_ON_STACK_H
+#define _BCACHEFS_BKEY_ON_STACK_H
+
+#include "bcachefs.h"
+
+struct bkey_on_stack {
+ struct bkey_i *k;
+ u64 onstack[12];
+};
+
+static inline void bkey_on_stack_realloc(struct bkey_on_stack *s,
+ struct bch_fs *c, unsigned u64s)
+{
+ if (s->k == (void *) s->onstack &&
+ u64s > ARRAY_SIZE(s->onstack)) {
+ s->k = mempool_alloc(&c->large_bkey_pool, GFP_NOFS);
+ memcpy(s->k, s->onstack, sizeof(s->onstack));
+ }
+}
+
+static inline void bkey_on_stack_reassemble(struct bkey_on_stack *s,
+ struct bch_fs *c,
+ struct bkey_s_c k)
+{
+ bkey_on_stack_realloc(s, c, k.k->u64s);
+ bkey_reassemble(s->k, k);
+}
+
+static inline void bkey_on_stack_init(struct bkey_on_stack *s)
+{
+ s->k = (void *) s->onstack;
+}
+
+static inline void bkey_on_stack_exit(struct bkey_on_stack *s,
+ struct bch_fs *c)
+{
+ if (s->k != (void *) s->onstack)
+ mempool_free(s->k, &c->large_bkey_pool);
+ s->k = NULL;
+}
+
+#endif /* _BCACHEFS_BKEY_ON_STACK_H */
diff --git a/fs/bcachefs/bkey_sort.c b/fs/bcachefs/bkey_sort.c
index e32fad5a91ac..2e205db5433d 100644
--- a/fs/bcachefs/bkey_sort.c
+++ b/fs/bcachefs/bkey_sort.c
@@ -1,5 +1,6 @@
// SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h"
+#include "bkey_on_stack.h"
#include "bkey_sort.h"
#include "bset.h"
#include "extents.h"
@@ -74,6 +75,10 @@ static void sort_key_next(struct btree_node_iter_large *iter,
{
i->k += __btree_node_offset_to_key(b, i->k)->u64s;
+ while (i->k != i->end &&
+ !__btree_node_offset_to_key(b, i->k)->u64s)
+ i->k++;
+
if (i->k == i->end)
*i = iter->data[--iter->used];
}
@@ -118,7 +123,7 @@ static inline struct bkey_packed *sort_iter_peek(struct sort_iter *iter)
static inline void sort_iter_advance(struct sort_iter *iter, sort_cmp_fn cmp)
{
- iter->data->k = bkey_next(iter->data->k);
+ iter->data->k = bkey_next_skip_noops(iter->data->k, iter->data->end);
BUG_ON(iter->data->k > iter->data->end);
@@ -292,8 +297,10 @@ struct btree_nr_keys bch2_extent_sort_fix_overlapping(struct bch_fs *c,
struct bkey l_unpacked, r_unpacked;
struct bkey_s l, r;
struct btree_nr_keys nr;
+ struct bkey_on_stack split;
memset(&nr, 0, sizeof(nr));
+ bkey_on_stack_init(&split);
heap_resort(iter, extent_sort_cmp, NULL);
@@ -343,29 +350,28 @@ struct btree_nr_keys bch2_extent_sort_fix_overlapping(struct bch_fs *c,
if (bkey_cmp(l.k->p, r.k->p) >= 0) {
sort_key_next(iter, b, _r);
} else {
- __bch2_cut_front(l.k->p, r);
+ bch2_cut_front_s(l.k->p, r);
extent_save(b, rk, r.k);
}
extent_sort_sift(iter, b, _r - iter->data);
} else if (bkey_cmp(l.k->p, r.k->p) > 0) {
- BKEY_PADDED(k) tmp;
/*
* r wins, but it overlaps in the middle of l - split l:
*/
- bkey_reassemble(&tmp.k, l.s_c);
- bch2_cut_back(bkey_start_pos(r.k), &tmp.k.k);
+ bkey_on_stack_reassemble(&split, c, l.s_c);
+ bch2_cut_back(bkey_start_pos(r.k), split.k);
- __bch2_cut_front(r.k->p, l);
+ bch2_cut_front_s(r.k->p, l);
extent_save(b, lk, l.k);
extent_sort_sift(iter, b, 0);
extent_sort_append(c, f, &nr, dst->start,
- &prev, bkey_i_to_s(&tmp.k));
+ &prev, bkey_i_to_s(split.k));
} else {
- bch2_cut_back(bkey_start_pos(r.k), l.k);
+ bch2_cut_back_s(bkey_start_pos(r.k), l);
extent_save(b, lk, l.k);
}
}
@@ -373,6 +379,8 @@ struct btree_nr_keys bch2_extent_sort_fix_overlapping(struct bch_fs *c,
extent_sort_advance_prev(f, &nr, dst->start, &prev);
dst->u64s = cpu_to_le16((u64 *) prev - dst->_data);
+
+ bkey_on_stack_exit(&split, c);
return nr;
}
@@ -418,7 +426,7 @@ bch2_sort_repack_merge(struct bch_fs *c,
struct bkey_packed *prev = NULL, *k_packed;
struct bkey_s k;
struct btree_nr_keys nr;
- BKEY_PADDED(k) tmp;
+ struct bkey unpacked;
memset(&nr, 0, sizeof(nr));
@@ -426,11 +434,7 @@ bch2_sort_repack_merge(struct bch_fs *c,
if (filter_whiteouts && bkey_whiteout(k_packed))
continue;
- EBUG_ON(bkeyp_val_u64s(&src->format, k_packed) >
- BKEY_EXTENT_VAL_U64s_MAX);
-
- bch2_bkey_unpack(src, &tmp.k, k_packed);
- k = bkey_i_to_s(&tmp.k);
+ k = __bkey_disassemble(src, k_packed, &unpacked);
if (filter_whiteouts &&
bch2_bkey_normalize(c, k))
diff --git a/fs/bcachefs/bset.c b/fs/bcachefs/bset.c
index 32436ed5cc80..a0f0b0eadffb 100644
--- a/fs/bcachefs/bset.c
+++ b/fs/bcachefs/bset.c
@@ -76,7 +76,7 @@ void bch2_dump_bset(struct btree *b, struct bset *i, unsigned set)
for (_k = i->start, k = bkey_unpack_key(b, _k);
_k < vstruct_last(i);
_k = _n, k = n) {
- _n = bkey_next(_k);
+ _n = bkey_next_skip_noops(_k, vstruct_last(i));
bch2_bkey_to_text(&PBUF(buf), &k);
printk(KERN_ERR "block %u key %5u: %s\n", set,
@@ -144,9 +144,7 @@ void __bch2_verify_btree_nr_keys(struct btree *b)
struct btree_nr_keys nr = { 0 };
for_each_bset(b, t)
- for (k = btree_bkey_first(b, t);
- k != btree_bkey_last(b, t);
- k = bkey_next(k))
+ bset_tree_for_each_key(b, t, k)
if (!bkey_whiteout(k))
btree_keys_account_key_add(&nr, t - b->set, k);
@@ -294,38 +292,23 @@ static inline void bch2_btree_node_iter_next_check(struct btree_node_iter *iter,
/* Auxiliary search trees */
-#define BFLOAT_FAILED_UNPACKED (U8_MAX - 0)
-#define BFLOAT_FAILED_PREV (U8_MAX - 1)
-#define BFLOAT_FAILED_OVERFLOW (U8_MAX - 2)
-#define BFLOAT_FAILED (U8_MAX - 2)
-
-#define KEY_WORDS BITS_TO_LONGS(1 << BKEY_EXPONENT_BITS)
+#define BFLOAT_FAILED_UNPACKED U8_MAX
+#define BFLOAT_FAILED U8_MAX
struct bkey_float {
u8 exponent;
u8 key_offset;
- union {
- u32 mantissa32;
- struct {
- u16 mantissa16;
- u16 _pad;
- };
- };
-} __packed;
-
-#define BFLOAT_32BIT_NR 32U
+ u16 mantissa;
+};
+#define BKEY_MANTISSA_BITS 16
static unsigned bkey_float_byte_offset(unsigned idx)
{
- int d = (idx - BFLOAT_32BIT_NR) << 1;
-
- d &= ~(d >> 31);
-
- return idx * 6 - d;
+ return idx * sizeof(struct bkey_float);
}
struct ro_aux_tree {
- struct bkey_float _d[0];
+ struct bkey_float f[0];
};
struct rw_aux_tree {
@@ -380,8 +363,8 @@ static unsigned bset_aux_tree_buf_end(const struct bset_tree *t)
return t->aux_data_offset;
case BSET_RO_AUX_TREE:
return t->aux_data_offset +
- DIV_ROUND_UP(bkey_float_byte_offset(t->size) +
- sizeof(u8) * t->size, 8);
+ DIV_ROUND_UP(t->size * sizeof(struct bkey_float) +
+ t->size * sizeof(u8), 8);
case BSET_RW_AUX_TREE:
return t->aux_data_offset +
DIV_ROUND_UP(sizeof(struct rw_aux_tree) * t->size, 8);
@@ -420,17 +403,11 @@ static u8 *ro_aux_tree_prev(const struct btree *b,
return __aux_tree_base(b, t) + bkey_float_byte_offset(t->size);
}
-static struct bkey_float *bkey_float_get(struct ro_aux_tree *b,
- unsigned idx)
-{
- return (void *) b + bkey_float_byte_offset(idx);
-}
-
static struct bkey_float *bkey_float(const struct btree *b,
const struct bset_tree *t,
unsigned idx)
{
- return bkey_float_get(ro_aux_tree_base(b, t), idx);
+ return ro_aux_tree_base(b, t)->f + idx;
}
static void bset_aux_tree_verify(struct btree *b)
@@ -633,7 +610,7 @@ start:
rw_aux_tree(b, t)[j - 1].offset);
}
- k = bkey_next(k);
+ k = bkey_next_skip_noops(k, btree_bkey_last(b, t));
BUG_ON(k >= btree_bkey_last(b, t));
}
}
@@ -669,21 +646,6 @@ static unsigned rw_aux_tree_bsearch(struct btree *b,
return idx;
}
-static inline unsigned bfloat_mantissa(const struct bkey_float *f,
- unsigned idx)
-{
- return idx < BFLOAT_32BIT_NR ? f->mantissa32 : f->mantissa16;
-}
-
-static inline void bfloat_mantissa_set(struct bkey_float *f,
- unsigned idx, unsigned mantissa)
-{
- if (idx < BFLOAT_32BIT_NR)
- f->mantissa32 = mantissa;
- else
- f->mantissa16 = mantissa;
-}
-
static inline unsigned bkey_mantissa(const struct bkey_packed *k,
const struct bkey_float *f,
unsigned idx)
@@ -703,9 +665,9 @@ static inline unsigned bkey_mantissa(const struct bkey_packed *k,
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
v >>= f->exponent & 7;
#else
- v >>= 64 - (f->exponent & 7) - (idx < BFLOAT_32BIT_NR ? 32 : 16);
+ v >>= 64 - (f->exponent & 7) - BKEY_MANTISSA_BITS;
#endif
- return idx < BFLOAT_32BIT_NR ? (u32) v : (u16) v;
+ return (u16) v;
}
static void make_bfloat(struct btree *b, struct bset_tree *t,
@@ -715,14 +677,10 @@ static void make_bfloat(struct btree *b, struct bset_tree *t,
{
struct bkey_float *f = bkey_float(b, t, j);
struct bkey_packed *m = tree_to_bkey(b, t, j);
- struct bkey_packed *p = tree_to_prev_bkey(b, t, j);
struct bkey_packed *l, *r;
- unsigned bits = j < BFLOAT_32BIT_NR ? 32 : 16;
unsigned mantissa;
int shift, exponent, high_bit;
- EBUG_ON(bkey_next(p) != m);
-
if (is_power_of_2(j)) {
l = min_key;
@@ -764,8 +722,7 @@ static void make_bfloat(struct btree *b, struct bset_tree *t,
* the original key.
*/
- if (!bkey_packed(l) || !bkey_packed(r) ||
- !bkey_packed(p) || !bkey_packed(m) ||
+ if (!bkey_packed(l) || !bkey_packed(r) || !bkey_packed(m) ||
!b->nr_key_bits) {
f->exponent = BFLOAT_FAILED_UNPACKED;
return;
@@ -782,8 +739,8 @@ static void make_bfloat(struct btree *b, struct bset_tree *t,
* of the key: we handle this later:
*/
high_bit = max(bch2_bkey_greatest_differing_bit(b, l, r),
- min_t(unsigned, bits, b->nr_key_bits) - 1);
- exponent = high_bit - (bits - 1);
+ min_t(unsigned, BKEY_MANTISSA_BITS, b->nr_key_bits) - 1);
+ exponent = high_bit - (BKEY_MANTISSA_BITS - 1);
/*
* Then we calculate the actual shift value, from the start of the key
@@ -792,12 +749,12 @@ static void make_bfloat(struct btree *b, struct bset_tree *t,
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
shift = (int) (b->format.key_u64s * 64 - b->nr_key_bits) + exponent;
- EBUG_ON(shift + bits > b->format.key_u64s * 64);
+ EBUG_ON(shift + BKEY_MANTISSA_BITS > b->format.key_u64s * 64);
#else
shift = high_bit_offset +
b->nr_key_bits -
exponent -
- bits;
+ BKEY_MANTISSA_BITS;
EBUG_ON(shift < KEY_PACKED_BITS_START);
#endif
@@ -813,37 +770,7 @@ static void make_bfloat(struct btree *b, struct bset_tree *t,
if (exponent < 0)
mantissa |= ~(~0U << -exponent);
- bfloat_mantissa_set(f, j, mantissa);
-
- /*
- * The bfloat must be able to tell its key apart from the previous key -
- * if its key and the previous key don't differ in the required bits,
- * flag as failed - unless the keys are actually equal, in which case
- * we aren't required to return a specific one:
- */
- if (exponent > 0 &&
- bfloat_mantissa(f, j) == bkey_mantissa(p, f, j) &&
- bkey_cmp_packed(b, p, m)) {
- f->exponent = BFLOAT_FAILED_PREV;
- return;
- }
-
- /*
- * f->mantissa must compare >= the original key - for transitivity with
- * the comparison in bset_search_tree. If we're dropping set bits,
- * increment it:
- */
- if (exponent > (int) bch2_bkey_ffs(b, m)) {
- if (j < BFLOAT_32BIT_NR
- ? f->mantissa32 == U32_MAX
- : f->mantissa16 == U16_MAX)
- f->exponent = BFLOAT_FAILED_OVERFLOW;
-
- if (j < BFLOAT_32BIT_NR)
- f->mantissa32++;
- else
- f->mantissa16++;
- }
+ f->mantissa = mantissa;
}
/* bytes remaining - only valid for last bset: */
@@ -856,14 +783,8 @@ static unsigned __bset_tree_capacity(struct btree *b, struct bset_tree *t)
static unsigned bset_ro_tree_capacity(struct btree *b, struct bset_tree *t)
{
- unsigned bytes = __bset_tree_capacity(b, t);
-
- if (bytes < 7 * BFLOAT_32BIT_NR)
- return bytes / 7;
-
- bytes -= 7 * BFLOAT_32BIT_NR;
-
- return BFLOAT_32BIT_NR + bytes / 5;
+ return __bset_tree_capacity(b, t) /
+ (sizeof(struct bkey_float) + sizeof(u8));
}
static unsigned bset_rw_tree_capacity(struct btree *b, struct bset_tree *t)
@@ -880,9 +801,7 @@ static void __build_rw_aux_tree(struct btree *b, struct bset_tree *t)
rw_aux_tree(b, t)[0].offset =
__btree_node_key_to_offset(b, btree_bkey_first(b, t));
- for (k = btree_bkey_first(b, t);
- k != btree_bkey_last(b, t);
- k = bkey_next(k)) {
+ bset_tree_for_each_key(b, t, k) {
if (t->size == bset_rw_tree_capacity(b, t))
break;
@@ -915,7 +834,7 @@ retry:
/* First we figure out where the first key in each cacheline is */
eytzinger1_for_each(j, t->size) {
while (bkey_to_cacheline(b, t, k) < cacheline)
- prev = k, k = bkey_next(k);
+ prev = k, k = bkey_next_skip_noops(k, btree_bkey_last(b, t));
if (k >= btree_bkey_last(b, t)) {
/* XXX: this path sucks */
@@ -931,10 +850,10 @@ retry:
EBUG_ON(tree_to_bkey(b, t, j) != k);
}
- while (bkey_next(k) != btree_bkey_last(b, t))
- k = bkey_next(k);
+ while (k != btree_bkey_last(b, t))
+ prev = k, k = bkey_next_skip_noops(k, btree_bkey_last(b, t));
- t->max_key = bkey_unpack_pos(b, k);
+ t->max_key = bkey_unpack_pos(b, prev);
/* Then we build the tree */
eytzinger1_for_each(j, t->size)
@@ -1060,7 +979,7 @@ struct bkey_packed *bch2_bkey_prev_filter(struct btree *b,
struct bkey_packed *p, *i, *ret = NULL, *orig_k = k;
while ((p = __bkey_prev(b, t, k)) && !ret) {
- for (i = p; i != k; i = bkey_next(i))
+ for (i = p; i != k; i = bkey_next_skip_noops(i, k))
if (i->type >= min_key_type)
ret = i;
@@ -1070,9 +989,11 @@ struct bkey_packed *bch2_bkey_prev_filter(struct btree *b,
if (btree_keys_expensive_checks(b)) {
BUG_ON(ret >= orig_k);
- for (i = ret ? bkey_next(ret) : btree_bkey_first(b, t);
+ for (i = ret
+ ? bkey_next_skip_noops(ret, orig_k)
+ : btree_bkey_first(b, t);
i != orig_k;
- i = bkey_next(i))
+ i = bkey_next_skip_noops(i, orig_k))
BUG_ON(i->type >= min_key_type);
}
@@ -1107,7 +1028,7 @@ static void ro_aux_tree_fix_invalidated_key(struct btree *b,
/* signal to make_bfloat() that they're uninitialized: */
min_key.u64s = max_key.u64s = 0;
- if (bkey_next(k) == btree_bkey_last(b, t)) {
+ if (bkey_next_skip_noops(k, btree_bkey_last(b, t)) == btree_bkey_last(b, t)) {
t->max_key = bkey_unpack_pos(b, k);
for (j = 1; j < t->size; j = j * 2 + 1)
@@ -1231,7 +1152,7 @@ static void bch2_bset_fix_lookup_table(struct btree *b,
struct bkey_packed *k = start;
while (1) {
- k = bkey_next(k);
+ k = bkey_next_skip_noops(k, end);
if (k == end)
break;
@@ -1333,14 +1254,38 @@ static struct bkey_packed *bset_search_write_set(const struct btree *b,
return rw_aux_to_bkey(b, t, l);
}
-noinline
-static int bset_search_tree_slowpath(const struct btree *b,
- struct bset_tree *t, struct bpos *search,
- const struct bkey_packed *packed_search,
- unsigned n)
+static inline void prefetch_four_cachelines(void *p)
{
- return bkey_cmp_p_or_unp(b, tree_to_bkey(b, t, n),
- packed_search, search) < 0;
+#ifdef CONFIG_X86_64
+ asm(".intel_syntax noprefix;"
+ "prefetcht0 [%0 - 127 + 64 * 0];"
+ "prefetcht0 [%0 - 127 + 64 * 1];"
+ "prefetcht0 [%0 - 127 + 64 * 2];"
+ "prefetcht0 [%0 - 127 + 64 * 3];"
+ ".att_syntax prefix;"
+ :
+ : "r" (p + 127));
+#else
+ prefetch(p + L1_CACHE_BYTES * 0);
+ prefetch(p + L1_CACHE_BYTES * 1);
+ prefetch(p + L1_CACHE_BYTES * 2);
+ prefetch(p + L1_CACHE_BYTES * 3);
+#endif
+}
+
+static inline bool bkey_mantissa_bits_dropped(const struct btree *b,
+ const struct bkey_float *f,
+ unsigned idx)
+{
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+ unsigned key_bits_start = b->format.key_u64s * 64 - b->nr_key_bits;
+
+ return f->exponent > key_bits_start;
+#else
+ unsigned key_bits_end = high_bit_offset + b->nr_key_bits;
+
+ return f->exponent + BKEY_MANTISSA_BITS < key_bits_end;
+#endif
}
__flatten
@@ -1350,44 +1295,37 @@ static struct bkey_packed *bset_search_tree(const struct btree *b,
const struct bkey_packed *packed_search)
{
struct ro_aux_tree *base = ro_aux_tree_base(b, t);
- struct bkey_float *f = bkey_float_get(base, 1);
- void *p;
- unsigned inorder, n = 1;
+ struct bkey_float *f;
+ struct bkey_packed *k;
+ unsigned inorder, n = 1, l, r;
+ int cmp;
- while (1) {
- if (likely(n << 4 < t->size)) {
- p = bkey_float_get(base, n << 4);
- prefetch(p);
- } else if (n << 3 < t->size) {
- inorder = __eytzinger1_to_inorder(n, t->size, t->extra);
- p = bset_cacheline(b, t, inorder);
-#ifdef CONFIG_X86_64
- asm(".intel_syntax noprefix;"
- "prefetcht0 [%0 - 127 + 64 * 0];"
- "prefetcht0 [%0 - 127 + 64 * 1];"
- "prefetcht0 [%0 - 127 + 64 * 2];"
- "prefetcht0 [%0 - 127 + 64 * 3];"
- ".att_syntax prefix;"
- :
- : "r" (p + 127));
-#else
- prefetch(p + L1_CACHE_BYTES * 0);
- prefetch(p + L1_CACHE_BYTES * 1);
- prefetch(p + L1_CACHE_BYTES * 2);
- prefetch(p + L1_CACHE_BYTES * 3);
-#endif
- } else if (n >= t->size)
- break;
+ do {
+ if (likely(n << 4 < t->size))
+ prefetch(&base->f[n << 4]);
- f = bkey_float_get(base, n);
+ f = &base->f[n];
- if (packed_search &&
- likely(f->exponent < BFLOAT_FAILED))
- n = n * 2 + (bfloat_mantissa(f, n) <
- bkey_mantissa(packed_search, f, n));
- else
- n = n * 2 + bset_search_tree_slowpath(b, t,
- search, packed_search, n);
+ if (!unlikely(packed_search))
+ goto slowpath;
+ if (unlikely(f->exponent >= BFLOAT_FAILED))
+ goto slowpath;
+
+ l = f->mantissa;
+ r = bkey_mantissa(packed_search, f, n);
+
+ if (unlikely(l == r) && bkey_mantissa_bits_dropped(b, f, n))
+ goto slowpath;
+
+ n = n * 2 + (l < r);
+ continue;
+slowpath:
+ k = tree_to_bkey(b, t, n);
+ cmp = bkey_cmp_p_or_unp(b, k, packed_search, search);
+ if (!cmp)
+ return k;
+
+ n = n * 2 + (cmp < 0);
} while (n < t->size);
inorder = __eytzinger1_to_inorder(n >> 1, t->size, t->extra);
@@ -1396,29 +1334,23 @@ static struct bkey_packed *bset_search_tree(const struct btree *b,
* n would have been the node we recursed to - the low bit tells us if
* we recursed left or recursed right.
*/
- if (n & 1) {
- return cacheline_to_bkey(b, t, inorder, f->key_offset);
- } else {
- if (--inorder) {
- n = eytzinger1_prev(n >> 1, t->size);
- f = bkey_float_get(base, n);
- return cacheline_to_bkey(b, t, inorder, f->key_offset);
- } else
+ if (likely(!(n & 1))) {
+ --inorder;
+ if (unlikely(!inorder))
return btree_bkey_first(b, t);
+
+ f = &base->f[eytzinger1_prev(n >> 1, t->size)];
}
+
+ return cacheline_to_bkey(b, t, inorder, f->key_offset);
}
-/*
- * Returns the first key greater than or equal to @search
- */
-__always_inline __flatten
-static struct bkey_packed *bch2_bset_search(struct btree *b,
+static __always_inline __flatten
+struct bkey_packed *__bch2_bset_search(struct btree *b,
struct bset_tree *t,
struct bpos *search,
- struct bkey_packed *packed_search,
const struct bkey_packed *lossy_packed_search)
{
- struct bkey_packed *m;
/*
* First, we search for a cacheline, then lastly we do a linear search
@@ -1437,11 +1369,9 @@ static struct bkey_packed *bch2_bset_search(struct btree *b,
switch (bset_aux_tree_type(t)) {
case BSET_NO_AUX_TREE:
- m = btree_bkey_first(b, t);
- break;
+ return btree_bkey_first(b, t);
case BSET_RW_AUX_TREE:
- m = bset_search_write_set(b, t, search, lossy_packed_search);
- break;
+ return bset_search_write_set(b, t, search, lossy_packed_search);
case BSET_RO_AUX_TREE:
/*
* Each node in the auxiliary search tree covers a certain range
@@ -1453,20 +1383,30 @@ static struct bkey_packed *bch2_bset_search(struct btree *b,
if (bkey_cmp(*search, t->max_key) > 0)
return btree_bkey_last(b, t);
- m = bset_search_tree(b, t, search, lossy_packed_search);
- break;
+ return bset_search_tree(b, t, search, lossy_packed_search);
+ default:
+ unreachable();
}
+}
+static __always_inline __flatten
+struct bkey_packed *bch2_bset_search_linear(struct btree *b,
+ struct bset_tree *t,
+ struct bpos *search,
+ struct bkey_packed *packed_search,
+ const struct bkey_packed *lossy_packed_search,
+ struct bkey_packed *m)
+{
if (lossy_packed_search)
while (m != btree_bkey_last(b, t) &&
bkey_iter_cmp_p_or_unp(b, search, lossy_packed_search,
m) > 0)
- m = bkey_next(m);
+ m = bkey_next_skip_noops(m, btree_bkey_last(b, t));
if (!packed_search)
while (m != btree_bkey_last(b, t) &&
bkey_iter_pos_cmp(b, search, m) > 0)
- m = bkey_next(m);
+ m = bkey_next_skip_noops(m, btree_bkey_last(b, t));
if (btree_keys_expensive_checks(b)) {
struct bkey_packed *prev = bch2_bkey_prev_all(b, t, m);
@@ -1479,6 +1419,23 @@ static struct bkey_packed *bch2_bset_search(struct btree *b,
return m;
}
+/*
+ * Returns the first key greater than or equal to @search
+ */
+static __always_inline __flatten
+struct bkey_packed *bch2_bset_search(struct btree *b,
+ struct bset_tree *t,
+ struct bpos *search,
+ struct bkey_packed *packed_search,
+ const struct bkey_packed *lossy_packed_search)
+{
+ struct bkey_packed *m = __bch2_bset_search(b, t, search,
+ lossy_packed_search);
+
+ return bch2_bset_search_linear(b, t, search,
+ packed_search, lossy_packed_search, m);
+}
+
/* Btree node iterator */
static inline void __bch2_btree_node_iter_push(struct btree_node_iter *iter,
@@ -1565,11 +1522,14 @@ static void btree_node_iter_init_pack_failed(struct btree_node_iter *iter,
* So we've got to search for start_of_range, then after the lookup iterate
* past any extents that compare equal to the position we searched for.
*/
+__flatten
void bch2_btree_node_iter_init(struct btree_node_iter *iter,
struct btree *b, struct bpos *search)
{
- struct bset_tree *t;
struct bkey_packed p, *packed_search = NULL;
+ struct btree_node_iter_set *pos = iter->data;
+ struct bkey_packed *k[MAX_BSETS];
+ unsigned i;
EBUG_ON(bkey_cmp(*search, b->data->min_key) < 0);
bset_aux_tree_verify(b);
@@ -1588,11 +1548,23 @@ void bch2_btree_node_iter_init(struct btree_node_iter *iter,
return;
}
- for_each_bset(b, t)
- __bch2_btree_node_iter_push(iter, b,
- bch2_bset_search(b, t, search,
- packed_search, &p),
- btree_bkey_last(b, t));
+ for (i = 0; i < b->nsets; i++) {
+ k[i] = __bch2_bset_search(b, b->set + i, search, &p);
+ prefetch_four_cachelines(k[i]);
+ }
+
+ for (i = 0; i < b->nsets; i++) {
+ struct bset_tree *t = b->set + i;
+ struct bkey_packed *end = btree_bkey_last(b, t);
+
+ k[i] = bch2_bset_search_linear(b, t, search,
+ packed_search, &p, k[i]);
+ if (k[i] != end)
+ *pos++ = (struct btree_node_iter_set) {
+ __btree_node_key_to_offset(b, k[i]),
+ __btree_node_key_to_offset(b, end)
+ };
+ }
bch2_btree_node_iter_sort(iter, b);
}
@@ -1668,6 +1640,10 @@ static inline void __bch2_btree_node_iter_advance(struct btree_node_iter *iter,
EBUG_ON(iter->data->k > iter->data->end);
+ while (!__btree_node_iter_set_end(iter, 0) &&
+ !__bch2_btree_node_iter_peek_all(iter, b)->u64s)
+ iter->data->k++;
+
if (unlikely(__btree_node_iter_set_end(iter, 0))) {
bch2_btree_node_iter_set_drop(iter, iter->data);
return;
@@ -1786,17 +1762,9 @@ void bch2_btree_keys_stats(struct btree *b, struct bset_stats *stats)
stats->floats += t->size - 1;
for (j = 1; j < t->size; j++)
- switch (bkey_float(b, t, j)->exponent) {
- case BFLOAT_FAILED_UNPACKED:
- stats->failed_unpacked++;
- break;
- case BFLOAT_FAILED_PREV:
- stats->failed_prev++;
- break;
- case BFLOAT_FAILED_OVERFLOW:
- stats->failed_overflow++;
- break;
- }
+ stats->failed +=
+ bkey_float(b, t, j)->exponent ==
+ BFLOAT_FAILED;
}
}
}
@@ -1805,9 +1773,7 @@ void bch2_bfloat_to_text(struct printbuf *out, struct btree *b,
struct bkey_packed *k)
{
struct bset_tree *t = bch2_bkey_to_bset(b, k);
- struct bkey_packed *l, *r, *p;
- struct bkey uk, up;
- char buf1[200], buf2[200];
+ struct bkey uk;
unsigned j, inorder;
if (out->pos != out->end)
@@ -1825,7 +1791,7 @@ void bch2_bfloat_to_text(struct printbuf *out, struct btree *b,
return;
switch (bkey_float(b, t, j)->exponent) {
- case BFLOAT_FAILED_UNPACKED:
+ case BFLOAT_FAILED:
uk = bkey_unpack_key(b, k);
pr_buf(out,
" failed unpacked at depth %u\n"
@@ -1833,41 +1799,5 @@ void bch2_bfloat_to_text(struct printbuf *out, struct btree *b,
ilog2(j),
uk.p.inode, uk.p.offset);
break;
- case BFLOAT_FAILED_PREV:
- p = tree_to_prev_bkey(b, t, j);
- l = is_power_of_2(j)
- ? btree_bkey_first(b, t)
- : tree_to_prev_bkey(b, t, j >> ffs(j));
- r = is_power_of_2(j + 1)
- ? bch2_bkey_prev_all(b, t, btree_bkey_last(b, t))
- : tree_to_bkey(b, t, j >> (ffz(j) + 1));
-
- up = bkey_unpack_key(b, p);
- uk = bkey_unpack_key(b, k);
- bch2_to_binary(buf1, high_word(&b->format, p), b->nr_key_bits);
- bch2_to_binary(buf2, high_word(&b->format, k), b->nr_key_bits);
-
- pr_buf(out,
- " failed prev at depth %u\n"
- "\tkey starts at bit %u but first differing bit at %u\n"
- "\t%llu:%llu\n"
- "\t%llu:%llu\n"
- "\t%s\n"
- "\t%s\n",
- ilog2(j),
- bch2_bkey_greatest_differing_bit(b, l, r),
- bch2_bkey_greatest_differing_bit(b, p, k),
- uk.p.inode, uk.p.offset,
- up.p.inode, up.p.offset,
- buf1, buf2);
- break;
- case BFLOAT_FAILED_OVERFLOW:
- uk = bkey_unpack_key(b, k);
- pr_buf(out,
- " failed overflow at depth %u\n"
- "\t%llu:%llu\n",
- ilog2(j),
- uk.p.inode, uk.p.offset);
- break;
}
}
diff --git a/fs/bcachefs/bset.h b/fs/bcachefs/bset.h
index 643bd9e8bc4d..2653a74b3b14 100644
--- a/fs/bcachefs/bset.h
+++ b/fs/bcachefs/bset.h
@@ -284,9 +284,14 @@ static inline struct bkey_s __bkey_disassemble(struct btree *b,
return (struct bkey_s) { .k = u, .v = bkeyp_val(&b->format, k), };
}
-#define for_each_bset(_b, _t) \
+#define for_each_bset(_b, _t) \
for (_t = (_b)->set; _t < (_b)->set + (_b)->nsets; _t++)
+#define bset_tree_for_each_key(_b, _t, _k) \
+ for (_k = btree_bkey_first(_b, _t); \
+ _k != btree_bkey_last(_b, _t); \
+ _k = bkey_next_skip_noops(_k, btree_bkey_last(_b, _t)))
+
static inline bool bset_has_ro_aux_tree(struct bset_tree *t)
{
return bset_aux_tree_type(t) == BSET_RO_AUX_TREE;
@@ -564,6 +569,16 @@ static inline void btree_keys_account_key(struct btree_nr_keys *n,
n->unpacked_keys += sign;
}
+static inline void btree_keys_account_val_delta(struct btree *b,
+ struct bkey_packed *k,
+ int delta)
+{
+ struct bset_tree *t = bch2_bkey_to_bset(b, k);
+
+ b->nr.live_u64s += delta;
+ b->nr.bset_u64s[t - b->set] += delta;
+}
+
#define btree_keys_account_key_add(_nr, _bset_idx, _k) \
btree_keys_account_key(_nr, _bset_idx, _k, 1)
#define btree_keys_account_key_drop(_nr, _bset_idx, _k) \
@@ -582,9 +597,7 @@ struct bset_stats {
} sets[BSET_TREE_NR_TYPES];
size_t floats;
- size_t failed_unpacked;
- size_t failed_prev;
- size_t failed_overflow;
+ size_t failed;
};
void bch2_btree_keys_stats(struct btree *, struct bset_stats *);
diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c
index 046524c8d5ea..5d3acba525c2 100644
--- a/fs/bcachefs/btree_cache.c
+++ b/fs/bcachefs/btree_cache.c
@@ -674,10 +674,7 @@ struct btree *bch2_btree_node_get(struct bch_fs *c, struct btree_iter *iter,
EBUG_ON(!btree_node_locked(iter, level + 1));
EBUG_ON(level >= BTREE_MAX_DEPTH);
retry:
- rcu_read_lock();
b = btree_cache_find(bc, k);
- rcu_read_unlock();
-
if (unlikely(!b)) {
/*
* We must have the parent locked to call bch2_btree_node_fill(),
@@ -878,10 +875,7 @@ void bch2_btree_node_prefetch(struct bch_fs *c, struct btree_iter *iter,
BUG_ON(!btree_node_locked(iter, level + 1));
BUG_ON(level >= BTREE_MAX_DEPTH);
- rcu_read_lock();
b = btree_cache_find(bc, k);
- rcu_read_unlock();
-
if (b)
return;
@@ -915,9 +909,7 @@ void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c,
" nr packed keys %u\n"
" nr unpacked keys %u\n"
" floats %zu\n"
- " failed unpacked %zu\n"
- " failed prev %zu\n"
- " failed overflow %zu\n",
+ " failed unpacked %zu\n",
f->key_u64s,
f->bits_per_field[0],
f->bits_per_field[1],
@@ -934,7 +926,5 @@ void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c,
b->nr.packed_keys,
b->nr.unpacked_keys,
stats.floats,
- stats.failed_unpacked,
- stats.failed_prev,
- stats.failed_overflow);
+ stats.failed);
}
diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c
index f4adb07a3de2..8bbf60b07736 100644
--- a/fs/bcachefs/btree_gc.c
+++ b/fs/bcachefs/btree_gc.c
@@ -216,7 +216,7 @@ static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id,
: expensive_debug_checks(c) ? 0
: !btree_node_type_needs_gc(btree_id) ? 1
: 0;
- u8 max_stale;
+ u8 max_stale = 0;
int ret = 0;
bch2_trans_init(&trans, c, 0, 0);
@@ -640,12 +640,7 @@ static int bch2_gc_start(struct bch_fs *c,
{
struct bch_dev *ca;
unsigned i;
-
- /*
- * indicate to stripe code that we need to allocate for the gc stripes
- * radix tree, too
- */
- gc_pos_set(c, gc_phase(GC_PHASE_START));
+ int ret;
BUG_ON(c->usage_gc);
@@ -673,6 +668,18 @@ static int bch2_gc_start(struct bch_fs *c,
}
}
+ ret = bch2_ec_mem_alloc(c, true);
+ if (ret)
+ return ret;
+
+ percpu_down_write(&c->mark_lock);
+
+ /*
+ * indicate to stripe code that we need to allocate for the gc stripes
+ * radix tree, too
+ */
+ gc_pos_set(c, gc_phase(GC_PHASE_START));
+
for_each_member_device(ca, c, i) {
struct bucket_array *dst = __bucket_array(ca, 1);
struct bucket_array *src = __bucket_array(ca, 0);
@@ -697,7 +704,9 @@ static int bch2_gc_start(struct bch_fs *c,
}
};
- return bch2_ec_mem_alloc(c, true);
+ percpu_up_write(&c->mark_lock);
+
+ return 0;
}
/**
@@ -730,10 +739,7 @@ int bch2_gc(struct bch_fs *c, struct journal_keys *journal_keys,
down_write(&c->gc_lock);
again:
- percpu_down_write(&c->mark_lock);
ret = bch2_gc_start(c, metadata_only);
- percpu_up_write(&c->mark_lock);
-
if (ret)
goto out;
@@ -916,7 +922,7 @@ static void bch2_coalesce_nodes(struct bch_fs *c, struct btree_iter *iter,
k < vstruct_last(s2) &&
vstruct_blocks_plus(n1->data, c->block_bits,
u64s + k->u64s) <= blocks;
- k = bkey_next(k)) {
+ k = bkey_next_skip_noops(k, vstruct_last(s2))) {
last = k;
u64s += k->u64s;
}
@@ -1034,11 +1040,12 @@ next:
old_nodes[i] = new_nodes[i];
} else {
old_nodes[i] = NULL;
- if (new_nodes[i])
- six_unlock_intent(&new_nodes[i]->lock);
}
}
+ for (i = 0; i < nr_new_nodes; i++)
+ six_unlock_intent(&new_nodes[i]->lock);
+
bch2_btree_update_done(as);
bch2_keylist_free(&keylist, NULL);
}
diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c
index 7c88b9d64935..c345262d804b 100644
--- a/fs/bcachefs/btree_io.c
+++ b/fs/bcachefs/btree_io.c
@@ -26,34 +26,33 @@ static void verify_no_dups(struct btree *b,
struct bkey_packed *end)
{
#ifdef CONFIG_BCACHEFS_DEBUG
- struct bkey_packed *k;
+ struct bkey_packed *k, *p;
+
+ if (start == end)
+ return;
- for (k = start; k != end && bkey_next(k) != end; k = bkey_next(k)) {
- struct bkey l = bkey_unpack_key(b, k);
- struct bkey r = bkey_unpack_key(b, bkey_next(k));
+ for (p = start, k = bkey_next_skip_noops(start, end);
+ k != end;
+ p = k, k = bkey_next_skip_noops(k, end)) {
+ struct bkey l = bkey_unpack_key(b, p);
+ struct bkey r = bkey_unpack_key(b, k);
BUG_ON(btree_node_is_extents(b)
? bkey_cmp(l.p, bkey_start_pos(&r)) > 0
: bkey_cmp(l.p, bkey_start_pos(&r)) >= 0);
- //BUG_ON(bkey_cmp_packed(&b->format, k, bkey_next(k)) >= 0);
+ //BUG_ON(bkey_cmp_packed(&b->format, p, k) >= 0);
}
#endif
}
-static void clear_needs_whiteout(struct bset *i)
-{
- struct bkey_packed *k;
-
- for (k = i->start; k != vstruct_last(i); k = bkey_next(k))
- k->needs_whiteout = false;
-}
-
-static void set_needs_whiteout(struct bset *i)
+static void set_needs_whiteout(struct bset *i, int v)
{
struct bkey_packed *k;
- for (k = i->start; k != vstruct_last(i); k = bkey_next(k))
- k->needs_whiteout = true;
+ for (k = i->start;
+ k != vstruct_last(i);
+ k = bkey_next_skip_noops(k, vstruct_last(i)))
+ k->needs_whiteout = v;
}
static void btree_bounce_free(struct bch_fs *c, unsigned order,
@@ -168,7 +167,7 @@ bool __bch2_compact_whiteouts(struct bch_fs *c, struct btree *b,
out = i->start;
for (k = start; k != end; k = n) {
- n = bkey_next(k);
+ n = bkey_next_skip_noops(k, end);
if (bkey_deleted(k) && btree_node_is_extents(b))
continue;
@@ -261,7 +260,7 @@ static bool bch2_drop_whiteouts(struct btree *b)
out = i->start;
for (k = start; k != end; k = n) {
- n = bkey_next(k);
+ n = bkey_next_skip_noops(k, end);
if (!bkey_whiteout(k)) {
bkey_copy(out, k);
@@ -510,7 +509,7 @@ static void bset_encrypt(struct bch_fs *c, struct bset *i, unsigned offset)
bch2_encrypt(c, BSET_CSUM_TYPE(i), nonce, &bn->flags,
bytes);
- nonce = nonce_add(nonce, round_up(bytes, CHACHA20_BLOCK_SIZE));
+ nonce = nonce_add(nonce, round_up(bytes, CHACHA_BLOCK_SIZE));
}
bch2_encrypt(c, BSET_CSUM_TYPE(i), nonce, i->_data,
@@ -680,14 +679,6 @@ static int validate_bset(struct bch_fs *c, struct btree *b,
struct bkey tmp;
const char *invalid;
- if (btree_err_on(!k->u64s,
- BTREE_ERR_FIXABLE, c, b, i,
- "KEY_U64s 0: %zu bytes of metadata lost",
- vstruct_end(i) - (void *) k)) {
- i->u64s = cpu_to_le16((u64 *) k - i->_data);
- break;
- }
-
if (btree_err_on(bkey_next(k) > vstruct_last(i),
BTREE_ERR_FIXABLE, c, b, i,
"key extends past end of bset")) {
@@ -756,7 +747,7 @@ static int validate_bset(struct bch_fs *c, struct btree *b,
prev_pos = u.k->p;
prev = k;
- k = bkey_next(k);
+ k = bkey_next_skip_noops(k, vstruct_last(i));
}
SET_BSET_BIG_ENDIAN(i, CPU_BIG_ENDIAN);
@@ -915,12 +906,12 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry
continue;
}
- k = bkey_next(k);
+ k = bkey_next_skip_noops(k, vstruct_last(i));
}
bch2_bset_build_aux_tree(b, b->set, false);
- set_needs_whiteout(btree_bset_first(b));
+ set_needs_whiteout(btree_bset_first(b), true);
btree_node_reset_sib_u64s(b);
out:
@@ -1425,7 +1416,7 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b,
: bch2_sort_keys(i->start, &sort_iter, false);
le16_add_cpu(&i->u64s, u64s);
- clear_needs_whiteout(i);
+ set_needs_whiteout(i, false);
/* do we have data to write? */
if (b->written && !i->u64s)
@@ -1500,10 +1491,13 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b,
wbio->data = data;
wbio->wbio.order = order;
wbio->wbio.used_mempool = used_mempool;
- wbio->wbio.bio.bi_opf = REQ_OP_WRITE|REQ_META|REQ_FUA;
+ wbio->wbio.bio.bi_opf = REQ_OP_WRITE|REQ_META;
wbio->wbio.bio.bi_end_io = btree_node_write_endio;
wbio->wbio.bio.bi_private = b;
+ if (b->level || !b->written)
+ wbio->wbio.bio.bi_opf |= REQ_FUA;
+
bch2_bio_map(&wbio->wbio.bio, data, sectors_to_write << 9);
/*
@@ -1576,7 +1570,7 @@ bool bch2_btree_post_write_cleanup(struct bch_fs *c, struct btree *b)
}
for_each_bset(b, t)
- set_needs_whiteout(bset(b, t));
+ set_needs_whiteout(bset(b, t), true);
bch2_btree_verify(c, b);
diff --git a/fs/bcachefs/btree_io.h b/fs/bcachefs/btree_io.h
index c817aeed878a..955a80cafae3 100644
--- a/fs/bcachefs/btree_io.h
+++ b/fs/bcachefs/btree_io.h
@@ -62,10 +62,10 @@ bool __bch2_compact_whiteouts(struct bch_fs *, struct btree *, enum compact_mode
static inline unsigned should_compact_bset_lazy(struct btree *b, struct bset_tree *t)
{
- unsigned bset_u64s = le16_to_cpu(bset(b, t)->u64s);
- unsigned dead_u64s = bset_u64s - b->nr.bset_u64s[t - b->set];
+ unsigned total_u64s = bset_u64s(t);
+ unsigned dead_u64s = total_u64s - b->nr.bset_u64s[t - b->set];
- return dead_u64s > 128 && dead_u64s * 3 > bset_u64s;
+ return dead_u64s > 64 && dead_u64s * 3 > total_u64s;
}
static inline bool bch2_maybe_compact_whiteouts(struct bch_fs *c, struct btree *b)
diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c
index 40cd87d73a4f..a4180124d7d1 100644
--- a/fs/bcachefs/btree_iter.c
+++ b/fs/bcachefs/btree_iter.c
@@ -64,21 +64,9 @@ static inline int btree_iter_pos_cmp(struct btree_iter *iter,
/* Btree node locking: */
-/*
- * Updates the saved lock sequence number, so that bch2_btree_node_relock() will
- * succeed:
- */
void bch2_btree_node_unlock_write(struct btree *b, struct btree_iter *iter)
{
- struct btree_iter *linked;
-
- EBUG_ON(iter->l[b->level].b != b);
- EBUG_ON(iter->l[b->level].lock_seq + 1 != b->lock.state.seq);
-
- trans_for_each_iter_with_node(iter->trans, b, linked)
- linked->l[b->level].lock_seq += 2;
-
- six_unlock_write(&b->lock);
+ bch2_btree_node_unlock_write_inlined(b, iter);
}
void __bch2_btree_node_lock_write(struct btree *b, struct btree_iter *iter)
@@ -306,9 +294,7 @@ void bch2_btree_trans_verify_locks(struct btree_trans *trans)
__flatten
static bool bch2_btree_iter_relock(struct btree_iter *iter, bool trace)
{
- return iter->uptodate >= BTREE_ITER_NEED_RELOCK
- ? btree_iter_get_locks(iter, false, trace)
- : true;
+ return btree_iter_get_locks(iter, false, trace);
}
bool __bch2_btree_iter_upgrade(struct btree_iter *iter,
@@ -473,7 +459,7 @@ static void __bch2_btree_iter_verify(struct btree_iter *iter,
}
BUG_ON(iter->uptodate == BTREE_ITER_UPTODATE &&
- (iter->flags & BTREE_ITER_TYPE) == BTREE_ITER_KEYS &&
+ btree_iter_type(iter) == BTREE_ITER_KEYS &&
!bkey_whiteout(&iter->k) &&
bch2_btree_node_iter_end(&l->iter));
}
@@ -513,6 +499,30 @@ static void btree_node_iter_set_set_pos(struct btree_node_iter *iter,
bch2_btree_node_iter_push(iter, b, k, btree_bkey_last(b, t));
}
+static void __bch2_btree_iter_fix_key_modified(struct btree_iter *iter,
+ struct btree *b,
+ struct bkey_packed *where)
+{
+ struct btree_node_iter *node_iter = &iter->l[0].iter;
+
+ if (where == bch2_btree_node_iter_peek_all(node_iter, b)) {
+ bkey_disassemble(b, where, &iter->k);
+ btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK);
+ }
+}
+
+void bch2_btree_iter_fix_key_modified(struct btree_iter *iter,
+ struct btree *b,
+ struct bkey_packed *where)
+{
+ struct btree_iter *linked;
+
+ trans_for_each_iter_with_node(iter->trans, b, linked) {
+ __bch2_btree_iter_fix_key_modified(linked, b, where);
+ __bch2_btree_iter_verify(linked, b);
+ }
+}
+
static void __bch2_btree_node_iter_fix(struct btree_iter *iter,
struct btree *b,
struct btree_node_iter *node_iter,
@@ -833,8 +843,6 @@ void bch2_btree_iter_node_replace(struct btree_iter *iter, struct btree *b)
btree_iter_node_set(linked, b);
}
-
- six_unlock_intent(&b->lock);
}
void bch2_btree_iter_node_drop(struct btree_iter *iter, struct btree *b)
@@ -941,7 +949,7 @@ static void btree_iter_prefetch(struct btree_iter *iter)
btree_node_unlock(iter, iter->level);
}
-static inline int btree_iter_down(struct btree_iter *iter)
+static __always_inline int btree_iter_down(struct btree_iter *iter)
{
struct bch_fs *c = iter->trans->c;
struct btree_iter_level *l = &iter->l[iter->level];
@@ -950,7 +958,7 @@ static inline int btree_iter_down(struct btree_iter *iter)
enum six_lock_type lock_type = __btree_lock_want(iter, level);
BKEY_PADDED(k) tmp;
- BUG_ON(!btree_node_locked(iter, iter->level));
+ EBUG_ON(!btree_node_locked(iter, iter->level));
bch2_bkey_unpack(l->b, &tmp.k,
bch2_btree_node_iter_peek(&l->iter, l->b));
@@ -1010,8 +1018,11 @@ retry_all:
if (unlikely(ret == -EIO)) {
trans->error = true;
- orig_iter->flags |= BTREE_ITER_ERROR;
- orig_iter->l[orig_iter->level].b = BTREE_ITER_NO_NODE_ERROR;
+ if (orig_iter) {
+ orig_iter->flags |= BTREE_ITER_ERROR;
+ orig_iter->l[orig_iter->level].b =
+ BTREE_ITER_NO_NODE_ERROR;
+ }
goto out;
}
@@ -1085,7 +1096,15 @@ static int btree_iter_traverse_one(struct btree_iter *iter)
if (unlikely(iter->level >= BTREE_MAX_DEPTH))
return 0;
- if (bch2_btree_iter_relock(iter, false))
+ /*
+ * if we need interior nodes locked, call btree_iter_relock() to make
+ * sure we walk back up enough that we lock them:
+ */
+ if (iter->uptodate == BTREE_ITER_NEED_RELOCK ||
+ iter->locks_want > 1)
+ bch2_btree_iter_relock(iter, false);
+
+ if (iter->uptodate < BTREE_ITER_NEED_RELOCK)
return 0;
/*
@@ -1152,6 +1171,7 @@ static inline void bch2_btree_iter_checks(struct btree_iter *iter,
EBUG_ON(!!(iter->flags & BTREE_ITER_IS_EXTENTS) !=
(btree_node_type_is_extents(iter->btree_id) &&
type != BTREE_ITER_NODES));
+ EBUG_ON(btree_iter_type(iter) != type);
bch2_btree_trans_verify_locks(iter->trans);
}
@@ -1357,6 +1377,13 @@ static inline struct bkey_s_c btree_iter_peek_uptodate(struct btree_iter *iter)
if (debug_check_iterators(iter->trans->c)) {
struct bkey k = bkey_unpack_key(l->b, _k);
+
+ /*
+ * this flag is internal to the btree code,
+ * we don't care if it doesn't match - if it's now set
+ * it just means the key has been written out to disk:
+ */
+ k.needs_whiteout = iter->k.needs_whiteout;
BUG_ON(memcmp(&k, &iter->k, sizeof(k)));
}
@@ -1436,6 +1463,14 @@ struct bkey_s_c bch2_btree_iter_next(struct btree_iter *iter)
return bch2_btree_iter_peek(iter);
}
+ if (unlikely(bkey_deleted(&iter->k))) {
+ /*
+ * we're currently pointed at a hole, because previously we were
+ * iterating over slots:
+ */
+ return bch2_btree_iter_peek(iter);
+ }
+
do {
bch2_btree_node_iter_advance(&l->iter, l->b);
p = bch2_btree_node_iter_peek_all(&l->iter, l->b);
@@ -1661,7 +1696,7 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
{
int ret;
- bch2_btree_iter_checks(iter, BTREE_ITER_SLOTS);
+ bch2_btree_iter_checks(iter, BTREE_ITER_KEYS);
if (iter->uptodate == BTREE_ITER_UPTODATE)
return btree_iter_peek_uptodate(iter);
@@ -1675,7 +1710,7 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
struct bkey_s_c bch2_btree_iter_next_slot(struct btree_iter *iter)
{
- bch2_btree_iter_checks(iter, BTREE_ITER_SLOTS);
+ bch2_btree_iter_checks(iter, BTREE_ITER_KEYS);
iter->pos = btree_type_successor(iter->btree_id, iter->k.p);
@@ -1729,15 +1764,6 @@ static inline void bch2_btree_iter_init(struct btree_trans *trans,
/* new transactional stuff: */
-int bch2_trans_iter_put(struct btree_trans *trans,
- struct btree_iter *iter)
-{
- int ret = btree_iter_err(iter);
-
- trans->iters_live &= ~(1ULL << iter->idx);
- return ret;
-}
-
static inline void __bch2_trans_iter_free(struct btree_trans *trans,
unsigned idx)
{
@@ -1745,26 +1771,27 @@ static inline void __bch2_trans_iter_free(struct btree_trans *trans,
trans->iters_linked &= ~(1ULL << idx);
trans->iters_live &= ~(1ULL << idx);
trans->iters_touched &= ~(1ULL << idx);
- trans->iters_unlink_on_restart &= ~(1ULL << idx);
- trans->iters_unlink_on_commit &= ~(1ULL << idx);
}
-int bch2_trans_iter_free(struct btree_trans *trans,
- struct btree_iter *iter)
+int bch2_trans_iter_put(struct btree_trans *trans,
+ struct btree_iter *iter)
{
int ret = btree_iter_err(iter);
- __bch2_trans_iter_free(trans, iter->idx);
+ if (!(trans->iters_touched & (1ULL << iter->idx)) &&
+ !(iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT))
+ __bch2_trans_iter_free(trans, iter->idx);
+
+ trans->iters_live &= ~(1ULL << iter->idx);
return ret;
}
-int bch2_trans_iter_free_on_commit(struct btree_trans *trans,
- struct btree_iter *iter)
+int bch2_trans_iter_free(struct btree_trans *trans,
+ struct btree_iter *iter)
{
- int ret = btree_iter_err(iter);
+ trans->iters_touched &= ~(1ULL << iter->idx);
- trans->iters_unlink_on_commit |= 1ULL << iter->idx;
- return ret;
+ return bch2_trans_iter_put(trans, iter);
}
static int bch2_trans_realloc_iters(struct btree_trans *trans,
@@ -1830,7 +1857,7 @@ success:
return 0;
}
-static int btree_trans_iter_alloc(struct btree_trans *trans)
+static struct btree_iter *btree_trans_iter_alloc(struct btree_trans *trans)
{
unsigned idx = __ffs64(~trans->iters_linked);
@@ -1838,9 +1865,27 @@ static int btree_trans_iter_alloc(struct btree_trans *trans)
goto got_slot;
if (trans->nr_iters == trans->size) {
- int ret = bch2_trans_realloc_iters(trans, trans->size * 2);
+ int ret;
+
+ if (trans->nr_iters >= BTREE_ITER_MAX) {
+ struct btree_iter *iter;
+
+ trans_for_each_iter(trans, iter) {
+ pr_err("iter: btree %s pos %llu:%llu%s%s%s",
+ bch2_btree_ids[iter->btree_id],
+ iter->pos.inode,
+ iter->pos.offset,
+ (trans->iters_live & (1ULL << iter->idx)) ? " live" : "",
+ (trans->iters_touched & (1ULL << iter->idx)) ? " touched" : "",
+ iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT ? " keep" : "");
+ }
+
+ panic("trans iter oveflow\n");
+ }
+
+ ret = bch2_trans_realloc_iters(trans, trans->size * 2);
if (ret)
- return ret;
+ return ERR_PTR(ret);
}
idx = trans->nr_iters++;
@@ -1850,71 +1895,97 @@ static int btree_trans_iter_alloc(struct btree_trans *trans)
got_slot:
BUG_ON(trans->iters_linked & (1ULL << idx));
trans->iters_linked |= 1ULL << idx;
- return idx;
+ return &trans->iters[idx];
+}
+
+static inline void btree_iter_copy(struct btree_iter *dst,
+ struct btree_iter *src)
+{
+ unsigned i, idx = dst->idx;
+
+ *dst = *src;
+ dst->idx = idx;
+
+ for (i = 0; i < BTREE_MAX_DEPTH; i++)
+ if (btree_node_locked(dst, i))
+ six_lock_increment(&dst->l[i].b->lock,
+ __btree_lock_want(dst, i));
+}
+
+static inline struct bpos bpos_diff(struct bpos l, struct bpos r)
+{
+ if (bkey_cmp(l, r) > 0)
+ swap(l, r);
+
+ return POS(r.inode - l.inode, r.offset - l.offset);
}
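An illustrative aside (not part of the diff) on how the lookup below uses this helper; the positions are made-up values:

	/*
	 * bpos_diff(POS(1, 40), POS(1, 100)) == POS(0, 60): a componentwise
	 * distance between two positions.  __btree_trans_get_iter() keeps the
	 * candidate whose bpos_diff() to the requested pos compares smallest,
	 * i.e. it reuses the existing iterator closest to where the caller
	 * wants to be.
	 */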
static struct btree_iter *__btree_trans_get_iter(struct btree_trans *trans,
unsigned btree_id, struct bpos pos,
- unsigned flags, u64 iter_id)
+ unsigned flags)
{
- struct btree_iter *iter;
- int idx;
+ struct btree_iter *iter, *best = NULL;
BUG_ON(trans->nr_iters > BTREE_ITER_MAX);
- for (idx = 0; idx < trans->nr_iters; idx++) {
- if (!(trans->iters_linked & (1ULL << idx)))
+ trans_for_each_iter(trans, iter) {
+ if (btree_iter_type(iter) != (flags & BTREE_ITER_TYPE))
continue;
- iter = &trans->iters[idx];
- if (iter_id
- ? iter->id == iter_id
- : (iter->btree_id == btree_id &&
- !bkey_cmp(iter->pos, pos)))
- goto found;
+ if (iter->btree_id != btree_id)
+ continue;
+
+ if (best &&
+ bkey_cmp(bpos_diff(best->pos, pos),
+ bpos_diff(iter->pos, pos)) < 0)
+ continue;
+
+ best = iter;
}
- idx = -1;
-found:
- if (idx < 0) {
- idx = btree_trans_iter_alloc(trans);
- if (idx < 0)
- return ERR_PTR(idx);
- iter = &trans->iters[idx];
- iter->id = iter_id;
+ if (!best) {
+ iter = btree_trans_iter_alloc(trans);
+ if (IS_ERR(iter))
+ return iter;
bch2_btree_iter_init(trans, iter, btree_id, pos, flags);
- } else {
- iter = &trans->iters[idx];
-
- iter->flags &= ~(BTREE_ITER_INTENT|BTREE_ITER_PREFETCH);
- iter->flags |= flags & (BTREE_ITER_INTENT|BTREE_ITER_PREFETCH);
+ } else if ((trans->iters_live & (1ULL << best->idx)) ||
+ (best->flags & BTREE_ITER_KEEP_UNTIL_COMMIT)) {
+ iter = btree_trans_iter_alloc(trans);
+ if (IS_ERR(iter))
+ return iter;
- if ((iter->flags & BTREE_ITER_INTENT) &&
- !bch2_btree_iter_upgrade(iter, 1)) {
- trace_trans_restart_upgrade(trans->ip);
- return ERR_PTR(-EINTR);
- }
+ btree_iter_copy(iter, best);
+ } else {
+ iter = best;
}
- BUG_ON(iter->btree_id != btree_id);
- BUG_ON(trans->iters_live & (1ULL << idx));
- trans->iters_live |= 1ULL << idx;
- trans->iters_touched |= 1ULL << idx;
+ iter->flags &= ~BTREE_ITER_KEEP_UNTIL_COMMIT;
+ iter->flags &= ~(BTREE_ITER_SLOTS|BTREE_ITER_INTENT|BTREE_ITER_PREFETCH);
+ iter->flags |= flags & (BTREE_ITER_SLOTS|BTREE_ITER_INTENT|BTREE_ITER_PREFETCH);
+
+ if (iter->flags & BTREE_ITER_INTENT)
+ bch2_btree_iter_upgrade(iter, 1);
+ else
+ bch2_btree_iter_downgrade(iter);
BUG_ON(iter->btree_id != btree_id);
BUG_ON((iter->flags ^ flags) & BTREE_ITER_TYPE);
+ BUG_ON(iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT);
+ BUG_ON(trans->iters_live & (1ULL << iter->idx));
+
+ trans->iters_live |= 1ULL << iter->idx;
+ trans->iters_touched |= 1ULL << iter->idx;
return iter;
}
-struct btree_iter *__bch2_trans_get_iter(struct btree_trans *trans,
- enum btree_id btree_id,
- struct bpos pos, unsigned flags,
- u64 iter_id)
+struct btree_iter *bch2_trans_get_iter(struct btree_trans *trans,
+ enum btree_id btree_id,
+ struct bpos pos, unsigned flags)
{
struct btree_iter *iter =
- __btree_trans_get_iter(trans, btree_id, pos, flags, iter_id);
+ __btree_trans_get_iter(trans, btree_id, pos, flags);
if (!IS_ERR(iter))
bch2_btree_iter_set_pos(iter, pos);
@@ -1930,7 +2001,7 @@ struct btree_iter *bch2_trans_get_node_iter(struct btree_trans *trans,
{
struct btree_iter *iter =
__btree_trans_get_iter(trans, btree_id, pos,
- flags|BTREE_ITER_NODES, 0);
+ flags|BTREE_ITER_NODES);
unsigned i;
BUG_ON(IS_ERR(iter));
@@ -1950,28 +2021,22 @@ struct btree_iter *bch2_trans_copy_iter(struct btree_trans *trans,
struct btree_iter *src)
{
struct btree_iter *iter;
- int i, idx;
-
- idx = btree_trans_iter_alloc(trans);
- if (idx < 0)
- return ERR_PTR(idx);
- trans->iters_live |= 1ULL << idx;
- trans->iters_touched |= 1ULL << idx;
- trans->iters_unlink_on_restart |= 1ULL << idx;
+ iter = btree_trans_iter_alloc(trans);
+ if (IS_ERR(iter))
+ return iter;
- iter = &trans->iters[idx];
+ btree_iter_copy(iter, src);
- memcpy(&iter->trans,
- &src->trans,
- (void *) &iter[1] - (void *) &iter->trans);
-
- for (i = 0; i < BTREE_MAX_DEPTH; i++)
- if (btree_node_locked(iter, i))
- six_lock_increment(&iter->l[i].b->lock,
- __btree_lock_want(iter, i));
+ trans->iters_live |= 1ULL << iter->idx;
+ /*
+ * Don't mark it as touched, we don't need to preserve this iter since
+ * it's cheap to copy it again:
+ */
+ trans->iters_touched &= ~(1ULL << iter->idx);
+ iter->flags &= ~BTREE_ITER_KEEP_UNTIL_COMMIT;
- return &trans->iters[idx];
+ return iter;
}
static int bch2_trans_preload_mem(struct btree_trans *trans, size_t size)
@@ -2010,10 +2075,11 @@ void *bch2_trans_kmalloc(struct btree_trans *trans, size_t size)
return p;
}
-inline void bch2_trans_unlink_iters(struct btree_trans *trans, u64 iters)
+inline void bch2_trans_unlink_iters(struct btree_trans *trans)
{
- iters &= trans->iters_linked;
- iters &= ~trans->iters_live;
+ u64 iters = trans->iters_linked &
+ ~trans->iters_touched &
+ ~trans->iters_live;
while (iters) {
unsigned idx = __ffs64(iters);
@@ -2023,33 +2089,24 @@ inline void bch2_trans_unlink_iters(struct btree_trans *trans, u64 iters)
}
}
-void bch2_trans_begin(struct btree_trans *trans)
+void bch2_trans_reset(struct btree_trans *trans, unsigned flags)
{
- u64 iters_to_unlink;
+ struct btree_iter *iter;
- /*
- * On transaction restart, the transaction isn't required to allocate
- * all the same iterators it on the last iteration:
- *
- * Unlink any iterators it didn't use this iteration, assuming it got
- * further (allocated an iter with a higher idx) than where the iter
- * was originally allocated:
- */
- iters_to_unlink = ~trans->iters_live &
- ((1ULL << fls64(trans->iters_live)) - 1);
+ trans_for_each_iter(trans, iter)
+ iter->flags &= ~BTREE_ITER_KEEP_UNTIL_COMMIT;
- iters_to_unlink |= trans->iters_unlink_on_restart;
- iters_to_unlink |= trans->iters_unlink_on_commit;
+ bch2_trans_unlink_iters(trans);
- trans->iters_live = 0;
+ if (flags & TRANS_RESET_ITERS)
+ trans->iters_live = 0;
- bch2_trans_unlink_iters(trans, iters_to_unlink);
+ trans->iters_touched &= trans->iters_live;
- trans->iters_touched = 0;
- trans->iters_unlink_on_restart = 0;
- trans->iters_unlink_on_commit = 0;
trans->nr_updates = 0;
- trans->mem_top = 0;
+
+ if (flags & TRANS_RESET_MEM)
+ trans->mem_top = 0;
bch2_btree_iter_traverse_all(trans);
}
diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h
index e4967215e1d9..4c5032222319 100644
--- a/fs/bcachefs/btree_iter.h
+++ b/fs/bcachefs/btree_iter.h
@@ -48,6 +48,11 @@ static inline int btree_iter_err(const struct btree_iter *iter)
/* Iterate over iters within a transaction: */
+#define trans_for_each_iter_all(_trans, _iter) \
+ for (_iter = (_trans)->iters; \
+ _iter < (_trans)->iters + (_trans)->nr_iters; \
+ _iter++)
+
static inline struct btree_iter *
__trans_next_iter(struct btree_trans *trans, unsigned idx)
{
@@ -99,6 +104,8 @@ static inline void bch2_btree_iter_verify(struct btree_iter *iter,
static inline void bch2_btree_trans_verify_locks(struct btree_trans *iter) {}
#endif
+void bch2_btree_iter_fix_key_modified(struct btree_iter *, struct btree *,
+ struct bkey_packed *);
void bch2_btree_node_iter_fix(struct btree_iter *, struct btree *,
struct btree_node_iter *, struct bkey_packed *,
unsigned, unsigned);
@@ -246,6 +253,11 @@ static inline struct bkey_s_c __bch2_btree_iter_next(struct btree_iter *iter,
: bch2_btree_iter_next(iter);
}
+static inline int bkey_err(struct bkey_s_c k)
+{
+ return PTR_ERR_OR_ZERO(k.k);
+}
+
#define for_each_btree_key(_trans, _iter, _btree_id, \
_start, _flags, _k, _ret) \
for ((_ret) = PTR_ERR_OR_ZERO((_iter) = \
@@ -257,57 +269,39 @@ static inline struct bkey_s_c __bch2_btree_iter_next(struct btree_iter *iter,
(_ret) = PTR_ERR_OR_ZERO(((_k) = \
__bch2_btree_iter_next(_iter, _flags)).k))
-#define for_each_btree_key_continue(_iter, _flags, _k) \
+#define for_each_btree_key_continue(_iter, _flags, _k, _ret) \
for ((_k) = __bch2_btree_iter_peek(_iter, _flags); \
- !IS_ERR_OR_NULL((_k).k); \
+ !((_ret) = bkey_err(_k)) && (_k).k; \
(_k) = __bch2_btree_iter_next(_iter, _flags))
-static inline int bkey_err(struct bkey_s_c k)
-{
- return PTR_ERR_OR_ZERO(k.k);
-}
-
/* new multiple iterator interface: */
int bch2_trans_iter_put(struct btree_trans *, struct btree_iter *);
int bch2_trans_iter_free(struct btree_trans *, struct btree_iter *);
-int bch2_trans_iter_free_on_commit(struct btree_trans *, struct btree_iter *);
-void bch2_trans_unlink_iters(struct btree_trans *, u64);
+void bch2_trans_unlink_iters(struct btree_trans *);
-struct btree_iter *__bch2_trans_get_iter(struct btree_trans *, enum btree_id,
- struct bpos, unsigned, u64);
+struct btree_iter *bch2_trans_get_iter(struct btree_trans *, enum btree_id,
+ struct bpos, unsigned);
struct btree_iter *bch2_trans_copy_iter(struct btree_trans *,
struct btree_iter *);
+struct btree_iter *bch2_trans_get_node_iter(struct btree_trans *,
+ enum btree_id, struct bpos,
+ unsigned, unsigned, unsigned);
-static __always_inline u64 __btree_iter_id(void)
-{
- u64 ret = 0;
+#define TRANS_RESET_ITERS (1 << 0)
+#define TRANS_RESET_MEM (1 << 1)
- ret <<= 32;
- ret |= _RET_IP_ & U32_MAX;
- ret <<= 32;
- ret |= _THIS_IP_ & U32_MAX;
- return ret;
-}
+void bch2_trans_reset(struct btree_trans *, unsigned);
-static __always_inline struct btree_iter *
-bch2_trans_get_iter(struct btree_trans *trans, enum btree_id btree_id,
- struct bpos pos, unsigned flags)
+static inline void bch2_trans_begin(struct btree_trans *trans)
{
- return __bch2_trans_get_iter(trans, btree_id, pos, flags,
- __btree_iter_id());
+ return bch2_trans_reset(trans, TRANS_RESET_ITERS|TRANS_RESET_MEM);
}
-struct btree_iter *bch2_trans_get_node_iter(struct btree_trans *,
- enum btree_id, struct bpos,
- unsigned, unsigned, unsigned);
-
-void bch2_trans_begin(struct btree_trans *);
-
static inline void bch2_trans_begin_updates(struct btree_trans *trans)
{
- trans->nr_updates = 0;
+ return bch2_trans_reset(trans, TRANS_RESET_MEM);
}
void *bch2_trans_kmalloc(struct btree_trans *, size_t);
diff --git a/fs/bcachefs/btree_locking.h b/fs/bcachefs/btree_locking.h
index ea07ba19c5dc..aaad2d289e79 100644
--- a/fs/bcachefs/btree_locking.h
+++ b/fs/bcachefs/btree_locking.h
@@ -203,6 +203,24 @@ static inline bool bch2_btree_node_relock(struct btree_iter *iter,
__bch2_btree_node_relock(iter, level);
}
+/*
+ * Updates the saved lock sequence number, so that bch2_btree_node_relock() will
+ * succeed:
+ */
+static inline void
+bch2_btree_node_unlock_write_inlined(struct btree *b, struct btree_iter *iter)
+{
+ struct btree_iter *linked;
+
+ EBUG_ON(iter->l[b->level].b != b);
+ EBUG_ON(iter->l[b->level].lock_seq + 1 != b->lock.state.seq);
+
+ trans_for_each_iter_with_node(iter->trans, b, linked)
+ linked->l[b->level].lock_seq += 2;
+
+ six_unlock_write(&b->lock);
+}
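A brief worked note on the `+= 2` above (an editorial sketch, assuming the six-lock sequence counter advances once when the write lock is taken and once more when it is released, which is consistent with the EBUG_ON above):

	/*
	 * saved lock_seq when the node was read/intent locked:    N
	 * after bch2_btree_node_lock_write():                     seq == N + 1
	 * after six_unlock_write() below:                         seq == N + 2
	 *
	 * advancing every linked iterator's saved lock_seq by 2 therefore lets
	 * bch2_btree_node_relock() see a matching sequence number and succeed
	 * without retraversing.
	 */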
+
void bch2_btree_node_unlock_write(struct btree *, struct btree_iter *);
void __bch2_btree_node_lock_write(struct btree *, struct btree_iter *);
@@ -212,7 +230,7 @@ static inline void bch2_btree_node_lock_write(struct btree *b, struct btree_iter
EBUG_ON(iter->l[b->level].b != b);
EBUG_ON(iter->l[b->level].lock_seq != b->lock.state.seq);
- if (!six_trylock_write(&b->lock))
+ if (unlikely(!six_trylock_write(&b->lock)))
__bch2_btree_node_lock_write(b, iter);
}
diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h
index b0da09630911..efa68bb578ab 100644
--- a/fs/bcachefs/btree_types.h
+++ b/fs/bcachefs/btree_types.h
@@ -180,20 +180,21 @@ struct btree_node_iter {
enum btree_iter_type {
BTREE_ITER_KEYS,
- BTREE_ITER_SLOTS,
BTREE_ITER_NODES,
};
#define BTREE_ITER_TYPE ((1 << 2) - 1)
-#define BTREE_ITER_INTENT (1 << 2)
-#define BTREE_ITER_PREFETCH (1 << 3)
+#define BTREE_ITER_SLOTS (1 << 2)
+#define BTREE_ITER_INTENT (1 << 3)
+#define BTREE_ITER_PREFETCH (1 << 4)
+#define BTREE_ITER_KEEP_UNTIL_COMMIT (1 << 5)
/*
* Used in bch2_btree_iter_traverse(), to indicate whether we're searching for
* @pos or the first key strictly greater than @pos
*/
-#define BTREE_ITER_IS_EXTENTS (1 << 4)
-#define BTREE_ITER_ERROR (1 << 5)
+#define BTREE_ITER_IS_EXTENTS (1 << 6)
+#define BTREE_ITER_ERROR (1 << 7)
enum btree_iter_uptodate {
BTREE_ITER_UPTODATE = 0,
@@ -234,33 +235,16 @@ struct btree_iter {
* bch2_btree_iter_next_slot() can correctly advance pos.
*/
struct bkey k;
-
- u64 id;
};
-struct deferred_update {
- struct journal_preres res;
- struct journal_entry_pin journal;
-
- spinlock_t lock;
- unsigned dirty:1;
-
- u8 allocated_u64s;
- enum btree_id btree_id;
-
- /* must be last: */
- struct bkey_i k;
-};
+static inline enum btree_iter_type btree_iter_type(struct btree_iter *iter)
+{
+ return iter->flags & BTREE_ITER_TYPE;
+}
struct btree_insert_entry {
struct bkey_i *k;
-
- union {
struct btree_iter *iter;
- struct deferred_update *d;
- };
-
- bool deferred;
};
#define BTREE_ITER_MAX 64
@@ -268,13 +252,10 @@ struct btree_insert_entry {
struct btree_trans {
struct bch_fs *c;
unsigned long ip;
- u64 commit_start;
u64 iters_linked;
u64 iters_live;
u64 iters_touched;
- u64 iters_unlink_on_restart;
- u64 iters_unlink_on_commit;
u8 nr_iters;
u8 nr_updates;
@@ -298,12 +279,11 @@ struct btree_trans {
struct disk_reservation *disk_res;
unsigned flags;
unsigned journal_u64s;
+ struct replicas_delta_list *fs_usage_deltas;
struct btree_iter iters_onstack[2];
struct btree_insert_entry updates_onstack[6];
u8 updates_sorted_onstack[6];
-
- struct replicas_delta_list *fs_usage_deltas;
};
#define BTREE_FLAG(flag) \
@@ -435,6 +415,12 @@ static inline unsigned btree_bkey_first_offset(const struct bset_tree *t)
__btree_node_offset_to_key(_b, (_t)->end_offset); \
})
+static inline unsigned bset_u64s(struct bset_tree *t)
+{
+ return t->end_offset - t->data_offset -
+ sizeof(struct bset) / sizeof(u64);
+}
+
static inline unsigned bset_byte_offset(struct btree *b, void *i)
{
return i - (void *) b->data;
@@ -475,19 +461,22 @@ static inline bool btree_node_is_extents(struct btree *b)
return btree_node_type_is_extents(btree_node_type(b));
}
+#define BTREE_NODE_TYPE_HAS_TRIGGERS \
+ ((1U << BKEY_TYPE_EXTENTS)| \
+ (1U << BKEY_TYPE_ALLOC)| \
+ (1U << BKEY_TYPE_INODES)| \
+ (1U << BKEY_TYPE_REFLINK)| \
+ (1U << BKEY_TYPE_EC)| \
+ (1U << BKEY_TYPE_BTREE))
+
+#define BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS \
+ ((1U << BKEY_TYPE_EXTENTS)| \
+ (1U << BKEY_TYPE_INODES)| \
+ (1U << BKEY_TYPE_REFLINK))
+
static inline bool btree_node_type_needs_gc(enum btree_node_type type)
{
- switch (type) {
- case BKEY_TYPE_ALLOC:
- case BKEY_TYPE_BTREE:
- case BKEY_TYPE_EXTENTS:
- case BKEY_TYPE_INODES:
- case BKEY_TYPE_EC:
- case BKEY_TYPE_REFLINK:
- return true;
- default:
- return false;
- }
+ return BTREE_NODE_TYPE_HAS_TRIGGERS & (1U << type);
}
struct btree_root {
diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h
index 36e34b3d9213..ad8cbf3fb778 100644
--- a/fs/bcachefs/btree_update.h
+++ b/fs/bcachefs/btree_update.h
@@ -15,24 +15,6 @@ bool bch2_btree_bset_insert_key(struct btree_iter *, struct btree *,
void bch2_btree_journal_key(struct btree_trans *, struct btree_iter *,
struct bkey_i *);
-void bch2_deferred_update_free(struct bch_fs *,
- struct deferred_update *);
-struct deferred_update *
-bch2_deferred_update_alloc(struct bch_fs *, enum btree_id, unsigned);
-
-#define BTREE_INSERT_ENTRY(_iter, _k) \
- ((struct btree_insert_entry) { \
- .iter = (_iter), \
- .k = (_k), \
- })
-
-#define BTREE_INSERT_DEFERRED(_d, _k) \
- ((struct btree_insert_entry) { \
- .k = (_k), \
- .d = (_d), \
- .deferred = true, \
- })
-
enum {
__BTREE_INSERT_ATOMIC,
__BTREE_INSERT_NOUNLOCK,
@@ -45,7 +27,6 @@ enum {
__BTREE_INSERT_JOURNAL_RESERVED,
__BTREE_INSERT_NOMARK_OVERWRITES,
__BTREE_INSERT_NOMARK,
- __BTREE_INSERT_MARK_INMEM,
__BTREE_INSERT_NO_CLEAR_REPLICAS,
__BTREE_INSERT_BUCKET_INVALIDATE,
__BTREE_INSERT_NOWAIT,
@@ -86,9 +67,6 @@ enum {
/* Don't call mark new key at all: */
#define BTREE_INSERT_NOMARK (1 << __BTREE_INSERT_NOMARK)
-/* Don't mark transactionally: */
-#define BTREE_INSERT_MARK_INMEM (1 << __BTREE_INSERT_MARK_INMEM)
-
#define BTREE_INSERT_NO_CLEAR_REPLICAS (1 << __BTREE_INSERT_NO_CLEAR_REPLICAS)
#define BTREE_INSERT_BUCKET_INVALIDATE (1 << __BTREE_INSERT_BUCKET_INVALIDATE)
@@ -115,16 +93,42 @@ int bch2_btree_node_rewrite(struct bch_fs *c, struct btree_iter *,
int bch2_btree_node_update_key(struct bch_fs *, struct btree_iter *,
struct btree *, struct bkey_i_btree_ptr *);
-int bch2_trans_commit(struct btree_trans *,
- struct disk_reservation *,
- u64 *, unsigned);
+int __bch2_trans_commit(struct btree_trans *);
+
+/**
+ * bch2_trans_commit - insert keys at given iterator positions
+ *
+ * This is the main entry point for btree updates.
+ *
+ * Return values:
+ * -EINTR: locking changed, this function should be called again. Only returned
+ * if passed BTREE_INSERT_ATOMIC.
+ * -EROFS: filesystem read only
+ * -EIO: journal or btree node IO error
+ */
+static inline int bch2_trans_commit(struct btree_trans *trans,
+ struct disk_reservation *disk_res,
+ u64 *journal_seq,
+ unsigned flags)
+{
+ trans->disk_res = disk_res;
+ trans->journal_seq = journal_seq;
+ trans->flags = flags;
+
+ return __bch2_trans_commit(trans);
+}
static inline void bch2_trans_update(struct btree_trans *trans,
- struct btree_insert_entry entry)
+ struct btree_iter *iter,
+ struct bkey_i *k)
{
EBUG_ON(trans->nr_updates >= trans->nr_iters + 4);
- trans->updates[trans->nr_updates++] = entry;
+ iter->flags |= BTREE_ITER_KEEP_UNTIL_COMMIT;
+
+ trans->updates[trans->nr_updates++] = (struct btree_insert_entry) {
+ .iter = iter, .k = k
+ };
}
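For reference, a minimal sketch (not part of the diff) of the calling pattern the comment above describes, using only helpers declared in this patch; the wrapper name and its error handling are illustrative assumptions:

static int example_insert_key(struct btree_trans *trans, enum btree_id id,
			      struct bkey_i *k, u64 *journal_seq)
{
	struct btree_iter *iter;

	/* reuse or allocate an iterator at the key's start position: */
	iter = bch2_trans_get_iter(trans, id, bkey_start_pos(&k->k),
				   BTREE_ITER_INTENT);
	if (IS_ERR(iter))
		return PTR_ERR(iter);

	/* queue the update; this pins the iterator until commit: */
	bch2_trans_update(trans, iter, k);

	/*
	 * -EINTR (only returned with BTREE_INSERT_ATOMIC) means locking
	 * changed and the caller should restart the transaction and retry:
	 */
	return bch2_trans_commit(trans, NULL, journal_seq,
				 BTREE_INSERT_ATOMIC|BTREE_INSERT_NOFAIL);
}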
#define bch2_trans_do(_c, _journal_seq, _flags, _do) \
@@ -145,23 +149,9 @@ static inline void bch2_trans_update(struct btree_trans *trans,
_ret; \
})
-#define __trans_next_update(_trans, _i, _filter) \
-({ \
- while ((_i) < (_trans)->updates + (_trans->nr_updates) && !(_filter))\
- (_i)++; \
- \
- (_i) < (_trans)->updates + (_trans->nr_updates); \
-})
-
-#define __trans_for_each_update(_trans, _i, _filter) \
+#define trans_for_each_update(_trans, _i) \
for ((_i) = (_trans)->updates; \
- __trans_next_update(_trans, _i, _filter); \
+ (_i) < (_trans)->updates + (_trans)->nr_updates; \
(_i)++)
-#define trans_for_each_update(trans, i) \
- __trans_for_each_update(trans, i, true)
-
-#define trans_for_each_update_iter(trans, i) \
- __trans_for_each_update(trans, i, !(i)->deferred)
-
#endif /* _BCACHEFS_BTREE_UPDATE_H */
diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c
index 6813eddd26f5..f8a30cb34750 100644
--- a/fs/bcachefs/btree_update_interior.c
+++ b/fs/bcachefs/btree_update_interior.c
@@ -79,9 +79,7 @@ void __bch2_btree_calc_format(struct bkey_format_state *s, struct btree *b)
bch2_bkey_format_add_pos(s, b->data->min_key);
for_each_bset(b, t)
- for (k = btree_bkey_first(b, t);
- k != btree_bkey_last(b, t);
- k = bkey_next(k))
+ bset_tree_for_each_key(b, t, k)
if (!bkey_whiteout(k)) {
uk = bkey_unpack_key(b, k);
bch2_bkey_format_add_key(s, &uk);
@@ -1240,7 +1238,9 @@ static struct btree *__btree_split_node(struct btree_update *as,
*/
k = set1->start;
while (1) {
- if (bkey_next(k) == vstruct_last(set1))
+ struct bkey_packed *n = bkey_next_skip_noops(k, vstruct_last(set1));
+
+ if (n == vstruct_last(set1))
break;
if (k->_data - set1->_data >= (le16_to_cpu(set1->u64s) * 3) / 5)
break;
@@ -1251,7 +1251,7 @@ static struct btree *__btree_split_node(struct btree_update *as,
nr_unpacked++;
prev = k;
- k = bkey_next(k);
+ k = n;
}
BUG_ON(!prev);
@@ -1315,7 +1315,7 @@ static void btree_split_insert_keys(struct btree_update *as, struct btree *b,
{
struct btree_node_iter node_iter;
struct bkey_i *k = bch2_keylist_front(keys);
- struct bkey_packed *p;
+ struct bkey_packed *src, *dst, *n;
struct bset *i;
BUG_ON(btree_node_type(b) != BKEY_TYPE_BTREE);
@@ -1340,16 +1340,18 @@ static void btree_split_insert_keys(struct btree_update *as, struct btree *b,
* for the pivot:
*/
i = btree_bset_first(b);
- p = i->start;
- while (p != vstruct_last(i))
- if (bkey_deleted(p)) {
- le16_add_cpu(&i->u64s, -p->u64s);
- set_btree_bset_end(b, b->set);
- memmove_u64s_down(p, bkey_next(p),
- (u64 *) vstruct_last(i) -
- (u64 *) p);
- } else
- p = bkey_next(p);
+ src = dst = i->start;
+ while (src != vstruct_last(i)) {
+ n = bkey_next_skip_noops(src, vstruct_last(i));
+ if (!bkey_deleted(src)) {
+ memmove_u64s_down(dst, src, src->u64s);
+ dst = bkey_next(dst);
+ }
+ src = n;
+ }
+
+ i->u64s = cpu_to_le16((u64 *) dst - i->_data);
+ set_btree_bset_end(b, b->set);
BUG_ON(b->nsets != 1 ||
b->nr.live_u64s != le16_to_cpu(btree_bset_first(b)->u64s));
@@ -1446,8 +1448,20 @@ static void btree_split(struct btree_update *as, struct btree *b,
bch2_btree_iter_node_replace(iter, n2);
bch2_btree_iter_node_replace(iter, n1);
+ /*
+ * The old node must be freed (in memory) _before_ unlocking the new
+ * nodes - else another thread could re-acquire a read lock on the old
+ * node after another thread has locked and updated the new node, thus
+ * seeing stale data:
+ */
bch2_btree_node_free_inmem(c, b, iter);
+ if (n3)
+ six_unlock_intent(&n3->lock);
+ if (n2)
+ six_unlock_intent(&n2->lock);
+ six_unlock_intent(&n1->lock);
+
bch2_btree_trans_verify_locks(iter->trans);
bch2_time_stats_update(&c->times[BCH_TIME_btree_node_split],
@@ -1761,6 +1775,8 @@ retry:
bch2_btree_node_free_inmem(c, b, iter);
bch2_btree_node_free_inmem(c, m, iter);
+ six_unlock_intent(&n->lock);
+
bch2_btree_update_done(as);
if (!(flags & BTREE_INSERT_GC_LOCK_HELD))
@@ -1855,6 +1871,7 @@ static int __btree_node_rewrite(struct bch_fs *c, struct btree_iter *iter,
bch2_btree_iter_node_drop(iter, b);
bch2_btree_iter_node_replace(iter, n);
bch2_btree_node_free_inmem(c, b, iter);
+ six_unlock_intent(&n->lock);
bch2_btree_update_done(as);
return 0;
@@ -2172,6 +2189,7 @@ void bch2_btree_root_alloc(struct bch_fs *c, enum btree_id id)
bch2_bset_init_first(b, &b->data->keys);
bch2_btree_build_aux_trees(b);
+ b->data->flags = 0;
b->data->min_key = POS_MIN;
b->data->max_key = POS_MAX;
b->data->format = bch2_btree_calc_format(b);
diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h
index e5156e908110..c5a0ab5d7bb8 100644
--- a/fs/bcachefs/btree_update_interior.h
+++ b/fs/bcachefs/btree_update_interior.h
@@ -284,17 +284,17 @@ static inline unsigned btree_write_set_buffer(struct btree *b)
static inline struct btree_node_entry *want_new_bset(struct bch_fs *c,
struct btree *b)
{
- struct bset *i = btree_bset_last(b);
+ struct bset_tree *t = bset_tree_last(b);
struct btree_node_entry *bne = max(write_block(b),
(void *) btree_bkey_last(b, bset_tree_last(b)));
ssize_t remaining_space =
__bch_btree_u64s_remaining(c, b, &bne->keys.start[0]);
- if (unlikely(bset_written(b, i))) {
+ if (unlikely(bset_written(b, bset(b, t)))) {
if (remaining_space > (ssize_t) (block_bytes(c) >> 3))
return bne;
} else {
- if (unlikely(vstruct_bytes(i) > btree_write_set_buffer(b)) &&
+ if (unlikely(bset_u64s(t) * sizeof(u64) > btree_write_set_buffer(b)) &&
remaining_space > (ssize_t) (btree_write_set_buffer(b) >> 3))
return bne;
}
diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c
index 0d32fb8726c7..d37a95299240 100644
--- a/fs/bcachefs/btree_update_leaf.c
+++ b/fs/bcachefs/btree_update_leaf.c
@@ -10,27 +10,22 @@
#include "buckets.h"
#include "debug.h"
#include "error.h"
-#include "extents.h"
+#include "extent_update.h"
#include "journal.h"
#include "journal_reclaim.h"
#include "keylist.h"
#include "replicas.h"
+#include <linux/prefetch.h>
#include <linux/sort.h>
#include <trace/events/bcachefs.h>
static inline bool same_leaf_as_prev(struct btree_trans *trans,
- unsigned sorted_idx)
+ unsigned idx)
{
- struct btree_insert_entry *i = trans->updates +
- trans->updates_sorted[sorted_idx];
- struct btree_insert_entry *prev = sorted_idx
- ? trans->updates + trans->updates_sorted[sorted_idx - 1]
- : NULL;
-
- return !i->deferred &&
- prev &&
- i->iter->l[0].b == prev->iter->l[0].b;
+ return idx &&
+ trans->updates[trans->updates_sorted[idx]].iter->l[0].b ==
+ trans->updates[trans->updates_sorted[idx - 1]].iter->l[0].b;
}
#define trans_for_each_update_sorted(_trans, _i, _iter) \
@@ -44,7 +39,7 @@ inline void bch2_btree_node_lock_for_insert(struct bch_fs *c, struct btree *b,
{
bch2_btree_node_lock_write(b, iter);
- if (btree_node_just_written(b) &&
+ if (unlikely(btree_node_just_written(b)) &&
bch2_btree_post_write_cleanup(c, b))
bch2_btree_iter_reinit_node(iter, b);
@@ -56,30 +51,6 @@ inline void bch2_btree_node_lock_for_insert(struct bch_fs *c, struct btree *b,
bch2_btree_init_next(c, b, iter);
}
-static void btree_trans_lock_write(struct btree_trans *trans, bool lock)
-{
- struct bch_fs *c = trans->c;
- struct btree_insert_entry *i;
- unsigned iter;
-
- trans_for_each_update_sorted(trans, i, iter) {
- if (same_leaf_as_prev(trans, iter))
- continue;
-
- if (lock)
- bch2_btree_node_lock_for_insert(c, i->iter->l[0].b, i->iter);
- else
- bch2_btree_node_unlock_write(i->iter->l[0].b, i->iter);
- }
-}
-
-static inline int btree_trans_cmp(struct btree_insert_entry l,
- struct btree_insert_entry r)
-{
- return cmp_int(l.deferred, r.deferred) ?:
- btree_iter_cmp(l.iter, r.iter);
-}
-
static inline void btree_trans_sort_updates(struct btree_trans *trans)
{
struct btree_insert_entry *l, *r;
@@ -89,7 +60,7 @@ static inline void btree_trans_sort_updates(struct btree_trans *trans)
for (pos = 0; pos < nr; pos++) {
r = trans->updates + trans->updates_sorted[pos];
- if (btree_trans_cmp(*l, *r) <= 0)
+ if (btree_iter_cmp(l->iter, r->iter) <= 0)
break;
}
@@ -100,8 +71,6 @@ static inline void btree_trans_sort_updates(struct btree_trans *trans)
trans->updates_sorted[pos] = l - trans->updates;
nr++;
}
-
- BUG_ON(nr != trans->nr_updates);
}
/* Inserting into a given leaf node (last stage of insert): */
@@ -274,8 +243,8 @@ static void bch2_insert_fixup_key(struct btree_trans *trans,
EBUG_ON(insert->k->k.u64s >
bch_btree_keys_u64s_remaining(trans->c, l->b));
- if (bch2_btree_bset_insert_key(iter, l->b, &l->iter,
- insert->k))
+ if (likely(bch2_btree_bset_insert_key(iter, l->b, &l->iter,
+ insert->k)))
bch2_btree_journal_key(trans, iter, insert->k);
}
@@ -288,7 +257,8 @@ static void btree_insert_key_leaf(struct btree_trans *trans,
struct bch_fs *c = trans->c;
struct btree_iter *iter = insert->iter;
struct btree *b = iter->l[0].b;
- int old_u64s = le16_to_cpu(btree_bset_last(b)->u64s);
+ struct bset_tree *t = bset_tree_last(b);
+ int old_u64s = bset_u64s(t);
int old_live_u64s = b->nr.live_u64s;
int live_u64s_added, u64s_added;
@@ -298,7 +268,7 @@ static void btree_insert_key_leaf(struct btree_trans *trans,
bch2_insert_fixup_extent(trans, insert);
live_u64s_added = (int) b->nr.live_u64s - old_live_u64s;
- u64s_added = (int) le16_to_cpu(btree_bset_last(b)->u64s) - old_u64s;
+ u64s_added = (int) bset_u64s(t) - old_u64s;
if (b->sib_u64s[0] != U16_MAX && live_u64s_added < 0)
b->sib_u64s[0] = max(0, (int) b->sib_u64s[0] + live_u64s_added);
@@ -312,165 +282,31 @@ static void btree_insert_key_leaf(struct btree_trans *trans,
trace_btree_insert_key(c, b, insert->k);
}
-/* Deferred btree updates: */
-
-static void deferred_update_flush(struct journal *j,
- struct journal_entry_pin *pin,
- u64 seq)
-{
- struct bch_fs *c = container_of(j, struct bch_fs, journal);
- struct deferred_update *d =
- container_of(pin, struct deferred_update, journal);
- struct journal_preres res = { 0 };
- u64 tmp[32];
- struct bkey_i *k = (void *) tmp;
- int ret;
-
- if (d->allocated_u64s > ARRAY_SIZE(tmp)) {
- k = kmalloc(d->allocated_u64s * sizeof(u64), GFP_NOFS);
-
- BUG_ON(!k); /* XXX */
- }
-
- spin_lock(&d->lock);
- if (d->dirty) {
- BUG_ON(jset_u64s(d->k.k.u64s) > d->res.u64s);
-
- swap(res, d->res);
-
- BUG_ON(d->k.k.u64s > d->allocated_u64s);
-
- bkey_copy(k, &d->k);
- d->dirty = false;
- spin_unlock(&d->lock);
-
- ret = bch2_btree_insert(c, d->btree_id, k, NULL, NULL,
- BTREE_INSERT_NOFAIL|
- BTREE_INSERT_USE_RESERVE|
- BTREE_INSERT_JOURNAL_RESERVED);
- bch2_fs_fatal_err_on(ret && !bch2_journal_error(j),
- c, "error flushing deferred btree update: %i", ret);
-
- spin_lock(&d->lock);
- }
-
- if (!d->dirty)
- bch2_journal_pin_drop(j, &d->journal);
- spin_unlock(&d->lock);
-
- bch2_journal_preres_put(j, &res);
- if (k != (void *) tmp)
- kfree(k);
-}
-
-static void btree_insert_key_deferred(struct btree_trans *trans,
- struct btree_insert_entry *insert)
-{
- struct bch_fs *c = trans->c;
- struct journal *j = &c->journal;
- struct deferred_update *d = insert->d;
- int difference;
-
- BUG_ON(trans->flags & BTREE_INSERT_JOURNAL_REPLAY);
- BUG_ON(insert->k->u64s > d->allocated_u64s);
-
- __btree_journal_key(trans, d->btree_id, insert->k);
-
- spin_lock(&d->lock);
- BUG_ON(jset_u64s(insert->k->u64s) >
- trans->journal_preres.u64s);
-
- difference = jset_u64s(insert->k->u64s) - d->res.u64s;
- if (difference > 0) {
- trans->journal_preres.u64s -= difference;
- d->res.u64s += difference;
- }
-
- bkey_copy(&d->k, insert->k);
- d->dirty = true;
-
- bch2_journal_pin_update(j, trans->journal_res.seq, &d->journal,
- deferred_update_flush);
- spin_unlock(&d->lock);
-}
-
-void bch2_deferred_update_free(struct bch_fs *c,
- struct deferred_update *d)
-{
- deferred_update_flush(&c->journal, &d->journal, 0);
-
- BUG_ON(journal_pin_active(&d->journal));
-
- bch2_journal_pin_flush(&c->journal, &d->journal);
- kfree(d);
-}
-
-struct deferred_update *
-bch2_deferred_update_alloc(struct bch_fs *c,
- enum btree_id btree_id,
- unsigned u64s)
-{
- struct deferred_update *d;
-
- BUG_ON(u64s > U8_MAX);
-
- d = kmalloc(offsetof(struct deferred_update, k) +
- u64s * sizeof(u64), GFP_NOFS);
- BUG_ON(!d);
-
- memset(d, 0, offsetof(struct deferred_update, k));
-
- spin_lock_init(&d->lock);
- d->allocated_u64s = u64s;
- d->btree_id = btree_id;
-
- return d;
-}
-
/* Normal update interface: */
static inline void btree_insert_entry_checks(struct btree_trans *trans,
struct btree_insert_entry *i)
{
struct bch_fs *c = trans->c;
- enum btree_id btree_id = !i->deferred
- ? i->iter->btree_id
- : i->d->btree_id;
-
- if (!i->deferred) {
- BUG_ON(i->iter->level);
- BUG_ON(bkey_cmp(bkey_start_pos(&i->k->k), i->iter->pos));
- EBUG_ON((i->iter->flags & BTREE_ITER_IS_EXTENTS) &&
- bkey_cmp(i->k->k.p, i->iter->l[0].b->key.k.p) > 0);
- EBUG_ON((i->iter->flags & BTREE_ITER_IS_EXTENTS) &&
- !(trans->flags & BTREE_INSERT_ATOMIC));
- }
+
+ BUG_ON(i->iter->level);
+ BUG_ON(bkey_cmp(bkey_start_pos(&i->k->k), i->iter->pos));
+ EBUG_ON((i->iter->flags & BTREE_ITER_IS_EXTENTS) &&
+ bkey_cmp(i->k->k.p, i->iter->l[0].b->key.k.p) > 0);
+ EBUG_ON((i->iter->flags & BTREE_ITER_IS_EXTENTS) &&
+ !(trans->flags & BTREE_INSERT_ATOMIC));
BUG_ON(debug_check_bkeys(c) &&
!bkey_deleted(&i->k->k) &&
- bch2_bkey_invalid(c, bkey_i_to_s_c(i->k), btree_id));
+ bch2_bkey_invalid(c, bkey_i_to_s_c(i->k), i->iter->btree_id));
}
-static int bch2_trans_journal_preres_get(struct btree_trans *trans)
+static noinline int
+bch2_trans_journal_preres_get_cold(struct btree_trans *trans, unsigned u64s)
{
struct bch_fs *c = trans->c;
- struct btree_insert_entry *i;
- unsigned u64s = 0;
int ret;
- trans_for_each_update(trans, i)
- if (i->deferred)
- u64s += jset_u64s(i->k->k.u64s);
-
- if (!u64s)
- return 0;
-
- ret = bch2_journal_preres_get(&c->journal,
- &trans->journal_preres, u64s,
- JOURNAL_RES_GET_NONBLOCK);
- if (ret != -EAGAIN)
- return ret;
-
bch2_trans_unlock(trans);
ret = bch2_journal_preres_get(&c->journal,
@@ -486,8 +322,8 @@ static int bch2_trans_journal_preres_get(struct btree_trans *trans)
return 0;
}
-static int bch2_trans_journal_res_get(struct btree_trans *trans,
- unsigned flags)
+static inline int bch2_trans_journal_res_get(struct btree_trans *trans,
+ unsigned flags)
{
struct bch_fs *c = trans->c;
int ret;
@@ -525,102 +361,63 @@ btree_key_can_insert(struct btree_trans *trans,
return BTREE_INSERT_OK;
}
-static int btree_trans_check_can_insert(struct btree_trans *trans,
- struct btree_insert_entry **stopped_at)
+static inline void do_btree_insert_one(struct btree_trans *trans,
+ struct btree_insert_entry *insert)
{
- struct btree_insert_entry *i;
- unsigned iter, u64s = 0;
- int ret;
-
- trans_for_each_update_sorted(trans, i, iter) {
- /* Multiple inserts might go to same leaf: */
- if (!same_leaf_as_prev(trans, iter))
- u64s = 0;
-
- u64s += i->k->k.u64s;
- ret = btree_key_can_insert(trans, i, &u64s);
- if (ret) {
- *stopped_at = i;
- return ret;
- }
- }
-
- return 0;
+ btree_insert_key_leaf(trans, insert);
}
-static inline void do_btree_insert_one(struct btree_trans *trans,
- struct btree_insert_entry *insert)
+static inline bool update_has_trans_triggers(struct btree_insert_entry *i)
{
- if (likely(!insert->deferred))
- btree_insert_key_leaf(trans, insert);
- else
- btree_insert_key_deferred(trans, insert);
+ return BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS & (1U << i->iter->btree_id);
}
-static inline bool update_triggers_transactional(struct btree_trans *trans,
- struct btree_insert_entry *i)
+static inline bool update_has_nontrans_triggers(struct btree_insert_entry *i)
{
- return likely(!(trans->flags & BTREE_INSERT_MARK_INMEM)) &&
- (i->iter->btree_id == BTREE_ID_EXTENTS ||
- i->iter->btree_id == BTREE_ID_INODES ||
- i->iter->btree_id == BTREE_ID_REFLINK);
+ return (BTREE_NODE_TYPE_HAS_TRIGGERS &
+ ~BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS) &
+ (1U << i->iter->btree_id);
}
-static inline bool update_has_triggers(struct btree_trans *trans,
- struct btree_insert_entry *i)
+static noinline void bch2_btree_iter_unlock_noinline(struct btree_iter *iter)
{
- return likely(!(trans->flags & BTREE_INSERT_NOMARK)) &&
- !i->deferred &&
- btree_node_type_needs_gc(i->iter->btree_id);
+ __bch2_btree_iter_unlock(iter);
}
-/*
- * Get journal reservation, take write locks, and attempt to do btree update(s):
- */
-static inline int do_btree_insert_at(struct btree_trans *trans,
- struct btree_insert_entry **stopped_at)
+static noinline void bch2_trans_mark_gc(struct btree_trans *trans)
{
struct bch_fs *c = trans->c;
- struct bch_fs_usage *fs_usage = NULL;
struct btree_insert_entry *i;
unsigned mark_flags = trans->flags & BTREE_INSERT_BUCKET_INVALIDATE
? BCH_BUCKET_MARK_BUCKET_INVALIDATE
: 0;
- int ret;
- trans_for_each_update_iter(trans, i)
- BUG_ON(i->iter->uptodate >= BTREE_ITER_NEED_RELOCK);
-
- /*
- * note: running triggers will append more updates to the list of
- * updates as we're walking it:
- */
- trans_for_each_update_iter(trans, i)
- if (update_has_triggers(trans, i) &&
- update_triggers_transactional(trans, i)) {
- ret = bch2_trans_mark_update(trans, i->iter, i->k);
- if (ret == -EINTR)
- trace_trans_restart_mark(trans->ip);
- if (ret)
- goto out_clear_replicas;
- }
+ if (unlikely(trans->flags & BTREE_INSERT_NOMARK))
+ return;
trans_for_each_update(trans, i)
- btree_insert_entry_checks(trans, i);
- bch2_btree_trans_verify_locks(trans);
-
- /*
- * No more updates can be added - sort updates so we can take write
- * locks in the correct order:
- */
- btree_trans_sort_updates(trans);
+ if (gc_visited(c, gc_pos_btree_node(i->iter->l[0].b)))
+ bch2_mark_update(trans, i, NULL,
+ mark_flags|BCH_BUCKET_MARK_GC);
+}
- btree_trans_lock_write(trans, true);
+static inline int
+bch2_trans_commit_write_locked(struct btree_trans *trans,
+ struct btree_insert_entry **stopped_at)
+{
+ struct bch_fs *c = trans->c;
+ struct bch_fs_usage *fs_usage = NULL;
+ struct btree_insert_entry *i;
+ unsigned mark_flags = trans->flags & BTREE_INSERT_BUCKET_INVALIDATE
+ ? BCH_BUCKET_MARK_BUCKET_INVALIDATE
+ : 0;
+ unsigned iter, u64s = 0;
+ bool marking = false;
+ int ret;
if (race_fault()) {
- ret = -EINTR;
trace_trans_restart_fault_inject(trans->ip);
- goto out;
+ return -EINTR;
}
/*
@@ -628,24 +425,28 @@ static inline int do_btree_insert_at(struct btree_trans *trans,
* held, otherwise another thread could write the node changing the
* amount of space available:
*/
- ret = btree_trans_check_can_insert(trans, stopped_at);
- if (ret)
- goto out;
- trans_for_each_update_iter(trans, i) {
- if (!btree_node_type_needs_gc(i->iter->btree_id))
- continue;
+ prefetch(&trans->c->journal.flags);
- if (!fs_usage) {
- percpu_down_read(&c->mark_lock);
- fs_usage = bch2_fs_usage_scratch_get(c);
- }
+ trans_for_each_update_sorted(trans, i, iter) {
+ /* Multiple inserts might go to same leaf: */
+ if (!same_leaf_as_prev(trans, iter))
+ u64s = 0;
- if (!bch2_bkey_replicas_marked_locked(c,
- bkey_i_to_s_c(i->k), true)) {
- ret = BTREE_INSERT_NEED_MARK_REPLICAS;
- goto out;
+ u64s += i->k->k.u64s;
+ ret = btree_key_can_insert(trans, i, &u64s);
+ if (ret) {
+ *stopped_at = i;
+ return ret;
}
+
+ if (btree_node_type_needs_gc(i->iter->btree_id))
+ marking = true;
+ }
+
+ if (marking) {
+ percpu_down_read(&c->mark_lock);
+ fs_usage = bch2_fs_usage_scratch_get(c);
}
/*
@@ -653,16 +454,17 @@ static inline int do_btree_insert_at(struct btree_trans *trans,
* succeed:
*/
if (likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))) {
- trans->journal_u64s = 0;
-
- trans_for_each_update(trans, i)
- trans->journal_u64s += jset_u64s(i->k->k.u64s);
-
- ret = bch2_trans_journal_res_get(trans, JOURNAL_RES_GET_NONBLOCK);
+ ret = bch2_trans_journal_res_get(trans,
+ JOURNAL_RES_GET_NONBLOCK);
if (ret)
- goto out;
+ goto err;
}
+ /*
+ * Not allowed to fail after we've gotten our journal reservation - we
+ * have to use it:
+ */
+
if (!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY)) {
if (journal_seq_verify(c))
trans_for_each_update(trans, i)
@@ -672,49 +474,146 @@ static inline int do_btree_insert_at(struct btree_trans *trans,
i->k->k.version = MAX_VERSION;
}
- trans_for_each_update_iter(trans, i)
- if (update_has_triggers(trans, i) &&
- !update_triggers_transactional(trans, i))
- bch2_mark_update(trans, i, fs_usage, mark_flags);
+ /* Must be called under mark_lock: */
+ if (marking && trans->fs_usage_deltas &&
+ bch2_replicas_delta_list_apply(c, fs_usage,
+ trans->fs_usage_deltas)) {
+ ret = BTREE_INSERT_NEED_MARK_REPLICAS;
+ goto err;
+ }
- if (fs_usage && trans->fs_usage_deltas)
- bch2_replicas_delta_list_apply(c, fs_usage,
- trans->fs_usage_deltas);
+ trans_for_each_update(trans, i)
+ if (likely(!(trans->flags & BTREE_INSERT_NOMARK)) &&
+ update_has_nontrans_triggers(i))
+ bch2_mark_update(trans, i, fs_usage, mark_flags);
- if (fs_usage)
+ if (marking)
bch2_trans_fs_usage_apply(trans, fs_usage);
- if (likely(!(trans->flags & BTREE_INSERT_NOMARK)) &&
- unlikely(c->gc_pos.phase))
- trans_for_each_update_iter(trans, i)
- if (gc_visited(c, gc_pos_btree_node(i->iter->l[0].b)))
- bch2_mark_update(trans, i, NULL,
- mark_flags|
- BCH_BUCKET_MARK_GC);
+ if (unlikely(c->gc_pos.phase))
+ bch2_trans_mark_gc(trans);
trans_for_each_update(trans, i)
do_btree_insert_one(trans, i);
-out:
- BUG_ON(ret &&
- (trans->flags & BTREE_INSERT_JOURNAL_RESERVED) &&
- trans->journal_res.ref);
-
- btree_trans_lock_write(trans, false);
-
- if (fs_usage) {
+err:
+ if (marking) {
bch2_fs_usage_scratch_put(c, fs_usage);
percpu_up_read(&c->mark_lock);
}
- bch2_journal_res_put(&c->journal, &trans->journal_res);
-out_clear_replicas:
- if (trans->fs_usage_deltas) {
- memset(&trans->fs_usage_deltas->fs_usage, 0,
- sizeof(trans->fs_usage_deltas->fs_usage));
- trans->fs_usage_deltas->used = 0;
+ return ret;
+}
+
+/*
+ * Get journal reservation, take write locks, and attempt to do btree update(s):
+ */
+static inline int do_bch2_trans_commit(struct btree_trans *trans,
+ struct btree_insert_entry **stopped_at)
+{
+ struct btree_insert_entry *i;
+ struct btree_iter *iter;
+ unsigned idx, u64s, journal_preres_u64s = 0;
+ int ret;
+
+ /*
+ * note: running triggers will append more updates to the list of
+ * updates as we're walking it:
+ */
+ trans_for_each_update(trans, i) {
+ /* we know trans->nounlock won't be set here: */
+ if (unlikely(!(i->iter->locks_want < 1
+ ? __bch2_btree_iter_upgrade(i->iter, 1)
+ : i->iter->uptodate <= BTREE_ITER_NEED_PEEK))) {
+ trace_trans_restart_upgrade(trans->ip);
+ return -EINTR;
+ }
+
+ if (likely(!(trans->flags & BTREE_INSERT_NOMARK)) &&
+ update_has_trans_triggers(i)) {
+ ret = bch2_trans_mark_update(trans, i->iter, i->k);
+ if (unlikely(ret)) {
+ if (ret == -EINTR)
+ trace_trans_restart_mark(trans->ip);
+ return ret;
+ }
+ }
+
+ u64s = jset_u64s(i->k->k.u64s);
+ if (0)
+ journal_preres_u64s += u64s;
+ trans->journal_u64s += u64s;
}
- return ret;
+ ret = bch2_journal_preres_get(&trans->c->journal,
+ &trans->journal_preres, journal_preres_u64s,
+ JOURNAL_RES_GET_NONBLOCK);
+ if (unlikely(ret == -EAGAIN))
+ ret = bch2_trans_journal_preres_get_cold(trans,
+ journal_preres_u64s);
+ if (unlikely(ret))
+ return ret;
+
+ /*
+ * Can't be holding any read locks when we go to take write locks:
+ *
+ * note - this must be done after bch2_trans_journal_preres_get_cold()
+ * or anything else that might call bch2_trans_relock(), since that
+ * would just retake the read locks:
+ */
+ trans_for_each_iter_all(trans, iter) {
+ if (iter->nodes_locked != iter->nodes_intent_locked) {
+ EBUG_ON(iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT);
+ EBUG_ON(trans->iters_live & (1ULL << iter->idx));
+ bch2_btree_iter_unlock_noinline(iter);
+ }
+ }
+
+ if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG))
+ trans_for_each_update(trans, i)
+ btree_insert_entry_checks(trans, i);
+ bch2_btree_trans_verify_locks(trans);
+
+ /*
+ * No more updates can be added - sort updates so we can take write
+ * locks in the correct order:
+ */
+ btree_trans_sort_updates(trans);
+
+ trans_for_each_update_sorted(trans, i, idx)
+ if (!same_leaf_as_prev(trans, idx))
+ bch2_btree_node_lock_for_insert(trans->c,
+ i->iter->l[0].b, i->iter);
+
+ ret = bch2_trans_commit_write_locked(trans, stopped_at);
+
+ trans_for_each_update_sorted(trans, i, idx)
+ if (!same_leaf_as_prev(trans, idx))
+ bch2_btree_node_unlock_write_inlined(i->iter->l[0].b,
+ i->iter);
+
+ /*
+ * Drop journal reservation after dropping write locks, since dropping
+ * the journal reservation may kick off a journal write:
+ */
+ bch2_journal_res_put(&trans->c->journal, &trans->journal_res);
+
+ if (unlikely(ret))
+ return ret;
+
+ if (trans->flags & BTREE_INSERT_NOUNLOCK)
+ trans->nounlock = true;
+
+ trans_for_each_update_sorted(trans, i, idx)
+ if (!same_leaf_as_prev(trans, idx))
+ bch2_foreground_maybe_merge(trans->c, i->iter,
+ 0, trans->flags);
+
+ trans->nounlock = false;
+
+ trans_for_each_update(trans, i)
+ bch2_btree_iter_downgrade(i->iter);
+
+ return 0;
}
static noinline
@@ -771,7 +670,7 @@ int bch2_trans_commit_error(struct btree_trans *trans,
case BTREE_INSERT_NEED_MARK_REPLICAS:
bch2_trans_unlock(trans);
- trans_for_each_update_iter(trans, i) {
+ trans_for_each_update(trans, i) {
ret = bch2_mark_bkey_replicas(c, bkey_i_to_s_c(i->k));
if (ret)
return ret;
@@ -822,67 +721,29 @@ int bch2_trans_commit_error(struct btree_trans *trans,
return ret;
}
-/**
- * __bch_btree_insert_at - insert keys at given iterator positions
- *
- * This is main entry point for btree updates.
- *
- * Return values:
- * -EINTR: locking changed, this function should be called again. Only returned
- * if passed BTREE_INSERT_ATOMIC.
- * -EROFS: filesystem read only
- * -EIO: journal or btree node IO error
- */
-static int __bch2_trans_commit(struct btree_trans *trans,
- struct btree_insert_entry **stopped_at)
+static noinline int
+bch2_trans_commit_get_rw_cold(struct btree_trans *trans)
{
struct bch_fs *c = trans->c;
- struct btree_insert_entry *i;
- unsigned iter;
int ret;
- trans_for_each_update_iter(trans, i) {
- if (!bch2_btree_iter_upgrade(i->iter, 1)) {
- trace_trans_restart_upgrade(trans->ip);
- ret = -EINTR;
- goto err;
- }
-
- ret = btree_iter_err(i->iter);
- if (ret)
- goto err;
- }
-
- ret = do_btree_insert_at(trans, stopped_at);
- if (unlikely(ret))
- goto err;
-
- if (trans->flags & BTREE_INSERT_NOUNLOCK)
- trans->nounlock = true;
-
- trans_for_each_update_sorted(trans, i, iter)
- if (!same_leaf_as_prev(trans, iter))
- bch2_foreground_maybe_merge(c, i->iter,
- 0, trans->flags);
+ if (likely(!(trans->flags & BTREE_INSERT_LAZY_RW)))
+ return -EROFS;
- trans->nounlock = false;
+ bch2_trans_unlock(trans);
- trans_for_each_update_iter(trans, i)
- bch2_btree_iter_downgrade(i->iter);
-err:
- /* make sure we didn't drop or screw up locks: */
- bch2_btree_trans_verify_locks(trans);
+ ret = bch2_fs_read_write_early(c);
+ if (ret)
+ return ret;
- return ret;
+ percpu_ref_get(&c->writes);
+ return 0;
}
-int bch2_trans_commit(struct btree_trans *trans,
- struct disk_reservation *disk_res,
- u64 *journal_seq,
- unsigned flags)
+int __bch2_trans_commit(struct btree_trans *trans)
{
- struct bch_fs *c = trans->c;
struct btree_insert_entry *i = NULL;
+ struct btree_iter *iter;
unsigned orig_nr_updates = trans->nr_updates;
unsigned orig_mem_top = trans->mem_top;
int ret = 0;
@@ -891,63 +752,50 @@ int bch2_trans_commit(struct btree_trans *trans,
goto out_noupdates;
/* for the sake of sanity: */
- BUG_ON(trans->nr_updates > 1 && !(flags & BTREE_INSERT_ATOMIC));
-
- if (flags & BTREE_INSERT_GC_LOCK_HELD)
- lockdep_assert_held(&c->gc_lock);
+ EBUG_ON(trans->nr_updates > 1 && !(trans->flags & BTREE_INSERT_ATOMIC));
- if (!trans->commit_start)
- trans->commit_start = local_clock();
+ if (trans->flags & BTREE_INSERT_GC_LOCK_HELD)
+ lockdep_assert_held(&trans->c->gc_lock);
- memset(&trans->journal_res, 0, sizeof(trans->journal_res));
memset(&trans->journal_preres, 0, sizeof(trans->journal_preres));
- trans->disk_res = disk_res;
- trans->journal_seq = journal_seq;
- trans->flags = flags;
-
- if (unlikely(!(trans->flags & BTREE_INSERT_NOCHECK_RW) &&
- !percpu_ref_tryget(&c->writes))) {
- if (likely(!(trans->flags & BTREE_INSERT_LAZY_RW)))
- return -EROFS;
-
- bch2_trans_unlock(trans);
- ret = bch2_fs_read_write_early(c);
+ if (!(trans->flags & BTREE_INSERT_NOCHECK_RW) &&
+ unlikely(!percpu_ref_tryget(&trans->c->writes))) {
+ ret = bch2_trans_commit_get_rw_cold(trans);
if (ret)
return ret;
+ }
+retry:
+ memset(&trans->journal_res, 0, sizeof(trans->journal_res));
+ trans->journal_u64s = 0;
- percpu_ref_get(&c->writes);
+ ret = do_bch2_trans_commit(trans, &i);
- if (!bch2_trans_relock(trans)) {
- ret = -EINTR;
- goto err;
- }
+ if (trans->fs_usage_deltas) {
+ trans->fs_usage_deltas->used = 0;
+ memset(&trans->fs_usage_deltas->memset_start, 0,
+ (void *) &trans->fs_usage_deltas->memset_end -
+ (void *) &trans->fs_usage_deltas->memset_start);
}
-retry:
- ret = bch2_trans_journal_preres_get(trans);
- if (ret)
- goto err;
- ret = __bch2_trans_commit(trans, &i);
+ /* make sure we didn't drop or screw up locks: */
+ bch2_btree_trans_verify_locks(trans);
+
if (ret)
goto err;
out:
- bch2_journal_preres_put(&c->journal, &trans->journal_preres);
+ bch2_journal_preres_put(&trans->c->journal, &trans->journal_preres);
- if (unlikely(!(trans->flags & BTREE_INSERT_NOCHECK_RW)))
- percpu_ref_put(&c->writes);
+ if (likely(!(trans->flags & BTREE_INSERT_NOCHECK_RW)))
+ percpu_ref_put(&trans->c->writes);
out_noupdates:
- if (!ret && trans->commit_start) {
- bch2_time_stats_update(&c->times[BCH_TIME_btree_update],
- trans->commit_start);
- trans->commit_start = 0;
- }
+ EBUG_ON(!(trans->flags & BTREE_INSERT_ATOMIC) && ret == -EINTR);
- BUG_ON(!(trans->flags & BTREE_INSERT_ATOMIC) && ret == -EINTR);
+ trans_for_each_iter_all(trans, iter)
+ iter->flags &= ~BTREE_ITER_KEEP_UNTIL_COMMIT;
if (!ret) {
- bch2_trans_unlink_iters(trans, ~trans->iters_touched|
- trans->iters_unlink_on_commit);
+ bch2_trans_unlink_iters(trans);
trans->iters_touched = 0;
}
trans->nr_updates = 0;
@@ -957,18 +805,16 @@ out_noupdates:
err:
ret = bch2_trans_commit_error(trans, i, ret);
- /* free updates and memory used by triggers, they'll be reexecuted: */
- trans->nr_updates = orig_nr_updates;
- trans->mem_top = orig_mem_top;
-
/* can't loop if it was passed in and we changed it: */
if (unlikely(trans->flags & BTREE_INSERT_NO_CLEAR_REPLICAS) && !ret)
ret = -EINTR;
+ if (ret)
+ goto out;
- if (!ret)
- goto retry;
-
- goto out;
+ /* free updates and memory used by triggers, they'll be reexecuted: */
+ trans->nr_updates = orig_nr_updates;
+ trans->mem_top = orig_mem_top;
+ goto retry;
}
/**
@@ -994,7 +840,7 @@ retry:
iter = bch2_trans_get_iter(&trans, id, bkey_start_pos(&k->k),
BTREE_ITER_INTENT);
- bch2_trans_update(&trans, BTREE_INSERT_ENTRY(iter, k));
+ bch2_trans_update(&trans, iter, k);
ret = bch2_trans_commit(&trans, disk_res, journal_seq, flags);
if (ret == -EINTR)
@@ -1037,14 +883,14 @@ retry:
/* create the biggest key we can */
bch2_key_resize(&delete.k, max_sectors);
- bch2_cut_back(end, &delete.k);
+ bch2_cut_back(end, &delete);
ret = bch2_extent_trim_atomic(&delete, iter);
if (ret)
break;
}
- bch2_trans_update(trans, BTREE_INSERT_ENTRY(iter, &delete));
+ bch2_trans_update(trans, iter, &delete);
ret = bch2_trans_commit(trans, NULL, journal_seq,
BTREE_INSERT_ATOMIC|
BTREE_INSERT_NOFAIL);
@@ -1071,7 +917,7 @@ int bch2_btree_delete_at(struct btree_trans *trans,
bkey_init(&k.k);
k.k.p = iter->pos;
- bch2_trans_update(trans, BTREE_INSERT_ENTRY(iter, &k));
+ bch2_trans_update(trans, iter, &k);
return bch2_trans_commit(trans, NULL, NULL,
BTREE_INSERT_NOFAIL|
BTREE_INSERT_USE_RESERVE|flags);
diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c
index 6a4773a92029..8d223aa2bee5 100644
--- a/fs/bcachefs/buckets.c
+++ b/fs/bcachefs/buckets.c
@@ -499,14 +499,18 @@ void bch2_dev_usage_from_buckets(struct bch_fs *c)
}
}
-static inline void update_replicas(struct bch_fs *c,
- struct bch_fs_usage *fs_usage,
- struct bch_replicas_entry *r,
- s64 sectors)
+static inline int update_replicas(struct bch_fs *c,
+ struct bch_fs_usage *fs_usage,
+ struct bch_replicas_entry *r,
+ s64 sectors)
{
int idx = bch2_replicas_entry_idx(c, r);
- BUG_ON(idx < 0);
+ if (idx < 0)
+ return -1;
+
+ if (!fs_usage)
+ return 0;
switch (r->data_type) {
case BCH_DATA_BTREE:
@@ -520,6 +524,7 @@ static inline void update_replicas(struct bch_fs *c,
break;
}
fs_usage->replicas[idx] += sectors;
+ return 0;
}
static inline void update_cached_sectors(struct bch_fs *c,
@@ -579,23 +584,41 @@ static inline void update_cached_sectors_list(struct btree_trans *trans,
update_replicas_list(trans, &r.e, sectors);
}
-void bch2_replicas_delta_list_apply(struct bch_fs *c,
- struct bch_fs_usage *fs_usage,
- struct replicas_delta_list *r)
+static inline struct replicas_delta *
+replicas_delta_next(struct replicas_delta *d)
+{
+ return (void *) d + replicas_entry_bytes(&d->r) + 8;
+}
+
+int bch2_replicas_delta_list_apply(struct bch_fs *c,
+ struct bch_fs_usage *fs_usage,
+ struct replicas_delta_list *r)
{
struct replicas_delta *d = r->d;
struct replicas_delta *top = (void *) r->d + r->used;
+ unsigned i;
- acc_u64s((u64 *) fs_usage,
- (u64 *) &r->fs_usage, sizeof(*fs_usage) / sizeof(u64));
+ for (d = r->d; d != top; d = replicas_delta_next(d))
+ if (update_replicas(c, fs_usage, &d->r, d->delta)) {
+ top = d;
+ goto unwind;
+ }
- while (d != top) {
- BUG_ON((void *) d > (void *) top);
+ if (!fs_usage)
+ return 0;
- update_replicas(c, fs_usage, &d->r, d->delta);
+ fs_usage->nr_inodes += r->nr_inodes;
- d = (void *) d + replicas_entry_bytes(&d->r) + 8;
+ for (i = 0; i < BCH_REPLICAS_MAX; i++) {
+ fs_usage->reserved += r->persistent_reserved[i];
+ fs_usage->persistent_reserved[i] += r->persistent_reserved[i];
}
+
+ return 0;
+unwind:
+ for (d = r->d; d != top; d = replicas_delta_next(d))
+ update_replicas(c, fs_usage, &d->r, -d->delta);
+ return -1;
}
#define do_mark_fn(fn, c, pos, flags, ...) \
@@ -807,28 +830,44 @@ void bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca,
preempt_enable();
}
-static s64 ptr_disk_sectors_delta(struct extent_ptr_decoded p,
- unsigned offset, s64 delta,
- unsigned flags)
+static s64 disk_sectors_scaled(unsigned n, unsigned d, unsigned sectors)
{
+ return DIV_ROUND_UP(sectors * n, d);
+}
+
+static s64 __ptr_disk_sectors_delta(unsigned old_size,
+ unsigned offset, s64 delta,
+ unsigned flags,
+ unsigned n, unsigned d)
+{
+ BUG_ON(!n || !d);
+
if (flags & BCH_BUCKET_MARK_OVERWRITE_SPLIT) {
- BUG_ON(offset + -delta > p.crc.live_size);
+ BUG_ON(offset + -delta > old_size);
- return -((s64) ptr_disk_sectors(p)) +
- __ptr_disk_sectors(p, offset) +
- __ptr_disk_sectors(p, p.crc.live_size -
- offset + delta);
+ return -disk_sectors_scaled(n, d, old_size) +
+ disk_sectors_scaled(n, d, offset) +
+ disk_sectors_scaled(n, d, old_size - offset + delta);
} else if (flags & BCH_BUCKET_MARK_OVERWRITE) {
- BUG_ON(offset + -delta > p.crc.live_size);
+ BUG_ON(offset + -delta > old_size);
- return -((s64) ptr_disk_sectors(p)) +
- __ptr_disk_sectors(p, p.crc.live_size +
- delta);
+ return -disk_sectors_scaled(n, d, old_size) +
+ disk_sectors_scaled(n, d, old_size + delta);
} else {
- return ptr_disk_sectors(p);
+ return disk_sectors_scaled(n, d, delta);
}
}
+static s64 ptr_disk_sectors_delta(struct extent_ptr_decoded p,
+ unsigned offset, s64 delta,
+ unsigned flags)
+{
+ return __ptr_disk_sectors_delta(p.crc.live_size,
+ offset, delta, flags,
+ p.crc.compressed_size,
+ p.crc.uncompressed_size);
+}
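A small numeric illustration of the scaling these helpers perform (the values are chosen for the example, not taken from the patch):

	/*
	 * Suppose an extent's crc has uncompressed_size == 128 and
	 * compressed_size == 32 (4:1 compression).  For a plain insert of
	 * delta == +64 live sectors:
	 *
	 *   ptr_disk_sectors_delta() -> disk_sectors_scaled(32, 128, 64)
	 *                            == DIV_ROUND_UP(64 * 32, 128) == 16
	 *
	 * i.e. 16 sectors are accounted on disk; the OVERWRITE and
	 * OVERWRITE_SPLIT branches compute the same quantity as a difference
	 * between the old and new live ranges.
	 */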
+
static void bucket_set_stripe(struct bch_fs *c,
const struct bch_stripe *v,
struct bch_fs_usage *fs_usage,
@@ -964,15 +1003,15 @@ static int bch2_mark_stripe_ptr(struct bch_fs *c,
struct bch_extent_stripe_ptr p,
enum bch_data_type data_type,
struct bch_fs_usage *fs_usage,
- s64 sectors, unsigned flags)
+ s64 sectors, unsigned flags,
+ struct bch_replicas_padded *r,
+ unsigned *nr_data,
+ unsigned *nr_parity)
{
bool gc = flags & BCH_BUCKET_MARK_GC;
struct stripe *m;
- unsigned old, new, nr_data;
+ unsigned old, new;
int blocks_nonempty_delta;
- s64 parity_sectors;
-
- BUG_ON(!sectors);
m = genradix_ptr(&c->stripes[gc], p.idx);
@@ -987,13 +1026,9 @@ static int bch2_mark_stripe_ptr(struct bch_fs *c,
BUG_ON(m->r.e.data_type != data_type);
- nr_data = m->nr_blocks - m->nr_redundant;
-
- parity_sectors = DIV_ROUND_UP(abs(sectors) * m->nr_redundant, nr_data);
-
- if (sectors < 0)
- parity_sectors = -parity_sectors;
- sectors += parity_sectors;
+ *nr_data = m->nr_blocks - m->nr_redundant;
+ *nr_parity = m->nr_redundant;
+ *r = m->r;
old = m->block_sectors[p.block];
m->block_sectors[p.block] += sectors;
@@ -1011,8 +1046,6 @@ static int bch2_mark_stripe_ptr(struct bch_fs *c,
spin_unlock(&c->ec_stripes_heap_lock);
- update_replicas(c, fs_usage, &m->r.e, sectors);
-
return 0;
}
@@ -1027,7 +1060,6 @@ static int bch2_mark_extent(struct bch_fs *c, struct bkey_s_c k,
struct extent_ptr_decoded p;
struct bch_replicas_padded r;
s64 dirty_sectors = 0;
- unsigned i;
int ret;
r.e.data_type = data_type;
@@ -1041,29 +1073,46 @@ static int bch2_mark_extent(struct bch_fs *c, struct bkey_s_c k,
? sectors
: ptr_disk_sectors_delta(p, offset, sectors, flags);
bool stale = bch2_mark_pointer(c, p, disk_sectors, data_type,
- fs_usage, journal_seq, flags);
+ fs_usage, journal_seq, flags);
if (p.ptr.cached) {
if (!stale)
update_cached_sectors(c, fs_usage, p.ptr.dev,
disk_sectors);
- } else if (!p.ec_nr) {
+ } else if (!p.has_ec) {
dirty_sectors += disk_sectors;
r.e.devs[r.e.nr_devs++] = p.ptr.dev;
} else {
- for (i = 0; i < p.ec_nr; i++) {
- ret = bch2_mark_stripe_ptr(c, p.ec[i],
- data_type, fs_usage,
- disk_sectors, flags);
- if (ret)
- return ret;
- }
-
+ struct bch_replicas_padded ec_r;
+ unsigned nr_data, nr_parity;
+ s64 parity_sectors;
+
+ ret = bch2_mark_stripe_ptr(c, p.ec, data_type,
+ fs_usage, disk_sectors, flags,
+ &ec_r, &nr_data, &nr_parity);
+ if (ret)
+ return ret;
+
+ parity_sectors =
+ __ptr_disk_sectors_delta(p.crc.live_size,
+ offset, sectors, flags,
+ p.crc.compressed_size * nr_parity,
+ p.crc.uncompressed_size * nr_data);
+
+ update_replicas(c, fs_usage, &ec_r.e,
+ disk_sectors + parity_sectors);
+
+ /*
+ * There may be other dirty pointers in this extent, but
+ * if so they're not required for mounting if we have an
+ * erasure coded pointer in this extent:
+ */
r.e.nr_required = 0;
}
}
- update_replicas(c, fs_usage, &r.e, dirty_sectors);
+ if (r.e.nr_devs)
+ update_replicas(c, fs_usage, &r.e, dirty_sectors);
return 0;
}
@@ -1316,7 +1365,7 @@ void bch2_trans_fs_usage_apply(struct btree_trans *trans,
bch_err(c, "disk usage increased more than %llu sectors reserved",
disk_res_sectors);
- trans_for_each_update_iter(trans, i) {
+ trans_for_each_update(trans, i) {
struct btree_iter *iter = i->iter;
struct btree *b = iter->l[0].b;
struct btree_node_iter node_iter = iter->l[0].iter;
@@ -1358,7 +1407,7 @@ static int trans_get_key(struct btree_trans *trans,
struct btree_insert_entry *i;
int ret;
- trans_for_each_update_iter(trans, i)
+ trans_for_each_update(trans, i)
if (i->iter->btree_id == btree_id &&
(btree_node_type_is_extents(btree_id)
? bkey_cmp(pos, bkey_start_pos(&i->k->k)) >= 0 &&
@@ -1369,13 +1418,11 @@ static int trans_get_key(struct btree_trans *trans,
return 1;
}
- *iter = __bch2_trans_get_iter(trans, btree_id, pos,
- BTREE_ITER_SLOTS|BTREE_ITER_INTENT, 0);
+ *iter = bch2_trans_get_iter(trans, btree_id, pos,
+ BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
if (IS_ERR(*iter))
return PTR_ERR(*iter);
- bch2_trans_iter_free_on_commit(trans, *iter);
-
*k = bch2_btree_iter_peek_slot(*iter);
ret = bkey_err(*k);
if (ret)
@@ -1397,13 +1444,13 @@ static void *trans_update_key(struct btree_trans *trans,
bkey_init(&new_k->k);
new_k->k.p = iter->pos;
- trans_for_each_update_iter(trans, i)
+ trans_for_each_update(trans, i)
if (i->iter == iter) {
i->k = new_k;
return new_k;
}
- bch2_trans_update(trans, BTREE_INSERT_ENTRY(iter, new_k));
+ bch2_trans_update(trans, iter, new_k);
return new_k;
}
@@ -1417,7 +1464,7 @@ static int bch2_trans_mark_pointer(struct btree_trans *trans,
struct bkey_s_c k;
struct bkey_alloc_unpacked u;
struct bkey_i_alloc *a;
- unsigned old;
+ u16 *dst_sectors;
bool overflow;
int ret;
@@ -1427,7 +1474,7 @@ static int bch2_trans_mark_pointer(struct btree_trans *trans,
if (ret < 0)
return ret;
- if (!ret) {
+ if (!ret && unlikely(!test_bit(BCH_FS_ALLOC_WRITTEN, &c->flags))) {
/*
* During journal replay, and if gc repairs alloc info at
* runtime, the alloc info in the btree might not be up to date
@@ -1472,22 +1519,24 @@ static int bch2_trans_mark_pointer(struct btree_trans *trans,
goto out;
}
- if (!p.ptr.cached) {
- old = u.dirty_sectors;
- overflow = checked_add(u.dirty_sectors, sectors);
- } else {
- old = u.cached_sectors;
- overflow = checked_add(u.cached_sectors, sectors);
+ dst_sectors = !p.ptr.cached
+ ? &u.dirty_sectors
+ : &u.cached_sectors;
+
+ overflow = checked_add(*dst_sectors, sectors);
+
+ if (overflow) {
+ bch2_fs_inconsistent(c,
+ "bucket sector count overflow: %u + %lli > U16_MAX",
+ *dst_sectors, sectors);
+ /* return an error indicating that we need full fsck */
+ ret = -EIO;
+ goto out;
}
u.data_type = u.dirty_sectors || u.cached_sectors
? data_type : 0;
- bch2_fs_inconsistent_on(overflow, c,
- "bucket sector count overflow: %u + %lli > U16_MAX",
- old, sectors);
- BUG_ON(overflow);
-
a = trans_update_key(trans, iter, BKEY_ALLOC_U64s_MAX);
ret = PTR_ERR_OR_ZERO(a);
if (ret)
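
The overflow handling above now reports a filesystem inconsistency and returns
an error instead of crashing with BUG_ON(). A rough sketch of the checked-add
idea on a 16-bit counter; the helper name and exact semantics of bcachefs'
checked_add() are assumptions here, not taken from its util.h:

	#include <stdbool.h>
	#include <stdint.h>

	/*
	 * add a signed sector delta to a 16-bit bucket counter;
	 * returns true (and leaves the counter untouched) on over/underflow
	 */
	static bool checked_add_u16(uint16_t *dst, int64_t delta)
	{
		int64_t v = (int64_t) *dst + delta;

		if (v < 0 || v > UINT16_MAX)
			return true;

		*dst = (uint16_t) v;
		return false;
	}

The caller can then turn a true return into -EIO and request a full fsck, as
the hunk above does, rather than taking the machine down.
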
@@ -1503,16 +1552,16 @@ out:
static int bch2_trans_mark_stripe_ptr(struct btree_trans *trans,
struct bch_extent_stripe_ptr p,
- s64 sectors, enum bch_data_type data_type)
+ s64 sectors, enum bch_data_type data_type,
+ struct bch_replicas_padded *r,
+ unsigned *nr_data,
+ unsigned *nr_parity)
{
struct bch_fs *c = trans->c;
- struct bch_replicas_padded r;
struct btree_iter *iter;
struct bkey_i *new_k;
struct bkey_s_c k;
struct bkey_s_stripe s;
- unsigned nr_data;
- s64 parity_sectors;
int ret = 0;
ret = trans_get_key(trans, BTREE_ID_EC, POS(0, p.idx), &iter, &k);
@@ -1535,20 +1584,13 @@ static int bch2_trans_mark_stripe_ptr(struct btree_trans *trans,
bkey_reassemble(new_k, k);
s = bkey_i_to_s_stripe(new_k);
- nr_data = s.v->nr_blocks - s.v->nr_redundant;
-
- parity_sectors = DIV_ROUND_UP(abs(sectors) * s.v->nr_redundant, nr_data);
-
- if (sectors < 0)
- parity_sectors = -parity_sectors;
-
stripe_blockcount_set(s.v, p.block,
stripe_blockcount_get(s.v, p.block) +
- sectors + parity_sectors);
+ sectors);
- bch2_bkey_to_replicas(&r.e, s.s_c);
-
- update_replicas_list(trans, &r.e, sectors);
+ *nr_data = s.v->nr_blocks - s.v->nr_redundant;
+ *nr_parity = s.v->nr_redundant;
+ bch2_bkey_to_replicas(&r->e, s.s_c);
out:
bch2_trans_iter_put(trans, iter);
return ret;
@@ -1565,7 +1607,6 @@ static int bch2_trans_mark_extent(struct btree_trans *trans,
struct bch_replicas_padded r;
s64 dirty_sectors = 0;
bool stale;
- unsigned i;
int ret;
r.e.data_type = data_type;
@@ -1590,22 +1631,35 @@ static int bch2_trans_mark_extent(struct btree_trans *trans,
if (!stale)
update_cached_sectors_list(trans, p.ptr.dev,
disk_sectors);
- } else if (!p.ec_nr) {
+ } else if (!p.has_ec) {
dirty_sectors += disk_sectors;
r.e.devs[r.e.nr_devs++] = p.ptr.dev;
} else {
- for (i = 0; i < p.ec_nr; i++) {
- ret = bch2_trans_mark_stripe_ptr(trans, p.ec[i],
- disk_sectors, data_type);
- if (ret)
- return ret;
- }
+ struct bch_replicas_padded ec_r;
+ unsigned nr_data, nr_parity;
+ s64 parity_sectors;
+
+ ret = bch2_trans_mark_stripe_ptr(trans, p.ec,
+ disk_sectors, data_type,
+ &ec_r, &nr_data, &nr_parity);
+ if (ret)
+ return ret;
+
+ parity_sectors =
+ __ptr_disk_sectors_delta(p.crc.live_size,
+ offset, sectors, flags,
+ p.crc.compressed_size * nr_parity,
+ p.crc.uncompressed_size * nr_data);
+
+ update_replicas_list(trans, &ec_r.e,
+ disk_sectors + parity_sectors);
r.e.nr_required = 0;
}
}
- update_replicas_list(trans, &r.e, dirty_sectors);
+ if (r.e.nr_devs)
+ update_replicas_list(trans, &r.e, dirty_sectors);
return 0;
}
@@ -1710,9 +1764,9 @@ int bch2_trans_mark_key(struct btree_trans *trans, struct bkey_s_c k,
d = replicas_deltas_realloc(trans, 0);
if (!(flags & BCH_BUCKET_MARK_OVERWRITE))
- d->fs_usage.nr_inodes++;
+ d->nr_inodes++;
else
- d->fs_usage.nr_inodes--;
+ d->nr_inodes--;
return 0;
case KEY_TYPE_reservation: {
unsigned replicas = bkey_s_c_to_reservation(k).v->nr_replicas;
@@ -1721,10 +1775,9 @@ int bch2_trans_mark_key(struct btree_trans *trans, struct bkey_s_c k,
sectors *= replicas;
replicas = clamp_t(unsigned, replicas, 1,
- ARRAY_SIZE(d->fs_usage.persistent_reserved));
+ ARRAY_SIZE(d->persistent_reserved));
- d->fs_usage.reserved += sectors;
- d->fs_usage.persistent_reserved[replicas - 1] += sectors;
+ d->persistent_reserved[replicas - 1] += sectors;
return 0;
}
case KEY_TYPE_reflink_p:
diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h
index a4bab66d8d17..ad6f731b1cea 100644
--- a/fs/bcachefs/buckets.h
+++ b/fs/bcachefs/buckets.h
@@ -137,8 +137,8 @@ static inline u8 ptr_stale(struct bch_dev *ca,
return gen_after(ptr_bucket_mark(ca, ptr).gen, ptr->gen);
}
-static inline unsigned __ptr_disk_sectors(struct extent_ptr_decoded p,
- unsigned live_size)
+static inline s64 __ptr_disk_sectors(struct extent_ptr_decoded p,
+ unsigned live_size)
{
return live_size && p.crc.compression_type
? max(1U, DIV_ROUND_UP(live_size * p.crc.compressed_size,
@@ -146,7 +146,7 @@ static inline unsigned __ptr_disk_sectors(struct extent_ptr_decoded p,
: live_size;
}
-static inline unsigned ptr_disk_sectors(struct extent_ptr_decoded p)
+static inline s64 ptr_disk_sectors(struct extent_ptr_decoded p)
{
return __ptr_disk_sectors(p, p.crc.live_size);
}
@@ -279,9 +279,9 @@ int bch2_mark_overwrite(struct btree_trans *, struct btree_iter *,
int bch2_mark_update(struct btree_trans *, struct btree_insert_entry *,
struct bch_fs_usage *, unsigned);
-void bch2_replicas_delta_list_apply(struct bch_fs *,
- struct bch_fs_usage *,
- struct replicas_delta_list *);
+int bch2_replicas_delta_list_apply(struct bch_fs *,
+ struct bch_fs_usage *,
+ struct replicas_delta_list *);
int bch2_trans_mark_key(struct btree_trans *, struct bkey_s_c,
unsigned, s64, unsigned);
int bch2_trans_mark_update(struct btree_trans *,
diff --git a/fs/bcachefs/buckets_types.h b/fs/bcachefs/buckets_types.h
index 94bd9da34847..f3ff4a18b1fd 100644
--- a/fs/bcachefs/buckets_types.h
+++ b/fs/bcachefs/buckets_types.h
@@ -102,7 +102,11 @@ struct replicas_delta {
struct replicas_delta_list {
unsigned size;
unsigned used;
- struct bch_fs_usage fs_usage;
+
+ struct {} memset_start;
+ u64 nr_inodes;
+ u64 persistent_reserved[BCH_REPLICAS_MAX];
+ struct {} memset_end;
struct replicas_delta d[0];
};
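
The empty struct members added above are a common kernel idiom: zero-size
fields whose addresses delimit the byte range that can be cleared in a single
memset(), so the accounting counters can be reset without touching size/used
or the flexible array. A minimal user-space sketch of the idiom (names are
illustrative; struct {} members are a GNU C extension, which kernel builds
assume):

	#include <string.h>

	struct delta_list_sketch {
		unsigned	size;
		unsigned	used;

		struct {}	memset_start;
		unsigned long	nr_inodes;
		unsigned long	persistent_reserved[4];
		struct {}	memset_end;
	};

	/* clear only the accounting fields between the two markers */
	static void delta_list_reset(struct delta_list_sketch *d)
	{
		memset(&d->memset_start, 0,
		       (char *) &d->memset_end - (char *) &d->memset_start);
	}
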
diff --git a/fs/bcachefs/checksum.c b/fs/bcachefs/checksum.c
index 607f57a64009..a5c947e8adf3 100644
--- a/fs/bcachefs/checksum.c
+++ b/fs/bcachefs/checksum.c
@@ -10,7 +10,7 @@
#include <linux/random.h>
#include <linux/scatterlist.h>
#include <crypto/algapi.h>
-#include <crypto/chacha20.h>
+#include <crypto/chacha.h>
#include <crypto/hash.h>
#include <crypto/poly1305.h>
#include <keys/user-type.h>
@@ -67,21 +67,21 @@ static u64 bch2_checksum_update(unsigned type, u64 crc, const void *data, size_t
}
}
-static inline void do_encrypt_sg(struct crypto_skcipher *tfm,
+static inline void do_encrypt_sg(struct crypto_sync_skcipher *tfm,
struct nonce nonce,
struct scatterlist *sg, size_t len)
{
- SKCIPHER_REQUEST_ON_STACK(req, tfm);
+ SYNC_SKCIPHER_REQUEST_ON_STACK(req, tfm);
int ret;
- skcipher_request_set_tfm(req, tfm);
+ skcipher_request_set_sync_tfm(req, tfm);
skcipher_request_set_crypt(req, sg, sg, len, nonce.d);
ret = crypto_skcipher_encrypt(req);
BUG_ON(ret);
}
-static inline void do_encrypt(struct crypto_skcipher *tfm,
+static inline void do_encrypt(struct crypto_sync_skcipher *tfm,
struct nonce nonce,
void *buf, size_t len)
{
@@ -94,8 +94,8 @@ static inline void do_encrypt(struct crypto_skcipher *tfm,
int bch2_chacha_encrypt_key(struct bch_key *key, struct nonce nonce,
void *buf, size_t len)
{
- struct crypto_skcipher *chacha20 =
- crypto_alloc_skcipher("chacha20", 0, 0);
+ struct crypto_sync_skcipher *chacha20 =
+ crypto_alloc_sync_skcipher("chacha20", 0, 0);
int ret;
if (!chacha20) {
@@ -103,7 +103,8 @@ int bch2_chacha_encrypt_key(struct bch_key *key, struct nonce nonce,
return PTR_ERR(chacha20);
}
- ret = crypto_skcipher_setkey(chacha20, (void *) key, sizeof(*key));
+ ret = crypto_skcipher_setkey(&chacha20->base,
+ (void *) key, sizeof(*key));
if (ret) {
pr_err("crypto_skcipher_setkey() error: %i", ret);
goto err;
@@ -111,7 +112,7 @@ int bch2_chacha_encrypt_key(struct bch_key *key, struct nonce nonce,
do_encrypt(chacha20, nonce, buf, len);
err:
- crypto_free_skcipher(chacha20);
+ crypto_free_sync_skcipher(chacha20);
return ret;
}
@@ -126,7 +127,6 @@ static void gen_poly_key(struct bch_fs *c, struct shash_desc *desc,
do_encrypt(c->chacha20, nonce, key, sizeof(key));
desc->tfm = c->poly1305;
- desc->flags = 0;
crypto_shash_init(desc);
crypto_shash_update(desc, key, sizeof(key));
}
@@ -199,7 +199,7 @@ static struct bch_csum __bch2_checksum_bio(struct bch_fs *c, unsigned type,
kunmap_atomic(p);
}
#else
- __bio_for_each_contig_segment(bv, bio, *iter, *iter)
+ __bio_for_each_bvec(bv, bio, *iter, *iter)
crc = bch2_checksum_update(type, crc,
page_address(bv.bv_page) + bv.bv_offset,
bv.bv_len);
@@ -224,7 +224,7 @@ static struct bch_csum __bch2_checksum_bio(struct bch_fs *c, unsigned type,
kunmap_atomic(p);
}
#else
- __bio_for_each_contig_segment(bv, bio, *iter, *iter)
+ __bio_for_each_bvec(bv, bio, *iter, *iter)
crypto_shash_update(desc,
page_address(bv.bv_page) + bv.bv_offset,
bv.bv_len);
@@ -462,7 +462,7 @@ err:
static int bch2_alloc_ciphers(struct bch_fs *c)
{
if (!c->chacha20)
- c->chacha20 = crypto_alloc_skcipher("chacha20", 0, 0);
+ c->chacha20 = crypto_alloc_sync_skcipher("chacha20", 0, 0);
if (IS_ERR(c->chacha20)) {
bch_err(c, "error requesting chacha20 module: %li",
PTR_ERR(c->chacha20));
@@ -545,7 +545,7 @@ int bch2_enable_encryption(struct bch_fs *c, bool keyed)
goto err;
}
- ret = crypto_skcipher_setkey(c->chacha20,
+ ret = crypto_skcipher_setkey(&c->chacha20->base,
(void *) &key.key, sizeof(key.key));
if (ret)
goto err;
@@ -573,7 +573,7 @@ void bch2_fs_encryption_exit(struct bch_fs *c)
if (!IS_ERR_OR_NULL(c->poly1305))
crypto_free_shash(c->poly1305);
if (!IS_ERR_OR_NULL(c->chacha20))
- crypto_free_skcipher(c->chacha20);
+ crypto_free_sync_skcipher(c->chacha20);
if (!IS_ERR_OR_NULL(c->sha256))
crypto_free_shash(c->sha256);
}
@@ -605,7 +605,7 @@ int bch2_fs_encryption_init(struct bch_fs *c)
if (ret)
goto out;
- ret = crypto_skcipher_setkey(c->chacha20,
+ ret = crypto_skcipher_setkey(&c->chacha20->base,
(void *) &key.key, sizeof(key.key));
if (ret)
goto out;
diff --git a/fs/bcachefs/checksum.h b/fs/bcachefs/checksum.h
index 0b359aba2526..b84e81bac8ff 100644
--- a/fs/bcachefs/checksum.h
+++ b/fs/bcachefs/checksum.h
@@ -7,7 +7,7 @@
#include "super-io.h"
#include <linux/crc64.h>
-#include <crypto/chacha20.h>
+#include <crypto/chacha.h>
static inline bool bch2_checksum_mergeable(unsigned type)
{
@@ -138,9 +138,9 @@ static inline bool bch2_crc_cmp(struct bch_csum l, struct bch_csum r)
/* for skipping ahead and encrypting/decrypting at an offset: */
static inline struct nonce nonce_add(struct nonce nonce, unsigned offset)
{
- EBUG_ON(offset & (CHACHA20_BLOCK_SIZE - 1));
+ EBUG_ON(offset & (CHACHA_BLOCK_SIZE - 1));
- le32_add_cpu(&nonce.d[0], offset / CHACHA20_BLOCK_SIZE);
+ le32_add_cpu(&nonce.d[0], offset / CHACHA_BLOCK_SIZE);
return nonce;
}
diff --git a/fs/bcachefs/clock.c b/fs/bcachefs/clock.c
index 8ac6990c6971..f18266330687 100644
--- a/fs/bcachefs/clock.c
+++ b/fs/bcachefs/clock.c
@@ -135,17 +135,16 @@ static struct io_timer *get_expired_timer(struct io_clock *clock,
return ret;
}
-void bch2_increment_clock(struct bch_fs *c, unsigned sectors, int rw)
+void __bch2_increment_clock(struct io_clock *clock)
{
- struct io_clock *clock = &c->io_clock[rw];
struct io_timer *timer;
unsigned long now;
+ unsigned sectors;
/* Buffer up one megabyte worth of IO in the percpu counter */
preempt_disable();
- if (likely(this_cpu_add_return(*clock->pcpu_buf, sectors) <
- IO_CLOCK_PCPU_SECTORS)) {
+ if (this_cpu_read(*clock->pcpu_buf) < IO_CLOCK_PCPU_SECTORS) {
preempt_enable();
return;
}
diff --git a/fs/bcachefs/clock.h b/fs/bcachefs/clock.h
index 5cb043c579d8..bfbbca8a207b 100644
--- a/fs/bcachefs/clock.h
+++ b/fs/bcachefs/clock.h
@@ -6,7 +6,18 @@ void bch2_io_timer_add(struct io_clock *, struct io_timer *);
void bch2_io_timer_del(struct io_clock *, struct io_timer *);
void bch2_kthread_io_clock_wait(struct io_clock *, unsigned long,
unsigned long);
-void bch2_increment_clock(struct bch_fs *, unsigned, int);
+
+void __bch2_increment_clock(struct io_clock *);
+
+static inline void bch2_increment_clock(struct bch_fs *c, unsigned sectors,
+ int rw)
+{
+ struct io_clock *clock = &c->io_clock[rw];
+
+ if (unlikely(this_cpu_add_return(*clock->pcpu_buf, sectors) >=
+ IO_CLOCK_PCPU_SECTORS))
+ __bch2_increment_clock(clock);
+}
void bch2_io_clock_schedule_timeout(struct io_clock *, unsigned long);
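
The inline wrapper above keeps the common path cheap: IO is accumulated in a
per-CPU counter, and only once roughly a megabyte's worth of sectors has built
up does it drop into __bch2_increment_clock() to advance the shared clock and
fire timers. A user-space analogue of the batching pattern, using a
thread-local buffer in place of a percpu counter (names and the flush
threshold are illustrative):

	#include <stdatomic.h>

	#define FLUSH_THRESHOLD	2048		/* sectors; 2048 * 512B = 1MB */

	static _Atomic unsigned long io_clock_now;
	static _Thread_local unsigned long local_buf;

	static void increment_clock(unsigned sectors)
	{
		local_buf += sectors;
		if (local_buf < FLUSH_THRESHOLD)
			return;			/* fast path: purely local */

		/* slow path: publish the buffered sectors, reset the buffer */
		atomic_fetch_add_explicit(&io_clock_now, local_buf,
					  memory_order_relaxed);
		local_buf = 0;
	}
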
diff --git a/fs/bcachefs/compress.c b/fs/bcachefs/compress.c
index 24f565614cd9..3787390da47f 100644
--- a/fs/bcachefs/compress.c
+++ b/fs/bcachefs/compress.c
@@ -66,7 +66,7 @@ static struct bbuf __bio_map_or_bounce(struct bch_fs *c, struct bio *bio,
BUG_ON(bvec_iter_sectors(start) > c->sb.encoded_extent_max);
#ifndef CONFIG_HIGHMEM
- __bio_for_each_contig_segment(bv, bio, iter, start) {
+ __bio_for_each_bvec(bv, bio, iter, start) {
if (bv.bv_len == start.bi_size)
return (struct bbuf) {
.b = page_address(bv.bv_page) + bv.bv_offset,
diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c
index 1442dacef0de..38017699c04a 100644
--- a/fs/bcachefs/dirent.c
+++ b/fs/bcachefs/dirent.c
@@ -138,10 +138,10 @@ static struct bkey_i_dirent *dirent_create_key(struct btree_trans *trans,
return dirent;
}
-int __bch2_dirent_create(struct btree_trans *trans,
- u64 dir_inum, const struct bch_hash_info *hash_info,
- u8 type, const struct qstr *name, u64 dst_inum,
- int flags)
+int bch2_dirent_create(struct btree_trans *trans,
+ u64 dir_inum, const struct bch_hash_info *hash_info,
+ u8 type, const struct qstr *name, u64 dst_inum,
+ int flags)
{
struct bkey_i_dirent *dirent;
int ret;
@@ -155,16 +155,6 @@ int __bch2_dirent_create(struct btree_trans *trans,
dir_inum, &dirent->k_i, flags);
}
-int bch2_dirent_create(struct bch_fs *c, u64 dir_inum,
- const struct bch_hash_info *hash_info,
- u8 type, const struct qstr *name, u64 dst_inum,
- u64 *journal_seq, int flags)
-{
- return bch2_trans_do(c, journal_seq, flags,
- __bch2_dirent_create(&trans, dir_inum, hash_info,
- type, name, dst_inum, flags));
-}
-
static void dirent_copy_target(struct bkey_i_dirent *dst,
struct bkey_s_c_dirent src)
{
@@ -172,23 +162,22 @@ static void dirent_copy_target(struct bkey_i_dirent *dst,
dst->v.d_type = src.v->d_type;
}
-static struct bpos bch2_dirent_pos(struct bch_inode_info *inode,
- const struct qstr *name)
-{
- return POS(inode->v.i_ino, bch2_dirent_hash(&inode->ei_str_hash, name));
-}
-
int bch2_dirent_rename(struct btree_trans *trans,
- struct bch_inode_info *src_dir, const struct qstr *src_name,
- struct bch_inode_info *dst_dir, const struct qstr *dst_name,
- enum bch_rename_mode mode)
+ u64 src_dir, struct bch_hash_info *src_hash,
+ u64 dst_dir, struct bch_hash_info *dst_hash,
+ const struct qstr *src_name, u64 *src_inum,
+ const struct qstr *dst_name, u64 *dst_inum,
+ enum bch_rename_mode mode)
{
struct btree_iter *src_iter, *dst_iter;
struct bkey_s_c old_src, old_dst;
struct bkey_i_dirent *new_src = NULL, *new_dst = NULL;
- struct bpos dst_pos = bch2_dirent_pos(dst_dir, dst_name);
+ struct bpos dst_pos =
+ POS(dst_dir, bch2_dirent_hash(dst_hash, dst_name));
int ret;
+ *src_inum = *dst_inum = 0;
+
/*
* Lookup dst:
*
@@ -198,24 +187,25 @@ int bch2_dirent_rename(struct btree_trans *trans,
*/
dst_iter = mode == BCH_RENAME
? bch2_hash_hole(trans, bch2_dirent_hash_desc,
- &dst_dir->ei_str_hash,
- dst_dir->v.i_ino, dst_name)
+ dst_hash, dst_dir, dst_name)
: bch2_hash_lookup(trans, bch2_dirent_hash_desc,
- &dst_dir->ei_str_hash,
- dst_dir->v.i_ino, dst_name,
+ dst_hash, dst_dir, dst_name,
BTREE_ITER_INTENT);
if (IS_ERR(dst_iter))
return PTR_ERR(dst_iter);
old_dst = bch2_btree_iter_peek_slot(dst_iter);
+ if (mode != BCH_RENAME)
+ *dst_inum = le64_to_cpu(bkey_s_c_to_dirent(old_dst).v->d_inum);
+
/* Lookup src: */
src_iter = bch2_hash_lookup(trans, bch2_dirent_hash_desc,
- &src_dir->ei_str_hash,
- src_dir->v.i_ino, src_name,
+ src_hash, src_dir, src_name,
BTREE_ITER_INTENT);
if (IS_ERR(src_iter))
return PTR_ERR(src_iter);
old_src = bch2_btree_iter_peek_slot(src_iter);
+ *src_inum = le64_to_cpu(bkey_s_c_to_dirent(old_src).v->d_inum);
/* Create new dst key: */
new_dst = dirent_create_key(trans, 0, dst_name, 0);
@@ -255,9 +245,8 @@ int bch2_dirent_rename(struct btree_trans *trans,
* new_dst at the src position:
*/
new_dst->k.p = src_iter->pos;
- bch2_trans_update(trans,
- BTREE_INSERT_ENTRY(src_iter,
- &new_dst->k_i));
+ bch2_trans_update(trans, src_iter,
+ &new_dst->k_i);
return 0;
} else {
/* If we're overwriting, we can't insert new_dst
@@ -270,8 +259,7 @@ int bch2_dirent_rename(struct btree_trans *trans,
} else {
/* Check if we need a whiteout to delete src: */
ret = bch2_hash_needs_whiteout(trans, bch2_dirent_hash_desc,
- &src_dir->ei_str_hash,
- src_iter);
+ src_hash, src_iter);
if (ret < 0)
return ret;
@@ -280,17 +268,17 @@ int bch2_dirent_rename(struct btree_trans *trans,
}
}
- bch2_trans_update(trans, BTREE_INSERT_ENTRY(src_iter, &new_src->k_i));
- bch2_trans_update(trans, BTREE_INSERT_ENTRY(dst_iter, &new_dst->k_i));
+ bch2_trans_update(trans, src_iter, &new_src->k_i);
+ bch2_trans_update(trans, dst_iter, &new_dst->k_i);
return 0;
}
-int __bch2_dirent_delete(struct btree_trans *trans, u64 dir_inum,
- const struct bch_hash_info *hash_info,
- const struct qstr *name)
+int bch2_dirent_delete_at(struct btree_trans *trans,
+ const struct bch_hash_info *hash_info,
+ struct btree_iter *iter)
{
- return bch2_hash_delete(trans, bch2_dirent_hash_desc, hash_info,
- dir_inum, name);
+ return bch2_hash_delete_at(trans, bch2_dirent_hash_desc,
+ hash_info, iter);
}
int bch2_dirent_delete(struct bch_fs *c, u64 dir_inum,
@@ -301,7 +289,17 @@ int bch2_dirent_delete(struct bch_fs *c, u64 dir_inum,
return bch2_trans_do(c, journal_seq,
BTREE_INSERT_ATOMIC|
BTREE_INSERT_NOFAIL,
- __bch2_dirent_delete(&trans, dir_inum, hash_info, name));
+ bch2_hash_delete(&trans, bch2_dirent_hash_desc, hash_info,
+ dir_inum, name));
+}
+
+struct btree_iter *
+__bch2_dirent_lookup_trans(struct btree_trans *trans, u64 dir_inum,
+ const struct bch_hash_info *hash_info,
+ const struct qstr *name, unsigned flags)
+{
+ return bch2_hash_lookup(trans, bch2_dirent_hash_desc,
+ hash_info, dir_inum, name, flags);
}
u64 bch2_dirent_lookup(struct bch_fs *c, u64 dir_inum,
@@ -315,8 +313,8 @@ u64 bch2_dirent_lookup(struct bch_fs *c, u64 dir_inum,
bch2_trans_init(&trans, c, 0, 0);
- iter = bch2_hash_lookup(&trans, bch2_dirent_hash_desc,
- hash_info, dir_inum, name, 0);
+ iter = __bch2_dirent_lookup_trans(&trans, dir_inum,
+ hash_info, name, 0);
if (IS_ERR(iter)) {
BUG_ON(PTR_ERR(iter) == -EINTR);
goto out;
@@ -350,53 +348,37 @@ int bch2_empty_dir_trans(struct btree_trans *trans, u64 dir_inum)
return ret;
}
-int bch2_empty_dir(struct bch_fs *c, u64 dir_inum)
+int bch2_readdir(struct bch_fs *c, u64 inum, struct dir_context *ctx)
{
- return bch2_trans_do(c, NULL, 0,
- bch2_empty_dir_trans(&trans, dir_inum));
-}
-
-int bch2_readdir(struct bch_fs *c, struct file *file,
- struct dir_context *ctx)
-{
- struct bch_inode_info *inode = file_bch_inode(file);
struct btree_trans trans;
struct btree_iter *iter;
struct bkey_s_c k;
struct bkey_s_c_dirent dirent;
- unsigned len;
int ret;
- if (!dir_emit_dots(file, ctx))
- return 0;
-
bch2_trans_init(&trans, c, 0, 0);
for_each_btree_key(&trans, iter, BTREE_ID_DIRENTS,
- POS(inode->v.i_ino, ctx->pos), 0, k, ret) {
+ POS(inum, ctx->pos), 0, k, ret) {
+ if (k.k->p.inode > inum)
+ break;
+
if (k.k->type != KEY_TYPE_dirent)
continue;
dirent = bkey_s_c_to_dirent(k);
- if (bkey_cmp(k.k->p, POS(inode->v.i_ino, ctx->pos)) < 0)
- continue;
-
- if (k.k->p.inode > inode->v.i_ino)
- break;
-
- len = bch2_dirent_name_bytes(dirent);
-
/*
* XXX: dir_emit() can fault and block, while we're holding
* locks
*/
- if (!dir_emit(ctx, dirent.v->d_name, len,
+ ctx->pos = dirent.k->p.offset;
+ if (!dir_emit(ctx, dirent.v->d_name,
+ bch2_dirent_name_bytes(dirent),
le64_to_cpu(dirent.v->d_inum),
dirent.v->d_type))
break;
-
- ctx->pos = k.k->p.offset + 1;
+ ctx->pos = dirent.k->p.offset + 1;
}
ret = bch2_trans_exit(&trans) ?: ret;
diff --git a/fs/bcachefs/dirent.h b/fs/bcachefs/dirent.h
index bc64718a7832..e6184dc796d3 100644
--- a/fs/bcachefs/dirent.h
+++ b/fs/bcachefs/dirent.h
@@ -29,15 +29,13 @@ static inline unsigned dirent_val_u64s(unsigned len)
sizeof(u64));
}
-int __bch2_dirent_create(struct btree_trans *, u64,
- const struct bch_hash_info *, u8,
- const struct qstr *, u64, int);
-int bch2_dirent_create(struct bch_fs *c, u64, const struct bch_hash_info *,
- u8, const struct qstr *, u64, u64 *, int);
-
-int __bch2_dirent_delete(struct btree_trans *, u64,
- const struct bch_hash_info *,
- const struct qstr *);
+int bch2_dirent_create(struct btree_trans *, u64,
+ const struct bch_hash_info *, u8,
+ const struct qstr *, u64, int);
+
+int bch2_dirent_delete_at(struct btree_trans *,
+ const struct bch_hash_info *,
+ struct btree_iter *);
int bch2_dirent_delete(struct bch_fs *, u64, const struct bch_hash_info *,
const struct qstr *, u64 *);
@@ -48,15 +46,20 @@ enum bch_rename_mode {
};
int bch2_dirent_rename(struct btree_trans *,
- struct bch_inode_info *, const struct qstr *,
- struct bch_inode_info *, const struct qstr *,
+ u64, struct bch_hash_info *,
+ u64, struct bch_hash_info *,
+ const struct qstr *, u64 *,
+ const struct qstr *, u64 *,
enum bch_rename_mode);
+struct btree_iter *
+__bch2_dirent_lookup_trans(struct btree_trans *, u64,
+ const struct bch_hash_info *,
+ const struct qstr *, unsigned);
u64 bch2_dirent_lookup(struct bch_fs *, u64, const struct bch_hash_info *,
const struct qstr *);
int bch2_empty_dir_trans(struct btree_trans *, u64);
-int bch2_empty_dir(struct bch_fs *, u64);
-int bch2_readdir(struct bch_fs *, struct file *, struct dir_context *);
+int bch2_readdir(struct bch_fs *, u64, struct dir_context *);
#endif /* _BCACHEFS_DIRENT_H */
diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c
index be2eca0fcdf7..5287b5ee7d4a 100644
--- a/fs/bcachefs/ec.c
+++ b/fs/bcachefs/ec.c
@@ -4,6 +4,7 @@
#include "bcachefs.h"
#include "alloc_foreground.h"
+#include "bkey_on_stack.h"
#include "bset.h"
#include "btree_gc.h"
#include "btree_update.h"
@@ -135,8 +136,6 @@ void bch2_stripe_to_text(struct printbuf *out, struct bch_fs *c,
pr_buf(out, " %u:%llu:%u", s->ptrs[i].dev,
(u64) s->ptrs[i].offset,
stripe_blockcount_get(s, i));
-
- bch2_bkey_ptrs_to_text(out, c, k);
}
static int ptr_matches_stripe(struct bch_fs *c,
@@ -433,10 +432,9 @@ int bch2_ec_read_extent(struct bch_fs *c, struct bch_read_bio *rbio)
closure_init_stack(&cl);
- BUG_ON(!rbio->pick.idx ||
- rbio->pick.idx - 1 >= rbio->pick.ec_nr);
+ BUG_ON(!rbio->pick.has_ec);
- stripe_idx = rbio->pick.ec[rbio->pick.idx - 1].idx;
+ stripe_idx = rbio->pick.ec.idx;
buf = kzalloc(sizeof(*buf), GFP_NOIO);
if (!buf)
@@ -561,7 +559,7 @@ static int ec_stripe_mem_alloc(struct bch_fs *c,
size_t idx = iter->pos.offset;
int ret = 0;
- if (!__ec_stripe_mem_alloc(c, idx, GFP_NOWAIT))
+ if (!__ec_stripe_mem_alloc(c, idx, GFP_NOWAIT|__GFP_NOWARN))
return ret;
bch2_trans_unlock(iter->trans);
@@ -738,7 +736,7 @@ found_slot:
stripe->k.p = iter->pos;
- bch2_trans_update(&trans, BTREE_INSERT_ENTRY(iter, &stripe->k_i));
+ bch2_trans_update(&trans, iter, &stripe->k_i);
ret = bch2_trans_commit(&trans, NULL, NULL,
BTREE_INSERT_ATOMIC|
@@ -779,10 +777,10 @@ static int ec_stripe_update_ptrs(struct bch_fs *c,
struct btree_iter *iter;
struct bkey_s_c k;
struct bkey_s_extent e;
- struct bch_extent_ptr *ptr;
- BKEY_PADDED(k) tmp;
+ struct bkey_on_stack sk;
int ret = 0, dev, idx;
+ bkey_on_stack_init(&sk);
bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS,
@@ -792,6 +790,8 @@ static int ec_stripe_update_ptrs(struct bch_fs *c,
while ((k = bch2_btree_iter_peek(iter)).k &&
!(ret = bkey_err(k)) &&
bkey_cmp(bkey_start_pos(k.k), pos->p) < 0) {
+ struct bch_extent_ptr *ptr, *ec_ptr = NULL;
+
if (extent_has_stripe_ptr(k, s->key.k.p.offset)) {
bch2_btree_iter_next(iter);
continue;
@@ -807,19 +807,19 @@ static int ec_stripe_update_ptrs(struct bch_fs *c,
dev = s->key.v.ptrs[idx].dev;
- bkey_reassemble(&tmp.k, k);
- e = bkey_i_to_s_extent(&tmp.k);
+ bkey_on_stack_reassemble(&sk, c, k);
+ e = bkey_i_to_s_extent(sk.k);
- extent_for_each_ptr(e, ptr)
- if (ptr->dev != dev)
+ extent_for_each_ptr(e, ptr) {
+ if (ptr->dev == dev)
+ ec_ptr = ptr;
+ else
ptr->cached = true;
+ }
- ptr = (void *) bch2_extent_has_device(e.c, dev);
- BUG_ON(!ptr);
-
- extent_stripe_ptr_add(e, s, ptr, idx);
+ extent_stripe_ptr_add(e, s, ec_ptr, idx);
- bch2_trans_update(&trans, BTREE_INSERT_ENTRY(iter, &tmp.k));
+ bch2_trans_update(&trans, iter, sk.k);
ret = bch2_trans_commit(&trans, NULL, NULL,
BTREE_INSERT_ATOMIC|
@@ -832,6 +832,7 @@ static int ec_stripe_update_ptrs(struct bch_fs *c,
}
bch2_trans_exit(&trans);
+ bkey_on_stack_exit(&sk, c);
return ret;
}
@@ -1231,7 +1232,7 @@ static int __bch2_stripe_write_key(struct btree_trans *trans,
spin_unlock(&c->ec_stripes_heap_lock);
- bch2_trans_update(trans, BTREE_INSERT_ENTRY(iter, &new_key->k_i));
+ bch2_trans_update(trans, iter, &new_key->k_i);
return bch2_trans_commit(trans, NULL, NULL,
BTREE_INSERT_NOFAIL|flags);
@@ -1278,7 +1279,7 @@ int bch2_stripes_read(struct bch_fs *c, struct journal_keys *journal_keys)
struct btree_trans trans;
struct btree_iter *btree_iter;
struct journal_iter journal_iter;
- struct bkey_s_c btree_k, journal_k, k;
+ struct bkey_s_c btree_k, journal_k;
int ret;
ret = bch2_fs_ec_start(c);
@@ -1294,33 +1295,31 @@ int bch2_stripes_read(struct bch_fs *c, struct journal_keys *journal_keys)
journal_k = bch2_journal_iter_peek(&journal_iter);
while (1) {
+ bool btree;
+
if (btree_k.k && journal_k.k) {
int cmp = bkey_cmp(btree_k.k->p, journal_k.k->p);
- if (cmp < 0) {
- k = btree_k;
+ if (!cmp)
btree_k = bch2_btree_iter_next(btree_iter);
- } else if (cmp == 0) {
- btree_k = bch2_btree_iter_next(btree_iter);
- k = journal_k;
- journal_k = bch2_journal_iter_next(&journal_iter);
- } else {
- k = journal_k;
- journal_k = bch2_journal_iter_next(&journal_iter);
- }
+ btree = cmp < 0;
} else if (btree_k.k) {
- k = btree_k;
- btree_k = bch2_btree_iter_next(btree_iter);
+ btree = true;
} else if (journal_k.k) {
- k = journal_k;
- journal_k = bch2_journal_iter_next(&journal_iter);
+ btree = false;
} else {
break;
}
- bch2_mark_key(c, k, 0, 0, NULL, 0,
+ bch2_mark_key(c, btree ? btree_k : journal_k,
+ 0, 0, NULL, 0,
BCH_BUCKET_MARK_ALLOC_READ|
BCH_BUCKET_MARK_NOATOMIC);
+
+ if (btree)
+ btree_k = bch2_btree_iter_next(btree_iter);
+ else
+ journal_k = bch2_journal_iter_next(&journal_iter);
}
ret = bch2_trans_exit(&trans) ?: ret;
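
The rewritten loop above merges two sorted key streams (btree and journal),
letting the journal copy win when both streams hold a key at the same position
and advancing only the stream the key was taken from. The shape of that merge,
reduced to plain integers; everything here is illustrative, not bcachefs API:

	#include <stdbool.h>
	#include <stddef.h>

	/* merge two ascending arrays, letting 'journal' win ties with 'btree' */
	static void merge_streams(const int *btree, size_t nb,
				  const int *journal, size_t nj,
				  void (*mark)(int key))
	{
		size_t i = 0, j = 0;

		while (i < nb || j < nj) {
			bool take_btree;

			if (i < nb && j < nj) {
				int cmp = (btree[i] > journal[j]) -
					  (btree[i] < journal[j]);

				if (!cmp)
					i++;	/* drop the btree copy of a duplicate */
				take_btree = cmp < 0;
			} else {
				take_btree = i < nb;
			}

			mark(take_btree ? btree[i] : journal[j]);

			if (take_btree)
				i++;
			else
				j++;
		}
	}
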
@@ -1351,6 +1350,9 @@ int bch2_ec_mem_alloc(struct bch_fs *c, bool gc)
if (ret)
return ret;
+ if (!idx)
+ return 0;
+
if (!gc &&
!init_heap(&c->ec_stripes_heap, roundup_pow_of_two(idx),
GFP_KERNEL))
diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c
index 304ff92500be..5a5cfee623e2 100644
--- a/fs/bcachefs/error.c
+++ b/fs/bcachefs/error.c
@@ -64,7 +64,7 @@ void bch2_io_error(struct bch_dev *ca)
enum fsck_err_ret bch2_fsck_err(struct bch_fs *c, unsigned flags,
const char *fmt, ...)
{
- struct fsck_err_state *s;
+ struct fsck_err_state *s = NULL;
va_list args;
bool fix = false, print = true, suppressing = false;
char _buf[sizeof(s->buf)], *buf = _buf;
@@ -99,8 +99,13 @@ enum fsck_err_ret bch2_fsck_err(struct bch_fs *c, unsigned flags,
found:
list_move(&s->list, &c->fsck_errors);
s->nr++;
- suppressing = s->nr == FSCK_ERR_RATELIMIT_NR;
- print = s->nr <= FSCK_ERR_RATELIMIT_NR;
+ if (c->opts.ratelimit_errors &&
+ s->nr >= FSCK_ERR_RATELIMIT_NR) {
+ if (s->nr == FSCK_ERR_RATELIMIT_NR)
+ suppressing = true;
+ else
+ print = false;
+ }
buf = s->buf;
print:
va_start(args, fmt);
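
For reference, the ratelimiting above keeps printing every instance of a given
fsck error until FSCK_ERR_RATELIMIT_NR is reached, flags the error as
suppressed at the threshold, then only counts further occurrences. A tiny
sketch of that counting scheme with an assumed threshold value:

	#include <stdbool.h>
	#include <stdio.h>

	#define ERR_RATELIMIT_NR	10	/* assumed threshold, for illustration */

	/* bump the per-error counter; returns true if this instance should print */
	static bool fsck_err_should_print(unsigned long *nr, bool ratelimit_errors)
	{
		(*nr)++;

		if (!ratelimit_errors || *nr < ERR_RATELIMIT_NR)
			return true;

		if (*nr == ERR_RATELIMIT_NR) {
			printf("Ratelimiting new instances of previous error\n");
			return true;
		}

		return false;
	}
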
@@ -156,7 +161,7 @@ void bch2_flush_fsck_errs(struct bch_fs *c)
mutex_lock(&c->fsck_error_lock);
list_for_each_entry_safe(s, n, &c->fsck_errors, list) {
- if (s->nr > FSCK_ERR_RATELIMIT_NR)
+ if (s->ratelimited)
bch_err(c, "Saw %llu errors like:\n %s", s->nr, s->buf);
list_del(&s->list);
diff --git a/fs/bcachefs/error.h b/fs/bcachefs/error.h
index 2591e12305b7..7dcb0f6552fc 100644
--- a/fs/bcachefs/error.h
+++ b/fs/bcachefs/error.h
@@ -114,6 +114,7 @@ struct fsck_err_state {
struct list_head list;
const char *fmt;
u64 nr;
+ bool ratelimited;
char buf[512];
};
diff --git a/fs/bcachefs/extent_update.c b/fs/bcachefs/extent_update.c
new file mode 100644
index 000000000000..742b4d78cb3a
--- /dev/null
+++ b/fs/bcachefs/extent_update.c
@@ -0,0 +1,531 @@
+// SPDX-License-Identifier: GPL-2.0
+#include "bcachefs.h"
+#include "bkey_on_stack.h"
+#include "btree_update.h"
+#include "btree_update_interior.h"
+#include "buckets.h"
+#include "debug.h"
+#include "extents.h"
+#include "extent_update.h"
+
+/*
+ * This counts the number of iterators to the alloc & ec btrees we'll need
+ * for inserting/removing this extent:
+ */
+static unsigned bch2_bkey_nr_alloc_ptrs(struct bkey_s_c k)
+{
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+ const union bch_extent_entry *entry;
+ unsigned ret = 0;
+
+ bkey_extent_entry_for_each(ptrs, entry) {
+ switch (__extent_entry_type(entry)) {
+ case BCH_EXTENT_ENTRY_ptr:
+ case BCH_EXTENT_ENTRY_stripe_ptr:
+ ret++;
+ }
+ }
+
+ return ret;
+}
+
+static int count_iters_for_insert(struct btree_trans *trans,
+ struct bkey_s_c k,
+ unsigned offset,
+ struct bpos *end,
+ unsigned *nr_iters,
+ unsigned max_iters,
+ bool overwrite)
+{
+ int ret = 0;
+
+ switch (k.k->type) {
+ case KEY_TYPE_extent:
+ case KEY_TYPE_reflink_v:
+ *nr_iters += bch2_bkey_nr_alloc_ptrs(k);
+
+ if (*nr_iters >= max_iters) {
+ *end = bpos_min(*end, k.k->p);
+ ret = 1;
+ }
+
+ break;
+ case KEY_TYPE_reflink_p: {
+ struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k);
+ u64 idx = le64_to_cpu(p.v->idx);
+ unsigned sectors = bpos_min(*end, p.k->p).offset -
+ bkey_start_offset(p.k);
+ struct btree_iter *iter;
+ struct bkey_s_c r_k;
+
+ for_each_btree_key(trans, iter,
+ BTREE_ID_REFLINK, POS(0, idx + offset),
+ BTREE_ITER_SLOTS, r_k, ret) {
+ if (bkey_cmp(bkey_start_pos(r_k.k),
+ POS(0, idx + sectors)) >= 0)
+ break;
+
+ *nr_iters += 1 + bch2_bkey_nr_alloc_ptrs(r_k);
+
+ if (*nr_iters >= max_iters) {
+ struct bpos pos = bkey_start_pos(k.k);
+ pos.offset += r_k.k->p.offset - idx;
+
+ *end = bpos_min(*end, pos);
+ ret = 1;
+ break;
+ }
+ }
+
+ bch2_trans_iter_put(trans, iter);
+ break;
+ }
+ }
+
+ return ret;
+}
+
+#define EXTENT_ITERS_MAX (BTREE_ITER_MAX / 3)
+
+int bch2_extent_atomic_end(struct btree_iter *iter,
+ struct bkey_i *insert,
+ struct bpos *end)
+{
+ struct btree_trans *trans = iter->trans;
+ struct btree *b;
+ struct btree_node_iter node_iter;
+ struct bkey_packed *_k;
+ unsigned nr_iters = 0;
+ int ret;
+
+ ret = bch2_btree_iter_traverse(iter);
+ if (ret)
+ return ret;
+
+ b = iter->l[0].b;
+ node_iter = iter->l[0].iter;
+
+ BUG_ON(bkey_cmp(bkey_start_pos(&insert->k), b->data->min_key) < 0);
+
+ *end = bpos_min(insert->k.p, b->key.k.p);
+
+ ret = count_iters_for_insert(trans, bkey_i_to_s_c(insert), 0, end,
+ &nr_iters, EXTENT_ITERS_MAX / 2, false);
+ if (ret < 0)
+ return ret;
+
+ while ((_k = bch2_btree_node_iter_peek_filter(&node_iter, b,
+ KEY_TYPE_discard))) {
+ struct bkey unpacked;
+ struct bkey_s_c k = bkey_disassemble(b, _k, &unpacked);
+ unsigned offset = 0;
+
+ if (bkey_cmp(bkey_start_pos(k.k), *end) >= 0)
+ break;
+
+ if (bkey_cmp(bkey_start_pos(&insert->k),
+ bkey_start_pos(k.k)) > 0)
+ offset = bkey_start_offset(&insert->k) -
+ bkey_start_offset(k.k);
+
+ ret = count_iters_for_insert(trans, k, offset, end,
+ &nr_iters, EXTENT_ITERS_MAX, true);
+ if (ret)
+ break;
+
+ bch2_btree_node_iter_advance(&node_iter, b);
+ }
+
+ return ret < 0 ? ret : 0;
+}
+
+int bch2_extent_trim_atomic(struct bkey_i *k, struct btree_iter *iter)
+{
+ struct bpos end;
+ int ret;
+
+ ret = bch2_extent_atomic_end(iter, k, &end);
+ if (ret)
+ return ret;
+
+ bch2_cut_back(end, k);
+ return 0;
+}
+
+int bch2_extent_is_atomic(struct bkey_i *k, struct btree_iter *iter)
+{
+ struct bpos end;
+ int ret;
+
+ ret = bch2_extent_atomic_end(iter, k, &end);
+ if (ret)
+ return ret;
+
+ return !bkey_cmp(end, k->k.p);
+}
+
+enum btree_insert_ret
+bch2_extent_can_insert(struct btree_trans *trans,
+ struct btree_insert_entry *insert,
+ unsigned *u64s)
+{
+ struct btree_iter_level *l = &insert->iter->l[0];
+ struct btree_node_iter node_iter = l->iter;
+ enum bch_extent_overlap overlap;
+ struct bkey_packed *_k;
+ struct bkey unpacked;
+ struct bkey_s_c k;
+ int sectors;
+
+ /*
+ * We avoid creating whiteouts whenever possible when deleting, but
+ * those optimizations mean we may potentially insert two whiteouts
+ * instead of one (when we overlap with the front of one extent and the
+ * back of another):
+ */
+ if (bkey_whiteout(&insert->k->k))
+ *u64s += BKEY_U64s;
+
+ _k = bch2_btree_node_iter_peek_filter(&node_iter, l->b,
+ KEY_TYPE_discard);
+ if (!_k)
+ return BTREE_INSERT_OK;
+
+ k = bkey_disassemble(l->b, _k, &unpacked);
+
+ overlap = bch2_extent_overlap(&insert->k->k, k.k);
+
+ /* account for having to split existing extent: */
+ if (overlap == BCH_EXTENT_OVERLAP_MIDDLE)
+ *u64s += _k->u64s;
+
+ if (overlap == BCH_EXTENT_OVERLAP_MIDDLE &&
+ (sectors = bch2_bkey_sectors_compressed(k))) {
+ int flags = trans->flags & BTREE_INSERT_NOFAIL
+ ? BCH_DISK_RESERVATION_NOFAIL : 0;
+
+ switch (bch2_disk_reservation_add(trans->c,
+ trans->disk_res,
+ sectors, flags)) {
+ case 0:
+ break;
+ case -ENOSPC:
+ return BTREE_INSERT_ENOSPC;
+ default:
+ BUG();
+ }
+ }
+
+ return BTREE_INSERT_OK;
+}
+
+static void verify_extent_nonoverlapping(struct bch_fs *c,
+ struct btree *b,
+ struct btree_node_iter *_iter,
+ struct bkey_i *insert)
+{
+#ifdef CONFIG_BCACHEFS_DEBUG
+ struct btree_node_iter iter;
+ struct bkey_packed *k;
+ struct bkey uk;
+
+ if (!expensive_debug_checks(c))
+ return;
+
+ iter = *_iter;
+ k = bch2_btree_node_iter_prev_filter(&iter, b, KEY_TYPE_discard);
+ BUG_ON(k &&
+ (uk = bkey_unpack_key(b, k),
+ bkey_cmp(uk.p, bkey_start_pos(&insert->k)) > 0));
+
+ iter = *_iter;
+ k = bch2_btree_node_iter_peek_filter(&iter, b, KEY_TYPE_discard);
+#if 0
+ BUG_ON(k &&
+ (uk = bkey_unpack_key(b, k),
+ bkey_cmp(insert->k.p, bkey_start_pos(&uk))) > 0);
+#else
+ if (k &&
+ (uk = bkey_unpack_key(b, k),
+ bkey_cmp(insert->k.p, bkey_start_pos(&uk))) > 0) {
+ char buf1[100];
+ char buf2[100];
+
+ bch2_bkey_to_text(&PBUF(buf1), &insert->k);
+ bch2_bkey_to_text(&PBUF(buf2), &uk);
+
+ bch2_dump_btree_node(b);
+ panic("insert > next :\n"
+ "insert %s\n"
+ "next %s\n",
+ buf1, buf2);
+ }
+#endif
+
+#endif
+}
+
+static void extent_bset_insert(struct bch_fs *c, struct btree_iter *iter,
+ struct bkey_i *insert)
+{
+ struct btree_iter_level *l = &iter->l[0];
+ struct bkey_packed *k =
+ bch2_btree_node_iter_bset_pos(&l->iter, l->b, bset_tree_last(l->b));
+
+ BUG_ON(insert->k.u64s > bch_btree_keys_u64s_remaining(c, l->b));
+
+ EBUG_ON(bkey_deleted(&insert->k) || !insert->k.size);
+ verify_extent_nonoverlapping(c, l->b, &l->iter, insert);
+
+ if (debug_check_bkeys(c))
+ bch2_bkey_debugcheck(c, l->b, bkey_i_to_s_c(insert));
+
+ bch2_bset_insert(l->b, &l->iter, k, insert, 0);
+ bch2_btree_node_iter_fix(iter, l->b, &l->iter, k, 0, k->u64s);
+}
+
+static void
+extent_squash(struct bch_fs *c, struct btree_iter *iter,
+ struct bkey_i *insert,
+ struct bkey_packed *_k, struct bkey_s k,
+ enum bch_extent_overlap overlap)
+{
+ struct btree_iter_level *l = &iter->l[0];
+ int u64s_delta;
+
+ switch (overlap) {
+ case BCH_EXTENT_OVERLAP_FRONT:
+ /* insert overlaps with start of k: */
+ u64s_delta = bch2_cut_front_s(insert->k.p, k);
+ btree_keys_account_val_delta(l->b, _k, u64s_delta);
+
+ EBUG_ON(bkey_deleted(k.k));
+ extent_save(l->b, _k, k.k);
+ bch2_btree_iter_fix_key_modified(iter, l->b, _k);
+ break;
+
+ case BCH_EXTENT_OVERLAP_BACK:
+ /* insert overlaps with end of k: */
+ u64s_delta = bch2_cut_back_s(bkey_start_pos(&insert->k), k);
+ btree_keys_account_val_delta(l->b, _k, u64s_delta);
+
+ EBUG_ON(bkey_deleted(k.k));
+ extent_save(l->b, _k, k.k);
+
+ /*
+ * As the auxiliary tree is indexed by the end of the
+ * key and we've just changed the end, update the
+ * auxiliary tree.
+ */
+ bch2_bset_fix_invalidated_key(l->b, _k);
+ bch2_btree_node_iter_fix(iter, l->b, &l->iter,
+ _k, _k->u64s, _k->u64s);
+ break;
+
+ case BCH_EXTENT_OVERLAP_ALL: {
+ /* The insert key completely covers k, invalidate k */
+ if (!bkey_whiteout(k.k))
+ btree_account_key_drop(l->b, _k);
+
+ k.k->size = 0;
+ k.k->type = KEY_TYPE_deleted;
+
+ if (_k >= btree_bset_last(l->b)->start) {
+ unsigned u64s = _k->u64s;
+
+ bch2_bset_delete(l->b, _k, _k->u64s);
+ bch2_btree_node_iter_fix(iter, l->b, &l->iter,
+ _k, u64s, 0);
+ } else {
+ extent_save(l->b, _k, k.k);
+ bch2_btree_iter_fix_key_modified(iter, l->b, _k);
+ }
+
+ break;
+ }
+ case BCH_EXTENT_OVERLAP_MIDDLE: {
+ struct bkey_on_stack split;
+
+ bkey_on_stack_init(&split);
+ bkey_on_stack_reassemble(&split, c, k.s_c);
+
+ /*
+ * The insert key falls 'in the middle' of k
+ * The insert key splits k in 3:
+ * - start only in k, preserve
+ * - middle common section, invalidate in k
+ * - end only in k, preserve
+ *
+ * We update the old key to preserve the start,
+ * insert will be the new common section,
+ * we manually insert the end that we are preserving.
+ *
+ * modify k _before_ doing the insert (which will move
+ * what k points to)
+ */
+ split.k->k.needs_whiteout |= bkey_written(l->b, _k);
+
+ bch2_cut_back(bkey_start_pos(&insert->k), split.k);
+ BUG_ON(bkey_deleted(&split.k->k));
+
+ u64s_delta = bch2_cut_front_s(insert->k.p, k);
+ btree_keys_account_val_delta(l->b, _k, u64s_delta);
+
+ BUG_ON(bkey_deleted(k.k));
+ extent_save(l->b, _k, k.k);
+ bch2_btree_iter_fix_key_modified(iter, l->b, _k);
+
+ extent_bset_insert(c, iter, split.k);
+ bkey_on_stack_exit(&split, c);
+ break;
+ }
+ }
+}
+
+/**
+ * bch_extent_insert_fixup - insert a new extent and deal with overlaps
+ *
+ * this may result in not actually doing the insert, or inserting some subset
+ * of the insert key. For cmpxchg operations this is where that logic lives.
+ *
+ * All subsets of @insert that need to be inserted are journalled via
+ * bch2_btree_journal_key(). This function does not return a status: the end of
+ * @iter->pos indicates how much of @insert was actually processed (see below).
+ *
+ * BSET INVARIANTS: this function is responsible for maintaining all the
+ * invariants for bsets of extents in memory. things get really hairy with 0
+ * size extents
+ *
+ * within one bset:
+ *
+ * bkey_start_pos(bkey_next(k)) >= k
+ * or bkey_start_offset(bkey_next(k)) >= k->offset
+ *
+ * i.e. strict ordering, no overlapping extents.
+ *
+ * multiple bsets (i.e. full btree node):
+ *
+ * ∀ k, j
+ * k.size != 0 ∧ j.size != 0 →
+ * ¬ (k > bkey_start_pos(j) ∧ k < j)
+ *
+ * i.e. no two overlapping keys _of nonzero size_
+ *
+ * We can't realistically maintain this invariant for zero size keys because of
+ * the key merging done in bch2_btree_insert_key() - for two mergeable keys k, j
+ * there may be another 0 size key between them in another bset, and it will
+ * thus overlap with the merged key.
+ *
+ * In addition, the end of iter->pos indicates how much has been processed.
+ * If the end of iter->pos is not the same as the end of insert, then
+ * key insertion needs to continue/be retried.
+ */
+void bch2_insert_fixup_extent(struct btree_trans *trans,
+ struct btree_insert_entry *insert_entry)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_iter *iter = insert_entry->iter;
+ struct bkey_i *insert = insert_entry->k;
+ struct btree_iter_level *l = &iter->l[0];
+ struct btree_node_iter node_iter = l->iter;
+ bool deleting = bkey_whiteout(&insert->k);
+ bool update_journal = !deleting;
+ bool update_btree = !deleting;
+ struct bkey_i whiteout = *insert;
+ struct bkey_packed *_k;
+ struct bkey unpacked;
+
+ EBUG_ON(iter->level);
+ EBUG_ON(!insert->k.size);
+ EBUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&insert->k)));
+
+ while ((_k = bch2_btree_node_iter_peek_filter(&l->iter, l->b,
+ KEY_TYPE_discard))) {
+ struct bkey_s k = __bkey_disassemble(l->b, _k, &unpacked);
+ struct bpos cur_end = bpos_min(insert->k.p, k.k->p);
+ enum bch_extent_overlap overlap =
+ bch2_extent_overlap(&insert->k, k.k);
+
+ if (bkey_cmp(bkey_start_pos(k.k), insert->k.p) >= 0)
+ break;
+
+ if (!bkey_whiteout(k.k))
+ update_journal = true;
+
+ if (!update_journal) {
+ bch2_cut_front(cur_end, insert);
+ bch2_cut_front(cur_end, &whiteout);
+ bch2_btree_iter_set_pos_same_leaf(iter, cur_end);
+ goto next;
+ }
+
+ /*
+ * When deleting, if possible just do it by switching the type
+ * of the key we're deleting, instead of creating and inserting
+ * a new whiteout:
+ */
+ if (deleting &&
+ !update_btree &&
+ !bkey_cmp(insert->k.p, k.k->p) &&
+ !bkey_cmp(bkey_start_pos(&insert->k), bkey_start_pos(k.k))) {
+ if (!bkey_whiteout(k.k)) {
+ btree_account_key_drop(l->b, _k);
+ _k->type = KEY_TYPE_discard;
+ reserve_whiteout(l->b, _k);
+ bch2_btree_iter_fix_key_modified(iter,
+ l->b, _k);
+ }
+ break;
+ }
+
+ if (k.k->needs_whiteout || bkey_written(l->b, _k)) {
+ insert->k.needs_whiteout = true;
+ update_btree = true;
+ }
+
+ if (update_btree &&
+ overlap == BCH_EXTENT_OVERLAP_ALL &&
+ bkey_whiteout(k.k) &&
+ k.k->needs_whiteout) {
+ unreserve_whiteout(l->b, _k);
+ _k->needs_whiteout = false;
+ }
+
+ extent_squash(c, iter, insert, _k, k, overlap);
+
+ if (!update_btree)
+ bch2_cut_front(cur_end, insert);
+next:
+ node_iter = l->iter;
+
+ if (overlap == BCH_EXTENT_OVERLAP_FRONT ||
+ overlap == BCH_EXTENT_OVERLAP_MIDDLE)
+ break;
+ }
+
+ l->iter = node_iter;
+ bch2_btree_iter_set_pos_same_leaf(iter, insert->k.p);
+
+ if (update_btree) {
+ if (deleting)
+ insert->k.type = KEY_TYPE_discard;
+
+ EBUG_ON(bkey_deleted(&insert->k) || !insert->k.size);
+
+ extent_bset_insert(c, iter, insert);
+ }
+
+ if (update_journal) {
+ struct bkey_i *k = !deleting ? insert : &whiteout;
+
+ if (deleting)
+ k->k.type = KEY_TYPE_discard;
+
+ EBUG_ON(bkey_deleted(&k->k) || !k->k.size);
+
+ bch2_btree_journal_key(trans, iter, k);
+ }
+
+ bch2_cut_front(insert->k.p, insert);
+}
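
extent_squash() in the new file above handles four ways an incoming extent can
overlap an existing one (front, back, all, middle). Reduced to plain ranges,
the classification comes straight from comparing the two start and end
positions; a self-contained sketch with illustrative names:

	#include <stdint.h>

	enum overlap { OVERLAP_FRONT, OVERLAP_BACK, OVERLAP_ALL, OVERLAP_MIDDLE };

	struct range { uint64_t start, end; };	/* half-open [start, end) */

	/* how does 'ins' overlap 'old'? caller guarantees the ranges intersect */
	static enum overlap classify_overlap(struct range ins, struct range old)
	{
		if (ins.start <= old.start)
			return ins.end >= old.end ? OVERLAP_ALL : OVERLAP_FRONT;
		else
			return ins.end >= old.end ? OVERLAP_BACK : OVERLAP_MIDDLE;
	}

In the MIDDLE case the existing key has to be split in three, which is why
extent_squash() reassembles a copy on the stack before trimming the original.
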
diff --git a/fs/bcachefs/extent_update.h b/fs/bcachefs/extent_update.h
new file mode 100644
index 000000000000..89d18e4b6758
--- /dev/null
+++ b/fs/bcachefs/extent_update.h
@@ -0,0 +1,18 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_EXTENT_UPDATE_H
+#define _BCACHEFS_EXTENT_UPDATE_H
+
+#include "bcachefs.h"
+
+int bch2_extent_atomic_end(struct btree_iter *, struct bkey_i *,
+ struct bpos *);
+int bch2_extent_trim_atomic(struct bkey_i *, struct btree_iter *);
+int bch2_extent_is_atomic(struct bkey_i *, struct btree_iter *);
+
+enum btree_insert_ret
+bch2_extent_can_insert(struct btree_trans *, struct btree_insert_entry *,
+ unsigned *);
+void bch2_insert_fixup_extent(struct btree_trans *,
+ struct btree_insert_entry *);
+
+#endif /* _BCACHEFS_EXTENT_UPDATE_H */
diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c
index 4b1c652cdbce..6bcc178604b0 100644
--- a/fs/bcachefs/extents.c
+++ b/fs/bcachefs/extents.c
@@ -9,12 +9,10 @@
#include "bcachefs.h"
#include "bkey_methods.h"
#include "btree_gc.h"
-#include "btree_update.h"
-#include "btree_update_interior.h"
+#include "btree_iter.h"
#include "buckets.h"
#include "checksum.h"
#include "debug.h"
-#include "dirent.h"
#include "disk_groups.h"
#include "error.h"
#include "extents.h"
@@ -24,85 +22,18 @@
#include "super.h"
#include "super-io.h"
#include "util.h"
-#include "xattr.h"
#include <trace/events/bcachefs.h>
-unsigned bch2_bkey_nr_ptrs(struct bkey_s_c k)
-{
- struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k);
- const struct bch_extent_ptr *ptr;
- unsigned nr_ptrs = 0;
-
- bkey_for_each_ptr(p, ptr)
- nr_ptrs++;
-
- return nr_ptrs;
-}
-
-unsigned bch2_bkey_nr_dirty_ptrs(struct bkey_s_c k)
-{
- unsigned nr_ptrs = 0;
-
- switch (k.k->type) {
- case KEY_TYPE_btree_ptr:
- case KEY_TYPE_extent:
- case KEY_TYPE_reflink_v: {
- struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k);
- const struct bch_extent_ptr *ptr;
-
- bkey_for_each_ptr(p, ptr)
- nr_ptrs += !ptr->cached;
- BUG_ON(!nr_ptrs);
- break;
- }
- case KEY_TYPE_reservation:
- nr_ptrs = bkey_s_c_to_reservation(k).v->nr_replicas;
- break;
- }
-
- return nr_ptrs;
-}
-
-static unsigned bch2_extent_ptr_durability(struct bch_fs *c,
- struct extent_ptr_decoded p)
-{
- unsigned i, durability = 0;
- struct bch_dev *ca;
-
- if (p.ptr.cached)
- return 0;
-
- ca = bch_dev_bkey_exists(c, p.ptr.dev);
-
- if (ca->mi.state != BCH_MEMBER_STATE_FAILED)
- durability = max_t(unsigned, durability, ca->mi.durability);
-
- for (i = 0; i < p.ec_nr; i++) {
- struct stripe *s =
- genradix_ptr(&c->stripes[0], p.idx);
-
- if (WARN_ON(!s))
- continue;
-
- durability = max_t(unsigned, durability, s->nr_redundant);
- }
-
- return durability;
-}
-
-unsigned bch2_bkey_durability(struct bch_fs *c, struct bkey_s_c k)
-{
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
- const union bch_extent_entry *entry;
- struct extent_ptr_decoded p;
- unsigned durability = 0;
-
- bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
- durability += bch2_extent_ptr_durability(c, p);
+static unsigned bch2_crc_field_size_max[] = {
+ [BCH_EXTENT_ENTRY_crc32] = CRC32_SIZE_MAX,
+ [BCH_EXTENT_ENTRY_crc64] = CRC64_SIZE_MAX,
+ [BCH_EXTENT_ENTRY_crc128] = CRC128_SIZE_MAX,
+};
- return durability;
-}
+static void bch2_extent_crc_pack(union bch_extent_crc *,
+ struct bch_extent_crc_unpacked,
+ enum bch_extent_entry_type);
static struct bch_dev_io_failures *dev_io_failures(struct bch_io_failures *f,
unsigned dev)
@@ -206,10 +137,10 @@ int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k,
p.idx++;
if (force_reconstruct_read(c) &&
- !p.idx && p.ec_nr)
+ !p.idx && p.has_ec)
p.idx++;
- if (p.idx >= p.ec_nr + 1)
+ if (p.idx >= (unsigned) p.has_ec + 1)
continue;
if (ret > 0 && !ptr_better(c, p, *pick))
@@ -222,172 +153,299 @@ int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k,
return ret;
}
-void bch2_bkey_append_ptr(struct bkey_i *k,
- struct bch_extent_ptr ptr)
+/* KEY_TYPE_btree_ptr: */
+
+const char *bch2_btree_ptr_invalid(const struct bch_fs *c, struct bkey_s_c k)
{
- EBUG_ON(bch2_bkey_has_device(bkey_i_to_s_c(k), ptr.dev));
+ if (bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX)
+ return "value too big";
- switch (k->k.type) {
- case KEY_TYPE_btree_ptr:
- case KEY_TYPE_extent:
- EBUG_ON(bkey_val_u64s(&k->k) >= BKEY_EXTENT_VAL_U64s_MAX);
+ return bch2_bkey_ptrs_invalid(c, k);
+}
- ptr.type = 1 << BCH_EXTENT_ENTRY_ptr;
+void bch2_btree_ptr_debugcheck(struct bch_fs *c, struct bkey_s_c k)
+{
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+ const struct bch_extent_ptr *ptr;
+ const char *err;
+ char buf[160];
+ struct bucket_mark mark;
+ struct bch_dev *ca;
- memcpy((void *) &k->v + bkey_val_bytes(&k->k),
- &ptr,
- sizeof(ptr));
- k->u64s++;
- break;
- default:
- BUG();
+ bch2_fs_bug_on(!test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) &&
+ !bch2_bkey_replicas_marked(c, k, false), c,
+ "btree key bad (replicas not marked in superblock):\n%s",
+ (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf));
+
+ if (!test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags))
+ return;
+
+ bkey_for_each_ptr(ptrs, ptr) {
+ ca = bch_dev_bkey_exists(c, ptr->dev);
+
+ mark = ptr_bucket_mark(ca, ptr);
+
+ err = "stale";
+ if (gen_after(mark.gen, ptr->gen))
+ goto err;
+
+ err = "inconsistent";
+ if (mark.data_type != BCH_DATA_BTREE ||
+ mark.dirty_sectors < c->opts.btree_node_size)
+ goto err;
}
+
+ return;
+err:
+ bch2_bkey_val_to_text(&PBUF(buf), c, k);
+ bch2_fs_bug(c, "%s btree pointer %s: bucket %zi gen %i mark %08x",
+ err, buf, PTR_BUCKET_NR(ca, ptr),
+ mark.gen, (unsigned) mark.v.counter);
}
-void bch2_bkey_drop_device(struct bkey_s k, unsigned dev)
+void bch2_btree_ptr_to_text(struct printbuf *out, struct bch_fs *c,
+ struct bkey_s_c k)
{
- struct bch_extent_ptr *ptr;
+ bch2_bkey_ptrs_to_text(out, c, k);
+}
- bch2_bkey_drop_ptrs(k, ptr, ptr->dev == dev);
+/* KEY_TYPE_extent: */
+
+const char *bch2_extent_invalid(const struct bch_fs *c, struct bkey_s_c k)
+{
+ return bch2_bkey_ptrs_invalid(c, k);
}
-const struct bch_extent_ptr *
-bch2_bkey_has_device(struct bkey_s_c k, unsigned dev)
+void bch2_extent_debugcheck(struct bch_fs *c, struct bkey_s_c k)
{
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
- const struct bch_extent_ptr *ptr;
+ struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
+ const union bch_extent_entry *entry;
+ struct extent_ptr_decoded p;
+ char buf[160];
- bkey_for_each_ptr(ptrs, ptr)
- if (ptr->dev == dev)
- return ptr;
+ /*
+ * XXX: we should be doing most/all of these checks at startup time,
+ * where we check bch2_bkey_invalid() in btree_node_read_done()
+ *
+ * But note that we can't check for stale pointers or incorrect gc marks
+ * until after journal replay is done (it might be an extent that's
+ * going to get overwritten during replay)
+ */
- return NULL;
-}
+ if (percpu_down_read_trylock(&c->mark_lock)) {
+ bch2_fs_bug_on(!test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) &&
+ !bch2_bkey_replicas_marked_locked(c, e.s_c, false), c,
+ "extent key bad (replicas not marked in superblock):\n%s",
+ (bch2_bkey_val_to_text(&PBUF(buf), c, e.s_c), buf));
+ percpu_up_read(&c->mark_lock);
+ }
+ /*
+ * If journal replay hasn't finished, we might be seeing keys
+ * that will be overwritten by the time journal replay is done:
+ */
+ if (!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags))
+ return;
-bool bch2_bkey_has_target(struct bch_fs *c, struct bkey_s_c k, unsigned target)
-{
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
- const struct bch_extent_ptr *ptr;
+ extent_for_each_ptr_decode(e, p, entry) {
+ struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev);
+ struct bucket_mark mark = ptr_bucket_mark(ca, &p.ptr);
+ unsigned stale = gen_after(mark.gen, p.ptr.gen);
+ unsigned disk_sectors = ptr_disk_sectors(p);
+ unsigned mark_sectors = p.ptr.cached
+ ? mark.cached_sectors
+ : mark.dirty_sectors;
- bkey_for_each_ptr(ptrs, ptr)
- if (bch2_dev_in_target(c, ptr->dev, target) &&
- (!ptr->cached ||
- !ptr_stale(bch_dev_bkey_exists(c, ptr->dev), ptr)))
- return true;
+ bch2_fs_bug_on(stale && !p.ptr.cached, c,
+ "stale dirty pointer (ptr gen %u bucket %u",
+ p.ptr.gen, mark.gen);
- return false;
+ bch2_fs_bug_on(stale > 96, c, "key too stale: %i", stale);
+
+ bch2_fs_bug_on(!stale &&
+ (mark.data_type != BCH_DATA_USER ||
+ mark_sectors < disk_sectors), c,
+ "extent pointer not marked: %s:\n"
+ "type %u sectors %u < %u",
+ (bch2_bkey_val_to_text(&PBUF(buf), c, e.s_c), buf),
+ mark.data_type,
+ mark_sectors, disk_sectors);
+ }
}
-/* extent specific utility code */
+void bch2_extent_to_text(struct printbuf *out, struct bch_fs *c,
+ struct bkey_s_c k)
+{
+ bch2_bkey_ptrs_to_text(out, c, k);
+}
-const struct bch_extent_ptr *
-bch2_extent_has_device(struct bkey_s_c_extent e, unsigned dev)
+enum merge_result bch2_extent_merge(struct bch_fs *c,
+ struct bkey_s _l, struct bkey_s _r)
{
- const struct bch_extent_ptr *ptr;
+ struct bkey_s_extent l = bkey_s_to_extent(_l);
+ struct bkey_s_extent r = bkey_s_to_extent(_r);
+ union bch_extent_entry *en_l = l.v->start;
+ union bch_extent_entry *en_r = r.v->start;
+ struct bch_extent_crc_unpacked crc_l, crc_r;
- extent_for_each_ptr(e, ptr)
- if (ptr->dev == dev)
- return ptr;
+ if (bkey_val_u64s(l.k) != bkey_val_u64s(r.k))
+ return BCH_MERGE_NOMERGE;
- return NULL;
-}
+ crc_l = bch2_extent_crc_unpack(l.k, NULL);
-const struct bch_extent_ptr *
-bch2_extent_has_group(struct bch_fs *c, struct bkey_s_c_extent e, unsigned group)
-{
- const struct bch_extent_ptr *ptr;
+ extent_for_each_entry(l, en_l) {
+ en_r = vstruct_idx(r.v, (u64 *) en_l - l.v->_data);
- extent_for_each_ptr(e, ptr) {
- struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
+ if (extent_entry_type(en_l) != extent_entry_type(en_r))
+ return BCH_MERGE_NOMERGE;
- if (ca->mi.group &&
- ca->mi.group - 1 == group)
- return ptr;
+ switch (extent_entry_type(en_l)) {
+ case BCH_EXTENT_ENTRY_ptr: {
+ const struct bch_extent_ptr *lp = &en_l->ptr;
+ const struct bch_extent_ptr *rp = &en_r->ptr;
+ struct bch_dev *ca;
+
+ if (lp->offset + crc_l.compressed_size != rp->offset ||
+ lp->dev != rp->dev ||
+ lp->gen != rp->gen)
+ return BCH_MERGE_NOMERGE;
+
+ /* We don't allow extents to straddle buckets: */
+ ca = bch_dev_bkey_exists(c, lp->dev);
+
+ if (PTR_BUCKET_NR(ca, lp) != PTR_BUCKET_NR(ca, rp))
+ return BCH_MERGE_NOMERGE;
+
+ break;
+ }
+ case BCH_EXTENT_ENTRY_stripe_ptr:
+ if (en_l->stripe_ptr.block != en_r->stripe_ptr.block ||
+ en_l->stripe_ptr.idx != en_r->stripe_ptr.idx)
+ return BCH_MERGE_NOMERGE;
+ break;
+ case BCH_EXTENT_ENTRY_crc32:
+ case BCH_EXTENT_ENTRY_crc64:
+ case BCH_EXTENT_ENTRY_crc128:
+ crc_l = bch2_extent_crc_unpack(l.k, entry_to_crc(en_l));
+ crc_r = bch2_extent_crc_unpack(r.k, entry_to_crc(en_r));
+
+ if (crc_l.csum_type != crc_r.csum_type ||
+ crc_l.compression_type != crc_r.compression_type ||
+ crc_l.nonce != crc_r.nonce)
+ return BCH_MERGE_NOMERGE;
+
+ if (crc_l.offset + crc_l.live_size != crc_l.compressed_size ||
+ crc_r.offset)
+ return BCH_MERGE_NOMERGE;
+
+ if (!bch2_checksum_mergeable(crc_l.csum_type))
+ return BCH_MERGE_NOMERGE;
+
+ if (crc_l.compression_type)
+ return BCH_MERGE_NOMERGE;
+
+ if (crc_l.csum_type &&
+ crc_l.uncompressed_size +
+ crc_r.uncompressed_size > c->sb.encoded_extent_max)
+ return BCH_MERGE_NOMERGE;
+
+ if (crc_l.uncompressed_size + crc_r.uncompressed_size - 1 >
+ bch2_crc_field_size_max[extent_entry_type(en_l)])
+ return BCH_MERGE_NOMERGE;
+
+ break;
+ default:
+ return BCH_MERGE_NOMERGE;
+ }
}
- return NULL;
-}
+ extent_for_each_entry(l, en_l) {
+ struct bch_extent_crc_unpacked crc_l, crc_r;
-unsigned bch2_extent_is_compressed(struct bkey_s_c k)
-{
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
- const union bch_extent_entry *entry;
- struct extent_ptr_decoded p;
- unsigned ret = 0;
+ en_r = vstruct_idx(r.v, (u64 *) en_l - l.v->_data);
- bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
- if (!p.ptr.cached &&
- p.crc.compression_type != BCH_COMPRESSION_NONE)
- ret += p.crc.compressed_size;
+ if (!extent_entry_is_crc(en_l))
+ continue;
- return ret;
-}
+ crc_l = bch2_extent_crc_unpack(l.k, entry_to_crc(en_l));
+ crc_r = bch2_extent_crc_unpack(r.k, entry_to_crc(en_r));
-bool bch2_bkey_matches_ptr(struct bch_fs *c, struct bkey_s_c k,
- struct bch_extent_ptr m, u64 offset)
-{
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
- const union bch_extent_entry *entry;
- struct extent_ptr_decoded p;
+ crc_l.csum = bch2_checksum_merge(crc_l.csum_type,
+ crc_l.csum,
+ crc_r.csum,
+ crc_r.uncompressed_size << 9);
- bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
- if (p.ptr.dev == m.dev &&
- p.ptr.gen == m.gen &&
- (s64) p.ptr.offset + p.crc.offset - bkey_start_offset(k.k) ==
- (s64) m.offset - offset)
- return true;
+ crc_l.uncompressed_size += crc_r.uncompressed_size;
+ crc_l.compressed_size += crc_r.compressed_size;
- return false;
+ bch2_extent_crc_pack(entry_to_crc(en_l), crc_l,
+ extent_entry_type(en_l));
+ }
+
+ bch2_key_resize(l.k, l.k->size + r.k->size);
+
+ return BCH_MERGE_MERGE;
}
-static union bch_extent_entry *extent_entry_prev(struct bkey_ptrs ptrs,
- union bch_extent_entry *entry)
+/* KEY_TYPE_reservation: */
+
+const char *bch2_reservation_invalid(const struct bch_fs *c, struct bkey_s_c k)
{
- union bch_extent_entry *i = ptrs.start;
+ struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k);
- if (i == entry)
- return NULL;
+ if (bkey_val_bytes(k.k) != sizeof(struct bch_reservation))
+ return "incorrect value size";
- while (extent_entry_next(i) != entry)
- i = extent_entry_next(i);
- return i;
+ if (!r.v->nr_replicas || r.v->nr_replicas > BCH_REPLICAS_MAX)
+ return "invalid nr_replicas";
+
+ return NULL;
}
-union bch_extent_entry *bch2_bkey_drop_ptr(struct bkey_s k,
- struct bch_extent_ptr *ptr)
+void bch2_reservation_to_text(struct printbuf *out, struct bch_fs *c,
+ struct bkey_s_c k)
{
- struct bkey_ptrs ptrs = bch2_bkey_ptrs(k);
- union bch_extent_entry *dst, *src, *prev;
- bool drop_crc = true;
-
- EBUG_ON(ptr < &ptrs.start->ptr ||
- ptr >= &ptrs.end->ptr);
- EBUG_ON(ptr->type != 1 << BCH_EXTENT_ENTRY_ptr);
+ struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k);
- src = extent_entry_next(to_entry(ptr));
- if (src != ptrs.end &&
- !extent_entry_is_crc(src))
- drop_crc = false;
+ pr_buf(out, "generation %u replicas %u",
+ le32_to_cpu(r.v->generation),
+ r.v->nr_replicas);
+}
- dst = to_entry(ptr);
- while ((prev = extent_entry_prev(ptrs, dst))) {
- if (extent_entry_is_ptr(prev))
- break;
+enum merge_result bch2_reservation_merge(struct bch_fs *c,
+ struct bkey_s _l, struct bkey_s _r)
+{
+ struct bkey_s_reservation l = bkey_s_to_reservation(_l);
+ struct bkey_s_reservation r = bkey_s_to_reservation(_r);
- if (extent_entry_is_crc(prev)) {
- if (drop_crc)
- dst = prev;
- break;
- }
+ if (l.v->generation != r.v->generation ||
+ l.v->nr_replicas != r.v->nr_replicas)
+ return BCH_MERGE_NOMERGE;
- dst = prev;
+ if ((u64) l.k->size + r.k->size > KEY_SIZE_MAX) {
+ bch2_key_resize(l.k, KEY_SIZE_MAX);
+ bch2_cut_front_s(l.k->p, r.s);
+ return BCH_MERGE_PARTIAL;
}
- memmove_u64s_down(dst, src,
- (u64 *) ptrs.end - (u64 *) src);
- k.k->u64s -= (u64 *) src - (u64 *) dst;
+ bch2_key_resize(l.k, l.k->size + r.k->size);
- return dst;
+ return BCH_MERGE_MERGE;
+}
+
+/* Extent checksum entries: */
+
+/* returns true if not equal */
+static inline bool bch2_crc_unpacked_cmp(struct bch_extent_crc_unpacked l,
+ struct bch_extent_crc_unpacked r)
+{
+ return (l.csum_type != r.csum_type ||
+ l.compression_type != r.compression_type ||
+ l.compressed_size != r.compressed_size ||
+ l.uncompressed_size != r.uncompressed_size ||
+ l.offset != r.offset ||
+ l.live_size != r.live_size ||
+ l.nonce != r.nonce ||
+ bch2_crc_cmp(l.csum, r.csum));
}
static inline bool can_narrow_crc(struct bch_extent_crc_unpacked u,
@@ -466,52 +524,404 @@ restart_narrow_pointers:
return ret;
}
-/* returns true if not equal */
-static inline bool bch2_crc_unpacked_cmp(struct bch_extent_crc_unpacked l,
- struct bch_extent_crc_unpacked r)
+static void bch2_extent_crc_pack(union bch_extent_crc *dst,
+ struct bch_extent_crc_unpacked src,
+ enum bch_extent_entry_type type)
{
- return (l.csum_type != r.csum_type ||
- l.compression_type != r.compression_type ||
- l.compressed_size != r.compressed_size ||
- l.uncompressed_size != r.uncompressed_size ||
- l.offset != r.offset ||
- l.live_size != r.live_size ||
- l.nonce != r.nonce ||
- bch2_crc_cmp(l.csum, r.csum));
+#define set_common_fields(_dst, _src) \
+ _dst.type = 1 << type; \
+ _dst.csum_type = _src.csum_type, \
+ _dst.compression_type = _src.compression_type, \
+ _dst._compressed_size = _src.compressed_size - 1, \
+ _dst._uncompressed_size = _src.uncompressed_size - 1, \
+ _dst.offset = _src.offset
+
+ switch (type) {
+ case BCH_EXTENT_ENTRY_crc32:
+ set_common_fields(dst->crc32, src);
+ dst->crc32.csum = *((__le32 *) &src.csum.lo);
+ break;
+ case BCH_EXTENT_ENTRY_crc64:
+ set_common_fields(dst->crc64, src);
+ dst->crc64.nonce = src.nonce;
+ dst->crc64.csum_lo = src.csum.lo;
+ dst->crc64.csum_hi = *((__le16 *) &src.csum.hi);
+ break;
+ case BCH_EXTENT_ENTRY_crc128:
+ set_common_fields(dst->crc128, src);
+ dst->crc128.nonce = src.nonce;
+ dst->crc128.csum = src.csum;
+ break;
+ default:
+ BUG();
+ }
+#undef set_common_fields
}
-void bch2_ptr_swab(const struct bkey_format *f, struct bkey_packed *k)
+void bch2_extent_crc_append(struct bkey_i *k,
+ struct bch_extent_crc_unpacked new)
{
- union bch_extent_entry *entry;
- u64 *d = (u64 *) bkeyp_val(f, k);
- unsigned i;
+ struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(k));
+ union bch_extent_crc *crc = (void *) ptrs.end;
+ enum bch_extent_entry_type type;
- for (i = 0; i < bkeyp_val_u64s(f, k); i++)
- d[i] = swab64(d[i]);
+ if (bch_crc_bytes[new.csum_type] <= 4 &&
+ new.uncompressed_size - 1 <= CRC32_SIZE_MAX &&
+ new.nonce <= CRC32_NONCE_MAX)
+ type = BCH_EXTENT_ENTRY_crc32;
+ else if (bch_crc_bytes[new.csum_type] <= 10 &&
+ new.uncompressed_size - 1 <= CRC64_SIZE_MAX &&
+ new.nonce <= CRC64_NONCE_MAX)
+ type = BCH_EXTENT_ENTRY_crc64;
+ else if (bch_crc_bytes[new.csum_type] <= 16 &&
+ new.uncompressed_size - 1 <= CRC128_SIZE_MAX &&
+ new.nonce <= CRC128_NONCE_MAX)
+ type = BCH_EXTENT_ENTRY_crc128;
+ else
+ BUG();
- for (entry = (union bch_extent_entry *) d;
- entry < (union bch_extent_entry *) (d + bkeyp_val_u64s(f, k));
- entry = extent_entry_next(entry)) {
- switch (extent_entry_type(entry)) {
- case BCH_EXTENT_ENTRY_ptr:
- break;
- case BCH_EXTENT_ENTRY_crc32:
- entry->crc32.csum = swab32(entry->crc32.csum);
+ bch2_extent_crc_pack(crc, new, type);
+
+ k->k.u64s += extent_entry_u64s(ptrs.end);
+
+ EBUG_ON(bkey_val_u64s(&k->k) > BKEY_EXTENT_VAL_U64s_MAX);
+}
+
+/* Generic code for keys with pointers: */
+
+unsigned bch2_bkey_nr_ptrs(struct bkey_s_c k)
+{
+ return bch2_bkey_devs(k).nr;
+}
+
+unsigned bch2_bkey_nr_ptrs_allocated(struct bkey_s_c k)
+{
+ return k.k->type == KEY_TYPE_reservation
+ ? bkey_s_c_to_reservation(k).v->nr_replicas
+ : bch2_bkey_dirty_devs(k).nr;
+}
+
+unsigned bch2_bkey_nr_ptrs_fully_allocated(struct bkey_s_c k)
+{
+ unsigned ret = 0;
+
+ if (k.k->type == KEY_TYPE_reservation) {
+ ret = bkey_s_c_to_reservation(k).v->nr_replicas;
+ } else {
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+ const union bch_extent_entry *entry;
+ struct extent_ptr_decoded p;
+
+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
+ ret += !p.ptr.cached &&
+ p.crc.compression_type == BCH_COMPRESSION_NONE;
+ }
+
+ return ret;
+}
+
+unsigned bch2_bkey_sectors_compressed(struct bkey_s_c k)
+{
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+ const union bch_extent_entry *entry;
+ struct extent_ptr_decoded p;
+ unsigned ret = 0;
+
+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
+ if (!p.ptr.cached &&
+ p.crc.compression_type != BCH_COMPRESSION_NONE)
+ ret += p.crc.compressed_size;
+
+ return ret;
+}
+
+bool bch2_check_range_allocated(struct bch_fs *c, struct bpos pos, u64 size,
+ unsigned nr_replicas)
+{
+ struct btree_trans trans;
+ struct btree_iter *iter;
+ struct bpos end = pos;
+ struct bkey_s_c k;
+ bool ret = true;
+ int err;
+
+ end.offset += size;
+
+ bch2_trans_init(&trans, c, 0, 0);
+
+ for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, pos,
+ BTREE_ITER_SLOTS, k, err) {
+ if (bkey_cmp(bkey_start_pos(k.k), end) >= 0)
break;
- case BCH_EXTENT_ENTRY_crc64:
- entry->crc64.csum_hi = swab16(entry->crc64.csum_hi);
- entry->crc64.csum_lo = swab64(entry->crc64.csum_lo);
+
+ if (nr_replicas > bch2_bkey_nr_ptrs_fully_allocated(k)) {
+ ret = false;
break;
- case BCH_EXTENT_ENTRY_crc128:
- entry->crc128.csum.hi = (__force __le64)
- swab64((__force u64) entry->crc128.csum.hi);
- entry->crc128.csum.lo = (__force __le64)
- swab64((__force u64) entry->crc128.csum.lo);
+ }
+ }
+ bch2_trans_exit(&trans);
+
+ return ret;
+}
+
+static unsigned bch2_extent_ptr_durability(struct bch_fs *c,
+ struct extent_ptr_decoded p)
+{
+ unsigned durability = 0;
+ struct bch_dev *ca;
+
+ if (p.ptr.cached)
+ return 0;
+
+ ca = bch_dev_bkey_exists(c, p.ptr.dev);
+
+ if (ca->mi.state != BCH_MEMBER_STATE_FAILED)
+ durability = max_t(unsigned, durability, ca->mi.durability);
+
+ if (p.has_ec) {
+ struct stripe *s =
+ genradix_ptr(&c->stripes[0], p.ec.idx);
+
+ if (WARN_ON(!s))
+ goto out;
+
+ durability = max_t(unsigned, durability, s->nr_redundant);
+ }
+out:
+ return durability;
+}
+
+unsigned bch2_bkey_durability(struct bch_fs *c, struct bkey_s_c k)
+{
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+ const union bch_extent_entry *entry;
+ struct extent_ptr_decoded p;
+ unsigned durability = 0;
+
+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
+ durability += bch2_extent_ptr_durability(c, p);
+
+ return durability;
+}
+
+void bch2_bkey_mark_replicas_cached(struct bch_fs *c, struct bkey_s k,
+ unsigned target,
+ unsigned nr_desired_replicas)
+{
+ struct bkey_ptrs ptrs = bch2_bkey_ptrs(k);
+ union bch_extent_entry *entry;
+ struct extent_ptr_decoded p;
+ int extra = bch2_bkey_durability(c, k.s_c) - nr_desired_replicas;
+
+ if (target && extra > 0)
+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
+ int n = bch2_extent_ptr_durability(c, p);
+
+ if (n && n <= extra &&
+ !bch2_dev_in_target(c, p.ptr.dev, target)) {
+ entry->ptr.cached = true;
+ extra -= n;
+ }
+ }
+
+ if (extra > 0)
+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
+ int n = bch2_extent_ptr_durability(c, p);
+
+ if (n && n <= extra) {
+ entry->ptr.cached = true;
+ extra -= n;
+ }
+ }
+}
+
+void bch2_bkey_append_ptr(struct bkey_i *k,
+ struct bch_extent_ptr ptr)
+{
+ EBUG_ON(bch2_bkey_has_device(bkey_i_to_s_c(k), ptr.dev));
+
+ switch (k->k.type) {
+ case KEY_TYPE_btree_ptr:
+ case KEY_TYPE_extent:
+ EBUG_ON(bkey_val_u64s(&k->k) >= BKEY_EXTENT_VAL_U64s_MAX);
+
+ ptr.type = 1 << BCH_EXTENT_ENTRY_ptr;
+
+ memcpy((void *) &k->v + bkey_val_bytes(&k->k),
+ &ptr,
+ sizeof(ptr));
+ k->u64s++;
+ break;
+ default:
+ BUG();
+ }
+}
+
+static inline void __extent_entry_insert(struct bkey_i *k,
+ union bch_extent_entry *dst,
+ union bch_extent_entry *new)
+{
+ union bch_extent_entry *end = bkey_val_end(bkey_i_to_s(k));
+
+ memmove_u64s_up_small((u64 *) dst + extent_entry_u64s(new),
+ dst, (u64 *) end - (u64 *) dst);
+ k->k.u64s += extent_entry_u64s(new);
+ memcpy(dst, new, extent_entry_bytes(new));
+}
+
+void bch2_extent_ptr_decoded_append(struct bkey_i *k,
+ struct extent_ptr_decoded *p)
+{
+ struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(k));
+ struct bch_extent_crc_unpacked crc =
+ bch2_extent_crc_unpack(&k->k, NULL);
+ union bch_extent_entry *pos;
+
+ if (!bch2_crc_unpacked_cmp(crc, p->crc)) {
+ pos = ptrs.start;
+ goto found;
+ }
+
+ bkey_for_each_crc(&k->k, ptrs, crc, pos)
+ if (!bch2_crc_unpacked_cmp(crc, p->crc)) {
+ pos = extent_entry_next(pos);
+ goto found;
+ }
+
+ bch2_extent_crc_append(k, p->crc);
+ pos = bkey_val_end(bkey_i_to_s(k));
+found:
+ p->ptr.type = 1 << BCH_EXTENT_ENTRY_ptr;
+ __extent_entry_insert(k, pos, to_entry(&p->ptr));
+
+ if (p->has_ec) {
+ p->ec.type = 1 << BCH_EXTENT_ENTRY_stripe_ptr;
+ __extent_entry_insert(k, pos, to_entry(&p->ec));
+ }
+}
+
+static union bch_extent_entry *extent_entry_prev(struct bkey_ptrs ptrs,
+ union bch_extent_entry *entry)
+{
+ union bch_extent_entry *i = ptrs.start;
+
+ if (i == entry)
+ return NULL;
+
+ while (extent_entry_next(i) != entry)
+ i = extent_entry_next(i);
+ return i;
+}
+
+union bch_extent_entry *bch2_bkey_drop_ptr(struct bkey_s k,
+ struct bch_extent_ptr *ptr)
+{
+ struct bkey_ptrs ptrs = bch2_bkey_ptrs(k);
+ union bch_extent_entry *dst, *src, *prev;
+ bool drop_crc = true;
+
+ EBUG_ON(ptr < &ptrs.start->ptr ||
+ ptr >= &ptrs.end->ptr);
+ EBUG_ON(ptr->type != 1 << BCH_EXTENT_ENTRY_ptr);
+
+ src = extent_entry_next(to_entry(ptr));
+ if (src != ptrs.end &&
+ !extent_entry_is_crc(src))
+ drop_crc = false;
+
+ dst = to_entry(ptr);
+ while ((prev = extent_entry_prev(ptrs, dst))) {
+ if (extent_entry_is_ptr(prev))
break;
- case BCH_EXTENT_ENTRY_stripe_ptr:
+
+ if (extent_entry_is_crc(prev)) {
+ if (drop_crc)
+ dst = prev;
break;
}
+
+ dst = prev;
}
+
+ memmove_u64s_down(dst, src,
+ (u64 *) ptrs.end - (u64 *) src);
+ k.k->u64s -= (u64 *) src - (u64 *) dst;
+
+ return dst;
+}
+
+void bch2_bkey_drop_device(struct bkey_s k, unsigned dev)
+{
+ struct bch_extent_ptr *ptr;
+
+ bch2_bkey_drop_ptrs(k, ptr, ptr->dev == dev);
+}
+
+const struct bch_extent_ptr *
+bch2_bkey_has_device(struct bkey_s_c k, unsigned dev)
+{
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+ const struct bch_extent_ptr *ptr;
+
+ bkey_for_each_ptr(ptrs, ptr)
+ if (ptr->dev == dev)
+ return ptr;
+
+ return NULL;
+}
+
+bool bch2_bkey_has_target(struct bch_fs *c, struct bkey_s_c k, unsigned target)
+{
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+ const struct bch_extent_ptr *ptr;
+
+ bkey_for_each_ptr(ptrs, ptr)
+ if (bch2_dev_in_target(c, ptr->dev, target) &&
+ (!ptr->cached ||
+ !ptr_stale(bch_dev_bkey_exists(c, ptr->dev), ptr)))
+ return true;
+
+ return false;
+}
+
+bool bch2_bkey_matches_ptr(struct bch_fs *c, struct bkey_s_c k,
+ struct bch_extent_ptr m, u64 offset)
+{
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+ const union bch_extent_entry *entry;
+ struct extent_ptr_decoded p;
+
+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
+ if (p.ptr.dev == m.dev &&
+ p.ptr.gen == m.gen &&
+ (s64) p.ptr.offset + p.crc.offset - bkey_start_offset(k.k) ==
+ (s64) m.offset - offset)
+ return true;
+
+ return false;
+}
+
+/*
+ * bch2_extent_normalize - clean up an extent, dropping stale pointers etc.
+ *
+ * Returns true if @k should be dropped entirely
+ *
+ * For existing keys, only called when btree nodes are being rewritten, not when
+ * they're merely being compacted/resorted in memory.
+ */
+bool bch2_extent_normalize(struct bch_fs *c, struct bkey_s k)
+{
+ struct bch_extent_ptr *ptr;
+
+ bch2_bkey_drop_ptrs(k, ptr,
+ ptr->cached &&
+ ptr_stale(bch_dev_bkey_exists(c, ptr->dev), ptr));
+
+ /* will only happen if all pointers were cached: */
+ if (!bch2_bkey_nr_ptrs(k.s_c))
+ k.k->type = KEY_TYPE_discard;
+
+ return bkey_whiteout(k.k);
}
void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
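A minimal sketch of how bch2_extent_normalize() above might be used by a caller; the count_live_extents() helper and its array/loop shape are assumptions for illustration, not code from this change:

/*
 * Illustrative sketch: count keys that still carry live data after
 * bch2_extent_normalize() has dropped their stale cached pointers.
 * A true return value means the key should be dropped entirely.
 */
static unsigned count_live_extents(struct bch_fs *c, struct bkey_s *keys,
				   unsigned nr)
{
	unsigned i, live = 0;

	for (i = 0; i < nr; i++)
		if (!bch2_extent_normalize(c, keys[i]))
			live++;

	return live;
}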
@@ -662,70 +1072,50 @@ const char *bch2_bkey_ptrs_invalid(const struct bch_fs *c, struct bkey_s_c k)
return NULL;
}
-/* Btree ptrs */
-
-const char *bch2_btree_ptr_invalid(const struct bch_fs *c, struct bkey_s_c k)
-{
- if (bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX)
- return "value too big";
-
- return bch2_bkey_ptrs_invalid(c, k);
-}
-
-void bch2_btree_ptr_debugcheck(struct bch_fs *c, struct bkey_s_c k)
+void bch2_ptr_swab(const struct bkey_format *f, struct bkey_packed *k)
{
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
- const struct bch_extent_ptr *ptr;
- const char *err;
- char buf[160];
- struct bucket_mark mark;
- struct bch_dev *ca;
-
- bch2_fs_bug_on(!test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) &&
- !bch2_bkey_replicas_marked(c, k, false), c,
- "btree key bad (replicas not marked in superblock):\n%s",
- (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf));
-
- if (!test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags))
- return;
-
- bkey_for_each_ptr(ptrs, ptr) {
- ca = bch_dev_bkey_exists(c, ptr->dev);
-
- mark = ptr_bucket_mark(ca, ptr);
+ union bch_extent_entry *entry;
+ u64 *d = (u64 *) bkeyp_val(f, k);
+ unsigned i;
- err = "stale";
- if (gen_after(mark.gen, ptr->gen))
- goto err;
+ for (i = 0; i < bkeyp_val_u64s(f, k); i++)
+ d[i] = swab64(d[i]);
- err = "inconsistent";
- if (mark.data_type != BCH_DATA_BTREE ||
- mark.dirty_sectors < c->opts.btree_node_size)
- goto err;
+ for (entry = (union bch_extent_entry *) d;
+ entry < (union bch_extent_entry *) (d + bkeyp_val_u64s(f, k));
+ entry = extent_entry_next(entry)) {
+ switch (extent_entry_type(entry)) {
+ case BCH_EXTENT_ENTRY_ptr:
+ break;
+ case BCH_EXTENT_ENTRY_crc32:
+ entry->crc32.csum = swab32(entry->crc32.csum);
+ break;
+ case BCH_EXTENT_ENTRY_crc64:
+ entry->crc64.csum_hi = swab16(entry->crc64.csum_hi);
+ entry->crc64.csum_lo = swab64(entry->crc64.csum_lo);
+ break;
+ case BCH_EXTENT_ENTRY_crc128:
+ entry->crc128.csum.hi = (__force __le64)
+ swab64((__force u64) entry->crc128.csum.hi);
+ entry->crc128.csum.lo = (__force __le64)
+ swab64((__force u64) entry->crc128.csum.lo);
+ break;
+ case BCH_EXTENT_ENTRY_stripe_ptr:
+ break;
+ }
}
-
- return;
-err:
- bch2_bkey_val_to_text(&PBUF(buf), c, k);
- bch2_fs_bug(c, "%s btree pointer %s: bucket %zi gen %i mark %08x",
- err, buf, PTR_BUCKET_NR(ca, ptr),
- mark.gen, (unsigned) mark.v.counter);
}
-void bch2_btree_ptr_to_text(struct printbuf *out, struct bch_fs *c,
- struct bkey_s_c k)
-{
- bch2_bkey_ptrs_to_text(out, c, k);
-}
+/* Generic extent code: */
-/* Extents */
-
-void __bch2_cut_front(struct bpos where, struct bkey_s k)
+int bch2_cut_front_s(struct bpos where, struct bkey_s k)
{
+ unsigned new_val_u64s = bkey_val_u64s(k.k);
+ int val_u64s_delta;
u64 sub;
if (bkey_cmp(where, bkey_start_pos(k.k)) <= 0)
- return;
+ return 0;
EBUG_ON(bkey_cmp(where, k.k->p) > 0);
@@ -733,15 +1123,12 @@ void __bch2_cut_front(struct bpos where, struct bkey_s k)
k.k->size -= sub;
- if (!k.k->size)
+ if (!k.k->size) {
k.k->type = KEY_TYPE_deleted;
+ new_val_u64s = 0;
+ }
switch (k.k->type) {
- case KEY_TYPE_deleted:
- case KEY_TYPE_discard:
- case KEY_TYPE_error:
- case KEY_TYPE_cookie:
- break;
case KEY_TYPE_extent:
case KEY_TYPE_reflink_v: {
struct bkey_ptrs ptrs = bch2_bkey_ptrs(k);
@@ -779,1119 +1166,59 @@ void __bch2_cut_front(struct bpos where, struct bkey_s k)
le64_add_cpu(&p.v->idx, sub);
break;
}
- case KEY_TYPE_reservation:
- break;
- default:
- BUG();
- }
-}
-
-bool bch2_cut_back(struct bpos where, struct bkey *k)
-{
- u64 len = 0;
-
- if (bkey_cmp(where, k->p) >= 0)
- return false;
-
- EBUG_ON(bkey_cmp(where, bkey_start_pos(k)) < 0);
-
- len = where.offset - bkey_start_offset(k);
-
- k->p = where;
- k->size = len;
-
- if (!len)
- k->type = KEY_TYPE_deleted;
-
- return true;
-}
-
-static bool extent_i_save(struct btree *b, struct bkey_packed *dst,
- struct bkey_i *src)
-{
- struct bkey_format *f = &b->format;
- struct bkey_i *dst_unpacked;
- struct bkey_packed tmp;
-
- if ((dst_unpacked = packed_to_bkey(dst)))
- dst_unpacked->k = src->k;
- else if (bch2_bkey_pack_key(&tmp, &src->k, f))
- memcpy_u64s(dst, &tmp, f->key_u64s);
- else
- return false;
-
- memcpy_u64s(bkeyp_val(f, dst), &src->v, bkey_val_u64s(&src->k));
- return true;
-}
-
-static bool bch2_extent_merge_inline(struct bch_fs *,
- struct btree_iter *,
- struct bkey_packed *,
- struct bkey_packed *,
- bool);
-
-static void verify_extent_nonoverlapping(struct bch_fs *c,
- struct btree *b,
- struct btree_node_iter *_iter,
- struct bkey_i *insert)
-{
-#ifdef CONFIG_BCACHEFS_DEBUG
- struct btree_node_iter iter;
- struct bkey_packed *k;
- struct bkey uk;
-
- if (!expensive_debug_checks(c))
- return;
-
- iter = *_iter;
- k = bch2_btree_node_iter_prev_filter(&iter, b, KEY_TYPE_discard);
- BUG_ON(k &&
- (uk = bkey_unpack_key(b, k),
- bkey_cmp(uk.p, bkey_start_pos(&insert->k)) > 0));
-
- iter = *_iter;
- k = bch2_btree_node_iter_peek_filter(&iter, b, KEY_TYPE_discard);
-#if 0
- BUG_ON(k &&
- (uk = bkey_unpack_key(b, k),
- bkey_cmp(insert->k.p, bkey_start_pos(&uk))) > 0);
-#else
- if (k &&
- (uk = bkey_unpack_key(b, k),
- bkey_cmp(insert->k.p, bkey_start_pos(&uk))) > 0) {
- char buf1[100];
- char buf2[100];
-
- bch2_bkey_to_text(&PBUF(buf1), &insert->k);
- bch2_bkey_to_text(&PBUF(buf2), &uk);
-
- bch2_dump_btree_node(b);
- panic("insert > next :\n"
- "insert %s\n"
- "next %s\n",
- buf1, buf2);
- }
-#endif
-
-#endif
-}
-
-static void extent_bset_insert(struct bch_fs *c, struct btree_iter *iter,
- struct bkey_i *insert)
-{
- struct btree_iter_level *l = &iter->l[0];
- struct btree_node_iter node_iter;
- struct bkey_packed *k;
-
- BUG_ON(insert->k.u64s > bch_btree_keys_u64s_remaining(c, l->b));
-
- EBUG_ON(bkey_deleted(&insert->k) || !insert->k.size);
- verify_extent_nonoverlapping(c, l->b, &l->iter, insert);
-
- if (debug_check_bkeys(c))
- bch2_bkey_debugcheck(c, l->b, bkey_i_to_s_c(insert));
-
- node_iter = l->iter;
- k = bch2_btree_node_iter_prev_filter(&node_iter, l->b, KEY_TYPE_discard);
- if (k && !bkey_written(l->b, k) &&
- bch2_extent_merge_inline(c, iter, k, bkey_to_packed(insert), true))
- return;
-
- node_iter = l->iter;
- k = bch2_btree_node_iter_peek_filter(&node_iter, l->b, KEY_TYPE_discard);
- if (k && !bkey_written(l->b, k) &&
- bch2_extent_merge_inline(c, iter, bkey_to_packed(insert), k, false))
- return;
-
- /*
- * may have skipped past some deleted extents greater than the insert
- * key, before we got to a non deleted extent and knew we could bail out
- * rewind the iterator a bit if necessary:
- */
- node_iter = l->iter;
- while ((k = bch2_btree_node_iter_prev_all(&node_iter, l->b)) &&
- bkey_cmp_left_packed(l->b, k, &insert->k.p) > 0)
- l->iter = node_iter;
-
- k = bch2_btree_node_iter_bset_pos(&l->iter, l->b, bset_tree_last(l->b));
-
- bch2_bset_insert(l->b, &l->iter, k, insert, 0);
- bch2_btree_node_iter_fix(iter, l->b, &l->iter, k, 0, k->u64s);
-}
-
-static unsigned bch2_bkey_nr_alloc_ptrs(struct bkey_s_c k)
-{
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
- const union bch_extent_entry *entry;
- unsigned ret = 0;
-
- bkey_extent_entry_for_each(ptrs, entry) {
- switch (__extent_entry_type(entry)) {
- case BCH_EXTENT_ENTRY_ptr:
- case BCH_EXTENT_ENTRY_stripe_ptr:
- ret++;
- }
- }
-
- return ret;
-}
+ case KEY_TYPE_inline_data: {
+ struct bkey_s_inline_data d = bkey_s_to_inline_data(k);
-static int __bch2_extent_atomic_end(struct btree_trans *trans,
- struct bkey_s_c k,
- unsigned offset,
- struct bpos *end,
- unsigned *nr_iters,
- unsigned max_iters)
-{
- int ret = 0;
-
- switch (k.k->type) {
- case KEY_TYPE_extent:
- case KEY_TYPE_reflink_v:
- *nr_iters += bch2_bkey_nr_alloc_ptrs(k);
-
- if (*nr_iters >= max_iters) {
- *end = bpos_min(*end, k.k->p);
- return 0;
- }
-
- break;
- case KEY_TYPE_reflink_p: {
- struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k);
- u64 idx = le64_to_cpu(p.v->idx);
- unsigned sectors = end->offset - bkey_start_offset(p.k);
- struct btree_iter *iter;
- struct bkey_s_c r_k;
-
- for_each_btree_key(trans, iter,
- BTREE_ID_REFLINK, POS(0, idx + offset),
- BTREE_ITER_SLOTS, r_k, ret) {
- if (bkey_cmp(bkey_start_pos(r_k.k),
- POS(0, idx + sectors)) >= 0)
- break;
-
- *nr_iters += 1;
- if (*nr_iters >= max_iters) {
- struct bpos pos = bkey_start_pos(k.k);
- pos.offset += r_k.k->p.offset - idx;
+ sub = min_t(u64, sub << 9, bkey_val_bytes(d.k));
- *end = bpos_min(*end, pos);
- break;
- }
- }
+ memmove(d.v->data,
+ d.v->data + sub,
+ bkey_val_bytes(d.k) - sub);
- bch2_trans_iter_put(trans, iter);
+ new_val_u64s -= sub >> 3;
break;
}
}
- return ret;
-}
-
-int bch2_extent_atomic_end(struct btree_iter *iter,
- struct bkey_i *insert,
- struct bpos *end)
-{
- struct btree_trans *trans = iter->trans;
- struct btree *b = iter->l[0].b;
- struct btree_node_iter node_iter = iter->l[0].iter;
- struct bkey_packed *_k;
- unsigned nr_iters =
- bch2_bkey_nr_alloc_ptrs(bkey_i_to_s_c(insert));
- int ret = 0;
-
- BUG_ON(iter->uptodate > BTREE_ITER_NEED_PEEK);
- BUG_ON(bkey_cmp(bkey_start_pos(&insert->k), b->data->min_key) < 0);
-
- *end = bpos_min(insert->k.p, b->key.k.p);
-
- ret = __bch2_extent_atomic_end(trans, bkey_i_to_s_c(insert),
- 0, end, &nr_iters, 10);
- if (ret)
- return ret;
-
- while (nr_iters < 20 &&
- (_k = bch2_btree_node_iter_peek_filter(&node_iter, b,
- KEY_TYPE_discard))) {
- struct bkey unpacked;
- struct bkey_s_c k = bkey_disassemble(b, _k, &unpacked);
- unsigned offset = 0;
+ val_u64s_delta = bkey_val_u64s(k.k) - new_val_u64s;
+ BUG_ON(val_u64s_delta < 0);
- if (bkey_cmp(bkey_start_pos(k.k), *end) >= 0)
- break;
-
- if (bkey_cmp(bkey_start_pos(&insert->k),
- bkey_start_pos(k.k)) > 0)
- offset = bkey_start_offset(&insert->k) -
- bkey_start_offset(k.k);
-
- ret = __bch2_extent_atomic_end(trans, k, offset,
- end, &nr_iters, 20);
- if (ret)
- return ret;
-
- if (nr_iters >= 20)
- break;
-
- bch2_btree_node_iter_advance(&node_iter, b);
- }
-
- return 0;
+ set_bkey_val_u64s(k.k, new_val_u64s);
+ memset(bkey_val_end(k), 0, val_u64s_delta * sizeof(u64));
+ return -val_u64s_delta;
}
-int bch2_extent_trim_atomic(struct bkey_i *k, struct btree_iter *iter)
+int bch2_cut_back_s(struct bpos where, struct bkey_s k)
{
- struct bpos end;
- int ret;
-
- ret = bch2_extent_atomic_end(iter, k, &end);
- if (ret)
- return ret;
-
- bch2_cut_back(end, &k->k);
- return 0;
-}
-
-int bch2_extent_is_atomic(struct bkey_i *k, struct btree_iter *iter)
-{
- struct bpos end;
- int ret;
-
- ret = bch2_extent_atomic_end(iter, k, &end);
- if (ret)
- return ret;
-
- return !bkey_cmp(end, k->k.p);
-}
-
-enum btree_insert_ret
-bch2_extent_can_insert(struct btree_trans *trans,
- struct btree_insert_entry *insert,
- unsigned *u64s)
-{
- struct btree_iter_level *l = &insert->iter->l[0];
- struct btree_node_iter node_iter = l->iter;
- enum bch_extent_overlap overlap;
- struct bkey_packed *_k;
- struct bkey unpacked;
- struct bkey_s_c k;
- int sectors;
-
- /*
- * We avoid creating whiteouts whenever possible when deleting, but
- * those optimizations mean we may potentially insert two whiteouts
- * instead of one (when we overlap with the front of one extent and the
- * back of another):
- */
- if (bkey_whiteout(&insert->k->k))
- *u64s += BKEY_U64s;
-
- _k = bch2_btree_node_iter_peek_filter(&node_iter, l->b,
- KEY_TYPE_discard);
- if (!_k)
- return BTREE_INSERT_OK;
-
- k = bkey_disassemble(l->b, _k, &unpacked);
-
- overlap = bch2_extent_overlap(&insert->k->k, k.k);
-
- /* account for having to split existing extent: */
- if (overlap == BCH_EXTENT_OVERLAP_MIDDLE)
- *u64s += _k->u64s;
-
- if (overlap == BCH_EXTENT_OVERLAP_MIDDLE &&
- (sectors = bch2_extent_is_compressed(k))) {
- int flags = trans->flags & BTREE_INSERT_NOFAIL
- ? BCH_DISK_RESERVATION_NOFAIL : 0;
-
- switch (bch2_disk_reservation_add(trans->c,
- trans->disk_res,
- sectors, flags)) {
- case 0:
- break;
- case -ENOSPC:
- return BTREE_INSERT_ENOSPC;
- default:
- BUG();
- }
- }
-
- return BTREE_INSERT_OK;
-}
+ unsigned new_val_u64s = bkey_val_u64s(k.k);
+ int val_u64s_delta;
+ u64 len = 0;
-static void
-extent_squash(struct bch_fs *c, struct btree_iter *iter,
- struct bkey_i *insert,
- struct bkey_packed *_k, struct bkey_s k,
- enum bch_extent_overlap overlap)
-{
- struct btree_iter_level *l = &iter->l[0];
-
- switch (overlap) {
- case BCH_EXTENT_OVERLAP_FRONT:
- /* insert overlaps with start of k: */
- __bch2_cut_front(insert->k.p, k);
- EBUG_ON(bkey_deleted(k.k));
- extent_save(l->b, _k, k.k);
- bch2_btree_node_iter_fix(iter, l->b, &l->iter,
- _k, _k->u64s, _k->u64s);
- break;
+ if (bkey_cmp(where, k.k->p) >= 0)
+ return 0;
- case BCH_EXTENT_OVERLAP_BACK:
- /* insert overlaps with end of k: */
- bch2_cut_back(bkey_start_pos(&insert->k), k.k);
- EBUG_ON(bkey_deleted(k.k));
- extent_save(l->b, _k, k.k);
+ EBUG_ON(bkey_cmp(where, bkey_start_pos(k.k)) < 0);
- /*
- * As the auxiliary tree is indexed by the end of the
- * key and we've just changed the end, update the
- * auxiliary tree.
- */
- bch2_bset_fix_invalidated_key(l->b, _k);
- bch2_btree_node_iter_fix(iter, l->b, &l->iter,
- _k, _k->u64s, _k->u64s);
- break;
+ len = where.offset - bkey_start_offset(k.k);
- case BCH_EXTENT_OVERLAP_ALL: {
- /* The insert key completely covers k, invalidate k */
- if (!bkey_whiteout(k.k))
- btree_account_key_drop(l->b, _k);
+ k.k->p = where;
+ k.k->size = len;
- k.k->size = 0;
+ if (!len) {
k.k->type = KEY_TYPE_deleted;
-
- if (_k >= btree_bset_last(l->b)->start) {
- unsigned u64s = _k->u64s;
-
- bch2_bset_delete(l->b, _k, _k->u64s);
- bch2_btree_node_iter_fix(iter, l->b, &l->iter,
- _k, u64s, 0);
- } else {
- extent_save(l->b, _k, k.k);
- bch2_btree_node_iter_fix(iter, l->b, &l->iter,
- _k, _k->u64s, _k->u64s);
- }
-
- break;
- }
- case BCH_EXTENT_OVERLAP_MIDDLE: {
- BKEY_PADDED(k) split;
- /*
- * The insert key falls 'in the middle' of k
- * The insert key splits k in 3:
- * - start only in k, preserve
- * - middle common section, invalidate in k
- * - end only in k, preserve
- *
- * We update the old key to preserve the start,
- * insert will be the new common section,
- * we manually insert the end that we are preserving.
- *
- * modify k _before_ doing the insert (which will move
- * what k points to)
- */
- bkey_reassemble(&split.k, k.s_c);
- split.k.k.needs_whiteout |= bkey_written(l->b, _k);
-
- bch2_cut_back(bkey_start_pos(&insert->k), &split.k.k);
- BUG_ON(bkey_deleted(&split.k.k));
-
- __bch2_cut_front(insert->k.p, k);
- BUG_ON(bkey_deleted(k.k));
- extent_save(l->b, _k, k.k);
- bch2_btree_node_iter_fix(iter, l->b, &l->iter,
- _k, _k->u64s, _k->u64s);
-
- extent_bset_insert(c, iter, &split.k);
- break;
- }
- }
-}
-
-struct extent_insert_state {
- struct bkey_i whiteout;
- bool update_journal;
- bool update_btree;
- bool deleting;
-};
-
-static void __bch2_insert_fixup_extent(struct bch_fs *c,
- struct btree_iter *iter,
- struct bkey_i *insert,
- struct extent_insert_state *s)
-{
- struct btree_iter_level *l = &iter->l[0];
- struct bkey_packed *_k;
- struct bkey unpacked;
-
- while ((_k = bch2_btree_node_iter_peek_filter(&l->iter, l->b,
- KEY_TYPE_discard))) {
- struct bkey_s k = __bkey_disassemble(l->b, _k, &unpacked);
- struct bpos cur_end = bpos_min(insert->k.p, k.k->p);
- enum bch_extent_overlap overlap =
- bch2_extent_overlap(&insert->k, k.k);
-
- if (bkey_cmp(bkey_start_pos(k.k), insert->k.p) >= 0)
- break;
-
- if (!bkey_whiteout(k.k))
- s->update_journal = true;
-
- if (!s->update_journal) {
- bch2_cut_front(cur_end, insert);
- bch2_cut_front(cur_end, &s->whiteout);
- bch2_btree_iter_set_pos_same_leaf(iter, cur_end);
- goto next;
- }
-
- /*
- * When deleting, if possible just do it by switching the type
- * of the key we're deleting, instead of creating and inserting
- * a new whiteout:
- */
- if (s->deleting &&
- !s->update_btree &&
- !bkey_cmp(insert->k.p, k.k->p) &&
- !bkey_cmp(bkey_start_pos(&insert->k), bkey_start_pos(k.k))) {
- if (!bkey_whiteout(k.k)) {
- btree_account_key_drop(l->b, _k);
- _k->type = KEY_TYPE_discard;
- reserve_whiteout(l->b, _k);
- bch2_btree_node_iter_fix(iter, l->b, &l->iter,
- _k, _k->u64s, _k->u64s);
- }
- break;
- }
-
- if (k.k->needs_whiteout || bkey_written(l->b, _k)) {
- insert->k.needs_whiteout = true;
- s->update_btree = true;
- }
-
- if (s->update_btree &&
- overlap == BCH_EXTENT_OVERLAP_ALL &&
- bkey_whiteout(k.k) &&
- k.k->needs_whiteout) {
- unreserve_whiteout(l->b, _k);
- _k->needs_whiteout = false;
- }
-
- extent_squash(c, iter, insert, _k, k, overlap);
-
- if (!s->update_btree)
- bch2_cut_front(cur_end, insert);
-next:
- if (overlap == BCH_EXTENT_OVERLAP_FRONT ||
- overlap == BCH_EXTENT_OVERLAP_MIDDLE)
- break;
- }
-}
-
-/**
- * bch_extent_insert_fixup - insert a new extent and deal with overlaps
- *
- * this may result in not actually doing the insert, or inserting some subset
- * of the insert key. For cmpxchg operations this is where that logic lives.
- *
- * All subsets of @insert that need to be inserted are inserted using
- * bch2_btree_insert_and_journal(). If @b or @res fills up, this function
- * returns false, setting @iter->pos for the prefix of @insert that actually got
- * inserted.
- *
- * BSET INVARIANTS: this function is responsible for maintaining all the
- * invariants for bsets of extents in memory. things get really hairy with 0
- * size extents
- *
- * within one bset:
- *
- * bkey_start_pos(bkey_next(k)) >= k
- * or bkey_start_offset(bkey_next(k)) >= k->offset
- *
- * i.e. strict ordering, no overlapping extents.
- *
- * multiple bsets (i.e. full btree node):
- *
- * ∀ k, j
- * k.size != 0 ∧ j.size != 0 →
- * ¬ (k > bkey_start_pos(j) ∧ k < j)
- *
- * i.e. no two overlapping keys _of nonzero size_
- *
- * We can't realistically maintain this invariant for zero size keys because of
- * the key merging done in bch2_btree_insert_key() - for two mergeable keys k, j
- * there may be another 0 size key between them in another bset, and it will
- * thus overlap with the merged key.
- *
- * In addition, the end of iter->pos indicates how much has been processed.
- * If the end of iter->pos is not the same as the end of insert, then
- * key insertion needs to continue/be retried.
- */
-void bch2_insert_fixup_extent(struct btree_trans *trans,
- struct btree_insert_entry *insert)
-{
- struct bch_fs *c = trans->c;
- struct btree_iter *iter = insert->iter;
- struct extent_insert_state s = {
- .whiteout = *insert->k,
- .update_journal = !bkey_whiteout(&insert->k->k),
- .update_btree = !bkey_whiteout(&insert->k->k),
- .deleting = bkey_whiteout(&insert->k->k),
- };
- BKEY_PADDED(k) tmp;
-
- EBUG_ON(iter->level);
- EBUG_ON(!insert->k->k.size);
- EBUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&insert->k->k)));
-
- __bch2_insert_fixup_extent(c, iter, insert->k, &s);
-
- bch2_btree_iter_set_pos_same_leaf(iter, insert->k->k.p);
-
- if (s.update_btree) {
- bkey_copy(&tmp.k, insert->k);
-
- if (s.deleting)
- tmp.k.k.type = KEY_TYPE_discard;
-
- EBUG_ON(bkey_deleted(&tmp.k.k) || !tmp.k.k.size);
-
- extent_bset_insert(c, iter, &tmp.k);
- }
-
- if (s.update_journal) {
- bkey_copy(&tmp.k, !s.deleting ? insert->k : &s.whiteout);
-
- if (s.deleting)
- tmp.k.k.type = KEY_TYPE_discard;
-
- EBUG_ON(bkey_deleted(&tmp.k.k) || !tmp.k.k.size);
-
- bch2_btree_journal_key(trans, iter, &tmp.k);
- }
-
- bch2_cut_front(insert->k->k.p, insert->k);
-}
-
-const char *bch2_extent_invalid(const struct bch_fs *c, struct bkey_s_c k)
-{
- return bch2_bkey_ptrs_invalid(c, k);
-}
-
-void bch2_extent_debugcheck(struct bch_fs *c, struct bkey_s_c k)
-{
- struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
- const union bch_extent_entry *entry;
- struct extent_ptr_decoded p;
- char buf[160];
-
- /*
- * XXX: we should be doing most/all of these checks at startup time,
- * where we check bch2_bkey_invalid() in btree_node_read_done()
- *
- * But note that we can't check for stale pointers or incorrect gc marks
- * until after journal replay is done (it might be an extent that's
- * going to get overwritten during replay)
- */
-
- if (percpu_down_read_trylock(&c->mark_lock)) {
- bch2_fs_bug_on(!test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) &&
- !bch2_bkey_replicas_marked_locked(c, e.s_c, false), c,
- "extent key bad (replicas not marked in superblock):\n%s",
- (bch2_bkey_val_to_text(&PBUF(buf), c, e.s_c), buf));
- percpu_up_read(&c->mark_lock);
- }
- /*
- * If journal replay hasn't finished, we might be seeing keys
- * that will be overwritten by the time journal replay is done:
- */
- if (!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags))
- return;
-
- extent_for_each_ptr_decode(e, p, entry) {
- struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev);
- struct bucket_mark mark = ptr_bucket_mark(ca, &p.ptr);
- unsigned stale = gen_after(mark.gen, p.ptr.gen);
- unsigned disk_sectors = ptr_disk_sectors(p);
- unsigned mark_sectors = p.ptr.cached
- ? mark.cached_sectors
- : mark.dirty_sectors;
-
- bch2_fs_bug_on(stale && !p.ptr.cached, c,
- "stale dirty pointer (ptr gen %u bucket %u",
- p.ptr.gen, mark.gen);
-
- bch2_fs_bug_on(stale > 96, c, "key too stale: %i", stale);
-
- bch2_fs_bug_on(!stale &&
- (mark.data_type != BCH_DATA_USER ||
- mark_sectors < disk_sectors), c,
- "extent pointer not marked: %s:\n"
- "type %u sectors %u < %u",
- (bch2_bkey_val_to_text(&PBUF(buf), c, e.s_c), buf),
- mark.data_type,
- mark_sectors, disk_sectors);
- }
-}
-
-void bch2_extent_to_text(struct printbuf *out, struct bch_fs *c,
- struct bkey_s_c k)
-{
- bch2_bkey_ptrs_to_text(out, c, k);
-}
-
-static unsigned bch2_crc_field_size_max[] = {
- [BCH_EXTENT_ENTRY_crc32] = CRC32_SIZE_MAX,
- [BCH_EXTENT_ENTRY_crc64] = CRC64_SIZE_MAX,
- [BCH_EXTENT_ENTRY_crc128] = CRC128_SIZE_MAX,
-};
-
-static void bch2_extent_crc_pack(union bch_extent_crc *dst,
- struct bch_extent_crc_unpacked src)
-{
-#define set_common_fields(_dst, _src) \
- _dst.csum_type = _src.csum_type, \
- _dst.compression_type = _src.compression_type, \
- _dst._compressed_size = _src.compressed_size - 1, \
- _dst._uncompressed_size = _src.uncompressed_size - 1, \
- _dst.offset = _src.offset
-
- switch (extent_entry_type(to_entry(dst))) {
- case BCH_EXTENT_ENTRY_crc32:
- set_common_fields(dst->crc32, src);
- dst->crc32.csum = *((__le32 *) &src.csum.lo);
- break;
- case BCH_EXTENT_ENTRY_crc64:
- set_common_fields(dst->crc64, src);
- dst->crc64.nonce = src.nonce;
- dst->crc64.csum_lo = src.csum.lo;
- dst->crc64.csum_hi = *((__le16 *) &src.csum.hi);
- break;
- case BCH_EXTENT_ENTRY_crc128:
- set_common_fields(dst->crc128, src);
- dst->crc128.nonce = src.nonce;
- dst->crc128.csum = src.csum;
- break;
- default:
- BUG();
- }
-#undef set_common_fields
-}
-
-static void bch2_extent_crc_append(struct bkey_i *k,
- struct bch_extent_crc_unpacked new)
-{
- struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(k));
- union bch_extent_crc *crc = (void *) ptrs.end;
-
- if (bch_crc_bytes[new.csum_type] <= 4 &&
- new.uncompressed_size - 1 <= CRC32_SIZE_MAX &&
- new.nonce <= CRC32_NONCE_MAX)
- crc->type = 1 << BCH_EXTENT_ENTRY_crc32;
- else if (bch_crc_bytes[new.csum_type] <= 10 &&
- new.uncompressed_size - 1 <= CRC64_SIZE_MAX &&
- new.nonce <= CRC64_NONCE_MAX)
- crc->type = 1 << BCH_EXTENT_ENTRY_crc64;
- else if (bch_crc_bytes[new.csum_type] <= 16 &&
- new.uncompressed_size - 1 <= CRC128_SIZE_MAX &&
- new.nonce <= CRC128_NONCE_MAX)
- crc->type = 1 << BCH_EXTENT_ENTRY_crc128;
- else
- BUG();
-
- bch2_extent_crc_pack(crc, new);
-
- k->k.u64s += extent_entry_u64s(ptrs.end);
-
- EBUG_ON(bkey_val_u64s(&k->k) > BKEY_EXTENT_VAL_U64s_MAX);
-}
-
-static inline void __extent_entry_insert(struct bkey_i *k,
- union bch_extent_entry *dst,
- union bch_extent_entry *new)
-{
- union bch_extent_entry *end = bkey_val_end(bkey_i_to_s(k));
-
- memmove_u64s_up((u64 *) dst + extent_entry_u64s(new),
- dst, (u64 *) end - (u64 *) dst);
- k->k.u64s += extent_entry_u64s(new);
- memcpy(dst, new, extent_entry_bytes(new));
-}
-
-void bch2_extent_ptr_decoded_append(struct bkey_i *k,
- struct extent_ptr_decoded *p)
-{
- struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(k));
- struct bch_extent_crc_unpacked crc =
- bch2_extent_crc_unpack(&k->k, NULL);
- union bch_extent_entry *pos;
- unsigned i;
-
- if (!bch2_crc_unpacked_cmp(crc, p->crc)) {
- pos = ptrs.start;
- goto found;
- }
-
- bkey_for_each_crc(&k->k, ptrs, crc, pos)
- if (!bch2_crc_unpacked_cmp(crc, p->crc)) {
- pos = extent_entry_next(pos);
- goto found;
- }
-
- bch2_extent_crc_append(k, p->crc);
- pos = bkey_val_end(bkey_i_to_s(k));
-found:
- p->ptr.type = 1 << BCH_EXTENT_ENTRY_ptr;
- __extent_entry_insert(k, pos, to_entry(&p->ptr));
-
- for (i = 0; i < p->ec_nr; i++) {
- p->ec[i].type = 1 << BCH_EXTENT_ENTRY_stripe_ptr;
- __extent_entry_insert(k, pos, to_entry(&p->ec[i]));
- }
-}
-
-/*
- * bch_extent_normalize - clean up an extent, dropping stale pointers etc.
- *
- * Returns true if @k should be dropped entirely
- *
- * For existing keys, only called when btree nodes are being rewritten, not when
- * they're merely being compacted/resorted in memory.
- */
-bool bch2_extent_normalize(struct bch_fs *c, struct bkey_s k)
-{
- struct bch_extent_ptr *ptr;
-
- bch2_bkey_drop_ptrs(k, ptr,
- ptr->cached &&
- ptr_stale(bch_dev_bkey_exists(c, ptr->dev), ptr));
-
- /* will only happen if all pointers were cached: */
- if (!bkey_val_u64s(k.k))
- k.k->type = KEY_TYPE_discard;
-
- return bkey_whiteout(k.k);
-}
-
-void bch2_bkey_mark_replicas_cached(struct bch_fs *c, struct bkey_s k,
- unsigned target,
- unsigned nr_desired_replicas)
-{
- struct bkey_ptrs ptrs = bch2_bkey_ptrs(k);
- union bch_extent_entry *entry;
- struct extent_ptr_decoded p;
- int extra = bch2_bkey_durability(c, k.s_c) - nr_desired_replicas;
-
- if (target && extra > 0)
- bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
- int n = bch2_extent_ptr_durability(c, p);
-
- if (n && n <= extra &&
- !bch2_dev_in_target(c, p.ptr.dev, target)) {
- entry->ptr.cached = true;
- extra -= n;
- }
- }
-
- if (extra > 0)
- bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
- int n = bch2_extent_ptr_durability(c, p);
-
- if (n && n <= extra) {
- entry->ptr.cached = true;
- extra -= n;
- }
- }
-}
-
-enum merge_result bch2_extent_merge(struct bch_fs *c,
- struct bkey_s _l, struct bkey_s _r)
-{
- struct bkey_s_extent l = bkey_s_to_extent(_l);
- struct bkey_s_extent r = bkey_s_to_extent(_r);
- union bch_extent_entry *en_l = l.v->start;
- union bch_extent_entry *en_r = r.v->start;
- struct bch_extent_crc_unpacked crc_l, crc_r;
-
- if (bkey_val_u64s(l.k) != bkey_val_u64s(r.k))
- return BCH_MERGE_NOMERGE;
-
- crc_l = bch2_extent_crc_unpack(l.k, NULL);
-
- extent_for_each_entry(l, en_l) {
- en_r = vstruct_idx(r.v, (u64 *) en_l - l.v->_data);
-
- if (extent_entry_type(en_l) != extent_entry_type(en_r))
- return BCH_MERGE_NOMERGE;
-
- switch (extent_entry_type(en_l)) {
- case BCH_EXTENT_ENTRY_ptr: {
- const struct bch_extent_ptr *lp = &en_l->ptr;
- const struct bch_extent_ptr *rp = &en_r->ptr;
- struct bch_dev *ca;
-
- if (lp->offset + crc_l.compressed_size != rp->offset ||
- lp->dev != rp->dev ||
- lp->gen != rp->gen)
- return BCH_MERGE_NOMERGE;
-
- /* We don't allow extents to straddle buckets: */
- ca = bch_dev_bkey_exists(c, lp->dev);
-
- if (PTR_BUCKET_NR(ca, lp) != PTR_BUCKET_NR(ca, rp))
- return BCH_MERGE_NOMERGE;
-
- break;
- }
- case BCH_EXTENT_ENTRY_stripe_ptr:
- if (en_l->stripe_ptr.block != en_r->stripe_ptr.block ||
- en_l->stripe_ptr.idx != en_r->stripe_ptr.idx)
- return BCH_MERGE_NOMERGE;
- break;
- case BCH_EXTENT_ENTRY_crc32:
- case BCH_EXTENT_ENTRY_crc64:
- case BCH_EXTENT_ENTRY_crc128:
- crc_l = bch2_extent_crc_unpack(l.k, entry_to_crc(en_l));
- crc_r = bch2_extent_crc_unpack(r.k, entry_to_crc(en_r));
-
- if (crc_l.csum_type != crc_r.csum_type ||
- crc_l.compression_type != crc_r.compression_type ||
- crc_l.nonce != crc_r.nonce)
- return BCH_MERGE_NOMERGE;
-
- if (crc_l.offset + crc_l.live_size != crc_l.compressed_size ||
- crc_r.offset)
- return BCH_MERGE_NOMERGE;
-
- if (!bch2_checksum_mergeable(crc_l.csum_type))
- return BCH_MERGE_NOMERGE;
-
- if (crc_l.compression_type)
- return BCH_MERGE_NOMERGE;
-
- if (crc_l.csum_type &&
- crc_l.uncompressed_size +
- crc_r.uncompressed_size > c->sb.encoded_extent_max)
- return BCH_MERGE_NOMERGE;
-
- if (crc_l.uncompressed_size + crc_r.uncompressed_size - 1 >
- bch2_crc_field_size_max[extent_entry_type(en_l)])
- return BCH_MERGE_NOMERGE;
-
- break;
- default:
- return BCH_MERGE_NOMERGE;
- }
+ new_val_u64s = 0;
}
- extent_for_each_entry(l, en_l) {
- struct bch_extent_crc_unpacked crc_l, crc_r;
-
- en_r = vstruct_idx(r.v, (u64 *) en_l - l.v->_data);
-
- if (!extent_entry_is_crc(en_l))
- continue;
-
- crc_l = bch2_extent_crc_unpack(l.k, entry_to_crc(en_l));
- crc_r = bch2_extent_crc_unpack(r.k, entry_to_crc(en_r));
-
- crc_l.csum = bch2_checksum_merge(crc_l.csum_type,
- crc_l.csum,
- crc_r.csum,
- crc_r.uncompressed_size << 9);
-
- crc_l.uncompressed_size += crc_r.uncompressed_size;
- crc_l.compressed_size += crc_r.compressed_size;
-
- bch2_extent_crc_pack(entry_to_crc(en_l), crc_l);
- }
-
- bch2_key_resize(l.k, l.k->size + r.k->size);
-
- return BCH_MERGE_MERGE;
-}
-
-/*
- * When merging an extent that we're inserting into a btree node, the new merged
- * extent could overlap with an existing 0 size extent - if we don't fix that,
- * it'll break the btree node iterator so this code finds those 0 size extents
- * and shifts them out of the way.
- *
- * Also unpacks and repacks.
- */
-static bool bch2_extent_merge_inline(struct bch_fs *c,
- struct btree_iter *iter,
- struct bkey_packed *l,
- struct bkey_packed *r,
- bool back_merge)
-{
- struct btree *b = iter->l[0].b;
- struct btree_node_iter *node_iter = &iter->l[0].iter;
- BKEY_PADDED(k) li, ri;
- struct bkey_packed *m = back_merge ? l : r;
- struct bkey_i *mi = back_merge ? &li.k : &ri.k;
- struct bset_tree *t = bch2_bkey_to_bset(b, m);
- enum merge_result ret;
-
- EBUG_ON(bkey_written(b, m));
-
- if (bkey_val_u64s(l) > BKEY_EXTENT_VAL_U64s_MAX ||
- bkey_val_u64s(r) > BKEY_EXTENT_VAL_U64s_MAX)
- return BCH_MERGE_NOMERGE;
-
- /*
- * We need to save copies of both l and r, because we might get a
- * partial merge (which modifies both) and then fails to repack
- */
- bch2_bkey_unpack(b, &li.k, l);
- bch2_bkey_unpack(b, &ri.k, r);
-
- ret = bch2_bkey_merge(c,
- bkey_i_to_s(&li.k),
- bkey_i_to_s(&ri.k));
- if (ret == BCH_MERGE_NOMERGE)
- return false;
-
- if (debug_check_bkeys(c))
- bch2_bkey_debugcheck(c, b, bkey_i_to_s_c(&li.k));
- if (debug_check_bkeys(c) &&
- ret == BCH_MERGE_PARTIAL)
- bch2_bkey_debugcheck(c, b, bkey_i_to_s_c(&ri.k));
-
- /*
- * check if we overlap with deleted extents - would break the sort
- * order:
- */
- if (back_merge) {
- struct bkey_packed *n = bkey_next(m);
-
- if (n != btree_bkey_last(b, t) &&
- bkey_cmp_left_packed(b, n, &li.k.k.p) <= 0 &&
- bkey_deleted(n))
- return false;
- } else if (ret == BCH_MERGE_MERGE) {
- struct bkey_packed *prev = bch2_bkey_prev_all(b, t, m);
-
- if (prev &&
- bkey_cmp_left_packed_byval(b, prev,
- bkey_start_pos(&li.k.k)) > 0)
- return false;
- }
-
- if (ret == BCH_MERGE_PARTIAL) {
- if (!extent_i_save(b, m, mi))
- return false;
-
- if (!back_merge)
- bkey_copy(packed_to_bkey(l), &li.k);
- else
- bkey_copy(packed_to_bkey(r), &ri.k);
- } else {
- if (!extent_i_save(b, m, &li.k))
- return false;
- }
-
- bch2_bset_fix_invalidated_key(b, m);
- bch2_btree_node_iter_fix(iter, b, node_iter,
- m, m->u64s, m->u64s);
-
- return ret == BCH_MERGE_MERGE;
-}
-
-bool bch2_check_range_allocated(struct bch_fs *c, struct bpos pos, u64 size,
- unsigned nr_replicas)
-{
- struct btree_trans trans;
- struct btree_iter *iter;
- struct bpos end = pos;
- struct bkey_s_c k;
- bool ret = true;
- int err;
-
- end.offset += size;
-
- bch2_trans_init(&trans, c, 0, 0);
-
- for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, pos,
- BTREE_ITER_SLOTS, k, err) {
- if (bkey_cmp(bkey_start_pos(k.k), end) >= 0)
- break;
-
- if (nr_replicas > bch2_bkey_nr_ptrs_allocated(k)) {
- ret = false;
- break;
- }
- }
- bch2_trans_exit(&trans);
-
- return ret;
-}
-
-unsigned bch2_bkey_nr_ptrs_allocated(struct bkey_s_c k)
-{
- unsigned ret = 0;
-
switch (k.k->type) {
- case KEY_TYPE_extent: {
- struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
- const union bch_extent_entry *entry;
- struct extent_ptr_decoded p;
-
- extent_for_each_ptr_decode(e, p, entry)
- ret += !p.ptr.cached &&
- p.crc.compression_type == BCH_COMPRESSION_NONE;
+ case KEY_TYPE_inline_data:
+ new_val_u64s = min(new_val_u64s, k.k->size << 6);
break;
}
- case KEY_TYPE_reservation:
- ret = bkey_s_c_to_reservation(k).v->nr_replicas;
- break;
- }
-
- return ret;
-}
-
-/* KEY_TYPE_reservation: */
-
-const char *bch2_reservation_invalid(const struct bch_fs *c, struct bkey_s_c k)
-{
- struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k);
-
- if (bkey_val_bytes(k.k) != sizeof(struct bch_reservation))
- return "incorrect value size";
-
- if (!r.v->nr_replicas || r.v->nr_replicas > BCH_REPLICAS_MAX)
- return "invalid nr_replicas";
-
- return NULL;
-}
-
-void bch2_reservation_to_text(struct printbuf *out, struct bch_fs *c,
- struct bkey_s_c k)
-{
- struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k);
-
- pr_buf(out, "generation %u replicas %u",
- le32_to_cpu(r.v->generation),
- r.v->nr_replicas);
-}
-enum merge_result bch2_reservation_merge(struct bch_fs *c,
- struct bkey_s _l, struct bkey_s _r)
-{
- struct bkey_s_reservation l = bkey_s_to_reservation(_l);
- struct bkey_s_reservation r = bkey_s_to_reservation(_r);
+ val_u64s_delta = bkey_val_u64s(k.k) - new_val_u64s;
+ BUG_ON(val_u64s_delta < 0);
- if (l.v->generation != r.v->generation ||
- l.v->nr_replicas != r.v->nr_replicas)
- return BCH_MERGE_NOMERGE;
-
- if ((u64) l.k->size + r.k->size > KEY_SIZE_MAX) {
- bch2_key_resize(l.k, KEY_SIZE_MAX);
- __bch2_cut_front(l.k->p, r.s);
- return BCH_MERGE_PARTIAL;
- }
-
- bch2_key_resize(l.k, l.k->size + r.k->size);
-
- return BCH_MERGE_MERGE;
+ set_bkey_val_u64s(k.k, new_val_u64s);
+ memset(bkey_val_end(k), 0, val_u64s_delta * sizeof(u64));
+ return -val_u64s_delta;
}
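A small sketch of the new cut helpers from this file: bch2_cut_front() and bch2_cut_back() trim a key to a range, and the _s variants additionally return the signed change in value u64s (zero or negative). The trim_to_range() helper below is an assumption for illustration and presumes @k spans the given range:

/* Illustrative only: trim an extent key to the range [start, end). */
static void trim_to_range(struct bkey_i *k, struct bpos start, struct bpos end)
{
	/* drop the part of the extent before @start */
	bch2_cut_front(start, k);

	/* drop the part of the extent after @end */
	bch2_cut_back(end, k);

	/* if everything was cut away, the key is now KEY_TYPE_deleted with size 0 */
}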
diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h
index 613d76af69d9..1140d01a42ab 100644
--- a/fs/bcachefs/extents.h
+++ b/fs/bcachefs/extents.h
@@ -40,6 +40,9 @@ struct btree_insert_entry;
(union bch_extent_entry *) (_entry)); \
})
+#define extent_entry_next(_entry) \
+ ((typeof(_entry)) ((void *) (_entry) + extent_entry_bytes(_entry)))
+
static inline unsigned
__extent_entry_type(const union bch_extent_entry *e)
{
@@ -185,10 +188,52 @@ struct bkey_ptrs {
union bch_extent_entry *end;
};
-/* iterate over bkey ptrs */
+static inline struct bkey_ptrs_c bch2_bkey_ptrs_c(struct bkey_s_c k)
+{
+ switch (k.k->type) {
+ case KEY_TYPE_btree_ptr: {
+ struct bkey_s_c_btree_ptr e = bkey_s_c_to_btree_ptr(k);
+ return (struct bkey_ptrs_c) {
+ to_entry(&e.v->start[0]),
+ to_entry(extent_entry_last(e))
+ };
+ }
+ case KEY_TYPE_extent: {
+ struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
+ return (struct bkey_ptrs_c) {
+ e.v->start,
+ extent_entry_last(e)
+ };
+ }
+ case KEY_TYPE_stripe: {
+ struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k);
+ return (struct bkey_ptrs_c) {
+ to_entry(&s.v->ptrs[0]),
+ to_entry(&s.v->ptrs[s.v->nr_blocks]),
+ };
+ }
+ case KEY_TYPE_reflink_v: {
+ struct bkey_s_c_reflink_v r = bkey_s_c_to_reflink_v(k);
-#define extent_entry_next(_entry) \
- ((typeof(_entry)) ((void *) (_entry) + extent_entry_bytes(_entry)))
+ return (struct bkey_ptrs_c) {
+ r.v->start,
+ bkey_val_end(r),
+ };
+ }
+ default:
+ return (struct bkey_ptrs_c) { NULL, NULL };
+ }
+}
+
+static inline struct bkey_ptrs bch2_bkey_ptrs(struct bkey_s k)
+{
+ struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k.s_c);
+
+ return (struct bkey_ptrs) {
+ (void *) p.start,
+ (void *) p.end
+ };
+}
#define __bkey_extent_entry_for_each_from(_start, _end, _entry) \
for ((_entry) = (_start); \
@@ -228,7 +273,7 @@ struct bkey_ptrs {
__label__ out; \
\
(_ptr).idx = 0; \
- (_ptr).ec_nr = 0; \
+ (_ptr).has_ec = false; \
\
__bkey_extent_entry_for_each_from(_entry, _end, _entry) \
switch (extent_entry_type(_entry)) { \
@@ -242,7 +287,8 @@ struct bkey_ptrs {
entry_to_crc(_entry)); \
break; \
case BCH_EXTENT_ENTRY_stripe_ptr: \
- (_ptr).ec[(_ptr).ec_nr++] = _entry->stripe_ptr; \
+ (_ptr).ec = _entry->stripe_ptr; \
+ (_ptr).has_ec = true; \
break; \
} \
out: \
@@ -280,96 +326,26 @@ out: \
#define bkey_for_each_crc(_k, _p, _crc, _iter) \
__bkey_for_each_crc(_k, (_p).start, (_p).end, _crc, _iter)
-/* utility code common to all keys with pointers: */
+/* Iterate over pointers in KEY_TYPE_extent: */
-static inline struct bkey_ptrs_c bch2_bkey_ptrs_c(struct bkey_s_c k)
-{
- switch (k.k->type) {
- case KEY_TYPE_btree_ptr: {
- struct bkey_s_c_btree_ptr e = bkey_s_c_to_btree_ptr(k);
- return (struct bkey_ptrs_c) {
- to_entry(&e.v->start[0]),
- to_entry(extent_entry_last(e))
- };
- }
- case KEY_TYPE_extent: {
- struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
- return (struct bkey_ptrs_c) {
- e.v->start,
- extent_entry_last(e)
- };
- }
- case KEY_TYPE_stripe: {
- struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k);
- return (struct bkey_ptrs_c) {
- to_entry(&s.v->ptrs[0]),
- to_entry(&s.v->ptrs[s.v->nr_blocks]),
- };
- }
- case KEY_TYPE_reflink_v: {
- struct bkey_s_c_reflink_v r = bkey_s_c_to_reflink_v(k);
-
- return (struct bkey_ptrs_c) {
- r.v->start,
- bkey_val_end(r),
- };
- }
- default:
- return (struct bkey_ptrs_c) { NULL, NULL };
- }
-}
-
-static inline struct bkey_ptrs bch2_bkey_ptrs(struct bkey_s k)
-{
- struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k.s_c);
-
- return (struct bkey_ptrs) {
- (void *) p.start,
- (void *) p.end
- };
-}
-
-static inline struct bch_devs_list bch2_bkey_devs(struct bkey_s_c k)
-{
- struct bch_devs_list ret = (struct bch_devs_list) { 0 };
- struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k);
- const struct bch_extent_ptr *ptr;
-
- bkey_for_each_ptr(p, ptr)
- ret.devs[ret.nr++] = ptr->dev;
-
- return ret;
-}
-
-static inline struct bch_devs_list bch2_bkey_dirty_devs(struct bkey_s_c k)
-{
- struct bch_devs_list ret = (struct bch_devs_list) { 0 };
- struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k);
- const struct bch_extent_ptr *ptr;
-
- bkey_for_each_ptr(p, ptr)
- if (!ptr->cached)
- ret.devs[ret.nr++] = ptr->dev;
+#define extent_for_each_entry_from(_e, _entry, _start) \
+ __bkey_extent_entry_for_each_from(_start, \
+ extent_entry_last(_e),_entry)
- return ret;
-}
+#define extent_for_each_entry(_e, _entry) \
+ extent_for_each_entry_from(_e, _entry, (_e).v->start)
-static inline struct bch_devs_list bch2_bkey_cached_devs(struct bkey_s_c k)
-{
- struct bch_devs_list ret = (struct bch_devs_list) { 0 };
- struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k);
- const struct bch_extent_ptr *ptr;
+#define extent_ptr_next(_e, _ptr) \
+ __bkey_ptr_next(_ptr, extent_entry_last(_e))
- bkey_for_each_ptr(p, ptr)
- if (ptr->cached)
- ret.devs[ret.nr++] = ptr->dev;
+#define extent_for_each_ptr(_e, _ptr) \
+ __bkey_for_each_ptr(&(_e).v->start->ptr, extent_entry_last(_e), _ptr)
- return ret;
-}
+#define extent_for_each_ptr_decode(_e, _ptr, _entry) \
+ __bkey_for_each_ptr_decode((_e).k, (_e).v->start, \
+ extent_entry_last(_e), _ptr, _entry)
-unsigned bch2_bkey_nr_ptrs(struct bkey_s_c);
-unsigned bch2_bkey_nr_dirty_ptrs(struct bkey_s_c);
-unsigned bch2_bkey_durability(struct bch_fs *, struct bkey_s_c);
+/* utility code common to all keys with pointers: */
void bch2_mark_io_failure(struct bch_io_failures *,
struct extent_ptr_decoded *);
@@ -377,22 +353,12 @@ int bch2_bkey_pick_read_device(struct bch_fs *, struct bkey_s_c,
struct bch_io_failures *,
struct extent_ptr_decoded *);
-void bch2_bkey_append_ptr(struct bkey_i *, struct bch_extent_ptr);
-void bch2_bkey_drop_device(struct bkey_s, unsigned);
-const struct bch_extent_ptr *bch2_bkey_has_device(struct bkey_s_c, unsigned);
-bool bch2_bkey_has_target(struct bch_fs *, struct bkey_s_c, unsigned);
-
-void bch2_bkey_ptrs_to_text(struct printbuf *, struct bch_fs *,
- struct bkey_s_c);
-const char *bch2_bkey_ptrs_invalid(const struct bch_fs *, struct bkey_s_c);
-
-/* bch_btree_ptr: */
+/* KEY_TYPE_btree_ptr: */
const char *bch2_btree_ptr_invalid(const struct bch_fs *, struct bkey_s_c);
void bch2_btree_ptr_debugcheck(struct bch_fs *, struct bkey_s_c);
void bch2_btree_ptr_to_text(struct printbuf *, struct bch_fs *,
struct bkey_s_c);
-void bch2_ptr_swab(const struct bkey_format *, struct bkey_packed *);
#define bch2_bkey_ops_btree_ptr (struct bkey_ops) { \
.key_invalid = bch2_btree_ptr_invalid, \
@@ -401,12 +367,11 @@ void bch2_ptr_swab(const struct bkey_format *, struct bkey_packed *);
.swab = bch2_ptr_swab, \
}
-/* bch_extent: */
+/* KEY_TYPE_extent: */
const char *bch2_extent_invalid(const struct bch_fs *, struct bkey_s_c);
void bch2_extent_debugcheck(struct bch_fs *, struct bkey_s_c);
void bch2_extent_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
-bool bch2_extent_normalize(struct bch_fs *, struct bkey_s);
enum merge_result bch2_extent_merge(struct bch_fs *,
struct bkey_s, struct bkey_s);
@@ -419,7 +384,7 @@ enum merge_result bch2_extent_merge(struct bch_fs *,
.key_merge = bch2_extent_merge, \
}
-/* bch_reservation: */
+/* KEY_TYPE_reservation: */
const char *bch2_reservation_invalid(const struct bch_fs *, struct bkey_s_c);
void bch2_reservation_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
@@ -432,27 +397,15 @@ enum merge_result bch2_reservation_merge(struct bch_fs *,
.key_merge = bch2_reservation_merge, \
}
-int bch2_extent_atomic_end(struct btree_iter *, struct bkey_i *,
- struct bpos *);
-int bch2_extent_trim_atomic(struct bkey_i *, struct btree_iter *);
-int bch2_extent_is_atomic(struct bkey_i *, struct btree_iter *);
-
-enum btree_insert_ret
-bch2_extent_can_insert(struct btree_trans *, struct btree_insert_entry *,
- unsigned *);
-void bch2_insert_fixup_extent(struct btree_trans *,
- struct btree_insert_entry *);
-
-void bch2_bkey_mark_replicas_cached(struct bch_fs *, struct bkey_s,
- unsigned, unsigned);
-
-const struct bch_extent_ptr *
-bch2_extent_has_device(struct bkey_s_c_extent, unsigned);
+/* Extent checksum entries: */
-unsigned bch2_extent_is_compressed(struct bkey_s_c);
+bool bch2_can_narrow_extent_crcs(struct bkey_s_c,
+ struct bch_extent_crc_unpacked);
+bool bch2_bkey_narrow_crcs(struct bkey_i *, struct bch_extent_crc_unpacked);
+void bch2_extent_crc_append(struct bkey_i *,
+ struct bch_extent_crc_unpacked);
-bool bch2_bkey_matches_ptr(struct bch_fs *, struct bkey_s_c,
- struct bch_extent_ptr, u64);
+/* Generic code for keys with pointers: */
static inline bool bkey_extent_is_direct_data(const struct bkey *k)
{
@@ -469,6 +422,7 @@ static inline bool bkey_extent_is_direct_data(const struct bkey *k)
static inline bool bkey_extent_is_data(const struct bkey *k)
{
return bkey_extent_is_direct_data(k) ||
+ k->type == KEY_TYPE_inline_data ||
k->type == KEY_TYPE_reflink_p;
}
@@ -482,38 +436,64 @@ static inline bool bkey_extent_is_allocation(const struct bkey *k)
case KEY_TYPE_reservation:
case KEY_TYPE_reflink_p:
case KEY_TYPE_reflink_v:
+ case KEY_TYPE_inline_data:
return true;
default:
return false;
}
}
-/* Extent entry iteration: */
+static inline struct bch_devs_list bch2_bkey_devs(struct bkey_s_c k)
+{
+ struct bch_devs_list ret = (struct bch_devs_list) { 0 };
+ struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k);
+ const struct bch_extent_ptr *ptr;
-#define extent_for_each_entry_from(_e, _entry, _start) \
- __bkey_extent_entry_for_each_from(_start, \
- extent_entry_last(_e),_entry)
+ bkey_for_each_ptr(p, ptr)
+ ret.devs[ret.nr++] = ptr->dev;
-#define extent_for_each_entry(_e, _entry) \
- extent_for_each_entry_from(_e, _entry, (_e).v->start)
+ return ret;
+}
-#define extent_ptr_next(_e, _ptr) \
- __bkey_ptr_next(_ptr, extent_entry_last(_e))
+static inline struct bch_devs_list bch2_bkey_dirty_devs(struct bkey_s_c k)
+{
+ struct bch_devs_list ret = (struct bch_devs_list) { 0 };
+ struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k);
+ const struct bch_extent_ptr *ptr;
-#define extent_for_each_ptr(_e, _ptr) \
- __bkey_for_each_ptr(&(_e).v->start->ptr, extent_entry_last(_e), _ptr)
+ bkey_for_each_ptr(p, ptr)
+ if (!ptr->cached)
+ ret.devs[ret.nr++] = ptr->dev;
-#define extent_for_each_ptr_decode(_e, _ptr, _entry) \
- __bkey_for_each_ptr_decode((_e).k, (_e).v->start, \
- extent_entry_last(_e), _ptr, _entry)
+ return ret;
+}
-void bch2_extent_ptr_decoded_append(struct bkey_i *,
- struct extent_ptr_decoded *);
+static inline struct bch_devs_list bch2_bkey_cached_devs(struct bkey_s_c k)
+{
+ struct bch_devs_list ret = (struct bch_devs_list) { 0 };
+ struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k);
+ const struct bch_extent_ptr *ptr;
-bool bch2_can_narrow_extent_crcs(struct bkey_s_c,
- struct bch_extent_crc_unpacked);
-bool bch2_bkey_narrow_crcs(struct bkey_i *, struct bch_extent_crc_unpacked);
+ bkey_for_each_ptr(p, ptr)
+ if (ptr->cached)
+ ret.devs[ret.nr++] = ptr->dev;
+ return ret;
+}
+
+unsigned bch2_bkey_nr_ptrs(struct bkey_s_c);
+unsigned bch2_bkey_nr_ptrs_allocated(struct bkey_s_c);
+unsigned bch2_bkey_nr_ptrs_fully_allocated(struct bkey_s_c);
+unsigned bch2_bkey_sectors_compressed(struct bkey_s_c);
+bool bch2_check_range_allocated(struct bch_fs *, struct bpos, u64, unsigned);
+unsigned bch2_bkey_durability(struct bch_fs *, struct bkey_s_c);
+
+void bch2_bkey_mark_replicas_cached(struct bch_fs *, struct bkey_s,
+ unsigned, unsigned);
+
+void bch2_bkey_append_ptr(struct bkey_i *, struct bch_extent_ptr);
+void bch2_extent_ptr_decoded_append(struct bkey_i *,
+ struct extent_ptr_decoded *);
union bch_extent_entry *bch2_bkey_drop_ptr(struct bkey_s,
struct bch_extent_ptr *);
@@ -534,14 +514,34 @@ do { \
} \
} while (0)
-void __bch2_cut_front(struct bpos, struct bkey_s);
+void bch2_bkey_drop_device(struct bkey_s, unsigned);
+const struct bch_extent_ptr *bch2_bkey_has_device(struct bkey_s_c, unsigned);
+bool bch2_bkey_has_target(struct bch_fs *, struct bkey_s_c, unsigned);
+
+bool bch2_bkey_matches_ptr(struct bch_fs *, struct bkey_s_c,
+ struct bch_extent_ptr, u64);
+
+bool bch2_extent_normalize(struct bch_fs *, struct bkey_s);
+void bch2_bkey_ptrs_to_text(struct printbuf *, struct bch_fs *,
+ struct bkey_s_c);
+const char *bch2_bkey_ptrs_invalid(const struct bch_fs *, struct bkey_s_c);
+
+void bch2_ptr_swab(const struct bkey_format *, struct bkey_packed *);
+
+/* Generic extent code: */
+
+int bch2_cut_front_s(struct bpos, struct bkey_s);
+int bch2_cut_back_s(struct bpos, struct bkey_s);
static inline void bch2_cut_front(struct bpos where, struct bkey_i *k)
{
- __bch2_cut_front(where, bkey_i_to_s(k));
+ bch2_cut_front_s(where, bkey_i_to_s(k));
}
-bool bch2_cut_back(struct bpos, struct bkey *);
+static inline void bch2_cut_back(struct bpos where, struct bkey_i *k)
+{
+ bch2_cut_back_s(where, bkey_i_to_s(k));
+}
/**
* bch_key_resize - adjust size of @k
@@ -573,7 +573,4 @@ static inline void extent_save(struct btree *b, struct bkey_packed *dst,
BUG_ON(!bch2_bkey_pack_key(dst, src, f));
}
-bool bch2_check_range_allocated(struct bch_fs *, struct bpos, u64, unsigned);
-unsigned bch2_bkey_nr_ptrs_allocated(struct bkey_s_c);
-
#endif /* _BCACHEFS_EXTENTS_H */
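(Illustration, not part of the patch:) the cut helpers are now symmetric: bch2_cut_front() and bch2_cut_back() both take a struct bkey_i * and forward to the new _s variants, so callers no longer pass &k->k when trimming the back of a key. A minimal sketch of the resulting usage; trim_to_range() is a hypothetical wrapper, start/end are assumed struct bpos values:

	/* trim an extent key down to [start, end) with the symmetric helpers */
	static void trim_to_range(struct bkey_i *k, struct bpos start, struct bpos end)
	{
		bch2_cut_front(start, k);	/* drop the part before start */
		bch2_cut_back(end, k);		/* drop the part at/after end */
	}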
diff --git a/fs/bcachefs/extents_types.h b/fs/bcachefs/extents_types.h
index a8dd6952d989..43d6c341ecca 100644
--- a/fs/bcachefs/extents_types.h
+++ b/fs/bcachefs/extents_types.h
@@ -21,10 +21,10 @@ struct bch_extent_crc_unpacked {
struct extent_ptr_decoded {
unsigned idx;
- unsigned ec_nr;
+ bool has_ec;
struct bch_extent_crc_unpacked crc;
struct bch_extent_ptr ptr;
- struct bch_extent_stripe_ptr ec[4];
+ struct bch_extent_stripe_ptr ec;
};
struct bch_io_failures {
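(Illustration, not part of the patch:) struct extent_ptr_decoded now carries at most one stripe pointer: the ec[4] array and ec_nr count collapse into a single ec entry guarded by has_ec. A hedged before/after sketch of what consuming code looks like; handle_stripe_ptr() is a hypothetical placeholder, p is a struct extent_ptr_decoded:

	/* before: up to four stripe entries per decoded pointer */
	for (i = 0; i < p.ec_nr; i++)
		handle_stripe_ptr(&p.ec[i]);

	/* after: either zero or one stripe entry */
	if (p.has_ec)
		handle_stripe_ptr(&p.ec);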
diff --git a/fs/bcachefs/fs-common.c b/fs/bcachefs/fs-common.c
new file mode 100644
index 000000000000..a4497eeb1f1b
--- /dev/null
+++ b/fs/bcachefs/fs-common.c
@@ -0,0 +1,281 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "acl.h"
+#include "btree_update.h"
+#include "dirent.h"
+#include "fs-common.h"
+#include "inode.h"
+#include "xattr.h"
+
+#include <linux/posix_acl.h>
+
+int bch2_create_trans(struct btree_trans *trans, u64 dir_inum,
+ struct bch_inode_unpacked *dir_u,
+ struct bch_inode_unpacked *new_inode,
+ const struct qstr *name,
+ uid_t uid, gid_t gid, umode_t mode, dev_t rdev,
+ struct posix_acl *default_acl,
+ struct posix_acl *acl)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_iter *dir_iter;
+ struct bch_hash_info hash = bch2_hash_info_init(c, new_inode);
+ u64 now = bch2_current_time(trans->c);
+ int ret;
+
+ dir_iter = bch2_inode_peek(trans, dir_u, dir_inum, BTREE_ITER_INTENT);
+ if (IS_ERR(dir_iter))
+ return PTR_ERR(dir_iter);
+
+ bch2_inode_init_late(new_inode, now, uid, gid, mode, rdev, dir_u);
+
+ if (!name)
+ new_inode->bi_flags |= BCH_INODE_UNLINKED;
+
+ ret = bch2_inode_create(trans, new_inode,
+ BLOCKDEV_INODE_MAX, 0,
+ &c->unused_inode_hint);
+ if (ret)
+ return ret;
+
+ if (default_acl) {
+ ret = bch2_set_acl_trans(trans, new_inode, &hash,
+ default_acl, ACL_TYPE_DEFAULT);
+ if (ret)
+ return ret;
+ }
+
+ if (acl) {
+ ret = bch2_set_acl_trans(trans, new_inode, &hash,
+ acl, ACL_TYPE_ACCESS);
+ if (ret)
+ return ret;
+ }
+
+ if (name) {
+ struct bch_hash_info dir_hash = bch2_hash_info_init(c, dir_u);
+ dir_u->bi_mtime = dir_u->bi_ctime = now;
+
+ if (S_ISDIR(new_inode->bi_mode))
+ dir_u->bi_nlink++;
+
+ ret = bch2_inode_write(trans, dir_iter, dir_u);
+ if (ret)
+ return ret;
+
+ ret = bch2_dirent_create(trans, dir_inum, &dir_hash,
+ mode_to_type(new_inode->bi_mode),
+ name, new_inode->bi_inum,
+ BCH_HASH_SET_MUST_CREATE);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
+int bch2_link_trans(struct btree_trans *trans, u64 dir_inum,
+ u64 inum, struct bch_inode_unpacked *inode_u,
+ const struct qstr *name)
+{
+ struct btree_iter *dir_iter, *inode_iter;
+ struct bch_inode_unpacked dir_u;
+ struct bch_hash_info dir_hash;
+ u64 now = bch2_current_time(trans->c);
+
+ inode_iter = bch2_inode_peek(trans, inode_u, inum, BTREE_ITER_INTENT);
+ if (IS_ERR(inode_iter))
+ return PTR_ERR(inode_iter);
+
+ inode_u->bi_ctime = now;
+ bch2_inode_nlink_inc(inode_u);
+
+ dir_iter = bch2_inode_peek(trans, &dir_u, dir_inum, 0);
+ if (IS_ERR(dir_iter))
+ return PTR_ERR(dir_iter);
+
+ /* XXX: shouldn't we be updating mtime/ctime on the directory? */
+
+ dir_hash = bch2_hash_info_init(trans->c, &dir_u);
+ bch2_trans_iter_put(trans, dir_iter);
+
+ return bch2_dirent_create(trans, dir_inum, &dir_hash,
+ mode_to_type(inode_u->bi_mode),
+ name, inum, BCH_HASH_SET_MUST_CREATE) ?:
+ bch2_inode_write(trans, inode_iter, inode_u);
+}
+
+int bch2_unlink_trans(struct btree_trans *trans,
+ u64 dir_inum, struct bch_inode_unpacked *dir_u,
+ struct bch_inode_unpacked *inode_u,
+ const struct qstr *name)
+{
+ struct btree_iter *dir_iter, *dirent_iter, *inode_iter;
+ struct bch_hash_info dir_hash;
+ u64 inum, now = bch2_current_time(trans->c);
+ struct bkey_s_c k;
+
+ dir_iter = bch2_inode_peek(trans, dir_u, dir_inum, BTREE_ITER_INTENT);
+ if (IS_ERR(dir_iter))
+ return PTR_ERR(dir_iter);
+
+ dir_hash = bch2_hash_info_init(trans->c, dir_u);
+
+ dirent_iter = __bch2_dirent_lookup_trans(trans, dir_inum, &dir_hash,
+ name, BTREE_ITER_INTENT);
+ if (IS_ERR(dirent_iter))
+ return PTR_ERR(dirent_iter);
+
+ k = bch2_btree_iter_peek_slot(dirent_iter);
+ inum = le64_to_cpu(bkey_s_c_to_dirent(k).v->d_inum);
+
+ inode_iter = bch2_inode_peek(trans, inode_u, inum, BTREE_ITER_INTENT);
+ if (IS_ERR(inode_iter))
+ return PTR_ERR(inode_iter);
+
+ dir_u->bi_mtime = dir_u->bi_ctime = inode_u->bi_ctime = now;
+ dir_u->bi_nlink -= S_ISDIR(inode_u->bi_mode);
+ bch2_inode_nlink_dec(inode_u);
+
+ return (S_ISDIR(inode_u->bi_mode)
+ ? bch2_empty_dir_trans(trans, inum)
+ : 0) ?:
+ bch2_dirent_delete_at(trans, &dir_hash, dirent_iter) ?:
+ bch2_inode_write(trans, dir_iter, dir_u) ?:
+ bch2_inode_write(trans, inode_iter, inode_u);
+}
+
+bool bch2_reinherit_attrs(struct bch_inode_unpacked *dst_u,
+ struct bch_inode_unpacked *src_u)
+{
+ u64 src, dst;
+ unsigned id;
+ bool ret = false;
+
+ for (id = 0; id < Inode_opt_nr; id++) {
+ if (dst_u->bi_fields_set & (1 << id))
+ continue;
+
+ src = bch2_inode_opt_get(src_u, id);
+ dst = bch2_inode_opt_get(dst_u, id);
+
+ if (src == dst)
+ continue;
+
+ bch2_inode_opt_set(dst_u, id, src);
+ ret = true;
+ }
+
+ return ret;
+}
+
+int bch2_rename_trans(struct btree_trans *trans,
+ u64 src_dir, struct bch_inode_unpacked *src_dir_u,
+ u64 dst_dir, struct bch_inode_unpacked *dst_dir_u,
+ struct bch_inode_unpacked *src_inode_u,
+ struct bch_inode_unpacked *dst_inode_u,
+ const struct qstr *src_name,
+ const struct qstr *dst_name,
+ enum bch_rename_mode mode)
+{
+ struct btree_iter *src_dir_iter, *dst_dir_iter = NULL;
+ struct btree_iter *src_inode_iter, *dst_inode_iter = NULL;
+ struct bch_hash_info src_hash, dst_hash;
+ u64 src_inode, dst_inode, now = bch2_current_time(trans->c);
+ int ret;
+
+ src_dir_iter = bch2_inode_peek(trans, src_dir_u, src_dir,
+ BTREE_ITER_INTENT);
+ if (IS_ERR(src_dir_iter))
+ return PTR_ERR(src_dir_iter);
+
+ src_hash = bch2_hash_info_init(trans->c, src_dir_u);
+
+ if (dst_dir != src_dir) {
+ dst_dir_iter = bch2_inode_peek(trans, dst_dir_u, dst_dir,
+ BTREE_ITER_INTENT);
+ if (IS_ERR(dst_dir_iter))
+ return PTR_ERR(dst_dir_iter);
+
+ dst_hash = bch2_hash_info_init(trans->c, dst_dir_u);
+ } else {
+ dst_dir_u = src_dir_u;
+ dst_hash = src_hash;
+ }
+
+ ret = bch2_dirent_rename(trans,
+ src_dir, &src_hash,
+ dst_dir, &dst_hash,
+ src_name, &src_inode,
+ dst_name, &dst_inode,
+ mode);
+ if (ret)
+ return ret;
+
+ src_inode_iter = bch2_inode_peek(trans, src_inode_u, src_inode,
+ BTREE_ITER_INTENT);
+ if (IS_ERR(src_inode_iter))
+ return PTR_ERR(src_inode_iter);
+
+ if (dst_inode) {
+ dst_inode_iter = bch2_inode_peek(trans, dst_inode_u, dst_inode,
+ BTREE_ITER_INTENT);
+ if (IS_ERR(dst_inode_iter))
+ return PTR_ERR(dst_inode_iter);
+ }
+
+ if (mode == BCH_RENAME_OVERWRITE) {
+ if (S_ISDIR(src_inode_u->bi_mode) !=
+ S_ISDIR(dst_inode_u->bi_mode))
+ return -ENOTDIR;
+
+ if (S_ISDIR(dst_inode_u->bi_mode) &&
+ bch2_empty_dir_trans(trans, dst_inode))
+ return -ENOTEMPTY;
+ }
+
+ if (bch2_reinherit_attrs(src_inode_u, dst_dir_u) &&
+ S_ISDIR(src_inode_u->bi_mode))
+ return -EXDEV;
+
+ if (mode == BCH_RENAME_EXCHANGE &&
+ bch2_reinherit_attrs(dst_inode_u, src_dir_u) &&
+ S_ISDIR(dst_inode_u->bi_mode))
+ return -EXDEV;
+
+ if (S_ISDIR(src_inode_u->bi_mode)) {
+ src_dir_u->bi_nlink--;
+ dst_dir_u->bi_nlink++;
+ }
+
+ if (dst_inode && S_ISDIR(dst_inode_u->bi_mode)) {
+ dst_dir_u->bi_nlink--;
+ src_dir_u->bi_nlink += mode == BCH_RENAME_EXCHANGE;
+ }
+
+ if (mode == BCH_RENAME_OVERWRITE)
+ bch2_inode_nlink_dec(dst_inode_u);
+
+ src_dir_u->bi_mtime = now;
+ src_dir_u->bi_ctime = now;
+
+ if (src_dir != dst_dir) {
+ dst_dir_u->bi_mtime = now;
+ dst_dir_u->bi_ctime = now;
+ }
+
+ src_inode_u->bi_ctime = now;
+
+ if (dst_inode)
+ dst_inode_u->bi_ctime = now;
+
+ return bch2_inode_write(trans, src_dir_iter, src_dir_u) ?:
+ (src_dir != dst_dir
+ ? bch2_inode_write(trans, dst_dir_iter, dst_dir_u)
+ : 0 ) ?:
+ bch2_inode_write(trans, src_inode_iter, src_inode_u) ?:
+ (dst_inode
+ ? bch2_inode_write(trans, dst_inode_iter, dst_inode_u)
+ : 0 );
+}
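(Illustration, not part of the patch:) the helpers above chain their btree updates with GCC's "a ?: b" extension, as in bch2_unlink_trans() and bch2_rename_trans(). The right-hand operand is evaluated only when the left-hand one is zero, so with 0 meaning success the chain stops at the first failing step and returns its error. Equivalent expansion, with step_*() as hypothetical stand-ins:

	int ret;

	ret = step_one() ?: step_two() ?: step_three();

	/* behaves like: */
	ret = step_one();
	if (!ret)
		ret = step_two();
	if (!ret)
		ret = step_three();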
diff --git a/fs/bcachefs/fs-common.h b/fs/bcachefs/fs-common.h
new file mode 100644
index 000000000000..c1621485a526
--- /dev/null
+++ b/fs/bcachefs/fs-common.h
@@ -0,0 +1,36 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_FS_COMMON_H
+#define _BCACHEFS_FS_COMMON_H
+
+struct posix_acl;
+
+int bch2_create_trans(struct btree_trans *, u64,
+ struct bch_inode_unpacked *,
+ struct bch_inode_unpacked *,
+ const struct qstr *,
+ uid_t, gid_t, umode_t, dev_t,
+ struct posix_acl *,
+ struct posix_acl *);
+
+int bch2_link_trans(struct btree_trans *, u64,
+ u64, struct bch_inode_unpacked *,
+ const struct qstr *);
+
+int bch2_unlink_trans(struct btree_trans *,
+ u64, struct bch_inode_unpacked *,
+ struct bch_inode_unpacked *,
+ const struct qstr *);
+
+int bch2_rename_trans(struct btree_trans *,
+ u64, struct bch_inode_unpacked *,
+ u64, struct bch_inode_unpacked *,
+ struct bch_inode_unpacked *,
+ struct bch_inode_unpacked *,
+ const struct qstr *,
+ const struct qstr *,
+ enum bch_rename_mode);
+
+bool bch2_reinherit_attrs(struct bch_inode_unpacked *,
+ struct bch_inode_unpacked *);
+
+#endif /* _BCACHEFS_FS_COMMON_H */
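(Illustration, not part of the patch:) these helpers only stage updates inside a btree_trans; the VFS-facing callers in fs.c are expected to wrap them in a transaction and commit atomically. A rough calling sketch under those assumptions — c, dir_inum, name and journal_seq are taken to be in scope, and real callers also retry the whole sequence on -EINTR:

	struct btree_trans trans;
	struct bch_inode_unpacked dir_u, inode_u;
	int ret;

	bch2_trans_init(&trans, c, 0, 0);

	ret = bch2_unlink_trans(&trans, dir_inum, &dir_u, &inode_u, name) ?:
	      bch2_trans_commit(&trans, NULL, &journal_seq,
				BTREE_INSERT_ATOMIC|
				BTREE_INSERT_NOUNLOCK);

	bch2_trans_exit(&trans);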
diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c
index 04bcf061ca12..160644ccf439 100644
--- a/fs/bcachefs/fs-io.c
+++ b/fs/bcachefs/fs-io.c
@@ -3,11 +3,13 @@
#include "bcachefs.h"
#include "alloc_foreground.h"
+#include "bkey_on_stack.h"
#include "btree_update.h"
#include "buckets.h"
#include "clock.h"
#include "error.h"
#include "extents.h"
+#include "extent_update.h"
#include "fs.h"
#include "fs-io.h"
#include "fsck.h"
@@ -36,27 +38,16 @@ struct quota_res {
u64 sectors;
};
-struct bchfs_write_op {
- struct bch_inode_info *inode;
- s64 sectors_added;
- bool is_dio;
- bool unalloc;
- u64 new_i_size;
-
- /* must be last: */
- struct bch_write_op op;
-};
-
struct bch_writepage_io {
struct closure cl;
- u64 new_sectors;
+ struct bch_inode_info *inode;
/* must be last: */
- struct bchfs_write_op op;
+ struct bch_write_op op;
};
struct dio_write {
- struct closure cl;
+ struct completion done;
struct kiocb *req;
struct mm_struct *mm;
unsigned loop:1,
@@ -68,7 +59,7 @@ struct dio_write {
struct iovec inline_vecs[2];
/* must be last: */
- struct bchfs_write_op iop;
+ struct bch_write_op op;
};
struct dio_read {
@@ -229,277 +220,6 @@ static void i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode,
mutex_unlock(&inode->ei_quota_lock);
}
-/* normal i_size/i_sectors update machinery: */
-
-static int sum_sector_overwrites(struct btree_trans *trans,
- struct btree_iter *extent_iter,
- struct bkey_i *new, bool *allocating,
- s64 *delta)
-{
- struct btree_iter *iter;
- struct bkey_s_c old;
-
- *delta = 0;
-
- iter = bch2_trans_copy_iter(trans, extent_iter);
- if (IS_ERR(iter))
- return PTR_ERR(iter);
-
- old = bch2_btree_iter_peek_slot(iter);
-
- while (1) {
- /*
- * should not be possible to get an error here, since we're
- * carefully not advancing past @new and thus whatever leaf node
- * @_iter currently points to:
- */
- BUG_ON(bkey_err(old));
-
- if (allocating &&
- !*allocating &&
- bch2_bkey_nr_ptrs_allocated(old) <
- bch2_bkey_nr_dirty_ptrs(bkey_i_to_s_c(new)))
- *allocating = true;
-
- *delta += (min(new->k.p.offset,
- old.k->p.offset) -
- max(bkey_start_offset(&new->k),
- bkey_start_offset(old.k))) *
- (bkey_extent_is_allocation(&new->k) -
- bkey_extent_is_allocation(old.k));
-
- if (bkey_cmp(old.k->p, new->k.p) >= 0)
- break;
-
- old = bch2_btree_iter_next_slot(iter);
- }
-
- bch2_trans_iter_free(trans, iter);
- return 0;
-}
-
-int bch2_extent_update(struct btree_trans *trans,
- struct bch_inode_info *inode,
- struct disk_reservation *disk_res,
- struct quota_res *quota_res,
- struct btree_iter *extent_iter,
- struct bkey_i *k,
- u64 new_i_size,
- bool may_allocate,
- bool direct,
- s64 *total_delta)
-{
- struct bch_fs *c = trans->c;
- struct btree_iter *inode_iter = NULL;
- struct bch_inode_unpacked inode_u;
- struct bkey_inode_buf inode_p;
- bool allocating = false;
- bool extended = false;
- bool inode_locked = false;
- s64 i_sectors_delta;
- int ret;
-
- ret = bch2_btree_iter_traverse(extent_iter);
- if (ret)
- return ret;
-
- ret = bch2_extent_trim_atomic(k, extent_iter);
- if (ret)
- return ret;
-
- ret = sum_sector_overwrites(trans, extent_iter,
- k, &allocating,
- &i_sectors_delta);
- if (ret)
- return ret;
-
- if (!may_allocate && allocating)
- return -ENOSPC;
-
- bch2_trans_update(trans, BTREE_INSERT_ENTRY(extent_iter, k));
-
- new_i_size = min(k->k.p.offset << 9, new_i_size);
-
- /* XXX: inode->i_size locking */
- if (i_sectors_delta ||
- new_i_size > inode->ei_inode.bi_size) {
- if (c->opts.new_inode_updates) {
- bch2_trans_unlock(trans);
- mutex_lock(&inode->ei_update_lock);
-
- if (!bch2_trans_relock(trans)) {
- mutex_unlock(&inode->ei_update_lock);
- return -EINTR;
- }
-
- inode_locked = true;
-
- if (!inode->ei_inode_update)
- inode->ei_inode_update =
- bch2_deferred_update_alloc(c,
- BTREE_ID_INODES, 64);
-
- inode_u = inode->ei_inode;
- inode_u.bi_sectors += i_sectors_delta;
-
- /* XXX: this is slightly suspect */
- if (!(inode_u.bi_flags & BCH_INODE_I_SIZE_DIRTY) &&
- new_i_size > inode_u.bi_size) {
- inode_u.bi_size = new_i_size;
- extended = true;
- }
-
- bch2_inode_pack(&inode_p, &inode_u);
- bch2_trans_update(trans,
- BTREE_INSERT_DEFERRED(inode->ei_inode_update,
- &inode_p.inode.k_i));
- } else {
- inode_iter = bch2_trans_get_iter(trans,
- BTREE_ID_INODES,
- POS(k->k.p.inode, 0),
- BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
- if (IS_ERR(inode_iter))
- return PTR_ERR(inode_iter);
-
- ret = bch2_btree_iter_traverse(inode_iter);
- if (ret)
- goto err;
-
- inode_u = inode->ei_inode;
- inode_u.bi_sectors += i_sectors_delta;
-
- /* XXX: this is slightly suspect */
- if (!(inode_u.bi_flags & BCH_INODE_I_SIZE_DIRTY) &&
- new_i_size > inode_u.bi_size) {
- inode_u.bi_size = new_i_size;
- extended = true;
- }
-
- bch2_inode_pack(&inode_p, &inode_u);
- bch2_trans_update(trans,
- BTREE_INSERT_ENTRY(inode_iter, &inode_p.inode.k_i));
- }
- }
-
- ret = bch2_trans_commit(trans, disk_res,
- &inode->ei_journal_seq,
- BTREE_INSERT_NOFAIL|
- BTREE_INSERT_ATOMIC|
- BTREE_INSERT_NOUNLOCK|
- BTREE_INSERT_USE_RESERVE);
- if (ret)
- goto err;
-
- inode->ei_inode.bi_sectors += i_sectors_delta;
-
- EBUG_ON(i_sectors_delta &&
- inode->ei_inode.bi_sectors != inode_u.bi_sectors);
-
- if (extended) {
- inode->ei_inode.bi_size = new_i_size;
-
- if (direct) {
- spin_lock(&inode->v.i_lock);
- if (new_i_size > inode->v.i_size)
- i_size_write(&inode->v, new_i_size);
- spin_unlock(&inode->v.i_lock);
- }
- }
-
- if (direct)
- i_sectors_acct(c, inode, quota_res, i_sectors_delta);
-
- if (total_delta)
- *total_delta += i_sectors_delta;
-err:
- if (!IS_ERR_OR_NULL(inode_iter))
- bch2_trans_iter_put(trans, inode_iter);
- if (inode_locked)
- mutex_unlock(&inode->ei_update_lock);
-
- return ret;
-}
-
-static int bchfs_write_index_update(struct bch_write_op *wop)
-{
- struct bch_fs *c = wop->c;
- struct bchfs_write_op *op = container_of(wop,
- struct bchfs_write_op, op);
- struct quota_res *quota_res = op->is_dio
- ? &container_of(op, struct dio_write, iop)->quota_res
- : NULL;
- struct bch_inode_info *inode = op->inode;
- struct keylist *keys = &op->op.insert_keys;
- struct bkey_i *k = bch2_keylist_front(keys);
- struct btree_trans trans;
- struct btree_iter *iter;
- int ret;
-
- BUG_ON(k->k.p.inode != inode->v.i_ino);
-
- bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024);
-
- iter = bch2_trans_get_iter(&trans,
- BTREE_ID_EXTENTS,
- bkey_start_pos(&k->k),
- BTREE_ITER_INTENT);
-
- do {
- BKEY_PADDED(k) tmp;
-
- bkey_copy(&tmp.k, bch2_keylist_front(keys));
-
- bch2_trans_begin_updates(&trans);
-
- ret = bch2_extent_update(&trans, inode,
- &wop->res, quota_res,
- iter, &tmp.k,
- op->new_i_size,
- !op->unalloc,
- op->is_dio,
- &op->sectors_added);
- if (ret == -EINTR)
- continue;
- if (ret)
- break;
-
- if (bkey_cmp(iter->pos, bch2_keylist_front(keys)->k.p) < 0)
- bch2_cut_front(iter->pos, bch2_keylist_front(keys));
- else
- bch2_keylist_pop_front(keys);
- } while (!bch2_keylist_empty(keys));
-
- bch2_trans_exit(&trans);
-
- return ret;
-}
-
-static inline void bch2_fswrite_op_init(struct bchfs_write_op *op,
- struct bch_fs *c,
- struct bch_inode_info *inode,
- struct bch_io_opts opts,
- bool is_dio)
-{
- op->inode = inode;
- op->sectors_added = 0;
- op->is_dio = is_dio;
- op->unalloc = false;
- op->new_i_size = U64_MAX;
-
- bch2_write_op_init(&op->op, c, opts);
- op->op.target = opts.foreground_target;
- op->op.index_update_fn = bchfs_write_index_update;
- op_journal_seq_set(&op->op, &inode->ei_journal_seq);
-}
-
-static inline struct bch_io_opts io_opts(struct bch_fs *c, struct bch_inode_info *inode)
-{
- struct bch_io_opts opts = bch2_opts_to_inode_opts(c->opts);
-
- bch2_io_opts_apply(&opts, bch2_inode_opts_get(&inode->ei_inode));
- return opts;
-}
-
/* page state: */
/* stored in page->private: */
@@ -521,6 +241,7 @@ struct bch_page_sector {
};
struct bch_page_state {
+ spinlock_t lock;
atomic_t write_count;
struct bch_page_sector s[PAGE_SECTORS];
};
@@ -576,6 +297,7 @@ static struct bch_page_state *__bch2_page_state_create(struct page *page,
if (!s)
return NULL;
+ spin_lock_init(&s->lock);
/*
* migrate_page_move_mapping() assumes that pages with private data
* have their count elevated by 1.
@@ -723,6 +445,9 @@ static void bch2_clear_page_bits(struct page *page)
if (!s)
return;
+ EBUG_ON(!PageLocked(page));
+ EBUG_ON(PageWriteback(page));
+
for (i = 0; i < ARRAY_SIZE(s->s); i++) {
disk_res.sectors += s->s[i].replicas_reserved;
s->s[i].replicas_reserved = 0;
@@ -749,13 +474,23 @@ static void bch2_set_page_dirty(struct bch_fs *c,
struct bch_page_state *s = bch2_page_state(page);
unsigned i, dirty_sectors = 0;
+ WARN_ON((u64) page_offset(page) + offset + len >
+ round_up((u64) i_size_read(&inode->v), block_bytes(c)));
+
+ spin_lock(&s->lock);
+
for (i = round_down(offset, block_bytes(c)) >> 9;
i < round_up(offset + len, block_bytes(c)) >> 9;
i++) {
unsigned sectors = sectors_to_reserve(&s->s[i],
res->disk.nr_replicas);
- BUG_ON(sectors > res->disk.sectors);
+ /*
+ * This can happen if we race with the error path in
+ * bch2_writepage_io_done():
+ */
+ sectors = min_t(unsigned, sectors, res->disk.sectors);
+
s->s[i].replicas_reserved += sectors;
res->disk.sectors -= sectors;
@@ -765,6 +500,8 @@ static void bch2_set_page_dirty(struct bch_fs *c,
s->s[i].state = max_t(unsigned, s->s[i].state, SECTOR_DIRTY);
}
+ spin_unlock(&s->lock);
+
if (dirty_sectors)
i_sectors_acct(c, inode, &res->quota, dirty_sectors);
@@ -772,12 +509,25 @@ static void bch2_set_page_dirty(struct bch_fs *c,
__set_page_dirty_nobuffers(page);
}
+vm_fault_t bch2_page_fault(struct vm_fault *vmf)
+{
+ struct file *file = vmf->vma->vm_file;
+ struct bch_inode_info *inode = file_bch_inode(file);
+ int ret;
+
+ bch2_pagecache_add_get(&inode->ei_pagecache_lock);
+ ret = filemap_fault(vmf);
+ bch2_pagecache_add_put(&inode->ei_pagecache_lock);
+
+ return ret;
+}
+
vm_fault_t bch2_page_mkwrite(struct vm_fault *vmf)
{
struct page *page = vmf->page;
struct file *file = vmf->vma->vm_file;
struct bch_inode_info *inode = file_bch_inode(file);
- struct address_space *mapping = inode->v.i_mapping;
+ struct address_space *mapping = file->f_mapping;
struct bch_fs *c = inode->v.i_sb->s_fs_info;
struct bch2_page_reservation res;
unsigned len;
@@ -795,8 +545,7 @@ vm_fault_t bch2_page_mkwrite(struct vm_fault *vmf)
* a write_invalidate_inode_pages_range() that works without dropping
* page lock before invalidating page
*/
- if (current->pagecache_lock != &mapping->add_lock)
- pagecache_add_get(&mapping->add_lock);
+ bch2_pagecache_add_get(&inode->ei_pagecache_lock);
lock_page(page);
isize = i_size_read(&inode->v);
@@ -807,11 +556,7 @@ vm_fault_t bch2_page_mkwrite(struct vm_fault *vmf)
goto out;
}
- /* page is wholly or partially inside EOF */
- if (((page->index + 1) << PAGE_SHIFT) <= isize)
- len = PAGE_SIZE;
- else
- len = offset_in_page(isize);
+ len = min_t(loff_t, PAGE_SIZE, isize - page_offset(page));
if (bch2_page_reservation_get(c, inode, page, &res, 0, len, true)) {
unlock_page(page);
@@ -820,23 +565,19 @@ vm_fault_t bch2_page_mkwrite(struct vm_fault *vmf)
}
bch2_set_page_dirty(c, inode, page, &res, 0, len);
+ bch2_page_reservation_put(c, inode, &res);
+
wait_for_stable_page(page);
out:
- if (current->pagecache_lock != &mapping->add_lock)
- pagecache_add_put(&mapping->add_lock);
+ bch2_pagecache_add_put(&inode->ei_pagecache_lock);
sb_end_pagefault(inode->v.i_sb);
- bch2_page_reservation_put(c, inode, &res);
-
return ret;
}
void bch2_invalidatepage(struct page *page, unsigned int offset,
unsigned int length)
{
- EBUG_ON(!PageLocked(page));
- EBUG_ON(PageWriteback(page));
-
if (offset || length < PAGE_SIZE)
return;
@@ -845,10 +586,6 @@ void bch2_invalidatepage(struct page *page, unsigned int offset,
int bch2_releasepage(struct page *page, gfp_t gfp_mask)
{
- /* XXX: this can't take locks that are held while we allocate memory */
- EBUG_ON(!PageLocked(page));
- EBUG_ON(PageWriteback(page));
-
if (PageDirty(page))
return 0;
@@ -865,7 +602,7 @@ int bch2_migrate_page(struct address_space *mapping, struct page *newpage,
EBUG_ON(!PageLocked(page));
EBUG_ON(!PageLocked(newpage));
- ret = migrate_page_move_mapping(mapping, newpage, page, NULL, mode, 0);
+ ret = migrate_page_move_mapping(mapping, newpage, page, 0);
if (ret != MIGRATEPAGE_SUCCESS)
return ret;
@@ -890,10 +627,10 @@ int bch2_migrate_page(struct address_space *mapping, struct page *newpage,
static void bch2_readpages_end_io(struct bio *bio)
{
+ struct bvec_iter_all iter;
struct bio_vec *bv;
- int i;
- bio_for_each_segment_all(bv, bio, i) {
+ bio_for_each_segment_all(bv, bio, iter) {
struct page *page = bv->bv_page;
if (!bio->bi_status) {
@@ -995,7 +732,7 @@ static void bch2_add_page_sectors(struct bio *bio, struct bkey_s_c k)
struct bvec_iter iter;
struct bio_vec bv;
unsigned nr_ptrs = k.k->type == KEY_TYPE_reflink_v
- ? 0 : bch2_bkey_nr_ptrs_allocated(k);
+ ? 0 : bch2_bkey_nr_ptrs_fully_allocated(k);
unsigned state = k.k->type == KEY_TYPE_reservation
? SECTOR_RESERVED
: SECTOR_ALLOCATED;
@@ -1013,6 +750,18 @@ static void bch2_add_page_sectors(struct bio *bio, struct bkey_s_c k)
}
}
+static bool extent_partial_reads_expensive(struct bkey_s_c k)
+{
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+ struct bch_extent_crc_unpacked crc;
+ const union bch_extent_entry *i;
+
+ bkey_for_each_crc(k.k, ptrs, crc, i)
+ if (crc.csum_type || crc.compression_type)
+ return true;
+ return false;
+}
+
static void readpage_bio_extend(struct readpages_iter *iter,
struct bio *bio,
unsigned sectors_this_extent,
@@ -1033,11 +782,8 @@ static void readpage_bio_extend(struct readpages_iter *iter,
if (!get_more)
break;
- rcu_read_lock();
- page = radix_tree_lookup(&iter->mapping->i_pages, page_offset);
- rcu_read_unlock();
-
- if (page && !radix_tree_exceptional_entry(page))
+ page = xa_load(&iter->mapping->i_pages, page_offset);
+ if (page && !xa_is_value(page))
break;
page = __page_cache_alloc(readahead_gfp_mask(iter->mapping));
@@ -1069,15 +815,17 @@ static void bchfs_read(struct btree_trans *trans, struct btree_iter *iter,
struct readpages_iter *readpages_iter)
{
struct bch_fs *c = trans->c;
+ struct bkey_on_stack sk;
int flags = BCH_READ_RETRY_IF_STALE|
BCH_READ_MAY_PROMOTE;
int ret = 0;
rbio->c = c;
rbio->start_time = local_clock();
+
+ bkey_on_stack_init(&sk);
retry:
while (1) {
- BKEY_PADDED(k) tmp;
struct bkey_s_c k;
unsigned bytes, sectors, offset_into_extent;
@@ -1089,15 +837,15 @@ retry:
if (ret)
break;
- bkey_reassemble(&tmp.k, k);
- k = bkey_i_to_s_c(&tmp.k);
+ bkey_on_stack_reassemble(&sk, c, k);
+ k = bkey_i_to_s_c(sk.k);
offset_into_extent = iter->pos.offset -
bkey_start_offset(k.k);
sectors = k.k->size - offset_into_extent;
ret = bch2_read_indirect_extent(trans,
- &offset_into_extent, &tmp.k);
+ &offset_into_extent, sk.k);
if (ret)
break;
@@ -1105,22 +853,9 @@ retry:
bch2_trans_unlock(trans);
- if (readpages_iter) {
- bool want_full_extent = false;
-
- if (bkey_extent_is_data(k.k)) {
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
- const union bch_extent_entry *i;
- struct extent_ptr_decoded p;
-
- bkey_for_each_ptr_decode(k.k, ptrs, p, i)
- want_full_extent |= ((p.crc.csum_type != 0) |
- (p.crc.compression_type != 0));
- }
-
- readpage_bio_extend(readpages_iter, &rbio->bio,
- sectors, want_full_extent);
- }
+ if (readpages_iter)
+ readpage_bio_extend(readpages_iter, &rbio->bio, sectors,
+ extent_partial_reads_expensive(k));
bytes = min(sectors, bio_sectors(&rbio->bio)) << 9;
swap(rbio->bio.bi_iter.bi_size, bytes);
@@ -1134,7 +869,7 @@ retry:
bch2_read_extent(c, rbio, k, offset_into_extent, flags);
if (flags & BCH_READ_LAST_FRAGMENT)
- return;
+ break;
swap(rbio->bio.bi_iter.bi_size, bytes);
bio_advance(&rbio->bio, bytes);
@@ -1143,8 +878,12 @@ retry:
if (ret == -EINTR)
goto retry;
- bcache_io_error(c, &rbio->bio, "btree IO error %i", ret);
- bio_endio(&rbio->bio);
+ if (ret) {
+ bcache_io_error(c, &rbio->bio, "btree IO error %i", ret);
+ bio_endio(&rbio->bio);
+ }
+
+ bkey_on_stack_exit(&sk, c);
}
int bch2_readpages(struct file *file, struct address_space *mapping,
@@ -1152,7 +891,7 @@ int bch2_readpages(struct file *file, struct address_space *mapping,
{
struct bch_inode_info *inode = to_bch_ei(mapping->host);
struct bch_fs *c = inode->v.i_sb->s_fs_info;
- struct bch_io_opts opts = io_opts(c, inode);
+ struct bch_io_opts opts = io_opts(c, &inode->ei_inode);
struct btree_trans trans;
struct btree_iter *iter;
struct page *page;
@@ -1167,8 +906,7 @@ int bch2_readpages(struct file *file, struct address_space *mapping,
iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, POS_MIN,
BTREE_ITER_SLOTS);
- if (current->pagecache_lock != &mapping->add_lock)
- pagecache_add_get(&mapping->add_lock);
+ bch2_pagecache_add_get(&inode->ei_pagecache_lock);
while ((page = readpage_iter_next(&readpages_iter))) {
pgoff_t index = readpages_iter.offset + readpages_iter.idx;
@@ -1191,8 +929,7 @@ int bch2_readpages(struct file *file, struct address_space *mapping,
&readpages_iter);
}
- if (current->pagecache_lock != &mapping->add_lock)
- pagecache_add_put(&mapping->add_lock);
+ bch2_pagecache_add_put(&inode->ei_pagecache_lock);
bch2_trans_exit(&trans);
kfree(readpages_iter.pages);
@@ -1226,7 +963,7 @@ int bch2_readpage(struct file *file, struct page *page)
{
struct bch_inode_info *inode = to_bch_ei(page->mapping->host);
struct bch_fs *c = inode->v.i_sb->s_fs_info;
- struct bch_io_opts opts = io_opts(c, inode);
+ struct bch_io_opts opts = io_opts(c, &inode->ei_inode);
struct bch_read_bio *rbio;
rbio = rbio_init(bio_alloc_bioset(GFP_NOFS, 1, &c->bio_read), opts);
@@ -1251,7 +988,7 @@ static int bch2_read_single_page(struct page *page,
DECLARE_COMPLETION_ONSTACK(done);
rbio = rbio_init(bio_alloc_bioset(GFP_NOFS, 1, &c->bio_read),
- io_opts(c, inode));
+ io_opts(c, &inode->ei_inode));
rbio->bio.bi_private = &done;
rbio->bio.bi_end_io = bch2_read_single_page_end_io;
@@ -1278,7 +1015,9 @@ struct bch_writepage_state {
static inline struct bch_writepage_state bch_writepage_state_init(struct bch_fs *c,
struct bch_inode_info *inode)
{
- return (struct bch_writepage_state) { .opts = io_opts(c, inode) };
+ return (struct bch_writepage_state) {
+ .opts = io_opts(c, &inode->ei_inode)
+ };
}
static void bch2_writepage_io_free(struct closure *cl)
@@ -1286,30 +1025,43 @@ static void bch2_writepage_io_free(struct closure *cl)
struct bch_writepage_io *io = container_of(cl,
struct bch_writepage_io, cl);
- bio_put(&io->op.op.wbio.bio);
+ bio_put(&io->op.wbio.bio);
}
static void bch2_writepage_io_done(struct closure *cl)
{
struct bch_writepage_io *io = container_of(cl,
struct bch_writepage_io, cl);
- struct bch_fs *c = io->op.op.c;
- struct bio *bio = &io->op.op.wbio.bio;
+ struct bch_fs *c = io->op.c;
+ struct bio *bio = &io->op.wbio.bio;
+ struct bvec_iter_all iter;
struct bio_vec *bvec;
- unsigned i, j;
+ unsigned i;
- if (io->op.op.error) {
- bio_for_each_segment_all(bvec, bio, i) {
+ if (io->op.error) {
+ bio_for_each_segment_all(bvec, bio, iter) {
struct bch_page_state *s;
SetPageError(bvec->bv_page);
mapping_set_error(bvec->bv_page->mapping, -EIO);
- lock_page(bvec->bv_page);
- s = bch2_page_state(bvec->bv_page);
- for (j = 0; j < PAGE_SECTORS; j++)
- s->s[j].nr_replicas = 0;
- unlock_page(bvec->bv_page);
+ s = __bch2_page_state(bvec->bv_page);
+ spin_lock(&s->lock);
+ for (i = 0; i < PAGE_SECTORS; i++)
+ s->s[i].nr_replicas = 0;
+ spin_unlock(&s->lock);
+ }
+ }
+
+ if (io->op.flags & BCH_WRITE_WROTE_DATA_INLINE) {
+ bio_for_each_segment_all(bvec, bio, iter) {
+ struct bch_page_state *s;
+
+ s = __bch2_page_state(bvec->bv_page);
+ spin_lock(&s->lock);
+ for (i = 0; i < PAGE_SECTORS; i++)
+ s->s[i].nr_replicas = 0;
+ spin_unlock(&s->lock);
}
}
@@ -1317,24 +1069,22 @@ static void bch2_writepage_io_done(struct closure *cl)
* racing with fallocate can cause us to add fewer sectors than
* expected - but we shouldn't add more sectors than expected:
*/
- BUG_ON(io->op.sectors_added > (s64) io->new_sectors);
+ BUG_ON(io->op.i_sectors_delta > 0);
/*
* (error (due to going RO) halfway through a page can screw that up
* slightly)
* XXX wtf?
- BUG_ON(io->op.sectors_added - io->new_sectors >= (s64) PAGE_SECTORS);
+ BUG_ON(io->op.op.i_sectors_delta >= PAGE_SECTORS);
*/
/*
* PageWriteback is effectively our ref on the inode - fixup i_blocks
* before calling end_page_writeback:
*/
- if (io->op.sectors_added != io->new_sectors)
- i_sectors_acct(c, io->op.inode, NULL,
- io->op.sectors_added - (s64) io->new_sectors);
+ i_sectors_acct(c, io->inode, NULL, io->op.i_sectors_delta);
- bio_for_each_segment_all(bvec, bio, i) {
+ bio_for_each_segment_all(bvec, bio, iter) {
struct bch_page_state *s = __bch2_page_state(bvec->bv_page);
if (atomic_dec_and_test(&s->write_count))
@@ -1349,7 +1099,7 @@ static void bch2_writepage_do_io(struct bch_writepage_state *w)
struct bch_writepage_io *io = w->io;
w->io = NULL;
- closure_call(&io->op.op.cl, bch2_write, NULL, &io->cl);
+ closure_call(&io->op.cl, bch2_write, NULL, &io->cl);
continue_at(&io->cl, bch2_writepage_io_done, NULL);
}
@@ -1358,6 +1108,7 @@ static void bch2_writepage_do_io(struct bch_writepage_state *w)
* possible, else allocating a new one:
*/
static void bch2_writepage_io_alloc(struct bch_fs *c,
+ struct writeback_control *wbc,
struct bch_writepage_state *w,
struct bch_inode_info *inode,
u64 sector,
@@ -1368,17 +1119,21 @@ static void bch2_writepage_io_alloc(struct bch_fs *c,
w->io = container_of(bio_alloc_bioset(GFP_NOFS,
BIO_MAX_PAGES,
&c->writepage_bioset),
- struct bch_writepage_io, op.op.wbio.bio);
+ struct bch_writepage_io, op.wbio.bio);
closure_init(&w->io->cl, NULL);
- w->io->new_sectors = 0;
- bch2_fswrite_op_init(&w->io->op, c, inode, w->opts, false);
- op = &w->io->op.op;
+ w->io->inode = inode;
+
+ op = &w->io->op;
+ bch2_write_op_init(op, c, w->opts);
+ op->target = w->opts.foreground_target;
+ op_journal_seq_set(op, &inode->ei_journal_seq);
op->nr_replicas = nr_replicas;
op->res.nr_replicas = nr_replicas;
op->write_point = writepoint_hashed(inode->ei_last_dirtied);
op->pos = POS(inode->v.i_ino, sector);
op->wbio.bio.bi_iter.bi_sector = sector;
+ op->wbio.bio.bi_opf = wbc_to_write_flags(wbc);
}
static int __bch2_writepage(struct page *page,
@@ -1482,33 +1237,30 @@ do_io:
}
if (w->io &&
- (w->io->op.op.res.nr_replicas != nr_replicas_this_write ||
- bio_full(&w->io->op.op.wbio.bio) ||
- bio_end_sector(&w->io->op.op.wbio.bio) != sector))
+ (w->io->op.res.nr_replicas != nr_replicas_this_write ||
+ bio_full(&w->io->op.wbio.bio, PAGE_SIZE) ||
+ w->io->op.wbio.bio.bi_iter.bi_size >= (256U << 20) ||
+ bio_end_sector(&w->io->op.wbio.bio) != sector))
bch2_writepage_do_io(w);
if (!w->io)
- bch2_writepage_io_alloc(c, w, inode, sector,
+ bch2_writepage_io_alloc(c, wbc, w, inode, sector,
nr_replicas_this_write);
- w->io->new_sectors += dirty_sectors;
-
atomic_inc(&s->write_count);
- BUG_ON(inode != w->io->op.inode);
- BUG_ON(!bio_add_page(&w->io->op.op.wbio.bio, page,
+ BUG_ON(inode != w->io->inode);
+ BUG_ON(!bio_add_page(&w->io->op.wbio.bio, page,
sectors << 9, offset << 9));
/* Check for writing past i_size: */
- BUG_ON((bio_end_sector(&w->io->op.op.wbio.bio) << 9) >
- round_up(i_size, block_bytes(c)));
+ WARN_ON((bio_end_sector(&w->io->op.wbio.bio) << 9) >
+ round_up(i_size, block_bytes(c)));
- w->io->op.op.res.sectors += reserved_sectors;
+ w->io->op.res.sectors += reserved_sectors;
+ w->io->op.i_sectors_delta -= dirty_sectors;
w->io->op.new_i_size = i_size;
- if (wbc->sync_mode == WB_SYNC_ALL)
- w->io->op.op.wbio.bio.bi_opf |= REQ_SYNC;
-
offset += sectors;
}
@@ -1569,8 +1321,7 @@ int bch2_write_begin(struct file *file, struct address_space *mapping,
bch2_page_reservation_init(c, inode, res);
*fsdata = res;
- /* Not strictly necessary - same reason as mkwrite(): */
- pagecache_add_get(&mapping->add_lock);
+ bch2_pagecache_add_get(&inode->ei_pagecache_lock);
page = grab_cache_page_write_begin(mapping, index, flags);
if (!page)
@@ -1622,7 +1373,7 @@ err:
put_page(page);
*pagep = NULL;
err_unlock:
- pagecache_add_put(&mapping->add_lock);
+ bch2_pagecache_add_put(&inode->ei_pagecache_lock);
kfree(res);
*fsdata = NULL;
return ret;
@@ -1666,7 +1417,7 @@ int bch2_write_end(struct file *file, struct address_space *mapping,
unlock_page(page);
put_page(page);
- pagecache_add_put(&mapping->add_lock);
+ bch2_pagecache_add_put(&inode->ei_pagecache_lock);
bch2_page_reservation_put(c, inode, res);
kfree(res);
@@ -1700,8 +1451,13 @@ static int __bch2_buffered_write(struct bch_inode_info *inode,
pages[i] = grab_cache_page_write_begin(mapping, index + i, 0);
if (!pages[i]) {
nr_pages = i;
- ret = -ENOMEM;
- goto out;
+ if (!i) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ len = min_t(unsigned, len,
+ nr_pages * PAGE_SIZE - offset);
+ break;
}
}
@@ -1766,14 +1522,6 @@ retry_reservation:
if (!copied)
goto out;
- nr_pages_copied = DIV_ROUND_UP(offset + copied, PAGE_SIZE);
- inode->ei_last_dirtied = (unsigned long) current;
-
- spin_lock(&inode->v.i_lock);
- if (pos + copied > inode->v.i_size)
- i_size_write(&inode->v, pos + copied);
- spin_unlock(&inode->v.i_lock);
-
if (copied < len &&
((offset + copied) & (PAGE_SIZE - 1))) {
struct page *page = pages[(offset + copied) >> PAGE_SHIFT];
@@ -1784,6 +1532,11 @@ retry_reservation:
}
}
+ spin_lock(&inode->v.i_lock);
+ if (pos + copied > inode->v.i_size)
+ i_size_write(&inode->v, pos + copied);
+ spin_unlock(&inode->v.i_lock);
+
while (set_dirty < copied) {
struct page *page = pages[(offset + set_dirty) >> PAGE_SHIFT];
unsigned pg_offset = (offset + set_dirty) & (PAGE_SIZE - 1);
@@ -1799,6 +1552,9 @@ retry_reservation:
set_dirty += pg_len;
}
+
+ nr_pages_copied = DIV_ROUND_UP(offset + copied, PAGE_SIZE);
+ inode->ei_last_dirtied = (unsigned long) current;
out:
for (i = nr_pages_copied; i < nr_pages; i++) {
unlock_page(pages[i]);
@@ -1819,7 +1575,7 @@ static ssize_t bch2_buffered_write(struct kiocb *iocb, struct iov_iter *iter)
ssize_t written = 0;
int ret = 0;
- pagecache_add_get(&mapping->add_lock);
+ bch2_pagecache_add_get(&inode->ei_pagecache_lock);
do {
unsigned offset = pos & (PAGE_SIZE - 1);
@@ -1876,7 +1632,7 @@ again:
balance_dirty_pages_ratelimited(mapping);
} while (iov_iter_count(iter));
- pagecache_add_put(&mapping->add_lock);
+ bch2_pagecache_add_put(&inode->ei_pagecache_lock);
return written ? written : ret;
}
@@ -1912,7 +1668,7 @@ static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter)
struct file *file = req->ki_filp;
struct bch_inode_info *inode = file_bch_inode(file);
struct bch_fs *c = inode->v.i_sb->s_fs_info;
- struct bch_io_opts opts = io_opts(c, inode);
+ struct bch_io_opts opts = io_opts(c, &inode->ei_inode);
struct dio_read *dio;
struct bio *bio;
loff_t offset = req->ki_pos;
@@ -2000,67 +1756,98 @@ start:
}
}
-/* O_DIRECT writes */
+ssize_t bch2_read_iter(struct kiocb *iocb, struct iov_iter *iter)
+{
+ struct file *file = iocb->ki_filp;
+ struct bch_inode_info *inode = file_bch_inode(file);
+ struct address_space *mapping = file->f_mapping;
+ size_t count = iov_iter_count(iter);
+ ssize_t ret;
+
+ if (!count)
+ return 0; /* skip atime */
+
+ if (iocb->ki_flags & IOCB_DIRECT) {
+ struct blk_plug plug;
+
+ ret = filemap_write_and_wait_range(mapping,
+ iocb->ki_pos,
+ iocb->ki_pos + count - 1);
+ if (ret < 0)
+ return ret;
+
+ file_accessed(file);
+
+ blk_start_plug(&plug);
+ ret = bch2_direct_IO_read(iocb, iter);
+ blk_finish_plug(&plug);
+
+ if (ret >= 0)
+ iocb->ki_pos += ret;
+ } else {
+ bch2_pagecache_add_get(&inode->ei_pagecache_lock);
+ ret = generic_file_read_iter(iocb, iter);
+ bch2_pagecache_add_put(&inode->ei_pagecache_lock);
+ }
+
+ return ret;
+}
-static void bch2_dio_write_loop_async(struct closure *);
+/* O_DIRECT writes */
static long bch2_dio_write_loop(struct dio_write *dio)
{
bool kthread = (current->flags & PF_KTHREAD) != 0;
+ struct bch_fs *c = dio->op.c;
struct kiocb *req = dio->req;
struct address_space *mapping = req->ki_filp->f_mapping;
- struct bch_inode_info *inode = dio->iop.inode;
- struct bio *bio = &dio->iop.op.wbio.bio;
+ struct bch_inode_info *inode = file_bch_inode(req->ki_filp);
+ struct bio *bio = &dio->op.wbio.bio;
+ struct bvec_iter_all iter;
struct bio_vec *bv;
- loff_t offset;
+ unsigned unaligned;
+ u64 new_i_size;
bool sync;
long ret;
- int i;
if (dio->loop)
goto loop;
- inode_dio_begin(&inode->v);
- __pagecache_block_get(&mapping->add_lock);
-
- /* Write and invalidate pagecache range that we're writing to: */
- offset = req->ki_pos + (dio->iop.op.written << 9);
- ret = write_invalidate_inode_pages_range(mapping,
- offset,
- offset + iov_iter_count(&dio->iter) - 1);
- if (unlikely(ret))
- goto err;
-
while (1) {
- offset = req->ki_pos + (dio->iop.op.written << 9);
-
- BUG_ON(current->pagecache_lock);
- current->pagecache_lock = &mapping->add_lock;
if (kthread)
use_mm(dio->mm);
+ BUG_ON(current->faults_disabled_mapping);
+ current->faults_disabled_mapping = mapping;
ret = bio_iov_iter_get_pages(bio, &dio->iter);
+ current->faults_disabled_mapping = NULL;
if (kthread)
unuse_mm(dio->mm);
- current->pagecache_lock = NULL;
if (unlikely(ret < 0))
goto err;
- /* gup might have faulted pages back in: */
- ret = write_invalidate_inode_pages_range(mapping,
- offset,
- offset + bio->bi_iter.bi_size - 1);
- if (unlikely(ret))
+ unaligned = bio->bi_iter.bi_size & (block_bytes(c) - 1);
+ bio->bi_iter.bi_size -= unaligned;
+ iov_iter_revert(&dio->iter, unaligned);
+
+ if (!bio->bi_iter.bi_size) {
+ /*
+ * bio_iov_iter_get_pages was only able to get <
+ * blocksize worth of pages:
+ */
+ bio_for_each_segment_all(bv, bio, iter)
+ put_page(bv->bv_page);
+ ret = -EFAULT;
goto err;
+ }
- dio->iop.op.pos = POS(inode->v.i_ino, offset >> 9);
+ dio->op.pos = POS(inode->v.i_ino,
+ (req->ki_pos >> 9) + dio->op.written);
task_io_account_write(bio->bi_iter.bi_size);
- closure_call(&dio->iop.op.cl, bch2_write, NULL, &dio->cl);
-
if (!dio->sync && !dio->loop && dio->iter.count) {
struct iovec *iov = dio->inline_vecs;
@@ -2068,8 +1855,8 @@ static long bch2_dio_write_loop(struct dio_write *dio)
iov = kmalloc(dio->iter.nr_segs * sizeof(*iov),
GFP_KERNEL);
if (unlikely(!iov)) {
- dio->iop.op.error = -ENOMEM;
- goto err_wait_io;
+ dio->sync = true;
+ goto do_io;
}
dio->free_iov = true;
@@ -2078,34 +1865,44 @@ static long bch2_dio_write_loop(struct dio_write *dio)
memcpy(iov, dio->iter.iov, dio->iter.nr_segs * sizeof(*iov));
dio->iter.iov = iov;
}
-err_wait_io:
+do_io:
dio->loop = true;
+ closure_call(&dio->op.cl, bch2_write, NULL, NULL);
- if (!dio->sync) {
- continue_at(&dio->cl, bch2_dio_write_loop_async, NULL);
+ if (dio->sync)
+ wait_for_completion(&dio->done);
+ else
return -EIOCBQUEUED;
- }
-
- closure_sync(&dio->cl);
loop:
- bio_for_each_segment_all(bv, bio, i)
+ i_sectors_acct(c, inode, &dio->quota_res,
+ dio->op.i_sectors_delta);
+ dio->op.i_sectors_delta = 0;
+
+ new_i_size = req->ki_pos + ((u64) dio->op.written << 9);
+
+ spin_lock(&inode->v.i_lock);
+ if (new_i_size > inode->v.i_size)
+ i_size_write(&inode->v, new_i_size);
+ spin_unlock(&inode->v.i_lock);
+
+ bio_for_each_segment_all(bv, bio, iter)
put_page(bv->bv_page);
- if (!dio->iter.count || dio->iop.op.error)
+ if (!dio->iter.count || dio->op.error)
break;
+
bio_reset(bio);
+ reinit_completion(&dio->done);
}
- ret = dio->iop.op.error ?: ((long) dio->iop.op.written << 9);
+ ret = dio->op.error ?: ((long) dio->op.written << 9);
err:
- __pagecache_block_put(&mapping->add_lock);
- bch2_disk_reservation_put(dio->iop.op.c, &dio->iop.op.res);
- bch2_quota_reservation_put(dio->iop.op.c, inode, &dio->quota_res);
+ bch2_pagecache_block_put(&inode->ei_pagecache_lock);
+ bch2_disk_reservation_put(c, &dio->op.res);
+ bch2_quota_reservation_put(c, inode, &dio->quota_res);
if (dio->free_iov)
kfree(dio->iter.iov);
- closure_debug_destroy(&dio->cl);
-
sync = dio->sync;
bio_put(bio);
@@ -2119,141 +1916,155 @@ err:
return ret;
}
-static void bch2_dio_write_loop_async(struct closure *cl)
+static void bch2_dio_write_loop_async(struct bch_write_op *op)
{
- struct dio_write *dio = container_of(cl, struct dio_write, cl);
+ struct dio_write *dio = container_of(op, struct dio_write, op);
- bch2_dio_write_loop(dio);
+ if (dio->sync)
+ complete(&dio->done);
+ else
+ bch2_dio_write_loop(dio);
}
-static int bch2_direct_IO_write(struct kiocb *req,
- struct iov_iter *iter,
- bool swap)
+static noinline
+ssize_t bch2_direct_write(struct kiocb *req, struct iov_iter *iter)
{
struct file *file = req->ki_filp;
+ struct address_space *mapping = file->f_mapping;
struct bch_inode_info *inode = file_bch_inode(file);
struct bch_fs *c = inode->v.i_sb->s_fs_info;
+ struct bch_io_opts opts = io_opts(c, &inode->ei_inode);
struct dio_write *dio;
struct bio *bio;
+ bool locked = true, extending;
ssize_t ret;
- lockdep_assert_held(&inode->v.i_rwsem);
+ prefetch(&c->opts);
+ prefetch((void *) &c->opts + 64);
+ prefetch(&inode->ei_inode);
+ prefetch((void *) &inode->ei_inode + 64);
- if (unlikely(!iter->count))
- return 0;
+ inode_lock(&inode->v);
+
+ ret = generic_write_checks(req, iter);
+ if (unlikely(ret <= 0))
+ goto err;
+
+ ret = file_remove_privs(file);
+ if (unlikely(ret))
+ goto err;
+
+ ret = file_update_time(file);
+ if (unlikely(ret))
+ goto err;
if (unlikely((req->ki_pos|iter->count) & (block_bytes(c) - 1)))
- return -EINVAL;
+ goto err;
+
+ inode_dio_begin(&inode->v);
+ bch2_pagecache_block_get(&inode->ei_pagecache_lock);
+
+ extending = req->ki_pos + iter->count > inode->v.i_size;
+ if (!extending) {
+ inode_unlock(&inode->v);
+ locked = false;
+ }
bio = bio_alloc_bioset(GFP_KERNEL,
iov_iter_npages(iter, BIO_MAX_PAGES),
&c->dio_write_bioset);
- dio = container_of(bio, struct dio_write, iop.op.wbio.bio);
- closure_init(&dio->cl, NULL);
+ dio = container_of(bio, struct dio_write, op.wbio.bio);
+ init_completion(&dio->done);
dio->req = req;
dio->mm = current->mm;
dio->loop = false;
- dio->sync = is_sync_kiocb(req) ||
- req->ki_pos + iter->count > inode->v.i_size;
+ dio->sync = is_sync_kiocb(req) || extending;
dio->free_iov = false;
dio->quota_res.sectors = 0;
dio->iter = *iter;
- bch2_fswrite_op_init(&dio->iop, c, inode, io_opts(c, inode), true);
- dio->iop.op.write_point = writepoint_hashed((unsigned long) current);
- dio->iop.op.flags |= BCH_WRITE_NOPUT_RESERVATION;
+
+ bch2_write_op_init(&dio->op, c, opts);
+ dio->op.end_io = bch2_dio_write_loop_async;
+ dio->op.target = opts.foreground_target;
+ op_journal_seq_set(&dio->op, &inode->ei_journal_seq);
+ dio->op.write_point = writepoint_hashed((unsigned long) current);
+ dio->op.flags |= BCH_WRITE_NOPUT_RESERVATION;
if ((req->ki_flags & IOCB_DSYNC) &&
!c->opts.journal_flush_disabled)
- dio->iop.op.flags |= BCH_WRITE_FLUSH;
+ dio->op.flags |= BCH_WRITE_FLUSH;
ret = bch2_quota_reservation_add(c, inode, &dio->quota_res,
iter->count >> 9, true);
if (unlikely(ret))
- goto err;
+ goto err_put_bio;
- dio->iop.op.nr_replicas = dio->iop.op.opts.data_replicas;
+ dio->op.nr_replicas = dio->op.opts.data_replicas;
- ret = bch2_disk_reservation_get(c, &dio->iop.op.res, iter->count >> 9,
- dio->iop.op.opts.data_replicas, 0);
- if (unlikely(ret)) {
- if (!bch2_check_range_allocated(c, POS(inode->v.i_ino,
- req->ki_pos >> 9),
- iter->count >> 9,
- dio->iop.op.opts.data_replicas))
- goto err;
+ ret = bch2_disk_reservation_get(c, &dio->op.res, iter->count >> 9,
+ dio->op.opts.data_replicas, 0);
+ if (unlikely(ret) &&
+ !bch2_check_range_allocated(c, POS(inode->v.i_ino,
+ req->ki_pos >> 9),
+ iter->count >> 9,
+ dio->op.opts.data_replicas))
+ goto err_put_bio;
- dio->iop.unalloc = true;
- }
+ ret = write_invalidate_inode_pages_range(mapping,
+ req->ki_pos,
+ req->ki_pos + iter->count - 1);
+ if (unlikely(ret))
+ goto err_put_bio;
- return bch2_dio_write_loop(dio);
+ ret = bch2_dio_write_loop(dio);
err:
- bch2_disk_reservation_put(c, &dio->iop.op.res);
+ if (locked)
+ inode_unlock(&inode->v);
+ if (ret > 0)
+ req->ki_pos += ret;
+ return ret;
+err_put_bio:
+ bch2_pagecache_block_put(&inode->ei_pagecache_lock);
+ bch2_disk_reservation_put(c, &dio->op.res);
bch2_quota_reservation_put(c, inode, &dio->quota_res);
- closure_debug_destroy(&dio->cl);
bio_put(bio);
- return ret;
-}
-
-ssize_t bch2_direct_IO(struct kiocb *req, struct iov_iter *iter)
-{
- struct blk_plug plug;
- ssize_t ret;
-
- blk_start_plug(&plug);
- ret = iov_iter_rw(iter) == WRITE
- ? bch2_direct_IO_write(req, iter, false)
- : bch2_direct_IO_read(req, iter);
- blk_finish_plug(&plug);
-
- return ret;
-}
-
-static ssize_t
-bch2_direct_write(struct kiocb *iocb, struct iov_iter *iter)
-{
- return bch2_direct_IO_write(iocb, iter, true);
+ inode_dio_end(&inode->v);
+ goto err;
}
-static ssize_t __bch2_write_iter(struct kiocb *iocb, struct iov_iter *from)
+ssize_t bch2_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
struct file *file = iocb->ki_filp;
struct bch_inode_info *inode = file_bch_inode(file);
- ssize_t ret;
+ ssize_t ret;
+
+ if (iocb->ki_flags & IOCB_DIRECT)
+ return bch2_direct_write(iocb, from);
/* We can write back this queue in page reclaim */
current->backing_dev_info = inode_to_bdi(&inode->v);
+ inode_lock(&inode->v);
+
+ ret = generic_write_checks(iocb, from);
+ if (ret <= 0)
+ goto unlock;
+
ret = file_remove_privs(file);
if (ret)
- goto out;
+ goto unlock;
ret = file_update_time(file);
if (ret)
- goto out;
-
- ret = iocb->ki_flags & IOCB_DIRECT
- ? bch2_direct_write(iocb, from)
- : bch2_buffered_write(iocb, from);
+ goto unlock;
+ ret = bch2_buffered_write(iocb, from);
if (likely(ret > 0))
iocb->ki_pos += ret;
-out:
+unlock:
+ inode_unlock(&inode->v);
current->backing_dev_info = NULL;
- return ret;
-}
-
-ssize_t bch2_write_iter(struct kiocb *iocb, struct iov_iter *from)
-{
- struct bch_inode_info *inode = file_bch_inode(iocb->ki_filp);
- bool direct = iocb->ki_flags & IOCB_DIRECT;
- ssize_t ret;
- inode_lock(&inode->v);
- ret = generic_write_checks(iocb, from);
if (ret > 0)
- ret = __bch2_write_iter(iocb, from);
- inode_unlock(&inode->v);
-
- if (ret > 0 && !direct)
ret = generic_write_sync(iocb, ret);
return ret;
@@ -2288,80 +2099,6 @@ out:
/* truncate: */
-int bch2_fpunch_at(struct btree_trans *trans, struct btree_iter *iter,
- struct bpos end, struct bch_inode_info *inode,
- u64 new_i_size)
-{
- struct bch_fs *c = trans->c;
- unsigned max_sectors = KEY_SIZE_MAX & (~0 << c->block_bits);
- struct bkey_s_c k;
- int ret = 0, ret2 = 0;
-
- while ((k = bch2_btree_iter_peek(iter)).k &&
- bkey_cmp(iter->pos, end) < 0) {
- struct disk_reservation disk_res =
- bch2_disk_reservation_init(c, 0);
- struct bkey_i delete;
-
- ret = bkey_err(k);
- if (ret)
- goto btree_err;
-
- bkey_init(&delete.k);
- delete.k.p = iter->pos;
-
- /* create the biggest key we can */
- bch2_key_resize(&delete.k, max_sectors);
- bch2_cut_back(end, &delete.k);
-
- bch2_trans_begin_updates(trans);
-
- ret = bch2_extent_update(trans, inode,
- &disk_res, NULL, iter, &delete,
- new_i_size, false, true, NULL);
- bch2_disk_reservation_put(c, &disk_res);
-btree_err:
- if (ret == -EINTR) {
- ret2 = ret;
- ret = 0;
- }
- if (ret)
- break;
- }
-
- if (bkey_cmp(iter->pos, end) > 0) {
- bch2_btree_iter_set_pos(iter, end);
- ret = bch2_btree_iter_traverse(iter);
- }
-
- return ret ?: ret2;
-}
-
-static int __bch2_fpunch(struct bch_fs *c, struct bch_inode_info *inode,
- u64 start_offset, u64 end_offset)
-{
- struct btree_trans trans;
- struct btree_iter *iter;
- int ret = 0;
-
- bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024);
-
- iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS,
- POS(inode->v.i_ino, start_offset),
- BTREE_ITER_INTENT);
-
- ret = bch2_fpunch_at(&trans, iter,
- POS(inode->v.i_ino, end_offset),
- inode, 0);
-
- bch2_trans_exit(&trans);
-
- if (ret == -EINTR)
- ret = 0;
-
- return ret;
-}
-
static inline int range_has_data(struct bch_fs *c,
struct bpos start,
struct bpos end)
@@ -2475,14 +2212,20 @@ static int bch2_truncate_page(struct bch_inode_info *inode, loff_t from)
from, round_up(from, PAGE_SIZE));
}
-static int bch2_extend(struct bch_inode_info *inode, struct iattr *iattr)
+static int bch2_extend(struct bch_inode_info *inode,
+ struct bch_inode_unpacked *inode_u,
+ struct iattr *iattr)
{
struct bch_fs *c = inode->v.i_sb->s_fs_info;
struct address_space *mapping = inode->v.i_mapping;
int ret;
- ret = filemap_write_and_wait_range(mapping,
- inode->ei_inode.bi_size, S64_MAX);
+ /*
+ * sync appends:
+ *
+ * this has to be done _before_ extending i_size:
+ */
+ ret = filemap_write_and_wait_range(mapping, inode_u->bi_size, S64_MAX);
if (ret)
return ret;
@@ -2522,19 +2265,32 @@ int bch2_truncate(struct bch_inode_info *inode, struct iattr *iattr)
{
struct bch_fs *c = inode->v.i_sb->s_fs_info;
struct address_space *mapping = inode->v.i_mapping;
+ struct bch_inode_unpacked inode_u;
+ struct btree_trans trans;
+ struct btree_iter *iter;
u64 new_i_size = iattr->ia_size;
- bool shrink;
+ s64 i_sectors_delta = 0;
int ret = 0;
inode_dio_wait(&inode->v);
- pagecache_block_get(&mapping->add_lock);
+ bch2_pagecache_block_get(&inode->ei_pagecache_lock);
+
+ /*
+ * fetch current on disk i_size: inode is locked, i_size can only
+ * increase underneath us:
+ */
+ bch2_trans_init(&trans, c, 0, 0);
+ iter = bch2_inode_peek(&trans, &inode_u, inode->v.i_ino, 0);
+ ret = PTR_ERR_OR_ZERO(iter);
+ bch2_trans_exit(&trans);
- BUG_ON(inode->v.i_size < inode->ei_inode.bi_size);
+ if (ret)
+ goto err;
- shrink = iattr->ia_size <= inode->v.i_size;
+ BUG_ON(inode->v.i_size < inode_u.bi_size);
- if (!shrink) {
- ret = bch2_extend(inode, iattr);
+ if (iattr->ia_size > inode->v.i_size) {
+ ret = bch2_extend(inode, &inode_u, iattr);
goto err;
}
@@ -2552,9 +2308,9 @@ int bch2_truncate(struct bch_inode_info *inode, struct iattr *iattr)
* userspace has to redirty it and call .mkwrite -> set_page_dirty
* again to allocate the part of the page that was extended.
*/
- if (iattr->ia_size > inode->ei_inode.bi_size)
+ if (iattr->ia_size > inode_u.bi_size)
ret = filemap_write_and_wait_range(mapping,
- inode->ei_inode.bi_size,
+ inode_u.bi_size,
iattr->ia_size - 1);
else if (iattr->ia_size & (PAGE_SIZE - 1))
ret = filemap_write_and_wait_range(mapping,
@@ -2573,9 +2329,11 @@ int bch2_truncate(struct bch_inode_info *inode, struct iattr *iattr)
truncate_setsize(&inode->v, iattr->ia_size);
- ret = __bch2_fpunch(c, inode,
+ ret = bch2_fpunch(c, inode->v.i_ino,
round_up(iattr->ia_size, block_bytes(c)) >> 9,
- U64_MAX);
+ U64_MAX, &inode->ei_journal_seq, &i_sectors_delta);
+ i_sectors_acct(c, inode, NULL, i_sectors_delta);
+
if (unlikely(ret))
goto err;
@@ -2586,23 +2344,22 @@ int bch2_truncate(struct bch_inode_info *inode, struct iattr *iattr)
ATTR_MTIME|ATTR_CTIME);
mutex_unlock(&inode->ei_update_lock);
err:
- pagecache_block_put(&mapping->add_lock);
+ bch2_pagecache_block_put(&inode->ei_pagecache_lock);
return ret;
}
/* fallocate: */
-static long bch2_fpunch(struct bch_inode_info *inode, loff_t offset, loff_t len)
+static long bchfs_fpunch(struct bch_inode_info *inode, loff_t offset, loff_t len)
{
struct bch_fs *c = inode->v.i_sb->s_fs_info;
- struct address_space *mapping = inode->v.i_mapping;
u64 discard_start = round_up(offset, block_bytes(c)) >> 9;
u64 discard_end = round_down(offset + len, block_bytes(c)) >> 9;
int ret = 0;
inode_lock(&inode->v);
inode_dio_wait(&inode->v);
- pagecache_block_get(&mapping->add_lock);
+ bch2_pagecache_block_get(&inode->ei_pagecache_lock);
ret = __bch2_truncate_page(inode,
offset >> PAGE_SHIFT,
@@ -2621,21 +2378,29 @@ static long bch2_fpunch(struct bch_inode_info *inode, loff_t offset, loff_t len)
truncate_pagecache_range(&inode->v, offset, offset + len - 1);
- if (discard_start < discard_end)
- ret = __bch2_fpunch(c, inode, discard_start, discard_end);
+ if (discard_start < discard_end) {
+ s64 i_sectors_delta = 0;
+
+ ret = bch2_fpunch(c, inode->v.i_ino,
+ discard_start, discard_end,
+ &inode->ei_journal_seq,
+ &i_sectors_delta);
+ i_sectors_acct(c, inode, NULL, i_sectors_delta);
+ }
err:
- pagecache_block_put(&mapping->add_lock);
+ bch2_pagecache_block_put(&inode->ei_pagecache_lock);
inode_unlock(&inode->v);
return ret;
}
-static long bch2_fcollapse_finsert(struct bch_inode_info *inode,
+static long bchfs_fcollapse_finsert(struct bch_inode_info *inode,
loff_t offset, loff_t len,
bool insert)
{
struct bch_fs *c = inode->v.i_sb->s_fs_info;
struct address_space *mapping = inode->v.i_mapping;
+ struct bkey_on_stack copy;
struct btree_trans trans;
struct btree_iter *src, *dst, *del = NULL;
loff_t shift, new_size;
@@ -2645,6 +2410,7 @@ static long bch2_fcollapse_finsert(struct bch_inode_info *inode,
if ((offset | len) & (block_bytes(c) - 1))
return -EINVAL;
+ bkey_on_stack_init(&copy);
bch2_trans_init(&trans, c, BTREE_ITER_MAX, 256);
/*
@@ -2655,7 +2421,7 @@ static long bch2_fcollapse_finsert(struct bch_inode_info *inode,
*/
inode_lock(&inode->v);
inode_dio_wait(&inode->v);
- pagecache_block_get(&mapping->add_lock);
+ bch2_pagecache_block_get(&inode->ei_pagecache_lock);
if (insert) {
ret = -EFBIG;
@@ -2690,8 +2456,14 @@ static long bch2_fcollapse_finsert(struct bch_inode_info *inode,
ATTR_MTIME|ATTR_CTIME);
mutex_unlock(&inode->ei_update_lock);
} else {
- ret = __bch2_fpunch(c, inode, offset >> 9,
- (offset + len) >> 9);
+ s64 i_sectors_delta = 0;
+
+ ret = bch2_fpunch(c, inode->v.i_ino,
+ offset >> 9, (offset + len) >> 9,
+ &inode->ei_journal_seq,
+ &i_sectors_delta);
+ i_sectors_acct(c, inode, NULL, i_sectors_delta);
+
if (ret)
goto err;
}
@@ -2707,7 +2479,6 @@ static long bch2_fcollapse_finsert(struct bch_inode_info *inode,
while (1) {
struct disk_reservation disk_res =
bch2_disk_reservation_init(c, 0);
- BKEY_PADDED(k) copy;
struct bkey_i delete;
struct bkey_s_c k;
struct bpos next_pos;
@@ -2732,38 +2503,34 @@ static long bch2_fcollapse_finsert(struct bch_inode_info *inode,
bkey_cmp(k.k->p, POS(inode->v.i_ino, offset >> 9)) <= 0)
break;
reassemble:
- bkey_reassemble(&copy.k, k);
+ bkey_on_stack_reassemble(&copy, c, k);
if (insert &&
bkey_cmp(bkey_start_pos(k.k), move_pos) < 0) {
- bch2_cut_front(move_pos, &copy.k);
- bch2_btree_iter_set_pos(src, bkey_start_pos(&copy.k.k));
+ bch2_cut_front(move_pos, copy.k);
+ bch2_btree_iter_set_pos(src, bkey_start_pos(&copy.k->k));
}
- copy.k.k.p.offset += shift >> 9;
- bch2_btree_iter_set_pos(dst, bkey_start_pos(&copy.k.k));
+ copy.k->k.p.offset += shift >> 9;
+ bch2_btree_iter_set_pos(dst, bkey_start_pos(&copy.k->k));
- ret = bch2_btree_iter_traverse(dst);
+ ret = bch2_extent_atomic_end(dst, copy.k, &atomic_end);
if (ret)
goto bkey_err;
- ret = bch2_extent_atomic_end(dst, &copy.k, &atomic_end);
- if (ret)
- goto bkey_err;
-
- if (bkey_cmp(atomic_end, copy.k.k.p)) {
+ if (bkey_cmp(atomic_end, copy.k->k.p)) {
if (insert) {
move_pos = atomic_end;
move_pos.offset -= shift >> 9;
goto reassemble;
} else {
- bch2_cut_back(atomic_end, &copy.k.k);
+ bch2_cut_back(atomic_end, copy.k);
}
}
bkey_init(&delete.k);
delete.k.p = src->pos;
- bch2_key_resize(&delete.k, copy.k.k.size);
+ bch2_key_resize(&delete.k, copy.k->k.size);
next_pos = insert ? bkey_start_pos(&delete.k) : delete.k.p;
@@ -2776,12 +2543,12 @@ reassemble:
* by the triggers machinery:
*/
if (insert &&
- bkey_cmp(bkey_start_pos(&copy.k.k), delete.k.p) < 0) {
- bch2_cut_back(bkey_start_pos(&copy.k.k), &delete.k);
+ bkey_cmp(bkey_start_pos(&copy.k->k), delete.k.p) < 0) {
+ bch2_cut_back(bkey_start_pos(&copy.k->k), &delete);
} else if (!insert &&
- bkey_cmp(copy.k.k.p,
+ bkey_cmp(copy.k->k.p,
bkey_start_pos(&delete.k)) > 0) {
- bch2_cut_front(copy.k.k.p, &delete);
+ bch2_cut_front(copy.k->k.p, &delete);
del = bch2_trans_copy_iter(&trans, src);
BUG_ON(IS_ERR_OR_NULL(del));
@@ -2790,11 +2557,10 @@ reassemble:
bkey_start_pos(&delete.k));
}
- bch2_trans_update(&trans, BTREE_INSERT_ENTRY(dst, &copy.k));
- bch2_trans_update(&trans,
- BTREE_INSERT_ENTRY(del ?: src, &delete));
+ bch2_trans_update(&trans, dst, copy.k);
+ bch2_trans_update(&trans, del ?: src, &delete);
- if (copy.k.k.size == k.k->size) {
+ if (copy.k->k.size == k.k->size) {
/*
* If we're moving the entire extent, we can skip
* running triggers:
@@ -2803,10 +2569,10 @@ reassemble:
} else {
/* We might end up splitting compressed extents: */
unsigned nr_ptrs =
- bch2_bkey_nr_dirty_ptrs(bkey_i_to_s_c(&copy.k));
+ bch2_bkey_nr_ptrs_allocated(bkey_i_to_s_c(copy.k));
ret = bch2_disk_reservation_get(c, &disk_res,
- copy.k.k.size, nr_ptrs,
+ copy.k->k.size, nr_ptrs,
BCH_DISK_RESERVATION_NOFAIL);
BUG_ON(ret);
}
@@ -2817,7 +2583,7 @@ reassemble:
bch2_disk_reservation_put(c, &disk_res);
bkey_err:
if (del)
- bch2_trans_iter_free(&trans, del);
+ bch2_trans_iter_put(&trans, del);
del = NULL;
if (!ret)
@@ -2841,13 +2607,14 @@ bkey_err:
}
err:
bch2_trans_exit(&trans);
- pagecache_block_put(&mapping->add_lock);
+ bkey_on_stack_exit(&copy, c);
+ bch2_pagecache_block_put(&inode->ei_pagecache_lock);
inode_unlock(&inode->v);
return ret;
}
-static long bch2_fallocate(struct bch_inode_info *inode, int mode,
- loff_t offset, loff_t len)
+static long bchfs_fallocate(struct bch_inode_info *inode, int mode,
+ loff_t offset, loff_t len)
{
struct address_space *mapping = inode->v.i_mapping;
struct bch_fs *c = inode->v.i_sb->s_fs_info;
@@ -2858,14 +2625,14 @@ static long bch2_fallocate(struct bch_inode_info *inode, int mode,
loff_t block_start = round_down(offset, block_bytes(c));
loff_t block_end = round_up(end, block_bytes(c));
unsigned sectors;
- unsigned replicas = io_opts(c, inode).data_replicas;
+ unsigned replicas = io_opts(c, &inode->ei_inode).data_replicas;
int ret;
bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
inode_lock(&inode->v);
inode_dio_wait(&inode->v);
- pagecache_block_get(&mapping->add_lock);
+ bch2_pagecache_block_get(&inode->ei_pagecache_lock);
if (!(mode & FALLOC_FL_KEEP_SIZE) && end > inode->v.i_size) {
ret = inode_newsize_ok(&inode->v, end);
@@ -2896,6 +2663,7 @@ static long bch2_fallocate(struct bch_inode_info *inode, int mode,
end_pos = POS(inode->v.i_ino, block_end >> 9);
while (bkey_cmp(iter->pos, end_pos) < 0) {
+ s64 i_sectors_delta = 0;
struct disk_reservation disk_res = { 0 };
struct quota_res quota_res = { 0 };
struct bkey_i_reservation reservation;
@@ -2923,11 +2691,11 @@ static long bch2_fallocate(struct bch_inode_info *inode, int mode,
reservation.k.p = k.k->p;
reservation.k.size = k.k->size;
- bch2_cut_front(iter->pos, &reservation.k_i);
- bch2_cut_back(end_pos, &reservation.k);
+ bch2_cut_front(iter->pos, &reservation.k_i);
+ bch2_cut_back(end_pos, &reservation.k_i);
sectors = reservation.k.size;
- reservation.v.nr_replicas = bch2_bkey_nr_dirty_ptrs(k);
+ reservation.v.nr_replicas = bch2_bkey_nr_ptrs_allocated(k);
if (!bkey_extent_is_allocation(k.k)) {
ret = bch2_quota_reservation_add(c, inode,
@@ -2938,7 +2706,7 @@ static long bch2_fallocate(struct bch_inode_info *inode, int mode,
}
if (reservation.v.nr_replicas < replicas ||
- bch2_extent_is_compressed(k)) {
+ bch2_bkey_sectors_compressed(k)) {
ret = bch2_disk_reservation_get(c, &disk_res, sectors,
replicas, 0);
if (unlikely(ret))
@@ -2949,10 +2717,10 @@ static long bch2_fallocate(struct bch_inode_info *inode, int mode,
bch2_trans_begin_updates(&trans);
- ret = bch2_extent_update(&trans, inode,
- &disk_res, &quota_res,
- iter, &reservation.k_i,
- 0, true, true, NULL);
+ ret = bch2_extent_update(&trans, iter, &reservation.k_i,
+ &disk_res, &inode->ei_journal_seq,
+ 0, &i_sectors_delta);
+ i_sectors_acct(c, inode, &quota_res, i_sectors_delta);
bkey_err:
bch2_quota_reservation_put(c, inode, &quota_res);
bch2_disk_reservation_put(c, &disk_res);
@@ -2961,37 +2729,53 @@ bkey_err:
if (ret)
goto err;
}
- bch2_trans_unlock(&trans);
- if (!(mode & FALLOC_FL_KEEP_SIZE) &&
- end > inode->v.i_size) {
- i_size_write(&inode->v, end);
+ /*
+ * Do we need to extend the file?
+ *
+ * If we zeroed up to the end of the file, we dropped whatever writes
+ * were going to write out the current i_size, so we have to extend
+ * manually even if FL_KEEP_SIZE was set:
+ */
+ if (end >= inode->v.i_size &&
+ (!(mode & FALLOC_FL_KEEP_SIZE) ||
+ (mode & FALLOC_FL_ZERO_RANGE))) {
+ struct btree_iter *inode_iter;
+ struct bch_inode_unpacked inode_u;
- mutex_lock(&inode->ei_update_lock);
- ret = bch2_write_inode_size(c, inode, inode->v.i_size, 0);
- mutex_unlock(&inode->ei_update_lock);
- }
+ do {
+ bch2_trans_begin(&trans);
+ inode_iter = bch2_inode_peek(&trans, &inode_u,
+ inode->v.i_ino, 0);
+ ret = PTR_ERR_OR_ZERO(inode_iter);
+ } while (ret == -EINTR);
- /* blech */
- if ((mode & FALLOC_FL_KEEP_SIZE) &&
- (mode & FALLOC_FL_ZERO_RANGE) &&
- inode->ei_inode.bi_size != inode->v.i_size) {
- /* sync appends.. */
+ bch2_trans_unlock(&trans);
+
+ if (ret)
+ goto err;
+
+ /*
+ * Sync existing appends before extending i_size,
+ * as in bch2_extend():
+ */
ret = filemap_write_and_wait_range(mapping,
- inode->ei_inode.bi_size, S64_MAX);
+ inode_u.bi_size, S64_MAX);
if (ret)
goto err;
- if (inode->ei_inode.bi_size != inode->v.i_size) {
- mutex_lock(&inode->ei_update_lock);
- ret = bch2_write_inode_size(c, inode,
- inode->v.i_size, 0);
- mutex_unlock(&inode->ei_update_lock);
- }
+ if (mode & FALLOC_FL_KEEP_SIZE)
+ end = inode->v.i_size;
+ else
+ i_size_write(&inode->v, end);
+
+ mutex_lock(&inode->ei_update_lock);
+ ret = bch2_write_inode_size(c, inode, end, 0);
+ mutex_unlock(&inode->ei_update_lock);
}
err:
bch2_trans_exit(&trans);
- pagecache_block_put(&mapping->add_lock);
+ bch2_pagecache_block_put(&inode->ei_pagecache_lock);
inode_unlock(&inode->v);
return ret;
}
@@ -3000,20 +2784,26 @@ long bch2_fallocate_dispatch(struct file *file, int mode,
loff_t offset, loff_t len)
{
struct bch_inode_info *inode = file_bch_inode(file);
+ struct bch_fs *c = inode->v.i_sb->s_fs_info;
+ long ret;
- if (!(mode & ~(FALLOC_FL_KEEP_SIZE|FALLOC_FL_ZERO_RANGE)))
- return bch2_fallocate(inode, mode, offset, len);
-
- if (mode == (FALLOC_FL_PUNCH_HOLE|FALLOC_FL_KEEP_SIZE))
- return bch2_fpunch(inode, offset, len);
+ if (!percpu_ref_tryget(&c->writes))
+ return -EROFS;
- if (mode == FALLOC_FL_INSERT_RANGE)
- return bch2_fcollapse_finsert(inode, offset, len, true);
+ if (!(mode & ~(FALLOC_FL_KEEP_SIZE|FALLOC_FL_ZERO_RANGE)))
+ ret = bchfs_fallocate(inode, mode, offset, len);
+ else if (mode == (FALLOC_FL_PUNCH_HOLE|FALLOC_FL_KEEP_SIZE))
+ ret = bchfs_fpunch(inode, offset, len);
+ else if (mode == FALLOC_FL_INSERT_RANGE)
+ ret = bchfs_fcollapse_finsert(inode, offset, len, true);
+ else if (mode == FALLOC_FL_COLLAPSE_RANGE)
+ ret = bchfs_fcollapse_finsert(inode, offset, len, false);
+ else
+ ret = -EOPNOTSUPP;
- if (mode == FALLOC_FL_COLLAPSE_RANGE)
- return bch2_fcollapse_finsert(inode, offset, len, false);
+ percpu_ref_put(&c->writes);
- return -EOPNOTSUPP;
+ return ret;
}
static void mark_range_unallocated(struct bch_inode_info *inode,
@@ -3040,9 +2830,12 @@ static void mark_range_unallocated(struct bch_inode_info *inode,
lock_page(page);
s = bch2_page_state(page);
- if (s)
+ if (s) {
+ spin_lock(&s->lock);
for (j = 0; j < PAGE_SECTORS; j++)
s->s[j].nr_replicas = 0;
+ spin_unlock(&s->lock);
+ }
unlock_page(page);
}
@@ -3050,235 +2843,6 @@ static void mark_range_unallocated(struct bch_inode_info *inode,
} while (index <= end_index);
}
-static int generic_access_check_limits(struct file *file, loff_t pos,
- loff_t *count)
-{
- struct inode *inode = file->f_mapping->host;
- loff_t max_size = inode->i_sb->s_maxbytes;
-
- if (!(file->f_flags & O_LARGEFILE))
- max_size = MAX_NON_LFS;
-
- if (unlikely(pos >= max_size))
- return -EFBIG;
- *count = min(*count, max_size - pos);
- return 0;
-}
-
-static int generic_write_check_limits(struct file *file, loff_t pos,
- loff_t *count)
-{
- loff_t limit = rlimit(RLIMIT_FSIZE);
-
- if (limit != RLIM_INFINITY) {
- if (pos >= limit) {
- send_sig(SIGXFSZ, current, 0);
- return -EFBIG;
- }
- *count = min(*count, limit - pos);
- }
-
- return generic_access_check_limits(file, pos, count);
-}
-
-static int generic_remap_checks(struct file *file_in, loff_t pos_in,
- struct file *file_out, loff_t pos_out,
- loff_t *req_count, unsigned int remap_flags)
-{
- struct inode *inode_in = file_in->f_mapping->host;
- struct inode *inode_out = file_out->f_mapping->host;
- uint64_t count = *req_count;
- uint64_t bcount;
- loff_t size_in, size_out;
- loff_t bs = inode_out->i_sb->s_blocksize;
- int ret;
-
- /* The start of both ranges must be aligned to an fs block. */
- if (!IS_ALIGNED(pos_in, bs) || !IS_ALIGNED(pos_out, bs))
- return -EINVAL;
-
- /* Ensure offsets don't wrap. */
- if (pos_in + count < pos_in || pos_out + count < pos_out)
- return -EINVAL;
-
- size_in = i_size_read(inode_in);
- size_out = i_size_read(inode_out);
-
- /* Dedupe requires both ranges to be within EOF. */
- if ((remap_flags & REMAP_FILE_DEDUP) &&
- (pos_in >= size_in || pos_in + count > size_in ||
- pos_out >= size_out || pos_out + count > size_out))
- return -EINVAL;
-
- /* Ensure the infile range is within the infile. */
- if (pos_in >= size_in)
- return -EINVAL;
- count = min(count, size_in - (uint64_t)pos_in);
-
- ret = generic_access_check_limits(file_in, pos_in, &count);
- if (ret)
- return ret;
-
- ret = generic_write_check_limits(file_out, pos_out, &count);
- if (ret)
- return ret;
-
- /*
- * If the user wanted us to link to the infile's EOF, round up to the
- * next block boundary for this check.
- *
- * Otherwise, make sure the count is also block-aligned, having
- * already confirmed the starting offsets' block alignment.
- */
- if (pos_in + count == size_in) {
- bcount = ALIGN(size_in, bs) - pos_in;
- } else {
- if (!IS_ALIGNED(count, bs))
- count = ALIGN_DOWN(count, bs);
- bcount = count;
- }
-
- /* Don't allow overlapped cloning within the same file. */
- if (inode_in == inode_out &&
- pos_out + bcount > pos_in &&
- pos_out < pos_in + bcount)
- return -EINVAL;
-
- /*
- * We shortened the request but the caller can't deal with that, so
- * bounce the request back to userspace.
- */
- if (*req_count != count && !(remap_flags & REMAP_FILE_CAN_SHORTEN))
- return -EINVAL;
-
- *req_count = count;
- return 0;
-}
-
-static int generic_remap_check_len(struct inode *inode_in,
- struct inode *inode_out,
- loff_t pos_out,
- loff_t *len,
- unsigned int remap_flags)
-{
- u64 blkmask = i_blocksize(inode_in) - 1;
- loff_t new_len = *len;
-
- if ((*len & blkmask) == 0)
- return 0;
-
- if ((remap_flags & REMAP_FILE_DEDUP) ||
- pos_out + *len < i_size_read(inode_out))
- new_len &= ~blkmask;
-
- if (new_len == *len)
- return 0;
-
- if (remap_flags & REMAP_FILE_CAN_SHORTEN) {
- *len = new_len;
- return 0;
- }
-
- return (remap_flags & REMAP_FILE_DEDUP) ? -EBADE : -EINVAL;
-}
-
-static int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in,
- struct file *file_out, loff_t pos_out,
- loff_t *len, unsigned int remap_flags)
-{
- struct inode *inode_in = file_inode(file_in);
- struct inode *inode_out = file_inode(file_out);
- bool same_inode = (inode_in == inode_out);
- int ret;
-
- /* Don't touch certain kinds of inodes */
- if (IS_IMMUTABLE(inode_out))
- return -EPERM;
-
- if (IS_SWAPFILE(inode_in) || IS_SWAPFILE(inode_out))
- return -ETXTBSY;
-
- /* Don't reflink dirs, pipes, sockets... */
- if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode))
- return -EISDIR;
- if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode))
- return -EINVAL;
-
- /* Zero length dedupe exits immediately; reflink goes to EOF. */
- if (*len == 0) {
- loff_t isize = i_size_read(inode_in);
-
- if ((remap_flags & REMAP_FILE_DEDUP) || pos_in == isize)
- return 0;
- if (pos_in > isize)
- return -EINVAL;
- *len = isize - pos_in;
- if (*len == 0)
- return 0;
- }
-
- /* Check that we don't violate system file offset limits. */
- ret = generic_remap_checks(file_in, pos_in, file_out, pos_out, len,
- remap_flags);
- if (ret)
- return ret;
-
- /* Wait for the completion of any pending IOs on both files */
- inode_dio_wait(inode_in);
- if (!same_inode)
- inode_dio_wait(inode_out);
-
- ret = filemap_write_and_wait_range(inode_in->i_mapping,
- pos_in, pos_in + *len - 1);
- if (ret)
- return ret;
-
- ret = filemap_write_and_wait_range(inode_out->i_mapping,
- pos_out, pos_out + *len - 1);
- if (ret)
- return ret;
-
- /*
- * Check that the extents are the same.
- */
- if (remap_flags & REMAP_FILE_DEDUP) {
- bool is_same = false;
-
- ret = vfs_dedupe_file_range_compare(inode_in, pos_in,
- inode_out, pos_out, *len, &is_same);
- if (ret)
- return ret;
- if (!is_same)
- return -EBADE;
- }
-
- ret = generic_remap_check_len(inode_in, inode_out, pos_out, len,
- remap_flags);
- if (ret)
- return ret;
-
- /* If can't alter the file contents, we're done. */
- if (!(remap_flags & REMAP_FILE_DEDUP)) {
- /* Update the timestamps, since we can alter file contents. */
- if (!(file_out->f_mode & FMODE_NOCMTIME)) {
- ret = file_update_time(file_out);
- if (ret)
- return ret;
- }
-
- /*
- * Clear the security bits if the process is not being run by
- * root. This keeps people from modifying setuid and setgid
- * binaries.
- */
- ret = file_remove_privs(file_out);
- if (ret)
- return ret;
- }
-
- return 0;
-}
-
loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src,
struct file *file_dst, loff_t pos_dst,
loff_t len, unsigned remap_flags)
@@ -3286,8 +2850,9 @@ loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src,
struct bch_inode_info *src = file_bch_inode(file_src);
struct bch_inode_info *dst = file_bch_inode(file_dst);
struct bch_fs *c = src->v.i_sb->s_fs_info;
+ s64 i_sectors_delta = 0;
+ u64 aligned_len;
loff_t ret = 0;
- loff_t aligned_len;
if (remap_flags & ~(REMAP_FILE_DEDUP|REMAP_FILE_ADVISORY))
return -EINVAL;
@@ -3303,42 +2868,51 @@ loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src,
abs(pos_src - pos_dst) < len)
return -EINVAL;
- bch2_lock_inodes(INODE_LOCK, src, dst);
+ bch2_lock_inodes(INODE_LOCK|INODE_PAGECACHE_BLOCK, src, dst);
+
+ file_update_time(file_dst);
inode_dio_wait(&src->v);
inode_dio_wait(&dst->v);
- __pagecache_block_get(&src->v.i_mapping->add_lock);
- __pagecache_block_get(&dst->v.i_mapping->add_lock);
-
ret = generic_remap_file_range_prep(file_src, pos_src,
file_dst, pos_dst,
&len, remap_flags);
if (ret < 0 || len == 0)
- goto out_unlock;
+ goto err;
- aligned_len = round_up(len, block_bytes(c));
+ aligned_len = round_up((u64) len, block_bytes(c));
ret = write_invalidate_inode_pages_range(dst->v.i_mapping,
- pos_dst, pos_dst + aligned_len);
+ pos_dst, pos_dst + len - 1);
if (ret)
- goto out_unlock;
+ goto err;
mark_range_unallocated(src, pos_src, pos_src + aligned_len);
- ret = bch2_remap_range(c, dst,
+ ret = bch2_remap_range(c,
POS(dst->v.i_ino, pos_dst >> 9),
POS(src->v.i_ino, pos_src >> 9),
aligned_len >> 9,
- pos_dst + len);
- if (ret > 0)
- ret = min(ret << 9, len);
+ &dst->ei_journal_seq,
+ pos_dst + len, &i_sectors_delta);
+ if (ret < 0)
+ goto err;
-out_unlock:
- __pagecache_block_put(&dst->v.i_mapping->add_lock);
- __pagecache_block_put(&src->v.i_mapping->add_lock);
+ /*
+	 * due to alignment, we might have remapped slightly more than requested
+ */
+ ret = min((u64) ret << 9, (u64) len);
+
+ /* XXX get a quota reservation */
+ i_sectors_acct(c, dst, NULL, i_sectors_delta);
- bch2_unlock_inodes(INODE_LOCK, src, dst);
+ spin_lock(&dst->v.i_lock);
+ if (pos_dst + ret > dst->v.i_size)
+ i_size_write(&dst->v, pos_dst + ret);
+ spin_unlock(&dst->v.i_lock);
+err:
+ bch2_unlock_inodes(INODE_LOCK|INODE_PAGECACHE_BLOCK, src, dst);
return ret;
}
@@ -3461,7 +3035,7 @@ static loff_t page_hole_offset(struct address_space *mapping, loff_t offset)
loff_t ret = -1;
page = find_lock_entry(mapping, index);
- if (!page || radix_tree_exception(page))
+ if (!page || xa_is_value(page))
return offset;
pg_offset = __page_hole_offset(page, offset & (PAGE_SIZE - 1));
@@ -3567,13 +3141,13 @@ int bch2_fs_fsio_init(struct bch_fs *c)
pr_verbose_init(c->opts, "");
if (bioset_init(&c->writepage_bioset,
- 4, offsetof(struct bch_writepage_io, op.op.wbio.bio),
+ 4, offsetof(struct bch_writepage_io, op.wbio.bio),
BIOSET_NEED_BVECS) ||
bioset_init(&c->dio_read_bioset,
4, offsetof(struct dio_read, rbio.bio),
BIOSET_NEED_BVECS) ||
bioset_init(&c->dio_write_bioset,
- 4, offsetof(struct dio_write, iop.op.wbio.bio),
+ 4, offsetof(struct dio_write, op.wbio.bio),
BIOSET_NEED_BVECS))
ret = -ENOMEM;
diff --git a/fs/bcachefs/fs-io.h b/fs/bcachefs/fs-io.h
index b0cbc849a59b..7063556d289b 100644
--- a/fs/bcachefs/fs-io.h
+++ b/fs/bcachefs/fs-io.h
@@ -11,16 +11,6 @@
struct quota_res;
-int bch2_extent_update(struct btree_trans *,
- struct bch_inode_info *,
- struct disk_reservation *,
- struct quota_res *,
- struct btree_iter *,
- struct bkey_i *,
- u64, bool, bool, s64 *);
-int bch2_fpunch_at(struct btree_trans *, struct btree_iter *,
- struct bpos, struct bch_inode_info *, u64);
-
int __must_check bch2_write_inode_size(struct bch_fs *,
struct bch_inode_info *,
loff_t, unsigned);
@@ -37,8 +27,7 @@ int bch2_write_begin(struct file *, struct address_space *, loff_t,
int bch2_write_end(struct file *, struct address_space *, loff_t,
unsigned, unsigned, struct page *, void *);
-ssize_t bch2_direct_IO(struct kiocb *, struct iov_iter *);
-
+ssize_t bch2_read_iter(struct kiocb *, struct iov_iter *);
ssize_t bch2_write_iter(struct kiocb *, struct iov_iter *);
int bch2_fsync(struct file *, loff_t, loff_t, int);
@@ -46,15 +35,12 @@ int bch2_fsync(struct file *, loff_t, loff_t, int);
int bch2_truncate(struct bch_inode_info *, struct iattr *);
long bch2_fallocate_dispatch(struct file *, int, loff_t, loff_t);
-#define REMAP_FILE_ADVISORY (0)
-#define REMAP_FILE_DEDUP (1 << 0)
-#define REMAP_FILE_CAN_SHORTEN (1 << 1)
-
loff_t bch2_remap_file_range(struct file *, loff_t, struct file *,
loff_t, loff_t, unsigned);
loff_t bch2_llseek(struct file *, loff_t, int);
+vm_fault_t bch2_page_fault(struct vm_fault *);
vm_fault_t bch2_page_mkwrite(struct vm_fault *);
void bch2_invalidatepage(struct page *, unsigned int, unsigned int);
int bch2_releasepage(struct page *, gfp_t);
diff --git a/fs/bcachefs/fs-ioctl.c b/fs/bcachefs/fs-ioctl.c
index e80576f5a980..031e6d931171 100644
--- a/fs/bcachefs/fs-ioctl.c
+++ b/fs/bcachefs/fs-ioctl.c
@@ -5,6 +5,7 @@
#include "chardev.h"
#include "dirent.h"
#include "fs.h"
+#include "fs-common.h"
#include "fs-ioctl.h"
#include "quota.h"
@@ -164,6 +165,15 @@ err:
return ret;
}
+static int bch2_reinherit_attrs_fn(struct bch_inode_info *inode,
+ struct bch_inode_unpacked *bi,
+ void *p)
+{
+ struct bch_inode_info *dir = p;
+
+ return !bch2_reinherit_attrs(bi, &dir->ei_inode);
+}
+
static int bch2_ioc_reinherit_attrs(struct bch_fs *c,
struct file *file,
struct bch_inode_info *src,
diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c
index 83bad578ad0f..6fc6d504b094 100644
--- a/fs/bcachefs/fs.c
+++ b/fs/bcachefs/fs.c
@@ -3,12 +3,14 @@
#include "bcachefs.h"
#include "acl.h"
+#include "bkey_on_stack.h"
#include "btree_update.h"
#include "buckets.h"
#include "chardev.h"
#include "dirent.h"
#include "extents.h"
#include "fs.h"
+#include "fs-common.h"
#include "fs-io.h"
#include "fs-ioctl.h"
#include "fsck.h"
@@ -48,42 +50,59 @@ static void journal_seq_copy(struct bch_inode_info *dst,
} while ((v = cmpxchg(&dst->ei_journal_seq, old, journal_seq)) != old);
}
-/*
- * I_SIZE_DIRTY requires special handling:
- *
- * To the recovery code, the flag means that there is stale data past i_size
- * that needs to be deleted; it's used for implementing atomic appends and
- * truncates.
- *
- * On append, we set I_SIZE_DIRTY before doing the write, then after the write
- * we clear I_SIZE_DIRTY atomically with updating i_size to the new larger size
- * that exposes the data we just wrote.
- *
- * On truncate, it's the reverse: We set I_SIZE_DIRTY atomically with setting
- * i_size to the new smaller size, then we delete the data that we just made
- * invisible, and then we clear I_SIZE_DIRTY.
- *
- * Because there can be multiple appends in flight at a time, we need a refcount
- * (i_size_dirty_count) instead of manipulating the flag directly. Nonzero
- * refcount means I_SIZE_DIRTY is set, zero means it's cleared.
- *
- * Because write_inode() can be called at any time, i_size_dirty_count means
- * something different to the runtime code - it means to write_inode() "don't
- * update i_size yet".
- *
- * We don't clear I_SIZE_DIRTY directly, we let write_inode() clear it when
- * i_size_dirty_count is zero - but the reverse is not true, I_SIZE_DIRTY must
- * be set explicitly.
- */
+static void __pagecache_lock_put(struct pagecache_lock *lock, long i)
+{
+ BUG_ON(atomic_long_read(&lock->v) == 0);
+
+ if (atomic_long_sub_return_release(i, &lock->v) == 0)
+ wake_up_all(&lock->wait);
+}
+
+static bool __pagecache_lock_tryget(struct pagecache_lock *lock, long i)
+{
+ long v = atomic_long_read(&lock->v), old;
+
+ do {
+ old = v;
+
+ if (i > 0 ? v < 0 : v > 0)
+ return false;
+ } while ((v = atomic_long_cmpxchg_acquire(&lock->v,
+ old, old + i)) != old);
+ return true;
+}
+
+static void __pagecache_lock_get(struct pagecache_lock *lock, long i)
+{
+ wait_event(lock->wait, __pagecache_lock_tryget(lock, i));
+}
+
+void bch2_pagecache_add_put(struct pagecache_lock *lock)
+{
+ __pagecache_lock_put(lock, 1);
+}
+
+void bch2_pagecache_add_get(struct pagecache_lock *lock)
+{
+ __pagecache_lock_get(lock, 1);
+}
+
+void bch2_pagecache_block_put(struct pagecache_lock *lock)
+{
+ __pagecache_lock_put(lock, -1);
+}
+
+void bch2_pagecache_block_get(struct pagecache_lock *lock)
+{
+ __pagecache_lock_get(lock, -1);
+}
void bch2_inode_update_after_write(struct bch_fs *c,
struct bch_inode_info *inode,
struct bch_inode_unpacked *bi,
unsigned fields)
{
- set_nlink(&inode->v, bi->bi_flags & BCH_INODE_UNLINKED
- ? 0
- : bi->bi_nlink + nlink_bias(inode->v.i_mode));
+ set_nlink(&inode->v, bch2_inode_nlink_get(bi));
i_uid_write(&inode->v, bi->bi_uid);
i_gid_write(&inode->v, bi->bi_gid);
inode->v.i_mode = bi->bi_mode;
@@ -100,68 +119,13 @@ void bch2_inode_update_after_write(struct bch_fs *c,
bch2_inode_flags_to_vfs(inode);
}
-int __must_check bch2_write_inode_trans(struct btree_trans *trans,
- struct bch_inode_info *inode,
- struct bch_inode_unpacked *inode_u,
- inode_set_fn set,
- void *p)
-{
- struct bch_fs *c = trans->c;
- struct btree_iter *iter = NULL;
- struct bkey_inode_buf *inode_p;
- int ret;
-
- lockdep_assert_held(&inode->ei_update_lock);
-
- if (c->opts.new_inode_updates) {
- /* XXX: Don't do this with btree locks held */
- if (!inode->ei_inode_update)
- inode->ei_inode_update =
- bch2_deferred_update_alloc(c, BTREE_ID_INODES, 64);
- } else {
- iter = bch2_trans_get_iter(trans, BTREE_ID_INODES,
- POS(inode->v.i_ino, 0),
- BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
- if (IS_ERR(iter))
- return PTR_ERR(iter);
-
- /* The btree node lock is our lock on the inode: */
- ret = bch2_btree_iter_traverse(iter);
- if (ret)
- return ret;
- }
-
- *inode_u = inode->ei_inode;
-
- if (set) {
- ret = set(inode, inode_u, p);
- if (ret)
- return ret;
- }
-
- inode_p = bch2_trans_kmalloc(trans, sizeof(*inode_p));
- if (IS_ERR(inode_p))
- return PTR_ERR(inode_p);
-
- bch2_inode_pack(inode_p, inode_u);
-
- if (!inode->ei_inode_update)
- bch2_trans_update(trans,
- BTREE_INSERT_ENTRY(iter, &inode_p->inode.k_i));
- else
- bch2_trans_update(trans,
- BTREE_INSERT_DEFERRED(inode->ei_inode_update,
- &inode_p->inode.k_i));
-
- return 0;
-}
-
int __must_check bch2_write_inode(struct bch_fs *c,
struct bch_inode_info *inode,
inode_set_fn set,
void *p, unsigned fields)
{
struct btree_trans trans;
+ struct btree_iter *iter;
struct bch_inode_unpacked inode_u;
int ret;
@@ -169,7 +133,11 @@ int __must_check bch2_write_inode(struct bch_fs *c,
retry:
bch2_trans_begin(&trans);
- ret = bch2_write_inode_trans(&trans, inode, &inode_u, set, p) ?:
+ iter = bch2_inode_peek(&trans, &inode_u, inode->v.i_ino,
+ BTREE_ITER_INTENT);
+ ret = PTR_ERR_OR_ZERO(iter) ?:
+ (set ? set(inode, &inode_u, p) : 0) ?:
+ bch2_inode_write(&trans, iter, &inode_u) ?:
bch2_trans_commit(&trans, NULL,
&inode->ei_journal_seq,
BTREE_INSERT_ATOMIC|
@@ -224,32 +192,6 @@ int bch2_fs_quota_transfer(struct bch_fs *c,
return ret;
}
-int bch2_reinherit_attrs_fn(struct bch_inode_info *inode,
- struct bch_inode_unpacked *bi,
- void *p)
-{
- struct bch_inode_info *dir = p;
- u64 src, dst;
- unsigned id;
- int ret = 1;
-
- for (id = 0; id < Inode_opt_nr; id++) {
- if (bi->bi_fields_set & (1 << id))
- continue;
-
- src = bch2_inode_opt_get(&dir->ei_inode, id);
- dst = bch2_inode_opt_get(bi, id);
-
- if (src == dst)
- continue;
-
- bch2_inode_opt_set(bi, id, src);
- ret = 0;
- }
-
- return ret;
-}
-
struct inode *bch2_vfs_inode_get(struct bch_fs *c, u64 inum)
{
struct bch_inode_unpacked inode_u;
@@ -277,82 +219,37 @@ struct inode *bch2_vfs_inode_get(struct bch_fs *c, u64 inum)
return &inode->v;
}
-static void bch2_inode_init_owner(struct bch_inode_unpacked *inode_u,
- const struct inode *dir, umode_t mode)
-{
- kuid_t uid = current_fsuid();
- kgid_t gid;
-
- if (dir && dir->i_mode & S_ISGID) {
- gid = dir->i_gid;
- if (S_ISDIR(mode))
- mode |= S_ISGID;
- } else
- gid = current_fsgid();
-
- inode_u->bi_uid = from_kuid(dir->i_sb->s_user_ns, uid);
- inode_u->bi_gid = from_kgid(dir->i_sb->s_user_ns, gid);
- inode_u->bi_mode = mode;
-}
-
-static int inode_update_for_create_fn(struct bch_inode_info *inode,
- struct bch_inode_unpacked *bi,
- void *p)
-{
- struct bch_fs *c = inode->v.i_sb->s_fs_info;
- struct bch_inode_unpacked *new_inode = p;
-
- bi->bi_mtime = bi->bi_ctime = bch2_current_time(c);
-
- if (S_ISDIR(new_inode->bi_mode))
- bi->bi_nlink++;
-
- return 0;
-}
-
static struct bch_inode_info *
__bch2_create(struct bch_inode_info *dir, struct dentry *dentry,
umode_t mode, dev_t rdev, bool tmpfile)
{
struct bch_fs *c = dir->v.i_sb->s_fs_info;
+ struct user_namespace *ns = dir->v.i_sb->s_user_ns;
struct btree_trans trans;
struct bch_inode_unpacked dir_u;
struct bch_inode_info *inode, *old;
struct bch_inode_unpacked inode_u;
- struct bch_hash_info hash_info;
struct posix_acl *default_acl = NULL, *acl = NULL;
u64 journal_seq = 0;
int ret;
- bch2_inode_init(c, &inode_u, 0, 0, 0, rdev, &dir->ei_inode);
- bch2_inode_init_owner(&inode_u, &dir->v, mode);
-
- hash_info = bch2_hash_info_init(c, &inode_u);
-
- if (tmpfile)
- inode_u.bi_flags |= BCH_INODE_UNLINKED;
-
- ret = bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, 1,
- KEY_TYPE_QUOTA_PREALLOC);
- if (ret)
- return ERR_PTR(ret);
-
+ /*
+ * preallocate acls + vfs inode before btree transaction, so that
+ * nothing can fail after the transaction succeeds:
+ */
#ifdef CONFIG_BCACHEFS_POSIX_ACL
- ret = posix_acl_create(&dir->v, &inode_u.bi_mode, &default_acl, &acl);
+ ret = posix_acl_create(&dir->v, &mode, &default_acl, &acl);
if (ret)
- goto err;
+ return ERR_PTR(ret);
#endif
-
- /*
- * preallocate vfs inode before btree transaction, so that nothing can
- * fail after the transaction succeeds:
- */
inode = to_bch_ei(new_inode(c->vfs_sb));
if (unlikely(!inode)) {
- ret = -ENOMEM;
+ inode = ERR_PTR(-ENOMEM);
goto err;
}
+ bch2_inode_init_early(c, &inode_u);
+
if (!tmpfile)
mutex_lock(&dir->ei_update_lock);
@@ -360,38 +257,28 @@ __bch2_create(struct bch_inode_info *dir, struct dentry *dentry,
retry:
bch2_trans_begin(&trans);
- ret = __bch2_inode_create(&trans, &inode_u,
- BLOCKDEV_INODE_MAX, 0,
- &c->unused_inode_hint) ?:
- (default_acl
- ? bch2_set_acl_trans(&trans, &inode_u, &hash_info,
- default_acl, ACL_TYPE_DEFAULT)
- : 0) ?:
- (acl
- ? bch2_set_acl_trans(&trans, &inode_u, &hash_info,
- acl, ACL_TYPE_ACCESS)
- : 0) ?:
- (!tmpfile
- ? __bch2_dirent_create(&trans, dir->v.i_ino,
- &dir->ei_str_hash,
- mode_to_type(mode),
- &dentry->d_name,
- inode_u.bi_inum,
- BCH_HASH_SET_MUST_CREATE)
- : 0) ?:
- (!tmpfile
- ? bch2_write_inode_trans(&trans, dir, &dir_u,
- inode_update_for_create_fn,
- &inode_u)
- : 0) ?:
- bch2_trans_commit(&trans, NULL,
- &journal_seq,
+ ret = bch2_create_trans(&trans, dir->v.i_ino, &dir_u, &inode_u,
+ !tmpfile ? &dentry->d_name : NULL,
+ from_kuid(ns, current_fsuid()),
+ from_kgid(ns, current_fsgid()),
+ mode, rdev,
+ default_acl, acl) ?:
+ bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, 1,
+ KEY_TYPE_QUOTA_PREALLOC);
+ if (unlikely(ret))
+ goto err_before_quota;
+
+ ret = bch2_trans_commit(&trans, NULL, &journal_seq,
BTREE_INSERT_ATOMIC|
BTREE_INSERT_NOUNLOCK);
- if (ret == -EINTR)
- goto retry;
- if (unlikely(ret))
+ if (unlikely(ret)) {
+ bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, -1,
+ KEY_TYPE_QUOTA_WARN);
+err_before_quota:
+ if (ret == -EINTR)
+ goto retry;
goto err_trans;
+ }
if (!tmpfile) {
bch2_inode_update_after_write(c, dir, &dir_u,
@@ -418,7 +305,7 @@ retry:
* We raced, another process pulled the new inode into cache
* before us:
*/
- old->ei_journal_seq = inode->ei_journal_seq;
+ journal_seq_copy(old, journal_seq);
make_bad_inode(&inode->v);
iput(&inode->v);
@@ -432,7 +319,7 @@ retry:
}
bch2_trans_exit(&trans);
-out:
+err:
posix_acl_release(default_acl);
posix_acl_release(acl);
return inode;
@@ -443,10 +330,8 @@ err_trans:
bch2_trans_exit(&trans);
make_bad_inode(&inode->v);
iput(&inode->v);
-err:
- bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, -1, KEY_TYPE_QUOTA_WARN);
inode = ERR_PTR(ret);
- goto out;
+ goto err;
}
/* methods */
@@ -469,11 +354,11 @@ static struct dentry *bch2_lookup(struct inode *vdir, struct dentry *dentry,
return d_splice_alias(vinode, dentry);
}
-static int bch2_create(struct inode *vdir, struct dentry *dentry,
- umode_t mode, bool excl)
+static int bch2_mknod(struct inode *vdir, struct dentry *dentry,
+ umode_t mode, dev_t rdev)
{
struct bch_inode_info *inode =
- __bch2_create(to_bch_ei(vdir), dentry, mode|S_IFREG, 0, false);
+ __bch2_create(to_bch_ei(vdir), dentry, mode, rdev, false);
if (IS_ERR(inode))
return PTR_ERR(inode);
@@ -482,20 +367,10 @@ static int bch2_create(struct inode *vdir, struct dentry *dentry,
return 0;
}
-static int inode_update_for_link_fn(struct bch_inode_info *inode,
- struct bch_inode_unpacked *bi,
- void *p)
+static int bch2_create(struct inode *vdir, struct dentry *dentry,
+ umode_t mode, bool excl)
{
- struct bch_fs *c = inode->v.i_sb->s_fs_info;
-
- bi->bi_ctime = bch2_current_time(c);
-
- if (bi->bi_flags & BCH_INODE_UNLINKED)
- bi->bi_flags &= ~BCH_INODE_UNLINKED;
- else
- bi->bi_nlink++;
-
- return 0;
+ return bch2_mknod(vdir, dentry, mode|S_IFREG, 0);
}
static int __bch2_link(struct bch_fs *c,
@@ -509,25 +384,18 @@ static int __bch2_link(struct bch_fs *c,
mutex_lock(&inode->ei_update_lock);
bch2_trans_init(&trans, c, 4, 1024);
-retry:
- bch2_trans_begin(&trans);
- ret = __bch2_dirent_create(&trans, dir->v.i_ino,
- &dir->ei_str_hash,
- mode_to_type(inode->v.i_mode),
- &dentry->d_name,
- inode->v.i_ino,
- BCH_HASH_SET_MUST_CREATE) ?:
- bch2_write_inode_trans(&trans, inode, &inode_u,
- inode_update_for_link_fn,
- NULL) ?:
- bch2_trans_commit(&trans, NULL,
- &inode->ei_journal_seq,
- BTREE_INSERT_ATOMIC|
- BTREE_INSERT_NOUNLOCK);
-
- if (ret == -EINTR)
- goto retry;
+ do {
+ bch2_trans_begin(&trans);
+ ret = bch2_link_trans(&trans,
+ dir->v.i_ino,
+ inode->v.i_ino, &inode_u,
+ &dentry->d_name) ?:
+ bch2_trans_commit(&trans, NULL,
+ &inode->ei_journal_seq,
+ BTREE_INSERT_ATOMIC|
+ BTREE_INSERT_NOUNLOCK);
+ } while (ret == -EINTR);
if (likely(!ret))
bch2_inode_update_after_write(c, inode, &inode_u, ATTR_CTIME);
@@ -556,35 +424,6 @@ static int bch2_link(struct dentry *old_dentry, struct inode *vdir,
return 0;
}
-static int inode_update_dir_for_unlink_fn(struct bch_inode_info *inode,
- struct bch_inode_unpacked *bi,
- void *p)
-{
- struct bch_fs *c = inode->v.i_sb->s_fs_info;
- struct bch_inode_info *unlink_inode = p;
-
- bi->bi_mtime = bi->bi_ctime = bch2_current_time(c);
-
- bi->bi_nlink -= S_ISDIR(unlink_inode->v.i_mode);
-
- return 0;
-}
-
-static int inode_update_for_unlink_fn(struct bch_inode_info *inode,
- struct bch_inode_unpacked *bi,
- void *p)
-{
- struct bch_fs *c = inode->v.i_sb->s_fs_info;
-
- bi->bi_ctime = bch2_current_time(c);
- if (bi->bi_nlink)
- bi->bi_nlink--;
- else
- bi->bi_flags |= BCH_INODE_UNLINKED;
-
- return 0;
-}
-
static int bch2_unlink(struct inode *vdir, struct dentry *dentry)
{
struct bch_fs *c = vdir->i_sb->s_fs_info;
@@ -596,36 +435,30 @@ static int bch2_unlink(struct inode *vdir, struct dentry *dentry)
bch2_lock_inodes(INODE_UPDATE_LOCK, dir, inode);
bch2_trans_init(&trans, c, 4, 1024);
-retry:
- bch2_trans_begin(&trans);
-
- ret = __bch2_dirent_delete(&trans, dir->v.i_ino,
- &dir->ei_str_hash,
- &dentry->d_name) ?:
- bch2_write_inode_trans(&trans, dir, &dir_u,
- inode_update_dir_for_unlink_fn,
- inode) ?:
- bch2_write_inode_trans(&trans, inode, &inode_u,
- inode_update_for_unlink_fn,
- NULL) ?:
- bch2_trans_commit(&trans, NULL,
- &dir->ei_journal_seq,
- BTREE_INSERT_ATOMIC|
- BTREE_INSERT_NOUNLOCK|
- BTREE_INSERT_NOFAIL);
- if (ret == -EINTR)
- goto retry;
- if (ret)
- goto err;
- if (dir->ei_journal_seq > inode->ei_journal_seq)
- inode->ei_journal_seq = dir->ei_journal_seq;
+ do {
+ bch2_trans_begin(&trans);
+
+ ret = bch2_unlink_trans(&trans,
+ dir->v.i_ino, &dir_u,
+ &inode_u, &dentry->d_name) ?:
+ bch2_trans_commit(&trans, NULL,
+ &dir->ei_journal_seq,
+ BTREE_INSERT_ATOMIC|
+ BTREE_INSERT_NOUNLOCK|
+ BTREE_INSERT_NOFAIL);
+ } while (ret == -EINTR);
+
+ if (likely(!ret)) {
+ BUG_ON(inode_u.bi_inum != inode->v.i_ino);
+
+ journal_seq_copy(inode, dir->ei_journal_seq);
+ bch2_inode_update_after_write(c, dir, &dir_u,
+ ATTR_MTIME|ATTR_CTIME);
+ bch2_inode_update_after_write(c, inode, &inode_u,
+ ATTR_MTIME);
+ }
- bch2_inode_update_after_write(c, dir, &dir_u,
- ATTR_MTIME|ATTR_CTIME);
- bch2_inode_update_after_write(c, inode, &inode_u,
- ATTR_MTIME);
-err:
bch2_trans_exit(&trans);
bch2_unlock_inodes(INODE_UPDATE_LOCK, dir, inode);
@@ -669,98 +502,7 @@ err:
static int bch2_mkdir(struct inode *vdir, struct dentry *dentry, umode_t mode)
{
- struct bch_inode_info *inode =
- __bch2_create(to_bch_ei(vdir), dentry, mode|S_IFDIR, 0, false);
-
- if (IS_ERR(inode))
- return PTR_ERR(inode);
-
- d_instantiate(dentry, &inode->v);
- return 0;
-}
-
-static int bch2_rmdir(struct inode *vdir, struct dentry *dentry)
-{
- struct bch_fs *c = vdir->i_sb->s_fs_info;
-
- if (bch2_empty_dir(c, dentry->d_inode->i_ino))
- return -ENOTEMPTY;
-
- return bch2_unlink(vdir, dentry);
-}
-
-static int bch2_mknod(struct inode *vdir, struct dentry *dentry,
- umode_t mode, dev_t rdev)
-{
- struct bch_inode_info *inode =
- __bch2_create(to_bch_ei(vdir), dentry, mode, rdev, false);
-
- if (IS_ERR(inode))
- return PTR_ERR(inode);
-
- d_instantiate(dentry, &inode->v);
- return 0;
-}
-
-struct rename_info {
- u64 now;
- struct bch_inode_info *src_dir;
- struct bch_inode_info *dst_dir;
- struct bch_inode_info *src_inode;
- struct bch_inode_info *dst_inode;
- enum bch_rename_mode mode;
-};
-
-static int inode_update_for_rename_fn(struct bch_inode_info *inode,
- struct bch_inode_unpacked *bi,
- void *p)
-{
- struct rename_info *info = p;
- int ret;
-
- if (inode == info->src_dir) {
- bi->bi_nlink -= S_ISDIR(info->src_inode->v.i_mode);
- bi->bi_nlink += info->dst_inode &&
- S_ISDIR(info->dst_inode->v.i_mode) &&
- info->mode == BCH_RENAME_EXCHANGE;
- }
-
- if (inode == info->dst_dir) {
- bi->bi_nlink += S_ISDIR(info->src_inode->v.i_mode);
- bi->bi_nlink -= info->dst_inode &&
- S_ISDIR(info->dst_inode->v.i_mode);
- }
-
- if (inode == info->src_inode) {
- ret = bch2_reinherit_attrs_fn(inode, bi, info->dst_dir);
-
- BUG_ON(!ret && S_ISDIR(info->src_inode->v.i_mode));
- }
-
- if (inode == info->dst_inode &&
- info->mode == BCH_RENAME_EXCHANGE) {
- ret = bch2_reinherit_attrs_fn(inode, bi, info->src_dir);
-
- BUG_ON(!ret && S_ISDIR(info->dst_inode->v.i_mode));
- }
-
- if (inode == info->dst_inode &&
- info->mode == BCH_RENAME_OVERWRITE) {
- BUG_ON(bi->bi_nlink &&
- S_ISDIR(info->dst_inode->v.i_mode));
-
- if (bi->bi_nlink)
- bi->bi_nlink--;
- else
- bi->bi_flags |= BCH_INODE_UNLINKED;
- }
-
- if (inode == info->src_dir ||
- inode == info->dst_dir)
- bi->bi_mtime = info->now;
- bi->bi_ctime = info->now;
-
- return 0;
+ return bch2_mknod(vdir, dentry, mode|S_IFDIR, 0);
}
static int bch2_rename2(struct inode *src_vdir, struct dentry *src_dentry,
@@ -768,35 +510,25 @@ static int bch2_rename2(struct inode *src_vdir, struct dentry *src_dentry,
unsigned flags)
{
struct bch_fs *c = src_vdir->i_sb->s_fs_info;
- struct rename_info i = {
- .src_dir = to_bch_ei(src_vdir),
- .dst_dir = to_bch_ei(dst_vdir),
- .src_inode = to_bch_ei(src_dentry->d_inode),
- .dst_inode = to_bch_ei(dst_dentry->d_inode),
- .mode = flags & RENAME_EXCHANGE
- ? BCH_RENAME_EXCHANGE
- : dst_dentry->d_inode
- ? BCH_RENAME_OVERWRITE : BCH_RENAME,
- };
- struct btree_trans trans;
+ struct bch_inode_info *src_dir = to_bch_ei(src_vdir);
+ struct bch_inode_info *dst_dir = to_bch_ei(dst_vdir);
+ struct bch_inode_info *src_inode = to_bch_ei(src_dentry->d_inode);
+ struct bch_inode_info *dst_inode = to_bch_ei(dst_dentry->d_inode);
struct bch_inode_unpacked dst_dir_u, src_dir_u;
struct bch_inode_unpacked src_inode_u, dst_inode_u;
+ struct btree_trans trans;
+ enum bch_rename_mode mode = flags & RENAME_EXCHANGE
+ ? BCH_RENAME_EXCHANGE
+ : dst_dentry->d_inode
+ ? BCH_RENAME_OVERWRITE : BCH_RENAME;
u64 journal_seq = 0;
int ret;
if (flags & ~(RENAME_NOREPLACE|RENAME_EXCHANGE))
return -EINVAL;
- if (i.mode == BCH_RENAME_OVERWRITE) {
- if (S_ISDIR(i.src_inode->v.i_mode) !=
- S_ISDIR(i.dst_inode->v.i_mode))
- return -ENOTDIR;
-
- if (S_ISDIR(i.src_inode->v.i_mode) &&
- bch2_empty_dir(c, i.dst_inode->v.i_ino))
- return -ENOTEMPTY;
-
- ret = filemap_write_and_wait_range(i.src_inode->v.i_mapping,
+ if (mode == BCH_RENAME_OVERWRITE) {
+ ret = filemap_write_and_wait_range(src_inode->v.i_mapping,
0, LLONG_MAX);
if (ret)
return ret;
@@ -805,37 +537,24 @@ static int bch2_rename2(struct inode *src_vdir, struct dentry *src_dentry,
bch2_trans_init(&trans, c, 8, 2048);
bch2_lock_inodes(INODE_UPDATE_LOCK,
- i.src_dir,
- i.dst_dir,
- i.src_inode,
- i.dst_inode);
-
- if (S_ISDIR(i.src_inode->v.i_mode) &&
- inode_attrs_changing(i.dst_dir, i.src_inode)) {
- ret = -EXDEV;
- goto err;
- }
-
- if (i.mode == BCH_RENAME_EXCHANGE &&
- S_ISDIR(i.dst_inode->v.i_mode) &&
- inode_attrs_changing(i.src_dir, i.dst_inode)) {
- ret = -EXDEV;
- goto err;
- }
-
- if (inode_attr_changing(i.dst_dir, i.src_inode, Inode_opt_project)) {
- ret = bch2_fs_quota_transfer(c, i.src_inode,
- i.dst_dir->ei_qid,
+ src_dir,
+ dst_dir,
+ src_inode,
+ dst_inode);
+
+ if (inode_attr_changing(dst_dir, src_inode, Inode_opt_project)) {
+ ret = bch2_fs_quota_transfer(c, src_inode,
+ dst_dir->ei_qid,
1 << QTYP_PRJ,
KEY_TYPE_QUOTA_PREALLOC);
if (ret)
goto err;
}
- if (i.mode == BCH_RENAME_EXCHANGE &&
- inode_attr_changing(i.src_dir, i.dst_inode, Inode_opt_project)) {
- ret = bch2_fs_quota_transfer(c, i.dst_inode,
- i.src_dir->ei_qid,
+ if (mode == BCH_RENAME_EXCHANGE &&
+ inode_attr_changing(src_dir, dst_inode, Inode_opt_project)) {
+ ret = bch2_fs_quota_transfer(c, dst_inode,
+ src_dir->ei_qid,
1 << QTYP_PRJ,
KEY_TYPE_QUOTA_PREALLOC);
if (ret)
@@ -844,24 +563,14 @@ static int bch2_rename2(struct inode *src_vdir, struct dentry *src_dentry,
retry:
bch2_trans_begin(&trans);
- i.now = bch2_current_time(c);
-
- ret = bch2_dirent_rename(&trans,
- i.src_dir, &src_dentry->d_name,
- i.dst_dir, &dst_dentry->d_name,
- i.mode) ?:
- bch2_write_inode_trans(&trans, i.src_dir, &src_dir_u,
- inode_update_for_rename_fn, &i) ?:
- (i.src_dir != i.dst_dir
- ? bch2_write_inode_trans(&trans, i.dst_dir, &dst_dir_u,
- inode_update_for_rename_fn, &i)
- : 0 ) ?:
- bch2_write_inode_trans(&trans, i.src_inode, &src_inode_u,
- inode_update_for_rename_fn, &i) ?:
- (i.dst_inode
- ? bch2_write_inode_trans(&trans, i.dst_inode, &dst_inode_u,
- inode_update_for_rename_fn, &i)
- : 0 ) ?:
+ ret = bch2_rename_trans(&trans,
+ src_dir->v.i_ino, &src_dir_u,
+ dst_dir->v.i_ino, &dst_dir_u,
+ &src_inode_u,
+ &dst_inode_u,
+ &src_dentry->d_name,
+ &dst_dentry->d_name,
+ mode) ?:
bch2_trans_commit(&trans, NULL,
&journal_seq,
BTREE_INSERT_ATOMIC|
@@ -871,59 +580,62 @@ retry:
if (unlikely(ret))
goto err;
- bch2_inode_update_after_write(c, i.src_dir, &src_dir_u,
+ BUG_ON(src_inode->v.i_ino != src_inode_u.bi_inum);
+ BUG_ON(dst_inode &&
+ dst_inode->v.i_ino != dst_inode_u.bi_inum);
+
+ bch2_inode_update_after_write(c, src_dir, &src_dir_u,
ATTR_MTIME|ATTR_CTIME);
- journal_seq_copy(i.src_dir, journal_seq);
+ journal_seq_copy(src_dir, journal_seq);
- if (i.src_dir != i.dst_dir) {
- bch2_inode_update_after_write(c, i.dst_dir, &dst_dir_u,
+ if (src_dir != dst_dir) {
+ bch2_inode_update_after_write(c, dst_dir, &dst_dir_u,
ATTR_MTIME|ATTR_CTIME);
- journal_seq_copy(i.dst_dir, journal_seq);
+ journal_seq_copy(dst_dir, journal_seq);
}
- journal_seq_copy(i.src_inode, journal_seq);
- if (i.dst_inode)
- journal_seq_copy(i.dst_inode, journal_seq);
-
- bch2_inode_update_after_write(c, i.src_inode, &src_inode_u,
+ bch2_inode_update_after_write(c, src_inode, &src_inode_u,
ATTR_CTIME);
- if (i.dst_inode)
- bch2_inode_update_after_write(c, i.dst_inode, &dst_inode_u,
+ journal_seq_copy(src_inode, journal_seq);
+
+ if (dst_inode) {
+ bch2_inode_update_after_write(c, dst_inode, &dst_inode_u,
ATTR_CTIME);
+ journal_seq_copy(dst_inode, journal_seq);
+ }
err:
bch2_trans_exit(&trans);
- bch2_fs_quota_transfer(c, i.src_inode,
- bch_qid(&i.src_inode->ei_inode),
+ bch2_fs_quota_transfer(c, src_inode,
+ bch_qid(&src_inode->ei_inode),
1 << QTYP_PRJ,
KEY_TYPE_QUOTA_NOCHECK);
- if (i.dst_inode)
- bch2_fs_quota_transfer(c, i.dst_inode,
- bch_qid(&i.dst_inode->ei_inode),
+ if (dst_inode)
+ bch2_fs_quota_transfer(c, dst_inode,
+ bch_qid(&dst_inode->ei_inode),
1 << QTYP_PRJ,
KEY_TYPE_QUOTA_NOCHECK);
bch2_unlock_inodes(INODE_UPDATE_LOCK,
- i.src_dir,
- i.dst_dir,
- i.src_inode,
- i.dst_inode);
+ src_dir,
+ dst_dir,
+ src_inode,
+ dst_inode);
return ret;
}
-static int inode_update_for_setattr_fn(struct bch_inode_info *inode,
- struct bch_inode_unpacked *bi,
- void *p)
+void bch2_setattr_copy(struct bch_inode_info *inode,
+ struct bch_inode_unpacked *bi,
+ struct iattr *attr)
{
struct bch_fs *c = inode->v.i_sb->s_fs_info;
- struct iattr *attr = p;
unsigned int ia_valid = attr->ia_valid;
if (ia_valid & ATTR_UID)
- bi->bi_uid = from_kuid(inode->v.i_sb->s_user_ns, attr->ia_uid);
+ bi->bi_uid = from_kuid(c->vfs_sb->s_user_ns, attr->ia_uid);
if (ia_valid & ATTR_GID)
- bi->bi_gid = from_kgid(inode->v.i_sb->s_user_ns, attr->ia_gid);
+ bi->bi_gid = from_kgid(c->vfs_sb->s_user_ns, attr->ia_gid);
if (ia_valid & ATTR_ATIME)
bi->bi_atime = timespec_to_bch2_time(c, attr->ia_atime);
@@ -943,15 +655,15 @@ static int inode_update_for_setattr_fn(struct bch_inode_info *inode,
mode &= ~S_ISGID;
bi->bi_mode = mode;
}
-
- return 0;
}
-static int bch2_setattr_nonsize(struct bch_inode_info *inode, struct iattr *iattr)
+static int bch2_setattr_nonsize(struct bch_inode_info *inode,
+ struct iattr *attr)
{
struct bch_fs *c = inode->v.i_sb->s_fs_info;
struct bch_qid qid;
struct btree_trans trans;
+ struct btree_iter *inode_iter;
struct bch_inode_unpacked inode_u;
struct posix_acl *acl = NULL;
int ret;
@@ -960,11 +672,11 @@ static int bch2_setattr_nonsize(struct bch_inode_info *inode, struct iattr *iatt
qid = inode->ei_qid;
- if (iattr->ia_valid & ATTR_UID)
- qid.q[QTYP_USR] = from_kuid(&init_user_ns, iattr->ia_uid);
+ if (attr->ia_valid & ATTR_UID)
+ qid.q[QTYP_USR] = from_kuid(&init_user_ns, attr->ia_uid);
- if (iattr->ia_valid & ATTR_GID)
- qid.q[QTYP_GRP] = from_kgid(&init_user_ns, iattr->ia_gid);
+ if (attr->ia_valid & ATTR_GID)
+ qid.q[QTYP_GRP] = from_kgid(&init_user_ns, attr->ia_gid);
ret = bch2_fs_quota_transfer(c, inode, qid, ~0,
KEY_TYPE_QUOTA_PREALLOC);
@@ -977,22 +689,33 @@ retry:
kfree(acl);
acl = NULL;
- ret = bch2_write_inode_trans(&trans, inode, &inode_u,
- inode_update_for_setattr_fn, iattr) ?:
- (iattr->ia_valid & ATTR_MODE
- ? bch2_acl_chmod(&trans, inode, iattr->ia_mode, &acl)
- : 0) ?:
+ inode_iter = bch2_inode_peek(&trans, &inode_u, inode->v.i_ino,
+ BTREE_ITER_INTENT);
+ ret = PTR_ERR_OR_ZERO(inode_iter);
+ if (ret)
+ goto btree_err;
+
+ bch2_setattr_copy(inode, &inode_u, attr);
+
+ if (attr->ia_valid & ATTR_MODE) {
+ ret = bch2_acl_chmod(&trans, inode, inode_u.bi_mode, &acl);
+ if (ret)
+ goto btree_err;
+ }
+
+ ret = bch2_inode_write(&trans, inode_iter, &inode_u) ?:
bch2_trans_commit(&trans, NULL,
&inode->ei_journal_seq,
BTREE_INSERT_ATOMIC|
BTREE_INSERT_NOUNLOCK|
BTREE_INSERT_NOFAIL);
+btree_err:
if (ret == -EINTR)
goto retry;
if (unlikely(ret))
goto err_trans;
- bch2_inode_update_after_write(c, inode, &inode_u, iattr->ia_valid);
+ bch2_inode_update_after_write(c, inode, &inode_u, attr->ia_valid);
if (acl)
set_cached_acl(&inode->v, ACL_TYPE_ACCESS, acl);
@@ -1031,10 +754,15 @@ static int bch2_getattr(const struct path *path, struct kstat *stat,
if (inode->ei_inode.bi_flags & BCH_INODE_IMMUTABLE)
stat->attributes |= STATX_ATTR_IMMUTABLE;
+ stat->attributes_mask |= STATX_ATTR_IMMUTABLE;
+
if (inode->ei_inode.bi_flags & BCH_INODE_APPEND)
stat->attributes |= STATX_ATTR_APPEND;
+ stat->attributes_mask |= STATX_ATTR_APPEND;
+
if (inode->ei_inode.bi_flags & BCH_INODE_NODUMP)
stat->attributes |= STATX_ATTR_NODUMP;
+ stat->attributes_mask |= STATX_ATTR_NODUMP;
return 0;
}
@@ -1123,7 +851,7 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info,
struct btree_trans trans;
struct btree_iter *iter;
struct bkey_s_c k;
- BKEY_PADDED(k) cur, prev;
+ struct bkey_on_stack cur, prev;
struct bpos end = POS(ei->v.i_ino, (start + len) >> 9);
unsigned offset_into_extent, sectors;
bool have_extent = false;
@@ -1132,6 +860,8 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info,
if (start + len < start)
return -EINVAL;
+ bkey_on_stack_init(&cur);
+ bkey_on_stack_init(&prev);
bch2_trans_init(&trans, c, 0, 0);
iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS,
@@ -1146,15 +876,17 @@ retry:
continue;
}
- bkey_reassemble(&cur.k, k);
- k = bkey_i_to_s_c(&cur.k);
+ bkey_on_stack_realloc(&cur, c, k.k->u64s);
+ bkey_on_stack_realloc(&prev, c, k.k->u64s);
+ bkey_reassemble(cur.k, k);
+ k = bkey_i_to_s_c(cur.k);
offset_into_extent = iter->pos.offset -
bkey_start_offset(k.k);
sectors = k.k->size - offset_into_extent;
ret = bch2_read_indirect_extent(&trans,
- &offset_into_extent, &cur.k);
+ &offset_into_extent, cur.k);
if (ret)
break;
@@ -1164,19 +896,19 @@ retry:
bch2_cut_front(POS(k.k->p.inode,
bkey_start_offset(k.k) +
offset_into_extent),
- &cur.k);
- bch2_key_resize(&cur.k.k, sectors);
- cur.k.k.p = iter->pos;
- cur.k.k.p.offset += cur.k.k.size;
+ cur.k);
+ bch2_key_resize(&cur.k->k, sectors);
+ cur.k->k.p = iter->pos;
+ cur.k->k.p.offset += cur.k->k.size;
if (have_extent) {
ret = bch2_fill_extent(c, info,
- bkey_i_to_s_c(&prev.k), 0);
+ bkey_i_to_s_c(prev.k), 0);
if (ret)
break;
}
- bkey_copy(&prev.k, &cur.k);
+ bkey_copy(prev.k, cur.k);
have_extent = true;
if (k.k->type == KEY_TYPE_reflink_v)
@@ -1189,15 +921,17 @@ retry:
goto retry;
if (!ret && have_extent)
- ret = bch2_fill_extent(c, info, bkey_i_to_s_c(&prev.k),
+ ret = bch2_fill_extent(c, info, bkey_i_to_s_c(prev.k),
FIEMAP_EXTENT_LAST);
ret = bch2_trans_exit(&trans) ?: ret;
+ bkey_on_stack_exit(&cur, c);
+ bkey_on_stack_exit(&prev, c);
return ret < 0 ? ret : 0;
}
static const struct vm_operations_struct bch_vm_ops = {
- .fault = filemap_fault,
+ .fault = bch2_page_fault,
.map_pages = filemap_map_pages,
.page_mkwrite = bch2_page_mkwrite,
};
@@ -1220,35 +954,33 @@ static loff_t bch2_dir_llseek(struct file *file, loff_t offset, int whence)
static int bch2_vfs_readdir(struct file *file, struct dir_context *ctx)
{
- struct bch_fs *c = file_inode(file)->i_sb->s_fs_info;
+ struct bch_inode_info *inode = file_bch_inode(file);
+ struct bch_fs *c = inode->v.i_sb->s_fs_info;
- return bch2_readdir(c, file, ctx);
-}
+ if (!dir_emit_dots(file, ctx))
+ return 0;
-static int bch2_clone_file_range(struct file *file_src, loff_t pos_src,
- struct file *file_dst, loff_t pos_dst,
- u64 len)
-{
- return bch2_remap_file_range(file_src, pos_src,
- file_dst, pos_dst,
- len, 0);
+ return bch2_readdir(c, inode->v.i_ino, ctx);
}
static const struct file_operations bch_file_operations = {
.llseek = bch2_llseek,
- .read_iter = generic_file_read_iter,
+ .read_iter = bch2_read_iter,
.write_iter = bch2_write_iter,
.mmap = bch2_mmap,
.open = generic_file_open,
.fsync = bch2_fsync,
.splice_read = generic_file_splice_read,
+ /*
+	 * Broken on v5.3:
.splice_write = iter_file_splice_write,
+ */
.fallocate = bch2_fallocate_dispatch,
.unlocked_ioctl = bch2_fs_file_ioctl,
#ifdef CONFIG_COMPAT
.compat_ioctl = bch2_compat_fs_ioctl,
#endif
- .clone_file_range = bch2_clone_file_range,
+ .remap_file_range = bch2_remap_file_range,
};
static const struct inode_operations bch_file_inode_operations = {
@@ -1269,7 +1001,7 @@ static const struct inode_operations bch_dir_inode_operations = {
.unlink = bch2_unlink,
.symlink = bch2_symlink,
.mkdir = bch2_mkdir,
- .rmdir = bch2_rmdir,
+ .rmdir = bch2_unlink,
.mknod = bch2_mknod,
.rename = bch2_rename2,
.getattr = bch2_getattr,
@@ -1285,7 +1017,7 @@ static const struct inode_operations bch_dir_inode_operations = {
static const struct file_operations bch_dir_file_operations = {
.llseek = bch2_dir_llseek,
.read = generic_read_dir,
- .iterate = bch2_vfs_readdir,
+ .iterate_shared = bch2_vfs_readdir,
.fsync = bch2_fsync,
.unlocked_ioctl = bch2_fs_file_ioctl,
#ifdef CONFIG_COMPAT
@@ -1324,7 +1056,7 @@ static const struct address_space_operations bch_address_space_operations = {
.write_end = bch2_write_end,
.invalidatepage = bch2_invalidatepage,
.releasepage = bch2_releasepage,
- .direct_IO = bch2_direct_IO,
+ .direct_IO = noop_direct_IO,
#ifdef CONFIG_MIGRATION
.migratepage = bch2_migrate_page,
#endif
@@ -1420,8 +1152,8 @@ static struct inode *bch2_alloc_inode(struct super_block *sb)
inode_init_once(&inode->v);
mutex_init(&inode->ei_update_lock);
+ pagecache_lock_init(&inode->ei_pagecache_lock);
mutex_init(&inode->ei_quota_lock);
- inode->ei_inode_update = NULL;
inode->ei_journal_seq = 0;
return &inode->v;
@@ -1479,10 +1211,6 @@ static void bch2_evict_inode(struct inode *vinode)
BUG_ON(!is_bad_inode(&inode->v) && inode->ei_quota_reserved);
- if (inode->ei_inode_update)
- bch2_deferred_update_free(c, inode->ei_inode_update);
- inode->ei_inode_update = NULL;
-
if (!inode->v.i_nlink && !is_bad_inode(&inode->v)) {
bch2_quota_acct(c, inode->ei_qid, Q_SPC, -((s64) inode->v.i_blocks),
KEY_TYPE_QUOTA_WARN);
@@ -1783,7 +1511,7 @@ static struct dentry *bch2_mount(struct file_system_type *fs_type,
sb->s_bdi->congested_fn = bch2_congested;
sb->s_bdi->congested_data = c;
- sb->s_bdi->ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_SIZE;
+ sb->s_bdi->ra_pages = VM_READAHEAD_PAGES;
for_each_online_member(ca, c, i) {
struct block_device *bdev = ca->disk_sb.bdev;
diff --git a/fs/bcachefs/fs.h b/fs/bcachefs/fs.h
index 226223b058a9..eda903a45325 100644
--- a/fs/bcachefs/fs.h
+++ b/fs/bcachefs/fs.h
@@ -10,15 +10,36 @@
#include <linux/seqlock.h>
#include <linux/stat.h>
+/*
+ * Two-state lock - can be taken for add or block - both states are shared,
+ * like the read side of an rwsem, but each conflicts with the other state:
+ */
+struct pagecache_lock {
+ atomic_long_t v;
+ wait_queue_head_t wait;
+};
+
+static inline void pagecache_lock_init(struct pagecache_lock *lock)
+{
+ atomic_long_set(&lock->v, 0);
+ init_waitqueue_head(&lock->wait);
+}
+
+void bch2_pagecache_add_put(struct pagecache_lock *);
+void bch2_pagecache_add_get(struct pagecache_lock *);
+void bch2_pagecache_block_put(struct pagecache_lock *);
+void bch2_pagecache_block_get(struct pagecache_lock *);
+
struct bch_inode_info {
struct inode v;
struct mutex ei_update_lock;
- struct deferred_update *ei_inode_update;
u64 ei_journal_seq;
u64 ei_quota_reserved;
unsigned long ei_last_dirtied;
+ struct pagecache_lock ei_pagecache_lock;
+
struct mutex ei_quota_lock;
struct bch_qid ei_qid;
@@ -38,7 +59,8 @@ static inline int ptrcmp(void *l, void *r)
enum bch_inode_lock_op {
INODE_LOCK = (1U << 0),
- INODE_UPDATE_LOCK = (1U << 1),
+ INODE_PAGECACHE_BLOCK = (1U << 1),
+ INODE_UPDATE_LOCK = (1U << 2),
};
#define bch2_lock_inodes(_locks, ...) \
@@ -50,9 +72,11 @@ do { \
\
for (i = 1; i < ARRAY_SIZE(a); i++) \
if (a[i] != a[i - 1]) { \
- if (_locks & INODE_LOCK) \
+ if ((_locks) & INODE_LOCK) \
down_write_nested(&a[i]->v.i_rwsem, i); \
- if (_locks & INODE_UPDATE_LOCK) \
+ if ((_locks) & INODE_PAGECACHE_BLOCK) \
+ bch2_pagecache_block_get(&a[i]->ei_pagecache_lock);\
+ if ((_locks) & INODE_UPDATE_LOCK) \
mutex_lock_nested(&a[i]->ei_update_lock, i);\
} \
} while (0)
@@ -66,9 +90,11 @@ do { \
\
for (i = 1; i < ARRAY_SIZE(a); i++) \
if (a[i] != a[i - 1]) { \
- if (_locks & INODE_LOCK) \
+ if ((_locks) & INODE_LOCK) \
up_write(&a[i]->v.i_rwsem); \
- if (_locks & INODE_UPDATE_LOCK) \
+ if ((_locks) & INODE_PAGECACHE_BLOCK) \
+ bch2_pagecache_block_put(&a[i]->ei_pagecache_lock);\
+ if ((_locks) & INODE_UPDATE_LOCK) \
mutex_unlock(&a[i]->ei_update_lock); \
} \
} while (0)
@@ -78,16 +104,6 @@ static inline struct bch_inode_info *file_bch_inode(struct file *file)
return to_bch_ei(file_inode(file));
}
-static inline u8 mode_to_type(umode_t mode)
-{
- return (mode >> 12) & 15;
-}
-
-static inline unsigned nlink_bias(umode_t mode)
-{
- return S_ISDIR(mode) ? 2 : 1;
-}
-
static inline bool inode_attr_changing(struct bch_inode_info *dir,
struct bch_inode_info *inode,
enum inode_opt_id id)
@@ -142,17 +158,9 @@ void bch2_inode_update_after_write(struct bch_fs *,
struct bch_inode_info *,
struct bch_inode_unpacked *,
unsigned);
-int __must_check bch2_write_inode_trans(struct btree_trans *,
- struct bch_inode_info *,
- struct bch_inode_unpacked *,
- inode_set_fn, void *);
int __must_check bch2_write_inode(struct bch_fs *, struct bch_inode_info *,
inode_set_fn, void *, unsigned);
-int bch2_reinherit_attrs_fn(struct bch_inode_info *,
- struct bch_inode_unpacked *,
- void *);
-
void bch2_vfs_exit(void);
int bch2_vfs_init(void);
diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c
index 50a7d8c1faba..0f2308e53d65 100644
--- a/fs/bcachefs/fsck.c
+++ b/fs/bcachefs/fsck.c
@@ -4,7 +4,7 @@
#include "btree_update.h"
#include "dirent.h"
#include "error.h"
-#include "fs.h"
+#include "fs-common.h"
#include "fsck.h"
#include "inode.h"
#include "keylist.h"
@@ -80,9 +80,7 @@ static int reattach_inode(struct bch_fs *c,
struct bch_inode_unpacked *lostfound_inode,
u64 inum)
{
- struct bch_hash_info lostfound_hash_info =
- bch2_hash_info_init(c, lostfound_inode);
- struct bkey_inode_buf packed;
+ struct bch_inode_unpacked inode_u;
char name_buf[20];
struct qstr name;
int ret;
@@ -90,30 +88,14 @@ static int reattach_inode(struct bch_fs *c,
snprintf(name_buf, sizeof(name_buf), "%llu", inum);
name = (struct qstr) QSTR(name_buf);
- lostfound_inode->bi_nlink++;
-
- bch2_inode_pack(&packed, lostfound_inode);
-
- ret = bch2_btree_insert(c, BTREE_ID_INODES, &packed.inode.k_i,
- NULL, NULL,
- BTREE_INSERT_NOFAIL|
- BTREE_INSERT_LAZY_RW);
- if (ret) {
- bch_err(c, "error %i reattaching inode %llu while updating lost+found",
- ret, inum);
- return ret;
- }
+ ret = bch2_trans_do(c, NULL,
+ BTREE_INSERT_ATOMIC|
+ BTREE_INSERT_LAZY_RW,
+ bch2_link_trans(&trans, lostfound_inode->bi_inum,
+ inum, &inode_u, &name));
+ if (ret)
+ bch_err(c, "error %i reattaching inode %llu", ret, inum);
- ret = bch2_dirent_create(c, lostfound_inode->bi_inum,
- &lostfound_hash_info,
- DT_DIR, &name, inum, NULL,
- BTREE_INSERT_NOFAIL|
- BTREE_INSERT_LAZY_RW);
- if (ret) {
- bch_err(c, "error %i reattaching inode %llu while creating new dirent",
- ret, inum);
- return ret;
- }
return ret;
}
@@ -165,6 +147,7 @@ struct hash_check {
static void hash_check_init(struct hash_check *h)
{
h->chain = NULL;
+ h->chain_end = 0;
}
static void hash_stop_chain(struct btree_trans *trans,
@@ -248,7 +231,7 @@ static int hash_check_duplicates(struct btree_trans *trans,
iter = bch2_trans_copy_iter(trans, h->chain);
BUG_ON(IS_ERR(iter));
- for_each_btree_key_continue(iter, 0, k2) {
+ for_each_btree_key_continue(iter, 0, k2, ret) {
if (bkey_cmp(k2.k->p, k.k->p) >= 0)
break;
@@ -393,7 +376,7 @@ static int check_dirent_hash(struct btree_trans *trans, struct hash_check *h,
if (fsck_err(c, "dirent with junk at end, was %s (%zu) now %s (%u)",
buf, strlen(buf), d->v.d_name, len)) {
- bch2_trans_update(trans, BTREE_INSERT_ENTRY(iter, &d->k_i));
+ bch2_trans_update(trans, iter, &d->k_i);
ret = bch2_trans_commit(trans, NULL, NULL,
BTREE_INSERT_NOFAIL|
@@ -458,7 +441,7 @@ static int check_extents(struct bch_fs *c)
iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS,
POS(BCACHEFS_ROOT_INO, 0), 0);
retry:
- for_each_btree_key_continue(iter, 0, k) {
+ for_each_btree_key_continue(iter, 0, k, ret) {
ret = walk_inode(&trans, &w, k.k->p.inode);
if (ret)
break;
@@ -553,7 +536,7 @@ static int check_dirents(struct bch_fs *c)
iter = bch2_trans_get_iter(&trans, BTREE_ID_DIRENTS,
POS(BCACHEFS_ROOT_INO, 0), 0);
retry:
- for_each_btree_key_continue(iter, 0, k) {
+ for_each_btree_key_continue(iter, 0, k, ret) {
struct bkey_s_c_dirent d;
struct bch_inode_unpacked target;
bool have_target;
@@ -663,8 +646,7 @@ retry:
bkey_reassemble(&n->k_i, d.s_c);
n->v.d_type = mode_to_type(target.bi_mode);
- bch2_trans_update(&trans,
- BTREE_INSERT_ENTRY(iter, &n->k_i));
+ bch2_trans_update(&trans, iter, &n->k_i);
ret = bch2_trans_commit(&trans, NULL, NULL,
BTREE_INSERT_NOFAIL|
@@ -707,7 +689,7 @@ static int check_xattrs(struct bch_fs *c)
iter = bch2_trans_get_iter(&trans, BTREE_ID_XATTRS,
POS(BCACHEFS_ROOT_INO, 0), 0);
retry:
- for_each_btree_key_continue(iter, 0, k) {
+ for_each_btree_key_continue(iter, 0, k, ret) {
ret = walk_inode(&trans, &w, k.k->p.inode);
if (ret)
break;
@@ -759,7 +741,7 @@ static int check_root(struct bch_fs *c, struct bch_inode_unpacked *root_inode)
fsck_err:
return ret;
create_root:
- bch2_inode_init(c, root_inode, 0, 0, S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO,
+ bch2_inode_init(c, root_inode, 0, 0, S_IFDIR|0755,
0, NULL);
root_inode->bi_inum = BCACHEFS_ROOT_INO;
@@ -779,7 +761,6 @@ static int check_lostfound(struct bch_fs *c,
struct qstr lostfound = QSTR("lost+found");
struct bch_hash_info root_hash_info =
bch2_hash_info_init(c, root_inode);
- struct bkey_inode_buf packed;
u64 inum;
int ret;
@@ -807,33 +788,20 @@ static int check_lostfound(struct bch_fs *c,
fsck_err:
return ret;
create_lostfound:
- root_inode->bi_nlink++;
-
- bch2_inode_pack(&packed, root_inode);
-
- ret = bch2_btree_insert(c, BTREE_ID_INODES, &packed.inode.k_i,
- NULL, NULL,
- BTREE_INSERT_NOFAIL|
- BTREE_INSERT_LAZY_RW);
+ bch2_inode_init_early(c, lostfound_inode);
+
+ ret = bch2_trans_do(c, NULL,
+ BTREE_INSERT_ATOMIC|
+ BTREE_INSERT_NOFAIL|
+ BTREE_INSERT_LAZY_RW,
+ bch2_create_trans(&trans,
+ BCACHEFS_ROOT_INO, root_inode,
+ lostfound_inode, &lostfound,
+ 0, 0, S_IFDIR|0700, 0, NULL, NULL));
if (ret)
- return ret;
+ bch_err(c, "error creating lost+found: %i", ret);
- bch2_inode_init(c, lostfound_inode, 0, 0, S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO,
- 0, root_inode);
-
- ret = bch2_inode_create(c, lostfound_inode, BLOCKDEV_INODE_MAX, 0,
- &c->unused_inode_hint);
- if (ret)
- return ret;
-
- ret = bch2_dirent_create(c, BCACHEFS_ROOT_INO, &root_hash_info, DT_DIR,
- &lostfound, lostfound_inode->bi_inum, NULL,
- BTREE_INSERT_NOFAIL|
- BTREE_INSERT_LAZY_RW);
- if (ret)
- return ret;
-
- return 0;
+ return ret;
}
struct inode_bitmap {
@@ -995,7 +963,7 @@ up:
iter = bch2_trans_get_iter(&trans, BTREE_ID_INODES, POS_MIN, 0);
retry:
- for_each_btree_key_continue(iter, 0, k) {
+ for_each_btree_key_continue(iter, 0, k, ret) {
if (k.k->type != KEY_TYPE_inode)
continue;
@@ -1021,7 +989,7 @@ retry:
had_unreachable = true;
}
}
- ret = bch2_trans_iter_free(&trans, iter);
+ bch2_trans_iter_free(&trans, iter);
if (ret)
goto err;
@@ -1116,9 +1084,7 @@ static int check_inode_nlink(struct bch_fs *c,
struct nlink *link,
bool *do_update)
{
- u32 i_nlink = u->bi_flags & BCH_INODE_UNLINKED
- ? 0
- : u->bi_nlink + nlink_bias(u->bi_mode);
+ u32 i_nlink = bch2_inode_nlink_get(u);
u32 real_i_nlink =
link->count * nlink_bias(u->bi_mode) +
link->dir_count;
@@ -1197,14 +1163,7 @@ static int check_inode_nlink(struct bch_fs *c,
u->bi_inum, i_nlink, real_i_nlink);
set_i_nlink:
if (i_nlink != real_i_nlink) {
- if (real_i_nlink) {
- u->bi_nlink = real_i_nlink - nlink_bias(u->bi_mode);
- u->bi_flags &= ~BCH_INODE_UNLINKED;
- } else {
- u->bi_nlink = 0;
- u->bi_flags |= BCH_INODE_UNLINKED;
- }
-
+ bch2_inode_nlink_set(u, real_i_nlink);
*do_update = true;
}
fsck_err:
@@ -1302,7 +1261,7 @@ static int check_inode(struct btree_trans *trans,
struct bkey_inode_buf p;
bch2_inode_pack(&p, &u);
- bch2_trans_update(trans, BTREE_INSERT_ENTRY(iter, &p.inode.k_i));
+ bch2_trans_update(trans, iter, &p.inode.k_i);
ret = bch2_trans_commit(trans, NULL, NULL,
BTREE_INSERT_NOFAIL|
diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c
index 05b7f6594113..c0642ff46ba0 100644
--- a/fs/bcachefs/inode.c
+++ b/fs/bcachefs/inode.c
@@ -6,8 +6,7 @@
#include "error.h"
#include "extents.h"
#include "inode.h"
-#include "io.h"
-#include "keylist.h"
+#include "str_hash.h"
#include <linux/random.h>
@@ -96,6 +95,7 @@ void bch2_inode_pack(struct bkey_inode_buf *packed,
u8 *end = (void *) &packed[1];
u8 *last_nonzero_field = out;
unsigned nr_fields = 0, last_nonzero_fieldnr = 0;
+ unsigned bytes;
bkey_inode_init(&packed->inode.k_i);
packed->inode.k.p.inode = inode->bi_inum;
@@ -118,10 +118,9 @@ void bch2_inode_pack(struct bkey_inode_buf *packed,
out = last_nonzero_field;
nr_fields = last_nonzero_fieldnr;
- set_bkey_val_bytes(&packed->inode.k, out - (u8 *) &packed->inode.v);
- memset(out, 0,
- (u8 *) &packed->inode.v +
- bkey_val_bytes(&packed->inode.k) - out);
+ bytes = out - (u8 *) &packed->inode.v;
+ set_bkey_val_bytes(&packed->inode.k, bytes);
+ memset_u64s_tail(&packed->inode.v, 0, bytes);
SET_INODE_NR_FIELDS(&packed->inode.v, nr_fields);
@@ -181,6 +180,53 @@ int bch2_inode_unpack(struct bkey_s_c_inode inode,
return 0;
}
+struct btree_iter *bch2_inode_peek(struct btree_trans *trans,
+ struct bch_inode_unpacked *inode,
+ u64 inum, unsigned flags)
+{
+ struct btree_iter *iter;
+ struct bkey_s_c k;
+ int ret;
+
+ iter = bch2_trans_get_iter(trans, BTREE_ID_INODES, POS(inum, 0),
+ BTREE_ITER_SLOTS|flags);
+ if (IS_ERR(iter))
+ return iter;
+
+ k = bch2_btree_iter_peek_slot(iter);
+ ret = bkey_err(k);
+ if (ret)
+ goto err;
+
+ ret = k.k->type == KEY_TYPE_inode ? 0 : -EIO;
+ if (ret)
+ goto err;
+
+ ret = bch2_inode_unpack(bkey_s_c_to_inode(k), inode);
+ if (ret)
+ goto err;
+
+ return iter;
+err:
+ bch2_trans_iter_put(trans, iter);
+ return ERR_PTR(ret);
+}
+
+int bch2_inode_write(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bch_inode_unpacked *inode)
+{
+ struct bkey_inode_buf *inode_p;
+
+ inode_p = bch2_trans_kmalloc(trans, sizeof(*inode_p));
+ if (IS_ERR(inode_p))
+ return PTR_ERR(inode_p);
+
+ bch2_inode_pack(inode_p, inode);
+ bch2_trans_update(trans, iter, &inode_p->inode.k_i);
+ return 0;
+}
+
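bch2_inode_peek() and bch2_inode_write() are the new low-level pair for transactional inode updates: peek looks the inode up inside the current btree transaction and hands back the iterator, write re-packs the (possibly modified) unpacked inode and queues it as an update on that iterator. A sketch of the intended read-modify-write pattern, illustrative only and not part of this patch (inum and the field being changed are made up; a real caller also retries on -EINTR, e.g. via bch2_trans_do()):

	struct btree_trans trans;
	struct btree_iter *iter;
	struct bch_inode_unpacked inode_u;
	int ret;

	bch2_trans_init(&trans, c, 0, 0);

	iter = bch2_inode_peek(&trans, &inode_u, inum, BTREE_ITER_INTENT);
	ret = PTR_ERR_OR_ZERO(iter);
	if (!ret) {
		inode_u.bi_sectors += 8;	/* example modification */
		ret = bch2_inode_write(&trans, iter, &inode_u) ?:
			bch2_trans_commit(&trans, NULL, NULL,
					  BTREE_INSERT_ATOMIC|
					  BTREE_INSERT_NOFAIL);
	}

	bch2_trans_exit(&trans);

bch2_extent_update() in the io.c hunk below follows exactly this shape when it bumps bi_sectors and bi_size as part of a data write.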
const char *bch2_inode_invalid(const struct bch_fs *c, struct bkey_s_c k)
{
struct bkey_s_c_inode inode = bkey_s_c_to_inode(k);
@@ -251,19 +297,24 @@ void bch2_inode_generation_to_text(struct printbuf *out, struct bch_fs *c,
pr_buf(out, "generation: %u", le32_to_cpu(gen.v->bi_generation));
}
-void bch2_inode_init(struct bch_fs *c, struct bch_inode_unpacked *inode_u,
- uid_t uid, gid_t gid, umode_t mode, dev_t rdev,
- struct bch_inode_unpacked *parent)
+void bch2_inode_init_early(struct bch_fs *c,
+ struct bch_inode_unpacked *inode_u)
{
- s64 now = bch2_current_time(c);
+ enum bch_str_hash_type str_hash =
+ bch2_str_hash_opt_to_type(c, c->opts.str_hash);
memset(inode_u, 0, sizeof(*inode_u));
/* ick */
- inode_u->bi_flags |= c->opts.str_hash << INODE_STR_HASH_OFFSET;
+ inode_u->bi_flags |= str_hash << INODE_STR_HASH_OFFSET;
get_random_bytes(&inode_u->bi_hash_seed,
sizeof(inode_u->bi_hash_seed));
+}
+void bch2_inode_init_late(struct bch_inode_unpacked *inode_u, u64 now,
+ uid_t uid, gid_t gid, umode_t mode, dev_t rdev,
+ struct bch_inode_unpacked *parent)
+{
inode_u->bi_mode = mode;
inode_u->bi_uid = uid;
inode_u->bi_gid = gid;
@@ -273,6 +324,12 @@ void bch2_inode_init(struct bch_fs *c, struct bch_inode_unpacked *inode_u,
inode_u->bi_ctime = now;
inode_u->bi_otime = now;
+ if (parent && parent->bi_mode & S_ISGID) {
+ inode_u->bi_gid = parent->bi_gid;
+ if (S_ISDIR(mode))
+ inode_u->bi_mode |= S_ISGID;
+ }
+
if (parent) {
#define x(_name, ...) inode_u->bi_##_name = parent->bi_##_name;
BCH_INODE_OPTS()
@@ -280,6 +337,15 @@ void bch2_inode_init(struct bch_fs *c, struct bch_inode_unpacked *inode_u,
}
}
+void bch2_inode_init(struct bch_fs *c, struct bch_inode_unpacked *inode_u,
+ uid_t uid, gid_t gid, umode_t mode, dev_t rdev,
+ struct bch_inode_unpacked *parent)
+{
+ bch2_inode_init_early(c, inode_u);
+ bch2_inode_init_late(inode_u, bch2_current_time(c),
+ uid, gid, mode, rdev, parent);
+}
+
static inline u32 bkey_generation(struct bkey_s_c k)
{
switch (k.k->type) {
@@ -292,9 +358,9 @@ static inline u32 bkey_generation(struct bkey_s_c k)
}
}
-int __bch2_inode_create(struct btree_trans *trans,
- struct bch_inode_unpacked *inode_u,
- u64 min, u64 max, u64 *hint)
+int bch2_inode_create(struct btree_trans *trans,
+ struct bch_inode_unpacked *inode_u,
+ u64 min, u64 max, u64 *hint)
{
struct bch_fs *c = trans->c;
struct bkey_inode_buf *inode_p;
@@ -345,8 +411,7 @@ again:
inode_u->bi_generation = bkey_generation(k);
bch2_inode_pack(inode_p, inode_u);
- bch2_trans_update(trans,
- BTREE_INSERT_ENTRY(iter, &inode_p->inode.k_i));
+ bch2_trans_update(trans, iter, &inode_p->inode.k_i);
return 0;
}
}
@@ -361,13 +426,6 @@ out:
return -ENOSPC;
}
-int bch2_inode_create(struct bch_fs *c, struct bch_inode_unpacked *inode_u,
- u64 min, u64 max, u64 *hint)
-{
- return bch2_trans_do(c, NULL, BTREE_INSERT_ATOMIC,
- __bch2_inode_create(&trans, inode_u, min, max, hint));
-}
-
int bch2_inode_rm(struct bch_fs *c, u64 inode_nr)
{
struct btree_trans trans;
@@ -435,8 +493,7 @@ int bch2_inode_rm(struct bch_fs *c, u64 inode_nr)
delete.v.bi_generation = cpu_to_le32(bi_generation);
}
- bch2_trans_update(&trans,
- BTREE_INSERT_ENTRY(iter, &delete.k_i));
+ bch2_trans_update(&trans, iter, &delete.k_i);
ret = bch2_trans_commit(&trans, NULL, NULL,
BTREE_INSERT_ATOMIC|
@@ -452,7 +509,7 @@ int bch2_inode_find_by_inum_trans(struct btree_trans *trans, u64 inode_nr,
{
struct btree_iter *iter;
struct bkey_s_c k;
- int ret = -ENOENT;
+ int ret;
iter = bch2_trans_get_iter(trans, BTREE_ID_INODES,
POS(inode_nr, 0), BTREE_ITER_SLOTS);
@@ -460,8 +517,13 @@ int bch2_inode_find_by_inum_trans(struct btree_trans *trans, u64 inode_nr,
return PTR_ERR(iter);
k = bch2_btree_iter_peek_slot(iter);
- if (k.k->type == KEY_TYPE_inode)
- ret = bch2_inode_unpack(bkey_s_c_to_inode(k), inode);
+ ret = bkey_err(k);
+ if (ret)
+ return ret;
+
+ ret = k.k->type == KEY_TYPE_inode
+ ? bch2_inode_unpack(bkey_s_c_to_inode(k), inode)
+ : -ENOENT;
bch2_trans_iter_put(trans, iter);
diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h
index af0c355f2f04..bb759a46dc41 100644
--- a/fs/bcachefs/inode.h
+++ b/fs/bcachefs/inode.h
@@ -46,14 +46,22 @@ struct bkey_inode_buf {
void bch2_inode_pack(struct bkey_inode_buf *, const struct bch_inode_unpacked *);
int bch2_inode_unpack(struct bkey_s_c_inode, struct bch_inode_unpacked *);
+struct btree_iter *bch2_inode_peek(struct btree_trans *,
+ struct bch_inode_unpacked *, u64, unsigned);
+int bch2_inode_write(struct btree_trans *, struct btree_iter *,
+ struct bch_inode_unpacked *);
+
+void bch2_inode_init_early(struct bch_fs *,
+ struct bch_inode_unpacked *);
+void bch2_inode_init_late(struct bch_inode_unpacked *, u64,
+ uid_t, gid_t, umode_t, dev_t,
+ struct bch_inode_unpacked *);
void bch2_inode_init(struct bch_fs *, struct bch_inode_unpacked *,
uid_t, gid_t, umode_t, dev_t,
struct bch_inode_unpacked *);
-int __bch2_inode_create(struct btree_trans *,
- struct bch_inode_unpacked *,
- u64, u64, u64 *);
-int bch2_inode_create(struct bch_fs *, struct bch_inode_unpacked *,
+int bch2_inode_create(struct btree_trans *,
+ struct bch_inode_unpacked *,
u64, u64, u64 *);
int bch2_inode_rm(struct bch_fs *, u64);
@@ -103,6 +111,63 @@ static inline u64 bch2_inode_opt_get(struct bch_inode_unpacked *inode,
}
}
+static inline struct bch_io_opts
+io_opts(struct bch_fs *c, struct bch_inode_unpacked *inode)
+{
+ struct bch_io_opts opts = bch2_opts_to_inode_opts(c->opts);
+
+ bch2_io_opts_apply(&opts, bch2_inode_opts_get(inode));
+ return opts;
+}
+
+static inline u8 mode_to_type(umode_t mode)
+{
+ return (mode >> 12) & 15;
+}
+
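mode_to_type() extracts the file-type nibble of a mode, which is the same encoding as the DT_* dirent types this patch stores in d_type; for example (standard values, not specific to this patch):

	/* S_IFDIR == 0040000: (S_IFDIR >> 12) & 15 == 4 == DT_DIR */
	/* S_IFREG == 0100000: (S_IFREG >> 12) & 15 == 8 == DT_REG */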
+/* i_nlink: */
+
+static inline unsigned nlink_bias(umode_t mode)
+{
+ return S_ISDIR(mode) ? 2 : 1;
+}
+
+static inline void bch2_inode_nlink_inc(struct bch_inode_unpacked *bi)
+{
+ if (bi->bi_flags & BCH_INODE_UNLINKED)
+ bi->bi_flags &= ~BCH_INODE_UNLINKED;
+ else
+ bi->bi_nlink++;
+}
+
+static inline void bch2_inode_nlink_dec(struct bch_inode_unpacked *bi)
+{
+ BUG_ON(bi->bi_flags & BCH_INODE_UNLINKED);
+ if (bi->bi_nlink)
+ bi->bi_nlink--;
+ else
+ bi->bi_flags |= BCH_INODE_UNLINKED;
+}
+
+static inline unsigned bch2_inode_nlink_get(struct bch_inode_unpacked *bi)
+{
+ return bi->bi_flags & BCH_INODE_UNLINKED
+ ? 0
+ : bi->bi_nlink + nlink_bias(bi->bi_mode);
+}
+
+static inline void bch2_inode_nlink_set(struct bch_inode_unpacked *bi,
+ unsigned nlink)
+{
+ if (nlink) {
+ bi->bi_nlink = nlink - nlink_bias(bi->bi_mode);
+ bi->bi_flags &= ~BCH_INODE_UNLINKED;
+ } else {
+ bi->bi_nlink = 0;
+ bi->bi_flags |= BCH_INODE_UNLINKED;
+ }
+}
+
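The helpers above encode the VFS link count relative to a per-mode bias: bi_nlink stores the count minus nlink_bias() (2 for directories, matching the conventional Unix minimum link count, 1 otherwise), and a count of zero is represented by the BCH_INODE_UNLINKED flag rather than by bi_nlink itself. A worked round trip, illustrative only:

	struct bch_inode_unpacked dir_u = { .bi_mode = S_IFDIR|0755 };

	bch2_inode_nlink_set(&dir_u, 2);	/* empty directory */
	/* dir_u.bi_nlink == 0, BCH_INODE_UNLINKED clear */

	bch2_inode_nlink_inc(&dir_u);		/* subdirectory created */
	/* bch2_inode_nlink_get(&dir_u) == 3 */

	bch2_inode_nlink_set(&dir_u, 0);	/* last link removed */
	/* bi_nlink == 0, BCH_INODE_UNLINKED set, get() returns 0 */

This is the same encoding that check_inode_nlink() in fsck.c now reaches through bch2_inode_nlink_get()/bch2_inode_nlink_set() instead of open-coding.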
#ifdef CONFIG_BCACHEFS_DEBUG
void bch2_inode_pack_test(void);
#else
diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c
index d5c7e45ed5b9..f483312acd0d 100644
--- a/fs/bcachefs/io.c
+++ b/fs/bcachefs/io.c
@@ -8,6 +8,7 @@
#include "bcachefs.h"
#include "alloc_foreground.h"
+#include "bkey_on_stack.h"
#include "bset.h"
#include "btree_update.h"
#include "buckets.h"
@@ -18,7 +19,8 @@
#include "disk_groups.h"
#include "ec.h"
#include "error.h"
-#include "extents.h"
+#include "extent_update.h"
+#include "inode.h"
#include "io.h"
#include "journal.h"
#include "keylist.h"
@@ -122,10 +124,10 @@ void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw)
void bch2_bio_free_pages_pool(struct bch_fs *c, struct bio *bio)
{
+ struct bvec_iter_all iter;
struct bio_vec *bv;
- unsigned i;
- bio_for_each_segment_all(bv, bio, i)
+ bio_for_each_segment_all(bv, bio, iter)
if (bv->bv_page != ZERO_PAGE(0))
mempool_free(bv->bv_page, &c->bio_bounce_pages);
bio->bi_vcnt = 0;
@@ -168,6 +170,262 @@ void bch2_bio_alloc_pages_pool(struct bch_fs *c, struct bio *bio,
mutex_unlock(&c->bio_bounce_pages_lock);
}
+/* Extent update path: */
+
+static int sum_sector_overwrites(struct btree_trans *trans,
+ struct btree_iter *extent_iter,
+ struct bkey_i *new,
+ bool may_allocate,
+ bool *maybe_extending,
+ s64 *delta)
+{
+ struct btree_iter *iter;
+ struct bkey_s_c old;
+ int ret = 0;
+
+ *maybe_extending = true;
+ *delta = 0;
+
+ iter = bch2_trans_copy_iter(trans, extent_iter);
+ if (IS_ERR(iter))
+ return PTR_ERR(iter);
+
+ for_each_btree_key_continue(iter, BTREE_ITER_SLOTS, old, ret) {
+ if (!may_allocate &&
+ bch2_bkey_nr_ptrs_fully_allocated(old) <
+ bch2_bkey_nr_ptrs_allocated(bkey_i_to_s_c(new))) {
+ ret = -ENOSPC;
+ break;
+ }
+
+ *delta += (min(new->k.p.offset,
+ old.k->p.offset) -
+ max(bkey_start_offset(&new->k),
+ bkey_start_offset(old.k))) *
+ (bkey_extent_is_allocation(&new->k) -
+ bkey_extent_is_allocation(old.k));
+
+ if (bkey_cmp(old.k->p, new->k.p) >= 0) {
+ /*
+ * Check if there's already data above where we're
+ * going to be writing to - this means we're definitely
+ * not extending the file:
+ *
+ * Note that it's not sufficient to check if there's
+ * data up to the sector offset we're going to be
+ * writing to, because i_size could be up to one block
+ * less:
+ */
+ if (!bkey_cmp(old.k->p, new->k.p))
+ old = bch2_btree_iter_next(iter);
+
+ if (old.k && !bkey_err(old) &&
+ old.k->p.inode == extent_iter->pos.inode &&
+ bkey_extent_is_data(old.k))
+ *maybe_extending = false;
+
+ break;
+ }
+ }
+
+ bch2_trans_iter_put(trans, iter);
+ return ret;
+}
+
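sum_sector_overwrites() walks the existing keys overlapped by the new extent and accumulates the change in allocated sectors into *delta: each overlap contributes its length times the difference in "is an allocation" between the new key and the old one. A worked example, offsets in 512-byte sectors and purely illustrative: writing an allocated extent over sectors 0..16 on top of an existing allocated extent covering 0..8 and a hole covering 8..16 gives

	overlap [0, 8):   8 * (1 - 1) = 0
	overlap [8, 16):  8 * (1 - 0) = 8	=> *delta = 8

so bch2_extent_update() below grows bi_sectors only by the part of the write that is newly allocated. The -ENOSPC branch refuses the update when the caller passed no disk reservation (may_allocate false) and an overwritten key has fewer fully allocated pointers than the new key has allocated pointers.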
+int bch2_extent_update(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bkey_i *k,
+ struct disk_reservation *disk_res,
+ u64 *journal_seq,
+ u64 new_i_size,
+ s64 *i_sectors_delta)
+{
+ /* this must live until after bch2_trans_commit(): */
+ struct bkey_inode_buf inode_p;
+ bool extending = false;
+ s64 delta = 0;
+ int ret;
+
+ ret = bch2_extent_trim_atomic(k, iter);
+ if (ret)
+ return ret;
+
+ ret = sum_sector_overwrites(trans, iter, k,
+ disk_res && disk_res->sectors != 0,
+ &extending, &delta);
+ if (ret)
+ return ret;
+
+ new_i_size = extending
+ ? min(k->k.p.offset << 9, new_i_size)
+ : 0;
+
+ if (delta || new_i_size) {
+ struct btree_iter *inode_iter;
+ struct bch_inode_unpacked inode_u;
+
+ inode_iter = bch2_inode_peek(trans, &inode_u,
+ k->k.p.inode, BTREE_ITER_INTENT);
+ if (IS_ERR(inode_iter))
+ return PTR_ERR(inode_iter);
+
+ /*
+ * XXX:
+ * writeback can race a bit with truncate, because truncate
+ * first updates the inode then truncates the pagecache. This is
+ * ugly, but lets us preserve the invariant that the in memory
+ * i_size is always >= the on disk i_size.
+ *
+ BUG_ON(new_i_size > inode_u.bi_size &&
+ (inode_u.bi_flags & BCH_INODE_I_SIZE_DIRTY));
+ */
+ BUG_ON(new_i_size > inode_u.bi_size && !extending);
+
+ if (!(inode_u.bi_flags & BCH_INODE_I_SIZE_DIRTY) &&
+ new_i_size > inode_u.bi_size)
+ inode_u.bi_size = new_i_size;
+ else
+ new_i_size = 0;
+
+ inode_u.bi_sectors += delta;
+
+ if (delta || new_i_size) {
+ bch2_inode_pack(&inode_p, &inode_u);
+ bch2_trans_update(trans, inode_iter,
+ &inode_p.inode.k_i);
+ }
+
+ bch2_trans_iter_put(trans, inode_iter);
+ }
+
+ bch2_trans_update(trans, iter, k);
+
+ ret = bch2_trans_commit(trans, disk_res, journal_seq,
+ BTREE_INSERT_NOCHECK_RW|
+ BTREE_INSERT_NOFAIL|
+ BTREE_INSERT_ATOMIC|
+ BTREE_INSERT_USE_RESERVE);
+ if (!ret && i_sectors_delta)
+ *i_sectors_delta += delta;
+
+ return ret;
+}
+
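One subtlety in bch2_extent_update() above: new_i_size is clamped to the end of the extent actually being written (k->k.p.offset << 9 converts the key's end sector to bytes) and is only applied when the write might be extending the file, i.e. when sum_sector_overwrites() found no allocated data past it. With illustrative numbers, a write whose key ends at sector 100 and a caller-supplied new_i_size of 51000 advances bi_size to

	min(100 << 9, 51000) = min(51200, 51000) = 51000

provided bi_size was smaller and the inode's BCH_INODE_I_SIZE_DIRTY flag is not set.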
+int bch2_fpunch_at(struct btree_trans *trans, struct btree_iter *iter,
+ struct bpos end, u64 *journal_seq,
+ s64 *i_sectors_delta)
+{
+ struct bch_fs *c = trans->c;
+ unsigned max_sectors = KEY_SIZE_MAX & (~0 << c->block_bits);
+ struct bkey_s_c k;
+ int ret = 0, ret2 = 0;
+
+ while ((k = bch2_btree_iter_peek(iter)).k &&
+ bkey_cmp(iter->pos, end) < 0) {
+ struct disk_reservation disk_res =
+ bch2_disk_reservation_init(c, 0);
+ struct bkey_i delete;
+
+ ret = bkey_err(k);
+ if (ret)
+ goto btree_err;
+
+ bkey_init(&delete.k);
+ delete.k.p = iter->pos;
+
+ /* create the biggest key we can */
+ bch2_key_resize(&delete.k, max_sectors);
+ bch2_cut_back(end, &delete);
+
+ bch2_trans_begin_updates(trans);
+
+ ret = bch2_extent_update(trans, iter, &delete,
+ &disk_res, journal_seq,
+ 0, i_sectors_delta);
+ bch2_disk_reservation_put(c, &disk_res);
+btree_err:
+ if (ret == -EINTR) {
+ ret2 = ret;
+ ret = 0;
+ }
+ if (ret)
+ break;
+ }
+
+ if (bkey_cmp(iter->pos, end) > 0) {
+ bch2_btree_iter_set_pos(iter, end);
+ ret = bch2_btree_iter_traverse(iter);
+ }
+
+ return ret ?: ret2;
+}
+
+int bch2_fpunch(struct bch_fs *c, u64 inum, u64 start, u64 end,
+ u64 *journal_seq, s64 *i_sectors_delta)
+{
+ struct btree_trans trans;
+ struct btree_iter *iter;
+ int ret = 0;
+
+ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024);
+ iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS,
+ POS(inum, start),
+ BTREE_ITER_INTENT);
+
+ ret = bch2_fpunch_at(&trans, iter, POS(inum, end),
+ journal_seq, i_sectors_delta);
+ bch2_trans_exit(&trans);
+
+ if (ret == -EINTR)
+ ret = 0;
+
+ return ret;
+}
+
+int bch2_write_index_default(struct bch_write_op *op)
+{
+ struct bch_fs *c = op->c;
+ struct bkey_on_stack sk;
+ struct keylist *keys = &op->insert_keys;
+ struct bkey_i *k = bch2_keylist_front(keys);
+ struct btree_trans trans;
+ struct btree_iter *iter;
+ int ret;
+
+ bkey_on_stack_init(&sk);
+ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024);
+
+ iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS,
+ bkey_start_pos(&k->k),
+ BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
+
+ do {
+ k = bch2_keylist_front(keys);
+
+ bkey_on_stack_realloc(&sk, c, k->k.u64s);
+ bkey_copy(sk.k, k);
+ bch2_cut_front(iter->pos, sk.k);
+
+ bch2_trans_begin_updates(&trans);
+
+ ret = bch2_extent_update(&trans, iter, sk.k,
+ &op->res, op_journal_seq(op),
+ op->new_i_size, &op->i_sectors_delta);
+ if (ret == -EINTR)
+ continue;
+ if (ret)
+ break;
+
+ if (bkey_cmp(iter->pos, k->k.p) >= 0)
+ bch2_keylist_pop_front(keys);
+ } while (!bch2_keylist_empty(keys));
+
+ bch2_trans_exit(&trans);
+ bkey_on_stack_exit(&sk, c);
+
+ return ret;
+}
+
/* Writes */
void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c,
@@ -243,60 +501,12 @@ static void bch2_write_done(struct closure *cl)
bch2_time_stats_update(&c->times[BCH_TIME_data_write], op->start_time);
- closure_return(cl);
-}
-
-int bch2_write_index_default(struct bch_write_op *op)
-{
- struct bch_fs *c = op->c;
- struct btree_trans trans;
- struct btree_iter *iter;
- struct keylist *keys = &op->insert_keys;
- int ret;
-
- BUG_ON(bch2_keylist_empty(keys));
- bch2_verify_keylist_sorted(keys);
-
- bch2_trans_init(&trans, c, BTREE_ITER_MAX, 256);
-retry:
- bch2_trans_begin(&trans);
-
- iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS,
- bkey_start_pos(&bch2_keylist_front(keys)->k),
- BTREE_ITER_INTENT);
-
- do {
- BKEY_PADDED(k) split;
-
- bkey_copy(&split.k, bch2_keylist_front(keys));
-
- ret = bch2_extent_trim_atomic(&split.k, iter);
- if (ret)
- break;
-
- bch2_trans_update(&trans,
- BTREE_INSERT_ENTRY(iter, &split.k));
-
- ret = bch2_trans_commit(&trans, &op->res, op_journal_seq(op),
- BTREE_INSERT_NOFAIL|
- BTREE_INSERT_USE_RESERVE);
- if (ret)
- break;
-
- if (bkey_cmp(iter->pos, bch2_keylist_front(keys)->k.p) < 0)
- bch2_cut_front(iter->pos, bch2_keylist_front(keys));
- else
- bch2_keylist_pop_front(keys);
- } while (!bch2_keylist_empty(keys));
-
- if (ret == -EINTR) {
- ret = 0;
- goto retry;
- }
-
- bch2_trans_exit(&trans);
-
- return ret;
+ if (op->end_io)
+ op->end_io(op);
+ if (cl->parent)
+ closure_return(cl);
+ else
+ closure_debug_destroy(cl);
}
/**
@@ -313,16 +523,19 @@ static void __bch2_write_index(struct bch_write_op *op)
for (src = keys->keys; src != keys->top; src = n) {
n = bkey_next(src);
- bkey_copy(dst, src);
- bch2_bkey_drop_ptrs(bkey_i_to_s(dst), ptr,
- test_bit(ptr->dev, op->failed.d));
+ if (bkey_extent_is_direct_data(&src->k)) {
+ bch2_bkey_drop_ptrs(bkey_i_to_s(src), ptr,
+ test_bit(ptr->dev, op->failed.d));
- if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(dst))) {
- ret = -EIO;
- goto err;
+ if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(src))) {
+ ret = -EIO;
+ goto err;
+ }
}
+ if (dst != src)
+ memmove_u64s_down(dst, src, src->u64s);
dst = bkey_next(dst);
}
@@ -340,6 +553,7 @@ static void __bch2_write_index(struct bch_write_op *op)
u64 sectors_start = keylist_sectors(keys);
int ret = op->index_update_fn(op);
+ BUG_ON(ret == -EINTR);
BUG_ON(keylist_sectors(keys) && !ret);
op->written += sectors_start - keylist_sectors(keys);
@@ -404,8 +618,10 @@ static void bch2_write_endio(struct bio *bio)
if (parent)
bio_endio(&parent->bio);
- else
+ else if (!(op->flags & BCH_WRITE_SKIP_CLOSURE_PUT))
closure_put(cl);
+ else
+ continue_at_nobarrier(cl, bch2_write_index, index_update_wq(op));
}
static void init_append_extent(struct bch_write_op *op,
@@ -414,27 +630,36 @@ static void init_append_extent(struct bch_write_op *op,
struct bch_extent_crc_unpacked crc)
{
struct bch_fs *c = op->c;
- struct bkey_i_extent *e = bkey_extent_init(op->insert_keys.top);
- struct extent_ptr_decoded p = { .crc = crc };
+ struct bkey_i_extent *e;
struct open_bucket *ob;
unsigned i;
+ BUG_ON(crc.compressed_size > wp->sectors_free);
+ wp->sectors_free -= crc.compressed_size;
op->pos.offset += crc.uncompressed_size;
+
+ e = bkey_extent_init(op->insert_keys.top);
e->k.p = op->pos;
e->k.size = crc.uncompressed_size;
e->k.version = version;
- BUG_ON(crc.compressed_size > wp->sectors_free);
- wp->sectors_free -= crc.compressed_size;
+ if (crc.csum_type ||
+ crc.compression_type ||
+ crc.nonce)
+ bch2_extent_crc_append(&e->k_i, crc);
open_bucket_for_each(c, &wp->ptrs, ob, i) {
struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev);
+ union bch_extent_entry *end =
+ bkey_val_end(bkey_i_to_s(&e->k_i));
- p.ptr = ob->ptr;
- p.ptr.cached = !ca->mi.durability ||
+ end->ptr = ob->ptr;
+ end->ptr.type = 1 << BCH_EXTENT_ENTRY_ptr;
+ end->ptr.cached = !ca->mi.durability ||
(op->flags & BCH_WRITE_CACHED) != 0;
- p.ptr.offset += ca->mi.bucket_size - ob->sectors_free;
- bch2_extent_ptr_decoded_append(&e->k_i, &p);
+ end->ptr.offset += ca->mi.bucket_size - ob->sectors_free;
+
+ e->k.u64s++;
BUG_ON(crc.compressed_size > ob->sectors_free);
ob->sectors_free -= crc.compressed_size;
@@ -615,15 +840,14 @@ static enum prep_encoded_ret {
return PREP_ENCODED_OK;
}
-static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp)
+static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp,
+ struct bio **_dst)
{
struct bch_fs *c = op->c;
struct bio *src = &op->wbio.bio, *dst = src;
struct bvec_iter saved_iter;
- struct bkey_i *key_to_write;
void *ec_buf;
- unsigned key_to_write_offset = op->insert_keys.top_p -
- op->insert_keys.keys_p;
+ struct bpos ec_pos = op->pos;
unsigned total_output = 0, total_input = 0;
bool bounce = false;
bool page_alloc_failed = false;
@@ -642,6 +866,7 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp)
case PREP_ENCODED_CHECKSUM_ERR:
goto csum_err;
case PREP_ENCODED_DO_WRITE:
+ /* XXX look for bug here */
if (ec_buf) {
dst = bch2_write_bio_alloc(c, wp, src,
&page_alloc_failed,
@@ -791,21 +1016,9 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp)
dst->bi_iter.bi_size = total_output;
do_write:
/* might have done a realloc... */
+ bch2_ec_add_backpointer(c, wp, ec_pos, total_input >> 9);
- key_to_write = (void *) (op->insert_keys.keys_p + key_to_write_offset);
-
- bch2_ec_add_backpointer(c, wp,
- bkey_start_pos(&key_to_write->k),
- total_input >> 9);
-
- dst->bi_end_io = bch2_write_endio;
- dst->bi_private = &op->cl;
- bio_set_op_attrs(dst, REQ_OP_WRITE, 0);
-
- closure_get(dst->bi_private);
-
- bch2_submit_wbio_replicas(to_wbio(dst), c, BCH_DATA_USER,
- key_to_write);
+ *_dst = dst;
return more;
csum_err:
bch_err(c, "error verifying existing checksum while "
@@ -825,11 +1038,17 @@ static void __bch2_write(struct closure *cl)
struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
struct bch_fs *c = op->c;
struct write_point *wp;
+ struct bio *bio;
+ bool skip_put = true;
int ret;
again:
memset(&op->failed, 0, sizeof(op->failed));
do {
+ struct bkey_i *key_to_write;
+ unsigned key_to_write_offset = op->insert_keys.top_p -
+ op->insert_keys.keys_p;
+
/* +1 for possible cache device: */
if (op->open_buckets.nr + op->nr_replicas + 1 >
ARRAY_SIZE(op->open_buckets.v))
@@ -862,23 +1081,39 @@ again:
goto flush_io;
}
- ret = bch2_write_extent(op, wp);
-
bch2_open_bucket_get(c, wp, &op->open_buckets);
+ ret = bch2_write_extent(op, wp, &bio);
bch2_alloc_sectors_done(c, wp);
if (ret < 0)
goto err;
+
+ if (ret)
+ skip_put = false;
+
+ bio->bi_end_io = bch2_write_endio;
+ bio->bi_private = &op->cl;
+ bio->bi_opf |= REQ_OP_WRITE;
+
+ if (!skip_put)
+ closure_get(bio->bi_private);
+ else
+ op->flags |= BCH_WRITE_SKIP_CLOSURE_PUT;
+
+ key_to_write = (void *) (op->insert_keys.keys_p +
+ key_to_write_offset);
+
+ bch2_submit_wbio_replicas(to_wbio(bio), c, BCH_DATA_USER,
+ key_to_write);
} while (ret);
- continue_at(cl, bch2_write_index, index_update_wq(op));
+ if (!skip_put)
+ continue_at(cl, bch2_write_index, index_update_wq(op));
return;
err:
op->error = ret;
- continue_at(cl, !bch2_keylist_empty(&op->insert_keys)
- ? bch2_write_index
- : bch2_write_done, index_update_wq(op));
+ continue_at(cl, bch2_write_index, index_update_wq(op));
return;
flush_io:
closure_sync(cl);
@@ -895,6 +1130,47 @@ flush_io:
goto again;
}
+static void bch2_write_data_inline(struct bch_write_op *op, unsigned data_len)
+{
+ struct closure *cl = &op->cl;
+ struct bio *bio = &op->wbio.bio;
+ struct bvec_iter iter;
+ struct bkey_i_inline_data *id;
+ unsigned sectors;
+ int ret;
+
+ ret = bch2_keylist_realloc(&op->insert_keys, op->inline_keys,
+ ARRAY_SIZE(op->inline_keys),
+ BKEY_U64s + DIV_ROUND_UP(data_len, 8));
+ if (ret) {
+ op->error = ret;
+ goto err;
+ }
+
+ sectors = bio_sectors(bio);
+ op->pos.offset += sectors;
+
+ id = bkey_inline_data_init(op->insert_keys.top);
+ id->k.p = op->pos;
+ id->k.version = op->version;
+ id->k.size = sectors;
+
+ iter = bio->bi_iter;
+ iter.bi_size = data_len;
+ memcpy_from_bio(id->v.data, bio, iter);
+
+ while (data_len & 7)
+ id->v.data[data_len++] = '\0';
+ set_bkey_val_bytes(&id->k, data_len);
+ bch2_keylist_push(&op->insert_keys);
+
+ op->flags |= BCH_WRITE_WROTE_DATA_INLINE;
+ continue_at_nobarrier(cl, bch2_write_index, NULL);
+ return;
+err:
+ bch2_write_done(&op->cl);
+}
+
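bch2_write() further down in this hunk diverts sufficiently small writes into bch2_write_data_inline(): the payload is copied straight into a KEY_TYPE_inline_data value, zero-padded to a multiple of 8 bytes, and the allocation/extent path is skipped entirely. The cutoff and padding, worked through for an assumed 4096-byte block size (illustrative, not from this patch):

	cutoff         = min(block_bytes(c) / 2, 1024) = min(2048, 1024) = 1024 bytes
	100-byte write => inline_data value padded to 104 bytes (next multiple of 8)

The BCH_FEATURE_INLINE_DATA superblock feature bit is set during recovery (see the recovery.c hunk below) the first time a filesystem runs with this code.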
/**
* bch_write - handle a write to a cache device or flash only volume
*
@@ -916,22 +1192,22 @@ void bch2_write(struct closure *cl)
struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
struct bio *bio = &op->wbio.bio;
struct bch_fs *c = op->c;
+ unsigned data_len;
BUG_ON(!op->nr_replicas);
BUG_ON(!op->write_point.v);
BUG_ON(!bkey_cmp(op->pos, POS_MAX));
+ op->start_time = local_clock();
+ bch2_keylist_init(&op->insert_keys, op->inline_keys);
+ wbio_init(bio)->put_bio = false;
+
if (bio_sectors(bio) & (c->opts.block_size - 1)) {
__bcache_io_error(c, "misaligned write");
op->error = -EIO;
goto err;
}
- op->start_time = local_clock();
-
- bch2_keylist_init(&op->insert_keys, op->inline_keys);
- wbio_init(bio)->put_bio = false;
-
if (c->opts.nochanges ||
!percpu_ref_tryget(&c->writes)) {
__bcache_io_error(c, "read only");
@@ -941,12 +1217,25 @@ void bch2_write(struct closure *cl)
bch2_increment_clock(c, bio_sectors(bio), WRITE);
+ data_len = min_t(u64, bio->bi_iter.bi_size,
+ op->new_i_size - (op->pos.offset << 9));
+
+ if (data_len <= min(block_bytes(c) / 2, 1024U)) {
+ bch2_write_data_inline(op, data_len);
+ return;
+ }
+
continue_at_nobarrier(cl, __bch2_write, NULL);
return;
err:
if (!(op->flags & BCH_WRITE_NOPUT_RESERVATION))
bch2_disk_reservation_put(c, &op->res);
- closure_return(cl);
+ if (op->end_io)
+ op->end_io(op);
+ if (cl->parent)
+ closure_return(cl);
+ else
+ closure_debug_destroy(cl);
}
/* Cache promotion on read */
@@ -1042,7 +1331,6 @@ static void promote_start(struct promote_op *op, struct bch_read_bio *rbio)
closure_return_with_destructor(cl, promote_done);
}
-noinline
static struct promote_op *__promote_alloc(struct bch_fs *c,
enum btree_id btree_id,
struct bpos pos,
@@ -1116,7 +1404,8 @@ err:
return NULL;
}
-static inline struct promote_op *promote_alloc(struct bch_fs *c,
+noinline
+static struct promote_op *promote_alloc(struct bch_fs *c,
struct bvec_iter iter,
struct bkey_s_c k,
struct extent_ptr_decoded *pick,
@@ -1228,12 +1517,14 @@ static void bch2_read_retry_nodecode(struct bch_fs *c, struct bch_read_bio *rbio
{
struct btree_trans trans;
struct btree_iter *iter;
- BKEY_PADDED(k) tmp;
+ struct bkey_on_stack sk;
struct bkey_s_c k;
int ret;
flags &= ~BCH_READ_LAST_FRAGMENT;
+ flags |= BCH_READ_MUST_CLONE;
+ bkey_on_stack_init(&sk);
bch2_trans_init(&trans, c, 0, 0);
iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS,
@@ -1245,11 +1536,11 @@ retry:
if (bkey_err(k))
goto err;
- bkey_reassemble(&tmp.k, k);
- k = bkey_i_to_s_c(&tmp.k);
+ bkey_on_stack_reassemble(&sk, c, k);
+ k = bkey_i_to_s_c(sk.k);
bch2_trans_unlock(&trans);
- if (!bch2_bkey_matches_ptr(c, bkey_i_to_s_c(&tmp.k),
+ if (!bch2_bkey_matches_ptr(c, k,
rbio->pick.ptr,
rbio->pos.offset -
rbio->pick.crc.offset)) {
@@ -1266,6 +1557,7 @@ retry:
out:
bch2_rbio_done(rbio);
bch2_trans_exit(&trans);
+ bkey_on_stack_exit(&sk, c);
return;
err:
rbio->bio.bi_status = BLK_STS_IOERR;
@@ -1278,12 +1570,14 @@ static void bch2_read_retry(struct bch_fs *c, struct bch_read_bio *rbio,
{
struct btree_trans trans;
struct btree_iter *iter;
+ struct bkey_on_stack sk;
struct bkey_s_c k;
int ret;
flags &= ~BCH_READ_LAST_FRAGMENT;
flags |= BCH_READ_MUST_CLONE;
+ bkey_on_stack_init(&sk);
bch2_trans_init(&trans, c, 0, 0);
retry:
bch2_trans_begin(&trans);
@@ -1291,18 +1585,17 @@ retry:
for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS,
POS(inode, bvec_iter.bi_sector),
BTREE_ITER_SLOTS, k, ret) {
- BKEY_PADDED(k) tmp;
unsigned bytes, sectors, offset_into_extent;
- bkey_reassemble(&tmp.k, k);
- k = bkey_i_to_s_c(&tmp.k);
+ bkey_on_stack_reassemble(&sk, c, k);
+ k = bkey_i_to_s_c(sk.k);
offset_into_extent = iter->pos.offset -
bkey_start_offset(k.k);
sectors = k.k->size - offset_into_extent;
ret = bch2_read_indirect_extent(&trans,
- &offset_into_extent, &tmp.k);
+ &offset_into_extent, sk.k);
if (ret)
break;
@@ -1329,6 +1622,8 @@ retry:
bio_advance_iter(&rbio->bio, &bvec_iter, bytes);
}
+ if (ret == -EINTR)
+ goto retry;
/*
* If we get here, it better have been because there was an error
* reading a btree node
@@ -1339,6 +1634,7 @@ err:
rbio->bio.bi_status = BLK_STS_IOERR;
out:
bch2_trans_exit(&trans);
+ bkey_on_stack_exit(&sk, c);
bch2_rbio_done(rbio);
}
@@ -1395,7 +1691,7 @@ static void bch2_rbio_narrow_crcs(struct bch_read_bio *rbio)
struct btree_trans trans;
struct btree_iter *iter;
struct bkey_s_c k;
- BKEY_PADDED(k) new;
+ struct bkey_on_stack new;
struct bch_extent_crc_unpacked new_crc;
u64 data_offset = rbio->pos.offset - rbio->pick.crc.offset;
int ret;
@@ -1403,18 +1699,19 @@ static void bch2_rbio_narrow_crcs(struct bch_read_bio *rbio)
if (rbio->pick.crc.compression_type)
return;
+ bkey_on_stack_init(&new);
bch2_trans_init(&trans, c, 0, 0);
retry:
bch2_trans_begin(&trans);
iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, rbio->pos,
- BTREE_ITER_INTENT);
- k = bch2_btree_iter_peek(iter);
+ BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
+ k = bch2_btree_iter_peek_slot(iter);
if (IS_ERR_OR_NULL(k.k))
goto out;
- bkey_reassemble(&new.k, k);
- k = bkey_i_to_s_c(&new.k);
+ bkey_on_stack_reassemble(&new, c, k);
+ k = bkey_i_to_s_c(new.k);
if (bversion_cmp(k.k->version, rbio->version) ||
!bch2_bkey_matches_ptr(c, k, rbio->pick.ptr, data_offset))
@@ -1433,10 +1730,10 @@ retry:
goto out;
}
- if (!bch2_bkey_narrow_crcs(&new.k, new_crc))
+ if (!bch2_bkey_narrow_crcs(new.k, new_crc))
goto out;
- bch2_trans_update(&trans, BTREE_INSERT_ENTRY(iter, &new.k));
+ bch2_trans_update(&trans, iter, new.k);
ret = bch2_trans_commit(&trans, NULL, NULL,
BTREE_INSERT_ATOMIC|
BTREE_INSERT_NOFAIL|
@@ -1445,6 +1742,7 @@ retry:
goto retry;
out:
bch2_trans_exit(&trans);
+ bkey_on_stack_exit(&new, c);
}
/* Inner part that may run in process context */
@@ -1602,9 +1900,9 @@ int __bch2_read_indirect_extent(struct btree_trans *trans,
reflink_offset = le64_to_cpu(bkey_i_to_reflink_p(orig_k)->v.idx) +
*offset_into_extent;
- iter = __bch2_trans_get_iter(trans, BTREE_ID_REFLINK,
- POS(0, reflink_offset),
- BTREE_ITER_SLOTS, 1);
+ iter = bch2_trans_get_iter(trans, BTREE_ID_REFLINK,
+ POS(0, reflink_offset),
+ BTREE_ITER_SLOTS);
ret = PTR_ERR_OR_ZERO(iter);
if (ret)
return ret;
@@ -1641,6 +1939,19 @@ int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig,
struct bpos pos = bkey_start_pos(k.k);
int pick_ret;
+ if (k.k->type == KEY_TYPE_inline_data) {
+ struct bkey_s_c_inline_data d = bkey_s_c_to_inline_data(k);
+ unsigned bytes = min_t(unsigned, iter.bi_size,
+ bkey_val_bytes(d.k));
+
+ swap(iter.bi_size, bytes);
+ memcpy_to_bio(&orig->bio, iter, d.v->data);
+ swap(iter.bi_size, bytes);
+ bio_advance_iter(&orig->bio, &iter, bytes);
+ zero_fill_bio_iter(&orig->bio, iter);
+ goto out_read_done;
+ }
+
pick_ret = bch2_bkey_pick_read_device(c, k, failed, &pick);
/* hole or reservation - just zero fill: */
@@ -1677,7 +1988,7 @@ int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig,
if (narrow_crcs && (flags & BCH_READ_USER_MAPPED))
flags |= BCH_READ_MUST_BOUNCE;
- BUG_ON(offset_into_extent + bvec_iter_sectors(iter) > k.k->size);
+ EBUG_ON(offset_into_extent + bvec_iter_sectors(iter) > k.k->size);
if (pick.crc.compression_type != BCH_COMPRESSION_NONE ||
(pick.crc.csum_type != BCH_CSUM_NONE &&
@@ -1689,8 +2000,9 @@ int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig,
bounce = true;
}
- promote = promote_alloc(c, iter, k, &pick, orig->opts, flags,
- &rbio, &bounce, &read_full);
+ if (orig->opts.promote_target)
+ promote = promote_alloc(c, iter, k, &pick, orig->opts, flags,
+ &rbio, &bounce, &read_full);
if (!read_full) {
EBUG_ON(pick.crc.compression_type);
@@ -1718,7 +2030,7 @@ int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig,
* data in the write path, but we're not going to use it all
* here:
*/
- BUG_ON(rbio->bio.bi_iter.bi_size <
+ EBUG_ON(rbio->bio.bi_iter.bi_size <
pick.crc.compressed_size << 9);
rbio->bio.bi_iter.bi_size =
pick.crc.compressed_size << 9;
@@ -1751,10 +2063,10 @@ int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig,
noclone:
rbio = orig;
rbio->bio.bi_iter = iter;
- BUG_ON(bio_flagged(&rbio->bio, BIO_CHAIN));
+ EBUG_ON(bio_flagged(&rbio->bio, BIO_CHAIN));
}
- BUG_ON(bio_sectors(&rbio->bio) != pick.crc.compressed_size);
+ EBUG_ON(bio_sectors(&rbio->bio) != pick.crc.compressed_size);
rbio->c = c;
rbio->submit_time = local_clock();
@@ -1770,6 +2082,7 @@ noclone:
rbio->hole = 0;
rbio->retry = 0;
rbio->context = 0;
+ /* XXX: only initialize this if needed */
rbio->devs_have = bch2_bkey_devs(k);
rbio->pick = pick;
rbio->pos = pos;
@@ -1786,11 +2099,11 @@ noclone:
bch2_increment_clock(c, bio_sectors(&rbio->bio), READ);
- percpu_down_read(&c->mark_lock);
+ rcu_read_lock();
bucket_io_clock_reset(c, ca, PTR_BUCKET_NR(ca, &pick.ptr), READ);
- percpu_up_read(&c->mark_lock);
+ rcu_read_unlock();
- if (likely(!(flags & (BCH_READ_IN_RETRY|BCH_READ_LAST_FRAGMENT)))) {
+ if (!(flags & (BCH_READ_IN_RETRY|BCH_READ_LAST_FRAGMENT))) {
bio_inc_remaining(&orig->bio);
trace_read_split(&orig->bio);
}
@@ -1867,14 +2180,13 @@ void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, u64 inode)
{
struct btree_trans trans;
struct btree_iter *iter;
+ struct bkey_on_stack sk;
struct bkey_s_c k;
unsigned flags = BCH_READ_RETRY_IF_STALE|
BCH_READ_MAY_PROMOTE|
BCH_READ_USER_MAPPED;
int ret;
- bch2_trans_init(&trans, c, 0, 0);
-
BUG_ON(rbio->_state);
BUG_ON(flags & BCH_READ_NODECODE);
BUG_ON(flags & BCH_READ_IN_RETRY);
@@ -1882,12 +2194,15 @@ void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, u64 inode)
rbio->c = c;
rbio->start_time = local_clock();
+ bkey_on_stack_init(&sk);
+ bch2_trans_init(&trans, c, 0, 0);
+retry:
+ bch2_trans_begin(&trans);
+
iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS,
POS(inode, rbio->bio.bi_iter.bi_sector),
BTREE_ITER_SLOTS);
-
while (1) {
- BKEY_PADDED(k) tmp;
unsigned bytes, sectors, offset_into_extent;
bch2_btree_iter_set_pos(iter,
@@ -1898,15 +2213,15 @@ void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, u64 inode)
if (ret)
goto err;
- bkey_reassemble(&tmp.k, k);
- k = bkey_i_to_s_c(&tmp.k);
-
offset_into_extent = iter->pos.offset -
bkey_start_offset(k.k);
sectors = k.k->size - offset_into_extent;
+ bkey_on_stack_reassemble(&sk, c, k);
+ k = bkey_i_to_s_c(sk.k);
+
ret = bch2_read_indirect_extent(&trans,
- &offset_into_extent, &tmp.k);
+ &offset_into_extent, sk.k);
if (ret)
goto err;
@@ -1938,8 +2253,12 @@ void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, u64 inode)
}
out:
bch2_trans_exit(&trans);
+ bkey_on_stack_exit(&sk, c);
return;
err:
+ if (ret == -EINTR)
+ goto retry;
+
bcache_io_error(c, &rbio->bio, "btree IO error: %i", ret);
bch2_rbio_done(rbio);
goto out;
diff --git a/fs/bcachefs/io.h b/fs/bcachefs/io.h
index 80b72dbf1a0c..45c950942d78 100644
--- a/fs/bcachefs/io.h
+++ b/fs/bcachefs/io.h
@@ -30,9 +30,11 @@ enum bch_write_flags {
BCH_WRITE_PAGES_OWNED = (1 << 5),
BCH_WRITE_ONLY_SPECIFIED_DEVS = (1 << 6),
BCH_WRITE_NOPUT_RESERVATION = (1 << 7),
+ BCH_WRITE_WROTE_DATA_INLINE = (1 << 8),
/* Internal: */
- BCH_WRITE_JOURNAL_SEQ_PTR = (1 << 8),
+ BCH_WRITE_JOURNAL_SEQ_PTR = (1 << 9),
+ BCH_WRITE_SKIP_CLOSURE_PUT = (1 << 10),
};
static inline u64 *op_journal_seq(struct bch_write_op *op)
@@ -54,13 +56,20 @@ static inline struct workqueue_struct *index_update_wq(struct bch_write_op *op)
: op->c->wq;
}
+int bch2_extent_update(struct btree_trans *, struct btree_iter *,
+ struct bkey_i *, struct disk_reservation *,
+ u64 *, u64, s64 *);
+int bch2_fpunch_at(struct btree_trans *, struct btree_iter *,
+ struct bpos, u64 *, s64 *);
+int bch2_fpunch(struct bch_fs *c, u64, u64, u64, u64 *, s64 *);
+
int bch2_write_index_default(struct bch_write_op *);
static inline void bch2_write_op_init(struct bch_write_op *op, struct bch_fs *c,
struct bch_io_opts opts)
{
op->c = c;
- op->io_wq = index_update_wq(op);
+ op->end_io = NULL;
op->flags = 0;
op->written = 0;
op->error = 0;
@@ -78,6 +87,8 @@ static inline void bch2_write_op_init(struct bch_write_op *op, struct bch_fs *c,
op->write_point = (struct write_point_specifier) { 0 };
op->res = (struct disk_reservation) { 0 };
op->journal_seq = 0;
+ op->new_i_size = U64_MAX;
+ op->i_sectors_delta = 0;
op->index_update_fn = bch2_write_index_default;
}
diff --git a/fs/bcachefs/io_types.h b/fs/bcachefs/io_types.h
index 2d397e5e5b9e..c37b7d7401e9 100644
--- a/fs/bcachefs/io_types.h
+++ b/fs/bcachefs/io_types.h
@@ -93,7 +93,7 @@ struct bch_write_bio {
struct bch_write_op {
struct closure cl;
struct bch_fs *c;
- struct workqueue_struct *io_wq;
+ void (*end_io)(struct bch_write_op *);
u64 start_time;
unsigned written; /* sectors */
@@ -109,7 +109,6 @@ struct bch_write_op {
struct bch_devs_list devs_have;
u16 target;
u16 nonce;
-
struct bch_io_opts opts;
struct bpos pos;
@@ -132,6 +131,8 @@ struct bch_write_op {
u64 *journal_seq_p;
u64 journal_seq;
};
+ u64 new_i_size;
+ s64 i_sectors_delta;
int (*index_update_fn)(struct bch_write_op *);
diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c
index 5c3e146e3942..9f03a479c9a2 100644
--- a/fs/bcachefs/journal.c
+++ b/fs/bcachefs/journal.c
@@ -945,7 +945,7 @@ static bool bch2_journal_writing_to_device(struct journal *j, unsigned dev_idx)
w = j->buf + !state.idx;
ret = state.prev_buf_unwritten &&
- bch2_extent_has_device(bkey_i_to_s_c_extent(&w->key), dev_idx);
+ bch2_bkey_has_device(bkey_i_to_s_c(&w->key), dev_idx);
spin_unlock(&j->lock);
return ret;
diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h
index ec5ba2b9ef42..ec61137df00a 100644
--- a/fs/bcachefs/journal.h
+++ b/fs/bcachefs/journal.h
@@ -269,7 +269,7 @@ static inline void bch2_journal_res_put(struct journal *j,
if (!res->ref)
return;
- lock_release(&j->res_map, 0, _RET_IP_);
+ lock_release(&j->res_map, 0, _THIS_IP_);
while (res->u64s)
bch2_journal_add_entry(j, res,
diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c
index 387377dadab5..7112a25d0600 100644
--- a/fs/bcachefs/journal_io.c
+++ b/fs/bcachefs/journal_io.c
@@ -1100,7 +1100,7 @@ void bch2_journal_write(struct closure *cl)
for_each_rw_member(ca, c, i)
if (journal_flushes_device(ca) &&
- !bch2_extent_has_device(bkey_i_to_s_c_extent(&w->key), i)) {
+ !bch2_bkey_has_device(bkey_i_to_s_c(&w->key), i)) {
percpu_ref_get(&ca->io_ref);
bio = ca->journal.bio;
diff --git a/fs/bcachefs/migrate.c b/fs/bcachefs/migrate.c
index dc3b03d6e627..4b59dcd04cce 100644
--- a/fs/bcachefs/migrate.c
+++ b/fs/bcachefs/migrate.c
@@ -4,6 +4,7 @@
*/
#include "bcachefs.h"
+#include "bkey_on_stack.h"
#include "btree_update.h"
#include "btree_update_interior.h"
#include "buckets.h"
@@ -40,9 +41,10 @@ static int __bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags
struct btree_trans trans;
struct btree_iter *iter;
struct bkey_s_c k;
- BKEY_PADDED(key) tmp;
+ struct bkey_on_stack sk;
int ret = 0;
+ bkey_on_stack_init(&sk);
bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
iter = bch2_trans_get_iter(&trans, btree_id, POS_MIN,
@@ -58,9 +60,9 @@ static int __bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags
continue;
}
- bkey_reassemble(&tmp.key, k);
+ bkey_on_stack_reassemble(&sk, c, k);
- ret = drop_dev_ptrs(c, bkey_i_to_s(&tmp.key),
+ ret = drop_dev_ptrs(c, bkey_i_to_s(sk.k),
dev_idx, flags, false);
if (ret)
break;
@@ -70,12 +72,11 @@ static int __bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags
* will do the appropriate thing with it (turning it into a
* KEY_TYPE_error key, or just a discard if it was a cached extent)
*/
- bch2_extent_normalize(c, bkey_i_to_s(&tmp.key));
+ bch2_extent_normalize(c, bkey_i_to_s(sk.k));
- /* XXX not sketchy at all */
- iter->pos = bkey_start_pos(&tmp.key.k);
+ bch2_btree_iter_set_pos(iter, bkey_start_pos(&sk.k->k));
- bch2_trans_update(&trans, BTREE_INSERT_ENTRY(iter, &tmp.key));
+ bch2_trans_update(&trans, iter, sk.k);
ret = bch2_trans_commit(&trans, NULL, NULL,
BTREE_INSERT_ATOMIC|
@@ -93,6 +94,7 @@ static int __bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags
}
ret = bch2_trans_exit(&trans) ?: ret;
+ bkey_on_stack_exit(&sk, c);
BUG_ON(ret == -EINTR);
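migrate.c is one of several files in this commit converted from a fixed-size BKEY_PADDED() stack buffer to the new bkey_on_stack helper, which (as used here) owns a buffer that can be grown to fit whatever key is being copied out of the btree before the transaction is unlocked. The recurring pattern, sketched from the call sites in this diff rather than from bkey_on_stack.h itself:

	struct bkey_on_stack sk;
	struct bkey_s_c k;

	bkey_on_stack_init(&sk);
	/* ... k obtained from a btree iterator ... */
	bkey_on_stack_reassemble(&sk, c, k);	/* size the buffer and copy k */
	k = bkey_i_to_s_c(sk.k);		/* k now points at the private copy */
	/* safe to unlock the transaction and keep using / modifying sk.k */
	bkey_on_stack_exit(&sk, c);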
diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c
index 70b2b686e0a8..fad3cc4d587c 100644
--- a/fs/bcachefs/move.c
+++ b/fs/bcachefs/move.c
@@ -2,6 +2,7 @@
#include "bcachefs.h"
#include "alloc_foreground.h"
+#include "bkey_on_stack.h"
#include "btree_gc.h"
#include "btree_update.h"
#include "btree_update_interior.h"
@@ -96,10 +97,11 @@ static int bch2_migrate_index_update(struct bch_write_op *op)
bkey_copy(&_new.k, bch2_keylist_front(keys));
new = bkey_i_to_extent(&_new.k);
+ bch2_cut_front(iter->pos, &new->k_i);
- bch2_cut_front(iter->pos, insert);
- bch2_cut_back(new->k.p, &insert->k);
- bch2_cut_back(insert->k.p, &new->k);
+ bch2_cut_front(iter->pos, insert);
+ bch2_cut_back(new->k.p, insert);
+ bch2_cut_back(insert->k.p, &new->k_i);
if (m->data_cmd == DATA_REWRITE)
bch2_bkey_drop_device(bkey_i_to_s(insert),
@@ -133,11 +135,11 @@ static int bch2_migrate_index_update(struct bch_write_op *op)
* If we're not fully overwriting @k, and it's compressed, we
* need a reservation for all the pointers in @insert
*/
- nr = bch2_bkey_nr_dirty_ptrs(bkey_i_to_s_c(insert)) -
+ nr = bch2_bkey_nr_ptrs_allocated(bkey_i_to_s_c(insert)) -
m->nr_ptrs_reserved;
if (insert->k.size < k.k->size &&
- bch2_extent_is_compressed(k) &&
+ bch2_bkey_sectors_compressed(k) &&
nr > 0) {
ret = bch2_disk_reservation_add(c, &op->res,
keylist_sectors(keys) * nr, 0);
@@ -148,8 +150,7 @@ static int bch2_migrate_index_update(struct bch_write_op *op)
goto next;
}
- bch2_trans_update(&trans,
- BTREE_INSERT_ENTRY(iter, insert));
+ bch2_trans_update(&trans, iter, insert);
ret = bch2_trans_commit(&trans, &op->res,
op_journal_seq(op),
@@ -169,8 +170,6 @@ next:
if (bch2_keylist_empty(keys))
goto out;
}
-
- bch2_cut_front(iter->pos, bch2_keylist_front(keys));
continue;
nomatch:
if (m->ctxt)
@@ -252,7 +251,7 @@ int bch2_migrate_write_init(struct bch_fs *c, struct migrate_write *m,
*/
#if 0
int nr = (int) io_opts.data_replicas -
- bch2_bkey_nr_dirty_ptrs(k);
+ bch2_bkey_nr_ptrs_allocated(k);
#endif
int nr = (int) io_opts.data_replicas;
@@ -302,12 +301,12 @@ static void move_free(struct closure *cl)
{
struct moving_io *io = container_of(cl, struct moving_io, cl);
struct moving_context *ctxt = io->write.ctxt;
+ struct bvec_iter_all iter;
struct bio_vec *bv;
- int i;
bch2_disk_reservation_put(io->write.op.c, &io->write.op.res);
- bio_for_each_segment_all(bv, &io->write.op.wbio.bio, i)
+ bio_for_each_segment_all(bv, &io->write.op.wbio.bio, iter)
if (bv->bv_page)
__free_page(bv->bv_page);
@@ -491,7 +490,7 @@ static int __bch2_move_data(struct bch_fs *c,
{
bool kthread = (current->flags & PF_KTHREAD) != 0;
struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts);
- BKEY_PADDED(k) tmp;
+ struct bkey_on_stack sk;
struct btree_trans trans;
struct btree_iter *iter;
struct bkey_s_c k;
@@ -500,6 +499,7 @@ static int __bch2_move_data(struct bch_fs *c,
u64 delay, cur_inum = U64_MAX;
int ret = 0, ret2;
+ bkey_on_stack_init(&sk);
bch2_trans_init(&trans, c, 0, 0);
stats->data_type = BCH_DATA_USER;
@@ -551,7 +551,8 @@ peek:
if (!bkey_extent_is_direct_data(k.k))
goto next_nondata;
- if (cur_inum != k.k->p.inode) {
+ if (btree_id == BTREE_ID_EXTENTS &&
+ cur_inum != k.k->p.inode) {
struct bch_inode_unpacked inode;
/* don't hold btree locks while looking up inode: */
@@ -578,8 +579,8 @@ peek:
}
/* unlock before doing IO: */
- bkey_reassemble(&tmp.k, k);
- k = bkey_i_to_s_c(&tmp.k);
+ bkey_on_stack_reassemble(&sk, c, k);
+ k = bkey_i_to_s_c(sk.k);
bch2_trans_unlock(&trans);
ret2 = bch2_move_extent(c, ctxt, wp, io_opts, btree_id, k,
@@ -598,7 +599,7 @@ peek:
if (rate)
bch2_ratelimit_increment(rate, k.k->size);
next:
- atomic64_add(k.k->size * bch2_bkey_nr_dirty_ptrs(k),
+ atomic64_add(k.k->size * bch2_bkey_nr_ptrs_allocated(k),
&stats->sectors_seen);
next_nondata:
bch2_btree_iter_next(iter);
@@ -606,6 +607,7 @@ next_nondata:
}
out:
ret = bch2_trans_exit(&trans) ?: ret;
+ bkey_on_stack_exit(&sk, c);
return ret;
}
diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c
index 710296044194..abdeef20fde9 100644
--- a/fs/bcachefs/movinggc.c
+++ b/fs/bcachefs/movinggc.c
@@ -107,10 +107,10 @@ static bool have_copygc_reserve(struct bch_dev *ca)
{
bool ret;
- spin_lock(&ca->freelist_lock);
+ spin_lock(&ca->fs->freelist_lock);
ret = fifo_full(&ca->free[RESERVE_MOVINGGC]) ||
ca->allocator_state != ALLOCATOR_RUNNING;
- spin_unlock(&ca->freelist_lock);
+ spin_unlock(&ca->fs->freelist_lock);
return ret;
}
diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h
index 97a782f44f6e..0ec0999a6214 100644
--- a/fs/bcachefs/opts.h
+++ b/fs/bcachefs/opts.h
@@ -68,6 +68,12 @@ enum opt_type {
* - helptext
*/
+#ifdef __KERNEL__
+#define RATELIMIT_ERRORS true
+#else
+#define RATELIMIT_ERRORS false
+#endif
+
#define BCH_OPTS() \
x(block_size, u16, \
OPT_FORMAT, \
@@ -127,7 +133,7 @@ enum opt_type {
x(str_hash, u8, \
OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
OPT_STR(bch2_str_hash_types), \
- BCH_SB_STR_HASH_TYPE, BCH_STR_HASH_SIPHASH, \
+ BCH_SB_STR_HASH_TYPE, BCH_STR_HASH_OPT_SIPHASH, \
NULL, "Hash function for directory entries and xattrs")\
x(foreground_target, u16, \
OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE, \
@@ -227,6 +233,11 @@ enum opt_type {
OPT_BOOL(), \
NO_SB_OPT, false, \
NULL, "Fix errors during fsck without asking") \
+ x(ratelimit_errors, u8, \
+ OPT_MOUNT, \
+ OPT_BOOL(), \
+ NO_SB_OPT, RATELIMIT_ERRORS, \
+ NULL, "Ratelimit error messages during fsck") \
x(nochanges, u8, \
OPT_MOUNT, \
OPT_BOOL(), \
@@ -289,13 +300,7 @@ enum opt_type {
OPT_UINT(0, BCH_REPLICAS_MAX), \
NO_SB_OPT, 1, \
"n", "Data written to this device will be considered\n"\
- "to have already been replicated n times") \
- x(new_inode_updates, u8, \
- OPT_MOUNT, \
- OPT_BOOL(), \
- NO_SB_OPT, false, \
- NULL, "Enable new btree write-cache for inode updates")
-
+ "to have already been replicated n times")
struct bch_opts {
#define x(_name, _bits, ...) unsigned _name##_defined:1;
diff --git a/fs/bcachefs/quota.c b/fs/bcachefs/quota.c
index f0da0fac09bf..0fa6f33c049b 100644
--- a/fs/bcachefs/quota.c
+++ b/fs/bcachefs/quota.c
@@ -752,7 +752,7 @@ static int bch2_set_quota(struct super_block *sb, struct kqid qid,
if (qdq->d_fieldmask & QC_INO_HARD)
new_quota.v.c[Q_INO].hardlimit = cpu_to_le64(qdq->d_ino_hardlimit);
- bch2_trans_update(&trans, BTREE_INSERT_ENTRY(iter, &new_quota.k_i));
+ bch2_trans_update(&trans, iter, &new_quota.k_i);
ret = bch2_trans_commit(&trans, NULL, NULL, 0);
diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c
index 98d9a1432e50..d4002b7fc917 100644
--- a/fs/bcachefs/recovery.c
+++ b/fs/bcachefs/recovery.c
@@ -10,6 +10,7 @@
#include "dirent.h"
#include "ec.h"
#include "error.h"
+#include "fs-common.h"
#include "fsck.h"
#include "journal_io.h"
#include "journal_reclaim.h"
@@ -176,7 +177,7 @@ static struct journal_keys journal_keys_sort(struct list_head *journal_entries)
if ((cmp_int(i[0].journal_seq, i[1].journal_seq) ?:
cmp_int(i[0].journal_offset, i[1].journal_offset)) < 0) {
if (bkey_cmp(i[0].k->k.p, i[1].k->k.p) <= 0) {
- bch2_cut_back(bkey_start_pos(&i[1].k->k), &i[0].k->k);
+ bch2_cut_back(bkey_start_pos(&i[1].k->k), i[0].k);
} else {
struct bkey_i *split =
kmalloc(bkey_bytes(i[0].k), GFP_KERNEL);
@@ -185,7 +186,7 @@ static struct journal_keys journal_keys_sort(struct list_head *journal_entries)
goto err;
bkey_copy(split, i[0].k);
- bch2_cut_back(bkey_start_pos(&i[1].k->k), &split->k);
+ bch2_cut_back(bkey_start_pos(&i[1].k->k), split);
keys_deduped.d[keys_deduped.nr++] = (struct journal_key) {
.btree_id = i[0].btree_id,
.allocated = true,
@@ -253,7 +254,7 @@ static int bch2_extent_replay_key(struct bch_fs *c, enum btree_id btree_id,
* Some extents aren't equivalent - w.r.t. what the triggers do
* - if they're split:
*/
- bool remark_if_split = bch2_extent_is_compressed(bkey_i_to_s_c(k)) ||
+ bool remark_if_split = bch2_bkey_sectors_compressed(bkey_i_to_s_c(k)) ||
k->k.type == KEY_TYPE_reflink_p;
bool remark = false;
int ret;
@@ -271,6 +272,8 @@ retry:
if (ret)
goto err;
+ atomic_end = bpos_min(k->k.p, iter->l[0].b->key.k.p);
+
split_iter = bch2_trans_copy_iter(&trans, iter);
ret = PTR_ERR_OR_ZERO(split_iter);
if (ret)
@@ -281,16 +284,12 @@ retry:
if (ret)
goto err;
- ret = bch2_extent_atomic_end(split_iter, k, &atomic_end);
- if (ret)
- goto err;
-
if (!remark &&
remark_if_split &&
bkey_cmp(atomic_end, k->k.p) < 0) {
ret = bch2_disk_reservation_add(c, &disk_res,
k->k.size *
- bch2_bkey_nr_dirty_ptrs(bkey_i_to_s_c(k)),
+ bch2_bkey_nr_ptrs_allocated(bkey_i_to_s_c(k)),
BCH_DISK_RESERVATION_NOFAIL);
BUG_ON(ret);
@@ -299,9 +298,9 @@ retry:
bkey_copy(split, k);
bch2_cut_front(split_iter->pos, split);
- bch2_cut_back(atomic_end, &split->k);
+ bch2_cut_back(atomic_end, split);
- bch2_trans_update(&trans, BTREE_INSERT_ENTRY(split_iter, split));
+ bch2_trans_update(&trans, split_iter, split);
bch2_btree_iter_set_pos(iter, split->k.p);
} while (bkey_cmp(iter->pos, k->k.p) < 0);
@@ -865,6 +864,8 @@ int bch2_fs_recovery(struct bch_fs *c)
goto err;
}
bch_verbose(c, "alloc write done");
+
+ set_bit(BCH_FS_ALLOC_WRITTEN, &c->flags);
}
if (!c->sb.clean) {
@@ -912,6 +913,12 @@ int bch2_fs_recovery(struct bch_fs *c)
write_sb = true;
}
+ if (!(c->sb.features & (1ULL << BCH_FEATURE_INLINE_DATA))) {
+ c->disk_sb.sb->features[0] |=
+ cpu_to_le64(1ULL << BCH_FEATURE_INLINE_DATA);
+ write_sb = true;
+ }
+
if (!test_bit(BCH_FS_ERROR, &c->flags)) {
c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_FEAT_ALLOC_INFO;
write_sb = true;
@@ -952,7 +959,6 @@ int bch2_fs_initialize(struct bch_fs *c)
{
struct bch_inode_unpacked root_inode, lostfound_inode;
struct bkey_inode_buf packed_inode;
- struct bch_hash_info root_hash_info;
struct qstr lostfound = QSTR("lost+found");
const char *err = "cannot allocate memory";
struct bch_dev *ca;
@@ -997,7 +1003,6 @@ int bch2_fs_initialize(struct bch_fs *c)
bch2_inode_init(c, &root_inode, 0, 0,
S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO, 0, NULL);
root_inode.bi_inum = BCACHEFS_ROOT_INO;
- root_inode.bi_nlink++; /* lost+found */
bch2_inode_pack(&packed_inode, &root_inode);
err = "error creating root directory";
@@ -1007,24 +1012,15 @@ int bch2_fs_initialize(struct bch_fs *c)
if (ret)
goto err;
- bch2_inode_init(c, &lostfound_inode, 0, 0,
- S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO, 0,
- &root_inode);
- lostfound_inode.bi_inum = BCACHEFS_ROOT_INO + 1;
- bch2_inode_pack(&packed_inode, &lostfound_inode);
+ bch2_inode_init_early(c, &lostfound_inode);
err = "error creating lost+found";
- ret = bch2_btree_insert(c, BTREE_ID_INODES,
- &packed_inode.inode.k_i,
- NULL, NULL, 0);
- if (ret)
- goto err;
-
- root_hash_info = bch2_hash_info_init(c, &root_inode);
-
- ret = bch2_dirent_create(c, BCACHEFS_ROOT_INO, &root_hash_info, DT_DIR,
- &lostfound, lostfound_inode.bi_inum, NULL,
- BTREE_INSERT_NOFAIL);
+ ret = bch2_trans_do(c, NULL, BTREE_INSERT_ATOMIC,
+ bch2_create_trans(&trans, BCACHEFS_ROOT_INO,
+ &root_inode, &lostfound_inode,
+ &lostfound,
+ 0, 0, S_IFDIR|0700, 0,
+ NULL, NULL));
if (ret)
goto err;
diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c
index dcca9c1d0f47..2812fa305c0e 100644
--- a/fs/bcachefs/reflink.c
+++ b/fs/bcachefs/reflink.c
@@ -1,9 +1,10 @@
// SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h"
+#include "bkey_on_stack.h"
#include "btree_update.h"
#include "extents.h"
-#include "fs.h"
-#include "fs-io.h"
+#include "inode.h"
+#include "io.h"
#include "reflink.h"
#include <linux/sched/signal.h>
@@ -39,7 +40,7 @@ enum merge_result bch2_reflink_p_merge(struct bch_fs *c,
if ((u64) l.k->size + r.k->size > KEY_SIZE_MAX) {
bch2_key_resize(l.k, KEY_SIZE_MAX);
- __bch2_cut_front(l.k->p, _r);
+ bch2_cut_front_s(l.k->p, _r);
return BCH_MERGE_PARTIAL;
}
@@ -70,12 +71,6 @@ void bch2_reflink_v_to_text(struct printbuf *out, struct bch_fs *c,
bch2_bkey_ptrs_to_text(out, c, k);
}
-/*
- * bch2_remap_range() depends on bch2_extent_update(), which depends on various
- * things tied to the linux vfs for inode updates, for now:
- */
-#ifndef NO_BCACHEFS_FS
-
static int bch2_make_extent_indirect(struct btree_trans *trans,
struct btree_iter *extent_iter,
struct bkey_i_extent *e)
@@ -120,7 +115,7 @@ static int bch2_make_extent_indirect(struct btree_trans *trans,
r_v->v.refcount = 0;
memcpy(r_v->v.start, e->v.start, bkey_val_bytes(&e->k));
- bch2_trans_update(trans, BTREE_INSERT_ENTRY(reflink_iter, &r_v->k_i));
+ bch2_trans_update(trans, reflink_iter, &r_v->k_i);
r_p = bch2_trans_kmalloc(trans, sizeof(*r_p));
if (IS_ERR(r_p))
@@ -131,7 +126,7 @@ static int bch2_make_extent_indirect(struct btree_trans *trans,
set_bkey_val_bytes(&r_p->k, sizeof(r_p->v));
r_p->v.idx = cpu_to_le64(bkey_start_offset(&r_v->k));
- bch2_trans_update(trans, BTREE_INSERT_ENTRY(extent_iter, &r_p->k_i));
+ bch2_trans_update(trans, extent_iter, &r_p->k_i);
err:
if (!IS_ERR(reflink_iter)) {
c->reflink_hint = reflink_iter->pos.offset;
@@ -144,35 +139,37 @@ err:
static struct bkey_s_c get_next_src(struct btree_iter *iter, struct bpos end)
{
struct bkey_s_c k = bch2_btree_iter_peek(iter);
+ int ret;
- while (1) {
- if (bkey_err(k))
- return k;
-
+ for_each_btree_key_continue(iter, 0, k, ret) {
if (bkey_cmp(iter->pos, end) >= 0)
return bkey_s_c_null;
if (k.k->type == KEY_TYPE_extent ||
k.k->type == KEY_TYPE_reflink_p)
- return k;
-
- k = bch2_btree_iter_next(iter);
+ break;
}
+
+ return k;
}
s64 bch2_remap_range(struct bch_fs *c,
- struct bch_inode_info *dst_inode,
struct bpos dst_start, struct bpos src_start,
- u64 remap_sectors, u64 new_i_size)
+ u64 remap_sectors, u64 *journal_seq,
+ u64 new_i_size, s64 *i_sectors_delta)
{
struct btree_trans trans;
struct btree_iter *dst_iter, *src_iter;
struct bkey_s_c src_k;
- BKEY_PADDED(k) new_dst, new_src;
+ BKEY_PADDED(k) new_dst;
+ struct bkey_on_stack new_src;
struct bpos dst_end = dst_start, src_end = src_start;
struct bpos dst_want, src_want;
u64 src_done, dst_done;
- int ret = 0;
+ int ret = 0, ret2 = 0;
+
+ if (!percpu_ref_tryget(&c->writes))
+ return -EROFS;
if (!(c->sb.features & (1ULL << BCH_FEATURE_REFLINK))) {
mutex_lock(&c->sb_lock);
@@ -188,12 +185,13 @@ s64 bch2_remap_range(struct bch_fs *c,
dst_end.offset += remap_sectors;
src_end.offset += remap_sectors;
+ bkey_on_stack_init(&new_src);
bch2_trans_init(&trans, c, BTREE_ITER_MAX, 4096);
- src_iter = __bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, src_start,
- BTREE_ITER_INTENT, 1);
- dst_iter = __bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, dst_start,
- BTREE_ITER_INTENT, 2);
+ src_iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, src_start,
+ BTREE_ITER_INTENT);
+ dst_iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, dst_start,
+ BTREE_ITER_INTENT);
while (1) {
bch2_trans_begin_updates(&trans);
@@ -215,7 +213,7 @@ s64 bch2_remap_range(struct bch_fs *c,
if (bkey_cmp(dst_iter->pos, dst_want) < 0) {
ret = bch2_fpunch_at(&trans, dst_iter, dst_want,
- dst_inode, new_i_size);
+ journal_seq, i_sectors_delta);
if (ret)
goto btree_err;
continue;
@@ -227,14 +225,14 @@ s64 bch2_remap_range(struct bch_fs *c,
break;
if (src_k.k->type == KEY_TYPE_extent) {
- bkey_reassemble(&new_src.k, src_k);
- src_k = bkey_i_to_s_c(&new_src.k);
+ bkey_on_stack_reassemble(&new_src, c, src_k);
+ src_k = bkey_i_to_s_c(new_src.k);
- bch2_cut_front(src_iter->pos, &new_src.k);
- bch2_cut_back(src_end, &new_src.k.k);
+ bch2_cut_front(src_iter->pos, new_src.k);
+ bch2_cut_back(src_end, new_src.k);
ret = bch2_make_extent_indirect(&trans, src_iter,
- bkey_i_to_extent(&new_src.k));
+ bkey_i_to_extent(new_src.k));
if (ret)
goto btree_err;
@@ -261,9 +259,9 @@ s64 bch2_remap_range(struct bch_fs *c,
min(src_k.k->p.offset - src_iter->pos.offset,
dst_end.offset - dst_iter->pos.offset));
- ret = bch2_extent_update(&trans, dst_inode, NULL, NULL,
- dst_iter, &new_dst.k,
- new_i_size, false, true, NULL);
+ ret = bch2_extent_update(&trans, dst_iter, &new_dst.k,
+ NULL, journal_seq,
+ new_i_size, i_sectors_delta);
if (ret)
goto btree_err;
@@ -284,17 +282,29 @@ err:
dst_done = dst_iter->pos.offset - dst_start.offset;
new_i_size = min(dst_iter->pos.offset << 9, new_i_size);
+ bch2_trans_begin(&trans);
+
+ do {
+ struct bch_inode_unpacked inode_u;
+ struct btree_iter *inode_iter;
+
+ inode_iter = bch2_inode_peek(&trans, &inode_u,
+ dst_start.inode, BTREE_ITER_INTENT);
+ ret2 = PTR_ERR_OR_ZERO(inode_iter);
+
+ if (!ret2 &&
+ inode_u.bi_size < new_i_size) {
+ inode_u.bi_size = new_i_size;
+ ret2 = bch2_inode_write(&trans, inode_iter, &inode_u) ?:
+ bch2_trans_commit(&trans, NULL, journal_seq,
+ BTREE_INSERT_ATOMIC);
+ }
+ } while (ret2 == -EINTR);
+
ret = bch2_trans_exit(&trans) ?: ret;
+ bkey_on_stack_exit(&new_src, c);
- mutex_lock(&dst_inode->ei_update_lock);
- if (dst_inode->v.i_size < new_i_size) {
- i_size_write(&dst_inode->v, new_i_size);
- ret = bch2_write_inode_size(c, dst_inode, new_i_size,
- ATTR_MTIME|ATTR_CTIME);
- }
- mutex_unlock(&dst_inode->ei_update_lock);
+ percpu_ref_put(&c->writes);
- return dst_done ?: ret;
+ return dst_done ?: ret ?: ret2;
}
-
-#endif /* NO_BCACHEFS_FS */
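
With the VFS inode gone from the interface, bch2_remap_range() now takes a ref on c->writes at entry, drops it here, and extends the destination i_size through a raw btree transaction: peek the inode, bump bi_size only if it grew, commit, and retry on -EINTR (a transaction restart in this code base). A condensed sketch of that retry pattern with the intent spelled out in comments (same identifiers as the hunk above):

	do {
		struct bch_inode_unpacked inode_u;
		struct btree_iter *inode_iter;

		inode_iter = bch2_inode_peek(&trans, &inode_u,
					     dst_start.inode, BTREE_ITER_INTENT);
		ret2 = PTR_ERR_OR_ZERO(inode_iter);

		/* only ever grow i_size here, never shrink it */
		if (!ret2 && inode_u.bi_size < new_i_size) {
			inode_u.bi_size = new_i_size;
			ret2 = bch2_inode_write(&trans, inode_iter, &inode_u) ?:
			       bch2_trans_commit(&trans, NULL, journal_seq,
						 BTREE_INSERT_ATOMIC);
		}
	} while (ret2 == -EINTR);	/* transaction restarted, try again */
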
diff --git a/fs/bcachefs/reflink.h b/fs/bcachefs/reflink.h
index 327618c36d33..ac23b855858c 100644
--- a/fs/bcachefs/reflink.h
+++ b/fs/bcachefs/reflink.h
@@ -24,9 +24,7 @@ void bch2_reflink_v_to_text(struct printbuf *, struct bch_fs *,
.val_to_text = bch2_reflink_v_to_text, \
}
-#ifndef NO_BCACHEFS_FS
-s64 bch2_remap_range(struct bch_fs *, struct bch_inode_info *,
- struct bpos, struct bpos, u64, u64);
-#endif /* NO_BCACHEFS_FS */
+s64 bch2_remap_range(struct bch_fs *, struct bpos, struct bpos,
+ u64, u64 *, u64, s64 *);
#endif /* _BCACHEFS_REFLINK_H */
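
The new prototype drops the VFS inode and reports effects through out-parameters instead. A hypothetical call site (variable names are illustrative; only the parameter order comes from the prototype above):

	u64 journal_seq = 0;
	s64 i_sectors_delta = 0;

	/* remap remap_sectors sectors from src_start to dst_start; returns
	 * the number of sectors actually remapped, or a negative error */
	s64 done = bch2_remap_range(c, dst_start, src_start,
				    remap_sectors, &journal_seq,
				    new_i_size, &i_sectors_delta);
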
diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c
index bb9da2bb5a92..cb5ebb87c701 100644
--- a/fs/bcachefs/replicas.c
+++ b/fs/bcachefs/replicas.c
@@ -84,10 +84,8 @@ static void extent_to_replicas(struct bkey_s_c k,
if (p.ptr.cached)
continue;
- if (p.ec_nr) {
+ if (p.has_ec)
r->nr_required = 0;
- break;
- }
r->devs[r->nr_devs++] = p.ptr.dev;
}
diff --git a/fs/bcachefs/str_hash.h b/fs/bcachefs/str_hash.h
index 091bf7a89577..582e718b6bd1 100644
--- a/fs/bcachefs/str_hash.h
+++ b/fs/bcachefs/str_hash.h
@@ -14,6 +14,23 @@
#include <crypto/hash.h>
#include <crypto/sha.h>
+static inline enum bch_str_hash_type
+bch2_str_hash_opt_to_type(struct bch_fs *c, enum bch_str_hash_opts opt)
+{
+ switch (opt) {
+ case BCH_STR_HASH_OPT_CRC32C:
+ return BCH_STR_HASH_CRC32C;
+ case BCH_STR_HASH_OPT_CRC64:
+ return BCH_STR_HASH_CRC64;
+ case BCH_STR_HASH_OPT_SIPHASH:
+ return c->sb.features & (1ULL << BCH_FEATURE_NEW_SIPHASH)
+ ? BCH_STR_HASH_SIPHASH
+ : BCH_STR_HASH_SIPHASH_OLD;
+ default:
+ BUG();
+ }
+}
+
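
The new helper maps the filesystem's string-hash option onto the on-disk hash type, gating the fixed siphash behind BCH_FEATURE_NEW_SIPHASH so existing filesystems keep the old key derivation. A hypothetical call site (c->opts.str_hash is assumed here; it is not shown in this hunk):

	/* choose the on-disk hash type for a newly created inode */
	enum bch_str_hash_type type =
		bch2_str_hash_opt_to_type(c, c->opts.str_hash);
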
struct bch_hash_info {
u8 type;
union {
@@ -23,34 +40,24 @@ struct bch_hash_info {
};
static inline struct bch_hash_info
-bch2_hash_info_init(struct bch_fs *c,
- const struct bch_inode_unpacked *bi)
+bch2_hash_info_init(struct bch_fs *c, const struct bch_inode_unpacked *bi)
{
/* XXX ick */
struct bch_hash_info info = {
.type = (bi->bi_flags >> INODE_STR_HASH_OFFSET) &
- ~(~0U << INODE_STR_HASH_BITS)
+ ~(~0U << INODE_STR_HASH_BITS),
+ .crc_key = bi->bi_hash_seed,
};
- switch (info.type) {
- case BCH_STR_HASH_CRC32C:
- case BCH_STR_HASH_CRC64:
- info.crc_key = bi->bi_hash_seed;
- break;
- case BCH_STR_HASH_SIPHASH: {
+ if (unlikely(info.type == BCH_STR_HASH_SIPHASH_OLD)) {
SHASH_DESC_ON_STACK(desc, c->sha256);
u8 digest[SHA256_DIGEST_SIZE];
desc->tfm = c->sha256;
- desc->flags = 0;
crypto_shash_digest(desc, (void *) &bi->bi_hash_seed,
sizeof(bi->bi_hash_seed), digest);
memcpy(&info.siphash_key, digest, sizeof(info.siphash_key));
- break;
- }
- default:
- BUG();
}
return info;
@@ -74,6 +81,7 @@ static inline void bch2_str_hash_init(struct bch_str_hash_ctx *ctx,
case BCH_STR_HASH_CRC64:
ctx->crc64 = crc64_be(~0, &info->crc_key, sizeof(info->crc_key));
break;
+ case BCH_STR_HASH_SIPHASH_OLD:
case BCH_STR_HASH_SIPHASH:
SipHash24_Init(&ctx->siphash, &info->siphash_key);
break;
@@ -93,6 +101,7 @@ static inline void bch2_str_hash_update(struct bch_str_hash_ctx *ctx,
case BCH_STR_HASH_CRC64:
ctx->crc64 = crc64_be(ctx->crc64, data, len);
break;
+ case BCH_STR_HASH_SIPHASH_OLD:
case BCH_STR_HASH_SIPHASH:
SipHash24_Update(&ctx->siphash, data, len);
break;
@@ -109,6 +118,7 @@ static inline u64 bch2_str_hash_end(struct bch_str_hash_ctx *ctx,
return ctx->crc32c;
case BCH_STR_HASH_CRC64:
return ctx->crc64 >> 1;
+ case BCH_STR_HASH_SIPHASH_OLD:
case BCH_STR_HASH_SIPHASH:
return SipHash24_End(&ctx->siphash) >> 1;
default:
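
All three switch statements gain a BCH_STR_HASH_SIPHASH_OLD case that shares the siphash code path, so the old and new variants differ only in how bch2_hash_info_init() derives the key (the old variant runs bi_hash_seed through SHA-256 first, per the hunk above). For reference, the init/update/end triplet is used together like this; a sketch of the common pattern, with the argument order inferred from the bodies shown, and the dirent-name example assumed rather than taken from this patch:

	u64 hash;
	struct bch_str_hash_ctx ctx;

	bch2_str_hash_init(&ctx, info);			/* seed from crc_key or siphash_key */
	bch2_str_hash_update(&ctx, info, name, len);	/* feed the bytes being hashed */
	hash = bch2_str_hash_end(&ctx, info);		/* finalize as shown above */
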
@@ -188,6 +198,7 @@ int bch2_hash_needs_whiteout(struct btree_trans *trans,
{
struct btree_iter *iter;
struct bkey_s_c k;
+ int ret;
iter = bch2_trans_copy_iter(trans, start);
if (IS_ERR(iter))
@@ -195,19 +206,21 @@ int bch2_hash_needs_whiteout(struct btree_trans *trans,
bch2_btree_iter_next_slot(iter);
- for_each_btree_key_continue(iter, BTREE_ITER_SLOTS, k) {
+ for_each_btree_key_continue(iter, BTREE_ITER_SLOTS, k, ret) {
if (k.k->type != desc.key_type &&
k.k->type != KEY_TYPE_whiteout)
break;
if (k.k->type == desc.key_type &&
desc.hash_bkey(info, k) <= start->pos.offset) {
- bch2_trans_iter_free_on_commit(trans, iter);
- return 1;
+ iter->flags |= BTREE_ITER_KEEP_UNTIL_COMMIT;
+ ret = 1;
+ break;
}
}
- return bch2_trans_iter_free(trans, iter);
+ bch2_trans_iter_put(trans, iter);
+ return ret;
}
static __always_inline
@@ -246,11 +259,15 @@ int bch2_hash_set(struct btree_trans *trans,
goto not_found;
}
- if (slot)
- bch2_trans_iter_free(trans, slot);
- bch2_trans_iter_free(trans, iter);
+ if (!ret)
+ ret = -ENOSPC;
+out:
+ if (!IS_ERR_OR_NULL(slot))
+ bch2_trans_iter_put(trans, slot);
+ if (!IS_ERR_OR_NULL(iter))
+ bch2_trans_iter_put(trans, iter);
- return ret ?: -ENOSPC;
+ return ret;
found:
found = true;
not_found:
@@ -260,17 +277,14 @@ not_found:
} else if (found && (flags & BCH_HASH_SET_MUST_CREATE)) {
ret = -EEXIST;
} else {
- if (!found && slot) {
- bch2_trans_iter_free(trans, iter);
- iter = slot;
- }
+ if (!found && slot)
+ swap(iter, slot);
insert->k.p = iter->pos;
- bch2_trans_update(trans, BTREE_INSERT_ENTRY(iter, insert));
- bch2_trans_iter_free_on_commit(trans, iter);
+ bch2_trans_update(trans, iter, insert);
}
- return ret;
+ goto out;
}
static __always_inline
@@ -294,7 +308,7 @@ int bch2_hash_delete_at(struct btree_trans *trans,
delete->k.p = iter->pos;
delete->k.type = ret ? KEY_TYPE_whiteout : KEY_TYPE_deleted;
- bch2_trans_update(trans, BTREE_INSERT_ENTRY(iter, delete));
+ bch2_trans_update(trans, iter, delete);
return 0;
}
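
The hash code also switches iterator cleanup from bch2_trans_iter_free()/bch2_trans_iter_free_on_commit() to bch2_trans_iter_put(), with BTREE_ITER_KEEP_UNTIL_COMMIT marking an iterator whose position must survive until the transaction commits. A condensed sketch of the new exit convention in bch2_hash_set(), identifiers from the hunks above ("put" presumably hands the iterator back to the transaction for reuse rather than freeing it):

	if (!ret)
		ret = -ENOSPC;			/* no free slot found */
out:
	if (!IS_ERR_OR_NULL(slot))
		bch2_trans_iter_put(trans, slot);
	if (!IS_ERR_OR_NULL(iter))
		bch2_trans_iter_put(trans, iter);

	return ret;
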
diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c
index 3043def884ab..b36cfdf0b41c 100644
--- a/fs/bcachefs/super-io.c
+++ b/fs/bcachefs/super-io.c
@@ -949,6 +949,25 @@ int bch2_fs_mark_dirty(struct bch_fs *c)
return ret;
}
+static void
+entry_init_u64s(struct jset_entry *entry, unsigned u64s)
+{
+ memset(entry, 0, u64s * sizeof(u64));
+
+ /*
+ * The u64s field counts from the start of data, ignoring the shared
+ * fields.
+ */
+ entry->u64s = u64s - 1;
+}
+
+static void
+entry_init_size(struct jset_entry *entry, size_t size)
+{
+ unsigned u64s = DIV_ROUND_UP(size, sizeof(u64));
+ entry_init_u64s(entry, u64s);
+}
+
struct jset_entry *
bch2_journal_super_entries_add_common(struct bch_fs *c,
struct jset_entry *entry,
@@ -963,7 +982,7 @@ bch2_journal_super_entries_add_common(struct bch_fs *c,
r < c->btree_roots + BTREE_ID_NR;
r++)
if (r->alive) {
- entry->u64s = r->key.u64s;
+ entry_init_u64s(entry, r->key.u64s + 1);
entry->btree_id = r - c->btree_roots;
entry->level = r->level;
entry->type = BCH_JSET_ENTRY_btree_root;
@@ -988,8 +1007,7 @@ bch2_journal_super_entries_add_common(struct bch_fs *c,
struct jset_entry_usage *u =
container_of(entry, struct jset_entry_usage, entry);
- memset(u, 0, sizeof(*u));
- u->entry.u64s = DIV_ROUND_UP(sizeof(*u), sizeof(u64)) - 1;
+ entry_init_size(entry, sizeof(*u));
u->entry.type = BCH_JSET_ENTRY_usage;
u->entry.btree_id = FS_USAGE_INODES;
u->v = cpu_to_le64(c->usage_base->nr_inodes);
@@ -1001,8 +1019,7 @@ bch2_journal_super_entries_add_common(struct bch_fs *c,
struct jset_entry_usage *u =
container_of(entry, struct jset_entry_usage, entry);
- memset(u, 0, sizeof(*u));
- u->entry.u64s = DIV_ROUND_UP(sizeof(*u), sizeof(u64)) - 1;
+ entry_init_size(entry, sizeof(*u));
u->entry.type = BCH_JSET_ENTRY_usage;
u->entry.btree_id = FS_USAGE_KEY_VERSION;
u->v = cpu_to_le64(atomic64_read(&c->key_version));
@@ -1014,8 +1031,7 @@ bch2_journal_super_entries_add_common(struct bch_fs *c,
struct jset_entry_usage *u =
container_of(entry, struct jset_entry_usage, entry);
- memset(u, 0, sizeof(*u));
- u->entry.u64s = DIV_ROUND_UP(sizeof(*u), sizeof(u64)) - 1;
+ entry_init_size(entry, sizeof(*u));
u->entry.type = BCH_JSET_ENTRY_usage;
u->entry.btree_id = FS_USAGE_RESERVED;
u->entry.level = i;
@@ -1030,9 +1046,7 @@ bch2_journal_super_entries_add_common(struct bch_fs *c,
struct jset_entry_data_usage *u =
container_of(entry, struct jset_entry_data_usage, entry);
- memset(u, 0, sizeof(*u));
- u->entry.u64s = DIV_ROUND_UP(sizeof(*u) + e->nr_devs,
- sizeof(u64)) - 1;
+ entry_init_size(entry, sizeof(*u) + e->nr_devs);
u->entry.type = BCH_JSET_ENTRY_data_usage;
u->v = cpu_to_le64(c->usage_base->replicas[i]);
memcpy(&u->r, e, replicas_entry_bytes(e));
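
entry_init_u64s() centralizes the off-by-one convention: jset_entry::u64s counts only the payload words, not the header word, so callers pass the total size in u64s and the helper subtracts one. A worked example of what entry_init_size() produces, assuming the usual 8-byte struct jset_entry header followed by a single __le64 value (i.e. sizeof(struct jset_entry_usage) == 16):

	entry_init_size(entry, sizeof(struct jset_entry_usage));
	/* u64s = DIV_ROUND_UP(16, 8) = 2 total words,
	 * entry->u64s = 2 - 1 = 1 payload word (the le64 'v') */
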
diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c
index 4145832f4856..17bdf985559c 100644
--- a/fs/bcachefs/super.c
+++ b/fs/bcachefs/super.c
@@ -506,6 +506,7 @@ static void bch2_fs_free(struct bch_fs *c)
free_percpu(c->usage[0]);
kfree(c->usage_base);
free_percpu(c->pcpu);
+ mempool_exit(&c->large_bkey_pool);
mempool_exit(&c->btree_bounce_pool);
bioset_exit(&c->btree_bio);
mempool_exit(&c->btree_interior_update_pool);
@@ -758,6 +759,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
!(c->pcpu = alloc_percpu(struct bch_fs_pcpu)) ||
mempool_init_kvpmalloc_pool(&c->btree_bounce_pool, 1,
btree_bytes(c)) ||
+ mempool_init_kmalloc_pool(&c->large_bkey_pool, 1, 2048) ||
bch2_io_clock_init(&c->io_clock[READ]) ||
bch2_io_clock_init(&c->io_clock[WRITE]) ||
bch2_fs_journal_init(&c->journal) ||
@@ -1090,7 +1092,6 @@ static struct bch_dev *__bch2_dev_alloc(struct bch_fs *c,
writepoint_init(&ca->copygc_write_point, BCH_DATA_USER);
- spin_lock_init(&ca->freelist_lock);
bch2_dev_copygc_init(ca);
INIT_WORK(&ca->io_error_work, bch2_io_error_work);
diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c
index 27646c435e30..e7699afd99fc 100644
--- a/fs/bcachefs/sysfs.c
+++ b/fs/bcachefs/sysfs.c
@@ -775,7 +775,7 @@ static ssize_t show_reserve_stats(struct bch_dev *ca, char *buf)
struct printbuf out = _PBUF(buf, PAGE_SIZE);
enum alloc_reserve i;
- spin_lock(&ca->freelist_lock);
+ spin_lock(&ca->fs->freelist_lock);
pr_buf(&out, "free_inc:\t%zu\t%zu\n",
fifo_used(&ca->free_inc),
@@ -786,7 +786,7 @@ static ssize_t show_reserve_stats(struct bch_dev *ca, char *buf)
fifo_used(&ca->free[i]),
ca->free[i].size);
- spin_unlock(&ca->freelist_lock);
+ spin_unlock(&ca->fs->freelist_lock);
return out.pos - buf;
}
diff --git a/fs/bcachefs/tests.c b/fs/bcachefs/tests.c
index fe0b987902fb..724f41e6590c 100644
--- a/fs/bcachefs/tests.c
+++ b/fs/bcachefs/tests.c
@@ -43,7 +43,7 @@ static void test_delete(struct bch_fs *c, u64 nr)
ret = bch2_btree_iter_traverse(iter);
BUG_ON(ret);
- bch2_trans_update(&trans, BTREE_INSERT_ENTRY(iter, &k.k_i));
+ bch2_trans_update(&trans, iter, &k.k_i);
ret = bch2_trans_commit(&trans, NULL, NULL, 0);
BUG_ON(ret);
@@ -75,7 +75,7 @@ static void test_delete_written(struct bch_fs *c, u64 nr)
ret = bch2_btree_iter_traverse(iter);
BUG_ON(ret);
- bch2_trans_update(&trans, BTREE_INSERT_ENTRY(iter, &k.k_i));
+ bch2_trans_update(&trans, iter, &k.k_i);
ret = bch2_trans_commit(&trans, NULL, NULL, 0);
BUG_ON(ret);
@@ -465,7 +465,7 @@ static void rand_mixed(struct bch_fs *c, u64 nr)
bkey_cookie_init(&k.k_i);
k.k.p = iter->pos;
- bch2_trans_update(&trans, BTREE_INSERT_ENTRY(iter, &k.k_i));
+ bch2_trans_update(&trans, iter, &k.k_i);
ret = bch2_trans_commit(&trans, NULL, NULL, 0);
BUG_ON(ret);
}
@@ -509,7 +509,7 @@ static void seq_insert(struct bch_fs *c, u64 nr)
BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) {
insert.k.p = iter->pos;
- bch2_trans_update(&trans, BTREE_INSERT_ENTRY(iter, &insert.k_i));
+ bch2_trans_update(&trans, iter, &insert.k_i);
ret = bch2_trans_commit(&trans, NULL, NULL, 0);
BUG_ON(ret);
@@ -548,7 +548,7 @@ static void seq_overwrite(struct bch_fs *c, u64 nr)
bkey_reassemble(&u.k_i, k);
- bch2_trans_update(&trans, BTREE_INSERT_ENTRY(iter, &u.k_i));
+ bch2_trans_update(&trans, iter, &u.k_i);
ret = bch2_trans_commit(&trans, NULL, NULL, 0);
BUG_ON(ret);
}
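
Every call site in this patch moves from bch2_trans_update(&trans, BTREE_INSERT_ENTRY(iter, k)) to the three-argument form; the update-list entry is now built inside bch2_trans_update() itself. The resulting idiom, as it appears throughout these tests:

	bch2_trans_update(&trans, iter, &k.k_i);	/* queue the insert on this iterator */
	ret = bch2_trans_commit(&trans, NULL, NULL, 0);	/* no disk res, no journal seq, no flags */
	BUG_ON(ret);
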
diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c
index 2cc433ec0e3a..e69d03d1109f 100644
--- a/fs/bcachefs/util.c
+++ b/fs/bcachefs/util.c
@@ -550,7 +550,7 @@ size_t bch2_rand_range(size_t max)
return rand;
}
-void memcpy_to_bio(struct bio *dst, struct bvec_iter dst_iter, void *src)
+void memcpy_to_bio(struct bio *dst, struct bvec_iter dst_iter, const void *src)
{
struct bio_vec bv;
struct bvec_iter iter;
diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h
index 4b33a527494d..0128daba5970 100644
--- a/fs/bcachefs/util.h
+++ b/fs/bcachefs/util.h
@@ -547,9 +547,19 @@ do { \
size_t bch2_rand_range(size_t);
-void memcpy_to_bio(struct bio *, struct bvec_iter, void *);
+void memcpy_to_bio(struct bio *, struct bvec_iter, const void *);
void memcpy_from_bio(void *, struct bio *, struct bvec_iter);
+static inline void memcpy_u64s_small(void *dst, const void *src,
+ unsigned u64s)
+{
+ u64 *d = dst;
+ const u64 *s = src;
+
+ while (u64s--)
+ *d++ = *s++;
+}
+
static inline void __memcpy_u64s(void *dst, const void *src,
unsigned u64s)
{
@@ -591,6 +601,24 @@ static inline void memmove_u64s_down(void *dst, const void *src,
__memmove_u64s_down(dst, src, u64s);
}
+static inline void __memmove_u64s_up_small(void *_dst, const void *_src,
+ unsigned u64s)
+{
+ u64 *dst = (u64 *) _dst + u64s;
+ u64 *src = (u64 *) _src + u64s;
+
+ while (u64s--)
+ *--dst = *--src;
+}
+
+static inline void memmove_u64s_up_small(void *dst, const void *src,
+ unsigned u64s)
+{
+ EBUG_ON(dst < src);
+
+ __memmove_u64s_up_small(dst, src, u64s);
+}
+
static inline void __memmove_u64s_up(void *_dst, const void *_src,
unsigned u64s)
{
@@ -628,35 +656,14 @@ static inline void memmove_u64s(void *dst, const void *src,
__memmove_u64s_up(dst, src, u64s);
}
-static inline struct bio_vec next_contig_bvec(struct bio *bio,
- struct bvec_iter *iter)
+/* Set the last few bytes up to a u64 boundary given an offset into a buffer. */
+static inline void memset_u64s_tail(void *s, int c, unsigned bytes)
{
- struct bio_vec bv = bio_iter_iovec(bio, *iter);
-
- bio_advance_iter(bio, iter, bv.bv_len);
-#ifndef CONFIG_HIGHMEM
- while (iter->bi_size) {
- struct bio_vec next = bio_iter_iovec(bio, *iter);
-
- if (page_address(bv.bv_page) + bv.bv_offset + bv.bv_len !=
- page_address(next.bv_page) + next.bv_offset)
- break;
+ unsigned rem = round_up(bytes, sizeof(u64)) - bytes;
- bv.bv_len += next.bv_len;
- bio_advance_iter(bio, iter, next.bv_len);
- }
-#endif
- return bv;
+ memset(s + bytes, c, rem);
}
-#define __bio_for_each_contig_segment(bv, bio, iter, start) \
- for (iter = (start); \
- (iter).bi_size && \
- ((bv = next_contig_bvec((bio), &(iter))), 1);)
-
-#define bio_for_each_contig_segment(bv, bio, iter) \
- __bio_for_each_contig_segment(bv, bio, iter, (bio)->bi_iter)
-
void sort_cmp_size(void *base, size_t num, size_t size,
int (*cmp_func)(const void *, const void *, size_t),
void (*swap_func)(void *, void *, size_t));
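
The new u64-granularity helpers round out the existing memcpy_u64s()/memmove_u64s() family with simple open-coded loops for small counts, and memset_u64s_tail() fills the slack between a byte length and the next u64 boundary. A worked example of the tail computation, using only the definition above (the padding use case is illustrative, e.g. so whole-u64 comparisons see defined bytes):

	char buf[3 * sizeof(u64)];

	/* suppose the first 19 bytes of buf are valid; zero the rest of the
	 * final u64: rem = round_up(19, 8) - 19 = 5, so bytes 19..23 are set */
	memset_u64s_tail(buf, 0, 19);
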