author     Kent Overstreet <kent.overstreet@gmail.com>   2021-11-27 16:20:13 -0500
committer  Kent Overstreet <kent.overstreet@gmail.com>   2021-11-27 23:23:00 -0500
commit     8d547a161a2e3d94360fd4979270db7ca9794a2c (patch)
tree       f3c9ee9bbe2ccb9cb050a17cbc0d312ebf8dffb6
parent     5e2bedbf8ac4e178f8570cf75fdfa931c960556f (diff)

Merge with d9565fc1a8 bcachefs: Improve tracing of btree_path leaks

Signed-off-by: Kent Overstreet <kent.overstreet@gmail.com>
-rw-r--r--  fs/bcachefs/Kconfig | 1
-rw-r--r--  fs/bcachefs/Makefile | 1
-rw-r--r--  fs/bcachefs/acl.c | 97
-rw-r--r--  fs/bcachefs/acl.h | 15
-rw-r--r--  fs/bcachefs/alloc_background.c | 170
-rw-r--r--  fs/bcachefs/alloc_background.h | 28
-rw-r--r--  fs/bcachefs/bcachefs.h | 52
-rw-r--r--  fs/bcachefs/bcachefs_format.h | 187
-rw-r--r--  fs/bcachefs/bcachefs_ioctl.h | 15
-rw-r--r--  fs/bcachefs/bkey.c | 20
-rw-r--r--  fs/bcachefs/bkey.h | 33
-rw-r--r--  fs/bcachefs/bkey_methods.c | 62
-rw-r--r--  fs/bcachefs/bkey_methods.h | 29
-rw-r--r--  fs/bcachefs/bset.c | 6
-rw-r--r--  fs/bcachefs/btree_cache.c | 235
-rw-r--r--  fs/bcachefs/btree_cache.h | 8
-rw-r--r--  fs/bcachefs/btree_gc.c | 394
-rw-r--r--  fs/bcachefs/btree_gc.h | 2
-rw-r--r--  fs/bcachefs/btree_io.c | 399
-rw-r--r--  fs/bcachefs/btree_io.h | 59
-rw-r--r--  fs/bcachefs/btree_iter.c | 2638
-rw-r--r--  fs/bcachefs/btree_iter.h | 398
-rw-r--r--  fs/bcachefs/btree_key_cache.c | 178
-rw-r--r--  fs/bcachefs/btree_key_cache.h | 5
-rw-r--r--  fs/bcachefs/btree_locking.h | 130
-rw-r--r--  fs/bcachefs/btree_types.h | 203
-rw-r--r--  fs/bcachefs/btree_update.h | 63
-rw-r--r--  fs/bcachefs/btree_update_interior.c | 611
-rw-r--r--  fs/bcachefs/btree_update_interior.h | 49
-rw-r--r--  fs/bcachefs/btree_update_leaf.c | 1210
-rw-r--r--  fs/bcachefs/buckets.c | 1064
-rw-r--r--  fs/bcachefs/buckets.h | 27
-rw-r--r--  fs/bcachefs/chardev.c | 47
-rw-r--r--  fs/bcachefs/checksum.c | 188
-rw-r--r--  fs/bcachefs/checksum.h | 26
-rw-r--r--  fs/bcachefs/compress.c | 2
-rw-r--r--  fs/bcachefs/debug.c | 42
-rw-r--r--  fs/bcachefs/dirent.c | 353
-rw-r--r--  fs/bcachefs/dirent.h | 36
-rw-r--r--  fs/bcachefs/ec.c | 130
-rw-r--r--  fs/bcachefs/ec.h | 4
-rw-r--r--  fs/bcachefs/error.c | 1
-rw-r--r--  fs/bcachefs/error.h | 1
-rw-r--r--  fs/bcachefs/extent_update.c | 75
-rw-r--r--  fs/bcachefs/extent_update.h | 12
-rw-r--r--  fs/bcachefs/extents.c | 355
-rw-r--r--  fs/bcachefs/extents.h | 32
-rw-r--r--  fs/bcachefs/fs-common.c | 360
-rw-r--r--  fs/bcachefs/fs-common.h | 26
-rw-r--r--  fs/bcachefs/fs-io.c | 1409
-rw-r--r--  fs/bcachefs/fs-io.h | 10
-rw-r--r--  fs/bcachefs/fs-ioctl.c | 203
-rw-r--r--  fs/bcachefs/fs.c | 700
-rw-r--r--  fs/bcachefs/fs.h | 34
-rw-r--r--  fs/bcachefs/fsck.c | 2046
-rw-r--r--  fs/bcachefs/inode.c | 422
-rw-r--r--  fs/bcachefs/inode.h | 39
-rw-r--r--  fs/bcachefs/io.c | 356
-rw-r--r--  fs/bcachefs/io.h | 27
-rw-r--r--  fs/bcachefs/io_types.h | 5
-rw-r--r--  fs/bcachefs/journal.c | 65
-rw-r--r--  fs/bcachefs/journal.h | 16
-rw-r--r--  fs/bcachefs/journal_io.c | 19
-rw-r--r--  fs/bcachefs/journal_reclaim.c | 12
-rw-r--r--  fs/bcachefs/journal_seq_blacklist.c | 27
-rw-r--r--  fs/bcachefs/journal_types.h | 3
-rw-r--r--  fs/bcachefs/migrate.c | 56
-rw-r--r--  fs/bcachefs/move.c | 270
-rw-r--r--  fs/bcachefs/move.h | 4
-rw-r--r--  fs/bcachefs/move_types.h | 2
-rw-r--r--  fs/bcachefs/movinggc.c | 4
-rw-r--r--  fs/bcachefs/opts.c | 28
-rw-r--r--  fs/bcachefs/opts.h | 23
-rw-r--r--  fs/bcachefs/quota.c | 98
-rw-r--r--  fs/bcachefs/rebalance.c | 11
-rw-r--r--  fs/bcachefs/rebalance_types.h | 1
-rw-r--r--  fs/bcachefs/recovery.c | 238
-rw-r--r--  fs/bcachefs/recovery.h | 4
-rw-r--r--  fs/bcachefs/reflink.c | 179
-rw-r--r--  fs/bcachefs/reflink.h | 7
-rw-r--r--  fs/bcachefs/replicas.c | 3
-rw-r--r--  fs/bcachefs/str_hash.h | 160
-rw-r--r--  fs/bcachefs/subvolume.c | 1084
-rw-r--r--  fs/bcachefs/subvolume.h | 132
-rw-r--r--  fs/bcachefs/subvolume_types.h | 11
-rw-r--r--  fs/bcachefs/super-io.c | 12
-rw-r--r--  fs/bcachefs/super.c | 339
-rw-r--r--  fs/bcachefs/super.h | 1
-rw-r--r--  fs/bcachefs/sysfs.c | 142
-rw-r--r--  fs/bcachefs/tests.c | 212
-rw-r--r--  fs/bcachefs/util.c | 7
-rw-r--r--  fs/bcachefs/util.h | 39
-rw-r--r--  fs/bcachefs/varint.c | 82
-rw-r--r--  fs/bcachefs/varint.h | 3
-rw-r--r--  fs/bcachefs/xattr.c | 92
-rw-r--r--  fs/bcachefs/xattr.h | 3
-rw-r--r--  include/linux/exportfs.h | 6
-rw-r--r--  include/trace/events/bcachefs.h | 254
-rw-r--r--  kernel/locking/six.c | 2
99 files changed, 11644 insertions, 7337 deletions
diff --git a/fs/bcachefs/Kconfig b/fs/bcachefs/Kconfig
index 57c5d58c2d87..27742ce276cd 100644
--- a/fs/bcachefs/Kconfig
+++ b/fs/bcachefs/Kconfig
@@ -20,6 +20,7 @@ config BCACHEFS_FS
select SIXLOCKS
select RAID6_PQ
select XOR_BLOCKS
+ select XXHASH
select SRCU
help
The bcachefs filesystem - a modern, copy on write filesystem, with
diff --git a/fs/bcachefs/Makefile b/fs/bcachefs/Makefile
index ee5e6dbd5ede..71cda24e6d08 100644
--- a/fs/bcachefs/Makefile
+++ b/fs/bcachefs/Makefile
@@ -49,6 +49,7 @@ bcachefs-y := \
reflink.o \
replicas.o \
siphash.o \
+ subvolume.o \
super.o \
super-io.o \
sysfs.o \
diff --git a/fs/bcachefs/acl.c b/fs/bcachefs/acl.c
index 594e1f1a1291..5070caf8f349 100644
--- a/fs/bcachefs/acl.c
+++ b/fs/bcachefs/acl.c
@@ -212,50 +212,61 @@ bch2_acl_to_xattr(struct btree_trans *trans,
return xattr;
}
-struct posix_acl *bch2_get_acl(struct inode *vinode, int type)
+struct posix_acl *bch2_get_acl(struct inode *vinode, int type, bool rcu)
{
struct bch_inode_info *inode = to_bch_ei(vinode);
struct bch_fs *c = inode->v.i_sb->s_fs_info;
struct bch_hash_info hash = bch2_hash_info_init(c, &inode->ei_inode);
struct btree_trans trans;
- struct btree_iter *iter;
+ struct btree_iter iter = { NULL };
struct bkey_s_c_xattr xattr;
struct posix_acl *acl = NULL;
+ struct bkey_s_c k;
+ int ret;
+
+ if (rcu)
+ return ERR_PTR(-ECHILD);
bch2_trans_init(&trans, c, 0, 0);
retry:
bch2_trans_begin(&trans);
- iter = bch2_hash_lookup(&trans, bch2_xattr_hash_desc,
- &hash, inode->v.i_ino,
+ ret = bch2_hash_lookup(&trans, &iter, bch2_xattr_hash_desc,
+ &hash, inode_inum(inode),
&X_SEARCH(acl_to_xattr_type(type), "", 0),
0);
- if (IS_ERR(iter)) {
- if (PTR_ERR(iter) == -EINTR)
+ if (ret) {
+ if (ret == -EINTR)
goto retry;
+ if (ret != -ENOENT)
+ acl = ERR_PTR(ret);
+ goto out;
+ }
- if (PTR_ERR(iter) != -ENOENT)
- acl = ERR_CAST(iter);
+ k = bch2_btree_iter_peek_slot(&iter);
+ ret = bkey_err(k);
+ if (ret) {
+ acl = ERR_PTR(ret);
goto out;
}
- xattr = bkey_s_c_to_xattr(bch2_btree_iter_peek_slot(iter));
+ xattr = bkey_s_c_to_xattr(k);
acl = bch2_acl_from_disk(xattr_val(xattr.v),
le16_to_cpu(xattr.v->x_val_len));
if (!IS_ERR(acl))
set_cached_acl(&inode->v, type, acl);
- bch2_trans_iter_put(&trans, iter);
out:
+ bch2_trans_iter_exit(&trans, &iter);
bch2_trans_exit(&trans);
return acl;
}
-int bch2_set_acl_trans(struct btree_trans *trans,
+int bch2_set_acl_trans(struct btree_trans *trans, subvol_inum inum,
struct bch_inode_unpacked *inode_u,
- const struct bch_hash_info *hash_info,
struct posix_acl *acl, int type)
{
+ struct bch_hash_info hash_info = bch2_hash_info_init(trans->c, inode_u);
int ret;
if (type == ACL_TYPE_DEFAULT &&
@@ -268,27 +279,27 @@ int bch2_set_acl_trans(struct btree_trans *trans,
if (IS_ERR(xattr))
return PTR_ERR(xattr);
- ret = bch2_hash_set(trans, bch2_xattr_hash_desc, hash_info,
- inode_u->bi_inum, &xattr->k_i, 0);
+ ret = bch2_hash_set(trans, bch2_xattr_hash_desc, &hash_info,
+ inum, &xattr->k_i, 0);
} else {
struct xattr_search_key search =
X_SEARCH(acl_to_xattr_type(type), "", 0);
- ret = bch2_hash_delete(trans, bch2_xattr_hash_desc, hash_info,
- inode_u->bi_inum, &search);
+ ret = bch2_hash_delete(trans, bch2_xattr_hash_desc, &hash_info,
+ inum, &search);
}
return ret == -ENOENT ? 0 : ret;
}
-int bch2_set_acl(struct inode *vinode, struct posix_acl *_acl, int type)
+int bch2_set_acl(struct user_namespace *mnt_userns,
+ struct inode *vinode, struct posix_acl *_acl, int type)
{
struct bch_inode_info *inode = to_bch_ei(vinode);
struct bch_fs *c = inode->v.i_sb->s_fs_info;
struct btree_trans trans;
- struct btree_iter *inode_iter;
+ struct btree_iter inode_iter = { NULL };
struct bch_inode_unpacked inode_u;
- struct bch_hash_info hash_info;
struct posix_acl *acl;
umode_t mode;
int ret;
@@ -299,42 +310,37 @@ retry:
bch2_trans_begin(&trans);
acl = _acl;
- inode_iter = bch2_inode_peek(&trans, &inode_u, inode->v.i_ino,
- BTREE_ITER_INTENT);
- ret = PTR_ERR_OR_ZERO(inode_iter);
+ ret = bch2_inode_peek(&trans, &inode_iter, &inode_u, inode_inum(inode),
+ BTREE_ITER_INTENT);
if (ret)
goto btree_err;
mode = inode_u.bi_mode;
if (type == ACL_TYPE_ACCESS) {
- ret = posix_acl_update_mode(&inode->v, &mode, &acl);
+ ret = posix_acl_update_mode(mnt_userns, &inode->v, &mode, &acl);
if (ret)
goto btree_err;
}
- hash_info = bch2_hash_info_init(c, &inode_u);
-
- ret = bch2_set_acl_trans(&trans, &inode_u, &hash_info, acl, type);
+ ret = bch2_set_acl_trans(&trans, inode_inum(inode), &inode_u, acl, type);
if (ret)
goto btree_err;
inode_u.bi_ctime = bch2_current_time(c);
inode_u.bi_mode = mode;
- ret = bch2_inode_write(&trans, inode_iter, &inode_u) ?:
- bch2_trans_commit(&trans, NULL,
- &inode->ei_journal_seq,
- BTREE_INSERT_NOUNLOCK);
+ ret = bch2_inode_write(&trans, &inode_iter, &inode_u) ?:
+ bch2_trans_commit(&trans, NULL, NULL, 0);
btree_err:
- bch2_trans_iter_put(&trans, inode_iter);
+ bch2_trans_iter_exit(&trans, &inode_iter);
if (ret == -EINTR)
goto retry;
if (unlikely(ret))
goto err;
- bch2_inode_update_after_write(c, inode, &inode_u,
+ bch2_inode_update_after_write(&trans, inode, &inode_u,
ATTR_CTIME|ATTR_MODE);
set_cached_acl(&inode->v, type, acl);
@@ -345,31 +351,35 @@ err:
return ret;
}
-int bch2_acl_chmod(struct btree_trans *trans,
+int bch2_acl_chmod(struct btree_trans *trans, subvol_inum inum,
struct bch_inode_unpacked *inode,
umode_t mode,
struct posix_acl **new_acl)
{
struct bch_hash_info hash_info = bch2_hash_info_init(trans->c, inode);
- struct btree_iter *iter;
+ struct btree_iter iter;
struct bkey_s_c_xattr xattr;
struct bkey_i_xattr *new;
struct posix_acl *acl;
+ struct bkey_s_c k;
int ret;
- iter = bch2_hash_lookup(trans, bch2_xattr_hash_desc,
- &hash_info, inode->bi_inum,
+ ret = bch2_hash_lookup(trans, &iter, bch2_xattr_hash_desc,
+ &hash_info, inum,
&X_SEARCH(KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS, "", 0),
BTREE_ITER_INTENT);
- ret = PTR_ERR_OR_ZERO(iter);
if (ret)
return ret == -ENOENT ? 0 : ret;
- xattr = bkey_s_c_to_xattr(bch2_btree_iter_peek_slot(iter));
+ k = bch2_btree_iter_peek_slot(&iter);
+ xattr = bkey_s_c_to_xattr(k);
+ if (ret)
+ goto err;
+
acl = bch2_acl_from_disk(xattr_val(xattr.v),
le16_to_cpu(xattr.v->x_val_len));
ret = PTR_ERR_OR_ZERO(acl);
- if (ret || !acl)
+ if (IS_ERR_OR_NULL(acl))
goto err;
ret = __posix_acl_chmod(&acl, GFP_KERNEL, mode);
@@ -382,13 +392,14 @@ int bch2_acl_chmod(struct btree_trans *trans,
goto err;
}
- new->k.p = iter->pos;
- ret = bch2_trans_update(trans, iter, &new->k_i, 0);
+ new->k.p = iter.pos;
+ ret = bch2_trans_update(trans, &iter, &new->k_i, 0);
*new_acl = acl;
acl = NULL;
err:
- bch2_trans_iter_put(trans, iter);
- kfree(acl);
+ bch2_trans_iter_exit(trans, &iter);
+ if (!IS_ERR_OR_NULL(acl))
+ kfree(acl);
return ret;
}
diff --git a/fs/bcachefs/acl.h b/fs/bcachefs/acl.h
index ba210c26d5c1..2d76a4897ba8 100644
--- a/fs/bcachefs/acl.h
+++ b/fs/bcachefs/acl.h
@@ -26,27 +26,26 @@ typedef struct {
__le32 a_version;
} bch_acl_header;
-struct posix_acl *bch2_get_acl(struct inode *, int);
+struct posix_acl *bch2_get_acl(struct inode *, int, bool);
-int bch2_set_acl_trans(struct btree_trans *,
+int bch2_set_acl_trans(struct btree_trans *, subvol_inum,
struct bch_inode_unpacked *,
- const struct bch_hash_info *,
struct posix_acl *, int);
-int bch2_set_acl(struct inode *, struct posix_acl *, int);
-int bch2_acl_chmod(struct btree_trans *, struct bch_inode_unpacked *,
+int bch2_set_acl(struct user_namespace *, struct inode *, struct posix_acl *, int);
+int bch2_acl_chmod(struct btree_trans *, subvol_inum,
+ struct bch_inode_unpacked *,
umode_t, struct posix_acl **);
#else
-static inline int bch2_set_acl_trans(struct btree_trans *trans,
+static inline int bch2_set_acl_trans(struct btree_trans *trans, subvol_inum inum,
struct bch_inode_unpacked *inode_u,
- const struct bch_hash_info *hash_info,
struct posix_acl *acl, int type)
{
return 0;
}
-static inline int bch2_acl_chmod(struct btree_trans *trans,
+static inline int bch2_acl_chmod(struct btree_trans *trans, subvol_inum inum,
struct bch_inode_unpacked *inode,
umode_t mode,
struct posix_acl **new_acl)
diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c
index 2324b81c09ce..b2735c8591d6 100644
--- a/fs/bcachefs/alloc_background.c
+++ b/fs/bcachefs/alloc_background.c
@@ -130,7 +130,7 @@ static int bch2_alloc_unpack_v2(struct bkey_alloc_unpacked *out,
#define x(_name, _bits) \
if (fieldnr < a.v->nr_fields) { \
- ret = bch2_varint_decode(in, end, &v); \
+ ret = bch2_varint_decode_fast(in, end, &v); \
if (ret < 0) \
return ret; \
in += ret; \
@@ -147,10 +147,44 @@ static int bch2_alloc_unpack_v2(struct bkey_alloc_unpacked *out,
return 0;
}
-static void bch2_alloc_pack_v2(struct bkey_alloc_buf *dst,
+static int bch2_alloc_unpack_v3(struct bkey_alloc_unpacked *out,
+ struct bkey_s_c k)
+{
+ struct bkey_s_c_alloc_v3 a = bkey_s_c_to_alloc_v3(k);
+ const u8 *in = a.v->data;
+ const u8 *end = bkey_val_end(a);
+ unsigned fieldnr = 0;
+ int ret;
+ u64 v;
+
+ out->gen = a.v->gen;
+ out->oldest_gen = a.v->oldest_gen;
+ out->data_type = a.v->data_type;
+ out->journal_seq = le64_to_cpu(a.v->journal_seq);
+
+#define x(_name, _bits) \
+ if (fieldnr < a.v->nr_fields) { \
+ ret = bch2_varint_decode_fast(in, end, &v); \
+ if (ret < 0) \
+ return ret; \
+ in += ret; \
+ } else { \
+ v = 0; \
+ } \
+ out->_name = v; \
+ if (v != out->_name) \
+ return -1; \
+ fieldnr++;
+
+ BCH_ALLOC_FIELDS_V2()
+#undef x
+ return 0;
+}
+
+static void bch2_alloc_pack_v3(struct bkey_alloc_buf *dst,
const struct bkey_alloc_unpacked src)
{
- struct bkey_i_alloc_v2 *a = bkey_alloc_v2_init(&dst->k);
+ struct bkey_i_alloc_v3 *a = bkey_alloc_v3_init(&dst->k);
unsigned nr_fields = 0, last_nonzero_fieldnr = 0;
u8 *out = a->v.data;
u8 *end = (void *) &dst[1];
@@ -161,12 +195,13 @@ static void bch2_alloc_pack_v2(struct bkey_alloc_buf *dst,
a->v.gen = src.gen;
a->v.oldest_gen = src.oldest_gen;
a->v.data_type = src.data_type;
+ a->v.journal_seq = cpu_to_le64(src.journal_seq);
#define x(_name, _bits) \
nr_fields++; \
\
if (src._name) { \
- out += bch2_varint_encode(out, src._name); \
+ out += bch2_varint_encode_fast(out, src._name); \
\
last_nonzero_field = out; \
last_nonzero_fieldnr = nr_fields; \
@@ -194,10 +229,17 @@ struct bkey_alloc_unpacked bch2_alloc_unpack(struct bkey_s_c k)
.gen = 0,
};
- if (k.k->type == KEY_TYPE_alloc_v2)
- bch2_alloc_unpack_v2(&ret, k);
- else if (k.k->type == KEY_TYPE_alloc)
+ switch (k.k->type) {
+ case KEY_TYPE_alloc:
bch2_alloc_unpack_v1(&ret, k);
+ break;
+ case KEY_TYPE_alloc_v2:
+ bch2_alloc_unpack_v2(&ret, k);
+ break;
+ case KEY_TYPE_alloc_v3:
+ bch2_alloc_unpack_v3(&ret, k);
+ break;
+ }
return ret;
}
@@ -206,7 +248,7 @@ void bch2_alloc_pack(struct bch_fs *c,
struct bkey_alloc_buf *dst,
const struct bkey_alloc_unpacked src)
{
- bch2_alloc_pack_v2(dst, src);
+ bch2_alloc_pack_v3(dst, src);
}
static unsigned bch_alloc_v1_val_u64s(const struct bch_alloc *a)
@@ -249,26 +291,41 @@ const char *bch2_alloc_v2_invalid(const struct bch_fs *c, struct bkey_s_c k)
return NULL;
}
+const char *bch2_alloc_v3_invalid(const struct bch_fs *c, struct bkey_s_c k)
+{
+ struct bkey_alloc_unpacked u;
+
+ if (k.k->p.inode >= c->sb.nr_devices ||
+ !c->devs[k.k->p.inode])
+ return "invalid device";
+
+ if (bch2_alloc_unpack_v3(&u, k))
+ return "unpack error";
+
+ return NULL;
+}
+
void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c,
struct bkey_s_c k)
{
struct bkey_alloc_unpacked u = bch2_alloc_unpack(k);
- pr_buf(out, "gen %u oldest_gen %u data_type %s",
- u.gen, u.oldest_gen, bch2_data_types[u.data_type]);
+ pr_buf(out, "gen %u oldest_gen %u data_type %s journal_seq %llu",
+ u.gen, u.oldest_gen, bch2_data_types[u.data_type],
+ u.journal_seq);
#define x(_name, ...) pr_buf(out, " " #_name " %llu", (u64) u._name);
BCH_ALLOC_FIELDS_V2()
#undef x
}
-static int bch2_alloc_read_fn(struct bch_fs *c, struct bkey_s_c k)
+static int bch2_alloc_read_fn(struct btree_trans *trans, struct bkey_s_c k)
{
+ struct bch_fs *c = trans->c;
struct bch_dev *ca;
struct bucket *g;
struct bkey_alloc_unpacked u;
- if (k.k->type != KEY_TYPE_alloc &&
- k.k->type != KEY_TYPE_alloc_v2)
+ if (!bkey_is_alloc(k.k))
return 0;
ca = bch_dev_bkey_exists(c, k.k->p.inode);
@@ -289,11 +346,14 @@ static int bch2_alloc_read_fn(struct bch_fs *c, struct bkey_s_c k)
int bch2_alloc_read(struct bch_fs *c)
{
+ struct btree_trans trans;
int ret;
+ bch2_trans_init(&trans, c, 0, 0);
down_read(&c->gc_lock);
- ret = bch2_btree_and_journal_walk(c, BTREE_ID_alloc, bch2_alloc_read_fn);
+ ret = bch2_btree_and_journal_walk(&trans, BTREE_ID_alloc, bch2_alloc_read_fn);
up_read(&c->gc_lock);
+ bch2_trans_exit(&trans);
if (ret) {
bch_err(c, "error reading alloc info: %i", ret);
return ret;
@@ -353,32 +413,30 @@ err:
int bch2_alloc_write(struct bch_fs *c, unsigned flags)
{
struct btree_trans trans;
- struct btree_iter *iter;
+ struct btree_iter iter;
struct bch_dev *ca;
unsigned i;
int ret = 0;
bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
- iter = bch2_trans_get_iter(&trans, BTREE_ID_alloc, POS_MIN,
- BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
+ bch2_trans_iter_init(&trans, &iter, BTREE_ID_alloc, POS_MIN,
+ BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
for_each_member_device(ca, c, i) {
- bch2_btree_iter_set_pos(iter,
+ bch2_btree_iter_set_pos(&iter,
POS(ca->dev_idx, ca->mi.first_bucket));
- while (iter->pos.offset < ca->mi.nbuckets) {
- bch2_trans_cond_resched(&trans);
-
- ret = bch2_alloc_write_key(&trans, iter, flags);
+ while (iter.pos.offset < ca->mi.nbuckets) {
+ ret = bch2_alloc_write_key(&trans, &iter, flags);
if (ret) {
percpu_ref_put(&ca->ref);
goto err;
}
- bch2_btree_iter_next_slot(iter);
+ bch2_btree_iter_advance(&iter);
}
}
err:
- bch2_trans_iter_put(&trans, iter);
+ bch2_trans_iter_exit(&trans, &iter);
bch2_trans_exit(&trans);
return ret;
}
@@ -390,18 +448,18 @@ int bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev,
{
struct bch_fs *c = trans->c;
struct bch_dev *ca = bch_dev_bkey_exists(c, dev);
- struct btree_iter *iter;
+ struct btree_iter iter;
struct bucket *g;
struct bkey_alloc_buf *a;
struct bkey_alloc_unpacked u;
u64 *time, now;
int ret = 0;
- iter = bch2_trans_get_iter(trans, BTREE_ID_alloc, POS(dev, bucket_nr),
- BTREE_ITER_CACHED|
- BTREE_ITER_CACHED_NOFILL|
- BTREE_ITER_INTENT);
- ret = bch2_btree_iter_traverse(iter);
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, POS(dev, bucket_nr),
+ BTREE_ITER_CACHED|
+ BTREE_ITER_CACHED_NOFILL|
+ BTREE_ITER_INTENT);
+ ret = bch2_btree_iter_traverse(&iter);
if (ret)
goto out;
@@ -412,7 +470,7 @@ int bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev,
percpu_down_read(&c->mark_lock);
g = bucket(ca, bucket_nr);
- u = alloc_mem_to_key(iter, g, READ_ONCE(g->mark));
+ u = alloc_mem_to_key(&iter, g, READ_ONCE(g->mark));
percpu_up_read(&c->mark_lock);
time = rw == READ ? &u.read_time : &u.write_time;
@@ -423,10 +481,10 @@ int bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev,
*time = now;
bch2_alloc_pack(c, a, u);
- ret = bch2_trans_update(trans, iter, &a->k, 0) ?:
+ ret = bch2_trans_update(trans, &iter, &a->k, 0) ?:
bch2_trans_commit(trans, NULL, NULL, 0);
out:
- bch2_trans_iter_put(trans, iter);
+ bch2_trans_iter_exit(trans, &iter);
return ret;
}
@@ -695,27 +753,28 @@ static int bucket_invalidate_btree(struct btree_trans *trans,
struct bkey_alloc_unpacked u;
struct bucket *g;
struct bucket_mark m;
- struct btree_iter *iter =
- bch2_trans_get_iter(trans, BTREE_ID_alloc,
- POS(ca->dev_idx, b),
- BTREE_ITER_CACHED|
- BTREE_ITER_CACHED_NOFILL|
- BTREE_ITER_INTENT);
+ struct btree_iter iter;
int ret;
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc,
+ POS(ca->dev_idx, b),
+ BTREE_ITER_CACHED|
+ BTREE_ITER_CACHED_NOFILL|
+ BTREE_ITER_INTENT);
+
a = bch2_trans_kmalloc(trans, sizeof(*a));
ret = PTR_ERR_OR_ZERO(a);
if (ret)
goto err;
- ret = bch2_btree_iter_traverse(iter);
+ ret = bch2_btree_iter_traverse(&iter);
if (ret)
goto err;
percpu_down_read(&c->mark_lock);
g = bucket(ca, b);
m = READ_ONCE(g->mark);
- u = alloc_mem_to_key(iter, g, m);
+ u = alloc_mem_to_key(&iter, g, m);
percpu_up_read(&c->mark_lock);
u.gen++;
@@ -726,10 +785,10 @@ static int bucket_invalidate_btree(struct btree_trans *trans,
u.write_time = atomic64_read(&c->io_clock[WRITE].now);
bch2_alloc_pack(c, a, u);
- ret = bch2_trans_update(trans, iter, &a->k,
+ ret = bch2_trans_update(trans, &iter, &a->k,
BTREE_TRIGGER_BUCKET_INVALIDATE);
err:
- bch2_trans_iter_put(trans, iter);
+ bch2_trans_iter_exit(trans, &iter);
return ret;
}
@@ -856,10 +915,10 @@ static int bch2_invalidate_buckets(struct bch_fs *c, struct bch_dev *ca)
/* If we used NOWAIT, don't return the error: */
if (!fifo_empty(&ca->free_inc))
ret = 0;
- if (ret) {
+ if (ret < 0)
bch_err(ca, "error invalidating buckets: %i", ret);
+ if (ret)
return ret;
- }
if (journal_seq)
ret = bch2_journal_flush_seq(&c->journal, journal_seq);
@@ -1015,7 +1074,7 @@ void bch2_recalc_capacity(struct bch_fs *c)
lockdep_assert_held(&c->state_lock);
for_each_online_member(ca, c, i) {
- struct backing_dev_info *bdi = ca->disk_sb.bdev->bd_bdi;
+ struct backing_dev_info *bdi = ca->disk_sb.bdev->bd_disk->bdi;
ra_pages += bdi->ra_pages;
}
@@ -1232,3 +1291,22 @@ void bch2_fs_allocator_background_init(struct bch_fs *c)
{
spin_lock_init(&c->freelist_lock);
}
+
+void bch2_open_buckets_to_text(struct printbuf *out, struct bch_fs *c)
+{
+ struct open_bucket *ob;
+
+ for (ob = c->open_buckets;
+ ob < c->open_buckets + ARRAY_SIZE(c->open_buckets);
+ ob++) {
+ spin_lock(&ob->lock);
+ if (ob->valid && !ob->on_partial_list) {
+ pr_buf(out, "%zu ref %u type %s\n",
+ ob - c->open_buckets,
+ atomic_read(&ob->pin),
+ bch2_data_types[ob->type]);
+ }
+ spin_unlock(&ob->lock);
+ }
+
+}
diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h
index 9cadfdb5b83d..370573f8e05d 100644
--- a/fs/bcachefs/alloc_background.h
+++ b/fs/bcachefs/alloc_background.h
@@ -9,6 +9,7 @@
extern const char * const bch2_allocator_states[];
struct bkey_alloc_unpacked {
+ u64 journal_seq;
u64 bucket;
u8 dev;
u8 gen;
@@ -21,19 +22,11 @@ struct bkey_alloc_unpacked {
struct bkey_alloc_buf {
struct bkey_i k;
+ struct bch_alloc_v3 v;
- union {
- struct {
#define x(_name, _bits) + _bits / 8
- u8 _pad[8 + BCH_ALLOC_FIELDS_V1()];
+ u8 _pad[0 + BCH_ALLOC_FIELDS_V2()];
#undef x
- } _v1;
- struct {
-#define x(_name, _bits) + 8 + _bits / 8
- u8 _pad[8 + BCH_ALLOC_FIELDS_V2()];
-#undef x
- } _v2;
- };
} __attribute__((packed, aligned(8)));
/* How out of date a pointer gen is allowed to be: */
@@ -79,6 +72,7 @@ alloc_mem_to_key(struct btree_iter *iter,
const char *bch2_alloc_v1_invalid(const struct bch_fs *, struct bkey_s_c);
const char *bch2_alloc_v2_invalid(const struct bch_fs *, struct bkey_s_c);
+const char *bch2_alloc_v3_invalid(const struct bch_fs *, struct bkey_s_c);
void bch2_alloc_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
#define bch2_bkey_ops_alloc (struct bkey_ops) { \
@@ -91,6 +85,18 @@ void bch2_alloc_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
.val_to_text = bch2_alloc_to_text, \
}
+#define bch2_bkey_ops_alloc_v3 (struct bkey_ops) { \
+ .key_invalid = bch2_alloc_v3_invalid, \
+ .val_to_text = bch2_alloc_to_text, \
+}
+
+static inline bool bkey_is_alloc(const struct bkey *k)
+{
+ return k->type == KEY_TYPE_alloc ||
+ k->type == KEY_TYPE_alloc_v2 ||
+ k->type == KEY_TYPE_alloc_v3;
+}
+
int bch2_alloc_read(struct bch_fs *);
static inline void bch2_wake_allocator(struct bch_dev *ca)
@@ -132,4 +138,6 @@ int bch2_dev_allocator_start(struct bch_dev *);
int bch2_alloc_write(struct bch_fs *, unsigned);
void bch2_fs_allocator_background_init(struct bch_fs *);
+void bch2_open_buckets_to_text(struct printbuf *, struct bch_fs *);
+
#endif /* _BCACHEFS_ALLOC_BACKGROUND_H */
diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h
index b35d008bcc04..fdf3a777ae16 100644
--- a/fs/bcachefs/bcachefs.h
+++ b/fs/bcachefs/bcachefs.h
@@ -179,6 +179,7 @@
#undef pr_fmt
#define pr_fmt(fmt) "bcachefs: %s() " fmt "\n", __func__
+#include <linux/backing-dev-defs.h>
#include <linux/bug.h>
#include <linux/bio.h>
#include <linux/closure.h>
@@ -217,8 +218,8 @@
#define bch2_fmt(_c, fmt) "bcachefs (%s): " fmt "\n", ((_c)->name)
#define bch2_fmt_inum(_c, _inum, fmt) "bcachefs (%s inum %llu): " fmt "\n", ((_c)->name), (_inum)
#else
-#define bch2_fmt(_c, fmt) fmt "\n"
-#define bch2_fmt_inum(_c, _inum, fmt) "inum %llu: " fmt "\n", (_inum)
+#define bch2_fmt(_c, fmt) "%s: " fmt "\n", ((_c)->name)
+#define bch2_fmt_inum(_c, _inum, fmt) "%s inum %llu: " fmt "\n", ((_c)->name), (_inum)
#endif
#define bch_info(c, fmt, ...) \
@@ -352,6 +353,7 @@ enum bch_time_stats {
#include "quota_types.h"
#include "rebalance_types.h"
#include "replicas_types.h"
+#include "subvolume_types.h"
#include "super_types.h"
/* Number of nodes btree coalesce will try to coalesce at once */
@@ -380,6 +382,8 @@ enum gc_phase {
GC_PHASE_BTREE_alloc,
GC_PHASE_BTREE_quotas,
GC_PHASE_BTREE_reflink,
+ GC_PHASE_BTREE_subvolumes,
+ GC_PHASE_BTREE_snapshots,
GC_PHASE_PENDING_DELETE,
};
@@ -491,12 +495,14 @@ struct bch_dev {
enum {
/* startup: */
+ BCH_FS_INITIALIZED,
BCH_FS_ALLOC_READ_DONE,
BCH_FS_ALLOC_CLEAN,
BCH_FS_ALLOCATOR_RUNNING,
BCH_FS_ALLOCATOR_STOPPING,
BCH_FS_INITIAL_GC_DONE,
BCH_FS_INITIAL_GC_UNFIXED,
+ BCH_FS_TOPOLOGY_REPAIR_DONE,
BCH_FS_BTREE_INTERIOR_REPLAY_DONE,
BCH_FS_FSCK_DONE,
BCH_FS_STARTED,
@@ -556,12 +562,27 @@ struct journal_keys {
u64 journal_seq_base;
};
-struct btree_iter_buf {
- struct btree_iter *iter;
+struct btree_path_buf {
+ struct btree_path *path;
};
#define REPLICAS_DELTA_LIST_MAX (1U << 16)
+struct snapshot_t {
+ u32 parent;
+ u32 children[2];
+ u32 subvol; /* Nonzero only if a subvolume points to this node: */
+ u32 equiv;
+};
+
+typedef struct {
+ u32 subvol;
+ u64 inum;
+} subvol_inum;
+
+#define BCACHEFS_ROOT_SUBVOL_INUM \
+ ((subvol_inum) { BCACHEFS_ROOT_SUBVOL, BCACHEFS_ROOT_INO })
+
struct bch_fs {
struct closure cl;
@@ -633,6 +654,15 @@ struct bch_fs {
struct closure sb_write;
struct mutex sb_lock;
+ /* snapshot.c: */
+ GENRADIX(struct snapshot_t) snapshots;
+ struct bch_snapshot_table __rcu *snapshot_table;
+ struct mutex snapshot_table_lock;
+ struct work_struct snapshot_delete_work;
+ struct work_struct snapshot_wait_for_pagecache_and_delete_work;
+ struct snapshot_id_list snapshots_unlinked;
+ struct mutex snapshots_unlinked_lock;
+
/* BTREE CACHE */
struct bio_set btree_bio;
struct workqueue_struct *io_complete_wq;
@@ -665,16 +695,16 @@ struct bch_fs {
/* btree_iter.c: */
struct mutex btree_trans_lock;
struct list_head btree_trans_list;
- mempool_t btree_iters_pool;
+ mempool_t btree_paths_pool;
mempool_t btree_trans_mem_pool;
- struct btree_iter_buf __percpu *btree_iters_bufs;
+ struct btree_path_buf __percpu *btree_paths_bufs;
struct srcu_struct btree_trans_barrier;
struct btree_key_cache btree_key_cache;
struct workqueue_struct *btree_update_wq;
- struct workqueue_struct *btree_error_wq;
+ struct workqueue_struct *btree_io_complete_wq;
/* copygc needs its own workqueue for index updates.. */
struct workqueue_struct *copygc_wq;
@@ -774,7 +804,7 @@ struct bch_fs {
ZSTD_parameters zstd_params;
struct crypto_shash *sha256;
- struct crypto_skcipher *chacha20;
+ struct crypto_sync_skcipher *chacha20;
struct crypto_shash *poly1305;
atomic64_t key_version;
@@ -790,6 +820,10 @@ struct bch_fs {
struct write_point copygc_write_point;
s64 copygc_wait;
+ /* DATA PROGRESS STATS */
+ struct list_head data_progress_list;
+ struct mutex data_progress_lock;
+
/* STRIPES: */
GENRADIX(struct stripe) stripes[2];
@@ -825,8 +859,6 @@ struct bch_fs {
atomic64_t btree_writes_nr;
atomic64_t btree_writes_sectors;
- struct bio_list btree_write_error_list;
- struct work_struct btree_write_error_work;
spinlock_t btree_write_error_lock;
/* ERRORS */
diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h
index 79c0876aab8b..b115bd1fa5a3 100644
--- a/fs/bcachefs/bcachefs_format.h
+++ b/fs/bcachefs/bcachefs_format.h
@@ -148,7 +148,8 @@ static inline struct bpos SPOS(__u64 inode, __u64 offset, __u32 snapshot)
}
#define POS_MIN SPOS(0, 0, 0)
-#define POS_MAX SPOS(KEY_INODE_MAX, KEY_OFFSET_MAX, KEY_SNAPSHOT_MAX)
+#define POS_MAX SPOS(KEY_INODE_MAX, KEY_OFFSET_MAX, 0)
+#define SPOS_MAX SPOS(KEY_INODE_MAX, KEY_OFFSET_MAX, KEY_SNAPSHOT_MAX)
#define POS(_inode, _offset) SPOS(_inode, _offset, 0)
/* Empty placeholder struct, for container_of() */
@@ -322,7 +323,7 @@ static inline void bkey_init(struct bkey *k)
*/
#define BCH_BKEY_TYPES() \
x(deleted, 0) \
- x(discard, 1) \
+ x(whiteout, 1) \
x(error, 2) \
x(cookie, 3) \
x(hash_whiteout, 4) \
@@ -341,7 +342,11 @@ static inline void bkey_init(struct bkey *k)
x(inline_data, 17) \
x(btree_ptr_v2, 18) \
x(indirect_inline_data, 19) \
- x(alloc_v2, 20)
+ x(alloc_v2, 20) \
+ x(subvolume, 21) \
+ x(snapshot, 22) \
+ x(inode_v2, 23) \
+ x(alloc_v3, 24)
enum bch_bkey_type {
#define x(name, nr) KEY_TYPE_##name = nr,
@@ -354,7 +359,7 @@ struct bch_deleted {
struct bch_val v;
};
-struct bch_discard {
+struct bch_whiteout {
struct bch_val v;
};
@@ -678,6 +683,16 @@ struct bch_inode {
__u8 fields[0];
} __attribute__((packed, aligned(8)));
+struct bch_inode_v2 {
+ struct bch_val v;
+
+ __le64 bi_journal_seq;
+ __le64 bi_hash_seed;
+ __le64 bi_flags;
+ __le16 bi_mode;
+ __u8 fields[0];
+} __attribute__((packed, aligned(8)));
+
struct bch_inode_generation {
struct bch_val v;
@@ -685,6 +700,10 @@ struct bch_inode_generation {
__le32 pad;
} __attribute__((packed, aligned(8)));
+/*
+ * bi_subvol and bi_parent_subvol are only set for subvolume roots:
+ */
+
#define BCH_INODE_FIELDS() \
x(bi_atime, 96) \
x(bi_ctime, 96) \
@@ -708,7 +727,9 @@ struct bch_inode_generation {
x(bi_erasure_code, 16) \
x(bi_fields_set, 16) \
x(bi_dir, 64) \
- x(bi_dir_offset, 64)
+ x(bi_dir_offset, 64) \
+ x(bi_subvol, 32) \
+ x(bi_parent_subvol, 32)
/* subset of BCH_INODE_FIELDS */
#define BCH_INODE_OPTS() \
@@ -763,6 +784,9 @@ LE32_BITMASK(INODE_STR_HASH, struct bch_inode, bi_flags, 20, 24);
LE32_BITMASK(INODE_NR_FIELDS, struct bch_inode, bi_flags, 24, 31);
LE32_BITMASK(INODE_NEW_VARINT, struct bch_inode, bi_flags, 31, 32);
+LE64_BITMASK(INODEv2_STR_HASH, struct bch_inode_v2, bi_flags, 20, 24);
+LE64_BITMASK(INODEv2_NR_FIELDS, struct bch_inode_v2, bi_flags, 24, 31);
+
/* Dirents */
/*
@@ -780,7 +804,13 @@ struct bch_dirent {
struct bch_val v;
/* Target inode number: */
+ union {
__le64 d_inum;
+ struct { /* DT_SUBVOL */
+ __le32 d_child_subvol;
+ __le32 d_parent_subvol;
+ };
+ };
/*
* Copy of mode bits 12-15 from the target inode - so userspace can get
@@ -791,6 +821,9 @@ struct bch_dirent {
__u8 d_name[];
} __attribute__((packed, aligned(8)));
+#define DT_SUBVOL 16
+#define BCH_DT_MAX 17
+
#define BCH_NAME_MAX (U8_MAX * sizeof(u64) - \
sizeof(struct bkey) - \
offsetof(struct bch_dirent, d_name))
@@ -848,6 +881,17 @@ struct bch_alloc_v2 {
x(stripe, 32) \
x(stripe_redundancy, 8)
+struct bch_alloc_v3 {
+ struct bch_val v;
+ __le64 journal_seq;
+ __le32 flags;
+ __u8 nr_fields;
+ __u8 gen;
+ __u8 oldest_gen;
+ __u8 data_type;
+ __u8 data[];
+} __attribute__((packed, aligned(8)));
+
enum {
#define x(name, _bits) BCH_ALLOC_FIELD_V1_##name,
BCH_ALLOC_FIELDS_V1()
@@ -901,18 +945,24 @@ struct bch_stripe {
struct bch_reflink_p {
struct bch_val v;
__le64 idx;
-
- __le32 reservation_generation;
- __u8 nr_replicas;
- __u8 pad[3];
-};
+ /*
+ * A reflink pointer might point to an indirect extent which is then
+ * later split (by copygc or rebalance). If we only pointed to part of
+ * the original indirect extent, and then one of the fragments is
+ * outside the range we point to, we'd leak a refcount: so when creating
+ * reflink pointers, we need to store pad values to remember the full
+ * range we were taking a reference on.
+ */
+ __le32 front_pad;
+ __le32 back_pad;
+} __attribute__((packed, aligned(8)));
struct bch_reflink_v {
struct bch_val v;
__le64 refcount;
union bch_extent_entry start[0];
__u64 _data[0];
-};
+} __attribute__((packed, aligned(8)));
struct bch_indirect_inline_data {
struct bch_val v;
@@ -927,6 +977,43 @@ struct bch_inline_data {
u8 data[0];
};
+/* Subvolumes: */
+
+#define SUBVOL_POS_MIN POS(0, 1)
+#define SUBVOL_POS_MAX POS(0, S32_MAX)
+#define BCACHEFS_ROOT_SUBVOL 1
+
+struct bch_subvolume {
+ struct bch_val v;
+ __le32 flags;
+ __le32 snapshot;
+ __le64 inode;
+};
+
+LE32_BITMASK(BCH_SUBVOLUME_RO, struct bch_subvolume, flags, 0, 1)
+/*
+ * We need to know whether a subvolume is a snapshot so we can know whether we
+ * can delete it (or whether it should just be rm -rf'd)
+ */
+LE32_BITMASK(BCH_SUBVOLUME_SNAP, struct bch_subvolume, flags, 1, 2)
+LE32_BITMASK(BCH_SUBVOLUME_UNLINKED, struct bch_subvolume, flags, 2, 3)
+
+/* Snapshots */
+
+struct bch_snapshot {
+ struct bch_val v;
+ __le32 flags;
+ __le32 parent;
+ __le32 children[2];
+ __le32 subvol;
+ __le32 pad;
+};
+
+LE32_BITMASK(BCH_SNAPSHOT_DELETED, struct bch_snapshot, flags, 0, 1)
+
+/* True if a subvolume points to this snapshot node: */
+LE32_BITMASK(BCH_SNAPSHOT_SUBVOL, struct bch_snapshot, flags, 1, 2)
+
/* Optional/variable size superblock sections: */
struct bch_sb_field {
@@ -983,8 +1070,6 @@ LE64_BITMASK(BCH_MEMBER_DATA_ALLOWED, struct bch_member, flags[0], 15, 20)
LE64_BITMASK(BCH_MEMBER_GROUP, struct bch_member, flags[0], 20, 28)
LE64_BITMASK(BCH_MEMBER_DURABILITY, struct bch_member, flags[0], 28, 30)
-#define BCH_TIER_MAX 4U
-
#if 0
LE64_BITMASK(BCH_MEMBER_NR_READ_ERRORS, struct bch_member, flags[1], 0, 20);
LE64_BITMASK(BCH_MEMBER_NR_WRITE_ERRORS,struct bch_member, flags[1], 20, 40);
@@ -1209,7 +1294,12 @@ enum bcachefs_metadata_version {
bcachefs_metadata_version_inode_btree_change = 11,
bcachefs_metadata_version_snapshot = 12,
bcachefs_metadata_version_inode_backpointers = 13,
- bcachefs_metadata_version_max = 14,
+ bcachefs_metadata_version_btree_ptr_sectors_written = 14,
+ bcachefs_metadata_version_snapshot_2 = 15,
+ bcachefs_metadata_version_reflink_p_fix = 16,
+ bcachefs_metadata_version_subvol_dirent = 17,
+ bcachefs_metadata_version_inode_v2 = 18,
+ bcachefs_metadata_version_max = 19,
};
#define bcachefs_metadata_version_current (bcachefs_metadata_version_max - 1)
@@ -1345,6 +1435,7 @@ LE64_BITMASK(BCH_SB_GC_RESERVE_BYTES, struct bch_sb, flags[2], 4, 64);
LE64_BITMASK(BCH_SB_ERASURE_CODE, struct bch_sb, flags[3], 0, 16);
LE64_BITMASK(BCH_SB_METADATA_TARGET, struct bch_sb, flags[3], 16, 28);
LE64_BITMASK(BCH_SB_SHARD_INUMS, struct bch_sb, flags[3], 28, 29);
+LE64_BITMASK(BCH_SB_INODES_USE_KEY_CACHE,struct bch_sb, flags[3], 29, 30);
/*
* Features:
@@ -1352,7 +1443,7 @@ LE64_BITMASK(BCH_SB_SHARD_INUMS, struct bch_sb, flags[3], 28, 29);
* journal_seq_blacklist_v3: gates BCH_SB_FIELD_journal_seq_blacklist
* reflink: gates KEY_TYPE_reflink
* inline_data: gates KEY_TYPE_inline_data
- * new_siphash: gates BCH_STR_HASH_SIPHASH
+ * new_siphash: gates BCH_STR_HASH_siphash
* new_extent_overwrite: gates BTREE_NODE_NEW_EXTENT_OVERWRITE
*/
#define BCH_SB_FEATURES() \
@@ -1428,12 +1519,17 @@ enum bch_error_actions {
BCH_ON_ERROR_NR
};
+#define BCH_STR_HASH_TYPES() \
+ x(crc32c, 0) \
+ x(crc64, 1) \
+ x(siphash_old, 2) \
+ x(siphash, 3)
+
enum bch_str_hash_type {
- BCH_STR_HASH_CRC32C = 0,
- BCH_STR_HASH_CRC64 = 1,
- BCH_STR_HASH_SIPHASH_OLD = 2,
- BCH_STR_HASH_SIPHASH = 3,
- BCH_STR_HASH_NR = 4,
+#define x(t, n) BCH_STR_HASH_##t = n,
+ BCH_STR_HASH_TYPES()
+#undef x
+ BCH_STR_HASH_NR
};
#define BCH_STR_HASH_OPTS() \
@@ -1448,32 +1544,39 @@ enum bch_str_hash_opts {
BCH_STR_HASH_OPT_NR
};
+#define BCH_CSUM_TYPES() \
+ x(none, 0) \
+ x(crc32c_nonzero, 1) \
+ x(crc64_nonzero, 2) \
+ x(chacha20_poly1305_80, 3) \
+ x(chacha20_poly1305_128, 4) \
+ x(crc32c, 5) \
+ x(crc64, 6) \
+ x(xxhash, 7)
+
enum bch_csum_type {
- BCH_CSUM_NONE = 0,
- BCH_CSUM_CRC32C_NONZERO = 1,
- BCH_CSUM_CRC64_NONZERO = 2,
- BCH_CSUM_CHACHA20_POLY1305_80 = 3,
- BCH_CSUM_CHACHA20_POLY1305_128 = 4,
- BCH_CSUM_CRC32C = 5,
- BCH_CSUM_CRC64 = 6,
- BCH_CSUM_NR = 7,
+#define x(t, n) BCH_CSUM_##t = n,
+ BCH_CSUM_TYPES()
+#undef x
+ BCH_CSUM_NR
};
static const unsigned bch_crc_bytes[] = {
- [BCH_CSUM_NONE] = 0,
- [BCH_CSUM_CRC32C_NONZERO] = 4,
- [BCH_CSUM_CRC32C] = 4,
- [BCH_CSUM_CRC64_NONZERO] = 8,
- [BCH_CSUM_CRC64] = 8,
- [BCH_CSUM_CHACHA20_POLY1305_80] = 10,
- [BCH_CSUM_CHACHA20_POLY1305_128] = 16,
+ [BCH_CSUM_none] = 0,
+ [BCH_CSUM_crc32c_nonzero] = 4,
+ [BCH_CSUM_crc32c] = 4,
+ [BCH_CSUM_crc64_nonzero] = 8,
+ [BCH_CSUM_crc64] = 8,
+ [BCH_CSUM_xxhash] = 8,
+ [BCH_CSUM_chacha20_poly1305_80] = 10,
+ [BCH_CSUM_chacha20_poly1305_128] = 16,
};
static inline _Bool bch2_csum_type_is_encryption(enum bch_csum_type type)
{
switch (type) {
- case BCH_CSUM_CHACHA20_POLY1305_80:
- case BCH_CSUM_CHACHA20_POLY1305_128:
+ case BCH_CSUM_chacha20_poly1305_80:
+ case BCH_CSUM_chacha20_poly1305_128:
return true;
default:
return false;
@@ -1483,7 +1586,8 @@ static inline _Bool bch2_csum_type_is_encryption(enum bch_csum_type type)
#define BCH_CSUM_OPTS() \
x(none, 0) \
x(crc32c, 1) \
- x(crc64, 2)
+ x(crc64, 2) \
+ x(xxhash, 3)
enum bch_csum_opts {
#define x(t, n) BCH_CSUM_OPT_##t = n,
@@ -1689,7 +1793,9 @@ LE32_BITMASK(JSET_NO_FLUSH, struct jset, flags, 5, 6);
x(alloc, 4) \
x(quotas, 5) \
x(stripes, 6) \
- x(reflink, 7)
+ x(reflink, 7) \
+ x(subvolumes, 8) \
+ x(snapshots, 9)
enum btree_id {
#define x(kwd, val) BTREE_ID_##kwd = val,
@@ -1736,6 +1842,9 @@ LE32_BITMASK(BSET_BIG_ENDIAN, struct bset, flags, 4, 5);
LE32_BITMASK(BSET_SEPARATE_WHITEOUTS,
struct bset, flags, 5, 6);
+/* Sector offset within the btree node: */
+LE32_BITMASK(BSET_OFFSET, struct bset, flags, 16, 32);
+
struct btree_node {
struct bch_csum csum;
__le64 magic;
diff --git a/fs/bcachefs/bcachefs_ioctl.h b/fs/bcachefs/bcachefs_ioctl.h
index f679fc2151bc..930981ad5535 100644
--- a/fs/bcachefs/bcachefs_ioctl.h
+++ b/fs/bcachefs/bcachefs_ioctl.h
@@ -78,6 +78,9 @@ struct bch_ioctl_incremental {
#define BCH_IOCTL_DISK_RESIZE _IOW(0xbc, 14, struct bch_ioctl_disk_resize)
#define BCH_IOCTL_DISK_RESIZE_JOURNAL _IOW(0xbc,15, struct bch_ioctl_disk_resize_journal)
+#define BCH_IOCTL_SUBVOLUME_CREATE _IOW(0xbc, 16, struct bch_ioctl_subvolume)
+#define BCH_IOCTL_SUBVOLUME_DESTROY _IOW(0xbc, 17, struct bch_ioctl_subvolume)
+
/* ioctl below act on a particular file, not the filesystem as a whole: */
#define BCHFS_IOC_REINHERIT_ATTRS _IOR(0xbc, 64, const char __user *)
@@ -349,4 +352,16 @@ struct bch_ioctl_disk_resize_journal {
__u64 nbuckets;
};
+struct bch_ioctl_subvolume {
+ __u32 flags;
+ __u32 dirfd;
+ __u16 mode;
+ __u16 pad[3];
+ __u64 dst_ptr;
+ __u64 src_ptr;
+};
+
+#define BCH_SUBVOL_SNAPSHOT_CREATE (1U << 0)
+#define BCH_SUBVOL_SNAPSHOT_RO (1U << 1)
+
#endif /* _BCACHEFS_IOCTL_H */
diff --git a/fs/bcachefs/bkey.c b/fs/bcachefs/bkey.c
index 0053f32c0076..946dd27f09fc 100644
--- a/fs/bcachefs/bkey.c
+++ b/fs/bcachefs/bkey.c
@@ -620,22 +620,22 @@ const char *bch2_bkey_format_validate(struct bkey_format *f)
if (f->nr_fields != BKEY_NR_FIELDS)
return "incorrect number of fields";
+ /*
+ * Verify that the packed format can't represent fields larger than the
+ * unpacked format:
+ */
for (i = 0; i < f->nr_fields; i++) {
unsigned unpacked_bits = bch2_bkey_format_current.bits_per_field[i];
- u64 unpacked_mask = ~((~0ULL << 1) << (unpacked_bits - 1));
+ u64 unpacked_max = ~((~0ULL << 1) << (unpacked_bits - 1));
+ u64 packed_max = f->bits_per_field[i]
+ ? ~((~0ULL << 1) << (f->bits_per_field[i] - 1))
+ : 0;
u64 field_offset = le64_to_cpu(f->field_offset[i]);
- if (f->bits_per_field[i] > unpacked_bits)
+ if (packed_max + field_offset < packed_max ||
+ packed_max + field_offset > unpacked_max)
return "field too large";
- if ((f->bits_per_field[i] == unpacked_bits) && field_offset)
- return "offset + bits overflow";
-
- if (((field_offset + ((1ULL << f->bits_per_field[i]) - 1)) &
- unpacked_mask) <
- field_offset)
- return "offset + bits overflow";
-
bits += f->bits_per_field[i];
}
diff --git a/fs/bcachefs/bkey.h b/fs/bcachefs/bkey.h
index 2e45d88fab03..7dee3d8e0a3d 100644
--- a/fs/bcachefs/bkey.h
+++ b/fs/bcachefs/bkey.h
@@ -55,7 +55,7 @@ static inline void set_bkey_val_bytes(struct bkey *k, unsigned bytes)
#define bkey_deleted(_k) ((_k)->type == KEY_TYPE_deleted)
#define bkey_whiteout(_k) \
- ((_k)->type == KEY_TYPE_deleted || (_k)->type == KEY_TYPE_discard)
+ ((_k)->type == KEY_TYPE_deleted || (_k)->type == KEY_TYPE_whiteout)
enum bkey_lr_packed {
BKEY_PACKED_BOTH,
@@ -163,37 +163,6 @@ static inline struct bpos bpos_max(struct bpos l, struct bpos r)
return bpos_cmp(l, r) > 0 ? l : r;
}
-#define sbb(a, b, borrow) \
-do { \
- typeof(a) d1, d2; \
- \
- d1 = a - borrow; \
- borrow = d1 > a; \
- \
- d2 = d1 - b; \
- borrow += d2 > d1; \
- a = d2; \
-} while (0)
-
-/* returns a - b: */
-static inline struct bpos bpos_sub(struct bpos a, struct bpos b)
-{
- int borrow = 0;
-
- sbb(a.snapshot, b.snapshot, borrow);
- sbb(a.offset, b.offset, borrow);
- sbb(a.inode, b.inode, borrow);
- return a;
-}
-
-static inline struct bpos bpos_diff(struct bpos l, struct bpos r)
-{
- if (bpos_cmp(l, r) > 0)
- swap(l, r);
-
- return bpos_sub(r, l);
-}
-
void bch2_bpos_swab(struct bpos *);
void bch2_bkey_swab_key(const struct bkey_format *, struct bkey_packed *);
diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c
index 9f869bed9f1c..5c900cf8a8a2 100644
--- a/fs/bcachefs/bkey_methods.c
+++ b/fs/bcachefs/bkey_methods.c
@@ -11,6 +11,7 @@
#include "inode.h"
#include "quota.h"
#include "reflink.h"
+#include "subvolume.h"
#include "xattr.h"
const char * const bch2_bkey_types[] = {
@@ -30,7 +31,7 @@ static const char *deleted_key_invalid(const struct bch_fs *c,
.key_invalid = deleted_key_invalid, \
}
-#define bch2_bkey_ops_discard (struct bkey_ops) { \
+#define bch2_bkey_ops_whiteout (struct bkey_ops) { \
.key_invalid = deleted_key_invalid, \
}
@@ -84,7 +85,7 @@ static void key_type_inline_data_to_text(struct printbuf *out, struct bch_fs *c,
.val_to_text = key_type_inline_data_to_text, \
}
-static const struct bkey_ops bch2_bkey_ops[] = {
+const struct bkey_ops bch2_bkey_ops[] = {
#define x(name, nr) [KEY_TYPE_##name] = bch2_bkey_ops_##name,
BCH_BKEY_TYPES()
#undef x
@@ -100,31 +101,54 @@ const char *bch2_bkey_val_invalid(struct bch_fs *c, struct bkey_s_c k)
static unsigned bch2_key_types_allowed[] = {
[BKEY_TYPE_extents] =
+ (1U << KEY_TYPE_deleted)|
+ (1U << KEY_TYPE_whiteout)|
(1U << KEY_TYPE_error)|
+ (1U << KEY_TYPE_cookie)|
(1U << KEY_TYPE_extent)|
(1U << KEY_TYPE_reservation)|
(1U << KEY_TYPE_reflink_p)|
(1U << KEY_TYPE_inline_data),
[BKEY_TYPE_inodes] =
+ (1U << KEY_TYPE_deleted)|
+ (1U << KEY_TYPE_whiteout)|
(1U << KEY_TYPE_inode)|
+ (1U << KEY_TYPE_inode_v2)|
(1U << KEY_TYPE_inode_generation),
[BKEY_TYPE_dirents] =
+ (1U << KEY_TYPE_deleted)|
+ (1U << KEY_TYPE_whiteout)|
(1U << KEY_TYPE_hash_whiteout)|
(1U << KEY_TYPE_dirent),
[BKEY_TYPE_xattrs] =
+ (1U << KEY_TYPE_deleted)|
+ (1U << KEY_TYPE_whiteout)|
+ (1U << KEY_TYPE_cookie)|
(1U << KEY_TYPE_hash_whiteout)|
(1U << KEY_TYPE_xattr),
[BKEY_TYPE_alloc] =
+ (1U << KEY_TYPE_deleted)|
(1U << KEY_TYPE_alloc)|
- (1U << KEY_TYPE_alloc_v2),
+ (1U << KEY_TYPE_alloc_v2)|
+ (1U << KEY_TYPE_alloc_v3),
[BKEY_TYPE_quotas] =
+ (1U << KEY_TYPE_deleted)|
(1U << KEY_TYPE_quota),
[BKEY_TYPE_stripes] =
+ (1U << KEY_TYPE_deleted)|
(1U << KEY_TYPE_stripe),
[BKEY_TYPE_reflink] =
+ (1U << KEY_TYPE_deleted)|
(1U << KEY_TYPE_reflink_v)|
(1U << KEY_TYPE_indirect_inline_data),
+ [BKEY_TYPE_subvolumes] =
+ (1U << KEY_TYPE_deleted)|
+ (1U << KEY_TYPE_subvolume),
+ [BKEY_TYPE_snapshots] =
+ (1U << KEY_TYPE_deleted)|
+ (1U << KEY_TYPE_snapshot),
[BKEY_TYPE_btree] =
+ (1U << KEY_TYPE_deleted)|
(1U << KEY_TYPE_btree_ptr)|
(1U << KEY_TYPE_btree_ptr_v2),
};
@@ -132,21 +156,18 @@ static unsigned bch2_key_types_allowed[] = {
const char *__bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k,
enum btree_node_type type)
{
- unsigned key_types_allowed = (1U << KEY_TYPE_deleted)|
- bch2_key_types_allowed[type] ;
-
if (k.k->u64s < BKEY_U64s)
return "u64s too small";
- if (!(key_types_allowed & (1U << k.k->type)))
+ if (!(bch2_key_types_allowed[type] & (1U << k.k->type)))
return "invalid key type for this btree";
if (type == BKEY_TYPE_btree &&
bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX)
return "value too big";
- if (btree_node_type_is_extents(type)) {
- if ((k.k->size == 0) != bkey_deleted(k.k))
+ if (btree_node_type_is_extents(type) && !bkey_whiteout(k.k)) {
+ if (k.k->size == 0)
return "bad size field";
if (k.k->size > k.k->p.offset)
@@ -163,7 +184,7 @@ const char *__bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k,
if (type != BKEY_TYPE_btree &&
btree_type_has_snapshots(type) &&
- k.k->p.snapshot != U32_MAX)
+ !k.k->p.snapshot)
return "invalid snapshot field";
if (type != BKEY_TYPE_btree &&
@@ -213,6 +234,8 @@ void bch2_bpos_to_text(struct printbuf *out, struct bpos pos)
pr_buf(out, "POS_MIN");
else if (!bpos_cmp(pos, POS_MAX))
pr_buf(out, "POS_MAX");
+ else if (!bpos_cmp(pos, SPOS_MAX))
+ pr_buf(out, "SPOS_MAX");
else {
if (pos.inode == U64_MAX)
pr_buf(out, "U64_MAX");
@@ -267,7 +290,7 @@ void bch2_bkey_val_to_text(struct printbuf *out, struct bch_fs *c,
{
bch2_bkey_to_text(out, k.k);
- if (k.k) {
+ if (bkey_val_bytes(k.k)) {
pr_buf(out, ": ");
bch2_val_to_text(out, c, k);
}
@@ -290,24 +313,11 @@ bool bch2_bkey_normalize(struct bch_fs *c, struct bkey_s k)
: false;
}
-enum merge_result bch2_bkey_merge(struct bch_fs *c,
- struct bkey_s l, struct bkey_s r)
+bool bch2_bkey_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r)
{
const struct bkey_ops *ops = &bch2_bkey_ops[l.k->type];
- enum merge_result ret;
-
- if (bch2_key_merging_disabled ||
- !ops->key_merge ||
- l.k->type != r.k->type ||
- bversion_cmp(l.k->version, r.k->version) ||
- bpos_cmp(l.k->p, bkey_start_pos(r.k)))
- return BCH_MERGE_NOMERGE;
-
- ret = ops->key_merge(c, l, r);
- if (ret != BCH_MERGE_NOMERGE)
- l.k->needs_whiteout |= r.k->needs_whiteout;
- return ret;
+ return bch2_bkey_maybe_mergable(l.k, r.k) && ops->key_merge(c, l, r);
}
static const struct old_bkey_type {
diff --git a/fs/bcachefs/bkey_methods.h b/fs/bcachefs/bkey_methods.h
index bfa6f112aeed..3012035db1a3 100644
--- a/fs/bcachefs/bkey_methods.h
+++ b/fs/bcachefs/bkey_methods.h
@@ -11,17 +11,6 @@ enum btree_node_type;
extern const char * const bch2_bkey_types[];
-enum merge_result {
- BCH_MERGE_NOMERGE,
-
- /*
- * The keys were mergeable, but would have overflowed size - so instead
- * l was changed to the maximum size, and both keys were modified:
- */
- BCH_MERGE_PARTIAL,
- BCH_MERGE_MERGE,
-};
-
struct bkey_ops {
/* Returns reason for being invalid if invalid, else NULL: */
const char * (*key_invalid)(const struct bch_fs *,
@@ -30,13 +19,14 @@ struct bkey_ops {
struct bkey_s_c);
void (*swab)(struct bkey_s);
bool (*key_normalize)(struct bch_fs *, struct bkey_s);
- enum merge_result (*key_merge)(struct bch_fs *,
- struct bkey_s, struct bkey_s);
+ bool (*key_merge)(struct bch_fs *, struct bkey_s, struct bkey_s_c);
void (*compat)(enum btree_id id, unsigned version,
unsigned big_endian, int write,
struct bkey_s);
};
+extern const struct bkey_ops bch2_bkey_ops[];
+
const char *bch2_bkey_val_invalid(struct bch_fs *, struct bkey_s_c);
const char *__bch2_bkey_invalid(struct bch_fs *, struct bkey_s_c,
enum btree_node_type);
@@ -57,8 +47,17 @@ void bch2_bkey_swab_val(struct bkey_s);
bool bch2_bkey_normalize(struct bch_fs *, struct bkey_s);
-enum merge_result bch2_bkey_merge(struct bch_fs *,
- struct bkey_s, struct bkey_s);
+static inline bool bch2_bkey_maybe_mergable(const struct bkey *l, const struct bkey *r)
+{
+ return l->type == r->type &&
+ !bversion_cmp(l->version, r->version) &&
+ !bpos_cmp(l->p, bkey_start_pos(r)) &&
+ (u64) l->size + r->size <= KEY_SIZE_MAX &&
+ bch2_bkey_ops[l->type].key_merge &&
+ !bch2_key_merging_disabled;
+}
+
+bool bch2_bkey_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c);
void bch2_bkey_renumber(enum btree_node_type, struct bkey_packed *, int);
diff --git a/fs/bcachefs/bset.c b/fs/bcachefs/bset.c
index 8a149e21d0b4..59e4c1d1a2a5 100644
--- a/fs/bcachefs/bset.c
+++ b/fs/bcachefs/bset.c
@@ -197,9 +197,11 @@ void bch2_btree_node_iter_verify(struct btree_node_iter *iter,
return;
/* Verify no duplicates: */
- btree_node_iter_for_each(iter, set)
+ btree_node_iter_for_each(iter, set) {
+ BUG_ON(set->k > set->end);
btree_node_iter_for_each(iter, s2)
BUG_ON(set != s2 && set->end == s2->end);
+ }
/* Verify that set->end is correct: */
btree_node_iter_for_each(iter, set) {
@@ -1193,7 +1195,7 @@ static struct bkey_packed *bset_search_write_set(const struct btree *b,
static inline void prefetch_four_cachelines(void *p)
{
-#if CONFIG_X86_64
+#ifdef CONFIG_X86_64
asm("prefetcht0 (-127 + 64 * 0)(%0);"
"prefetcht0 (-127 + 64 * 1)(%0);"
"prefetcht0 (-127 + 64 * 2)(%0);"
diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c
index 986f9b4b1a21..5ae61e5d3923 100644
--- a/fs/bcachefs/btree_cache.c
+++ b/fs/bcachefs/btree_cache.c
@@ -13,6 +13,8 @@
#include <linux/sched/mm.h>
#include <trace/events/bcachefs.h>
+struct lock_class_key bch2_btree_node_lock_key;
+
void bch2_recalc_btree_reserve(struct bch_fs *c)
{
unsigned i, reserve = 16;
@@ -76,8 +78,7 @@ static int btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp)
if (!b->data)
return -ENOMEM;
#ifdef __KERNEL__
- b->aux_data = __vmalloc(btree_aux_data_bytes(b), gfp,
- PAGE_KERNEL_EXEC);
+ b->aux_data = vmalloc_exec(btree_aux_data_bytes(b), gfp);
#else
b->aux_data = mmap(NULL, btree_aux_data_bytes(b),
PROT_READ|PROT_WRITE|PROT_EXEC,
@@ -99,7 +100,7 @@ static struct btree *__btree_node_mem_alloc(struct bch_fs *c)
return NULL;
bkey_btree_ptr_init(&b->key);
- six_lock_init(&b->c.lock);
+ __six_lock_init(&b->c.lock, "b->c.lock", &bch2_btree_node_lock_key);
INIT_LIST_HEAD(&b->list);
INIT_LIST_HEAD(&b->write_blocked);
b->byte_order = ilog2(btree_bytes(c));
@@ -127,7 +128,8 @@ struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *c)
void bch2_btree_node_hash_remove(struct btree_cache *bc, struct btree *b)
{
- rhashtable_remove_fast(&bc->table, &b->hash, bch_btree_cache_params);
+ int ret = rhashtable_remove_fast(&bc->table, &b->hash, bch_btree_cache_params);
+ BUG_ON(ret);
/* Cause future lookups for this node to fail: */
b->hash_val = 0;
@@ -185,6 +187,17 @@ static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush)
int ret = 0;
lockdep_assert_held(&bc->lock);
+wait_on_io:
+ if (b->flags & ((1U << BTREE_NODE_dirty)|
+ (1U << BTREE_NODE_read_in_flight)|
+ (1U << BTREE_NODE_write_in_flight))) {
+ if (!flush)
+ return -ENOMEM;
+
+ /* XXX: waiting on IO with btree cache lock held */
+ bch2_btree_node_wait_on_read(b);
+ bch2_btree_node_wait_on_write(b);
+ }
if (!six_trylock_intent(&b->c.lock))
return -ENOMEM;
@@ -192,25 +205,26 @@ static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush)
if (!six_trylock_write(&b->c.lock))
goto out_unlock_intent;
+ /* recheck under lock */
+ if (b->flags & ((1U << BTREE_NODE_read_in_flight)|
+ (1U << BTREE_NODE_write_in_flight))) {
+ if (!flush)
+ goto out_unlock;
+ six_unlock_write(&b->c.lock);
+ six_unlock_intent(&b->c.lock);
+ goto wait_on_io;
+ }
+
if (btree_node_noevict(b))
goto out_unlock;
if (!btree_node_may_write(b))
goto out_unlock;
- if (btree_node_dirty(b) &&
- test_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags))
- goto out_unlock;
-
- if (btree_node_dirty(b) ||
- btree_node_write_in_flight(b) ||
- btree_node_read_in_flight(b)) {
- if (!flush)
+ if (btree_node_dirty(b)) {
+ if (!flush ||
+ test_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags))
goto out_unlock;
-
- wait_on_bit_io(&b->flags, BTREE_NODE_read_in_flight,
- TASK_UNINTERRUPTIBLE);
-
/*
* Using the underscore version because we don't want to compact
* bsets after the write, since this node is about to be evicted
@@ -220,10 +234,11 @@ static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush)
if (bch2_verify_btree_ondisk)
bch2_btree_node_write(c, b, SIX_LOCK_intent);
else
- __bch2_btree_node_write(c, b);
+ __bch2_btree_node_write(c, b, false);
- /* wait for any in flight btree write */
- btree_node_wait_on_io(b);
+ six_unlock_write(&b->c.lock);
+ six_unlock_intent(&b->c.lock);
+ goto wait_on_io;
}
out:
if (b->hash_val && !ret)
@@ -286,7 +301,7 @@ static unsigned long bch2_btree_cache_scan(struct shrinker *shrink,
list_for_each_entry_safe(b, t, &bc->freeable, list) {
touched++;
- if (freed >= nr)
+ if (touched >= nr)
break;
if (++i > 3 &&
@@ -301,7 +316,7 @@ restart:
list_for_each_entry_safe(b, t, &bc->live, list) {
touched++;
- if (freed >= nr) {
+ if (touched >= nr) {
/* Save position */
if (&t->list != &bc->live)
list_move_tail(&bc->live, &t->list);
@@ -573,6 +588,7 @@ got_node:
}
BUG_ON(btree_node_hashed(b));
+ BUG_ON(btree_node_dirty(b));
BUG_ON(btree_node_write_in_flight(b));
out:
b->flags = 0;
@@ -617,7 +633,8 @@ err:
/* Slowpath, don't want it inlined into btree_iter_traverse() */
static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c,
- struct btree_iter *iter,
+ struct btree_trans *trans,
+ struct btree_path *path,
const struct bkey_i *k,
enum btree_id btree_id,
unsigned level,
@@ -626,14 +643,17 @@ static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c,
{
struct btree_cache *bc = &c->btree_cache;
struct btree *b;
+ u32 seq;
BUG_ON(level + 1 >= BTREE_MAX_DEPTH);
/*
* Parent node must be locked, else we could read in a btree node that's
* been freed:
*/
- if (iter && !bch2_btree_node_relock(iter, level + 1))
+ if (trans && !bch2_btree_node_relock(trans, path, level + 1)) {
+ btree_trans_restart(trans);
return ERR_PTR(-EINTR);
+ }
b = bch2_btree_node_mem_alloc(c);
if (IS_ERR(b))
@@ -655,25 +675,32 @@ static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c,
return NULL;
}
- /*
- * Unlock before doing IO:
- *
- * XXX: ideally should be dropping all btree node locks here
- */
- if (iter && btree_node_read_locked(iter, level + 1))
- btree_node_unlock(iter, level + 1);
-
- bch2_btree_node_read(c, b, sync);
+ set_btree_node_read_in_flight(b);
six_unlock_write(&b->c.lock);
+ seq = b->c.lock.state.seq;
+ six_unlock_intent(&b->c.lock);
- if (!sync) {
- six_unlock_intent(&b->c.lock);
+ /* Unlock before doing IO: */
+ if (trans && sync)
+ bch2_trans_unlock(trans);
+
+ bch2_btree_node_read(c, b, sync);
+
+ if (!sync)
return NULL;
+
+ if (trans &&
+ (!bch2_trans_relock(trans) ||
+ !bch2_btree_path_relock_intent(trans, path))) {
+ BUG_ON(!trans->restarted);
+ return ERR_PTR(-EINTR);
}
- if (lock_type == SIX_LOCK_read)
- six_lock_downgrade(&b->c.lock);
+ if (!six_relock_type(&b->c.lock, lock_type, seq)) {
+ btree_trans_restart(trans);
+ return ERR_PTR(-EINTR);
+ }
return b;
}
@@ -688,26 +715,25 @@ static int lock_node_check_fn(struct six_lock *lock, void *p)
static noinline void btree_bad_header(struct bch_fs *c, struct btree *b)
{
- char buf1[100], buf2[100], buf3[100], buf4[100];
+ char buf1[200], buf2[100], buf3[100];
if (!test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags))
return;
- bch2_bpos_to_text(&PBUF(buf1), b->key.k.type == KEY_TYPE_btree_ptr_v2
- ? bkey_i_to_btree_ptr_v2(&b->key)->v.min_key
- : POS_MIN);
+ bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(&b->key));
bch2_bpos_to_text(&PBUF(buf2), b->data->min_key);
+ bch2_bpos_to_text(&PBUF(buf3), b->data->max_key);
- bch2_bpos_to_text(&PBUF(buf3), b->key.k.p);
- bch2_bpos_to_text(&PBUF(buf4), b->data->max_key);
bch2_fs_inconsistent(c, "btree node header doesn't match ptr\n"
- "btree: ptr %u header %llu\n"
- "level: ptr %u header %llu\n"
- "min ptr %s node header %s\n"
- "max ptr %s node header %s",
- b->c.btree_id, BTREE_NODE_ID(b->data),
- b->c.level, BTREE_NODE_LEVEL(b->data),
- buf1, buf2, buf3, buf4);
+ "btree %s level %u\n"
+ "ptr: %s\n"
+ "header: btree %s level %llu\n"
+ "min %s max %s\n",
+ bch2_btree_ids[b->c.btree_id], b->c.level,
+ buf1,
+ bch2_btree_ids[BTREE_NODE_ID(b->data)],
+ BTREE_NODE_LEVEL(b->data),
+ buf2, buf3);
}
static inline void btree_check_header(struct bch_fs *c, struct btree *b)
@@ -730,20 +756,28 @@ static inline void btree_check_header(struct bch_fs *c, struct btree *b)
* The btree node will have either a read or a write lock held, depending on
* the @write parameter.
*/
-struct btree *bch2_btree_node_get(struct bch_fs *c, struct btree_iter *iter,
+struct btree *bch2_btree_node_get(struct btree_trans *trans, struct btree_path *path,
const struct bkey_i *k, unsigned level,
enum six_lock_type lock_type,
unsigned long trace_ip)
{
+ struct bch_fs *c = trans->c;
struct btree_cache *bc = &c->btree_cache;
struct btree *b;
struct bset_tree *t;
EBUG_ON(level >= BTREE_MAX_DEPTH);
- b = btree_node_mem_ptr(k);
- if (b)
- goto lock_node;
+ if (c->opts.btree_node_mem_ptr_optimization) {
+ b = btree_node_mem_ptr(k);
+ /*
+ * Check b->hash_val _before_ calling btree_node_lock() - this
+ * might not be the node we want anymore, and trying to lock the
+ * wrong node could cause an unneccessary transaction restart:
+ */
+ if (b && b->hash_val == btree_ptr_hash_val(k))
+ goto lock_node;
+ }
retry:
b = btree_cache_find(bc, k);
if (unlikely(!b)) {
@@ -752,7 +786,7 @@ retry:
* else we could read in a btree node from disk that's been
* freed:
*/
- b = bch2_btree_node_fill(c, iter, k, iter->btree_id,
+ b = bch2_btree_node_fill(c, trans, path, k, path->btree_id,
level, lock_type, true);
/* We raced and found the btree node in the cache */
@@ -791,12 +825,12 @@ lock_node:
* the parent was modified, when the pointer to the node we want
* was removed - and we'll bail out:
*/
- if (btree_node_read_locked(iter, level + 1))
- btree_node_unlock(iter, level + 1);
+ if (btree_node_read_locked(path, level + 1))
+ btree_node_unlock(path, level + 1);
- if (!btree_node_lock(b, k->k.p, level, iter, lock_type,
+ if (!btree_node_lock(trans, path, b, k->k.p, level, lock_type,
lock_node_check_fn, (void *) k, trace_ip)) {
- if (b->hash_val != btree_ptr_hash_val(k))
+ if (!trans->restarted)
goto retry;
return ERR_PTR(-EINTR);
}
@@ -805,20 +839,40 @@ lock_node:
b->c.level != level ||
race_fault())) {
six_unlock_type(&b->c.lock, lock_type);
- if (bch2_btree_node_relock(iter, level + 1))
+ if (bch2_btree_node_relock(trans, path, level + 1))
goto retry;
- trace_trans_restart_btree_node_reused(iter->trans->ip,
+ trace_trans_restart_btree_node_reused(trans->ip,
trace_ip,
- iter->btree_id,
- &iter->real_pos);
+ path->btree_id,
+ &path->pos);
+ btree_trans_restart(trans);
return ERR_PTR(-EINTR);
}
}
- /* XXX: waiting on IO with btree locks held: */
- wait_on_bit_io(&b->flags, BTREE_NODE_read_in_flight,
- TASK_UNINTERRUPTIBLE);
+ if (unlikely(btree_node_read_in_flight(b))) {
+ u32 seq = b->c.lock.state.seq;
+
+ six_unlock_type(&b->c.lock, lock_type);
+ bch2_trans_unlock(trans);
+
+ bch2_btree_node_wait_on_read(b);
+
+ /*
+ * should_be_locked is not set on this path yet, so we need to
+ * relock it specifically:
+ */
+ if (trans &&
+ (!bch2_trans_relock(trans) ||
+ !bch2_btree_path_relock_intent(trans, path))) {
+ BUG_ON(!trans->restarted);
+ return ERR_PTR(-EINTR);
+ }
+
+ if (!six_relock_type(&b->c.lock, lock_type, seq))
+ goto retry;
+ }
prefetch(b->aux_data);
@@ -839,7 +893,7 @@ lock_node:
return ERR_PTR(-EIO);
}
- EBUG_ON(b->c.btree_id != iter->btree_id);
+ EBUG_ON(b->c.btree_id != path->btree_id);
EBUG_ON(BTREE_NODE_LEVEL(b->data) != level);
btree_check_header(c, b);
@@ -859,16 +913,18 @@ struct btree *bch2_btree_node_get_noiter(struct bch_fs *c,
EBUG_ON(level >= BTREE_MAX_DEPTH);
- b = btree_node_mem_ptr(k);
- if (b)
- goto lock_node;
+ if (c->opts.btree_node_mem_ptr_optimization) {
+ b = btree_node_mem_ptr(k);
+ if (b)
+ goto lock_node;
+ }
retry:
b = btree_cache_find(bc, k);
if (unlikely(!b)) {
if (nofill)
goto out;
- b = bch2_btree_node_fill(c, NULL, k, btree_id,
+ b = bch2_btree_node_fill(c, NULL, NULL, k, btree_id,
level, SIX_LOCK_read, true);
/* We raced and found the btree node in the cache */
@@ -896,8 +952,7 @@ lock_node:
}
/* XXX: waiting on IO with btree locks held: */
- wait_on_bit_io(&b->flags, BTREE_NODE_read_in_flight,
- TASK_UNINTERRUPTIBLE);
+ __bch2_btree_node_wait_on_read(b);
prefetch(b->aux_data);
@@ -927,21 +982,25 @@ out:
return b;
}
-void bch2_btree_node_prefetch(struct bch_fs *c, struct btree_iter *iter,
- const struct bkey_i *k,
- enum btree_id btree_id, unsigned level)
+int bch2_btree_node_prefetch(struct bch_fs *c,
+ struct btree_trans *trans,
+ struct btree_path *path,
+ const struct bkey_i *k,
+ enum btree_id btree_id, unsigned level)
{
struct btree_cache *bc = &c->btree_cache;
struct btree *b;
- BUG_ON(iter && !btree_node_locked(iter, level + 1));
+ BUG_ON(trans && !btree_node_locked(path, level + 1));
BUG_ON(level >= BTREE_MAX_DEPTH);
b = btree_cache_find(bc, k);
if (b)
- return;
+ return 0;
- bch2_btree_node_fill(c, iter, k, btree_id, level, SIX_LOCK_read, false);
+ b = bch2_btree_node_fill(c, trans, path, k, btree_id,
+ level, SIX_LOCK_read, false);
+ return PTR_ERR_OR_ZERO(b);
}
void bch2_btree_node_evict(struct bch_fs *c, const struct bkey_i *k)
@@ -952,16 +1011,24 @@ void bch2_btree_node_evict(struct bch_fs *c, const struct bkey_i *k)
b = btree_cache_find(bc, k);
if (!b)
return;
+wait_on_io:
+ /* not allowed to wait on io with btree locks held: */
+
+ /* XXX we're called from btree_gc which will be holding other btree
+ * nodes locked
+ */
+ __bch2_btree_node_wait_on_read(b);
+ __bch2_btree_node_wait_on_write(b);
six_lock_intent(&b->c.lock, NULL, NULL);
six_lock_write(&b->c.lock, NULL, NULL);
- wait_on_bit_io(&b->flags, BTREE_NODE_read_in_flight,
- TASK_UNINTERRUPTIBLE);
- __bch2_btree_node_write(c, b);
-
- /* wait for any in flight btree write */
- btree_node_wait_on_io(b);
+ if (btree_node_dirty(b)) {
+ __bch2_btree_node_write(c, b, false);
+ six_unlock_write(&b->c.lock);
+ six_unlock_intent(&b->c.lock);
+ goto wait_on_io;
+ }
BUG_ON(btree_node_dirty(b));
diff --git a/fs/bcachefs/btree_cache.h b/fs/bcachefs/btree_cache.h
index 40dd263a7caa..402cec1802bc 100644
--- a/fs/bcachefs/btree_cache.h
+++ b/fs/bcachefs/btree_cache.h
@@ -5,6 +5,8 @@
#include "bcachefs.h"
#include "btree_types.h"
+extern struct lock_class_key bch2_btree_node_lock_key;
+
struct btree_iter;
void bch2_recalc_btree_reserve(struct bch_fs *);
@@ -20,15 +22,15 @@ int bch2_btree_cache_cannibalize_lock(struct bch_fs *, struct closure *);
struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *);
struct btree *bch2_btree_node_mem_alloc(struct bch_fs *);
-struct btree *bch2_btree_node_get(struct bch_fs *, struct btree_iter *,
+struct btree *bch2_btree_node_get(struct btree_trans *, struct btree_path *,
const struct bkey_i *, unsigned,
enum six_lock_type, unsigned long);
struct btree *bch2_btree_node_get_noiter(struct bch_fs *, const struct bkey_i *,
enum btree_id, unsigned, bool);
-void bch2_btree_node_prefetch(struct bch_fs *, struct btree_iter *,
- const struct bkey_i *, enum btree_id, unsigned);
+int bch2_btree_node_prefetch(struct bch_fs *, struct btree_trans *, struct btree_path *,
+ const struct bkey_i *, enum btree_id, unsigned);
void bch2_btree_node_evict(struct bch_fs *, const struct bkey_i *);
diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c
index ba560fbd5f36..091bddee575d 100644
--- a/fs/bcachefs/btree_gc.c
+++ b/fs/bcachefs/btree_gc.c
@@ -36,6 +36,9 @@
#include <linux/sched/task.h>
#include <trace/events/bcachefs.h>
+#define DROP_THIS_NODE 10
+#define DROP_PREV_NODE 11
+
static inline void __gc_pos_set(struct bch_fs *c, struct gc_pos new_pos)
{
preempt_disable();
@@ -83,12 +86,17 @@ static int bch2_gc_check_topology(struct bch_fs *c,
if (bpos_cmp(expected_start, bp->v.min_key)) {
bch2_topology_error(c);
- if (fsck_err(c, "btree node with incorrect min_key at btree %s level %u:\n"
- " prev %s\n"
- " cur %s",
- bch2_btree_ids[b->c.btree_id], b->c.level,
- buf1,
- (bch2_bkey_val_to_text(&PBUF(buf2), c, bkey_i_to_s_c(cur.k)), buf2))) {
+ if (__fsck_err(c,
+ FSCK_CAN_FIX|
+ FSCK_CAN_IGNORE|
+ FSCK_NO_RATELIMIT,
+ "btree node with incorrect min_key at btree %s level %u:\n"
+ " prev %s\n"
+ " cur %s",
+ bch2_btree_ids[b->c.btree_id], b->c.level,
+ buf1,
+ (bch2_bkey_val_to_text(&PBUF(buf2), c, bkey_i_to_s_c(cur.k)), buf2)) &&
+ !test_bit(BCH_FS_TOPOLOGY_REPAIR_DONE, &c->flags)) {
bch_info(c, "Halting mark and sweep to start topology repair pass");
return FSCK_ERR_START_TOPOLOGY_REPAIR;
} else {
@@ -100,12 +108,17 @@ static int bch2_gc_check_topology(struct bch_fs *c,
if (is_last && bpos_cmp(cur.k->k.p, node_end)) {
bch2_topology_error(c);
- if (fsck_err(c, "btree node with incorrect max_key at btree %s level %u:\n"
- " %s\n"
- " expected %s",
- bch2_btree_ids[b->c.btree_id], b->c.level,
- (bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(cur.k)), buf1),
- (bch2_bpos_to_text(&PBUF(buf2), node_end), buf2))) {
+ if (__fsck_err(c,
+ FSCK_CAN_FIX|
+ FSCK_CAN_IGNORE|
+ FSCK_NO_RATELIMIT,
+ "btree node with incorrect max_key at btree %s level %u:\n"
+ " %s\n"
+ " expected %s",
+ bch2_btree_ids[b->c.btree_id], b->c.level,
+ (bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(cur.k)), buf1),
+ (bch2_bpos_to_text(&PBUF(buf2), node_end), buf2)) &&
+ !test_bit(BCH_FS_TOPOLOGY_REPAIR_DONE, &c->flags)) {
bch_info(c, "Halting mark and sweep to start topology repair pass");
return FSCK_ERR_START_TOPOLOGY_REPAIR;
} else {
@@ -203,8 +216,8 @@ static int set_node_max(struct bch_fs *c, struct btree *b, struct bpos new_max)
return 0;
}
-static int btree_repair_node_start(struct bch_fs *c, struct btree *b,
- struct btree *prev, struct btree *cur)
+static int btree_repair_node_boundaries(struct bch_fs *c, struct btree *b,
+ struct btree *prev, struct btree *cur)
{
struct bpos expected_start = !prev
? b->data->min_key
@@ -220,22 +233,50 @@ static int btree_repair_node_start(struct bch_fs *c, struct btree *b,
bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(&prev->key));
}
- if (mustfix_fsck_err_on(bpos_cmp(expected_start, cur->data->min_key), c,
- "btree node with incorrect min_key at btree %s level %u:\n"
- " prev %s\n"
- " cur %s",
- bch2_btree_ids[b->c.btree_id], b->c.level,
- buf1,
- (bch2_bkey_val_to_text(&PBUF(buf2), c, bkey_i_to_s_c(&cur->key)), buf2))) {
- if (prev &&
- bpos_cmp(expected_start, cur->data->min_key) > 0 &&
- BTREE_NODE_SEQ(cur->data) > BTREE_NODE_SEQ(prev->data))
+ bch2_bkey_val_to_text(&PBUF(buf2), c, bkey_i_to_s_c(&cur->key));
+
+ if (prev &&
+ bpos_cmp(expected_start, cur->data->min_key) > 0 &&
+ BTREE_NODE_SEQ(cur->data) > BTREE_NODE_SEQ(prev->data)) {
+ /* cur overwrites prev: */
+
+ if (mustfix_fsck_err_on(bpos_cmp(prev->data->min_key,
+ cur->data->min_key) >= 0, c,
+ "btree node overwritten by next node at btree %s level %u:\n"
+ " node %s\n"
+ " next %s",
+ bch2_btree_ids[b->c.btree_id], b->c.level,
+ buf1, buf2))
+ return DROP_PREV_NODE;
+
+ if (mustfix_fsck_err_on(bpos_cmp(prev->key.k.p,
+ bpos_predecessor(cur->data->min_key)), c,
+ "btree node with incorrect max_key at btree %s level %u:\n"
+ " node %s\n"
+ " next %s",
+ bch2_btree_ids[b->c.btree_id], b->c.level,
+ buf1, buf2))
ret = set_node_max(c, prev,
- bpos_predecessor(cur->data->min_key));
- else
- ret = set_node_min(c, cur, expected_start);
- if (ret)
- return ret;
+ bpos_predecessor(cur->data->min_key));
+ } else {
+ /* prev overwrites cur: */
+
+ if (mustfix_fsck_err_on(bpos_cmp(expected_start,
+ cur->data->max_key) >= 0, c,
+ "btree node overwritten by prev node at btree %s level %u:\n"
+ " prev %s\n"
+ " node %s",
+ bch2_btree_ids[b->c.btree_id], b->c.level,
+ buf1, buf2))
+ return DROP_THIS_NODE;
+
+ if (mustfix_fsck_err_on(bpos_cmp(expected_start, cur->data->min_key), c,
+ "btree node with incorrect min_key at btree %s level %u:\n"
+ " prev %s\n"
+ " node %s",
+ bch2_btree_ids[b->c.btree_id], b->c.level,
+ buf1, buf2))
+ ret = set_node_min(c, cur, expected_start);
}
fsck_err:
return ret;
@@ -262,13 +303,11 @@ fsck_err:
return ret;
}
-#define DROP_THIS_NODE 10
-
static int bch2_btree_repair_topology_recurse(struct bch_fs *c, struct btree *b)
{
struct btree_and_journal_iter iter;
struct bkey_s_c k;
- struct bkey_buf tmp;
+ struct bkey_buf prev_k, cur_k;
struct btree *prev = NULL, *cur = NULL;
bool have_child, dropped_children = false;
char buf[200];
@@ -277,15 +316,20 @@ static int bch2_btree_repair_topology_recurse(struct bch_fs *c, struct btree *b)
if (!b->c.level)
return 0;
again:
+ prev = NULL;
have_child = dropped_children = false;
- bch2_bkey_buf_init(&tmp);
+ bch2_bkey_buf_init(&prev_k);
+ bch2_bkey_buf_init(&cur_k);
bch2_btree_and_journal_iter_init_node_iter(&iter, c, b);
while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) {
+ BUG_ON(bpos_cmp(k.k->p, b->data->min_key) < 0);
+ BUG_ON(bpos_cmp(k.k->p, b->data->max_key) > 0);
+
bch2_btree_and_journal_iter_advance(&iter);
- bch2_bkey_buf_reassemble(&tmp, c, k);
+ bch2_bkey_buf_reassemble(&cur_k, c, k);
- cur = bch2_btree_node_get_noiter(c, tmp.k,
+ cur = bch2_btree_node_get_noiter(c, cur_k.k,
b->c.btree_id, b->c.level - 1,
false);
ret = PTR_ERR_OR_ZERO(cur);
@@ -295,12 +339,12 @@ again:
" %s",
bch2_btree_ids[b->c.btree_id],
b->c.level - 1,
- (bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(tmp.k)), buf))) {
- bch2_btree_node_evict(c, tmp.k);
+ (bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(cur_k.k)), buf))) {
+ bch2_btree_node_evict(c, cur_k.k);
ret = bch2_journal_key_delete(c, b->c.btree_id,
- b->c.level, tmp.k->k.p);
+ b->c.level, cur_k.k->k.p);
if (ret)
- goto err;
+ break;
continue;
}
@@ -310,14 +354,39 @@ again:
break;
}
- ret = btree_repair_node_start(c, b, prev, cur);
+ ret = btree_repair_node_boundaries(c, b, prev, cur);
+
+ if (ret == DROP_THIS_NODE) {
+ six_unlock_read(&cur->c.lock);
+ bch2_btree_node_evict(c, cur_k.k);
+ ret = bch2_journal_key_delete(c, b->c.btree_id,
+ b->c.level, cur_k.k->k.p);
+ if (ret)
+ break;
+ continue;
+ }
+
if (prev)
six_unlock_read(&prev->c.lock);
- prev = cur;
- cur = NULL;
+ prev = NULL;
- if (ret)
+ if (ret == DROP_PREV_NODE) {
+ bch2_btree_node_evict(c, prev_k.k);
+ ret = bch2_journal_key_delete(c, b->c.btree_id,
+ b->c.level, prev_k.k->k.p);
+ if (ret)
+ break;
+
+ bch2_btree_and_journal_iter_exit(&iter);
+ bch2_bkey_buf_exit(&prev_k, c);
+ bch2_bkey_buf_exit(&cur_k, c);
+ goto again;
+ } else if (ret)
break;
+
+ prev = cur;
+ cur = NULL;
+ bch2_bkey_buf_copy(&prev_k, c, cur_k.k);
}
if (!ret && !IS_ERR_OR_NULL(prev)) {
@@ -339,10 +408,10 @@ again:
bch2_btree_and_journal_iter_init_node_iter(&iter, c, b);
while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) {
- bch2_bkey_buf_reassemble(&tmp, c, k);
+ bch2_bkey_buf_reassemble(&cur_k, c, k);
bch2_btree_and_journal_iter_advance(&iter);
- cur = bch2_btree_node_get_noiter(c, tmp.k,
+ cur = bch2_btree_node_get_noiter(c, cur_k.k,
b->c.btree_id, b->c.level - 1,
false);
ret = PTR_ERR_OR_ZERO(cur);
@@ -358,9 +427,9 @@ again:
cur = NULL;
if (ret == DROP_THIS_NODE) {
- bch2_btree_node_evict(c, tmp.k);
+ bch2_btree_node_evict(c, cur_k.k);
ret = bch2_journal_key_delete(c, b->c.btree_id,
- b->c.level, tmp.k->k.p);
+ b->c.level, cur_k.k->k.p);
dropped_children = true;
}
@@ -385,7 +454,8 @@ fsck_err:
six_unlock_read(&cur->c.lock);
bch2_btree_and_journal_iter_exit(&iter);
- bch2_bkey_buf_exit(&tmp, c);
+ bch2_bkey_buf_exit(&prev_k, c);
+ bch2_bkey_buf_exit(&cur_k, c);
if (!ret && dropped_children)
goto again;
@@ -428,28 +498,16 @@ static int bch2_check_fix_ptrs(struct bch_fs *c, enum btree_id btree_id,
char buf[200];
int ret = 0;
+ /*
+ * XXX
+ * use check_bucket_ref here
+ */
bkey_for_each_ptr_decode(k->k, ptrs, p, entry) {
struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev);
struct bucket *g = PTR_BUCKET(ca, &p.ptr, true);
struct bucket *g2 = PTR_BUCKET(ca, &p.ptr, false);
enum bch_data_type data_type = bch2_bkey_ptr_data_type(*k, &entry->ptr);
- if (fsck_err_on(g->mark.data_type &&
- g->mark.data_type != data_type, c,
- "bucket %u:%zu different types of data in same bucket: %s, %s\n"
- "while marking %s",
- p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
- bch2_data_types[g->mark.data_type],
- bch2_data_types[data_type],
- (bch2_bkey_val_to_text(&PBUF(buf), c, *k), buf))) {
- if (data_type == BCH_DATA_btree) {
- g2->_mark.data_type = g->_mark.data_type = data_type;
- set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags);
- } else {
- do_update = true;
- }
- }
-
if (fsck_err_on(!g->gen_valid, c,
"bucket %u:%zu data type %s ptr gen %u missing in alloc btree\n"
"while marking %s",
@@ -466,6 +524,19 @@ static int bch2_check_fix_ptrs(struct bch_fs *c, enum btree_id btree_id,
}
}
+ if (fsck_err_on(data_type == BCH_DATA_btree &&
+ g->mark.gen != p.ptr.gen, c,
+ "bucket %u:%zu data type %s has metadata but wrong gen: %u != %u\n"
+ "while marking %s",
+ p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
+ bch2_data_types[ptr_data_type(k->k, &p.ptr)],
+ p.ptr.gen, g->mark.gen,
+ (bch2_bkey_val_to_text(&PBUF(buf), c, *k), buf))) {
+ g2->_mark.data_type = g->_mark.data_type = data_type;
+ g2->gen_valid = g->gen_valid = true;
+ set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags);
+ }
+
if (fsck_err_on(gen_cmp(p.ptr.gen, g->mark.gen) > 0, c,
"bucket %u:%zu data type %s ptr gen in the future: %u > %u\n"
"while marking %s",
@@ -486,6 +557,15 @@ static int bch2_check_fix_ptrs(struct bch_fs *c, enum btree_id btree_id,
}
}
+ if (fsck_err_on(gen_cmp(g->mark.gen, p.ptr.gen) > BUCKET_GC_GEN_MAX, c,
+ "bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n"
+ "while marking %s",
+ p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), g->mark.gen,
+ bch2_data_types[ptr_data_type(k->k, &p.ptr)],
+ p.ptr.gen,
+ (bch2_bkey_val_to_text(&PBUF(buf), c, *k), buf)))
+ do_update = true;
+
if (fsck_err_on(!p.ptr.cached &&
gen_cmp(p.ptr.gen, g->mark.gen) < 0, c,
"bucket %u:%zu data type %s stale dirty ptr: %u < %u\n"
@@ -496,6 +576,26 @@ static int bch2_check_fix_ptrs(struct bch_fs *c, enum btree_id btree_id,
(bch2_bkey_val_to_text(&PBUF(buf), c, *k), buf)))
do_update = true;
+ if (p.ptr.gen != g->mark.gen)
+ continue;
+
+ if (fsck_err_on(g->mark.data_type &&
+ g->mark.data_type != data_type, c,
+ "bucket %u:%zu different types of data in same bucket: %s, %s\n"
+ "while marking %s",
+ p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
+ bch2_data_types[g->mark.data_type],
+ bch2_data_types[data_type],
+ (bch2_bkey_val_to_text(&PBUF(buf), c, *k), buf))) {
+ if (data_type == BCH_DATA_btree) {
+ g2->_mark.data_type = g->_mark.data_type = data_type;
+ g2->gen_valid = g->gen_valid = true;
+ set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags);
+ } else {
+ do_update = true;
+ }
+ }
+
if (p.has_ec) {
struct stripe *m = genradix_ptr(&c->stripes[true], p.ec.idx);
@@ -557,6 +657,7 @@ static int bch2_check_fix_ptrs(struct bch_fs *c, enum btree_id btree_id,
(!g->gen_valid || gen_cmp(ptr->gen, g->mark.gen) > 0)) ||
(!ptr->cached &&
gen_cmp(ptr->gen, g->mark.gen) < 0) ||
+ gen_cmp(g->mark.gen, ptr->gen) > BUCKET_GC_GEN_MAX ||
(g->mark.data_type &&
g->mark.data_type != data_type);
}));
@@ -601,16 +702,18 @@ fsck_err:
/* marking of btree keys/nodes: */
-static int bch2_gc_mark_key(struct bch_fs *c, enum btree_id btree_id,
+static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id,
unsigned level, bool is_root,
struct bkey_s_c *k,
u8 *max_stale, bool initial)
{
+ struct bch_fs *c = trans->c;
struct bkey_ptrs_c ptrs;
const struct bch_extent_ptr *ptr;
unsigned flags =
BTREE_TRIGGER_GC|
(initial ? BTREE_TRIGGER_NOATOMIC : 0);
+ char buf[200];
int ret = 0;
if (initial) {
@@ -629,8 +732,9 @@ static int bch2_gc_mark_key(struct bch_fs *c, enum btree_id btree_id,
if (test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) ||
fsck_err_on(!bch2_bkey_replicas_marked(c, *k), c,
- "superblock not marked as containing replicas (type %u)",
- k->k->type)) {
+ "superblock not marked as containing replicas\n"
+ " while marking %s",
+ (bch2_bkey_val_to_text(&PBUF(buf), c, *k), buf))) {
ret = bch2_mark_bkey_replicas(c, *k);
if (ret) {
bch_err(c, "error marking bkey replicas: %i", ret);
@@ -650,7 +754,7 @@ static int bch2_gc_mark_key(struct bch_fs *c, enum btree_id btree_id,
*max_stale = max(*max_stale, ptr_stale(ca, ptr));
}
- bch2_mark_key(c, *k, 0, k->k->size, NULL, 0, flags);
+ ret = bch2_mark_key(trans, *k, flags);
fsck_err:
err:
if (ret)
@@ -658,9 +762,10 @@ err:
return ret;
}
-static int btree_gc_mark_node(struct bch_fs *c, struct btree *b, u8 *max_stale,
+static int btree_gc_mark_node(struct btree_trans *trans, struct btree *b, u8 *max_stale,
bool initial)
{
+ struct bch_fs *c = trans->c;
struct btree_node_iter iter;
struct bkey unpacked;
struct bkey_s_c k;
@@ -678,7 +783,7 @@ static int btree_gc_mark_node(struct bch_fs *c, struct btree *b, u8 *max_stale,
bkey_init(&prev.k->k);
while ((k = bch2_btree_node_iter_peek_unpack(&iter, b, &unpacked)).k) {
- ret = bch2_gc_mark_key(c, b->c.btree_id, b->c.level, false,
+ ret = bch2_gc_mark_key(trans, b->c.btree_id, b->c.level, false,
&k, max_stale, initial);
if (ret)
break;
@@ -700,11 +805,11 @@ static int btree_gc_mark_node(struct bch_fs *c, struct btree *b, u8 *max_stale,
return ret;
}
-static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id,
+static int bch2_gc_btree(struct btree_trans *trans, enum btree_id btree_id,
bool initial, bool metadata_only)
{
- struct btree_trans trans;
- struct btree_iter *iter;
+ struct bch_fs *c = trans->c;
+ struct btree_iter iter;
struct btree *b;
unsigned depth = metadata_only ? 1
: bch2_expensive_debug_checks ? 0
@@ -713,39 +818,32 @@ static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id,
u8 max_stale = 0;
int ret = 0;
- bch2_trans_init(&trans, c, 0, 0);
-
gc_pos_set(c, gc_pos_btree(btree_id, POS_MIN, 0));
- __for_each_btree_node(&trans, iter, btree_id, POS_MIN,
- 0, depth, BTREE_ITER_PREFETCH, b) {
+ __for_each_btree_node(trans, iter, btree_id, POS_MIN,
+ 0, depth, BTREE_ITER_PREFETCH, b, ret) {
bch2_verify_btree_nr_keys(b);
gc_pos_set(c, gc_pos_btree_node(b));
- ret = btree_gc_mark_node(c, b, &max_stale, initial);
+ ret = btree_gc_mark_node(trans, b, &max_stale, initial);
if (ret)
break;
if (!initial) {
if (max_stale > 64)
- bch2_btree_node_rewrite(c, iter,
- b->data->keys.seq,
+ bch2_btree_node_rewrite(trans, &iter, b,
BTREE_INSERT_NOWAIT|
BTREE_INSERT_GC_LOCK_HELD);
else if (!bch2_btree_gc_rewrite_disabled &&
(bch2_btree_gc_always_rewrite || max_stale > 16))
- bch2_btree_node_rewrite(c, iter,
- b->data->keys.seq,
- BTREE_INSERT_NOWAIT|
+ bch2_btree_node_rewrite(trans, &iter,
+ b, BTREE_INSERT_NOWAIT|
BTREE_INSERT_GC_LOCK_HELD);
}
-
- bch2_trans_cond_resched(&trans);
}
- bch2_trans_iter_put(&trans, iter);
+ bch2_trans_iter_exit(trans, &iter);
- ret = bch2_trans_exit(&trans) ?: ret;
if (ret)
return ret;
@@ -754,7 +852,7 @@ static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id,
if (!btree_node_fake(b)) {
struct bkey_s_c k = bkey_i_to_s_c(&b->key);
- ret = bch2_gc_mark_key(c, b->c.btree_id, b->c.level, true,
+ ret = bch2_gc_mark_key(trans, b->c.btree_id, b->c.level, true,
&k, &max_stale, initial);
}
gc_pos_set(c, gc_pos_btree_root(b->c.btree_id));
@@ -763,9 +861,10 @@ static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id,
return ret;
}
-static int bch2_gc_btree_init_recurse(struct bch_fs *c, struct btree *b,
+static int bch2_gc_btree_init_recurse(struct btree_trans *trans, struct btree *b,
unsigned target_depth)
{
+ struct bch_fs *c = trans->c;
struct btree_and_journal_iter iter;
struct bkey_s_c k;
struct bkey_buf cur, prev;
@@ -782,7 +881,7 @@ static int bch2_gc_btree_init_recurse(struct bch_fs *c, struct btree *b,
BUG_ON(bpos_cmp(k.k->p, b->data->min_key) < 0);
BUG_ON(bpos_cmp(k.k->p, b->data->max_key) > 0);
- ret = bch2_gc_mark_key(c, b->c.btree_id, b->c.level, false,
+ ret = bch2_gc_mark_key(trans, b->c.btree_id, b->c.level, false,
&k, &max_stale, true);
if (ret) {
bch_err(c, "%s: error %i from bch2_gc_mark_key", __func__, ret);
@@ -823,11 +922,16 @@ static int bch2_gc_btree_init_recurse(struct bch_fs *c, struct btree *b,
if (ret == -EIO) {
bch2_topology_error(c);
- if (fsck_err(c, "Unreadable btree node at btree %s level %u:\n"
- " %s",
- bch2_btree_ids[b->c.btree_id],
- b->c.level - 1,
- (bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(cur.k)), buf))) {
+ if (__fsck_err(c,
+ FSCK_CAN_FIX|
+ FSCK_CAN_IGNORE|
+ FSCK_NO_RATELIMIT,
+ "Unreadable btree node at btree %s level %u:\n"
+ " %s",
+ bch2_btree_ids[b->c.btree_id],
+ b->c.level - 1,
+ (bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(cur.k)), buf)) &&
+ !test_bit(BCH_FS_TOPOLOGY_REPAIR_DONE, &c->flags)) {
ret = FSCK_ERR_START_TOPOLOGY_REPAIR;
bch_info(c, "Halting mark and sweep to start topology repair pass");
goto fsck_err;
@@ -844,7 +948,7 @@ static int bch2_gc_btree_init_recurse(struct bch_fs *c, struct btree *b,
break;
}
- ret = bch2_gc_btree_init_recurse(c, child,
+ ret = bch2_gc_btree_init_recurse(trans, child,
target_depth);
six_unlock_read(&child->c.lock);
@@ -859,10 +963,11 @@ fsck_err:
return ret;
}
-static int bch2_gc_btree_init(struct bch_fs *c,
+static int bch2_gc_btree_init(struct btree_trans *trans,
enum btree_id btree_id,
bool metadata_only)
{
+ struct bch_fs *c = trans->c;
struct btree *b;
unsigned target_depth = metadata_only ? 1
: bch2_expensive_debug_checks ? 0
@@ -886,7 +991,7 @@ static int bch2_gc_btree_init(struct bch_fs *c,
goto fsck_err;
}
- if (mustfix_fsck_err_on(bpos_cmp(b->data->max_key, POS_MAX), c,
+ if (mustfix_fsck_err_on(bpos_cmp(b->data->max_key, SPOS_MAX), c,
"btree root with incorrect max_key: %s",
(bch2_bpos_to_text(&PBUF(buf), b->data->max_key), buf))) {
bch_err(c, "repair unimplemented");
@@ -895,12 +1000,12 @@ static int bch2_gc_btree_init(struct bch_fs *c,
}
if (b->c.level >= target_depth)
- ret = bch2_gc_btree_init_recurse(c, b, target_depth);
+ ret = bch2_gc_btree_init_recurse(trans, b, target_depth);
if (!ret) {
struct bkey_s_c k = bkey_i_to_s_c(&b->key);
- ret = bch2_gc_mark_key(c, b->c.btree_id, b->c.level, true,
+ ret = bch2_gc_mark_key(trans, b->c.btree_id, b->c.level, true,
&k, &max_stale, true);
}
fsck_err:
@@ -919,21 +1024,26 @@ static inline int btree_id_gc_phase_cmp(enum btree_id l, enum btree_id r)
static int bch2_gc_btrees(struct bch_fs *c, bool initial, bool metadata_only)
{
+ struct btree_trans trans;
enum btree_id ids[BTREE_ID_NR];
unsigned i;
int ret = 0;
+ bch2_trans_init(&trans, c, 0, 0);
+
for (i = 0; i < BTREE_ID_NR; i++)
ids[i] = i;
bubble_sort(ids, BTREE_ID_NR, btree_id_gc_phase_cmp);
for (i = 0; i < BTREE_ID_NR && !ret; i++)
ret = initial
- ? bch2_gc_btree_init(c, ids[i], metadata_only)
- : bch2_gc_btree(c, ids[i], initial, metadata_only);
+ ? bch2_gc_btree_init(&trans, ids[i], metadata_only)
+ : bch2_gc_btree(&trans, ids[i], initial, metadata_only);
if (ret < 0)
bch_err(c, "%s: ret %i", __func__, ret);
+
+ bch2_trans_exit(&trans);
return ret;
}
@@ -1020,9 +1130,7 @@ static void bch2_mark_pending_btree_node_frees(struct bch_fs *c)
for_each_pending_btree_node_free(c, as, d)
if (d->index_update_done)
- bch2_mark_key(c, bkey_i_to_s_c(&d->key),
- 0, 0, NULL, 0,
- BTREE_TRIGGER_GC);
+ bch2_mark_key(c, bkey_i_to_s_c(&d->key), BTREE_TRIGGER_GC);
mutex_unlock(&c->btree_interior_update_lock);
}
@@ -1283,8 +1391,10 @@ static int bch2_gc_start(struct bch_fs *c,
return 0;
}
-static int bch2_gc_reflink_done_initial_fn(struct bch_fs *c, struct bkey_s_c k)
+static int bch2_gc_reflink_done_initial_fn(struct btree_trans *trans,
+ struct bkey_s_c k)
{
+ struct bch_fs *c = trans->c;
struct reflink_gc *r;
const __le64 *refcount = bkey_refcount_c(k);
char buf[200];
@@ -1339,7 +1449,7 @@ static int bch2_gc_reflink_done(struct bch_fs *c, bool initial,
bool metadata_only)
{
struct btree_trans trans;
- struct btree_iter *iter;
+ struct btree_iter iter;
struct bkey_s_c k;
struct reflink_gc *r;
size_t idx = 0;
@@ -1349,16 +1459,16 @@ static int bch2_gc_reflink_done(struct bch_fs *c, bool initial,
if (metadata_only)
return 0;
+ bch2_trans_init(&trans, c, 0, 0);
+
if (initial) {
c->reflink_gc_idx = 0;
- ret = bch2_btree_and_journal_walk(c, BTREE_ID_reflink,
+ ret = bch2_btree_and_journal_walk(&trans, BTREE_ID_reflink,
bch2_gc_reflink_done_initial_fn);
goto out;
}
- bch2_trans_init(&trans, c, 0, 0);
-
for_each_btree_key(&trans, iter, BTREE_ID_reflink, POS_MIN,
BTREE_ITER_PREFETCH, k, ret) {
const __le64 *refcount = bkey_refcount_c(k);
@@ -1405,17 +1515,19 @@ static int bch2_gc_reflink_done(struct bch_fs *c, bool initial,
}
}
fsck_err:
- bch2_trans_iter_put(&trans, iter);
- bch2_trans_exit(&trans);
+ bch2_trans_iter_exit(&trans, &iter);
out:
genradix_free(&c->reflink_gc_table);
c->reflink_gc_nr = 0;
+ bch2_trans_exit(&trans);
return ret;
}
-static int bch2_gc_reflink_start_initial_fn(struct bch_fs *c, struct bkey_s_c k)
+static int bch2_gc_reflink_start_initial_fn(struct btree_trans *trans,
+ struct bkey_s_c k)
{
+ struct bch_fs *c = trans->c;
struct reflink_gc *r;
const __le64 *refcount = bkey_refcount_c(k);
@@ -1437,22 +1549,23 @@ static int bch2_gc_reflink_start(struct bch_fs *c, bool initial,
bool metadata_only)
{
struct btree_trans trans;
- struct btree_iter *iter;
+ struct btree_iter iter;
struct bkey_s_c k;
struct reflink_gc *r;
- int ret;
+ int ret = 0;
if (metadata_only)
return 0;
+ bch2_trans_init(&trans, c, 0, 0);
genradix_free(&c->reflink_gc_table);
c->reflink_gc_nr = 0;
- if (initial)
- return bch2_btree_and_journal_walk(c, BTREE_ID_reflink,
- bch2_gc_reflink_start_initial_fn);
-
- bch2_trans_init(&trans, c, 0, 0);
+ if (initial) {
+ ret = bch2_btree_and_journal_walk(&trans, BTREE_ID_reflink,
+ bch2_gc_reflink_start_initial_fn);
+ goto out;
+ }
for_each_btree_key(&trans, iter, BTREE_ID_reflink, POS_MIN,
BTREE_ITER_PREFETCH, k, ret) {
@@ -1472,10 +1585,10 @@ static int bch2_gc_reflink_start(struct bch_fs *c, bool initial,
r->size = k.k->size;
r->refcount = 0;
}
- bch2_trans_iter_put(&trans, iter);
-
+ bch2_trans_iter_exit(&trans, &iter);
+out:
bch2_trans_exit(&trans);
- return 0;
+ return ret;
}
/**
@@ -1519,7 +1632,7 @@ again:
bch2_mark_superblocks(c);
- if (test_bit(BCH_FS_TOPOLOGY_ERROR, &c->flags) &&
+ if (BCH_SB_HAS_TOPOLOGY_ERRORS(c->disk_sb.sb) &&
!test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags) &&
c->opts.fix_errors != FSCK_OPT_NO) {
bch_info(c, "starting topology repair pass");
@@ -1527,11 +1640,14 @@ again:
if (ret)
goto out;
bch_info(c, "topology repair pass done");
+
+ set_bit(BCH_FS_TOPOLOGY_REPAIR_DONE, &c->flags);
}
ret = bch2_gc_btrees(c, initial, metadata_only);
if (ret == FSCK_ERR_START_TOPOLOGY_REPAIR &&
+ !test_bit(BCH_FS_TOPOLOGY_REPAIR_DONE, &c->flags) &&
!test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags)) {
set_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags);
ret = 0;
@@ -1644,7 +1760,7 @@ static bool gc_btree_gens_key(struct bch_fs *c, struct bkey_s_c k)
static int bch2_gc_btree_gens(struct bch_fs *c, enum btree_id btree_id)
{
struct btree_trans trans;
- struct btree_iter *iter;
+ struct btree_iter iter;
struct bkey_s_c k;
struct bkey_buf sk;
int ret = 0, commit_err = 0;
@@ -1652,22 +1768,28 @@ static int bch2_gc_btree_gens(struct bch_fs *c, enum btree_id btree_id)
bch2_bkey_buf_init(&sk);
bch2_trans_init(&trans, c, 0, 0);
- iter = bch2_trans_get_iter(&trans, btree_id, POS_MIN,
- BTREE_ITER_PREFETCH|
- BTREE_ITER_NOT_EXTENTS|
- BTREE_ITER_ALL_SNAPSHOTS);
+ bch2_trans_iter_init(&trans, &iter, btree_id, POS_MIN,
+ BTREE_ITER_PREFETCH|
+ BTREE_ITER_NOT_EXTENTS|
+ BTREE_ITER_ALL_SNAPSHOTS);
- while ((k = bch2_btree_iter_peek(iter)).k &&
- !(ret = bkey_err(k))) {
- c->gc_gens_pos = iter->pos;
+ while ((bch2_trans_begin(&trans),
+ k = bch2_btree_iter_peek(&iter)).k) {
+ ret = bkey_err(k);
+
+ if (ret == -EINTR)
+ continue;
+ if (ret)
+ break;
+
+ c->gc_gens_pos = iter.pos;
if (gc_btree_gens_key(c, k) && !commit_err) {
bch2_bkey_buf_reassemble(&sk, c, k);
bch2_extent_normalize(c, bkey_i_to_s(sk.k));
-
commit_err =
- bch2_trans_update(&trans, iter, sk.k, 0) ?:
+ bch2_trans_update(&trans, &iter, sk.k, 0) ?:
bch2_trans_commit(&trans, NULL, NULL,
BTREE_INSERT_NOWAIT|
BTREE_INSERT_NOFAIL);
@@ -1677,9 +1799,9 @@ static int bch2_gc_btree_gens(struct bch_fs *c, enum btree_id btree_id)
}
}
- bch2_btree_iter_advance(iter);
+ bch2_btree_iter_advance(&iter);
}
- bch2_trans_iter_put(&trans, iter);
+ bch2_trans_iter_exit(&trans, &iter);
bch2_trans_exit(&trans);
bch2_bkey_buf_exit(&sk, c);
diff --git a/fs/bcachefs/btree_gc.h b/fs/bcachefs/btree_gc.h
index e9a87394370a..59dfb069e699 100644
--- a/fs/bcachefs/btree_gc.h
+++ b/fs/bcachefs/btree_gc.h
@@ -87,7 +87,7 @@ static inline struct gc_pos gc_pos_btree_node(struct btree *b)
*/
static inline struct gc_pos gc_pos_btree_root(enum btree_id id)
{
- return gc_pos_btree(id, POS_MAX, BTREE_MAX_DEPTH);
+ return gc_pos_btree(id, SPOS_MAX, BTREE_MAX_DEPTH);
}
static inline bool gc_visited(struct bch_fs *c, struct gc_pos pos)
diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c
index 47cfd8a08f91..f11fcab61902 100644
--- a/fs/bcachefs/btree_io.c
+++ b/fs/bcachefs/btree_io.c
@@ -22,6 +22,51 @@
#include <linux/sched/mm.h>
#include <trace/events/bcachefs.h>
+void bch2_btree_node_io_unlock(struct btree *b)
+{
+ EBUG_ON(!btree_node_write_in_flight(b));
+
+ clear_btree_node_write_in_flight_inner(b);
+ clear_btree_node_write_in_flight(b);
+ wake_up_bit(&b->flags, BTREE_NODE_write_in_flight);
+}
+
+void bch2_btree_node_io_lock(struct btree *b)
+{
+ BUG_ON(lock_class_is_held(&bch2_btree_node_lock_key));
+
+ wait_on_bit_lock_io(&b->flags, BTREE_NODE_write_in_flight,
+ TASK_UNINTERRUPTIBLE);
+}
+
+void __bch2_btree_node_wait_on_read(struct btree *b)
+{
+ wait_on_bit_io(&b->flags, BTREE_NODE_read_in_flight,
+ TASK_UNINTERRUPTIBLE);
+}
+
+void __bch2_btree_node_wait_on_write(struct btree *b)
+{
+ wait_on_bit_io(&b->flags, BTREE_NODE_write_in_flight,
+ TASK_UNINTERRUPTIBLE);
+}
+
+void bch2_btree_node_wait_on_read(struct btree *b)
+{
+ BUG_ON(lock_class_is_held(&bch2_btree_node_lock_key));
+
+ wait_on_bit_io(&b->flags, BTREE_NODE_read_in_flight,
+ TASK_UNINTERRUPTIBLE);
+}
+
+void bch2_btree_node_wait_on_write(struct btree *b)
+{
+ BUG_ON(lock_class_is_held(&bch2_btree_node_lock_key));
+
+ wait_on_bit_io(&b->flags, BTREE_NODE_write_in_flight,
+ TASK_UNINTERRUPTIBLE);
+}
+
static void verify_no_dups(struct btree *b,
struct bkey_packed *start,
struct bkey_packed *end)
@@ -420,17 +465,17 @@ void bch2_btree_build_aux_trees(struct btree *b)
*
* Returns true if we sorted (i.e. invalidated iterators
*/
-void bch2_btree_init_next(struct bch_fs *c, struct btree *b,
- struct btree_iter *iter)
+void bch2_btree_init_next(struct btree_trans *trans, struct btree *b)
{
+ struct bch_fs *c = trans->c;
struct btree_node_entry *bne;
bool reinit_iter = false;
EBUG_ON(!(b->c.lock.state.seq & 1));
- EBUG_ON(iter && iter->l[b->c.level].b != b);
BUG_ON(bset_written(b, bset(b, &b->set[1])));
- if (b->nsets == MAX_BSETS) {
+ if (b->nsets == MAX_BSETS &&
+ !btree_node_write_in_flight(b)) {
unsigned log_u64s[] = {
ilog2(bset_u64s(&b->set[0])),
ilog2(bset_u64s(&b->set[1])),
@@ -455,8 +500,8 @@ void bch2_btree_init_next(struct bch_fs *c, struct btree *b,
bch2_btree_build_aux_trees(b);
- if (iter && reinit_iter)
- bch2_btree_iter_reinit_node(iter, b);
+ if (reinit_iter)
+ bch2_trans_node_reinit_iter(trans, b);
}
static void btree_pos_to_text(struct printbuf *out, struct bch_fs *c,
@@ -565,13 +610,16 @@ out: \
void bch2_btree_node_drop_keys_outside_node(struct btree *b)
{
struct bset_tree *t;
+ struct bkey_s_c k;
+ struct bkey unpacked;
+ struct btree_node_iter iter;
for_each_bset(b, t) {
struct bset *i = bset(b, t);
struct bkey_packed *k;
for (k = i->start; k != vstruct_last(i); k = bkey_next(k))
- if (bkey_cmp_left_packed(b, k, &b->data->min_key) < 0)
+ if (bkey_cmp_left_packed(b, k, &b->data->min_key) >= 0)
break;
if (k != i->start) {
@@ -596,11 +644,17 @@ void bch2_btree_node_drop_keys_outside_node(struct btree *b)
}
bch2_btree_build_aux_trees(b);
+
+ for_each_btree_node_key_unpack(b, k, &iter, &unpacked) {
+ BUG_ON(bpos_cmp(k.k->p, b->data->min_key) < 0);
+ BUG_ON(bpos_cmp(k.k->p, b->data->max_key) > 0);
+ }
}
static int validate_bset(struct bch_fs *c, struct bch_dev *ca,
struct btree *b, struct bset *i,
- unsigned sectors, int write, bool have_retry)
+ unsigned offset, unsigned sectors,
+ int write, bool have_retry)
{
unsigned version = le16_to_cpu(i->version);
const char *err;
@@ -638,18 +692,23 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca,
BTREE_ERR_FATAL, c, ca, b, i,
"BSET_SEPARATE_WHITEOUTS no longer supported");
- if (btree_err_on(b->written + sectors > c->opts.btree_node_size,
+ if (btree_err_on(offset + sectors > c->opts.btree_node_size,
BTREE_ERR_FIXABLE, c, ca, b, i,
"bset past end of btree node")) {
i->u64s = 0;
return 0;
}
- btree_err_on(b->written && !i->u64s,
+ btree_err_on(offset && !i->u64s,
BTREE_ERR_FIXABLE, c, ca, b, i,
"empty bset");
- if (!b->written) {
+ btree_err_on(BSET_OFFSET(i) &&
+ BSET_OFFSET(i) != offset,
+ BTREE_ERR_WANT_RETRY, c, ca, b, i,
+ "bset at wrong sector offset");
+
+ if (!offset) {
struct btree_node *bn =
container_of(i, struct btree_node, keys);
/* These indicate that we read the wrong btree node: */
@@ -815,7 +874,8 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
bool updated_range = b->key.k.type == KEY_TYPE_btree_ptr_v2 &&
BTREE_PTR_RANGE_UPDATED(&bkey_i_to_btree_ptr_v2(&b->key)->v);
unsigned u64s;
- unsigned nonblacklisted_written = 0;
+ unsigned blacklisted_written, nonblacklisted_written = 0;
+ unsigned ptr_written = btree_ptr_sectors_written(&b->key);
int ret, retry_read = 0, write = READ;
b->version_ondisk = U16_MAX;
@@ -846,7 +906,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
b->data->keys.seq, bp->seq);
}
- while (b->written < c->opts.btree_node_size) {
+ while (b->written < (ptr_written ?: c->opts.btree_node_size)) {
unsigned sectors, whiteout_u64s = 0;
struct nonce nonce;
struct bch_csum csum;
@@ -902,7 +962,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
b->version_ondisk = min(b->version_ondisk,
le16_to_cpu(i->version));
- ret = validate_bset(c, ca, b, i, sectors,
+ ret = validate_bset(c, ca, b, i, b->written, sectors,
READ, have_retry);
if (ret)
goto fsck_err;
@@ -926,6 +986,10 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
btree_err_on(blacklisted && first,
BTREE_ERR_FIXABLE, c, ca, b, i,
"first btree node bset has blacklisted journal seq");
+
+ btree_err_on(blacklisted && ptr_written,
+ BTREE_ERR_FIXABLE, c, ca, b, i,
+ "found blacklisted bset in btree node with sectors_written");
if (blacklisted && !first)
continue;
@@ -939,26 +1003,34 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
nonblacklisted_written = b->written;
}
- for (bne = write_block(b);
- bset_byte_offset(b, bne) < btree_bytes(c);
- bne = (void *) bne + block_bytes(c))
- btree_err_on(bne->keys.seq == b->data->keys.seq &&
- !bch2_journal_seq_is_blacklisted(c,
- le64_to_cpu(bne->keys.journal_seq),
- true),
+ if (ptr_written) {
+ btree_err_on(b->written < ptr_written,
BTREE_ERR_WANT_RETRY, c, ca, b, NULL,
- "found bset signature after last bset");
+ "btree node data missing: expected %u sectors, found %u",
+ ptr_written, b->written);
+ } else {
+ for (bne = write_block(b);
+ bset_byte_offset(b, bne) < btree_bytes(c);
+ bne = (void *) bne + block_bytes(c))
+ btree_err_on(bne->keys.seq == b->data->keys.seq &&
+ !bch2_journal_seq_is_blacklisted(c,
+ le64_to_cpu(bne->keys.journal_seq),
+ true),
+ BTREE_ERR_WANT_RETRY, c, ca, b, NULL,
+ "found bset signature after last bset");
- /*
- * Blacklisted bsets are those that were written after the most recent
- * (flush) journal write. Since there wasn't a flush, they may not have
- * made it to all devices - which means we shouldn't write new bsets
- * after them, as that could leave a gap and then reads from that device
- * wouldn't find all the bsets in that btree node - which means it's
- * important that we start writing new bsets after the most recent _non_
- * blacklisted bset:
- */
- b->written = nonblacklisted_written;
+ /*
+ * Blacklisted bsets are those that were written after the most recent
+ * (flush) journal write. Since there wasn't a flush, they may not have
+ * made it to all devices - which means we shouldn't write new bsets
+ * after them, as that could leave a gap and then reads from that device
+ * wouldn't find all the bsets in that btree node - which means it's
+ * important that we start writing new bsets after the most recent _non_
+ * blacklisted bset:
+ */
+ blacklisted_written = b->written;
+ b->written = nonblacklisted_written;
+ }
sorted = btree_bounce_alloc(c, btree_bytes(c), &used_mempool);
sorted->keys.u64s = 0;
@@ -1026,6 +1098,9 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
if (ca->mi.state != BCH_MEMBER_STATE_rw)
set_btree_node_need_rewrite(b);
}
+
+ if (!ptr_written)
+ set_btree_node_need_rewrite(b);
out:
mempool_free(iter, &c->fill_iter);
return retry_read;
@@ -1179,31 +1254,27 @@ static void btree_node_read_all_replicas_done(struct closure *cl)
container_of(cl, struct btree_node_read_all, cl);
struct bch_fs *c = ra->c;
struct btree *b = ra->b;
- bool have_good_copy = false;
bool dump_bset_maps = false;
bool have_retry = false;
- int ret = 0, write = READ;
- unsigned i, written, written2;
+ int ret = 0, best = -1, write = READ;
+ unsigned i, written = 0, written2 = 0;
__le64 seq = b->key.k.type == KEY_TYPE_btree_ptr_v2
? bkey_i_to_btree_ptr_v2(&b->key)->v.seq : 0;
for (i = 0; i < ra->nr; i++) {
+ struct btree_node *bn = ra->buf[i];
+
if (ra->err[i])
continue;
- if (!have_good_copy) {
- memcpy(b->data, ra->buf[i], btree_bytes(c));
- have_good_copy = true;
- written = btree_node_sectors_written(c, b->data);
- }
+ if (le64_to_cpu(bn->magic) != bset_magic(c) ||
+ (seq && seq != bn->keys.seq))
+ continue;
- /* Try to get the right btree node: */
- if (have_good_copy &&
- seq &&
- b->data->keys.seq != seq &&
- ((struct btree_node *) ra->buf[i])->keys.seq == seq) {
- memcpy(b->data, ra->buf[i], btree_bytes(c));
- written = btree_node_sectors_written(c, b->data);
+ if (best < 0) {
+ best = i;
+ written = btree_node_sectors_written(c, bn);
+ continue;
}
written2 = btree_node_sectors_written(c, ra->buf[i]);
@@ -1213,14 +1284,14 @@ static void btree_node_read_all_replicas_done(struct closure *cl)
btree_err_on(btree_node_has_extra_bsets(c, written2, ra->buf[i]),
BTREE_ERR_FIXABLE, c, NULL, b, NULL,
"found bset signature after last bset") ||
- btree_err_on(memcmp(b->data, ra->buf[i], written << 9),
+ btree_err_on(memcmp(ra->buf[best], ra->buf[i], written << 9),
BTREE_ERR_FIXABLE, c, NULL, b, NULL,
"btree node replicas content mismatch"))
dump_bset_maps = true;
if (written2 > written) {
written = written2;
- memcpy(b->data, ra->buf[i], btree_bytes(c));
+ best = i;
}
}
fsck_err:
@@ -1273,9 +1344,14 @@ fsck_err:
}
}
- if (have_good_copy)
- bch2_btree_node_read_done(c, NULL, b, false);
- else
+ if (best >= 0) {
+ memcpy(b->data, ra->buf[best], btree_bytes(c));
+ ret = bch2_btree_node_read_done(c, NULL, b, false);
+ } else {
+ ret = -1;
+ }
+
+ if (ret)
set_btree_node_read_error(b);
for (i = 0; i < ra->nr; i++) {
@@ -1390,8 +1466,6 @@ void bch2_btree_node_read(struct bch_fs *c, struct btree *b,
btree_pos_to_text(&PBUF(buf), c, b);
trace_btree_read(c, b);
- set_btree_node_read_in_flight(b);
-
if (bch2_verify_all_btree_replicas &&
!btree_node_read_all_replicas(c, b, sync))
return;
@@ -1467,6 +1541,8 @@ int bch2_btree_root_read(struct bch_fs *c, enum btree_id id,
bkey_copy(&b->key, k);
BUG_ON(bch2_btree_node_hash_insert(&c->btree_cache, b, level, id));
+ set_btree_node_read_in_flight(b);
+
bch2_btree_node_read(c, b, true);
if (btree_node_read_error(b)) {
@@ -1510,83 +1586,50 @@ void bch2_btree_complete_write(struct bch_fs *c, struct btree *b,
static void btree_node_write_done(struct bch_fs *c, struct btree *b)
{
struct btree_write *w = btree_prev_write(b);
+ unsigned long old, new, v;
bch2_btree_complete_write(c, b, w);
- btree_node_io_unlock(b);
-}
-
-static void bch2_btree_node_write_error(struct bch_fs *c,
- struct btree_write_bio *wbio)
-{
- struct btree *b = wbio->wbio.bio.bi_private;
- struct bkey_buf k;
- struct bch_extent_ptr *ptr;
- struct btree_trans trans;
- struct btree_iter *iter;
- int ret;
-
- bch2_bkey_buf_init(&k);
- bch2_trans_init(&trans, c, 0, 0);
-
- iter = bch2_trans_get_node_iter(&trans, b->c.btree_id, b->key.k.p,
- BTREE_MAX_DEPTH, b->c.level, 0);
-retry:
- ret = bch2_btree_iter_traverse(iter);
- if (ret)
- goto err;
-
- /* has node been freed? */
- if (iter->l[b->c.level].b != b) {
- /* node has been freed: */
- BUG_ON(!btree_node_dying(b));
- goto out;
- }
- BUG_ON(!btree_node_hashed(b));
-
- bch2_bkey_buf_copy(&k, c, &b->key);
+ v = READ_ONCE(b->flags);
+ do {
+ old = new = v;
- bch2_bkey_drop_ptrs(bkey_i_to_s(k.k), ptr,
- bch2_dev_list_has_dev(wbio->wbio.failed, ptr->dev));
+ if (old & (1U << BTREE_NODE_need_write))
+ goto do_write;
- if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(k.k)))
- goto err;
+ new &= ~(1U << BTREE_NODE_write_in_flight);
+ new &= ~(1U << BTREE_NODE_write_in_flight_inner);
+ } while ((v = cmpxchg(&b->flags, old, new)) != old);
- ret = bch2_btree_node_update_key(c, iter, b, k.k);
- if (ret == -EINTR)
- goto retry;
- if (ret)
- goto err;
-out:
- bch2_trans_iter_put(&trans, iter);
- bch2_trans_exit(&trans);
- bch2_bkey_buf_exit(&k, c);
- bio_put(&wbio->wbio.bio);
- btree_node_write_done(c, b);
+ wake_up_bit(&b->flags, BTREE_NODE_write_in_flight);
return;
-err:
- set_btree_node_noevict(b);
- bch2_fs_fatal_error(c, "fatal error writing btree node");
- goto out;
-}
-void bch2_btree_write_error_work(struct work_struct *work)
-{
- struct bch_fs *c = container_of(work, struct bch_fs,
- btree_write_error_work);
- struct bio *bio;
+do_write:
+ six_lock_read(&b->c.lock, NULL, NULL);
+ v = READ_ONCE(b->flags);
+ do {
+ old = new = v;
- while (1) {
- spin_lock_irq(&c->btree_write_error_lock);
- bio = bio_list_pop(&c->btree_write_error_list);
- spin_unlock_irq(&c->btree_write_error_lock);
+ if ((old & (1U << BTREE_NODE_dirty)) &&
+ (old & (1U << BTREE_NODE_need_write)) &&
+ !(old & (1U << BTREE_NODE_never_write)) &&
+ btree_node_may_write(b)) {
+ new &= ~(1U << BTREE_NODE_dirty);
+ new &= ~(1U << BTREE_NODE_need_write);
+ new |= (1U << BTREE_NODE_write_in_flight);
+ new |= (1U << BTREE_NODE_write_in_flight_inner);
+ new |= (1U << BTREE_NODE_just_written);
+ new ^= (1U << BTREE_NODE_write_idx);
+ } else {
+ new &= ~(1U << BTREE_NODE_write_in_flight);
+ new &= ~(1U << BTREE_NODE_write_in_flight_inner);
+ }
+ } while ((v = cmpxchg(&b->flags, old, new)) != old);
- if (!bio)
- break;
+ if (new & (1U << BTREE_NODE_write_in_flight))
+ __bch2_btree_node_write(c, b, true);
- bch2_btree_node_write_error(c,
- container_of(bio, struct btree_write_bio, wbio.bio));
- }
+ six_unlock_read(&b->c.lock);
}
static void btree_node_write_work(struct work_struct *work)
@@ -1595,25 +1638,39 @@ static void btree_node_write_work(struct work_struct *work)
container_of(work, struct btree_write_bio, work);
struct bch_fs *c = wbio->wbio.c;
struct btree *b = wbio->wbio.bio.bi_private;
+ struct bch_extent_ptr *ptr;
+ int ret;
btree_bounce_free(c,
- wbio->bytes,
+ wbio->data_bytes,
wbio->wbio.used_mempool,
wbio->data);
- if (wbio->wbio.failed.nr) {
- unsigned long flags;
+ bch2_bkey_drop_ptrs(bkey_i_to_s(&wbio->key), ptr,
+ bch2_dev_list_has_dev(wbio->wbio.failed, ptr->dev));
- spin_lock_irqsave(&c->btree_write_error_lock, flags);
- bio_list_add(&c->btree_write_error_list, &wbio->wbio.bio);
- spin_unlock_irqrestore(&c->btree_write_error_lock, flags);
+ if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(&wbio->key)))
+ goto err;
- queue_work(c->btree_error_wq, &c->btree_write_error_work);
- return;
- }
+ if (wbio->wbio.first_btree_write) {
+ if (wbio->wbio.failed.nr) {
+ }
+ } else {
+ ret = bch2_trans_do(c, NULL, NULL, 0,
+ bch2_btree_node_update_key_get_iter(&trans, b, &wbio->key,
+ !wbio->wbio.failed.nr));
+ if (ret)
+ goto err;
+ }
+out:
bio_put(&wbio->wbio.bio);
btree_node_write_done(c, b);
+ return;
+err:
+ set_btree_node_noevict(b);
+ bch2_fs_fatal_error(c, "fatal error writing btree node");
+ goto out;
}
static void btree_node_write_endio(struct bio *bio)
@@ -1621,7 +1678,9 @@ static void btree_node_write_endio(struct bio *bio)
struct bch_write_bio *wbio = to_wbio(bio);
struct bch_write_bio *parent = wbio->split ? wbio->parent : NULL;
struct bch_write_bio *orig = parent ?: wbio;
+ struct btree_write_bio *wb = container_of(orig, struct btree_write_bio, wbio);
struct bch_fs *c = wbio->c;
+ struct btree *b = wbio->bio.bi_private;
struct bch_dev *ca = bch_dev_bkey_exists(c, wbio->dev);
unsigned long flags;
@@ -1642,13 +1701,13 @@ static void btree_node_write_endio(struct bio *bio)
if (parent) {
bio_put(bio);
bio_endio(&parent->bio);
- } else {
- struct btree_write_bio *wb =
- container_of(orig, struct btree_write_bio, wbio);
-
- INIT_WORK(&wb->work, btree_node_write_work);
- queue_work(c->io_complete_wq, &wb->work);
+ return;
}
+
+ clear_btree_node_write_in_flight_inner(b);
+ wake_up_bit(&b->flags, BTREE_NODE_write_in_flight_inner);
+ INIT_WORK(&wb->work, btree_node_write_work);
+ queue_work(c->btree_io_complete_wq, &wb->work);
}
static int validate_bset_for_write(struct bch_fs *c, struct btree *b,
@@ -1661,7 +1720,7 @@ static int validate_bset_for_write(struct bch_fs *c, struct btree *b,
return -1;
ret = validate_bset_keys(c, b, i, &whiteout_u64s, WRITE, false) ?:
- validate_bset(c, NULL, b, i, sectors, WRITE, false);
+ validate_bset(c, NULL, b, i, b->written, sectors, WRITE, false);
if (ret) {
bch2_inconsistent_error(c);
dump_stack();
@@ -1673,18 +1732,24 @@ static int validate_bset_for_write(struct bch_fs *c, struct btree *b,
static void btree_write_submit(struct work_struct *work)
{
struct btree_write_bio *wbio = container_of(work, struct btree_write_bio, work);
+ struct bch_extent_ptr *ptr;
+ __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX) tmp;
- bch2_submit_wbio_replicas(&wbio->wbio, wbio->wbio.c, BCH_DATA_btree, &wbio->key);
+ bkey_copy(&tmp.k, &wbio->key);
+
+ bkey_for_each_ptr(bch2_bkey_ptrs(bkey_i_to_s(&tmp.k)), ptr)
+ ptr->offset += wbio->sector_offset;
+
+ bch2_submit_wbio_replicas(&wbio->wbio, wbio->wbio.c, BCH_DATA_btree, &tmp.k);
}
-void __bch2_btree_node_write(struct bch_fs *c, struct btree *b)
+void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, bool already_started)
{
struct btree_write_bio *wbio;
struct bset_tree *t;
struct bset *i;
struct btree_node *bn = NULL;
struct btree_node_entry *bne = NULL;
- struct bch_extent_ptr *ptr;
struct sort_iter sort_iter;
struct nonce nonce;
unsigned bytes_to_write, sectors_to_write, bytes, u64s;
@@ -1694,6 +1759,9 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b)
bool validate_before_checksum = false;
void *data;
+ if (already_started)
+ goto do_write;
+
if (test_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags))
return;
@@ -1716,22 +1784,19 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b)
if (old & (1 << BTREE_NODE_never_write))
return;
- if (old & (1 << BTREE_NODE_write_in_flight)) {
- /*
- * XXX waiting on btree writes with btree locks held -
- * this can deadlock, and we hit the write error path
- */
- btree_node_wait_on_io(b);
- continue;
- }
+ BUG_ON(old & (1 << BTREE_NODE_write_in_flight));
new &= ~(1 << BTREE_NODE_dirty);
new &= ~(1 << BTREE_NODE_need_write);
new |= (1 << BTREE_NODE_write_in_flight);
+ new |= (1 << BTREE_NODE_write_in_flight_inner);
new |= (1 << BTREE_NODE_just_written);
new ^= (1 << BTREE_NODE_write_idx);
} while (cmpxchg_acquire(&b->flags, old, new) != old);
+ if (new & (1U << BTREE_NODE_need_write))
+ return;
+do_write:
atomic_dec(&c->btree_cache.dirty);
BUG_ON(btree_node_fake(b));
@@ -1818,6 +1883,7 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b)
i->version = c->sb.version < bcachefs_metadata_version_new_versioning
? cpu_to_le16(BCH_BSET_VERSION_OLD)
: cpu_to_le16(c->sb.version);
+ SET_BSET_OFFSET(i, b->written);
SET_BSET_CSUM_TYPE(i, bch2_meta_checksum_type(c));
if (bch2_csum_type_is_encryption(BSET_CSUM_TYPE(i)))
@@ -1876,37 +1942,30 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b)
struct btree_write_bio, wbio.bio);
wbio_init(&wbio->wbio.bio);
wbio->data = data;
- wbio->bytes = bytes;
+ wbio->data_bytes = bytes;
+ wbio->sector_offset = b->written;
wbio->wbio.c = c;
wbio->wbio.used_mempool = used_mempool;
+ wbio->wbio.first_btree_write = !b->written;
wbio->wbio.bio.bi_opf = REQ_OP_WRITE|REQ_META;
wbio->wbio.bio.bi_end_io = btree_node_write_endio;
wbio->wbio.bio.bi_private = b;
bch2_bio_map(&wbio->wbio.bio, data, sectors_to_write << 9);
- /*
- * If we're appending to a leaf node, we don't technically need FUA -
- * this write just needs to be persisted before the next journal write,
- * which will be marked FLUSH|FUA.
- *
- * Similarly if we're writing a new btree root - the pointer is going to
- * be in the next journal entry.
- *
- * But if we're writing a new btree node (that isn't a root) or
- * appending to a non leaf btree node, we need either FUA or a flush
- * when we write the parent with the new pointer. FUA is cheaper than a
- * flush, and writes appending to leaf nodes aren't blocking anything so
- * just make all btree node writes FUA to keep things sane.
- */
-
bkey_copy(&wbio->key, &b->key);
- bkey_for_each_ptr(bch2_bkey_ptrs(bkey_i_to_s(&wbio->key)), ptr)
- ptr->offset += b->written;
-
b->written += sectors_to_write;
+ if (wbio->wbio.first_btree_write &&
+ b->key.k.type == KEY_TYPE_btree_ptr_v2)
+ bkey_i_to_btree_ptr_v2(&b->key)->v.sectors_written =
+ cpu_to_le16(b->written);
+
+ if (wbio->key.k.type == KEY_TYPE_btree_ptr_v2)
+ bkey_i_to_btree_ptr_v2(&wbio->key)->v.sectors_written =
+ cpu_to_le16(b->written);
+
atomic64_inc(&c->btree_writes_nr);
atomic64_add(sectors_to_write, &c->btree_writes_sectors);
@@ -1915,6 +1974,10 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b)
return;
err:
set_btree_node_noevict(b);
+ if (!b->written &&
+ b->key.k.type == KEY_TYPE_btree_ptr_v2)
+ bkey_i_to_btree_ptr_v2(&b->key)->v.sectors_written =
+ cpu_to_le16(sectors_to_write);
b->written += sectors_to_write;
nowrite:
btree_bounce_free(c, bytes, used_mempool, data);
@@ -1986,7 +2049,7 @@ void bch2_btree_node_write(struct bch_fs *c, struct btree *b,
if (lock_type_held == SIX_LOCK_intent ||
(lock_type_held == SIX_LOCK_read &&
six_lock_tryupgrade(&b->c.lock))) {
- __bch2_btree_node_write(c, b);
+ __bch2_btree_node_write(c, b, false);
/* don't cycle lock unnecessarily: */
if (btree_node_just_written(b) &&
@@ -1998,7 +2061,7 @@ void bch2_btree_node_write(struct bch_fs *c, struct btree *b,
if (lock_type_held == SIX_LOCK_read)
six_lock_downgrade(&b->c.lock);
} else {
- __bch2_btree_node_write(c, b);
+ __bch2_btree_node_write(c, b, false);
if (lock_type_held == SIX_LOCK_write &&
btree_node_just_written(b))
bch2_btree_post_write_cleanup(c, b);
diff --git a/fs/bcachefs/btree_io.h b/fs/bcachefs/btree_io.h
index eeb9d23caf88..0f20224e2a77 100644
--- a/fs/bcachefs/btree_io.h
+++ b/fs/bcachefs/btree_io.h
@@ -32,6 +32,13 @@ static inline void clear_btree_node_dirty(struct bch_fs *c, struct btree *b)
atomic_dec(&c->btree_cache.dirty);
}
+static inline unsigned btree_ptr_sectors_written(struct bkey_i *k)
+{
+ return k->k.type == KEY_TYPE_btree_ptr_v2
+ ? le16_to_cpu(bkey_i_to_btree_ptr_v2(k)->v.sectors_written)
+ : 0;
+}
+
struct btree_read_bio {
struct bch_fs *c;
struct btree *b;
@@ -48,28 +55,17 @@ struct btree_write_bio {
struct work_struct work;
__BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX);
void *data;
- unsigned bytes;
+ unsigned data_bytes;
+ unsigned sector_offset;
struct bch_write_bio wbio;
};
-static inline void btree_node_io_unlock(struct btree *b)
-{
- EBUG_ON(!btree_node_write_in_flight(b));
- clear_btree_node_write_in_flight(b);
- wake_up_bit(&b->flags, BTREE_NODE_write_in_flight);
-}
-
-static inline void btree_node_io_lock(struct btree *b)
-{
- wait_on_bit_lock_io(&b->flags, BTREE_NODE_write_in_flight,
- TASK_UNINTERRUPTIBLE);
-}
-
-static inline void btree_node_wait_on_io(struct btree *b)
-{
- wait_on_bit_io(&b->flags, BTREE_NODE_write_in_flight,
- TASK_UNINTERRUPTIBLE);
-}
+void bch2_btree_node_io_unlock(struct btree *);
+void bch2_btree_node_io_lock(struct btree *);
+void __bch2_btree_node_wait_on_read(struct btree *);
+void __bch2_btree_node_wait_on_write(struct btree *);
+void bch2_btree_node_wait_on_read(struct btree *);
+void bch2_btree_node_wait_on_write(struct btree *);
static inline bool btree_node_may_write(struct btree *b)
{
@@ -126,7 +122,7 @@ static inline void bset_encrypt(struct bch_fs *c, struct bset *i, unsigned offse
bch2_encrypt(c, BSET_CSUM_TYPE(i), nonce, &bn->flags,
bytes);
- nonce = nonce_add(nonce, round_up(bytes, CHACHA20_BLOCK_SIZE));
+ nonce = nonce_add(nonce, round_up(bytes, CHACHA_BLOCK_SIZE));
}
bch2_encrypt(c, BSET_CSUM_TYPE(i), nonce, i->_data,
@@ -138,8 +134,7 @@ void bch2_btree_sort_into(struct bch_fs *, struct btree *, struct btree *);
void bch2_btree_node_drop_keys_outside_node(struct btree *);
void bch2_btree_build_aux_trees(struct btree *);
-void bch2_btree_init_next(struct bch_fs *, struct btree *,
- struct btree_iter *);
+void bch2_btree_init_next(struct btree_trans *, struct btree *);
int bch2_btree_node_read_done(struct bch_fs *, struct bch_dev *,
struct btree *, bool);
@@ -149,9 +144,8 @@ int bch2_btree_root_read(struct bch_fs *, enum btree_id,
void bch2_btree_complete_write(struct bch_fs *, struct btree *,
struct btree_write *);
-void bch2_btree_write_error_work(struct work_struct *);
-void __bch2_btree_node_write(struct bch_fs *, struct btree *);
+void __bch2_btree_node_write(struct bch_fs *, struct btree *, bool);
bool bch2_btree_post_write_cleanup(struct bch_fs *, struct btree *);
void bch2_btree_node_write(struct bch_fs *, struct btree *,
@@ -160,18 +154,11 @@ void bch2_btree_node_write(struct bch_fs *, struct btree *,
static inline void btree_node_write_if_need(struct bch_fs *c, struct btree *b,
enum six_lock_type lock_held)
{
- while (b->written &&
- btree_node_need_write(b) &&
- btree_node_may_write(b)) {
- if (!btree_node_write_in_flight(b)) {
- bch2_btree_node_write(c, b, lock_held);
- break;
- }
-
- six_unlock_type(&b->c.lock, lock_held);
- btree_node_wait_on_io(b);
- btree_node_lock_type(c, b, lock_held);
- }
+ if (b->written &&
+ btree_node_need_write(b) &&
+ btree_node_may_write(b) &&
+ !btree_node_write_in_flight(b))
+ bch2_btree_node_write(c, b, lock_held);
}
#define bch2_btree_node_write_cond(_c, _b, cond) \
diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c
index cd714dc2df3c..eb7e1cde1bf3 100644
--- a/fs/bcachefs/btree_iter.c
+++ b/fs/bcachefs/btree_iter.c
@@ -13,16 +13,55 @@
#include "extents.h"
#include "journal.h"
#include "replicas.h"
+#include "subvolume.h"
#include <linux/prefetch.h>
#include <trace/events/bcachefs.h>
-static void btree_iter_set_search_pos(struct btree_iter *, struct bpos);
+static void btree_trans_verify_sorted(struct btree_trans *);
+static void btree_path_check_sort(struct btree_trans *, struct btree_path *, int);
-static inline struct bpos bkey_successor(struct btree_iter *iter, struct bpos p)
+static inline void btree_path_list_remove(struct btree_trans *, struct btree_path *);
+static inline void btree_path_list_add(struct btree_trans *, struct btree_path *,
+ struct btree_path *);
+
+static struct btree_path *btree_path_alloc(struct btree_trans *, struct btree_path *);
+
+/*
+ * Unlocks before scheduling
+ * Note: does not revalidate iterator
+ */
+static inline int bch2_trans_cond_resched(struct btree_trans *trans)
{
- EBUG_ON(btree_iter_type(iter) == BTREE_ITER_NODES);
+ if (need_resched() || race_fault()) {
+ bch2_trans_unlock(trans);
+ schedule();
+ return bch2_trans_relock(trans) ? 0 : -EINTR;
+ } else {
+ return 0;
+ }
+}
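Aside (illustration, not part of the patch): per the comment above, this helper drops the transaction's locks before scheduling and does not revalidate anything, so a nonzero return means "locks lost, caller must restart". A self-contained C sketch of that calling pattern, with made-up names (fake_cond_resched, do_batch) standing in for the real iteration code:

#include <stdio.h>

/* Pretend the first call loses our locks; later calls succeed. */
static int fake_cond_resched(void)
{
	static int faulted;

	if (!faulted) {
		faulted = 1;
		return -1;	/* stand-in for -EINTR */
	}
	return 0;
}

static int do_batch(void)
{
	for (int i = 0; i < 5; i++) {
		int ret = fake_cond_resched();

		if (ret)
			return ret;	/* propagate: the caller restarts */
		printf("processed key %d\n", i);
	}
	return 0;
}

int main(void)
{
	while (do_batch())
		printf("restarting batch\n");
	return 0;
}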
+static inline int __btree_path_cmp(const struct btree_path *l,
+ enum btree_id r_btree_id,
+ bool r_cached,
+ struct bpos r_pos,
+ unsigned r_level)
+{
+ return cmp_int(l->btree_id, r_btree_id) ?:
+ cmp_int((int) l->cached, (int) r_cached) ?:
+ bpos_cmp(l->pos, r_pos) ?:
+ -cmp_int(l->level, r_level);
+}
+
+static inline int btree_path_cmp(const struct btree_path *l,
+ const struct btree_path *r)
+{
+ return __btree_path_cmp(l, r->btree_id, r->cached, r->pos, r->level);
+}
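Aside (illustration, not part of the patch): as I read the comparator above, paths sort by btree id, then by the cached flag, then by position, with ties broken toward higher (closer to the root) level first. A self-contained C sketch of that ordering using toy types in place of the kernel ones (the chained ?: is the GNU extension the kernel code also relies on):

#include <stdio.h>
#include <stdlib.h>

struct toy_path { int btree_id; int cached; int pos; int level; };

static int cmp_int_(int l, int r) { return (l > r) - (l < r); }

static int toy_path_cmp(const void *a, const void *b)
{
	const struct toy_path *l = a, *r = b;

	return cmp_int_(l->btree_id, r->btree_id) ?:
	       cmp_int_(l->cached,   r->cached)   ?:
	       cmp_int_(l->pos,      r->pos)      ?:
	       -cmp_int_(l->level,   r->level);
}

int main(void)
{
	struct toy_path p[] = {
		{ 1, 0, 10, 0 },
		{ 0, 0, 50, 1 },
		{ 0, 0, 50, 0 },
		{ 0, 1,  5, 0 },
	};

	/* Expected order: (0,0,50,l1), (0,0,50,l0), (0,1,5), (1,0,10) */
	qsort(p, 4, sizeof(p[0]), toy_path_cmp);
	for (int i = 0; i < 4; i++)
		printf("btree %d cached %d pos %d level %d\n",
		       p[i].btree_id, p[i].cached, p[i].pos, p[i].level);
	return 0;
}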
+
+static inline struct bpos bkey_successor(struct btree_iter *iter, struct bpos p)
+{
/* Are we iterating over keys in all snapshots? */
if (iter->flags & BTREE_ITER_ALL_SNAPSHOTS) {
p = bpos_successor(p);
@@ -36,8 +75,6 @@ static inline struct bpos bkey_successor(struct btree_iter *iter, struct bpos p)
static inline struct bpos bkey_predecessor(struct btree_iter *iter, struct bpos p)
{
- EBUG_ON(btree_iter_type(iter) == BTREE_ITER_NODES);
-
/* Are we iterating over keys in all snapshots? */
if (iter->flags & BTREE_ITER_ALL_SNAPSHOTS) {
p = bpos_predecessor(p);
@@ -49,10 +86,10 @@ static inline struct bpos bkey_predecessor(struct btree_iter *iter, struct bpos
return p;
}
-static inline bool is_btree_node(struct btree_iter *iter, unsigned l)
+static inline bool is_btree_node(struct btree_path *path, unsigned l)
{
return l < BTREE_MAX_DEPTH &&
- (unsigned long) iter->l[l].b >= 128;
+ (unsigned long) path->l[l].b >= 128;
}
static inline struct bpos btree_iter_search_key(struct btree_iter *iter)
@@ -65,41 +102,40 @@ static inline struct bpos btree_iter_search_key(struct btree_iter *iter)
return pos;
}
-static inline bool btree_iter_pos_before_node(struct btree_iter *iter,
+static inline bool btree_path_pos_before_node(struct btree_path *path,
struct btree *b)
{
- return bpos_cmp(iter->real_pos, b->data->min_key) < 0;
+ return bpos_cmp(path->pos, b->data->min_key) < 0;
}
-static inline bool btree_iter_pos_after_node(struct btree_iter *iter,
+static inline bool btree_path_pos_after_node(struct btree_path *path,
struct btree *b)
{
- return bpos_cmp(b->key.k.p, iter->real_pos) < 0;
+ return bpos_cmp(b->key.k.p, path->pos) < 0;
}
-static inline bool btree_iter_pos_in_node(struct btree_iter *iter,
+static inline bool btree_path_pos_in_node(struct btree_path *path,
struct btree *b)
{
- return iter->btree_id == b->c.btree_id &&
- !btree_iter_pos_before_node(iter, b) &&
- !btree_iter_pos_after_node(iter, b);
+ return path->btree_id == b->c.btree_id &&
+ !btree_path_pos_before_node(path, b) &&
+ !btree_path_pos_after_node(path, b);
}
/* Btree node locking: */
-void bch2_btree_node_unlock_write(struct btree *b, struct btree_iter *iter)
+void bch2_btree_node_unlock_write(struct btree_trans *trans,
+ struct btree_path *path, struct btree *b)
{
- bch2_btree_node_unlock_write_inlined(b, iter);
+ bch2_btree_node_unlock_write_inlined(trans, path, b);
}
-void __bch2_btree_node_lock_write(struct btree *b, struct btree_iter *iter)
+void __bch2_btree_node_lock_write(struct btree_trans *trans, struct btree *b)
{
- struct btree_iter *linked;
+ struct btree_path *linked;
unsigned readers = 0;
- EBUG_ON(!btree_node_intent_locked(iter, b->c.level));
-
- trans_for_each_iter(iter->trans, linked)
+ trans_for_each_path(trans, linked)
if (linked->l[b->c.level].b == b &&
btree_node_read_locked(linked, b->c.level))
readers++;
@@ -112,138 +148,141 @@ void __bch2_btree_node_lock_write(struct btree *b, struct btree_iter *iter)
*/
atomic64_sub(__SIX_VAL(read_lock, readers),
&b->c.lock.state.counter);
- btree_node_lock_type(iter->trans->c, b, SIX_LOCK_write);
+ btree_node_lock_type(trans->c, b, SIX_LOCK_write);
atomic64_add(__SIX_VAL(read_lock, readers),
&b->c.lock.state.counter);
}
-bool __bch2_btree_node_relock(struct btree_iter *iter, unsigned level)
+bool __bch2_btree_node_relock(struct btree_trans *trans,
+ struct btree_path *path, unsigned level)
{
- struct btree *b = btree_iter_node(iter, level);
- int want = __btree_lock_want(iter, level);
+ struct btree *b = btree_path_node(path, level);
+ int want = __btree_lock_want(path, level);
- if (!is_btree_node(iter, level))
+ if (!is_btree_node(path, level))
return false;
if (race_fault())
return false;
- if (six_relock_type(&b->c.lock, want, iter->l[level].lock_seq) ||
- (btree_node_lock_seq_matches(iter, b, level) &&
- btree_node_lock_increment(iter->trans, b, level, want))) {
- mark_btree_node_locked(iter, level, want);
+ if (six_relock_type(&b->c.lock, want, path->l[level].lock_seq) ||
+ (btree_node_lock_seq_matches(path, b, level) &&
+ btree_node_lock_increment(trans, b, level, want))) {
+ mark_btree_node_locked(path, level, want);
return true;
} else {
return false;
}
}
-static bool bch2_btree_node_upgrade(struct btree_iter *iter, unsigned level)
+bool bch2_btree_node_upgrade(struct btree_trans *trans,
+ struct btree_path *path, unsigned level)
{
- struct btree *b = iter->l[level].b;
+ struct btree *b = path->l[level].b;
- EBUG_ON(btree_lock_want(iter, level) != BTREE_NODE_INTENT_LOCKED);
-
- if (!is_btree_node(iter, level))
+ if (!is_btree_node(path, level))
return false;
- if (btree_node_intent_locked(iter, level))
+ switch (btree_lock_want(path, level)) {
+ case BTREE_NODE_UNLOCKED:
+ BUG_ON(btree_node_locked(path, level));
+ return true;
+ case BTREE_NODE_READ_LOCKED:
+ BUG_ON(btree_node_intent_locked(path, level));
+ return bch2_btree_node_relock(trans, path, level);
+ case BTREE_NODE_INTENT_LOCKED:
+ break;
+ }
+
+ if (btree_node_intent_locked(path, level))
return true;
if (race_fault())
return false;
- if (btree_node_locked(iter, level)
+ if (btree_node_locked(path, level)
? six_lock_tryupgrade(&b->c.lock)
- : six_relock_type(&b->c.lock, SIX_LOCK_intent, iter->l[level].lock_seq))
+ : six_relock_type(&b->c.lock, SIX_LOCK_intent, path->l[level].lock_seq))
goto success;
- if (btree_node_lock_seq_matches(iter, b, level) &&
- btree_node_lock_increment(iter->trans, b, level, BTREE_NODE_INTENT_LOCKED)) {
- btree_node_unlock(iter, level);
+ if (btree_node_lock_seq_matches(path, b, level) &&
+ btree_node_lock_increment(trans, b, level, BTREE_NODE_INTENT_LOCKED)) {
+ btree_node_unlock(path, level);
goto success;
}
return false;
success:
- mark_btree_node_intent_locked(iter, level);
+ mark_btree_node_intent_locked(path, level);
return true;
}
-static inline bool btree_iter_get_locks(struct btree_iter *iter, bool upgrade,
- unsigned long trace_ip)
+static inline bool btree_path_get_locks(struct btree_trans *trans,
+ struct btree_path *path,
+ bool upgrade, unsigned long trace_ip)
{
- unsigned l = iter->level;
+ unsigned l = path->level;
int fail_idx = -1;
do {
- if (!btree_iter_node(iter, l))
+ if (!btree_path_node(path, l))
break;
if (!(upgrade
- ? bch2_btree_node_upgrade(iter, l)
- : bch2_btree_node_relock(iter, l))) {
- (upgrade
- ? trace_node_upgrade_fail
- : trace_node_relock_fail)(iter->trans->ip, trace_ip,
- iter->btree_id, &iter->real_pos,
- l, iter->l[l].lock_seq,
- is_btree_node(iter, l)
- ? 0
- : (unsigned long) iter->l[l].b,
- is_btree_node(iter, l)
- ? iter->l[l].b->c.lock.state.seq
- : 0);
-
+ ? bch2_btree_node_upgrade(trans, path, l)
+ : bch2_btree_node_relock(trans, path, l)))
fail_idx = l;
- btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE);
- }
l++;
- } while (l < iter->locks_want);
+ } while (l < path->locks_want);
/*
* When we fail to get a lock, we have to ensure that any child nodes
- * can't be relocked so bch2_btree_iter_traverse has to walk back up to
+ * can't be relocked so bch2_btree_path_traverse has to walk back up to
* the node that we failed to relock:
*/
- while (fail_idx >= 0) {
- btree_node_unlock(iter, fail_idx);
- iter->l[fail_idx].b = BTREE_ITER_NO_NODE_GET_LOCKS;
- --fail_idx;
+ if (fail_idx >= 0) {
+ __bch2_btree_path_unlock(path);
+ btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
+
+ do {
+ path->l[fail_idx].b = BTREE_ITER_NO_NODE_GET_LOCKS;
+ --fail_idx;
+ } while (fail_idx >= 0);
}
- if (iter->uptodate == BTREE_ITER_NEED_RELOCK)
- iter->uptodate = BTREE_ITER_NEED_PEEK;
+ if (path->uptodate == BTREE_ITER_NEED_RELOCK)
+ path->uptodate = BTREE_ITER_UPTODATE;
- bch2_btree_trans_verify_locks(iter->trans);
+ bch2_trans_verify_locks(trans);
- return iter->uptodate < BTREE_ITER_NEED_RELOCK;
+ return path->uptodate < BTREE_ITER_NEED_RELOCK;
}
static struct bpos btree_node_pos(struct btree_bkey_cached_common *_b,
- enum btree_iter_type type)
+ bool cached)
{
- return type != BTREE_ITER_CACHED
+ return !cached
? container_of(_b, struct btree, c)->key.k.p
: container_of(_b, struct bkey_cached, c)->key.pos;
}
/* Slowpath: */
-bool __bch2_btree_node_lock(struct btree *b, struct bpos pos,
- unsigned level, struct btree_iter *iter,
+bool __bch2_btree_node_lock(struct btree_trans *trans,
+ struct btree_path *path,
+ struct btree *b,
+ struct bpos pos, unsigned level,
enum six_lock_type type,
six_lock_should_sleep_fn should_sleep_fn, void *p,
unsigned long ip)
{
- struct btree_trans *trans = iter->trans;
- struct btree_iter *linked, *deadlock_iter = NULL;
+ struct btree_path *linked, *deadlock_path = NULL;
u64 start_time = local_clock();
unsigned reason = 9;
bool ret;
/* Check if it's safe to block: */
- trans_for_each_iter(trans, linked) {
+ trans_for_each_path(trans, linked) {
if (!linked->nodes_locked)
continue;
@@ -261,25 +300,25 @@ bool __bch2_btree_node_lock(struct btree *b, struct bpos pos,
*/
if (type == SIX_LOCK_intent &&
linked->nodes_locked != linked->nodes_intent_locked) {
- deadlock_iter = linked;
+ deadlock_path = linked;
reason = 1;
}
- if (linked->btree_id != iter->btree_id) {
- if (linked->btree_id > iter->btree_id) {
- deadlock_iter = linked;
+ if (linked->btree_id != path->btree_id) {
+ if (linked->btree_id > path->btree_id) {
+ deadlock_path = linked;
reason = 3;
}
continue;
}
/*
- * Within the same btree, cached iterators come before non
- * cached iterators:
+ * Within the same btree, cached paths come before non
+ * cached paths:
*/
- if (btree_iter_is_cached(linked) != btree_iter_is_cached(iter)) {
- if (btree_iter_is_cached(iter)) {
- deadlock_iter = linked;
+ if (linked->cached != path->cached) {
+ if (path->cached) {
+ deadlock_path = linked;
reason = 4;
}
continue;
@@ -287,33 +326,34 @@ bool __bch2_btree_node_lock(struct btree *b, struct bpos pos,
/*
* Interior nodes must be locked before their descendants: if
- * another iterator has possible descendants locked of the node
+ * another path possibly holds locks on descendants of the node
* we're about to lock, it must have the ancestors locked too:
*/
if (level > __fls(linked->nodes_locked)) {
- deadlock_iter = linked;
+ deadlock_path = linked;
reason = 5;
}
/* Must lock btree nodes in key order: */
if (btree_node_locked(linked, level) &&
bpos_cmp(pos, btree_node_pos((void *) linked->l[level].b,
- btree_iter_type(linked))) <= 0) {
- deadlock_iter = linked;
+ linked->cached)) <= 0) {
+ deadlock_path = linked;
reason = 7;
BUG_ON(trans->in_traverse_all);
}
}
- if (unlikely(deadlock_iter)) {
- trace_trans_restart_would_deadlock(iter->trans->ip, ip,
+ if (unlikely(deadlock_path)) {
+ trace_trans_restart_would_deadlock(trans->ip, ip,
trans->in_traverse_all, reason,
- deadlock_iter->btree_id,
- btree_iter_type(deadlock_iter),
- &deadlock_iter->real_pos,
- iter->btree_id,
- btree_iter_type(iter),
+ deadlock_path->btree_id,
+ deadlock_path->cached,
+ &deadlock_path->pos,
+ path->btree_id,
+ path->cached,
&pos);
+ btree_trans_restart(trans);
return false;
}
@@ -321,9 +361,9 @@ bool __bch2_btree_node_lock(struct btree *b, struct bpos pos,
return true;
#ifdef CONFIG_BCACHEFS_DEBUG
- trans->locking_iter_idx = iter->idx;
+ trans->locking_path_idx = path->idx;
trans->locking_pos = pos;
- trans->locking_btree_id = iter->btree_id;
+ trans->locking_btree_id = path->btree_id;
trans->locking_level = level;
trans->locking = b;
#endif
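Aside (illustration, not part of the patch): the checks above enforce a global lock-acquisition order - btrees in id order, cached before non-cached within a btree, keys in ascending order - and restart the transaction when taking the new lock would violate it. A simplified, self-contained model of just those three rules (it deliberately ignores the intent-lock and interior-node rules, and all names here are invented):

#include <stdbool.h>
#include <stdio.h>

struct held { int btree_id; bool cached; int pos; };

static bool would_deadlock(const struct held *held, int nr_held,
			   int btree_id, bool cached, int pos)
{
	for (int i = 0; i < nr_held; i++) {
		if (held[i].btree_id != btree_id) {
			if (held[i].btree_id > btree_id)
				return true;	/* btrees must be locked in id order */
			continue;
		}
		if (held[i].cached != cached) {
			if (cached)
				return true;	/* cached locks come before non-cached */
			continue;
		}
		if (pos <= held[i].pos)
			return true;		/* keys must be locked in ascending order */
	}
	return false;
}

int main(void)
{
	struct held held[] = { { 0, false, 100 } };

	printf("%d\n", would_deadlock(held, 1, 0, false,  50));	/* 1: restart */
	printf("%d\n", would_deadlock(held, 1, 0, false, 200));	/* 0: ok */
	return 0;
}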
@@ -342,52 +382,79 @@ bool __bch2_btree_node_lock(struct btree *b, struct bpos pos,
/* Btree iterator locking: */
#ifdef CONFIG_BCACHEFS_DEBUG
-static void bch2_btree_iter_verify_locks(struct btree_iter *iter)
+
+static void bch2_btree_path_verify_locks(struct btree_path *path)
{
unsigned l;
- if (!(iter->trans->iters_linked & (1ULL << iter->idx))) {
- BUG_ON(iter->nodes_locked);
+ if (!path->nodes_locked) {
+ BUG_ON(path->uptodate == BTREE_ITER_UPTODATE &&
+ btree_path_node(path, path->level));
return;
}
- for (l = 0; is_btree_node(iter, l); l++) {
- if (iter->uptodate >= BTREE_ITER_NEED_RELOCK &&
- !btree_node_locked(iter, l))
- continue;
-
- BUG_ON(btree_lock_want(iter, l) !=
- btree_node_locked_type(iter, l));
- }
+ for (l = 0; btree_path_node(path, l); l++)
+ BUG_ON(btree_lock_want(path, l) !=
+ btree_node_locked_type(path, l));
}
-void bch2_btree_trans_verify_locks(struct btree_trans *trans)
+void bch2_trans_verify_locks(struct btree_trans *trans)
{
- struct btree_iter *iter;
+ struct btree_path *path;
- trans_for_each_iter(trans, iter)
- bch2_btree_iter_verify_locks(iter);
+ trans_for_each_path(trans, path)
+ bch2_btree_path_verify_locks(path);
}
#else
-static inline void bch2_btree_iter_verify_locks(struct btree_iter *iter) {}
+static inline void bch2_btree_path_verify_locks(struct btree_path *path) {}
#endif
+/* Btree path locking: */
+
+/*
+ * Only for btree_cache.c - only relocks intent locks
+ */
+bool bch2_btree_path_relock_intent(struct btree_trans *trans,
+ struct btree_path *path)
+{
+ unsigned l;
+
+ for (l = path->level;
+ l < path->locks_want && btree_path_node(path, l);
+ l++) {
+ if (!bch2_btree_node_relock(trans, path, l)) {
+ __bch2_btree_path_unlock(path);
+ btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
+ btree_trans_restart(trans);
+ return false;
+ }
+ }
+
+ return true;
+}
+
__flatten
-static bool bch2_btree_iter_relock(struct btree_iter *iter, unsigned long trace_ip)
+static bool bch2_btree_path_relock(struct btree_trans *trans,
+ struct btree_path *path, unsigned long trace_ip)
{
- return btree_iter_get_locks(iter, false, trace_ip);
+ bool ret = btree_path_get_locks(trans, path, false, trace_ip);
+
+ if (!ret)
+ btree_trans_restart(trans);
+ return ret;
}
-bool __bch2_btree_iter_upgrade(struct btree_iter *iter,
+bool __bch2_btree_path_upgrade(struct btree_trans *trans,
+ struct btree_path *path,
unsigned new_locks_want)
{
- struct btree_iter *linked;
+ struct btree_path *linked;
- EBUG_ON(iter->locks_want >= new_locks_want);
+ EBUG_ON(path->locks_want >= new_locks_want);
- iter->locks_want = new_locks_want;
+ path->locks_want = new_locks_want;
- if (btree_iter_get_locks(iter, true, _THIS_IP_))
+ if (btree_path_get_locks(trans, path, true, _THIS_IP_))
return true;
/*
@@ -395,7 +462,7 @@ bool __bch2_btree_iter_upgrade(struct btree_iter *iter,
* iterators in the btree_trans here.
*
* On failure to upgrade the iterator, setting iter->locks_want and
- * calling get_locks() is sufficient to make bch2_btree_iter_traverse()
+ * calling get_locks() is sufficient to make bch2_btree_path_traverse()
* get the locks we want on transaction restart.
*
* But if this iterator was a clone, on transaction restart what we did
@@ -407,71 +474,68 @@ bool __bch2_btree_iter_upgrade(struct btree_iter *iter,
*
* The code below used to be needed to ensure ancestor nodes get locked
* before interior nodes - now that's handled by
- * bch2_btree_iter_traverse_all().
+ * bch2_btree_path_traverse_all().
*/
- trans_for_each_iter(iter->trans, linked)
- if (linked != iter &&
- btree_iter_type(linked) == btree_iter_type(iter) &&
- linked->btree_id == iter->btree_id &&
+ trans_for_each_path(trans, linked)
+ if (linked != path &&
+ linked->cached == path->cached &&
+ linked->btree_id == path->btree_id &&
linked->locks_want < new_locks_want) {
linked->locks_want = new_locks_want;
- btree_iter_get_locks(linked, true, _THIS_IP_);
+ btree_path_get_locks(trans, linked, true, _THIS_IP_);
}
return false;
}
-void __bch2_btree_iter_downgrade(struct btree_iter *iter,
+void __bch2_btree_path_downgrade(struct btree_path *path,
unsigned new_locks_want)
{
unsigned l;
- EBUG_ON(iter->locks_want < new_locks_want);
+ EBUG_ON(path->locks_want < new_locks_want);
- iter->locks_want = new_locks_want;
+ path->locks_want = new_locks_want;
- while (iter->nodes_locked &&
- (l = __fls(iter->nodes_locked)) >= iter->locks_want) {
- if (l > iter->level) {
- btree_node_unlock(iter, l);
+ while (path->nodes_locked &&
+ (l = __fls(path->nodes_locked)) >= path->locks_want) {
+ if (l > path->level) {
+ btree_node_unlock(path, l);
} else {
- if (btree_node_intent_locked(iter, l)) {
- six_lock_downgrade(&iter->l[l].b->c.lock);
- iter->nodes_intent_locked ^= 1 << l;
+ if (btree_node_intent_locked(path, l)) {
+ six_lock_downgrade(&path->l[l].b->c.lock);
+ path->nodes_intent_locked ^= 1 << l;
}
break;
}
}
- bch2_btree_trans_verify_locks(iter->trans);
+ bch2_btree_path_verify_locks(path);
}
void bch2_trans_downgrade(struct btree_trans *trans)
{
- struct btree_iter *iter;
+ struct btree_path *path;
- trans_for_each_iter(trans, iter)
- bch2_btree_iter_downgrade(iter);
+ trans_for_each_path(trans, path)
+ bch2_btree_path_downgrade(path);
}
/* Btree transaction locking: */
-static inline bool btree_iter_should_be_locked(struct btree_trans *trans,
- struct btree_iter *iter)
-{
- return (iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT) ||
- iter->should_be_locked;
-}
-
bool bch2_trans_relock(struct btree_trans *trans)
{
- struct btree_iter *iter;
+ struct btree_path *path;
+
+ if (unlikely(trans->restarted))
+ return false;
- trans_for_each_iter(trans, iter)
- if (!bch2_btree_iter_relock(iter, _RET_IP_) &&
- btree_iter_should_be_locked(trans, iter)) {
+ trans_for_each_path(trans, path)
+ if (path->should_be_locked &&
+ !bch2_btree_path_relock(trans, path, _RET_IP_)) {
trace_trans_restart_relock(trans->ip, _RET_IP_,
- iter->btree_id, &iter->real_pos);
+ path->btree_id, &path->pos);
+ BUG_ON(!trans->restarted);
return false;
}
return true;
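Aside (illustration, not part of the patch): a recurring theme in this patch is the new trans->restarted flag - once a relock or traverse failure sets it, every later btree operation in the transaction is expected to bail with -EINTR until the caller actually restarts. A toy model of that discipline, with hypothetical names in place of the kernel APIs:

#include <stdio.h>

#define TOY_EINTR 4

struct toy_trans { int restarted; };

static int toy_op(struct toy_trans *t, int lock_failure)
{
	if (t->restarted)
		return -TOY_EINTR;	/* refuse to do work until restarted */
	if (lock_failure) {
		t->restarted = 1;	/* analogous to btree_trans_restart() */
		return -TOY_EINTR;
	}
	return 0;
}

int main(void)
{
	struct toy_trans t = { 0 };

	printf("%d\n", toy_op(&t, 0));	/* 0 */
	printf("%d\n", toy_op(&t, 1));	/* -4: lock failure */
	printf("%d\n", toy_op(&t, 0));	/* -4: still refused */
	t.restarted = 0;		/* caller restarts the transaction */
	printf("%d\n", toy_op(&t, 0));	/* 0 */
	return 0;
}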
@@ -479,36 +543,39 @@ bool bch2_trans_relock(struct btree_trans *trans)
void bch2_trans_unlock(struct btree_trans *trans)
{
- struct btree_iter *iter;
+ struct btree_path *path;
- trans_for_each_iter(trans, iter)
- __bch2_btree_iter_unlock(iter);
+ trans_for_each_path(trans, path)
+ __bch2_btree_path_unlock(path);
+
+ BUG_ON(lock_class_is_held(&bch2_btree_node_lock_key));
}
/* Btree iterator: */
#ifdef CONFIG_BCACHEFS_DEBUG
-static void bch2_btree_iter_verify_cached(struct btree_iter *iter)
+static void bch2_btree_path_verify_cached(struct btree_trans *trans,
+ struct btree_path *path)
{
struct bkey_cached *ck;
- bool locked = btree_node_locked(iter, 0);
+ bool locked = btree_node_locked(path, 0);
- if (!bch2_btree_node_relock(iter, 0))
+ if (!bch2_btree_node_relock(trans, path, 0))
return;
- ck = (void *) iter->l[0].b;
- BUG_ON(ck->key.btree_id != iter->btree_id ||
- bkey_cmp(ck->key.pos, iter->pos));
+ ck = (void *) path->l[0].b;
+ BUG_ON(ck->key.btree_id != path->btree_id ||
+ bkey_cmp(ck->key.pos, path->pos));
if (!locked)
- btree_node_unlock(iter, 0);
+ btree_node_unlock(path, 0);
}
-static void bch2_btree_iter_verify_level(struct btree_iter *iter,
- unsigned level)
+static void bch2_btree_path_verify_level(struct btree_trans *trans,
+ struct btree_path *path, unsigned level)
{
- struct btree_iter_level *l;
+ struct btree_path_level *l;
struct btree_node_iter tmp;
bool locked;
struct bkey_packed *p, *k;
@@ -518,65 +585,52 @@ static void bch2_btree_iter_verify_level(struct btree_iter *iter,
if (!bch2_debug_check_iterators)
return;
- l = &iter->l[level];
+ l = &path->l[level];
tmp = l->iter;
- locked = btree_node_locked(iter, level);
+ locked = btree_node_locked(path, level);
- if (btree_iter_type(iter) == BTREE_ITER_CACHED) {
+ if (path->cached) {
if (!level)
- bch2_btree_iter_verify_cached(iter);
+ bch2_btree_path_verify_cached(trans, path);
return;
}
- BUG_ON(iter->level < iter->min_depth);
-
- if (!btree_iter_node(iter, level))
+ if (!btree_path_node(path, level))
return;
- if (!bch2_btree_node_relock(iter, level))
+ if (!bch2_btree_node_relock(trans, path, level))
return;
- BUG_ON(!btree_iter_pos_in_node(iter, l->b));
-
- /*
- * node iterators don't use leaf node iterator:
- */
- if (btree_iter_type(iter) == BTREE_ITER_NODES &&
- level <= iter->min_depth)
- goto unlock;
+ BUG_ON(!btree_path_pos_in_node(path, l->b));
bch2_btree_node_iter_verify(&l->iter, l->b);
/*
- * For interior nodes, the iterator will have skipped past
- * deleted keys:
- *
- * For extents, the iterator may have skipped past deleted keys (but not
- * whiteouts)
+ * For interior nodes, the iterator will have skipped past deleted keys:
*/
- p = level || btree_node_type_is_extents(iter->btree_id)
+ p = level
? bch2_btree_node_iter_prev(&tmp, l->b)
: bch2_btree_node_iter_prev_all(&tmp, l->b);
k = bch2_btree_node_iter_peek_all(&l->iter, l->b);
- if (p && bkey_iter_pos_cmp(l->b, p, &iter->real_pos) >= 0) {
+ if (p && bkey_iter_pos_cmp(l->b, p, &path->pos) >= 0) {
msg = "before";
goto err;
}
- if (k && bkey_iter_pos_cmp(l->b, k, &iter->real_pos) < 0) {
+ if (k && bkey_iter_pos_cmp(l->b, k, &path->pos) < 0) {
msg = "after";
goto err;
}
-unlock:
+
if (!locked)
- btree_node_unlock(iter, level);
+ btree_node_unlock(path, level);
return;
err:
strcpy(buf2, "(none)");
strcpy(buf3, "(none)");
- bch2_bpos_to_text(&PBUF(buf1), iter->real_pos);
+ bch2_bpos_to_text(&PBUF(buf1), path->pos);
if (p) {
struct bkey uk = bkey_unpack_key(l->b, p);
@@ -588,19 +642,49 @@ err:
bch2_bkey_to_text(&PBUF(buf3), &uk);
}
- panic("iterator should be %s key at level %u:\n"
- "iter pos %s\n"
+ panic("path should be %s key at level %u:\n"
+ "path pos %s\n"
"prev key %s\n"
"cur key %s\n",
msg, level, buf1, buf2, buf3);
}
-static void bch2_btree_iter_verify(struct btree_iter *iter)
+static void bch2_btree_path_verify(struct btree_trans *trans,
+ struct btree_path *path)
{
- enum btree_iter_type type = btree_iter_type(iter);
+ struct bch_fs *c = trans->c;
unsigned i;
- EBUG_ON(iter->btree_id >= BTREE_ID_NR);
+ EBUG_ON(path->btree_id >= BTREE_ID_NR);
+
+ for (i = 0; i < (!path->cached ? BTREE_MAX_DEPTH : 1); i++) {
+ if (!path->l[i].b) {
+ BUG_ON(!path->cached &&
+ c->btree_roots[path->btree_id].b->c.level > i);
+ break;
+ }
+
+ bch2_btree_path_verify_level(trans, path, i);
+ }
+
+ bch2_btree_path_verify_locks(path);
+}
+
+void bch2_trans_verify_paths(struct btree_trans *trans)
+{
+ struct btree_path *path;
+
+ trans_for_each_path(trans, path)
+ bch2_btree_path_verify(trans, path);
+}
+
+static void bch2_btree_iter_verify(struct btree_iter *iter)
+{
+ struct btree_trans *trans = iter->trans;
+
+ BUG_ON(iter->btree_id >= BTREE_ID_NR);
+
+ BUG_ON(!!(iter->flags & BTREE_ITER_CACHED) != iter->path->cached);
BUG_ON(!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS) &&
iter->pos.snapshot != iter->snapshot);
@@ -608,51 +692,125 @@ static void bch2_btree_iter_verify(struct btree_iter *iter)
BUG_ON((iter->flags & BTREE_ITER_IS_EXTENTS) &&
(iter->flags & BTREE_ITER_ALL_SNAPSHOTS));
- BUG_ON(type == BTREE_ITER_NODES &&
- !(iter->flags & BTREE_ITER_ALL_SNAPSHOTS));
-
- BUG_ON(type != BTREE_ITER_NODES &&
+ BUG_ON(!(iter->flags & __BTREE_ITER_ALL_SNAPSHOTS) &&
(iter->flags & BTREE_ITER_ALL_SNAPSHOTS) &&
!btree_type_has_snapshots(iter->btree_id));
- bch2_btree_iter_verify_locks(iter);
-
- for (i = 0; i < BTREE_MAX_DEPTH; i++)
- bch2_btree_iter_verify_level(iter, i);
+ bch2_btree_path_verify(trans, iter->path);
}
static void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter)
{
- enum btree_iter_type type = btree_iter_type(iter);
+ BUG_ON((iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) &&
+ !iter->pos.snapshot);
BUG_ON(!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS) &&
iter->pos.snapshot != iter->snapshot);
- BUG_ON((type == BTREE_ITER_KEYS ||
- type == BTREE_ITER_CACHED) &&
- (bkey_cmp(iter->pos, bkey_start_pos(&iter->k)) < 0 ||
- bkey_cmp(iter->pos, iter->k.p) > 0));
+ BUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&iter->k)) < 0 ||
+ bkey_cmp(iter->pos, iter->k.p) > 0);
}
-void bch2_btree_trans_verify_iters(struct btree_trans *trans, struct btree *b)
+static int bch2_btree_iter_verify_ret(struct btree_iter *iter, struct bkey_s_c k)
{
- struct btree_iter *iter;
+ struct btree_trans *trans = iter->trans;
+ struct btree_iter copy;
+ struct bkey_s_c prev;
+ int ret = 0;
if (!bch2_debug_check_iterators)
- return;
+ return 0;
- trans_for_each_iter_with_node(trans, b, iter)
- bch2_btree_iter_verify_level(iter, b->c.level);
+ if (!(iter->flags & BTREE_ITER_FILTER_SNAPSHOTS))
+ return 0;
+
+ if (bkey_err(k) || !k.k)
+ return 0;
+
+ BUG_ON(!bch2_snapshot_is_ancestor(trans->c,
+ iter->snapshot,
+ k.k->p.snapshot));
+
+ bch2_trans_iter_init(trans, &copy, iter->btree_id, iter->pos,
+ BTREE_ITER_ALL_SNAPSHOTS);
+ prev = bch2_btree_iter_prev(&copy);
+ if (!prev.k)
+ goto out;
+
+ ret = bkey_err(prev);
+ if (ret)
+ goto out;
+
+ if (!bkey_cmp(prev.k->p, k.k->p) &&
+ bch2_snapshot_is_ancestor(trans->c, iter->snapshot,
+ prev.k->p.snapshot) > 0) {
+ char buf1[100], buf2[200];
+
+ bch2_bkey_to_text(&PBUF(buf1), k.k);
+ bch2_bkey_to_text(&PBUF(buf2), prev.k);
+
+ panic("iter snap %u\n"
+ "k %s\n"
+ "prev %s\n",
+ iter->snapshot,
+ buf1, buf2);
+ }
+out:
+ bch2_trans_iter_exit(trans, &copy);
+ return ret;
+}
+
+void bch2_assert_pos_locked(struct btree_trans *trans, enum btree_id id,
+ struct bpos pos, bool key_cache)
+{
+ struct btree_path *path;
+ unsigned idx;
+ char buf[100];
+
+ trans_for_each_path_inorder(trans, path, idx) {
+ int cmp = cmp_int(path->btree_id, id) ?:
+ cmp_int(path->cached, key_cache);
+
+ if (cmp > 0)
+ break;
+ if (cmp < 0)
+ continue;
+
+ if (!(path->nodes_locked & 1) ||
+ !path->should_be_locked)
+ continue;
+
+ if (!key_cache) {
+ if (bkey_cmp(pos, path->l[0].b->data->min_key) >= 0 &&
+ bkey_cmp(pos, path->l[0].b->key.k.p) <= 0)
+ return;
+ } else {
+ if (!bkey_cmp(pos, path->pos))
+ return;
+ }
+ }
+
+ bch2_dump_trans_paths_updates(trans);
+ panic("not locked: %s %s%s\n",
+ bch2_btree_ids[id],
+ (bch2_bpos_to_text(&PBUF(buf), pos), buf),
+ key_cache ? " cached" : "");
}
#else
-static inline void bch2_btree_iter_verify_level(struct btree_iter *iter, unsigned l) {}
+static inline void bch2_btree_path_verify_level(struct btree_trans *trans,
+ struct btree_path *path, unsigned l) {}
+static inline void bch2_btree_path_verify(struct btree_trans *trans,
+ struct btree_path *path) {}
static inline void bch2_btree_iter_verify(struct btree_iter *iter) {}
static inline void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter) {}
+static inline int bch2_btree_iter_verify_ret(struct btree_iter *iter, struct bkey_s_c k) { return 0; }
#endif
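Aside (illustration, not part of the patch): bch2_assert_pos_locked() above walks the paths in sorted order and can stop as soon as it has passed the (btree id, cached) pair it is looking for - that early break is the payoff of keeping the path list sorted. A self-contained sketch of the same scan over a plain sorted array, toy types only:

#include <stdbool.h>
#include <stdio.h>

struct toy_path { int btree_id; bool cached; int min_key, max_key; };

static bool pos_locked(const struct toy_path *paths, int nr,
		       int btree_id, bool cached, int pos)
{
	for (int i = 0; i < nr; i++) {
		int cmp = (paths[i].btree_id > btree_id) - (paths[i].btree_id < btree_id);

		if (!cmp)
			cmp = (int) paths[i].cached - (int) cached;
		if (cmp > 0)
			break;		/* sorted: nothing later can match */
		if (cmp < 0)
			continue;
		if (pos >= paths[i].min_key && pos <= paths[i].max_key)
			return true;
	}
	return false;
}

int main(void)
{
	struct toy_path paths[] = {
		{ 0, false, 0, 99 },
		{ 2, false, 0, 10 },
	};

	printf("%d %d\n",
	       pos_locked(paths, 2, 0, false, 42),	/* 1 */
	       pos_locked(paths, 2, 1, false, 42));	/* 0 */
	return 0;
}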
+/* Btree path: fixups after btree updates */
+
static void btree_node_iter_set_set_pos(struct btree_node_iter *iter,
struct btree *b,
struct bset_tree *t,
@@ -670,40 +828,38 @@ static void btree_node_iter_set_set_pos(struct btree_node_iter *iter,
bch2_btree_node_iter_push(iter, b, k, btree_bkey_last(b, t));
}
-static void __bch2_btree_iter_fix_key_modified(struct btree_iter *iter,
+static void __bch2_btree_path_fix_key_modified(struct btree_path *path,
struct btree *b,
struct bkey_packed *where)
{
- struct btree_iter_level *l = &iter->l[b->c.level];
+ struct btree_path_level *l = &path->l[b->c.level];
if (where != bch2_btree_node_iter_peek_all(&l->iter, l->b))
return;
- if (bkey_iter_pos_cmp(l->b, where, &iter->real_pos) < 0)
+ if (bkey_iter_pos_cmp(l->b, where, &path->pos) < 0)
bch2_btree_node_iter_advance(&l->iter, l->b);
-
- btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK);
}
-void bch2_btree_iter_fix_key_modified(struct btree_iter *iter,
+void bch2_btree_path_fix_key_modified(struct btree_trans *trans,
struct btree *b,
struct bkey_packed *where)
{
- struct btree_iter *linked;
+ struct btree_path *path;
- trans_for_each_iter_with_node(iter->trans, b, linked) {
- __bch2_btree_iter_fix_key_modified(linked, b, where);
- bch2_btree_iter_verify_level(linked, b->c.level);
+ trans_for_each_path_with_node(trans, b, path) {
+ __bch2_btree_path_fix_key_modified(path, b, where);
+ bch2_btree_path_verify_level(trans, path, b->c.level);
}
}
-static void __bch2_btree_node_iter_fix(struct btree_iter *iter,
- struct btree *b,
- struct btree_node_iter *node_iter,
- struct bset_tree *t,
- struct bkey_packed *where,
- unsigned clobber_u64s,
- unsigned new_u64s)
+static void __bch2_btree_node_iter_fix(struct btree_path *path,
+ struct btree *b,
+ struct btree_node_iter *node_iter,
+ struct bset_tree *t,
+ struct bkey_packed *where,
+ unsigned clobber_u64s,
+ unsigned new_u64s)
{
const struct bkey_packed *end = btree_bkey_last(b, t);
struct btree_node_iter_set *set;
@@ -721,7 +877,7 @@ static void __bch2_btree_node_iter_fix(struct btree_iter *iter,
/* didn't find the bset in the iterator - might have to readd it: */
if (new_u64s &&
- bkey_iter_pos_cmp(b, where, &iter->real_pos) >= 0) {
+ bkey_iter_pos_cmp(b, where, &path->pos) >= 0) {
bch2_btree_node_iter_push(node_iter, b, where, end);
goto fixup_done;
} else {
@@ -736,7 +892,7 @@ found:
return;
if (new_u64s &&
- bkey_iter_pos_cmp(b, where, &iter->real_pos) >= 0) {
+ bkey_iter_pos_cmp(b, where, &path->pos) >= 0) {
set->k = offset;
} else if (set->k < offset + clobber_u64s) {
set->k = offset + new_u64s;
@@ -762,8 +918,7 @@ fixup_done:
*/
if (!bch2_btree_node_iter_end(node_iter) &&
iter_current_key_modified &&
- (b->c.level ||
- btree_node_type_is_extents(iter->btree_id))) {
+ b->c.level) {
struct bset_tree *t;
struct bkey_packed *k, *k2, *p;
@@ -788,14 +943,10 @@ fixup_done:
b, t, k2);
}
}
-
- if (!b->c.level &&
- node_iter == &iter->l[0].iter &&
- iter_current_key_modified)
- btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK);
}
-void bch2_btree_node_iter_fix(struct btree_iter *iter,
+void bch2_btree_node_iter_fix(struct btree_trans *trans,
+ struct btree_path *path,
struct btree *b,
struct btree_node_iter *node_iter,
struct bkey_packed *where,
@@ -803,26 +954,28 @@ void bch2_btree_node_iter_fix(struct btree_iter *iter,
unsigned new_u64s)
{
struct bset_tree *t = bch2_bkey_to_bset(b, where);
- struct btree_iter *linked;
+ struct btree_path *linked;
- if (node_iter != &iter->l[b->c.level].iter) {
- __bch2_btree_node_iter_fix(iter, b, node_iter, t,
+ if (node_iter != &path->l[b->c.level].iter) {
+ __bch2_btree_node_iter_fix(path, b, node_iter, t,
where, clobber_u64s, new_u64s);
if (bch2_debug_check_iterators)
bch2_btree_node_iter_verify(node_iter, b);
}
- trans_for_each_iter_with_node(iter->trans, b, linked) {
+ trans_for_each_path_with_node(trans, b, linked) {
__bch2_btree_node_iter_fix(linked, b,
&linked->l[b->c.level].iter, t,
where, clobber_u64s, new_u64s);
- bch2_btree_iter_verify_level(linked, b->c.level);
+ bch2_btree_path_verify_level(trans, linked, b->c.level);
}
}
-static inline struct bkey_s_c __btree_iter_unpack(struct btree_iter *iter,
- struct btree_iter_level *l,
+/* Btree path level: pointer to a particular btree node and node iter */
+
+static inline struct bkey_s_c __btree_iter_unpack(struct bch_fs *c,
+ struct btree_path_level *l,
struct bkey *u,
struct bkey_packed *k)
{
@@ -847,49 +1000,52 @@ static inline struct bkey_s_c __btree_iter_unpack(struct btree_iter *iter,
* assertion here:
*/
if (bch2_debug_check_bkeys && !bkey_deleted(ret.k))
- bch2_bkey_debugcheck(iter->trans->c, l->b, ret);
+ bch2_bkey_debugcheck(c, l->b, ret);
return ret;
}
-/* peek_all() doesn't skip deleted keys */
-static inline struct bkey_s_c btree_iter_level_peek_all(struct btree_iter *iter,
- struct btree_iter_level *l,
+static inline struct bkey_s_c btree_path_level_peek_all(struct bch_fs *c,
+ struct btree_path_level *l,
struct bkey *u)
{
- return __btree_iter_unpack(iter, l, u,
+ return __btree_iter_unpack(c, l, u,
bch2_btree_node_iter_peek_all(&l->iter, l->b));
}
-static inline struct bkey_s_c btree_iter_level_peek(struct btree_iter *iter,
- struct btree_iter_level *l)
+static inline struct bkey_s_c btree_path_level_peek(struct bch_fs *c,
+ struct btree_path *path,
+ struct btree_path_level *l,
+ struct bkey *u)
{
- struct bkey_s_c k = __btree_iter_unpack(iter, l, &iter->k,
+ struct bkey_s_c k = __btree_iter_unpack(c, l, u,
bch2_btree_node_iter_peek(&l->iter, l->b));
- iter->real_pos = k.k ? k.k->p : l->b->key.k.p;
+ path->pos = k.k ? k.k->p : l->b->key.k.p;
return k;
}
-static inline struct bkey_s_c btree_iter_level_prev(struct btree_iter *iter,
- struct btree_iter_level *l)
+static inline struct bkey_s_c btree_path_level_prev(struct bch_fs *c,
+ struct btree_path *path,
+ struct btree_path_level *l,
+ struct bkey *u)
{
- struct bkey_s_c k = __btree_iter_unpack(iter, l, &iter->k,
+ struct bkey_s_c k = __btree_iter_unpack(c, l, u,
bch2_btree_node_iter_prev(&l->iter, l->b));
- iter->real_pos = k.k ? k.k->p : l->b->data->min_key;
+ path->pos = k.k ? k.k->p : l->b->data->min_key;
return k;
}
-static inline bool btree_iter_advance_to_pos(struct btree_iter *iter,
- struct btree_iter_level *l,
+static inline bool btree_path_advance_to_pos(struct btree_path *path,
+ struct btree_path_level *l,
int max_advance)
{
struct bkey_packed *k;
int nr_advanced = 0;
while ((k = bch2_btree_node_iter_peek_all(&l->iter, l->b)) &&
- bkey_iter_pos_cmp(l->b, k, &iter->real_pos) < 0) {
+ bkey_iter_pos_cmp(l->b, k, &path->pos) < 0) {
if (max_advance > 0 && nr_advanced >= max_advance)
return false;
@@ -903,9 +1059,10 @@ static inline bool btree_iter_advance_to_pos(struct btree_iter *iter,
/*
* Verify that iterator for parent node points to child node:
*/
-static void btree_iter_verify_new_node(struct btree_iter *iter, struct btree *b)
+static void btree_path_verify_new_node(struct btree_trans *trans,
+ struct btree_path *path, struct btree *b)
{
- struct btree_iter_level *l;
+ struct btree_path_level *l;
unsigned plevel;
bool parent_locked;
struct bkey_packed *k;
@@ -914,15 +1071,15 @@ static void btree_iter_verify_new_node(struct btree_iter *iter, struct btree *b)
return;
plevel = b->c.level + 1;
- if (!btree_iter_node(iter, plevel))
+ if (!btree_path_node(path, plevel))
return;
- parent_locked = btree_node_locked(iter, plevel);
+ parent_locked = btree_node_locked(path, plevel);
- if (!bch2_btree_node_relock(iter, plevel))
+ if (!bch2_btree_node_relock(trans, path, plevel))
return;
- l = &iter->l[plevel];
+ l = &path->l[plevel];
k = bch2_btree_node_iter_peek_all(&l->iter, l->b);
if (!k ||
bkey_deleted(k) ||
@@ -933,8 +1090,8 @@ static void btree_iter_verify_new_node(struct btree_iter *iter, struct btree *b)
char buf4[100];
struct bkey uk = bkey_unpack_key(b, k);
- bch2_dump_btree_node(iter->trans->c, l->b);
- bch2_bpos_to_text(&PBUF(buf1), iter->real_pos);
+ bch2_dump_btree_node(trans->c, l->b);
+ bch2_bpos_to_text(&PBUF(buf1), path->pos);
bch2_bkey_to_text(&PBUF(buf2), &uk);
bch2_bpos_to_text(&PBUF(buf3), b->data->min_key);
 bch2_bpos_to_text(&PBUF(buf4), b->data->max_key);
@@ -942,20 +1099,20 @@ static void btree_iter_verify_new_node(struct btree_iter *iter, struct btree *b)
"iter pos %s %s\n"
"iter key %s\n"
"new node %s-%s\n",
- bch2_btree_ids[iter->btree_id], buf1,
+ bch2_btree_ids[path->btree_id], buf1,
buf2, buf3, buf4);
}
if (!parent_locked)
- btree_node_unlock(iter, b->c.level + 1);
+ btree_node_unlock(path, plevel);
}
-static inline void __btree_iter_init(struct btree_iter *iter,
- unsigned level)
+static inline void __btree_path_level_init(struct btree_path *path,
+ unsigned level)
{
- struct btree_iter_level *l = &iter->l[level];
+ struct btree_path_level *l = &path->l[level];
- bch2_btree_node_iter_init(&l->iter, l->b, &iter->real_pos);
+ bch2_btree_node_iter_init(&l->iter, l->b, &path->pos);
/*
* Iterators to interior nodes should always be pointed at the first non
@@ -963,63 +1120,48 @@ static inline void __btree_iter_init(struct btree_iter *iter,
*/
if (level)
bch2_btree_node_iter_peek(&l->iter, l->b);
-
- btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK);
}
-static inline void btree_iter_node_set(struct btree_iter *iter,
- struct btree *b)
+static inline void btree_path_level_init(struct btree_trans *trans,
+ struct btree_path *path,
+ struct btree *b)
{
- BUG_ON(btree_iter_type(iter) == BTREE_ITER_CACHED);
+ BUG_ON(path->cached);
- btree_iter_verify_new_node(iter, b);
+ btree_path_verify_new_node(trans, path, b);
- EBUG_ON(!btree_iter_pos_in_node(iter, b));
+ EBUG_ON(!btree_path_pos_in_node(path, b));
EBUG_ON(b->c.lock.state.seq & 1);
- iter->l[b->c.level].lock_seq = b->c.lock.state.seq;
- iter->l[b->c.level].b = b;
- __btree_iter_init(iter, b->c.level);
+ path->l[b->c.level].lock_seq = b->c.lock.state.seq;
+ path->l[b->c.level].b = b;
+ __btree_path_level_init(path, b->c.level);
}
+/* Btree path: fixups after btree node updates: */
+
/*
* A btree node is being replaced - update the iterator to point to the new
* node:
*/
-void bch2_btree_iter_node_replace(struct btree_iter *iter, struct btree *b)
+void bch2_trans_node_add(struct btree_trans *trans, struct btree *b)
{
- enum btree_node_locked_type t;
- struct btree_iter *linked;
+ struct btree_path *path;
- trans_for_each_iter(iter->trans, linked)
- if (btree_iter_type(linked) != BTREE_ITER_CACHED &&
- btree_iter_pos_in_node(linked, b)) {
- /*
- * bch2_btree_iter_node_drop() has already been called -
- * the old node we're replacing has already been
- * unlocked and the pointer invalidated
- */
- BUG_ON(btree_node_locked(linked, b->c.level));
+ trans_for_each_path(trans, path)
+ if (!path->cached &&
+ btree_path_pos_in_node(path, b)) {
+ enum btree_node_locked_type t =
+ btree_lock_want(path, b->c.level);
- t = btree_lock_want(linked, b->c.level);
- if (t != BTREE_NODE_UNLOCKED) {
+ if (path->nodes_locked &&
+ t != BTREE_NODE_UNLOCKED) {
+ btree_node_unlock(path, b->c.level);
six_lock_increment(&b->c.lock, t);
- mark_btree_node_locked(linked, b->c.level, t);
+ mark_btree_node_locked(path, b->c.level, t);
}
- btree_iter_node_set(linked, b);
- }
-}
-
-void bch2_btree_iter_node_drop(struct btree_iter *iter, struct btree *b)
-{
- struct btree_iter *linked;
- unsigned level = b->c.level;
-
- trans_for_each_iter(iter->trans, linked)
- if (linked->l[level].b == b) {
- btree_node_unlock(linked, level);
- linked->l[level].b = BTREE_ITER_NO_NODE_DROP;
+ btree_path_level_init(trans, path, b);
}
}
@@ -1027,14 +1169,16 @@ void bch2_btree_iter_node_drop(struct btree_iter *iter, struct btree *b)
* A btree node has been modified in such a way as to invalidate iterators - fix
* them:
*/
-void bch2_btree_iter_reinit_node(struct btree_iter *iter, struct btree *b)
+void bch2_trans_node_reinit_iter(struct btree_trans *trans, struct btree *b)
{
- struct btree_iter *linked;
+ struct btree_path *path;
- trans_for_each_iter_with_node(iter->trans, b, linked)
- __btree_iter_init(linked, b->c.level);
+ trans_for_each_path_with_node(trans, b, path)
+ __btree_path_level_init(path, b->c.level);
}
+/* Btree path: traverse, set_pos: */
+
static int lock_root_check_fn(struct six_lock *lock, void *p)
{
struct btree *b = container_of(lock, struct btree, c.lock);
@@ -1043,52 +1187,56 @@ static int lock_root_check_fn(struct six_lock *lock, void *p)
return b == *rootp ? 0 : -1;
}
-static inline int btree_iter_lock_root(struct btree_iter *iter,
+static inline int btree_path_lock_root(struct btree_trans *trans,
+ struct btree_path *path,
unsigned depth_want,
unsigned long trace_ip)
{
- struct bch_fs *c = iter->trans->c;
- struct btree *b, **rootp = &c->btree_roots[iter->btree_id].b;
+ struct bch_fs *c = trans->c;
+ struct btree *b, **rootp = &c->btree_roots[path->btree_id].b;
enum six_lock_type lock_type;
unsigned i;
- EBUG_ON(iter->nodes_locked);
+ EBUG_ON(path->nodes_locked);
while (1) {
b = READ_ONCE(*rootp);
- iter->level = READ_ONCE(b->c.level);
+ path->level = READ_ONCE(b->c.level);
- if (unlikely(iter->level < depth_want)) {
+ if (unlikely(path->level < depth_want)) {
/*
* the root is at a lower depth than the depth we want:
* got to the end of the btree, or we're walking nodes
* greater than some depth and there are no nodes >=
* that depth
*/
- iter->level = depth_want;
- for (i = iter->level; i < BTREE_MAX_DEPTH; i++)
- iter->l[i].b = NULL;
+ path->level = depth_want;
+ for (i = path->level; i < BTREE_MAX_DEPTH; i++)
+ path->l[i].b = NULL;
return 1;
}
- lock_type = __btree_lock_want(iter, iter->level);
- if (unlikely(!btree_node_lock(b, POS_MAX, iter->level,
- iter, lock_type,
+ lock_type = __btree_lock_want(path, path->level);
+ if (unlikely(!btree_node_lock(trans, path, b, SPOS_MAX,
+ path->level, lock_type,
lock_root_check_fn, rootp,
- trace_ip)))
- return -EINTR;
+ trace_ip))) {
+ if (trans->restarted)
+ return -EINTR;
+ continue;
+ }
if (likely(b == READ_ONCE(*rootp) &&
- b->c.level == iter->level &&
+ b->c.level == path->level &&
!race_fault())) {
- for (i = 0; i < iter->level; i++)
- iter->l[i].b = BTREE_ITER_NO_NODE_LOCK_ROOT;
- iter->l[iter->level].b = b;
- for (i = iter->level + 1; i < BTREE_MAX_DEPTH; i++)
- iter->l[i].b = NULL;
-
- mark_btree_node_locked(iter, iter->level, lock_type);
- btree_iter_node_set(iter, b);
+ for (i = 0; i < path->level; i++)
+ path->l[i].b = BTREE_ITER_NO_NODE_LOCK_ROOT;
+ path->l[path->level].b = b;
+ for (i = path->level + 1; i < BTREE_MAX_DEPTH; i++)
+ path->l[i].b = NULL;
+
+ mark_btree_node_locked(path, path->level, lock_type);
+ btree_path_level_init(trans, path, b);
return 0;
}
@@ -1097,22 +1245,23 @@ static inline int btree_iter_lock_root(struct btree_iter *iter,
}
noinline
-static void btree_iter_prefetch(struct btree_iter *iter)
+static int btree_path_prefetch(struct btree_trans *trans, struct btree_path *path)
{
- struct bch_fs *c = iter->trans->c;
- struct btree_iter_level *l = &iter->l[iter->level];
+ struct bch_fs *c = trans->c;
+ struct btree_path_level *l = path_l(path);
struct btree_node_iter node_iter = l->iter;
struct bkey_packed *k;
struct bkey_buf tmp;
unsigned nr = test_bit(BCH_FS_STARTED, &c->flags)
- ? (iter->level > 1 ? 0 : 2)
- : (iter->level > 1 ? 1 : 16);
- bool was_locked = btree_node_locked(iter, iter->level);
+ ? (path->level > 1 ? 0 : 2)
+ : (path->level > 1 ? 1 : 16);
+ bool was_locked = btree_node_locked(path, path->level);
+ int ret = 0;
bch2_bkey_buf_init(&tmp);
- while (nr) {
- if (!bch2_btree_node_relock(iter, iter->level))
+ while (nr && !ret) {
+ if (!bch2_btree_node_relock(trans, path, path->level))
break;
bch2_btree_node_iter_advance(&node_iter, l->b);
@@ -1121,25 +1270,27 @@ static void btree_iter_prefetch(struct btree_iter *iter)
break;
bch2_bkey_buf_unpack(&tmp, c, l->b, k);
- bch2_btree_node_prefetch(c, iter, tmp.k, iter->btree_id,
- iter->level - 1);
+ ret = bch2_btree_node_prefetch(c, trans, path, tmp.k, path->btree_id,
+ path->level - 1);
}
if (!was_locked)
- btree_node_unlock(iter, iter->level);
+ btree_node_unlock(path, path->level);
bch2_bkey_buf_exit(&tmp, c);
+ return ret;
}
-static noinline void btree_node_mem_ptr_set(struct btree_iter *iter,
+static noinline void btree_node_mem_ptr_set(struct btree_trans *trans,
+ struct btree_path *path,
unsigned plevel, struct btree *b)
{
- struct btree_iter_level *l = &iter->l[plevel];
- bool locked = btree_node_locked(iter, plevel);
+ struct btree_path_level *l = &path->l[plevel];
+ bool locked = btree_node_locked(path, plevel);
struct bkey_packed *k;
struct bch_btree_ptr_v2 *bp;
- if (!bch2_btree_node_relock(iter, plevel))
+ if (!bch2_btree_node_relock(trans, path, plevel))
return;
k = bch2_btree_node_iter_peek_all(&l->iter, l->b);
@@ -1149,92 +1300,84 @@ static noinline void btree_node_mem_ptr_set(struct btree_iter *iter,
bp->mem_ptr = (unsigned long)b;
if (!locked)
- btree_node_unlock(iter, plevel);
+ btree_node_unlock(path, plevel);
}
-static __always_inline int btree_iter_down(struct btree_iter *iter,
+static __always_inline int btree_path_down(struct btree_trans *trans,
+ struct btree_path *path,
+ unsigned flags,
unsigned long trace_ip)
{
- struct bch_fs *c = iter->trans->c;
- struct btree_iter_level *l = &iter->l[iter->level];
+ struct bch_fs *c = trans->c;
+ struct btree_path_level *l = path_l(path);
struct btree *b;
- unsigned level = iter->level - 1;
- enum six_lock_type lock_type = __btree_lock_want(iter, level);
+ unsigned level = path->level - 1;
+ enum six_lock_type lock_type = __btree_lock_want(path, level);
struct bkey_buf tmp;
int ret;
- EBUG_ON(!btree_node_locked(iter, iter->level));
+ EBUG_ON(!btree_node_locked(path, path->level));
bch2_bkey_buf_init(&tmp);
bch2_bkey_buf_unpack(&tmp, c, l->b,
bch2_btree_node_iter_peek(&l->iter, l->b));
- b = bch2_btree_node_get(c, iter, tmp.k, level, lock_type, trace_ip);
+ b = bch2_btree_node_get(trans, path, tmp.k, level, lock_type, trace_ip);
ret = PTR_ERR_OR_ZERO(b);
if (unlikely(ret))
goto err;
- mark_btree_node_locked(iter, level, lock_type);
- btree_iter_node_set(iter, b);
+ mark_btree_node_locked(path, level, lock_type);
+ btree_path_level_init(trans, path, b);
if (tmp.k->k.type == KEY_TYPE_btree_ptr_v2 &&
unlikely(b != btree_node_mem_ptr(tmp.k)))
- btree_node_mem_ptr_set(iter, level + 1, b);
+ btree_node_mem_ptr_set(trans, path, level + 1, b);
- if (iter->flags & BTREE_ITER_PREFETCH)
- btree_iter_prefetch(iter);
+ if (flags & BTREE_ITER_PREFETCH)
+ ret = btree_path_prefetch(trans, path);
- iter->level = level;
+ if (btree_node_read_locked(path, level + 1))
+ btree_node_unlock(path, level + 1);
+ path->level = level;
+
+ bch2_btree_path_verify_locks(path);
err:
bch2_bkey_buf_exit(&tmp, c);
return ret;
}
-static int btree_iter_traverse_one(struct btree_iter *, unsigned long);
+static int btree_path_traverse_one(struct btree_trans *, struct btree_path *,
+ unsigned, unsigned long);
-static int __btree_iter_traverse_all(struct btree_trans *trans, int ret,
+static int __btree_path_traverse_all(struct btree_trans *trans, int ret,
unsigned long trace_ip)
{
struct bch_fs *c = trans->c;
- struct btree_iter *iter;
- u8 sorted[BTREE_ITER_MAX];
- int i, nr_sorted = 0;
- bool relock_fail;
+ struct btree_path *path;
+ int i;
if (trans->in_traverse_all)
return -EINTR;
trans->in_traverse_all = true;
retry_all:
- nr_sorted = 0;
- relock_fail = false;
-
- trans_for_each_iter(trans, iter) {
- if (!bch2_btree_iter_relock(iter, _THIS_IP_))
- relock_fail = true;
- sorted[nr_sorted++] = iter->idx;
- }
-
- if (!relock_fail) {
- trans->in_traverse_all = false;
- return 0;
- }
+ trans->restarted = false;
-#define btree_iter_cmp_by_idx(_l, _r) \
- btree_iter_lock_cmp(&trans->iters[_l], &trans->iters[_r])
+ trans_for_each_path(trans, path)
+ path->should_be_locked = false;
- bubble_sort(sorted, nr_sorted, btree_iter_cmp_by_idx);
-#undef btree_iter_cmp_by_idx
+ btree_trans_verify_sorted(trans);
- for (i = nr_sorted - 2; i >= 0; --i) {
- struct btree_iter *iter1 = trans->iters + sorted[i];
- struct btree_iter *iter2 = trans->iters + sorted[i + 1];
+ for (i = trans->nr_sorted - 2; i >= 0; --i) {
+ struct btree_path *path1 = trans->paths + trans->sorted[i];
+ struct btree_path *path2 = trans->paths + trans->sorted[i + 1];
- if (iter1->btree_id == iter2->btree_id &&
- iter1->locks_want < iter2->locks_want)
- __bch2_btree_iter_upgrade(iter1, iter2->locks_want);
- else if (!iter1->locks_want && iter2->locks_want)
- __bch2_btree_iter_upgrade(iter1, 1);
+ if (path1->btree_id == path2->btree_id &&
+ path1->locks_want < path2->locks_want)
+ __bch2_btree_path_upgrade(trans, path1, path2->locks_want);
+ else if (!path1->locks_want && path2->locks_want)
+ __bch2_btree_path_upgrade(trans, path1, 1);
}
bch2_trans_unlock(trans);
@@ -1251,37 +1394,36 @@ retry_all:
} while (ret);
}
- if (unlikely(ret == -EIO)) {
- trans->error = true;
+ if (unlikely(ret == -EIO))
goto out;
- }
BUG_ON(ret && ret != -EINTR);
/* Now, redo traversals in correct order: */
- for (i = 0; i < nr_sorted; i++) {
- unsigned idx = sorted[i];
+ i = 0;
+ while (i < trans->nr_sorted) {
+ path = trans->paths + trans->sorted[i];
- /*
- * sucessfully traversing one iterator can cause another to be
- * unlinked, in btree_key_cache_fill()
- */
- if (!(trans->iters_linked & (1ULL << idx)))
- continue;
+ EBUG_ON(!(trans->paths_allocated & (1ULL << path->idx)));
- ret = btree_iter_traverse_one(&trans->iters[idx], _THIS_IP_);
+ ret = btree_path_traverse_one(trans, path, 0, _THIS_IP_);
if (ret)
goto retry_all;
+
+ EBUG_ON(!(trans->paths_allocated & (1ULL << path->idx)));
+
+ if (path->nodes_locked ||
+ !btree_path_node(path, path->level))
+ i++;
}
- if (hweight64(trans->iters_live) > 1)
- ret = -EINTR;
- else
- trans_for_each_iter(trans, iter)
- if (iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT) {
- ret = -EINTR;
- break;
- }
+ /*
+ * BTREE_ITER_NEED_RELOCK is ok here - if we called bch2_trans_unlock()
+ * and relock(), relock() won't relock since path->should_be_locked
+ * isn't set yet, which is all fine
+ */
+ trans_for_each_path(trans, path)
+ BUG_ON(path->uptodate >= BTREE_ITER_NEED_TRAVERSE);
out:
bch2_btree_cache_cannibalize_unlock(c);
@@ -1291,37 +1433,50 @@ out:
return ret;
}
-int bch2_btree_iter_traverse_all(struct btree_trans *trans)
+static int bch2_btree_path_traverse_all(struct btree_trans *trans)
{
- return __btree_iter_traverse_all(trans, 0, _RET_IP_);
+ return __btree_path_traverse_all(trans, 0, _RET_IP_);
}
-static inline bool btree_iter_good_node(struct btree_iter *iter,
+static inline bool btree_path_good_node(struct btree_trans *trans,
+ struct btree_path *path,
unsigned l, int check_pos)
{
- if (!is_btree_node(iter, l) ||
- !bch2_btree_node_relock(iter, l))
+ if (!is_btree_node(path, l) ||
+ !bch2_btree_node_relock(trans, path, l))
return false;
- if (check_pos < 0 && btree_iter_pos_before_node(iter, iter->l[l].b))
+ if (check_pos < 0 && btree_path_pos_before_node(path, path->l[l].b))
return false;
- if (check_pos > 0 && btree_iter_pos_after_node(iter, iter->l[l].b))
+ if (check_pos > 0 && btree_path_pos_after_node(path, path->l[l].b))
return false;
return true;
}
-static inline unsigned btree_iter_up_until_good_node(struct btree_iter *iter,
+static inline unsigned btree_path_up_until_good_node(struct btree_trans *trans,
+ struct btree_path *path,
int check_pos)
{
- unsigned l = iter->level;
+ unsigned i, l = path->level;
- while (btree_iter_node(iter, l) &&
- !btree_iter_good_node(iter, l, check_pos)) {
- btree_node_unlock(iter, l);
- iter->l[l].b = BTREE_ITER_NO_NODE_UP;
+ while (btree_path_node(path, l) &&
+ !btree_path_good_node(trans, path, l, check_pos)) {
+ btree_node_unlock(path, l);
+ path->l[l].b = BTREE_ITER_NO_NODE_UP;
l++;
}
+ /* If we need intent locks, take them too: */
+ for (i = l + 1;
+ i < path->locks_want && btree_path_node(path, i);
+ i++)
+ if (!bch2_btree_node_relock(trans, path, i))
+ while (l <= i) {
+ btree_node_unlock(path, l);
+ path->l[l].b = BTREE_ITER_NO_NODE_UP;
+ l++;
+ }
+
return l;
}
@@ -1334,102 +1489,436 @@ static inline unsigned btree_iter_up_until_good_node(struct btree_iter *iter,
* On error, caller (peek_node()/peek_key()) must return NULL; the error is
* stashed in the iterator and returned from bch2_trans_exit().
*/
-static int btree_iter_traverse_one(struct btree_iter *iter,
+static int btree_path_traverse_one(struct btree_trans *trans,
+ struct btree_path *path,
+ unsigned flags,
unsigned long trace_ip)
{
- unsigned depth_want = iter->level;
+ unsigned depth_want = path->level;
int ret = 0;
+ if (unlikely(trans->restarted)) {
+ ret = -EINTR;
+ goto out;
+ }
+
/*
- * if we need interior nodes locked, call btree_iter_relock() to make
- * sure we walk back up enough that we lock them:
+ * Ensure we obey path->should_be_locked: if it's set, we can't unlock
+ * and re-traverse the path without a transaction restart:
*/
- if (iter->uptodate == BTREE_ITER_NEED_RELOCK ||
- iter->locks_want > 1)
- bch2_btree_iter_relock(iter, _THIS_IP_);
-
- if (btree_iter_type(iter) == BTREE_ITER_CACHED) {
- ret = bch2_btree_iter_traverse_cached(iter);
+ if (path->should_be_locked) {
+ ret = bch2_btree_path_relock(trans, path, trace_ip) ? 0 : -EINTR;
goto out;
}
- if (iter->uptodate < BTREE_ITER_NEED_RELOCK)
+ if (path->cached) {
+ ret = bch2_btree_path_traverse_cached(trans, path, flags);
goto out;
+ }
- if (unlikely(iter->level >= BTREE_MAX_DEPTH))
+ if (unlikely(path->level >= BTREE_MAX_DEPTH))
goto out;
- iter->level = btree_iter_up_until_good_node(iter, 0);
+ path->level = btree_path_up_until_good_node(trans, path, 0);
/*
- * Note: iter->nodes[iter->level] may be temporarily NULL here - that
+ * Note: path->l[path->level].b may be temporarily NULL here - that
* would indicate to other code that we got to the end of the btree,
* here it indicates that relocking the root failed - it's critical that
- * btree_iter_lock_root() comes next and that it can't fail
+ * btree_path_lock_root() comes next and that it can't fail
*/
- while (iter->level > depth_want) {
- ret = btree_iter_node(iter, iter->level)
- ? btree_iter_down(iter, trace_ip)
- : btree_iter_lock_root(iter, depth_want, trace_ip);
+ while (path->level > depth_want) {
+ ret = btree_path_node(path, path->level)
+ ? btree_path_down(trans, path, flags, trace_ip)
+ : btree_path_lock_root(trans, path, depth_want, trace_ip);
if (unlikely(ret)) {
if (ret == 1) {
/*
- * Got to the end of the btree (in
- * BTREE_ITER_NODES mode)
+ * No nodes at this level - got to the end of
+ * the btree:
*/
ret = 0;
goto out;
}
- iter->level = depth_want;
+ __bch2_btree_path_unlock(path);
+ path->level = depth_want;
- if (ret == -EIO) {
- iter->flags |= BTREE_ITER_ERROR;
- iter->l[iter->level].b =
+ if (ret == -EIO)
+ path->l[path->level].b =
BTREE_ITER_NO_NODE_ERROR;
- } else {
- iter->l[iter->level].b =
+ else
+ path->l[path->level].b =
BTREE_ITER_NO_NODE_DOWN;
- }
goto out;
}
}
- iter->uptodate = BTREE_ITER_NEED_PEEK;
+ path->uptodate = BTREE_ITER_UPTODATE;
out:
- trace_iter_traverse(iter->trans->ip, trace_ip,
- iter->btree_id, &iter->real_pos, ret);
- bch2_btree_iter_verify(iter);
+ BUG_ON((ret == -EINTR) != !!trans->restarted);
+ bch2_btree_path_verify(trans, path);
return ret;
}
-static int __must_check __bch2_btree_iter_traverse(struct btree_iter *iter)
+static int __btree_path_traverse_all(struct btree_trans *, int, unsigned long);
+
+int __must_check bch2_btree_path_traverse(struct btree_trans *trans,
+ struct btree_path *path, unsigned flags)
{
- struct btree_trans *trans = iter->trans;
- int ret;
+ if (path->uptodate < BTREE_ITER_NEED_RELOCK)
+ return 0;
- ret = bch2_trans_cond_resched(trans) ?:
- btree_iter_traverse_one(iter, _RET_IP_);
- if (unlikely(ret))
- ret = __btree_iter_traverse_all(trans, ret, _RET_IP_);
+ return bch2_trans_cond_resched(trans) ?:
+ btree_path_traverse_one(trans, path, flags, _RET_IP_);
+}
- return ret;
+static void btree_path_copy(struct btree_trans *trans, struct btree_path *dst,
+ struct btree_path *src)
+{
+ unsigned i;
+
+ memcpy(&dst->pos, &src->pos,
+ sizeof(struct btree_path) - offsetof(struct btree_path, pos));
+
+ for (i = 0; i < BTREE_MAX_DEPTH; i++)
+ if (btree_node_locked(dst, i))
+ six_lock_increment(&dst->l[i].b->c.lock,
+ __btree_lock_want(dst, i));
+
+ btree_path_check_sort(trans, dst, 0);
}
-/*
- * Note:
- * bch2_btree_iter_traverse() is for external users, btree_iter_traverse() is
- * for internal btree iterator users
- *
- * bch2_btree_iter_traverse sets iter->real_pos to iter->pos,
- * btree_iter_traverse() does not:
- */
-static inline int __must_check
-btree_iter_traverse(struct btree_iter *iter)
+static struct btree_path *btree_path_clone(struct btree_trans *trans, struct btree_path *src,
+ bool intent)
+{
+ struct btree_path *new = btree_path_alloc(trans, src);
+
+ btree_path_copy(trans, new, src);
+ __btree_path_get(new, intent);
+ return new;
+}
+
+inline struct btree_path * __must_check
+bch2_btree_path_make_mut(struct btree_trans *trans,
+ struct btree_path *path, bool intent,
+ unsigned long ip)
+{
+ if (path->ref > 1 || path->preserve) {
+ __btree_path_put(path, intent);
+ path = btree_path_clone(trans, path, intent);
+ path->preserve = false;
+#ifdef CONFIG_BCACHEFS_DEBUG
+ path->ip_allocated = ip;
+#endif
+ btree_trans_verify_sorted(trans);
+ }
+
+ return path;
+}
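Aside (illustration, not part of the patch): bch2_btree_path_make_mut() is a copy-on-write step - if the path is shared (ref > 1) or marked preserve, the caller gets a private clone to reposition while other holders keep the original. A minimal standalone sketch of that idea with a plain refcount; it deliberately omits the lock-count increments and list re-sorting the real code also does:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct toy_path { int ref; int preserve; int pos; };

static struct toy_path *make_mut(struct toy_path *p)
{
	if (p->ref > 1 || p->preserve) {
		struct toy_path *clone = malloc(sizeof(*clone));

		memcpy(clone, p, sizeof(*clone));
		clone->ref = 1;
		clone->preserve = 0;
		p->ref--;		/* drop our reference on the original */
		return clone;
	}
	return p;
}

int main(void)
{
	struct toy_path shared = { .ref = 2, .preserve = 0, .pos = 10 };
	struct toy_path *mine = make_mut(&shared);

	mine->pos = 20;
	printf("original pos %d, private pos %d\n", shared.pos, mine->pos);
	if (mine != &shared)
		free(mine);
	return 0;
}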
+
+static struct btree_path * __must_check
+btree_path_set_pos(struct btree_trans *trans,
+ struct btree_path *path, struct bpos new_pos,
+ bool intent, unsigned long ip)
+{
+ int cmp = bpos_cmp(new_pos, path->pos);
+ unsigned l = path->level;
+
+ EBUG_ON(trans->restarted);
+ EBUG_ON(!path->ref);
+
+ if (!cmp)
+ return path;
+
+ path = bch2_btree_path_make_mut(trans, path, intent, ip);
+
+ path->pos = new_pos;
+ path->should_be_locked = false;
+
+ btree_path_check_sort(trans, path, cmp);
+
+ if (unlikely(path->cached)) {
+ btree_node_unlock(path, 0);
+ path->l[0].b = BTREE_ITER_NO_NODE_CACHED;
+ btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
+ goto out;
+ }
+
+ l = btree_path_up_until_good_node(trans, path, cmp);
+
+ if (btree_path_node(path, l)) {
+ /*
+ * We might have to skip over many keys, or just a few: try
+ * advancing the node iterator, and if we have to skip over too
+ * many keys just reinit it (or if we're rewinding, since that
+ * is expensive).
+ */
+ if (cmp < 0 ||
+ !btree_path_advance_to_pos(path, &path->l[l], 8))
+ __btree_path_level_init(path, l);
+ }
+
+ if (l != path->level) {
+ btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
+ __bch2_btree_path_unlock(path);
+ }
+out:
+ bch2_btree_path_verify(trans, path);
+ return path;
+}
+
+/* Btree path: main interface: */
+
+static struct btree_path *have_path_at_pos(struct btree_trans *trans, struct btree_path *path)
{
- return iter->uptodate >= BTREE_ITER_NEED_RELOCK
- ? __bch2_btree_iter_traverse(iter)
- : 0;
+ struct btree_path *next;
+
+ next = prev_btree_path(trans, path);
+ if (next && !btree_path_cmp(next, path))
+ return next;
+
+ next = next_btree_path(trans, path);
+ if (next && !btree_path_cmp(next, path))
+ return next;
+
+ return NULL;
+}
+
+static struct btree_path *have_node_at_pos(struct btree_trans *trans, struct btree_path *path)
+{
+ struct btree_path *next;
+
+ next = prev_btree_path(trans, path);
+ if (next && next->level == path->level && path_l(next)->b == path_l(path)->b)
+ return next;
+
+ next = next_btree_path(trans, path);
+ if (next && next->level == path->level && path_l(next)->b == path_l(path)->b)
+ return next;
+
+ return NULL;
+}
+
+static inline void __bch2_path_free(struct btree_trans *trans, struct btree_path *path)
+{
+ __bch2_btree_path_unlock(path);
+ btree_path_list_remove(trans, path);
+ trans->paths_allocated &= ~(1ULL << path->idx);
+}
+
+void bch2_path_put(struct btree_trans *trans, struct btree_path *path, bool intent)
+{
+ struct btree_path *dup;
+
+ EBUG_ON(trans->paths + path->idx != path);
+ EBUG_ON(!path->ref);
+
+ if (!__btree_path_put(path, intent))
+ return;
+
+ /*
+ * Perhaps instead we should check for duplicate paths in traverse_all:
+ */
+ if (path->preserve &&
+ (dup = have_path_at_pos(trans, path))) {
+ dup->preserve = true;
+ path->preserve = false;
+ goto free;
+ }
+
+ if (!path->preserve &&
+ (dup = have_node_at_pos(trans, path)))
+ goto free;
+ return;
+free:
+ if (path->should_be_locked &&
+ !btree_node_locked(dup, path->level))
+ return;
+
+ dup->should_be_locked |= path->should_be_locked;
+ __bch2_path_free(trans, path);
+}
+
+noinline __cold
+void bch2_dump_trans_paths_updates(struct btree_trans *trans)
+{
+ struct btree_path *path;
+ struct btree_insert_entry *i;
+ unsigned idx;
+ char buf1[300], buf2[300];
+
+ btree_trans_verify_sorted(trans);
+
+ trans_for_each_path_inorder(trans, path, idx)
+ printk(KERN_ERR "path: idx %u ref %u:%u%s%s btree %s pos %s locks %u %pS\n",
+ path->idx, path->ref, path->intent_ref,
+ path->should_be_locked ? " S" : "",
+ path->preserve ? " P" : "",
+ bch2_btree_ids[path->btree_id],
+ (bch2_bpos_to_text(&PBUF(buf1), path->pos), buf1),
+ path->nodes_locked,
+#ifdef CONFIG_BCACHEFS_DEBUG
+ (void *) path->ip_allocated
+#else
+ NULL
+#endif
+ );
+
+ trans_for_each_update(trans, i) {
+ struct bkey u;
+ struct bkey_s_c old = bch2_btree_path_peek_slot(i->path, &u);
+
+ printk(KERN_ERR "update: btree %s %pS\n old %s\n new %s",
+ bch2_btree_ids[i->btree_id],
+ (void *) i->ip_allocated,
+ (bch2_bkey_val_to_text(&PBUF(buf1), trans->c, old), buf1),
+ (bch2_bkey_val_to_text(&PBUF(buf2), trans->c, bkey_i_to_s_c(i->k)), buf2));
+ }
+}
+
+static struct btree_path *btree_path_alloc(struct btree_trans *trans,
+ struct btree_path *pos)
+{
+ struct btree_path *path;
+ unsigned idx;
+
+ if (unlikely(trans->paths_allocated ==
+ ~((~0ULL << 1) << (BTREE_ITER_MAX - 1)))) {
+ bch2_dump_trans_paths_updates(trans);
+ panic("trans path overflow\n");
+ }
+
+ idx = __ffs64(~trans->paths_allocated);
+ trans->paths_allocated |= 1ULL << idx;
+
+ path = &trans->paths[idx];
+
+ path->idx = idx;
+ path->ref = 0;
+ path->intent_ref = 0;
+ path->nodes_locked = 0;
+ path->nodes_intent_locked = 0;
+
+ btree_path_list_add(trans, pos, path);
+ return path;
+}
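For reference, trans->paths_allocated is a 64-bit occupancy bitmap over trans->paths: bit i set means trans->paths[i] is live, __ffs64(~bitmap) yields the lowest free slot, and __bch2_path_free() clears the bit again. The overflow check above builds the "all BTREE_ITER_MAX bits set" mask with two shifts so the expression stays well defined even when BTREE_ITER_MAX is 64. A hedged sketch of the bitmap operations in isolation (helper names are hypothetical):

/* bit i set <=> slot i is in use */
static unsigned example_bitmap_alloc(u64 *bitmap)
{
        unsigned idx = __ffs64(~*bitmap);       /* lowest clear bit */

        *bitmap |= 1ULL << idx;
        return idx;
}

static void example_bitmap_free(u64 *bitmap, unsigned idx)
{
        *bitmap &= ~(1ULL << idx);
}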
+
+struct btree_path *bch2_path_get(struct btree_trans *trans, bool cached,
+ enum btree_id btree_id, struct bpos pos,
+ unsigned locks_want, unsigned level,
+ bool intent, unsigned long ip)
+{
+ struct btree_path *path, *path_pos = NULL;
+ int i;
+
+ BUG_ON(trans->restarted);
+
+ trans_for_each_path_inorder(trans, path, i) {
+ if (__btree_path_cmp(path,
+ btree_id,
+ cached,
+ pos,
+ level) > 0)
+ break;
+
+ path_pos = path;
+ }
+
+ if (path_pos &&
+ path_pos->cached == cached &&
+ path_pos->btree_id == btree_id &&
+ path_pos->level == level) {
+ __btree_path_get(path_pos, intent);
+ path = btree_path_set_pos(trans, path_pos, pos, intent, ip);
+ path->preserve = true;
+ } else {
+ path = btree_path_alloc(trans, path_pos);
+ path_pos = NULL;
+
+ __btree_path_get(path, intent);
+ path->pos = pos;
+ path->btree_id = btree_id;
+ path->cached = cached;
+ path->preserve = true;
+ path->uptodate = BTREE_ITER_NEED_TRAVERSE;
+ path->should_be_locked = false;
+ path->level = level;
+ path->locks_want = locks_want;
+ path->nodes_locked = 0;
+ path->nodes_intent_locked = 0;
+ for (i = 0; i < ARRAY_SIZE(path->l); i++)
+ path->l[i].b = BTREE_ITER_NO_NODE_INIT;
+#ifdef CONFIG_BCACHEFS_DEBUG
+ path->ip_allocated = ip;
+#endif
+ btree_trans_verify_sorted(trans);
+ }
+
+ if (path->intent_ref)
+ locks_want = max(locks_want, level + 1);
+
+ /*
+ * If the path has locks_want greater than requested, we don't downgrade
+ * it here - on transaction restart, because a btree node split needs to
+ * upgrade locks, we might be putting/getting the iterator again.
+ * Downgrading iterators only happens via bch2_trans_downgrade(), after
+ * a successful transaction commit.
+ */
+
+ locks_want = min(locks_want, BTREE_MAX_DEPTH);
+ if (locks_want > path->locks_want) {
+ path->locks_want = locks_want;
+ btree_path_get_locks(trans, path, true, _THIS_IP_);
+ }
+
+ return path;
+}
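A minimal sketch of how internal users are expected to pair bch2_path_get() with bch2_btree_path_traverse() and bch2_path_put() (hypothetical caller, not taken from this patch; btree id, position and flags are arbitrary):

static int example_path_user(struct btree_trans *trans)
{
        struct btree_path *path;
        int ret;

        path = bch2_path_get(trans, false, BTREE_ID_extents, POS_MIN,
                             0, 0, false, _THIS_IP_);

        ret = bch2_btree_path_traverse(trans, path, 0);
        if (!ret) {
                /* path->l[0] now points at a locked leaf node */
        }

        bch2_path_put(trans, path, false);
        return ret;
}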
+
+inline struct bkey_s_c bch2_btree_path_peek_slot(struct btree_path *path, struct bkey *u)
+{
+ struct bkey_s_c k;
+
+ BUG_ON(path->uptodate != BTREE_ITER_UPTODATE);
+
+ if (!path->cached) {
+ struct btree_path_level *l = path_l(path);
+ struct bkey_packed *_k =
+ bch2_btree_node_iter_peek_all(&l->iter, l->b);
+
+ k = _k ? bkey_disassemble(l->b, _k, u) : bkey_s_c_null;
+
+ EBUG_ON(k.k && bkey_deleted(k.k) && bpos_cmp(k.k->p, path->pos) == 0);
+
+ if (!k.k || bpos_cmp(path->pos, k.k->p))
+ goto hole;
+ } else {
+ struct bkey_cached *ck = (void *) path->l[0].b;
+
+ EBUG_ON(path->btree_id != ck->key.btree_id ||
+ bkey_cmp(path->pos, ck->key.pos));
+
+ /* BTREE_ITER_CACHED_NOFILL? */
+ if (unlikely(!ck->valid))
+ goto hole;
+
+ k = bkey_i_to_s_c(ck->k);
+ }
+
+ return k;
+hole:
+ bkey_init(u);
+ u->p = path->pos;
+ return (struct bkey_s_c) { u, NULL };
+}
+
+/* Btree iterators: */
+
+int __must_check
+__bch2_btree_iter_traverse(struct btree_iter *iter)
+{
+ return bch2_btree_path_traverse(iter->trans, iter->path, iter->flags);
}
int __must_check
@@ -1437,13 +1926,16 @@ bch2_btree_iter_traverse(struct btree_iter *iter)
{
int ret;
- btree_iter_set_search_pos(iter, btree_iter_search_key(iter));
+ iter->path = btree_path_set_pos(iter->trans, iter->path,
+ btree_iter_search_key(iter),
+ iter->flags & BTREE_ITER_INTENT,
+ iter->ip_allocated);
- ret = btree_iter_traverse(iter);
+ ret = bch2_btree_path_traverse(iter->trans, iter->path, iter->flags);
if (ret)
return ret;
- iter->should_be_locked = true;
+ iter->path->should_be_locked = true;
return 0;
}
@@ -1451,143 +1943,132 @@ bch2_btree_iter_traverse(struct btree_iter *iter)
struct btree *bch2_btree_iter_peek_node(struct btree_iter *iter)
{
- struct btree *b;
+ struct btree_trans *trans = iter->trans;
+ struct btree *b = NULL;
int ret;
- EBUG_ON(btree_iter_type(iter) != BTREE_ITER_NODES);
+ EBUG_ON(iter->path->cached);
bch2_btree_iter_verify(iter);
- ret = btree_iter_traverse(iter);
+ ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
if (ret)
- return NULL;
+ goto err;
- b = btree_iter_node(iter, iter->level);
+ b = btree_path_node(iter->path, iter->path->level);
if (!b)
- return NULL;
+ goto out;
BUG_ON(bpos_cmp(b->key.k.p, iter->pos) < 0);
- iter->pos = iter->real_pos = b->key.k.p;
+ bkey_init(&iter->k);
+ iter->k.p = iter->pos = b->key.k.p;
+ iter->path = btree_path_set_pos(trans, iter->path, b->key.k.p,
+ iter->flags & BTREE_ITER_INTENT,
+ iter->ip_allocated);
+ iter->path->should_be_locked = true;
+ BUG_ON(iter->path->uptodate);
+out:
+ bch2_btree_iter_verify_entry_exit(iter);
bch2_btree_iter_verify(iter);
- iter->should_be_locked = true;
return b;
+err:
+ b = ERR_PTR(ret);
+ goto out;
}
struct btree *bch2_btree_iter_next_node(struct btree_iter *iter)
{
- struct btree *b;
+ struct btree_trans *trans = iter->trans;
+ struct btree_path *path = iter->path;
+ struct btree *b = NULL;
+ unsigned l;
int ret;
- EBUG_ON(btree_iter_type(iter) != BTREE_ITER_NODES);
+ BUG_ON(trans->restarted);
+ EBUG_ON(iter->path->cached);
bch2_btree_iter_verify(iter);
- /* already got to end? */
- if (!btree_iter_node(iter, iter->level))
- return NULL;
-
- bch2_trans_cond_resched(iter->trans);
-
- btree_node_unlock(iter, iter->level);
- iter->l[iter->level].b = BTREE_ITER_NO_NODE_UP;
- iter->level++;
-
- btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE);
- ret = btree_iter_traverse(iter);
- if (ret)
+ /* already at end? */
+ if (!btree_path_node(path, path->level))
return NULL;
/* got to end? */
- b = btree_iter_node(iter, iter->level);
- if (!b)
+ if (!btree_path_node(path, path->level + 1)) {
+ btree_node_unlock(path, path->level);
+ path->l[path->level].b = BTREE_ITER_NO_NODE_UP;
+ path->level++;
return NULL;
+ }
- if (bpos_cmp(iter->pos, b->key.k.p) < 0) {
+ if (!bch2_btree_node_relock(trans, path, path->level + 1)) {
+ __bch2_btree_path_unlock(path);
+ path->l[path->level].b = BTREE_ITER_NO_NODE_GET_LOCKS;
+ path->l[path->level + 1].b = BTREE_ITER_NO_NODE_GET_LOCKS;
+ btree_trans_restart(trans);
+ ret = -EINTR;
+ goto err;
+ }
+
+ b = btree_path_node(path, path->level + 1);
+
+ if (!bpos_cmp(iter->pos, b->key.k.p)) {
+ btree_node_unlock(path, path->level);
+ path->l[path->level].b = BTREE_ITER_NO_NODE_UP;
+ path->level++;
+ } else {
/*
* Haven't gotten to the end of the parent node: go back down to
* the next child node
*/
- btree_iter_set_search_pos(iter, bpos_successor(iter->pos));
+ path = iter->path =
+ btree_path_set_pos(trans, path, bpos_successor(iter->pos),
+ iter->flags & BTREE_ITER_INTENT,
+ iter->ip_allocated);
+
+ path->level = iter->min_depth;
- /* Unlock to avoid screwing up our lock invariants: */
- btree_node_unlock(iter, iter->level);
+ for (l = path->level + 1; l < BTREE_MAX_DEPTH; l++)
+ if (btree_lock_want(path, l) == BTREE_NODE_UNLOCKED)
+ btree_node_unlock(path, l);
- iter->level = iter->min_depth;
- btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE);
+ btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
bch2_btree_iter_verify(iter);
- ret = btree_iter_traverse(iter);
+ ret = bch2_btree_path_traverse(trans, path, iter->flags);
if (ret)
- return NULL;
+ goto err;
- b = iter->l[iter->level].b;
+ b = path->l[path->level].b;
}
- iter->pos = iter->real_pos = b->key.k.p;
+ bkey_init(&iter->k);
+ iter->k.p = iter->pos = b->key.k.p;
+ iter->path = btree_path_set_pos(trans, iter->path, b->key.k.p,
+ iter->flags & BTREE_ITER_INTENT,
+ iter->ip_allocated);
+ iter->path->should_be_locked = true;
+ BUG_ON(iter->path->uptodate);
+out:
+ bch2_btree_iter_verify_entry_exit(iter);
bch2_btree_iter_verify(iter);
- iter->should_be_locked = true;
return b;
+err:
+ b = ERR_PTR(ret);
+ goto out;
}
/* Iterate across keys (in leaf nodes only) */
-static void btree_iter_set_search_pos(struct btree_iter *iter, struct bpos new_pos)
-{
- struct bpos old_pos = iter->real_pos;
- int cmp = bpos_cmp(new_pos, iter->real_pos);
- unsigned l = iter->level;
-
- if (!cmp)
- goto out;
-
- iter->real_pos = new_pos;
- iter->should_be_locked = false;
-
- if (unlikely(btree_iter_type(iter) == BTREE_ITER_CACHED)) {
- btree_node_unlock(iter, 0);
- iter->l[0].b = BTREE_ITER_NO_NODE_CACHED;
- btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE);
- return;
- }
-
- l = btree_iter_up_until_good_node(iter, cmp);
-
- if (btree_iter_node(iter, l)) {
- /*
- * We might have to skip over many keys, or just a few: try
- * advancing the node iterator, and if we have to skip over too
- * many keys just reinit it (or if we're rewinding, since that
- * is expensive).
- */
- if (cmp < 0 ||
- !btree_iter_advance_to_pos(iter, &iter->l[l], 8))
- __btree_iter_init(iter, l);
-
- /* Don't leave it locked if we're not supposed to: */
- if (btree_lock_want(iter, l) == BTREE_NODE_UNLOCKED)
- btree_node_unlock(iter, l);
- }
-out:
- if (l != iter->level)
- btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE);
- else
- btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK);
-
- bch2_btree_iter_verify(iter);
-#ifdef CONFIG_BCACHEFS_DEBUG
- trace_iter_set_search_pos(iter->trans->ip, _RET_IP_,
- iter->btree_id,
- &old_pos, &new_pos, l);
-#endif
-}
-
inline bool bch2_btree_iter_advance(struct btree_iter *iter)
{
struct bpos pos = iter->k.p;
- bool ret = bpos_cmp(pos, POS_MAX) != 0;
+ bool ret = (iter->flags & BTREE_ITER_ALL_SNAPSHOTS
+ ? bpos_cmp(pos, SPOS_MAX)
+ : bkey_cmp(pos, SPOS_MAX)) != 0;
if (ret && !(iter->flags & BTREE_ITER_IS_EXTENTS))
pos = bkey_successor(iter, pos);
@@ -1598,7 +2079,9 @@ inline bool bch2_btree_iter_advance(struct btree_iter *iter)
inline bool bch2_btree_iter_rewind(struct btree_iter *iter)
{
struct bpos pos = bkey_start_pos(&iter->k);
- bool ret = bpos_cmp(pos, POS_MIN) != 0;
+ bool ret = (iter->flags & BTREE_ITER_ALL_SNAPSHOTS
+ ? bpos_cmp(pos, POS_MIN)
+ : bkey_cmp(pos, POS_MIN)) != 0;
if (ret && !(iter->flags & BTREE_ITER_IS_EXTENTS))
pos = bkey_predecessor(iter, pos);
@@ -1606,91 +2089,84 @@ inline bool bch2_btree_iter_rewind(struct btree_iter *iter)
return ret;
}
-static inline bool btree_iter_set_pos_to_next_leaf(struct btree_iter *iter)
-{
- struct bpos next_pos = iter->l[0].b->key.k.p;
- bool ret = bpos_cmp(next_pos, POS_MAX) != 0;
-
- /*
- * Typically, we don't want to modify iter->pos here, since that
- * indicates where we searched from - unless we got to the end of the
- * btree, in that case we want iter->pos to reflect that:
- */
- if (ret)
- btree_iter_set_search_pos(iter, bpos_successor(next_pos));
- else
- bch2_btree_iter_set_pos(iter, POS_MAX);
-
- return ret;
-}
-
-static inline bool btree_iter_set_pos_to_prev_leaf(struct btree_iter *iter)
-{
- struct bpos next_pos = iter->l[0].b->data->min_key;
- bool ret = bpos_cmp(next_pos, POS_MIN) != 0;
-
- if (ret)
- btree_iter_set_search_pos(iter, bpos_predecessor(next_pos));
- else
- bch2_btree_iter_set_pos(iter, POS_MIN);
-
- return ret;
-}
-
-static struct bkey_i *btree_trans_peek_updates(struct btree_trans *trans,
- enum btree_id btree_id, struct bpos pos)
-{
- struct btree_insert_entry *i;
-
- trans_for_each_update2(trans, i)
- if ((cmp_int(btree_id, i->iter->btree_id) ?:
- bkey_cmp(pos, i->k->k.p)) <= 0) {
- if (btree_id == i->iter->btree_id)
- return i->k;
- break;
- }
-
- return NULL;
-}
-
-static inline struct bkey_s_c __btree_iter_peek(struct btree_iter *iter, bool with_updates)
+/**
+ * bch2_btree_iter_peek: returns first key greater than or equal to iterator's
+ * current position
+ */
+struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter)
{
+ struct btree_trans *trans = iter->trans;
struct bpos search_key = btree_iter_search_key(iter);
struct bkey_i *next_update;
struct bkey_s_c k;
- int ret;
+ int ret, cmp;
- EBUG_ON(btree_iter_type(iter) != BTREE_ITER_KEYS);
+ EBUG_ON(iter->path->cached || iter->path->level);
bch2_btree_iter_verify(iter);
bch2_btree_iter_verify_entry_exit(iter);
-start:
- next_update = with_updates
- ? btree_trans_peek_updates(iter->trans, iter->btree_id, search_key)
- : NULL;
- btree_iter_set_search_pos(iter, search_key);
while (1) {
- ret = btree_iter_traverse(iter);
- if (unlikely(ret))
- return bkey_s_c_err(ret);
+ iter->path = btree_path_set_pos(trans, iter->path, search_key,
+ iter->flags & BTREE_ITER_INTENT,
+ iter->ip_allocated);
- k = btree_iter_level_peek(iter, &iter->l[0]);
+ ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
+ if (unlikely(ret)) {
+ /* ensure that iter->k is consistent with iter->pos: */
+ bch2_btree_iter_set_pos(iter, iter->pos);
+ k = bkey_s_c_err(ret);
+ goto out;
+ }
+
+ next_update = iter->flags & BTREE_ITER_WITH_UPDATES
+ ? btree_trans_peek_updates(trans, iter->btree_id, search_key)
+ : NULL;
+ k = btree_path_level_peek_all(trans->c, &iter->path->l[0], &iter->k);
+
+ /* In the btree, deleted keys sort before non-deleted: */
+ if (k.k && bkey_deleted(k.k) &&
+ (!next_update ||
+ bpos_cmp(k.k->p, next_update->k.p) <= 0)) {
+ search_key = k.k->p;
+ continue;
+ }
if (next_update &&
- bpos_cmp(next_update->k.p, iter->real_pos) <= 0)
+ bpos_cmp(next_update->k.p,
+ k.k ? k.k->p : iter->path->l[0].b->key.k.p) <= 0) {
+ iter->k = next_update->k;
k = bkey_i_to_s_c(next_update);
+ }
if (likely(k.k)) {
- if (bkey_deleted(k.k)) {
+ /*
+ * We can never have a key in a leaf node at POS_MAX, so
+ * we don't have to check these successor() calls:
+ */
+ if ((iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) &&
+ !bch2_snapshot_is_ancestor(trans->c,
+ iter->snapshot,
+ k.k->p.snapshot)) {
+ search_key = bpos_successor(k.k->p);
+ continue;
+ }
+
+ if (bkey_whiteout(k.k) &&
+ !(iter->flags & BTREE_ITER_ALL_SNAPSHOTS)) {
search_key = bkey_successor(iter, k.k->p);
- goto start;
+ continue;
}
break;
+ } else if (likely(bpos_cmp(iter->path->l[0].b->key.k.p, SPOS_MAX))) {
+ /* Advance to next leaf node: */
+ search_key = bpos_successor(iter->path->l[0].b->key.k.p);
+ } else {
+ /* End of btree: */
+ bch2_btree_iter_set_pos(iter, SPOS_MAX);
+ k = bkey_s_c_null;
+ goto out;
}
-
- if (!btree_iter_set_pos_to_next_leaf(iter))
- return bkey_s_c_null;
}
/*
@@ -1702,19 +2178,27 @@ start:
else if (bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0)
iter->pos = bkey_start_pos(k.k);
+ if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS)
+ iter->pos.snapshot = iter->snapshot;
+
+ cmp = bpos_cmp(k.k->p, iter->path->pos);
+ if (cmp) {
+ iter->path = bch2_btree_path_make_mut(trans, iter->path,
+ iter->flags & BTREE_ITER_INTENT,
+ iter->ip_allocated);
+ iter->path->pos = k.k->p;
+ btree_path_check_sort(trans, iter->path, cmp);
+ }
+out:
+ iter->path->should_be_locked = true;
+
bch2_btree_iter_verify_entry_exit(iter);
bch2_btree_iter_verify(iter);
- iter->should_be_locked = true;
- return k;
-}
+ ret = bch2_btree_iter_verify_ret(iter, k);
+ if (unlikely(ret))
+ return bkey_s_c_err(ret);
-/**
- * bch2_btree_iter_peek: returns first key greater than or equal to iterator's
- * current position
- */
-struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter)
-{
- return __btree_iter_peek(iter, false);
+ return k;
}
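For context, this is typically consumed with a peek/advance loop on a transaction-owned iterator; a hedged sketch (the caller, btree id and flags are illustrative, not from this patch):

static int example_walk_keys(struct btree_trans *trans)
{
        struct btree_iter iter;
        struct bkey_s_c k;
        int ret = 0;

        bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, POS_MIN, 0);

        while ((k = bch2_btree_iter_peek(&iter)).k) {
                ret = bkey_err(k);
                if (ret)
                        break;

                /* ... use k ... */

                if (!bch2_btree_iter_advance(&iter))
                        break;
        }

        bch2_trans_iter_exit(trans, &iter);
        return ret;
}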
/**
@@ -1729,55 +2213,105 @@ struct bkey_s_c bch2_btree_iter_next(struct btree_iter *iter)
return bch2_btree_iter_peek(iter);
}
-struct bkey_s_c bch2_btree_iter_peek_with_updates(struct btree_iter *iter)
-{
- return __btree_iter_peek(iter, true);
-}
-
-struct bkey_s_c bch2_btree_iter_next_with_updates(struct btree_iter *iter)
-{
- if (!bch2_btree_iter_advance(iter))
- return bkey_s_c_null;
-
- return bch2_btree_iter_peek_with_updates(iter);
-}
-
/**
* bch2_btree_iter_peek_prev: returns first key less than or equal to
* iterator's current position
*/
struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter)
{
- struct btree_iter_level *l = &iter->l[0];
+ struct btree_trans *trans = iter->trans;
+ struct bpos search_key = iter->pos;
+ struct btree_path *saved_path = NULL;
struct bkey_s_c k;
+ struct bkey saved_k;
+ const struct bch_val *saved_v;
int ret;
- EBUG_ON(btree_iter_type(iter) != BTREE_ITER_KEYS);
+ EBUG_ON(iter->path->cached || iter->path->level);
+ EBUG_ON(iter->flags & BTREE_ITER_WITH_UPDATES);
bch2_btree_iter_verify(iter);
bch2_btree_iter_verify_entry_exit(iter);
- btree_iter_set_search_pos(iter, iter->pos);
+ if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS)
+ search_key.snapshot = U32_MAX;
while (1) {
- ret = btree_iter_traverse(iter);
+ iter->path = btree_path_set_pos(trans, iter->path, search_key,
+ iter->flags & BTREE_ITER_INTENT,
+ iter->ip_allocated);
+
+ ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
if (unlikely(ret)) {
+ /* ensure that iter->k is consistent with iter->pos: */
+ bch2_btree_iter_set_pos(iter, iter->pos);
k = bkey_s_c_err(ret);
- goto no_key;
+ goto out;
}
- k = btree_iter_level_peek(iter, l);
+ k = btree_path_level_peek(trans->c, iter->path,
+ &iter->path->l[0], &iter->k);
if (!k.k ||
((iter->flags & BTREE_ITER_IS_EXTENTS)
- ? bkey_cmp(bkey_start_pos(k.k), iter->pos) >= 0
- : bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0))
- k = btree_iter_level_prev(iter, l);
+ ? bpos_cmp(bkey_start_pos(k.k), search_key) >= 0
+ : bpos_cmp(k.k->p, search_key) > 0))
+ k = btree_path_level_prev(trans->c, iter->path,
+ &iter->path->l[0], &iter->k);
- if (likely(k.k))
- break;
+ btree_path_check_sort(trans, iter->path, 0);
+
+ if (likely(k.k)) {
+ if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) {
+ if (k.k->p.snapshot == iter->snapshot)
+ goto got_key;
+
+ /*
+ * If we have a saved candidate, and we're no
+ * longer at the same _key_ (not pos), return
+ * that candidate
+ */
+ if (saved_path && bkey_cmp(k.k->p, saved_k.p)) {
+ bch2_path_put(trans, iter->path,
+ iter->flags & BTREE_ITER_INTENT);
+ iter->path = saved_path;
+ saved_path = NULL;
+ iter->k = saved_k;
+ k.v = saved_v;
+ goto got_key;
+ }
+
+ if (bch2_snapshot_is_ancestor(iter->trans->c,
+ iter->snapshot,
+ k.k->p.snapshot)) {
+ if (saved_path)
+ bch2_path_put(trans, saved_path,
+ iter->flags & BTREE_ITER_INTENT);
+ saved_path = btree_path_clone(trans, iter->path,
+ iter->flags & BTREE_ITER_INTENT);
+ saved_k = *k.k;
+ saved_v = k.v;
+ }
+
+ search_key = bpos_predecessor(k.k->p);
+ continue;
+ }
+got_key:
+ if (bkey_whiteout(k.k) &&
+ !(iter->flags & BTREE_ITER_ALL_SNAPSHOTS)) {
+ search_key = bkey_predecessor(iter, k.k->p);
+ if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS)
+ search_key.snapshot = U32_MAX;
+ continue;
+ }
- if (!btree_iter_set_pos_to_prev_leaf(iter)) {
+ break;
+ } else if (likely(bpos_cmp(iter->path->l[0].b->data->min_key, POS_MIN))) {
+ /* Advance to previous leaf node: */
+ search_key = bpos_predecessor(iter->path->l[0].b->data->min_key);
+ } else {
+ /* Start of btree: */
+ bch2_btree_iter_set_pos(iter, POS_MIN);
k = bkey_s_c_null;
- goto no_key;
+ goto out;
}
}
@@ -1786,20 +2320,18 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter)
/* Extents can straddle iter->pos: */
if (bkey_cmp(k.k->p, iter->pos) < 0)
iter->pos = k.k->p;
+
+ if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS)
+ iter->pos.snapshot = iter->snapshot;
out:
+ if (saved_path)
+ bch2_path_put(trans, saved_path, iter->flags & BTREE_ITER_INTENT);
+ iter->path->should_be_locked = true;
+
bch2_btree_iter_verify_entry_exit(iter);
bch2_btree_iter_verify(iter);
- iter->should_be_locked = true;
+
return k;
-no_key:
- /*
- * btree_iter_level_peek() may have set iter->k to a key we didn't want, and
- * then we errored going to the previous leaf - make sure it's
- * consistent with iter->pos:
- */
- bkey_init(&iter->k);
- iter->k.p = iter->pos;
- goto out;
}
/**
@@ -1814,82 +2346,98 @@ struct bkey_s_c bch2_btree_iter_prev(struct btree_iter *iter)
return bch2_btree_iter_peek_prev(iter);
}
-static inline struct bkey_s_c
-__bch2_btree_iter_peek_slot_extents(struct btree_iter *iter)
+struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
{
+ struct btree_trans *trans = iter->trans;
+ struct bpos search_key;
struct bkey_s_c k;
- struct bpos pos, next_start;
+ int ret;
+
+ EBUG_ON(iter->path->level);
+ bch2_btree_iter_verify(iter);
+ bch2_btree_iter_verify_entry_exit(iter);
- /* keys & holes can't span inode numbers: */
- if (iter->pos.offset == KEY_OFFSET_MAX) {
+ /* extents can't span inode numbers: */
+ if ((iter->flags & BTREE_ITER_IS_EXTENTS) &&
+ unlikely(iter->pos.offset == KEY_OFFSET_MAX)) {
if (iter->pos.inode == KEY_INODE_MAX)
return bkey_s_c_null;
- bch2_btree_iter_set_pos(iter, bkey_successor(iter, iter->pos));
+ bch2_btree_iter_set_pos(iter, bpos_nosnap_successor(iter->pos));
}
- pos = iter->pos;
- k = bch2_btree_iter_peek(iter);
- iter->pos = pos;
-
- if (bkey_err(k))
- return k;
-
- if (k.k && bkey_cmp(bkey_start_pos(k.k), iter->pos) <= 0)
- return k;
-
- next_start = k.k ? bkey_start_pos(k.k) : POS_MAX;
+ search_key = btree_iter_search_key(iter);
+ iter->path = btree_path_set_pos(trans, iter->path, search_key,
+ iter->flags & BTREE_ITER_INTENT,
+ iter->ip_allocated);
- bkey_init(&iter->k);
- iter->k.p = iter->pos;
- bch2_key_resize(&iter->k,
- min_t(u64, KEY_SIZE_MAX,
- (next_start.inode == iter->pos.inode
- ? next_start.offset
- : KEY_OFFSET_MAX) -
- iter->pos.offset));
+ ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
+ if (unlikely(ret))
+ return bkey_s_c_err(ret);
- EBUG_ON(!iter->k.size);
+ if ((iter->flags & BTREE_ITER_CACHED) ||
+ !(iter->flags & (BTREE_ITER_IS_EXTENTS|BTREE_ITER_FILTER_SNAPSHOTS))) {
+ struct bkey_i *next_update;
- bch2_btree_iter_verify_entry_exit(iter);
- bch2_btree_iter_verify(iter);
+ next_update = iter->flags & BTREE_ITER_WITH_UPDATES
+ ? btree_trans_peek_updates(trans, iter->btree_id, search_key)
+ : NULL;
- return (struct bkey_s_c) { &iter->k, NULL };
-}
+ if (next_update &&
+ !bpos_cmp(next_update->k.p, iter->pos)) {
+ iter->k = next_update->k;
+ k = bkey_i_to_s_c(next_update);
+ } else {
+ k = bch2_btree_path_peek_slot(iter->path, &iter->k);
+ }
+ } else {
+ struct bpos next;
-struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
-{
- struct btree_iter_level *l = &iter->l[0];
- struct bkey_s_c k;
- int ret;
+ if (iter->flags & BTREE_ITER_INTENT) {
+ struct btree_iter iter2;
- EBUG_ON(btree_iter_type(iter) != BTREE_ITER_KEYS);
- bch2_btree_iter_verify(iter);
- bch2_btree_iter_verify_entry_exit(iter);
+ bch2_trans_copy_iter(&iter2, iter);
+ k = bch2_btree_iter_peek(&iter2);
- btree_iter_set_search_pos(iter, btree_iter_search_key(iter));
+ if (k.k && !bkey_err(k)) {
+ iter->k = iter2.k;
+ k.k = &iter->k;
+ }
+ bch2_trans_iter_exit(trans, &iter2);
+ } else {
+ struct bpos pos = iter->pos;
- if (iter->flags & BTREE_ITER_IS_EXTENTS)
- return __bch2_btree_iter_peek_slot_extents(iter);
+ k = bch2_btree_iter_peek(iter);
+ iter->pos = pos;
+ }
- ret = btree_iter_traverse(iter);
- if (unlikely(ret))
- return bkey_s_c_err(ret);
+ if (unlikely(bkey_err(k)))
+ return k;
- k = btree_iter_level_peek_all(iter, l, &iter->k);
+ next = k.k ? bkey_start_pos(k.k) : POS_MAX;
- EBUG_ON(k.k && bkey_deleted(k.k) && bkey_cmp(k.k->p, iter->pos) == 0);
+ if (bkey_cmp(iter->pos, next) < 0) {
+ bkey_init(&iter->k);
+ iter->k.p = iter->pos;
+ bch2_key_resize(&iter->k,
+ min_t(u64, KEY_SIZE_MAX,
+ (next.inode == iter->pos.inode
+ ? next.offset
+ : KEY_OFFSET_MAX) -
+ iter->pos.offset));
- if (!k.k || bkey_cmp(iter->pos, k.k->p)) {
- /* hole */
- bkey_init(&iter->k);
- iter->k.p = iter->pos;
- k = (struct bkey_s_c) { &iter->k, NULL };
+ k = (struct bkey_s_c) { &iter->k, NULL };
+ EBUG_ON(!k.k->size);
+ }
}
+ iter->path->should_be_locked = true;
+
bch2_btree_iter_verify_entry_exit(iter);
bch2_btree_iter_verify(iter);
- iter->should_be_locked = true;
+ ret = bch2_btree_iter_verify_ret(iter, k);
+ if (unlikely(ret))
+ return bkey_s_c_err(ret);
return k;
}
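The slot variant backs simple point lookups; a hedged example of that pattern (the btree id and BTREE_ITER_CACHED are plausible but assumed, not dictated by this patch):

static int example_lookup(struct btree_trans *trans, struct bpos pos)
{
        struct btree_iter iter;
        struct bkey_s_c k;
        int ret;

        bch2_trans_iter_init(trans, &iter, BTREE_ID_inodes, pos,
                             BTREE_ITER_CACHED);

        k = bch2_btree_iter_peek_slot(&iter);
        ret = bkey_err(k);
        if (!ret) {
                /* k is the key at pos, or a synthesized hole/deleted key */
        }

        bch2_trans_iter_exit(trans, &iter);
        return ret;
}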
@@ -1910,298 +2458,200 @@ struct bkey_s_c bch2_btree_iter_prev_slot(struct btree_iter *iter)
return bch2_btree_iter_peek_slot(iter);
}
-struct bkey_s_c bch2_btree_iter_peek_cached(struct btree_iter *iter)
-{
- struct bkey_cached *ck;
- int ret;
-
- EBUG_ON(btree_iter_type(iter) != BTREE_ITER_CACHED);
- bch2_btree_iter_verify(iter);
-
- ret = btree_iter_traverse(iter);
- if (unlikely(ret))
- return bkey_s_c_err(ret);
-
- ck = (void *) iter->l[0].b;
-
- EBUG_ON(iter->btree_id != ck->key.btree_id ||
- bkey_cmp(iter->pos, ck->key.pos));
- BUG_ON(!ck->valid);
-
- iter->should_be_locked = true;
+/* new transactional stuff: */
- return bkey_i_to_s_c(ck->k);
+static inline void btree_path_verify_sorted_ref(struct btree_trans *trans,
+ struct btree_path *path)
+{
+ EBUG_ON(path->sorted_idx >= trans->nr_sorted);
+ EBUG_ON(trans->sorted[path->sorted_idx] != path->idx);
+ EBUG_ON(!(trans->paths_allocated & (1ULL << path->idx)));
}
-static inline void bch2_btree_iter_init(struct btree_trans *trans,
- struct btree_iter *iter, enum btree_id btree_id)
+static inline void btree_trans_verify_sorted_refs(struct btree_trans *trans)
{
- struct bch_fs *c = trans->c;
+#ifdef CONFIG_BCACHEFS_DEBUG
unsigned i;
- iter->trans = trans;
- iter->uptodate = BTREE_ITER_NEED_TRAVERSE;
- iter->btree_id = btree_id;
- iter->real_pos = POS_MIN;
- iter->level = 0;
- iter->min_depth = 0;
- iter->locks_want = 0;
- iter->nodes_locked = 0;
- iter->nodes_intent_locked = 0;
- for (i = 0; i < ARRAY_SIZE(iter->l); i++)
- iter->l[i].b = BTREE_ITER_NO_NODE_INIT;
-
- prefetch(c->btree_roots[btree_id].b);
+ for (i = 0; i < trans->nr_sorted; i++)
+ btree_path_verify_sorted_ref(trans, trans->paths + trans->sorted[i]);
+#endif
}
-/* new transactional stuff: */
-
-static inline void __bch2_trans_iter_free(struct btree_trans *trans,
- unsigned idx)
+static void btree_trans_verify_sorted(struct btree_trans *trans)
{
- __bch2_btree_iter_unlock(&trans->iters[idx]);
- trans->iters_linked &= ~(1ULL << idx);
- trans->iters_live &= ~(1ULL << idx);
- trans->iters_touched &= ~(1ULL << idx);
+#ifdef CONFIG_BCACHEFS_DEBUG
+ struct btree_path *path, *prev = NULL;
+ unsigned i;
+
+ trans_for_each_path_inorder(trans, path, i) {
+ BUG_ON(prev && btree_path_cmp(prev, path) > 0);
+ prev = path;
+ }
+#endif
}
-int bch2_trans_iter_put(struct btree_trans *trans,
- struct btree_iter *iter)
+static inline void btree_path_swap(struct btree_trans *trans,
+ struct btree_path *l, struct btree_path *r)
{
- int ret;
+ swap(l->sorted_idx, r->sorted_idx);
+ swap(trans->sorted[l->sorted_idx],
+ trans->sorted[r->sorted_idx]);
- if (IS_ERR_OR_NULL(iter))
- return 0;
-
- BUG_ON(trans->iters + iter->idx != iter);
- BUG_ON(!btree_iter_live(trans, iter));
-
- ret = btree_iter_err(iter);
-
- if (!(trans->iters_touched & (1ULL << iter->idx)) &&
- !(iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT))
- __bch2_trans_iter_free(trans, iter->idx);
-
- trans->iters_live &= ~(1ULL << iter->idx);
- return ret;
+ btree_path_verify_sorted_ref(trans, l);
+ btree_path_verify_sorted_ref(trans, r);
}
-int bch2_trans_iter_free(struct btree_trans *trans,
- struct btree_iter *iter)
+static void btree_path_check_sort(struct btree_trans *trans, struct btree_path *path,
+ int cmp)
{
- if (IS_ERR_OR_NULL(iter))
- return 0;
+ struct btree_path *n;
+
+ if (cmp <= 0) {
+ n = prev_btree_path(trans, path);
+ if (n && btree_path_cmp(n, path) > 0) {
+ do {
+ btree_path_swap(trans, n, path);
+ n = prev_btree_path(trans, path);
+ } while (n && btree_path_cmp(n, path) > 0);
- set_btree_iter_dontneed(trans, iter);
+ goto out;
+ }
+ }
- return bch2_trans_iter_put(trans, iter);
+ if (cmp >= 0) {
+ n = next_btree_path(trans, path);
+ if (n && btree_path_cmp(path, n) > 0) {
+ do {
+ btree_path_swap(trans, path, n);
+ n = next_btree_path(trans, path);
+ } while (n && btree_path_cmp(path, n) > 0);
+ }
+ }
+out:
+ btree_trans_verify_sorted(trans);
}
-noinline __cold
-static void btree_trans_iter_alloc_fail(struct btree_trans *trans)
+static inline void btree_path_list_remove(struct btree_trans *trans,
+ struct btree_path *path)
{
+ unsigned i;
- struct btree_iter *iter;
- struct btree_insert_entry *i;
- char buf[100];
+ EBUG_ON(path->sorted_idx >= trans->nr_sorted);
- trans_for_each_iter(trans, iter)
- printk(KERN_ERR "iter: btree %s pos %s%s%s%s %pS\n",
- bch2_btree_ids[iter->btree_id],
- (bch2_bpos_to_text(&PBUF(buf), iter->pos), buf),
- btree_iter_live(trans, iter) ? " live" : "",
- (trans->iters_touched & (1ULL << iter->idx)) ? " touched" : "",
- iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT ? " keep" : "",
- (void *) iter->ip_allocated);
+ array_remove_item(trans->sorted, trans->nr_sorted, path->sorted_idx);
- trans_for_each_update(trans, i) {
- char buf[300];
+ for (i = path->sorted_idx; i < trans->nr_sorted; i++)
+ trans->paths[trans->sorted[i]].sorted_idx = i;
- bch2_bkey_val_to_text(&PBUF(buf), trans->c, bkey_i_to_s_c(i->k));
- printk(KERN_ERR "update: btree %s %s\n",
- bch2_btree_ids[i->iter->btree_id], buf);
- }
- panic("trans iter oveflow\n");
+ path->sorted_idx = U8_MAX;
+
+ btree_trans_verify_sorted_refs(trans);
}
-static struct btree_iter *btree_trans_iter_alloc(struct btree_trans *trans)
+static inline void btree_path_list_add(struct btree_trans *trans,
+ struct btree_path *pos,
+ struct btree_path *path)
{
- unsigned idx;
-
- if (unlikely(trans->iters_linked ==
- ~((~0ULL << 1) << (BTREE_ITER_MAX - 1))))
- btree_trans_iter_alloc_fail(trans);
+ unsigned i;
- idx = __ffs64(~trans->iters_linked);
+ btree_trans_verify_sorted_refs(trans);
- trans->iters_linked |= 1ULL << idx;
- trans->iters[idx].idx = idx;
- trans->iters[idx].flags = 0;
- return &trans->iters[idx];
-}
+ path->sorted_idx = pos ? pos->sorted_idx + 1 : 0;
-static inline void btree_iter_copy(struct btree_iter *dst,
- struct btree_iter *src)
-{
- unsigned i, idx = dst->idx;
+ array_insert_item(trans->sorted, trans->nr_sorted, path->sorted_idx, path->idx);
- *dst = *src;
- dst->idx = idx;
- dst->flags &= ~BTREE_ITER_KEEP_UNTIL_COMMIT;
+ for (i = path->sorted_idx; i < trans->nr_sorted; i++)
+ trans->paths[trans->sorted[i]].sorted_idx = i;
- for (i = 0; i < BTREE_MAX_DEPTH; i++)
- if (btree_node_locked(dst, i))
- six_lock_increment(&dst->l[i].b->c.lock,
- __btree_lock_want(dst, i));
+ btree_trans_verify_sorted_refs(trans);
+}
- dst->flags &= ~BTREE_ITER_KEEP_UNTIL_COMMIT;
- dst->flags &= ~BTREE_ITER_SET_POS_AFTER_COMMIT;
+void bch2_trans_iter_exit(struct btree_trans *trans, struct btree_iter *iter)
+{
+ if (iter->path)
+ bch2_path_put(trans, iter->path,
+ iter->flags & BTREE_ITER_INTENT);
+ iter->path = NULL;
}
-struct btree_iter *__bch2_trans_get_iter(struct btree_trans *trans,
- unsigned btree_id, struct bpos pos,
- unsigned locks_want,
- unsigned depth,
- unsigned flags)
+static void __bch2_trans_iter_init(struct btree_trans *trans,
+ struct btree_iter *iter,
+ unsigned btree_id, struct bpos pos,
+ unsigned locks_want,
+ unsigned depth,
+ unsigned flags,
+ unsigned long ip)
{
- struct btree_iter *iter, *best = NULL;
- struct bpos real_pos, pos_min = POS_MIN;
+ EBUG_ON(trans->restarted);
- if ((flags & BTREE_ITER_TYPE) != BTREE_ITER_NODES &&
- btree_node_type_is_extents(btree_id) &&
- !(flags & BTREE_ITER_NOT_EXTENTS) &&
- !(flags & BTREE_ITER_ALL_SNAPSHOTS))
+ if (!(flags & (BTREE_ITER_ALL_SNAPSHOTS|BTREE_ITER_NOT_EXTENTS)) &&
+ btree_node_type_is_extents(btree_id))
flags |= BTREE_ITER_IS_EXTENTS;
- if ((flags & BTREE_ITER_TYPE) != BTREE_ITER_NODES &&
+ if (!(flags & __BTREE_ITER_ALL_SNAPSHOTS) &&
!btree_type_has_snapshots(btree_id))
flags &= ~BTREE_ITER_ALL_SNAPSHOTS;
- if (!(flags & BTREE_ITER_ALL_SNAPSHOTS))
- pos.snapshot = btree_type_has_snapshots(btree_id)
- ? U32_MAX : 0;
-
- real_pos = pos;
-
- if ((flags & BTREE_ITER_IS_EXTENTS) &&
- bkey_cmp(pos, POS_MAX))
- real_pos = bpos_nosnap_successor(pos);
-
- trans_for_each_iter(trans, iter) {
- if (btree_iter_type(iter) != (flags & BTREE_ITER_TYPE))
- continue;
-
- if (iter->btree_id != btree_id)
- continue;
-
- if (best) {
- int cmp = bkey_cmp(bpos_diff(best->real_pos, real_pos),
- bpos_diff(iter->real_pos, real_pos));
-
- if (cmp < 0 ||
- ((cmp == 0 && btree_iter_keep(trans, iter))))
- continue;
- }
-
- best = iter;
- }
-
- if (!best) {
- iter = btree_trans_iter_alloc(trans);
- bch2_btree_iter_init(trans, iter, btree_id);
- } else if (btree_iter_keep(trans, best)) {
- iter = btree_trans_iter_alloc(trans);
- btree_iter_copy(iter, best);
- } else {
- iter = best;
- }
-
- trans->iters_live |= 1ULL << iter->idx;
- trans->iters_touched |= 1ULL << iter->idx;
-
- iter->flags = flags;
-
- iter->snapshot = pos.snapshot;
-
- /*
- * If the iterator has locks_want greater than requested, we explicitly
- * do not downgrade it here - on transaction restart because btree node
- * split needs to upgrade locks, we might be putting/getting the
- * iterator again. Downgrading iterators only happens via an explicit
- * bch2_trans_downgrade().
- */
-
- locks_want = min(locks_want, BTREE_MAX_DEPTH);
- if (locks_want > iter->locks_want) {
- iter->locks_want = locks_want;
- btree_iter_get_locks(iter, true, _THIS_IP_);
- }
-
- while (iter->level != depth) {
- btree_node_unlock(iter, iter->level);
- iter->l[iter->level].b = BTREE_ITER_NO_NODE_INIT;
- iter->uptodate = BTREE_ITER_NEED_TRAVERSE;
- if (iter->level < depth)
- iter->level++;
- else
- iter->level--;
- }
+ if (!(flags & BTREE_ITER_ALL_SNAPSHOTS) &&
+ btree_type_has_snapshots(btree_id))
+ flags |= BTREE_ITER_FILTER_SNAPSHOTS;
+ iter->trans = trans;
+ iter->path = NULL;
+ iter->btree_id = btree_id;
iter->min_depth = depth;
+ iter->flags = flags;
+ iter->snapshot = pos.snapshot;
+ iter->pos = pos;
+ iter->k.type = KEY_TYPE_deleted;
+ iter->k.p = pos;
+ iter->k.size = 0;
+#ifdef CONFIG_BCACHEFS_DEBUG
+ iter->ip_allocated = ip;
+#endif
- bch2_btree_iter_set_pos(iter, pos);
- btree_iter_set_search_pos(iter, real_pos);
-
- trace_trans_get_iter(_RET_IP_, trans->ip,
- btree_id,
- &real_pos, locks_want, iter->uptodate,
- best ? &best->real_pos : &pos_min,
- best ? best->locks_want : U8_MAX,
- best ? best->uptodate : U8_MAX);
-
- return iter;
+ iter->path = bch2_path_get(trans,
+ flags & BTREE_ITER_CACHED,
+ btree_id,
+ iter->pos,
+ locks_want,
+ depth,
+ flags & BTREE_ITER_INTENT, ip);
}
-struct btree_iter *bch2_trans_get_node_iter(struct btree_trans *trans,
- enum btree_id btree_id,
- struct bpos pos,
- unsigned locks_want,
- unsigned depth,
- unsigned flags)
+void bch2_trans_iter_init(struct btree_trans *trans,
+ struct btree_iter *iter,
+ unsigned btree_id, struct bpos pos,
+ unsigned flags)
{
- struct btree_iter *iter =
- __bch2_trans_get_iter(trans, btree_id, pos,
- locks_want, depth,
- BTREE_ITER_NODES|
- BTREE_ITER_NOT_EXTENTS|
- BTREE_ITER_ALL_SNAPSHOTS|
- flags);
-
- BUG_ON(bkey_cmp(iter->pos, pos));
- BUG_ON(iter->locks_want != min(locks_want, BTREE_MAX_DEPTH));
- BUG_ON(iter->level != depth);
- BUG_ON(iter->min_depth != depth);
- iter->ip_allocated = _RET_IP_;
-
- return iter;
+ __bch2_trans_iter_init(trans, iter, btree_id, pos,
+ 0, 0, flags, _RET_IP_);
}
-struct btree_iter *__bch2_trans_copy_iter(struct btree_trans *trans,
- struct btree_iter *src)
+void bch2_trans_node_iter_init(struct btree_trans *trans,
+ struct btree_iter *iter,
+ enum btree_id btree_id,
+ struct bpos pos,
+ unsigned locks_want,
+ unsigned depth,
+ unsigned flags)
{
- struct btree_iter *iter;
-
- iter = btree_trans_iter_alloc(trans);
- btree_iter_copy(iter, src);
-
- trans->iters_live |= 1ULL << iter->idx;
- /*
- * We don't need to preserve this iter since it's cheap to copy it
- * again - this will cause trans_iter_put() to free it right away:
- */
- set_btree_iter_dontneed(trans, iter);
+ __bch2_trans_iter_init(trans, iter, btree_id, pos, locks_want, depth,
+ BTREE_ITER_NOT_EXTENTS|
+ __BTREE_ITER_ALL_SNAPSHOTS|
+ BTREE_ITER_ALL_SNAPSHOTS|
+ flags, _RET_IP_);
+ BUG_ON(iter->path->locks_want < min(locks_want, BTREE_MAX_DEPTH));
+ BUG_ON(iter->path->level != depth);
+ BUG_ON(iter->min_depth != depth);
+}
- return iter;
+void bch2_trans_copy_iter(struct btree_iter *dst, struct btree_iter *src)
+{
+ *dst = *src;
+ if (src->path)
+ __btree_path_get(src->path, src->flags & BTREE_ITER_INTENT);
}
void *bch2_trans_kmalloc(struct btree_trans *trans, size_t size)
@@ -2231,43 +2681,36 @@ void *bch2_trans_kmalloc(struct btree_trans *trans, size_t size)
if (old_bytes) {
trace_trans_restart_mem_realloced(trans->ip, _RET_IP_, new_bytes);
+ btree_trans_restart(trans);
return ERR_PTR(-EINTR);
}
}
p = trans->mem + trans->mem_top;
trans->mem_top += size;
+ memset(p, 0, size);
return p;
}
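Memory from bch2_trans_kmalloc() lives until the transaction is reset, and with the hunk above the call can fail with -EINTR (after restarting the transaction) when the buffer had to be reallocated, so callers must check for an error pointer. An assumed sketch:

static struct bkey_i *example_alloc_key(struct btree_trans *trans, unsigned u64s)
{
        struct bkey_i *k = bch2_trans_kmalloc(trans, u64s * sizeof(u64));

        if (IS_ERR(k))
                return k;       /* typically -EINTR: restart the transaction */

        bkey_init(&k->k);
        return k;
}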
-inline void bch2_trans_unlink_iters(struct btree_trans *trans)
-{
- u64 iters = trans->iters_linked &
- ~trans->iters_touched &
- ~trans->iters_live;
-
- while (iters) {
- unsigned idx = __ffs64(iters);
-
- iters &= ~(1ULL << idx);
- __bch2_trans_iter_free(trans, idx);
- }
-}
-
-void bch2_trans_reset(struct btree_trans *trans, unsigned flags)
+/**
+ * bch2_trans_begin() - reset a transaction after an interrupted attempt
+ * @trans: transaction to reset
+ *
+ * While iterating over nodes or updating nodes, an attempt to lock a btree
+ * node may return -EINTR when the trylock fails. When this occurs,
+ * bch2_trans_begin() should be called and the transaction retried.
+ */
+void bch2_trans_begin(struct btree_trans *trans)
{
- struct btree_iter *iter;
-
- trans_for_each_iter(trans, iter)
- iter->flags &= ~(BTREE_ITER_KEEP_UNTIL_COMMIT|
- BTREE_ITER_SET_POS_AFTER_COMMIT);
-
- bch2_trans_unlink_iters(trans);
+ struct btree_insert_entry *i;
+ struct btree_path *path;
- trans->iters_touched &= trans->iters_live;
+ trans_for_each_update(trans, i)
+ __btree_path_put(i->path, true);
+ memset(&trans->journal_res, 0, sizeof(trans->journal_res));
+ trans->extra_journal_res = 0;
trans->nr_updates = 0;
- trans->nr_updates2 = 0;
trans->mem_top = 0;
trans->hooks = NULL;
@@ -2281,31 +2724,44 @@ void bch2_trans_reset(struct btree_trans *trans, unsigned flags)
(void *) &trans->fs_usage_deltas->memset_start);
}
- if (!(flags & TRANS_RESET_NOUNLOCK))
- bch2_trans_cond_resched(trans);
+ trans_for_each_path(trans, path) {
+ path->should_be_locked = false;
+
+ /*
+ * XXX: we probably shouldn't be doing this if the transaction
+ * was restarted, but currently we still overflow transaction
+ * iterators if we do that
+ */
+ if (!path->ref && !path->preserve)
+ __bch2_path_free(trans, path);
+ else if (!path->ref)
+ path->preserve = false;
+ }
+
+ bch2_trans_cond_resched(trans);
+
+ if (trans->restarted)
+ bch2_btree_path_traverse_all(trans);
- if (!(flags & TRANS_RESET_NOTRAVERSE) &&
- trans->iters_linked)
- bch2_btree_iter_traverse_all(trans);
+ trans->restarted = false;
}
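Concretely, the doc-comment above implies a retry loop of roughly this shape (example_do_update() stands in for whatever transactional work is being attempted; a hedged sketch, not part of the patch):

static int example_do_update(struct btree_trans *trans);

static int example_retry(struct btree_trans *trans)
{
        int ret;

        do {
                bch2_trans_begin(trans);

                ret = example_do_update(trans);
        } while (ret == -EINTR);

        return ret;
}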
-static void bch2_trans_alloc_iters(struct btree_trans *trans, struct bch_fs *c)
+static void bch2_trans_alloc_paths(struct btree_trans *trans, struct bch_fs *c)
{
- size_t iters_bytes = sizeof(struct btree_iter) * BTREE_ITER_MAX;
+ size_t paths_bytes = sizeof(struct btree_path) * BTREE_ITER_MAX;
size_t updates_bytes = sizeof(struct btree_insert_entry) * BTREE_ITER_MAX;
void *p = NULL;
BUG_ON(trans->used_mempool);
#ifdef __KERNEL__
- p = this_cpu_xchg(c->btree_iters_bufs->iter, NULL);
+ p = this_cpu_xchg(c->btree_paths_bufs->path , NULL);
#endif
if (!p)
- p = mempool_alloc(&trans->c->btree_iters_pool, GFP_NOFS);
+ p = mempool_alloc(&trans->c->btree_paths_pool, GFP_NOFS);
- trans->iters = p; p += iters_bytes;
+ trans->paths = p; p += paths_bytes;
trans->updates = p; p += updates_bytes;
- trans->updates2 = p; p += updates_bytes;
}
void bch2_trans_init(struct btree_trans *trans, struct bch_fs *c,
@@ -2317,11 +2773,7 @@ void bch2_trans_init(struct btree_trans *trans, struct bch_fs *c,
trans->c = c;
trans->ip = _RET_IP_;
- /*
- * reallocating iterators currently completely breaks
- * bch2_trans_iter_put(), we always allocate the max:
- */
- bch2_trans_alloc_iters(trans, c);
+ bch2_trans_alloc_paths(trans, c);
if (expected_mem_bytes) {
trans->mem_bytes = roundup_pow_of_two(expected_mem_bytes);
@@ -2343,47 +2795,63 @@ void bch2_trans_init(struct btree_trans *trans, struct bch_fs *c,
#endif
}
-int bch2_trans_exit(struct btree_trans *trans)
+static void check_btree_paths_leaked(struct btree_trans *trans)
+{
+#ifdef CONFIG_BCACHEFS_DEBUG
+ struct bch_fs *c = trans->c;
+ struct btree_path *path;
+
+ trans_for_each_path(trans, path)
+ if (path->ref)
+ goto leaked;
+ return;
+leaked:
+ bch_err(c, "btree paths leaked from %pS!", (void *) trans->ip);
+ trans_for_each_path(trans, path)
+ if (path->ref)
+ printk(KERN_ERR " btree %s %pS\n",
+ bch2_btree_ids[path->btree_id],
+ (void *) path->ip_allocated);
+ /* Be noisy about this: */
+ bch2_fatal_error(c);
+#endif
+}
+
+void bch2_trans_exit(struct btree_trans *trans)
__releases(&c->btree_trans_barrier)
{
+ struct btree_insert_entry *i;
struct bch_fs *c = trans->c;
bch2_trans_unlock(trans);
-#ifdef CONFIG_BCACHEFS_DEBUG
- if (trans->iters_live) {
- struct btree_iter *iter;
+ trans_for_each_update(trans, i)
+ __btree_path_put(i->path, true);
+ trans->nr_updates = 0;
- bch_err(c, "btree iterators leaked!");
- trans_for_each_iter(trans, iter)
- if (btree_iter_live(trans, iter))
- printk(KERN_ERR " btree %s allocated at %pS\n",
- bch2_btree_ids[iter->btree_id],
- (void *) iter->ip_allocated);
- /* Be noisy about this: */
- bch2_fatal_error(c);
- }
+ check_btree_paths_leaked(trans);
- mutex_lock(&trans->c->btree_trans_lock);
+#ifdef CONFIG_BCACHEFS_DEBUG
+ mutex_lock(&c->btree_trans_lock);
list_del(&trans->list);
- mutex_unlock(&trans->c->btree_trans_lock);
+ mutex_unlock(&c->btree_trans_lock);
#endif
srcu_read_unlock(&c->btree_trans_barrier, trans->srcu_idx);
- bch2_journal_preres_put(&trans->c->journal, &trans->journal_preres);
+ bch2_journal_preres_put(&c->journal, &trans->journal_preres);
if (trans->fs_usage_deltas) {
if (trans->fs_usage_deltas->size + sizeof(trans->fs_usage_deltas) ==
REPLICAS_DELTA_LIST_MAX)
mempool_free(trans->fs_usage_deltas,
- &trans->c->replicas_delta_pool);
+ &c->replicas_delta_pool);
else
kfree(trans->fs_usage_deltas);
}
if (trans->mem_bytes == BTREE_TRANS_MEM_MAX)
- mempool_free(trans->mem, &trans->c->btree_trans_mem_pool);
+ mempool_free(trans->mem, &c->btree_trans_mem_pool);
else
kfree(trans->mem);
@@ -2391,36 +2859,33 @@ int bch2_trans_exit(struct btree_trans *trans)
/*
* Userspace doesn't have a real percpu implementation:
*/
- trans->iters = this_cpu_xchg(c->btree_iters_bufs->iter, trans->iters);
+ trans->paths = this_cpu_xchg(c->btree_paths_bufs->path, trans->paths);
#endif
- if (trans->iters)
- mempool_free(trans->iters, &trans->c->btree_iters_pool);
+ if (trans->paths)
+ mempool_free(trans->paths, &c->btree_paths_pool);
trans->mem = (void *) 0x1;
- trans->iters = (void *) 0x1;
-
- return trans->error ? -EIO : 0;
+ trans->paths = (void *) 0x1;
}
static void __maybe_unused
-bch2_btree_iter_node_to_text(struct printbuf *out,
+bch2_btree_path_node_to_text(struct printbuf *out,
struct btree_bkey_cached_common *_b,
- enum btree_iter_type type)
+ bool cached)
{
pr_buf(out, " l=%u %s:",
_b->level, bch2_btree_ids[_b->btree_id]);
- bch2_bpos_to_text(out, btree_node_pos(_b, type));
+ bch2_bpos_to_text(out, btree_node_pos(_b, cached));
}
#ifdef CONFIG_BCACHEFS_DEBUG
-static bool trans_has_btree_nodes_locked(struct btree_trans *trans)
+static bool trans_has_locks(struct btree_trans *trans)
{
- struct btree_iter *iter;
+ struct btree_path *path;
- trans_for_each_iter(trans, iter)
- if (btree_iter_type(iter) != BTREE_ITER_CACHED &&
- iter->nodes_locked)
+ trans_for_each_path(trans, path)
+ if (path->nodes_locked)
return true;
return false;
}
@@ -2430,35 +2895,36 @@ void bch2_btree_trans_to_text(struct printbuf *out, struct bch_fs *c)
{
#ifdef CONFIG_BCACHEFS_DEBUG
struct btree_trans *trans;
- struct btree_iter *iter;
+ struct btree_path *path;
struct btree *b;
unsigned l;
mutex_lock(&c->btree_trans_lock);
list_for_each_entry(trans, &c->btree_trans_list, list) {
- if (!trans_has_btree_nodes_locked(trans))
+ if (!trans_has_locks(trans))
continue;
pr_buf(out, "%i %ps\n", trans->pid, (void *) trans->ip);
- trans_for_each_iter(trans, iter) {
- if (!iter->nodes_locked)
+ trans_for_each_path(trans, path) {
+ if (!path->nodes_locked)
continue;
- pr_buf(out, " iter %u %c %s:",
- iter->idx,
- btree_iter_type(iter) == BTREE_ITER_CACHED ? 'c' : 'b',
- bch2_btree_ids[iter->btree_id]);
- bch2_bpos_to_text(out, iter->pos);
+ pr_buf(out, " path %u %c l=%u %s:",
+ path->idx,
+ path->cached ? 'c' : 'b',
+ path->level,
+ bch2_btree_ids[path->btree_id]);
+ bch2_bpos_to_text(out, path->pos);
pr_buf(out, "\n");
for (l = 0; l < BTREE_MAX_DEPTH; l++) {
- if (btree_node_locked(iter, l)) {
+ if (btree_node_locked(path, l)) {
pr_buf(out, " %s l=%u ",
- btree_node_intent_locked(iter, l) ? "i" : "r", l);
- bch2_btree_iter_node_to_text(out,
- (void *) iter->l[l].b,
- btree_iter_type(iter));
+ btree_node_intent_locked(path, l) ? "i" : "r", l);
+ bch2_btree_path_node_to_text(out,
+ (void *) path->l[l].b,
+ path->cached);
pr_buf(out, "\n");
}
}
@@ -2466,18 +2932,17 @@ void bch2_btree_trans_to_text(struct printbuf *out, struct bch_fs *c)
b = READ_ONCE(trans->locking);
if (b) {
- iter = &trans->iters[trans->locking_iter_idx];
- pr_buf(out, " locking iter %u %c l=%u %s:",
- trans->locking_iter_idx,
- btree_iter_type(iter) == BTREE_ITER_CACHED ? 'c' : 'b',
+ path = &trans->paths[trans->locking_path_idx];
+ pr_buf(out, " locking path %u %c l=%u %s:",
+ trans->locking_path_idx,
+ path->cached ? 'c' : 'b',
trans->locking_level,
bch2_btree_ids[trans->locking_btree_id]);
bch2_bpos_to_text(out, trans->locking_pos);
pr_buf(out, " node ");
- bch2_btree_iter_node_to_text(out,
- (void *) b,
- btree_iter_type(iter));
+ bch2_btree_path_node_to_text(out,
+ (void *) b, path->cached);
pr_buf(out, "\n");
}
}
@@ -2488,7 +2953,7 @@ void bch2_btree_trans_to_text(struct printbuf *out, struct bch_fs *c)
void bch2_fs_btree_iter_exit(struct bch_fs *c)
{
mempool_exit(&c->btree_trans_mem_pool);
- mempool_exit(&c->btree_iters_pool);
+ mempool_exit(&c->btree_paths_pool);
cleanup_srcu_struct(&c->btree_trans_barrier);
}
@@ -2500,9 +2965,8 @@ int bch2_fs_btree_iter_init(struct bch_fs *c)
mutex_init(&c->btree_trans_lock);
return init_srcu_struct(&c->btree_trans_barrier) ?:
- mempool_init_kmalloc_pool(&c->btree_iters_pool, 1,
- sizeof(struct btree_iter) * nr +
- sizeof(struct btree_insert_entry) * nr +
+ mempool_init_kmalloc_pool(&c->btree_paths_pool, 1,
+ sizeof(struct btree_path) * nr +
sizeof(struct btree_insert_entry) * nr) ?:
mempool_init_kmalloc_pool(&c->btree_trans_mem_pool, 1,
BTREE_TRANS_MEM_MAX);
diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h
index a2ce711fd61f..26eb90a7eab8 100644
--- a/fs/bcachefs/btree_iter.h
+++ b/fs/bcachefs/btree_iter.h
@@ -5,40 +5,49 @@
#include "bset.h"
#include "btree_types.h"
-static inline void btree_iter_set_dirty(struct btree_iter *iter,
- enum btree_iter_uptodate u)
+static inline void __btree_path_get(struct btree_path *path, bool intent)
{
- iter->uptodate = max_t(unsigned, iter->uptodate, u);
+ path->ref++;
+ path->intent_ref += intent;
}
-static inline struct btree *btree_iter_node(struct btree_iter *iter,
+static inline bool __btree_path_put(struct btree_path *path, bool intent)
+{
+ EBUG_ON(!path->ref);
+ EBUG_ON(!path->intent_ref && intent);
+ path->intent_ref -= intent;
+ return --path->ref == 0;
+}
+
+static inline void btree_path_set_dirty(struct btree_path *path,
+ enum btree_path_uptodate u)
+{
+ path->uptodate = max_t(unsigned, path->uptodate, u);
+}
+
+static inline struct btree *btree_path_node(struct btree_path *path,
unsigned level)
{
- return level < BTREE_MAX_DEPTH ? iter->l[level].b : NULL;
+ return level < BTREE_MAX_DEPTH ? path->l[level].b : NULL;
}
-static inline bool btree_node_lock_seq_matches(const struct btree_iter *iter,
+static inline bool btree_node_lock_seq_matches(const struct btree_path *path,
const struct btree *b, unsigned level)
{
/*
* We don't compare the low bits of the lock sequence numbers because
- * @iter might have taken a write lock on @b, and we don't want to skip
- * the linked iterator if the sequence numbers were equal before taking
- * that write lock. The lock sequence number is incremented by taking
- * and releasing write locks and is even when unlocked:
+ * @path might have taken a write lock on @b, and we don't want to skip
+ * the linked path if the sequence numbers were equal before taking that
+ * write lock. The lock sequence number is incremented by taking and
+ * releasing write locks and is even when unlocked:
*/
- return iter->l[level].lock_seq >> 1 == b->c.lock.state.seq >> 1;
+ return path->l[level].lock_seq >> 1 == b->c.lock.state.seq >> 1;
}
-static inline struct btree *btree_node_parent(struct btree_iter *iter,
+static inline struct btree *btree_node_parent(struct btree_path *path,
struct btree *b)
{
- return btree_iter_node(iter, b->c.level + 1);
-}
-
-static inline bool btree_trans_has_multiple_iters(const struct btree_trans *trans)
-{
- return hweight64(trans->iters_linked) > 1;
+ return btree_path_node(path, b->c.level + 1);
}
static inline int btree_iter_err(const struct btree_iter *iter)
@@ -46,116 +55,164 @@ static inline int btree_iter_err(const struct btree_iter *iter)
return iter->flags & BTREE_ITER_ERROR ? -EIO : 0;
}
-/* Iterate over iters within a transaction: */
+/* Iterate over paths within a transaction: */
-static inline struct btree_iter *
-__trans_next_iter(struct btree_trans *trans, unsigned idx)
+static inline struct btree_path *
+__trans_next_path(struct btree_trans *trans, unsigned idx)
{
u64 l;
if (idx == BTREE_ITER_MAX)
return NULL;
- l = trans->iters_linked >> idx;
+ l = trans->paths_allocated >> idx;
if (!l)
return NULL;
idx += __ffs64(l);
EBUG_ON(idx >= BTREE_ITER_MAX);
- EBUG_ON(trans->iters[idx].idx != idx);
- return &trans->iters[idx];
+ EBUG_ON(trans->paths[idx].idx != idx);
+ return &trans->paths[idx];
}
-#define trans_for_each_iter(_trans, _iter) \
- for (_iter = __trans_next_iter((_trans), 0); \
- (_iter); \
- _iter = __trans_next_iter((_trans), (_iter)->idx + 1))
+#define trans_for_each_path(_trans, _path) \
+ for (_path = __trans_next_path((_trans), 0); \
+ (_path); \
+ _path = __trans_next_path((_trans), (_path)->idx + 1))
-static inline bool __iter_has_node(const struct btree_iter *iter,
+static inline struct btree_path *next_btree_path(struct btree_trans *trans, struct btree_path *path)
+{
+ unsigned idx = path ? path->sorted_idx + 1 : 0;
+
+ EBUG_ON(idx > trans->nr_sorted);
+
+ return idx < trans->nr_sorted
+ ? trans->paths + trans->sorted[idx]
+ : NULL;
+}
+
+static inline struct btree_path *prev_btree_path(struct btree_trans *trans, struct btree_path *path)
+{
+ EBUG_ON(path->sorted_idx >= trans->nr_sorted);
+ return path->sorted_idx
+ ? trans->paths + trans->sorted[path->sorted_idx - 1]
+ : NULL;
+}
+
+#define trans_for_each_path_inorder(_trans, _path, _i) \
+ for (_i = 0; \
+ ((_path) = (_trans)->paths + trans->sorted[_i]), (_i) < (_trans)->nr_sorted;\
+ _i++)
+
+static inline bool __path_has_node(const struct btree_path *path,
const struct btree *b)
{
- return iter->l[b->c.level].b == b &&
- btree_node_lock_seq_matches(iter, b, b->c.level);
+ return path->l[b->c.level].b == b &&
+ btree_node_lock_seq_matches(path, b, b->c.level);
}
-static inline struct btree_iter *
-__trans_next_iter_with_node(struct btree_trans *trans, struct btree *b,
+static inline struct btree_path *
+__trans_next_path_with_node(struct btree_trans *trans, struct btree *b,
unsigned idx)
{
- struct btree_iter *iter = __trans_next_iter(trans, idx);
+ struct btree_path *path = __trans_next_path(trans, idx);
- while (iter && !__iter_has_node(iter, b))
- iter = __trans_next_iter(trans, iter->idx + 1);
+ while (path && !__path_has_node(path, b))
+ path = __trans_next_path(trans, path->idx + 1);
- return iter;
+ return path;
}
-#define trans_for_each_iter_with_node(_trans, _b, _iter) \
- for (_iter = __trans_next_iter_with_node((_trans), (_b), 0); \
- (_iter); \
- _iter = __trans_next_iter_with_node((_trans), (_b), \
- (_iter)->idx + 1))
+#define trans_for_each_path_with_node(_trans, _b, _path) \
+ for (_path = __trans_next_path_with_node((_trans), (_b), 0); \
+ (_path); \
+ _path = __trans_next_path_with_node((_trans), (_b), \
+ (_path)->idx + 1))
+
+struct btree_path * __must_check
+bch2_btree_path_make_mut(struct btree_trans *, struct btree_path *,
+ bool, unsigned long);
+int __must_check bch2_btree_path_traverse(struct btree_trans *,
+ struct btree_path *, unsigned);
+struct btree_path *bch2_path_get(struct btree_trans *, bool, enum btree_id,
+ struct bpos, unsigned, unsigned, bool,
+ unsigned long);
+inline struct bkey_s_c bch2_btree_path_peek_slot(struct btree_path *, struct bkey *);
#ifdef CONFIG_BCACHEFS_DEBUG
-void bch2_btree_trans_verify_iters(struct btree_trans *, struct btree *);
-void bch2_btree_trans_verify_locks(struct btree_trans *);
+void bch2_trans_verify_paths(struct btree_trans *);
+void bch2_trans_verify_locks(struct btree_trans *);
+void bch2_assert_pos_locked(struct btree_trans *, enum btree_id,
+ struct bpos, bool);
#else
-static inline void bch2_btree_trans_verify_iters(struct btree_trans *trans,
- struct btree *b) {}
-static inline void bch2_btree_trans_verify_locks(struct btree_trans *iter) {}
+static inline void bch2_trans_verify_paths(struct btree_trans *trans) {}
+static inline void bch2_trans_verify_locks(struct btree_trans *trans) {}
+static inline void bch2_assert_pos_locked(struct btree_trans *trans, enum btree_id id,
+ struct bpos pos, bool key_cache) {}
#endif
-void bch2_btree_iter_fix_key_modified(struct btree_iter *, struct btree *,
- struct bkey_packed *);
-void bch2_btree_node_iter_fix(struct btree_iter *, struct btree *,
- struct btree_node_iter *, struct bkey_packed *,
- unsigned, unsigned);
+void bch2_btree_path_fix_key_modified(struct btree_trans *trans,
+ struct btree *, struct bkey_packed *);
+void bch2_btree_node_iter_fix(struct btree_trans *trans, struct btree_path *,
+ struct btree *, struct btree_node_iter *,
+ struct bkey_packed *, unsigned, unsigned);
+
+bool bch2_btree_path_relock_intent(struct btree_trans *, struct btree_path *);
+
+void bch2_path_put(struct btree_trans *, struct btree_path *, bool);
bool bch2_trans_relock(struct btree_trans *);
void bch2_trans_unlock(struct btree_trans *);
-bool __bch2_btree_iter_upgrade(struct btree_iter *, unsigned);
+__always_inline
+static inline int btree_trans_restart(struct btree_trans *trans)
+{
+ trans->restarted = true;
+ bch2_trans_unlock(trans);
+ return -EINTR;
+}
+
+bool bch2_btree_node_upgrade(struct btree_trans *,
+ struct btree_path *, unsigned);
+
+bool __bch2_btree_path_upgrade(struct btree_trans *,
+ struct btree_path *, unsigned);
-static inline bool bch2_btree_iter_upgrade(struct btree_iter *iter,
+static inline bool bch2_btree_path_upgrade(struct btree_trans *trans,
+ struct btree_path *path,
unsigned new_locks_want)
{
new_locks_want = min(new_locks_want, BTREE_MAX_DEPTH);
- return iter->locks_want < new_locks_want
- ? __bch2_btree_iter_upgrade(iter, new_locks_want)
- : iter->uptodate <= BTREE_ITER_NEED_PEEK;
+ return path->locks_want < new_locks_want
+ ? __bch2_btree_path_upgrade(trans, path, new_locks_want)
+ : path->uptodate == BTREE_ITER_UPTODATE;
}
-void __bch2_btree_iter_downgrade(struct btree_iter *, unsigned);
+void __bch2_btree_path_downgrade(struct btree_path *, unsigned);
-static inline void bch2_btree_iter_downgrade(struct btree_iter *iter)
+static inline void bch2_btree_path_downgrade(struct btree_path *path)
{
- unsigned new_locks_want = (iter->flags & BTREE_ITER_INTENT ? 1 : 0);
+ unsigned new_locks_want = path->level + !!path->intent_ref;
- if (iter->locks_want > new_locks_want)
- __bch2_btree_iter_downgrade(iter, new_locks_want);
+ if (path->locks_want > new_locks_want)
+ __bch2_btree_path_downgrade(path, new_locks_want);
}
void bch2_trans_downgrade(struct btree_trans *);
-void bch2_btree_iter_node_replace(struct btree_iter *, struct btree *);
-void bch2_btree_iter_node_drop(struct btree_iter *, struct btree *);
-
-void bch2_btree_iter_reinit_node(struct btree_iter *, struct btree *);
+void bch2_trans_node_add(struct btree_trans *trans, struct btree *);
+void bch2_trans_node_reinit_iter(struct btree_trans *, struct btree *);
+int __must_check __bch2_btree_iter_traverse(struct btree_iter *iter);
int __must_check bch2_btree_iter_traverse(struct btree_iter *);
-int bch2_btree_iter_traverse_all(struct btree_trans *);
-
struct btree *bch2_btree_iter_peek_node(struct btree_iter *);
struct btree *bch2_btree_iter_next_node(struct btree_iter *);
struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *);
struct bkey_s_c bch2_btree_iter_next(struct btree_iter *);
-struct bkey_s_c bch2_btree_iter_peek_with_updates(struct btree_iter *);
-struct bkey_s_c bch2_btree_iter_next_with_updates(struct btree_iter *);
-
struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *);
struct bkey_s_c bch2_btree_iter_prev(struct btree_iter *);
@@ -163,8 +220,6 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *);
struct bkey_s_c bch2_btree_iter_next_slot(struct btree_iter *);
struct bkey_s_c bch2_btree_iter_prev_slot(struct btree_iter *);
-struct bkey_s_c bch2_btree_iter_peek_cached(struct btree_iter *);
-
bool bch2_btree_iter_advance(struct btree_iter *);
bool bch2_btree_iter_rewind(struct btree_iter *);
@@ -178,151 +233,130 @@ static inline void bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos
iter->k.p.offset = iter->pos.offset = new_pos.offset;
iter->k.p.snapshot = iter->pos.snapshot = new_pos.snapshot;
iter->k.size = 0;
- iter->should_be_locked = false;
}
-/* Sort order for locking btree iterators: */
-static inline int btree_iter_lock_cmp(const struct btree_iter *l,
- const struct btree_iter *r)
+static inline void bch2_btree_iter_set_pos_to_extent_start(struct btree_iter *iter)
{
- return cmp_int(l->btree_id, r->btree_id) ?:
- -cmp_int(btree_iter_is_cached(l), btree_iter_is_cached(r)) ?:
- bkey_cmp(l->real_pos, r->real_pos);
+ BUG_ON(!(iter->flags & BTREE_ITER_IS_EXTENTS));
+ iter->pos = bkey_start_pos(&iter->k);
}
-/*
- * Unlocks before scheduling
- * Note: does not revalidate iterator
- */
-static inline int bch2_trans_cond_resched(struct btree_trans *trans)
+static inline void bch2_btree_iter_set_snapshot(struct btree_iter *iter, u32 snapshot)
{
- if (need_resched() || race_fault()) {
- bch2_trans_unlock(trans);
- schedule();
- return bch2_trans_relock(trans) ? 0 : -EINTR;
- } else {
- return 0;
- }
-}
+ struct bpos pos = iter->pos;
-#define __for_each_btree_node(_trans, _iter, _btree_id, _start, \
- _locks_want, _depth, _flags, _b) \
- for (iter = bch2_trans_get_node_iter((_trans), (_btree_id), \
- _start, _locks_want, _depth, _flags), \
- _b = bch2_btree_iter_peek_node(_iter); \
- (_b); \
- (_b) = bch2_btree_iter_next_node(_iter))
-
-#define for_each_btree_node(_trans, _iter, _btree_id, _start, \
- _flags, _b) \
- __for_each_btree_node(_trans, _iter, _btree_id, _start, \
- 0, 0, _flags, _b)
-
-static inline struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter,
- unsigned flags)
-{
- if ((flags & BTREE_ITER_TYPE) == BTREE_ITER_CACHED)
- return bch2_btree_iter_peek_cached(iter);
- else
- return flags & BTREE_ITER_SLOTS
- ? bch2_btree_iter_peek_slot(iter)
- : bch2_btree_iter_peek(iter);
+ iter->snapshot = snapshot;
+ pos.snapshot = snapshot;
+ bch2_btree_iter_set_pos(iter, pos);
}
-static inline struct bkey_s_c __bch2_btree_iter_next(struct btree_iter *iter,
- unsigned flags)
-{
- return flags & BTREE_ITER_SLOTS
- ? bch2_btree_iter_next_slot(iter)
- : bch2_btree_iter_next(iter);
-}
+void bch2_trans_iter_exit(struct btree_trans *, struct btree_iter *);
+void bch2_trans_iter_init(struct btree_trans *, struct btree_iter *,
+ unsigned, struct bpos, unsigned);
+void bch2_trans_node_iter_init(struct btree_trans *, struct btree_iter *,
+ enum btree_id, struct bpos,
+ unsigned, unsigned, unsigned);
+void bch2_trans_copy_iter(struct btree_iter *, struct btree_iter *);
-static inline int bkey_err(struct bkey_s_c k)
+static inline void set_btree_iter_dontneed(struct btree_iter *iter)
{
- return PTR_ERR_OR_ZERO(k.k);
+ iter->path->preserve = false;
}
-#define for_each_btree_key(_trans, _iter, _btree_id, \
- _start, _flags, _k, _ret) \
- for ((_iter) = bch2_trans_get_iter((_trans), (_btree_id), \
- (_start), (_flags)), \
- (_k) = __bch2_btree_iter_peek(_iter, _flags); \
- !((_ret) = bkey_err(_k)) && (_k).k; \
- (_k) = __bch2_btree_iter_next(_iter, _flags))
+void *bch2_trans_kmalloc(struct btree_trans *, size_t);
+void bch2_trans_begin(struct btree_trans *);
-#define for_each_btree_key_continue(_iter, _flags, _k, _ret) \
- for ((_k) = __bch2_btree_iter_peek(_iter, _flags); \
- !((_ret) = bkey_err(_k)) && (_k).k; \
- (_k) = __bch2_btree_iter_next(_iter, _flags))
+static inline struct btree *
+__btree_iter_peek_node_and_restart(struct btree_trans *trans, struct btree_iter *iter)
+{
+ struct btree *b;
-/* new multiple iterator interface: */
+ while (b = bch2_btree_iter_peek_node(iter),
+ PTR_ERR_OR_ZERO(b) == -EINTR)
+ bch2_trans_begin(trans);
-int bch2_trans_iter_put(struct btree_trans *, struct btree_iter *);
-int bch2_trans_iter_free(struct btree_trans *, struct btree_iter *);
+ return b;
+}
-void bch2_trans_unlink_iters(struct btree_trans *);
+#define __for_each_btree_node(_trans, _iter, _btree_id, _start, \
+ _locks_want, _depth, _flags, _b, _ret) \
+ for (bch2_trans_node_iter_init((_trans), &(_iter), (_btree_id), \
+ _start, _locks_want, _depth, _flags); \
+ (_b) = __btree_iter_peek_node_and_restart((_trans), &(_iter)),\
+ !((_ret) = PTR_ERR_OR_ZERO(_b)) && (_b); \
+ (_b) = bch2_btree_iter_next_node(&(_iter)))
-struct btree_iter *__bch2_trans_get_iter(struct btree_trans *, enum btree_id,
- struct bpos, unsigned,
- unsigned, unsigned);
+#define for_each_btree_node(_trans, _iter, _btree_id, _start, \
+ _flags, _b, _ret) \
+ __for_each_btree_node(_trans, _iter, _btree_id, _start, \
+ 0, 0, _flags, _b, _ret)
-static inline struct btree_iter *
-bch2_trans_get_iter(struct btree_trans *trans, enum btree_id btree_id,
- struct bpos pos, unsigned flags)
+static inline int bkey_err(struct bkey_s_c k)
{
- struct btree_iter *iter =
- __bch2_trans_get_iter(trans, btree_id, pos,
- (flags & BTREE_ITER_INTENT) != 0, 0,
- flags);
- iter->ip_allocated = _THIS_IP_;
- return iter;
+ return PTR_ERR_OR_ZERO(k.k);
}
-struct btree_iter *__bch2_trans_copy_iter(struct btree_trans *,
- struct btree_iter *);
-static inline struct btree_iter *
-bch2_trans_copy_iter(struct btree_trans *trans, struct btree_iter *src)
+static inline struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter,
+ unsigned flags)
{
- struct btree_iter *iter =
- __bch2_trans_copy_iter(trans, src);
-
- iter->ip_allocated = _THIS_IP_;
- return iter;
+ return flags & BTREE_ITER_SLOTS
+ ? bch2_btree_iter_peek_slot(iter)
+ : bch2_btree_iter_peek(iter);
}
-struct btree_iter *bch2_trans_get_node_iter(struct btree_trans *,
- enum btree_id, struct bpos,
- unsigned, unsigned, unsigned);
-
-static inline bool btree_iter_live(struct btree_trans *trans, struct btree_iter *iter)
+static inline int btree_trans_too_many_iters(struct btree_trans *trans)
{
- return (trans->iters_live & (1ULL << iter->idx)) != 0;
+ return hweight64(trans->paths_allocated) > BTREE_ITER_MAX / 2
+ ? -EINTR : 0;
}
-static inline bool btree_iter_keep(struct btree_trans *trans, struct btree_iter *iter)
+static inline struct bkey_s_c
+__bch2_btree_iter_peek_and_restart(struct btree_trans *trans,
+ struct btree_iter *iter, unsigned flags)
{
- return btree_iter_live(trans, iter) ||
- (iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT);
-}
+ struct bkey_s_c k;
-static inline void set_btree_iter_dontneed(struct btree_trans *trans, struct btree_iter *iter)
-{
- trans->iters_touched &= ~(1ULL << iter->idx);
+ while (btree_trans_too_many_iters(trans) ||
+ (k = __bch2_btree_iter_peek(iter, flags),
+ bkey_err(k) == -EINTR))
+ bch2_trans_begin(trans);
+
+ return k;
}
-#define TRANS_RESET_NOTRAVERSE (1 << 0)
-#define TRANS_RESET_NOUNLOCK (1 << 1)
+#define for_each_btree_key(_trans, _iter, _btree_id, \
+ _start, _flags, _k, _ret) \
+ for (bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \
+ (_start), (_flags)); \
+ (_k) = __bch2_btree_iter_peek_and_restart((_trans), &(_iter), _flags),\
+ !((_ret) = bkey_err(_k)) && (_k).k; \
+ bch2_btree_iter_advance(&(_iter)))
+
+#define for_each_btree_key_norestart(_trans, _iter, _btree_id, \
+ _start, _flags, _k, _ret) \
+ for (bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \
+ (_start), (_flags)); \
+ (_k) = __bch2_btree_iter_peek(&(_iter), _flags), \
+ !((_ret) = bkey_err(_k)) && (_k).k; \
+ bch2_btree_iter_advance(&(_iter)))
-void bch2_trans_reset(struct btree_trans *, unsigned);
+#define for_each_btree_key_continue(_trans, _iter, _flags, _k, _ret) \
+ for (; \
+ (_k) = __bch2_btree_iter_peek_and_restart((_trans), &(_iter), _flags),\
+ !((_ret) = bkey_err(_k)) && (_k).k; \
+ bch2_btree_iter_advance(&(_iter)))
-static inline void bch2_trans_begin(struct btree_trans *trans)
-{
- return bch2_trans_reset(trans, 0);
-}
+#define for_each_btree_key_continue_norestart(_iter, _flags, _k, _ret) \
+ for (; \
+ (_k) = __bch2_btree_iter_peek(&(_iter), _flags), \
+ !((_ret) = bkey_err(_k)) && (_k).k; \
+ bch2_btree_iter_advance(&(_iter)))
-void *bch2_trans_kmalloc(struct btree_trans *, size_t);
+/* new multiple iterator interface: */
+
+void bch2_dump_trans_paths_updates(struct btree_trans *);
void bch2_trans_init(struct btree_trans *, struct bch_fs *, unsigned, size_t);
-int bch2_trans_exit(struct btree_trans *);
+void bch2_trans_exit(struct btree_trans *);
void bch2_btree_trans_to_text(struct printbuf *, struct bch_fs *);
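The header above replaces the old heap-allocated iterators (bch2_trans_get_iter()/bch2_trans_iter_put()) with on-stack struct btree_iter, initialized by bch2_trans_iter_init() and released by bch2_trans_iter_exit(), and for_each_btree_key() now absorbs -EINTR transaction restarts via __bch2_btree_iter_peek_and_restart(). A minimal caller sketch using only the declarations above; the btree id, start position and loop body are placeholders, not taken from this patch:

	struct btree_trans trans;
	struct btree_iter iter;
	struct bkey_s_c k;
	int ret;

	bch2_trans_init(&trans, c, 0, 0);

	/* the iterator lives on the stack; the macro restarts the
	 * transaction internally when peek returns -EINTR: */
	for_each_btree_key(&trans, iter, BTREE_ID_extents, POS_MIN, 0, k, ret) {
		/* ... inspect k ... */
	}
	bch2_trans_iter_exit(&trans, &iter);

	bch2_trans_exit(&trans);	/* now returns void */
	if (ret)
		return ret;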
diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c
index 017c4f55fdaa..4f1bc1d165aa 100644
--- a/fs/bcachefs/btree_key_cache.c
+++ b/fs/bcachefs/btree_key_cache.c
@@ -163,6 +163,11 @@ btree_key_cache_create(struct btree_key_cache *c,
was_new = false;
}
+ if (btree_id == BTREE_ID_subvolumes)
+ six_lock_pcpu_alloc(&ck->c.lock);
+ else
+ six_lock_pcpu_free(&ck->c.lock);
+
ck->c.level = 0;
ck->c.btree_id = btree_id;
ck->key.btree_id = btree_id;
@@ -196,25 +201,25 @@ btree_key_cache_create(struct btree_key_cache *c,
}
static int btree_key_cache_fill(struct btree_trans *trans,
- struct btree_iter *ck_iter,
+ struct btree_path *ck_path,
struct bkey_cached *ck)
{
- struct btree_iter *iter;
+ struct btree_iter iter;
struct bkey_s_c k;
unsigned new_u64s = 0;
struct bkey_i *new_k = NULL;
int ret;
- iter = bch2_trans_get_iter(trans, ck->key.btree_id,
- ck->key.pos, BTREE_ITER_SLOTS);
- k = bch2_btree_iter_peek_slot(iter);
+ bch2_trans_iter_init(trans, &iter, ck->key.btree_id,
+ ck->key.pos, BTREE_ITER_SLOTS);
+ k = bch2_btree_iter_peek_slot(&iter);
ret = bkey_err(k);
if (ret)
goto err;
- if (!bch2_btree_node_relock(ck_iter, 0)) {
+ if (!bch2_btree_node_relock(trans, ck_path, 0)) {
trace_transaction_restart_ip(trans->ip, _THIS_IP_);
- ret = -EINTR;
+ ret = btree_trans_restart(trans);
goto err;
}
@@ -233,7 +238,11 @@ static int btree_key_cache_fill(struct btree_trans *trans,
}
}
- bch2_btree_node_lock_write(ck_iter->l[0].b, ck_iter);
+ /*
+ * XXX: not allowed to be holding read locks when we take a write lock,
+ * currently
+ */
+ bch2_btree_node_lock_write(trans, ck_path, ck_path->l[0].b);
if (new_k) {
kfree(ck->k);
ck->u64s = new_u64s;
@@ -242,93 +251,93 @@ static int btree_key_cache_fill(struct btree_trans *trans,
bkey_reassemble(ck->k, k);
ck->valid = true;
- bch2_btree_node_unlock_write(ck_iter->l[0].b, ck_iter);
+ bch2_btree_node_unlock_write(trans, ck_path, ck_path->l[0].b);
/* We're not likely to need this iterator again: */
- set_btree_iter_dontneed(trans, iter);
+ set_btree_iter_dontneed(&iter);
err:
- bch2_trans_iter_put(trans, iter);
+ bch2_trans_iter_exit(trans, &iter);
return ret;
}
static int bkey_cached_check_fn(struct six_lock *lock, void *p)
{
struct bkey_cached *ck = container_of(lock, struct bkey_cached, c.lock);
- const struct btree_iter *iter = p;
+ const struct btree_path *path = p;
- return ck->key.btree_id == iter->btree_id &&
- !bpos_cmp(ck->key.pos, iter->pos) ? 0 : -1;
+ return ck->key.btree_id == path->btree_id &&
+ !bpos_cmp(ck->key.pos, path->pos) ? 0 : -1;
}
__flatten
-int bch2_btree_iter_traverse_cached(struct btree_iter *iter)
+int bch2_btree_path_traverse_cached(struct btree_trans *trans, struct btree_path *path,
+ unsigned flags)
{
- struct btree_trans *trans = iter->trans;
struct bch_fs *c = trans->c;
struct bkey_cached *ck;
int ret = 0;
- BUG_ON(iter->level);
+ BUG_ON(path->level);
+
+ path->l[1].b = NULL;
- if (btree_node_locked(iter, 0)) {
- ck = (void *) iter->l[0].b;
+ if (bch2_btree_node_relock(trans, path, 0)) {
+ ck = (void *) path->l[0].b;
goto fill;
}
retry:
- ck = bch2_btree_key_cache_find(c, iter->btree_id, iter->pos);
+ ck = bch2_btree_key_cache_find(c, path->btree_id, path->pos);
if (!ck) {
- if (iter->flags & BTREE_ITER_CACHED_NOCREATE) {
- iter->l[0].b = NULL;
+ if (flags & BTREE_ITER_CACHED_NOCREATE) {
+ path->l[0].b = NULL;
return 0;
}
ck = btree_key_cache_create(&c->btree_key_cache,
- iter->btree_id, iter->pos);
+ path->btree_id, path->pos);
ret = PTR_ERR_OR_ZERO(ck);
if (ret)
goto err;
if (!ck)
goto retry;
- mark_btree_node_locked(iter, 0, SIX_LOCK_intent);
- iter->locks_want = 1;
+ mark_btree_node_locked(path, 0, SIX_LOCK_intent);
+ path->locks_want = 1;
} else {
- enum six_lock_type lock_want = __btree_lock_want(iter, 0);
+ enum six_lock_type lock_want = __btree_lock_want(path, 0);
- if (!btree_node_lock((void *) ck, iter->pos, 0, iter, lock_want,
- bkey_cached_check_fn, iter, _THIS_IP_)) {
- if (ck->key.btree_id != iter->btree_id ||
- bpos_cmp(ck->key.pos, iter->pos)) {
+ if (!btree_node_lock(trans, path, (void *) ck, path->pos, 0,
+ lock_want,
+ bkey_cached_check_fn, path, _THIS_IP_)) {
+ if (!trans->restarted)
goto retry;
- }
trace_transaction_restart_ip(trans->ip, _THIS_IP_);
ret = -EINTR;
goto err;
}
- if (ck->key.btree_id != iter->btree_id ||
- bpos_cmp(ck->key.pos, iter->pos)) {
+ if (ck->key.btree_id != path->btree_id ||
+ bpos_cmp(ck->key.pos, path->pos)) {
six_unlock_type(&ck->c.lock, lock_want);
goto retry;
}
- mark_btree_node_locked(iter, 0, lock_want);
+ mark_btree_node_locked(path, 0, lock_want);
}
- iter->l[0].lock_seq = ck->c.lock.state.seq;
- iter->l[0].b = (void *) ck;
+ path->l[0].lock_seq = ck->c.lock.state.seq;
+ path->l[0].b = (void *) ck;
fill:
- if (!ck->valid && !(iter->flags & BTREE_ITER_CACHED_NOFILL)) {
- if (!btree_node_intent_locked(iter, 0))
- bch2_btree_iter_upgrade(iter, 1);
- if (!btree_node_intent_locked(iter, 0)) {
+ if (!ck->valid && !(flags & BTREE_ITER_CACHED_NOFILL)) {
+ if (!path->locks_want &&
+ !__bch2_btree_path_upgrade(trans, path, 1)) {
trace_transaction_restart_ip(trans->ip, _THIS_IP_);
- ret = -EINTR;
+ ret = btree_trans_restart(trans);
goto err;
}
- ret = btree_key_cache_fill(trans, iter, ck);
+ ret = btree_key_cache_fill(trans, path, ck);
if (ret)
goto err;
}
@@ -336,21 +345,14 @@ fill:
if (!test_bit(BKEY_CACHED_ACCESSED, &ck->flags))
set_bit(BKEY_CACHED_ACCESSED, &ck->flags);
- iter->uptodate = BTREE_ITER_NEED_PEEK;
-
- if (!(iter->flags & BTREE_ITER_INTENT))
- bch2_btree_iter_downgrade(iter);
- else if (!iter->locks_want) {
- if (!__bch2_btree_iter_upgrade(iter, 1))
- ret = -EINTR;
- }
+ path->uptodate = BTREE_ITER_UPTODATE;
+ BUG_ON(btree_node_locked_type(path, 0) != btree_lock_want(path, 0));
return ret;
err:
if (ret != -EINTR) {
- btree_node_unlock(iter, 0);
- iter->flags |= BTREE_ITER_ERROR;
- iter->l[0].b = BTREE_ITER_NO_NODE_ERROR;
+ btree_node_unlock(path, 0);
+ path->l[0].b = BTREE_ITER_NO_NODE_ERROR;
}
return ret;
}
@@ -363,24 +365,24 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans,
{
struct bch_fs *c = trans->c;
struct journal *j = &c->journal;
- struct btree_iter *c_iter = NULL, *b_iter = NULL;
+ struct btree_iter c_iter, b_iter;
struct bkey_cached *ck = NULL;
int ret;
- b_iter = bch2_trans_get_iter(trans, key.btree_id, key.pos,
- BTREE_ITER_SLOTS|
- BTREE_ITER_INTENT);
- c_iter = bch2_trans_get_iter(trans, key.btree_id, key.pos,
- BTREE_ITER_CACHED|
- BTREE_ITER_CACHED_NOFILL|
- BTREE_ITER_CACHED_NOCREATE|
- BTREE_ITER_INTENT);
-retry:
- ret = bch2_btree_iter_traverse(c_iter);
+ bch2_trans_iter_init(trans, &b_iter, key.btree_id, key.pos,
+ BTREE_ITER_SLOTS|
+ BTREE_ITER_INTENT|
+ BTREE_ITER_ALL_SNAPSHOTS);
+ bch2_trans_iter_init(trans, &c_iter, key.btree_id, key.pos,
+ BTREE_ITER_CACHED|
+ BTREE_ITER_CACHED_NOFILL|
+ BTREE_ITER_CACHED_NOCREATE|
+ BTREE_ITER_INTENT);
+ ret = bch2_btree_iter_traverse(&c_iter);
if (ret)
- goto err;
+ goto out;
- ck = (void *) c_iter->l[0].b;
+ ck = (void *) c_iter.path->l[0].b;
if (!ck ||
(journal_seq && ck->journal.seq != journal_seq))
goto out;
@@ -396,10 +398,11 @@ retry:
* allocator/copygc depend on journal reclaim making progress, we need
* to be using alloc reserves:
* */
- ret = bch2_btree_iter_traverse(b_iter) ?:
- bch2_trans_update(trans, b_iter, ck->k, BTREE_TRIGGER_NORUN) ?:
+ ret = bch2_btree_iter_traverse(&b_iter) ?:
+ bch2_trans_update(trans, &b_iter, ck->k,
+ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|
+ BTREE_TRIGGER_NORUN) ?:
bch2_trans_commit(trans, NULL, NULL,
- BTREE_INSERT_NOUNLOCK|
BTREE_INSERT_NOCHECK_RW|
BTREE_INSERT_NOFAIL|
BTREE_INSERT_USE_RESERVE|
@@ -407,15 +410,10 @@ retry:
? BTREE_INSERT_JOURNAL_RESERVED
: 0)|
commit_flags);
-err:
- if (ret == -EINTR)
- goto retry;
-
- if (ret == -EAGAIN)
- goto out;
-
if (ret) {
- bch2_fs_fatal_err_on(!bch2_journal_error(j), c,
+ bch2_fs_fatal_err_on(ret != -EINTR &&
+ ret != -EAGAIN &&
+ !bch2_journal_error(j), c,
"error flushing key cache: %i", ret);
goto out;
}
@@ -423,7 +421,7 @@ err:
bch2_journal_pin_drop(j, &ck->journal);
bch2_journal_preres_put(j, &ck->res);
- BUG_ON(!btree_node_locked(c_iter, 0));
+ BUG_ON(!btree_node_locked(c_iter.path, 0));
if (!evict) {
if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
@@ -432,10 +430,10 @@ err:
}
} else {
evict:
- BUG_ON(!btree_node_intent_locked(c_iter, 0));
+ BUG_ON(!btree_node_intent_locked(c_iter.path, 0));
- mark_btree_node_unlocked(c_iter, 0);
- c_iter->l[0].b = NULL;
+ mark_btree_node_unlocked(c_iter.path, 0);
+ c_iter.path->l[0].b = NULL;
six_lock_write(&ck->c.lock, NULL, NULL);
@@ -451,8 +449,8 @@ evict:
mutex_unlock(&c->btree_key_cache.lock);
}
out:
- bch2_trans_iter_put(trans, b_iter);
- bch2_trans_iter_put(trans, c_iter);
+ bch2_trans_iter_exit(trans, &b_iter);
+ bch2_trans_iter_exit(trans, &c_iter);
return ret;
}
@@ -463,7 +461,6 @@ int bch2_btree_key_cache_journal_flush(struct journal *j,
struct bkey_cached *ck =
container_of(pin, struct bkey_cached, journal);
struct bkey_cached_key key;
- struct btree_trans trans;
int ret = 0;
int srcu_idx = srcu_read_lock(&c->btree_trans_barrier);
@@ -478,10 +475,9 @@ int bch2_btree_key_cache_journal_flush(struct journal *j,
}
six_unlock_read(&ck->c.lock);
- bch2_trans_init(&trans, c, 0, 0);
- ret = btree_key_cache_flush_pos(&trans, key, seq,
- BTREE_INSERT_JOURNAL_RECLAIM, false);
- bch2_trans_exit(&trans);
+ ret = bch2_trans_do(c, NULL, NULL, 0,
+ btree_key_cache_flush_pos(&trans, key, seq,
+ BTREE_INSERT_JOURNAL_RECLAIM, false));
unlock:
srcu_read_unlock(&c->btree_trans_barrier, srcu_idx);
@@ -505,11 +501,11 @@ int bch2_btree_key_cache_flush(struct btree_trans *trans,
}
bool bch2_btree_insert_key_cached(struct btree_trans *trans,
- struct btree_iter *iter,
+ struct btree_path *path,
struct bkey_i *insert)
{
struct bch_fs *c = trans->c;
- struct bkey_cached *ck = (void *) iter->l[0].b;
+ struct bkey_cached *ck = (void *) path->l[0].b;
bool kick_reclaim = false;
BUG_ON(insert->u64s > ck->u64s);
@@ -602,7 +598,7 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink,
do {
struct rhash_head *pos, *next;
- pos = *rht_bucket(tbl, bc->shrink_iter);
+ pos = rht_ptr_rcu(rht_bucket(tbl, bc->shrink_iter));
while (!rht_is_a_nulls(pos)) {
next = rht_dereference_bucket_rcu(pos->next, tbl, bc->shrink_iter);
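The conversions in this file all follow the restart discipline introduced by btree_trans_restart(): a helper that fails to take a lock marks the transaction restarted and returns -EINTR, and only the outermost caller retries by going back through bch2_trans_begin(). A rough sketch of that contract; some_helper is a hypothetical stand-in for functions like btree_key_cache_fill() above, and the path is assumed to have been traversed already:

	/* inside a helper holding a traversed path: */
	if (!bch2_btree_node_relock(trans, path, 0))
		return btree_trans_restart(trans); /* unlocks, sets trans->restarted, returns -EINTR */

	/* at the outermost caller: */
	do {
		bch2_trans_begin(&trans);	/* resets the transaction for a retry */
		ret = some_helper(&trans, pos);
	} while (ret == -EINTR);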
diff --git a/fs/bcachefs/btree_key_cache.h b/fs/bcachefs/btree_key_cache.h
index 7e2b0a08f745..0768ef3ca776 100644
--- a/fs/bcachefs/btree_key_cache.h
+++ b/fs/bcachefs/btree_key_cache.h
@@ -26,10 +26,11 @@ int bch2_btree_key_cache_journal_flush(struct journal *,
struct bkey_cached *
bch2_btree_key_cache_find(struct bch_fs *, enum btree_id, struct bpos);
-int bch2_btree_iter_traverse_cached(struct btree_iter *);
+int bch2_btree_path_traverse_cached(struct btree_trans *, struct btree_path *,
+ unsigned);
bool bch2_btree_insert_key_cached(struct btree_trans *,
- struct btree_iter *, struct bkey_i *);
+ struct btree_path *, struct bkey_i *);
int bch2_btree_key_cache_flush(struct btree_trans *,
enum btree_id, struct bpos);
#ifdef CONFIG_BCACHEFS_DEBUG
diff --git a/fs/bcachefs/btree_locking.h b/fs/bcachefs/btree_locking.h
index 7532bcdef967..d599008c5fc1 100644
--- a/fs/bcachefs/btree_locking.h
+++ b/fs/bcachefs/btree_locking.h
@@ -21,7 +21,7 @@ enum btree_node_locked_type {
BTREE_NODE_INTENT_LOCKED = SIX_LOCK_intent,
};
-static inline int btree_node_locked_type(struct btree_iter *iter,
+static inline int btree_node_locked_type(struct btree_path *path,
unsigned level)
{
/*
@@ -30,35 +30,35 @@ static inline int btree_node_locked_type(struct btree_iter *iter,
* branches:
*/
return BTREE_NODE_UNLOCKED +
- ((iter->nodes_locked >> level) & 1) +
- ((iter->nodes_intent_locked >> level) & 1);
+ ((path->nodes_locked >> level) & 1) +
+ ((path->nodes_intent_locked >> level) & 1);
}
-static inline bool btree_node_intent_locked(struct btree_iter *iter,
+static inline bool btree_node_intent_locked(struct btree_path *path,
unsigned level)
{
- return btree_node_locked_type(iter, level) == BTREE_NODE_INTENT_LOCKED;
+ return btree_node_locked_type(path, level) == BTREE_NODE_INTENT_LOCKED;
}
-static inline bool btree_node_read_locked(struct btree_iter *iter,
+static inline bool btree_node_read_locked(struct btree_path *path,
unsigned level)
{
- return btree_node_locked_type(iter, level) == BTREE_NODE_READ_LOCKED;
+ return btree_node_locked_type(path, level) == BTREE_NODE_READ_LOCKED;
}
-static inline bool btree_node_locked(struct btree_iter *iter, unsigned level)
+static inline bool btree_node_locked(struct btree_path *path, unsigned level)
{
- return iter->nodes_locked & (1 << level);
+ return path->nodes_locked & (1 << level);
}
-static inline void mark_btree_node_unlocked(struct btree_iter *iter,
+static inline void mark_btree_node_unlocked(struct btree_path *path,
unsigned level)
{
- iter->nodes_locked &= ~(1 << level);
- iter->nodes_intent_locked &= ~(1 << level);
+ path->nodes_locked &= ~(1 << level);
+ path->nodes_intent_locked &= ~(1 << level);
}
-static inline void mark_btree_node_locked(struct btree_iter *iter,
+static inline void mark_btree_node_locked(struct btree_path *path,
unsigned level,
enum six_lock_type type)
{
@@ -66,52 +66,52 @@ static inline void mark_btree_node_locked(struct btree_iter *iter,
BUILD_BUG_ON(SIX_LOCK_read != 0);
BUILD_BUG_ON(SIX_LOCK_intent != 1);
- iter->nodes_locked |= 1 << level;
- iter->nodes_intent_locked |= type << level;
+ path->nodes_locked |= 1 << level;
+ path->nodes_intent_locked |= type << level;
}
-static inline void mark_btree_node_intent_locked(struct btree_iter *iter,
+static inline void mark_btree_node_intent_locked(struct btree_path *path,
unsigned level)
{
- mark_btree_node_locked(iter, level, SIX_LOCK_intent);
+ mark_btree_node_locked(path, level, SIX_LOCK_intent);
}
-static inline enum six_lock_type __btree_lock_want(struct btree_iter *iter, int level)
+static inline enum six_lock_type __btree_lock_want(struct btree_path *path, int level)
{
- return level < iter->locks_want
+ return level < path->locks_want
? SIX_LOCK_intent
: SIX_LOCK_read;
}
static inline enum btree_node_locked_type
-btree_lock_want(struct btree_iter *iter, int level)
+btree_lock_want(struct btree_path *path, int level)
{
- if (level < iter->level)
+ if (level < path->level)
return BTREE_NODE_UNLOCKED;
- if (level < iter->locks_want)
+ if (level < path->locks_want)
return BTREE_NODE_INTENT_LOCKED;
- if (level == iter->level)
+ if (level == path->level)
return BTREE_NODE_READ_LOCKED;
return BTREE_NODE_UNLOCKED;
}
-static inline void btree_node_unlock(struct btree_iter *iter, unsigned level)
+static inline void btree_node_unlock(struct btree_path *path, unsigned level)
{
- int lock_type = btree_node_locked_type(iter, level);
+ int lock_type = btree_node_locked_type(path, level);
EBUG_ON(level >= BTREE_MAX_DEPTH);
if (lock_type != BTREE_NODE_UNLOCKED)
- six_unlock_type(&iter->l[level].b->c.lock, lock_type);
- mark_btree_node_unlocked(iter, level);
+ six_unlock_type(&path->l[level].b->c.lock, lock_type);
+ mark_btree_node_unlocked(path, level);
}
-static inline void __bch2_btree_iter_unlock(struct btree_iter *iter)
+static inline void __bch2_btree_path_unlock(struct btree_path *path)
{
- btree_iter_set_dirty(iter, BTREE_ITER_NEED_RELOCK);
+ btree_path_set_dirty(path, BTREE_ITER_NEED_RELOCK);
- while (iter->nodes_locked)
- btree_node_unlock(iter, __ffs(iter->nodes_locked));
+ while (path->nodes_locked)
+ btree_node_unlock(path, __ffs(path->nodes_locked));
}
static inline enum bch_time_stats lock_to_time_stat(enum six_lock_type type)
@@ -155,11 +155,11 @@ static inline bool btree_node_lock_increment(struct btree_trans *trans,
struct btree *b, unsigned level,
enum btree_node_locked_type want)
{
- struct btree_iter *iter;
+ struct btree_path *path;
- trans_for_each_iter(trans, iter)
- if (iter->l[level].b == b &&
- btree_node_locked_type(iter, level) >= want) {
+ trans_for_each_path(trans, path)
+ if (path->l[level].b == b &&
+ btree_node_locked_type(path, level) >= want) {
six_lock_increment(&b->c.lock, want);
return true;
}
@@ -167,40 +167,39 @@ static inline bool btree_node_lock_increment(struct btree_trans *trans,
return false;
}
-bool __bch2_btree_node_lock(struct btree *, struct bpos, unsigned,
- struct btree_iter *, enum six_lock_type,
+bool __bch2_btree_node_lock(struct btree_trans *, struct btree_path *,
+ struct btree *, struct bpos, unsigned,
+ enum six_lock_type,
six_lock_should_sleep_fn, void *,
unsigned long);
-static inline bool btree_node_lock(struct btree *b,
- struct bpos pos, unsigned level,
- struct btree_iter *iter,
+static inline bool btree_node_lock(struct btree_trans *trans,
+ struct btree_path *path,
+ struct btree *b, struct bpos pos, unsigned level,
enum six_lock_type type,
six_lock_should_sleep_fn should_sleep_fn, void *p,
unsigned long ip)
{
- struct btree_trans *trans = iter->trans;
-
EBUG_ON(level >= BTREE_MAX_DEPTH);
- EBUG_ON(!(trans->iters_linked & (1ULL << iter->idx)));
+ EBUG_ON(!(trans->paths_allocated & (1ULL << path->idx)));
return likely(six_trylock_type(&b->c.lock, type)) ||
btree_node_lock_increment(trans, b, level, type) ||
- __bch2_btree_node_lock(b, pos, level, iter, type,
+ __bch2_btree_node_lock(trans, path, b, pos, level, type,
should_sleep_fn, p, ip);
}
-bool __bch2_btree_node_relock(struct btree_iter *, unsigned);
+bool __bch2_btree_node_relock(struct btree_trans *, struct btree_path *, unsigned);
-static inline bool bch2_btree_node_relock(struct btree_iter *iter,
- unsigned level)
+static inline bool bch2_btree_node_relock(struct btree_trans *trans,
+ struct btree_path *path, unsigned level)
{
- EBUG_ON(btree_node_locked(iter, level) &&
- btree_node_locked_type(iter, level) !=
- __btree_lock_want(iter, level));
+ EBUG_ON(btree_node_locked(path, level) &&
+ btree_node_locked_type(path, level) !=
+ __btree_lock_want(path, level));
- return likely(btree_node_locked(iter, level)) ||
- __bch2_btree_node_relock(iter, level);
+ return likely(btree_node_locked(path, level)) ||
+ __bch2_btree_node_relock(trans, path, level);
}
/*
@@ -208,30 +207,35 @@ static inline bool bch2_btree_node_relock(struct btree_iter *iter,
* succeed:
*/
static inline void
-bch2_btree_node_unlock_write_inlined(struct btree *b, struct btree_iter *iter)
+bch2_btree_node_unlock_write_inlined(struct btree_trans *trans, struct btree_path *path,
+ struct btree *b)
{
- struct btree_iter *linked;
+ struct btree_path *linked;
- EBUG_ON(iter->l[b->c.level].b != b);
- EBUG_ON(iter->l[b->c.level].lock_seq + 1 != b->c.lock.state.seq);
+ EBUG_ON(path->l[b->c.level].b != b);
+ EBUG_ON(path->l[b->c.level].lock_seq + 1 != b->c.lock.state.seq);
- trans_for_each_iter_with_node(iter->trans, b, linked)
+ trans_for_each_path_with_node(trans, b, linked)
linked->l[b->c.level].lock_seq += 2;
six_unlock_write(&b->c.lock);
}
-void bch2_btree_node_unlock_write(struct btree *, struct btree_iter *);
+void bch2_btree_node_unlock_write(struct btree_trans *,
+ struct btree_path *, struct btree *);
-void __bch2_btree_node_lock_write(struct btree *, struct btree_iter *);
+void __bch2_btree_node_lock_write(struct btree_trans *, struct btree *);
-static inline void bch2_btree_node_lock_write(struct btree *b, struct btree_iter *iter)
+static inline void bch2_btree_node_lock_write(struct btree_trans *trans,
+ struct btree_path *path,
+ struct btree *b)
{
- EBUG_ON(iter->l[b->c.level].b != b);
- EBUG_ON(iter->l[b->c.level].lock_seq != b->c.lock.state.seq);
+ EBUG_ON(path->l[b->c.level].b != b);
+ EBUG_ON(path->l[b->c.level].lock_seq != b->c.lock.state.seq);
+ EBUG_ON(!btree_node_intent_locked(path, b->c.level));
if (unlikely(!six_trylock_write(&b->c.lock)))
- __bch2_btree_node_lock_write(b, iter);
+ __bch2_btree_node_lock_write(trans, b);
}
#endif /* _BCACHEFS_BTREE_LOCKING_H */
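btree_node_locked_type() above decodes the two per-level bitmasks arithmetically rather than with branches: a set bit in nodes_locked contributes 1, a set bit in nodes_intent_locked contributes another 1, on top of BTREE_NODE_UNLOCKED. A worked example, assuming a path holding a read lock at level 3 and an intent lock at level 0:

	path->nodes_locked        = (1 << 3) | (1 << 0);
	path->nodes_intent_locked = (1 << 0);

	btree_node_locked_type(path, 3);	/* UNLOCKED + 1 + 0 == BTREE_NODE_READ_LOCKED */
	btree_node_locked_type(path, 0);	/* UNLOCKED + 1 + 1 == BTREE_NODE_INTENT_LOCKED */
	btree_node_locked_type(path, 1);	/* UNLOCKED + 0 + 0 == BTREE_NODE_UNLOCKED */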
diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h
index bc0f482b53d2..2c2e2f794b8f 100644
--- a/fs/bcachefs/btree_types.h
+++ b/fs/bcachefs/btree_types.h
@@ -176,51 +176,45 @@ struct btree_node_iter {
} data[MAX_BSETS];
};
-enum btree_iter_type {
- BTREE_ITER_KEYS,
- BTREE_ITER_NODES,
- BTREE_ITER_CACHED,
-};
-
-#define BTREE_ITER_TYPE ((1 << 2) - 1)
-
/*
* Iterate over all possible positions, synthesizing deleted keys for holes:
*/
-#define BTREE_ITER_SLOTS (1 << 2)
+#define BTREE_ITER_SLOTS (1 << 0)
/*
* Indicates that intent locks should be taken on leaf nodes, because we expect
* to be doing updates:
*/
-#define BTREE_ITER_INTENT (1 << 3)
+#define BTREE_ITER_INTENT (1 << 1)
/*
* Causes the btree iterator code to prefetch additional btree nodes from disk:
*/
-#define BTREE_ITER_PREFETCH (1 << 4)
+#define BTREE_ITER_PREFETCH (1 << 2)
/*
* Indicates that this iterator should not be reused until transaction commit,
* either because a pending update references it or because the update depends
* on that particular key being locked (e.g. by the str_hash code, for hash
* table consistency)
*/
-#define BTREE_ITER_KEEP_UNTIL_COMMIT (1 << 5)
+#define BTREE_ITER_KEEP_UNTIL_COMMIT (1 << 3)
/*
* Used in bch2_btree_iter_traverse(), to indicate whether we're searching for
* @pos or the first key strictly greater than @pos
*/
-#define BTREE_ITER_IS_EXTENTS (1 << 6)
-#define BTREE_ITER_ERROR (1 << 7)
-#define BTREE_ITER_SET_POS_AFTER_COMMIT (1 << 8)
-#define BTREE_ITER_CACHED_NOFILL (1 << 9)
-#define BTREE_ITER_CACHED_NOCREATE (1 << 10)
-#define BTREE_ITER_NOT_EXTENTS (1 << 11)
+#define BTREE_ITER_IS_EXTENTS (1 << 4)
+#define BTREE_ITER_NOT_EXTENTS (1 << 5)
+#define BTREE_ITER_ERROR (1 << 6)
+#define BTREE_ITER_CACHED (1 << 7)
+#define BTREE_ITER_CACHED_NOFILL (1 << 8)
+#define BTREE_ITER_CACHED_NOCREATE (1 << 9)
+#define BTREE_ITER_WITH_UPDATES (1 << 10)
+#define __BTREE_ITER_ALL_SNAPSHOTS (1 << 11)
#define BTREE_ITER_ALL_SNAPSHOTS (1 << 12)
+#define BTREE_ITER_FILTER_SNAPSHOTS (1 << 13)
-enum btree_iter_uptodate {
+enum btree_path_uptodate {
BTREE_ITER_UPTODATE = 0,
- BTREE_ITER_NEED_PEEK = 1,
- BTREE_ITER_NEED_RELOCK = 2,
- BTREE_ITER_NEED_TRAVERSE = 3,
+ BTREE_ITER_NEED_RELOCK = 1,
+ BTREE_ITER_NEED_TRAVERSE = 2,
};
#define BTREE_ITER_NO_NODE_GET_LOCKS ((struct btree *) 1)
@@ -232,69 +226,76 @@ enum btree_iter_uptodate {
#define BTREE_ITER_NO_NODE_ERROR ((struct btree *) 7)
#define BTREE_ITER_NO_NODE_CACHED ((struct btree *) 8)
-/*
- * @pos - iterator's current position
- * @level - current btree depth
- * @locks_want - btree level below which we start taking intent locks
- * @nodes_locked - bitmask indicating which nodes in @nodes are locked
- * @nodes_intent_locked - bitmask indicating which locks are intent locks
- */
-struct btree_iter {
- struct btree_trans *trans;
- struct bpos pos;
- /* what we're searching for/what the iterator actually points to: */
- struct bpos real_pos;
- struct bpos pos_after_commit;
- /* When we're filtering by snapshot, the snapshot ID we're looking for: */
- unsigned snapshot;
-
- u16 flags;
+struct btree_path {
u8 idx;
+ u8 sorted_idx;
+ u8 ref;
+ u8 intent_ref;
+
+ /* btree_iter_copy starts here: */
+ struct bpos pos;
enum btree_id btree_id:4;
- enum btree_iter_uptodate uptodate:3;
+ bool cached:1;
+ bool preserve:1;
+ enum btree_path_uptodate uptodate:2;
/*
- * True if we've returned a key (and thus are expected to keep it
- * locked), false after set_pos - for avoiding spurious transaction
- * restarts in bch2_trans_relock():
+ * When true, failing to relock this path will cause the transaction to
+ * restart:
*/
bool should_be_locked:1;
- unsigned level:4,
- min_depth:4,
+ unsigned level:3,
locks_want:4,
nodes_locked:4,
nodes_intent_locked:4;
- struct btree_iter_level {
+ struct btree_path_level {
struct btree *b;
struct btree_node_iter iter;
u32 lock_seq;
} l[BTREE_MAX_DEPTH];
+#ifdef CONFIG_BCACHEFS_DEBUG
+ unsigned long ip_allocated;
+#endif
+};
+
+static inline struct btree_path_level *path_l(struct btree_path *path)
+{
+ return path->l + path->level;
+}
+
+/*
+ * @pos - iterator's current position
+ * @level - current btree depth
+ * @locks_want - btree level below which we start taking intent locks
+ * @nodes_locked - bitmask indicating which nodes in @nodes are locked
+ * @nodes_intent_locked - bitmask indicating which locks are intent locks
+ */
+struct btree_iter {
+ struct btree_trans *trans;
+ struct btree_path *path;
+
+ enum btree_id btree_id:4;
+ unsigned min_depth:4;
+
+ /* btree_iter_copy starts here: */
+ u16 flags;
+
+ /* When we're filtering by snapshot, the snapshot ID we're looking for: */
+ unsigned snapshot;
+ struct bpos pos;
+ struct bpos pos_after_commit;
/*
* Current unpacked key - so that bch2_btree_iter_next()/
* bch2_btree_iter_next_slot() can correctly advance pos.
*/
struct bkey k;
+#ifdef CONFIG_BCACHEFS_DEBUG
unsigned long ip_allocated;
+#endif
};
-static inline enum btree_iter_type
-btree_iter_type(const struct btree_iter *iter)
-{
- return iter->flags & BTREE_ITER_TYPE;
-}
-
-static inline bool btree_iter_is_cached(const struct btree_iter *iter)
-{
- return btree_iter_type(iter) == BTREE_ITER_CACHED;
-}
-
-static inline struct btree_iter_level *iter_l(struct btree_iter *iter)
-{
- return iter->l + iter->level;
-}
-
struct btree_key_cache {
struct mutex lock;
struct rhashtable table;
@@ -335,14 +336,16 @@ struct bkey_cached {
};
struct btree_insert_entry {
- unsigned trigger_flags;
+ unsigned flags;
u8 bkey_type;
enum btree_id btree_id:8;
u8 level;
- unsigned trans_triggers_run:1;
- unsigned is_extent:1;
+ bool cached:1;
+ bool insert_trigger_run:1;
+ bool overwrite_trigger_run:1;
struct bkey_i *k;
- struct btree_iter *iter;
+ struct btree_path *path;
+ unsigned long ip_allocated;
};
#ifndef CONFIG_LOCKDEP
@@ -359,14 +362,14 @@ struct btree_trans_commit_hook {
struct btree_trans_commit_hook *next;
};
-#define BTREE_TRANS_MEM_MAX 4096
+#define BTREE_TRANS_MEM_MAX (1U << 14)
struct btree_trans {
struct bch_fs *c;
#ifdef CONFIG_BCACHEFS_DEBUG
struct list_head list;
struct btree *locking;
- unsigned locking_iter_idx;
+ unsigned locking_path_idx;
struct bpos locking_pos;
u8 locking_btree_id;
u8 locking_level;
@@ -375,23 +378,26 @@ struct btree_trans {
unsigned long ip;
int srcu_idx;
+ u8 nr_sorted;
u8 nr_updates;
- u8 nr_updates2;
- unsigned used_mempool:1;
- unsigned error:1;
- unsigned in_traverse_all:1;
+ bool used_mempool:1;
+ bool in_traverse_all:1;
+ bool restarted:1;
+ /*
+ * For when bch2_trans_update notices we'll be splitting a compressed
+ * extent:
+ */
+ unsigned extra_journal_res;
- u64 iters_linked;
- u64 iters_live;
- u64 iters_touched;
+ u64 paths_allocated;
unsigned mem_top;
unsigned mem_bytes;
void *mem;
- struct btree_iter *iters;
+ u8 sorted[BTREE_ITER_MAX];
+ struct btree_path *paths;
struct btree_insert_entry *updates;
- struct btree_insert_entry *updates2;
/* update path: */
struct btree_trans_commit_hook *hooks;
@@ -428,6 +434,7 @@ enum btree_flags {
BTREE_NODE_write_idx,
BTREE_NODE_accessed,
BTREE_NODE_write_in_flight,
+ BTREE_NODE_write_in_flight_inner,
BTREE_NODE_just_written,
BTREE_NODE_dying,
BTREE_NODE_fake,
@@ -442,6 +449,7 @@ BTREE_FLAG(noevict);
BTREE_FLAG(write_idx);
BTREE_FLAG(accessed);
BTREE_FLAG(write_in_flight);
+BTREE_FLAG(write_in_flight_inner);
BTREE_FLAG(just_written);
BTREE_FLAG(dying);
BTREE_FLAG(fake);
@@ -592,16 +600,6 @@ static inline bool btree_node_is_extents(struct btree *b)
return btree_node_type_is_extents(btree_node_type(b));
}
-static inline enum btree_node_type btree_iter_key_type(struct btree_iter *iter)
-{
- return __btree_node_type(iter->level, iter->btree_id);
-}
-
-static inline bool btree_iter_is_extents(struct btree_iter *iter)
-{
- return btree_node_type_is_extents(btree_iter_key_type(iter));
-}
-
#define BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS \
((1U << BKEY_TYPE_extents)| \
(1U << BKEY_TYPE_inodes)| \
@@ -611,7 +609,9 @@ static inline bool btree_iter_is_extents(struct btree_iter *iter)
#define BTREE_NODE_TYPE_HAS_MEM_TRIGGERS \
((1U << BKEY_TYPE_alloc)| \
- (1U << BKEY_TYPE_stripes))
+ (1U << BKEY_TYPE_inodes)| \
+ (1U << BKEY_TYPE_stripes)| \
+ (1U << BKEY_TYPE_snapshots))
#define BTREE_NODE_TYPE_HAS_TRIGGERS \
(BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS| \
@@ -632,28 +632,39 @@ static inline bool btree_type_has_snapshots(enum btree_id id)
return (1 << id) & BTREE_ID_HAS_SNAPSHOTS;
}
-enum btree_trigger_flags {
+enum btree_update_flags {
+ __BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE,
+
__BTREE_TRIGGER_NORUN, /* Don't run triggers at all */
__BTREE_TRIGGER_INSERT,
__BTREE_TRIGGER_OVERWRITE,
- __BTREE_TRIGGER_OVERWRITE_SPLIT,
__BTREE_TRIGGER_GC,
__BTREE_TRIGGER_BUCKET_INVALIDATE,
__BTREE_TRIGGER_NOATOMIC,
};
+#define BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE (1U << __BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE)
+
#define BTREE_TRIGGER_NORUN (1U << __BTREE_TRIGGER_NORUN)
#define BTREE_TRIGGER_INSERT (1U << __BTREE_TRIGGER_INSERT)
#define BTREE_TRIGGER_OVERWRITE (1U << __BTREE_TRIGGER_OVERWRITE)
-#define BTREE_TRIGGER_OVERWRITE_SPLIT (1U << __BTREE_TRIGGER_OVERWRITE_SPLIT)
#define BTREE_TRIGGER_GC (1U << __BTREE_TRIGGER_GC)
#define BTREE_TRIGGER_BUCKET_INVALIDATE (1U << __BTREE_TRIGGER_BUCKET_INVALIDATE)
#define BTREE_TRIGGER_NOATOMIC (1U << __BTREE_TRIGGER_NOATOMIC)
+#define BTREE_TRIGGER_WANTS_OLD_AND_NEW \
+ ((1U << KEY_TYPE_alloc)| \
+ (1U << KEY_TYPE_alloc_v2)| \
+ (1U << KEY_TYPE_alloc_v3)| \
+ (1U << KEY_TYPE_stripe)| \
+ (1U << KEY_TYPE_inode)| \
+ (1U << KEY_TYPE_inode_v2)| \
+ (1U << KEY_TYPE_snapshot))
+
static inline bool btree_node_type_needs_gc(enum btree_node_type type)
{
return BTREE_NODE_TYPE_HAS_TRIGGERS & (1U << type);
@@ -669,16 +680,10 @@ struct btree_root {
s8 error;
};
-/*
- * Optional hook that will be called just prior to a btree node update, when
- * we're holding the write lock and we know what key is about to be overwritten:
- */
-
enum btree_insert_ret {
BTREE_INSERT_OK,
/* leaf node needs to be split */
BTREE_INSERT_BTREE_NODE_FULL,
- BTREE_INSERT_ENOSPC,
BTREE_INSERT_NEED_MARK_REPLICAS,
BTREE_INSERT_NEED_JOURNAL_RES,
BTREE_INSERT_NEED_JOURNAL_RECLAIM,
@@ -695,8 +700,4 @@ enum btree_node_sibling {
btree_next_sib,
};
-typedef struct btree_nr_keys (*sort_fix_overlapping_fn)(struct bset *,
- struct btree *,
- struct btree_node_iter *);
-
#endif /* _BCACHEFS_BTREE_TYPES_H */
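The new sorted[]/nr_sorted fields in struct btree_trans give each transaction a view of its paths in btree order, which trans_for_each_path_inorder() (declared in btree_iter.h above) walks. A small debugging-style sketch, assuming a live transaction; the printk format is illustrative only:

	struct btree_path *path;
	unsigned i;

	trans_for_each_path_inorder(trans, path, i)
		printk(KERN_DEBUG "path %u btree %u %s pos %llu:%llu locks_want %u\n",
		       path->idx, path->btree_id,
		       path->cached ? "cached" : "btree",
		       path->pos.inode, path->pos.offset,
		       path->locks_want);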
diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h
index 56131ac516ce..0268dd74f0ab 100644
--- a/fs/bcachefs/btree_update.h
+++ b/fs/bcachefs/btree_update.h
@@ -8,14 +8,14 @@
struct bch_fs;
struct btree;
-void bch2_btree_node_lock_for_insert(struct bch_fs *, struct btree *,
- struct btree_iter *);
-bool bch2_btree_bset_insert_key(struct btree_iter *, struct btree *,
- struct btree_node_iter *, struct bkey_i *);
+void bch2_btree_node_lock_for_insert(struct btree_trans *, struct btree_path *,
+ struct btree *);
+bool bch2_btree_bset_insert_key(struct btree_trans *, struct btree_path *,
+ struct btree *, struct btree_node_iter *,
+ struct bkey_i *);
void bch2_btree_add_journal_pin(struct bch_fs *, struct btree *, u64);
enum btree_insert_flags {
- __BTREE_INSERT_NOUNLOCK,
__BTREE_INSERT_NOFAIL,
__BTREE_INSERT_NOCHECK_RW,
__BTREE_INSERT_LAZY_RW,
@@ -29,11 +29,6 @@ enum btree_insert_flags {
__BCH_HASH_SET_MUST_REPLACE,
};
-/*
- * Don't drop locks _after_ successfully updating btree:
- */
-#define BTREE_INSERT_NOUNLOCK (1 << __BTREE_INSERT_NOUNLOCK)
-
/* Don't check for -ENOSPC: */
#define BTREE_INSERT_NOFAIL (1 << __BTREE_INSERT_NOFAIL)
@@ -66,18 +61,20 @@ int bch2_btree_insert(struct bch_fs *, enum btree_id, struct bkey_i *,
struct disk_reservation *, u64 *, int flags);
int bch2_btree_delete_range_trans(struct btree_trans *, enum btree_id,
- struct bpos, struct bpos, u64 *);
+ struct bpos, struct bpos, unsigned, u64 *);
int bch2_btree_delete_range(struct bch_fs *, enum btree_id,
struct bpos, struct bpos, u64 *);
-int bch2_btree_node_rewrite(struct bch_fs *c, struct btree_iter *,
- __le64, unsigned);
+int bch2_btree_node_rewrite(struct btree_trans *, struct btree_iter *,
+ struct btree *, unsigned);
void bch2_btree_node_rewrite_async(struct bch_fs *, struct btree *);
-int bch2_btree_node_update_key(struct bch_fs *, struct btree_iter *,
- struct btree *, struct bkey_i *);
+int bch2_btree_node_update_key(struct btree_trans *, struct btree_iter *,
+ struct btree *, struct bkey_i *, bool);
+int bch2_btree_node_update_key_get_iter(struct btree_trans *,
+ struct btree *, struct bkey_i *, bool);
int bch2_trans_update(struct btree_trans *, struct btree_iter *,
- struct bkey_i *, enum btree_trigger_flags);
+ struct bkey_i *, enum btree_update_flags);
void bch2_trans_commit_hook(struct btree_trans *,
struct btree_trans_commit_hook *);
int __bch2_trans_commit(struct btree_trans *);
@@ -108,12 +105,10 @@ static inline int bch2_trans_commit(struct btree_trans *trans,
({ \
int _ret; \
\
- while (1) { \
+ do { \
+ bch2_trans_begin(_trans); \
_ret = (_do); \
- if (_ret != -EINTR) \
- break; \
- bch2_trans_reset(_trans, 0); \
- } \
+ } while (_ret == -EINTR); \
\
_ret; \
})
@@ -125,14 +120,14 @@ static inline int bch2_trans_commit(struct btree_trans *trans,
#define bch2_trans_do(_c, _disk_res, _journal_seq, _flags, _do) \
({ \
struct btree_trans trans; \
- int _ret, _ret2; \
+ int _ret; \
\
bch2_trans_init(&trans, (_c), 0, 0); \
_ret = __bch2_trans_do(&trans, _disk_res, _journal_seq, _flags, \
_do); \
- _ret2 = bch2_trans_exit(&trans); \
+ bch2_trans_exit(&trans); \
\
- _ret ?: _ret2; \
+ _ret; \
})
#define trans_for_each_update(_trans, _i) \
@@ -140,9 +135,21 @@ static inline int bch2_trans_commit(struct btree_trans *trans,
(_i) < (_trans)->updates + (_trans)->nr_updates; \
(_i)++)
-#define trans_for_each_update2(_trans, _i) \
- for ((_i) = (_trans)->updates2; \
- (_i) < (_trans)->updates2 + (_trans)->nr_updates2; \
- (_i)++)
+static inline struct bkey_i *btree_trans_peek_updates(struct btree_trans *trans,
+ enum btree_id btree_id,
+ struct bpos pos)
+{
+ struct btree_insert_entry *i;
+
+ trans_for_each_update(trans, i)
+ if ((cmp_int(btree_id, i->btree_id) ?:
+ bpos_cmp(pos, i->k->k.p)) <= 0) {
+ if (btree_id == i->btree_id)
+ return i->k;
+ break;
+ }
+
+ return NULL;
+}
#endif /* _BCACHEFS_BTREE_UPDATE_H */
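btree_trans_peek_updates() above scans the (btree id, pos)-sorted update list for the first pending update at or after pos in the given btree; this is what the new BTREE_ITER_WITH_UPDATES iterator flag builds on. A usage sketch, where the btree id and the surrounding logic are placeholders rather than a caller taken from this patch:

	struct bkey_i *u = btree_trans_peek_updates(trans, BTREE_ID_inodes, pos);

	if (u && !bpos_cmp(u->k.p, pos)) {
		/* an uncommitted update exists in this transaction exactly at
		 * pos; an iterator opened with BTREE_ITER_WITH_UPDATES would
		 * return this key in preference to the on-disk one */
	}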
diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c
index 2d8093d1bf00..dfff972551ee 100644
--- a/fs/bcachefs/btree_update_interior.c
+++ b/fs/bcachefs/btree_update_interior.c
@@ -22,6 +22,11 @@
#include <linux/random.h>
#include <trace/events/bcachefs.h>
+static void bch2_btree_insert_node(struct btree_update *, struct btree_trans *,
+ struct btree_path *, struct btree *,
+ struct keylist *, unsigned);
+static void bch2_btree_update_add_new_node(struct btree_update *, struct btree *);
+
/* Debug code: */
/*
@@ -148,38 +153,26 @@ static void __btree_node_free(struct bch_fs *c, struct btree *b)
clear_btree_node_noevict(b);
- bch2_btree_node_hash_remove(&c->btree_cache, b);
-
mutex_lock(&c->btree_cache.lock);
list_move(&b->list, &c->btree_cache.freeable);
mutex_unlock(&c->btree_cache.lock);
}
-void bch2_btree_node_free_never_inserted(struct bch_fs *c, struct btree *b)
+static void bch2_btree_node_free_inmem(struct btree_trans *trans,
+ struct btree *b)
{
- struct open_buckets ob = b->ob;
+ struct bch_fs *c = trans->c;
+ struct btree_path *path;
- b->ob.nr = 0;
+ trans_for_each_path(trans, path)
+ BUG_ON(path->l[b->c.level].b == b &&
+ path->l[b->c.level].lock_seq == b->c.lock.state.seq);
- clear_btree_node_dirty(c, b);
+ six_lock_write(&b->c.lock, NULL, NULL);
- btree_node_lock_type(c, b, SIX_LOCK_write);
+ bch2_btree_node_hash_remove(&c->btree_cache, b);
__btree_node_free(c, b);
- six_unlock_write(&b->c.lock);
- bch2_open_buckets_put(c, &ob);
-}
-
-void bch2_btree_node_free_inmem(struct bch_fs *c, struct btree *b,
- struct btree_iter *iter)
-{
- struct btree_iter *linked;
-
- trans_for_each_iter(iter->trans, linked)
- BUG_ON(linked->l[b->c.level].b == b);
-
- six_lock_write(&b->c.lock, NULL, NULL);
- __btree_node_free(c, b);
six_unlock_write(&b->c.lock);
six_unlock_intent(&b->c.lock);
}
@@ -242,11 +235,7 @@ retry:
goto retry;
}
- if (c->sb.features & (1ULL << BCH_FEATURE_btree_ptr_v2))
- bkey_btree_ptr_v2_init(&tmp.k);
- else
- bkey_btree_ptr_init(&tmp.k);
-
+ bkey_btree_ptr_v2_init(&tmp.k);
bch2_alloc_sectors_append_ptrs(c, wp, &tmp.k, c->opts.btree_node_size);
bch2_open_bucket_get(c, wp, &ob);
@@ -367,7 +356,7 @@ static struct btree *__btree_root_alloc(struct btree_update *as, unsigned level)
struct btree *b = bch2_btree_node_alloc(as, level);
btree_set_min(b, POS_MIN);
- btree_set_max(b, POS_MAX);
+ btree_set_max(b, SPOS_MAX);
b->data->format = bch2_btree_calc_format(b);
btree_node_set_format(b, b->data->format);
@@ -511,7 +500,7 @@ static int btree_update_nodes_written_trans(struct btree_trans *trans,
ret = bch2_trans_mark_key(trans,
bkey_s_c_null,
bkey_i_to_s_c(k),
- 0, 0, BTREE_TRIGGER_INSERT);
+ BTREE_TRIGGER_INSERT);
if (ret)
return ret;
}
@@ -520,7 +509,7 @@ static int btree_update_nodes_written_trans(struct btree_trans *trans,
ret = bch2_trans_mark_key(trans,
bkey_i_to_s_c(k),
bkey_s_c_null,
- 0, 0, BTREE_TRIGGER_OVERWRITE);
+ BTREE_TRIGGER_OVERWRITE);
if (ret)
return ret;
}
@@ -563,7 +552,8 @@ static void btree_update_nodes_written(struct btree_update *as)
six_unlock_read(&old->c.lock);
if (seq == as->old_nodes_seq[i])
- btree_node_wait_on_io(old);
+ wait_on_bit_io(&old->flags, BTREE_NODE_write_in_flight_inner,
+ TASK_UNINTERRUPTIBLE);
}
/*
@@ -772,7 +762,7 @@ static void btree_update_updated_root(struct btree_update *as, struct btree *b)
* And it adds @b to the list of @as's new nodes, so that we can update sector
* counts in bch2_btree_update_nodes_written:
*/
-void bch2_btree_update_add_new_node(struct btree_update *as, struct btree *b)
+static void bch2_btree_update_add_new_node(struct btree_update *as, struct btree *b)
{
struct bch_fs *c = as->c;
@@ -826,7 +816,7 @@ found:
closure_put(&as->cl);
}
-void bch2_btree_update_get_open_buckets(struct btree_update *as, struct btree *b)
+static void bch2_btree_update_get_open_buckets(struct btree_update *as, struct btree *b)
{
while (b->ob.nr)
as->open_buckets[as->nr_open_buckets++] =
@@ -838,7 +828,7 @@ void bch2_btree_update_get_open_buckets(struct btree_update *as, struct btree *b
* nodes and thus outstanding btree_updates - redirect @b's
* btree_updates to point to this btree_update:
*/
-void bch2_btree_interior_update_will_free_node(struct btree_update *as,
+static void bch2_btree_interior_update_will_free_node(struct btree_update *as,
struct btree *b)
{
struct bch_fs *c = as->c;
@@ -910,7 +900,7 @@ void bch2_btree_interior_update_will_free_node(struct btree_update *as,
as->nr_old_nodes++;
}
-void bch2_btree_update_done(struct btree_update *as)
+static void bch2_btree_update_done(struct btree_update *as)
{
BUG_ON(as->mode == BTREE_INTERIOR_NO_UPDATE);
@@ -924,11 +914,10 @@ void bch2_btree_update_done(struct btree_update *as)
as->c->btree_interior_update_worker);
}
-struct btree_update *
-bch2_btree_update_start(struct btree_iter *iter, unsigned level,
- unsigned nr_nodes, unsigned flags)
+static struct btree_update *
+bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
+ unsigned level, unsigned nr_nodes, unsigned flags)
{
- struct btree_trans *trans = iter->trans;
struct bch_fs *c = trans->c;
struct btree_update *as;
struct closure cl;
@@ -937,36 +926,28 @@ bch2_btree_update_start(struct btree_iter *iter, unsigned level,
int journal_flags = 0;
int ret = 0;
+ BUG_ON(!path->should_be_locked);
+
if (flags & BTREE_INSERT_JOURNAL_RESERVED)
journal_flags |= JOURNAL_RES_GET_RESERVED;
closure_init_stack(&cl);
retry:
- /*
- * This check isn't necessary for correctness - it's just to potentially
- * prevent us from doing a lot of work that'll end up being wasted:
- */
- ret = bch2_journal_error(&c->journal);
- if (ret)
- return ERR_PTR(ret);
/*
* XXX: figure out how far we might need to split,
* instead of locking/reserving all the way to the root:
*/
- if (!bch2_btree_iter_upgrade(iter, U8_MAX)) {
+ if (!bch2_btree_path_upgrade(trans, path, U8_MAX)) {
trace_trans_restart_iter_upgrade(trans->ip, _RET_IP_,
- iter->btree_id,
- &iter->real_pos);
- return ERR_PTR(-EINTR);
+ path->btree_id, &path->pos);
+ ret = btree_trans_restart(trans);
+ return ERR_PTR(ret);
}
if (flags & BTREE_INSERT_GC_LOCK_HELD)
lockdep_assert_held(&c->gc_lock);
else if (!down_read_trylock(&c->gc_lock)) {
- if (flags & BTREE_INSERT_NOUNLOCK)
- return ERR_PTR(-EINTR);
-
bch2_trans_unlock(trans);
down_read(&c->gc_lock);
if (!bch2_trans_relock(trans)) {
@@ -981,7 +962,7 @@ retry:
as->c = c;
as->mode = BTREE_INTERIOR_NO_UPDATE;
as->took_gc_lock = !(flags & BTREE_INSERT_GC_LOCK_HELD);
- as->btree_id = iter->btree_id;
+ as->btree_id = path->btree_id;
INIT_LIST_HEAD(&as->list);
INIT_LIST_HEAD(&as->unwritten_list);
INIT_LIST_HEAD(&as->write_blocked_list);
@@ -989,24 +970,31 @@ retry:
bch2_keylist_init(&as->new_keys, as->_new_keys);
bch2_keylist_init(&as->parent_keys, as->inline_keys);
+ mutex_lock(&c->btree_interior_update_lock);
+ list_add_tail(&as->list, &c->btree_interior_update_list);
+ mutex_unlock(&c->btree_interior_update_lock);
+
+ /*
+ * We don't want to allocate if we're in an error state, that can cause
+ * deadlock on emergency shutdown due to open buckets getting stuck in
+ * the btree_reserve_cache after allocator shutdown has cleared it out.
+ * This check needs to come after adding us to the btree_interior_update
+ * list but before calling bch2_btree_reserve_get, to synchronize with
+ * __bch2_fs_read_only().
+ */
+ ret = bch2_journal_error(&c->journal);
+ if (ret)
+ goto err;
+
ret = bch2_journal_preres_get(&c->journal, &as->journal_preres,
BTREE_UPDATE_JOURNAL_RES,
journal_flags|JOURNAL_RES_GET_NONBLOCK);
if (ret == -EAGAIN) {
- /*
- * this would be cleaner if bch2_journal_preres_get() took a
- * closure argument
- */
- if (flags & BTREE_INSERT_NOUNLOCK) {
- trace_trans_restart_journal_preres_get(trans->ip, _RET_IP_);
- ret = -EINTR;
- goto err;
- }
-
bch2_trans_unlock(trans);
if (flags & BTREE_INSERT_JOURNAL_RECLAIM) {
bch2_btree_update_free(as);
+ btree_trans_restart(trans);
return ERR_PTR(ret);
}
@@ -1031,8 +1019,7 @@ retry:
if (ret)
goto err;
- ret = bch2_btree_reserve_get(as, nr_nodes, flags,
- !(flags & BTREE_INSERT_NOUNLOCK) ? &cl : NULL);
+ ret = bch2_btree_reserve_get(as, nr_nodes, flags, &cl);
if (ret)
goto err;
@@ -1040,17 +1027,11 @@ retry:
atomic64_read(&c->journal.seq),
&as->journal, NULL);
- mutex_lock(&c->btree_interior_update_lock);
- list_add_tail(&as->list, &c->btree_interior_update_list);
- mutex_unlock(&c->btree_interior_update_lock);
-
return as;
err:
bch2_btree_update_free(as);
if (ret == -EAGAIN) {
- BUG_ON(flags & BTREE_INSERT_NOUNLOCK);
-
bch2_trans_unlock(trans);
closure_sync(&cl);
ret = -EINTR;
@@ -1099,8 +1080,10 @@ static void bch2_btree_set_root_inmem(struct bch_fs *c, struct btree *b)
* is nothing new to be done. This just guarantees that there is a
* journal write.
*/
-static void bch2_btree_set_root(struct btree_update *as, struct btree *b,
- struct btree_iter *iter)
+static void bch2_btree_set_root(struct btree_update *as,
+ struct btree_trans *trans,
+ struct btree_path *path,
+ struct btree *b)
{
struct bch_fs *c = as->c;
struct btree *old;
@@ -1115,7 +1098,7 @@ static void bch2_btree_set_root(struct btree_update *as, struct btree *b,
* Ensure no one is using the old root while we switch to the
* new root:
*/
- bch2_btree_node_lock_write(old, iter);
+ bch2_btree_node_lock_write(trans, path, old);
bch2_btree_set_root_inmem(c, b);
@@ -1128,20 +1111,25 @@ static void bch2_btree_set_root(struct btree_update *as, struct btree *b,
* an intent lock on the new root, and any updates that would
* depend on the new root would have to update the new root.
*/
- bch2_btree_node_unlock_write(old, iter);
+ bch2_btree_node_unlock_write(trans, path, old);
}
/* Interior node updates: */
-static void bch2_insert_fixup_btree_ptr(struct btree_update *as, struct btree *b,
- struct btree_iter *iter,
- struct bkey_i *insert,
- struct btree_node_iter *node_iter)
+static void bch2_insert_fixup_btree_ptr(struct btree_update *as,
+ struct btree_trans *trans,
+ struct btree_path *path,
+ struct btree *b,
+ struct btree_node_iter *node_iter,
+ struct bkey_i *insert)
{
struct bch_fs *c = as->c;
struct bkey_packed *k;
const char *invalid;
+ BUG_ON(insert->k.type == KEY_TYPE_btree_ptr_v2 &&
+ !btree_ptr_sectors_written(insert));
+
invalid = bch2_bkey_invalid(c, bkey_i_to_s_c(insert), btree_node_type(b)) ?:
bch2_bkey_in_btree_node(b, bkey_i_to_s_c(insert));
if (invalid) {
@@ -1165,15 +1153,18 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as, struct btree *b
bkey_iter_pos_cmp(b, k, &insert->k.p) < 0)
bch2_btree_node_iter_advance(node_iter, b);
- bch2_btree_bset_insert_key(iter, b, node_iter, insert);
+ bch2_btree_bset_insert_key(trans, path, b, node_iter, insert);
set_btree_node_dirty(c, b);
set_btree_node_need_write(b);
}
static void
-__bch2_btree_insert_keys_interior(struct btree_update *as, struct btree *b,
- struct btree_iter *iter, struct keylist *keys,
- struct btree_node_iter node_iter)
+__bch2_btree_insert_keys_interior(struct btree_update *as,
+ struct btree_trans *trans,
+ struct btree_path *path,
+ struct btree *b,
+ struct btree_node_iter node_iter,
+ struct keylist *keys)
{
struct bkey_i *insert = bch2_keylist_front(keys);
struct bkey_packed *k;
@@ -1185,8 +1176,8 @@ __bch2_btree_insert_keys_interior(struct btree_update *as, struct btree *b,
;
while (!bch2_keylist_empty(keys)) {
- bch2_insert_fixup_btree_ptr(as, b, iter,
- bch2_keylist_front(keys), &node_iter);
+ bch2_insert_fixup_btree_ptr(as, trans, path, b,
+ &node_iter, bch2_keylist_front(keys));
bch2_keylist_pop_front(keys);
}
}
@@ -1196,8 +1187,7 @@ __bch2_btree_insert_keys_interior(struct btree_update *as, struct btree *b,
* node)
*/
static struct btree *__btree_split_node(struct btree_update *as,
- struct btree *n1,
- struct btree_iter *iter)
+ struct btree *n1)
{
struct bkey_format_state s;
size_t nr_packed = 0, nr_unpacked = 0;
@@ -1312,8 +1302,10 @@ static struct btree *__btree_split_node(struct btree_update *as,
* nodes that were coalesced, and thus in the middle of a child node post
* coalescing:
*/
-static void btree_split_insert_keys(struct btree_update *as, struct btree *b,
- struct btree_iter *iter,
+static void btree_split_insert_keys(struct btree_update *as,
+ struct btree_trans *trans,
+ struct btree_path *path,
+ struct btree *b,
struct keylist *keys)
{
struct btree_node_iter node_iter;
@@ -1323,7 +1315,7 @@ static void btree_split_insert_keys(struct btree_update *as, struct btree *b,
bch2_btree_node_iter_init(&node_iter, b, &k->k.p);
- __bch2_btree_insert_keys_interior(as, b, iter, keys, node_iter);
+ __bch2_btree_insert_keys_interior(as, trans, path, b, node_iter, keys);
/*
* We can't tolerate whiteouts here - with whiteouts there can be
@@ -1353,17 +1345,17 @@ static void btree_split_insert_keys(struct btree_update *as, struct btree *b,
btree_node_interior_verify(as->c, b);
}
-static void btree_split(struct btree_update *as, struct btree *b,
- struct btree_iter *iter, struct keylist *keys,
- unsigned flags)
+static void btree_split(struct btree_update *as, struct btree_trans *trans,
+ struct btree_path *path, struct btree *b,
+ struct keylist *keys, unsigned flags)
{
struct bch_fs *c = as->c;
- struct btree *parent = btree_node_parent(iter, b);
+ struct btree *parent = btree_node_parent(path, b);
struct btree *n1, *n2 = NULL, *n3 = NULL;
u64 start_time = local_clock();
BUG_ON(!parent && (b != btree_node_root(c, b)));
- BUG_ON(!btree_node_intent_locked(iter, btree_node_root(c, b)->c.level));
+ BUG_ON(!btree_node_intent_locked(path, btree_node_root(c, b)->c.level));
bch2_btree_interior_update_will_free_node(as, b);
@@ -1371,18 +1363,19 @@ static void btree_split(struct btree_update *as, struct btree *b,
bch2_btree_update_add_new_node(as, n1);
if (keys)
- btree_split_insert_keys(as, n1, iter, keys);
+ btree_split_insert_keys(as, trans, path, n1, keys);
if (bset_u64s(&n1->set[0]) > BTREE_SPLIT_THRESHOLD(c)) {
trace_btree_split(c, b);
- n2 = __btree_split_node(as, n1, iter);
+ n2 = __btree_split_node(as, n1);
bch2_btree_build_aux_trees(n2);
bch2_btree_build_aux_trees(n1);
six_unlock_write(&n2->c.lock);
six_unlock_write(&n1->c.lock);
+ bch2_btree_node_write(c, n1, SIX_LOCK_intent);
bch2_btree_node_write(c, n2, SIX_LOCK_intent);
/*
@@ -1400,7 +1393,7 @@ static void btree_split(struct btree_update *as, struct btree *b,
n3->sib_u64s[0] = U16_MAX;
n3->sib_u64s[1] = U16_MAX;
- btree_split_insert_keys(as, n3, iter, &as->parent_keys);
+ btree_split_insert_keys(as, trans, path, n3, &as->parent_keys);
bch2_btree_node_write(c, n3, SIX_LOCK_intent);
}
@@ -1410,22 +1403,22 @@ static void btree_split(struct btree_update *as, struct btree *b,
bch2_btree_build_aux_trees(n1);
six_unlock_write(&n1->c.lock);
+ bch2_btree_node_write(c, n1, SIX_LOCK_intent);
+
if (parent)
bch2_keylist_add(&as->parent_keys, &n1->key);
}
- bch2_btree_node_write(c, n1, SIX_LOCK_intent);
-
/* New nodes all written, now make them visible: */
if (parent) {
/* Split a non root node */
- bch2_btree_insert_node(as, parent, iter, &as->parent_keys, flags);
+ bch2_btree_insert_node(as, trans, path, parent, &as->parent_keys, flags);
} else if (n3) {
- bch2_btree_set_root(as, n3, iter);
+ bch2_btree_set_root(as, trans, path, n3);
} else {
/* Root filled up but didn't need to be split */
- bch2_btree_set_root(as, n1, iter);
+ bch2_btree_set_root(as, trans, path, n1);
}
bch2_btree_update_get_open_buckets(as, n1);
@@ -1434,15 +1427,14 @@ static void btree_split(struct btree_update *as, struct btree *b,
if (n3)
bch2_btree_update_get_open_buckets(as, n3);
- /* Successful split, update the iterator to point to the new nodes: */
+ /* Successful split, update the path to point to the new nodes: */
six_lock_increment(&b->c.lock, SIX_LOCK_intent);
- bch2_btree_iter_node_drop(iter, b);
if (n3)
- bch2_btree_iter_node_replace(iter, n3);
+ bch2_trans_node_add(trans, n3);
if (n2)
- bch2_btree_iter_node_replace(iter, n2);
- bch2_btree_iter_node_replace(iter, n1);
+ bch2_trans_node_add(trans, n2);
+ bch2_trans_node_add(trans, n1);
/*
* The old node must be freed (in memory) _before_ unlocking the new
@@ -1450,7 +1442,7 @@ static void btree_split(struct btree_update *as, struct btree *b,
* node after another thread has locked and updated the new node, thus
* seeing stale data:
*/
- bch2_btree_node_free_inmem(c, b, iter);
+ bch2_btree_node_free_inmem(trans, b);
if (n3)
six_unlock_intent(&n3->c.lock);
@@ -1458,26 +1450,30 @@ static void btree_split(struct btree_update *as, struct btree *b,
six_unlock_intent(&n2->c.lock);
six_unlock_intent(&n1->c.lock);
- bch2_btree_trans_verify_locks(iter->trans);
+ bch2_trans_verify_locks(trans);
bch2_time_stats_update(&c->times[BCH_TIME_btree_node_split],
start_time);
}
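
btree_split() above allocates n1, carves off n2 when n1 is still over BTREE_SPLIT_THRESHOLD (plus a new root n3 when the root itself was split), writes the new nodes, and only then makes them visible via the parent or a new root. The split point itself is chosen in __btree_split_node(), whose body is not in this hunk; roughly, it walks the keys accumulating size until it passes the halfway mark. A minimal, self-contained userspace sketch of that idea, using hypothetical struct kv / struct node types rather than the bcachefs structures:

	/* Hypothetical illustration, not bcachefs code: split a sorted node in half by size. */
	#include <stdio.h>
	#include <string.h>

	struct kv { int key; unsigned size; };          /* stand-in for a packed key */
	struct node { struct kv kv[8]; unsigned nr; };  /* stand-in for a btree node */

	/* Move the second half (by accumulated size, not key count) of src into dst. */
	static void split_node(struct node *src, struct node *dst)
	{
		unsigned total = 0, acc = 0, i, split = src->nr;

		for (i = 0; i < src->nr; i++)
			total += src->kv[i].size;

		for (i = 0; i < src->nr; i++) {
			acc += src->kv[i].size;
			if (acc * 2 >= total) {		/* first key at or past the size midpoint */
				split = i + 1;
				break;
			}
		}

		dst->nr = src->nr - split;
		memcpy(dst->kv, src->kv + split, dst->nr * sizeof(struct kv));
		src->nr = split;
	}

	int main(void)
	{
		struct node a = { .kv = { {1,10}, {2,40}, {3,10}, {4,10}, {5,30} }, .nr = 5 };
		struct node b = { .nr = 0 };

		split_node(&a, &b);
		printf("left %u keys, right %u keys\n", a.nr, b.nr);
		return 0;
	}
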
static void
-bch2_btree_insert_keys_interior(struct btree_update *as, struct btree *b,
- struct btree_iter *iter, struct keylist *keys)
+bch2_btree_insert_keys_interior(struct btree_update *as,
+ struct btree_trans *trans,
+ struct btree_path *path,
+ struct btree *b,
+ struct keylist *keys)
{
- struct btree_iter *linked;
+ struct btree_path *linked;
- __bch2_btree_insert_keys_interior(as, b, iter, keys, iter->l[b->c.level].iter);
+ __bch2_btree_insert_keys_interior(as, trans, path, b,
+ path->l[b->c.level].iter, keys);
btree_update_updated_node(as, b);
- trans_for_each_iter_with_node(iter->trans, b, linked)
+ trans_for_each_path_with_node(trans, b, linked)
bch2_btree_node_iter_peek(&linked->l[b->c.level].iter, b);
- bch2_btree_trans_verify_iters(iter->trans, b);
+ bch2_trans_verify_paths(trans);
}
/**
@@ -1492,9 +1488,9 @@ bch2_btree_insert_keys_interior(struct btree_update *as, struct btree *b,
* If a split occurred, this function will return early. This can only happen
* for leaf nodes -- inserts into interior nodes have to be atomic.
*/
-void bch2_btree_insert_node(struct btree_update *as, struct btree *b,
- struct btree_iter *iter, struct keylist *keys,
- unsigned flags)
+static void bch2_btree_insert_node(struct btree_update *as, struct btree_trans *trans,
+ struct btree_path *path, struct btree *b,
+ struct keylist *keys, unsigned flags)
{
struct bch_fs *c = as->c;
int old_u64s = le16_to_cpu(btree_bset_last(b)->u64s);
@@ -1502,21 +1498,21 @@ void bch2_btree_insert_node(struct btree_update *as, struct btree *b,
int live_u64s_added, u64s_added;
lockdep_assert_held(&c->gc_lock);
- BUG_ON(!btree_node_intent_locked(iter, btree_node_root(c, b)->c.level));
+ BUG_ON(!btree_node_intent_locked(path, btree_node_root(c, b)->c.level));
BUG_ON(!b->c.level);
BUG_ON(!as || as->b);
bch2_verify_keylist_sorted(keys);
- bch2_btree_node_lock_for_insert(c, b, iter);
+ bch2_btree_node_lock_for_insert(trans, path, b);
if (!bch2_btree_node_insert_fits(c, b, bch2_keylist_u64s(keys))) {
- bch2_btree_node_unlock_write(b, iter);
+ bch2_btree_node_unlock_write(trans, path, b);
goto split;
}
btree_node_interior_verify(c, b);
- bch2_btree_insert_keys_interior(as, b, iter, keys);
+ bch2_btree_insert_keys_interior(as, trans, path, b, keys);
live_u64s_added = (int) b->nr.live_u64s - old_live_u64s;
u64s_added = (int) le16_to_cpu(btree_bset_last(b)->u64s) - old_u64s;
@@ -1528,46 +1524,48 @@ void bch2_btree_insert_node(struct btree_update *as, struct btree *b,
if (u64s_added > live_u64s_added &&
bch2_maybe_compact_whiteouts(c, b))
- bch2_btree_iter_reinit_node(iter, b);
+ bch2_trans_node_reinit_iter(trans, b);
- bch2_btree_node_unlock_write(b, iter);
+ bch2_btree_node_unlock_write(trans, path, b);
btree_node_interior_verify(c, b);
return;
split:
- btree_split(as, b, iter, keys, flags);
+ btree_split(as, trans, path, b, keys, flags);
}
-int bch2_btree_split_leaf(struct bch_fs *c, struct btree_iter *iter,
+int bch2_btree_split_leaf(struct btree_trans *trans,
+ struct btree_path *path,
unsigned flags)
{
- struct btree *b = iter_l(iter)->b;
+ struct bch_fs *c = trans->c;
+ struct btree *b = path_l(path)->b;
struct btree_update *as;
unsigned l;
int ret = 0;
- as = bch2_btree_update_start(iter, iter->level,
+ as = bch2_btree_update_start(trans, path, path->level,
btree_update_reserve_required(c, b), flags);
if (IS_ERR(as))
return PTR_ERR(as);
- btree_split(as, b, iter, NULL, flags);
+ btree_split(as, trans, path, b, NULL, flags);
bch2_btree_update_done(as);
- for (l = iter->level + 1; btree_iter_node(iter, l) && !ret; l++)
- ret = bch2_foreground_maybe_merge(c, iter, l, flags);
+ for (l = path->level + 1; btree_path_node(path, l) && !ret; l++)
+ ret = bch2_foreground_maybe_merge(trans, path, l, flags);
return ret;
}
-int __bch2_foreground_maybe_merge(struct bch_fs *c,
- struct btree_iter *iter,
+int __bch2_foreground_maybe_merge(struct btree_trans *trans,
+ struct btree_path *path,
unsigned level,
unsigned flags,
enum btree_node_sibling sib)
{
- struct btree_trans *trans = iter->trans;
- struct btree_iter *sib_iter = NULL;
+ struct bch_fs *c = trans->c;
+ struct btree_path *sib_path = NULL;
struct btree_update *as;
struct bkey_format_state new_s;
struct bkey_format new_f;
@@ -1575,39 +1573,35 @@ int __bch2_foreground_maybe_merge(struct bch_fs *c,
struct btree *b, *m, *n, *prev, *next, *parent;
struct bpos sib_pos;
size_t sib_u64s;
- int ret = 0, ret2 = 0;
-
- BUG_ON(!btree_node_locked(iter, level));
-retry:
- ret = bch2_btree_iter_traverse(iter);
- if (ret)
- goto err;
+ int ret = 0;
- BUG_ON(!btree_node_locked(iter, level));
+ BUG_ON(!path->should_be_locked);
+ BUG_ON(!btree_node_locked(path, level));
- b = iter->l[level].b;
+ b = path->l[level].b;
if ((sib == btree_prev_sib && !bpos_cmp(b->data->min_key, POS_MIN)) ||
- (sib == btree_next_sib && !bpos_cmp(b->data->max_key, POS_MAX))) {
+ (sib == btree_next_sib && !bpos_cmp(b->data->max_key, SPOS_MAX))) {
b->sib_u64s[sib] = U16_MAX;
- goto out;
+ return 0;
}
sib_pos = sib == btree_prev_sib
? bpos_predecessor(b->data->min_key)
: bpos_successor(b->data->max_key);
- sib_iter = bch2_trans_get_node_iter(trans, iter->btree_id,
- sib_pos, U8_MAX, level,
- BTREE_ITER_INTENT);
- ret = bch2_btree_iter_traverse(sib_iter);
+ sib_path = bch2_path_get(trans, false, path->btree_id, sib_pos,
+ U8_MAX, level, true, _THIS_IP_);
+ ret = bch2_btree_path_traverse(trans, sib_path, false);
if (ret)
goto err;
- m = sib_iter->l[level].b;
+ sib_path->should_be_locked = true;
- if (btree_node_parent(iter, b) !=
- btree_node_parent(sib_iter, m)) {
+ m = sib_path->l[level].b;
+
+ if (btree_node_parent(path, b) !=
+ btree_node_parent(sib_path, m)) {
b->sib_u64s[sib] = U16_MAX;
goto out;
}
@@ -1658,8 +1652,8 @@ retry:
if (b->sib_u64s[sib] > c->btree_foreground_merge_threshold)
goto out;
- parent = btree_node_parent(iter, b);
- as = bch2_btree_update_start(iter, level,
+ parent = btree_node_parent(path, b);
+ as = bch2_btree_update_start(trans, path, level,
btree_update_reserve_required(c, parent) + 1,
flags|
BTREE_INSERT_NOFAIL|
@@ -1688,92 +1682,63 @@ retry:
bch2_btree_build_aux_trees(n);
six_unlock_write(&n->c.lock);
+ bch2_btree_node_write(c, n, SIX_LOCK_intent);
+
bkey_init(&delete.k);
delete.k.p = prev->key.k.p;
bch2_keylist_add(&as->parent_keys, &delete);
bch2_keylist_add(&as->parent_keys, &n->key);
- bch2_btree_node_write(c, n, SIX_LOCK_intent);
+ bch2_trans_verify_paths(trans);
+
+ bch2_btree_insert_node(as, trans, path, parent, &as->parent_keys, flags);
- bch2_btree_insert_node(as, parent, iter, &as->parent_keys, flags);
+ bch2_trans_verify_paths(trans);
bch2_btree_update_get_open_buckets(as, n);
six_lock_increment(&b->c.lock, SIX_LOCK_intent);
six_lock_increment(&m->c.lock, SIX_LOCK_intent);
- bch2_btree_iter_node_drop(iter, b);
- bch2_btree_iter_node_drop(iter, m);
- bch2_btree_iter_node_replace(iter, n);
+ bch2_trans_node_add(trans, n);
- bch2_btree_trans_verify_iters(trans, n);
+ bch2_trans_verify_paths(trans);
- bch2_btree_node_free_inmem(c, b, iter);
- bch2_btree_node_free_inmem(c, m, iter);
+ bch2_btree_node_free_inmem(trans, b);
+ bch2_btree_node_free_inmem(trans, m);
six_unlock_intent(&n->c.lock);
bch2_btree_update_done(as);
out:
- bch2_btree_trans_verify_locks(trans);
- bch2_trans_iter_free(trans, sib_iter);
-
- /*
- * Don't downgrade locks here: we're called after successful insert,
- * and the caller will downgrade locks after a successful insert
- * anyways (in case e.g. a split was required first)
- *
- * And we're also called when inserting into interior nodes in the
- * split path, and downgrading to read locks in there is potentially
- * confusing:
- */
- return ret ?: ret2;
err:
- bch2_trans_iter_put(trans, sib_iter);
- sib_iter = NULL;
-
- if (ret == -EINTR && bch2_trans_relock(trans))
- goto retry;
-
- if (ret == -EINTR && !(flags & BTREE_INSERT_NOUNLOCK)) {
- ret2 = ret;
- ret = bch2_btree_iter_traverse_all(trans);
- if (!ret)
- goto retry;
- }
-
- goto out;
+ bch2_path_put(trans, sib_path, true);
+ bch2_trans_verify_locks(trans);
+ return ret;
}
/**
* bch_btree_node_rewrite - Rewrite/move a btree node
*/
-int bch2_btree_node_rewrite(struct bch_fs *c, struct btree_iter *iter,
- __le64 seq, unsigned flags)
+int bch2_btree_node_rewrite(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct btree *b,
+ unsigned flags)
{
- struct btree *b, *n, *parent;
+ struct bch_fs *c = trans->c;
+ struct btree *n, *parent;
struct btree_update *as;
int ret;
flags |= BTREE_INSERT_NOFAIL;
-retry:
- ret = bch2_btree_iter_traverse(iter);
- if (ret)
- goto out;
- b = bch2_btree_iter_peek_node(iter);
- if (!b || b->data->keys.seq != seq)
- goto out;
-
- parent = btree_node_parent(iter, b);
- as = bch2_btree_update_start(iter, b->c.level,
+ parent = btree_node_parent(iter->path, b);
+ as = bch2_btree_update_start(trans, iter->path, b->c.level,
(parent
? btree_update_reserve_required(c, parent)
: 0) + 1,
flags);
ret = PTR_ERR_OR_ZERO(as);
- if (ret == -EINTR)
- goto retry;
if (ret) {
trace_btree_gc_rewrite_node_fail(c, b);
goto out;
@@ -1793,22 +1758,22 @@ retry:
if (parent) {
bch2_keylist_add(&as->parent_keys, &n->key);
- bch2_btree_insert_node(as, parent, iter, &as->parent_keys, flags);
+ bch2_btree_insert_node(as, trans, iter->path, parent,
+ &as->parent_keys, flags);
} else {
- bch2_btree_set_root(as, n, iter);
+ bch2_btree_set_root(as, trans, iter->path, n);
}
bch2_btree_update_get_open_buckets(as, n);
six_lock_increment(&b->c.lock, SIX_LOCK_intent);
- bch2_btree_iter_node_drop(iter, b);
- bch2_btree_iter_node_replace(iter, n);
- bch2_btree_node_free_inmem(c, b, iter);
+ bch2_trans_node_add(trans, n);
+ bch2_btree_node_free_inmem(trans, b);
six_unlock_intent(&n->c.lock);
bch2_btree_update_done(as);
out:
- bch2_btree_iter_downgrade(iter);
+ bch2_btree_path_downgrade(iter->path);
return ret;
}
@@ -1821,20 +1786,38 @@ struct async_btree_rewrite {
__le64 seq;
};
+static int async_btree_node_rewrite_trans(struct btree_trans *trans,
+ struct async_btree_rewrite *a)
+{
+ struct btree_iter iter;
+ struct btree *b;
+ int ret;
+
+ bch2_trans_node_iter_init(trans, &iter, a->btree_id, a->pos,
+ BTREE_MAX_DEPTH, a->level, 0);
+ b = bch2_btree_iter_peek_node(&iter);
+ ret = PTR_ERR_OR_ZERO(b);
+ if (ret)
+ goto out;
+
+ if (!b || b->data->keys.seq != a->seq)
+ goto out;
+
+ ret = bch2_btree_node_rewrite(trans, &iter, b, 0);
+out:
+ bch2_trans_iter_exit(trans, &iter);
+
+ return ret;
+}
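
async_btree_node_rewrite_trans() re-looks the node up by btree id and position, then compares b->data->keys.seq against the sequence captured when the work was queued, so a node that was freed or reused in the meantime is silently skipped. A small sketch of that validate-by-sequence-number pattern, with a hypothetical lookup table and entry type rather than the bcachefs structures:

	/* Hypothetical illustration, not bcachefs code: re-validate a deferred reference by sequence number. */
	#include <stdio.h>

	struct entry { int id; unsigned long seq; };		/* a cached object */
	struct deferred_work { int id; unsigned long seq; };	/* captured when queued */

	/* Pretend lookup: returns the current entry for an id, or NULL. */
	static struct entry *lookup(struct entry *table, unsigned nr, int id)
	{
		for (unsigned i = 0; i < nr; i++)
			if (table[i].id == id)
				return &table[i];
		return NULL;
	}

	static int run_deferred(struct entry *table, unsigned nr, struct deferred_work *w)
	{
		struct entry *e = lookup(table, nr, w->id);

		/* Object gone, or reused for something newer: nothing to do. */
		if (!e || e->seq != w->seq)
			return 0;

		printf("rewriting entry %d (seq %lu)\n", e->id, e->seq);
		return 0;
	}

	int main(void)
	{
		struct entry table[] = { { 1, 100 }, { 2, 207 } };
		struct deferred_work stale = { 2, 206 }, live = { 1, 100 };

		run_deferred(table, 2, &stale);	/* skipped: seq mismatch */
		run_deferred(table, 2, &live);	/* runs */
		return 0;
	}
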
+
void async_btree_node_rewrite_work(struct work_struct *work)
{
struct async_btree_rewrite *a =
container_of(work, struct async_btree_rewrite, work);
struct bch_fs *c = a->c;
- struct btree_trans trans;
- struct btree_iter *iter;
- bch2_trans_init(&trans, c, 0, 0);
- iter = bch2_trans_get_node_iter(&trans, a->btree_id, a->pos,
- BTREE_MAX_DEPTH, a->level, 0);
- bch2_btree_node_rewrite(c, iter, a->seq, 0);
- bch2_trans_iter_put(&trans, iter);
- bch2_trans_exit(&trans);
+ bch2_trans_do(c, NULL, NULL, 0,
+ async_btree_node_rewrite_trans(&trans, a));
percpu_ref_put(&c->writes);
kfree(a);
}
@@ -1865,75 +1848,123 @@ void bch2_btree_node_rewrite_async(struct bch_fs *c, struct btree *b)
queue_work(c->btree_interior_update_worker, &a->work);
}
-static void __bch2_btree_node_update_key(struct bch_fs *c,
- struct btree_update *as,
- struct btree_iter *iter,
- struct btree *b, struct btree *new_hash,
- struct bkey_i *new_key)
+static int __bch2_btree_node_update_key(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct btree *b, struct btree *new_hash,
+ struct bkey_i *new_key,
+ bool skip_triggers)
{
+ struct bch_fs *c = trans->c;
+ struct btree_iter iter2 = { NULL };
struct btree *parent;
+ u64 journal_entries[BKEY_BTREE_PTR_U64s_MAX];
int ret;
- btree_update_will_delete_key(as, &b->key);
- btree_update_will_add_key(as, new_key);
+ if (!skip_triggers) {
+ ret = bch2_trans_mark_key(trans,
+ bkey_s_c_null,
+ bkey_i_to_s_c(new_key),
+ BTREE_TRIGGER_INSERT);
+ if (ret)
+ return ret;
+
+ ret = bch2_trans_mark_key(trans,
+ bkey_i_to_s_c(&b->key),
+ bkey_s_c_null,
+ BTREE_TRIGGER_OVERWRITE);
+ if (ret)
+ return ret;
+ }
- parent = btree_node_parent(iter, b);
+ if (new_hash) {
+ bkey_copy(&new_hash->key, new_key);
+ ret = bch2_btree_node_hash_insert(&c->btree_cache,
+ new_hash, b->c.level, b->c.btree_id);
+ BUG_ON(ret);
+ }
+
+ parent = btree_node_parent(iter->path, b);
if (parent) {
- if (new_hash) {
- bkey_copy(&new_hash->key, new_key);
- ret = bch2_btree_node_hash_insert(&c->btree_cache,
- new_hash, b->c.level, b->c.btree_id);
- BUG_ON(ret);
- }
+ bch2_trans_copy_iter(&iter2, iter);
- bch2_keylist_add(&as->parent_keys, new_key);
- bch2_btree_insert_node(as, parent, iter, &as->parent_keys, 0);
+ iter2.path = bch2_btree_path_make_mut(trans, iter2.path,
+ iter2.flags & BTREE_ITER_INTENT,
+ _THIS_IP_);
- if (new_hash) {
- mutex_lock(&c->btree_cache.lock);
- bch2_btree_node_hash_remove(&c->btree_cache, new_hash);
+ BUG_ON(iter2.path->level != b->c.level);
+ BUG_ON(bpos_cmp(iter2.path->pos, new_key->k.p));
- bch2_btree_node_hash_remove(&c->btree_cache, b);
+ btree_node_unlock(iter2.path, iter2.path->level);
+ path_l(iter2.path)->b = BTREE_ITER_NO_NODE_UP;
+ iter2.path->level++;
- bkey_copy(&b->key, new_key);
- ret = __bch2_btree_node_hash_insert(&c->btree_cache, b);
- BUG_ON(ret);
- mutex_unlock(&c->btree_cache.lock);
- } else {
- bkey_copy(&b->key, new_key);
- }
+ ret = bch2_btree_iter_traverse(&iter2) ?:
+ bch2_trans_update(trans, &iter2, new_key, BTREE_TRIGGER_NORUN);
+ if (ret)
+ goto err;
} else {
BUG_ON(btree_node_root(c, b) != b);
- bch2_btree_node_lock_write(b, iter);
- bkey_copy(&b->key, new_key);
+ trans->extra_journal_entries = (void *) &journal_entries[0];
+ trans->extra_journal_entry_u64s =
+ journal_entry_set((void *) &journal_entries[0],
+ BCH_JSET_ENTRY_btree_root,
+ b->c.btree_id, b->c.level,
+ new_key, new_key->k.u64s);
+ }
- if (btree_ptr_hash_val(&b->key) != b->hash_val) {
- mutex_lock(&c->btree_cache.lock);
- bch2_btree_node_hash_remove(&c->btree_cache, b);
+ ret = bch2_trans_commit(trans, NULL, NULL,
+ BTREE_INSERT_NOFAIL|
+ BTREE_INSERT_NOCHECK_RW|
+ BTREE_INSERT_JOURNAL_RECLAIM|
+ BTREE_INSERT_JOURNAL_RESERVED);
+ if (ret)
+ goto err;
- ret = __bch2_btree_node_hash_insert(&c->btree_cache, b);
- BUG_ON(ret);
- mutex_unlock(&c->btree_cache.lock);
- }
+ bch2_btree_node_lock_write(trans, iter->path, b);
- btree_update_updated_root(as, b);
- bch2_btree_node_unlock_write(b, iter);
+ if (new_hash) {
+ mutex_lock(&c->btree_cache.lock);
+ bch2_btree_node_hash_remove(&c->btree_cache, new_hash);
+ bch2_btree_node_hash_remove(&c->btree_cache, b);
+
+ bkey_copy(&b->key, new_key);
+ ret = __bch2_btree_node_hash_insert(&c->btree_cache, b);
+ BUG_ON(ret);
+ mutex_unlock(&c->btree_cache.lock);
+ } else {
+ bkey_copy(&b->key, new_key);
}
- bch2_btree_update_done(as);
+ bch2_btree_node_unlock_write(trans, iter->path, b);
+out:
+ bch2_trans_iter_exit(trans, &iter2);
+ return ret;
+err:
+ if (new_hash) {
+ mutex_lock(&c->btree_cache.lock);
+ bch2_btree_node_hash_remove(&c->btree_cache, b);
+ mutex_unlock(&c->btree_cache.lock);
+ }
+ goto out;
}
-int bch2_btree_node_update_key(struct bch_fs *c, struct btree_iter *iter,
- struct btree *b,
- struct bkey_i *new_key)
+int bch2_btree_node_update_key(struct btree_trans *trans, struct btree_iter *iter,
+ struct btree *b, struct bkey_i *new_key,
+ bool skip_triggers)
{
- struct btree *parent = btree_node_parent(iter, b);
- struct btree_update *as = NULL;
+ struct bch_fs *c = trans->c;
struct btree *new_hash = NULL;
+ struct btree_path *path = iter->path;
struct closure cl;
int ret = 0;
+ if (!btree_node_intent_locked(path, b->c.level) &&
+ !bch2_btree_path_upgrade(trans, path, b->c.level + 1)) {
+ btree_trans_restart(trans);
+ return -EINTR;
+ }
+
closure_init_stack(&cl);
/*
@@ -1943,27 +1974,20 @@ int bch2_btree_node_update_key(struct bch_fs *c, struct btree_iter *iter,
if (btree_ptr_hash_val(new_key) != b->hash_val) {
ret = bch2_btree_cache_cannibalize_lock(c, &cl);
if (ret) {
- bch2_trans_unlock(iter->trans);
+ bch2_trans_unlock(trans);
closure_sync(&cl);
- if (!bch2_trans_relock(iter->trans))
+ if (!bch2_trans_relock(trans))
return -EINTR;
}
new_hash = bch2_btree_node_mem_alloc(c);
}
- as = bch2_btree_update_start(iter, b->c.level,
- parent ? btree_update_reserve_required(c, parent) : 0,
- BTREE_INSERT_NOFAIL);
- if (IS_ERR(as)) {
- ret = PTR_ERR(as);
- goto err;
- }
-
- __bch2_btree_node_update_key(c, as, iter, b, new_hash, new_key);
+ path->intent_ref++;
+ ret = __bch2_btree_node_update_key(trans, iter, b, new_hash,
+ new_key, skip_triggers);
+ --path->intent_ref;
- bch2_btree_iter_downgrade(iter);
-err:
if (new_hash) {
mutex_lock(&c->btree_cache.lock);
list_move(&new_hash->list, &c->btree_cache.freeable);
@@ -1977,6 +2001,35 @@ err:
return ret;
}
+int bch2_btree_node_update_key_get_iter(struct btree_trans *trans,
+ struct btree *b, struct bkey_i *new_key,
+ bool skip_triggers)
+{
+ struct btree_iter iter;
+ int ret;
+
+ bch2_trans_node_iter_init(trans, &iter, b->c.btree_id, b->key.k.p,
+ BTREE_MAX_DEPTH, b->c.level,
+ BTREE_ITER_INTENT);
+ ret = bch2_btree_iter_traverse(&iter);
+ if (ret)
+ goto out;
+
+ /* has node been freed? */
+ if (iter.path->l[b->c.level].b != b) {
+ /* node has been freed: */
+ BUG_ON(!btree_node_dying(b));
+ goto out;
+ }
+
+ BUG_ON(!btree_node_hashed(b));
+
+ ret = bch2_btree_node_update_key(trans, &iter, b, new_key, skip_triggers);
+out:
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
/* Init code: */
/*
@@ -2012,7 +2065,7 @@ void bch2_btree_root_alloc(struct bch_fs *c, enum btree_id id)
b->c.btree_id = id;
bkey_btree_ptr_init(&b->key);
- b->key.k.p = POS_MAX;
+ b->key.k.p = SPOS_MAX;
*((u64 *) bkey_i_to_btree_ptr(&b->key)->v.start) = U64_MAX - id;
bch2_bset_init_first(b, &b->data->keys);
@@ -2020,7 +2073,7 @@ void bch2_btree_root_alloc(struct bch_fs *c, enum btree_id id)
b->data->flags = 0;
btree_set_min(b, POS_MIN);
- btree_set_max(b, POS_MAX);
+ btree_set_max(b, SPOS_MAX);
b->data->format = bch2_btree_calc_format(b);
btree_node_set_format(b, b->data->format);
diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h
index 7ed67b47e1b9..8e03bd987d6d 100644
--- a/fs/bcachefs/btree_update_interior.h
+++ b/fs/bcachefs/btree_update_interior.h
@@ -113,60 +113,39 @@ struct btree_update {
u64 inline_keys[BKEY_BTREE_PTR_U64s_MAX * 3];
};
-void bch2_btree_node_free_inmem(struct bch_fs *, struct btree *,
- struct btree_iter *);
-void bch2_btree_node_free_never_inserted(struct bch_fs *, struct btree *);
-
-void bch2_btree_update_get_open_buckets(struct btree_update *, struct btree *);
-
struct btree *__bch2_btree_node_alloc_replacement(struct btree_update *,
struct btree *,
struct bkey_format);
-void bch2_btree_update_done(struct btree_update *);
-struct btree_update *
-bch2_btree_update_start(struct btree_iter *, unsigned, unsigned, unsigned);
-
-void bch2_btree_interior_update_will_free_node(struct btree_update *,
- struct btree *);
-void bch2_btree_update_add_new_node(struct btree_update *, struct btree *);
-
-void bch2_btree_insert_node(struct btree_update *, struct btree *,
- struct btree_iter *, struct keylist *,
- unsigned);
-int bch2_btree_split_leaf(struct bch_fs *, struct btree_iter *, unsigned);
+int bch2_btree_split_leaf(struct btree_trans *, struct btree_path *, unsigned);
-int __bch2_foreground_maybe_merge(struct bch_fs *, struct btree_iter *,
+int __bch2_foreground_maybe_merge(struct btree_trans *, struct btree_path *,
unsigned, unsigned, enum btree_node_sibling);
-static inline int bch2_foreground_maybe_merge_sibling(struct bch_fs *c,
- struct btree_iter *iter,
+static inline int bch2_foreground_maybe_merge_sibling(struct btree_trans *trans,
+ struct btree_path *path,
unsigned level, unsigned flags,
enum btree_node_sibling sib)
{
struct btree *b;
- if (iter->uptodate >= BTREE_ITER_NEED_TRAVERSE)
- return 0;
-
- if (!bch2_btree_node_relock(iter, level))
- return 0;
+ EBUG_ON(!btree_node_locked(path, level));
- b = iter->l[level].b;
- if (b->sib_u64s[sib] > c->btree_foreground_merge_threshold)
+ b = path->l[level].b;
+ if (b->sib_u64s[sib] > trans->c->btree_foreground_merge_threshold)
return 0;
- return __bch2_foreground_maybe_merge(c, iter, level, flags, sib);
+ return __bch2_foreground_maybe_merge(trans, path, level, flags, sib);
}
-static inline int bch2_foreground_maybe_merge(struct bch_fs *c,
- struct btree_iter *iter,
- unsigned level,
- unsigned flags)
+static inline int bch2_foreground_maybe_merge(struct btree_trans *trans,
+ struct btree_path *path,
+ unsigned level,
+ unsigned flags)
{
- return bch2_foreground_maybe_merge_sibling(c, iter, level, flags,
+ return bch2_foreground_maybe_merge_sibling(trans, path, level, flags,
btree_prev_sib) ?:
- bch2_foreground_maybe_merge_sibling(c, iter, level, flags,
+ bch2_foreground_maybe_merge_sibling(trans, path, level, flags,
btree_next_sib);
}
diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c
index 0d566be7455e..112ac7caf579 100644
--- a/fs/bcachefs/btree_update_leaf.c
+++ b/fs/bcachefs/btree_update_leaf.c
@@ -15,6 +15,7 @@
#include "journal.h"
#include "journal_reclaim.h"
#include "keylist.h"
+#include "subvolume.h"
#include "replicas.h"
#include <linux/prefetch.h>
@@ -29,37 +30,59 @@ static inline int btree_insert_entry_cmp(const struct btree_insert_entry *l,
bpos_cmp(l->k->k.p, r->k->k.p);
}
+static inline struct btree_path_level *insert_l(struct btree_insert_entry *i)
+{
+ return i->path->l + i->level;
+}
+
static inline bool same_leaf_as_prev(struct btree_trans *trans,
struct btree_insert_entry *i)
{
- return i != trans->updates2 &&
- iter_l(i[0].iter)->b == iter_l(i[-1].iter)->b;
+ return i != trans->updates &&
+ insert_l(&i[0])->b == insert_l(&i[-1])->b;
}
-inline void bch2_btree_node_lock_for_insert(struct bch_fs *c, struct btree *b,
- struct btree_iter *iter)
+static inline bool same_leaf_as_next(struct btree_trans *trans,
+ struct btree_insert_entry *i)
{
- bch2_btree_node_lock_write(b, iter);
+ return i + 1 < trans->updates + trans->nr_updates &&
+ insert_l(&i[0])->b == insert_l(&i[1])->b;
+}
- if (btree_iter_type(iter) == BTREE_ITER_CACHED)
+static inline void bch2_btree_node_prep_for_write(struct btree_trans *trans,
+ struct btree_path *path,
+ struct btree *b)
+{
+ struct bch_fs *c = trans->c;
+
+ if (path->cached)
return;
if (unlikely(btree_node_just_written(b)) &&
bch2_btree_post_write_cleanup(c, b))
- bch2_btree_iter_reinit_node(iter, b);
+ bch2_trans_node_reinit_iter(trans, b);
/*
* If the last bset has been written, or if it's gotten too big - start
* a new bset to insert into:
*/
if (want_new_bset(c, b))
- bch2_btree_init_next(c, b, iter);
+ bch2_btree_init_next(trans, b);
+}
+
+void bch2_btree_node_lock_for_insert(struct btree_trans *trans,
+ struct btree_path *path,
+ struct btree *b)
+{
+ bch2_btree_node_lock_write(trans, path, b);
+ bch2_btree_node_prep_for_write(trans, path, b);
}
/* Inserting into a given leaf node (last stage of insert): */
/* Handle overwrites and do insert, for non extents: */
-bool bch2_btree_bset_insert_key(struct btree_iter *iter,
+bool bch2_btree_bset_insert_key(struct btree_trans *trans,
+ struct btree_path *path,
struct btree *b,
struct btree_node_iter *node_iter,
struct bkey_i *insert)
@@ -73,8 +96,7 @@ bool bch2_btree_bset_insert_key(struct btree_iter *iter,
EBUG_ON(bpos_cmp(insert->k.p, b->data->min_key) < 0);
EBUG_ON(bpos_cmp(insert->k.p, b->data->max_key) > 0);
EBUG_ON(insert->k.u64s >
- bch_btree_keys_u64s_remaining(iter->trans->c, b));
- EBUG_ON(iter->flags & BTREE_ITER_IS_EXTENTS);
+ bch_btree_keys_u64s_remaining(trans->c, b));
k = bch2_btree_node_iter_peek_all(node_iter, b);
if (k && bkey_cmp_left_packed(b, k, &insert->k.p))
@@ -93,7 +115,7 @@ bool bch2_btree_bset_insert_key(struct btree_iter *iter,
k->type = KEY_TYPE_deleted;
if (k->needs_whiteout)
- push_whiteout(iter->trans->c, b, insert->k.p);
+ push_whiteout(trans->c, b, insert->k.p);
k->needs_whiteout = false;
if (k >= btree_bset_last(b)->start) {
@@ -101,7 +123,7 @@ bool bch2_btree_bset_insert_key(struct btree_iter *iter,
bch2_bset_delete(b, k, clobber_u64s);
goto fix_iter;
} else {
- bch2_btree_iter_fix_key_modified(iter, b, k);
+ bch2_btree_path_fix_key_modified(trans, b, k);
}
return true;
@@ -119,7 +141,7 @@ bool bch2_btree_bset_insert_key(struct btree_iter *iter,
clobber_u64s = k->u64s;
goto overwrite;
} else {
- bch2_btree_iter_fix_key_modified(iter, b, k);
+ bch2_btree_path_fix_key_modified(trans, b, k);
}
}
@@ -129,7 +151,7 @@ overwrite:
new_u64s = k->u64s;
fix_iter:
if (clobber_u64s != new_u64s)
- bch2_btree_node_iter_fix(iter, b, node_iter, k,
+ bch2_btree_node_iter_fix(trans, path, b, node_iter, k,
clobber_u64s, new_u64s);
return true;
}
@@ -173,22 +195,21 @@ inline void bch2_btree_add_journal_pin(struct bch_fs *c,
 * btree_insert_key - insert one key into a leaf node
*/
static bool btree_insert_key_leaf(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bkey_i *insert)
+ struct btree_insert_entry *insert)
{
struct bch_fs *c = trans->c;
- struct btree *b = iter_l(iter)->b;
+ struct btree *b = insert_l(insert)->b;
struct bset_tree *t = bset_tree_last(b);
struct bset *i = bset(b, t);
int old_u64s = bset_u64s(t);
int old_live_u64s = b->nr.live_u64s;
int live_u64s_added, u64s_added;
- EBUG_ON(!iter->level &&
+ EBUG_ON(!insert->level &&
!test_bit(BCH_FS_BTREE_INTERIOR_REPLAY_DONE, &c->flags));
- if (unlikely(!bch2_btree_bset_insert_key(iter, b,
- &iter_l(iter)->iter, insert)))
+ if (unlikely(!bch2_btree_bset_insert_key(trans, insert->path, b,
+ &insert_l(insert)->iter, insert->k)))
return false;
i->journal_seq = cpu_to_le64(max(trans->journal_res.seq,
@@ -209,9 +230,8 @@ static bool btree_insert_key_leaf(struct btree_trans *trans,
if (u64s_added > live_u64s_added &&
bch2_maybe_compact_whiteouts(c, b))
- bch2_btree_iter_reinit_node(iter, b);
+ bch2_trans_node_reinit_iter(trans, b);
- trace_btree_insert_key(c, b, insert);
return true;
}
@@ -222,9 +242,15 @@ static bool btree_insert_key_leaf(struct btree_trans *trans,
static inline void btree_insert_entry_checks(struct btree_trans *trans,
struct btree_insert_entry *i)
{
- BUG_ON(!i->is_extent && bpos_cmp(i->k->k.p, i->iter->real_pos));
- BUG_ON(i->level != i->iter->level);
- BUG_ON(i->btree_id != i->iter->btree_id);
+ BUG_ON(bpos_cmp(i->k->k.p, i->path->pos));
+ BUG_ON(i->cached != i->path->cached);
+ BUG_ON(i->level != i->path->level);
+ BUG_ON(i->btree_id != i->path->btree_id);
+ EBUG_ON(!i->level &&
+ !(i->flags & BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) &&
+ test_bit(JOURNAL_REPLAY_DONE, &trans->c->journal.flags) &&
+ i->k->k.p.snapshot &&
+ bch2_snapshot_internal_node(trans->c, i->k->k.p.snapshot));
}
static noinline int
@@ -264,13 +290,12 @@ static inline int bch2_trans_journal_res_get(struct btree_trans *trans,
return ret == -EAGAIN ? BTREE_INSERT_NEED_JOURNAL_RES : ret;
}
-static enum btree_insert_ret
+static inline enum btree_insert_ret
btree_key_can_insert(struct btree_trans *trans,
- struct btree_iter *iter,
+ struct btree *b,
unsigned u64s)
{
struct bch_fs *c = trans->c;
- struct btree *b = iter_l(iter)->b;
if (!bch2_btree_node_insert_fits(c, b, u64s))
return BTREE_INSERT_BTREE_NODE_FULL;
@@ -280,14 +305,14 @@ btree_key_can_insert(struct btree_trans *trans,
static enum btree_insert_ret
btree_key_can_insert_cached(struct btree_trans *trans,
- struct btree_iter *iter,
+ struct btree_path *path,
unsigned u64s)
{
- struct bkey_cached *ck = (void *) iter->l[0].b;
+ struct bkey_cached *ck = (void *) path->l[0].b;
unsigned new_u64s;
struct bkey_i *new_k;
- BUG_ON(iter->level);
+ EBUG_ON(path->level);
if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags) &&
bch2_btree_key_cache_must_wait(trans->c) &&
@@ -325,9 +350,9 @@ static inline void do_btree_insert_one(struct btree_trans *trans,
i->k->k.needs_whiteout = false;
- did_work = (btree_iter_type(i->iter) != BTREE_ITER_CACHED)
- ? btree_insert_key_leaf(trans, i->iter, i->k)
- : bch2_btree_insert_key_cached(trans, i->iter, i->k);
+ did_work = !i->cached
+ ? btree_insert_key_leaf(trans, i)
+ : bch2_btree_insert_key_cached(trans, i->path, i->k);
if (!did_work)
return;
@@ -337,19 +362,11 @@ static inline void do_btree_insert_one(struct btree_trans *trans,
i->level,
i->k);
- bch2_journal_set_has_inode(j, &trans->journal_res,
- i->k->k.p.inode);
-
if (trans->journal_seq)
*trans->journal_seq = trans->journal_res.seq;
}
}
-static noinline void bch2_btree_iter_unlock_noinline(struct btree_iter *iter)
-{
- __bch2_btree_iter_unlock(iter);
-}
-
static noinline void bch2_trans_mark_gc(struct btree_trans *trans)
{
struct bch_fs *c = trans->c;
@@ -358,12 +375,13 @@ static noinline void bch2_trans_mark_gc(struct btree_trans *trans)
trans_for_each_update(trans, i) {
/*
* XXX: synchronization of cached update triggers with gc
+ * XXX: synchronization of interior node updates with gc
*/
- BUG_ON(btree_iter_type(i->iter) == BTREE_ITER_CACHED);
+ BUG_ON(i->cached || i->level);
- if (gc_visited(c, gc_pos_btree_node(i->iter->l[0].b)))
- bch2_mark_update(trans, i->iter, i->k, NULL,
- i->trigger_flags|BTREE_TRIGGER_GC);
+ if (gc_visited(c, gc_pos_btree_node(insert_l(i)->b)))
+ bch2_mark_update(trans, i->path, i->k,
+ i->flags|BTREE_TRIGGER_GC);
}
}
@@ -381,6 +399,7 @@ bch2_trans_commit_write_locked(struct btree_trans *trans,
if (race_fault()) {
trace_trans_restart_fault_inject(trans->ip, trace_ip);
+ trans->restarted = true;
return -EINTR;
}
@@ -400,15 +419,15 @@ bch2_trans_commit_write_locked(struct btree_trans *trans,
h = h->next;
}
- trans_for_each_update2(trans, i) {
+ trans_for_each_update(trans, i) {
/* Multiple inserts might go to same leaf: */
if (!same_leaf_as_prev(trans, i))
u64s = 0;
u64s += i->k->k.u64s;
- ret = btree_iter_type(i->iter) != BTREE_ITER_CACHED
- ? btree_key_can_insert(trans, i->iter, u64s)
- : btree_key_can_insert_cached(trans, i->iter, u64s);
+ ret = !i->cached
+ ? btree_key_can_insert(trans, insert_l(i)->b, u64s)
+ : btree_key_can_insert_cached(trans, i->path, u64s);
if (ret) {
*stopped_at = i;
return ret;
@@ -458,17 +477,16 @@ bch2_trans_commit_write_locked(struct btree_trans *trans,
if (!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY)) {
if (bch2_journal_seq_verify)
- trans_for_each_update2(trans, i)
+ trans_for_each_update(trans, i)
i->k->k.version.lo = trans->journal_res.seq;
else if (bch2_inject_invalid_keys)
- trans_for_each_update2(trans, i)
+ trans_for_each_update(trans, i)
i->k->k.version = MAX_VERSION;
}
trans_for_each_update(trans, i)
if (BTREE_NODE_TYPE_HAS_MEM_TRIGGERS & (1U << i->bkey_type))
- bch2_mark_update(trans, i->iter, i->k,
- NULL, i->trigger_flags);
+ bch2_mark_update(trans, i->path, i->k, i->flags);
if (marking && trans->fs_usage_deltas)
bch2_trans_fs_usage_apply(trans, trans->fs_usage_deltas);
@@ -476,7 +494,7 @@ bch2_trans_commit_write_locked(struct btree_trans *trans,
if (unlikely(c->gc_pos.phase))
bch2_trans_mark_gc(trans);
- trans_for_each_update2(trans, i)
+ trans_for_each_update(trans, i)
do_btree_insert_one(trans, i);
err:
if (marking) {
@@ -486,41 +504,106 @@ err:
return ret;
}
-static noinline int maybe_do_btree_merge(struct btree_trans *trans, struct btree_iter *iter)
+static inline void path_upgrade_readers(struct btree_trans *trans, struct btree_path *path)
{
- struct btree_insert_entry *i;
- struct btree *b = iter_l(iter)->b;
- struct bkey_s_c old;
- int u64s_delta = 0;
- int ret;
+ unsigned l;
- /*
- * Inserting directly into interior nodes is an uncommon operation with
- * various weird edge cases: also, a lot of things about
- * BTREE_ITER_NODES iters need to be audited
- */
- if (unlikely(btree_iter_type(iter) != BTREE_ITER_KEYS))
- return 0;
+ for (l = 0; l < BTREE_MAX_DEPTH; l++)
+ if (btree_node_read_locked(path, l))
+ BUG_ON(!bch2_btree_node_upgrade(trans, path, l));
+}
- BUG_ON(iter->level);
+static inline void upgrade_readers(struct btree_trans *trans, struct btree_path *path)
+{
+ struct btree *b = path_l(path)->b;
- trans_for_each_update2(trans, i) {
- if (iter_l(i->iter)->b != b)
+ do {
+ if (path->nodes_locked &&
+ path->nodes_locked != path->nodes_intent_locked)
+ path_upgrade_readers(trans, path);
+ } while ((path = prev_btree_path(trans, path)) &&
+ path_l(path)->b == b);
+}
+
+/*
+ * Check for nodes that we have both read and intent locks on, and upgrade the
+ * readers to intent:
+ */
+static inline void normalize_read_intent_locks(struct btree_trans *trans)
+{
+ struct btree_path *path;
+ unsigned i, nr_read = 0, nr_intent = 0;
+
+ trans_for_each_path_inorder(trans, path, i) {
+ struct btree_path *next = i + 1 < trans->nr_sorted
+ ? trans->paths + trans->sorted[i + 1]
+ : NULL;
+
+ if (path->nodes_locked) {
+ if (path->nodes_intent_locked)
+ nr_intent++;
+ else
+ nr_read++;
+ }
+
+ if (!next || path_l(path)->b != path_l(next)->b) {
+ if (nr_read && nr_intent)
+ upgrade_readers(trans, path);
+
+ nr_read = nr_intent = 0;
+ }
+ }
+
+ bch2_trans_verify_locks(trans);
+}
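
normalize_read_intent_locks() walks the transaction's paths in sorted order, counts read versus intent locks within each group of paths that point at the same node, and upgrades the readers whenever a group holds both kinds, so the later write-lock step cannot block on the transaction's own read locks. A simplified, self-contained sketch of the grouping idea over a sorted array, using hypothetical single-level path records rather than the per-level lock bitmasks used above:

	/* Hypothetical illustration, not bcachefs code: upgrade readers in groups that also hold intent locks. */
	#include <stdio.h>
	#include <stdbool.h>

	struct path { int node; bool locked; bool intent; };	/* sorted by node */

	static void upgrade_group(struct path *p, unsigned start, unsigned end)
	{
		for (unsigned i = start; i < end; i++)
			if (p[i].locked && !p[i].intent) {
				p[i].intent = true;		/* read -> intent */
				printf("upgraded path %u on node %d\n", i, p[i].node);
			}
	}

	static void normalize(struct path *p, unsigned nr)
	{
		unsigned start = 0, nr_read = 0, nr_intent = 0;

		for (unsigned i = 0; i < nr; i++) {
			if (p[i].locked) {
				if (p[i].intent)
					nr_intent++;
				else
					nr_read++;
			}

			/* Group boundary: the next path (if any) is on a different node. */
			if (i + 1 == nr || p[i + 1].node != p[i].node) {
				if (nr_read && nr_intent)
					upgrade_group(p, start, i + 1);
				start = i + 1;
				nr_read = nr_intent = 0;
			}
		}
	}

	int main(void)
	{
		struct path paths[] = {
			{ 10, true, false }, { 10, true, true },	/* mixed group: reader upgraded */
			{ 20, true, false },				/* readers only: untouched */
		};

		normalize(paths, 3);
		return 0;
	}
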
+
+static inline bool have_conflicting_read_lock(struct btree_trans *trans, struct btree_path *pos)
+{
+ struct btree_path *path;
+ unsigned i;
+
+ trans_for_each_path_inorder(trans, path, i) {
+ //if (path == pos)
+ // break;
+
+ if (path->nodes_locked != path->nodes_intent_locked &&
+ !bch2_btree_path_upgrade(trans, path, path->level + 1))
+ return true;
+ }
+
+ return false;
+}
+
+static inline int trans_lock_write(struct btree_trans *trans)
+{
+ struct btree_insert_entry *i;
+
+ trans_for_each_update(trans, i) {
+ if (same_leaf_as_prev(trans, i))
continue;
- old = bch2_btree_iter_peek_slot(i->iter);
- ret = bkey_err(old);
- if (ret)
- return ret;
+ if (!six_trylock_write(&insert_l(i)->b->c.lock)) {
+ if (have_conflicting_read_lock(trans, i->path))
+ goto fail;
- u64s_delta += !bkey_deleted(&i->k->k) ? i->k->k.u64s : 0;
- u64s_delta -= !bkey_deleted(old.k) ? old.k->u64s : 0;
+ __btree_node_lock_type(trans->c, insert_l(i)->b,
+ SIX_LOCK_write);
+ }
+
+ bch2_btree_node_prep_for_write(trans, i->path, insert_l(i)->b);
}
- return u64s_delta <= 0
- ? (bch2_foreground_maybe_merge(trans->c, iter, iter->level,
- trans->flags & ~BTREE_INSERT_NOUNLOCK) ?: -EINTR)
- : 0;
+ return 0;
+fail:
+ while (--i >= trans->updates) {
+ if (same_leaf_as_prev(trans, i))
+ continue;
+
+ bch2_btree_node_unlock_write_inlined(trans, i->path, insert_l(i)->b);
+ }
+
+ trace_trans_restart_would_deadlock_write(trans->ip);
+ return btree_trans_restart(trans);
}
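
trans_lock_write() try-locks each distinct leaf once; when a trylock fails and some path still holds a bare read lock (a potential self-deadlock), it unwinds everything it already took and restarts the transaction, and otherwise it blocks in __btree_node_lock_type(). A simplified userspace sketch of the all-or-nothing trylock with rollback, using plain pthread mutexes rather than the six-lock API:

	/* Hypothetical illustration, not bcachefs code: all-or-nothing trylock with rollback. */
	#include <pthread.h>
	#include <stdio.h>

	#define NR 3

	static pthread_mutex_t locks[NR] = {
		PTHREAD_MUTEX_INITIALIZER,
		PTHREAD_MUTEX_INITIALIZER,
		PTHREAD_MUTEX_INITIALIZER,
	};

	/* Returns 0 with all locks held, or -1 with none held (caller retries). */
	static int lock_all(void)
	{
		for (int i = 0; i < NR; i++) {
			if (pthread_mutex_trylock(&locks[i]) != 0) {
				/* Conflict: release everything taken so far and back off. */
				while (--i >= 0)
					pthread_mutex_unlock(&locks[i]);
				return -1;
			}
		}
		return 0;
	}

	static void unlock_all(void)
	{
		for (int i = 0; i < NR; i++)
			pthread_mutex_unlock(&locks[i]);
	}

	int main(void)
	{
		while (lock_all())
			;	/* in real code: drop conflicting locks / restart, do not spin */

		printf("all locks held\n");
		unlock_all();
		return 0;
	}
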
/*
@@ -532,29 +615,55 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans,
{
struct bch_fs *c = trans->c;
struct btree_insert_entry *i;
- struct btree_iter *iter;
- int ret;
+ struct bkey_s_c old;
+ int ret, u64s_delta = 0;
- trans_for_each_update2(trans, i) {
- struct btree *b;
+ trans_for_each_update(trans, i) {
+ const char *invalid = bch2_bkey_invalid(c,
+ bkey_i_to_s_c(i->k), i->bkey_type);
+ if (invalid) {
+ char buf[200];
- BUG_ON(!btree_node_intent_locked(i->iter, i->level));
+ bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(i->k));
+ bch_err(c, "invalid bkey %s on insert from %ps -> %ps: %s\n",
+ buf, (void *) trans->ip,
+ (void *) i->ip_allocated, invalid);
+ bch2_fatal_error(c);
+ return -EINVAL;
+ }
+ btree_insert_entry_checks(trans, i);
+ }
+
+ trans_for_each_update(trans, i) {
+ struct bkey u;
- if (btree_iter_type(i->iter) == BTREE_ITER_CACHED)
+ /*
+ * peek_slot() doesn't yet work on iterators that point to
+ * interior nodes:
+ */
+ if (i->cached || i->level)
continue;
- b = iter_l(i->iter)->b;
- if (b->sib_u64s[0] < c->btree_foreground_merge_threshold ||
- b->sib_u64s[1] < c->btree_foreground_merge_threshold) {
- ret = maybe_do_btree_merge(trans, i->iter);
- if (unlikely(ret))
- return ret;
+ old = bch2_btree_path_peek_slot(i->path, &u);
+ ret = bkey_err(old);
+ if (unlikely(ret))
+ return ret;
+
+ u64s_delta += !bkey_deleted(&i->k->k) ? i->k->k.u64s : 0;
+ u64s_delta -= !bkey_deleted(old.k) ? old.k->u64s : 0;
+
+ if (!same_leaf_as_next(trans, i)) {
+ if (u64s_delta <= 0) {
+ ret = bch2_foreground_maybe_merge(trans, i->path,
+ i->level, trans->flags);
+ if (unlikely(ret))
+ return ret;
+ }
+
+ u64s_delta = 0;
}
}
- trans_for_each_update2(trans, i)
- BUG_ON(!btree_node_intent_locked(i->iter, i->level));
-
ret = bch2_journal_preres_get(&c->journal,
&trans->journal_preres, trans->journal_preres_u64s,
JOURNAL_RES_GET_NONBLOCK|
@@ -566,57 +675,18 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans,
if (unlikely(ret))
return ret;
- /*
- * Can't be holding any read locks when we go to take write locks:
- * another thread could be holding an intent lock on the same node we
- * have a read lock on, and it'll block trying to take a write lock
- * (because we hold a read lock) and it could be blocking us by holding
- * its own read lock (while we're trying to to take write locks).
- *
- * note - this must be done after bch2_trans_journal_preres_get_cold()
- * or anything else that might call bch2_trans_relock(), since that
- * would just retake the read locks:
- */
- trans_for_each_iter(trans, iter) {
- if (iter->nodes_locked != iter->nodes_intent_locked) {
- if (btree_iter_keep(trans, iter)) {
- if (!bch2_btree_iter_upgrade(iter, 1)) {
- trace_trans_restart_upgrade(trans->ip, trace_ip,
- iter->btree_id,
- &iter->real_pos);
- return -EINTR;
- }
- } else {
- bch2_btree_iter_unlock_noinline(iter);
- }
- }
- }
-
- trans_for_each_update2(trans, i) {
- const char *invalid = bch2_bkey_invalid(c,
- bkey_i_to_s_c(i->k), i->bkey_type);
- if (invalid) {
- char buf[200];
-
- bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(i->k));
- bch_err(c, "invalid bkey %s on insert: %s\n", buf, invalid);
- bch2_fatal_error(c);
- }
- btree_insert_entry_checks(trans, i);
- }
- bch2_btree_trans_verify_locks(trans);
+ normalize_read_intent_locks(trans);
- trans_for_each_update2(trans, i)
- if (!same_leaf_as_prev(trans, i))
- bch2_btree_node_lock_for_insert(c,
- iter_l(i->iter)->b, i->iter);
+ ret = trans_lock_write(trans);
+ if (unlikely(ret))
+ return ret;
ret = bch2_trans_commit_write_locked(trans, stopped_at, trace_ip);
- trans_for_each_update2(trans, i)
+ trans_for_each_update(trans, i)
if (!same_leaf_as_prev(trans, i))
- bch2_btree_node_unlock_write_inlined(iter_l(i->iter)->b,
- i->iter);
+ bch2_btree_node_unlock_write_inlined(trans, i->path,
+ insert_l(i)->b);
if (!ret && trans->journal_pin)
bch2_journal_pin_add(&c->journal, trans->journal_res.seq,
@@ -652,60 +722,23 @@ int bch2_trans_commit_error(struct btree_trans *trans,
int ret, unsigned long trace_ip)
{
struct bch_fs *c = trans->c;
- unsigned flags = trans->flags;
-
- /*
- * BTREE_INSERT_NOUNLOCK means don't unlock _after_ successful btree
- * update; if we haven't done anything yet it doesn't apply
- */
- flags &= ~BTREE_INSERT_NOUNLOCK;
switch (ret) {
case BTREE_INSERT_BTREE_NODE_FULL:
- ret = bch2_btree_split_leaf(c, i->iter, flags);
-
- /*
- * if the split succeeded without dropping locks the insert will
- * still be atomic (what the caller peeked() and is overwriting
- * won't have changed)
- */
-#if 0
- /*
- * XXX:
- * split -> btree node merging (of parent node) might still drop
- * locks when we're not passing it BTREE_INSERT_NOUNLOCK
- *
- * we don't want to pass BTREE_INSERT_NOUNLOCK to split as that
- * will inhibit merging - but we don't have a reliable way yet
- * (do we?) of checking if we dropped locks in this path
- */
+ ret = bch2_btree_split_leaf(trans, i->path, trans->flags);
if (!ret)
- goto retry;
-#endif
+ return 0;
- /*
- * don't care if we got ENOSPC because we told split it
- * couldn't block:
- */
- if (!ret ||
- ret == -EINTR ||
- (flags & BTREE_INSERT_NOUNLOCK)) {
+ if (ret == -EINTR)
trace_trans_restart_btree_node_split(trans->ip, trace_ip,
- i->iter->btree_id,
- &i->iter->real_pos);
- ret = -EINTR;
- }
- break;
- case BTREE_INSERT_ENOSPC:
- BUG_ON(flags & BTREE_INSERT_NOFAIL);
- ret = -ENOSPC;
+ i->btree_id, &i->path->pos);
break;
case BTREE_INSERT_NEED_MARK_REPLICAS:
bch2_trans_unlock(trans);
ret = bch2_replicas_delta_list_mark(c, trans->fs_usage_deltas);
if (ret)
- return ret;
+ break;
if (bch2_trans_relock(trans))
return 0;
@@ -717,12 +750,15 @@ int bch2_trans_commit_error(struct btree_trans *trans,
bch2_trans_unlock(trans);
if ((trans->flags & BTREE_INSERT_JOURNAL_RECLAIM) &&
- !(trans->flags & BTREE_INSERT_JOURNAL_RESERVED))
- return -EAGAIN;
+ !(trans->flags & BTREE_INSERT_JOURNAL_RESERVED)) {
+ trans->restarted = true;
+ ret = -EAGAIN;
+ break;
+ }
ret = bch2_trans_journal_res_get(trans, JOURNAL_RES_GET_CHECK);
if (ret)
- return ret;
+ break;
if (bch2_trans_relock(trans))
return 0;
@@ -738,7 +774,7 @@ int bch2_trans_commit_error(struct btree_trans *trans,
wait_event_freezable(c->journal.reclaim_wait,
(ret = journal_reclaim_wait_done(c)));
if (ret < 0)
- return ret;
+ break;
if (bch2_trans_relock(trans))
return 0;
@@ -751,7 +787,8 @@ int bch2_trans_commit_error(struct btree_trans *trans,
break;
}
- BUG_ON(ret == -ENOSPC && (flags & BTREE_INSERT_NOFAIL));
+ BUG_ON((ret == -EINTR || ret == -EAGAIN) && !trans->restarted);
+ BUG_ON(ret == -ENOSPC && (trans->flags & BTREE_INSERT_NOFAIL));
return ret;
}
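
Most arms of bch2_trans_commit_error() follow the same shape: unlock the transaction, do the blocking work (mark replicas, get a journal reservation, wait for reclaim), then relock; a successful relock returns 0 so the commit is retried, otherwise -EINTR propagates and the whole transaction restarts. A minimal sketch of that unlock/fixup/relock/retry shape, where do_commit(), do_blocking_fixup() and relock() are hypothetical placeholders rather than bcachefs functions:

	/* Hypothetical illustration, not bcachefs code: unlock -> blocking fixup -> relock -> retry. */
	#include <stdio.h>
	#include <errno.h>

	static int do_commit(int attempt)	{ return attempt < 2 ? -ENOSPC : 0; }
	static int do_blocking_fixup(void)	{ return 0; }	/* e.g. wait for journal reclaim */
	static int relock(void)			{ return 1; }	/* 0 would mean "restart transaction" */

	static int commit_with_retry(void)
	{
		int attempt = 0, ret;

		while ((ret = do_commit(attempt++))) {
			/* unlock() would go here */
			ret = do_blocking_fixup();
			if (ret)
				return ret;

			if (!relock())
				return -EINTR;	/* lost our locks: caller restarts from scratch */
		}
		return 0;
	}

	int main(void)
	{
		printf("commit: %d\n", commit_with_retry());
		return 0;
	}
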
@@ -771,150 +808,124 @@ bch2_trans_commit_get_rw_cold(struct btree_trans *trans)
if (ret)
return ret;
- percpu_ref_get(&c->writes);
- return 0;
-}
-
-static void __bch2_trans_update2(struct btree_trans *trans,
- struct btree_insert_entry n)
-{
- struct btree_insert_entry *i;
-
- btree_insert_entry_checks(trans, &n);
-
- EBUG_ON(trans->nr_updates2 >= BTREE_ITER_MAX);
-
- n.iter->flags |= BTREE_ITER_KEEP_UNTIL_COMMIT;
-
- trans_for_each_update2(trans, i)
- if (btree_insert_entry_cmp(&n, i) <= 0)
- break;
-
- if (i < trans->updates2 + trans->nr_updates2 &&
- !btree_insert_entry_cmp(&n, i))
- *i = n;
- else
- array_insert_item(trans->updates2, trans->nr_updates2,
- i - trans->updates2, n);
-}
-
-static void bch2_trans_update2(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bkey_i *insert)
-{
- __bch2_trans_update2(trans, (struct btree_insert_entry) {
- .bkey_type = __btree_node_type(iter->level, iter->btree_id),
- .btree_id = iter->btree_id,
- .level = iter->level,
- .iter = iter,
- .k = insert,
- });
-}
-
-static int extent_update_to_keys(struct btree_trans *trans,
- struct btree_insert_entry n)
-{
- int ret;
-
- ret = bch2_extent_can_insert(trans, n.iter, n.k);
- if (ret)
- return ret;
-
- if (bkey_deleted(&n.k->k))
- return 0;
-
- n.iter = bch2_trans_get_iter(trans, n.iter->btree_id, n.k->k.p,
- BTREE_ITER_INTENT|
- BTREE_ITER_NOT_EXTENTS);
- n.is_extent = false;
+ if (!bch2_trans_relock(trans))
+ return -EINTR;
- __bch2_trans_update2(trans, n);
- bch2_trans_iter_put(trans, n.iter);
+ percpu_ref_get(&c->writes);
return 0;
}
-static int extent_handle_overwrites(struct btree_trans *trans,
- enum btree_id btree_id,
- struct bkey_i *insert)
+static int bch2_trans_commit_run_triggers(struct btree_trans *trans)
{
- struct btree_iter *iter, *update_iter;
- struct bpos start = bkey_start_pos(&insert->k);
- struct bkey_i *update;
- struct bkey_s_c k;
+ struct bkey _deleted = KEY(0, 0, 0);
+ struct bkey_s_c deleted = (struct bkey_s_c) { &_deleted, NULL };
+ struct bkey_s_c old;
+ struct bkey unpacked;
+ struct btree_insert_entry *i = NULL, *btree_id_start = trans->updates;
+ bool trans_trigger_run;
+ unsigned btree_id = 0;
int ret = 0;
- iter = bch2_trans_get_iter(trans, btree_id, start,
- BTREE_ITER_INTENT);
- k = bch2_btree_iter_peek_with_updates(iter);
+ /*
+ * For a given btree, this algorithm runs insert triggers before
+ * overwrite triggers: this is so that when extents are being moved
+ * (e.g. by FALLOCATE_FL_INSERT_RANGE), we don't drop references before
+ * they are re-added.
+ */
+ for (btree_id = 0; btree_id < BTREE_ID_NR; btree_id++) {
+ while (btree_id_start < trans->updates + trans->nr_updates &&
+ btree_id_start->btree_id < btree_id)
+ btree_id_start++;
- while (k.k && !(ret = bkey_err(k))) {
- if (bkey_cmp(insert->k.p, bkey_start_pos(k.k)) <= 0)
- break;
+ /*
+ * Running triggers will append more updates to the list of updates as
+ * we're walking it:
+ */
+ do {
+ trans_trigger_run = false;
- if (bkey_cmp(bkey_start_pos(k.k), start) < 0) {
- update = bch2_trans_kmalloc(trans, bkey_bytes(k.k));
- if ((ret = PTR_ERR_OR_ZERO(update)))
- break;
+ for (i = btree_id_start;
+ i < trans->updates + trans->nr_updates && i->btree_id <= btree_id;
+ i++) {
+ if (i->insert_trigger_run ||
+ (i->flags & BTREE_TRIGGER_NORUN) ||
+ !(BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS & (1U << i->bkey_type)))
+ continue;
- bkey_reassemble(update, k);
+ BUG_ON(i->overwrite_trigger_run);
- bch2_cut_back(start, update);
+ i->insert_trigger_run = true;
+ trans_trigger_run = true;
- update_iter = bch2_trans_get_iter(trans, btree_id, update->k.p,
- BTREE_ITER_NOT_EXTENTS|
- BTREE_ITER_INTENT);
- bch2_trans_update2(trans, update_iter, update);
- bch2_trans_iter_put(trans, update_iter);
- }
+ old = bch2_btree_path_peek_slot(i->path, &unpacked);
+ _deleted.p = i->path->pos;
+
+ if (old.k->type == i->k->k.type &&
+ ((1U << old.k->type) & BTREE_TRIGGER_WANTS_OLD_AND_NEW)) {
+ i->overwrite_trigger_run = true;
+ ret = bch2_trans_mark_key(trans, old, bkey_i_to_s_c(i->k),
+ BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE|i->flags);
+ } else {
+ ret = bch2_trans_mark_key(trans, deleted, bkey_i_to_s_c(i->k),
+ BTREE_TRIGGER_INSERT|i->flags);
+ }
- if (bkey_cmp(k.k->p, insert->k.p) < 0 ||
- (!bkey_cmp(k.k->p, insert->k.p) && bkey_deleted(&insert->k))) {
- update = bch2_trans_kmalloc(trans, sizeof(struct bkey));
- if ((ret = PTR_ERR_OR_ZERO(update)))
- break;
+ if (ret == -EINTR)
+ trace_trans_restart_mark(trans->ip, _RET_IP_,
+ i->btree_id, &i->path->pos);
+ if (ret)
+ return ret;
+ }
+ } while (trans_trigger_run);
- bkey_init(&update->k);
- update->k.p = k.k->p;
+ do {
+ trans_trigger_run = false;
- update_iter = bch2_trans_get_iter(trans, btree_id, update->k.p,
- BTREE_ITER_NOT_EXTENTS|
- BTREE_ITER_INTENT);
- bch2_trans_update2(trans, update_iter, update);
- bch2_trans_iter_put(trans, update_iter);
- }
+ for (i = btree_id_start;
+ i < trans->updates + trans->nr_updates && i->btree_id <= btree_id;
+ i++) {
+ if (i->overwrite_trigger_run ||
+ (i->flags & BTREE_TRIGGER_NORUN) ||
+ !(BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS & (1U << i->bkey_type)))
+ continue;
- if (bkey_cmp(k.k->p, insert->k.p) > 0) {
- update = bch2_trans_kmalloc(trans, bkey_bytes(k.k));
- if ((ret = PTR_ERR_OR_ZERO(update)))
- break;
+ BUG_ON(!i->insert_trigger_run);
- bkey_reassemble(update, k);
- bch2_cut_front(insert->k.p, update);
+ i->overwrite_trigger_run = true;
+ trans_trigger_run = true;
- update_iter = bch2_trans_get_iter(trans, btree_id, update->k.p,
- BTREE_ITER_NOT_EXTENTS|
- BTREE_ITER_INTENT);
- bch2_trans_update2(trans, update_iter, update);
- bch2_trans_iter_put(trans, update_iter);
- break;
- }
+ old = bch2_btree_path_peek_slot(i->path, &unpacked);
+ _deleted.p = i->path->pos;
- k = bch2_btree_iter_next_with_updates(iter);
+ ret = bch2_trans_mark_key(trans, old, deleted,
+ BTREE_TRIGGER_OVERWRITE|i->flags);
+
+ if (ret == -EINTR)
+ trace_trans_restart_mark(trans->ip, _RET_IP_,
+ i->btree_id, &i->path->pos);
+ if (ret)
+ return ret;
+ }
+ } while (trans_trigger_run);
}
- bch2_trans_iter_put(trans, iter);
- return ret;
+ trans_for_each_update(trans, i)
+ BUG_ON(!(i->flags & BTREE_TRIGGER_NORUN) &&
+ (BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS & (1U << i->bkey_type)) &&
+ (!i->insert_trigger_run || !i->overwrite_trigger_run));
+
+ return 0;
}
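
bch2_trans_commit_run_triggers() handles each btree id with two run-until-stable passes: insert triggers first (they may append further updates, hence the fixed-point loop), then overwrite triggers, so references added by a move exist before the old ones are dropped. A compact sketch of the two-phase, run-to-fixed-point structure over a flag-carrying array, with hypothetical update and trigger functions rather than the btree_insert_entry machinery:

	/* Hypothetical illustration, not bcachefs code: two-phase triggers run to a fixed point. */
	#include <stdio.h>
	#include <stdbool.h>

	struct update { int id; bool insert_run; bool overwrite_run; };

	static void run_insert_trigger(struct update *u)    { printf("insert trigger %d\n", u->id); }
	static void run_overwrite_trigger(struct update *u) { printf("overwrite trigger %d\n", u->id); }

	static void run_triggers(struct update *u, unsigned nr)
	{
		bool again;

		/* Phase 1: insert triggers (may conceptually append more updates). */
		do {
			again = false;
			for (unsigned i = 0; i < nr; i++)
				if (!u[i].insert_run) {
					u[i].insert_run = true;
					again = true;
					run_insert_trigger(&u[i]);
				}
		} while (again);

		/* Phase 2: overwrite triggers, only after every insert trigger ran. */
		do {
			again = false;
			for (unsigned i = 0; i < nr; i++)
				if (!u[i].overwrite_run) {
					u[i].overwrite_run = true;
					again = true;
					run_overwrite_trigger(&u[i]);
				}
		} while (again);
	}

	int main(void)
	{
		struct update u[] = { { .id = 1 }, { .id = 2 } };

		run_triggers(u, 2);
		return 0;
	}
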
int __bch2_trans_commit(struct btree_trans *trans)
{
struct btree_insert_entry *i = NULL;
- struct btree_iter *iter;
- bool trans_trigger_run;
- unsigned u64s, reset_flags = 0;
+ unsigned u64s;
int ret = 0;
- if (!trans->nr_updates)
+ if (!trans->nr_updates &&
+ !trans->extra_journal_entry_u64s)
goto out_reset;
if (trans->flags & BTREE_INSERT_GC_LOCK_HELD)
@@ -933,106 +944,80 @@ int __bch2_trans_commit(struct btree_trans *trans)
}
#ifdef CONFIG_BCACHEFS_DEBUG
+ /*
+ * if BTREE_TRIGGER_NORUN is set, it means we're probably being called
+ * from the key cache flush code:
+ */
trans_for_each_update(trans, i)
- if (btree_iter_type(i->iter) != BTREE_ITER_CACHED &&
- !(i->trigger_flags & BTREE_TRIGGER_NORUN))
+ if (!i->cached &&
+ !(i->flags & BTREE_TRIGGER_NORUN))
bch2_btree_key_cache_verify_clean(trans,
i->btree_id, i->k->k.p);
#endif
- /*
- * Running triggers will append more updates to the list of updates as
- * we're walking it:
- */
- do {
- trans_trigger_run = false;
-
- trans_for_each_update(trans, i) {
- if ((BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS & (1U << i->bkey_type)) &&
- !i->trans_triggers_run) {
- i->trans_triggers_run = true;
- trans_trigger_run = true;
-
- ret = bch2_trans_mark_update(trans, i->iter, i->k,
- i->trigger_flags);
- if (unlikely(ret)) {
- if (ret == -EINTR)
- trace_trans_restart_mark(trans->ip, _RET_IP_,
- i->iter->btree_id,
- &i->iter->pos);
- goto out;
- }
- }
- }
- } while (trans_trigger_run);
-
- /* Turn extents updates into keys: */
- trans_for_each_update(trans, i)
- if (i->is_extent) {
- ret = extent_handle_overwrites(trans, i->btree_id, i->k);
- if (unlikely(ret))
- goto out;
- }
+ ret = bch2_trans_commit_run_triggers(trans);
+ if (ret)
+ goto out;
trans_for_each_update(trans, i) {
- ret = i->is_extent
- ? extent_update_to_keys(trans, *i)
- : (__bch2_trans_update2(trans, *i), 0);
- if (unlikely(ret))
- goto out;
- }
-
- trans_for_each_update2(trans, i) {
- ret = bch2_btree_iter_traverse(i->iter);
- if (unlikely(ret)) {
- trace_trans_restart_traverse(trans->ip, _RET_IP_,
- i->iter->btree_id,
- &i->iter->pos);
- goto out;
- }
+ BUG_ON(!i->path->should_be_locked);
- if (unlikely(!bch2_btree_iter_upgrade(i->iter, i->level + 1))) {
+ if (unlikely(!bch2_btree_path_upgrade(trans, i->path, i->level + 1))) {
trace_trans_restart_upgrade(trans->ip, _RET_IP_,
- i->iter->btree_id,
- &i->iter->pos);
- ret = -EINTR;
+ i->btree_id, &i->path->pos);
+ ret = btree_trans_restart(trans);
goto out;
}
- BUG_ON(!btree_node_intent_locked(i->iter, i->level));
+ BUG_ON(!btree_node_intent_locked(i->path, i->level));
u64s = jset_u64s(i->k->k.u64s);
- if (btree_iter_type(i->iter) == BTREE_ITER_CACHED &&
+ if (i->cached &&
likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY)))
trans->journal_preres_u64s += u64s;
trans->journal_u64s += u64s;
}
+
+ if (trans->extra_journal_res) {
+ ret = bch2_disk_reservation_add(trans->c, trans->disk_res,
+ trans->extra_journal_res,
+ (trans->flags & BTREE_INSERT_NOFAIL)
+ ? BCH_DISK_RESERVATION_NOFAIL : 0);
+ if (ret)
+ goto err;
+ }
retry:
+ BUG_ON(trans->restarted);
memset(&trans->journal_res, 0, sizeof(trans->journal_res));
ret = do_bch2_trans_commit(trans, &i, _RET_IP_);
/* make sure we didn't drop or screw up locks: */
- bch2_btree_trans_verify_locks(trans);
+ bch2_trans_verify_locks(trans);
if (ret)
goto err;
-
- trans_for_each_iter(trans, iter)
- if (btree_iter_live(trans, iter) &&
- (iter->flags & BTREE_ITER_SET_POS_AFTER_COMMIT))
- bch2_btree_iter_set_pos(iter, iter->pos_after_commit);
out:
bch2_journal_preres_put(&trans->c->journal, &trans->journal_preres);
if (likely(!(trans->flags & BTREE_INSERT_NOCHECK_RW)))
percpu_ref_put(&trans->c->writes);
out_reset:
- if (!ret)
- reset_flags |= TRANS_RESET_NOTRAVERSE;
- if (!ret && (trans->flags & BTREE_INSERT_NOUNLOCK))
- reset_flags |= TRANS_RESET_NOUNLOCK;
- bch2_trans_reset(trans, reset_flags);
+ trans_for_each_update(trans, i)
+ bch2_path_put(trans, i->path, true);
+
+ trans->extra_journal_res = 0;
+ trans->nr_updates = 0;
+ trans->hooks = NULL;
+ trans->extra_journal_entries = NULL;
+ trans->extra_journal_entry_u64s = 0;
+
+ if (trans->fs_usage_deltas) {
+ trans->fs_usage_deltas->used = 0;
+ memset(&trans->fs_usage_deltas->memset_start, 0,
+ (void *) &trans->fs_usage_deltas->memset_end -
+ (void *) &trans->fs_usage_deltas->memset_start);
+ }
return ret;
err:
@@ -1043,125 +1028,363 @@ err:
goto retry;
}
-int bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter,
- struct bkey_i *k, enum btree_trigger_flags flags)
+static int check_pos_snapshot_overwritten(struct btree_trans *trans,
+ enum btree_id id,
+ struct bpos pos)
{
- struct btree_insert_entry *i, n = (struct btree_insert_entry) {
- .trigger_flags = flags,
- .bkey_type = __btree_node_type(iter->level, iter->btree_id),
- .btree_id = iter->btree_id,
- .level = iter->level,
- .is_extent = (iter->flags & BTREE_ITER_IS_EXTENTS) != 0,
- .iter = iter,
- .k = k
- };
+ struct bch_fs *c = trans->c;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ int ret;
- BUG_ON(trans->nr_updates >= BTREE_ITER_MAX);
+ if (!snapshot_t(c, pos.snapshot)->children[0])
+ return 0;
-#ifdef CONFIG_BCACHEFS_DEBUG
- BUG_ON(bkey_cmp(iter->pos,
- n.is_extent ? bkey_start_pos(&k->k) : k->k.p));
+ bch2_trans_iter_init(trans, &iter, id, pos,
+ BTREE_ITER_NOT_EXTENTS|
+ BTREE_ITER_ALL_SNAPSHOTS);
+ while (1) {
+ k = bch2_btree_iter_prev(&iter);
+ ret = bkey_err(k);
+ if (ret)
+ break;
- trans_for_each_update(trans, i) {
- BUG_ON(bkey_cmp(i->iter->pos,
- i->is_extent ? bkey_start_pos(&i->k->k) : i->k->k.p));
+ if (!k.k)
+ break;
- BUG_ON(i != trans->updates &&
- btree_insert_entry_cmp(i - 1, i) >= 0);
+ if (bkey_cmp(pos, k.k->p))
+ break;
+
+ if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, pos.snapshot)) {
+ ret = 1;
+ break;
+ }
}
-#endif
+ bch2_trans_iter_exit(trans, &iter);
+
+ return ret;
+}
+
+static int bch2_trans_update_extent(struct btree_trans *trans,
+ struct btree_iter *orig_iter,
+ struct bkey_i *insert,
+ enum btree_update_flags flags)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_iter iter, update_iter;
+ struct bpos start = bkey_start_pos(&insert->k);
+ struct bkey_i *update;
+ struct bkey_s_c k;
+ enum btree_id btree_id = orig_iter->btree_id;
+ int ret = 0, compressed_sectors;
+
+ bch2_trans_iter_init(trans, &iter, btree_id, start,
+ BTREE_ITER_INTENT|
+ BTREE_ITER_WITH_UPDATES|
+ BTREE_ITER_NOT_EXTENTS);
+ k = bch2_btree_iter_peek(&iter);
+ if ((ret = bkey_err(k)))
+ goto err;
+ if (!k.k)
+ goto out;
+
+ if (bch2_bkey_maybe_mergable(k.k, &insert->k)) {
+ /*
+ * We can't merge extents if they belong to interior snapshot
+ * tree nodes, and there's a snapshot in which one extent is
+ * visible and the other is not - i.e. if visibility is
+ * different.
+ *
+ * Instead of checking if visibility of the two extents is
+ * different, for now we just check if either has been
+ * overwritten:
+ */
+ ret = check_pos_snapshot_overwritten(trans, btree_id, insert->k.p);
+ if (ret < 0)
+ goto err;
+ if (ret)
+ goto nomerge1;
+
+ ret = check_pos_snapshot_overwritten(trans, btree_id, k.k->p);
+ if (ret < 0)
+ goto err;
+ if (ret)
+ goto nomerge1;
+
+ update = bch2_trans_kmalloc(trans, bkey_bytes(k.k));
+ if ((ret = PTR_ERR_OR_ZERO(update)))
+ goto err;
- iter->flags |= BTREE_ITER_KEEP_UNTIL_COMMIT;
+ bkey_reassemble(update, k);
- if (n.is_extent) {
- iter->pos_after_commit = k->k.p;
- iter->flags |= BTREE_ITER_SET_POS_AFTER_COMMIT;
+ if (bch2_bkey_merge(c, bkey_i_to_s(update), bkey_i_to_s_c(insert))) {
+ ret = bch2_btree_delete_at(trans, &iter, flags);
+ if (ret)
+ goto err;
+
+ insert = update;
+ goto next;
+ }
}
+nomerge1:
+ ret = 0;
+ if (!bkey_cmp(k.k->p, start))
+ goto next;
- /*
- * Pending updates are kept sorted: first, find position of new update,
- * then delete/trim any updates the new update overwrites:
- */
- if (!n.is_extent) {
- trans_for_each_update(trans, i)
- if (btree_insert_entry_cmp(&n, i) <= 0)
- break;
+ while (bkey_cmp(insert->k.p, bkey_start_pos(k.k)) > 0) {
+ bool front_split = bkey_cmp(bkey_start_pos(k.k), start) < 0;
+ bool back_split = bkey_cmp(k.k->p, insert->k.p) > 0;
- if (i < trans->updates + trans->nr_updates &&
- !btree_insert_entry_cmp(&n, i))
- *i = n;
- else
- array_insert_item(trans->updates, trans->nr_updates,
- i - trans->updates, n);
- } else {
- trans_for_each_update(trans, i)
- if (btree_insert_entry_cmp(&n, i) < 0)
- break;
+ /*
+ * If we're going to be splitting a compressed extent, note it
+ * so that __bch2_trans_commit() can increase our disk
+ * reservation:
+ */
+ if (((front_split && back_split) ||
+ ((front_split || back_split) && k.k->p.snapshot != insert->k.p.snapshot)) &&
+ (compressed_sectors = bch2_bkey_sectors_compressed(k)))
+ trans->extra_journal_res += compressed_sectors;
- while (i > trans->updates &&
- i[-1].btree_id == n.btree_id &&
- bkey_cmp(bkey_start_pos(&n.k->k),
- bkey_start_pos(&i[-1].k->k)) <= 0) {
- --i;
- array_remove_item(trans->updates, trans->nr_updates,
- i - trans->updates);
+ if (front_split) {
+ update = bch2_trans_kmalloc(trans, bkey_bytes(k.k));
+ if ((ret = PTR_ERR_OR_ZERO(update)))
+ goto err;
+
+ bkey_reassemble(update, k);
+
+ bch2_cut_back(start, update);
+
+ bch2_trans_iter_init(trans, &update_iter, btree_id, update->k.p,
+ BTREE_ITER_NOT_EXTENTS|
+ BTREE_ITER_ALL_SNAPSHOTS|
+ BTREE_ITER_INTENT);
+ ret = bch2_btree_iter_traverse(&update_iter) ?:
+ bch2_trans_update(trans, &update_iter, update,
+ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|
+ flags);
+ bch2_trans_iter_exit(trans, &update_iter);
+
+ if (ret)
+ goto err;
}
- if (i > trans->updates &&
- i[-1].btree_id == n.btree_id &&
- bkey_cmp(bkey_start_pos(&n.k->k), i[-1].k->k.p) < 0)
- bch2_cut_back(bkey_start_pos(&n.k->k), i[-1].k);
+ if (k.k->p.snapshot != insert->k.p.snapshot &&
+ (front_split || back_split)) {
+ update = bch2_trans_kmalloc(trans, bkey_bytes(k.k));
+ if ((ret = PTR_ERR_OR_ZERO(update)))
+ goto err;
- if (i < trans->updates + trans->nr_updates &&
- i->btree_id == n.btree_id &&
- bkey_cmp(n.k->k.p, bkey_start_pos(&i->k->k)) > 0) {
- if (bkey_cmp(bkey_start_pos(&n.k->k),
- bkey_start_pos(&i->k->k)) > 0) {
- struct btree_insert_entry split = *i;
- int ret;
+ bkey_reassemble(update, k);
- BUG_ON(trans->nr_updates + 1 >= BTREE_ITER_MAX);
+ bch2_cut_front(start, update);
+ bch2_cut_back(insert->k.p, update);
+
+ bch2_trans_iter_init(trans, &update_iter, btree_id, update->k.p,
+ BTREE_ITER_NOT_EXTENTS|
+ BTREE_ITER_ALL_SNAPSHOTS|
+ BTREE_ITER_INTENT);
+ ret = bch2_btree_iter_traverse(&update_iter) ?:
+ bch2_trans_update(trans, &update_iter, update,
+ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|
+ flags);
+ bch2_trans_iter_exit(trans, &update_iter);
+ if (ret)
+ goto err;
+ }
- split.k = bch2_trans_kmalloc(trans, bkey_bytes(&i->k->k));
- ret = PTR_ERR_OR_ZERO(split.k);
- if (ret)
- return ret;
+ if (bkey_cmp(k.k->p, insert->k.p) <= 0) {
+ update = bch2_trans_kmalloc(trans, sizeof(*update));
+ if ((ret = PTR_ERR_OR_ZERO(update)))
+ goto err;
- bkey_copy(split.k, i->k);
- bch2_cut_back(bkey_start_pos(&n.k->k), split.k);
-
- split.iter = bch2_trans_get_iter(trans, split.btree_id,
- bkey_start_pos(&split.k->k),
- BTREE_ITER_INTENT);
- split.iter->flags |= BTREE_ITER_KEEP_UNTIL_COMMIT;
- bch2_trans_iter_put(trans, split.iter);
- array_insert_item(trans->updates, trans->nr_updates,
- i - trans->updates, split);
- i++;
- }
+ bkey_init(&update->k);
+ update->k.p = k.k->p;
- /*
- * When we have an extent that overwrites the start of another
- * update, trimming that extent will mean the iterator's
- * position has to change since the iterator position has to
- * match the extent's start pos - but we don't want to change
- * the iterator pos if some other code is using it, so we may
- * need to clone it:
- */
- if (btree_iter_live(trans, i->iter)) {
- i->iter = bch2_trans_copy_iter(trans, i->iter);
-
- i->iter->flags |= BTREE_ITER_KEEP_UNTIL_COMMIT;
- bch2_trans_iter_put(trans, i->iter);
+ if (insert->k.p.snapshot != k.k->p.snapshot) {
+ update->k.p.snapshot = insert->k.p.snapshot;
+ update->k.type = KEY_TYPE_whiteout;
}
- bch2_cut_front(n.k->k.p, i->k);
- bch2_btree_iter_set_pos(i->iter, n.k->k.p);
+ bch2_trans_iter_init(trans, &update_iter, btree_id, update->k.p,
+ BTREE_ITER_NOT_EXTENTS|
+ BTREE_ITER_INTENT);
+ ret = bch2_btree_iter_traverse(&update_iter) ?:
+ bch2_trans_update(trans, &update_iter, update,
+ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|
+ flags);
+ bch2_trans_iter_exit(trans, &update_iter);
+
+ if (ret)
+ goto err;
+ }
+
+ if (back_split) {
+ update = bch2_trans_kmalloc(trans, bkey_bytes(k.k));
+ if ((ret = PTR_ERR_OR_ZERO(update)))
+ goto err;
+
+ bkey_reassemble(update, k);
+ bch2_cut_front(insert->k.p, update);
+
+ bch2_trans_copy_iter(&update_iter, &iter);
+ update_iter.pos = update->k.p;
+ ret = bch2_trans_update(trans, &update_iter, update,
+ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|
+ flags);
+ bch2_trans_iter_exit(trans, &update_iter);
+
+ if (ret)
+ goto err;
+ goto out;
}
+next:
+ k = bch2_btree_iter_next(&iter);
+ if ((ret = bkey_err(k)))
+ goto err;
+ if (!k.k)
+ goto out;
+ }
+
+ if (bch2_bkey_maybe_mergable(&insert->k, k.k)) {
+ ret = check_pos_snapshot_overwritten(trans, btree_id, insert->k.p);
+ if (ret < 0)
+ goto err;
+ if (ret)
+ goto nomerge2;
+
+ ret = check_pos_snapshot_overwritten(trans, btree_id, k.k->p);
+ if (ret < 0)
+ goto err;
+ if (ret)
+ goto nomerge2;
+
+ bch2_bkey_merge(c, bkey_i_to_s(insert), k);
+ }
+nomerge2:
+ ret = 0;
+out:
+ if (!bkey_deleted(&insert->k)) {
+ /*
+ * Rewinding iterators is expensive: instead of rewinding, get a new
+ * iterator at insert's position; its path will be cloned from the one
+ * that points to the start of insert:
+ */
+ bch2_trans_iter_exit(trans, &iter);
+ bch2_trans_iter_init(trans, &iter, btree_id, insert->k.p,
+ BTREE_ITER_NOT_EXTENTS|
+ BTREE_ITER_INTENT);
+ ret = bch2_btree_iter_traverse(&iter) ?:
+ bch2_trans_update(trans, &iter, insert, flags);
+ }
+err:
+ bch2_trans_iter_exit(trans, &iter);
+
+ return ret;
+}
+
+/*
+ * When deleting, check if we need to emit a whiteout (because we're overwriting
+ * something in an ancestor snapshot)
+ */
+static int need_whiteout_for_snapshot(struct btree_trans *trans,
+ enum btree_id btree_id, struct bpos pos)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ u32 snapshot = pos.snapshot;
+ int ret;
+
+ if (!bch2_snapshot_parent(trans->c, pos.snapshot))
+ return 0;
+
+ pos.snapshot++;
+ for_each_btree_key_norestart(trans, iter, btree_id, pos,
+ BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
+ if (bkey_cmp(k.k->p, pos))
+ break;
+
+ if (bch2_snapshot_is_ancestor(trans->c, snapshot,
+ k.k->p.snapshot)) {
+ ret = !bkey_whiteout(k.k);
+ break;
+ }
+ }
+ bch2_trans_iter_exit(trans, &iter);
+
+ return ret;
+}
+
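As a hypothetical illustration of why the whiteout is needed: if snapshot S is a child of snapshot A and both contain a key at position P, deleting P in S by simply dropping the key would let lookups in S fall through to A's version. bch2_trans_update() below therefore converts such deletes into whiteouts, roughly (error handling omitted):

	if (bkey_deleted(&k->k) &&
	    need_whiteout_for_snapshot(trans, btree_id, k->k.p))
		k->k.type = KEY_TYPE_whiteout;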
+int bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter,
+ struct bkey_i *k, enum btree_update_flags flags)
+{
+ struct btree_insert_entry *i, n;
+
+ BUG_ON(!iter->path->should_be_locked);
+
+ if (iter->flags & BTREE_ITER_IS_EXTENTS)
+ return bch2_trans_update_extent(trans, iter, k, flags);
+
+ BUG_ON(trans->nr_updates >= BTREE_ITER_MAX);
+ BUG_ON(bpos_cmp(k->k.p, iter->path->pos));
+
+ n = (struct btree_insert_entry) {
+ .flags = flags,
+ .bkey_type = __btree_node_type(iter->path->level, iter->btree_id),
+ .btree_id = iter->btree_id,
+ .level = iter->path->level,
+ .cached = iter->flags & BTREE_ITER_CACHED,
+ .path = iter->path,
+ .k = k,
+ .ip_allocated = _RET_IP_,
+ };
+
+ __btree_path_get(n.path, true);
+
+#ifdef CONFIG_BCACHEFS_DEBUG
+ trans_for_each_update(trans, i)
+ BUG_ON(i != trans->updates &&
+ btree_insert_entry_cmp(i - 1, i) >= 0);
+#endif
+
+ if (bkey_deleted(&n.k->k) &&
+ (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS)) {
+ int ret = need_whiteout_for_snapshot(trans, n.btree_id, n.k->k.p);
+ if (unlikely(ret < 0))
+ return ret;
+
+ if (ret)
+ n.k->k.type = KEY_TYPE_whiteout;
+ }
+
+ /*
+ * Pending updates are kept sorted: first, find position of new update,
+ * then check whether it replaces an existing update at that position:
+ */
+ trans_for_each_update(trans, i)
+ if (btree_insert_entry_cmp(&n, i) <= 0)
+ break;
+
+ if (i < trans->updates + trans->nr_updates &&
+ !btree_insert_entry_cmp(&n, i)) {
+ BUG_ON(i->insert_trigger_run || i->overwrite_trigger_run);
+
+ /*
+ * This is a hack to ensure that inode creates update the btree,
+ * not the key cache, which helps with cache coherency issues in
+ * other areas:
+ */
+ if (n.cached && !i->cached) {
+ i->k = n.k;
+ i->flags = n.flags;
+
+ __btree_path_get(n.path, false);
+ } else {
+ bch2_path_put(trans, i->path, true);
+ *i = n;
+ }
+ } else
array_insert_item(trans->updates, trans->nr_updates,
i - trans->updates, n);
- }
return 0;
}
@@ -1176,14 +1399,14 @@ void bch2_trans_commit_hook(struct btree_trans *trans,
int __bch2_btree_insert(struct btree_trans *trans,
enum btree_id id, struct bkey_i *k)
{
- struct btree_iter *iter;
+ struct btree_iter iter;
int ret;
- iter = bch2_trans_get_iter(trans, id, bkey_start_pos(&k->k),
- BTREE_ITER_INTENT);
-
- ret = bch2_trans_update(trans, iter, k, 0);
- bch2_trans_iter_put(trans, iter);
+ bch2_trans_iter_init(trans, &iter, id, bkey_start_pos(&k->k),
+ BTREE_ITER_INTENT);
+ ret = bch2_btree_iter_traverse(&iter) ?:
+ bch2_trans_update(trans, &iter, k, 0);
+ bch2_trans_iter_exit(trans, &iter);
return ret;
}
@@ -1204,35 +1427,36 @@ int bch2_btree_insert(struct bch_fs *c, enum btree_id id,
}
int bch2_btree_delete_at(struct btree_trans *trans,
- struct btree_iter *iter, unsigned flags)
+ struct btree_iter *iter, unsigned update_flags)
{
- struct bkey_i k;
+ struct bkey_i *k;
- bkey_init(&k.k);
- k.k.p = iter->pos;
+ k = bch2_trans_kmalloc(trans, sizeof(*k));
+ if (IS_ERR(k))
+ return PTR_ERR(k);
- return bch2_trans_update(trans, iter, &k, 0) ?:
- bch2_trans_commit(trans, NULL, NULL,
- BTREE_INSERT_NOFAIL|flags);
+ bkey_init(&k->k);
+ k->k.p = iter->pos;
+ return bch2_trans_update(trans, iter, k, update_flags);
}
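Note that bch2_btree_delete_at() now only queues the deletion; committing is the caller's responsibility. A minimal caller sketch under that assumption (illustrative only, mirroring what bch2_btree_delete_range_trans() does below):

	ret   = bch2_btree_delete_at(trans, &iter, 0) ?:
		bch2_trans_commit(trans, NULL, NULL, BTREE_INSERT_NOFAIL);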
int bch2_btree_delete_range_trans(struct btree_trans *trans, enum btree_id id,
struct bpos start, struct bpos end,
+ unsigned iter_flags,
u64 *journal_seq)
{
- struct btree_iter *iter;
+ struct btree_iter iter;
struct bkey_s_c k;
int ret = 0;
- iter = bch2_trans_get_iter(trans, id, start, BTREE_ITER_INTENT);
+ bch2_trans_iter_init(trans, &iter, id, start, BTREE_ITER_INTENT|iter_flags);
retry:
- while ((k = bch2_btree_iter_peek(iter)).k &&
+ while ((bch2_trans_begin(trans),
+ (k = bch2_btree_iter_peek(&iter)).k) &&
!(ret = bkey_err(k)) &&
- bkey_cmp(iter->pos, end) < 0) {
+ bkey_cmp(iter.pos, end) < 0) {
struct bkey_i delete;
- bch2_trans_begin(trans);
-
bkey_init(&delete.k);
/*
@@ -1249,9 +1473,9 @@ retry:
* (bch2_btree_iter_peek() does guarantee that iter.pos >=
* bkey_start_pos(k.k)).
*/
- delete.k.p = iter->pos;
+ delete.k.p = iter.pos;
- if (btree_node_type_is_extents(iter->btree_id)) {
+ if (btree_node_type_is_extents(id)) {
unsigned max_sectors =
KEY_SIZE_MAX & (~0 << trans->c->block_bits);
@@ -1259,18 +1483,16 @@ retry:
bch2_key_resize(&delete.k, max_sectors);
bch2_cut_back(end, &delete);
- ret = bch2_extent_trim_atomic(&delete, iter);
+ ret = bch2_extent_trim_atomic(trans, &iter, &delete);
if (ret)
break;
}
- ret = bch2_trans_update(trans, iter, &delete, 0) ?:
+ ret = bch2_trans_update(trans, &iter, &delete, 0) ?:
bch2_trans_commit(trans, NULL, journal_seq,
BTREE_INSERT_NOFAIL);
if (ret)
break;
-
- bch2_trans_cond_resched(trans);
}
if (ret == -EINTR) {
@@ -1278,7 +1500,7 @@ retry:
goto retry;
}
- bch2_trans_iter_free(trans, iter);
+ bch2_trans_iter_exit(trans, &iter);
return ret;
}
@@ -1292,5 +1514,5 @@ int bch2_btree_delete_range(struct bch_fs *c, enum btree_id id,
u64 *journal_seq)
{
return bch2_trans_do(c, NULL, journal_seq, 0,
- bch2_btree_delete_range_trans(&trans, id, start, end, journal_seq));
+ bch2_btree_delete_range_trans(&trans, id, start, end, 0, journal_seq));
}
diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c
index 76d15a5dc62f..6fc93b56bcb2 100644
--- a/fs/bcachefs/buckets.c
+++ b/fs/bcachefs/buckets.c
@@ -13,9 +13,12 @@
#include "buckets.h"
#include "ec.h"
#include "error.h"
+#include "inode.h"
#include "movinggc.h"
+#include "recovery.h"
#include "reflink.h"
#include "replicas.h"
+#include "subvolume.h"
#include <linux/preempt.h>
#include <trace/events/bcachefs.h>
@@ -114,6 +117,8 @@ static inline struct bch_dev_usage *dev_usage_ptr(struct bch_dev *ca,
unsigned journal_seq,
bool gc)
{
+ BUG_ON(!gc && !journal_seq);
+
return this_cpu_ptr(gc
? ca->usage_gc
: ca->usage[journal_seq & JOURNAL_BUF_MASK]);
@@ -139,6 +144,8 @@ static inline struct bch_fs_usage *fs_usage_ptr(struct bch_fs *c,
unsigned journal_seq,
bool gc)
{
+ BUG_ON(!gc && !journal_seq);
+
return this_cpu_ptr(gc
? c->usage_gc
: c->usage[journal_seq & JOURNAL_BUF_MASK]);
@@ -351,17 +358,23 @@ static inline void account_bucket(struct bch_fs_usage *fs_usage,
}
static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca,
- struct bch_fs_usage *fs_usage,
struct bucket_mark old, struct bucket_mark new,
u64 journal_seq, bool gc)
{
+ struct bch_fs_usage *fs_usage;
struct bch_dev_usage *u;
+ /*
+ * Hack for bch2_fs_initialize path, where we're first marking sb and
+ * journal non-transactionally:
+ */
+ if (!journal_seq && !test_bit(BCH_FS_INITIALIZED, &c->flags))
+ journal_seq = 1;
+
percpu_rwsem_assert_held(&c->mark_lock);
preempt_disable();
- if (!fs_usage)
- fs_usage = fs_usage_ptr(c, journal_seq, gc);
+ fs_usage = fs_usage_ptr(c, journal_seq, gc);
u = dev_usage_ptr(ca, journal_seq, gc);
if (bucket_type(old))
@@ -390,30 +403,48 @@ static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca,
bch2_wake_allocator(ca);
}
+static inline int __update_replicas(struct bch_fs *c,
+ struct bch_fs_usage *fs_usage,
+ struct bch_replicas_entry *r,
+ s64 sectors)
+{
+ int idx = bch2_replicas_entry_idx(c, r);
+
+ if (idx < 0)
+ return -1;
+
+ fs_usage_data_type_to_base(fs_usage, r->data_type, sectors);
+ fs_usage->replicas[idx] += sectors;
+ return 0;
+}
+
static inline int update_replicas(struct bch_fs *c,
- struct bch_fs_usage *fs_usage,
- struct bch_replicas_entry *r,
- s64 sectors)
+ struct bch_replicas_entry *r, s64 sectors,
+ unsigned journal_seq, bool gc)
{
+ struct bch_fs_usage __percpu *fs_usage;
int idx = bch2_replicas_entry_idx(c, r);
if (idx < 0)
return -1;
+ preempt_disable();
+ fs_usage = fs_usage_ptr(c, journal_seq, gc);
fs_usage_data_type_to_base(fs_usage, r->data_type, sectors);
fs_usage->replicas[idx] += sectors;
+ preempt_enable();
return 0;
}
static inline int update_cached_sectors(struct bch_fs *c,
- struct bch_fs_usage *fs_usage,
- unsigned dev, s64 sectors)
+ unsigned dev, s64 sectors,
+ unsigned journal_seq, bool gc)
{
struct bch_replicas_padded r;
bch2_replicas_entry_cached(&r.e, dev);
- return update_replicas(c, fs_usage, &r.e, sectors);
+ return update_replicas(c, &r.e, sectors, journal_seq, gc);
}
static struct replicas_delta_list *
@@ -505,20 +536,20 @@ void bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca,
BUG_ON(owned_by_allocator == old.owned_by_allocator);
}
-static int bch2_mark_alloc(struct bch_fs *c,
+static int bch2_mark_alloc(struct btree_trans *trans,
struct bkey_s_c old, struct bkey_s_c new,
- struct bch_fs_usage *fs_usage,
- u64 journal_seq, unsigned flags)
+ unsigned flags)
{
bool gc = flags & BTREE_TRIGGER_GC;
+ u64 journal_seq = trans->journal_res.seq;
+ struct bch_fs *c = trans->c;
struct bkey_alloc_unpacked u;
struct bch_dev *ca;
struct bucket *g;
struct bucket_mark old_m, m;
/* We don't do anything for deletions - do we?: */
- if (new.k->type != KEY_TYPE_alloc &&
- new.k->type != KEY_TYPE_alloc_v2)
+ if (!bkey_is_alloc(new.k))
return 0;
/*
@@ -528,6 +559,15 @@ static int bch2_mark_alloc(struct bch_fs *c,
!(flags & BTREE_TRIGGER_BUCKET_INVALIDATE))
return 0;
+ if (flags & BTREE_TRIGGER_INSERT) {
+ struct bch_alloc_v3 *v = (struct bch_alloc_v3 *) new.v;
+
+ BUG_ON(!journal_seq);
+ BUG_ON(new.k->type != KEY_TYPE_alloc_v3);
+
+ v->journal_seq = cpu_to_le64(journal_seq);
+ }
+
ca = bch_dev_bkey_exists(c, new.k->p.inode);
if (new.k->p.offset >= ca->mi.nbuckets)
@@ -549,7 +589,7 @@ static int bch2_mark_alloc(struct bch_fs *c,
}
}));
- bch2_dev_usage_update(c, ca, fs_usage, old_m, m, journal_seq, gc);
+ bch2_dev_usage_update(c, ca, old_m, m, journal_seq, gc);
g->io_time[READ] = u.read_time;
g->io_time[WRITE] = u.write_time;
@@ -565,8 +605,8 @@ static int bch2_mark_alloc(struct bch_fs *c,
if ((flags & BTREE_TRIGGER_BUCKET_INVALIDATE) &&
old_m.cached_sectors) {
- if (update_cached_sectors(c, fs_usage, ca->dev_idx,
- -old_m.cached_sectors)) {
+ if (update_cached_sectors(c, ca->dev_idx, -old_m.cached_sectors,
+ journal_seq, gc)) {
bch2_fs_fatal_error(c, "bch2_mark_alloc(): no replicas entry while updating cached sectors");
return -1;
}
@@ -617,8 +657,7 @@ static int __bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca,
old.dirty_sectors, sectors);
if (c)
- bch2_dev_usage_update(c, ca, fs_usage_ptr(c, 0, gc),
- old, new, 0, gc);
+ bch2_dev_usage_update(c, ca, old, new, 0, gc);
return 0;
}
@@ -637,57 +676,27 @@ void bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca,
if (b >= ca->mi.nbuckets)
return;
- preempt_disable();
-
if (likely(c)) {
do_mark_fn(__bch2_mark_metadata_bucket, c, pos, flags,
ca, b, type, sectors);
} else {
__bch2_mark_metadata_bucket(c, ca, b, type, sectors, 0);
}
-
- preempt_enable();
-}
-
-static s64 disk_sectors_scaled(unsigned n, unsigned d, unsigned sectors)
-{
- return DIV_ROUND_UP(sectors * n, d);
}
-static s64 __ptr_disk_sectors_delta(unsigned old_size,
- unsigned offset, s64 delta,
- unsigned flags,
- unsigned n, unsigned d)
+static s64 ptr_disk_sectors(s64 sectors, struct extent_ptr_decoded p)
{
- BUG_ON(!n || !d);
+ EBUG_ON(sectors < 0);
- if (flags & BTREE_TRIGGER_OVERWRITE_SPLIT) {
- BUG_ON(offset + -delta > old_size);
-
- return -disk_sectors_scaled(n, d, old_size) +
- disk_sectors_scaled(n, d, offset) +
- disk_sectors_scaled(n, d, old_size - offset + delta);
- } else if (flags & BTREE_TRIGGER_OVERWRITE) {
- BUG_ON(offset + -delta > old_size);
-
- return -disk_sectors_scaled(n, d, old_size) +
- disk_sectors_scaled(n, d, old_size + delta);
- } else {
- return disk_sectors_scaled(n, d, delta);
- }
+ return p.crc.compression_type &&
+ p.crc.compression_type != BCH_COMPRESSION_TYPE_incompressible
+ ? DIV_ROUND_UP_ULL(sectors * p.crc.compressed_size,
+ p.crc.uncompressed_size)
+ : sectors;
}
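A quick worked example of the new helper, with made-up numbers: for a pointer whose crc records a 32-sector uncompressed payload stored in 16 compressed sectors, 8 live sectors are charged proportionally, while uncompressed or incompressible pointers keep their live size:

	/* p.crc.compressed_size = 16, p.crc.uncompressed_size = 32 */
	ptr_disk_sectors(8, p);	/* DIV_ROUND_UP(8 * 16, 32) == 4 */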
-static s64 ptr_disk_sectors_delta(struct extent_ptr_decoded p,
- unsigned offset, s64 delta,
- unsigned flags)
-{
- return __ptr_disk_sectors_delta(p.crc.live_size,
- offset, delta, flags,
- p.crc.compressed_size,
- p.crc.uncompressed_size);
-}
-
-static int check_bucket_ref(struct bch_fs *c, struct bkey_s_c k,
+static int check_bucket_ref(struct bch_fs *c,
+ struct bkey_s_c k,
const struct bch_extent_ptr *ptr,
s64 sectors, enum bch_data_type ptr_data_type,
u8 bucket_gen, u8 bucket_data_type,
@@ -761,11 +770,12 @@ static int check_bucket_ref(struct bch_fs *c, struct bkey_s_c k,
return 0;
}
-static int mark_stripe_bucket(struct bch_fs *c, struct bkey_s_c k,
- unsigned ptr_idx,
- struct bch_fs_usage *fs_usage,
- u64 journal_seq, unsigned flags)
+static int mark_stripe_bucket(struct btree_trans *trans,
+ struct bkey_s_c k,
+ unsigned ptr_idx,
+ u64 journal_seq, unsigned flags)
{
+ struct bch_fs *c = trans->c;
const struct bch_stripe *s = bkey_s_c_to_stripe(k).v;
unsigned nr_data = s->nr_blocks - s->nr_redundant;
bool parity = ptr_idx >= nr_data;
@@ -805,11 +815,12 @@ static int mark_stripe_bucket(struct bch_fs *c, struct bkey_s_c k,
g->stripe = k.k->p.offset;
g->stripe_redundancy = s->nr_redundant;
- bch2_dev_usage_update(c, ca, fs_usage, old, new, journal_seq, gc);
+ bch2_dev_usage_update(c, ca, old, new, journal_seq, gc);
return 0;
}
-static int __mark_pointer(struct bch_fs *c, struct bkey_s_c k,
+static int __mark_pointer(struct btree_trans *trans,
+ struct bkey_s_c k,
const struct bch_extent_ptr *ptr,
s64 sectors, enum bch_data_type ptr_data_type,
u8 bucket_gen, u8 *bucket_data_type,
@@ -818,7 +829,7 @@ static int __mark_pointer(struct bch_fs *c, struct bkey_s_c k,
u16 *dst_sectors = !ptr->cached
? dirty_sectors
: cached_sectors;
- int ret = check_bucket_ref(c, k, ptr, sectors, ptr_data_type,
+ int ret = check_bucket_ref(trans->c, k, ptr, sectors, ptr_data_type,
bucket_gen, *bucket_data_type,
*dirty_sectors, *cached_sectors);
@@ -831,13 +842,15 @@ static int __mark_pointer(struct bch_fs *c, struct bkey_s_c k,
return 0;
}
-static int bch2_mark_pointer(struct bch_fs *c, struct bkey_s_c k,
+static int bch2_mark_pointer(struct btree_trans *trans,
+ struct bkey_s_c k,
struct extent_ptr_decoded p,
s64 sectors, enum bch_data_type data_type,
- struct bch_fs_usage *fs_usage,
- u64 journal_seq, unsigned flags)
+ unsigned flags)
{
bool gc = flags & BTREE_TRIGGER_GC;
+ u64 journal_seq = trans->journal_res.seq;
+ struct bch_fs *c = trans->c;
struct bucket_mark old, new;
struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev);
struct bucket *g = PTR_BUCKET(ca, &p.ptr, gc);
@@ -850,7 +863,8 @@ static int bch2_mark_pointer(struct bch_fs *c, struct bkey_s_c k,
new.v.counter = old.v.counter = v;
bucket_data_type = new.data_type;
- ret = __mark_pointer(c, k, &p.ptr, sectors, data_type, new.gen,
+ ret = __mark_pointer(trans, k, &p.ptr, sectors,
+ data_type, new.gen,
&bucket_data_type,
&new.dirty_sectors,
&new.cached_sectors);
@@ -872,20 +886,21 @@ static int bch2_mark_pointer(struct bch_fs *c, struct bkey_s_c k,
old.v.counter,
new.v.counter)) != old.v.counter);
- bch2_dev_usage_update(c, ca, fs_usage, old, new, journal_seq, gc);
+ bch2_dev_usage_update(c, ca, old, new, journal_seq, gc);
BUG_ON(!gc && bucket_became_unavailable(old, new));
return 0;
}
-static int bch2_mark_stripe_ptr(struct bch_fs *c,
+static int bch2_mark_stripe_ptr(struct btree_trans *trans,
struct bch_extent_stripe_ptr p,
enum bch_data_type data_type,
- struct bch_fs_usage *fs_usage,
- s64 sectors, unsigned flags)
+ s64 sectors,
+ unsigned flags)
{
bool gc = flags & BTREE_TRIGGER_GC;
+ struct bch_fs *c = trans->c;
struct bch_replicas_padded r;
struct stripe *m;
unsigned i, blocks_nonempty = 0;
@@ -918,23 +933,29 @@ static int bch2_mark_stripe_ptr(struct bch_fs *c,
spin_unlock(&c->ec_stripes_heap_lock);
r.e.data_type = data_type;
- update_replicas(c, fs_usage, &r.e, sectors);
+ update_replicas(c, &r.e, sectors, trans->journal_res.seq, gc);
return 0;
}
-static int bch2_mark_extent(struct bch_fs *c,
+static int bch2_mark_extent(struct btree_trans *trans,
struct bkey_s_c old, struct bkey_s_c new,
- unsigned offset, s64 sectors,
- enum bch_data_type data_type,
- struct bch_fs_usage *fs_usage,
- unsigned journal_seq, unsigned flags)
+ unsigned flags)
{
- struct bkey_s_c k = flags & BTREE_TRIGGER_INSERT ? new : old;
+ bool gc = flags & BTREE_TRIGGER_GC;
+ u64 journal_seq = trans->journal_res.seq;
+ struct bch_fs *c = trans->c;
+ struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE ? old: new;
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
const union bch_extent_entry *entry;
struct extent_ptr_decoded p;
struct bch_replicas_padded r;
+ enum bch_data_type data_type = bkey_is_btree_ptr(k.k)
+ ? BCH_DATA_btree
+ : BCH_DATA_user;
+ s64 sectors = bkey_is_btree_ptr(k.k)
+ ? c->opts.btree_node_size
+ : k.k->size;
s64 dirty_sectors = 0;
bool stale;
int ret;
@@ -943,15 +964,14 @@ static int bch2_mark_extent(struct bch_fs *c,
r.e.nr_devs = 0;
r.e.nr_required = 1;
- BUG_ON(!sectors);
-
bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
- s64 disk_sectors = data_type == BCH_DATA_btree
- ? sectors
- : ptr_disk_sectors_delta(p, offset, sectors, flags);
+ s64 disk_sectors = ptr_disk_sectors(sectors, p);
+
+ if (flags & BTREE_TRIGGER_OVERWRITE)
+ disk_sectors = -disk_sectors;
- ret = bch2_mark_pointer(c, k, p, disk_sectors, data_type,
- fs_usage, journal_seq, flags);
+ ret = bch2_mark_pointer(trans, k, p, disk_sectors,
+ data_type, flags);
if (ret < 0)
return ret;
@@ -959,8 +979,8 @@ static int bch2_mark_extent(struct bch_fs *c,
if (p.ptr.cached) {
if (!stale)
- if (update_cached_sectors(c, fs_usage, p.ptr.dev,
- disk_sectors)) {
+ if (update_cached_sectors(c, p.ptr.dev, disk_sectors,
+ journal_seq, gc)) {
bch2_fs_fatal_error(c, "bch2_mark_extent(): no replicas entry while updating cached sectors");
return -1;
@@ -969,8 +989,8 @@ static int bch2_mark_extent(struct bch_fs *c,
dirty_sectors += disk_sectors;
r.e.devs[r.e.nr_devs++] = p.ptr.dev;
} else {
- ret = bch2_mark_stripe_ptr(c, p.ec, data_type,
- fs_usage, disk_sectors, flags);
+ ret = bch2_mark_stripe_ptr(trans, p.ec, data_type,
+ disk_sectors, flags);
if (ret)
return ret;
@@ -984,7 +1004,7 @@ static int bch2_mark_extent(struct bch_fs *c,
}
if (r.e.nr_devs) {
- if (update_replicas(c, fs_usage, &r.e, dirty_sectors)) {
+ if (update_replicas(c, &r.e, dirty_sectors, journal_seq, gc)) {
char buf[200];
bch2_bkey_val_to_text(&PBUF(buf), c, k);
@@ -996,12 +1016,13 @@ static int bch2_mark_extent(struct bch_fs *c,
return 0;
}
-static int bch2_mark_stripe(struct bch_fs *c,
+static int bch2_mark_stripe(struct btree_trans *trans,
struct bkey_s_c old, struct bkey_s_c new,
- struct bch_fs_usage *fs_usage,
- u64 journal_seq, unsigned flags)
+ unsigned flags)
{
bool gc = flags & BTREE_TRIGGER_GC;
+ u64 journal_seq = trans->journal_res.seq;
+ struct bch_fs *c = trans->c;
size_t idx = new.k->p.offset;
const struct bch_stripe *old_s = old.k->type == KEY_TYPE_stripe
? bkey_s_c_to_stripe(old).v : NULL;
@@ -1014,8 +1035,13 @@ static int bch2_mark_stripe(struct bch_fs *c,
BUG_ON(gc && old_s);
if (!m || (old_s && !m->alive)) {
- bch_err_ratelimited(c, "error marking nonexistent stripe %zu",
- idx);
+ char buf1[200], buf2[200];
+
+ bch2_bkey_val_to_text(&PBUF(buf1), c, old);
+ bch2_bkey_val_to_text(&PBUF(buf2), c, new);
+ bch_err_ratelimited(c, "error marking nonexistent stripe %zu while marking\n"
+ "old %s\n"
+ "new %s", idx, buf1, buf2);
bch2_inconsistent_error(c);
return -1;
}
@@ -1060,14 +1086,14 @@ static int bch2_mark_stripe(struct bch_fs *c,
m->blocks_nonempty = 0;
for (i = 0; i < new_s->nr_blocks; i++) {
- ret = mark_stripe_bucket(c, new, i, fs_usage,
- journal_seq, flags);
+ ret = mark_stripe_bucket(trans, new, i, journal_seq, flags);
if (ret)
return ret;
}
- if (update_replicas(c, fs_usage, &m->r.e,
- ((s64) m->sectors * m->nr_redundant))) {
+ if (update_replicas(c, &m->r.e,
+ ((s64) m->sectors * m->nr_redundant),
+ journal_seq, gc)) {
char buf[200];
bch2_bkey_val_to_text(&PBUF(buf), c, new);
@@ -1079,98 +1105,123 @@ static int bch2_mark_stripe(struct bch_fs *c,
return 0;
}
-static int __reflink_p_frag_references(struct bkey_s_c_reflink_p p,
- u64 p_start, u64 p_end,
- u64 v_start, u64 v_end)
+static int bch2_mark_inode(struct btree_trans *trans,
+ struct bkey_s_c old, struct bkey_s_c new,
+ unsigned flags)
{
- if (p_start == p_end)
- return false;
+ struct bch_fs *c = trans->c;
+ struct bch_fs_usage __percpu *fs_usage;
+ u64 journal_seq = trans->journal_res.seq;
+
+ if (flags & BTREE_TRIGGER_INSERT) {
+ struct bch_inode_v2 *v = (struct bch_inode_v2 *) new.v;
- p_start += le64_to_cpu(p.v->idx);
- p_end += le64_to_cpu(p.v->idx);
+ BUG_ON(!journal_seq);
+ BUG_ON(new.k->type != KEY_TYPE_inode_v2);
- if (p_end <= v_start)
- return false;
- if (p_start >= v_end)
- return false;
- return true;
+ v->bi_journal_seq = cpu_to_le64(journal_seq);
+ }
+
+ if (flags & BTREE_TRIGGER_GC) {
+ preempt_disable();
+ fs_usage = fs_usage_ptr(c, journal_seq, flags & BTREE_TRIGGER_GC);
+ fs_usage->nr_inodes += bkey_is_inode(new.k);
+ fs_usage->nr_inodes -= bkey_is_inode(old.k);
+ preempt_enable();
+ }
+ return 0;
}
-static int reflink_p_frag_references(struct bkey_s_c_reflink_p p,
- u64 start, u64 end,
- struct bkey_s_c k)
+static int bch2_mark_reservation(struct btree_trans *trans,
+ struct bkey_s_c old, struct bkey_s_c new,
+ unsigned flags)
{
- return __reflink_p_frag_references(p, start, end,
- bkey_start_offset(k.k),
- k.k->p.offset);
+ struct bch_fs *c = trans->c;
+ struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE ? old: new;
+ struct bch_fs_usage __percpu *fs_usage;
+ unsigned replicas = bkey_s_c_to_reservation(k).v->nr_replicas;
+ s64 sectors = (s64) k.k->size;
+
+ if (flags & BTREE_TRIGGER_OVERWRITE)
+ sectors = -sectors;
+ sectors *= replicas;
+
+ preempt_disable();
+ fs_usage = fs_usage_ptr(c, trans->journal_res.seq, flags & BTREE_TRIGGER_GC);
+ replicas = clamp_t(unsigned, replicas, 1,
+ ARRAY_SIZE(fs_usage->persistent_reserved));
+
+ fs_usage->reserved += sectors;
+ fs_usage->persistent_reserved[replicas - 1] += sectors;
+ preempt_enable();
+
+ return 0;
}
-static int __bch2_mark_reflink_p(struct bch_fs *c,
- struct bkey_s_c_reflink_p p,
- u64 idx, unsigned sectors,
- unsigned front_frag,
- unsigned back_frag,
- unsigned flags,
- size_t *r_idx)
+static s64 __bch2_mark_reflink_p(struct bch_fs *c, struct bkey_s_c_reflink_p p,
+ u64 *idx, unsigned flags, size_t r_idx)
{
struct reflink_gc *r;
int add = !(flags & BTREE_TRIGGER_OVERWRITE) ? 1 : -1;
- int frags_referenced;
-
- while (1) {
- if (*r_idx >= c->reflink_gc_nr)
- goto not_found;
- r = genradix_ptr(&c->reflink_gc_table, *r_idx);
- BUG_ON(!r);
+ s64 ret = 0;
- if (r->offset > idx)
- break;
- (*r_idx)++;
- }
+ if (r_idx >= c->reflink_gc_nr)
+ goto not_found;
- frags_referenced =
- __reflink_p_frag_references(p, 0, front_frag,
- r->offset - r->size, r->offset) +
- __reflink_p_frag_references(p, back_frag, p.k->size,
- r->offset - r->size, r->offset);
-
- if (frags_referenced == 2) {
- BUG_ON(!(flags & BTREE_TRIGGER_OVERWRITE_SPLIT));
- add = -add;
- } else if (frags_referenced == 1) {
- BUG_ON(!(flags & BTREE_TRIGGER_OVERWRITE));
- add = 0;
- }
+ r = genradix_ptr(&c->reflink_gc_table, r_idx);
+ if (*idx < r->offset - r->size)
+ goto not_found;
BUG_ON((s64) r->refcount + add < 0);
r->refcount += add;
- return min_t(u64, sectors, r->offset - idx);
+ *idx = r->offset;
+ return 0;
not_found:
- bch2_fs_inconsistent(c,
- "%llu:%llu len %u points to nonexistent indirect extent %llu",
- p.k->p.inode, p.k->p.offset, p.k->size, idx);
- bch2_inconsistent_error(c);
- return -EIO;
+ *idx = U64_MAX;
+ ret = -EIO;
+
+ /*
+ * XXX: we're replacing the entire reflink pointer with an error
+ * key, we should just be replacing the part that was missing:
+ */
+ if (fsck_err(c, "%llu:%llu len %u points to nonexistent indirect extent %llu",
+ p.k->p.inode, p.k->p.offset, p.k->size, *idx)) {
+ struct bkey_i_error *new;
+
+ new = kmalloc(sizeof(*new), GFP_KERNEL);
+ if (!new) {
+ bch_err(c, "%s: error allocating new key", __func__);
+ return -ENOMEM;
+ }
+
+ bkey_init(&new->k);
+ new->k.type = KEY_TYPE_error;
+ new->k.p = p.k->p;
+ new->k.size = p.k->size;
+ ret = bch2_journal_key_insert(c, BTREE_ID_extents, 0, &new->k_i);
+ }
+fsck_err:
+ return ret;
}
-static int bch2_mark_reflink_p(struct bch_fs *c,
- struct bkey_s_c_reflink_p p, unsigned offset,
- s64 sectors, unsigned flags)
+static int bch2_mark_reflink_p(struct btree_trans *trans,
+ struct bkey_s_c old, struct bkey_s_c new,
+ unsigned flags)
{
- u64 idx = le64_to_cpu(p.v->idx) + offset;
+ struct bch_fs *c = trans->c;
+ struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE ? old: new;
+ struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k);
struct reflink_gc *ref;
size_t l, r, m;
- unsigned front_frag, back_frag;
- s64 ret = 0;
-
- if (sectors < 0)
- sectors = -sectors;
-
- BUG_ON(offset + sectors > p.k->size);
+ u64 idx = le64_to_cpu(p.v->idx);
+ u64 end = le64_to_cpu(p.v->idx) + p.k->size;
+ int ret = 0;
- front_frag = offset;
- back_frag = offset + sectors;
+ if (c->sb.version >= bcachefs_metadata_version_reflink_p_fix) {
+ idx -= le32_to_cpu(p.v->front_pad);
+ end += le32_to_cpu(p.v->back_pad);
+ }
l = 0;
r = c->reflink_gc_nr;
@@ -1184,203 +1235,89 @@ static int bch2_mark_reflink_p(struct bch_fs *c,
r = m;
}
- while (sectors) {
- ret = __bch2_mark_reflink_p(c, p, idx, sectors,
- front_frag, back_frag, flags, &l);
- if (ret < 0)
- return ret;
+ while (idx < end && !ret)
+ ret = __bch2_mark_reflink_p(c, p, &idx, flags, l++);
- idx += ret;
- sectors -= ret;
- }
-
- return 0;
+ return ret;
}
-static int bch2_mark_key_locked(struct bch_fs *c,
+static int bch2_mark_key_locked(struct btree_trans *trans,
struct bkey_s_c old,
struct bkey_s_c new,
- unsigned offset, s64 sectors,
- struct bch_fs_usage *fs_usage,
- u64 journal_seq, unsigned flags)
+ unsigned flags)
{
- struct bkey_s_c k = flags & BTREE_TRIGGER_INSERT ? new : old;
- int ret = 0;
-
- BUG_ON(!(flags & (BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE)));
-
- preempt_disable();
-
- if (!fs_usage || (flags & BTREE_TRIGGER_GC))
- fs_usage = fs_usage_ptr(c, journal_seq,
- flags & BTREE_TRIGGER_GC);
+ struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE ? old: new;
switch (k.k->type) {
case KEY_TYPE_alloc:
case KEY_TYPE_alloc_v2:
- ret = bch2_mark_alloc(c, old, new, fs_usage, journal_seq, flags);
- break;
+ case KEY_TYPE_alloc_v3:
+ return bch2_mark_alloc(trans, old, new, flags);
case KEY_TYPE_btree_ptr:
case KEY_TYPE_btree_ptr_v2:
- sectors = !(flags & BTREE_TRIGGER_OVERWRITE)
- ? c->opts.btree_node_size
- : -c->opts.btree_node_size;
-
- ret = bch2_mark_extent(c, old, new, offset, sectors,
- BCH_DATA_btree, fs_usage, journal_seq, flags);
- break;
case KEY_TYPE_extent:
case KEY_TYPE_reflink_v:
- ret = bch2_mark_extent(c, old, new, offset, sectors,
- BCH_DATA_user, fs_usage, journal_seq, flags);
- break;
+ return bch2_mark_extent(trans, old, new, flags);
case KEY_TYPE_stripe:
- ret = bch2_mark_stripe(c, old, new, fs_usage, journal_seq, flags);
- break;
+ return bch2_mark_stripe(trans, old, new, flags);
case KEY_TYPE_inode:
- fs_usage->nr_inodes += new.k->type == KEY_TYPE_inode;
- fs_usage->nr_inodes -= old.k->type == KEY_TYPE_inode;
- break;
- case KEY_TYPE_reservation: {
- unsigned replicas = bkey_s_c_to_reservation(k).v->nr_replicas;
-
- sectors *= replicas;
- replicas = clamp_t(unsigned, replicas, 1,
- ARRAY_SIZE(fs_usage->persistent_reserved));
-
- fs_usage->reserved += sectors;
- fs_usage->persistent_reserved[replicas - 1] += sectors;
- break;
- }
+ case KEY_TYPE_inode_v2:
+ return bch2_mark_inode(trans, old, new, flags);
+ case KEY_TYPE_reservation:
+ return bch2_mark_reservation(trans, old, new, flags);
case KEY_TYPE_reflink_p:
- ret = bch2_mark_reflink_p(c, bkey_s_c_to_reflink_p(k),
- offset, sectors, flags);
- break;
+ return bch2_mark_reflink_p(trans, old, new, flags);
+ case KEY_TYPE_snapshot:
+ return bch2_mark_snapshot(trans, old, new, flags);
+ default:
+ return 0;
}
-
- preempt_enable();
-
- return ret;
}
-int bch2_mark_key(struct bch_fs *c, struct bkey_s_c new,
- unsigned offset, s64 sectors,
- struct bch_fs_usage *fs_usage,
- u64 journal_seq, unsigned flags)
+int bch2_mark_key(struct btree_trans *trans, struct bkey_s_c new, unsigned flags)
{
- struct bkey deleted;
+ struct bch_fs *c = trans->c;
+ struct bkey deleted = KEY(0, 0, 0);
struct bkey_s_c old = (struct bkey_s_c) { &deleted, NULL };
int ret;
- bkey_init(&deleted);
+ deleted.p = new.k->p;
percpu_down_read(&c->mark_lock);
- ret = bch2_mark_key_locked(c, old, new, offset, sectors,
- fs_usage, journal_seq,
- BTREE_TRIGGER_INSERT|flags);
+ ret = bch2_mark_key_locked(trans, old, new, flags);
percpu_up_read(&c->mark_lock);
return ret;
}
-int bch2_mark_update(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bkey_i *new,
- struct bch_fs_usage *fs_usage,
- unsigned flags)
+int bch2_mark_update(struct btree_trans *trans, struct btree_path *path,
+ struct bkey_i *new, unsigned flags)
{
- struct bch_fs *c = trans->c;
+ struct bkey _deleted = KEY(0, 0, 0);
+ struct bkey_s_c deleted = (struct bkey_s_c) { &_deleted, NULL };
struct bkey_s_c old;
struct bkey unpacked;
- int ret = 0;
+ int ret;
+
+ _deleted.p = path->pos;
if (unlikely(flags & BTREE_TRIGGER_NORUN))
return 0;
- if (!btree_node_type_needs_gc(iter->btree_id))
+ if (!btree_node_type_needs_gc(path->btree_id))
return 0;
- bkey_init(&unpacked);
- old = (struct bkey_s_c) { &unpacked, NULL };
+ old = bch2_btree_path_peek_slot(path, &unpacked);
- if (!btree_node_type_is_extents(iter->btree_id)) {
- /* iterators should be uptodate, shouldn't get errors here: */
- if (btree_iter_type(iter) != BTREE_ITER_CACHED) {
- old = bch2_btree_iter_peek_slot(iter);
- BUG_ON(bkey_err(old));
- } else {
- struct bkey_cached *ck = (void *) iter->l[0].b;
-
- if (ck->valid)
- old = bkey_i_to_s_c(ck->k);
- }
-
- if (old.k->type == new->k.type) {
- bch2_mark_key_locked(c, old, bkey_i_to_s_c(new), 0, 0,
- fs_usage, trans->journal_res.seq,
+ if (old.k->type == new->k.type &&
+ ((1U << old.k->type) & BTREE_TRIGGER_WANTS_OLD_AND_NEW)) {
+ ret = bch2_mark_key_locked(trans, old, bkey_i_to_s_c(new),
BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE|flags);
-
- } else {
- bch2_mark_key_locked(c, old, bkey_i_to_s_c(new), 0, 0,
- fs_usage, trans->journal_res.seq,
- BTREE_TRIGGER_INSERT|flags);
- bch2_mark_key_locked(c, old, bkey_i_to_s_c(new), 0, 0,
- fs_usage, trans->journal_res.seq,
- BTREE_TRIGGER_OVERWRITE|flags);
- }
} else {
- struct btree_iter *copy;
-
- BUG_ON(btree_iter_type(iter) == BTREE_ITER_CACHED);
- bch2_mark_key_locked(c, old, bkey_i_to_s_c(new),
- 0, new->k.size,
- fs_usage, trans->journal_res.seq,
- BTREE_TRIGGER_INSERT|flags);
-
- copy = bch2_trans_copy_iter(trans, iter);
-
- for_each_btree_key_continue(copy, 0, old, ret) {
- unsigned offset = 0;
- s64 sectors = -((s64) old.k->size);
-
- flags |= BTREE_TRIGGER_OVERWRITE;
-
- if (bkey_cmp(new->k.p, bkey_start_pos(old.k)) <= 0)
- break;
-
- switch (bch2_extent_overlap(&new->k, old.k)) {
- case BCH_EXTENT_OVERLAP_ALL:
- offset = 0;
- sectors = -((s64) old.k->size);
- break;
- case BCH_EXTENT_OVERLAP_BACK:
- offset = bkey_start_offset(&new->k) -
- bkey_start_offset(old.k);
- sectors = bkey_start_offset(&new->k) -
- old.k->p.offset;
- break;
- case BCH_EXTENT_OVERLAP_FRONT:
- offset = 0;
- sectors = bkey_start_offset(old.k) -
- new->k.p.offset;
- break;
- case BCH_EXTENT_OVERLAP_MIDDLE:
- offset = bkey_start_offset(&new->k) -
- bkey_start_offset(old.k);
- sectors = -((s64) new->k.size);
- flags |= BTREE_TRIGGER_OVERWRITE_SPLIT;
- break;
- }
-
- BUG_ON(sectors >= 0);
-
- ret = bch2_mark_key_locked(c, old, bkey_i_to_s_c(new),
- offset, sectors, fs_usage,
- trans->journal_res.seq, flags) ?: 1;
- if (ret <= 0)
- break;
- }
- bch2_trans_iter_put(trans, copy);
+ ret = bch2_mark_key_locked(trans, deleted, bkey_i_to_s_c(new),
+ BTREE_TRIGGER_INSERT|flags) ?:
+ bch2_mark_key_locked(trans, old, deleted,
+ BTREE_TRIGGER_OVERWRITE|flags);
}
return ret;
@@ -1404,23 +1341,14 @@ void fs_usage_apply_warn(struct btree_trans *trans,
pr_err("%s", buf);
pr_err("overlapping with");
- if (btree_iter_type(i->iter) != BTREE_ITER_CACHED) {
- struct btree_iter *copy = bch2_trans_copy_iter(trans, i->iter);
- struct bkey_s_c k;
- int ret;
+ if (!i->cached) {
+ struct bkey u;
+ struct bkey_s_c k = bch2_btree_path_peek_slot(i->path, &u);
- for_each_btree_key_continue(copy, 0, k, ret) {
- if (btree_node_type_is_extents(i->iter->btree_id)
- ? bkey_cmp(i->k->k.p, bkey_start_pos(k.k)) <= 0
- : bkey_cmp(i->k->k.p, k.k->p))
- break;
-
- bch2_bkey_val_to_text(&PBUF(buf), c, k);
- pr_err("%s", buf);
- }
- bch2_trans_iter_put(trans, copy);
+ bch2_bkey_val_to_text(&PBUF(buf), c, k);
+ pr_err("%s", buf);
} else {
- struct bkey_cached *ck = (void *) i->iter->l[0].b;
+ struct bkey_cached *ck = (void *) i->path->l[0].b;
if (ck->valid) {
bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(ck->k));
@@ -1457,7 +1385,7 @@ void bch2_trans_fs_usage_apply(struct btree_trans *trans,
added += d->delta;
}
- BUG_ON(update_replicas(c, dst, &d->r, d->delta));
+ BUG_ON(__update_replicas(c, dst, &d->r, d->delta));
}
dst->nr_inodes += deltas->nr_inodes;
@@ -1474,7 +1402,14 @@ void bch2_trans_fs_usage_apply(struct btree_trans *trans,
*/
should_not_have_added = added - (s64) disk_res_sectors;
if (unlikely(should_not_have_added > 0)) {
- atomic64_sub(should_not_have_added, &c->sectors_available);
+ u64 old, new, v = atomic64_read(&c->sectors_available);
+
+ do {
+ old = v;
+ new = max_t(s64, 0, old - should_not_have_added);
+ } while ((v = atomic64_cmpxchg(&c->sectors_available,
+ old, new)) != old);
+
added -= should_not_have_added;
warn = true;
}
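Worked example with hypothetical numbers: if disk_res_sectors = 8 and added = 10, then should_not_have_added = 2; with sectors_available currently at 1, the old atomic64_sub() would have wrapped the unsigned counter, while the cmpxchg loop above clamps it:

	/* old = 1, should_not_have_added = 2 */
	new = max_t(s64, 0, 1 - 2);	/* 0, instead of wrapping to ~2^64 */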
@@ -1492,54 +1427,8 @@ void bch2_trans_fs_usage_apply(struct btree_trans *trans,
/* trans_mark: */
-static struct btree_iter *trans_get_update(struct btree_trans *trans,
- enum btree_id btree_id, struct bpos pos,
- struct bkey_s_c *k)
-{
- struct btree_insert_entry *i;
-
- trans_for_each_update(trans, i)
- if (i->iter->btree_id == btree_id &&
- (btree_node_type_is_extents(btree_id)
- ? bkey_cmp(pos, bkey_start_pos(&i->k->k)) >= 0 &&
- bkey_cmp(pos, i->k->k.p) < 0
- : !bkey_cmp(pos, i->iter->pos))) {
- *k = bkey_i_to_s_c(i->k);
-
- /* ugly hack.. */
- BUG_ON(btree_iter_live(trans, i->iter));
- trans->iters_live |= 1ULL << i->iter->idx;
- return i->iter;
- }
-
- return NULL;
-}
-
-static int trans_get_key(struct btree_trans *trans,
- enum btree_id btree_id, struct bpos pos,
- struct btree_iter **iter,
- struct bkey_s_c *k)
-{
- unsigned flags = btree_id != BTREE_ID_alloc
- ? BTREE_ITER_SLOTS
- : BTREE_ITER_CACHED;
- int ret;
-
- *iter = trans_get_update(trans, btree_id, pos, k);
- if (*iter)
- return 1;
-
- *iter = bch2_trans_get_iter(trans, btree_id, pos,
- flags|BTREE_ITER_INTENT);
- *k = __bch2_btree_iter_peek(*iter, flags);
- ret = bkey_err(*k);
- if (ret)
- bch2_trans_iter_put(trans, *iter);
- return ret;
-}
-
static struct bkey_alloc_buf *
-bch2_trans_start_alloc_update(struct btree_trans *trans, struct btree_iter **_iter,
+bch2_trans_start_alloc_update(struct btree_trans *trans, struct btree_iter *iter,
const struct bch_extent_ptr *ptr,
struct bkey_alloc_unpacked *u)
{
@@ -1547,36 +1436,33 @@ bch2_trans_start_alloc_update(struct btree_trans *trans, struct btree_iter **_it
struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
struct bpos pos = POS(ptr->dev, PTR_BUCKET_NR(ca, ptr));
struct bucket *g;
- struct btree_iter *iter;
- struct bkey_s_c k;
struct bkey_alloc_buf *a;
+ struct bkey_i *update = btree_trans_peek_updates(trans, BTREE_ID_alloc, pos);
int ret;
a = bch2_trans_kmalloc(trans, sizeof(struct bkey_alloc_buf));
if (IS_ERR(a))
return a;
- iter = trans_get_update(trans, BTREE_ID_alloc, pos, &k);
- if (iter) {
- *u = bch2_alloc_unpack(k);
- } else {
- iter = bch2_trans_get_iter(trans, BTREE_ID_alloc, pos,
- BTREE_ITER_CACHED|
- BTREE_ITER_CACHED_NOFILL|
- BTREE_ITER_INTENT);
- ret = bch2_btree_iter_traverse(iter);
- if (ret) {
- bch2_trans_iter_put(trans, iter);
- return ERR_PTR(ret);
- }
+ bch2_trans_iter_init(trans, iter, BTREE_ID_alloc, pos,
+ BTREE_ITER_CACHED|
+ BTREE_ITER_CACHED_NOFILL|
+ BTREE_ITER_INTENT);
+ ret = bch2_btree_iter_traverse(iter);
+ if (ret) {
+ bch2_trans_iter_exit(trans, iter);
+ return ERR_PTR(ret);
+ }
+ if (update && !bpos_cmp(update->k.p, pos)) {
+ *u = bch2_alloc_unpack(bkey_i_to_s_c(update));
+ } else {
percpu_down_read(&c->mark_lock);
g = bucket(ca, pos.offset);
*u = alloc_mem_to_key(iter, g, READ_ONCE(g->mark));
percpu_up_read(&c->mark_lock);
}
- *_iter = iter;
return a;
}
@@ -1585,7 +1471,7 @@ static int bch2_trans_mark_pointer(struct btree_trans *trans,
s64 sectors, enum bch_data_type data_type)
{
struct bch_fs *c = trans->c;
- struct btree_iter *iter;
+ struct btree_iter iter;
struct bkey_alloc_unpacked u;
struct bkey_alloc_buf *a;
int ret;
@@ -1594,15 +1480,16 @@ static int bch2_trans_mark_pointer(struct btree_trans *trans,
if (IS_ERR(a))
return PTR_ERR(a);
- ret = __mark_pointer(c, k, &p.ptr, sectors, data_type, u.gen, &u.data_type,
+ ret = __mark_pointer(trans, k, &p.ptr, sectors, data_type,
+ u.gen, &u.data_type,
&u.dirty_sectors, &u.cached_sectors);
if (ret)
goto out;
bch2_alloc_pack(c, a, u);
- bch2_trans_update(trans, iter, &a->k, 0);
+ bch2_trans_update(trans, &iter, &a->k, 0);
out:
- bch2_trans_iter_put(trans, iter);
+ bch2_trans_iter_exit(trans, &iter);
return ret;
}
@@ -1611,15 +1498,19 @@ static int bch2_trans_mark_stripe_ptr(struct btree_trans *trans,
s64 sectors, enum bch_data_type data_type)
{
struct bch_fs *c = trans->c;
- struct btree_iter *iter;
+ struct btree_iter iter;
struct bkey_s_c k;
struct bkey_i_stripe *s;
struct bch_replicas_padded r;
int ret = 0;
- ret = trans_get_key(trans, BTREE_ID_stripes, POS(0, p.ec.idx), &iter, &k);
- if (ret < 0)
- return ret;
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_stripes, POS(0, p.ec.idx),
+ BTREE_ITER_INTENT|
+ BTREE_ITER_WITH_UPDATES);
+ k = bch2_btree_iter_peek_slot(&iter);
+ ret = bkey_err(k);
+ if (ret)
+ goto err;
if (k.k->type != KEY_TYPE_stripe) {
bch2_fs_inconsistent(c,
@@ -1627,7 +1518,7 @@ static int bch2_trans_mark_stripe_ptr(struct btree_trans *trans,
(u64) p.ec.idx);
bch2_inconsistent_error(c);
ret = -EIO;
- goto out;
+ goto err;
}
if (!bch2_ptr_matches_stripe(bkey_s_c_to_stripe(k).v, p)) {
@@ -1635,37 +1526,42 @@ static int bch2_trans_mark_stripe_ptr(struct btree_trans *trans,
"stripe pointer doesn't match stripe %llu",
(u64) p.ec.idx);
ret = -EIO;
- goto out;
+ goto err;
}
s = bch2_trans_kmalloc(trans, bkey_bytes(k.k));
ret = PTR_ERR_OR_ZERO(s);
if (ret)
- goto out;
+ goto err;
bkey_reassemble(&s->k_i, k);
stripe_blockcount_set(&s->v, p.ec.block,
stripe_blockcount_get(&s->v, p.ec.block) +
sectors);
- bch2_trans_update(trans, iter, &s->k_i, 0);
+ bch2_trans_update(trans, &iter, &s->k_i, 0);
bch2_bkey_to_replicas(&r.e, bkey_i_to_s_c(&s->k_i));
r.e.data_type = data_type;
update_replicas_list(trans, &r.e, sectors);
-out:
- bch2_trans_iter_put(trans, iter);
+err:
+ bch2_trans_iter_exit(trans, &iter);
return ret;
}
static int bch2_trans_mark_extent(struct btree_trans *trans,
- struct bkey_s_c k, unsigned offset,
- s64 sectors, unsigned flags,
- enum bch_data_type data_type)
+ struct bkey_s_c k, unsigned flags)
{
+ struct bch_fs *c = trans->c;
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
const union bch_extent_entry *entry;
struct extent_ptr_decoded p;
struct bch_replicas_padded r;
+ enum bch_data_type data_type = bkey_is_btree_ptr(k.k)
+ ? BCH_DATA_btree
+ : BCH_DATA_user;
+ s64 sectors = bkey_is_btree_ptr(k.k)
+ ? c->opts.btree_node_size
+ : k.k->size;
s64 dirty_sectors = 0;
bool stale;
int ret;
@@ -1674,15 +1570,14 @@ static int bch2_trans_mark_extent(struct btree_trans *trans,
r.e.nr_devs = 0;
r.e.nr_required = 1;
- BUG_ON(!sectors);
-
bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
- s64 disk_sectors = data_type == BCH_DATA_btree
- ? sectors
- : ptr_disk_sectors_delta(p, offset, sectors, flags);
+ s64 disk_sectors = ptr_disk_sectors(sectors, p);
+
+ if (flags & BTREE_TRIGGER_OVERWRITE)
+ disk_sectors = -disk_sectors;
- ret = bch2_trans_mark_pointer(trans, k, p, disk_sectors,
- data_type);
+ ret = bch2_trans_mark_pointer(trans, k, p,
+ disk_sectors, data_type);
if (ret < 0)
return ret;
@@ -1718,7 +1613,7 @@ static int bch2_trans_mark_stripe_alloc_ref(struct btree_trans *trans,
struct bch_fs *c = trans->c;
const struct bch_extent_ptr *ptr = &s.v->ptrs[idx];
struct bkey_alloc_buf *a;
- struct btree_iter *iter;
+ struct btree_iter iter;
struct bkey_alloc_unpacked u;
bool parity = idx >= s.v->nr_blocks - s.v->nr_redundant;
int ret = 0;
@@ -1742,7 +1637,7 @@ static int bch2_trans_mark_stripe_alloc_ref(struct btree_trans *trans,
if (!deleting) {
if (bch2_fs_inconsistent_on(u.stripe && u.stripe != s.k->p.offset, c,
"bucket %llu:%llu gen %u: multiple stripes using same bucket (%u, %llu)",
- iter->pos.inode, iter->pos.offset, u.gen,
+ iter.pos.inode, iter.pos.offset, u.gen,
u.stripe, s.k->p.offset)) {
ret = -EIO;
goto err;
@@ -1756,9 +1651,9 @@ static int bch2_trans_mark_stripe_alloc_ref(struct btree_trans *trans,
}
bch2_alloc_pack(c, a, u);
- bch2_trans_update(trans, iter, &a->k, 0);
+ bch2_trans_update(trans, &iter, &a->k, 0);
err:
- bch2_trans_iter_put(trans, iter);
+ bch2_trans_iter_exit(trans, &iter);
return ret;
}
@@ -1818,40 +1713,62 @@ static int bch2_trans_mark_stripe(struct btree_trans *trans,
return ret;
}
+static int bch2_trans_mark_inode(struct btree_trans *trans,
+ struct bkey_s_c old,
+ struct bkey_s_c new,
+ unsigned flags)
+{
+ int nr = bkey_is_inode(new.k) - bkey_is_inode(old.k);
+
+ if (nr) {
+ struct replicas_delta_list *d =
+ replicas_deltas_realloc(trans, 0);
+ d->nr_inodes += nr;
+ }
+
+ return 0;
+}
+
+static int bch2_trans_mark_reservation(struct btree_trans *trans,
+ struct bkey_s_c k, unsigned flags)
+{
+ unsigned replicas = bkey_s_c_to_reservation(k).v->nr_replicas;
+ s64 sectors = (s64) k.k->size;
+ struct replicas_delta_list *d;
+
+ if (flags & BTREE_TRIGGER_OVERWRITE)
+ sectors = -sectors;
+ sectors *= replicas;
+
+ d = replicas_deltas_realloc(trans, 0);
+
+ replicas = clamp_t(unsigned, replicas, 1,
+ ARRAY_SIZE(d->persistent_reserved));
+
+ d->persistent_reserved[replicas - 1] += sectors;
+ return 0;
+}
+
static int __bch2_trans_mark_reflink_p(struct btree_trans *trans,
struct bkey_s_c_reflink_p p,
- u64 idx, unsigned sectors,
- unsigned front_frag,
- unsigned back_frag,
- unsigned flags)
+ u64 *idx, unsigned flags)
{
struct bch_fs *c = trans->c;
- struct btree_iter *iter;
+ struct btree_iter iter;
struct bkey_s_c k;
struct bkey_i *n;
__le64 *refcount;
int add = !(flags & BTREE_TRIGGER_OVERWRITE) ? 1 : -1;
- int frags_referenced;
- s64 ret;
-
- ret = trans_get_key(trans, BTREE_ID_reflink,
- POS(0, idx), &iter, &k);
- if (ret < 0)
- return ret;
-
- sectors = min_t(u64, sectors, k.k->p.offset - idx);
-
- frags_referenced =
- reflink_p_frag_references(p, 0, front_frag, k) +
- reflink_p_frag_references(p, back_frag, p.k->size, k);
+ char buf[200];
+ int ret;
- if (frags_referenced == 2) {
- BUG_ON(!(flags & BTREE_TRIGGER_OVERWRITE_SPLIT));
- add = -add;
- } else if (frags_referenced == 1) {
- BUG_ON(!(flags & BTREE_TRIGGER_OVERWRITE));
- goto out;
- }
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_reflink, POS(0, *idx),
+ BTREE_ITER_INTENT|
+ BTREE_ITER_WITH_UPDATES);
+ k = bch2_btree_iter_peek_slot(&iter);
+ ret = bkey_err(k);
+ if (ret)
+ goto err;
n = bch2_trans_kmalloc(trans, bkey_bytes(k.k));
ret = PTR_ERR_OR_ZERO(n);
@@ -1862,15 +1779,38 @@ static int __bch2_trans_mark_reflink_p(struct btree_trans *trans,
refcount = bkey_refcount(n);
if (!refcount) {
+ bch2_bkey_val_to_text(&PBUF(buf), c, p.s_c);
bch2_fs_inconsistent(c,
- "%llu:%llu len %u points to nonexistent indirect extent %llu",
- p.k->p.inode, p.k->p.offset, p.k->size, idx);
- bch2_inconsistent_error(c);
+ "nonexistent indirect extent at %llu while marking\n %s",
+ *idx, buf);
ret = -EIO;
goto err;
}
- BUG_ON(!*refcount && (flags & BTREE_TRIGGER_OVERWRITE));
+ if (!*refcount && (flags & BTREE_TRIGGER_OVERWRITE)) {
+ bch2_bkey_val_to_text(&PBUF(buf), c, p.s_c);
+ bch2_fs_inconsistent(c,
+ "indirect extent refcount underflow at %llu while marking\n %s",
+ *idx, buf);
+ ret = -EIO;
+ goto err;
+ }
+
+ if (flags & BTREE_TRIGGER_INSERT) {
+ struct bch_reflink_p *v = (struct bch_reflink_p *) p.v;
+ u64 pad;
+
+ pad = max_t(s64, le32_to_cpu(v->front_pad),
+ le64_to_cpu(v->idx) - bkey_start_offset(k.k));
+ BUG_ON(pad > U32_MAX);
+ v->front_pad = cpu_to_le32(pad);
+
+ pad = max_t(s64, le32_to_cpu(v->back_pad),
+ k.k->p.offset - p.k->size - le64_to_cpu(v->idx));
+ BUG_ON(pad > U32_MAX);
+ v->back_pad = cpu_to_le32(pad);
+ }
+
le64_add_cpu(refcount, add);
if (!*refcount) {
@@ -1878,210 +1818,72 @@ static int __bch2_trans_mark_reflink_p(struct btree_trans *trans,
set_bkey_val_u64s(&n->k, 0);
}
- bch2_btree_iter_set_pos(iter, bkey_start_pos(k.k));
- ret = bch2_trans_update(trans, iter, n, 0);
+ bch2_btree_iter_set_pos_to_extent_start(&iter);
+ ret = bch2_trans_update(trans, &iter, n, 0);
if (ret)
goto err;
-out:
- ret = sectors;
+
+ *idx = k.k->p.offset;
err:
- bch2_trans_iter_put(trans, iter);
+ bch2_trans_iter_exit(trans, &iter);
return ret;
}
static int bch2_trans_mark_reflink_p(struct btree_trans *trans,
- struct bkey_s_c_reflink_p p, unsigned offset,
- s64 sectors, unsigned flags)
+ struct bkey_s_c k, unsigned flags)
{
- u64 idx = le64_to_cpu(p.v->idx) + offset;
- unsigned front_frag, back_frag;
- s64 ret = 0;
-
- if (sectors < 0)
- sectors = -sectors;
+ struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k);
+ u64 idx, end_idx;
+ int ret = 0;
- BUG_ON(offset + sectors > p.k->size);
+ if (flags & BTREE_TRIGGER_INSERT) {
+ struct bch_reflink_p *v = (struct bch_reflink_p *) p.v;
- front_frag = offset;
- back_frag = offset + sectors;
+ v->front_pad = v->back_pad = 0;
+ }
- while (sectors) {
- ret = __bch2_trans_mark_reflink_p(trans, p, idx, sectors,
- front_frag, back_frag, flags);
- if (ret < 0)
- return ret;
+ idx = le64_to_cpu(p.v->idx) - le32_to_cpu(p.v->front_pad);
+ end_idx = le64_to_cpu(p.v->idx) + p.k->size +
+ le32_to_cpu(p.v->back_pad);
- idx += ret;
- sectors -= ret;
- }
+ while (idx < end_idx && !ret)
+ ret = __bch2_trans_mark_reflink_p(trans, p, &idx, flags);
- return 0;
+ return ret;
}
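A worked example of the padded range, with hypothetical offsets: a reflink pointer with idx = 100 and size = 20 references [100, 120) in the reflink btree. If the indirect extent containing offset 100 spans [96, 112) and the one containing offset 119 spans [112, 128), the insert trigger above ends up with:

	/* front_pad = max(0, 100 - 96)       = 4 */
	/* back_pad  = max(0, 128 - 20 - 100) = 8 */
	/* marking walks idx = 100 - 4 = 96 .. end = 100 + 20 + 8 = 128 */

so both marking loops cover the full range of indirect extents the pointer can still reference after being trimmed.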
-int bch2_trans_mark_key(struct btree_trans *trans,
- struct bkey_s_c old,
- struct bkey_s_c new,
- unsigned offset, s64 sectors, unsigned flags)
+int bch2_trans_mark_key(struct btree_trans *trans, struct bkey_s_c old,
+ struct bkey_s_c new, unsigned flags)
{
- struct bch_fs *c = trans->c;
- struct bkey_s_c k = flags & BTREE_TRIGGER_INSERT ? new : old;
- struct replicas_delta_list *d;
-
- BUG_ON(!(flags & (BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE)));
+ struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE ? old: new;
switch (k.k->type) {
case KEY_TYPE_btree_ptr:
case KEY_TYPE_btree_ptr_v2:
- sectors = !(flags & BTREE_TRIGGER_OVERWRITE)
- ? c->opts.btree_node_size
- : -c->opts.btree_node_size;
-
- return bch2_trans_mark_extent(trans, k, offset, sectors,
- flags, BCH_DATA_btree);
case KEY_TYPE_extent:
case KEY_TYPE_reflink_v:
- return bch2_trans_mark_extent(trans, k, offset, sectors,
- flags, BCH_DATA_user);
+ return bch2_trans_mark_extent(trans, k, flags);
case KEY_TYPE_stripe:
return bch2_trans_mark_stripe(trans, old, new, flags);
- case KEY_TYPE_inode: {
- int nr = (new.k->type == KEY_TYPE_inode) -
- (old.k->type == KEY_TYPE_inode);
-
- if (nr) {
- d = replicas_deltas_realloc(trans, 0);
- d->nr_inodes += nr;
- }
-
- return 0;
- }
- case KEY_TYPE_reservation: {
- unsigned replicas = bkey_s_c_to_reservation(k).v->nr_replicas;
-
- d = replicas_deltas_realloc(trans, 0);
-
- sectors *= replicas;
- replicas = clamp_t(unsigned, replicas, 1,
- ARRAY_SIZE(d->persistent_reserved));
-
- d->persistent_reserved[replicas - 1] += sectors;
- return 0;
- }
+ case KEY_TYPE_inode:
+ case KEY_TYPE_inode_v2:
+ return bch2_trans_mark_inode(trans, old, new, flags);
+ case KEY_TYPE_reservation:
+ return bch2_trans_mark_reservation(trans, k, flags);
case KEY_TYPE_reflink_p:
- return bch2_trans_mark_reflink_p(trans,
- bkey_s_c_to_reflink_p(k),
- offset, sectors, flags);
+ return bch2_trans_mark_reflink_p(trans, k, flags);
default:
return 0;
}
}
-int bch2_trans_mark_update(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bkey_i *new,
- unsigned flags)
-{
- struct bkey_s_c old;
- int ret;
-
- if (unlikely(flags & BTREE_TRIGGER_NORUN))
- return 0;
-
- if (!btree_node_type_needs_gc(iter->btree_id))
- return 0;
-
- if (!btree_node_type_is_extents(iter->btree_id)) {
- if (btree_iter_type(iter) != BTREE_ITER_CACHED) {
- old = bch2_btree_iter_peek_slot(iter);
- ret = bkey_err(old);
- if (ret)
- return ret;
- } else {
- struct bkey_cached *ck = (void *) iter->l[0].b;
-
- BUG_ON(!ck->valid);
- old = bkey_i_to_s_c(ck->k);
- }
-
- if (old.k->type == new->k.type) {
- ret = bch2_trans_mark_key(trans, old, bkey_i_to_s_c(new), 0, 0,
- BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE|flags);
- } else {
- ret = bch2_trans_mark_key(trans, old, bkey_i_to_s_c(new), 0, 0,
- BTREE_TRIGGER_INSERT|flags) ?:
- bch2_trans_mark_key(trans, old, bkey_i_to_s_c(new), 0, 0,
- BTREE_TRIGGER_OVERWRITE|flags);
- }
- } else {
- struct btree_iter *copy;
- struct bkey _old;
-
- EBUG_ON(btree_iter_type(iter) == BTREE_ITER_CACHED);
-
- bkey_init(&_old);
- old = (struct bkey_s_c) { &_old, NULL };
-
- ret = bch2_trans_mark_key(trans, old, bkey_i_to_s_c(new),
- 0, new->k.size,
- BTREE_TRIGGER_INSERT);
- if (ret)
- return ret;
-
- copy = bch2_trans_copy_iter(trans, iter);
-
- for_each_btree_key_continue(copy, 0, old, ret) {
- unsigned offset = 0;
- s64 sectors = -((s64) old.k->size);
-
- flags |= BTREE_TRIGGER_OVERWRITE;
-
- if (bkey_cmp(new->k.p, bkey_start_pos(old.k)) <= 0)
- break;
-
- switch (bch2_extent_overlap(&new->k, old.k)) {
- case BCH_EXTENT_OVERLAP_ALL:
- offset = 0;
- sectors = -((s64) old.k->size);
- break;
- case BCH_EXTENT_OVERLAP_BACK:
- offset = bkey_start_offset(&new->k) -
- bkey_start_offset(old.k);
- sectors = bkey_start_offset(&new->k) -
- old.k->p.offset;
- break;
- case BCH_EXTENT_OVERLAP_FRONT:
- offset = 0;
- sectors = bkey_start_offset(old.k) -
- new->k.p.offset;
- break;
- case BCH_EXTENT_OVERLAP_MIDDLE:
- offset = bkey_start_offset(&new->k) -
- bkey_start_offset(old.k);
- sectors = -((s64) new->k.size);
- flags |= BTREE_TRIGGER_OVERWRITE_SPLIT;
- break;
- }
-
- BUG_ON(sectors >= 0);
-
- ret = bch2_trans_mark_key(trans, old, bkey_i_to_s_c(new),
- offset, sectors, flags);
- if (ret)
- break;
- }
- bch2_trans_iter_put(trans, copy);
- }
-
- return ret;
-}
-
static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans,
struct bch_dev *ca, size_t b,
enum bch_data_type type,
unsigned sectors)
{
struct bch_fs *c = trans->c;
- struct btree_iter *iter;
+ struct btree_iter iter;
struct bkey_alloc_unpacked u;
struct bkey_alloc_buf *a;
struct bch_extent_ptr ptr = {
@@ -2104,7 +1906,7 @@ static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans,
bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
"bucket %llu:%llu gen %u different types of data in same bucket: %s, %s\n"
"while marking %s",
- iter->pos.inode, iter->pos.offset, u.gen,
+ iter.pos.inode, iter.pos.offset, u.gen,
bch2_data_types[u.data_type],
bch2_data_types[type],
bch2_data_types[type]);
@@ -2116,9 +1918,9 @@ static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans,
u.dirty_sectors = sectors;
bch2_alloc_pack(c, a, u);
- bch2_trans_update(trans, iter, &a->k, 0);
+ bch2_trans_update(trans, &iter, &a->k, 0);
out:
- bch2_trans_iter_put(trans, iter);
+ bch2_trans_iter_exit(trans, &iter);
return ret;
}
diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h
index 04a2a9310cdd..5ed9441cb115 100644
--- a/fs/bcachefs/buckets.h
+++ b/fs/bcachefs/buckets.h
@@ -125,20 +125,6 @@ static inline u8 ptr_stale(struct bch_dev *ca,
return gen_after(ptr_bucket_mark(ca, ptr).gen, ptr->gen);
}
-static inline s64 __ptr_disk_sectors(struct extent_ptr_decoded p,
- unsigned live_size)
-{
- return live_size && p.crc.compression_type
- ? max(1U, DIV_ROUND_UP(live_size * p.crc.compressed_size,
- p.crc.uncompressed_size))
- : live_size;
-}
-
-static inline s64 ptr_disk_sectors(struct extent_ptr_decoded p)
-{
- return __ptr_disk_sectors(p, p.crc.live_size);
-}
-
/* bucket gc marks */
static inline unsigned bucket_sectors_used(struct bucket_mark mark)
@@ -240,16 +226,13 @@ void bch2_mark_metadata_bucket(struct bch_fs *, struct bch_dev *,
size_t, enum bch_data_type, unsigned,
struct gc_pos, unsigned);
-int bch2_mark_key(struct bch_fs *, struct bkey_s_c, unsigned,
- s64, struct bch_fs_usage *, u64, unsigned);
+int bch2_mark_key(struct btree_trans *, struct bkey_s_c, unsigned);
-int bch2_mark_update(struct btree_trans *, struct btree_iter *,
- struct bkey_i *, struct bch_fs_usage *, unsigned);
+int bch2_mark_update(struct btree_trans *, struct btree_path *,
+ struct bkey_i *, unsigned);
-int bch2_trans_mark_key(struct btree_trans *, struct bkey_s_c, struct bkey_s_c,
- unsigned, s64, unsigned);
-int bch2_trans_mark_update(struct btree_trans *, struct btree_iter *iter,
- struct bkey_i *insert, unsigned);
+int bch2_trans_mark_key(struct btree_trans *, struct bkey_s_c,
+ struct bkey_s_c, unsigned);
void bch2_trans_fs_usage_apply(struct btree_trans *, struct replicas_delta_list *);
int bch2_trans_mark_metadata_bucket(struct btree_trans *, struct bch_dev *,
diff --git a/fs/bcachefs/chardev.c b/fs/bcachefs/chardev.c
index c29f8272e682..db68a78276cf 100644
--- a/fs/bcachefs/chardev.c
+++ b/fs/bcachefs/chardev.c
@@ -157,6 +157,9 @@ static long bch2_ioctl_query_uuid(struct bch_fs *c,
#if 0
static long bch2_ioctl_start(struct bch_fs *c, struct bch_ioctl_start arg)
{
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
if (arg.flags || arg.pad)
return -EINVAL;
@@ -165,6 +168,9 @@ static long bch2_ioctl_start(struct bch_fs *c, struct bch_ioctl_start arg)
static long bch2_ioctl_stop(struct bch_fs *c)
{
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
bch2_fs_stop(c);
return 0;
}
@@ -175,6 +181,9 @@ static long bch2_ioctl_disk_add(struct bch_fs *c, struct bch_ioctl_disk arg)
char *path;
int ret;
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
if (arg.flags || arg.pad)
return -EINVAL;
@@ -192,6 +201,9 @@ static long bch2_ioctl_disk_remove(struct bch_fs *c, struct bch_ioctl_disk arg)
{
struct bch_dev *ca;
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
if ((arg.flags & ~(BCH_FORCE_IF_DATA_LOST|
BCH_FORCE_IF_METADATA_LOST|
BCH_FORCE_IF_DEGRADED|
@@ -211,6 +223,9 @@ static long bch2_ioctl_disk_online(struct bch_fs *c, struct bch_ioctl_disk arg)
char *path;
int ret;
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
if (arg.flags || arg.pad)
return -EINVAL;
@@ -228,6 +243,9 @@ static long bch2_ioctl_disk_offline(struct bch_fs *c, struct bch_ioctl_disk arg)
struct bch_dev *ca;
int ret;
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
if ((arg.flags & ~(BCH_FORCE_IF_DATA_LOST|
BCH_FORCE_IF_METADATA_LOST|
BCH_FORCE_IF_DEGRADED|
@@ -250,11 +268,15 @@ static long bch2_ioctl_disk_set_state(struct bch_fs *c,
struct bch_dev *ca;
int ret;
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
if ((arg.flags & ~(BCH_FORCE_IF_DATA_LOST|
BCH_FORCE_IF_METADATA_LOST|
BCH_FORCE_IF_DEGRADED|
BCH_BY_INDEX)) ||
- arg.pad[0] || arg.pad[1] || arg.pad[2])
+ arg.pad[0] || arg.pad[1] || arg.pad[2] ||
+ arg.new_state >= BCH_MEMBER_STATE_NR)
return -EINVAL;
ca = bch2_device_lookup(c, arg.dev, arg.flags);
@@ -331,6 +353,9 @@ static long bch2_ioctl_data(struct bch_fs *c,
unsigned flags = O_RDONLY|O_CLOEXEC|O_NONBLOCK;
int ret, fd = -1;
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
if (arg.op >= BCH_DATA_OP_NR || arg.flags)
return -EINVAL;
@@ -497,6 +522,9 @@ static long bch2_ioctl_read_super(struct bch_fs *c,
struct bch_sb *sb;
int ret = 0;
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
if ((arg.flags & ~(BCH_BY_INDEX|BCH_READ_DEV)) ||
arg.pad)
return -EINVAL;
@@ -537,6 +565,9 @@ static long bch2_ioctl_disk_get_idx(struct bch_fs *c,
struct bch_dev *ca;
unsigned i;
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
for_each_online_member(ca, c, i)
if (ca->disk_sb.bdev->bd_dev == dev) {
percpu_ref_put(&ca->io_ref);
@@ -552,6 +583,9 @@ static long bch2_ioctl_disk_resize(struct bch_fs *c,
struct bch_dev *ca;
int ret;
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
if ((arg.flags & ~BCH_BY_INDEX) ||
arg.pad)
return -EINVAL;
@@ -572,6 +606,9 @@ static long bch2_ioctl_disk_resize_journal(struct bch_fs *c,
struct bch_dev *ca;
int ret;
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
if ((arg.flags & ~BCH_BY_INDEX) ||
arg.pad)
return -EINVAL;
@@ -597,7 +634,6 @@ do { \
long bch2_fs_ioctl(struct bch_fs *c, unsigned cmd, void __user *arg)
{
- /* ioctls that don't require admin cap: */
switch (cmd) {
case BCH_IOCTL_QUERY_UUID:
return bch2_ioctl_query_uuid(c, arg);
@@ -605,12 +641,6 @@ long bch2_fs_ioctl(struct bch_fs *c, unsigned cmd, void __user *arg)
return bch2_ioctl_fs_usage(c, arg);
case BCH_IOCTL_DEV_USAGE:
return bch2_ioctl_dev_usage(c, arg);
- }
-
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
-
- switch (cmd) {
#if 0
case BCH_IOCTL_START:
BCH_IOCTL(start, struct bch_ioctl_start);
@@ -626,7 +656,6 @@ long bch2_fs_ioctl(struct bch_fs *c, unsigned cmd, void __user *arg)
if (!test_bit(BCH_FS_STARTED, &c->flags))
return -EINVAL;
- /* ioctls that do require admin cap: */
switch (cmd) {
case BCH_IOCTL_DISK_ADD:
BCH_IOCTL(disk_add, struct bch_ioctl_disk);
diff --git a/fs/bcachefs/checksum.c b/fs/bcachefs/checksum.c
index a01073e54a33..fbe8603cfb30 100644
--- a/fs/bcachefs/checksum.c
+++ b/fs/bcachefs/checksum.c
@@ -6,83 +6,108 @@
#include <linux/crc32c.h>
#include <linux/crypto.h>
+#include <linux/xxhash.h>
#include <linux/key.h>
#include <linux/random.h>
#include <linux/scatterlist.h>
#include <crypto/algapi.h>
-#include <crypto/chacha20.h>
+#include <crypto/chacha.h>
#include <crypto/hash.h>
#include <crypto/poly1305.h>
#include <crypto/skcipher.h>
#include <keys/user-type.h>
-static u64 bch2_checksum_init(unsigned type)
+/*
+ * struct bch2_checksum_state is an abstraction of the checksum state calculated over different pages.
+ * It allows pages to be merged without the checksum algorithm losing its state.
+ * For native checksum algorithms (like crc), a default seed value will do.
+ * For hash-like algorithms, a state needs to be stored.
+ */
+
+struct bch2_checksum_state {
+ union {
+ u64 seed;
+ struct xxh64_state h64state;
+ };
+ unsigned int type;
+};
+
+static void bch2_checksum_init(struct bch2_checksum_state *state)
{
- switch (type) {
- case BCH_CSUM_NONE:
- return 0;
- case BCH_CSUM_CRC32C_NONZERO:
- return U32_MAX;
- case BCH_CSUM_CRC64_NONZERO:
- return U64_MAX;
- case BCH_CSUM_CRC32C:
- return 0;
- case BCH_CSUM_CRC64:
- return 0;
+ switch (state->type) {
+ case BCH_CSUM_none:
+ case BCH_CSUM_crc32c:
+ case BCH_CSUM_crc64:
+ state->seed = 0;
+ break;
+ case BCH_CSUM_crc32c_nonzero:
+ state->seed = U32_MAX;
+ break;
+ case BCH_CSUM_crc64_nonzero:
+ state->seed = U64_MAX;
+ break;
+ case BCH_CSUM_xxhash:
+ xxh64_reset(&state->h64state, 0);
+ break;
default:
BUG();
}
}
-static u64 bch2_checksum_final(unsigned type, u64 crc)
+static u64 bch2_checksum_final(const struct bch2_checksum_state *state)
{
- switch (type) {
- case BCH_CSUM_NONE:
- return 0;
- case BCH_CSUM_CRC32C_NONZERO:
- return crc ^ U32_MAX;
- case BCH_CSUM_CRC64_NONZERO:
- return crc ^ U64_MAX;
- case BCH_CSUM_CRC32C:
- return crc;
- case BCH_CSUM_CRC64:
- return crc;
+ switch (state->type) {
+ case BCH_CSUM_none:
+ case BCH_CSUM_crc32c:
+ case BCH_CSUM_crc64:
+ return state->seed;
+ case BCH_CSUM_crc32c_nonzero:
+ return state->seed ^ U32_MAX;
+ case BCH_CSUM_crc64_nonzero:
+ return state->seed ^ U64_MAX;
+ case BCH_CSUM_xxhash:
+ return xxh64_digest(&state->h64state);
default:
BUG();
}
}
-static u64 bch2_checksum_update(unsigned type, u64 crc, const void *data, size_t len)
+static void bch2_checksum_update(struct bch2_checksum_state *state, const void *data, size_t len)
{
- switch (type) {
- case BCH_CSUM_NONE:
- return 0;
- case BCH_CSUM_CRC32C_NONZERO:
- case BCH_CSUM_CRC32C:
- return crc32c(crc, data, len);
- case BCH_CSUM_CRC64_NONZERO:
- case BCH_CSUM_CRC64:
- return crc64_be(crc, data, len);
+ switch (state->type) {
+ case BCH_CSUM_none:
+ return;
+ case BCH_CSUM_crc32c_nonzero:
+ case BCH_CSUM_crc32c:
+ state->seed = crc32c(state->seed, data, len);
+ break;
+ case BCH_CSUM_crc64_nonzero:
+ case BCH_CSUM_crc64:
+ state->seed = crc64_be(state->seed, data, len);
+ break;
+ case BCH_CSUM_xxhash:
+ xxh64_update(&state->h64state, data, len);
+ break;
default:
BUG();
}
}
-static inline void do_encrypt_sg(struct crypto_skcipher *tfm,
+static inline void do_encrypt_sg(struct crypto_sync_skcipher *tfm,
struct nonce nonce,
struct scatterlist *sg, size_t len)
{
- SKCIPHER_REQUEST_ON_STACK(req, tfm);
+ SYNC_SKCIPHER_REQUEST_ON_STACK(req, tfm);
int ret;
- skcipher_request_set_tfm(req, tfm);
+ skcipher_request_set_sync_tfm(req, tfm);
skcipher_request_set_crypt(req, sg, sg, len, nonce.d);
ret = crypto_skcipher_encrypt(req);
BUG_ON(ret);
}
-static inline void do_encrypt(struct crypto_skcipher *tfm,
+static inline void do_encrypt(struct crypto_sync_skcipher *tfm,
struct nonce nonce,
void *buf, size_t len)
{
@@ -95,8 +120,8 @@ static inline void do_encrypt(struct crypto_skcipher *tfm,
int bch2_chacha_encrypt_key(struct bch_key *key, struct nonce nonce,
void *buf, size_t len)
{
- struct crypto_skcipher *chacha20 =
- crypto_alloc_skcipher("chacha20", 0, 0);
+ struct crypto_sync_skcipher *chacha20 =
+ crypto_alloc_sync_skcipher("chacha20", 0, 0);
int ret;
if (!chacha20) {
@@ -104,7 +129,8 @@ int bch2_chacha_encrypt_key(struct bch_key *key, struct nonce nonce,
return PTR_ERR(chacha20);
}
- ret = crypto_skcipher_setkey(chacha20, (void *) key, sizeof(*key));
+ ret = crypto_skcipher_setkey(&chacha20->base,
+ (void *) key, sizeof(*key));
if (ret) {
pr_err("crypto_skcipher_setkey() error: %i", ret);
goto err;
@@ -112,7 +138,7 @@ int bch2_chacha_encrypt_key(struct bch_key *key, struct nonce nonce,
do_encrypt(chacha20, nonce, buf, len);
err:
- crypto_free_skcipher(chacha20);
+ crypto_free_sync_skcipher(chacha20);
return ret;
}
@@ -135,21 +161,24 @@ struct bch_csum bch2_checksum(struct bch_fs *c, unsigned type,
struct nonce nonce, const void *data, size_t len)
{
switch (type) {
- case BCH_CSUM_NONE:
- case BCH_CSUM_CRC32C_NONZERO:
- case BCH_CSUM_CRC64_NONZERO:
- case BCH_CSUM_CRC32C:
- case BCH_CSUM_CRC64: {
- u64 crc = bch2_checksum_init(type);
+ case BCH_CSUM_none:
+ case BCH_CSUM_crc32c_nonzero:
+ case BCH_CSUM_crc64_nonzero:
+ case BCH_CSUM_crc32c:
+ case BCH_CSUM_xxhash:
+ case BCH_CSUM_crc64: {
+ struct bch2_checksum_state state;
+
+ state.type = type;
- crc = bch2_checksum_update(type, crc, data, len);
- crc = bch2_checksum_final(type, crc);
+ bch2_checksum_init(&state);
+ bch2_checksum_update(&state, data, len);
- return (struct bch_csum) { .lo = cpu_to_le64(crc) };
+ return (struct bch_csum) { .lo = cpu_to_le64(bch2_checksum_final(&state)) };
}
- case BCH_CSUM_CHACHA20_POLY1305_80:
- case BCH_CSUM_CHACHA20_POLY1305_128: {
+ case BCH_CSUM_chacha20_poly1305_80:
+ case BCH_CSUM_chacha20_poly1305_128: {
SHASH_DESC_ON_STACK(desc, c->poly1305);
u8 digest[POLY1305_DIGEST_SIZE];
struct bch_csum ret = { 0 };
@@ -183,33 +212,34 @@ static struct bch_csum __bch2_checksum_bio(struct bch_fs *c, unsigned type,
struct bio_vec bv;
switch (type) {
- case BCH_CSUM_NONE:
+ case BCH_CSUM_none:
return (struct bch_csum) { 0 };
- case BCH_CSUM_CRC32C_NONZERO:
- case BCH_CSUM_CRC64_NONZERO:
- case BCH_CSUM_CRC32C:
- case BCH_CSUM_CRC64: {
- u64 crc = bch2_checksum_init(type);
+ case BCH_CSUM_crc32c_nonzero:
+ case BCH_CSUM_crc64_nonzero:
+ case BCH_CSUM_crc32c:
+ case BCH_CSUM_xxhash:
+ case BCH_CSUM_crc64: {
+ struct bch2_checksum_state state;
+
+ state.type = type;
+ bch2_checksum_init(&state);
#ifdef CONFIG_HIGHMEM
__bio_for_each_segment(bv, bio, *iter, *iter) {
void *p = kmap_atomic(bv.bv_page) + bv.bv_offset;
- crc = bch2_checksum_update(type,
- crc, p, bv.bv_len);
+ bch2_checksum_update(&state, p, bv.bv_len);
kunmap_atomic(p);
}
#else
- __bio_for_each_contig_segment(bv, bio, *iter, *iter)
- crc = bch2_checksum_update(type, crc,
- page_address(bv.bv_page) + bv.bv_offset,
+ __bio_for_each_bvec(bv, bio, *iter, *iter)
+ bch2_checksum_update(&state, page_address(bv.bv_page) + bv.bv_offset,
bv.bv_len);
#endif
- crc = bch2_checksum_final(type, crc);
- return (struct bch_csum) { .lo = cpu_to_le64(crc) };
+ return (struct bch_csum) { .lo = cpu_to_le64(bch2_checksum_final(&state)) };
}
- case BCH_CSUM_CHACHA20_POLY1305_80:
- case BCH_CSUM_CHACHA20_POLY1305_128: {
+ case BCH_CSUM_chacha20_poly1305_80:
+ case BCH_CSUM_chacha20_poly1305_128: {
SHASH_DESC_ON_STACK(desc, c->poly1305);
u8 digest[POLY1305_DIGEST_SIZE];
struct bch_csum ret = { 0 };
@@ -224,7 +254,7 @@ static struct bch_csum __bch2_checksum_bio(struct bch_fs *c, unsigned type,
kunmap_atomic(p);
}
#else
- __bio_for_each_contig_segment(bv, bio, *iter, *iter)
+ __bio_for_each_bvec(bv, bio, *iter, *iter)
crypto_shash_update(desc,
page_address(bv.bv_page) + bv.bv_offset,
bv.bv_len);
@@ -283,16 +313,22 @@ void bch2_encrypt_bio(struct bch_fs *c, unsigned type,
struct bch_csum bch2_checksum_merge(unsigned type, struct bch_csum a,
struct bch_csum b, size_t b_len)
{
+ struct bch2_checksum_state state;
+
+ state.type = type;
+ bch2_checksum_init(&state);
+ state.seed = a.lo;
+
BUG_ON(!bch2_checksum_mergeable(type));
while (b_len) {
unsigned b = min_t(unsigned, b_len, PAGE_SIZE);
- a.lo = bch2_checksum_update(type, a.lo,
+ bch2_checksum_update(&state,
page_address(ZERO_PAGE(0)), b);
b_len -= b;
}
-
+ a.lo = bch2_checksum_final(&state);
a.lo ^= b.lo;
a.hi ^= b.hi;
return a;
@@ -463,7 +499,7 @@ err:
static int bch2_alloc_ciphers(struct bch_fs *c)
{
if (!c->chacha20)
- c->chacha20 = crypto_alloc_skcipher("chacha20", 0, 0);
+ c->chacha20 = crypto_alloc_sync_skcipher("chacha20", 0, 0);
if (IS_ERR(c->chacha20)) {
bch_err(c, "error requesting chacha20 module: %li",
PTR_ERR(c->chacha20));
@@ -546,7 +582,7 @@ int bch2_enable_encryption(struct bch_fs *c, bool keyed)
goto err;
}
- ret = crypto_skcipher_setkey(c->chacha20,
+ ret = crypto_skcipher_setkey(&c->chacha20->base,
(void *) &key.key, sizeof(key.key));
if (ret)
goto err;
@@ -574,7 +610,7 @@ void bch2_fs_encryption_exit(struct bch_fs *c)
if (!IS_ERR_OR_NULL(c->poly1305))
crypto_free_shash(c->poly1305);
if (!IS_ERR_OR_NULL(c->chacha20))
- crypto_free_skcipher(c->chacha20);
+ crypto_free_sync_skcipher(c->chacha20);
if (!IS_ERR_OR_NULL(c->sha256))
crypto_free_shash(c->sha256);
}
@@ -606,7 +642,7 @@ int bch2_fs_encryption_init(struct bch_fs *c)
if (ret)
goto out;
- ret = crypto_skcipher_setkey(c->chacha20,
+ ret = crypto_skcipher_setkey(&c->chacha20->base,
(void *) &key.key, sizeof(key.key));
if (ret)
goto out;
diff --git a/fs/bcachefs/checksum.h b/fs/bcachefs/checksum.h
index a8af0603c2c0..f5c1a609c5c4 100644
--- a/fs/bcachefs/checksum.h
+++ b/fs/bcachefs/checksum.h
@@ -7,15 +7,15 @@
#include "super-io.h"
#include <linux/crc64.h>
-#include <crypto/chacha20.h>
+#include <crypto/chacha.h>
static inline bool bch2_checksum_mergeable(unsigned type)
{
switch (type) {
- case BCH_CSUM_NONE:
- case BCH_CSUM_CRC32C:
- case BCH_CSUM_CRC64:
+ case BCH_CSUM_none:
+ case BCH_CSUM_crc32c:
+ case BCH_CSUM_crc64:
return true;
default:
return false;
@@ -78,11 +78,13 @@ static inline enum bch_csum_type bch2_csum_opt_to_type(enum bch_csum_opts type,
{
switch (type) {
case BCH_CSUM_OPT_none:
- return BCH_CSUM_NONE;
+ return BCH_CSUM_none;
case BCH_CSUM_OPT_crc32c:
- return data ? BCH_CSUM_CRC32C : BCH_CSUM_CRC32C_NONZERO;
+ return data ? BCH_CSUM_crc32c : BCH_CSUM_crc32c_nonzero;
case BCH_CSUM_OPT_crc64:
- return data ? BCH_CSUM_CRC64 : BCH_CSUM_CRC64_NONZERO;
+ return data ? BCH_CSUM_crc64 : BCH_CSUM_crc64_nonzero;
+ case BCH_CSUM_OPT_xxhash:
+ return BCH_CSUM_xxhash;
default:
BUG();
}
@@ -93,8 +95,8 @@ static inline enum bch_csum_type bch2_data_checksum_type(struct bch_fs *c,
{
if (c->sb.encryption_type)
return c->opts.wide_macs
- ? BCH_CSUM_CHACHA20_POLY1305_128
- : BCH_CSUM_CHACHA20_POLY1305_80;
+ ? BCH_CSUM_chacha20_poly1305_128
+ : BCH_CSUM_chacha20_poly1305_80;
return bch2_csum_opt_to_type(opt, true);
}
@@ -102,7 +104,7 @@ static inline enum bch_csum_type bch2_data_checksum_type(struct bch_fs *c,
static inline enum bch_csum_type bch2_meta_checksum_type(struct bch_fs *c)
{
if (c->sb.encryption_type)
- return BCH_CSUM_CHACHA20_POLY1305_128;
+ return BCH_CSUM_chacha20_poly1305_128;
return bch2_csum_opt_to_type(c->opts.metadata_checksum, false);
}
@@ -138,9 +140,9 @@ static inline bool bch2_crc_cmp(struct bch_csum l, struct bch_csum r)
/* for skipping ahead and encrypting/decrypting at an offset: */
static inline struct nonce nonce_add(struct nonce nonce, unsigned offset)
{
- EBUG_ON(offset & (CHACHA20_BLOCK_SIZE - 1));
+ EBUG_ON(offset & (CHACHA_BLOCK_SIZE - 1));
- le32_add_cpu(&nonce.d[0], offset / CHACHA20_BLOCK_SIZE);
+ le32_add_cpu(&nonce.d[0], offset / CHACHA_BLOCK_SIZE);
return nonce;
}
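
The checksum.c/checksum.h changes above replace the bare `u64 crc` plumbing with a `struct bch2_checksum_state` carrying init/update/final, so stateful hashes such as xxhash can be fed page by page. The following standalone sketch is not part of the patch: it uses a simple FNV-1a hash as a stand-in for crc32c/xxhash purely to illustrate the same pattern, i.e. that a checksum computed across separate chunks matches a single pass because the running state lives in the struct.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Illustrative stand-in for struct bch2_checksum_state: hold whatever
 * running state the chosen algorithm needs between update() calls. */
struct csum_state {
	uint64_t h;			/* running FNV-1a hash */
};

static void csum_init(struct csum_state *s)
{
	s->h = 0xcbf29ce484222325ULL;	/* FNV-1a offset basis */
}

static void csum_update(struct csum_state *s, const void *data, size_t len)
{
	const unsigned char *p = data;

	while (len--) {
		s->h ^= *p++;
		s->h *= 0x100000001b3ULL;	/* FNV-1a prime */
	}
}

static uint64_t csum_final(const struct csum_state *s)
{
	return s->h;
}

int main(void)
{
	const char msg[] = "checksummed across two pages";
	struct csum_state whole, split;

	/* One pass over the whole buffer ... */
	csum_init(&whole);
	csum_update(&whole, msg, sizeof(msg));

	/* ... equals two passes over disjoint chunks, because the state
	 * is carried in the struct rather than reset per call. */
	csum_init(&split);
	csum_update(&split, msg, 10);
	csum_update(&split, msg + 10, sizeof(msg) - 10);

	printf("whole=%016llx split=%016llx\n",
	       (unsigned long long) csum_final(&whole),
	       (unsigned long long) csum_final(&split));
	return 0;
}
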
diff --git a/fs/bcachefs/compress.c b/fs/bcachefs/compress.c
index 773cf87812ad..f63651d291e5 100644
--- a/fs/bcachefs/compress.c
+++ b/fs/bcachefs/compress.c
@@ -45,7 +45,7 @@ static bool bio_phys_contig(struct bio *bio, struct bvec_iter start)
struct bvec_iter iter;
void *expected_start = NULL;
- __bio_for_each_segment(bv, bio, iter, start) {
+ __bio_for_each_bvec(bv, bio, iter, start) {
if (expected_start &&
expected_start != page_address(bv.bv_page) + bv.bv_offset)
return false;
diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c
index 4215c119e0a2..294e4baf4deb 100644
--- a/fs/bcachefs/debug.c
+++ b/fs/bcachefs/debug.c
@@ -133,7 +133,7 @@ void __bch2_btree_verify(struct bch_fs *c, struct btree *b)
if (c->opts.nochanges)
return;
- btree_node_io_lock(b);
+ bch2_btree_node_io_lock(b);
mutex_lock(&c->verify_lock);
if (!c->verify_ondisk) {
@@ -176,7 +176,7 @@ void __bch2_btree_verify(struct bch_fs *c, struct btree *b)
}
out:
mutex_unlock(&c->verify_lock);
- btree_node_io_unlock(b);
+ bch2_btree_node_io_unlock(b);
}
#ifdef CONFIG_DEBUG_FS
@@ -243,7 +243,7 @@ static ssize_t bch2_read_btree(struct file *file, char __user *buf,
{
struct dump_iter *i = file->private_data;
struct btree_trans trans;
- struct btree_iter *iter;
+ struct btree_iter iter;
struct bkey_s_c k;
int err;
@@ -260,10 +260,10 @@ static ssize_t bch2_read_btree(struct file *file, char __user *buf,
bch2_trans_init(&trans, i->c, 0, 0);
- iter = bch2_trans_get_iter(&trans, i->id, i->from,
- BTREE_ITER_PREFETCH|
- BTREE_ITER_ALL_SNAPSHOTS);
- k = bch2_btree_iter_peek(iter);
+ bch2_trans_iter_init(&trans, &iter, i->id, i->from,
+ BTREE_ITER_PREFETCH|
+ BTREE_ITER_ALL_SNAPSHOTS);
+ k = bch2_btree_iter_peek(&iter);
while (k.k && !(err = bkey_err(k))) {
bch2_bkey_val_to_text(&PBUF(i->buf), i->c, k);
@@ -272,8 +272,8 @@ static ssize_t bch2_read_btree(struct file *file, char __user *buf,
i->buf[i->bytes] = '\n';
i->bytes++;
- k = bch2_btree_iter_next(iter);
- i->from = iter->pos;
+ k = bch2_btree_iter_next(&iter);
+ i->from = iter.pos;
err = flush_buf(i);
if (err)
@@ -282,7 +282,7 @@ static ssize_t bch2_read_btree(struct file *file, char __user *buf,
if (!i->size)
break;
}
- bch2_trans_iter_put(&trans, iter);
+ bch2_trans_iter_exit(&trans, &iter);
bch2_trans_exit(&trans);
@@ -301,7 +301,7 @@ static ssize_t bch2_read_btree_formats(struct file *file, char __user *buf,
{
struct dump_iter *i = file->private_data;
struct btree_trans trans;
- struct btree_iter *iter;
+ struct btree_iter iter;
struct btree *b;
int err;
@@ -313,12 +313,12 @@ static ssize_t bch2_read_btree_formats(struct file *file, char __user *buf,
if (err)
return err;
- if (!i->size || !bpos_cmp(POS_MAX, i->from))
+ if (!i->size || !bpos_cmp(SPOS_MAX, i->from))
return i->ret;
bch2_trans_init(&trans, i->c, 0, 0);
- for_each_btree_node(&trans, iter, i->id, i->from, 0, b) {
+ for_each_btree_node(&trans, iter, i->id, i->from, 0, b, err) {
bch2_btree_node_to_text(&PBUF(i->buf), i->c, b);
i->bytes = strlen(i->buf);
err = flush_buf(i);
@@ -329,14 +329,14 @@ static ssize_t bch2_read_btree_formats(struct file *file, char __user *buf,
* can't easily correctly restart a btree node traversal across
* all nodes, meh
*/
- i->from = bpos_cmp(POS_MAX, b->key.k.p)
+ i->from = bpos_cmp(SPOS_MAX, b->key.k.p)
? bpos_successor(b->key.k.p)
: b->key.k.p;
if (!i->size)
break;
}
- bch2_trans_iter_put(&trans, iter);
+ bch2_trans_iter_exit(&trans, &iter);
bch2_trans_exit(&trans);
@@ -355,7 +355,7 @@ static ssize_t bch2_read_bfloat_failed(struct file *file, char __user *buf,
{
struct dump_iter *i = file->private_data;
struct btree_trans trans;
- struct btree_iter *iter;
+ struct btree_iter iter;
struct bkey_s_c k;
struct btree *prev_node = NULL;
int err;
@@ -373,11 +373,11 @@ static ssize_t bch2_read_bfloat_failed(struct file *file, char __user *buf,
bch2_trans_init(&trans, i->c, 0, 0);
- iter = bch2_trans_get_iter(&trans, i->id, i->from, BTREE_ITER_PREFETCH);
+ bch2_trans_iter_init(&trans, &iter, i->id, i->from, BTREE_ITER_PREFETCH);
- while ((k = bch2_btree_iter_peek(iter)).k &&
+ while ((k = bch2_btree_iter_peek(&iter)).k &&
!(err = bkey_err(k))) {
- struct btree_iter_level *l = &iter->l[0];
+ struct btree_path_level *l = &iter.path->l[0];
struct bkey_packed *_k =
bch2_btree_node_iter_peek(&l->iter, l->b);
@@ -396,8 +396,8 @@ static ssize_t bch2_read_bfloat_failed(struct file *file, char __user *buf,
if (err)
break;
- bch2_btree_iter_advance(iter);
- i->from = iter->pos;
+ bch2_btree_iter_advance(&iter);
+ i->from = iter.pos;
err = flush_buf(i);
if (err)
diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c
index 3bf6379cefe6..fe4a85a6a8cb 100644
--- a/fs/bcachefs/dirent.c
+++ b/fs/bcachefs/dirent.c
@@ -8,6 +8,7 @@
#include "fs.h"
#include "keylist.h"
#include "str_hash.h"
+#include "subvolume.h"
#include <linux/dcache.h>
@@ -63,6 +64,15 @@ static bool dirent_cmp_bkey(struct bkey_s_c _l, struct bkey_s_c _r)
return l_len - r_len ?: memcmp(l.v->d_name, r.v->d_name, l_len);
}
+static bool dirent_is_visible(subvol_inum inum, struct bkey_s_c k)
+{
+ struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);
+
+ if (d.v->d_type == DT_SUBVOL)
+ return le32_to_cpu(d.v->d_parent_subvol) == inum.subvol;
+ return true;
+}
+
const struct bch_hash_desc bch2_dirent_hash_desc = {
.btree_id = BTREE_ID_dirents,
.key_type = KEY_TYPE_dirent,
@@ -70,6 +80,7 @@ const struct bch_hash_desc bch2_dirent_hash_desc = {
.hash_bkey = dirent_hash_bkey,
.cmp_key = dirent_cmp_key,
.cmp_bkey = dirent_cmp_bkey,
+ .is_visible = dirent_is_visible,
};
const char *bch2_dirent_invalid(const struct bch_fs *c, struct bkey_s_c k)
@@ -99,7 +110,8 @@ const char *bch2_dirent_invalid(const struct bch_fs *c, struct bkey_s_c k)
if (memchr(d.v->d_name, '/', len))
return "invalid name";
- if (le64_to_cpu(d.v->d_inum) == d.k->p.inode)
+ if (d.v->d_type != DT_SUBVOL &&
+ le64_to_cpu(d.v->d_inum) == d.k->p.inode)
return "dirent points to own directory";
return NULL;
@@ -112,11 +124,16 @@ void bch2_dirent_to_text(struct printbuf *out, struct bch_fs *c,
bch_scnmemcpy(out, d.v->d_name,
bch2_dirent_name_bytes(d));
- pr_buf(out, " -> %llu type %u", d.v->d_inum, d.v->d_type);
+ pr_buf(out, " -> %llu type %s",
+ d.v->d_type != DT_SUBVOL
+ ? le64_to_cpu(d.v->d_inum)
+ : le32_to_cpu(d.v->d_child_subvol),
+ bch2_d_type_str(d.v->d_type));
}
static struct bkey_i_dirent *dirent_create_key(struct btree_trans *trans,
- u8 type, const struct qstr *name, u64 dst)
+ subvol_inum dir, u8 type,
+ const struct qstr *name, u64 dst)
{
struct bkey_i_dirent *dirent;
unsigned u64s = BKEY_U64s + dirent_val_u64s(name->len);
@@ -132,7 +149,14 @@ static struct bkey_i_dirent *dirent_create_key(struct btree_trans *trans,
bkey_dirent_init(&dirent->k_i);
dirent->k.u64s = u64s;
- dirent->v.d_inum = cpu_to_le64(dst);
+
+ if (type != DT_SUBVOL) {
+ dirent->v.d_inum = cpu_to_le64(dst);
+ } else {
+ dirent->v.d_parent_subvol = cpu_to_le32(dir.subvol);
+ dirent->v.d_child_subvol = cpu_to_le32(dst);
+ }
+
dirent->v.d_type = type;
memcpy(dirent->v.d_name, name->name, name->len);
@@ -146,21 +170,21 @@ static struct bkey_i_dirent *dirent_create_key(struct btree_trans *trans,
return dirent;
}
-int bch2_dirent_create(struct btree_trans *trans,
- u64 dir_inum, const struct bch_hash_info *hash_info,
+int bch2_dirent_create(struct btree_trans *trans, subvol_inum dir,
+ const struct bch_hash_info *hash_info,
u8 type, const struct qstr *name, u64 dst_inum,
u64 *dir_offset, int flags)
{
struct bkey_i_dirent *dirent;
int ret;
- dirent = dirent_create_key(trans, type, name, dst_inum);
+ dirent = dirent_create_key(trans, dir, type, name, dst_inum);
ret = PTR_ERR_OR_ZERO(dirent);
if (ret)
return ret;
ret = bch2_hash_set(trans, bch2_dirent_hash_desc, hash_info,
- dir_inum, &dirent->k_i, flags);
+ dir, &dirent->k_i, flags);
*dir_offset = dirent->k.p.offset;
return ret;
@@ -173,75 +197,130 @@ static void dirent_copy_target(struct bkey_i_dirent *dst,
dst->v.d_type = src.v->d_type;
}
+int bch2_dirent_read_target(struct btree_trans *trans, subvol_inum dir,
+ struct bkey_s_c_dirent d, subvol_inum *target)
+{
+ struct bch_subvolume s;
+ int ret = 0;
+
+ if (d.v->d_type == DT_SUBVOL &&
+ d.v->d_parent_subvol != dir.subvol)
+ return 1;
+
+ if (likely(d.v->d_type != DT_SUBVOL)) {
+ target->subvol = dir.subvol;
+ target->inum = le64_to_cpu(d.v->d_inum);
+ } else {
+ target->subvol = le32_to_cpu(d.v->d_child_subvol);
+
+ ret = bch2_subvolume_get(trans, target->subvol, true, BTREE_ITER_CACHED, &s);
+
+ target->inum = le64_to_cpu(s.inode);
+ }
+
+ return ret;
+}
+
int bch2_dirent_rename(struct btree_trans *trans,
- u64 src_dir, struct bch_hash_info *src_hash,
- u64 dst_dir, struct bch_hash_info *dst_hash,
- const struct qstr *src_name, u64 *src_inum, u64 *src_offset,
- const struct qstr *dst_name, u64 *dst_inum, u64 *dst_offset,
- enum bch_rename_mode mode)
+ subvol_inum src_dir, struct bch_hash_info *src_hash,
+ subvol_inum dst_dir, struct bch_hash_info *dst_hash,
+ const struct qstr *src_name, subvol_inum *src_inum, u64 *src_offset,
+ const struct qstr *dst_name, subvol_inum *dst_inum, u64 *dst_offset,
+ enum bch_rename_mode mode)
{
- struct btree_iter *src_iter = NULL, *dst_iter = NULL;
+ struct btree_iter src_iter = { NULL };
+ struct btree_iter dst_iter = { NULL };
struct bkey_s_c old_src, old_dst;
struct bkey_i_dirent *new_src = NULL, *new_dst = NULL;
struct bpos dst_pos =
- POS(dst_dir, bch2_dirent_hash(dst_hash, dst_name));
+ POS(dst_dir.inum, bch2_dirent_hash(dst_hash, dst_name));
+ unsigned src_type = 0, dst_type = 0, src_update_flags = 0;
int ret = 0;
- *src_inum = *dst_inum = 0;
+ if (src_dir.subvol != dst_dir.subvol)
+ return -EXDEV;
- /*
- * Lookup dst:
- *
- * Note that in BCH_RENAME mode, we're _not_ checking if
- * the target already exists - we're relying on the VFS
- * to do that check for us for correctness:
- */
- dst_iter = mode == BCH_RENAME
- ? bch2_hash_hole(trans, bch2_dirent_hash_desc,
- dst_hash, dst_dir, dst_name)
- : bch2_hash_lookup(trans, bch2_dirent_hash_desc,
- dst_hash, dst_dir, dst_name,
- BTREE_ITER_INTENT);
- ret = PTR_ERR_OR_ZERO(dst_iter);
+ memset(src_inum, 0, sizeof(*src_inum));
+ memset(dst_inum, 0, sizeof(*dst_inum));
+
+ /* Lookup src: */
+ ret = bch2_hash_lookup(trans, &src_iter, bch2_dirent_hash_desc,
+ src_hash, src_dir, src_name,
+ BTREE_ITER_INTENT);
if (ret)
goto out;
- old_dst = bch2_btree_iter_peek_slot(dst_iter);
-
- if (mode != BCH_RENAME)
- *dst_inum = le64_to_cpu(bkey_s_c_to_dirent(old_dst).v->d_inum);
- if (mode != BCH_RENAME_EXCHANGE)
- *src_offset = dst_iter->pos.offset;
+ old_src = bch2_btree_iter_peek_slot(&src_iter);
+ ret = bkey_err(old_src);
+ if (ret)
+ goto out;
- /* Lookup src: */
- src_iter = bch2_hash_lookup(trans, bch2_dirent_hash_desc,
- src_hash, src_dir, src_name,
- BTREE_ITER_INTENT);
- ret = PTR_ERR_OR_ZERO(src_iter);
+ ret = bch2_dirent_read_target(trans, src_dir,
+ bkey_s_c_to_dirent(old_src), src_inum);
if (ret)
goto out;
- old_src = bch2_btree_iter_peek_slot(src_iter);
- *src_inum = le64_to_cpu(bkey_s_c_to_dirent(old_src).v->d_inum);
+ src_type = bkey_s_c_to_dirent(old_src).v->d_type;
+
+ if (src_type == DT_SUBVOL && mode == BCH_RENAME_EXCHANGE)
+ return -EOPNOTSUPP;
+
+
+ /* Lookup dst: */
+ if (mode == BCH_RENAME) {
+ /*
+ * Note that we're _not_ checking if the target already exists -
+ * we're relying on the VFS to do that check for us for
+ * correctness:
+ */
+ ret = bch2_hash_hole(trans, &dst_iter, bch2_dirent_hash_desc,
+ dst_hash, dst_dir, dst_name);
+ if (ret)
+ goto out;
+ } else {
+ ret = bch2_hash_lookup(trans, &dst_iter, bch2_dirent_hash_desc,
+ dst_hash, dst_dir, dst_name,
+ BTREE_ITER_INTENT);
+ if (ret)
+ goto out;
+
+ old_dst = bch2_btree_iter_peek_slot(&dst_iter);
+ ret = bkey_err(old_dst);
+ if (ret)
+ goto out;
+
+ ret = bch2_dirent_read_target(trans, dst_dir,
+ bkey_s_c_to_dirent(old_dst), dst_inum);
+ if (ret)
+ goto out;
+
+ dst_type = bkey_s_c_to_dirent(old_dst).v->d_type;
+
+ if (dst_type == DT_SUBVOL)
+ return -EOPNOTSUPP;
+ }
+
+ if (mode != BCH_RENAME_EXCHANGE)
+ *src_offset = dst_iter.pos.offset;
/* Create new dst key: */
- new_dst = dirent_create_key(trans, 0, dst_name, 0);
+ new_dst = dirent_create_key(trans, dst_dir, 0, dst_name, 0);
ret = PTR_ERR_OR_ZERO(new_dst);
if (ret)
goto out;
dirent_copy_target(new_dst, bkey_s_c_to_dirent(old_src));
- new_dst->k.p = dst_iter->pos;
+ new_dst->k.p = dst_iter.pos;
/* Create new src key: */
if (mode == BCH_RENAME_EXCHANGE) {
- new_src = dirent_create_key(trans, 0, src_name, 0);
+ new_src = dirent_create_key(trans, src_dir, 0, src_name, 0);
ret = PTR_ERR_OR_ZERO(new_src);
if (ret)
goto out;
dirent_copy_target(new_src, bkey_s_c_to_dirent(old_dst));
- new_src->k.p = src_iter->pos;
+ new_src->k.p = src_iter.pos;
} else {
new_src = bch2_trans_kmalloc(trans, sizeof(struct bkey_i));
ret = PTR_ERR_OR_ZERO(new_src);
@@ -249,10 +328,10 @@ int bch2_dirent_rename(struct btree_trans *trans,
goto out;
bkey_init(&new_src->k);
- new_src->k.p = src_iter->pos;
+ new_src->k.p = src_iter.pos;
- if (bkey_cmp(dst_pos, src_iter->pos) <= 0 &&
- bkey_cmp(src_iter->pos, dst_iter->pos) < 0) {
+ if (bkey_cmp(dst_pos, src_iter.pos) <= 0 &&
+ bkey_cmp(src_iter.pos, dst_iter.pos) < 0) {
/*
* We have a hash collision for the new dst key,
* and new_src - the key we're deleting - is between
@@ -265,10 +344,9 @@ int bch2_dirent_rename(struct btree_trans *trans,
* If we're not overwriting, we can just insert
* new_dst at the src position:
*/
- new_dst->k.p = src_iter->pos;
- bch2_trans_update(trans, src_iter,
- &new_dst->k_i, 0);
- goto out_set_offset;
+ new_src = new_dst;
+ new_src->k.p = src_iter.pos;
+ goto out_set_src;
} else {
/* If we're overwriting, we can't insert new_dst
* at a different slot because it has to
@@ -280,7 +358,7 @@ int bch2_dirent_rename(struct btree_trans *trans,
} else {
/* Check if we need a whiteout to delete src: */
ret = bch2_hash_needs_whiteout(trans, bch2_dirent_hash_desc,
- src_hash, src_iter);
+ src_hash, &src_iter);
if (ret < 0)
goto out;
@@ -289,70 +367,108 @@ int bch2_dirent_rename(struct btree_trans *trans,
}
}
- bch2_trans_update(trans, src_iter, &new_src->k_i, 0);
- bch2_trans_update(trans, dst_iter, &new_dst->k_i, 0);
-out_set_offset:
+ bch2_trans_update(trans, &dst_iter, &new_dst->k_i, 0);
+out_set_src:
+
+ /*
+ * If we're deleting a subvolume, we need to really delete the dirent,
+ * not just emit a whiteout in the current snapshot:
+ */
+ if (src_type == DT_SUBVOL) {
+ bch2_btree_iter_set_snapshot(&src_iter, old_src.k->p.snapshot);
+ ret = bch2_btree_iter_traverse(&src_iter);
+ if (ret)
+ goto out;
+
+ new_src->k.p = src_iter.pos;
+ src_update_flags |= BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE;
+ }
+
+ bch2_trans_update(trans, &src_iter, &new_src->k_i, src_update_flags);
+
if (mode == BCH_RENAME_EXCHANGE)
*src_offset = new_src->k.p.offset;
*dst_offset = new_dst->k.p.offset;
out:
- bch2_trans_iter_put(trans, src_iter);
- bch2_trans_iter_put(trans, dst_iter);
+ bch2_trans_iter_exit(trans, &src_iter);
+ bch2_trans_iter_exit(trans, &dst_iter);
return ret;
}
-int bch2_dirent_delete_at(struct btree_trans *trans,
- const struct bch_hash_info *hash_info,
- struct btree_iter *iter)
+int __bch2_dirent_lookup_trans(struct btree_trans *trans,
+ struct btree_iter *iter,
+ subvol_inum dir,
+ const struct bch_hash_info *hash_info,
+ const struct qstr *name, subvol_inum *inum,
+ unsigned flags)
{
- return bch2_hash_delete_at(trans, bch2_dirent_hash_desc,
- hash_info, iter);
-}
+ struct bkey_s_c k;
+ struct bkey_s_c_dirent d;
+ u32 snapshot;
+ int ret;
-struct btree_iter *
-__bch2_dirent_lookup_trans(struct btree_trans *trans, u64 dir_inum,
- const struct bch_hash_info *hash_info,
- const struct qstr *name, unsigned flags)
-{
- return bch2_hash_lookup(trans, bch2_dirent_hash_desc,
- hash_info, dir_inum, name, flags);
+ ret = bch2_subvolume_get_snapshot(trans, dir.subvol, &snapshot);
+ if (ret)
+ return ret;
+
+ ret = bch2_hash_lookup(trans, iter, bch2_dirent_hash_desc,
+ hash_info, dir, name, flags);
+ if (ret)
+ return ret;
+
+ k = bch2_btree_iter_peek_slot(iter);
+ ret = bkey_err(k);
+ if (ret)
+ goto err;
+
+ d = bkey_s_c_to_dirent(k);
+
+ ret = bch2_dirent_read_target(trans, dir, d, inum);
+ if (ret > 0)
+ ret = -ENOENT;
+err:
+ if (ret)
+ bch2_trans_iter_exit(trans, iter);
+
+ return ret;
}
-u64 bch2_dirent_lookup(struct bch_fs *c, u64 dir_inum,
+u64 bch2_dirent_lookup(struct bch_fs *c, subvol_inum dir,
const struct bch_hash_info *hash_info,
- const struct qstr *name)
+ const struct qstr *name, subvol_inum *inum)
{
struct btree_trans trans;
- struct btree_iter *iter;
- struct bkey_s_c k;
- u64 inum = 0;
+ struct btree_iter iter;
+ int ret;
bch2_trans_init(&trans, c, 0, 0);
-
- iter = __bch2_dirent_lookup_trans(&trans, dir_inum,
- hash_info, name, 0);
- if (IS_ERR(iter)) {
- BUG_ON(PTR_ERR(iter) == -EINTR);
- goto out;
- }
-
- k = bch2_btree_iter_peek_slot(iter);
- inum = le64_to_cpu(bkey_s_c_to_dirent(k).v->d_inum);
- bch2_trans_iter_put(&trans, iter);
-out:
+retry:
+ bch2_trans_begin(&trans);
+
+ ret = __bch2_dirent_lookup_trans(&trans, &iter, dir, hash_info,
+ name, inum, 0);
+ if (ret == -EINTR)
+ goto retry;
+ if (!ret)
+ bch2_trans_iter_exit(&trans, &iter);
bch2_trans_exit(&trans);
- return inum;
+ return ret;
}
-int bch2_empty_dir_trans(struct btree_trans *trans, u64 dir_inum)
+int bch2_empty_dir_trans(struct btree_trans *trans, subvol_inum dir)
{
- struct btree_iter *iter;
+ struct btree_iter iter;
struct bkey_s_c k;
+ u32 snapshot;
int ret;
- for_each_btree_key(trans, iter, BTREE_ID_dirents,
- POS(dir_inum, 0), 0, k, ret) {
- if (k.k->p.inode > dir_inum)
+ ret = bch2_subvolume_get_snapshot(trans, dir.subvol, &snapshot);
+ if (ret)
+ return ret;
+
+ for_each_btree_key_norestart(trans, iter, BTREE_ID_dirents,
+ SPOS(dir.inum, 0, snapshot), 0, k, ret) {
+ if (k.k->p.inode > dir.inum)
break;
if (k.k->type == KEY_TYPE_dirent) {
@@ -360,24 +476,32 @@ int bch2_empty_dir_trans(struct btree_trans *trans, u64 dir_inum)
break;
}
}
- bch2_trans_iter_put(trans, iter);
+ bch2_trans_iter_exit(trans, &iter);
return ret;
}
-int bch2_readdir(struct bch_fs *c, u64 inum, struct dir_context *ctx)
+int bch2_readdir(struct bch_fs *c, subvol_inum inum, struct dir_context *ctx)
{
struct btree_trans trans;
- struct btree_iter *iter;
+ struct btree_iter iter;
struct bkey_s_c k;
struct bkey_s_c_dirent dirent;
+ subvol_inum target;
+ u32 snapshot;
int ret;
bch2_trans_init(&trans, c, 0, 0);
+retry:
+ bch2_trans_begin(&trans);
- for_each_btree_key(&trans, iter, BTREE_ID_dirents,
- POS(inum, ctx->pos), 0, k, ret) {
- if (k.k->p.inode > inum)
+ ret = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot);
+ if (ret)
+ goto err;
+
+ for_each_btree_key_norestart(&trans, iter, BTREE_ID_dirents,
+ SPOS(inum.inum, ctx->pos, snapshot), 0, k, ret) {
+ if (k.k->p.inode > inum.inum)
break;
if (k.k->type != KEY_TYPE_dirent)
@@ -385,6 +509,12 @@ int bch2_readdir(struct bch_fs *c, u64 inum, struct dir_context *ctx)
dirent = bkey_s_c_to_dirent(k);
+ ret = bch2_dirent_read_target(&trans, inum, dirent, &target);
+ if (ret < 0)
+ break;
+ if (ret)
+ continue;
+
/*
* XXX: dir_emit() can fault and block, while we're holding
* locks
@@ -392,14 +522,25 @@ int bch2_readdir(struct bch_fs *c, u64 inum, struct dir_context *ctx)
ctx->pos = dirent.k->p.offset;
if (!dir_emit(ctx, dirent.v->d_name,
bch2_dirent_name_bytes(dirent),
- le64_to_cpu(dirent.v->d_inum),
- dirent.v->d_type))
+ target.inum,
+ vfs_d_type(dirent.v->d_type)))
break;
ctx->pos = dirent.k->p.offset + 1;
+
+		/*
+		 * read_target looks up subvolumes; we can overflow paths if
+		 * the directory has many subvolumes in it
+		 */
+ ret = btree_trans_too_many_iters(&trans);
+ if (ret)
+ break;
}
- bch2_trans_iter_put(&trans, iter);
+ bch2_trans_iter_exit(&trans, &iter);
+err:
+ if (ret == -EINTR)
+ goto retry;
- ret = bch2_trans_exit(&trans) ?: ret;
+ bch2_trans_exit(&trans);
return ret;
}
diff --git a/fs/bcachefs/dirent.h b/fs/bcachefs/dirent.h
index e1d8ce377d43..1bb4d802bc1d 100644
--- a/fs/bcachefs/dirent.h
+++ b/fs/bcachefs/dirent.h
@@ -29,13 +29,17 @@ static inline unsigned dirent_val_u64s(unsigned len)
sizeof(u64));
}
-int bch2_dirent_create(struct btree_trans *, u64,
+int bch2_dirent_read_target(struct btree_trans *, subvol_inum,
+ struct bkey_s_c_dirent, subvol_inum *);
+
+int bch2_dirent_create(struct btree_trans *, subvol_inum,
const struct bch_hash_info *, u8,
const struct qstr *, u64, u64 *, int);
-int bch2_dirent_delete_at(struct btree_trans *,
- const struct bch_hash_info *,
- struct btree_iter *);
+static inline unsigned vfs_d_type(unsigned type)
+{
+ return type == DT_SUBVOL ? DT_DIR : type;
+}
enum bch_rename_mode {
BCH_RENAME,
@@ -44,20 +48,20 @@ enum bch_rename_mode {
};
int bch2_dirent_rename(struct btree_trans *,
- u64, struct bch_hash_info *,
- u64, struct bch_hash_info *,
- const struct qstr *, u64 *, u64 *,
- const struct qstr *, u64 *, u64 *,
+ subvol_inum, struct bch_hash_info *,
+ subvol_inum, struct bch_hash_info *,
+ const struct qstr *, subvol_inum *, u64 *,
+ const struct qstr *, subvol_inum *, u64 *,
enum bch_rename_mode);
-struct btree_iter *
-__bch2_dirent_lookup_trans(struct btree_trans *, u64,
- const struct bch_hash_info *,
- const struct qstr *, unsigned);
-u64 bch2_dirent_lookup(struct bch_fs *, u64, const struct bch_hash_info *,
- const struct qstr *);
+int __bch2_dirent_lookup_trans(struct btree_trans *, struct btree_iter *,
+ subvol_inum, const struct bch_hash_info *,
+ const struct qstr *, subvol_inum *, unsigned);
+u64 bch2_dirent_lookup(struct bch_fs *, subvol_inum,
+ const struct bch_hash_info *,
+ const struct qstr *, subvol_inum *);
-int bch2_empty_dir_trans(struct btree_trans *, u64);
-int bch2_readdir(struct bch_fs *, u64, struct dir_context *);
+int bch2_empty_dir_trans(struct btree_trans *, subvol_inum);
+int bch2_readdir(struct bch_fs *, subvol_inum, struct dir_context *);
#endif /* _BCACHEFS_DIRENT_H */
diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c
index db6e4f6cac37..bca1b8a7b673 100644
--- a/fs/bcachefs/ec.c
+++ b/fs/bcachefs/ec.c
@@ -392,7 +392,7 @@ static void ec_block_io(struct bch_fs *c, struct ec_stripe_buf *buf,
this_cpu_add(ca->io_done->sectors[rw][data_type], buf->size);
while (offset < bytes) {
- unsigned nr_iovecs = min_t(size_t, BIO_MAX_PAGES,
+ unsigned nr_iovecs = min_t(size_t, BIO_MAX_VECS,
DIV_ROUND_UP(bytes, PAGE_SIZE));
unsigned b = min_t(size_t, bytes - offset,
nr_iovecs << PAGE_SHIFT);
@@ -429,13 +429,14 @@ static void ec_block_io(struct bch_fs *c, struct ec_stripe_buf *buf,
static int get_stripe_key(struct bch_fs *c, u64 idx, struct ec_stripe_buf *stripe)
{
struct btree_trans trans;
- struct btree_iter *iter;
+ struct btree_iter iter;
struct bkey_s_c k;
int ret;
bch2_trans_init(&trans, c, 0, 0);
- iter = bch2_trans_get_iter(&trans, BTREE_ID_stripes, POS(0, idx), BTREE_ITER_SLOTS);
- k = bch2_btree_iter_peek_slot(iter);
+ bch2_trans_iter_init(&trans, &iter, BTREE_ID_stripes,
+ POS(0, idx), BTREE_ITER_SLOTS);
+ k = bch2_btree_iter_peek_slot(&iter);
ret = bkey_err(k);
if (ret)
goto err;
@@ -445,6 +446,7 @@ static int get_stripe_key(struct bch_fs *c, u64 idx, struct ec_stripe_buf *strip
}
bkey_reassemble(&stripe->key.k_i, k);
err:
+ bch2_trans_iter_exit(&trans, &iter);
bch2_trans_exit(&trans);
return ret;
}
@@ -552,19 +554,19 @@ static int __ec_stripe_mem_alloc(struct bch_fs *c, size_t idx, gfp_t gfp)
return 0;
}
-static int ec_stripe_mem_alloc(struct bch_fs *c,
+static int ec_stripe_mem_alloc(struct btree_trans *trans,
struct btree_iter *iter)
{
size_t idx = iter->pos.offset;
int ret = 0;
- if (!__ec_stripe_mem_alloc(c, idx, GFP_NOWAIT|__GFP_NOWARN))
+ if (!__ec_stripe_mem_alloc(trans->c, idx, GFP_NOWAIT|__GFP_NOWARN))
return ret;
- bch2_trans_unlock(iter->trans);
+ bch2_trans_unlock(trans);
ret = -EINTR;
- if (!__ec_stripe_mem_alloc(c, idx, GFP_KERNEL))
+ if (!__ec_stripe_mem_alloc(trans->c, idx, GFP_KERNEL))
return ret;
return -ENOMEM;
@@ -704,7 +706,7 @@ static int ec_stripe_bkey_insert(struct bch_fs *c,
struct disk_reservation *res)
{
struct btree_trans trans;
- struct btree_iter *iter;
+ struct btree_iter iter;
struct bkey_s_c k;
struct bpos min_pos = POS(0, 1);
struct bpos start_pos = bpos_max(min_pos, POS(0, c->ec_stripe_hint));
@@ -719,7 +721,7 @@ retry:
if (bkey_cmp(k.k->p, POS(0, U32_MAX)) > 0) {
if (start_pos.offset) {
start_pos = min_pos;
- bch2_btree_iter_set_pos(iter, start_pos);
+ bch2_btree_iter_set_pos(&iter, start_pos);
continue;
}
@@ -733,19 +735,19 @@ retry:
goto err;
found_slot:
- start_pos = iter->pos;
+ start_pos = iter.pos;
- ret = ec_stripe_mem_alloc(c, iter);
+ ret = ec_stripe_mem_alloc(&trans, &iter);
if (ret)
goto err;
- stripe->k.p = iter->pos;
+ stripe->k.p = iter.pos;
- ret = bch2_trans_update(&trans, iter, &stripe->k_i, 0) ?:
+ ret = bch2_trans_update(&trans, &iter, &stripe->k_i, 0) ?:
bch2_trans_commit(&trans, res, NULL,
BTREE_INSERT_NOFAIL);
err:
- bch2_trans_iter_put(&trans, iter);
+ bch2_trans_iter_exit(&trans, &iter);
if (ret == -EINTR)
goto retry;
@@ -759,15 +761,15 @@ err:
static int ec_stripe_bkey_update(struct btree_trans *trans,
struct bkey_i_stripe *new)
{
- struct btree_iter *iter;
+ struct btree_iter iter;
struct bkey_s_c k;
const struct bch_stripe *existing;
unsigned i;
int ret;
- iter = bch2_trans_get_iter(trans, BTREE_ID_stripes,
- new->k.p, BTREE_ITER_INTENT);
- k = bch2_btree_iter_peek_slot(iter);
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_stripes,
+ new->k.p, BTREE_ITER_INTENT);
+ k = bch2_btree_iter_peek_slot(&iter);
ret = bkey_err(k);
if (ret)
goto err;
@@ -790,9 +792,9 @@ static int ec_stripe_bkey_update(struct btree_trans *trans,
stripe_blockcount_set(&new->v, i,
stripe_blockcount_get(existing, i));
- ret = bch2_trans_update(trans, iter, &new->k_i, 0);
+ ret = bch2_trans_update(trans, &iter, &new->k_i, 0);
err:
- bch2_trans_iter_put(trans, iter);
+ bch2_trans_iter_exit(trans, &iter);
return ret;
}
@@ -820,10 +822,11 @@ static int ec_stripe_update_ptrs(struct bch_fs *c,
struct bkey *pos)
{
struct btree_trans trans;
- struct btree_iter *iter;
+ struct btree_iter iter;
struct bkey_s_c k;
struct bkey_s_extent e;
struct bkey_buf sk;
+ struct bpos next_pos;
int ret = 0, dev, block;
bch2_bkey_buf_init(&sk);
@@ -831,23 +834,24 @@ static int ec_stripe_update_ptrs(struct bch_fs *c,
/* XXX this doesn't support the reflink btree */
- iter = bch2_trans_get_iter(&trans, BTREE_ID_extents,
- bkey_start_pos(pos),
- BTREE_ITER_INTENT);
-
- while ((k = bch2_btree_iter_peek(iter)).k &&
+ bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents,
+ bkey_start_pos(pos),
+ BTREE_ITER_INTENT);
+retry:
+ while (bch2_trans_begin(&trans),
+ (k = bch2_btree_iter_peek(&iter)).k &&
!(ret = bkey_err(k)) &&
bkey_cmp(bkey_start_pos(k.k), pos->p) < 0) {
struct bch_extent_ptr *ptr, *ec_ptr = NULL;
if (extent_has_stripe_ptr(k, s->key.k.p.offset)) {
- bch2_btree_iter_advance(iter);
+ bch2_btree_iter_advance(&iter);
continue;
}
block = bkey_matches_stripe(&s->key.v, k);
if (block < 0) {
- bch2_btree_iter_advance(iter);
+ bch2_btree_iter_advance(&iter);
continue;
}
@@ -862,16 +866,21 @@ static int ec_stripe_update_ptrs(struct bch_fs *c,
extent_stripe_ptr_add(e, s, ec_ptr, block);
- bch2_btree_iter_set_pos(iter, bkey_start_pos(&sk.k->k));
- ret = bch2_trans_update(&trans, iter, sk.k, 0) ?:
+ bch2_btree_iter_set_pos(&iter, bkey_start_pos(&sk.k->k));
+ next_pos = sk.k->k.p;
+
+ ret = bch2_btree_iter_traverse(&iter) ?:
+ bch2_trans_update(&trans, &iter, sk.k, 0) ?:
bch2_trans_commit(&trans, NULL, NULL,
BTREE_INSERT_NOFAIL);
- if (ret == -EINTR)
- ret = 0;
+ if (!ret)
+ bch2_btree_iter_set_pos(&iter, next_pos);
if (ret)
break;
}
- bch2_trans_iter_put(&trans, iter);
+ if (ret == -EINTR)
+ goto retry;
+ bch2_trans_iter_exit(&trans, &iter);
bch2_trans_exit(&trans);
bch2_bkey_buf_exit(&sk, c);
@@ -1061,16 +1070,14 @@ void *bch2_writepoint_ec_buf(struct bch_fs *c, struct write_point *wp)
return ob->ec->new_stripe.data[ob->ec_idx] + (offset << 9);
}
-void bch2_ec_add_backpointer(struct bch_fs *c, struct write_point *wp,
- struct bpos pos, unsigned sectors)
+void bch2_ob_add_backpointer(struct bch_fs *c, struct open_bucket *ob,
+ struct bkey *k)
{
- struct open_bucket *ob = ec_open_bucket(c, &wp->ptrs);
- struct ec_stripe_new *ec;
+ struct ec_stripe_new *ec = ob->ec;
- if (!ob)
+ if (!ec)
return;
- ec = ob->ec;
mutex_lock(&ec->lock);
if (bch2_keylist_realloc(&ec->keys, ec->inline_keys,
@@ -1080,8 +1087,8 @@ void bch2_ec_add_backpointer(struct bch_fs *c, struct write_point *wp,
}
bkey_init(&ec->keys.top->k);
- ec->keys.top->k.p = pos;
- bch2_key_resize(&ec->keys.top->k, sectors);
+ ec->keys.top->k.p = k->p;
+ ec->keys.top->k.size = k->size;
bch2_keylist_push(&ec->keys);
mutex_unlock(&ec->lock);
@@ -1147,7 +1154,7 @@ static void ec_stripe_key_init(struct bch_fs *c,
s->v.nr_blocks = nr_data + nr_parity;
s->v.nr_redundant = nr_parity;
s->v.csum_granularity_bits = ilog2(c->sb.encoded_extent_max);
- s->v.csum_type = BCH_CSUM_CRC32C;
+ s->v.csum_type = BCH_CSUM_crc32c;
s->v.pad = 0;
while ((u64s = stripe_val_u64s(&s->v)) > BKEY_VAL_U64s_MAX) {
@@ -1592,7 +1599,7 @@ write:
int bch2_stripes_write(struct bch_fs *c, unsigned flags)
{
struct btree_trans trans;
- struct btree_iter *iter;
+ struct btree_iter iter;
struct genradix_iter giter;
struct bkey_i_stripe *new_key;
struct stripe *m;
@@ -1603,8 +1610,8 @@ int bch2_stripes_write(struct bch_fs *c, unsigned flags)
bch2_trans_init(&trans, c, 0, 0);
- iter = bch2_trans_get_iter(&trans, BTREE_ID_stripes, POS_MIN,
- BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
+ bch2_trans_iter_init(&trans, &iter, BTREE_ID_stripes, POS_MIN,
+ BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
genradix_for_each(&c->stripes[0], giter, m) {
if (!m->alive)
@@ -1612,13 +1619,13 @@ int bch2_stripes_write(struct bch_fs *c, unsigned flags)
ret = __bch2_trans_do(&trans, NULL, NULL,
BTREE_INSERT_NOFAIL|flags,
- __bch2_stripe_write_key(&trans, iter, m,
+ __bch2_stripe_write_key(&trans, &iter, m,
giter.pos, new_key));
if (ret)
break;
}
- bch2_trans_iter_put(&trans, iter);
+ bch2_trans_iter_exit(&trans, &iter);
bch2_trans_exit(&trans);
@@ -1627,13 +1634,14 @@ int bch2_stripes_write(struct bch_fs *c, unsigned flags)
return ret;
}
-static int bch2_stripes_read_fn(struct bch_fs *c, struct bkey_s_c k)
+static int bch2_stripes_read_fn(struct btree_trans *trans, struct bkey_s_c k)
{
+ struct bch_fs *c = trans->c;
int ret = 0;
if (k.k->type == KEY_TYPE_stripe)
ret = __ec_stripe_mem_alloc(c, k.k->p.offset, GFP_KERNEL) ?:
- bch2_mark_key(c, k, 0, 0, NULL, 0,
+ bch2_mark_key(trans, k,
BTREE_TRIGGER_NOATOMIC);
return ret;
@@ -1641,8 +1649,13 @@ static int bch2_stripes_read_fn(struct bch_fs *c, struct bkey_s_c k)
int bch2_stripes_read(struct bch_fs *c)
{
- int ret = bch2_btree_and_journal_walk(c, BTREE_ID_stripes,
- bch2_stripes_read_fn);
+ struct btree_trans trans;
+ int ret;
+
+ bch2_trans_init(&trans, c, 0, 0);
+ ret = bch2_btree_and_journal_walk(&trans, BTREE_ID_stripes,
+ bch2_stripes_read_fn);
+ bch2_trans_exit(&trans);
if (ret)
bch_err(c, "error reading stripes: %i", ret);
@@ -1652,20 +1665,21 @@ int bch2_stripes_read(struct bch_fs *c)
int bch2_ec_mem_alloc(struct bch_fs *c, bool gc)
{
struct btree_trans trans;
- struct btree_iter *iter;
+ struct btree_iter iter;
struct bkey_s_c k;
size_t i, idx = 0;
int ret = 0;
bch2_trans_init(&trans, c, 0, 0);
- iter = bch2_trans_get_iter(&trans, BTREE_ID_stripes, POS(0, U64_MAX), 0);
+ bch2_trans_iter_init(&trans, &iter, BTREE_ID_stripes, POS(0, U64_MAX), 0);
- k = bch2_btree_iter_prev(iter);
- if (!IS_ERR_OR_NULL(k.k))
+ k = bch2_btree_iter_prev(&iter);
+ ret = bkey_err(k);
+ if (!ret && k.k)
idx = k.k->p.offset + 1;
- bch2_trans_iter_put(&trans, iter);
- ret = bch2_trans_exit(&trans);
+ bch2_trans_iter_exit(&trans, &iter);
+ bch2_trans_exit(&trans);
if (ret)
return ret;
diff --git a/fs/bcachefs/ec.h b/fs/bcachefs/ec.h
index e79626b59509..eb16e140e2c8 100644
--- a/fs/bcachefs/ec.h
+++ b/fs/bcachefs/ec.h
@@ -193,8 +193,8 @@ struct ec_stripe_head {
int bch2_ec_read_extent(struct bch_fs *, struct bch_read_bio *);
void *bch2_writepoint_ec_buf(struct bch_fs *, struct write_point *);
-void bch2_ec_add_backpointer(struct bch_fs *, struct write_point *,
- struct bpos, unsigned);
+void bch2_ob_add_backpointer(struct bch_fs *, struct open_bucket *,
+ struct bkey *);
void bch2_ec_bucket_written(struct bch_fs *, struct open_bucket *);
void bch2_ec_bucket_cancel(struct bch_fs *, struct open_bucket *);
diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c
index 90c3b986c264..2cea694575e9 100644
--- a/fs/bcachefs/error.c
+++ b/fs/bcachefs/error.c
@@ -111,6 +111,7 @@ found:
list_move(&s->list, &c->fsck_errors);
s->nr++;
if (c->opts.ratelimit_errors &&
+ !(flags & FSCK_NO_RATELIMIT) &&
s->nr >= FSCK_ERR_RATELIMIT_NR) {
if (s->nr == FSCK_ERR_RATELIMIT_NR)
suppressing = true;
diff --git a/fs/bcachefs/error.h b/fs/bcachefs/error.h
index d8cd19b3f63c..986938298adc 100644
--- a/fs/bcachefs/error.h
+++ b/fs/bcachefs/error.h
@@ -104,6 +104,7 @@ struct fsck_err_state {
#define FSCK_CAN_FIX (1 << 0)
#define FSCK_CAN_IGNORE (1 << 1)
#define FSCK_NEED_FSCK (1 << 2)
+#define FSCK_NO_RATELIMIT (1 << 3)
__printf(3, 4) __cold
enum fsck_err_ret bch2_fsck_err(struct bch_fs *,
diff --git a/fs/bcachefs/extent_update.c b/fs/bcachefs/extent_update.c
index bb4b2b4352e0..58b2c96f450c 100644
--- a/fs/bcachefs/extent_update.c
+++ b/fs/bcachefs/extent_update.c
@@ -58,10 +58,10 @@ static int count_iters_for_insert(struct btree_trans *trans,
u64 idx = le64_to_cpu(p.v->idx);
unsigned sectors = bpos_min(*end, p.k->p).offset -
bkey_start_offset(p.k);
- struct btree_iter *iter;
+ struct btree_iter iter;
struct bkey_s_c r_k;
- for_each_btree_key(trans, iter,
+ for_each_btree_key_norestart(trans, iter,
BTREE_ID_reflink, POS(0, idx + offset),
BTREE_ITER_SLOTS, r_k, ret2) {
if (bkey_cmp(bkey_start_pos(r_k.k),
@@ -83,8 +83,8 @@ static int count_iters_for_insert(struct btree_trans *trans,
break;
}
}
+ bch2_trans_iter_exit(trans, &iter);
- bch2_trans_iter_put(trans, iter);
break;
}
}
@@ -94,16 +94,20 @@ static int count_iters_for_insert(struct btree_trans *trans,
#define EXTENT_ITERS_MAX (BTREE_ITER_MAX / 3)
-int bch2_extent_atomic_end(struct btree_iter *iter,
+int bch2_extent_atomic_end(struct btree_trans *trans,
+ struct btree_iter *iter,
struct bkey_i *insert,
struct bpos *end)
{
- struct btree_trans *trans = iter->trans;
- struct btree_iter *copy;
+ struct btree_iter copy;
struct bkey_s_c k;
unsigned nr_iters = 0;
int ret;
+ ret = bch2_btree_iter_traverse(iter);
+ if (ret)
+ return ret;
+
*end = insert->k.p;
/* extent_update_to_keys(): */
@@ -114,9 +118,9 @@ int bch2_extent_atomic_end(struct btree_iter *iter,
if (ret < 0)
return ret;
- copy = bch2_trans_copy_iter(trans, iter);
+ bch2_trans_copy_iter(&copy, iter);
- for_each_btree_key_continue(copy, 0, k, ret) {
+ for_each_btree_key_continue_norestart(copy, 0, k, ret) {
unsigned offset = 0;
if (bkey_cmp(bkey_start_pos(k.k), *end) >= 0)
@@ -145,66 +149,21 @@ int bch2_extent_atomic_end(struct btree_iter *iter,
break;
}
- bch2_trans_iter_put(trans, copy);
+ bch2_trans_iter_exit(trans, &copy);
return ret < 0 ? ret : 0;
}
-int bch2_extent_trim_atomic(struct bkey_i *k, struct btree_iter *iter)
+int bch2_extent_trim_atomic(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bkey_i *k)
{
struct bpos end;
int ret;
- ret = bch2_extent_atomic_end(iter, k, &end);
+ ret = bch2_extent_atomic_end(trans, iter, k, &end);
if (ret)
return ret;
bch2_cut_back(end, k);
return 0;
}
-
-int bch2_extent_is_atomic(struct bkey_i *k, struct btree_iter *iter)
-{
- struct bpos end;
- int ret;
-
- ret = bch2_extent_atomic_end(iter, k, &end);
- if (ret)
- return ret;
-
- return !bkey_cmp(end, k->k.p);
-}
-
-enum btree_insert_ret
-bch2_extent_can_insert(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bkey_i *insert)
-{
- struct bkey_s_c k;
- int ret, sectors;
-
- k = bch2_btree_iter_peek_slot(iter);
- ret = bkey_err(k);
- if (ret)
- return ret;
-
- /* Check if we're splitting a compressed extent: */
-
- if (bkey_cmp(bkey_start_pos(&insert->k), bkey_start_pos(k.k)) > 0 &&
- bkey_cmp(insert->k.p, k.k->p) < 0 &&
- (sectors = bch2_bkey_sectors_compressed(k))) {
- int flags = trans->flags & BTREE_INSERT_NOFAIL
- ? BCH_DISK_RESERVATION_NOFAIL : 0;
-
- switch (bch2_disk_reservation_add(trans->c, trans->disk_res,
- sectors, flags)) {
- case 0:
- break;
- case -ENOSPC:
- return BTREE_INSERT_ENOSPC;
- default:
- BUG();
- }
- }
-
- return BTREE_INSERT_OK;
-}
diff --git a/fs/bcachefs/extent_update.h b/fs/bcachefs/extent_update.h
index 38dc084627d2..6f5cf449361a 100644
--- a/fs/bcachefs/extent_update.h
+++ b/fs/bcachefs/extent_update.h
@@ -4,13 +4,9 @@
#include "bcachefs.h"
-int bch2_extent_atomic_end(struct btree_iter *, struct bkey_i *,
- struct bpos *);
-int bch2_extent_trim_atomic(struct bkey_i *, struct btree_iter *);
-int bch2_extent_is_atomic(struct bkey_i *, struct btree_iter *);
-
-enum btree_insert_ret
-bch2_extent_can_insert(struct btree_trans *, struct btree_iter *,
- struct bkey_i *);
+int bch2_extent_atomic_end(struct btree_trans *, struct btree_iter *,
+ struct bkey_i *, struct bpos *);
+int bch2_extent_trim_atomic(struct btree_trans *, struct btree_iter *,
+ struct bkey_i *);
#endif /* _BCACHEFS_EXTENT_UPDATE_H */
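
A minimal sketch of the updated call pattern for the two remaining helpers, assuming a caller that already holds a transaction and an extents iterator positioned at the key being inserted:

	struct bpos atomic_end;
	int ret;

	ret = bch2_extent_atomic_end(trans, iter, insert, &atomic_end);
	if (ret)
		return ret;

	/* or, equivalently, trim the insert in place: */
	ret = bch2_extent_trim_atomic(trans, iter, insert);
	if (ret)
		return ret;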
diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c
index b07d39555eb6..89b5be907eea 100644
--- a/fs/bcachefs/extents.c
+++ b/fs/bcachefs/extents.c
@@ -192,9 +192,10 @@ void bch2_btree_ptr_v2_to_text(struct printbuf *out, struct bch_fs *c,
{
struct bkey_s_c_btree_ptr_v2 bp = bkey_s_c_to_btree_ptr_v2(k);
- pr_buf(out, "seq %llx written %u min_key ",
+ pr_buf(out, "seq %llx written %u min_key %s",
le64_to_cpu(bp.v->seq),
- le16_to_cpu(bp.v->sectors_written));
+ le16_to_cpu(bp.v->sectors_written),
+ BTREE_PTR_RANGE_UPDATED(bp.v) ? "R " : "");
bch2_bpos_to_text(out, bp.v->min_key);
pr_buf(out, " ");
@@ -230,112 +231,134 @@ void bch2_extent_to_text(struct printbuf *out, struct bch_fs *c,
bch2_bkey_ptrs_to_text(out, c, k);
}
-enum merge_result bch2_extent_merge(struct bch_fs *c,
- struct bkey_s _l, struct bkey_s _r)
+bool bch2_extent_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r)
{
- struct bkey_s_extent l = bkey_s_to_extent(_l);
- struct bkey_s_extent r = bkey_s_to_extent(_r);
- union bch_extent_entry *en_l = l.v->start;
- union bch_extent_entry *en_r = r.v->start;
- struct bch_extent_crc_unpacked crc_l, crc_r;
-
- if (bkey_val_u64s(l.k) != bkey_val_u64s(r.k))
- return BCH_MERGE_NOMERGE;
-
- crc_l = bch2_extent_crc_unpack(l.k, NULL);
-
- extent_for_each_entry(l, en_l) {
- en_r = vstruct_idx(r.v, (u64 *) en_l - l.v->_data);
+ struct bkey_ptrs l_ptrs = bch2_bkey_ptrs(l);
+ struct bkey_ptrs_c r_ptrs = bch2_bkey_ptrs_c(r);
+ union bch_extent_entry *en_l;
+ const union bch_extent_entry *en_r;
+ struct extent_ptr_decoded lp, rp;
+ bool use_right_ptr;
+ struct bch_dev *ca;
+ en_l = l_ptrs.start;
+ en_r = r_ptrs.start;
+ while (en_l < l_ptrs.end && en_r < r_ptrs.end) {
if (extent_entry_type(en_l) != extent_entry_type(en_r))
- return BCH_MERGE_NOMERGE;
-
- switch (extent_entry_type(en_l)) {
- case BCH_EXTENT_ENTRY_ptr: {
- const struct bch_extent_ptr *lp = &en_l->ptr;
- const struct bch_extent_ptr *rp = &en_r->ptr;
- struct bch_dev *ca;
-
- if (lp->offset + crc_l.compressed_size != rp->offset ||
- lp->dev != rp->dev ||
- lp->gen != rp->gen)
- return BCH_MERGE_NOMERGE;
-
- /* We don't allow extents to straddle buckets: */
- ca = bch_dev_bkey_exists(c, lp->dev);
-
- if (PTR_BUCKET_NR(ca, lp) != PTR_BUCKET_NR(ca, rp))
- return BCH_MERGE_NOMERGE;
-
- break;
- }
- case BCH_EXTENT_ENTRY_stripe_ptr:
- if (en_l->stripe_ptr.block != en_r->stripe_ptr.block ||
- en_l->stripe_ptr.idx != en_r->stripe_ptr.idx)
- return BCH_MERGE_NOMERGE;
- break;
- case BCH_EXTENT_ENTRY_crc32:
- case BCH_EXTENT_ENTRY_crc64:
- case BCH_EXTENT_ENTRY_crc128:
- crc_l = bch2_extent_crc_unpack(l.k, entry_to_crc(en_l));
- crc_r = bch2_extent_crc_unpack(r.k, entry_to_crc(en_r));
-
- if (crc_l.csum_type != crc_r.csum_type ||
- crc_l.compression_type != crc_r.compression_type ||
- crc_l.nonce != crc_r.nonce)
- return BCH_MERGE_NOMERGE;
-
- if (crc_l.offset + crc_l.live_size != crc_l.compressed_size ||
- crc_r.offset)
- return BCH_MERGE_NOMERGE;
+ return false;
- if (!bch2_checksum_mergeable(crc_l.csum_type))
- return BCH_MERGE_NOMERGE;
-
- if (crc_is_compressed(crc_l))
- return BCH_MERGE_NOMERGE;
+ en_l = extent_entry_next(en_l);
+ en_r = extent_entry_next(en_r);
+ }
- if (crc_l.csum_type &&
- crc_l.uncompressed_size +
- crc_r.uncompressed_size > c->sb.encoded_extent_max)
- return BCH_MERGE_NOMERGE;
+ if (en_l < l_ptrs.end || en_r < r_ptrs.end)
+ return false;
- if (crc_l.uncompressed_size + crc_r.uncompressed_size >
+ en_l = l_ptrs.start;
+ en_r = r_ptrs.start;
+ lp.crc = bch2_extent_crc_unpack(l.k, NULL);
+ rp.crc = bch2_extent_crc_unpack(r.k, NULL);
+
+ while (__bkey_ptr_next_decode(l.k, l_ptrs.end, lp, en_l) &&
+ __bkey_ptr_next_decode(r.k, r_ptrs.end, rp, en_r)) {
+ if (lp.ptr.offset + lp.crc.offset + lp.crc.live_size !=
+ rp.ptr.offset + rp.crc.offset ||
+ lp.ptr.dev != rp.ptr.dev ||
+ lp.ptr.gen != rp.ptr.gen ||
+ lp.has_ec != rp.has_ec)
+ return false;
+
+ /* Extents may not straddle buckets: */
+ ca = bch_dev_bkey_exists(c, lp.ptr.dev);
+ if (PTR_BUCKET_NR(ca, &lp.ptr) != PTR_BUCKET_NR(ca, &rp.ptr))
+ return false;
+
+ if (lp.has_ec != rp.has_ec ||
+ (lp.has_ec &&
+ (lp.ec.block != rp.ec.block ||
+ lp.ec.redundancy != rp.ec.redundancy ||
+ lp.ec.idx != rp.ec.idx)))
+ return false;
+
+ if (lp.crc.compression_type != rp.crc.compression_type ||
+ lp.crc.nonce != rp.crc.nonce)
+ return false;
+
+ if (lp.crc.offset + lp.crc.live_size + rp.crc.live_size <=
+ lp.crc.uncompressed_size) {
+ /* can use left extent's crc entry */
+ } else if (lp.crc.live_size <= rp.crc.offset) {

+ /* can use right extent's crc entry */
+ } else {
+ /* check if checksums can be merged: */
+ if (lp.crc.csum_type != rp.crc.csum_type ||
+ lp.crc.nonce != rp.crc.nonce ||
+ crc_is_compressed(lp.crc) ||
+ !bch2_checksum_mergeable(lp.crc.csum_type))
+ return false;
+
+ if (lp.crc.offset + lp.crc.live_size != lp.crc.compressed_size ||
+ rp.crc.offset)
+ return false;
+
+ if (lp.crc.csum_type &&
+ lp.crc.uncompressed_size +
+ rp.crc.uncompressed_size > c->sb.encoded_extent_max)
+ return false;
+
+ if (lp.crc.uncompressed_size + rp.crc.uncompressed_size >
bch2_crc_field_size_max[extent_entry_type(en_l)])
- return BCH_MERGE_NOMERGE;
-
- break;
- default:
- return BCH_MERGE_NOMERGE;
+ return false;
}
- }
-
- extent_for_each_entry(l, en_l) {
- struct bch_extent_crc_unpacked crc_l, crc_r;
-
- en_r = vstruct_idx(r.v, (u64 *) en_l - l.v->_data);
-
- if (!extent_entry_is_crc(en_l))
- continue;
-
- crc_l = bch2_extent_crc_unpack(l.k, entry_to_crc(en_l));
- crc_r = bch2_extent_crc_unpack(r.k, entry_to_crc(en_r));
- crc_l.csum = bch2_checksum_merge(crc_l.csum_type,
- crc_l.csum,
- crc_r.csum,
- crc_r.uncompressed_size << 9);
+ en_l = extent_entry_next(en_l);
+ en_r = extent_entry_next(en_r);
+ }
- crc_l.uncompressed_size += crc_r.uncompressed_size;
- crc_l.compressed_size += crc_r.compressed_size;
+ use_right_ptr = false;
+ en_l = l_ptrs.start;
+ en_r = r_ptrs.start;
+ while (en_l < l_ptrs.end) {
+ if (extent_entry_type(en_l) == BCH_EXTENT_ENTRY_ptr &&
+ use_right_ptr)
+ en_l->ptr = en_r->ptr;
+
+ if (extent_entry_is_crc(en_l)) {
+ struct bch_extent_crc_unpacked crc_l =
+ bch2_extent_crc_unpack(l.k, entry_to_crc(en_l));
+ struct bch_extent_crc_unpacked crc_r =
+ bch2_extent_crc_unpack(r.k, entry_to_crc(en_r));
+
+ use_right_ptr = false;
+
+ if (crc_l.offset + crc_l.live_size + crc_r.live_size <=
+ crc_l.uncompressed_size) {
+ /* can use left extent's crc entry */
+ } else if (crc_l.live_size <= crc_r.offset) {
+ /* can use right extent's crc entry */
+ crc_r.offset -= crc_l.live_size;
+ bch2_extent_crc_pack(entry_to_crc(en_l), crc_r,
+ extent_entry_type(en_l));
+ use_right_ptr = true;
+ } else {
+ crc_l.csum = bch2_checksum_merge(crc_l.csum_type,
+ crc_l.csum,
+ crc_r.csum,
+ crc_r.uncompressed_size << 9);
+
+ crc_l.uncompressed_size += crc_r.uncompressed_size;
+ crc_l.compressed_size += crc_r.compressed_size;
+ bch2_extent_crc_pack(entry_to_crc(en_l), crc_l,
+ extent_entry_type(en_l));
+ }
+ }
- bch2_extent_crc_pack(entry_to_crc(en_l), crc_l,
- extent_entry_type(en_l));
+ en_l = extent_entry_next(en_l);
+ en_r = extent_entry_next(en_r);
}
bch2_key_resize(l.k, l.k->size + r.k->size);
-
- return BCH_MERGE_MERGE;
+ return true;
}
/* KEY_TYPE_reservation: */
@@ -363,25 +386,17 @@ void bch2_reservation_to_text(struct printbuf *out, struct bch_fs *c,
r.v->nr_replicas);
}
-enum merge_result bch2_reservation_merge(struct bch_fs *c,
- struct bkey_s _l, struct bkey_s _r)
+bool bch2_reservation_merge(struct bch_fs *c, struct bkey_s _l, struct bkey_s_c _r)
{
struct bkey_s_reservation l = bkey_s_to_reservation(_l);
- struct bkey_s_reservation r = bkey_s_to_reservation(_r);
+ struct bkey_s_c_reservation r = bkey_s_c_to_reservation(_r);
if (l.v->generation != r.v->generation ||
l.v->nr_replicas != r.v->nr_replicas)
- return BCH_MERGE_NOMERGE;
-
- if ((u64) l.k->size + r.k->size > KEY_SIZE_MAX) {
- bch2_key_resize(l.k, KEY_SIZE_MAX);
- bch2_cut_front_s(l.k->p, r.s);
- return BCH_MERGE_PARTIAL;
- }
+ return false;
bch2_key_resize(l.k, l.k->size + r.k->size);
-
- return BCH_MERGE_MERGE;
+ return true;
}
/* Extent checksum entries: */
@@ -465,7 +480,7 @@ restart_narrow_pointers:
bkey_for_each_ptr_decode(&k->k, ptrs, p, i)
if (can_narrow_crc(p.crc, n)) {
- bch2_bkey_drop_ptr(bkey_i_to_s(k), &i->ptr);
+ __bch2_bkey_drop_ptr(bkey_i_to_s(k), &i->ptr);
p.ptr.offset += p.crc.offset;
p.crc = n;
bch2_extent_ptr_decoded_append(k, &p);
@@ -597,38 +612,6 @@ bool bch2_bkey_is_incompressible(struct bkey_s_c k)
return false;
}
-bool bch2_check_range_allocated(struct bch_fs *c, struct bpos pos, u64 size,
- unsigned nr_replicas, bool compressed)
-{
- struct btree_trans trans;
- struct btree_iter *iter;
- struct bpos end = pos;
- struct bkey_s_c k;
- bool ret = true;
- int err;
-
- end.offset += size;
-
- bch2_trans_init(&trans, c, 0, 0);
-
- for_each_btree_key(&trans, iter, BTREE_ID_extents, pos,
- BTREE_ITER_SLOTS, k, err) {
- if (bkey_cmp(bkey_start_pos(k.k), end) >= 0)
- break;
-
- if (nr_replicas > bch2_bkey_replicas(c, k) ||
- (!compressed && bch2_bkey_sectors_compressed(k))) {
- ret = false;
- break;
- }
- }
- bch2_trans_iter_put(&trans, iter);
-
- bch2_trans_exit(&trans);
-
- return ret;
-}
-
unsigned bch2_bkey_replicas(struct bch_fs *c, struct bkey_s_c k)
{
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
@@ -802,41 +785,85 @@ static union bch_extent_entry *extent_entry_prev(struct bkey_ptrs ptrs,
return i;
}
-union bch_extent_entry *bch2_bkey_drop_ptr(struct bkey_s k,
- struct bch_extent_ptr *ptr)
+static void extent_entry_drop(struct bkey_s k, union bch_extent_entry *entry)
+{
+ union bch_extent_entry *next = extent_entry_next(entry);
+
+ /* stripes have ptrs, but their layout doesn't work with this code */
+ BUG_ON(k.k->type == KEY_TYPE_stripe);
+
+ memmove_u64s_down(entry, next,
+ (u64 *) bkey_val_end(k) - (u64 *) next);
+ k.k->u64s -= (u64 *) next - (u64 *) entry;
+}
+
+/*
+ * Returns pointer to the next entry after the one being dropped:
+ */
+union bch_extent_entry *__bch2_bkey_drop_ptr(struct bkey_s k,
+ struct bch_extent_ptr *ptr)
{
struct bkey_ptrs ptrs = bch2_bkey_ptrs(k);
- union bch_extent_entry *dst, *src, *prev;
+ union bch_extent_entry *entry = to_entry(ptr), *next;
+ union bch_extent_entry *ret = entry;
bool drop_crc = true;
EBUG_ON(ptr < &ptrs.start->ptr ||
ptr >= &ptrs.end->ptr);
EBUG_ON(ptr->type != 1 << BCH_EXTENT_ENTRY_ptr);
- src = extent_entry_next(to_entry(ptr));
- if (src != ptrs.end &&
- !extent_entry_is_crc(src))
- drop_crc = false;
-
- dst = to_entry(ptr);
- while ((prev = extent_entry_prev(ptrs, dst))) {
- if (extent_entry_is_ptr(prev))
+ for (next = extent_entry_next(entry);
+ next != ptrs.end;
+ next = extent_entry_next(next)) {
+ if (extent_entry_is_crc(next)) {
break;
-
- if (extent_entry_is_crc(prev)) {
- if (drop_crc)
- dst = prev;
+ } else if (extent_entry_is_ptr(next)) {
+ drop_crc = false;
break;
}
+ }
- dst = prev;
+ extent_entry_drop(k, entry);
+
+ while ((entry = extent_entry_prev(ptrs, entry))) {
+ if (extent_entry_is_ptr(entry))
+ break;
+
+ if ((extent_entry_is_crc(entry) && drop_crc) ||
+ extent_entry_is_stripe_ptr(entry)) {
+ ret = (void *) ret - extent_entry_bytes(entry);
+ extent_entry_drop(k, entry);
+ }
}
- memmove_u64s_down(dst, src,
- (u64 *) ptrs.end - (u64 *) src);
- k.k->u64s -= (u64 *) src - (u64 *) dst;
+ return ret;
+}
+
+union bch_extent_entry *bch2_bkey_drop_ptr(struct bkey_s k,
+ struct bch_extent_ptr *ptr)
+{
+ bool have_dirty = bch2_bkey_dirty_devs(k.s_c).nr;
+ union bch_extent_entry *ret =
+ __bch2_bkey_drop_ptr(k, ptr);
+
+ /*
+ * If we deleted all the dirty pointers and there are still cached
+ * pointers, we could set the cached pointers to dirty if they're not
+ * stale - but to do that correctly we'd need to grab an open_bucket
+ * reference so that we don't race with bucket reuse:
+ */
+ if (have_dirty &&
+ !bch2_bkey_dirty_devs(k.s_c).nr) {
+ k.k->type = KEY_TYPE_error;
+ set_bkey_val_u64s(k.k, 0);
+ ret = NULL;
+ } else if (!bch2_bkey_nr_ptrs(k.s_c)) {
+ k.k->type = KEY_TYPE_deleted;
+ set_bkey_val_u64s(k.k, 0);
+ ret = NULL;
+ }
- return dst;
+ return ret;
}
void bch2_bkey_drop_device(struct bkey_s k, unsigned dev)
@@ -906,10 +933,6 @@ bool bch2_extent_normalize(struct bch_fs *c, struct bkey_s k)
ptr->cached &&
ptr_stale(bch_dev_bkey_exists(c, ptr->dev), ptr));
- /* will only happen if all pointers were cached: */
- if (!bch2_bkey_nr_ptrs(k.s_c))
- k.k->type = KEY_TYPE_deleted;
-
return bkey_deleted(k.k);
}
@@ -946,12 +969,12 @@ void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
case BCH_EXTENT_ENTRY_crc128:
crc = bch2_extent_crc_unpack(k.k, entry_to_crc(entry));
- pr_buf(out, "crc: c_size %u size %u offset %u nonce %u csum %u compress %u",
+ pr_buf(out, "crc: c_size %u size %u offset %u nonce %u csum %s compress %s",
crc.compressed_size,
crc.uncompressed_size,
crc.offset, crc.nonce,
- crc.csum_type,
- crc.compression_type);
+ bch2_csum_types[crc.csum_type],
+ bch2_compression_types[crc.compression_type]);
break;
case BCH_EXTENT_ENTRY_stripe_ptr:
ec = &entry->stripe_ptr;
diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h
index 9999805f955e..9c2567274a2b 100644
--- a/fs/bcachefs/extents.h
+++ b/fs/bcachefs/extents.h
@@ -78,12 +78,12 @@ static inline size_t extent_entry_u64s(const union bch_extent_entry *entry)
static inline bool extent_entry_is_ptr(const union bch_extent_entry *e)
{
- switch (extent_entry_type(e)) {
- case BCH_EXTENT_ENTRY_ptr:
- return true;
- default:
- return false;
- }
+ return extent_entry_type(e) == BCH_EXTENT_ENTRY_ptr;
+}
+
+static inline bool extent_entry_is_stripe_ptr(const union bch_extent_entry *e)
+{
+ return extent_entry_type(e) == BCH_EXTENT_ENTRY_stripe_ptr;
}
static inline bool extent_entry_is_crc(const union bch_extent_entry *e)
@@ -394,8 +394,7 @@ void bch2_btree_ptr_v2_compat(enum btree_id, unsigned, unsigned,
const char *bch2_extent_invalid(const struct bch_fs *, struct bkey_s_c);
void bch2_extent_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
-enum merge_result bch2_extent_merge(struct bch_fs *,
- struct bkey_s, struct bkey_s);
+bool bch2_extent_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c);
#define bch2_bkey_ops_extent (struct bkey_ops) { \
.key_invalid = bch2_extent_invalid, \
@@ -409,8 +408,7 @@ enum merge_result bch2_extent_merge(struct bch_fs *,
const char *bch2_reservation_invalid(const struct bch_fs *, struct bkey_s_c);
void bch2_reservation_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
-enum merge_result bch2_reservation_merge(struct bch_fs *,
- struct bkey_s, struct bkey_s);
+bool bch2_reservation_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c);
#define bch2_bkey_ops_reservation (struct bkey_ops) { \
.key_invalid = bch2_reservation_invalid, \
@@ -428,6 +426,17 @@ void bch2_extent_crc_append(struct bkey_i *,
/* Generic code for keys with pointers: */
+static inline bool bkey_is_btree_ptr(const struct bkey *k)
+{
+ switch (k->type) {
+ case KEY_TYPE_btree_ptr:
+ case KEY_TYPE_btree_ptr_v2:
+ return true;
+ default:
+ return false;
+ }
+}
+
static inline bool bkey_extent_is_direct_data(const struct bkey *k)
{
switch (k->type) {
@@ -558,7 +567,6 @@ unsigned bch2_bkey_nr_ptrs_allocated(struct bkey_s_c);
unsigned bch2_bkey_nr_ptrs_fully_allocated(struct bkey_s_c);
bool bch2_bkey_is_incompressible(struct bkey_s_c);
unsigned bch2_bkey_sectors_compressed(struct bkey_s_c);
-bool bch2_check_range_allocated(struct bch_fs *, struct bpos, u64, unsigned, bool);
unsigned bch2_bkey_replicas(struct bch_fs *, struct bkey_s_c);
unsigned bch2_bkey_durability(struct bch_fs *, struct bkey_s_c);
@@ -570,6 +578,8 @@ void bch2_bkey_extent_entry_drop(struct bkey_i *, union bch_extent_entry *);
void bch2_bkey_append_ptr(struct bkey_i *, struct bch_extent_ptr);
void bch2_extent_ptr_decoded_append(struct bkey_i *,
struct extent_ptr_decoded *);
+union bch_extent_entry *__bch2_bkey_drop_ptr(struct bkey_s,
+ struct bch_extent_ptr *);
union bch_extent_entry *bch2_bkey_drop_ptr(struct bkey_s,
struct bch_extent_ptr *);
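
A short sketch of the difference between the two variants declared above; the surrounding caller is an assumption for illustration, only the NULL-return semantics come from the extents.c hunk:

	union bch_extent_entry *entry;

	/* low-level form: drop the pointer (and any crc/stripe entries that
	 * only it referenced) and return the following entry: */
	entry = __bch2_bkey_drop_ptr(bkey_i_to_s(k), ptr);

	/* wrapper: additionally rewrites the key as KEY_TYPE_error when the
	 * last dirty pointer is dropped, or KEY_TYPE_deleted when no pointers
	 * remain, returning NULL in either case: */
	entry = bch2_bkey_drop_ptr(bkey_i_to_s(k), ptr);
	if (!entry) {
		/* key no longer has usable pointers */
	}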
diff --git a/fs/bcachefs/fs-common.c b/fs/bcachefs/fs-common.c
index 00a63fecb976..5f3429e99115 100644
--- a/fs/bcachefs/fs-common.c
+++ b/fs/bcachefs/fs-common.c
@@ -6,113 +6,207 @@
#include "dirent.h"
#include "fs-common.h"
#include "inode.h"
+#include "subvolume.h"
#include "xattr.h"
#include <linux/posix_acl.h>
-int bch2_create_trans(struct btree_trans *trans, u64 dir_inum,
+static inline int is_subdir_for_nlink(struct bch_inode_unpacked *inode)
+{
+ return S_ISDIR(inode->bi_mode) && !inode->bi_subvol;
+}
+
+int bch2_create_trans(struct btree_trans *trans,
+ subvol_inum dir,
struct bch_inode_unpacked *dir_u,
struct bch_inode_unpacked *new_inode,
const struct qstr *name,
uid_t uid, gid_t gid, umode_t mode, dev_t rdev,
struct posix_acl *default_acl,
- struct posix_acl *acl)
+ struct posix_acl *acl,
+ subvol_inum snapshot_src,
+ unsigned flags)
{
struct bch_fs *c = trans->c;
- struct btree_iter *dir_iter = NULL;
- struct btree_iter *inode_iter = NULL;
- struct bch_hash_info hash = bch2_hash_info_init(c, new_inode);
+ struct btree_iter dir_iter = { NULL };
+ struct btree_iter inode_iter = { NULL };
+ subvol_inum new_inum = dir;
u64 now = bch2_current_time(c);
u64 cpu = raw_smp_processor_id();
- u64 dir_offset = 0;
+ u64 dir_target;
+ u32 snapshot;
+ unsigned dir_type = mode_to_type(mode);
int ret;
- dir_iter = bch2_inode_peek(trans, dir_u, dir_inum, BTREE_ITER_INTENT);
- ret = PTR_ERR_OR_ZERO(dir_iter);
+ ret = bch2_subvolume_get_snapshot(trans, dir.subvol, &snapshot);
if (ret)
goto err;
- bch2_inode_init_late(new_inode, now, uid, gid, mode, rdev, dir_u);
-
- if (!name)
- new_inode->bi_flags |= BCH_INODE_UNLINKED;
-
- inode_iter = bch2_inode_create(trans, new_inode, U32_MAX, cpu);
- ret = PTR_ERR_OR_ZERO(inode_iter);
+ ret = bch2_inode_peek(trans, &dir_iter, dir_u, dir, BTREE_ITER_INTENT);
if (ret)
goto err;
- if (default_acl) {
- ret = bch2_set_acl_trans(trans, new_inode, &hash,
- default_acl, ACL_TYPE_DEFAULT);
+ if (!(flags & BCH_CREATE_SNAPSHOT)) {
+ /* Normal create path - allocate a new inode: */
+ bch2_inode_init_late(new_inode, now, uid, gid, mode, rdev, dir_u);
+
+ if (flags & BCH_CREATE_TMPFILE)
+ new_inode->bi_flags |= BCH_INODE_UNLINKED;
+
+ ret = bch2_inode_create(trans, &inode_iter, new_inode, snapshot, cpu);
if (ret)
goto err;
+
+ snapshot_src = (subvol_inum) { 0 };
+ } else {
+ /*
+ * Creating a snapshot - we're not allocating a new inode, but
+ * we do have to look up the root inode of the subvolume we're
+ * snapshotting and update it (in the new snapshot):
+ */
+
+ if (!snapshot_src.inum) {
+ /* Inode wasn't specified, just snapshot: */
+ struct bch_subvolume s;
+
+ ret = bch2_subvolume_get(trans, snapshot_src.subvol, true,
+ BTREE_ITER_CACHED, &s);
+ if (ret)
+ goto err;
+
+ snapshot_src.inum = le64_to_cpu(s.inode);
+ }
+
+ ret = bch2_inode_peek(trans, &inode_iter, new_inode, snapshot_src,
+ BTREE_ITER_INTENT);
+ if (ret)
+ goto err;
+
+ if (new_inode->bi_subvol != snapshot_src.subvol) {
+ /* Not a subvolume root: */
+ ret = -EINVAL;
+ goto err;
+ }
+
+ /*
+ * If we're not root, we have to own the subvolume being
+ * snapshotted:
+ */
+ if (uid && new_inode->bi_uid != uid) {
+ ret = -EPERM;
+ goto err;
+ }
+
+ flags |= BCH_CREATE_SUBVOL;
}
- if (acl) {
- ret = bch2_set_acl_trans(trans, new_inode, &hash,
- acl, ACL_TYPE_ACCESS);
+ new_inum.inum = new_inode->bi_inum;
+ dir_target = new_inode->bi_inum;
+
+ if (flags & BCH_CREATE_SUBVOL) {
+ u32 new_subvol, dir_snapshot;
+
+ ret = bch2_subvolume_create(trans, new_inode->bi_inum,
+ snapshot_src.subvol,
+ &new_subvol, &snapshot,
+ (flags & BCH_CREATE_SNAPSHOT_RO) != 0);
+ if (ret)
+ goto err;
+
+ new_inode->bi_parent_subvol = dir.subvol;
+ new_inode->bi_subvol = new_subvol;
+ new_inum.subvol = new_subvol;
+ dir_target = new_subvol;
+ dir_type = DT_SUBVOL;
+
+ ret = bch2_subvolume_get_snapshot(trans, dir.subvol, &dir_snapshot);
+ if (ret)
+ goto err;
+
+ bch2_btree_iter_set_snapshot(&dir_iter, dir_snapshot);
+ ret = bch2_btree_iter_traverse(&dir_iter);
if (ret)
goto err;
}
- if (name) {
+ if (!(flags & BCH_CREATE_SNAPSHOT)) {
+ if (default_acl) {
+ ret = bch2_set_acl_trans(trans, new_inum, new_inode,
+ default_acl, ACL_TYPE_DEFAULT);
+ if (ret)
+ goto err;
+ }
+
+ if (acl) {
+ ret = bch2_set_acl_trans(trans, new_inum, new_inode,
+ acl, ACL_TYPE_ACCESS);
+ if (ret)
+ goto err;
+ }
+ }
+
+ if (!(flags & BCH_CREATE_TMPFILE)) {
struct bch_hash_info dir_hash = bch2_hash_info_init(c, dir_u);
- dir_u->bi_mtime = dir_u->bi_ctime = now;
+ u64 dir_offset;
- if (S_ISDIR(new_inode->bi_mode))
+ if (is_subdir_for_nlink(new_inode))
dir_u->bi_nlink++;
+ dir_u->bi_mtime = dir_u->bi_ctime = now;
- ret = bch2_inode_write(trans, dir_iter, dir_u);
+ ret = bch2_inode_write(trans, &dir_iter, dir_u);
if (ret)
goto err;
- ret = bch2_dirent_create(trans, dir_inum, &dir_hash,
- mode_to_type(new_inode->bi_mode),
- name, new_inode->bi_inum,
+ ret = bch2_dirent_create(trans, dir, &dir_hash,
+ dir_type,
+ name,
+ dir_target,
&dir_offset,
BCH_HASH_SET_MUST_CREATE);
if (ret)
goto err;
- }
- if (c->sb.version >= bcachefs_metadata_version_inode_backpointers) {
- new_inode->bi_dir = dir_u->bi_inum;
- new_inode->bi_dir_offset = dir_offset;
+ if (c->sb.version >= bcachefs_metadata_version_inode_backpointers) {
+ new_inode->bi_dir = dir_u->bi_inum;
+ new_inode->bi_dir_offset = dir_offset;
+ }
}
- /* XXX use bch2_btree_iter_set_snapshot() */
- inode_iter->snapshot = U32_MAX;
- bch2_btree_iter_set_pos(inode_iter, SPOS(0, new_inode->bi_inum, U32_MAX));
+ inode_iter.flags &= ~BTREE_ITER_ALL_SNAPSHOTS;
+ bch2_btree_iter_set_snapshot(&inode_iter, snapshot);
- ret = bch2_inode_write(trans, inode_iter, new_inode);
+ ret = bch2_btree_iter_traverse(&inode_iter) ?:
+ bch2_inode_write(trans, &inode_iter, new_inode);
err:
- bch2_trans_iter_put(trans, inode_iter);
- bch2_trans_iter_put(trans, dir_iter);
+ bch2_trans_iter_exit(trans, &inode_iter);
+ bch2_trans_iter_exit(trans, &dir_iter);
return ret;
}
-int bch2_link_trans(struct btree_trans *trans, u64 dir_inum,
- u64 inum, struct bch_inode_unpacked *dir_u,
- struct bch_inode_unpacked *inode_u, const struct qstr *name)
+int bch2_link_trans(struct btree_trans *trans,
+ subvol_inum dir, struct bch_inode_unpacked *dir_u,
+ subvol_inum inum, struct bch_inode_unpacked *inode_u,
+ const struct qstr *name)
{
struct bch_fs *c = trans->c;
- struct btree_iter *dir_iter = NULL, *inode_iter = NULL;
+ struct btree_iter dir_iter = { NULL };
+ struct btree_iter inode_iter = { NULL };
struct bch_hash_info dir_hash;
u64 now = bch2_current_time(c);
u64 dir_offset = 0;
int ret;
- inode_iter = bch2_inode_peek(trans, inode_u, inum, BTREE_ITER_INTENT);
- ret = PTR_ERR_OR_ZERO(inode_iter);
+ if (dir.subvol != inum.subvol)
+ return -EXDEV;
+
+ ret = bch2_inode_peek(trans, &inode_iter, inode_u, inum, BTREE_ITER_INTENT);
if (ret)
goto err;
inode_u->bi_ctime = now;
bch2_inode_nlink_inc(inode_u);
- dir_iter = bch2_inode_peek(trans, dir_u, dir_inum, 0);
- ret = PTR_ERR_OR_ZERO(dir_iter);
+ ret = bch2_inode_peek(trans, &dir_iter, dir_u, dir, BTREE_ITER_INTENT);
if (ret)
goto err;
@@ -120,80 +214,110 @@ int bch2_link_trans(struct btree_trans *trans, u64 dir_inum,
dir_hash = bch2_hash_info_init(c, dir_u);
- ret = bch2_dirent_create(trans, dir_inum, &dir_hash,
+ ret = bch2_dirent_create(trans, dir, &dir_hash,
mode_to_type(inode_u->bi_mode),
- name, inum, &dir_offset,
+ name, inum.inum, &dir_offset,
BCH_HASH_SET_MUST_CREATE);
if (ret)
goto err;
if (c->sb.version >= bcachefs_metadata_version_inode_backpointers) {
- inode_u->bi_dir = dir_inum;
+ inode_u->bi_dir = dir.inum;
inode_u->bi_dir_offset = dir_offset;
}
- ret = bch2_inode_write(trans, dir_iter, dir_u) ?:
- bch2_inode_write(trans, inode_iter, inode_u);
+ ret = bch2_inode_write(trans, &dir_iter, dir_u) ?:
+ bch2_inode_write(trans, &inode_iter, inode_u);
err:
- bch2_trans_iter_put(trans, dir_iter);
- bch2_trans_iter_put(trans, inode_iter);
+ bch2_trans_iter_exit(trans, &dir_iter);
+ bch2_trans_iter_exit(trans, &inode_iter);
return ret;
}
int bch2_unlink_trans(struct btree_trans *trans,
- u64 dir_inum, struct bch_inode_unpacked *dir_u,
+ subvol_inum dir,
+ struct bch_inode_unpacked *dir_u,
struct bch_inode_unpacked *inode_u,
- const struct qstr *name)
+ const struct qstr *name,
+ bool deleting_snapshot)
{
struct bch_fs *c = trans->c;
- struct btree_iter *dir_iter = NULL, *dirent_iter = NULL,
- *inode_iter = NULL;
+ struct btree_iter dir_iter = { NULL };
+ struct btree_iter dirent_iter = { NULL };
+ struct btree_iter inode_iter = { NULL };
struct bch_hash_info dir_hash;
- u64 inum, now = bch2_current_time(c);
+ subvol_inum inum;
+ u64 now = bch2_current_time(c);
struct bkey_s_c k;
int ret;
- dir_iter = bch2_inode_peek(trans, dir_u, dir_inum, BTREE_ITER_INTENT);
- ret = PTR_ERR_OR_ZERO(dir_iter);
+ ret = bch2_inode_peek(trans, &dir_iter, dir_u, dir, BTREE_ITER_INTENT);
if (ret)
goto err;
dir_hash = bch2_hash_info_init(c, dir_u);
- dirent_iter = __bch2_dirent_lookup_trans(trans, dir_inum, &dir_hash,
- name, BTREE_ITER_INTENT);
- ret = PTR_ERR_OR_ZERO(dirent_iter);
+ ret = __bch2_dirent_lookup_trans(trans, &dirent_iter, dir, &dir_hash,
+ name, &inum, BTREE_ITER_INTENT);
if (ret)
goto err;
- k = bch2_btree_iter_peek_slot(dirent_iter);
- inum = le64_to_cpu(bkey_s_c_to_dirent(k).v->d_inum);
-
- inode_iter = bch2_inode_peek(trans, inode_u, inum, BTREE_ITER_INTENT);
- ret = PTR_ERR_OR_ZERO(inode_iter);
+ ret = bch2_inode_peek(trans, &inode_iter, inode_u, inum,
+ BTREE_ITER_INTENT);
if (ret)
goto err;
- if (inode_u->bi_dir == k.k->p.inode &&
- inode_u->bi_dir_offset == k.k->p.offset) {
+ if (!deleting_snapshot && S_ISDIR(inode_u->bi_mode)) {
+ ret = bch2_empty_dir_trans(trans, inum);
+ if (ret)
+ goto err;
+ }
+
+ if (deleting_snapshot && !inode_u->bi_subvol) {
+ ret = -ENOENT;
+ goto err;
+ }
+
+ if (deleting_snapshot || inode_u->bi_subvol) {
+ ret = bch2_subvolume_unlink(trans, inode_u->bi_subvol);
+ if (ret)
+ goto err;
+
+ k = bch2_btree_iter_peek_slot(&dirent_iter);
+ ret = bkey_err(k);
+ if (ret)
+ goto err;
+
+ /*
+ * If we're deleting a subvolume, we need to really delete the
+ * dirent, not just emit a whiteout in the current snapshot:
+ */
+ bch2_btree_iter_set_snapshot(&dirent_iter, k.k->p.snapshot);
+ ret = bch2_btree_iter_traverse(&dirent_iter);
+ if (ret)
+ goto err;
+ } else {
+ bch2_inode_nlink_dec(inode_u);
+ }
+
+ if (inode_u->bi_dir == dirent_iter.pos.inode &&
+ inode_u->bi_dir_offset == dirent_iter.pos.offset) {
inode_u->bi_dir = 0;
inode_u->bi_dir_offset = 0;
}
dir_u->bi_mtime = dir_u->bi_ctime = inode_u->bi_ctime = now;
- dir_u->bi_nlink -= S_ISDIR(inode_u->bi_mode);
- bch2_inode_nlink_dec(inode_u);
-
- ret = (S_ISDIR(inode_u->bi_mode)
- ? bch2_empty_dir_trans(trans, inum)
- : 0) ?:
- bch2_dirent_delete_at(trans, &dir_hash, dirent_iter) ?:
- bch2_inode_write(trans, dir_iter, dir_u) ?:
- bch2_inode_write(trans, inode_iter, inode_u);
+ dir_u->bi_nlink -= is_subdir_for_nlink(inode_u);
+
+ ret = bch2_hash_delete_at(trans, bch2_dirent_hash_desc,
+ &dir_hash, &dirent_iter,
+ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?:
+ bch2_inode_write(trans, &dir_iter, dir_u) ?:
+ bch2_inode_write(trans, &inode_iter, inode_u);
err:
- bch2_trans_iter_put(trans, inode_iter);
- bch2_trans_iter_put(trans, dirent_iter);
- bch2_trans_iter_put(trans, dir_iter);
+ bch2_trans_iter_exit(trans, &inode_iter);
+ bch2_trans_iter_exit(trans, &dirent_iter);
+ bch2_trans_iter_exit(trans, &dir_iter);
return ret;
}
@@ -222,8 +346,8 @@ bool bch2_reinherit_attrs(struct bch_inode_unpacked *dst_u,
}
int bch2_rename_trans(struct btree_trans *trans,
- u64 src_dir, struct bch_inode_unpacked *src_dir_u,
- u64 dst_dir, struct bch_inode_unpacked *dst_dir_u,
+ subvol_inum src_dir, struct bch_inode_unpacked *src_dir_u,
+ subvol_inum dst_dir, struct bch_inode_unpacked *dst_dir_u,
struct bch_inode_unpacked *src_inode_u,
struct bch_inode_unpacked *dst_inode_u,
const struct qstr *src_name,
@@ -231,25 +355,27 @@ int bch2_rename_trans(struct btree_trans *trans,
enum bch_rename_mode mode)
{
struct bch_fs *c = trans->c;
- struct btree_iter *src_dir_iter = NULL, *dst_dir_iter = NULL;
- struct btree_iter *src_inode_iter = NULL, *dst_inode_iter = NULL;
+ struct btree_iter src_dir_iter = { NULL };
+ struct btree_iter dst_dir_iter = { NULL };
+ struct btree_iter src_inode_iter = { NULL };
+ struct btree_iter dst_inode_iter = { NULL };
struct bch_hash_info src_hash, dst_hash;
- u64 src_inode, src_offset, dst_inode, dst_offset;
+ subvol_inum src_inum, dst_inum;
+ u64 src_offset, dst_offset;
u64 now = bch2_current_time(c);
int ret;
- src_dir_iter = bch2_inode_peek(trans, src_dir_u, src_dir,
- BTREE_ITER_INTENT);
- ret = PTR_ERR_OR_ZERO(src_dir_iter);
+ ret = bch2_inode_peek(trans, &src_dir_iter, src_dir_u, src_dir,
+ BTREE_ITER_INTENT);
if (ret)
goto err;
src_hash = bch2_hash_info_init(c, src_dir_u);
- if (dst_dir != src_dir) {
- dst_dir_iter = bch2_inode_peek(trans, dst_dir_u, dst_dir,
- BTREE_ITER_INTENT);
- ret = PTR_ERR_OR_ZERO(dst_dir_iter);
+ if (dst_dir.inum != src_dir.inum ||
+ dst_dir.subvol != src_dir.subvol) {
+ ret = bch2_inode_peek(trans, &dst_dir_iter, dst_dir_u, dst_dir,
+ BTREE_ITER_INTENT);
if (ret)
goto err;
@@ -262,22 +388,20 @@ int bch2_rename_trans(struct btree_trans *trans,
ret = bch2_dirent_rename(trans,
src_dir, &src_hash,
dst_dir, &dst_hash,
- src_name, &src_inode, &src_offset,
- dst_name, &dst_inode, &dst_offset,
+ src_name, &src_inum, &src_offset,
+ dst_name, &dst_inum, &dst_offset,
mode);
if (ret)
goto err;
- src_inode_iter = bch2_inode_peek(trans, src_inode_u, src_inode,
- BTREE_ITER_INTENT);
- ret = PTR_ERR_OR_ZERO(src_inode_iter);
+ ret = bch2_inode_peek(trans, &src_inode_iter, src_inode_u, src_inum,
+ BTREE_ITER_INTENT);
if (ret)
goto err;
- if (dst_inode) {
- dst_inode_iter = bch2_inode_peek(trans, dst_inode_u, dst_inode,
- BTREE_ITER_INTENT);
- ret = PTR_ERR_OR_ZERO(dst_inode_iter);
+ if (dst_inum.inum) {
+ ret = bch2_inode_peek(trans, &dst_inode_iter, dst_inode_u, dst_inum,
+ BTREE_ITER_INTENT);
if (ret)
goto err;
}
@@ -307,7 +431,7 @@ int bch2_rename_trans(struct btree_trans *trans,
}
if (S_ISDIR(dst_inode_u->bi_mode) &&
- bch2_empty_dir_trans(trans, dst_inode)) {
+ bch2_empty_dir_trans(trans, dst_inum)) {
ret = -ENOTEMPTY;
goto err;
}
@@ -326,12 +450,12 @@ int bch2_rename_trans(struct btree_trans *trans,
goto err;
}
- if (S_ISDIR(src_inode_u->bi_mode)) {
+ if (is_subdir_for_nlink(src_inode_u)) {
src_dir_u->bi_nlink--;
dst_dir_u->bi_nlink++;
}
- if (dst_inode && S_ISDIR(dst_inode_u->bi_mode)) {
+ if (dst_inum.inum && is_subdir_for_nlink(dst_inode_u)) {
dst_dir_u->bi_nlink--;
src_dir_u->bi_nlink += mode == BCH_RENAME_EXCHANGE;
}
@@ -342,28 +466,28 @@ int bch2_rename_trans(struct btree_trans *trans,
src_dir_u->bi_mtime = now;
src_dir_u->bi_ctime = now;
- if (src_dir != dst_dir) {
+ if (src_dir.inum != dst_dir.inum) {
dst_dir_u->bi_mtime = now;
dst_dir_u->bi_ctime = now;
}
src_inode_u->bi_ctime = now;
- if (dst_inode)
+ if (dst_inum.inum)
dst_inode_u->bi_ctime = now;
- ret = bch2_inode_write(trans, src_dir_iter, src_dir_u) ?:
- (src_dir != dst_dir
- ? bch2_inode_write(trans, dst_dir_iter, dst_dir_u)
+ ret = bch2_inode_write(trans, &src_dir_iter, src_dir_u) ?:
+ (src_dir.inum != dst_dir.inum
+ ? bch2_inode_write(trans, &dst_dir_iter, dst_dir_u)
: 0 ) ?:
- bch2_inode_write(trans, src_inode_iter, src_inode_u) ?:
- (dst_inode
- ? bch2_inode_write(trans, dst_inode_iter, dst_inode_u)
+ bch2_inode_write(trans, &src_inode_iter, src_inode_u) ?:
+ (dst_inum.inum
+ ? bch2_inode_write(trans, &dst_inode_iter, dst_inode_u)
: 0 );
err:
- bch2_trans_iter_put(trans, dst_inode_iter);
- bch2_trans_iter_put(trans, src_inode_iter);
- bch2_trans_iter_put(trans, dst_dir_iter);
- bch2_trans_iter_put(trans, src_dir_iter);
+ bch2_trans_iter_exit(trans, &dst_inode_iter);
+ bch2_trans_iter_exit(trans, &src_inode_iter);
+ bch2_trans_iter_exit(trans, &dst_dir_iter);
+ bch2_trans_iter_exit(trans, &src_dir_iter);
return ret;
}
diff --git a/fs/bcachefs/fs-common.h b/fs/bcachefs/fs-common.h
index 2273b7961c9b..dde237859514 100644
--- a/fs/bcachefs/fs-common.h
+++ b/fs/bcachefs/fs-common.h
@@ -4,27 +4,33 @@
struct posix_acl;
-int bch2_create_trans(struct btree_trans *, u64,
+#define BCH_CREATE_TMPFILE (1U << 0)
+#define BCH_CREATE_SUBVOL (1U << 1)
+#define BCH_CREATE_SNAPSHOT (1U << 2)
+#define BCH_CREATE_SNAPSHOT_RO (1U << 3)
+
+int bch2_create_trans(struct btree_trans *, subvol_inum,
struct bch_inode_unpacked *,
struct bch_inode_unpacked *,
const struct qstr *,
uid_t, gid_t, umode_t, dev_t,
struct posix_acl *,
- struct posix_acl *);
+ struct posix_acl *,
+ subvol_inum, unsigned);
-int bch2_link_trans(struct btree_trans *, u64,
- u64, struct bch_inode_unpacked *,
- struct bch_inode_unpacked *,
+int bch2_link_trans(struct btree_trans *,
+ subvol_inum, struct bch_inode_unpacked *,
+ subvol_inum, struct bch_inode_unpacked *,
const struct qstr *);
-int bch2_unlink_trans(struct btree_trans *,
- u64, struct bch_inode_unpacked *,
+int bch2_unlink_trans(struct btree_trans *, subvol_inum,
+ struct bch_inode_unpacked *,
struct bch_inode_unpacked *,
- const struct qstr *);
+ const struct qstr *, bool);
int bch2_rename_trans(struct btree_trans *,
- u64, struct bch_inode_unpacked *,
- u64, struct bch_inode_unpacked *,
+ subvol_inum, struct bch_inode_unpacked *,
+ subvol_inum, struct bch_inode_unpacked *,
struct bch_inode_unpacked *,
struct bch_inode_unpacked *,
const struct qstr *,
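
A hedged sketch of a snapshot-creating call using the new subvol_inum arguments and flags; the uid/gid/mode values, the qstr, and the source subvolume are placeholders, only the parameter order and flag names come from the declarations above:

	subvol_inum snapshot_src = { .subvol = src_subvol, .inum = 0 };

	ret = bch2_create_trans(trans, dir, &dir_u, &new_inode, name,
				uid, gid, S_IFDIR|0755, 0,
				NULL, NULL, snapshot_src,
				BCH_CREATE_SNAPSHOT|BCH_CREATE_SNAPSHOT_RO);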
diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c
index e6916bbc25eb..5bcdfe3c5890 100644
--- a/fs/bcachefs/fs-io.c
+++ b/fs/bcachefs/fs-io.c
@@ -99,8 +99,7 @@ static int write_invalidate_inode_pages_range(struct address_space *mapping,
* is continually redirtying a specific page
*/
do {
- if (!mapping->nrpages &&
- !mapping->nrexceptional)
+ if (!mapping->nrpages)
return 0;
ret = filemap_write_and_wait_range(mapping, start, end);
@@ -224,6 +223,9 @@ static void i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode,
return;
mutex_lock(&inode->ei_quota_lock);
+ BUG_ON((s64) inode->v.i_blocks + sectors < 0);
+ inode->v.i_blocks += sectors;
+
#ifdef CONFIG_BCACHEFS_QUOTA
if (quota_res && sectors > 0) {
BUG_ON(sectors > quota_res->sectors);
@@ -235,7 +237,6 @@ static void i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode,
bch2_quota_acct(c, inode->ei_qid, Q_SPC, sectors, KEY_TYPE_QUOTA_WARN);
}
#endif
- inode->v.i_blocks += sectors;
mutex_unlock(&inode->ei_quota_lock);
}
@@ -244,24 +245,26 @@ static void i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode,
/* stored in page->private: */
struct bch_page_sector {
- /* Uncompressed, fully allocated replicas: */
- unsigned nr_replicas:3;
+ /* Uncompressed, fully allocated replicas (or on-disk reservation): */
+ unsigned nr_replicas:4;
- /* Owns PAGE_SECTORS * replicas_reserved sized reservation: */
- unsigned replicas_reserved:3;
+ /* Owns PAGE_SECTORS * replicas_reserved sized in-memory reservation: */
+ unsigned replicas_reserved:4;
/* i_sectors: */
enum {
SECTOR_UNALLOCATED,
SECTOR_RESERVED,
SECTOR_DIRTY,
+ SECTOR_DIRTY_RESERVED,
SECTOR_ALLOCATED,
- } state:2;
+ } state:8;
};
struct bch_page_state {
spinlock_t lock;
atomic_t write_count;
+ bool uptodate;
struct bch_page_sector s[PAGE_SECTORS];
};
@@ -282,28 +285,13 @@ static inline struct bch_page_state *bch2_page_state(struct page *page)
/* for newly allocated pages: */
static void __bch2_page_state_release(struct page *page)
{
- struct bch_page_state *s = __bch2_page_state(page);
-
- if (!s)
- return;
-
- ClearPagePrivate(page);
- set_page_private(page, 0);
- put_page(page);
- kfree(s);
+ kfree(detach_page_private(page));
}
static void bch2_page_state_release(struct page *page)
{
- struct bch_page_state *s = bch2_page_state(page);
-
- if (!s)
- return;
-
- ClearPagePrivate(page);
- set_page_private(page, 0);
- put_page(page);
- kfree(s);
+ EBUG_ON(!PageLocked(page));
+ __bch2_page_state_release(page);
}
/* for newly allocated pages: */
@@ -317,13 +305,7 @@ static struct bch_page_state *__bch2_page_state_create(struct page *page,
return NULL;
spin_lock_init(&s->lock);
- /*
- * migrate_page_move_mapping() assumes that pages with private data
- * have their count elevated by 1.
- */
- get_page(page);
- set_page_private(page, (unsigned long) s);
- SetPagePrivate(page);
+ attach_page_private(page, s);
return s;
}
@@ -333,6 +315,212 @@ static struct bch_page_state *bch2_page_state_create(struct page *page,
return bch2_page_state(page) ?: __bch2_page_state_create(page, gfp);
}
+static unsigned bkey_to_sector_state(const struct bkey *k)
+{
+ if (k->type == KEY_TYPE_reservation)
+ return SECTOR_RESERVED;
+ if (bkey_extent_is_allocation(k))
+ return SECTOR_ALLOCATED;
+ return SECTOR_UNALLOCATED;
+}
+
+static void __bch2_page_state_set(struct page *page,
+ unsigned pg_offset, unsigned pg_len,
+ unsigned nr_ptrs, unsigned state)
+{
+ struct bch_page_state *s = bch2_page_state_create(page, __GFP_NOFAIL);
+ unsigned i;
+
+ BUG_ON(pg_offset >= PAGE_SECTORS);
+ BUG_ON(pg_offset + pg_len > PAGE_SECTORS);
+
+ spin_lock(&s->lock);
+
+ for (i = pg_offset; i < pg_offset + pg_len; i++) {
+ s->s[i].nr_replicas = nr_ptrs;
+ s->s[i].state = state;
+ }
+
+ if (i == PAGE_SECTORS)
+ s->uptodate = true;
+
+ spin_unlock(&s->lock);
+}
+
+static int bch2_page_state_set(struct bch_fs *c, subvol_inum inum,
+ struct page **pages, unsigned nr_pages)
+{
+ struct btree_trans trans;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ u64 offset = pages[0]->index << PAGE_SECTORS_SHIFT;
+ unsigned pg_idx = 0;
+ u32 snapshot;
+ int ret;
+
+ bch2_trans_init(&trans, c, 0, 0);
+retry:
+ bch2_trans_begin(&trans);
+
+ ret = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot);
+ if (ret)
+ goto err;
+
+ for_each_btree_key_norestart(&trans, iter, BTREE_ID_extents,
+ SPOS(inum.inum, offset, snapshot),
+ BTREE_ITER_SLOTS, k, ret) {
+ unsigned nr_ptrs = bch2_bkey_nr_ptrs_fully_allocated(k);
+ unsigned state = bkey_to_sector_state(k.k);
+
+ while (pg_idx < nr_pages) {
+ struct page *page = pages[pg_idx];
+ u64 pg_start = page->index << PAGE_SECTORS_SHIFT;
+ u64 pg_end = (page->index + 1) << PAGE_SECTORS_SHIFT;
+ unsigned pg_offset = max(bkey_start_offset(k.k), pg_start) - pg_start;
+ unsigned pg_len = min(k.k->p.offset, pg_end) - pg_offset - pg_start;
+
+ BUG_ON(k.k->p.offset < pg_start);
+ BUG_ON(bkey_start_offset(k.k) > pg_end);
+
+ if (!bch2_page_state_create(page, __GFP_NOFAIL)->uptodate)
+ __bch2_page_state_set(page, pg_offset, pg_len, nr_ptrs, state);
+
+ if (k.k->p.offset < pg_end)
+ break;
+ pg_idx++;
+ }
+
+ if (pg_idx == nr_pages)
+ break;
+ }
+
+ offset = iter.pos.offset;
+ bch2_trans_iter_exit(&trans, &iter);
+err:
+ if (ret == -EINTR)
+ goto retry;
+ bch2_trans_exit(&trans);
+
+ return ret;
+}
+
+static void bch2_bio_page_state_set(struct bio *bio, struct bkey_s_c k)
+{
+ struct bvec_iter iter;
+ struct bio_vec bv;
+ unsigned nr_ptrs = k.k->type == KEY_TYPE_reflink_v
+ ? 0 : bch2_bkey_nr_ptrs_fully_allocated(k);
+ unsigned state = bkey_to_sector_state(k.k);
+
+ bio_for_each_segment(bv, bio, iter)
+ __bch2_page_state_set(bv.bv_page, bv.bv_offset >> 9,
+ bv.bv_len >> 9, nr_ptrs, state);
+}
+
+static void mark_pagecache_unallocated(struct bch_inode_info *inode,
+ u64 start, u64 end)
+{
+ pgoff_t index = start >> PAGE_SECTORS_SHIFT;
+ pgoff_t end_index = (end - 1) >> PAGE_SECTORS_SHIFT;
+ struct pagevec pvec;
+
+ if (end <= start)
+ return;
+
+ pagevec_init(&pvec);
+
+ do {
+ unsigned nr_pages, i, j;
+
+ nr_pages = pagevec_lookup_range(&pvec, inode->v.i_mapping,
+ &index, end_index);
+ for (i = 0; i < nr_pages; i++) {
+ struct page *page = pvec.pages[i];
+ u64 pg_start = page->index << PAGE_SECTORS_SHIFT;
+ u64 pg_end = (page->index + 1) << PAGE_SECTORS_SHIFT;
+ unsigned pg_offset = max(start, pg_start) - pg_start;
+ unsigned pg_len = min(end, pg_end) - pg_offset - pg_start;
+ struct bch_page_state *s;
+
+ BUG_ON(end <= pg_start);
+ BUG_ON(pg_offset >= PAGE_SECTORS);
+ BUG_ON(pg_offset + pg_len > PAGE_SECTORS);
+
+ lock_page(page);
+ s = bch2_page_state(page);
+
+ if (s) {
+ spin_lock(&s->lock);
+ for (j = pg_offset; j < pg_offset + pg_len; j++)
+ s->s[j].nr_replicas = 0;
+ spin_unlock(&s->lock);
+ }
+
+ unlock_page(page);
+ }
+ pagevec_release(&pvec);
+ } while (index <= end_index);
+}
+
+static void mark_pagecache_reserved(struct bch_inode_info *inode,
+ u64 start, u64 end)
+{
+ struct bch_fs *c = inode->v.i_sb->s_fs_info;
+ pgoff_t index = start >> PAGE_SECTORS_SHIFT;
+ pgoff_t end_index = (end - 1) >> PAGE_SECTORS_SHIFT;
+ struct pagevec pvec;
+ s64 i_sectors_delta = 0;
+
+ if (end <= start)
+ return;
+
+ pagevec_init(&pvec);
+
+ do {
+ unsigned nr_pages, i, j;
+
+ nr_pages = pagevec_lookup_range(&pvec, inode->v.i_mapping,
+ &index, end_index);
+ for (i = 0; i < nr_pages; i++) {
+ struct page *page = pvec.pages[i];
+ u64 pg_start = page->index << PAGE_SECTORS_SHIFT;
+ u64 pg_end = (page->index + 1) << PAGE_SECTORS_SHIFT;
+ unsigned pg_offset = max(start, pg_start) - pg_start;
+ unsigned pg_len = min(end, pg_end) - pg_offset - pg_start;
+ struct bch_page_state *s;
+
+ BUG_ON(end <= pg_start);
+ BUG_ON(pg_offset >= PAGE_SECTORS);
+ BUG_ON(pg_offset + pg_len > PAGE_SECTORS);
+
+ lock_page(page);
+ s = bch2_page_state(page);
+
+ if (s) {
+ spin_lock(&s->lock);
+ for (j = pg_offset; j < pg_offset + pg_len; j++)
+ switch (s->s[j].state) {
+ case SECTOR_UNALLOCATED:
+ s->s[j].state = SECTOR_RESERVED;
+ break;
+ case SECTOR_DIRTY:
+ s->s[j].state = SECTOR_DIRTY_RESERVED;
+ i_sectors_delta--;
+ break;
+ default:
+ break;
+ }
+ spin_unlock(&s->lock);
+ }
+
+ unlock_page(page);
+ }
+ pagevec_release(&pvec);
+ } while (index <= end_index);
+
+ i_sectors_acct(c, inode, NULL, i_sectors_delta);
+}
+
static inline unsigned inode_nr_replicas(struct bch_fs *c, struct bch_inode_info *inode)
{
/* XXX: this should not be open coded */
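
A condensed sketch of the snapshot-aware lookup-and-retry pattern the helpers above follow, assuming a subvol_inum and a starting offset; the loop body is a placeholder:

	bch2_trans_init(&trans, c, 0, 0);
retry:
	bch2_trans_begin(&trans);

	ret = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot);
	if (ret)
		goto err;

	for_each_btree_key_norestart(&trans, iter, BTREE_ID_extents,
			SPOS(inum.inum, offset, snapshot),
			BTREE_ITER_SLOTS, k, ret) {
		/* inspect k; positions now carry (inum, offset, snapshot) */
	}
	bch2_trans_iter_exit(&trans, &iter);
err:
	if (ret == -EINTR)	/* transaction restart: start over from the top */
		goto retry;
	bch2_trans_exit(&trans);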
@@ -417,6 +605,8 @@ static int bch2_page_reservation_get(struct bch_fs *c,
if (!s)
return -ENOMEM;
+ BUG_ON(!s->uptodate);
+
for (i = round_down(offset, block_bytes(c)) >> 9;
i < round_up(offset + len, block_bytes(c)) >> 9;
i++) {
@@ -471,16 +661,22 @@ static void bch2_clear_page_bits(struct page *page)
disk_res.sectors += s->s[i].replicas_reserved;
s->s[i].replicas_reserved = 0;
- if (s->s[i].state == SECTOR_DIRTY) {
- dirty_sectors++;
+ switch (s->s[i].state) {
+ case SECTOR_DIRTY:
s->s[i].state = SECTOR_UNALLOCATED;
+ --dirty_sectors;
+ break;
+ case SECTOR_DIRTY_RESERVED:
+ s->s[i].state = SECTOR_RESERVED;
+ break;
+ default:
+ break;
}
}
bch2_disk_reservation_put(c, &disk_res);
- if (dirty_sectors)
- i_sectors_acct(c, inode, NULL, -dirty_sectors);
+ i_sectors_acct(c, inode, NULL, dirty_sectors);
bch2_page_state_release(page);
}
@@ -513,16 +709,22 @@ static void bch2_set_page_dirty(struct bch_fs *c,
s->s[i].replicas_reserved += sectors;
res->disk.sectors -= sectors;
- if (s->s[i].state == SECTOR_UNALLOCATED)
+ switch (s->s[i].state) {
+ case SECTOR_UNALLOCATED:
+ s->s[i].state = SECTOR_DIRTY;
dirty_sectors++;
-
- s->s[i].state = max_t(unsigned, s->s[i].state, SECTOR_DIRTY);
+ break;
+ case SECTOR_RESERVED:
+ s->s[i].state = SECTOR_DIRTY_RESERVED;
+ break;
+ default:
+ break;
+ }
}
spin_unlock(&s->lock);
- if (dirty_sectors)
- i_sectors_acct(c, inode, &res->quota, dirty_sectors);
+ i_sectors_acct(c, inode, &res->quota, dirty_sectors);
if (!PageDirty(page))
__set_page_dirty_nobuffers(page);
@@ -576,7 +778,7 @@ vm_fault_t bch2_page_mkwrite(struct vm_fault *vmf)
struct bch2_page_reservation res;
unsigned len;
loff_t isize;
- int ret = VM_FAULT_LOCKED;
+ int ret;
bch2_page_reservation_init(c, inode, &res);
@@ -602,6 +804,14 @@ vm_fault_t bch2_page_mkwrite(struct vm_fault *vmf)
len = min_t(loff_t, PAGE_SIZE, isize - page_offset(page));
+ if (!bch2_page_state_create(page, __GFP_NOFAIL)->uptodate) {
+ if (bch2_page_state_set(c, inode_inum(inode), &page, 1)) {
+ unlock_page(page);
+ ret = VM_FAULT_SIGBUS;
+ goto out;
+ }
+ }
+
if (bch2_page_reservation_get(c, inode, page, &res, 0, len, true)) {
unlock_page(page);
ret = VM_FAULT_SIGBUS;
@@ -612,6 +822,7 @@ vm_fault_t bch2_page_mkwrite(struct vm_fault *vmf)
bch2_page_reservation_put(c, inode, &res);
wait_for_stable_page(page);
+ ret = VM_FAULT_LOCKED;
out:
bch2_pagecache_add_put(&inode->ei_pagecache_lock);
sb_end_pagefault(inode->v.i_sb);
@@ -646,18 +857,12 @@ int bch2_migrate_page(struct address_space *mapping, struct page *newpage,
EBUG_ON(!PageLocked(page));
EBUG_ON(!PageLocked(newpage));
- ret = migrate_page_move_mapping(mapping, newpage, page, NULL, mode, 0);
+ ret = migrate_page_move_mapping(mapping, newpage, page, 0);
if (ret != MIGRATEPAGE_SUCCESS)
return ret;
- if (PagePrivate(page)) {
- ClearPagePrivate(page);
- get_page(newpage);
- set_page_private(newpage, page_private(page));
- set_page_private(page, 0);
- put_page(page);
- SetPagePrivate(newpage);
- }
+ if (PagePrivate(page))
+ attach_page_private(newpage, detach_page_private(page));
if (mode != MIGRATE_SYNC_NO_COPY)
migrate_page_copy(newpage, page);
@@ -671,10 +876,10 @@ int bch2_migrate_page(struct address_space *mapping, struct page *newpage,
static void bch2_readpages_end_io(struct bio *bio)
{
+ struct bvec_iter_all iter;
struct bio_vec *bv;
- unsigned i;
- bio_for_each_segment_all(bv, bio, i) {
+ bio_for_each_segment_all(bv, bio, iter) {
struct page *page = bv->bv_page;
if (!bio->bi_status) {
@@ -693,31 +898,29 @@ struct readpages_iter {
struct address_space *mapping;
struct page **pages;
unsigned nr_pages;
- unsigned nr_added;
unsigned idx;
pgoff_t offset;
};
static int readpages_iter_init(struct readpages_iter *iter,
- struct address_space *mapping,
- struct list_head *pages, unsigned nr_pages)
+ struct readahead_control *ractl)
{
+ unsigned i, nr_pages = readahead_count(ractl);
+
memset(iter, 0, sizeof(*iter));
- iter->mapping = mapping;
- iter->offset = list_last_entry(pages, struct page, lru)->index;
+ iter->mapping = ractl->mapping;
+ iter->offset = readahead_index(ractl);
+ iter->nr_pages = nr_pages;
iter->pages = kmalloc_array(nr_pages, sizeof(struct page *), GFP_NOFS);
if (!iter->pages)
return -ENOMEM;
- while (!list_empty(pages)) {
- struct page *page = list_last_entry(pages, struct page, lru);
-
- __bch2_page_state_create(page, __GFP_NOFAIL);
-
- iter->pages[iter->nr_pages++] = page;
- list_del(&page->lru);
+ nr_pages = __readahead_batch(ractl, iter->pages, nr_pages);
+ for (i = 0; i < nr_pages; i++) {
+ __bch2_page_state_create(iter->pages[i], __GFP_NOFAIL);
+ put_page(iter->pages[i]);
}
return 0;
@@ -725,69 +928,14 @@ static int readpages_iter_init(struct readpages_iter *iter,
static inline struct page *readpage_iter_next(struct readpages_iter *iter)
{
- struct page *page;
- unsigned i;
- int ret;
-
- BUG_ON(iter->idx > iter->nr_added);
- BUG_ON(iter->nr_added > iter->nr_pages);
-
- if (iter->idx < iter->nr_added)
- goto out;
-
- while (1) {
- if (iter->idx == iter->nr_pages)
- return NULL;
-
- ret = add_to_page_cache_lru_vec(iter->mapping,
- iter->pages + iter->nr_added,
- iter->nr_pages - iter->nr_added,
- iter->offset + iter->nr_added,
- GFP_NOFS);
- if (ret > 0)
- break;
-
- page = iter->pages[iter->nr_added];
- iter->idx++;
- iter->nr_added++;
-
- __bch2_page_state_release(page);
- put_page(page);
- }
-
- iter->nr_added += ret;
+ if (iter->idx >= iter->nr_pages)
+ return NULL;
- for (i = iter->idx; i < iter->nr_added; i++)
- put_page(iter->pages[i]);
-out:
EBUG_ON(iter->pages[iter->idx]->index != iter->offset + iter->idx);
return iter->pages[iter->idx];
}
-static void bch2_add_page_sectors(struct bio *bio, struct bkey_s_c k)
-{
- struct bvec_iter iter;
- struct bio_vec bv;
- unsigned nr_ptrs = k.k->type == KEY_TYPE_reflink_v
- ? 0 : bch2_bkey_nr_ptrs_fully_allocated(k);
- unsigned state = k.k->type == KEY_TYPE_reservation
- ? SECTOR_RESERVED
- : SECTOR_ALLOCATED;
-
- bio_for_each_segment(bv, bio, iter) {
- struct bch_page_state *s = bch2_page_state(bv.bv_page);
- unsigned i;
-
- for (i = bv.bv_offset >> 9;
- i < (bv.bv_offset + bv.bv_len) >> 9;
- i++) {
- s->s[i].nr_replicas = nr_ptrs;
- s->s[i].state = state;
- }
- }
-}
-
static bool extent_partial_reads_expensive(struct bkey_s_c k)
{
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
@@ -807,7 +955,7 @@ static void readpage_bio_extend(struct readpages_iter *iter,
{
while (bio_sectors(bio) < sectors_this_extent &&
bio->bi_vcnt < bio->bi_max_vecs) {
- pgoff_t page_offset = bio_end_sector(bio) >> PAGE_SECTOR_SHIFT;
+ pgoff_t page_offset = bio_end_sector(bio) >> PAGE_SECTORS_SHIFT;
struct page *page = readpage_iter_next(iter);
int ret;
@@ -820,11 +968,8 @@ static void readpage_bio_extend(struct readpages_iter *iter,
if (!get_more)
break;
- rcu_read_lock();
- page = radix_tree_lookup(&iter->mapping->i_pages, page_offset);
- rcu_read_unlock();
-
- if (page && !radix_tree_exceptional_entry(page))
+ page = xa_load(&iter->mapping->i_pages, page_offset);
+ if (page && !xa_is_value(page))
break;
page = __page_cache_alloc(readahead_gfp_mask(iter->mapping));
@@ -851,35 +996,58 @@ static void readpage_bio_extend(struct readpages_iter *iter,
}
}
-static void bchfs_read(struct btree_trans *trans, struct btree_iter *iter,
- struct bch_read_bio *rbio, u64 inum,
+static void bchfs_read(struct btree_trans *trans,
+ struct bch_read_bio *rbio,
+ subvol_inum inum,
struct readpages_iter *readpages_iter)
{
struct bch_fs *c = trans->c;
+ struct btree_iter iter;
struct bkey_buf sk;
int flags = BCH_READ_RETRY_IF_STALE|
BCH_READ_MAY_PROMOTE;
+ u32 snapshot;
int ret = 0;
rbio->c = c;
rbio->start_time = local_clock();
+ rbio->subvol = inum.subvol;
bch2_bkey_buf_init(&sk);
retry:
+ bch2_trans_begin(trans);
+ iter = (struct btree_iter) { NULL };
+
+ ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
+ if (ret)
+ goto err;
+
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
+ SPOS(inum.inum, rbio->bio.bi_iter.bi_sector, snapshot),
+ BTREE_ITER_SLOTS|BTREE_ITER_FILTER_SNAPSHOTS);
while (1) {
struct bkey_s_c k;
unsigned bytes, sectors, offset_into_extent;
enum btree_id data_btree = BTREE_ID_extents;
- bch2_btree_iter_set_pos(iter,
- POS(inum, rbio->bio.bi_iter.bi_sector));
+ /*
+ * read_extent -> io_time_reset may cause a transaction restart
+ * without returning an error; we need to check for that here:
+ */
+ if (!bch2_trans_relock(trans)) {
+ ret = -EINTR;
+ break;
+ }
- k = bch2_btree_iter_peek_slot(iter);
+ bch2_btree_iter_set_pos(&iter,
+ POS(inum.inum, rbio->bio.bi_iter.bi_sector));
+
+ k = bch2_btree_iter_peek_slot(&iter);
ret = bkey_err(k);
if (ret)
break;
- offset_into_extent = iter->pos.offset -
+ offset_into_extent = iter.pos.offset -
bkey_start_offset(k.k);
sectors = k.k->size - offset_into_extent;
@@ -906,10 +1074,9 @@ retry:
if (rbio->bio.bi_iter.bi_size == bytes)
flags |= BCH_READ_LAST_FRAGMENT;
- if (bkey_extent_is_allocation(k.k))
- bch2_add_page_sectors(&rbio->bio, k);
+ bch2_bio_page_state_set(&rbio->bio, k);
- bch2_read_extent(trans, rbio, iter->pos,
+ bch2_read_extent(trans, rbio, iter.pos,
data_btree, k, offset_into_extent, flags);
if (flags & BCH_READ_LAST_FRAGMENT)
@@ -917,13 +1084,19 @@ retry:
swap(rbio->bio.bi_iter.bi_size, bytes);
bio_advance(&rbio->bio, bytes);
+
+ ret = btree_trans_too_many_iters(trans);
+ if (ret)
+ break;
}
+err:
+ bch2_trans_iter_exit(trans, &iter);
if (ret == -EINTR)
goto retry;
if (ret) {
- bch_err_inum_ratelimited(c, inum,
+ bch_err_inum_ratelimited(c, inum.inum,
"read error %i from btree lookup", ret);
rbio->bio.bi_status = BLK_STS_IOERR;
bio_endio(&rbio->bio);
@@ -932,24 +1105,20 @@ retry:
bch2_bkey_buf_exit(&sk, c);
}
-int bch2_readpages(struct file *file, struct address_space *mapping,
- struct list_head *pages, unsigned nr_pages)
+void bch2_readahead(struct readahead_control *ractl)
{
- struct bch_inode_info *inode = to_bch_ei(mapping->host);
+ struct bch_inode_info *inode = to_bch_ei(ractl->mapping->host);
struct bch_fs *c = inode->v.i_sb->s_fs_info;
struct bch_io_opts opts = io_opts(c, &inode->ei_inode);
struct btree_trans trans;
- struct btree_iter *iter;
struct page *page;
struct readpages_iter readpages_iter;
int ret;
- ret = readpages_iter_init(&readpages_iter, mapping, pages, nr_pages);
+ ret = readpages_iter_init(&readpages_iter, ractl);
BUG_ON(ret);
bch2_trans_init(&trans, c, 0, 0);
- iter = bch2_trans_get_iter(&trans, BTREE_ID_extents, POS_MIN,
- BTREE_ITER_SLOTS);
bch2_pagecache_add_get(&inode->ei_pagecache_lock);
@@ -958,7 +1127,7 @@ int bch2_readpages(struct file *file, struct address_space *mapping,
unsigned n = min_t(unsigned,
readpages_iter.nr_pages -
readpages_iter.idx,
- BIO_MAX_PAGES);
+ BIO_MAX_VECS);
struct bch_read_bio *rbio =
rbio_init(bio_alloc_bioset(GFP_NOFS, n, &c->bio_read),
opts);
@@ -966,43 +1135,34 @@ int bch2_readpages(struct file *file, struct address_space *mapping,
readpages_iter.idx++;
bio_set_op_attrs(&rbio->bio, REQ_OP_READ, 0);
- rbio->bio.bi_iter.bi_sector = (sector_t) index << PAGE_SECTOR_SHIFT;
+ rbio->bio.bi_iter.bi_sector = (sector_t) index << PAGE_SECTORS_SHIFT;
rbio->bio.bi_end_io = bch2_readpages_end_io;
BUG_ON(!bio_add_page(&rbio->bio, page, PAGE_SIZE, 0));
- bchfs_read(&trans, iter, rbio, inode->v.i_ino,
+ bchfs_read(&trans, rbio, inode_inum(inode),
&readpages_iter);
}
bch2_pagecache_add_put(&inode->ei_pagecache_lock);
- bch2_trans_iter_put(&trans, iter);
bch2_trans_exit(&trans);
kfree(readpages_iter.pages);
-
- return 0;
}
static void __bchfs_readpage(struct bch_fs *c, struct bch_read_bio *rbio,
- u64 inum, struct page *page)
+ subvol_inum inum, struct page *page)
{
struct btree_trans trans;
- struct btree_iter *iter;
bch2_page_state_create(page, __GFP_NOFAIL);
bio_set_op_attrs(&rbio->bio, REQ_OP_READ, REQ_SYNC);
rbio->bio.bi_iter.bi_sector =
- (sector_t) page->index << PAGE_SECTOR_SHIFT;
+ (sector_t) page->index << PAGE_SECTORS_SHIFT;
BUG_ON(!bio_add_page(&rbio->bio, page, PAGE_SIZE, 0));
bch2_trans_init(&trans, c, 0, 0);
- iter = bch2_trans_get_iter(&trans, BTREE_ID_extents, POS_MIN,
- BTREE_ITER_SLOTS);
-
- bchfs_read(&trans, iter, rbio, inum, NULL);
-
- bch2_trans_iter_put(&trans, iter);
+ bchfs_read(&trans, rbio, inum, NULL);
bch2_trans_exit(&trans);
}
@@ -1016,7 +1176,7 @@ int bch2_readpage(struct file *file, struct page *page)
rbio = rbio_init(bio_alloc_bioset(GFP_NOFS, 1, &c->bio_read), opts);
rbio->bio.bi_end_io = bch2_readpages_end_io;
- __bchfs_readpage(c, rbio, inode->v.i_ino, page);
+ __bchfs_readpage(c, rbio, inode_inum(inode), page);
return 0;
}
@@ -1039,7 +1199,7 @@ static int bch2_read_single_page(struct page *page,
rbio->bio.bi_private = &done;
rbio->bio.bi_end_io = bch2_read_single_page_end_io;
- __bchfs_readpage(c, rbio, inode->v.i_ino, page);
+ __bchfs_readpage(c, rbio, inode_inum(inode), page);
wait_for_completion(&done);
ret = blk_status_to_errno(rbio->bio.bi_status);
@@ -1081,36 +1241,37 @@ static void bch2_writepage_io_done(struct closure *cl)
struct bch_writepage_io, cl);
struct bch_fs *c = io->op.c;
struct bio *bio = &io->op.wbio.bio;
+ struct bvec_iter_all iter;
struct bio_vec *bvec;
- unsigned i, j;
+ unsigned i;
up(&io->op.c->io_in_flight);
if (io->op.error) {
set_bit(EI_INODE_ERROR, &io->inode->ei_flags);
- bio_for_each_segment_all(bvec, bio, i) {
+ bio_for_each_segment_all(bvec, bio, iter) {
struct bch_page_state *s;
SetPageError(bvec->bv_page);
- mapping_set_error(io->inode->v.i_mapping, -EIO);
+ mapping_set_error(bvec->bv_page->mapping, -EIO);
s = __bch2_page_state(bvec->bv_page);
spin_lock(&s->lock);
- for (j = 0; j < PAGE_SECTORS; j++)
- s->s[j].nr_replicas = 0;
+ for (i = 0; i < PAGE_SECTORS; i++)
+ s->s[i].nr_replicas = 0;
spin_unlock(&s->lock);
}
}
if (io->op.flags & BCH_WRITE_WROTE_DATA_INLINE) {
- bio_for_each_segment_all(bvec, bio, i) {
+ bio_for_each_segment_all(bvec, bio, iter) {
struct bch_page_state *s;
s = __bch2_page_state(bvec->bv_page);
spin_lock(&s->lock);
- for (j = 0; j < PAGE_SECTORS; j++)
- s->s[j].nr_replicas = 0;
+ for (i = 0; i < PAGE_SECTORS; i++)
+ s->s[i].nr_replicas = 0;
spin_unlock(&s->lock);
}
}
@@ -1134,7 +1295,7 @@ static void bch2_writepage_io_done(struct closure *cl)
*/
i_sectors_acct(c, io->inode, NULL, io->op.i_sectors_delta);
- bio_for_each_segment_all(bvec, bio, i) {
+ bio_for_each_segment_all(bvec, bio, iter) {
struct bch_page_state *s = __bch2_page_state(bvec->bv_page);
if (atomic_dec_and_test(&s->write_count))
@@ -1168,8 +1329,7 @@ static void bch2_writepage_io_alloc(struct bch_fs *c,
{
struct bch_write_op *op;
- w->io = container_of(bio_alloc_bioset(GFP_NOFS,
- BIO_MAX_PAGES,
+ w->io = container_of(bio_alloc_bioset(GFP_NOFS, BIO_MAX_VECS,
&c->writepage_bioset),
struct bch_writepage_io, op.wbio.bio);
@@ -1179,10 +1339,10 @@ static void bch2_writepage_io_alloc(struct bch_fs *c,
op = &w->io->op;
bch2_write_op_init(op, c, w->opts);
op->target = w->opts.foreground_target;
- op_journal_seq_set(op, &inode->ei_journal_seq);
op->nr_replicas = nr_replicas;
op->res.nr_replicas = nr_replicas;
op->write_point = writepoint_hashed(inode->ei_last_dirtied);
+ op->subvol = inode->ei_subvol;
op->pos = POS(inode->v.i_ino, sector);
op->wbio.bio.bi_iter.bi_sector = sector;
op->wbio.bio.bi_opf = wbc_to_write_flags(wbc);
@@ -1225,16 +1385,16 @@ static int __bch2_writepage(struct page *page,
do_io:
s = bch2_page_state_create(page, __GFP_NOFAIL);
- ret = bch2_get_page_disk_reservation(c, inode, page, true);
- if (ret) {
- SetPageError(page);
- mapping_set_error(page->mapping, ret);
- unlock_page(page);
- return 0;
- }
+ /*
+ * Things get really hairy with errors during writeback:
+ */
+ ret = bch2_get_page_disk_reservation(c, inode, page, false);
+ BUG_ON(ret);
/* Before unlocking the page, get copy of reservations: */
+ spin_lock(&s->lock);
orig = *s;
+ spin_unlock(&s->lock);
for (i = 0; i < PAGE_SECTORS; i++) {
if (s->s[i].state < SECTOR_DIRTY)
@@ -1267,7 +1427,7 @@ do_io:
offset = 0;
while (1) {
- unsigned sectors = 1, dirty_sectors = 0, reserved_sectors = 0;
+ unsigned sectors = 0, dirty_sectors = 0, reserved_sectors = 0;
u64 sector;
while (offset < PAGE_SECTORS &&
@@ -1277,22 +1437,21 @@ do_io:
if (offset == PAGE_SECTORS)
break;
- sector = ((u64) page->index << PAGE_SECTOR_SHIFT) + offset;
-
while (offset + sectors < PAGE_SECTORS &&
- orig.s[offset + sectors].state >= SECTOR_DIRTY)
+ orig.s[offset + sectors].state >= SECTOR_DIRTY) {
+ reserved_sectors += orig.s[offset + sectors].replicas_reserved;
+ dirty_sectors += orig.s[offset + sectors].state == SECTOR_DIRTY;
sectors++;
-
- for (i = offset; i < offset + sectors; i++) {
- reserved_sectors += orig.s[i].replicas_reserved;
- dirty_sectors += orig.s[i].state == SECTOR_DIRTY;
}
+ BUG_ON(!sectors);
+
+ sector = ((u64) page->index << PAGE_SECTORS_SHIFT) + offset;
if (w->io &&
(w->io->op.res.nr_replicas != nr_replicas_this_write ||
- bio_full(&w->io->op.wbio.bio) ||
+ bio_full(&w->io->op.wbio.bio, PAGE_SIZE) ||
w->io->op.wbio.bio.bi_iter.bi_size + (sectors << 9) >=
- (BIO_MAX_PAGES * PAGE_SIZE) ||
+ (BIO_MAX_VECS * PAGE_SIZE) ||
bio_end_sector(&w->io->op.wbio.bio) != sector))
bch2_writepage_do_io(w);
@@ -1403,6 +1562,12 @@ readpage:
if (ret)
goto err;
out:
+ if (!bch2_page_state_create(page, __GFP_NOFAIL)->uptodate) {
+ ret = bch2_page_state_set(c, inode_inum(inode), &page, 1);
+ if (ret)
+ goto out;
+ }
+
ret = bch2_page_reservation_get(c, inode, page, res,
offset, len, true);
if (ret) {
@@ -1532,20 +1697,21 @@ static int __bch2_buffered_write(struct bch_inode_info *inode,
}
while (reserved < len) {
- struct page *page = pages[(offset + reserved) >> PAGE_SHIFT];
+ unsigned i = (offset + reserved) >> PAGE_SHIFT;
+ struct page *page = pages[i];
unsigned pg_offset = (offset + reserved) & (PAGE_SIZE - 1);
unsigned pg_len = min_t(unsigned, len - reserved,
PAGE_SIZE - pg_offset);
-retry_reservation:
- ret = bch2_page_reservation_get(c, inode, page, &res,
- pg_offset, pg_len, true);
- if (ret && !PageUptodate(page)) {
- ret = bch2_read_single_page(page, mapping);
- if (!ret)
- goto retry_reservation;
+ if (!bch2_page_state_create(page, __GFP_NOFAIL)->uptodate) {
+ ret = bch2_page_state_set(c, inode_inum(inode),
+ pages + i, nr_pages - i);
+ if (ret)
+ goto out;
}
+ ret = bch2_page_reservation_get(c, inode, page, &res,
+ pg_offset, pg_len, true);
if (ret)
goto out;
@@ -1561,8 +1727,8 @@ retry_reservation:
unsigned pg_offset = (offset + copied) & (PAGE_SIZE - 1);
unsigned pg_len = min_t(unsigned, len - copied,
PAGE_SIZE - pg_offset);
- unsigned pg_copied = iov_iter_copy_from_user_atomic(page,
- iter, pg_offset, pg_len);
+ unsigned pg_copied = copy_page_from_iter_atomic(page,
+ pg_offset, pg_len, iter);
if (!pg_copied)
break;
@@ -1575,7 +1741,6 @@ retry_reservation:
}
flush_dcache_page(page);
- iov_iter_advance(iter, pg_copied);
copied += pg_copied;
if (pg_copied != pg_len)
@@ -1693,18 +1858,6 @@ again:
/* O_DIRECT reads */
-static void bio_release_pages(struct bio *bio, bool mark_dirty)
-{
- struct bio_vec *bvec;
- unsigned i;
-
- bio_for_each_segment_all(bvec, bio, i) {
- if (mark_dirty && !PageCompound(bvec->bv_page))
- set_page_dirty_lock(bvec->bv_page);
- put_page(bvec->bv_page);
- }
-}
-
static void bio_check_or_release(struct bio *bio, bool check_dirty)
{
if (check_dirty) {
@@ -1768,7 +1921,7 @@ static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter)
iter->count -= shorten;
bio = bio_alloc_bioset(GFP_KERNEL,
- iov_iter_npages(iter, BIO_MAX_PAGES),
+ iov_iter_npages(iter, BIO_MAX_VECS),
&c->dio_read_bioset);
bio->bi_end_io = bch2_direct_IO_read_endio;
@@ -1803,7 +1956,7 @@ static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter)
goto start;
while (iter->count) {
bio = bio_alloc_bioset(GFP_KERNEL,
- iov_iter_npages(iter, BIO_MAX_PAGES),
+ iov_iter_npages(iter, BIO_MAX_VECS),
&c->bio_read);
bio->bi_end_io = bch2_direct_IO_read_split_endio;
start:
@@ -1827,7 +1980,7 @@ start:
if (iter->count)
closure_get(&dio->cl);
- bch2_read(c, rbio_init(bio, opts), inode->v.i_ino);
+ bch2_read(c, rbio_init(bio, opts), inode_inum(inode));
}
iter->count += shorten;
@@ -1882,6 +2035,50 @@ ssize_t bch2_read_iter(struct kiocb *iocb, struct iov_iter *iter)
/* O_DIRECT writes */
+static bool bch2_check_range_allocated(struct bch_fs *c, subvol_inum inum,
+ u64 offset, u64 size,
+ unsigned nr_replicas, bool compressed)
+{
+ struct btree_trans trans;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ u64 end = offset + size;
+ u32 snapshot;
+ bool ret = true;
+ int err;
+
+ bch2_trans_init(&trans, c, 0, 0);
+retry:
+ bch2_trans_begin(&trans);
+
+ err = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot);
+ if (err)
+ goto err;
+
+ for_each_btree_key_norestart(&trans, iter, BTREE_ID_extents,
+ SPOS(inum.inum, offset, snapshot),
+ BTREE_ITER_SLOTS, k, err) {
+ if (bkey_cmp(bkey_start_pos(k.k), POS(inum.inum, end)) >= 0)
+ break;
+
+ if (k.k->p.snapshot != snapshot ||
+ nr_replicas > bch2_bkey_replicas(c, k) ||
+ (!compressed && bch2_bkey_sectors_compressed(k))) {
+ ret = false;
+ break;
+ }
+ }
+
+ offset = iter.pos.offset;
+ bch2_trans_iter_exit(&trans, &iter);
+err:
+ if (err == -EINTR)
+ goto retry;
+ bch2_trans_exit(&trans);
+
+ return err ? false : ret;
+}
+
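The helper above is the first complete instance of the transaction idiom this patch applies throughout fs-io.c: re-resolve the subvolume's snapshot ID at the top of every transaction, iterate with the _norestart variant, and retry the whole transaction when -EINTR signals a restart. Below is a minimal sketch of that idiom in the same in-tree style (it assumes the same headers as fs-io.c); the helper name walk_extents() and the per-key body are illustrative only, not part of the patch:

	static int walk_extents(struct bch_fs *c, subvol_inum inum, u64 offset)
	{
		struct btree_trans trans;
		struct btree_iter iter;
		struct bkey_s_c k;
		u32 snapshot;
		int ret;

		bch2_trans_init(&trans, c, 0, 0);
	retry:
		bch2_trans_begin(&trans);

		/* Snapshot IDs can change across a restart, so re-resolve each time: */
		ret = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot);
		if (ret)
			goto err;

		for_each_btree_key_norestart(&trans, iter, BTREE_ID_extents,
				SPOS(inum.inum, offset, snapshot), 0, k, ret) {
			/* ... per-key work ... */
		}

		bch2_trans_iter_exit(&trans, &iter);
	err:
		if (ret == -EINTR)
			goto retry;
		bch2_trans_exit(&trans);
		return ret;
	}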
static void bch2_dio_write_loop_async(struct bch_write_op *);
static long bch2_dio_write_loop(struct dio_write *dio)
@@ -1892,8 +2089,9 @@ static long bch2_dio_write_loop(struct dio_write *dio)
struct bch_inode_info *inode = file_bch_inode(req->ki_filp);
struct bch_fs *c = inode->v.i_sb->s_fs_info;
struct bio *bio = &dio->op.wbio.bio;
+ struct bvec_iter_all iter;
struct bio_vec *bv;
- unsigned i, unaligned, iter_count;
+ unsigned unaligned, iter_count;
bool sync = dio->sync, dropped_locks;
long ret;
@@ -1906,7 +2104,7 @@ static long bch2_dio_write_loop(struct dio_write *dio)
iter_count = dio->iter.count;
if (kthread)
- use_mm(dio->mm);
+ kthread_use_mm(dio->mm);
BUG_ON(current->faults_disabled_mapping);
current->faults_disabled_mapping = mapping;
@@ -1916,7 +2114,7 @@ static long bch2_dio_write_loop(struct dio_write *dio)
current->faults_disabled_mapping = NULL;
if (kthread)
- unuse_mm(dio->mm);
+ kthread_unuse_mm(dio->mm);
/*
* If the fault handler returned an error but also signalled
@@ -1949,8 +2147,6 @@ static long bch2_dio_write_loop(struct dio_write *dio)
* bio_iov_iter_get_pages was only able to get <
* blocksize worth of pages:
*/
- bio_for_each_segment_all(bv, bio, i)
- put_page(bv->bv_page);
ret = -EFAULT;
goto err;
}
@@ -1958,9 +2154,9 @@ static long bch2_dio_write_loop(struct dio_write *dio)
bch2_write_op_init(&dio->op, c, io_opts(c, &inode->ei_inode));
dio->op.end_io = bch2_dio_write_loop_async;
dio->op.target = dio->op.opts.foreground_target;
- op_journal_seq_set(&dio->op, &inode->ei_journal_seq);
dio->op.write_point = writepoint_hashed((unsigned long) current);
dio->op.nr_replicas = dio->op.opts.data_replicas;
+ dio->op.subvol = inode->ei_subvol;
dio->op.pos = POS(inode->v.i_ino, (u64) req->ki_pos >> 9);
if ((req->ki_flags & IOCB_DSYNC) &&
@@ -1971,8 +2167,8 @@ static long bch2_dio_write_loop(struct dio_write *dio)
ret = bch2_disk_reservation_get(c, &dio->op.res, bio_sectors(bio),
dio->op.opts.data_replicas, 0);
if (unlikely(ret) &&
- !bch2_check_range_allocated(c, dio->op.pos,
- bio_sectors(bio),
+ !bch2_check_range_allocated(c, inode_inum(inode),
+ dio->op.pos.offset, bio_sectors(bio),
dio->op.opts.data_replicas,
dio->op.opts.compression != 0))
goto err;
@@ -2015,8 +2211,10 @@ loop:
i_size_write(&inode->v, req->ki_pos);
spin_unlock(&inode->v.i_lock);
- bio_for_each_segment_all(bv, bio, i)
- put_page(bv->bv_page);
+ if (likely(!bio_flagged(bio, BIO_NO_PAGE_REF)))
+ bio_for_each_segment_all(bv, bio, iter)
+ put_page(bv->bv_page);
+ bio->bi_vcnt = 0;
if (dio->op.error) {
set_bit(EI_INODE_ERROR, &inode->ei_flags);
@@ -2039,6 +2237,9 @@ err:
if (dio->free_iov)
kfree(dio->iter.iov);
+ if (likely(!bio_flagged(bio, BIO_NO_PAGE_REF)))
+ bio_for_each_segment_all(bv, bio, iter)
+ put_page(bv->bv_page);
bio_put(bio);
/* inode->i_dio_count is our ref on inode and thus bch_fs */
@@ -2105,7 +2306,9 @@ ssize_t bch2_direct_write(struct kiocb *req, struct iov_iter *iter)
}
bio = bio_alloc_bioset(GFP_KERNEL,
- iov_iter_npages(iter, BIO_MAX_PAGES),
+ iov_iter_is_bvec(iter)
+ ? 0
+ : iov_iter_npages(iter, BIO_MAX_VECS),
&c->dio_write_bioset);
dio = container_of(bio, struct dio_write, op.wbio.bio);
init_completion(&dio->done);
@@ -2182,45 +2385,58 @@ unlock:
/* fsync: */
-int bch2_fsync(struct file *file, loff_t start, loff_t end, int datasync)
+/*
+ * inode->ei_inode.bi_journal_seq won't be up to date since it's set in an
+ * insert trigger: look up the btree inode instead
+ */
+static int bch2_flush_inode(struct bch_fs *c, subvol_inum inum)
{
- struct bch_inode_info *inode = file_bch_inode(file);
- struct bch_fs *c = inode->v.i_sb->s_fs_info;
- int ret, ret2;
+ struct bch_inode_unpacked inode;
+ int ret;
- ret = file_write_and_wait_range(file, start, end);
+ if (c->opts.journal_flush_disabled)
+ return 0;
+
+ ret = bch2_inode_find_by_inum(c, inum, &inode);
if (ret)
return ret;
- if (datasync && !(inode->v.i_state & I_DIRTY_DATASYNC))
- goto out;
+ return bch2_journal_flush_seq(&c->journal, inode.bi_journal_seq);
+}
- ret = sync_inode_metadata(&inode->v, 1);
- if (ret)
- return ret;
-out:
- if (!c->opts.journal_flush_disabled)
- ret = bch2_journal_flush_seq(&c->journal,
- inode->ei_journal_seq);
- ret2 = file_check_and_advance_wb_err(file);
+int bch2_fsync(struct file *file, loff_t start, loff_t end, int datasync)
+{
+ struct bch_inode_info *inode = file_bch_inode(file);
+ struct bch_fs *c = inode->v.i_sb->s_fs_info;
+ int ret, ret2, ret3;
- return ret ?: ret2;
+ ret = file_write_and_wait_range(file, start, end);
+ ret2 = sync_inode_metadata(&inode->v, 1);
+ ret3 = bch2_flush_inode(c, inode_inum(inode));
+
+ return ret ?: ret2 ?: ret3;
}
/* truncate: */
-static inline int range_has_data(struct bch_fs *c,
- struct bpos start,
- struct bpos end)
+static inline int range_has_data(struct bch_fs *c, u32 subvol,
+ struct bpos start,
+ struct bpos end)
{
struct btree_trans trans;
- struct btree_iter *iter;
+ struct btree_iter iter;
struct bkey_s_c k;
int ret = 0;
bch2_trans_init(&trans, c, 0, 0);
+retry:
+ bch2_trans_begin(&trans);
- for_each_btree_key(&trans, iter, BTREE_ID_extents, start, 0, k, ret) {
+ ret = bch2_subvolume_get_snapshot(&trans, subvol, &start.snapshot);
+ if (ret)
+ goto err;
+
+ for_each_btree_key_norestart(&trans, iter, BTREE_ID_extents, start, 0, k, ret) {
if (bkey_cmp(bkey_start_pos(k.k), end) >= 0)
break;
@@ -2229,9 +2445,14 @@ static inline int range_has_data(struct bch_fs *c,
break;
}
}
- bch2_trans_iter_put(&trans, iter);
+ start = iter.pos;
+ bch2_trans_iter_exit(&trans, &iter);
+err:
+ if (ret == -EINTR)
+ goto retry;
- return bch2_trans_exit(&trans) ?: ret;
+ bch2_trans_exit(&trans);
+ return ret;
}
static int __bch2_truncate_page(struct bch_inode_info *inode,
@@ -2244,6 +2465,7 @@ static int __bch2_truncate_page(struct bch_inode_info *inode,
unsigned end_offset = ((end - 1) & (PAGE_SIZE - 1)) + 1;
unsigned i;
struct page *page;
+ s64 i_sectors_delta = 0;
int ret = 0;
/* Page boundary? Nothing to do */
@@ -2261,9 +2483,9 @@ static int __bch2_truncate_page(struct bch_inode_info *inode,
* XXX: we're doing two index lookups when we end up reading the
* page
*/
- ret = range_has_data(c,
- POS(inode->v.i_ino, index << PAGE_SECTOR_SHIFT),
- POS(inode->v.i_ino, (index + 1) << PAGE_SECTOR_SHIFT));
+ ret = range_has_data(c, inode->ei_subvol,
+ POS(inode->v.i_ino, index << PAGE_SECTORS_SHIFT),
+ POS(inode->v.i_ino, (index + 1) << PAGE_SECTORS_SHIFT));
if (ret <= 0)
return ret;
@@ -2295,9 +2517,21 @@ static int __bch2_truncate_page(struct bch_inode_info *inode,
i < round_down(end_offset, block_bytes(c)) >> 9;
i++) {
s->s[i].nr_replicas = 0;
+ if (s->s[i].state == SECTOR_DIRTY)
+ i_sectors_delta--;
s->s[i].state = SECTOR_UNALLOCATED;
}
+ i_sectors_acct(c, inode, NULL, i_sectors_delta);
+
+ /*
+ * Caller needs to know whether this page will be written out by
+ * writeback - doing an i_size update if necessary - or whether it will
+ * be responsible for the i_size update:
+ */
+ ret = s->s[(min_t(u64, inode->v.i_size - (index << PAGE_SHIFT),
+ PAGE_SIZE) - 1) >> 9].state >= SECTOR_DIRTY;
+
zero_user_segment(page, start_offset, end_offset);
/*
@@ -2306,8 +2540,7 @@ static int __bch2_truncate_page(struct bch_inode_info *inode,
* XXX: because we aren't currently tracking whether the page has actual
 * data in it (vs. just 0s, or only partially written) this is wrong. ick.
*/
- ret = bch2_get_page_disk_reservation(c, inode, page, false);
- BUG_ON(ret);
+ BUG_ON(bch2_get_page_disk_reservation(c, inode, page, false));
/*
* This removes any writeable userspace mappings; we need to force
@@ -2329,11 +2562,25 @@ static int bch2_truncate_page(struct bch_inode_info *inode, loff_t from)
from, round_up(from, PAGE_SIZE));
}
-static int bch2_extend(struct bch_inode_info *inode,
+static int bch2_truncate_pages(struct bch_inode_info *inode,
+ loff_t start, loff_t end)
+{
+ int ret = __bch2_truncate_page(inode, start >> PAGE_SHIFT,
+ start, end);
+
+ if (ret >= 0 &&
+ start >> PAGE_SHIFT != end >> PAGE_SHIFT)
+ ret = __bch2_truncate_page(inode,
+ end >> PAGE_SHIFT,
+ start, end);
+ return ret;
+}
+
+static int bch2_extend(struct user_namespace *mnt_userns,
+ struct bch_inode_info *inode,
struct bch_inode_unpacked *inode_u,
struct iattr *iattr)
{
- struct bch_fs *c = inode->v.i_sb->s_fs_info;
struct address_space *mapping = inode->v.i_mapping;
int ret;
@@ -2347,24 +2594,15 @@ static int bch2_extend(struct bch_inode_info *inode,
return ret;
truncate_setsize(&inode->v, iattr->ia_size);
- setattr_copy(&inode->v, iattr);
- mutex_lock(&inode->ei_update_lock);
- ret = bch2_write_inode_size(c, inode, inode->v.i_size,
- ATTR_MTIME|ATTR_CTIME);
- mutex_unlock(&inode->ei_update_lock);
-
- return ret;
+ return bch2_setattr_nonsize(mnt_userns, inode, iattr);
}
static int bch2_truncate_finish_fn(struct bch_inode_info *inode,
struct bch_inode_unpacked *bi,
void *p)
{
- struct bch_fs *c = inode->v.i_sb->s_fs_info;
-
bi->bi_flags &= ~BCH_INODE_I_SIZE_DIRTY;
- bi->bi_mtime = bi->bi_ctime = bch2_current_time(c);
return 0;
}
@@ -2378,30 +2616,33 @@ static int bch2_truncate_start_fn(struct bch_inode_info *inode,
return 0;
}
-int bch2_truncate(struct bch_inode_info *inode, struct iattr *iattr)
+int bch2_truncate(struct user_namespace *mnt_userns,
+ struct bch_inode_info *inode, struct iattr *iattr)
{
struct bch_fs *c = inode->v.i_sb->s_fs_info;
struct address_space *mapping = inode->v.i_mapping;
struct bch_inode_unpacked inode_u;
- struct btree_trans trans;
- struct btree_iter *iter;
u64 new_i_size = iattr->ia_size;
s64 i_sectors_delta = 0;
int ret = 0;
- inode_dio_wait(&inode->v);
- bch2_pagecache_block_get(&inode->ei_pagecache_lock);
-
/*
- * fetch current on disk i_size: inode is locked, i_size can only
- * increase underneath us:
+ * If the truncate call will change the size of the file, the
+ * cmtimes should be updated. If the size will not change, we
+ * do not need to update the cmtimes.
*/
- bch2_trans_init(&trans, c, 0, 0);
- iter = bch2_inode_peek(&trans, &inode_u, inode->v.i_ino, 0);
- ret = PTR_ERR_OR_ZERO(iter);
- bch2_trans_iter_put(&trans, iter);
- bch2_trans_exit(&trans);
+ if (iattr->ia_size != inode->v.i_size) {
+ if (!(iattr->ia_valid & ATTR_MTIME))
+ ktime_get_coarse_real_ts64(&iattr->ia_mtime);
+ if (!(iattr->ia_valid & ATTR_CTIME))
+ ktime_get_coarse_real_ts64(&iattr->ia_ctime);
+ iattr->ia_valid |= ATTR_MTIME|ATTR_CTIME;
+ }
+ inode_dio_wait(&inode->v);
+ bch2_pagecache_block_get(&inode->ei_pagecache_lock);
+
+ ret = bch2_inode_find_by_inum(c, inode_inum(inode), &inode_u);
if (ret)
goto err;
@@ -2418,12 +2659,14 @@ int bch2_truncate(struct bch_inode_info *inode, struct iattr *iattr)
inode->v.i_size < inode_u.bi_size);
if (iattr->ia_size > inode->v.i_size) {
- ret = bch2_extend(inode, &inode_u, iattr);
+ ret = bch2_extend(mnt_userns, inode, &inode_u, iattr);
goto err;
}
+ iattr->ia_valid &= ~ATTR_SIZE;
+
ret = bch2_truncate_page(inode, iattr->ia_size);
- if (unlikely(ret))
+ if (unlikely(ret < 0))
goto err;
/*
@@ -2457,20 +2700,21 @@ int bch2_truncate(struct bch_inode_info *inode, struct iattr *iattr)
truncate_setsize(&inode->v, iattr->ia_size);
- ret = bch2_fpunch(c, inode->v.i_ino,
+ ret = bch2_fpunch(c, inode_inum(inode),
round_up(iattr->ia_size, block_bytes(c)) >> 9,
- U64_MAX, &inode->ei_journal_seq, &i_sectors_delta);
+ U64_MAX, &i_sectors_delta);
i_sectors_acct(c, inode, NULL, i_sectors_delta);
+ BUG_ON(!inode->v.i_size && inode->v.i_blocks);
+
if (unlikely(ret))
goto err;
- setattr_copy(&inode->v, iattr);
-
mutex_lock(&inode->ei_update_lock);
- ret = bch2_write_inode(c, inode, bch2_truncate_finish_fn, NULL,
- ATTR_MTIME|ATTR_CTIME);
+ ret = bch2_write_inode(c, inode, bch2_truncate_finish_fn, NULL, 0);
mutex_unlock(&inode->ei_update_lock);
+
+ ret = bch2_setattr_nonsize(mnt_userns, inode, iattr);
err:
bch2_pagecache_block_put(&inode->ei_pagecache_lock);
return ret;
@@ -2490,49 +2734,39 @@ static int inode_update_times_fn(struct bch_inode_info *inode,
static long bchfs_fpunch(struct bch_inode_info *inode, loff_t offset, loff_t len)
{
struct bch_fs *c = inode->v.i_sb->s_fs_info;
- u64 discard_start = round_up(offset, block_bytes(c)) >> 9;
- u64 discard_end = round_down(offset + len, block_bytes(c)) >> 9;
+ u64 end = offset + len;
+ u64 block_start = round_up(offset, block_bytes(c));
+ u64 block_end = round_down(end, block_bytes(c));
+ bool truncated_last_page;
int ret = 0;
- inode_lock(&inode->v);
- inode_dio_wait(&inode->v);
- bch2_pagecache_block_get(&inode->ei_pagecache_lock);
-
- ret = __bch2_truncate_page(inode,
- offset >> PAGE_SHIFT,
- offset, offset + len);
- if (unlikely(ret))
+ ret = bch2_truncate_pages(inode, offset, end);
+ if (unlikely(ret < 0))
goto err;
- if (offset >> PAGE_SHIFT !=
- (offset + len) >> PAGE_SHIFT) {
- ret = __bch2_truncate_page(inode,
- (offset + len) >> PAGE_SHIFT,
- offset, offset + len);
- if (unlikely(ret))
- goto err;
- }
+ truncated_last_page = ret;
- truncate_pagecache_range(&inode->v, offset, offset + len - 1);
+ truncate_pagecache_range(&inode->v, offset, end - 1);
- if (discard_start < discard_end) {
+ if (block_start < block_end) {
s64 i_sectors_delta = 0;
- ret = bch2_fpunch(c, inode->v.i_ino,
- discard_start, discard_end,
- &inode->ei_journal_seq,
+ ret = bch2_fpunch(c, inode_inum(inode),
+ block_start >> 9, block_end >> 9,
&i_sectors_delta);
i_sectors_acct(c, inode, NULL, i_sectors_delta);
}
mutex_lock(&inode->ei_update_lock);
- ret = bch2_write_inode(c, inode, inode_update_times_fn, NULL,
- ATTR_MTIME|ATTR_CTIME) ?: ret;
+ if (end >= inode->v.i_size && !truncated_last_page) {
+ ret = bch2_write_inode_size(c, inode, inode->v.i_size,
+ ATTR_MTIME|ATTR_CTIME);
+ } else {
+ ret = bch2_write_inode(c, inode, inode_update_times_fn, NULL,
+ ATTR_MTIME|ATTR_CTIME);
+ }
mutex_unlock(&inode->ei_update_lock);
err:
- bch2_pagecache_block_put(&inode->ei_pagecache_lock);
- inode_unlock(&inode->v);
-
return ret;
}
@@ -2544,7 +2778,7 @@ static long bchfs_fcollapse_finsert(struct bch_inode_info *inode,
struct address_space *mapping = inode->v.i_mapping;
struct bkey_buf copy;
struct btree_trans trans;
- struct btree_iter *src, *dst, *del;
+ struct btree_iter src, dst, del;
loff_t shift, new_size;
u64 src_start;
int ret = 0;
@@ -2552,31 +2786,18 @@ static long bchfs_fcollapse_finsert(struct bch_inode_info *inode,
if ((offset | len) & (block_bytes(c) - 1))
return -EINVAL;
- /*
- * We need i_mutex to keep the page cache consistent with the extents
- * btree, and the btree consistent with i_size - we don't need outside
- * locking for the extents btree itself, because we're using linked
- * iterators
- */
- inode_lock(&inode->v);
- inode_dio_wait(&inode->v);
- bch2_pagecache_block_get(&inode->ei_pagecache_lock);
-
if (insert) {
- ret = -EFBIG;
if (inode->v.i_sb->s_maxbytes - inode->v.i_size < len)
- goto err;
+ return -EFBIG;
- ret = -EINVAL;
if (offset >= inode->v.i_size)
- goto err;
+ return -EINVAL;
src_start = U64_MAX;
shift = len;
} else {
- ret = -EINVAL;
if (offset + len >= inode->v.i_size)
- goto err;
+ return -EINVAL;
src_start = offset + len;
shift = -len;
@@ -2586,7 +2807,7 @@ static long bchfs_fcollapse_finsert(struct bch_inode_info *inode,
ret = write_invalidate_inode_pages_range(mapping, offset, LLONG_MAX);
if (ret)
- goto err;
+ return ret;
if (insert) {
i_size_write(&inode->v, new_size);
@@ -2597,23 +2818,22 @@ static long bchfs_fcollapse_finsert(struct bch_inode_info *inode,
} else {
s64 i_sectors_delta = 0;
- ret = bch2_fpunch(c, inode->v.i_ino,
+ ret = bch2_fpunch(c, inode_inum(inode),
offset >> 9, (offset + len) >> 9,
- &inode->ei_journal_seq,
&i_sectors_delta);
i_sectors_acct(c, inode, NULL, i_sectors_delta);
if (ret)
- goto err;
+ return ret;
}
bch2_bkey_buf_init(&copy);
bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024);
- src = bch2_trans_get_iter(&trans, BTREE_ID_extents,
+ bch2_trans_iter_init(&trans, &src, BTREE_ID_extents,
POS(inode->v.i_ino, src_start >> 9),
BTREE_ITER_INTENT);
- dst = bch2_trans_copy_iter(&trans, src);
- del = bch2_trans_copy_iter(&trans, src);
+ bch2_trans_copy_iter(&dst, &src);
+ bch2_trans_copy_iter(&del, &src);
while (ret == 0 || ret == -EINTR) {
struct disk_reservation disk_res =
@@ -2624,10 +2844,24 @@ static long bchfs_fcollapse_finsert(struct bch_inode_info *inode,
struct bpos move_pos = POS(inode->v.i_ino, offset >> 9);
struct bpos atomic_end;
unsigned trigger_flags = 0;
+ u32 snapshot;
+
+ bch2_trans_begin(&trans);
+
+ ret = bch2_subvolume_get_snapshot(&trans,
+ inode->ei_subvol, &snapshot);
+ if (ret)
+ continue;
+
+ bch2_btree_iter_set_snapshot(&src, snapshot);
+ bch2_btree_iter_set_snapshot(&dst, snapshot);
+ bch2_btree_iter_set_snapshot(&del, snapshot);
+
+ bch2_trans_begin(&trans);
k = insert
- ? bch2_btree_iter_peek_prev(src)
- : bch2_btree_iter_peek(src);
+ ? bch2_btree_iter_peek_prev(&src)
+ : bch2_btree_iter_peek(&src);
if ((ret = bkey_err(k)))
continue;
@@ -2645,9 +2879,9 @@ reassemble:
bch2_cut_front(move_pos, copy.k);
copy.k->k.p.offset += shift >> 9;
- bch2_btree_iter_set_pos(dst, bkey_start_pos(&copy.k->k));
+ bch2_btree_iter_set_pos(&dst, bkey_start_pos(&copy.k->k));
- ret = bch2_extent_atomic_end(dst, copy.k, &atomic_end);
+ ret = bch2_extent_atomic_end(&trans, &dst, copy.k, &atomic_end);
if (ret)
continue;
@@ -2665,7 +2899,7 @@ reassemble:
delete.k.p = copy.k->k.p;
delete.k.size = copy.k->k.size;
delete.k.p.offset -= shift >> 9;
- bch2_btree_iter_set_pos(del, bkey_start_pos(&delete.k));
+ bch2_btree_iter_set_pos(&del, bkey_start_pos(&delete.k));
next_pos = insert ? bkey_start_pos(&delete.k) : delete.k.p;
@@ -2686,35 +2920,36 @@ reassemble:
BUG_ON(ret);
}
- ret = bch2_trans_update(&trans, del, &delete, trigger_flags) ?:
- bch2_trans_update(&trans, dst, copy.k, trigger_flags) ?:
- bch2_trans_commit(&trans, &disk_res,
- &inode->ei_journal_seq,
+ ret = bch2_btree_iter_traverse(&del) ?:
+ bch2_trans_update(&trans, &del, &delete, trigger_flags) ?:
+ bch2_trans_update(&trans, &dst, copy.k, trigger_flags) ?:
+ bch2_trans_commit(&trans, &disk_res, NULL,
BTREE_INSERT_NOFAIL);
bch2_disk_reservation_put(c, &disk_res);
if (!ret)
- bch2_btree_iter_set_pos(src, next_pos);
+ bch2_btree_iter_set_pos(&src, next_pos);
}
- bch2_trans_iter_put(&trans, del);
- bch2_trans_iter_put(&trans, dst);
- bch2_trans_iter_put(&trans, src);
+ bch2_trans_iter_exit(&trans, &del);
+ bch2_trans_iter_exit(&trans, &dst);
+ bch2_trans_iter_exit(&trans, &src);
bch2_trans_exit(&trans);
bch2_bkey_buf_exit(&copy, c);
if (ret)
- goto err;
+ return ret;
+ mutex_lock(&inode->ei_update_lock);
if (!insert) {
i_size_write(&inode->v, new_size);
- mutex_lock(&inode->ei_update_lock);
ret = bch2_write_inode_size(c, inode, new_size,
ATTR_MTIME|ATTR_CTIME);
- mutex_unlock(&inode->ei_update_lock);
+ } else {
+ /* We need an inode update to update bi_journal_seq for fsync: */
+ ret = bch2_write_inode(c, inode, inode_update_times_fn, NULL,
+ ATTR_MTIME|ATTR_CTIME);
}
-err:
- bch2_pagecache_block_put(&inode->ei_pagecache_lock);
- inode_unlock(&inode->v);
+ mutex_unlock(&inode->ei_update_lock);
return ret;
}
@@ -2723,41 +2958,49 @@ static int __bchfs_fallocate(struct bch_inode_info *inode, int mode,
{
struct bch_fs *c = inode->v.i_sb->s_fs_info;
struct btree_trans trans;
- struct btree_iter *iter;
+ struct btree_iter iter;
struct bpos end_pos = POS(inode->v.i_ino, end_sector);
unsigned replicas = io_opts(c, &inode->ei_inode).data_replicas;
int ret = 0;
bch2_trans_init(&trans, c, BTREE_ITER_MAX, 512);
- iter = bch2_trans_get_iter(&trans, BTREE_ID_extents,
+ bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents,
POS(inode->v.i_ino, start_sector),
BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
- while (!ret && bkey_cmp(iter->pos, end_pos) < 0) {
+ while (!ret && bkey_cmp(iter.pos, end_pos) < 0) {
s64 i_sectors_delta = 0;
struct disk_reservation disk_res = { 0 };
struct quota_res quota_res = { 0 };
struct bkey_i_reservation reservation;
struct bkey_s_c k;
unsigned sectors;
+ u32 snapshot;
bch2_trans_begin(&trans);
- k = bch2_btree_iter_peek_slot(iter);
+ ret = bch2_subvolume_get_snapshot(&trans,
+ inode->ei_subvol, &snapshot);
+ if (ret)
+ goto bkey_err;
+
+ bch2_btree_iter_set_snapshot(&iter, snapshot);
+
+ k = bch2_btree_iter_peek_slot(&iter);
if ((ret = bkey_err(k)))
goto bkey_err;
/* already reserved */
if (k.k->type == KEY_TYPE_reservation &&
bkey_s_c_to_reservation(k).v->nr_replicas >= replicas) {
- bch2_btree_iter_next_slot(iter);
+ bch2_btree_iter_advance(&iter);
continue;
}
if (bkey_extent_is_data(k.k) &&
!(mode & FALLOC_FL_ZERO_RANGE)) {
- bch2_btree_iter_next_slot(iter);
+ bch2_btree_iter_advance(&iter);
continue;
}
@@ -2766,7 +3009,7 @@ static int __bchfs_fallocate(struct bch_inode_info *inode, int mode,
reservation.k.p = k.k->p;
reservation.k.size = k.k->size;
- bch2_cut_front(iter->pos, &reservation.k_i);
+ bch2_cut_front(iter.pos, &reservation.k_i);
bch2_cut_back(end_pos, &reservation.k_i);
sectors = reservation.k.size;
@@ -2790,9 +3033,12 @@ static int __bchfs_fallocate(struct bch_inode_info *inode, int mode,
reservation.v.nr_replicas = disk_res.nr_replicas;
}
- ret = bch2_extent_update(&trans, iter, &reservation.k_i,
- &disk_res, &inode->ei_journal_seq,
+ ret = bch2_extent_update(&trans, inode_inum(inode), &iter,
+ &reservation.k_i,
+ &disk_res, NULL,
0, &i_sectors_delta, true);
+ if (ret)
+ goto bkey_err;
i_sectors_acct(c, inode, &quota_res, i_sectors_delta);
bkey_err:
bch2_quota_reservation_put(c, inode, &quota_res);
@@ -2800,7 +3046,21 @@ bkey_err:
if (ret == -EINTR)
ret = 0;
}
- bch2_trans_iter_put(&trans, iter);
+
+ bch2_trans_unlock(&trans); /* lock ordering, before taking pagecache locks: */
+ mark_pagecache_reserved(inode, start_sector, iter.pos.offset);
+
+ if (ret == -ENOSPC && (mode & FALLOC_FL_ZERO_RANGE)) {
+ struct quota_res quota_res = { 0 };
+ s64 i_sectors_delta = 0;
+
+ bch2_fpunch_at(&trans, &iter, inode_inum(inode),
+ end_sector, &i_sectors_delta);
+ i_sectors_acct(c, inode, &quota_res, i_sectors_delta);
+ bch2_quota_reservation_put(c, inode, &quota_res);
+ }
+
+ bch2_trans_iter_exit(&trans, &iter);
bch2_trans_exit(&trans);
return ret;
}
@@ -2808,77 +3068,58 @@ bkey_err:
static long bchfs_fallocate(struct bch_inode_info *inode, int mode,
loff_t offset, loff_t len)
{
- struct address_space *mapping = inode->v.i_mapping;
struct bch_fs *c = inode->v.i_sb->s_fs_info;
- loff_t end = offset + len;
- loff_t block_start = round_down(offset, block_bytes(c));
- loff_t block_end = round_up(end, block_bytes(c));
- int ret;
-
- inode_lock(&inode->v);
- inode_dio_wait(&inode->v);
- bch2_pagecache_block_get(&inode->ei_pagecache_lock);
+ u64 end = offset + len;
+ u64 block_start = round_down(offset, block_bytes(c));
+ u64 block_end = round_up(end, block_bytes(c));
+ bool truncated_last_page = false;
+ int ret, ret2 = 0;
if (!(mode & FALLOC_FL_KEEP_SIZE) && end > inode->v.i_size) {
ret = inode_newsize_ok(&inode->v, end);
if (ret)
- goto err;
+ return ret;
}
if (mode & FALLOC_FL_ZERO_RANGE) {
- ret = __bch2_truncate_page(inode,
- offset >> PAGE_SHIFT,
- offset, end);
-
- if (!ret &&
- offset >> PAGE_SHIFT != end >> PAGE_SHIFT)
- ret = __bch2_truncate_page(inode,
- end >> PAGE_SHIFT,
- offset, end);
+ ret = bch2_truncate_pages(inode, offset, end);
+ if (unlikely(ret < 0))
+ return ret;
- if (unlikely(ret))
- goto err;
+ truncated_last_page = ret;
truncate_pagecache_range(&inode->v, offset, end - 1);
+
+ block_start = round_up(offset, block_bytes(c));
+ block_end = round_down(end, block_bytes(c));
}
ret = __bchfs_fallocate(inode, mode, block_start >> 9, block_end >> 9);
- if (ret)
- goto err;
/*
- * Do we need to extend the file?
- *
- * If we zeroed up to the end of the file, we dropped whatever writes
- * were going to write out the current i_size, so we have to extend
- * manually even if FL_KEEP_SIZE was set:
+ * On -ENOSPC in ZERO_RANGE mode, we still want to do the inode update,
+ * so that the VFS cache i_size is consistent with the btree i_size:
*/
- if (end >= inode->v.i_size &&
- (!(mode & FALLOC_FL_KEEP_SIZE) ||
- (mode & FALLOC_FL_ZERO_RANGE))) {
+ if (ret &&
+ !(ret == -ENOSPC && (mode & FALLOC_FL_ZERO_RANGE)))
+ return ret;
- /*
- * Sync existing appends before extending i_size,
- * as in bch2_extend():
- */
- ret = filemap_write_and_wait_range(mapping,
- inode->ei_inode.bi_size, S64_MAX);
- if (ret)
- goto err;
+ if (mode & FALLOC_FL_KEEP_SIZE && end > inode->v.i_size)
+ end = inode->v.i_size;
- if (mode & FALLOC_FL_KEEP_SIZE)
- end = inode->v.i_size;
- else
- i_size_write(&inode->v, end);
+ if (end >= inode->v.i_size &&
+ (((mode & FALLOC_FL_ZERO_RANGE) && !truncated_last_page) ||
+ !(mode & FALLOC_FL_KEEP_SIZE))) {
+ spin_lock(&inode->v.i_lock);
+ i_size_write(&inode->v, end);
+ spin_unlock(&inode->v.i_lock);
mutex_lock(&inode->ei_update_lock);
- ret = bch2_write_inode_size(c, inode, end, 0);
+ ret2 = bch2_write_inode_size(c, inode, end, 0);
mutex_unlock(&inode->ei_update_lock);
}
-err:
- bch2_pagecache_block_put(&inode->ei_pagecache_lock);
- inode_unlock(&inode->v);
- return ret;
+
+ return ret ?: ret2;
}
long bch2_fallocate_dispatch(struct file *file, int mode,
@@ -2891,6 +3132,10 @@ long bch2_fallocate_dispatch(struct file *file, int mode,
if (!percpu_ref_tryget(&c->writes))
return -EROFS;
+ inode_lock(&inode->v);
+ inode_dio_wait(&inode->v);
+ bch2_pagecache_block_get(&inode->ei_pagecache_lock);
+
if (!(mode & ~(FALLOC_FL_KEEP_SIZE|FALLOC_FL_ZERO_RANGE)))
ret = bchfs_fallocate(inode, mode, offset, len);
else if (mode == (FALLOC_FL_PUNCH_HOLE|FALLOC_FL_KEEP_SIZE))
@@ -2902,277 +3147,14 @@ long bch2_fallocate_dispatch(struct file *file, int mode,
else
ret = -EOPNOTSUPP;
+
+ bch2_pagecache_block_put(&inode->ei_pagecache_lock);
+ inode_unlock(&inode->v);
percpu_ref_put(&c->writes);
return ret;
}
-static void mark_range_unallocated(struct bch_inode_info *inode,
- loff_t start, loff_t end)
-{
- pgoff_t index = start >> PAGE_SHIFT;
- pgoff_t end_index = (end - 1) >> PAGE_SHIFT;
- struct pagevec pvec;
-
- pagevec_init(&pvec);
-
- do {
- unsigned nr_pages, i, j;
-
- nr_pages = pagevec_lookup_range(&pvec, inode->v.i_mapping,
- &index, end_index);
- if (nr_pages == 0)
- break;
-
- for (i = 0; i < nr_pages; i++) {
- struct page *page = pvec.pages[i];
- struct bch_page_state *s;
-
- lock_page(page);
- s = bch2_page_state(page);
-
- if (s) {
- spin_lock(&s->lock);
- for (j = 0; j < PAGE_SECTORS; j++)
- s->s[j].nr_replicas = 0;
- spin_unlock(&s->lock);
- }
-
- unlock_page(page);
- }
- pagevec_release(&pvec);
- } while (index <= end_index);
-}
-
-static int generic_access_check_limits(struct file *file, loff_t pos,
- loff_t *count)
-{
- struct inode *inode = file->f_mapping->host;
- loff_t max_size = inode->i_sb->s_maxbytes;
-
- if (!(file->f_flags & O_LARGEFILE))
- max_size = MAX_NON_LFS;
-
- if (unlikely(pos >= max_size))
- return -EFBIG;
- *count = min(*count, max_size - pos);
- return 0;
-}
-
-static int generic_write_check_limits(struct file *file, loff_t pos,
- loff_t *count)
-{
- loff_t limit = rlimit(RLIMIT_FSIZE);
-
- if (limit != RLIM_INFINITY) {
- if (pos >= limit) {
- send_sig(SIGXFSZ, current, 0);
- return -EFBIG;
- }
- *count = min(*count, limit - pos);
- }
-
- return generic_access_check_limits(file, pos, count);
-}
-
-static int generic_remap_checks(struct file *file_in, loff_t pos_in,
- struct file *file_out, loff_t pos_out,
- loff_t *req_count, unsigned int remap_flags)
-{
- struct inode *inode_in = file_in->f_mapping->host;
- struct inode *inode_out = file_out->f_mapping->host;
- uint64_t count = *req_count;
- uint64_t bcount;
- loff_t size_in, size_out;
- loff_t bs = inode_out->i_sb->s_blocksize;
- int ret;
-
- /* The start of both ranges must be aligned to an fs block. */
- if (!IS_ALIGNED(pos_in, bs) || !IS_ALIGNED(pos_out, bs))
- return -EINVAL;
-
- /* Ensure offsets don't wrap. */
- if (pos_in + count < pos_in || pos_out + count < pos_out)
- return -EINVAL;
-
- size_in = i_size_read(inode_in);
- size_out = i_size_read(inode_out);
-
- /* Dedupe requires both ranges to be within EOF. */
- if ((remap_flags & REMAP_FILE_DEDUP) &&
- (pos_in >= size_in || pos_in + count > size_in ||
- pos_out >= size_out || pos_out + count > size_out))
- return -EINVAL;
-
- /* Ensure the infile range is within the infile. */
- if (pos_in >= size_in)
- return -EINVAL;
- count = min(count, size_in - (uint64_t)pos_in);
-
- ret = generic_access_check_limits(file_in, pos_in, &count);
- if (ret)
- return ret;
-
- ret = generic_write_check_limits(file_out, pos_out, &count);
- if (ret)
- return ret;
-
- /*
- * If the user wanted us to link to the infile's EOF, round up to the
- * next block boundary for this check.
- *
- * Otherwise, make sure the count is also block-aligned, having
- * already confirmed the starting offsets' block alignment.
- */
- if (pos_in + count == size_in) {
- bcount = ALIGN(size_in, bs) - pos_in;
- } else {
- if (!IS_ALIGNED(count, bs))
- count = ALIGN_DOWN(count, bs);
- bcount = count;
- }
-
- /* Don't allow overlapped cloning within the same file. */
- if (inode_in == inode_out &&
- pos_out + bcount > pos_in &&
- pos_out < pos_in + bcount)
- return -EINVAL;
-
- /*
- * We shortened the request but the caller can't deal with that, so
- * bounce the request back to userspace.
- */
- if (*req_count != count && !(remap_flags & REMAP_FILE_CAN_SHORTEN))
- return -EINVAL;
-
- *req_count = count;
- return 0;
-}
-
-static int generic_remap_check_len(struct inode *inode_in,
- struct inode *inode_out,
- loff_t pos_out,
- loff_t *len,
- unsigned int remap_flags)
-{
- u64 blkmask = i_blocksize(inode_in) - 1;
- loff_t new_len = *len;
-
- if ((*len & blkmask) == 0)
- return 0;
-
- if ((remap_flags & REMAP_FILE_DEDUP) ||
- pos_out + *len < i_size_read(inode_out))
- new_len &= ~blkmask;
-
- if (new_len == *len)
- return 0;
-
- if (remap_flags & REMAP_FILE_CAN_SHORTEN) {
- *len = new_len;
- return 0;
- }
-
- return (remap_flags & REMAP_FILE_DEDUP) ? -EBADE : -EINVAL;
-}
-
-static int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in,
- struct file *file_out, loff_t pos_out,
- loff_t *len, unsigned int remap_flags)
-{
- struct inode *inode_in = file_inode(file_in);
- struct inode *inode_out = file_inode(file_out);
- bool same_inode = (inode_in == inode_out);
- int ret;
-
- /* Don't touch certain kinds of inodes */
- if (IS_IMMUTABLE(inode_out))
- return -EPERM;
-
- if (IS_SWAPFILE(inode_in) || IS_SWAPFILE(inode_out))
- return -ETXTBSY;
-
- /* Don't reflink dirs, pipes, sockets... */
- if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode))
- return -EISDIR;
- if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode))
- return -EINVAL;
-
- /* Zero length dedupe exits immediately; reflink goes to EOF. */
- if (*len == 0) {
- loff_t isize = i_size_read(inode_in);
-
- if ((remap_flags & REMAP_FILE_DEDUP) || pos_in == isize)
- return 0;
- if (pos_in > isize)
- return -EINVAL;
- *len = isize - pos_in;
- if (*len == 0)
- return 0;
- }
-
- /* Check that we don't violate system file offset limits. */
- ret = generic_remap_checks(file_in, pos_in, file_out, pos_out, len,
- remap_flags);
- if (ret)
- return ret;
-
- /* Wait for the completion of any pending IOs on both files */
- inode_dio_wait(inode_in);
- if (!same_inode)
- inode_dio_wait(inode_out);
-
- ret = filemap_write_and_wait_range(inode_in->i_mapping,
- pos_in, pos_in + *len - 1);
- if (ret)
- return ret;
-
- ret = filemap_write_and_wait_range(inode_out->i_mapping,
- pos_out, pos_out + *len - 1);
- if (ret)
- return ret;
-
- /*
- * Check that the extents are the same.
- */
- if (remap_flags & REMAP_FILE_DEDUP) {
- bool is_same = false;
-
- ret = vfs_dedupe_file_range_compare(inode_in, pos_in,
- inode_out, pos_out, *len, &is_same);
- if (ret)
- return ret;
- if (!is_same)
- return -EBADE;
- }
-
- ret = generic_remap_check_len(inode_in, inode_out, pos_out, len,
- remap_flags);
- if (ret)
- return ret;
-
- /* If can't alter the file contents, we're done. */
- if (!(remap_flags & REMAP_FILE_DEDUP)) {
- /* Update the timestamps, since we can alter file contents. */
- if (!(file_out->f_mode & FMODE_NOCMTIME)) {
- ret = file_update_time(file_out);
- if (ret)
- return ret;
- }
-
- /*
- * Clear the security bits if the process is not being run by
- * root. This keeps people from modifying setuid and setgid
- * binaries.
- */
- ret = file_remove_privs(file_out);
- if (ret)
- return ret;
- }
-
- return 0;
-}
-
loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src,
struct file *file_dst, loff_t pos_dst,
loff_t len, unsigned remap_flags)
@@ -3218,13 +3200,13 @@ loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src,
if (ret)
goto err;
- mark_range_unallocated(src, pos_src, pos_src + aligned_len);
+ mark_pagecache_unallocated(src, pos_src >> 9,
+ (pos_src + aligned_len) >> 9);
ret = bch2_remap_range(c,
- POS(dst->v.i_ino, pos_dst >> 9),
- POS(src->v.i_ino, pos_src >> 9),
+ inode_inum(dst), pos_dst >> 9,
+ inode_inum(src), pos_src >> 9,
aligned_len >> 9,
- &dst->ei_journal_seq,
pos_dst + len, &i_sectors_delta);
if (ret < 0)
goto err;
@@ -3242,10 +3224,9 @@ loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src,
i_size_write(&dst->v, pos_dst + ret);
spin_unlock(&dst->v.i_lock);
- if (((file_dst->f_flags & (__O_SYNC | O_DSYNC)) ||
- IS_SYNC(file_inode(file_dst))) &&
- !c->opts.journal_flush_disabled)
- ret = bch2_journal_flush_seq(&c->journal, dst->ei_journal_seq);
+ if ((file_dst->f_flags & (__O_SYNC | O_DSYNC)) ||
+ IS_SYNC(file_inode(file_dst)))
+ ret = bch2_flush_inode(c, inode_inum(dst));
err:
bch2_unlock_inodes(INODE_LOCK|INODE_PAGECACHE_BLOCK, src, dst);
@@ -3311,9 +3292,11 @@ static loff_t bch2_seek_data(struct file *file, u64 offset)
struct bch_inode_info *inode = file_bch_inode(file);
struct bch_fs *c = inode->v.i_sb->s_fs_info;
struct btree_trans trans;
- struct btree_iter *iter;
+ struct btree_iter iter;
struct bkey_s_c k;
+ subvol_inum inum = inode_inum(inode);
u64 isize, next_data = MAX_LFS_FILESIZE;
+ u32 snapshot;
int ret;
isize = i_size_read(&inode->v);
@@ -3321,9 +3304,15 @@ static loff_t bch2_seek_data(struct file *file, u64 offset)
return -ENXIO;
bch2_trans_init(&trans, c, 0, 0);
+retry:
+ bch2_trans_begin(&trans);
- for_each_btree_key(&trans, iter, BTREE_ID_extents,
- POS(inode->v.i_ino, offset >> 9), 0, k, ret) {
+ ret = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot);
+ if (ret)
+ goto err;
+
+ for_each_btree_key_norestart(&trans, iter, BTREE_ID_extents,
+ SPOS(inode->v.i_ino, offset >> 9, snapshot), 0, k, ret) {
if (k.k->p.inode != inode->v.i_ino) {
break;
} else if (bkey_extent_is_data(k.k)) {
@@ -3332,9 +3321,12 @@ static loff_t bch2_seek_data(struct file *file, u64 offset)
} else if (k.k->p.offset >> 9 > isize)
break;
}
- bch2_trans_iter_put(&trans, iter);
+ bch2_trans_iter_exit(&trans, &iter);
+err:
+ if (ret == -EINTR)
+ goto retry;
- ret = bch2_trans_exit(&trans) ?: ret;
+ bch2_trans_exit(&trans);
if (ret)
return ret;
@@ -3407,9 +3399,11 @@ static loff_t bch2_seek_hole(struct file *file, u64 offset)
struct bch_inode_info *inode = file_bch_inode(file);
struct bch_fs *c = inode->v.i_sb->s_fs_info;
struct btree_trans trans;
- struct btree_iter *iter;
+ struct btree_iter iter;
struct bkey_s_c k;
+ subvol_inum inum = inode_inum(inode);
u64 isize, next_hole = MAX_LFS_FILESIZE;
+ u32 snapshot;
int ret;
isize = i_size_read(&inode->v);
@@ -3417,9 +3411,15 @@ static loff_t bch2_seek_hole(struct file *file, u64 offset)
return -ENXIO;
bch2_trans_init(&trans, c, 0, 0);
+retry:
+ bch2_trans_begin(&trans);
+
+ ret = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot);
+ if (ret)
+ goto err;
- for_each_btree_key(&trans, iter, BTREE_ID_extents,
- POS(inode->v.i_ino, offset >> 9),
+ for_each_btree_key_norestart(&trans, iter, BTREE_ID_extents,
+ SPOS(inode->v.i_ino, offset >> 9, snapshot),
BTREE_ITER_SLOTS, k, ret) {
if (k.k->p.inode != inode->v.i_ino) {
next_hole = bch2_seek_pagecache_hole(&inode->v,
@@ -3436,9 +3436,12 @@ static loff_t bch2_seek_hole(struct file *file, u64 offset)
offset = max(offset, bkey_start_offset(k.k) << 9);
}
}
- bch2_trans_iter_put(&trans, iter);
+ bch2_trans_iter_exit(&trans, &iter);
+err:
+ if (ret == -EINTR)
+ goto retry;
- ret = bch2_trans_exit(&trans) ?: ret;
+ bch2_trans_exit(&trans);
if (ret)
return ret;
diff --git a/fs/bcachefs/fs-io.h b/fs/bcachefs/fs-io.h
index 1b593ea707d5..b24efeaf343e 100644
--- a/fs/bcachefs/fs-io.h
+++ b/fs/bcachefs/fs-io.h
@@ -19,8 +19,7 @@ int bch2_writepage(struct page *, struct writeback_control *);
int bch2_readpage(struct file *, struct page *);
int bch2_writepages(struct address_space *, struct writeback_control *);
-int bch2_readpages(struct file *, struct address_space *,
- struct list_head *, unsigned);
+void bch2_readahead(struct readahead_control *);
int bch2_write_begin(struct file *, struct address_space *, loff_t,
unsigned, unsigned, struct page **, void **);
@@ -32,13 +31,10 @@ ssize_t bch2_write_iter(struct kiocb *, struct iov_iter *);
int bch2_fsync(struct file *, loff_t, loff_t, int);
-int bch2_truncate(struct bch_inode_info *, struct iattr *);
+int bch2_truncate(struct user_namespace *,
+ struct bch_inode_info *, struct iattr *);
long bch2_fallocate_dispatch(struct file *, int, loff_t, loff_t);
-#define REMAP_FILE_ADVISORY (0)
-#define REMAP_FILE_DEDUP (1 << 0)
-#define REMAP_FILE_CAN_SHORTEN (1 << 1)
-
loff_t bch2_remap_file_range(struct file *, loff_t, struct file *,
loff_t, loff_t, unsigned);
diff --git a/fs/bcachefs/fs-ioctl.c b/fs/bcachefs/fs-ioctl.c
index 258f7f967dbb..9f329a624c12 100644
--- a/fs/bcachefs/fs-ioctl.c
+++ b/fs/bcachefs/fs-ioctl.c
@@ -10,7 +10,11 @@
#include "quota.h"
#include <linux/compat.h>
+#include <linux/fsnotify.h>
#include <linux/mount.h>
+#include <linux/namei.h>
+#include <linux/security.h>
+#include <linux/writeback.h>
#define FS_IOC_GOINGDOWN _IOR('X', 125, __u32)
#define FSOP_GOING_FLAGS_DEFAULT 0x0 /* going down */
@@ -81,7 +85,7 @@ static int bch2_ioc_setflags(struct bch_fs *c,
return ret;
inode_lock(&inode->v);
- if (!inode_owner_or_capable(&inode->v)) {
+ if (!inode_owner_or_capable(file_mnt_user_ns(file), &inode->v)) {
ret = -EACCES;
goto setflags_out;
}
@@ -152,7 +156,7 @@ static int bch2_ioc_fssetxattr(struct bch_fs *c,
return ret;
inode_lock(&inode->v);
- if (!inode_owner_or_capable(&inode->v)) {
+ if (!inode_owner_or_capable(file_mnt_user_ns(file), &inode->v)) {
ret = -EACCES;
goto err;
}
@@ -192,7 +196,7 @@ static int bch2_ioc_reinherit_attrs(struct bch_fs *c,
char *kname = NULL;
struct qstr qstr;
int ret = 0;
- u64 inum;
+ subvol_inum inum;
kname = kmalloc(BCH_NAME_MAX + 1, GFP_KERNEL);
if (!kname)
@@ -205,10 +209,8 @@ static int bch2_ioc_reinherit_attrs(struct bch_fs *c,
qstr.len = ret;
qstr.name = kname;
- ret = -ENOENT;
- inum = bch2_dirent_lookup(c, src->v.i_ino, &hash,
- &qstr);
- if (!inum)
+ ret = bch2_dirent_lookup(c, inode_inum(src), &hash, &qstr, &inum);
+ if (ret)
goto err1;
vinode = bch2_vfs_inode_get(c, inum);
@@ -266,22 +268,20 @@ static int bch2_ioc_goingdown(struct bch_fs *c, u32 __user *arg)
down_write(&c->vfs_sb->s_umount);
switch (flags) {
- case FSOP_GOING_FLAGS_DEFAULT: {
- struct super_block *sb = freeze_bdev(c->vfs_sb->s_bdev);
+ case FSOP_GOING_FLAGS_DEFAULT:
+ ret = freeze_bdev(c->vfs_sb->s_bdev);
if (ret)
goto err;
- if (sb && !IS_ERR(sb)) {
- bch2_journal_flush(&c->journal);
- c->vfs_sb->s_flags |= SB_RDONLY;
- bch2_fs_emergency_read_only(c);
- thaw_bdev(c->vfs_sb->s_bdev, sb);
- }
+ bch2_journal_flush(&c->journal);
+ c->vfs_sb->s_flags |= SB_RDONLY;
+ bch2_fs_emergency_read_only(c);
+ thaw_bdev(c->vfs_sb->s_bdev);
break;
- }
case FSOP_GOING_FLAGS_LOGFLUSH:
bch2_journal_flush(&c->journal);
+ fallthrough;
case FSOP_GOING_FLAGS_NOLOGFLUSH:
c->vfs_sb->s_flags |= SB_RDONLY;
@@ -296,6 +296,161 @@ err:
return ret;
}
+static long bch2_ioctl_subvolume_create(struct bch_fs *c, struct file *filp,
+ struct bch_ioctl_subvolume arg)
+{
+ struct inode *dir;
+ struct bch_inode_info *inode;
+ struct user_namespace *s_user_ns;
+ struct dentry *dst_dentry;
+ struct path src_path, dst_path;
+ int how = LOOKUP_FOLLOW;
+ int error;
+ subvol_inum snapshot_src = { 0 };
+ unsigned lookup_flags = 0;
+ unsigned create_flags = BCH_CREATE_SUBVOL;
+
+ if (arg.flags & ~(BCH_SUBVOL_SNAPSHOT_CREATE|
+ BCH_SUBVOL_SNAPSHOT_RO))
+ return -EINVAL;
+
+ if (!(arg.flags & BCH_SUBVOL_SNAPSHOT_CREATE) &&
+ (arg.src_ptr ||
+ (arg.flags & BCH_SUBVOL_SNAPSHOT_RO)))
+ return -EINVAL;
+
+ if (arg.flags & BCH_SUBVOL_SNAPSHOT_CREATE)
+ create_flags |= BCH_CREATE_SNAPSHOT;
+
+ if (arg.flags & BCH_SUBVOL_SNAPSHOT_RO)
+ create_flags |= BCH_CREATE_SNAPSHOT_RO;
+
+ /* why do we need this lock? */
+ down_read(&c->vfs_sb->s_umount);
+
+ if (arg.flags & BCH_SUBVOL_SNAPSHOT_CREATE)
+ sync_inodes_sb(c->vfs_sb);
+retry:
+ if (arg.src_ptr) {
+ error = user_path_at(arg.dirfd,
+ (const char __user *)(unsigned long)arg.src_ptr,
+ how, &src_path);
+ if (error)
+ goto err1;
+
+ if (src_path.dentry->d_sb->s_fs_info != c) {
+ path_put(&src_path);
+ error = -EXDEV;
+ goto err1;
+ }
+
+ snapshot_src = inode_inum(to_bch_ei(src_path.dentry->d_inode));
+ }
+
+ dst_dentry = user_path_create(arg.dirfd,
+ (const char __user *)(unsigned long)arg.dst_ptr,
+ &dst_path, lookup_flags);
+ error = PTR_ERR_OR_ZERO(dst_dentry);
+ if (error)
+ goto err2;
+
+ if (dst_dentry->d_sb->s_fs_info != c) {
+ error = -EXDEV;
+ goto err3;
+ }
+
+ if (dst_dentry->d_inode) {
+ error = -EEXIST;
+ goto err3;
+ }
+
+ dir = dst_path.dentry->d_inode;
+ if (IS_DEADDIR(dir)) {
+ error = -ENOENT;
+ goto err3;
+ }
+
+ s_user_ns = dir->i_sb->s_user_ns;
+ if (!kuid_has_mapping(s_user_ns, current_fsuid()) ||
+ !kgid_has_mapping(s_user_ns, current_fsgid())) {
+ error = -EOVERFLOW;
+ goto err3;
+ }
+
+ error = inode_permission(file_mnt_user_ns(filp),
+ dir, MAY_WRITE | MAY_EXEC);
+ if (error)
+ goto err3;
+
+ if (!IS_POSIXACL(dir))
+ arg.mode &= ~current_umask();
+
+ error = security_path_mkdir(&dst_path, dst_dentry, arg.mode);
+ if (error)
+ goto err3;
+
+ if ((arg.flags & BCH_SUBVOL_SNAPSHOT_CREATE) &&
+ !arg.src_ptr)
+ snapshot_src.subvol = to_bch_ei(dir)->ei_inode.bi_subvol;
+
+ inode = __bch2_create(file_mnt_user_ns(filp), to_bch_ei(dir),
+ dst_dentry, arg.mode|S_IFDIR,
+ 0, snapshot_src, create_flags);
+ error = PTR_ERR_OR_ZERO(inode);
+ if (error)
+ goto err3;
+
+ d_instantiate(dst_dentry, &inode->v);
+ fsnotify_mkdir(dir, dst_dentry);
+err3:
+ done_path_create(&dst_path, dst_dentry);
+err2:
+ if (arg.src_ptr)
+ path_put(&src_path);
+
+ if (retry_estale(error, lookup_flags)) {
+ lookup_flags |= LOOKUP_REVAL;
+ goto retry;
+ }
+err1:
+ up_read(&c->vfs_sb->s_umount);
+
+ return error;
+}
+
+static long bch2_ioctl_subvolume_destroy(struct bch_fs *c, struct file *filp,
+ struct bch_ioctl_subvolume arg)
+{
+ struct path path;
+ struct inode *dir;
+ int ret = 0;
+
+ if (arg.flags)
+ return -EINVAL;
+
+ ret = user_path_at(arg.dirfd,
+ (const char __user *)(unsigned long)arg.dst_ptr,
+ LOOKUP_FOLLOW, &path);
+ if (ret)
+ return ret;
+
+ if (path.dentry->d_sb->s_fs_info != c) {
+ path_put(&path);
+ return -EXDEV;
+ }
+
+ dir = path.dentry->d_parent->d_inode;
+
+ ret = __bch2_unlink(dir, path.dentry, true);
+ if (!ret) {
+ fsnotify_rmdir(dir, path.dentry);
+ d_delete(path.dentry);
+ }
+ path_put(&path);
+
+ return ret;
+}
+
long bch2_fs_file_ioctl(struct file *file, unsigned cmd, unsigned long arg)
{
struct bch_inode_info *inode = file_bch_inode(file);
@@ -326,6 +481,22 @@ long bch2_fs_file_ioctl(struct file *file, unsigned cmd, unsigned long arg)
case FS_IOC_GOINGDOWN:
return bch2_ioc_goingdown(c, (u32 __user *) arg);
+ case BCH_IOCTL_SUBVOLUME_CREATE: {
+ struct bch_ioctl_subvolume i;
+
+ if (copy_from_user(&i, (void __user *) arg, sizeof(i)))
+ return -EFAULT;
+ return bch2_ioctl_subvolume_create(c, file, i);
+ }
+
+ case BCH_IOCTL_SUBVOLUME_DESTROY: {
+ struct bch_ioctl_subvolume i;
+
+ if (copy_from_user(&i, (void __user *) arg, sizeof(i)))
+ return -EFAULT;
+ return bch2_ioctl_subvolume_destroy(c, file, i);
+ }
+
default:
return bch2_fs_ioctl(c, cmd, (void __user *) arg);
}
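The two new subvolume ioctls are dispatched from bch2_fs_file_ioctl() above. The following is a hypothetical userspace sketch of creating a snapshot with BCH_IOCTL_SUBVOLUME_CREATE; it assumes the struct bch_ioctl_subvolume layout and ioctl numbers exported by the bcachefs uapi header (named here as "bcachefs_ioctl.h"), and the helper name and paths are illustrative only:

	#include <fcntl.h>
	#include <string.h>
	#include <sys/ioctl.h>
	#include <unistd.h>
	#include "bcachefs_ioctl.h"	/* assumed uapi header exporting the ioctl ABI */

	/* Illustrative only: snapshot an existing subvolume into a new path. */
	static int create_snapshot(const char *src, const char *dst)
	{
		struct bch_ioctl_subvolume arg;
		int fd, ret;

		/*
		 * Any fd on the target filesystem works as the ioctl target; path
		 * resolution is relative to arg.dirfd, which user_path_at() handles
		 * in the kernel.  Here the current directory is assumed to be on
		 * the bcachefs filesystem.
		 */
		fd = open(".", O_RDONLY);
		if (fd < 0)
			return -1;

		memset(&arg, 0, sizeof(arg));
		arg.flags	= BCH_SUBVOL_SNAPSHOT_CREATE;
		arg.dirfd	= fd;
		arg.mode	= 0755;
		arg.src_ptr	= (unsigned long) src;	/* subvolume to snapshot */
		arg.dst_ptr	= (unsigned long) dst;	/* where the snapshot appears */

		ret = ioctl(fd, BCH_IOCTL_SUBVOLUME_CREATE, &arg);
		close(fd);
		return ret;
	}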
diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c
index 7899feb68399..7eb33da9c253 100644
--- a/fs/bcachefs/fs.c
+++ b/fs/bcachefs/fs.c
@@ -27,6 +27,7 @@
#include <linux/exportfs.h>
#include <linux/fiemap.h>
#include <linux/module.h>
+#include <linux/pagemap.h>
#include <linux/posix_acl.h>
#include <linux/random.h>
#include <linux/statfs.h>
@@ -35,30 +36,10 @@
static struct kmem_cache *bch2_inode_cache;
-static void bch2_vfs_inode_init(struct bch_fs *,
+static void bch2_vfs_inode_init(struct btree_trans *, subvol_inum,
struct bch_inode_info *,
- struct bch_inode_unpacked *);
-
-static void journal_seq_copy(struct bch_fs *c,
- struct bch_inode_info *dst,
- u64 journal_seq)
-{
- /*
- * atomic64_cmpxchg has a fallback for archs that don't support it,
- * cmpxchg does not:
- */
- atomic64_t *dst_seq = (void *) &dst->ei_journal_seq;
- u64 old, v = READ_ONCE(dst->ei_journal_seq);
-
- do {
- old = v;
-
- if (old >= journal_seq)
- break;
- } while ((v = atomic64_cmpxchg(dst_seq, old, journal_seq)) != old);
-
- bch2_journal_set_has_inum(&c->journal, dst->v.i_ino, journal_seq);
-}
+ struct bch_inode_unpacked *,
+ struct bch_subvolume *);
static void __pagecache_lock_put(struct pagecache_lock *lock, long i)
{
@@ -112,11 +93,19 @@ void bch2_pagecache_block_get(struct pagecache_lock *lock)
__pagecache_lock_get(lock, -1);
}
-void bch2_inode_update_after_write(struct bch_fs *c,
+void bch2_inode_update_after_write(struct btree_trans *trans,
struct bch_inode_info *inode,
struct bch_inode_unpacked *bi,
unsigned fields)
{
+ struct bch_fs *c = trans->c;
+
+ BUG_ON(bi->bi_inum != inode->v.i_ino);
+
+ bch2_assert_pos_locked(trans, BTREE_ID_inodes,
+ POS(0, bi->bi_inum),
+ 0 && c->opts.inodes_use_key_cache);
+
set_nlink(&inode->v, bch2_inode_nlink_get(bi));
i_uid_write(&inode->v, bi->bi_uid);
i_gid_write(&inode->v, bi->bi_gid);
@@ -140,32 +129,29 @@ int __must_check bch2_write_inode(struct bch_fs *c,
void *p, unsigned fields)
{
struct btree_trans trans;
- struct btree_iter *iter;
+ struct btree_iter iter = { NULL };
struct bch_inode_unpacked inode_u;
int ret;
bch2_trans_init(&trans, c, 0, 512);
+ trans.ip = _RET_IP_;
retry:
bch2_trans_begin(&trans);
- iter = bch2_inode_peek(&trans, &inode_u, inode->v.i_ino,
- BTREE_ITER_INTENT);
- ret = PTR_ERR_OR_ZERO(iter) ?:
+ ret = bch2_inode_peek(&trans, &iter, &inode_u, inode_inum(inode),
+ BTREE_ITER_INTENT) ?:
(set ? set(inode, &inode_u, p) : 0) ?:
- bch2_inode_write(&trans, iter, &inode_u) ?:
- bch2_trans_commit(&trans, NULL,
- &inode->ei_journal_seq,
- BTREE_INSERT_NOUNLOCK|
- BTREE_INSERT_NOFAIL);
+ bch2_inode_write(&trans, &iter, &inode_u) ?:
+ bch2_trans_commit(&trans, NULL, NULL, BTREE_INSERT_NOFAIL);
/*
* the btree node lock protects inode->ei_inode, not ei_update_lock;
* this is important for inode updates via bchfs_write_index_update
*/
if (!ret)
- bch2_inode_update_after_write(c, inode, &inode_u, fields);
+ bch2_inode_update_after_write(&trans, inode, &inode_u, fields);
- bch2_trans_iter_put(&trans, iter);
+ bch2_trans_iter_exit(&trans, &iter);
if (ret == -EINTR)
goto retry;
@@ -209,51 +195,82 @@ int bch2_fs_quota_transfer(struct bch_fs *c,
return ret;
}
-struct inode *bch2_vfs_inode_get(struct bch_fs *c, u64 inum)
+static int bch2_iget5_test(struct inode *vinode, void *p)
+{
+ struct bch_inode_info *inode = to_bch_ei(vinode);
+ subvol_inum *inum = p;
+
+ return inode->ei_subvol == inum->subvol &&
+ inode->ei_inode.bi_inum == inum->inum;
+}
+
+static int bch2_iget5_set(struct inode *vinode, void *p)
+{
+ struct bch_inode_info *inode = to_bch_ei(vinode);
+ subvol_inum *inum = p;
+
+ inode->v.i_ino = inum->inum;
+ inode->ei_subvol = inum->subvol;
+ inode->ei_inode.bi_inum = inum->inum;
+ return 0;
+}
+
+static unsigned bch2_inode_hash(subvol_inum inum)
+{
+ return jhash_3words(inum.subvol, inum.inum >> 32, inum.inum, JHASH_INITVAL);
+}
+
+struct inode *bch2_vfs_inode_get(struct bch_fs *c, subvol_inum inum)
{
struct bch_inode_unpacked inode_u;
struct bch_inode_info *inode;
+ struct btree_trans trans;
+ struct bch_subvolume subvol;
int ret;
- inode = to_bch_ei(iget_locked(c->vfs_sb, inum));
+ inode = to_bch_ei(iget5_locked(c->vfs_sb,
+ bch2_inode_hash(inum),
+ bch2_iget5_test,
+ bch2_iget5_set,
+ &inum));
if (unlikely(!inode))
return ERR_PTR(-ENOMEM);
if (!(inode->v.i_state & I_NEW))
return &inode->v;
- ret = bch2_inode_find_by_inum(c, inum, &inode_u);
+ bch2_trans_init(&trans, c, 8, 0);
+ ret = lockrestart_do(&trans,
+ bch2_subvolume_get(&trans, inum.subvol, true, 0, &subvol) ?:
+ bch2_inode_find_by_inum_trans(&trans, inum, &inode_u));
+
+ if (!ret)
+ bch2_vfs_inode_init(&trans, inum, inode, &inode_u, &subvol);
+ bch2_trans_exit(&trans);
+
if (ret) {
iget_failed(&inode->v);
return ERR_PTR(ret);
}
- bch2_vfs_inode_init(c, inode, &inode_u);
-
- inode->ei_journal_seq = bch2_inode_journal_seq(&c->journal, inum);
-
unlock_new_inode(&inode->v);
return &inode->v;
}
-static int inum_test(struct inode *inode, void *p)
-{
- unsigned long *ino = p;
-
- return *ino == inode->i_ino;
-}
-
-static struct bch_inode_info *
-__bch2_create(struct bch_inode_info *dir, struct dentry *dentry,
- umode_t mode, dev_t rdev, bool tmpfile)
+struct bch_inode_info *
+__bch2_create(struct user_namespace *mnt_userns,
+ struct bch_inode_info *dir, struct dentry *dentry,
+ umode_t mode, dev_t rdev, subvol_inum snapshot_src,
+ unsigned flags)
{
struct bch_fs *c = dir->v.i_sb->s_fs_info;
- struct user_namespace *ns = dir->v.i_sb->s_user_ns;
struct btree_trans trans;
struct bch_inode_unpacked dir_u;
struct bch_inode_info *inode, *old;
struct bch_inode_unpacked inode_u;
struct posix_acl *default_acl = NULL, *acl = NULL;
+ subvol_inum inum;
+ struct bch_subvolume subvol;
u64 journal_seq = 0;
int ret;
@@ -274,27 +291,34 @@ __bch2_create(struct bch_inode_info *dir, struct dentry *dentry,
bch2_inode_init_early(c, &inode_u);
- if (!tmpfile)
+ if (!(flags & BCH_CREATE_TMPFILE))
mutex_lock(&dir->ei_update_lock);
bch2_trans_init(&trans, c, 8,
- 2048 + (!tmpfile ? dentry->d_name.len : 0));
+ 2048 + (!(flags & BCH_CREATE_TMPFILE)
+ ? dentry->d_name.len : 0));
retry:
bch2_trans_begin(&trans);
- ret = bch2_create_trans(&trans, dir->v.i_ino, &dir_u, &inode_u,
- !tmpfile ? &dentry->d_name : NULL,
- from_kuid(ns, current_fsuid()),
- from_kgid(ns, current_fsgid()),
+ ret = bch2_create_trans(&trans,
+ inode_inum(dir), &dir_u, &inode_u,
+ !(flags & BCH_CREATE_TMPFILE)
+ ? &dentry->d_name : NULL,
+ from_kuid(mnt_userns, current_fsuid()),
+ from_kgid(mnt_userns, current_fsgid()),
mode, rdev,
- default_acl, acl) ?:
+ default_acl, acl, snapshot_src, flags) ?:
bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, 1,
KEY_TYPE_QUOTA_PREALLOC);
if (unlikely(ret))
goto err_before_quota;
- ret = bch2_trans_commit(&trans, NULL, &journal_seq,
- BTREE_INSERT_NOUNLOCK);
+ inum.subvol = inode_u.bi_subvol ?: dir->ei_subvol;
+ inum.inum = inode_u.bi_inum;
+
+ ret = bch2_subvolume_get(&trans, inum.subvol, true,
+ BTREE_ITER_WITH_UPDATES, &subvol) ?:
+ bch2_trans_commit(&trans, NULL, &journal_seq, 0);
if (unlikely(ret)) {
bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, -1,
KEY_TYPE_QUOTA_WARN);
@@ -304,15 +328,14 @@ err_before_quota:
goto err_trans;
}
- if (!tmpfile) {
- bch2_inode_update_after_write(c, dir, &dir_u,
+ if (!(flags & BCH_CREATE_TMPFILE)) {
+ bch2_inode_update_after_write(&trans, dir, &dir_u,
ATTR_MTIME|ATTR_CTIME);
- journal_seq_copy(c, dir, journal_seq);
mutex_unlock(&dir->ei_update_lock);
}
- bch2_vfs_inode_init(c, inode, &inode_u);
- journal_seq_copy(c, inode, journal_seq);
+ bch2_iget5_set(&inode->v, &inum);
+ bch2_vfs_inode_init(&trans, inum, inode, &inode_u, &subvol);
set_cached_acl(&inode->v, ACL_TYPE_ACCESS, acl);
set_cached_acl(&inode->v, ACL_TYPE_DEFAULT, default_acl);
@@ -324,8 +347,12 @@ err_before_quota:
*/
inode->v.i_state |= I_CREATING;
- old = to_bch_ei(inode_insert5(&inode->v, inode->v.i_ino,
- inum_test, NULL, &inode->v.i_ino));
+
+ old = to_bch_ei(inode_insert5(&inode->v,
+ bch2_inode_hash(inum),
+ bch2_iget5_test,
+ bch2_iget5_set,
+ &inum));
BUG_ON(!old);
if (unlikely(old != inode)) {
@@ -333,7 +360,6 @@ err_before_quota:
* We raced, another process pulled the new inode into cache
* before us:
*/
- journal_seq_copy(c, old, journal_seq);
make_bad_inode(&inode->v);
iput(&inode->v);
@@ -352,7 +378,7 @@ err:
posix_acl_release(acl);
return inode;
err_trans:
- if (!tmpfile)
+ if (!(flags & BCH_CREATE_TMPFILE))
mutex_unlock(&dir->ei_update_lock);
bch2_trans_exit(&trans);
@@ -371,22 +397,25 @@ static struct dentry *bch2_lookup(struct inode *vdir, struct dentry *dentry,
struct bch_inode_info *dir = to_bch_ei(vdir);
struct bch_hash_info hash = bch2_hash_info_init(c, &dir->ei_inode);
struct inode *vinode = NULL;
- u64 inum;
+ subvol_inum inum = { .subvol = 1 };
+ int ret;
- inum = bch2_dirent_lookup(c, dir->v.i_ino, &hash,
- &dentry->d_name);
+ ret = bch2_dirent_lookup(c, inode_inum(dir), &hash,
+ &dentry->d_name, &inum);
- if (inum)
+ if (!ret)
vinode = bch2_vfs_inode_get(c, inum);
return d_splice_alias(vinode, dentry);
}
-static int bch2_mknod(struct inode *vdir, struct dentry *dentry,
+static int bch2_mknod(struct user_namespace *mnt_userns,
+ struct inode *vdir, struct dentry *dentry,
umode_t mode, dev_t rdev)
{
struct bch_inode_info *inode =
- __bch2_create(to_bch_ei(vdir), dentry, mode, rdev, false);
+ __bch2_create(mnt_userns, to_bch_ei(vdir), dentry, mode, rdev,
+ (subvol_inum) { 0 }, 0);
if (IS_ERR(inode))
return PTR_ERR(inode);
@@ -395,10 +424,11 @@ static int bch2_mknod(struct inode *vdir, struct dentry *dentry,
return 0;
}
-static int bch2_create(struct inode *vdir, struct dentry *dentry,
+static int bch2_create(struct user_namespace *mnt_userns,
+ struct inode *vdir, struct dentry *dentry,
umode_t mode, bool excl)
{
- return bch2_mknod(vdir, dentry, mode|S_IFREG, 0);
+ return bch2_mknod(mnt_userns, vdir, dentry, mode|S_IFREG, 0);
}
static int __bch2_link(struct bch_fs *c,
@@ -413,20 +443,16 @@ static int __bch2_link(struct bch_fs *c,
mutex_lock(&inode->ei_update_lock);
bch2_trans_init(&trans, c, 4, 1024);
- ret = __bch2_trans_do(&trans, NULL, &inode->ei_journal_seq,
- BTREE_INSERT_NOUNLOCK,
+ ret = __bch2_trans_do(&trans, NULL, NULL, 0,
bch2_link_trans(&trans,
- dir->v.i_ino,
- inode->v.i_ino, &dir_u, &inode_u,
+ inode_inum(dir), &dir_u,
+ inode_inum(inode), &inode_u,
&dentry->d_name));
if (likely(!ret)) {
- BUG_ON(inode_u.bi_inum != inode->v.i_ino);
-
- journal_seq_copy(c, inode, dir->ei_journal_seq);
- bch2_inode_update_after_write(c, dir, &dir_u,
+ bch2_inode_update_after_write(&trans, dir, &dir_u,
ATTR_MTIME|ATTR_CTIME);
- bch2_inode_update_after_write(c, inode, &inode_u, ATTR_CTIME);
+ bch2_inode_update_after_write(&trans, inode, &inode_u, ATTR_CTIME);
}
bch2_trans_exit(&trans);
@@ -453,7 +479,8 @@ static int bch2_link(struct dentry *old_dentry, struct inode *vdir,
return 0;
}
-static int bch2_unlink(struct inode *vdir, struct dentry *dentry)
+int __bch2_unlink(struct inode *vdir, struct dentry *dentry,
+ bool deleting_snapshot)
{
struct bch_fs *c = vdir->i_sb->s_fs_info;
struct bch_inode_info *dir = to_bch_ei(vdir);
@@ -465,20 +492,17 @@ static int bch2_unlink(struct inode *vdir, struct dentry *dentry)
bch2_lock_inodes(INODE_UPDATE_LOCK, dir, inode);
bch2_trans_init(&trans, c, 4, 1024);
- ret = __bch2_trans_do(&trans, NULL, &dir->ei_journal_seq,
- BTREE_INSERT_NOUNLOCK|
+ ret = __bch2_trans_do(&trans, NULL, NULL,
BTREE_INSERT_NOFAIL,
bch2_unlink_trans(&trans,
- dir->v.i_ino, &dir_u,
- &inode_u, &dentry->d_name));
+ inode_inum(dir), &dir_u,
+ &inode_u, &dentry->d_name,
+ deleting_snapshot));
if (likely(!ret)) {
- BUG_ON(inode_u.bi_inum != inode->v.i_ino);
-
- journal_seq_copy(c, inode, dir->ei_journal_seq);
- bch2_inode_update_after_write(c, dir, &dir_u,
+ bch2_inode_update_after_write(&trans, dir, &dir_u,
ATTR_MTIME|ATTR_CTIME);
- bch2_inode_update_after_write(c, inode, &inode_u,
+ bch2_inode_update_after_write(&trans, inode, &inode_u,
ATTR_MTIME);
}
@@ -488,14 +512,21 @@ static int bch2_unlink(struct inode *vdir, struct dentry *dentry)
return ret;
}
-static int bch2_symlink(struct inode *vdir, struct dentry *dentry,
+static int bch2_unlink(struct inode *vdir, struct dentry *dentry)
+{
+ return __bch2_unlink(vdir, dentry, false);
+}
+
+static int bch2_symlink(struct user_namespace *mnt_userns,
+ struct inode *vdir, struct dentry *dentry,
const char *symname)
{
struct bch_fs *c = vdir->i_sb->s_fs_info;
struct bch_inode_info *dir = to_bch_ei(vdir), *inode;
int ret;
- inode = __bch2_create(dir, dentry, S_IFLNK|S_IRWXUGO, 0, true);
+ inode = __bch2_create(mnt_userns, dir, dentry, S_IFLNK|S_IRWXUGO, 0,
+ (subvol_inum) { 0 }, BCH_CREATE_TMPFILE);
if (unlikely(IS_ERR(inode)))
return PTR_ERR(inode);
@@ -510,8 +541,6 @@ static int bch2_symlink(struct inode *vdir, struct dentry *dentry,
if (unlikely(ret))
goto err;
- journal_seq_copy(c, dir, inode->ei_journal_seq);
-
ret = __bch2_link(c, inode, dir, dentry);
if (unlikely(ret))
goto err;
@@ -523,12 +552,14 @@ err:
return ret;
}
-static int bch2_mkdir(struct inode *vdir, struct dentry *dentry, umode_t mode)
+static int bch2_mkdir(struct user_namespace *mnt_userns,
+ struct inode *vdir, struct dentry *dentry, umode_t mode)
{
- return bch2_mknod(vdir, dentry, mode|S_IFDIR, 0);
+ return bch2_mknod(mnt_userns, vdir, dentry, mode|S_IFDIR, 0);
}
-static int bch2_rename2(struct inode *src_vdir, struct dentry *src_dentry,
+static int bch2_rename2(struct user_namespace *mnt_userns,
+ struct inode *src_vdir, struct dentry *src_dentry,
struct inode *dst_vdir, struct dentry *dst_dentry,
unsigned flags)
{
@@ -544,7 +575,6 @@ static int bch2_rename2(struct inode *src_vdir, struct dentry *src_dentry,
? BCH_RENAME_EXCHANGE
: dst_dentry->d_inode
? BCH_RENAME_OVERWRITE : BCH_RENAME;
- u64 journal_seq = 0;
int ret;
if (flags & ~(RENAME_NOREPLACE|RENAME_EXCHANGE))
@@ -584,11 +614,10 @@ static int bch2_rename2(struct inode *src_vdir, struct dentry *src_dentry,
goto err;
}
- ret = __bch2_trans_do(&trans, NULL, &journal_seq,
- BTREE_INSERT_NOUNLOCK,
+ ret = __bch2_trans_do(&trans, NULL, NULL, 0,
bch2_rename_trans(&trans,
- src_dir->v.i_ino, &src_dir_u,
- dst_dir->v.i_ino, &dst_dir_u,
+ inode_inum(src_dir), &src_dir_u,
+ inode_inum(dst_dir), &dst_dir_u,
&src_inode_u,
&dst_inode_u,
&src_dentry->d_name,
@@ -601,25 +630,19 @@ static int bch2_rename2(struct inode *src_vdir, struct dentry *src_dentry,
BUG_ON(dst_inode &&
dst_inode->v.i_ino != dst_inode_u.bi_inum);
- bch2_inode_update_after_write(c, src_dir, &src_dir_u,
+ bch2_inode_update_after_write(&trans, src_dir, &src_dir_u,
ATTR_MTIME|ATTR_CTIME);
- journal_seq_copy(c, src_dir, journal_seq);
- if (src_dir != dst_dir) {
- bch2_inode_update_after_write(c, dst_dir, &dst_dir_u,
+ if (src_dir != dst_dir)
+ bch2_inode_update_after_write(&trans, dst_dir, &dst_dir_u,
ATTR_MTIME|ATTR_CTIME);
- journal_seq_copy(c, dst_dir, journal_seq);
- }
- bch2_inode_update_after_write(c, src_inode, &src_inode_u,
+ bch2_inode_update_after_write(&trans, src_inode, &src_inode_u,
ATTR_CTIME);
- journal_seq_copy(c, src_inode, journal_seq);
- if (dst_inode) {
- bch2_inode_update_after_write(c, dst_inode, &dst_inode_u,
+ if (dst_inode)
+ bch2_inode_update_after_write(&trans, dst_inode, &dst_inode_u,
ATTR_CTIME);
- journal_seq_copy(c, dst_inode, journal_seq);
- }
err:
bch2_trans_exit(&trans);
@@ -642,17 +665,21 @@ err:
return ret;
}
-void bch2_setattr_copy(struct bch_inode_info *inode,
- struct bch_inode_unpacked *bi,
- struct iattr *attr)
+static void bch2_setattr_copy(struct user_namespace *mnt_userns,
+ struct bch_inode_info *inode,
+ struct bch_inode_unpacked *bi,
+ struct iattr *attr)
{
struct bch_fs *c = inode->v.i_sb->s_fs_info;
unsigned int ia_valid = attr->ia_valid;
if (ia_valid & ATTR_UID)
- bi->bi_uid = from_kuid(c->vfs_sb->s_user_ns, attr->ia_uid);
+ bi->bi_uid = from_kuid(mnt_userns, attr->ia_uid);
if (ia_valid & ATTR_GID)
- bi->bi_gid = from_kgid(c->vfs_sb->s_user_ns, attr->ia_gid);
+ bi->bi_gid = from_kgid(mnt_userns, attr->ia_gid);
+
+ if (ia_valid & ATTR_SIZE)
+ bi->bi_size = attr->ia_size;
if (ia_valid & ATTR_ATIME)
bi->bi_atime = timespec_to_bch2_time(c, attr->ia_atime);
@@ -668,19 +695,20 @@ void bch2_setattr_copy(struct bch_inode_info *inode,
: inode->v.i_gid;
if (!in_group_p(gid) &&
- !capable_wrt_inode_uidgid(&inode->v, CAP_FSETID))
+ !capable_wrt_inode_uidgid(mnt_userns, &inode->v, CAP_FSETID))
mode &= ~S_ISGID;
bi->bi_mode = mode;
}
}
-static int bch2_setattr_nonsize(struct bch_inode_info *inode,
- struct iattr *attr)
+int bch2_setattr_nonsize(struct user_namespace *mnt_userns,
+ struct bch_inode_info *inode,
+ struct iattr *attr)
{
struct bch_fs *c = inode->v.i_sb->s_fs_info;
struct bch_qid qid;
struct btree_trans trans;
- struct btree_iter *inode_iter;
+ struct btree_iter inode_iter = { NULL };
struct bch_inode_unpacked inode_u;
struct posix_acl *acl = NULL;
int ret;
@@ -706,34 +734,32 @@ retry:
kfree(acl);
acl = NULL;
- inode_iter = bch2_inode_peek(&trans, &inode_u, inode->v.i_ino,
- BTREE_ITER_INTENT);
- ret = PTR_ERR_OR_ZERO(inode_iter);
+ ret = bch2_inode_peek(&trans, &inode_iter, &inode_u, inode_inum(inode),
+ BTREE_ITER_INTENT);
if (ret)
goto btree_err;
- bch2_setattr_copy(inode, &inode_u, attr);
+ bch2_setattr_copy(mnt_userns, inode, &inode_u, attr);
if (attr->ia_valid & ATTR_MODE) {
- ret = bch2_acl_chmod(&trans, &inode_u, inode_u.bi_mode, &acl);
+ ret = bch2_acl_chmod(&trans, inode_inum(inode), &inode_u,
+ inode_u.bi_mode, &acl);
if (ret)
goto btree_err;
}
- ret = bch2_inode_write(&trans, inode_iter, &inode_u) ?:
- bch2_trans_commit(&trans, NULL,
- &inode->ei_journal_seq,
- BTREE_INSERT_NOUNLOCK|
+ ret = bch2_inode_write(&trans, &inode_iter, &inode_u) ?:
+ bch2_trans_commit(&trans, NULL, NULL,
BTREE_INSERT_NOFAIL);
btree_err:
- bch2_trans_iter_put(&trans, inode_iter);
+ bch2_trans_iter_exit(&trans, &inode_iter);
if (ret == -EINTR)
goto retry;
if (unlikely(ret))
goto err_trans;
- bch2_inode_update_after_write(c, inode, &inode_u, attr->ia_valid);
+ bch2_inode_update_after_write(&trans, inode, &inode_u, attr->ia_valid);
if (acl)
set_cached_acl(&inode->v, ACL_TYPE_ACCESS, acl);
@@ -745,7 +771,8 @@ err:
return ret;
}
-static int bch2_getattr(const struct path *path, struct kstat *stat,
+static int bch2_getattr(struct user_namespace *mnt_userns,
+ const struct path *path, struct kstat *stat,
u32 request_mask, unsigned query_flags)
{
struct bch_inode_info *inode = to_bch_ei(d_inode(path->dentry));
@@ -785,26 +812,29 @@ static int bch2_getattr(const struct path *path, struct kstat *stat,
return 0;
}
-static int bch2_setattr(struct dentry *dentry, struct iattr *iattr)
+static int bch2_setattr(struct user_namespace *mnt_userns,
+ struct dentry *dentry, struct iattr *iattr)
{
struct bch_inode_info *inode = to_bch_ei(dentry->d_inode);
int ret;
lockdep_assert_held(&inode->v.i_rwsem);
- ret = setattr_prepare(dentry, iattr);
+ ret = setattr_prepare(mnt_userns, dentry, iattr);
if (ret)
return ret;
return iattr->ia_valid & ATTR_SIZE
- ? bch2_truncate(inode, iattr)
- : bch2_setattr_nonsize(inode, iattr);
+ ? bch2_truncate(mnt_userns, inode, iattr)
+ : bch2_setattr_nonsize(mnt_userns, inode, iattr);
}
-static int bch2_tmpfile(struct inode *vdir, struct dentry *dentry, umode_t mode)
+static int bch2_tmpfile(struct user_namespace *mnt_userns,
+ struct inode *vdir, struct dentry *dentry, umode_t mode)
{
struct bch_inode_info *inode =
- __bch2_create(to_bch_ei(vdir), dentry, mode, 0, true);
+ __bch2_create(mnt_userns, to_bch_ei(vdir), dentry, mode, 0,
+ (subvol_inum) { 0 }, BCH_CREATE_TMPFILE);
if (IS_ERR(inode))
return PTR_ERR(inode);
@@ -873,36 +903,49 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info,
struct bch_fs *c = vinode->i_sb->s_fs_info;
struct bch_inode_info *ei = to_bch_ei(vinode);
struct btree_trans trans;
- struct btree_iter *iter;
+ struct btree_iter iter;
struct bkey_s_c k;
struct bkey_buf cur, prev;
struct bpos end = POS(ei->v.i_ino, (start + len) >> 9);
unsigned offset_into_extent, sectors;
bool have_extent = false;
+ u32 snapshot;
int ret = 0;
+ ret = fiemap_prep(&ei->v, info, start, &len, FIEMAP_FLAG_SYNC);
+ if (ret)
+ return ret;
+
if (start + len < start)
return -EINVAL;
+ start >>= 9;
+
bch2_bkey_buf_init(&cur);
bch2_bkey_buf_init(&prev);
bch2_trans_init(&trans, c, 0, 0);
-
- iter = bch2_trans_get_iter(&trans, BTREE_ID_extents,
- POS(ei->v.i_ino, start >> 9), 0);
retry:
- while ((k = bch2_btree_iter_peek(iter)).k &&
+ bch2_trans_begin(&trans);
+
+ ret = bch2_subvolume_get_snapshot(&trans, ei->ei_subvol, &snapshot);
+ if (ret)
+ goto err;
+
+ bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents,
+ SPOS(ei->v.i_ino, start, snapshot), 0);
+
+ while ((k = bch2_btree_iter_peek(&iter)).k &&
!(ret = bkey_err(k)) &&
- bkey_cmp(iter->pos, end) < 0) {
+ bkey_cmp(iter.pos, end) < 0) {
enum btree_id data_btree = BTREE_ID_extents;
if (!bkey_extent_is_data(k.k) &&
k.k->type != KEY_TYPE_reservation) {
- bch2_btree_iter_advance(iter);
+ bch2_btree_iter_advance(&iter);
continue;
}
- offset_into_extent = iter->pos.offset -
+ offset_into_extent = iter.pos.offset -
bkey_start_offset(k.k);
sectors = k.k->size - offset_into_extent;
@@ -923,7 +966,7 @@ retry:
offset_into_extent),
cur.k);
bch2_key_resize(&cur.k->k, sectors);
- cur.k->k.p = iter->pos;
+ cur.k->k.p = iter.pos;
cur.k->k.p.offset += cur.k->k.size;
if (have_extent) {
@@ -936,10 +979,12 @@ retry:
bkey_copy(prev.k, cur.k);
have_extent = true;
- bch2_btree_iter_set_pos(iter,
- POS(iter->pos.inode, iter->pos.offset + sectors));
+ bch2_btree_iter_set_pos(&iter,
+ POS(iter.pos.inode, iter.pos.offset + sectors));
}
-
+ start = iter.pos.offset;
+ bch2_trans_iter_exit(&trans, &iter);
+err:
if (ret == -EINTR)
goto retry;
@@ -947,8 +992,7 @@ retry:
ret = bch2_fill_extent(c, info, bkey_i_to_s_c(prev.k),
FIEMAP_EXTENT_LAST);
- bch2_trans_iter_put(&trans, iter);
- ret = bch2_trans_exit(&trans) ?: ret;
+ bch2_trans_exit(&trans);
bch2_bkey_buf_exit(&cur, c);
bch2_bkey_buf_exit(&prev, c);
return ret < 0 ? ret : 0;
@@ -984,16 +1028,7 @@ static int bch2_vfs_readdir(struct file *file, struct dir_context *ctx)
if (!dir_emit_dots(file, ctx))
return 0;
- return bch2_readdir(c, inode->v.i_ino, ctx);
-}
-
-static int bch2_clone_file_range(struct file *file_src, loff_t pos_src,
- struct file *file_dst, loff_t pos_dst,
- u64 len)
-{
- return bch2_remap_file_range(file_src, pos_src,
- file_dst, pos_dst,
- len, 0);
+ return bch2_readdir(c, inode_inum(inode), ctx);
}
static const struct file_operations bch_file_operations = {
@@ -1010,7 +1045,7 @@ static const struct file_operations bch_file_operations = {
#ifdef CONFIG_COMPAT
.compat_ioctl = bch2_compat_fs_ioctl,
#endif
- .clone_file_range = bch2_clone_file_range,
+ .remap_file_range = bch2_remap_file_range,
};
static const struct inode_operations bch_file_inode_operations = {
@@ -1080,7 +1115,7 @@ static const struct address_space_operations bch_address_space_operations = {
.writepage = bch2_writepage,
.readpage = bch2_readpage,
.writepages = bch2_writepages,
- .readpages = bch2_readpages,
+ .readahead = bch2_readahead,
.set_page_dirty = __set_page_dirty_nobuffers,
.write_begin = bch2_write_begin,
.write_end = bch2_write_end,
@@ -1093,51 +1128,243 @@ static const struct address_space_operations bch_address_space_operations = {
.error_remove_page = generic_error_remove_page,
};
-static struct inode *bch2_nfs_get_inode(struct super_block *sb,
- u64 ino, u32 generation)
+struct bcachefs_fid {
+ u64 inum;
+ u32 subvol;
+ u32 gen;
+} __packed;
+
+struct bcachefs_fid_with_parent {
+ struct bcachefs_fid fid;
+ struct bcachefs_fid dir;
+} __packed;
+
+static int bcachefs_fid_valid(int fh_len, int fh_type)
{
- struct bch_fs *c = sb->s_fs_info;
- struct inode *vinode;
+ switch (fh_type) {
+ case FILEID_BCACHEFS_WITHOUT_PARENT:
+ return fh_len == sizeof(struct bcachefs_fid) / sizeof(u32);
+ case FILEID_BCACHEFS_WITH_PARENT:
+ return fh_len == sizeof(struct bcachefs_fid_with_parent) / sizeof(u32);
+ default:
+ return false;
+ }
+}
+
+static struct bcachefs_fid bch2_inode_to_fid(struct bch_inode_info *inode)
+{
+ return (struct bcachefs_fid) {
+ .inum = inode->ei_inode.bi_inum,
+ .subvol = inode->ei_subvol,
+ .gen = inode->ei_inode.bi_generation,
+ };
+}
+
+static int bch2_encode_fh(struct inode *vinode, u32 *fh, int *len,
+ struct inode *vdir)
+{
+ struct bch_inode_info *inode = to_bch_ei(vinode);
+ struct bch_inode_info *dir = to_bch_ei(vdir);
+
+ if (*len < sizeof(struct bcachefs_fid_with_parent) / sizeof(u32))
+ return FILEID_INVALID;
+
+ if (!S_ISDIR(inode->v.i_mode) && dir) {
+ struct bcachefs_fid_with_parent *fid = (void *) fh;
+
+ fid->fid = bch2_inode_to_fid(inode);
+ fid->dir = bch2_inode_to_fid(dir);
+
+ *len = sizeof(*fid) / sizeof(u32);
+ return FILEID_BCACHEFS_WITH_PARENT;
+ } else {
+ struct bcachefs_fid *fid = (void *) fh;
+
+ *fid = bch2_inode_to_fid(inode);
- if (ino < BCACHEFS_ROOT_INO)
- return ERR_PTR(-ESTALE);
+ *len = sizeof(*fid) / sizeof(u32);
+ return FILEID_BCACHEFS_WITHOUT_PARENT;
+ }
+}
- vinode = bch2_vfs_inode_get(c, ino);
- if (IS_ERR(vinode))
- return ERR_CAST(vinode);
- if (generation && vinode->i_generation != generation) {
- /* we didn't find the right inode.. */
+static struct inode *bch2_nfs_get_inode(struct super_block *sb,
+ struct bcachefs_fid fid)
+{
+ struct bch_fs *c = sb->s_fs_info;
+ struct inode *vinode = bch2_vfs_inode_get(c, (subvol_inum) {
+ .subvol = fid.subvol,
+ .inum = fid.inum,
+ });
+ if (!IS_ERR(vinode) && vinode->i_generation != fid.gen) {
iput(vinode);
- return ERR_PTR(-ESTALE);
+ vinode = ERR_PTR(-ESTALE);
}
return vinode;
}
-static struct dentry *bch2_fh_to_dentry(struct super_block *sb, struct fid *fid,
+static struct dentry *bch2_fh_to_dentry(struct super_block *sb, struct fid *_fid,
int fh_len, int fh_type)
{
- return generic_fh_to_dentry(sb, fid, fh_len, fh_type,
- bch2_nfs_get_inode);
+ struct bcachefs_fid *fid = (void *) _fid;
+
+ if (!bcachefs_fid_valid(fh_len, fh_type))
+ return NULL;
+
+ return d_obtain_alias(bch2_nfs_get_inode(sb, *fid));
}
-static struct dentry *bch2_fh_to_parent(struct super_block *sb, struct fid *fid,
+static struct dentry *bch2_fh_to_parent(struct super_block *sb, struct fid *_fid,
int fh_len, int fh_type)
{
- return generic_fh_to_parent(sb, fid, fh_len, fh_type,
- bch2_nfs_get_inode);
+ struct bcachefs_fid_with_parent *fid = (void *) _fid;
+
+ if (!bcachefs_fid_valid(fh_len, fh_type) ||
+ fh_type != FILEID_BCACHEFS_WITH_PARENT)
+ return NULL;
+
+ return d_obtain_alias(bch2_nfs_get_inode(sb, fid->dir));
+}
+
+static struct dentry *bch2_get_parent(struct dentry *child)
+{
+ struct bch_inode_info *inode = to_bch_ei(child->d_inode);
+ struct bch_fs *c = inode->v.i_sb->s_fs_info;
+ subvol_inum parent_inum = {
+ .subvol = inode->ei_inode.bi_parent_subvol ?:
+ inode->ei_subvol,
+ .inum = inode->ei_inode.bi_dir,
+ };
+
+ if (!parent_inum.inum)
+ return NULL;
+
+ return d_obtain_alias(bch2_vfs_inode_get(c, parent_inum));
+}
+
+static int bch2_get_name(struct dentry *parent, char *name, struct dentry *child)
+{
+ struct bch_inode_info *inode = to_bch_ei(child->d_inode);
+ struct bch_inode_info *dir = to_bch_ei(parent->d_inode);
+ struct bch_fs *c = inode->v.i_sb->s_fs_info;
+ struct btree_trans trans;
+ struct btree_iter iter1;
+ struct btree_iter iter2;
+ struct bkey_s_c k;
+ struct bkey_s_c_dirent d;
+ struct bch_inode_unpacked inode_u;
+ subvol_inum target;
+ u32 snapshot;
+ unsigned name_len;
+ int ret;
+
+ if (!S_ISDIR(dir->v.i_mode))
+ return -EINVAL;
+
+ bch2_trans_init(&trans, c, 0, 0);
+
+ bch2_trans_iter_init(&trans, &iter1, BTREE_ID_dirents,
+ POS(dir->ei_inode.bi_inum, 0), 0);
+ bch2_trans_iter_init(&trans, &iter2, BTREE_ID_dirents,
+ POS(dir->ei_inode.bi_inum, 0), 0);
+retry:
+ bch2_trans_begin(&trans);
+
+ ret = bch2_subvolume_get_snapshot(&trans, dir->ei_subvol, &snapshot);
+ if (ret)
+ goto err;
+
+ bch2_btree_iter_set_snapshot(&iter1, snapshot);
+ bch2_btree_iter_set_snapshot(&iter2, snapshot);
+
+ ret = bch2_inode_find_by_inum_trans(&trans, inode_inum(inode), &inode_u);
+ if (ret)
+ goto err;
+
+ if (inode_u.bi_dir == dir->ei_inode.bi_inum) {
+ bch2_btree_iter_set_pos(&iter1, POS(inode_u.bi_dir, inode_u.bi_dir_offset));
+
+ k = bch2_btree_iter_peek_slot(&iter1);
+ ret = bkey_err(k);
+ if (ret)
+ goto err;
+
+ if (k.k->type != KEY_TYPE_dirent) {
+ ret = -ENOENT;
+ goto err;
+ }
+
+ d = bkey_s_c_to_dirent(k);
+ ret = bch2_dirent_read_target(&trans, inode_inum(dir), d, &target);
+ if (ret > 0)
+ ret = -ENOENT;
+ if (ret)
+ goto err;
+
+ if (target.subvol == inode->ei_subvol &&
+ target.inum == inode->ei_inode.bi_inum)
+ goto found;
+ } else {
+ /*
+ * File with multiple hardlinks and our backref is to the wrong
+ * directory - linear search:
+ */
+ for_each_btree_key_continue_norestart(iter2, 0, k, ret) {
+ if (k.k->p.inode > dir->ei_inode.bi_inum)
+ break;
+
+ if (k.k->type != KEY_TYPE_dirent)
+ continue;
+
+ d = bkey_s_c_to_dirent(k);
+ ret = bch2_dirent_read_target(&trans, inode_inum(dir), d, &target);
+ if (ret < 0)
+ break;
+ if (ret)
+ continue;
+
+ if (target.subvol == inode->ei_subvol &&
+ target.inum == inode->ei_inode.bi_inum)
+ goto found;
+ }
+ }
+
+ ret = -ENOENT;
+ goto err;
+found:
+ name_len = min_t(unsigned, bch2_dirent_name_bytes(d), NAME_MAX);
+
+ memcpy(name, d.v->d_name, name_len);
+ name[name_len] = '\0';
+err:
+ if (ret == -EINTR)
+ goto retry;
+
+ bch2_trans_iter_exit(&trans, &iter1);
+ bch2_trans_iter_exit(&trans, &iter2);
+ bch2_trans_exit(&trans);
+
+ return ret;
}
static const struct export_operations bch_export_ops = {
+ .encode_fh = bch2_encode_fh,
.fh_to_dentry = bch2_fh_to_dentry,
.fh_to_parent = bch2_fh_to_parent,
- //.get_parent = bch2_get_parent,
+ .get_parent = bch2_get_parent,
+ .get_name = bch2_get_name,
};
-static void bch2_vfs_inode_init(struct bch_fs *c,
+static void bch2_vfs_inode_init(struct btree_trans *trans, subvol_inum inum,
struct bch_inode_info *inode,
- struct bch_inode_unpacked *bi)
+ struct bch_inode_unpacked *bi,
+ struct bch_subvolume *subvol)
{
- bch2_inode_update_after_write(c, inode, bi, ~0);
+ bch2_inode_update_after_write(trans, inode, bi, ~0);
+
+ if (BCH_SUBVOLUME_SNAP(subvol))
+ set_bit(EI_INODE_SNAPSHOT, &inode->ei_flags);
+ else
+ clear_bit(EI_INODE_SNAPSHOT, &inode->ei_flags);
inode->v.i_blocks = bi->bi_sectors;
inode->v.i_ino = bi->bi_inum;
@@ -1146,9 +1373,9 @@ static void bch2_vfs_inode_init(struct bch_fs *c,
inode->v.i_size = bi->bi_size;
inode->ei_flags = 0;
- inode->ei_journal_seq = 0;
inode->ei_quota_reserved = 0;
inode->ei_qid = bch_qid(bi);
+ inode->ei_subvol = inum.subvol;
inode->v.i_mapping->a_ops = &bch_address_space_operations;
@@ -1184,7 +1411,6 @@ static struct inode *bch2_alloc_inode(struct super_block *sb)
mutex_init(&inode->ei_update_lock);
pagecache_lock_init(&inode->ei_pagecache_lock);
mutex_init(&inode->ei_quota_lock);
- inode->ei_journal_seq = 0;
return &inode->v;
}
@@ -1246,10 +1472,57 @@ static void bch2_evict_inode(struct inode *vinode)
KEY_TYPE_QUOTA_WARN);
bch2_quota_acct(c, inode->ei_qid, Q_INO, -1,
KEY_TYPE_QUOTA_WARN);
- bch2_inode_rm(c, inode->v.i_ino, true);
+ bch2_inode_rm(c, inode_inum(inode), true);
}
}
+void bch2_evict_subvolume_inodes(struct bch_fs *c,
+ struct snapshot_id_list *s)
+{
+ struct super_block *sb = c->vfs_sb;
+ struct inode *inode;
+
+ spin_lock(&sb->s_inode_list_lock);
+ list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
+ if (!snapshot_list_has_id(s, to_bch_ei(inode)->ei_subvol) ||
+ (inode->i_state & I_FREEING))
+ continue;
+
+ d_mark_dontcache(inode);
+ d_prune_aliases(inode);
+ }
+ spin_unlock(&sb->s_inode_list_lock);
+again:
+ cond_resched();
+ spin_lock(&sb->s_inode_list_lock);
+ list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
+ if (!snapshot_list_has_id(s, to_bch_ei(inode)->ei_subvol) ||
+ (inode->i_state & I_FREEING))
+ continue;
+
+ if (!(inode->i_state & I_DONTCACHE)) {
+ d_mark_dontcache(inode);
+ d_prune_aliases(inode);
+ }
+
+ spin_lock(&inode->i_lock);
+ if (snapshot_list_has_id(s, to_bch_ei(inode)->ei_subvol) &&
+ !(inode->i_state & I_FREEING)) {
+ wait_queue_head_t *wq = bit_waitqueue(&inode->i_state, __I_NEW);
+ DEFINE_WAIT_BIT(wait, &inode->i_state, __I_NEW);
+ prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
+ spin_unlock(&inode->i_lock);
+ spin_unlock(&sb->s_inode_list_lock);
+ schedule();
+ finish_wait(wq, &wait.wq_entry);
+ goto again;
+ }
+
+ spin_unlock(&inode->i_lock);
+ }
+ spin_unlock(&sb->s_inode_list_lock);
+}
+
static int bch2_statfs(struct dentry *dentry, struct kstatfs *buf)
{
struct super_block *sb = dentry->d_sb;
@@ -1299,13 +1572,14 @@ static int bch2_sync_fs(struct super_block *sb, int wait)
static struct bch_fs *bch2_path_to_fs(const char *path)
{
struct bch_fs *c;
- struct block_device *bdev = lookup_bdev(path);
+ dev_t dev;
+ int ret;
- if (IS_ERR(bdev))
- return ERR_CAST(bdev);
+ ret = lookup_bdev(path, &dev);
+ if (ret)
+ return ERR_PTR(ret);
- c = bch2_dev_to_fs(bdev->bd_dev);
- bdput(bdev);
+ c = bch2_dev_to_fs(dev);
if (c)
closure_put(&c->cl);
return c ?: ERR_PTR(-ENOENT);
@@ -1316,9 +1590,6 @@ static char **split_devs(const char *_dev_name, unsigned *nr)
char *dev_name = NULL, **devs = NULL, *s;
size_t i, nr_devs = 0;
- if (strlen(_dev_name) == 0)
- return NULL;
-
dev_name = kstrdup(_dev_name, GFP_KERNEL);
if (!dev_name)
return NULL;
@@ -1494,6 +1765,9 @@ static struct dentry *bch2_mount(struct file_system_type *fs_type,
if (ret)
return ERR_PTR(ret);
+ if (!dev_name || strlen(dev_name) == 0)
+ return ERR_PTR(-EINVAL);
+
devs = split_devs(dev_name, &nr_devs);
if (!devs)
return ERR_PTR(-ENOMEM);
@@ -1561,6 +1835,8 @@ got_sb:
sb->s_xattr = bch2_xattr_handlers;
sb->s_magic = BCACHEFS_STATFS_MAGIC;
sb->s_time_gran = c->sb.nsec_per_time_unit;
+ sb->s_time_min = div_s64(S64_MIN, c->sb.time_units_per_sec) + 1;
+ sb->s_time_max = div_s64(S64_MAX, c->sb.time_units_per_sec);
c->vfs_sb = sb;
strlcpy(sb->s_id, c->name, sizeof(sb->s_id));
@@ -1568,9 +1844,7 @@ got_sb:
if (ret)
goto err_put_super;
- sb->s_bdi->congested_fn = bch2_congested;
- sb->s_bdi->congested_data = c;
- sb->s_bdi->ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_SIZE;
+ sb->s_bdi->ra_pages = VM_READAHEAD_PAGES;
for_each_online_member(ca, c, i) {
struct block_device *bdev = ca->disk_sb.bdev;
@@ -1589,7 +1863,9 @@ got_sb:
sb->s_flags |= SB_POSIXACL;
#endif
- vinode = bch2_vfs_inode_get(c, BCACHEFS_ROOT_INO);
+ sb->s_shrink.seeks = 0;
+
+ vinode = bch2_vfs_inode_get(c, BCACHEFS_ROOT_SUBVOL_INUM);
if (IS_ERR(vinode)) {
bch_err(c, "error mounting: error getting root inode %i",
(int) PTR_ERR(vinode));
diff --git a/fs/bcachefs/fs.h b/fs/bcachefs/fs.h
index 2d82ed7dd740..b2211ec7f302 100644
--- a/fs/bcachefs/fs.h
+++ b/fs/bcachefs/fs.h
@@ -36,7 +36,6 @@ struct bch_inode_info {
unsigned long ei_flags;
struct mutex ei_update_lock;
- u64 ei_journal_seq;
u64 ei_quota_reserved;
unsigned long ei_last_dirtied;
@@ -45,16 +44,32 @@ struct bch_inode_info {
struct mutex ei_quota_lock;
struct bch_qid ei_qid;
+ u32 ei_subvol;
+
/* copy of inode in btree: */
struct bch_inode_unpacked ei_inode;
};
+static inline subvol_inum inode_inum(struct bch_inode_info *inode)
+{
+ return (subvol_inum) {
+ .subvol = inode->ei_subvol,
+ .inum = inode->ei_inode.bi_inum,
+ };
+}
+
/*
* Set if we've gotten a btree error for this inode, and thus the vfs inode and
* btree inode may be inconsistent:
*/
#define EI_INODE_ERROR 0
+/*
+ * Set if the inode is in a snapshot subvolume - we don't do quota accounting in
+ * those:
+ */
+#define EI_INODE_SNAPSHOT 1
+
#define to_bch_ei(_inode) \
container_of_or_null(_inode, struct bch_inode_info, v)
@@ -135,6 +150,10 @@ struct bch_inode_unpacked;
#ifndef NO_BCACHEFS_FS
+struct bch_inode_info *
+__bch2_create(struct user_namespace *, struct bch_inode_info *,
+ struct dentry *, umode_t, dev_t, subvol_inum, unsigned);
+
int bch2_fs_quota_transfer(struct bch_fs *,
struct bch_inode_info *,
struct bch_qid,
@@ -154,24 +173,33 @@ static inline int bch2_set_projid(struct bch_fs *c,
KEY_TYPE_QUOTA_PREALLOC);
}
-struct inode *bch2_vfs_inode_get(struct bch_fs *, u64);
+struct inode *bch2_vfs_inode_get(struct bch_fs *, subvol_inum);
/* returns 0 if we want to do the update, or error is passed up */
typedef int (*inode_set_fn)(struct bch_inode_info *,
struct bch_inode_unpacked *, void *);
-void bch2_inode_update_after_write(struct bch_fs *,
+void bch2_inode_update_after_write(struct btree_trans *,
struct bch_inode_info *,
struct bch_inode_unpacked *,
unsigned);
int __must_check bch2_write_inode(struct bch_fs *, struct bch_inode_info *,
inode_set_fn, void *, unsigned);
+int bch2_setattr_nonsize(struct user_namespace *,
+ struct bch_inode_info *,
+ struct iattr *);
+int __bch2_unlink(struct inode *, struct dentry *, bool);
+
+void bch2_evict_subvolume_inodes(struct bch_fs *, struct snapshot_id_list *);
+
void bch2_vfs_exit(void);
int bch2_vfs_init(void);
#else
+static inline void bch2_evict_subvolume_inodes(struct bch_fs *c,
+ struct snapshot_id_list *s) {}
static inline void bch2_vfs_exit(void) {}
static inline int bch2_vfs_init(void) { return 0; }
diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c
index 86a6189503e4..361dbf338023 100644
--- a/fs/bcachefs/fsck.c
+++ b/fs/bcachefs/fsck.c
@@ -9,6 +9,7 @@
#include "fsck.h"
#include "inode.h"
#include "keylist.h"
+#include "subvolume.h"
#include "super.h"
#include "xattr.h"
@@ -17,15 +18,16 @@
#define QSTR(n) { { { .len = strlen(n) } }, .name = n }
-static s64 bch2_count_inode_sectors(struct btree_trans *trans, u64 inum)
+static s64 bch2_count_inode_sectors(struct btree_trans *trans, u64 inum,
+ u32 snapshot)
{
- struct btree_iter *iter;
+ struct btree_iter iter;
struct bkey_s_c k;
u64 sectors = 0;
int ret;
for_each_btree_key(trans, iter, BTREE_ID_extents,
- POS(inum, 0), 0, k, ret) {
+ SPOS(inum, 0, snapshot), 0, k, ret) {
if (k.k->p.inode != inum)
break;
@@ -33,33 +35,138 @@ static s64 bch2_count_inode_sectors(struct btree_trans *trans, u64 inum)
sectors += k.k->size;
}
- bch2_trans_iter_free(trans, iter);
+ bch2_trans_iter_exit(trans, &iter);
return ret ?: sectors;
}
+static s64 bch2_count_subdirs(struct btree_trans *trans, u64 inum,
+ u32 snapshot)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ struct bkey_s_c_dirent d;
+ u64 subdirs = 0;
+ int ret;
+
+ for_each_btree_key(trans, iter, BTREE_ID_dirents,
+ SPOS(inum, 0, snapshot), 0, k, ret) {
+ if (k.k->p.inode != inum)
+ break;
+
+ if (k.k->type != KEY_TYPE_dirent)
+ continue;
+
+ d = bkey_s_c_to_dirent(k);
+ if (d.v->d_type == DT_DIR)
+ subdirs++;
+ }
+
+ bch2_trans_iter_exit(trans, &iter);
+
+ return ret ?: subdirs;
+}
+
+static int __snapshot_lookup_subvol(struct btree_trans *trans, u32 snapshot,
+ u32 *subvol)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ int ret;
+
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_snapshots,
+ POS(0, snapshot), 0);
+ k = bch2_btree_iter_peek_slot(&iter);
+ ret = bkey_err(k);
+ if (ret)
+ goto err;
+
+ if (k.k->type != KEY_TYPE_snapshot) {
+ bch_err(trans->c, "snapshot %u not fonud", snapshot);
+ ret = -ENOENT;
+ goto err;
+ }
+
+ *subvol = le32_to_cpu(bkey_s_c_to_snapshot(k).v->subvol);
+err:
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+
+}
+
+static int __subvol_lookup(struct btree_trans *trans, u32 subvol,
+ u32 *snapshot, u64 *inum)
+{
+ struct bch_subvolume s;
+ int ret;
+
+ ret = bch2_subvolume_get(trans, subvol, false, 0, &s);
+
+ *snapshot = le32_to_cpu(s.snapshot);
+ *inum = le64_to_cpu(s.inode);
+ return ret;
+}
+
+static int subvol_lookup(struct btree_trans *trans, u32 subvol,
+ u32 *snapshot, u64 *inum)
+{
+ return lockrestart_do(trans, __subvol_lookup(trans, subvol, snapshot, inum));
+}
+
+static int lookup_first_inode(struct btree_trans *trans, u64 inode_nr,
+ struct bch_inode_unpacked *inode)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ int ret;
+
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_inodes,
+ POS(0, inode_nr),
+ BTREE_ITER_ALL_SNAPSHOTS);
+ k = bch2_btree_iter_peek(&iter);
+ ret = bkey_err(k);
+ if (ret)
+ goto err;
+
+ if (!k.k || bkey_cmp(k.k->p, POS(0, inode_nr))) {
+ ret = -ENOENT;
+ goto err;
+ }
+
+ ret = bch2_inode_unpack(k, inode);
+err:
+ if (ret && ret != -EINTR)
+ bch_err(trans->c, "error %i fetching inode %llu",
+ ret, inode_nr);
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
static int __lookup_inode(struct btree_trans *trans, u64 inode_nr,
struct bch_inode_unpacked *inode,
u32 *snapshot)
{
- struct btree_iter *iter;
+ struct btree_iter iter;
struct bkey_s_c k;
int ret;
- iter = bch2_trans_get_iter(trans, BTREE_ID_inodes,
- POS(0, inode_nr), 0);
- k = bch2_btree_iter_peek_slot(iter);
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_inodes,
+ SPOS(0, inode_nr, *snapshot), 0);
+ k = bch2_btree_iter_peek_slot(&iter);
ret = bkey_err(k);
if (ret)
goto err;
- if (snapshot)
- *snapshot = iter->pos.snapshot;
- ret = k.k->type == KEY_TYPE_inode
- ? bch2_inode_unpack(bkey_s_c_to_inode(k), inode)
+ ret = bkey_is_inode(k.k)
+ ? bch2_inode_unpack(k, inode)
: -ENOENT;
+ if (!ret)
+ *snapshot = iter.pos.snapshot;
err:
- bch2_trans_iter_free(trans, iter);
+ if (ret && ret != -EINTR)
+ bch_err(trans->c, "error %i fetching inode %llu:%u",
+ ret, inode_nr, *snapshot);
+ bch2_trans_iter_exit(trans, &iter);
return ret;
}
@@ -70,16 +177,41 @@ static int lookup_inode(struct btree_trans *trans, u64 inode_nr,
return lockrestart_do(trans, __lookup_inode(trans, inode_nr, inode, snapshot));
}
+static int __lookup_dirent(struct btree_trans *trans,
+ struct bch_hash_info hash_info,
+ subvol_inum dir, struct qstr *name,
+ u64 *target, unsigned *type)
+{
+ struct btree_iter iter;
+ struct bkey_s_c_dirent d;
+ int ret;
+
+ ret = bch2_hash_lookup(trans, &iter, bch2_dirent_hash_desc,
+ &hash_info, dir, name, 0);
+ if (ret)
+ return ret;
+
+ d = bkey_s_c_to_dirent(bch2_btree_iter_peek_slot(&iter));
+ *target = le64_to_cpu(d.v->d_inum);
+ *type = d.v->d_type;
+ bch2_trans_iter_exit(trans, &iter);
+ return 0;
+}
+
static int __write_inode(struct btree_trans *trans,
struct bch_inode_unpacked *inode,
u32 snapshot)
{
- struct btree_iter *inode_iter =
- bch2_trans_get_iter(trans, BTREE_ID_inodes,
- SPOS(0, inode->bi_inum, snapshot),
- BTREE_ITER_INTENT);
- int ret = bch2_inode_write(trans, inode_iter, inode);
- bch2_trans_iter_put(trans, inode_iter);
+ struct btree_iter iter;
+ int ret;
+
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_inodes,
+ SPOS(0, inode->bi_inum, snapshot),
+ BTREE_ITER_INTENT);
+
+ ret = bch2_btree_iter_traverse(&iter) ?:
+ bch2_inode_write(trans, &iter, inode);
+ bch2_trans_iter_exit(trans, &iter);
return ret;
}
@@ -96,110 +228,176 @@ static int write_inode(struct btree_trans *trans,
return ret;
}
+static int fsck_inode_rm(struct btree_trans *trans, u64 inum, u32 snapshot)
+{
+ struct btree_iter iter = { NULL };
+ struct bkey_i_inode_generation delete;
+ struct bch_inode_unpacked inode_u;
+ struct bkey_s_c k;
+ int ret;
+
+ ret = bch2_btree_delete_range_trans(trans, BTREE_ID_extents,
+ SPOS(inum, 0, snapshot),
+ SPOS(inum, U64_MAX, snapshot),
+ 0, NULL) ?:
+ bch2_btree_delete_range_trans(trans, BTREE_ID_dirents,
+ SPOS(inum, 0, snapshot),
+ SPOS(inum, U64_MAX, snapshot),
+ 0, NULL) ?:
+ bch2_btree_delete_range_trans(trans, BTREE_ID_xattrs,
+ SPOS(inum, 0, snapshot),
+ SPOS(inum, U64_MAX, snapshot),
+ 0, NULL);
+ if (ret)
+ goto err;
+retry:
+ bch2_trans_begin(trans);
+
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_inodes,
+ SPOS(0, inum, snapshot), BTREE_ITER_INTENT);
+ k = bch2_btree_iter_peek_slot(&iter);
+
+ ret = bkey_err(k);
+ if (ret)
+ goto err;
+
+ if (!bkey_is_inode(k.k)) {
+ bch2_fs_inconsistent(trans->c,
+ "inode %llu:%u not found when deleting",
+ inum, snapshot);
+ ret = -EIO;
+ goto err;
+ }
+
+ bch2_inode_unpack(k, &inode_u);
+
+ /* Subvolume root? */
+ if (inode_u.bi_subvol) {
+ ret = bch2_subvolume_delete(trans, inode_u.bi_subvol);
+ if (ret)
+ goto err;
+ }
+
+ bkey_inode_generation_init(&delete.k_i);
+ delete.k.p = iter.pos;
+ delete.v.bi_generation = cpu_to_le32(inode_u.bi_generation + 1);
+
+ ret = bch2_trans_update(trans, &iter, &delete.k_i, 0) ?:
+ bch2_trans_commit(trans, NULL, NULL,
+ BTREE_INSERT_NOFAIL);
+err:
+ bch2_trans_iter_exit(trans, &iter);
+ if (ret == -EINTR)
+ goto retry;
+
+ return ret;
+}
+
static int __remove_dirent(struct btree_trans *trans, struct bpos pos)
{
struct bch_fs *c = trans->c;
- struct btree_iter *iter;
+ struct btree_iter iter;
struct bch_inode_unpacked dir_inode;
struct bch_hash_info dir_hash_info;
int ret;
- ret = lookup_inode(trans, pos.inode, &dir_inode, NULL);
+ ret = lookup_first_inode(trans, pos.inode, &dir_inode);
if (ret)
return ret;
dir_hash_info = bch2_hash_info_init(c, &dir_inode);
- iter = bch2_trans_get_iter(trans, BTREE_ID_dirents, pos, BTREE_ITER_INTENT);
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_dirents, pos, BTREE_ITER_INTENT);
ret = bch2_hash_delete_at(trans, bch2_dirent_hash_desc,
- &dir_hash_info, iter);
- bch2_trans_iter_put(trans, iter);
- return ret;
-}
-
-static int remove_dirent(struct btree_trans *trans, struct bpos pos)
-{
- int ret = __bch2_trans_do(trans, NULL, NULL,
- BTREE_INSERT_NOFAIL|
- BTREE_INSERT_LAZY_RW,
- __remove_dirent(trans, pos));
- if (ret)
- bch_err(trans->c, "remove_dirent: err %i deleting dirent", ret);
+ &dir_hash_info, &iter, 0);
+ bch2_trans_iter_exit(trans, &iter);
return ret;
}
/* Get lost+found, create if it doesn't exist: */
-static int lookup_lostfound(struct btree_trans *trans,
+static int lookup_lostfound(struct btree_trans *trans, u32 subvol,
struct bch_inode_unpacked *lostfound)
{
struct bch_fs *c = trans->c;
struct bch_inode_unpacked root;
struct bch_hash_info root_hash_info;
struct qstr lostfound_str = QSTR("lost+found");
- u64 inum;
+ subvol_inum root_inum = { .subvol = subvol };
+ u64 inum = 0;
+ unsigned d_type = 0;
u32 snapshot;
int ret;
- ret = lookup_inode(trans, BCACHEFS_ROOT_INO, &root, &snapshot);
- if (ret && ret != -ENOENT)
+ ret = __subvol_lookup(trans, subvol, &snapshot, &root_inum.inum);
+ if (ret)
+ return ret;
+
+ ret = __lookup_inode(trans, root_inum.inum, &root, &snapshot);
+ if (ret)
return ret;
root_hash_info = bch2_hash_info_init(c, &root);
- inum = bch2_dirent_lookup(c, BCACHEFS_ROOT_INO, &root_hash_info,
- &lostfound_str);
- if (!inum) {
+
+ ret = __lookup_dirent(trans, root_hash_info, root_inum,
+ &lostfound_str, &inum, &d_type);
+ if (ret == -ENOENT) {
bch_notice(c, "creating lost+found");
goto create_lostfound;
}
- ret = lookup_inode(trans, inum, lostfound, &snapshot);
- if (ret && ret != -ENOENT) {
- /*
- * The check_dirents pass has already run, dangling dirents
- * shouldn't exist here:
- */
+ if (ret && ret != -EINTR)
bch_err(c, "error looking up lost+found: %i", ret);
+ if (ret)
return ret;
- }
- if (ret == -ENOENT) {
-create_lostfound:
- bch2_inode_init_early(c, lostfound);
-
- ret = __bch2_trans_do(trans, NULL, NULL,
- BTREE_INSERT_NOFAIL|
- BTREE_INSERT_LAZY_RW,
- bch2_create_trans(trans,
- BCACHEFS_ROOT_INO, &root,
- lostfound,
- &lostfound_str,
- 0, 0, S_IFDIR|0700, 0, NULL, NULL));
- if (ret)
- bch_err(c, "error creating lost+found: %i", ret);
+ if (d_type != DT_DIR) {
+ bch_err(c, "error looking up lost+found: not a directory");
+ return ret;
}
- return 0;
+ /*
+ * The check_dirents pass has already run, dangling dirents
+ * shouldn't exist here:
+ */
+ return __lookup_inode(trans, inum, lostfound, &snapshot);
+
+create_lostfound:
+ bch2_inode_init_early(c, lostfound);
+
+ ret = bch2_create_trans(trans, root_inum, &root,
+ lostfound, &lostfound_str,
+ 0, 0, S_IFDIR|0700, 0, NULL, NULL,
+ (subvol_inum) { }, 0);
+ if (ret && ret != -EINTR)
+ bch_err(c, "error creating lost+found: %i", ret);
+ return ret;
}
-static int reattach_inode(struct btree_trans *trans,
- struct bch_inode_unpacked *inode)
+static int __reattach_inode(struct btree_trans *trans,
+ struct bch_inode_unpacked *inode,
+ u32 inode_snapshot)
{
struct bch_hash_info dir_hash;
struct bch_inode_unpacked lostfound;
char name_buf[20];
struct qstr name;
u64 dir_offset = 0;
+ u32 subvol;
int ret;
- ret = lookup_lostfound(trans, &lostfound);
+ ret = __snapshot_lookup_subvol(trans, inode_snapshot, &subvol);
+ if (ret)
+ return ret;
+
+ ret = lookup_lostfound(trans, subvol, &lostfound);
if (ret)
return ret;
if (S_ISDIR(inode->bi_mode)) {
lostfound.bi_nlink++;
- ret = write_inode(trans, &lostfound, U32_MAX);
+ ret = __write_inode(trans, &lostfound, U32_MAX);
if (ret)
return ret;
}
@@ -209,33 +407,51 @@ static int reattach_inode(struct btree_trans *trans,
snprintf(name_buf, sizeof(name_buf), "%llu", inode->bi_inum);
name = (struct qstr) QSTR(name_buf);
- ret = __bch2_trans_do(trans, NULL, NULL, BTREE_INSERT_LAZY_RW,
- bch2_dirent_create(trans, lostfound.bi_inum, &dir_hash,
- mode_to_type(inode->bi_mode),
- &name, inode->bi_inum, &dir_offset,
- BCH_HASH_SET_MUST_CREATE));
+ ret = bch2_dirent_create(trans,
+ (subvol_inum) {
+ .subvol = subvol,
+ .inum = lostfound.bi_inum,
+ },
+ &dir_hash,
+ inode_d_type(inode),
+ &name, inode->bi_inum, &dir_offset,
+ BCH_HASH_SET_MUST_CREATE);
+ if (ret)
+ return ret;
+
+ inode->bi_dir = lostfound.bi_inum;
+ inode->bi_dir_offset = dir_offset;
+
+ return __write_inode(trans, inode, inode_snapshot);
+}
+
+static int reattach_inode(struct btree_trans *trans,
+ struct bch_inode_unpacked *inode,
+ u32 inode_snapshot)
+{
+ int ret = __bch2_trans_do(trans, NULL, NULL,
+ BTREE_INSERT_LAZY_RW|
+ BTREE_INSERT_NOFAIL,
+ __reattach_inode(trans, inode, inode_snapshot));
if (ret) {
bch_err(trans->c, "error %i reattaching inode %llu",
ret, inode->bi_inum);
return ret;
}
- inode->bi_dir = lostfound.bi_inum;
- inode->bi_dir_offset = dir_offset;
-
- return write_inode(trans, inode, U32_MAX);
+ return ret;
}
static int remove_backpointer(struct btree_trans *trans,
struct bch_inode_unpacked *inode)
{
- struct btree_iter *iter;
+ struct btree_iter iter;
struct bkey_s_c k;
int ret;
- iter = bch2_trans_get_iter(trans, BTREE_ID_dirents,
- POS(inode->bi_dir, inode->bi_dir_offset), 0);
- k = bch2_btree_iter_peek_slot(iter);
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_dirents,
+ POS(inode->bi_dir, inode->bi_dir_offset), 0);
+ k = bch2_btree_iter_peek_slot(&iter);
ret = bkey_err(k);
if (ret)
goto out;
@@ -244,45 +460,251 @@ static int remove_backpointer(struct btree_trans *trans,
goto out;
}
- ret = remove_dirent(trans, k.k->p);
+ ret = __remove_dirent(trans, k.k->p);
out:
- bch2_trans_iter_put(trans, iter);
+ bch2_trans_iter_exit(trans, &iter);
return ret;
}
+static int snapshots_seen_update(struct bch_fs *c, struct snapshots_seen *s, struct bpos pos)
+{
+ pos.snapshot = snapshot_t(c, pos.snapshot)->equiv;
+
+ if (bkey_cmp(s->pos, pos))
+ s->nr = 0;
+ s->pos = pos;
+
+ /* Might get called multiple times due to lock restarts */
+ if (s->nr && s->d[s->nr - 1] == pos.snapshot)
+ return 0;
+
+ return snapshots_seen_add(c, s, pos.snapshot);
+}
+
+/**
+ * key_visible_in_snapshot - returns true if @id is a descendant of @ancestor,
+ * and @ancestor hasn't been overwritten in @seen
+ *
+ * That is, returns whether a key in the @ancestor snapshot is visible in the @id snapshot
+ */
+static bool key_visible_in_snapshot(struct bch_fs *c, struct snapshots_seen *seen,
+ u32 id, u32 ancestor)
+{
+ ssize_t i;
+
+ BUG_ON(id > ancestor);
+
+ id = snapshot_t(c, id)->equiv;
+ ancestor = snapshot_t(c, ancestor)->equiv;
+
+ /* @ancestor should be the snapshot most recently added to @seen */
+ BUG_ON(!seen->nr || seen->d[seen->nr - 1] != ancestor);
+ BUG_ON(seen->pos.snapshot != ancestor);
+
+ if (id == ancestor)
+ return true;
+
+ if (!bch2_snapshot_is_ancestor(c, id, ancestor))
+ return false;
+
+ for (i = seen->nr - 2;
+ i >= 0 && seen->d[i] >= id;
+ --i)
+ if (bch2_snapshot_is_ancestor(c, id, seen->d[i]) &&
+ bch2_snapshot_is_ancestor(c, seen->d[i], ancestor))
+ return false;
+
+ return true;
+}
+
+/**
+ * ref_visible - given a key with snapshot id @src that points to a key with
+ * snapshot id @dst, test whether there is some snapshot in which @dst is
+ * visible.
+ *
+ * This assumes we're visiting @src keys in natural key order.
+ *
+ * @s - list of snapshot IDs already seen at @src
+ * @src - snapshot ID of src key
+ * @dst - snapshot ID of dst key
+ */
+static int ref_visible(struct bch_fs *c, struct snapshots_seen *s,
+ u32 src, u32 dst)
+{
+ return dst <= src
+ ? key_visible_in_snapshot(c, s, dst, src)
+ : bch2_snapshot_is_ancestor(c, src, dst);
+}
+
+#define for_each_visible_inode(_c, _s, _w, _snapshot, _i) \
+ for (_i = (_w)->d; _i < (_w)->d + (_w)->nr && (_i)->snapshot <= (_snapshot); _i++)\
+ if (key_visible_in_snapshot(_c, _s, _i->snapshot, _snapshot))
+
struct inode_walker {
- bool first_this_inode;
- bool have_inode;
- u64 cur_inum;
- u32 snapshot;
- struct bch_inode_unpacked inode;
+ bool first_this_inode;
+ u64 cur_inum;
+
+ size_t nr;
+ size_t size;
+ struct inode_walker_entry {
+ struct bch_inode_unpacked inode;
+ u32 snapshot;
+ u64 count;
+ } *d;
};
+static void inode_walker_exit(struct inode_walker *w)
+{
+ kfree(w->d);
+ w->d = NULL;
+}
+
static struct inode_walker inode_walker_init(void)
{
- return (struct inode_walker) {
- .cur_inum = -1,
- .have_inode = false,
+ return (struct inode_walker) { 0, };
+}
+
+static int inode_walker_realloc(struct inode_walker *w)
+{
+ if (w->nr == w->size) {
+ size_t new_size = max_t(size_t, 8UL, w->size * 2);
+ void *d = krealloc(w->d, new_size * sizeof(w->d[0]),
+ GFP_KERNEL);
+ if (!d)
+ return -ENOMEM;
+
+ w->d = d;
+ w->size = new_size;
+ }
+
+ return 0;
+}
+
+static int add_inode(struct bch_fs *c, struct inode_walker *w,
+ struct bkey_s_c inode)
+{
+ struct bch_inode_unpacked u;
+ int ret;
+
+ ret = inode_walker_realloc(w);
+ if (ret)
+ return ret;
+
+ BUG_ON(bch2_inode_unpack(inode, &u));
+
+ w->d[w->nr++] = (struct inode_walker_entry) {
+ .inode = u,
+ .snapshot = snapshot_t(c, inode.k->p.snapshot)->equiv,
};
+
+ return 0;
}
-static int walk_inode(struct btree_trans *trans,
- struct inode_walker *w, u64 inum)
+static int __walk_inode(struct btree_trans *trans,
+ struct inode_walker *w, struct bpos pos)
{
- if (inum != w->cur_inum) {
- int ret = lookup_inode(trans, inum, &w->inode, &w->snapshot);
+ struct bch_fs *c = trans->c;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ unsigned i, ancestor_pos;
+ int ret;
- if (ret && ret != -ENOENT)
- return ret;
+ pos.snapshot = snapshot_t(c, pos.snapshot)->equiv;
- w->have_inode = !ret;
- w->cur_inum = inum;
- w->first_this_inode = true;
- } else {
+ if (pos.inode == w->cur_inum) {
w->first_this_inode = false;
+ goto lookup_snapshot;
}
- return 0;
+ w->nr = 0;
+
+ for_each_btree_key(trans, iter, BTREE_ID_inodes, POS(0, pos.inode),
+ BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
+ if (k.k->p.offset != pos.inode)
+ break;
+
+ if (bkey_is_inode(k.k))
+ add_inode(c, w, k);
+ }
+ bch2_trans_iter_exit(trans, &iter);
+
+ if (ret)
+ return ret;
+
+ w->cur_inum = pos.inode;
+ w->first_this_inode = true;
+lookup_snapshot:
+ for (i = 0; i < w->nr; i++)
+ if (bch2_snapshot_is_ancestor(c, pos.snapshot, w->d[i].snapshot))
+ goto found;
+ return INT_MAX;
+found:
+ BUG_ON(pos.snapshot > w->d[i].snapshot);
+
+ if (pos.snapshot != w->d[i].snapshot) {
+ ancestor_pos = i;
+
+ while (i && w->d[i - 1].snapshot > pos.snapshot)
+ --i;
+
+ ret = inode_walker_realloc(w);
+ if (ret)
+ return ret;
+
+ array_insert_item(w->d, w->nr, i, w->d[ancestor_pos]);
+ w->d[i].snapshot = pos.snapshot;
+ w->d[i].count = 0;
+ }
+
+ return i;
+}
+
+static int __get_visible_inodes(struct btree_trans *trans,
+ struct inode_walker *w,
+ struct snapshots_seen *s,
+ u64 inum)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ int ret;
+
+ w->nr = 0;
+
+ for_each_btree_key(trans, iter, BTREE_ID_inodes, POS(0, inum),
+ BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
+ if (k.k->p.offset != inum)
+ break;
+
+ if (!bkey_is_inode(k.k))
+ continue;
+
+ if (ref_visible(c, s, s->pos.snapshot, k.k->p.snapshot)) {
+ add_inode(c, w, k);
+ if (k.k->p.snapshot >= s->pos.snapshot)
+ break;
+ }
+ }
+ bch2_trans_iter_exit(trans, &iter);
+
+ return ret;
+}
+
+static int check_key_has_snapshot(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bkey_s_c k)
+{
+ struct bch_fs *c = trans->c;
+ char buf[200];
+ int ret = 0;
+
+ if (mustfix_fsck_err_on(!snapshot_t(c, k.k->p.snapshot)->equiv, c,
+ "key in missing snapshot: %s",
+ (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)))
+ return bch2_btree_delete_at(trans, iter,
+ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?: 1;
+fsck_err:
+ return ret;
}
static int hash_redo_key(struct btree_trans *trans,
@@ -290,6 +712,9 @@ static int hash_redo_key(struct btree_trans *trans,
struct bch_hash_info *hash_info,
struct btree_iter *k_iter, struct bkey_s_c k)
{
+ bch_err(trans->c, "hash_redo_key() not implemented yet");
+ return -EINVAL;
+#if 0
struct bkey_i *delete;
struct bkey_i *tmp;
@@ -305,28 +730,10 @@ static int hash_redo_key(struct btree_trans *trans,
bkey_init(&delete->k);
delete->k.p = k_iter->pos;
- return bch2_trans_update(trans, k_iter, delete, 0) ?:
+ return bch2_btree_iter_traverse(k_iter) ?:
+ bch2_trans_update(trans, k_iter, delete, 0) ?:
bch2_hash_set(trans, desc, hash_info, k_iter->pos.inode, tmp, 0);
-}
-
-static int fsck_hash_delete_at(struct btree_trans *trans,
- const struct bch_hash_desc desc,
- struct bch_hash_info *info,
- struct btree_iter *iter)
-{
- int ret;
-retry:
- ret = bch2_hash_delete_at(trans, desc, info, iter) ?:
- bch2_trans_commit(trans, NULL, NULL,
- BTREE_INSERT_NOFAIL|
- BTREE_INSERT_LAZY_RW);
- if (ret == -EINTR) {
- ret = bch2_btree_iter_traverse(iter);
- if (!ret)
- goto retry;
- }
-
- return ret;
+#endif
}
static int hash_check_key(struct btree_trans *trans,
@@ -335,7 +742,7 @@ static int hash_check_key(struct btree_trans *trans,
struct btree_iter *k_iter, struct bkey_s_c hash_k)
{
struct bch_fs *c = trans->c;
- struct btree_iter *iter = NULL;
+ struct btree_iter iter = { NULL };
char buf[200];
struct bkey_s_c k;
u64 hash;
@@ -362,20 +769,17 @@ static int hash_check_key(struct btree_trans *trans,
"duplicate hash table keys:\n%s",
(bch2_bkey_val_to_text(&PBUF(buf), c,
hash_k), buf))) {
- ret = fsck_hash_delete_at(trans, desc, hash_info, k_iter);
- if (ret)
- return ret;
- ret = 1;
+ ret = bch2_hash_delete_at(trans, desc, hash_info, k_iter, 0) ?: 1;
break;
}
if (bkey_deleted(k.k)) {
- bch2_trans_iter_free(trans, iter);
+ bch2_trans_iter_exit(trans, &iter);
goto bad_hash;
}
}
- bch2_trans_iter_free(trans, iter);
+ bch2_trans_iter_exit(trans, &iter);
return ret;
bad_hash:
if (fsck_err(c, "hash table key at wrong offset: btree %u inode %llu offset %llu, "
@@ -384,9 +788,7 @@ bad_hash:
(bch2_bkey_val_to_text(&PBUF(buf), c, hash_k), buf)) == FSCK_ERR_IGNORE)
return 0;
- ret = __bch2_trans_do(trans, NULL, NULL,
- BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW,
- hash_redo_key(trans, desc, hash_info, k_iter, hash_k));
+ ret = hash_redo_key(trans, desc, hash_info, k_iter, hash_k);
if (ret) {
bch_err(c, "hash_redo_key err %i", ret);
return ret;
@@ -398,30 +800,64 @@ fsck_err:
static int check_inode(struct btree_trans *trans,
struct btree_iter *iter,
- struct bkey_s_c_inode inode)
+ struct bch_inode_unpacked *prev,
+ bool full)
{
struct bch_fs *c = trans->c;
+ struct bkey_s_c k;
struct bch_inode_unpacked u;
bool do_update = false;
- int ret = 0;
+ int ret;
- ret = bch2_inode_unpack(inode, &u);
+ k = bch2_btree_iter_peek(iter);
+ if (!k.k)
+ return 0;
- if (bch2_fs_inconsistent_on(ret, c,
- "error unpacking inode %llu in fsck",
- inode.k->p.inode))
+ ret = bkey_err(k);
+ if (ret)
return ret;
+ ret = check_key_has_snapshot(trans, iter, k);
+ if (ret)
+ return ret < 0 ? ret : 0;
+
+ /*
+ * if snapshot id isn't a leaf node, skip it - deletion in
+ * particular is not atomic, so on the internal snapshot nodes
+ * we can see inodes marked for deletion after a clean shutdown
+ */
+ if (bch2_snapshot_internal_node(c, k.k->p.snapshot))
+ return 0;
+
+ if (!bkey_is_inode(k.k))
+ return 0;
+
+ BUG_ON(bch2_inode_unpack(k, &u));
+
+ if (!full &&
+ !(u.bi_flags & (BCH_INODE_I_SIZE_DIRTY|
+ BCH_INODE_I_SECTORS_DIRTY|
+ BCH_INODE_UNLINKED)))
+ return 0;
+
+ if (prev->bi_inum != u.bi_inum)
+ *prev = u;
+
+ if (fsck_err_on(prev->bi_hash_seed != u.bi_hash_seed ||
+ inode_d_type(prev) != inode_d_type(&u), c,
+ "inodes in different snapshots don't match")) {
+ bch_err(c, "repair not implemented yet");
+ return -EINVAL;
+ }
+
if (u.bi_flags & BCH_INODE_UNLINKED &&
(!c->sb.clean ||
fsck_err(c, "filesystem marked clean, but inode %llu unlinked",
u.bi_inum))) {
- bch_verbose(c, "deleting inode %llu", u.bi_inum);
-
bch2_trans_unlock(trans);
bch2_fs_lazy_rw(c);
- ret = bch2_inode_rm(c, u.bi_inum, false);
+ ret = fsck_inode_rm(trans, u.bi_inum, iter->pos.snapshot);
if (ret)
bch_err(c, "error in fsck: error %i while deleting inode", ret);
return ret;
@@ -441,9 +877,10 @@ static int check_inode(struct btree_trans *trans,
* just switch units to bytes and that issue goes away
*/
ret = bch2_btree_delete_range_trans(trans, BTREE_ID_extents,
- POS(u.bi_inum, round_up(u.bi_size, block_bytes(c)) >> 9),
+ SPOS(u.bi_inum, round_up(u.bi_size, block_bytes(c)) >> 9,
+ iter->pos.snapshot),
POS(u.bi_inum, U64_MAX),
- NULL);
+ 0, NULL);
if (ret) {
bch_err(c, "error in fsck: error %i truncating inode", ret);
return ret;
@@ -468,7 +905,7 @@ static int check_inode(struct btree_trans *trans,
bch_verbose(c, "recounting sectors for inode %llu",
u.bi_inum);
- sectors = bch2_count_inode_sectors(trans, u.bi_inum);
+ sectors = bch2_count_inode_sectors(trans, u.bi_inum, iter->pos.snapshot);
if (sectors < 0) {
bch_err(c, "error in fsck: error %i recounting inode sectors",
(int) sectors);
@@ -488,10 +925,7 @@ static int check_inode(struct btree_trans *trans,
}
if (do_update) {
- ret = __bch2_trans_do(trans, NULL, NULL,
- BTREE_INSERT_NOFAIL|
- BTREE_INSERT_LAZY_RW,
- bch2_inode_write(trans, iter, &u));
+ ret = write_inode(trans, &u, iter->pos.snapshot);
if (ret)
bch_err(c, "error in fsck: error %i "
"updating inode", ret);
@@ -504,41 +938,99 @@ noinline_for_stack
static int check_inodes(struct bch_fs *c, bool full)
{
struct btree_trans trans;
- struct btree_iter *iter;
- struct bkey_s_c k;
- struct bkey_s_c_inode inode;
+ struct btree_iter iter;
+ struct bch_inode_unpacked prev = { 0 };
int ret;
bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
- for_each_btree_key(&trans, iter, BTREE_ID_inodes, POS_MIN,
- BTREE_ITER_INTENT|
- BTREE_ITER_PREFETCH, k, ret) {
- if (k.k->type != KEY_TYPE_inode)
- continue;
+ bch2_trans_iter_init(&trans, &iter, BTREE_ID_inodes, POS_MIN,
+ BTREE_ITER_INTENT|
+ BTREE_ITER_PREFETCH|
+ BTREE_ITER_ALL_SNAPSHOTS);
- inode = bkey_s_c_to_inode(k);
+ do {
+ ret = __bch2_trans_do(&trans, NULL, NULL,
+ BTREE_INSERT_LAZY_RW|
+ BTREE_INSERT_NOFAIL,
+ check_inode(&trans, &iter, &prev, full));
+ if (ret)
+ break;
+ } while (bch2_btree_iter_advance(&iter));
+ bch2_trans_iter_exit(&trans, &iter);
- if (full ||
- (inode.v->bi_flags & (BCH_INODE_I_SIZE_DIRTY|
- BCH_INODE_I_SECTORS_DIRTY|
- BCH_INODE_UNLINKED))) {
- ret = check_inode(&trans, iter, inode);
- if (ret)
- break;
- }
+ bch2_trans_exit(&trans);
+ return ret;
+}
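All of the rewritten fsck passes follow this same shape: one iterator opened with BTREE_ITER_ALL_SNAPSHOTS, each key handled inside __bch2_trans_do() so transaction restarts are retried by the helper instead of an explicit retry label, and the iterator advanced only after the key was processed cleanly. A minimal sketch of that shared pattern, assuming a hypothetical per-key callback with the same shape as check_inode()/check_subvol(); the check_btree() wrapper below does not exist in this patch and is for illustration only:

static int check_btree(struct bch_fs *c, enum btree_id id,
		       int (*fn)(struct btree_trans *, struct btree_iter *))
{
	struct btree_trans trans;
	struct btree_iter iter;
	int ret;

	bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
	bch2_trans_iter_init(&trans, &iter, id, POS_MIN,
			     BTREE_ITER_INTENT|
			     BTREE_ITER_PREFETCH|
			     BTREE_ITER_ALL_SNAPSHOTS);

	do {
		/*
		 * __bch2_trans_do() re-runs fn() and the commit on -EINTR,
		 * going read-write lazily only if a repair actually needs
		 * to write something:
		 */
		ret = __bch2_trans_do(&trans, NULL, NULL,
				      BTREE_INSERT_LAZY_RW|
				      BTREE_INSERT_NOFAIL,
				      fn(&trans, &iter));
		if (ret)
			break;
	} while (bch2_btree_iter_advance(&iter));

	bch2_trans_iter_exit(&trans, &iter);
	bch2_trans_exit(&trans);
	return ret;
}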
+
+static int check_subvol(struct btree_trans *trans,
+ struct btree_iter *iter)
+{
+ struct bkey_s_c k;
+ struct bkey_s_c_subvolume subvol;
+ int ret;
+
+ k = bch2_btree_iter_peek(iter);
+ if (!k.k)
+ return 0;
+
+ ret = bkey_err(k);
+ if (ret)
+ return ret;
+
+ if (k.k->type != KEY_TYPE_subvolume)
+ return 0;
+
+ subvol = bkey_s_c_to_subvolume(k);
+
+ if (BCH_SUBVOLUME_UNLINKED(subvol.v)) {
+ ret = bch2_subvolume_delete(trans, iter->pos.offset);
+ if (ret && ret != -EINTR)
+ bch_err(trans->c, "error deleting subvolume %llu: %i",
+ iter->pos.offset, ret);
+ if (ret)
+ return ret;
}
- bch2_trans_iter_put(&trans, iter);
- BUG_ON(ret == -EINTR);
+ return 0;
+}
+
+noinline_for_stack
+static int check_subvols(struct bch_fs *c)
+{
+ struct btree_trans trans;
+ struct btree_iter iter;
+ int ret;
+
+ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
+
+ bch2_trans_iter_init(&trans, &iter, BTREE_ID_subvolumes,
+ POS_MIN,
+ BTREE_ITER_INTENT|
+ BTREE_ITER_PREFETCH);
+
+ do {
+ ret = __bch2_trans_do(&trans, NULL, NULL,
+ BTREE_INSERT_LAZY_RW|
+ BTREE_INSERT_NOFAIL,
+ check_subvol(&trans, &iter));
+ if (ret)
+ break;
+ } while (bch2_btree_iter_advance(&iter));
+ bch2_trans_iter_exit(&trans, &iter);
- return bch2_trans_exit(&trans) ?: ret;
+ bch2_trans_exit(&trans);
+ return ret;
}
+/*
+ * Checking for overlapping extents needs to be reimplemented
+ */
+#if 0
static int fix_overlapping_extent(struct btree_trans *trans,
struct bkey_s_c k, struct bpos cut_at)
{
- struct btree_iter *iter;
+ struct btree_iter iter;
struct bkey_i *u;
int ret;
@@ -558,45 +1050,208 @@ static int fix_overlapping_extent(struct btree_trans *trans,
* assume things about extent overwrites - we should be running the
* triggers manually here
*/
- iter = bch2_trans_get_iter(trans, BTREE_ID_extents, u->k.p,
- BTREE_ITER_INTENT|BTREE_ITER_NOT_EXTENTS);
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, u->k.p,
+ BTREE_ITER_INTENT|BTREE_ITER_NOT_EXTENTS);
- BUG_ON(iter->flags & BTREE_ITER_IS_EXTENTS);
- ret = bch2_trans_update(trans, iter, u, BTREE_TRIGGER_NORUN) ?:
+ BUG_ON(iter.flags & BTREE_ITER_IS_EXTENTS);
+ ret = bch2_btree_iter_traverse(&iter) ?:
+ bch2_trans_update(trans, &iter, u, BTREE_TRIGGER_NORUN) ?:
bch2_trans_commit(trans, NULL, NULL,
BTREE_INSERT_NOFAIL|
BTREE_INSERT_LAZY_RW);
- bch2_trans_iter_put(trans, iter);
+ bch2_trans_iter_exit(trans, &iter);
return ret;
}
+#endif
-static int inode_backpointer_exists(struct btree_trans *trans,
- struct bch_inode_unpacked *inode)
+static struct bkey_s_c_dirent dirent_get_by_pos(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bpos pos)
{
- struct btree_iter *iter;
struct bkey_s_c k;
int ret;
- iter = bch2_trans_get_iter(trans, BTREE_ID_dirents,
- POS(inode->bi_dir, inode->bi_dir_offset), 0);
+ bch2_trans_iter_init(trans, iter, BTREE_ID_dirents, pos, 0);
k = bch2_btree_iter_peek_slot(iter);
ret = bkey_err(k);
+ if (!ret && k.k->type != KEY_TYPE_dirent)
+ ret = -ENOENT;
+ if (ret) {
+ bch2_trans_iter_exit(trans, iter);
+ return (struct bkey_s_c_dirent) { .k = ERR_PTR(ret) };
+ }
+
+ return bkey_s_c_to_dirent(k);
+}
+
+static bool inode_points_to_dirent(struct bch_inode_unpacked *inode,
+ struct bkey_s_c_dirent d)
+{
+ return inode->bi_dir == d.k->p.inode &&
+ inode->bi_dir_offset == d.k->p.offset;
+}
+
+static bool dirent_points_to_inode(struct bkey_s_c_dirent d,
+ struct bch_inode_unpacked *inode)
+{
+ return d.v->d_type == DT_SUBVOL
+ ? le32_to_cpu(d.v->d_child_subvol) == inode->bi_subvol
+ : le64_to_cpu(d.v->d_inum) == inode->bi_inum;
+}
+
+static int inode_backpointer_exists(struct btree_trans *trans,
+ struct bch_inode_unpacked *inode,
+ u32 snapshot)
+{
+ struct btree_iter iter;
+ struct bkey_s_c_dirent d;
+ int ret;
+
+ d = dirent_get_by_pos(trans, &iter,
+ SPOS(inode->bi_dir, inode->bi_dir_offset, snapshot));
+ ret = bkey_err(d.s_c);
if (ret)
- goto out;
- if (k.k->type != KEY_TYPE_dirent)
- goto out;
+ return ret;
- ret = le64_to_cpu(bkey_s_c_to_dirent(k).v->d_inum) == inode->bi_inum;
-out:
- bch2_trans_iter_free(trans, iter);
+ ret = dirent_points_to_inode(d, inode);
+ bch2_trans_iter_exit(trans, &iter);
return ret;
}
-static bool inode_backpointer_matches(struct bkey_s_c_dirent d,
- struct bch_inode_unpacked *inode)
+static int check_i_sectors(struct btree_trans *trans, struct inode_walker *w)
{
- return d.k->p.inode == inode->bi_dir &&
- d.k->p.offset == inode->bi_dir_offset;
+ struct bch_fs *c = trans->c;
+ struct inode_walker_entry *i;
+ int ret = 0, ret2 = 0;
+ s64 count2;
+
+ for (i = w->d; i < w->d + w->nr; i++) {
+ if (i->inode.bi_sectors == i->count)
+ continue;
+
+ count2 = lockrestart_do(trans,
+ bch2_count_inode_sectors(trans, w->cur_inum, i->snapshot));
+
+ if (i->count != count2) {
+			bch_err(c, "fsck counted i_sectors wrong: got %llu, should be %llu",
+ i->count, count2);
+ i->count = count2;
+ if (i->inode.bi_sectors == i->count)
+ continue;
+ }
+
+ if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_I_SECTORS_DIRTY), c,
+ "inode %llu:%u has incorrect i_sectors: got %llu, should be %llu",
+ w->cur_inum, i->snapshot,
+ i->inode.bi_sectors, i->count) == FSCK_ERR_IGNORE)
+ continue;
+
+ i->inode.bi_sectors = i->count;
+ ret = write_inode(trans, &i->inode, i->snapshot);
+ if (ret)
+ break;
+ ret2 = -EINTR;
+ }
+fsck_err:
+ return ret ?: ret2;
+}
+
+static int check_extent(struct btree_trans *trans, struct btree_iter *iter,
+ struct inode_walker *inode,
+ struct snapshots_seen *s)
+{
+ struct bch_fs *c = trans->c;
+ struct bkey_s_c k;
+ struct inode_walker_entry *i;
+ char buf[200];
+ int ret = 0;
+
+ k = bch2_btree_iter_peek(iter);
+ if (!k.k)
+ return 0;
+
+ ret = bkey_err(k);
+ if (ret)
+ return ret;
+
+ ret = check_key_has_snapshot(trans, iter, k);
+ if (ret)
+ return ret < 0 ? ret : 0;
+
+ ret = snapshots_seen_update(c, s, k.k->p);
+ if (ret)
+ return ret;
+
+ if (k.k->type == KEY_TYPE_whiteout)
+ return 0;
+
+ if (inode->cur_inum != k.k->p.inode) {
+ ret = check_i_sectors(trans, inode);
+ if (ret)
+ return ret;
+ }
+#if 0
+ if (bkey_cmp(prev.k->k.p, bkey_start_pos(k.k)) > 0) {
+ char buf1[200];
+ char buf2[200];
+
+ bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(prev.k));
+ bch2_bkey_val_to_text(&PBUF(buf2), c, k);
+
+ if (fsck_err(c, "overlapping extents:\n%s\n%s", buf1, buf2))
+ return fix_overlapping_extent(trans, k, prev.k->k.p) ?: -EINTR;
+ }
+#endif
+ ret = __walk_inode(trans, inode, k.k->p);
+ if (ret < 0)
+ return ret;
+
+ if (fsck_err_on(ret == INT_MAX, c,
+ "extent in missing inode:\n %s",
+ (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)))
+ return bch2_btree_delete_at(trans, iter,
+ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
+
+ if (ret == INT_MAX)
+ return 0;
+
+ i = inode->d + ret;
+ ret = 0;
+
+ if (fsck_err_on(!S_ISREG(i->inode.bi_mode) &&
+ !S_ISLNK(i->inode.bi_mode), c,
+			"extent in non-regular inode mode %o:\n  %s",
+ i->inode.bi_mode,
+ (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)))
+ return bch2_btree_delete_at(trans, iter,
+ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
+
+ if (!bch2_snapshot_internal_node(c, k.k->p.snapshot)) {
+ for_each_visible_inode(c, s, inode, k.k->p.snapshot, i) {
+ if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_I_SIZE_DIRTY) &&
+ k.k->type != KEY_TYPE_reservation &&
+ k.k->p.offset > round_up(i->inode.bi_size, block_bytes(c)) >> 9, c,
+ "extent type %u offset %llu past end of inode %llu, i_size %llu",
+ k.k->type, k.k->p.offset, k.k->p.inode, i->inode.bi_size)) {
+ bch2_fs_lazy_rw(c);
+ return bch2_btree_delete_range_trans(trans, BTREE_ID_extents,
+ SPOS(k.k->p.inode, round_up(i->inode.bi_size, block_bytes(c)) >> 9,
+ k.k->p.snapshot),
+ POS(k.k->p.inode, U64_MAX),
+ 0, NULL) ?: -EINTR;
+ }
+ }
+ }
+
+ if (bkey_extent_is_allocation(k.k))
+ for_each_visible_inode(c, s, inode, k.k->p.snapshot, i)
+ i->count += k.k->size;
+#if 0
+ bch2_bkey_buf_reassemble(&prev, c, k);
+#endif
+
+fsck_err:
+ return ret;
}
/*
@@ -607,303 +1262,429 @@ noinline_for_stack
static int check_extents(struct bch_fs *c)
{
struct inode_walker w = inode_walker_init();
+ struct snapshots_seen s;
struct btree_trans trans;
- struct btree_iter *iter;
- struct bkey_s_c k;
- struct bkey_buf prev;
- u64 i_sectors = 0;
+ struct btree_iter iter;
int ret = 0;
+#if 0
+ struct bkey_buf prev;
bch2_bkey_buf_init(&prev);
prev.k->k = KEY(0, 0, 0);
+#endif
+ snapshots_seen_init(&s);
bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
bch_verbose(c, "checking extents");
- iter = bch2_trans_get_iter(&trans, BTREE_ID_extents,
- POS(BCACHEFS_ROOT_INO, 0),
- BTREE_ITER_INTENT|
- BTREE_ITER_PREFETCH);
-retry:
- while ((k = bch2_btree_iter_peek(iter)).k &&
- !(ret = bkey_err(k))) {
- if (w.have_inode &&
- w.cur_inum != k.k->p.inode &&
- !(w.inode.bi_flags & BCH_INODE_I_SECTORS_DIRTY) &&
- fsck_err_on(w.inode.bi_sectors != i_sectors, c,
- "inode %llu has incorrect i_sectors: got %llu, should be %llu",
- w.inode.bi_inum,
- w.inode.bi_sectors, i_sectors)) {
- w.inode.bi_sectors = i_sectors;
-
- ret = write_inode(&trans, &w.inode, w.snapshot);
+ bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents,
+ POS(BCACHEFS_ROOT_INO, 0),
+ BTREE_ITER_INTENT|
+ BTREE_ITER_PREFETCH|
+ BTREE_ITER_ALL_SNAPSHOTS);
+
+ do {
+ ret = __bch2_trans_do(&trans, NULL, NULL,
+ BTREE_INSERT_LAZY_RW|
+ BTREE_INSERT_NOFAIL,
+ check_extent(&trans, &iter, &w, &s));
+ if (ret)
+ break;
+ } while (bch2_btree_iter_advance(&iter));
+ bch2_trans_iter_exit(&trans, &iter);
+#if 0
+ bch2_bkey_buf_exit(&prev, c);
+#endif
+ inode_walker_exit(&w);
+ bch2_trans_exit(&trans);
+ snapshots_seen_exit(&s);
+
+ return ret;
+}
+
+static int check_subdir_count(struct btree_trans *trans, struct inode_walker *w)
+{
+ struct bch_fs *c = trans->c;
+ struct inode_walker_entry *i;
+ int ret = 0, ret2 = 0;
+ s64 count2;
+
+ for (i = w->d; i < w->d + w->nr; i++) {
+ if (i->inode.bi_nlink == i->count)
+ continue;
+
+ count2 = lockrestart_do(trans,
+ bch2_count_subdirs(trans, w->cur_inum, i->snapshot));
+
+ if (i->count != count2) {
+			bch_err(c, "fsck counted subdirectories wrong: got %llu, should be %llu",
+ i->count, count2);
+ i->count = count2;
+ if (i->inode.bi_nlink == i->count)
+ continue;
+ }
+
+ if (fsck_err_on(i->inode.bi_nlink != i->count, c,
+ "directory %llu:%u with wrong i_nlink: got %u, should be %llu",
+ w->cur_inum, i->snapshot, i->inode.bi_nlink, i->count)) {
+ i->inode.bi_nlink = i->count;
+ ret = write_inode(trans, &i->inode, i->snapshot);
if (ret)
break;
+ ret2 = -EINTR;
}
+ }
+fsck_err:
+ return ret ?: ret2;
+}
- if (bkey_cmp(prev.k->k.p, bkey_start_pos(k.k)) > 0) {
- char buf1[200];
- char buf2[200];
+static int check_dirent_target(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bkey_s_c_dirent d,
+ struct bch_inode_unpacked *target,
+ u32 target_snapshot)
+{
+ struct bch_fs *c = trans->c;
+ struct bkey_i_dirent *n;
+ bool backpointer_exists = true;
+ char buf[200];
+ int ret = 0;
- bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(prev.k));
- bch2_bkey_val_to_text(&PBUF(buf2), c, k);
+ if (!target->bi_dir &&
+ !target->bi_dir_offset) {
+ target->bi_dir = d.k->p.inode;
+ target->bi_dir_offset = d.k->p.offset;
+
+ ret = __write_inode(trans, target, target_snapshot);
+ if (ret)
+ goto err;
+ }
+
+ if (!inode_points_to_dirent(target, d)) {
+ ret = inode_backpointer_exists(trans, target, d.k->p.snapshot);
+ if (ret < 0)
+ goto err;
- if (fsck_err(c, "overlapping extents:\n%s\n%s", buf1, buf2))
- return fix_overlapping_extent(&trans, k, prev.k->k.p) ?: -EINTR;
+ backpointer_exists = ret;
+ ret = 0;
+
+ if (fsck_err_on(S_ISDIR(target->bi_mode) &&
+ backpointer_exists, c,
+ "directory %llu with multiple links",
+ target->bi_inum)) {
+ ret = __remove_dirent(trans, d.k->p);
+ if (ret)
+ goto err;
+ return 0;
}
- ret = walk_inode(&trans, &w, k.k->p.inode);
- if (ret)
- break;
+ if (fsck_err_on(backpointer_exists &&
+ !target->bi_nlink, c,
+ "inode %llu has multiple links but i_nlink 0",
+ target->bi_inum)) {
+ target->bi_nlink++;
+ target->bi_flags &= ~BCH_INODE_UNLINKED;
- if (w.first_this_inode)
- i_sectors = 0;
-
- if (fsck_err_on(!w.have_inode, c,
- "extent type %u for missing inode %llu",
- k.k->type, k.k->p.inode) ||
- fsck_err_on(w.have_inode &&
- !S_ISREG(w.inode.bi_mode) && !S_ISLNK(w.inode.bi_mode), c,
- "extent type %u for non regular file, inode %llu mode %o",
- k.k->type, k.k->p.inode, w.inode.bi_mode)) {
- bch2_fs_lazy_rw(c);
- return bch2_btree_delete_range_trans(&trans, BTREE_ID_extents,
- POS(k.k->p.inode, 0),
- POS(k.k->p.inode, U64_MAX),
- NULL) ?: -EINTR;
+ ret = __write_inode(trans, target, target_snapshot);
+ if (ret)
+ goto err;
}
- if (fsck_err_on(w.have_inode &&
- !(w.inode.bi_flags & BCH_INODE_I_SIZE_DIRTY) &&
- k.k->type != KEY_TYPE_reservation &&
- k.k->p.offset > round_up(w.inode.bi_size, block_bytes(c)) >> 9, c,
- "extent type %u offset %llu past end of inode %llu, i_size %llu",
- k.k->type, k.k->p.offset, k.k->p.inode, w.inode.bi_size)) {
- bch2_fs_lazy_rw(c);
- return bch2_btree_delete_range_trans(&trans, BTREE_ID_extents,
- POS(k.k->p.inode, round_up(w.inode.bi_size, block_bytes(c)) >> 9),
- POS(k.k->p.inode, U64_MAX),
- NULL) ?: -EINTR;
+ if (fsck_err_on(!backpointer_exists, c,
+ "inode %llu:%u has wrong backpointer:\n"
+ "got %llu:%llu\n"
+ "should be %llu:%llu",
+ target->bi_inum, target_snapshot,
+ target->bi_dir,
+ target->bi_dir_offset,
+ d.k->p.inode,
+ d.k->p.offset)) {
+ target->bi_dir = d.k->p.inode;
+ target->bi_dir_offset = d.k->p.offset;
+
+ ret = __write_inode(trans, target, target_snapshot);
+ if (ret)
+ goto err;
}
+ }
- if (bkey_extent_is_allocation(k.k))
- i_sectors += k.k->size;
- bch2_bkey_buf_reassemble(&prev, c, k);
+ if (fsck_err_on(d.v->d_type != inode_d_type(target), c,
+ "incorrect d_type: got %s, should be %s:\n%s",
+ bch2_d_type_str(d.v->d_type),
+ bch2_d_type_str(inode_d_type(target)),
+ (bch2_bkey_val_to_text(&PBUF(buf), c, d.s_c), buf))) {
+ n = bch2_trans_kmalloc(trans, bkey_bytes(d.k));
+ ret = PTR_ERR_OR_ZERO(n);
+ if (ret)
+ return ret;
+
+ bkey_reassemble(&n->k_i, d.s_c);
+ n->v.d_type = inode_d_type(target);
+
+ ret = bch2_trans_update(trans, iter, &n->k_i, 0);
+ if (ret)
+ return ret;
- bch2_btree_iter_advance(iter);
+ d = dirent_i_to_s_c(n);
}
+
+ if (d.v->d_type == DT_SUBVOL &&
+ target->bi_parent_subvol != le32_to_cpu(d.v->d_parent_subvol) &&
+ (c->sb.version < bcachefs_metadata_version_subvol_dirent ||
+ fsck_err(c, "dirent has wrong d_parent_subvol field: got %u, should be %u",
+ le32_to_cpu(d.v->d_parent_subvol),
+ target->bi_parent_subvol))) {
+ n = bch2_trans_kmalloc(trans, bkey_bytes(d.k));
+ ret = PTR_ERR_OR_ZERO(n);
+ if (ret)
+ return ret;
+
+ bkey_reassemble(&n->k_i, d.s_c);
+ n->v.d_parent_subvol = cpu_to_le32(target->bi_parent_subvol);
+
+ ret = bch2_trans_update(trans, iter, &n->k_i, 0);
+ if (ret)
+ return ret;
+
+ d = dirent_i_to_s_c(n);
+ }
+err:
fsck_err:
- if (ret == -EINTR)
- goto retry;
- bch2_trans_iter_put(&trans, iter);
- bch2_bkey_buf_exit(&prev, c);
- return bch2_trans_exit(&trans) ?: ret;
+ return ret;
}
-/*
- * Walk dirents: verify that they all have a corresponding S_ISDIR inode,
- * validate d_type
- */
-noinline_for_stack
-static int check_dirents(struct bch_fs *c)
+static int check_dirent(struct btree_trans *trans, struct btree_iter *iter,
+ struct bch_hash_info *hash_info,
+ struct inode_walker *dir,
+ struct inode_walker *target,
+ struct snapshots_seen *s)
{
- struct inode_walker w = inode_walker_init();
- struct bch_hash_info hash_info;
- struct btree_trans trans;
- struct btree_iter *iter;
+ struct bch_fs *c = trans->c;
struct bkey_s_c k;
+ struct bkey_s_c_dirent d;
+ struct inode_walker_entry *i;
char buf[200];
- unsigned nr_subdirs = 0;
- int ret = 0;
+ int ret;
- bch_verbose(c, "checking dirents");
+ k = bch2_btree_iter_peek(iter);
+ if (!k.k)
+ return 0;
- bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
+ ret = bkey_err(k);
+ if (ret)
+ return ret;
- iter = bch2_trans_get_iter(&trans, BTREE_ID_dirents,
- POS(BCACHEFS_ROOT_INO, 0),
- BTREE_ITER_INTENT|
- BTREE_ITER_PREFETCH);
-retry:
- while ((k = bch2_btree_iter_peek(iter)).k &&
- !(ret = bkey_err(k))) {
- struct bkey_s_c_dirent d;
- struct bch_inode_unpacked target;
- u32 target_snapshot;
- bool have_target;
- bool backpointer_exists = true;
- u64 d_inum;
-
- if (w.have_inode &&
- w.cur_inum != k.k->p.inode &&
- fsck_err_on(w.inode.bi_nlink != nr_subdirs, c,
- "directory %llu with wrong i_nlink: got %u, should be %u",
- w.inode.bi_inum, w.inode.bi_nlink, nr_subdirs)) {
- w.inode.bi_nlink = nr_subdirs;
- ret = write_inode(&trans, &w.inode, w.snapshot);
- if (ret)
- break;
- }
+ ret = check_key_has_snapshot(trans, iter, k);
+ if (ret)
+ return ret < 0 ? ret : 0;
+
+ ret = snapshots_seen_update(c, s, k.k->p);
+ if (ret)
+ return ret;
- ret = walk_inode(&trans, &w, k.k->p.inode);
+ if (k.k->type == KEY_TYPE_whiteout)
+ return 0;
+
+ if (dir->cur_inum != k.k->p.inode) {
+ ret = check_subdir_count(trans, dir);
if (ret)
- break;
+ return ret;
+ }
- if (w.first_this_inode)
- nr_subdirs = 0;
+ ret = __walk_inode(trans, dir, k.k->p);
+ if (ret < 0)
+ return ret;
- if (fsck_err_on(!w.have_inode, c,
- "dirent in nonexisting directory:\n%s",
- (bch2_bkey_val_to_text(&PBUF(buf), c,
- k), buf)) ||
- fsck_err_on(!S_ISDIR(w.inode.bi_mode), c,
- "dirent in non directory inode type %u:\n%s",
- mode_to_type(w.inode.bi_mode),
- (bch2_bkey_val_to_text(&PBUF(buf), c,
- k), buf))) {
- ret = lockrestart_do(&trans,
- bch2_btree_delete_at(&trans, iter, 0));
- if (ret)
- goto err;
- goto next;
- }
+ if (fsck_err_on(ret == INT_MAX, c,
+ "dirent in nonexisting directory:\n%s",
+ (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)))
+ return bch2_btree_delete_at(trans, iter,
+ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
- if (!w.have_inode)
- goto next;
+ if (ret == INT_MAX)
+ return 0;
- if (w.first_this_inode)
- hash_info = bch2_hash_info_init(c, &w.inode);
+ i = dir->d + ret;
+ ret = 0;
- ret = hash_check_key(&trans, bch2_dirent_hash_desc,
- &hash_info, iter, k);
- if (ret > 0) {
- ret = 0;
- goto next;
- }
- if (ret)
- goto fsck_err;
+ if (fsck_err_on(!S_ISDIR(i->inode.bi_mode), c,
+			"dirent in non-directory inode type %s:\n%s",
+ bch2_d_type_str(inode_d_type(&i->inode)),
+ (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)))
+ return bch2_btree_delete_at(trans, iter, 0);
- if (k.k->type != KEY_TYPE_dirent)
- goto next;
+ if (dir->first_this_inode)
+ *hash_info = bch2_hash_info_init(c, &dir->d[0].inode);
- d = bkey_s_c_to_dirent(k);
- d_inum = le64_to_cpu(d.v->d_inum);
+ ret = hash_check_key(trans, bch2_dirent_hash_desc,
+ hash_info, iter, k);
+ if (ret < 0)
+ return ret;
+ if (ret) /* dirent has been deleted */
+ return 0;
+
+ if (k.k->type != KEY_TYPE_dirent)
+ return 0;
+
+ d = bkey_s_c_to_dirent(k);
- ret = lookup_inode(&trans, d_inum, &target, &target_snapshot);
+ if (d.v->d_type == DT_SUBVOL) {
+ struct bch_inode_unpacked subvol_root;
+ u32 target_subvol = le32_to_cpu(d.v->d_child_subvol);
+ u32 target_snapshot;
+ u64 target_inum;
+
+ ret = __subvol_lookup(trans, target_subvol,
+ &target_snapshot, &target_inum);
if (ret && ret != -ENOENT)
- break;
+ return ret;
- have_target = !ret;
- ret = 0;
+ if (fsck_err_on(ret, c,
+				"dirent points to missing subvolume %u",
+				target_subvol))
+ return __remove_dirent(trans, d.k->p);
+
+ ret = __lookup_inode(trans, target_inum,
+ &subvol_root, &target_snapshot);
+ if (ret && ret != -ENOENT)
+ return ret;
+
+ if (fsck_err_on(ret, c,
+ "subvolume %u points to missing subvolume root %llu",
+ target_subvol,
+ target_inum)) {
+ bch_err(c, "repair not implemented yet");
+ return -EINVAL;
+ }
- if (fsck_err_on(!have_target, c,
+ if (fsck_err_on(subvol_root.bi_subvol != target_subvol, c,
+ "subvol root %llu has wrong bi_subvol field: got %u, should be %u",
+ target_inum,
+ subvol_root.bi_subvol, target_subvol)) {
+ subvol_root.bi_subvol = target_subvol;
+ ret = __write_inode(trans, &subvol_root, target_snapshot);
+ if (ret)
+ return ret;
+ }
+
+ ret = check_dirent_target(trans, iter, d, &subvol_root,
+ target_snapshot);
+ if (ret)
+ return ret;
+ } else {
+ ret = __get_visible_inodes(trans, target, s, le64_to_cpu(d.v->d_inum));
+ if (ret)
+ return ret;
+
+ if (fsck_err_on(!target->nr, c,
"dirent points to missing inode:\n%s",
(bch2_bkey_val_to_text(&PBUF(buf), c,
k), buf))) {
- ret = remove_dirent(&trans, d.k->p);
+ ret = __remove_dirent(trans, d.k->p);
if (ret)
- goto err;
- goto next;
+ return ret;
}
- if (!have_target)
- goto next;
-
- if (!target.bi_dir &&
- !target.bi_dir_offset) {
- target.bi_dir = k.k->p.inode;
- target.bi_dir_offset = k.k->p.offset;
-
- ret = write_inode(&trans, &target, target_snapshot);
+ for (i = target->d; i < target->d + target->nr; i++) {
+ ret = check_dirent_target(trans, iter, d,
+ &i->inode, i->snapshot);
if (ret)
- goto err;
+ return ret;
}
+ }
- if (!inode_backpointer_matches(d, &target)) {
- ret = inode_backpointer_exists(&trans, &target);
- if (ret < 0)
- goto err;
+ if (d.v->d_type == DT_DIR)
+ for_each_visible_inode(c, s, dir, d.k->p.snapshot, i)
+ i->count++;
- backpointer_exists = ret;
- ret = 0;
+fsck_err:
+ return ret;
+}
- if (fsck_err_on(S_ISDIR(target.bi_mode) &&
- backpointer_exists, c,
- "directory %llu with multiple links",
- target.bi_inum)) {
- ret = remove_dirent(&trans, d.k->p);
- if (ret)
- goto err;
- continue;
- }
+/*
+ * Walk dirents: verify that they all have a corresponding S_ISDIR inode,
+ * validate d_type
+ */
+noinline_for_stack
+static int check_dirents(struct bch_fs *c)
+{
+ struct inode_walker dir = inode_walker_init();
+ struct inode_walker target = inode_walker_init();
+ struct snapshots_seen s;
+ struct bch_hash_info hash_info;
+ struct btree_trans trans;
+ struct btree_iter iter;
+ int ret = 0;
- if (fsck_err_on(backpointer_exists &&
- !target.bi_nlink, c,
- "inode %llu has multiple links but i_nlink 0",
- d_inum)) {
- target.bi_nlink++;
- target.bi_flags &= ~BCH_INODE_UNLINKED;
+ bch_verbose(c, "checking dirents");
- ret = write_inode(&trans, &target, target_snapshot);
- if (ret)
- goto err;
- }
+ snapshots_seen_init(&s);
+ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
- if (fsck_err_on(!backpointer_exists, c,
- "inode %llu has wrong backpointer:\n"
- "got %llu:%llu\n"
- "should be %llu:%llu",
- d_inum,
- target.bi_dir,
- target.bi_dir_offset,
- k.k->p.inode,
- k.k->p.offset)) {
- target.bi_dir = k.k->p.inode;
- target.bi_dir_offset = k.k->p.offset;
-
- ret = write_inode(&trans, &target, target_snapshot);
- if (ret)
- goto err;
- }
- }
+ bch2_trans_iter_init(&trans, &iter, BTREE_ID_dirents,
+ POS(BCACHEFS_ROOT_INO, 0),
+ BTREE_ITER_INTENT|
+ BTREE_ITER_PREFETCH|
+ BTREE_ITER_ALL_SNAPSHOTS);
- if (fsck_err_on(d.v->d_type != mode_to_type(target.bi_mode), c,
- "incorrect d_type: should be %u:\n%s",
- mode_to_type(target.bi_mode),
- (bch2_bkey_val_to_text(&PBUF(buf), c,
- k), buf))) {
- struct bkey_i_dirent *n;
+ do {
+ ret = __bch2_trans_do(&trans, NULL, NULL,
+ BTREE_INSERT_LAZY_RW|
+ BTREE_INSERT_NOFAIL,
+ check_dirent(&trans, &iter, &hash_info,
+ &dir, &target, &s));
+ if (ret)
+ break;
+ } while (bch2_btree_iter_advance(&iter));
+ bch2_trans_iter_exit(&trans, &iter);
- n = kmalloc(bkey_bytes(d.k), GFP_KERNEL);
- if (!n) {
- ret = -ENOMEM;
- goto err;
- }
+ bch2_trans_exit(&trans);
+ snapshots_seen_exit(&s);
+ inode_walker_exit(&dir);
+ inode_walker_exit(&target);
+ return ret;
+}
- bkey_reassemble(&n->k_i, d.s_c);
- n->v.d_type = mode_to_type(target.bi_mode);
+static int check_xattr(struct btree_trans *trans, struct btree_iter *iter,
+ struct bch_hash_info *hash_info,
+ struct inode_walker *inode)
+{
+ struct bch_fs *c = trans->c;
+ struct bkey_s_c k;
+ int ret;
- ret = __bch2_trans_do(&trans, NULL, NULL,
- BTREE_INSERT_NOFAIL|
- BTREE_INSERT_LAZY_RW,
- bch2_trans_update(&trans, iter, &n->k_i, 0));
- kfree(n);
- if (ret)
- goto err;
+ k = bch2_btree_iter_peek(iter);
+ if (!k.k)
+ return 0;
- }
+ ret = bkey_err(k);
+ if (ret)
+ return ret;
- nr_subdirs += d.v->d_type == DT_DIR;
-next:
- bch2_btree_iter_advance(iter);
- }
-err:
-fsck_err:
- if (ret == -EINTR)
- goto retry;
+ ret = check_key_has_snapshot(trans, iter, k);
+ if (ret)
+ return ret;
- bch2_trans_iter_put(&trans, iter);
- return bch2_trans_exit(&trans) ?: ret;
+ ret = __walk_inode(trans, inode, k.k->p);
+ if (ret < 0)
+ return ret;
+
+ if (fsck_err_on(ret == INT_MAX, c,
+ "xattr for missing inode %llu",
+ k.k->p.inode))
+ return bch2_btree_delete_at(trans, iter, 0);
+
+ if (ret == INT_MAX)
+ return 0;
+
+ ret = 0;
+
+ if (inode->first_this_inode)
+ *hash_info = bch2_hash_info_init(c, &inode->d[0].inode);
+
+ ret = hash_check_key(trans, bch2_xattr_hash_desc, hash_info, iter, k);
+fsck_err:
+ return ret;
}
/*
@@ -912,90 +1693,101 @@ fsck_err:
noinline_for_stack
static int check_xattrs(struct bch_fs *c)
{
- struct inode_walker w = inode_walker_init();
+ struct inode_walker inode = inode_walker_init();
struct bch_hash_info hash_info;
struct btree_trans trans;
- struct btree_iter *iter;
- struct bkey_s_c k;
+ struct btree_iter iter;
int ret = 0;
bch_verbose(c, "checking xattrs");
bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
- iter = bch2_trans_get_iter(&trans, BTREE_ID_xattrs,
- POS(BCACHEFS_ROOT_INO, 0),
- BTREE_ITER_INTENT|
- BTREE_ITER_PREFETCH);
-retry:
- while ((k = bch2_btree_iter_peek(iter)).k &&
- !(ret = bkey_err(k))) {
- ret = walk_inode(&trans, &w, k.k->p.inode);
- if (ret)
- break;
-
- if (fsck_err_on(!w.have_inode, c,
- "xattr for missing inode %llu",
- k.k->p.inode)) {
- ret = bch2_btree_delete_at(&trans, iter, 0);
- if (ret)
- break;
- continue;
- }
+ bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs,
+ POS(BCACHEFS_ROOT_INO, 0),
+ BTREE_ITER_INTENT|
+ BTREE_ITER_PREFETCH|
+ BTREE_ITER_ALL_SNAPSHOTS);
- if (w.first_this_inode && w.have_inode)
- hash_info = bch2_hash_info_init(c, &w.inode);
-
- ret = hash_check_key(&trans, bch2_xattr_hash_desc,
- &hash_info, iter, k);
+ do {
+ ret = __bch2_trans_do(&trans, NULL, NULL,
+ BTREE_INSERT_LAZY_RW|
+ BTREE_INSERT_NOFAIL,
+ check_xattr(&trans, &iter, &hash_info,
+ &inode));
if (ret)
break;
+ } while (bch2_btree_iter_advance(&iter));
+ bch2_trans_iter_exit(&trans, &iter);
- bch2_btree_iter_advance(iter);
- }
-fsck_err:
- if (ret == -EINTR)
- goto retry;
-
- bch2_trans_iter_put(&trans, iter);
- return bch2_trans_exit(&trans) ?: ret;
+ bch2_trans_exit(&trans);
+ return ret;
}
-/* Get root directory, create if it doesn't exist: */
-static int check_root(struct bch_fs *c, struct bch_inode_unpacked *root_inode)
+static int check_root_trans(struct btree_trans *trans)
{
- struct bkey_inode_buf packed;
+ struct bch_fs *c = trans->c;
+ struct bch_inode_unpacked root_inode;
u32 snapshot;
+ u64 inum;
int ret;
- bch_verbose(c, "checking root directory");
-
- ret = bch2_trans_do(c, NULL, NULL, 0,
- lookup_inode(&trans, BCACHEFS_ROOT_INO, root_inode, &snapshot));
+ ret = __subvol_lookup(trans, BCACHEFS_ROOT_SUBVOL, &snapshot, &inum);
if (ret && ret != -ENOENT)
return ret;
- if (fsck_err_on(ret, c, "root directory missing"))
- goto create_root;
+ if (mustfix_fsck_err_on(ret, c, "root subvol missing")) {
+ struct bkey_i_subvolume root_subvol;
- if (fsck_err_on(!S_ISDIR(root_inode->bi_mode), c,
- "root inode not a directory"))
- goto create_root;
+ snapshot = U32_MAX;
+ inum = BCACHEFS_ROOT_INO;
- return 0;
+ bkey_subvolume_init(&root_subvol.k_i);
+ root_subvol.k.p.offset = BCACHEFS_ROOT_SUBVOL;
+ root_subvol.v.flags = 0;
+ root_subvol.v.snapshot = cpu_to_le32(snapshot);
+ root_subvol.v.inode = cpu_to_le64(inum);
+ ret = __bch2_trans_do(trans, NULL, NULL,
+ BTREE_INSERT_NOFAIL|
+ BTREE_INSERT_LAZY_RW,
+ __bch2_btree_insert(trans, BTREE_ID_subvolumes, &root_subvol.k_i));
+ if (ret) {
+ bch_err(c, "error writing root subvol: %i", ret);
+ goto err;
+ }
+
+ }
+
+ ret = __lookup_inode(trans, BCACHEFS_ROOT_INO, &root_inode, &snapshot);
+ if (ret && ret != -ENOENT)
+ return ret;
+
+ if (mustfix_fsck_err_on(ret, c, "root directory missing") ||
+ mustfix_fsck_err_on(!S_ISDIR(root_inode.bi_mode), c,
+ "root inode not a directory")) {
+ bch2_inode_init(c, &root_inode, 0, 0, S_IFDIR|0755,
+ 0, NULL);
+ root_inode.bi_inum = inum;
+
+ ret = __write_inode(trans, &root_inode, snapshot);
+ if (ret)
+ bch_err(c, "error writing root inode: %i", ret);
+ }
+err:
fsck_err:
return ret;
-create_root:
- bch2_inode_init(c, root_inode, 0, 0, S_IFDIR|0755,
- 0, NULL);
- root_inode->bi_inum = BCACHEFS_ROOT_INO;
+}
- bch2_inode_pack(c, &packed, root_inode);
+/* Get root directory, create if it doesn't exist: */
+noinline_for_stack
+static int check_root(struct bch_fs *c)
+{
+ bch_verbose(c, "checking root directory");
- return bch2_btree_insert(c, BTREE_ID_inodes, &packed.inode.k_i,
- NULL, NULL,
- BTREE_INSERT_NOFAIL|
- BTREE_INSERT_LAZY_RW);
+ return bch2_trans_do(c, NULL, NULL,
+ BTREE_INSERT_NOFAIL|
+ BTREE_INSERT_LAZY_RW,
+ check_root_trans(&trans));
}
struct pathbuf {
@@ -1004,10 +1796,23 @@ struct pathbuf {
struct pathbuf_entry {
u64 inum;
+ u32 snapshot;
} *entries;
};
-static int path_down(struct pathbuf *p, u64 inum)
+static bool path_is_dup(struct pathbuf *p, u64 inum, u32 snapshot)
+{
+ struct pathbuf_entry *i;
+
+ for (i = p->entries; i < p->entries + p->nr; i++)
+ if (i->inum == inum &&
+ i->snapshot == snapshot)
+ return true;
+
+ return false;
+}
+
+static int path_down(struct pathbuf *p, u64 inum, u32 snapshot)
{
if (p->nr == p->size) {
size_t new_size = max_t(size_t, 256UL, p->size * 2);
@@ -1023,73 +1828,109 @@ static int path_down(struct pathbuf *p, u64 inum)
};
p->entries[p->nr++] = (struct pathbuf_entry) {
- .inum = inum,
+ .inum = inum,
+ .snapshot = snapshot,
};
return 0;
}
+/*
+ * Check that a given inode is reachable from the root:
+ *
+ * XXX: we should also be verifying that inodes are in the right subvolumes
+ */
static int check_path(struct btree_trans *trans,
struct pathbuf *p,
- struct bch_inode_unpacked *inode)
+ struct bch_inode_unpacked *inode,
+ u32 snapshot)
{
struct bch_fs *c = trans->c;
- u32 snapshot;
- size_t i;
int ret = 0;
+ snapshot = snapshot_t(c, snapshot)->equiv;
p->nr = 0;
- while (inode->bi_inum != BCACHEFS_ROOT_INO) {
+ while (!(inode->bi_inum == BCACHEFS_ROOT_INO &&
+ inode->bi_subvol == BCACHEFS_ROOT_SUBVOL)) {
+ struct btree_iter dirent_iter;
+ struct bkey_s_c_dirent d;
+ u32 parent_snapshot = snapshot;
+
+ if (inode->bi_subvol) {
+ u64 inum;
+
+ ret = subvol_lookup(trans, inode->bi_parent_subvol,
+ &parent_snapshot, &inum);
+ if (ret)
+ break;
+ }
+
ret = lockrestart_do(trans,
- inode_backpointer_exists(trans, inode));
- if (ret < 0)
+ PTR_ERR_OR_ZERO((d = dirent_get_by_pos(trans, &dirent_iter,
+ SPOS(inode->bi_dir, inode->bi_dir_offset,
+ parent_snapshot))).k));
+ if (ret && ret != -ENOENT)
break;
- if (!ret) {
- if (fsck_err(c, "unreachable inode %llu, type %u nlink %u backptr %llu:%llu",
- inode->bi_inum,
- mode_to_type(inode->bi_mode),
+ if (!ret && !dirent_points_to_inode(d, inode)) {
+ bch2_trans_iter_exit(trans, &dirent_iter);
+ ret = -ENOENT;
+ }
+
+ if (ret == -ENOENT) {
+ if (fsck_err(c, "unreachable inode %llu:%u, type %s nlink %u backptr %llu:%llu",
+ inode->bi_inum, snapshot,
+ bch2_d_type_str(inode_d_type(inode)),
inode->bi_nlink,
inode->bi_dir,
inode->bi_dir_offset))
- ret = reattach_inode(trans, inode);
+ ret = reattach_inode(trans, inode, snapshot);
break;
}
- ret = 0;
+
+ bch2_trans_iter_exit(trans, &dirent_iter);
if (!S_ISDIR(inode->bi_mode))
break;
- ret = path_down(p, inode->bi_inum);
+ ret = path_down(p, inode->bi_inum, snapshot);
if (ret) {
bch_err(c, "memory allocation failure");
return ret;
}
- for (i = 0; i < p->nr; i++) {
- if (inode->bi_dir != p->entries[i].inum)
- continue;
+ snapshot = parent_snapshot;
+
+ ret = lookup_inode(trans, inode->bi_dir, inode, &snapshot);
+ if (ret) {
+ /* Should have been caught in dirents pass */
+ bch_err(c, "error looking up parent directory: %i", ret);
+ break;
+ }
+
+ if (path_is_dup(p, inode->bi_inum, snapshot)) {
+ struct pathbuf_entry *i;
/* XXX print path */
+ bch_err(c, "directory structure loop");
+
+ for (i = p->entries; i < p->entries + p->nr; i++)
+ pr_err("%llu:%u", i->inum, i->snapshot);
+ pr_err("%llu:%u", inode->bi_inum, snapshot);
+
if (!fsck_err(c, "directory structure loop"))
return 0;
- ret = lockrestart_do(trans,
- remove_backpointer(trans, inode));
+ ret = __bch2_trans_do(trans, NULL, NULL,
+ BTREE_INSERT_NOFAIL|
+ BTREE_INSERT_LAZY_RW,
+ remove_backpointer(trans, inode));
if (ret) {
bch_err(c, "error removing dirent: %i", ret);
break;
}
- ret = reattach_inode(trans, inode);
- break;
- }
-
- ret = lookup_inode(trans, inode->bi_dir, inode, &snapshot);
- if (ret) {
- /* Should have been caught in dirents pass */
- bch_err(c, "error looking up parent directory: %i", ret);
- break;
+ ret = reattach_inode(trans, inode, snapshot);
}
}
fsck_err:
@@ -1103,10 +1944,11 @@ fsck_err:
* After check_dirents(), if an inode backpointer doesn't exist that means it's
* unreachable:
*/
+noinline_for_stack
static int check_directory_structure(struct bch_fs *c)
{
struct btree_trans trans;
- struct btree_iter *iter;
+ struct btree_iter iter;
struct bkey_s_c k;
struct bch_inode_unpacked u;
struct pathbuf path = { 0, 0, NULL };
@@ -1116,28 +1958,33 @@ static int check_directory_structure(struct bch_fs *c)
for_each_btree_key(&trans, iter, BTREE_ID_inodes, POS_MIN,
BTREE_ITER_INTENT|
- BTREE_ITER_PREFETCH, k, ret) {
- if (k.k->type != KEY_TYPE_inode)
+ BTREE_ITER_PREFETCH|
+ BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
+ if (!bkey_is_inode(k.k))
continue;
- ret = bch2_inode_unpack(bkey_s_c_to_inode(k), &u);
+ ret = bch2_inode_unpack(k, &u);
if (ret) {
/* Should have been caught earlier in fsck: */
bch_err(c, "error unpacking inode %llu: %i", k.k->p.offset, ret);
break;
}
- ret = check_path(&trans, &path, &u);
+ if (u.bi_flags & BCH_INODE_UNLINKED)
+ continue;
+
+ ret = check_path(&trans, &path, &u, iter.pos.snapshot);
if (ret)
break;
}
- bch2_trans_iter_put(&trans, iter);
+ bch2_trans_iter_exit(&trans, &iter);
BUG_ON(ret == -EINTR);
kfree(path.entries);
- return bch2_trans_exit(&trans) ?: ret;
+ bch2_trans_exit(&trans);
+ return ret;
}
struct nlink_table {
@@ -1185,8 +2032,9 @@ static int nlink_cmp(const void *_l, const void *_r)
return cmp_int(l->inum, r->inum) ?: cmp_int(l->snapshot, r->snapshot);
}
-static void inc_link(struct bch_fs *c, struct nlink_table *links,
- u64 range_start, u64 range_end, u64 inum)
+static void inc_link(struct bch_fs *c, struct snapshots_seen *s,
+ struct nlink_table *links,
+ u64 range_start, u64 range_end, u64 inum, u32 snapshot)
{
struct nlink *link, key = {
.inum = inum, .snapshot = U32_MAX,
@@ -1195,10 +2043,20 @@ static void inc_link(struct bch_fs *c, struct nlink_table *links,
if (inum < range_start || inum >= range_end)
return;
- link = bsearch(&key, links->d, links->nr,
- sizeof(links->d[0]), nlink_cmp);
- if (link)
- link->count++;
+ link = __inline_bsearch(&key, links->d, links->nr,
+ sizeof(links->d[0]), nlink_cmp);
+ if (!link)
+ return;
+
+ while (link > links->d && link[0].inum == link[-1].inum)
+ --link;
+
+ for (; link < links->d + links->nr && link->inum == inum; link++)
+ if (ref_visible(c, s, snapshot, link->snapshot)) {
+ link->count++;
+ if (link->snapshot >= snapshot)
+ break;
+ }
}
noinline_for_stack
@@ -1207,9 +2065,8 @@ static int check_nlinks_find_hardlinks(struct bch_fs *c,
u64 start, u64 *end)
{
struct btree_trans trans;
- struct btree_iter *iter;
+ struct btree_iter iter;
struct bkey_s_c k;
- struct bkey_s_c_inode inode;
struct bch_inode_unpacked u;
int ret = 0;
@@ -1218,22 +2075,21 @@ static int check_nlinks_find_hardlinks(struct bch_fs *c,
for_each_btree_key(&trans, iter, BTREE_ID_inodes,
POS(0, start),
BTREE_ITER_INTENT|
- BTREE_ITER_PREFETCH, k, ret) {
- if (k.k->type != KEY_TYPE_inode)
+ BTREE_ITER_PREFETCH|
+ BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
+ if (!bkey_is_inode(k.k))
continue;
- inode = bkey_s_c_to_inode(k);
+ /* Should never fail, checked by bch2_inode_invalid: */
+ BUG_ON(bch2_inode_unpack(k, &u));
/*
* Backpointer and directory structure checks are sufficient for
* directories, since they can't have hardlinks:
*/
- if (S_ISDIR(le16_to_cpu(inode.v->bi_mode)))
+ if (S_ISDIR(le16_to_cpu(u.bi_mode)))
continue;
- /* Should never fail, checked by bch2_inode_invalid: */
- BUG_ON(bch2_inode_unpack(inode, &u));
-
if (!u.bi_nlink)
continue;
@@ -1245,7 +2101,7 @@ static int check_nlinks_find_hardlinks(struct bch_fs *c,
}
}
- bch2_trans_iter_put(&trans, iter);
+ bch2_trans_iter_exit(&trans, &iter);
bch2_trans_exit(&trans);
if (ret)
@@ -1259,34 +2115,43 @@ static int check_nlinks_walk_dirents(struct bch_fs *c, struct nlink_table *links
u64 range_start, u64 range_end)
{
struct btree_trans trans;
- struct btree_iter *iter;
+ struct snapshots_seen s;
+ struct btree_iter iter;
struct bkey_s_c k;
struct bkey_s_c_dirent d;
int ret;
+ snapshots_seen_init(&s);
+
bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
for_each_btree_key(&trans, iter, BTREE_ID_dirents, POS_MIN,
BTREE_ITER_INTENT|
- BTREE_ITER_PREFETCH, k, ret) {
+ BTREE_ITER_PREFETCH|
+ BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
+ ret = snapshots_seen_update(c, &s, k.k->p);
+ if (ret)
+ break;
+
switch (k.k->type) {
case KEY_TYPE_dirent:
d = bkey_s_c_to_dirent(k);
- if (d.v->d_type != DT_DIR)
- inc_link(c, links, range_start, range_end,
- le64_to_cpu(d.v->d_inum));
+ if (d.v->d_type != DT_DIR &&
+ d.v->d_type != DT_SUBVOL)
+ inc_link(c, &s, links, range_start, range_end,
+ le64_to_cpu(d.v->d_inum),
+ d.k->p.snapshot);
break;
}
-
- bch2_trans_cond_resched(&trans);
}
- bch2_trans_iter_put(&trans, iter);
+ bch2_trans_iter_exit(&trans, &iter);
- ret = bch2_trans_exit(&trans) ?: ret;
if (ret)
bch_err(c, "error in fsck: btree error %i while walking dirents", ret);
+ bch2_trans_exit(&trans);
+ snapshots_seen_exit(&s);
return ret;
}
@@ -1296,9 +2161,8 @@ static int check_nlinks_update_hardlinks(struct bch_fs *c,
u64 range_start, u64 range_end)
{
struct btree_trans trans;
- struct btree_iter *iter;
+ struct btree_iter iter;
struct bkey_s_c k;
- struct bkey_s_c_inode inode;
struct bch_inode_unpacked u;
struct nlink *link = links->d;
int ret = 0;
@@ -1308,23 +2172,24 @@ static int check_nlinks_update_hardlinks(struct bch_fs *c,
for_each_btree_key(&trans, iter, BTREE_ID_inodes,
POS(0, range_start),
BTREE_ITER_INTENT|
- BTREE_ITER_PREFETCH, k, ret) {
+ BTREE_ITER_PREFETCH|
+ BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
if (k.k->p.offset >= range_end)
break;
- if (k.k->type != KEY_TYPE_inode)
+ if (!bkey_is_inode(k.k))
continue;
- inode = bkey_s_c_to_inode(k);
- if (S_ISDIR(le16_to_cpu(inode.v->bi_mode)))
- continue;
+ BUG_ON(bch2_inode_unpack(k, &u));
- BUG_ON(bch2_inode_unpack(inode, &u));
+ if (S_ISDIR(le16_to_cpu(u.bi_mode)))
+ continue;
if (!u.bi_nlink)
continue;
- while (link->inum < k.k->p.offset) {
+ while ((cmp_int(link->inum, k.k->p.offset) ?:
+ cmp_int(link->snapshot, k.k->p.snapshot)) < 0) {
link++;
BUG_ON(link >= links->d + links->nr);
}
@@ -1335,16 +2200,13 @@ static int check_nlinks_update_hardlinks(struct bch_fs *c,
bch2_inode_nlink_get(&u), link->count)) {
bch2_inode_nlink_set(&u, link->count);
- ret = __bch2_trans_do(&trans, NULL, NULL,
- BTREE_INSERT_NOFAIL|
- BTREE_INSERT_LAZY_RW,
- bch2_inode_write(&trans, iter, &u));
+ ret = write_inode(&trans, &u, k.k->p.snapshot);
if (ret)
bch_err(c, "error in fsck: error %i updating inode", ret);
}
}
fsck_err:
- bch2_trans_iter_put(&trans, iter);
+ bch2_trans_iter_exit(&trans, &iter);
bch2_trans_exit(&trans);
if (ret)
@@ -1390,21 +2252,91 @@ static int check_nlinks(struct bch_fs *c)
return ret;
}
+static int fix_reflink_p_key(struct btree_trans *trans, struct btree_iter *iter)
+{
+ struct bkey_s_c k;
+ struct bkey_s_c_reflink_p p;
+ struct bkey_i_reflink_p *u;
+ int ret;
+
+ k = bch2_btree_iter_peek(iter);
+ if (!k.k)
+ return 0;
+
+ ret = bkey_err(k);
+ if (ret)
+ return ret;
+
+ if (k.k->type != KEY_TYPE_reflink_p)
+ return 0;
+
+ p = bkey_s_c_to_reflink_p(k);
+
+ if (!p.v->front_pad && !p.v->back_pad)
+ return 0;
+
+ u = bch2_trans_kmalloc(trans, sizeof(*u));
+ ret = PTR_ERR_OR_ZERO(u);
+ if (ret)
+ return ret;
+
+ bkey_reassemble(&u->k_i, k);
+ u->v.front_pad = 0;
+ u->v.back_pad = 0;
+
+ return bch2_trans_update(trans, iter, &u->k_i, BTREE_TRIGGER_NORUN);
+}
+
+noinline_for_stack
+static int fix_reflink_p(struct bch_fs *c)
+{
+ struct btree_trans trans;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ int ret;
+
+ if (c->sb.version >= bcachefs_metadata_version_reflink_p_fix)
+ return 0;
+
+ bch_verbose(c, "fixing reflink_p keys");
+
+ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
+
+ for_each_btree_key(&trans, iter, BTREE_ID_extents, POS_MIN,
+ BTREE_ITER_INTENT|
+ BTREE_ITER_PREFETCH|
+ BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
+ if (k.k->type == KEY_TYPE_reflink_p) {
+ ret = __bch2_trans_do(&trans, NULL, NULL,
+ BTREE_INSERT_NOFAIL|
+ BTREE_INSERT_LAZY_RW,
+ fix_reflink_p_key(&trans, &iter));
+ if (ret)
+ break;
+ }
+ }
+ bch2_trans_iter_exit(&trans, &iter);
+
+ bch2_trans_exit(&trans);
+ return ret;
+}
+
/*
* Checks for inconsistencies that shouldn't happen, unless we have a bug.
* Doesn't fix them yet, mainly because they haven't yet been observed:
*/
int bch2_fsck_full(struct bch_fs *c)
{
- struct bch_inode_unpacked root_inode;
-
- return check_inodes(c, true) ?:
+ return bch2_fs_snapshots_check(c) ?:
+ check_inodes(c, true) ?:
+ check_subvols(c) ?:
check_extents(c) ?:
check_dirents(c) ?:
check_xattrs(c) ?:
- check_root(c, &root_inode) ?:
+ check_root(c) ?:
check_directory_structure(c) ?:
- check_nlinks(c);
+ check_nlinks(c) ?:
+ fix_reflink_p(c);
}
int bch2_fsck_walk_inodes_only(struct bch_fs *c)
diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c
index 17d8eb5223cd..ffce68a80490 100644
--- a/fs/bcachefs/inode.c
+++ b/fs/bcachefs/inode.c
@@ -6,8 +6,10 @@
#include "btree_update.h"
#include "error.h"
#include "extents.h"
+#include "extent_update.h"
#include "inode.h"
#include "str_hash.h"
+#include "subvolume.h"
#include "varint.h"
#include <linux/random.h>
@@ -33,29 +35,6 @@ static const u8 bits_table[8] = {
13 * 8 - 8,
};
-static int inode_encode_field(u8 *out, u8 *end, u64 hi, u64 lo)
-{
- __be64 in[2] = { cpu_to_be64(hi), cpu_to_be64(lo), };
- unsigned shift, bytes, bits = likely(!hi)
- ? fls64(lo)
- : fls64(hi) + 64;
-
- for (shift = 1; shift <= 8; shift++)
- if (bits < bits_table[shift - 1])
- goto got_shift;
-
- BUG();
-got_shift:
- bytes = byte_table[shift - 1];
-
- BUG_ON(out + bytes > end);
-
- memcpy(out, (u8 *) in + 16 - bytes, bytes);
- *out |= (1 << 8) >> shift;
-
- return bytes;
-}
-
static int inode_decode_field(const u8 *in, const u8 *end,
u64 out[2], unsigned *out_bits)
{
@@ -90,42 +69,11 @@ static int inode_decode_field(const u8 *in, const u8 *end,
return bytes;
}
-static noinline void bch2_inode_pack_v1(struct bkey_inode_buf *packed,
- const struct bch_inode_unpacked *inode)
-{
- struct bkey_i_inode *k = &packed->inode;
- u8 *out = k->v.fields;
- u8 *end = (void *) &packed[1];
- u8 *last_nonzero_field = out;
- unsigned nr_fields = 0, last_nonzero_fieldnr = 0;
- unsigned bytes;
-
-#define x(_name, _bits) \
- out += inode_encode_field(out, end, 0, inode->_name); \
- nr_fields++; \
- \
- if (inode->_name) { \
- last_nonzero_field = out; \
- last_nonzero_fieldnr = nr_fields; \
- }
-
- BCH_INODE_FIELDS()
-#undef x
-
- out = last_nonzero_field;
- nr_fields = last_nonzero_fieldnr;
-
- bytes = out - (u8 *) &packed->inode.v;
- set_bkey_val_bytes(&packed->inode.k, bytes);
- memset_u64s_tail(&packed->inode.v, 0, bytes);
-
- SET_INODE_NR_FIELDS(&k->v, nr_fields);
-}
-
-static void bch2_inode_pack_v2(struct bkey_inode_buf *packed,
- const struct bch_inode_unpacked *inode)
+void bch2_inode_pack(struct bch_fs *c,
+ struct bkey_inode_buf *packed,
+ const struct bch_inode_unpacked *inode)
{
- struct bkey_i_inode *k = &packed->inode;
+ struct bkey_i_inode_v2 *k = &packed->inode;
u8 *out = k->v.fields;
u8 *end = (void *) &packed[1];
u8 *last_nonzero_field = out;
@@ -133,11 +81,19 @@ static void bch2_inode_pack_v2(struct bkey_inode_buf *packed,
unsigned bytes;
int ret;
+ bkey_inode_v2_init(&packed->inode.k_i);
+ packed->inode.k.p.offset = inode->bi_inum;
+ packed->inode.v.bi_journal_seq = cpu_to_le64(inode->bi_journal_seq);
+ packed->inode.v.bi_hash_seed = inode->bi_hash_seed;
+	packed->inode.v.bi_flags	= cpu_to_le64(inode->bi_flags);
+ packed->inode.v.bi_mode = cpu_to_le16(inode->bi_mode);
+
#define x(_name, _bits) \
nr_fields++; \
\
if (inode->_name) { \
- ret = bch2_varint_encode(out, inode->_name); \
+ ret = bch2_varint_encode_fast(out, inode->_name); \
out += ret; \
\
if (_bits > 64) \
@@ -163,30 +119,12 @@ static void bch2_inode_pack_v2(struct bkey_inode_buf *packed,
set_bkey_val_bytes(&packed->inode.k, bytes);
memset_u64s_tail(&packed->inode.v, 0, bytes);
- SET_INODE_NR_FIELDS(&k->v, nr_fields);
-}
-
-void bch2_inode_pack(struct bch_fs *c,
- struct bkey_inode_buf *packed,
- const struct bch_inode_unpacked *inode)
-{
- bkey_inode_init(&packed->inode.k_i);
- packed->inode.k.p.offset = inode->bi_inum;
- packed->inode.v.bi_hash_seed = inode->bi_hash_seed;
- packed->inode.v.bi_flags = cpu_to_le32(inode->bi_flags);
- packed->inode.v.bi_mode = cpu_to_le16(inode->bi_mode);
-
- if (c->sb.features & (1ULL << BCH_FEATURE_new_varint)) {
- SET_INODE_NEW_VARINT(&packed->inode.v, true);
- bch2_inode_pack_v2(packed, inode);
- } else {
- bch2_inode_pack_v1(packed, inode);
- }
+ SET_INODEv2_NR_FIELDS(&k->v, nr_fields);
if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) {
struct bch_inode_unpacked unpacked;
- int ret = bch2_inode_unpack(inode_i_to_s_c(&packed->inode),
+ int ret = bch2_inode_unpack(bkey_i_to_s_c(&packed->inode.k_i),
&unpacked);
BUG_ON(ret);
BUG_ON(unpacked.bi_inum != inode->bi_inum);
@@ -235,24 +173,23 @@ static noinline int bch2_inode_unpack_v1(struct bkey_s_c_inode inode,
return 0;
}
-static int bch2_inode_unpack_v2(struct bkey_s_c_inode inode,
- struct bch_inode_unpacked *unpacked)
+static int bch2_inode_unpack_v2(struct bch_inode_unpacked *unpacked,
+ const u8 *in, const u8 *end,
+ unsigned nr_fields)
{
- const u8 *in = inode.v->fields;
- const u8 *end = bkey_val_end(inode);
unsigned fieldnr = 0;
int ret;
u64 v[2];
#define x(_name, _bits) \
- if (fieldnr < INODE_NR_FIELDS(inode.v)) { \
- ret = bch2_varint_decode(in, end, &v[0]); \
+ if (fieldnr < nr_fields) { \
+ ret = bch2_varint_decode_fast(in, end, &v[0]); \
if (ret < 0) \
return ret; \
in += ret; \
\
if (_bits > 64) { \
- ret = bch2_varint_decode(in, end, &v[1]); \
+ ret = bch2_varint_decode_fast(in, end, &v[1]); \
if (ret < 0) \
return ret; \
in += ret; \
@@ -275,50 +212,81 @@ static int bch2_inode_unpack_v2(struct bkey_s_c_inode inode,
return 0;
}
-int bch2_inode_unpack(struct bkey_s_c_inode inode,
+int bch2_inode_unpack(struct bkey_s_c k,
struct bch_inode_unpacked *unpacked)
{
- unpacked->bi_inum = inode.k->p.offset;
- unpacked->bi_hash_seed = inode.v->bi_hash_seed;
- unpacked->bi_flags = le32_to_cpu(inode.v->bi_flags);
- unpacked->bi_mode = le16_to_cpu(inode.v->bi_mode);
+ switch (k.k->type) {
+ case KEY_TYPE_inode: {
+ struct bkey_s_c_inode inode = bkey_s_c_to_inode(k);
- if (INODE_NEW_VARINT(inode.v)) {
- return bch2_inode_unpack_v2(inode, unpacked);
- } else {
- return bch2_inode_unpack_v1(inode, unpacked);
+ unpacked->bi_inum = inode.k->p.offset;
+ unpacked->bi_journal_seq= 0;
+ unpacked->bi_hash_seed = inode.v->bi_hash_seed;
+ unpacked->bi_flags = le32_to_cpu(inode.v->bi_flags);
+ unpacked->bi_mode = le16_to_cpu(inode.v->bi_mode);
+
+ if (INODE_NEW_VARINT(inode.v)) {
+ return bch2_inode_unpack_v2(unpacked, inode.v->fields,
+ bkey_val_end(inode),
+ INODE_NR_FIELDS(inode.v));
+ } else {
+ return bch2_inode_unpack_v1(inode, unpacked);
+ }
+ break;
+ }
+ case KEY_TYPE_inode_v2: {
+ struct bkey_s_c_inode_v2 inode = bkey_s_c_to_inode_v2(k);
+
+ unpacked->bi_inum = inode.k->p.offset;
+ unpacked->bi_journal_seq= le64_to_cpu(inode.v->bi_journal_seq);
+ unpacked->bi_hash_seed = inode.v->bi_hash_seed;
+ unpacked->bi_flags = le64_to_cpu(inode.v->bi_flags);
+ unpacked->bi_mode = le16_to_cpu(inode.v->bi_mode);
+
+ return bch2_inode_unpack_v2(unpacked, inode.v->fields,
+ bkey_val_end(inode),
+ INODEv2_NR_FIELDS(inode.v));
+ }
+ default:
+ BUG();
}
-
- return 0;
}
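Since bch2_inode_unpack() now takes a plain bkey_s_c and BUG()s on anything that isn't an inode key, callers first filter with bkey_is_inode(), which matches both KEY_TYPE_inode and KEY_TYPE_inode_v2. The pattern used by callers throughout this patch looks roughly like the fragment below, which assumes the usual btree_trans / btree_iter / bkey_s_c / ret declarations:

	struct bch_inode_unpacked u;

	for_each_btree_key(&trans, iter, BTREE_ID_inodes, POS_MIN,
			   BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
		if (!bkey_is_inode(k.k))	/* e.g. KEY_TYPE_inode_generation */
			continue;

		/* Should never fail, validated by bch2_inode_invalid(): */
		BUG_ON(bch2_inode_unpack(k, &u));

		/* work with u.bi_mode, u.bi_nlink, ... */
	}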
-struct btree_iter *bch2_inode_peek(struct btree_trans *trans,
- struct bch_inode_unpacked *inode,
- u64 inum, unsigned flags)
+int bch2_inode_peek(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bch_inode_unpacked *inode,
+ subvol_inum inum, unsigned flags)
{
- struct btree_iter *iter;
struct bkey_s_c k;
+ u32 snapshot;
int ret;
- iter = bch2_trans_get_iter(trans, BTREE_ID_inodes, POS(0, inum),
- BTREE_ITER_CACHED|flags);
- k = bch2_btree_iter_peek_cached(iter);
+ if (0 && trans->c->opts.inodes_use_key_cache)
+ flags |= BTREE_ITER_CACHED;
+
+ ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
+ if (ret)
+ return ret;
+
+ bch2_trans_iter_init(trans, iter, BTREE_ID_inodes,
+ SPOS(0, inum.inum, snapshot), flags);
+ k = bch2_btree_iter_peek_slot(iter);
ret = bkey_err(k);
if (ret)
goto err;
- ret = k.k->type == KEY_TYPE_inode ? 0 : -ENOENT;
+ ret = bkey_is_inode(k.k) ? 0 : -ENOENT;
if (ret)
goto err;
- ret = bch2_inode_unpack(bkey_s_c_to_inode(k), inode);
+ ret = bch2_inode_unpack(k, inode);
if (ret)
goto err;
- return iter;
+ return 0;
err:
- bch2_trans_iter_put(trans, iter);
- return ERR_PTR(ret);
+ bch2_trans_iter_exit(trans, iter);
+ return ret;
}
int bch2_inode_write(struct btree_trans *trans,
@@ -338,8 +306,8 @@ int bch2_inode_write(struct btree_trans *trans,
const char *bch2_inode_invalid(const struct bch_fs *c, struct bkey_s_c k)
{
- struct bkey_s_c_inode inode = bkey_s_c_to_inode(k);
- struct bch_inode_unpacked unpacked;
+ struct bkey_s_c_inode inode = bkey_s_c_to_inode(k);
+ struct bch_inode_unpacked unpacked;
if (k.k->p.inode)
return "nonzero k.p.inode";
@@ -353,7 +321,7 @@ const char *bch2_inode_invalid(const struct bch_fs *c, struct bkey_s_c k)
if (INODE_STR_HASH(inode.v) >= BCH_STR_HASH_NR)
return "invalid str hash type";
- if (bch2_inode_unpack(inode, &unpacked))
+ if (bch2_inode_unpack(k, &unpacked))
return "invalid variable length fields";
if (unpacked.bi_data_checksum >= BCH_CSUM_OPT_NR + 1)
@@ -366,28 +334,79 @@ const char *bch2_inode_invalid(const struct bch_fs *c, struct bkey_s_c k)
unpacked.bi_nlink != 0)
return "flagged as unlinked but bi_nlink != 0";
+ if (unpacked.bi_subvol && !S_ISDIR(unpacked.bi_mode))
+ return "subvolume root but not a directory";
+
return NULL;
}
-void bch2_inode_to_text(struct printbuf *out, struct bch_fs *c,
- struct bkey_s_c k)
+const char *bch2_inode_v2_invalid(const struct bch_fs *c, struct bkey_s_c k)
{
- struct bkey_s_c_inode inode = bkey_s_c_to_inode(k);
+ struct bkey_s_c_inode_v2 inode = bkey_s_c_to_inode_v2(k);
struct bch_inode_unpacked unpacked;
- if (bch2_inode_unpack(inode, &unpacked)) {
- pr_buf(out, "(unpack error)");
- return;
- }
+ if (k.k->p.inode)
+ return "nonzero k.p.inode";
+
+ if (bkey_val_bytes(k.k) < sizeof(struct bch_inode))
+ return "incorrect value size";
+
+ if (k.k->p.offset < BLOCKDEV_INODE_MAX)
+ return "fs inode in blockdev range";
- pr_buf(out, "mode: %o ", unpacked.bi_mode);
+ if (INODEv2_STR_HASH(inode.v) >= BCH_STR_HASH_NR)
+ return "invalid str hash type";
+
+ if (bch2_inode_unpack(k, &unpacked))
+ return "invalid variable length fields";
+
+ if (unpacked.bi_data_checksum >= BCH_CSUM_OPT_NR + 1)
+ return "invalid data checksum type";
+
+ if (unpacked.bi_compression >= BCH_COMPRESSION_OPT_NR + 1)
+		return "invalid data compression type";
+
+ if ((unpacked.bi_flags & BCH_INODE_UNLINKED) &&
+ unpacked.bi_nlink != 0)
+ return "flagged as unlinked but bi_nlink != 0";
+
+ if (unpacked.bi_subvol && !S_ISDIR(unpacked.bi_mode))
+ return "subvolume root but not a directory";
+
+ return NULL;
+}
+
+static void __bch2_inode_unpacked_to_text(struct printbuf *out, struct bch_inode_unpacked *inode)
+{
+ pr_buf(out, "mode %o flags %x journal_seq %llu",
+ inode->bi_mode, inode->bi_flags,
+ inode->bi_journal_seq);
#define x(_name, _bits) \
- pr_buf(out, #_name ": %llu ", (u64) unpacked._name);
+ pr_buf(out, " "#_name " %llu", (u64) inode->_name);
BCH_INODE_FIELDS()
#undef x
}
+void bch2_inode_unpacked_to_text(struct printbuf *out, struct bch_inode_unpacked *inode)
+{
+ pr_buf(out, "inum: %llu ", inode->bi_inum);
+ __bch2_inode_unpacked_to_text(out, inode);
+}
+
+void bch2_inode_to_text(struct printbuf *out, struct bch_fs *c,
+ struct bkey_s_c k)
+{
+ struct bch_inode_unpacked inode;
+
+ if (bch2_inode_unpack(k, &inode)) {
+ pr_buf(out, "(unpack error)");
+ return;
+ }
+
+ __bch2_inode_unpacked_to_text(out, &inode);
+}
+
const char *bch2_inode_generation_invalid(const struct bch_fs *c,
struct bkey_s_c k)
{
@@ -461,6 +480,7 @@ static inline u32 bkey_generation(struct bkey_s_c k)
{
switch (k.k->type) {
case KEY_TYPE_inode:
+ case KEY_TYPE_inode_v2:
BUG();
case KEY_TYPE_inode_generation:
return le32_to_cpu(bkey_s_c_to_inode_generation(k).v->bi_generation);
@@ -469,12 +489,15 @@ static inline u32 bkey_generation(struct bkey_s_c k)
}
}
-struct btree_iter *bch2_inode_create(struct btree_trans *trans,
- struct bch_inode_unpacked *inode_u,
- u32 snapshot, u64 cpu)
+/*
+ * This just finds an empty slot:
+ */
+int bch2_inode_create(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bch_inode_unpacked *inode_u,
+ u32 snapshot, u64 cpu)
{
struct bch_fs *c = trans->c;
- struct btree_iter *iter = NULL;
struct bkey_s_c k;
u64 min, max, start, pos, *hint;
int ret = 0;
@@ -500,9 +523,9 @@ struct btree_iter *bch2_inode_create(struct btree_trans *trans,
start = min;
pos = start;
- iter = bch2_trans_get_iter(trans, BTREE_ID_inodes, POS(0, pos),
- BTREE_ITER_ALL_SNAPSHOTS|
- BTREE_ITER_INTENT);
+ bch2_trans_iter_init(trans, iter, BTREE_ID_inodes, POS(0, pos),
+ BTREE_ITER_ALL_SNAPSHOTS|
+ BTREE_ITER_INTENT);
again:
while ((k = bch2_btree_iter_peek(iter)).k &&
!(ret = bkey_err(k)) &&
@@ -515,9 +538,9 @@ again:
}
if (k.k->p.snapshot == snapshot &&
- k.k->type != KEY_TYPE_inode &&
+ !bkey_is_inode(k.k) &&
!bch2_btree_key_cache_find(c, BTREE_ID_inodes, SPOS(0, pos, snapshot))) {
- bch2_btree_iter_next(iter);
+ bch2_btree_iter_advance(iter);
continue;
}
@@ -540,8 +563,8 @@ again:
ret = -ENOSPC;
if (ret) {
- bch2_trans_iter_put(trans, iter);
- return ERR_PTR(ret);
+ bch2_trans_iter_exit(trans, iter);
+ return ret;
}
/* Retry from start */
@@ -553,32 +576,94 @@ found_slot:
k = bch2_btree_iter_peek_slot(iter);
ret = bkey_err(k);
if (ret) {
- bch2_trans_iter_put(trans, iter);
- return ERR_PTR(ret);
+ bch2_trans_iter_exit(trans, iter);
+ return ret;
}
/* We may have raced while the iterator wasn't pointing at pos: */
- if (k.k->type == KEY_TYPE_inode ||
+ if (bkey_is_inode(k.k) ||
bch2_btree_key_cache_find(c, BTREE_ID_inodes, k.k->p))
goto again;
*hint = k.k->p.offset;
inode_u->bi_inum = k.k->p.offset;
inode_u->bi_generation = bkey_generation(k);
- return iter;
+ return 0;
+}
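bch2_inode_create() now takes the iterator from the caller and reports errors by return value instead of handing back an iterator pointer. The real callers are outside this excerpt, so the names below are illustrative; a hedged sketch of the new calling convention:

	struct btree_iter inode_iter = { NULL };
	struct bch_inode_unpacked new_inode;	/* previously filled in with bch2_inode_init() */
	int ret;

	ret = bch2_inode_create(trans, &inode_iter, &new_inode, snapshot, cpu);
	if (!ret) {
		/* inode_iter points at the empty slot that was found: */
		ret = bch2_inode_write(trans, &inode_iter, &new_inode);
		bch2_trans_iter_exit(trans, &inode_iter);
	}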
+
+static int bch2_inode_delete_keys(struct btree_trans *trans,
+ subvol_inum inum, enum btree_id id)
+{
+ u64 offset = 0;
+ int ret = 0;
+
+ while (!ret || ret == -EINTR) {
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ struct bkey_i delete;
+ u32 snapshot;
+
+ bch2_trans_begin(trans);
+
+ ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
+ if (ret)
+ continue;
+
+ bch2_trans_iter_init(trans, &iter, id,
+ SPOS(inum.inum, offset, snapshot),
+ BTREE_ITER_INTENT);
+ k = bch2_btree_iter_peek(&iter);
+
+ if (!k.k || iter.pos.inode != inum.inum) {
+ bch2_trans_iter_exit(trans, &iter);
+ break;
+ }
+
+ ret = bkey_err(k);
+ if (ret)
+ goto err;
+
+ bkey_init(&delete.k);
+ delete.k.p = iter.pos;
+
+ if (btree_node_type_is_extents(iter.btree_id)) {
+ unsigned max_sectors =
+ min_t(u64, U64_MAX - iter.pos.offset,
+ KEY_SIZE_MAX & (~0 << trans->c->block_bits));
+
+ /* create the biggest key we can */
+ bch2_key_resize(&delete.k, max_sectors);
+
+ ret = bch2_extent_trim_atomic(trans, &iter, &delete);
+ if (ret)
+ goto err;
+ }
+
+ ret = bch2_trans_update(trans, &iter, &delete, 0) ?:
+ bch2_trans_commit(trans, NULL, NULL,
+ BTREE_INSERT_NOFAIL);
+err:
+ offset = iter.pos.offset;
+ bch2_trans_iter_exit(trans, &iter);
+ }
+
+ return ret;
}
-int bch2_inode_rm(struct bch_fs *c, u64 inode_nr, bool cached)
+int bch2_inode_rm(struct bch_fs *c, subvol_inum inum, bool cached)
{
struct btree_trans trans;
- struct btree_iter *iter = NULL;
+ struct btree_iter iter = { NULL };
struct bkey_i_inode_generation delete;
- struct bpos start = POS(inode_nr, 0);
- struct bpos end = POS(inode_nr + 1, 0);
struct bch_inode_unpacked inode_u;
struct bkey_s_c k;
+ unsigned iter_flags = BTREE_ITER_INTENT;
+ u32 snapshot;
int ret;
+ if (0 && cached && c->opts.inodes_use_key_cache)
+ iter_flags |= BTREE_ITER_CACHED;
+
bch2_trans_init(&trans, c, 0, 1024);
/*
@@ -589,50 +674,48 @@ int bch2_inode_rm(struct bch_fs *c, u64 inode_nr, bool cached)
 * XXX: the dirent code could ideally delete whiteouts when they're no
* longer needed
*/
- ret = bch2_btree_delete_range_trans(&trans, BTREE_ID_extents,
- start, end, NULL) ?:
- bch2_btree_delete_range_trans(&trans, BTREE_ID_xattrs,
- start, end, NULL) ?:
- bch2_btree_delete_range_trans(&trans, BTREE_ID_dirents,
- start, end, NULL);
+ ret = bch2_inode_delete_keys(&trans, inum, BTREE_ID_extents) ?:
+ bch2_inode_delete_keys(&trans, inum, BTREE_ID_xattrs) ?:
+ bch2_inode_delete_keys(&trans, inum, BTREE_ID_dirents);
if (ret)
goto err;
retry:
bch2_trans_begin(&trans);
- if (cached) {
- iter = bch2_trans_get_iter(&trans, BTREE_ID_inodes, POS(0, inode_nr),
- BTREE_ITER_CACHED|BTREE_ITER_INTENT);
- k = bch2_btree_iter_peek_cached(iter);
- } else {
- iter = bch2_trans_get_iter(&trans, BTREE_ID_inodes, POS(0, inode_nr),
- BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
- k = bch2_btree_iter_peek_slot(iter);
- }
+ ret = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot);
+ if (ret)
+ goto err;
+
+ bch2_trans_iter_init(&trans, &iter, BTREE_ID_inodes,
+ SPOS(0, inum.inum, snapshot), iter_flags);
+ k = bch2_btree_iter_peek_slot(&iter);
ret = bkey_err(k);
if (ret)
goto err;
- if (k.k->type != KEY_TYPE_inode) {
+ if (!bkey_is_inode(k.k)) {
bch2_fs_inconsistent(trans.c,
"inode %llu not found when deleting",
- inode_nr);
+ inum.inum);
ret = -EIO;
goto err;
}
- bch2_inode_unpack(bkey_s_c_to_inode(k), &inode_u);
+ bch2_inode_unpack(k, &inode_u);
+
+ /* Subvolume root? */
+ BUG_ON(inode_u.bi_subvol);
bkey_inode_generation_init(&delete.k_i);
- delete.k.p = iter->pos;
+ delete.k.p = iter.pos;
delete.v.bi_generation = cpu_to_le32(inode_u.bi_generation + 1);
- ret = bch2_trans_update(&trans, iter, &delete.k_i, 0) ?:
+ ret = bch2_trans_update(&trans, &iter, &delete.k_i, 0) ?:
bch2_trans_commit(&trans, NULL, NULL,
BTREE_INSERT_NOFAIL);
err:
- bch2_trans_iter_put(&trans, iter);
+ bch2_trans_iter_exit(&trans, &iter);
if (ret == -EINTR)
goto retry;
@@ -640,21 +723,22 @@ err:
return ret;
}
-static int bch2_inode_find_by_inum_trans(struct btree_trans *trans, u64 inode_nr,
- struct bch_inode_unpacked *inode)
+int bch2_inode_find_by_inum_trans(struct btree_trans *trans,
+ subvol_inum inum,
+ struct bch_inode_unpacked *inode)
{
- struct btree_iter *iter;
+ struct btree_iter iter;
int ret;
- iter = bch2_inode_peek(trans, inode, inode_nr, 0);
- ret = PTR_ERR_OR_ZERO(iter);
- bch2_trans_iter_put(trans, iter);
+ ret = bch2_inode_peek(trans, &iter, inode, inum, 0);
+ if (!ret)
+ bch2_trans_iter_exit(trans, &iter);
return ret;
}
-int bch2_inode_find_by_inum(struct bch_fs *c, u64 inode_nr,
+int bch2_inode_find_by_inum(struct bch_fs *c, subvol_inum inum,
struct bch_inode_unpacked *inode)
{
return bch2_trans_do(c, NULL, NULL, 0,
- bch2_inode_find_by_inum_trans(&trans, inode_nr, inode));
+ bch2_inode_find_by_inum_trans(&trans, inum, inode));
}
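
The deletion loop above is the shape most of this series converts code to: the transaction is restarted at the top of every pass, the subvolume's snapshot is re-resolved because it can change across a restart, and -EINTR re-enters the loop instead of propagating. A condensed sketch of that skeleton (the whiteout/commit step is elided; the helper name is illustrative):

static int delete_keys_sketch(struct btree_trans *trans, subvol_inum inum,
			      enum btree_id id)
{
	u64 offset = 0;
	int ret = 0;

	while (!ret || ret == -EINTR) {
		struct btree_iter iter;
		struct bkey_s_c k;
		u32 snapshot;

		bch2_trans_begin(trans);

		/* re-resolve the snapshot: a restart may have changed it */
		ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
		if (ret)
			continue;

		bch2_trans_iter_init(trans, &iter, id,
				     SPOS(inum.inum, offset, snapshot),
				     BTREE_ITER_INTENT);
		k = bch2_btree_iter_peek(&iter);
		if (!k.k || iter.pos.inode != inum.inum) {
			bch2_trans_iter_exit(trans, &iter);
			break;
		}

		ret = bkey_err(k);
		if (!ret) {
			/* build a whiteout at iter.pos and bch2_trans_commit() it here */
		}

		offset = iter.pos.offset;
		bch2_trans_iter_exit(trans, &iter);
	}

	return ret;
}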
diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h
index 2cb081ae44d9..723186d8afb6 100644
--- a/fs/bcachefs/inode.h
+++ b/fs/bcachefs/inode.h
@@ -7,6 +7,7 @@
extern const char * const bch2_inode_opts[];
const char *bch2_inode_invalid(const struct bch_fs *, struct bkey_s_c);
+const char *bch2_inode_v2_invalid(const struct bch_fs *, struct bkey_s_c);
void bch2_inode_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
#define bch2_bkey_ops_inode (struct bkey_ops) { \
@@ -14,6 +15,17 @@ void bch2_inode_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
.val_to_text = bch2_inode_to_text, \
}
+#define bch2_bkey_ops_inode_v2 (struct bkey_ops) { \
+ .key_invalid = bch2_inode_v2_invalid, \
+ .val_to_text = bch2_inode_to_text, \
+}
+
+static inline bool bkey_is_inode(const struct bkey *k)
+{
+ return k->type == KEY_TYPE_inode ||
+ k->type == KEY_TYPE_inode_v2;
+}
+
const char *bch2_inode_generation_invalid(const struct bch_fs *,
struct bkey_s_c);
void bch2_inode_generation_to_text(struct printbuf *, struct bch_fs *,
@@ -34,6 +46,7 @@ typedef u64 u96;
struct bch_inode_unpacked {
u64 bi_inum;
+ u64 bi_journal_seq;
__le64 bi_hash_seed;
u32 bi_flags;
u16 bi_mode;
@@ -44,7 +57,7 @@ struct bch_inode_unpacked {
};
struct bkey_inode_buf {
- struct bkey_i_inode inode;
+ struct bkey_i_inode_v2 inode;
#define x(_name, _bits) + 8 + _bits / 8
u8 _pad[0 + BCH_INODE_FIELDS()];
@@ -53,10 +66,12 @@ struct bkey_inode_buf {
void bch2_inode_pack(struct bch_fs *, struct bkey_inode_buf *,
const struct bch_inode_unpacked *);
-int bch2_inode_unpack(struct bkey_s_c_inode, struct bch_inode_unpacked *);
+int bch2_inode_unpack(struct bkey_s_c, struct bch_inode_unpacked *);
+
+void bch2_inode_unpacked_to_text(struct printbuf *, struct bch_inode_unpacked *);
-struct btree_iter *bch2_inode_peek(struct btree_trans *,
- struct bch_inode_unpacked *, u64, unsigned);
+int bch2_inode_peek(struct btree_trans *, struct btree_iter *,
+ struct bch_inode_unpacked *, subvol_inum, unsigned);
int bch2_inode_write(struct btree_trans *, struct btree_iter *,
struct bch_inode_unpacked *);
@@ -69,12 +84,15 @@ void bch2_inode_init(struct bch_fs *, struct bch_inode_unpacked *,
uid_t, gid_t, umode_t, dev_t,
struct bch_inode_unpacked *);
-struct btree_iter *bch2_inode_create(struct btree_trans *,
- struct bch_inode_unpacked *, u32, u64);
+int bch2_inode_create(struct btree_trans *, struct btree_iter *,
+ struct bch_inode_unpacked *, u32, u64);
-int bch2_inode_rm(struct bch_fs *, u64, bool);
+int bch2_inode_rm(struct bch_fs *, subvol_inum, bool);
-int bch2_inode_find_by_inum(struct bch_fs *, u64, struct bch_inode_unpacked *);
+int bch2_inode_find_by_inum_trans(struct btree_trans *, subvol_inum,
+ struct bch_inode_unpacked *);
+int bch2_inode_find_by_inum(struct bch_fs *, subvol_inum,
+ struct bch_inode_unpacked *);
static inline struct bch_io_opts bch2_inode_opts_get(struct bch_inode_unpacked *inode)
{
@@ -131,6 +149,11 @@ static inline u8 mode_to_type(umode_t mode)
return (mode >> 12) & 15;
}
+static inline u8 inode_d_type(struct bch_inode_unpacked *inode)
+{
+ return inode->bi_subvol ? DT_SUBVOL : mode_to_type(inode->bi_mode);
+}
+
/* i_nlink: */
static inline unsigned nlink_bias(umode_t mode)
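
Callers of the inode helpers now pass the iterator by reference and get a plain int back, so the PTR_ERR_OR_ZERO() bookkeeping disappears. A minimal single-shot lookup under the new convention is sketched below; the helper name is illustrative, and real callers wrap it in bch2_trans_do() (as bch2_inode_find_by_inum() does) so transaction restarts are retried:

static int lookup_inode_once(struct bch_fs *c, subvol_inum inum,
			     struct bch_inode_unpacked *inode_u)
{
	struct btree_trans trans;
	struct btree_iter iter;
	int ret;

	bch2_trans_init(&trans, c, 0, 0);

	/* iter is caller storage now; bch2_inode_peek() initializes it */
	ret = bch2_inode_peek(&trans, &iter, inode_u, inum, 0);
	if (!ret)
		bch2_trans_iter_exit(&trans, &iter);

	bch2_trans_exit(&trans);
	return ret;
}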
diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c
index ddb85f9d474e..5a3c9eff1b50 100644
--- a/fs/bcachefs/io.c
+++ b/fs/bcachefs/io.c
@@ -27,6 +27,7 @@
#include "keylist.h"
#include "move.h"
#include "rebalance.h"
+#include "subvolume.h"
#include "super.h"
#include "super-io.h"
@@ -135,10 +136,10 @@ void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw)
void bch2_bio_free_pages_pool(struct bch_fs *c, struct bio *bio)
{
+ struct bvec_iter_all iter;
struct bio_vec *bv;
- unsigned i;
- bio_for_each_segment_all(bv, bio, i)
+ bio_for_each_segment_all(bv, bio, iter)
if (bv->bv_page != ZERO_PAGE(0))
mempool_free(bv->bv_page, &c->bio_bounce_pages);
bio->bi_vcnt = 0;
@@ -186,26 +187,24 @@ void bch2_bio_alloc_pages_pool(struct bch_fs *c, struct bio *bio,
int bch2_sum_sector_overwrites(struct btree_trans *trans,
struct btree_iter *extent_iter,
struct bkey_i *new,
- bool *maybe_extending,
bool *usage_increasing,
s64 *i_sectors_delta,
s64 *disk_sectors_delta)
{
struct bch_fs *c = trans->c;
- struct btree_iter *iter;
+ struct btree_iter iter;
struct bkey_s_c old;
unsigned new_replicas = bch2_bkey_replicas(c, bkey_i_to_s_c(new));
bool new_compressed = bch2_bkey_sectors_compressed(bkey_i_to_s_c(new));
int ret = 0;
- *maybe_extending = true;
*usage_increasing = false;
*i_sectors_delta = 0;
*disk_sectors_delta = 0;
- iter = bch2_trans_copy_iter(trans, extent_iter);
+ bch2_trans_copy_iter(&iter, extent_iter);
- for_each_btree_key_continue(iter, BTREE_ITER_SLOTS, old, ret) {
+ for_each_btree_key_continue_norestart(iter, BTREE_ITER_SLOTS, old, ret) {
s64 sectors = min(new->k.p.offset, old.k->p.offset) -
max(bkey_start_offset(&new->k),
bkey_start_offset(old.k));
@@ -220,38 +219,21 @@ int bch2_sum_sector_overwrites(struct btree_trans *trans,
: 0;
if (!*usage_increasing &&
- (new_replicas > bch2_bkey_replicas(c, old) ||
+ (new->k.p.snapshot != old.k->p.snapshot ||
+ new_replicas > bch2_bkey_replicas(c, old) ||
(!new_compressed && bch2_bkey_sectors_compressed(old))))
*usage_increasing = true;
- if (bkey_cmp(old.k->p, new->k.p) >= 0) {
- /*
- * Check if there's already data above where we're
- * going to be writing to - this means we're definitely
- * not extending the file:
- *
- * Note that it's not sufficient to check if there's
- * data up to the sector offset we're going to be
- * writing to, because i_size could be up to one block
- * less:
- */
- if (!bkey_cmp(old.k->p, new->k.p))
- old = bch2_btree_iter_next(iter);
-
- if (old.k && !bkey_err(old) &&
- old.k->p.inode == extent_iter->pos.inode &&
- bkey_extent_is_data(old.k))
- *maybe_extending = false;
-
+ if (bkey_cmp(old.k->p, new->k.p) >= 0)
break;
- }
}
- bch2_trans_iter_put(trans, iter);
+ bch2_trans_iter_exit(trans, &iter);
return ret;
}
int bch2_extent_update(struct btree_trans *trans,
+ subvol_inum inum,
struct btree_iter *iter,
struct bkey_i *k,
struct disk_reservation *disk_res,
@@ -260,214 +242,208 @@ int bch2_extent_update(struct btree_trans *trans,
s64 *i_sectors_delta_total,
bool check_enospc)
{
- /* this must live until after bch2_trans_commit(): */
- struct bkey_inode_buf inode_p;
- bool extending = false, usage_increasing;
+ struct btree_iter inode_iter;
+ struct bch_inode_unpacked inode_u;
+ struct bpos next_pos;
+ bool usage_increasing;
s64 i_sectors_delta = 0, disk_sectors_delta = 0;
int ret;
- ret = bch2_extent_trim_atomic(k, iter);
+ /*
+ * This traverses us the iterator without changing iter->path->pos to
+ * search_key() (which is pos + 1 for extents): we want there to be a
+ * path already traversed at iter->pos because
+ * bch2_trans_extent_update() will use it to attempt extent merging
+ */
+ ret = __bch2_btree_iter_traverse(iter);
+ if (ret)
+ return ret;
+
+ ret = bch2_extent_trim_atomic(trans, iter, k);
if (ret)
return ret;
+ new_i_size = min(k->k.p.offset << 9, new_i_size);
+ next_pos = k->k.p;
+
ret = bch2_sum_sector_overwrites(trans, iter, k,
- &extending,
&usage_increasing,
&i_sectors_delta,
&disk_sectors_delta);
if (ret)
return ret;
- if (!usage_increasing)
- check_enospc = false;
-
if (disk_res &&
disk_sectors_delta > (s64) disk_res->sectors) {
ret = bch2_disk_reservation_add(trans->c, disk_res,
disk_sectors_delta - disk_res->sectors,
- !check_enospc
+ !check_enospc || !usage_increasing
? BCH_DISK_RESERVATION_NOFAIL : 0);
if (ret)
return ret;
}
- new_i_size = extending
- ? min(k->k.p.offset << 9, new_i_size)
- : 0;
-
- if (i_sectors_delta || new_i_size) {
- struct btree_iter *inode_iter;
- struct bch_inode_unpacked inode_u;
-
- inode_iter = bch2_inode_peek(trans, &inode_u,
- k->k.p.inode, BTREE_ITER_INTENT);
- ret = PTR_ERR_OR_ZERO(inode_iter);
- if (ret)
- return ret;
-
- /*
- * XXX:
- * writeback can race a bit with truncate, because truncate
- * first updates the inode then truncates the pagecache. This is
- * ugly, but lets us preserve the invariant that the in memory
- * i_size is always >= the on disk i_size.
- *
- BUG_ON(new_i_size > inode_u.bi_size &&
- (inode_u.bi_flags & BCH_INODE_I_SIZE_DIRTY));
- */
- BUG_ON(new_i_size > inode_u.bi_size && !extending);
-
- if (!(inode_u.bi_flags & BCH_INODE_I_SIZE_DIRTY) &&
- new_i_size > inode_u.bi_size)
- inode_u.bi_size = new_i_size;
- else
- new_i_size = 0;
-
- inode_u.bi_sectors += i_sectors_delta;
-
- if (i_sectors_delta || new_i_size) {
- bch2_inode_pack(trans->c, &inode_p, &inode_u);
+ ret = bch2_inode_peek(trans, &inode_iter, &inode_u, inum,
+ BTREE_ITER_INTENT);
+ if (ret)
+ return ret;
- inode_p.inode.k.p.snapshot = iter->snapshot;
+ if (!(inode_u.bi_flags & BCH_INODE_I_SIZE_DIRTY) &&
+ new_i_size > inode_u.bi_size)
+ inode_u.bi_size = new_i_size;
- ret = bch2_trans_update(trans, inode_iter,
- &inode_p.inode.k_i, 0);
- }
-
- bch2_trans_iter_put(trans, inode_iter);
-
- if (ret)
- return ret;
- }
+ inode_u.bi_sectors += i_sectors_delta;
ret = bch2_trans_update(trans, iter, k, 0) ?:
+ bch2_inode_write(trans, &inode_iter, &inode_u) ?:
bch2_trans_commit(trans, disk_res, journal_seq,
BTREE_INSERT_NOCHECK_RW|
BTREE_INSERT_NOFAIL);
- BUG_ON(ret == -ENOSPC);
+ bch2_trans_iter_exit(trans, &inode_iter);
+
if (ret)
return ret;
if (i_sectors_delta_total)
*i_sectors_delta_total += i_sectors_delta;
+ bch2_btree_iter_set_pos(iter, next_pos);
+
return 0;
}
+/*
+ * Returns -EINTR if we had to drop locks:
+ */
int bch2_fpunch_at(struct btree_trans *trans, struct btree_iter *iter,
- struct bpos end, u64 *journal_seq,
+ subvol_inum inum, u64 end,
s64 *i_sectors_delta)
{
struct bch_fs *c = trans->c;
unsigned max_sectors = KEY_SIZE_MAX & (~0 << c->block_bits);
+ struct bpos end_pos = POS(inum.inum, end);
struct bkey_s_c k;
int ret = 0, ret2 = 0;
+ u32 snapshot;
- while ((k = bch2_btree_iter_peek(iter)).k &&
- bkey_cmp(iter->pos, end) < 0) {
+ while (!ret || ret == -EINTR) {
struct disk_reservation disk_res =
bch2_disk_reservation_init(c, 0);
struct bkey_i delete;
+ if (ret)
+ ret2 = ret;
+
bch2_trans_begin(trans);
+ ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
+ if (ret)
+ continue;
+
+ bch2_btree_iter_set_snapshot(iter, snapshot);
+
+ k = bch2_btree_iter_peek(iter);
+ if (bkey_cmp(iter->pos, end_pos) >= 0) {
+ bch2_btree_iter_set_pos(iter, end_pos);
+ break;
+ }
+
ret = bkey_err(k);
if (ret)
- goto btree_err;
+ continue;
bkey_init(&delete.k);
delete.k.p = iter->pos;
/* create the biggest key we can */
bch2_key_resize(&delete.k, max_sectors);
- bch2_cut_back(end, &delete);
+ bch2_cut_back(end_pos, &delete);
- ret = bch2_extent_update(trans, iter, &delete,
- &disk_res, journal_seq,
+ ret = bch2_extent_update(trans, inum, iter, &delete,
+ &disk_res, NULL,
0, i_sectors_delta, false);
bch2_disk_reservation_put(c, &disk_res);
-btree_err:
- if (ret == -EINTR) {
- ret2 = ret;
- ret = 0;
- }
- if (ret)
- break;
- }
-
- if (bkey_cmp(iter->pos, end) > 0) {
- bch2_btree_iter_set_pos(iter, end);
- ret = bch2_btree_iter_traverse(iter);
}
return ret ?: ret2;
}
-int bch2_fpunch(struct bch_fs *c, u64 inum, u64 start, u64 end,
- u64 *journal_seq, s64 *i_sectors_delta)
+int bch2_fpunch(struct bch_fs *c, subvol_inum inum, u64 start, u64 end,
+ s64 *i_sectors_delta)
{
struct btree_trans trans;
- struct btree_iter *iter;
- int ret = 0;
+ struct btree_iter iter;
+ int ret;
bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024);
- iter = bch2_trans_get_iter(&trans, BTREE_ID_extents,
- POS(inum, start),
- BTREE_ITER_INTENT);
+ bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents,
+ POS(inum.inum, start),
+ BTREE_ITER_INTENT);
- ret = bch2_fpunch_at(&trans, iter, POS(inum, end),
- journal_seq, i_sectors_delta);
+ ret = bch2_fpunch_at(&trans, &iter, inum, end, i_sectors_delta);
- bch2_trans_iter_put(&trans, iter);
+ bch2_trans_iter_exit(&trans, &iter);
bch2_trans_exit(&trans);
- if (ret == -EINTR)
- ret = 0;
-
- return ret;
+ return ret == -EINTR ? 0 : ret;
}
int bch2_write_index_default(struct bch_write_op *op)
{
struct bch_fs *c = op->c;
struct bkey_buf sk;
+ struct open_bucket *ec_ob = ec_open_bucket(c, &op->open_buckets);
struct keylist *keys = &op->insert_keys;
struct bkey_i *k = bch2_keylist_front(keys);
struct btree_trans trans;
- struct btree_iter *iter;
+ struct btree_iter iter;
+ subvol_inum inum = {
+ .subvol = op->subvol,
+ .inum = k->k.p.inode,
+ };
int ret;
+ BUG_ON(!inum.subvol);
+
bch2_bkey_buf_init(&sk);
bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024);
- iter = bch2_trans_get_iter(&trans, BTREE_ID_extents,
- bkey_start_pos(&k->k),
- BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
-
do {
bch2_trans_begin(&trans);
k = bch2_keylist_front(keys);
+ bch2_bkey_buf_copy(&sk, c, k);
- k->k.p.snapshot = iter->snapshot;
+ ret = bch2_subvolume_get_snapshot(&trans, inum.subvol,
+ &sk.k->k.p.snapshot);
+ if (ret == -EINTR)
+ continue;
+ if (ret)
+ break;
- bch2_bkey_buf_realloc(&sk, c, k->k.u64s);
- bkey_copy(sk.k, k);
- bch2_cut_front(iter->pos, sk.k);
+ bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents,
+ bkey_start_pos(&sk.k->k),
+ BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
- ret = bch2_extent_update(&trans, iter, sk.k,
+ ret = bch2_extent_update(&trans, inum, &iter, sk.k,
&op->res, op_journal_seq(op),
op->new_i_size, &op->i_sectors_delta,
op->flags & BCH_WRITE_CHECK_ENOSPC);
+ bch2_trans_iter_exit(&trans, &iter);
+
if (ret == -EINTR)
continue;
if (ret)
break;
- if (bkey_cmp(iter->pos, k->k.p) >= 0)
- bch2_keylist_pop_front(keys);
+ if (ec_ob)
+ bch2_ob_add_backpointer(c, ec_ob, &sk.k->k);
+
+ if (bkey_cmp(iter.pos, k->k.p) >= 0)
+ bch2_keylist_pop_front(&op->insert_keys);
+ else
+ bch2_cut_front(iter.pos, k);
} while (!bch2_keylist_empty(keys));
- bch2_trans_iter_put(&trans, iter);
bch2_trans_exit(&trans);
bch2_bkey_buf_exit(&sk, c);
@@ -741,6 +717,8 @@ static struct bio *bch2_write_bio_alloc(struct bch_fs *c,
? ((unsigned long) buf & (PAGE_SIZE - 1))
: 0), PAGE_SIZE);
+ pages = min(pages, BIO_MAX_VECS);
+
bio = bio_alloc_bioset(GFP_NOIO, pages, &c->bio_write);
wbio = wbio_init(bio);
wbio->put_bio = true;
@@ -906,7 +884,6 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp,
struct bio *src = &op->wbio.bio, *dst = src;
struct bvec_iter saved_iter;
void *ec_buf;
- struct bpos ec_pos = op->pos;
unsigned total_output = 0, total_input = 0;
bool bounce = false;
bool page_alloc_failed = false;
@@ -923,7 +900,6 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp,
ret = -EIO;
goto err;
case PREP_ENCODED_CHECKSUM_ERR:
- BUG();
goto csum_err;
case PREP_ENCODED_DO_WRITE:
/* XXX look for bug here */
@@ -1077,9 +1053,6 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp,
dst->bi_iter.bi_size = total_output;
do_write:
- /* might have done a realloc... */
- bch2_ec_add_backpointer(c, wp, ec_pos, total_input >> 9);
-
*_dst = dst;
return more;
csum_err:
@@ -1628,12 +1601,12 @@ static void bch2_rbio_done(struct bch_read_bio *rbio)
}
static void bch2_read_retry_nodecode(struct bch_fs *c, struct bch_read_bio *rbio,
- struct bvec_iter bvec_iter, u64 inode,
+ struct bvec_iter bvec_iter,
struct bch_io_failures *failed,
unsigned flags)
{
struct btree_trans trans;
- struct btree_iter *iter;
+ struct btree_iter iter;
struct bkey_buf sk;
struct bkey_s_c k;
int ret;
@@ -1644,12 +1617,12 @@ static void bch2_read_retry_nodecode(struct bch_fs *c, struct bch_read_bio *rbio
bch2_bkey_buf_init(&sk);
bch2_trans_init(&trans, c, 0, 0);
- iter = bch2_trans_get_iter(&trans, rbio->data_btree,
- rbio->read_pos, BTREE_ITER_SLOTS);
+ bch2_trans_iter_init(&trans, &iter, rbio->data_btree,
+ rbio->read_pos, BTREE_ITER_SLOTS);
retry:
rbio->bio.bi_status = 0;
- k = bch2_btree_iter_peek_slot(iter);
+ k = bch2_btree_iter_peek_slot(&iter);
if (bkey_err(k))
goto err;
@@ -1676,7 +1649,7 @@ retry:
goto err;
out:
bch2_rbio_done(rbio);
- bch2_trans_iter_put(&trans, iter);
+ bch2_trans_iter_exit(&trans, &iter);
bch2_trans_exit(&trans);
bch2_bkey_buf_exit(&sk, c);
return;
@@ -1692,7 +1665,10 @@ static void bch2_rbio_retry(struct work_struct *work)
struct bch_fs *c = rbio->c;
struct bvec_iter iter = rbio->bvec_iter;
unsigned flags = rbio->flags;
- u64 inode = rbio->read_pos.inode;
+ subvol_inum inum = {
+ .subvol = rbio->subvol,
+ .inum = rbio->read_pos.inode,
+ };
struct bch_io_failures failed = { .nr = 0 };
trace_read_retry(&rbio->bio);
@@ -1708,12 +1684,12 @@ static void bch2_rbio_retry(struct work_struct *work)
flags &= ~BCH_READ_MAY_PROMOTE;
if (flags & BCH_READ_NODECODE) {
- bch2_read_retry_nodecode(c, rbio, iter, inode, &failed, flags);
+ bch2_read_retry_nodecode(c, rbio, iter, &failed, flags);
} else {
flags &= ~BCH_READ_LAST_FRAGMENT;
flags |= BCH_READ_MUST_CLONE;
- __bch2_read(c, rbio, iter, inode, &failed, flags);
+ __bch2_read(c, rbio, iter, inum, &failed, flags);
}
}
@@ -1742,7 +1718,7 @@ static int __bch2_rbio_narrow_crcs(struct btree_trans *trans,
struct bch_fs *c = rbio->c;
u64 data_offset = rbio->data_pos.offset - rbio->pick.crc.offset;
struct bch_extent_crc_unpacked new_crc;
- struct btree_iter *iter = NULL;
+ struct btree_iter iter;
struct bkey_i *new;
struct bkey_s_c k;
int ret = 0;
@@ -1750,9 +1726,9 @@ static int __bch2_rbio_narrow_crcs(struct btree_trans *trans,
if (crc_is_compressed(rbio->pick.crc))
return 0;
- iter = bch2_trans_get_iter(trans, rbio->data_btree, rbio->data_pos,
- BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
- k = bch2_btree_iter_peek_slot(iter);
+ bch2_trans_iter_init(trans, &iter, rbio->data_btree, rbio->data_pos,
+ BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
+ k = bch2_btree_iter_peek_slot(&iter);
if ((ret = bkey_err(k)))
goto out;
@@ -1787,9 +1763,10 @@ static int __bch2_rbio_narrow_crcs(struct btree_trans *trans,
if (!bch2_bkey_narrow_crcs(new, new_crc))
goto out;
- ret = bch2_trans_update(trans, iter, new, 0);
+ ret = bch2_trans_update(trans, &iter, new,
+ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
out:
- bch2_trans_iter_put(trans, iter);
+ bch2_trans_iter_exit(trans, &iter);
return ret;
}
@@ -1811,8 +1788,11 @@ static void __bch2_read_endio(struct work_struct *work)
struct bvec_iter dst_iter = rbio->bvec_iter;
struct bch_extent_crc_unpacked crc = rbio->pick.crc;
struct nonce nonce = extent_nonce(rbio->version, crc);
+ unsigned nofs_flags;
struct bch_csum csum;
+ nofs_flags = memalloc_nofs_save();
+
/* Reset iterator for checksumming and copying bounced data: */
if (rbio->bounce) {
src->bi_iter.bi_size = crc.compressed_size << 9;
@@ -1877,6 +1857,8 @@ nodecode:
rbio = bch2_rbio_free(rbio);
bch2_rbio_done(rbio);
}
+out:
+ memalloc_nofs_restore(nofs_flags);
return;
csum_err:
/*
@@ -1887,7 +1869,7 @@ csum_err:
if (!rbio->bounce && (rbio->flags & BCH_READ_USER_MAPPED)) {
rbio->flags |= BCH_READ_MUST_BOUNCE;
bch2_rbio_error(rbio, READ_RETRY, BLK_STS_IOERR);
- return;
+ goto out;
}
bch2_dev_inum_io_error(ca, rbio->read_pos.inode, (u64) rbio->bvec_iter.bi_sector,
@@ -1895,12 +1877,12 @@ csum_err:
rbio->pick.crc.csum.hi, rbio->pick.crc.csum.lo,
csum.hi, csum.lo, crc.csum_type);
bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR);
- return;
+ goto out;
decompression_err:
bch_err_inum_ratelimited(c, rbio->read_pos.inode,
"decompression error");
bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR);
- return;
+ goto out;
}
static void bch2_read_endio(struct bio *bio)
@@ -1955,7 +1937,7 @@ int __bch2_read_indirect_extent(struct btree_trans *trans,
unsigned *offset_into_extent,
struct bkey_buf *orig_k)
{
- struct btree_iter *iter;
+ struct btree_iter iter;
struct bkey_s_c k;
u64 reflink_offset;
int ret;
@@ -1963,10 +1945,10 @@ int __bch2_read_indirect_extent(struct btree_trans *trans,
reflink_offset = le64_to_cpu(bkey_i_to_reflink_p(orig_k->k)->v.idx) +
*offset_into_extent;
- iter = bch2_trans_get_iter(trans, BTREE_ID_reflink,
- POS(0, reflink_offset),
- BTREE_ITER_SLOTS);
- k = bch2_btree_iter_peek_slot(iter);
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_reflink,
+ POS(0, reflink_offset),
+ BTREE_ITER_SLOTS);
+ k = bch2_btree_iter_peek_slot(&iter);
ret = bkey_err(k);
if (ret)
goto err;
@@ -1983,10 +1965,10 @@ int __bch2_read_indirect_extent(struct btree_trans *trans,
goto err;
}
- *offset_into_extent = iter->pos.offset - bkey_start_offset(k.k);
+ *offset_into_extent = iter.pos.offset - bkey_start_offset(k.k);
bch2_bkey_buf_reassemble(orig_k, trans->c, k);
err:
- bch2_trans_iter_put(trans, iter);
+ bch2_trans_iter_exit(trans, &iter);
return ret;
}
@@ -2057,7 +2039,7 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig,
EBUG_ON(offset_into_extent + bvec_iter_sectors(iter) > k.k->size);
if (crc_is_compressed(pick.crc) ||
- (pick.crc.csum_type != BCH_CSUM_NONE &&
+ (pick.crc.csum_type != BCH_CSUM_none &&
(bvec_iter_sectors(iter) != pick.crc.uncompressed_size ||
(bch2_csum_type_is_encryption(pick.crc.csum_type) &&
(flags & BCH_READ_USER_MAPPED)) ||
@@ -2150,6 +2132,7 @@ get_bio:
/* XXX: only initialize this if needed */
rbio->devs_have = bch2_bkey_devs(k);
rbio->pick = pick;
+ rbio->subvol = orig->subvol;
rbio->read_pos = read_pos;
rbio->data_btree = data_btree;
rbio->data_pos = data_pos;
@@ -2252,13 +2235,14 @@ out_read_done:
}
void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio,
- struct bvec_iter bvec_iter, u64 inode,
+ struct bvec_iter bvec_iter, subvol_inum inum,
struct bch_io_failures *failed, unsigned flags)
{
struct btree_trans trans;
- struct btree_iter *iter;
+ struct btree_iter iter;
struct bkey_buf sk;
struct bkey_s_c k;
+ u32 snapshot;
int ret;
BUG_ON(flags & BCH_READ_NODECODE);
@@ -2267,23 +2251,37 @@ void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio,
bch2_trans_init(&trans, c, 0, 0);
retry:
bch2_trans_begin(&trans);
+ iter = (struct btree_iter) { NULL };
+
+ ret = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot);
+ if (ret)
+ goto err;
- iter = bch2_trans_get_iter(&trans, BTREE_ID_extents,
- POS(inode, bvec_iter.bi_sector),
- BTREE_ITER_SLOTS);
+ bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents,
+ SPOS(inum.inum, bvec_iter.bi_sector, snapshot),
+ BTREE_ITER_SLOTS|BTREE_ITER_FILTER_SNAPSHOTS);
while (1) {
unsigned bytes, sectors, offset_into_extent;
enum btree_id data_btree = BTREE_ID_extents;
- bch2_btree_iter_set_pos(iter,
- POS(inode, bvec_iter.bi_sector));
+ /*
+ * read_extent -> io_time_reset may cause a transaction restart
+ * without returning an error, so we need to check for that here:
+ */
+ if (!bch2_trans_relock(&trans)) {
+ ret = -EINTR;
+ break;
+ }
+
+ bch2_btree_iter_set_pos(&iter,
+ POS(inum.inum, bvec_iter.bi_sector));
- k = bch2_btree_iter_peek_slot(iter);
+ k = bch2_btree_iter_peek_slot(&iter);
ret = bkey_err(k);
if (ret)
break;
- offset_into_extent = iter->pos.offset -
+ offset_into_extent = iter.pos.offset -
bkey_start_offset(k.k);
sectors = k.k->size - offset_into_extent;
@@ -2314,7 +2312,7 @@ retry:
if (bvec_iter.bi_size == bytes)
flags |= BCH_READ_LAST_FRAGMENT;
- ret = __bch2_read_extent(&trans, rbio, bvec_iter, iter->pos,
+ ret = __bch2_read_extent(&trans, rbio, bvec_iter, iter.pos,
data_btree, k,
offset_into_extent, failed, flags);
if (ret)
@@ -2325,20 +2323,26 @@ retry:
swap(bvec_iter.bi_size, bytes);
bio_advance_iter(&rbio->bio, &bvec_iter, bytes);
+
+ ret = btree_trans_too_many_iters(&trans);
+ if (ret)
+ break;
}
- bch2_trans_iter_put(&trans, iter);
+err:
+ bch2_trans_iter_exit(&trans, &iter);
if (ret == -EINTR || ret == READ_RETRY || ret == READ_RETRY_AVOID)
goto retry;
+ bch2_trans_exit(&trans);
+ bch2_bkey_buf_exit(&sk, c);
+
if (ret) {
- bch_err_inum_ratelimited(c, inode,
+ bch_err_inum_ratelimited(c, inum.inum,
"read error %i from btree lookup", ret);
rbio->bio.bi_status = BLK_STS_IOERR;
bch2_rbio_done(rbio);
}
- bch2_trans_exit(&trans);
- bch2_bkey_buf_exit(&sk, c);
}
void bch2_fs_io_exit(struct bch_fs *c)
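
Since bch2_extent_update() now writes the inode in the same transaction as the extent, the journal_seq plumbing drops out of the punch path and callers address ranges by subvol_inum plus sector offsets. A hedged caller sketch (the helper name and the byte-to-sector conversion are illustrative):

static int punch_range_sketch(struct bch_fs *c, subvol_inum inum,
			      u64 start_byte, u64 end_byte)
{
	s64 i_sectors_delta = 0;
	int ret;

	/* bch2_fpunch() takes sector offsets and accumulates the i_sectors change */
	ret = bch2_fpunch(c, inum, start_byte >> 9, end_byte >> 9,
			  &i_sectors_delta);

	/* the caller applies i_sectors_delta to its in-memory inode accounting */
	return ret;
}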
diff --git a/fs/bcachefs/io.h b/fs/bcachefs/io.h
index bc0a0bd6f849..1aa422dccef7 100644
--- a/fs/bcachefs/io.h
+++ b/fs/bcachefs/io.h
@@ -48,12 +48,6 @@ static inline u64 *op_journal_seq(struct bch_write_op *op)
? op->journal_seq_p : &op->journal_seq;
}
-static inline void op_journal_seq_set(struct bch_write_op *op, u64 *journal_seq)
-{
- op->journal_seq_p = journal_seq;
- op->flags |= BCH_WRITE_JOURNAL_SEQ_PTR;
-}
-
static inline struct workqueue_struct *index_update_wq(struct bch_write_op *op)
{
return op->alloc_reserve == RESERVE_MOVINGGC
@@ -62,13 +56,14 @@ static inline struct workqueue_struct *index_update_wq(struct bch_write_op *op)
}
int bch2_sum_sector_overwrites(struct btree_trans *, struct btree_iter *,
- struct bkey_i *, bool *, bool *, s64 *, s64 *);
-int bch2_extent_update(struct btree_trans *, struct btree_iter *,
- struct bkey_i *, struct disk_reservation *,
- u64 *, u64, s64 *, bool);
+ struct bkey_i *, bool *, s64 *, s64 *);
+int bch2_extent_update(struct btree_trans *, subvol_inum,
+ struct btree_iter *, struct bkey_i *,
+ struct disk_reservation *, u64 *, u64, s64 *, bool);
+
int bch2_fpunch_at(struct btree_trans *, struct btree_iter *,
- struct bpos, u64 *, s64 *);
-int bch2_fpunch(struct bch_fs *c, u64, u64, u64, u64 *, s64 *);
+ subvol_inum, u64, s64 *);
+int bch2_fpunch(struct bch_fs *c, subvol_inum, u64, u64, s64 *);
int bch2_write_index_default(struct bch_write_op *);
@@ -90,6 +85,7 @@ static inline void bch2_write_op_init(struct bch_write_op *op, struct bch_fs *c,
op->devs_have.nr = 0;
op->target = 0;
op->opts = opts;
+ op->subvol = 0;
op->pos = POS_MAX;
op->version = ZERO_VERSION;
op->write_point = (struct write_point_specifier) { 0 };
@@ -157,10 +153,10 @@ static inline void bch2_read_extent(struct btree_trans *trans,
}
void __bch2_read(struct bch_fs *, struct bch_read_bio *, struct bvec_iter,
- u64, struct bch_io_failures *, unsigned flags);
+ subvol_inum, struct bch_io_failures *, unsigned flags);
static inline void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio,
- u64 inode)
+ subvol_inum inum)
{
struct bch_io_failures failed = { .nr = 0 };
@@ -168,8 +164,9 @@ static inline void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio,
rbio->c = c;
rbio->start_time = local_clock();
+ rbio->subvol = inum.subvol;
- __bch2_read(c, rbio, rbio->bio.bi_iter, inode, &failed,
+ __bch2_read(c, rbio, rbio->bio.bi_iter, inum, &failed,
BCH_READ_RETRY_IF_STALE|
BCH_READ_MAY_PROMOTE|
BCH_READ_USER_MAPPED);
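
bch2_write_op_init() leaves op->subvol at 0 and bch2_write_index_default() BUG()s on that, so every data write has to be tagged with its subvolume before submission, just as reads now carry rbio->subvol. A sketch of the setup a caller is expected to do (this helper is illustrative; bio attachment and space reservation are elided):

static void init_write_op_sketch(struct bch_write_op *op, struct bch_fs *c,
				 struct bch_io_opts opts, subvol_inum inum,
				 u64 sector_offset)
{
	bch2_write_op_init(op, c, opts);

	op->subvol = inum.subvol;	/* mandatory: the index update asserts on 0 */
	op->pos    = POS(inum.inum, sector_offset);

	/* caller attaches the bio and submits via bch2_write() */
}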
diff --git a/fs/bcachefs/io_types.h b/fs/bcachefs/io_types.h
index e7aca7c9823a..78bff13d36f2 100644
--- a/fs/bcachefs/io_types.h
+++ b/fs/bcachefs/io_types.h
@@ -62,6 +62,7 @@ struct bch_read_bio {
/*
* pos we read from - different from data_pos for indirect extents:
*/
+ u32 subvol;
struct bpos read_pos;
/*
@@ -94,7 +95,8 @@ struct bch_write_bio {
bounce:1,
put_bio:1,
have_ioref:1,
- used_mempool:1;
+ used_mempool:1,
+ first_btree_write:1;
struct bio bio;
};
@@ -121,6 +123,7 @@ struct bch_write_op {
u16 nonce;
struct bch_io_opts opts;
+ u32 subvol;
struct bpos pos;
struct bversion version;
diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c
index d714779a28d0..14bea8a2535e 100644
--- a/fs/bcachefs/journal.c
+++ b/fs/bcachefs/journal.c
@@ -88,8 +88,6 @@ static void bch2_journal_buf_init(struct journal *j)
buf->must_flush = false;
buf->separate_flush = false;
- memset(buf->has_inode, 0, sizeof(buf->has_inode));
-
memset(buf->data, 0, sizeof(*buf->data));
buf->data->seq = cpu_to_le64(journal_cur_seq(j));
buf->data->u64s = 0;
@@ -109,7 +107,12 @@ void bch2_journal_halt(struct journal *j)
} while ((v = atomic64_cmpxchg(&j->reservations.counter,
old.v, new.v)) != old.v);
- j->err_seq = journal_cur_seq(j);
+ /*
+ * XXX: we're not using j->lock here because this can be called from
+ * interrupt context, this can race with journal_write_done()
+ */
+ if (!j->err_seq)
+ j->err_seq = journal_cur_seq(j);
journal_wake(j);
closure_wake_up(&journal_cur_buf(j)->wait);
}
@@ -335,55 +338,6 @@ static void journal_write_work(struct work_struct *work)
journal_entry_close(j);
}
-/*
- * Given an inode number, if that inode number has data in the journal that
- * hasn't yet been flushed, return the journal sequence number that needs to be
- * flushed:
- */
-u64 bch2_inode_journal_seq(struct journal *j, u64 inode)
-{
- size_t h = hash_64(inode, ilog2(sizeof(j->buf[0].has_inode) * 8));
- union journal_res_state s;
- unsigned i;
- u64 seq;
-
-
- spin_lock(&j->lock);
- seq = journal_cur_seq(j);
- s = READ_ONCE(j->reservations);
- i = s.idx;
-
- while (1) {
- if (test_bit(h, j->buf[i].has_inode))
- goto out;
-
- if (i == s.unwritten_idx)
- break;
-
- i = (i - 1) & JOURNAL_BUF_MASK;
- seq--;
- }
-
- seq = 0;
-out:
- spin_unlock(&j->lock);
-
- return seq;
-}
-
-void bch2_journal_set_has_inum(struct journal *j, u64 inode, u64 seq)
-{
- size_t h = hash_64(inode, ilog2(sizeof(j->buf[0].has_inode) * 8));
- struct journal_buf *buf;
-
- spin_lock(&j->lock);
-
- if ((buf = journal_seq_to_buf(j, seq)))
- set_bit(h, buf->has_inode);
-
- spin_unlock(&j->lock);
-}
-
static int __journal_res_get(struct journal *j, struct journal_res *res,
unsigned flags)
{
@@ -602,7 +556,10 @@ int bch2_journal_flush_seq_async(struct journal *j, u64 seq,
spin_lock(&j->lock);
- BUG_ON(seq > journal_cur_seq(j));
+ if (WARN_ONCE(seq > journal_cur_seq(j),
+ "requested to flush journal seq %llu, but currently at %llu",
+ seq, journal_cur_seq(j)))
+ goto out;
/* Recheck under lock: */
if (j->err_seq && seq >= j->err_seq) {
@@ -1071,7 +1028,7 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq,
bch2_journal_space_available(j);
spin_unlock(&j->lock);
- return 0;
+ return bch2_journal_reclaim_start(j);
}
/* init/exit: */
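
With the has_inode bloom filter gone, there is no longer a way to ask the journal which sequence number last touched a given inode; that information now lives in the inode itself as bi_journal_seq (added in inode.h above). A hedged sketch of how a caller could use the new field, assuming the pre-existing bch2_journal_flush_seq() helper; this is not code from this patch:

static int flush_inode_sketch(struct bch_fs *c, subvol_inum inum)
{
	struct bch_inode_unpacked inode_u;
	int ret;

	ret = bch2_inode_find_by_inum(c, inum, &inode_u);
	if (ret)
		return ret;

	/* flush exactly up to the last journal entry that updated this inode */
	return bch2_journal_flush_seq(&c->journal, inode_u.bi_journal_seq);
}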
diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h
index 145c0edf9cf3..c39cbbf1bccd 100644
--- a/fs/bcachefs/journal.h
+++ b/fs/bcachefs/journal.h
@@ -141,7 +141,6 @@ static inline u64 journal_cur_seq(struct journal *j)
return j->pin.back - 1;
}
-u64 bch2_inode_journal_seq(struct journal *, u64);
void bch2_journal_set_has_inum(struct journal *, u64, u64);
static inline int journal_state_count(union journal_res_state s, int idx)
@@ -163,18 +162,6 @@ static inline void journal_state_inc(union journal_res_state *s)
s->buf3_count += s->idx == 3;
}
-static inline void bch2_journal_set_has_inode(struct journal *j,
- struct journal_res *res,
- u64 inum)
-{
- struct journal_buf *buf = &j->buf[res->idx];
- unsigned long bit = hash_64(inum, ilog2(sizeof(buf->has_inode) * 8));
-
- /* avoid atomic op if possible */
- if (unlikely(!test_bit(bit, buf->has_inode)))
- set_bit(bit, buf->has_inode);
-}
-
/*
* Amount of space that will be taken up by some keys in the journal (i.e.
* including the jset header)
@@ -291,7 +278,7 @@ static inline void bch2_journal_res_put(struct journal *j,
if (!res->ref)
return;
- lock_release(&j->res_map, 0, _THIS_IP_);
+ lock_release(&j->res_map, _THIS_IP_);
while (res->u64s)
bch2_journal_add_entry(j, res,
@@ -446,6 +433,7 @@ static inline int bch2_journal_preres_get_fast(struct journal *j,
ret = 0;
if ((flags & JOURNAL_RES_GET_RESERVED) ||
+ test_bit(JOURNAL_NOCHANGES, &j->flags) ||
new.reserved + d < new.remaining) {
new.reserved += d;
ret = 1;
diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c
index 2da6839fcdc0..5c8304e05abd 100644
--- a/fs/bcachefs/journal_io.c
+++ b/fs/bcachefs/journal_io.c
@@ -450,7 +450,7 @@ static int journal_entry_validate_dev_usage(struct bch_fs *c,
struct jset_entry_dev_usage *u =
container_of(entry, struct jset_entry_dev_usage, entry);
unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64);
- unsigned expected = sizeof(*u) + sizeof(u->d[0]) * 7; /* Current value of BCH_DATA_NR */
+ unsigned expected = sizeof(*u);
unsigned dev;
int ret = 0;
@@ -1259,14 +1259,15 @@ static void journal_write_done(struct closure *cl)
if (seq >= j->pin.front)
journal_seq_pin(j, seq)->devs = w->devs_written;
- j->seq_ondisk = seq;
- if (err && (!j->err_seq || seq < j->err_seq))
- j->err_seq = seq;
+ if (!err) {
+ j->seq_ondisk = seq;
- if (!JSET_NO_FLUSH(w->data)) {
- j->flushed_seq_ondisk = seq;
- j->last_seq_ondisk = w->last_seq;
- }
+ if (!JSET_NO_FLUSH(w->data)) {
+ j->flushed_seq_ondisk = seq;
+ j->last_seq_ondisk = w->last_seq;
+ }
+ } else if (!j->err_seq || seq < j->err_seq)
+ j->err_seq = seq;
/*
* Updating last_seq_ondisk may let bch2_journal_reclaim_work() discard
@@ -1515,7 +1516,7 @@ retry_alloc:
w->devs_written = bch2_bkey_devs(bkey_i_to_s_c(&w->key));
- if (c->opts.nochanges)
+ if (test_bit(JOURNAL_NOCHANGES, &j->flags))
goto no_io;
for_each_rw_member(ca, c, i)
diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c
index 70f896717537..ca482c6743c3 100644
--- a/fs/bcachefs/journal_reclaim.c
+++ b/fs/bcachefs/journal_reclaim.c
@@ -11,7 +11,6 @@
#include <linux/kthread.h>
#include <linux/sched/mm.h>
-#include <linux/sched/task.h>
#include <trace/events/bcachefs.h>
/* Free space calculations: */
@@ -35,8 +34,10 @@ unsigned bch2_journal_dev_buckets_available(struct journal *j,
struct journal_device *ja,
enum journal_space_from from)
{
- unsigned available = (journal_space_from(ja, from) -
- ja->cur_idx - 1 + ja->nr) % ja->nr;
+ unsigned available = !test_bit(JOURNAL_NOCHANGES, &j->flags)
+ ? ((journal_space_from(ja, from) -
+ ja->cur_idx - 1 + ja->nr) % ja->nr)
+ : ja->nr;
/*
* Don't use the last bucket unless writing the new last_seq
@@ -645,6 +646,9 @@ static int __bch2_journal_reclaim(struct journal *j, bool direct)
if (fifo_free(&j->pin) <= 32)
min_nr = 1;
+ if (atomic_read(&c->btree_cache.dirty) * 2 > c->btree_cache.used)
+ min_nr = 1;
+
trace_journal_reclaim_start(c,
min_nr,
j->prereserved.reserved,
@@ -654,7 +658,7 @@ static int __bch2_journal_reclaim(struct journal *j, bool direct)
atomic_long_read(&c->btree_key_cache.nr_dirty),
atomic_long_read(&c->btree_key_cache.nr_keys));
- min_key_cache = min(bch2_nr_btree_keys_need_flush(c), 128UL);
+ min_key_cache = min(bch2_nr_btree_keys_need_flush(c), (size_t) 128);
nr_flushed = journal_flush_pins(j, seq_to_flush,
min_nr, min_key_cache);
diff --git a/fs/bcachefs/journal_seq_blacklist.c b/fs/bcachefs/journal_seq_blacklist.c
index f2060f903cbc..79bc0e49389b 100644
--- a/fs/bcachefs/journal_seq_blacklist.c
+++ b/fs/bcachefs/journal_seq_blacklist.c
@@ -250,19 +250,28 @@ void bch2_blacklist_entries_gc(struct work_struct *work)
bch2_trans_init(&trans, c, 0, 0);
for (i = 0; i < BTREE_ID_NR; i++) {
- struct btree_iter *iter;
+ struct btree_iter iter;
struct btree *b;
- for_each_btree_node(&trans, iter, i, POS_MIN,
- BTREE_ITER_PREFETCH, b)
- if (test_bit(BCH_FS_STOPPING, &c->flags)) {
- bch2_trans_exit(&trans);
- return;
- }
- bch2_trans_iter_free(&trans, iter);
+ bch2_trans_node_iter_init(&trans, &iter, i, POS_MIN,
+ 0, 0, BTREE_ITER_PREFETCH);
+retry:
+ bch2_trans_begin(&trans);
+
+ b = bch2_btree_iter_peek_node(&iter);
+
+ while (!(ret = PTR_ERR_OR_ZERO(b)) &&
+ b &&
+ !test_bit(BCH_FS_STOPPING, &c->flags))
+ b = bch2_btree_iter_next_node(&iter);
+
+ if (ret == -EINTR)
+ goto retry;
+
+ bch2_trans_iter_exit(&trans, &iter);
}
- ret = bch2_trans_exit(&trans);
+ bch2_trans_exit(&trans);
if (ret)
return;
diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h
index 61674ae1ab5f..d484513289aa 100644
--- a/fs/bcachefs/journal_types.h
+++ b/fs/bcachefs/journal_types.h
@@ -34,8 +34,6 @@ struct journal_buf {
bool noflush; /* write has already been kicked off, and was noflush */
bool must_flush; /* something wants a flush */
bool separate_flush;
- /* bloom filter: */
- unsigned long has_inode[1024 / sizeof(unsigned long)];
};
/*
@@ -154,6 +152,7 @@ enum {
JOURNAL_NEED_WRITE,
JOURNAL_MAY_GET_UNRESERVED,
JOURNAL_MAY_SKIP_FLUSH,
+ JOURNAL_NOCHANGES,
};
/* Embedded in struct bch_fs */
diff --git a/fs/bcachefs/migrate.c b/fs/bcachefs/migrate.c
index 6ebe49ba2248..6defc33322b3 100644
--- a/fs/bcachefs/migrate.c
+++ b/fs/bcachefs/migrate.c
@@ -39,7 +39,7 @@ static int __bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags
enum btree_id btree_id)
{
struct btree_trans trans;
- struct btree_iter *iter;
+ struct btree_iter iter;
struct bkey_s_c k;
struct bkey_buf sk;
int ret = 0;
@@ -47,13 +47,15 @@ static int __bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags
bch2_bkey_buf_init(&sk);
bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
- iter = bch2_trans_get_iter(&trans, btree_id, POS_MIN,
- BTREE_ITER_PREFETCH);
+ bch2_trans_iter_init(&trans, &iter, btree_id, POS_MIN,
+ BTREE_ITER_PREFETCH|
+ BTREE_ITER_ALL_SNAPSHOTS);
- while ((k = bch2_btree_iter_peek(iter)).k &&
+ while ((bch2_trans_begin(&trans),
+ (k = bch2_btree_iter_peek(&iter)).k) &&
!(ret = bkey_err(k))) {
if (!bch2_bkey_has_device(k, dev_idx)) {
- bch2_btree_iter_advance(iter);
+ bch2_btree_iter_advance(&iter);
continue;
}
@@ -71,9 +73,18 @@ static int __bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags
*/
bch2_extent_normalize(c, bkey_i_to_s(sk.k));
- bch2_btree_iter_set_pos(iter, bkey_start_pos(&sk.k->k));
+ /*
+ * Since we're not inserting through an extent iterator
+ * (BTREE_ITER_ALL_SNAPSHOTS iterators aren't extent iterators),
+ * we aren't using the extent overwrite path to delete, we're
+ * just using the normal key deletion path:
+ */
+ if (bkey_deleted(&sk.k->k))
+ sk.k->k.size = 0;
- ret = bch2_trans_update(&trans, iter, sk.k, 0) ?:
+ ret = bch2_btree_iter_traverse(&iter) ?:
+ bch2_trans_update(&trans, &iter, sk.k,
+ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?:
bch2_trans_commit(&trans, NULL, NULL,
BTREE_INSERT_NOFAIL);
@@ -87,9 +98,9 @@ static int __bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags
if (ret)
break;
}
- bch2_trans_iter_put(&trans, iter);
+ bch2_trans_iter_exit(&trans, &iter);
- ret = bch2_trans_exit(&trans) ?: ret;
+ bch2_trans_exit(&trans);
bch2_bkey_buf_exit(&sk, c);
BUG_ON(ret == -EINTR);
@@ -106,7 +117,7 @@ static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags)
static int bch2_dev_metadata_drop(struct bch_fs *c, unsigned dev_idx, int flags)
{
struct btree_trans trans;
- struct btree_iter *iter;
+ struct btree_iter iter;
struct closure cl;
struct btree *b;
struct bkey_buf k;
@@ -122,12 +133,16 @@ static int bch2_dev_metadata_drop(struct bch_fs *c, unsigned dev_idx, int flags)
closure_init_stack(&cl);
for (id = 0; id < BTREE_ID_NR; id++) {
- for_each_btree_node(&trans, iter, id, POS_MIN,
- BTREE_ITER_PREFETCH, b) {
+ bch2_trans_node_iter_init(&trans, &iter, id, POS_MIN, 0, 0,
+ BTREE_ITER_PREFETCH);
retry:
+ ret = 0;
+ while (bch2_trans_begin(&trans),
+ (b = bch2_btree_iter_peek_node(&iter)) &&
+ !(ret = PTR_ERR_OR_ZERO(b))) {
if (!bch2_bkey_has_device(bkey_i_to_s_c(&b->key),
dev_idx))
- continue;
+ goto next;
bch2_bkey_buf_copy(&k, c, &b->key);
@@ -138,18 +153,23 @@ retry:
break;
}
- ret = bch2_btree_node_update_key(c, iter, b, k.k);
+ ret = bch2_btree_node_update_key(&trans, &iter, b, k.k, false);
if (ret == -EINTR) {
- b = bch2_btree_iter_peek_node(iter);
ret = 0;
- goto retry;
+ continue;
}
+
if (ret) {
bch_err(c, "Error updating btree node key: %i", ret);
break;
}
+next:
+ bch2_btree_iter_next_node(&iter);
}
- bch2_trans_iter_free(&trans, iter);
+ if (ret == -EINTR)
+ goto retry;
+
+ bch2_trans_iter_exit(&trans, &iter);
if (ret)
goto err;
@@ -161,7 +181,7 @@ retry:
ret = 0;
err:
- ret = bch2_trans_exit(&trans) ?: ret;
+ bch2_trans_exit(&trans);
bch2_bkey_buf_exit(&k, c);
BUG_ON(ret == -EINTR);
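
migrate.c, move.c and journal_seq_blacklist.c all drop for_each_btree_node() in favour of the same explicit node walk: a node iterator initialized once, bch2_trans_begin() at the top of each pass, and a retry label for -EINTR. The shared skeleton, condensed:

static int walk_btree_nodes_sketch(struct btree_trans *trans, enum btree_id id)
{
	struct btree_iter iter;
	struct btree *b;
	int ret;

	bch2_trans_node_iter_init(trans, &iter, id, POS_MIN, 0, 0,
				  BTREE_ITER_PREFETCH);
retry:
	ret = 0;
	while (bch2_trans_begin(trans),
	       (b = bch2_btree_iter_peek_node(&iter)) &&
	       !(ret = PTR_ERR_OR_ZERO(b))) {
		/* operate on b; if an update returns -EINTR, continue without advancing */
		bch2_btree_iter_next_node(&iter);
	}

	if (ret == -EINTR)
		goto retry;

	bch2_trans_iter_exit(trans, &iter);
	return ret;
}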
diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c
index abcc852731a6..64e39c10e34b 100644
--- a/fs/bcachefs/move.c
+++ b/fs/bcachefs/move.c
@@ -8,11 +8,13 @@
#include "btree_update_interior.h"
#include "buckets.h"
#include "disk_groups.h"
+#include "ec.h"
#include "inode.h"
#include "io.h"
#include "journal_reclaim.h"
#include "move.h"
#include "replicas.h"
+#include "subvolume.h"
#include "super-io.h"
#include "keylist.h"
@@ -53,13 +55,89 @@ struct moving_context {
wait_queue_head_t wait;
};
+static int insert_snapshot_whiteouts(struct btree_trans *trans,
+ enum btree_id id,
+ struct bpos old_pos,
+ struct bpos new_pos)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_iter iter, update_iter;
+ struct bkey_s_c k;
+ struct snapshots_seen s;
+ int ret;
+
+ if (!btree_type_has_snapshots(id))
+ return 0;
+
+ snapshots_seen_init(&s);
+
+ if (!bkey_cmp(old_pos, new_pos))
+ return 0;
+
+ if (!snapshot_t(c, old_pos.snapshot)->children[0])
+ return 0;
+
+ bch2_trans_iter_init(trans, &iter, id, old_pos,
+ BTREE_ITER_NOT_EXTENTS|
+ BTREE_ITER_ALL_SNAPSHOTS);
+ while (1) {
+next:
+ k = bch2_btree_iter_prev(&iter);
+ ret = bkey_err(k);
+ if (ret)
+ break;
+
+ if (bkey_cmp(old_pos, k.k->p))
+ break;
+
+ if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, old_pos.snapshot)) {
+ struct bkey_i *update;
+ size_t i;
+
+ for (i = 0; i < s.nr; i++)
+ if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, s.d[i]))
+ goto next;
+
+ update = bch2_trans_kmalloc(trans, sizeof(struct bkey_i));
+
+ ret = PTR_ERR_OR_ZERO(update);
+ if (ret)
+ break;
+
+ bkey_init(&update->k);
+ update->k.p = new_pos;
+ update->k.p.snapshot = k.k->p.snapshot;
+
+ bch2_trans_iter_init(trans, &update_iter, id, update->k.p,
+ BTREE_ITER_NOT_EXTENTS|
+ BTREE_ITER_ALL_SNAPSHOTS|
+ BTREE_ITER_INTENT);
+ ret = bch2_btree_iter_traverse(&update_iter) ?:
+ bch2_trans_update(trans, &update_iter, update,
+ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
+ bch2_trans_iter_exit(trans, &update_iter);
+ if (ret)
+ break;
+
+ ret = snapshots_seen_add(c, &s, k.k->p.snapshot);
+ if (ret)
+ break;
+ }
+ }
+ bch2_trans_iter_exit(trans, &iter);
+ kfree(s.d);
+
+ return ret;
+}
+
static int bch2_migrate_index_update(struct bch_write_op *op)
{
struct bch_fs *c = op->c;
struct btree_trans trans;
- struct btree_iter *iter;
+ struct btree_iter iter;
struct migrate_write *m =
container_of(op, struct migrate_write, op);
+ struct open_bucket *ec_ob = ec_open_bucket(c, &op->open_buckets);
struct keylist *keys = &op->insert_keys;
struct bkey_buf _new, _insert;
int ret = 0;
@@ -70,9 +148,9 @@ static int bch2_migrate_index_update(struct bch_write_op *op)
bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024);
- iter = bch2_trans_get_iter(&trans, m->btree_id,
- bkey_start_pos(&bch2_keylist_front(keys)->k),
- BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
+ bch2_trans_iter_init(&trans, &iter, m->btree_id,
+ bkey_start_pos(&bch2_keylist_front(keys)->k),
+ BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
while (1) {
struct bkey_s_c k;
@@ -80,13 +158,14 @@ static int bch2_migrate_index_update(struct bch_write_op *op)
struct bkey_i_extent *new;
const union bch_extent_entry *entry;
struct extent_ptr_decoded p;
+ struct bpos next_pos;
bool did_work = false;
- bool extending = false, should_check_enospc;
+ bool should_check_enospc;
s64 i_sectors_delta = 0, disk_sectors_delta = 0;
- bch2_trans_reset(&trans, 0);
+ bch2_trans_begin(&trans);
- k = bch2_btree_iter_peek_slot(iter);
+ k = bch2_btree_iter_peek_slot(&iter);
ret = bkey_err(k);
if (ret)
goto err;
@@ -102,9 +181,9 @@ static int bch2_migrate_index_update(struct bch_write_op *op)
bch2_bkey_buf_copy(&_new, c, bch2_keylist_front(keys));
new = bkey_i_to_extent(_new.k);
- bch2_cut_front(iter->pos, &new->k_i);
+ bch2_cut_front(iter.pos, &new->k_i);
- bch2_cut_front(iter->pos, insert);
+ bch2_cut_front(iter.pos, insert);
bch2_cut_back(new->k.p, insert);
bch2_cut_back(insert->k.p, &new->k_i);
@@ -119,7 +198,7 @@ static int bch2_migrate_index_update(struct bch_write_op *op)
extent_for_each_ptr(extent_i_to_s(new), new_ptr)
new_ptr->cached = true;
- bch2_bkey_drop_ptr(bkey_i_to_s(insert), old_ptr);
+ __bch2_bkey_drop_ptr(bkey_i_to_s(insert), old_ptr);
}
extent_for_each_ptr_decode(extent_i_to_s(new), p, entry) {
@@ -146,8 +225,7 @@ static int bch2_migrate_index_update(struct bch_write_op *op)
op->opts.background_target,
op->opts.data_replicas);
- ret = bch2_sum_sector_overwrites(&trans, iter, insert,
- &extending,
+ ret = bch2_sum_sector_overwrites(&trans, &iter, insert,
&should_check_enospc,
&i_sectors_delta,
&disk_sectors_delta);
@@ -163,20 +241,29 @@ static int bch2_migrate_index_update(struct bch_write_op *op)
goto out;
}
- ret = bch2_trans_update(&trans, iter, insert, 0) ?:
+ next_pos = insert->k.p;
+
+ ret = insert_snapshot_whiteouts(&trans, m->btree_id,
+ k.k->p, insert->k.p) ?:
+ bch2_trans_update(&trans, &iter, insert,
+ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?:
bch2_trans_commit(&trans, &op->res,
op_journal_seq(op),
BTREE_INSERT_NOFAIL|
m->data_opts.btree_insert_flags);
-err:
- if (!ret)
+ if (!ret) {
+ bch2_btree_iter_set_pos(&iter, next_pos);
atomic_long_inc(&c->extent_migrate_done);
+ if (ec_ob)
+ bch2_ob_add_backpointer(c, ec_ob, &insert->k);
+ }
+err:
if (ret == -EINTR)
ret = 0;
if (ret)
break;
next:
- while (bkey_cmp(iter->pos, bch2_keylist_front(keys)->k.p) >= 0) {
+ while (bkey_cmp(iter.pos, bch2_keylist_front(keys)->k.p) >= 0) {
bch2_keylist_pop_front(keys);
if (bch2_keylist_empty(keys))
goto out;
@@ -184,18 +271,18 @@ next:
continue;
nomatch:
if (m->ctxt) {
- BUG_ON(k.k->p.offset <= iter->pos.offset);
+ BUG_ON(k.k->p.offset <= iter.pos.offset);
atomic64_inc(&m->ctxt->stats->keys_raced);
- atomic64_add(k.k->p.offset - iter->pos.offset,
+ atomic64_add(k.k->p.offset - iter.pos.offset,
&m->ctxt->stats->sectors_raced);
}
atomic_long_inc(&c->extent_migrate_raced);
trace_move_race(&new->k);
- bch2_btree_iter_next_slot(iter);
+ bch2_btree_iter_advance(&iter);
goto next;
}
out:
- bch2_trans_iter_put(&trans, iter);
+ bch2_trans_iter_exit(&trans, &iter);
bch2_trans_exit(&trans);
bch2_bkey_buf_exit(&_insert, c);
bch2_bkey_buf_exit(&_new, c);
@@ -216,11 +303,6 @@ void bch2_migrate_read_done(struct migrate_write *m, struct bch_read_bio *rbio)
m->op.crc = rbio->pick.crc;
m->op.wbio.bio.bi_iter.bi_size = m->op.crc.compressed_size << 9;
- if (bch2_csum_type_is_encryption(m->op.crc.csum_type)) {
- m->op.nonce = m->op.crc.nonce + m->op.crc.offset;
- m->op.csum_type = m->op.crc.csum_type;
- }
-
if (m->data_cmd == DATA_REWRITE)
bch2_dev_list_drop_dev(&m->op.devs_have, m->data_opts.rewrite_dev);
}
@@ -235,6 +317,7 @@ int bch2_migrate_write_init(struct bch_fs *c, struct migrate_write *m,
{
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
const union bch_extent_entry *entry;
+ struct bch_extent_crc_unpacked crc;
struct extent_ptr_decoded p;
int ret;
@@ -255,6 +338,18 @@ int bch2_migrate_write_init(struct bch_fs *c, struct migrate_write *m,
m->op.target = data_opts.target,
m->op.write_point = wp;
+ /*
+ * op->csum_type is normally initialized from the fs/file's current
+ * options - but if an extent is encrypted, we require that it stays
+ * encrypted:
+ */
+ bkey_for_each_crc(k.k, ptrs, crc, entry)
+ if (bch2_csum_type_is_encryption(crc.csum_type)) {
+ m->op.nonce = crc.nonce + crc.offset;
+ m->op.csum_type = crc.csum_type;
+ break;
+ }
+
if (m->data_opts.btree_insert_flags & BTREE_INSERT_USE_RESERVE) {
m->op.alloc_reserve = RESERVE_MOVINGGC;
m->op.flags |= BCH_WRITE_ALLOC_NOWAIT;
@@ -328,12 +423,12 @@ static void move_free(struct closure *cl)
{
struct moving_io *io = container_of(cl, struct moving_io, cl);
struct moving_context *ctxt = io->write.ctxt;
+ struct bvec_iter_all iter;
struct bio_vec *bv;
- unsigned i;
bch2_disk_reservation_put(io->write.op.c, &io->write.op.res);
- bio_for_each_segment_all(bv, &io->write.op.wbio.bio, i)
+ bio_for_each_segment_all(bv, &io->write.op.wbio.bio, iter)
if (bv->bv_page)
__free_page(bv->bv_page);
@@ -511,13 +606,13 @@ err:
static int lookup_inode(struct btree_trans *trans, struct bpos pos,
struct bch_inode_unpacked *inode)
{
- struct btree_iter *iter;
+ struct btree_iter iter;
struct bkey_s_c k;
int ret;
- iter = bch2_trans_get_iter(trans, BTREE_ID_inodes, pos,
- BTREE_ITER_ALL_SNAPSHOTS);
- k = bch2_btree_iter_peek(iter);
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_inodes, pos,
+ BTREE_ITER_ALL_SNAPSHOTS);
+ k = bch2_btree_iter_peek(&iter);
ret = bkey_err(k);
if (ret)
goto err;
@@ -527,15 +622,15 @@ static int lookup_inode(struct btree_trans *trans, struct bpos pos,
goto err;
}
- ret = k.k->type == KEY_TYPE_inode ? 0 : -EIO;
+ ret = bkey_is_inode(k.k) ? 0 : -EIO;
if (ret)
goto err;
- ret = bch2_inode_unpack(bkey_s_c_to_inode(k), inode);
+ ret = bch2_inode_unpack(k, inode);
if (ret)
goto err;
err:
- bch2_trans_iter_put(trans, iter);
+ bch2_trans_iter_exit(trans, &iter);
return ret;
}
@@ -553,7 +648,7 @@ static int __bch2_move_data(struct bch_fs *c,
struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts);
struct bkey_buf sk;
struct btree_trans trans;
- struct btree_iter *iter;
+ struct btree_iter iter;
struct bkey_s_c k;
struct data_opts data_opts;
enum data_cmd data_cmd;
@@ -567,8 +662,9 @@ static int __bch2_move_data(struct bch_fs *c,
stats->btree_id = btree_id;
stats->pos = start;
- iter = bch2_trans_get_iter(&trans, btree_id, start,
- BTREE_ITER_PREFETCH);
+ bch2_trans_iter_init(&trans, &iter, btree_id, start,
+ BTREE_ITER_PREFETCH|
+ BTREE_ITER_ALL_SNAPSHOTS);
if (rate)
bch2_ratelimit_reset(rate);
@@ -597,9 +693,11 @@ static int __bch2_move_data(struct bch_fs *c,
}
} while (delay);
- k = bch2_btree_iter_peek(iter);
+ bch2_trans_begin(&trans);
+
+ k = bch2_btree_iter_peek(&iter);
- stats->pos = iter->pos;
+ stats->pos = iter.pos;
if (!k.k)
break;
@@ -652,8 +750,7 @@ static int __bch2_move_data(struct bch_fs *c,
data_cmd, data_opts);
if (ret2) {
if (ret2 == -EINTR) {
- bch2_trans_reset(&trans, 0);
- bch2_trans_cond_resched(&trans);
+ bch2_trans_begin(&trans);
continue;
}
@@ -673,18 +770,41 @@ next:
atomic64_add(k.k->size * bch2_bkey_nr_ptrs_allocated(k),
&stats->sectors_seen);
next_nondata:
- bch2_btree_iter_advance(iter);
- bch2_trans_cond_resched(&trans);
+ bch2_btree_iter_advance(&iter);
}
out:
- bch2_trans_iter_put(&trans, iter);
- ret = bch2_trans_exit(&trans) ?: ret;
+ bch2_trans_iter_exit(&trans, &iter);
+ bch2_trans_exit(&trans);
bch2_bkey_buf_exit(&sk, c);
return ret;
}
+inline void bch_move_stats_init(struct bch_move_stats *stats, char *name)
+{
+ memset(stats, 0, sizeof(*stats));
+
+ scnprintf(stats->name, sizeof(stats->name),
+ "%s", name);
+}
+
+static inline void progress_list_add(struct bch_fs *c,
+ struct bch_move_stats *stats)
+{
+ mutex_lock(&c->data_progress_lock);
+ list_add(&stats->list, &c->data_progress_list);
+ mutex_unlock(&c->data_progress_lock);
+}
+
+static inline void progress_list_del(struct bch_fs *c,
+ struct bch_move_stats *stats)
+{
+ mutex_lock(&c->data_progress_lock);
+ list_del(&stats->list);
+ mutex_unlock(&c->data_progress_lock);
+}
+
int bch2_move_data(struct bch_fs *c,
enum btree_id start_btree_id, struct bpos start_pos,
enum btree_id end_btree_id, struct bpos end_pos,
@@ -697,6 +817,7 @@ int bch2_move_data(struct bch_fs *c,
enum btree_id id;
int ret;
+ progress_list_add(c, stats);
closure_init_stack(&ctxt.cl);
INIT_LIST_HEAD(&ctxt.reads);
init_waitqueue_head(&ctxt.wait);
@@ -730,6 +851,7 @@ int bch2_move_data(struct bch_fs *c,
atomic64_read(&stats->sectors_moved),
atomic64_read(&stats->keys_moved));
+ progress_list_del(c, stats);
return ret;
}
@@ -746,7 +868,7 @@ static int bch2_move_btree(struct bch_fs *c,
bool kthread = (current->flags & PF_KTHREAD) != 0;
struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts);
struct btree_trans trans;
- struct btree_iter *iter;
+ struct btree_iter iter;
struct btree *b;
enum btree_id id;
struct data_opts data_opts;
@@ -754,6 +876,7 @@ static int bch2_move_btree(struct bch_fs *c,
int ret = 0;
bch2_trans_init(&trans, c, 0, 0);
+ progress_list_add(c, stats);
stats->data_type = BCH_DATA_btree;
@@ -762,17 +885,21 @@ static int bch2_move_btree(struct bch_fs *c,
id++) {
stats->btree_id = id;
- for_each_btree_node(&trans, iter, id,
- id == start_btree_id ? start_pos : POS_MIN,
- BTREE_ITER_PREFETCH, b) {
+ bch2_trans_node_iter_init(&trans, &iter, id, POS_MIN, 0, 0,
+ BTREE_ITER_PREFETCH);
+retry:
+ ret = 0;
+ while (bch2_trans_begin(&trans),
+ (b = bch2_btree_iter_peek_node(&iter)) &&
+ !(ret = PTR_ERR_OR_ZERO(b))) {
if (kthread && kthread_should_stop())
break;
if ((cmp_int(id, end_btree_id) ?:
- bkey_cmp(b->key.k.p, end_pos)) > 0)
+ bpos_cmp(b->key.k.p, end_pos)) > 0)
break;
- stats->pos = iter->pos;
+ stats->pos = iter.pos;
switch ((cmd = pred(c, arg, b, &io_opts, &data_opts))) {
case DATA_SKIP:
@@ -786,13 +913,19 @@ static int bch2_move_btree(struct bch_fs *c,
BUG();
}
- ret = bch2_btree_node_rewrite(c, iter,
- b->data->keys.seq, 0) ?: ret;
+ ret = bch2_btree_node_rewrite(&trans, &iter, b, 0) ?: ret;
+ if (ret == -EINTR)
+ continue;
+ if (ret)
+ break;
next:
- bch2_trans_cond_resched(&trans);
+ bch2_btree_iter_next_node(&iter);
}
+ if (ret == -EINTR)
+ goto retry;
+
+ bch2_trans_iter_exit(&trans, &iter);
- ret = bch2_trans_iter_free(&trans, iter) ?: ret;
if (kthread && kthread_should_stop())
break;
}
@@ -802,6 +935,11 @@ next:
if (ret)
bch_err(c, "error %i in bch2_move_btree", ret);
+ /* flush relevant btree updates */
+ closure_wait_event(&c->btree_interior_update_wait,
+ !bch2_btree_interior_updates_nr_pending(c));
+
+ progress_list_del(c, stats);
return ret;
}
@@ -821,16 +959,9 @@ static enum data_cmd rereplicate_pred(struct bch_fs *c, void *arg,
struct data_opts *data_opts)
{
unsigned nr_good = bch2_bkey_durability(c, k);
- unsigned replicas = 0;
-
- switch (k.k->type) {
- case KEY_TYPE_btree_ptr:
- replicas = c->opts.metadata_replicas;
- break;
- case KEY_TYPE_extent:
- replicas = io_opts->data_replicas;
- break;
- }
+ unsigned replicas = bkey_is_btree_ptr(k.k)
+ ? c->opts.metadata_replicas
+ : io_opts->data_replicas;
if (!nr_good || nr_good >= replicas)
return DATA_SKIP;
@@ -921,7 +1052,7 @@ int bch2_scan_old_btree_nodes(struct bch_fs *c, struct bch_move_stats *stats)
ret = bch2_move_btree(c,
0, POS_MIN,
- BTREE_ID_NR, POS_MAX,
+ BTREE_ID_NR, SPOS_MAX,
rewrite_old_nodes_pred, c, stats);
if (!ret) {
mutex_lock(&c->sb_lock);
@@ -943,6 +1074,7 @@ int bch2_data_job(struct bch_fs *c,
switch (op.op) {
case BCH_DATA_OP_REREPLICATE:
+ bch_move_stats_init(stats, "rereplicate");
stats->data_type = BCH_DATA_journal;
ret = bch2_journal_flush_device_pins(&c->journal, -1);
@@ -950,10 +1082,6 @@ int bch2_data_job(struct bch_fs *c,
op.start_btree, op.start_pos,
op.end_btree, op.end_pos,
rereplicate_btree_pred, c, stats) ?: ret;
-
- closure_wait_event(&c->btree_interior_update_wait,
- !bch2_btree_interior_updates_nr_pending(c));
-
ret = bch2_replicas_gc2(c) ?: ret;
ret = bch2_move_data(c,
@@ -967,6 +1095,7 @@ int bch2_data_job(struct bch_fs *c,
if (op.migrate.dev >= c->sb.nr_devices)
return -EINVAL;
+ bch_move_stats_init(stats, "migrate");
stats->data_type = BCH_DATA_journal;
ret = bch2_journal_flush_device_pins(&c->journal, op.migrate.dev);
@@ -984,6 +1113,7 @@ int bch2_data_job(struct bch_fs *c,
ret = bch2_replicas_gc2(c) ?: ret;
break;
case BCH_DATA_OP_REWRITE_OLD_NODES:
+ bch_move_stats_init(stats, "rewrite_old_nodes");
ret = bch2_scan_old_btree_nodes(c, stats);
break;
default:
diff --git a/fs/bcachefs/move.h b/fs/bcachefs/move.h
index 5076153689d1..2a789a1158ca 100644
--- a/fs/bcachefs/move.h
+++ b/fs/bcachefs/move.h
@@ -66,4 +66,8 @@ int bch2_data_job(struct bch_fs *,
struct bch_move_stats *,
struct bch_ioctl_data);
+void bch_move_stats_init(struct bch_move_stats *stats,
+ char *name);
+
+
#endif /* _BCACHEFS_MOVE_H */
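
move.h only declares bch_move_stats_init(); the definition lives in move.c, outside this excerpt. As a rough sketch of what such an initializer plausibly does, assuming the name[32] and list fields added to struct bch_move_stats in the move_types.h hunk that follows (this body is a guess, not the patch's actual definition):

	static void sketch_move_stats_init(struct bch_move_stats *stats, char *name)
	{
		memset(stats, 0, sizeof(*stats));
		/* bounded copy into the new name[32] field */
		scnprintf(stats->name, sizeof(stats->name), "%s", name);
	}
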
diff --git a/fs/bcachefs/move_types.h b/fs/bcachefs/move_types.h
index fc0de165af9f..9df6d18137a5 100644
--- a/fs/bcachefs/move_types.h
+++ b/fs/bcachefs/move_types.h
@@ -6,6 +6,8 @@ struct bch_move_stats {
enum bch_data_type data_type;
enum btree_id btree_id;
struct bpos pos;
+ struct list_head list;
+ char name[32];
atomic64_t keys_moved;
atomic64_t keys_raced;
diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c
index 2acca0ddb6fd..5c9eafc026c9 100644
--- a/fs/bcachefs/movinggc.c
+++ b/fs/bcachefs/movinggc.c
@@ -85,6 +85,7 @@ static enum data_cmd copygc_pred(struct bch_fs *c, void *arg,
BUG_ON(i != j);
#endif
if (i >= 0 &&
+ p.ptr.dev == h->data[i].dev &&
p.ptr.offset < h->data[i].offset + ca->mi.bucket_size &&
p.ptr.gen == h->data[i].gen) {
/*
@@ -146,7 +147,8 @@ static int bch2_copygc(struct bch_fs *c)
size_t b, heap_size = 0;
int ret;
- memset(&move_stats, 0, sizeof(move_stats));
+ bch_move_stats_init(&move_stats, "copygc");
+
/*
* Find buckets with lowest sector counts, skipping completely
* empty buckets, by building a maxheap sorted by sector count,
diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c
index 64bf5a382d63..a955ef2008c9 100644
--- a/fs/bcachefs/opts.c
+++ b/fs/bcachefs/opts.c
@@ -31,17 +31,32 @@ const char * const bch2_btree_ids[] = {
NULL
};
+const char * const bch2_csum_types[] = {
+ BCH_CSUM_TYPES()
+ NULL
+};
+
const char * const bch2_csum_opts[] = {
BCH_CSUM_OPTS()
NULL
};
+const char * const bch2_compression_types[] = {
+ BCH_COMPRESSION_TYPES()
+ NULL
+};
+
const char * const bch2_compression_opts[] = {
BCH_COMPRESSION_OPTS()
NULL
};
const char * const bch2_str_hash_types[] = {
+ BCH_STR_HASH_TYPES()
+ NULL
+};
+
+const char * const bch2_str_hash_opts[] = {
BCH_STR_HASH_OPTS()
NULL
};
@@ -63,6 +78,19 @@ const char * const bch2_member_states[] = {
#undef x
+const char * const bch2_d_types[BCH_DT_MAX] = {
+ [DT_UNKNOWN] = "unknown",
+ [DT_FIFO] = "fifo",
+ [DT_CHR] = "chr",
+ [DT_DIR] = "dir",
+ [DT_BLK] = "blk",
+ [DT_REG] = "reg",
+ [DT_LNK] = "lnk",
+ [DT_SOCK] = "sock",
+ [DT_WHT] = "whiteout",
+ [DT_SUBVOL] = "subvol",
+};
+
void bch2_opts_apply(struct bch_opts *dst, struct bch_opts src)
{
#define x(_name, ...) \
diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h
index 1e2fc5de5ca4..afb1bb2a62d2 100644
--- a/fs/bcachefs/opts.h
+++ b/fs/bcachefs/opts.h
@@ -12,12 +12,21 @@ extern const char * const bch2_error_actions[];
extern const char * const bch2_sb_features[];
extern const char * const bch2_sb_compat[];
extern const char * const bch2_btree_ids[];
+extern const char * const bch2_csum_types[];
extern const char * const bch2_csum_opts[];
+extern const char * const bch2_compression_types[];
extern const char * const bch2_compression_opts[];
extern const char * const bch2_str_hash_types[];
+extern const char * const bch2_str_hash_opts[];
extern const char * const bch2_data_types[];
extern const char * const bch2_cache_replacement_policies[];
extern const char * const bch2_member_states[];
+extern const char * const bch2_d_types[];
+
+static inline const char *bch2_d_type_str(unsigned d_type)
+{
+ return (d_type < BCH_DT_MAX ? bch2_d_types[d_type] : NULL) ?: "(bad d_type)";
+}
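
A hypothetical use of the new helper: bch2_d_type_str(DT_DIR) returns "dir" from the table above, and any value outside the table falls back to "(bad d_type)", so callers can print d_type fields without bounds checks:

	pr_buf(out, "type %s", bch2_d_type_str(DT_DIR));	/* prints "type dir"; 'out' is a hypothetical printbuf */
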
/*
* Mount options; we also store defaults in the superblock.
@@ -134,7 +143,7 @@ enum opt_type {
NULL, NULL) \
x(str_hash, u8, \
OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
- OPT_STR(bch2_str_hash_types), \
+ OPT_STR(bch2_str_hash_opts), \
BCH_SB_STR_HASH_TYPE, BCH_STR_HASH_OPT_siphash, \
NULL, "Hash function for directory entries and xattrs")\
x(metadata_target, u16, \
@@ -170,8 +179,18 @@ enum opt_type {
x(shard_inode_numbers, u8, \
OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
OPT_BOOL(), \
- BCH_SB_SHARD_INUMS, false, \
+ BCH_SB_SHARD_INUMS, true, \
NULL, "Shard new inode numbers by CPU id") \
+ x(inodes_use_key_cache, u8, \
+ OPT_FORMAT|OPT_MOUNT, \
+ OPT_BOOL(), \
+ BCH_SB_INODES_USE_KEY_CACHE, true, \
+ NULL, "Use the btree key cache for the inodes btree") \
+ x(btree_node_mem_ptr_optimization, u8, \
+ OPT_MOUNT|OPT_RUNTIME, \
+ OPT_BOOL(), \
+ NO_SB_OPT, true, \
+ NULL, "Stash pointer to in memory btree node in btree ptr")\
x(gc_reserve_percent, u8, \
OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
OPT_UINT(5, 21), \
diff --git a/fs/bcachefs/quota.c b/fs/bcachefs/quota.c
index 35b409e0f366..8f8f4b0accd6 100644
--- a/fs/bcachefs/quota.c
+++ b/fs/bcachefs/quota.c
@@ -3,6 +3,7 @@
#include "btree_update.h"
#include "inode.h"
#include "quota.h"
+#include "subvolume.h"
#include "super-io.h"
static const char *bch2_sb_validate_quota(struct bch_sb *sb,
@@ -357,7 +358,7 @@ static int __bch2_quota_set(struct bch_fs *c, struct bkey_s_c k)
static int bch2_quota_init_type(struct bch_fs *c, enum quota_types type)
{
struct btree_trans trans;
- struct btree_iter *iter;
+ struct btree_iter iter;
struct bkey_s_c k;
int ret = 0;
@@ -372,9 +373,10 @@ static int bch2_quota_init_type(struct bch_fs *c, enum quota_types type)
if (ret)
break;
}
- bch2_trans_iter_put(&trans, iter);
+ bch2_trans_iter_exit(&trans, &iter);
- return bch2_trans_exit(&trans) ?: ret;
+ bch2_trans_exit(&trans);
+ return ret;
}
void bch2_fs_quota_exit(struct bch_fs *c)
@@ -414,14 +416,55 @@ static void bch2_sb_quota_read(struct bch_fs *c)
}
}
+static int bch2_fs_quota_read_inode(struct btree_trans *trans,
+ struct btree_iter *iter)
+{
+ struct bch_fs *c = trans->c;
+ struct bch_inode_unpacked u;
+ struct bch_subvolume subvolume;
+ struct bkey_s_c k;
+ int ret;
+
+ k = bch2_btree_iter_peek(iter);
+ ret = bkey_err(k);
+ if (ret)
+ return ret;
+
+ if (!k.k)
+ return 1;
+
+ ret = bch2_snapshot_get_subvol(trans, k.k->p.snapshot, &subvolume);
+ if (ret)
+ return ret;
+
+ /*
+ * We don't do quota accounting in snapshots:
+ */
+ if (BCH_SUBVOLUME_SNAP(&subvolume))
+ goto advance;
+
+ if (!bkey_is_inode(k.k))
+ goto advance;
+
+ ret = bch2_inode_unpack(k, &u);
+ if (ret)
+ return ret;
+
+ bch2_quota_acct(c, bch_qid(&u), Q_SPC, u.bi_sectors,
+ KEY_TYPE_QUOTA_NOCHECK);
+ bch2_quota_acct(c, bch_qid(&u), Q_INO, 1,
+ KEY_TYPE_QUOTA_NOCHECK);
+advance:
+ bch2_btree_iter_set_pos(iter, POS(iter->pos.inode, iter->pos.offset + 1));
+ return 0;
+}
+
int bch2_fs_quota_read(struct bch_fs *c)
{
unsigned i, qtypes = enabled_qtypes(c);
struct bch_memquota_type *q;
struct btree_trans trans;
- struct btree_iter *iter;
- struct bch_inode_unpacked u;
- struct bkey_s_c k;
+ struct btree_iter iter;
int ret;
mutex_lock(&c->sb_lock);
@@ -436,23 +479,18 @@ int bch2_fs_quota_read(struct bch_fs *c)
bch2_trans_init(&trans, c, 0, 0);
- for_each_btree_key(&trans, iter, BTREE_ID_inodes, POS_MIN,
- BTREE_ITER_PREFETCH, k, ret) {
- switch (k.k->type) {
- case KEY_TYPE_inode:
- ret = bch2_inode_unpack(bkey_s_c_to_inode(k), &u);
- if (ret)
- return ret;
-
- bch2_quota_acct(c, bch_qid(&u), Q_SPC, u.bi_sectors,
- KEY_TYPE_QUOTA_NOCHECK);
- bch2_quota_acct(c, bch_qid(&u), Q_INO, 1,
- KEY_TYPE_QUOTA_NOCHECK);
- }
- }
- bch2_trans_iter_put(&trans, iter);
-
- return bch2_trans_exit(&trans) ?: ret;
+ bch2_trans_iter_init(&trans, &iter, BTREE_ID_inodes, POS_MIN,
+ BTREE_ITER_INTENT|
+ BTREE_ITER_PREFETCH|
+ BTREE_ITER_ALL_SNAPSHOTS);
+ do {
+ ret = lockrestart_do(&trans,
+ bch2_fs_quota_read_inode(&trans, &iter));
+ } while (!ret);
+ bch2_trans_iter_exit(&trans, &iter);
+
+ bch2_trans_exit(&trans);
+ return ret < 0 ? ret : 0;
}
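
The loop above relies on a return-value convention for the per-key helper that is easy to miss: 0 means keep going, 1 means the end of the btree was reached, and negative values are errors (-EINTR is retried inside lockrestart_do()). A condensed sketch of the same driver pattern, with walk_one_key standing in as a hypothetical helper:

	do {
		ret = lockrestart_do(&trans, walk_one_key(&trans, &iter));
	} while (!ret);			/* stop on 1 ("done") or a real error */
	ret = ret < 0 ? ret : 0;	/* 1 is not an error to the caller */
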
/* Enable/disable/delete quotas for an entire filesystem: */
@@ -717,13 +755,13 @@ static int bch2_set_quota_trans(struct btree_trans *trans,
struct bkey_i_quota *new_quota,
struct qc_dqblk *qdq)
{
- struct btree_iter *iter;
+ struct btree_iter iter;
struct bkey_s_c k;
int ret;
- iter = bch2_trans_get_iter(trans, BTREE_ID_quotas, new_quota->k.p,
- BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
- k = bch2_btree_iter_peek_slot(iter);
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_quotas, new_quota->k.p,
+ BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
+ k = bch2_btree_iter_peek_slot(&iter);
ret = bkey_err(k);
if (unlikely(ret))
@@ -742,8 +780,8 @@ static int bch2_set_quota_trans(struct btree_trans *trans,
if (qdq->d_fieldmask & QC_INO_HARD)
new_quota->v.c[Q_INO].hardlimit = cpu_to_le64(qdq->d_ino_hardlimit);
- ret = bch2_trans_update(trans, iter, &new_quota->k_i, 0);
- bch2_trans_iter_put(trans, iter);
+ ret = bch2_trans_update(trans, &iter, &new_quota->k_i, 0);
+ bch2_trans_iter_exit(trans, &iter);
return ret;
}
@@ -760,7 +798,7 @@ static int bch2_set_quota(struct super_block *sb, struct kqid qid,
bkey_quota_init(&new_quota.k_i);
new_quota.k.p = POS(qid.type, from_kqid(&init_user_ns, qid));
- ret = bch2_trans_do(c, NULL, NULL, BTREE_INSERT_NOUNLOCK,
+ ret = bch2_trans_do(c, NULL, NULL, 0,
bch2_set_quota_trans(&trans, &new_quota, qdq)) ?:
__bch2_quota_set(c, bkey_i_to_s_c(&new_quota.k_i));
diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c
index a0dbf41d1d37..a573fede05b1 100644
--- a/fs/bcachefs/rebalance.c
+++ b/fs/bcachefs/rebalance.c
@@ -166,6 +166,7 @@ static int bch2_rebalance_thread(void *arg)
struct bch_fs_rebalance *r = &c->rebalance;
struct io_clock *clock = &c->io_clock[WRITE];
struct rebalance_work w, p;
+ struct bch_move_stats move_stats;
unsigned long start, prev_start;
unsigned long prev_run_time, prev_run_cputime;
unsigned long cputime, prev_cputime;
@@ -179,6 +180,7 @@ static int bch2_rebalance_thread(void *arg)
prev_start = jiffies;
prev_cputime = curr_cputime();
+ bch_move_stats_init(&move_stats, "rebalance");
while (!kthread_wait_freezable(r->enabled)) {
cond_resched();
@@ -235,7 +237,7 @@ static int bch2_rebalance_thread(void *arg)
prev_cputime = cputime;
r->state = REBALANCE_RUNNING;
- memset(&r->move_stats, 0, sizeof(r->move_stats));
+ memset(&move_stats, 0, sizeof(move_stats));
rebalance_work_reset(c);
bch2_move_data(c,
@@ -245,7 +247,7 @@ static int bch2_rebalance_thread(void *arg)
NULL, /* &r->pd.rate, */
writepoint_ptr(&c->rebalance_write_point),
rebalance_pred, NULL,
- &r->move_stats);
+ &move_stats);
}
return 0;
@@ -281,10 +283,7 @@ void bch2_rebalance_work_to_text(struct printbuf *out, struct bch_fs *c)
h1);
break;
case REBALANCE_RUNNING:
- pr_buf(out, "running\n"
- "pos ");
- bch2_bpos_to_text(out, r->move_stats.pos);
- pr_buf(out, "\n");
+ pr_buf(out, "running\n");
break;
}
}
diff --git a/fs/bcachefs/rebalance_types.h b/fs/bcachefs/rebalance_types.h
index 2f62a643c39f..7462a92e9598 100644
--- a/fs/bcachefs/rebalance_types.h
+++ b/fs/bcachefs/rebalance_types.h
@@ -19,7 +19,6 @@ struct bch_fs_rebalance {
enum rebalance_state state;
u64 throttled_until_iotime;
unsigned long throttled_until_cputime;
- struct bch_move_stats move_stats;
unsigned enabled:1;
};
diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c
index 9bd6348842e0..c3b4d116275c 100644
--- a/fs/bcachefs/recovery.c
+++ b/fs/bcachefs/recovery.c
@@ -20,6 +20,7 @@
#include "quota.h"
#include "recovery.h"
#include "replicas.h"
+#include "subvolume.h"
#include "super-io.h"
#include <linux/sort.h>
@@ -39,6 +40,20 @@ static void drop_alloc_keys(struct journal_keys *keys)
keys->nr = dst;
}
+/*
+ * Btree node pointers have a field to stash a pointer to the in memory btree
+ * node; we need to zero out this field when reading in btree nodes, or when
+ * reading in keys from the journal:
+ */
+static void zero_out_btree_mem_ptr(struct journal_keys *keys)
+{
+ struct journal_key *i;
+
+ for (i = keys->d; i < keys->d + keys->nr; i++)
+ if (i->k->k.type == KEY_TYPE_btree_ptr_v2)
+ bkey_i_to_btree_ptr_v2(i->k)->v.mem_ptr = 0;
+}
+
/* iterate over keys read from the journal: */
static int __journal_key_cmp(enum btree_id l_btree_id,
@@ -312,7 +327,7 @@ static void btree_and_journal_iter_prefetch(struct bch_fs *c, struct btree *b,
(k = bch2_btree_and_journal_iter_peek(&iter)).k) {
bch2_bkey_buf_reassemble(&tmp, c, k);
- bch2_btree_node_prefetch(c, NULL, tmp.k,
+ bch2_btree_node_prefetch(c, NULL, NULL, tmp.k,
b->c.btree_id, b->c.level - 1);
bch2_btree_and_journal_iter_advance(&iter);
@@ -322,10 +337,11 @@ static void btree_and_journal_iter_prefetch(struct bch_fs *c, struct btree *b,
bch2_bkey_buf_exit(&tmp, c);
}
-static int bch2_btree_and_journal_walk_recurse(struct bch_fs *c, struct btree *b,
+static int bch2_btree_and_journal_walk_recurse(struct btree_trans *trans, struct btree *b,
enum btree_id btree_id,
btree_walk_key_fn key_fn)
{
+ struct bch_fs *c = trans->c;
struct btree_and_journal_iter iter;
struct bkey_s_c k;
struct bkey_buf tmp;
@@ -349,11 +365,11 @@ static int bch2_btree_and_journal_walk_recurse(struct bch_fs *c, struct btree *b
btree_and_journal_iter_prefetch(c, b, iter);
- ret = bch2_btree_and_journal_walk_recurse(c, child,
+ ret = bch2_btree_and_journal_walk_recurse(trans, child,
btree_id, key_fn);
six_unlock_read(&child->c.lock);
} else {
- ret = key_fn(c, k);
+ ret = key_fn(trans, k);
}
if (ret)
@@ -367,9 +383,10 @@ static int bch2_btree_and_journal_walk_recurse(struct bch_fs *c, struct btree *b
return ret;
}
-int bch2_btree_and_journal_walk(struct bch_fs *c, enum btree_id btree_id,
+int bch2_btree_and_journal_walk(struct btree_trans *trans, enum btree_id btree_id,
btree_walk_key_fn key_fn)
{
+ struct bch_fs *c = trans->c;
struct btree *b = c->btree_roots[btree_id].b;
int ret = 0;
@@ -377,7 +394,7 @@ int bch2_btree_and_journal_walk(struct bch_fs *c, enum btree_id btree_id,
return 0;
six_lock_read(&b->c.lock, NULL, NULL);
- ret = bch2_btree_and_journal_walk_recurse(c, b, btree_id, key_fn);
+ ret = bch2_btree_and_journal_walk_recurse(trans, b, btree_id, key_fn);
six_unlock_read(&b->c.lock);
return ret;
@@ -501,64 +518,38 @@ static void replay_now_at(struct journal *j, u64 seq)
}
static int __bch2_journal_replay_key(struct btree_trans *trans,
- enum btree_id id, unsigned level,
- struct bkey_i *k)
+ struct journal_key *k)
{
- struct btree_iter *iter;
+ struct btree_iter iter;
+ unsigned iter_flags =
+ BTREE_ITER_INTENT|
+ BTREE_ITER_NOT_EXTENTS;
int ret;
- iter = bch2_trans_get_node_iter(trans, id, k->k.p,
- BTREE_MAX_DEPTH, level,
- BTREE_ITER_INTENT);
+ if (!k->level && k->btree_id == BTREE_ID_alloc)
+ iter_flags |= BTREE_ITER_CACHED|BTREE_ITER_CACHED_NOFILL;
- /*
- * iter->flags & BTREE_ITER_IS_EXTENTS triggers the update path to run
- * extent_handle_overwrites() and extent_update_to_keys() - but we don't
- * want that here, journal replay is supposed to treat extents like
- * regular keys:
- */
- BUG_ON(iter->flags & BTREE_ITER_IS_EXTENTS);
-
- ret = bch2_btree_iter_traverse(iter) ?:
- bch2_trans_update(trans, iter, k, BTREE_TRIGGER_NORUN);
- bch2_trans_iter_put(trans, iter);
+ bch2_trans_node_iter_init(trans, &iter, k->btree_id, k->k->k.p,
+ BTREE_MAX_DEPTH, k->level,
+ iter_flags);
+ ret = bch2_btree_iter_traverse(&iter) ?:
+ bch2_trans_update(trans, &iter, k->k, BTREE_TRIGGER_NORUN);
+ bch2_trans_iter_exit(trans, &iter);
return ret;
}
static int bch2_journal_replay_key(struct bch_fs *c, struct journal_key *k)
{
- unsigned commit_flags = BTREE_INSERT_NOFAIL|
- BTREE_INSERT_LAZY_RW;
+ unsigned commit_flags =
+ BTREE_INSERT_LAZY_RW|
+ BTREE_INSERT_NOFAIL|
+ BTREE_INSERT_JOURNAL_RESERVED;
if (!k->allocated)
commit_flags |= BTREE_INSERT_JOURNAL_REPLAY;
return bch2_trans_do(c, NULL, NULL, commit_flags,
- __bch2_journal_replay_key(&trans, k->btree_id, k->level, k->k));
-}
-
-static int __bch2_alloc_replay_key(struct btree_trans *trans, struct bkey_i *k)
-{
- struct btree_iter *iter;
- int ret;
-
- iter = bch2_trans_get_iter(trans, BTREE_ID_alloc, k->k.p,
- BTREE_ITER_CACHED|
- BTREE_ITER_CACHED_NOFILL|
- BTREE_ITER_INTENT);
- ret = bch2_trans_update(trans, iter, k, BTREE_TRIGGER_NORUN);
- bch2_trans_iter_put(trans, iter);
- return ret;
-}
-
-static int bch2_alloc_replay_key(struct bch_fs *c, struct bkey_i *k)
-{
- return bch2_trans_do(c, NULL, NULL,
- BTREE_INSERT_NOFAIL|
- BTREE_INSERT_USE_RESERVE|
- BTREE_INSERT_LAZY_RW|
- BTREE_INSERT_JOURNAL_REPLAY,
- __bch2_alloc_replay_key(&trans, k));
+ __bch2_journal_replay_key(&trans, k));
}
static int journal_sort_seq_cmp(const void *_l, const void *_r)
@@ -596,7 +587,7 @@ static int bch2_journal_replay(struct bch_fs *c,
if (!i->level && i->btree_id == BTREE_ID_alloc) {
j->replay_journal_seq = keys.journal_seq_base + i->journal_seq;
- ret = bch2_alloc_replay_key(c, i->k);
+ ret = bch2_journal_replay_key(c, i);
if (ret)
goto err;
}
@@ -725,7 +716,7 @@ static int journal_replay_entry_early(struct bch_fs *c,
ca->usage_base->buckets_ec = le64_to_cpu(u->buckets_ec);
ca->usage_base->buckets_unavailable = le64_to_cpu(u->buckets_unavailable);
- for (i = 0; i < nr_types; i++) {
+ for (i = 0; i < min_t(unsigned, nr_types, BCH_DATA_NR); i++) {
ca->usage_base->d[i].buckets = le64_to_cpu(u->d[i].buckets);
ca->usage_base->d[i].sectors = le64_to_cpu(u->d[i].sectors);
ca->usage_base->d[i].fragmented = le64_to_cpu(u->d[i].fragmented);
@@ -954,6 +945,74 @@ fsck_err:
return ret;
}
+static int bch2_fs_initialize_subvolumes(struct bch_fs *c)
+{
+ struct bkey_i_snapshot root_snapshot;
+ struct bkey_i_subvolume root_volume;
+ int ret;
+
+ bkey_snapshot_init(&root_snapshot.k_i);
+ root_snapshot.k.p.offset = U32_MAX;
+ root_snapshot.v.flags = 0;
+ root_snapshot.v.parent = 0;
+ root_snapshot.v.subvol = BCACHEFS_ROOT_SUBVOL;
+ root_snapshot.v.pad = 0;
+ SET_BCH_SNAPSHOT_SUBVOL(&root_snapshot.v, true);
+
+ ret = bch2_btree_insert(c, BTREE_ID_snapshots,
+ &root_snapshot.k_i,
+ NULL, NULL, 0);
+ if (ret)
+ return ret;
+
+
+ bkey_subvolume_init(&root_volume.k_i);
+ root_volume.k.p.offset = BCACHEFS_ROOT_SUBVOL;
+ root_volume.v.flags = 0;
+ root_volume.v.snapshot = cpu_to_le32(U32_MAX);
+ root_volume.v.inode = cpu_to_le64(BCACHEFS_ROOT_INO);
+
+ ret = bch2_btree_insert(c, BTREE_ID_subvolumes,
+ &root_volume.k_i,
+ NULL, NULL, 0);
+ if (ret)
+ return ret;
+
+ return 0;
+}
+
+static int bch2_fs_upgrade_for_subvolumes(struct btree_trans *trans)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ struct bch_inode_unpacked inode;
+ int ret;
+
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_inodes,
+ SPOS(0, BCACHEFS_ROOT_INO, U32_MAX), 0);
+ k = bch2_btree_iter_peek_slot(&iter);
+ ret = bkey_err(k);
+ if (ret)
+ goto err;
+
+ if (!bkey_is_inode(k.k)) {
+ bch_err(c, "root inode not found");
+ ret = -ENOENT;
+ goto err;
+ }
+
+ ret = bch2_inode_unpack(k, &inode);
+ BUG_ON(ret);
+
+ inode.bi_subvol = BCACHEFS_ROOT_SUBVOL;
+
+ ret = bch2_inode_write(trans, &iter, &inode);
+err:
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
int bch2_fs_recovery(struct bch_fs *c)
{
const char *err = "cannot allocate memory";
@@ -972,6 +1031,8 @@ int bch2_fs_recovery(struct bch_fs *c)
if (c->sb.clean)
bch_info(c, "recovering from clean shutdown, journal seq %llu",
le64_to_cpu(clean->journal_seq));
+ else
+ bch_info(c, "recovering from unclean shutdown");
if (!(c->sb.features & (1ULL << BCH_FEATURE_new_extent_overwrite))) {
bch_err(c, "feature new_extent_overwrite not set, filesystem no longer supported");
@@ -990,7 +1051,6 @@ int bch2_fs_recovery(struct bch_fs *c)
bch_err(c, "filesystem may have incompatible bkey formats; run fsck from the compat branch to fix");
ret = -EINVAL;
goto err;
-
}
if (!(c->sb.features & (1ULL << BCH_FEATURE_alloc_v2))) {
@@ -1005,11 +1065,20 @@ int bch2_fs_recovery(struct bch_fs *c)
set_bit(BCH_FS_REBUILD_REPLICAS, &c->flags);
}
- if (c->sb.version < bcachefs_metadata_version_inode_backpointers) {
- bch_info(c, "version prior to inode backpointers, upgrade and fsck required");
- c->opts.version_upgrade = true;
- c->opts.fsck = true;
- c->opts.fix_errors = FSCK_OPT_YES;
+ if (!c->opts.nochanges) {
+ if (c->sb.version < bcachefs_metadata_version_inode_backpointers) {
+ bch_info(c, "version prior to inode backpointers, upgrade and fsck required");
+ c->opts.version_upgrade = true;
+ c->opts.fsck = true;
+ c->opts.fix_errors = FSCK_OPT_YES;
+ } else if (c->sb.version < bcachefs_metadata_version_subvol_dirent) {
+ bch_info(c, "filesystem version is prior to subvol_dirent - upgrading");
+ c->opts.version_upgrade = true;
+ c->opts.fsck = true;
+ } else if (c->sb.version < bcachefs_metadata_version_inode_v2) {
+ bch_info(c, "filesystem version is prior to inode_v2 - upgrading");
+ c->opts.version_upgrade = true;
+ }
}
ret = bch2_blacklist_table_initialize(c);
@@ -1074,6 +1143,8 @@ use_clean:
drop_alloc_keys(&c->journal_keys);
}
+ zero_out_btree_mem_ptr(&c->journal_keys);
+
ret = journal_replay_early(c, clean, &c->journal_entries);
if (ret)
goto err;
@@ -1176,6 +1247,31 @@ use_clean:
bch_verbose(c, "alloc write done");
}
+ if (c->sb.version < bcachefs_metadata_version_snapshot_2) {
+ bch2_fs_lazy_rw(c);
+
+ err = "error creating root snapshot node";
+ ret = bch2_fs_initialize_subvolumes(c);
+ if (ret)
+ goto err;
+ }
+
+ bch_verbose(c, "reading snapshots table");
+ err = "error reading snapshots table";
+ ret = bch2_fs_snapshots_start(c);
+ if (ret)
+ goto err;
+ bch_verbose(c, "reading snapshots done");
+
+ if (c->sb.version < bcachefs_metadata_version_snapshot_2) {
+ /* set bi_subvol on root inode */
+ err = "error upgrade root inode for subvolumes";
+ ret = bch2_trans_do(c, NULL, NULL, BTREE_INSERT_LAZY_RW,
+ bch2_fs_upgrade_for_subvolumes(&trans));
+ if (ret)
+ goto err;
+ }
+
if (c->opts.fsck) {
bch_info(c, "starting fsck");
err = "error in fsck";
@@ -1202,7 +1298,9 @@ use_clean:
if (!(c->sb.compat & (1ULL << BCH_COMPAT_extents_above_btree_updates_done)) ||
!(c->sb.compat & (1ULL << BCH_COMPAT_bformat_overflow_done))) {
- struct bch_move_stats stats = { 0 };
+ struct bch_move_stats stats;
+
+ bch_move_stats_init(&stats, "recovery");
bch_info(c, "scanning for old btree nodes");
ret = bch2_fs_read_write(c);
@@ -1334,9 +1432,22 @@ int bch2_fs_initialize(struct bch_fs *c)
}
}
+ err = "error creating root snapshot node";
+ ret = bch2_fs_initialize_subvolumes(c);
+ if (ret)
+ goto err;
+
+ bch_verbose(c, "reading snapshots table");
+ err = "error reading snapshots table";
+ ret = bch2_fs_snapshots_start(c);
+ if (ret)
+ goto err;
+ bch_verbose(c, "reading snapshots done");
+
bch2_inode_init(c, &root_inode, 0, 0,
S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO, 0, NULL);
- root_inode.bi_inum = BCACHEFS_ROOT_INO;
+ root_inode.bi_inum = BCACHEFS_ROOT_INO;
+ root_inode.bi_subvol = BCACHEFS_ROOT_SUBVOL;
bch2_inode_pack(c, &packed_inode, &root_inode);
packed_inode.inode.k.p.snapshot = U32_MAX;
@@ -1351,11 +1462,12 @@ int bch2_fs_initialize(struct bch_fs *c)
err = "error creating lost+found";
ret = bch2_trans_do(c, NULL, NULL, 0,
- bch2_create_trans(&trans, BCACHEFS_ROOT_INO,
+ bch2_create_trans(&trans,
+ BCACHEFS_ROOT_SUBVOL_INUM,
&root_inode, &lostfound_inode,
&lostfound,
0, 0, S_IFDIR|0700, 0,
- NULL, NULL));
+ NULL, NULL, (subvol_inum) { 0 }, 0));
if (ret) {
bch_err(c, "error creating lost+found");
goto err;
@@ -1368,7 +1480,7 @@ int bch2_fs_initialize(struct bch_fs *c)
}
err = "error writing first journal entry";
- ret = bch2_journal_meta(&c->journal);
+ ret = bch2_journal_flush(&c->journal);
if (ret)
goto err;
diff --git a/fs/bcachefs/recovery.h b/fs/bcachefs/recovery.h
index e5565e4f335a..e45c70b3693f 100644
--- a/fs/bcachefs/recovery.h
+++ b/fs/bcachefs/recovery.h
@@ -45,9 +45,9 @@ void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *,
struct bch_fs *,
struct btree *);
-typedef int (*btree_walk_key_fn)(struct bch_fs *c, struct bkey_s_c k);
+typedef int (*btree_walk_key_fn)(struct btree_trans *, struct bkey_s_c);
-int bch2_btree_and_journal_walk(struct bch_fs *, enum btree_id, btree_walk_key_fn);
+int bch2_btree_and_journal_walk(struct btree_trans *, enum btree_id, btree_walk_key_fn);
void bch2_journal_keys_free(struct journal_keys *);
void bch2_journal_entries_free(struct list_head *);
diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c
index a420729288d4..8dcac7815c9f 100644
--- a/fs/bcachefs/reflink.c
+++ b/fs/bcachefs/reflink.c
@@ -7,6 +7,7 @@
#include "inode.h"
#include "io.h"
#include "reflink.h"
+#include "subvolume.h"
#include <linux/sched/signal.h>
@@ -31,6 +32,10 @@ const char *bch2_reflink_p_invalid(const struct bch_fs *c, struct bkey_s_c k)
if (bkey_val_bytes(p.k) != sizeof(*p.v))
return "incorrect value size";
+ if (c->sb.version >= bcachefs_metadata_version_reflink_p_fix &&
+ le64_to_cpu(p.v->idx) < le32_to_cpu(p.v->front_pad))
+ return "idx < front_pad";
+
return NULL;
}
@@ -39,27 +44,28 @@ void bch2_reflink_p_to_text(struct printbuf *out, struct bch_fs *c,
{
struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k);
- pr_buf(out, "idx %llu", le64_to_cpu(p.v->idx));
+ pr_buf(out, "idx %llu front_pad %u back_pad %u",
+ le64_to_cpu(p.v->idx),
+ le32_to_cpu(p.v->front_pad),
+ le32_to_cpu(p.v->back_pad));
}
-enum merge_result bch2_reflink_p_merge(struct bch_fs *c,
- struct bkey_s _l, struct bkey_s _r)
+bool bch2_reflink_p_merge(struct bch_fs *c, struct bkey_s _l, struct bkey_s_c _r)
{
struct bkey_s_reflink_p l = bkey_s_to_reflink_p(_l);
- struct bkey_s_reflink_p r = bkey_s_to_reflink_p(_r);
+ struct bkey_s_c_reflink_p r = bkey_s_c_to_reflink_p(_r);
- if (le64_to_cpu(l.v->idx) + l.k->size != le64_to_cpu(r.v->idx))
- return BCH_MERGE_NOMERGE;
+ /*
+ * Disabled for now: the triggers code needs to be reworked before
+ * merging of reflink pointers can work:
+ */
+ return false;
- if ((u64) l.k->size + r.k->size > KEY_SIZE_MAX) {
- bch2_key_resize(l.k, KEY_SIZE_MAX);
- bch2_cut_front_s(l.k->p, _r);
- return BCH_MERGE_PARTIAL;
- }
+ if (le64_to_cpu(l.v->idx) + l.k->size != le64_to_cpu(r.v->idx))
+ return false;
bch2_key_resize(l.k, l.k->size + r.k->size);
-
- return BCH_MERGE_MERGE;
+ return true;
}
/* indirect extents */
@@ -84,6 +90,14 @@ void bch2_reflink_v_to_text(struct printbuf *out, struct bch_fs *c,
bch2_bkey_ptrs_to_text(out, c, k);
}
+bool bch2_reflink_v_merge(struct bch_fs *c, struct bkey_s _l, struct bkey_s_c _r)
+{
+ struct bkey_s_reflink_v l = bkey_s_to_reflink_v(_l);
+ struct bkey_s_c_reflink_v r = bkey_s_c_to_reflink_v(_r);
+
+ return l.v->refcount == r.v->refcount && bch2_extent_merge(c, _l, _r);
+}
+
/* indirect inline data */
const char *bch2_indirect_inline_data_invalid(const struct bch_fs *c,
@@ -110,7 +124,7 @@ static int bch2_make_extent_indirect(struct btree_trans *trans,
struct bkey_i *orig)
{
struct bch_fs *c = trans->c;
- struct btree_iter *reflink_iter;
+ struct btree_iter reflink_iter = { NULL };
struct bkey_s_c k;
struct bkey_i *r_v;
struct bkey_i_reflink_p *r_p;
@@ -120,11 +134,11 @@ static int bch2_make_extent_indirect(struct btree_trans *trans,
if (orig->k.type == KEY_TYPE_inline_data)
bch2_check_set_feature(c, BCH_FEATURE_reflink_inline_data);
- for_each_btree_key(trans, reflink_iter, BTREE_ID_reflink,
+ for_each_btree_key_norestart(trans, reflink_iter, BTREE_ID_reflink,
POS(0, c->reflink_hint),
BTREE_ITER_INTENT|BTREE_ITER_SLOTS, k, ret) {
- if (reflink_iter->pos.inode) {
- bch2_btree_iter_set_pos(reflink_iter, POS_MIN);
+ if (reflink_iter.pos.inode) {
+ bch2_btree_iter_set_pos(&reflink_iter, POS_MIN);
continue;
}
@@ -136,16 +150,16 @@ static int bch2_make_extent_indirect(struct btree_trans *trans,
goto err;
/* rewind iter to start of hole, if necessary: */
- bch2_btree_iter_set_pos(reflink_iter, bkey_start_pos(k.k));
+ bch2_btree_iter_set_pos_to_extent_start(&reflink_iter);
- r_v = bch2_trans_kmalloc(trans, sizeof(__le64) + bkey_val_bytes(&orig->k));
+ r_v = bch2_trans_kmalloc(trans, sizeof(__le64) + bkey_bytes(&orig->k));
ret = PTR_ERR_OR_ZERO(r_v);
if (ret)
goto err;
bkey_init(&r_v->k);
r_v->k.type = bkey_type_to_indirect(&orig->k);
- r_v->k.p = reflink_iter->pos;
+ r_v->k.p = reflink_iter.pos;
bch2_key_resize(&r_v->k, orig->k.size);
r_v->k.version = orig->k.version;
@@ -155,26 +169,25 @@ static int bch2_make_extent_indirect(struct btree_trans *trans,
*refcount = 0;
memcpy(refcount + 1, &orig->v, bkey_val_bytes(&orig->k));
- ret = bch2_trans_update(trans, reflink_iter, r_v, 0);
+ ret = bch2_trans_update(trans, &reflink_iter, r_v, 0);
if (ret)
goto err;
- r_p = bch2_trans_kmalloc(trans, sizeof(*r_p));
- if (IS_ERR(r_p)) {
- ret = PTR_ERR(r_p);
- goto err;
- }
-
+ /*
+ * orig is in a bkey_buf which statically allocates 5 64s for the val,
+ * so we know it will be big enough:
+ */
orig->k.type = KEY_TYPE_reflink_p;
r_p = bkey_i_to_reflink_p(orig);
set_bkey_val_bytes(&r_p->k, sizeof(r_p->v));
+ memset(&r_p->v, 0, sizeof(r_p->v));
+
r_p->v.idx = cpu_to_le64(bkey_start_offset(&r_v->k));
ret = bch2_trans_update(trans, extent_iter, &r_p->k_i, 0);
err:
- if (!IS_ERR(reflink_iter))
- c->reflink_hint = reflink_iter->pos.offset;
- bch2_trans_iter_put(trans, reflink_iter);
+ c->reflink_hint = reflink_iter.pos.offset;
+ bch2_trans_iter_exit(trans, &reflink_iter);
return ret;
}
@@ -184,7 +197,7 @@ static struct bkey_s_c get_next_src(struct btree_iter *iter, struct bpos end)
struct bkey_s_c k;
int ret;
- for_each_btree_key_continue(iter, 0, k, ret) {
+ for_each_btree_key_continue_norestart(*iter, 0, k, ret) {
if (bkey_cmp(iter->pos, end) >= 0)
break;
@@ -192,22 +205,27 @@ static struct bkey_s_c get_next_src(struct btree_iter *iter, struct bpos end)
return k;
}
- bch2_btree_iter_set_pos(iter, end);
- return bkey_s_c_null;
+ if (bkey_cmp(iter->pos, end) >= 0)
+ bch2_btree_iter_set_pos(iter, end);
+ return ret ? bkey_s_c_err(ret) : bkey_s_c_null;
}
s64 bch2_remap_range(struct bch_fs *c,
- struct bpos dst_start, struct bpos src_start,
- u64 remap_sectors, u64 *journal_seq,
+ subvol_inum dst_inum, u64 dst_offset,
+ subvol_inum src_inum, u64 src_offset,
+ u64 remap_sectors,
u64 new_i_size, s64 *i_sectors_delta)
{
struct btree_trans trans;
- struct btree_iter *dst_iter, *src_iter;
+ struct btree_iter dst_iter, src_iter;
struct bkey_s_c src_k;
struct bkey_buf new_dst, new_src;
+ struct bpos dst_start = POS(dst_inum.inum, dst_offset);
+ struct bpos src_start = POS(src_inum.inum, src_offset);
struct bpos dst_end = dst_start, src_end = src_start;
struct bpos src_want;
u64 dst_done;
+ u32 dst_snapshot, src_snapshot;
int ret = 0, ret2 = 0;
if (!percpu_ref_tryget(&c->writes))
@@ -222,13 +240,13 @@ s64 bch2_remap_range(struct bch_fs *c,
bch2_bkey_buf_init(&new_src);
bch2_trans_init(&trans, c, BTREE_ITER_MAX, 4096);
- src_iter = bch2_trans_get_iter(&trans, BTREE_ID_extents, src_start,
- BTREE_ITER_INTENT);
- dst_iter = bch2_trans_get_iter(&trans, BTREE_ID_extents, dst_start,
- BTREE_ITER_INTENT);
+ bch2_trans_iter_init(&trans, &src_iter, BTREE_ID_extents, src_start,
+ BTREE_ITER_INTENT);
+ bch2_trans_iter_init(&trans, &dst_iter, BTREE_ID_extents, dst_start,
+ BTREE_ITER_INTENT);
while ((ret == 0 || ret == -EINTR) &&
- bkey_cmp(dst_iter->pos, dst_end) < 0) {
+ bkey_cmp(dst_iter.pos, dst_end) < 0) {
struct disk_reservation disk_res = { 0 };
bch2_trans_begin(&trans);
@@ -238,31 +256,45 @@ s64 bch2_remap_range(struct bch_fs *c,
break;
}
- dst_done = dst_iter->pos.offset - dst_start.offset;
+ ret = bch2_subvolume_get_snapshot(&trans, src_inum.subvol,
+ &src_snapshot);
+ if (ret)
+ continue;
+
+ bch2_btree_iter_set_snapshot(&src_iter, src_snapshot);
+
+ ret = bch2_subvolume_get_snapshot(&trans, dst_inum.subvol,
+ &dst_snapshot);
+ if (ret)
+ continue;
+
+ bch2_btree_iter_set_snapshot(&dst_iter, dst_snapshot);
+
+ dst_done = dst_iter.pos.offset - dst_start.offset;
src_want = POS(src_start.inode, src_start.offset + dst_done);
- bch2_btree_iter_set_pos(src_iter, src_want);
+ bch2_btree_iter_set_pos(&src_iter, src_want);
- src_k = get_next_src(src_iter, src_end);
+ src_k = get_next_src(&src_iter, src_end);
ret = bkey_err(src_k);
if (ret)
continue;
- if (bkey_cmp(src_want, src_iter->pos) < 0) {
- ret = bch2_fpunch_at(&trans, dst_iter,
- bpos_min(dst_end,
- POS(dst_iter->pos.inode, dst_iter->pos.offset +
- src_iter->pos.offset - src_want.offset)),
- journal_seq, i_sectors_delta);
+ if (bkey_cmp(src_want, src_iter.pos) < 0) {
+ ret = bch2_fpunch_at(&trans, &dst_iter, dst_inum,
+ min(dst_end.offset,
+ dst_iter.pos.offset +
+ src_iter.pos.offset - src_want.offset),
+ i_sectors_delta);
continue;
}
if (src_k.k->type != KEY_TYPE_reflink_p) {
+ bch2_btree_iter_set_pos_to_extent_start(&src_iter);
+
bch2_bkey_buf_reassemble(&new_src, c, src_k);
src_k = bkey_i_to_s_c(new_src.k);
- bch2_btree_iter_set_pos(src_iter, bkey_start_pos(src_k.k));
-
- ret = bch2_make_extent_indirect(&trans, src_iter,
+ ret = bch2_make_extent_indirect(&trans, &src_iter,
new_src.k);
if (ret)
continue;
@@ -285,46 +317,47 @@ s64 bch2_remap_range(struct bch_fs *c,
BUG();
}
- new_dst.k->k.p = dst_iter->pos;
+ new_dst.k->k.p = dst_iter.pos;
bch2_key_resize(&new_dst.k->k,
min(src_k.k->p.offset - src_want.offset,
- dst_end.offset - dst_iter->pos.offset));
- ret = bch2_extent_update(&trans, dst_iter, new_dst.k,
- &disk_res, journal_seq,
+ dst_end.offset - dst_iter.pos.offset));
+
+ ret = bch2_extent_update(&trans, dst_inum, &dst_iter,
+ new_dst.k, &disk_res, NULL,
new_i_size, i_sectors_delta,
true);
bch2_disk_reservation_put(c, &disk_res);
}
- bch2_trans_iter_put(&trans, dst_iter);
- bch2_trans_iter_put(&trans, src_iter);
+ bch2_trans_iter_exit(&trans, &dst_iter);
+ bch2_trans_iter_exit(&trans, &src_iter);
- BUG_ON(!ret && bkey_cmp(dst_iter->pos, dst_end));
- BUG_ON(bkey_cmp(dst_iter->pos, dst_end) > 0);
+ BUG_ON(!ret && bkey_cmp(dst_iter.pos, dst_end));
+ BUG_ON(bkey_cmp(dst_iter.pos, dst_end) > 0);
- dst_done = dst_iter->pos.offset - dst_start.offset;
- new_i_size = min(dst_iter->pos.offset << 9, new_i_size);
-
- bch2_trans_begin(&trans);
+ dst_done = dst_iter.pos.offset - dst_start.offset;
+ new_i_size = min(dst_iter.pos.offset << 9, new_i_size);
do {
struct bch_inode_unpacked inode_u;
- struct btree_iter *inode_iter;
+ struct btree_iter inode_iter = { NULL };
+
+ bch2_trans_begin(&trans);
- inode_iter = bch2_inode_peek(&trans, &inode_u,
- dst_start.inode, BTREE_ITER_INTENT);
- ret2 = PTR_ERR_OR_ZERO(inode_iter);
+ ret2 = bch2_inode_peek(&trans, &inode_iter, &inode_u,
+ dst_inum, BTREE_ITER_INTENT);
if (!ret2 &&
inode_u.bi_size < new_i_size) {
inode_u.bi_size = new_i_size;
- ret2 = bch2_inode_write(&trans, inode_iter, &inode_u) ?:
- bch2_trans_commit(&trans, NULL, journal_seq, 0);
+ ret2 = bch2_inode_write(&trans, &inode_iter, &inode_u) ?:
+ bch2_trans_commit(&trans, NULL, NULL,
+ BTREE_INSERT_NOFAIL);
}
- bch2_trans_iter_put(&trans, inode_iter);
+ bch2_trans_iter_exit(&trans, &inode_iter);
} while (ret2 == -EINTR);
- ret = bch2_trans_exit(&trans) ?: ret;
+ bch2_trans_exit(&trans);
bch2_bkey_buf_exit(&new_src, c);
bch2_bkey_buf_exit(&new_dst, c);
diff --git a/fs/bcachefs/reflink.h b/fs/bcachefs/reflink.h
index bfc785619ee8..3745873fd88d 100644
--- a/fs/bcachefs/reflink.h
+++ b/fs/bcachefs/reflink.h
@@ -5,8 +5,7 @@
const char *bch2_reflink_p_invalid(const struct bch_fs *, struct bkey_s_c);
void bch2_reflink_p_to_text(struct printbuf *, struct bch_fs *,
struct bkey_s_c);
-enum merge_result bch2_reflink_p_merge(struct bch_fs *,
- struct bkey_s, struct bkey_s);
+bool bch2_reflink_p_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c);
#define bch2_bkey_ops_reflink_p (struct bkey_ops) { \
.key_invalid = bch2_reflink_p_invalid, \
@@ -58,7 +57,7 @@ static inline __le64 *bkey_refcount(struct bkey_i *k)
}
}
-s64 bch2_remap_range(struct bch_fs *, struct bpos, struct bpos,
- u64, u64 *, u64, s64 *);
+s64 bch2_remap_range(struct bch_fs *, subvol_inum, u64,
+ subvol_inum, u64, u64, u64, s64 *);
#endif /* _BCACHEFS_REFLINK_H */
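
With the prototype above, a hypothetical caller passes subvolume-qualified inode numbers and sector-granular offsets, and the journal_seq pointer from the old signature is gone. The dst_inum, src_inum, dst_pos, src_pos, len and new_i_size locals below are made up for illustration:

	s64 i_sectors_delta = 0;
	s64 sectors = bch2_remap_range(c,
				       dst_inum, dst_pos >> 9,
				       src_inum, src_pos >> 9,
				       len >> 9,
				       new_i_size, &i_sectors_delta);
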
diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c
index dbbbcc6dcec6..002006593044 100644
--- a/fs/bcachefs/replicas.c
+++ b/fs/bcachefs/replicas.c
@@ -1010,6 +1010,9 @@ bool bch2_have_enough_devs(struct bch_fs *c, struct bch_devs_mask devs,
unsigned i, nr_online = 0, nr_failed = 0, dflags = 0;
bool metadata = e->data_type < BCH_DATA_user;
+ if (e->data_type == BCH_DATA_cached)
+ continue;
+
for (i = 0; i < e->nr_devs; i++) {
struct bch_dev *ca = bch_dev_bkey_exists(c, e->devs[i]);
diff --git a/fs/bcachefs/str_hash.h b/fs/bcachefs/str_hash.h
index b8492b51a262..57d636740d2f 100644
--- a/fs/bcachefs/str_hash.h
+++ b/fs/bcachefs/str_hash.h
@@ -8,24 +8,25 @@
#include "error.h"
#include "inode.h"
#include "siphash.h"
+#include "subvolume.h"
#include "super.h"
#include <linux/crc32c.h>
#include <crypto/hash.h>
-#include <crypto/sha.h>
+#include <crypto/sha2.h>
static inline enum bch_str_hash_type
bch2_str_hash_opt_to_type(struct bch_fs *c, enum bch_str_hash_opts opt)
{
switch (opt) {
case BCH_STR_HASH_OPT_crc32c:
- return BCH_STR_HASH_CRC32C;
+ return BCH_STR_HASH_crc32c;
case BCH_STR_HASH_OPT_crc64:
- return BCH_STR_HASH_CRC64;
+ return BCH_STR_HASH_crc64;
case BCH_STR_HASH_OPT_siphash:
return c->sb.features & (1ULL << BCH_FEATURE_new_siphash)
- ? BCH_STR_HASH_SIPHASH
- : BCH_STR_HASH_SIPHASH_OLD;
+ ? BCH_STR_HASH_siphash
+ : BCH_STR_HASH_siphash_old;
default:
BUG();
}
@@ -50,7 +51,7 @@ bch2_hash_info_init(struct bch_fs *c, const struct bch_inode_unpacked *bi)
.siphash_key = { .k0 = bi->bi_hash_seed }
};
- if (unlikely(info.type == BCH_STR_HASH_SIPHASH_OLD)) {
+ if (unlikely(info.type == BCH_STR_HASH_siphash_old)) {
SHASH_DESC_ON_STACK(desc, c->sha256);
u8 digest[SHA256_DIGEST_SIZE];
@@ -76,16 +77,16 @@ static inline void bch2_str_hash_init(struct bch_str_hash_ctx *ctx,
const struct bch_hash_info *info)
{
switch (info->type) {
- case BCH_STR_HASH_CRC32C:
+ case BCH_STR_HASH_crc32c:
ctx->crc32c = crc32c(~0, &info->siphash_key.k0,
sizeof(info->siphash_key.k0));
break;
- case BCH_STR_HASH_CRC64:
+ case BCH_STR_HASH_crc64:
ctx->crc64 = crc64_be(~0, &info->siphash_key.k0,
sizeof(info->siphash_key.k0));
break;
- case BCH_STR_HASH_SIPHASH_OLD:
- case BCH_STR_HASH_SIPHASH:
+ case BCH_STR_HASH_siphash_old:
+ case BCH_STR_HASH_siphash:
SipHash24_Init(&ctx->siphash, &info->siphash_key);
break;
default:
@@ -98,14 +99,14 @@ static inline void bch2_str_hash_update(struct bch_str_hash_ctx *ctx,
const void *data, size_t len)
{
switch (info->type) {
- case BCH_STR_HASH_CRC32C:
+ case BCH_STR_HASH_crc32c:
ctx->crc32c = crc32c(ctx->crc32c, data, len);
break;
- case BCH_STR_HASH_CRC64:
+ case BCH_STR_HASH_crc64:
ctx->crc64 = crc64_be(ctx->crc64, data, len);
break;
- case BCH_STR_HASH_SIPHASH_OLD:
- case BCH_STR_HASH_SIPHASH:
+ case BCH_STR_HASH_siphash_old:
+ case BCH_STR_HASH_siphash:
SipHash24_Update(&ctx->siphash, data, len);
break;
default:
@@ -117,12 +118,12 @@ static inline u64 bch2_str_hash_end(struct bch_str_hash_ctx *ctx,
const struct bch_hash_info *info)
{
switch (info->type) {
- case BCH_STR_HASH_CRC32C:
+ case BCH_STR_HASH_crc32c:
return ctx->crc32c;
- case BCH_STR_HASH_CRC64:
+ case BCH_STR_HASH_crc64:
return ctx->crc64 >> 1;
- case BCH_STR_HASH_SIPHASH_OLD:
- case BCH_STR_HASH_SIPHASH:
+ case BCH_STR_HASH_siphash_old:
+ case BCH_STR_HASH_siphash:
return SipHash24_End(&ctx->siphash) >> 1;
default:
BUG();
@@ -137,28 +138,40 @@ struct bch_hash_desc {
u64 (*hash_bkey)(const struct bch_hash_info *, struct bkey_s_c);
bool (*cmp_key)(struct bkey_s_c, const void *);
bool (*cmp_bkey)(struct bkey_s_c, struct bkey_s_c);
+ bool (*is_visible)(subvol_inum inum, struct bkey_s_c);
};
-static __always_inline struct btree_iter *
+static inline bool is_visible_key(struct bch_hash_desc desc, subvol_inum inum, struct bkey_s_c k)
+{
+ return k.k->type == desc.key_type &&
+ (!desc.is_visible || desc.is_visible(inum, k));
+}
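
is_visible_key() gives each hash type an optional hook for hiding keys from a given subvolume; hash types that leave .is_visible NULL behave exactly as before. Purely as an illustration (the helper name and the DT_SUBVOL/d_inum interpretation are assumptions, not the dirent.c implementation from this patch):

	static bool sketch_dirent_is_visible(subvol_inum inum, struct bkey_s_c k)
	{
		struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);

		/* assumed: subvolume dirents are only visible from their parent subvolume */
		return d.v->d_type != DT_SUBVOL ||
			le64_to_cpu(d.v->d_inum) == inum.subvol;
	}
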
+
+static __always_inline int
bch2_hash_lookup(struct btree_trans *trans,
+ struct btree_iter *iter,
const struct bch_hash_desc desc,
const struct bch_hash_info *info,
- u64 inode, const void *key,
+ subvol_inum inum, const void *key,
unsigned flags)
{
- struct btree_iter *iter;
struct bkey_s_c k;
+ u32 snapshot;
int ret;
- for_each_btree_key(trans, iter, desc.btree_id,
- POS(inode, desc.hash_key(info, key)),
+ ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
+ if (ret)
+ return ret;
+
+ for_each_btree_key_norestart(trans, *iter, desc.btree_id,
+ SPOS(inum.inum, desc.hash_key(info, key), snapshot),
BTREE_ITER_SLOTS|flags, k, ret) {
- if (iter->pos.inode != inode)
+ if (iter->pos.inode != inum.inum)
break;
- if (k.k->type == desc.key_type) {
+ if (is_visible_key(desc, inum, k)) {
if (!desc.cmp_key(k, key))
- return iter;
+ return 0;
} else if (k.k->type == KEY_TYPE_hash_whiteout) {
;
} else {
@@ -166,35 +179,38 @@ bch2_hash_lookup(struct btree_trans *trans,
break;
}
}
- bch2_trans_iter_put(trans, iter);
+ bch2_trans_iter_exit(trans, iter);
- return ERR_PTR(ret ?: -ENOENT);
+ return ret ?: -ENOENT;
}
-static __always_inline struct btree_iter *
+static __always_inline int
bch2_hash_hole(struct btree_trans *trans,
+ struct btree_iter *iter,
const struct bch_hash_desc desc,
const struct bch_hash_info *info,
- u64 inode, const void *key)
+ subvol_inum inum, const void *key)
{
- struct btree_iter *iter;
struct bkey_s_c k;
+ u32 snapshot;
int ret;
- for_each_btree_key(trans, iter, desc.btree_id,
- POS(inode, desc.hash_key(info, key)),
+ ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
+ if (ret)
+ return ret;
+
+ for_each_btree_key_norestart(trans, *iter, desc.btree_id,
+ SPOS(inum.inum, desc.hash_key(info, key), snapshot),
BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) {
- if (iter->pos.inode != inode)
+ if (iter->pos.inode != inum.inum)
break;
- if (k.k->type != desc.key_type)
- return iter;
+ if (!is_visible_key(desc, inum, k))
+ return 0;
}
+ bch2_trans_iter_exit(trans, iter);
- iter->flags |= BTREE_ITER_KEEP_UNTIL_COMMIT;
- bch2_trans_iter_put(trans, iter);
-
- return ERR_PTR(ret ?: -ENOSPC);
+ return ret ?: -ENOSPC;
}
static __always_inline
@@ -203,28 +219,27 @@ int bch2_hash_needs_whiteout(struct btree_trans *trans,
const struct bch_hash_info *info,
struct btree_iter *start)
{
- struct btree_iter *iter;
+ struct btree_iter iter;
struct bkey_s_c k;
int ret;
- iter = bch2_trans_copy_iter(trans, start);
+ bch2_trans_copy_iter(&iter, start);
- bch2_btree_iter_next_slot(iter);
+ bch2_btree_iter_advance(&iter);
- for_each_btree_key_continue(iter, BTREE_ITER_SLOTS, k, ret) {
+ for_each_btree_key_continue_norestart(iter, BTREE_ITER_SLOTS, k, ret) {
if (k.k->type != desc.key_type &&
k.k->type != KEY_TYPE_hash_whiteout)
break;
if (k.k->type == desc.key_type &&
desc.hash_bkey(info, k) <= start->pos.offset) {
- iter->flags |= BTREE_ITER_KEEP_UNTIL_COMMIT;
ret = 1;
break;
}
}
- bch2_trans_iter_put(trans, iter);
+ bch2_trans_iter_exit(trans, &iter);
return ret;
}
@@ -232,20 +247,28 @@ static __always_inline
int bch2_hash_set(struct btree_trans *trans,
const struct bch_hash_desc desc,
const struct bch_hash_info *info,
- u64 inode, struct bkey_i *insert, int flags)
+ subvol_inum inum,
+ struct bkey_i *insert, int flags)
{
- struct btree_iter *iter, *slot = NULL;
+ struct btree_iter iter, slot = { NULL };
struct bkey_s_c k;
bool found = false;
+ u32 snapshot;
int ret;
- for_each_btree_key(trans, iter, desc.btree_id,
- POS(inode, desc.hash_bkey(info, bkey_i_to_s_c(insert))),
+ ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
+ if (ret)
+ return ret;
+
+ for_each_btree_key_norestart(trans, iter, desc.btree_id,
+ SPOS(inum.inum,
+ desc.hash_bkey(info, bkey_i_to_s_c(insert)),
+ snapshot),
BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) {
- if (iter->pos.inode != inode)
+ if (iter.pos.inode != inum.inum)
break;
- if (k.k->type == desc.key_type) {
+ if (is_visible_key(desc, inum, k)) {
if (!desc.cmp_bkey(k, bkey_i_to_s_c(insert)))
goto found;
@@ -253,9 +276,9 @@ int bch2_hash_set(struct btree_trans *trans,
continue;
}
- if (!slot &&
+ if (!slot.path &&
!(flags & BCH_HASH_SET_MUST_REPLACE))
- slot = bch2_trans_copy_iter(trans, iter);
+ bch2_trans_copy_iter(&slot, &iter);
if (k.k->type != KEY_TYPE_hash_whiteout)
goto not_found;
@@ -264,8 +287,8 @@ int bch2_hash_set(struct btree_trans *trans,
if (!ret)
ret = -ENOSPC;
out:
- bch2_trans_iter_put(trans, slot);
- bch2_trans_iter_put(trans, iter);
+ bch2_trans_iter_exit(trans, &slot);
+ bch2_trans_iter_exit(trans, &iter);
return ret;
found:
@@ -277,11 +300,11 @@ not_found:
} else if (found && (flags & BCH_HASH_SET_MUST_CREATE)) {
ret = -EEXIST;
} else {
- if (!found && slot)
+ if (!found && slot.path)
swap(iter, slot);
- insert->k.p = iter->pos;
- ret = bch2_trans_update(trans, iter, insert, 0);
+ insert->k.p = iter.pos;
+ ret = bch2_trans_update(trans, &iter, insert, 0);
}
goto out;
@@ -291,7 +314,8 @@ static __always_inline
int bch2_hash_delete_at(struct btree_trans *trans,
const struct bch_hash_desc desc,
const struct bch_hash_info *info,
- struct btree_iter *iter)
+ struct btree_iter *iter,
+ unsigned update_flags)
{
struct bkey_i *delete;
int ret;
@@ -309,25 +333,25 @@ int bch2_hash_delete_at(struct btree_trans *trans,
delete->k.p = iter->pos;
delete->k.type = ret ? KEY_TYPE_hash_whiteout : KEY_TYPE_deleted;
- return bch2_trans_update(trans, iter, delete, 0);
+ return bch2_trans_update(trans, iter, delete, update_flags);
}
static __always_inline
int bch2_hash_delete(struct btree_trans *trans,
const struct bch_hash_desc desc,
const struct bch_hash_info *info,
- u64 inode, const void *key)
+ subvol_inum inum, const void *key)
{
- struct btree_iter *iter;
+ struct btree_iter iter;
int ret;
- iter = bch2_hash_lookup(trans, desc, info, inode, key,
+ ret = bch2_hash_lookup(trans, &iter, desc, info, inum, key,
BTREE_ITER_INTENT);
- if (IS_ERR(iter))
- return PTR_ERR(iter);
+ if (ret)
+ return ret;
- ret = bch2_hash_delete_at(trans, desc, info, iter);
- bch2_trans_iter_put(trans, iter);
+ ret = bch2_hash_delete_at(trans, desc, info, &iter, 0);
+ bch2_trans_iter_exit(trans, &iter);
return ret;
}
diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c
new file mode 100644
index 000000000000..7e909a118189
--- /dev/null
+++ b/fs/bcachefs/subvolume.c
@@ -0,0 +1,1084 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "btree_key_cache.h"
+#include "btree_update.h"
+#include "error.h"
+#include "fs.h"
+#include "subvolume.h"
+
+/* Snapshot tree: */
+
+static void bch2_delete_dead_snapshots_work(struct work_struct *);
+static void bch2_delete_dead_snapshots(struct bch_fs *);
+
+void bch2_snapshot_to_text(struct printbuf *out, struct bch_fs *c,
+ struct bkey_s_c k)
+{
+ struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(k);
+
+ pr_buf(out, "is_subvol %llu deleted %llu parent %u children %u %u subvol %u",
+ BCH_SNAPSHOT_SUBVOL(s.v),
+ BCH_SNAPSHOT_DELETED(s.v),
+ le32_to_cpu(s.v->parent),
+ le32_to_cpu(s.v->children[0]),
+ le32_to_cpu(s.v->children[1]),
+ le32_to_cpu(s.v->subvol));
+}
+
+const char *bch2_snapshot_invalid(const struct bch_fs *c, struct bkey_s_c k)
+{
+ struct bkey_s_c_snapshot s;
+ u32 i, id;
+
+ if (bkey_cmp(k.k->p, POS(0, U32_MAX)) > 0 ||
+ bkey_cmp(k.k->p, POS(0, 1)) < 0)
+ return "bad pos";
+
+ if (bkey_val_bytes(k.k) != sizeof(struct bch_snapshot))
+ return "bad val size";
+
+ s = bkey_s_c_to_snapshot(k);
+
+ id = le32_to_cpu(s.v->parent);
+ if (id && id <= k.k->p.offset)
+ return "bad parent node";
+
+ if (le32_to_cpu(s.v->children[0]) < le32_to_cpu(s.v->children[1]))
+ return "children not normalized";
+
+ if (s.v->children[0] &&
+ s.v->children[0] == s.v->children[1])
+ return "duplicate child nodes";
+
+ for (i = 0; i < 2; i++) {
+ id = le32_to_cpu(s.v->children[i]);
+
+ if (id >= k.k->p.offset)
+ return "bad child node";
+ }
+
+ return NULL;
+}
+
+int bch2_mark_snapshot(struct btree_trans *trans,
+ struct bkey_s_c old, struct bkey_s_c new,
+ unsigned flags)
+{
+ struct bch_fs *c = trans->c;
+ struct snapshot_t *t;
+
+ t = genradix_ptr_alloc(&c->snapshots,
+ U32_MAX - new.k->p.offset,
+ GFP_KERNEL);
+ if (!t)
+ return -ENOMEM;
+
+ if (new.k->type == KEY_TYPE_snapshot) {
+ struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(new);
+
+ t->parent = le32_to_cpu(s.v->parent);
+ t->children[0] = le32_to_cpu(s.v->children[0]);
+ t->children[1] = le32_to_cpu(s.v->children[1]);
+ t->subvol = BCH_SNAPSHOT_SUBVOL(s.v) ? le32_to_cpu(s.v->subvol) : 0;
+ } else {
+ t->parent = 0;
+ t->children[0] = 0;
+ t->children[1] = 0;
+ t->subvol = 0;
+ }
+
+ return 0;
+}
+
+static int snapshot_lookup(struct btree_trans *trans, u32 id,
+ struct bch_snapshot *s)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ int ret;
+
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_snapshots, POS(0, id),
+ BTREE_ITER_WITH_UPDATES);
+ k = bch2_btree_iter_peek_slot(&iter);
+ ret = bkey_err(k) ?: k.k->type == KEY_TYPE_snapshot ? 0 : -ENOENT;
+
+ if (!ret)
+ *s = *bkey_s_c_to_snapshot(k).v;
+
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
+static int snapshot_live(struct btree_trans *trans, u32 id)
+{
+ struct bch_snapshot v;
+ int ret;
+
+ if (!id)
+ return 0;
+
+ ret = lockrestart_do(trans, snapshot_lookup(trans, id, &v));
+ if (ret == -ENOENT)
+ bch_err(trans->c, "snapshot node %u not found", id);
+ if (ret)
+ return ret;
+
+ return !BCH_SNAPSHOT_DELETED(&v);
+}
+
+static int bch2_snapshots_set_equiv(struct btree_trans *trans)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ struct bkey_s_c_snapshot snap;
+ unsigned i;
+ int ret;
+
+ for_each_btree_key(trans, iter, BTREE_ID_snapshots,
+ POS_MIN, 0, k, ret) {
+ u32 id = k.k->p.offset, child[2];
+ unsigned nr_live = 0, live_idx;
+
+ if (k.k->type != KEY_TYPE_snapshot)
+ continue;
+
+ snap = bkey_s_c_to_snapshot(k);
+ child[0] = le32_to_cpu(snap.v->children[0]);
+ child[1] = le32_to_cpu(snap.v->children[1]);
+
+ for (i = 0; i < 2; i++) {
+ ret = snapshot_live(trans, child[i]);
+ if (ret < 0)
+ break;
+
+ if (ret)
+ live_idx = i;
+ nr_live += ret;
+ }
+
+ snapshot_t(c, id)->equiv = nr_live == 1
+ ? snapshot_t(c, child[live_idx])->equiv
+ : id;
+ }
+ bch2_trans_iter_exit(trans, &iter);
+
+ if (ret)
+ bch_err(c, "error walking snapshots: %i", ret);
+
+ return ret;
+}
+
+/* fsck: */
+static int bch2_snapshot_check(struct btree_trans *trans,
+ struct bkey_s_c_snapshot s)
+{
+ struct bch_subvolume subvol;
+ struct bch_snapshot v;
+ u32 i, id;
+ int ret;
+
+ id = le32_to_cpu(s.v->subvol);
+ ret = lockrestart_do(trans, bch2_subvolume_get(trans, id, 0, false, &subvol));
+ if (ret == -ENOENT)
+ bch_err(trans->c, "snapshot node %llu has nonexistent subvolume %u",
+ s.k->p.offset, id);
+ if (ret)
+ return ret;
+
+ if (BCH_SNAPSHOT_SUBVOL(s.v) != (le32_to_cpu(subvol.snapshot) == s.k->p.offset)) {
+ bch_err(trans->c, "snapshot node %llu has wrong BCH_SNAPSHOT_SUBVOL",
+ s.k->p.offset);
+ return -EINVAL;
+ }
+
+ id = le32_to_cpu(s.v->parent);
+ if (id) {
+ ret = lockrestart_do(trans, snapshot_lookup(trans, id, &v));
+ if (ret == -ENOENT)
+ bch_err(trans->c, "snapshot node %llu has nonexistent parent %u",
+ s.k->p.offset, id);
+ if (ret)
+ return ret;
+
+ if (le32_to_cpu(v.children[0]) != s.k->p.offset &&
+ le32_to_cpu(v.children[1]) != s.k->p.offset) {
+ bch_err(trans->c, "snapshot parent %u missing pointer to child %llu",
+ id, s.k->p.offset);
+ return -EINVAL;
+ }
+ }
+
+ for (i = 0; i < 2 && s.v->children[i]; i++) {
+ id = le32_to_cpu(s.v->children[i]);
+
+ ret = lockrestart_do(trans, snapshot_lookup(trans, id, &v));
+ if (ret == -ENOENT)
+ bch_err(trans->c, "snapshot node %llu has nonexistent child %u",
+ s.k->p.offset, id);
+ if (ret)
+ return ret;
+
+ if (le32_to_cpu(v.parent) != s.k->p.offset) {
+ bch_err(trans->c, "snapshot child %u has wrong parent (got %u should be %llu)",
+ id, le32_to_cpu(v.parent), s.k->p.offset);
+ return -EINVAL;
+ }
+ }
+
+ return 0;
+}
+
+int bch2_fs_snapshots_check(struct bch_fs *c)
+{
+ struct btree_trans trans;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ struct bch_snapshot s;
+ unsigned id;
+ int ret;
+
+ bch2_trans_init(&trans, c, 0, 0);
+
+ for_each_btree_key(&trans, iter, BTREE_ID_snapshots,
+ POS_MIN, 0, k, ret) {
+ if (k.k->type != KEY_TYPE_snapshot)
+ continue;
+
+ ret = bch2_snapshot_check(&trans, bkey_s_c_to_snapshot(k));
+ if (ret)
+ break;
+ }
+ bch2_trans_iter_exit(&trans, &iter);
+
+ if (ret) {
+ bch_err(c, "error %i checking snapshots", ret);
+ goto err;
+ }
+
+ for_each_btree_key(&trans, iter, BTREE_ID_subvolumes,
+ POS_MIN, 0, k, ret) {
+ if (k.k->type != KEY_TYPE_subvolume)
+ continue;
+again_2:
+ id = le32_to_cpu(bkey_s_c_to_subvolume(k).v->snapshot);
+ ret = snapshot_lookup(&trans, id, &s);
+
+ if (ret == -EINTR) {
+ k = bch2_btree_iter_peek(&iter);
+ goto again_2;
+ } else if (ret == -ENOENT)
+ bch_err(c, "subvolume %llu points to nonexistent snapshot %u",
+ k.k->p.offset, id);
+ else if (ret)
+ break;
+ }
+ bch2_trans_iter_exit(&trans, &iter);
+err:
+ bch2_trans_exit(&trans);
+ return ret;
+}
+
+void bch2_fs_snapshots_exit(struct bch_fs *c)
+{
+ genradix_free(&c->snapshots);
+}
+
+int bch2_fs_snapshots_start(struct bch_fs *c)
+{
+ struct btree_trans trans;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ bool have_deleted = false;
+ int ret = 0;
+
+ bch2_trans_init(&trans, c, 0, 0);
+
+ for_each_btree_key(&trans, iter, BTREE_ID_snapshots,
+ POS_MIN, 0, k, ret) {
+ if (bkey_cmp(k.k->p, POS(0, U32_MAX)) > 0)
+ break;
+
+ if (k.k->type != KEY_TYPE_snapshot) {
+ bch_err(c, "found wrong key type %u in snapshot node table",
+ k.k->type);
+ continue;
+ }
+
+ if (BCH_SNAPSHOT_DELETED(bkey_s_c_to_snapshot(k).v))
+ have_deleted = true;
+
+ ret = bch2_mark_snapshot(&trans, bkey_s_c_null, k, 0);
+ if (ret)
+ break;
+ }
+ bch2_trans_iter_exit(&trans, &iter);
+
+ if (ret)
+ goto err;
+
+ ret = bch2_snapshots_set_equiv(&trans);
+ if (ret)
+ goto err;
+err:
+ bch2_trans_exit(&trans);
+
+ if (!ret && have_deleted) {
+ bch_info(c, "restarting deletion of dead snapshots");
+ if (c->opts.fsck) {
+ bch2_delete_dead_snapshots_work(&c->snapshot_delete_work);
+ } else {
+ bch2_delete_dead_snapshots(c);
+ }
+ }
+
+ return ret;
+}
+
+/*
+ * Mark a snapshot as deleted, for future cleanup:
+ */
+static int bch2_snapshot_node_set_deleted(struct btree_trans *trans, u32 id)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ struct bkey_i_snapshot *s;
+ int ret = 0;
+
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_snapshots, POS(0, id),
+ BTREE_ITER_INTENT);
+ k = bch2_btree_iter_peek_slot(&iter);
+ ret = bkey_err(k);
+ if (ret)
+ goto err;
+
+ if (k.k->type != KEY_TYPE_snapshot) {
+ bch2_fs_inconsistent(trans->c, "missing snapshot %u", id);
+ ret = -ENOENT;
+ goto err;
+ }
+
+ /* already deleted? */
+ if (BCH_SNAPSHOT_DELETED(bkey_s_c_to_snapshot(k).v))
+ goto err;
+
+ s = bch2_trans_kmalloc(trans, sizeof(*s));
+ ret = PTR_ERR_OR_ZERO(s);
+ if (ret)
+ goto err;
+
+ bkey_reassemble(&s->k_i, k);
+
+ SET_BCH_SNAPSHOT_DELETED(&s->v, true);
+ ret = bch2_trans_update(trans, &iter, &s->k_i, 0);
+ if (ret)
+ goto err;
+err:
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
+static int bch2_snapshot_node_delete(struct btree_trans *trans, u32 id)
+{
+ struct btree_iter iter, p_iter = (struct btree_iter) { NULL };
+ struct bkey_s_c k;
+ struct bkey_s_c_snapshot s;
+ struct bkey_i_snapshot *parent;
+ u32 parent_id;
+ unsigned i;
+ int ret = 0;
+
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_snapshots, POS(0, id),
+ BTREE_ITER_INTENT);
+ k = bch2_btree_iter_peek_slot(&iter);
+ ret = bkey_err(k);
+ if (ret)
+ goto err;
+
+ if (k.k->type != KEY_TYPE_snapshot) {
+ bch2_fs_inconsistent(trans->c, "missing snapshot %u", id);
+ ret = -ENOENT;
+ goto err;
+ }
+
+ s = bkey_s_c_to_snapshot(k);
+
+ BUG_ON(!BCH_SNAPSHOT_DELETED(s.v));
+ parent_id = le32_to_cpu(s.v->parent);
+
+ if (parent_id) {
+ bch2_trans_iter_init(trans, &p_iter, BTREE_ID_snapshots,
+ POS(0, parent_id),
+ BTREE_ITER_INTENT);
+ k = bch2_btree_iter_peek_slot(&p_iter);
+ ret = bkey_err(k);
+ if (ret)
+ goto err;
+
+ if (k.k->type != KEY_TYPE_snapshot) {
+ bch2_fs_inconsistent(trans->c, "missing snapshot %u", parent_id);
+ ret = -ENOENT;
+ goto err;
+ }
+
+ parent = bch2_trans_kmalloc(trans, sizeof(*parent));
+ ret = PTR_ERR_OR_ZERO(parent);
+ if (ret)
+ goto err;
+
+ bkey_reassemble(&parent->k_i, k);
+
+ for (i = 0; i < 2; i++)
+ if (le32_to_cpu(parent->v.children[i]) == id)
+ break;
+
+ if (i == 2)
+ bch_err(trans->c, "snapshot %u missing child pointer to %u",
+ parent_id, id);
+ else
+ parent->v.children[i] = 0;
+
+ if (le32_to_cpu(parent->v.children[0]) <
+ le32_to_cpu(parent->v.children[1]))
+ swap(parent->v.children[0],
+ parent->v.children[1]);
+
+ ret = bch2_trans_update(trans, &p_iter, &parent->k_i, 0);
+ if (ret)
+ goto err;
+ }
+
+ ret = bch2_btree_delete_at(trans, &iter, 0);
+err:
+ bch2_trans_iter_exit(trans, &p_iter);
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
+static int bch2_snapshot_node_create(struct btree_trans *trans, u32 parent,
+ u32 *new_snapids,
+ u32 *snapshot_subvols,
+ unsigned nr_snapids)
+{
+ struct btree_iter iter;
+ struct bkey_i_snapshot *n;
+ struct bkey_s_c k;
+ unsigned i;
+ int ret = 0;
+
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_snapshots,
+ POS_MIN, BTREE_ITER_INTENT);
+ k = bch2_btree_iter_peek(&iter);
+ ret = bkey_err(k);
+ if (ret)
+ goto err;
+
+ for (i = 0; i < nr_snapids; i++) {
+ k = bch2_btree_iter_prev_slot(&iter);
+ ret = bkey_err(k);
+ if (ret)
+ goto err;
+
+ if (!k.k || !k.k->p.offset) {
+ ret = -ENOSPC;
+ goto err;
+ }
+
+ n = bch2_trans_kmalloc(trans, sizeof(*n));
+ ret = PTR_ERR_OR_ZERO(n);
+ if (ret)
+ return ret;
+
+ bkey_snapshot_init(&n->k_i);
+ n->k.p = iter.pos;
+ n->v.flags = 0;
+ n->v.parent = cpu_to_le32(parent);
+ n->v.subvol = cpu_to_le32(snapshot_subvols[i]);
+ n->v.pad = 0;
+ SET_BCH_SNAPSHOT_SUBVOL(&n->v, true);
+
+ bch2_trans_update(trans, &iter, &n->k_i, 0);
+
+ ret = bch2_mark_snapshot(trans, bkey_s_c_null, bkey_i_to_s_c(&n->k_i), 0);
+ if (ret)
+ break;
+
+ new_snapids[i] = iter.pos.offset;
+ }
+
+ if (parent) {
+ bch2_btree_iter_set_pos(&iter, POS(0, parent));
+ k = bch2_btree_iter_peek(&iter);
+ ret = bkey_err(k);
+ if (ret)
+ goto err;
+
+ if (k.k->type != KEY_TYPE_snapshot) {
+ bch_err(trans->c, "snapshot %u not found", parent);
+ ret = -ENOENT;
+ goto err;
+ }
+
+ n = bch2_trans_kmalloc(trans, sizeof(*n));
+ ret = PTR_ERR_OR_ZERO(n);
+ if (ret)
+ return ret;
+
+ bkey_reassemble(&n->k_i, k);
+
+ if (n->v.children[0] || n->v.children[1]) {
+ bch_err(trans->c, "Trying to add child snapshot nodes to parent that already has children");
+ ret = -EINVAL;
+ goto err;
+ }
+
+ n->v.children[0] = cpu_to_le32(new_snapids[0]);
+ n->v.children[1] = cpu_to_le32(new_snapids[1]);
+ SET_BCH_SNAPSHOT_SUBVOL(&n->v, false);
+ bch2_trans_update(trans, &iter, &n->k_i, 0);
+ }
+err:
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
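+/* Append an ID to a snapshot_id_list, growing the backing array as needed: */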
+static int snapshot_id_add(struct snapshot_id_list *s, u32 id)
+{
+ BUG_ON(snapshot_list_has_id(s, id));
+
+ if (s->nr == s->size) {
+ size_t new_size = max(8U, s->size * 2);
+ void *n = krealloc(s->d,
+ new_size * sizeof(s->d[0]),
+ GFP_KERNEL);
+ if (!n) {
+ pr_err("error allocating snapshot ID list");
+ return -ENOMEM;
+ }
+
+ s->d = n;
+ s->size = new_size;
+	}
+
+ s->d[s->nr++] = id;
+ return 0;
+}
+
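+/*
+ * For one btree, delete every key in a snapshot on the @deleted list, plus
+ * keys made redundant because we've already seen a key at the same position
+ * in the same equivalence class:
+ */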
+static int bch2_snapshot_delete_keys_btree(struct btree_trans *trans,
+ struct snapshot_id_list *deleted,
+ enum btree_id btree_id)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ struct snapshot_id_list equiv_seen = { 0 };
+ struct bpos last_pos = POS_MIN;
+ int ret = 0;
+
+ /*
+ * XXX: We should also delete whiteouts that no longer overwrite
+ * anything
+ */
+
+ bch2_trans_iter_init(trans, &iter, btree_id, POS_MIN,
+ BTREE_ITER_INTENT|
+ BTREE_ITER_PREFETCH|
+ BTREE_ITER_NOT_EXTENTS|
+ BTREE_ITER_ALL_SNAPSHOTS);
+
+ while ((bch2_trans_begin(trans),
+ (k = bch2_btree_iter_peek(&iter)).k) &&
+ !(ret = bkey_err(k))) {
+ u32 equiv = snapshot_t(c, k.k->p.snapshot)->equiv;
+
+ if (bkey_cmp(k.k->p, last_pos))
+ equiv_seen.nr = 0;
+ last_pos = k.k->p;
+
+ if (snapshot_list_has_id(deleted, k.k->p.snapshot) ||
+ snapshot_list_has_id(&equiv_seen, equiv)) {
+ if (btree_id == BTREE_ID_inodes &&
+ bch2_btree_key_cache_flush(trans, btree_id, iter.pos))
+ continue;
+
+ ret = __bch2_trans_do(trans, NULL, NULL,
+ BTREE_INSERT_NOFAIL,
+ bch2_btree_iter_traverse(&iter) ?:
+ bch2_btree_delete_at(trans, &iter,
+ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE));
+ if (ret)
+ break;
+ } else {
+ ret = snapshot_id_add(&equiv_seen, equiv);
+ if (ret)
+ break;
+ }
+
+ bch2_btree_iter_advance(&iter);
+ }
+ bch2_trans_iter_exit(trans, &iter);
+
+ kfree(equiv_seen.d);
+
+ return ret;
+}
+
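+/*
+ * Background work for deleting dead snapshots: mark them deleted, delete
+ * their keys from every btree that has snapshots, then delete the snapshot
+ * nodes themselves:
+ */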
+static void bch2_delete_dead_snapshots_work(struct work_struct *work)
+{
+ struct bch_fs *c = container_of(work, struct bch_fs, snapshot_delete_work);
+ struct btree_trans trans;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ struct bkey_s_c_snapshot snap;
+ struct snapshot_id_list deleted = { 0 };
+ u32 i, id, children[2];
+ int ret = 0;
+
+ bch2_trans_init(&trans, c, 0, 0);
+
+ /*
+	 * For every snapshot node: if it has no live children and it's not
+	 * pointed to by a subvolume, delete it:
+ */
+ for_each_btree_key(&trans, iter, BTREE_ID_snapshots,
+ POS_MIN, 0, k, ret) {
+ if (k.k->type != KEY_TYPE_snapshot)
+ continue;
+
+ snap = bkey_s_c_to_snapshot(k);
+ if (BCH_SNAPSHOT_DELETED(snap.v) ||
+ BCH_SNAPSHOT_SUBVOL(snap.v))
+ continue;
+
+ children[0] = le32_to_cpu(snap.v->children[0]);
+ children[1] = le32_to_cpu(snap.v->children[1]);
+
+ ret = snapshot_live(&trans, children[0]) ?:
+ snapshot_live(&trans, children[1]);
+ if (ret < 0)
+ break;
+ if (ret)
+ continue;
+
+ ret = __bch2_trans_do(&trans, NULL, NULL, 0,
+ bch2_snapshot_node_set_deleted(&trans, iter.pos.offset));
+ if (ret) {
+ bch_err(c, "error deleting snapshot %llu: %i", iter.pos.offset, ret);
+ break;
+ }
+ }
+ bch2_trans_iter_exit(&trans, &iter);
+
+ if (ret) {
+ bch_err(c, "error walking snapshots: %i", ret);
+ goto err;
+ }
+
+ ret = bch2_snapshots_set_equiv(&trans);
+ if (ret)
+ goto err;
+
+ for_each_btree_key(&trans, iter, BTREE_ID_snapshots,
+ POS_MIN, 0, k, ret) {
+ if (k.k->type != KEY_TYPE_snapshot)
+ continue;
+
+ snap = bkey_s_c_to_snapshot(k);
+ if (BCH_SNAPSHOT_DELETED(snap.v)) {
+ ret = snapshot_id_add(&deleted, k.k->p.offset);
+ if (ret)
+ break;
+ }
+ }
+ bch2_trans_iter_exit(&trans, &iter);
+
+ if (ret) {
+ bch_err(c, "error walking snapshots: %i", ret);
+ goto err;
+ }
+
+ for (id = 0; id < BTREE_ID_NR; id++) {
+ if (!btree_type_has_snapshots(id))
+ continue;
+
+ ret = bch2_snapshot_delete_keys_btree(&trans, &deleted, id);
+ if (ret) {
+ bch_err(c, "error deleting snapshot keys: %i", ret);
+ goto err;
+ }
+ }
+
+ for (i = 0; i < deleted.nr; i++) {
+ ret = __bch2_trans_do(&trans, NULL, NULL, 0,
+ bch2_snapshot_node_delete(&trans, deleted.d[i]));
+ if (ret) {
+ bch_err(c, "error deleting snapshot %u: %i",
+ deleted.d[i], ret);
+ goto err;
+ }
+ }
+err:
+ kfree(deleted.d);
+ bch2_trans_exit(&trans);
+ percpu_ref_put(&c->writes);
+}
+
+static void bch2_delete_dead_snapshots(struct bch_fs *c)
+{
+ if (unlikely(!percpu_ref_tryget(&c->writes)))
+ return;
+
+ if (!queue_work(system_long_wq, &c->snapshot_delete_work))
+ percpu_ref_put(&c->writes);
+}
+
+static int bch2_delete_dead_snapshots_hook(struct btree_trans *trans,
+ struct btree_trans_commit_hook *h)
+{
+ bch2_delete_dead_snapshots(trans->c);
+ return 0;
+}
+
+/* Subvolumes: */
+
+const char *bch2_subvolume_invalid(const struct bch_fs *c, struct bkey_s_c k)
+{
+ if (bkey_cmp(k.k->p, SUBVOL_POS_MIN) < 0)
+ return "invalid pos";
+
+ if (bkey_cmp(k.k->p, SUBVOL_POS_MAX) > 0)
+ return "invalid pos";
+
+ if (bkey_val_bytes(k.k) != sizeof(struct bch_subvolume))
+ return "bad val size";
+
+ return NULL;
+}
+
+void bch2_subvolume_to_text(struct printbuf *out, struct bch_fs *c,
+ struct bkey_s_c k)
+{
+ struct bkey_s_c_subvolume s = bkey_s_c_to_subvolume(k);
+
+ pr_buf(out, "root %llu snapshot id %u",
+ le64_to_cpu(s.v->inode),
+ le32_to_cpu(s.v->snapshot));
+}
+
+int bch2_subvolume_get(struct btree_trans *trans, unsigned subvol,
+ bool inconsistent_if_not_found,
+ int iter_flags,
+ struct bch_subvolume *s)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ int ret;
+
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_subvolumes, POS(0, subvol),
+ iter_flags);
+ k = bch2_btree_iter_peek_slot(&iter);
+ ret = bkey_err(k) ?: k.k->type == KEY_TYPE_subvolume ? 0 : -ENOENT;
+
+ if (ret == -ENOENT && inconsistent_if_not_found)
+ bch2_fs_inconsistent(trans->c, "missing subvolume %u", subvol);
+ if (!ret)
+ *s = *bkey_s_c_to_subvolume(k).v;
+
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
+int bch2_snapshot_get_subvol(struct btree_trans *trans, u32 snapshot,
+ struct bch_subvolume *subvol)
+{
+ struct bch_snapshot snap;
+
+ return snapshot_lookup(trans, snapshot, &snap) ?:
+ bch2_subvolume_get(trans, le32_to_cpu(snap.subvol), true, 0, subvol);
+}
+
+int bch2_subvolume_get_snapshot(struct btree_trans *trans, u32 subvol,
+ u32 *snapid)
+{
+ struct bch_subvolume s;
+ int ret;
+
+ ret = bch2_subvolume_get(trans, subvol, true,
+ BTREE_ITER_CACHED|
+ BTREE_ITER_WITH_UPDATES,
+ &s);
+
+ *snapid = le32_to_cpu(s.snapshot);
+ return ret;
+}
+
+/*
+ * Delete subvolume, mark snapshot ID as deleted, queue up snapshot
+ * deletion/cleanup:
+ */
+int bch2_subvolume_delete(struct btree_trans *trans, u32 subvolid)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ struct bkey_s_c_subvolume subvol;
+ struct btree_trans_commit_hook *h;
+ struct bkey_i *delete;
+ u32 snapid;
+ int ret = 0;
+
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_subvolumes,
+ POS(0, subvolid),
+ BTREE_ITER_CACHED|
+ BTREE_ITER_INTENT);
+ k = bch2_btree_iter_peek_slot(&iter);
+ ret = bkey_err(k);
+ if (ret)
+ goto err;
+
+ if (k.k->type != KEY_TYPE_subvolume) {
+ bch2_fs_inconsistent(trans->c, "missing subvolume %u", subvolid);
+ ret = -EIO;
+ goto err;
+ }
+
+ subvol = bkey_s_c_to_subvolume(k);
+ snapid = le32_to_cpu(subvol.v->snapshot);
+
+ delete = bch2_trans_kmalloc(trans, sizeof(*delete));
+ ret = PTR_ERR_OR_ZERO(delete);
+ if (ret)
+ goto err;
+
+ bkey_init(&delete->k);
+ delete->k.p = iter.pos;
+ ret = bch2_trans_update(trans, &iter, delete, 0);
+ if (ret)
+ goto err;
+
+ ret = bch2_snapshot_node_set_deleted(trans, snapid);
+
+ h = bch2_trans_kmalloc(trans, sizeof(*h));
+ ret = PTR_ERR_OR_ZERO(h);
+ if (ret)
+ goto err;
+
+ h->fn = bch2_delete_dead_snapshots_hook;
+ bch2_trans_commit_hook(trans, h);
+err:
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
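+/*
+ * Work item for deleting unlinked subvolumes: evict any remaining inodes
+ * (and pagecache) for each subvolume on the snapshots_unlinked list, then
+ * delete the subvolumes themselves:
+ */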
+void bch2_subvolume_wait_for_pagecache_and_delete(struct work_struct *work)
+{
+ struct bch_fs *c = container_of(work, struct bch_fs,
+ snapshot_wait_for_pagecache_and_delete_work);
+ struct snapshot_id_list s;
+ u32 *id;
+ int ret = 0;
+
+ while (!ret) {
+ mutex_lock(&c->snapshots_unlinked_lock);
+ s = c->snapshots_unlinked;
+ memset(&c->snapshots_unlinked, 0, sizeof(c->snapshots_unlinked));
+ mutex_unlock(&c->snapshots_unlinked_lock);
+
+ if (!s.nr)
+ break;
+
+ bch2_evict_subvolume_inodes(c, &s);
+
+ for (id = s.d; id < s.d + s.nr; id++) {
+ ret = bch2_trans_do(c, NULL, NULL, BTREE_INSERT_NOFAIL,
+ bch2_subvolume_delete(&trans, *id));
+ if (ret) {
+ bch_err(c, "error %i deleting subvolume %u", ret, *id);
+ break;
+ }
+ }
+
+ kfree(s.d);
+ }
+
+ percpu_ref_put(&c->writes);
+}
+
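+/*
+ * Commit hook that queues up deletion of an unlinked subvolume once the
+ * transaction marking it unlinked has committed:
+ */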
+struct subvolume_unlink_hook {
+ struct btree_trans_commit_hook h;
+ u32 subvol;
+};
+
+int bch2_subvolume_wait_for_pagecache_and_delete_hook(struct btree_trans *trans,
+ struct btree_trans_commit_hook *_h)
+{
+ struct subvolume_unlink_hook *h = container_of(_h, struct subvolume_unlink_hook, h);
+ struct bch_fs *c = trans->c;
+ int ret = 0;
+
+ mutex_lock(&c->snapshots_unlinked_lock);
+ if (!snapshot_list_has_id(&c->snapshots_unlinked, h->subvol))
+ ret = snapshot_id_add(&c->snapshots_unlinked, h->subvol);
+ mutex_unlock(&c->snapshots_unlinked_lock);
+
+ if (ret)
+ return ret;
+
+ if (unlikely(!percpu_ref_tryget(&c->writes)))
+ return -EROFS;
+
+ if (!queue_work(system_long_wq, &c->snapshot_wait_for_pagecache_and_delete_work))
+ percpu_ref_put(&c->writes);
+ return 0;
+}
+
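+/*
+ * Mark a subvolume as unlinked; the actual delete happens later, from the
+ * wait_for_pagecache_and_delete work, via the commit hook above:
+ */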
+int bch2_subvolume_unlink(struct btree_trans *trans, u32 subvolid)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ struct bkey_i_subvolume *n;
+ struct subvolume_unlink_hook *h;
+ int ret = 0;
+
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_subvolumes,
+ POS(0, subvolid),
+ BTREE_ITER_CACHED|
+ BTREE_ITER_INTENT);
+ k = bch2_btree_iter_peek_slot(&iter);
+ ret = bkey_err(k);
+ if (ret)
+ goto err;
+
+ if (k.k->type != KEY_TYPE_subvolume) {
+ bch2_fs_inconsistent(trans->c, "missing subvolume %u", subvolid);
+ ret = -EIO;
+ goto err;
+ }
+
+ n = bch2_trans_kmalloc(trans, sizeof(*n));
+ ret = PTR_ERR_OR_ZERO(n);
+ if (ret)
+ goto err;
+
+ bkey_reassemble(&n->k_i, k);
+ SET_BCH_SUBVOLUME_UNLINKED(&n->v, true);
+
+ ret = bch2_trans_update(trans, &iter, &n->k_i, 0);
+ if (ret)
+ goto err;
+
+ h = bch2_trans_kmalloc(trans, sizeof(*h));
+ ret = PTR_ERR_OR_ZERO(h);
+ if (ret)
+ goto err;
+
+ h->h.fn = bch2_subvolume_wait_for_pagecache_and_delete_hook;
+ h->subvol = subvolid;
+ bch2_trans_commit_hook(trans, &h->h);
+err:
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
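+/*
+ * Create a new subvolume in the first free slot in the subvolumes btree; if
+ * @src_subvolid is nonzero we're taking a snapshot of an existing subvolume,
+ * and both source and new subvolumes get fresh snapshot IDs:
+ */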
+int bch2_subvolume_create(struct btree_trans *trans, u64 inode,
+ u32 src_subvolid,
+ u32 *new_subvolid,
+ u32 *new_snapshotid,
+ bool ro)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_iter dst_iter, src_iter = (struct btree_iter) { NULL };
+ struct bkey_i_subvolume *new_subvol = NULL;
+ struct bkey_i_subvolume *src_subvol = NULL;
+ struct bkey_s_c k;
+ u32 parent = 0, new_nodes[2], snapshot_subvols[2];
+ int ret = 0;
+
+ for_each_btree_key(trans, dst_iter, BTREE_ID_subvolumes, SUBVOL_POS_MIN,
+ BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) {
+ if (bkey_cmp(k.k->p, SUBVOL_POS_MAX) > 0)
+ break;
+
+ /*
+ * bch2_subvolume_delete() doesn't flush the btree key cache -
+ * ideally it would but that's tricky
+ */
+ if (bkey_deleted(k.k) &&
+ !bch2_btree_key_cache_find(c, BTREE_ID_subvolumes, dst_iter.pos))
+ goto found_slot;
+ }
+
+ if (!ret)
+ ret = -ENOSPC;
+ goto err;
+found_slot:
+ snapshot_subvols[0] = dst_iter.pos.offset;
+ snapshot_subvols[1] = src_subvolid;
+
+ if (src_subvolid) {
+ /* Creating a snapshot: */
+ src_subvol = bch2_trans_kmalloc(trans, sizeof(*src_subvol));
+ ret = PTR_ERR_OR_ZERO(src_subvol);
+ if (ret)
+ goto err;
+
+ bch2_trans_iter_init(trans, &src_iter, BTREE_ID_subvolumes,
+ POS(0, src_subvolid),
+ BTREE_ITER_CACHED|
+ BTREE_ITER_INTENT);
+ k = bch2_btree_iter_peek_slot(&src_iter);
+ ret = bkey_err(k);
+ if (ret)
+ goto err;
+
+ if (k.k->type != KEY_TYPE_subvolume) {
+ bch_err(c, "subvolume %u not found", src_subvolid);
+ ret = -ENOENT;
+ goto err;
+ }
+
+ bkey_reassemble(&src_subvol->k_i, k);
+ parent = le32_to_cpu(src_subvol->v.snapshot);
+ }
+
+ ret = bch2_snapshot_node_create(trans, parent, new_nodes,
+ snapshot_subvols,
+ src_subvolid ? 2 : 1);
+ if (ret)
+ goto err;
+
+ if (src_subvolid) {
+ src_subvol->v.snapshot = cpu_to_le32(new_nodes[1]);
+ bch2_trans_update(trans, &src_iter, &src_subvol->k_i, 0);
+ }
+
+ new_subvol = bch2_trans_kmalloc(trans, sizeof(*new_subvol));
+ ret = PTR_ERR_OR_ZERO(new_subvol);
+ if (ret)
+ goto err;
+
+ bkey_subvolume_init(&new_subvol->k_i);
+ new_subvol->v.flags = 0;
+ new_subvol->v.snapshot = cpu_to_le32(new_nodes[0]);
+ new_subvol->v.inode = cpu_to_le64(inode);
+ SET_BCH_SUBVOLUME_RO(&new_subvol->v, ro);
+ SET_BCH_SUBVOLUME_SNAP(&new_subvol->v, src_subvolid != 0);
+ new_subvol->k.p = dst_iter.pos;
+ bch2_trans_update(trans, &dst_iter, &new_subvol->k_i, 0);
+
+ *new_subvolid = new_subvol->k.p.offset;
+ *new_snapshotid = new_nodes[0];
+err:
+ bch2_trans_iter_exit(trans, &src_iter);
+ bch2_trans_iter_exit(trans, &dst_iter);
+ return ret;
+}
+
+int bch2_fs_subvolumes_init(struct bch_fs *c)
+{
+ INIT_WORK(&c->snapshot_delete_work, bch2_delete_dead_snapshots_work);
+ INIT_WORK(&c->snapshot_wait_for_pagecache_and_delete_work,
+ bch2_subvolume_wait_for_pagecache_and_delete);
+ mutex_init(&c->snapshots_unlinked_lock);
+ return 0;
+}
diff --git a/fs/bcachefs/subvolume.h b/fs/bcachefs/subvolume.h
new file mode 100644
index 000000000000..e4c3fdcdf22f
--- /dev/null
+++ b/fs/bcachefs/subvolume.h
@@ -0,0 +1,132 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_SUBVOLUME_H
+#define _BCACHEFS_SUBVOLUME_H
+
+#include "subvolume_types.h"
+
+void bch2_snapshot_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
+const char *bch2_snapshot_invalid(const struct bch_fs *, struct bkey_s_c);
+
+#define bch2_bkey_ops_snapshot (struct bkey_ops) { \
+ .key_invalid = bch2_snapshot_invalid, \
+ .val_to_text = bch2_snapshot_to_text, \
+}
+
+int bch2_mark_snapshot(struct btree_trans *, struct bkey_s_c,
+ struct bkey_s_c, unsigned);
+
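+/*
+ * The in-memory snapshot table is indexed from the top, since snapshot IDs
+ * are allocated counting down from U32_MAX:
+ */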
+static inline struct snapshot_t *snapshot_t(struct bch_fs *c, u32 id)
+{
+ return genradix_ptr(&c->snapshots, U32_MAX - id);
+}
+
+static inline u32 bch2_snapshot_parent(struct bch_fs *c, u32 id)
+{
+ return snapshot_t(c, id)->parent;
+}
+
+static inline u32 bch2_snapshot_internal_node(struct bch_fs *c, u32 id)
+{
+ struct snapshot_t *s = snapshot_t(c, id);
+
+ return s->children[0] || s->children[1];
+}
+
+static inline u32 bch2_snapshot_sibling(struct bch_fs *c, u32 id)
+{
+ struct snapshot_t *s;
+ u32 parent = bch2_snapshot_parent(c, id);
+
+ if (!parent)
+ return 0;
+
+ s = snapshot_t(c, bch2_snapshot_parent(c, id));
+ if (id == s->children[0])
+ return s->children[1];
+ if (id == s->children[1])
+ return s->children[0];
+ return 0;
+}
+
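+/*
+ * A snapshot's ID is always less than its ancestors' IDs, so walk up the
+ * tree until we reach (or pass) @ancestor:
+ */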
+static inline bool bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ancestor)
+{
+ while (id && id < ancestor)
+ id = bch2_snapshot_parent(c, id);
+
+ return id == ancestor;
+}
+
+struct snapshots_seen {
+ struct bpos pos;
+ size_t nr;
+ size_t size;
+ u32 *d;
+};
+
+static inline void snapshots_seen_exit(struct snapshots_seen *s)
+{
+ kfree(s->d);
+ s->d = NULL;
+}
+
+static inline void snapshots_seen_init(struct snapshots_seen *s)
+{
+ memset(s, 0, sizeof(*s));
+}
+
+static inline int snapshots_seen_add(struct bch_fs *c, struct snapshots_seen *s, u32 id)
+{
+ if (s->nr == s->size) {
+ size_t new_size = max(s->size, (size_t) 128) * 2;
+ u32 *d = krealloc(s->d, new_size * sizeof(s->d[0]), GFP_KERNEL);
+
+ if (!d) {
+ bch_err(c, "error reallocating snapshots_seen table (new size %zu)",
+ new_size);
+ return -ENOMEM;
+ }
+
+ s->size = new_size;
+ s->d = d;
+ }
+
+ s->d[s->nr++] = id;
+ return 0;
+}
+
+static inline bool snapshot_list_has_id(struct snapshot_id_list *s, u32 id)
+{
+ unsigned i;
+
+ for (i = 0; i < s->nr; i++)
+ if (id == s->d[i])
+ return true;
+ return false;
+}
+
+int bch2_fs_snapshots_check(struct bch_fs *);
+void bch2_fs_snapshots_exit(struct bch_fs *);
+int bch2_fs_snapshots_start(struct bch_fs *);
+
+const char *bch2_subvolume_invalid(const struct bch_fs *, struct bkey_s_c);
+void bch2_subvolume_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
+
+#define bch2_bkey_ops_subvolume (struct bkey_ops) { \
+ .key_invalid = bch2_subvolume_invalid, \
+ .val_to_text = bch2_subvolume_to_text, \
+}
+
+int bch2_subvolume_get(struct btree_trans *, unsigned,
+ bool, int, struct bch_subvolume *);
+int bch2_snapshot_get_subvol(struct btree_trans *, u32,
+ struct bch_subvolume *);
+int bch2_subvolume_get_snapshot(struct btree_trans *, u32, u32 *);
+
+int bch2_subvolume_delete(struct btree_trans *, u32);
+int bch2_subvolume_unlink(struct btree_trans *, u32);
+int bch2_subvolume_create(struct btree_trans *, u64, u32,
+ u32 *, u32 *, bool);
+
+int bch2_fs_subvolumes_init(struct bch_fs *);
+
+#endif /* _BCACHEFS_SUBVOLUME_H */
diff --git a/fs/bcachefs/subvolume_types.h b/fs/bcachefs/subvolume_types.h
new file mode 100644
index 000000000000..9410b9587591
--- /dev/null
+++ b/fs/bcachefs/subvolume_types.h
@@ -0,0 +1,11 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_SUBVOLUME_TYPES_H
+#define _BCACHEFS_SUBVOLUME_TYPES_H
+
+struct snapshot_id_list {
+ u32 nr;
+ u32 size;
+ u32 *d;
+};
+
+#endif /* _BCACHEFS_SUBVOLUME_TYPES_H */
diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c
index 977885166d55..88a8e54fbd7a 100644
--- a/fs/bcachefs/super-io.c
+++ b/fs/bcachefs/super-io.c
@@ -439,10 +439,8 @@ int bch2_sb_to_fs(struct bch_fs *c, struct bch_sb *src)
__copy_super(&c->disk_sb, src);
- if (BCH_SB_HAS_ERRORS(c->disk_sb.sb))
- set_bit(BCH_FS_ERROR, &c->flags);
- if (BCH_SB_HAS_TOPOLOGY_ERRORS(c->disk_sb.sb))
- set_bit(BCH_FS_TOPOLOGY_ERROR, &c->flags);
+ if (BCH_SB_INITIALIZED(c->disk_sb.sb))
+ set_bit(BCH_FS_INITIALIZED, &c->flags);
ret = bch2_sb_replicas_to_cpu_replicas(c);
if (ret)
@@ -680,7 +678,7 @@ static void write_one_super(struct bch_fs *c, struct bch_dev *ca, unsigned idx)
sb->offset = sb->layout.sb_offset[idx];
- SET_BCH_SB_CSUM_TYPE(sb, c->opts.metadata_checksum);
+ SET_BCH_SB_CSUM_TYPE(sb, bch2_csum_opt_to_type(c->opts.metadata_checksum, false));
sb->csum = csum_vstruct(c, BCH_SB_CSUM_TYPE(sb),
null_nonce(), sb);
@@ -807,7 +805,8 @@ int bch2_write_super(struct bch_fs *c)
!can_mount_with_written ||
(can_mount_without_written &&
!can_mount_with_written), c,
- "Unable to write superblock to sufficient devices"))
+ "Unable to write superblock to sufficient devices (from %ps)",
+ (void *) _RET_IP_))
ret = -1;
out:
/* Make new options visible after they're persistent: */
@@ -983,6 +982,7 @@ int bch2_fs_mark_dirty(struct bch_fs *c)
mutex_lock(&c->sb_lock);
SET_BCH_SB_CLEAN(c->disk_sb.sb, false);
c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_SB_FEATURES_ALWAYS);
+ c->disk_sb.sb->compat[0] &= cpu_to_le64((1ULL << BCH_COMPAT_NR) - 1);
ret = bch2_write_super(c);
mutex_unlock(&c->sb_lock);
diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c
index 155cefd4b460..3744b6d519a7 100644
--- a/fs/bcachefs/super.c
+++ b/fs/bcachefs/super.c
@@ -39,6 +39,7 @@
#include "rebalance.h"
#include "recovery.h"
#include "replicas.h"
+#include "subvolume.h"
#include "super.h"
#include "super-io.h"
#include "sysfs.h"
@@ -165,44 +166,6 @@ static void bch2_dev_usage_journal_reserve(struct bch_fs *c)
&c->dev_usage_journal_res, u64s * nr);
}
-int bch2_congested(void *data, int bdi_bits)
-{
- struct bch_fs *c = data;
- struct backing_dev_info *bdi;
- struct bch_dev *ca;
- unsigned i;
- int ret = 0;
-
- rcu_read_lock();
- if (bdi_bits & (1 << WB_sync_congested)) {
- /* Reads - check all devices: */
- for_each_readable_member(ca, c, i) {
- bdi = ca->disk_sb.bdev->bd_bdi;
-
- if (bdi_congested(bdi, bdi_bits)) {
- ret = 1;
- break;
- }
- }
- } else {
- const struct bch_devs_mask *devs =
- bch2_target_to_mask(c, c->opts.foreground_target) ?:
- &c->rw_devs[BCH_DATA_user];
-
- for_each_member_device_rcu(ca, c, i, devs) {
- bdi = ca->disk_sb.bdev->bd_bdi;
-
- if (bdi_congested(bdi, bdi_bits)) {
- ret = 1;
- break;
- }
- }
- }
- rcu_read_unlock();
-
- return ret;
-}
-
/* Filesystem RO/RW: */
/*
@@ -307,7 +270,7 @@ static void bch2_writes_disabled(struct percpu_ref *writes)
void bch2_fs_read_only(struct bch_fs *c)
{
if (!test_bit(BCH_FS_RW, &c->flags)) {
- BUG_ON(c->journal.reclaim_thread);
+ bch2_journal_reclaim_stop(&c->journal);
return;
}
@@ -442,13 +405,6 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early)
if (ret)
goto err;
- /*
- * We need to write out a journal entry before we start doing btree
- * updates, to ensure that on unclean shutdown new journal blacklist
- * entries are created:
- */
- bch2_journal_meta(&c->journal);
-
clear_bit(BCH_FS_ALLOC_CLEAN, &c->flags);
for_each_rw_member(ca, c, i)
@@ -469,12 +425,6 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early)
for_each_rw_member(ca, c, i)
bch2_wake_allocator(ca);
- ret = bch2_journal_reclaim_start(&c->journal);
- if (ret) {
- bch_err(c, "error starting journal reclaim: %i", ret);
- return ret;
- }
-
if (!early) {
ret = bch2_fs_read_write_late(c);
if (ret)
@@ -512,6 +462,7 @@ static void __bch2_fs_free(struct bch_fs *c)
for (i = 0; i < BCH_TIME_STAT_NR; i++)
bch2_time_stats_exit(&c->times[i]);
+ bch2_fs_snapshots_exit(c);
bch2_fs_quota_exit(c);
bch2_fs_fsio_exit(c);
bch2_fs_ec_exit(c);
@@ -530,12 +481,12 @@ static void __bch2_fs_free(struct bch_fs *c)
bch2_journal_entries_free(&c->journal_entries);
percpu_free_rwsem(&c->mark_lock);
- if (c->btree_iters_bufs)
+ if (c->btree_paths_bufs)
for_each_possible_cpu(cpu)
- kfree(per_cpu_ptr(c->btree_iters_bufs, cpu)->iter);
+ kfree(per_cpu_ptr(c->btree_paths_bufs, cpu)->path);
free_percpu(c->online_reserved);
- free_percpu(c->btree_iters_bufs);
+ free_percpu(c->btree_paths_bufs);
free_percpu(c->pcpu);
mempool_exit(&c->large_bkey_pool);
mempool_exit(&c->btree_bounce_pool);
@@ -551,8 +502,8 @@ static void __bch2_fs_free(struct bch_fs *c)
destroy_workqueue(c->io_complete_wq );
if (c->copygc_wq)
destroy_workqueue(c->copygc_wq);
- if (c->btree_error_wq)
- destroy_workqueue(c->btree_error_wq);
+ if (c->btree_io_complete_wq)
+ destroy_workqueue(c->btree_io_complete_wq);
if (c->btree_update_wq)
destroy_workqueue(c->btree_update_wq);
@@ -586,8 +537,7 @@ void __bch2_fs_stop(struct bch_fs *c)
for_each_member_device(ca, c, i)
if (ca->kobj.state_in_sysfs &&
ca->disk_sb.bdev)
- sysfs_remove_link(&part_to_dev(ca->disk_sb.bdev->bd_part)->kobj,
- "bcachefs");
+ sysfs_remove_link(bdev_kobj(ca->disk_sb.bdev), "bcachefs");
if (c->kobj.state_in_sysfs)
kobject_del(&c->kobj);
@@ -605,7 +555,6 @@ void __bch2_fs_stop(struct bch_fs *c)
for_each_member_device(ca, c, i)
cancel_work_sync(&ca->io_error_work);
- cancel_work_sync(&c->btree_write_error_work);
cancel_work_sync(&c->read_only_work);
for (i = 0; i < c->sb.nr_devices; i++)
@@ -639,48 +588,53 @@ void bch2_fs_stop(struct bch_fs *c)
bch2_fs_free(c);
}
-static const char *bch2_fs_online(struct bch_fs *c)
+static int bch2_fs_online(struct bch_fs *c)
{
struct bch_dev *ca;
- const char *err = NULL;
unsigned i;
- int ret;
+ int ret = 0;
lockdep_assert_held(&bch_fs_list_lock);
- if (!list_empty(&c->list))
- return NULL;
-
- if (__bch2_uuid_to_fs(c->sb.uuid))
- return "filesystem UUID already open";
+ if (__bch2_uuid_to_fs(c->sb.uuid)) {
+ bch_err(c, "filesystem UUID already open");
+ return -EINVAL;
+ }
ret = bch2_fs_chardev_init(c);
- if (ret)
- return "error creating character device";
+ if (ret) {
+ bch_err(c, "error creating character device");
+ return ret;
+ }
bch2_fs_debug_init(c);
- if (kobject_add(&c->kobj, NULL, "%pU", c->sb.user_uuid.b) ||
- kobject_add(&c->internal, &c->kobj, "internal") ||
- kobject_add(&c->opts_dir, &c->kobj, "options") ||
- kobject_add(&c->time_stats, &c->kobj, "time_stats") ||
- bch2_opts_create_sysfs_files(&c->opts_dir))
- return "error creating sysfs objects";
+ ret = kobject_add(&c->kobj, NULL, "%pU", c->sb.user_uuid.b) ?:
+ kobject_add(&c->internal, &c->kobj, "internal") ?:
+ kobject_add(&c->opts_dir, &c->kobj, "options") ?:
+ kobject_add(&c->time_stats, &c->kobj, "time_stats") ?:
+ bch2_opts_create_sysfs_files(&c->opts_dir);
+ if (ret) {
+ bch_err(c, "error creating sysfs objects");
+ return ret;
+ }
down_write(&c->state_lock);
- err = "error creating sysfs objects";
- for_each_member_device(ca, c, i)
- if (bch2_dev_sysfs_online(c, ca)) {
+ for_each_member_device(ca, c, i) {
+ ret = bch2_dev_sysfs_online(c, ca);
+ if (ret) {
+ bch_err(c, "error creating sysfs objects");
percpu_ref_put(&ca->ref);
goto err;
}
+ }
+ BUG_ON(!list_empty(&c->list));
list_add(&c->list, &bch_fs_list);
- err = NULL;
err:
up_write(&c->state_lock);
- return err;
+ return ret;
}
static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
@@ -688,13 +642,15 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
struct bch_sb_field_members *mi;
struct bch_fs *c;
unsigned i, iter_size;
- const char *err;
+ int ret = 0;
pr_verbose_init(opts, "");
c = kvpmalloc(sizeof(struct bch_fs), GFP_KERNEL|__GFP_ZERO);
- if (!c)
+ if (!c) {
+ c = ERR_PTR(-ENOMEM);
goto out;
+ }
__module_get(THIS_MODULE);
@@ -732,10 +688,9 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
mutex_init(&c->usage_scratch_lock);
mutex_init(&c->bio_bounce_pages_lock);
+ mutex_init(&c->snapshot_table_lock);
- bio_list_init(&c->btree_write_error_list);
spin_lock_init(&c->btree_write_error_lock);
- INIT_WORK(&c->btree_write_error_work, bch2_btree_write_error_work);
INIT_WORK(&c->journal_seq_blacklist_gc_work,
bch2_blacklist_entries_gc);
@@ -752,6 +707,9 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
INIT_LIST_HEAD(&c->ec_stripe_new_list);
mutex_init(&c->ec_stripe_new_lock);
+ INIT_LIST_HEAD(&c->data_progress_list);
+ mutex_init(&c->data_progress_lock);
+
spin_lock_init(&c->ec_stripes_heap_lock);
seqcount_init(&c->gc_pos_lock);
@@ -773,17 +731,16 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
mutex_init(&c->sectors_available_lock);
- if (percpu_init_rwsem(&c->mark_lock))
+ ret = percpu_init_rwsem(&c->mark_lock);
+ if (ret)
goto err;
mutex_lock(&c->sb_lock);
+ ret = bch2_sb_to_fs(c, sb);
+ mutex_unlock(&c->sb_lock);
- if (bch2_sb_to_fs(c, sb)) {
- mutex_unlock(&c->sb_lock);
+ if (ret)
goto err;
- }
-
- mutex_unlock(&c->sb_lock);
scnprintf(c->name, sizeof(c->name), "%pU", &c->sb.user_uuid);
@@ -794,8 +751,11 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
c->block_bits = ilog2(c->opts.block_size);
c->btree_foreground_merge_threshold = BTREE_FOREGROUND_MERGE_THRESHOLD(c);
- if (bch2_fs_init_fault("fs_alloc"))
+ if (bch2_fs_init_fault("fs_alloc")) {
+ bch_err(c, "fs_alloc fault injected");
+ ret = -EFAULT;
goto err;
+ }
iter_size = sizeof(struct sort_iter) +
(btree_blocks(c) + 1) * 2 *
@@ -805,7 +765,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
if (!(c->btree_update_wq = alloc_workqueue("bcachefs",
WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 1)) ||
- !(c->btree_error_wq = alloc_workqueue("bcachefs_error",
+ !(c->btree_io_complete_wq = alloc_workqueue("bcachefs_btree_io",
WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 1)) ||
!(c->copygc_wq = alloc_workqueue("bcachefs_copygc",
WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 1)) ||
@@ -819,33 +779,44 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
offsetof(struct btree_write_bio, wbio.bio)),
BIOSET_NEED_BVECS) ||
!(c->pcpu = alloc_percpu(struct bch_fs_pcpu)) ||
- !(c->btree_iters_bufs = alloc_percpu(struct btree_iter_buf)) ||
+ !(c->btree_paths_bufs = alloc_percpu(struct btree_path_buf)) ||
!(c->online_reserved = alloc_percpu(u64)) ||
mempool_init_kvpmalloc_pool(&c->btree_bounce_pool, 1,
btree_bytes(c)) ||
mempool_init_kmalloc_pool(&c->large_bkey_pool, 1, 2048) ||
!(c->unused_inode_hints = kcalloc(1U << c->inode_shard_bits,
- sizeof(u64), GFP_KERNEL)) ||
- bch2_io_clock_init(&c->io_clock[READ]) ||
- bch2_io_clock_init(&c->io_clock[WRITE]) ||
- bch2_fs_journal_init(&c->journal) ||
- bch2_fs_replicas_init(c) ||
- bch2_fs_btree_cache_init(c) ||
- bch2_fs_btree_key_cache_init(&c->btree_key_cache) ||
- bch2_fs_btree_iter_init(c) ||
- bch2_fs_btree_interior_update_init(c) ||
- bch2_fs_io_init(c) ||
- bch2_fs_encryption_init(c) ||
- bch2_fs_compress_init(c) ||
- bch2_fs_ec_init(c) ||
- bch2_fs_fsio_init(c))
+ sizeof(u64), GFP_KERNEL))) {
+ ret = -ENOMEM;
goto err;
+ }
+
+ ret = bch2_io_clock_init(&c->io_clock[READ]) ?:
+ bch2_io_clock_init(&c->io_clock[WRITE]) ?:
+ bch2_fs_journal_init(&c->journal) ?:
+ bch2_fs_replicas_init(c) ?:
+ bch2_fs_btree_cache_init(c) ?:
+ bch2_fs_btree_key_cache_init(&c->btree_key_cache) ?:
+ bch2_fs_btree_iter_init(c) ?:
+ bch2_fs_btree_interior_update_init(c) ?:
+ bch2_fs_subvolumes_init(c) ?:
+ bch2_fs_io_init(c) ?:
+ bch2_fs_encryption_init(c) ?:
+ bch2_fs_compress_init(c) ?:
+ bch2_fs_ec_init(c) ?:
+ bch2_fs_fsio_init(c);
+ if (ret)
+ goto err;
+
+ if (c->opts.nochanges)
+ set_bit(JOURNAL_NOCHANGES, &c->journal.flags);
mi = bch2_sb_get_members(c->disk_sb.sb);
for (i = 0; i < c->sb.nr_devices; i++)
if (bch2_dev_exists(c->disk_sb.sb, mi, i) &&
- bch2_dev_alloc(c, i))
+ bch2_dev_alloc(c, i)) {
+ ret = -EEXIST;
goto err;
+ }
bch2_journal_entry_res_resize(&c->journal,
&c->btree_root_journal_res,
@@ -856,18 +827,17 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
(sizeof(struct jset_entry_clock) / sizeof(u64)) * 2);
mutex_lock(&bch_fs_list_lock);
- err = bch2_fs_online(c);
+ ret = bch2_fs_online(c);
mutex_unlock(&bch_fs_list_lock);
- if (err) {
- bch_err(c, "bch2_fs_online() error: %s", err);
+
+ if (ret)
goto err;
- }
out:
- pr_verbose_init(opts, "ret %i", c ? 0 : -ENOMEM);
+ pr_verbose_init(opts, "ret %i", PTR_ERR_OR_ZERO(c));
return c;
err:
bch2_fs_free(c);
- c = NULL;
+ c = ERR_PTR(ret);
goto out;
}
@@ -907,7 +877,6 @@ static void print_mount_opts(struct bch_fs *c)
int bch2_fs_start(struct bch_fs *c)
{
- const char *err = "cannot allocate memory";
struct bch_sb_field_members *mi;
struct bch_dev *ca;
time64_t now = ktime_get_real_seconds();
@@ -943,10 +912,11 @@ int bch2_fs_start(struct bch_fs *c)
if (ret)
goto err;
- err = "dynamic fault";
ret = -EINVAL;
- if (bch2_fs_init_fault("fs_start"))
+ if (bch2_fs_init_fault("fs_start")) {
+ bch_err(c, "fs_start fault injected");
goto err;
+ }
set_bit(BCH_FS_STARTED, &c->flags);
@@ -967,7 +937,6 @@ int bch2_fs_start(struct bch_fs *c)
if (c->opts.read_only || c->opts.nochanges) {
bch2_fs_read_only(c);
} else {
- err = "error going read write";
ret = !test_bit(BCH_FS_RW, &c->flags)
? bch2_fs_read_write(c)
: bch2_fs_read_write_late(c);
@@ -985,25 +954,22 @@ err:
case BCH_FSCK_ERRORS_NOT_FIXED:
bch_err(c, "filesystem contains errors: please report this to the developers");
pr_cont("mount with -o fix_errors to repair\n");
- err = "fsck error";
break;
case BCH_FSCK_REPAIR_UNIMPLEMENTED:
bch_err(c, "filesystem contains errors: please report this to the developers");
pr_cont("repair unimplemented: inform the developers so that it can be added\n");
- err = "fsck error";
break;
case BCH_FSCK_REPAIR_IMPOSSIBLE:
bch_err(c, "filesystem contains errors, but repair impossible");
- err = "fsck error";
break;
case BCH_FSCK_UNKNOWN_VERSION:
- err = "unknown metadata version";;
+ bch_err(c, "unknown metadata version");
break;
case -ENOMEM:
- err = "cannot allocate memory";
+ bch_err(c, "cannot allocate memory");
break;
case -EIO:
- err = "IO error";
+ bch_err(c, "IO error");
break;
}
@@ -1065,8 +1031,7 @@ static void bch2_dev_free(struct bch_dev *ca)
if (ca->kobj.state_in_sysfs &&
ca->disk_sb.bdev)
- sysfs_remove_link(&part_to_dev(ca->disk_sb.bdev->bd_part)->kobj,
- "bcachefs");
+ sysfs_remove_link(bdev_kobj(ca->disk_sb.bdev), "bcachefs");
if (ca->kobj.state_in_sysfs)
kobject_del(&ca->kobj);
@@ -1102,10 +1067,7 @@ static void __bch2_dev_offline(struct bch_fs *c, struct bch_dev *ca)
wait_for_completion(&ca->io_ref_completion);
if (ca->kobj.state_in_sysfs) {
- struct kobject *block =
- &part_to_dev(ca->disk_sb.bdev->bd_part)->kobj;
-
- sysfs_remove_link(block, "bcachefs");
+ sysfs_remove_link(bdev_kobj(ca->disk_sb.bdev), "bcachefs");
sysfs_remove_link(&ca->kobj, "block");
}
@@ -1142,12 +1104,12 @@ static int bch2_dev_sysfs_online(struct bch_fs *c, struct bch_dev *ca)
}
if (ca->disk_sb.bdev) {
- struct kobject *block =
- &part_to_dev(ca->disk_sb.bdev->bd_part)->kobj;
+ struct kobject *block = bdev_kobj(ca->disk_sb.bdev);
ret = sysfs_create_link(block, &ca->kobj, "bcachefs");
if (ret)
return ret;
+
ret = sysfs_create_link(&ca->kobj, block, "block");
if (ret)
return ret;
@@ -1427,7 +1389,7 @@ static void __bch2_dev_read_only(struct bch_fs *c, struct bch_dev *ca)
bch2_copygc_start(c);
}
-static const char *__bch2_dev_read_write(struct bch_fs *c, struct bch_dev *ca)
+static int __bch2_dev_read_write(struct bch_fs *c, struct bch_dev *ca)
{
lockdep_assert_held(&c->state_lock);
@@ -1436,10 +1398,7 @@ static const char *__bch2_dev_read_write(struct bch_fs *c, struct bch_dev *ca)
bch2_dev_allocator_add(c, ca);
bch2_recalc_capacity(c);
- if (bch2_dev_allocator_start(ca))
- return "error starting allocator thread";
-
- return NULL;
+ return bch2_dev_allocator_start(ca);
}
int __bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca,
@@ -1465,9 +1424,8 @@ int __bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca,
bch2_write_super(c);
mutex_unlock(&c->sb_lock);
- if (new_state == BCH_MEMBER_STATE_rw &&
- __bch2_dev_read_write(c, ca))
- ret = -ENOMEM;
+ if (new_state == BCH_MEMBER_STATE_rw)
+ ret = __bch2_dev_read_write(c, ca);
rebalance_wakeup(c);
@@ -1497,15 +1455,18 @@ static int bch2_dev_remove_alloc(struct bch_fs *c, struct bch_dev *ca)
bch2_trans_init(&trans, c, 0, 0);
for (i = 0; i < ca->mi.nbuckets; i++) {
- ret = bch2_btree_key_cache_flush(&trans,
- BTREE_ID_alloc, POS(ca->dev_idx, i));
+ ret = lockrestart_do(&trans,
+ bch2_btree_key_cache_flush(&trans,
+ BTREE_ID_alloc, POS(ca->dev_idx, i)));
if (ret)
break;
}
bch2_trans_exit(&trans);
- if (ret)
+ if (ret) {
+ bch_err(c, "error %i removing dev alloc info", ret);
return ret;
+ }
return bch2_btree_delete_range(c, BTREE_ID_alloc,
POS(ca->dev_idx, 0),
@@ -1627,6 +1588,8 @@ int bch2_dev_add(struct bch_fs *c, const char *path)
struct bch_dev *ca = NULL;
struct bch_sb_field_members *mi;
struct bch_member dev_mi;
+ struct bucket_array *buckets;
+ struct bucket *g;
unsigned dev_idx, nr_devices, u64s;
int ret;
@@ -1730,14 +1693,24 @@ have_slot:
bch2_dev_usage_journal_reserve(c);
+ /*
+ * Clear marks before marking transactionally in the btree, so that
+ * per-device accounting gets done correctly:
+ */
+ down_read(&ca->bucket_lock);
+ buckets = bucket_array(ca);
+ for_each_bucket(g, buckets)
+ atomic64_set(&g->_mark.v, 0);
+ up_read(&ca->bucket_lock);
+
err = "error marking superblock";
ret = bch2_trans_mark_dev_sb(c, ca);
if (ret)
goto err_late;
if (ca->mi.state == BCH_MEMBER_STATE_rw) {
- err = __bch2_dev_read_write(c, ca);
- if (err)
+ ret = __bch2_dev_read_write(c, ca);
+ if (ret)
goto err_late;
}
@@ -1781,24 +1754,27 @@ int bch2_dev_online(struct bch_fs *c, const char *path)
dev_idx = sb.sb->dev_idx;
err = bch2_dev_in_fs(c->disk_sb.sb, sb.sb);
- if (err)
+ if (err) {
+ bch_err(c, "error bringing %s online: %s", path, err);
goto err;
+ }
- if (bch2_dev_attach_bdev(c, &sb)) {
- err = "bch2_dev_attach_bdev() error";
+ ret = bch2_dev_attach_bdev(c, &sb);
+ if (ret)
goto err;
- }
ca = bch_dev_locked(c, dev_idx);
- if (bch2_trans_mark_dev_sb(c, ca)) {
- err = "bch2_trans_mark_dev_sb() error";
+ ret = bch2_trans_mark_dev_sb(c, ca);
+ if (ret) {
+ bch_err(c, "error bringing %s online: error %i from bch2_trans_mark_dev_sb",
+ path, ret);
goto err;
}
if (ca->mi.state == BCH_MEMBER_STATE_rw) {
- err = __bch2_dev_read_write(c, ca);
- if (err)
+ ret = __bch2_dev_read_write(c, ca);
+ if (ret)
goto err;
}
@@ -1816,7 +1792,6 @@ int bch2_dev_online(struct bch_fs *c, const char *path)
err:
up_write(&c->state_lock);
bch2_free_super(&sb);
- bch_err(c, "error bringing %s online: %s", path, err);
return -EINVAL;
}
@@ -1890,20 +1865,23 @@ err:
/* return with ref on ca->ref: */
struct bch_dev *bch2_dev_lookup(struct bch_fs *c, const char *path)
{
- struct block_device *bdev = lookup_bdev(path);
struct bch_dev *ca;
+ dev_t dev;
unsigned i;
+ int ret;
- if (IS_ERR(bdev))
- return ERR_CAST(bdev);
+ ret = lookup_bdev(path, &dev);
+ if (ret)
+ return ERR_PTR(ret);
- for_each_member_device(ca, c, i)
- if (ca->disk_sb.bdev == bdev)
+ rcu_read_lock();
+ for_each_member_device_rcu(ca, c, i, NULL)
+ if (ca->disk_sb.bdev->bd_dev == dev)
goto found;
-
ca = ERR_PTR(-ENOENT);
found:
- bdput(bdev);
+ rcu_read_unlock();
+
return ca;
}
@@ -1917,7 +1895,7 @@ struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices,
struct bch_sb_field_members *mi;
unsigned i, best_sb = 0;
const char *err;
- int ret = -ENOMEM;
+ int ret = 0;
pr_verbose_init(opts, "");
@@ -1932,8 +1910,10 @@ struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices,
}
sb = kcalloc(nr_devices, sizeof(*sb), GFP_KERNEL);
- if (!sb)
+ if (!sb) {
+ ret = -ENOMEM;
goto err;
+ }
for (i = 0; i < nr_devices; i++) {
ret = bch2_read_super(devices[i], &opts, &sb[i]);
@@ -1970,18 +1950,20 @@ struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices,
i++;
}
- ret = -ENOMEM;
c = bch2_fs_alloc(sb[best_sb].sb, opts);
- if (!c)
+ if (IS_ERR(c)) {
+ ret = PTR_ERR(c);
goto err;
+ }
- err = "bch2_dev_online() error";
down_write(&c->state_lock);
- for (i = 0; i < nr_devices; i++)
- if (bch2_dev_attach_bdev(c, &sb[i])) {
+ for (i = 0; i < nr_devices; i++) {
+ ret = bch2_dev_attach_bdev(c, &sb[i]);
+ if (ret) {
up_write(&c->state_lock);
- goto err_print;
+ goto err;
}
+ }
up_write(&c->state_lock);
err = "insufficient devices";
@@ -2004,10 +1986,11 @@ err_print:
devices[0], err);
ret = -EINVAL;
err:
- if (c)
+ if (!IS_ERR_OR_NULL(c))
bch2_fs_stop(c);
- for (i = 0; i < nr_devices; i++)
- bch2_free_super(&sb[i]);
+ if (sb)
+ for (i = 0; i < nr_devices; i++)
+ bch2_free_super(&sb[i]);
c = ERR_PTR(ret);
goto out;
}
@@ -2033,12 +2016,12 @@ static const char *__bch2_fs_open_incremental(struct bch_sb_handle *sb,
if (err)
goto err;
} else {
+ allocated_fs = true;
c = bch2_fs_alloc(sb->sb, opts);
- err = "cannot allocate memory";
- if (!c)
- goto err;
- allocated_fs = true;
+ err = "bch2_fs_alloc() error";
+ if (IS_ERR(c))
+ goto err;
}
err = "bch2_dev_online() error";
@@ -2064,7 +2047,7 @@ static const char *__bch2_fs_open_incremental(struct bch_sb_handle *sb,
err:
mutex_unlock(&bch_fs_list_lock);
- if (allocated_fs)
+ if (allocated_fs && !IS_ERR(c))
bch2_fs_stop(c);
else if (c)
closure_put(&c->cl);
diff --git a/fs/bcachefs/super.h b/fs/bcachefs/super.h
index 5cee064995af..739e8fd18176 100644
--- a/fs/bcachefs/super.h
+++ b/fs/bcachefs/super.h
@@ -196,7 +196,6 @@ static inline struct bch_devs_mask bch2_online_devs(struct bch_fs *c)
struct bch_fs *bch2_dev_to_fs(dev_t);
struct bch_fs *bch2_uuid_to_fs(uuid_le);
-int bch2_congested(void *, int);
bool bch2_dev_state_allowed(struct bch_fs *, struct bch_dev *,
enum bch_member_state, int);
diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c
index 84a7acb04d01..864be8601868 100644
--- a/fs/bcachefs/sysfs.c
+++ b/fs/bcachefs/sysfs.c
@@ -155,11 +155,6 @@ read_attribute(congested);
read_attribute(btree_avg_write_size);
-read_attribute(bucket_quantiles_last_read);
-read_attribute(bucket_quantiles_last_write);
-read_attribute(bucket_quantiles_fragmentation);
-read_attribute(bucket_quantiles_oldest_gen);
-
read_attribute(reserve_stats);
read_attribute(btree_cache_size);
read_attribute(compression_stats);
@@ -171,6 +166,7 @@ read_attribute(btree_cache);
read_attribute(btree_key_cache);
read_attribute(btree_transactions);
read_attribute(stripes_heap);
+read_attribute(open_buckets);
read_attribute(internal_uuid);
@@ -202,6 +198,8 @@ read_attribute(new_stripes);
read_attribute(io_timers_read);
read_attribute(io_timers_write);
+read_attribute(data_op_data_progress);
+
#ifdef CONFIG_BCACHEFS_TESTS
write_attribute(perf_test);
#endif /* CONFIG_BCACHEFS_TESTS */
@@ -238,6 +236,37 @@ static size_t bch2_btree_avg_write_size(struct bch_fs *c)
return nr ? div64_u64(sectors, nr) : 0;
}
+static long stats_to_text(struct printbuf *out, struct bch_fs *c,
+ struct bch_move_stats *stats)
+{
+ pr_buf(out, "%s: data type %s btree_id %s position: ",
+ stats->name,
+ bch2_data_types[stats->data_type],
+ bch2_btree_ids[stats->btree_id]);
+ bch2_bpos_to_text(out, stats->pos);
+ pr_buf(out, "%s", "\n");
+
+ return 0;
+}
+
+static long data_progress_to_text(struct printbuf *out, struct bch_fs *c)
+{
+ long ret = 0;
+ struct bch_move_stats *iter;
+
+ mutex_lock(&c->data_progress_lock);
+
+ if (list_empty(&c->data_progress_list))
+ pr_buf(out, "%s", "no progress to report\n");
+ else
+ list_for_each_entry(iter, &c->data_progress_list, list) {
+ stats_to_text(out, c, iter);
+ }
+
+ mutex_unlock(&c->data_progress_lock);
+ return ret;
+}
+
static int fs_alloc_debug_to_text(struct printbuf *out, struct bch_fs *c)
{
struct bch_fs_usage_online *fs_usage = bch2_fs_usage_read(c);
@@ -256,7 +285,7 @@ static int fs_alloc_debug_to_text(struct printbuf *out, struct bch_fs *c)
static int bch2_compression_stats_to_text(struct printbuf *out, struct bch_fs *c)
{
struct btree_trans trans;
- struct btree_iter *iter;
+ struct btree_iter iter;
struct bkey_s_c k;
u64 nr_uncompressed_extents = 0, uncompressed_sectors = 0,
nr_compressed_extents = 0,
@@ -291,8 +320,9 @@ static int bch2_compression_stats_to_text(struct printbuf *out, struct bch_fs *c
break;
}
}
+ bch2_trans_iter_exit(&trans, &iter);
- ret = bch2_trans_exit(&trans) ?: ret;
+ bch2_trans_exit(&trans);
if (ret)
return ret;
@@ -409,6 +439,11 @@ SHOW(bch2_fs)
return out.pos - buf;
}
+ if (attr == &sysfs_open_buckets) {
+ bch2_open_buckets_to_text(&out, c);
+ return out.pos - buf;
+ }
+
if (attr == &sysfs_compression_stats) {
bch2_compression_stats_to_text(&out, c);
return out.pos - buf;
@@ -428,6 +463,11 @@ SHOW(bch2_fs)
return out.pos - buf;
}
+ if (attr == &sysfs_data_op_data_progress) {
+ data_progress_to_text(&out, c);
+ return out.pos - buf;
+ }
+
return 0;
}
@@ -567,6 +607,7 @@ struct attribute *bch2_fs_internal_files[] = {
&sysfs_btree_key_cache,
&sysfs_btree_transactions,
&sysfs_stripes_heap,
+ &sysfs_open_buckets,
&sysfs_read_realloc_races,
&sysfs_extent_migrate_done,
@@ -589,6 +630,8 @@ struct attribute *bch2_fs_internal_files[] = {
&sysfs_io_timers_read,
&sysfs_io_timers_write,
+ &sysfs_data_op_data_progress,
+
&sysfs_internal_uuid,
NULL
};
@@ -703,76 +746,6 @@ struct attribute *bch2_fs_time_stats_files[] = {
NULL
};
-typedef unsigned (bucket_map_fn)(struct bch_fs *, struct bch_dev *,
- size_t, void *);
-
-static unsigned bucket_last_io_fn(struct bch_fs *c, struct bch_dev *ca,
- size_t b, void *private)
-{
- int rw = (private ? 1 : 0);
-
- return atomic64_read(&c->io_clock[rw].now) - bucket(ca, b)->io_time[rw];
-}
-
-static unsigned bucket_sectors_used_fn(struct bch_fs *c, struct bch_dev *ca,
- size_t b, void *private)
-{
- struct bucket *g = bucket(ca, b);
- return bucket_sectors_used(g->mark);
-}
-
-static unsigned bucket_oldest_gen_fn(struct bch_fs *c, struct bch_dev *ca,
- size_t b, void *private)
-{
- return bucket_gc_gen(bucket(ca, b));
-}
-
-static int unsigned_cmp(const void *_l, const void *_r)
-{
- const unsigned *l = _l;
- const unsigned *r = _r;
-
- return cmp_int(*l, *r);
-}
-
-static int quantiles_to_text(struct printbuf *out,
- struct bch_fs *c, struct bch_dev *ca,
- bucket_map_fn *fn, void *private)
-{
- size_t i, n;
- /* Compute 31 quantiles */
- unsigned q[31], *p;
-
- down_read(&ca->bucket_lock);
- n = ca->mi.nbuckets;
-
- p = vzalloc(n * sizeof(unsigned));
- if (!p) {
- up_read(&ca->bucket_lock);
- return -ENOMEM;
- }
-
- for (i = ca->mi.first_bucket; i < n; i++)
- p[i] = fn(c, ca, i, private);
-
- sort(p, n, sizeof(unsigned), unsigned_cmp, NULL);
- up_read(&ca->bucket_lock);
-
- while (n &&
- !p[n - 1])
- --n;
-
- for (i = 0; i < ARRAY_SIZE(q); i++)
- q[i] = p[n * (i + 1) / (ARRAY_SIZE(q) + 1)];
-
- vfree(p);
-
- for (i = 0; i < ARRAY_SIZE(q); i++)
- pr_buf(out, "%u ", q[i]);
- pr_buf(out, "\n");
- return 0;
-}
-
static void reserve_stats_to_text(struct printbuf *out, struct bch_dev *ca)
{
enum alloc_reserve i;
@@ -934,15 +907,6 @@ SHOW(bch2_dev)
clamp(atomic_read(&ca->congested), 0, CONGESTED_MAX)
* 100 / CONGESTED_MAX);
- if (attr == &sysfs_bucket_quantiles_last_read)
- return quantiles_to_text(&out, c, ca, bucket_last_io_fn, (void *) 0) ?: out.pos - buf;
- if (attr == &sysfs_bucket_quantiles_last_write)
- return quantiles_to_text(&out, c, ca, bucket_last_io_fn, (void *) 1) ?: out.pos - buf;
- if (attr == &sysfs_bucket_quantiles_fragmentation)
- return quantiles_to_text(&out, c, ca, bucket_sectors_used_fn, NULL) ?: out.pos - buf;
- if (attr == &sysfs_bucket_quantiles_oldest_gen)
- return quantiles_to_text(&out, c, ca, bucket_oldest_gen_fn, NULL) ?: out.pos - buf;
-
if (attr == &sysfs_reserve_stats) {
reserve_stats_to_text(&out, ca);
return out.pos - buf;
@@ -1034,12 +998,6 @@ struct attribute *bch2_dev_files[] = {
&sysfs_io_latency_stats_write,
&sysfs_congested,
- /* alloc info - other stats: */
- &sysfs_bucket_quantiles_last_read,
- &sysfs_bucket_quantiles_last_write,
- &sysfs_bucket_quantiles_fragmentation,
- &sysfs_bucket_quantiles_oldest_gen,
-
&sysfs_reserve_stats,
/* debug: */
diff --git a/fs/bcachefs/tests.c b/fs/bcachefs/tests.c
index 63f4a83ad1de..d5a74f4db64d 100644
--- a/fs/bcachefs/tests.c
+++ b/fs/bcachefs/tests.c
@@ -29,45 +29,44 @@ static void delete_test_keys(struct bch_fs *c)
static int test_delete(struct bch_fs *c, u64 nr)
{
struct btree_trans trans;
- struct btree_iter *iter;
+ struct btree_iter iter;
struct bkey_i_cookie k;
int ret;
bkey_cookie_init(&k.k_i);
+ k.k.p.snapshot = U32_MAX;
bch2_trans_init(&trans, c, 0, 0);
-
- iter = bch2_trans_get_iter(&trans, BTREE_ID_xattrs, k.k.p,
- BTREE_ITER_INTENT);
-
- ret = bch2_btree_iter_traverse(iter);
- if (ret) {
- bch_err(c, "lookup error in test_delete: %i", ret);
- goto err;
- }
+ bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs, k.k.p,
+ BTREE_ITER_INTENT);
ret = __bch2_trans_do(&trans, NULL, NULL, 0,
- bch2_trans_update(&trans, iter, &k.k_i, 0));
+ bch2_btree_iter_traverse(&iter) ?:
+ bch2_trans_update(&trans, &iter, &k.k_i, 0));
if (ret) {
bch_err(c, "update error in test_delete: %i", ret);
goto err;
}
pr_info("deleting once");
- ret = bch2_btree_delete_at(&trans, iter, 0);
+ ret = __bch2_trans_do(&trans, NULL, NULL, 0,
+ bch2_btree_iter_traverse(&iter) ?:
+ bch2_btree_delete_at(&trans, &iter, 0));
if (ret) {
bch_err(c, "delete error (first) in test_delete: %i", ret);
goto err;
}
pr_info("deleting twice");
- ret = bch2_btree_delete_at(&trans, iter, 0);
+ ret = __bch2_trans_do(&trans, NULL, NULL, 0,
+ bch2_btree_iter_traverse(&iter) ?:
+ bch2_btree_delete_at(&trans, &iter, 0));
if (ret) {
bch_err(c, "delete error (second) in test_delete: %i", ret);
goto err;
}
err:
- bch2_trans_iter_put(&trans, iter);
+ bch2_trans_iter_exit(&trans, &iter);
bch2_trans_exit(&trans);
return ret;
}
@@ -75,39 +74,38 @@ err:
static int test_delete_written(struct bch_fs *c, u64 nr)
{
struct btree_trans trans;
- struct btree_iter *iter;
+ struct btree_iter iter;
struct bkey_i_cookie k;
int ret;
bkey_cookie_init(&k.k_i);
+ k.k.p.snapshot = U32_MAX;
bch2_trans_init(&trans, c, 0, 0);
- iter = bch2_trans_get_iter(&trans, BTREE_ID_xattrs, k.k.p,
- BTREE_ITER_INTENT);
-
- ret = bch2_btree_iter_traverse(iter);
- if (ret) {
- bch_err(c, "lookup error in test_delete_written: %i", ret);
- goto err;
- }
+ bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs, k.k.p,
+ BTREE_ITER_INTENT);
ret = __bch2_trans_do(&trans, NULL, NULL, 0,
- bch2_trans_update(&trans, iter, &k.k_i, 0));
+ bch2_btree_iter_traverse(&iter) ?:
+ bch2_trans_update(&trans, &iter, &k.k_i, 0));
if (ret) {
bch_err(c, "update error in test_delete_written: %i", ret);
goto err;
}
+ bch2_trans_unlock(&trans);
bch2_journal_flush_all_pins(&c->journal);
- ret = bch2_btree_delete_at(&trans, iter, 0);
+ ret = __bch2_trans_do(&trans, NULL, NULL, 0,
+ bch2_btree_iter_traverse(&iter) ?:
+ bch2_btree_delete_at(&trans, &iter, 0));
if (ret) {
bch_err(c, "delete error in test_delete_written: %i", ret);
goto err;
}
err:
- bch2_trans_iter_put(&trans, iter);
+ bch2_trans_iter_exit(&trans, &iter);
bch2_trans_exit(&trans);
return ret;
}
@@ -115,7 +113,7 @@ err:
static int test_iterate(struct bch_fs *c, u64 nr)
{
struct btree_trans trans;
- struct btree_iter *iter = NULL;
+ struct btree_iter iter = { NULL };
struct bkey_s_c k;
u64 i;
int ret = 0;
@@ -131,6 +129,7 @@ static int test_iterate(struct bch_fs *c, u64 nr)
bkey_cookie_init(&k.k_i);
k.k.p.offset = i;
+ k.k.p.snapshot = U32_MAX;
ret = bch2_btree_insert(c, BTREE_ID_xattrs, &k.k_i,
NULL, NULL, 0);
@@ -156,12 +155,12 @@ static int test_iterate(struct bch_fs *c, u64 nr)
pr_info("iterating backwards");
- while (!IS_ERR_OR_NULL((k = bch2_btree_iter_prev(iter)).k))
+ while (!IS_ERR_OR_NULL((k = bch2_btree_iter_prev(&iter)).k))
BUG_ON(k.k->p.offset != --i);
BUG_ON(i);
err:
- bch2_trans_iter_put(&trans, iter);
+ bch2_trans_iter_exit(&trans, &iter);
bch2_trans_exit(&trans);
return ret;
}
@@ -169,7 +168,7 @@ err:
static int test_iterate_extents(struct bch_fs *c, u64 nr)
{
struct btree_trans trans;
- struct btree_iter *iter = NULL;
+ struct btree_iter iter = { NULL };
struct bkey_s_c k;
u64 i;
int ret = 0;
@@ -185,6 +184,7 @@ static int test_iterate_extents(struct bch_fs *c, u64 nr)
bkey_cookie_init(&k.k_i);
k.k.p.offset = i + 8;
+ k.k.p.snapshot = U32_MAX;
k.k.size = 8;
ret = bch2_btree_insert(c, BTREE_ID_extents, &k.k_i,
@@ -209,14 +209,14 @@ static int test_iterate_extents(struct bch_fs *c, u64 nr)
pr_info("iterating backwards");
- while (!IS_ERR_OR_NULL((k = bch2_btree_iter_prev(iter)).k)) {
+ while (!IS_ERR_OR_NULL((k = bch2_btree_iter_prev(&iter)).k)) {
BUG_ON(k.k->p.offset != i);
i = bkey_start_offset(k.k);
}
BUG_ON(i);
err:
- bch2_trans_iter_put(&trans, iter);
+ bch2_trans_iter_exit(&trans, &iter);
bch2_trans_exit(&trans);
return ret;
}
@@ -224,7 +224,7 @@ err:
static int test_iterate_slots(struct bch_fs *c, u64 nr)
{
struct btree_trans trans;
- struct btree_iter *iter;
+ struct btree_iter iter = { NULL };
struct bkey_s_c k;
u64 i;
int ret = 0;
@@ -240,6 +240,7 @@ static int test_iterate_slots(struct bch_fs *c, u64 nr)
bkey_cookie_init(&k.k_i);
k.k.p.offset = i * 2;
+ k.k.p.snapshot = U32_MAX;
ret = bch2_btree_insert(c, BTREE_ID_xattrs, &k.k_i,
NULL, NULL, 0);
@@ -261,7 +262,7 @@ static int test_iterate_slots(struct bch_fs *c, u64 nr)
BUG_ON(k.k->p.offset != i);
i += 2;
}
- bch2_trans_iter_put(&trans, iter);
+ bch2_trans_iter_exit(&trans, &iter);
BUG_ON(i != nr * 2);
@@ -278,7 +279,7 @@ static int test_iterate_slots(struct bch_fs *c, u64 nr)
if (i == nr * 2)
break;
}
- bch2_trans_iter_put(&trans, iter);
+ bch2_trans_iter_exit(&trans, &iter);
err:
bch2_trans_exit(&trans);
return ret;
@@ -287,7 +288,7 @@ err:
static int test_iterate_slots_extents(struct bch_fs *c, u64 nr)
{
struct btree_trans trans;
- struct btree_iter *iter;
+ struct btree_iter iter = { NULL };
struct bkey_s_c k;
u64 i;
int ret = 0;
@@ -303,6 +304,7 @@ static int test_iterate_slots_extents(struct bch_fs *c, u64 nr)
bkey_cookie_init(&k.k_i);
k.k.p.offset = i + 16;
+ k.k.p.snapshot = U32_MAX;
k.k.size = 8;
ret = bch2_btree_insert(c, BTREE_ID_extents, &k.k_i,
@@ -323,7 +325,7 @@ static int test_iterate_slots_extents(struct bch_fs *c, u64 nr)
BUG_ON(k.k->size != 8);
i += 16;
}
- bch2_trans_iter_put(&trans, iter);
+ bch2_trans_iter_exit(&trans, &iter);
BUG_ON(i != nr);
@@ -342,7 +344,7 @@ static int test_iterate_slots_extents(struct bch_fs *c, u64 nr)
if (i == nr)
break;
}
- bch2_trans_iter_put(&trans, iter);
+ bch2_trans_iter_exit(&trans, &iter);
err:
bch2_trans_exit(&trans);
return 0;
@@ -355,21 +357,19 @@ err:
static int test_peek_end(struct bch_fs *c, u64 nr)
{
struct btree_trans trans;
- struct btree_iter *iter;
+ struct btree_iter iter;
struct bkey_s_c k;
bch2_trans_init(&trans, c, 0, 0);
+ bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs, POS_MIN, 0);
- iter = bch2_trans_get_iter(&trans, BTREE_ID_xattrs, POS_MIN, 0);
-
- k = bch2_btree_iter_peek(iter);
+ k = bch2_btree_iter_peek(&iter);
BUG_ON(k.k);
- k = bch2_btree_iter_peek(iter);
+ k = bch2_btree_iter_peek(&iter);
BUG_ON(k.k);
- bch2_trans_iter_put(&trans, iter);
-
+ bch2_trans_iter_exit(&trans, &iter);
bch2_trans_exit(&trans);
return 0;
}
@@ -377,21 +377,19 @@ static int test_peek_end(struct bch_fs *c, u64 nr)
static int test_peek_end_extents(struct bch_fs *c, u64 nr)
{
struct btree_trans trans;
- struct btree_iter *iter;
+ struct btree_iter iter;
struct bkey_s_c k;
bch2_trans_init(&trans, c, 0, 0);
+ bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents, POS_MIN, 0);
- iter = bch2_trans_get_iter(&trans, BTREE_ID_extents, POS_MIN, 0);
-
- k = bch2_btree_iter_peek(iter);
+ k = bch2_btree_iter_peek(&iter);
BUG_ON(k.k);
- k = bch2_btree_iter_peek(iter);
+ k = bch2_btree_iter_peek(&iter);
BUG_ON(k.k);
- bch2_trans_iter_put(&trans, iter);
-
+ bch2_trans_iter_exit(&trans, &iter);
bch2_trans_exit(&trans);
return 0;
}
@@ -410,6 +408,7 @@ static int insert_test_extent(struct bch_fs *c,
bkey_cookie_init(&k.k_i);
k.k_i.k.p.offset = end;
+ k.k_i.k.p.snapshot = U32_MAX;
k.k_i.k.size = end - start;
k.k_i.k.version.lo = test_version++;
@@ -536,18 +535,18 @@ static int rand_insert_multi(struct bch_fs *c, u64 nr)
static int rand_lookup(struct bch_fs *c, u64 nr)
{
struct btree_trans trans;
- struct btree_iter *iter;
+ struct btree_iter iter;
struct bkey_s_c k;
int ret = 0;
u64 i;
bch2_trans_init(&trans, c, 0, 0);
- iter = bch2_trans_get_iter(&trans, BTREE_ID_xattrs, POS_MIN, 0);
+ bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs, POS_MIN, 0);
for (i = 0; i < nr; i++) {
- bch2_btree_iter_set_pos(iter, POS(0, test_rand()));
+ bch2_btree_iter_set_pos(&iter, POS(0, test_rand()));
- k = bch2_btree_iter_peek(iter);
+ k = bch2_btree_iter_peek(&iter);
ret = bkey_err(k);
if (ret) {
bch_err(c, "error in rand_lookup: %i", ret);
@@ -555,62 +554,73 @@ static int rand_lookup(struct bch_fs *c, u64 nr)
}
}
- bch2_trans_iter_put(&trans, iter);
+ bch2_trans_iter_exit(&trans, &iter);
bch2_trans_exit(&trans);
return ret;
}
+static int rand_mixed_trans(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bkey_i_cookie *cookie,
+ u64 i, u64 pos)
+{
+ struct bkey_s_c k;
+ int ret;
+
+ bch2_btree_iter_set_pos(iter, POS(0, pos));
+
+ k = bch2_btree_iter_peek(iter);
+ ret = bkey_err(k);
+ if (ret && ret != -EINTR)
+ bch_err(trans->c, "lookup error in rand_mixed: %i", ret);
+ if (ret)
+ return ret;
+
+ if (!(i & 3) && k.k) {
+ bkey_cookie_init(&cookie->k_i);
+ cookie->k.p = iter->pos;
+ bch2_trans_update(trans, iter, &cookie->k_i, 0);
+ }
+
+ return 0;
+}
+
static int rand_mixed(struct bch_fs *c, u64 nr)
{
struct btree_trans trans;
- struct btree_iter *iter;
- struct bkey_s_c k;
+ struct btree_iter iter;
+ struct bkey_i_cookie cookie;
int ret = 0;
- u64 i;
+ u64 i, rand;
bch2_trans_init(&trans, c, 0, 0);
- iter = bch2_trans_get_iter(&trans, BTREE_ID_xattrs, POS_MIN, 0);
+ bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs, POS_MIN, 0);
for (i = 0; i < nr; i++) {
- bch2_btree_iter_set_pos(iter, POS(0, test_rand()));
-
- k = bch2_btree_iter_peek(iter);
- ret = bkey_err(k);
+ rand = test_rand();
+ ret = __bch2_trans_do(&trans, NULL, NULL, 0,
+ rand_mixed_trans(&trans, &iter, &cookie, i, rand));
if (ret) {
- bch_err(c, "lookup error in rand_mixed: %i", ret);
+ bch_err(c, "update error in rand_mixed: %i", ret);
break;
}
-
- if (!(i & 3) && k.k) {
- struct bkey_i_cookie k;
-
- bkey_cookie_init(&k.k_i);
- k.k.p = iter->pos;
-
- ret = __bch2_trans_do(&trans, NULL, NULL, 0,
- bch2_trans_update(&trans, iter, &k.k_i, 0));
- if (ret) {
- bch_err(c, "update error in rand_mixed: %i", ret);
- break;
- }
- }
}
- bch2_trans_iter_put(&trans, iter);
+ bch2_trans_iter_exit(&trans, &iter);
bch2_trans_exit(&trans);
return ret;
}
static int __do_delete(struct btree_trans *trans, struct bpos pos)
{
- struct btree_iter *iter;
+ struct btree_iter iter;
struct bkey_i delete;
struct bkey_s_c k;
int ret = 0;
- iter = bch2_trans_get_iter(trans, BTREE_ID_xattrs, pos,
- BTREE_ITER_INTENT);
- k = bch2_btree_iter_peek(iter);
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs, pos,
+ BTREE_ITER_INTENT);
+ k = bch2_btree_iter_peek(&iter);
ret = bkey_err(k);
if (ret)
goto err;
@@ -621,9 +631,9 @@ static int __do_delete(struct btree_trans *trans, struct bpos pos)
bkey_init(&delete.k);
delete.k.p = k.k->p;
- ret = bch2_trans_update(trans, iter, &delete, 0);
+ ret = bch2_trans_update(trans, &iter, &delete, 0);
err:
- bch2_trans_iter_put(trans, iter);
+ bch2_trans_iter_exit(trans, &iter);
return ret;
}
@@ -653,7 +663,7 @@ static int rand_delete(struct bch_fs *c, u64 nr)
static int seq_insert(struct bch_fs *c, u64 nr)
{
struct btree_trans trans;
- struct btree_iter *iter;
+ struct btree_iter iter;
struct bkey_s_c k;
struct bkey_i_cookie insert;
int ret = 0;
@@ -665,10 +675,11 @@ static int seq_insert(struct bch_fs *c, u64 nr)
for_each_btree_key(&trans, iter, BTREE_ID_xattrs, POS_MIN,
BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) {
- insert.k.p = iter->pos;
+ insert.k.p = iter.pos;
ret = __bch2_trans_do(&trans, NULL, NULL, 0,
- bch2_trans_update(&trans, iter, &insert.k_i, 0));
+ bch2_btree_iter_traverse(&iter) ?:
+ bch2_trans_update(&trans, &iter, &insert.k_i, 0));
if (ret) {
bch_err(c, "error in seq_insert: %i", ret);
break;
@@ -677,7 +688,7 @@ static int seq_insert(struct bch_fs *c, u64 nr)
if (++i == nr)
break;
}
- bch2_trans_iter_put(&trans, iter);
+ bch2_trans_iter_exit(&trans, &iter);
bch2_trans_exit(&trans);
return ret;
@@ -686,7 +697,7 @@ static int seq_insert(struct bch_fs *c, u64 nr)
static int seq_lookup(struct bch_fs *c, u64 nr)
{
struct btree_trans trans;
- struct btree_iter *iter;
+ struct btree_iter iter;
struct bkey_s_c k;
int ret = 0;
@@ -694,7 +705,7 @@ static int seq_lookup(struct bch_fs *c, u64 nr)
for_each_btree_key(&trans, iter, BTREE_ID_xattrs, POS_MIN, 0, k, ret)
;
- bch2_trans_iter_put(&trans, iter);
+ bch2_trans_iter_exit(&trans, &iter);
bch2_trans_exit(&trans);
return ret;
@@ -703,7 +714,7 @@ static int seq_lookup(struct bch_fs *c, u64 nr)
static int seq_overwrite(struct bch_fs *c, u64 nr)
{
struct btree_trans trans;
- struct btree_iter *iter;
+ struct btree_iter iter;
struct bkey_s_c k;
int ret = 0;
@@ -716,13 +727,14 @@ static int seq_overwrite(struct bch_fs *c, u64 nr)
bkey_reassemble(&u.k_i, k);
ret = __bch2_trans_do(&trans, NULL, NULL, 0,
- bch2_trans_update(&trans, iter, &u.k_i, 0));
+ bch2_btree_iter_traverse(&iter) ?:
+ bch2_trans_update(&trans, &iter, &u.k_i, 0));
if (ret) {
bch_err(c, "error in seq_overwrite: %i", ret);
break;
}
}
- bch2_trans_iter_put(&trans, iter);
+ bch2_trans_iter_exit(&trans, &iter);
bch2_trans_exit(&trans);
return ret;
@@ -771,7 +783,7 @@ static int btree_perf_test_thread(void *data)
wait_event(j->ready_wait, !atomic_read(&j->ready));
}
- ret = j->fn(j->c, j->nr / j->nr_threads);
+ ret = j->fn(j->c, div64_u64(j->nr, j->nr_threads));
if (ret)
j->ret = ret;
@@ -847,11 +859,11 @@ int bch2_btree_perf_test(struct bch_fs *c, const char *testname,
scnprintf(name_buf, sizeof(name_buf), "%s:", testname);
bch2_hprint(&PBUF(nr_buf), nr);
- bch2_hprint(&PBUF(per_sec_buf), nr * NSEC_PER_SEC / time);
+ bch2_hprint(&PBUF(per_sec_buf), div64_u64(nr * NSEC_PER_SEC, time));
printk(KERN_INFO "%-12s %s with %u threads in %5llu sec, %5llu nsec per iter, %5s per sec\n",
name_buf, nr_buf, nr_threads,
- time / NSEC_PER_SEC,
- time * nr_threads / nr,
+ div_u64(time, NSEC_PER_SEC),
+ div_u64(time * nr_threads, nr),
per_sec_buf);
return j.ret;
}
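A note on the perf-test hunks above: plain '/' on two 64-bit operands makes the compiler emit a libgcc call (__udivdi3) on 32-bit architectures, which the kernel does not link against, so 64-bit division has to go through the linux/math64.h helpers. A minimal sketch of the pattern, using a hypothetical ns_per_iter() helper that is not part of this patch:

#include <linux/math64.h>

/* Hypothetical example: divide two u64 values without emitting __udivdi3. */
static u64 ns_per_iter(u64 total_ns, u64 iters)
{
	return iters ? div64_u64(total_ns, iters) : 0;
}

div_u64() is the cheaper variant for a divisor known to fit in 32 bits, which is why the NSEC_PER_SEC division above uses it while the per-second throughput calculation uses div64_u64().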
diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c
index e3ad26e244ab..52de7c49cacb 100644
--- a/fs/bcachefs/util.c
+++ b/fs/bcachefs/util.c
@@ -525,7 +525,11 @@ int bch2_bio_alloc_pages(struct bio *bio, size_t size, gfp_t gfp_mask)
if (!page)
return -ENOMEM;
- BUG_ON(!bio_add_page(bio, page, len, 0));
+ if (unlikely(!bio_add_page(bio, page, len, 0))) {
+ __free_page(page);
+ break;
+ }
+
size -= len;
}
@@ -890,6 +894,7 @@ u64 *bch2_acc_percpu_u64s(u64 __percpu *p, unsigned nr)
u64 *ret;
int cpu;
+ /* access to pcpu vars has to be blocked by other locking */
preempt_disable();
ret = this_cpu_ptr(p);
preempt_enable();
diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h
index 4c67ea2866c6..80402b398442 100644
--- a/fs/bcachefs/util.h
+++ b/fs/bcachefs/util.h
@@ -18,9 +18,6 @@
#include <linux/vmalloc.h>
#include <linux/workqueue.h>
-#define PAGE_SECTOR_SHIFT (PAGE_SHIFT - 9)
-#define PAGE_SECTORS (1UL << PAGE_SECTOR_SHIFT)
-
struct closure;
#ifdef CONFIG_BCACHEFS_DEBUG
@@ -88,7 +85,7 @@ static inline void *vpmalloc(size_t size, gfp_t gfp_mask)
{
return (void *) __get_free_pages(gfp_mask|__GFP_NOWARN,
get_order(size)) ?:
- __vmalloc(size, gfp_mask, PAGE_KERNEL);
+ __vmalloc(size, gfp_mask);
}
static inline void kvpfree(void *p, size_t size)
@@ -653,35 +650,6 @@ static inline void memset_u64s_tail(void *s, int c, unsigned bytes)
memset(s + bytes, c, rem);
}
-static inline struct bio_vec next_contig_bvec(struct bio *bio,
- struct bvec_iter *iter)
-{
- struct bio_vec bv = bio_iter_iovec(bio, *iter);
-
- bio_advance_iter(bio, iter, bv.bv_len);
-#ifndef CONFIG_HIGHMEM
- while (iter->bi_size) {
- struct bio_vec next = bio_iter_iovec(bio, *iter);
-
- if (page_address(bv.bv_page) + bv.bv_offset + bv.bv_len !=
- page_address(next.bv_page) + next.bv_offset)
- break;
-
- bv.bv_len += next.bv_len;
- bio_advance_iter(bio, iter, next.bv_len);
- }
-#endif
- return bv;
-}
-
-#define __bio_for_each_contig_segment(bv, bio, iter, start) \
- for (iter = (start); \
- (iter).bi_size && \
- ((bv = next_contig_bvec((bio), &(iter))), 1);)
-
-#define bio_for_each_contig_segment(bv, bio, iter) \
- __bio_for_each_contig_segment(bv, bio, iter, (bio)->bi_iter)
-
void sort_cmp_size(void *base, size_t num, size_t size,
int (*cmp_func)(const void *, const void *, size_t),
void (*swap_func)(void *, void *, size_t));
@@ -741,10 +709,7 @@ static inline void percpu_u64_set(u64 __percpu *dst, u64 src)
for_each_possible_cpu(cpu)
*per_cpu_ptr(dst, cpu) = 0;
-
- preempt_disable();
- *this_cpu_ptr(dst) = src;
- preempt_enable();
+ this_cpu_write(*dst, src);
}
static inline void acc_u64s(u64 *acc, const u64 *src, unsigned nr)
diff --git a/fs/bcachefs/varint.c b/fs/bcachefs/varint.c
index a3d252c741c8..a2d6bb7136c7 100644
--- a/fs/bcachefs/varint.c
+++ b/fs/bcachefs/varint.c
@@ -1,10 +1,22 @@
// SPDX-License-Identifier: GPL-2.0
#include <linux/bitops.h>
+#include <linux/string.h>
#include <asm/unaligned.h>
+#ifdef CONFIG_VALGRIND
+#include <valgrind/memcheck.h>
+#endif
+
#include "varint.h"
+/**
+ * bch2_varint_encode - encode a variable length integer
+ * @out - destination to encode to
+ * @v - unsigned integer to encode
+ *
+ * Returns the size in bytes of the encoded integer - at most 9 bytes
+ */
int bch2_varint_encode(u8 *out, u64 v)
{
unsigned bits = fls64(v|1);
@@ -13,19 +25,85 @@ int bch2_varint_encode(u8 *out, u64 v)
if (likely(bytes < 9)) {
v <<= bytes;
v |= ~(~0 << (bytes - 1));
+ v = cpu_to_le64(v);
+ memcpy(out, &v, bytes);
} else {
*out++ = 255;
bytes = 9;
+ put_unaligned_le64(v, out);
}
- put_unaligned_le64(v, out);
return bytes;
}
+/**
+ * bch2_varint_decode - decode a variable length integer
+ * @in - varint to decode
+ * @end - end of buffer to decode from
+ * @out - on success, decoded integer
+ *
+ * Returns the size in bytes of the decoded integer - or -1 on failure (would
+ * have read past the end of the buffer)
+ */
int bch2_varint_decode(const u8 *in, const u8 *end, u64 *out)
{
+ unsigned bytes = likely(in < end)
+ ? ffz(*in & 255) + 1
+ : 1;
+ u64 v;
+
+ if (unlikely(in + bytes > end))
+ return -1;
+
+ if (likely(bytes < 9)) {
+ v = 0;
+ memcpy(&v, in, bytes);
+ v = le64_to_cpu(v);
+ v >>= bytes;
+ } else {
+ v = get_unaligned_le64(++in);
+ }
+
+ *out = v;
+ return bytes;
+}
+
+/**
+ * bch2_varint_encode_fast - fast version of bch2_varint_encode
+ *
+ * This version assumes it's always safe to write 8 bytes to @out, even if the
+ * encoded integer would be smaller.
+ */
+int bch2_varint_encode_fast(u8 *out, u64 v)
+{
+ unsigned bits = fls64(v|1);
+ unsigned bytes = DIV_ROUND_UP(bits, 7);
+
+ if (likely(bytes < 9)) {
+ v <<= bytes;
+ v |= ~(~0 << (bytes - 1));
+ } else {
+ *out++ = 255;
+ bytes = 9;
+ }
+
+ put_unaligned_le64(v, out);
+ return bytes;
+}
+
+/**
+ * bch2_varint_decode_fast - fast version of bch2_varint_decode
+ *
+ * This version assumes that it is safe to read at most 8 bytes past the end of
+ * @end (we still return an error if the varint extends past @end).
+ */
+int bch2_varint_decode_fast(const u8 *in, const u8 *end, u64 *out)
+{
+#ifdef CONFIG_VALGRIND
+ VALGRIND_MAKE_MEM_DEFINED(in, 8);
+#endif
u64 v = get_unaligned_le64(in);
- unsigned bytes = ffz(v & 255) + 1;
+ unsigned bytes = ffz(*in) + 1;
if (unlikely(in + bytes > end))
return -1;
diff --git a/fs/bcachefs/varint.h b/fs/bcachefs/varint.h
index 8daf813576b7..92a182fb3d7a 100644
--- a/fs/bcachefs/varint.h
+++ b/fs/bcachefs/varint.h
@@ -5,4 +5,7 @@
int bch2_varint_encode(u8 *, u64);
int bch2_varint_decode(const u8 *, const u8 *, u64 *);
+int bch2_varint_encode_fast(u8 *, u64);
+int bch2_varint_decode_fast(const u8 *, const u8 *, u64 *);
+
#endif /* _BCACHEFS_VARINT_H */
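The format documented by the new kernel-doc comments stores the length in the low bits of the first byte: a value needing N bytes (N < 9) is shifted left by N with the low N-1 bits set, so the position of the first zero bit gives the length; wider values get a 255 marker byte followed by the raw 8-byte little-endian value. A userspace sketch of that scheme (not the kernel implementation: the sketch_varint_* names are made up, memcpy stands in for the unaligned helpers, and a little-endian host is assumed):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

static int sketch_varint_encode(uint8_t *out, uint64_t v)
{
	unsigned bits  = 64 - __builtin_clzll(v | 1);	/* fls64(v|1) */
	unsigned bytes = (bits + 6) / 7;		/* DIV_ROUND_UP(bits, 7) */

	if (bytes < 9) {
		/* low (bytes - 1) one bits, then a zero bit, then the value */
		v = (v << bytes) | ((1ULL << (bytes - 1)) - 1);
		memcpy(out, &v, bytes);
	} else {
		out[0] = 255;				/* marker: raw 8 bytes follow */
		bytes = 9;
		memcpy(out + 1, &v, 8);
	}
	return bytes;
}

static int sketch_varint_decode(const uint8_t *in, uint64_t *out)
{
	unsigned bytes = __builtin_ctz(~(unsigned) in[0]) + 1;	/* ffz(in[0]) + 1 */
	uint64_t v = 0;

	if (bytes < 9) {
		memcpy(&v, in, bytes);
		v >>= bytes;				/* drop the length prefix */
	} else {
		memcpy(&v, in + 1, 8);
	}
	*out = v;
	return bytes;
}

int main(void)
{
	uint8_t buf[9];
	uint64_t v;
	int len = sketch_varint_encode(buf, 200);	/* 200 needs 8 bits -> 2 bytes */

	sketch_varint_decode(buf, &v);
	printf("encoded 200 in %d bytes, decoded %llu\n", len, (unsigned long long) v);
	return 0;
}

The _fast variants added above implement the same scheme but always read/write a full 8 bytes, trading stricter buffer-size requirements for fewer branches.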
diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c
index e7b40b3ca4aa..464ed68318e7 100644
--- a/fs/bcachefs/xattr.c
+++ b/fs/bcachefs/xattr.c
@@ -118,26 +118,28 @@ void bch2_xattr_to_text(struct printbuf *out, struct bch_fs *c,
le16_to_cpu(xattr.v->x_val_len));
}
-int bch2_xattr_get(struct bch_fs *c, struct bch_inode_info *inode,
- const char *name, void *buffer, size_t size, int type)
+static int bch2_xattr_get_trans(struct btree_trans *trans, struct bch_inode_info *inode,
+ const char *name, void *buffer, size_t size, int type)
{
- struct bch_hash_info hash = bch2_hash_info_init(c, &inode->ei_inode);
- struct btree_trans trans;
- struct btree_iter *iter;
+ struct bch_hash_info hash = bch2_hash_info_init(trans->c, &inode->ei_inode);
+ struct btree_iter iter;
struct bkey_s_c_xattr xattr;
+ struct bkey_s_c k;
int ret;
- bch2_trans_init(&trans, c, 0, 0);
+ ret = bch2_hash_lookup(trans, &iter, bch2_xattr_hash_desc, &hash,
+ inode_inum(inode),
+ &X_SEARCH(type, name, strlen(name)),
+ 0);
+ if (ret)
+ goto err1;
- iter = bch2_hash_lookup(&trans, bch2_xattr_hash_desc, &hash,
- inode->v.i_ino,
- &X_SEARCH(type, name, strlen(name)),
- 0);
- ret = PTR_ERR_OR_ZERO(iter);
+ k = bch2_btree_iter_peek_slot(&iter);
+ ret = bkey_err(k);
if (ret)
- goto err;
+ goto err2;
- xattr = bkey_s_c_to_xattr(bch2_btree_iter_peek_slot(iter));
+ xattr = bkey_s_c_to_xattr(k);
ret = le16_to_cpu(xattr.v->x_val_len);
if (buffer) {
if (ret > size)
@@ -145,21 +147,42 @@ int bch2_xattr_get(struct bch_fs *c, struct bch_inode_info *inode,
else
memcpy(buffer, xattr_val(xattr.v), ret);
}
- bch2_trans_iter_put(&trans, iter);
-err:
- bch2_trans_exit(&trans);
-
- BUG_ON(ret == -EINTR);
+err2:
+ bch2_trans_iter_exit(trans, &iter);
+err1:
return ret == -ENOENT ? -ENODATA : ret;
}
-int bch2_xattr_set(struct btree_trans *trans, u64 inum,
+int bch2_xattr_get(struct bch_fs *c, struct bch_inode_info *inode,
+ const char *name, void *buffer, size_t size, int type)
+{
+ return bch2_trans_do(c, NULL, NULL, 0,
+ bch2_xattr_get_trans(&trans, inode, name, buffer, size, type));
+}
+
+int bch2_xattr_set(struct btree_trans *trans, subvol_inum inum,
const struct bch_hash_info *hash_info,
const char *name, const void *value, size_t size,
int type, int flags)
{
+ struct btree_iter inode_iter = { NULL };
+ struct bch_inode_unpacked inode_u;
int ret;
+ /*
+ * We need to do an inode update so that bi_journal_sync gets updated
+ * and fsync works:
+ *
+ * Perhaps we should be updating bi_mtime too?
+ */
+
+ ret = bch2_inode_peek(trans, &inode_iter, &inode_u, inum, BTREE_ITER_INTENT) ?:
+ bch2_inode_write(trans, &inode_iter, &inode_u);
+ bch2_trans_iter_exit(trans, &inode_iter);
+
+ if (ret)
+ return ret;
+
if (value) {
struct bkey_i_xattr *xattr;
unsigned namelen = strlen(name);
@@ -272,16 +295,24 @@ ssize_t bch2_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size)
struct bch_fs *c = dentry->d_sb->s_fs_info;
struct bch_inode_info *inode = to_bch_ei(dentry->d_inode);
struct btree_trans trans;
- struct btree_iter *iter;
+ struct btree_iter iter;
struct bkey_s_c k;
struct xattr_buf buf = { .buf = buffer, .len = buffer_size };
- u64 inum = dentry->d_inode->i_ino;
+ u64 offset = 0, inum = inode->ei_inode.bi_inum;
+ u32 snapshot;
int ret;
bch2_trans_init(&trans, c, 0, 0);
+retry:
+ bch2_trans_begin(&trans);
+ iter = (struct btree_iter) { NULL };
- for_each_btree_key(&trans, iter, BTREE_ID_xattrs,
- POS(inum, 0), 0, k, ret) {
+ ret = bch2_subvolume_get_snapshot(&trans, inode->ei_subvol, &snapshot);
+ if (ret)
+ goto err;
+
+ for_each_btree_key_norestart(&trans, iter, BTREE_ID_xattrs,
+ SPOS(inum, offset, snapshot), 0, k, ret) {
BUG_ON(k.k->p.inode < inum);
if (k.k->p.inode > inum)
@@ -294,9 +325,14 @@ ssize_t bch2_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size)
if (ret)
break;
}
- bch2_trans_iter_put(&trans, iter);
- ret = bch2_trans_exit(&trans) ?: ret;
+ offset = iter.pos.offset;
+ bch2_trans_iter_exit(&trans, &iter);
+err:
+ if (ret == -EINTR)
+ goto retry;
+
+ bch2_trans_exit(&trans);
if (ret)
return ret;
@@ -323,6 +359,7 @@ static int bch2_xattr_get_handler(const struct xattr_handler *handler,
}
static int bch2_xattr_set_handler(const struct xattr_handler *handler,
+ struct user_namespace *mnt_userns,
struct dentry *dentry, struct inode *vinode,
const char *name, const void *value,
size_t size, int flags)
@@ -331,8 +368,8 @@ static int bch2_xattr_set_handler(const struct xattr_handler *handler,
struct bch_fs *c = inode->v.i_sb->s_fs_info;
struct bch_hash_info hash = bch2_hash_info_init(c, &inode->ei_inode);
- return bch2_trans_do(c, NULL, &inode->ei_journal_seq, 0,
- bch2_xattr_set(&trans, inode->v.i_ino, &hash,
+ return bch2_trans_do(c, NULL, NULL, 0,
+ bch2_xattr_set(&trans, inode_inum(inode), &hash,
name, value, size,
handler->flags, flags));
}
@@ -455,6 +492,7 @@ static int inode_opt_set_fn(struct bch_inode_info *inode,
}
static int bch2_xattr_bcachefs_set(const struct xattr_handler *handler,
+ struct user_namespace *mnt_userns,
struct dentry *dentry, struct inode *vinode,
const char *name, const void *value,
size_t size, int flags)
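The bch2_xattr_list() rewrite above replaces for_each_btree_key() with an explicit retry loop: transaction restarts surface as -EINTR, and the caller re-runs the lookup from bch2_trans_begin(), resuming at the last iterator position it recorded. A stripped-down sketch of that pattern using the same calls the hunk introduces (illustrative only, not buildable outside the bcachefs tree; c, inum, subvol and the xattr emission stand in for what the real function derives from the dentry):

	struct btree_trans trans;
	struct btree_iter iter;
	struct bkey_s_c k;
	u64 offset = 0;
	u32 snapshot;
	int ret;

	bch2_trans_init(&trans, c, 0, 0);
retry:
	bch2_trans_begin(&trans);
	iter = (struct btree_iter) { NULL };

	ret = bch2_subvolume_get_snapshot(&trans, subvol, &snapshot);
	if (ret)
		goto err;

	for_each_btree_key_norestart(&trans, iter, BTREE_ID_xattrs,
				     SPOS(inum, offset, snapshot), 0, k, ret) {
		if (k.k->p.inode > inum)
			break;
		/* ... emit the xattr name, as the real function does ... */
	}

	offset = iter.pos.offset;	/* resume point if we have to retry */
	bch2_trans_iter_exit(&trans, &iter);
err:
	if (ret == -EINTR)		/* transaction restart: rerun from the top */
		goto retry;

	bch2_trans_exit(&trans);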
diff --git a/fs/bcachefs/xattr.h b/fs/bcachefs/xattr.h
index 4151065ab853..f4f896545e1c 100644
--- a/fs/bcachefs/xattr.h
+++ b/fs/bcachefs/xattr.h
@@ -39,7 +39,8 @@ struct bch_inode_info;
int bch2_xattr_get(struct bch_fs *, struct bch_inode_info *,
const char *, void *, size_t, int);
-int bch2_xattr_set(struct btree_trans *, u64, const struct bch_hash_info *,
+int bch2_xattr_set(struct btree_trans *, subvol_inum,
+ const struct bch_hash_info *,
const char *, const void *, size_t, int, int);
ssize_t bch2_xattr_list(struct dentry *, char *, size_t);
diff --git a/include/linux/exportfs.h b/include/linux/exportfs.h
index 0d3037419bc7..8b6359083b9c 100644
--- a/include/linux/exportfs.h
+++ b/include/linux/exportfs.h
@@ -99,6 +99,12 @@ enum fid_type {
FILEID_FAT_WITH_PARENT = 0x72,
/*
+ * 64 bit inode number, 32 bit subvolume, 32 bit generation number:
+ */
+ FILEID_BCACHEFS_WITHOUT_PARENT = 0x80,
+ FILEID_BCACHEFS_WITH_PARENT = 0x81,
+
+ /*
* 128 bit child FID (struct lu_fid)
* 128 bit parent FID (struct lu_fid)
*/
diff --git a/include/trace/events/bcachefs.h b/include/trace/events/bcachefs.h
index b5fcda9e65d8..fce3146378f9 100644
--- a/include/trace/events/bcachefs.h
+++ b/include/trace/events/bcachefs.h
@@ -71,10 +71,10 @@ DECLARE_EVENT_CLASS(bio,
),
TP_fast_assign(
- __entry->dev = bio->bi_disk ? bio_dev(bio) : 0;
+ __entry->dev = bio->bi_bdev ? bio_dev(bio) : 0;
__entry->sector = bio->bi_iter.bi_sector;
__entry->nr_sector = bio->bi_iter.bi_size >> 9;
- blk_fill_rwbs(__entry->rwbs, bio->bi_opf, bio->bi_iter.bi_size);
+ blk_fill_rwbs(__entry->rwbs, bio->bi_opf);
),
TP_printk("%d,%d %s %llu + %u",
@@ -298,28 +298,6 @@ TRACE_EVENT(btree_reserve_get_fail,
__entry->required, __entry->cl)
);
-TRACE_EVENT(btree_insert_key,
- TP_PROTO(struct bch_fs *c, struct btree *b, struct bkey_i *k),
- TP_ARGS(c, b, k),
-
- TP_STRUCT__entry(
- __field(u8, id )
- __field(u64, inode )
- __field(u64, offset )
- __field(u32, size )
- ),
-
- TP_fast_assign(
- __entry->id = b->c.btree_id;
- __entry->inode = k->k.p.inode;
- __entry->offset = k->k.p.offset;
- __entry->size = k->k.size;
- ),
-
- TP_printk("btree %u: %llu:%llu len %u", __entry->id,
- __entry->inode, __entry->offset, __entry->size)
-);
-
DEFINE_EVENT(btree_node, btree_split,
TP_PROTO(struct bch_fs *c, struct btree *b),
TP_ARGS(c, b)
@@ -540,69 +518,6 @@ TRACE_EVENT(copygc_wait,
__entry->wait_amount, __entry->until)
);
-TRACE_EVENT(trans_get_iter,
- TP_PROTO(unsigned long trans_ip,
- unsigned long caller_ip,
- enum btree_id btree_id,
- struct bpos *got_pos,
- unsigned got_locks,
- unsigned got_uptodate,
- struct bpos *src_pos,
- unsigned src_locks,
- unsigned src_uptodate),
- TP_ARGS(trans_ip, caller_ip, btree_id,
- got_pos, got_locks, got_uptodate,
- src_pos, src_locks, src_uptodate),
-
- TP_STRUCT__entry(
- __field(unsigned long, trans_ip )
- __field(unsigned long, caller_ip )
- __field(u8, btree_id )
- __field(u64, got_pos_inode )
- __field(u64, got_pos_offset )
- __field(u32, got_pos_snapshot )
- __field(u8, got_locks )
- __field(u8, got_uptodate )
- __field(u64, src_pos_inode )
- __field(u64, src_pos_offset )
- __field(u32, src_pos_snapshot )
- __field(u8, src_locks )
- __field(u8, src_uptodate )
- ),
-
- TP_fast_assign(
- __entry->trans_ip = trans_ip;
- __entry->caller_ip = caller_ip;
- __entry->btree_id = btree_id;
- __entry->got_pos_inode = got_pos->inode;
- __entry->got_pos_offset = got_pos->offset;
- __entry->got_pos_snapshot = got_pos->snapshot;
- __entry->got_locks = got_locks;
- __entry->got_uptodate = got_uptodate;
- __entry->src_pos_inode = src_pos->inode;
- __entry->src_pos_offset = src_pos->offset;
- __entry->src_pos_snapshot = src_pos->snapshot;
- __entry->src_locks = src_locks;
- __entry->src_uptodate = src_uptodate;
- ),
-
- TP_printk("%ps %pS btree %u got %llu:%llu:%u l %u u %u "
- "src %llu:%llu:%u l %u u %u",
- (void *) __entry->trans_ip,
- (void *) __entry->caller_ip,
- __entry->btree_id,
- __entry->got_pos_inode,
- __entry->got_pos_offset,
- __entry->got_pos_snapshot,
- __entry->got_locks,
- __entry->got_uptodate,
- __entry->src_pos_inode,
- __entry->src_pos_offset,
- __entry->src_pos_snapshot,
- __entry->src_locks,
- __entry->src_uptodate)
-);
-
TRACE_EVENT(transaction_restart_ip,
TP_PROTO(unsigned long caller, unsigned long ip),
TP_ARGS(caller, ip),
@@ -772,92 +687,6 @@ DEFINE_EVENT(transaction_restart_iter, trans_restart_traverse,
TP_ARGS(trans_ip, caller_ip, btree_id, pos)
);
-TRACE_EVENT(iter_traverse,
- TP_PROTO(unsigned long trans_ip,
- unsigned long caller_ip,
- enum btree_id btree_id,
- struct bpos *pos,
- int ret),
- TP_ARGS(trans_ip, caller_ip, btree_id, pos, ret),
-
- TP_STRUCT__entry(
- __field(unsigned long, trans_ip )
- __field(unsigned long, caller_ip )
- __field(u8, btree_id )
- __field(u64, pos_inode )
- __field(u64, pos_offset )
- __field(u32, pos_snapshot )
- __field(s32, ret )
- ),
-
- TP_fast_assign(
- __entry->trans_ip = trans_ip;
- __entry->caller_ip = caller_ip;
- __entry->btree_id = btree_id;
- __entry->pos_inode = pos->inode;
- __entry->pos_offset = pos->offset;
- __entry->pos_snapshot = pos->snapshot;
- __entry->ret = ret;
- ),
-
- TP_printk("%ps %pS pos %u %llu:%llu:%u ret %i",
- (void *) __entry->trans_ip,
- (void *) __entry->caller_ip,
- __entry->btree_id,
- __entry->pos_inode,
- __entry->pos_offset,
- __entry->pos_snapshot,
- __entry->ret)
-);
-
-TRACE_EVENT(iter_set_search_pos,
- TP_PROTO(unsigned long trans_ip,
- unsigned long caller_ip,
- enum btree_id btree_id,
- struct bpos *old_pos,
- struct bpos *new_pos,
- unsigned good_level),
- TP_ARGS(trans_ip, caller_ip, btree_id, old_pos, new_pos, good_level),
-
- TP_STRUCT__entry(
- __field(unsigned long, trans_ip )
- __field(unsigned long, caller_ip )
- __field(u8, btree_id )
- __field(u64, old_pos_inode )
- __field(u64, old_pos_offset )
- __field(u32, old_pos_snapshot )
- __field(u64, new_pos_inode )
- __field(u64, new_pos_offset )
- __field(u32, new_pos_snapshot )
- __field(u8, good_level )
- ),
-
- TP_fast_assign(
- __entry->trans_ip = trans_ip;
- __entry->caller_ip = caller_ip;
- __entry->btree_id = btree_id;
- __entry->old_pos_inode = old_pos->inode;
- __entry->old_pos_offset = old_pos->offset;
- __entry->old_pos_snapshot = old_pos->snapshot;
- __entry->new_pos_inode = new_pos->inode;
- __entry->new_pos_offset = new_pos->offset;
- __entry->new_pos_snapshot = new_pos->snapshot;
- __entry->good_level = good_level;
- ),
-
- TP_printk("%ps %pS btree %u old pos %llu:%llu:%u new pos %llu:%llu:%u l %u",
- (void *) __entry->trans_ip,
- (void *) __entry->caller_ip,
- __entry->btree_id,
- __entry->old_pos_inode,
- __entry->old_pos_offset,
- __entry->old_pos_snapshot,
- __entry->new_pos_inode,
- __entry->new_pos_offset,
- __entry->new_pos_snapshot,
- __entry->good_level)
-);
-
TRACE_EVENT(trans_restart_would_deadlock,
TP_PROTO(unsigned long trans_ip,
unsigned long caller_ip,
@@ -927,93 +756,42 @@ TRACE_EVENT(trans_restart_would_deadlock,
__entry->want_pos_snapshot)
);
-TRACE_EVENT(trans_restart_mem_realloced,
- TP_PROTO(unsigned long trans_ip, unsigned long caller_ip,
- unsigned long bytes),
- TP_ARGS(trans_ip, caller_ip, bytes),
+TRACE_EVENT(trans_restart_would_deadlock_write,
+ TP_PROTO(unsigned long trans_ip),
+ TP_ARGS(trans_ip),
TP_STRUCT__entry(
__field(unsigned long, trans_ip )
- __field(unsigned long, caller_ip )
- __field(unsigned long, bytes )
),
TP_fast_assign(
__entry->trans_ip = trans_ip;
- __entry->caller_ip = caller_ip;
- __entry->bytes = bytes;
),
- TP_printk("%ps %pS bytes %lu",
- (void *) __entry->trans_ip,
- (void *) __entry->caller_ip,
- __entry->bytes)
+ TP_printk("%ps", (void *) __entry->trans_ip)
);
-DECLARE_EVENT_CLASS(node_lock_fail,
- TP_PROTO(unsigned long trans_ip,
- unsigned long caller_ip,
- enum btree_id btree_id,
- struct bpos *pos,
- unsigned level, u32 iter_seq, unsigned node, u32 node_seq),
- TP_ARGS(trans_ip, caller_ip, btree_id, pos,
- level, iter_seq, node, node_seq),
+TRACE_EVENT(trans_restart_mem_realloced,
+ TP_PROTO(unsigned long trans_ip, unsigned long caller_ip,
+ unsigned long bytes),
+ TP_ARGS(trans_ip, caller_ip, bytes),
TP_STRUCT__entry(
__field(unsigned long, trans_ip )
__field(unsigned long, caller_ip )
- __field(u8, btree_id )
- __field(u64, pos_inode )
- __field(u64, pos_offset )
- __field(u32, pos_snapshot )
- __field(u32, level )
- __field(u32, iter_seq )
- __field(u32, node )
- __field(u32, node_seq )
+ __field(unsigned long, bytes )
),
TP_fast_assign(
- __entry->trans_ip = trans_ip;
- __entry->caller_ip = caller_ip;
- __entry->btree_id = btree_id;
- __entry->pos_inode = pos->inode;
- __entry->pos_offset = pos->offset;
- __entry->pos_snapshot = pos->snapshot;
- __entry->level = level;
- __entry->iter_seq = iter_seq;
- __entry->node = node;
- __entry->node_seq = node_seq;
+ __entry->trans_ip = trans_ip;
+ __entry->caller_ip = caller_ip;
+ __entry->bytes = bytes;
),
- TP_printk("%ps %pS btree %u pos %llu:%llu:%u level %u iter seq %u node %u node seq %u",
+ TP_printk("%ps %pS bytes %lu",
(void *) __entry->trans_ip,
(void *) __entry->caller_ip,
- __entry->btree_id,
- __entry->pos_inode,
- __entry->pos_offset,
- __entry->pos_snapshot,
- __entry->level, __entry->iter_seq,
- __entry->node, __entry->node_seq)
-);
-
-DEFINE_EVENT(node_lock_fail, node_upgrade_fail,
- TP_PROTO(unsigned long trans_ip,
- unsigned long caller_ip,
- enum btree_id btree_id,
- struct bpos *pos,
- unsigned level, u32 iter_seq, unsigned node, u32 node_seq),
- TP_ARGS(trans_ip, caller_ip, btree_id, pos,
- level, iter_seq, node, node_seq)
-);
-
-DEFINE_EVENT(node_lock_fail, node_relock_fail,
- TP_PROTO(unsigned long trans_ip,
- unsigned long caller_ip,
- enum btree_id btree_id,
- struct bpos *pos,
- unsigned level, u32 iter_seq, unsigned node, u32 node_seq),
- TP_ARGS(trans_ip, caller_ip, btree_id, pos,
- level, iter_seq, node, node_seq)
+ __entry->bytes)
);
#endif /* _TRACE_BCACHE_H */
diff --git a/kernel/locking/six.c b/kernel/locking/six.c
index 75a735acd11b..fca1208720b6 100644
--- a/kernel/locking/six.c
+++ b/kernel/locking/six.c
@@ -17,7 +17,7 @@
#endif
#define six_acquire(l, t) lock_acquire(l, 0, t, 0, 0, NULL, _RET_IP_)
-#define six_release(l) lock_release(l, 0, _RET_IP_)
+#define six_release(l) lock_release(l, _RET_IP_)
struct six_lock_vals {
/* Value we add to the lock in order to take the lock: */