author    Kent Overstreet <kent.overstreet@gmail.com>  2019-11-29 12:43:48 -0500
committer Kent Overstreet <kent.overstreet@gmail.com>  2020-05-06 17:14:17 -0400
commit    246252b293d2d2c9e8143303e508167b63a8d95b (patch)
tree      3f451c6dbafabe595690a37444d28e8dffadc093
parent    7d52af47abc1e3c924ba29a12149d7a527b9236d (diff)
Merge with 131853c881 bcachefs: Switch to macro for bkey_ops
75 files changed, 4748 insertions, 4785 deletions
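
The merge subject, "Switch to macro for bkey_ops", refers to the pattern visible in the fs/bcachefs/bkey_methods.c hunk below: each key type's operations are defined as a macro (e.g. bch2_bkey_ops_inline_data), and the dispatch table is generated by re-expanding the BCH_BKEY_TYPES() X-macro. The following is a minimal, self-contained sketch of that pattern; the two-entry type list, the my_ prefix, and the single key_invalid hook are illustrative stand-ins, not the real bcachefs definitions:

#include <stdio.h>

/* X-macro type list; the real BCH_BKEY_TYPES() has many more entries. */
#define MY_BKEY_TYPES()		\
	x(deleted,	0)	\
	x(cookie,	1)

enum my_bkey_type {
#define x(name, nr) KEY_TYPE_##name = nr,
	MY_BKEY_TYPES()
#undef x
	KEY_TYPE_MAX
};

struct bkey_ops {
	const char *(*key_invalid)(void);
};

static const char *cookie_invalid(void)
{
	return NULL;
}

/* Per-type ops are macros expanding to compound literals (a GCC/Clang
 * extension in static initializers, standard practice in kernel code): */
#define my_bkey_ops_deleted (struct bkey_ops) { .key_invalid = NULL }
#define my_bkey_ops_cookie  (struct bkey_ops) { .key_invalid = cookie_invalid }

/* The table is generated by expanding the same X-macro a second time: */
static const struct bkey_ops my_bkey_ops[] = {
#define x(name, nr) [KEY_TYPE_##name] = my_bkey_ops_##name,
	MY_BKEY_TYPES()
#undef x
};

int main(void)
{
	printf("%zu ops entries, cookie hook %s\n",
	       sizeof(my_bkey_ops) / sizeof(my_bkey_ops[0]),
	       my_bkey_ops[KEY_TYPE_cookie].key_invalid ? "set" : "unset");
	return 0;
}

The payoff is that adding a key type means adding one x(...) line: the enum, the ops table, and any other per-type tables built from the same list stay in sync automatically.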
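The new fs/bcachefs/bkey_on_stack.h added in this diff replaces fixed BKEY_PADDED stack buffers with a small-buffer-or-mempool idiom: a key is built in a 12-u64 on-stack array and spills into c->large_bkey_pool only when it doesn't fit. Below is a rough userspace sketch of the same idiom, with a fixed-size malloc standing in for the mempool and allocation-failure handling omitted:

#include <stdlib.h>
#include <string.h>

#define ONSTACK_U64S	12
#define WORST_CASE_U64S	256	/* fixed spill size, like the mempool's */

struct buf_on_stack {
	unsigned long long *p;
	unsigned long long onstack[ONSTACK_U64S];
};

static void buf_on_stack_init(struct buf_on_stack *s)
{
	s->p = s->onstack;
}

/* Spill to a worst-case-sized heap buffer the first time the needed size
 * exceeds the inline array; existing contents are carried along, and any
 * later call is a no-op because the heap buffer already fits anything: */
static void buf_on_stack_realloc(struct buf_on_stack *s, unsigned u64s)
{
	if (s->p == s->onstack && u64s > ONSTACK_U64S) {
		s->p = malloc(WORST_CASE_U64S * sizeof(*s->p));
		memcpy(s->p, s->onstack, sizeof(s->onstack));
	}
}

static void buf_on_stack_exit(struct buf_on_stack *s)
{
	if (s->p != s->onstack)
		free(s->p);
	s->p = NULL;
}

Because the spill buffer is always worst-case sized (as the mempool allocation is in the original), the copy-and-switch happens at most once per buffer lifetime, and the common case never allocates at all.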
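Many hunks below replace bkey_next() with bkey_next_skip_noops(): deleting a key in place can now leave zeroed u64 words ("noops") inside a bset, and iteration steps over them by treating a zero u64s count as a one-word hole. A simplified model of that traversal, using a hypothetical struct rec in place of struct bkey_packed:

#include <stdint.h>

/* A variable-length record whose first byte counts its total size in u64
 * words, like bkey_packed's u64s field: */
struct rec {
	uint8_t u64s;
};

static struct rec *rec_next(struct rec *k)
{
	return (struct rec *) ((uint64_t *) k + k->u64s);
}

/* Deletion can leave zeroed u64 "noop" words between records; step over
 * them one word at a time until a real record (or the end) is reached: */
static struct rec *rec_next_skip_noops(struct rec *k, struct rec *end)
{
	k = rec_next(k);
	while (k != end && !k->u64s)
		k = (struct rec *) ((uint64_t *) k + 1);
	return k;
}

The full patch follows.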
diff --git a/fs/bcachefs/Kconfig b/fs/bcachefs/Kconfig index e695ab786f80..10abddae6a80 100644 --- a/fs/bcachefs/Kconfig +++ b/fs/bcachefs/Kconfig @@ -2,7 +2,6 @@ config BCACHEFS_FS tristate "bcachefs filesystem support" depends on BLOCK - depends on (64BIT || LBDAF) select EXPORTFS select CLOSURES select LIBCRC32C diff --git a/fs/bcachefs/Makefile b/fs/bcachefs/Makefile index 414ea2a74a5a..c7727d05cf49 100644 --- a/fs/bcachefs/Makefile +++ b/fs/bcachefs/Makefile @@ -26,7 +26,9 @@ bcachefs-y := \ ec.o \ error.o \ extents.o \ + extent_update.o \ fs.o \ + fs-common.o \ fs-ioctl.o \ fs-io.o \ fsck.o \ diff --git a/fs/bcachefs/acl.c b/fs/bcachefs/acl.c index 59d4af1326ee..dcd0dfe87b51 100644 --- a/fs/bcachefs/acl.c +++ b/fs/bcachefs/acl.c @@ -280,49 +280,52 @@ int bch2_set_acl_trans(struct btree_trans *trans, return ret == -ENOENT ? 0 : ret; } -static int inode_update_for_set_acl_fn(struct bch_inode_info *inode, - struct bch_inode_unpacked *bi, - void *p) -{ - struct bch_fs *c = inode->v.i_sb->s_fs_info; - umode_t mode = (unsigned long) p; - - bi->bi_ctime = bch2_current_time(c); - bi->bi_mode = mode; - return 0; -} - -int bch2_set_acl(struct inode *vinode, struct posix_acl *acl, int type) +int bch2_set_acl(struct inode *vinode, struct posix_acl *_acl, int type) { struct bch_inode_info *inode = to_bch_ei(vinode); struct bch_fs *c = inode->v.i_sb->s_fs_info; struct btree_trans trans; + struct btree_iter *inode_iter; struct bch_inode_unpacked inode_u; - umode_t mode = inode->v.i_mode; + struct posix_acl *acl; + umode_t mode; int ret; mutex_lock(&inode->ei_update_lock); bch2_trans_init(&trans, c, 0, 0); +retry: + bch2_trans_begin(&trans); + acl = _acl; - if (type == ACL_TYPE_ACCESS && acl) { + inode_iter = bch2_inode_peek(&trans, &inode_u, inode->v.i_ino, + BTREE_ITER_INTENT); + ret = PTR_ERR_OR_ZERO(inode_iter); + if (ret) + goto btree_err; + + mode = inode_u.bi_mode; + + if (type == ACL_TYPE_ACCESS) { ret = posix_acl_update_mode(&inode->v, &mode, &acl); if (ret) goto err; } -retry: - bch2_trans_begin(&trans); - ret = bch2_set_acl_trans(&trans, - &inode->ei_inode, - &inode->ei_str_hash, - acl, type) ?: - bch2_write_inode_trans(&trans, inode, &inode_u, - inode_update_for_set_acl_fn, - (void *)(unsigned long) mode) ?: + ret = bch2_set_acl_trans(&trans, &inode_u, + &inode->ei_str_hash, + acl, type); + if (ret) + goto btree_err; + + inode_u.bi_ctime = bch2_current_time(c); + inode_u.bi_mode = mode; + + ret = bch2_inode_write(&trans, inode_iter, &inode_u) ?: bch2_trans_commit(&trans, NULL, &inode->ei_journal_seq, BTREE_INSERT_ATOMIC| BTREE_INSERT_NOUNLOCK); +btree_err: if (ret == -EINTR) goto retry; if (unlikely(ret)) @@ -375,7 +378,7 @@ int bch2_acl_chmod(struct btree_trans *trans, } new->k.p = iter->pos; - bch2_trans_update(trans, BTREE_INSERT_ENTRY(iter, &new->k_i)); + bch2_trans_update(trans, iter, &new->k_i); *new_acl = acl; acl = NULL; err: diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 9814179a6406..e252a039dc2b 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -152,6 +152,7 @@ void bch2_alloc_pack(struct bkey_i_alloc *dst, { unsigned idx = 0; void *d = dst->v.data; + unsigned bytes; dst->v.fields = 0; dst->v.gen = src.gen; @@ -160,7 +161,9 @@ void bch2_alloc_pack(struct bkey_i_alloc *dst, BCH_ALLOC_FIELDS() #undef x - set_bkey_val_bytes(&dst->k, (void *) d - (void *) &dst->v); + bytes = (void *) d - (void *) &dst->v; + set_bkey_val_bytes(&dst->k, bytes); + memset_u64s_tail(&dst->v, 0, bytes); } static unsigned 
bch_alloc_val_u64s(const struct bch_alloc *a) @@ -311,7 +314,7 @@ retry: a->k.p = iter->pos; bch2_alloc_pack(a, new_u); - bch2_trans_update(trans, BTREE_INSERT_ENTRY(iter, &a->k_i)); + bch2_trans_update(trans, iter, &a->k_i); ret = bch2_trans_commit(trans, NULL, NULL, BTREE_INSERT_ATOMIC| BTREE_INSERT_NOFAIL| @@ -899,7 +902,7 @@ retry: a->k.p = iter->pos; bch2_alloc_pack(a, u); - bch2_trans_update(trans, BTREE_INSERT_ENTRY(iter, &a->k_i)); + bch2_trans_update(trans, iter, &a->k_i); /* * XXX: @@ -1438,6 +1441,9 @@ again: cond_resched(); nodes_unwritten = false; + if (bch2_journal_error(&c->journal)) + return true; + rcu_read_lock(); for_each_cached_btree(b, c, tbl, i, pos) if (btree_node_need_write(b)) { diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 033b73821fdb..9b186872c129 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -299,7 +299,6 @@ do { \ x(btree_node_sort) \ x(btree_node_read) \ x(btree_gc) \ - x(btree_update) \ x(btree_lock_contended_read) \ x(btree_lock_contended_intent) \ x(btree_lock_contended_write) \ @@ -426,7 +425,6 @@ struct bch_dev { */ alloc_fifo free[RESERVE_NR]; alloc_fifo free_inc; - spinlock_t freelist_lock; u8 open_buckets_partial[OPEN_BUCKETS_COUNT]; unsigned open_buckets_partial_nr; @@ -498,6 +496,7 @@ enum { /* misc: */ BCH_FS_BDEV_MOUNTED, BCH_FS_FIXED_GENS, + BCH_FS_ALLOC_WRITTEN, BCH_FS_REBUILD_REPLICAS, BCH_FS_HOLD_BTREE_WRITES, }; @@ -720,11 +719,13 @@ struct bch_fs { ZSTD_parameters zstd_params; struct crypto_shash *sha256; - struct crypto_skcipher *chacha20; + struct crypto_sync_skcipher *chacha20; struct crypto_shash *poly1305; atomic64_t key_version; + mempool_t large_bkey_pool; + /* REBALANCE */ struct bch_fs_rebalance rebalance; diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index 4577d77a9f38..3d85012a15fd 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -338,7 +338,8 @@ static inline void bkey_init(struct bkey *k) x(quota, 13) \ x(stripe, 14) \ x(reflink_p, 15) \ - x(reflink_v, 16) + x(reflink_v, 16) \ + x(inline_data, 17) enum bch_bkey_type { #define x(name, nr) KEY_TYPE_##name = nr, @@ -911,6 +912,13 @@ struct bch_reflink_v { __u64 _data[0]; }; +/* Inline data */ + +struct bch_inline_data { + struct bch_val v; + u8 data[0]; +}; + /* Optional/variable size superblock sections: */ struct bch_sb_field { @@ -1314,6 +1322,8 @@ enum bch_sb_features { BCH_FEATURE_EC = 4, BCH_FEATURE_JOURNAL_SEQ_BLACKLIST_V3 = 5, BCH_FEATURE_REFLINK = 6, + BCH_FEATURE_NEW_SIPHASH = 7, + BCH_FEATURE_INLINE_DATA = 8, BCH_FEATURE_NR, }; @@ -1340,11 +1350,19 @@ enum bch_csum_opts { BCH_CSUM_OPT_NR = 3, }; -enum bch_str_hash_opts { +enum bch_str_hash_type { BCH_STR_HASH_CRC32C = 0, BCH_STR_HASH_CRC64 = 1, - BCH_STR_HASH_SIPHASH = 2, - BCH_STR_HASH_NR = 3, + BCH_STR_HASH_SIPHASH_OLD = 2, + BCH_STR_HASH_SIPHASH = 3, + BCH_STR_HASH_NR = 4, +}; + +enum bch_str_hash_opts { + BCH_STR_HASH_OPT_CRC32C = 0, + BCH_STR_HASH_OPT_CRC64 = 1, + BCH_STR_HASH_OPT_SIPHASH = 2, + BCH_STR_HASH_OPT_NR = 3, }; #define BCH_COMPRESSION_TYPES() \ @@ -1494,14 +1512,14 @@ LE32_BITMASK(JSET_BIG_ENDIAN, struct jset, flags, 4, 5); /* Btree: */ -#define BCH_BTREE_IDS() \ +#define BCH_BTREE_IDS() \ x(EXTENTS, 0, "extents") \ x(INODES, 1, "inodes") \ x(DIRENTS, 2, "dirents") \ x(XATTRS, 3, "xattrs") \ x(ALLOC, 4, "alloc") \ x(QUOTAS, 5, "quotas") \ - x(EC, 6, "erasure_coding") \ + x(EC, 6, "stripes") \ x(REFLINK, 7, "reflink") enum btree_id { diff --git a/fs/bcachefs/bkey.c b/fs/bcachefs/bkey.c index 
0f9dfe37b0af..4d0c9129cd4a 100644 --- a/fs/bcachefs/bkey.c +++ b/fs/bcachefs/bkey.c @@ -327,7 +327,7 @@ bool bch2_bkey_pack_key(struct bkey_packed *out, const struct bkey *in, void bch2_bkey_unpack(const struct btree *b, struct bkey_i *dst, const struct bkey_packed *src) { - dst->k = bkey_unpack_key(b, src); + __bkey_unpack_key(b, &dst->k, src); memcpy_u64s(&dst->v, bkeyp_val(&b->format, src), @@ -1058,26 +1058,20 @@ int __bch2_bkey_cmp_packed(const struct bkey_packed *l, const struct bkey_packed *r, const struct btree *b) { - int packed = bkey_lr_packed(l, r); + struct bkey unpacked; - if (likely(packed == BKEY_PACKED_BOTH)) + if (likely(bkey_packed(l) && bkey_packed(r))) return __bch2_bkey_cmp_packed_format_checked(l, r, b); - switch (packed) { - case BKEY_PACKED_NONE: - return bkey_cmp(((struct bkey *) l)->p, - ((struct bkey *) r)->p); - case BKEY_PACKED_LEFT: - return __bch2_bkey_cmp_left_packed_format_checked(b, - (struct bkey_packed *) l, - &((struct bkey *) r)->p); - case BKEY_PACKED_RIGHT: - return -__bch2_bkey_cmp_left_packed_format_checked(b, - (struct bkey_packed *) r, - &((struct bkey *) l)->p); - default: - unreachable(); + if (bkey_packed(l)) { + __bkey_unpack_key_format_checked(b, &unpacked, l); + l = (void*) &unpacked; + } else if (bkey_packed(r)) { + __bkey_unpack_key_format_checked(b, &unpacked, r); + r = (void*) &unpacked; } + + return bkey_cmp(((struct bkey *) l)->p, ((struct bkey *) r)->p); } __pure __flatten diff --git a/fs/bcachefs/bkey.h b/fs/bcachefs/bkey.h index 5ef66aed338d..f2d5f3009b21 100644 --- a/fs/bcachefs/bkey.h +++ b/fs/bcachefs/bkey.h @@ -33,6 +33,16 @@ struct bkey_s { #define bkey_next(_k) vstruct_next(_k) +static inline struct bkey_packed *bkey_next_skip_noops(struct bkey_packed *k, + struct bkey_packed *end) +{ + k = bkey_next(k); + + while (k != end && !k->u64s) + k = (void *) ((u64 *) k + 1); + return k; +} + #define bkey_val_u64s(_k) ((_k)->u64s - BKEY_U64s) static inline size_t bkey_val_bytes(const struct bkey *k) @@ -87,8 +97,8 @@ do { \ (u64 *) (_dst) < (u64 *) (_src) + \ ((struct bkey *) (_src))->u64s); \ \ - __memmove_u64s_down((_dst), (_src), \ - ((struct bkey *) (_src))->u64s); \ + memcpy_u64s_small((_dst), (_src), \ + ((struct bkey *) (_src))->u64s); \ } while (0) struct btree; @@ -554,6 +564,7 @@ BKEY_VAL_ACCESSORS(quota); BKEY_VAL_ACCESSORS(stripe); BKEY_VAL_ACCESSORS(reflink_p); BKEY_VAL_ACCESSORS(reflink_v); +BKEY_VAL_ACCESSORS(inline_data); /* byte order helpers */ diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c index f01405dd502b..ed448fad83c5 100644 --- a/fs/bcachefs/bkey_methods.c +++ b/fs/bcachefs/bkey_methods.c @@ -63,6 +63,23 @@ static const char *key_type_cookie_invalid(const struct bch_fs *c, .key_invalid = empty_val_key_invalid, \ } +static const char *key_type_inline_data_invalid(const struct bch_fs *c, + struct bkey_s_c k) +{ + return NULL; +} + +static void key_type_inline_data_to_text(struct printbuf *out, struct bch_fs *c, + struct bkey_s_c k) +{ + pr_buf(out, "(%zu bytes)", bkey_val_bytes(k.k)); +} + +#define bch2_bkey_ops_inline_data (struct bkey_ops) { \ + .key_invalid = key_type_inline_data_invalid, \ + .val_to_text = key_type_inline_data_to_text, \ +} + static const struct bkey_ops bch2_bkey_ops[] = { #define x(name, nr) [KEY_TYPE_##name] = bch2_bkey_ops_##name, BCH_BKEY_TYPES() @@ -83,9 +100,8 @@ const char *__bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k, if (k.k->u64s < BKEY_U64s) return "u64s too small"; - if ((btree_node_type_is_extents(type) || - type == BKEY_TYPE_BTREE) && - 
bkey_val_u64s(k.k) > BKEY_EXTENT_VAL_U64s_MAX) + if (type == BKEY_TYPE_BTREE && + bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX) return "value too big"; if (btree_node_type_is_extents(type)) { diff --git a/fs/bcachefs/bkey_on_stack.h b/fs/bcachefs/bkey_on_stack.h new file mode 100644 index 000000000000..f607a0cb37ed --- /dev/null +++ b/fs/bcachefs/bkey_on_stack.h @@ -0,0 +1,43 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_BKEY_ON_STACK_H +#define _BCACHEFS_BKEY_ON_STACK_H + +#include "bcachefs.h" + +struct bkey_on_stack { + struct bkey_i *k; + u64 onstack[12]; +}; + +static inline void bkey_on_stack_realloc(struct bkey_on_stack *s, + struct bch_fs *c, unsigned u64s) +{ + if (s->k == (void *) s->onstack && + u64s > ARRAY_SIZE(s->onstack)) { + s->k = mempool_alloc(&c->large_bkey_pool, GFP_NOFS); + memcpy(s->k, s->onstack, sizeof(s->onstack)); + } +} + +static inline void bkey_on_stack_reassemble(struct bkey_on_stack *s, + struct bch_fs *c, + struct bkey_s_c k) +{ + bkey_on_stack_realloc(s, c, k.k->u64s); + bkey_reassemble(s->k, k); +} + +static inline void bkey_on_stack_init(struct bkey_on_stack *s) +{ + s->k = (void *) s->onstack; +} + +static inline void bkey_on_stack_exit(struct bkey_on_stack *s, + struct bch_fs *c) +{ + if (s->k != (void *) s->onstack) + mempool_free(s->k, &c->large_bkey_pool); + s->k = NULL; +} + +#endif /* _BCACHEFS_BKEY_ON_STACK_H */ diff --git a/fs/bcachefs/bkey_sort.c b/fs/bcachefs/bkey_sort.c index e32fad5a91ac..2e205db5433d 100644 --- a/fs/bcachefs/bkey_sort.c +++ b/fs/bcachefs/bkey_sort.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" +#include "bkey_on_stack.h" #include "bkey_sort.h" #include "bset.h" #include "extents.h" @@ -74,6 +75,10 @@ static void sort_key_next(struct btree_node_iter_large *iter, { i->k += __btree_node_offset_to_key(b, i->k)->u64s; + while (i->k != i->end && + !__btree_node_offset_to_key(b, i->k)->u64s) + i->k++; + if (i->k == i->end) *i = iter->data[--iter->used]; } @@ -118,7 +123,7 @@ static inline struct bkey_packed *sort_iter_peek(struct sort_iter *iter) static inline void sort_iter_advance(struct sort_iter *iter, sort_cmp_fn cmp) { - iter->data->k = bkey_next(iter->data->k); + iter->data->k = bkey_next_skip_noops(iter->data->k, iter->data->end); BUG_ON(iter->data->k > iter->data->end); @@ -292,8 +297,10 @@ struct btree_nr_keys bch2_extent_sort_fix_overlapping(struct bch_fs *c, struct bkey l_unpacked, r_unpacked; struct bkey_s l, r; struct btree_nr_keys nr; + struct bkey_on_stack split; memset(&nr, 0, sizeof(nr)); + bkey_on_stack_init(&split); heap_resort(iter, extent_sort_cmp, NULL); @@ -343,29 +350,28 @@ struct btree_nr_keys bch2_extent_sort_fix_overlapping(struct bch_fs *c, if (bkey_cmp(l.k->p, r.k->p) >= 0) { sort_key_next(iter, b, _r); } else { - __bch2_cut_front(l.k->p, r); + bch2_cut_front_s(l.k->p, r); extent_save(b, rk, r.k); } extent_sort_sift(iter, b, _r - iter->data); } else if (bkey_cmp(l.k->p, r.k->p) > 0) { - BKEY_PADDED(k) tmp; /* * r wins, but it overlaps in the middle of l - split l: */ - bkey_reassemble(&tmp.k, l.s_c); - bch2_cut_back(bkey_start_pos(r.k), &tmp.k.k); + bkey_on_stack_reassemble(&split, c, l.s_c); + bch2_cut_back(bkey_start_pos(r.k), split.k); - __bch2_cut_front(r.k->p, l); + bch2_cut_front_s(r.k->p, l); extent_save(b, lk, l.k); extent_sort_sift(iter, b, 0); extent_sort_append(c, f, &nr, dst->start, - &prev, bkey_i_to_s(&tmp.k)); + &prev, bkey_i_to_s(split.k)); } else { - bch2_cut_back(bkey_start_pos(r.k), l.k); + bch2_cut_back_s(bkey_start_pos(r.k), l); 
extent_save(b, lk, l.k); } } @@ -373,6 +379,8 @@ struct btree_nr_keys bch2_extent_sort_fix_overlapping(struct bch_fs *c, extent_sort_advance_prev(f, &nr, dst->start, &prev); dst->u64s = cpu_to_le16((u64 *) prev - dst->_data); + + bkey_on_stack_exit(&split, c); return nr; } @@ -418,7 +426,7 @@ bch2_sort_repack_merge(struct bch_fs *c, struct bkey_packed *prev = NULL, *k_packed; struct bkey_s k; struct btree_nr_keys nr; - BKEY_PADDED(k) tmp; + struct bkey unpacked; memset(&nr, 0, sizeof(nr)); @@ -426,11 +434,7 @@ bch2_sort_repack_merge(struct bch_fs *c, if (filter_whiteouts && bkey_whiteout(k_packed)) continue; - EBUG_ON(bkeyp_val_u64s(&src->format, k_packed) > - BKEY_EXTENT_VAL_U64s_MAX); - - bch2_bkey_unpack(src, &tmp.k, k_packed); - k = bkey_i_to_s(&tmp.k); + k = __bkey_disassemble(src, k_packed, &unpacked); if (filter_whiteouts && bch2_bkey_normalize(c, k)) diff --git a/fs/bcachefs/bset.c b/fs/bcachefs/bset.c index 32436ed5cc80..a0f0b0eadffb 100644 --- a/fs/bcachefs/bset.c +++ b/fs/bcachefs/bset.c @@ -76,7 +76,7 @@ void bch2_dump_bset(struct btree *b, struct bset *i, unsigned set) for (_k = i->start, k = bkey_unpack_key(b, _k); _k < vstruct_last(i); _k = _n, k = n) { - _n = bkey_next(_k); + _n = bkey_next_skip_noops(_k, vstruct_last(i)); bch2_bkey_to_text(&PBUF(buf), &k); printk(KERN_ERR "block %u key %5u: %s\n", set, @@ -144,9 +144,7 @@ void __bch2_verify_btree_nr_keys(struct btree *b) struct btree_nr_keys nr = { 0 }; for_each_bset(b, t) - for (k = btree_bkey_first(b, t); - k != btree_bkey_last(b, t); - k = bkey_next(k)) + bset_tree_for_each_key(b, t, k) if (!bkey_whiteout(k)) btree_keys_account_key_add(&nr, t - b->set, k); @@ -294,38 +292,23 @@ static inline void bch2_btree_node_iter_next_check(struct btree_node_iter *iter, /* Auxiliary search trees */ -#define BFLOAT_FAILED_UNPACKED (U8_MAX - 0) -#define BFLOAT_FAILED_PREV (U8_MAX - 1) -#define BFLOAT_FAILED_OVERFLOW (U8_MAX - 2) -#define BFLOAT_FAILED (U8_MAX - 2) - -#define KEY_WORDS BITS_TO_LONGS(1 << BKEY_EXPONENT_BITS) +#define BFLOAT_FAILED_UNPACKED U8_MAX +#define BFLOAT_FAILED U8_MAX struct bkey_float { u8 exponent; u8 key_offset; - union { - u32 mantissa32; - struct { - u16 mantissa16; - u16 _pad; - }; - }; -} __packed; - -#define BFLOAT_32BIT_NR 32U + u16 mantissa; +}; +#define BKEY_MANTISSA_BITS 16 static unsigned bkey_float_byte_offset(unsigned idx) { - int d = (idx - BFLOAT_32BIT_NR) << 1; - - d &= ~(d >> 31); - - return idx * 6 - d; + return idx * sizeof(struct bkey_float); } struct ro_aux_tree { - struct bkey_float _d[0]; + struct bkey_float f[0]; }; struct rw_aux_tree { @@ -380,8 +363,8 @@ static unsigned bset_aux_tree_buf_end(const struct bset_tree *t) return t->aux_data_offset; case BSET_RO_AUX_TREE: return t->aux_data_offset + - DIV_ROUND_UP(bkey_float_byte_offset(t->size) + - sizeof(u8) * t->size, 8); + DIV_ROUND_UP(t->size * sizeof(struct bkey_float) + + t->size * sizeof(u8), 8); case BSET_RW_AUX_TREE: return t->aux_data_offset + DIV_ROUND_UP(sizeof(struct rw_aux_tree) * t->size, 8); @@ -420,17 +403,11 @@ static u8 *ro_aux_tree_prev(const struct btree *b, return __aux_tree_base(b, t) + bkey_float_byte_offset(t->size); } -static struct bkey_float *bkey_float_get(struct ro_aux_tree *b, - unsigned idx) -{ - return (void *) b + bkey_float_byte_offset(idx); -} - static struct bkey_float *bkey_float(const struct btree *b, const struct bset_tree *t, unsigned idx) { - return bkey_float_get(ro_aux_tree_base(b, t), idx); + return ro_aux_tree_base(b, t)->f + idx; } static void bset_aux_tree_verify(struct btree *b) @@ -633,7 
+610,7 @@ start: rw_aux_tree(b, t)[j - 1].offset); } - k = bkey_next(k); + k = bkey_next_skip_noops(k, btree_bkey_last(b, t)); BUG_ON(k >= btree_bkey_last(b, t)); } } @@ -669,21 +646,6 @@ static unsigned rw_aux_tree_bsearch(struct btree *b, return idx; } -static inline unsigned bfloat_mantissa(const struct bkey_float *f, - unsigned idx) -{ - return idx < BFLOAT_32BIT_NR ? f->mantissa32 : f->mantissa16; -} - -static inline void bfloat_mantissa_set(struct bkey_float *f, - unsigned idx, unsigned mantissa) -{ - if (idx < BFLOAT_32BIT_NR) - f->mantissa32 = mantissa; - else - f->mantissa16 = mantissa; -} - static inline unsigned bkey_mantissa(const struct bkey_packed *k, const struct bkey_float *f, unsigned idx) @@ -703,9 +665,9 @@ static inline unsigned bkey_mantissa(const struct bkey_packed *k, #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ v >>= f->exponent & 7; #else - v >>= 64 - (f->exponent & 7) - (idx < BFLOAT_32BIT_NR ? 32 : 16); + v >>= 64 - (f->exponent & 7) - BKEY_MANTISSA_BITS; #endif - return idx < BFLOAT_32BIT_NR ? (u32) v : (u16) v; + return (u16) v; } static void make_bfloat(struct btree *b, struct bset_tree *t, @@ -715,14 +677,10 @@ static void make_bfloat(struct btree *b, struct bset_tree *t, { struct bkey_float *f = bkey_float(b, t, j); struct bkey_packed *m = tree_to_bkey(b, t, j); - struct bkey_packed *p = tree_to_prev_bkey(b, t, j); struct bkey_packed *l, *r; - unsigned bits = j < BFLOAT_32BIT_NR ? 32 : 16; unsigned mantissa; int shift, exponent, high_bit; - EBUG_ON(bkey_next(p) != m); - if (is_power_of_2(j)) { l = min_key; @@ -764,8 +722,7 @@ static void make_bfloat(struct btree *b, struct bset_tree *t, * the original key. */ - if (!bkey_packed(l) || !bkey_packed(r) || - !bkey_packed(p) || !bkey_packed(m) || + if (!bkey_packed(l) || !bkey_packed(r) || !bkey_packed(m) || !b->nr_key_bits) { f->exponent = BFLOAT_FAILED_UNPACKED; return; @@ -782,8 +739,8 @@ static void make_bfloat(struct btree *b, struct bset_tree *t, * of the key: we handle this later: */ high_bit = max(bch2_bkey_greatest_differing_bit(b, l, r), - min_t(unsigned, bits, b->nr_key_bits) - 1); - exponent = high_bit - (bits - 1); + min_t(unsigned, BKEY_MANTISSA_BITS, b->nr_key_bits) - 1); + exponent = high_bit - (BKEY_MANTISSA_BITS - 1); /* * Then we calculate the actual shift value, from the start of the key @@ -792,12 +749,12 @@ static void make_bfloat(struct btree *b, struct bset_tree *t, #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ shift = (int) (b->format.key_u64s * 64 - b->nr_key_bits) + exponent; - EBUG_ON(shift + bits > b->format.key_u64s * 64); + EBUG_ON(shift + BKEY_MANTISSA_BITS > b->format.key_u64s * 64); #else shift = high_bit_offset + b->nr_key_bits - exponent - - bits; + BKEY_MANTISSA_BITS; EBUG_ON(shift < KEY_PACKED_BITS_START); #endif @@ -813,37 +770,7 @@ static void make_bfloat(struct btree *b, struct bset_tree *t, if (exponent < 0) mantissa |= ~(~0U << -exponent); - bfloat_mantissa_set(f, j, mantissa); - - /* - * The bfloat must be able to tell its key apart from the previous key - - * if its key and the previous key don't differ in the required bits, - * flag as failed - unless the keys are actually equal, in which case - * we aren't required to return a specific one: - */ - if (exponent > 0 && - bfloat_mantissa(f, j) == bkey_mantissa(p, f, j) && - bkey_cmp_packed(b, p, m)) { - f->exponent = BFLOAT_FAILED_PREV; - return; - } - - /* - * f->mantissa must compare >= the original key - for transitivity with - * the comparison in bset_search_tree. 
If we're dropping set bits, - * increment it: - */ - if (exponent > (int) bch2_bkey_ffs(b, m)) { - if (j < BFLOAT_32BIT_NR - ? f->mantissa32 == U32_MAX - : f->mantissa16 == U16_MAX) - f->exponent = BFLOAT_FAILED_OVERFLOW; - - if (j < BFLOAT_32BIT_NR) - f->mantissa32++; - else - f->mantissa16++; - } + f->mantissa = mantissa; } /* bytes remaining - only valid for last bset: */ @@ -856,14 +783,8 @@ static unsigned __bset_tree_capacity(struct btree *b, struct bset_tree *t) static unsigned bset_ro_tree_capacity(struct btree *b, struct bset_tree *t) { - unsigned bytes = __bset_tree_capacity(b, t); - - if (bytes < 7 * BFLOAT_32BIT_NR) - return bytes / 7; - - bytes -= 7 * BFLOAT_32BIT_NR; - - return BFLOAT_32BIT_NR + bytes / 5; + return __bset_tree_capacity(b, t) / + (sizeof(struct bkey_float) + sizeof(u8)); } static unsigned bset_rw_tree_capacity(struct btree *b, struct bset_tree *t) @@ -880,9 +801,7 @@ static void __build_rw_aux_tree(struct btree *b, struct bset_tree *t) rw_aux_tree(b, t)[0].offset = __btree_node_key_to_offset(b, btree_bkey_first(b, t)); - for (k = btree_bkey_first(b, t); - k != btree_bkey_last(b, t); - k = bkey_next(k)) { + bset_tree_for_each_key(b, t, k) { if (t->size == bset_rw_tree_capacity(b, t)) break; @@ -915,7 +834,7 @@ retry: /* First we figure out where the first key in each cacheline is */ eytzinger1_for_each(j, t->size) { while (bkey_to_cacheline(b, t, k) < cacheline) - prev = k, k = bkey_next(k); + prev = k, k = bkey_next_skip_noops(k, btree_bkey_last(b, t)); if (k >= btree_bkey_last(b, t)) { /* XXX: this path sucks */ @@ -931,10 +850,10 @@ retry: EBUG_ON(tree_to_bkey(b, t, j) != k); } - while (bkey_next(k) != btree_bkey_last(b, t)) - k = bkey_next(k); + while (k != btree_bkey_last(b, t)) + prev = k, k = bkey_next_skip_noops(k, btree_bkey_last(b, t)); - t->max_key = bkey_unpack_pos(b, k); + t->max_key = bkey_unpack_pos(b, prev); /* Then we build the tree */ eytzinger1_for_each(j, t->size) @@ -1060,7 +979,7 @@ struct bkey_packed *bch2_bkey_prev_filter(struct btree *b, struct bkey_packed *p, *i, *ret = NULL, *orig_k = k; while ((p = __bkey_prev(b, t, k)) && !ret) { - for (i = p; i != k; i = bkey_next(i)) + for (i = p; i != k; i = bkey_next_skip_noops(i, k)) if (i->type >= min_key_type) ret = i; @@ -1070,9 +989,11 @@ struct bkey_packed *bch2_bkey_prev_filter(struct btree *b, if (btree_keys_expensive_checks(b)) { BUG_ON(ret >= orig_k); - for (i = ret ? bkey_next(ret) : btree_bkey_first(b, t); + for (i = ret + ? 
bkey_next_skip_noops(ret, orig_k) + : btree_bkey_first(b, t); i != orig_k; - i = bkey_next(i)) + i = bkey_next_skip_noops(i, orig_k)) BUG_ON(i->type >= min_key_type); } @@ -1107,7 +1028,7 @@ static void ro_aux_tree_fix_invalidated_key(struct btree *b, /* signal to make_bfloat() that they're uninitialized: */ min_key.u64s = max_key.u64s = 0; - if (bkey_next(k) == btree_bkey_last(b, t)) { + if (bkey_next_skip_noops(k, btree_bkey_last(b, t)) == btree_bkey_last(b, t)) { t->max_key = bkey_unpack_pos(b, k); for (j = 1; j < t->size; j = j * 2 + 1) @@ -1231,7 +1152,7 @@ static void bch2_bset_fix_lookup_table(struct btree *b, struct bkey_packed *k = start; while (1) { - k = bkey_next(k); + k = bkey_next_skip_noops(k, end); if (k == end) break; @@ -1333,14 +1254,38 @@ static struct bkey_packed *bset_search_write_set(const struct btree *b, return rw_aux_to_bkey(b, t, l); } -noinline -static int bset_search_tree_slowpath(const struct btree *b, - struct bset_tree *t, struct bpos *search, - const struct bkey_packed *packed_search, - unsigned n) +static inline void prefetch_four_cachelines(void *p) { - return bkey_cmp_p_or_unp(b, tree_to_bkey(b, t, n), - packed_search, search) < 0; +#ifdef CONFIG_X86_64 + asm(".intel_syntax noprefix;" + "prefetcht0 [%0 - 127 + 64 * 0];" + "prefetcht0 [%0 - 127 + 64 * 1];" + "prefetcht0 [%0 - 127 + 64 * 2];" + "prefetcht0 [%0 - 127 + 64 * 3];" + ".att_syntax prefix;" + : + : "r" (p + 127)); +#else + prefetch(p + L1_CACHE_BYTES * 0); + prefetch(p + L1_CACHE_BYTES * 1); + prefetch(p + L1_CACHE_BYTES * 2); + prefetch(p + L1_CACHE_BYTES * 3); +#endif +} + +static inline bool bkey_mantissa_bits_dropped(const struct btree *b, + const struct bkey_float *f, + unsigned idx) +{ +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + unsigned key_bits_start = b->format.key_u64s * 64 - b->nr_key_bits; + + return f->exponent > key_bits_start; +#else + unsigned key_bits_end = high_bit_offset + b->nr_key_bits; + + return f->exponent + BKEY_MANTISSA_BITS < key_bits_end; +#endif } __flatten @@ -1350,44 +1295,37 @@ static struct bkey_packed *bset_search_tree(const struct btree *b, const struct bkey_packed *packed_search) { struct ro_aux_tree *base = ro_aux_tree_base(b, t); - struct bkey_float *f = bkey_float_get(base, 1); - void *p; - unsigned inorder, n = 1; + struct bkey_float *f; + struct bkey_packed *k; + unsigned inorder, n = 1, l, r; + int cmp; - while (1) { - if (likely(n << 4 < t->size)) { - p = bkey_float_get(base, n << 4); - prefetch(p); - } else if (n << 3 < t->size) { - inorder = __eytzinger1_to_inorder(n, t->size, t->extra); - p = bset_cacheline(b, t, inorder); -#ifdef CONFIG_X86_64 - asm(".intel_syntax noprefix;" - "prefetcht0 [%0 - 127 + 64 * 0];" - "prefetcht0 [%0 - 127 + 64 * 1];" - "prefetcht0 [%0 - 127 + 64 * 2];" - "prefetcht0 [%0 - 127 + 64 * 3];" - ".att_syntax prefix;" - : - : "r" (p + 127)); -#else - prefetch(p + L1_CACHE_BYTES * 0); - prefetch(p + L1_CACHE_BYTES * 1); - prefetch(p + L1_CACHE_BYTES * 2); - prefetch(p + L1_CACHE_BYTES * 3); -#endif - } else if (n >= t->size) - break; + do { + if (likely(n << 4 < t->size)) + prefetch(&base->f[n << 4]); - f = bkey_float_get(base, n); + f = &base->f[n]; - if (packed_search && - likely(f->exponent < BFLOAT_FAILED)) - n = n * 2 + (bfloat_mantissa(f, n) < - bkey_mantissa(packed_search, f, n)); - else - n = n * 2 + bset_search_tree_slowpath(b, t, - search, packed_search, n); + if (!unlikely(packed_search)) + goto slowpath; + if (unlikely(f->exponent >= BFLOAT_FAILED)) + goto slowpath; + + l = f->mantissa; + r = 
bkey_mantissa(packed_search, f, n); + + if (unlikely(l == r) && bkey_mantissa_bits_dropped(b, f, n)) + goto slowpath; + + n = n * 2 + (l < r); + continue; +slowpath: + k = tree_to_bkey(b, t, n); + cmp = bkey_cmp_p_or_unp(b, k, packed_search, search); + if (!cmp) + return k; + + n = n * 2 + (cmp < 0); } while (n < t->size); inorder = __eytzinger1_to_inorder(n >> 1, t->size, t->extra); @@ -1396,29 +1334,23 @@ static struct bkey_packed *bset_search_tree(const struct btree *b, * n would have been the node we recursed to - the low bit tells us if * we recursed left or recursed right. */ - if (n & 1) { - return cacheline_to_bkey(b, t, inorder, f->key_offset); - } else { - if (--inorder) { - n = eytzinger1_prev(n >> 1, t->size); - f = bkey_float_get(base, n); - return cacheline_to_bkey(b, t, inorder, f->key_offset); - } else + if (likely(!(n & 1))) { + --inorder; + if (unlikely(!inorder)) return btree_bkey_first(b, t); + + f = &base->f[eytzinger1_prev(n >> 1, t->size)]; } + + return cacheline_to_bkey(b, t, inorder, f->key_offset); } -/* - * Returns the first key greater than or equal to @search - */ -__always_inline __flatten -static struct bkey_packed *bch2_bset_search(struct btree *b, +static __always_inline __flatten +struct bkey_packed *__bch2_bset_search(struct btree *b, struct bset_tree *t, struct bpos *search, - struct bkey_packed *packed_search, const struct bkey_packed *lossy_packed_search) { - struct bkey_packed *m; /* * First, we search for a cacheline, then lastly we do a linear search @@ -1437,11 +1369,9 @@ static struct bkey_packed *bch2_bset_search(struct btree *b, switch (bset_aux_tree_type(t)) { case BSET_NO_AUX_TREE: - m = btree_bkey_first(b, t); - break; + return btree_bkey_first(b, t); case BSET_RW_AUX_TREE: - m = bset_search_write_set(b, t, search, lossy_packed_search); - break; + return bset_search_write_set(b, t, search, lossy_packed_search); case BSET_RO_AUX_TREE: /* * Each node in the auxiliary search tree covers a certain range @@ -1453,20 +1383,30 @@ static struct bkey_packed *bch2_bset_search(struct btree *b, if (bkey_cmp(*search, t->max_key) > 0) return btree_bkey_last(b, t); - m = bset_search_tree(b, t, search, lossy_packed_search); - break; + return bset_search_tree(b, t, search, lossy_packed_search); + default: + unreachable(); } +} +static __always_inline __flatten +struct bkey_packed *bch2_bset_search_linear(struct btree *b, + struct bset_tree *t, + struct bpos *search, + struct bkey_packed *packed_search, + const struct bkey_packed *lossy_packed_search, + struct bkey_packed *m) +{ if (lossy_packed_search) while (m != btree_bkey_last(b, t) && bkey_iter_cmp_p_or_unp(b, search, lossy_packed_search, m) > 0) - m = bkey_next(m); + m = bkey_next_skip_noops(m, btree_bkey_last(b, t)); if (!packed_search) while (m != btree_bkey_last(b, t) && bkey_iter_pos_cmp(b, search, m) > 0) - m = bkey_next(m); + m = bkey_next_skip_noops(m, btree_bkey_last(b, t)); if (btree_keys_expensive_checks(b)) { struct bkey_packed *prev = bch2_bkey_prev_all(b, t, m); @@ -1479,6 +1419,23 @@ static struct bkey_packed *bch2_bset_search(struct btree *b, return m; } +/* + * Returns the first key greater than or equal to @search + */ +static __always_inline __flatten +struct bkey_packed *bch2_bset_search(struct btree *b, + struct bset_tree *t, + struct bpos *search, + struct bkey_packed *packed_search, + const struct bkey_packed *lossy_packed_search) +{ + struct bkey_packed *m = __bch2_bset_search(b, t, search, + lossy_packed_search); + + return bch2_bset_search_linear(b, t, search, + packed_search, 
lossy_packed_search, m); +} + /* Btree node iterator */ static inline void __bch2_btree_node_iter_push(struct btree_node_iter *iter, @@ -1565,11 +1522,14 @@ static void btree_node_iter_init_pack_failed(struct btree_node_iter *iter, * So we've got to search for start_of_range, then after the lookup iterate * past any extents that compare equal to the position we searched for. */ +__flatten void bch2_btree_node_iter_init(struct btree_node_iter *iter, struct btree *b, struct bpos *search) { - struct bset_tree *t; struct bkey_packed p, *packed_search = NULL; + struct btree_node_iter_set *pos = iter->data; + struct bkey_packed *k[MAX_BSETS]; + unsigned i; EBUG_ON(bkey_cmp(*search, b->data->min_key) < 0); bset_aux_tree_verify(b); @@ -1588,11 +1548,23 @@ void bch2_btree_node_iter_init(struct btree_node_iter *iter, return; } - for_each_bset(b, t) - __bch2_btree_node_iter_push(iter, b, - bch2_bset_search(b, t, search, - packed_search, &p), - btree_bkey_last(b, t)); + for (i = 0; i < b->nsets; i++) { + k[i] = __bch2_bset_search(b, b->set + i, search, &p); + prefetch_four_cachelines(k[i]); + } + + for (i = 0; i < b->nsets; i++) { + struct bset_tree *t = b->set + i; + struct bkey_packed *end = btree_bkey_last(b, t); + + k[i] = bch2_bset_search_linear(b, t, search, + packed_search, &p, k[i]); + if (k[i] != end) + *pos++ = (struct btree_node_iter_set) { + __btree_node_key_to_offset(b, k[i]), + __btree_node_key_to_offset(b, end) + }; + } bch2_btree_node_iter_sort(iter, b); } @@ -1668,6 +1640,10 @@ static inline void __bch2_btree_node_iter_advance(struct btree_node_iter *iter, EBUG_ON(iter->data->k > iter->data->end); + while (!__btree_node_iter_set_end(iter, 0) && + !__bch2_btree_node_iter_peek_all(iter, b)->u64s) + iter->data->k++; + if (unlikely(__btree_node_iter_set_end(iter, 0))) { bch2_btree_node_iter_set_drop(iter, iter->data); return; @@ -1786,17 +1762,9 @@ void bch2_btree_keys_stats(struct btree *b, struct bset_stats *stats) stats->floats += t->size - 1; for (j = 1; j < t->size; j++) - switch (bkey_float(b, t, j)->exponent) { - case BFLOAT_FAILED_UNPACKED: - stats->failed_unpacked++; - break; - case BFLOAT_FAILED_PREV: - stats->failed_prev++; - break; - case BFLOAT_FAILED_OVERFLOW: - stats->failed_overflow++; - break; - } + stats->failed += + bkey_float(b, t, j)->exponent == + BFLOAT_FAILED; } } } @@ -1805,9 +1773,7 @@ void bch2_bfloat_to_text(struct printbuf *out, struct btree *b, struct bkey_packed *k) { struct bset_tree *t = bch2_bkey_to_bset(b, k); - struct bkey_packed *l, *r, *p; - struct bkey uk, up; - char buf1[200], buf2[200]; + struct bkey uk; unsigned j, inorder; if (out->pos != out->end) @@ -1825,7 +1791,7 @@ void bch2_bfloat_to_text(struct printbuf *out, struct btree *b, return; switch (bkey_float(b, t, j)->exponent) { - case BFLOAT_FAILED_UNPACKED: + case BFLOAT_FAILED: uk = bkey_unpack_key(b, k); pr_buf(out, " failed unpacked at depth %u\n" @@ -1833,41 +1799,5 @@ void bch2_bfloat_to_text(struct printbuf *out, struct btree *b, ilog2(j), uk.p.inode, uk.p.offset); break; - case BFLOAT_FAILED_PREV: - p = tree_to_prev_bkey(b, t, j); - l = is_power_of_2(j) - ? btree_bkey_first(b, t) - : tree_to_prev_bkey(b, t, j >> ffs(j)); - r = is_power_of_2(j + 1) - ? 
bch2_bkey_prev_all(b, t, btree_bkey_last(b, t)) - : tree_to_bkey(b, t, j >> (ffz(j) + 1)); - - up = bkey_unpack_key(b, p); - uk = bkey_unpack_key(b, k); - bch2_to_binary(buf1, high_word(&b->format, p), b->nr_key_bits); - bch2_to_binary(buf2, high_word(&b->format, k), b->nr_key_bits); - - pr_buf(out, - " failed prev at depth %u\n" - "\tkey starts at bit %u but first differing bit at %u\n" - "\t%llu:%llu\n" - "\t%llu:%llu\n" - "\t%s\n" - "\t%s\n", - ilog2(j), - bch2_bkey_greatest_differing_bit(b, l, r), - bch2_bkey_greatest_differing_bit(b, p, k), - uk.p.inode, uk.p.offset, - up.p.inode, up.p.offset, - buf1, buf2); - break; - case BFLOAT_FAILED_OVERFLOW: - uk = bkey_unpack_key(b, k); - pr_buf(out, - " failed overflow at depth %u\n" - "\t%llu:%llu\n", - ilog2(j), - uk.p.inode, uk.p.offset); - break; } } diff --git a/fs/bcachefs/bset.h b/fs/bcachefs/bset.h index 643bd9e8bc4d..2653a74b3b14 100644 --- a/fs/bcachefs/bset.h +++ b/fs/bcachefs/bset.h @@ -284,9 +284,14 @@ static inline struct bkey_s __bkey_disassemble(struct btree *b, return (struct bkey_s) { .k = u, .v = bkeyp_val(&b->format, k), }; } -#define for_each_bset(_b, _t) \ +#define for_each_bset(_b, _t) \ for (_t = (_b)->set; _t < (_b)->set + (_b)->nsets; _t++) +#define bset_tree_for_each_key(_b, _t, _k) \ + for (_k = btree_bkey_first(_b, _t); \ + _k != btree_bkey_last(_b, _t); \ + _k = bkey_next_skip_noops(_k, btree_bkey_last(_b, _t))) + static inline bool bset_has_ro_aux_tree(struct bset_tree *t) { return bset_aux_tree_type(t) == BSET_RO_AUX_TREE; @@ -564,6 +569,16 @@ static inline void btree_keys_account_key(struct btree_nr_keys *n, n->unpacked_keys += sign; } +static inline void btree_keys_account_val_delta(struct btree *b, + struct bkey_packed *k, + int delta) +{ + struct bset_tree *t = bch2_bkey_to_bset(b, k); + + b->nr.live_u64s += delta; + b->nr.bset_u64s[t - b->set] += delta; +} + #define btree_keys_account_key_add(_nr, _bset_idx, _k) \ btree_keys_account_key(_nr, _bset_idx, _k, 1) #define btree_keys_account_key_drop(_nr, _bset_idx, _k) \ @@ -582,9 +597,7 @@ struct bset_stats { } sets[BSET_TREE_NR_TYPES]; size_t floats; - size_t failed_unpacked; - size_t failed_prev; - size_t failed_overflow; + size_t failed; }; void bch2_btree_keys_stats(struct btree *, struct bset_stats *); diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index 046524c8d5ea..5d3acba525c2 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -674,10 +674,7 @@ struct btree *bch2_btree_node_get(struct bch_fs *c, struct btree_iter *iter, EBUG_ON(!btree_node_locked(iter, level + 1)); EBUG_ON(level >= BTREE_MAX_DEPTH); retry: - rcu_read_lock(); b = btree_cache_find(bc, k); - rcu_read_unlock(); - if (unlikely(!b)) { /* * We must have the parent locked to call bch2_btree_node_fill(), @@ -878,10 +875,7 @@ void bch2_btree_node_prefetch(struct bch_fs *c, struct btree_iter *iter, BUG_ON(!btree_node_locked(iter, level + 1)); BUG_ON(level >= BTREE_MAX_DEPTH); - rcu_read_lock(); b = btree_cache_find(bc, k); - rcu_read_unlock(); - if (b) return; @@ -915,9 +909,7 @@ void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c, " nr packed keys %u\n" " nr unpacked keys %u\n" " floats %zu\n" - " failed unpacked %zu\n" - " failed prev %zu\n" - " failed overflow %zu\n", + " failed unpacked %zu\n", f->key_u64s, f->bits_per_field[0], f->bits_per_field[1], @@ -934,7 +926,5 @@ void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c, b->nr.packed_keys, b->nr.unpacked_keys, stats.floats, - stats.failed_unpacked, - 
stats.failed_prev, - stats.failed_overflow); + stats.failed); } diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index f4adb07a3de2..8bbf60b07736 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -216,7 +216,7 @@ static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id, : expensive_debug_checks(c) ? 0 : !btree_node_type_needs_gc(btree_id) ? 1 : 0; - u8 max_stale; + u8 max_stale = 0; int ret = 0; bch2_trans_init(&trans, c, 0, 0); @@ -640,12 +640,7 @@ static int bch2_gc_start(struct bch_fs *c, { struct bch_dev *ca; unsigned i; - - /* - * indicate to stripe code that we need to allocate for the gc stripes - * radix tree, too - */ - gc_pos_set(c, gc_phase(GC_PHASE_START)); + int ret; BUG_ON(c->usage_gc); @@ -673,6 +668,18 @@ static int bch2_gc_start(struct bch_fs *c, } } + ret = bch2_ec_mem_alloc(c, true); + if (ret) + return ret; + + percpu_down_write(&c->mark_lock); + + /* + * indicate to stripe code that we need to allocate for the gc stripes + * radix tree, too + */ + gc_pos_set(c, gc_phase(GC_PHASE_START)); + for_each_member_device(ca, c, i) { struct bucket_array *dst = __bucket_array(ca, 1); struct bucket_array *src = __bucket_array(ca, 0); @@ -697,7 +704,9 @@ static int bch2_gc_start(struct bch_fs *c, } }; - return bch2_ec_mem_alloc(c, true); + percpu_up_write(&c->mark_lock); + + return 0; } /** @@ -730,10 +739,7 @@ int bch2_gc(struct bch_fs *c, struct journal_keys *journal_keys, down_write(&c->gc_lock); again: - percpu_down_write(&c->mark_lock); ret = bch2_gc_start(c, metadata_only); - percpu_up_write(&c->mark_lock); - if (ret) goto out; @@ -916,7 +922,7 @@ static void bch2_coalesce_nodes(struct bch_fs *c, struct btree_iter *iter, k < vstruct_last(s2) && vstruct_blocks_plus(n1->data, c->block_bits, u64s + k->u64s) <= blocks; - k = bkey_next(k)) { + k = bkey_next_skip_noops(k, vstruct_last(s2))) { last = k; u64s += k->u64s; } @@ -1034,11 +1040,12 @@ next: old_nodes[i] = new_nodes[i]; } else { old_nodes[i] = NULL; - if (new_nodes[i]) - six_unlock_intent(&new_nodes[i]->lock); } } + for (i = 0; i < nr_new_nodes; i++) + six_unlock_intent(&new_nodes[i]->lock); + bch2_btree_update_done(as); bch2_keylist_free(&keylist, NULL); } diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index 7c88b9d64935..c345262d804b 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -26,34 +26,33 @@ static void verify_no_dups(struct btree *b, struct bkey_packed *end) { #ifdef CONFIG_BCACHEFS_DEBUG - struct bkey_packed *k; + struct bkey_packed *k, *p; + + if (start == end) + return; - for (k = start; k != end && bkey_next(k) != end; k = bkey_next(k)) { - struct bkey l = bkey_unpack_key(b, k); - struct bkey r = bkey_unpack_key(b, bkey_next(k)); + for (p = start, k = bkey_next_skip_noops(start, end); + k != end; + p = k, k = bkey_next_skip_noops(k, end)) { + struct bkey l = bkey_unpack_key(b, p); + struct bkey r = bkey_unpack_key(b, k); BUG_ON(btree_node_is_extents(b) ? 
bkey_cmp(l.p, bkey_start_pos(&r)) > 0 : bkey_cmp(l.p, bkey_start_pos(&r)) >= 0); - //BUG_ON(bkey_cmp_packed(&b->format, k, bkey_next(k)) >= 0); + //BUG_ON(bkey_cmp_packed(&b->format, p, k) >= 0); } #endif } -static void clear_needs_whiteout(struct bset *i) -{ - struct bkey_packed *k; - - for (k = i->start; k != vstruct_last(i); k = bkey_next(k)) - k->needs_whiteout = false; -} - -static void set_needs_whiteout(struct bset *i) +static void set_needs_whiteout(struct bset *i, int v) { struct bkey_packed *k; - for (k = i->start; k != vstruct_last(i); k = bkey_next(k)) - k->needs_whiteout = true; + for (k = i->start; + k != vstruct_last(i); + k = bkey_next_skip_noops(k, vstruct_last(i))) + k->needs_whiteout = v; } static void btree_bounce_free(struct bch_fs *c, unsigned order, @@ -168,7 +167,7 @@ bool __bch2_compact_whiteouts(struct bch_fs *c, struct btree *b, out = i->start; for (k = start; k != end; k = n) { - n = bkey_next(k); + n = bkey_next_skip_noops(k, end); if (bkey_deleted(k) && btree_node_is_extents(b)) continue; @@ -261,7 +260,7 @@ static bool bch2_drop_whiteouts(struct btree *b) out = i->start; for (k = start; k != end; k = n) { - n = bkey_next(k); + n = bkey_next_skip_noops(k, end); if (!bkey_whiteout(k)) { bkey_copy(out, k); @@ -510,7 +509,7 @@ static void bset_encrypt(struct bch_fs *c, struct bset *i, unsigned offset) bch2_encrypt(c, BSET_CSUM_TYPE(i), nonce, &bn->flags, bytes); - nonce = nonce_add(nonce, round_up(bytes, CHACHA20_BLOCK_SIZE)); + nonce = nonce_add(nonce, round_up(bytes, CHACHA_BLOCK_SIZE)); } bch2_encrypt(c, BSET_CSUM_TYPE(i), nonce, i->_data, @@ -680,14 +679,6 @@ static int validate_bset(struct bch_fs *c, struct btree *b, struct bkey tmp; const char *invalid; - if (btree_err_on(!k->u64s, - BTREE_ERR_FIXABLE, c, b, i, - "KEY_U64s 0: %zu bytes of metadata lost", - vstruct_end(i) - (void *) k)) { - i->u64s = cpu_to_le16((u64 *) k - i->_data); - break; - } - if (btree_err_on(bkey_next(k) > vstruct_last(i), BTREE_ERR_FIXABLE, c, b, i, "key extends past end of bset")) { @@ -756,7 +747,7 @@ static int validate_bset(struct bch_fs *c, struct btree *b, prev_pos = u.k->p; prev = k; - k = bkey_next(k); + k = bkey_next_skip_noops(k, vstruct_last(i)); } SET_BSET_BIG_ENDIAN(i, CPU_BIG_ENDIAN); @@ -915,12 +906,12 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry continue; } - k = bkey_next(k); + k = bkey_next_skip_noops(k, vstruct_last(i)); } bch2_bset_build_aux_tree(b, b->set, false); - set_needs_whiteout(btree_bset_first(b)); + set_needs_whiteout(btree_bset_first(b), true); btree_node_reset_sib_u64s(b); out: @@ -1425,7 +1416,7 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, : bch2_sort_keys(i->start, &sort_iter, false); le16_add_cpu(&i->u64s, u64s); - clear_needs_whiteout(i); + set_needs_whiteout(i, false); /* do we have data to write? 
*/ if (b->written && !i->u64s) @@ -1500,10 +1491,13 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, wbio->data = data; wbio->wbio.order = order; wbio->wbio.used_mempool = used_mempool; - wbio->wbio.bio.bi_opf = REQ_OP_WRITE|REQ_META|REQ_FUA; + wbio->wbio.bio.bi_opf = REQ_OP_WRITE|REQ_META; wbio->wbio.bio.bi_end_io = btree_node_write_endio; wbio->wbio.bio.bi_private = b; + if (b->level || !b->written) + wbio->wbio.bio.bi_opf |= REQ_FUA; + bch2_bio_map(&wbio->wbio.bio, data, sectors_to_write << 9); /* @@ -1576,7 +1570,7 @@ bool bch2_btree_post_write_cleanup(struct bch_fs *c, struct btree *b) } for_each_bset(b, t) - set_needs_whiteout(bset(b, t)); + set_needs_whiteout(bset(b, t), true); bch2_btree_verify(c, b); diff --git a/fs/bcachefs/btree_io.h b/fs/bcachefs/btree_io.h index c817aeed878a..955a80cafae3 100644 --- a/fs/bcachefs/btree_io.h +++ b/fs/bcachefs/btree_io.h @@ -62,10 +62,10 @@ bool __bch2_compact_whiteouts(struct bch_fs *, struct btree *, enum compact_mode static inline unsigned should_compact_bset_lazy(struct btree *b, struct bset_tree *t) { - unsigned bset_u64s = le16_to_cpu(bset(b, t)->u64s); - unsigned dead_u64s = bset_u64s - b->nr.bset_u64s[t - b->set]; + unsigned total_u64s = bset_u64s(t); + unsigned dead_u64s = total_u64s - b->nr.bset_u64s[t - b->set]; - return dead_u64s > 128 && dead_u64s * 3 > bset_u64s; + return dead_u64s > 64 && dead_u64s * 3 > total_u64s; } static inline bool bch2_maybe_compact_whiteouts(struct bch_fs *c, struct btree *b) diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 40cd87d73a4f..a4180124d7d1 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -64,21 +64,9 @@ static inline int btree_iter_pos_cmp(struct btree_iter *iter, /* Btree node locking: */ -/* - * Updates the saved lock sequence number, so that bch2_btree_node_relock() will - * succeed: - */ void bch2_btree_node_unlock_write(struct btree *b, struct btree_iter *iter) { - struct btree_iter *linked; - - EBUG_ON(iter->l[b->level].b != b); - EBUG_ON(iter->l[b->level].lock_seq + 1 != b->lock.state.seq); - - trans_for_each_iter_with_node(iter->trans, b, linked) - linked->l[b->level].lock_seq += 2; - - six_unlock_write(&b->lock); + bch2_btree_node_unlock_write_inlined(b, iter); } void __bch2_btree_node_lock_write(struct btree *b, struct btree_iter *iter) @@ -306,9 +294,7 @@ void bch2_btree_trans_verify_locks(struct btree_trans *trans) __flatten static bool bch2_btree_iter_relock(struct btree_iter *iter, bool trace) { - return iter->uptodate >= BTREE_ITER_NEED_RELOCK - ? 
btree_iter_get_locks(iter, false, trace) - : true; + return btree_iter_get_locks(iter, false, trace); } bool __bch2_btree_iter_upgrade(struct btree_iter *iter, @@ -473,7 +459,7 @@ static void __bch2_btree_iter_verify(struct btree_iter *iter, } BUG_ON(iter->uptodate == BTREE_ITER_UPTODATE && - (iter->flags & BTREE_ITER_TYPE) == BTREE_ITER_KEYS && + btree_iter_type(iter) == BTREE_ITER_KEYS && !bkey_whiteout(&iter->k) && bch2_btree_node_iter_end(&l->iter)); } @@ -513,6 +499,30 @@ static void btree_node_iter_set_set_pos(struct btree_node_iter *iter, bch2_btree_node_iter_push(iter, b, k, btree_bkey_last(b, t)); } +static void __bch2_btree_iter_fix_key_modified(struct btree_iter *iter, + struct btree *b, + struct bkey_packed *where) +{ + struct btree_node_iter *node_iter = &iter->l[0].iter; + + if (where == bch2_btree_node_iter_peek_all(node_iter, b)) { + bkey_disassemble(b, where, &iter->k); + btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK); + } +} + +void bch2_btree_iter_fix_key_modified(struct btree_iter *iter, + struct btree *b, + struct bkey_packed *where) +{ + struct btree_iter *linked; + + trans_for_each_iter_with_node(iter->trans, b, linked) { + __bch2_btree_iter_fix_key_modified(linked, b, where); + __bch2_btree_iter_verify(linked, b); + } +} + static void __bch2_btree_node_iter_fix(struct btree_iter *iter, struct btree *b, struct btree_node_iter *node_iter, @@ -833,8 +843,6 @@ void bch2_btree_iter_node_replace(struct btree_iter *iter, struct btree *b) btree_iter_node_set(linked, b); } - - six_unlock_intent(&b->lock); } void bch2_btree_iter_node_drop(struct btree_iter *iter, struct btree *b) @@ -941,7 +949,7 @@ static void btree_iter_prefetch(struct btree_iter *iter) btree_node_unlock(iter, iter->level); } -static inline int btree_iter_down(struct btree_iter *iter) +static __always_inline int btree_iter_down(struct btree_iter *iter) { struct bch_fs *c = iter->trans->c; struct btree_iter_level *l = &iter->l[iter->level]; @@ -950,7 +958,7 @@ static inline int btree_iter_down(struct btree_iter *iter) enum six_lock_type lock_type = __btree_lock_want(iter, level); BKEY_PADDED(k) tmp; - BUG_ON(!btree_node_locked(iter, iter->level)); + EBUG_ON(!btree_node_locked(iter, iter->level)); bch2_bkey_unpack(l->b, &tmp.k, bch2_btree_node_iter_peek(&l->iter, l->b)); @@ -1010,8 +1018,11 @@ retry_all: if (unlikely(ret == -EIO)) { trans->error = true; - orig_iter->flags |= BTREE_ITER_ERROR; - orig_iter->l[orig_iter->level].b = BTREE_ITER_NO_NODE_ERROR; + if (orig_iter) { + orig_iter->flags |= BTREE_ITER_ERROR; + orig_iter->l[orig_iter->level].b = + BTREE_ITER_NO_NODE_ERROR; + } goto out; } @@ -1085,7 +1096,15 @@ static int btree_iter_traverse_one(struct btree_iter *iter) if (unlikely(iter->level >= BTREE_MAX_DEPTH)) return 0; - if (bch2_btree_iter_relock(iter, false)) + /* + * if we need interior nodes locked, call btree_iter_relock() to make + * sure we walk back up enough that we lock them: + */ + if (iter->uptodate == BTREE_ITER_NEED_RELOCK || + iter->locks_want > 1) + bch2_btree_iter_relock(iter, false); + + if (iter->uptodate < BTREE_ITER_NEED_RELOCK) return 0; /* @@ -1152,6 +1171,7 @@ static inline void bch2_btree_iter_checks(struct btree_iter *iter, EBUG_ON(!!(iter->flags & BTREE_ITER_IS_EXTENTS) != (btree_node_type_is_extents(iter->btree_id) && type != BTREE_ITER_NODES)); + EBUG_ON(btree_iter_type(iter) != type); bch2_btree_trans_verify_locks(iter->trans); } @@ -1357,6 +1377,13 @@ static inline struct bkey_s_c btree_iter_peek_uptodate(struct btree_iter *iter) if 
(debug_check_iterators(iter->trans->c)) { struct bkey k = bkey_unpack_key(l->b, _k); + + /* + * this flag is internal to the btree code, + * we don't care if it doesn't match - if it's now set + * it just means the key has been written out to disk: + */ + k.needs_whiteout = iter->k.needs_whiteout; BUG_ON(memcmp(&k, &iter->k, sizeof(k))); } @@ -1436,6 +1463,14 @@ struct bkey_s_c bch2_btree_iter_next(struct btree_iter *iter) return bch2_btree_iter_peek(iter); } + if (unlikely(bkey_deleted(&iter->k))) { + /* + * we're currently pointed at a hole, because previously we were + * iterating over slots: + */ + return bch2_btree_iter_peek(iter); + } + do { bch2_btree_node_iter_advance(&l->iter, l->b); p = bch2_btree_node_iter_peek_all(&l->iter, l->b); @@ -1661,7 +1696,7 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) { int ret; - bch2_btree_iter_checks(iter, BTREE_ITER_SLOTS); + bch2_btree_iter_checks(iter, BTREE_ITER_KEYS); if (iter->uptodate == BTREE_ITER_UPTODATE) return btree_iter_peek_uptodate(iter); @@ -1675,7 +1710,7 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) struct bkey_s_c bch2_btree_iter_next_slot(struct btree_iter *iter) { - bch2_btree_iter_checks(iter, BTREE_ITER_SLOTS); + bch2_btree_iter_checks(iter, BTREE_ITER_KEYS); iter->pos = btree_type_successor(iter->btree_id, iter->k.p); @@ -1729,15 +1764,6 @@ static inline void bch2_btree_iter_init(struct btree_trans *trans, /* new transactional stuff: */ -int bch2_trans_iter_put(struct btree_trans *trans, - struct btree_iter *iter) -{ - int ret = btree_iter_err(iter); - - trans->iters_live &= ~(1ULL << iter->idx); - return ret; -} - static inline void __bch2_trans_iter_free(struct btree_trans *trans, unsigned idx) { @@ -1745,26 +1771,27 @@ static inline void __bch2_trans_iter_free(struct btree_trans *trans, trans->iters_linked &= ~(1ULL << idx); trans->iters_live &= ~(1ULL << idx); trans->iters_touched &= ~(1ULL << idx); - trans->iters_unlink_on_restart &= ~(1ULL << idx); - trans->iters_unlink_on_commit &= ~(1ULL << idx); } -int bch2_trans_iter_free(struct btree_trans *trans, - struct btree_iter *iter) +int bch2_trans_iter_put(struct btree_trans *trans, + struct btree_iter *iter) { int ret = btree_iter_err(iter); - __bch2_trans_iter_free(trans, iter->idx); + if (!(trans->iters_touched & (1ULL << iter->idx)) && + !(iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT)) + __bch2_trans_iter_free(trans, iter->idx); + + trans->iters_live &= ~(1ULL << iter->idx); return ret; } -int bch2_trans_iter_free_on_commit(struct btree_trans *trans, - struct btree_iter *iter) +int bch2_trans_iter_free(struct btree_trans *trans, + struct btree_iter *iter) { - int ret = btree_iter_err(iter); + trans->iters_touched &= ~(1ULL << iter->idx); - trans->iters_unlink_on_commit |= 1ULL << iter->idx; - return ret; + return bch2_trans_iter_put(trans, iter); } static int bch2_trans_realloc_iters(struct btree_trans *trans, @@ -1830,7 +1857,7 @@ success: return 0; } -static int btree_trans_iter_alloc(struct btree_trans *trans) +static struct btree_iter *btree_trans_iter_alloc(struct btree_trans *trans) { unsigned idx = __ffs64(~trans->iters_linked); @@ -1838,9 +1865,27 @@ static int btree_trans_iter_alloc(struct btree_trans *trans) goto got_slot; if (trans->nr_iters == trans->size) { - int ret = bch2_trans_realloc_iters(trans, trans->size * 2); + int ret; + + if (trans->nr_iters >= BTREE_ITER_MAX) { + struct btree_iter *iter; + + trans_for_each_iter(trans, iter) { + pr_err("iter: btree %s pos %llu:%llu%s%s%s", + 
bch2_btree_ids[iter->btree_id], + iter->pos.inode, + iter->pos.offset, + (trans->iters_live & (1ULL << iter->idx)) ? " live" : "", + (trans->iters_touched & (1ULL << iter->idx)) ? " touched" : "", + iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT ? " keep" : ""); + } + + panic("trans iter oveflow\n"); + } + + ret = bch2_trans_realloc_iters(trans, trans->size * 2); if (ret) - return ret; + return ERR_PTR(ret); } idx = trans->nr_iters++; @@ -1850,71 +1895,97 @@ static int btree_trans_iter_alloc(struct btree_trans *trans) got_slot: BUG_ON(trans->iters_linked & (1ULL << idx)); trans->iters_linked |= 1ULL << idx; - return idx; + return &trans->iters[idx]; +} + +static inline void btree_iter_copy(struct btree_iter *dst, + struct btree_iter *src) +{ + unsigned i, idx = dst->idx; + + *dst = *src; + dst->idx = idx; + + for (i = 0; i < BTREE_MAX_DEPTH; i++) + if (btree_node_locked(dst, i)) + six_lock_increment(&dst->l[i].b->lock, + __btree_lock_want(dst, i)); +} + +static inline struct bpos bpos_diff(struct bpos l, struct bpos r) +{ + if (bkey_cmp(l, r) > 0) + swap(l, r); + + return POS(r.inode - l.inode, r.offset - l.offset); } static struct btree_iter *__btree_trans_get_iter(struct btree_trans *trans, unsigned btree_id, struct bpos pos, - unsigned flags, u64 iter_id) + unsigned flags) { - struct btree_iter *iter; - int idx; + struct btree_iter *iter, *best = NULL; BUG_ON(trans->nr_iters > BTREE_ITER_MAX); - for (idx = 0; idx < trans->nr_iters; idx++) { - if (!(trans->iters_linked & (1ULL << idx))) + trans_for_each_iter(trans, iter) { + if (btree_iter_type(iter) != (flags & BTREE_ITER_TYPE)) continue; - iter = &trans->iters[idx]; - if (iter_id - ? iter->id == iter_id - : (iter->btree_id == btree_id && - !bkey_cmp(iter->pos, pos))) - goto found; + if (iter->btree_id != btree_id) + continue; + + if (best && + bkey_cmp(bpos_diff(best->pos, pos), + bpos_diff(iter->pos, pos)) < 0) + continue; + + best = iter; } - idx = -1; -found: - if (idx < 0) { - idx = btree_trans_iter_alloc(trans); - if (idx < 0) - return ERR_PTR(idx); - iter = &trans->iters[idx]; - iter->id = iter_id; + if (!best) { + iter = btree_trans_iter_alloc(trans); + if (IS_ERR(iter)) + return iter; bch2_btree_iter_init(trans, iter, btree_id, pos, flags); - } else { - iter = &trans->iters[idx]; - - iter->flags &= ~(BTREE_ITER_INTENT|BTREE_ITER_PREFETCH); - iter->flags |= flags & (BTREE_ITER_INTENT|BTREE_ITER_PREFETCH); + } else if ((trans->iters_live & (1ULL << best->idx)) || + (best->flags & BTREE_ITER_KEEP_UNTIL_COMMIT)) { + iter = btree_trans_iter_alloc(trans); + if (IS_ERR(iter)) + return iter; - if ((iter->flags & BTREE_ITER_INTENT) && - !bch2_btree_iter_upgrade(iter, 1)) { - trace_trans_restart_upgrade(trans->ip); - return ERR_PTR(-EINTR); - } + btree_iter_copy(iter, best); + } else { + iter = best; } - BUG_ON(iter->btree_id != btree_id); - BUG_ON(trans->iters_live & (1ULL << idx)); - trans->iters_live |= 1ULL << idx; - trans->iters_touched |= 1ULL << idx; + iter->flags &= ~BTREE_ITER_KEEP_UNTIL_COMMIT; + iter->flags &= ~(BTREE_ITER_SLOTS|BTREE_ITER_INTENT|BTREE_ITER_PREFETCH); + iter->flags |= flags & (BTREE_ITER_SLOTS|BTREE_ITER_INTENT|BTREE_ITER_PREFETCH); + + if (iter->flags & BTREE_ITER_INTENT) + bch2_btree_iter_upgrade(iter, 1); + else + bch2_btree_iter_downgrade(iter); BUG_ON(iter->btree_id != btree_id); BUG_ON((iter->flags ^ flags) & BTREE_ITER_TYPE); + BUG_ON(iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT); + BUG_ON(trans->iters_live & (1ULL << iter->idx)); + + trans->iters_live |= 1ULL << iter->idx; + trans->iters_touched |= 1ULL << 
iter->idx; return iter; } -struct btree_iter *__bch2_trans_get_iter(struct btree_trans *trans, - enum btree_id btree_id, - struct bpos pos, unsigned flags, - u64 iter_id) +struct btree_iter *bch2_trans_get_iter(struct btree_trans *trans, + enum btree_id btree_id, + struct bpos pos, unsigned flags) { struct btree_iter *iter = - __btree_trans_get_iter(trans, btree_id, pos, flags, iter_id); + __btree_trans_get_iter(trans, btree_id, pos, flags); if (!IS_ERR(iter)) bch2_btree_iter_set_pos(iter, pos); @@ -1930,7 +2001,7 @@ struct btree_iter *bch2_trans_get_node_iter(struct btree_trans *trans, { struct btree_iter *iter = __btree_trans_get_iter(trans, btree_id, pos, - flags|BTREE_ITER_NODES, 0); + flags|BTREE_ITER_NODES); unsigned i; BUG_ON(IS_ERR(iter)); @@ -1950,28 +2021,22 @@ struct btree_iter *bch2_trans_copy_iter(struct btree_trans *trans, struct btree_iter *src) { struct btree_iter *iter; - int i, idx; - - idx = btree_trans_iter_alloc(trans); - if (idx < 0) - return ERR_PTR(idx); - trans->iters_live |= 1ULL << idx; - trans->iters_touched |= 1ULL << idx; - trans->iters_unlink_on_restart |= 1ULL << idx; + iter = btree_trans_iter_alloc(trans); + if (IS_ERR(iter)) + return iter; - iter = &trans->iters[idx]; + btree_iter_copy(iter, src); - memcpy(&iter->trans, - &src->trans, - (void *) &iter[1] - (void *) &iter->trans); - - for (i = 0; i < BTREE_MAX_DEPTH; i++) - if (btree_node_locked(iter, i)) - six_lock_increment(&iter->l[i].b->lock, - __btree_lock_want(iter, i)); + trans->iters_live |= 1ULL << iter->idx; + /* + * Don't mark it as touched, we don't need to preserve this iter since + * it's cheap to copy it again: + */ + trans->iters_touched &= ~(1ULL << iter->idx); + iter->flags &= ~BTREE_ITER_KEEP_UNTIL_COMMIT; - return &trans->iters[idx]; + return iter; } static int bch2_trans_preload_mem(struct btree_trans *trans, size_t size) @@ -2010,10 +2075,11 @@ void *bch2_trans_kmalloc(struct btree_trans *trans, size_t size) return p; } -inline void bch2_trans_unlink_iters(struct btree_trans *trans, u64 iters) +inline void bch2_trans_unlink_iters(struct btree_trans *trans) { - iters &= trans->iters_linked; - iters &= ~trans->iters_live; + u64 iters = trans->iters_linked & + ~trans->iters_touched & + ~trans->iters_live; while (iters) { unsigned idx = __ffs64(iters); @@ -2023,33 +2089,24 @@ inline void bch2_trans_unlink_iters(struct btree_trans *trans, u64 iters) } } -void bch2_trans_begin(struct btree_trans *trans) +void bch2_trans_reset(struct btree_trans *trans, unsigned flags) { - u64 iters_to_unlink; + struct btree_iter *iter; - /* - * On transaction restart, the transaction isn't required to allocate - * all the same iterators it on the last iteration: - * - * Unlink any iterators it didn't use this iteration, assuming it got - * further (allocated an iter with a higher idx) than where the iter - * was originally allocated: - */ - iters_to_unlink = ~trans->iters_live & - ((1ULL << fls64(trans->iters_live)) - 1); + trans_for_each_iter(trans, iter) + iter->flags &= ~BTREE_ITER_KEEP_UNTIL_COMMIT; - iters_to_unlink |= trans->iters_unlink_on_restart; - iters_to_unlink |= trans->iters_unlink_on_commit; + bch2_trans_unlink_iters(trans); - trans->iters_live = 0; + if (flags & TRANS_RESET_ITERS) + trans->iters_live = 0; - bch2_trans_unlink_iters(trans, iters_to_unlink); + trans->iters_touched &= trans->iters_live; - trans->iters_touched = 0; - trans->iters_unlink_on_restart = 0; - trans->iters_unlink_on_commit = 0; trans->nr_updates = 0; - trans->mem_top = 0; + + if (flags & TRANS_RESET_MEM) + 
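/*
 * editor's note: after bch2_trans_reset(), bch2_trans_begin() clears both
 * the iterator state and the transaction memory (TRANS_RESET_ITERS |
 * TRANS_RESET_MEM), while bch2_trans_begin_updates() clears only the memory.
 * Per the bch2_trans_commit() documentation added by this patch, -EINTR means
 * "call again", so callers take the shape below -- a hedged sketch modelled
 * on bch2_btree_insert() later in the diff, with error handling trimmed:
 */
static int example_insert(struct btree_trans *trans, enum btree_id id,
			  struct bkey_i *k, u64 *journal_seq)
{
	struct btree_iter *iter;
	int ret;
retry:
	bch2_trans_begin(trans);	/* reset iters + trans memory from last attempt */

	iter = bch2_trans_get_iter(trans, id, bkey_start_pos(&k->k),
				   BTREE_ITER_INTENT);
	ret = PTR_ERR_OR_ZERO(iter);
	if (ret)
		return ret;

	bch2_trans_update(trans, iter, k);

	ret = bch2_trans_commit(trans, NULL, journal_seq,
				BTREE_INSERT_ATOMIC|
				BTREE_INSERT_NOFAIL);
	if (ret == -EINTR)
		goto retry;
	return ret;
}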
trans->mem_top = 0; bch2_btree_iter_traverse_all(trans); } diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index e4967215e1d9..4c5032222319 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -48,6 +48,11 @@ static inline int btree_iter_err(const struct btree_iter *iter) /* Iterate over iters within a transaction: */ +#define trans_for_each_iter_all(_trans, _iter) \ + for (_iter = (_trans)->iters; \ + _iter < (_trans)->iters + (_trans)->nr_iters; \ + _iter++) + static inline struct btree_iter * __trans_next_iter(struct btree_trans *trans, unsigned idx) { @@ -99,6 +104,8 @@ static inline void bch2_btree_iter_verify(struct btree_iter *iter, static inline void bch2_btree_trans_verify_locks(struct btree_trans *iter) {} #endif +void bch2_btree_iter_fix_key_modified(struct btree_iter *, struct btree *, + struct bkey_packed *); void bch2_btree_node_iter_fix(struct btree_iter *, struct btree *, struct btree_node_iter *, struct bkey_packed *, unsigned, unsigned); @@ -246,6 +253,11 @@ static inline struct bkey_s_c __bch2_btree_iter_next(struct btree_iter *iter, : bch2_btree_iter_next(iter); } +static inline int bkey_err(struct bkey_s_c k) +{ + return PTR_ERR_OR_ZERO(k.k); +} + #define for_each_btree_key(_trans, _iter, _btree_id, \ _start, _flags, _k, _ret) \ for ((_ret) = PTR_ERR_OR_ZERO((_iter) = \ @@ -257,57 +269,39 @@ static inline struct bkey_s_c __bch2_btree_iter_next(struct btree_iter *iter, (_ret) = PTR_ERR_OR_ZERO(((_k) = \ __bch2_btree_iter_next(_iter, _flags)).k)) -#define for_each_btree_key_continue(_iter, _flags, _k) \ +#define for_each_btree_key_continue(_iter, _flags, _k, _ret) \ for ((_k) = __bch2_btree_iter_peek(_iter, _flags); \ - !IS_ERR_OR_NULL((_k).k); \ + !((_ret) = bkey_err(_k)) && (_k).k; \ (_k) = __bch2_btree_iter_next(_iter, _flags)) -static inline int bkey_err(struct bkey_s_c k) -{ - return PTR_ERR_OR_ZERO(k.k); -} - /* new multiple iterator interface: */ int bch2_trans_iter_put(struct btree_trans *, struct btree_iter *); int bch2_trans_iter_free(struct btree_trans *, struct btree_iter *); -int bch2_trans_iter_free_on_commit(struct btree_trans *, struct btree_iter *); -void bch2_trans_unlink_iters(struct btree_trans *, u64); +void bch2_trans_unlink_iters(struct btree_trans *); -struct btree_iter *__bch2_trans_get_iter(struct btree_trans *, enum btree_id, - struct bpos, unsigned, u64); +struct btree_iter *bch2_trans_get_iter(struct btree_trans *, enum btree_id, + struct bpos, unsigned); struct btree_iter *bch2_trans_copy_iter(struct btree_trans *, struct btree_iter *); +struct btree_iter *bch2_trans_get_node_iter(struct btree_trans *, + enum btree_id, struct bpos, + unsigned, unsigned, unsigned); -static __always_inline u64 __btree_iter_id(void) -{ - u64 ret = 0; +#define TRANS_RESET_ITERS (1 << 0) +#define TRANS_RESET_MEM (1 << 1) - ret <<= 32; - ret |= _RET_IP_ & U32_MAX; - ret <<= 32; - ret |= _THIS_IP_ & U32_MAX; - return ret; -} +void bch2_trans_reset(struct btree_trans *, unsigned); -static __always_inline struct btree_iter * -bch2_trans_get_iter(struct btree_trans *trans, enum btree_id btree_id, - struct bpos pos, unsigned flags) +static inline void bch2_trans_begin(struct btree_trans *trans) { - return __bch2_trans_get_iter(trans, btree_id, pos, flags, - __btree_iter_id()); + return bch2_trans_reset(trans, TRANS_RESET_ITERS|TRANS_RESET_MEM); } -struct btree_iter *bch2_trans_get_node_iter(struct btree_trans *, - enum btree_id, struct bpos, - unsigned, unsigned, unsigned); - -void bch2_trans_begin(struct btree_trans *); - static 
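/*
 * editor's note: a hedged usage sketch for the iteration macros defined
 * above -- the btree id and start position are placeholders. Note that
 * for_each_btree_key() allocates the iterator itself (it is owned by the
 * transaction) and reports both allocation and iteration errors through
 * _ret via bkey_err():
 */
static int example_walk(struct btree_trans *trans)
{
	struct btree_iter *iter;
	struct bkey_s_c k;
	int ret;

	for_each_btree_key(trans, iter, BTREE_ID_EXTENTS, POS_MIN, 0, k, ret) {
		/* inspect k here; bkey_err(k) has already been checked */
	}

	return ret;
}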
inline void bch2_trans_begin_updates(struct btree_trans *trans) { - trans->nr_updates = 0; + return bch2_trans_reset(trans, TRANS_RESET_MEM); } void *bch2_trans_kmalloc(struct btree_trans *, size_t); diff --git a/fs/bcachefs/btree_locking.h b/fs/bcachefs/btree_locking.h index ea07ba19c5dc..aaad2d289e79 100644 --- a/fs/bcachefs/btree_locking.h +++ b/fs/bcachefs/btree_locking.h @@ -203,6 +203,24 @@ static inline bool bch2_btree_node_relock(struct btree_iter *iter, __bch2_btree_node_relock(iter, level); } +/* + * Updates the saved lock sequence number, so that bch2_btree_node_relock() will + * succeed: + */ +static inline void +bch2_btree_node_unlock_write_inlined(struct btree *b, struct btree_iter *iter) +{ + struct btree_iter *linked; + + EBUG_ON(iter->l[b->level].b != b); + EBUG_ON(iter->l[b->level].lock_seq + 1 != b->lock.state.seq); + + trans_for_each_iter_with_node(iter->trans, b, linked) + linked->l[b->level].lock_seq += 2; + + six_unlock_write(&b->lock); +} + void bch2_btree_node_unlock_write(struct btree *, struct btree_iter *); void __bch2_btree_node_lock_write(struct btree *, struct btree_iter *); @@ -212,7 +230,7 @@ static inline void bch2_btree_node_lock_write(struct btree *b, struct btree_iter EBUG_ON(iter->l[b->level].b != b); EBUG_ON(iter->l[b->level].lock_seq != b->lock.state.seq); - if (!six_trylock_write(&b->lock)) + if (unlikely(!six_trylock_write(&b->lock))) __bch2_btree_node_lock_write(b, iter); } diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index b0da09630911..efa68bb578ab 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -180,20 +180,21 @@ struct btree_node_iter { enum btree_iter_type { BTREE_ITER_KEYS, - BTREE_ITER_SLOTS, BTREE_ITER_NODES, }; #define BTREE_ITER_TYPE ((1 << 2) - 1) -#define BTREE_ITER_INTENT (1 << 2) -#define BTREE_ITER_PREFETCH (1 << 3) +#define BTREE_ITER_SLOTS (1 << 2) +#define BTREE_ITER_INTENT (1 << 3) +#define BTREE_ITER_PREFETCH (1 << 4) +#define BTREE_ITER_KEEP_UNTIL_COMMIT (1 << 5) /* * Used in bch2_btree_iter_traverse(), to indicate whether we're searching for * @pos or the first key strictly greater than @pos */ -#define BTREE_ITER_IS_EXTENTS (1 << 4) -#define BTREE_ITER_ERROR (1 << 5) +#define BTREE_ITER_IS_EXTENTS (1 << 6) +#define BTREE_ITER_ERROR (1 << 7) enum btree_iter_uptodate { BTREE_ITER_UPTODATE = 0, @@ -234,33 +235,16 @@ struct btree_iter { * bch2_btree_iter_next_slot() can correctly advance pos. 
*/ struct bkey k; - - u64 id; }; -struct deferred_update { - struct journal_preres res; - struct journal_entry_pin journal; - - spinlock_t lock; - unsigned dirty:1; - - u8 allocated_u64s; - enum btree_id btree_id; - - /* must be last: */ - struct bkey_i k; -}; +static inline enum btree_iter_type btree_iter_type(struct btree_iter *iter) +{ + return iter->flags & BTREE_ITER_TYPE; +} struct btree_insert_entry { struct bkey_i *k; - - union { struct btree_iter *iter; - struct deferred_update *d; - }; - - bool deferred; }; #define BTREE_ITER_MAX 64 @@ -268,13 +252,10 @@ struct btree_insert_entry { struct btree_trans { struct bch_fs *c; unsigned long ip; - u64 commit_start; u64 iters_linked; u64 iters_live; u64 iters_touched; - u64 iters_unlink_on_restart; - u64 iters_unlink_on_commit; u8 nr_iters; u8 nr_updates; @@ -298,12 +279,11 @@ struct btree_trans { struct disk_reservation *disk_res; unsigned flags; unsigned journal_u64s; + struct replicas_delta_list *fs_usage_deltas; struct btree_iter iters_onstack[2]; struct btree_insert_entry updates_onstack[6]; u8 updates_sorted_onstack[6]; - - struct replicas_delta_list *fs_usage_deltas; }; #define BTREE_FLAG(flag) \ @@ -435,6 +415,12 @@ static inline unsigned btree_bkey_first_offset(const struct bset_tree *t) __btree_node_offset_to_key(_b, (_t)->end_offset); \ }) +static inline unsigned bset_u64s(struct bset_tree *t) +{ + return t->end_offset - t->data_offset - + sizeof(struct bset) / sizeof(u64); +} + static inline unsigned bset_byte_offset(struct btree *b, void *i) { return i - (void *) b->data; @@ -475,19 +461,22 @@ static inline bool btree_node_is_extents(struct btree *b) return btree_node_type_is_extents(btree_node_type(b)); } +#define BTREE_NODE_TYPE_HAS_TRIGGERS \ + ((1U << BKEY_TYPE_EXTENTS)| \ + (1U << BKEY_TYPE_ALLOC)| \ + (1U << BKEY_TYPE_INODES)| \ + (1U << BKEY_TYPE_REFLINK)| \ + (1U << BKEY_TYPE_EC)| \ + (1U << BKEY_TYPE_BTREE)) + +#define BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS \ + ((1U << BKEY_TYPE_EXTENTS)| \ + (1U << BKEY_TYPE_INODES)| \ + (1U << BKEY_TYPE_REFLINK)) + static inline bool btree_node_type_needs_gc(enum btree_node_type type) { - switch (type) { - case BKEY_TYPE_ALLOC: - case BKEY_TYPE_BTREE: - case BKEY_TYPE_EXTENTS: - case BKEY_TYPE_INODES: - case BKEY_TYPE_EC: - case BKEY_TYPE_REFLINK: - return true; - default: - return false; - } + return BTREE_NODE_TYPE_HAS_TRIGGERS & (1U << type); } struct btree_root { diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h index 36e34b3d9213..ad8cbf3fb778 100644 --- a/fs/bcachefs/btree_update.h +++ b/fs/bcachefs/btree_update.h @@ -15,24 +15,6 @@ bool bch2_btree_bset_insert_key(struct btree_iter *, struct btree *, void bch2_btree_journal_key(struct btree_trans *, struct btree_iter *, struct bkey_i *); -void bch2_deferred_update_free(struct bch_fs *, - struct deferred_update *); -struct deferred_update * -bch2_deferred_update_alloc(struct bch_fs *, enum btree_id, unsigned); - -#define BTREE_INSERT_ENTRY(_iter, _k) \ - ((struct btree_insert_entry) { \ - .iter = (_iter), \ - .k = (_k), \ - }) - -#define BTREE_INSERT_DEFERRED(_d, _k) \ - ((struct btree_insert_entry) { \ - .k = (_k), \ - .d = (_d), \ - .deferred = true, \ - }) - enum { __BTREE_INSERT_ATOMIC, __BTREE_INSERT_NOUNLOCK, @@ -45,7 +27,6 @@ enum { __BTREE_INSERT_JOURNAL_RESERVED, __BTREE_INSERT_NOMARK_OVERWRITES, __BTREE_INSERT_NOMARK, - __BTREE_INSERT_MARK_INMEM, __BTREE_INSERT_NO_CLEAR_REPLICAS, __BTREE_INSERT_BUCKET_INVALIDATE, __BTREE_INSERT_NOWAIT, @@ -86,9 +67,6 @@ enum { /* Don't call mark new key at all: */ 
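/*
 * editor's note (aside): the switch in btree_node_type_needs_gc() was
 * replaced above by the BTREE_NODE_TYPE_HAS_TRIGGERS bitmask, so membership
 * tests become a single AND. A standalone illustration with a toy enum of
 * the same shape:
 */
#include <stdbool.h>

enum toy_type { TOY_EXTENTS, TOY_INODES, TOY_DIRENTS, TOY_ALLOC };

#define TOY_HAS_TRIGGERS ((1U << TOY_EXTENTS)|(1U << TOY_INODES)|(1U << TOY_ALLOC))

static bool toy_needs_gc(enum toy_type t)
{
	return TOY_HAS_TRIGGERS & (1U << t);	/* replaces a switch statement */
}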
#define BTREE_INSERT_NOMARK (1 << __BTREE_INSERT_NOMARK) -/* Don't mark transactionally: */ -#define BTREE_INSERT_MARK_INMEM (1 << __BTREE_INSERT_MARK_INMEM) - #define BTREE_INSERT_NO_CLEAR_REPLICAS (1 << __BTREE_INSERT_NO_CLEAR_REPLICAS) #define BTREE_INSERT_BUCKET_INVALIDATE (1 << __BTREE_INSERT_BUCKET_INVALIDATE) @@ -115,16 +93,42 @@ int bch2_btree_node_rewrite(struct bch_fs *c, struct btree_iter *, int bch2_btree_node_update_key(struct bch_fs *, struct btree_iter *, struct btree *, struct bkey_i_btree_ptr *); -int bch2_trans_commit(struct btree_trans *, - struct disk_reservation *, - u64 *, unsigned); +int __bch2_trans_commit(struct btree_trans *); + +/** + * bch2_trans_commit - insert keys at given iterator positions + * + * This is the main entry point for btree updates. + * + * Return values: + * -EINTR: locking changed, this function should be called again. Only returned + * if passed BTREE_INSERT_ATOMIC. + * -EROFS: filesystem read only + * -EIO: journal or btree node IO error + */ +static inline int bch2_trans_commit(struct btree_trans *trans, + struct disk_reservation *disk_res, + u64 *journal_seq, + unsigned flags) +{ + trans->disk_res = disk_res; + trans->journal_seq = journal_seq; + trans->flags = flags; + + return __bch2_trans_commit(trans); +} static inline void bch2_trans_update(struct btree_trans *trans, - struct btree_insert_entry entry) + struct btree_iter *iter, + struct bkey_i *k) { EBUG_ON(trans->nr_updates >= trans->nr_iters + 4); - trans->updates[trans->nr_updates++] = entry; + iter->flags |= BTREE_ITER_KEEP_UNTIL_COMMIT; + + trans->updates[trans->nr_updates++] = (struct btree_insert_entry) { + .iter = iter, .k = k + }; } #define bch2_trans_do(_c, _journal_seq, _flags, _do) \ @@ -145,23 +149,9 @@ static inline void bch2_trans_update(struct btree_trans *trans, _ret; \ }) -#define __trans_next_update(_trans, _i, _filter) \ -({ \ - while ((_i) < (_trans)->updates + (_trans->nr_updates) && !(_filter))\ - (_i)++; \ - \ - (_i) < (_trans)->updates + (_trans->nr_updates); \ -}) - -#define __trans_for_each_update(_trans, _i, _filter) \ +#define trans_for_each_update(_trans, _i) \ for ((_i) = (_trans)->updates; \ - __trans_next_update(_trans, _i, _filter); \ + (_i) < (_trans)->updates + (_trans)->nr_updates; \ (_i)++) -#define trans_for_each_update(trans, i) \ - __trans_for_each_update(trans, i, true) - -#define trans_for_each_update_iter(trans, i) \ - __trans_for_each_update(trans, i, !(i)->deferred) - #endif /* _BCACHEFS_BTREE_UPDATE_H */ diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 6813eddd26f5..f8a30cb34750 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -79,9 +79,7 @@ void __bch2_btree_calc_format(struct bkey_format_state *s, struct btree *b) bch2_bkey_format_add_pos(s, b->data->min_key); for_each_bset(b, t) - for (k = btree_bkey_first(b, t); - k != btree_bkey_last(b, t); - k = bkey_next(k)) + bset_tree_for_each_key(b, t, k) if (!bkey_whiteout(k)) { uk = bkey_unpack_key(b, k); bch2_bkey_format_add_key(s, &uk); @@ -1240,7 +1238,9 @@ static struct btree *__btree_split_node(struct btree_update *as, */ k = set1->start; while (1) { - if (bkey_next(k) == vstruct_last(set1)) + struct bkey_packed *n = bkey_next_skip_noops(k, vstruct_last(set1)); + + if (n == vstruct_last(set1)) break; if (k->_data - set1->_data >= (le16_to_cpu(set1->u64s) * 3) / 5) break; @@ -1251,7 +1251,7 @@ static struct btree *__btree_split_node(struct btree_update *as, nr_unpacked++; prev = k; - k 
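/*
 * editor's note: bch2_trans_update() above now pins its iterator with
 * BTREE_ITER_KEEP_UNTIL_COMMIT, and the reworked bch2_trans_iter_put()
 * earlier in the patch only frees a slot that is neither touched nor pinned;
 * commit clears the pin on every iterator. A self-contained toy model of
 * that lifecycle (names are illustrative, not from the patch):
 */
#include <stdbool.h>

struct toy_slot {
	bool linked;	/* allocated */
	bool live;	/* handed out since the last reset */
	bool touched;	/* used since the last reset */
	bool pinned;	/* KEEP_UNTIL_COMMIT: referenced by a queued update */
};

static void toy_put(struct toy_slot *s)
{
	if (!s->touched && !s->pinned)
		s->linked = false;	/* freed immediately */
	s->live = false;
}

static void toy_commit_done(struct toy_slot *s)
{
	s->pinned = false;	/* commit clears the pin on all iterators */
}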
= n; } BUG_ON(!prev); @@ -1315,7 +1315,7 @@ static void btree_split_insert_keys(struct btree_update *as, struct btree *b, { struct btree_node_iter node_iter; struct bkey_i *k = bch2_keylist_front(keys); - struct bkey_packed *p; + struct bkey_packed *src, *dst, *n; struct bset *i; BUG_ON(btree_node_type(b) != BKEY_TYPE_BTREE); @@ -1340,16 +1340,18 @@ static void btree_split_insert_keys(struct btree_update *as, struct btree *b, * for the pivot: */ i = btree_bset_first(b); - p = i->start; - while (p != vstruct_last(i)) - if (bkey_deleted(p)) { - le16_add_cpu(&i->u64s, -p->u64s); - set_btree_bset_end(b, b->set); - memmove_u64s_down(p, bkey_next(p), - (u64 *) vstruct_last(i) - - (u64 *) p); - } else - p = bkey_next(p); + src = dst = i->start; + while (src != vstruct_last(i)) { + n = bkey_next_skip_noops(src, vstruct_last(i)); + if (!bkey_deleted(src)) { + memmove_u64s_down(dst, src, src->u64s); + dst = bkey_next(dst); + } + src = n; + } + + i->u64s = cpu_to_le16((u64 *) dst - i->_data); + set_btree_bset_end(b, b->set); BUG_ON(b->nsets != 1 || b->nr.live_u64s != le16_to_cpu(btree_bset_first(b)->u64s)); @@ -1446,8 +1448,20 @@ static void btree_split(struct btree_update *as, struct btree *b, bch2_btree_iter_node_replace(iter, n2); bch2_btree_iter_node_replace(iter, n1); + /* + * The old node must be freed (in memory) _before_ unlocking the new + * nodes - else another thread could re-acquire a read lock on the old + * node after another thread has locked and updated the new node, thus + * seeing stale data: + */ bch2_btree_node_free_inmem(c, b, iter); + if (n3) + six_unlock_intent(&n3->lock); + if (n2) + six_unlock_intent(&n2->lock); + six_unlock_intent(&n1->lock); + bch2_btree_trans_verify_locks(iter->trans); bch2_time_stats_update(&c->times[BCH_TIME_btree_node_split], @@ -1761,6 +1775,8 @@ retry: bch2_btree_node_free_inmem(c, b, iter); bch2_btree_node_free_inmem(c, m, iter); + six_unlock_intent(&n->lock); + bch2_btree_update_done(as); if (!(flags & BTREE_INSERT_GC_LOCK_HELD)) @@ -1855,6 +1871,7 @@ static int __btree_node_rewrite(struct bch_fs *c, struct btree_iter *iter, bch2_btree_iter_node_drop(iter, b); bch2_btree_iter_node_replace(iter, n); bch2_btree_node_free_inmem(c, b, iter); + six_unlock_intent(&n->lock); bch2_btree_update_done(as); return 0; @@ -2172,6 +2189,7 @@ void bch2_btree_root_alloc(struct bch_fs *c, enum btree_id id) bch2_bset_init_first(b, &b->data->keys); bch2_btree_build_aux_trees(b); + b->data->flags = 0; b->data->min_key = POS_MIN; b->data->max_key = POS_MAX; b->data->format = bch2_btree_calc_format(b); diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h index e5156e908110..c5a0ab5d7bb8 100644 --- a/fs/bcachefs/btree_update_interior.h +++ b/fs/bcachefs/btree_update_interior.h @@ -284,17 +284,17 @@ static inline unsigned btree_write_set_buffer(struct btree *b) static inline struct btree_node_entry *want_new_bset(struct bch_fs *c, struct btree *b) { - struct bset *i = btree_bset_last(b); + struct bset_tree *t = bset_tree_last(b); struct btree_node_entry *bne = max(write_block(b), (void *) btree_bkey_last(b, bset_tree_last(b))); ssize_t remaining_space = __bch_btree_u64s_remaining(c, b, &bne->keys.start[0]); - if (unlikely(bset_written(b, i))) { + if (unlikely(bset_written(b, bset(b, t)))) { if (remaining_space > (ssize_t) (block_bytes(c) >> 3)) return bne; } else { - if (unlikely(vstruct_bytes(i) > btree_write_set_buffer(b)) && + if (unlikely(bset_u64s(t) * sizeof(u64) > btree_write_set_buffer(b)) && remaining_space > (ssize_t) 
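/*
 * editor's note: btree_split_insert_keys() above was reworked from "memmove
 * the tail down over each deleted key" (quadratic in the worst case) to a
 * single src/dst compaction pass. The same idiom, standalone and runnable,
 * with a toy element type:
 */
#include <stdbool.h>
#include <stddef.h>

struct toy_key { int val; bool deleted; };

/* returns the new length; keeps the relative order of surviving elements */
static size_t toy_compact(struct toy_key *v, size_t n)
{
	struct toy_key *src = v, *dst = v, *end = v + n;

	while (src != end) {
		if (!src->deleted)
			*dst++ = *src;	/* keep: copy down over the gap */
		src++;
	}
	return dst - v;
}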
(btree_write_set_buffer(b) >> 3)) return bne; } diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 0d32fb8726c7..d37a95299240 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -10,27 +10,22 @@ #include "buckets.h" #include "debug.h" #include "error.h" -#include "extents.h" +#include "extent_update.h" #include "journal.h" #include "journal_reclaim.h" #include "keylist.h" #include "replicas.h" +#include <linux/prefetch.h> #include <linux/sort.h> #include <trace/events/bcachefs.h> static inline bool same_leaf_as_prev(struct btree_trans *trans, - unsigned sorted_idx) + unsigned idx) { - struct btree_insert_entry *i = trans->updates + - trans->updates_sorted[sorted_idx]; - struct btree_insert_entry *prev = sorted_idx - ? trans->updates + trans->updates_sorted[sorted_idx - 1] - : NULL; - - return !i->deferred && - prev && - i->iter->l[0].b == prev->iter->l[0].b; + return idx && + trans->updates[trans->updates_sorted[idx]].iter->l[0].b == + trans->updates[trans->updates_sorted[idx - 1]].iter->l[0].b; } #define trans_for_each_update_sorted(_trans, _i, _iter) \ @@ -44,7 +39,7 @@ inline void bch2_btree_node_lock_for_insert(struct bch_fs *c, struct btree *b, { bch2_btree_node_lock_write(b, iter); - if (btree_node_just_written(b) && + if (unlikely(btree_node_just_written(b)) && bch2_btree_post_write_cleanup(c, b)) bch2_btree_iter_reinit_node(iter, b); @@ -56,30 +51,6 @@ inline void bch2_btree_node_lock_for_insert(struct bch_fs *c, struct btree *b, bch2_btree_init_next(c, b, iter); } -static void btree_trans_lock_write(struct btree_trans *trans, bool lock) -{ - struct bch_fs *c = trans->c; - struct btree_insert_entry *i; - unsigned iter; - - trans_for_each_update_sorted(trans, i, iter) { - if (same_leaf_as_prev(trans, iter)) - continue; - - if (lock) - bch2_btree_node_lock_for_insert(c, i->iter->l[0].b, i->iter); - else - bch2_btree_node_unlock_write(i->iter->l[0].b, i->iter); - } -} - -static inline int btree_trans_cmp(struct btree_insert_entry l, - struct btree_insert_entry r) -{ - return cmp_int(l.deferred, r.deferred) ?: - btree_iter_cmp(l.iter, r.iter); -} - static inline void btree_trans_sort_updates(struct btree_trans *trans) { struct btree_insert_entry *l, *r; @@ -89,7 +60,7 @@ static inline void btree_trans_sort_updates(struct btree_trans *trans) for (pos = 0; pos < nr; pos++) { r = trans->updates + trans->updates_sorted[pos]; - if (btree_trans_cmp(*l, *r) <= 0) + if (btree_iter_cmp(l->iter, r->iter) <= 0) break; } @@ -100,8 +71,6 @@ static inline void btree_trans_sort_updates(struct btree_trans *trans) trans->updates_sorted[pos] = l - trans->updates; nr++; } - - BUG_ON(nr != trans->nr_updates); } /* Inserting into a given leaf node (last stage of insert): */ @@ -274,8 +243,8 @@ static void bch2_insert_fixup_key(struct btree_trans *trans, EBUG_ON(insert->k->k.u64s > bch_btree_keys_u64s_remaining(trans->c, l->b)); - if (bch2_btree_bset_insert_key(iter, l->b, &l->iter, - insert->k)) + if (likely(bch2_btree_bset_insert_key(iter, l->b, &l->iter, + insert->k))) bch2_btree_journal_key(trans, iter, insert->k); } @@ -288,7 +257,8 @@ static void btree_insert_key_leaf(struct btree_trans *trans, struct bch_fs *c = trans->c; struct btree_iter *iter = insert->iter; struct btree *b = iter->l[0].b; - int old_u64s = le16_to_cpu(btree_bset_last(b)->u64s); + struct bset_tree *t = bset_tree_last(b); + int old_u64s = bset_u64s(t); int old_live_u64s = b->nr.live_u64s; int live_u64s_added, u64s_added; @@ -298,7 +268,7 @@ static void 
btree_insert_key_leaf(struct btree_trans *trans, bch2_insert_fixup_extent(trans, insert); live_u64s_added = (int) b->nr.live_u64s - old_live_u64s; - u64s_added = (int) le16_to_cpu(btree_bset_last(b)->u64s) - old_u64s; + u64s_added = (int) bset_u64s(t) - old_u64s; if (b->sib_u64s[0] != U16_MAX && live_u64s_added < 0) b->sib_u64s[0] = max(0, (int) b->sib_u64s[0] + live_u64s_added); @@ -312,165 +282,31 @@ static void btree_insert_key_leaf(struct btree_trans *trans, trace_btree_insert_key(c, b, insert->k); } -/* Deferred btree updates: */ - -static void deferred_update_flush(struct journal *j, - struct journal_entry_pin *pin, - u64 seq) -{ - struct bch_fs *c = container_of(j, struct bch_fs, journal); - struct deferred_update *d = - container_of(pin, struct deferred_update, journal); - struct journal_preres res = { 0 }; - u64 tmp[32]; - struct bkey_i *k = (void *) tmp; - int ret; - - if (d->allocated_u64s > ARRAY_SIZE(tmp)) { - k = kmalloc(d->allocated_u64s * sizeof(u64), GFP_NOFS); - - BUG_ON(!k); /* XXX */ - } - - spin_lock(&d->lock); - if (d->dirty) { - BUG_ON(jset_u64s(d->k.k.u64s) > d->res.u64s); - - swap(res, d->res); - - BUG_ON(d->k.k.u64s > d->allocated_u64s); - - bkey_copy(k, &d->k); - d->dirty = false; - spin_unlock(&d->lock); - - ret = bch2_btree_insert(c, d->btree_id, k, NULL, NULL, - BTREE_INSERT_NOFAIL| - BTREE_INSERT_USE_RESERVE| - BTREE_INSERT_JOURNAL_RESERVED); - bch2_fs_fatal_err_on(ret && !bch2_journal_error(j), - c, "error flushing deferred btree update: %i", ret); - - spin_lock(&d->lock); - } - - if (!d->dirty) - bch2_journal_pin_drop(j, &d->journal); - spin_unlock(&d->lock); - - bch2_journal_preres_put(j, &res); - if (k != (void *) tmp) - kfree(k); -} - -static void btree_insert_key_deferred(struct btree_trans *trans, - struct btree_insert_entry *insert) -{ - struct bch_fs *c = trans->c; - struct journal *j = &c->journal; - struct deferred_update *d = insert->d; - int difference; - - BUG_ON(trans->flags & BTREE_INSERT_JOURNAL_REPLAY); - BUG_ON(insert->k->u64s > d->allocated_u64s); - - __btree_journal_key(trans, d->btree_id, insert->k); - - spin_lock(&d->lock); - BUG_ON(jset_u64s(insert->k->u64s) > - trans->journal_preres.u64s); - - difference = jset_u64s(insert->k->u64s) - d->res.u64s; - if (difference > 0) { - trans->journal_preres.u64s -= difference; - d->res.u64s += difference; - } - - bkey_copy(&d->k, insert->k); - d->dirty = true; - - bch2_journal_pin_update(j, trans->journal_res.seq, &d->journal, - deferred_update_flush); - spin_unlock(&d->lock); -} - -void bch2_deferred_update_free(struct bch_fs *c, - struct deferred_update *d) -{ - deferred_update_flush(&c->journal, &d->journal, 0); - - BUG_ON(journal_pin_active(&d->journal)); - - bch2_journal_pin_flush(&c->journal, &d->journal); - kfree(d); -} - -struct deferred_update * -bch2_deferred_update_alloc(struct bch_fs *c, - enum btree_id btree_id, - unsigned u64s) -{ - struct deferred_update *d; - - BUG_ON(u64s > U8_MAX); - - d = kmalloc(offsetof(struct deferred_update, k) + - u64s * sizeof(u64), GFP_NOFS); - BUG_ON(!d); - - memset(d, 0, offsetof(struct deferred_update, k)); - - spin_lock_init(&d->lock); - d->allocated_u64s = u64s; - d->btree_id = btree_id; - - return d; -} - /* Normal update interface: */ static inline void btree_insert_entry_checks(struct btree_trans *trans, struct btree_insert_entry *i) { struct bch_fs *c = trans->c; - enum btree_id btree_id = !i->deferred - ? 
i->iter->btree_id - : i->d->btree_id; - - if (!i->deferred) { - BUG_ON(i->iter->level); - BUG_ON(bkey_cmp(bkey_start_pos(&i->k->k), i->iter->pos)); - EBUG_ON((i->iter->flags & BTREE_ITER_IS_EXTENTS) && - bkey_cmp(i->k->k.p, i->iter->l[0].b->key.k.p) > 0); - EBUG_ON((i->iter->flags & BTREE_ITER_IS_EXTENTS) && - !(trans->flags & BTREE_INSERT_ATOMIC)); - } + + BUG_ON(i->iter->level); + BUG_ON(bkey_cmp(bkey_start_pos(&i->k->k), i->iter->pos)); + EBUG_ON((i->iter->flags & BTREE_ITER_IS_EXTENTS) && + bkey_cmp(i->k->k.p, i->iter->l[0].b->key.k.p) > 0); + EBUG_ON((i->iter->flags & BTREE_ITER_IS_EXTENTS) && + !(trans->flags & BTREE_INSERT_ATOMIC)); BUG_ON(debug_check_bkeys(c) && !bkey_deleted(&i->k->k) && - bch2_bkey_invalid(c, bkey_i_to_s_c(i->k), btree_id)); + bch2_bkey_invalid(c, bkey_i_to_s_c(i->k), i->iter->btree_id)); } -static int bch2_trans_journal_preres_get(struct btree_trans *trans) +static noinline int +bch2_trans_journal_preres_get_cold(struct btree_trans *trans, unsigned u64s) { struct bch_fs *c = trans->c; - struct btree_insert_entry *i; - unsigned u64s = 0; int ret; - trans_for_each_update(trans, i) - if (i->deferred) - u64s += jset_u64s(i->k->k.u64s); - - if (!u64s) - return 0; - - ret = bch2_journal_preres_get(&c->journal, - &trans->journal_preres, u64s, - JOURNAL_RES_GET_NONBLOCK); - if (ret != -EAGAIN) - return ret; - bch2_trans_unlock(trans); ret = bch2_journal_preres_get(&c->journal, @@ -486,8 +322,8 @@ static int bch2_trans_journal_preres_get(struct btree_trans *trans) return 0; } -static int bch2_trans_journal_res_get(struct btree_trans *trans, - unsigned flags) +static inline int bch2_trans_journal_res_get(struct btree_trans *trans, + unsigned flags) { struct bch_fs *c = trans->c; int ret; @@ -525,102 +361,63 @@ btree_key_can_insert(struct btree_trans *trans, return BTREE_INSERT_OK; } -static int btree_trans_check_can_insert(struct btree_trans *trans, - struct btree_insert_entry **stopped_at) +static inline void do_btree_insert_one(struct btree_trans *trans, + struct btree_insert_entry *insert) { - struct btree_insert_entry *i; - unsigned iter, u64s = 0; - int ret; - - trans_for_each_update_sorted(trans, i, iter) { - /* Multiple inserts might go to same leaf: */ - if (!same_leaf_as_prev(trans, iter)) - u64s = 0; - - u64s += i->k->k.u64s; - ret = btree_key_can_insert(trans, i, &u64s); - if (ret) { - *stopped_at = i; - return ret; - } - } - - return 0; + btree_insert_key_leaf(trans, insert); } -static inline void do_btree_insert_one(struct btree_trans *trans, - struct btree_insert_entry *insert) +static inline bool update_has_trans_triggers(struct btree_insert_entry *i) { - if (likely(!insert->deferred)) - btree_insert_key_leaf(trans, insert); - else - btree_insert_key_deferred(trans, insert); + return BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS & (1U << i->iter->btree_id); } -static inline bool update_triggers_transactional(struct btree_trans *trans, - struct btree_insert_entry *i) +static inline bool update_has_nontrans_triggers(struct btree_insert_entry *i) { - return likely(!(trans->flags & BTREE_INSERT_MARK_INMEM)) && - (i->iter->btree_id == BTREE_ID_EXTENTS || - i->iter->btree_id == BTREE_ID_INODES || - i->iter->btree_id == BTREE_ID_REFLINK); + return (BTREE_NODE_TYPE_HAS_TRIGGERS & + ~BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS) & + (1U << i->iter->btree_id); } -static inline bool update_has_triggers(struct btree_trans *trans, - struct btree_insert_entry *i) +static noinline void bch2_btree_iter_unlock_noinline(struct btree_iter *iter) { - return likely(!(trans->flags & 
BTREE_INSERT_NOMARK)) && - !i->deferred && - btree_node_type_needs_gc(i->iter->btree_id); + __bch2_btree_iter_unlock(iter); } -/* - * Get journal reservation, take write locks, and attempt to do btree update(s): - */ -static inline int do_btree_insert_at(struct btree_trans *trans, - struct btree_insert_entry **stopped_at) +static noinline void bch2_trans_mark_gc(struct btree_trans *trans) { struct bch_fs *c = trans->c; - struct bch_fs_usage *fs_usage = NULL; struct btree_insert_entry *i; unsigned mark_flags = trans->flags & BTREE_INSERT_BUCKET_INVALIDATE ? BCH_BUCKET_MARK_BUCKET_INVALIDATE : 0; - int ret; - trans_for_each_update_iter(trans, i) - BUG_ON(i->iter->uptodate >= BTREE_ITER_NEED_RELOCK); - - /* - * note: running triggers will append more updates to the list of - * updates as we're walking it: - */ - trans_for_each_update_iter(trans, i) - if (update_has_triggers(trans, i) && - update_triggers_transactional(trans, i)) { - ret = bch2_trans_mark_update(trans, i->iter, i->k); - if (ret == -EINTR) - trace_trans_restart_mark(trans->ip); - if (ret) - goto out_clear_replicas; - } + if (unlikely(trans->flags & BTREE_INSERT_NOMARK)) + return; trans_for_each_update(trans, i) - btree_insert_entry_checks(trans, i); - bch2_btree_trans_verify_locks(trans); - - /* - * No more updates can be added - sort updates so we can take write - * locks in the correct order: - */ - btree_trans_sort_updates(trans); + if (gc_visited(c, gc_pos_btree_node(i->iter->l[0].b))) + bch2_mark_update(trans, i, NULL, + mark_flags|BCH_BUCKET_MARK_GC); +} - btree_trans_lock_write(trans, true); +static inline int +bch2_trans_commit_write_locked(struct btree_trans *trans, + struct btree_insert_entry **stopped_at) +{ + struct bch_fs *c = trans->c; + struct bch_fs_usage *fs_usage = NULL; + struct btree_insert_entry *i; + unsigned mark_flags = trans->flags & BTREE_INSERT_BUCKET_INVALIDATE + ? 
BCH_BUCKET_MARK_BUCKET_INVALIDATE + : 0; + unsigned iter, u64s = 0; + bool marking = false; + int ret; if (race_fault()) { - ret = -EINTR; trace_trans_restart_fault_inject(trans->ip); - goto out; + return -EINTR; } /* @@ -628,24 +425,28 @@ static inline int do_btree_insert_at(struct btree_trans *trans, * held, otherwise another thread could write the node changing the * amount of space available: */ - ret = btree_trans_check_can_insert(trans, stopped_at); - if (ret) - goto out; - trans_for_each_update_iter(trans, i) { - if (!btree_node_type_needs_gc(i->iter->btree_id)) - continue; + prefetch(&trans->c->journal.flags); - if (!fs_usage) { - percpu_down_read(&c->mark_lock); - fs_usage = bch2_fs_usage_scratch_get(c); - } + trans_for_each_update_sorted(trans, i, iter) { + /* Multiple inserts might go to same leaf: */ + if (!same_leaf_as_prev(trans, iter)) + u64s = 0; - if (!bch2_bkey_replicas_marked_locked(c, - bkey_i_to_s_c(i->k), true)) { - ret = BTREE_INSERT_NEED_MARK_REPLICAS; - goto out; + u64s += i->k->k.u64s; + ret = btree_key_can_insert(trans, i, &u64s); + if (ret) { + *stopped_at = i; + return ret; } + + if (btree_node_type_needs_gc(i->iter->btree_id)) + marking = true; + } + + if (marking) { + percpu_down_read(&c->mark_lock); + fs_usage = bch2_fs_usage_scratch_get(c); } /* @@ -653,16 +454,17 @@ static inline int do_btree_insert_at(struct btree_trans *trans, * succeed: */ if (likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))) { - trans->journal_u64s = 0; - - trans_for_each_update(trans, i) - trans->journal_u64s += jset_u64s(i->k->k.u64s); - - ret = bch2_trans_journal_res_get(trans, JOURNAL_RES_GET_NONBLOCK); + ret = bch2_trans_journal_res_get(trans, + JOURNAL_RES_GET_NONBLOCK); if (ret) - goto out; + goto err; } + /* + * Not allowed to fail after we've gotten our journal reservation - we + * have to use it: + */ + if (!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY)) { if (journal_seq_verify(c)) trans_for_each_update(trans, i) @@ -672,49 +474,146 @@ static inline int do_btree_insert_at(struct btree_trans *trans, i->k->k.version = MAX_VERSION; } - trans_for_each_update_iter(trans, i) - if (update_has_triggers(trans, i) && - !update_triggers_transactional(trans, i)) - bch2_mark_update(trans, i, fs_usage, mark_flags); + /* Must be called under mark_lock: */ + if (marking && trans->fs_usage_deltas && + bch2_replicas_delta_list_apply(c, fs_usage, + trans->fs_usage_deltas)) { + ret = BTREE_INSERT_NEED_MARK_REPLICAS; + goto err; + } - if (fs_usage && trans->fs_usage_deltas) - bch2_replicas_delta_list_apply(c, fs_usage, - trans->fs_usage_deltas); + trans_for_each_update(trans, i) + if (likely(!(trans->flags & BTREE_INSERT_NOMARK)) && + update_has_nontrans_triggers(i)) + bch2_mark_update(trans, i, fs_usage, mark_flags); - if (fs_usage) + if (marking) bch2_trans_fs_usage_apply(trans, fs_usage); - if (likely(!(trans->flags & BTREE_INSERT_NOMARK)) && - unlikely(c->gc_pos.phase)) - trans_for_each_update_iter(trans, i) - if (gc_visited(c, gc_pos_btree_node(i->iter->l[0].b))) - bch2_mark_update(trans, i, NULL, - mark_flags| - BCH_BUCKET_MARK_GC); + if (unlikely(c->gc_pos.phase)) + bch2_trans_mark_gc(trans); trans_for_each_update(trans, i) do_btree_insert_one(trans, i); -out: - BUG_ON(ret && - (trans->flags & BTREE_INSERT_JOURNAL_RESERVED) && - trans->journal_res.ref); - - btree_trans_lock_write(trans, false); - - if (fs_usage) { +err: + if (marking) { bch2_fs_usage_scratch_put(c, fs_usage); percpu_up_read(&c->mark_lock); } - bch2_journal_res_put(&c->journal, &trans->journal_res); 
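/*
 * editor's note: in this rewrite the journal pre-reservation is first
 * attempted with JOURNAL_RES_GET_NONBLOCK while the transaction still holds
 * its btree locks; only on -EAGAIN does
 * bch2_trans_journal_preres_get_cold() unlock, block, and relock. A runnable
 * analogy of that try-fast-then-drop-locks-and-block pattern using a plain
 * mutex -- illustrative only, the kernel code reserves journal space, not a
 * lock:
 */
#include <pthread.h>

static pthread_mutex_t resource = PTHREAD_MUTEX_INITIALIZER;

static int take_resource(pthread_mutex_t *held)
{
	if (pthread_mutex_trylock(&resource) == 0)
		return 0;			/* fast path: locks kept */

	pthread_mutex_unlock(held);		/* don't block while holding locks */
	pthread_mutex_lock(&resource);		/* blocking acquire */
	pthread_mutex_lock(held);		/* re-take; caller must revalidate */
	return 1;				/* tell caller state may be stale */
}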
-out_clear_replicas: - if (trans->fs_usage_deltas) { - memset(&trans->fs_usage_deltas->fs_usage, 0, - sizeof(trans->fs_usage_deltas->fs_usage)); - trans->fs_usage_deltas->used = 0; + return ret; +} + +/* + * Get journal reservation, take write locks, and attempt to do btree update(s): + */ +static inline int do_bch2_trans_commit(struct btree_trans *trans, + struct btree_insert_entry **stopped_at) +{ + struct btree_insert_entry *i; + struct btree_iter *iter; + unsigned idx, u64s, journal_preres_u64s = 0; + int ret; + + /* + * note: running triggers will append more updates to the list of + * updates as we're walking it: + */ + trans_for_each_update(trans, i) { + /* we know trans->nounlock won't be set here: */ + if (unlikely(!(i->iter->locks_want < 1 + ? __bch2_btree_iter_upgrade(i->iter, 1) + : i->iter->uptodate <= BTREE_ITER_NEED_PEEK))) { + trace_trans_restart_upgrade(trans->ip); + return -EINTR; + } + + if (likely(!(trans->flags & BTREE_INSERT_NOMARK)) && + update_has_trans_triggers(i)) { + ret = bch2_trans_mark_update(trans, i->iter, i->k); + if (unlikely(ret)) { + if (ret == -EINTR) + trace_trans_restart_mark(trans->ip); + return ret; + } + } + + u64s = jset_u64s(i->k->k.u64s); + if (0) + journal_preres_u64s += u64s; + trans->journal_u64s += u64s; } - return ret; + ret = bch2_journal_preres_get(&trans->c->journal, + &trans->journal_preres, journal_preres_u64s, + JOURNAL_RES_GET_NONBLOCK); + if (unlikely(ret == -EAGAIN)) + ret = bch2_trans_journal_preres_get_cold(trans, + journal_preres_u64s); + if (unlikely(ret)) + return ret; + + /* + * Can't be holding any read locks when we go to take write locks: + * + * note - this must be done after bch2_trans_journal_preres_get_cold() + * or anything else that might call bch2_trans_relock(), since that + * would just retake the read locks: + */ + trans_for_each_iter_all(trans, iter) { + if (iter->nodes_locked != iter->nodes_intent_locked) { + EBUG_ON(iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT); + EBUG_ON(trans->iters_live & (1ULL << iter->idx)); + bch2_btree_iter_unlock_noinline(iter); + } + } + + if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) + trans_for_each_update(trans, i) + btree_insert_entry_checks(trans, i); + bch2_btree_trans_verify_locks(trans); + + /* + * No more updates can be added - sort updates so we can take write + * locks in the correct order: + */ + btree_trans_sort_updates(trans); + + trans_for_each_update_sorted(trans, i, idx) + if (!same_leaf_as_prev(trans, idx)) + bch2_btree_node_lock_for_insert(trans->c, + i->iter->l[0].b, i->iter); + + ret = bch2_trans_commit_write_locked(trans, stopped_at); + + trans_for_each_update_sorted(trans, i, idx) + if (!same_leaf_as_prev(trans, idx)) + bch2_btree_node_unlock_write_inlined(i->iter->l[0].b, + i->iter); + + /* + * Drop journal reservation after dropping write locks, since dropping + * the journal reservation may kick off a journal write: + */ + bch2_journal_res_put(&trans->c->journal, &trans->journal_res); + + if (unlikely(ret)) + return ret; + + if (trans->flags & BTREE_INSERT_NOUNLOCK) + trans->nounlock = true; + + trans_for_each_update_sorted(trans, i, idx) + if (!same_leaf_as_prev(trans, idx)) + bch2_foreground_maybe_merge(trans->c, i->iter, + 0, trans->flags); + + trans->nounlock = false; + + trans_for_each_update(trans, i) + bch2_btree_iter_downgrade(i->iter); + + return 0; } static noinline @@ -771,7 +670,7 @@ int bch2_trans_commit_error(struct btree_trans *trans, case BTREE_INSERT_NEED_MARK_REPLICAS: bch2_trans_unlock(trans); - trans_for_each_update_iter(trans, i) { + 
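/*
 * editor's note: do_bch2_trans_commit() above sorts the pending updates
 * (btree_trans_sort_updates()) before taking write locks, and
 * same_leaf_as_prev() collapses duplicates so each leaf is locked once.
 * Taking locks in one agreed global order is the standard deadlock-avoidance
 * technique; a runnable sketch with toy resources ordered by id:
 */
#include <pthread.h>
#include <stdlib.h>

struct toy_res { int id; pthread_mutex_t lock; };

static int cmp_res(const void *a, const void *b)
{
	const struct toy_res * const *l = a, * const *r = b;

	return ((*l)->id > (*r)->id) - ((*l)->id < (*r)->id);
}

static void lock_all(struct toy_res **v, size_t n)
{
	size_t i;

	qsort(v, n, sizeof(*v), cmp_res);	/* agree on a global order */

	for (i = 0; i < n; i++)
		if (!i || v[i] != v[i - 1])	/* skip dups, like same_leaf_as_prev() */
			pthread_mutex_lock(&v[i]->lock);
}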
trans_for_each_update(trans, i) { ret = bch2_mark_bkey_replicas(c, bkey_i_to_s_c(i->k)); if (ret) return ret; @@ -822,67 +721,29 @@ int bch2_trans_commit_error(struct btree_trans *trans, return ret; } -/** - * __bch_btree_insert_at - insert keys at given iterator positions - * - * This is main entry point for btree updates. - * - * Return values: - * -EINTR: locking changed, this function should be called again. Only returned - * if passed BTREE_INSERT_ATOMIC. - * -EROFS: filesystem read only - * -EIO: journal or btree node IO error - */ -static int __bch2_trans_commit(struct btree_trans *trans, - struct btree_insert_entry **stopped_at) +static noinline int +bch2_trans_commit_get_rw_cold(struct btree_trans *trans) { struct bch_fs *c = trans->c; - struct btree_insert_entry *i; - unsigned iter; int ret; - trans_for_each_update_iter(trans, i) { - if (!bch2_btree_iter_upgrade(i->iter, 1)) { - trace_trans_restart_upgrade(trans->ip); - ret = -EINTR; - goto err; - } - - ret = btree_iter_err(i->iter); - if (ret) - goto err; - } - - ret = do_btree_insert_at(trans, stopped_at); - if (unlikely(ret)) - goto err; - - if (trans->flags & BTREE_INSERT_NOUNLOCK) - trans->nounlock = true; - - trans_for_each_update_sorted(trans, i, iter) - if (!same_leaf_as_prev(trans, iter)) - bch2_foreground_maybe_merge(c, i->iter, - 0, trans->flags); + if (likely(!(trans->flags & BTREE_INSERT_LAZY_RW))) + return -EROFS; - trans->nounlock = false; + bch2_trans_unlock(trans); - trans_for_each_update_iter(trans, i) - bch2_btree_iter_downgrade(i->iter); -err: - /* make sure we didn't drop or screw up locks: */ - bch2_btree_trans_verify_locks(trans); + ret = bch2_fs_read_write_early(c); + if (ret) + return ret; - return ret; + percpu_ref_get(&c->writes); + return 0; } -int bch2_trans_commit(struct btree_trans *trans, - struct disk_reservation *disk_res, - u64 *journal_seq, - unsigned flags) +int __bch2_trans_commit(struct btree_trans *trans) { - struct bch_fs *c = trans->c; struct btree_insert_entry *i = NULL; + struct btree_iter *iter; unsigned orig_nr_updates = trans->nr_updates; unsigned orig_mem_top = trans->mem_top; int ret = 0; @@ -891,63 +752,50 @@ int bch2_trans_commit(struct btree_trans *trans, goto out_noupdates; /* for the sake of sanity: */ - BUG_ON(trans->nr_updates > 1 && !(flags & BTREE_INSERT_ATOMIC)); - - if (flags & BTREE_INSERT_GC_LOCK_HELD) - lockdep_assert_held(&c->gc_lock); + EBUG_ON(trans->nr_updates > 1 && !(trans->flags & BTREE_INSERT_ATOMIC)); - if (!trans->commit_start) - trans->commit_start = local_clock(); + if (trans->flags & BTREE_INSERT_GC_LOCK_HELD) + lockdep_assert_held(&trans->c->gc_lock); - memset(&trans->journal_res, 0, sizeof(trans->journal_res)); memset(&trans->journal_preres, 0, sizeof(trans->journal_preres)); - trans->disk_res = disk_res; - trans->journal_seq = journal_seq; - trans->flags = flags; - - if (unlikely(!(trans->flags & BTREE_INSERT_NOCHECK_RW) && - !percpu_ref_tryget(&c->writes))) { - if (likely(!(trans->flags & BTREE_INSERT_LAZY_RW))) - return -EROFS; - - bch2_trans_unlock(trans); - ret = bch2_fs_read_write_early(c); + if (!(trans->flags & BTREE_INSERT_NOCHECK_RW) && + unlikely(!percpu_ref_tryget(&trans->c->writes))) { + ret = bch2_trans_commit_get_rw_cold(trans); if (ret) return ret; + } +retry: + memset(&trans->journal_res, 0, sizeof(trans->journal_res)); + trans->journal_u64s = 0; - percpu_ref_get(&c->writes); + ret = do_bch2_trans_commit(trans, &i); - if (!bch2_trans_relock(trans)) { - ret = -EINTR; - goto err; - } + if (trans->fs_usage_deltas) { + 
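/*
 * editor's note: the fs_usage_deltas reset just below (and the matching
 * memset_start/memset_end markers added to struct replicas_delta_list in
 * buckets_types.h) use empty structs as zero-size bounds so a group of
 * fields can be cleared with one memset and a newly added field can't be
 * forgotten. A standalone copy of the idiom (zero-size structs are GNU C,
 * as in the kernel; names illustrative):
 */
#include <stdint.h>
#include <string.h>

struct counters {
	unsigned size, used;		/* survive a reset */

	struct {} memset_start;
	uint64_t nr_inodes;		/* everything from here is cleared... */
	uint64_t persistent_reserved[4];
	struct {} memset_end;		/* ...up to this marker */
};

static void counters_reset(struct counters *c)
{
	c->used = 0;
	memset(&c->memset_start, 0,
	       (void *) &c->memset_end - (void *) &c->memset_start);
}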
trans->fs_usage_deltas->used = 0; + memset(&trans->fs_usage_deltas->memset_start, 0, + (void *) &trans->fs_usage_deltas->memset_end - + (void *) &trans->fs_usage_deltas->memset_start); } -retry: - ret = bch2_trans_journal_preres_get(trans); - if (ret) - goto err; - ret = __bch2_trans_commit(trans, &i); + /* make sure we didn't drop or screw up locks: */ + bch2_btree_trans_verify_locks(trans); + if (ret) goto err; out: - bch2_journal_preres_put(&c->journal, &trans->journal_preres); + bch2_journal_preres_put(&trans->c->journal, &trans->journal_preres); - if (unlikely(!(trans->flags & BTREE_INSERT_NOCHECK_RW))) - percpu_ref_put(&c->writes); + if (likely(!(trans->flags & BTREE_INSERT_NOCHECK_RW))) + percpu_ref_put(&trans->c->writes); out_noupdates: - if (!ret && trans->commit_start) { - bch2_time_stats_update(&c->times[BCH_TIME_btree_update], - trans->commit_start); - trans->commit_start = 0; - } + EBUG_ON(!(trans->flags & BTREE_INSERT_ATOMIC) && ret == -EINTR); - BUG_ON(!(trans->flags & BTREE_INSERT_ATOMIC) && ret == -EINTR); + trans_for_each_iter_all(trans, iter) + iter->flags &= ~BTREE_ITER_KEEP_UNTIL_COMMIT; if (!ret) { - bch2_trans_unlink_iters(trans, ~trans->iters_touched| - trans->iters_unlink_on_commit); + bch2_trans_unlink_iters(trans); trans->iters_touched = 0; } trans->nr_updates = 0; @@ -957,18 +805,16 @@ out_noupdates: err: ret = bch2_trans_commit_error(trans, i, ret); - /* free updates and memory used by triggers, they'll be reexecuted: */ - trans->nr_updates = orig_nr_updates; - trans->mem_top = orig_mem_top; - /* can't loop if it was passed in and we changed it: */ if (unlikely(trans->flags & BTREE_INSERT_NO_CLEAR_REPLICAS) && !ret) ret = -EINTR; + if (ret) + goto out; - if (!ret) - goto retry; - - goto out; + /* free updates and memory used by triggers, they'll be reexecuted: */ + trans->nr_updates = orig_nr_updates; + trans->mem_top = orig_mem_top; + goto retry; } /** @@ -994,7 +840,7 @@ retry: iter = bch2_trans_get_iter(&trans, id, bkey_start_pos(&k->k), BTREE_ITER_INTENT); - bch2_trans_update(&trans, BTREE_INSERT_ENTRY(iter, k)); + bch2_trans_update(&trans, iter, k); ret = bch2_trans_commit(&trans, disk_res, journal_seq, flags); if (ret == -EINTR) @@ -1037,14 +883,14 @@ retry: /* create the biggest key we can */ bch2_key_resize(&delete.k, max_sectors); - bch2_cut_back(end, &delete.k); + bch2_cut_back(end, &delete); ret = bch2_extent_trim_atomic(&delete, iter); if (ret) break; } - bch2_trans_update(trans, BTREE_INSERT_ENTRY(iter, &delete)); + bch2_trans_update(trans, iter, &delete); ret = bch2_trans_commit(trans, NULL, journal_seq, BTREE_INSERT_ATOMIC| BTREE_INSERT_NOFAIL); @@ -1071,7 +917,7 @@ int bch2_btree_delete_at(struct btree_trans *trans, bkey_init(&k.k); k.k.p = iter->pos; - bch2_trans_update(trans, BTREE_INSERT_ENTRY(iter, &k)); + bch2_trans_update(trans, iter, &k); return bch2_trans_commit(trans, NULL, NULL, BTREE_INSERT_NOFAIL| BTREE_INSERT_USE_RESERVE|flags); diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 6a4773a92029..8d223aa2bee5 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -499,14 +499,18 @@ void bch2_dev_usage_from_buckets(struct bch_fs *c) } } -static inline void update_replicas(struct bch_fs *c, - struct bch_fs_usage *fs_usage, - struct bch_replicas_entry *r, - s64 sectors) +static inline int update_replicas(struct bch_fs *c, + struct bch_fs_usage *fs_usage, + struct bch_replicas_entry *r, + s64 sectors) { int idx = bch2_replicas_entry_idx(c, r); - BUG_ON(idx < 0); + if (idx < 0) + return -1; + + if (!fs_usage) + 
return 0; switch (r->data_type) { case BCH_DATA_BTREE: @@ -520,6 +524,7 @@ static inline void update_replicas(struct bch_fs *c, break; } fs_usage->replicas[idx] += sectors; + return 0; } static inline void update_cached_sectors(struct bch_fs *c, @@ -579,23 +584,41 @@ static inline void update_cached_sectors_list(struct btree_trans *trans, update_replicas_list(trans, &r.e, sectors); } -void bch2_replicas_delta_list_apply(struct bch_fs *c, - struct bch_fs_usage *fs_usage, - struct replicas_delta_list *r) +static inline struct replicas_delta * +replicas_delta_next(struct replicas_delta *d) +{ + return (void *) d + replicas_entry_bytes(&d->r) + 8; +} + +int bch2_replicas_delta_list_apply(struct bch_fs *c, + struct bch_fs_usage *fs_usage, + struct replicas_delta_list *r) { struct replicas_delta *d = r->d; struct replicas_delta *top = (void *) r->d + r->used; + unsigned i; - acc_u64s((u64 *) fs_usage, - (u64 *) &r->fs_usage, sizeof(*fs_usage) / sizeof(u64)); + for (d = r->d; d != top; d = replicas_delta_next(d)) + if (update_replicas(c, fs_usage, &d->r, d->delta)) { + top = d; + goto unwind; + } - while (d != top) { - BUG_ON((void *) d > (void *) top); + if (!fs_usage) + return 0; - update_replicas(c, fs_usage, &d->r, d->delta); + fs_usage->nr_inodes += r->nr_inodes; - d = (void *) d + replicas_entry_bytes(&d->r) + 8; + for (i = 0; i < BCH_REPLICAS_MAX; i++) { + fs_usage->reserved += r->persistent_reserved[i]; + fs_usage->persistent_reserved[i] += r->persistent_reserved[i]; } + + return 0; +unwind: + for (d = r->d; d != top; d = replicas_delta_next(d)) + update_replicas(c, fs_usage, &d->r, -d->delta); + return -1; } #define do_mark_fn(fn, c, pos, flags, ...) \ @@ -807,28 +830,44 @@ void bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca, preempt_enable(); } -static s64 ptr_disk_sectors_delta(struct extent_ptr_decoded p, - unsigned offset, s64 delta, - unsigned flags) +static s64 disk_sectors_scaled(unsigned n, unsigned d, unsigned sectors) { + return DIV_ROUND_UP(sectors * n, d); +} + +static s64 __ptr_disk_sectors_delta(unsigned old_size, + unsigned offset, s64 delta, + unsigned flags, + unsigned n, unsigned d) +{ + BUG_ON(!n || !d); + if (flags & BCH_BUCKET_MARK_OVERWRITE_SPLIT) { - BUG_ON(offset + -delta > p.crc.live_size); + BUG_ON(offset + -delta > old_size); - return -((s64) ptr_disk_sectors(p)) + - __ptr_disk_sectors(p, offset) + - __ptr_disk_sectors(p, p.crc.live_size - - offset + delta); + return -disk_sectors_scaled(n, d, old_size) + + disk_sectors_scaled(n, d, offset) + + disk_sectors_scaled(n, d, old_size - offset + delta); } else if (flags & BCH_BUCKET_MARK_OVERWRITE) { - BUG_ON(offset + -delta > p.crc.live_size); + BUG_ON(offset + -delta > old_size); - return -((s64) ptr_disk_sectors(p)) + - __ptr_disk_sectors(p, p.crc.live_size + - delta); + return -disk_sectors_scaled(n, d, old_size) + + disk_sectors_scaled(n, d, old_size + delta); } else { - return ptr_disk_sectors(p); + return disk_sectors_scaled(n, d, delta); } } +static s64 ptr_disk_sectors_delta(struct extent_ptr_decoded p, + unsigned offset, s64 delta, + unsigned flags) +{ + return __ptr_disk_sectors_delta(p.crc.live_size, + offset, delta, flags, + p.crc.compressed_size, + p.crc.uncompressed_size); +} + static void bucket_set_stripe(struct bch_fs *c, const struct bch_stripe *v, struct bch_fs_usage *fs_usage, @@ -964,15 +1003,15 @@ static int bch2_mark_stripe_ptr(struct bch_fs *c, struct bch_extent_stripe_ptr p, enum bch_data_type data_type, struct bch_fs_usage *fs_usage, - s64 sectors, unsigned flags) + s64 
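/*
 * editor's note: bch2_replicas_delta_list_apply() above no longer BUGs when
 * a replicas entry is missing -- update_replicas() now returns an error, and
 * the already-applied prefix is unwound by re-applying each delta negated.
 * The shape of that apply-with-rollback, standalone:
 */
#include <stdint.h>

struct delta { unsigned idx; int64_t d; };

/* returns 0 on success; on failure the accounts are left untouched */
static int apply_all(int64_t *accounts, unsigned nr_accounts,
		     const struct delta *d, unsigned n)
{
	unsigned i, j;

	for (i = 0; i < n; i++) {
		if (d[i].idx >= nr_accounts)
			goto unwind;	/* validation failed part-way through */
		accounts[d[i].idx] += d[i].d;
	}
	return 0;
unwind:
	for (j = 0; j < i; j++)
		accounts[d[j].idx] -= d[j].d;	/* negate the applied prefix */
	return -1;
}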
sectors, unsigned flags, + struct bch_replicas_padded *r, + unsigned *nr_data, + unsigned *nr_parity) { bool gc = flags & BCH_BUCKET_MARK_GC; struct stripe *m; - unsigned old, new, nr_data; + unsigned old, new; int blocks_nonempty_delta; - s64 parity_sectors; - - BUG_ON(!sectors); m = genradix_ptr(&c->stripes[gc], p.idx); @@ -987,13 +1026,9 @@ static int bch2_mark_stripe_ptr(struct bch_fs *c, BUG_ON(m->r.e.data_type != data_type); - nr_data = m->nr_blocks - m->nr_redundant; - - parity_sectors = DIV_ROUND_UP(abs(sectors) * m->nr_redundant, nr_data); - - if (sectors < 0) - parity_sectors = -parity_sectors; - sectors += parity_sectors; + *nr_data = m->nr_blocks - m->nr_redundant; + *nr_parity = m->nr_redundant; + *r = m->r; old = m->block_sectors[p.block]; m->block_sectors[p.block] += sectors; @@ -1011,8 +1046,6 @@ static int bch2_mark_stripe_ptr(struct bch_fs *c, spin_unlock(&c->ec_stripes_heap_lock); - update_replicas(c, fs_usage, &m->r.e, sectors); - return 0; } @@ -1027,7 +1060,6 @@ static int bch2_mark_extent(struct bch_fs *c, struct bkey_s_c k, struct extent_ptr_decoded p; struct bch_replicas_padded r; s64 dirty_sectors = 0; - unsigned i; int ret; r.e.data_type = data_type; @@ -1041,29 +1073,46 @@ static int bch2_mark_extent(struct bch_fs *c, struct bkey_s_c k, ? sectors : ptr_disk_sectors_delta(p, offset, sectors, flags); bool stale = bch2_mark_pointer(c, p, disk_sectors, data_type, - fs_usage, journal_seq, flags); + fs_usage, journal_seq, flags); if (p.ptr.cached) { if (!stale) update_cached_sectors(c, fs_usage, p.ptr.dev, disk_sectors); - } else if (!p.ec_nr) { + } else if (!p.has_ec) { dirty_sectors += disk_sectors; r.e.devs[r.e.nr_devs++] = p.ptr.dev; } else { - for (i = 0; i < p.ec_nr; i++) { - ret = bch2_mark_stripe_ptr(c, p.ec[i], - data_type, fs_usage, - disk_sectors, flags); - if (ret) - return ret; - } - + struct bch_replicas_padded ec_r; + unsigned nr_data, nr_parity; + s64 parity_sectors; + + ret = bch2_mark_stripe_ptr(c, p.ec, data_type, + fs_usage, disk_sectors, flags, + &ec_r, &nr_data, &nr_parity); + if (ret) + return ret; + + parity_sectors = + __ptr_disk_sectors_delta(p.crc.live_size, + offset, sectors, flags, + p.crc.compressed_size * nr_parity, + p.crc.uncompressed_size * nr_data); + + update_replicas(c, fs_usage, &ec_r.e, + disk_sectors + parity_sectors); + + /* + * There may be other dirty pointers in this extent, but + * if so they're not required for mounting if we have an + * erasure coded pointer in this extent: + */ r.e.nr_required = 0; } } - update_replicas(c, fs_usage, &r.e, dirty_sectors); + if (r.e.nr_devs) + update_replicas(c, fs_usage, &r.e, dirty_sectors); return 0; } @@ -1316,7 +1365,7 @@ void bch2_trans_fs_usage_apply(struct btree_trans *trans, bch_err(c, "disk usage increased more than %llu sectors reserved", disk_res_sectors); - trans_for_each_update_iter(trans, i) { + trans_for_each_update(trans, i) { struct btree_iter *iter = i->iter; struct btree *b = iter->l[0].b; struct btree_node_iter node_iter = iter->l[0].iter; @@ -1358,7 +1407,7 @@ static int trans_get_key(struct btree_trans *trans, struct btree_insert_entry *i; int ret; - trans_for_each_update_iter(trans, i) + trans_for_each_update(trans, i) if (i->iter->btree_id == btree_id && (btree_node_type_is_extents(btree_id) ? 
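/*
 * editor's note: with the stripe-pointer refactor above, the caller now
 * derives parity sectors from data sectors by scaling:
 * disk_sectors_scaled(n, d, x) = DIV_ROUND_UP(x * n, d), called with
 * n = compressed_size * nr_parity and d = uncompressed_size * nr_data.
 * Standalone arithmetic, runnable:
 */
#include <stdint.h>

#define TOY_DIV_ROUND_UP(x, y) (((x) + (y) - 1) / (y))

static int64_t toy_sectors_scaled(uint64_t n, uint64_t d, uint64_t sectors)
{
	return TOY_DIV_ROUND_UP(sectors * n, d);
}

/*
 * e.g. a 128-sector extent compressed to 64 sectors on a 4+2 stripe:
 * parity = DIV_ROUND_UP(128 * (64 * 2), 128 * 4) = 32 sectors
 */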
bkey_cmp(pos, bkey_start_pos(&i->k->k)) >= 0 && @@ -1369,13 +1418,11 @@ static int trans_get_key(struct btree_trans *trans, return 1; } - *iter = __bch2_trans_get_iter(trans, btree_id, pos, - BTREE_ITER_SLOTS|BTREE_ITER_INTENT, 0); + *iter = bch2_trans_get_iter(trans, btree_id, pos, + BTREE_ITER_SLOTS|BTREE_ITER_INTENT); if (IS_ERR(*iter)) return PTR_ERR(*iter); - bch2_trans_iter_free_on_commit(trans, *iter); - *k = bch2_btree_iter_peek_slot(*iter); ret = bkey_err(*k); if (ret) @@ -1397,13 +1444,13 @@ static void *trans_update_key(struct btree_trans *trans, bkey_init(&new_k->k); new_k->k.p = iter->pos; - trans_for_each_update_iter(trans, i) + trans_for_each_update(trans, i) if (i->iter == iter) { i->k = new_k; return new_k; } - bch2_trans_update(trans, BTREE_INSERT_ENTRY(iter, new_k)); + bch2_trans_update(trans, iter, new_k); return new_k; } @@ -1417,7 +1464,7 @@ static int bch2_trans_mark_pointer(struct btree_trans *trans, struct bkey_s_c k; struct bkey_alloc_unpacked u; struct bkey_i_alloc *a; - unsigned old; + u16 *dst_sectors; bool overflow; int ret; @@ -1427,7 +1474,7 @@ static int bch2_trans_mark_pointer(struct btree_trans *trans, if (ret < 0) return ret; - if (!ret) { + if (!ret && unlikely(!test_bit(BCH_FS_ALLOC_WRITTEN, &c->flags))) { /* * During journal replay, and if gc repairs alloc info at * runtime, the alloc info in the btree might not be up to date @@ -1472,22 +1519,24 @@ static int bch2_trans_mark_pointer(struct btree_trans *trans, goto out; } - if (!p.ptr.cached) { - old = u.dirty_sectors; - overflow = checked_add(u.dirty_sectors, sectors); - } else { - old = u.cached_sectors; - overflow = checked_add(u.cached_sectors, sectors); + dst_sectors = !p.ptr.cached + ? &u.dirty_sectors + : &u.cached_sectors; + + overflow = checked_add(*dst_sectors, sectors); + + if (overflow) { + bch2_fs_inconsistent(c, + "bucket sector count overflow: %u + %lli > U16_MAX", + *dst_sectors, sectors); + /* return an error indicating that we need full fsck */ + ret = -EIO; + goto out; } u.data_type = u.dirty_sectors || u.cached_sectors ? 
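/*
 * editor's note: the overflow branch just above replaces what used to be a
 * fatal BUG_ON(): the filesystem is flagged inconsistent and -EIO is
 * returned so fsck can repair the sector counts. checked_add() itself is
 * defined outside this diff, so the helper below is only a plausible shape
 * for it -- add, store the (possibly wrapped) sum, report overflow so the
 * caller can bail out:
 */
#include <stdbool.h>
#include <stdint.h>

static bool toy_checked_add(uint16_t *dst, int64_t delta)
{
	int64_t sum = (int64_t) *dst + delta;

	*dst = (uint16_t) sum;			/* stored value may wrap... */
	return sum < 0 || sum > UINT16_MAX;	/* ...which is why callers check */
}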
data_type : 0; - bch2_fs_inconsistent_on(overflow, c, - "bucket sector count overflow: %u + %lli > U16_MAX", - old, sectors); - BUG_ON(overflow); - a = trans_update_key(trans, iter, BKEY_ALLOC_U64s_MAX); ret = PTR_ERR_OR_ZERO(a); if (ret) @@ -1503,16 +1552,16 @@ out: static int bch2_trans_mark_stripe_ptr(struct btree_trans *trans, struct bch_extent_stripe_ptr p, - s64 sectors, enum bch_data_type data_type) + s64 sectors, enum bch_data_type data_type, + struct bch_replicas_padded *r, + unsigned *nr_data, + unsigned *nr_parity) { struct bch_fs *c = trans->c; - struct bch_replicas_padded r; struct btree_iter *iter; struct bkey_i *new_k; struct bkey_s_c k; struct bkey_s_stripe s; - unsigned nr_data; - s64 parity_sectors; int ret = 0; ret = trans_get_key(trans, BTREE_ID_EC, POS(0, p.idx), &iter, &k); @@ -1535,20 +1584,13 @@ static int bch2_trans_mark_stripe_ptr(struct btree_trans *trans, bkey_reassemble(new_k, k); s = bkey_i_to_s_stripe(new_k); - nr_data = s.v->nr_blocks - s.v->nr_redundant; - - parity_sectors = DIV_ROUND_UP(abs(sectors) * s.v->nr_redundant, nr_data); - - if (sectors < 0) - parity_sectors = -parity_sectors; - stripe_blockcount_set(s.v, p.block, stripe_blockcount_get(s.v, p.block) + - sectors + parity_sectors); + sectors); - bch2_bkey_to_replicas(&r.e, s.s_c); - - update_replicas_list(trans, &r.e, sectors); + *nr_data = s.v->nr_blocks - s.v->nr_redundant; + *nr_parity = s.v->nr_redundant; + bch2_bkey_to_replicas(&r->e, s.s_c); out: bch2_trans_iter_put(trans, iter); return ret; @@ -1565,7 +1607,6 @@ static int bch2_trans_mark_extent(struct btree_trans *trans, struct bch_replicas_padded r; s64 dirty_sectors = 0; bool stale; - unsigned i; int ret; r.e.data_type = data_type; @@ -1590,22 +1631,35 @@ static int bch2_trans_mark_extent(struct btree_trans *trans, if (!stale) update_cached_sectors_list(trans, p.ptr.dev, disk_sectors); - } else if (!p.ec_nr) { + } else if (!p.has_ec) { dirty_sectors += disk_sectors; r.e.devs[r.e.nr_devs++] = p.ptr.dev; } else { - for (i = 0; i < p.ec_nr; i++) { - ret = bch2_trans_mark_stripe_ptr(trans, p.ec[i], - disk_sectors, data_type); - if (ret) - return ret; - } + struct bch_replicas_padded ec_r; + unsigned nr_data, nr_parity; + s64 parity_sectors; + + ret = bch2_trans_mark_stripe_ptr(trans, p.ec, + disk_sectors, data_type, + &ec_r, &nr_data, &nr_parity); + if (ret) + return ret; + + parity_sectors = + __ptr_disk_sectors_delta(p.crc.live_size, + offset, sectors, flags, + p.crc.compressed_size * nr_parity, + p.crc.uncompressed_size * nr_data); + + update_replicas_list(trans, &ec_r.e, + disk_sectors + parity_sectors); r.e.nr_required = 0; } } - update_replicas_list(trans, &r.e, dirty_sectors); + if (r.e.nr_devs) + update_replicas_list(trans, &r.e, dirty_sectors); return 0; } @@ -1710,9 +1764,9 @@ int bch2_trans_mark_key(struct btree_trans *trans, struct bkey_s_c k, d = replicas_deltas_realloc(trans, 0); if (!(flags & BCH_BUCKET_MARK_OVERWRITE)) - d->fs_usage.nr_inodes++; + d->nr_inodes++; else - d->fs_usage.nr_inodes--; + d->nr_inodes--; return 0; case KEY_TYPE_reservation: { unsigned replicas = bkey_s_c_to_reservation(k).v->nr_replicas; @@ -1721,10 +1775,9 @@ int bch2_trans_mark_key(struct btree_trans *trans, struct bkey_s_c k, sectors *= replicas; replicas = clamp_t(unsigned, replicas, 1, - ARRAY_SIZE(d->fs_usage.persistent_reserved)); + ARRAY_SIZE(d->persistent_reserved)); - d->fs_usage.reserved += sectors; - d->fs_usage.persistent_reserved[replicas - 1] += sectors; + d->persistent_reserved[replicas - 1] += sectors; return 0; } case 
KEY_TYPE_reflink_p: diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h index a4bab66d8d17..ad6f731b1cea 100644 --- a/fs/bcachefs/buckets.h +++ b/fs/bcachefs/buckets.h @@ -137,8 +137,8 @@ static inline u8 ptr_stale(struct bch_dev *ca, return gen_after(ptr_bucket_mark(ca, ptr).gen, ptr->gen); } -static inline unsigned __ptr_disk_sectors(struct extent_ptr_decoded p, - unsigned live_size) +static inline s64 __ptr_disk_sectors(struct extent_ptr_decoded p, + unsigned live_size) { return live_size && p.crc.compression_type ? max(1U, DIV_ROUND_UP(live_size * p.crc.compressed_size, @@ -146,7 +146,7 @@ static inline unsigned __ptr_disk_sectors(struct extent_ptr_decoded p, : live_size; } -static inline unsigned ptr_disk_sectors(struct extent_ptr_decoded p) +static inline s64 ptr_disk_sectors(struct extent_ptr_decoded p) { return __ptr_disk_sectors(p, p.crc.live_size); } @@ -279,9 +279,9 @@ int bch2_mark_overwrite(struct btree_trans *, struct btree_iter *, int bch2_mark_update(struct btree_trans *, struct btree_insert_entry *, struct bch_fs_usage *, unsigned); -void bch2_replicas_delta_list_apply(struct bch_fs *, - struct bch_fs_usage *, - struct replicas_delta_list *); +int bch2_replicas_delta_list_apply(struct bch_fs *, + struct bch_fs_usage *, + struct replicas_delta_list *); int bch2_trans_mark_key(struct btree_trans *, struct bkey_s_c, unsigned, s64, unsigned); int bch2_trans_mark_update(struct btree_trans *, diff --git a/fs/bcachefs/buckets_types.h b/fs/bcachefs/buckets_types.h index 94bd9da34847..f3ff4a18b1fd 100644 --- a/fs/bcachefs/buckets_types.h +++ b/fs/bcachefs/buckets_types.h @@ -102,7 +102,11 @@ struct replicas_delta { struct replicas_delta_list { unsigned size; unsigned used; - struct bch_fs_usage fs_usage; + + struct {} memset_start; + u64 nr_inodes; + u64 persistent_reserved[BCH_REPLICAS_MAX]; + struct {} memset_end; struct replicas_delta d[0]; }; diff --git a/fs/bcachefs/checksum.c b/fs/bcachefs/checksum.c index 607f57a64009..a5c947e8adf3 100644 --- a/fs/bcachefs/checksum.c +++ b/fs/bcachefs/checksum.c @@ -10,7 +10,7 @@ #include <linux/random.h> #include <linux/scatterlist.h> #include <crypto/algapi.h> -#include <crypto/chacha20.h> +#include <crypto/chacha.h> #include <crypto/hash.h> #include <crypto/poly1305.h> #include <keys/user-type.h> @@ -67,21 +67,21 @@ static u64 bch2_checksum_update(unsigned type, u64 crc, const void *data, size_t } } -static inline void do_encrypt_sg(struct crypto_skcipher *tfm, +static inline void do_encrypt_sg(struct crypto_sync_skcipher *tfm, struct nonce nonce, struct scatterlist *sg, size_t len) { - SKCIPHER_REQUEST_ON_STACK(req, tfm); + SYNC_SKCIPHER_REQUEST_ON_STACK(req, tfm); int ret; - skcipher_request_set_tfm(req, tfm); + skcipher_request_set_sync_tfm(req, tfm); skcipher_request_set_crypt(req, sg, sg, len, nonce.d); ret = crypto_skcipher_encrypt(req); BUG_ON(ret); } -static inline void do_encrypt(struct crypto_skcipher *tfm, +static inline void do_encrypt(struct crypto_sync_skcipher *tfm, struct nonce nonce, void *buf, size_t len) { @@ -94,8 +94,8 @@ static inline void do_encrypt(struct crypto_skcipher *tfm, int bch2_chacha_encrypt_key(struct bch_key *key, struct nonce nonce, void *buf, size_t len) { - struct crypto_skcipher *chacha20 = - crypto_alloc_skcipher("chacha20", 0, 0); + struct crypto_sync_skcipher *chacha20 = + crypto_alloc_sync_skcipher("chacha20", 0, 0); int ret; if (!chacha20) { @@ -103,7 +103,8 @@ int bch2_chacha_encrypt_key(struct bch_key *key, struct nonce nonce, return PTR_ERR(chacha20); } - ret = 
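[Editor's note: the buckets_types.h hunk above brackets the fields to be cleared between two zero-size members, memset_start and memset_end, so the whole range can be reset with a single memset, and fields added between the markers later are picked up automatically. Zero-size structs are a GNU C extension (fine in the kernel). A standalone sketch with illustrative field names:]

#include <string.h>

struct delta_list {
	unsigned size, used;			/* survive a reset */
	struct {} memset_start;
	unsigned long long nr_inodes;
	unsigned long long persistent_reserved[4];
	struct {} memset_end;
};

static void delta_list_reset(struct delta_list *d)
{
	memset(&d->memset_start, 0,
	       (char *) &d->memset_end - (char *) &d->memset_start);
}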
crypto_skcipher_setkey(chacha20, (void *) key, sizeof(*key)); + ret = crypto_skcipher_setkey(&chacha20->base, + (void *) key, sizeof(*key)); if (ret) { pr_err("crypto_skcipher_setkey() error: %i", ret); goto err; @@ -111,7 +112,7 @@ int bch2_chacha_encrypt_key(struct bch_key *key, struct nonce nonce, do_encrypt(chacha20, nonce, buf, len); err: - crypto_free_skcipher(chacha20); + crypto_free_sync_skcipher(chacha20); return ret; } @@ -126,7 +127,6 @@ static void gen_poly_key(struct bch_fs *c, struct shash_desc *desc, do_encrypt(c->chacha20, nonce, key, sizeof(key)); desc->tfm = c->poly1305; - desc->flags = 0; crypto_shash_init(desc); crypto_shash_update(desc, key, sizeof(key)); } @@ -199,7 +199,7 @@ static struct bch_csum __bch2_checksum_bio(struct bch_fs *c, unsigned type, kunmap_atomic(p); } #else - __bio_for_each_contig_segment(bv, bio, *iter, *iter) + __bio_for_each_bvec(bv, bio, *iter, *iter) crc = bch2_checksum_update(type, crc, page_address(bv.bv_page) + bv.bv_offset, bv.bv_len); @@ -224,7 +224,7 @@ static struct bch_csum __bch2_checksum_bio(struct bch_fs *c, unsigned type, kunmap_atomic(p); } #else - __bio_for_each_contig_segment(bv, bio, *iter, *iter) + __bio_for_each_bvec(bv, bio, *iter, *iter) crypto_shash_update(desc, page_address(bv.bv_page) + bv.bv_offset, bv.bv_len); @@ -462,7 +462,7 @@ err: static int bch2_alloc_ciphers(struct bch_fs *c) { if (!c->chacha20) - c->chacha20 = crypto_alloc_skcipher("chacha20", 0, 0); + c->chacha20 = crypto_alloc_sync_skcipher("chacha20", 0, 0); if (IS_ERR(c->chacha20)) { bch_err(c, "error requesting chacha20 module: %li", PTR_ERR(c->chacha20)); @@ -545,7 +545,7 @@ int bch2_enable_encryption(struct bch_fs *c, bool keyed) goto err; } - ret = crypto_skcipher_setkey(c->chacha20, + ret = crypto_skcipher_setkey(&c->chacha20->base, (void *) &key.key, sizeof(key.key)); if (ret) goto err; @@ -573,7 +573,7 @@ void bch2_fs_encryption_exit(struct bch_fs *c) if (!IS_ERR_OR_NULL(c->poly1305)) crypto_free_shash(c->poly1305); if (!IS_ERR_OR_NULL(c->chacha20)) - crypto_free_skcipher(c->chacha20); + crypto_free_sync_skcipher(c->chacha20); if (!IS_ERR_OR_NULL(c->sha256)) crypto_free_shash(c->sha256); } @@ -605,7 +605,7 @@ int bch2_fs_encryption_init(struct bch_fs *c) if (ret) goto out; - ret = crypto_skcipher_setkey(c->chacha20, + ret = crypto_skcipher_setkey(&c->chacha20->base, (void *) &key.key, sizeof(key.key)); if (ret) goto out; diff --git a/fs/bcachefs/checksum.h b/fs/bcachefs/checksum.h index 0b359aba2526..b84e81bac8ff 100644 --- a/fs/bcachefs/checksum.h +++ b/fs/bcachefs/checksum.h @@ -7,7 +7,7 @@ #include "super-io.h" #include <linux/crc64.h> -#include <crypto/chacha20.h> +#include <crypto/chacha.h> static inline bool bch2_checksum_mergeable(unsigned type) { @@ -138,9 +138,9 @@ static inline bool bch2_crc_cmp(struct bch_csum l, struct bch_csum r) /* for skipping ahead and encrypting/decrypting at an offset: */ static inline struct nonce nonce_add(struct nonce nonce, unsigned offset) { - EBUG_ON(offset & (CHACHA20_BLOCK_SIZE - 1)); + EBUG_ON(offset & (CHACHA_BLOCK_SIZE - 1)); - le32_add_cpu(&nonce.d[0], offset / CHACHA20_BLOCK_SIZE); + le32_add_cpu(&nonce.d[0], offset / CHACHA_BLOCK_SIZE); return nonce; } diff --git a/fs/bcachefs/clock.c b/fs/bcachefs/clock.c index 8ac6990c6971..f18266330687 100644 --- a/fs/bcachefs/clock.c +++ b/fs/bcachefs/clock.c @@ -135,17 +135,16 @@ static struct io_timer *get_expired_timer(struct io_clock *clock, return ret; } -void bch2_increment_clock(struct bch_fs *c, unsigned sectors, int rw) +void __bch2_increment_clock(struct 
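[Editor's note: the checksum.c hunks above migrate from crypto_skcipher to the crypto_sync_skcipher API so the request can safely live on the stack. A condensed kernel-context usage sketch, built only from the calls visible in the hunks plus sg_init_one(); not runnable outside the kernel:]

#include <crypto/skcipher.h>
#include <linux/scatterlist.h>

static int encrypt_buf(struct crypto_sync_skcipher *tfm, void *iv,
		       void *buf, unsigned len)
{
	struct scatterlist sg;
	SYNC_SKCIPHER_REQUEST_ON_STACK(req, tfm);	/* stack-safe request */

	sg_init_one(&sg, buf, len);
	skcipher_request_set_sync_tfm(req, tfm);
	skcipher_request_set_crypt(req, &sg, &sg, len, iv);
	return crypto_skcipher_encrypt(req);		/* synchronous */
}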
io_clock *clock) { - struct io_clock *clock = &c->io_clock[rw]; struct io_timer *timer; unsigned long now; + unsigned sectors; /* Buffer up one megabyte worth of IO in the percpu counter */ preempt_disable(); - if (likely(this_cpu_add_return(*clock->pcpu_buf, sectors) < - IO_CLOCK_PCPU_SECTORS)) { + if (this_cpu_read(*clock->pcpu_buf) < IO_CLOCK_PCPU_SECTORS) { preempt_enable(); return; } diff --git a/fs/bcachefs/clock.h b/fs/bcachefs/clock.h index 5cb043c579d8..bfbbca8a207b 100644 --- a/fs/bcachefs/clock.h +++ b/fs/bcachefs/clock.h @@ -6,7 +6,18 @@ void bch2_io_timer_add(struct io_clock *, struct io_timer *); void bch2_io_timer_del(struct io_clock *, struct io_timer *); void bch2_kthread_io_clock_wait(struct io_clock *, unsigned long, unsigned long); -void bch2_increment_clock(struct bch_fs *, unsigned, int); + +void __bch2_increment_clock(struct io_clock *); + +static inline void bch2_increment_clock(struct bch_fs *c, unsigned sectors, + int rw) +{ + struct io_clock *clock = &c->io_clock[rw]; + + if (unlikely(this_cpu_add_return(*clock->pcpu_buf, sectors) >= + IO_CLOCK_PCPU_SECTORS)) + __bch2_increment_clock(clock); +} void bch2_io_clock_schedule_timeout(struct io_clock *, unsigned long); diff --git a/fs/bcachefs/compress.c b/fs/bcachefs/compress.c index 24f565614cd9..3787390da47f 100644 --- a/fs/bcachefs/compress.c +++ b/fs/bcachefs/compress.c @@ -66,7 +66,7 @@ static struct bbuf __bio_map_or_bounce(struct bch_fs *c, struct bio *bio, BUG_ON(bvec_iter_sectors(start) > c->sb.encoded_extent_max); #ifndef CONFIG_HIGHMEM - __bio_for_each_contig_segment(bv, bio, iter, start) { + __bio_for_each_bvec(bv, bio, iter, start) { if (bv.bv_len == start.bi_size) return (struct bbuf) { .b = page_address(bv.bv_page) + bv.bv_offset, diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c index 1442dacef0de..38017699c04a 100644 --- a/fs/bcachefs/dirent.c +++ b/fs/bcachefs/dirent.c @@ -138,10 +138,10 @@ static struct bkey_i_dirent *dirent_create_key(struct btree_trans *trans, return dirent; } -int __bch2_dirent_create(struct btree_trans *trans, - u64 dir_inum, const struct bch_hash_info *hash_info, - u8 type, const struct qstr *name, u64 dst_inum, - int flags) +int bch2_dirent_create(struct btree_trans *trans, + u64 dir_inum, const struct bch_hash_info *hash_info, + u8 type, const struct qstr *name, u64 dst_inum, + int flags) { struct bkey_i_dirent *dirent; int ret; @@ -155,16 +155,6 @@ int __bch2_dirent_create(struct btree_trans *trans, dir_inum, &dirent->k_i, flags); } -int bch2_dirent_create(struct bch_fs *c, u64 dir_inum, - const struct bch_hash_info *hash_info, - u8 type, const struct qstr *name, u64 dst_inum, - u64 *journal_seq, int flags) -{ - return bch2_trans_do(c, journal_seq, flags, - __bch2_dirent_create(&trans, dir_inum, hash_info, - type, name, dst_inum, flags)); -} - static void dirent_copy_target(struct bkey_i_dirent *dst, struct bkey_s_c_dirent src) { @@ -172,23 +162,22 @@ static void dirent_copy_target(struct bkey_i_dirent *dst, dst->v.d_type = src.v->d_type; } -static struct bpos bch2_dirent_pos(struct bch_inode_info *inode, - const struct qstr *name) -{ - return POS(inode->v.i_ino, bch2_dirent_hash(&inode->ei_str_hash, name)); -} - int bch2_dirent_rename(struct btree_trans *trans, - struct bch_inode_info *src_dir, const struct qstr *src_name, - struct bch_inode_info *dst_dir, const struct qstr *dst_name, - enum bch_rename_mode mode) + u64 src_dir, struct bch_hash_info *src_hash, + u64 dst_dir, struct bch_hash_info *dst_hash, + const struct qstr *src_name, u64 *src_inum, + const struct 
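[Editor's note: the clock.h hunk above inlines the fast path — IO is accumulated in a per-CPU counter, and the out-of-line __bch2_increment_clock() slow path only runs once a CPU's buffer crosses IO_CLOCK_PCPU_SECTORS. The same batching pattern in portable C11, with thread-local storage standing in for per-CPU data; names are hypothetical:]

#include <stdatomic.h>
#include <threads.h>

#define LOCAL_BATCH 2048		/* IO_CLOCK_PCPU_SECTORS analogue */

static atomic_ulong shared_clock;	/* contended shared state */
static thread_local unsigned long local_sectors;

static void clock_add(unsigned long sectors)
{
	local_sectors += sectors;	/* cheap, uncontended fast path */
	if (local_sectors >= LOCAL_BATCH) {
		atomic_fetch_add(&shared_clock, local_sectors);
		local_sectors = 0;	/* drain the local buffer */
	}
}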
qstr *dst_name, u64 *dst_inum, + enum bch_rename_mode mode) { struct btree_iter *src_iter, *dst_iter; struct bkey_s_c old_src, old_dst; struct bkey_i_dirent *new_src = NULL, *new_dst = NULL; - struct bpos dst_pos = bch2_dirent_pos(dst_dir, dst_name); + struct bpos dst_pos = + POS(dst_dir, bch2_dirent_hash(dst_hash, dst_name)); int ret; + *src_inum = *dst_inum = 0; + /* * Lookup dst: * @@ -198,24 +187,25 @@ int bch2_dirent_rename(struct btree_trans *trans, */ dst_iter = mode == BCH_RENAME ? bch2_hash_hole(trans, bch2_dirent_hash_desc, - &dst_dir->ei_str_hash, - dst_dir->v.i_ino, dst_name) + dst_hash, dst_dir, dst_name) : bch2_hash_lookup(trans, bch2_dirent_hash_desc, - &dst_dir->ei_str_hash, - dst_dir->v.i_ino, dst_name, + dst_hash, dst_dir, dst_name, BTREE_ITER_INTENT); if (IS_ERR(dst_iter)) return PTR_ERR(dst_iter); old_dst = bch2_btree_iter_peek_slot(dst_iter); + if (mode != BCH_RENAME) + *dst_inum = le64_to_cpu(bkey_s_c_to_dirent(old_dst).v->d_inum); + /* Lookup src: */ src_iter = bch2_hash_lookup(trans, bch2_dirent_hash_desc, - &src_dir->ei_str_hash, - src_dir->v.i_ino, src_name, + src_hash, src_dir, src_name, BTREE_ITER_INTENT); if (IS_ERR(src_iter)) return PTR_ERR(src_iter); old_src = bch2_btree_iter_peek_slot(src_iter); + *src_inum = le64_to_cpu(bkey_s_c_to_dirent(old_src).v->d_inum); /* Create new dst key: */ new_dst = dirent_create_key(trans, 0, dst_name, 0); @@ -255,9 +245,8 @@ int bch2_dirent_rename(struct btree_trans *trans, * new_dst at the src position: */ new_dst->k.p = src_iter->pos; - bch2_trans_update(trans, - BTREE_INSERT_ENTRY(src_iter, - &new_dst->k_i)); + bch2_trans_update(trans, src_iter, + &new_dst->k_i); return 0; } else { /* If we're overwriting, we can't insert new_dst @@ -270,8 +259,7 @@ int bch2_dirent_rename(struct btree_trans *trans, } else { /* Check if we need a whiteout to delete src: */ ret = bch2_hash_needs_whiteout(trans, bch2_dirent_hash_desc, - &src_dir->ei_str_hash, - src_iter); + src_hash, src_iter); if (ret < 0) return ret; @@ -280,17 +268,17 @@ int bch2_dirent_rename(struct btree_trans *trans, } } - bch2_trans_update(trans, BTREE_INSERT_ENTRY(src_iter, &new_src->k_i)); - bch2_trans_update(trans, BTREE_INSERT_ENTRY(dst_iter, &new_dst->k_i)); + bch2_trans_update(trans, src_iter, &new_src->k_i); + bch2_trans_update(trans, dst_iter, &new_dst->k_i); return 0; } -int __bch2_dirent_delete(struct btree_trans *trans, u64 dir_inum, - const struct bch_hash_info *hash_info, - const struct qstr *name) +int bch2_dirent_delete_at(struct btree_trans *trans, + const struct bch_hash_info *hash_info, + struct btree_iter *iter) { - return bch2_hash_delete(trans, bch2_dirent_hash_desc, hash_info, - dir_inum, name); + return bch2_hash_delete_at(trans, bch2_dirent_hash_desc, + hash_info, iter); } int bch2_dirent_delete(struct bch_fs *c, u64 dir_inum, @@ -301,7 +289,17 @@ int bch2_dirent_delete(struct bch_fs *c, u64 dir_inum, return bch2_trans_do(c, journal_seq, BTREE_INSERT_ATOMIC| BTREE_INSERT_NOFAIL, - __bch2_dirent_delete(&trans, dir_inum, hash_info, name)); + bch2_hash_delete(&trans, bch2_dirent_hash_desc, hash_info, + dir_inum, name)); +} + +struct btree_iter * +__bch2_dirent_lookup_trans(struct btree_trans *trans, u64 dir_inum, + const struct bch_hash_info *hash_info, + const struct qstr *name, unsigned flags) +{ + return bch2_hash_lookup(trans, bch2_dirent_hash_desc, + hash_info, dir_inum, name, flags); } u64 bch2_dirent_lookup(struct bch_fs *c, u64 dir_inum, @@ -315,8 +313,8 @@ u64 bch2_dirent_lookup(struct bch_fs *c, u64 dir_inum, bch2_trans_init(&trans, c, 0, 
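[Editor's note: bch2_dirent_rename() above now works on raw inode numbers and hash info rather than VFS inodes, and reports the affected inums back to the caller (the new fs-common.c layer). Its modes — plain create-at-destination, overwrite, and exchange (upstream's BCH_RENAME_EXCHANGE, for renameat2 RENAME_EXCHANGE) — reduce to this slot manipulation, sketched over plain values:]

#include <stdbool.h>

enum rename_mode { RN_CREATE, RN_OVERWRITE, RN_EXCHANGE };

static bool do_rename(unsigned long long *src_slot,
		      unsigned long long *dst_slot, enum rename_mode mode)
{
	if (mode == RN_CREATE && *dst_slot)
		return false;		/* target already exists */
	if (mode == RN_EXCHANGE) {
		unsigned long long t = *dst_slot;
		*dst_slot = *src_slot;
		*src_slot = t;		/* swap the two entries */
	} else {
		*dst_slot = *src_slot;
		*src_slot = 0;		/* deleted, or a hash whiteout */
	}
	return true;
}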
0); - iter = bch2_hash_lookup(&trans, bch2_dirent_hash_desc, - hash_info, dir_inum, name, 0); + iter = __bch2_dirent_lookup_trans(&trans, dir_inum, + hash_info, name, 0); if (IS_ERR(iter)) { BUG_ON(PTR_ERR(iter) == -EINTR); goto out; @@ -350,53 +348,37 @@ int bch2_empty_dir_trans(struct btree_trans *trans, u64 dir_inum) return ret; } -int bch2_empty_dir(struct bch_fs *c, u64 dir_inum) +int bch2_readdir(struct bch_fs *c, u64 inum, struct dir_context *ctx) { - return bch2_trans_do(c, NULL, 0, - bch2_empty_dir_trans(&trans, dir_inum)); -} - -int bch2_readdir(struct bch_fs *c, struct file *file, - struct dir_context *ctx) -{ - struct bch_inode_info *inode = file_bch_inode(file); struct btree_trans trans; struct btree_iter *iter; struct bkey_s_c k; struct bkey_s_c_dirent dirent; - unsigned len; int ret; - if (!dir_emit_dots(file, ctx)) - return 0; - bch2_trans_init(&trans, c, 0, 0); for_each_btree_key(&trans, iter, BTREE_ID_DIRENTS, - POS(inode->v.i_ino, ctx->pos), 0, k, ret) { + POS(inum, ctx->pos), 0, k, ret) { + if (k.k->p.inode > inum) + break; + if (k.k->type != KEY_TYPE_dirent) continue; dirent = bkey_s_c_to_dirent(k); - if (bkey_cmp(k.k->p, POS(inode->v.i_ino, ctx->pos)) < 0) - continue; - - if (k.k->p.inode > inode->v.i_ino) - break; - - len = bch2_dirent_name_bytes(dirent); - /* * XXX: dir_emit() can fault and block, while we're holding * locks */ - if (!dir_emit(ctx, dirent.v->d_name, len, + ctx->pos = dirent.k->p.offset; + if (!dir_emit(ctx, dirent.v->d_name, + bch2_dirent_name_bytes(dirent), le64_to_cpu(dirent.v->d_inum), dirent.v->d_type)) break; - - ctx->pos = k.k->p.offset + 1; + ctx->pos = dirent.k->p.offset + 1; } ret = bch2_trans_exit(&trans) ?: ret; diff --git a/fs/bcachefs/dirent.h b/fs/bcachefs/dirent.h index bc64718a7832..e6184dc796d3 100644 --- a/fs/bcachefs/dirent.h +++ b/fs/bcachefs/dirent.h @@ -29,15 +29,13 @@ static inline unsigned dirent_val_u64s(unsigned len) sizeof(u64)); } -int __bch2_dirent_create(struct btree_trans *, u64, - const struct bch_hash_info *, u8, - const struct qstr *, u64, int); -int bch2_dirent_create(struct bch_fs *c, u64, const struct bch_hash_info *, - u8, const struct qstr *, u64, u64 *, int); - -int __bch2_dirent_delete(struct btree_trans *, u64, - const struct bch_hash_info *, - const struct qstr *); +int bch2_dirent_create(struct btree_trans *, u64, + const struct bch_hash_info *, u8, + const struct qstr *, u64, int); + +int bch2_dirent_delete_at(struct btree_trans *, + const struct bch_hash_info *, + struct btree_iter *); int bch2_dirent_delete(struct bch_fs *, u64, const struct bch_hash_info *, const struct qstr *, u64 *); @@ -48,15 +46,20 @@ enum bch_rename_mode { }; int bch2_dirent_rename(struct btree_trans *, - struct bch_inode_info *, const struct qstr *, - struct bch_inode_info *, const struct qstr *, + u64, struct bch_hash_info *, + u64, struct bch_hash_info *, + const struct qstr *, u64 *, + const struct qstr *, u64 *, enum bch_rename_mode); +struct btree_iter * +__bch2_dirent_lookup_trans(struct btree_trans *, u64, + const struct bch_hash_info *, + const struct qstr *, unsigned); u64 bch2_dirent_lookup(struct bch_fs *, u64, const struct bch_hash_info *, const struct qstr *); int bch2_empty_dir_trans(struct btree_trans *, u64); -int bch2_empty_dir(struct bch_fs *, u64); -int bch2_readdir(struct bch_fs *, struct file *, struct dir_context *); +int bch2_readdir(struct bch_fs *, u64, struct dir_context *); #endif /* _BCACHEFS_DIRENT_H */ diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index be2eca0fcdf7..5287b5ee7d4a 100644 --- 
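[Editor's note: in the bch2_readdir() rewrite above, ctx->pos is set to an entry's offset before dir_emit() and advanced past it only on success, so a failed emit leaves the cursor pointing at the entry, to be retried on the next readdir call. The protocol in isolation, with hypothetical names:]

struct cursor { unsigned long long pos; };

/* Emit entries at sorted offsets; on failure the caller re-enters
 * with cur->pos unchanged and resumes at the unemitted entry. */
static int emit_all(struct cursor *cur,
		    const unsigned long long *offsets, int n,
		    int (*emit)(unsigned long long off))
{
	for (int i = 0; i < n; i++) {
		cur->pos = offsets[i];		/* retry point */
		if (!emit(offsets[i]))
			return 0;
		cur->pos = offsets[i] + 1;	/* consumed */
	}
	return 1;
}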
a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -4,6 +4,7 @@ #include "bcachefs.h" #include "alloc_foreground.h" +#include "bkey_on_stack.h" #include "bset.h" #include "btree_gc.h" #include "btree_update.h" @@ -135,8 +136,6 @@ void bch2_stripe_to_text(struct printbuf *out, struct bch_fs *c, pr_buf(out, " %u:%llu:%u", s->ptrs[i].dev, (u64) s->ptrs[i].offset, stripe_blockcount_get(s, i)); - - bch2_bkey_ptrs_to_text(out, c, k); } static int ptr_matches_stripe(struct bch_fs *c, @@ -433,10 +432,9 @@ int bch2_ec_read_extent(struct bch_fs *c, struct bch_read_bio *rbio) closure_init_stack(&cl); - BUG_ON(!rbio->pick.idx || - rbio->pick.idx - 1 >= rbio->pick.ec_nr); + BUG_ON(!rbio->pick.has_ec); - stripe_idx = rbio->pick.ec[rbio->pick.idx - 1].idx; + stripe_idx = rbio->pick.ec.idx; buf = kzalloc(sizeof(*buf), GFP_NOIO); if (!buf) @@ -561,7 +559,7 @@ static int ec_stripe_mem_alloc(struct bch_fs *c, size_t idx = iter->pos.offset; int ret = 0; - if (!__ec_stripe_mem_alloc(c, idx, GFP_NOWAIT)) + if (!__ec_stripe_mem_alloc(c, idx, GFP_NOWAIT|__GFP_NOWARN)) return ret; bch2_trans_unlock(iter->trans); @@ -738,7 +736,7 @@ found_slot: stripe->k.p = iter->pos; - bch2_trans_update(&trans, BTREE_INSERT_ENTRY(iter, &stripe->k_i)); + bch2_trans_update(&trans, iter, &stripe->k_i); ret = bch2_trans_commit(&trans, NULL, NULL, BTREE_INSERT_ATOMIC| @@ -779,10 +777,10 @@ static int ec_stripe_update_ptrs(struct bch_fs *c, struct btree_iter *iter; struct bkey_s_c k; struct bkey_s_extent e; - struct bch_extent_ptr *ptr; - BKEY_PADDED(k) tmp; + struct bkey_on_stack sk; int ret = 0, dev, idx; + bkey_on_stack_init(&sk); bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, @@ -792,6 +790,8 @@ static int ec_stripe_update_ptrs(struct bch_fs *c, while ((k = bch2_btree_iter_peek(iter)).k && !(ret = bkey_err(k)) && bkey_cmp(bkey_start_pos(k.k), pos->p) < 0) { + struct bch_extent_ptr *ptr, *ec_ptr = NULL; + if (extent_has_stripe_ptr(k, s->key.k.p.offset)) { bch2_btree_iter_next(iter); continue; @@ -807,19 +807,19 @@ static int ec_stripe_update_ptrs(struct bch_fs *c, dev = s->key.v.ptrs[idx].dev; - bkey_reassemble(&tmp.k, k); - e = bkey_i_to_s_extent(&tmp.k); + bkey_on_stack_reassemble(&sk, c, k); + e = bkey_i_to_s_extent(sk.k); - extent_for_each_ptr(e, ptr) - if (ptr->dev != dev) + extent_for_each_ptr(e, ptr) { + if (ptr->dev == dev) + ec_ptr = ptr; + else ptr->cached = true; + } - ptr = (void *) bch2_extent_has_device(e.c, dev); - BUG_ON(!ptr); - - extent_stripe_ptr_add(e, s, ptr, idx); + extent_stripe_ptr_add(e, s, ec_ptr, idx); - bch2_trans_update(&trans, BTREE_INSERT_ENTRY(iter, &tmp.k)); + bch2_trans_update(&trans, iter, sk.k); ret = bch2_trans_commit(&trans, NULL, NULL, BTREE_INSERT_ATOMIC| @@ -832,6 +832,7 @@ static int ec_stripe_update_ptrs(struct bch_fs *c, } bch2_trans_exit(&trans); + bkey_on_stack_exit(&sk, c); return ret; } @@ -1231,7 +1232,7 @@ static int __bch2_stripe_write_key(struct btree_trans *trans, spin_unlock(&c->ec_stripes_heap_lock); - bch2_trans_update(trans, BTREE_INSERT_ENTRY(iter, &new_key->k_i)); + bch2_trans_update(trans, iter, &new_key->k_i); return bch2_trans_commit(trans, NULL, NULL, BTREE_INSERT_NOFAIL|flags); @@ -1278,7 +1279,7 @@ int bch2_stripes_read(struct bch_fs *c, struct journal_keys *journal_keys) struct btree_trans trans; struct btree_iter *btree_iter; struct journal_iter journal_iter; - struct bkey_s_c btree_k, journal_k, k; + struct bkey_s_c btree_k, journal_k; int ret; ret = bch2_fs_ec_start(c); @@ -1294,33 +1295,31 @@ int 
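[Editor's note: ec_stripe_update_ptrs() above switches from a fixed BKEY_PADDED stack buffer to bkey_on_stack, a small-size-optimized buffer — keys that fit use inline storage, larger ones spill to the heap. A rough standalone sketch of the pattern; names and sizes are hypothetical, and the real bkey_on_stack_reassemble() also copies the key into the buffer:]

#include <stdlib.h>

struct small_buf {
	char *p;			/* inline_storage or heap */
	char inline_storage[64];
};

static void small_buf_init(struct small_buf *b)
{
	b->p = b->inline_storage;
}

static int small_buf_realloc(struct small_buf *b, size_t size)
{
	if (size <= sizeof(b->inline_storage))
		return 0;		/* fits inline, nothing to do */
	if (b->p != b->inline_storage)
		free(b->p);		/* sketch: old contents not kept */
	b->p = malloc(size);
	return b->p ? 0 : -1;
}

static void small_buf_exit(struct small_buf *b)
{
	if (b->p != b->inline_storage)
		free(b->p);
}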
bch2_stripes_read(struct bch_fs *c, struct journal_keys *journal_keys) journal_k = bch2_journal_iter_peek(&journal_iter); while (1) { + bool btree; + if (btree_k.k && journal_k.k) { int cmp = bkey_cmp(btree_k.k->p, journal_k.k->p); - if (cmp < 0) { - k = btree_k; + if (!cmp) btree_k = bch2_btree_iter_next(btree_iter); - } else if (cmp == 0) { - btree_k = bch2_btree_iter_next(btree_iter); - k = journal_k; - journal_k = bch2_journal_iter_next(&journal_iter); - } else { - k = journal_k; - journal_k = bch2_journal_iter_next(&journal_iter); - } + btree = cmp < 0; } else if (btree_k.k) { - k = btree_k; - btree_k = bch2_btree_iter_next(btree_iter); + btree = true; } else if (journal_k.k) { - k = journal_k; - journal_k = bch2_journal_iter_next(&journal_iter); + btree = false; } else { break; } - bch2_mark_key(c, k, 0, 0, NULL, 0, + bch2_mark_key(c, btree ? btree_k : journal_k, + 0, 0, NULL, 0, BCH_BUCKET_MARK_ALLOC_READ| BCH_BUCKET_MARK_NOATOMIC); + + if (btree) + btree_k = bch2_btree_iter_next(btree_iter); + else + journal_k = bch2_journal_iter_next(&journal_iter); } ret = bch2_trans_exit(&trans) ?: ret; @@ -1351,6 +1350,9 @@ int bch2_ec_mem_alloc(struct bch_fs *c, bool gc) if (ret) return ret; + if (!idx) + return 0; + if (!gc && !init_heap(&c->ec_stripes_heap, roundup_pow_of_two(idx), GFP_KERNEL)) diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c index 304ff92500be..5a5cfee623e2 100644 --- a/fs/bcachefs/error.c +++ b/fs/bcachefs/error.c @@ -64,7 +64,7 @@ void bch2_io_error(struct bch_dev *ca) enum fsck_err_ret bch2_fsck_err(struct bch_fs *c, unsigned flags, const char *fmt, ...) { - struct fsck_err_state *s; + struct fsck_err_state *s = NULL; va_list args; bool fix = false, print = true, suppressing = false; char _buf[sizeof(s->buf)], *buf = _buf; @@ -99,8 +99,13 @@ enum fsck_err_ret bch2_fsck_err(struct bch_fs *c, unsigned flags, found: list_move(&s->list, &c->fsck_errors); s->nr++; - suppressing = s->nr == FSCK_ERR_RATELIMIT_NR; - print = s->nr <= FSCK_ERR_RATELIMIT_NR; + if (c->opts.ratelimit_errors && + s->nr >= FSCK_ERR_RATELIMIT_NR) { + if (s->nr == FSCK_ERR_RATELIMIT_NR) + suppressing = true; + else + print = false; + } buf = s->buf; print: va_start(args, fmt); @@ -156,7 +161,7 @@ void bch2_flush_fsck_errs(struct bch_fs *c) mutex_lock(&c->fsck_error_lock); list_for_each_entry_safe(s, n, &c->fsck_errors, list) { - if (s->nr > FSCK_ERR_RATELIMIT_NR) + if (s->ratelimited) bch_err(c, "Saw %llu errors like:\n %s", s->nr, s->buf); list_del(&s->list); diff --git a/fs/bcachefs/error.h b/fs/bcachefs/error.h index 2591e12305b7..7dcb0f6552fc 100644 --- a/fs/bcachefs/error.h +++ b/fs/bcachefs/error.h @@ -114,6 +114,7 @@ struct fsck_err_state { struct list_head list; const char *fmt; u64 nr; + bool ratelimited; char buf[512]; }; diff --git a/fs/bcachefs/extent_update.c b/fs/bcachefs/extent_update.c new file mode 100644 index 000000000000..742b4d78cb3a --- /dev/null +++ b/fs/bcachefs/extent_update.c @@ -0,0 +1,531 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "bcachefs.h" +#include "bkey_on_stack.h" +#include "btree_update.h" +#include "btree_update_interior.h" +#include "buckets.h" +#include "debug.h" +#include "extents.h" +#include "extent_update.h" + +/* + * This counts the number of iterators to the alloc & ec btrees we'll need + * inserting/removing this extent: + */ +static unsigned bch2_bkey_nr_alloc_ptrs(struct bkey_s_c k) +{ + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const union bch_extent_entry *entry; + unsigned ret = 0; + + bkey_extent_entry_for_each(ptrs, entry) { + 
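[Editor's note: the rewritten bch2_stripes_read() loop above merges two sorted key streams, btree and journal, marking each position exactly once; on equal positions the btree copy is skipped so the journal's newer version wins. The control flow, reduced to sorted int arrays:]

#include <stdbool.h>

static void merge_marked(const int *a, int na,	/* btree keys */
			 const int *b, int nb,	/* journal keys, win ties */
			 void (*mark)(int key))
{
	int i = 0, j = 0;

	while (1) {
		bool take_a = false;

		if (i < na && j < nb) {
			int cmp = (a[i] > b[j]) - (a[i] < b[j]);

			if (!cmp)
				i++;	/* duplicate: drop the btree copy */
			take_a = cmp < 0;
		} else if (i < na) {
			take_a = true;
		} else if (j < nb) {
			take_a = false;
		} else {
			break;
		}

		mark(take_a ? a[i] : b[j]);
		take_a ? i++ : j++;	/* advance the consumed stream */
	}
}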
switch (__extent_entry_type(entry)) { + case BCH_EXTENT_ENTRY_ptr: + case BCH_EXTENT_ENTRY_stripe_ptr: + ret++; + } + } + + return ret; +} + +static int count_iters_for_insert(struct btree_trans *trans, + struct bkey_s_c k, + unsigned offset, + struct bpos *end, + unsigned *nr_iters, + unsigned max_iters, + bool overwrite) +{ + int ret = 0; + + switch (k.k->type) { + case KEY_TYPE_extent: + case KEY_TYPE_reflink_v: + *nr_iters += bch2_bkey_nr_alloc_ptrs(k); + + if (*nr_iters >= max_iters) { + *end = bpos_min(*end, k.k->p); + ret = 1; + } + + break; + case KEY_TYPE_reflink_p: { + struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); + u64 idx = le64_to_cpu(p.v->idx); + unsigned sectors = bpos_min(*end, p.k->p).offset - + bkey_start_offset(p.k); + struct btree_iter *iter; + struct bkey_s_c r_k; + + for_each_btree_key(trans, iter, + BTREE_ID_REFLINK, POS(0, idx + offset), + BTREE_ITER_SLOTS, r_k, ret) { + if (bkey_cmp(bkey_start_pos(r_k.k), + POS(0, idx + sectors)) >= 0) + break; + + *nr_iters += 1 + bch2_bkey_nr_alloc_ptrs(r_k); + + if (*nr_iters >= max_iters) { + struct bpos pos = bkey_start_pos(k.k); + pos.offset += r_k.k->p.offset - idx; + + *end = bpos_min(*end, pos); + ret = 1; + break; + } + } + + bch2_trans_iter_put(trans, iter); + break; + } + } + + return ret; +} + +#define EXTENT_ITERS_MAX (BTREE_ITER_MAX / 3) + +int bch2_extent_atomic_end(struct btree_iter *iter, + struct bkey_i *insert, + struct bpos *end) +{ + struct btree_trans *trans = iter->trans; + struct btree *b; + struct btree_node_iter node_iter; + struct bkey_packed *_k; + unsigned nr_iters = 0; + int ret; + + ret = bch2_btree_iter_traverse(iter); + if (ret) + return ret; + + b = iter->l[0].b; + node_iter = iter->l[0].iter; + + BUG_ON(bkey_cmp(bkey_start_pos(&insert->k), b->data->min_key) < 0); + + *end = bpos_min(insert->k.p, b->key.k.p); + + ret = count_iters_for_insert(trans, bkey_i_to_s_c(insert), 0, end, + &nr_iters, EXTENT_ITERS_MAX / 2, false); + if (ret < 0) + return ret; + + while ((_k = bch2_btree_node_iter_peek_filter(&node_iter, b, + KEY_TYPE_discard))) { + struct bkey unpacked; + struct bkey_s_c k = bkey_disassemble(b, _k, &unpacked); + unsigned offset = 0; + + if (bkey_cmp(bkey_start_pos(k.k), *end) >= 0) + break; + + if (bkey_cmp(bkey_start_pos(&insert->k), + bkey_start_pos(k.k)) > 0) + offset = bkey_start_offset(&insert->k) - + bkey_start_offset(k.k); + + ret = count_iters_for_insert(trans, k, offset, end, + &nr_iters, EXTENT_ITERS_MAX, true); + if (ret) + break; + + bch2_btree_node_iter_advance(&node_iter, b); + } + + return ret < 0 ? 
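[Editor's note: count_iters_for_insert() above is the per-key cost function behind bch2_extent_atomic_end() — as an insert walks the keys it overlaps, each key's alloc/ec iterator cost is accumulated, and the insert's end is pulled back as soon as the budget (EXTENT_ITERS_MAX) would be exceeded, so no single transaction needs an unbounded number of iterators. The clamping idea in the abstract, as a hypothetical helper:]

/* Given per-key iterator costs, return how many keys an operation
 * may span before exceeding the budget. */
static unsigned clamp_to_budget(const unsigned *cost, unsigned nr_keys,
				unsigned budget)
{
	unsigned spent = 0;

	for (unsigned i = 0; i < nr_keys; i++) {
		spent += cost[i];
		if (spent >= budget)
			return i + 1;	/* truncate after this key */
	}
	return nr_keys;			/* everything fits */
}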
ret : 0; +} + +int bch2_extent_trim_atomic(struct bkey_i *k, struct btree_iter *iter) +{ + struct bpos end; + int ret; + + ret = bch2_extent_atomic_end(iter, k, &end); + if (ret) + return ret; + + bch2_cut_back(end, k); + return 0; +} + +int bch2_extent_is_atomic(struct bkey_i *k, struct btree_iter *iter) +{ + struct bpos end; + int ret; + + ret = bch2_extent_atomic_end(iter, k, &end); + if (ret) + return ret; + + return !bkey_cmp(end, k->k.p); +} + +enum btree_insert_ret +bch2_extent_can_insert(struct btree_trans *trans, + struct btree_insert_entry *insert, + unsigned *u64s) +{ + struct btree_iter_level *l = &insert->iter->l[0]; + struct btree_node_iter node_iter = l->iter; + enum bch_extent_overlap overlap; + struct bkey_packed *_k; + struct bkey unpacked; + struct bkey_s_c k; + int sectors; + + /* + * We avoid creating whiteouts whenever possible when deleting, but + * those optimizations mean we may potentially insert two whiteouts + * instead of one (when we overlap with the front of one extent and the + * back of another): + */ + if (bkey_whiteout(&insert->k->k)) + *u64s += BKEY_U64s; + + _k = bch2_btree_node_iter_peek_filter(&node_iter, l->b, + KEY_TYPE_discard); + if (!_k) + return BTREE_INSERT_OK; + + k = bkey_disassemble(l->b, _k, &unpacked); + + overlap = bch2_extent_overlap(&insert->k->k, k.k); + + /* account for having to split existing extent: */ + if (overlap == BCH_EXTENT_OVERLAP_MIDDLE) + *u64s += _k->u64s; + + if (overlap == BCH_EXTENT_OVERLAP_MIDDLE && + (sectors = bch2_bkey_sectors_compressed(k))) { + int flags = trans->flags & BTREE_INSERT_NOFAIL + ? BCH_DISK_RESERVATION_NOFAIL : 0; + + switch (bch2_disk_reservation_add(trans->c, + trans->disk_res, + sectors, flags)) { + case 0: + break; + case -ENOSPC: + return BTREE_INSERT_ENOSPC; + default: + BUG(); + } + } + + return BTREE_INSERT_OK; +} + +static void verify_extent_nonoverlapping(struct bch_fs *c, + struct btree *b, + struct btree_node_iter *_iter, + struct bkey_i *insert) +{ +#ifdef CONFIG_BCACHEFS_DEBUG + struct btree_node_iter iter; + struct bkey_packed *k; + struct bkey uk; + + if (!expensive_debug_checks(c)) + return; + + iter = *_iter; + k = bch2_btree_node_iter_prev_filter(&iter, b, KEY_TYPE_discard); + BUG_ON(k && + (uk = bkey_unpack_key(b, k), + bkey_cmp(uk.p, bkey_start_pos(&insert->k)) > 0)); + + iter = *_iter; + k = bch2_btree_node_iter_peek_filter(&iter, b, KEY_TYPE_discard); +#if 0 + BUG_ON(k && + (uk = bkey_unpack_key(b, k), + bkey_cmp(insert->k.p, bkey_start_pos(&uk))) > 0); +#else + if (k && + (uk = bkey_unpack_key(b, k), + bkey_cmp(insert->k.p, bkey_start_pos(&uk))) > 0) { + char buf1[100]; + char buf2[100]; + + bch2_bkey_to_text(&PBUF(buf1), &insert->k); + bch2_bkey_to_text(&PBUF(buf2), &uk); + + bch2_dump_btree_node(b); + panic("insert > next :\n" + "insert %s\n" + "next %s\n", + buf1, buf2); + } +#endif + +#endif +} + +static void extent_bset_insert(struct bch_fs *c, struct btree_iter *iter, + struct bkey_i *insert) +{ + struct btree_iter_level *l = &iter->l[0]; + struct bkey_packed *k = + bch2_btree_node_iter_bset_pos(&l->iter, l->b, bset_tree_last(l->b)); + + BUG_ON(insert->k.u64s > bch_btree_keys_u64s_remaining(c, l->b)); + + EBUG_ON(bkey_deleted(&insert->k) || !insert->k.size); + verify_extent_nonoverlapping(c, l->b, &l->iter, insert); + + if (debug_check_bkeys(c)) + bch2_bkey_debugcheck(c, l->b, bkey_i_to_s_c(insert)); + + bch2_bset_insert(l->b, &l->iter, k, insert, 0); + bch2_btree_node_iter_fix(iter, l->b, &l->iter, k, 0, k->u64s); +} + +static void +extent_squash(struct bch_fs *c, 
struct btree_iter *iter, + struct bkey_i *insert, + struct bkey_packed *_k, struct bkey_s k, + enum bch_extent_overlap overlap) +{ + struct btree_iter_level *l = &iter->l[0]; + int u64s_delta; + + switch (overlap) { + case BCH_EXTENT_OVERLAP_FRONT: + /* insert overlaps with start of k: */ + u64s_delta = bch2_cut_front_s(insert->k.p, k); + btree_keys_account_val_delta(l->b, _k, u64s_delta); + + EBUG_ON(bkey_deleted(k.k)); + extent_save(l->b, _k, k.k); + bch2_btree_iter_fix_key_modified(iter, l->b, _k); + break; + + case BCH_EXTENT_OVERLAP_BACK: + /* insert overlaps with end of k: */ + u64s_delta = bch2_cut_back_s(bkey_start_pos(&insert->k), k); + btree_keys_account_val_delta(l->b, _k, u64s_delta); + + EBUG_ON(bkey_deleted(k.k)); + extent_save(l->b, _k, k.k); + + /* + * As the auxiliary tree is indexed by the end of the + * key and we've just changed the end, update the + * auxiliary tree. + */ + bch2_bset_fix_invalidated_key(l->b, _k); + bch2_btree_node_iter_fix(iter, l->b, &l->iter, + _k, _k->u64s, _k->u64s); + break; + + case BCH_EXTENT_OVERLAP_ALL: { + /* The insert key completely covers k, invalidate k */ + if (!bkey_whiteout(k.k)) + btree_account_key_drop(l->b, _k); + + k.k->size = 0; + k.k->type = KEY_TYPE_deleted; + + if (_k >= btree_bset_last(l->b)->start) { + unsigned u64s = _k->u64s; + + bch2_bset_delete(l->b, _k, _k->u64s); + bch2_btree_node_iter_fix(iter, l->b, &l->iter, + _k, u64s, 0); + } else { + extent_save(l->b, _k, k.k); + bch2_btree_iter_fix_key_modified(iter, l->b, _k); + } + + break; + } + case BCH_EXTENT_OVERLAP_MIDDLE: { + struct bkey_on_stack split; + + bkey_on_stack_init(&split); + bkey_on_stack_reassemble(&split, c, k.s_c); + + /* + * The insert key falls 'in the middle' of k + * The insert key splits k in 3: + * - start only in k, preserve + * - middle common section, invalidate in k + * - end only in k, preserve + * + * We update the old key to preserve the start, + * insert will be the new common section, + * we manually insert the end that we are preserving. + * + * modify k _before_ doing the insert (which will move + * what k points to) + */ + split.k->k.needs_whiteout |= bkey_written(l->b, _k); + + bch2_cut_back(bkey_start_pos(&insert->k), split.k); + BUG_ON(bkey_deleted(&split.k->k)); + + u64s_delta = bch2_cut_front_s(insert->k.p, k); + btree_keys_account_val_delta(l->b, _k, u64s_delta); + + BUG_ON(bkey_deleted(k.k)); + extent_save(l->b, _k, k.k); + bch2_btree_iter_fix_key_modified(iter, l->b, _k); + + extent_bset_insert(c, iter, split.k); + bkey_on_stack_exit(&split, c); + break; + } + } +} + +/** + * bch_extent_insert_fixup - insert a new extent and deal with overlaps + * + * this may result in not actually doing the insert, or inserting some subset + * of the insert key. For cmpxchg operations this is where that logic lives. + * + * All subsets of @insert that need to be inserted are inserted using + * bch2_btree_insert_and_journal(). If @b or @res fills up, this function + * returns false, setting @iter->pos for the prefix of @insert that actually got + * inserted. + * + * BSET INVARIANTS: this function is responsible for maintaining all the + * invariants for bsets of extents in memory. things get really hairy with 0 + * size extents + * + * within one bset: + * + * bkey_start_pos(bkey_next(k)) >= k + * or bkey_start_offset(bkey_next(k)) >= k->offset + * + * i.e. strict ordering, no overlapping extents. + * + * multiple bsets (i.e. full btree node): + * + * ∀ k, j + * k.size != 0 ∧ j.size != 0 → + * ¬ (k > bkey_start_pos(j) ∧ k < j) + * + * i.e. 
no two overlapping keys _of nonzero size_ + * + * We can't realistically maintain this invariant for zero size keys because of + * the key merging done in bch2_btree_insert_key() - for two mergeable keys k, j + * there may be another 0 size key between them in another bset, and it will + * thus overlap with the merged key. + * + * In addition, the end of iter->pos indicates how much has been processed. + * If the end of iter->pos is not the same as the end of insert, then + * key insertion needs to continue/be retried. + */ +void bch2_insert_fixup_extent(struct btree_trans *trans, + struct btree_insert_entry *insert_entry) +{ + struct bch_fs *c = trans->c; + struct btree_iter *iter = insert_entry->iter; + struct bkey_i *insert = insert_entry->k; + struct btree_iter_level *l = &iter->l[0]; + struct btree_node_iter node_iter = l->iter; + bool deleting = bkey_whiteout(&insert->k); + bool update_journal = !deleting; + bool update_btree = !deleting; + struct bkey_i whiteout = *insert; + struct bkey_packed *_k; + struct bkey unpacked; + + EBUG_ON(iter->level); + EBUG_ON(!insert->k.size); + EBUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&insert->k))); + + while ((_k = bch2_btree_node_iter_peek_filter(&l->iter, l->b, + KEY_TYPE_discard))) { + struct bkey_s k = __bkey_disassemble(l->b, _k, &unpacked); + struct bpos cur_end = bpos_min(insert->k.p, k.k->p); + enum bch_extent_overlap overlap = + bch2_extent_overlap(&insert->k, k.k); + + if (bkey_cmp(bkey_start_pos(k.k), insert->k.p) >= 0) + break; + + if (!bkey_whiteout(k.k)) + update_journal = true; + + if (!update_journal) { + bch2_cut_front(cur_end, insert); + bch2_cut_front(cur_end, &whiteout); + bch2_btree_iter_set_pos_same_leaf(iter, cur_end); + goto next; + } + + /* + * When deleting, if possible just do it by switching the type + * of the key we're deleting, instead of creating and inserting + * a new whiteout: + */ + if (deleting && + !update_btree && + !bkey_cmp(insert->k.p, k.k->p) && + !bkey_cmp(bkey_start_pos(&insert->k), bkey_start_pos(k.k))) { + if (!bkey_whiteout(k.k)) { + btree_account_key_drop(l->b, _k); + _k->type = KEY_TYPE_discard; + reserve_whiteout(l->b, _k); + bch2_btree_iter_fix_key_modified(iter, + l->b, _k); + } + break; + } + + if (k.k->needs_whiteout || bkey_written(l->b, _k)) { + insert->k.needs_whiteout = true; + update_btree = true; + } + + if (update_btree && + overlap == BCH_EXTENT_OVERLAP_ALL && + bkey_whiteout(k.k) && + k.k->needs_whiteout) { + unreserve_whiteout(l->b, _k); + _k->needs_whiteout = false; + } + + extent_squash(c, iter, insert, _k, k, overlap); + + if (!update_btree) + bch2_cut_front(cur_end, insert); +next: + node_iter = l->iter; + + if (overlap == BCH_EXTENT_OVERLAP_FRONT || + overlap == BCH_EXTENT_OVERLAP_MIDDLE) + break; + } + + l->iter = node_iter; + bch2_btree_iter_set_pos_same_leaf(iter, insert->k.p); + + if (update_btree) { + if (deleting) + insert->k.type = KEY_TYPE_discard; + + EBUG_ON(bkey_deleted(&insert->k) || !insert->k.size); + + extent_bset_insert(c, iter, insert); + } + + if (update_journal) { + struct bkey_i *k = !deleting ? 
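[Editor's note: extent_squash() above dispatches on the four ways an insert can overlap an existing extent; for half-open ranges the classification reduces to two comparisons. A sketch of the semantics — the real bch2_extent_overlap() works on bkeys, not raw offsets:]

#include <stdbool.h>

enum overlap { OVL_FRONT, OVL_BACK, OVL_ALL, OVL_MIDDLE };

static enum overlap classify(unsigned long long ins_start,
			     unsigned long long ins_end,
			     unsigned long long k_start,
			     unsigned long long k_end)
{
	bool clips_front = ins_start <= k_start; /* insert covers k's start */
	bool clips_back  = ins_end   >= k_end;	 /* insert covers k's end */

	if (clips_front && clips_back)
		return OVL_ALL;		/* k is deleted outright */
	if (clips_front)
		return OVL_FRONT;	/* cut the front off k */
	if (clips_back)
		return OVL_BACK;	/* cut the back off k */
	return OVL_MIDDLE;		/* k must be split in three */
}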
insert : &whiteout; + + if (deleting) + k->k.type = KEY_TYPE_discard; + + EBUG_ON(bkey_deleted(&k->k) || !k->k.size); + + bch2_btree_journal_key(trans, iter, k); + } + + bch2_cut_front(insert->k.p, insert); +} diff --git a/fs/bcachefs/extent_update.h b/fs/bcachefs/extent_update.h new file mode 100644 index 000000000000..89d18e4b6758 --- /dev/null +++ b/fs/bcachefs/extent_update.h @@ -0,0 +1,18 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_EXTENT_UPDATE_H +#define _BCACHEFS_EXTENT_UPDATE_H + +#include "bcachefs.h" + +int bch2_extent_atomic_end(struct btree_iter *, struct bkey_i *, + struct bpos *); +int bch2_extent_trim_atomic(struct bkey_i *, struct btree_iter *); +int bch2_extent_is_atomic(struct bkey_i *, struct btree_iter *); + +enum btree_insert_ret +bch2_extent_can_insert(struct btree_trans *, struct btree_insert_entry *, + unsigned *); +void bch2_insert_fixup_extent(struct btree_trans *, + struct btree_insert_entry *); + +#endif /* _BCACHEFS_EXTENT_UPDATE_H */ diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index 4b1c652cdbce..6bcc178604b0 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -9,12 +9,10 @@ #include "bcachefs.h" #include "bkey_methods.h" #include "btree_gc.h" -#include "btree_update.h" -#include "btree_update_interior.h" +#include "btree_iter.h" #include "buckets.h" #include "checksum.h" #include "debug.h" -#include "dirent.h" #include "disk_groups.h" #include "error.h" #include "extents.h" @@ -24,85 +22,18 @@ #include "super.h" #include "super-io.h" #include "util.h" -#include "xattr.h" #include <trace/events/bcachefs.h> -unsigned bch2_bkey_nr_ptrs(struct bkey_s_c k) -{ - struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k); - const struct bch_extent_ptr *ptr; - unsigned nr_ptrs = 0; - - bkey_for_each_ptr(p, ptr) - nr_ptrs++; - - return nr_ptrs; -} - -unsigned bch2_bkey_nr_dirty_ptrs(struct bkey_s_c k) -{ - unsigned nr_ptrs = 0; - - switch (k.k->type) { - case KEY_TYPE_btree_ptr: - case KEY_TYPE_extent: - case KEY_TYPE_reflink_v: { - struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k); - const struct bch_extent_ptr *ptr; - - bkey_for_each_ptr(p, ptr) - nr_ptrs += !ptr->cached; - BUG_ON(!nr_ptrs); - break; - } - case KEY_TYPE_reservation: - nr_ptrs = bkey_s_c_to_reservation(k).v->nr_replicas; - break; - } - - return nr_ptrs; -} - -static unsigned bch2_extent_ptr_durability(struct bch_fs *c, - struct extent_ptr_decoded p) -{ - unsigned i, durability = 0; - struct bch_dev *ca; - - if (p.ptr.cached) - return 0; - - ca = bch_dev_bkey_exists(c, p.ptr.dev); - - if (ca->mi.state != BCH_MEMBER_STATE_FAILED) - durability = max_t(unsigned, durability, ca->mi.durability); - - for (i = 0; i < p.ec_nr; i++) { - struct stripe *s = - genradix_ptr(&c->stripes[0], p.idx); - - if (WARN_ON(!s)) - continue; - - durability = max_t(unsigned, durability, s->nr_redundant); - } - - return durability; -} - -unsigned bch2_bkey_durability(struct bch_fs *c, struct bkey_s_c k) -{ - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - const union bch_extent_entry *entry; - struct extent_ptr_decoded p; - unsigned durability = 0; - - bkey_for_each_ptr_decode(k.k, ptrs, p, entry) - durability += bch2_extent_ptr_durability(c, p); +static unsigned bch2_crc_field_size_max[] = { + [BCH_EXTENT_ENTRY_crc32] = CRC32_SIZE_MAX, + [BCH_EXTENT_ENTRY_crc64] = CRC64_SIZE_MAX, + [BCH_EXTENT_ENTRY_crc128] = CRC128_SIZE_MAX, +}; - return durability; -} +static void bch2_extent_crc_pack(union bch_extent_crc *, + struct bch_extent_crc_unpacked, + enum bch_extent_entry_type); static struct 
bch_dev_io_failures *dev_io_failures(struct bch_io_failures *f, unsigned dev) @@ -206,10 +137,10 @@ int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k, p.idx++; if (force_reconstruct_read(c) && - !p.idx && p.ec_nr) + !p.idx && p.has_ec) p.idx++; - if (p.idx >= p.ec_nr + 1) + if (p.idx >= (unsigned) p.has_ec + 1) continue; if (ret > 0 && !ptr_better(c, p, *pick)) @@ -222,172 +153,299 @@ int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k, return ret; } -void bch2_bkey_append_ptr(struct bkey_i *k, - struct bch_extent_ptr ptr) +/* KEY_TYPE_btree_ptr: */ + +const char *bch2_btree_ptr_invalid(const struct bch_fs *c, struct bkey_s_c k) { - EBUG_ON(bch2_bkey_has_device(bkey_i_to_s_c(k), ptr.dev)); + if (bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX) + return "value too big"; - switch (k->k.type) { - case KEY_TYPE_btree_ptr: - case KEY_TYPE_extent: - EBUG_ON(bkey_val_u64s(&k->k) >= BKEY_EXTENT_VAL_U64s_MAX); + return bch2_bkey_ptrs_invalid(c, k); +} - ptr.type = 1 << BCH_EXTENT_ENTRY_ptr; +void bch2_btree_ptr_debugcheck(struct bch_fs *c, struct bkey_s_c k) +{ + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const struct bch_extent_ptr *ptr; + const char *err; + char buf[160]; + struct bucket_mark mark; + struct bch_dev *ca; - memcpy((void *) &k->v + bkey_val_bytes(&k->k), - &ptr, - sizeof(ptr)); - k->u64s++; - break; - default: - BUG(); + bch2_fs_bug_on(!test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) && + !bch2_bkey_replicas_marked(c, k, false), c, + "btree key bad (replicas not marked in superblock):\n%s", + (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)); + + if (!test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags)) + return; + + bkey_for_each_ptr(ptrs, ptr) { + ca = bch_dev_bkey_exists(c, ptr->dev); + + mark = ptr_bucket_mark(ca, ptr); + + err = "stale"; + if (gen_after(mark.gen, ptr->gen)) + goto err; + + err = "inconsistent"; + if (mark.data_type != BCH_DATA_BTREE || + mark.dirty_sectors < c->opts.btree_node_size) + goto err; } + + return; +err: + bch2_bkey_val_to_text(&PBUF(buf), c, k); + bch2_fs_bug(c, "%s btree pointer %s: bucket %zi gen %i mark %08x", + err, buf, PTR_BUCKET_NR(ca, ptr), + mark.gen, (unsigned) mark.v.counter); } -void bch2_bkey_drop_device(struct bkey_s k, unsigned dev) +void bch2_btree_ptr_to_text(struct printbuf *out, struct bch_fs *c, + struct bkey_s_c k) { - struct bch_extent_ptr *ptr; + bch2_bkey_ptrs_to_text(out, c, k); +} - bch2_bkey_drop_ptrs(k, ptr, ptr->dev == dev); +/* KEY_TYPE_extent: */ + +const char *bch2_extent_invalid(const struct bch_fs *c, struct bkey_s_c k) +{ + return bch2_bkey_ptrs_invalid(c, k); } -const struct bch_extent_ptr * -bch2_bkey_has_device(struct bkey_s_c k, unsigned dev) +void bch2_extent_debugcheck(struct bch_fs *c, struct bkey_s_c k) { - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - const struct bch_extent_ptr *ptr; + struct bkey_s_c_extent e = bkey_s_c_to_extent(k); + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; + char buf[160]; - bkey_for_each_ptr(ptrs, ptr) - if (ptr->dev == dev) - return ptr; + /* + * XXX: we should be doing most/all of these checks at startup time, + * where we check bch2_bkey_invalid() in btree_node_read_done() + * + * But note that we can't check for stale pointers or incorrect gc marks + * until after journal replay is done (it might be an extent that's + * going to get overwritten during replay) + */ - return NULL; -} + if (percpu_down_read_trylock(&c->mark_lock)) { + bch2_fs_bug_on(!test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) && + 
!bch2_bkey_replicas_marked_locked(c, e.s_c, false), c, + "extent key bad (replicas not marked in superblock):\n%s", + (bch2_bkey_val_to_text(&PBUF(buf), c, e.s_c), buf)); + percpu_up_read(&c->mark_lock); + } + /* + * If journal replay hasn't finished, we might be seeing keys + * that will be overwritten by the time journal replay is done: + */ + if (!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags)) + return; -bool bch2_bkey_has_target(struct bch_fs *c, struct bkey_s_c k, unsigned target) -{ - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - const struct bch_extent_ptr *ptr; + extent_for_each_ptr_decode(e, p, entry) { + struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev); + struct bucket_mark mark = ptr_bucket_mark(ca, &p.ptr); + unsigned stale = gen_after(mark.gen, p.ptr.gen); + unsigned disk_sectors = ptr_disk_sectors(p); + unsigned mark_sectors = p.ptr.cached + ? mark.cached_sectors + : mark.dirty_sectors; - bkey_for_each_ptr(ptrs, ptr) - if (bch2_dev_in_target(c, ptr->dev, target) && - (!ptr->cached || - !ptr_stale(bch_dev_bkey_exists(c, ptr->dev), ptr))) - return true; + bch2_fs_bug_on(stale && !p.ptr.cached, c, + "stale dirty pointer (ptr gen %u bucket %u", + p.ptr.gen, mark.gen); - return false; + bch2_fs_bug_on(stale > 96, c, "key too stale: %i", stale); + + bch2_fs_bug_on(!stale && + (mark.data_type != BCH_DATA_USER || + mark_sectors < disk_sectors), c, + "extent pointer not marked: %s:\n" + "type %u sectors %u < %u", + (bch2_bkey_val_to_text(&PBUF(buf), c, e.s_c), buf), + mark.data_type, + mark_sectors, disk_sectors); + } } -/* extent specific utility code */ +void bch2_extent_to_text(struct printbuf *out, struct bch_fs *c, + struct bkey_s_c k) +{ + bch2_bkey_ptrs_to_text(out, c, k); +} -const struct bch_extent_ptr * -bch2_extent_has_device(struct bkey_s_c_extent e, unsigned dev) +enum merge_result bch2_extent_merge(struct bch_fs *c, + struct bkey_s _l, struct bkey_s _r) { - const struct bch_extent_ptr *ptr; + struct bkey_s_extent l = bkey_s_to_extent(_l); + struct bkey_s_extent r = bkey_s_to_extent(_r); + union bch_extent_entry *en_l = l.v->start; + union bch_extent_entry *en_r = r.v->start; + struct bch_extent_crc_unpacked crc_l, crc_r; - extent_for_each_ptr(e, ptr) - if (ptr->dev == dev) - return ptr; + if (bkey_val_u64s(l.k) != bkey_val_u64s(r.k)) + return BCH_MERGE_NOMERGE; - return NULL; -} + crc_l = bch2_extent_crc_unpack(l.k, NULL); -const struct bch_extent_ptr * -bch2_extent_has_group(struct bch_fs *c, struct bkey_s_c_extent e, unsigned group) -{ - const struct bch_extent_ptr *ptr; + extent_for_each_entry(l, en_l) { + en_r = vstruct_idx(r.v, (u64 *) en_l - l.v->_data); - extent_for_each_ptr(e, ptr) { - struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); + if (extent_entry_type(en_l) != extent_entry_type(en_r)) + return BCH_MERGE_NOMERGE; - if (ca->mi.group && - ca->mi.group - 1 == group) - return ptr; + switch (extent_entry_type(en_l)) { + case BCH_EXTENT_ENTRY_ptr: { + const struct bch_extent_ptr *lp = &en_l->ptr; + const struct bch_extent_ptr *rp = &en_r->ptr; + struct bch_dev *ca; + + if (lp->offset + crc_l.compressed_size != rp->offset || + lp->dev != rp->dev || + lp->gen != rp->gen) + return BCH_MERGE_NOMERGE; + + /* We don't allow extents to straddle buckets: */ + ca = bch_dev_bkey_exists(c, lp->dev); + + if (PTR_BUCKET_NR(ca, lp) != PTR_BUCKET_NR(ca, rp)) + return BCH_MERGE_NOMERGE; + + break; + } + case BCH_EXTENT_ENTRY_stripe_ptr: + if (en_l->stripe_ptr.block != en_r->stripe_ptr.block || + en_l->stripe_ptr.idx != en_r->stripe_ptr.idx) + return 
BCH_MERGE_NOMERGE; + break; + case BCH_EXTENT_ENTRY_crc32: + case BCH_EXTENT_ENTRY_crc64: + case BCH_EXTENT_ENTRY_crc128: + crc_l = bch2_extent_crc_unpack(l.k, entry_to_crc(en_l)); + crc_r = bch2_extent_crc_unpack(r.k, entry_to_crc(en_r)); + + if (crc_l.csum_type != crc_r.csum_type || + crc_l.compression_type != crc_r.compression_type || + crc_l.nonce != crc_r.nonce) + return BCH_MERGE_NOMERGE; + + if (crc_l.offset + crc_l.live_size != crc_l.compressed_size || + crc_r.offset) + return BCH_MERGE_NOMERGE; + + if (!bch2_checksum_mergeable(crc_l.csum_type)) + return BCH_MERGE_NOMERGE; + + if (crc_l.compression_type) + return BCH_MERGE_NOMERGE; + + if (crc_l.csum_type && + crc_l.uncompressed_size + + crc_r.uncompressed_size > c->sb.encoded_extent_max) + return BCH_MERGE_NOMERGE; + + if (crc_l.uncompressed_size + crc_r.uncompressed_size - 1 > + bch2_crc_field_size_max[extent_entry_type(en_l)]) + return BCH_MERGE_NOMERGE; + + break; + default: + return BCH_MERGE_NOMERGE; + } } - return NULL; -} + extent_for_each_entry(l, en_l) { + struct bch_extent_crc_unpacked crc_l, crc_r; -unsigned bch2_extent_is_compressed(struct bkey_s_c k) -{ - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - const union bch_extent_entry *entry; - struct extent_ptr_decoded p; - unsigned ret = 0; + en_r = vstruct_idx(r.v, (u64 *) en_l - l.v->_data); - bkey_for_each_ptr_decode(k.k, ptrs, p, entry) - if (!p.ptr.cached && - p.crc.compression_type != BCH_COMPRESSION_NONE) - ret += p.crc.compressed_size; + if (!extent_entry_is_crc(en_l)) + continue; - return ret; -} + crc_l = bch2_extent_crc_unpack(l.k, entry_to_crc(en_l)); + crc_r = bch2_extent_crc_unpack(r.k, entry_to_crc(en_r)); -bool bch2_bkey_matches_ptr(struct bch_fs *c, struct bkey_s_c k, - struct bch_extent_ptr m, u64 offset) -{ - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - const union bch_extent_entry *entry; - struct extent_ptr_decoded p; + crc_l.csum = bch2_checksum_merge(crc_l.csum_type, + crc_l.csum, + crc_r.csum, + crc_r.uncompressed_size << 9); - bkey_for_each_ptr_decode(k.k, ptrs, p, entry) - if (p.ptr.dev == m.dev && - p.ptr.gen == m.gen && - (s64) p.ptr.offset + p.crc.offset - bkey_start_offset(k.k) == - (s64) m.offset - offset) - return true; + crc_l.uncompressed_size += crc_r.uncompressed_size; + crc_l.compressed_size += crc_r.compressed_size; - return false; + bch2_extent_crc_pack(entry_to_crc(en_l), crc_l, + extent_entry_type(en_l)); + } + + bch2_key_resize(l.k, l.k->size + r.k->size); + + return BCH_MERGE_MERGE; } -static union bch_extent_entry *extent_entry_prev(struct bkey_ptrs ptrs, - union bch_extent_entry *entry) +/* KEY_TYPE_reservation: */ + +const char *bch2_reservation_invalid(const struct bch_fs *c, struct bkey_s_c k) { - union bch_extent_entry *i = ptrs.start; + struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k); - if (i == entry) - return NULL; + if (bkey_val_bytes(k.k) != sizeof(struct bch_reservation)) + return "incorrect value size"; - while (extent_entry_next(i) != entry) - i = extent_entry_next(i); - return i; + if (!r.v->nr_replicas || r.v->nr_replicas > BCH_REPLICAS_MAX) + return "invalid nr_replicas"; + + return NULL; } -union bch_extent_entry *bch2_bkey_drop_ptr(struct bkey_s k, - struct bch_extent_ptr *ptr) +void bch2_reservation_to_text(struct printbuf *out, struct bch_fs *c, + struct bkey_s_c k) { - struct bkey_ptrs ptrs = bch2_bkey_ptrs(k); - union bch_extent_entry *dst, *src, *prev; - bool drop_crc = true; - - EBUG_ON(ptr < &ptrs.start->ptr || - ptr >= &ptrs.end->ptr); - EBUG_ON(ptr->type != 1 << BCH_EXTENT_ENTRY_ptr); 
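[Editor's note: bch2_extent_merge() above walks the two keys' entry lists in lockstep — merge is refused unless the lists have equal length and every pair of entries is compatible under its type's rules (contiguous pointers on the same device and bucket, matching stripe pointers, mergeable checksums). The skeleton of that check, over a simplified entry type:]

#include <stdbool.h>

struct entry { int type; long long a, b; };

/* Per-type compatibility rule; the real code checks device/offset
 * contiguity for pointers and csum/compression fields for CRCs. */
static bool pair_ok(const struct entry *l, const struct entry *r)
{
	return l->type == r->type && l->a == r->a && l->b == r->b;
}

static bool lists_mergeable(const struct entry *l, int nl,
			    const struct entry *r, int nr)
{
	if (nl != nr)
		return false;	/* mirrors the bkey_val_u64s() early-out */
	for (int i = 0; i < nl; i++)
		if (!pair_ok(&l[i], &r[i]))
			return false;
	return true;
}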
+ struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k); - src = extent_entry_next(to_entry(ptr)); - if (src != ptrs.end && - !extent_entry_is_crc(src)) - drop_crc = false; + pr_buf(out, "generation %u replicas %u", + le32_to_cpu(r.v->generation), + r.v->nr_replicas); +} - dst = to_entry(ptr); - while ((prev = extent_entry_prev(ptrs, dst))) { - if (extent_entry_is_ptr(prev)) - break; +enum merge_result bch2_reservation_merge(struct bch_fs *c, + struct bkey_s _l, struct bkey_s _r) +{ + struct bkey_s_reservation l = bkey_s_to_reservation(_l); + struct bkey_s_reservation r = bkey_s_to_reservation(_r); - if (extent_entry_is_crc(prev)) { - if (drop_crc) - dst = prev; - break; - } + if (l.v->generation != r.v->generation || + l.v->nr_replicas != r.v->nr_replicas) + return BCH_MERGE_NOMERGE; - dst = prev; + if ((u64) l.k->size + r.k->size > KEY_SIZE_MAX) { + bch2_key_resize(l.k, KEY_SIZE_MAX); + bch2_cut_front_s(l.k->p, r.s); + return BCH_MERGE_PARTIAL; } - memmove_u64s_down(dst, src, - (u64 *) ptrs.end - (u64 *) src); - k.k->u64s -= (u64 *) src - (u64 *) dst; + bch2_key_resize(l.k, l.k->size + r.k->size); - return dst; + return BCH_MERGE_MERGE; +} + +/* Extent checksum entries: */ + +/* returns true if not equal */ +static inline bool bch2_crc_unpacked_cmp(struct bch_extent_crc_unpacked l, + struct bch_extent_crc_unpacked r) +{ + return (l.csum_type != r.csum_type || + l.compression_type != r.compression_type || + l.compressed_size != r.compressed_size || + l.uncompressed_size != r.uncompressed_size || + l.offset != r.offset || + l.live_size != r.live_size || + l.nonce != r.nonce || + bch2_crc_cmp(l.csum, r.csum)); } static inline bool can_narrow_crc(struct bch_extent_crc_unpacked u, @@ -466,52 +524,404 @@ restart_narrow_pointers: return ret; } -/* returns true if not equal */ -static inline bool bch2_crc_unpacked_cmp(struct bch_extent_crc_unpacked l, - struct bch_extent_crc_unpacked r) +static void bch2_extent_crc_pack(union bch_extent_crc *dst, + struct bch_extent_crc_unpacked src, + enum bch_extent_entry_type type) { - return (l.csum_type != r.csum_type || - l.compression_type != r.compression_type || - l.compressed_size != r.compressed_size || - l.uncompressed_size != r.uncompressed_size || - l.offset != r.offset || - l.live_size != r.live_size || - l.nonce != r.nonce || - bch2_crc_cmp(l.csum, r.csum)); +#define set_common_fields(_dst, _src) \ + _dst.type = 1 << type; \ + _dst.csum_type = _src.csum_type, \ + _dst.compression_type = _src.compression_type, \ + _dst._compressed_size = _src.compressed_size - 1, \ + _dst._uncompressed_size = _src.uncompressed_size - 1, \ + _dst.offset = _src.offset + + switch (type) { + case BCH_EXTENT_ENTRY_crc32: + set_common_fields(dst->crc32, src); + dst->crc32.csum = *((__le32 *) &src.csum.lo); + break; + case BCH_EXTENT_ENTRY_crc64: + set_common_fields(dst->crc64, src); + dst->crc64.nonce = src.nonce; + dst->crc64.csum_lo = src.csum.lo; + dst->crc64.csum_hi = *((__le16 *) &src.csum.hi); + break; + case BCH_EXTENT_ENTRY_crc128: + set_common_fields(dst->crc128, src); + dst->crc128.nonce = src.nonce; + dst->crc128.csum = src.csum; + break; + default: + BUG(); + } +#undef set_common_fields } -void bch2_ptr_swab(const struct bkey_format *f, struct bkey_packed *k) +void bch2_extent_crc_append(struct bkey_i *k, + struct bch_extent_crc_unpacked new) { - union bch_extent_entry *entry; - u64 *d = (u64 *) bkeyp_val(f, k); - unsigned i; + struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(k)); + union bch_extent_crc *crc = (void *) ptrs.end; + enum 
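[Editor's note: bch2_reservation_merge() above shows the BCH_MERGE_PARTIAL pattern — when two adjacent keys would merge to something larger than KEY_SIZE_MAX, the left key grows to the cap and the right key is cut back by the amount absorbed. The size arithmetic in isolation, ignoring the degenerate case where the left key already sits at the cap; names are stand-ins:]

#define SIZE_CAP 1024U			/* stand-in for KEY_SIZE_MAX */

enum merge_res { MERGE_NONE, MERGE_PARTIAL, MERGE_FULL };

static enum merge_res merge_sizes(unsigned *l, unsigned *r)
{
	if (*l + *r > SIZE_CAP) {
		unsigned absorbed = SIZE_CAP - *l;

		*l  = SIZE_CAP;
		*r -= absorbed;		/* front of r was merged into l */
		return MERGE_PARTIAL;
	}
	*l += *r;
	*r  = 0;
	return MERGE_FULL;
}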
bch_extent_entry_type type; - for (i = 0; i < bkeyp_val_u64s(f, k); i++) - d[i] = swab64(d[i]); + if (bch_crc_bytes[new.csum_type] <= 4 && + new.uncompressed_size - 1 <= CRC32_SIZE_MAX && + new.nonce <= CRC32_NONCE_MAX) + type = BCH_EXTENT_ENTRY_crc32; + else if (bch_crc_bytes[new.csum_type] <= 10 && + new.uncompressed_size - 1 <= CRC64_SIZE_MAX && + new.nonce <= CRC64_NONCE_MAX) + type = BCH_EXTENT_ENTRY_crc64; + else if (bch_crc_bytes[new.csum_type] <= 16 && + new.uncompressed_size - 1 <= CRC128_SIZE_MAX && + new.nonce <= CRC128_NONCE_MAX) + type = BCH_EXTENT_ENTRY_crc128; + else + BUG(); - for (entry = (union bch_extent_entry *) d; - entry < (union bch_extent_entry *) (d + bkeyp_val_u64s(f, k)); - entry = extent_entry_next(entry)) { - switch (extent_entry_type(entry)) { - case BCH_EXTENT_ENTRY_ptr: - break; - case BCH_EXTENT_ENTRY_crc32: - entry->crc32.csum = swab32(entry->crc32.csum); + bch2_extent_crc_pack(crc, new, type); + + k->k.u64s += extent_entry_u64s(ptrs.end); + + EBUG_ON(bkey_val_u64s(&k->k) > BKEY_EXTENT_VAL_U64s_MAX); +} + +/* Generic code for keys with pointers: */ + +unsigned bch2_bkey_nr_ptrs(struct bkey_s_c k) +{ + return bch2_bkey_devs(k).nr; +} + +unsigned bch2_bkey_nr_ptrs_allocated(struct bkey_s_c k) +{ + return k.k->type == KEY_TYPE_reservation + ? bkey_s_c_to_reservation(k).v->nr_replicas + : bch2_bkey_dirty_devs(k).nr; +} + +unsigned bch2_bkey_nr_ptrs_fully_allocated(struct bkey_s_c k) +{ + unsigned ret = 0; + + if (k.k->type == KEY_TYPE_reservation) { + ret = bkey_s_c_to_reservation(k).v->nr_replicas; + } else { + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; + + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) + ret += !p.ptr.cached && + p.crc.compression_type == BCH_COMPRESSION_NONE; + } + + return ret; +} + +unsigned bch2_bkey_sectors_compressed(struct bkey_s_c k) +{ + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; + unsigned ret = 0; + + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) + if (!p.ptr.cached && + p.crc.compression_type != BCH_COMPRESSION_NONE) + ret += p.crc.compressed_size; + + return ret; +} + +bool bch2_check_range_allocated(struct bch_fs *c, struct bpos pos, u64 size, + unsigned nr_replicas) +{ + struct btree_trans trans; + struct btree_iter *iter; + struct bpos end = pos; + struct bkey_s_c k; + bool ret = true; + int err; + + end.offset += size; + + bch2_trans_init(&trans, c, 0, 0); + + for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, pos, + BTREE_ITER_SLOTS, k, err) { + if (bkey_cmp(bkey_start_pos(k.k), end) >= 0) break; - case BCH_EXTENT_ENTRY_crc64: - entry->crc64.csum_hi = swab16(entry->crc64.csum_hi); - entry->crc64.csum_lo = swab64(entry->crc64.csum_lo); + + if (nr_replicas > bch2_bkey_nr_ptrs_fully_allocated(k)) { + ret = false; break; - case BCH_EXTENT_ENTRY_crc128: - entry->crc128.csum.hi = (__force __le64) - swab64((__force u64) entry->crc128.csum.hi); - entry->crc128.csum.lo = (__force __le64) - swab64((__force u64) entry->crc128.csum.lo); + } + } + bch2_trans_exit(&trans); + + return ret; +} + +static unsigned bch2_extent_ptr_durability(struct bch_fs *c, + struct extent_ptr_decoded p) +{ + unsigned durability = 0; + struct bch_dev *ca; + + if (p.ptr.cached) + return 0; + + ca = bch_dev_bkey_exists(c, p.ptr.dev); + + if (ca->mi.state != BCH_MEMBER_STATE_FAILED) + durability = max_t(unsigned, durability, ca->mi.durability); + + if (p.has_ec) { + struct stripe *s = + genradix_ptr(&c->stripes[0], 
p.ec.idx); + + if (WARN_ON(!s)) + goto out; + + durability = max_t(unsigned, durability, s->nr_redundant); + } +out: + return durability; +} + +unsigned bch2_bkey_durability(struct bch_fs *c, struct bkey_s_c k) +{ + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; + unsigned durability = 0; + + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) + durability += bch2_extent_ptr_durability(c, p); + + return durability; +} + +void bch2_bkey_mark_replicas_cached(struct bch_fs *c, struct bkey_s k, + unsigned target, + unsigned nr_desired_replicas) +{ + struct bkey_ptrs ptrs = bch2_bkey_ptrs(k); + union bch_extent_entry *entry; + struct extent_ptr_decoded p; + int extra = bch2_bkey_durability(c, k.s_c) - nr_desired_replicas; + + if (target && extra > 0) + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { + int n = bch2_extent_ptr_durability(c, p); + + if (n && n <= extra && + !bch2_dev_in_target(c, p.ptr.dev, target)) { + entry->ptr.cached = true; + extra -= n; + } + } + + if (extra > 0) + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { + int n = bch2_extent_ptr_durability(c, p); + + if (n && n <= extra) { + entry->ptr.cached = true; + extra -= n; + } + } +} + +void bch2_bkey_append_ptr(struct bkey_i *k, + struct bch_extent_ptr ptr) +{ + EBUG_ON(bch2_bkey_has_device(bkey_i_to_s_c(k), ptr.dev)); + + switch (k->k.type) { + case KEY_TYPE_btree_ptr: + case KEY_TYPE_extent: + EBUG_ON(bkey_val_u64s(&k->k) >= BKEY_EXTENT_VAL_U64s_MAX); + + ptr.type = 1 << BCH_EXTENT_ENTRY_ptr; + + memcpy((void *) &k->v + bkey_val_bytes(&k->k), + &ptr, + sizeof(ptr)); + k->u64s++; + break; + default: + BUG(); + } +} + +static inline void __extent_entry_insert(struct bkey_i *k, + union bch_extent_entry *dst, + union bch_extent_entry *new) +{ + union bch_extent_entry *end = bkey_val_end(bkey_i_to_s(k)); + + memmove_u64s_up_small((u64 *) dst + extent_entry_u64s(new), + dst, (u64 *) end - (u64 *) dst); + k->k.u64s += extent_entry_u64s(new); + memcpy(dst, new, extent_entry_bytes(new)); +} + +void bch2_extent_ptr_decoded_append(struct bkey_i *k, + struct extent_ptr_decoded *p) +{ + struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(k)); + struct bch_extent_crc_unpacked crc = + bch2_extent_crc_unpack(&k->k, NULL); + union bch_extent_entry *pos; + + if (!bch2_crc_unpacked_cmp(crc, p->crc)) { + pos = ptrs.start; + goto found; + } + + bkey_for_each_crc(&k->k, ptrs, crc, pos) + if (!bch2_crc_unpacked_cmp(crc, p->crc)) { + pos = extent_entry_next(pos); + goto found; + } + + bch2_extent_crc_append(k, p->crc); + pos = bkey_val_end(bkey_i_to_s(k)); +found: + p->ptr.type = 1 << BCH_EXTENT_ENTRY_ptr; + __extent_entry_insert(k, pos, to_entry(&p->ptr)); + + if (p->has_ec) { + p->ec.type = 1 << BCH_EXTENT_ENTRY_stripe_ptr; + __extent_entry_insert(k, pos, to_entry(&p->ec)); + } +} + +static union bch_extent_entry *extent_entry_prev(struct bkey_ptrs ptrs, + union bch_extent_entry *entry) +{ + union bch_extent_entry *i = ptrs.start; + + if (i == entry) + return NULL; + + while (extent_entry_next(i) != entry) + i = extent_entry_next(i); + return i; +} + +union bch_extent_entry *bch2_bkey_drop_ptr(struct bkey_s k, + struct bch_extent_ptr *ptr) +{ + struct bkey_ptrs ptrs = bch2_bkey_ptrs(k); + union bch_extent_entry *dst, *src, *prev; + bool drop_crc = true; + + EBUG_ON(ptr < &ptrs.start->ptr || + ptr >= &ptrs.end->ptr); + EBUG_ON(ptr->type != 1 << BCH_EXTENT_ENTRY_ptr); + + src = extent_entry_next(to_entry(ptr)); + if (src != ptrs.end && + !extent_entry_is_crc(src)) + drop_crc = false; 
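+
+	/*
+	 * A crc entry applies to the pointer entries that follow it: if
+	 * nothing after the dropped pointer still uses the preceding crc,
+	 * walk dst backwards so the now-unreferenced crc entry is dropped
+	 * along with the pointer.
+	 */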
+ + dst = to_entry(ptr); + while ((prev = extent_entry_prev(ptrs, dst))) { + if (extent_entry_is_ptr(prev)) break; - case BCH_EXTENT_ENTRY_stripe_ptr: + + if (extent_entry_is_crc(prev)) { + if (drop_crc) + dst = prev; break; } + + dst = prev; } + + memmove_u64s_down(dst, src, + (u64 *) ptrs.end - (u64 *) src); + k.k->u64s -= (u64 *) src - (u64 *) dst; + + return dst; +} + +void bch2_bkey_drop_device(struct bkey_s k, unsigned dev) +{ + struct bch_extent_ptr *ptr; + + bch2_bkey_drop_ptrs(k, ptr, ptr->dev == dev); +} + +const struct bch_extent_ptr * +bch2_bkey_has_device(struct bkey_s_c k, unsigned dev) +{ + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const struct bch_extent_ptr *ptr; + + bkey_for_each_ptr(ptrs, ptr) + if (ptr->dev == dev) + return ptr; + + return NULL; +} + +bool bch2_bkey_has_target(struct bch_fs *c, struct bkey_s_c k, unsigned target) +{ + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const struct bch_extent_ptr *ptr; + + bkey_for_each_ptr(ptrs, ptr) + if (bch2_dev_in_target(c, ptr->dev, target) && + (!ptr->cached || + !ptr_stale(bch_dev_bkey_exists(c, ptr->dev), ptr))) + return true; + + return false; +} + +bool bch2_bkey_matches_ptr(struct bch_fs *c, struct bkey_s_c k, + struct bch_extent_ptr m, u64 offset) +{ + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; + + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) + if (p.ptr.dev == m.dev && + p.ptr.gen == m.gen && + (s64) p.ptr.offset + p.crc.offset - bkey_start_offset(k.k) == + (s64) m.offset - offset) + return true; + + return false; +} + +/* + * bch_extent_normalize - clean up an extent, dropping stale pointers etc. + * + * Returns true if @k should be dropped entirely + * + * For existing keys, only called when btree nodes are being rewritten, not when + * they're merely being compacted/resorted in memory. 
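+ * (The key is dropped only when every pointer was cached and stale: the
+ * type is then set to KEY_TYPE_discard, so bkey_whiteout() returns true.)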
+ */ +bool bch2_extent_normalize(struct bch_fs *c, struct bkey_s k) +{ + struct bch_extent_ptr *ptr; + + bch2_bkey_drop_ptrs(k, ptr, + ptr->cached && + ptr_stale(bch_dev_bkey_exists(c, ptr->dev), ptr)); + + /* will only happen if all pointers were cached: */ + if (!bch2_bkey_nr_ptrs(k.s_c)) + k.k->type = KEY_TYPE_discard; + + return bkey_whiteout(k.k); } void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c, @@ -662,70 +1072,50 @@ const char *bch2_bkey_ptrs_invalid(const struct bch_fs *c, struct bkey_s_c k) return NULL; } -/* Btree ptrs */ - -const char *bch2_btree_ptr_invalid(const struct bch_fs *c, struct bkey_s_c k) -{ - if (bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX) - return "value too big"; - - return bch2_bkey_ptrs_invalid(c, k); -} - -void bch2_btree_ptr_debugcheck(struct bch_fs *c, struct bkey_s_c k) +void bch2_ptr_swab(const struct bkey_format *f, struct bkey_packed *k) { - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - const struct bch_extent_ptr *ptr; - const char *err; - char buf[160]; - struct bucket_mark mark; - struct bch_dev *ca; - - bch2_fs_bug_on(!test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) && - !bch2_bkey_replicas_marked(c, k, false), c, - "btree key bad (replicas not marked in superblock):\n%s", - (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)); - - if (!test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags)) - return; - - bkey_for_each_ptr(ptrs, ptr) { - ca = bch_dev_bkey_exists(c, ptr->dev); - - mark = ptr_bucket_mark(ca, ptr); + union bch_extent_entry *entry; + u64 *d = (u64 *) bkeyp_val(f, k); + unsigned i; - err = "stale"; - if (gen_after(mark.gen, ptr->gen)) - goto err; + for (i = 0; i < bkeyp_val_u64s(f, k); i++) + d[i] = swab64(d[i]); - err = "inconsistent"; - if (mark.data_type != BCH_DATA_BTREE || - mark.dirty_sectors < c->opts.btree_node_size) - goto err; + for (entry = (union bch_extent_entry *) d; + entry < (union bch_extent_entry *) (d + bkeyp_val_u64s(f, k)); + entry = extent_entry_next(entry)) { + switch (extent_entry_type(entry)) { + case BCH_EXTENT_ENTRY_ptr: + break; + case BCH_EXTENT_ENTRY_crc32: + entry->crc32.csum = swab32(entry->crc32.csum); + break; + case BCH_EXTENT_ENTRY_crc64: + entry->crc64.csum_hi = swab16(entry->crc64.csum_hi); + entry->crc64.csum_lo = swab64(entry->crc64.csum_lo); + break; + case BCH_EXTENT_ENTRY_crc128: + entry->crc128.csum.hi = (__force __le64) + swab64((__force u64) entry->crc128.csum.hi); + entry->crc128.csum.lo = (__force __le64) + swab64((__force u64) entry->crc128.csum.lo); + break; + case BCH_EXTENT_ENTRY_stripe_ptr: + break; + } } - - return; -err: - bch2_bkey_val_to_text(&PBUF(buf), c, k); - bch2_fs_bug(c, "%s btree pointer %s: bucket %zi gen %i mark %08x", - err, buf, PTR_BUCKET_NR(ca, ptr), - mark.gen, (unsigned) mark.v.counter); } -void bch2_btree_ptr_to_text(struct printbuf *out, struct bch_fs *c, - struct bkey_s_c k) -{ - bch2_bkey_ptrs_to_text(out, c, k); -} +/* Generic extent code: */ -/* Extents */ - -void __bch2_cut_front(struct bpos where, struct bkey_s k) +int bch2_cut_front_s(struct bpos where, struct bkey_s k) { + unsigned new_val_u64s = bkey_val_u64s(k.k); + int val_u64s_delta; u64 sub; if (bkey_cmp(where, bkey_start_pos(k.k)) <= 0) - return; + return 0; EBUG_ON(bkey_cmp(where, k.k->p) > 0); @@ -733,15 +1123,12 @@ void __bch2_cut_front(struct bpos where, struct bkey_s k) k.k->size -= sub; - if (!k.k->size) + if (!k.k->size) { k.k->type = KEY_TYPE_deleted; + new_val_u64s = 0; + } switch (k.k->type) { - case KEY_TYPE_deleted: - case KEY_TYPE_discard: - case KEY_TYPE_error: - case 
KEY_TYPE_cookie: - break; case KEY_TYPE_extent: case KEY_TYPE_reflink_v: { struct bkey_ptrs ptrs = bch2_bkey_ptrs(k); @@ -779,1119 +1166,59 @@ void __bch2_cut_front(struct bpos where, struct bkey_s k) le64_add_cpu(&p.v->idx, sub); break; } - case KEY_TYPE_reservation: - break; - default: - BUG(); - } -} - -bool bch2_cut_back(struct bpos where, struct bkey *k) -{ - u64 len = 0; - - if (bkey_cmp(where, k->p) >= 0) - return false; - - EBUG_ON(bkey_cmp(where, bkey_start_pos(k)) < 0); - - len = where.offset - bkey_start_offset(k); - - k->p = where; - k->size = len; - - if (!len) - k->type = KEY_TYPE_deleted; - - return true; -} - -static bool extent_i_save(struct btree *b, struct bkey_packed *dst, - struct bkey_i *src) -{ - struct bkey_format *f = &b->format; - struct bkey_i *dst_unpacked; - struct bkey_packed tmp; - - if ((dst_unpacked = packed_to_bkey(dst))) - dst_unpacked->k = src->k; - else if (bch2_bkey_pack_key(&tmp, &src->k, f)) - memcpy_u64s(dst, &tmp, f->key_u64s); - else - return false; - - memcpy_u64s(bkeyp_val(f, dst), &src->v, bkey_val_u64s(&src->k)); - return true; -} - -static bool bch2_extent_merge_inline(struct bch_fs *, - struct btree_iter *, - struct bkey_packed *, - struct bkey_packed *, - bool); - -static void verify_extent_nonoverlapping(struct bch_fs *c, - struct btree *b, - struct btree_node_iter *_iter, - struct bkey_i *insert) -{ -#ifdef CONFIG_BCACHEFS_DEBUG - struct btree_node_iter iter; - struct bkey_packed *k; - struct bkey uk; - - if (!expensive_debug_checks(c)) - return; - - iter = *_iter; - k = bch2_btree_node_iter_prev_filter(&iter, b, KEY_TYPE_discard); - BUG_ON(k && - (uk = bkey_unpack_key(b, k), - bkey_cmp(uk.p, bkey_start_pos(&insert->k)) > 0)); - - iter = *_iter; - k = bch2_btree_node_iter_peek_filter(&iter, b, KEY_TYPE_discard); -#if 0 - BUG_ON(k && - (uk = bkey_unpack_key(b, k), - bkey_cmp(insert->k.p, bkey_start_pos(&uk))) > 0); -#else - if (k && - (uk = bkey_unpack_key(b, k), - bkey_cmp(insert->k.p, bkey_start_pos(&uk))) > 0) { - char buf1[100]; - char buf2[100]; - - bch2_bkey_to_text(&PBUF(buf1), &insert->k); - bch2_bkey_to_text(&PBUF(buf2), &uk); - - bch2_dump_btree_node(b); - panic("insert > next :\n" - "insert %s\n" - "next %s\n", - buf1, buf2); - } -#endif - -#endif -} - -static void extent_bset_insert(struct bch_fs *c, struct btree_iter *iter, - struct bkey_i *insert) -{ - struct btree_iter_level *l = &iter->l[0]; - struct btree_node_iter node_iter; - struct bkey_packed *k; - - BUG_ON(insert->k.u64s > bch_btree_keys_u64s_remaining(c, l->b)); - - EBUG_ON(bkey_deleted(&insert->k) || !insert->k.size); - verify_extent_nonoverlapping(c, l->b, &l->iter, insert); - - if (debug_check_bkeys(c)) - bch2_bkey_debugcheck(c, l->b, bkey_i_to_s_c(insert)); - - node_iter = l->iter; - k = bch2_btree_node_iter_prev_filter(&node_iter, l->b, KEY_TYPE_discard); - if (k && !bkey_written(l->b, k) && - bch2_extent_merge_inline(c, iter, k, bkey_to_packed(insert), true)) - return; - - node_iter = l->iter; - k = bch2_btree_node_iter_peek_filter(&node_iter, l->b, KEY_TYPE_discard); - if (k && !bkey_written(l->b, k) && - bch2_extent_merge_inline(c, iter, bkey_to_packed(insert), k, false)) - return; - - /* - * may have skipped past some deleted extents greater than the insert - * key, before we got to a non deleted extent and knew we could bail out - * rewind the iterator a bit if necessary: - */ - node_iter = l->iter; - while ((k = bch2_btree_node_iter_prev_all(&node_iter, l->b)) && - bkey_cmp_left_packed(l->b, k, &insert->k.p) > 0) - l->iter = node_iter; - - k = 
bch2_btree_node_iter_bset_pos(&l->iter, l->b, bset_tree_last(l->b)); - - bch2_bset_insert(l->b, &l->iter, k, insert, 0); - bch2_btree_node_iter_fix(iter, l->b, &l->iter, k, 0, k->u64s); -} - -static unsigned bch2_bkey_nr_alloc_ptrs(struct bkey_s_c k) -{ - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - const union bch_extent_entry *entry; - unsigned ret = 0; - - bkey_extent_entry_for_each(ptrs, entry) { - switch (__extent_entry_type(entry)) { - case BCH_EXTENT_ENTRY_ptr: - case BCH_EXTENT_ENTRY_stripe_ptr: - ret++; - } - } - - return ret; -} + case KEY_TYPE_inline_data: { + struct bkey_s_inline_data d = bkey_s_to_inline_data(k); -static int __bch2_extent_atomic_end(struct btree_trans *trans, - struct bkey_s_c k, - unsigned offset, - struct bpos *end, - unsigned *nr_iters, - unsigned max_iters) -{ - int ret = 0; - - switch (k.k->type) { - case KEY_TYPE_extent: - case KEY_TYPE_reflink_v: - *nr_iters += bch2_bkey_nr_alloc_ptrs(k); - - if (*nr_iters >= max_iters) { - *end = bpos_min(*end, k.k->p); - return 0; - } - - break; - case KEY_TYPE_reflink_p: { - struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); - u64 idx = le64_to_cpu(p.v->idx); - unsigned sectors = end->offset - bkey_start_offset(p.k); - struct btree_iter *iter; - struct bkey_s_c r_k; - - for_each_btree_key(trans, iter, - BTREE_ID_REFLINK, POS(0, idx + offset), - BTREE_ITER_SLOTS, r_k, ret) { - if (bkey_cmp(bkey_start_pos(r_k.k), - POS(0, idx + sectors)) >= 0) - break; - - *nr_iters += 1; - if (*nr_iters >= max_iters) { - struct bpos pos = bkey_start_pos(k.k); - pos.offset += r_k.k->p.offset - idx; + sub = min_t(u64, sub << 9, bkey_val_bytes(d.k)); - *end = bpos_min(*end, pos); - break; - } - } + memmove(d.v->data, + d.v->data + sub, + bkey_val_bytes(d.k) - sub); - bch2_trans_iter_put(trans, iter); + new_val_u64s -= sub >> 3; break; } } - return ret; -} - -int bch2_extent_atomic_end(struct btree_iter *iter, - struct bkey_i *insert, - struct bpos *end) -{ - struct btree_trans *trans = iter->trans; - struct btree *b = iter->l[0].b; - struct btree_node_iter node_iter = iter->l[0].iter; - struct bkey_packed *_k; - unsigned nr_iters = - bch2_bkey_nr_alloc_ptrs(bkey_i_to_s_c(insert)); - int ret = 0; - - BUG_ON(iter->uptodate > BTREE_ITER_NEED_PEEK); - BUG_ON(bkey_cmp(bkey_start_pos(&insert->k), b->data->min_key) < 0); - - *end = bpos_min(insert->k.p, b->key.k.p); - - ret = __bch2_extent_atomic_end(trans, bkey_i_to_s_c(insert), - 0, end, &nr_iters, 10); - if (ret) - return ret; - - while (nr_iters < 20 && - (_k = bch2_btree_node_iter_peek_filter(&node_iter, b, - KEY_TYPE_discard))) { - struct bkey unpacked; - struct bkey_s_c k = bkey_disassemble(b, _k, &unpacked); - unsigned offset = 0; + val_u64s_delta = bkey_val_u64s(k.k) - new_val_u64s; + BUG_ON(val_u64s_delta < 0); - if (bkey_cmp(bkey_start_pos(k.k), *end) >= 0) - break; - - if (bkey_cmp(bkey_start_pos(&insert->k), - bkey_start_pos(k.k)) > 0) - offset = bkey_start_offset(&insert->k) - - bkey_start_offset(k.k); - - ret = __bch2_extent_atomic_end(trans, k, offset, - end, &nr_iters, 20); - if (ret) - return ret; - - if (nr_iters >= 20) - break; - - bch2_btree_node_iter_advance(&node_iter, b); - } - - return 0; + set_bkey_val_u64s(k.k, new_val_u64s); + memset(bkey_val_end(k), 0, val_u64s_delta * sizeof(u64)); + return -val_u64s_delta; } -int bch2_extent_trim_atomic(struct bkey_i *k, struct btree_iter *iter) +int bch2_cut_back_s(struct bpos where, struct bkey_s k) { - struct bpos end; - int ret; - - ret = bch2_extent_atomic_end(iter, k, &end); - if (ret) - return ret; - - 
bch2_cut_back(end, &k->k); - return 0; -} - -int bch2_extent_is_atomic(struct bkey_i *k, struct btree_iter *iter) -{ - struct bpos end; - int ret; - - ret = bch2_extent_atomic_end(iter, k, &end); - if (ret) - return ret; - - return !bkey_cmp(end, k->k.p); -} - -enum btree_insert_ret -bch2_extent_can_insert(struct btree_trans *trans, - struct btree_insert_entry *insert, - unsigned *u64s) -{ - struct btree_iter_level *l = &insert->iter->l[0]; - struct btree_node_iter node_iter = l->iter; - enum bch_extent_overlap overlap; - struct bkey_packed *_k; - struct bkey unpacked; - struct bkey_s_c k; - int sectors; - - /* - * We avoid creating whiteouts whenever possible when deleting, but - * those optimizations mean we may potentially insert two whiteouts - * instead of one (when we overlap with the front of one extent and the - * back of another): - */ - if (bkey_whiteout(&insert->k->k)) - *u64s += BKEY_U64s; - - _k = bch2_btree_node_iter_peek_filter(&node_iter, l->b, - KEY_TYPE_discard); - if (!_k) - return BTREE_INSERT_OK; - - k = bkey_disassemble(l->b, _k, &unpacked); - - overlap = bch2_extent_overlap(&insert->k->k, k.k); - - /* account for having to split existing extent: */ - if (overlap == BCH_EXTENT_OVERLAP_MIDDLE) - *u64s += _k->u64s; - - if (overlap == BCH_EXTENT_OVERLAP_MIDDLE && - (sectors = bch2_extent_is_compressed(k))) { - int flags = trans->flags & BTREE_INSERT_NOFAIL - ? BCH_DISK_RESERVATION_NOFAIL : 0; - - switch (bch2_disk_reservation_add(trans->c, - trans->disk_res, - sectors, flags)) { - case 0: - break; - case -ENOSPC: - return BTREE_INSERT_ENOSPC; - default: - BUG(); - } - } - - return BTREE_INSERT_OK; -} + unsigned new_val_u64s = bkey_val_u64s(k.k); + int val_u64s_delta; + u64 len = 0; -static void -extent_squash(struct bch_fs *c, struct btree_iter *iter, - struct bkey_i *insert, - struct bkey_packed *_k, struct bkey_s k, - enum bch_extent_overlap overlap) -{ - struct btree_iter_level *l = &iter->l[0]; - - switch (overlap) { - case BCH_EXTENT_OVERLAP_FRONT: - /* insert overlaps with start of k: */ - __bch2_cut_front(insert->k.p, k); - EBUG_ON(bkey_deleted(k.k)); - extent_save(l->b, _k, k.k); - bch2_btree_node_iter_fix(iter, l->b, &l->iter, - _k, _k->u64s, _k->u64s); - break; + if (bkey_cmp(where, k.k->p) >= 0) + return 0; - case BCH_EXTENT_OVERLAP_BACK: - /* insert overlaps with end of k: */ - bch2_cut_back(bkey_start_pos(&insert->k), k.k); - EBUG_ON(bkey_deleted(k.k)); - extent_save(l->b, _k, k.k); + EBUG_ON(bkey_cmp(where, bkey_start_pos(k.k)) < 0); - /* - * As the auxiliary tree is indexed by the end of the - * key and we've just changed the end, update the - * auxiliary tree. 
- */ - bch2_bset_fix_invalidated_key(l->b, _k); - bch2_btree_node_iter_fix(iter, l->b, &l->iter, - _k, _k->u64s, _k->u64s); - break; + len = where.offset - bkey_start_offset(k.k); - case BCH_EXTENT_OVERLAP_ALL: { - /* The insert key completely covers k, invalidate k */ - if (!bkey_whiteout(k.k)) - btree_account_key_drop(l->b, _k); + k.k->p = where; + k.k->size = len; - k.k->size = 0; + if (!len) { k.k->type = KEY_TYPE_deleted; - - if (_k >= btree_bset_last(l->b)->start) { - unsigned u64s = _k->u64s; - - bch2_bset_delete(l->b, _k, _k->u64s); - bch2_btree_node_iter_fix(iter, l->b, &l->iter, - _k, u64s, 0); - } else { - extent_save(l->b, _k, k.k); - bch2_btree_node_iter_fix(iter, l->b, &l->iter, - _k, _k->u64s, _k->u64s); - } - - break; - } - case BCH_EXTENT_OVERLAP_MIDDLE: { - BKEY_PADDED(k) split; - /* - * The insert key falls 'in the middle' of k - * The insert key splits k in 3: - * - start only in k, preserve - * - middle common section, invalidate in k - * - end only in k, preserve - * - * We update the old key to preserve the start, - * insert will be the new common section, - * we manually insert the end that we are preserving. - * - * modify k _before_ doing the insert (which will move - * what k points to) - */ - bkey_reassemble(&split.k, k.s_c); - split.k.k.needs_whiteout |= bkey_written(l->b, _k); - - bch2_cut_back(bkey_start_pos(&insert->k), &split.k.k); - BUG_ON(bkey_deleted(&split.k.k)); - - __bch2_cut_front(insert->k.p, k); - BUG_ON(bkey_deleted(k.k)); - extent_save(l->b, _k, k.k); - bch2_btree_node_iter_fix(iter, l->b, &l->iter, - _k, _k->u64s, _k->u64s); - - extent_bset_insert(c, iter, &split.k); - break; - } - } -} - -struct extent_insert_state { - struct bkey_i whiteout; - bool update_journal; - bool update_btree; - bool deleting; -}; - -static void __bch2_insert_fixup_extent(struct bch_fs *c, - struct btree_iter *iter, - struct bkey_i *insert, - struct extent_insert_state *s) -{ - struct btree_iter_level *l = &iter->l[0]; - struct bkey_packed *_k; - struct bkey unpacked; - - while ((_k = bch2_btree_node_iter_peek_filter(&l->iter, l->b, - KEY_TYPE_discard))) { - struct bkey_s k = __bkey_disassemble(l->b, _k, &unpacked); - struct bpos cur_end = bpos_min(insert->k.p, k.k->p); - enum bch_extent_overlap overlap = - bch2_extent_overlap(&insert->k, k.k); - - if (bkey_cmp(bkey_start_pos(k.k), insert->k.p) >= 0) - break; - - if (!bkey_whiteout(k.k)) - s->update_journal = true; - - if (!s->update_journal) { - bch2_cut_front(cur_end, insert); - bch2_cut_front(cur_end, &s->whiteout); - bch2_btree_iter_set_pos_same_leaf(iter, cur_end); - goto next; - } - - /* - * When deleting, if possible just do it by switching the type - * of the key we're deleting, instead of creating and inserting - * a new whiteout: - */ - if (s->deleting && - !s->update_btree && - !bkey_cmp(insert->k.p, k.k->p) && - !bkey_cmp(bkey_start_pos(&insert->k), bkey_start_pos(k.k))) { - if (!bkey_whiteout(k.k)) { - btree_account_key_drop(l->b, _k); - _k->type = KEY_TYPE_discard; - reserve_whiteout(l->b, _k); - bch2_btree_node_iter_fix(iter, l->b, &l->iter, - _k, _k->u64s, _k->u64s); - } - break; - } - - if (k.k->needs_whiteout || bkey_written(l->b, _k)) { - insert->k.needs_whiteout = true; - s->update_btree = true; - } - - if (s->update_btree && - overlap == BCH_EXTENT_OVERLAP_ALL && - bkey_whiteout(k.k) && - k.k->needs_whiteout) { - unreserve_whiteout(l->b, _k); - _k->needs_whiteout = false; - } - - extent_squash(c, iter, insert, _k, k, overlap); - - if (!s->update_btree) - bch2_cut_front(cur_end, insert); -next: - if 
(overlap == BCH_EXTENT_OVERLAP_FRONT || - overlap == BCH_EXTENT_OVERLAP_MIDDLE) - break; - } -} - -/** - * bch_extent_insert_fixup - insert a new extent and deal with overlaps - * - * this may result in not actually doing the insert, or inserting some subset - * of the insert key. For cmpxchg operations this is where that logic lives. - * - * All subsets of @insert that need to be inserted are inserted using - * bch2_btree_insert_and_journal(). If @b or @res fills up, this function - * returns false, setting @iter->pos for the prefix of @insert that actually got - * inserted. - * - * BSET INVARIANTS: this function is responsible for maintaining all the - * invariants for bsets of extents in memory. things get really hairy with 0 - * size extents - * - * within one bset: - * - * bkey_start_pos(bkey_next(k)) >= k - * or bkey_start_offset(bkey_next(k)) >= k->offset - * - * i.e. strict ordering, no overlapping extents. - * - * multiple bsets (i.e. full btree node): - * - * ∀ k, j - * k.size != 0 ∧ j.size != 0 → - * ¬ (k > bkey_start_pos(j) ∧ k < j) - * - * i.e. no two overlapping keys _of nonzero size_ - * - * We can't realistically maintain this invariant for zero size keys because of - * the key merging done in bch2_btree_insert_key() - for two mergeable keys k, j - * there may be another 0 size key between them in another bset, and it will - * thus overlap with the merged key. - * - * In addition, the end of iter->pos indicates how much has been processed. - * If the end of iter->pos is not the same as the end of insert, then - * key insertion needs to continue/be retried. - */ -void bch2_insert_fixup_extent(struct btree_trans *trans, - struct btree_insert_entry *insert) -{ - struct bch_fs *c = trans->c; - struct btree_iter *iter = insert->iter; - struct extent_insert_state s = { - .whiteout = *insert->k, - .update_journal = !bkey_whiteout(&insert->k->k), - .update_btree = !bkey_whiteout(&insert->k->k), - .deleting = bkey_whiteout(&insert->k->k), - }; - BKEY_PADDED(k) tmp; - - EBUG_ON(iter->level); - EBUG_ON(!insert->k->k.size); - EBUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&insert->k->k))); - - __bch2_insert_fixup_extent(c, iter, insert->k, &s); - - bch2_btree_iter_set_pos_same_leaf(iter, insert->k->k.p); - - if (s.update_btree) { - bkey_copy(&tmp.k, insert->k); - - if (s.deleting) - tmp.k.k.type = KEY_TYPE_discard; - - EBUG_ON(bkey_deleted(&tmp.k.k) || !tmp.k.k.size); - - extent_bset_insert(c, iter, &tmp.k); - } - - if (s.update_journal) { - bkey_copy(&tmp.k, !s.deleting ? 
insert->k : &s.whiteout); - - if (s.deleting) - tmp.k.k.type = KEY_TYPE_discard; - - EBUG_ON(bkey_deleted(&tmp.k.k) || !tmp.k.k.size); - - bch2_btree_journal_key(trans, iter, &tmp.k); - } - - bch2_cut_front(insert->k->k.p, insert->k); -} - -const char *bch2_extent_invalid(const struct bch_fs *c, struct bkey_s_c k) -{ - return bch2_bkey_ptrs_invalid(c, k); -} - -void bch2_extent_debugcheck(struct bch_fs *c, struct bkey_s_c k) -{ - struct bkey_s_c_extent e = bkey_s_c_to_extent(k); - const union bch_extent_entry *entry; - struct extent_ptr_decoded p; - char buf[160]; - - /* - * XXX: we should be doing most/all of these checks at startup time, - * where we check bch2_bkey_invalid() in btree_node_read_done() - * - * But note that we can't check for stale pointers or incorrect gc marks - * until after journal replay is done (it might be an extent that's - * going to get overwritten during replay) - */ - - if (percpu_down_read_trylock(&c->mark_lock)) { - bch2_fs_bug_on(!test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) && - !bch2_bkey_replicas_marked_locked(c, e.s_c, false), c, - "extent key bad (replicas not marked in superblock):\n%s", - (bch2_bkey_val_to_text(&PBUF(buf), c, e.s_c), buf)); - percpu_up_read(&c->mark_lock); - } - /* - * If journal replay hasn't finished, we might be seeing keys - * that will be overwritten by the time journal replay is done: - */ - if (!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags)) - return; - - extent_for_each_ptr_decode(e, p, entry) { - struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev); - struct bucket_mark mark = ptr_bucket_mark(ca, &p.ptr); - unsigned stale = gen_after(mark.gen, p.ptr.gen); - unsigned disk_sectors = ptr_disk_sectors(p); - unsigned mark_sectors = p.ptr.cached - ? mark.cached_sectors - : mark.dirty_sectors; - - bch2_fs_bug_on(stale && !p.ptr.cached, c, - "stale dirty pointer (ptr gen %u bucket %u", - p.ptr.gen, mark.gen); - - bch2_fs_bug_on(stale > 96, c, "key too stale: %i", stale); - - bch2_fs_bug_on(!stale && - (mark.data_type != BCH_DATA_USER || - mark_sectors < disk_sectors), c, - "extent pointer not marked: %s:\n" - "type %u sectors %u < %u", - (bch2_bkey_val_to_text(&PBUF(buf), c, e.s_c), buf), - mark.data_type, - mark_sectors, disk_sectors); - } -} - -void bch2_extent_to_text(struct printbuf *out, struct bch_fs *c, - struct bkey_s_c k) -{ - bch2_bkey_ptrs_to_text(out, c, k); -} - -static unsigned bch2_crc_field_size_max[] = { - [BCH_EXTENT_ENTRY_crc32] = CRC32_SIZE_MAX, - [BCH_EXTENT_ENTRY_crc64] = CRC64_SIZE_MAX, - [BCH_EXTENT_ENTRY_crc128] = CRC128_SIZE_MAX, -}; - -static void bch2_extent_crc_pack(union bch_extent_crc *dst, - struct bch_extent_crc_unpacked src) -{ -#define set_common_fields(_dst, _src) \ - _dst.csum_type = _src.csum_type, \ - _dst.compression_type = _src.compression_type, \ - _dst._compressed_size = _src.compressed_size - 1, \ - _dst._uncompressed_size = _src.uncompressed_size - 1, \ - _dst.offset = _src.offset - - switch (extent_entry_type(to_entry(dst))) { - case BCH_EXTENT_ENTRY_crc32: - set_common_fields(dst->crc32, src); - dst->crc32.csum = *((__le32 *) &src.csum.lo); - break; - case BCH_EXTENT_ENTRY_crc64: - set_common_fields(dst->crc64, src); - dst->crc64.nonce = src.nonce; - dst->crc64.csum_lo = src.csum.lo; - dst->crc64.csum_hi = *((__le16 *) &src.csum.hi); - break; - case BCH_EXTENT_ENTRY_crc128: - set_common_fields(dst->crc128, src); - dst->crc128.nonce = src.nonce; - dst->crc128.csum = src.csum; - break; - default: - BUG(); - } -#undef set_common_fields -} - -static void bch2_extent_crc_append(struct 
bkey_i *k, - struct bch_extent_crc_unpacked new) -{ - struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(k)); - union bch_extent_crc *crc = (void *) ptrs.end; - - if (bch_crc_bytes[new.csum_type] <= 4 && - new.uncompressed_size - 1 <= CRC32_SIZE_MAX && - new.nonce <= CRC32_NONCE_MAX) - crc->type = 1 << BCH_EXTENT_ENTRY_crc32; - else if (bch_crc_bytes[new.csum_type] <= 10 && - new.uncompressed_size - 1 <= CRC64_SIZE_MAX && - new.nonce <= CRC64_NONCE_MAX) - crc->type = 1 << BCH_EXTENT_ENTRY_crc64; - else if (bch_crc_bytes[new.csum_type] <= 16 && - new.uncompressed_size - 1 <= CRC128_SIZE_MAX && - new.nonce <= CRC128_NONCE_MAX) - crc->type = 1 << BCH_EXTENT_ENTRY_crc128; - else - BUG(); - - bch2_extent_crc_pack(crc, new); - - k->k.u64s += extent_entry_u64s(ptrs.end); - - EBUG_ON(bkey_val_u64s(&k->k) > BKEY_EXTENT_VAL_U64s_MAX); -} - -static inline void __extent_entry_insert(struct bkey_i *k, - union bch_extent_entry *dst, - union bch_extent_entry *new) -{ - union bch_extent_entry *end = bkey_val_end(bkey_i_to_s(k)); - - memmove_u64s_up((u64 *) dst + extent_entry_u64s(new), - dst, (u64 *) end - (u64 *) dst); - k->k.u64s += extent_entry_u64s(new); - memcpy(dst, new, extent_entry_bytes(new)); -} - -void bch2_extent_ptr_decoded_append(struct bkey_i *k, - struct extent_ptr_decoded *p) -{ - struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(k)); - struct bch_extent_crc_unpacked crc = - bch2_extent_crc_unpack(&k->k, NULL); - union bch_extent_entry *pos; - unsigned i; - - if (!bch2_crc_unpacked_cmp(crc, p->crc)) { - pos = ptrs.start; - goto found; - } - - bkey_for_each_crc(&k->k, ptrs, crc, pos) - if (!bch2_crc_unpacked_cmp(crc, p->crc)) { - pos = extent_entry_next(pos); - goto found; - } - - bch2_extent_crc_append(k, p->crc); - pos = bkey_val_end(bkey_i_to_s(k)); -found: - p->ptr.type = 1 << BCH_EXTENT_ENTRY_ptr; - __extent_entry_insert(k, pos, to_entry(&p->ptr)); - - for (i = 0; i < p->ec_nr; i++) { - p->ec[i].type = 1 << BCH_EXTENT_ENTRY_stripe_ptr; - __extent_entry_insert(k, pos, to_entry(&p->ec[i])); - } -} - -/* - * bch_extent_normalize - clean up an extent, dropping stale pointers etc. - * - * Returns true if @k should be dropped entirely - * - * For existing keys, only called when btree nodes are being rewritten, not when - * they're merely being compacted/resorted in memory. 
- */ -bool bch2_extent_normalize(struct bch_fs *c, struct bkey_s k) -{ - struct bch_extent_ptr *ptr; - - bch2_bkey_drop_ptrs(k, ptr, - ptr->cached && - ptr_stale(bch_dev_bkey_exists(c, ptr->dev), ptr)); - - /* will only happen if all pointers were cached: */ - if (!bkey_val_u64s(k.k)) - k.k->type = KEY_TYPE_discard; - - return bkey_whiteout(k.k); -} - -void bch2_bkey_mark_replicas_cached(struct bch_fs *c, struct bkey_s k, - unsigned target, - unsigned nr_desired_replicas) -{ - struct bkey_ptrs ptrs = bch2_bkey_ptrs(k); - union bch_extent_entry *entry; - struct extent_ptr_decoded p; - int extra = bch2_bkey_durability(c, k.s_c) - nr_desired_replicas; - - if (target && extra > 0) - bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { - int n = bch2_extent_ptr_durability(c, p); - - if (n && n <= extra && - !bch2_dev_in_target(c, p.ptr.dev, target)) { - entry->ptr.cached = true; - extra -= n; - } - } - - if (extra > 0) - bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { - int n = bch2_extent_ptr_durability(c, p); - - if (n && n <= extra) { - entry->ptr.cached = true; - extra -= n; - } - } -} - -enum merge_result bch2_extent_merge(struct bch_fs *c, - struct bkey_s _l, struct bkey_s _r) -{ - struct bkey_s_extent l = bkey_s_to_extent(_l); - struct bkey_s_extent r = bkey_s_to_extent(_r); - union bch_extent_entry *en_l = l.v->start; - union bch_extent_entry *en_r = r.v->start; - struct bch_extent_crc_unpacked crc_l, crc_r; - - if (bkey_val_u64s(l.k) != bkey_val_u64s(r.k)) - return BCH_MERGE_NOMERGE; - - crc_l = bch2_extent_crc_unpack(l.k, NULL); - - extent_for_each_entry(l, en_l) { - en_r = vstruct_idx(r.v, (u64 *) en_l - l.v->_data); - - if (extent_entry_type(en_l) != extent_entry_type(en_r)) - return BCH_MERGE_NOMERGE; - - switch (extent_entry_type(en_l)) { - case BCH_EXTENT_ENTRY_ptr: { - const struct bch_extent_ptr *lp = &en_l->ptr; - const struct bch_extent_ptr *rp = &en_r->ptr; - struct bch_dev *ca; - - if (lp->offset + crc_l.compressed_size != rp->offset || - lp->dev != rp->dev || - lp->gen != rp->gen) - return BCH_MERGE_NOMERGE; - - /* We don't allow extents to straddle buckets: */ - ca = bch_dev_bkey_exists(c, lp->dev); - - if (PTR_BUCKET_NR(ca, lp) != PTR_BUCKET_NR(ca, rp)) - return BCH_MERGE_NOMERGE; - - break; - } - case BCH_EXTENT_ENTRY_stripe_ptr: - if (en_l->stripe_ptr.block != en_r->stripe_ptr.block || - en_l->stripe_ptr.idx != en_r->stripe_ptr.idx) - return BCH_MERGE_NOMERGE; - break; - case BCH_EXTENT_ENTRY_crc32: - case BCH_EXTENT_ENTRY_crc64: - case BCH_EXTENT_ENTRY_crc128: - crc_l = bch2_extent_crc_unpack(l.k, entry_to_crc(en_l)); - crc_r = bch2_extent_crc_unpack(r.k, entry_to_crc(en_r)); - - if (crc_l.csum_type != crc_r.csum_type || - crc_l.compression_type != crc_r.compression_type || - crc_l.nonce != crc_r.nonce) - return BCH_MERGE_NOMERGE; - - if (crc_l.offset + crc_l.live_size != crc_l.compressed_size || - crc_r.offset) - return BCH_MERGE_NOMERGE; - - if (!bch2_checksum_mergeable(crc_l.csum_type)) - return BCH_MERGE_NOMERGE; - - if (crc_l.compression_type) - return BCH_MERGE_NOMERGE; - - if (crc_l.csum_type && - crc_l.uncompressed_size + - crc_r.uncompressed_size > c->sb.encoded_extent_max) - return BCH_MERGE_NOMERGE; - - if (crc_l.uncompressed_size + crc_r.uncompressed_size - 1 > - bch2_crc_field_size_max[extent_entry_type(en_l)]) - return BCH_MERGE_NOMERGE; - - break; - default: - return BCH_MERGE_NOMERGE; - } + new_val_u64s = 0; } - extent_for_each_entry(l, en_l) { - struct bch_extent_crc_unpacked crc_l, crc_r; - - en_r = vstruct_idx(r.v, (u64 *) en_l - l.v->_data); - - if 
(!extent_entry_is_crc(en_l)) - continue; - - crc_l = bch2_extent_crc_unpack(l.k, entry_to_crc(en_l)); - crc_r = bch2_extent_crc_unpack(r.k, entry_to_crc(en_r)); - - crc_l.csum = bch2_checksum_merge(crc_l.csum_type, - crc_l.csum, - crc_r.csum, - crc_r.uncompressed_size << 9); - - crc_l.uncompressed_size += crc_r.uncompressed_size; - crc_l.compressed_size += crc_r.compressed_size; - - bch2_extent_crc_pack(entry_to_crc(en_l), crc_l); - } - - bch2_key_resize(l.k, l.k->size + r.k->size); - - return BCH_MERGE_MERGE; -} - -/* - * When merging an extent that we're inserting into a btree node, the new merged - * extent could overlap with an existing 0 size extent - if we don't fix that, - * it'll break the btree node iterator so this code finds those 0 size extents - * and shifts them out of the way. - * - * Also unpacks and repacks. - */ -static bool bch2_extent_merge_inline(struct bch_fs *c, - struct btree_iter *iter, - struct bkey_packed *l, - struct bkey_packed *r, - bool back_merge) -{ - struct btree *b = iter->l[0].b; - struct btree_node_iter *node_iter = &iter->l[0].iter; - BKEY_PADDED(k) li, ri; - struct bkey_packed *m = back_merge ? l : r; - struct bkey_i *mi = back_merge ? &li.k : &ri.k; - struct bset_tree *t = bch2_bkey_to_bset(b, m); - enum merge_result ret; - - EBUG_ON(bkey_written(b, m)); - - if (bkey_val_u64s(l) > BKEY_EXTENT_VAL_U64s_MAX || - bkey_val_u64s(r) > BKEY_EXTENT_VAL_U64s_MAX) - return BCH_MERGE_NOMERGE; - - /* - * We need to save copies of both l and r, because we might get a - * partial merge (which modifies both) and then fails to repack - */ - bch2_bkey_unpack(b, &li.k, l); - bch2_bkey_unpack(b, &ri.k, r); - - ret = bch2_bkey_merge(c, - bkey_i_to_s(&li.k), - bkey_i_to_s(&ri.k)); - if (ret == BCH_MERGE_NOMERGE) - return false; - - if (debug_check_bkeys(c)) - bch2_bkey_debugcheck(c, b, bkey_i_to_s_c(&li.k)); - if (debug_check_bkeys(c) && - ret == BCH_MERGE_PARTIAL) - bch2_bkey_debugcheck(c, b, bkey_i_to_s_c(&ri.k)); - - /* - * check if we overlap with deleted extents - would break the sort - * order: - */ - if (back_merge) { - struct bkey_packed *n = bkey_next(m); - - if (n != btree_bkey_last(b, t) && - bkey_cmp_left_packed(b, n, &li.k.k.p) <= 0 && - bkey_deleted(n)) - return false; - } else if (ret == BCH_MERGE_MERGE) { - struct bkey_packed *prev = bch2_bkey_prev_all(b, t, m); - - if (prev && - bkey_cmp_left_packed_byval(b, prev, - bkey_start_pos(&li.k.k)) > 0) - return false; - } - - if (ret == BCH_MERGE_PARTIAL) { - if (!extent_i_save(b, m, mi)) - return false; - - if (!back_merge) - bkey_copy(packed_to_bkey(l), &li.k); - else - bkey_copy(packed_to_bkey(r), &ri.k); - } else { - if (!extent_i_save(b, m, &li.k)) - return false; - } - - bch2_bset_fix_invalidated_key(b, m); - bch2_btree_node_iter_fix(iter, b, node_iter, - m, m->u64s, m->u64s); - - return ret == BCH_MERGE_MERGE; -} - -bool bch2_check_range_allocated(struct bch_fs *c, struct bpos pos, u64 size, - unsigned nr_replicas) -{ - struct btree_trans trans; - struct btree_iter *iter; - struct bpos end = pos; - struct bkey_s_c k; - bool ret = true; - int err; - - end.offset += size; - - bch2_trans_init(&trans, c, 0, 0); - - for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, pos, - BTREE_ITER_SLOTS, k, err) { - if (bkey_cmp(bkey_start_pos(k.k), end) >= 0) - break; - - if (nr_replicas > bch2_bkey_nr_ptrs_allocated(k)) { - ret = false; - break; - } - } - bch2_trans_exit(&trans); - - return ret; -} - -unsigned bch2_bkey_nr_ptrs_allocated(struct bkey_s_c k) -{ - unsigned ret = 0; - switch (k.k->type) { - case 
KEY_TYPE_extent: { - struct bkey_s_c_extent e = bkey_s_c_to_extent(k); - const union bch_extent_entry *entry; - struct extent_ptr_decoded p; - - extent_for_each_ptr_decode(e, p, entry) - ret += !p.ptr.cached && - p.crc.compression_type == BCH_COMPRESSION_NONE; + case KEY_TYPE_inline_data: + new_val_u64s = min(new_val_u64s, k.k->size << 6); break; } - case KEY_TYPE_reservation: - ret = bkey_s_c_to_reservation(k).v->nr_replicas; - break; - } - - return ret; -} - -/* KEY_TYPE_reservation: */ - -const char *bch2_reservation_invalid(const struct bch_fs *c, struct bkey_s_c k) -{ - struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k); - - if (bkey_val_bytes(k.k) != sizeof(struct bch_reservation)) - return "incorrect value size"; - - if (!r.v->nr_replicas || r.v->nr_replicas > BCH_REPLICAS_MAX) - return "invalid nr_replicas"; - - return NULL; -} - -void bch2_reservation_to_text(struct printbuf *out, struct bch_fs *c, - struct bkey_s_c k) -{ - struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k); - - pr_buf(out, "generation %u replicas %u", - le32_to_cpu(r.v->generation), - r.v->nr_replicas); -} -enum merge_result bch2_reservation_merge(struct bch_fs *c, - struct bkey_s _l, struct bkey_s _r) -{ - struct bkey_s_reservation l = bkey_s_to_reservation(_l); - struct bkey_s_reservation r = bkey_s_to_reservation(_r); + val_u64s_delta = bkey_val_u64s(k.k) - new_val_u64s; + BUG_ON(val_u64s_delta < 0); - if (l.v->generation != r.v->generation || - l.v->nr_replicas != r.v->nr_replicas) - return BCH_MERGE_NOMERGE; - - if ((u64) l.k->size + r.k->size > KEY_SIZE_MAX) { - bch2_key_resize(l.k, KEY_SIZE_MAX); - __bch2_cut_front(l.k->p, r.s); - return BCH_MERGE_PARTIAL; - } - - bch2_key_resize(l.k, l.k->size + r.k->size); - - return BCH_MERGE_MERGE; + set_bkey_val_u64s(k.k, new_val_u64s); + memset(bkey_val_end(k), 0, val_u64s_delta * sizeof(u64)); + return -val_u64s_delta; } diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h index 613d76af69d9..1140d01a42ab 100644 --- a/fs/bcachefs/extents.h +++ b/fs/bcachefs/extents.h @@ -40,6 +40,9 @@ struct btree_insert_entry; (union bch_extent_entry *) (_entry)); \ }) +#define extent_entry_next(_entry) \ + ((typeof(_entry)) ((void *) (_entry) + extent_entry_bytes(_entry))) + static inline unsigned __extent_entry_type(const union bch_extent_entry *e) { @@ -185,10 +188,52 @@ struct bkey_ptrs { union bch_extent_entry *end; }; -/* iterate over bkey ptrs */ +static inline struct bkey_ptrs_c bch2_bkey_ptrs_c(struct bkey_s_c k) +{ + switch (k.k->type) { + case KEY_TYPE_btree_ptr: { + struct bkey_s_c_btree_ptr e = bkey_s_c_to_btree_ptr(k); + return (struct bkey_ptrs_c) { + to_entry(&e.v->start[0]), + to_entry(extent_entry_last(e)) + }; + } + case KEY_TYPE_extent: { + struct bkey_s_c_extent e = bkey_s_c_to_extent(k); + return (struct bkey_ptrs_c) { + e.v->start, + extent_entry_last(e) + }; + } + case KEY_TYPE_stripe: { + struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k); + return (struct bkey_ptrs_c) { + to_entry(&s.v->ptrs[0]), + to_entry(&s.v->ptrs[s.v->nr_blocks]), + }; + } + case KEY_TYPE_reflink_v: { + struct bkey_s_c_reflink_v r = bkey_s_c_to_reflink_v(k); -#define extent_entry_next(_entry) \ - ((typeof(_entry)) ((void *) (_entry) + extent_entry_bytes(_entry))) + return (struct bkey_ptrs_c) { + r.v->start, + bkey_val_end(r), + }; + } + default: + return (struct bkey_ptrs_c) { NULL, NULL }; + } +} + +static inline struct bkey_ptrs bch2_bkey_ptrs(struct bkey_s k) +{ + struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k.s_c); + + return (struct bkey_ptrs) { + (void *) 
p.start, + (void *) p.end + }; +} #define __bkey_extent_entry_for_each_from(_start, _end, _entry) \ for ((_entry) = (_start); \ @@ -228,7 +273,7 @@ struct bkey_ptrs { __label__ out; \ \ (_ptr).idx = 0; \ - (_ptr).ec_nr = 0; \ + (_ptr).has_ec = false; \ \ __bkey_extent_entry_for_each_from(_entry, _end, _entry) \ switch (extent_entry_type(_entry)) { \ @@ -242,7 +287,8 @@ struct bkey_ptrs { entry_to_crc(_entry)); \ break; \ case BCH_EXTENT_ENTRY_stripe_ptr: \ - (_ptr).ec[(_ptr).ec_nr++] = _entry->stripe_ptr; \ + (_ptr).ec = _entry->stripe_ptr; \ + (_ptr).has_ec = true; \ break; \ } \ out: \ @@ -280,96 +326,26 @@ out: \ #define bkey_for_each_crc(_k, _p, _crc, _iter) \ __bkey_for_each_crc(_k, (_p).start, (_p).end, _crc, _iter) -/* utility code common to all keys with pointers: */ +/* Iterate over pointers in KEY_TYPE_extent: */ -static inline struct bkey_ptrs_c bch2_bkey_ptrs_c(struct bkey_s_c k) -{ - switch (k.k->type) { - case KEY_TYPE_btree_ptr: { - struct bkey_s_c_btree_ptr e = bkey_s_c_to_btree_ptr(k); - return (struct bkey_ptrs_c) { - to_entry(&e.v->start[0]), - to_entry(extent_entry_last(e)) - }; - } - case KEY_TYPE_extent: { - struct bkey_s_c_extent e = bkey_s_c_to_extent(k); - return (struct bkey_ptrs_c) { - e.v->start, - extent_entry_last(e) - }; - } - case KEY_TYPE_stripe: { - struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k); - return (struct bkey_ptrs_c) { - to_entry(&s.v->ptrs[0]), - to_entry(&s.v->ptrs[s.v->nr_blocks]), - }; - } - case KEY_TYPE_reflink_v: { - struct bkey_s_c_reflink_v r = bkey_s_c_to_reflink_v(k); - - return (struct bkey_ptrs_c) { - r.v->start, - bkey_val_end(r), - }; - } - default: - return (struct bkey_ptrs_c) { NULL, NULL }; - } -} - -static inline struct bkey_ptrs bch2_bkey_ptrs(struct bkey_s k) -{ - struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k.s_c); - - return (struct bkey_ptrs) { - (void *) p.start, - (void *) p.end - }; -} - -static inline struct bch_devs_list bch2_bkey_devs(struct bkey_s_c k) -{ - struct bch_devs_list ret = (struct bch_devs_list) { 0 }; - struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k); - const struct bch_extent_ptr *ptr; - - bkey_for_each_ptr(p, ptr) - ret.devs[ret.nr++] = ptr->dev; - - return ret; -} - -static inline struct bch_devs_list bch2_bkey_dirty_devs(struct bkey_s_c k) -{ - struct bch_devs_list ret = (struct bch_devs_list) { 0 }; - struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k); - const struct bch_extent_ptr *ptr; - - bkey_for_each_ptr(p, ptr) - if (!ptr->cached) - ret.devs[ret.nr++] = ptr->dev; +#define extent_for_each_entry_from(_e, _entry, _start) \ + __bkey_extent_entry_for_each_from(_start, \ + extent_entry_last(_e),_entry) - return ret; -} +#define extent_for_each_entry(_e, _entry) \ + extent_for_each_entry_from(_e, _entry, (_e).v->start) -static inline struct bch_devs_list bch2_bkey_cached_devs(struct bkey_s_c k) -{ - struct bch_devs_list ret = (struct bch_devs_list) { 0 }; - struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k); - const struct bch_extent_ptr *ptr; +#define extent_ptr_next(_e, _ptr) \ + __bkey_ptr_next(_ptr, extent_entry_last(_e)) - bkey_for_each_ptr(p, ptr) - if (ptr->cached) - ret.devs[ret.nr++] = ptr->dev; +#define extent_for_each_ptr(_e, _ptr) \ + __bkey_for_each_ptr(&(_e).v->start->ptr, extent_entry_last(_e), _ptr) - return ret; -} +#define extent_for_each_ptr_decode(_e, _ptr, _entry) \ + __bkey_for_each_ptr_decode((_e).k, (_e).v->start, \ + extent_entry_last(_e), _ptr, _entry) -unsigned bch2_bkey_nr_ptrs(struct bkey_s_c); -unsigned bch2_bkey_nr_dirty_ptrs(struct bkey_s_c); -unsigned bch2_bkey_durability(struct bch_fs *, 
struct bkey_s_c); +/* utility code common to all keys with pointers: */ void bch2_mark_io_failure(struct bch_io_failures *, struct extent_ptr_decoded *); @@ -377,22 +353,12 @@ int bch2_bkey_pick_read_device(struct bch_fs *, struct bkey_s_c, struct bch_io_failures *, struct extent_ptr_decoded *); -void bch2_bkey_append_ptr(struct bkey_i *, struct bch_extent_ptr); -void bch2_bkey_drop_device(struct bkey_s, unsigned); -const struct bch_extent_ptr *bch2_bkey_has_device(struct bkey_s_c, unsigned); -bool bch2_bkey_has_target(struct bch_fs *, struct bkey_s_c, unsigned); - -void bch2_bkey_ptrs_to_text(struct printbuf *, struct bch_fs *, - struct bkey_s_c); -const char *bch2_bkey_ptrs_invalid(const struct bch_fs *, struct bkey_s_c); - -/* bch_btree_ptr: */ +/* KEY_TYPE_btree_ptr: */ const char *bch2_btree_ptr_invalid(const struct bch_fs *, struct bkey_s_c); void bch2_btree_ptr_debugcheck(struct bch_fs *, struct bkey_s_c); void bch2_btree_ptr_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); -void bch2_ptr_swab(const struct bkey_format *, struct bkey_packed *); #define bch2_bkey_ops_btree_ptr (struct bkey_ops) { \ .key_invalid = bch2_btree_ptr_invalid, \ @@ -401,12 +367,11 @@ void bch2_ptr_swab(const struct bkey_format *, struct bkey_packed *); .swab = bch2_ptr_swab, \ } -/* bch_extent: */ +/* KEY_TYPE_extent: */ const char *bch2_extent_invalid(const struct bch_fs *, struct bkey_s_c); void bch2_extent_debugcheck(struct bch_fs *, struct bkey_s_c); void bch2_extent_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); -bool bch2_extent_normalize(struct bch_fs *, struct bkey_s); enum merge_result bch2_extent_merge(struct bch_fs *, struct bkey_s, struct bkey_s); @@ -419,7 +384,7 @@ enum merge_result bch2_extent_merge(struct bch_fs *, .key_merge = bch2_extent_merge, \ } -/* bch_reservation: */ +/* KEY_TYPE_reservation: */ const char *bch2_reservation_invalid(const struct bch_fs *, struct bkey_s_c); void bch2_reservation_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); @@ -432,27 +397,15 @@ enum merge_result bch2_reservation_merge(struct bch_fs *, .key_merge = bch2_reservation_merge, \ } -int bch2_extent_atomic_end(struct btree_iter *, struct bkey_i *, - struct bpos *); -int bch2_extent_trim_atomic(struct bkey_i *, struct btree_iter *); -int bch2_extent_is_atomic(struct bkey_i *, struct btree_iter *); - -enum btree_insert_ret -bch2_extent_can_insert(struct btree_trans *, struct btree_insert_entry *, - unsigned *); -void bch2_insert_fixup_extent(struct btree_trans *, - struct btree_insert_entry *); - -void bch2_bkey_mark_replicas_cached(struct bch_fs *, struct bkey_s, - unsigned, unsigned); - -const struct bch_extent_ptr * -bch2_extent_has_device(struct bkey_s_c_extent, unsigned); +/* Extent checksum entries: */ -unsigned bch2_extent_is_compressed(struct bkey_s_c); +bool bch2_can_narrow_extent_crcs(struct bkey_s_c, + struct bch_extent_crc_unpacked); +bool bch2_bkey_narrow_crcs(struct bkey_i *, struct bch_extent_crc_unpacked); +void bch2_extent_crc_append(struct bkey_i *, + struct bch_extent_crc_unpacked); -bool bch2_bkey_matches_ptr(struct bch_fs *, struct bkey_s_c, - struct bch_extent_ptr, u64); +/* Generic code for keys with pointers: */ static inline bool bkey_extent_is_direct_data(const struct bkey *k) { @@ -469,6 +422,7 @@ static inline bool bkey_extent_is_direct_data(const struct bkey *k) static inline bool bkey_extent_is_data(const struct bkey *k) { return bkey_extent_is_direct_data(k) || + k->type == KEY_TYPE_inline_data || k->type == KEY_TYPE_reflink_p; } @@ 
-482,38 +436,64 @@ static inline bool bkey_extent_is_allocation(const struct bkey *k) case KEY_TYPE_reservation: case KEY_TYPE_reflink_p: case KEY_TYPE_reflink_v: + case KEY_TYPE_inline_data: return true; default: return false; } } -/* Extent entry iteration: */ +static inline struct bch_devs_list bch2_bkey_devs(struct bkey_s_c k) +{ + struct bch_devs_list ret = (struct bch_devs_list) { 0 }; + struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k); + const struct bch_extent_ptr *ptr; -#define extent_for_each_entry_from(_e, _entry, _start) \ - __bkey_extent_entry_for_each_from(_start, \ - extent_entry_last(_e),_entry) + bkey_for_each_ptr(p, ptr) + ret.devs[ret.nr++] = ptr->dev; -#define extent_for_each_entry(_e, _entry) \ - extent_for_each_entry_from(_e, _entry, (_e).v->start) + return ret; +} -#define extent_ptr_next(_e, _ptr) \ - __bkey_ptr_next(_ptr, extent_entry_last(_e)) +static inline struct bch_devs_list bch2_bkey_dirty_devs(struct bkey_s_c k) +{ + struct bch_devs_list ret = (struct bch_devs_list) { 0 }; + struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k); + const struct bch_extent_ptr *ptr; -#define extent_for_each_ptr(_e, _ptr) \ - __bkey_for_each_ptr(&(_e).v->start->ptr, extent_entry_last(_e), _ptr) + bkey_for_each_ptr(p, ptr) + if (!ptr->cached) + ret.devs[ret.nr++] = ptr->dev; -#define extent_for_each_ptr_decode(_e, _ptr, _entry) \ - __bkey_for_each_ptr_decode((_e).k, (_e).v->start, \ - extent_entry_last(_e), _ptr, _entry) + return ret; +} -void bch2_extent_ptr_decoded_append(struct bkey_i *, - struct extent_ptr_decoded *); +static inline struct bch_devs_list bch2_bkey_cached_devs(struct bkey_s_c k) +{ + struct bch_devs_list ret = (struct bch_devs_list) { 0 }; + struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k); + const struct bch_extent_ptr *ptr; -bool bch2_can_narrow_extent_crcs(struct bkey_s_c, - struct bch_extent_crc_unpacked); -bool bch2_bkey_narrow_crcs(struct bkey_i *, struct bch_extent_crc_unpacked); + bkey_for_each_ptr(p, ptr) + if (ptr->cached) + ret.devs[ret.nr++] = ptr->dev; + return ret; +} + +unsigned bch2_bkey_nr_ptrs(struct bkey_s_c); +unsigned bch2_bkey_nr_ptrs_allocated(struct bkey_s_c); +unsigned bch2_bkey_nr_ptrs_fully_allocated(struct bkey_s_c); +unsigned bch2_bkey_sectors_compressed(struct bkey_s_c); +bool bch2_check_range_allocated(struct bch_fs *, struct bpos, u64, unsigned); +unsigned bch2_bkey_durability(struct bch_fs *, struct bkey_s_c); + +void bch2_bkey_mark_replicas_cached(struct bch_fs *, struct bkey_s, + unsigned, unsigned); + +void bch2_bkey_append_ptr(struct bkey_i *, struct bch_extent_ptr); +void bch2_extent_ptr_decoded_append(struct bkey_i *, + struct extent_ptr_decoded *); union bch_extent_entry *bch2_bkey_drop_ptr(struct bkey_s, struct bch_extent_ptr *); @@ -534,14 +514,34 @@ do { \ } \ } while (0) -void __bch2_cut_front(struct bpos, struct bkey_s); +void bch2_bkey_drop_device(struct bkey_s, unsigned); +const struct bch_extent_ptr *bch2_bkey_has_device(struct bkey_s_c, unsigned); +bool bch2_bkey_has_target(struct bch_fs *, struct bkey_s_c, unsigned); + +bool bch2_bkey_matches_ptr(struct bch_fs *, struct bkey_s_c, + struct bch_extent_ptr, u64); + +bool bch2_extent_normalize(struct bch_fs *, struct bkey_s); +void bch2_bkey_ptrs_to_text(struct printbuf *, struct bch_fs *, + struct bkey_s_c); +const char *bch2_bkey_ptrs_invalid(const struct bch_fs *, struct bkey_s_c); + +void bch2_ptr_swab(const struct bkey_format *, struct bkey_packed *); + +/* Generic extent code: */ + +int bch2_cut_front_s(struct bpos, struct bkey_s); +int bch2_cut_back_s(struct bpos, struct bkey_s); 
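+
+/*
+ * The _s variants return 0 or minus the number of u64s the value shrank
+ * by; the inline wrappers below simply discard that result.
+ */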
static inline void bch2_cut_front(struct bpos where, struct bkey_i *k) { - __bch2_cut_front(where, bkey_i_to_s(k)); + bch2_cut_front_s(where, bkey_i_to_s(k)); } -bool bch2_cut_back(struct bpos, struct bkey *); +static inline void bch2_cut_back(struct bpos where, struct bkey_i *k) +{ + bch2_cut_back_s(where, bkey_i_to_s(k)); +} /** * bch_key_resize - adjust size of @k @@ -573,7 +573,4 @@ static inline void extent_save(struct btree *b, struct bkey_packed *dst, BUG_ON(!bch2_bkey_pack_key(dst, src, f)); } -bool bch2_check_range_allocated(struct bch_fs *, struct bpos, u64, unsigned); -unsigned bch2_bkey_nr_ptrs_allocated(struct bkey_s_c); - #endif /* _BCACHEFS_EXTENTS_H */ diff --git a/fs/bcachefs/extents_types.h b/fs/bcachefs/extents_types.h index a8dd6952d989..43d6c341ecca 100644 --- a/fs/bcachefs/extents_types.h +++ b/fs/bcachefs/extents_types.h @@ -21,10 +21,10 @@ struct bch_extent_crc_unpacked { struct extent_ptr_decoded { unsigned idx; - unsigned ec_nr; + bool has_ec; struct bch_extent_crc_unpacked crc; struct bch_extent_ptr ptr; - struct bch_extent_stripe_ptr ec[4]; + struct bch_extent_stripe_ptr ec; }; struct bch_io_failures { diff --git a/fs/bcachefs/fs-common.c b/fs/bcachefs/fs-common.c new file mode 100644 index 000000000000..a4497eeb1f1b --- /dev/null +++ b/fs/bcachefs/fs-common.c @@ -0,0 +1,281 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" +#include "acl.h" +#include "btree_update.h" +#include "dirent.h" +#include "fs-common.h" +#include "inode.h" +#include "xattr.h" + +#include <linux/posix_acl.h> + +int bch2_create_trans(struct btree_trans *trans, u64 dir_inum, + struct bch_inode_unpacked *dir_u, + struct bch_inode_unpacked *new_inode, + const struct qstr *name, + uid_t uid, gid_t gid, umode_t mode, dev_t rdev, + struct posix_acl *default_acl, + struct posix_acl *acl) +{ + struct bch_fs *c = trans->c; + struct btree_iter *dir_iter; + struct bch_hash_info hash = bch2_hash_info_init(c, new_inode); + u64 now = bch2_current_time(trans->c); + int ret; + + dir_iter = bch2_inode_peek(trans, dir_u, dir_inum, BTREE_ITER_INTENT); + if (IS_ERR(dir_iter)) + return PTR_ERR(dir_iter); + + bch2_inode_init_late(new_inode, now, uid, gid, mode, rdev, dir_u); + + if (!name) + new_inode->bi_flags |= BCH_INODE_UNLINKED; + + ret = bch2_inode_create(trans, new_inode, + BLOCKDEV_INODE_MAX, 0, + &c->unused_inode_hint); + if (ret) + return ret; + + if (default_acl) { + ret = bch2_set_acl_trans(trans, new_inode, &hash, + default_acl, ACL_TYPE_DEFAULT); + if (ret) + return ret; + } + + if (acl) { + ret = bch2_set_acl_trans(trans, new_inode, &hash, + acl, ACL_TYPE_ACCESS); + if (ret) + return ret; + } + + if (name) { + struct bch_hash_info dir_hash = bch2_hash_info_init(c, dir_u); + dir_u->bi_mtime = dir_u->bi_ctime = now; + + if (S_ISDIR(new_inode->bi_mode)) + dir_u->bi_nlink++; + + ret = bch2_inode_write(trans, dir_iter, dir_u); + if (ret) + return ret; + + ret = bch2_dirent_create(trans, dir_inum, &dir_hash, + mode_to_type(new_inode->bi_mode), + name, new_inode->bi_inum, + BCH_HASH_SET_MUST_CREATE); + if (ret) + return ret; + } + + return 0; +} + +int bch2_link_trans(struct btree_trans *trans, u64 dir_inum, + u64 inum, struct bch_inode_unpacked *inode_u, + const struct qstr *name) +{ + struct btree_iter *dir_iter, *inode_iter; + struct bch_inode_unpacked dir_u; + struct bch_hash_info dir_hash; + u64 now = bch2_current_time(trans->c); + + inode_iter = bch2_inode_peek(trans, inode_u, inum, BTREE_ITER_INTENT); + if (IS_ERR(inode_iter)) + return PTR_ERR(inode_iter); + + inode_u->bi_ctime = 
now; + bch2_inode_nlink_inc(inode_u); + + dir_iter = bch2_inode_peek(trans, &dir_u, dir_inum, 0); + if (IS_ERR(dir_iter)) + return PTR_ERR(dir_iter); + + /* XXX: shouldn't we be updating mtime/ctime on the directory? */ + + dir_hash = bch2_hash_info_init(trans->c, &dir_u); + bch2_trans_iter_put(trans, dir_iter); + + return bch2_dirent_create(trans, dir_inum, &dir_hash, + mode_to_type(inode_u->bi_mode), + name, inum, BCH_HASH_SET_MUST_CREATE) ?: + bch2_inode_write(trans, inode_iter, inode_u); +} + +int bch2_unlink_trans(struct btree_trans *trans, + u64 dir_inum, struct bch_inode_unpacked *dir_u, + struct bch_inode_unpacked *inode_u, + const struct qstr *name) +{ + struct btree_iter *dir_iter, *dirent_iter, *inode_iter; + struct bch_hash_info dir_hash; + u64 inum, now = bch2_current_time(trans->c); + struct bkey_s_c k; + + dir_iter = bch2_inode_peek(trans, dir_u, dir_inum, BTREE_ITER_INTENT); + if (IS_ERR(dir_iter)) + return PTR_ERR(dir_iter); + + dir_hash = bch2_hash_info_init(trans->c, dir_u); + + dirent_iter = __bch2_dirent_lookup_trans(trans, dir_inum, &dir_hash, + name, BTREE_ITER_INTENT); + if (IS_ERR(dirent_iter)) + return PTR_ERR(dirent_iter); + + k = bch2_btree_iter_peek_slot(dirent_iter); + inum = le64_to_cpu(bkey_s_c_to_dirent(k).v->d_inum); + + inode_iter = bch2_inode_peek(trans, inode_u, inum, BTREE_ITER_INTENT); + if (IS_ERR(inode_iter)) + return PTR_ERR(inode_iter); + + dir_u->bi_mtime = dir_u->bi_ctime = inode_u->bi_ctime = now; + dir_u->bi_nlink -= S_ISDIR(inode_u->bi_mode); + bch2_inode_nlink_dec(inode_u); + + return (S_ISDIR(inode_u->bi_mode) + ? bch2_empty_dir_trans(trans, inum) + : 0) ?: + bch2_dirent_delete_at(trans, &dir_hash, dirent_iter) ?: + bch2_inode_write(trans, dir_iter, dir_u) ?: + bch2_inode_write(trans, inode_iter, inode_u); +} + +bool bch2_reinherit_attrs(struct bch_inode_unpacked *dst_u, + struct bch_inode_unpacked *src_u) +{ + u64 src, dst; + unsigned id; + bool ret = false; + + for (id = 0; id < Inode_opt_nr; id++) { + if (dst_u->bi_fields_set & (1 << id)) + continue; + + src = bch2_inode_opt_get(src_u, id); + dst = bch2_inode_opt_get(dst_u, id); + + if (src == dst) + continue; + + bch2_inode_opt_set(dst_u, id, src); + ret = true; + } + + return ret; +} + +int bch2_rename_trans(struct btree_trans *trans, + u64 src_dir, struct bch_inode_unpacked *src_dir_u, + u64 dst_dir, struct bch_inode_unpacked *dst_dir_u, + struct bch_inode_unpacked *src_inode_u, + struct bch_inode_unpacked *dst_inode_u, + const struct qstr *src_name, + const struct qstr *dst_name, + enum bch_rename_mode mode) +{ + struct btree_iter *src_dir_iter, *dst_dir_iter = NULL; + struct btree_iter *src_inode_iter, *dst_inode_iter = NULL; + struct bch_hash_info src_hash, dst_hash; + u64 src_inode, dst_inode, now = bch2_current_time(trans->c); + int ret; + + src_dir_iter = bch2_inode_peek(trans, src_dir_u, src_dir, + BTREE_ITER_INTENT); + if (IS_ERR(src_dir_iter)) + return PTR_ERR(src_dir_iter); + + src_hash = bch2_hash_info_init(trans->c, src_dir_u); + + if (dst_dir != src_dir) { + dst_dir_iter = bch2_inode_peek(trans, dst_dir_u, dst_dir, + BTREE_ITER_INTENT); + if (IS_ERR(dst_dir_iter)) + return PTR_ERR(dst_dir_iter); + + dst_hash = bch2_hash_info_init(trans->c, dst_dir_u); + } else { + dst_dir_u = src_dir_u; + dst_hash = src_hash; + } + + ret = bch2_dirent_rename(trans, + src_dir, &src_hash, + dst_dir, &dst_hash, + src_name, &src_inode, + dst_name, &dst_inode, + mode); + if (ret) + return ret; + + src_inode_iter = bch2_inode_peek(trans, src_inode_u, src_inode, + BTREE_ITER_INTENT); + if 
(IS_ERR(src_inode_iter)) + return PTR_ERR(src_inode_iter); + + if (dst_inode) { + dst_inode_iter = bch2_inode_peek(trans, dst_inode_u, dst_inode, + BTREE_ITER_INTENT); + if (IS_ERR(dst_inode_iter)) + return PTR_ERR(dst_inode_iter); + } + + if (mode == BCH_RENAME_OVERWRITE) { + if (S_ISDIR(src_inode_u->bi_mode) != + S_ISDIR(dst_inode_u->bi_mode)) + return -ENOTDIR; + + if (S_ISDIR(dst_inode_u->bi_mode) && + bch2_empty_dir_trans(trans, dst_inode)) + return -ENOTEMPTY; + } + + if (bch2_reinherit_attrs(src_inode_u, dst_dir_u) && + S_ISDIR(src_inode_u->bi_mode)) + return -EXDEV; + + if (mode == BCH_RENAME_EXCHANGE && + bch2_reinherit_attrs(dst_inode_u, src_dir_u) && + S_ISDIR(dst_inode_u->bi_mode)) + return -EXDEV; + + if (S_ISDIR(src_inode_u->bi_mode)) { + src_dir_u->bi_nlink--; + dst_dir_u->bi_nlink++; + } + + if (dst_inode && S_ISDIR(dst_inode_u->bi_mode)) { + dst_dir_u->bi_nlink--; + src_dir_u->bi_nlink += mode == BCH_RENAME_EXCHANGE; + } + + if (mode == BCH_RENAME_OVERWRITE) + bch2_inode_nlink_dec(dst_inode_u); + + src_dir_u->bi_mtime = now; + src_dir_u->bi_ctime = now; + + if (src_dir != dst_dir) { + dst_dir_u->bi_mtime = now; + dst_dir_u->bi_ctime = now; + } + + src_inode_u->bi_ctime = now; + + if (dst_inode) + dst_inode_u->bi_ctime = now; + + return bch2_inode_write(trans, src_dir_iter, src_dir_u) ?: + (src_dir != dst_dir + ? bch2_inode_write(trans, dst_dir_iter, dst_dir_u) + : 0 ) ?: + bch2_inode_write(trans, src_inode_iter, src_inode_u) ?: + (dst_inode + ? bch2_inode_write(trans, dst_inode_iter, dst_inode_u) + : 0 ); +} diff --git a/fs/bcachefs/fs-common.h b/fs/bcachefs/fs-common.h new file mode 100644 index 000000000000..c1621485a526 --- /dev/null +++ b/fs/bcachefs/fs-common.h @@ -0,0 +1,36 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_FS_COMMON_H +#define _BCACHEFS_FS_COMMON_H + +struct posix_acl; + +int bch2_create_trans(struct btree_trans *, u64, + struct bch_inode_unpacked *, + struct bch_inode_unpacked *, + const struct qstr *, + uid_t, gid_t, umode_t, dev_t, + struct posix_acl *, + struct posix_acl *); + +int bch2_link_trans(struct btree_trans *, u64, + u64, struct bch_inode_unpacked *, + const struct qstr *); + +int bch2_unlink_trans(struct btree_trans *, + u64, struct bch_inode_unpacked *, + struct bch_inode_unpacked *, + const struct qstr *); + +int bch2_rename_trans(struct btree_trans *, + u64, struct bch_inode_unpacked *, + u64, struct bch_inode_unpacked *, + struct bch_inode_unpacked *, + struct bch_inode_unpacked *, + const struct qstr *, + const struct qstr *, + enum bch_rename_mode); + +bool bch2_reinherit_attrs(struct bch_inode_unpacked *, + struct bch_inode_unpacked *); + +#endif /* _BCACHEFS_FS_COMMON_H */ diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index 04bcf061ca12..160644ccf439 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -3,11 +3,13 @@ #include "bcachefs.h" #include "alloc_foreground.h" +#include "bkey_on_stack.h" #include "btree_update.h" #include "buckets.h" #include "clock.h" #include "error.h" #include "extents.h" +#include "extent_update.h" #include "fs.h" #include "fs-io.h" #include "fsck.h" @@ -36,27 +38,16 @@ struct quota_res { u64 sectors; }; -struct bchfs_write_op { - struct bch_inode_info *inode; - s64 sectors_added; - bool is_dio; - bool unalloc; - u64 new_i_size; - - /* must be last: */ - struct bch_write_op op; -}; - struct bch_writepage_io { struct closure cl; - u64 new_sectors; + struct bch_inode_info *inode; /* must be last: */ - struct bchfs_write_op op; + struct bch_write_op op; }; struct dio_write 
{ - struct closure cl; + struct completion done; struct kiocb *req; struct mm_struct *mm; unsigned loop:1, @@ -68,7 +59,7 @@ struct dio_write { struct iovec inline_vecs[2]; /* must be last: */ - struct bchfs_write_op iop; + struct bch_write_op op; }; struct dio_read { @@ -229,277 +220,6 @@ static void i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode, mutex_unlock(&inode->ei_quota_lock); } -/* normal i_size/i_sectors update machinery: */ - -static int sum_sector_overwrites(struct btree_trans *trans, - struct btree_iter *extent_iter, - struct bkey_i *new, bool *allocating, - s64 *delta) -{ - struct btree_iter *iter; - struct bkey_s_c old; - - *delta = 0; - - iter = bch2_trans_copy_iter(trans, extent_iter); - if (IS_ERR(iter)) - return PTR_ERR(iter); - - old = bch2_btree_iter_peek_slot(iter); - - while (1) { - /* - * should not be possible to get an error here, since we're - * carefully not advancing past @new and thus whatever leaf node - * @_iter currently points to: - */ - BUG_ON(bkey_err(old)); - - if (allocating && - !*allocating && - bch2_bkey_nr_ptrs_allocated(old) < - bch2_bkey_nr_dirty_ptrs(bkey_i_to_s_c(new))) - *allocating = true; - - *delta += (min(new->k.p.offset, - old.k->p.offset) - - max(bkey_start_offset(&new->k), - bkey_start_offset(old.k))) * - (bkey_extent_is_allocation(&new->k) - - bkey_extent_is_allocation(old.k)); - - if (bkey_cmp(old.k->p, new->k.p) >= 0) - break; - - old = bch2_btree_iter_next_slot(iter); - } - - bch2_trans_iter_free(trans, iter); - return 0; -} - -int bch2_extent_update(struct btree_trans *trans, - struct bch_inode_info *inode, - struct disk_reservation *disk_res, - struct quota_res *quota_res, - struct btree_iter *extent_iter, - struct bkey_i *k, - u64 new_i_size, - bool may_allocate, - bool direct, - s64 *total_delta) -{ - struct bch_fs *c = trans->c; - struct btree_iter *inode_iter = NULL; - struct bch_inode_unpacked inode_u; - struct bkey_inode_buf inode_p; - bool allocating = false; - bool extended = false; - bool inode_locked = false; - s64 i_sectors_delta; - int ret; - - ret = bch2_btree_iter_traverse(extent_iter); - if (ret) - return ret; - - ret = bch2_extent_trim_atomic(k, extent_iter); - if (ret) - return ret; - - ret = sum_sector_overwrites(trans, extent_iter, - k, &allocating, - &i_sectors_delta); - if (ret) - return ret; - - if (!may_allocate && allocating) - return -ENOSPC; - - bch2_trans_update(trans, BTREE_INSERT_ENTRY(extent_iter, k)); - - new_i_size = min(k->k.p.offset << 9, new_i_size); - - /* XXX: inode->i_size locking */ - if (i_sectors_delta || - new_i_size > inode->ei_inode.bi_size) { - if (c->opts.new_inode_updates) { - bch2_trans_unlock(trans); - mutex_lock(&inode->ei_update_lock); - - if (!bch2_trans_relock(trans)) { - mutex_unlock(&inode->ei_update_lock); - return -EINTR; - } - - inode_locked = true; - - if (!inode->ei_inode_update) - inode->ei_inode_update = - bch2_deferred_update_alloc(c, - BTREE_ID_INODES, 64); - - inode_u = inode->ei_inode; - inode_u.bi_sectors += i_sectors_delta; - - /* XXX: this is slightly suspect */ - if (!(inode_u.bi_flags & BCH_INODE_I_SIZE_DIRTY) && - new_i_size > inode_u.bi_size) { - inode_u.bi_size = new_i_size; - extended = true; - } - - bch2_inode_pack(&inode_p, &inode_u); - bch2_trans_update(trans, - BTREE_INSERT_DEFERRED(inode->ei_inode_update, - &inode_p.inode.k_i)); - } else { - inode_iter = bch2_trans_get_iter(trans, - BTREE_ID_INODES, - POS(k->k.p.inode, 0), - BTREE_ITER_SLOTS|BTREE_ITER_INTENT); - if (IS_ERR(inode_iter)) - return PTR_ERR(inode_iter); - - ret = 
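/*
 * The arithmetic in sum_sector_overwrites (removed here; the extent
 * update path now lives behind extent_update.h, included above)
 * reduces to: for each existing extent the new key overlaps, i_sectors
 * changes by the overlap length times the change in allocation state.
 * The same computation standalone:
 */
#include <stdio.h>

static long long overlap_delta(unsigned long long new_start,
			       unsigned long long new_end,
			       int new_allocated,
			       unsigned long long old_start,
			       unsigned long long old_end,
			       int old_allocated)
{
	unsigned long long start = old_start > new_start ? old_start : new_start;
	unsigned long long end   = old_end   < new_end   ? old_end   : new_end;

	if (end <= start)
		return 0;	/* no overlap, no accounting change */

	return (long long)(end - start) * (new_allocated - old_allocated);
}

int main(void)
{
	/* an allocated write [8, 24) over a hole [0, 32): i_sectors += 16 */
	printf("%lld\n", overlap_delta(8, 24, 1, 0, 32, 0));
	return 0;
}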
bch2_btree_iter_traverse(inode_iter); - if (ret) - goto err; - - inode_u = inode->ei_inode; - inode_u.bi_sectors += i_sectors_delta; - - /* XXX: this is slightly suspect */ - if (!(inode_u.bi_flags & BCH_INODE_I_SIZE_DIRTY) && - new_i_size > inode_u.bi_size) { - inode_u.bi_size = new_i_size; - extended = true; - } - - bch2_inode_pack(&inode_p, &inode_u); - bch2_trans_update(trans, - BTREE_INSERT_ENTRY(inode_iter, &inode_p.inode.k_i)); - } - } - - ret = bch2_trans_commit(trans, disk_res, - &inode->ei_journal_seq, - BTREE_INSERT_NOFAIL| - BTREE_INSERT_ATOMIC| - BTREE_INSERT_NOUNLOCK| - BTREE_INSERT_USE_RESERVE); - if (ret) - goto err; - - inode->ei_inode.bi_sectors += i_sectors_delta; - - EBUG_ON(i_sectors_delta && - inode->ei_inode.bi_sectors != inode_u.bi_sectors); - - if (extended) { - inode->ei_inode.bi_size = new_i_size; - - if (direct) { - spin_lock(&inode->v.i_lock); - if (new_i_size > inode->v.i_size) - i_size_write(&inode->v, new_i_size); - spin_unlock(&inode->v.i_lock); - } - } - - if (direct) - i_sectors_acct(c, inode, quota_res, i_sectors_delta); - - if (total_delta) - *total_delta += i_sectors_delta; -err: - if (!IS_ERR_OR_NULL(inode_iter)) - bch2_trans_iter_put(trans, inode_iter); - if (inode_locked) - mutex_unlock(&inode->ei_update_lock); - - return ret; -} - -static int bchfs_write_index_update(struct bch_write_op *wop) -{ - struct bch_fs *c = wop->c; - struct bchfs_write_op *op = container_of(wop, - struct bchfs_write_op, op); - struct quota_res *quota_res = op->is_dio - ? &container_of(op, struct dio_write, iop)->quota_res - : NULL; - struct bch_inode_info *inode = op->inode; - struct keylist *keys = &op->op.insert_keys; - struct bkey_i *k = bch2_keylist_front(keys); - struct btree_trans trans; - struct btree_iter *iter; - int ret; - - BUG_ON(k->k.p.inode != inode->v.i_ino); - - bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024); - - iter = bch2_trans_get_iter(&trans, - BTREE_ID_EXTENTS, - bkey_start_pos(&k->k), - BTREE_ITER_INTENT); - - do { - BKEY_PADDED(k) tmp; - - bkey_copy(&tmp.k, bch2_keylist_front(keys)); - - bch2_trans_begin_updates(&trans); - - ret = bch2_extent_update(&trans, inode, - &wop->res, quota_res, - iter, &tmp.k, - op->new_i_size, - !op->unalloc, - op->is_dio, - &op->sectors_added); - if (ret == -EINTR) - continue; - if (ret) - break; - - if (bkey_cmp(iter->pos, bch2_keylist_front(keys)->k.p) < 0) - bch2_cut_front(iter->pos, bch2_keylist_front(keys)); - else - bch2_keylist_pop_front(keys); - } while (!bch2_keylist_empty(keys)); - - bch2_trans_exit(&trans); - - return ret; -} - -static inline void bch2_fswrite_op_init(struct bchfs_write_op *op, - struct bch_fs *c, - struct bch_inode_info *inode, - struct bch_io_opts opts, - bool is_dio) -{ - op->inode = inode; - op->sectors_added = 0; - op->is_dio = is_dio; - op->unalloc = false; - op->new_i_size = U64_MAX; - - bch2_write_op_init(&op->op, c, opts); - op->op.target = opts.foreground_target; - op->op.index_update_fn = bchfs_write_index_update; - op_journal_seq_set(&op->op, &inode->ei_journal_seq); -} - -static inline struct bch_io_opts io_opts(struct bch_fs *c, struct bch_inode_info *inode) -{ - struct bch_io_opts opts = bch2_opts_to_inode_opts(c->opts); - - bch2_io_opts_apply(&opts, bch2_inode_opts_get(&inode->ei_inode)); - return opts; -} - /* page state: */ /* stored in page->private: */ @@ -521,6 +241,7 @@ struct bch_page_sector { }; struct bch_page_state { + spinlock_t lock; atomic_t write_count; struct bch_page_sector s[PAGE_SECTORS]; }; @@ -576,6 +297,7 @@ static struct bch_page_state 
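/*
 * io_opts() -- note its callers below now pass &inode->ei_inode rather
 * than the vfs inode -- layers per-inode options over the filesystem
 * defaults via bch2_opts_to_inode_opts() + bch2_io_opts_apply().  The
 * idea in miniature; the field names here are illustrative, not the
 * real bch_io_opts layout:
 */
#include <stdio.h>

struct opts { int data_replicas; unsigned set_mask; };
#define OPT_REPLICAS_SET 1u

static struct opts apply_inode_opts(struct opts fs, struct opts inode)
{
	if (inode.set_mask & OPT_REPLICAS_SET)	/* inode override wins */
		fs.data_replicas = inode.data_replicas;
	return fs;
}

int main(void)
{
	struct opts fs    = { .data_replicas = 2 };
	struct opts inode = { .data_replicas = 3, .set_mask = OPT_REPLICAS_SET };

	printf("%d\n", apply_inode_opts(fs, inode).data_replicas);	/* 3 */
	return 0;
}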
*__bch2_page_state_create(struct page *page, if (!s) return NULL; + spin_lock_init(&s->lock); /* * migrate_page_move_mapping() assumes that pages with private data * have their count elevated by 1. @@ -723,6 +445,9 @@ static void bch2_clear_page_bits(struct page *page) if (!s) return; + EBUG_ON(!PageLocked(page)); + EBUG_ON(PageWriteback(page)); + for (i = 0; i < ARRAY_SIZE(s->s); i++) { disk_res.sectors += s->s[i].replicas_reserved; s->s[i].replicas_reserved = 0; @@ -749,13 +474,23 @@ static void bch2_set_page_dirty(struct bch_fs *c, struct bch_page_state *s = bch2_page_state(page); unsigned i, dirty_sectors = 0; + WARN_ON((u64) page_offset(page) + offset + len > + round_up((u64) i_size_read(&inode->v), block_bytes(c))); + + spin_lock(&s->lock); + for (i = round_down(offset, block_bytes(c)) >> 9; i < round_up(offset + len, block_bytes(c)) >> 9; i++) { unsigned sectors = sectors_to_reserve(&s->s[i], res->disk.nr_replicas); - BUG_ON(sectors > res->disk.sectors); + /* + * This can happen if we race with the error path in + * bch2_writepage_io_done(): + */ + sectors = min_t(unsigned, sectors, res->disk.sectors); + s->s[i].replicas_reserved += sectors; res->disk.sectors -= sectors; @@ -765,6 +500,8 @@ static void bch2_set_page_dirty(struct bch_fs *c, s->s[i].state = max_t(unsigned, s->s[i].state, SECTOR_DIRTY); } + spin_unlock(&s->lock); + if (dirty_sectors) i_sectors_acct(c, inode, &res->quota, dirty_sectors); @@ -772,12 +509,25 @@ static void bch2_set_page_dirty(struct bch_fs *c, __set_page_dirty_nobuffers(page); } +vm_fault_t bch2_page_fault(struct vm_fault *vmf) +{ + struct file *file = vmf->vma->vm_file; + struct bch_inode_info *inode = file_bch_inode(file); + int ret; + + bch2_pagecache_add_get(&inode->ei_pagecache_lock); + ret = filemap_fault(vmf); + bch2_pagecache_add_put(&inode->ei_pagecache_lock); + + return ret; +} + vm_fault_t bch2_page_mkwrite(struct vm_fault *vmf) { struct page *page = vmf->page; struct file *file = vmf->vma->vm_file; struct bch_inode_info *inode = file_bch_inode(file); - struct address_space *mapping = inode->v.i_mapping; + struct address_space *mapping = file->f_mapping; struct bch_fs *c = inode->v.i_sb->s_fs_info; struct bch2_page_reservation res; unsigned len; @@ -795,8 +545,7 @@ vm_fault_t bch2_page_mkwrite(struct vm_fault *vmf) * a write_invalidate_inode_pages_range() that works without dropping * page lock before invalidating page */ - if (current->pagecache_lock != &mapping->add_lock) - pagecache_add_get(&mapping->add_lock); + bch2_pagecache_add_get(&inode->ei_pagecache_lock); lock_page(page); isize = i_size_read(&inode->v); @@ -807,11 +556,7 @@ vm_fault_t bch2_page_mkwrite(struct vm_fault *vmf) goto out; } - /* page is wholly or partially inside EOF */ - if (((page->index + 1) << PAGE_SHIFT) <= isize) - len = PAGE_SIZE; - else - len = offset_in_page(isize); + len = min_t(loff_t, PAGE_SIZE, isize - page_offset(page)); if (bch2_page_reservation_get(c, inode, page, &res, 0, len, true)) { unlock_page(page); @@ -820,23 +565,19 @@ vm_fault_t bch2_page_mkwrite(struct vm_fault *vmf) } bch2_set_page_dirty(c, inode, page, &res, 0, len); + bch2_page_reservation_put(c, inode, &res); + wait_for_stable_page(page); out: - if (current->pagecache_lock != &mapping->add_lock) - pagecache_add_put(&mapping->add_lock); + bch2_pagecache_add_put(&inode->ei_pagecache_lock); sb_end_pagefault(inode->v.i_sb); - bch2_page_reservation_put(c, inode, &res); - return ret; } void bch2_invalidatepage(struct page *page, unsigned int offset, unsigned int length) { - 
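/*
 * The open-coded "wholly or partially inside EOF" branch in
 * bch2_page_mkwrite() collapses to a single min(): once the earlier
 * check has bailed out for pages at or past EOF, the bytes inside EOF
 * are isize - page_offset(page), capped at PAGE_SIZE.  A quick check
 * of the equivalence:
 */
#include <stdio.h>

#define PAGE_SIZE 4096ULL

static unsigned long long old_len(unsigned long long index,
				  unsigned long long isize)
{
	if ((index + 1) * PAGE_SIZE <= isize)	/* page wholly inside EOF */
		return PAGE_SIZE;
	return isize % PAGE_SIZE;		/* offset_in_page(isize) */
}

static unsigned long long new_len(unsigned long long index,
				  unsigned long long isize)
{
	unsigned long long rem = isize - index * PAGE_SIZE;

	return rem < PAGE_SIZE ? rem : PAGE_SIZE;
}

int main(void)
{
	/* page 2 of a 10000 byte file: 10000 - 8192 = 1808 bytes in EOF */
	printf("%llu %llu\n", old_len(2, 10000), new_len(2, 10000));
	return 0;
}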
EBUG_ON(!PageLocked(page)); - EBUG_ON(PageWriteback(page)); - if (offset || length < PAGE_SIZE) return; @@ -845,10 +586,6 @@ void bch2_invalidatepage(struct page *page, unsigned int offset, int bch2_releasepage(struct page *page, gfp_t gfp_mask) { - /* XXX: this can't take locks that are held while we allocate memory */ - EBUG_ON(!PageLocked(page)); - EBUG_ON(PageWriteback(page)); - if (PageDirty(page)) return 0; @@ -865,7 +602,7 @@ int bch2_migrate_page(struct address_space *mapping, struct page *newpage, EBUG_ON(!PageLocked(page)); EBUG_ON(!PageLocked(newpage)); - ret = migrate_page_move_mapping(mapping, newpage, page, NULL, mode, 0); + ret = migrate_page_move_mapping(mapping, newpage, page, 0); if (ret != MIGRATEPAGE_SUCCESS) return ret; @@ -890,10 +627,10 @@ int bch2_migrate_page(struct address_space *mapping, struct page *newpage, static void bch2_readpages_end_io(struct bio *bio) { + struct bvec_iter_all iter; struct bio_vec *bv; - int i; - bio_for_each_segment_all(bv, bio, i) { + bio_for_each_segment_all(bv, bio, iter) { struct page *page = bv->bv_page; if (!bio->bi_status) { @@ -995,7 +732,7 @@ static void bch2_add_page_sectors(struct bio *bio, struct bkey_s_c k) struct bvec_iter iter; struct bio_vec bv; unsigned nr_ptrs = k.k->type == KEY_TYPE_reflink_v - ? 0 : bch2_bkey_nr_ptrs_allocated(k); + ? 0 : bch2_bkey_nr_ptrs_fully_allocated(k); unsigned state = k.k->type == KEY_TYPE_reservation ? SECTOR_RESERVED : SECTOR_ALLOCATED; @@ -1013,6 +750,18 @@ static void bch2_add_page_sectors(struct bio *bio, struct bkey_s_c k) } } +static bool extent_partial_reads_expensive(struct bkey_s_c k) +{ + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + struct bch_extent_crc_unpacked crc; + const union bch_extent_entry *i; + + bkey_for_each_crc(k.k, ptrs, crc, i) + if (crc.csum_type || crc.compression_type) + return true; + return false; +} + static void readpage_bio_extend(struct readpages_iter *iter, struct bio *bio, unsigned sectors_this_extent, @@ -1033,11 +782,8 @@ static void readpage_bio_extend(struct readpages_iter *iter, if (!get_more) break; - rcu_read_lock(); - page = radix_tree_lookup(&iter->mapping->i_pages, page_offset); - rcu_read_unlock(); - - if (page && !radix_tree_exceptional_entry(page)) + page = xa_load(&iter->mapping->i_pages, page_offset); + if (page && !xa_is_value(page)) break; page = __page_cache_alloc(readahead_gfp_mask(iter->mapping)); @@ -1069,15 +815,17 @@ static void bchfs_read(struct btree_trans *trans, struct btree_iter *iter, struct readpages_iter *readpages_iter) { struct bch_fs *c = trans->c; + struct bkey_on_stack sk; int flags = BCH_READ_RETRY_IF_STALE| BCH_READ_MAY_PROMOTE; int ret = 0; rbio->c = c; rbio->start_time = local_clock(); + + bkey_on_stack_init(&sk); retry: while (1) { - BKEY_PADDED(k) tmp; struct bkey_s_c k; unsigned bytes, sectors, offset_into_extent; @@ -1089,15 +837,15 @@ retry: if (ret) break; - bkey_reassemble(&tmp.k, k); - k = bkey_i_to_s_c(&tmp.k); + bkey_on_stack_reassemble(&sk, c, k); + k = bkey_i_to_s_c(sk.k); offset_into_extent = iter->pos.offset - bkey_start_offset(k.k); sectors = k.k->size - offset_into_extent; ret = bch2_read_indirect_extent(trans, - &offset_into_extent, &tmp.k); + &offset_into_extent, sk.k); if (ret) break; @@ -1105,22 +853,9 @@ retry: bch2_trans_unlock(trans); - if (readpages_iter) { - bool want_full_extent = false; - - if (bkey_extent_is_data(k.k)) { - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - const union bch_extent_entry *i; - struct extent_ptr_decoded p; - - bkey_for_each_ptr_decode(k.k, ptrs, p, i) - 
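/*
 * extent_partial_reads_expensive(), added above, is the readahead
 * heuristic: a checksummed or compressed extent has to be read in full
 * to verify or decompress, so a partial read costs a whole-extent read
 * anyway and it is cheaper to widen the bio to cover the entire
 * extent.  The predicate in standalone form (crc_entry mirrors the
 * fields the real code walks with bkey_for_each_crc):
 */
#include <stdbool.h>
#include <stdio.h>

struct crc_entry { unsigned csum_type, compression_type; };

static bool partial_reads_expensive(const struct crc_entry *crc, unsigned nr)
{
	unsigned i;

	for (i = 0; i < nr; i++)
		if (crc[i].csum_type || crc[i].compression_type)
			return true;	/* must read the whole extent anyway */
	return false;
}

int main(void)
{
	struct crc_entry crcs[] = { { 0, 0 }, { 1, 0 } };

	printf("%d\n", partial_reads_expensive(crcs, 2));	/* 1 */
	return 0;
}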
want_full_extent |= ((p.crc.csum_type != 0) | - (p.crc.compression_type != 0)); - } - - readpage_bio_extend(readpages_iter, &rbio->bio, - sectors, want_full_extent); - } + if (readpages_iter) + readpage_bio_extend(readpages_iter, &rbio->bio, sectors, + extent_partial_reads_expensive(k)); bytes = min(sectors, bio_sectors(&rbio->bio)) << 9; swap(rbio->bio.bi_iter.bi_size, bytes); @@ -1134,7 +869,7 @@ retry: bch2_read_extent(c, rbio, k, offset_into_extent, flags); if (flags & BCH_READ_LAST_FRAGMENT) - return; + break; swap(rbio->bio.bi_iter.bi_size, bytes); bio_advance(&rbio->bio, bytes); @@ -1143,8 +878,12 @@ retry: if (ret == -EINTR) goto retry; - bcache_io_error(c, &rbio->bio, "btree IO error %i", ret); - bio_endio(&rbio->bio); + if (ret) { + bcache_io_error(c, &rbio->bio, "btree IO error %i", ret); + bio_endio(&rbio->bio); + } + + bkey_on_stack_exit(&sk, c); } int bch2_readpages(struct file *file, struct address_space *mapping, @@ -1152,7 +891,7 @@ int bch2_readpages(struct file *file, struct address_space *mapping, { struct bch_inode_info *inode = to_bch_ei(mapping->host); struct bch_fs *c = inode->v.i_sb->s_fs_info; - struct bch_io_opts opts = io_opts(c, inode); + struct bch_io_opts opts = io_opts(c, &inode->ei_inode); struct btree_trans trans; struct btree_iter *iter; struct page *page; @@ -1167,8 +906,7 @@ int bch2_readpages(struct file *file, struct address_space *mapping, iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, POS_MIN, BTREE_ITER_SLOTS); - if (current->pagecache_lock != &mapping->add_lock) - pagecache_add_get(&mapping->add_lock); + bch2_pagecache_add_get(&inode->ei_pagecache_lock); while ((page = readpage_iter_next(&readpages_iter))) { pgoff_t index = readpages_iter.offset + readpages_iter.idx; @@ -1191,8 +929,7 @@ int bch2_readpages(struct file *file, struct address_space *mapping, &readpages_iter); } - if (current->pagecache_lock != &mapping->add_lock) - pagecache_add_put(&mapping->add_lock); + bch2_pagecache_add_put(&inode->ei_pagecache_lock); bch2_trans_exit(&trans); kfree(readpages_iter.pages); @@ -1226,7 +963,7 @@ int bch2_readpage(struct file *file, struct page *page) { struct bch_inode_info *inode = to_bch_ei(page->mapping->host); struct bch_fs *c = inode->v.i_sb->s_fs_info; - struct bch_io_opts opts = io_opts(c, inode); + struct bch_io_opts opts = io_opts(c, &inode->ei_inode); struct bch_read_bio *rbio; rbio = rbio_init(bio_alloc_bioset(GFP_NOFS, 1, &c->bio_read), opts); @@ -1251,7 +988,7 @@ static int bch2_read_single_page(struct page *page, DECLARE_COMPLETION_ONSTACK(done); rbio = rbio_init(bio_alloc_bioset(GFP_NOFS, 1, &c->bio_read), - io_opts(c, inode)); + io_opts(c, &inode->ei_inode)); rbio->bio.bi_private = &done; rbio->bio.bi_end_io = bch2_read_single_page_end_io; @@ -1278,7 +1015,9 @@ struct bch_writepage_state { static inline struct bch_writepage_state bch_writepage_state_init(struct bch_fs *c, struct bch_inode_info *inode) { - return (struct bch_writepage_state) { .opts = io_opts(c, inode) }; + return (struct bch_writepage_state) { + .opts = io_opts(c, &inode->ei_inode) + }; } static void bch2_writepage_io_free(struct closure *cl) @@ -1286,30 +1025,43 @@ static void bch2_writepage_io_free(struct closure *cl) struct bch_writepage_io *io = container_of(cl, struct bch_writepage_io, cl); - bio_put(&io->op.op.wbio.bio); + bio_put(&io->op.wbio.bio); } static void bch2_writepage_io_done(struct closure *cl) { struct bch_writepage_io *io = container_of(cl, struct bch_writepage_io, cl); - struct bch_fs *c = io->op.op.c; - struct bio *bio = &io->op.op.wbio.bio; 
+ struct bch_fs *c = io->op.c; + struct bio *bio = &io->op.wbio.bio; + struct bvec_iter_all iter; struct bio_vec *bvec; - unsigned i, j; + unsigned i; - if (io->op.op.error) { - bio_for_each_segment_all(bvec, bio, i) { + if (io->op.error) { + bio_for_each_segment_all(bvec, bio, iter) { struct bch_page_state *s; SetPageError(bvec->bv_page); mapping_set_error(bvec->bv_page->mapping, -EIO); - lock_page(bvec->bv_page); - s = bch2_page_state(bvec->bv_page); - for (j = 0; j < PAGE_SECTORS; j++) - s->s[j].nr_replicas = 0; - unlock_page(bvec->bv_page); + s = __bch2_page_state(bvec->bv_page); + spin_lock(&s->lock); + for (i = 0; i < PAGE_SECTORS; i++) + s->s[i].nr_replicas = 0; + spin_unlock(&s->lock); + } + } + + if (io->op.flags & BCH_WRITE_WROTE_DATA_INLINE) { + bio_for_each_segment_all(bvec, bio, iter) { + struct bch_page_state *s; + + s = __bch2_page_state(bvec->bv_page); + spin_lock(&s->lock); + for (i = 0; i < PAGE_SECTORS; i++) + s->s[i].nr_replicas = 0; + spin_unlock(&s->lock); } } @@ -1317,24 +1069,22 @@ static void bch2_writepage_io_done(struct closure *cl) * racing with fallocate can cause us to add fewer sectors than * expected - but we shouldn't add more sectors than expected: */ - BUG_ON(io->op.sectors_added > (s64) io->new_sectors); + BUG_ON(io->op.i_sectors_delta > 0); /* * (error (due to going RO) halfway through a page can screw that up * slightly) * XXX wtf? - BUG_ON(io->op.sectors_added - io->new_sectors >= (s64) PAGE_SECTORS); + BUG_ON(io->op.op.i_sectors_delta >= PAGE_SECTORS); */ /* * PageWriteback is effectively our ref on the inode - fixup i_blocks * before calling end_page_writeback: */ - if (io->op.sectors_added != io->new_sectors) - i_sectors_acct(c, io->op.inode, NULL, - io->op.sectors_added - (s64) io->new_sectors); + i_sectors_acct(c, io->inode, NULL, io->op.i_sectors_delta); - bio_for_each_segment_all(bvec, bio, i) { + bio_for_each_segment_all(bvec, bio, iter) { struct bch_page_state *s = __bch2_page_state(bvec->bv_page); if (atomic_dec_and_test(&s->write_count)) @@ -1349,7 +1099,7 @@ static void bch2_writepage_do_io(struct bch_writepage_state *w) struct bch_writepage_io *io = w->io; w->io = NULL; - closure_call(&io->op.op.cl, bch2_write, NULL, &io->cl); + closure_call(&io->op.cl, bch2_write, NULL, &io->cl); continue_at(&io->cl, bch2_writepage_io_done, NULL); } @@ -1358,6 +1108,7 @@ static void bch2_writepage_do_io(struct bch_writepage_state *w) * possible, else allocating a new one: */ static void bch2_writepage_io_alloc(struct bch_fs *c, + struct writeback_control *wbc, struct bch_writepage_state *w, struct bch_inode_info *inode, u64 sector, @@ -1368,17 +1119,21 @@ static void bch2_writepage_io_alloc(struct bch_fs *c, w->io = container_of(bio_alloc_bioset(GFP_NOFS, BIO_MAX_PAGES, &c->writepage_bioset), - struct bch_writepage_io, op.op.wbio.bio); + struct bch_writepage_io, op.wbio.bio); closure_init(&w->io->cl, NULL); - w->io->new_sectors = 0; - bch2_fswrite_op_init(&w->io->op, c, inode, w->opts, false); - op = &w->io->op.op; + w->io->inode = inode; + + op = &w->io->op; + bch2_write_op_init(op, c, w->opts); + op->target = w->opts.foreground_target; + op_journal_seq_set(op, &inode->ei_journal_seq); op->nr_replicas = nr_replicas; op->res.nr_replicas = nr_replicas; op->write_point = writepoint_hashed(inode->ei_last_dirtied); op->pos = POS(inode->v.i_ino, sector); op->wbio.bio.bi_iter.bi_sector = sector; + op->wbio.bio.bi_opf = wbc_to_write_flags(wbc); } static int __bch2_writepage(struct page *page, @@ -1482,33 +1237,30 @@ do_io: } if (w->io && - 
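/*
 * This is what the new per-page spinlock (added to bch_page_state
 * above) is for: the writeback error path now runs without the page
 * lock, so it and bch2_set_page_dirty() can execute concurrently and
 * both must bracket their per-sector updates with s->lock.  The
 * pairing, in outline, condensed from the two functions:
 */
/* completion side: drop reservations so a redirty re-reserves */
spin_lock(&s->lock);
for (i = 0; i < PAGE_SECTORS; i++)
	s->s[i].nr_replicas = 0;
spin_unlock(&s->lock);

/* bch2_set_page_dirty(): consumes reservations under the same lock */
spin_lock(&s->lock);
/* ... adjust s->s[i].replicas_reserved and mark sectors dirty ... */
spin_unlock(&s->lock);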
(w->io->op.op.res.nr_replicas != nr_replicas_this_write || - bio_full(&w->io->op.op.wbio.bio) || - bio_end_sector(&w->io->op.op.wbio.bio) != sector)) + (w->io->op.res.nr_replicas != nr_replicas_this_write || + bio_full(&w->io->op.wbio.bio, PAGE_SIZE) || + w->io->op.wbio.bio.bi_iter.bi_size >= (256U << 20) || + bio_end_sector(&w->io->op.wbio.bio) != sector)) bch2_writepage_do_io(w); if (!w->io) - bch2_writepage_io_alloc(c, w, inode, sector, + bch2_writepage_io_alloc(c, wbc, w, inode, sector, nr_replicas_this_write); - w->io->new_sectors += dirty_sectors; - atomic_inc(&s->write_count); - BUG_ON(inode != w->io->op.inode); - BUG_ON(!bio_add_page(&w->io->op.op.wbio.bio, page, + BUG_ON(inode != w->io->inode); + BUG_ON(!bio_add_page(&w->io->op.wbio.bio, page, sectors << 9, offset << 9)); /* Check for writing past i_size: */ - BUG_ON((bio_end_sector(&w->io->op.op.wbio.bio) << 9) > - round_up(i_size, block_bytes(c))); + WARN_ON((bio_end_sector(&w->io->op.wbio.bio) << 9) > + round_up(i_size, block_bytes(c))); - w->io->op.op.res.sectors += reserved_sectors; + w->io->op.res.sectors += reserved_sectors; + w->io->op.i_sectors_delta -= dirty_sectors; w->io->op.new_i_size = i_size; - if (wbc->sync_mode == WB_SYNC_ALL) - w->io->op.op.wbio.bio.bi_opf |= REQ_SYNC; - offset += sectors; } @@ -1569,8 +1321,7 @@ int bch2_write_begin(struct file *file, struct address_space *mapping, bch2_page_reservation_init(c, inode, res); *fsdata = res; - /* Not strictly necessary - same reason as mkwrite(): */ - pagecache_add_get(&mapping->add_lock); + bch2_pagecache_add_get(&inode->ei_pagecache_lock); page = grab_cache_page_write_begin(mapping, index, flags); if (!page) @@ -1622,7 +1373,7 @@ err: put_page(page); *pagep = NULL; err_unlock: - pagecache_add_put(&mapping->add_lock); + bch2_pagecache_add_put(&inode->ei_pagecache_lock); kfree(res); *fsdata = NULL; return ret; @@ -1666,7 +1417,7 @@ int bch2_write_end(struct file *file, struct address_space *mapping, unlock_page(page); put_page(page); - pagecache_add_put(&mapping->add_lock); + bch2_pagecache_add_put(&inode->ei_pagecache_lock); bch2_page_reservation_put(c, inode, res); kfree(res); @@ -1700,8 +1451,13 @@ static int __bch2_buffered_write(struct bch_inode_info *inode, pages[i] = grab_cache_page_write_begin(mapping, index + i, 0); if (!pages[i]) { nr_pages = i; - ret = -ENOMEM; - goto out; + if (!i) { + ret = -ENOMEM; + goto out; + } + len = min_t(unsigned, len, + nr_pages * PAGE_SIZE - offset); + break; } } @@ -1766,14 +1522,6 @@ retry_reservation: if (!copied) goto out; - nr_pages_copied = DIV_ROUND_UP(offset + copied, PAGE_SIZE); - inode->ei_last_dirtied = (unsigned long) current; - - spin_lock(&inode->v.i_lock); - if (pos + copied > inode->v.i_size) - i_size_write(&inode->v, pos + copied); - spin_unlock(&inode->v.i_lock); - if (copied < len && ((offset + copied) & (PAGE_SIZE - 1))) { struct page *page = pages[(offset + copied) >> PAGE_SHIFT]; @@ -1784,6 +1532,11 @@ retry_reservation: } } + spin_lock(&inode->v.i_lock); + if (pos + copied > inode->v.i_size) + i_size_write(&inode->v, pos + copied); + spin_unlock(&inode->v.i_lock); + while (set_dirty < copied) { struct page *page = pages[(offset + set_dirty) >> PAGE_SHIFT]; unsigned pg_offset = (offset + set_dirty) & (PAGE_SIZE - 1); @@ -1799,6 +1552,9 @@ retry_reservation: set_dirty += pg_len; } + + nr_pages_copied = DIV_ROUND_UP(offset + copied, PAGE_SIZE); + inode->ei_last_dirtied = (unsigned long) current; out: for (i = nr_pages_copied; i < nr_pages; i++) { unlock_page(pages[i]); @@ -1819,7 +1575,7 @@ static ssize_t 
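/*
 * __bch2_buffered_write() no longer fails the whole write when a later
 * page can't be grabbed: it shortens the copy to the pages it did get,
 * and only returns -ENOMEM when not even the first page is available.
 * The length adjustment in isolation (shorten_write is a hypothetical
 * stand-in for the inline logic):
 */
#include <errno.h>
#include <stdio.h>

#define PAGE_SIZE 4096u

/* got 'got' of the pages we asked for; shrink len or give up */
static int shorten_write(unsigned got, unsigned offset, unsigned *len)
{
	if (!got)
		return -ENOMEM;	/* couldn't pin even one page */

	if (*len > got * PAGE_SIZE - offset)
		*len = got * PAGE_SIZE - offset;
	return 0;
}

int main(void)
{
	unsigned len = 10000;

	/* wanted 3 pages from offset 100, only got 2 */
	shorten_write(2, 100, &len);
	printf("%u\n", len);	/* 8092 = 2*4096 - 100 */
	return 0;
}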
bch2_buffered_write(struct kiocb *iocb, struct iov_iter *iter) ssize_t written = 0; int ret = 0; - pagecache_add_get(&mapping->add_lock); + bch2_pagecache_add_get(&inode->ei_pagecache_lock); do { unsigned offset = pos & (PAGE_SIZE - 1); @@ -1876,7 +1632,7 @@ again: balance_dirty_pages_ratelimited(mapping); } while (iov_iter_count(iter)); - pagecache_add_put(&mapping->add_lock); + bch2_pagecache_add_put(&inode->ei_pagecache_lock); return written ? written : ret; } @@ -1912,7 +1668,7 @@ static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter) struct file *file = req->ki_filp; struct bch_inode_info *inode = file_bch_inode(file); struct bch_fs *c = inode->v.i_sb->s_fs_info; - struct bch_io_opts opts = io_opts(c, inode); + struct bch_io_opts opts = io_opts(c, &inode->ei_inode); struct dio_read *dio; struct bio *bio; loff_t offset = req->ki_pos; @@ -2000,67 +1756,98 @@ start: } } -/* O_DIRECT writes */ +ssize_t bch2_read_iter(struct kiocb *iocb, struct iov_iter *iter) +{ + struct file *file = iocb->ki_filp; + struct bch_inode_info *inode = file_bch_inode(file); + struct address_space *mapping = file->f_mapping; + size_t count = iov_iter_count(iter); + ssize_t ret; + + if (!count) + return 0; /* skip atime */ + + if (iocb->ki_flags & IOCB_DIRECT) { + struct blk_plug plug; + + ret = filemap_write_and_wait_range(mapping, + iocb->ki_pos, + iocb->ki_pos + count - 1); + if (ret < 0) + return ret; + + file_accessed(file); + + blk_start_plug(&plug); + ret = bch2_direct_IO_read(iocb, iter); + blk_finish_plug(&plug); + + if (ret >= 0) + iocb->ki_pos += ret; + } else { + bch2_pagecache_add_get(&inode->ei_pagecache_lock); + ret = generic_file_read_iter(iocb, iter); + bch2_pagecache_add_put(&inode->ei_pagecache_lock); + } + + return ret; +} -static void bch2_dio_write_loop_async(struct closure *); +/* O_DIRECT writes */ static long bch2_dio_write_loop(struct dio_write *dio) { bool kthread = (current->flags & PF_KTHREAD) != 0; + struct bch_fs *c = dio->op.c; struct kiocb *req = dio->req; struct address_space *mapping = req->ki_filp->f_mapping; - struct bch_inode_info *inode = dio->iop.inode; - struct bio *bio = &dio->iop.op.wbio.bio; + struct bch_inode_info *inode = file_bch_inode(req->ki_filp); + struct bio *bio = &dio->op.wbio.bio; + struct bvec_iter_all iter; struct bio_vec *bv; - loff_t offset; + unsigned unaligned; + u64 new_i_size; bool sync; long ret; - int i; if (dio->loop) goto loop; - inode_dio_begin(&inode->v); - __pagecache_block_get(&mapping->add_lock); - - /* Write and invalidate pagecache range that we're writing to: */ - offset = req->ki_pos + (dio->iop.op.written << 9); - ret = write_invalidate_inode_pages_range(mapping, - offset, - offset + iov_iter_count(&dio->iter) - 1); - if (unlikely(ret)) - goto err; - while (1) { - offset = req->ki_pos + (dio->iop.op.written << 9); - - BUG_ON(current->pagecache_lock); - current->pagecache_lock = &mapping->add_lock; if (kthread) use_mm(dio->mm); + BUG_ON(current->faults_disabled_mapping); + current->faults_disabled_mapping = mapping; ret = bio_iov_iter_get_pages(bio, &dio->iter); + current->faults_disabled_mapping = NULL; if (kthread) unuse_mm(dio->mm); - current->pagecache_lock = NULL; if (unlikely(ret < 0)) goto err; - /* gup might have faulted pages back in: */ - ret = write_invalidate_inode_pages_range(mapping, - offset, - offset + bio->bi_iter.bi_size - 1); - if (unlikely(ret)) + unaligned = bio->bi_iter.bi_size & (block_bytes(c) - 1); + bio->bi_iter.bi_size -= unaligned; + iov_iter_revert(&dio->iter, unaligned); + + if 
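/*
 * bio_iov_iter_get_pages() can stop at an arbitrary byte, but the
 * extent layer wants whole filesystem blocks, so the loop above shaves
 * the unaligned tail off the bio and hands those bytes back to the
 * iterator with iov_iter_revert().  The arithmetic, assuming a 512
 * byte block size:
 */
#include <stdio.h>

int main(void)
{
	unsigned block_bytes = 512;
	unsigned bi_size = 3000;	/* bytes the bio managed to pin */

	unsigned unaligned = bi_size & (block_bytes - 1);

	bi_size -= unaligned;	/* the bio now covers whole blocks only */
	/* iov_iter_revert(&dio->iter, unaligned) pushes the tail back */

	printf("%u trimmed, %u submitted\n", unaligned, bi_size);
	return 0;
}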
(!bio->bi_iter.bi_size) { + /* + * bio_iov_iter_get_pages was only able to get < + * blocksize worth of pages: + */ + bio_for_each_segment_all(bv, bio, iter) + put_page(bv->bv_page); + ret = -EFAULT; goto err; + } - dio->iop.op.pos = POS(inode->v.i_ino, offset >> 9); + dio->op.pos = POS(inode->v.i_ino, + (req->ki_pos >> 9) + dio->op.written); task_io_account_write(bio->bi_iter.bi_size); - closure_call(&dio->iop.op.cl, bch2_write, NULL, &dio->cl); - if (!dio->sync && !dio->loop && dio->iter.count) { struct iovec *iov = dio->inline_vecs; @@ -2068,8 +1855,8 @@ static long bch2_dio_write_loop(struct dio_write *dio) iov = kmalloc(dio->iter.nr_segs * sizeof(*iov), GFP_KERNEL); if (unlikely(!iov)) { - dio->iop.op.error = -ENOMEM; - goto err_wait_io; + dio->sync = true; + goto do_io; } dio->free_iov = true; @@ -2078,34 +1865,44 @@ static long bch2_dio_write_loop(struct dio_write *dio) memcpy(iov, dio->iter.iov, dio->iter.nr_segs * sizeof(*iov)); dio->iter.iov = iov; } -err_wait_io: +do_io: dio->loop = true; + closure_call(&dio->op.cl, bch2_write, NULL, NULL); - if (!dio->sync) { - continue_at(&dio->cl, bch2_dio_write_loop_async, NULL); + if (dio->sync) + wait_for_completion(&dio->done); + else return -EIOCBQUEUED; - } - - closure_sync(&dio->cl); loop: - bio_for_each_segment_all(bv, bio, i) + i_sectors_acct(c, inode, &dio->quota_res, + dio->op.i_sectors_delta); + dio->op.i_sectors_delta = 0; + + new_i_size = req->ki_pos + ((u64) dio->op.written << 9); + + spin_lock(&inode->v.i_lock); + if (new_i_size > inode->v.i_size) + i_size_write(&inode->v, new_i_size); + spin_unlock(&inode->v.i_lock); + + bio_for_each_segment_all(bv, bio, iter) put_page(bv->bv_page); - if (!dio->iter.count || dio->iop.op.error) + if (!dio->iter.count || dio->op.error) break; + bio_reset(bio); + reinit_completion(&dio->done); } - ret = dio->iop.op.error ?: ((long) dio->iop.op.written << 9); + ret = dio->op.error ?: ((long) dio->op.written << 9); err: - __pagecache_block_put(&mapping->add_lock); - bch2_disk_reservation_put(dio->iop.op.c, &dio->iop.op.res); - bch2_quota_reservation_put(dio->iop.op.c, inode, &dio->quota_res); + bch2_pagecache_block_put(&inode->ei_pagecache_lock); + bch2_disk_reservation_put(c, &dio->op.res); + bch2_quota_reservation_put(c, inode, &dio->quota_res); if (dio->free_iov) kfree(dio->iter.iov); - closure_debug_destroy(&dio->cl); - sync = dio->sync; bio_put(bio); @@ -2119,141 +1916,155 @@ err: return ret; } -static void bch2_dio_write_loop_async(struct closure *cl) +static void bch2_dio_write_loop_async(struct bch_write_op *op) { - struct dio_write *dio = container_of(cl, struct dio_write, cl); + struct dio_write *dio = container_of(op, struct dio_write, op); - bch2_dio_write_loop(dio); + if (dio->sync) + complete(&dio->done); + else + bch2_dio_write_loop(dio); } -static int bch2_direct_IO_write(struct kiocb *req, - struct iov_iter *iter, - bool swap) +static noinline +ssize_t bch2_direct_write(struct kiocb *req, struct iov_iter *iter) { struct file *file = req->ki_filp; + struct address_space *mapping = file->f_mapping; struct bch_inode_info *inode = file_bch_inode(file); struct bch_fs *c = inode->v.i_sb->s_fs_info; + struct bch_io_opts opts = io_opts(c, &inode->ei_inode); struct dio_write *dio; struct bio *bio; + bool locked = true, extending; ssize_t ret; - lockdep_assert_held(&inode->v.i_rwsem); + prefetch(&c->opts); + prefetch((void *) &c->opts + 64); + prefetch(&inode->ei_inode); + prefetch((void *) &inode->ei_inode + 64); - if (unlikely(!iter->count)) - return 0; + inode_lock(&inode->v); + + ret = 
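/*
 * The dio_write closure is replaced by a struct completion, which
 * gives the synchronous path a conventional handshake: the submitter
 * blocks in wait_for_completion(), the write's end_io hook fires
 * complete(), and reinit_completion() re-arms it for the next loop
 * iteration.  The exchange stripped to its shape -- kernel context
 * assumed, not a standalone program:
 */
#include <linux/completion.h>

struct dio_sketch {
	struct completion done;
	int sync;
};

static void submit_one_chunk(struct dio_sketch *dio)
{
	/* ... build and issue the bch_write_op here ... */
	if (dio->sync)
		wait_for_completion(&dio->done);	/* end_io completes us */
	/* async case returns -EIOCBQUEUED to the caller instead */
}

static void write_end_io(struct dio_sketch *dio)
{
	if (dio->sync)
		complete(&dio->done);	/* wake the submitter */
	/* async case re-enters the loop to issue the next chunk */
}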
generic_write_checks(req, iter); + if (unlikely(ret <= 0)) + goto err; + + ret = file_remove_privs(file); + if (unlikely(ret)) + goto err; + + ret = file_update_time(file); + if (unlikely(ret)) + goto err; if (unlikely((req->ki_pos|iter->count) & (block_bytes(c) - 1))) - return -EINVAL; + goto err; + + inode_dio_begin(&inode->v); + bch2_pagecache_block_get(&inode->ei_pagecache_lock); + + extending = req->ki_pos + iter->count > inode->v.i_size; + if (!extending) { + inode_unlock(&inode->v); + locked = false; + } bio = bio_alloc_bioset(GFP_KERNEL, iov_iter_npages(iter, BIO_MAX_PAGES), &c->dio_write_bioset); - dio = container_of(bio, struct dio_write, iop.op.wbio.bio); - closure_init(&dio->cl, NULL); + dio = container_of(bio, struct dio_write, op.wbio.bio); + init_completion(&dio->done); dio->req = req; dio->mm = current->mm; dio->loop = false; - dio->sync = is_sync_kiocb(req) || - req->ki_pos + iter->count > inode->v.i_size; + dio->sync = is_sync_kiocb(req) || extending; dio->free_iov = false; dio->quota_res.sectors = 0; dio->iter = *iter; - bch2_fswrite_op_init(&dio->iop, c, inode, io_opts(c, inode), true); - dio->iop.op.write_point = writepoint_hashed((unsigned long) current); - dio->iop.op.flags |= BCH_WRITE_NOPUT_RESERVATION; + + bch2_write_op_init(&dio->op, c, opts); + dio->op.end_io = bch2_dio_write_loop_async; + dio->op.target = opts.foreground_target; + op_journal_seq_set(&dio->op, &inode->ei_journal_seq); + dio->op.write_point = writepoint_hashed((unsigned long) current); + dio->op.flags |= BCH_WRITE_NOPUT_RESERVATION; if ((req->ki_flags & IOCB_DSYNC) && !c->opts.journal_flush_disabled) - dio->iop.op.flags |= BCH_WRITE_FLUSH; + dio->op.flags |= BCH_WRITE_FLUSH; ret = bch2_quota_reservation_add(c, inode, &dio->quota_res, iter->count >> 9, true); if (unlikely(ret)) - goto err; + goto err_put_bio; - dio->iop.op.nr_replicas = dio->iop.op.opts.data_replicas; + dio->op.nr_replicas = dio->op.opts.data_replicas; - ret = bch2_disk_reservation_get(c, &dio->iop.op.res, iter->count >> 9, - dio->iop.op.opts.data_replicas, 0); - if (unlikely(ret)) { - if (!bch2_check_range_allocated(c, POS(inode->v.i_ino, - req->ki_pos >> 9), - iter->count >> 9, - dio->iop.op.opts.data_replicas)) - goto err; + ret = bch2_disk_reservation_get(c, &dio->op.res, iter->count >> 9, + dio->op.opts.data_replicas, 0); + if (unlikely(ret) && + !bch2_check_range_allocated(c, POS(inode->v.i_ino, + req->ki_pos >> 9), + iter->count >> 9, + dio->op.opts.data_replicas)) + goto err_put_bio; - dio->iop.unalloc = true; - } + ret = write_invalidate_inode_pages_range(mapping, + req->ki_pos, + req->ki_pos + iter->count - 1); + if (unlikely(ret)) + goto err_put_bio; - return bch2_dio_write_loop(dio); + ret = bch2_dio_write_loop(dio); err: - bch2_disk_reservation_put(c, &dio->iop.op.res); + if (locked) + inode_unlock(&inode->v); + if (ret > 0) + req->ki_pos += ret; + return ret; +err_put_bio: + bch2_pagecache_block_put(&inode->ei_pagecache_lock); + bch2_disk_reservation_put(c, &dio->op.res); bch2_quota_reservation_put(c, inode, &dio->quota_res); - closure_debug_destroy(&dio->cl); bio_put(bio); - return ret; -} - -ssize_t bch2_direct_IO(struct kiocb *req, struct iov_iter *iter) -{ - struct blk_plug plug; - ssize_t ret; - - blk_start_plug(&plug); - ret = iov_iter_rw(iter) == WRITE - ? 
bch2_direct_IO_write(req, iter, false) - : bch2_direct_IO_read(req, iter); - blk_finish_plug(&plug); - - return ret; -} - -static ssize_t -bch2_direct_write(struct kiocb *iocb, struct iov_iter *iter) -{ - return bch2_direct_IO_write(iocb, iter, true); + inode_dio_end(&inode->v); + goto err; } -static ssize_t __bch2_write_iter(struct kiocb *iocb, struct iov_iter *from) +ssize_t bch2_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; struct bch_inode_info *inode = file_bch_inode(file); - ssize_t ret; + ssize_t ret; + + if (iocb->ki_flags & IOCB_DIRECT) + return bch2_direct_write(iocb, from); /* We can write back this queue in page reclaim */ current->backing_dev_info = inode_to_bdi(&inode->v); + inode_lock(&inode->v); + + ret = generic_write_checks(iocb, from); + if (ret <= 0) + goto unlock; + ret = file_remove_privs(file); if (ret) - goto out; + goto unlock; ret = file_update_time(file); if (ret) - goto out; - - ret = iocb->ki_flags & IOCB_DIRECT - ? bch2_direct_write(iocb, from) - : bch2_buffered_write(iocb, from); + goto unlock; + ret = bch2_buffered_write(iocb, from); if (likely(ret > 0)) iocb->ki_pos += ret; -out: +unlock: + inode_unlock(&inode->v); current->backing_dev_info = NULL; - return ret; -} - -ssize_t bch2_write_iter(struct kiocb *iocb, struct iov_iter *from) -{ - struct bch_inode_info *inode = file_bch_inode(iocb->ki_filp); - bool direct = iocb->ki_flags & IOCB_DIRECT; - ssize_t ret; - inode_lock(&inode->v); - ret = generic_write_checks(iocb, from); if (ret > 0) - ret = __bch2_write_iter(iocb, from); - inode_unlock(&inode->v); - - if (ret > 0 && !direct) ret = generic_write_sync(iocb, ret); return ret; @@ -2288,80 +2099,6 @@ out: /* truncate: */ -int bch2_fpunch_at(struct btree_trans *trans, struct btree_iter *iter, - struct bpos end, struct bch_inode_info *inode, - u64 new_i_size) -{ - struct bch_fs *c = trans->c; - unsigned max_sectors = KEY_SIZE_MAX & (~0 << c->block_bits); - struct bkey_s_c k; - int ret = 0, ret2 = 0; - - while ((k = bch2_btree_iter_peek(iter)).k && - bkey_cmp(iter->pos, end) < 0) { - struct disk_reservation disk_res = - bch2_disk_reservation_init(c, 0); - struct bkey_i delete; - - ret = bkey_err(k); - if (ret) - goto btree_err; - - bkey_init(&delete.k); - delete.k.p = iter->pos; - - /* create the biggest key we can */ - bch2_key_resize(&delete.k, max_sectors); - bch2_cut_back(end, &delete.k); - - bch2_trans_begin_updates(trans); - - ret = bch2_extent_update(trans, inode, - &disk_res, NULL, iter, &delete, - new_i_size, false, true, NULL); - bch2_disk_reservation_put(c, &disk_res); -btree_err: - if (ret == -EINTR) { - ret2 = ret; - ret = 0; - } - if (ret) - break; - } - - if (bkey_cmp(iter->pos, end) > 0) { - bch2_btree_iter_set_pos(iter, end); - ret = bch2_btree_iter_traverse(iter); - } - - return ret ?: ret2; -} - -static int __bch2_fpunch(struct bch_fs *c, struct bch_inode_info *inode, - u64 start_offset, u64 end_offset) -{ - struct btree_trans trans; - struct btree_iter *iter; - int ret = 0; - - bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024); - - iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, - POS(inode->v.i_ino, start_offset), - BTREE_ITER_INTENT); - - ret = bch2_fpunch_at(&trans, iter, - POS(inode->v.i_ino, end_offset), - inode, 0); - - bch2_trans_exit(&trans); - - if (ret == -EINTR) - ret = 0; - - return ret; -} - static inline int range_has_data(struct bch_fs *c, struct bpos start, struct bpos end) @@ -2475,14 +2212,20 @@ static int bch2_truncate_page(struct bch_inode_info *inode, loff_t from) from, 
round_up(from, PAGE_SIZE)); } -static int bch2_extend(struct bch_inode_info *inode, struct iattr *iattr) +static int bch2_extend(struct bch_inode_info *inode, + struct bch_inode_unpacked *inode_u, + struct iattr *iattr) { struct bch_fs *c = inode->v.i_sb->s_fs_info; struct address_space *mapping = inode->v.i_mapping; int ret; - ret = filemap_write_and_wait_range(mapping, - inode->ei_inode.bi_size, S64_MAX); + /* + * sync appends: + * + * this has to be done _before_ extending i_size: + */ + ret = filemap_write_and_wait_range(mapping, inode_u->bi_size, S64_MAX); if (ret) return ret; @@ -2522,19 +2265,32 @@ int bch2_truncate(struct bch_inode_info *inode, struct iattr *iattr) { struct bch_fs *c = inode->v.i_sb->s_fs_info; struct address_space *mapping = inode->v.i_mapping; + struct bch_inode_unpacked inode_u; + struct btree_trans trans; + struct btree_iter *iter; u64 new_i_size = iattr->ia_size; - bool shrink; + s64 i_sectors_delta = 0; int ret = 0; inode_dio_wait(&inode->v); - pagecache_block_get(&mapping->add_lock); + bch2_pagecache_block_get(&inode->ei_pagecache_lock); + + /* + * fetch current on disk i_size: inode is locked, i_size can only + * increase underneath us: + */ + bch2_trans_init(&trans, c, 0, 0); + iter = bch2_inode_peek(&trans, &inode_u, inode->v.i_ino, 0); + ret = PTR_ERR_OR_ZERO(iter); + bch2_trans_exit(&trans); - BUG_ON(inode->v.i_size < inode->ei_inode.bi_size); + if (ret) + goto err; - shrink = iattr->ia_size <= inode->v.i_size; + BUG_ON(inode->v.i_size < inode_u.bi_size); - if (!shrink) { - ret = bch2_extend(inode, iattr); + if (iattr->ia_size > inode->v.i_size) { + ret = bch2_extend(inode, &inode_u, iattr); goto err; } @@ -2552,9 +2308,9 @@ int bch2_truncate(struct bch_inode_info *inode, struct iattr *iattr) * userspace has to redirty it and call .mkwrite -> set_page_dirty * again to allocate the part of the page that was extended. 
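/*
 * bch2_truncate now reads the current on-disk inode with
 * bch2_inode_peek() and turns the iterator's error-pointer into an int
 * with PTR_ERR_OR_ZERO().  A userspace model of that encoding, with
 * the kernel's ERR_PTR machinery rebuilt as stand-ins:
 */
#include <stdio.h>

#define MAX_ERRNO	4095
#define IS_ERR(p)	((unsigned long)(p) >= (unsigned long)-MAX_ERRNO)
#define ERR_PTR(e)	((void *)(long)(e))
#define PTR_ERR(p)	((long)(p))

static long ptr_err_or_zero(const void *p)
{
	return IS_ERR(p) ? PTR_ERR(p) : 0;	/* errno if encoded, else 0 */
}

int main(void)
{
	int x = 0;
	void *ok  = &x;
	void *bad = ERR_PTR(-2);	/* encodes -ENOENT */

	printf("%ld %ld\n", ptr_err_or_zero(ok), ptr_err_or_zero(bad));
	return 0;
}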
*/ - if (iattr->ia_size > inode->ei_inode.bi_size) + if (iattr->ia_size > inode_u.bi_size) ret = filemap_write_and_wait_range(mapping, - inode->ei_inode.bi_size, + inode_u.bi_size, iattr->ia_size - 1); else if (iattr->ia_size & (PAGE_SIZE - 1)) ret = filemap_write_and_wait_range(mapping, @@ -2573,9 +2329,11 @@ int bch2_truncate(struct bch_inode_info *inode, struct iattr *iattr) truncate_setsize(&inode->v, iattr->ia_size); - ret = __bch2_fpunch(c, inode, + ret = bch2_fpunch(c, inode->v.i_ino, round_up(iattr->ia_size, block_bytes(c)) >> 9, - U64_MAX); + U64_MAX, &inode->ei_journal_seq, &i_sectors_delta); + i_sectors_acct(c, inode, NULL, i_sectors_delta); + if (unlikely(ret)) goto err; @@ -2586,23 +2344,22 @@ int bch2_truncate(struct bch_inode_info *inode, struct iattr *iattr) ATTR_MTIME|ATTR_CTIME); mutex_unlock(&inode->ei_update_lock); err: - pagecache_block_put(&mapping->add_lock); + bch2_pagecache_block_put(&inode->ei_pagecache_lock); return ret; } /* fallocate: */ -static long bch2_fpunch(struct bch_inode_info *inode, loff_t offset, loff_t len) +static long bchfs_fpunch(struct bch_inode_info *inode, loff_t offset, loff_t len) { struct bch_fs *c = inode->v.i_sb->s_fs_info; - struct address_space *mapping = inode->v.i_mapping; u64 discard_start = round_up(offset, block_bytes(c)) >> 9; u64 discard_end = round_down(offset + len, block_bytes(c)) >> 9; int ret = 0; inode_lock(&inode->v); inode_dio_wait(&inode->v); - pagecache_block_get(&mapping->add_lock); + bch2_pagecache_block_get(&inode->ei_pagecache_lock); ret = __bch2_truncate_page(inode, offset >> PAGE_SHIFT, @@ -2621,21 +2378,29 @@ static long bch2_fpunch(struct bch_inode_info *inode, loff_t offset, loff_t len) truncate_pagecache_range(&inode->v, offset, offset + len - 1); - if (discard_start < discard_end) - ret = __bch2_fpunch(c, inode, discard_start, discard_end); + if (discard_start < discard_end) { + s64 i_sectors_delta = 0; + + ret = bch2_fpunch(c, inode->v.i_ino, + discard_start, discard_end, + &inode->ei_journal_seq, + &i_sectors_delta); + i_sectors_acct(c, inode, NULL, i_sectors_delta); + } err: - pagecache_block_put(&mapping->add_lock); + bch2_pagecache_block_put(&inode->ei_pagecache_lock); inode_unlock(&inode->v); return ret; } -static long bch2_fcollapse_finsert(struct bch_inode_info *inode, +static long bchfs_fcollapse_finsert(struct bch_inode_info *inode, loff_t offset, loff_t len, bool insert) { struct bch_fs *c = inode->v.i_sb->s_fs_info; struct address_space *mapping = inode->v.i_mapping; + struct bkey_on_stack copy; struct btree_trans trans; struct btree_iter *src, *dst, *del = NULL; loff_t shift, new_size; @@ -2645,6 +2410,7 @@ static long bch2_fcollapse_finsert(struct bch_inode_info *inode, if ((offset | len) & (block_bytes(c) - 1)) return -EINVAL; + bkey_on_stack_init(©); bch2_trans_init(&trans, c, BTREE_ITER_MAX, 256); /* @@ -2655,7 +2421,7 @@ static long bch2_fcollapse_finsert(struct bch_inode_info *inode, */ inode_lock(&inode->v); inode_dio_wait(&inode->v); - pagecache_block_get(&mapping->add_lock); + bch2_pagecache_block_get(&inode->ei_pagecache_lock); if (insert) { ret = -EFBIG; @@ -2690,8 +2456,14 @@ static long bch2_fcollapse_finsert(struct bch_inode_info *inode, ATTR_MTIME|ATTR_CTIME); mutex_unlock(&inode->ei_update_lock); } else { - ret = __bch2_fpunch(c, inode, offset >> 9, - (offset + len) >> 9); + s64 i_sectors_delta = 0; + + ret = bch2_fpunch(c, inode->v.i_ino, + offset >> 9, (offset + len) >> 9, + &inode->ei_journal_seq, + &i_sectors_delta); + i_sectors_acct(c, inode, NULL, i_sectors_delta); + if (ret) 
goto err; } @@ -2707,7 +2479,6 @@ static long bch2_fcollapse_finsert(struct bch_inode_info *inode, while (1) { struct disk_reservation disk_res = bch2_disk_reservation_init(c, 0); - BKEY_PADDED(k) copy; struct bkey_i delete; struct bkey_s_c k; struct bpos next_pos; @@ -2732,38 +2503,34 @@ static long bch2_fcollapse_finsert(struct bch_inode_info *inode, bkey_cmp(k.k->p, POS(inode->v.i_ino, offset >> 9)) <= 0) break; reassemble: - bkey_reassemble(©.k, k); + bkey_on_stack_reassemble(©, c, k); if (insert && bkey_cmp(bkey_start_pos(k.k), move_pos) < 0) { - bch2_cut_front(move_pos, ©.k); - bch2_btree_iter_set_pos(src, bkey_start_pos(©.k.k)); + bch2_cut_front(move_pos, copy.k); + bch2_btree_iter_set_pos(src, bkey_start_pos(©.k->k)); } - copy.k.k.p.offset += shift >> 9; - bch2_btree_iter_set_pos(dst, bkey_start_pos(©.k.k)); + copy.k->k.p.offset += shift >> 9; + bch2_btree_iter_set_pos(dst, bkey_start_pos(©.k->k)); - ret = bch2_btree_iter_traverse(dst); + ret = bch2_extent_atomic_end(dst, copy.k, &atomic_end); if (ret) goto bkey_err; - ret = bch2_extent_atomic_end(dst, ©.k, &atomic_end); - if (ret) - goto bkey_err; - - if (bkey_cmp(atomic_end, copy.k.k.p)) { + if (bkey_cmp(atomic_end, copy.k->k.p)) { if (insert) { move_pos = atomic_end; move_pos.offset -= shift >> 9; goto reassemble; } else { - bch2_cut_back(atomic_end, ©.k.k); + bch2_cut_back(atomic_end, copy.k); } } bkey_init(&delete.k); delete.k.p = src->pos; - bch2_key_resize(&delete.k, copy.k.k.size); + bch2_key_resize(&delete.k, copy.k->k.size); next_pos = insert ? bkey_start_pos(&delete.k) : delete.k.p; @@ -2776,12 +2543,12 @@ reassemble: * by the triggers machinery: */ if (insert && - bkey_cmp(bkey_start_pos(©.k.k), delete.k.p) < 0) { - bch2_cut_back(bkey_start_pos(©.k.k), &delete.k); + bkey_cmp(bkey_start_pos(©.k->k), delete.k.p) < 0) { + bch2_cut_back(bkey_start_pos(©.k->k), &delete); } else if (!insert && - bkey_cmp(copy.k.k.p, + bkey_cmp(copy.k->k.p, bkey_start_pos(&delete.k)) > 0) { - bch2_cut_front(copy.k.k.p, &delete); + bch2_cut_front(copy.k->k.p, &delete); del = bch2_trans_copy_iter(&trans, src); BUG_ON(IS_ERR_OR_NULL(del)); @@ -2790,11 +2557,10 @@ reassemble: bkey_start_pos(&delete.k)); } - bch2_trans_update(&trans, BTREE_INSERT_ENTRY(dst, ©.k)); - bch2_trans_update(&trans, - BTREE_INSERT_ENTRY(del ?: src, &delete)); + bch2_trans_update(&trans, dst, copy.k); + bch2_trans_update(&trans, del ?: src, &delete); - if (copy.k.k.size == k.k->size) { + if (copy.k->k.size == k.k->size) { /* * If we're moving the entire extent, we can skip * running triggers: @@ -2803,10 +2569,10 @@ reassemble: } else { /* We might end up splitting compressed extents: */ unsigned nr_ptrs = - bch2_bkey_nr_dirty_ptrs(bkey_i_to_s_c(©.k)); + bch2_bkey_nr_ptrs_allocated(bkey_i_to_s_c(copy.k)); ret = bch2_disk_reservation_get(c, &disk_res, - copy.k.k.size, nr_ptrs, + copy.k->k.size, nr_ptrs, BCH_DISK_RESERVATION_NOFAIL); BUG_ON(ret); } @@ -2817,7 +2583,7 @@ reassemble: bch2_disk_reservation_put(c, &disk_res); bkey_err: if (del) - bch2_trans_iter_free(&trans, del); + bch2_trans_iter_put(&trans, del); del = NULL; if (!ret) @@ -2841,13 +2607,14 @@ bkey_err: } err: bch2_trans_exit(&trans); - pagecache_block_put(&mapping->add_lock); + bkey_on_stack_exit(©, c); + bch2_pagecache_block_put(&inode->ei_pagecache_lock); inode_unlock(&inode->v); return ret; } -static long bch2_fallocate(struct bch_inode_info *inode, int mode, - loff_t offset, loff_t len) +static long bchfs_fallocate(struct bch_inode_info *inode, int mode, + loff_t offset, loff_t len) { struct address_space 
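/*
 * BKEY_PADDED on the stack gives way to bkey_on_stack here, presumably
 * because keys with many pointers (reflink, wide replication) can
 * outgrow a fixed pad.  The likely shape of the idiom -- a small
 * inline buffer that spills to the heap on demand -- as a generic,
 * hypothetical sketch (allocation failure ignored for brevity):
 */
#include <stdlib.h>
#include <string.h>

struct buf_on_stack {
	char *buf;
	size_t size;
	char inline_buf[64];	/* the common case stays on the stack */
};

static void buf_init(struct buf_on_stack *b)
{
	b->buf = b->inline_buf;
	b->size = sizeof(b->inline_buf);
}

static void buf_grow(struct buf_on_stack *b, size_t need)
{
	if (need <= b->size)
		return;
	if (b->buf == b->inline_buf) {
		b->buf = malloc(need);
		memcpy(b->buf, b->inline_buf, b->size);	/* keep contents */
	} else {
		b->buf = realloc(b->buf, need);
	}
	b->size = need;
}

static void buf_exit(struct buf_on_stack *b)
{
	if (b->buf != b->inline_buf)
		free(b->buf);
}

int main(void)
{
	struct buf_on_stack b;

	buf_init(&b);
	buf_grow(&b, 128);	/* spills from stack to heap */
	buf_exit(&b);
	return 0;
}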
*mapping = inode->v.i_mapping; struct bch_fs *c = inode->v.i_sb->s_fs_info; @@ -2858,14 +2625,14 @@ static long bch2_fallocate(struct bch_inode_info *inode, int mode, loff_t block_start = round_down(offset, block_bytes(c)); loff_t block_end = round_up(end, block_bytes(c)); unsigned sectors; - unsigned replicas = io_opts(c, inode).data_replicas; + unsigned replicas = io_opts(c, &inode->ei_inode).data_replicas; int ret; bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); inode_lock(&inode->v); inode_dio_wait(&inode->v); - pagecache_block_get(&mapping->add_lock); + bch2_pagecache_block_get(&inode->ei_pagecache_lock); if (!(mode & FALLOC_FL_KEEP_SIZE) && end > inode->v.i_size) { ret = inode_newsize_ok(&inode->v, end); @@ -2896,6 +2663,7 @@ static long bch2_fallocate(struct bch_inode_info *inode, int mode, end_pos = POS(inode->v.i_ino, block_end >> 9); while (bkey_cmp(iter->pos, end_pos) < 0) { + s64 i_sectors_delta = 0; struct disk_reservation disk_res = { 0 }; struct quota_res quota_res = { 0 }; struct bkey_i_reservation reservation; @@ -2923,11 +2691,11 @@ static long bch2_fallocate(struct bch_inode_info *inode, int mode, reservation.k.p = k.k->p; reservation.k.size = k.k->size; - bch2_cut_front(iter->pos, &reservation.k_i); - bch2_cut_back(end_pos, &reservation.k); + bch2_cut_front(iter->pos, &reservation.k_i); + bch2_cut_back(end_pos, &reservation.k_i); sectors = reservation.k.size; - reservation.v.nr_replicas = bch2_bkey_nr_dirty_ptrs(k); + reservation.v.nr_replicas = bch2_bkey_nr_ptrs_allocated(k); if (!bkey_extent_is_allocation(k.k)) { ret = bch2_quota_reservation_add(c, inode, @@ -2938,7 +2706,7 @@ static long bch2_fallocate(struct bch_inode_info *inode, int mode, } if (reservation.v.nr_replicas < replicas || - bch2_extent_is_compressed(k)) { + bch2_bkey_sectors_compressed(k)) { ret = bch2_disk_reservation_get(c, &disk_res, sectors, replicas, 0); if (unlikely(ret)) @@ -2949,10 +2717,10 @@ static long bch2_fallocate(struct bch_inode_info *inode, int mode, bch2_trans_begin_updates(&trans); - ret = bch2_extent_update(&trans, inode, - &disk_res, "a_res, - iter, &reservation.k_i, - 0, true, true, NULL); + ret = bch2_extent_update(&trans, iter, &reservation.k_i, + &disk_res, &inode->ei_journal_seq, + 0, &i_sectors_delta); + i_sectors_acct(c, inode, "a_res, i_sectors_delta); bkey_err: bch2_quota_reservation_put(c, inode, "a_res); bch2_disk_reservation_put(c, &disk_res); @@ -2961,37 +2729,53 @@ bkey_err: if (ret) goto err; } - bch2_trans_unlock(&trans); - if (!(mode & FALLOC_FL_KEEP_SIZE) && - end > inode->v.i_size) { - i_size_write(&inode->v, end); + /* + * Do we need to extend the file? + * + * If we zeroed up to the end of the file, we dropped whatever writes + * were going to write out the current i_size, so we have to extend + * manually even if FL_KEEP_SIZE was set: + */ + if (end >= inode->v.i_size && + (!(mode & FALLOC_FL_KEEP_SIZE) || + (mode & FALLOC_FL_ZERO_RANGE))) { + struct btree_iter *inode_iter; + struct bch_inode_unpacked inode_u; - mutex_lock(&inode->ei_update_lock); - ret = bch2_write_inode_size(c, inode, inode->v.i_size, 0); - mutex_unlock(&inode->ei_update_lock); - } + do { + bch2_trans_begin(&trans); + inode_iter = bch2_inode_peek(&trans, &inode_u, + inode->v.i_ino, 0); + ret = PTR_ERR_OR_ZERO(inode_iter); + } while (ret == -EINTR); - /* blech */ - if ((mode & FALLOC_FL_KEEP_SIZE) && - (mode & FALLOC_FL_ZERO_RANGE) && - inode->ei_inode.bi_size != inode->v.i_size) { - /* sync appends.. 
*/ + bch2_trans_unlock(&trans); + + if (ret) + goto err; + + /* + * Sync existing appends before extending i_size, + * as in bch2_extend(): + */ ret = filemap_write_and_wait_range(mapping, - inode->ei_inode.bi_size, S64_MAX); + inode_u.bi_size, S64_MAX); if (ret) goto err; - if (inode->ei_inode.bi_size != inode->v.i_size) { - mutex_lock(&inode->ei_update_lock); - ret = bch2_write_inode_size(c, inode, - inode->v.i_size, 0); - mutex_unlock(&inode->ei_update_lock); - } + if (mode & FALLOC_FL_KEEP_SIZE) + end = inode->v.i_size; + else + i_size_write(&inode->v, end); + + mutex_lock(&inode->ei_update_lock); + ret = bch2_write_inode_size(c, inode, end, 0); + mutex_unlock(&inode->ei_update_lock); } err: bch2_trans_exit(&trans); - pagecache_block_put(&mapping->add_lock); + bch2_pagecache_block_put(&inode->ei_pagecache_lock); inode_unlock(&inode->v); return ret; } @@ -3000,20 +2784,26 @@ long bch2_fallocate_dispatch(struct file *file, int mode, loff_t offset, loff_t len) { struct bch_inode_info *inode = file_bch_inode(file); + struct bch_fs *c = inode->v.i_sb->s_fs_info; + long ret; - if (!(mode & ~(FALLOC_FL_KEEP_SIZE|FALLOC_FL_ZERO_RANGE))) - return bch2_fallocate(inode, mode, offset, len); - - if (mode == (FALLOC_FL_PUNCH_HOLE|FALLOC_FL_KEEP_SIZE)) - return bch2_fpunch(inode, offset, len); + if (!percpu_ref_tryget(&c->writes)) + return -EROFS; - if (mode == FALLOC_FL_INSERT_RANGE) - return bch2_fcollapse_finsert(inode, offset, len, true); + if (!(mode & ~(FALLOC_FL_KEEP_SIZE|FALLOC_FL_ZERO_RANGE))) + ret = bchfs_fallocate(inode, mode, offset, len); + else if (mode == (FALLOC_FL_PUNCH_HOLE|FALLOC_FL_KEEP_SIZE)) + ret = bchfs_fpunch(inode, offset, len); + else if (mode == FALLOC_FL_INSERT_RANGE) + ret = bchfs_fcollapse_finsert(inode, offset, len, true); + else if (mode == FALLOC_FL_COLLAPSE_RANGE) + ret = bchfs_fcollapse_finsert(inode, offset, len, false); + else + ret = -EOPNOTSUPP; - if (mode == FALLOC_FL_COLLAPSE_RANGE) - return bch2_fcollapse_finsert(inode, offset, len, false); + percpu_ref_put(&c->writes); - return -EOPNOTSUPP; + return ret; } static void mark_range_unallocated(struct bch_inode_info *inode, @@ -3040,9 +2830,12 @@ static void mark_range_unallocated(struct bch_inode_info *inode, lock_page(page); s = bch2_page_state(page); - if (s) + if (s) { + spin_lock(&s->lock); for (j = 0; j < PAGE_SECTORS; j++) s->s[j].nr_replicas = 0; + spin_unlock(&s->lock); + } unlock_page(page); } @@ -3050,235 +2843,6 @@ static void mark_range_unallocated(struct bch_inode_info *inode, } while (index <= end_index); } -static int generic_access_check_limits(struct file *file, loff_t pos, - loff_t *count) -{ - struct inode *inode = file->f_mapping->host; - loff_t max_size = inode->i_sb->s_maxbytes; - - if (!(file->f_flags & O_LARGEFILE)) - max_size = MAX_NON_LFS; - - if (unlikely(pos >= max_size)) - return -EFBIG; - *count = min(*count, max_size - pos); - return 0; -} - -static int generic_write_check_limits(struct file *file, loff_t pos, - loff_t *count) -{ - loff_t limit = rlimit(RLIMIT_FSIZE); - - if (limit != RLIM_INFINITY) { - if (pos >= limit) { - send_sig(SIGXFSZ, current, 0); - return -EFBIG; - } - *count = min(*count, limit - pos); - } - - return generic_access_check_limits(file, pos, count); -} - -static int generic_remap_checks(struct file *file_in, loff_t pos_in, - struct file *file_out, loff_t pos_out, - loff_t *req_count, unsigned int remap_flags) -{ - struct inode *inode_in = file_in->f_mapping->host; - struct inode *inode_out = file_out->f_mapping->host; - uint64_t count = *req_count; - 
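/*
 * Wrapping the dispatch in percpu_ref_tryget(&c->writes) makes the
 * -EROFS check race-free: the tryget fails once the filesystem has
 * started going read-only, and holding the ref blocks that transition
 * until the operation puts it back.  The guard pattern on its own:
 */
if (!percpu_ref_tryget(&c->writes))
	return -EROFS;	/* already going (or gone) read-only */

/* ... the fallocate/fpunch/insert/collapse work runs here ... */

percpu_ref_put(&c->writes);	/* lets the read-only transition proceed */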
uint64_t bcount; - loff_t size_in, size_out; - loff_t bs = inode_out->i_sb->s_blocksize; - int ret; - - /* The start of both ranges must be aligned to an fs block. */ - if (!IS_ALIGNED(pos_in, bs) || !IS_ALIGNED(pos_out, bs)) - return -EINVAL; - - /* Ensure offsets don't wrap. */ - if (pos_in + count < pos_in || pos_out + count < pos_out) - return -EINVAL; - - size_in = i_size_read(inode_in); - size_out = i_size_read(inode_out); - - /* Dedupe requires both ranges to be within EOF. */ - if ((remap_flags & REMAP_FILE_DEDUP) && - (pos_in >= size_in || pos_in + count > size_in || - pos_out >= size_out || pos_out + count > size_out)) - return -EINVAL; - - /* Ensure the infile range is within the infile. */ - if (pos_in >= size_in) - return -EINVAL; - count = min(count, size_in - (uint64_t)pos_in); - - ret = generic_access_check_limits(file_in, pos_in, &count); - if (ret) - return ret; - - ret = generic_write_check_limits(file_out, pos_out, &count); - if (ret) - return ret; - - /* - * If the user wanted us to link to the infile's EOF, round up to the - * next block boundary for this check. - * - * Otherwise, make sure the count is also block-aligned, having - * already confirmed the starting offsets' block alignment. - */ - if (pos_in + count == size_in) { - bcount = ALIGN(size_in, bs) - pos_in; - } else { - if (!IS_ALIGNED(count, bs)) - count = ALIGN_DOWN(count, bs); - bcount = count; - } - - /* Don't allow overlapped cloning within the same file. */ - if (inode_in == inode_out && - pos_out + bcount > pos_in && - pos_out < pos_in + bcount) - return -EINVAL; - - /* - * We shortened the request but the caller can't deal with that, so - * bounce the request back to userspace. - */ - if (*req_count != count && !(remap_flags & REMAP_FILE_CAN_SHORTEN)) - return -EINVAL; - - *req_count = count; - return 0; -} - -static int generic_remap_check_len(struct inode *inode_in, - struct inode *inode_out, - loff_t pos_out, - loff_t *len, - unsigned int remap_flags) -{ - u64 blkmask = i_blocksize(inode_in) - 1; - loff_t new_len = *len; - - if ((*len & blkmask) == 0) - return 0; - - if ((remap_flags & REMAP_FILE_DEDUP) || - pos_out + *len < i_size_read(inode_out)) - new_len &= ~blkmask; - - if (new_len == *len) - return 0; - - if (remap_flags & REMAP_FILE_CAN_SHORTEN) { - *len = new_len; - return 0; - } - - return (remap_flags & REMAP_FILE_DEDUP) ? -EBADE : -EINVAL; -} - -static int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in, - struct file *file_out, loff_t pos_out, - loff_t *len, unsigned int remap_flags) -{ - struct inode *inode_in = file_inode(file_in); - struct inode *inode_out = file_inode(file_out); - bool same_inode = (inode_in == inode_out); - int ret; - - /* Don't touch certain kinds of inodes */ - if (IS_IMMUTABLE(inode_out)) - return -EPERM; - - if (IS_SWAPFILE(inode_in) || IS_SWAPFILE(inode_out)) - return -ETXTBSY; - - /* Don't reflink dirs, pipes, sockets... */ - if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode)) - return -EISDIR; - if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode)) - return -EINVAL; - - /* Zero length dedupe exits immediately; reflink goes to EOF. */ - if (*len == 0) { - loff_t isize = i_size_read(inode_in); - - if ((remap_flags & REMAP_FILE_DEDUP) || pos_in == isize) - return 0; - if (pos_in > isize) - return -EINVAL; - *len = isize - pos_in; - if (*len == 0) - return 0; - } - - /* Check that we don't violate system file offset limits. 
*/ - ret = generic_remap_checks(file_in, pos_in, file_out, pos_out, len, - remap_flags); - if (ret) - return ret; - - /* Wait for the completion of any pending IOs on both files */ - inode_dio_wait(inode_in); - if (!same_inode) - inode_dio_wait(inode_out); - - ret = filemap_write_and_wait_range(inode_in->i_mapping, - pos_in, pos_in + *len - 1); - if (ret) - return ret; - - ret = filemap_write_and_wait_range(inode_out->i_mapping, - pos_out, pos_out + *len - 1); - if (ret) - return ret; - - /* - * Check that the extents are the same. - */ - if (remap_flags & REMAP_FILE_DEDUP) { - bool is_same = false; - - ret = vfs_dedupe_file_range_compare(inode_in, pos_in, - inode_out, pos_out, *len, &is_same); - if (ret) - return ret; - if (!is_same) - return -EBADE; - } - - ret = generic_remap_check_len(inode_in, inode_out, pos_out, len, - remap_flags); - if (ret) - return ret; - - /* If can't alter the file contents, we're done. */ - if (!(remap_flags & REMAP_FILE_DEDUP)) { - /* Update the timestamps, since we can alter file contents. */ - if (!(file_out->f_mode & FMODE_NOCMTIME)) { - ret = file_update_time(file_out); - if (ret) - return ret; - } - - /* - * Clear the security bits if the process is not being run by - * root. This keeps people from modifying setuid and setgid - * binaries. - */ - ret = file_remove_privs(file_out); - if (ret) - return ret; - } - - return 0; -} - loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src, struct file *file_dst, loff_t pos_dst, loff_t len, unsigned remap_flags) @@ -3286,8 +2850,9 @@ loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src, struct bch_inode_info *src = file_bch_inode(file_src); struct bch_inode_info *dst = file_bch_inode(file_dst); struct bch_fs *c = src->v.i_sb->s_fs_info; + s64 i_sectors_delta = 0; + u64 aligned_len; loff_t ret = 0; - loff_t aligned_len; if (remap_flags & ~(REMAP_FILE_DEDUP|REMAP_FILE_ADVISORY)) return -EINVAL; @@ -3303,42 +2868,51 @@ loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src, abs(pos_src - pos_dst) < len) return -EINVAL; - bch2_lock_inodes(INODE_LOCK, src, dst); + bch2_lock_inodes(INODE_LOCK|INODE_PAGECACHE_BLOCK, src, dst); + + file_update_time(file_dst); inode_dio_wait(&src->v); inode_dio_wait(&dst->v); - __pagecache_block_get(&src->v.i_mapping->add_lock); - __pagecache_block_get(&dst->v.i_mapping->add_lock); - ret = generic_remap_file_range_prep(file_src, pos_src, file_dst, pos_dst, &len, remap_flags); if (ret < 0 || len == 0) - goto out_unlock; + goto err; - aligned_len = round_up(len, block_bytes(c)); + aligned_len = round_up((u64) len, block_bytes(c)); ret = write_invalidate_inode_pages_range(dst->v.i_mapping, - pos_dst, pos_dst + aligned_len); + pos_dst, pos_dst + len - 1); if (ret) - goto out_unlock; + goto err; mark_range_unallocated(src, pos_src, pos_src + aligned_len); - ret = bch2_remap_range(c, dst, + ret = bch2_remap_range(c, POS(dst->v.i_ino, pos_dst >> 9), POS(src->v.i_ino, pos_src >> 9), aligned_len >> 9, - pos_dst + len); - if (ret > 0) - ret = min(ret << 9, len); + &dst->ei_journal_seq, + pos_dst + len, &i_sectors_delta); + if (ret < 0) + goto err; -out_unlock: - __pagecache_block_put(&dst->v.i_mapping->add_lock); - __pagecache_block_put(&src->v.i_mapping->add_lock); + /* + * due to alignment, we might have remapped slightly more than requested + */ + ret = min((u64) ret << 9, (u64) len); + + /* XXX get a quota reservation */ + i_sectors_acct(c, dst, NULL, i_sectors_delta); - bch2_unlock_inodes(INODE_LOCK, src, dst); + spin_lock(&dst->v.i_lock); + if (pos_dst + 
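/*
 * Worked example for the alignment note above (illustrative numbers,
 * not from the patch): with 4096-byte blocks, a 3000-byte request gives
 * aligned_len == 4096, so bch2_remap_range() is asked for 8 sectors;
 * ret = min((u64) 8 << 9, (u64) 3000) then trims the result back to the
 * 3000 bytes the caller actually asked for.
 */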
ret > dst->v.i_size) + i_size_write(&dst->v, pos_dst + ret); + spin_unlock(&dst->v.i_lock); +err: + bch2_unlock_inodes(INODE_LOCK|INODE_PAGECACHE_BLOCK, src, dst); return ret; } @@ -3461,7 +3035,7 @@ static loff_t page_hole_offset(struct address_space *mapping, loff_t offset) loff_t ret = -1; page = find_lock_entry(mapping, index); - if (!page || radix_tree_exception(page)) + if (!page || xa_is_value(page)) return offset; pg_offset = __page_hole_offset(page, offset & (PAGE_SIZE - 1)); @@ -3567,13 +3141,13 @@ int bch2_fs_fsio_init(struct bch_fs *c) pr_verbose_init(c->opts, ""); if (bioset_init(&c->writepage_bioset, - 4, offsetof(struct bch_writepage_io, op.op.wbio.bio), + 4, offsetof(struct bch_writepage_io, op.wbio.bio), BIOSET_NEED_BVECS) || bioset_init(&c->dio_read_bioset, 4, offsetof(struct dio_read, rbio.bio), BIOSET_NEED_BVECS) || bioset_init(&c->dio_write_bioset, - 4, offsetof(struct dio_write, iop.op.wbio.bio), + 4, offsetof(struct dio_write, op.wbio.bio), BIOSET_NEED_BVECS)) ret = -ENOMEM; diff --git a/fs/bcachefs/fs-io.h b/fs/bcachefs/fs-io.h index b0cbc849a59b..7063556d289b 100644 --- a/fs/bcachefs/fs-io.h +++ b/fs/bcachefs/fs-io.h @@ -11,16 +11,6 @@ struct quota_res; -int bch2_extent_update(struct btree_trans *, - struct bch_inode_info *, - struct disk_reservation *, - struct quota_res *, - struct btree_iter *, - struct bkey_i *, - u64, bool, bool, s64 *); -int bch2_fpunch_at(struct btree_trans *, struct btree_iter *, - struct bpos, struct bch_inode_info *, u64); - int __must_check bch2_write_inode_size(struct bch_fs *, struct bch_inode_info *, loff_t, unsigned); @@ -37,8 +27,7 @@ int bch2_write_begin(struct file *, struct address_space *, loff_t, int bch2_write_end(struct file *, struct address_space *, loff_t, unsigned, unsigned, struct page *, void *); -ssize_t bch2_direct_IO(struct kiocb *, struct iov_iter *); - +ssize_t bch2_read_iter(struct kiocb *, struct iov_iter *); ssize_t bch2_write_iter(struct kiocb *, struct iov_iter *); int bch2_fsync(struct file *, loff_t, loff_t, int); @@ -46,15 +35,12 @@ int bch2_fsync(struct file *, loff_t, loff_t, int); int bch2_truncate(struct bch_inode_info *, struct iattr *); long bch2_fallocate_dispatch(struct file *, int, loff_t, loff_t); -#define REMAP_FILE_ADVISORY (0) -#define REMAP_FILE_DEDUP (1 << 0) -#define REMAP_FILE_CAN_SHORTEN (1 << 1) - loff_t bch2_remap_file_range(struct file *, loff_t, struct file *, loff_t, loff_t, unsigned); loff_t bch2_llseek(struct file *, loff_t, int); +vm_fault_t bch2_page_fault(struct vm_fault *); vm_fault_t bch2_page_mkwrite(struct vm_fault *); void bch2_invalidatepage(struct page *, unsigned int, unsigned int); int bch2_releasepage(struct page *, gfp_t); diff --git a/fs/bcachefs/fs-ioctl.c b/fs/bcachefs/fs-ioctl.c index e80576f5a980..031e6d931171 100644 --- a/fs/bcachefs/fs-ioctl.c +++ b/fs/bcachefs/fs-ioctl.c @@ -5,6 +5,7 @@ #include "chardev.h" #include "dirent.h" #include "fs.h" +#include "fs-common.h" #include "fs-ioctl.h" #include "quota.h" @@ -164,6 +165,15 @@ err: return ret; } +static int bch2_reinherit_attrs_fn(struct bch_inode_info *inode, + struct bch_inode_unpacked *bi, + void *p) +{ + struct bch_inode_info *dir = p; + + return !bch2_reinherit_attrs(bi, &dir->ei_inode); +} + static int bch2_ioc_reinherit_attrs(struct bch_fs *c, struct file *file, struct bch_inode_info *src, diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 83bad578ad0f..6fc6d504b094 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -3,12 +3,14 @@ #include "bcachefs.h" #include "acl.h" +#include 
"bkey_on_stack.h" #include "btree_update.h" #include "buckets.h" #include "chardev.h" #include "dirent.h" #include "extents.h" #include "fs.h" +#include "fs-common.h" #include "fs-io.h" #include "fs-ioctl.h" #include "fsck.h" @@ -48,42 +50,59 @@ static void journal_seq_copy(struct bch_inode_info *dst, } while ((v = cmpxchg(&dst->ei_journal_seq, old, journal_seq)) != old); } -/* - * I_SIZE_DIRTY requires special handling: - * - * To the recovery code, the flag means that there is stale data past i_size - * that needs to be deleted; it's used for implementing atomic appends and - * truncates. - * - * On append, we set I_SIZE_DIRTY before doing the write, then after the write - * we clear I_SIZE_DIRTY atomically with updating i_size to the new larger size - * that exposes the data we just wrote. - * - * On truncate, it's the reverse: We set I_SIZE_DIRTY atomically with setting - * i_size to the new smaller size, then we delete the data that we just made - * invisible, and then we clear I_SIZE_DIRTY. - * - * Because there can be multiple appends in flight at a time, we need a refcount - * (i_size_dirty_count) instead of manipulating the flag directly. Nonzero - * refcount means I_SIZE_DIRTY is set, zero means it's cleared. - * - * Because write_inode() can be called at any time, i_size_dirty_count means - * something different to the runtime code - it means to write_inode() "don't - * update i_size yet". - * - * We don't clear I_SIZE_DIRTY directly, we let write_inode() clear it when - * i_size_dirty_count is zero - but the reverse is not true, I_SIZE_DIRTY must - * be set explicitly. - */ +static void __pagecache_lock_put(struct pagecache_lock *lock, long i) +{ + BUG_ON(atomic_long_read(&lock->v) == 0); + + if (atomic_long_sub_return_release(i, &lock->v) == 0) + wake_up_all(&lock->wait); +} + +static bool __pagecache_lock_tryget(struct pagecache_lock *lock, long i) +{ + long v = atomic_long_read(&lock->v), old; + + do { + old = v; + + if (i > 0 ? v < 0 : v > 0) + return false; + } while ((v = atomic_long_cmpxchg_acquire(&lock->v, + old, old + i)) != old); + return true; +} + +static void __pagecache_lock_get(struct pagecache_lock *lock, long i) +{ + wait_event(lock->wait, __pagecache_lock_tryget(lock, i)); +} + +void bch2_pagecache_add_put(struct pagecache_lock *lock) +{ + __pagecache_lock_put(lock, 1); +} + +void bch2_pagecache_add_get(struct pagecache_lock *lock) +{ + __pagecache_lock_get(lock, 1); +} + +void bch2_pagecache_block_put(struct pagecache_lock *lock) +{ + __pagecache_lock_put(lock, -1); +} + +void bch2_pagecache_block_get(struct pagecache_lock *lock) +{ + __pagecache_lock_get(lock, -1); +} void bch2_inode_update_after_write(struct bch_fs *c, struct bch_inode_info *inode, struct bch_inode_unpacked *bi, unsigned fields) { - set_nlink(&inode->v, bi->bi_flags & BCH_INODE_UNLINKED - ? 
0 - : bi->bi_nlink + nlink_bias(inode->v.i_mode)); + set_nlink(&inode->v, bch2_inode_nlink_get(bi)); i_uid_write(&inode->v, bi->bi_uid); i_gid_write(&inode->v, bi->bi_gid); inode->v.i_mode = bi->bi_mode; @@ -100,68 +119,13 @@ void bch2_inode_update_after_write(struct bch_fs *c, bch2_inode_flags_to_vfs(inode); } -int __must_check bch2_write_inode_trans(struct btree_trans *trans, - struct bch_inode_info *inode, - struct bch_inode_unpacked *inode_u, - inode_set_fn set, - void *p) -{ - struct bch_fs *c = trans->c; - struct btree_iter *iter = NULL; - struct bkey_inode_buf *inode_p; - int ret; - - lockdep_assert_held(&inode->ei_update_lock); - - if (c->opts.new_inode_updates) { - /* XXX: Don't do this with btree locks held */ - if (!inode->ei_inode_update) - inode->ei_inode_update = - bch2_deferred_update_alloc(c, BTREE_ID_INODES, 64); - } else { - iter = bch2_trans_get_iter(trans, BTREE_ID_INODES, - POS(inode->v.i_ino, 0), - BTREE_ITER_SLOTS|BTREE_ITER_INTENT); - if (IS_ERR(iter)) - return PTR_ERR(iter); - - /* The btree node lock is our lock on the inode: */ - ret = bch2_btree_iter_traverse(iter); - if (ret) - return ret; - } - - *inode_u = inode->ei_inode; - - if (set) { - ret = set(inode, inode_u, p); - if (ret) - return ret; - } - - inode_p = bch2_trans_kmalloc(trans, sizeof(*inode_p)); - if (IS_ERR(inode_p)) - return PTR_ERR(inode_p); - - bch2_inode_pack(inode_p, inode_u); - - if (!inode->ei_inode_update) - bch2_trans_update(trans, - BTREE_INSERT_ENTRY(iter, &inode_p->inode.k_i)); - else - bch2_trans_update(trans, - BTREE_INSERT_DEFERRED(inode->ei_inode_update, - &inode_p->inode.k_i)); - - return 0; -} - int __must_check bch2_write_inode(struct bch_fs *c, struct bch_inode_info *inode, inode_set_fn set, void *p, unsigned fields) { struct btree_trans trans; + struct btree_iter *iter; struct bch_inode_unpacked inode_u; int ret; @@ -169,7 +133,11 @@ int __must_check bch2_write_inode(struct bch_fs *c, retry: bch2_trans_begin(&trans); - ret = bch2_write_inode_trans(&trans, inode, &inode_u, set, p) ?: + iter = bch2_inode_peek(&trans, &inode_u, inode->v.i_ino, + BTREE_ITER_INTENT); + ret = PTR_ERR_OR_ZERO(iter) ?: + (set ? 
set(inode, &inode_u, p) : 0) ?: + bch2_inode_write(&trans, iter, &inode_u) ?: bch2_trans_commit(&trans, NULL, &inode->ei_journal_seq, BTREE_INSERT_ATOMIC| @@ -224,32 +192,6 @@ int bch2_fs_quota_transfer(struct bch_fs *c, return ret; } -int bch2_reinherit_attrs_fn(struct bch_inode_info *inode, - struct bch_inode_unpacked *bi, - void *p) -{ - struct bch_inode_info *dir = p; - u64 src, dst; - unsigned id; - int ret = 1; - - for (id = 0; id < Inode_opt_nr; id++) { - if (bi->bi_fields_set & (1 << id)) - continue; - - src = bch2_inode_opt_get(&dir->ei_inode, id); - dst = bch2_inode_opt_get(bi, id); - - if (src == dst) - continue; - - bch2_inode_opt_set(bi, id, src); - ret = 0; - } - - return ret; -} - struct inode *bch2_vfs_inode_get(struct bch_fs *c, u64 inum) { struct bch_inode_unpacked inode_u; @@ -277,82 +219,37 @@ struct inode *bch2_vfs_inode_get(struct bch_fs *c, u64 inum) return &inode->v; } -static void bch2_inode_init_owner(struct bch_inode_unpacked *inode_u, - const struct inode *dir, umode_t mode) -{ - kuid_t uid = current_fsuid(); - kgid_t gid; - - if (dir && dir->i_mode & S_ISGID) { - gid = dir->i_gid; - if (S_ISDIR(mode)) - mode |= S_ISGID; - } else - gid = current_fsgid(); - - inode_u->bi_uid = from_kuid(dir->i_sb->s_user_ns, uid); - inode_u->bi_gid = from_kgid(dir->i_sb->s_user_ns, gid); - inode_u->bi_mode = mode; -} - -static int inode_update_for_create_fn(struct bch_inode_info *inode, - struct bch_inode_unpacked *bi, - void *p) -{ - struct bch_fs *c = inode->v.i_sb->s_fs_info; - struct bch_inode_unpacked *new_inode = p; - - bi->bi_mtime = bi->bi_ctime = bch2_current_time(c); - - if (S_ISDIR(new_inode->bi_mode)) - bi->bi_nlink++; - - return 0; -} - static struct bch_inode_info * __bch2_create(struct bch_inode_info *dir, struct dentry *dentry, umode_t mode, dev_t rdev, bool tmpfile) { struct bch_fs *c = dir->v.i_sb->s_fs_info; + struct user_namespace *ns = dir->v.i_sb->s_user_ns; struct btree_trans trans; struct bch_inode_unpacked dir_u; struct bch_inode_info *inode, *old; struct bch_inode_unpacked inode_u; - struct bch_hash_info hash_info; struct posix_acl *default_acl = NULL, *acl = NULL; u64 journal_seq = 0; int ret; - bch2_inode_init(c, &inode_u, 0, 0, 0, rdev, &dir->ei_inode); - bch2_inode_init_owner(&inode_u, &dir->v, mode); - - hash_info = bch2_hash_info_init(c, &inode_u); - - if (tmpfile) - inode_u.bi_flags |= BCH_INODE_UNLINKED; - - ret = bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, 1, - KEY_TYPE_QUOTA_PREALLOC); - if (ret) - return ERR_PTR(ret); - + /* + * preallocate acls + vfs inode before btree transaction, so that + * nothing can fail after the transaction succeeds: + */ #ifdef CONFIG_BCACHEFS_POSIX_ACL - ret = posix_acl_create(&dir->v, &inode_u.bi_mode, &default_acl, &acl); + ret = posix_acl_create(&dir->v, &mode, &default_acl, &acl); if (ret) - goto err; + return ERR_PTR(ret); #endif - - /* - * preallocate vfs inode before btree transaction, so that nothing can - * fail after the transaction succeeds: - */ inode = to_bch_ei(new_inode(c->vfs_sb)); if (unlikely(!inode)) { - ret = -ENOMEM; + inode = ERR_PTR(-ENOMEM); goto err; } + bch2_inode_init_early(c, &inode_u); + if (!tmpfile) mutex_lock(&dir->ei_update_lock); @@ -360,38 +257,28 @@ __bch2_create(struct bch_inode_info *dir, struct dentry *dentry, retry: bch2_trans_begin(&trans); - ret = __bch2_inode_create(&trans, &inode_u, - BLOCKDEV_INODE_MAX, 0, - &c->unused_inode_hint) ?: - (default_acl - ? bch2_set_acl_trans(&trans, &inode_u, &hash_info, - default_acl, ACL_TYPE_DEFAULT) - : 0) ?: - (acl - ? 
bch2_set_acl_trans(&trans, &inode_u, &hash_info, - acl, ACL_TYPE_ACCESS) - : 0) ?: - (!tmpfile - ? __bch2_dirent_create(&trans, dir->v.i_ino, - &dir->ei_str_hash, - mode_to_type(mode), - &dentry->d_name, - inode_u.bi_inum, - BCH_HASH_SET_MUST_CREATE) - : 0) ?: - (!tmpfile - ? bch2_write_inode_trans(&trans, dir, &dir_u, - inode_update_for_create_fn, - &inode_u) - : 0) ?: - bch2_trans_commit(&trans, NULL, - &journal_seq, + ret = bch2_create_trans(&trans, dir->v.i_ino, &dir_u, &inode_u, + !tmpfile ? &dentry->d_name : NULL, + from_kuid(ns, current_fsuid()), + from_kgid(ns, current_fsgid()), + mode, rdev, + default_acl, acl) ?: + bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, 1, + KEY_TYPE_QUOTA_PREALLOC); + if (unlikely(ret)) + goto err_before_quota; + + ret = bch2_trans_commit(&trans, NULL, &journal_seq, BTREE_INSERT_ATOMIC| BTREE_INSERT_NOUNLOCK); - if (ret == -EINTR) - goto retry; - if (unlikely(ret)) + if (unlikely(ret)) { + bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, -1, + KEY_TYPE_QUOTA_WARN); +err_before_quota: + if (ret == -EINTR) + goto retry; goto err_trans; + } if (!tmpfile) { bch2_inode_update_after_write(c, dir, &dir_u, @@ -418,7 +305,7 @@ retry: * We raced, another process pulled the new inode into cache * before us: */ - old->ei_journal_seq = inode->ei_journal_seq; + journal_seq_copy(old, journal_seq); make_bad_inode(&inode->v); iput(&inode->v); @@ -432,7 +319,7 @@ retry: } bch2_trans_exit(&trans); -out: +err: posix_acl_release(default_acl); posix_acl_release(acl); return inode; @@ -443,10 +330,8 @@ err_trans: bch2_trans_exit(&trans); make_bad_inode(&inode->v); iput(&inode->v); -err: - bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, -1, KEY_TYPE_QUOTA_WARN); inode = ERR_PTR(ret); - goto out; + goto err; } /* methods */ @@ -469,11 +354,11 @@ static struct dentry *bch2_lookup(struct inode *vdir, struct dentry *dentry, return d_splice_alias(vinode, dentry); } -static int bch2_create(struct inode *vdir, struct dentry *dentry, - umode_t mode, bool excl) +static int bch2_mknod(struct inode *vdir, struct dentry *dentry, + umode_t mode, dev_t rdev) { struct bch_inode_info *inode = - __bch2_create(to_bch_ei(vdir), dentry, mode|S_IFREG, 0, false); + __bch2_create(to_bch_ei(vdir), dentry, mode, rdev, false); if (IS_ERR(inode)) return PTR_ERR(inode); @@ -482,20 +367,10 @@ static int bch2_create(struct inode *vdir, struct dentry *dentry, return 0; } -static int inode_update_for_link_fn(struct bch_inode_info *inode, - struct bch_inode_unpacked *bi, - void *p) +static int bch2_create(struct inode *vdir, struct dentry *dentry, + umode_t mode, bool excl) { - struct bch_fs *c = inode->v.i_sb->s_fs_info; - - bi->bi_ctime = bch2_current_time(c); - - if (bi->bi_flags & BCH_INODE_UNLINKED) - bi->bi_flags &= ~BCH_INODE_UNLINKED; - else - bi->bi_nlink++; - - return 0; + return bch2_mknod(vdir, dentry, mode|S_IFREG, 0); } static int __bch2_link(struct bch_fs *c, @@ -509,25 +384,18 @@ static int __bch2_link(struct bch_fs *c, mutex_lock(&inode->ei_update_lock); bch2_trans_init(&trans, c, 4, 1024); -retry: - bch2_trans_begin(&trans); - ret = __bch2_dirent_create(&trans, dir->v.i_ino, - &dir->ei_str_hash, - mode_to_type(inode->v.i_mode), - &dentry->d_name, - inode->v.i_ino, - BCH_HASH_SET_MUST_CREATE) ?: - bch2_write_inode_trans(&trans, inode, &inode_u, - inode_update_for_link_fn, - NULL) ?: - bch2_trans_commit(&trans, NULL, - &inode->ei_journal_seq, - BTREE_INSERT_ATOMIC| - BTREE_INSERT_NOUNLOCK); - - if (ret == -EINTR) - goto retry; + do { + bch2_trans_begin(&trans); + ret = bch2_link_trans(&trans, + 
dir->v.i_ino, + inode->v.i_ino, &inode_u, + &dentry->d_name) ?: + bch2_trans_commit(&trans, NULL, + &inode->ei_journal_seq, + BTREE_INSERT_ATOMIC| + BTREE_INSERT_NOUNLOCK); + } while (ret == -EINTR); if (likely(!ret)) bch2_inode_update_after_write(c, inode, &inode_u, ATTR_CTIME); @@ -556,35 +424,6 @@ static int bch2_link(struct dentry *old_dentry, struct inode *vdir, return 0; } -static int inode_update_dir_for_unlink_fn(struct bch_inode_info *inode, - struct bch_inode_unpacked *bi, - void *p) -{ - struct bch_fs *c = inode->v.i_sb->s_fs_info; - struct bch_inode_info *unlink_inode = p; - - bi->bi_mtime = bi->bi_ctime = bch2_current_time(c); - - bi->bi_nlink -= S_ISDIR(unlink_inode->v.i_mode); - - return 0; -} - -static int inode_update_for_unlink_fn(struct bch_inode_info *inode, - struct bch_inode_unpacked *bi, - void *p) -{ - struct bch_fs *c = inode->v.i_sb->s_fs_info; - - bi->bi_ctime = bch2_current_time(c); - if (bi->bi_nlink) - bi->bi_nlink--; - else - bi->bi_flags |= BCH_INODE_UNLINKED; - - return 0; -} - static int bch2_unlink(struct inode *vdir, struct dentry *dentry) { struct bch_fs *c = vdir->i_sb->s_fs_info; @@ -596,36 +435,30 @@ static int bch2_unlink(struct inode *vdir, struct dentry *dentry) bch2_lock_inodes(INODE_UPDATE_LOCK, dir, inode); bch2_trans_init(&trans, c, 4, 1024); -retry: - bch2_trans_begin(&trans); - - ret = __bch2_dirent_delete(&trans, dir->v.i_ino, - &dir->ei_str_hash, - &dentry->d_name) ?: - bch2_write_inode_trans(&trans, dir, &dir_u, - inode_update_dir_for_unlink_fn, - inode) ?: - bch2_write_inode_trans(&trans, inode, &inode_u, - inode_update_for_unlink_fn, - NULL) ?: - bch2_trans_commit(&trans, NULL, - &dir->ei_journal_seq, - BTREE_INSERT_ATOMIC| - BTREE_INSERT_NOUNLOCK| - BTREE_INSERT_NOFAIL); - if (ret == -EINTR) - goto retry; - if (ret) - goto err; - if (dir->ei_journal_seq > inode->ei_journal_seq) - inode->ei_journal_seq = dir->ei_journal_seq; + do { + bch2_trans_begin(&trans); + + ret = bch2_unlink_trans(&trans, + dir->v.i_ino, &dir_u, + &inode_u, &dentry->d_name) ?: + bch2_trans_commit(&trans, NULL, + &dir->ei_journal_seq, + BTREE_INSERT_ATOMIC| + BTREE_INSERT_NOUNLOCK| + BTREE_INSERT_NOFAIL); + } while (ret == -EINTR); + + if (likely(!ret)) { + BUG_ON(inode_u.bi_inum != inode->v.i_ino); + + journal_seq_copy(inode, dir->ei_journal_seq); + bch2_inode_update_after_write(c, dir, &dir_u, + ATTR_MTIME|ATTR_CTIME); + bch2_inode_update_after_write(c, inode, &inode_u, + ATTR_MTIME); + } - bch2_inode_update_after_write(c, dir, &dir_u, - ATTR_MTIME|ATTR_CTIME); - bch2_inode_update_after_write(c, inode, &inode_u, - ATTR_MTIME); -err: bch2_trans_exit(&trans); bch2_unlock_inodes(INODE_UPDATE_LOCK, dir, inode); @@ -669,98 +502,7 @@ err: static int bch2_mkdir(struct inode *vdir, struct dentry *dentry, umode_t mode) { - struct bch_inode_info *inode = - __bch2_create(to_bch_ei(vdir), dentry, mode|S_IFDIR, 0, false); - - if (IS_ERR(inode)) - return PTR_ERR(inode); - - d_instantiate(dentry, &inode->v); - return 0; -} - -static int bch2_rmdir(struct inode *vdir, struct dentry *dentry) -{ - struct bch_fs *c = vdir->i_sb->s_fs_info; - - if (bch2_empty_dir(c, dentry->d_inode->i_ino)) - return -ENOTEMPTY; - - return bch2_unlink(vdir, dentry); -} - -static int bch2_mknod(struct inode *vdir, struct dentry *dentry, - umode_t mode, dev_t rdev) -{ - struct bch_inode_info *inode = - __bch2_create(to_bch_ei(vdir), dentry, mode, rdev, false); - - if (IS_ERR(inode)) - return PTR_ERR(inode); - - d_instantiate(dentry, &inode->v); - return 0; -} - -struct rename_info { - u64 now; - struct 
bch_inode_info *src_dir; - struct bch_inode_info *dst_dir; - struct bch_inode_info *src_inode; - struct bch_inode_info *dst_inode; - enum bch_rename_mode mode; -}; - -static int inode_update_for_rename_fn(struct bch_inode_info *inode, - struct bch_inode_unpacked *bi, - void *p) -{ - struct rename_info *info = p; - int ret; - - if (inode == info->src_dir) { - bi->bi_nlink -= S_ISDIR(info->src_inode->v.i_mode); - bi->bi_nlink += info->dst_inode && - S_ISDIR(info->dst_inode->v.i_mode) && - info->mode == BCH_RENAME_EXCHANGE; - } - - if (inode == info->dst_dir) { - bi->bi_nlink += S_ISDIR(info->src_inode->v.i_mode); - bi->bi_nlink -= info->dst_inode && - S_ISDIR(info->dst_inode->v.i_mode); - } - - if (inode == info->src_inode) { - ret = bch2_reinherit_attrs_fn(inode, bi, info->dst_dir); - - BUG_ON(!ret && S_ISDIR(info->src_inode->v.i_mode)); - } - - if (inode == info->dst_inode && - info->mode == BCH_RENAME_EXCHANGE) { - ret = bch2_reinherit_attrs_fn(inode, bi, info->src_dir); - - BUG_ON(!ret && S_ISDIR(info->dst_inode->v.i_mode)); - } - - if (inode == info->dst_inode && - info->mode == BCH_RENAME_OVERWRITE) { - BUG_ON(bi->bi_nlink && - S_ISDIR(info->dst_inode->v.i_mode)); - - if (bi->bi_nlink) - bi->bi_nlink--; - else - bi->bi_flags |= BCH_INODE_UNLINKED; - } - - if (inode == info->src_dir || - inode == info->dst_dir) - bi->bi_mtime = info->now; - bi->bi_ctime = info->now; - - return 0; + return bch2_mknod(vdir, dentry, mode|S_IFDIR, 0); } static int bch2_rename2(struct inode *src_vdir, struct dentry *src_dentry, @@ -768,35 +510,25 @@ static int bch2_rename2(struct inode *src_vdir, struct dentry *src_dentry, unsigned flags) { struct bch_fs *c = src_vdir->i_sb->s_fs_info; - struct rename_info i = { - .src_dir = to_bch_ei(src_vdir), - .dst_dir = to_bch_ei(dst_vdir), - .src_inode = to_bch_ei(src_dentry->d_inode), - .dst_inode = to_bch_ei(dst_dentry->d_inode), - .mode = flags & RENAME_EXCHANGE - ? BCH_RENAME_EXCHANGE - : dst_dentry->d_inode - ? BCH_RENAME_OVERWRITE : BCH_RENAME, - }; - struct btree_trans trans; + struct bch_inode_info *src_dir = to_bch_ei(src_vdir); + struct bch_inode_info *dst_dir = to_bch_ei(dst_vdir); + struct bch_inode_info *src_inode = to_bch_ei(src_dentry->d_inode); + struct bch_inode_info *dst_inode = to_bch_ei(dst_dentry->d_inode); struct bch_inode_unpacked dst_dir_u, src_dir_u; struct bch_inode_unpacked src_inode_u, dst_inode_u; + struct btree_trans trans; + enum bch_rename_mode mode = flags & RENAME_EXCHANGE + ? BCH_RENAME_EXCHANGE + : dst_dentry->d_inode + ? 
BCH_RENAME_OVERWRITE : BCH_RENAME; u64 journal_seq = 0; int ret; if (flags & ~(RENAME_NOREPLACE|RENAME_EXCHANGE)) return -EINVAL; - if (i.mode == BCH_RENAME_OVERWRITE) { - if (S_ISDIR(i.src_inode->v.i_mode) != - S_ISDIR(i.dst_inode->v.i_mode)) - return -ENOTDIR; - - if (S_ISDIR(i.src_inode->v.i_mode) && - bch2_empty_dir(c, i.dst_inode->v.i_ino)) - return -ENOTEMPTY; - - ret = filemap_write_and_wait_range(i.src_inode->v.i_mapping, + if (mode == BCH_RENAME_OVERWRITE) { + ret = filemap_write_and_wait_range(src_inode->v.i_mapping, 0, LLONG_MAX); if (ret) return ret; @@ -805,37 +537,24 @@ static int bch2_rename2(struct inode *src_vdir, struct dentry *src_dentry, bch2_trans_init(&trans, c, 8, 2048); bch2_lock_inodes(INODE_UPDATE_LOCK, - i.src_dir, - i.dst_dir, - i.src_inode, - i.dst_inode); - - if (S_ISDIR(i.src_inode->v.i_mode) && - inode_attrs_changing(i.dst_dir, i.src_inode)) { - ret = -EXDEV; - goto err; - } - - if (i.mode == BCH_RENAME_EXCHANGE && - S_ISDIR(i.dst_inode->v.i_mode) && - inode_attrs_changing(i.src_dir, i.dst_inode)) { - ret = -EXDEV; - goto err; - } - - if (inode_attr_changing(i.dst_dir, i.src_inode, Inode_opt_project)) { - ret = bch2_fs_quota_transfer(c, i.src_inode, - i.dst_dir->ei_qid, + src_dir, + dst_dir, + src_inode, + dst_inode); + + if (inode_attr_changing(dst_dir, src_inode, Inode_opt_project)) { + ret = bch2_fs_quota_transfer(c, src_inode, + dst_dir->ei_qid, 1 << QTYP_PRJ, KEY_TYPE_QUOTA_PREALLOC); if (ret) goto err; } - if (i.mode == BCH_RENAME_EXCHANGE && - inode_attr_changing(i.src_dir, i.dst_inode, Inode_opt_project)) { - ret = bch2_fs_quota_transfer(c, i.dst_inode, - i.src_dir->ei_qid, + if (mode == BCH_RENAME_EXCHANGE && + inode_attr_changing(src_dir, dst_inode, Inode_opt_project)) { + ret = bch2_fs_quota_transfer(c, dst_inode, + src_dir->ei_qid, 1 << QTYP_PRJ, KEY_TYPE_QUOTA_PREALLOC); if (ret) @@ -844,24 +563,14 @@ static int bch2_rename2(struct inode *src_vdir, struct dentry *src_dentry, retry: bch2_trans_begin(&trans); - i.now = bch2_current_time(c); - - ret = bch2_dirent_rename(&trans, - i.src_dir, &src_dentry->d_name, - i.dst_dir, &dst_dentry->d_name, - i.mode) ?: - bch2_write_inode_trans(&trans, i.src_dir, &src_dir_u, - inode_update_for_rename_fn, &i) ?: - (i.src_dir != i.dst_dir - ? bch2_write_inode_trans(&trans, i.dst_dir, &dst_dir_u, - inode_update_for_rename_fn, &i) - : 0 ) ?: - bch2_write_inode_trans(&trans, i.src_inode, &src_inode_u, - inode_update_for_rename_fn, &i) ?: - (i.dst_inode - ? 
bch2_write_inode_trans(&trans, i.dst_inode, &dst_inode_u, - inode_update_for_rename_fn, &i) - : 0 ) ?: + ret = bch2_rename_trans(&trans, + src_dir->v.i_ino, &src_dir_u, + dst_dir->v.i_ino, &dst_dir_u, + &src_inode_u, + &dst_inode_u, + &src_dentry->d_name, + &dst_dentry->d_name, + mode) ?: bch2_trans_commit(&trans, NULL, &journal_seq, BTREE_INSERT_ATOMIC| @@ -871,59 +580,62 @@ retry: if (unlikely(ret)) goto err; - bch2_inode_update_after_write(c, i.src_dir, &src_dir_u, + BUG_ON(src_inode->v.i_ino != src_inode_u.bi_inum); + BUG_ON(dst_inode && + dst_inode->v.i_ino != dst_inode_u.bi_inum); + + bch2_inode_update_after_write(c, src_dir, &src_dir_u, ATTR_MTIME|ATTR_CTIME); - journal_seq_copy(i.src_dir, journal_seq); + journal_seq_copy(src_dir, journal_seq); - if (i.src_dir != i.dst_dir) { - bch2_inode_update_after_write(c, i.dst_dir, &dst_dir_u, + if (src_dir != dst_dir) { + bch2_inode_update_after_write(c, dst_dir, &dst_dir_u, ATTR_MTIME|ATTR_CTIME); - journal_seq_copy(i.dst_dir, journal_seq); + journal_seq_copy(dst_dir, journal_seq); } - journal_seq_copy(i.src_inode, journal_seq); - if (i.dst_inode) - journal_seq_copy(i.dst_inode, journal_seq); - - bch2_inode_update_after_write(c, i.src_inode, &src_inode_u, + bch2_inode_update_after_write(c, src_inode, &src_inode_u, ATTR_CTIME); - if (i.dst_inode) - bch2_inode_update_after_write(c, i.dst_inode, &dst_inode_u, + journal_seq_copy(src_inode, journal_seq); + + if (dst_inode) { + bch2_inode_update_after_write(c, dst_inode, &dst_inode_u, ATTR_CTIME); + journal_seq_copy(dst_inode, journal_seq); + } err: bch2_trans_exit(&trans); - bch2_fs_quota_transfer(c, i.src_inode, - bch_qid(&i.src_inode->ei_inode), + bch2_fs_quota_transfer(c, src_inode, + bch_qid(&src_inode->ei_inode), 1 << QTYP_PRJ, KEY_TYPE_QUOTA_NOCHECK); - if (i.dst_inode) - bch2_fs_quota_transfer(c, i.dst_inode, - bch_qid(&i.dst_inode->ei_inode), + if (dst_inode) + bch2_fs_quota_transfer(c, dst_inode, + bch_qid(&dst_inode->ei_inode), 1 << QTYP_PRJ, KEY_TYPE_QUOTA_NOCHECK); bch2_unlock_inodes(INODE_UPDATE_LOCK, - i.src_dir, - i.dst_dir, - i.src_inode, - i.dst_inode); + src_dir, + dst_dir, + src_inode, + dst_inode); return ret; } -static int inode_update_for_setattr_fn(struct bch_inode_info *inode, - struct bch_inode_unpacked *bi, - void *p) +void bch2_setattr_copy(struct bch_inode_info *inode, + struct bch_inode_unpacked *bi, + struct iattr *attr) { struct bch_fs *c = inode->v.i_sb->s_fs_info; - struct iattr *attr = p; unsigned int ia_valid = attr->ia_valid; if (ia_valid & ATTR_UID) - bi->bi_uid = from_kuid(inode->v.i_sb->s_user_ns, attr->ia_uid); + bi->bi_uid = from_kuid(c->vfs_sb->s_user_ns, attr->ia_uid); if (ia_valid & ATTR_GID) - bi->bi_gid = from_kgid(inode->v.i_sb->s_user_ns, attr->ia_gid); + bi->bi_gid = from_kgid(c->vfs_sb->s_user_ns, attr->ia_gid); if (ia_valid & ATTR_ATIME) bi->bi_atime = timespec_to_bch2_time(c, attr->ia_atime); @@ -943,15 +655,15 @@ static int inode_update_for_setattr_fn(struct bch_inode_info *inode, mode &= ~S_ISGID; bi->bi_mode = mode; } - - return 0; } -static int bch2_setattr_nonsize(struct bch_inode_info *inode, struct iattr *iattr) +static int bch2_setattr_nonsize(struct bch_inode_info *inode, + struct iattr *attr) { struct bch_fs *c = inode->v.i_sb->s_fs_info; struct bch_qid qid; struct btree_trans trans; + struct btree_iter *inode_iter; struct bch_inode_unpacked inode_u; struct posix_acl *acl = NULL; int ret; @@ -960,11 +672,11 @@ static int bch2_setattr_nonsize(struct bch_inode_info *inode, struct iattr *iatt qid = inode->ei_qid; - if (iattr->ia_valid & 
ATTR_UID) - qid.q[QTYP_USR] = from_kuid(&init_user_ns, iattr->ia_uid); + if (attr->ia_valid & ATTR_UID) + qid.q[QTYP_USR] = from_kuid(&init_user_ns, attr->ia_uid); - if (iattr->ia_valid & ATTR_GID) - qid.q[QTYP_GRP] = from_kgid(&init_user_ns, iattr->ia_gid); + if (attr->ia_valid & ATTR_GID) + qid.q[QTYP_GRP] = from_kgid(&init_user_ns, attr->ia_gid); ret = bch2_fs_quota_transfer(c, inode, qid, ~0, KEY_TYPE_QUOTA_PREALLOC); @@ -977,22 +689,33 @@ retry: kfree(acl); acl = NULL; - ret = bch2_write_inode_trans(&trans, inode, &inode_u, - inode_update_for_setattr_fn, iattr) ?: - (iattr->ia_valid & ATTR_MODE - ? bch2_acl_chmod(&trans, inode, iattr->ia_mode, &acl) - : 0) ?: + inode_iter = bch2_inode_peek(&trans, &inode_u, inode->v.i_ino, + BTREE_ITER_INTENT); + ret = PTR_ERR_OR_ZERO(inode_iter); + if (ret) + goto btree_err; + + bch2_setattr_copy(inode, &inode_u, attr); + + if (attr->ia_valid & ATTR_MODE) { + ret = bch2_acl_chmod(&trans, inode, inode_u.bi_mode, &acl); + if (ret) + goto btree_err; + } + + ret = bch2_inode_write(&trans, inode_iter, &inode_u) ?: bch2_trans_commit(&trans, NULL, &inode->ei_journal_seq, BTREE_INSERT_ATOMIC| BTREE_INSERT_NOUNLOCK| BTREE_INSERT_NOFAIL); +btree_err: if (ret == -EINTR) goto retry; if (unlikely(ret)) goto err_trans; - bch2_inode_update_after_write(c, inode, &inode_u, iattr->ia_valid); + bch2_inode_update_after_write(c, inode, &inode_u, attr->ia_valid); if (acl) set_cached_acl(&inode->v, ACL_TYPE_ACCESS, acl); @@ -1031,10 +754,15 @@ static int bch2_getattr(const struct path *path, struct kstat *stat, if (inode->ei_inode.bi_flags & BCH_INODE_IMMUTABLE) stat->attributes |= STATX_ATTR_IMMUTABLE; + stat->attributes_mask |= STATX_ATTR_IMMUTABLE; + if (inode->ei_inode.bi_flags & BCH_INODE_APPEND) stat->attributes |= STATX_ATTR_APPEND; + stat->attributes_mask |= STATX_ATTR_APPEND; + if (inode->ei_inode.bi_flags & BCH_INODE_NODUMP) stat->attributes |= STATX_ATTR_NODUMP; + stat->attributes_mask |= STATX_ATTR_NODUMP; return 0; } @@ -1123,7 +851,7 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info, struct btree_trans trans; struct btree_iter *iter; struct bkey_s_c k; - BKEY_PADDED(k) cur, prev; + struct bkey_on_stack cur, prev; struct bpos end = POS(ei->v.i_ino, (start + len) >> 9); unsigned offset_into_extent, sectors; bool have_extent = false; @@ -1132,6 +860,8 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info, if (start + len < start) return -EINVAL; + bkey_on_stack_init(&cur); + bkey_on_stack_init(&prev); bch2_trans_init(&trans, c, 0, 0); iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, @@ -1146,15 +876,17 @@ retry: continue; } - bkey_reassemble(&cur.k, k); - k = bkey_i_to_s_c(&cur.k); + bkey_on_stack_realloc(&cur, c, k.k->u64s); + bkey_on_stack_realloc(&prev, c, k.k->u64s); + bkey_reassemble(cur.k, k); + k = bkey_i_to_s_c(cur.k); offset_into_extent = iter->pos.offset - bkey_start_offset(k.k); sectors = k.k->size - offset_into_extent; ret = bch2_read_indirect_extent(&trans, - &offset_into_extent, &cur.k); + &offset_into_extent, cur.k); if (ret) break; @@ -1164,19 +896,19 @@ retry: bch2_cut_front(POS(k.k->p.inode, bkey_start_offset(k.k) + offset_into_extent), - &cur.k); - bch2_key_resize(&cur.k.k, sectors); - cur.k.k.p = iter->pos; - cur.k.k.p.offset += cur.k.k.size; + cur.k); + bch2_key_resize(&cur.k->k, sectors); + cur.k->k.p = iter->pos; + cur.k->k.p.offset += cur.k->k.size; if (have_extent) { ret = bch2_fill_extent(c, info, - bkey_i_to_s_c(&prev.k), 0); + bkey_i_to_s_c(prev.k), 0); if (ret) break; } - 
bkey_copy(&prev.k, &cur.k); + bkey_copy(prev.k, cur.k); have_extent = true; if (k.k->type == KEY_TYPE_reflink_v) @@ -1189,15 +921,17 @@ retry: goto retry; if (!ret && have_extent) - ret = bch2_fill_extent(c, info, bkey_i_to_s_c(&prev.k), + ret = bch2_fill_extent(c, info, bkey_i_to_s_c(prev.k), FIEMAP_EXTENT_LAST); ret = bch2_trans_exit(&trans) ?: ret; + bkey_on_stack_exit(&cur, c); + bkey_on_stack_exit(&prev, c); return ret < 0 ? ret : 0; } static const struct vm_operations_struct bch_vm_ops = { - .fault = filemap_fault, + .fault = bch2_page_fault, .map_pages = filemap_map_pages, .page_mkwrite = bch2_page_mkwrite, }; @@ -1220,35 +954,33 @@ static loff_t bch2_dir_llseek(struct file *file, loff_t offset, int whence) static int bch2_vfs_readdir(struct file *file, struct dir_context *ctx) { - struct bch_fs *c = file_inode(file)->i_sb->s_fs_info; + struct bch_inode_info *inode = file_bch_inode(file); + struct bch_fs *c = inode->v.i_sb->s_fs_info; - return bch2_readdir(c, file, ctx); -} + if (!dir_emit_dots(file, ctx)) + return 0; -static int bch2_clone_file_range(struct file *file_src, loff_t pos_src, - struct file *file_dst, loff_t pos_dst, - u64 len) -{ - return bch2_remap_file_range(file_src, pos_src, - file_dst, pos_dst, - len, 0); + return bch2_readdir(c, inode->v.i_ino, ctx); } static const struct file_operations bch_file_operations = { .llseek = bch2_llseek, - .read_iter = generic_file_read_iter, + .read_iter = bch2_read_iter, .write_iter = bch2_write_iter, .mmap = bch2_mmap, .open = generic_file_open, .fsync = bch2_fsync, .splice_read = generic_file_splice_read, + /* + * Broken, on v5.3: .splice_write = iter_file_splice_write, + */ .fallocate = bch2_fallocate_dispatch, .unlocked_ioctl = bch2_fs_file_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = bch2_compat_fs_ioctl, #endif - .clone_file_range = bch2_clone_file_range, + .remap_file_range = bch2_remap_file_range, }; static const struct inode_operations bch_file_inode_operations = { @@ -1269,7 +1001,7 @@ static const struct inode_operations bch_dir_inode_operations = { .unlink = bch2_unlink, .symlink = bch2_symlink, .mkdir = bch2_mkdir, - .rmdir = bch2_rmdir, + .rmdir = bch2_unlink, .mknod = bch2_mknod, .rename = bch2_rename2, .getattr = bch2_getattr, @@ -1285,7 +1017,7 @@ static const struct inode_operations bch_dir_inode_operations = { static const struct file_operations bch_dir_file_operations = { .llseek = bch2_dir_llseek, .read = generic_read_dir, - .iterate = bch2_vfs_readdir, + .iterate_shared = bch2_vfs_readdir, .fsync = bch2_fsync, .unlocked_ioctl = bch2_fs_file_ioctl, #ifdef CONFIG_COMPAT @@ -1324,7 +1056,7 @@ static const struct address_space_operations bch_address_space_operations = { .write_end = bch2_write_end, .invalidatepage = bch2_invalidatepage, .releasepage = bch2_releasepage, - .direct_IO = bch2_direct_IO, + .direct_IO = noop_direct_IO, #ifdef CONFIG_MIGRATION .migratepage = bch2_migrate_page, #endif @@ -1420,8 +1152,8 @@ static struct inode *bch2_alloc_inode(struct super_block *sb) inode_init_once(&inode->v); mutex_init(&inode->ei_update_lock); + pagecache_lock_init(&inode->ei_pagecache_lock); mutex_init(&inode->ei_quota_lock); - inode->ei_inode_update = NULL; inode->ei_journal_seq = 0; return &inode->v; @@ -1479,10 +1211,6 @@ static void bch2_evict_inode(struct inode *vinode) BUG_ON(!is_bad_inode(&inode->v) && inode->ei_quota_reserved); - if (inode->ei_inode_update) - bch2_deferred_update_free(c, inode->ei_inode_update); - inode->ei_inode_update = NULL; - if (!inode->v.i_nlink && !is_bad_inode(&inode->v)) { 
bch2_quota_acct(c, inode->ei_qid, Q_SPC, -((s64) inode->v.i_blocks), KEY_TYPE_QUOTA_WARN); @@ -1783,7 +1511,7 @@ static struct dentry *bch2_mount(struct file_system_type *fs_type, sb->s_bdi->congested_fn = bch2_congested; sb->s_bdi->congested_data = c; - sb->s_bdi->ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_SIZE; + sb->s_bdi->ra_pages = VM_READAHEAD_PAGES; for_each_online_member(ca, c, i) { struct block_device *bdev = ca->disk_sb.bdev; diff --git a/fs/bcachefs/fs.h b/fs/bcachefs/fs.h index 226223b058a9..eda903a45325 100644 --- a/fs/bcachefs/fs.h +++ b/fs/bcachefs/fs.h @@ -10,15 +10,36 @@ #include <linux/seqlock.h> #include <linux/stat.h> +/* + * Two-state lock - can be taken for add or block - both states are shared, + * like read side of rwsem, but conflict with other state: + */ +struct pagecache_lock { + atomic_long_t v; + wait_queue_head_t wait; +}; + +static inline void pagecache_lock_init(struct pagecache_lock *lock) +{ + atomic_long_set(&lock->v, 0); + init_waitqueue_head(&lock->wait); +} + +void bch2_pagecache_add_put(struct pagecache_lock *); +void bch2_pagecache_add_get(struct pagecache_lock *); +void bch2_pagecache_block_put(struct pagecache_lock *); +void bch2_pagecache_block_get(struct pagecache_lock *); + struct bch_inode_info { struct inode v; struct mutex ei_update_lock; - struct deferred_update *ei_inode_update; u64 ei_journal_seq; u64 ei_quota_reserved; unsigned long ei_last_dirtied; + struct pagecache_lock ei_pagecache_lock; + struct mutex ei_quota_lock; struct bch_qid ei_qid; @@ -38,7 +59,8 @@ static inline int ptrcmp(void *l, void *r) enum bch_inode_lock_op { INODE_LOCK = (1U << 0), - INODE_UPDATE_LOCK = (1U << 1), + INODE_PAGECACHE_BLOCK = (1U << 1), + INODE_UPDATE_LOCK = (1U << 2), }; #define bch2_lock_inodes(_locks, ...) \ @@ -50,9 +72,11 @@ do { \ \ for (i = 1; i < ARRAY_SIZE(a); i++) \ if (a[i] != a[i - 1]) { \ - if (_locks & INODE_LOCK) \ + if ((_locks) & INODE_LOCK) \ down_write_nested(&a[i]->v.i_rwsem, i); \ - if (_locks & INODE_UPDATE_LOCK) \ + if ((_locks) & INODE_PAGECACHE_BLOCK) \ + bch2_pagecache_block_get(&a[i]->ei_pagecache_lock);\ + if ((_locks) & INODE_UPDATE_LOCK) \ mutex_lock_nested(&a[i]->ei_update_lock, i);\ } \ } while (0) @@ -66,9 +90,11 @@ do { \ \ for (i = 1; i < ARRAY_SIZE(a); i++) \ if (a[i] != a[i - 1]) { \ - if (_locks & INODE_LOCK) \ + if ((_locks) & INODE_LOCK) \ up_write(&a[i]->v.i_rwsem); \ - if (_locks & INODE_UPDATE_LOCK) \ + if ((_locks) & INODE_PAGECACHE_BLOCK) \ + bch2_pagecache_block_put(&a[i]->ei_pagecache_lock);\ + if ((_locks) & INODE_UPDATE_LOCK) \ mutex_unlock(&a[i]->ei_update_lock); \ } \ } while (0) @@ -78,16 +104,6 @@ static inline struct bch_inode_info *file_bch_inode(struct file *file) return to_bch_ei(file_inode(file)); } -static inline u8 mode_to_type(umode_t mode) -{ - return (mode >> 12) & 15; -} - -static inline unsigned nlink_bias(umode_t mode) -{ - return S_ISDIR(mode) ? 
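/*
 * Usage sketch (illustrative): callers hand bch2_lock_inodes() every
 * inode an operation touches; duplicates are skipped via the
 * a[i] != a[i - 1] check and each lock is taken with a _nested
 * annotation, which presumes a stable ordering of the array - so e.g.
 * bch2_remap_file_range() can lock both files without ABBA deadlock:
 *
 *	bch2_lock_inodes(INODE_LOCK|INODE_PAGECACHE_BLOCK, src, dst);
 *	...
 *	bch2_unlock_inodes(INODE_LOCK|INODE_PAGECACHE_BLOCK, src, dst);
 */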
2 : 1; -} - static inline bool inode_attr_changing(struct bch_inode_info *dir, struct bch_inode_info *inode, enum inode_opt_id id) @@ -142,17 +158,9 @@ void bch2_inode_update_after_write(struct bch_fs *, struct bch_inode_info *, struct bch_inode_unpacked *, unsigned); -int __must_check bch2_write_inode_trans(struct btree_trans *, - struct bch_inode_info *, - struct bch_inode_unpacked *, - inode_set_fn, void *); int __must_check bch2_write_inode(struct bch_fs *, struct bch_inode_info *, inode_set_fn, void *, unsigned); -int bch2_reinherit_attrs_fn(struct bch_inode_info *, - struct bch_inode_unpacked *, - void *); - void bch2_vfs_exit(void); int bch2_vfs_init(void); diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 50a7d8c1faba..0f2308e53d65 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -4,7 +4,7 @@ #include "btree_update.h" #include "dirent.h" #include "error.h" -#include "fs.h" +#include "fs-common.h" #include "fsck.h" #include "inode.h" #include "keylist.h" @@ -80,9 +80,7 @@ static int reattach_inode(struct bch_fs *c, struct bch_inode_unpacked *lostfound_inode, u64 inum) { - struct bch_hash_info lostfound_hash_info = - bch2_hash_info_init(c, lostfound_inode); - struct bkey_inode_buf packed; + struct bch_inode_unpacked inode_u; char name_buf[20]; struct qstr name; int ret; @@ -90,30 +88,14 @@ static int reattach_inode(struct bch_fs *c, snprintf(name_buf, sizeof(name_buf), "%llu", inum); name = (struct qstr) QSTR(name_buf); - lostfound_inode->bi_nlink++; - - bch2_inode_pack(&packed, lostfound_inode); - - ret = bch2_btree_insert(c, BTREE_ID_INODES, &packed.inode.k_i, - NULL, NULL, - BTREE_INSERT_NOFAIL| - BTREE_INSERT_LAZY_RW); - if (ret) { - bch_err(c, "error %i reattaching inode %llu while updating lost+found", - ret, inum); - return ret; - } + ret = bch2_trans_do(c, NULL, + BTREE_INSERT_ATOMIC| + BTREE_INSERT_LAZY_RW, + bch2_link_trans(&trans, lostfound_inode->bi_inum, + inum, &inode_u, &name)); + if (ret) + bch_err(c, "error %i reattaching inode %llu", ret, inum); - ret = bch2_dirent_create(c, lostfound_inode->bi_inum, - &lostfound_hash_info, - DT_DIR, &name, inum, NULL, - BTREE_INSERT_NOFAIL| - BTREE_INSERT_LAZY_RW); - if (ret) { - bch_err(c, "error %i reattaching inode %llu while creating new dirent", - ret, inum); - return ret; - } return ret; } @@ -165,6 +147,7 @@ struct hash_check { static void hash_check_init(struct hash_check *h) { h->chain = NULL; + h->chain_end = 0; } static void hash_stop_chain(struct btree_trans *trans, @@ -248,7 +231,7 @@ static int hash_check_duplicates(struct btree_trans *trans, iter = bch2_trans_copy_iter(trans, h->chain); BUG_ON(IS_ERR(iter)); - for_each_btree_key_continue(iter, 0, k2) { + for_each_btree_key_continue(iter, 0, k2, ret) { if (bkey_cmp(k2.k->p, k.k->p) >= 0) break; @@ -393,7 +376,7 @@ static int check_dirent_hash(struct btree_trans *trans, struct hash_check *h, if (fsck_err(c, "dirent with junk at end, was %s (%zu) now %s (%u)", buf, strlen(buf), d->v.d_name, len)) { - bch2_trans_update(trans, BTREE_INSERT_ENTRY(iter, &d->k_i)); + bch2_trans_update(trans, iter, &d->k_i); ret = bch2_trans_commit(trans, NULL, NULL, BTREE_INSERT_NOFAIL| @@ -458,7 +441,7 @@ static int check_extents(struct bch_fs *c) iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, POS(BCACHEFS_ROOT_INO, 0), 0); retry: - for_each_btree_key_continue(iter, 0, k) { + for_each_btree_key_continue(iter, 0, k, ret) { ret = walk_inode(&trans, &w, k.k->p.inode); if (ret) break; @@ -553,7 +536,7 @@ static int check_dirents(struct bch_fs *c) iter = 
bch2_trans_get_iter(&trans, BTREE_ID_DIRENTS, POS(BCACHEFS_ROOT_INO, 0), 0); retry: - for_each_btree_key_continue(iter, 0, k) { + for_each_btree_key_continue(iter, 0, k, ret) { struct bkey_s_c_dirent d; struct bch_inode_unpacked target; bool have_target; @@ -663,8 +646,7 @@ retry: bkey_reassemble(&n->k_i, d.s_c); n->v.d_type = mode_to_type(target.bi_mode); - bch2_trans_update(&trans, - BTREE_INSERT_ENTRY(iter, &n->k_i)); + bch2_trans_update(&trans, iter, &n->k_i); ret = bch2_trans_commit(&trans, NULL, NULL, BTREE_INSERT_NOFAIL| @@ -707,7 +689,7 @@ static int check_xattrs(struct bch_fs *c) iter = bch2_trans_get_iter(&trans, BTREE_ID_XATTRS, POS(BCACHEFS_ROOT_INO, 0), 0); retry: - for_each_btree_key_continue(iter, 0, k) { + for_each_btree_key_continue(iter, 0, k, ret) { ret = walk_inode(&trans, &w, k.k->p.inode); if (ret) break; @@ -759,7 +741,7 @@ static int check_root(struct bch_fs *c, struct bch_inode_unpacked *root_inode) fsck_err: return ret; create_root: - bch2_inode_init(c, root_inode, 0, 0, S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO, + bch2_inode_init(c, root_inode, 0, 0, S_IFDIR|0755, 0, NULL); root_inode->bi_inum = BCACHEFS_ROOT_INO; @@ -779,7 +761,6 @@ static int check_lostfound(struct bch_fs *c, struct qstr lostfound = QSTR("lost+found"); struct bch_hash_info root_hash_info = bch2_hash_info_init(c, root_inode); - struct bkey_inode_buf packed; u64 inum; int ret; @@ -807,33 +788,20 @@ static int check_lostfound(struct bch_fs *c, fsck_err: return ret; create_lostfound: - root_inode->bi_nlink++; - - bch2_inode_pack(&packed, root_inode); - - ret = bch2_btree_insert(c, BTREE_ID_INODES, &packed.inode.k_i, - NULL, NULL, - BTREE_INSERT_NOFAIL| - BTREE_INSERT_LAZY_RW); + bch2_inode_init_early(c, lostfound_inode); + + ret = bch2_trans_do(c, NULL, + BTREE_INSERT_ATOMIC| + BTREE_INSERT_NOFAIL| + BTREE_INSERT_LAZY_RW, + bch2_create_trans(&trans, + BCACHEFS_ROOT_INO, root_inode, + lostfound_inode, &lostfound, + 0, 0, S_IFDIR|0700, 0, NULL, NULL)); if (ret) - return ret; + bch_err(c, "error creating lost+found: %i", ret); - bch2_inode_init(c, lostfound_inode, 0, 0, S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO, - 0, root_inode); - - ret = bch2_inode_create(c, lostfound_inode, BLOCKDEV_INODE_MAX, 0, - &c->unused_inode_hint); - if (ret) - return ret; - - ret = bch2_dirent_create(c, BCACHEFS_ROOT_INO, &root_hash_info, DT_DIR, - &lostfound, lostfound_inode->bi_inum, NULL, - BTREE_INSERT_NOFAIL| - BTREE_INSERT_LAZY_RW); - if (ret) - return ret; - - return 0; + return ret; } struct inode_bitmap { @@ -995,7 +963,7 @@ up: iter = bch2_trans_get_iter(&trans, BTREE_ID_INODES, POS_MIN, 0); retry: - for_each_btree_key_continue(iter, 0, k) { + for_each_btree_key_continue(iter, 0, k, ret) { if (k.k->type != KEY_TYPE_inode) continue; @@ -1021,7 +989,7 @@ retry: had_unreachable = true; } } - ret = bch2_trans_iter_free(&trans, iter); + bch2_trans_iter_free(&trans, iter); if (ret) goto err; @@ -1116,9 +1084,7 @@ static int check_inode_nlink(struct bch_fs *c, struct nlink *link, bool *do_update) { - u32 i_nlink = u->bi_flags & BCH_INODE_UNLINKED - ? 
0 - : u->bi_nlink + nlink_bias(u->bi_mode); + u32 i_nlink = bch2_inode_nlink_get(u); u32 real_i_nlink = link->count * nlink_bias(u->bi_mode) + link->dir_count; @@ -1197,14 +1163,7 @@ static int check_inode_nlink(struct bch_fs *c, u->bi_inum, i_nlink, real_i_nlink); set_i_nlink: if (i_nlink != real_i_nlink) { - if (real_i_nlink) { - u->bi_nlink = real_i_nlink - nlink_bias(u->bi_mode); - u->bi_flags &= ~BCH_INODE_UNLINKED; - } else { - u->bi_nlink = 0; - u->bi_flags |= BCH_INODE_UNLINKED; - } - + bch2_inode_nlink_set(u, real_i_nlink); *do_update = true; } fsck_err: @@ -1302,7 +1261,7 @@ static int check_inode(struct btree_trans *trans, struct bkey_inode_buf p; bch2_inode_pack(&p, &u); - bch2_trans_update(trans, BTREE_INSERT_ENTRY(iter, &p.inode.k_i)); + bch2_trans_update(trans, iter, &p.inode.k_i); ret = bch2_trans_commit(trans, NULL, NULL, BTREE_INSERT_NOFAIL| diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index 05b7f6594113..c0642ff46ba0 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -6,8 +6,7 @@ #include "error.h" #include "extents.h" #include "inode.h" -#include "io.h" -#include "keylist.h" +#include "str_hash.h" #include <linux/random.h> @@ -96,6 +95,7 @@ void bch2_inode_pack(struct bkey_inode_buf *packed, u8 *end = (void *) &packed[1]; u8 *last_nonzero_field = out; unsigned nr_fields = 0, last_nonzero_fieldnr = 0; + unsigned bytes; bkey_inode_init(&packed->inode.k_i); packed->inode.k.p.inode = inode->bi_inum; @@ -118,10 +118,9 @@ void bch2_inode_pack(struct bkey_inode_buf *packed, out = last_nonzero_field; nr_fields = last_nonzero_fieldnr; - set_bkey_val_bytes(&packed->inode.k, out - (u8 *) &packed->inode.v); - memset(out, 0, - (u8 *) &packed->inode.v + - bkey_val_bytes(&packed->inode.k) - out); + bytes = out - (u8 *) &packed->inode.v; + set_bkey_val_bytes(&packed->inode.k, bytes); + memset_u64s_tail(&packed->inode.v, 0, bytes); SET_INODE_NR_FIELDS(&packed->inode.v, nr_fields); @@ -181,6 +180,53 @@ int bch2_inode_unpack(struct bkey_s_c_inode inode, return 0; } +struct btree_iter *bch2_inode_peek(struct btree_trans *trans, + struct bch_inode_unpacked *inode, + u64 inum, unsigned flags) +{ + struct btree_iter *iter; + struct bkey_s_c k; + int ret; + + iter = bch2_trans_get_iter(trans, BTREE_ID_INODES, POS(inum, 0), + BTREE_ITER_SLOTS|flags); + if (IS_ERR(iter)) + return iter; + + k = bch2_btree_iter_peek_slot(iter); + ret = bkey_err(k); + if (ret) + goto err; + + ret = k.k->type == KEY_TYPE_inode ? 
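/*
 * Sketch of the update pattern bch2_inode_peek()/bch2_inode_write()
 * enable (illustrative; it mirrors bch2_write_inode() in fs.c, and
 * modify() stands in for any caller-supplied mutation):
 *
 *	retry:
 *		bch2_trans_begin(&trans);
 *		iter = bch2_inode_peek(&trans, &inode_u, inum,
 *				       BTREE_ITER_INTENT);
 *		ret = PTR_ERR_OR_ZERO(iter) ?:
 *			modify(&inode_u) ?:
 *			bch2_inode_write(&trans, iter, &inode_u) ?:
 *			bch2_trans_commit(&trans, NULL, journal_seq,
 *					  BTREE_INSERT_ATOMIC);
 *	if (ret == -EINTR)
 *		goto retry;
 */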
0 : -EIO; + if (ret) + goto err; + + ret = bch2_inode_unpack(bkey_s_c_to_inode(k), inode); + if (ret) + goto err; + + return iter; +err: + bch2_trans_iter_put(trans, iter); + return ERR_PTR(ret); +} + +int bch2_inode_write(struct btree_trans *trans, + struct btree_iter *iter, + struct bch_inode_unpacked *inode) +{ + struct bkey_inode_buf *inode_p; + + inode_p = bch2_trans_kmalloc(trans, sizeof(*inode_p)); + if (IS_ERR(inode_p)) + return PTR_ERR(inode_p); + + bch2_inode_pack(inode_p, inode); + bch2_trans_update(trans, iter, &inode_p->inode.k_i); + return 0; +} + const char *bch2_inode_invalid(const struct bch_fs *c, struct bkey_s_c k) { struct bkey_s_c_inode inode = bkey_s_c_to_inode(k); @@ -251,19 +297,24 @@ void bch2_inode_generation_to_text(struct printbuf *out, struct bch_fs *c, pr_buf(out, "generation: %u", le32_to_cpu(gen.v->bi_generation)); } -void bch2_inode_init(struct bch_fs *c, struct bch_inode_unpacked *inode_u, - uid_t uid, gid_t gid, umode_t mode, dev_t rdev, - struct bch_inode_unpacked *parent) +void bch2_inode_init_early(struct bch_fs *c, + struct bch_inode_unpacked *inode_u) { - s64 now = bch2_current_time(c); + enum bch_str_hash_type str_hash = + bch2_str_hash_opt_to_type(c, c->opts.str_hash); memset(inode_u, 0, sizeof(*inode_u)); /* ick */ - inode_u->bi_flags |= c->opts.str_hash << INODE_STR_HASH_OFFSET; + inode_u->bi_flags |= str_hash << INODE_STR_HASH_OFFSET; get_random_bytes(&inode_u->bi_hash_seed, sizeof(inode_u->bi_hash_seed)); +} +void bch2_inode_init_late(struct bch_inode_unpacked *inode_u, u64 now, + uid_t uid, gid_t gid, umode_t mode, dev_t rdev, + struct bch_inode_unpacked *parent) +{ inode_u->bi_mode = mode; inode_u->bi_uid = uid; inode_u->bi_gid = gid; @@ -273,6 +324,12 @@ void bch2_inode_init(struct bch_fs *c, struct bch_inode_unpacked *inode_u, inode_u->bi_ctime = now; inode_u->bi_otime = now; + if (parent && parent->bi_mode & S_ISGID) { + inode_u->bi_gid = parent->bi_gid; + if (S_ISDIR(mode)) + inode_u->bi_mode |= S_ISGID; + } + if (parent) { #define x(_name, ...) 
inode_u->bi_##_name = parent->bi_##_name; BCH_INODE_OPTS() @@ -280,6 +337,15 @@ void bch2_inode_init(struct bch_fs *c, struct bch_inode_unpacked *inode_u, } } +void bch2_inode_init(struct bch_fs *c, struct bch_inode_unpacked *inode_u, + uid_t uid, gid_t gid, umode_t mode, dev_t rdev, + struct bch_inode_unpacked *parent) +{ + bch2_inode_init_early(c, inode_u); + bch2_inode_init_late(inode_u, bch2_current_time(c), + uid, gid, mode, rdev, parent); +} + static inline u32 bkey_generation(struct bkey_s_c k) { switch (k.k->type) { @@ -292,9 +358,9 @@ static inline u32 bkey_generation(struct bkey_s_c k) } } -int __bch2_inode_create(struct btree_trans *trans, - struct bch_inode_unpacked *inode_u, - u64 min, u64 max, u64 *hint) +int bch2_inode_create(struct btree_trans *trans, + struct bch_inode_unpacked *inode_u, + u64 min, u64 max, u64 *hint) { struct bch_fs *c = trans->c; struct bkey_inode_buf *inode_p; @@ -345,8 +411,7 @@ again: inode_u->bi_generation = bkey_generation(k); bch2_inode_pack(inode_p, inode_u); - bch2_trans_update(trans, - BTREE_INSERT_ENTRY(iter, &inode_p->inode.k_i)); + bch2_trans_update(trans, iter, &inode_p->inode.k_i); return 0; } } @@ -361,13 +426,6 @@ out: return -ENOSPC; } -int bch2_inode_create(struct bch_fs *c, struct bch_inode_unpacked *inode_u, - u64 min, u64 max, u64 *hint) -{ - return bch2_trans_do(c, NULL, BTREE_INSERT_ATOMIC, - __bch2_inode_create(&trans, inode_u, min, max, hint)); -} - int bch2_inode_rm(struct bch_fs *c, u64 inode_nr) { struct btree_trans trans; @@ -435,8 +493,7 @@ int bch2_inode_rm(struct bch_fs *c, u64 inode_nr) delete.v.bi_generation = cpu_to_le32(bi_generation); } - bch2_trans_update(&trans, - BTREE_INSERT_ENTRY(iter, &delete.k_i)); + bch2_trans_update(&trans, iter, &delete.k_i); ret = bch2_trans_commit(&trans, NULL, NULL, BTREE_INSERT_ATOMIC| @@ -452,7 +509,7 @@ int bch2_inode_find_by_inum_trans(struct btree_trans *trans, u64 inode_nr, { struct btree_iter *iter; struct bkey_s_c k; - int ret = -ENOENT; + int ret; iter = bch2_trans_get_iter(trans, BTREE_ID_INODES, POS(inode_nr, 0), BTREE_ITER_SLOTS); @@ -460,8 +517,13 @@ int bch2_inode_find_by_inum_trans(struct btree_trans *trans, u64 inode_nr, return PTR_ERR(iter); k = bch2_btree_iter_peek_slot(iter); - if (k.k->type == KEY_TYPE_inode) - ret = bch2_inode_unpack(bkey_s_c_to_inode(k), inode); + ret = bkey_err(k); + if (ret) + return ret; + + ret = k.k->type == KEY_TYPE_inode + ? 
bch2_inode_unpack(bkey_s_c_to_inode(k), inode) + : -ENOENT; bch2_trans_iter_put(trans, iter); diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h index af0c355f2f04..bb759a46dc41 100644 --- a/fs/bcachefs/inode.h +++ b/fs/bcachefs/inode.h @@ -46,14 +46,22 @@ struct bkey_inode_buf { void bch2_inode_pack(struct bkey_inode_buf *, const struct bch_inode_unpacked *); int bch2_inode_unpack(struct bkey_s_c_inode, struct bch_inode_unpacked *); +struct btree_iter *bch2_inode_peek(struct btree_trans *, + struct bch_inode_unpacked *, u64, unsigned); +int bch2_inode_write(struct btree_trans *, struct btree_iter *, + struct bch_inode_unpacked *); + +void bch2_inode_init_early(struct bch_fs *, + struct bch_inode_unpacked *); +void bch2_inode_init_late(struct bch_inode_unpacked *, u64, + uid_t, gid_t, umode_t, dev_t, + struct bch_inode_unpacked *); void bch2_inode_init(struct bch_fs *, struct bch_inode_unpacked *, uid_t, gid_t, umode_t, dev_t, struct bch_inode_unpacked *); -int __bch2_inode_create(struct btree_trans *, - struct bch_inode_unpacked *, - u64, u64, u64 *); -int bch2_inode_create(struct bch_fs *, struct bch_inode_unpacked *, +int bch2_inode_create(struct btree_trans *, + struct bch_inode_unpacked *, u64, u64, u64 *); int bch2_inode_rm(struct bch_fs *, u64); @@ -103,6 +111,63 @@ static inline u64 bch2_inode_opt_get(struct bch_inode_unpacked *inode, } } +static inline struct bch_io_opts +io_opts(struct bch_fs *c, struct bch_inode_unpacked *inode) +{ + struct bch_io_opts opts = bch2_opts_to_inode_opts(c->opts); + + bch2_io_opts_apply(&opts, bch2_inode_opts_get(inode)); + return opts; +} + +static inline u8 mode_to_type(umode_t mode) +{ + return (mode >> 12) & 15; +} + +/* i_nlink: */ + +static inline unsigned nlink_bias(umode_t mode) +{ + return S_ISDIR(mode) ? 2 : 1; +} + +static inline void bch2_inode_nlink_inc(struct bch_inode_unpacked *bi) +{ + if (bi->bi_flags & BCH_INODE_UNLINKED) + bi->bi_flags &= ~BCH_INODE_UNLINKED; + else + bi->bi_nlink++; +} + +static inline void bch2_inode_nlink_dec(struct bch_inode_unpacked *bi) +{ + BUG_ON(bi->bi_flags & BCH_INODE_UNLINKED); + if (bi->bi_nlink) + bi->bi_nlink--; + else + bi->bi_flags |= BCH_INODE_UNLINKED; +} + +static inline unsigned bch2_inode_nlink_get(struct bch_inode_unpacked *bi) +{ + return bi->bi_flags & BCH_INODE_UNLINKED + ? 
0 + : bi->bi_nlink + nlink_bias(bi->bi_mode); +} + +static inline void bch2_inode_nlink_set(struct bch_inode_unpacked *bi, + unsigned nlink) +{ + if (nlink) { + bi->bi_nlink = nlink - nlink_bias(bi->bi_mode); + bi->bi_flags &= ~BCH_INODE_UNLINKED; + } else { + bi->bi_nlink = 0; + bi->bi_flags |= BCH_INODE_UNLINKED; + } +} + #ifdef CONFIG_BCACHEFS_DEBUG void bch2_inode_pack_test(void); #else diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index d5c7e45ed5b9..f483312acd0d 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -8,6 +8,7 @@ #include "bcachefs.h" #include "alloc_foreground.h" +#include "bkey_on_stack.h" #include "bset.h" #include "btree_update.h" #include "buckets.h" @@ -18,7 +19,8 @@ #include "disk_groups.h" #include "ec.h" #include "error.h" -#include "extents.h" +#include "extent_update.h" +#include "inode.h" #include "io.h" #include "journal.h" #include "keylist.h" @@ -122,10 +124,10 @@ void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw) void bch2_bio_free_pages_pool(struct bch_fs *c, struct bio *bio) { + struct bvec_iter_all iter; struct bio_vec *bv; - unsigned i; - bio_for_each_segment_all(bv, bio, i) + bio_for_each_segment_all(bv, bio, iter) if (bv->bv_page != ZERO_PAGE(0)) mempool_free(bv->bv_page, &c->bio_bounce_pages); bio->bi_vcnt = 0; @@ -168,6 +170,262 @@ void bch2_bio_alloc_pages_pool(struct bch_fs *c, struct bio *bio, mutex_unlock(&c->bio_bounce_pages_lock); } +/* Extent update path: */ + +static int sum_sector_overwrites(struct btree_trans *trans, + struct btree_iter *extent_iter, + struct bkey_i *new, + bool may_allocate, + bool *maybe_extending, + s64 *delta) +{ + struct btree_iter *iter; + struct bkey_s_c old; + int ret = 0; + + *maybe_extending = true; + *delta = 0; + + iter = bch2_trans_copy_iter(trans, extent_iter); + if (IS_ERR(iter)) + return PTR_ERR(iter); + + for_each_btree_key_continue(iter, BTREE_ITER_SLOTS, old, ret) { + if (!may_allocate && + bch2_bkey_nr_ptrs_fully_allocated(old) < + bch2_bkey_nr_ptrs_allocated(bkey_i_to_s_c(new))) { + ret = -ENOSPC; + break; + } + + *delta += (min(new->k.p.offset, + old.k->p.offset) - + max(bkey_start_offset(&new->k), + bkey_start_offset(old.k))) * + (bkey_extent_is_allocation(&new->k) - + bkey_extent_is_allocation(old.k)); + + if (bkey_cmp(old.k->p, new->k.p) >= 0) { + /* + * Check if there's already data above where we're + * going to be writing to - this means we're definitely + * not extending the file: + * + * Note that it's not sufficient to check if there's + * data up to the sector offset we're going to be + * writing to, because i_size could be up to one block + * less: + */ + if (!bkey_cmp(old.k->p, new->k.p)) + old = bch2_btree_iter_next(iter); + + if (old.k && !bkey_err(old) && + old.k->p.inode == extent_iter->pos.inode && + bkey_extent_is_data(old.k)) + *maybe_extending = false; + + break; + } + } + + bch2_trans_iter_put(trans, iter); + return ret; +} + +int bch2_extent_update(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_i *k, + struct disk_reservation *disk_res, + u64 *journal_seq, + u64 new_i_size, + s64 *i_sectors_delta) +{ + /* this must live until after bch2_trans_commit(): */ + struct bkey_inode_buf inode_p; + bool extending = false; + s64 delta = 0; + int ret; + + ret = bch2_extent_trim_atomic(k, iter); + if (ret) + return ret; + + ret = sum_sector_overwrites(trans, iter, k, + disk_res && disk_res->sectors != 0, + &extending, &delta); + if (ret) + return ret; + + new_i_size = extending + ? 
min(k->k.p.offset << 9, new_i_size) + : 0; + + if (delta || new_i_size) { + struct btree_iter *inode_iter; + struct bch_inode_unpacked inode_u; + + inode_iter = bch2_inode_peek(trans, &inode_u, + k->k.p.inode, BTREE_ITER_INTENT); + if (IS_ERR(inode_iter)) + return PTR_ERR(inode_iter); + + /* + * XXX: + * writeback can race a bit with truncate, because truncate + * first updates the inode then truncates the pagecache. This is + * ugly, but lets us preserve the invariant that the in memory + * i_size is always >= the on disk i_size. + * + BUG_ON(new_i_size > inode_u.bi_size && + (inode_u.bi_flags & BCH_INODE_I_SIZE_DIRTY)); + */ + BUG_ON(new_i_size > inode_u.bi_size && !extending); + + if (!(inode_u.bi_flags & BCH_INODE_I_SIZE_DIRTY) && + new_i_size > inode_u.bi_size) + inode_u.bi_size = new_i_size; + else + new_i_size = 0; + + inode_u.bi_sectors += delta; + + if (delta || new_i_size) { + bch2_inode_pack(&inode_p, &inode_u); + bch2_trans_update(trans, inode_iter, + &inode_p.inode.k_i); + } + + bch2_trans_iter_put(trans, inode_iter); + } + + bch2_trans_update(trans, iter, k); + + ret = bch2_trans_commit(trans, disk_res, journal_seq, + BTREE_INSERT_NOCHECK_RW| + BTREE_INSERT_NOFAIL| + BTREE_INSERT_ATOMIC| + BTREE_INSERT_USE_RESERVE); + if (!ret && i_sectors_delta) + *i_sectors_delta += delta; + + return ret; +} + +int bch2_fpunch_at(struct btree_trans *trans, struct btree_iter *iter, + struct bpos end, u64 *journal_seq, + s64 *i_sectors_delta) +{ + struct bch_fs *c = trans->c; + unsigned max_sectors = KEY_SIZE_MAX & (~0 << c->block_bits); + struct bkey_s_c k; + int ret = 0, ret2 = 0; + + while ((k = bch2_btree_iter_peek(iter)).k && + bkey_cmp(iter->pos, end) < 0) { + struct disk_reservation disk_res = + bch2_disk_reservation_init(c, 0); + struct bkey_i delete; + + ret = bkey_err(k); + if (ret) + goto btree_err; + + bkey_init(&delete.k); + delete.k.p = iter->pos; + + /* create the biggest key we can */ + bch2_key_resize(&delete.k, max_sectors); + bch2_cut_back(end, &delete); + + bch2_trans_begin_updates(trans); + + ret = bch2_extent_update(trans, iter, &delete, + &disk_res, journal_seq, + 0, i_sectors_delta); + bch2_disk_reservation_put(c, &disk_res); +btree_err: + if (ret == -EINTR) { + ret2 = ret; + ret = 0; + } + if (ret) + break; + } + + if (bkey_cmp(iter->pos, end) > 0) { + bch2_btree_iter_set_pos(iter, end); + ret = bch2_btree_iter_traverse(iter); + } + + return ret ?: ret2; +} + +int bch2_fpunch(struct bch_fs *c, u64 inum, u64 start, u64 end, + u64 *journal_seq, s64 *i_sectors_delta) +{ + struct btree_trans trans; + struct btree_iter *iter; + int ret = 0; + + bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024); + iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, + POS(inum, start), + BTREE_ITER_INTENT); + + ret = bch2_fpunch_at(&trans, iter, POS(inum, end), + journal_seq, i_sectors_delta); + bch2_trans_exit(&trans); + + if (ret == -EINTR) + ret = 0; + + return ret; +} + +int bch2_write_index_default(struct bch_write_op *op) +{ + struct bch_fs *c = op->c; + struct bkey_on_stack sk; + struct keylist *keys = &op->insert_keys; + struct bkey_i *k = bch2_keylist_front(keys); + struct btree_trans trans; + struct btree_iter *iter; + int ret; + + bkey_on_stack_init(&sk); + bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024); + + iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, + bkey_start_pos(&k->k), + BTREE_ITER_SLOTS|BTREE_ITER_INTENT); + + do { + k = bch2_keylist_front(keys); + + bkey_on_stack_realloc(&sk, c, k->k.u64s); + bkey_copy(sk.k, k); + bch2_cut_front(iter->pos, sk.k); + + 
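/*
 * Editor's sketch (not part of the patch): the per-key sectors delta that
 * sum_sector_overwrites() above accumulates for i_sectors accounting.
 * toy_extent and overlap_delta() are hypothetical stand-ins; only the
 * arithmetic mirrors the real code.
 */
struct toy_extent {
	u64	start;		/* first sector covered */
	u64	end;		/* one past the last sector covered */
	int	allocated;	/* 1 if the key points at real data */
};

static s64 overlap_delta(struct toy_extent new, struct toy_extent old)
{
	u64 start = new.start > old.start ? new.start : old.start;
	u64 end   = new.end   < old.end   ? new.end   : old.end;

	if (end <= start)
		return 0;

	/*
	 * Each overlapping sector counts +1 if the new key allocates it and
	 * the old key didn't, -1 in the opposite case, 0 otherwise -- e.g.
	 * writing data over an 8-sector hole yields +8, punching a hole in
	 * 8 allocated sectors yields -8:
	 */
	return (s64) (end - start) * (new.allocated - old.allocated);
}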
bch2_trans_begin_updates(&trans); + + ret = bch2_extent_update(&trans, iter, sk.k, + &op->res, op_journal_seq(op), + op->new_i_size, &op->i_sectors_delta); + if (ret == -EINTR) + continue; + if (ret) + break; + + if (bkey_cmp(iter->pos, k->k.p) >= 0) + bch2_keylist_pop_front(keys); + } while (!bch2_keylist_empty(keys)); + + bch2_trans_exit(&trans); + bkey_on_stack_exit(&sk, c); + + return ret; +} + /* Writes */ void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c, @@ -243,60 +501,12 @@ static void bch2_write_done(struct closure *cl) bch2_time_stats_update(&c->times[BCH_TIME_data_write], op->start_time); - closure_return(cl); -} - -int bch2_write_index_default(struct bch_write_op *op) -{ - struct bch_fs *c = op->c; - struct btree_trans trans; - struct btree_iter *iter; - struct keylist *keys = &op->insert_keys; - int ret; - - BUG_ON(bch2_keylist_empty(keys)); - bch2_verify_keylist_sorted(keys); - - bch2_trans_init(&trans, c, BTREE_ITER_MAX, 256); -retry: - bch2_trans_begin(&trans); - - iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, - bkey_start_pos(&bch2_keylist_front(keys)->k), - BTREE_ITER_INTENT); - - do { - BKEY_PADDED(k) split; - - bkey_copy(&split.k, bch2_keylist_front(keys)); - - ret = bch2_extent_trim_atomic(&split.k, iter); - if (ret) - break; - - bch2_trans_update(&trans, - BTREE_INSERT_ENTRY(iter, &split.k)); - - ret = bch2_trans_commit(&trans, &op->res, op_journal_seq(op), - BTREE_INSERT_NOFAIL| - BTREE_INSERT_USE_RESERVE); - if (ret) - break; - - if (bkey_cmp(iter->pos, bch2_keylist_front(keys)->k.p) < 0) - bch2_cut_front(iter->pos, bch2_keylist_front(keys)); - else - bch2_keylist_pop_front(keys); - } while (!bch2_keylist_empty(keys)); - - if (ret == -EINTR) { - ret = 0; - goto retry; - } - - bch2_trans_exit(&trans); - - return ret; + if (op->end_io) + op->end_io(op); + if (cl->parent) + closure_return(cl); + else + closure_debug_destroy(cl); } /** @@ -313,16 +523,19 @@ static void __bch2_write_index(struct bch_write_op *op) for (src = keys->keys; src != keys->top; src = n) { n = bkey_next(src); - bkey_copy(dst, src); - bch2_bkey_drop_ptrs(bkey_i_to_s(dst), ptr, - test_bit(ptr->dev, op->failed.d)); + if (bkey_extent_is_direct_data(&src->k)) { + bch2_bkey_drop_ptrs(bkey_i_to_s(src), ptr, + test_bit(ptr->dev, op->failed.d)); - if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(dst))) { - ret = -EIO; - goto err; + if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(src))) { + ret = -EIO; + goto err; + } } + if (dst != src) + memmove_u64s_down(dst, src, src->u64s); dst = bkey_next(dst); } @@ -340,6 +553,7 @@ static void __bch2_write_index(struct bch_write_op *op) u64 sectors_start = keylist_sectors(keys); int ret = op->index_update_fn(op); + BUG_ON(ret == -EINTR); BUG_ON(keylist_sectors(keys) && !ret); op->written += sectors_start - keylist_sectors(keys); @@ -404,8 +618,10 @@ static void bch2_write_endio(struct bio *bio) if (parent) bio_endio(&parent->bio); - else + else if (!(op->flags & BCH_WRITE_SKIP_CLOSURE_PUT)) closure_put(cl); + else + continue_at_nobarrier(cl, bch2_write_index, index_update_wq(op)); } static void init_append_extent(struct bch_write_op *op, @@ -414,27 +630,36 @@ static void init_append_extent(struct bch_write_op *op, struct bch_extent_crc_unpacked crc) { struct bch_fs *c = op->c; - struct bkey_i_extent *e = bkey_extent_init(op->insert_keys.top); - struct extent_ptr_decoded p = { .crc = crc }; + struct bkey_i_extent *e; struct open_bucket *ob; unsigned i; + BUG_ON(crc.compressed_size > wp->sectors_free); + wp->sectors_free -= crc.compressed_size; 
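/*
 * Editor's sketch: the in-place keylist compaction performed by
 * __bch2_write_index() above after dropping pointers to failed devices.
 * Records are packed back-to-back and sized in u64s; surviving records
 * slide down over the dropped ones.  toy_rec and compact() are
 * hypothetical simplifications -- the real code walks struct bkey_i and
 * moves keys with memmove_u64s_down().
 */
struct toy_rec {
	u64	u64s;		/* record size in u64s, header included (>= 1) */
	u64	payload[];
};

static u64 *compact(u64 *start, u64 *end, int (*keep)(struct toy_rec *))
{
	u64 *dst = start, *src = start;

	while (src < end) {
		struct toy_rec *rec = (struct toy_rec *) src;
		u64 u64s = rec->u64s;

		if (keep(rec)) {
			if (dst != src)
				memmove(dst, src, u64s * sizeof(u64));
			dst += u64s;
		}
		src += u64s;
	}

	return dst;	/* new end of the packed buffer */
}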
op->pos.offset += crc.uncompressed_size; + + e = bkey_extent_init(op->insert_keys.top); e->k.p = op->pos; e->k.size = crc.uncompressed_size; e->k.version = version; - BUG_ON(crc.compressed_size > wp->sectors_free); - wp->sectors_free -= crc.compressed_size; + if (crc.csum_type || + crc.compression_type || + crc.nonce) + bch2_extent_crc_append(&e->k_i, crc); open_bucket_for_each(c, &wp->ptrs, ob, i) { struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev); + union bch_extent_entry *end = + bkey_val_end(bkey_i_to_s(&e->k_i)); - p.ptr = ob->ptr; - p.ptr.cached = !ca->mi.durability || + end->ptr = ob->ptr; + end->ptr.type = 1 << BCH_EXTENT_ENTRY_ptr; + end->ptr.cached = !ca->mi.durability || (op->flags & BCH_WRITE_CACHED) != 0; - p.ptr.offset += ca->mi.bucket_size - ob->sectors_free; - bch2_extent_ptr_decoded_append(&e->k_i, &p); + end->ptr.offset += ca->mi.bucket_size - ob->sectors_free; + + e->k.u64s++; BUG_ON(crc.compressed_size > ob->sectors_free); ob->sectors_free -= crc.compressed_size; @@ -615,15 +840,14 @@ static enum prep_encoded_ret { return PREP_ENCODED_OK; } -static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp) +static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp, + struct bio **_dst) { struct bch_fs *c = op->c; struct bio *src = &op->wbio.bio, *dst = src; struct bvec_iter saved_iter; - struct bkey_i *key_to_write; void *ec_buf; - unsigned key_to_write_offset = op->insert_keys.top_p - - op->insert_keys.keys_p; + struct bpos ec_pos = op->pos; unsigned total_output = 0, total_input = 0; bool bounce = false; bool page_alloc_failed = false; @@ -642,6 +866,7 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp) case PREP_ENCODED_CHECKSUM_ERR: goto csum_err; case PREP_ENCODED_DO_WRITE: + /* XXX look for bug here */ if (ec_buf) { dst = bch2_write_bio_alloc(c, wp, src, &page_alloc_failed, @@ -791,21 +1016,9 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp) dst->bi_iter.bi_size = total_output; do_write: /* might have done a realloc... 
*/ + bch2_ec_add_backpointer(c, wp, ec_pos, total_input >> 9); - key_to_write = (void *) (op->insert_keys.keys_p + key_to_write_offset); - - bch2_ec_add_backpointer(c, wp, - bkey_start_pos(&key_to_write->k), - total_input >> 9); - - dst->bi_end_io = bch2_write_endio; - dst->bi_private = &op->cl; - bio_set_op_attrs(dst, REQ_OP_WRITE, 0); - - closure_get(dst->bi_private); - - bch2_submit_wbio_replicas(to_wbio(dst), c, BCH_DATA_USER, - key_to_write); + *_dst = dst; return more; csum_err: bch_err(c, "error verifying existing checksum while " @@ -825,11 +1038,17 @@ static void __bch2_write(struct closure *cl) struct bch_write_op *op = container_of(cl, struct bch_write_op, cl); struct bch_fs *c = op->c; struct write_point *wp; + struct bio *bio; + bool skip_put = true; int ret; again: memset(&op->failed, 0, sizeof(op->failed)); do { + struct bkey_i *key_to_write; + unsigned key_to_write_offset = op->insert_keys.top_p - + op->insert_keys.keys_p; + /* +1 for possible cache device: */ if (op->open_buckets.nr + op->nr_replicas + 1 > ARRAY_SIZE(op->open_buckets.v)) @@ -862,23 +1081,39 @@ again: goto flush_io; } - ret = bch2_write_extent(op, wp); - bch2_open_bucket_get(c, wp, &op->open_buckets); + ret = bch2_write_extent(op, wp, &bio); bch2_alloc_sectors_done(c, wp); if (ret < 0) goto err; + + if (ret) + skip_put = false; + + bio->bi_end_io = bch2_write_endio; + bio->bi_private = &op->cl; + bio->bi_opf |= REQ_OP_WRITE; + + if (!skip_put) + closure_get(bio->bi_private); + else + op->flags |= BCH_WRITE_SKIP_CLOSURE_PUT; + + key_to_write = (void *) (op->insert_keys.keys_p + + key_to_write_offset); + + bch2_submit_wbio_replicas(to_wbio(bio), c, BCH_DATA_USER, + key_to_write); } while (ret); - continue_at(cl, bch2_write_index, index_update_wq(op)); + if (!skip_put) + continue_at(cl, bch2_write_index, index_update_wq(op)); return; err: op->error = ret; - continue_at(cl, !bch2_keylist_empty(&op->insert_keys) - ? 
bch2_write_index - : bch2_write_done, index_update_wq(op)); + continue_at(cl, bch2_write_index, index_update_wq(op)); return; flush_io: closure_sync(cl); @@ -895,6 +1130,47 @@ flush_io: goto again; } +static void bch2_write_data_inline(struct bch_write_op *op, unsigned data_len) +{ + struct closure *cl = &op->cl; + struct bio *bio = &op->wbio.bio; + struct bvec_iter iter; + struct bkey_i_inline_data *id; + unsigned sectors; + int ret; + + ret = bch2_keylist_realloc(&op->insert_keys, op->inline_keys, + ARRAY_SIZE(op->inline_keys), + BKEY_U64s + DIV_ROUND_UP(data_len, 8)); + if (ret) { + op->error = ret; + goto err; + } + + sectors = bio_sectors(bio); + op->pos.offset += sectors; + + id = bkey_inline_data_init(op->insert_keys.top); + id->k.p = op->pos; + id->k.version = op->version; + id->k.size = sectors; + + iter = bio->bi_iter; + iter.bi_size = data_len; + memcpy_from_bio(id->v.data, bio, iter); + + while (data_len & 7) + id->v.data[data_len++] = '\0'; + set_bkey_val_bytes(&id->k, data_len); + bch2_keylist_push(&op->insert_keys); + + op->flags |= BCH_WRITE_WROTE_DATA_INLINE; + continue_at_nobarrier(cl, bch2_write_index, NULL); + return; +err: + bch2_write_done(&op->cl); +} + /** * bch_write - handle a write to a cache device or flash only volume * @@ -916,22 +1192,22 @@ void bch2_write(struct closure *cl) struct bch_write_op *op = container_of(cl, struct bch_write_op, cl); struct bio *bio = &op->wbio.bio; struct bch_fs *c = op->c; + unsigned data_len; BUG_ON(!op->nr_replicas); BUG_ON(!op->write_point.v); BUG_ON(!bkey_cmp(op->pos, POS_MAX)); + op->start_time = local_clock(); + bch2_keylist_init(&op->insert_keys, op->inline_keys); + wbio_init(bio)->put_bio = false; + if (bio_sectors(bio) & (c->opts.block_size - 1)) { __bcache_io_error(c, "misaligned write"); op->error = -EIO; goto err; } - op->start_time = local_clock(); - - bch2_keylist_init(&op->insert_keys, op->inline_keys); - wbio_init(bio)->put_bio = false; - if (c->opts.nochanges || !percpu_ref_tryget(&c->writes)) { __bcache_io_error(c, "read only"); @@ -941,12 +1217,25 @@ void bch2_write(struct closure *cl) bch2_increment_clock(c, bio_sectors(bio), WRITE); + data_len = min_t(u64, bio->bi_iter.bi_size, + op->new_i_size - (op->pos.offset << 9)); + + if (data_len <= min(block_bytes(c) / 2, 1024U)) { + bch2_write_data_inline(op, data_len); + return; + } + continue_at_nobarrier(cl, __bch2_write, NULL); return; err: if (!(op->flags & BCH_WRITE_NOPUT_RESERVATION)) bch2_disk_reservation_put(c, &op->res); - closure_return(cl); + if (op->end_io) + op->end_io(op); + if (cl->parent) + closure_return(cl); + else + closure_debug_destroy(cl); } /* Cache promotion on read */ @@ -1042,7 +1331,6 @@ static void promote_start(struct promote_op *op, struct bch_read_bio *rbio) closure_return_with_destructor(cl, promote_done); } -noinline static struct promote_op *__promote_alloc(struct bch_fs *c, enum btree_id btree_id, struct bpos pos, @@ -1116,7 +1404,8 @@ err: return NULL; } -static inline struct promote_op *promote_alloc(struct bch_fs *c, +noinline +static struct promote_op *promote_alloc(struct bch_fs *c, struct bvec_iter iter, struct bkey_s_c k, struct extent_ptr_decoded *pick, @@ -1228,12 +1517,14 @@ static void bch2_read_retry_nodecode(struct bch_fs *c, struct bch_read_bio *rbio { struct btree_trans trans; struct btree_iter *iter; - BKEY_PADDED(k) tmp; + struct bkey_on_stack sk; struct bkey_s_c k; int ret; flags &= ~BCH_READ_LAST_FRAGMENT; + flags |= BCH_READ_MUST_CLONE; + bkey_on_stack_init(&sk); bch2_trans_init(&trans, c, 0, 0); iter = 
bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, @@ -1245,11 +1536,11 @@ retry: if (bkey_err(k)) goto err; - bkey_reassemble(&tmp.k, k); - k = bkey_i_to_s_c(&tmp.k); + bkey_on_stack_reassemble(&sk, c, k); + k = bkey_i_to_s_c(sk.k); bch2_trans_unlock(&trans); - if (!bch2_bkey_matches_ptr(c, bkey_i_to_s_c(&tmp.k), + if (!bch2_bkey_matches_ptr(c, k, rbio->pick.ptr, rbio->pos.offset - rbio->pick.crc.offset)) { @@ -1266,6 +1557,7 @@ retry: out: bch2_rbio_done(rbio); bch2_trans_exit(&trans); + bkey_on_stack_exit(&sk, c); return; err: rbio->bio.bi_status = BLK_STS_IOERR; @@ -1278,12 +1570,14 @@ static void bch2_read_retry(struct bch_fs *c, struct bch_read_bio *rbio, { struct btree_trans trans; struct btree_iter *iter; + struct bkey_on_stack sk; struct bkey_s_c k; int ret; flags &= ~BCH_READ_LAST_FRAGMENT; flags |= BCH_READ_MUST_CLONE; + bkey_on_stack_init(&sk); bch2_trans_init(&trans, c, 0, 0); retry: bch2_trans_begin(&trans); @@ -1291,18 +1585,17 @@ retry: for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, POS(inode, bvec_iter.bi_sector), BTREE_ITER_SLOTS, k, ret) { - BKEY_PADDED(k) tmp; unsigned bytes, sectors, offset_into_extent; - bkey_reassemble(&tmp.k, k); - k = bkey_i_to_s_c(&tmp.k); + bkey_on_stack_reassemble(&sk, c, k); + k = bkey_i_to_s_c(sk.k); offset_into_extent = iter->pos.offset - bkey_start_offset(k.k); sectors = k.k->size - offset_into_extent; ret = bch2_read_indirect_extent(&trans, - &offset_into_extent, &tmp.k); + &offset_into_extent, sk.k); if (ret) break; @@ -1329,6 +1622,8 @@ retry: bio_advance_iter(&rbio->bio, &bvec_iter, bytes); } + if (ret == -EINTR) + goto retry; /* * If we get here, it better have been because there was an error * reading a btree node @@ -1339,6 +1634,7 @@ err: rbio->bio.bi_status = BLK_STS_IOERR; out: bch2_trans_exit(&trans); + bkey_on_stack_exit(&sk, c); bch2_rbio_done(rbio); } @@ -1395,7 +1691,7 @@ static void bch2_rbio_narrow_crcs(struct bch_read_bio *rbio) struct btree_trans trans; struct btree_iter *iter; struct bkey_s_c k; - BKEY_PADDED(k) new; + struct bkey_on_stack new; struct bch_extent_crc_unpacked new_crc; u64 data_offset = rbio->pos.offset - rbio->pick.crc.offset; int ret; @@ -1403,18 +1699,19 @@ static void bch2_rbio_narrow_crcs(struct bch_read_bio *rbio) if (rbio->pick.crc.compression_type) return; + bkey_on_stack_init(&new); bch2_trans_init(&trans, c, 0, 0); retry: bch2_trans_begin(&trans); iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, rbio->pos, - BTREE_ITER_INTENT); - k = bch2_btree_iter_peek(iter); + BTREE_ITER_SLOTS|BTREE_ITER_INTENT); + k = bch2_btree_iter_peek_slot(iter); if (IS_ERR_OR_NULL(k.k)) goto out; - bkey_reassemble(&new.k, k); - k = bkey_i_to_s_c(&new.k); + bkey_on_stack_reassemble(&new, c, k); + k = bkey_i_to_s_c(new.k); if (bversion_cmp(k.k->version, rbio->version) || !bch2_bkey_matches_ptr(c, k, rbio->pick.ptr, data_offset)) @@ -1433,10 +1730,10 @@ retry: goto out; } - if (!bch2_bkey_narrow_crcs(&new.k, new_crc)) + if (!bch2_bkey_narrow_crcs(new.k, new_crc)) goto out; - bch2_trans_update(&trans, BTREE_INSERT_ENTRY(iter, &new.k)); + bch2_trans_update(&trans, iter, new.k); ret = bch2_trans_commit(&trans, NULL, NULL, BTREE_INSERT_ATOMIC| BTREE_INSERT_NOFAIL| @@ -1445,6 +1742,7 @@ retry: goto retry; out: bch2_trans_exit(&trans); + bkey_on_stack_exit(&new, c); } /* Inner part that may run in process context */ @@ -1602,9 +1900,9 @@ int __bch2_read_indirect_extent(struct btree_trans *trans, reflink_offset = le64_to_cpu(bkey_i_to_reflink_p(orig_k)->v.idx) + *offset_into_extent; - iter = __bch2_trans_get_iter(trans, 
BTREE_ID_REFLINK, - POS(0, reflink_offset), - BTREE_ITER_SLOTS, 1); + iter = bch2_trans_get_iter(trans, BTREE_ID_REFLINK, + POS(0, reflink_offset), + BTREE_ITER_SLOTS); ret = PTR_ERR_OR_ZERO(iter); if (ret) return ret; @@ -1641,6 +1939,19 @@ int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig, struct bpos pos = bkey_start_pos(k.k); int pick_ret; + if (k.k->type == KEY_TYPE_inline_data) { + struct bkey_s_c_inline_data d = bkey_s_c_to_inline_data(k); + unsigned bytes = min_t(unsigned, iter.bi_size, + bkey_val_bytes(d.k)); + + swap(iter.bi_size, bytes); + memcpy_to_bio(&orig->bio, iter, d.v->data); + swap(iter.bi_size, bytes); + bio_advance_iter(&orig->bio, &iter, bytes); + zero_fill_bio_iter(&orig->bio, iter); + goto out_read_done; + } + pick_ret = bch2_bkey_pick_read_device(c, k, failed, &pick); /* hole or reservation - just zero fill: */ @@ -1677,7 +1988,7 @@ int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig, if (narrow_crcs && (flags & BCH_READ_USER_MAPPED)) flags |= BCH_READ_MUST_BOUNCE; - BUG_ON(offset_into_extent + bvec_iter_sectors(iter) > k.k->size); + EBUG_ON(offset_into_extent + bvec_iter_sectors(iter) > k.k->size); if (pick.crc.compression_type != BCH_COMPRESSION_NONE || (pick.crc.csum_type != BCH_CSUM_NONE && @@ -1689,8 +2000,9 @@ int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig, bounce = true; } - promote = promote_alloc(c, iter, k, &pick, orig->opts, flags, - &rbio, &bounce, &read_full); + if (orig->opts.promote_target) + promote = promote_alloc(c, iter, k, &pick, orig->opts, flags, + &rbio, &bounce, &read_full); if (!read_full) { EBUG_ON(pick.crc.compression_type); @@ -1718,7 +2030,7 @@ int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig, * data in the write path, but we're not going to use it all * here: */ - BUG_ON(rbio->bio.bi_iter.bi_size < + EBUG_ON(rbio->bio.bi_iter.bi_size < pick.crc.compressed_size << 9); rbio->bio.bi_iter.bi_size = pick.crc.compressed_size << 9; @@ -1751,10 +2063,10 @@ int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig, noclone: rbio = orig; rbio->bio.bi_iter = iter; - BUG_ON(bio_flagged(&rbio->bio, BIO_CHAIN)); + EBUG_ON(bio_flagged(&rbio->bio, BIO_CHAIN)); } - BUG_ON(bio_sectors(&rbio->bio) != pick.crc.compressed_size); + EBUG_ON(bio_sectors(&rbio->bio) != pick.crc.compressed_size); rbio->c = c; rbio->submit_time = local_clock(); @@ -1770,6 +2082,7 @@ noclone: rbio->hole = 0; rbio->retry = 0; rbio->context = 0; + /* XXX: only initialize this if needed */ rbio->devs_have = bch2_bkey_devs(k); rbio->pick = pick; rbio->pos = pos; @@ -1786,11 +2099,11 @@ noclone: bch2_increment_clock(c, bio_sectors(&rbio->bio), READ); - percpu_down_read(&c->mark_lock); + rcu_read_lock(); bucket_io_clock_reset(c, ca, PTR_BUCKET_NR(ca, &pick.ptr), READ); - percpu_up_read(&c->mark_lock); + rcu_read_unlock(); - if (likely(!(flags & (BCH_READ_IN_RETRY|BCH_READ_LAST_FRAGMENT)))) { + if (!(flags & (BCH_READ_IN_RETRY|BCH_READ_LAST_FRAGMENT))) { bio_inc_remaining(&orig->bio); trace_read_split(&orig->bio); } @@ -1867,14 +2180,13 @@ void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, u64 inode) { struct btree_trans trans; struct btree_iter *iter; + struct bkey_on_stack sk; struct bkey_s_c k; unsigned flags = BCH_READ_RETRY_IF_STALE| BCH_READ_MAY_PROMOTE| BCH_READ_USER_MAPPED; int ret; - bch2_trans_init(&trans, c, 0, 0); - BUG_ON(rbio->_state); BUG_ON(flags & BCH_READ_NODECODE); BUG_ON(flags & BCH_READ_IN_RETRY); @@ -1882,12 +2194,15 @@ void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, 
u64 inode) rbio->c = c; rbio->start_time = local_clock(); + bkey_on_stack_init(&sk); + bch2_trans_init(&trans, c, 0, 0); +retry: + bch2_trans_begin(&trans); + iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, POS(inode, rbio->bio.bi_iter.bi_sector), BTREE_ITER_SLOTS); - while (1) { - BKEY_PADDED(k) tmp; unsigned bytes, sectors, offset_into_extent; bch2_btree_iter_set_pos(iter, @@ -1898,15 +2213,15 @@ void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, u64 inode) if (ret) goto err; - bkey_reassemble(&tmp.k, k); - k = bkey_i_to_s_c(&tmp.k); - offset_into_extent = iter->pos.offset - bkey_start_offset(k.k); sectors = k.k->size - offset_into_extent; + bkey_on_stack_reassemble(&sk, c, k); + k = bkey_i_to_s_c(sk.k); + ret = bch2_read_indirect_extent(&trans, - &offset_into_extent, &tmp.k); + &offset_into_extent, sk.k); if (ret) goto err; @@ -1938,8 +2253,12 @@ void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, u64 inode) } out: bch2_trans_exit(&trans); + bkey_on_stack_exit(&sk, c); return; err: + if (ret == -EINTR) + goto retry; + bcache_io_error(c, &rbio->bio, "btree IO error: %i", ret); bch2_rbio_done(rbio); goto out; diff --git a/fs/bcachefs/io.h b/fs/bcachefs/io.h index 80b72dbf1a0c..45c950942d78 100644 --- a/fs/bcachefs/io.h +++ b/fs/bcachefs/io.h @@ -30,9 +30,11 @@ enum bch_write_flags { BCH_WRITE_PAGES_OWNED = (1 << 5), BCH_WRITE_ONLY_SPECIFIED_DEVS = (1 << 6), BCH_WRITE_NOPUT_RESERVATION = (1 << 7), + BCH_WRITE_WROTE_DATA_INLINE = (1 << 8), /* Internal: */ - BCH_WRITE_JOURNAL_SEQ_PTR = (1 << 8), + BCH_WRITE_JOURNAL_SEQ_PTR = (1 << 9), + BCH_WRITE_SKIP_CLOSURE_PUT = (1 << 10), }; static inline u64 *op_journal_seq(struct bch_write_op *op) @@ -54,13 +56,20 @@ static inline struct workqueue_struct *index_update_wq(struct bch_write_op *op) : op->c->wq; } +int bch2_extent_update(struct btree_trans *, struct btree_iter *, + struct bkey_i *, struct disk_reservation *, + u64 *, u64, s64 *); +int bch2_fpunch_at(struct btree_trans *, struct btree_iter *, + struct bpos, u64 *, s64 *); +int bch2_fpunch(struct bch_fs *c, u64, u64, u64, u64 *, s64 *); + int bch2_write_index_default(struct bch_write_op *); static inline void bch2_write_op_init(struct bch_write_op *op, struct bch_fs *c, struct bch_io_opts opts) { op->c = c; - op->io_wq = index_update_wq(op); + op->end_io = NULL; op->flags = 0; op->written = 0; op->error = 0; @@ -78,6 +87,8 @@ static inline void bch2_write_op_init(struct bch_write_op *op, struct bch_fs *c, op->write_point = (struct write_point_specifier) { 0 }; op->res = (struct disk_reservation) { 0 }; op->journal_seq = 0; + op->new_i_size = U64_MAX; + op->i_sectors_delta = 0; op->index_update_fn = bch2_write_index_default; } diff --git a/fs/bcachefs/io_types.h b/fs/bcachefs/io_types.h index 2d397e5e5b9e..c37b7d7401e9 100644 --- a/fs/bcachefs/io_types.h +++ b/fs/bcachefs/io_types.h @@ -93,7 +93,7 @@ struct bch_write_bio { struct bch_write_op { struct closure cl; struct bch_fs *c; - struct workqueue_struct *io_wq; + void (*end_io)(struct bch_write_op *); u64 start_time; unsigned written; /* sectors */ @@ -109,7 +109,6 @@ struct bch_write_op { struct bch_devs_list devs_have; u16 target; u16 nonce; - struct bch_io_opts opts; struct bpos pos; @@ -132,6 +131,8 @@ struct bch_write_op { u64 *journal_seq_p; u64 journal_seq; }; + u64 new_i_size; + s64 i_sectors_delta; int (*index_update_fn)(struct bch_write_op *); diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index 5c3e146e3942..9f03a479c9a2 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -945,7 
+945,7 @@ static bool bch2_journal_writing_to_device(struct journal *j, unsigned dev_idx) w = j->buf + !state.idx; ret = state.prev_buf_unwritten && - bch2_extent_has_device(bkey_i_to_s_c_extent(&w->key), dev_idx); + bch2_bkey_has_device(bkey_i_to_s_c(&w->key), dev_idx); spin_unlock(&j->lock); return ret; diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h index ec5ba2b9ef42..ec61137df00a 100644 --- a/fs/bcachefs/journal.h +++ b/fs/bcachefs/journal.h @@ -269,7 +269,7 @@ static inline void bch2_journal_res_put(struct journal *j, if (!res->ref) return; - lock_release(&j->res_map, 0, _RET_IP_); + lock_release(&j->res_map, 0, _THIS_IP_); while (res->u64s) bch2_journal_add_entry(j, res, diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index 387377dadab5..7112a25d0600 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -1100,7 +1100,7 @@ void bch2_journal_write(struct closure *cl) for_each_rw_member(ca, c, i) if (journal_flushes_device(ca) && - !bch2_extent_has_device(bkey_i_to_s_c_extent(&w->key), i)) { + !bch2_bkey_has_device(bkey_i_to_s_c(&w->key), i)) { percpu_ref_get(&ca->io_ref); bio = ca->journal.bio; diff --git a/fs/bcachefs/migrate.c b/fs/bcachefs/migrate.c index dc3b03d6e627..4b59dcd04cce 100644 --- a/fs/bcachefs/migrate.c +++ b/fs/bcachefs/migrate.c @@ -4,6 +4,7 @@ */ #include "bcachefs.h" +#include "bkey_on_stack.h" #include "btree_update.h" #include "btree_update_interior.h" #include "buckets.h" @@ -40,9 +41,10 @@ static int __bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags struct btree_trans trans; struct btree_iter *iter; struct bkey_s_c k; - BKEY_PADDED(key) tmp; + struct bkey_on_stack sk; int ret = 0; + bkey_on_stack_init(&sk); bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); iter = bch2_trans_get_iter(&trans, btree_id, POS_MIN, @@ -58,9 +60,9 @@ static int __bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags continue; } - bkey_reassemble(&tmp.key, k); + bkey_on_stack_reassemble(&sk, c, k); - ret = drop_dev_ptrs(c, bkey_i_to_s(&tmp.key), + ret = drop_dev_ptrs(c, bkey_i_to_s(sk.k), dev_idx, flags, false); if (ret) break; @@ -70,12 +72,11 @@ static int __bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags * will do the appropriate thing with it (turning it into a * KEY_TYPE_error key, or just a discard if it was a cached extent) */ - bch2_extent_normalize(c, bkey_i_to_s(&tmp.key)); + bch2_extent_normalize(c, bkey_i_to_s(sk.k)); - /* XXX not sketchy at all */ - iter->pos = bkey_start_pos(&tmp.key.k); + bch2_btree_iter_set_pos(iter, bkey_start_pos(&sk.k->k)); - bch2_trans_update(&trans, BTREE_INSERT_ENTRY(iter, &tmp.key)); + bch2_trans_update(&trans, iter, sk.k); ret = bch2_trans_commit(&trans, NULL, NULL, BTREE_INSERT_ATOMIC| @@ -93,6 +94,7 @@ static int __bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags } ret = bch2_trans_exit(&trans) ?: ret; + bkey_on_stack_exit(&sk, c); BUG_ON(ret == -EINTR); diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index 70b2b686e0a8..fad3cc4d587c 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -2,6 +2,7 @@ #include "bcachefs.h" #include "alloc_foreground.h" +#include "bkey_on_stack.h" #include "btree_gc.h" #include "btree_update.h" #include "btree_update_interior.h" @@ -96,10 +97,11 @@ static int bch2_migrate_index_update(struct bch_write_op *op) bkey_copy(&_new.k, bch2_keylist_front(keys)); new = bkey_i_to_extent(&_new.k); + bch2_cut_front(iter->pos, &new->k_i); - bch2_cut_front(iter->pos, insert); - bch2_cut_back(new->k.p, 
&insert->k); - bch2_cut_back(insert->k.p, &new->k); + bch2_cut_front(iter->pos, insert); + bch2_cut_back(new->k.p, insert); + bch2_cut_back(insert->k.p, &new->k_i); if (m->data_cmd == DATA_REWRITE) bch2_bkey_drop_device(bkey_i_to_s(insert), @@ -133,11 +135,11 @@ static int bch2_migrate_index_update(struct bch_write_op *op) * If we're not fully overwriting @k, and it's compressed, we * need a reservation for all the pointers in @insert */ - nr = bch2_bkey_nr_dirty_ptrs(bkey_i_to_s_c(insert)) - + nr = bch2_bkey_nr_ptrs_allocated(bkey_i_to_s_c(insert)) - m->nr_ptrs_reserved; if (insert->k.size < k.k->size && - bch2_extent_is_compressed(k) && + bch2_bkey_sectors_compressed(k) && nr > 0) { ret = bch2_disk_reservation_add(c, &op->res, keylist_sectors(keys) * nr, 0); @@ -148,8 +150,7 @@ static int bch2_migrate_index_update(struct bch_write_op *op) goto next; } - bch2_trans_update(&trans, - BTREE_INSERT_ENTRY(iter, insert)); + bch2_trans_update(&trans, iter, insert); ret = bch2_trans_commit(&trans, &op->res, op_journal_seq(op), @@ -169,8 +170,6 @@ next: if (bch2_keylist_empty(keys)) goto out; } - - bch2_cut_front(iter->pos, bch2_keylist_front(keys)); continue; nomatch: if (m->ctxt) @@ -252,7 +251,7 @@ int bch2_migrate_write_init(struct bch_fs *c, struct migrate_write *m, */ #if 0 int nr = (int) io_opts.data_replicas - - bch2_bkey_nr_dirty_ptrs(k); + bch2_bkey_nr_ptrs_allocated(k); #endif int nr = (int) io_opts.data_replicas; @@ -302,12 +301,12 @@ static void move_free(struct closure *cl) { struct moving_io *io = container_of(cl, struct moving_io, cl); struct moving_context *ctxt = io->write.ctxt; + struct bvec_iter_all iter; struct bio_vec *bv; - int i; bch2_disk_reservation_put(io->write.op.c, &io->write.op.res); - bio_for_each_segment_all(bv, &io->write.op.wbio.bio, i) + bio_for_each_segment_all(bv, &io->write.op.wbio.bio, iter) if (bv->bv_page) __free_page(bv->bv_page); @@ -491,7 +490,7 @@ static int __bch2_move_data(struct bch_fs *c, { bool kthread = (current->flags & PF_KTHREAD) != 0; struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts); - BKEY_PADDED(k) tmp; + struct bkey_on_stack sk; struct btree_trans trans; struct btree_iter *iter; struct bkey_s_c k; @@ -500,6 +499,7 @@ static int __bch2_move_data(struct bch_fs *c, u64 delay, cur_inum = U64_MAX; int ret = 0, ret2; + bkey_on_stack_init(&sk); bch2_trans_init(&trans, c, 0, 0); stats->data_type = BCH_DATA_USER; @@ -551,7 +551,8 @@ peek: if (!bkey_extent_is_direct_data(k.k)) goto next_nondata; - if (cur_inum != k.k->p.inode) { + if (btree_id == BTREE_ID_EXTENTS && + cur_inum != k.k->p.inode) { struct bch_inode_unpacked inode; /* don't hold btree locks while looking up inode: */ @@ -578,8 +579,8 @@ peek: } /* unlock before doing IO: */ - bkey_reassemble(&tmp.k, k); - k = bkey_i_to_s_c(&tmp.k); + bkey_on_stack_reassemble(&sk, c, k); + k = bkey_i_to_s_c(sk.k); bch2_trans_unlock(&trans); ret2 = bch2_move_extent(c, ctxt, wp, io_opts, btree_id, k, @@ -598,7 +599,7 @@ peek: if (rate) bch2_ratelimit_increment(rate, k.k->size); next: - atomic64_add(k.k->size * bch2_bkey_nr_dirty_ptrs(k), + atomic64_add(k.k->size * bch2_bkey_nr_ptrs_allocated(k), &stats->sectors_seen); next_nondata: bch2_btree_iter_next(iter); @@ -606,6 +607,7 @@ next_nondata: } out: ret = bch2_trans_exit(&trans) ?: ret; + bkey_on_stack_exit(&sk, c); return ret; } diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c index 710296044194..abdeef20fde9 100644 --- a/fs/bcachefs/movinggc.c +++ b/fs/bcachefs/movinggc.c @@ -107,10 +107,10 @@ static bool 
have_copygc_reserve(struct bch_dev *ca) { bool ret; - spin_lock(&ca->freelist_lock); + spin_lock(&ca->fs->freelist_lock); ret = fifo_full(&ca->free[RESERVE_MOVINGGC]) || ca->allocator_state != ALLOCATOR_RUNNING; - spin_unlock(&ca->freelist_lock); + spin_unlock(&ca->fs->freelist_lock); return ret; } diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h index 97a782f44f6e..0ec0999a6214 100644 --- a/fs/bcachefs/opts.h +++ b/fs/bcachefs/opts.h @@ -68,6 +68,12 @@ enum opt_type { * - helptext */ +#ifdef __KERNEL__ +#define RATELIMIT_ERRORS true +#else +#define RATELIMIT_ERRORS false +#endif + #define BCH_OPTS() \ x(block_size, u16, \ OPT_FORMAT, \ @@ -127,7 +133,7 @@ enum opt_type { x(str_hash, u8, \ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ OPT_STR(bch2_str_hash_types), \ - BCH_SB_STR_HASH_TYPE, BCH_STR_HASH_SIPHASH, \ + BCH_SB_STR_HASH_TYPE, BCH_STR_HASH_OPT_SIPHASH, \ NULL, "Hash function for directory entries and xattrs")\ x(foreground_target, u16, \ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE, \ @@ -227,6 +233,11 @@ enum opt_type { OPT_BOOL(), \ NO_SB_OPT, false, \ NULL, "Fix errors during fsck without asking") \ + x(ratelimit_errors, u8, \ + OPT_MOUNT, \ + OPT_BOOL(), \ + NO_SB_OPT, RATELIMIT_ERRORS, \ + NULL, "Ratelimit error messages during fsck") \ x(nochanges, u8, \ OPT_MOUNT, \ OPT_BOOL(), \ @@ -289,13 +300,7 @@ enum opt_type { OPT_UINT(0, BCH_REPLICAS_MAX), \ NO_SB_OPT, 1, \ "n", "Data written to this device will be considered\n"\ - "to have already been replicated n times") \ - x(new_inode_updates, u8, \ - OPT_MOUNT, \ - OPT_BOOL(), \ - NO_SB_OPT, false, \ - NULL, "Enable new btree write-cache for inode updates") - + "to have already been replicated n times") struct bch_opts { #define x(_name, _bits, ...) unsigned _name##_defined:1; diff --git a/fs/bcachefs/quota.c b/fs/bcachefs/quota.c index f0da0fac09bf..0fa6f33c049b 100644 --- a/fs/bcachefs/quota.c +++ b/fs/bcachefs/quota.c @@ -752,7 +752,7 @@ static int bch2_set_quota(struct super_block *sb, struct kqid qid, if (qdq->d_fieldmask & QC_INO_HARD) new_quota.v.c[Q_INO].hardlimit = cpu_to_le64(qdq->d_ino_hardlimit); - bch2_trans_update(&trans, BTREE_INSERT_ENTRY(iter, &new_quota.k_i)); + bch2_trans_update(&trans, iter, &new_quota.k_i); ret = bch2_trans_commit(&trans, NULL, NULL, 0); diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 98d9a1432e50..d4002b7fc917 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -10,6 +10,7 @@ #include "dirent.h" #include "ec.h" #include "error.h" +#include "fs-common.h" #include "fsck.h" #include "journal_io.h" #include "journal_reclaim.h" @@ -176,7 +177,7 @@ static struct journal_keys journal_keys_sort(struct list_head *journal_entries) if ((cmp_int(i[0].journal_seq, i[1].journal_seq) ?: cmp_int(i[0].journal_offset, i[1].journal_offset)) < 0) { if (bkey_cmp(i[0].k->k.p, i[1].k->k.p) <= 0) { - bch2_cut_back(bkey_start_pos(&i[1].k->k), &i[0].k->k); + bch2_cut_back(bkey_start_pos(&i[1].k->k), i[0].k); } else { struct bkey_i *split = kmalloc(bkey_bytes(i[0].k), GFP_KERNEL); @@ -185,7 +186,7 @@ static struct journal_keys journal_keys_sort(struct list_head *journal_entries) goto err; bkey_copy(split, i[0].k); - bch2_cut_back(bkey_start_pos(&i[1].k->k), &split->k); + bch2_cut_back(bkey_start_pos(&i[1].k->k), split); keys_deduped.d[keys_deduped.nr++] = (struct journal_key) { .btree_id = i[0].btree_id, .allocated = true, @@ -253,7 +254,7 @@ static int bch2_extent_replay_key(struct bch_fs *c, enum btree_id btree_id, * Some extents aren't equivalent - w.r.t. 
what the triggers do * - if they're split: */ - bool remark_if_split = bch2_extent_is_compressed(bkey_i_to_s_c(k)) || + bool remark_if_split = bch2_bkey_sectors_compressed(bkey_i_to_s_c(k)) || k->k.type == KEY_TYPE_reflink_p; bool remark = false; int ret; @@ -271,6 +272,8 @@ retry: if (ret) goto err; + atomic_end = bpos_min(k->k.p, iter->l[0].b->key.k.p); + split_iter = bch2_trans_copy_iter(&trans, iter); ret = PTR_ERR_OR_ZERO(split_iter); if (ret) @@ -281,16 +284,12 @@ retry: if (ret) goto err; - ret = bch2_extent_atomic_end(split_iter, k, &atomic_end); - if (ret) - goto err; - if (!remark && remark_if_split && bkey_cmp(atomic_end, k->k.p) < 0) { ret = bch2_disk_reservation_add(c, &disk_res, k->k.size * - bch2_bkey_nr_dirty_ptrs(bkey_i_to_s_c(k)), + bch2_bkey_nr_ptrs_allocated(bkey_i_to_s_c(k)), BCH_DISK_RESERVATION_NOFAIL); BUG_ON(ret); @@ -299,9 +298,9 @@ retry: bkey_copy(split, k); bch2_cut_front(split_iter->pos, split); - bch2_cut_back(atomic_end, &split->k); + bch2_cut_back(atomic_end, split); - bch2_trans_update(&trans, BTREE_INSERT_ENTRY(split_iter, split)); + bch2_trans_update(&trans, split_iter, split); bch2_btree_iter_set_pos(iter, split->k.p); } while (bkey_cmp(iter->pos, k->k.p) < 0); @@ -865,6 +864,8 @@ int bch2_fs_recovery(struct bch_fs *c) goto err; } bch_verbose(c, "alloc write done"); + + set_bit(BCH_FS_ALLOC_WRITTEN, &c->flags); } if (!c->sb.clean) { @@ -912,6 +913,12 @@ int bch2_fs_recovery(struct bch_fs *c) write_sb = true; } + if (!(c->sb.features & (1ULL << BCH_FEATURE_INLINE_DATA))) { + c->disk_sb.sb->features[0] |= + cpu_to_le64(1ULL << BCH_FEATURE_INLINE_DATA); + write_sb = true; + } + if (!test_bit(BCH_FS_ERROR, &c->flags)) { c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_FEAT_ALLOC_INFO; write_sb = true; @@ -952,7 +959,6 @@ int bch2_fs_initialize(struct bch_fs *c) { struct bch_inode_unpacked root_inode, lostfound_inode; struct bkey_inode_buf packed_inode; - struct bch_hash_info root_hash_info; struct qstr lostfound = QSTR("lost+found"); const char *err = "cannot allocate memory"; struct bch_dev *ca; @@ -997,7 +1003,6 @@ int bch2_fs_initialize(struct bch_fs *c) bch2_inode_init(c, &root_inode, 0, 0, S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO, 0, NULL); root_inode.bi_inum = BCACHEFS_ROOT_INO; - root_inode.bi_nlink++; /* lost+found */ bch2_inode_pack(&packed_inode, &root_inode); err = "error creating root directory"; @@ -1007,24 +1012,15 @@ int bch2_fs_initialize(struct bch_fs *c) if (ret) goto err; - bch2_inode_init(c, &lostfound_inode, 0, 0, - S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO, 0, - &root_inode); - lostfound_inode.bi_inum = BCACHEFS_ROOT_INO + 1; - bch2_inode_pack(&packed_inode, &lostfound_inode); + bch2_inode_init_early(c, &lostfound_inode); err = "error creating lost+found"; - ret = bch2_btree_insert(c, BTREE_ID_INODES, - &packed_inode.inode.k_i, - NULL, NULL, 0); - if (ret) - goto err; - - root_hash_info = bch2_hash_info_init(c, &root_inode); - - ret = bch2_dirent_create(c, BCACHEFS_ROOT_INO, &root_hash_info, DT_DIR, - &lostfound, lostfound_inode.bi_inum, NULL, - BTREE_INSERT_NOFAIL); + ret = bch2_trans_do(c, NULL, BTREE_INSERT_ATOMIC, + bch2_create_trans(&trans, BCACHEFS_ROOT_INO, + &root_inode, &lostfound_inode, + &lostfound, + 0, 0, S_IFDIR|0700, 0, + NULL, NULL)); if (ret) goto err; diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c index dcca9c1d0f47..2812fa305c0e 100644 --- a/fs/bcachefs/reflink.c +++ b/fs/bcachefs/reflink.c @@ -1,9 +1,10 @@ // SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" +#include "bkey_on_stack.h" #include "btree_update.h" #include 
"extents.h" -#include "fs.h" -#include "fs-io.h" +#include "inode.h" +#include "io.h" #include "reflink.h" #include <linux/sched/signal.h> @@ -39,7 +40,7 @@ enum merge_result bch2_reflink_p_merge(struct bch_fs *c, if ((u64) l.k->size + r.k->size > KEY_SIZE_MAX) { bch2_key_resize(l.k, KEY_SIZE_MAX); - __bch2_cut_front(l.k->p, _r); + bch2_cut_front_s(l.k->p, _r); return BCH_MERGE_PARTIAL; } @@ -70,12 +71,6 @@ void bch2_reflink_v_to_text(struct printbuf *out, struct bch_fs *c, bch2_bkey_ptrs_to_text(out, c, k); } -/* - * bch2_remap_range() depends on bch2_extent_update(), which depends on various - * things tied to the linux vfs for inode updates, for now: - */ -#ifndef NO_BCACHEFS_FS - static int bch2_make_extent_indirect(struct btree_trans *trans, struct btree_iter *extent_iter, struct bkey_i_extent *e) @@ -120,7 +115,7 @@ static int bch2_make_extent_indirect(struct btree_trans *trans, r_v->v.refcount = 0; memcpy(r_v->v.start, e->v.start, bkey_val_bytes(&e->k)); - bch2_trans_update(trans, BTREE_INSERT_ENTRY(reflink_iter, &r_v->k_i)); + bch2_trans_update(trans, reflink_iter, &r_v->k_i); r_p = bch2_trans_kmalloc(trans, sizeof(*r_p)); if (IS_ERR(r_p)) @@ -131,7 +126,7 @@ static int bch2_make_extent_indirect(struct btree_trans *trans, set_bkey_val_bytes(&r_p->k, sizeof(r_p->v)); r_p->v.idx = cpu_to_le64(bkey_start_offset(&r_v->k)); - bch2_trans_update(trans, BTREE_INSERT_ENTRY(extent_iter, &r_p->k_i)); + bch2_trans_update(trans, extent_iter, &r_p->k_i); err: if (!IS_ERR(reflink_iter)) { c->reflink_hint = reflink_iter->pos.offset; @@ -144,35 +139,37 @@ err: static struct bkey_s_c get_next_src(struct btree_iter *iter, struct bpos end) { struct bkey_s_c k = bch2_btree_iter_peek(iter); + int ret; - while (1) { - if (bkey_err(k)) - return k; - + for_each_btree_key_continue(iter, 0, k, ret) { if (bkey_cmp(iter->pos, end) >= 0) return bkey_s_c_null; if (k.k->type == KEY_TYPE_extent || k.k->type == KEY_TYPE_reflink_p) - return k; - - k = bch2_btree_iter_next(iter); + break; } + + return k; } s64 bch2_remap_range(struct bch_fs *c, - struct bch_inode_info *dst_inode, struct bpos dst_start, struct bpos src_start, - u64 remap_sectors, u64 new_i_size) + u64 remap_sectors, u64 *journal_seq, + u64 new_i_size, s64 *i_sectors_delta) { struct btree_trans trans; struct btree_iter *dst_iter, *src_iter; struct bkey_s_c src_k; - BKEY_PADDED(k) new_dst, new_src; + BKEY_PADDED(k) new_dst; + struct bkey_on_stack new_src; struct bpos dst_end = dst_start, src_end = src_start; struct bpos dst_want, src_want; u64 src_done, dst_done; - int ret = 0; + int ret = 0, ret2 = 0; + + if (!percpu_ref_tryget(&c->writes)) + return -EROFS; if (!(c->sb.features & (1ULL << BCH_FEATURE_REFLINK))) { mutex_lock(&c->sb_lock); @@ -188,12 +185,13 @@ s64 bch2_remap_range(struct bch_fs *c, dst_end.offset += remap_sectors; src_end.offset += remap_sectors; + bkey_on_stack_init(&new_src); bch2_trans_init(&trans, c, BTREE_ITER_MAX, 4096); - src_iter = __bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, src_start, - BTREE_ITER_INTENT, 1); - dst_iter = __bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, dst_start, - BTREE_ITER_INTENT, 2); + src_iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, src_start, + BTREE_ITER_INTENT); + dst_iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, dst_start, + BTREE_ITER_INTENT); while (1) { bch2_trans_begin_updates(&trans); @@ -215,7 +213,7 @@ s64 bch2_remap_range(struct bch_fs *c, if (bkey_cmp(dst_iter->pos, dst_want) < 0) { ret = bch2_fpunch_at(&trans, dst_iter, dst_want, - dst_inode, new_i_size); + journal_seq, 
i_sectors_delta); if (ret) goto btree_err; continue; @@ -227,14 +225,14 @@ s64 bch2_remap_range(struct bch_fs *c, break; if (src_k.k->type == KEY_TYPE_extent) { - bkey_reassemble(&new_src.k, src_k); - src_k = bkey_i_to_s_c(&new_src.k); + bkey_on_stack_reassemble(&new_src, c, src_k); + src_k = bkey_i_to_s_c(new_src.k); - bch2_cut_front(src_iter->pos, &new_src.k); - bch2_cut_back(src_end, &new_src.k.k); + bch2_cut_front(src_iter->pos, new_src.k); + bch2_cut_back(src_end, new_src.k); ret = bch2_make_extent_indirect(&trans, src_iter, - bkey_i_to_extent(&new_src.k)); + bkey_i_to_extent(new_src.k)); if (ret) goto btree_err; @@ -261,9 +259,9 @@ s64 bch2_remap_range(struct bch_fs *c, min(src_k.k->p.offset - src_iter->pos.offset, dst_end.offset - dst_iter->pos.offset)); - ret = bch2_extent_update(&trans, dst_inode, NULL, NULL, - dst_iter, &new_dst.k, - new_i_size, false, true, NULL); + ret = bch2_extent_update(&trans, dst_iter, &new_dst.k, + NULL, journal_seq, + new_i_size, i_sectors_delta); if (ret) goto btree_err; @@ -284,17 +282,29 @@ err: dst_done = dst_iter->pos.offset - dst_start.offset; new_i_size = min(dst_iter->pos.offset << 9, new_i_size); + bch2_trans_begin(&trans); + + do { + struct bch_inode_unpacked inode_u; + struct btree_iter *inode_iter; + + inode_iter = bch2_inode_peek(&trans, &inode_u, + dst_start.inode, BTREE_ITER_INTENT); + ret2 = PTR_ERR_OR_ZERO(inode_iter); + + if (!ret2 && + inode_u.bi_size < new_i_size) { + inode_u.bi_size = new_i_size; + ret2 = bch2_inode_write(&trans, inode_iter, &inode_u) ?: + bch2_trans_commit(&trans, NULL, journal_seq, + BTREE_INSERT_ATOMIC); + } + } while (ret2 == -EINTR); + ret = bch2_trans_exit(&trans) ?: ret; + bkey_on_stack_exit(&new_src, c); - mutex_lock(&dst_inode->ei_update_lock); - if (dst_inode->v.i_size < new_i_size) { - i_size_write(&dst_inode->v, new_i_size); - ret = bch2_write_inode_size(c, dst_inode, new_i_size, - ATTR_MTIME|ATTR_CTIME); - } - mutex_unlock(&dst_inode->ei_update_lock); + percpu_ref_put(&c->writes); - return dst_done ?: ret; + return dst_done ?: ret ?: ret2; } - -#endif /* NO_BCACHEFS_FS */ diff --git a/fs/bcachefs/reflink.h b/fs/bcachefs/reflink.h index 327618c36d33..ac23b855858c 100644 --- a/fs/bcachefs/reflink.h +++ b/fs/bcachefs/reflink.h @@ -24,9 +24,7 @@ void bch2_reflink_v_to_text(struct printbuf *, struct bch_fs *, .val_to_text = bch2_reflink_v_to_text, \ } -#ifndef NO_BCACHEFS_FS -s64 bch2_remap_range(struct bch_fs *, struct bch_inode_info *, - struct bpos, struct bpos, u64, u64); -#endif /* NO_BCACHEFS_FS */ +s64 bch2_remap_range(struct bch_fs *, struct bpos, struct bpos, + u64, u64 *, u64, s64 *); #endif /* _BCACHEFS_REFLINK_H */ diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c index bb9da2bb5a92..cb5ebb87c701 100644 --- a/fs/bcachefs/replicas.c +++ b/fs/bcachefs/replicas.c @@ -84,10 +84,8 @@ static void extent_to_replicas(struct bkey_s_c k, if (p.ptr.cached) continue; - if (p.ec_nr) { + if (p.has_ec) r->nr_required = 0; - break; - } r->devs[r->nr_devs++] = p.ptr.dev; } diff --git a/fs/bcachefs/str_hash.h b/fs/bcachefs/str_hash.h index 091bf7a89577..582e718b6bd1 100644 --- a/fs/bcachefs/str_hash.h +++ b/fs/bcachefs/str_hash.h @@ -14,6 +14,23 @@ #include <crypto/hash.h> #include <crypto/sha.h> +static inline enum bch_str_hash_type +bch2_str_hash_opt_to_type(struct bch_fs *c, enum bch_str_hash_opts opt) +{ + switch (opt) { + case BCH_STR_HASH_OPT_CRC32C: + return BCH_STR_HASH_CRC32C; + case BCH_STR_HASH_OPT_CRC64: + return BCH_STR_HASH_CRC64; + case BCH_STR_HASH_OPT_SIPHASH: + return c->sb.features & 
(1ULL << BCH_FEATURE_NEW_SIPHASH) + ? BCH_STR_HASH_SIPHASH + : BCH_STR_HASH_SIPHASH_OLD; + default: + BUG(); + } +} + struct bch_hash_info { u8 type; union { @@ -23,34 +40,24 @@ struct bch_hash_info { }; static inline struct bch_hash_info -bch2_hash_info_init(struct bch_fs *c, - const struct bch_inode_unpacked *bi) +bch2_hash_info_init(struct bch_fs *c, const struct bch_inode_unpacked *bi) { /* XXX ick */ struct bch_hash_info info = { .type = (bi->bi_flags >> INODE_STR_HASH_OFFSET) & - ~(~0U << INODE_STR_HASH_BITS) + ~(~0U << INODE_STR_HASH_BITS), + .crc_key = bi->bi_hash_seed, }; - switch (info.type) { - case BCH_STR_HASH_CRC32C: - case BCH_STR_HASH_CRC64: - info.crc_key = bi->bi_hash_seed; - break; - case BCH_STR_HASH_SIPHASH: { + if (unlikely(info.type == BCH_STR_HASH_SIPHASH_OLD)) { SHASH_DESC_ON_STACK(desc, c->sha256); u8 digest[SHA256_DIGEST_SIZE]; desc->tfm = c->sha256; - desc->flags = 0; crypto_shash_digest(desc, (void *) &bi->bi_hash_seed, sizeof(bi->bi_hash_seed), digest); memcpy(&info.siphash_key, digest, sizeof(info.siphash_key)); - break; - } - default: - BUG(); } return info; @@ -74,6 +81,7 @@ static inline void bch2_str_hash_init(struct bch_str_hash_ctx *ctx, case BCH_STR_HASH_CRC64: ctx->crc64 = crc64_be(~0, &info->crc_key, sizeof(info->crc_key)); break; + case BCH_STR_HASH_SIPHASH_OLD: case BCH_STR_HASH_SIPHASH: SipHash24_Init(&ctx->siphash, &info->siphash_key); break; @@ -93,6 +101,7 @@ static inline void bch2_str_hash_update(struct bch_str_hash_ctx *ctx, case BCH_STR_HASH_CRC64: ctx->crc64 = crc64_be(ctx->crc64, data, len); break; + case BCH_STR_HASH_SIPHASH_OLD: case BCH_STR_HASH_SIPHASH: SipHash24_Update(&ctx->siphash, data, len); break; @@ -109,6 +118,7 @@ static inline u64 bch2_str_hash_end(struct bch_str_hash_ctx *ctx, return ctx->crc32c; case BCH_STR_HASH_CRC64: return ctx->crc64 >> 1; + case BCH_STR_HASH_SIPHASH_OLD: case BCH_STR_HASH_SIPHASH: return SipHash24_End(&ctx->siphash) >> 1; default: @@ -188,6 +198,7 @@ int bch2_hash_needs_whiteout(struct btree_trans *trans, { struct btree_iter *iter; struct bkey_s_c k; + int ret; iter = bch2_trans_copy_iter(trans, start); if (IS_ERR(iter)) @@ -195,19 +206,21 @@ int bch2_hash_needs_whiteout(struct btree_trans *trans, bch2_btree_iter_next_slot(iter); - for_each_btree_key_continue(iter, BTREE_ITER_SLOTS, k) { + for_each_btree_key_continue(iter, BTREE_ITER_SLOTS, k, ret) { if (k.k->type != desc.key_type && k.k->type != KEY_TYPE_whiteout) break; if (k.k->type == desc.key_type && desc.hash_bkey(info, k) <= start->pos.offset) { - bch2_trans_iter_free_on_commit(trans, iter); - return 1; + iter->flags |= BTREE_ITER_KEEP_UNTIL_COMMIT; + ret = 1; + break; } } - return bch2_trans_iter_free(trans, iter); + bch2_trans_iter_put(trans, iter); + return ret; } static __always_inline @@ -246,11 +259,15 @@ int bch2_hash_set(struct btree_trans *trans, goto not_found; } - if (slot) - bch2_trans_iter_free(trans, slot); - bch2_trans_iter_free(trans, iter); + if (!ret) + ret = -ENOSPC; +out: + if (!IS_ERR_OR_NULL(slot)) + bch2_trans_iter_put(trans, slot); + if (!IS_ERR_OR_NULL(iter)) + bch2_trans_iter_put(trans, iter); - return ret ?: -ENOSPC; + return ret; found: found = true; not_found: @@ -260,17 +277,14 @@ not_found: } else if (found && (flags & BCH_HASH_SET_MUST_CREATE)) { ret = -EEXIST; } else { - if (!found && slot) { - bch2_trans_iter_free(trans, iter); - iter = slot; - } + if (!found && slot) + swap(iter, slot); insert->k.p = iter->pos; - bch2_trans_update(trans, BTREE_INSERT_ENTRY(iter, insert)); - 
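/*
 * Editor's sketch: why bch2_hash_needs_whiteout() above sometimes keeps a
 * whiteout instead of freeing the slot, and why bch2_hash_delete_at()
 * just below picks KEY_TYPE_whiteout over KEY_TYPE_deleted.  In a
 * linearly probed table, lookups stop at the first empty slot, so
 * deleting mid-chain must leave a tombstone.  This toy table is
 * hypothetical and deliberately conservative: it leaves a tombstone
 * whenever the probe chain continues at all, where the real code checks
 * whether any following key actually hashes to at or before the deleted
 * position.
 */
enum toy_slot { TOY_EMPTY, TOY_USED, TOY_TOMBSTONE };

static void toy_delete(enum toy_slot *table, unsigned size, unsigned i)
{
	/* lookups for the next entry may probe through slot i: */
	table[i] = table[(i + 1) % size] != TOY_EMPTY
		? TOY_TOMBSTONE
		: TOY_EMPTY;
}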
bch2_trans_iter_free_on_commit(trans, iter); + bch2_trans_update(trans, iter, insert); } - return ret; + goto out; } static __always_inline @@ -294,7 +308,7 @@ int bch2_hash_delete_at(struct btree_trans *trans, delete->k.p = iter->pos; delete->k.type = ret ? KEY_TYPE_whiteout : KEY_TYPE_deleted; - bch2_trans_update(trans, BTREE_INSERT_ENTRY(iter, delete)); + bch2_trans_update(trans, iter, delete); return 0; } diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index 3043def884ab..b36cfdf0b41c 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -949,6 +949,25 @@ int bch2_fs_mark_dirty(struct bch_fs *c) return ret; } +static void +entry_init_u64s(struct jset_entry *entry, unsigned u64s) +{ + memset(entry, 0, u64s * sizeof(u64)); + + /* + * The u64s field counts from the start of data, ignoring the shared + * fields. + */ + entry->u64s = u64s - 1; +} + +static void +entry_init_size(struct jset_entry *entry, size_t size) +{ + unsigned u64s = DIV_ROUND_UP(size, sizeof(u64)); + entry_init_u64s(entry, u64s); +} + struct jset_entry * bch2_journal_super_entries_add_common(struct bch_fs *c, struct jset_entry *entry, @@ -963,7 +982,7 @@ bch2_journal_super_entries_add_common(struct bch_fs *c, r < c->btree_roots + BTREE_ID_NR; r++) if (r->alive) { - entry->u64s = r->key.u64s; + entry_init_u64s(entry, r->key.u64s + 1); entry->btree_id = r - c->btree_roots; entry->level = r->level; entry->type = BCH_JSET_ENTRY_btree_root; @@ -988,8 +1007,7 @@ bch2_journal_super_entries_add_common(struct bch_fs *c, struct jset_entry_usage *u = container_of(entry, struct jset_entry_usage, entry); - memset(u, 0, sizeof(*u)); - u->entry.u64s = DIV_ROUND_UP(sizeof(*u), sizeof(u64)) - 1; + entry_init_size(entry, sizeof(*u)); u->entry.type = BCH_JSET_ENTRY_usage; u->entry.btree_id = FS_USAGE_INODES; u->v = cpu_to_le64(c->usage_base->nr_inodes); @@ -1001,8 +1019,7 @@ bch2_journal_super_entries_add_common(struct bch_fs *c, struct jset_entry_usage *u = container_of(entry, struct jset_entry_usage, entry); - memset(u, 0, sizeof(*u)); - u->entry.u64s = DIV_ROUND_UP(sizeof(*u), sizeof(u64)) - 1; + entry_init_size(entry, sizeof(*u)); u->entry.type = BCH_JSET_ENTRY_usage; u->entry.btree_id = FS_USAGE_KEY_VERSION; u->v = cpu_to_le64(atomic64_read(&c->key_version)); @@ -1014,8 +1031,7 @@ bch2_journal_super_entries_add_common(struct bch_fs *c, struct jset_entry_usage *u = container_of(entry, struct jset_entry_usage, entry); - memset(u, 0, sizeof(*u)); - u->entry.u64s = DIV_ROUND_UP(sizeof(*u), sizeof(u64)) - 1; + entry_init_size(entry, sizeof(*u)); u->entry.type = BCH_JSET_ENTRY_usage; u->entry.btree_id = FS_USAGE_RESERVED; u->entry.level = i; @@ -1030,9 +1046,7 @@ bch2_journal_super_entries_add_common(struct bch_fs *c, struct jset_entry_data_usage *u = container_of(entry, struct jset_entry_data_usage, entry); - memset(u, 0, sizeof(*u)); - u->entry.u64s = DIV_ROUND_UP(sizeof(*u) + e->nr_devs, - sizeof(u64)) - 1; + entry_init_size(entry, sizeof(*u) + e->nr_devs); u->entry.type = BCH_JSET_ENTRY_data_usage; u->v = cpu_to_le64(c->usage_base->replicas[i]); memcpy(&u->r, e, replicas_entry_bytes(e)); diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 4145832f4856..17bdf985559c 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -506,6 +506,7 @@ static void bch2_fs_free(struct bch_fs *c) free_percpu(c->usage[0]); kfree(c->usage_base); free_percpu(c->pcpu); + mempool_exit(&c->large_bkey_pool); mempool_exit(&c->btree_bounce_pool); bioset_exit(&c->btree_bio); mempool_exit(&c->btree_interior_update_pool); 
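/*
 * Editor's sketch: the size convention behind entry_init_u64s() and
 * entry_init_size() above.  A jset entry's u64s field counts only the
 * u64s that follow the shared header u64, so the stored value is the
 * rounded-up total minus one.  toy_entry_u64s() is hypothetical; only
 * the arithmetic matches the helpers.
 */
static unsigned toy_entry_u64s(size_t total_bytes)
{
	unsigned total_u64s = (total_bytes + sizeof(u64) - 1) / sizeof(u64);

	return total_u64s - 1;	/* the header u64 isn't counted */
}

/*
 * e.g. a 16-byte entry (one header u64 plus one u64 of payload) occupies
 * two u64s in the journal but stores u64s == 1.
 */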
diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c
index 4145832f4856..17bdf985559c 100644
--- a/fs/bcachefs/super.c
+++ b/fs/bcachefs/super.c
@@ -506,6 +506,7 @@ static void bch2_fs_free(struct bch_fs *c)
 	free_percpu(c->usage[0]);
 	kfree(c->usage_base);
 	free_percpu(c->pcpu);
+	mempool_exit(&c->large_bkey_pool);
 	mempool_exit(&c->btree_bounce_pool);
 	bioset_exit(&c->btree_bio);
 	mempool_exit(&c->btree_interior_update_pool);
@@ -758,6 +759,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
 	    !(c->pcpu = alloc_percpu(struct bch_fs_pcpu)) ||
 	    mempool_init_kvpmalloc_pool(&c->btree_bounce_pool, 1,
 					btree_bytes(c)) ||
+	    mempool_init_kmalloc_pool(&c->large_bkey_pool, 1, 2048) ||
 	    bch2_io_clock_init(&c->io_clock[READ]) ||
 	    bch2_io_clock_init(&c->io_clock[WRITE]) ||
 	    bch2_fs_journal_init(&c->journal) ||
@@ -1090,7 +1092,6 @@ static struct bch_dev *__bch2_dev_alloc(struct bch_fs *c,
 
 	writepoint_init(&ca->copygc_write_point, BCH_DATA_USER);
 
-	spin_lock_init(&ca->freelist_lock);
 	bch2_dev_copygc_init(ca);
 
 	INIT_WORK(&ca->io_error_work, bch2_io_error_work);
diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c
index 27646c435e30..e7699afd99fc 100644
--- a/fs/bcachefs/sysfs.c
+++ b/fs/bcachefs/sysfs.c
@@ -775,7 +775,7 @@ static ssize_t show_reserve_stats(struct bch_dev *ca, char *buf)
 	struct printbuf out = _PBUF(buf, PAGE_SIZE);
 	enum alloc_reserve i;
 
-	spin_lock(&ca->freelist_lock);
+	spin_lock(&ca->fs->freelist_lock);
 
 	pr_buf(&out, "free_inc:\t%zu\t%zu\n",
 	       fifo_used(&ca->free_inc),
@@ -786,7 +786,7 @@ static ssize_t show_reserve_stats(struct bch_dev *ca, char *buf)
 		       fifo_used(&ca->free[i]),
 		       ca->free[i].size);
 
-	spin_unlock(&ca->freelist_lock);
+	spin_unlock(&ca->fs->freelist_lock);
 
 	return out.pos - buf;
 }
diff --git a/fs/bcachefs/tests.c b/fs/bcachefs/tests.c
index fe0b987902fb..724f41e6590c 100644
--- a/fs/bcachefs/tests.c
+++ b/fs/bcachefs/tests.c
@@ -43,7 +43,7 @@ static void test_delete(struct bch_fs *c, u64 nr)
 	ret = bch2_btree_iter_traverse(iter);
 	BUG_ON(ret);
 
-	bch2_trans_update(&trans, BTREE_INSERT_ENTRY(iter, &k.k_i));
+	bch2_trans_update(&trans, iter, &k.k_i);
 	ret = bch2_trans_commit(&trans, NULL, NULL, 0);
 	BUG_ON(ret);
 
@@ -75,7 +75,7 @@ static void test_delete_written(struct bch_fs *c, u64 nr)
 	ret = bch2_btree_iter_traverse(iter);
 	BUG_ON(ret);
 
-	bch2_trans_update(&trans, BTREE_INSERT_ENTRY(iter, &k.k_i));
+	bch2_trans_update(&trans, iter, &k.k_i);
 	ret = bch2_trans_commit(&trans, NULL, NULL, 0);
 	BUG_ON(ret);
 
@@ -465,7 +465,7 @@ static void rand_mixed(struct bch_fs *c, u64 nr)
 			bkey_cookie_init(&k.k_i);
 			k.k.p = iter->pos;
 
-			bch2_trans_update(&trans, BTREE_INSERT_ENTRY(iter, &k.k_i));
+			bch2_trans_update(&trans, iter, &k.k_i);
 			ret = bch2_trans_commit(&trans, NULL, NULL, 0);
 			BUG_ON(ret);
 		}
@@ -509,7 +509,7 @@ static void seq_insert(struct bch_fs *c, u64 nr)
 			   BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) {
 		insert.k.p = iter->pos;
 
-		bch2_trans_update(&trans, BTREE_INSERT_ENTRY(iter, &insert.k_i));
+		bch2_trans_update(&trans, iter, &insert.k_i);
 		ret = bch2_trans_commit(&trans, NULL, NULL, 0);
 		BUG_ON(ret);
 
@@ -548,7 +548,7 @@ static void seq_overwrite(struct bch_fs *c, u64 nr)
 
 		bkey_reassemble(&u.k_i, k);
 
-		bch2_trans_update(&trans, BTREE_INSERT_ENTRY(iter, &u.k_i));
+		bch2_trans_update(&trans, iter, &u.k_i);
 		ret = bch2_trans_commit(&trans, NULL, NULL, 0);
 		BUG_ON(ret);
 	}
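Note: the tests.c hunks above are all the same mechanical conversion; bch2_trans_update() now takes the iterator and the key as separate arguments, replacing the BTREE_INSERT_ENTRY() wrapper. Restated as a usage sketch, with the variable names used in those tests:

                bkey_cookie_init(&k.k_i);
                k.k.p = iter->pos;

                /* was: bch2_trans_update(&trans, BTREE_INSERT_ENTRY(iter, &k.k_i)); */
                bch2_trans_update(&trans, iter, &k.k_i);
                ret = bch2_trans_commit(&trans, NULL, NULL, 0);
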
diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c
index 2cc433ec0e3a..e69d03d1109f 100644
--- a/fs/bcachefs/util.c
+++ b/fs/bcachefs/util.c
@@ -550,7 +550,7 @@ size_t bch2_rand_range(size_t max)
 	return rand;
 }
 
-void memcpy_to_bio(struct bio *dst, struct bvec_iter dst_iter, void *src)
+void memcpy_to_bio(struct bio *dst, struct bvec_iter dst_iter, const void *src)
 {
 	struct bio_vec bv;
 	struct bvec_iter iter;
diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h
index 4b33a527494d..0128daba5970 100644
--- a/fs/bcachefs/util.h
+++ b/fs/bcachefs/util.h
@@ -547,9 +557,19 @@ do {									\
 
 size_t bch2_rand_range(size_t);
 
-void memcpy_to_bio(struct bio *, struct bvec_iter, void *);
+void memcpy_to_bio(struct bio *, struct bvec_iter, const void *);
 void memcpy_from_bio(void *, struct bio *, struct bvec_iter);
 
+static inline void memcpy_u64s_small(void *dst, const void *src,
+				     unsigned u64s)
+{
+	u64 *d = dst;
+	const u64 *s = src;
+
+	while (u64s--)
+		*d++ = *s++;
+}
+
 static inline void __memcpy_u64s(void *dst, const void *src,
 				 unsigned u64s)
 {
@@ -591,6 +601,24 @@ static inline void memmove_u64s_down(void *dst, const void *src,
 	__memmove_u64s_down(dst, src, u64s);
 }
 
+static inline void __memmove_u64s_up_small(void *_dst, const void *_src,
+					   unsigned u64s)
+{
+	u64 *dst = (u64 *) _dst + u64s;
+	u64 *src = (u64 *) _src + u64s;
+
+	while (u64s--)
+		*--dst = *--src;
+}
+
+static inline void memmove_u64s_up_small(void *dst, const void *src,
+					 unsigned u64s)
+{
+	EBUG_ON(dst < src);
+
+	__memmove_u64s_up_small(dst, src, u64s);
+}
+
 static inline void __memmove_u64s_up(void *_dst, const void *_src,
 				     unsigned u64s)
 {
@@ -628,35 +656,14 @@ static inline void memmove_u64s(void *dst, const void *src,
 		__memmove_u64s_up(dst, src, u64s);
 }
 
-static inline struct bio_vec next_contig_bvec(struct bio *bio,
-					      struct bvec_iter *iter)
+/* Set the last few bytes up to a u64 boundary given an offset into a buffer. */
+static inline void memset_u64s_tail(void *s, int c, unsigned bytes)
 {
-	struct bio_vec bv = bio_iter_iovec(bio, *iter);
-
-	bio_advance_iter(bio, iter, bv.bv_len);
-#ifndef CONFIG_HIGHMEM
-	while (iter->bi_size) {
-		struct bio_vec next = bio_iter_iovec(bio, *iter);
-
-		if (page_address(bv.bv_page) + bv.bv_offset + bv.bv_len !=
-		    page_address(next.bv_page) + next.bv_offset)
-			break;
+	unsigned rem = round_up(bytes, sizeof(u64)) - bytes;
 
-		bv.bv_len += next.bv_len;
-		bio_advance_iter(bio, iter, next.bv_len);
-	}
-#endif
-	return bv;
+	memset(s + bytes, c, rem);
 }
 
-#define __bio_for_each_contig_segment(bv, bio, iter, start)		\
-	for (iter = (start);						\
-	     (iter).bi_size &&						\
-	     ((bv = next_contig_bvec((bio), &(iter))), 1);)
-
-#define bio_for_each_contig_segment(bv, bio, iter)			\
-	__bio_for_each_contig_segment(bv, bio, iter, (bio)->bi_iter)
-
 void sort_cmp_size(void *base, size_t num, size_t size,
 		   int (*cmp_func)(const void *, const void *, size_t),
 		   void (*swap_func)(void *, void *, size_t));
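Note: memset_u64s_tail() exists because these buffers are sized and copied in whole u64s; after "bytes" bytes of payload have been written, the rest of the final u64 would otherwise carry uninitialized padding. A userspace restatement with a worked example; the generic round_up() macro below stands in for the kernel's power-of-two version, and the program is an illustration only:

        #include <stdint.h>
        #include <stdio.h>
        #include <string.h>

        /* Generic stand-in for the kernel's round_up(). */
        #define round_up(x, y)  ((((x) + (y) - 1) / (y)) * (y))

        static inline void memset_u64s_tail(void *s, int c, unsigned bytes)
        {
                unsigned rem = round_up(bytes, sizeof(uint64_t)) - bytes;

                memset((char *) s + bytes, c, rem);
        }

        int main(void)
        {
                uint64_t buf[2];

                memset(buf, 0xff, sizeof(buf));

                /* 11 payload bytes written; zero the 5 pad bytes up to 16 */
                memset_u64s_tail(buf, 0, 11);

                /* on a little-endian machine this prints 0000000000ffffff */
                printf("%016llx\n", (unsigned long long) buf[1]);
                return 0;
        }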