author      Kent Overstreet <kent.overstreet@gmail.com>    2020-01-20 15:34:50 -0500
committer   Kent Overstreet <kent.overstreet@gmail.com>    2020-05-06 17:14:18 -0400
commit      6f4bfbc65f9100a98dba1734dbff26129a4413a8
tree        2ff581f9abb56c912e7007577bb5813f3e5de1c4
parent      181bdfee8fea1f62a6544b6913d2291ecca5fcda
Merge with fe198a9f39 bcachefs: Improve tracepoints slightly in commit path
Signed-off-by: Kent Overstreet <kent.overstreet@gmail.com>
67 files changed, 1875 insertions, 2079 deletions
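Two renames run through most of the hunks below: the BCH_BUCKET_MARK_* flags become BTREE_TRIGGER_*, and bch2_trans_update() gains a per-update flags argument that absorbs the old per-commit BTREE_INSERT_NOMARK and BTREE_INSERT_BUCKET_INVALIDATE flags (as BTREE_TRIGGER_NORUN and BTREE_TRIGGER_BUCKET_INVALIDATE), while BTREE_INSERT_ATOMIC disappears entirely. A before/after sketch condensed from the alloc_background.c hunks (context trimmed, not a standalone program):

/* before: trigger suppression was a commit-wide flag */
bch2_trans_update(trans, iter, &a->k_i);
ret = bch2_trans_commit(trans, NULL, NULL,
			BTREE_INSERT_ATOMIC|
			BTREE_INSERT_NOFAIL|
			BTREE_INSERT_NOMARK|
			flags);

/* after: triggers are controlled per update; commits are always atomic */
bch2_trans_update(trans, iter, &a->k_i,
		  BTREE_TRIGGER_NORUN);
ret = bch2_trans_commit(trans, NULL, NULL,
			BTREE_INSERT_NOFAIL|flags);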
diff --git a/fs/bcachefs/acl.c b/fs/bcachefs/acl.c index dcd0dfe87b51..76c98ddbf628 100644 --- a/fs/bcachefs/acl.c +++ b/fs/bcachefs/acl.c @@ -323,7 +323,6 @@ retry: ret = bch2_inode_write(&trans, inode_iter, &inode_u) ?: bch2_trans_commit(&trans, NULL, &inode->ei_journal_seq, - BTREE_INSERT_ATOMIC| BTREE_INSERT_NOUNLOCK); btree_err: if (ret == -EINTR) @@ -378,7 +377,7 @@ int bch2_acl_chmod(struct btree_trans *trans, } new->k.p = iter->pos; - bch2_trans_update(trans, iter, &new->k_i); + bch2_trans_update(trans, iter, &new->k_i, 0); *new_acl = acl; acl = NULL; err: diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index e252a039dc2b..c57df50168e0 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -222,8 +222,8 @@ int bch2_alloc_read(struct bch_fs *c, struct journal_keys *journal_keys) for_each_btree_key(&trans, iter, BTREE_ID_ALLOC, POS_MIN, 0, k, ret) bch2_mark_key(c, k, 0, 0, NULL, 0, - BCH_BUCKET_MARK_ALLOC_READ| - BCH_BUCKET_MARK_NOATOMIC); + BTREE_TRIGGER_ALLOC_READ| + BTREE_TRIGGER_NOATOMIC); ret = bch2_trans_exit(&trans) ?: ret; if (ret) { @@ -235,8 +235,8 @@ int bch2_alloc_read(struct bch_fs *c, struct journal_keys *journal_keys) if (j->btree_id == BTREE_ID_ALLOC) bch2_mark_key(c, bkey_i_to_s_c(j->k), 0, 0, NULL, 0, - BCH_BUCKET_MARK_ALLOC_READ| - BCH_BUCKET_MARK_NOATOMIC); + BTREE_TRIGGER_ALLOC_READ| + BTREE_TRIGGER_NOATOMIC); percpu_down_write(&c->mark_lock); bch2_dev_usage_from_buckets(c); @@ -314,12 +314,10 @@ retry: a->k.p = iter->pos; bch2_alloc_pack(a, new_u); - bch2_trans_update(trans, iter, &a->k_i); + bch2_trans_update(trans, iter, &a->k_i, + BTREE_TRIGGER_NORUN); ret = bch2_trans_commit(trans, NULL, NULL, - BTREE_INSERT_ATOMIC| - BTREE_INSERT_NOFAIL| - BTREE_INSERT_NOMARK| - flags); + BTREE_INSERT_NOFAIL|flags); err: if (ret == -EINTR) goto retry; @@ -384,8 +382,7 @@ int bch2_alloc_replay_key(struct bch_fs *c, struct bkey_i *k) ret = bch2_alloc_write_key(&trans, iter, BTREE_INSERT_NOFAIL| BTREE_INSERT_LAZY_RW| - BTREE_INSERT_JOURNAL_REPLAY| - BTREE_INSERT_NOMARK); + BTREE_INSERT_JOURNAL_REPLAY); bch2_trans_exit(&trans); return ret < 0 ? ret : 0; } @@ -902,7 +899,8 @@ retry: a->k.p = iter->pos; bch2_alloc_pack(a, u); - bch2_trans_update(trans, iter, &a->k_i); + bch2_trans_update(trans, iter, &a->k_i, + BTREE_TRIGGER_BUCKET_INVALIDATE); /* * XXX: @@ -913,13 +911,11 @@ retry: */ ret = bch2_trans_commit(trans, NULL, invalidating_cached_data ? 
journal_seq : NULL, - BTREE_INSERT_ATOMIC| BTREE_INSERT_NOUNLOCK| BTREE_INSERT_NOCHECK_RW| BTREE_INSERT_NOFAIL| BTREE_INSERT_USE_RESERVE| BTREE_INSERT_USE_ALLOC_RESERVE| - BTREE_INSERT_BUCKET_INVALIDATE| flags); if (ret == -EINTR) goto retry; diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 7175e4b9e070..6ea26ae5c3b8 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -714,12 +714,12 @@ struct bch_fs { struct rhashtable promote_table; mempool_t compression_bounce[2]; - mempool_t compress_workspace[BCH_COMPRESSION_NR]; + mempool_t compress_workspace[BCH_COMPRESSION_TYPE_NR]; mempool_t decompress_workspace; ZSTD_parameters zstd_params; struct crypto_shash *sha256; - struct crypto_skcipher *chacha20; + struct crypto_sync_skcipher *chacha20; struct crypto_shash *poly1305; atomic64_t key_version; diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index 3d85012a15fd..f6141fde830b 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -432,47 +432,6 @@ struct bch_csum { __le64 hi; } __attribute__((packed, aligned(8))); -enum bch_csum_type { - BCH_CSUM_NONE = 0, - BCH_CSUM_CRC32C_NONZERO = 1, - BCH_CSUM_CRC64_NONZERO = 2, - BCH_CSUM_CHACHA20_POLY1305_80 = 3, - BCH_CSUM_CHACHA20_POLY1305_128 = 4, - BCH_CSUM_CRC32C = 5, - BCH_CSUM_CRC64 = 6, - BCH_CSUM_NR = 7, -}; - -static const unsigned bch_crc_bytes[] = { - [BCH_CSUM_NONE] = 0, - [BCH_CSUM_CRC32C_NONZERO] = 4, - [BCH_CSUM_CRC32C] = 4, - [BCH_CSUM_CRC64_NONZERO] = 8, - [BCH_CSUM_CRC64] = 8, - [BCH_CSUM_CHACHA20_POLY1305_80] = 10, - [BCH_CSUM_CHACHA20_POLY1305_128] = 16, -}; - -static inline _Bool bch2_csum_type_is_encryption(enum bch_csum_type type) -{ - switch (type) { - case BCH_CSUM_CHACHA20_POLY1305_80: - case BCH_CSUM_CHACHA20_POLY1305_128: - return true; - default: - return false; - } -} - -enum bch_compression_type { - BCH_COMPRESSION_NONE = 0, - BCH_COMPRESSION_LZ4_OLD = 1, - BCH_COMPRESSION_GZIP = 2, - BCH_COMPRESSION_LZ4 = 3, - BCH_COMPRESSION_ZSTD = 4, - BCH_COMPRESSION_NR = 5, -}; - #define BCH_EXTENT_ENTRY_TYPES() \ x(ptr, 0) \ x(crc32, 1) \ @@ -1080,6 +1039,9 @@ struct bch_replicas_entry { __u8 devs[0]; } __attribute__((packed)); +#define replicas_entry_bytes(_i) \ + (offsetof(typeof(*(_i)), devs) + (_i)->nr_devs) + struct bch_sb_field_replicas { struct bch_sb_field field; struct bch_replicas_entry entries[0]; @@ -1313,17 +1275,31 @@ LE64_BITMASK(BCH_SB_GC_RESERVE_BYTES, struct bch_sb, flags[2], 4, 64); LE64_BITMASK(BCH_SB_ERASURE_CODE, struct bch_sb, flags[3], 0, 16); -/* Features: */ -enum bch_sb_features { - BCH_FEATURE_LZ4 = 0, - BCH_FEATURE_GZIP = 1, - BCH_FEATURE_ZSTD = 2, - BCH_FEATURE_ATOMIC_NLINK = 3, /* should have gone under compat */ - BCH_FEATURE_EC = 4, - BCH_FEATURE_JOURNAL_SEQ_BLACKLIST_V3 = 5, - BCH_FEATURE_REFLINK = 6, - BCH_FEATURE_NEW_SIPHASH = 7, - BCH_FEATURE_INLINE_DATA = 8, +/* + * Features: + * + * journal_seq_blacklist_v3: gates BCH_SB_FIELD_journal_seq_blacklist + * reflink: gates KEY_TYPE_reflink + * inline_data: gates KEY_TYPE_inline_data + * new_siphash: gates BCH_STR_HASH_SIPHASH + * new_extent_overwrite: gates BTREE_NODE_NEW_EXTENT_OVERWRITE + */ +#define BCH_SB_FEATURES() \ + x(lz4, 0) \ + x(gzip, 1) \ + x(zstd, 2) \ + x(atomic_nlink, 3) \ + x(ec, 4) \ + x(journal_seq_blacklist_v3, 5) \ + x(reflink, 6) \ + x(new_siphash, 7) \ + x(inline_data, 8) \ + x(new_extent_overwrite, 9) + +enum bch_sb_feature { +#define x(f, n) BCH_FEATURE_##f, + BCH_SB_FEATURES() +#undef x BCH_FEATURE_NR, }; @@ -1343,13 +1319,6 @@ enum 
bch_error_actions { BCH_NR_ERROR_ACTIONS = 3, }; -enum bch_csum_opts { - BCH_CSUM_OPT_NONE = 0, - BCH_CSUM_OPT_CRC32C = 1, - BCH_CSUM_OPT_CRC64 = 2, - BCH_CSUM_OPT_NR = 3, -}; - enum bch_str_hash_type { BCH_STR_HASH_CRC32C = 0, BCH_STR_HASH_CRC64 = 1, @@ -1365,16 +1334,69 @@ enum bch_str_hash_opts { BCH_STR_HASH_OPT_NR = 3, }; +enum bch_csum_type { + BCH_CSUM_NONE = 0, + BCH_CSUM_CRC32C_NONZERO = 1, + BCH_CSUM_CRC64_NONZERO = 2, + BCH_CSUM_CHACHA20_POLY1305_80 = 3, + BCH_CSUM_CHACHA20_POLY1305_128 = 4, + BCH_CSUM_CRC32C = 5, + BCH_CSUM_CRC64 = 6, + BCH_CSUM_NR = 7, +}; + +static const unsigned bch_crc_bytes[] = { + [BCH_CSUM_NONE] = 0, + [BCH_CSUM_CRC32C_NONZERO] = 4, + [BCH_CSUM_CRC32C] = 4, + [BCH_CSUM_CRC64_NONZERO] = 8, + [BCH_CSUM_CRC64] = 8, + [BCH_CSUM_CHACHA20_POLY1305_80] = 10, + [BCH_CSUM_CHACHA20_POLY1305_128] = 16, +}; + +static inline _Bool bch2_csum_type_is_encryption(enum bch_csum_type type) +{ + switch (type) { + case BCH_CSUM_CHACHA20_POLY1305_80: + case BCH_CSUM_CHACHA20_POLY1305_128: + return true; + default: + return false; + } +} + +enum bch_csum_opts { + BCH_CSUM_OPT_NONE = 0, + BCH_CSUM_OPT_CRC32C = 1, + BCH_CSUM_OPT_CRC64 = 2, + BCH_CSUM_OPT_NR = 3, +}; + #define BCH_COMPRESSION_TYPES() \ - x(NONE) \ - x(LZ4) \ - x(GZIP) \ - x(ZSTD) + x(none, 0) \ + x(lz4_old, 1) \ + x(gzip, 2) \ + x(lz4, 3) \ + x(zstd, 4) -enum bch_compression_opts { -#define x(t) BCH_COMPRESSION_OPT_##t, +enum bch_compression_type { +#define x(t, n) BCH_COMPRESSION_TYPE_##t, BCH_COMPRESSION_TYPES() #undef x + BCH_COMPRESSION_TYPE_NR +}; + +#define BCH_COMPRESSION_OPTS() \ + x(none, 0) \ + x(lz4, 1) \ + x(gzip, 2) \ + x(zstd, 3) + +enum bch_compression_opts { +#define x(t, n) BCH_COMPRESSION_OPT_##t, + BCH_COMPRESSION_OPTS() +#undef x BCH_COMPRESSION_OPT_NR }; @@ -1593,7 +1615,9 @@ struct btree_node { LE64_BITMASK(BTREE_NODE_ID, struct btree_node, flags, 0, 4); LE64_BITMASK(BTREE_NODE_LEVEL, struct btree_node, flags, 4, 8); -/* 8-32 unused */ +LE64_BITMASK(BTREE_NODE_NEW_EXTENT_OVERWRITE, + struct btree_node, flags, 8, 9); +/* 9-32 unused */ LE64_BITMASK(BTREE_NODE_SEQ, struct btree_node, flags, 32, 64); struct btree_node_entry { diff --git a/fs/bcachefs/bcachefs_ioctl.h b/fs/bcachefs/bcachefs_ioctl.h index d668ede5491a..ba8c75706bf1 100644 --- a/fs/bcachefs/bcachefs_ioctl.h +++ b/fs/bcachefs/bcachefs_ioctl.h @@ -68,7 +68,8 @@ struct bch_ioctl_incremental { #define BCH_IOCTL_DISK_OFFLINE _IOW(0xbc, 7, struct bch_ioctl_disk) #define BCH_IOCTL_DISK_SET_STATE _IOW(0xbc, 8, struct bch_ioctl_disk_set_state) #define BCH_IOCTL_DATA _IOW(0xbc, 10, struct bch_ioctl_data) -#define BCH_IOCTL_USAGE _IOWR(0xbc, 11, struct bch_ioctl_usage) +#define BCH_IOCTL_FS_USAGE _IOWR(0xbc, 11, struct bch_ioctl_fs_usage) +#define BCH_IOCTL_DEV_USAGE _IOWR(0xbc, 11, struct bch_ioctl_dev_usage) #define BCH_IOCTL_READ_SUPER _IOW(0xbc, 12, struct bch_ioctl_read_super) #define BCH_IOCTL_DISK_GET_IDX _IOW(0xbc, 13, struct bch_ioctl_disk_get_idx) #define BCH_IOCTL_DISK_RESIZE _IOW(0xbc, 14, struct bch_ioctl_disk_resize) @@ -224,46 +225,59 @@ struct bch_ioctl_data_event { }; } __attribute__((packed, aligned(8))); -struct bch_ioctl_dev_usage { - __u8 state; - __u8 alive; - __u8 pad[6]; - __u32 dev; +struct bch_replicas_usage { + __u64 sectors; + struct bch_replicas_entry r; +} __attribute__((packed)); - __u32 bucket_size; - __u64 nr_buckets; - - __u64 buckets[BCH_DATA_NR]; - __u64 sectors[BCH_DATA_NR]; -}; +static inline struct bch_replicas_usage * +replicas_usage_next(struct bch_replicas_usage *u) +{ + return (void *) u + 
replicas_entry_bytes(&u->r) + 8; +} +/* + * BCH_IOCTL_FS_USAGE: query filesystem disk space usage + * + * Returns disk space usage broken out by data type, number of replicas, and + * by component device + * + * @replica_entries_bytes - size, in bytes, allocated for replica usage entries + * + * On success, @replica_entries_bytes will be changed to indicate the number of + * bytes actually used. + * + * Returns -ERANGE if @replica_entries_bytes was too small + */ struct bch_ioctl_fs_usage { __u64 capacity; __u64 used; __u64 online_reserved; __u64 persistent_reserved[BCH_REPLICAS_MAX]; - __u64 sectors[BCH_DATA_NR][BCH_REPLICAS_MAX]; + + __u32 replica_entries_bytes; + __u32 pad; + + struct bch_replicas_usage replicas[0]; }; /* - * BCH_IOCTL_USAGE: query filesystem disk space usage - * - * Returns disk space usage broken out by data type, number of replicas, and - * by component device + * BCH_IOCTL_DEV_USAGE: query device disk space usage * - * @nr_devices - number of devices userspace allocated space for in @devs - * - * On success, @fs and @devs will be filled out appropriately and devs[i].alive - * will indicate if a device was present in that slot - * - * Returns -ERANGE if @nr_devices was too small + * Returns disk space usage broken out by data type - both by buckets and + * sectors. */ -struct bch_ioctl_usage { - __u16 nr_devices; - __u16 pad[3]; +struct bch_ioctl_dev_usage { + __u64 dev; + __u32 flags; + __u8 state; + __u8 pad[7]; + + __u32 bucket_size; + __u64 nr_buckets; - struct bch_ioctl_fs_usage fs; - struct bch_ioctl_dev_usage devs[0]; + __u64 buckets[BCH_DATA_NR]; + __u64 sectors[BCH_DATA_NR]; }; /* diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c index ed448fad83c5..320e17d108d2 100644 --- a/fs/bcachefs/bkey_methods.c +++ b/fs/bcachefs/bkey_methods.c @@ -156,7 +156,7 @@ void bch2_bkey_debugcheck(struct bch_fs *c, struct btree *b, struct bkey_s_c k) char buf[160]; bch2_bkey_val_to_text(&PBUF(buf), c, k); - bch2_fs_bug(c, "invalid bkey %s: %s", buf, invalid); + bch2_fs_inconsistent(c, "invalid bkey %s: %s", buf, invalid); return; } diff --git a/fs/bcachefs/bkey_sort.c b/fs/bcachefs/bkey_sort.c index 2e205db5433d..7cbb57042af1 100644 --- a/fs/bcachefs/bkey_sort.c +++ b/fs/bcachefs/bkey_sort.c @@ -5,90 +5,15 @@ #include "bset.h" #include "extents.h" -/* too many iterators, need to clean this up */ - -/* btree_node_iter_large: */ - -#define btree_node_iter_cmp_heap(h, _l, _r) btree_node_iter_cmp(b, _l, _r) +typedef int (*sort_cmp_fn)(struct btree *, + struct bkey_packed *, + struct bkey_packed *); -static inline bool -bch2_btree_node_iter_large_end(struct btree_node_iter_large *iter) +static inline bool sort_iter_end(struct sort_iter *iter) { return !iter->used; } -static inline struct bkey_packed * -bch2_btree_node_iter_large_peek_all(struct btree_node_iter_large *iter, - struct btree *b) -{ - return bch2_btree_node_iter_large_end(iter) - ? 
NULL - : __btree_node_offset_to_key(b, iter->data->k); -} - -static void -bch2_btree_node_iter_large_advance(struct btree_node_iter_large *iter, - struct btree *b) -{ - iter->data->k += __btree_node_offset_to_key(b, iter->data->k)->u64s; - - EBUG_ON(!iter->used); - EBUG_ON(iter->data->k > iter->data->end); - - if (iter->data->k == iter->data->end) - heap_del(iter, 0, btree_node_iter_cmp_heap, NULL); - else - heap_sift_down(iter, 0, btree_node_iter_cmp_heap, NULL); -} - -static inline struct bkey_packed * -bch2_btree_node_iter_large_next_all(struct btree_node_iter_large *iter, - struct btree *b) -{ - struct bkey_packed *ret = bch2_btree_node_iter_large_peek_all(iter, b); - - if (ret) - bch2_btree_node_iter_large_advance(iter, b); - - return ret; -} - -void bch2_btree_node_iter_large_push(struct btree_node_iter_large *iter, - struct btree *b, - const struct bkey_packed *k, - const struct bkey_packed *end) -{ - if (k != end) { - struct btree_node_iter_set n = - ((struct btree_node_iter_set) { - __btree_node_key_to_offset(b, k), - __btree_node_key_to_offset(b, end) - }); - - __heap_add(iter, n, btree_node_iter_cmp_heap, NULL); - } -} - -static void sort_key_next(struct btree_node_iter_large *iter, - struct btree *b, - struct btree_node_iter_set *i) -{ - i->k += __btree_node_offset_to_key(b, i->k)->u64s; - - while (i->k != i->end && - !__btree_node_offset_to_key(b, i->k)->u64s) - i->k++; - - if (i->k == i->end) - *i = iter->data[--iter->used]; -} - -/* regular sort_iters */ - -typedef int (*sort_cmp_fn)(struct btree *, - struct bkey_packed *, - struct bkey_packed *); - static inline void __sort_iter_sift(struct sort_iter *iter, unsigned from, sort_cmp_fn cmp) @@ -118,19 +43,29 @@ static inline void sort_iter_sort(struct sort_iter *iter, sort_cmp_fn cmp) static inline struct bkey_packed *sort_iter_peek(struct sort_iter *iter) { - return iter->used ? iter->data->k : NULL; + return !sort_iter_end(iter) ? iter->data->k : NULL; } -static inline void sort_iter_advance(struct sort_iter *iter, sort_cmp_fn cmp) +static inline void __sort_iter_advance(struct sort_iter *iter, + unsigned idx, sort_cmp_fn cmp) { - iter->data->k = bkey_next_skip_noops(iter->data->k, iter->data->end); + struct sort_iter_set *i = iter->data + idx; + + BUG_ON(idx >= iter->used); + + i->k = bkey_next_skip_noops(i->k, i->end); - BUG_ON(iter->data->k > iter->data->end); + BUG_ON(i->k > i->end); - if (iter->data->k == iter->data->end) - array_remove_item(iter->data, iter->used, 0); + if (i->k == i->end) + array_remove_item(iter->data, iter->used, idx); else - sort_iter_sift(iter, cmp); + __sort_iter_sift(iter, idx, cmp); +} + +static inline void sort_iter_advance(struct sort_iter *iter, sort_cmp_fn cmp) +{ + __sort_iter_advance(iter, 0, cmp); } static inline struct bkey_packed *sort_iter_next(struct sort_iter *iter, @@ -145,107 +80,56 @@ static inline struct bkey_packed *sort_iter_next(struct sort_iter *iter, } /* - * Returns true if l > r - unless l == r, in which case returns true if l is - * older than r. - * - * Necessary for btree_sort_fixup() - if there are multiple keys that compare - * equal in different sets, we have to process them newest to oldest. 
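/*
 * A standalone toy of the ordering rule described above: keys that
 * compare equal are tie-broken by address, so the copy appended later
 * (the newer one) sorts last and survives the dedup. Only cmp_int()
 * and the comparator shape mirror the patch; toy_key and the qsort()
 * driver are stand-in assumptions for packed bkeys in appended bsets.
 */
#include <stdio.h>
#include <stdlib.h>

#define cmp_int(l, r)	(((l) > (r)) - ((l) < (r)))

struct toy_key { int pos; const char *val; };

static int toy_cmp(const void *_l, const void *_r)
{
	const struct toy_key *l = *(struct toy_key * const *) _l;
	const struct toy_key *r = *(struct toy_key * const *) _r;

	return cmp_int(l->pos, r->pos) ?:
		cmp_int((unsigned long) l, (unsigned long) r);
}

int main(void)
{
	/* bset[2] was "appended" after bset[0], so it is the newer 1: */
	struct toy_key bset[] = {
		{ 1, "old" }, { 2, "only" }, { 1, "new" }, { 3, "only" },
	};
	struct toy_key *sorted[] = { &bset[0], &bset[1], &bset[2], &bset[3] };
	unsigned i;

	qsort(sorted, 4, sizeof(sorted[0]), toy_cmp);

	/* should_drop_next_key(): drop a key when the next compares equal */
	for (i = 0; i < 4; i++)
		if (i + 1 == 4 || sorted[i]->pos != sorted[i + 1]->pos)
			printf("%d -> %s\n", sorted[i]->pos, sorted[i]->val);
	return 0;		/* prints: 1 -> new, 2 -> only, 3 -> only */
}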
+ * If keys compare equal, compare by pointer order: */ -#define key_sort_cmp(h, l, r) \ -({ \ - bkey_cmp_packed(b, \ - __btree_node_offset_to_key(b, (l).k), \ - __btree_node_offset_to_key(b, (r).k)) \ - \ - ?: (l).k - (r).k; \ -}) - -static inline bool should_drop_next_key(struct btree_node_iter_large *iter, - struct btree *b) +static inline int key_sort_fix_overlapping_cmp(struct btree *b, + struct bkey_packed *l, + struct bkey_packed *r) { - struct btree_node_iter_set *l = iter->data, *r = iter->data + 1; - struct bkey_packed *k = __btree_node_offset_to_key(b, l->k); - - if (bkey_whiteout(k)) - return true; - - if (iter->used < 2) - return false; - - if (iter->used > 2 && - key_sort_cmp(iter, r[0], r[1]) >= 0) - r++; + return bkey_cmp_packed(b, l, r) ?: + cmp_int((unsigned long) l, (unsigned long) r); +} +static inline bool should_drop_next_key(struct sort_iter *iter) +{ /* * key_sort_cmp() ensures that when keys compare equal the older key - * comes first; so if l->k compares equal to r->k then l->k is older and - * should be dropped. + * comes first; so if l->k compares equal to r->k then l->k is older + * and should be dropped. */ - return !bkey_cmp_packed(b, - __btree_node_offset_to_key(b, l->k), - __btree_node_offset_to_key(b, r->k)); + return iter->used >= 2 && + !bkey_cmp_packed(iter->b, + iter->data[0].k, + iter->data[1].k); } -struct btree_nr_keys bch2_key_sort_fix_overlapping(struct bset *dst, - struct btree *b, - struct btree_node_iter_large *iter) +struct btree_nr_keys +bch2_key_sort_fix_overlapping(struct bch_fs *c, struct bset *dst, + struct sort_iter *iter) { struct bkey_packed *out = dst->start; + struct bkey_packed *k; struct btree_nr_keys nr; memset(&nr, 0, sizeof(nr)); - heap_resort(iter, key_sort_cmp, NULL); - - while (!bch2_btree_node_iter_large_end(iter)) { - if (!should_drop_next_key(iter, b)) { - struct bkey_packed *k = - __btree_node_offset_to_key(b, iter->data->k); + sort_iter_sort(iter, key_sort_fix_overlapping_cmp); + while ((k = sort_iter_peek(iter))) { + if (!bkey_whiteout(k) && + !should_drop_next_key(iter)) { bkey_copy(out, k); btree_keys_account_key_add(&nr, 0, out); out = bkey_next(out); } - sort_key_next(iter, b, iter->data); - heap_sift_down(iter, 0, key_sort_cmp, NULL); + sort_iter_advance(iter, key_sort_fix_overlapping_cmp); } dst->u64s = cpu_to_le16((u64 *) out - dst->_data); return nr; } -/* - * If keys compare equal, compare by pointer order: - * - * Necessary for sort_fix_overlapping() - if there are multiple keys that - * compare equal in different sets, we have to process them newest to oldest. 
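/*
 * A toy model of struct sort_iter as reworked here: an array of
 * [k, end) ranges kept ordered by current element; peek returns
 * data[0], advance steps it and re-sifts. toy_* names are stand-ins;
 * the real code removes exhausted sets with array_remove_item()
 * (order-preserving) where this sketch swaps in the last set.
 */
#include <stdio.h>

struct toy_set  { const int *k, *end; };
struct toy_iter { unsigned used; struct toy_set data[8]; };

static void toy_add(struct toy_iter *it, const int *k, const int *end)
{
	if (k != end)
		it->data[it->used++] = (struct toy_set) { k, end };
}

/* __sort_iter_sift(): bubble data[from] right until in order */
static void toy_sift(struct toy_iter *it, unsigned from)
{
	unsigned i;

	for (i = from;
	     i + 1 < it->used && *it->data[i].k > *it->data[i + 1].k;
	     i++) {
		struct toy_set t = it->data[i];

		it->data[i]	= it->data[i + 1];
		it->data[i + 1] = t;
	}
}

/* sort_iter_sort(): sift every set, back to front */
static void toy_sort(struct toy_iter *it)
{
	unsigned i = it->used;

	while (i--)
		toy_sift(it, i);
}

static const int *toy_next(struct toy_iter *it)
{
	const int *ret;

	if (!it->used)
		return NULL;

	ret = it->data[0].k++;
	if (it->data[0].k == it->data[0].end)
		it->data[0] = it->data[--it->used];
	toy_sift(it, 0);
	return ret;
}

int main(void)
{
	static const int a[] = { 1, 4, 9 }, b[] = { 2, 3, 10 };
	struct toy_iter it = { 0 };
	const int *k;

	toy_add(&it, a, a + 3);
	toy_add(&it, b, b + 3);
	toy_sort(&it);

	while ((k = toy_next(&it)))
		printf("%d ", *k);	/* 1 2 3 4 9 10 */
	printf("\n");
	return 0;
}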
- */ -#define extent_sort_cmp(h, l, r) \ -({ \ - struct bkey _ul = bkey_unpack_key(b, \ - __btree_node_offset_to_key(b, (l).k)); \ - struct bkey _ur = bkey_unpack_key(b, \ - __btree_node_offset_to_key(b, (r).k)); \ - \ - bkey_cmp(bkey_start_pos(&_ul), \ - bkey_start_pos(&_ur)) ?: (r).k - (l).k; \ -}) - -static inline void extent_sort_sift(struct btree_node_iter_large *iter, - struct btree *b, size_t i) -{ - heap_sift_down(iter, i, extent_sort_cmp, NULL); -} - -static inline void extent_sort_next(struct btree_node_iter_large *iter, - struct btree *b, - struct btree_node_iter_set *i) -{ - sort_key_next(iter, b, i); - heap_sift_down(iter, i - iter->data, extent_sort_cmp, NULL); -} - static void extent_sort_advance_prev(struct bkey_format *f, struct btree_nr_keys *nr, struct bkey_packed *start, @@ -286,104 +170,6 @@ static void extent_sort_append(struct bch_fs *c, bkey_reassemble((void *) *prev, k.s_c); } -struct btree_nr_keys bch2_extent_sort_fix_overlapping(struct bch_fs *c, - struct bset *dst, - struct btree *b, - struct btree_node_iter_large *iter) -{ - struct bkey_format *f = &b->format; - struct btree_node_iter_set *_l = iter->data, *_r; - struct bkey_packed *prev = NULL, *lk, *rk; - struct bkey l_unpacked, r_unpacked; - struct bkey_s l, r; - struct btree_nr_keys nr; - struct bkey_on_stack split; - - memset(&nr, 0, sizeof(nr)); - bkey_on_stack_init(&split); - - heap_resort(iter, extent_sort_cmp, NULL); - - while (!bch2_btree_node_iter_large_end(iter)) { - lk = __btree_node_offset_to_key(b, _l->k); - l = __bkey_disassemble(b, lk, &l_unpacked); - - if (iter->used == 1) { - extent_sort_append(c, f, &nr, dst->start, &prev, l); - extent_sort_next(iter, b, _l); - continue; - } - - _r = iter->data + 1; - if (iter->used > 2 && - extent_sort_cmp(iter, _r[0], _r[1]) >= 0) - _r++; - - rk = __btree_node_offset_to_key(b, _r->k); - r = __bkey_disassemble(b, rk, &r_unpacked); - - /* If current key and next key don't overlap, just append */ - if (bkey_cmp(l.k->p, bkey_start_pos(r.k)) <= 0) { - extent_sort_append(c, f, &nr, dst->start, &prev, l); - extent_sort_next(iter, b, _l); - continue; - } - - /* Skip 0 size keys */ - if (!r.k->size) { - extent_sort_next(iter, b, _r); - continue; - } - - /* - * overlap: keep the newer key and trim the older key so they - * don't overlap. comparing pointers tells us which one is - * newer, since the bsets are appended one after the other. 
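/*
 * A toy model of the overlap resolution described above, on plain
 * [start, end) ranges instead of bkeys: the newer extent keeps its
 * range, and the older one is dropped, trimmed, or split around it.
 * toy_extent/toy_resolve are stand-ins, not the patch's API.
 */
#include <stdio.h>

struct toy_extent { unsigned start, end; };

/* trim `old` so it no longer overlaps `new`; returns the number of
 * surviving pieces written to out[] (0, 1 or 2) */
static unsigned toy_resolve(struct toy_extent old, struct toy_extent new,
			    struct toy_extent out[2])
{
	unsigned nr = 0;

	if (old.end <= new.start || new.end <= old.start) {
		out[nr++] = old;	/* no overlap: keep old whole */
		return nr;
	}

	if (old.start < new.start)	/* leading piece: cut back old */
		out[nr++] = (struct toy_extent) { old.start, new.start };
	if (old.end > new.end)		/* trailing piece: cut front old */
		out[nr++] = (struct toy_extent) { new.end, old.end };
	return nr;			/* 0: old entirely overwritten */
}

int main(void)
{
	struct toy_extent out[2];
	unsigned i, nr = toy_resolve((struct toy_extent) { 0, 10 },
				     (struct toy_extent) { 4,  6 }, out);

	for (i = 0; i < nr; i++)
		printf("[%u,%u) ", out[i].start, out[i].end);
	printf("\n");			/* prints: [0,4) [6,10) */
	return 0;
}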
- */ - - /* can't happen because of comparison func */ - BUG_ON(_l->k < _r->k && - !bkey_cmp(bkey_start_pos(l.k), bkey_start_pos(r.k))); - - if (_l->k > _r->k) { - /* l wins, trim r */ - if (bkey_cmp(l.k->p, r.k->p) >= 0) { - sort_key_next(iter, b, _r); - } else { - bch2_cut_front_s(l.k->p, r); - extent_save(b, rk, r.k); - } - - extent_sort_sift(iter, b, _r - iter->data); - } else if (bkey_cmp(l.k->p, r.k->p) > 0) { - - /* - * r wins, but it overlaps in the middle of l - split l: - */ - bkey_on_stack_reassemble(&split, c, l.s_c); - bch2_cut_back(bkey_start_pos(r.k), split.k); - - bch2_cut_front_s(r.k->p, l); - extent_save(b, lk, l.k); - - extent_sort_sift(iter, b, 0); - - extent_sort_append(c, f, &nr, dst->start, - &prev, bkey_i_to_s(split.k)); - } else { - bch2_cut_back_s(bkey_start_pos(r.k), l); - extent_save(b, lk, l.k); - } - } - - extent_sort_advance_prev(f, &nr, dst->start, &prev); - - dst->u64s = cpu_to_le16((u64 *) prev - dst->_data); - - bkey_on_stack_exit(&split, c); - return nr; -} - /* Sort + repack in a new format: */ struct btree_nr_keys bch2_sort_repack(struct bset *dst, struct btree *src, @@ -424,28 +210,38 @@ bch2_sort_repack_merge(struct bch_fs *c, bool filter_whiteouts) { struct bkey_packed *prev = NULL, *k_packed; - struct bkey_s k; + struct bkey_on_stack k; struct btree_nr_keys nr; - struct bkey unpacked; memset(&nr, 0, sizeof(nr)); + bkey_on_stack_init(&k); while ((k_packed = bch2_btree_node_iter_next_all(iter, src))) { if (filter_whiteouts && bkey_whiteout(k_packed)) continue; - k = __bkey_disassemble(src, k_packed, &unpacked); + /* + * NOTE: + * bch2_bkey_normalize may modify the key we pass it (dropping + * stale pointers) and we don't have a write lock on the src + * node; we have to make a copy of the entire key before calling + * normalize + */ + bkey_on_stack_realloc(&k, c, k_packed->u64s + BKEY_U64s); + bch2_bkey_unpack(src, k.k, k_packed); if (filter_whiteouts && - bch2_bkey_normalize(c, k)) + bch2_bkey_normalize(c, bkey_i_to_s(k.k))) continue; - extent_sort_append(c, out_f, &nr, vstruct_last(dst), &prev, k); + extent_sort_append(c, out_f, &nr, vstruct_last(dst), + &prev, bkey_i_to_s(k.k)); } extent_sort_advance_prev(out_f, &nr, vstruct_last(dst), &prev); dst->u64s = cpu_to_le16((u64 *) prev - dst->_data); + bkey_on_stack_exit(&k, c); return nr; } @@ -454,7 +250,7 @@ static inline int sort_keys_cmp(struct btree *b, struct bkey_packed *r) { return bkey_cmp_packed(b, l, r) ?: - (int) bkey_whiteout(r) - (int) bkey_whiteout(l) ?: + (int) bkey_deleted(r) - (int) bkey_deleted(l) ?: (int) l->needs_whiteout - (int) r->needs_whiteout; } @@ -468,23 +264,18 @@ unsigned bch2_sort_keys(struct bkey_packed *dst, sort_iter_sort(iter, sort_keys_cmp); while ((in = sort_iter_next(iter, sort_keys_cmp))) { + bool needs_whiteout = false; + if (bkey_whiteout(in) && (filter_whiteouts || !in->needs_whiteout)) continue; - if (bkey_whiteout(in) && - (next = sort_iter_peek(iter)) && - !bkey_cmp_packed(iter->b, in, next)) { + while ((next = sort_iter_peek(iter)) && + !bkey_cmp_packed(iter->b, in, next)) { BUG_ON(in->needs_whiteout && next->needs_whiteout); - /* - * XXX racy, called with read lock from write path - * - * leads to spurious BUG_ON() in bkey_unpack_key() in - * debug mode - */ - next->needs_whiteout |= in->needs_whiteout; - continue; + needs_whiteout |= in->needs_whiteout; + in = sort_iter_next(iter, sort_keys_cmp); } if (bkey_whiteout(in)) { @@ -493,12 +284,129 @@ unsigned bch2_sort_keys(struct bkey_packed *dst, } else { bkey_copy(out, in); } + out->needs_whiteout |= 
needs_whiteout; out = bkey_next(out); } return (u64 *) out - (u64 *) dst; } +/* Compat code for btree_node_old_extent_overwrite: */ + +/* + * If keys compare equal, compare by pointer order: + * + * Necessary for sort_fix_overlapping() - if there are multiple keys that + * compare equal in different sets, we have to process them newest to oldest. + */ +static inline int extent_sort_fix_overlapping_cmp(struct btree *b, + struct bkey_packed *l, + struct bkey_packed *r) +{ + struct bkey ul = bkey_unpack_key(b, l); + struct bkey ur = bkey_unpack_key(b, r); + + return bkey_cmp(bkey_start_pos(&ul), + bkey_start_pos(&ur)) ?: + cmp_int((unsigned long) r, (unsigned long) l); +} + +struct btree_nr_keys +bch2_extent_sort_fix_overlapping(struct bch_fs *c, struct bset *dst, + struct sort_iter *iter) +{ + struct btree *b = iter->b; + struct bkey_format *f = &b->format; + struct sort_iter_set *_l = iter->data, *_r = iter->data + 1; + struct bkey_packed *prev = NULL; + struct bkey l_unpacked, r_unpacked; + struct bkey_s l, r; + struct btree_nr_keys nr; + struct bkey_on_stack split; + + memset(&nr, 0, sizeof(nr)); + bkey_on_stack_init(&split); + + sort_iter_sort(iter, extent_sort_fix_overlapping_cmp); + + while (!sort_iter_end(iter)) { + l = __bkey_disassemble(b, _l->k, &l_unpacked); + + if (iter->used == 1) { + extent_sort_append(c, f, &nr, dst->start, &prev, l); + sort_iter_advance(iter, + extent_sort_fix_overlapping_cmp); + continue; + } + + r = __bkey_disassemble(b, _r->k, &r_unpacked); + + /* If current key and next key don't overlap, just append */ + if (bkey_cmp(l.k->p, bkey_start_pos(r.k)) <= 0) { + extent_sort_append(c, f, &nr, dst->start, &prev, l); + sort_iter_advance(iter, + extent_sort_fix_overlapping_cmp); + continue; + } + + /* Skip 0 size keys */ + if (!r.k->size) { + __sort_iter_advance(iter, 1, + extent_sort_fix_overlapping_cmp); + continue; + } + + /* + * overlap: keep the newer key and trim the older key so they + * don't overlap. comparing pointers tells us which one is + * newer, since the bsets are appended one after the other. 
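/*
 * A toy of the run-collapsing added to bch2_sort_keys() above: after
 * sorting, runs of equal keys are reduced to their last element, with
 * the needs_whiteout flags of the dropped copies OR'd into the
 * survivor. toy_key/toy_collapse are stand-ins for the packed-bkey
 * loop; the real code also asserts at most one copy per run has
 * needs_whiteout set.
 */
#include <stdio.h>
#include <stdbool.h>

struct toy_key { int pos; bool needs_whiteout; const char *val; };

static unsigned toy_collapse(const struct toy_key *in, unsigned nr,
			     struct toy_key *out)
{
	unsigned i = 0, nr_out = 0;

	while (i < nr) {
		bool needs_whiteout = false;

		/* skip to the last (newest) copy of this key: */
		while (i + 1 < nr && in[i].pos == in[i + 1].pos)
			needs_whiteout |= in[i++].needs_whiteout;

		out[nr_out] = in[i++];
		out[nr_out++].needs_whiteout |= needs_whiteout;
	}
	return nr_out;
}

int main(void)
{
	const struct toy_key in[] = {
		{ 1, true, "old" }, { 1, false, "new" }, { 2, false, "x" },
	};
	struct toy_key out[3];
	unsigned i, nr = toy_collapse(in, 3, out);

	for (i = 0; i < nr; i++)	/* prints: 1:new(w) 2:x */
		printf("%d:%s%s ", out[i].pos, out[i].val,
		       out[i].needs_whiteout ? "(w)" : "");
	printf("\n");
	return 0;
}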
+ */ + + /* can't happen because of comparison func */ + BUG_ON(_l->k < _r->k && + !bkey_cmp(bkey_start_pos(l.k), bkey_start_pos(r.k))); + + if (_l->k > _r->k) { + /* l wins, trim r */ + if (bkey_cmp(l.k->p, r.k->p) >= 0) { + __sort_iter_advance(iter, 1, + extent_sort_fix_overlapping_cmp); + } else { + bch2_cut_front_s(l.k->p, r); + extent_save(b, _r->k, r.k); + __sort_iter_sift(iter, 1, + extent_sort_fix_overlapping_cmp); + } + } else if (bkey_cmp(l.k->p, r.k->p) > 0) { + + /* + * r wins, but it overlaps in the middle of l - split l: + */ + bkey_on_stack_reassemble(&split, c, l.s_c); + bch2_cut_back(bkey_start_pos(r.k), split.k); + + bch2_cut_front_s(r.k->p, l); + extent_save(b, _l->k, l.k); + + __sort_iter_sift(iter, 0, + extent_sort_fix_overlapping_cmp); + + extent_sort_append(c, f, &nr, dst->start, + &prev, bkey_i_to_s(split.k)); + } else { + bch2_cut_back_s(bkey_start_pos(r.k), l); + extent_save(b, _l->k, l.k); + } + } + + extent_sort_advance_prev(f, &nr, dst->start, &prev); + + dst->u64s = cpu_to_le16((u64 *) prev - dst->_data); + + bkey_on_stack_exit(&split, c); + return nr; +} + static inline int sort_extents_cmp(struct btree *b, struct bkey_packed *l, struct bkey_packed *r) @@ -530,28 +438,6 @@ unsigned bch2_sort_extents(struct bkey_packed *dst, return (u64 *) out - (u64 *) dst; } -static inline int sort_key_whiteouts_cmp(struct btree *b, - struct bkey_packed *l, - struct bkey_packed *r) -{ - return bkey_cmp_packed(b, l, r); -} - -unsigned bch2_sort_key_whiteouts(struct bkey_packed *dst, - struct sort_iter *iter) -{ - struct bkey_packed *in, *out = dst; - - sort_iter_sort(iter, sort_key_whiteouts_cmp); - - while ((in = sort_iter_next(iter, sort_key_whiteouts_cmp))) { - bkey_copy(out, in); - out = bkey_next(out); - } - - return (u64 *) out - (u64 *) dst; -} - static inline int sort_extent_whiteouts_cmp(struct btree *b, struct bkey_packed *l, struct bkey_packed *r) diff --git a/fs/bcachefs/bkey_sort.h b/fs/bcachefs/bkey_sort.h index 397009181eae..458a051fdac5 100644 --- a/fs/bcachefs/bkey_sort.h +++ b/fs/bcachefs/bkey_sort.h @@ -2,20 +2,10 @@ #ifndef _BCACHEFS_BKEY_SORT_H #define _BCACHEFS_BKEY_SORT_H -struct btree_node_iter_large { - u16 used; - - struct btree_node_iter_set data[MAX_BSETS]; -}; - -void bch2_btree_node_iter_large_push(struct btree_node_iter_large *, - struct btree *, - const struct bkey_packed *, - const struct bkey_packed *); - struct sort_iter { - struct btree *b; + struct btree *b; unsigned used; + unsigned size; struct sort_iter_set { struct bkey_packed *k, *end; @@ -24,27 +14,27 @@ struct sort_iter { static inline void sort_iter_init(struct sort_iter *iter, struct btree *b) { - memset(iter, 0, sizeof(*iter)); iter->b = b; + iter->used = 0; + iter->size = ARRAY_SIZE(iter->data); } static inline void sort_iter_add(struct sort_iter *iter, struct bkey_packed *k, struct bkey_packed *end) { - BUG_ON(iter->used >= ARRAY_SIZE(iter->data)); + BUG_ON(iter->used >= iter->size); if (k != end) iter->data[iter->used++] = (struct sort_iter_set) { k, end }; } struct btree_nr_keys -bch2_key_sort_fix_overlapping(struct bset *, struct btree *, - struct btree_node_iter_large *); +bch2_key_sort_fix_overlapping(struct bch_fs *, struct bset *, + struct sort_iter *); struct btree_nr_keys bch2_extent_sort_fix_overlapping(struct bch_fs *, struct bset *, - struct btree *, - struct btree_node_iter_large *); + struct sort_iter *); struct btree_nr_keys bch2_sort_repack(struct bset *, struct btree *, @@ -61,8 +51,6 @@ unsigned bch2_sort_keys(struct bkey_packed *, unsigned 
bch2_sort_extents(struct bkey_packed *, struct sort_iter *, bool); -unsigned bch2_sort_key_whiteouts(struct bkey_packed *, - struct sort_iter *); unsigned bch2_sort_extent_whiteouts(struct bkey_packed *, struct sort_iter *); diff --git a/fs/bcachefs/bset.c b/fs/bcachefs/bset.c index a0f0b0eadffb..cf8fa59fada1 100644 --- a/fs/bcachefs/bset.c +++ b/fs/bcachefs/bset.c @@ -253,10 +253,9 @@ void bch2_verify_insert_pos(struct btree *b, struct bkey_packed *where, bch2_bkey_to_text(&PBUF(buf2), &k2); panic("prev > insert:\n" - "prev key %5u %s\n" - "insert key %5u %s\n", - __btree_node_key_to_offset(b, prev), buf1, - __btree_node_key_to_offset(b, insert), buf2); + "prev key %s\n" + "insert key %s\n", + buf1, buf2); } #endif #if 0 @@ -275,10 +274,9 @@ void bch2_verify_insert_pos(struct btree *b, struct bkey_packed *where, bch2_bkey_to_text(&PBUF(buf2), &k2); panic("insert > next:\n" - "insert key %5u %s\n" - "next key %5u %s\n", - __btree_node_key_to_offset(b, insert), buf1, - __btree_node_key_to_offset(b, next), buf2); + "insert key %s\n" + "next key %s\n", + buf1, buf2); } #endif } @@ -1399,21 +1397,21 @@ struct bkey_packed *bch2_bset_search_linear(struct btree *b, { if (lossy_packed_search) while (m != btree_bkey_last(b, t) && - bkey_iter_cmp_p_or_unp(b, search, lossy_packed_search, - m) > 0) + bkey_iter_cmp_p_or_unp(b, m, + lossy_packed_search, search) < 0) m = bkey_next_skip_noops(m, btree_bkey_last(b, t)); if (!packed_search) while (m != btree_bkey_last(b, t) && - bkey_iter_pos_cmp(b, search, m) > 0) + bkey_iter_pos_cmp(b, m, search) < 0) m = bkey_next_skip_noops(m, btree_bkey_last(b, t)); if (btree_keys_expensive_checks(b)) { struct bkey_packed *prev = bch2_bkey_prev_all(b, t, m); BUG_ON(prev && - bkey_iter_cmp_p_or_unp(b, search, packed_search, - prev) <= 0); + bkey_iter_cmp_p_or_unp(b, prev, + packed_search, search) >= 0); } return m; diff --git a/fs/bcachefs/bset.h b/fs/bcachefs/bset.h index 2653a74b3b14..7338ccbc8cbd 100644 --- a/fs/bcachefs/bset.h +++ b/fs/bcachefs/bset.h @@ -199,12 +199,6 @@ __bkey_unpack_key_format_checked(const struct btree *b, if (btree_keys_expensive_checks(b)) { struct bkey dst2 = __bch2_bkey_unpack_key(&b->format, src); - /* - * hack around a harmless race when compacting whiteouts - * for a write: - */ - dst2.needs_whiteout = dst->needs_whiteout; - BUG_ON(memcmp(dst, &dst2, sizeof(*dst))); } } @@ -360,7 +354,7 @@ void bch2_bset_delete(struct btree *, struct bkey_packed *, unsigned); static inline int bkey_cmp_p_or_unp(const struct btree *b, const struct bkey_packed *l, const struct bkey_packed *r_packed, - struct bpos *r) + const struct bpos *r) { EBUG_ON(r_packed && !bkey_packed(r_packed)); @@ -449,7 +443,7 @@ static inline bool bch2_btree_node_iter_end(struct btree_node_iter *iter) * XXX: only need to compare pointers for keys that are both within a * btree_node_iterator - we need to break ties for prev() to work correctly */ -static inline int bkey_iter_cmp(struct btree *b, +static inline int bkey_iter_cmp(const struct btree *b, const struct bkey_packed *l, const struct bkey_packed *r) { @@ -458,7 +452,7 @@ static inline int bkey_iter_cmp(struct btree *b, ?: cmp_int(l, r); } -static inline int btree_node_iter_cmp(struct btree *b, +static inline int btree_node_iter_cmp(const struct btree *b, struct btree_node_iter_set l, struct btree_node_iter_set r) { @@ -467,22 +461,22 @@ static inline int btree_node_iter_cmp(struct btree *b, __btree_node_offset_to_key(b, r.k)); } -/* These assume l (the search key) is not a deleted key: */ -static inline int 
bkey_iter_pos_cmp(struct btree *b, - struct bpos *l, - const struct bkey_packed *r) +/* These assume r (the search key) is not a deleted key: */ +static inline int bkey_iter_pos_cmp(const struct btree *b, + const struct bkey_packed *l, + const struct bpos *r) { - return -bkey_cmp_left_packed(b, r, l) - ?: (int) bkey_deleted(r); + return bkey_cmp_left_packed(b, l, r) + ?: -((int) bkey_deleted(l)); } -static inline int bkey_iter_cmp_p_or_unp(struct btree *b, - struct bpos *l, - const struct bkey_packed *l_packed, - const struct bkey_packed *r) +static inline int bkey_iter_cmp_p_or_unp(const struct btree *b, + const struct bkey_packed *l, + const struct bkey_packed *r_packed, + const struct bpos *r) { - return -bkey_cmp_p_or_unp(b, r, l_packed, l) - ?: (int) bkey_deleted(r); + return bkey_cmp_p_or_unp(b, l, r_packed, r) + ?: -((int) bkey_deleted(l)); } static inline struct bkey_packed * diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index 5d3acba525c2..0c737f35f430 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -557,7 +557,6 @@ out: b->sib_u64s[0] = 0; b->sib_u64s[1] = 0; b->whiteout_u64s = 0; - b->uncompacted_whiteout_u64s = 0; bch2_btree_keys_init(b, &c->expensive_debug_checks); bch2_time_stats_update(&c->times[BCH_TIME_btree_node_mem_alloc], diff --git a/fs/bcachefs/btree_cache.h b/fs/bcachefs/btree_cache.h index c5873c58439c..83358d6a4df8 100644 --- a/fs/bcachefs/btree_cache.h +++ b/fs/bcachefs/btree_cache.h @@ -75,7 +75,7 @@ static inline unsigned btree_blocks(struct bch_fs *c) return c->opts.btree_node_size >> c->block_bits; } -#define BTREE_SPLIT_THRESHOLD(c) (btree_blocks(c) * 3 / 4) +#define BTREE_SPLIT_THRESHOLD(c) (btree_max_u64s(c) * 3 / 4) #define BTREE_FOREGROUND_MERGE_THRESHOLD(c) (btree_max_u64s(c) * 1 / 3) #define BTREE_FOREGROUND_MERGE_HYSTERESIS(c) \ diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 8bbf60b07736..05879b66d6af 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -116,8 +116,8 @@ static int bch2_gc_mark_key(struct bch_fs *c, struct bkey_s_c k, struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); const struct bch_extent_ptr *ptr; unsigned flags = - BCH_BUCKET_MARK_GC| - (initial ? BCH_BUCKET_MARK_NOATOMIC : 0); + BTREE_TRIGGER_GC| + (initial ? 
BTREE_TRIGGER_NOATOMIC : 0); int ret = 0; if (initial) { @@ -294,8 +294,8 @@ static int mark_journal_key(struct bch_fs *c, enum btree_id id, BTREE_ITER_SLOTS, k, ret) { percpu_down_read(&c->mark_lock); ret = bch2_mark_overwrite(&trans, iter, k, insert, NULL, - BCH_BUCKET_MARK_GC| - BCH_BUCKET_MARK_NOATOMIC); + BTREE_TRIGGER_GC| + BTREE_TRIGGER_NOATOMIC); percpu_up_read(&c->mark_lock); if (!ret) @@ -407,7 +407,7 @@ static void bch2_mark_superblocks(struct bch_fs *c) gc_pos_set(c, gc_phase(GC_PHASE_SB)); for_each_online_member(ca, c, i) - bch2_mark_dev_superblock(c, ca, BCH_BUCKET_MARK_GC); + bch2_mark_dev_superblock(c, ca, BTREE_TRIGGER_GC); mutex_unlock(&c->sb_lock); } @@ -424,7 +424,7 @@ static void bch2_mark_pending_btree_node_frees(struct bch_fs *c) if (d->index_update_done) bch2_mark_key(c, bkey_i_to_s_c(&d->key), 0, 0, NULL, 0, - BCH_BUCKET_MARK_GC); + BTREE_TRIGGER_GC); mutex_unlock(&c->btree_interior_update_lock); } @@ -445,7 +445,7 @@ static void bch2_mark_allocator_buckets(struct bch_fs *c) fifo_for_each_entry(i, &ca->free_inc, iter) bch2_mark_alloc_bucket(c, ca, i, true, gc_pos_alloc(c, NULL), - BCH_BUCKET_MARK_GC); + BTREE_TRIGGER_GC); @@ -453,7 +453,7 @@ static void bch2_mark_allocator_buckets(struct bch_fs *c) fifo_for_each_entry(i, &ca->free[j], iter) bch2_mark_alloc_bucket(c, ca, i, true, gc_pos_alloc(c, NULL), - BCH_BUCKET_MARK_GC); + BTREE_TRIGGER_GC); } spin_unlock(&c->freelist_lock); @@ -467,7 +467,7 @@ static void bch2_mark_allocator_buckets(struct bch_fs *c) ca = bch_dev_bkey_exists(c, ob->ptr.dev); bch2_mark_alloc_bucket(c, ca, PTR_BUCKET_NR(ca, &ob->ptr), true, gc_pos_alloc(c, ob), - BCH_BUCKET_MARK_GC); + BTREE_TRIGGER_GC); } spin_unlock(&ob->lock); } diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index fe6eb45ddfc2..5f1c3183fa85 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -23,7 +23,8 @@ static void verify_no_dups(struct btree *b, struct bkey_packed *start, - struct bkey_packed *end) + struct bkey_packed *end, + bool extents) { #ifdef CONFIG_BCACHEFS_DEBUG struct bkey_packed *k, *p; @@ -37,7 +38,7 @@ static void verify_no_dups(struct btree *b, struct bkey l = bkey_unpack_key(b, p); struct bkey r = bkey_unpack_key(b, k); - BUG_ON(btree_node_is_extents(b) + BUG_ON(extents ? bkey_cmp(l.p, bkey_start_pos(&r)) > 0 : bkey_cmp(l.p, bkey_start_pos(&r)) >= 0); //BUG_ON(bkey_cmp_packed(&b->format, p, k) >= 0); @@ -80,27 +81,103 @@ static void *btree_bounce_alloc(struct bch_fs *c, unsigned order, return mempool_alloc(&c->btree_bounce_pool, GFP_NOIO); } -static unsigned should_compact_bset(struct btree *b, struct bset_tree *t, - bool compacting, - enum compact_mode mode) +static void sort_bkey_ptrs(const struct btree *bt, + struct bkey_packed **ptrs, unsigned nr) { - unsigned bset_u64s = le16_to_cpu(bset(b, t)->u64s); - unsigned dead_u64s = bset_u64s - b->nr.bset_u64s[t - b->set]; + unsigned n = nr, a = nr / 2, b, c, d; - if (mode == COMPACT_LAZY) { - if (should_compact_bset_lazy(b, t) || - (compacting && !bset_written(b, bset(b, t)))) - return dead_u64s; - } else { - if (bset_written(b, bset(b, t))) - return dead_u64s; + if (!a) + return; + + /* Heap sort: see lib/sort.c: */ + while (1) { + if (a) + a--; + else if (--n) + swap(ptrs[0], ptrs[n]); + else + break; + + for (b = a; c = 2 * b + 1, (d = c + 1) < n;) + b = bkey_cmp_packed(bt, + ptrs[c], + ptrs[d]) >= 0 ? 
c : d; + if (d == n) + b = c; + + while (b != a && + bkey_cmp_packed(bt, + ptrs[a], + ptrs[b]) >= 0) + b = (b - 1) / 2; + c = b; + while (b != a) { + b = (b - 1) / 2; + swap(ptrs[b], ptrs[c]); + } } +} + +static void bch2_sort_whiteouts(struct bch_fs *c, struct btree *b) +{ + struct bkey_packed *new_whiteouts, **ptrs, **ptrs_end, *k; + bool used_mempool = false; + unsigned order; + + if (!b->whiteout_u64s) + return; + + order = get_order(b->whiteout_u64s * sizeof(u64)); + + new_whiteouts = btree_bounce_alloc(c, order, &used_mempool); + + ptrs = ptrs_end = ((void *) new_whiteouts + (PAGE_SIZE << order)); + + for (k = unwritten_whiteouts_start(c, b); + k != unwritten_whiteouts_end(c, b); + k = bkey_next(k)) + *--ptrs = k; - return 0; + sort_bkey_ptrs(b, ptrs, ptrs_end - ptrs); + + k = new_whiteouts; + + while (ptrs != ptrs_end) { + bkey_copy(k, *ptrs); + k = bkey_next(k); + ptrs++; + } + + verify_no_dups(b, new_whiteouts, + (void *) ((u64 *) new_whiteouts + b->whiteout_u64s), + btree_node_old_extent_overwrite(b)); + + memcpy_u64s(unwritten_whiteouts_start(c, b), + new_whiteouts, b->whiteout_u64s); + + btree_bounce_free(c, order, used_mempool, new_whiteouts); } -bool __bch2_compact_whiteouts(struct bch_fs *c, struct btree *b, - enum compact_mode mode) +static bool should_compact_bset(struct btree *b, struct bset_tree *t, + bool compacting, enum compact_mode mode) +{ + if (!bset_dead_u64s(b, t)) + return false; + + switch (mode) { + case COMPACT_LAZY: + return should_compact_bset_lazy(b, t) || + (compacting && !bset_written(b, bset(b, t))); + case COMPACT_ALL: + return true; + default: + BUG(); + } +} + +static bool bch2_compact_extent_whiteouts(struct bch_fs *c, + struct btree *b, + enum compact_mode mode) { const struct bkey_format *f = &b->format; struct bset_tree *t; @@ -110,13 +187,17 @@ bool __bch2_compact_whiteouts(struct bch_fs *c, struct btree *b, unsigned order, whiteout_u64s = 0, u64s; bool used_mempool, compacting = false; + BUG_ON(!btree_node_is_extents(b)); + for_each_bset(b, t) - whiteout_u64s += should_compact_bset(b, t, - whiteout_u64s != 0, mode); + if (should_compact_bset(b, t, whiteout_u64s != 0, mode)) + whiteout_u64s += bset_dead_u64s(b, t); if (!whiteout_u64s) return false; + bch2_sort_whiteouts(c, b); + sort_iter_init(&sort_iter, b); whiteout_u64s += b->whiteout_u64s; @@ -139,9 +220,12 @@ bool __bch2_compact_whiteouts(struct bch_fs *c, struct btree *b, if (t != b->set && !bset_written(b, i)) { src = container_of(i, struct btree_node_entry, keys); dst = max(write_block(b), - (void *) btree_bkey_last(b, t -1)); + (void *) btree_bkey_last(b, t - 1)); } + if (src != dst) + compacting = true; + if (!should_compact_bset(b, t, compacting, mode)) { if (src != dst) { memmove(dst, src, sizeof(*src) + @@ -169,18 +253,21 @@ bool __bch2_compact_whiteouts(struct bch_fs *c, struct btree *b, for (k = start; k != end; k = n) { n = bkey_next_skip_noops(k, end); - if (bkey_deleted(k) && btree_node_is_extents(b)) + if (bkey_deleted(k)) continue; + BUG_ON(bkey_whiteout(k) && + k->needs_whiteout && + bkey_written(b, k)); + if (bkey_whiteout(k) && !k->needs_whiteout) continue; if (bkey_whiteout(k)) { - unreserve_whiteout(b, k); memcpy_u64s(u_pos, k, bkeyp_key_u64s(f, k)); set_bkeyp_val_u64s(f, u_pos, 0); u_pos = bkey_next(u_pos); - } else if (mode != COMPACT_WRITTEN_NO_WRITE_LOCK) { + } else { bkey_copy(out, k); out = bkey_next(out); } @@ -188,11 +275,9 @@ bool __bch2_compact_whiteouts(struct bch_fs *c, struct btree *b, sort_iter_add(&sort_iter, u_start, u_pos); - if (mode != 
COMPACT_WRITTEN_NO_WRITE_LOCK) { - i->u64s = cpu_to_le16((u64 *) out - i->_data); - set_btree_bset_end(b, t); - bch2_bset_set_no_aux_tree(b, t); - } + i->u64s = cpu_to_le16((u64 *) out - i->_data); + set_btree_bset_end(b, t); + bch2_bset_set_no_aux_tree(b, t); } b->whiteout_u64s = (u64 *) u_pos - (u64 *) whiteouts; @@ -200,13 +285,10 @@ bool __bch2_compact_whiteouts(struct bch_fs *c, struct btree *b, BUG_ON((void *) unwritten_whiteouts_start(c, b) < (void *) btree_bkey_last(b, bset_tree_last(b))); - u64s = (btree_node_is_extents(b) - ? bch2_sort_extent_whiteouts - : bch2_sort_key_whiteouts)(unwritten_whiteouts_start(c, b), - &sort_iter); + u64s = bch2_sort_extent_whiteouts(unwritten_whiteouts_start(c, b), + &sort_iter); BUG_ON(u64s > b->whiteout_u64s); - BUG_ON(u64s != b->whiteout_u64s && !btree_node_is_extents(b)); BUG_ON(u_pos != whiteouts && !u64s); if (u64s != b->whiteout_u64s) { @@ -218,12 +300,12 @@ bool __bch2_compact_whiteouts(struct bch_fs *c, struct btree *b, verify_no_dups(b, unwritten_whiteouts_start(c, b), - unwritten_whiteouts_end(c, b)); + unwritten_whiteouts_end(c, b), + true); btree_bounce_free(c, order, used_mempool, whiteouts); - if (mode != COMPACT_WRITTEN_NO_WRITE_LOCK) - bch2_btree_build_aux_trees(b); + bch2_btree_build_aux_trees(b); bch_btree_keys_u64s_remaining(c, b); bch2_verify_btree_nr_keys(b); @@ -231,7 +313,7 @@ bool __bch2_compact_whiteouts(struct bch_fs *c, struct btree *b, return true; } -static bool bch2_drop_whiteouts(struct btree *b) +static bool bch2_drop_whiteouts(struct btree *b, enum compact_mode mode) { struct bset_tree *t; bool ret = false; @@ -239,21 +321,34 @@ static bool bch2_drop_whiteouts(struct btree *b) for_each_bset(b, t) { struct bset *i = bset(b, t); struct bkey_packed *k, *n, *out, *start, *end; + struct btree_node_entry *src = NULL, *dst = NULL; + + if (t != b->set && !bset_written(b, i)) { + src = container_of(i, struct btree_node_entry, keys); + dst = max(write_block(b), + (void *) btree_bkey_last(b, t - 1)); + } - if (!should_compact_bset(b, t, true, COMPACT_WRITTEN)) + if (src != dst) + ret = true; + + if (!should_compact_bset(b, t, ret, mode)) { + if (src != dst) { + memmove(dst, src, sizeof(*src) + + le16_to_cpu(src->keys.u64s) * + sizeof(u64)); + i = &dst->keys; + set_btree_bset(b, t, i); + } continue; + } start = btree_bkey_first(b, t); end = btree_bkey_last(b, t); - if (!bset_written(b, i) && - t != b->set) { - struct bset *dst = - max_t(struct bset *, write_block(b), - (void *) btree_bkey_last(b, t -1)); - - memmove(dst, i, sizeof(struct bset)); - i = dst; + if (src != dst) { + memmove(dst, src, sizeof(*src)); + i = &dst->keys; set_btree_bset(b, t, i); } @@ -265,19 +360,32 @@ static bool bch2_drop_whiteouts(struct btree *b) if (!bkey_whiteout(k)) { bkey_copy(out, k); out = bkey_next(out); + } else { + BUG_ON(k->needs_whiteout); } } i->u64s = cpu_to_le16((u64 *) out - i->_data); + set_btree_bset_end(b, t); bch2_bset_set_no_aux_tree(b, t); ret = true; } bch2_verify_btree_nr_keys(b); + bch2_btree_build_aux_trees(b); + return ret; } +bool bch2_compact_whiteouts(struct bch_fs *c, struct btree *b, + enum compact_mode mode) +{ + return !btree_node_old_extent_overwrite(b) + ? 
bch2_drop_whiteouts(b, mode) + : bch2_compact_extent_whiteouts(c, b, mode); +} + static void btree_node_sort(struct bch_fs *c, struct btree *b, struct btree_iter *iter, unsigned start_idx, @@ -313,10 +421,10 @@ static void btree_node_sort(struct bch_fs *c, struct btree *b, start_time = local_clock(); - if (btree_node_is_extents(b)) + if (btree_node_old_extent_overwrite(b)) filter_whiteouts = bset_written(b, start_bset); - u64s = (btree_node_is_extents(b) + u64s = (btree_node_old_extent_overwrite(b) ? bch2_sort_extents : bch2_sort_keys)(out->keys.start, &sort_iter, @@ -509,7 +617,7 @@ static void bset_encrypt(struct bch_fs *c, struct bset *i, unsigned offset) bch2_encrypt(c, BSET_CSUM_TYPE(i), nonce, &bn->flags, bytes); - nonce = nonce_add(nonce, round_up(bytes, CHACHA20_BLOCK_SIZE)); + nonce = nonce_add(nonce, round_up(bytes, CHACHA_BLOCK_SIZE)); } bch2_encrypt(c, BSET_CSUM_TYPE(i), nonce, i->_data, @@ -602,7 +710,8 @@ static int validate_bset(struct bch_fs *c, struct btree *b, bool have_retry) { struct bkey_packed *k, *prev = NULL; - struct bpos prev_pos = POS_MIN; + struct bpos prev_pos = POS_MIN; + struct bpos prev_data = POS_MIN; bool seen_non_whiteout = false; unsigned version; const char *err; @@ -735,7 +844,8 @@ static int validate_bset(struct bch_fs *c, struct btree *b, (bkey_cmp(prev_pos, bkey_start_pos(u.k)) > 0))) { *whiteout_u64s = k->_data - i->_data; seen_non_whiteout = true; - } else if (bkey_cmp(prev_pos, bkey_start_pos(u.k)) > 0) { + } else if (bkey_cmp(prev_data, bkey_start_pos(u.k)) > 0 || + bkey_cmp(prev_pos, u.k->p) > 0) { btree_err(BTREE_ERR_FATAL, c, b, i, "keys out of order: %llu:%llu > %llu:%llu", prev_pos.inode, @@ -745,7 +855,10 @@ static int validate_bset(struct bch_fs *c, struct btree *b, /* XXX: repair this */ } + if (!bkey_deleted(u.k)) + prev_data = u.k->p; prev_pos = u.k->p; + prev = k; k = bkey_next_skip_noops(k, vstruct_last(i)); } @@ -758,7 +871,7 @@ fsck_err: int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry) { struct btree_node_entry *bne; - struct btree_node_iter_large *iter; + struct sort_iter *iter; struct btree_node *sorted; struct bkey_packed *k; struct bset *i; @@ -767,7 +880,8 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry int ret, retry_read = 0, write = READ; iter = mempool_alloc(&c->fill_iter, GFP_NOIO); - iter->used = 0; + sort_iter_init(iter, b); + iter->size = (btree_blocks(c) + 1) * 2; if (bch2_meta_read_fault("btree")) btree_err(BTREE_ERR_MUST_RETRY, c, b, NULL, @@ -803,6 +917,10 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry bset_encrypt(c, i, b->written << 9); + if (btree_node_is_extents(b) && + !BTREE_NODE_NEW_EXTENT_OVERWRITE(b->data)) + set_btree_node_old_extent_overwrite(b); + sectors = vstruct_sectors(b->data, c->block_bits); btree_node_set_format(b, b->data->format); @@ -846,13 +964,12 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry if (blacklisted && !first) continue; - bch2_btree_node_iter_large_push(iter, b, - i->start, - vstruct_idx(i, whiteout_u64s)); + sort_iter_add(iter, i->start, + vstruct_idx(i, whiteout_u64s)); - bch2_btree_node_iter_large_push(iter, b, - vstruct_idx(i, whiteout_u64s), - vstruct_last(i)); + sort_iter_add(iter, + vstruct_idx(i, whiteout_u64s), + vstruct_last(i)); } for (bne = write_block(b); @@ -867,9 +984,9 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry set_btree_bset(b, b->set, &b->data->keys); - b->nr = 
btree_node_is_extents(b) - ? bch2_extent_sort_fix_overlapping(c, &sorted->keys, b, iter) - : bch2_key_sort_fix_overlapping(&sorted->keys, b, iter); + b->nr = (btree_node_old_extent_overwrite(b) + ? bch2_extent_sort_fix_overlapping + : bch2_key_sort_fix_overlapping)(c, &sorted->keys, iter); u64s = le16_to_cpu(sorted->keys.u64s); *sorted = *b->data; @@ -1343,21 +1460,7 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, BUG_ON(le64_to_cpu(b->data->magic) != bset_magic(c)); BUG_ON(memcmp(&b->data->format, &b->format, sizeof(b->format))); - /* - * We can't block on six_lock_write() here; another thread might be - * trying to get a journal reservation with read locks held, and getting - * a journal reservation might be blocked on flushing the journal and - * doing btree writes: - */ - if (lock_type_held == SIX_LOCK_intent && - six_trylock_write(&b->lock)) { - __bch2_compact_whiteouts(c, b, COMPACT_WRITTEN); - six_unlock_write(&b->lock); - } else { - __bch2_compact_whiteouts(c, b, COMPACT_WRITTEN_NO_WRITE_LOCK); - } - - BUG_ON(b->uncompacted_whiteout_u64s); + bch2_sort_whiteouts(c, b); sort_iter_init(&sort_iter, b); @@ -1396,7 +1499,7 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, i->journal_seq = cpu_to_le64(seq); i->u64s = 0; - if (!btree_node_is_extents(b)) { + if (!btree_node_old_extent_overwrite(b)) { sort_iter_add(&sort_iter, unwritten_whiteouts_start(c, b), unwritten_whiteouts_end(c, b)); @@ -1411,7 +1514,7 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, b->whiteout_u64s = 0; - u64s = btree_node_is_extents(b) + u64s = btree_node_old_extent_overwrite(b) ? bch2_sort_extents(vstruct_last(i), &sort_iter, false) : bch2_sort_keys(i->start, &sort_iter, false); le16_add_cpu(&i->u64s, u64s); @@ -1545,7 +1648,6 @@ bool bch2_btree_post_write_cleanup(struct bch_fs *c, struct btree *b) return false; BUG_ON(b->whiteout_u64s); - BUG_ON(b->uncompacted_whiteout_u64s); clear_btree_node_just_written(b); @@ -1566,7 +1668,7 @@ bool bch2_btree_post_write_cleanup(struct bch_fs *c, struct btree *b) btree_node_sort(c, b, NULL, 0, b->nsets, true); invalidated_iter = true; } else { - invalidated_iter = bch2_drop_whiteouts(b); + invalidated_iter = bch2_drop_whiteouts(b, COMPACT_ALL); } for_each_bset(b, t) diff --git a/fs/bcachefs/btree_io.h b/fs/bcachefs/btree_io.h index 955a80cafae3..e90e89eee273 100644 --- a/fs/bcachefs/btree_io.h +++ b/fs/bcachefs/btree_io.h @@ -54,16 +54,17 @@ static inline bool btree_node_may_write(struct btree *b) enum compact_mode { COMPACT_LAZY, - COMPACT_WRITTEN, - COMPACT_WRITTEN_NO_WRITE_LOCK, + COMPACT_ALL, }; -bool __bch2_compact_whiteouts(struct bch_fs *, struct btree *, enum compact_mode); +bool bch2_compact_whiteouts(struct bch_fs *, struct btree *, + enum compact_mode); -static inline unsigned should_compact_bset_lazy(struct btree *b, struct bset_tree *t) +static inline bool should_compact_bset_lazy(struct btree *b, + struct bset_tree *t) { unsigned total_u64s = bset_u64s(t); - unsigned dead_u64s = total_u64s - b->nr.bset_u64s[t - b->set]; + unsigned dead_u64s = bset_dead_u64s(b, t); return dead_u64s > 64 && dead_u64s * 3 > total_u64s; } @@ -74,7 +75,7 @@ static inline bool bch2_maybe_compact_whiteouts(struct bch_fs *c, struct btree * for_each_bset(b, t) if (should_compact_bset_lazy(b, t)) - return __bch2_compact_whiteouts(c, b, COMPACT_LAZY); + return bch2_compact_whiteouts(c, b, COMPACT_LAZY); return false; } diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index a4180124d7d1..ea0555b806f0 100644 --- 
a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -11,10 +11,6 @@ #include <linux/prefetch.h> #include <trace/events/bcachefs.h> -static inline struct bkey_s_c __btree_iter_peek_all(struct btree_iter *, - struct btree_iter_level *, - struct bkey *); - #define BTREE_ITER_NO_NODE_GET_LOCKS ((struct btree *) 1) #define BTREE_ITER_NO_NODE_DROP ((struct btree *) 2) #define BTREE_ITER_NO_NODE_LOCK_ROOT ((struct btree *) 3) @@ -29,37 +25,14 @@ static inline bool is_btree_node(struct btree_iter *iter, unsigned l) (unsigned long) iter->l[l].b >= 128; } -/* Returns < 0 if @k is before iter pos, > 0 if @k is after */ -static inline int __btree_iter_pos_cmp(struct btree_iter *iter, - const struct btree *b, - const struct bkey_packed *k, - bool interior_node) +static inline struct bpos btree_iter_search_key(struct btree_iter *iter) { - int cmp = bkey_cmp_left_packed(b, k, &iter->pos); - - if (cmp) - return cmp; - if (bkey_deleted(k)) - return -1; + struct bpos pos = iter->pos; - /* - * Normally, for extents we want the first key strictly greater than - * the iterator position - with the exception that for interior nodes, - * we don't want to advance past the last key if the iterator position - * is POS_MAX: - */ - if (iter->flags & BTREE_ITER_IS_EXTENTS && - (!interior_node || - bkey_cmp_left_packed_byval(b, k, POS_MAX))) - return -1; - return 1; -} - -static inline int btree_iter_pos_cmp(struct btree_iter *iter, - const struct btree *b, - const struct bkey_packed *k) -{ - return __btree_iter_pos_cmp(iter, b, k, b->level != 0); + if ((iter->flags & BTREE_ITER_IS_EXTENTS) && + bkey_cmp(pos, POS_MAX)) + pos = bkey_successor(pos); + return pos; } /* Btree node locking: */ @@ -415,6 +388,7 @@ void bch2_trans_unlock(struct btree_trans *trans) static void __bch2_btree_iter_verify(struct btree_iter *iter, struct btree *b) { + struct bpos pos = btree_iter_search_key(iter); struct btree_iter_level *l = &iter->l[b->level]; struct btree_node_iter tmp = l->iter; struct bkey_packed *k; @@ -437,17 +411,17 @@ static void __bch2_btree_iter_verify(struct btree_iter *iter, k = b->level || iter->flags & BTREE_ITER_IS_EXTENTS ? 
bch2_btree_node_iter_prev_filter(&tmp, b, KEY_TYPE_discard) : bch2_btree_node_iter_prev_all(&tmp, b); - if (k && btree_iter_pos_cmp(iter, b, k) > 0) { + if (k && bkey_iter_pos_cmp(b, k, &pos) >= 0) { char buf[100]; struct bkey uk = bkey_unpack_key(b, k); bch2_bkey_to_text(&PBUF(buf), &uk); - panic("prev key should be before iter pos:\n%s\n%llu:%llu\n", + panic("iterator should be before prev key:\n%s\n%llu:%llu\n", buf, iter->pos.inode, iter->pos.offset); } k = bch2_btree_node_iter_peek_all(&l->iter, b); - if (k && btree_iter_pos_cmp(iter, b, k) < 0) { + if (k && bkey_iter_pos_cmp(b, k, &pos) < 0) { char buf[100]; struct bkey uk = bkey_unpack_key(b, k); @@ -457,11 +431,6 @@ static void __bch2_btree_iter_verify(struct btree_iter *iter, "cur key %s\n", iter->pos.inode, iter->pos.offset, buf); } - - BUG_ON(iter->uptodate == BTREE_ITER_UPTODATE && - btree_iter_type(iter) == BTREE_ITER_KEYS && - !bkey_whiteout(&iter->k) && - bch2_btree_node_iter_end(&l->iter)); } void bch2_btree_iter_verify(struct btree_iter *iter, struct btree *b) @@ -500,15 +469,19 @@ static void btree_node_iter_set_set_pos(struct btree_node_iter *iter, } static void __bch2_btree_iter_fix_key_modified(struct btree_iter *iter, - struct btree *b, - struct bkey_packed *where) + struct btree *b, + struct bkey_packed *where) { - struct btree_node_iter *node_iter = &iter->l[0].iter; + struct btree_iter_level *l = &iter->l[b->level]; + struct bpos pos = btree_iter_search_key(iter); - if (where == bch2_btree_node_iter_peek_all(node_iter, b)) { - bkey_disassemble(b, where, &iter->k); - btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK); - } + if (where != bch2_btree_node_iter_peek_all(&l->iter, l->b)) + return; + + if (bkey_iter_pos_cmp(l->b, where, &pos) < 0) + bch2_btree_node_iter_advance(&l->iter, l->b); + + btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK); } void bch2_btree_iter_fix_key_modified(struct btree_iter *iter, @@ -540,6 +513,7 @@ static void __bch2_btree_node_iter_fix(struct btree_iter *iter, bool iter_current_key_modified = orig_iter_pos >= offset && orig_iter_pos <= offset + clobber_u64s; + struct bpos iter_pos = btree_iter_search_key(iter); btree_node_iter_for_each(node_iter, set) if (set->end == old_end) @@ -547,7 +521,7 @@ static void __bch2_btree_node_iter_fix(struct btree_iter *iter, /* didn't find the bset in the iterator - might have to readd it: */ if (new_u64s && - btree_iter_pos_cmp(iter, b, where) > 0) { + bkey_iter_pos_cmp(b, where, &iter_pos) >= 0) { bch2_btree_node_iter_push(node_iter, b, where, end); goto fixup_done; } else { @@ -562,7 +536,7 @@ found: return; if (new_u64s && - btree_iter_pos_cmp(iter, b, where) > 0) { + bkey_iter_pos_cmp(b, where, &iter_pos) >= 0) { set->k = offset; } else if (set->k < offset + clobber_u64s) { set->k = offset + new_u64s; @@ -707,11 +681,12 @@ static inline bool btree_iter_advance_to_pos(struct btree_iter *iter, struct btree_iter_level *l, int max_advance) { + struct bpos pos = btree_iter_search_key(iter); struct bkey_packed *k; int nr_advanced = 0; while ((k = bch2_btree_node_iter_peek_all(&l->iter, l->b)) && - btree_iter_pos_cmp(iter, l->b, k) < 0) { + bkey_iter_pos_cmp(l->b, k, &pos) < 0) { if (max_advance > 0 && nr_advanced >= max_advance) return false; @@ -770,13 +745,7 @@ static inline bool btree_iter_pos_before_node(struct btree_iter *iter, static inline bool btree_iter_pos_after_node(struct btree_iter *iter, struct btree *b) { - int cmp = bkey_cmp(b->key.k.p, iter->pos); - - if (!cmp && - (iter->flags & BTREE_ITER_IS_EXTENTS) && - bkey_cmp(b->key.k.p, POS_MAX)) - 
cmp = -1; - return cmp < 0; + return bkey_cmp(b->key.k.p, btree_iter_search_key(iter)) < 0; } static inline bool btree_iter_pos_in_node(struct btree_iter *iter, @@ -790,16 +759,10 @@ static inline bool btree_iter_pos_in_node(struct btree_iter *iter, static inline void __btree_iter_init(struct btree_iter *iter, unsigned level) { + struct bpos pos = btree_iter_search_key(iter); struct btree_iter_level *l = &iter->l[level]; - bch2_btree_node_iter_init(&l->iter, l->b, &iter->pos); - - if (iter->flags & BTREE_ITER_IS_EXTENTS) - btree_iter_advance_to_pos(iter, l, -1); - - /* Skip to first non whiteout: */ - if (level) - bch2_btree_node_iter_peek(&l->iter, l->b); + bch2_btree_node_iter_init(&l->iter, l->b, &pos); btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK); } @@ -1032,10 +995,7 @@ retry_all: for (i = 0; i < nr_sorted; i++) { iter = &trans->iters[sorted[i]]; - do { - ret = btree_iter_traverse_one(iter); - } while (ret == -EINTR); - + ret = btree_iter_traverse_one(iter); if (ret) goto retry_all; } @@ -1148,7 +1108,8 @@ static int btree_iter_traverse_one(struct btree_iter *iter) iter->uptodate = BTREE_ITER_NEED_PEEK; bch2_btree_trans_verify_locks(iter->trans); - __bch2_btree_iter_verify(iter, iter->l[iter->level].b); + if (btree_iter_node(iter, iter->level)) + __bch2_btree_iter_verify(iter, iter->l[iter->level].b); return 0; } @@ -1378,12 +1339,6 @@ static inline struct bkey_s_c btree_iter_peek_uptodate(struct btree_iter *iter) if (debug_check_iterators(iter->trans->c)) { struct bkey k = bkey_unpack_key(l->b, _k); - /* - * this flag is internal to the btree code, - * we don't care if it doesn't match - if it's now set - * it just means the key has been written out to disk: - */ - k.needs_whiteout = iter->k.needs_whiteout; BUG_ON(memcmp(&k, &iter->k, sizeof(k))); } @@ -1571,9 +1526,7 @@ __bch2_btree_iter_peek_slot_extents(struct btree_iter *iter) int ret; recheck: - while ((k = __btree_iter_peek_all(iter, l, &iter->k)).k && - bkey_cmp(k.k->p, iter->pos) <= 0) - bch2_btree_node_iter_advance(&l->iter, l->b); + btree_iter_advance_to_pos(iter, l, -1); /* * iterator is now at the correct position for inserting at iter->pos, @@ -1582,9 +1535,27 @@ recheck: */ node_iter = l->iter; - if (k.k && bkey_whiteout(k.k)) - k = __btree_iter_unpack(iter, l, &iter->k, - bch2_btree_node_iter_peek(&node_iter, l->b)); + k = __btree_iter_unpack(iter, l, &iter->k, + bch2_btree_node_iter_peek(&node_iter, l->b)); + + if (k.k && bkey_cmp(bkey_start_pos(k.k), iter->pos) <= 0) { + /* + * If there wasn't actually a hole, want the iterator to be + * pointed at the key we found: + * + * XXX: actually, we shouldn't be changing the iterator here: + * the iterator needs to be correct for inserting at iter->pos, + * and there may be whiteouts between iter->pos and what this + * iterator points at: + */ + l->iter = node_iter; + + EBUG_ON(bkey_cmp(k.k->p, iter->pos) <= 0); + iter->uptodate = BTREE_ITER_UPTODATE; + + __bch2_btree_iter_verify(iter, l->b); + return k; + } /* * If we got to the end of the node, check if we need to traverse to the @@ -1599,24 +1570,6 @@ recheck: goto recheck; } - if (k.k && - !bkey_whiteout(k.k) && - bkey_cmp(bkey_start_pos(k.k), iter->pos) <= 0) { - /* - * if we skipped forward to find the first non whiteout and - * there _wasn't_ actually a hole, we want the iterator to be - * pointed at the key we found: - */ - l->iter = node_iter; - - EBUG_ON(bkey_cmp(k.k->p, iter->pos) < 0); - EBUG_ON(bkey_deleted(k.k)); - iter->uptodate = BTREE_ITER_UPTODATE; - - __bch2_btree_iter_verify(iter, l->b); - return k; - 
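/*
 * Note on the pattern running through this file: iterator position
 * checks now go through btree_iter_search_key(), which returns
 * bkey_successor(iter->pos) for extent iterators (an extent's bkey
 * position is its end, so the first key of interest is the first one
 * at or after successor(pos)) and iter->pos itself otherwise. The old
 * tri-state btree_iter_pos_cmp(iter, b, k) > 0 thus collapses to the
 * plain comparison bkey_iter_pos_cmp(b, k, &search) >= 0. An
 * illustrative helper showing the new convention (not part of this
 * commit):
 */
static inline bool key_at_or_after_iter(struct btree_iter *iter,
					struct btree *b,
					const struct bkey_packed *k)
{
	struct bpos search = btree_iter_search_key(iter);

	return bkey_iter_pos_cmp(b, k, &search) >= 0;
}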
} - /* hole */ /* holes can't span inode numbers: */ @@ -1797,10 +1750,9 @@ int bch2_trans_iter_free(struct btree_trans *trans, static int bch2_trans_realloc_iters(struct btree_trans *trans, unsigned new_size) { - void *new_iters, *new_updates, *new_sorted; + void *new_iters, *new_updates; size_t iters_bytes; size_t updates_bytes; - size_t sorted_bytes; new_size = roundup_pow_of_two(new_size); @@ -1814,12 +1766,9 @@ static int bch2_trans_realloc_iters(struct btree_trans *trans, bch2_trans_unlock(trans); iters_bytes = sizeof(struct btree_iter) * new_size; - updates_bytes = sizeof(struct btree_insert_entry) * (new_size + 4); - sorted_bytes = sizeof(u8) * (new_size + 4); + updates_bytes = sizeof(struct btree_insert_entry) * new_size; - new_iters = kmalloc(iters_bytes + - updates_bytes + - sorted_bytes, GFP_NOFS); + new_iters = kmalloc(iters_bytes + updates_bytes, GFP_NOFS); if (new_iters) goto success; @@ -1829,7 +1778,6 @@ static int bch2_trans_realloc_iters(struct btree_trans *trans, trans->used_mempool = true; success: new_updates = new_iters + iters_bytes; - new_sorted = new_updates + updates_bytes; memcpy(new_iters, trans->iters, sizeof(struct btree_iter) * trans->nr_iters); @@ -1846,7 +1794,6 @@ success: trans->iters = new_iters; trans->updates = new_updates; - trans->updates_sorted = new_sorted; trans->size = new_size; if (trans->iters_live) { @@ -1895,6 +1842,7 @@ static struct btree_iter *btree_trans_iter_alloc(struct btree_trans *trans) got_slot: BUG_ON(trans->iters_linked & (1ULL << idx)); trans->iters_linked |= 1ULL << idx; + trans->iters[idx].flags = 0; return &trans->iters[idx]; } @@ -1910,6 +1858,9 @@ static inline void btree_iter_copy(struct btree_iter *dst, if (btree_node_locked(dst, i)) six_lock_increment(&dst->l[i].b->lock, __btree_lock_want(dst, i)); + + dst->flags &= ~BTREE_ITER_KEEP_UNTIL_COMMIT; + dst->flags &= ~BTREE_ITER_SET_POS_AFTER_COMMIT; } static inline struct bpos bpos_diff(struct bpos l, struct bpos r) @@ -1960,7 +1911,6 @@ static struct btree_iter *__btree_trans_get_iter(struct btree_trans *trans, iter = best; } - iter->flags &= ~BTREE_ITER_KEEP_UNTIL_COMMIT; iter->flags &= ~(BTREE_ITER_SLOTS|BTREE_ITER_INTENT|BTREE_ITER_PREFETCH); iter->flags |= flags & (BTREE_ITER_SLOTS|BTREE_ITER_INTENT|BTREE_ITER_PREFETCH); @@ -1972,6 +1922,7 @@ static struct btree_iter *__btree_trans_get_iter(struct btree_trans *trans, BUG_ON(iter->btree_id != btree_id); BUG_ON((iter->flags ^ flags) & BTREE_ITER_TYPE); BUG_ON(iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT); + BUG_ON(iter->flags & BTREE_ITER_SET_POS_AFTER_COMMIT); BUG_ON(trans->iters_live & (1ULL << iter->idx)); trans->iters_live |= 1ULL << iter->idx; @@ -2034,7 +1985,6 @@ struct btree_iter *bch2_trans_copy_iter(struct btree_trans *trans, * it's cheap to copy it again: */ trans->iters_touched &= ~(1ULL << iter->idx); - iter->flags &= ~BTREE_ITER_KEEP_UNTIL_COMMIT; return iter; } @@ -2094,7 +2044,8 @@ void bch2_trans_reset(struct btree_trans *trans, unsigned flags) struct btree_iter *iter; trans_for_each_iter(trans, iter) - iter->flags &= ~BTREE_ITER_KEEP_UNTIL_COMMIT; + iter->flags &= ~(BTREE_ITER_KEEP_UNTIL_COMMIT| + BTREE_ITER_SET_POS_AFTER_COMMIT); bch2_trans_unlink_iters(trans); @@ -2103,12 +2054,21 @@ void bch2_trans_reset(struct btree_trans *trans, unsigned flags) trans->iters_touched &= trans->iters_live; + trans->need_reset = 0; trans->nr_updates = 0; if (flags & TRANS_RESET_MEM) trans->mem_top = 0; - bch2_btree_iter_traverse_all(trans); + if (trans->fs_usage_deltas) { + trans->fs_usage_deltas->used = 0; + 
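/*
 * The fs_usage_deltas reset below relies on a marker-member idiom: the
 * struct brackets everything that should be zeroed between empty
 * memset_start/memset_end members, so a single memset() clears all the
 * counters at once. A sketch of the layout this assumes (illustrative
 * only; the real replicas_delta_list definition is not part of this
 * diff):
 *
 *	struct replicas_delta_list {
 *		unsigned	size;
 *		unsigned	used;		// cleared explicitly above
 *		struct {}	memset_start;
 *		u64		counters[8];	// everything in between
 *		struct {}	memset_end;
 *	};
 *
 * Empty structs occupy zero bytes in GNU C, so the markers cost
 * nothing at runtime.
 */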
memset(&trans->fs_usage_deltas->memset_start, 0, + (void *) &trans->fs_usage_deltas->memset_end - + (void *) &trans->fs_usage_deltas->memset_start); + } + + if (!(flags & TRANS_RESET_NOTRAVERSE)) + bch2_btree_iter_traverse_all(trans); } void bch2_trans_init(struct btree_trans *trans, struct bch_fs *c, @@ -2122,7 +2082,6 @@ void bch2_trans_init(struct btree_trans *trans, struct bch_fs *c, trans->size = ARRAY_SIZE(trans->iters_onstack); trans->iters = trans->iters_onstack; trans->updates = trans->updates_onstack; - trans->updates_sorted = trans->updates_sorted_onstack; trans->fs_usage_deltas = NULL; if (expected_nr_iters > trans->size) @@ -2159,6 +2118,6 @@ int bch2_fs_btree_iter_init(struct bch_fs *c) return mempool_init_kmalloc_pool(&c->btree_iters_pool, 1, sizeof(struct btree_iter) * nr + - sizeof(struct btree_insert_entry) * (nr + 4) + - sizeof(u8) * (nr + 4)); + sizeof(struct btree_insert_entry) * nr + + sizeof(u8) * nr); } diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index 4c5032222319..962380925511 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -291,6 +291,7 @@ struct btree_iter *bch2_trans_get_node_iter(struct btree_trans *, #define TRANS_RESET_ITERS (1 << 0) #define TRANS_RESET_MEM (1 << 1) +#define TRANS_RESET_NOTRAVERSE (1 << 2) void bch2_trans_reset(struct btree_trans *, unsigned); @@ -299,11 +300,6 @@ static inline void bch2_trans_begin(struct btree_trans *trans) return bch2_trans_reset(trans, TRANS_RESET_ITERS|TRANS_RESET_MEM); } -static inline void bch2_trans_begin_updates(struct btree_trans *trans) -{ - return bch2_trans_reset(trans, TRANS_RESET_MEM); -} - void *bch2_trans_kmalloc(struct btree_trans *, size_t); void bch2_trans_init(struct btree_trans *, struct bch_fs *, unsigned, size_t); int bch2_trans_exit(struct btree_trans *); diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index efa68bb578ab..b7af88e05837 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -94,7 +94,6 @@ struct btree { struct btree_nr_keys nr; u16 sib_u64s[2]; u16 whiteout_u64s; - u16 uncompacted_whiteout_u64s; u8 page_order; u8 unpack_fn_len; @@ -185,9 +184,25 @@ enum btree_iter_type { #define BTREE_ITER_TYPE ((1 << 2) - 1) +/* + * Iterate over all possible positions, synthesizing deleted keys for holes: + */ #define BTREE_ITER_SLOTS (1 << 2) +/* + * Indicates that intent locks should be taken on leaf nodes, because we expect + * to be doing updates: + */ #define BTREE_ITER_INTENT (1 << 3) +/* + * Causes the btree iterator code to prefetch additional btree nodes from disk: + */ #define BTREE_ITER_PREFETCH (1 << 4) +/* + * Indicates that this iterator should not be reused until transaction commit, + * either because a pending update references it or because the update depends + * on that particular key being locked (e.g. 
by the str_hash code, for hash + * table consistency) + */ #define BTREE_ITER_KEEP_UNTIL_COMMIT (1 << 5) /* * Used in bch2_btree_iter_traverse(), to indicate whether we're searching for @@ -195,6 +210,7 @@ enum btree_iter_type { */ #define BTREE_ITER_IS_EXTENTS (1 << 6) #define BTREE_ITER_ERROR (1 << 7) +#define BTREE_ITER_SET_POS_AFTER_COMMIT (1 << 8) enum btree_iter_uptodate { BTREE_ITER_UPTODATE = 0, @@ -211,12 +227,13 @@ enum btree_iter_uptodate { * @nodes_intent_locked - bitmask indicating which locks are intent locks */ struct btree_iter { - u8 idx; - struct btree_trans *trans; struct bpos pos; + struct bpos pos_after_commit; + + u16 flags; + u8 idx; - u8 flags; enum btree_iter_uptodate uptodate:4; enum btree_id btree_id:4; unsigned level:4, @@ -243,6 +260,8 @@ static inline enum btree_iter_type btree_iter_type(struct btree_iter *iter) } struct btree_insert_entry { + unsigned trigger_flags; + unsigned trans_triggers_run:1; struct bkey_i *k; struct btree_iter *iter; }; @@ -263,6 +282,7 @@ struct btree_trans { unsigned used_mempool:1; unsigned error:1; unsigned nounlock:1; + unsigned need_reset:1; unsigned mem_top; unsigned mem_bytes; @@ -270,7 +290,6 @@ struct btree_trans { struct btree_iter *iters; struct btree_insert_entry *updates; - u8 *updates_sorted; /* update path: */ struct journal_res journal_res; @@ -279,11 +298,11 @@ struct btree_trans { struct disk_reservation *disk_res; unsigned flags; unsigned journal_u64s; + unsigned journal_preres_u64s; struct replicas_delta_list *fs_usage_deltas; struct btree_iter iters_onstack[2]; - struct btree_insert_entry updates_onstack[6]; - u8 updates_sorted_onstack[6]; + struct btree_insert_entry updates_onstack[2]; }; #define BTREE_FLAG(flag) \ @@ -308,6 +327,7 @@ enum btree_flags { BTREE_NODE_just_written, BTREE_NODE_dying, BTREE_NODE_fake, + BTREE_NODE_old_extent_overwrite, }; BTREE_FLAG(read_in_flight); @@ -321,6 +341,7 @@ BTREE_FLAG(write_in_flight); BTREE_FLAG(just_written); BTREE_FLAG(dying); BTREE_FLAG(fake); +BTREE_FLAG(old_extent_overwrite); static inline struct btree_write *btree_current_write(struct btree *b) { @@ -421,6 +442,11 @@ static inline unsigned bset_u64s(struct bset_tree *t) sizeof(struct bset) / sizeof(u64); } +static inline unsigned bset_dead_u64s(struct btree *b, struct bset_tree *t) +{ + return bset_u64s(t) - b->nr.bset_u64s[t - b->set]; +} + static inline unsigned bset_byte_offset(struct btree *b, void *i) { return i - (void *) b->data; @@ -474,6 +500,32 @@ static inline bool btree_node_is_extents(struct btree *b) (1U << BKEY_TYPE_INODES)| \ (1U << BKEY_TYPE_REFLINK)) +enum btree_trigger_flags { + __BTREE_TRIGGER_NORUN, /* Don't run triggers at all */ + __BTREE_TRIGGER_NOOVERWRITES, /* Don't run triggers on overwrites */ + + __BTREE_TRIGGER_INSERT, + __BTREE_TRIGGER_OVERWRITE, + __BTREE_TRIGGER_OVERWRITE_SPLIT, + + __BTREE_TRIGGER_GC, + __BTREE_TRIGGER_BUCKET_INVALIDATE, + __BTREE_TRIGGER_ALLOC_READ, + __BTREE_TRIGGER_NOATOMIC, +}; + +#define BTREE_TRIGGER_NORUN (1U << __BTREE_TRIGGER_NORUN) +#define BTREE_TRIGGER_NOOVERWRITES (1U << __BTREE_TRIGGER_NOOVERWRITES) + +#define BTREE_TRIGGER_INSERT (1U << __BTREE_TRIGGER_INSERT) +#define BTREE_TRIGGER_OVERWRITE (1U << __BTREE_TRIGGER_OVERWRITE) +#define BTREE_TRIGGER_OVERWRITE_SPLIT (1U << __BTREE_TRIGGER_OVERWRITE_SPLIT) + +#define BTREE_TRIGGER_GC (1U << __BTREE_TRIGGER_GC) +#define BTREE_TRIGGER_BUCKET_INVALIDATE (1U << __BTREE_TRIGGER_BUCKET_INVALIDATE) +#define BTREE_TRIGGER_ALLOC_READ (1U << __BTREE_TRIGGER_ALLOC_READ) +#define BTREE_TRIGGER_NOATOMIC (1U << 
__BTREE_TRIGGER_NOATOMIC) + static inline bool btree_node_type_needs_gc(enum btree_node_type type) { return BTREE_NODE_TYPE_HAS_TRIGGERS & (1U << type); diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h index ad8cbf3fb778..2c34bae64281 100644 --- a/fs/bcachefs/btree_update.h +++ b/fs/bcachefs/btree_update.h @@ -15,8 +15,7 @@ bool bch2_btree_bset_insert_key(struct btree_iter *, struct btree *, void bch2_btree_journal_key(struct btree_trans *, struct btree_iter *, struct bkey_i *); -enum { - __BTREE_INSERT_ATOMIC, +enum btree_insert_flags { __BTREE_INSERT_NOUNLOCK, __BTREE_INSERT_NOFAIL, __BTREE_INSERT_NOCHECK_RW, @@ -25,10 +24,6 @@ enum { __BTREE_INSERT_USE_ALLOC_RESERVE, __BTREE_INSERT_JOURNAL_REPLAY, __BTREE_INSERT_JOURNAL_RESERVED, - __BTREE_INSERT_NOMARK_OVERWRITES, - __BTREE_INSERT_NOMARK, - __BTREE_INSERT_NO_CLEAR_REPLICAS, - __BTREE_INSERT_BUCKET_INVALIDATE, __BTREE_INSERT_NOWAIT, __BTREE_INSERT_GC_LOCK_HELD, __BCH_HASH_SET_MUST_CREATE, @@ -36,12 +31,6 @@ enum { }; /* - * Don't drop/retake locks before doing btree update, instead return -EINTR if - * we had to drop locks for any reason - */ -#define BTREE_INSERT_ATOMIC (1 << __BTREE_INSERT_ATOMIC) - -/* * Don't drop locks _after_ successfully updating btree: */ #define BTREE_INSERT_NOUNLOCK (1 << __BTREE_INSERT_NOUNLOCK) @@ -61,16 +50,6 @@ enum { #define BTREE_INSERT_JOURNAL_RESERVED (1 << __BTREE_INSERT_JOURNAL_RESERVED) -/* Don't mark overwrites, just new key: */ -#define BTREE_INSERT_NOMARK_OVERWRITES (1 << __BTREE_INSERT_NOMARK_OVERWRITES) - -/* Don't call mark new key at all: */ -#define BTREE_INSERT_NOMARK (1 << __BTREE_INSERT_NOMARK) - -#define BTREE_INSERT_NO_CLEAR_REPLICAS (1 << __BTREE_INSERT_NO_CLEAR_REPLICAS) - -#define BTREE_INSERT_BUCKET_INVALIDATE (1 << __BTREE_INSERT_BUCKET_INVALIDATE) - /* Don't block on allocation failure (for new btree nodes: */ #define BTREE_INSERT_NOWAIT (1 << __BTREE_INSERT_NOWAIT) #define BTREE_INSERT_GC_LOCK_HELD (1 << __BTREE_INSERT_GC_LOCK_HELD) @@ -93,6 +72,8 @@ int bch2_btree_node_rewrite(struct bch_fs *c, struct btree_iter *, int bch2_btree_node_update_key(struct bch_fs *, struct btree_iter *, struct btree *, struct bkey_i_btree_ptr *); +int bch2_trans_update(struct btree_trans *, struct btree_iter *, + struct bkey_i *, enum btree_trigger_flags); int __bch2_trans_commit(struct btree_trans *); /** @@ -101,8 +82,7 @@ int __bch2_trans_commit(struct btree_trans *); * This is main entry point for btree updates. * * Return values: - * -EINTR: locking changed, this function should be called again. Only returned - * if passed BTREE_INSERT_ATOMIC. + * -EINTR: locking changed, this function should be called again. 
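 *	    With BTREE_INSERT_ATOMIC gone, -EINTR handling is no longer
 *	    opt-in: every commit is atomic, and callers are expected to
 *	    re-run the whole transaction. A sketch of the canonical loop,
 *	    where my_updates() stands in for the caller's update logic:
 *
 *		bch2_trans_init(&trans, c, 0, 0);
 *		do {
 *			bch2_trans_begin(&trans);
 *			ret = my_updates(&trans) ?:
 *			      bch2_trans_commit(&trans, NULL, NULL, 0);
 *		} while (ret == -EINTR);
 *		ret = bch2_trans_exit(&trans) ?: ret;
 *
 *	    This is exactly what the __bch2_trans_do()/bch2_trans_do()
 *	    macros below package up.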
* -EROFS: filesystem read only * -EIO: journal or btree node IO error */ @@ -118,37 +98,34 @@ static inline int bch2_trans_commit(struct btree_trans *trans, return __bch2_trans_commit(trans); } -static inline void bch2_trans_update(struct btree_trans *trans, - struct btree_iter *iter, - struct bkey_i *k) -{ - EBUG_ON(trans->nr_updates >= trans->nr_iters + 4); - - iter->flags |= BTREE_ITER_KEEP_UNTIL_COMMIT; - - trans->updates[trans->nr_updates++] = (struct btree_insert_entry) { - .iter = iter, .k = k - }; -} - -#define bch2_trans_do(_c, _journal_seq, _flags, _do) \ +#define __bch2_trans_do(_trans, _disk_res, _journal_seq, \ + _flags, _reset_flags, _do) \ ({ \ - struct btree_trans trans; \ int _ret; \ \ - bch2_trans_init(&trans, (_c), 0, 0); \ - \ do { \ - bch2_trans_begin(&trans); \ + bch2_trans_reset(_trans, _reset_flags); \ \ - _ret = (_do) ?: bch2_trans_commit(&trans, NULL, \ + _ret = (_do) ?: bch2_trans_commit(_trans, (_disk_res), \ (_journal_seq), (_flags)); \ } while (_ret == -EINTR); \ \ - bch2_trans_exit(&trans); \ _ret; \ }) +#define bch2_trans_do(_c, _disk_res, _journal_seq, _flags, _do) \ +({ \ + struct btree_trans trans; \ + int _ret, _ret2; \ + \ + bch2_trans_init(&trans, (_c), 0, 0); \ + _ret = __bch2_trans_do(&trans, _disk_res, _journal_seq, _flags, \ + TRANS_RESET_MEM|TRANS_RESET_ITERS, _do); \ + _ret2 = bch2_trans_exit(&trans); \ + \ + _ret ?: _ret2; \ +}) + #define trans_for_each_update(_trans, _i) \ for ((_i) = (_trans)->updates; \ (_i) < (_trans)->updates + (_trans)->nr_updates; \ diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index f8a30cb34750..748e6356f3d6 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -193,8 +193,8 @@ found: gc_pos_cmp(c->gc_pos, gc_phase(GC_PHASE_PENDING_DELETE)) < 0) bch2_mark_key_locked(c, bkey_i_to_s_c(&d->key), 0, 0, NULL, 0, - BCH_BUCKET_MARK_OVERWRITE| - BCH_BUCKET_MARK_GC); + BTREE_TRIGGER_OVERWRITE| + BTREE_TRIGGER_GC); } static void __btree_node_free(struct bch_fs *c, struct btree *b) @@ -265,13 +265,13 @@ static void bch2_btree_node_free_ondisk(struct bch_fs *c, BUG_ON(!pending->index_update_done); bch2_mark_key(c, bkey_i_to_s_c(&pending->key), - 0, 0, NULL, 0, BCH_BUCKET_MARK_OVERWRITE); + 0, 0, NULL, 0, BTREE_TRIGGER_OVERWRITE); if (gc_visited(c, gc_phase(GC_PHASE_PENDING_DELETE))) bch2_mark_key(c, bkey_i_to_s_c(&pending->key), 0, 0, NULL, 0, - BCH_BUCKET_MARK_OVERWRITE| - BCH_BUCKET_MARK_GC); + BTREE_TRIGGER_OVERWRITE| + BTREE_TRIGGER_GC); } static struct btree *__bch2_btree_node_alloc(struct bch_fs *c, @@ -374,6 +374,13 @@ static struct btree *bch2_btree_node_alloc(struct btree_update *as, unsigned lev SET_BTREE_NODE_LEVEL(b->data, level); b->data->ptr = bkey_i_to_btree_ptr(&b->key)->v.start[0]; + if (c->sb.features & (1ULL << BCH_FEATURE_new_extent_overwrite)) + SET_BTREE_NODE_NEW_EXTENT_OVERWRITE(b->data, true); + + if (btree_node_is_extents(b) && + !BTREE_NODE_NEW_EXTENT_OVERWRITE(b->data)) + set_btree_node_old_extent_overwrite(b); + bch2_btree_build_aux_trees(b); btree_node_will_make_reachable(as, b); @@ -1077,12 +1084,12 @@ static void bch2_btree_set_root_inmem(struct btree_update *as, struct btree *b) bch2_mark_key_locked(c, bkey_i_to_s_c(&b->key), 0, 0, fs_usage, 0, - BCH_BUCKET_MARK_INSERT); + BTREE_TRIGGER_INSERT); if (gc_visited(c, gc_pos_btree_root(b->btree_id))) bch2_mark_key_locked(c, bkey_i_to_s_c(&b->key), 0, 0, NULL, 0, - BCH_BUCKET_MARK_INSERT| - BCH_BUCKET_MARK_GC); + BTREE_TRIGGER_INSERT| + BTREE_TRIGGER_GC); if (old && 
!btree_node_fake(old)) bch2_btree_node_free_index(as, NULL, @@ -1175,16 +1182,16 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as, struct btree *b bch2_mark_key_locked(c, bkey_i_to_s_c(insert), 0, 0, fs_usage, 0, - BCH_BUCKET_MARK_INSERT); + BTREE_TRIGGER_INSERT); if (gc_visited(c, gc_pos_btree_node(b))) bch2_mark_key_locked(c, bkey_i_to_s_c(insert), 0, 0, NULL, 0, - BCH_BUCKET_MARK_INSERT| - BCH_BUCKET_MARK_GC); + BTREE_TRIGGER_INSERT| + BTREE_TRIGGER_GC); while ((k = bch2_btree_node_iter_peek_all(node_iter, b)) && - bkey_iter_pos_cmp(b, &insert->k.p, k) > 0) + bkey_iter_pos_cmp(b, k, &insert->k.p) < 0) bch2_btree_node_iter_advance(node_iter, b); /* @@ -1378,7 +1385,7 @@ static void btree_split(struct btree_update *as, struct btree *b, if (keys) btree_split_insert_keys(as, n1, iter, keys); - if (vstruct_blocks(n1->data, c->block_bits) > BTREE_SPLIT_THRESHOLD(c)) { + if (bset_u64s(&n1->set[0]) > BTREE_SPLIT_THRESHOLD(c)) { trace_btree_split(c, b); n2 = __btree_split_node(as, n1, iter); @@ -1657,6 +1664,8 @@ void __bch2_foreground_maybe_merge(struct bch_fs *c, size_t sib_u64s; int ret = 0; + BUG_ON(!btree_node_locked(iter, level)); + closure_init_stack(&cl); retry: BUG_ON(!btree_node_locked(iter, level)); @@ -2022,12 +2031,12 @@ static void __bch2_btree_node_update_key(struct bch_fs *c, bch2_mark_key_locked(c, bkey_i_to_s_c(&new_key->k_i), 0, 0, fs_usage, 0, - BCH_BUCKET_MARK_INSERT); + BTREE_TRIGGER_INSERT); if (gc_visited(c, gc_pos_btree_root(b->btree_id))) bch2_mark_key_locked(c, bkey_i_to_s_c(&new_key->k_i), 0, 0, NULL, 0, - BCH_BUCKET_MARK_INSERT|| - BCH_BUCKET_MARK_GC); + BTREE_TRIGGER_INSERT|| + BTREE_TRIGGER_GC); bch2_btree_node_free_index(as, NULL, bkey_i_to_s_c(&b->key), diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h index c5a0ab5d7bb8..2d8e0b7f3aaf 100644 --- a/fs/bcachefs/btree_update_interior.h +++ b/fs/bcachefs/btree_update_interior.h @@ -251,8 +251,7 @@ static inline ssize_t __bch_btree_u64s_remaining(struct bch_fs *c, void *end) { ssize_t used = bset_byte_offset(b, end) / sizeof(u64) + - b->whiteout_u64s + - b->uncompacted_whiteout_u64s; + b->whiteout_u64s; ssize_t total = c->opts.btree_node_size << 6; return total - used; @@ -302,23 +301,19 @@ static inline struct btree_node_entry *want_new_bset(struct bch_fs *c, return NULL; } -static inline void unreserve_whiteout(struct btree *b, struct bkey_packed *k) +static inline void push_whiteout(struct bch_fs *c, struct btree *b, + struct bkey_packed *k) { - if (bkey_written(b, k)) { - EBUG_ON(b->uncompacted_whiteout_u64s < - bkeyp_key_u64s(&b->format, k)); - b->uncompacted_whiteout_u64s -= - bkeyp_key_u64s(&b->format, k); - } -} + unsigned u64s = bkeyp_key_u64s(&b->format, k); + struct bkey_packed *dst; -static inline void reserve_whiteout(struct btree *b, struct bkey_packed *k) -{ - if (bkey_written(b, k)) { - BUG_ON(!k->needs_whiteout); - b->uncompacted_whiteout_u64s += - bkeyp_key_u64s(&b->format, k); - } + BUG_ON(u64s > bch_btree_keys_u64s_remaining(c, b)); + + b->whiteout_u64s += bkeyp_key_u64s(&b->format, k); + dst = unwritten_whiteouts_start(c, b); + memcpy_u64s(dst, k, u64s); + dst->u64s = u64s; + dst->type = KEY_TYPE_deleted; } /* diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index d37a95299240..afd2086edeff 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -21,18 +21,12 @@ #include <trace/events/bcachefs.h> static inline bool same_leaf_as_prev(struct btree_trans *trans, - unsigned idx) + struct 
btree_insert_entry *i) { - return idx && - trans->updates[trans->updates_sorted[idx]].iter->l[0].b == - trans->updates[trans->updates_sorted[idx - 1]].iter->l[0].b; + return i != trans->updates && + i[0].iter->l[0].b == i[-1].iter->l[0].b; } -#define trans_for_each_update_sorted(_trans, _i, _iter) \ - for (_iter = 0; \ - _iter < _trans->nr_updates && \ - (_i = _trans->updates + _trans->updates_sorted[_iter], 1); \ - _iter++) inline void bch2_btree_node_lock_for_insert(struct bch_fs *c, struct btree *b, struct btree_iter *iter) @@ -51,28 +45,6 @@ inline void bch2_btree_node_lock_for_insert(struct bch_fs *c, struct btree *b, bch2_btree_init_next(c, b, iter); } -static inline void btree_trans_sort_updates(struct btree_trans *trans) -{ - struct btree_insert_entry *l, *r; - unsigned nr = 0, pos; - - trans_for_each_update(trans, l) { - for (pos = 0; pos < nr; pos++) { - r = trans->updates + trans->updates_sorted[pos]; - - if (btree_iter_cmp(l->iter, r->iter) <= 0) - break; - } - - memmove(&trans->updates_sorted[pos + 1], - &trans->updates_sorted[pos], - (nr - pos) * sizeof(trans->updates_sorted[0])); - - trans->updates_sorted[pos] = l - trans->updates; - nr++; - } -} - /* Inserting into a given leaf node (last stage of insert): */ /* Handle overwrites and do insert, for non extents: */ @@ -92,58 +64,63 @@ bool bch2_btree_bset_insert_key(struct btree_iter *iter, bkey_cmp(insert->k.p, b->data->max_key) > 0); k = bch2_btree_node_iter_peek_all(node_iter, b); - if (k && !bkey_cmp_packed(b, k, &insert->k)) { - BUG_ON(bkey_whiteout(k)); + if (k && bkey_cmp_packed(b, k, &insert->k)) + k = NULL; - if (!bkey_written(b, k) && - bkey_val_u64s(&insert->k) == bkeyp_val_u64s(f, k) && - !bkey_whiteout(&insert->k)) { - k->type = insert->k.type; - memcpy_u64s(bkeyp_val(f, k), &insert->v, - bkey_val_u64s(&insert->k)); - return true; - } + /* @k is the key being overwritten/deleted, if any: */ - insert->k.needs_whiteout = k->needs_whiteout; + EBUG_ON(k && bkey_whiteout(k)); + + if (bkey_whiteout(&insert->k)) { + /* Deleting: */ + + /* Not found? 
Nothing to do: */ + if (!k) + return false; btree_account_key_drop(b, k); + k->type = KEY_TYPE_deleted; + + if (k->needs_whiteout) { + push_whiteout(iter->trans->c, b, k); + k->needs_whiteout = false; + } if (k >= btree_bset_last(b)->start) { clobber_u64s = k->u64s; - /* - * If we're deleting, and the key we're deleting doesn't - * need a whiteout (it wasn't overwriting a key that had - * been written to disk) - just delete it: - */ - if (bkey_whiteout(&insert->k) && !k->needs_whiteout) { - bch2_bset_delete(b, k, clobber_u64s); - bch2_btree_node_iter_fix(iter, b, node_iter, - k, clobber_u64s, 0); - return true; - } + bch2_bset_delete(b, k, clobber_u64s); + bch2_btree_node_iter_fix(iter, b, node_iter, k, + clobber_u64s, 0); + } else { + bch2_btree_iter_fix_key_modified(iter, b, k); + } - goto overwrite; + return true; + } + + if (k) { + /* Overwriting: */ + if (!bkey_written(b, k) && + bkey_val_u64s(&insert->k) == bkeyp_val_u64s(f, k)) { + k->type = insert->k.type; + memcpy_u64s(bkeyp_val(f, k), &insert->v, + bkey_val_u64s(&insert->k)); + return true; } + btree_account_key_drop(b, k); k->type = KEY_TYPE_deleted; - bch2_btree_node_iter_fix(iter, b, node_iter, k, - k->u64s, k->u64s); - if (bkey_whiteout(&insert->k)) { - reserve_whiteout(b, k); - return true; + insert->k.needs_whiteout = k->needs_whiteout; + k->needs_whiteout = false; + + if (k >= btree_bset_last(b)->start) { + clobber_u64s = k->u64s; + goto overwrite; } else { - k->needs_whiteout = false; + bch2_btree_iter_fix_key_modified(iter, b, k); } - } else { - /* - * Deleting, but the key to delete wasn't found - nothing to do: - */ - if (bkey_whiteout(&insert->k)) - return false; - - insert->k.needs_whiteout = false; } k = bch2_btree_node_iter_bset_pos(node_iter, b, bset_tree_last(b)); @@ -234,38 +211,39 @@ void bch2_btree_journal_key(struct btree_trans *trans, } static void bch2_insert_fixup_key(struct btree_trans *trans, - struct btree_insert_entry *insert) + struct btree_iter *iter, + struct bkey_i *insert) { - struct btree_iter *iter = insert->iter; struct btree_iter_level *l = &iter->l[0]; EBUG_ON(iter->level); - EBUG_ON(insert->k->k.u64s > + EBUG_ON(insert->k.u64s > bch_btree_keys_u64s_remaining(trans->c, l->b)); - if (likely(bch2_btree_bset_insert_key(iter, l->b, &l->iter, - insert->k))) - bch2_btree_journal_key(trans, iter, insert->k); + if (likely(bch2_btree_bset_insert_key(iter, l->b, &l->iter, insert))) + bch2_btree_journal_key(trans, iter, insert); } /** * btree_insert_key - insert a key one key into a leaf node */ static void btree_insert_key_leaf(struct btree_trans *trans, - struct btree_insert_entry *insert) + struct btree_iter *iter, + struct bkey_i *insert) { struct bch_fs *c = trans->c; - struct btree_iter *iter = insert->iter; struct btree *b = iter->l[0].b; struct bset_tree *t = bset_tree_last(b); int old_u64s = bset_u64s(t); int old_live_u64s = b->nr.live_u64s; int live_u64s_added, u64s_added; + insert->k.needs_whiteout = false; + if (!btree_node_is_extents(b)) - bch2_insert_fixup_key(trans, insert); + bch2_insert_fixup_key(trans, iter, insert); else - bch2_insert_fixup_extent(trans, insert); + bch2_insert_fixup_extent(trans, iter, insert); live_u64s_added = (int) b->nr.live_u64s - old_live_u64s; u64s_added = (int) bset_u64s(t) - old_u64s; @@ -279,26 +257,25 @@ static void btree_insert_key_leaf(struct btree_trans *trans, bch2_maybe_compact_whiteouts(c, b)) bch2_btree_iter_reinit_node(iter, b); - trace_btree_insert_key(c, b, insert->k); + trace_btree_insert_key(c, b, insert); } /* Normal update interface: */ static 
inline void btree_insert_entry_checks(struct btree_trans *trans, - struct btree_insert_entry *i) + struct btree_iter *iter, + struct bkey_i *insert) { struct bch_fs *c = trans->c; - BUG_ON(i->iter->level); - BUG_ON(bkey_cmp(bkey_start_pos(&i->k->k), i->iter->pos)); - EBUG_ON((i->iter->flags & BTREE_ITER_IS_EXTENTS) && - bkey_cmp(i->k->k.p, i->iter->l[0].b->key.k.p) > 0); - EBUG_ON((i->iter->flags & BTREE_ITER_IS_EXTENTS) && - !(trans->flags & BTREE_INSERT_ATOMIC)); + BUG_ON(iter->level); + BUG_ON(bkey_cmp(bkey_start_pos(&insert->k), iter->pos)); + EBUG_ON((iter->flags & BTREE_ITER_IS_EXTENTS) && + bkey_cmp(insert->k.p, iter->l[0].b->key.k.p) > 0); BUG_ON(debug_check_bkeys(c) && - !bkey_deleted(&i->k->k) && - bch2_bkey_invalid(c, bkey_i_to_s_c(i->k), i->iter->btree_id)); + !bkey_deleted(&insert->k) && + bch2_bkey_invalid(c, bkey_i_to_s_c(insert), iter->btree_id)); } static noinline int @@ -339,11 +316,12 @@ static inline int bch2_trans_journal_res_get(struct btree_trans *trans, static enum btree_insert_ret btree_key_can_insert(struct btree_trans *trans, - struct btree_insert_entry *insert, + struct btree_iter *iter, + struct bkey_i *insert, unsigned *u64s) { struct bch_fs *c = trans->c; - struct btree *b = insert->iter->l[0].b; + struct btree *b = iter->l[0].b; static enum btree_insert_ret ret; if (unlikely(btree_node_fake(b))) @@ -351,7 +329,7 @@ btree_key_can_insert(struct btree_trans *trans, ret = !btree_node_is_extents(b) ? BTREE_INSERT_OK - : bch2_extent_can_insert(trans, insert, u64s); + : bch2_extent_can_insert(trans, iter, insert, u64s); if (ret) return ret; @@ -362,21 +340,22 @@ btree_key_can_insert(struct btree_trans *trans, } static inline void do_btree_insert_one(struct btree_trans *trans, - struct btree_insert_entry *insert) + struct btree_iter *iter, + struct bkey_i *insert) { - btree_insert_key_leaf(trans, insert); + btree_insert_key_leaf(trans, iter, insert); } -static inline bool update_has_trans_triggers(struct btree_insert_entry *i) +static inline bool iter_has_trans_triggers(struct btree_iter *iter) { - return BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS & (1U << i->iter->btree_id); + return BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS & (1U << iter->btree_id); } -static inline bool update_has_nontrans_triggers(struct btree_insert_entry *i) +static inline bool iter_has_nontrans_triggers(struct btree_iter *iter) { return (BTREE_NODE_TYPE_HAS_TRIGGERS & ~BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS) & - (1U << i->iter->btree_id); + (1U << iter->btree_id); } static noinline void bch2_btree_iter_unlock_noinline(struct btree_iter *iter) @@ -388,17 +367,11 @@ static noinline void bch2_trans_mark_gc(struct btree_trans *trans) { struct bch_fs *c = trans->c; struct btree_insert_entry *i; - unsigned mark_flags = trans->flags & BTREE_INSERT_BUCKET_INVALIDATE - ? BCH_BUCKET_MARK_BUCKET_INVALIDATE - : 0; - - if (unlikely(trans->flags & BTREE_INSERT_NOMARK)) - return; trans_for_each_update(trans, i) if (gc_visited(c, gc_pos_btree_node(i->iter->l[0].b))) - bch2_mark_update(trans, i, NULL, - mark_flags|BCH_BUCKET_MARK_GC); + bch2_mark_update(trans, i->iter, i->k, NULL, + i->trigger_flags|BTREE_TRIGGER_GC); } static inline int @@ -408,10 +381,7 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, struct bch_fs *c = trans->c; struct bch_fs_usage *fs_usage = NULL; struct btree_insert_entry *i; - unsigned mark_flags = trans->flags & BTREE_INSERT_BUCKET_INVALIDATE - ? 
BCH_BUCKET_MARK_BUCKET_INVALIDATE - : 0; - unsigned iter, u64s = 0; + unsigned u64s = 0; bool marking = false; int ret; @@ -428,13 +398,13 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, prefetch(&trans->c->journal.flags); - trans_for_each_update_sorted(trans, i, iter) { + trans_for_each_update(trans, i) { /* Multiple inserts might go to same leaf: */ - if (!same_leaf_as_prev(trans, iter)) + if (!same_leaf_as_prev(trans, i)) u64s = 0; u64s += i->k->k.u64s; - ret = btree_key_can_insert(trans, i, &u64s); + ret = btree_key_can_insert(trans, i->iter, i->k, &u64s); if (ret) { *stopped_at = i; return ret; @@ -483,9 +453,9 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, } trans_for_each_update(trans, i) - if (likely(!(trans->flags & BTREE_INSERT_NOMARK)) && - update_has_nontrans_triggers(i)) - bch2_mark_update(trans, i, fs_usage, mark_flags); + if (iter_has_nontrans_triggers(i->iter)) + bch2_mark_update(trans, i->iter, i->k, + fs_usage, i->trigger_flags); if (marking) bch2_trans_fs_usage_apply(trans, fs_usage); @@ -494,7 +464,7 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, bch2_trans_mark_gc(trans); trans_for_each_update(trans, i) - do_btree_insert_one(trans, i); + do_btree_insert_one(trans, i->iter, i->k); err: if (marking) { bch2_fs_usage_scratch_put(c, fs_usage); @@ -512,44 +482,17 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, { struct btree_insert_entry *i; struct btree_iter *iter; - unsigned idx, u64s, journal_preres_u64s = 0; int ret; - /* - * note: running triggers will append more updates to the list of - * updates as we're walking it: - */ - trans_for_each_update(trans, i) { - /* we know trans->nounlock won't be set here: */ - if (unlikely(!(i->iter->locks_want < 1 - ? __bch2_btree_iter_upgrade(i->iter, 1) - : i->iter->uptodate <= BTREE_ITER_NEED_PEEK))) { - trace_trans_restart_upgrade(trans->ip); - return -EINTR; - } - - if (likely(!(trans->flags & BTREE_INSERT_NOMARK)) && - update_has_trans_triggers(i)) { - ret = bch2_trans_mark_update(trans, i->iter, i->k); - if (unlikely(ret)) { - if (ret == -EINTR) - trace_trans_restart_mark(trans->ip); - return ret; - } - } - - u64s = jset_u64s(i->k->k.u64s); - if (0) - journal_preres_u64s += u64s; - trans->journal_u64s += u64s; - } + trans_for_each_update(trans, i) + BUG_ON(!btree_node_intent_locked(i->iter, 0)); ret = bch2_journal_preres_get(&trans->c->journal, - &trans->journal_preres, journal_preres_u64s, + &trans->journal_preres, trans->journal_preres_u64s, JOURNAL_RES_GET_NONBLOCK); if (unlikely(ret == -EAGAIN)) ret = bch2_trans_journal_preres_get_cold(trans, - journal_preres_u64s); + trans->journal_preres_u64s); if (unlikely(ret)) return ret; @@ -570,24 +513,18 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) trans_for_each_update(trans, i) - btree_insert_entry_checks(trans, i); + btree_insert_entry_checks(trans, i->iter, i->k); bch2_btree_trans_verify_locks(trans); - /* - * No more updates can be added - sort updates so we can take write - * locks in the correct order: - */ - btree_trans_sort_updates(trans); - - trans_for_each_update_sorted(trans, i, idx) - if (!same_leaf_as_prev(trans, idx)) + trans_for_each_update(trans, i) + if (!same_leaf_as_prev(trans, i)) bch2_btree_node_lock_for_insert(trans->c, i->iter->l[0].b, i->iter); ret = bch2_trans_commit_write_locked(trans, stopped_at); - trans_for_each_update_sorted(trans, i, idx) - if (!same_leaf_as_prev(trans, idx)) + trans_for_each_update(trans, i) + if 
(!same_leaf_as_prev(trans, i)) bch2_btree_node_unlock_write_inlined(i->iter->l[0].b, i->iter); @@ -603,8 +540,8 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, if (trans->flags & BTREE_INSERT_NOUNLOCK) trans->nounlock = true; - trans_for_each_update_sorted(trans, i, idx) - if (!same_leaf_as_prev(trans, idx)) + trans_for_each_update(trans, i) + if (!same_leaf_as_prev(trans, i)) bch2_foreground_maybe_merge(trans->c, i->iter, 0, trans->flags); @@ -636,8 +573,8 @@ int bch2_trans_commit_error(struct btree_trans *trans, /* * if the split succeeded without dropping locks the insert will - * still be atomic (in the BTREE_INSERT_ATOMIC sense, what the - * caller peeked() and is overwriting won't have changed) + * still be atomic (what the caller peeked() and is overwriting + * won't have changed) */ #if 0 /* @@ -708,13 +645,6 @@ int bch2_trans_commit_error(struct btree_trans *trans, return ret2; } - /* - * BTREE_ITER_ATOMIC means we have to return -EINTR if we - * dropped locks: - */ - if (!(flags & BTREE_INSERT_ATOMIC)) - return 0; - trace_trans_restart_atomic(trans->ip); } @@ -744,79 +674,197 @@ int __bch2_trans_commit(struct btree_trans *trans) { struct btree_insert_entry *i = NULL; struct btree_iter *iter; - unsigned orig_nr_updates = trans->nr_updates; - unsigned orig_mem_top = trans->mem_top; + bool trans_trigger_run; + unsigned u64s; int ret = 0; + BUG_ON(trans->need_reset); + if (!trans->nr_updates) goto out_noupdates; - /* for the sake of sanity: */ - EBUG_ON(trans->nr_updates > 1 && !(trans->flags & BTREE_INSERT_ATOMIC)); - if (trans->flags & BTREE_INSERT_GC_LOCK_HELD) lockdep_assert_held(&trans->c->gc_lock); memset(&trans->journal_preres, 0, sizeof(trans->journal_preres)); + trans->journal_u64s = 0; + trans->journal_preres_u64s = 0; + if (!(trans->flags & BTREE_INSERT_NOCHECK_RW) && unlikely(!percpu_ref_tryget(&trans->c->writes))) { ret = bch2_trans_commit_get_rw_cold(trans); if (ret) return ret; } + + /* + * Running triggers will append more updates to the list of updates as + * we're walking it: + */ + do { + trans_trigger_run = false; + + trans_for_each_update(trans, i) { + if (unlikely(i->iter->uptodate > BTREE_ITER_NEED_PEEK)) { + trace_trans_restart_traverse(trans->ip); + ret = -EINTR; + goto out; + } + + /* + * We're not using bch2_btree_iter_upgrade here because + * we know trans->nounlock can't be set: + */ + if (unlikely(i->iter->locks_want < 1 && + !__bch2_btree_iter_upgrade(i->iter, 1))) { + trace_trans_restart_upgrade(trans->ip); + ret = -EINTR; + goto out; + } + + if (iter_has_trans_triggers(i->iter) && + !i->trans_triggers_run) { + i->trans_triggers_run = true; + trans_trigger_run = true; + + ret = bch2_trans_mark_update(trans, i->iter, i->k, + i->trigger_flags); + if (unlikely(ret)) { + if (ret == -EINTR) + trace_trans_restart_mark(trans->ip); + goto out; + } + } + } + } while (trans_trigger_run); + + trans_for_each_update(trans, i) { + u64s = jset_u64s(i->k->k.u64s); + if (0) + trans->journal_preres_u64s += u64s; + trans->journal_u64s += u64s; + } retry: memset(&trans->journal_res, 0, sizeof(trans->journal_res)); - trans->journal_u64s = 0; ret = do_bch2_trans_commit(trans, &i); - if (trans->fs_usage_deltas) { - trans->fs_usage_deltas->used = 0; - memset(&trans->fs_usage_deltas->memset_start, 0, - (void *) &trans->fs_usage_deltas->memset_end - - (void *) &trans->fs_usage_deltas->memset_start); - } - /* make sure we didn't drop or screw up locks: */ bch2_btree_trans_verify_locks(trans); if (ret) goto err; + + trans_for_each_iter(trans, iter) + if 
((trans->iters_live & (1ULL << iter->idx)) && + (iter->flags & BTREE_ITER_SET_POS_AFTER_COMMIT)) { + if (trans->flags & BTREE_INSERT_NOUNLOCK) + bch2_btree_iter_set_pos_same_leaf(iter, iter->pos_after_commit); + else + bch2_btree_iter_set_pos(iter, iter->pos_after_commit); + } out: bch2_journal_preres_put(&trans->c->journal, &trans->journal_preres); if (likely(!(trans->flags & BTREE_INSERT_NOCHECK_RW))) percpu_ref_put(&trans->c->writes); out_noupdates: - EBUG_ON(!(trans->flags & BTREE_INSERT_ATOMIC) && ret == -EINTR); - - trans_for_each_iter_all(trans, iter) - iter->flags &= ~BTREE_ITER_KEEP_UNTIL_COMMIT; - - if (!ret) { - bch2_trans_unlink_iters(trans); - trans->iters_touched = 0; - } - trans->nr_updates = 0; - trans->mem_top = 0; + bch2_trans_reset(trans, TRANS_RESET_MEM|TRANS_RESET_NOTRAVERSE); return ret; err: ret = bch2_trans_commit_error(trans, i, ret); - - /* can't loop if it was passed in and we changed it: */ - if (unlikely(trans->flags & BTREE_INSERT_NO_CLEAR_REPLICAS) && !ret) - ret = -EINTR; if (ret) goto out; - /* free updates and memory used by triggers, they'll be reexecuted: */ - trans->nr_updates = orig_nr_updates; - trans->mem_top = orig_mem_top; goto retry; } +int bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter, + struct bkey_i *k, enum btree_trigger_flags flags) +{ + struct btree_insert_entry *i, n = (struct btree_insert_entry) { + .trigger_flags = flags, .iter = iter, .k = k + }; + + EBUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&k->k))); + + iter->flags |= BTREE_ITER_KEEP_UNTIL_COMMIT; + + if (iter->flags & BTREE_ITER_IS_EXTENTS) { + iter->pos_after_commit = k->k.p; + iter->flags |= BTREE_ITER_SET_POS_AFTER_COMMIT; + } + + /* + * Pending updates are kept sorted: first, find position of new update: + */ + trans_for_each_update(trans, i) + if (btree_iter_cmp(iter, i->iter) <= 0) + break; + + /* + * Now delete/trim any updates the new update overwrites: + */ + if (i > trans->updates && + i[-1].iter->btree_id == iter->btree_id && + bkey_cmp(iter->pos, i[-1].k->k.p) < 0) + bch2_cut_back(n.iter->pos, i[-1].k); + + while (i < trans->updates + trans->nr_updates && + iter->btree_id == i->iter->btree_id && + bkey_cmp(n.k->k.p, i->k->k.p) >= 0) + array_remove_item(trans->updates, trans->nr_updates, + i - trans->updates); + + if (i < trans->updates + trans->nr_updates && + iter->btree_id == i->iter->btree_id && + bkey_cmp(n.k->k.p, i->iter->pos) > 0) { + /* + * When we have an extent that overwrites the start of another + * update, trimming that extent will mean the iterator's + * position has to change since the iterator position has to + * match the extent's start pos - but we don't want to change + * the iterator pos if some other code is using it, so we may + * need to clone it: + */ + if (trans->iters_live & (1ULL << i->iter->idx)) { + i->iter = bch2_trans_copy_iter(trans, i->iter); + if (IS_ERR(i->iter)) { + trans->need_reset = true; + return PTR_ERR(i->iter); + } + + i->iter->flags |= BTREE_ITER_KEEP_UNTIL_COMMIT; + bch2_trans_iter_put(trans, i->iter); + } + + bch2_cut_front(n.k->k.p, i->k); + bch2_btree_iter_set_pos(i->iter, n.k->k.p); + } + + EBUG_ON(trans->nr_updates >= trans->nr_iters); + + array_insert_item(trans->updates, trans->nr_updates, + i - trans->updates, n); + return 0; +} + +static int __bch2_btree_insert(struct btree_trans *trans, + enum btree_id id, struct bkey_i *k) +{ + struct btree_iter *iter; + + iter = bch2_trans_get_iter(trans, id, bkey_start_pos(&k->k), + BTREE_ITER_INTENT); + if (IS_ERR(iter)) + return PTR_ERR(iter); + + 
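/*
 * Worked example of the overlap handling in bch2_trans_update() above
 * (positions are extent end poses; behavior as implemented, values
 * chosen purely for illustration):
 *
 *	pending: A = [0..10)		new: B = [5..15)
 *
 *	1) find the insert position: the first pending update whose
 *	   iterator compares >= B's;
 *	2) B overlaps the tail of A, so bch2_cut_back() trims A to
 *	   [0..5);
 *	3) any pending update B fully covers, say an old [6..8), is
 *	   dropped via array_remove_item();
 *	4) a pending update whose start B overwrites is trimmed with
 *	   bch2_cut_front() and its iterator repositioned, cloning the
 *	   iterator first if the caller still holds it live.
 *
 * The updates list therefore stays sorted and non-overlapping, which
 * is what lets the commit path take node write locks in order without
 * the old updates_sorted[] indirection.
 */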
bch2_trans_update(trans, iter, k, 0); + return 0; +} + /** * bch2_btree_insert - insert keys into the extent btree * @c: pointer to struct bch_fs @@ -825,29 +873,12 @@ err: * @hook: insert callback */ int bch2_btree_insert(struct bch_fs *c, enum btree_id id, - struct bkey_i *k, - struct disk_reservation *disk_res, - u64 *journal_seq, int flags) + struct bkey_i *k, + struct disk_reservation *disk_res, + u64 *journal_seq, int flags) { - struct btree_trans trans; - struct btree_iter *iter; - int ret; - - bch2_trans_init(&trans, c, 0, 0); -retry: - bch2_trans_begin(&trans); - - iter = bch2_trans_get_iter(&trans, id, bkey_start_pos(&k->k), - BTREE_ITER_INTENT); - - bch2_trans_update(&trans, iter, k); - - ret = bch2_trans_commit(&trans, disk_res, journal_seq, flags); - if (ret == -EINTR) - goto retry; - bch2_trans_exit(&trans); - - return ret; + return bch2_trans_do(c, disk_res, journal_seq, flags, + __bch2_btree_insert(&trans, id, k)); } int bch2_btree_delete_at_range(struct btree_trans *trans, @@ -863,6 +894,8 @@ retry: bkey_cmp(iter->pos, end) < 0) { struct bkey_i delete; + bch2_trans_reset(trans, TRANS_RESET_MEM); + bkey_init(&delete.k); /* @@ -890,9 +923,8 @@ retry: break; } - bch2_trans_update(trans, iter, &delete); + bch2_trans_update(trans, iter, &delete, 0); ret = bch2_trans_commit(trans, NULL, journal_seq, - BTREE_INSERT_ATOMIC| BTREE_INSERT_NOFAIL); if (ret) break; @@ -917,7 +949,7 @@ int bch2_btree_delete_at(struct btree_trans *trans, bkey_init(&k.k); k.k.p = iter->pos; - bch2_trans_update(trans, iter, &k); + bch2_trans_update(trans, iter, &k, 0); return bch2_trans_commit(trans, NULL, NULL, BTREE_INSERT_NOFAIL| BTREE_INSERT_USE_RESERVE|flags); diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 8d223aa2bee5..731b93255876 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -161,7 +161,7 @@ struct bch_fs_usage *bch2_fs_usage_scratch_get(struct bch_fs *c) struct bch_fs_usage *ret; unsigned bytes = fs_usage_u64s(c) * sizeof(u64); - ret = kzalloc(bytes, GFP_NOWAIT); + ret = kzalloc(bytes, GFP_NOWAIT|__GFP_NOWARN); if (ret) return ret; @@ -628,7 +628,7 @@ unwind: percpu_rwsem_assert_held(&c->mark_lock); \ \ for (gc = 0; gc < 2 && !ret; gc++) \ - if (!gc == !(flags & BCH_BUCKET_MARK_GC) || \ + if (!gc == !(flags & BTREE_TRIGGER_GC) || \ (gc && gc_visited(c, pos))) \ ret = fn(c, __VA_ARGS__, gc); \ ret; \ @@ -710,7 +710,7 @@ static int bch2_mark_alloc(struct bch_fs *c, struct bkey_s_c k, struct bch_fs_usage *fs_usage, u64 journal_seq, unsigned flags) { - bool gc = flags & BCH_BUCKET_MARK_GC; + bool gc = flags & BTREE_TRIGGER_GC; struct bkey_alloc_unpacked u; struct bch_dev *ca; struct bucket *g; @@ -719,8 +719,8 @@ static int bch2_mark_alloc(struct bch_fs *c, struct bkey_s_c k, /* * alloc btree is read in by bch2_alloc_read, not gc: */ - if ((flags & BCH_BUCKET_MARK_GC) && - !(flags & BCH_BUCKET_MARK_BUCKET_INVALIDATE)) + if ((flags & BTREE_TRIGGER_GC) && + !(flags & BTREE_TRIGGER_BUCKET_INVALIDATE)) return 0; ca = bch_dev_bkey_exists(c, k.k->p.inode); @@ -743,7 +743,7 @@ static int bch2_mark_alloc(struct bch_fs *c, struct bkey_s_c k, } })); - if (!(flags & BCH_BUCKET_MARK_ALLOC_READ)) + if (!(flags & BTREE_TRIGGER_ALLOC_READ)) bch2_dev_usage_update(c, ca, fs_usage, old, m, gc); g->io_time[READ] = u.read_time; @@ -756,7 +756,7 @@ static int bch2_mark_alloc(struct bch_fs *c, struct bkey_s_c k, * not: */ - if ((flags & BCH_BUCKET_MARK_BUCKET_INVALIDATE) && + if ((flags & BTREE_TRIGGER_BUCKET_INVALIDATE) && old.cached_sectors) { update_cached_sectors(c, fs_usage, 
ca->dev_idx, -old.cached_sectors); @@ -842,13 +842,13 @@ static s64 __ptr_disk_sectors_delta(unsigned old_size, { BUG_ON(!n || !d); - if (flags & BCH_BUCKET_MARK_OVERWRITE_SPLIT) { + if (flags & BTREE_TRIGGER_OVERWRITE_SPLIT) { BUG_ON(offset + -delta > old_size); return -disk_sectors_scaled(n, d, old_size) + disk_sectors_scaled(n, d, offset) + disk_sectors_scaled(n, d, old_size - offset + delta); - } else if (flags & BCH_BUCKET_MARK_OVERWRITE) { + } else if (flags & BTREE_TRIGGER_OVERWRITE) { BUG_ON(offset + -delta > old_size); return -disk_sectors_scaled(n, d, old_size) + @@ -874,8 +874,8 @@ static void bucket_set_stripe(struct bch_fs *c, u64 journal_seq, unsigned flags) { - bool enabled = !(flags & BCH_BUCKET_MARK_OVERWRITE); - bool gc = flags & BCH_BUCKET_MARK_GC; + bool enabled = !(flags & BTREE_TRIGGER_OVERWRITE); + bool gc = flags & BTREE_TRIGGER_GC; unsigned i; for (i = 0; i < v->nr_blocks; i++) { @@ -922,7 +922,7 @@ static bool bch2_mark_pointer(struct bch_fs *c, struct bch_fs_usage *fs_usage, u64 journal_seq, unsigned flags) { - bool gc = flags & BCH_BUCKET_MARK_GC; + bool gc = flags & BTREE_TRIGGER_GC; struct bucket_mark old, new; struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev); struct bucket *g = PTR_BUCKET(ca, &p.ptr, gc); @@ -970,7 +970,7 @@ static bool bch2_mark_pointer(struct bch_fs *c, new.data_type = data_type; } - if (flags & BCH_BUCKET_MARK_NOATOMIC) { + if (flags & BTREE_TRIGGER_NOATOMIC) { g->_mark = new; break; } @@ -1008,7 +1008,7 @@ static int bch2_mark_stripe_ptr(struct bch_fs *c, unsigned *nr_data, unsigned *nr_parity) { - bool gc = flags & BCH_BUCKET_MARK_GC; + bool gc = flags & BTREE_TRIGGER_GC; struct stripe *m; unsigned old, new; int blocks_nonempty_delta; @@ -1121,7 +1121,7 @@ static int bch2_mark_stripe(struct bch_fs *c, struct bkey_s_c k, struct bch_fs_usage *fs_usage, u64 journal_seq, unsigned flags) { - bool gc = flags & BCH_BUCKET_MARK_GC; + bool gc = flags & BTREE_TRIGGER_GC; struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k); size_t idx = s.k->p.offset; struct stripe *m = genradix_ptr(&c->stripes[gc], idx); @@ -1129,14 +1129,14 @@ static int bch2_mark_stripe(struct bch_fs *c, struct bkey_s_c k, spin_lock(&c->ec_stripes_heap_lock); - if (!m || ((flags & BCH_BUCKET_MARK_OVERWRITE) && !m->alive)) { + if (!m || ((flags & BTREE_TRIGGER_OVERWRITE) && !m->alive)) { spin_unlock(&c->ec_stripes_heap_lock); bch_err_ratelimited(c, "error marking nonexistent stripe %zu", idx); return -1; } - if (!(flags & BCH_BUCKET_MARK_OVERWRITE)) { + if (!(flags & BTREE_TRIGGER_OVERWRITE)) { m->sectors = le16_to_cpu(s.v->sectors); m->algorithm = s.v->algorithm; m->nr_blocks = s.v->nr_blocks; @@ -1152,7 +1152,7 @@ static int bch2_mark_stripe(struct bch_fs *c, struct bkey_s_c k, #endif /* gc recalculates these fields: */ - if (!(flags & BCH_BUCKET_MARK_GC)) { + if (!(flags & BTREE_TRIGGER_GC)) { for (i = 0; i < s.v->nr_blocks; i++) { m->block_sectors[i] = stripe_blockcount_get(s.v, i); @@ -1185,16 +1185,16 @@ int bch2_mark_key_locked(struct bch_fs *c, preempt_disable(); - if (!fs_usage || (flags & BCH_BUCKET_MARK_GC)) + if (!fs_usage || (flags & BTREE_TRIGGER_GC)) fs_usage = fs_usage_ptr(c, journal_seq, - flags & BCH_BUCKET_MARK_GC); + flags & BTREE_TRIGGER_GC); switch (k.k->type) { case KEY_TYPE_alloc: ret = bch2_mark_alloc(c, k, fs_usage, journal_seq, flags); break; case KEY_TYPE_btree_ptr: - sectors = !(flags & BCH_BUCKET_MARK_OVERWRITE) + sectors = !(flags & BTREE_TRIGGER_OVERWRITE) ? 
c->opts.btree_node_size : -c->opts.btree_node_size; @@ -1210,7 +1210,7 @@ int bch2_mark_key_locked(struct bch_fs *c, ret = bch2_mark_stripe(c, k, fs_usage, journal_seq, flags); break; case KEY_TYPE_inode: - if (!(flags & BCH_BUCKET_MARK_OVERWRITE)) + if (!(flags & BTREE_TRIGGER_OVERWRITE)) fs_usage->nr_inodes++; else fs_usage->nr_inodes--; @@ -1260,7 +1260,7 @@ inline int bch2_mark_overwrite(struct btree_trans *trans, unsigned offset = 0; s64 sectors = 0; - flags |= BCH_BUCKET_MARK_OVERWRITE; + flags |= BTREE_TRIGGER_OVERWRITE; if (btree_node_is_extents(b) ? bkey_cmp(new->k.p, bkey_start_pos(old.k)) <= 0 @@ -1288,7 +1288,7 @@ inline int bch2_mark_overwrite(struct btree_trans *trans, offset = bkey_start_offset(&new->k) - bkey_start_offset(old.k); sectors = -((s64) new->k.size); - flags |= BCH_BUCKET_MARK_OVERWRITE_SPLIT; + flags |= BTREE_TRIGGER_OVERWRITE_SPLIT; break; } @@ -1300,26 +1300,29 @@ inline int bch2_mark_overwrite(struct btree_trans *trans, } int bch2_mark_update(struct btree_trans *trans, - struct btree_insert_entry *insert, + struct btree_iter *iter, + struct bkey_i *insert, struct bch_fs_usage *fs_usage, unsigned flags) { struct bch_fs *c = trans->c; - struct btree_iter *iter = insert->iter; struct btree *b = iter->l[0].b; struct btree_node_iter node_iter = iter->l[0].iter; struct bkey_packed *_k; int ret = 0; + if (unlikely(flags & BTREE_TRIGGER_NORUN)) + return 0; + if (!btree_node_type_needs_gc(iter->btree_id)) return 0; - bch2_mark_key_locked(c, bkey_i_to_s_c(insert->k), - 0, insert->k->k.size, + bch2_mark_key_locked(c, bkey_i_to_s_c(insert), + 0, insert->k.size, fs_usage, trans->journal_res.seq, - BCH_BUCKET_MARK_INSERT|flags); + BTREE_TRIGGER_INSERT|flags); - if (unlikely(trans->flags & BTREE_INSERT_NOMARK_OVERWRITES)) + if (unlikely(flags & BTREE_TRIGGER_NOOVERWRITES)) return 0; /* @@ -1328,7 +1331,7 @@ int bch2_mark_update(struct btree_trans *trans, */ if ((iter->btree_id == BTREE_ID_ALLOC || iter->btree_id == BTREE_ID_EC) && - !bkey_deleted(&insert->k->k)) + !bkey_deleted(&insert->k)) return 0; while ((_k = bch2_btree_node_iter_peek_filter(&node_iter, b, @@ -1336,7 +1339,7 @@ int bch2_mark_update(struct btree_trans *trans, struct bkey unpacked; struct bkey_s_c k = bkey_disassemble(b, _k, &unpacked); - ret = bch2_mark_overwrite(trans, iter, k, insert->k, + ret = bch2_mark_overwrite(trans, iter, k, insert, fs_usage, flags); if (ret <= 0) break; @@ -1430,30 +1433,6 @@ static int trans_get_key(struct btree_trans *trans, return ret; } -static void *trans_update_key(struct btree_trans *trans, - struct btree_iter *iter, - unsigned u64s) -{ - struct btree_insert_entry *i; - struct bkey_i *new_k; - - new_k = bch2_trans_kmalloc(trans, u64s * sizeof(u64)); - if (IS_ERR(new_k)) - return new_k; - - bkey_init(&new_k->k); - new_k->k.p = iter->pos; - - trans_for_each_update(trans, i) - if (i->iter == iter) { - i->k = new_k; - return new_k; - } - - bch2_trans_update(trans, iter, new_k); - return new_k; -} - static int bch2_trans_mark_pointer(struct btree_trans *trans, struct extent_ptr_decoded p, s64 sectors, enum bch_data_type data_type) @@ -1537,7 +1516,7 @@ static int bch2_trans_mark_pointer(struct btree_trans *trans, u.data_type = u.dirty_sectors || u.cached_sectors ? 
data_type : 0; - a = trans_update_key(trans, iter, BKEY_ALLOC_U64s_MAX); + a = bch2_trans_kmalloc(trans, BKEY_ALLOC_U64s_MAX * 8); ret = PTR_ERR_OR_ZERO(a); if (ret) goto out; @@ -1545,6 +1524,7 @@ static int bch2_trans_mark_pointer(struct btree_trans *trans, bkey_alloc_init(&a->k_i); a->k.p = iter->pos; bch2_alloc_pack(a, u); + bch2_trans_update(trans, iter, &a->k_i, 0); out: bch2_trans_iter_put(trans, iter); return ret; @@ -1559,9 +1539,8 @@ static int bch2_trans_mark_stripe_ptr(struct btree_trans *trans, { struct bch_fs *c = trans->c; struct btree_iter *iter; - struct bkey_i *new_k; struct bkey_s_c k; - struct bkey_s_stripe s; + struct bkey_i_stripe *s; int ret = 0; ret = trans_get_key(trans, BTREE_ID_EC, POS(0, p.idx), &iter, &k); @@ -1576,21 +1555,21 @@ static int bch2_trans_mark_stripe_ptr(struct btree_trans *trans, goto out; } - new_k = trans_update_key(trans, iter, k.k->u64s); - ret = PTR_ERR_OR_ZERO(new_k); + s = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); + ret = PTR_ERR_OR_ZERO(s); if (ret) goto out; - bkey_reassemble(new_k, k); - s = bkey_i_to_s_stripe(new_k); + bkey_reassemble(&s->k_i, k); - stripe_blockcount_set(s.v, p.block, - stripe_blockcount_get(s.v, p.block) + + stripe_blockcount_set(&s->v, p.block, + stripe_blockcount_get(&s->v, p.block) + sectors); - *nr_data = s.v->nr_blocks - s.v->nr_redundant; - *nr_parity = s.v->nr_redundant; - bch2_bkey_to_replicas(&r->e, s.s_c); + *nr_data = s->v.nr_blocks - s->v.nr_redundant; + *nr_parity = s->v.nr_redundant; + bch2_bkey_to_replicas(&r->e, bkey_i_to_s_c(&s->k_i)); + bch2_trans_update(trans, iter, &s->k_i, 0); out: bch2_trans_iter_put(trans, iter); return ret; @@ -1671,7 +1650,6 @@ static int __bch2_trans_mark_reflink_p(struct btree_trans *trans, { struct bch_fs *c = trans->c; struct btree_iter *iter; - struct bkey_i *new_k; struct bkey_s_c k; struct bkey_i_reflink_v *r_v; s64 ret; @@ -1689,7 +1667,7 @@ static int __bch2_trans_mark_reflink_p(struct btree_trans *trans, goto err; } - if ((flags & BCH_BUCKET_MARK_OVERWRITE) && + if ((flags & BTREE_TRIGGER_OVERWRITE) && (bkey_start_offset(k.k) < idx || k.k->p.offset > idx + sectors)) goto out; @@ -1697,21 +1675,22 @@ static int __bch2_trans_mark_reflink_p(struct btree_trans *trans, bch2_btree_iter_set_pos(iter, bkey_start_pos(k.k)); BUG_ON(iter->uptodate > BTREE_ITER_NEED_PEEK); - new_k = trans_update_key(trans, iter, k.k->u64s); - ret = PTR_ERR_OR_ZERO(new_k); + r_v = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); + ret = PTR_ERR_OR_ZERO(r_v); if (ret) goto err; - bkey_reassemble(new_k, k); - r_v = bkey_i_to_reflink_v(new_k); + bkey_reassemble(&r_v->k_i, k); le64_add_cpu(&r_v->v.refcount, - !(flags & BCH_BUCKET_MARK_OVERWRITE) ? 1 : -1); + !(flags & BTREE_TRIGGER_OVERWRITE) ? 1 : -1); if (!r_v->v.refcount) { r_v->k.type = KEY_TYPE_deleted; set_bkey_val_u64s(&r_v->k, 0); } + + bch2_trans_update(trans, iter, &r_v->k_i, 0); out: ret = k.k->p.offset - idx; err: @@ -1750,7 +1729,7 @@ int bch2_trans_mark_key(struct btree_trans *trans, struct bkey_s_c k, switch (k.k->type) { case KEY_TYPE_btree_ptr: - sectors = !(flags & BCH_BUCKET_MARK_OVERWRITE) + sectors = !(flags & BTREE_TRIGGER_OVERWRITE) ? 
c->opts.btree_node_size : -c->opts.btree_node_size; @@ -1763,7 +1742,7 @@ int bch2_trans_mark_key(struct btree_trans *trans, struct bkey_s_c k, case KEY_TYPE_inode: d = replicas_deltas_realloc(trans, 0); - if (!(flags & BCH_BUCKET_MARK_OVERWRITE)) + if (!(flags & BTREE_TRIGGER_OVERWRITE)) d->nr_inodes++; else d->nr_inodes--; @@ -1791,22 +1770,26 @@ int bch2_trans_mark_key(struct btree_trans *trans, struct bkey_s_c k, int bch2_trans_mark_update(struct btree_trans *trans, struct btree_iter *iter, - struct bkey_i *insert) + struct bkey_i *insert, + unsigned flags) { struct btree *b = iter->l[0].b; struct btree_node_iter node_iter = iter->l[0].iter; struct bkey_packed *_k; int ret; + if (unlikely(flags & BTREE_TRIGGER_NORUN)) + return 0; + if (!btree_node_type_needs_gc(iter->btree_id)) return 0; ret = bch2_trans_mark_key(trans, bkey_i_to_s_c(insert), - 0, insert->k.size, BCH_BUCKET_MARK_INSERT); + 0, insert->k.size, BTREE_TRIGGER_INSERT); if (ret) return ret; - if (unlikely(trans->flags & BTREE_INSERT_NOMARK_OVERWRITES)) + if (unlikely(flags & BTREE_TRIGGER_NOOVERWRITES)) return 0; while ((_k = bch2_btree_node_iter_peek_filter(&node_iter, b, @@ -1815,7 +1798,7 @@ int bch2_trans_mark_update(struct btree_trans *trans, struct bkey_s_c k; unsigned offset = 0; s64 sectors = 0; - unsigned flags = BCH_BUCKET_MARK_OVERWRITE; + unsigned flags = BTREE_TRIGGER_OVERWRITE; k = bkey_disassemble(b, _k, &unpacked); @@ -1845,7 +1828,7 @@ int bch2_trans_mark_update(struct btree_trans *trans, offset = bkey_start_offset(&insert->k) - bkey_start_offset(k.k); sectors = -((s64) insert->k.size); - flags |= BCH_BUCKET_MARK_OVERWRITE_SPLIT; + flags |= BTREE_TRIGGER_OVERWRITE_SPLIT; break; } diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h index ad6f731b1cea..4717a1a6f568 100644 --- a/fs/bcachefs/buckets.h +++ b/fs/bcachefs/buckets.h @@ -258,14 +258,6 @@ void bch2_mark_metadata_bucket(struct bch_fs *, struct bch_dev *, size_t, enum bch_data_type, unsigned, struct gc_pos, unsigned); -#define BCH_BUCKET_MARK_INSERT (1 << 0) -#define BCH_BUCKET_MARK_OVERWRITE (1 << 1) -#define BCH_BUCKET_MARK_OVERWRITE_SPLIT (1 << 2) -#define BCH_BUCKET_MARK_BUCKET_INVALIDATE (1 << 3) -#define BCH_BUCKET_MARK_GC (1 << 4) -#define BCH_BUCKET_MARK_ALLOC_READ (1 << 5) -#define BCH_BUCKET_MARK_NOATOMIC (1 << 6) - int bch2_mark_key_locked(struct bch_fs *, struct bkey_s_c, unsigned, s64, struct bch_fs_usage *, u64, unsigned); int bch2_mark_key(struct bch_fs *, struct bkey_s_c, unsigned, s64, @@ -276,17 +268,16 @@ int bch2_fs_usage_apply(struct bch_fs *, struct bch_fs_usage *, int bch2_mark_overwrite(struct btree_trans *, struct btree_iter *, struct bkey_s_c, struct bkey_i *, struct bch_fs_usage *, unsigned); -int bch2_mark_update(struct btree_trans *, struct btree_insert_entry *, - struct bch_fs_usage *, unsigned); +int bch2_mark_update(struct btree_trans *, struct btree_iter *, + struct bkey_i *, struct bch_fs_usage *, unsigned); int bch2_replicas_delta_list_apply(struct bch_fs *, struct bch_fs_usage *, struct replicas_delta_list *); int bch2_trans_mark_key(struct btree_trans *, struct bkey_s_c, unsigned, s64, unsigned); -int bch2_trans_mark_update(struct btree_trans *, - struct btree_iter *iter, - struct bkey_i *insert); +int bch2_trans_mark_update(struct btree_trans *, struct btree_iter *iter, + struct bkey_i *insert, unsigned); void bch2_trans_fs_usage_apply(struct btree_trans *, struct bch_fs_usage *); /* disk reservations: */ diff --git a/fs/bcachefs/chardev.c b/fs/bcachefs/chardev.c index 059eca01ccc4..5028d0dcc2d6 100644 --- 
a/fs/bcachefs/chardev.c +++ b/fs/bcachefs/chardev.c @@ -6,6 +6,7 @@ #include "buckets.h" #include "chardev.h" #include "move.h" +#include "replicas.h" #include "super.h" #include "super-io.h" @@ -371,89 +372,116 @@ err: return ret; } -static long bch2_ioctl_usage(struct bch_fs *c, - struct bch_ioctl_usage __user *user_arg) +static long bch2_ioctl_fs_usage(struct bch_fs *c, + struct bch_ioctl_fs_usage __user *user_arg) { - struct bch_ioctl_usage arg; - struct bch_dev *ca; - unsigned i, j; - int ret; + struct bch_ioctl_fs_usage *arg = NULL; + struct bch_replicas_usage *dst_e, *dst_end; + struct bch_fs_usage *src; + u32 replica_entries_bytes; + unsigned i; + int ret = 0; if (!test_bit(BCH_FS_STARTED, &c->flags)) return -EINVAL; - if (copy_from_user(&arg, user_arg, sizeof(arg))) + if (get_user(replica_entries_bytes, &user_arg->replica_entries_bytes)) return -EFAULT; - for (i = 0; i < arg.nr_devices; i++) { - struct bch_ioctl_dev_usage dst = { .alive = 0 }; + arg = kzalloc(sizeof(*arg) + replica_entries_bytes, GFP_KERNEL); + if (!arg) + return -ENOMEM; - ret = copy_to_user(&user_arg->devs[i], &dst, sizeof(dst)); - if (ret) - return ret; + src = bch2_fs_usage_read(c); + if (!src) { + ret = -ENOMEM; + goto err; } - { - struct bch_fs_usage *src; - struct bch_ioctl_fs_usage dst = { - .capacity = c->capacity, - }; + arg->capacity = c->capacity; + arg->used = bch2_fs_sectors_used(c, src); + arg->online_reserved = src->online_reserved; - src = bch2_fs_usage_read(c); - if (!src) - return -ENOMEM; + for (i = 0; i < BCH_REPLICAS_MAX; i++) + arg->persistent_reserved[i] = src->persistent_reserved[i]; - dst.used = bch2_fs_sectors_used(c, src); - dst.online_reserved = src->online_reserved; + dst_e = arg->replicas; + dst_end = (void *) arg->replicas + replica_entries_bytes; - percpu_up_read(&c->mark_lock); + for (i = 0; i < c->replicas.nr; i++) { + struct bch_replicas_entry *src_e = + cpu_replicas_entry(&c->replicas, i); - for (i = 0; i < BCH_REPLICAS_MAX; i++) { - dst.persistent_reserved[i] = - src->persistent_reserved[i]; -#if 0 - for (j = 0; j < BCH_DATA_NR; j++) - dst.sectors[j][i] = src.replicas[i].data[j]; -#endif + if (replicas_usage_next(dst_e) > dst_end) { + ret = -ERANGE; + break; } - kfree(src); + dst_e->sectors = src->replicas[i]; + dst_e->r = *src_e; + + /* recheck after setting nr_devs: */ + if (replicas_usage_next(dst_e) > dst_end) { + ret = -ERANGE; + break; + } - ret = copy_to_user(&user_arg->fs, &dst, sizeof(dst)); - if (ret) - return ret; + memcpy(dst_e->r.devs, src_e->devs, src_e->nr_devs); + + dst_e = replicas_usage_next(dst_e); } - for_each_member_device(ca, c, i) { - struct bch_dev_usage src = bch2_dev_usage_read(c, ca); - struct bch_ioctl_dev_usage dst = { - .alive = 1, - .state = ca->mi.state, - .bucket_size = ca->mi.bucket_size, - .nr_buckets = ca->mi.nbuckets - ca->mi.first_bucket, - }; - - if (ca->dev_idx >= arg.nr_devices) { - percpu_ref_put(&ca->ref); - return -ERANGE; - } + arg->replica_entries_bytes = (void *) dst_e - (void *) arg->replicas; - if (percpu_ref_tryget(&ca->io_ref)) { - dst.dev = huge_encode_dev(ca->disk_sb.bdev->bd_dev); - percpu_ref_put(&ca->io_ref); - } + percpu_up_read(&c->mark_lock); + kfree(src); - for (j = 0; j < BCH_DATA_NR; j++) { - dst.buckets[j] = src.buckets[j]; - dst.sectors[j] = src.sectors[j]; - } + if (!ret) + ret = copy_to_user(user_arg, arg, + sizeof(*arg) + arg->replica_entries_bytes); +err: + kfree(arg); + return ret; +} + +static long bch2_ioctl_dev_usage(struct bch_fs *c, + struct bch_ioctl_dev_usage __user *user_arg) +{ + struct 
bch_ioctl_dev_usage arg; + struct bch_dev_usage src; + struct bch_dev *ca; + unsigned i; + + if (!test_bit(BCH_FS_STARTED, &c->flags)) + return -EINVAL; - ret = copy_to_user(&user_arg->devs[i], &dst, sizeof(dst)); - if (ret) - return ret; + if (copy_from_user(&arg, user_arg, sizeof(arg))) + return -EFAULT; + + if ((arg.flags & ~BCH_BY_INDEX) || + arg.pad[0] || + arg.pad[1] || + arg.pad[2]) + return -EINVAL; + + ca = bch2_device_lookup(c, arg.dev, arg.flags); + if (IS_ERR(ca)) + return PTR_ERR(ca); + + src = bch2_dev_usage_read(c, ca); + + arg.state = ca->mi.state; + arg.bucket_size = ca->mi.bucket_size; + arg.nr_buckets = ca->mi.nbuckets - ca->mi.first_bucket; + + for (i = 0; i < BCH_DATA_NR; i++) { + arg.buckets[i] = src.buckets[i]; + arg.sectors[i] = src.sectors[i]; } - return 0; + percpu_ref_put(&ca->ref); + + return copy_to_user(user_arg, &arg, sizeof(arg)); } static long bch2_ioctl_read_super(struct bch_fs *c, @@ -547,8 +575,10 @@ long bch2_fs_ioctl(struct bch_fs *c, unsigned cmd, void __user *arg) switch (cmd) { case BCH_IOCTL_QUERY_UUID: return bch2_ioctl_query_uuid(c, arg); - case BCH_IOCTL_USAGE: - return bch2_ioctl_usage(c, arg); + case BCH_IOCTL_FS_USAGE: + return bch2_ioctl_fs_usage(c, arg); + case BCH_IOCTL_DEV_USAGE: + return bch2_ioctl_dev_usage(c, arg); } if (!capable(CAP_SYS_ADMIN)) diff --git a/fs/bcachefs/checksum.c b/fs/bcachefs/checksum.c index ad6993b7565a..a5c947e8adf3 100644 --- a/fs/bcachefs/checksum.c +++ b/fs/bcachefs/checksum.c @@ -10,7 +10,7 @@ #include <linux/random.h> #include <linux/scatterlist.h> #include <crypto/algapi.h> -#include <crypto/chacha20.h> +#include <crypto/chacha.h> #include <crypto/hash.h> #include <crypto/poly1305.h> #include <keys/user-type.h> @@ -67,21 +67,21 @@ static u64 bch2_checksum_update(unsigned type, u64 crc, const void *data, size_t } } -static inline void do_encrypt_sg(struct crypto_skcipher *tfm, +static inline void do_encrypt_sg(struct crypto_sync_skcipher *tfm, struct nonce nonce, struct scatterlist *sg, size_t len) { - SKCIPHER_REQUEST_ON_STACK(req, tfm); + SYNC_SKCIPHER_REQUEST_ON_STACK(req, tfm); int ret; - skcipher_request_set_tfm(req, tfm); + skcipher_request_set_sync_tfm(req, tfm); skcipher_request_set_crypt(req, sg, sg, len, nonce.d); ret = crypto_skcipher_encrypt(req); BUG_ON(ret); } -static inline void do_encrypt(struct crypto_skcipher *tfm, +static inline void do_encrypt(struct crypto_sync_skcipher *tfm, struct nonce nonce, void *buf, size_t len) { @@ -94,8 +94,8 @@ static inline void do_encrypt(struct crypto_skcipher *tfm, int bch2_chacha_encrypt_key(struct bch_key *key, struct nonce nonce, void *buf, size_t len) { - struct crypto_skcipher *chacha20 = - crypto_alloc_skcipher("chacha20", 0, 0); + struct crypto_sync_skcipher *chacha20 = + crypto_alloc_sync_skcipher("chacha20", 0, 0); int ret; if (!chacha20) { @@ -103,7 +103,8 @@ int bch2_chacha_encrypt_key(struct bch_key *key, struct nonce nonce, return PTR_ERR(chacha20); } - ret = crypto_skcipher_setkey(chacha20, (void *) key, sizeof(*key)); + ret = crypto_skcipher_setkey(&chacha20->base, + (void *) key, sizeof(*key)); if (ret) { pr_err("crypto_skcipher_setkey() error: %i", ret); goto err; @@ -111,7 +112,7 @@ int bch2_chacha_encrypt_key(struct bch_key *key, struct nonce nonce, do_encrypt(chacha20, nonce, buf, len); err: - crypto_free_skcipher(chacha20); + crypto_free_sync_skcipher(chacha20); return ret; } @@ -198,7 +199,7 @@ static struct bch_csum __bch2_checksum_bio(struct bch_fs *c, unsigned type, kunmap_atomic(p); } #else - __bio_for_each_contig_segment(bv, bio, 
*iter, *iter) + __bio_for_each_bvec(bv, bio, *iter, *iter) crc = bch2_checksum_update(type, crc, page_address(bv.bv_page) + bv.bv_offset, bv.bv_len); @@ -223,7 +224,7 @@ static struct bch_csum __bch2_checksum_bio(struct bch_fs *c, unsigned type, kunmap_atomic(p); } #else - __bio_for_each_contig_segment(bv, bio, *iter, *iter) + __bio_for_each_bvec(bv, bio, *iter, *iter) crypto_shash_update(desc, page_address(bv.bv_page) + bv.bv_offset, bv.bv_len); @@ -461,7 +462,7 @@ err: static int bch2_alloc_ciphers(struct bch_fs *c) { if (!c->chacha20) - c->chacha20 = crypto_alloc_skcipher("chacha20", 0, 0); + c->chacha20 = crypto_alloc_sync_skcipher("chacha20", 0, 0); if (IS_ERR(c->chacha20)) { bch_err(c, "error requesting chacha20 module: %li", PTR_ERR(c->chacha20)); @@ -544,7 +545,7 @@ int bch2_enable_encryption(struct bch_fs *c, bool keyed) goto err; } - ret = crypto_skcipher_setkey(c->chacha20, + ret = crypto_skcipher_setkey(&c->chacha20->base, (void *) &key.key, sizeof(key.key)); if (ret) goto err; @@ -572,7 +573,7 @@ void bch2_fs_encryption_exit(struct bch_fs *c) if (!IS_ERR_OR_NULL(c->poly1305)) crypto_free_shash(c->poly1305); if (!IS_ERR_OR_NULL(c->chacha20)) - crypto_free_skcipher(c->chacha20); + crypto_free_sync_skcipher(c->chacha20); if (!IS_ERR_OR_NULL(c->sha256)) crypto_free_shash(c->sha256); } @@ -604,7 +605,7 @@ int bch2_fs_encryption_init(struct bch_fs *c) if (ret) goto out; - ret = crypto_skcipher_setkey(c->chacha20, + ret = crypto_skcipher_setkey(&c->chacha20->base, (void *) &key.key, sizeof(key.key)); if (ret) goto out; diff --git a/fs/bcachefs/checksum.h b/fs/bcachefs/checksum.h index 0b359aba2526..ca9e45906dc8 100644 --- a/fs/bcachefs/checksum.h +++ b/fs/bcachefs/checksum.h @@ -7,7 +7,7 @@ #include "super-io.h" #include <linux/crc64.h> -#include <crypto/chacha20.h> +#include <crypto/chacha.h> static inline bool bch2_checksum_mergeable(unsigned type) { @@ -108,8 +108,8 @@ static inline enum bch_csum_type bch2_meta_checksum_type(struct bch_fs *c) } static const unsigned bch2_compression_opt_to_type[] = { -#define x(t) [BCH_COMPRESSION_OPT_##t] = BCH_COMPRESSION_##t, - BCH_COMPRESSION_TYPES() +#define x(t, n) [BCH_COMPRESSION_OPT_##t] = BCH_COMPRESSION_TYPE_##t, + BCH_COMPRESSION_OPTS() #undef x }; @@ -138,9 +138,9 @@ static inline bool bch2_crc_cmp(struct bch_csum l, struct bch_csum r) /* for skipping ahead and encrypting/decrypting at an offset: */ static inline struct nonce nonce_add(struct nonce nonce, unsigned offset) { - EBUG_ON(offset & (CHACHA20_BLOCK_SIZE - 1)); + EBUG_ON(offset & (CHACHA_BLOCK_SIZE - 1)); - le32_add_cpu(&nonce.d[0], offset / CHACHA20_BLOCK_SIZE); + le32_add_cpu(&nonce.d[0], offset / CHACHA_BLOCK_SIZE); return nonce; } diff --git a/fs/bcachefs/clock.c b/fs/bcachefs/clock.c index f18266330687..d9de0d1302e2 100644 --- a/fs/bcachefs/clock.c +++ b/fs/bcachefs/clock.c @@ -18,6 +18,14 @@ void bch2_io_timer_add(struct io_clock *clock, struct io_timer *timer) size_t i; spin_lock(&clock->timer_lock); + + if (time_after_eq((unsigned long) atomic_long_read(&clock->now), + timer->expire)) { + spin_unlock(&clock->timer_lock); + timer->fn(timer); + return; + } + for (i = 0; i < clock->timers.used; i++) if (clock->timers.data[i] == timer) goto out; @@ -135,26 +143,31 @@ static struct io_timer *get_expired_timer(struct io_clock *clock, return ret; } -void __bch2_increment_clock(struct io_clock *clock) +void __bch2_increment_clock(struct io_clock *clock, unsigned sectors) { struct io_timer *timer; - unsigned long now; - unsigned sectors; + unsigned long now = 
atomic_long_add_return(sectors, &clock->now); - /* Buffer up one megabyte worth of IO in the percpu counter */ - preempt_disable(); + while ((timer = get_expired_timer(clock, now))) + timer->fn(timer); +} - if (this_cpu_read(*clock->pcpu_buf) < IO_CLOCK_PCPU_SECTORS) { - preempt_enable(); - return; - } +ssize_t bch2_io_timers_show(struct io_clock *clock, char *buf) +{ + struct printbuf out = _PBUF(buf, PAGE_SIZE); + unsigned long now; + unsigned i; - sectors = this_cpu_xchg(*clock->pcpu_buf, 0); - preempt_enable(); - now = atomic_long_add_return(sectors, &clock->now); + spin_lock(&clock->timer_lock); + now = atomic_long_read(&clock->now); - while ((timer = get_expired_timer(clock, now))) - timer->fn(timer); + for (i = 0; i < clock->timers.used; i++) + pr_buf(&out, "%pf:\t%li\n", + clock->timers.data[i]->fn, + clock->timers.data[i]->expire - now); + spin_unlock(&clock->timer_lock); + + return out.pos - buf; } void bch2_io_clock_exit(struct io_clock *clock) @@ -168,6 +181,8 @@ int bch2_io_clock_init(struct io_clock *clock) atomic_long_set(&clock->now, 0); spin_lock_init(&clock->timer_lock); + clock->max_slop = IO_CLOCK_PCPU_SECTORS * num_possible_cpus(); + clock->pcpu_buf = alloc_percpu(*clock->pcpu_buf); if (!clock->pcpu_buf) return -ENOMEM; diff --git a/fs/bcachefs/clock.h b/fs/bcachefs/clock.h index bfbbca8a207b..da50afe206cc 100644 --- a/fs/bcachefs/clock.h +++ b/fs/bcachefs/clock.h @@ -7,7 +7,7 @@ void bch2_io_timer_del(struct io_clock *, struct io_timer *); void bch2_kthread_io_clock_wait(struct io_clock *, unsigned long, unsigned long); -void __bch2_increment_clock(struct io_clock *); +void __bch2_increment_clock(struct io_clock *, unsigned); static inline void bch2_increment_clock(struct bch_fs *c, unsigned sectors, int rw) @@ -16,7 +16,7 @@ static inline void bch2_increment_clock(struct bch_fs *c, unsigned sectors, if (unlikely(this_cpu_add_return(*clock->pcpu_buf, sectors) >= IO_CLOCK_PCPU_SECTORS)) - __bch2_increment_clock(clock); + __bch2_increment_clock(clock, this_cpu_xchg(*clock->pcpu_buf, 0)); } void bch2_io_clock_schedule_timeout(struct io_clock *, unsigned long); @@ -30,6 +30,8 @@ void bch2_io_clock_schedule_timeout(struct io_clock *, unsigned long); __ret; \ }) +ssize_t bch2_io_timers_show(struct io_clock *, char *); + void bch2_io_clock_exit(struct io_clock *); int bch2_io_clock_init(struct io_clock *); diff --git a/fs/bcachefs/clock_types.h b/fs/bcachefs/clock_types.h index 2b5e499e12b4..92c740a47565 100644 --- a/fs/bcachefs/clock_types.h +++ b/fs/bcachefs/clock_types.h @@ -28,6 +28,7 @@ typedef HEAP(struct io_timer *) io_timer_heap; struct io_clock { atomic_long_t now; u16 __percpu *pcpu_buf; + unsigned max_slop; spinlock_t timer_lock; io_timer_heap timers; diff --git a/fs/bcachefs/compress.c b/fs/bcachefs/compress.c index 24f565614cd9..bb557eda111b 100644 --- a/fs/bcachefs/compress.c +++ b/fs/bcachefs/compress.c @@ -66,7 +66,7 @@ static struct bbuf __bio_map_or_bounce(struct bch_fs *c, struct bio *bio, BUG_ON(bvec_iter_sectors(start) > c->sb.encoded_extent_max); #ifndef CONFIG_HIGHMEM - __bio_for_each_contig_segment(bv, bio, iter, start) { + __bio_for_each_bvec(bv, bio, iter, start) { if (bv.bv_len == start.bi_size) return (struct bbuf) { .b = page_address(bv.bv_page) + bv.bv_offset, @@ -158,14 +158,14 @@ static int __bio_uncompress(struct bch_fs *c, struct bio *src, src_data = bio_map_or_bounce(c, src, READ); switch (crc.compression_type) { - case BCH_COMPRESSION_LZ4_OLD: - case BCH_COMPRESSION_LZ4: + case BCH_COMPRESSION_TYPE_lz4_old: + case BCH_COMPRESSION_TYPE_lz4: 
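
(An aside on the constant renames in this hunk: the new BCH_COMPRESSION_TYPE_* and BCH_FEATURE_* spellings are generated from x-macro lists, so the enum and every table indexed by it expand from a single definition. A minimal sketch of the pattern; the EXAMPLE_* names and the list entries are invented for illustration, not the tree's actual list:)

	#define EXAMPLE_COMPRESSION_TYPES()			\
		x(none,	0)					\
		x(lz4,	1)					\
		x(gzip,	2)

	enum example_compression_type {
	#define x(t, n)	EXAMPLE_COMPRESSION_TYPE_##t = n,
		EXAMPLE_COMPRESSION_TYPES()
	#undef x
		EXAMPLE_COMPRESSION_TYPE_NR
	};

	/* lookup tables indexed by the enum reuse the same list: */
	static const char * const example_compression_names[] = {
	#define x(t, n)	[EXAMPLE_COMPRESSION_TYPE_##t] = #t,
		EXAMPLE_COMPRESSION_TYPES()
	#undef x
	};
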
ret = LZ4_decompress_safe_partial(src_data.b, dst_data, src_len, dst_len, dst_len); if (ret != dst_len) goto err; break; - case BCH_COMPRESSION_GZIP: { + case BCH_COMPRESSION_TYPE_gzip: { z_stream strm = { .next_in = src_data.b, .avail_in = src_len, @@ -185,7 +185,7 @@ static int __bio_uncompress(struct bch_fs *c, struct bio *src, goto err; break; } - case BCH_COMPRESSION_ZSTD: { + case BCH_COMPRESSION_TYPE_zstd: { ZSTD_DCtx *ctx; size_t len; @@ -290,10 +290,10 @@ static int attempt_compress(struct bch_fs *c, void *workspace, void *dst, size_t dst_len, void *src, size_t src_len, - unsigned compression_type) + enum bch_compression_type compression_type) { switch (compression_type) { - case BCH_COMPRESSION_LZ4: { + case BCH_COMPRESSION_TYPE_lz4: { int len = src_len; int ret = LZ4_compress_destSize( src, dst, @@ -305,7 +305,7 @@ static int attempt_compress(struct bch_fs *c, return ret; } - case BCH_COMPRESSION_GZIP: { + case BCH_COMPRESSION_TYPE_gzip: { z_stream strm = { .next_in = src, .avail_in = src_len, @@ -326,7 +326,7 @@ static int attempt_compress(struct bch_fs *c, return strm.total_out; } - case BCH_COMPRESSION_ZSTD: { + case BCH_COMPRESSION_TYPE_zstd: { ZSTD_CCtx *ctx = ZSTD_initCCtx(workspace, ZSTD_CCtxWorkspaceBound(c->zstd_params.cParams)); @@ -348,14 +348,14 @@ static int attempt_compress(struct bch_fs *c, static unsigned __bio_compress(struct bch_fs *c, struct bio *dst, size_t *dst_len, struct bio *src, size_t *src_len, - unsigned compression_type) + enum bch_compression_type compression_type) { struct bbuf src_data = { NULL }, dst_data = { NULL }; void *workspace; unsigned pad; int ret = 0; - BUG_ON(compression_type >= BCH_COMPRESSION_NR); + BUG_ON(compression_type >= BCH_COMPRESSION_TYPE_NR); BUG_ON(!mempool_initialized(&c->compress_workspace[compression_type])); /* If it's only one block, don't bother trying to compress: */ @@ -452,8 +452,8 @@ unsigned bch2_bio_compress(struct bch_fs *c, /* Don't generate a bigger output than input: */ dst->bi_iter.bi_size = min(dst->bi_iter.bi_size, src->bi_iter.bi_size); - if (compression_type == BCH_COMPRESSION_LZ4_OLD) - compression_type = BCH_COMPRESSION_LZ4; + if (compression_type == BCH_COMPRESSION_TYPE_lz4_old) + compression_type = BCH_COMPRESSION_TYPE_lz4; compression_type = __bio_compress(c, dst, dst_len, src, src_len, compression_type); @@ -465,15 +465,15 @@ unsigned bch2_bio_compress(struct bch_fs *c, static int __bch2_fs_compress_init(struct bch_fs *, u64); -#define BCH_FEATURE_NONE 0 +#define BCH_FEATURE_none 0 static const unsigned bch2_compression_opt_to_feature[] = { -#define x(t) [BCH_COMPRESSION_OPT_##t] = BCH_FEATURE_##t, - BCH_COMPRESSION_TYPES() +#define x(t, n) [BCH_COMPRESSION_OPT_##t] = BCH_FEATURE_##t, + BCH_COMPRESSION_OPTS() #undef x }; -#undef BCH_FEATURE_NONE +#undef BCH_FEATURE_none static int __bch2_check_set_has_compressed_data(struct bch_fs *c, u64 f) { @@ -537,11 +537,11 @@ static int __bch2_fs_compress_init(struct bch_fs *c, u64 features) size_t compress_workspace; size_t decompress_workspace; } compression_types[] = { - { BCH_FEATURE_LZ4, BCH_COMPRESSION_LZ4, LZ4_MEM_COMPRESS, 0 }, - { BCH_FEATURE_GZIP, BCH_COMPRESSION_GZIP, + { BCH_FEATURE_lz4, BCH_COMPRESSION_TYPE_lz4, LZ4_MEM_COMPRESS, 0 }, + { BCH_FEATURE_gzip, BCH_COMPRESSION_TYPE_gzip, zlib_deflate_workspacesize(MAX_WBITS, DEF_MEM_LEVEL), zlib_inflate_workspacesize(), }, - { BCH_FEATURE_ZSTD, BCH_COMPRESSION_ZSTD, + { BCH_FEATURE_zstd, BCH_COMPRESSION_TYPE_zstd, ZSTD_CCtxWorkspaceBound(params.cParams), ZSTD_DCtxWorkspaceBound() }, }, *i; diff --git 
a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c index 38017699c04a..623b6c3eda95 100644 --- a/fs/bcachefs/dirent.c +++ b/fs/bcachefs/dirent.c @@ -246,7 +246,7 @@ int bch2_dirent_rename(struct btree_trans *trans, */ new_dst->k.p = src_iter->pos; bch2_trans_update(trans, src_iter, - &new_dst->k_i); + &new_dst->k_i, 0); return 0; } else { /* If we're overwriting, we can't insert new_dst @@ -268,8 +268,8 @@ int bch2_dirent_rename(struct btree_trans *trans, } } - bch2_trans_update(trans, src_iter, &new_src->k_i); - bch2_trans_update(trans, dst_iter, &new_dst->k_i); + bch2_trans_update(trans, src_iter, &new_src->k_i, 0); + bch2_trans_update(trans, dst_iter, &new_dst->k_i, 0); return 0; } @@ -281,18 +281,6 @@ int bch2_dirent_delete_at(struct btree_trans *trans, hash_info, iter); } -int bch2_dirent_delete(struct bch_fs *c, u64 dir_inum, - const struct bch_hash_info *hash_info, - const struct qstr *name, - u64 *journal_seq) -{ - return bch2_trans_do(c, journal_seq, - BTREE_INSERT_ATOMIC| - BTREE_INSERT_NOFAIL, - bch2_hash_delete(&trans, bch2_dirent_hash_desc, hash_info, - dir_inum, name)); -} - struct btree_iter * __bch2_dirent_lookup_trans(struct btree_trans *trans, u64 dir_inum, const struct bch_hash_info *hash_info, @@ -343,7 +331,9 @@ int bch2_empty_dir_trans(struct btree_trans *trans, u64 dir_inum) break; } } - bch2_trans_iter_put(trans, iter); + + if (!IS_ERR(iter)) + bch2_trans_iter_put(trans, iter); return ret; } diff --git a/fs/bcachefs/dirent.h b/fs/bcachefs/dirent.h index e6184dc796d3..34769371dd13 100644 --- a/fs/bcachefs/dirent.h +++ b/fs/bcachefs/dirent.h @@ -36,8 +36,6 @@ int bch2_dirent_create(struct btree_trans *, u64, int bch2_dirent_delete_at(struct btree_trans *, const struct bch_hash_info *, struct btree_iter *); -int bch2_dirent_delete(struct bch_fs *, u64, const struct bch_hash_info *, - const struct qstr *, u64 *); enum bch_rename_mode { BCH_RENAME, diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index 5287b5ee7d4a..a49d0745c720 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -736,10 +736,9 @@ found_slot: stripe->k.p = iter->pos; - bch2_trans_update(&trans, iter, &stripe->k_i); + bch2_trans_update(&trans, iter, &stripe->k_i, 0); ret = bch2_trans_commit(&trans, NULL, NULL, - BTREE_INSERT_ATOMIC| BTREE_INSERT_NOFAIL); err: if (ret == -EINTR) @@ -819,10 +818,9 @@ static int ec_stripe_update_ptrs(struct bch_fs *c, extent_stripe_ptr_add(e, s, ec_ptr, idx); - bch2_trans_update(&trans, iter, sk.k); + bch2_trans_update(&trans, iter, sk.k, 0); ret = bch2_trans_commit(&trans, NULL, NULL, - BTREE_INSERT_ATOMIC| BTREE_INSERT_NOFAIL| BTREE_INSERT_USE_RESERVE); if (ret == -EINTR) @@ -1232,7 +1230,7 @@ static int __bch2_stripe_write_key(struct btree_trans *trans, spin_unlock(&c->ec_stripes_heap_lock); - bch2_trans_update(trans, iter, &new_key->k_i); + bch2_trans_update(trans, iter, &new_key->k_i, 0); return bch2_trans_commit(trans, NULL, NULL, BTREE_INSERT_NOFAIL|flags); @@ -1259,8 +1257,13 @@ int bch2_stripes_write(struct bch_fs *c, unsigned flags, bool *wrote) if (!m->dirty) continue; - ret = __bch2_stripe_write_key(&trans, iter, m, giter.pos, - new_key, flags); + do { + bch2_trans_reset(&trans, TRANS_RESET_MEM); + + ret = __bch2_stripe_write_key(&trans, iter, m, + giter.pos, new_key, flags); + } while (ret == -EINTR); + if (ret) break; @@ -1313,8 +1316,8 @@ int bch2_stripes_read(struct bch_fs *c, struct journal_keys *journal_keys) bch2_mark_key(c, btree ? 
btree_k : journal_k, 0, 0, NULL, 0, - BCH_BUCKET_MARK_ALLOC_READ| - BCH_BUCKET_MARK_NOATOMIC); + BTREE_TRIGGER_ALLOC_READ| + BTREE_TRIGGER_NOATOMIC); if (btree) btree_k = bch2_btree_iter_next(btree_iter); diff --git a/fs/bcachefs/error.h b/fs/bcachefs/error.h index 7dcb0f6552fc..de319794ccd1 100644 --- a/fs/bcachefs/error.h +++ b/fs/bcachefs/error.h @@ -17,26 +17,6 @@ struct work_struct; /* Error messages: */ /* - * Very fatal logic/inconsistency errors: these indicate that we've majorly - * screwed up at runtime, i.e. it's not likely that it was just caused by the - * data on disk being inconsistent. These BUG(): - * - * XXX: audit and convert to inconsistent() checks - */ - -#define bch2_fs_bug(c, ...) \ -do { \ - bch_err(c, __VA_ARGS__); \ - BUG(); \ -} while (0) - -#define bch2_fs_bug_on(cond, c, ...) \ -do { \ - if (cond) \ - bch2_fs_bug(c, __VA_ARGS__); \ -} while (0) - -/* * Inconsistency errors: The on disk data is inconsistent. If these occur during * initial recovery, they don't indicate a bug in the running code - we walk all * the metadata before modifying anything. If they occur at runtime, they diff --git a/fs/bcachefs/extent_update.c b/fs/bcachefs/extent_update.c index 742b4d78cb3a..846d77dc2530 100644 --- a/fs/bcachefs/extent_update.c +++ b/fs/bcachefs/extent_update.c @@ -166,54 +166,72 @@ int bch2_extent_is_atomic(struct bkey_i *k, struct btree_iter *iter) enum btree_insert_ret bch2_extent_can_insert(struct btree_trans *trans, - struct btree_insert_entry *insert, + struct btree_iter *iter, + struct bkey_i *insert, unsigned *u64s) { - struct btree_iter_level *l = &insert->iter->l[0]; + struct btree_iter_level *l = &iter->l[0]; struct btree_node_iter node_iter = l->iter; - enum bch_extent_overlap overlap; struct bkey_packed *_k; struct bkey unpacked; - struct bkey_s_c k; int sectors; - /* - * We avoid creating whiteouts whenever possible when deleting, but - * those optimizations mean we may potentially insert two whiteouts - * instead of one (when we overlap with the front of one extent and the - * back of another): - */ - if (bkey_whiteout(&insert->k->k)) - *u64s += BKEY_U64s; - - _k = bch2_btree_node_iter_peek_filter(&node_iter, l->b, - KEY_TYPE_discard); - if (!_k) - return BTREE_INSERT_OK; - - k = bkey_disassemble(l->b, _k, &unpacked); - - overlap = bch2_extent_overlap(&insert->k->k, k.k); - - /* account for having to split existing extent: */ - if (overlap == BCH_EXTENT_OVERLAP_MIDDLE) - *u64s += _k->u64s; - - if (overlap == BCH_EXTENT_OVERLAP_MIDDLE && - (sectors = bch2_bkey_sectors_compressed(k))) { - int flags = trans->flags & BTREE_INSERT_NOFAIL - ? 
BCH_DISK_RESERVATION_NOFAIL : 0; - - switch (bch2_disk_reservation_add(trans->c, - trans->disk_res, - sectors, flags)) { - case 0: + while ((_k = bch2_btree_node_iter_peek_filter(&node_iter, l->b, + KEY_TYPE_discard))) { + struct bkey_s_c k = bkey_disassemble(l->b, _k, &unpacked); + enum bch_extent_overlap overlap = + bch2_extent_overlap(&insert->k, k.k); + + if (bkey_cmp(bkey_start_pos(k.k), insert->k.p) >= 0) break; - case -ENOSPC: - return BTREE_INSERT_ENOSPC; - default: - BUG(); + + overlap = bch2_extent_overlap(&insert->k, k.k); + + /* + * If we're overwriting an existing extent, we may need to emit + * a whiteout - unless we're inserting a new extent at the same + * position: + */ + if (k.k->needs_whiteout && + (!bkey_whiteout(&insert->k) || + bkey_cmp(k.k->p, insert->k.p))) + *u64s += BKEY_U64s; + + /* + * If we're partially overwriting an existing extent which has + * been written out to disk, we'll need to emit a new version of + * that extent: + */ + if (bkey_written(l->b, _k) && + overlap != BCH_EXTENT_OVERLAP_ALL) + *u64s += _k->u64s; + + /* And we may be splitting an existing extent: */ + if (overlap == BCH_EXTENT_OVERLAP_MIDDLE) + *u64s += _k->u64s; + + if (overlap == BCH_EXTENT_OVERLAP_MIDDLE && + (sectors = bch2_bkey_sectors_compressed(k))) { + int flags = trans->flags & BTREE_INSERT_NOFAIL + ? BCH_DISK_RESERVATION_NOFAIL : 0; + + switch (bch2_disk_reservation_add(trans->c, + trans->disk_res, + sectors, flags)) { + case 0: + break; + case -ENOSPC: + return BTREE_INSERT_ENOSPC; + default: + BUG(); + } } + + if (overlap == BCH_EXTENT_OVERLAP_FRONT || + overlap == BCH_EXTENT_OVERLAP_MIDDLE) + break; + + bch2_btree_node_iter_advance(&node_iter, l->b); } return BTREE_INSERT_OK; @@ -284,6 +302,52 @@ static void extent_bset_insert(struct bch_fs *c, struct btree_iter *iter, bch2_btree_node_iter_fix(iter, l->b, &l->iter, k, 0, k->u64s); } +static void pack_push_whiteout(struct bch_fs *c, struct btree *b, + struct bpos pos) +{ + struct bkey_packed k; + + if (!bkey_pack_pos(&k, pos, b)) { + struct bkey_i tmp; + + bkey_init(&tmp.k); + tmp.k.p = pos; + bkey_copy(&k, &tmp); + } + + k.needs_whiteout = true; + push_whiteout(c, b, &k); +} + +static void +extent_drop(struct bch_fs *c, struct btree_iter *iter, + struct bkey_packed *_k, struct bkey_s k) +{ + struct btree_iter_level *l = &iter->l[0]; + + if (!bkey_whiteout(k.k)) + btree_account_key_drop(l->b, _k); + + k.k->size = 0; + k.k->type = KEY_TYPE_deleted; + + if (!btree_node_old_extent_overwrite(l->b) && + k.k->needs_whiteout) { + pack_push_whiteout(c, l->b, k.k->p); + k.k->needs_whiteout = false; + } + + if (_k >= btree_bset_last(l->b)->start) { + unsigned u64s = _k->u64s; + + bch2_bset_delete(l->b, _k, _k->u64s); + bch2_btree_node_iter_fix(iter, l->b, &l->iter, _k, u64s, 0); + } else { + extent_save(l->b, _k, k.k); + bch2_btree_iter_fix_key_modified(iter, l->b, _k); + } +} + static void extent_squash(struct bch_fs *c, struct btree_iter *iter, struct bkey_i *insert, @@ -291,95 +355,117 @@ extent_squash(struct bch_fs *c, struct btree_iter *iter, enum bch_extent_overlap overlap) { struct btree_iter_level *l = &iter->l[0]; - int u64s_delta; + struct bkey_on_stack tmp, split; + + bkey_on_stack_init(&tmp); + bkey_on_stack_init(&split); + + if (!btree_node_old_extent_overwrite(l->b)) { + if (!bkey_whiteout(&insert->k) && + !bkey_cmp(k.k->p, insert->k.p)) { + insert->k.needs_whiteout = k.k->needs_whiteout; + k.k->needs_whiteout = false; + } + } else { + insert->k.needs_whiteout |= k.k->needs_whiteout; + } switch (overlap) { case 
BCH_EXTENT_OVERLAP_FRONT: - /* insert overlaps with start of k: */ - u64s_delta = bch2_cut_front_s(insert->k.p, k); - btree_keys_account_val_delta(l->b, _k, u64s_delta); + if (bkey_written(l->b, _k)) { + bkey_on_stack_reassemble(&tmp, c, k.s_c); + bch2_cut_front(insert->k.p, tmp.k); + + /* + * needs_whiteout was propagated to new version of @k, + * @tmp: + */ + if (!btree_node_old_extent_overwrite(l->b)) + k.k->needs_whiteout = false; + + extent_drop(c, iter, _k, k); + extent_bset_insert(c, iter, tmp.k); + } else { + btree_keys_account_val_delta(l->b, _k, + bch2_cut_front_s(insert->k.p, k)); - EBUG_ON(bkey_deleted(k.k)); - extent_save(l->b, _k, k.k); - bch2_btree_iter_fix_key_modified(iter, l->b, _k); + extent_save(l->b, _k, k.k); + /* + * No need to call bset_fix_invalidated_key, start of + * extent changed but extents are indexed by where they + * end + */ + bch2_btree_iter_fix_key_modified(iter, l->b, _k); + } break; - case BCH_EXTENT_OVERLAP_BACK: - /* insert overlaps with end of k: */ - u64s_delta = bch2_cut_back_s(bkey_start_pos(&insert->k), k); - btree_keys_account_val_delta(l->b, _k, u64s_delta); + if (bkey_written(l->b, _k)) { + bkey_on_stack_reassemble(&tmp, c, k.s_c); + bch2_cut_back(bkey_start_pos(&insert->k), tmp.k); + + /* + * @tmp has different position than @k, needs_whiteout + * should not be propagated: + */ + if (!btree_node_old_extent_overwrite(l->b)) + tmp.k->k.needs_whiteout = false; + + extent_drop(c, iter, _k, k); + extent_bset_insert(c, iter, tmp.k); + } else { + /* + * position of @k is changing, emit a whiteout if + * needs_whiteout is set: + */ + if (!btree_node_old_extent_overwrite(l->b) && + k.k->needs_whiteout) { + pack_push_whiteout(c, l->b, k.k->p); + k.k->needs_whiteout = false; + } - EBUG_ON(bkey_deleted(k.k)); - extent_save(l->b, _k, k.k); + btree_keys_account_val_delta(l->b, _k, + bch2_cut_back_s(bkey_start_pos(&insert->k), k)); + extent_save(l->b, _k, k.k); - /* - * As the auxiliary tree is indexed by the end of the - * key and we've just changed the end, update the - * auxiliary tree. 
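
(Context for the whiteout and fixup shuffling in these hunks: extent keys are indexed by where they end, which is why cutting the front of an extent needs no auxiliary-tree fixup while cutting the back does. A rough illustration of the invariant, assuming the usual bkey layout with k.p as the end position and size as the length; example_start_pos is an invented name:)

	/* an extent occupies [k.p.offset - k.size, k.p.offset) */
	static inline struct bpos example_start_pos(const struct bkey *k)
	{
		struct bpos pos = k->p;

		pos.offset -= k->size;	/* start = end - size */
		return pos;
	}
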
- */ - bch2_bset_fix_invalidated_key(l->b, _k); - bch2_btree_node_iter_fix(iter, l->b, &l->iter, - _k, _k->u64s, _k->u64s); + bch2_bset_fix_invalidated_key(l->b, _k); + bch2_btree_node_iter_fix(iter, l->b, &l->iter, + _k, _k->u64s, _k->u64s); + } + break; + case BCH_EXTENT_OVERLAP_ALL: + extent_drop(c, iter, _k, k); break; + case BCH_EXTENT_OVERLAP_MIDDLE: + bkey_on_stack_reassemble(&split, c, k.s_c); + bch2_cut_back(bkey_start_pos(&insert->k), split.k); - case BCH_EXTENT_OVERLAP_ALL: { - /* The insert key completely covers k, invalidate k */ - if (!bkey_whiteout(k.k)) - btree_account_key_drop(l->b, _k); + if (!btree_node_old_extent_overwrite(l->b)) + split.k->k.needs_whiteout = false; - k.k->size = 0; - k.k->type = KEY_TYPE_deleted; + /* this is identical to BCH_EXTENT_OVERLAP_FRONT: */ + if (bkey_written(l->b, _k)) { + bkey_on_stack_reassemble(&tmp, c, k.s_c); + bch2_cut_front(insert->k.p, tmp.k); - if (_k >= btree_bset_last(l->b)->start) { - unsigned u64s = _k->u64s; + if (!btree_node_old_extent_overwrite(l->b)) + k.k->needs_whiteout = false; - bch2_bset_delete(l->b, _k, _k->u64s); - bch2_btree_node_iter_fix(iter, l->b, &l->iter, - _k, u64s, 0); + extent_drop(c, iter, _k, k); + extent_bset_insert(c, iter, tmp.k); } else { + btree_keys_account_val_delta(l->b, _k, + bch2_cut_front_s(insert->k.p, k)); + extent_save(l->b, _k, k.k); bch2_btree_iter_fix_key_modified(iter, l->b, _k); } - break; - } - case BCH_EXTENT_OVERLAP_MIDDLE: { - struct bkey_on_stack split; - - bkey_on_stack_init(&split); - bkey_on_stack_reassemble(&split, c, k.s_c); - - /* - * The insert key falls 'in the middle' of k - * The insert key splits k in 3: - * - start only in k, preserve - * - middle common section, invalidate in k - * - end only in k, preserve - * - * We update the old key to preserve the start, - * insert will be the new common section, - * we manually insert the end that we are preserving. - * - * modify k _before_ doing the insert (which will move - * what k points to) - */ - split.k->k.needs_whiteout |= bkey_written(l->b, _k); - - bch2_cut_back(bkey_start_pos(&insert->k), split.k); - BUG_ON(bkey_deleted(&split.k->k)); - - u64s_delta = bch2_cut_front_s(insert->k.p, k); - btree_keys_account_val_delta(l->b, _k, u64s_delta); - - BUG_ON(bkey_deleted(k.k)); - extent_save(l->b, _k, k.k); - bch2_btree_iter_fix_key_modified(iter, l->b, _k); - extent_bset_insert(c, iter, split.k); - bkey_on_stack_exit(&split, c); break; } - } + + bkey_on_stack_exit(&split, c); + bkey_on_stack_exit(&tmp, c); } /** @@ -422,17 +508,13 @@ extent_squash(struct bch_fs *c, struct btree_iter *iter, * key insertion needs to continue/be retried. 
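
(Before the function body below, it may help to restate the four overlap classes that extent_squash() dispatches on. This is a hedged sketch of the classification only, with invented names; the tree's actual helper lives elsewhere:)

	enum example_overlap { FRONT, BACK, ALL, MIDDLE };

	/* how does an inserted range overlap an existing key k? */
	static enum example_overlap
	example_overlap(u64 ins_start, u64 ins_end, u64 k_start, u64 k_end)
	{
		bool covers_start = ins_start <= k_start;
		bool covers_end   = ins_end   >= k_end;

		if (covers_start && covers_end)
			return ALL;	/* k is dropped entirely */
		if (covers_start)
			return FRONT;	/* cut the front off k */
		if (covers_end)
			return BACK;	/* cut the back off k */
		return MIDDLE;		/* k must be split in three */
	}
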
*/ void bch2_insert_fixup_extent(struct btree_trans *trans, - struct btree_insert_entry *insert_entry) + struct btree_iter *iter, + struct bkey_i *insert) { struct bch_fs *c = trans->c; - struct btree_iter *iter = insert_entry->iter; - struct bkey_i *insert = insert_entry->k; struct btree_iter_level *l = &iter->l[0]; struct btree_node_iter node_iter = l->iter; - bool deleting = bkey_whiteout(&insert->k); - bool update_journal = !deleting; - bool update_btree = !deleting; - struct bkey_i whiteout = *insert; + bool do_update = !bkey_whiteout(&insert->k); struct bkey_packed *_k; struct bkey unpacked; @@ -443,7 +525,6 @@ void bch2_insert_fixup_extent(struct btree_trans *trans, while ((_k = bch2_btree_node_iter_peek_filter(&l->iter, l->b, KEY_TYPE_discard))) { struct bkey_s k = __bkey_disassemble(l->b, _k, &unpacked); - struct bpos cur_end = bpos_min(insert->k.p, k.k->p); enum bch_extent_overlap overlap = bch2_extent_overlap(&insert->k, k.k); @@ -451,52 +532,17 @@ void bch2_insert_fixup_extent(struct btree_trans *trans, break; if (!bkey_whiteout(k.k)) - update_journal = true; + do_update = true; + + if (!do_update) { + struct bpos cur_end = bpos_min(insert->k.p, k.k->p); - if (!update_journal) { bch2_cut_front(cur_end, insert); - bch2_cut_front(cur_end, &whiteout); bch2_btree_iter_set_pos_same_leaf(iter, cur_end); - goto next; - } - - /* - * When deleting, if possible just do it by switching the type - * of the key we're deleting, instead of creating and inserting - * a new whiteout: - */ - if (deleting && - !update_btree && - !bkey_cmp(insert->k.p, k.k->p) && - !bkey_cmp(bkey_start_pos(&insert->k), bkey_start_pos(k.k))) { - if (!bkey_whiteout(k.k)) { - btree_account_key_drop(l->b, _k); - _k->type = KEY_TYPE_discard; - reserve_whiteout(l->b, _k); - bch2_btree_iter_fix_key_modified(iter, - l->b, _k); - } - break; - } - - if (k.k->needs_whiteout || bkey_written(l->b, _k)) { - insert->k.needs_whiteout = true; - update_btree = true; - } - - if (update_btree && - overlap == BCH_EXTENT_OVERLAP_ALL && - bkey_whiteout(k.k) && - k.k->needs_whiteout) { - unreserve_whiteout(l->b, _k); - _k->needs_whiteout = false; + } else { + extent_squash(c, iter, insert, _k, k, overlap); } - extent_squash(c, iter, insert, _k, k, overlap); - - if (!update_btree) - bch2_cut_front(cur_end, insert); -next: node_iter = l->iter; if (overlap == BCH_EXTENT_OVERLAP_FRONT || @@ -507,24 +553,15 @@ next: l->iter = node_iter; bch2_btree_iter_set_pos_same_leaf(iter, insert->k.p); - if (update_btree) { - if (deleting) + if (do_update) { + if (insert->k.type == KEY_TYPE_deleted) insert->k.type = KEY_TYPE_discard; - EBUG_ON(bkey_deleted(&insert->k) || !insert->k.size); - - extent_bset_insert(c, iter, insert); - } - - if (update_journal) { - struct bkey_i *k = !deleting ? 
insert : &whiteout; - - if (deleting) - k->k.type = KEY_TYPE_discard; - - EBUG_ON(bkey_deleted(&k->k) || !k->k.size); + if (!bkey_whiteout(&insert->k) || + btree_node_old_extent_overwrite(l->b)) + extent_bset_insert(c, iter, insert); - bch2_btree_journal_key(trans, iter, k); + bch2_btree_journal_key(trans, iter, insert); } bch2_cut_front(insert->k.p, insert); diff --git a/fs/bcachefs/extent_update.h b/fs/bcachefs/extent_update.h index 89d18e4b6758..e9dc8091ba3f 100644 --- a/fs/bcachefs/extent_update.h +++ b/fs/bcachefs/extent_update.h @@ -10,9 +10,10 @@ int bch2_extent_trim_atomic(struct bkey_i *, struct btree_iter *); int bch2_extent_is_atomic(struct bkey_i *, struct btree_iter *); enum btree_insert_ret -bch2_extent_can_insert(struct btree_trans *, struct btree_insert_entry *, - unsigned *); +bch2_extent_can_insert(struct btree_trans *, struct btree_iter *, + struct bkey_i *, unsigned *); void bch2_insert_fixup_extent(struct btree_trans *, - struct btree_insert_entry *); + struct btree_iter *, + struct bkey_i *); #endif /* _BCACHEFS_EXTENT_UPDATE_H */ diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index 6bcc178604b0..c4b0b9e15a8f 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -172,14 +172,17 @@ void bch2_btree_ptr_debugcheck(struct bch_fs *c, struct bkey_s_c k) struct bucket_mark mark; struct bch_dev *ca; - bch2_fs_bug_on(!test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) && - !bch2_bkey_replicas_marked(c, k, false), c, - "btree key bad (replicas not marked in superblock):\n%s", - (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)); - if (!test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags)) return; + if (!percpu_down_read_trylock(&c->mark_lock)) + return; + + bch2_fs_inconsistent_on(!test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) && + !bch2_bkey_replicas_marked(c, k, false), c, + "btree key bad (replicas not marked in superblock):\n%s", + (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)); + bkey_for_each_ptr(ptrs, ptr) { ca = bch_dev_bkey_exists(c, ptr->dev); @@ -194,13 +197,15 @@ void bch2_btree_ptr_debugcheck(struct bch_fs *c, struct bkey_s_c k) mark.dirty_sectors < c->opts.btree_node_size) goto err; } - +out: + percpu_up_read(&c->mark_lock); return; err: - bch2_bkey_val_to_text(&PBUF(buf), c, k); - bch2_fs_bug(c, "%s btree pointer %s: bucket %zi gen %i mark %08x", - err, buf, PTR_BUCKET_NR(ca, ptr), - mark.gen, (unsigned) mark.v.counter); + bch2_fs_inconsistent(c, "%s btree pointer %s: bucket %zi gen %i mark %08x", + err, (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf), + PTR_BUCKET_NR(ca, ptr), + mark.gen, (unsigned) mark.v.counter); + goto out; } void bch2_btree_ptr_to_text(struct printbuf *out, struct bch_fs *c, @@ -223,29 +228,18 @@ void bch2_extent_debugcheck(struct bch_fs *c, struct bkey_s_c k) struct extent_ptr_decoded p; char buf[160]; - /* - * XXX: we should be doing most/all of these checks at startup time, - * where we check bch2_bkey_invalid() in btree_node_read_done() - * - * But note that we can't check for stale pointers or incorrect gc marks - * until after journal replay is done (it might be an extent that's - * going to get overwritten during replay) - */ - - if (percpu_down_read_trylock(&c->mark_lock)) { - bch2_fs_bug_on(!test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) && - !bch2_bkey_replicas_marked_locked(c, e.s_c, false), c, - "extent key bad (replicas not marked in superblock):\n%s", - (bch2_bkey_val_to_text(&PBUF(buf), c, e.s_c), buf)); - percpu_up_read(&c->mark_lock); - } - /* - * If journal replay hasn't finished, we might be seeing keys - * that will 
be overwritten by the time journal replay is done: - */ - if (!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags)) + if (!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags) || + !test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags)) return; + if (!percpu_down_read_trylock(&c->mark_lock)) + return; + + bch2_fs_inconsistent_on(!test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) && + !bch2_bkey_replicas_marked_locked(c, e.s_c, false), c, + "extent key bad (replicas not marked in superblock):\n%s", + (bch2_bkey_val_to_text(&PBUF(buf), c, e.s_c), buf)); + extent_for_each_ptr_decode(e, p, entry) { struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev); struct bucket_mark mark = ptr_bucket_mark(ca, &p.ptr); @@ -255,21 +249,24 @@ void bch2_extent_debugcheck(struct bch_fs *c, struct bkey_s_c k) ? mark.cached_sectors : mark.dirty_sectors; - bch2_fs_bug_on(stale && !p.ptr.cached, c, - "stale dirty pointer (ptr gen %u bucket %u", - p.ptr.gen, mark.gen); - - bch2_fs_bug_on(stale > 96, c, "key too stale: %i", stale); - - bch2_fs_bug_on(!stale && - (mark.data_type != BCH_DATA_USER || - mark_sectors < disk_sectors), c, - "extent pointer not marked: %s:\n" - "type %u sectors %u < %u", - (bch2_bkey_val_to_text(&PBUF(buf), c, e.s_c), buf), - mark.data_type, - mark_sectors, disk_sectors); + bch2_fs_inconsistent_on(stale && !p.ptr.cached, c, + "stale dirty pointer (ptr gen %u bucket %u", + p.ptr.gen, mark.gen); + + bch2_fs_inconsistent_on(stale > 96, c, + "key too stale: %i", stale); + + bch2_fs_inconsistent_on(!stale && + (mark.data_type != BCH_DATA_USER || + mark_sectors < disk_sectors), c, + "extent pointer not marked: %s:\n" + "type %u sectors %u < %u", + (bch2_bkey_val_to_text(&PBUF(buf), c, e.s_c), buf), + mark.data_type, + mark_sectors, disk_sectors); } + + percpu_up_read(&c->mark_lock); } void bch2_extent_to_text(struct printbuf *out, struct bch_fs *c, @@ -614,7 +611,7 @@ unsigned bch2_bkey_nr_ptrs_fully_allocated(struct bkey_s_c k) bkey_for_each_ptr_decode(k.k, ptrs, p, entry) ret += !p.ptr.cached && - p.crc.compression_type == BCH_COMPRESSION_NONE; + p.crc.compression_type == BCH_COMPRESSION_TYPE_none; } return ret; @@ -629,7 +626,7 @@ unsigned bch2_bkey_sectors_compressed(struct bkey_s_c k) bkey_for_each_ptr_decode(k.k, ptrs, p, entry) if (!p.ptr.cached && - p.crc.compression_type != BCH_COMPRESSION_NONE) + p.crc.compression_type != BCH_COMPRESSION_TYPE_none) ret += p.crc.compressed_size; return ret; @@ -1054,7 +1051,7 @@ const char *bch2_bkey_ptrs_invalid(const struct bch_fs *c, struct bkey_s_c k) if (!bch2_checksum_type_valid(c, crc.csum_type)) return "invalid checksum type"; - if (crc.compression_type >= BCH_COMPRESSION_NR) + if (crc.compression_type >= BCH_COMPRESSION_TYPE_NR) return "invalid compression type"; if (bch2_csum_type_is_encryption(crc.csum_type)) { diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h index 1140d01a42ab..7c5a41e6d79d 100644 --- a/fs/bcachefs/extents.h +++ b/fs/bcachefs/extents.h @@ -8,7 +8,6 @@ struct bch_fs; struct btree_trans; -struct btree_insert_entry; /* extent entries: */ diff --git a/fs/bcachefs/fs-common.c b/fs/bcachefs/fs-common.c index a4497eeb1f1b..96f7bbe0a3ed 100644 --- a/fs/bcachefs/fs-common.c +++ b/fs/bcachefs/fs-common.c @@ -76,11 +76,10 @@ int bch2_create_trans(struct btree_trans *trans, u64 dir_inum, } int bch2_link_trans(struct btree_trans *trans, u64 dir_inum, - u64 inum, struct bch_inode_unpacked *inode_u, - const struct qstr *name) + u64 inum, struct bch_inode_unpacked *dir_u, + struct bch_inode_unpacked *inode_u, const struct qstr *name) { struct 
btree_iter *dir_iter, *inode_iter; - struct bch_inode_unpacked dir_u; struct bch_hash_info dir_hash; u64 now = bch2_current_time(trans->c); @@ -91,18 +90,19 @@ int bch2_link_trans(struct btree_trans *trans, u64 dir_inum, inode_u->bi_ctime = now; bch2_inode_nlink_inc(inode_u); - dir_iter = bch2_inode_peek(trans, &dir_u, dir_inum, 0); + dir_iter = bch2_inode_peek(trans, dir_u, dir_inum, 0); if (IS_ERR(dir_iter)) return PTR_ERR(dir_iter); - /* XXX: shouldn't we be updating mtime/ctime on the directory? */ + dir_u->bi_mtime = dir_u->bi_ctime = now; - dir_hash = bch2_hash_info_init(trans->c, &dir_u); + dir_hash = bch2_hash_info_init(trans->c, dir_u); bch2_trans_iter_put(trans, dir_iter); return bch2_dirent_create(trans, dir_inum, &dir_hash, mode_to_type(inode_u->bi_mode), name, inum, BCH_HASH_SET_MUST_CREATE) ?: + bch2_inode_write(trans, dir_iter, dir_u) ?: bch2_inode_write(trans, inode_iter, inode_u); } diff --git a/fs/bcachefs/fs-common.h b/fs/bcachefs/fs-common.h index c1621485a526..2273b7961c9b 100644 --- a/fs/bcachefs/fs-common.h +++ b/fs/bcachefs/fs-common.h @@ -14,6 +14,7 @@ int bch2_create_trans(struct btree_trans *, u64, int bch2_link_trans(struct btree_trans *, u64, u64, struct bch_inode_unpacked *, + struct bch_inode_unpacked *, const struct qstr *); int bch2_unlink_trans(struct btree_trans *, diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index 274a8b364702..f393ecb93b7e 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -602,7 +602,7 @@ int bch2_migrate_page(struct address_space *mapping, struct page *newpage, EBUG_ON(!PageLocked(page)); EBUG_ON(!PageLocked(newpage)); - ret = migrate_page_move_mapping(mapping, newpage, page, NULL, mode, 0); + ret = migrate_page_move_mapping(mapping, newpage, page, 0); if (ret != MIGRATEPAGE_SUCCESS) return ret; @@ -627,10 +627,10 @@ int bch2_migrate_page(struct address_space *mapping, struct page *newpage, static void bch2_readpages_end_io(struct bio *bio) { + struct bvec_iter_all iter; struct bio_vec *bv; - unsigned i; - bio_for_each_segment_all(bv, bio, i) { + bio_for_each_segment_all(bv, bio, iter) { struct page *page = bv->bv_page; if (!bio->bi_status) { @@ -782,11 +782,8 @@ static void readpage_bio_extend(struct readpages_iter *iter, if (!get_more) break; - rcu_read_lock(); - page = radix_tree_lookup(&iter->mapping->i_pages, page_offset); - rcu_read_unlock(); - - if (page && !radix_tree_exceptional_entry(page)) + page = xa_load(&iter->mapping->i_pages, page_offset); + if (page && !xa_is_value(page)) break; page = __page_cache_alloc(readahead_gfp_mask(iter->mapping)); @@ -1037,32 +1034,33 @@ static void bch2_writepage_io_done(struct closure *cl) struct bch_writepage_io, cl); struct bch_fs *c = io->op.c; struct bio *bio = &io->op.wbio.bio; + struct bvec_iter_all iter; struct bio_vec *bvec; - unsigned i, j; + unsigned i; if (io->op.error) { - bio_for_each_segment_all(bvec, bio, i) { + bio_for_each_segment_all(bvec, bio, iter) { struct bch_page_state *s; SetPageError(bvec->bv_page); - mapping_set_error(io->inode->v.i_mapping, -EIO); + mapping_set_error(bvec->bv_page->mapping, -EIO); s = __bch2_page_state(bvec->bv_page); spin_lock(&s->lock); - for (j = 0; j < PAGE_SECTORS; j++) - s->s[j].nr_replicas = 0; + for (i = 0; i < PAGE_SECTORS; i++) + s->s[i].nr_replicas = 0; spin_unlock(&s->lock); } } if (io->op.flags & BCH_WRITE_WROTE_DATA_INLINE) { - bio_for_each_segment_all(bvec, bio, i) { + bio_for_each_segment_all(bvec, bio, iter) { struct bch_page_state *s; s = __bch2_page_state(bvec->bv_page); spin_lock(&s->lock); - for (j = 0; j 
< PAGE_SECTORS; j++) - s->s[j].nr_replicas = 0; + for (i = 0; i < PAGE_SECTORS; i++) + s->s[i].nr_replicas = 0; spin_unlock(&s->lock); } } @@ -1086,7 +1084,7 @@ static void bch2_writepage_io_done(struct closure *cl) */ i_sectors_acct(c, io->inode, NULL, io->op.i_sectors_delta); - bio_for_each_segment_all(bvec, bio, i) { + bio_for_each_segment_all(bvec, bio, iter) { struct bch_page_state *s = __bch2_page_state(bvec->bv_page); if (atomic_dec_and_test(&s->write_count)) @@ -1240,7 +1238,7 @@ do_io: if (w->io && (w->io->op.res.nr_replicas != nr_replicas_this_write || - bio_full(&w->io->op.wbio.bio) || + bio_full(&w->io->op.wbio.bio, PAGE_SIZE) || w->io->op.wbio.bio.bi_iter.bi_size >= (256U << 20) || bio_end_sector(&w->io->op.wbio.bio) != sector)) bch2_writepage_do_io(w); @@ -1805,8 +1803,9 @@ static long bch2_dio_write_loop(struct dio_write *dio) struct address_space *mapping = req->ki_filp->f_mapping; struct bch_inode_info *inode = file_bch_inode(req->ki_filp); struct bio *bio = &dio->op.wbio.bio; + struct bvec_iter_all iter; struct bio_vec *bv; - unsigned i, unaligned; + unsigned unaligned; u64 new_i_size; bool sync; long ret; @@ -1838,7 +1837,7 @@ static long bch2_dio_write_loop(struct dio_write *dio) * bio_iov_iter_get_pages was only able to get < * blocksize worth of pages: */ - bio_for_each_segment_all(bv, bio, i) + bio_for_each_segment_all(bv, bio, iter) put_page(bv->bv_page); ret = -EFAULT; goto err; @@ -1886,7 +1885,7 @@ loop: i_size_write(&inode->v, new_i_size); spin_unlock(&inode->v.i_lock); - bio_for_each_segment_all(bv, bio, i) + bio_for_each_segment_all(bv, bio, iter) put_page(bv->bv_page); if (!dio->iter.count || dio->op.error) break; @@ -2288,6 +2287,15 @@ int bch2_truncate(struct bch_inode_info *inode, struct iattr *iattr) if (ret) goto err; + /* + * check this before next assertion; on filesystem error our normal + * invariants are a bit broken (truncate has to truncate the page cache + * before the inode). + */ + ret = bch2_journal_error(&c->journal); + if (ret) + goto err; + BUG_ON(inode->v.i_size < inode_u.bi_size); if (iattr->ia_size > inode->v.i_size) { @@ -2403,7 +2411,7 @@ static long bchfs_fcollapse_finsert(struct bch_inode_info *inode, struct address_space *mapping = inode->v.i_mapping; struct bkey_on_stack copy; struct btree_trans trans; - struct btree_iter *src, *dst, *del = NULL; + struct btree_iter *src, *dst; loff_t shift, new_size; u64 src_start; int ret; @@ -2485,9 +2493,7 @@ static long bchfs_fcollapse_finsert(struct bch_inode_info *inode, struct bpos next_pos; struct bpos move_pos = POS(inode->v.i_ino, offset >> 9); struct bpos atomic_end; - unsigned commit_flags = BTREE_INSERT_NOFAIL| - BTREE_INSERT_ATOMIC| - BTREE_INSERT_USE_RESERVE; + unsigned trigger_flags = 0; k = insert ? bch2_btree_iter_peek_prev(src) @@ -2535,38 +2541,12 @@ reassemble: next_pos = insert ? 
bkey_start_pos(&delete.k) : delete.k.p; - /* - * If the new and old keys overlap (because we're moving an - * extent that's bigger than the amount we're collapsing by), - * we need to trim the delete key here so they don't overlap - * because overlaps on insertions aren't handled before - * triggers are run, so the overwrite will get double counted - * by the triggers machinery: - */ - if (insert && - bkey_cmp(bkey_start_pos(©.k->k), delete.k.p) < 0) { - bch2_cut_back(bkey_start_pos(©.k->k), &delete); - } else if (!insert && - bkey_cmp(copy.k->k.p, - bkey_start_pos(&delete.k)) > 0) { - bch2_cut_front(copy.k->k.p, &delete); - - del = bch2_trans_copy_iter(&trans, src); - BUG_ON(IS_ERR_OR_NULL(del)); - - bch2_btree_iter_set_pos(del, - bkey_start_pos(&delete.k)); - } - - bch2_trans_update(&trans, dst, copy.k); - bch2_trans_update(&trans, del ?: src, &delete); - if (copy.k->k.size == k.k->size) { /* * If we're moving the entire extent, we can skip * running triggers: */ - commit_flags |= BTREE_INSERT_NOMARK; + trigger_flags |= BTREE_TRIGGER_NORUN; } else { /* We might end up splitting compressed extents: */ unsigned nr_ptrs = @@ -2578,15 +2558,13 @@ reassemble: BUG_ON(ret); } - ret = bch2_trans_commit(&trans, &disk_res, - &inode->ei_journal_seq, - commit_flags); + ret = bch2_trans_update(&trans, src, &delete, trigger_flags) ?: + bch2_trans_update(&trans, dst, copy.k, trigger_flags) ?: + bch2_trans_commit(&trans, &disk_res, + &inode->ei_journal_seq, + BTREE_INSERT_NOFAIL); bch2_disk_reservation_put(c, &disk_res); bkey_err: - if (del) - bch2_trans_iter_put(&trans, del); - del = NULL; - if (!ret) bch2_btree_iter_set_pos(src, next_pos); @@ -2670,6 +2648,8 @@ static long bchfs_fallocate(struct bch_inode_info *inode, int mode, struct bkey_i_reservation reservation; struct bkey_s_c k; + bch2_trans_reset(&trans, TRANS_RESET_MEM); + k = bch2_btree_iter_peek_slot(iter); if ((ret = bkey_err(k))) goto bkey_err; @@ -2716,8 +2696,6 @@ static long bchfs_fallocate(struct bch_inode_info *inode, int mode, reservation.v.nr_replicas = disk_res.nr_replicas; } - bch2_trans_begin_updates(&trans); - ret = bch2_extent_update(&trans, iter, &reservation.k_i, &disk_res, &inode->ei_journal_seq, 0, &i_sectors_delta); @@ -2844,235 +2822,6 @@ static void mark_range_unallocated(struct bch_inode_info *inode, } while (index <= end_index); } -static int generic_access_check_limits(struct file *file, loff_t pos, - loff_t *count) -{ - struct inode *inode = file->f_mapping->host; - loff_t max_size = inode->i_sb->s_maxbytes; - - if (!(file->f_flags & O_LARGEFILE)) - max_size = MAX_NON_LFS; - - if (unlikely(pos >= max_size)) - return -EFBIG; - *count = min(*count, max_size - pos); - return 0; -} - -static int generic_write_check_limits(struct file *file, loff_t pos, - loff_t *count) -{ - loff_t limit = rlimit(RLIMIT_FSIZE); - - if (limit != RLIM_INFINITY) { - if (pos >= limit) { - send_sig(SIGXFSZ, current, 0); - return -EFBIG; - } - *count = min(*count, limit - pos); - } - - return generic_access_check_limits(file, pos, count); -} - -static int generic_remap_checks(struct file *file_in, loff_t pos_in, - struct file *file_out, loff_t pos_out, - loff_t *req_count, unsigned int remap_flags) -{ - struct inode *inode_in = file_in->f_mapping->host; - struct inode *inode_out = file_out->f_mapping->host; - uint64_t count = *req_count; - uint64_t bcount; - loff_t size_in, size_out; - loff_t bs = inode_out->i_sb->s_blocksize; - int ret; - - /* The start of both ranges must be aligned to an fs block. 
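
(The generic_* helpers deleted in this hunk read like local copies of the same-named VFS remap helpers, which the target kernel presumably now provides. For orientation, a sketch of how a remap_file_range implementation typically drives the prep step; example_remap_file_range is hypothetical, and the generic_remap_file_range_prep signature is taken from the removed code, with error handling elided:)

	static loff_t example_remap_file_range(struct file *src, loff_t pos_src,
					       struct file *dst, loff_t pos_dst,
					       loff_t len, unsigned remap_flags)
	{
		int ret;

		/* alignment, EOF and dedupe-equality checks; may shorten len: */
		ret = generic_remap_file_range_prep(src, pos_src, dst, pos_dst,
						    &len, remap_flags);
		if (ret < 0 || len == 0)
			return ret;

		/* ...filesystem-specific extent sharing goes here... */
		return len;
	}
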
*/ - if (!IS_ALIGNED(pos_in, bs) || !IS_ALIGNED(pos_out, bs)) - return -EINVAL; - - /* Ensure offsets don't wrap. */ - if (pos_in + count < pos_in || pos_out + count < pos_out) - return -EINVAL; - - size_in = i_size_read(inode_in); - size_out = i_size_read(inode_out); - - /* Dedupe requires both ranges to be within EOF. */ - if ((remap_flags & REMAP_FILE_DEDUP) && - (pos_in >= size_in || pos_in + count > size_in || - pos_out >= size_out || pos_out + count > size_out)) - return -EINVAL; - - /* Ensure the infile range is within the infile. */ - if (pos_in >= size_in) - return -EINVAL; - count = min(count, size_in - (uint64_t)pos_in); - - ret = generic_access_check_limits(file_in, pos_in, &count); - if (ret) - return ret; - - ret = generic_write_check_limits(file_out, pos_out, &count); - if (ret) - return ret; - - /* - * If the user wanted us to link to the infile's EOF, round up to the - * next block boundary for this check. - * - * Otherwise, make sure the count is also block-aligned, having - * already confirmed the starting offsets' block alignment. - */ - if (pos_in + count == size_in) { - bcount = ALIGN(size_in, bs) - pos_in; - } else { - if (!IS_ALIGNED(count, bs)) - count = ALIGN_DOWN(count, bs); - bcount = count; - } - - /* Don't allow overlapped cloning within the same file. */ - if (inode_in == inode_out && - pos_out + bcount > pos_in && - pos_out < pos_in + bcount) - return -EINVAL; - - /* - * We shortened the request but the caller can't deal with that, so - * bounce the request back to userspace. - */ - if (*req_count != count && !(remap_flags & REMAP_FILE_CAN_SHORTEN)) - return -EINVAL; - - *req_count = count; - return 0; -} - -static int generic_remap_check_len(struct inode *inode_in, - struct inode *inode_out, - loff_t pos_out, - loff_t *len, - unsigned int remap_flags) -{ - u64 blkmask = i_blocksize(inode_in) - 1; - loff_t new_len = *len; - - if ((*len & blkmask) == 0) - return 0; - - if ((remap_flags & REMAP_FILE_DEDUP) || - pos_out + *len < i_size_read(inode_out)) - new_len &= ~blkmask; - - if (new_len == *len) - return 0; - - if (remap_flags & REMAP_FILE_CAN_SHORTEN) { - *len = new_len; - return 0; - } - - return (remap_flags & REMAP_FILE_DEDUP) ? -EBADE : -EINVAL; -} - -static int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in, - struct file *file_out, loff_t pos_out, - loff_t *len, unsigned int remap_flags) -{ - struct inode *inode_in = file_inode(file_in); - struct inode *inode_out = file_inode(file_out); - bool same_inode = (inode_in == inode_out); - int ret; - - /* Don't touch certain kinds of inodes */ - if (IS_IMMUTABLE(inode_out)) - return -EPERM; - - if (IS_SWAPFILE(inode_in) || IS_SWAPFILE(inode_out)) - return -ETXTBSY; - - /* Don't reflink dirs, pipes, sockets... */ - if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode)) - return -EISDIR; - if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode)) - return -EINVAL; - - /* Zero length dedupe exits immediately; reflink goes to EOF. */ - if (*len == 0) { - loff_t isize = i_size_read(inode_in); - - if ((remap_flags & REMAP_FILE_DEDUP) || pos_in == isize) - return 0; - if (pos_in > isize) - return -EINVAL; - *len = isize - pos_in; - if (*len == 0) - return 0; - } - - /* Check that we don't violate system file offset limits. 
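
(Condensing the two removed limit helpers above into one illustration: the request is clamped against RLIMIT_FSIZE and the superblock's s_maxbytes, shortening the count where permitted rather than failing outright. Same semantics as the removed code; example_write_limits is an invented name and details such as SIGXFSZ delivery and the O_LARGEFILE cap are elided:)

	static int example_write_limits(struct file *file, loff_t pos,
					loff_t *count)
	{
		struct inode *inode = file->f_mapping->host;
		loff_t max_size = inode->i_sb->s_maxbytes;
		loff_t limit    = rlimit(RLIMIT_FSIZE);

		if (limit != RLIM_INFINITY) {
			if (pos >= limit)
				return -EFBIG;
			*count = min(*count, limit - pos);
		}

		if (pos >= max_size)
			return -EFBIG;
		*count = min(*count, max_size - pos);
		return 0;
	}
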
*/ - ret = generic_remap_checks(file_in, pos_in, file_out, pos_out, len, - remap_flags); - if (ret) - return ret; - - /* Wait for the completion of any pending IOs on both files */ - inode_dio_wait(inode_in); - if (!same_inode) - inode_dio_wait(inode_out); - - ret = filemap_write_and_wait_range(inode_in->i_mapping, - pos_in, pos_in + *len - 1); - if (ret) - return ret; - - ret = filemap_write_and_wait_range(inode_out->i_mapping, - pos_out, pos_out + *len - 1); - if (ret) - return ret; - - /* - * Check that the extents are the same. - */ - if (remap_flags & REMAP_FILE_DEDUP) { - bool is_same = false; - - ret = vfs_dedupe_file_range_compare(inode_in, pos_in, - inode_out, pos_out, *len, &is_same); - if (ret) - return ret; - if (!is_same) - return -EBADE; - } - - ret = generic_remap_check_len(inode_in, inode_out, pos_out, len, - remap_flags); - if (ret) - return ret; - - /* If can't alter the file contents, we're done. */ - if (!(remap_flags & REMAP_FILE_DEDUP)) { - /* Update the timestamps, since we can alter file contents. */ - if (!(file_out->f_mode & FMODE_NOCMTIME)) { - ret = file_update_time(file_out); - if (ret) - return ret; - } - - /* - * Clear the security bits if the process is not being run by - * root. This keeps people from modifying setuid and setgid - * binaries. - */ - ret = file_remove_privs(file_out); - if (ret) - return ret; - } - - return 0; -} - loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src, struct file *file_dst, loff_t pos_dst, loff_t len, unsigned remap_flags) @@ -3265,7 +3014,7 @@ static loff_t page_hole_offset(struct address_space *mapping, loff_t offset) loff_t ret = -1; page = find_lock_entry(mapping, index); - if (!page || radix_tree_exception(page)) + if (!page || xa_is_value(page)) return offset; pg_offset = __page_hole_offset(page, offset & (PAGE_SIZE - 1)); diff --git a/fs/bcachefs/fs-io.h b/fs/bcachefs/fs-io.h index 1b593ea707d5..7063556d289b 100644 --- a/fs/bcachefs/fs-io.h +++ b/fs/bcachefs/fs-io.h @@ -35,10 +35,6 @@ int bch2_fsync(struct file *, loff_t, loff_t, int); int bch2_truncate(struct bch_inode_info *, struct iattr *); long bch2_fallocate_dispatch(struct file *, int, loff_t, loff_t); -#define REMAP_FILE_ADVISORY (0) -#define REMAP_FILE_DEDUP (1 << 0) -#define REMAP_FILE_CAN_SHORTEN (1 << 1) - loff_t bch2_remap_file_range(struct file *, loff_t, struct file *, loff_t, loff_t, unsigned); diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index e627044c3409..f14f880534fd 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -140,7 +140,6 @@ retry: bch2_inode_write(&trans, iter, &inode_u) ?: bch2_trans_commit(&trans, NULL, &inode->ei_journal_seq, - BTREE_INSERT_ATOMIC| BTREE_INSERT_NOUNLOCK| BTREE_INSERT_NOFAIL); if (ret == -EINTR) @@ -269,7 +268,6 @@ retry: goto err_before_quota; ret = bch2_trans_commit(&trans, NULL, &journal_seq, - BTREE_INSERT_ATOMIC| BTREE_INSERT_NOUNLOCK); if (unlikely(ret)) { bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, -1, @@ -379,7 +377,7 @@ static int __bch2_link(struct bch_fs *c, struct dentry *dentry) { struct btree_trans trans; - struct bch_inode_unpacked inode_u; + struct bch_inode_unpacked dir_u, inode_u; int ret; mutex_lock(&inode->ei_update_lock); @@ -389,16 +387,21 @@ static int __bch2_link(struct bch_fs *c, bch2_trans_begin(&trans); ret = bch2_link_trans(&trans, dir->v.i_ino, - inode->v.i_ino, &inode_u, + inode->v.i_ino, &dir_u, &inode_u, &dentry->d_name) ?: bch2_trans_commit(&trans, NULL, &inode->ei_journal_seq, - BTREE_INSERT_ATOMIC| BTREE_INSERT_NOUNLOCK); } while (ret == -EINTR); - if 
(likely(!ret)) + if (likely(!ret)) { + BUG_ON(inode_u.bi_inum != inode->v.i_ino); + + journal_seq_copy(inode, dir->ei_journal_seq); + bch2_inode_update_after_write(c, dir, &dir_u, + ATTR_MTIME|ATTR_CTIME); bch2_inode_update_after_write(c, inode, &inode_u, ATTR_CTIME); + } bch2_trans_exit(&trans); mutex_unlock(&inode->ei_update_lock); @@ -444,7 +447,6 @@ static int bch2_unlink(struct inode *vdir, struct dentry *dentry) &inode_u, &dentry->d_name) ?: bch2_trans_commit(&trans, NULL, &dir->ei_journal_seq, - BTREE_INSERT_ATOMIC| BTREE_INSERT_NOUNLOCK| BTREE_INSERT_NOFAIL); } while (ret == -EINTR); @@ -573,7 +575,6 @@ retry: mode) ?: bch2_trans_commit(&trans, NULL, &journal_seq, - BTREE_INSERT_ATOMIC| BTREE_INSERT_NOUNLOCK); if (ret == -EINTR) goto retry; @@ -706,7 +707,6 @@ retry: ret = bch2_inode_write(&trans, inode_iter, &inode_u) ?: bch2_trans_commit(&trans, NULL, &inode->ei_journal_seq, - BTREE_INSERT_ATOMIC| BTREE_INSERT_NOUNLOCK| BTREE_INSERT_NOFAIL); btree_err: @@ -963,15 +963,6 @@ static int bch2_vfs_readdir(struct file *file, struct dir_context *ctx) return bch2_readdir(c, inode->v.i_ino, ctx); } -static int bch2_clone_file_range(struct file *file_src, loff_t pos_src, - struct file *file_dst, loff_t pos_dst, - u64 len) -{ - return bch2_remap_file_range(file_src, pos_src, - file_dst, pos_dst, - len, 0); -} - static const struct file_operations bch_file_operations = { .llseek = bch2_llseek, .read_iter = bch2_read_iter, @@ -989,7 +980,7 @@ static const struct file_operations bch_file_operations = { #ifdef CONFIG_COMPAT .compat_ioctl = bch2_compat_fs_ioctl, #endif - .clone_file_range = bch2_clone_file_range, + .remap_file_range = bch2_remap_file_range, }; static const struct inode_operations bch_file_inode_operations = { @@ -1520,7 +1511,7 @@ static struct dentry *bch2_mount(struct file_system_type *fs_type, sb->s_bdi->congested_fn = bch2_congested; sb->s_bdi->congested_data = c; - sb->s_bdi->ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_SIZE; + sb->s_bdi->ra_pages = VM_READAHEAD_PAGES; for_each_online_member(ca, c, i) { struct block_device *bdev = ca->disk_sb.bdev; diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 0f2308e53d65..9ef532d875e8 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -37,8 +37,8 @@ static s64 bch2_count_inode_sectors(struct btree_trans *trans, u64 inum) return ret ?: sectors; } -static int remove_dirent(struct btree_trans *trans, - struct bkey_s_c_dirent dirent) +static int __remove_dirent(struct btree_trans *trans, + struct bkey_s_c_dirent dirent) { struct bch_fs *c = trans->c; struct qstr name; @@ -49,38 +49,47 @@ static int remove_dirent(struct btree_trans *trans, char *buf; name.len = bch2_dirent_name_bytes(dirent); - buf = kmalloc(name.len + 1, GFP_KERNEL); - if (!buf) - return -ENOMEM; + buf = bch2_trans_kmalloc(trans, name.len + 1); + if (IS_ERR(buf)) + return PTR_ERR(buf); memcpy(buf, dirent.v->d_name, name.len); buf[name.len] = '\0'; name.name = buf; - /* Unlock so we don't deadlock, after copying name: */ - bch2_trans_unlock(trans); - - ret = bch2_inode_find_by_inum(c, dir_inum, &dir_inode); - if (ret) { + ret = bch2_inode_find_by_inum_trans(trans, dir_inum, &dir_inode); + if (ret && ret != -EINTR) bch_err(c, "remove_dirent: err %i looking up directory inode", ret); - goto err; - } + if (ret) + return ret; dir_hash_info = bch2_hash_info_init(c, &dir_inode); - ret = bch2_dirent_delete(c, dir_inum, &dir_hash_info, &name, NULL); - if (ret) + ret = bch2_hash_delete(trans, bch2_dirent_hash_desc, + &dir_hash_info, dir_inum, &name); + if (ret && ret 
!= -EINTR) bch_err(c, "remove_dirent: err %i deleting dirent", ret); -err: - kfree(buf); - return ret; + if (ret) + return ret; + + return 0; +} + +static int remove_dirent(struct btree_trans *trans, + struct bkey_s_c_dirent dirent) +{ + return __bch2_trans_do(trans, NULL, NULL, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_LAZY_RW, + TRANS_RESET_MEM, + __remove_dirent(trans, dirent)); } static int reattach_inode(struct bch_fs *c, struct bch_inode_unpacked *lostfound_inode, u64 inum) { - struct bch_inode_unpacked inode_u; + struct bch_inode_unpacked dir_u, inode_u; char name_buf[20]; struct qstr name; int ret; @@ -88,11 +97,10 @@ static int reattach_inode(struct bch_fs *c, snprintf(name_buf, sizeof(name_buf), "%llu", inum); name = (struct qstr) QSTR(name_buf); - ret = bch2_trans_do(c, NULL, - BTREE_INSERT_ATOMIC| + ret = bch2_trans_do(c, NULL, NULL, BTREE_INSERT_LAZY_RW, bch2_link_trans(&trans, lostfound_inode->bi_inum, - inum, &inode_u, &name)); + inum, &dir_u, &inode_u, &name)); if (ret) bch_err(c, "error %i reattaching inode %llu", ret, inum); @@ -171,27 +179,26 @@ static int hash_redo_key(const struct bch_hash_desc desc, struct btree_iter *k_iter, struct bkey_s_c k, u64 hashed) { + struct bkey_i delete; struct bkey_i *tmp; - int ret = 0; - tmp = kmalloc(bkey_bytes(k.k), GFP_KERNEL); - if (!tmp) - return -ENOMEM; + bch2_trans_reset(trans, TRANS_RESET_MEM); + + tmp = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); + if (IS_ERR(tmp)) + return PTR_ERR(tmp); bkey_reassemble(tmp, k); - ret = bch2_btree_delete_at(trans, k_iter, 0); - if (ret) - goto err; + bkey_init(&delete.k); + delete.k.p = k_iter->pos; + bch2_trans_update(trans, k_iter, &delete, 0); - bch2_hash_set(trans, desc, &h->info, k_iter->pos.inode, - tmp, BCH_HASH_SET_MUST_CREATE); - ret = bch2_trans_commit(trans, NULL, NULL, - BTREE_INSERT_NOFAIL| - BTREE_INSERT_LAZY_RW); -err: - kfree(tmp); - return ret; + return bch2_hash_set(trans, desc, &h->info, k_iter->pos.inode, + tmp, BCH_HASH_SET_MUST_CREATE) ?: + bch2_trans_commit(trans, NULL, NULL, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_LAZY_RW); } static int fsck_hash_delete_at(struct btree_trans *trans, @@ -203,7 +210,6 @@ static int fsck_hash_delete_at(struct btree_trans *trans, retry: ret = bch2_hash_delete_at(trans, desc, info, iter) ?: bch2_trans_commit(trans, NULL, NULL, - BTREE_INSERT_ATOMIC| BTREE_INSERT_NOFAIL| BTREE_INSERT_LAZY_RW); if (ret == -EINTR) { @@ -313,9 +319,11 @@ static int hash_check_key(struct btree_trans *trans, "hashed to %llu chain starts at %llu\n%s", desc.btree_id, k.k->p.offset, hashed, h->chain->pos.offset, - (bch2_bkey_val_to_text(&PBUF(buf), c, - k), buf))) { - ret = hash_redo_key(desc, trans, h, k_iter, k, hashed); + (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf))) { + do { + ret = hash_redo_key(desc, trans, h, k_iter, k, hashed); + } while (ret == -EINTR); + if (ret) { bch_err(c, "hash_redo_key err %i", ret); return ret; @@ -376,11 +384,11 @@ static int check_dirent_hash(struct btree_trans *trans, struct hash_check *h, if (fsck_err(c, "dirent with junk at end, was %s (%zu) now %s (%u)", buf, strlen(buf), d->v.d_name, len)) { - bch2_trans_update(trans, iter, &d->k_i); - - ret = bch2_trans_commit(trans, NULL, NULL, - BTREE_INSERT_NOFAIL| - BTREE_INSERT_LAZY_RW); + ret = __bch2_trans_do(trans, NULL, NULL, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_LAZY_RW, + TRANS_RESET_MEM, + (bch2_trans_update(trans, iter, &d->k_i, 0), 0)); if (ret) goto err; @@ -402,8 +410,11 @@ err_redo: k->k->p.offset, hash, h->chain->pos.offset, (bch2_bkey_val_to_text(&PBUF(buf), c, *k), buf))) { - 
ret = hash_redo_key(bch2_dirent_hash_desc, trans, - h, iter, *k, hash); + do { + ret = hash_redo_key(bch2_dirent_hash_desc, trans, + h, iter, *k, hash); + } while (ret == -EINTR); + if (ret) bch_err(c, "hash_redo_key err %i", ret); else @@ -646,11 +657,11 @@ retry: bkey_reassemble(&n->k_i, d.s_c); n->v.d_type = mode_to_type(target.bi_mode); - bch2_trans_update(&trans, iter, &n->k_i); - - ret = bch2_trans_commit(&trans, NULL, NULL, - BTREE_INSERT_NOFAIL| - BTREE_INSERT_LAZY_RW); + ret = __bch2_trans_do(&trans, NULL, NULL, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_LAZY_RW, + TRANS_RESET_MEM, + (bch2_trans_update(&trans, iter, &n->k_i, 0), 0)); kfree(n); if (ret) goto err; @@ -790,8 +801,7 @@ fsck_err: create_lostfound: bch2_inode_init_early(c, lostfound_inode); - ret = bch2_trans_do(c, NULL, - BTREE_INSERT_ATOMIC| + ret = bch2_trans_do(c, NULL, NULL, BTREE_INSERT_NOFAIL| BTREE_INSERT_LAZY_RW, bch2_create_trans(&trans, @@ -1114,7 +1124,7 @@ static int check_inode_nlink(struct bch_fs *c, if (!link->count && !(u->bi_flags & BCH_INODE_UNLINKED) && - (c->sb.features & (1 << BCH_FEATURE_ATOMIC_NLINK))) { + (c->sb.features & (1 << BCH_FEATURE_atomic_nlink))) { if (fsck_err(c, "unreachable inode %llu not marked as unlinked (type %u)", u->bi_inum, mode_to_type(u->bi_mode)) == FSCK_ERR_IGNORE) @@ -1149,7 +1159,7 @@ static int check_inode_nlink(struct bch_fs *c, } if (i_nlink != real_i_nlink && - (c->sb.features & (1 << BCH_FEATURE_ATOMIC_NLINK))) { + (c->sb.features & (1 << BCH_FEATURE_atomic_nlink))) { if (fsck_err(c, "inode %llu has wrong i_nlink " "(type %u i_nlink %u, should be %u)", u->bi_inum, mode_to_type(u->bi_mode), @@ -1261,12 +1271,13 @@ static int check_inode(struct btree_trans *trans, struct bkey_inode_buf p; bch2_inode_pack(&p, &u); - bch2_trans_update(trans, iter, &p.inode.k_i); - ret = bch2_trans_commit(trans, NULL, NULL, - BTREE_INSERT_NOFAIL| - BTREE_INSERT_LAZY_RW); - if (ret && ret != -EINTR) + ret = __bch2_trans_do(trans, NULL, NULL, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_LAZY_RW, + TRANS_RESET_MEM, + (bch2_trans_update(trans, iter, &p.inode.k_i, 0), 0)); + if (ret) bch_err(c, "error in fsck: error %i " "updating inode", ret); } diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index c0642ff46ba0..e811b98d0f03 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -223,7 +223,7 @@ int bch2_inode_write(struct btree_trans *trans, return PTR_ERR(inode_p); bch2_inode_pack(inode_p, inode); - bch2_trans_update(trans, iter, &inode_p->inode.k_i); + bch2_trans_update(trans, iter, &inode_p->inode.k_i, 0); return 0; } @@ -411,7 +411,7 @@ again: inode_u->bi_generation = bkey_generation(k); bch2_inode_pack(inode_p, inode_u); - bch2_trans_update(trans, iter, &inode_p->inode.k_i); + bch2_trans_update(trans, iter, &inode_p->inode.k_i, 0); return 0; } } @@ -493,10 +493,9 @@ int bch2_inode_rm(struct bch_fs *c, u64 inode_nr) delete.v.bi_generation = cpu_to_le32(bi_generation); } - bch2_trans_update(&trans, iter, &delete.k_i); + bch2_trans_update(&trans, iter, &delete.k_i, 0); ret = bch2_trans_commit(&trans, NULL, NULL, - BTREE_INSERT_ATOMIC| BTREE_INSERT_NOFAIL); } while (ret == -EINTR); @@ -533,7 +532,7 @@ int bch2_inode_find_by_inum_trans(struct btree_trans *trans, u64 inode_nr, int bch2_inode_find_by_inum(struct bch_fs *c, u64 inode_nr, struct bch_inode_unpacked *inode) { - return bch2_trans_do(c, NULL, 0, + return bch2_trans_do(c, NULL, NULL, 0, bch2_inode_find_by_inum_trans(&trans, inode_nr, inode)); } diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index 1d6dd19458ff..4c7dd0994a28 
100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -124,10 +124,10 @@ void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw) void bch2_bio_free_pages_pool(struct bch_fs *c, struct bio *bio) { + struct bvec_iter_all iter; struct bio_vec *bv; - unsigned i; - bio_for_each_segment_all(bv, bio, i) + bio_for_each_segment_all(bv, bio, iter) if (bv->bv_page != ZERO_PAGE(0)) mempool_free(bv->bv_page, &c->bio_bounce_pages); bio->bi_vcnt = 0; @@ -292,18 +292,17 @@ int bch2_extent_update(struct btree_trans *trans, if (delta || new_i_size) { bch2_inode_pack(&inode_p, &inode_u); bch2_trans_update(trans, inode_iter, - &inode_p.inode.k_i); + &inode_p.inode.k_i, 0); } bch2_trans_iter_put(trans, inode_iter); } - bch2_trans_update(trans, iter, k); + bch2_trans_update(trans, iter, k, 0); ret = bch2_trans_commit(trans, disk_res, journal_seq, BTREE_INSERT_NOCHECK_RW| BTREE_INSERT_NOFAIL| - BTREE_INSERT_ATOMIC| BTREE_INSERT_USE_RESERVE); if (!ret && i_sectors_delta) *i_sectors_delta += delta; @@ -326,6 +325,8 @@ int bch2_fpunch_at(struct btree_trans *trans, struct btree_iter *iter, bch2_disk_reservation_init(c, 0); struct bkey_i delete; + bch2_trans_reset(trans, TRANS_RESET_MEM); + ret = bkey_err(k); if (ret) goto btree_err; @@ -337,8 +338,6 @@ int bch2_fpunch_at(struct btree_trans *trans, struct btree_iter *iter, bch2_key_resize(&delete.k, max_sectors); bch2_cut_back(end, &delete); - bch2_trans_begin_updates(trans); - ret = bch2_extent_update(trans, iter, &delete, &disk_res, journal_seq, 0, i_sectors_delta); @@ -400,14 +399,14 @@ int bch2_write_index_default(struct bch_write_op *op) BTREE_ITER_SLOTS|BTREE_ITER_INTENT); do { + bch2_trans_reset(&trans, TRANS_RESET_MEM); + k = bch2_keylist_front(keys); bkey_on_stack_realloc(&sk, c, k->k.u64s); bkey_copy(sk.k, k); bch2_cut_front(iter->pos, sk.k); - bch2_trans_begin_updates(&trans); - ret = bch2_extent_update(&trans, iter, sk.k, &op->res, op_journal_seq(op), op->new_i_size, &op->i_sectors_delta); @@ -501,12 +500,13 @@ static void bch2_write_done(struct closure *cl) bch2_time_stats_update(&c->times[BCH_TIME_data_write], op->start_time); - if (op->end_io) + if (op->end_io) { + EBUG_ON(cl->parent); + closure_debug_destroy(cl); op->end_io(op); - if (cl->parent) + } else { closure_return(cl); - else - closure_debug_destroy(cl); + } } /** @@ -1139,7 +1139,7 @@ static void bch2_write_data_inline(struct bch_write_op *op, unsigned data_len) unsigned sectors; int ret; - bch2_check_set_feature(op->c, BCH_FEATURE_INLINE_DATA); + bch2_check_set_feature(op->c, BCH_FEATURE_inline_data); ret = bch2_keylist_realloc(&op->insert_keys, op->inline_keys, ARRAY_SIZE(op->inline_keys), @@ -1233,12 +1233,14 @@ void bch2_write(struct closure *cl) err: if (!(op->flags & BCH_WRITE_NOPUT_RESERVATION)) bch2_disk_reservation_put(c, &op->res); - if (op->end_io) + + if (op->end_io) { + EBUG_ON(cl->parent); + closure_debug_destroy(cl); op->end_io(op); - if (cl->parent) + } else { closure_return(cl); - else - closure_debug_destroy(cl); + } } /* Cache promotion on read */ @@ -1736,9 +1738,8 @@ retry: if (!bch2_bkey_narrow_crcs(new.k, new_crc)) goto out; - bch2_trans_update(&trans, iter, new.k); + bch2_trans_update(&trans, iter, new.k, 0); ret = bch2_trans_commit(&trans, NULL, NULL, - BTREE_INSERT_ATOMIC| BTREE_INSERT_NOFAIL| BTREE_INSERT_NOWAIT); if (ret == -EINTR) @@ -1785,7 +1786,7 @@ static void __bch2_read_endio(struct work_struct *work) crc.offset += rbio->offset_into_extent; crc.live_size = bvec_iter_sectors(rbio->bvec_iter); - if (crc.compression_type != BCH_COMPRESSION_NONE) { 
+ if (crc.compression_type != BCH_COMPRESSION_TYPE_none) { bch2_encrypt_bio(c, crc.csum_type, nonce, src); if (bch2_bio_uncompress(c, src, dst, dst_iter, crc)) goto decompression_err; @@ -1978,7 +1979,7 @@ int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig, goto hole; iter.bi_size = pick.crc.compressed_size << 9; - goto noclone; + goto get_bio; } if (!(flags & BCH_READ_LAST_FRAGMENT) || @@ -1993,7 +1994,7 @@ int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig, EBUG_ON(offset_into_extent + bvec_iter_sectors(iter) > k.k->size); - if (pick.crc.compression_type != BCH_COMPRESSION_NONE || + if (pick.crc.compression_type != BCH_COMPRESSION_TYPE_none || (pick.crc.csum_type != BCH_CSUM_NONE && (bvec_iter_sectors(iter) != pick.crc.uncompressed_size || (bch2_csum_type_is_encryption(pick.crc.csum_type) && @@ -2025,7 +2026,7 @@ int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig, pick.crc.live_size = bvec_iter_sectors(iter); offset_into_extent = 0; } - +get_bio: if (rbio) { /* * promote already allocated bounce rbio: @@ -2063,7 +2064,6 @@ int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig, rbio->bio.bi_iter = iter; rbio->split = true; } else { -noclone: rbio = orig; rbio->bio.bi_iter = iter; EBUG_ON(bio_flagged(&rbio->bio, BIO_CHAIN)); diff --git a/fs/bcachefs/journal_seq_blacklist.c b/fs/bcachefs/journal_seq_blacklist.c index 787d9f7638d0..a21de0088753 100644 --- a/fs/bcachefs/journal_seq_blacklist.c +++ b/fs/bcachefs/journal_seq_blacklist.c @@ -121,7 +121,7 @@ int bch2_journal_seq_blacklist_add(struct bch_fs *c, u64 start, u64 end) bl->start[nr].end = cpu_to_le64(end); out_write_sb: c->disk_sb.sb->features[0] |= - 1ULL << BCH_FEATURE_JOURNAL_SEQ_BLACKLIST_V3; + 1ULL << BCH_FEATURE_journal_seq_blacklist_v3; ret = bch2_write_super(c); out: @@ -309,7 +309,7 @@ void bch2_blacklist_entries_gc(struct work_struct *work) if (!new_nr) c->disk_sb.sb->features[0] &= - ~(1ULL << BCH_FEATURE_JOURNAL_SEQ_BLACKLIST_V3); + ~(1ULL << BCH_FEATURE_journal_seq_blacklist_v3); bch2_write_super(c); } diff --git a/fs/bcachefs/migrate.c b/fs/bcachefs/migrate.c index 4b59dcd04cce..1ef62a189e33 100644 --- a/fs/bcachefs/migrate.c +++ b/fs/bcachefs/migrate.c @@ -53,9 +53,6 @@ static int __bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags while ((k = bch2_btree_iter_peek(iter)).k && !(ret = bkey_err(k))) { if (!bch2_bkey_has_device(k, dev_idx)) { - ret = bch2_mark_bkey_replicas(c, k); - if (ret) - break; bch2_btree_iter_next(iter); continue; } @@ -76,10 +73,9 @@ static int __bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags bch2_btree_iter_set_pos(iter, bkey_start_pos(&sk.k->k)); - bch2_trans_update(&trans, iter, sk.k); + bch2_trans_update(&trans, iter, sk.k, 0); ret = bch2_trans_commit(&trans, NULL, NULL, - BTREE_INSERT_ATOMIC| BTREE_INSERT_NOFAIL); /* @@ -130,34 +126,27 @@ static int bch2_dev_metadata_drop(struct bch_fs *c, unsigned dev_idx, int flags) struct bkey_i_btree_ptr *new_key; retry: if (!bch2_bkey_has_device(bkey_i_to_s_c(&b->key), - dev_idx)) { - /* - * we might have found a btree node key we - * needed to update, and then tried to update it - * but got -EINTR after upgrading the iter, but - * then raced and the node is now gone: - */ - bch2_btree_iter_downgrade(iter); - - ret = bch2_mark_bkey_replicas(c, bkey_i_to_s_c(&b->key)); - if (ret) - goto err; - } else { - bkey_copy(&tmp.k, &b->key); - new_key = bkey_i_to_btree_ptr(&tmp.k); - - ret = drop_dev_ptrs(c, bkey_i_to_s(&new_key->k_i), - dev_idx, flags, true); - if 
(ret) - goto err; - - ret = bch2_btree_node_update_key(c, iter, b, new_key); - if (ret == -EINTR) { - b = bch2_btree_iter_peek_node(iter); - goto retry; - } - if (ret) - goto err; + dev_idx)) + continue; + + bkey_copy(&tmp.k, &b->key); + new_key = bkey_i_to_btree_ptr(&tmp.k); + + ret = drop_dev_ptrs(c, bkey_i_to_s(&new_key->k_i), + dev_idx, flags, true); + if (ret) { + bch_err(c, "Cannot drop device without losing data"); + goto err; + } + + ret = bch2_btree_node_update_key(c, iter, b, new_key); + if (ret == -EINTR) { + b = bch2_btree_iter_peek_node(iter); + goto retry; + } + if (ret) { + bch_err(c, "Error updating btree node key: %i", ret); + goto err; } } bch2_trans_iter_free(&trans, iter); @@ -168,9 +157,10 @@ retry: closure_wait_event(&c->btree_interior_update_wait, !bch2_btree_interior_updates_nr_pending(c) || c->btree_roots_dirty); + if (c->btree_roots_dirty) + bch2_journal_meta(&c->journal); if (!bch2_btree_interior_updates_nr_pending(c)) break; - bch2_journal_meta(&c->journal); } ret = 0; @@ -185,6 +175,5 @@ err: int bch2_dev_data_drop(struct bch_fs *c, unsigned dev_idx, int flags) { return bch2_dev_usrdata_drop(c, dev_idx, flags) ?: - bch2_dev_metadata_drop(c, dev_idx, flags) ?: - bch2_replicas_gc2(c); + bch2_dev_metadata_drop(c, dev_idx, flags); } diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index 7a2646b9d622..257e00ae6fa7 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -150,11 +150,10 @@ static int bch2_migrate_index_update(struct bch_write_op *op) goto next; } - bch2_trans_update(&trans, iter, insert); + bch2_trans_update(&trans, iter, insert, 0); ret = bch2_trans_commit(&trans, &op->res, op_journal_seq(op), - BTREE_INSERT_ATOMIC| BTREE_INSERT_NOFAIL| BTREE_INSERT_USE_RESERVE| m->data_opts.btree_insert_flags); @@ -273,7 +272,7 @@ int bch2_migrate_write_init(struct bch_fs *c, struct migrate_write *m, bkey_for_each_ptr_decode(k.k, ptrs, p, entry) if (!p.ptr.cached && - p.crc.compression_type != BCH_COMPRESSION_NONE && + p.crc.compression_type != BCH_COMPRESSION_TYPE_none && bch2_dev_in_target(c, p.ptr.dev, data_opts.target)) compressed_sectors += p.crc.compressed_size; @@ -301,12 +300,12 @@ static void move_free(struct closure *cl) { struct moving_io *io = container_of(cl, struct moving_io, cl); struct moving_context *ctxt = io->write.ctxt; + struct bvec_iter_all iter; struct bio_vec *bv; - unsigned i; bch2_disk_reservation_put(io->write.op.c, &io->write.op.res); - bio_for_each_segment_all(bv, &io->write.op.wbio.bio, i) + bio_for_each_segment_all(bv, &io->write.op.wbio.bio, iter) if (bv->bv_page) __free_page(bv->bv_page); diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c index abdeef20fde9..e9cb2304576f 100644 --- a/fs/bcachefs/movinggc.c +++ b/fs/bcachefs/movinggc.c @@ -212,14 +212,36 @@ static void bch2_copygc(struct bch_fs *c, struct bch_dev *ca) buckets_to_move, buckets_not_moved); } +/* + * Copygc runs when the amount of fragmented data is above some arbitrary + * threshold: + * + * The threshold at the limit - when the device is full - is the amount of space + * we reserved in bch2_recalc_capacity; we can't have more than that amount of + * disk space stranded due to fragmentation and store everything we have + * promised to store. + * + * But we don't want to be running copygc unnecessarily when the device still + * has plenty of free space - rather, we want copygc to smoothly run every so + * often and continually reduce the amount of fragmented space as the device + * fills up. So, we increase the threshold by half the current free space. 
+ */ +unsigned long bch2_copygc_wait_amount(struct bch_dev *ca) +{ + struct bch_fs *c = ca->fs; + struct bch_dev_usage usage = bch2_dev_usage_read(c, ca); + u64 fragmented_allowed = ca->copygc_threshold + + ((__dev_buckets_available(ca, usage) * ca->mi.bucket_size) >> 1); + + return max_t(s64, 0, fragmented_allowed - usage.sectors_fragmented); +} + static int bch2_copygc_thread(void *arg) { struct bch_dev *ca = arg; struct bch_fs *c = ca->fs; struct io_clock *clock = &c->io_clock[WRITE]; - struct bch_dev_usage usage; - unsigned long last; - u64 available, fragmented, reserve, next; + unsigned long last, wait; set_freezable(); @@ -228,28 +250,10 @@ static int bch2_copygc_thread(void *arg) break; last = atomic_long_read(&clock->now); + wait = bch2_copygc_wait_amount(ca); - reserve = ca->copygc_threshold; - - usage = bch2_dev_usage_read(c, ca); - - available = __dev_buckets_available(ca, usage) * - ca->mi.bucket_size; - if (available > reserve) { - next = last + available - reserve; - bch2_kthread_io_clock_wait(clock, next, - MAX_SCHEDULE_TIMEOUT); - continue; - } - - /* - * don't start copygc until there's more than half the copygc - * reserve of fragmented space: - */ - fragmented = usage.sectors_fragmented; - if (fragmented < reserve) { - next = last + reserve - fragmented; - bch2_kthread_io_clock_wait(clock, next, + if (wait > clock->max_slop) { + bch2_kthread_io_clock_wait(clock, last + wait, MAX_SCHEDULE_TIMEOUT); continue; } diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c index cbacd2f36799..94d6c044a27d 100644 --- a/fs/bcachefs/opts.c +++ b/fs/bcachefs/opts.c @@ -16,18 +16,24 @@ const char * const bch2_error_actions[] = { NULL }; -const char * const bch2_csum_types[] = { +const char * const bch2_sb_features[] = { +#define x(f, n) #f, + BCH_SB_FEATURES() +#undef x + NULL +}; + +const char * const bch2_csum_opts[] = { "none", "crc32c", "crc64", NULL }; -const char * const bch2_compression_types[] = { - "none", - "lz4", - "gzip", - "zstd", +const char * const bch2_compression_opts[] = { +#define x(t, n) #t, + BCH_COMPRESSION_OPTS() +#undef x NULL }; @@ -300,7 +306,7 @@ int bch2_opt_check_may_set(struct bch_fs *c, int id, u64 v) break; case Opt_erasure_code: if (v) - bch2_check_set_feature(c, BCH_FEATURE_EC); + bch2_check_set_feature(c, BCH_FEATURE_ec); break; } diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h index 1f11f4152a6f..1c05effa71e6 100644 --- a/fs/bcachefs/opts.h +++ b/fs/bcachefs/opts.h @@ -9,8 +9,9 @@ #include "bcachefs_format.h" extern const char * const bch2_error_actions[]; -extern const char * const bch2_csum_types[]; -extern const char * const bch2_compression_types[]; +extern const char * const bch2_sb_features[]; +extern const char * const bch2_csum_opts[]; +extern const char * const bch2_compression_opts[]; extern const char * const bch2_str_hash_types[]; extern const char * const bch2_data_types[]; extern const char * const bch2_cache_replacement_policies[]; @@ -112,23 +113,23 @@ enum opt_type { "#", NULL) \ x(metadata_checksum, u8, \ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ - OPT_STR(bch2_csum_types), \ + OPT_STR(bch2_csum_opts), \ BCH_SB_META_CSUM_TYPE, BCH_CSUM_OPT_CRC32C, \ NULL, NULL) \ x(data_checksum, u8, \ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE, \ - OPT_STR(bch2_csum_types), \ + OPT_STR(bch2_csum_opts), \ BCH_SB_DATA_CSUM_TYPE, BCH_CSUM_OPT_CRC32C, \ NULL, NULL) \ x(compression, u8, \ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE, \ - OPT_STR(bch2_compression_types), \ - BCH_SB_COMPRESSION_TYPE, BCH_COMPRESSION_OPT_NONE, \ + 
OPT_STR(bch2_compression_opts), \ + BCH_SB_COMPRESSION_TYPE, BCH_COMPRESSION_OPT_none, \ NULL, NULL) \ x(background_compression, u8, \ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE, \ - OPT_STR(bch2_compression_types), \ - BCH_SB_BACKGROUND_COMPRESSION_TYPE,BCH_COMPRESSION_OPT_NONE, \ + OPT_STR(bch2_compression_opts), \ + BCH_SB_BACKGROUND_COMPRESSION_TYPE,BCH_COMPRESSION_OPT_none, \ NULL, NULL) \ x(str_hash, u8, \ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ diff --git a/fs/bcachefs/quota.c b/fs/bcachefs/quota.c index 0fa6f33c049b..e7787c5063ce 100644 --- a/fs/bcachefs/quota.c +++ b/fs/bcachefs/quota.c @@ -752,7 +752,7 @@ static int bch2_set_quota(struct super_block *sb, struct kqid qid, if (qdq->d_fieldmask & QC_INO_HARD) new_quota.v.c[Q_INO].hardlimit = cpu_to_le64(qdq->d_ino_hardlimit); - bch2_trans_update(&trans, iter, &new_quota.k_i); + bch2_trans_update(&trans, iter, &new_quota.k_i, 0); ret = bch2_trans_commit(&trans, NULL, NULL, 0); diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c index 84b3fb6eb101..612385e9d4e4 100644 --- a/fs/bcachefs/rebalance.c +++ b/fs/bcachefs/rebalance.c @@ -183,6 +183,8 @@ static int bch2_rebalance_thread(void *arg) prev_cputime = curr_cputime(); while (!kthread_wait_freezable(r->enabled)) { + cond_resched(); + start = jiffies; cputime = curr_cputime(); diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index e6b51131cff2..8ecd4abc8eeb 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -300,31 +300,24 @@ retry: bch2_cut_front(split_iter->pos, split); bch2_cut_back(atomic_end, split); - bch2_trans_update(&trans, split_iter, split); + bch2_trans_update(&trans, split_iter, split, !remark + ? BTREE_TRIGGER_NORUN + : BTREE_TRIGGER_NOOVERWRITES); bch2_btree_iter_set_pos(iter, split->k.p); } while (bkey_cmp(iter->pos, k->k.p) < 0); if (remark) { ret = bch2_trans_mark_key(&trans, bkey_i_to_s_c(k), 0, -((s64) k->k.size), - BCH_BUCKET_MARK_OVERWRITE) ?: - bch2_trans_commit(&trans, &disk_res, NULL, - BTREE_INSERT_ATOMIC| - BTREE_INSERT_NOFAIL| - BTREE_INSERT_LAZY_RW| - BTREE_INSERT_NOMARK_OVERWRITES| - BTREE_INSERT_NO_CLEAR_REPLICAS); - } else { - ret = bch2_trans_commit(&trans, &disk_res, NULL, - BTREE_INSERT_ATOMIC| - BTREE_INSERT_NOFAIL| - BTREE_INSERT_LAZY_RW| - BTREE_INSERT_JOURNAL_REPLAY| - BTREE_INSERT_NOMARK); + BTREE_TRIGGER_OVERWRITE); + if (ret) + goto err; } - if (ret) - goto err; + ret = bch2_trans_commit(&trans, &disk_res, NULL, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_LAZY_RW| + BTREE_INSERT_JOURNAL_REPLAY); err: if (ret == -EINTR) goto retry; @@ -334,6 +327,30 @@ err: return bch2_trans_exit(&trans) ?: ret; } +static int __bch2_journal_replay_key(struct btree_trans *trans, + enum btree_id id, struct bkey_i *k) +{ + struct btree_iter *iter; + + iter = bch2_trans_get_iter(trans, id, bkey_start_pos(&k->k), + BTREE_ITER_INTENT); + if (IS_ERR(iter)) + return PTR_ERR(iter); + + bch2_trans_update(trans, iter, k, BTREE_TRIGGER_NORUN); + return 0; +} + +static int bch2_journal_replay_key(struct bch_fs *c, enum btree_id id, + struct bkey_i *k) +{ + return bch2_trans_do(c, NULL, NULL, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_LAZY_RW| + BTREE_INSERT_JOURNAL_REPLAY, + __bch2_journal_replay_key(&trans, id, k)); +} + static int bch2_journal_replay(struct bch_fs *c, struct journal_keys keys) { @@ -351,12 +368,7 @@ static int bch2_journal_replay(struct bch_fs *c, else if (btree_node_type_is_extents(i->btree_id)) ret = bch2_extent_replay_key(c, i->btree_id, i->k); else - ret = bch2_btree_insert(c, i->btree_id, i->k, - NULL, NULL, - 
BTREE_INSERT_NOFAIL| - BTREE_INSERT_LAZY_RW| - BTREE_INSERT_JOURNAL_REPLAY| - BTREE_INSERT_NOMARK); + ret = bch2_journal_replay_key(c, i->btree_id, i->k); if (ret) { bch_err(c, "journal replay: error %d while replaying key", @@ -869,7 +881,7 @@ int bch2_fs_recovery(struct bch_fs *c) } if (!c->sb.clean) { - if (!(c->sb.features & (1 << BCH_FEATURE_ATOMIC_NLINK))) { + if (!(c->sb.features & (1 << BCH_FEATURE_atomic_nlink))) { bch_info(c, "checking inode link counts"); err = "error in recovery"; ret = bch2_fsck_inode_nlink(c); @@ -910,6 +922,8 @@ int bch2_fs_recovery(struct bch_fs *c) c->disk_sb.sb->version_min = le16_to_cpu(bcachefs_metadata_version_min); c->disk_sb.sb->version = le16_to_cpu(bcachefs_metadata_version_current); + c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_new_siphash; + c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_new_extent_overwrite; write_sb = true; } @@ -920,7 +934,7 @@ int bch2_fs_recovery(struct bch_fs *c) if (c->opts.fsck && !test_bit(BCH_FS_ERROR, &c->flags)) { - c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_ATOMIC_NLINK; + c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_atomic_nlink; SET_BCH_SB_HAS_ERRORS(c->disk_sb.sb, 0); write_sb = true; } @@ -989,11 +1003,6 @@ int bch2_fs_initialize(struct bch_fs *c) bch2_fs_journal_start(&c->journal, 1, &journal); bch2_journal_set_replay_done(&c->journal); - err = "error going read write"; - ret = __bch2_fs_read_write(c, true); - if (ret) - goto err; - bch2_inode_init(c, &root_inode, 0, 0, S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO, 0, NULL); root_inode.bi_inum = BCACHEFS_ROOT_INO; @@ -1002,14 +1011,14 @@ int bch2_fs_initialize(struct bch_fs *c) err = "error creating root directory"; ret = bch2_btree_insert(c, BTREE_ID_INODES, &packed_inode.inode.k_i, - NULL, NULL, 0); + NULL, NULL, BTREE_INSERT_LAZY_RW); if (ret) goto err; bch2_inode_init_early(c, &lostfound_inode); err = "error creating lost+found"; - ret = bch2_trans_do(c, NULL, BTREE_INSERT_ATOMIC, + ret = bch2_trans_do(c, NULL, NULL, 0, bch2_create_trans(&trans, BCACHEFS_ROOT_INO, &root_inode, &lostfound_inode, &lostfound, @@ -1032,7 +1041,9 @@ int bch2_fs_initialize(struct bch_fs *c) mutex_lock(&c->sb_lock); c->disk_sb.sb->version = c->disk_sb.sb->version_min = le16_to_cpu(bcachefs_metadata_version_current); - c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_ATOMIC_NLINK; + c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_atomic_nlink; + c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_new_siphash; + c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_new_extent_overwrite; SET_BCH_SB_INITIALIZED(c->disk_sb.sb, true); SET_BCH_SB_CLEAN(c->disk_sb.sb, false); diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c index 53bd0e0ea058..3b8c74ca3725 100644 --- a/fs/bcachefs/reflink.c +++ b/fs/bcachefs/reflink.c @@ -115,7 +115,7 @@ static int bch2_make_extent_indirect(struct btree_trans *trans, r_v->v.refcount = 0; memcpy(r_v->v.start, e->v.start, bkey_val_bytes(&e->k)); - bch2_trans_update(trans, reflink_iter, &r_v->k_i); + bch2_trans_update(trans, reflink_iter, &r_v->k_i, 0); r_p = bch2_trans_kmalloc(trans, sizeof(*r_p)); if (IS_ERR(r_p)) @@ -126,7 +126,7 @@ static int bch2_make_extent_indirect(struct btree_trans *trans, set_bkey_val_bytes(&r_p->k, sizeof(r_p->v)); r_p->v.idx = cpu_to_le64(bkey_start_offset(&r_v->k)); - bch2_trans_update(trans, extent_iter, &r_p->k_i); + bch2_trans_update(trans, extent_iter, &r_p->k_i, 0); err: if (!IS_ERR(reflink_iter)) { c->reflink_hint = reflink_iter->pos.offset; @@ -171,7 +171,7 @@ s64 bch2_remap_range(struct bch_fs *c, if 
(!percpu_ref_tryget(&c->writes)) return -EROFS; - bch2_check_set_feature(c, BCH_FEATURE_REFLINK); + bch2_check_set_feature(c, BCH_FEATURE_reflink); dst_end.offset += remap_sectors; src_end.offset += remap_sectors; @@ -185,7 +185,8 @@ s64 bch2_remap_range(struct bch_fs *c, BTREE_ITER_INTENT); while (1) { - bch2_trans_begin_updates(&trans); + bch2_trans_reset(&trans, TRANS_RESET_MEM); + trans.mem_top = 0; if (fatal_signal_pending(current)) { @@ -287,8 +288,7 @@ err: inode_u.bi_size < new_i_size) { inode_u.bi_size = new_i_size; ret2 = bch2_inode_write(&trans, inode_iter, &inode_u) ?: - bch2_trans_commit(&trans, NULL, journal_seq, - BTREE_INSERT_ATOMIC); + bch2_trans_commit(&trans, NULL, journal_seq, 0); } } while (ret2 == -EINTR); diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c index cb5ebb87c701..366888b1b36d 100644 --- a/fs/bcachefs/replicas.c +++ b/fs/bcachefs/replicas.c @@ -84,10 +84,10 @@ static void extent_to_replicas(struct bkey_s_c k, if (p.ptr.cached) continue; - if (p.has_ec) + if (!p.has_ec) + r->devs[r->nr_devs++] = p.ptr.dev; + else r->nr_required = 0; - - r->devs[r->nr_devs++] = p.ptr.dev; } } diff --git a/fs/bcachefs/replicas.h b/fs/bcachefs/replicas.h index 0d6e19126021..8527d82841bb 100644 --- a/fs/bcachefs/replicas.h +++ b/fs/bcachefs/replicas.h @@ -72,9 +72,6 @@ int bch2_replicas_set_usage(struct bch_fs *, /* iterate over superblock replicas - used by userspace tools: */ -#define replicas_entry_bytes(_i) \ - (offsetof(typeof(*(_i)), devs) + (_i)->nr_devs) - #define replicas_entry_next(_i) \ ((typeof(_i)) ((void *) (_i) + replicas_entry_bytes(_i))) diff --git a/fs/bcachefs/str_hash.h b/fs/bcachefs/str_hash.h index 582e718b6bd1..f2779159a6b8 100644 --- a/fs/bcachefs/str_hash.h +++ b/fs/bcachefs/str_hash.h @@ -23,7 +23,7 @@ bch2_str_hash_opt_to_type(struct bch_fs *c, enum bch_str_hash_opts opt) case BCH_STR_HASH_OPT_CRC64: return BCH_STR_HASH_CRC64; case BCH_STR_HASH_OPT_SIPHASH: - return c->sb.features & (1ULL << BCH_FEATURE_NEW_SIPHASH) + return c->sb.features & (1ULL << BCH_FEATURE_new_siphash) ? BCH_STR_HASH_SIPHASH : BCH_STR_HASH_SIPHASH_OLD; default: @@ -281,7 +281,7 @@ not_found: swap(iter, slot); insert->k.p = iter->pos; - bch2_trans_update(trans, iter, insert); + bch2_trans_update(trans, iter, insert, 0); } goto out; @@ -308,7 +308,7 @@ int bch2_hash_delete_at(struct btree_trans *trans, delete->k.p = iter->pos; delete->k.type = ret ? 
KEY_TYPE_whiteout : KEY_TYPE_deleted; - bch2_trans_update(trans, iter, delete); + bch2_trans_update(trans, iter, delete, 0); return 0; } diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index daaeaf0446a3..43927853210a 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -51,7 +51,9 @@ static struct bch_sb_field *__bch2_sb_field_resize(struct bch_sb_handle *sb, BUG_ON(get_order(__vstruct_bytes(struct bch_sb, sb_u64s)) > sb->page_order); - if (!f) { + if (!f && !u64s) { + /* nothing to do: */ + } else if (!f) { f = vstruct_last(sb->sb); memset(f, 0, sizeof(u64) * u64s); f->u64s = cpu_to_le32(u64s); diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 17bdf985559c..38920fff4500 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -404,7 +404,7 @@ static int bch2_fs_read_write_late(struct bch_fs *c) return 0; } -int __bch2_fs_read_write(struct bch_fs *c, bool early) +static int __bch2_fs_read_write(struct bch_fs *c, bool early) { struct bch_dev *ca; unsigned i; @@ -735,9 +735,9 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) if (bch2_fs_init_fault("fs_alloc")) goto err; - iter_size = sizeof(struct btree_node_iter_large) + + iter_size = sizeof(struct sort_iter) + (btree_blocks(c) + 1) * 2 * - sizeof(struct btree_node_iter_set); + sizeof(struct sort_iter_set); if (!(c->wq = alloc_workqueue("bcachefs", WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 1)) || @@ -1416,7 +1416,11 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags) mutex_lock(&c->state_lock); - percpu_ref_put(&ca->ref); /* XXX */ + /* + * We consume a reference to ca->ref, regardless of whether we succeed + * or fail: + */ + percpu_ref_put(&ca->ref); if (!bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_FAILED, flags)) { bch_err(ca, "Cannot remove without losing data"); @@ -1425,11 +1429,6 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags) __bch2_dev_read_only(c, ca); - /* - * XXX: verify that dev_idx is really not in use anymore, anywhere - * - * flag_data_bad() does not check btree pointers - */ ret = bch2_dev_data_drop(c, ca->dev_idx, flags); if (ret) { bch_err(ca, "Remove failed: error %i dropping data", ret); @@ -1442,17 +1441,6 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags) goto err; } - data = bch2_dev_has_data(c, ca); - if (data) { - char data_has_str[100]; - - bch2_flags_to_text(&PBUF(data_has_str), - bch2_data_types, data); - bch_err(ca, "Remove failed, still has data (%s)", data_has_str); - ret = -EBUSY; - goto err; - } - ret = bch2_btree_delete_range(c, BTREE_ID_ALLOC, POS(ca->dev_idx, 0), POS(ca->dev_idx + 1, 0), @@ -1467,12 +1455,33 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags) * (overwritten) keys that point to the device we're removing: */ bch2_journal_flush_all_pins(&c->journal); + /* + * hack to ensure bch2_replicas_gc2() clears out entries to this device + */ + bch2_journal_meta(&c->journal); ret = bch2_journal_error(&c->journal); if (ret) { bch_err(ca, "Remove failed, journal error"); goto err; } + ret = bch2_replicas_gc2(c); + if (ret) { + bch_err(ca, "Remove failed: error %i from replicas gc", ret); + goto err; + } + + data = bch2_dev_has_data(c, ca); + if (data) { + char data_has_str[100]; + + bch2_flags_to_text(&PBUF(data_has_str), + bch2_data_types, data); + bch_err(ca, "Remove failed, still has data (%s)", data_has_str); + ret = -EBUSY; + goto err; + } + __bch2_dev_offline(c, ca); mutex_lock(&c->sb_lock); diff --git a/fs/bcachefs/super.h 
b/fs/bcachefs/super.h index 41992e891391..4aa5dd7917cf 100644 --- a/fs/bcachefs/super.h +++ b/fs/bcachefs/super.h @@ -219,7 +219,6 @@ struct bch_dev *bch2_dev_lookup(struct bch_fs *, const char *); bool bch2_fs_emergency_read_only(struct bch_fs *); void bch2_fs_read_only(struct bch_fs *); -int __bch2_fs_read_write(struct bch_fs *, bool); int bch2_fs_read_write(struct bch_fs *); int bch2_fs_read_write_early(struct bch_fs *); diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index e7699afd99fc..602def1ee95a 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -18,6 +18,7 @@ #include "btree_update_interior.h" #include "btree_gc.h" #include "buckets.h" +#include "clock.h" #include "disk_groups.h" #include "ec.h" #include "inode.h" @@ -198,6 +199,9 @@ rw_attribute(pd_controllers_update_seconds); read_attribute(meta_replicas_have); read_attribute(data_replicas_have); +read_attribute(io_timers_read); +read_attribute(io_timers_write); + #ifdef CONFIG_BCACHEFS_TESTS write_attribute(perf_test); #endif /* CONFIG_BCACHEFS_TESTS */ @@ -272,7 +276,7 @@ static ssize_t bch2_compression_stats(struct bch_fs *c, char *buf) struct extent_ptr_decoded p; extent_for_each_ptr_decode(e, p, entry) { - if (p.crc.compression_type == BCH_COMPRESSION_NONE) { + if (p.crc.compression_type == BCH_COMPRESSION_TYPE_none) { nr_uncompressed_extents++; uncompressed_sectors += e.k->size; } else { @@ -404,6 +408,11 @@ SHOW(bch2_fs) if (attr == &sysfs_new_stripes) return bch2_new_stripes(c, buf); + if (attr == &sysfs_io_timers_read) + return bch2_io_timers_show(&c->io_clock[READ], buf); + if (attr == &sysfs_io_timers_write) + return bch2_io_timers_show(&c->io_clock[WRITE], buf); + #define BCH_DEBUG_PARAM(name, description) sysfs_print(name, c->name); BCH_DEBUG_PARAMS() #undef BCH_DEBUG_PARAM @@ -581,6 +590,9 @@ struct attribute *bch2_fs_internal_files[] = { &sysfs_new_stripes, + &sysfs_io_timers_read, + &sysfs_io_timers_write, + &sysfs_internal_uuid, #define BCH_DEBUG_PARAM(name, description) &sysfs_##name, @@ -904,8 +916,6 @@ SHOW(bch2_dev) bch2_disk_path_to_text(&out, &c->disk_sb, ca->mi.group - 1); mutex_unlock(&c->sb_lock); - } else { - pr_buf(&out, "none"); } pr_buf(&out, "\n"); diff --git a/fs/bcachefs/tests.c b/fs/bcachefs/tests.c index 724f41e6590c..8f9b0cca17da 100644 --- a/fs/bcachefs/tests.c +++ b/fs/bcachefs/tests.c @@ -43,7 +43,7 @@ static void test_delete(struct bch_fs *c, u64 nr) ret = bch2_btree_iter_traverse(iter); BUG_ON(ret); - bch2_trans_update(&trans, iter, &k.k_i); + bch2_trans_update(&trans, iter, &k.k_i, 0); ret = bch2_trans_commit(&trans, NULL, NULL, 0); BUG_ON(ret); @@ -75,7 +75,7 @@ static void test_delete_written(struct bch_fs *c, u64 nr) ret = bch2_btree_iter_traverse(iter); BUG_ON(ret); - bch2_trans_update(&trans, iter, &k.k_i); + bch2_trans_update(&trans, iter, &k.k_i, 0); ret = bch2_trans_commit(&trans, NULL, NULL, 0); BUG_ON(ret); @@ -465,7 +465,7 @@ static void rand_mixed(struct bch_fs *c, u64 nr) bkey_cookie_init(&k.k_i); k.k.p = iter->pos; - bch2_trans_update(&trans, iter, &k.k_i); + bch2_trans_update(&trans, iter, &k.k_i, 0); ret = bch2_trans_commit(&trans, NULL, NULL, 0); BUG_ON(ret); } @@ -509,7 +509,7 @@ static void seq_insert(struct bch_fs *c, u64 nr) BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) { insert.k.p = iter->pos; - bch2_trans_update(&trans, iter, &insert.k_i); + bch2_trans_update(&trans, iter, &insert.k_i, 0); ret = bch2_trans_commit(&trans, NULL, NULL, 0); BUG_ON(ret); @@ -548,7 +548,7 @@ static void seq_overwrite(struct bch_fs *c, u64 nr) bkey_reassemble(&u.k_i, k); 
- bch2_trans_update(&trans, iter, &u.k_i); + bch2_trans_update(&trans, iter, &u.k_i, 0); ret = bch2_trans_commit(&trans, NULL, NULL, 0); BUG_ON(ret); } diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h index 2b19a0038045..0128daba5970 100644 --- a/fs/bcachefs/util.h +++ b/fs/bcachefs/util.h @@ -664,35 +664,6 @@ static inline void memset_u64s_tail(void *s, int c, unsigned bytes) memset(s + bytes, c, rem); } -static inline struct bio_vec next_contig_bvec(struct bio *bio, - struct bvec_iter *iter) -{ - struct bio_vec bv = bio_iter_iovec(bio, *iter); - - bio_advance_iter(bio, iter, bv.bv_len); -#ifndef CONFIG_HIGHMEM - while (iter->bi_size) { - struct bio_vec next = bio_iter_iovec(bio, *iter); - - if (page_address(bv.bv_page) + bv.bv_offset + bv.bv_len != - page_address(next.bv_page) + next.bv_offset) - break; - - bv.bv_len += next.bv_len; - bio_advance_iter(bio, iter, next.bv_len); - } -#endif - return bv; -} - -#define __bio_for_each_contig_segment(bv, bio, iter, start) \ - for (iter = (start); \ - (iter).bi_size && \ - ((bv = next_contig_bvec((bio), &(iter))), 1);) - -#define bio_for_each_contig_segment(bv, bio, iter) \ - __bio_for_each_contig_segment(bv, bio, iter, (bio)->bi_iter) - void sort_cmp_size(void *base, size_t num, size_t size, int (*cmp_func)(const void *, const void *, size_t), void (*swap_func)(void *, void *, size_t)); diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c index 9b8f6f1f9a77..725a6f3ef8ce 100644 --- a/fs/bcachefs/xattr.c +++ b/fs/bcachefs/xattr.c @@ -327,7 +327,7 @@ static int bch2_xattr_set_handler(const struct xattr_handler *handler, struct bch_inode_info *inode = to_bch_ei(vinode); struct bch_fs *c = inode->v.i_sb->s_fs_info; - return bch2_trans_do(c, &inode->ei_journal_seq, BTREE_INSERT_ATOMIC, + return bch2_trans_do(c, NULL, &inode->ei_journal_seq, 0, bch2_xattr_set(&trans, inode->v.i_ino, &inode->ei_str_hash, name, value, size, |
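
Editorial note on a pattern that recurs through the hunks above (hash_redo_key's callers, remove_dirent, check_inode, bch2_write_index_default): with BTREE_INSERT_ATOMIC gone and always implied, updates are built and committed inside a loop that resets transaction state and retries from scratch on -EINTR. The following is a minimal standalone sketch of that control flow only; every name here (trans_example, trans_reset_mem_example, etc.) is a hypothetical stand-in, not the bcachefs API.

#include <errno.h>
#include <stdio.h>

struct trans_example { int restarts_left; };

/* Hypothetical stand-ins for the transaction primitives used in the patch: */
static void trans_reset_mem_example(struct trans_example *t) { (void) t; }
static int build_updates_example(struct trans_example *t) { (void) t; return 0; }
static int commit_example(struct trans_example *t)
{
	/* Simulate one lock restart, as a real commit might hit: */
	return t->restarts_left-- > 0 ? -EINTR : 0;
}

/*
 * The whole update - reset, build, commit - lives inside the loop, so an
 * -EINTR from commit replays the update against fresh state instead of
 * leaving a half-applied transaction behind:
 */
static int do_update_example(struct trans_example *trans)
{
	int ret;

	do {
		trans_reset_mem_example(trans);	/* drop state from the failed attempt */

		ret = build_updates_example(trans) ?:
		      commit_example(trans);
	} while (ret == -EINTR);

	return ret;
}

int main(void)
{
	struct trans_example t = { .restarts_left = 1 };

	printf("ret = %d\n", do_update_example(&t));	/* prints "ret = 0" */
	return 0;
}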
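
The opts.c hunk above stops hand-maintaining string tables and instead expands x-macro lists (BCH_SB_FEATURES(), BCH_COMPRESSION_OPTS()), so an enum and its name table can no longer drift apart. A self-contained illustration of the idiom follows; the FEATURES_EXAMPLE list is hypothetical, not the patch's actual feature list.

#include <stdio.h>

/* Single source of truth: each x(name, nr) entry expands differently below. */
#define FEATURES_EXAMPLE()	\
	x(lz4,	0)		\
	x(gzip,	1)		\
	x(zstd,	2)

/* Expansion 1: the enum. */
enum feature_example {
#define x(f, n)	FEATURE_EXAMPLE_##f = (n),
	FEATURES_EXAMPLE()
#undef x
};

/* Expansion 2: the matching name table, via stringification. */
static const char * const feature_names_example[] = {
#define x(f, n)	#f,
	FEATURES_EXAMPLE()
#undef x
	NULL
};

int main(void)
{
	/* Prints "zstd": the enum value indexes its own name. */
	printf("%s\n", feature_names_example[FEATURE_EXAMPLE_zstd]);
	return 0;
}

Adding a feature then touches exactly one line of the list, and both expansions pick it up, which is why the patch can replace bch2_csum_types/bch2_compression_types with generated tables.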
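
Finally, the movinggc.c hunk introduces bch2_copygc_wait_amount() alongside a comment explaining the heuristic: tolerate the fixed reserve of fragmented space plus half the current free space, so the effective threshold tightens smoothly as the device fills. As a reading aid, here is a minimal standalone sketch of that arithmetic under stated assumptions; the struct and function names are hypothetical stand-ins for the bch_dev/bch_dev_usage fields the patch actually reads.

#include <stdint.h>
#include <stdio.h>

struct dev_state_example {
	uint64_t copygc_threshold;	/* reserve carved out at capacity calculation */
	uint64_t free_buckets;		/* currently available buckets */
	uint64_t bucket_size;		/* sectors per bucket */
	uint64_t sectors_fragmented;	/* space currently stranded by fragmentation */
};

/*
 * fragmented_allowed = reserve + (free space / 2); copygc should wait for
 * roughly the difference between that and what is already fragmented.
 */
static uint64_t copygc_wait_amount_example(const struct dev_state_example *d)
{
	uint64_t free_sectors = d->free_buckets * d->bucket_size;
	uint64_t fragmented_allowed = d->copygc_threshold + (free_sectors >> 1);

	return fragmented_allowed > d->sectors_fragmented
		? fragmented_allowed - d->sectors_fragmented
		: 0;
}

int main(void)
{
	/* A device with plenty of headroom: copygc keeps waiting. */
	struct dev_state_example d = {
		.copygc_threshold	= 1 << 20,
		.free_buckets		= 1 << 10,
		.bucket_size		= 1 << 10,
		.sectors_fragmented	= 1 << 16,
	};

	printf("wait = %llu sectors\n",
	       (unsigned long long) copygc_wait_amount_example(&d));
	return 0;
}

This is why the rewritten bch2_copygc_thread() collapses the old two-branch reserve/fragmented logic into a single wait value compared against the clock's max_slop.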